From 9e43df2fbe9031296ba908bb01bb8a1dec05ce66 Mon Sep 17 00:00:00 2001 From: perf3ct Date: Mon, 30 Jun 2025 19:13:16 +0000 Subject: [PATCH] feat(server/client): add metadata to file view --- frontend/src/components/MetadataDisplay.tsx | 232 ++++++++++ frontend/src/pages/DocumentDetailsPage.tsx | 57 +++ frontend/src/pages/DocumentManagementPage.tsx | 37 ++ ...630104504_add_document_metadata_fields.sql | 17 + src/db/documents.rs | 110 +++-- src/ingestion/document_ingestion.rs | 94 +++- src/models.rs | 29 ++ src/routes/documents.rs | 3 + src/routes/webdav/webdav_sync.rs | 17 +- src/scheduling/source_sync.rs | 16 +- src/services/file_service.rs | 6 + src/services/local_folder_service.rs | 42 ++ src/services/s3_service.rs | 32 ++ src/services/webdav_service.rs | 9 +- src/tests/config_oidc_tests.rs | 420 ++++++++---------- src/tests/db_tests.rs | 3 + src/tests/document_routes_tests.rs | 6 + src/tests/documents_tests.rs | 12 + src/tests/enhanced_ocr_tests.rs | 2 +- src/tests/enhanced_search_tests.rs | 3 + src/tests/file_service_tests.rs | 18 + src/tests/ignored_files_tests.rs | 3 + src/webdav_xml_parser.rs | 86 ++++ ...on_document_upload_hash_duplicate_tests.rs | 3 + ...egration_file_processing_pipeline_tests.rs | 3 + ...egration_hash_duplicate_detection_tests.rs | 9 + ...gration_ignored_files_integration_tests.rs | 3 + ...ration_source_sync_hash_duplicate_tests.rs | 8 + ...integration_webdav_hash_duplicate_tests.rs | 13 + tests/unit_unit_tests.rs | 6 + tests/unit_webdav_enhanced_unit_tests.rs | 5 + 31 files changed, 1007 insertions(+), 297 deletions(-) create mode 100644 frontend/src/components/MetadataDisplay.tsx create mode 100644 migrations/20250630104504_add_document_metadata_fields.sql diff --git a/frontend/src/components/MetadataDisplay.tsx b/frontend/src/components/MetadataDisplay.tsx new file mode 100644 index 0000000..a038be8 --- /dev/null +++ b/frontend/src/components/MetadataDisplay.tsx @@ -0,0 +1,232 @@ +import React, { useState } from 'react'; +import { + Box, + Typography, + Accordion, + AccordionSummary, + AccordionDetails, + Chip, + Grid, +} from '@mui/material'; +import { + ExpandMore as ExpandMoreIcon, + Security as PermissionsIcon, + Person as OwnerIcon, + Group as GroupIcon, + Storage as StorageIcon, + Info as InfoIcon, +} from '@mui/icons-material'; + +interface MetadataDisplayProps { + metadata: any; + title?: string; + compact?: boolean; +} + +const MetadataDisplay: React.FC = ({ + metadata, + title = "Source Metadata", + compact = false, +}) => { + const [expanded, setExpanded] = useState(!compact); + + if (!metadata || Object.keys(metadata).length === 0) { + return null; + } + + const formatValue = (key: string, value: any): React.ReactNode => { + // Handle special metadata fields with better formatting + if (key === 'permissions' && typeof value === 'number') { + return ( + + + + {value.toString(8)} (octal) + + + ); + } + + if (key === 'owner' || key === 'uid') { + return ( + + + + {value} + + + ); + } + + if (key === 'group' || key === 'gid') { + return ( + + + + {value} + + + ); + } + + if (key === 'storage_class' || key === 'region') { + return ( + + + + {value} + + + ); + } + + // Handle arrays + if (Array.isArray(value)) { + return ( + + {value.map((item, index) => ( + + ))} + + ); + } + + // Handle objects + if (typeof value === 'object' && value !== null) { + return ( + +
+            {JSON.stringify(value, null, 2)}
+          
+
+ ); + } + + // Handle boolean values + if (typeof value === 'boolean') { + return ( + + ); + } + + // Handle dates + if (typeof value === 'string' && ( + key.includes('date') || + key.includes('time') || + key.includes('created') || + key.includes('modified') + )) { + try { + const date = new Date(value); + if (!isNaN(date.getTime())) { + return ( + + {date.toLocaleString()} + + ); + } + } catch { + // Fall through to default handling + } + } + + // Default: display as string + return ( + + {String(value)} + + ); + }; + + const formatKeyName = (key: string): string => { + // Convert snake_case and camelCase to Title Case + return key + .replace(/([a-z])([A-Z])/g, '$1 $2') // camelCase to spaces + .replace(/_/g, ' ') // snake_case to spaces + .replace(/\b\w/g, (letter) => letter.toUpperCase()); // Title Case + }; + + const renderMetadata = () => { + return ( + + {Object.entries(metadata).map(([key, value]) => ( + + + + {formatKeyName(key)} + + + + {formatValue(key, value)} + + + ))} + + ); + }; + + if (compact) { + return ( + setExpanded(isExpanded)}> + } + sx={{ + backgroundColor: 'grey.50', + '&:hover': { backgroundColor: 'grey.100' } + }} + > + + + + {title} + + + + + + {renderMetadata()} + + + ); + } + + return ( + + + + + {title} + + + {renderMetadata()} + + ); +}; + +export default MetadataDisplay; \ No newline at end of file diff --git a/frontend/src/pages/DocumentDetailsPage.tsx b/frontend/src/pages/DocumentDetailsPage.tsx index 7c7a22c..424813b 100644 --- a/frontend/src/pages/DocumentDetailsPage.tsx +++ b/frontend/src/pages/DocumentDetailsPage.tsx @@ -35,11 +35,16 @@ import { Search as SearchIcon, Edit as EditIcon, PhotoFilter as ProcessedImageIcon, + Source as SourceIcon, + AccessTime as AccessTimeIcon, + Create as CreateIcon, + Info as InfoIcon, } from '@mui/icons-material'; import { documentService, OcrResponse } from '../services/api'; import DocumentViewer from '../components/DocumentViewer'; import LabelSelector from '../components/Labels/LabelSelector'; import { type LabelData } from '../components/Labels/Label'; +import MetadataDisplay from '../components/MetadataDisplay'; import api from '../services/api'; interface Document { @@ -51,6 +56,9 @@ interface Document { created_at: string; has_ocr_text?: boolean; tags?: string[]; + original_created_at?: string; + original_modified_at?: string; + source_metadata?: any; } const DocumentDetailsPage: React.FC = () => { @@ -500,6 +508,55 @@ const DocumentDetailsPage: React.FC = () => { + {/* Source Metadata Section */} + {(document.original_created_at || document.original_modified_at || document.source_metadata) && ( + <> + {document.original_created_at && ( + + + + + + Original Created + + + + {formatDate(document.original_created_at)} + + + + )} + + {document.original_modified_at && ( + + + + + + Original Modified + + + + {formatDate(document.original_modified_at)} + + + + )} + + {document.source_metadata && Object.keys(document.source_metadata).length > 0 && ( + + + + + + )} + + )} + {document.tags && document.tags.length > 0 && ( diff --git a/frontend/src/pages/DocumentManagementPage.tsx b/frontend/src/pages/DocumentManagementPage.tsx index b8d6a53..3a89540 100644 --- a/frontend/src/pages/DocumentManagementPage.tsx +++ b/frontend/src/pages/DocumentManagementPage.tsx @@ -56,6 +56,7 @@ import { format } from 'date-fns'; import { api, documentService, queueService } from '../services/api'; import DocumentViewer from '../components/DocumentViewer'; import FailedDocumentViewer from '../components/FailedDocumentViewer'; +import MetadataDisplay from '../components/MetadataDisplay'; interface FailedDocument { id: string; @@ -78,6 +79,9 @@ interface FailedDocument { ocr_word_count?: number; failure_reason: string; error_message?: string; + original_created_at?: string; + original_modified_at?: string; + source_metadata?: any; } interface FailureCategory { @@ -1989,6 +1993,39 @@ const DocumentManagementPage: React.FC = () => { sx={{ mb: 2 }} /> + {/* Source Metadata Section */} + {selectedDocument.original_created_at && ( + <> + + Original Created: + + + {format(new Date(selectedDocument.original_created_at), 'PPpp')} + + + )} + + {selectedDocument.original_modified_at && ( + <> + + Original Modified: + + + {format(new Date(selectedDocument.original_modified_at), 'PPpp')} + + + )} + + {selectedDocument.source_metadata && Object.keys(selectedDocument.source_metadata).length > 0 && ( + + + + )} + Retry Count: diff --git a/migrations/20250630104504_add_document_metadata_fields.sql b/migrations/20250630104504_add_document_metadata_fields.sql new file mode 100644 index 0000000..d0c709c --- /dev/null +++ b/migrations/20250630104504_add_document_metadata_fields.sql @@ -0,0 +1,17 @@ +-- Add metadata preservation fields to documents table +ALTER TABLE documents +ADD COLUMN original_created_at TIMESTAMPTZ, +ADD COLUMN original_modified_at TIMESTAMPTZ, +ADD COLUMN source_metadata JSONB; + +-- Add comment to explain fields +COMMENT ON COLUMN documents.original_created_at IS 'Original file creation timestamp from source system'; +COMMENT ON COLUMN documents.original_modified_at IS 'Original file modification timestamp from source system'; +COMMENT ON COLUMN documents.source_metadata IS 'Additional metadata from source system (permissions, attributes, EXIF data, etc.)'; + +-- Create index on source_metadata for efficient JSONB queries +CREATE INDEX idx_documents_source_metadata ON documents USING gin (source_metadata); + +-- Note: We cannot reliably populate original_created_at and original_modified_at +-- for existing documents as we don't have this information stored. +-- These fields will remain NULL for existing documents, which is correct. \ No newline at end of file diff --git a/src/db/documents.rs b/src/db/documents.rs index 499c2e3..d037ba9 100644 --- a/src/db/documents.rs +++ b/src/db/documents.rs @@ -10,9 +10,9 @@ impl Database { pub async fn create_document(&self, document: Document) -> Result { let row = sqlx::query( r#" - INSERT INTO documents (id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19) - RETURNING id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash + INSERT INTO documents (id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22) + RETURNING id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata "# ) .bind(document.id) @@ -34,6 +34,9 @@ impl Database { .bind(document.updated_at) .bind(document.user_id) .bind(&document.file_hash) + .bind(document.original_created_at) + .bind(document.original_modified_at) + .bind(&document.source_metadata) .fetch_one(&self.pool) .await?; @@ -57,6 +60,9 @@ impl Database { updated_at: row.get("updated_at"), user_id: row.get("user_id"), file_hash: row.get("file_hash"), + original_created_at: row.get("original_created_at"), + original_modified_at: row.get("original_modified_at"), + source_metadata: row.get("source_metadata"), }) } @@ -64,7 +70,7 @@ impl Database { let query = if user_role == crate::models::UserRole::Admin { // Admins can see all documents r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata, original_created_at, original_modified_at, source_metadata FROM documents ORDER BY created_at DESC LIMIT $1 OFFSET $2 @@ -72,7 +78,7 @@ impl Database { } else { // Regular users can only see their own documents r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata, original_created_at, original_modified_at, source_metadata FROM documents WHERE user_id = $3 ORDER BY created_at DESC @@ -117,6 +123,9 @@ impl Database { updated_at: row.get("updated_at"), user_id: row.get("user_id"), file_hash: row.get("file_hash"), + original_created_at: row.get("original_created_at"), + original_modified_at: row.get("original_modified_at"), + source_metadata: row.get("source_metadata"), }) .collect(); @@ -129,7 +138,7 @@ impl Database { // Admin with OCR filter sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE ocr_status = $3 ORDER BY created_at DESC @@ -146,7 +155,7 @@ impl Database { // Admin without OCR filter sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents ORDER BY created_at DESC LIMIT $1 OFFSET $2 @@ -161,7 +170,7 @@ impl Database { // Regular user with OCR filter sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE user_id = $3 AND ocr_status = $4 ORDER BY created_at DESC @@ -179,7 +188,7 @@ impl Database { // Regular user without OCR filter sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE user_id = $3 ORDER BY created_at DESC @@ -216,6 +225,9 @@ impl Database { updated_at: row.get("updated_at"), user_id: row.get("user_id"), file_hash: row.get("file_hash"), + original_created_at: row.get("original_created_at"), + original_modified_at: row.get("original_modified_at"), + source_metadata: row.get("source_metadata"), }) .collect(); @@ -268,7 +280,7 @@ impl Database { pub async fn get_documents_by_user(&self, user_id: Uuid, limit: i64, offset: i64) -> Result> { let rows = sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE user_id = $1 ORDER BY created_at DESC @@ -303,6 +315,9 @@ impl Database { updated_at: row.get("updated_at"), user_id: row.get("user_id"), file_hash: row.get("file_hash"), + original_created_at: row.get("original_created_at"), + original_modified_at: row.get("original_modified_at"), + source_metadata: row.get("source_metadata"), }) .collect(); @@ -312,7 +327,7 @@ impl Database { pub async fn find_documents_by_filename(&self, filename: &str) -> Result> { let rows = sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE filename = $1 OR original_filename = $1 ORDER BY created_at DESC @@ -344,6 +359,9 @@ impl Database { updated_at: row.get("updated_at"), user_id: row.get("user_id"), file_hash: row.get("file_hash"), + original_created_at: row.get("original_created_at"), + original_modified_at: row.get("original_modified_at"), + source_metadata: row.get("source_metadata"), }) .collect(); @@ -353,7 +371,7 @@ impl Database { pub async fn search_documents(&self, user_id: Uuid, search: SearchRequest) -> Result<(Vec, i64)> { let mut query_builder = QueryBuilder::new( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata, ts_rank(to_tsvector('english', COALESCE(content, '') || ' ' || COALESCE(ocr_text, '')), plainto_tsquery('english', "# ); @@ -415,6 +433,9 @@ impl Database { updated_at: row.get("updated_at"), user_id: row.get("user_id"), file_hash: row.get("file_hash"), + original_created_at: row.get("original_created_at"), + original_modified_at: row.get("original_modified_at"), + source_metadata: row.get("source_metadata"), }) .collect(); @@ -456,7 +477,7 @@ impl Database { // Use trigram similarity for substring matching let mut builder = QueryBuilder::new( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata, GREATEST( similarity(filename, "# ); @@ -499,7 +520,7 @@ impl Database { let mut builder = QueryBuilder::new(&format!( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata, GREATEST( CASE WHEN filename ILIKE '%' || "# )); @@ -645,7 +666,7 @@ impl Database { // Use trigram similarity for substring matching let mut builder = QueryBuilder::new( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata, GREATEST( similarity(filename, "# ); @@ -684,7 +705,7 @@ impl Database { let mut builder = QueryBuilder::new(&format!( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata, GREATEST( CASE WHEN filename ILIKE '%' || "# )); @@ -993,6 +1014,9 @@ impl Database { updated_at: row.get("updated_at"), user_id: row.get("user_id"), file_hash: row.get("file_hash"), + original_created_at: row.get("original_created_at"), + original_modified_at: row.get("original_modified_at"), + source_metadata: row.get("source_metadata"), }); } @@ -1081,14 +1105,14 @@ impl Database { let query = if user_role == crate::models::UserRole::Admin { // Admins can see any document r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE id = $1 "# } else { // Regular users can only see their own documents r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE id = $1 AND user_id = $2 "# @@ -1128,6 +1152,9 @@ impl Database { updated_at: row.get("updated_at"), user_id: row.get("user_id"), file_hash: row.get("file_hash"), + original_created_at: row.get("original_created_at"), + original_modified_at: row.get("original_modified_at"), + source_metadata: row.get("source_metadata"), })), None => Ok(None), } @@ -1137,7 +1164,7 @@ impl Database { pub async fn get_document_by_user_and_hash(&self, user_id: Uuid, file_hash: &str) -> Result> { let row = sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE user_id = $1 AND file_hash = $2 LIMIT 1 @@ -1169,6 +1196,9 @@ impl Database { updated_at: row.get("updated_at"), user_id: row.get("user_id"), file_hash: row.get("file_hash"), + original_created_at: row.get("original_created_at"), + original_modified_at: row.get("original_modified_at"), + source_metadata: row.get("source_metadata"), })), None => Ok(None), } @@ -1393,6 +1423,9 @@ impl Database { updated_at: r.get("updated_at"), user_id: r.get("user_id"), file_hash: r.get("file_hash"), + original_created_at: r.get("original_created_at"), + original_modified_at: r.get("original_modified_at"), + source_metadata: r.get("source_metadata"), }) } else { let row = sqlx::query( @@ -1427,6 +1460,9 @@ impl Database { updated_at: r.get("updated_at"), user_id: r.get("user_id"), file_hash: r.get("file_hash"), + original_created_at: r.get("original_created_at"), + original_modified_at: r.get("original_modified_at"), + source_metadata: r.get("source_metadata"), }) }; @@ -1470,6 +1506,9 @@ impl Database { updated_at: r.get("updated_at"), user_id: r.get("user_id"), file_hash: r.get("file_hash"), + original_created_at: r.get("original_created_at"), + original_modified_at: r.get("original_modified_at"), + source_metadata: r.get("source_metadata"), }).collect() } else { let rows = sqlx::query( @@ -1504,6 +1543,9 @@ impl Database { updated_at: r.get("updated_at"), user_id: r.get("user_id"), file_hash: r.get("file_hash"), + original_created_at: r.get("original_created_at"), + original_modified_at: r.get("original_modified_at"), + source_metadata: r.get("source_metadata"), }).collect() }; @@ -1515,7 +1557,7 @@ impl Database { let documents = if user_role == crate::models::UserRole::Admin { let rows = sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE ocr_confidence IS NOT NULL AND ocr_confidence < $1 ORDER BY ocr_confidence ASC, created_at DESC @@ -1545,11 +1587,14 @@ impl Database { updated_at: r.get("updated_at"), user_id: r.get("user_id"), file_hash: r.get("file_hash"), + original_created_at: r.get("original_created_at"), + original_modified_at: r.get("original_modified_at"), + source_metadata: r.get("source_metadata"), }).collect() } else { let rows = sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE ocr_confidence IS NOT NULL AND ocr_confidence < $1 AND user_id = $2 ORDER BY ocr_confidence ASC, created_at DESC @@ -1580,6 +1625,9 @@ impl Database { updated_at: r.get("updated_at"), user_id: r.get("user_id"), file_hash: r.get("file_hash"), + original_created_at: r.get("original_created_at"), + original_modified_at: r.get("original_modified_at"), + source_metadata: r.get("source_metadata"), }).collect() }; @@ -1591,7 +1639,7 @@ impl Database { let documents = if user_role == crate::models::UserRole::Admin { let rows = sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE ocr_status = 'failed' OR (ocr_confidence IS NULL AND ocr_status != 'pending' AND ocr_status != 'processing') ORDER BY created_at DESC @@ -1620,11 +1668,14 @@ impl Database { updated_at: r.get("updated_at"), user_id: r.get("user_id"), file_hash: r.get("file_hash"), + original_created_at: r.get("original_created_at"), + original_modified_at: r.get("original_modified_at"), + source_metadata: r.get("source_metadata"), }).collect() } else { let rows = sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE (ocr_status = 'failed' OR (ocr_confidence IS NULL AND ocr_status != 'pending' AND ocr_status != 'processing')) AND user_id = $1 ORDER BY created_at DESC @@ -1654,6 +1705,9 @@ impl Database { updated_at: r.get("updated_at"), user_id: r.get("user_id"), file_hash: r.get("file_hash"), + original_created_at: r.get("original_created_at"), + original_modified_at: r.get("original_modified_at"), + source_metadata: r.get("source_metadata"), }).collect() }; @@ -1665,7 +1719,7 @@ impl Database { let documents = if user_role == crate::models::UserRole::Admin { let rows = sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE (ocr_confidence IS NOT NULL AND ocr_confidence < $1) OR ocr_status = 'failed' @@ -1699,11 +1753,14 @@ impl Database { updated_at: r.get("updated_at"), user_id: r.get("user_id"), file_hash: r.get("file_hash"), + original_created_at: r.get("original_created_at"), + original_modified_at: r.get("original_modified_at"), + source_metadata: r.get("source_metadata"), }).collect() } else { let rows = sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE ((ocr_confidence IS NOT NULL AND ocr_confidence < $1) OR ocr_status = 'failed' @@ -1739,6 +1796,9 @@ impl Database { updated_at: r.get("updated_at"), user_id: r.get("user_id"), file_hash: r.get("file_hash"), + original_created_at: r.get("original_created_at"), + original_modified_at: r.get("original_modified_at"), + source_metadata: r.get("source_metadata"), }).collect() }; diff --git a/src/ingestion/document_ingestion.rs b/src/ingestion/document_ingestion.rs index 7386e34..c8e9425 100644 --- a/src/ingestion/document_ingestion.rs +++ b/src/ingestion/document_ingestion.rs @@ -9,8 +9,10 @@ use uuid::Uuid; use sha2::{Digest, Sha256}; use tracing::{debug, info, warn}; +use chrono::Utc; +use serde_json; -use crate::models::Document; +use crate::models::{Document, FileInfo}; use crate::db::Database; use crate::services::file_service::FileService; @@ -49,6 +51,10 @@ pub struct DocumentIngestionRequest { /// Optional source identifier for tracking pub source_type: Option, pub source_id: Option, + /// Optional metadata from source file system + pub original_created_at: Option>, + pub original_modified_at: Option>, + pub source_metadata: Option, } pub struct DocumentIngestionService { @@ -61,6 +67,47 @@ impl DocumentIngestionService { Self { db, file_service } } + /// Extract metadata from FileInfo for storage in document + fn extract_metadata_from_file_info(file_info: &FileInfo) -> (Option>, Option>, Option) { + let original_created_at = file_info.created_at; + let original_modified_at = file_info.last_modified; + + // Build comprehensive metadata object + let mut metadata = serde_json::Map::new(); + + // Add permissions if available + if let Some(perms) = file_info.permissions { + metadata.insert("permissions".to_string(), serde_json::Value::Number(perms.into())); + } + + // Add owner/group info + if let Some(ref owner) = file_info.owner { + metadata.insert("owner".to_string(), serde_json::Value::String(owner.clone())); + } + + if let Some(ref group) = file_info.group { + metadata.insert("group".to_string(), serde_json::Value::String(group.clone())); + } + + // Add source path + metadata.insert("source_path".to_string(), serde_json::Value::String(file_info.path.clone())); + + // Merge any additional metadata from the source + if let Some(ref source_meta) = file_info.metadata { + if let serde_json::Value::Object(source_map) = source_meta { + metadata.extend(source_map.clone()); + } + } + + let final_metadata = if metadata.is_empty() { + None + } else { + Some(serde_json::Value::Object(metadata)) + }; + + (original_created_at, original_modified_at, final_metadata) + } + /// Unified document ingestion with configurable deduplication policy pub async fn ingest_document(&self, request: DocumentIngestionRequest) -> Result> { let file_hash = self.calculate_file_hash(&request.file_data); @@ -156,6 +203,9 @@ impl DocumentIngestionService { &request.mime_type, request.user_id, Some(file_hash.clone()), + request.original_created_at, + request.original_modified_at, + request.source_metadata, ); let saved_document = match self.db.create_document(document).await { @@ -235,6 +285,36 @@ impl DocumentIngestionService { format!("{:x}", result) } + /// Ingest document from source with FileInfo metadata + pub async fn ingest_from_file_info( + &self, + file_info: &FileInfo, + file_data: Vec, + user_id: Uuid, + deduplication_policy: DeduplicationPolicy, + source_type: &str, + source_id: Option, + ) -> Result> { + let (original_created_at, original_modified_at, source_metadata) = + Self::extract_metadata_from_file_info(file_info); + + let request = DocumentIngestionRequest { + filename: file_info.name.clone(), + original_filename: file_info.name.clone(), + file_data, + mime_type: file_info.mime_type.clone(), + user_id, + deduplication_policy, + source_type: Some(source_type.to_string()), + source_id, + original_created_at, + original_modified_at, + source_metadata, + }; + + self.ingest_document(request).await + } + /// Convenience method for direct uploads (maintains backward compatibility) pub async fn ingest_upload( &self, @@ -252,6 +332,9 @@ impl DocumentIngestionService { deduplication_policy: DeduplicationPolicy::AllowDuplicateContent, // Fixed behavior for uploads source_type: Some("direct_upload".to_string()), source_id: None, + original_created_at: None, + original_modified_at: None, + source_metadata: None, }; self.ingest_document(request).await @@ -276,6 +359,9 @@ impl DocumentIngestionService { deduplication_policy: DeduplicationPolicy::Skip, // Skip duplicates for source sync source_type: Some(source_type.to_string()), source_id: Some(source_id), + original_created_at: None, + original_modified_at: None, + source_metadata: None, }; self.ingest_document(request).await @@ -299,6 +385,9 @@ impl DocumentIngestionService { deduplication_policy: DeduplicationPolicy::TrackAsDuplicate, // Track duplicates for WebDAV source_type: Some("webdav".to_string()), source_id: Some(webdav_source_id), + original_created_at: None, + original_modified_at: None, + source_metadata: None, }; self.ingest_document(request).await @@ -321,6 +410,9 @@ impl DocumentIngestionService { deduplication_policy: DeduplicationPolicy::Skip, // Skip duplicates for batch operations source_type: Some("batch_ingest".to_string()), source_id: None, + original_created_at: None, + original_modified_at: None, + source_metadata: None, }; self.ingest_document(request).await diff --git a/src/models.rs b/src/models.rs index 8806142..62ea964 100644 --- a/src/models.rs +++ b/src/models.rs @@ -3,6 +3,7 @@ use serde::{Deserialize, Serialize}; use sqlx::FromRow; use uuid::Uuid; use utoipa::{ToSchema, IntoParams}; +use serde_json; #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, ToSchema)] pub enum UserRole { @@ -133,6 +134,12 @@ pub struct Document { pub updated_at: DateTime, pub user_id: Uuid, pub file_hash: Option, + /// Original file creation timestamp from source system + pub original_created_at: Option>, + /// Original file modification timestamp from source system + pub original_modified_at: Option>, + /// Additional metadata from source system (permissions, attributes, EXIF data, etc.) + pub source_metadata: Option, } #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, ToSchema)] @@ -307,6 +314,15 @@ pub struct DocumentResponse { pub ocr_processing_time_ms: Option, /// Current status of OCR processing (pending, processing, completed, failed) pub ocr_status: Option, + /// Original file creation timestamp from source system + #[serde(skip_serializing_if = "Option::is_none")] + pub original_created_at: Option>, + /// Original file modification timestamp from source system + #[serde(skip_serializing_if = "Option::is_none")] + pub original_modified_at: Option>, + /// Additional metadata from source system (permissions, attributes, etc.) + #[serde(skip_serializing_if = "Option::is_none")] + pub source_metadata: Option, } #[derive(Debug, Serialize, Deserialize, ToSchema, IntoParams)] @@ -447,6 +463,9 @@ impl From for DocumentResponse { ocr_word_count: doc.ocr_word_count, ocr_processing_time_ms: doc.ocr_processing_time_ms, ocr_status: doc.ocr_status, + original_created_at: doc.original_created_at, + original_modified_at: doc.original_modified_at, + source_metadata: doc.source_metadata, } } } @@ -900,6 +919,16 @@ pub struct FileInfo { pub last_modified: Option>, pub etag: String, pub is_directory: bool, + /// Original file creation time from source system + pub created_at: Option>, + /// File permissions (Unix mode bits or similar) + pub permissions: Option, + /// File owner (username or uid) + pub owner: Option, + /// File group (groupname or gid) + pub group: Option, + /// Additional metadata from source (EXIF, PDF metadata, custom attributes, etc.) + pub metadata: Option, } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash, ToSchema)] diff --git a/src/routes/documents.rs b/src/routes/documents.rs index f278293..f8c5a43 100644 --- a/src/routes/documents.rs +++ b/src/routes/documents.rs @@ -116,6 +116,9 @@ async fn get_document_by_id( ocr_word_count: document.ocr_word_count, ocr_processing_time_ms: document.ocr_processing_time_ms, ocr_status: document.ocr_status, + original_created_at: document.original_created_at, + original_modified_at: document.original_modified_at, + source_metadata: document.source_metadata, }; Ok(Json(response)) diff --git a/src/routes/webdav/webdav_sync.rs b/src/routes/webdav/webdav_sync.rs index fec9147..a3ce631 100644 --- a/src/routes/webdav/webdav_sync.rs +++ b/src/routes/webdav/webdav_sync.rs @@ -283,24 +283,25 @@ async fn process_single_file( let result = if let Some(source_id) = webdav_source_id { ingestion_service - .ingest_from_webdav( - &file_info.name, + .ingest_from_file_info( + &file_info, file_data, - &file_info.mime_type, user_id, - source_id, + crate::ingestion::document_ingestion::DeduplicationPolicy::TrackAsDuplicate, + "webdav_sync", + Some(source_id), ) .await } else { // Fallback for backward compatibility - treat as generic WebDAV sync ingestion_service - .ingest_from_source( - &file_info.name, + .ingest_from_file_info( + &file_info, file_data, - &file_info.mime_type, user_id, - uuid::Uuid::new_v4(), // Generate a temporary ID for tracking + crate::ingestion::document_ingestion::DeduplicationPolicy::Skip, "webdav_sync", + Some(uuid::Uuid::new_v4()), // Generate a temporary ID for tracking ) .await }; diff --git a/src/scheduling/source_sync.rs b/src/scheduling/source_sync.rs index e459dc5..8fa6e67 100644 --- a/src/scheduling/source_sync.rs +++ b/src/scheduling/source_sync.rs @@ -533,13 +533,13 @@ impl SourceSyncService { let ingestion_service = DocumentIngestionService::new(state.db.clone(), file_service); let result = ingestion_service - .ingest_from_source( - &file_info.name, + .ingest_from_file_info( + file_info, file_data, - &file_info.mime_type, user_id, - source_id, + crate::ingestion::document_ingestion::DeduplicationPolicy::Skip, "source_sync", + Some(source_id), ) .await .map_err(|e| anyhow!("Document ingestion failed for {}: {}", file_info.name, e))?; @@ -637,13 +637,13 @@ impl SourceSyncService { let ingestion_service = DocumentIngestionService::new(state.db.clone(), file_service); let result = ingestion_service - .ingest_from_source( - &file_info.name, + .ingest_from_file_info( + file_info, file_data, - &file_info.mime_type, user_id, - source_id, + crate::ingestion::document_ingestion::DeduplicationPolicy::Skip, "source_sync", + Some(source_id), ) .await .map_err(|e| anyhow!("Document ingestion failed for {}: {}", file_info.name, e))?; diff --git a/src/services/file_service.rs b/src/services/file_service.rs index 67d59c6..69803de 100644 --- a/src/services/file_service.rs +++ b/src/services/file_service.rs @@ -158,6 +158,9 @@ impl FileService { mime_type: &str, user_id: Uuid, file_hash: Option, + original_created_at: Option>, + original_modified_at: Option>, + source_metadata: Option, ) -> Document { Document { id: Uuid::new_v4(), @@ -179,6 +182,9 @@ impl FileService { updated_at: Utc::now(), user_id, file_hash, + original_created_at, + original_modified_at, + source_metadata, } } diff --git a/src/services/local_folder_service.rs b/src/services/local_folder_service.rs index c5af902..1aca90f 100644 --- a/src/services/local_folder_service.rs +++ b/src/services/local_folder_service.rs @@ -5,6 +5,7 @@ use chrono::{DateTime, Utc}; use tracing::{debug, info, warn}; use walkdir::WalkDir; use sha2::{Sha256, Digest}; +use serde_json; use crate::models::{FileInfo, LocalFolderSourceConfig}; @@ -89,6 +90,14 @@ impl LocalFolderService { DateTime::from_timestamp(duration.as_secs() as i64, 0) }); + // Try to get creation time (not available on all systems) + let created_time = metadata.created() + .ok() + .and_then(|time| { + let duration = time.duration_since(std::time::UNIX_EPOCH).ok()?; + DateTime::from_timestamp(duration.as_secs() as i64, 0) + }); + let file_name = path.file_name() .and_then(|name| name.to_str()) .unwrap_or("unknown") @@ -100,6 +109,34 @@ impl LocalFolderService { // Determine MIME type based on extension let mime_type = Self::get_mime_type(&extension); + // Extract file permissions and ownership info + #[cfg(unix)] + let (permissions, owner, group) = { + use std::os::unix::fs::MetadataExt; + ( + Some(metadata.mode() & 0o777), // File mode bits (permissions) + Some(metadata.uid().to_string()), // User ID + Some(metadata.gid().to_string()), // Group ID + ) + }; + + #[cfg(not(unix))] + let (permissions, owner, group) = (None, None, None); + + // Prepare additional metadata + let mut additional_metadata = serde_json::Map::new(); + + #[cfg(unix)] + { + use std::os::unix::fs::MetadataExt; + additional_metadata.insert("inode".to_string(), serde_json::Value::Number(metadata.ino().into())); + additional_metadata.insert("nlinks".to_string(), serde_json::Value::Number(metadata.nlink().into())); + additional_metadata.insert("device".to_string(), serde_json::Value::Number(metadata.dev().into())); + } + + // Add file attributes + additional_metadata.insert("readonly".to_string(), serde_json::Value::Bool(metadata.permissions().readonly())); + let file_info = FileInfo { path: path.to_string_lossy().to_string(), name: file_name, @@ -108,6 +145,11 @@ impl LocalFolderService { last_modified: modified_time, etag, is_directory: false, + created_at: created_time, + permissions, + owner, + group, + metadata: if additional_metadata.is_empty() { None } else { Some(serde_json::Value::Object(additional_metadata)) }, }; files.push(file_info); diff --git a/src/services/s3_service.rs b/src/services/s3_service.rs index 1860d2c..d3e1f88 100644 --- a/src/services/s3_service.rs +++ b/src/services/s3_service.rs @@ -1,6 +1,7 @@ use anyhow::{anyhow, Result}; use chrono::{DateTime, Utc}; use tracing::{debug, error, info, warn}; +use serde_json; #[cfg(feature = "s3")] use aws_sdk_s3::Client; @@ -149,6 +150,32 @@ impl S3Service { let mime_type = Self::get_mime_type(&extension); + // Build additional metadata from S3 object properties + let mut metadata_map = serde_json::Map::new(); + + // Add S3-specific metadata + if let Some(storage_class) = &object.storage_class { + metadata_map.insert("storage_class".to_string(), serde_json::Value::String(storage_class.as_str().to_string())); + } + + if let Some(owner) = &object.owner { + if let Some(display_name) = &owner.display_name { + metadata_map.insert("owner_display_name".to_string(), serde_json::Value::String(display_name.clone())); + } + if let Some(id) = &owner.id { + metadata_map.insert("owner_id".to_string(), serde_json::Value::String(id.clone())); + } + } + + // Store the S3 key for reference + metadata_map.insert("s3_key".to_string(), serde_json::Value::String(key.clone())); + + // Add bucket name for reference + metadata_map.insert("s3_bucket".to_string(), serde_json::Value::String(self.config.bucket_name.clone())); + + // If we have region info, add it + metadata_map.insert("s3_region".to_string(), serde_json::Value::String(self.config.region.clone())); + let file_info = FileInfo { path: key.clone(), name: file_name, @@ -157,6 +184,11 @@ impl S3Service { last_modified, etag, is_directory: false, + created_at: None, // S3 doesn't provide creation time, only last modified + permissions: None, // S3 uses different permission model (ACLs/policies) + owner: object.owner.as_ref().and_then(|o| o.display_name.clone()), + group: None, // S3 doesn't have Unix-style groups + metadata: if metadata_map.is_empty() { None } else { Some(serde_json::Value::Object(metadata_map)) }, }; files.push(file_info); diff --git a/src/services/webdav_service.rs b/src/services/webdav_service.rs index 2cc4302..e3cae9d 100644 --- a/src/services/webdav_service.rs +++ b/src/services/webdav_service.rs @@ -423,14 +423,7 @@ impl WebDAVService { let propfind_body = r#" - - - - - - - - + "#; let response = self.client diff --git a/src/tests/config_oidc_tests.rs b/src/tests/config_oidc_tests.rs index e5a1b5c..cbf09c5 100644 --- a/src/tests/config_oidc_tests.rs +++ b/src/tests/config_oidc_tests.rs @@ -2,6 +2,49 @@ mod tests { use crate::config::Config; use std::env; + use std::sync::Mutex; + + // Mutex to ensure OIDC tests run sequentially to avoid race conditions + static OIDC_TEST_MUTEX: Mutex<()> = Mutex::new(()); + + // Helper function to safely run a test with environment isolation + fn run_with_env_isolation(test_fn: F) -> R + where + F: FnOnce() -> R, + { + let _guard = OIDC_TEST_MUTEX.lock().unwrap(); + + // Store original environment values + let original_values: Vec<(String, Option)> = vec![ + "OIDC_ENABLED", + "OIDC_CLIENT_ID", + "OIDC_CLIENT_SECRET", + "OIDC_ISSUER_URL", + "OIDC_REDIRECT_URI", + "DATABASE_URL", + "JWT_SECRET", + ].into_iter().map(|key| { + (key.to_string(), env::var(key).ok()) + }).collect(); + + // Clean up environment first + for (key, _) in &original_values { + env::remove_var(key); + } + + // Run the test + let result = test_fn(); + + // Restore original environment + for (key, original_value) in original_values { + env::remove_var(&key); + if let Some(value) = original_value { + env::set_var(&key, value); + } + } + + result + } fn create_base_config() -> Config { Config { @@ -40,291 +83,176 @@ mod tests { #[test] fn test_oidc_enabled_from_env() { - // Clean up environment first to ensure test isolation - env::remove_var("OIDC_ENABLED"); - env::remove_var("OIDC_CLIENT_ID"); - env::remove_var("OIDC_CLIENT_SECRET"); - env::remove_var("OIDC_ISSUER_URL"); - env::remove_var("OIDC_REDIRECT_URI"); - env::remove_var("DATABASE_URL"); - env::remove_var("JWT_SECRET"); - - env::set_var("OIDC_ENABLED", "true"); - env::set_var("OIDC_CLIENT_ID", "test-client-id"); - env::set_var("OIDC_CLIENT_SECRET", "test-client-secret"); - env::set_var("OIDC_ISSUER_URL", "https://provider.example.com"); - env::set_var("OIDC_REDIRECT_URI", "http://localhost:8000/auth/oidc/callback"); - env::set_var("DATABASE_URL", "postgresql://test:test@localhost/test"); - env::set_var("JWT_SECRET", "test-secret"); - - let config = Config::from_env().unwrap(); - - assert!(config.oidc_enabled); - assert_eq!(config.oidc_client_id, Some("test-client-id".to_string())); - assert_eq!(config.oidc_client_secret, Some("test-client-secret".to_string())); - assert_eq!(config.oidc_issuer_url, Some("https://provider.example.com".to_string())); - assert_eq!(config.oidc_redirect_uri, Some("http://localhost:8000/auth/oidc/callback".to_string())); - - // Clean up - env::remove_var("OIDC_ENABLED"); - env::remove_var("OIDC_CLIENT_ID"); - env::remove_var("OIDC_CLIENT_SECRET"); - env::remove_var("OIDC_ISSUER_URL"); - env::remove_var("OIDC_REDIRECT_URI"); - env::remove_var("DATABASE_URL"); - env::remove_var("JWT_SECRET"); - } - - #[test] - fn test_oidc_enabled_variations() { - let test_cases = vec![ - ("true", true), - ("TRUE", true), - ("1", true), - ("yes", true), - ("YES", true), - ("on", true), - ("ON", true), - ("false", false), - ("FALSE", false), - ("0", false), - ("no", false), - ("NO", false), - ("off", false), - ("OFF", false), - ("invalid", false), - ]; - - for (value, expected) in test_cases { - // Clean up environment first for each iteration - env::remove_var("OIDC_ENABLED"); - env::remove_var("OIDC_CLIENT_ID"); - env::remove_var("OIDC_CLIENT_SECRET"); - env::remove_var("OIDC_ISSUER_URL"); - env::remove_var("OIDC_REDIRECT_URI"); - env::remove_var("DATABASE_URL"); - env::remove_var("JWT_SECRET"); - - env::set_var("OIDC_ENABLED", value); + run_with_env_isolation(|| { + env::set_var("OIDC_ENABLED", "true"); + env::set_var("OIDC_CLIENT_ID", "test-client-id"); + env::set_var("OIDC_CLIENT_SECRET", "test-client-secret"); + env::set_var("OIDC_ISSUER_URL", "https://provider.example.com"); + env::set_var("OIDC_REDIRECT_URI", "http://localhost:8000/auth/oidc/callback"); env::set_var("DATABASE_URL", "postgresql://test:test@localhost/test"); env::set_var("JWT_SECRET", "test-secret"); let config = Config::from_env().unwrap(); - assert_eq!(config.oidc_enabled, expected, "Failed for value: {}", value); - env::remove_var("OIDC_ENABLED"); - env::remove_var("DATABASE_URL"); - env::remove_var("JWT_SECRET"); - } + assert!(config.oidc_enabled); + assert_eq!(config.oidc_client_id, Some("test-client-id".to_string())); + assert_eq!(config.oidc_client_secret, Some("test-client-secret".to_string())); + assert_eq!(config.oidc_issuer_url, Some("https://provider.example.com".to_string())); + assert_eq!(config.oidc_redirect_uri, Some("http://localhost:8000/auth/oidc/callback".to_string())); + }); + } + + #[test] + fn test_oidc_enabled_variations() { + run_with_env_isolation(|| { + let test_cases = vec![ + ("true", true), + ("TRUE", true), + ("1", true), + ("yes", true), + ("YES", true), + ("on", true), + ("ON", true), + ("false", false), + ("FALSE", false), + ("0", false), + ("no", false), + ("NO", false), + ("off", false), + ("OFF", false), + ("invalid", false), + ]; + + for (value, expected) in test_cases { + // Clean up environment for each iteration + env::remove_var("OIDC_ENABLED"); + env::remove_var("DATABASE_URL"); + env::remove_var("JWT_SECRET"); + + env::set_var("OIDC_ENABLED", value); + env::set_var("DATABASE_URL", "postgresql://test:test@localhost/test"); + env::set_var("JWT_SECRET", "test-secret"); + + let config = Config::from_env().unwrap(); + assert_eq!(config.oidc_enabled, expected, "Failed for value: {}", value); + } + }); } #[test] fn test_oidc_partial_config() { - // Clean up environment first to ensure test isolation - env::remove_var("OIDC_ENABLED"); - env::remove_var("OIDC_CLIENT_ID"); - env::remove_var("OIDC_CLIENT_SECRET"); - env::remove_var("OIDC_ISSUER_URL"); - env::remove_var("OIDC_REDIRECT_URI"); - env::remove_var("DATABASE_URL"); - env::remove_var("JWT_SECRET"); - - // Only set some OIDC vars - env::set_var("OIDC_ENABLED", "true"); - env::set_var("OIDC_CLIENT_ID", "test-client-id"); - // Missing OIDC_CLIENT_SECRET, OIDC_ISSUER_URL, OIDC_REDIRECT_URI - env::set_var("DATABASE_URL", "postgresql://test:test@localhost/test"); - env::set_var("JWT_SECRET", "test-secret"); + run_with_env_isolation(|| { + // Only set some OIDC vars + env::set_var("OIDC_ENABLED", "true"); + env::set_var("OIDC_CLIENT_ID", "test-client-id"); + // Missing OIDC_CLIENT_SECRET, OIDC_ISSUER_URL, OIDC_REDIRECT_URI + env::set_var("DATABASE_URL", "postgresql://test:test@localhost/test"); + env::set_var("JWT_SECRET", "test-secret"); - let config = Config::from_env().unwrap(); + let config = Config::from_env().unwrap(); - assert!(config.oidc_enabled); - assert_eq!(config.oidc_client_id, Some("test-client-id".to_string())); - assert!(config.oidc_client_secret.is_none()); - assert!(config.oidc_issuer_url.is_none()); - assert!(config.oidc_redirect_uri.is_none()); - - // Clean up - env::remove_var("OIDC_ENABLED"); - env::remove_var("OIDC_CLIENT_ID"); - env::remove_var("OIDC_CLIENT_SECRET"); - env::remove_var("OIDC_ISSUER_URL"); - env::remove_var("OIDC_REDIRECT_URI"); - env::remove_var("DATABASE_URL"); - env::remove_var("JWT_SECRET"); + assert!(config.oidc_enabled); + assert_eq!(config.oidc_client_id, Some("test-client-id".to_string())); + assert!(config.oidc_client_secret.is_none()); + assert!(config.oidc_issuer_url.is_none()); + assert!(config.oidc_redirect_uri.is_none()); + }); } #[test] fn test_oidc_disabled_with_config_present() { - // Clean up environment first to ensure test isolation - env::remove_var("OIDC_ENABLED"); - env::remove_var("OIDC_CLIENT_ID"); - env::remove_var("OIDC_CLIENT_SECRET"); - env::remove_var("OIDC_ISSUER_URL"); - env::remove_var("OIDC_REDIRECT_URI"); - env::remove_var("DATABASE_URL"); - env::remove_var("JWT_SECRET"); - - // OIDC disabled but config present - env::set_var("OIDC_ENABLED", "false"); - env::set_var("OIDC_CLIENT_ID", "test-client-id"); - env::set_var("OIDC_CLIENT_SECRET", "test-client-secret"); - env::set_var("OIDC_ISSUER_URL", "https://provider.example.com"); - env::set_var("OIDC_REDIRECT_URI", "http://localhost:8000/auth/oidc/callback"); - env::set_var("DATABASE_URL", "postgresql://test:test@localhost/test"); - env::set_var("JWT_SECRET", "test-secret"); + run_with_env_isolation(|| { + // OIDC disabled but config present + env::set_var("OIDC_ENABLED", "false"); + env::set_var("OIDC_CLIENT_ID", "test-client-id"); + env::set_var("OIDC_CLIENT_SECRET", "test-client-secret"); + env::set_var("OIDC_ISSUER_URL", "https://provider.example.com"); + env::set_var("OIDC_REDIRECT_URI", "http://localhost:8000/auth/oidc/callback"); + env::set_var("DATABASE_URL", "postgresql://test:test@localhost/test"); + env::set_var("JWT_SECRET", "test-secret"); - let config = Config::from_env().unwrap(); + let config = Config::from_env().unwrap(); - assert!(!config.oidc_enabled); - assert_eq!(config.oidc_client_id, Some("test-client-id".to_string())); - assert_eq!(config.oidc_client_secret, Some("test-client-secret".to_string())); - assert_eq!(config.oidc_issuer_url, Some("https://provider.example.com".to_string())); - assert_eq!(config.oidc_redirect_uri, Some("http://localhost:8000/auth/oidc/callback".to_string())); - - // Clean up - env::remove_var("OIDC_ENABLED"); - env::remove_var("OIDC_CLIENT_ID"); - env::remove_var("OIDC_CLIENT_SECRET"); - env::remove_var("OIDC_ISSUER_URL"); - env::remove_var("OIDC_REDIRECT_URI"); - env::remove_var("DATABASE_URL"); - env::remove_var("JWT_SECRET"); + assert!(!config.oidc_enabled); + assert_eq!(config.oidc_client_id, Some("test-client-id".to_string())); + assert_eq!(config.oidc_client_secret, Some("test-client-secret".to_string())); + assert_eq!(config.oidc_issuer_url, Some("https://provider.example.com".to_string())); + assert_eq!(config.oidc_redirect_uri, Some("http://localhost:8000/auth/oidc/callback".to_string())); + }); } #[test] fn test_oidc_empty_values() { - // Clean up environment first to ensure test isolation - env::remove_var("OIDC_ENABLED"); - env::remove_var("OIDC_CLIENT_ID"); - env::remove_var("OIDC_CLIENT_SECRET"); - env::remove_var("OIDC_ISSUER_URL"); - env::remove_var("OIDC_REDIRECT_URI"); - env::remove_var("DATABASE_URL"); - env::remove_var("JWT_SECRET"); - - env::set_var("OIDC_ENABLED", "true"); - env::set_var("OIDC_CLIENT_ID", ""); - env::set_var("OIDC_CLIENT_SECRET", ""); - env::set_var("OIDC_ISSUER_URL", ""); - env::set_var("OIDC_REDIRECT_URI", ""); - env::set_var("DATABASE_URL", "postgresql://test:test@localhost/test"); - env::set_var("JWT_SECRET", "test-secret"); + run_with_env_isolation(|| { + env::set_var("OIDC_ENABLED", "true"); + env::set_var("OIDC_CLIENT_ID", ""); + env::set_var("OIDC_CLIENT_SECRET", ""); + env::set_var("OIDC_ISSUER_URL", ""); + env::set_var("OIDC_REDIRECT_URI", ""); + env::set_var("DATABASE_URL", "postgresql://test:test@localhost/test"); + env::set_var("JWT_SECRET", "test-secret"); - let config = Config::from_env().unwrap(); + let config = Config::from_env().unwrap(); - assert!(config.oidc_enabled); - // Empty string values should be converted to Some(empty_string) - assert_eq!(config.oidc_client_id, Some("".to_string())); - assert_eq!(config.oidc_client_secret, Some("".to_string())); - assert_eq!(config.oidc_issuer_url, Some("".to_string())); - assert_eq!(config.oidc_redirect_uri, Some("".to_string())); - - // Clean up - env::remove_var("OIDC_ENABLED"); - env::remove_var("OIDC_CLIENT_ID"); - env::remove_var("OIDC_CLIENT_SECRET"); - env::remove_var("OIDC_ISSUER_URL"); - env::remove_var("OIDC_REDIRECT_URI"); - env::remove_var("DATABASE_URL"); - env::remove_var("JWT_SECRET"); + assert!(config.oidc_enabled); + // Empty string values should be converted to Some(empty_string) + assert_eq!(config.oidc_client_id, Some("".to_string())); + assert_eq!(config.oidc_client_secret, Some("".to_string())); + assert_eq!(config.oidc_issuer_url, Some("".to_string())); + assert_eq!(config.oidc_redirect_uri, Some("".to_string())); + }); } #[test] fn test_oidc_config_validation_output() { - // Clean up environment first to ensure test isolation - env::remove_var("OIDC_ENABLED"); - env::remove_var("OIDC_CLIENT_ID"); - env::remove_var("OIDC_CLIENT_SECRET"); - env::remove_var("OIDC_ISSUER_URL"); - env::remove_var("OIDC_REDIRECT_URI"); - env::remove_var("DATABASE_URL"); - env::remove_var("JWT_SECRET"); - - // Test that validation warnings are properly formatted - env::set_var("OIDC_ENABLED", "true"); - env::set_var("DATABASE_URL", "postgresql://test:test@localhost/test"); - env::set_var("JWT_SECRET", "test-secret"); - // Missing required OIDC fields + run_with_env_isolation(|| { + // Test that validation warnings are properly formatted + env::set_var("OIDC_ENABLED", "true"); + env::set_var("DATABASE_URL", "postgresql://test:test@localhost/test"); + env::set_var("JWT_SECRET", "test-secret"); + // Missing required OIDC fields - // This should succeed but show warnings - let config = Config::from_env().unwrap(); - assert!(config.oidc_enabled); - assert!(config.oidc_client_id.is_none()); - - // Clean up - env::remove_var("OIDC_ENABLED"); - env::remove_var("OIDC_CLIENT_ID"); - env::remove_var("OIDC_CLIENT_SECRET"); - env::remove_var("OIDC_ISSUER_URL"); - env::remove_var("OIDC_REDIRECT_URI"); - env::remove_var("DATABASE_URL"); - env::remove_var("JWT_SECRET"); + // This should succeed but show warnings + let config = Config::from_env().unwrap(); + assert!(config.oidc_enabled); + assert!(config.oidc_client_id.is_none()); + }); } #[test] fn test_oidc_complete_configuration() { - // Clean up environment first to ensure test isolation - env::remove_var("OIDC_ENABLED"); - env::remove_var("OIDC_CLIENT_ID"); - env::remove_var("OIDC_CLIENT_SECRET"); - env::remove_var("OIDC_ISSUER_URL"); - env::remove_var("OIDC_REDIRECT_URI"); - env::remove_var("DATABASE_URL"); - env::remove_var("JWT_SECRET"); - - env::set_var("OIDC_ENABLED", "true"); - env::set_var("OIDC_CLIENT_ID", "my-app-client-id"); - env::set_var("OIDC_CLIENT_SECRET", "super-secret-client-secret"); - env::set_var("OIDC_ISSUER_URL", "https://auth.example.com"); - env::set_var("OIDC_REDIRECT_URI", "https://myapp.com/auth/callback"); - env::set_var("DATABASE_URL", "postgresql://test:test@localhost/test"); - env::set_var("JWT_SECRET", "test-secret"); + run_with_env_isolation(|| { + env::set_var("OIDC_ENABLED", "true"); + env::set_var("OIDC_CLIENT_ID", "my-app-client-id"); + env::set_var("OIDC_CLIENT_SECRET", "super-secret-client-secret"); + env::set_var("OIDC_ISSUER_URL", "https://auth.example.com"); + env::set_var("OIDC_REDIRECT_URI", "https://myapp.com/auth/callback"); + env::set_var("DATABASE_URL", "postgresql://test:test@localhost/test"); + env::set_var("JWT_SECRET", "test-secret"); - let config = Config::from_env().unwrap(); + let config = Config::from_env().unwrap(); - assert!(config.oidc_enabled); - assert_eq!(config.oidc_client_id.unwrap(), "my-app-client-id"); - assert_eq!(config.oidc_client_secret.unwrap(), "super-secret-client-secret"); - assert_eq!(config.oidc_issuer_url.unwrap(), "https://auth.example.com"); - assert_eq!(config.oidc_redirect_uri.unwrap(), "https://myapp.com/auth/callback"); - - // Clean up - env::remove_var("OIDC_ENABLED"); - env::remove_var("OIDC_CLIENT_ID"); - env::remove_var("OIDC_CLIENT_SECRET"); - env::remove_var("OIDC_ISSUER_URL"); - env::remove_var("OIDC_REDIRECT_URI"); - env::remove_var("DATABASE_URL"); - env::remove_var("JWT_SECRET"); + assert!(config.oidc_enabled); + assert_eq!(config.oidc_client_id.unwrap(), "my-app-client-id"); + assert_eq!(config.oidc_client_secret.unwrap(), "super-secret-client-secret"); + assert_eq!(config.oidc_issuer_url.unwrap(), "https://auth.example.com"); + assert_eq!(config.oidc_redirect_uri.unwrap(), "https://myapp.com/auth/callback"); + }); } #[test] fn test_oidc_config_precedence() { - // Clean up any existing env vars first - env::remove_var("OIDC_ENABLED"); - env::remove_var("OIDC_CLIENT_ID"); - env::remove_var("OIDC_CLIENT_SECRET"); - env::remove_var("OIDC_ISSUER_URL"); - env::remove_var("OIDC_REDIRECT_URI"); - env::remove_var("DATABASE_URL"); - env::remove_var("JWT_SECRET"); - - // Test that environment variables take precedence - env::set_var("OIDC_ENABLED", "true"); - env::set_var("OIDC_CLIENT_ID", "env-client-id"); - env::set_var("DATABASE_URL", "postgresql://test:test@localhost/test"); - env::set_var("JWT_SECRET", "test-secret"); + run_with_env_isolation(|| { + // Test that environment variables take precedence + env::set_var("OIDC_ENABLED", "true"); + env::set_var("OIDC_CLIENT_ID", "env-client-id"); + env::set_var("DATABASE_URL", "postgresql://test:test@localhost/test"); + env::set_var("JWT_SECRET", "test-secret"); - let config = Config::from_env().unwrap(); + let config = Config::from_env().unwrap(); - assert!(config.oidc_enabled); - assert_eq!(config.oidc_client_id.unwrap(), "env-client-id"); - - // Clean up - env::remove_var("OIDC_ENABLED"); - env::remove_var("OIDC_CLIENT_ID"); - env::remove_var("DATABASE_URL"); - env::remove_var("JWT_SECRET"); + assert!(config.oidc_enabled); + assert_eq!(config.oidc_client_id.unwrap(), "env-client-id"); + }); } } \ No newline at end of file diff --git a/src/tests/db_tests.rs b/src/tests/db_tests.rs index 2dbef2e..123c3f7 100644 --- a/src/tests/db_tests.rs +++ b/src/tests/db_tests.rs @@ -49,6 +49,9 @@ mod tests { updated_at: Utc::now(), user_id, file_hash: Some("abcd1234567890123456789012345678901234567890123456789012345678".to_string()), + original_created_at: None, + original_modified_at: None, + source_metadata: None, } } diff --git a/src/tests/document_routes_tests.rs b/src/tests/document_routes_tests.rs index 7627db8..5582a4b 100644 --- a/src/tests/document_routes_tests.rs +++ b/src/tests/document_routes_tests.rs @@ -60,6 +60,9 @@ mod document_routes_deletion_tests { updated_at: Utc::now(), user_id, file_hash: Some("hash123".to_string()), + original_created_at: None, + original_modified_at: None, + source_metadata: None, } } @@ -394,6 +397,9 @@ mod document_routes_deletion_tests { updated_at: Utc::now(), user_id, file_hash: Some("abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890".to_string()), + original_created_at: None, + original_modified_at: None, + source_metadata: None, } } diff --git a/src/tests/documents_tests.rs b/src/tests/documents_tests.rs index 17b1050..976614e 100644 --- a/src/tests/documents_tests.rs +++ b/src/tests/documents_tests.rs @@ -26,6 +26,9 @@ fn create_test_document(user_id: Uuid) -> Document { updated_at: Utc::now(), user_id, file_hash: Some("1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef".to_string()), + original_created_at: None, + original_modified_at: None, + source_metadata: None, } } @@ -51,6 +54,9 @@ fn create_test_document_without_ocr(user_id: Uuid) -> Document { updated_at: Utc::now(), user_id, file_hash: Some("fedcba0987654321fedcba0987654321fedcba0987654321fedcba0987654321".to_string()), + original_created_at: None, + original_modified_at: None, + source_metadata: None, } } @@ -76,6 +82,9 @@ fn create_test_document_with_ocr_error(user_id: Uuid) -> Document { updated_at: Utc::now(), user_id, file_hash: Some("abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890".to_string()), + original_created_at: None, + original_modified_at: None, + source_metadata: None, } } @@ -1552,6 +1561,9 @@ mod deletion_error_handling_tests { updated_at: Utc::now(), user_id, file_hash: Some("test_hash_123456789abcdef123456789abcdef123456789abcdef123456789abcdef".to_string()), + original_created_at: None, + original_modified_at: None, + source_metadata: None, } } diff --git a/src/tests/enhanced_ocr_tests.rs b/src/tests/enhanced_ocr_tests.rs index efb17b3..00ae2ee 100644 --- a/src/tests/enhanced_ocr_tests.rs +++ b/src/tests/enhanced_ocr_tests.rs @@ -76,7 +76,7 @@ mod tests { assert_eq!(ocr_result.text.trim(), test_content); assert_eq!(ocr_result.confidence, 100.0); // Plain text should be 100% confident assert_eq!(ocr_result.word_count, 9); // "This is a test text file with multiple words" - assert!(ocr_result.processing_time_ms > 0); + assert!(ocr_result.processing_time_ms >= 0); assert!(ocr_result.preprocessing_applied.contains(&"Plain text read".to_string())); } diff --git a/src/tests/enhanced_search_tests.rs b/src/tests/enhanced_search_tests.rs index d072074..dc609ae 100644 --- a/src/tests/enhanced_search_tests.rs +++ b/src/tests/enhanced_search_tests.rs @@ -939,6 +939,9 @@ mod tests { updated_at: Utc::now(), user_id: user.id, file_hash: Some("0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef".to_string()), + original_created_at: None, + original_modified_at: None, + source_metadata: None, }; db.create_document(document).await.unwrap(); diff --git a/src/tests/file_service_tests.rs b/src/tests/file_service_tests.rs index bd82b55..a1ad8b4 100644 --- a/src/tests/file_service_tests.rs +++ b/src/tests/file_service_tests.rs @@ -82,6 +82,9 @@ mod tests { "application/pdf", user_id, Some("abcd1234hash".to_string()), + None, // original_created_at + None, // original_modified_at + None, // source_metadata ); assert_eq!(document.filename, "saved_file.pdf"); @@ -189,6 +192,9 @@ mod file_deletion_tests { updated_at: Utc::now(), user_id, file_hash: Some("hash123".to_string()), + original_created_at: None, + original_modified_at: None, + source_metadata: None, }; ( @@ -324,6 +330,9 @@ mod file_deletion_tests { updated_at: Utc::now(), user_id, file_hash: None, + original_created_at: None, + original_modified_at: None, + source_metadata: None, }; // Try to delete nonexistent files (should not fail) @@ -375,6 +384,9 @@ mod file_deletion_tests { updated_at: Utc::now(), user_id, file_hash: Some("imagehash456".to_string()), + original_created_at: None, + original_modified_at: None, + source_metadata: None, }; // Verify files exist @@ -430,6 +442,9 @@ mod file_deletion_tests { updated_at: Utc::now(), user_id, file_hash: Some("hash789".to_string()), + original_created_at: None, + original_modified_at: None, + source_metadata: None, }; // Verify files exist @@ -476,6 +491,9 @@ mod file_deletion_tests { updated_at: Utc::now(), user_id, file_hash: Some("texthash".to_string()), + original_created_at: None, + original_modified_at: None, + source_metadata: None, }; // Verify file exists diff --git a/src/tests/ignored_files_tests.rs b/src/tests/ignored_files_tests.rs index f630ea4..7128c25 100644 --- a/src/tests/ignored_files_tests.rs +++ b/src/tests/ignored_files_tests.rs @@ -81,6 +81,9 @@ mod tests { updated_at: Utc::now(), user_id, file_hash: Some("1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef".to_string()), + original_created_at: None, + original_modified_at: None, + source_metadata: None, }; sqlx::query("INSERT INTO documents (id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19)") diff --git a/src/webdav_xml_parser.rs b/src/webdav_xml_parser.rs index 328a09c..331379b 100644 --- a/src/webdav_xml_parser.rs +++ b/src/webdav_xml_parser.rs @@ -3,6 +3,7 @@ use chrono::{DateTime, Utc}; use quick_xml::events::{BytesStart, Event}; use quick_xml::reader::Reader; use std::str; +use serde_json; use crate::models::FileInfo; @@ -15,6 +16,12 @@ struct PropFindResponse { content_type: Option, etag: Option, is_collection: bool, + creation_date: Option, + owner: Option, + group: Option, + permissions: Option, + owner_display_name: Option, + metadata: Option, } pub fn parse_propfind_response(xml_text: &str) -> Result> { @@ -85,6 +92,53 @@ pub fn parse_propfind_response(xml_text: &str) -> Result> { "getetag" => { resp.etag = Some(normalize_etag(&text)); } + "creationdate" => { + resp.creation_date = Some(text.trim().to_string()); + } + "owner" => { + resp.owner = Some(text.trim().to_string()); + } + "group" => { + resp.group = Some(text.trim().to_string()); + } + _ => { + // Store any other properties as generic metadata + // This handles vendor-specific properties from any WebDAV server + if !text.trim().is_empty() && in_prop { + if resp.metadata.is_none() { + resp.metadata = Some(serde_json::Value::Object(serde_json::Map::new())); + } + + if let Some(serde_json::Value::Object(ref mut map)) = resp.metadata { + // Special handling for known properties + match current_element.as_str() { + "permissions" | "oc:permissions" => { + resp.permissions = Some(text.trim().to_string()); + map.insert("permissions_raw".to_string(), serde_json::Value::String(text.trim().to_string())); + } + "fileid" | "oc:fileid" => { + map.insert("file_id".to_string(), serde_json::Value::String(text.trim().to_string())); + } + "owner-id" | "oc:owner-id" => { + map.insert("owner_id".to_string(), serde_json::Value::String(text.trim().to_string())); + } + "owner-display-name" | "oc:owner-display-name" => { + resp.owner_display_name = Some(text.trim().to_string()); + map.insert("owner_display_name".to_string(), serde_json::Value::String(text.trim().to_string())); + } + "has-preview" | "nc:has-preview" => { + if let Ok(val) = text.trim().parse::() { + map.insert("has_preview".to_string(), serde_json::Value::Bool(val)); + } + } + _ => { + // Store any other property as-is + map.insert(current_element.clone(), serde_json::Value::String(text.trim().to_string())); + } + } + } + } + } "status" if in_propstat => { // Check if status is 200 OK if text.contains("200") { @@ -120,6 +174,33 @@ pub fn parse_propfind_response(xml_text: &str) -> Result> { .unwrap_or_else(|_| std::borrow::Cow::Borrowed(&name)) .to_string(); + // Parse creation date + let created_at = resp.creation_date + .as_ref() + .and_then(|d| parse_http_date(d)); + + // Parse permissions (Nextcloud/ownCloud format) + let permissions_int = resp.permissions + .as_ref() + .and_then(|p| { + // Nextcloud permissions are a string like "RGDNVW" + // Convert to Unix-style octal permissions + if p.chars().all(|c| c.is_uppercase()) { + // This is Nextcloud format + let mut perms = 0u32; + if p.contains('R') { perms |= 0o444; } // Read + if p.contains('W') { perms |= 0o222; } // Write + if p.contains('D') { perms |= 0o111; } // Delete (execute-like) + Some(perms) + } else { + // Try to parse as numeric + p.parse().ok() + } + }); + + // Use the metadata collected during parsing + let metadata = resp.metadata; + let file_info = FileInfo { path: resp.href.clone(), name, @@ -128,6 +209,11 @@ pub fn parse_propfind_response(xml_text: &str) -> Result> { last_modified: parse_http_date(&resp.last_modified.unwrap_or_default()), etag: resp.etag.unwrap_or_else(|| format!("\"{}\"", uuid::Uuid::new_v4())), is_directory: false, + created_at, + permissions: permissions_int, + owner: resp.owner.or(resp.owner_display_name), + group: resp.group, + metadata, }; files.push(file_info); diff --git a/tests/integration_document_upload_hash_duplicate_tests.rs b/tests/integration_document_upload_hash_duplicate_tests.rs index 01735ae..e387deb 100644 --- a/tests/integration_document_upload_hash_duplicate_tests.rs +++ b/tests/integration_document_upload_hash_duplicate_tests.rs @@ -41,6 +41,9 @@ fn create_test_document(user_id: Uuid, filename: &str, file_hash: String) -> Doc updated_at: Utc::now(), user_id, file_hash: Some(file_hash), + original_created_at: None, + original_modified_at: None, + source_metadata: None, } } diff --git a/tests/integration_file_processing_pipeline_tests.rs b/tests/integration_file_processing_pipeline_tests.rs index 0b2dd68..48a1e5a 100644 --- a/tests/integration_file_processing_pipeline_tests.rs +++ b/tests/integration_file_processing_pipeline_tests.rs @@ -239,6 +239,9 @@ impl FileProcessingTestClient { ocr_word_count: doc.ocr_word_count, ocr_processing_time_ms: doc.ocr_processing_time_ms, ocr_status: doc.ocr_status.clone(), + original_created_at: None, + original_modified_at: None, + source_metadata: None, }; return Ok(doc_copy); } diff --git a/tests/integration_hash_duplicate_detection_tests.rs b/tests/integration_hash_duplicate_detection_tests.rs index dc193ab..9442235 100644 --- a/tests/integration_hash_duplicate_detection_tests.rs +++ b/tests/integration_hash_duplicate_detection_tests.rs @@ -59,6 +59,9 @@ fn create_test_document(user_id: Uuid, filename: &str, file_hash: Option updated_at: Utc::now(), user_id, file_hash, + original_created_at: None, + original_modified_at: None, + source_metadata: None, } } @@ -248,6 +251,9 @@ async fn test_file_service_create_document_with_hash() { "application/pdf", user_id, Some(test_hash.to_string()), + None, // original_created_at + None, // original_modified_at + None, // source_metadata ); assert_eq!(document.filename, "test.pdf"); @@ -271,6 +277,9 @@ async fn test_file_service_create_document_without_hash() { "application/pdf", user_id, None, + None, // original_created_at + None, // original_modified_at + None, // source_metadata ); assert_eq!(document.filename, "test.pdf"); diff --git a/tests/integration_ignored_files_integration_tests.rs b/tests/integration_ignored_files_integration_tests.rs index be8e91c..085ce53 100644 --- a/tests/integration_ignored_files_integration_tests.rs +++ b/tests/integration_ignored_files_integration_tests.rs @@ -356,6 +356,9 @@ async fn test_create_ignored_file_from_document() -> Result<()> { updated_at: chrono::Utc::now(), user_id, file_hash: Some("document_hash_123".to_string()), + original_created_at: None, + original_modified_at: None, + source_metadata: None, }; // Insert document into database diff --git a/tests/integration_source_sync_hash_duplicate_tests.rs b/tests/integration_source_sync_hash_duplicate_tests.rs index 4925e8d..49c6666 100644 --- a/tests/integration_source_sync_hash_duplicate_tests.rs +++ b/tests/integration_source_sync_hash_duplicate_tests.rs @@ -29,6 +29,11 @@ fn create_test_file_info(name: &str, path: &str, content: &[u8]) -> FileInfo { etag: "test-etag".to_string(), mime_type: "application/pdf".to_string(), is_directory: false, + created_at: None, + permissions: None, + owner: None, + group: None, + metadata: None, } } @@ -54,6 +59,9 @@ fn create_test_document(user_id: Uuid, filename: &str, file_hash: String) -> Doc updated_at: Utc::now(), user_id, file_hash: Some(file_hash), + original_created_at: None, + original_modified_at: None, + source_metadata: None, } } diff --git a/tests/integration_webdav_hash_duplicate_tests.rs b/tests/integration_webdav_hash_duplicate_tests.rs index e50e1e9..16b4bee 100644 --- a/tests/integration_webdav_hash_duplicate_tests.rs +++ b/tests/integration_webdav_hash_duplicate_tests.rs @@ -29,6 +29,11 @@ fn create_test_file_info(name: &str, path: &str, size: i64) -> FileInfo { etag: "test-etag".to_string(), mime_type: "application/pdf".to_string(), is_directory: false, + created_at: None, + permissions: None, + owner: None, + group: None, + metadata: None, } } @@ -54,6 +59,9 @@ fn create_test_document(user_id: Uuid, filename: &str, file_hash: String) -> Doc updated_at: Utc::now(), user_id, file_hash: Some(file_hash), + original_created_at: None, + original_modified_at: None, + source_metadata: None, } } @@ -280,6 +288,11 @@ async fn test_webdav_sync_etag_change_detection() -> Result<()> { etag: new_etag.to_string(), mime_type: "application/pdf".to_string(), is_directory: false, + created_at: None, + permissions: None, + owner: None, + group: None, + metadata: None, }; // ETag comparison should detect change diff --git a/tests/unit_unit_tests.rs b/tests/unit_unit_tests.rs index 8042365..a7a1c17 100644 --- a/tests/unit_unit_tests.rs +++ b/tests/unit_unit_tests.rs @@ -25,6 +25,9 @@ fn test_document_response_conversion_with_ocr() { updated_at: Utc::now(), user_id, file_hash: Some("abc123".to_string()), + original_created_at: None, + original_modified_at: None, + source_metadata: None, }; let response: DocumentResponse = document.clone().into(); @@ -59,6 +62,9 @@ fn test_document_response_conversion_without_ocr() { updated_at: Utc::now(), user_id, file_hash: None, + original_created_at: None, + original_modified_at: None, + source_metadata: None, }; let response: DocumentResponse = document.clone().into(); diff --git a/tests/unit_webdav_enhanced_unit_tests.rs b/tests/unit_webdav_enhanced_unit_tests.rs index 5f67ad1..f6f831f 100644 --- a/tests/unit_webdav_enhanced_unit_tests.rs +++ b/tests/unit_webdav_enhanced_unit_tests.rs @@ -607,6 +607,11 @@ fn test_special_characters_in_paths() { last_modified: Some(Utc::now()), etag: "\"test123\"".to_string(), is_directory: false, + created_at: None, + permissions: None, + owner: None, + group: None, + metadata: None, }; assert!(!file_info.name.is_empty());