From 046577789001ad75115a8a7f4fdbe77b78af61cd Mon Sep 17 00:00:00 2001 From: perf3ct Date: Thu, 10 Jul 2025 21:02:15 +0000 Subject: [PATCH] feat(client): show more fields for Documents --- frontend/src/pages/DocumentDetailsPage.tsx | 156 ++++++++++++++++++ frontend/src/services/api.ts | 18 +- ...1_add_remaining_source_metadata_fields.sql | 46 ++++++ src/db/documents/helpers.rs | 9 +- src/ingestion/document_ingestion.rs | 42 ++++- src/models/document.rs | 14 +- src/models/responses.rs | 39 ++++- src/routes/documents/crud.rs | 4 + src/services/file_service.rs | 12 ++ 9 files changed, 329 insertions(+), 11 deletions(-) create mode 100644 migrations/20250710000001_add_remaining_source_metadata_fields.sql diff --git a/frontend/src/pages/DocumentDetailsPage.tsx b/frontend/src/pages/DocumentDetailsPage.tsx index ffd5c29..cefc0fa 100644 --- a/frontend/src/pages/DocumentDetailsPage.tsx +++ b/frontend/src/pages/DocumentDetailsPage.tsx @@ -548,6 +548,66 @@ const DocumentDetailsPage: React.FC = () => { {formatDate(document.created_at)} + + {document.source_type && ( + + + Source Type + + + + )} + + {document.source_path && ( + + + Original Path + + + {document.source_path} + + + )} + + {document.original_created_at && ( + + + Original Created + + + {formatDate(document.original_created_at)} + + + )} + + {document.original_modified_at && ( + + + Original Modified + + + {formatDate(document.original_modified_at)} + + + )} {document.has_ocr_text && ( @@ -829,6 +889,102 @@ const DocumentDetailsPage: React.FC = () => { ocrError={ocrData?.ocr_error} /> + {/* Source Information */} + {(document.source_type || document.file_permissions || document.file_owner || document.file_group) && ( + + + + + Source Information + + + + {document.source_type && ( + + + + Source Type + + + + + )} + + {document.file_permissions && ( + + + + File Permissions + + + {document.file_permissions.toString(8)} ({document.file_permissions}) + + + + )} + + {document.file_owner && ( + + + + File Owner + + + {document.file_owner} + + + + )} + + {document.file_group && ( + + + + File Group + + + {document.file_group} + + + + )} + + {document.source_path && ( + + + + Original Source Path + + + {document.source_path} + + + + )} + + + + )} + {/* Enhanced Metadata Display */} {document.source_metadata && Object.keys(document.source_metadata).length > 0 && ( has_ocr_text: boolean ocr_confidence?: number ocr_word_count?: number ocr_processing_time_ms?: number ocr_status?: string - // New metadata fields - original_created_at?: string - original_modified_at?: string - source_metadata?: Record } export interface SearchRequest { diff --git a/migrations/20250710000001_add_remaining_source_metadata_fields.sql b/migrations/20250710000001_add_remaining_source_metadata_fields.sql new file mode 100644 index 0000000..aeeafc0 --- /dev/null +++ b/migrations/20250710000001_add_remaining_source_metadata_fields.sql @@ -0,0 +1,46 @@ +-- Add remaining dedicated metadata fields to documents table +-- These fields extract commonly used metadata from source_metadata JSON +-- into dedicated columns for better querying and indexing + +-- Add source path (original file location from source system) +ALTER TABLE documents +ADD COLUMN IF NOT EXISTS source_path TEXT; + +-- Add source type (e.g., 'web_upload', 'filesystem', 'webdav', 's3') +ALTER TABLE documents +ADD COLUMN IF NOT EXISTS source_type TEXT; + +-- Add file permissions (Unix mode bits from source system) +ALTER TABLE documents +ADD COLUMN IF NOT EXISTS file_permissions INTEGER; + +-- Add file owner (username or uid from source system) +ALTER TABLE documents +ADD COLUMN IF NOT EXISTS file_owner TEXT; + +-- Add file group (groupname or gid from source system) +ALTER TABLE documents +ADD COLUMN IF NOT EXISTS file_group TEXT; + +-- Create indexes for efficient querying +CREATE INDEX IF NOT EXISTS idx_documents_source_path ON documents(source_path) +WHERE source_path IS NOT NULL; + +CREATE INDEX IF NOT EXISTS idx_documents_source_type ON documents(source_type) +WHERE source_type IS NOT NULL; + +CREATE INDEX IF NOT EXISTS idx_documents_file_permissions ON documents(file_permissions) +WHERE file_permissions IS NOT NULL; + +CREATE INDEX IF NOT EXISTS idx_documents_file_owner ON documents(file_owner) +WHERE file_owner IS NOT NULL; + +CREATE INDEX IF NOT EXISTS idx_documents_file_group ON documents(file_group) +WHERE file_group IS NOT NULL; + +-- Add helpful comments +COMMENT ON COLUMN documents.source_path IS 'Original path where the file was located in the source system'; +COMMENT ON COLUMN documents.source_type IS 'Type of source where file was ingested from (web_upload, filesystem, webdav, s3, etc.)'; +COMMENT ON COLUMN documents.file_permissions IS 'File permissions from source system (Unix mode bits)'; +COMMENT ON COLUMN documents.file_owner IS 'File owner from source system (username or uid)'; +COMMENT ON COLUMN documents.file_group IS 'File group from source system (groupname or gid)'; \ No newline at end of file diff --git a/src/db/documents/helpers.rs b/src/db/documents/helpers.rs index a082056..de1df24 100644 --- a/src/db/documents/helpers.rs +++ b/src/db/documents/helpers.rs @@ -9,7 +9,8 @@ pub const DOCUMENT_FIELDS: &str = r#" content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, - original_modified_at, source_metadata + original_modified_at, source_path, source_type, source_id, file_permissions, + file_owner, file_group, source_metadata "#; /// Maps a database row to a Document struct @@ -39,6 +40,12 @@ pub fn map_row_to_document(row: &sqlx::postgres::PgRow) -> Document { file_hash: row.get("file_hash"), original_created_at: row.get("original_created_at"), original_modified_at: row.get("original_modified_at"), + source_path: row.get("source_path"), + source_type: row.get("source_type"), + source_id: row.get("source_id"), + file_permissions: row.get("file_permissions"), + file_owner: row.get("file_owner"), + file_group: row.get("file_group"), source_metadata: row.get("source_metadata"), } } diff --git a/src/ingestion/document_ingestion.rs b/src/ingestion/document_ingestion.rs index 1b30e92..c0b4b1d 100644 --- a/src/ingestion/document_ingestion.rs +++ b/src/ingestion/document_ingestion.rs @@ -54,6 +54,15 @@ pub struct DocumentIngestionRequest { /// Optional metadata from source file system pub original_created_at: Option>, pub original_modified_at: Option>, + /// Original file path in source system + pub source_path: Option, + /// File permissions from source system (Unix mode bits) + pub file_permissions: Option, + /// File owner from source system + pub file_owner: Option, + /// File group from source system + pub file_group: Option, + /// Additional metadata from source system (EXIF, PDF metadata, etc.) pub source_metadata: Option, } @@ -112,6 +121,9 @@ impl DocumentIngestionService { pub async fn ingest_document(&self, request: DocumentIngestionRequest) -> Result> { let file_hash = self.calculate_file_hash(&request.file_data); let file_size = request.file_data.len() as i64; + + // Clone source_type early for error handling + let source_type_for_error = request.source_type.clone(); debug!( "Ingesting document: {} for user {} (hash: {}, size: {} bytes, policy: {:?})", @@ -184,7 +196,7 @@ impl DocumentIngestionService { failure_reason: "storage_error".to_string(), failure_stage: "storage".to_string(), existing_document_id: None, - ingestion_source: request.source_type.unwrap_or_else(|| "upload".to_string()), + ingestion_source: source_type_for_error.clone().unwrap_or_else(|| "upload".to_string()), error_message: Some(e.to_string()), retry_count: Some(0), last_retry_at: None, @@ -211,6 +223,12 @@ impl DocumentIngestionService { Some(file_hash.clone()), request.original_created_at, request.original_modified_at, + request.source_path, + request.source_type, + request.source_id, + request.file_permissions, + request.file_owner, + request.file_group, request.source_metadata, ); @@ -264,7 +282,7 @@ impl DocumentIngestionService { failure_reason: "database_error".to_string(), failure_stage: "ingestion".to_string(), existing_document_id: None, - ingestion_source: request.source_type.unwrap_or_else(|| "upload".to_string()), + ingestion_source: source_type_for_error.clone().unwrap_or_else(|| "upload".to_string()), error_message: Some(e.to_string()), retry_count: Some(0), last_retry_at: None, @@ -321,6 +339,10 @@ impl DocumentIngestionService { source_id, original_created_at, original_modified_at, + source_path: Some(file_info.path.clone()), + file_permissions: file_info.permissions.map(|p| p as i32), + file_owner: file_info.owner.clone(), + file_group: file_info.group.clone(), source_metadata, }; @@ -346,6 +368,10 @@ impl DocumentIngestionService { source_id: None, original_created_at: None, original_modified_at: None, + source_path: None, // Direct uploads don't have a source path + file_permissions: None, // Direct uploads don't preserve permissions + file_owner: None, // Direct uploads don't preserve owner + file_group: None, // Direct uploads don't preserve group source_metadata: None, }; @@ -373,6 +399,10 @@ impl DocumentIngestionService { source_id: Some(source_id), original_created_at: None, original_modified_at: None, + source_path: None, // Source sync files don't have a source path + file_permissions: None, // Source sync files don't preserve permissions + file_owner: None, // Source sync files don't preserve owner + file_group: None, // Source sync files don't preserve group source_metadata: None, }; @@ -399,6 +429,10 @@ impl DocumentIngestionService { source_id: Some(webdav_source_id), original_created_at: None, original_modified_at: None, + source_path: None, // WebDAV files don't have a source path in this method + file_permissions: None, // WebDAV files don't preserve permissions in this method + file_owner: None, // WebDAV files don't preserve owner in this method + file_group: None, // WebDAV files don't preserve group in this method source_metadata: None, }; @@ -424,6 +458,10 @@ impl DocumentIngestionService { source_id: None, original_created_at: None, original_modified_at: None, + source_path: None, // Batch files don't have a source path + file_permissions: None, // Batch files don't preserve permissions + file_owner: None, // Batch files don't preserve owner + file_group: None, // Batch files don't preserve group source_metadata: None, }; diff --git a/src/models/document.rs b/src/models/document.rs index 0eb019c..5705962 100644 --- a/src/models/document.rs +++ b/src/models/document.rs @@ -32,7 +32,19 @@ pub struct Document { pub original_created_at: Option>, /// Original file modification timestamp from source system pub original_modified_at: Option>, - /// Additional metadata from source system (permissions, attributes, EXIF data, etc.) + /// Original path where the file was located (from source system) + pub source_path: Option, + /// Type of source where file was ingested from (e.g., "web_upload", "filesystem", "webdav") + pub source_type: Option, + /// UUID of the source system/configuration + pub source_id: Option, + /// File permissions from source system (Unix mode bits) + pub file_permissions: Option, + /// File owner from source system (username or uid) + pub file_owner: Option, + /// File group from source system (groupname or gid) + pub file_group: Option, + /// Additional metadata from source system (EXIF data, PDF metadata, custom attributes, etc.) pub source_metadata: Option, } diff --git a/src/models/responses.rs b/src/models/responses.rs index 1aa1c76..e3b6f49 100644 --- a/src/models/responses.rs +++ b/src/models/responses.rs @@ -34,6 +34,8 @@ pub struct DocumentResponse { pub filename: String, /// Original filename when uploaded pub original_filename: String, + /// File path where the document is stored + pub file_path: String, /// File size in bytes pub file_size: i64, /// MIME type of the file @@ -45,6 +47,13 @@ pub struct DocumentResponse { pub labels: Vec, /// When the document was created pub created_at: DateTime, + /// When the document was last updated + pub updated_at: DateTime, + /// User who uploaded/owns the document + pub user_id: Uuid, + /// SHA256 hash of the file content + #[serde(skip_serializing_if = "Option::is_none", default)] + pub file_hash: Option, /// Whether OCR text has been extracted pub has_ocr_text: bool, /// OCR confidence score (0-100, higher is better) @@ -61,7 +70,25 @@ pub struct DocumentResponse { /// Original file modification timestamp from source system #[serde(skip_serializing_if = "Option::is_none", default)] pub original_modified_at: Option>, - /// Additional metadata from source system (permissions, attributes, etc.) + /// Original path where the file was located (from source system) + #[serde(skip_serializing_if = "Option::is_none", default)] + pub source_path: Option, + /// Type of source where file was ingested from + #[serde(skip_serializing_if = "Option::is_none", default)] + pub source_type: Option, + /// UUID of the source system/configuration + #[serde(skip_serializing_if = "Option::is_none", default)] + pub source_id: Option, + /// File permissions from source system (Unix mode bits) + #[serde(skip_serializing_if = "Option::is_none", default)] + pub file_permissions: Option, + /// File owner from source system + #[serde(skip_serializing_if = "Option::is_none", default)] + pub file_owner: Option, + /// File group from source system + #[serde(skip_serializing_if = "Option::is_none", default)] + pub file_group: Option, + /// Additional metadata from source system (EXIF data, PDF metadata, custom attributes, etc.) #[serde(skip_serializing_if = "Option::is_none", default)] pub source_metadata: Option, } @@ -236,11 +263,15 @@ impl From for DocumentResponse { id: doc.id, filename: doc.filename, original_filename: doc.original_filename, + file_path: doc.file_path, file_size: doc.file_size, mime_type: doc.mime_type, tags: doc.tags, labels: Vec::new(), // Labels will be populated separately where needed created_at: doc.created_at, + updated_at: doc.updated_at, + user_id: doc.user_id, + file_hash: doc.file_hash, has_ocr_text: doc.ocr_text.is_some(), ocr_confidence: doc.ocr_confidence, ocr_word_count: doc.ocr_word_count, @@ -248,6 +279,12 @@ impl From for DocumentResponse { ocr_status: doc.ocr_status, original_created_at: doc.original_created_at, original_modified_at: doc.original_modified_at, + source_path: doc.source_path, + source_type: doc.source_type, + source_id: doc.source_id, + file_permissions: doc.file_permissions, + file_owner: doc.file_owner, + file_group: doc.file_group, source_metadata: doc.source_metadata, } } diff --git a/src/routes/documents/crud.rs b/src/routes/documents/crud.rs index 2b97ef1..1f33b69 100644 --- a/src/routes/documents/crud.rs +++ b/src/routes/documents/crud.rs @@ -92,6 +92,10 @@ pub async fn upload_document( deduplication_policy: crate::ingestion::document_ingestion::DeduplicationPolicy::Skip, original_created_at: None, original_modified_at: None, + source_path: None, // Web uploads don't have a source path + file_permissions: None, // Web uploads don't preserve permissions + file_owner: None, // Web uploads don't preserve owner + file_group: None, // Web uploads don't preserve group source_metadata: None, }; diff --git a/src/services/file_service.rs b/src/services/file_service.rs index 6a293cb..514083d 100644 --- a/src/services/file_service.rs +++ b/src/services/file_service.rs @@ -159,6 +159,12 @@ impl FileService { file_hash: Option, original_created_at: Option>, original_modified_at: Option>, + source_path: Option, + source_type: Option, + source_id: Option, + file_permissions: Option, + file_owner: Option, + file_group: Option, source_metadata: Option, ) -> Document { Document { @@ -185,6 +191,12 @@ impl FileService { file_hash, original_created_at, original_modified_at, + source_path, + source_type, + source_id, + file_permissions, + file_owner, + file_group, source_metadata, } }