diff --git a/frontend/src/pages/DocumentDetailsPage.tsx b/frontend/src/pages/DocumentDetailsPage.tsx
index ffd5c29..cefc0fa 100644
--- a/frontend/src/pages/DocumentDetailsPage.tsx
+++ b/frontend/src/pages/DocumentDetailsPage.tsx
@@ -548,6 +548,66 @@ const DocumentDetailsPage: React.FC = () => {
{formatDate(document.created_at)}
+
+ {document.source_type && (
+
+
+ Source Type
+
+
+
+ )}
+
+ {document.source_path && (
+
+
+ Original Path
+
+
+ {document.source_path}
+
+
+ )}
+
+ {document.original_created_at && (
+
+
+ Original Created
+
+
+ {formatDate(document.original_created_at)}
+
+
+ )}
+
+ {document.original_modified_at && (
+
+
+ Original Modified
+
+
+ {formatDate(document.original_modified_at)}
+
+
+ )}
{document.has_ocr_text && (
@@ -829,6 +889,102 @@ const DocumentDetailsPage: React.FC = () => {
ocrError={ocrData?.ocr_error}
/>
+ {/* Source Information */}
+ {(document.source_type || document.file_permissions || document.file_owner || document.file_group) && (
+
+
+
+
+ Source Information
+
+
+
+ {document.source_type && (
+
+
+
+ Source Type
+
+
+
+
+ )}
+
+ {document.file_permissions && (
+
+
+
+ File Permissions
+
+
+ {document.file_permissions.toString(8)} ({document.file_permissions})
+
+
+
+ )}
+
+ {document.file_owner && (
+
+
+
+ File Owner
+
+
+ {document.file_owner}
+
+
+
+ )}
+
+ {document.file_group && (
+
+
+
+ File Group
+
+
+ {document.file_group}
+
+
+
+ )}
+
+ {document.source_path && (
+
+
+
+ Original Source Path
+
+
+ {document.source_path}
+
+
+
+ )}
+
+
+
+ )}
+
{/* Enhanced Metadata Display */}
{document.source_metadata && Object.keys(document.source_metadata).length > 0 && (
has_ocr_text: boolean
ocr_confidence?: number
ocr_word_count?: number
ocr_processing_time_ms?: number
ocr_status?: string
- // New metadata fields
- original_created_at?: string
- original_modified_at?: string
- source_metadata?: Record
}
export interface SearchRequest {
diff --git a/migrations/20250710000001_add_remaining_source_metadata_fields.sql b/migrations/20250710000001_add_remaining_source_metadata_fields.sql
new file mode 100644
index 0000000..aeeafc0
--- /dev/null
+++ b/migrations/20250710000001_add_remaining_source_metadata_fields.sql
@@ -0,0 +1,46 @@
+-- Add the remaining dedicated source-metadata columns to the documents table.
+-- This script only adds nullable columns, partial indexes and column comments;
+-- it contains no UPDATE, so existing rows are not backfilled from source_metadata here.
+
+-- source_path: original file location in the source system (nullable TEXT, no default)
+ALTER TABLE documents
+ADD COLUMN IF NOT EXISTS source_path TEXT;
+
+-- source_type: kind of source the file came from (e.g., 'web_upload', 'filesystem', 'webdav', 's3')
+ALTER TABLE documents
+ADD COLUMN IF NOT EXISTS source_type TEXT;
+
+-- file_permissions: Unix mode bits from the source system, stored as a plain INTEGER
+ALTER TABLE documents
+ADD COLUMN IF NOT EXISTS file_permissions INTEGER;
+
+-- file_owner: username or uid from the source system, kept as free-form TEXT
+ALTER TABLE documents
+ADD COLUMN IF NOT EXISTS file_owner TEXT;
+
+-- file_group: groupname or gid from the source system, kept as free-form TEXT
+ALTER TABLE documents
+ADD COLUMN IF NOT EXISTS file_group TEXT;
+
+-- Partial indexes: each one covers only rows where its column IS NOT NULL
+CREATE INDEX IF NOT EXISTS idx_documents_source_path ON documents(source_path)
+WHERE source_path IS NOT NULL;
+
+CREATE INDEX IF NOT EXISTS idx_documents_source_type ON documents(source_type)
+WHERE source_type IS NOT NULL;
+
+CREATE INDEX IF NOT EXISTS idx_documents_file_permissions ON documents(file_permissions)
+WHERE file_permissions IS NOT NULL;
+
+CREATE INDEX IF NOT EXISTS idx_documents_file_owner ON documents(file_owner)
+WHERE file_owner IS NOT NULL;
+
+CREATE INDEX IF NOT EXISTS idx_documents_file_group ON documents(file_group)
+WHERE file_group IS NOT NULL;
+
+-- Attach a description to each new column in the database catalog
+COMMENT ON COLUMN documents.source_path IS 'Original path where the file was located in the source system';
+COMMENT ON COLUMN documents.source_type IS 'Type of source where file was ingested from (web_upload, filesystem, webdav, s3, etc.)';
+COMMENT ON COLUMN documents.file_permissions IS 'File permissions from source system (Unix mode bits)';
+COMMENT ON COLUMN documents.file_owner IS 'File owner from source system (username or uid)';
+COMMENT ON COLUMN documents.file_group IS 'File group from source system (groupname or gid)';
\ No newline at end of file
diff --git a/src/db/documents/helpers.rs b/src/db/documents/helpers.rs
index a082056..de1df24 100644
--- a/src/db/documents/helpers.rs
+++ b/src/db/documents/helpers.rs
@@ -9,7 +9,8 @@ pub const DOCUMENT_FIELDS: &str = r#"
content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms,
ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason,
tags, created_at, updated_at, user_id, file_hash, original_created_at,
- original_modified_at, source_metadata
+ original_modified_at, source_path, source_type, source_id, file_permissions,
+ file_owner, file_group, source_metadata
"#;
/// Maps a database row to a Document struct
@@ -39,6 +40,12 @@ pub fn map_row_to_document(row: &sqlx::postgres::PgRow) -> Document {
file_hash: row.get("file_hash"),
original_created_at: row.get("original_created_at"),
original_modified_at: row.get("original_modified_at"),
+ source_path: row.get("source_path"),
+ source_type: row.get("source_type"),
+ source_id: row.get("source_id"),
+ file_permissions: row.get("file_permissions"),
+ file_owner: row.get("file_owner"),
+ file_group: row.get("file_group"),
source_metadata: row.get("source_metadata"),
}
}
diff --git a/src/ingestion/document_ingestion.rs b/src/ingestion/document_ingestion.rs
index 1b30e92..c0b4b1d 100644
--- a/src/ingestion/document_ingestion.rs
+++ b/src/ingestion/document_ingestion.rs
@@ -54,6 +54,15 @@ pub struct DocumentIngestionRequest {
/// Optional metadata from source file system
pub original_created_at: Option>,
pub original_modified_at: Option>,
+ /// Original file path in source system
+ pub source_path: Option,
+ /// File permissions from source system (Unix mode bits)
+ pub file_permissions: Option,
+ /// File owner from source system
+ pub file_owner: Option,
+ /// File group from source system
+ pub file_group: Option,
+ /// Additional metadata from source system (EXIF, PDF metadata, etc.)
pub source_metadata: Option,
}
@@ -112,6 +121,9 @@ impl DocumentIngestionService {
pub async fn ingest_document(&self, request: DocumentIngestionRequest) -> Result> {
let file_hash = self.calculate_file_hash(&request.file_data);
let file_size = request.file_data.len() as i64;
+
+ // Clone source_type early for error handling
+ let source_type_for_error = request.source_type.clone();
debug!(
"Ingesting document: {} for user {} (hash: {}, size: {} bytes, policy: {:?})",
@@ -184,7 +196,7 @@ impl DocumentIngestionService {
failure_reason: "storage_error".to_string(),
failure_stage: "storage".to_string(),
existing_document_id: None,
- ingestion_source: request.source_type.unwrap_or_else(|| "upload".to_string()),
+ ingestion_source: source_type_for_error.clone().unwrap_or_else(|| "upload".to_string()),
error_message: Some(e.to_string()),
retry_count: Some(0),
last_retry_at: None,
@@ -211,6 +223,12 @@ impl DocumentIngestionService {
Some(file_hash.clone()),
request.original_created_at,
request.original_modified_at,
+ request.source_path,
+ request.source_type,
+ request.source_id,
+ request.file_permissions,
+ request.file_owner,
+ request.file_group,
request.source_metadata,
);
@@ -264,7 +282,7 @@ impl DocumentIngestionService {
failure_reason: "database_error".to_string(),
failure_stage: "ingestion".to_string(),
existing_document_id: None,
- ingestion_source: request.source_type.unwrap_or_else(|| "upload".to_string()),
+ ingestion_source: source_type_for_error.clone().unwrap_or_else(|| "upload".to_string()),
error_message: Some(e.to_string()),
retry_count: Some(0),
last_retry_at: None,
@@ -321,6 +339,10 @@ impl DocumentIngestionService {
source_id,
original_created_at,
original_modified_at,
+ source_path: Some(file_info.path.clone()),
+ file_permissions: file_info.permissions.map(|p| p as i32),
+ file_owner: file_info.owner.clone(),
+ file_group: file_info.group.clone(),
source_metadata,
};
@@ -346,6 +368,10 @@ impl DocumentIngestionService {
source_id: None,
original_created_at: None,
original_modified_at: None,
+ source_path: None, // Direct uploads don't have a source path
+ file_permissions: None, // Direct uploads don't preserve permissions
+ file_owner: None, // Direct uploads don't preserve owner
+ file_group: None, // Direct uploads don't preserve group
source_metadata: None,
};
@@ -373,6 +399,10 @@ impl DocumentIngestionService {
source_id: Some(source_id),
original_created_at: None,
original_modified_at: None,
+ source_path: None, // Source sync files don't have a source path
+ file_permissions: None, // Source sync files don't preserve permissions
+ file_owner: None, // Source sync files don't preserve owner
+ file_group: None, // Source sync files don't preserve group
source_metadata: None,
};
@@ -399,6 +429,10 @@ impl DocumentIngestionService {
source_id: Some(webdav_source_id),
original_created_at: None,
original_modified_at: None,
+ source_path: None, // WebDAV files don't have a source path in this method
+ file_permissions: None, // WebDAV files don't preserve permissions in this method
+ file_owner: None, // WebDAV files don't preserve owner in this method
+ file_group: None, // WebDAV files don't preserve group in this method
source_metadata: None,
};
@@ -424,6 +458,10 @@ impl DocumentIngestionService {
source_id: None,
original_created_at: None,
original_modified_at: None,
+ source_path: None, // Batch files don't have a source path
+ file_permissions: None, // Batch files don't preserve permissions
+ file_owner: None, // Batch files don't preserve owner
+ file_group: None, // Batch files don't preserve group
source_metadata: None,
};
diff --git a/src/models/document.rs b/src/models/document.rs
index 0eb019c..5705962 100644
--- a/src/models/document.rs
+++ b/src/models/document.rs
@@ -32,7 +32,19 @@ pub struct Document {
pub original_created_at: Option>,
/// Original file modification timestamp from source system
pub original_modified_at: Option>,
- /// Additional metadata from source system (permissions, attributes, EXIF data, etc.)
+ /// Original path where the file was located (from source system)
+ pub source_path: Option,
+ /// Type of source where file was ingested from (e.g., "web_upload", "filesystem", "webdav")
+ pub source_type: Option,
+ /// UUID of the source system/configuration
+ pub source_id: Option,
+ /// File permissions from source system (Unix mode bits)
+ pub file_permissions: Option,
+ /// File owner from source system (username or uid)
+ pub file_owner: Option,
+ /// File group from source system (groupname or gid)
+ pub file_group: Option,
+ /// Additional metadata from source system (EXIF data, PDF metadata, custom attributes, etc.)
pub source_metadata: Option,
}
diff --git a/src/models/responses.rs b/src/models/responses.rs
index 1aa1c76..e3b6f49 100644
--- a/src/models/responses.rs
+++ b/src/models/responses.rs
@@ -34,6 +34,8 @@ pub struct DocumentResponse {
pub filename: String,
/// Original filename when uploaded
pub original_filename: String,
+ /// File path where the document is stored
+ pub file_path: String,
/// File size in bytes
pub file_size: i64,
/// MIME type of the file
@@ -45,6 +47,13 @@ pub struct DocumentResponse {
pub labels: Vec,
/// When the document was created
pub created_at: DateTime,
+ /// When the document was last updated
+ pub updated_at: DateTime,
+ /// User who uploaded/owns the document
+ pub user_id: Uuid,
+ /// SHA256 hash of the file content
+ #[serde(skip_serializing_if = "Option::is_none", default)]
+ pub file_hash: Option,
/// Whether OCR text has been extracted
pub has_ocr_text: bool,
/// OCR confidence score (0-100, higher is better)
@@ -61,7 +70,25 @@ pub struct DocumentResponse {
/// Original file modification timestamp from source system
#[serde(skip_serializing_if = "Option::is_none", default)]
pub original_modified_at: Option>,
- /// Additional metadata from source system (permissions, attributes, etc.)
+ /// Original path where the file was located (from source system)
+ #[serde(skip_serializing_if = "Option::is_none", default)]
+ pub source_path: Option,
+ /// Type of source where file was ingested from
+ #[serde(skip_serializing_if = "Option::is_none", default)]
+ pub source_type: Option,
+ /// UUID of the source system/configuration
+ #[serde(skip_serializing_if = "Option::is_none", default)]
+ pub source_id: Option,
+ /// File permissions from source system (Unix mode bits)
+ #[serde(skip_serializing_if = "Option::is_none", default)]
+ pub file_permissions: Option,
+ /// File owner from source system
+ #[serde(skip_serializing_if = "Option::is_none", default)]
+ pub file_owner: Option,
+ /// File group from source system
+ #[serde(skip_serializing_if = "Option::is_none", default)]
+ pub file_group: Option,
+ /// Additional metadata from source system (EXIF data, PDF metadata, custom attributes, etc.)
#[serde(skip_serializing_if = "Option::is_none", default)]
pub source_metadata: Option,
}
@@ -236,11 +263,15 @@ impl From for DocumentResponse {
id: doc.id,
filename: doc.filename,
original_filename: doc.original_filename,
+ file_path: doc.file_path,
file_size: doc.file_size,
mime_type: doc.mime_type,
tags: doc.tags,
labels: Vec::new(), // Labels will be populated separately where needed
created_at: doc.created_at,
+ updated_at: doc.updated_at,
+ user_id: doc.user_id,
+ file_hash: doc.file_hash,
has_ocr_text: doc.ocr_text.is_some(),
ocr_confidence: doc.ocr_confidence,
ocr_word_count: doc.ocr_word_count,
@@ -248,6 +279,12 @@ impl From for DocumentResponse {
ocr_status: doc.ocr_status,
original_created_at: doc.original_created_at,
original_modified_at: doc.original_modified_at,
+ source_path: doc.source_path,
+ source_type: doc.source_type,
+ source_id: doc.source_id,
+ file_permissions: doc.file_permissions,
+ file_owner: doc.file_owner,
+ file_group: doc.file_group,
source_metadata: doc.source_metadata,
}
}
diff --git a/src/routes/documents/crud.rs b/src/routes/documents/crud.rs
index 2b97ef1..1f33b69 100644
--- a/src/routes/documents/crud.rs
+++ b/src/routes/documents/crud.rs
@@ -92,6 +92,10 @@ pub async fn upload_document(
deduplication_policy: crate::ingestion::document_ingestion::DeduplicationPolicy::Skip,
original_created_at: None,
original_modified_at: None,
+ source_path: None, // Web uploads don't have a source path
+ file_permissions: None, // Web uploads don't preserve permissions
+ file_owner: None, // Web uploads don't preserve owner
+ file_group: None, // Web uploads don't preserve group
source_metadata: None,
};
diff --git a/src/services/file_service.rs b/src/services/file_service.rs
index 6a293cb..514083d 100644
--- a/src/services/file_service.rs
+++ b/src/services/file_service.rs
@@ -159,6 +159,12 @@ impl FileService {
file_hash: Option,
original_created_at: Option>,
original_modified_at: Option>,
+ source_path: Option,
+ source_type: Option,
+ source_id: Option,
+ file_permissions: Option,
+ file_owner: Option,
+ file_group: Option,
source_metadata: Option,
) -> Document {
Document {
@@ -185,6 +191,12 @@ impl FileService {
file_hash,
original_created_at,
original_modified_at,
+ source_path,
+ source_type,
+ source_id,
+ file_permissions,
+ file_owner,
+ file_group,
source_metadata,
}
}