feat(server/client): add metadata to file view

perf3ct 2025-06-30 19:13:16 +00:00
parent 1695787f1d
commit 9e43df2fbe
31 changed files with 1007 additions and 297 deletions

View File

@ -0,0 +1,232 @@
import React, { useState } from 'react';
import {
Box,
Typography,
Accordion,
AccordionSummary,
AccordionDetails,
Chip,
Grid,
} from '@mui/material';
import {
ExpandMore as ExpandMoreIcon,
Security as PermissionsIcon,
Person as OwnerIcon,
Group as GroupIcon,
Storage as StorageIcon,
Info as InfoIcon,
} from '@mui/icons-material';
interface MetadataDisplayProps {
metadata: any;
title?: string;
compact?: boolean;
}
const MetadataDisplay: React.FC<MetadataDisplayProps> = ({
metadata,
title = "Source Metadata",
compact = false,
}) => {
const [expanded, setExpanded] = useState(!compact);
if (!metadata || Object.keys(metadata).length === 0) {
return null;
}
const formatValue = (key: string, value: any): React.ReactNode => {
// Handle special metadata fields with better formatting
if (key === 'permissions' && typeof value === 'number') {
return (
<Box sx={{ display: 'flex', alignItems: 'center', gap: 1 }}>
<PermissionsIcon sx={{ fontSize: 16, color: 'primary.main' }} />
<Typography variant="body2" component="span">
{value.toString(8)} (octal)
</Typography>
</Box>
);
}
if (key === 'owner' || key === 'uid') {
return (
<Box sx={{ display: 'flex', alignItems: 'center', gap: 1 }}>
<OwnerIcon sx={{ fontSize: 16, color: 'primary.main' }} />
<Typography variant="body2" component="span">
{value}
</Typography>
</Box>
);
}
if (key === 'group' || key === 'gid') {
return (
<Box sx={{ display: 'flex', alignItems: 'center', gap: 1 }}>
<GroupIcon sx={{ fontSize: 16, color: 'primary.main' }} />
<Typography variant="body2" component="span">
{value}
</Typography>
</Box>
);
}
if (key === 'storage_class' || key === 'region') {
return (
<Box sx={{ display: 'flex', alignItems: 'center', gap: 1 }}>
<StorageIcon sx={{ fontSize: 16, color: 'primary.main' }} />
<Typography variant="body2" component="span">
{value}
</Typography>
</Box>
);
}
// Handle arrays
if (Array.isArray(value)) {
return (
<Box sx={{ display: 'flex', flexWrap: 'wrap', gap: 0.5 }}>
{value.map((item, index) => (
<Chip
key={index}
label={String(item)}
size="small"
variant="outlined"
/>
))}
</Box>
);
}
// Handle objects
if (typeof value === 'object' && value !== null) {
return (
<Box sx={{
backgroundColor: 'grey.100',
p: 1,
borderRadius: 1,
fontFamily: 'monospace',
fontSize: '0.75rem',
maxHeight: '100px',
overflow: 'auto'
}}>
<pre style={{ margin: 0, whiteSpace: 'pre-wrap' }}>
{JSON.stringify(value, null, 2)}
</pre>
</Box>
);
}
// Handle boolean values
if (typeof value === 'boolean') {
return (
<Chip
label={value ? 'Yes' : 'No'}
color={value ? 'success' : 'default'}
size="small"
variant="outlined"
/>
);
}
// Handle dates
if (typeof value === 'string' && (
key.includes('date') ||
key.includes('time') ||
key.includes('created') ||
key.includes('modified')
)) {
try {
const date = new Date(value);
if (!isNaN(date.getTime())) {
return (
<Typography variant="body2" component="span">
{date.toLocaleString()}
</Typography>
);
}
} catch {
// Fall through to default handling
}
}
// Default: display as string
return (
<Typography variant="body2" component="span">
{String(value)}
</Typography>
);
};
const formatKeyName = (key: string): string => {
// Convert snake_case and camelCase to Title Case
return key
.replace(/([a-z])([A-Z])/g, '$1 $2') // camelCase to spaces
.replace(/_/g, ' ') // snake_case to spaces
.replace(/\b\w/g, (letter) => letter.toUpperCase()); // Title Case
};
const renderMetadata = () => {
return (
<Grid container spacing={2}>
{Object.entries(metadata).map(([key, value]) => (
<Grid item xs={12} sm={6} key={key}>
<Box sx={{ mb: 1 }}>
<Typography
variant="caption"
color="text.secondary"
sx={{ fontWeight: 600, textTransform: 'uppercase', letterSpacing: 0.5 }}
>
{formatKeyName(key)}
</Typography>
</Box>
<Box sx={{ pl: 1 }}>
{formatValue(key, value)}
</Box>
</Grid>
))}
</Grid>
);
};
if (compact) {
return (
<Accordion expanded={expanded} onChange={(_, isExpanded) => setExpanded(isExpanded)}>
<AccordionSummary
expandIcon={<ExpandMoreIcon />}
sx={{
backgroundColor: 'grey.50',
'&:hover': { backgroundColor: 'grey.100' }
}}
>
<Box sx={{ display: 'flex', alignItems: 'center', gap: 1 }}>
<InfoIcon sx={{ fontSize: 20, color: 'primary.main' }} />
<Typography variant="subtitle2" sx={{ fontWeight: 600 }}>
{title}
</Typography>
<Chip
label={`${Object.keys(metadata).length} fields`}
size="small"
variant="outlined"
/>
</Box>
</AccordionSummary>
<AccordionDetails>
{renderMetadata()}
</AccordionDetails>
</Accordion>
);
}
return (
<Box>
<Box sx={{ display: 'flex', alignItems: 'center', gap: 1, mb: 2 }}>
<InfoIcon sx={{ color: 'primary.main' }} />
<Typography variant="subtitle2" sx={{ fontWeight: 600 }}>
{title}
</Typography>
</Box>
{renderMetadata()}
</Box>
);
};
export default MetadataDisplay;

View File

@ -35,11 +35,16 @@ import {
Search as SearchIcon,
Edit as EditIcon,
PhotoFilter as ProcessedImageIcon,
Source as SourceIcon,
AccessTime as AccessTimeIcon,
Create as CreateIcon,
Info as InfoIcon,
} from '@mui/icons-material';
import { documentService, OcrResponse } from '../services/api';
import DocumentViewer from '../components/DocumentViewer';
import LabelSelector from '../components/Labels/LabelSelector';
import { type LabelData } from '../components/Labels/Label';
import MetadataDisplay from '../components/MetadataDisplay';
import api from '../services/api';
interface Document {
@ -51,6 +56,9 @@ interface Document {
created_at: string;
has_ocr_text?: boolean;
tags?: string[];
original_created_at?: string;
original_modified_at?: string;
source_metadata?: any;
}
const DocumentDetailsPage: React.FC = () => {
@ -500,6 +508,55 @@ const DocumentDetailsPage: React.FC = () => {
</Paper>
</Grid>
{/* Source Metadata Section */}
{(document.original_created_at || document.original_modified_at || document.source_metadata) && (
<>
{document.original_created_at && (
<Grid item xs={12} sm={6}>
<Paper sx={{ p: 2, height: '100%' }}>
<Box sx={{ display: 'flex', alignItems: 'center', mb: 1 }}>
<CreateIcon color="primary" sx={{ mr: 1 }} />
<Typography variant="subtitle2" color="text.secondary">
Original Created
</Typography>
</Box>
<Typography variant="body1" sx={{ fontWeight: 500 }}>
{formatDate(document.original_created_at)}
</Typography>
</Paper>
</Grid>
)}
{document.original_modified_at && (
<Grid item xs={12} sm={6}>
<Paper sx={{ p: 2, height: '100%' }}>
<Box sx={{ display: 'flex', alignItems: 'center', mb: 1 }}>
<AccessTimeIcon color="primary" sx={{ mr: 1 }} />
<Typography variant="subtitle2" color="text.secondary">
Original Modified
</Typography>
</Box>
<Typography variant="body1" sx={{ fontWeight: 500 }}>
{formatDate(document.original_modified_at)}
</Typography>
</Paper>
</Grid>
)}
{document.source_metadata && Object.keys(document.source_metadata).length > 0 && (
<Grid item xs={12}>
<Paper sx={{ p: 2 }}>
<MetadataDisplay
metadata={document.source_metadata}
title="Source Metadata"
compact={false}
/>
</Paper>
</Grid>
)}
</>
)}
{document.tags && document.tags.length > 0 && (
<Grid item xs={12}>
<Paper sx={{ p: 2 }}>

View File

@ -56,6 +56,7 @@ import { format } from 'date-fns';
import { api, documentService, queueService } from '../services/api';
import DocumentViewer from '../components/DocumentViewer';
import FailedDocumentViewer from '../components/FailedDocumentViewer';
import MetadataDisplay from '../components/MetadataDisplay';
interface FailedDocument {
id: string;
@ -78,6 +79,9 @@ interface FailedDocument {
ocr_word_count?: number;
failure_reason: string;
error_message?: string;
original_created_at?: string;
original_modified_at?: string;
source_metadata?: any;
}
interface FailureCategory {
@ -1989,6 +1993,39 @@ const DocumentManagementPage: React.FC = () => {
sx={{ mb: 2 }}
/>
{/* Source Metadata Section */}
{selectedDocument.original_created_at && (
<>
<Typography variant="body2" color="text.secondary" component="div">
<strong>Original Created:</strong>
</Typography>
<Typography variant="body2" sx={{ mb: 2 }}>
{format(new Date(selectedDocument.original_created_at), 'PPpp')}
</Typography>
</>
)}
{selectedDocument.original_modified_at && (
<>
<Typography variant="body2" color="text.secondary" component="div">
<strong>Original Modified:</strong>
</Typography>
<Typography variant="body2" sx={{ mb: 2 }}>
{format(new Date(selectedDocument.original_modified_at), 'PPpp')}
</Typography>
</>
)}
{selectedDocument.source_metadata && Object.keys(selectedDocument.source_metadata).length > 0 && (
<Box sx={{ mt: 2, mb: 2 }}>
<MetadataDisplay
metadata={selectedDocument.source_metadata}
title="Source Metadata"
compact={true}
/>
</Box>
)}
<Typography variant="body2" color="text.secondary" component="div" sx={{ mt: 2 }}>
<strong>Retry Count:</strong>
</Typography>

View File

@ -0,0 +1,17 @@
-- Add metadata preservation fields to documents table
ALTER TABLE documents
ADD COLUMN original_created_at TIMESTAMPTZ,
ADD COLUMN original_modified_at TIMESTAMPTZ,
ADD COLUMN source_metadata JSONB;
-- Add comment to explain fields
COMMENT ON COLUMN documents.original_created_at IS 'Original file creation timestamp from source system';
COMMENT ON COLUMN documents.original_modified_at IS 'Original file modification timestamp from source system';
COMMENT ON COLUMN documents.source_metadata IS 'Additional metadata from source system (permissions, attributes, EXIF data, etc.)';
-- Create index on source_metadata for efficient JSONB queries
CREATE INDEX idx_documents_source_metadata ON documents USING gin (source_metadata);
-- Note: We cannot reliably populate original_created_at and original_modified_at
-- for existing documents as we don't have this information stored.
-- These fields will remain NULL for existing documents, which is correct.
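
The GIN index above enables efficient JSONB containment lookups against source_metadata. A minimal sketch of such a query through sqlx, in the same style the rest of the codebase uses (assumes the crate's json feature is enabled and a PgPool handle named pool; names and values are illustrative):

use serde_json::json;

// Find documents whose source metadata records a given S3 storage class.
// The @> containment operator can be served by the GIN index created above.
let rows = sqlx::query(
    r#"SELECT id, filename FROM documents WHERE source_metadata @> $1"#,
)
.bind(json!({ "storage_class": "STANDARD" }))
.fetch_all(&pool)
.await?;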

View File

@ -10,9 +10,9 @@ impl Database {
pub async fn create_document(&self, document: Document) -> Result<Document> {
let row = sqlx::query(
r#"
INSERT INTO documents (id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19)
RETURNING id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
INSERT INTO documents (id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22)
RETURNING id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata
"#
)
.bind(document.id)
@ -34,6 +34,9 @@ impl Database {
.bind(document.updated_at)
.bind(document.user_id)
.bind(&document.file_hash)
.bind(document.original_created_at)
.bind(document.original_modified_at)
.bind(&document.source_metadata)
.fetch_one(&self.pool)
.await?;
@ -57,6 +60,9 @@ impl Database {
updated_at: row.get("updated_at"),
user_id: row.get("user_id"),
file_hash: row.get("file_hash"),
original_created_at: row.get("original_created_at"),
original_modified_at: row.get("original_modified_at"),
source_metadata: row.get("source_metadata"),
})
}
@ -64,7 +70,7 @@ impl Database {
let query = if user_role == crate::models::UserRole::Admin {
// Admins can see all documents
r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata
FROM documents
ORDER BY created_at DESC
LIMIT $1 OFFSET $2
@ -72,7 +78,7 @@ impl Database {
} else {
// Regular users can only see their own documents
r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata
FROM documents
WHERE user_id = $3
ORDER BY created_at DESC
@ -117,6 +123,9 @@ impl Database {
updated_at: row.get("updated_at"),
user_id: row.get("user_id"),
file_hash: row.get("file_hash"),
original_created_at: row.get("original_created_at"),
original_modified_at: row.get("original_modified_at"),
source_metadata: row.get("source_metadata"),
})
.collect();
@ -129,7 +138,7 @@ impl Database {
// Admin with OCR filter
sqlx::query(
r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata
FROM documents
WHERE ocr_status = $3
ORDER BY created_at DESC
@ -146,7 +155,7 @@ impl Database {
// Admin without OCR filter
sqlx::query(
r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata
FROM documents
ORDER BY created_at DESC
LIMIT $1 OFFSET $2
@ -161,7 +170,7 @@ impl Database {
// Regular user with OCR filter
sqlx::query(
r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata
FROM documents
WHERE user_id = $3 AND ocr_status = $4
ORDER BY created_at DESC
@ -179,7 +188,7 @@ impl Database {
// Regular user without OCR filter
sqlx::query(
r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata
FROM documents
WHERE user_id = $3
ORDER BY created_at DESC
@ -216,6 +225,9 @@ impl Database {
updated_at: row.get("updated_at"),
user_id: row.get("user_id"),
file_hash: row.get("file_hash"),
original_created_at: row.get("original_created_at"),
original_modified_at: row.get("original_modified_at"),
source_metadata: row.get("source_metadata"),
})
.collect();
@ -268,7 +280,7 @@ impl Database {
pub async fn get_documents_by_user(&self, user_id: Uuid, limit: i64, offset: i64) -> Result<Vec<Document>> {
let rows = sqlx::query(
r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata
FROM documents
WHERE user_id = $1
ORDER BY created_at DESC
@ -303,6 +315,9 @@ impl Database {
updated_at: row.get("updated_at"),
user_id: row.get("user_id"),
file_hash: row.get("file_hash"),
original_created_at: row.get("original_created_at"),
original_modified_at: row.get("original_modified_at"),
source_metadata: row.get("source_metadata"),
})
.collect();
@ -312,7 +327,7 @@ impl Database {
pub async fn find_documents_by_filename(&self, filename: &str) -> Result<Vec<Document>> {
let rows = sqlx::query(
r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata
FROM documents
WHERE filename = $1 OR original_filename = $1
ORDER BY created_at DESC
@ -344,6 +359,9 @@ impl Database {
updated_at: row.get("updated_at"),
user_id: row.get("user_id"),
file_hash: row.get("file_hash"),
original_created_at: row.get("original_created_at"),
original_modified_at: row.get("original_modified_at"),
source_metadata: row.get("source_metadata"),
})
.collect();
@ -353,7 +371,7 @@ impl Database {
pub async fn search_documents(&self, user_id: Uuid, search: SearchRequest) -> Result<(Vec<Document>, i64)> {
let mut query_builder = QueryBuilder::new(
r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash,
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata,
ts_rank(to_tsvector('english', COALESCE(content, '') || ' ' || COALESCE(ocr_text, '')), plainto_tsquery('english', "#
);
@ -415,6 +433,9 @@ impl Database {
updated_at: row.get("updated_at"),
user_id: row.get("user_id"),
file_hash: row.get("file_hash"),
original_created_at: row.get("original_created_at"),
original_modified_at: row.get("original_modified_at"),
source_metadata: row.get("source_metadata"),
})
.collect();
@ -456,7 +477,7 @@ impl Database {
// Use trigram similarity for substring matching
let mut builder = QueryBuilder::new(
r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash,
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata,
GREATEST(
similarity(filename, "#
);
@ -499,7 +520,7 @@ impl Database {
let mut builder = QueryBuilder::new(&format!(
r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash,
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata,
GREATEST(
CASE WHEN filename ILIKE '%' || "#
));
@ -645,7 +666,7 @@ impl Database {
// Use trigram similarity for substring matching
let mut builder = QueryBuilder::new(
r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash,
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata,
GREATEST(
similarity(filename, "#
);
@ -684,7 +705,7 @@ impl Database {
let mut builder = QueryBuilder::new(&format!(
r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash,
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata,
GREATEST(
CASE WHEN filename ILIKE '%' || "#
));
@ -993,6 +1014,9 @@ impl Database {
updated_at: row.get("updated_at"),
user_id: row.get("user_id"),
file_hash: row.get("file_hash"),
original_created_at: row.get("original_created_at"),
original_modified_at: row.get("original_modified_at"),
source_metadata: row.get("source_metadata"),
});
}
@ -1081,14 +1105,14 @@ impl Database {
let query = if user_role == crate::models::UserRole::Admin {
// Admins can see any document
r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata
FROM documents
WHERE id = $1
"#
} else {
// Regular users can only see their own documents
r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata
FROM documents
WHERE id = $1 AND user_id = $2
"#
@ -1128,6 +1152,9 @@ impl Database {
updated_at: row.get("updated_at"),
user_id: row.get("user_id"),
file_hash: row.get("file_hash"),
original_created_at: row.get("original_created_at"),
original_modified_at: row.get("original_modified_at"),
source_metadata: row.get("source_metadata"),
})),
None => Ok(None),
}
@ -1137,7 +1164,7 @@ impl Database {
pub async fn get_document_by_user_and_hash(&self, user_id: Uuid, file_hash: &str) -> Result<Option<Document>> {
let row = sqlx::query(
r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata
FROM documents
WHERE user_id = $1 AND file_hash = $2
LIMIT 1
@ -1169,6 +1196,9 @@ impl Database {
updated_at: row.get("updated_at"),
user_id: row.get("user_id"),
file_hash: row.get("file_hash"),
original_created_at: row.get("original_created_at"),
original_modified_at: row.get("original_modified_at"),
source_metadata: row.get("source_metadata"),
})),
None => Ok(None),
}
@ -1393,6 +1423,9 @@ impl Database {
updated_at: r.get("updated_at"),
user_id: r.get("user_id"),
file_hash: r.get("file_hash"),
original_created_at: r.get("original_created_at"),
original_modified_at: r.get("original_modified_at"),
source_metadata: r.get("source_metadata"),
})
} else {
let row = sqlx::query(
@ -1427,6 +1460,9 @@ impl Database {
updated_at: r.get("updated_at"),
user_id: r.get("user_id"),
file_hash: r.get("file_hash"),
original_created_at: r.get("original_created_at"),
original_modified_at: r.get("original_modified_at"),
source_metadata: r.get("source_metadata"),
})
};
@ -1470,6 +1506,9 @@ impl Database {
updated_at: r.get("updated_at"),
user_id: r.get("user_id"),
file_hash: r.get("file_hash"),
original_created_at: r.get("original_created_at"),
original_modified_at: r.get("original_modified_at"),
source_metadata: r.get("source_metadata"),
}).collect()
} else {
let rows = sqlx::query(
@ -1504,6 +1543,9 @@ impl Database {
updated_at: r.get("updated_at"),
user_id: r.get("user_id"),
file_hash: r.get("file_hash"),
original_created_at: r.get("original_created_at"),
original_modified_at: r.get("original_modified_at"),
source_metadata: r.get("source_metadata"),
}).collect()
};
@ -1515,7 +1557,7 @@ impl Database {
let documents = if user_role == crate::models::UserRole::Admin {
let rows = sqlx::query(
r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata
FROM documents
WHERE ocr_confidence IS NOT NULL AND ocr_confidence < $1
ORDER BY ocr_confidence ASC, created_at DESC
@ -1545,11 +1587,14 @@ impl Database {
updated_at: r.get("updated_at"),
user_id: r.get("user_id"),
file_hash: r.get("file_hash"),
original_created_at: r.get("original_created_at"),
original_modified_at: r.get("original_modified_at"),
source_metadata: r.get("source_metadata"),
}).collect()
} else {
let rows = sqlx::query(
r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata
FROM documents
WHERE ocr_confidence IS NOT NULL AND ocr_confidence < $1 AND user_id = $2
ORDER BY ocr_confidence ASC, created_at DESC
@ -1580,6 +1625,9 @@ impl Database {
updated_at: r.get("updated_at"),
user_id: r.get("user_id"),
file_hash: r.get("file_hash"),
original_created_at: r.get("original_created_at"),
original_modified_at: r.get("original_modified_at"),
source_metadata: r.get("source_metadata"),
}).collect()
};
@ -1591,7 +1639,7 @@ impl Database {
let documents = if user_role == crate::models::UserRole::Admin {
let rows = sqlx::query(
r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata
FROM documents
WHERE ocr_status = 'failed' OR (ocr_confidence IS NULL AND ocr_status != 'pending' AND ocr_status != 'processing')
ORDER BY created_at DESC
@ -1620,11 +1668,14 @@ impl Database {
updated_at: r.get("updated_at"),
user_id: r.get("user_id"),
file_hash: r.get("file_hash"),
original_created_at: r.get("original_created_at"),
original_modified_at: r.get("original_modified_at"),
source_metadata: r.get("source_metadata"),
}).collect()
} else {
let rows = sqlx::query(
r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata
FROM documents
WHERE (ocr_status = 'failed' OR (ocr_confidence IS NULL AND ocr_status != 'pending' AND ocr_status != 'processing')) AND user_id = $1
ORDER BY created_at DESC
@ -1654,6 +1705,9 @@ impl Database {
updated_at: r.get("updated_at"),
user_id: r.get("user_id"),
file_hash: r.get("file_hash"),
original_created_at: r.get("original_created_at"),
original_modified_at: r.get("original_modified_at"),
source_metadata: r.get("source_metadata"),
}).collect()
};
@ -1665,7 +1719,7 @@ impl Database {
let documents = if user_role == crate::models::UserRole::Admin {
let rows = sqlx::query(
r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata
FROM documents
WHERE (ocr_confidence IS NOT NULL AND ocr_confidence < $1)
OR ocr_status = 'failed'
@ -1699,11 +1753,14 @@ impl Database {
updated_at: r.get("updated_at"),
user_id: r.get("user_id"),
file_hash: r.get("file_hash"),
original_created_at: r.get("original_created_at"),
original_modified_at: r.get("original_modified_at"),
source_metadata: r.get("source_metadata"),
}).collect()
} else {
let rows = sqlx::query(
r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata
FROM documents
WHERE ((ocr_confidence IS NOT NULL AND ocr_confidence < $1)
OR ocr_status = 'failed'
@ -1739,6 +1796,9 @@ impl Database {
updated_at: r.get("updated_at"),
user_id: r.get("user_id"),
file_hash: r.get("file_hash"),
original_created_at: r.get("original_created_at"),
original_modified_at: r.get("original_modified_at"),
source_metadata: r.get("source_metadata"),
}).collect()
};

View File

@ -9,8 +9,10 @@
use uuid::Uuid;
use sha2::{Digest, Sha256};
use tracing::{debug, info, warn};
use chrono::Utc;
use serde_json;
use crate::models::Document;
use crate::models::{Document, FileInfo};
use crate::db::Database;
use crate::services::file_service::FileService;
@ -49,6 +51,10 @@ pub struct DocumentIngestionRequest {
/// Optional source identifier for tracking
pub source_type: Option<String>,
pub source_id: Option<Uuid>,
/// Optional metadata from source file system
pub original_created_at: Option<chrono::DateTime<chrono::Utc>>,
pub original_modified_at: Option<chrono::DateTime<chrono::Utc>>,
pub source_metadata: Option<serde_json::Value>,
}
pub struct DocumentIngestionService {
@ -61,6 +67,47 @@ impl DocumentIngestionService {
Self { db, file_service }
}
/// Extract metadata from FileInfo for storage in document
fn extract_metadata_from_file_info(file_info: &FileInfo) -> (Option<chrono::DateTime<chrono::Utc>>, Option<chrono::DateTime<chrono::Utc>>, Option<serde_json::Value>) {
let original_created_at = file_info.created_at;
let original_modified_at = file_info.last_modified;
// Build comprehensive metadata object
let mut metadata = serde_json::Map::new();
// Add permissions if available
if let Some(perms) = file_info.permissions {
metadata.insert("permissions".to_string(), serde_json::Value::Number(perms.into()));
}
// Add owner/group info
if let Some(ref owner) = file_info.owner {
metadata.insert("owner".to_string(), serde_json::Value::String(owner.clone()));
}
if let Some(ref group) = file_info.group {
metadata.insert("group".to_string(), serde_json::Value::String(group.clone()));
}
// Add source path
metadata.insert("source_path".to_string(), serde_json::Value::String(file_info.path.clone()));
// Merge any additional metadata from the source
if let Some(ref source_meta) = file_info.metadata {
if let serde_json::Value::Object(source_map) = source_meta {
metadata.extend(source_map.clone());
}
}
let final_metadata = if metadata.is_empty() {
None
} else {
Some(serde_json::Value::Object(metadata))
};
(original_created_at, original_modified_at, final_metadata)
}
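
For a local Unix file, the resulting source_metadata blob might look like the following serde_json sketch; the exact keys depend on what the source exposes, and all values here are illustrative:

// Illustrative only: permissions/owner/group come from the filesystem metadata,
// source_path from FileInfo.path, and inode/nlinks/readonly are merged in from
// the extra metadata attached by the local folder service.
let example = serde_json::json!({
    "permissions": 420,            // 0o644; the client renders this as "644 (octal)"
    "owner": "1000",
    "group": "1000",
    "source_path": "/data/inbox/report.pdf",
    "inode": 1312847,
    "nlinks": 1,
    "readonly": false
});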
/// Unified document ingestion with configurable deduplication policy
pub async fn ingest_document(&self, request: DocumentIngestionRequest) -> Result<IngestionResult, Box<dyn std::error::Error + Send + Sync>> {
let file_hash = self.calculate_file_hash(&request.file_data);
@ -156,6 +203,9 @@ impl DocumentIngestionService {
&request.mime_type,
request.user_id,
Some(file_hash.clone()),
request.original_created_at,
request.original_modified_at,
request.source_metadata,
);
let saved_document = match self.db.create_document(document).await {
@ -235,6 +285,36 @@ impl DocumentIngestionService {
format!("{:x}", result)
}
/// Ingest document from source with FileInfo metadata
pub async fn ingest_from_file_info(
&self,
file_info: &FileInfo,
file_data: Vec<u8>,
user_id: Uuid,
deduplication_policy: DeduplicationPolicy,
source_type: &str,
source_id: Option<Uuid>,
) -> Result<IngestionResult, Box<dyn std::error::Error + Send + Sync>> {
let (original_created_at, original_modified_at, source_metadata) =
Self::extract_metadata_from_file_info(file_info);
let request = DocumentIngestionRequest {
filename: file_info.name.clone(),
original_filename: file_info.name.clone(),
file_data,
mime_type: file_info.mime_type.clone(),
user_id,
deduplication_policy,
source_type: Some(source_type.to_string()),
source_id,
original_created_at,
original_modified_at,
source_metadata,
};
self.ingest_document(request).await
}
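
A minimal sketch of how a sync path might call the new helper, assuming file_info, file_data, user_id, and source_id are already in scope (names illustrative):

let result = ingestion_service
    .ingest_from_file_info(
        &file_info,                 // carries timestamps, permissions, owner/group
        file_data,                  // raw bytes fetched from the source
        user_id,
        DeduplicationPolicy::Skip,  // TrackAsDuplicate is used for WebDAV
        "source_sync",              // source_type label recorded on the request
        Some(source_id),
    )
    .await?;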
/// Convenience method for direct uploads (maintains backward compatibility)
pub async fn ingest_upload(
&self,
@ -252,6 +332,9 @@ impl DocumentIngestionService {
deduplication_policy: DeduplicationPolicy::AllowDuplicateContent, // Fixed behavior for uploads
source_type: Some("direct_upload".to_string()),
source_id: None,
original_created_at: None,
original_modified_at: None,
source_metadata: None,
};
self.ingest_document(request).await
@ -276,6 +359,9 @@ impl DocumentIngestionService {
deduplication_policy: DeduplicationPolicy::Skip, // Skip duplicates for source sync
source_type: Some(source_type.to_string()),
source_id: Some(source_id),
original_created_at: None,
original_modified_at: None,
source_metadata: None,
};
self.ingest_document(request).await
@ -299,6 +385,9 @@ impl DocumentIngestionService {
deduplication_policy: DeduplicationPolicy::TrackAsDuplicate, // Track duplicates for WebDAV
source_type: Some("webdav".to_string()),
source_id: Some(webdav_source_id),
original_created_at: None,
original_modified_at: None,
source_metadata: None,
};
self.ingest_document(request).await
@ -321,6 +410,9 @@ impl DocumentIngestionService {
deduplication_policy: DeduplicationPolicy::Skip, // Skip duplicates for batch operations
source_type: Some("batch_ingest".to_string()),
source_id: None,
original_created_at: None,
original_modified_at: None,
source_metadata: None,
};
self.ingest_document(request).await

View File

@ -3,6 +3,7 @@ use serde::{Deserialize, Serialize};
use sqlx::FromRow;
use uuid::Uuid;
use utoipa::{ToSchema, IntoParams};
use serde_json;
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, ToSchema)]
pub enum UserRole {
@ -133,6 +134,12 @@ pub struct Document {
pub updated_at: DateTime<Utc>,
pub user_id: Uuid,
pub file_hash: Option<String>,
/// Original file creation timestamp from source system
pub original_created_at: Option<DateTime<Utc>>,
/// Original file modification timestamp from source system
pub original_modified_at: Option<DateTime<Utc>>,
/// Additional metadata from source system (permissions, attributes, EXIF data, etc.)
pub source_metadata: Option<serde_json::Value>,
}
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, ToSchema)]
@ -307,6 +314,15 @@ pub struct DocumentResponse {
pub ocr_processing_time_ms: Option<i32>,
/// Current status of OCR processing (pending, processing, completed, failed)
pub ocr_status: Option<String>,
/// Original file creation timestamp from source system
#[serde(skip_serializing_if = "Option::is_none")]
pub original_created_at: Option<DateTime<Utc>>,
/// Original file modification timestamp from source system
#[serde(skip_serializing_if = "Option::is_none")]
pub original_modified_at: Option<DateTime<Utc>>,
/// Additional metadata from source system (permissions, attributes, etc.)
#[serde(skip_serializing_if = "Option::is_none")]
pub source_metadata: Option<serde_json::Value>,
}
#[derive(Debug, Serialize, Deserialize, ToSchema, IntoParams)]
@ -447,6 +463,9 @@ impl From<Document> for DocumentResponse {
ocr_word_count: doc.ocr_word_count,
ocr_processing_time_ms: doc.ocr_processing_time_ms,
ocr_status: doc.ocr_status,
original_created_at: doc.original_created_at,
original_modified_at: doc.original_modified_at,
source_metadata: doc.source_metadata,
}
}
}
@ -900,6 +919,16 @@ pub struct FileInfo {
pub last_modified: Option<DateTime<Utc>>,
pub etag: String,
pub is_directory: bool,
/// Original file creation time from source system
pub created_at: Option<DateTime<Utc>>,
/// File permissions (Unix mode bits or similar)
pub permissions: Option<u32>,
/// File owner (username or uid)
pub owner: Option<String>,
/// File group (groupname or gid)
pub group: Option<String>,
/// Additional metadata from source (EXIF, PDF metadata, custom attributes, etc.)
pub metadata: Option<serde_json::Value>,
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash, ToSchema)]

View File

@ -116,6 +116,9 @@ async fn get_document_by_id(
ocr_word_count: document.ocr_word_count,
ocr_processing_time_ms: document.ocr_processing_time_ms,
ocr_status: document.ocr_status,
original_created_at: document.original_created_at,
original_modified_at: document.original_modified_at,
source_metadata: document.source_metadata,
};
Ok(Json(response))

View File

@ -283,24 +283,25 @@ async fn process_single_file(
let result = if let Some(source_id) = webdav_source_id {
ingestion_service
.ingest_from_webdav(
&file_info.name,
.ingest_from_file_info(
&file_info,
file_data,
&file_info.mime_type,
user_id,
source_id,
crate::ingestion::document_ingestion::DeduplicationPolicy::TrackAsDuplicate,
"webdav_sync",
Some(source_id),
)
.await
} else {
// Fallback for backward compatibility - treat as generic WebDAV sync
ingestion_service
.ingest_from_source(
&file_info.name,
.ingest_from_file_info(
&file_info,
file_data,
&file_info.mime_type,
user_id,
uuid::Uuid::new_v4(), // Generate a temporary ID for tracking
crate::ingestion::document_ingestion::DeduplicationPolicy::Skip,
"webdav_sync",
Some(uuid::Uuid::new_v4()), // Generate a temporary ID for tracking
)
.await
};

View File

@ -533,13 +533,13 @@ impl SourceSyncService {
let ingestion_service = DocumentIngestionService::new(state.db.clone(), file_service);
let result = ingestion_service
.ingest_from_source(
&file_info.name,
.ingest_from_file_info(
file_info,
file_data,
&file_info.mime_type,
user_id,
source_id,
crate::ingestion::document_ingestion::DeduplicationPolicy::Skip,
"source_sync",
Some(source_id),
)
.await
.map_err(|e| anyhow!("Document ingestion failed for {}: {}", file_info.name, e))?;
@ -637,13 +637,13 @@ impl SourceSyncService {
let ingestion_service = DocumentIngestionService::new(state.db.clone(), file_service);
let result = ingestion_service
.ingest_from_source(
&file_info.name,
.ingest_from_file_info(
file_info,
file_data,
&file_info.mime_type,
user_id,
source_id,
crate::ingestion::document_ingestion::DeduplicationPolicy::Skip,
"source_sync",
Some(source_id),
)
.await
.map_err(|e| anyhow!("Document ingestion failed for {}: {}", file_info.name, e))?;

View File

@ -158,6 +158,9 @@ impl FileService {
mime_type: &str,
user_id: Uuid,
file_hash: Option<String>,
original_created_at: Option<chrono::DateTime<chrono::Utc>>,
original_modified_at: Option<chrono::DateTime<chrono::Utc>>,
source_metadata: Option<serde_json::Value>,
) -> Document {
Document {
id: Uuid::new_v4(),
@ -179,6 +182,9 @@ impl FileService {
updated_at: Utc::now(),
user_id,
file_hash,
original_created_at,
original_modified_at,
source_metadata,
}
}

View File

@ -5,6 +5,7 @@ use chrono::{DateTime, Utc};
use tracing::{debug, info, warn};
use walkdir::WalkDir;
use sha2::{Sha256, Digest};
use serde_json;
use crate::models::{FileInfo, LocalFolderSourceConfig};
@ -89,6 +90,14 @@ impl LocalFolderService {
DateTime::from_timestamp(duration.as_secs() as i64, 0)
});
// Try to get creation time (not available on all systems)
let created_time = metadata.created()
.ok()
.and_then(|time| {
let duration = time.duration_since(std::time::UNIX_EPOCH).ok()?;
DateTime::from_timestamp(duration.as_secs() as i64, 0)
});
let file_name = path.file_name()
.and_then(|name| name.to_str())
.unwrap_or("unknown")
@ -100,6 +109,34 @@ impl LocalFolderService {
// Determine MIME type based on extension
let mime_type = Self::get_mime_type(&extension);
// Extract file permissions and ownership info
#[cfg(unix)]
let (permissions, owner, group) = {
use std::os::unix::fs::MetadataExt;
(
Some(metadata.mode() & 0o777), // File mode bits (permissions)
Some(metadata.uid().to_string()), // User ID
Some(metadata.gid().to_string()), // Group ID
)
};
#[cfg(not(unix))]
let (permissions, owner, group) = (None, None, None);
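
For reference, a small sketch of how the masked mode bits relate to the octal string shown in the UI (values illustrative):

// A regular file with rw-r--r-- permissions has mode 0o100644 on Unix.
let mode: u32 = 0o100644;
let perms = mode & 0o777;                  // 420 in decimal, stored as a JSON number
assert_eq!(format!("{:o}", perms), "644"); // MetadataDisplay renders this as "644 (octal)"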
// Prepare additional metadata
let mut additional_metadata = serde_json::Map::new();
#[cfg(unix)]
{
use std::os::unix::fs::MetadataExt;
additional_metadata.insert("inode".to_string(), serde_json::Value::Number(metadata.ino().into()));
additional_metadata.insert("nlinks".to_string(), serde_json::Value::Number(metadata.nlink().into()));
additional_metadata.insert("device".to_string(), serde_json::Value::Number(metadata.dev().into()));
}
// Add file attributes
additional_metadata.insert("readonly".to_string(), serde_json::Value::Bool(metadata.permissions().readonly()));
let file_info = FileInfo {
path: path.to_string_lossy().to_string(),
name: file_name,
@ -108,6 +145,11 @@ impl LocalFolderService {
last_modified: modified_time,
etag,
is_directory: false,
created_at: created_time,
permissions,
owner,
group,
metadata: if additional_metadata.is_empty() { None } else { Some(serde_json::Value::Object(additional_metadata)) },
};
files.push(file_info);

View File

@ -1,6 +1,7 @@
use anyhow::{anyhow, Result};
use chrono::{DateTime, Utc};
use tracing::{debug, error, info, warn};
use serde_json;
#[cfg(feature = "s3")]
use aws_sdk_s3::Client;
@ -149,6 +150,32 @@ impl S3Service {
let mime_type = Self::get_mime_type(&extension);
// Build additional metadata from S3 object properties
let mut metadata_map = serde_json::Map::new();
// Add S3-specific metadata
if let Some(storage_class) = &object.storage_class {
metadata_map.insert("storage_class".to_string(), serde_json::Value::String(storage_class.as_str().to_string()));
}
if let Some(owner) = &object.owner {
if let Some(display_name) = &owner.display_name {
metadata_map.insert("owner_display_name".to_string(), serde_json::Value::String(display_name.clone()));
}
if let Some(id) = &owner.id {
metadata_map.insert("owner_id".to_string(), serde_json::Value::String(id.clone()));
}
}
// Store the S3 key for reference
metadata_map.insert("s3_key".to_string(), serde_json::Value::String(key.clone()));
// Add bucket name for reference
metadata_map.insert("s3_bucket".to_string(), serde_json::Value::String(self.config.bucket_name.clone()));
// If we have region info, add it
metadata_map.insert("s3_region".to_string(), serde_json::Value::String(self.config.region.clone()));
let file_info = FileInfo {
path: key.clone(),
name: file_name,
@ -157,6 +184,11 @@ impl S3Service {
last_modified,
etag,
is_directory: false,
created_at: None, // S3 doesn't provide creation time, only last modified
permissions: None, // S3 uses different permission model (ACLs/policies)
owner: object.owner.as_ref().and_then(|o| o.display_name.clone()),
group: None, // S3 doesn't have Unix-style groups
metadata: if metadata_map.is_empty() { None } else { Some(serde_json::Value::Object(metadata_map)) },
};
files.push(file_info);

View File

@ -423,14 +423,7 @@ impl WebDAVService {
let propfind_body = r#"<?xml version="1.0"?>
<d:propfind xmlns:d="DAV:">
<d:prop>
<d:displayname/>
<d:getcontentlength/>
<d:getlastmodified/>
<d:getcontenttype/>
<d:getetag/>
<d:resourcetype/>
</d:prop>
<d:allprop/>
</d:propfind>"#;
let response = self.client

View File

@ -2,6 +2,49 @@
mod tests {
use crate::config::Config;
use std::env;
use std::sync::Mutex;
// Mutex to ensure OIDC tests run sequentially to avoid race conditions
static OIDC_TEST_MUTEX: Mutex<()> = Mutex::new(());
// Helper function to safely run a test with environment isolation
fn run_with_env_isolation<F, R>(test_fn: F) -> R
where
F: FnOnce() -> R,
{
let _guard = OIDC_TEST_MUTEX.lock().unwrap();
// Store original environment values
let original_values: Vec<(String, Option<String>)> = vec![
"OIDC_ENABLED",
"OIDC_CLIENT_ID",
"OIDC_CLIENT_SECRET",
"OIDC_ISSUER_URL",
"OIDC_REDIRECT_URI",
"DATABASE_URL",
"JWT_SECRET",
].into_iter().map(|key| {
(key.to_string(), env::var(key).ok())
}).collect();
// Clean up environment first
for (key, _) in &original_values {
env::remove_var(key);
}
// Run the test
let result = test_fn();
// Restore original environment
for (key, original_value) in original_values {
env::remove_var(&key);
if let Some(value) = original_value {
env::set_var(&key, value);
}
}
result
}
fn create_base_config() -> Config {
Config {
@ -40,291 +83,176 @@ mod tests {
#[test]
fn test_oidc_enabled_from_env() {
// Clean up environment first to ensure test isolation
env::remove_var("OIDC_ENABLED");
env::remove_var("OIDC_CLIENT_ID");
env::remove_var("OIDC_CLIENT_SECRET");
env::remove_var("OIDC_ISSUER_URL");
env::remove_var("OIDC_REDIRECT_URI");
env::remove_var("DATABASE_URL");
env::remove_var("JWT_SECRET");
env::set_var("OIDC_ENABLED", "true");
env::set_var("OIDC_CLIENT_ID", "test-client-id");
env::set_var("OIDC_CLIENT_SECRET", "test-client-secret");
env::set_var("OIDC_ISSUER_URL", "https://provider.example.com");
env::set_var("OIDC_REDIRECT_URI", "http://localhost:8000/auth/oidc/callback");
env::set_var("DATABASE_URL", "postgresql://test:test@localhost/test");
env::set_var("JWT_SECRET", "test-secret");
let config = Config::from_env().unwrap();
assert!(config.oidc_enabled);
assert_eq!(config.oidc_client_id, Some("test-client-id".to_string()));
assert_eq!(config.oidc_client_secret, Some("test-client-secret".to_string()));
assert_eq!(config.oidc_issuer_url, Some("https://provider.example.com".to_string()));
assert_eq!(config.oidc_redirect_uri, Some("http://localhost:8000/auth/oidc/callback".to_string()));
// Clean up
env::remove_var("OIDC_ENABLED");
env::remove_var("OIDC_CLIENT_ID");
env::remove_var("OIDC_CLIENT_SECRET");
env::remove_var("OIDC_ISSUER_URL");
env::remove_var("OIDC_REDIRECT_URI");
env::remove_var("DATABASE_URL");
env::remove_var("JWT_SECRET");
}
#[test]
fn test_oidc_enabled_variations() {
let test_cases = vec![
("true", true),
("TRUE", true),
("1", true),
("yes", true),
("YES", true),
("on", true),
("ON", true),
("false", false),
("FALSE", false),
("0", false),
("no", false),
("NO", false),
("off", false),
("OFF", false),
("invalid", false),
];
for (value, expected) in test_cases {
// Clean up environment first for each iteration
env::remove_var("OIDC_ENABLED");
env::remove_var("OIDC_CLIENT_ID");
env::remove_var("OIDC_CLIENT_SECRET");
env::remove_var("OIDC_ISSUER_URL");
env::remove_var("OIDC_REDIRECT_URI");
env::remove_var("DATABASE_URL");
env::remove_var("JWT_SECRET");
env::set_var("OIDC_ENABLED", value);
run_with_env_isolation(|| {
env::set_var("OIDC_ENABLED", "true");
env::set_var("OIDC_CLIENT_ID", "test-client-id");
env::set_var("OIDC_CLIENT_SECRET", "test-client-secret");
env::set_var("OIDC_ISSUER_URL", "https://provider.example.com");
env::set_var("OIDC_REDIRECT_URI", "http://localhost:8000/auth/oidc/callback");
env::set_var("DATABASE_URL", "postgresql://test:test@localhost/test");
env::set_var("JWT_SECRET", "test-secret");
let config = Config::from_env().unwrap();
assert_eq!(config.oidc_enabled, expected, "Failed for value: {}", value);
env::remove_var("OIDC_ENABLED");
env::remove_var("DATABASE_URL");
env::remove_var("JWT_SECRET");
}
assert!(config.oidc_enabled);
assert_eq!(config.oidc_client_id, Some("test-client-id".to_string()));
assert_eq!(config.oidc_client_secret, Some("test-client-secret".to_string()));
assert_eq!(config.oidc_issuer_url, Some("https://provider.example.com".to_string()));
assert_eq!(config.oidc_redirect_uri, Some("http://localhost:8000/auth/oidc/callback".to_string()));
});
}
#[test]
fn test_oidc_enabled_variations() {
run_with_env_isolation(|| {
let test_cases = vec![
("true", true),
("TRUE", true),
("1", true),
("yes", true),
("YES", true),
("on", true),
("ON", true),
("false", false),
("FALSE", false),
("0", false),
("no", false),
("NO", false),
("off", false),
("OFF", false),
("invalid", false),
];
for (value, expected) in test_cases {
// Clean up environment for each iteration
env::remove_var("OIDC_ENABLED");
env::remove_var("DATABASE_URL");
env::remove_var("JWT_SECRET");
env::set_var("OIDC_ENABLED", value);
env::set_var("DATABASE_URL", "postgresql://test:test@localhost/test");
env::set_var("JWT_SECRET", "test-secret");
let config = Config::from_env().unwrap();
assert_eq!(config.oidc_enabled, expected, "Failed for value: {}", value);
}
});
}
#[test]
fn test_oidc_partial_config() {
// Clean up environment first to ensure test isolation
env::remove_var("OIDC_ENABLED");
env::remove_var("OIDC_CLIENT_ID");
env::remove_var("OIDC_CLIENT_SECRET");
env::remove_var("OIDC_ISSUER_URL");
env::remove_var("OIDC_REDIRECT_URI");
env::remove_var("DATABASE_URL");
env::remove_var("JWT_SECRET");
// Only set some OIDC vars
env::set_var("OIDC_ENABLED", "true");
env::set_var("OIDC_CLIENT_ID", "test-client-id");
// Missing OIDC_CLIENT_SECRET, OIDC_ISSUER_URL, OIDC_REDIRECT_URI
env::set_var("DATABASE_URL", "postgresql://test:test@localhost/test");
env::set_var("JWT_SECRET", "test-secret");
run_with_env_isolation(|| {
// Only set some OIDC vars
env::set_var("OIDC_ENABLED", "true");
env::set_var("OIDC_CLIENT_ID", "test-client-id");
// Missing OIDC_CLIENT_SECRET, OIDC_ISSUER_URL, OIDC_REDIRECT_URI
env::set_var("DATABASE_URL", "postgresql://test:test@localhost/test");
env::set_var("JWT_SECRET", "test-secret");
let config = Config::from_env().unwrap();
let config = Config::from_env().unwrap();
assert!(config.oidc_enabled);
assert_eq!(config.oidc_client_id, Some("test-client-id".to_string()));
assert!(config.oidc_client_secret.is_none());
assert!(config.oidc_issuer_url.is_none());
assert!(config.oidc_redirect_uri.is_none());
// Clean up
env::remove_var("OIDC_ENABLED");
env::remove_var("OIDC_CLIENT_ID");
env::remove_var("OIDC_CLIENT_SECRET");
env::remove_var("OIDC_ISSUER_URL");
env::remove_var("OIDC_REDIRECT_URI");
env::remove_var("DATABASE_URL");
env::remove_var("JWT_SECRET");
assert!(config.oidc_enabled);
assert_eq!(config.oidc_client_id, Some("test-client-id".to_string()));
assert!(config.oidc_client_secret.is_none());
assert!(config.oidc_issuer_url.is_none());
assert!(config.oidc_redirect_uri.is_none());
});
}
#[test]
fn test_oidc_disabled_with_config_present() {
// Clean up environment first to ensure test isolation
env::remove_var("OIDC_ENABLED");
env::remove_var("OIDC_CLIENT_ID");
env::remove_var("OIDC_CLIENT_SECRET");
env::remove_var("OIDC_ISSUER_URL");
env::remove_var("OIDC_REDIRECT_URI");
env::remove_var("DATABASE_URL");
env::remove_var("JWT_SECRET");
// OIDC disabled but config present
env::set_var("OIDC_ENABLED", "false");
env::set_var("OIDC_CLIENT_ID", "test-client-id");
env::set_var("OIDC_CLIENT_SECRET", "test-client-secret");
env::set_var("OIDC_ISSUER_URL", "https://provider.example.com");
env::set_var("OIDC_REDIRECT_URI", "http://localhost:8000/auth/oidc/callback");
env::set_var("DATABASE_URL", "postgresql://test:test@localhost/test");
env::set_var("JWT_SECRET", "test-secret");
run_with_env_isolation(|| {
// OIDC disabled but config present
env::set_var("OIDC_ENABLED", "false");
env::set_var("OIDC_CLIENT_ID", "test-client-id");
env::set_var("OIDC_CLIENT_SECRET", "test-client-secret");
env::set_var("OIDC_ISSUER_URL", "https://provider.example.com");
env::set_var("OIDC_REDIRECT_URI", "http://localhost:8000/auth/oidc/callback");
env::set_var("DATABASE_URL", "postgresql://test:test@localhost/test");
env::set_var("JWT_SECRET", "test-secret");
let config = Config::from_env().unwrap();
let config = Config::from_env().unwrap();
assert!(!config.oidc_enabled);
assert_eq!(config.oidc_client_id, Some("test-client-id".to_string()));
assert_eq!(config.oidc_client_secret, Some("test-client-secret".to_string()));
assert_eq!(config.oidc_issuer_url, Some("https://provider.example.com".to_string()));
assert_eq!(config.oidc_redirect_uri, Some("http://localhost:8000/auth/oidc/callback".to_string()));
// Clean up
env::remove_var("OIDC_ENABLED");
env::remove_var("OIDC_CLIENT_ID");
env::remove_var("OIDC_CLIENT_SECRET");
env::remove_var("OIDC_ISSUER_URL");
env::remove_var("OIDC_REDIRECT_URI");
env::remove_var("DATABASE_URL");
env::remove_var("JWT_SECRET");
assert!(!config.oidc_enabled);
assert_eq!(config.oidc_client_id, Some("test-client-id".to_string()));
assert_eq!(config.oidc_client_secret, Some("test-client-secret".to_string()));
assert_eq!(config.oidc_issuer_url, Some("https://provider.example.com".to_string()));
assert_eq!(config.oidc_redirect_uri, Some("http://localhost:8000/auth/oidc/callback".to_string()));
});
}
#[test]
fn test_oidc_empty_values() {
run_with_env_isolation(|| {
env::set_var("OIDC_ENABLED", "true");
env::set_var("OIDC_CLIENT_ID", "");
env::set_var("OIDC_CLIENT_SECRET", "");
env::set_var("OIDC_ISSUER_URL", "");
env::set_var("OIDC_REDIRECT_URI", "");
env::set_var("DATABASE_URL", "postgresql://test:test@localhost/test");
env::set_var("JWT_SECRET", "test-secret");
let config = Config::from_env().unwrap();
assert!(config.oidc_enabled);
// Empty string values should be converted to Some(empty_string)
assert_eq!(config.oidc_client_id, Some("".to_string()));
assert_eq!(config.oidc_client_secret, Some("".to_string()));
assert_eq!(config.oidc_issuer_url, Some("".to_string()));
assert_eq!(config.oidc_redirect_uri, Some("".to_string()));
});
}
#[test]
fn test_oidc_config_validation_output() {
run_with_env_isolation(|| {
// Test that validation warnings are properly formatted
env::set_var("OIDC_ENABLED", "true");
env::set_var("DATABASE_URL", "postgresql://test:test@localhost/test");
env::set_var("JWT_SECRET", "test-secret");
// Missing required OIDC fields
// This should succeed but show warnings
let config = Config::from_env().unwrap();
assert!(config.oidc_enabled);
assert!(config.oidc_client_id.is_none());
});
}
#[test]
fn test_oidc_complete_configuration() {
run_with_env_isolation(|| {
env::set_var("OIDC_ENABLED", "true");
env::set_var("OIDC_CLIENT_ID", "my-app-client-id");
env::set_var("OIDC_CLIENT_SECRET", "super-secret-client-secret");
env::set_var("OIDC_ISSUER_URL", "https://auth.example.com");
env::set_var("OIDC_REDIRECT_URI", "https://myapp.com/auth/callback");
env::set_var("DATABASE_URL", "postgresql://test:test@localhost/test");
env::set_var("JWT_SECRET", "test-secret");
let config = Config::from_env().unwrap();
assert!(config.oidc_enabled);
assert_eq!(config.oidc_client_id.unwrap(), "my-app-client-id");
assert_eq!(config.oidc_client_secret.unwrap(), "super-secret-client-secret");
assert_eq!(config.oidc_issuer_url.unwrap(), "https://auth.example.com");
assert_eq!(config.oidc_redirect_uri.unwrap(), "https://myapp.com/auth/callback");
});
}
#[test]
fn test_oidc_config_precedence() {
run_with_env_isolation(|| {
// Test that environment variables take precedence
env::set_var("OIDC_ENABLED", "true");
env::set_var("OIDC_CLIENT_ID", "env-client-id");
env::set_var("DATABASE_URL", "postgresql://test:test@localhost/test");
env::set_var("JWT_SECRET", "test-secret");
let config = Config::from_env().unwrap();
assert!(config.oidc_enabled);
assert_eq!(config.oidc_client_id.unwrap(), "env-client-id");
});
}
}
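Note: these tests depend on a run_with_env_isolation helper that is not part of this diff. As a rough sketch only, assuming the helper simply serializes env-mutating tests and restores the process environment afterwards (names and details here are an assumption, not the crate's actual implementation), it could look like:

use std::collections::HashMap;
use std::env;
use std::sync::Mutex;

// Global lock so env-mutating tests cannot interleave.
static ENV_LOCK: Mutex<()> = Mutex::new(());

fn run_with_env_isolation<F: FnOnce()>(test_body: F) {
    // Keep working even if a previous test panicked while holding the lock.
    let _guard = ENV_LOCK.lock().unwrap_or_else(|poisoned| poisoned.into_inner());

    // Snapshot the environment before the test mutates it.
    let snapshot: HashMap<String, String> = env::vars().collect();

    test_body();

    // Remove variables the test introduced, then restore the original values.
    let current_keys: Vec<String> = env::vars().map(|(key, _)| key).collect();
    for key in current_keys {
        if !snapshot.contains_key(&key) {
            env::remove_var(&key);
        }
    }
    for (key, value) in &snapshot {
        env::set_var(key, value);
    }
}

A production-grade helper would likely restore the snapshot from a drop guard (or catch_unwind) so that a panicking test body still cleans up after itself.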

View File

@ -49,6 +49,9 @@ mod tests {
updated_at: Utc::now(),
user_id,
file_hash: Some("abcd1234567890123456789012345678901234567890123456789012345678".to_string()),
original_created_at: None,
original_modified_at: None,
source_metadata: None,
}
}
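For context, the three fields initialized to None in these fixtures presumably correspond to new columns on the Document model. A hedged sketch of their shape, with names and types inferred from how the tests and the WebDAV parser later in this commit use them, rather than from the model file itself:

use chrono::{DateTime, Utc};
use serde_json::Value;

pub struct Document {
    // ...existing fields (id, filename, file_path, ocr_*, tags, user_id, file_hash, ...)
    pub original_created_at: Option<DateTime<Utc>>,  // creation time reported by the source system
    pub original_modified_at: Option<DateTime<Utc>>, // modification time reported by the source system
    pub source_metadata: Option<Value>,              // arbitrary source-specific properties as JSON
}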

View File

@ -60,6 +60,9 @@ mod document_routes_deletion_tests {
updated_at: Utc::now(),
user_id,
file_hash: Some("hash123".to_string()),
original_created_at: None,
original_modified_at: None,
source_metadata: None,
}
}
@ -394,6 +397,9 @@ mod document_routes_deletion_tests {
updated_at: Utc::now(),
user_id,
file_hash: Some("abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890".to_string()),
original_created_at: None,
original_modified_at: None,
source_metadata: None,
}
}

View File

@ -26,6 +26,9 @@ fn create_test_document(user_id: Uuid) -> Document {
updated_at: Utc::now(),
user_id,
file_hash: Some("1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef".to_string()),
original_created_at: None,
original_modified_at: None,
source_metadata: None,
}
}
@ -51,6 +54,9 @@ fn create_test_document_without_ocr(user_id: Uuid) -> Document {
updated_at: Utc::now(),
user_id,
file_hash: Some("fedcba0987654321fedcba0987654321fedcba0987654321fedcba0987654321".to_string()),
original_created_at: None,
original_modified_at: None,
source_metadata: None,
}
}
@ -76,6 +82,9 @@ fn create_test_document_with_ocr_error(user_id: Uuid) -> Document {
updated_at: Utc::now(),
user_id,
file_hash: Some("abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890".to_string()),
original_created_at: None,
original_modified_at: None,
source_metadata: None,
}
}
@ -1552,6 +1561,9 @@ mod deletion_error_handling_tests {
updated_at: Utc::now(),
user_id,
file_hash: Some("test_hash_123456789abcdef123456789abcdef123456789abcdef123456789abcdef".to_string()),
original_created_at: None,
original_modified_at: None,
source_metadata: None,
}
}

View File

@ -76,7 +76,7 @@ mod tests {
assert_eq!(ocr_result.text.trim(), test_content);
assert_eq!(ocr_result.confidence, 100.0); // Plain text should be 100% confident
assert_eq!(ocr_result.word_count, 9); // "This is a test text file with multiple words"
assert!(ocr_result.processing_time_ms >= 0);
assert!(ocr_result.preprocessing_applied.contains(&"Plain text read".to_string()));
}

View File

@ -939,6 +939,9 @@ mod tests {
updated_at: Utc::now(),
user_id: user.id,
file_hash: Some("0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef".to_string()),
original_created_at: None,
original_modified_at: None,
source_metadata: None,
};
db.create_document(document).await.unwrap();

View File

@ -82,6 +82,9 @@ mod tests {
"application/pdf",
user_id,
Some("abcd1234hash".to_string()),
None, // original_created_at
None, // original_modified_at
None, // source_metadata
);
assert_eq!(document.filename, "saved_file.pdf");
@ -189,6 +192,9 @@ mod file_deletion_tests {
updated_at: Utc::now(),
user_id,
file_hash: Some("hash123".to_string()),
original_created_at: None,
original_modified_at: None,
source_metadata: None,
};
(
@ -324,6 +330,9 @@ mod file_deletion_tests {
updated_at: Utc::now(),
user_id,
file_hash: None,
original_created_at: None,
original_modified_at: None,
source_metadata: None,
};
// Try to delete nonexistent files (should not fail)
@ -375,6 +384,9 @@ mod file_deletion_tests {
updated_at: Utc::now(),
user_id,
file_hash: Some("imagehash456".to_string()),
original_created_at: None,
original_modified_at: None,
source_metadata: None,
};
// Verify files exist
@ -430,6 +442,9 @@ mod file_deletion_tests {
updated_at: Utc::now(),
user_id,
file_hash: Some("hash789".to_string()),
original_created_at: None,
original_modified_at: None,
source_metadata: None,
};
// Verify files exist
@ -476,6 +491,9 @@ mod file_deletion_tests {
updated_at: Utc::now(),
user_id,
file_hash: Some("texthash".to_string()),
original_created_at: None,
original_modified_at: None,
source_metadata: None,
};
// Verify file exists

View File

@ -81,6 +81,9 @@ mod tests {
updated_at: Utc::now(),
user_id,
file_hash: Some("1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef".to_string()),
original_created_at: None,
original_modified_at: None,
source_metadata: None,
};
sqlx::query("INSERT INTO documents (id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19)")

View File

@ -3,6 +3,7 @@ use chrono::{DateTime, Utc};
use quick_xml::events::{BytesStart, Event};
use quick_xml::reader::Reader;
use std::str;
use serde_json;
use crate::models::FileInfo;
@ -15,6 +16,12 @@ struct PropFindResponse {
content_type: Option<String>,
etag: Option<String>,
is_collection: bool,
creation_date: Option<String>,
owner: Option<String>,
group: Option<String>,
permissions: Option<String>,
owner_display_name: Option<String>,
metadata: Option<serde_json::Value>,
}
pub fn parse_propfind_response(xml_text: &str) -> Result<Vec<FileInfo>> {
@ -85,6 +92,53 @@ pub fn parse_propfind_response(xml_text: &str) -> Result<Vec<FileInfo>> {
"getetag" => {
resp.etag = Some(normalize_etag(&text));
}
"creationdate" => {
resp.creation_date = Some(text.trim().to_string());
}
"owner" => {
resp.owner = Some(text.trim().to_string());
}
"group" => {
resp.group = Some(text.trim().to_string());
}
_ => {
// Store any other properties as generic metadata
// This handles vendor-specific properties from any WebDAV server
if !text.trim().is_empty() && in_prop {
if resp.metadata.is_none() {
resp.metadata = Some(serde_json::Value::Object(serde_json::Map::new()));
}
if let Some(serde_json::Value::Object(ref mut map)) = resp.metadata {
// Special handling for known properties
match current_element.as_str() {
"permissions" | "oc:permissions" => {
resp.permissions = Some(text.trim().to_string());
map.insert("permissions_raw".to_string(), serde_json::Value::String(text.trim().to_string()));
}
"fileid" | "oc:fileid" => {
map.insert("file_id".to_string(), serde_json::Value::String(text.trim().to_string()));
}
"owner-id" | "oc:owner-id" => {
map.insert("owner_id".to_string(), serde_json::Value::String(text.trim().to_string()));
}
"owner-display-name" | "oc:owner-display-name" => {
resp.owner_display_name = Some(text.trim().to_string());
map.insert("owner_display_name".to_string(), serde_json::Value::String(text.trim().to_string()));
}
"has-preview" | "nc:has-preview" => {
if let Ok(val) = text.trim().parse::<bool>() {
map.insert("has_preview".to_string(), serde_json::Value::Bool(val));
}
}
_ => {
// Store any other property as-is
map.insert(current_element.clone(), serde_json::Value::String(text.trim().to_string()));
}
}
}
}
}
"status" if in_propstat => {
// Check if status is 200 OK
if text.contains("200") {
@ -120,6 +174,33 @@ pub fn parse_propfind_response(xml_text: &str) -> Result<Vec<FileInfo>> {
.unwrap_or_else(|_| std::borrow::Cow::Borrowed(&name))
.to_string();
// Parse creation date
let created_at = resp.creation_date
.as_ref()
.and_then(|d| parse_http_date(d));
// Parse permissions (Nextcloud/ownCloud format)
let permissions_int = resp.permissions
.as_ref()
.and_then(|p| {
// Nextcloud permissions are a string like "RGDNVW"
// Convert to Unix-style octal permissions
if p.chars().all(|c| c.is_uppercase()) {
// This is Nextcloud format
let mut perms = 0u32;
if p.contains('R') { perms |= 0o444; } // Read
if p.contains('W') { perms |= 0o222; } // Write
if p.contains('D') { perms |= 0o111; } // Delete (execute-like)
Some(perms)
} else {
// Try to parse as numeric
p.parse().ok()
}
});
// Use the metadata collected during parsing
let metadata = resp.metadata;
let file_info = FileInfo {
path: resp.href.clone(),
name,
@ -128,6 +209,11 @@ pub fn parse_propfind_response(xml_text: &str) -> Result<Vec<FileInfo>> {
last_modified: parse_http_date(&resp.last_modified.unwrap_or_default()),
etag: resp.etag.unwrap_or_else(|| format!("\"{}\"", uuid::Uuid::new_v4())),
is_directory: false,
created_at,
permissions: permissions_int,
owner: resp.owner.or(resp.owner_display_name),
group: resp.group,
metadata,
};
files.push(file_info);
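To make the permission conversion above concrete, the same bit mapping can be exercised standalone; the helper name and sample strings below are illustrative only (the real conversion is inlined in parse_propfind_response):

// Mirrors the bit mapping used above for Nextcloud/ownCloud permission strings.
fn nextcloud_perms_to_octal(p: &str) -> u32 {
    let mut perms = 0u32;
    if p.contains('R') { perms |= 0o444; } // readable
    if p.contains('W') { perms |= 0o222; } // writable
    if p.contains('D') { perms |= 0o111; } // deletable, mapped onto the execute bits
    perms
}

#[test]
fn nextcloud_permission_mapping_examples() {
    assert_eq!(nextcloud_perms_to_octal("RGDNVW"), 0o777); // full-access string
    assert_eq!(nextcloud_perms_to_octal("RG"), 0o444);     // read and share only
    assert_eq!(nextcloud_perms_to_octal("G"), 0o000);      // share only, no R/W/D bits
}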

View File

@ -41,6 +41,9 @@ fn create_test_document(user_id: Uuid, filename: &str, file_hash: String) -> Doc
updated_at: Utc::now(),
user_id,
file_hash: Some(file_hash),
original_created_at: None,
original_modified_at: None,
source_metadata: None,
}
}

View File

@ -239,6 +239,9 @@ impl FileProcessingTestClient {
ocr_word_count: doc.ocr_word_count,
ocr_processing_time_ms: doc.ocr_processing_time_ms,
ocr_status: doc.ocr_status.clone(),
original_created_at: None,
original_modified_at: None,
source_metadata: None,
};
return Ok(doc_copy);
}

View File

@ -59,6 +59,9 @@ fn create_test_document(user_id: Uuid, filename: &str, file_hash: Option<String>
updated_at: Utc::now(),
user_id,
file_hash,
original_created_at: None,
original_modified_at: None,
source_metadata: None,
}
}
@ -248,6 +251,9 @@ async fn test_file_service_create_document_with_hash() {
"application/pdf",
user_id,
Some(test_hash.to_string()),
None, // original_created_at
None, // original_modified_at
None, // source_metadata
);
assert_eq!(document.filename, "test.pdf");
@ -271,6 +277,9 @@ async fn test_file_service_create_document_without_hash() {
"application/pdf",
user_id,
None,
None, // original_created_at
None, // original_modified_at
None, // source_metadata
);
assert_eq!(document.filename, "test.pdf");

View File

@ -356,6 +356,9 @@ async fn test_create_ignored_file_from_document() -> Result<()> {
updated_at: chrono::Utc::now(),
user_id,
file_hash: Some("document_hash_123".to_string()),
original_created_at: None,
original_modified_at: None,
source_metadata: None,
};
// Insert document into database

View File

@ -29,6 +29,11 @@ fn create_test_file_info(name: &str, path: &str, content: &[u8]) -> FileInfo {
etag: "test-etag".to_string(),
mime_type: "application/pdf".to_string(),
is_directory: false,
created_at: None,
permissions: None,
owner: None,
group: None,
metadata: None,
}
}
@ -54,6 +59,9 @@ fn create_test_document(user_id: Uuid, filename: &str, file_hash: String) -> Doc
updated_at: Utc::now(),
user_id,
file_hash: Some(file_hash),
original_created_at: None,
original_modified_at: None,
source_metadata: None,
}
}
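Taken together with the PROPFIND parser earlier in the commit, these initializers suggest FileInfo now carries the new metadata alongside its existing fields. A rough, assumed sketch reconstructed from the test fixtures rather than from the models module:

use chrono::{DateTime, Utc};
use serde_json::Value;

// Assumed shape only; names and types are taken from how this commit's tests
// and WebDAV parser populate FileInfo.
pub struct FileInfo {
    pub path: String,
    pub name: String,
    pub size: i64,
    pub mime_type: String,
    pub last_modified: Option<DateTime<Utc>>,
    pub etag: String,
    pub is_directory: bool,
    // New in this commit:
    pub created_at: Option<DateTime<Utc>>, // from the WebDAV creationdate property
    pub permissions: Option<u32>,          // Unix-style bits derived from oc:permissions
    pub owner: Option<String>,             // owner, falling back to owner-display-name
    pub group: Option<String>,
    pub metadata: Option<Value>,           // any other vendor-specific properties
}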

View File

@ -29,6 +29,11 @@ fn create_test_file_info(name: &str, path: &str, size: i64) -> FileInfo {
etag: "test-etag".to_string(),
mime_type: "application/pdf".to_string(),
is_directory: false,
created_at: None,
permissions: None,
owner: None,
group: None,
metadata: None,
}
}
@ -54,6 +59,9 @@ fn create_test_document(user_id: Uuid, filename: &str, file_hash: String) -> Doc
updated_at: Utc::now(),
user_id,
file_hash: Some(file_hash),
original_created_at: None,
original_modified_at: None,
source_metadata: None,
}
}
@ -280,6 +288,11 @@ async fn test_webdav_sync_etag_change_detection() -> Result<()> {
etag: new_etag.to_string(),
mime_type: "application/pdf".to_string(),
is_directory: false,
created_at: None,
permissions: None,
owner: None,
group: None,
metadata: None,
};
// ETag comparison should detect change

View File

@ -25,6 +25,9 @@ fn test_document_response_conversion_with_ocr() {
updated_at: Utc::now(),
user_id,
file_hash: Some("abc123".to_string()),
original_created_at: None,
original_modified_at: None,
source_metadata: None,
};
let response: DocumentResponse = document.clone().into();
@ -59,6 +62,9 @@ fn test_document_response_conversion_without_ocr() {
updated_at: Utc::now(),
user_id,
file_hash: None,
original_created_at: None,
original_modified_at: None,
source_metadata: None,
};
let response: DocumentResponse = document.clone().into();

View File

@ -607,6 +607,11 @@ fn test_special_characters_in_paths() {
last_modified: Some(Utc::now()),
etag: "\"test123\"".to_string(),
is_directory: false,
created_at: None,
permissions: None,
owner: None,
group: None,
metadata: None,
};
assert!(!file_info.name.is_empty());