diff --git a/frontend/src/pages/FailedOcrPage.tsx b/frontend/src/pages/FailedOcrPage.tsx index d6f8a10..538a5e4 100644 --- a/frontend/src/pages/FailedOcrPage.tsx +++ b/frontend/src/pages/FailedOcrPage.tsx @@ -29,6 +29,7 @@ import { Snackbar, Tabs, Tab, + useTheme, } from '@mui/material'; import { Refresh as RefreshIcon, @@ -40,6 +41,8 @@ import { Visibility as VisibilityIcon, Download as DownloadIcon, FileCopy as FileCopyIcon, + Delete as DeleteIcon, + FindInPage as FindInPageIcon, } from '@mui/icons-material'; import { format } from 'date-fns'; import { api, documentService } from '../services/api'; @@ -122,6 +125,7 @@ interface DuplicatesResponse { } const FailedOcrPage: React.FC = () => { + const theme = useTheme(); const [currentTab, setCurrentTab] = useState(0); const [documents, setDocuments] = useState([]); const [duplicates, setDuplicates] = useState([]); @@ -580,9 +584,19 @@ const FailedOcrPage: React.FC = () => { ) : ( <> - Duplicate Documents + Duplicate Documents Found These documents have identical content but may have different filenames. - You can click on each group to see all the documents with the same content. + You can expand each group to see all files with the same content and choose which ones to keep. + + + + What should you do? + +
  • Review each group: Click to expand and see all duplicate files
  • +
  • Keep the best version: Choose the file with the most descriptive name
  • +
  • Check content: Use View/Download to verify files are truly identical
  • +
  • Note for admin: Consider implementing bulk delete functionality for duplicates
  • +
    @@ -640,45 +654,124 @@ const FailedOcrPage: React.FC = () => { - - + + + Duplicate Files ({group.duplicate_count} total) + + + Storage Impact: These {group.duplicate_count} files contain identical content. + Consider keeping only the best-named version to save space. + + {group.documents.map((doc, index) => ( - - + + - - {doc.filename} - + + + {doc.filename} + + {index === 0 && ( + + )} + + {doc.original_filename !== doc.filename && ( - + Original: {doc.original_filename} )} - + + {formatFileSize(doc.file_size)} • {doc.mime_type} - + + Uploaded: {format(new Date(doc.created_at), 'MMM dd, yyyy HH:mm')} - - - + + + window.open(`/api/documents/${doc.id}/view`, '_blank')} + sx={{ color: theme.palette.primary.main }} + > + + + + + window.open(`/api/documents/${doc.id}/download`, '_blank')} + sx={{ color: theme.palette.secondary.main }} + > + + + + + + + diff --git a/src/db/documents.rs b/src/db/documents.rs index 12d946d..78ec47d 100644 --- a/src/db/documents.rs +++ b/src/db/documents.rs @@ -128,7 +128,7 @@ impl Database { // Admin with OCR filter sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash FROM documents WHERE ocr_status = $3 ORDER BY created_at DESC @@ -145,7 +145,7 @@ impl Database { // Admin without OCR filter sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash FROM documents ORDER BY created_at DESC LIMIT $1 OFFSET $2 @@ -160,7 +160,7 @@ impl Database { // Regular user with OCR filter sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash FROM documents WHERE user_id = $3 AND ocr_status = $4 ORDER BY created_at DESC @@ -178,7 +178,7 @@ impl Database { // Regular user without OCR filter sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash FROM documents WHERE user_id = $3 ORDER BY created_at DESC @@ -267,7 +267,7 @@ impl Database { pub async fn get_documents_by_user(&self, user_id: Uuid, limit: i64, offset: i64) -> Result> { let rows = sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash FROM documents WHERE user_id = $1 ORDER BY created_at DESC @@ -311,7 +311,7 @@ impl Database { pub async fn find_documents_by_filename(&self, filename: &str) -> Result> { let rows = sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash FROM documents WHERE filename = $1 OR original_filename = $1 ORDER BY created_at DESC @@ -352,7 +352,7 @@ impl Database { pub async fn search_documents(&self, user_id: Uuid, search: SearchRequest) -> Result<(Vec, i64)> { let mut query_builder = QueryBuilder::new( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, ts_rank(to_tsvector('english', COALESCE(content, '') || ' ' || COALESCE(ocr_text, '')), plainto_tsquery('english', "# ); @@ -455,7 +455,7 @@ impl Database { // Use trigram similarity for substring matching let mut builder = QueryBuilder::new( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, GREATEST( similarity(filename, "# ); @@ -498,7 +498,7 @@ impl Database { let mut builder = QueryBuilder::new(&format!( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, GREATEST( CASE WHEN filename ILIKE '%' || "# )); @@ -644,7 +644,7 @@ impl Database { // Use trigram similarity for substring matching let mut builder = QueryBuilder::new( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, GREATEST( similarity(filename, "# ); @@ -683,7 +683,7 @@ impl Database { let mut builder = QueryBuilder::new(&format!( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, GREATEST( CASE WHEN filename ILIKE '%' || "# )); @@ -1080,14 +1080,14 @@ impl Database { let query = if user_role == crate::models::UserRole::Admin { // Admins can see any document r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash FROM documents WHERE id = $1 "# } else { // Regular users can only see their own documents r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash FROM documents WHERE id = $1 AND user_id = $2 "#