diff --git a/frontend/src/pages/DocumentManagementPage.tsx b/frontend/src/pages/DocumentManagementPage.tsx index 96079a6..da50d4c 100644 --- a/frontend/src/pages/DocumentManagementPage.tsx +++ b/frontend/src/pages/DocumentManagementPage.tsx @@ -19,6 +19,7 @@ import { Dialog, DialogTitle, DialogContent, + DialogContentText, DialogActions, Pagination, CircularProgress, @@ -233,6 +234,7 @@ const DocumentManagementPage: React.FC = () => { const [retryHistoryModalOpen, setRetryHistoryModalOpen] = useState(false); const [selectedDocumentForHistory, setSelectedDocumentForHistory] = useState(null); const [selectedDocumentIds, setSelectedDocumentIds] = useState([]); + const [confirmRetryAllOpen, setConfirmRetryAllOpen] = useState(false); const fetchFailedDocuments = async () => { try { @@ -358,6 +360,42 @@ const DocumentManagementPage: React.FC = () => { } }; + const handleRetryAllDocuments = async () => { + try { + setRetryingAll(true); + const response = await documentService.bulkRetryOcr({ + mode: 'all', + preview_only: false + }); + + if (response.data.queued_count > 0) { + setSnackbar({ + open: true, + message: `Successfully queued ${response.data.queued_count} documents for OCR retry. Estimated processing time: ${Math.ceil(response.data.estimated_total_time_minutes)} minutes.`, + severity: 'success' + }); + + // Refresh all tabs since we're retrying all documents + await refreshCurrentTab(); + } else { + setSnackbar({ + open: true, + message: 'No documents found to retry', + severity: 'info' + }); + } + } catch (error) { + console.error('Error retrying all documents:', error); + setSnackbar({ + open: true, + message: 'Failed to retry documents. Please try again.', + severity: 'error' + }); + } finally { + setRetryingAll(false); + } + }; + const handleRetryAllFailed = async () => { try { setRetryingAll(true); @@ -735,14 +773,33 @@ const DocumentManagementPage: React.FC = () => { Document Management - + + + + @@ -825,7 +882,7 @@ const DocumentManagementPage: React.FC = () => { size="small" fullWidth > - {retryingAll ? 'Retrying All...' : 'Retry All Failed OCR'} + {retryingAll ? 'Retrying...' : 'Retry Failed Only'} @@ -2219,6 +2276,43 @@ const DocumentManagementPage: React.FC = () => { + {/* Confirm Retry All Documents Dialog */} + setConfirmRetryAllOpen(false)}> + + + + Retry All Documents + + + + + This will retry OCR processing for all documents in your library, regardless of their current OCR status. + This includes documents that have already been successfully processed. + + + + Note: This is a resource-intensive operation that may take a significant amount of time depending on the number of documents. + + + + + + + + + {/* Advanced Retry Modal */} Result { let row = sqlx::query( r#" - INSERT INTO documents (id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22) - RETURNING id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata + INSERT INTO documents (id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24) + RETURNING id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata "# ) .bind(document.id) @@ -29,6 +29,8 @@ impl Database { .bind(&document.ocr_status) .bind(&document.ocr_error) .bind(document.ocr_completed_at) + .bind(document.ocr_retry_count) + .bind(&document.ocr_failure_reason) .bind(&document.tags) .bind(document.created_at) .bind(document.updated_at) @@ -55,6 +57,8 @@ impl Database { ocr_status: row.get("ocr_status"), ocr_error: row.get("ocr_error"), ocr_completed_at: row.get("ocr_completed_at"), + ocr_retry_count: row.get("ocr_retry_count"), + ocr_failure_reason: row.get("ocr_failure_reason"), tags: row.get("tags"), created_at: row.get("created_at"), updated_at: row.get("updated_at"), @@ -70,7 +74,7 @@ impl Database { let query = if user_role == crate::models::UserRole::Admin { // Admins can see all documents r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata, original_created_at, original_modified_at, source_metadata + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents ORDER BY created_at DESC LIMIT $1 OFFSET $2 @@ -78,7 +82,7 @@ impl Database { } else { // Regular users can only see their own documents r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata, original_created_at, original_modified_at, source_metadata + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE user_id = $3 ORDER BY created_at DESC @@ -118,6 +122,8 @@ impl Database { ocr_status: row.get("ocr_status"), ocr_error: row.get("ocr_error"), ocr_completed_at: row.get("ocr_completed_at"), + ocr_retry_count: row.get("ocr_retry_count"), + ocr_failure_reason: row.get("ocr_failure_reason"), tags: row.get("tags"), created_at: row.get("created_at"), updated_at: row.get("updated_at"), @@ -138,7 +144,7 @@ impl Database { // Admin with OCR filter sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE ocr_status = $3 ORDER BY created_at DESC @@ -155,7 +161,7 @@ impl Database { // Admin without OCR filter sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents ORDER BY created_at DESC LIMIT $1 OFFSET $2 @@ -170,7 +176,7 @@ impl Database { // Regular user with OCR filter sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE user_id = $3 AND ocr_status = $4 ORDER BY created_at DESC @@ -188,7 +194,7 @@ impl Database { // Regular user without OCR filter sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE user_id = $3 ORDER BY created_at DESC @@ -220,6 +226,8 @@ impl Database { ocr_status: row.get("ocr_status"), ocr_error: row.get("ocr_error"), ocr_completed_at: row.get("ocr_completed_at"), + ocr_retry_count: row.get("ocr_retry_count"), + ocr_failure_reason: row.get("ocr_failure_reason"), tags: row.get("tags"), created_at: row.get("created_at"), updated_at: row.get("updated_at"), @@ -280,7 +288,7 @@ impl Database { pub async fn get_documents_by_user(&self, user_id: Uuid, limit: i64, offset: i64) -> Result> { let rows = sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE user_id = $1 ORDER BY created_at DESC @@ -310,6 +318,8 @@ impl Database { ocr_status: row.get("ocr_status"), ocr_error: row.get("ocr_error"), ocr_completed_at: row.get("ocr_completed_at"), + ocr_retry_count: row.get("ocr_retry_count"), + ocr_failure_reason: row.get("ocr_failure_reason"), tags: row.get("tags"), created_at: row.get("created_at"), updated_at: row.get("updated_at"), @@ -327,7 +337,7 @@ impl Database { pub async fn find_documents_by_filename(&self, filename: &str) -> Result> { let rows = sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE filename = $1 OR original_filename = $1 ORDER BY created_at DESC @@ -354,6 +364,8 @@ impl Database { ocr_status: row.get("ocr_status"), ocr_error: row.get("ocr_error"), ocr_completed_at: row.get("ocr_completed_at"), + ocr_retry_count: row.get("ocr_retry_count"), + ocr_failure_reason: row.get("ocr_failure_reason"), tags: row.get("tags"), created_at: row.get("created_at"), updated_at: row.get("updated_at"), @@ -371,7 +383,7 @@ impl Database { pub async fn search_documents(&self, user_id: Uuid, search: SearchRequest) -> Result<(Vec, i64)> { let mut query_builder = QueryBuilder::new( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata, + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata, ts_rank(to_tsvector('english', COALESCE(content, '') || ' ' || COALESCE(ocr_text, '')), plainto_tsquery('english', "# ); @@ -428,6 +440,8 @@ impl Database { ocr_status: row.get("ocr_status"), ocr_error: row.get("ocr_error"), ocr_completed_at: row.get("ocr_completed_at"), + ocr_retry_count: row.get("ocr_retry_count"), + ocr_failure_reason: row.get("ocr_failure_reason"), tags: row.get("tags"), created_at: row.get("created_at"), updated_at: row.get("updated_at"), @@ -477,7 +491,7 @@ impl Database { // Use trigram similarity for substring matching let mut builder = QueryBuilder::new( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata, + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata, GREATEST( similarity(filename, "# ); @@ -520,7 +534,7 @@ impl Database { let mut builder = QueryBuilder::new(&format!( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata, + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata, GREATEST( CASE WHEN filename ILIKE '%' || "# )); @@ -666,7 +680,7 @@ impl Database { // Use trigram similarity for substring matching let mut builder = QueryBuilder::new( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata, + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata, GREATEST( similarity(filename, "# ); @@ -705,7 +719,7 @@ impl Database { let mut builder = QueryBuilder::new(&format!( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata, + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata, GREATEST( CASE WHEN filename ILIKE '%' || "# )); @@ -982,7 +996,7 @@ impl Database { pub async fn get_recent_documents_for_source(&self, source_id: Uuid, limit: i64) -> Result> { let rows = sqlx::query( - r#"SELECT * FROM documents + r#"SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE source_id = $1 ORDER BY created_at DESC LIMIT $2"# @@ -1009,6 +1023,8 @@ impl Database { ocr_status: row.get("ocr_status"), ocr_error: row.get("ocr_error"), ocr_completed_at: row.get("ocr_completed_at"), + ocr_retry_count: row.get("ocr_retry_count"), + ocr_failure_reason: row.get("ocr_failure_reason"), tags: row.get("tags"), created_at: row.get("created_at"), updated_at: row.get("updated_at"), @@ -1105,14 +1121,14 @@ impl Database { let query = if user_role == crate::models::UserRole::Admin { // Admins can see any document r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE id = $1 "# } else { // Regular users can only see their own documents r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE id = $1 AND user_id = $2 "# @@ -1147,6 +1163,8 @@ impl Database { ocr_status: row.get("ocr_status"), ocr_error: row.get("ocr_error"), ocr_completed_at: row.get("ocr_completed_at"), + ocr_retry_count: row.get("ocr_retry_count"), + ocr_failure_reason: row.get("ocr_failure_reason"), tags: row.get("tags"), created_at: row.get("created_at"), updated_at: row.get("updated_at"), @@ -1164,7 +1182,7 @@ impl Database { pub async fn get_document_by_user_and_hash(&self, user_id: Uuid, file_hash: &str) -> Result> { let row = sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE user_id = $1 AND file_hash = $2 LIMIT 1 @@ -1191,6 +1209,8 @@ impl Database { ocr_status: row.get("ocr_status"), ocr_error: row.get("ocr_error"), ocr_completed_at: row.get("ocr_completed_at"), + ocr_retry_count: row.get("ocr_retry_count"), + ocr_failure_reason: row.get("ocr_failure_reason"), tags: row.get("tags"), created_at: row.get("created_at"), updated_at: row.get("updated_at"), @@ -1396,7 +1416,7 @@ impl Database { r#" DELETE FROM documents WHERE id = $1 - RETURNING id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata + RETURNING id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata "#, ) .bind(document_id) @@ -1418,6 +1438,8 @@ impl Database { ocr_status: r.get("ocr_status"), ocr_error: r.get("ocr_error"), ocr_completed_at: r.get("ocr_completed_at"), + ocr_retry_count: r.get("ocr_retry_count"), + ocr_failure_reason: r.get("ocr_failure_reason"), tags: r.get("tags"), created_at: r.get("created_at"), updated_at: r.get("updated_at"), @@ -1432,7 +1454,7 @@ impl Database { r#" DELETE FROM documents WHERE id = $1 AND user_id = $2 - RETURNING id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata + RETURNING id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata "#, ) .bind(document_id) @@ -1455,6 +1477,8 @@ impl Database { ocr_status: r.get("ocr_status"), ocr_error: r.get("ocr_error"), ocr_completed_at: r.get("ocr_completed_at"), + ocr_retry_count: r.get("ocr_retry_count"), + ocr_failure_reason: r.get("ocr_failure_reason"), tags: r.get("tags"), created_at: r.get("created_at"), updated_at: r.get("updated_at"), @@ -1479,7 +1503,7 @@ impl Database { r#" DELETE FROM documents WHERE id = ANY($1) - RETURNING id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata + RETURNING id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata "#, ) .bind(document_ids) @@ -1501,6 +1525,8 @@ impl Database { ocr_status: r.get("ocr_status"), ocr_error: r.get("ocr_error"), ocr_completed_at: r.get("ocr_completed_at"), + ocr_retry_count: r.get("ocr_retry_count"), + ocr_failure_reason: r.get("ocr_failure_reason"), tags: r.get("tags"), created_at: r.get("created_at"), updated_at: r.get("updated_at"), @@ -1515,7 +1541,7 @@ impl Database { r#" DELETE FROM documents WHERE id = ANY($1) AND user_id = $2 - RETURNING id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata + RETURNING id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata "#, ) .bind(document_ids) @@ -1538,6 +1564,8 @@ impl Database { ocr_status: r.get("ocr_status"), ocr_error: r.get("ocr_error"), ocr_completed_at: r.get("ocr_completed_at"), + ocr_retry_count: r.get("ocr_retry_count"), + ocr_failure_reason: r.get("ocr_failure_reason"), tags: r.get("tags"), created_at: r.get("created_at"), updated_at: r.get("updated_at"), @@ -1557,7 +1585,7 @@ impl Database { let documents = if user_role == crate::models::UserRole::Admin { let rows = sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE ocr_confidence IS NOT NULL AND ocr_confidence < $1 ORDER BY ocr_confidence ASC, created_at DESC @@ -1582,6 +1610,8 @@ impl Database { ocr_status: r.get("ocr_status"), ocr_error: r.get("ocr_error"), ocr_completed_at: r.get("ocr_completed_at"), + ocr_retry_count: r.get("ocr_retry_count"), + ocr_failure_reason: r.get("ocr_failure_reason"), tags: r.get("tags"), created_at: r.get("created_at"), updated_at: r.get("updated_at"), @@ -1594,7 +1624,7 @@ impl Database { } else { let rows = sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE ocr_confidence IS NOT NULL AND ocr_confidence < $1 AND user_id = $2 ORDER BY ocr_confidence ASC, created_at DESC @@ -1620,6 +1650,8 @@ impl Database { ocr_status: r.get("ocr_status"), ocr_error: r.get("ocr_error"), ocr_completed_at: r.get("ocr_completed_at"), + ocr_retry_count: r.get("ocr_retry_count"), + ocr_failure_reason: r.get("ocr_failure_reason"), tags: r.get("tags"), created_at: r.get("created_at"), updated_at: r.get("updated_at"), @@ -1639,7 +1671,7 @@ impl Database { let documents = if user_role == crate::models::UserRole::Admin { let rows = sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE ocr_status = 'failed' OR (ocr_confidence IS NULL AND ocr_status != 'pending' AND ocr_status != 'processing') ORDER BY created_at DESC @@ -1663,6 +1695,8 @@ impl Database { ocr_status: r.get("ocr_status"), ocr_error: r.get("ocr_error"), ocr_completed_at: r.get("ocr_completed_at"), + ocr_retry_count: r.get("ocr_retry_count"), + ocr_failure_reason: r.get("ocr_failure_reason"), tags: r.get("tags"), created_at: r.get("created_at"), updated_at: r.get("updated_at"), @@ -1675,7 +1709,7 @@ impl Database { } else { let rows = sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE (ocr_status = 'failed' OR (ocr_confidence IS NULL AND ocr_status != 'pending' AND ocr_status != 'processing')) AND user_id = $1 ORDER BY created_at DESC @@ -1700,6 +1734,8 @@ impl Database { ocr_status: r.get("ocr_status"), ocr_error: r.get("ocr_error"), ocr_completed_at: r.get("ocr_completed_at"), + ocr_retry_count: r.get("ocr_retry_count"), + ocr_failure_reason: r.get("ocr_failure_reason"), tags: r.get("tags"), created_at: r.get("created_at"), updated_at: r.get("updated_at"), @@ -1719,7 +1755,7 @@ impl Database { let documents = if user_role == crate::models::UserRole::Admin { let rows = sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE (ocr_confidence IS NOT NULL AND ocr_confidence < $1) OR ocr_status = 'failed' @@ -1747,6 +1783,8 @@ impl Database { ocr_status: r.get("ocr_status"), ocr_error: r.get("ocr_error"), ocr_completed_at: r.get("ocr_completed_at"), + ocr_retry_count: r.get("ocr_retry_count"), + ocr_failure_reason: r.get("ocr_failure_reason"), tags: r.get("tags"), created_at: r.get("created_at"), updated_at: r.get("updated_at"), @@ -1759,7 +1797,7 @@ impl Database { } else { let rows = sqlx::query( r#" - SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, ocr_retry_count, ocr_failure_reason, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE ((ocr_confidence IS NOT NULL AND ocr_confidence < $1) OR ocr_status = 'failed') @@ -1789,6 +1827,8 @@ impl Database { ocr_status: r.get("ocr_status"), ocr_error: r.get("ocr_error"), ocr_completed_at: r.get("ocr_completed_at"), + ocr_retry_count: r.get("ocr_retry_count"), + ocr_failure_reason: r.get("ocr_failure_reason"), tags: r.get("tags"), created_at: r.get("created_at"), updated_at: r.get("updated_at"), diff --git a/src/db/ocr_retry.rs b/src/db/ocr_retry.rs index 3e9b1c4..23b0cf4 100644 --- a/src/db/ocr_retry.rs +++ b/src/db/ocr_retry.rs @@ -27,7 +27,21 @@ pub async fn record_ocr_retry( priority: i32, queue_id: Option, ) -> Result { + crate::debug_log!("OCR_RETRY_HISTORY", + "document_id" => document_id, + "user_id" => user_id, + "retry_reason" => retry_reason, + "priority" => priority, + "queue_id" => queue_id.unwrap_or_default(), + "message" => "Recording OCR retry attempt" + ); + // First get the current OCR status + crate::debug_log!("OCR_RETRY_HISTORY", + "document_id" => document_id, + "message" => "Fetching current OCR status" + ); + let current_status = sqlx::query( r#" SELECT ocr_status, ocr_failure_reason, ocr_error @@ -37,19 +51,38 @@ pub async fn record_ocr_retry( ) .bind(document_id) .fetch_optional(pool) - .await?; + .await + .map_err(|e| { + crate::debug_error!("OCR_RETRY_HISTORY", format!("Failed to fetch current status for document {}: {}", document_id, e)); + e + })?; let (previous_status, previous_failure_reason, previous_error) = if let Some(row) = current_status { - ( - row.get::, _>("ocr_status"), - row.get::, _>("ocr_failure_reason"), - row.get::, _>("ocr_error"), - ) + let status = row.get::, _>("ocr_status"); + let failure = row.get::, _>("ocr_failure_reason"); + let error = row.get::, _>("ocr_error"); + + crate::debug_log!("OCR_RETRY_HISTORY", + "document_id" => document_id, + "status" => status.as_deref().unwrap_or("none"), + "failure_reason" => failure.as_deref().unwrap_or("none"), + "has_error" => error.is_some(), + "message" => "Found current document status" + ); + + (status, failure, error) } else { + crate::debug_warn!("OCR_RETRY_HISTORY", "Document not found when recording retry history"); (None, None, None) }; // Insert retry history record + crate::debug_log!("OCR_RETRY_HISTORY", + "document_id" => document_id, + "previous_status" => previous_status.as_deref().unwrap_or("none"), + "message" => "Inserting retry history record" + ); + let retry_id: Uuid = sqlx::query_scalar( r#" INSERT INTO ocr_retry_history ( @@ -63,15 +96,25 @@ pub async fn record_ocr_retry( .bind(document_id) .bind(user_id) .bind(retry_reason) - .bind(previous_status) - .bind(previous_failure_reason) - .bind(previous_error) + .bind(&previous_status) + .bind(&previous_failure_reason) + .bind(&previous_error) .bind(priority) .bind(queue_id) .fetch_one(pool) - .await?; + .await + .map_err(|e| { + crate::debug_error!("OCR_RETRY_HISTORY", format!("Failed to insert retry history for document {}: {}", document_id, e)); + e + })?; // Increment retry count + crate::debug_log!("OCR_RETRY_HISTORY", + "document_id" => document_id, + "retry_id" => retry_id, + "message" => "Incrementing retry count" + ); + sqlx::query( r#" UPDATE documents @@ -82,7 +125,18 @@ pub async fn record_ocr_retry( ) .bind(document_id) .execute(pool) - .await?; + .await + .map_err(|e| { + crate::debug_error!("OCR_RETRY_HISTORY", format!("Failed to increment retry count for document {}: {}", document_id, e)); + e + })?; + + crate::debug_log!("OCR_RETRY_HISTORY", + "document_id" => document_id, + "retry_id" => retry_id, + "user_id" => user_id, + "message" => "Successfully recorded retry history" + ); Ok(retry_id) } diff --git a/src/lib.rs b/src/lib.rs index 90b15f8..c85f9c8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,6 +12,7 @@ pub mod scheduling; pub mod seed; pub mod services; pub mod swagger; +pub mod utils; pub mod webdav_xml_parser; #[cfg(test)] diff --git a/src/models.rs b/src/models.rs index b0fb0ed..28cf9de 100644 --- a/src/models.rs +++ b/src/models.rs @@ -129,6 +129,8 @@ pub struct Document { pub ocr_status: Option, pub ocr_error: Option, pub ocr_completed_at: Option>, + pub ocr_retry_count: Option, + pub ocr_failure_reason: Option, pub tags: Vec, pub created_at: DateTime, pub updated_at: DateTime, diff --git a/src/ocr/queue.rs b/src/ocr/queue.rs index 01f9186..2d32e86 100644 --- a/src/ocr/queue.rs +++ b/src/ocr/queue.rs @@ -75,6 +75,13 @@ impl OcrQueueService { /// Add a document to the OCR queue pub async fn enqueue_document(&self, document_id: Uuid, priority: i32, file_size: i64) -> Result { + crate::debug_log!("OCR_QUEUE", + "document_id" => document_id, + "priority" => priority, + "file_size" => file_size, + "message" => "Enqueueing document" + ); + let row = sqlx::query( r#" INSERT INTO ocr_queue (document_id, priority, file_size) @@ -86,10 +93,22 @@ impl OcrQueueService { .bind(priority) .bind(file_size) .fetch_one(&self.pool) - .await?; + .await + .map_err(|e| { + crate::debug_error!("OCR_QUEUE", format!("Failed to insert document {} into queue: {}", document_id, e)); + e + })?; let id: Uuid = row.get("id"); + crate::debug_log!("OCR_QUEUE", + "document_id" => document_id, + "queue_id" => id, + "priority" => priority, + "file_size" => file_size, + "message" => "Successfully enqueued document" + ); + info!("Enqueued document {} with priority {} for OCR processing", document_id, priority); Ok(id) } diff --git a/src/routes/documents.rs b/src/routes/documents.rs index 1475cc1..726e987 100644 --- a/src/routes/documents.rs +++ b/src/routes/documents.rs @@ -571,28 +571,55 @@ async fn retry_ocr( auth_user: AuthUser, Path(document_id): Path, ) -> Result, StatusCode> { + crate::debug_log!("OCR_RETRY", + "document_id" => document_id, + "user_id" => auth_user.user.id, + "message" => "Starting OCR retry request" + ); + // Check if document exists and belongs to user let document = state .db .get_document_by_id(document_id, auth_user.user.id, auth_user.user.role) .await - .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)? - .ok_or(StatusCode::NOT_FOUND)?; + .map_err(|e| { + crate::debug_error!("OCR_RETRY", format!("Failed to get document {}: {}", document_id, e)); + StatusCode::INTERNAL_SERVER_ERROR + })? + .ok_or_else(|| { + crate::debug_log!("OCR_RETRY", &format!("Document {} not found or access denied for user {}", document_id, auth_user.user.id)); + StatusCode::NOT_FOUND + })?; - // Check if document is eligible for OCR retry (failed or not processed) - let eligible = document.ocr_status.as_ref().map_or(true, |status| { - status == "failed" || status == "pending" - }); + // Check if document is eligible for OCR retry (all documents are now retryable) + let current_status = document.ocr_status.as_deref().unwrap_or("unknown"); + let eligible = true; // All documents are retryable + + crate::debug_log!("OCR_RETRY", + "document_id" => document_id, + "filename" => &document.filename, + "current_status" => current_status, + "eligible" => eligible, + "file_size" => document.file_size, + "retry_count" => document.ocr_retry_count.unwrap_or(0), + "message" => "Checking document eligibility" + ); if !eligible { + crate::debug_log!("OCR_RETRY", &format!("Document {} is not eligible for retry - current status: {}", document_id, current_status)); return Ok(Json(serde_json::json!({ "success": false, - "message": "Document is not eligible for OCR retry. Current status: {}", + "message": format!("Document is not eligible for OCR retry. Current status: {}", current_status), "current_status": document.ocr_status }))); } // Reset document OCR fields + crate::debug_log!("OCR_RETRY", + "document_id" => document_id, + "message" => "Resetting document OCR fields" + ); + let reset_result = sqlx::query( r#" UPDATE documents @@ -611,12 +638,22 @@ async fn retry_ocr( .bind(document_id) .execute(state.db.get_pool()) .await - .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; + .map_err(|e| { + crate::debug_error!("OCR_RETRY", format!("Failed to reset OCR fields for document {}: {}", document_id, e)); + StatusCode::INTERNAL_SERVER_ERROR + })?; if reset_result.rows_affected() == 0 { + crate::debug_error!("OCR_RETRY", format!("No rows affected when resetting OCR fields for document {}", document_id)); return Err(StatusCode::NOT_FOUND); } + crate::debug_log!("OCR_RETRY", + "document_id" => document_id, + "rows_affected" => reset_result.rows_affected(), + "message" => "Successfully reset OCR fields" + ); + // Calculate priority based on file size (higher priority for retries) let priority = match document.file_size { 0..=1048576 => 15, // <= 1MB: highest priority (boosted for retry) @@ -626,10 +663,38 @@ async fn retry_ocr( _ => 6, // > 50MB: lowest priority }; + crate::debug_log!("OCR_RETRY", + "document_id" => document_id, + "file_size" => document.file_size, + "priority" => priority, + "message" => "Calculated retry priority" + ); + // Add to OCR queue with detailed logging + crate::debug_log!("OCR_RETRY", + "document_id" => document_id, + "priority" => priority, + "file_size" => document.file_size, + "message" => "Enqueueing document for OCR processing" + ); + match state.queue_service.enqueue_document(document_id, priority, document.file_size).await { Ok(queue_id) => { + crate::debug_log!("OCR_RETRY", + "document_id" => document_id, + "queue_id" => queue_id, + "priority" => priority, + "message" => "Successfully enqueued document" + ); + // Record retry history + crate::debug_log!("OCR_RETRY", + "document_id" => document_id, + "user_id" => auth_user.user.id, + "queue_id" => queue_id, + "message" => "Recording retry history" + ); + if let Err(e) = crate::db::ocr_retry::record_ocr_retry( state.db.get_pool(), document_id, @@ -638,9 +703,25 @@ async fn retry_ocr( priority, Some(queue_id), ).await { + crate::debug_error!("OCR_RETRY", format!("Failed to record retry history for document {}: {}", document_id, e)); tracing::warn!("Failed to record retry history for document {}: {}", document_id, e); + } else { + crate::debug_log!("OCR_RETRY", + "document_id" => document_id, + "queue_id" => queue_id, + "message" => "Successfully recorded retry history" + ); } + crate::debug_log!("OCR_RETRY", + "document_id" => document_id, + "filename" => &document.filename, + "queue_id" => queue_id, + "priority" => priority, + "file_size" => document.file_size, + "message" => "OCR retry process completed successfully" + ); + tracing::info!( "OCR retry queued for document {} ({}): queue_id={}, priority={}, size={}", document_id, document.filename, queue_id, priority, document.file_size @@ -656,6 +737,7 @@ async fn retry_ocr( }))) } Err(e) => { + crate::debug_error!("OCR_RETRY", format!("Failed to enqueue document {}: {}", document_id, e)); tracing::error!("Failed to queue OCR retry for document {}: {}", document_id, e); Err(StatusCode::INTERNAL_SERVER_ERROR) } diff --git a/src/routes/documents_ocr_retry.rs b/src/routes/documents_ocr_retry.rs index 92baf64..bce5f03 100644 --- a/src/routes/documents_ocr_retry.rs +++ b/src/routes/documents_ocr_retry.rs @@ -101,62 +101,120 @@ pub async fn bulk_retry_ocr( auth_user: AuthUser, Json(request): Json, ) -> Result, StatusCode> { + crate::debug_log!("BULK_OCR_RETRY", + "user_id" => auth_user.user.id, + "mode" => format!("{:?}", request.mode), + "preview_only" => request.preview_only.unwrap_or(false), + "priority_override" => request.priority_override.unwrap_or(-1), + "message" => "Starting bulk OCR retry request" + ); + info!("Bulk OCR retry requested by user {} with mode: {:?}", auth_user.user.id, request.mode); let preview_only = request.preview_only.unwrap_or(false); // Build query based on selection mode + crate::debug_log!("BULK_OCR_RETRY", "Building document query based on selection mode"); + let documents = match request.mode { SelectionMode::All => { + crate::debug_log!("BULK_OCR_RETRY", "Fetching all documents for retry"); get_all_failed_ocr_documents(&state, &auth_user).await? } SelectionMode::Specific => { - if let Some(ids) = request.document_ids { - get_specific_documents(&state, &auth_user, ids).await? + if let Some(ids) = &request.document_ids { + crate::debug_log!("BULK_OCR_RETRY", + "document_count" => ids.len(), + "message" => "Fetching specific documents" + ); + get_specific_documents(&state, &auth_user, ids.clone()).await? } else { + crate::debug_error!("BULK_OCR_RETRY", "Specific mode requested but no document IDs provided"); return Err(StatusCode::BAD_REQUEST); } } SelectionMode::Filter => { - if let Some(filter) = request.filter { - get_filtered_documents(&state, &auth_user, filter).await? + if let Some(filter) = &request.filter { + crate::debug_log!("BULK_OCR_RETRY", + "filter_mime_types" => filter.mime_types.as_ref().map(|v| v.len()).unwrap_or(0), + "filter_failure_reasons" => filter.failure_reasons.as_ref().map(|v| v.len()).unwrap_or(0), + "message" => "Fetching filtered documents" + ); + get_filtered_documents(&state, &auth_user, filter.clone()).await? } else { + crate::debug_error!("BULK_OCR_RETRY", "Filter mode requested but no filter provided"); return Err(StatusCode::BAD_REQUEST); } } }; let matched_count = documents.len(); + crate::debug_log!("BULK_OCR_RETRY", + "matched_count" => matched_count, + "message" => "Document query completed" + ); let mut retry_documents = Vec::new(); let mut queued_count = 0; let mut total_estimated_time = 0.0; - for doc in documents { + for (index, doc) in documents.iter().enumerate() { let priority = calculate_priority(doc.file_size, request.priority_override); + crate::debug_log!("BULK_OCR_RETRY", + "index" => index + 1, + "total" => matched_count, + "document_id" => doc.id, + "filename" => &doc.filename, + "file_size" => doc.file_size, + "priority" => priority, + "failure_reason" => doc.ocr_failure_reason.as_deref().unwrap_or("none"), + "message" => "Processing document" + ); + let mut doc_info = OcrRetryDocumentInfo { id: doc.id, filename: doc.filename.clone(), file_size: doc.file_size, - mime_type: doc.mime_type, - ocr_failure_reason: doc.ocr_failure_reason, + mime_type: doc.mime_type.clone(), + ocr_failure_reason: doc.ocr_failure_reason.clone(), priority, queue_id: None, }; if !preview_only { // Reset OCR fields + crate::debug_log!("BULK_OCR_RETRY", + "document_id" => doc.id, + "message" => "Resetting OCR status for document" + ); + if let Err(e) = reset_document_ocr_status(&state, doc.id).await { + crate::debug_error!("BULK_OCR_RETRY", format!("Failed to reset OCR status for document {}: {}", doc.id, e)); warn!("Failed to reset OCR status for document {}: {}", doc.id, e); continue; } // Queue for OCR + crate::debug_log!("BULK_OCR_RETRY", + "document_id" => doc.id, + "priority" => priority, + "file_size" => doc.file_size, + "message" => "Enqueueing document for OCR" + ); + match state.queue_service.enqueue_document(doc.id, priority, doc.file_size).await { Ok(queue_id) => { doc_info.queue_id = Some(queue_id); queued_count += 1; + crate::debug_log!("BULK_OCR_RETRY", + "document_id" => doc.id, + "queue_id" => queue_id, + "priority" => priority, + "queued_count" => queued_count, + "message" => "Successfully enqueued document" + ); + // Record retry history let retry_reason = match &request.mode { SelectionMode::All => "bulk_retry_all", @@ -164,6 +222,13 @@ pub async fn bulk_retry_ocr( SelectionMode::Filter => "bulk_retry_filtered", }; + crate::debug_log!("BULK_OCR_RETRY", + "document_id" => doc.id, + "retry_reason" => retry_reason, + "queue_id" => queue_id, + "message" => "Recording retry history" + ); + if let Err(e) = crate::db::ocr_retry::record_ocr_retry( state.db.get_pool(), doc.id, @@ -172,12 +237,20 @@ pub async fn bulk_retry_ocr( priority, Some(queue_id), ).await { + crate::debug_error!("BULK_OCR_RETRY", format!("Failed to record retry history for document {}: {}", doc.id, e)); warn!("Failed to record retry history for document {}: {}", doc.id, e); + } else { + crate::debug_log!("BULK_OCR_RETRY", + "document_id" => doc.id, + "queue_id" => queue_id, + "message" => "Successfully recorded retry history" + ); } info!("Queued document {} for OCR retry with priority {}", doc.id, priority); } Err(e) => { + crate::debug_error!("BULK_OCR_RETRY", format!("Failed to enqueue document {}: {}", doc.id, e)); error!("Failed to queue document {} for OCR retry: {}", doc.id, e); } } @@ -188,6 +261,15 @@ pub async fn bulk_retry_ocr( retry_documents.push(doc_info); } + crate::debug_log!("BULK_OCR_RETRY", + "matched_count" => matched_count, + "queued_count" => queued_count, + "preview_only" => preview_only, + "estimated_time_minutes" => (total_estimated_time / 60.0) as i32, + "user_id" => auth_user.user.id, + "message" => "Bulk retry operation completed" + ); + let response = BulkOcrRetryResponse { success: true, message: if preview_only { @@ -303,8 +385,7 @@ pub async fn get_ocr_retry_stats( MIN(created_at) as first_occurrence, MAX(updated_at) as last_occurrence FROM documents - WHERE ocr_status = 'failed' - AND ($1::uuid IS NULL OR user_id = $1) + WHERE ($1::uuid IS NULL OR user_id = $1) GROUP BY ocr_failure_reason ORDER BY count DESC "# @@ -322,8 +403,7 @@ pub async fn get_ocr_retry_stats( COUNT(*) as count, AVG(file_size) as avg_file_size FROM documents - WHERE ocr_status = 'failed' - AND ($1::uuid IS NULL OR user_id = $1) + WHERE ($1::uuid IS NULL OR user_id = $1) GROUP BY mime_type ORDER BY count DESC "# @@ -441,8 +521,7 @@ async fn get_all_failed_ocr_documents( r#" SELECT id, filename, file_size, mime_type, ocr_failure_reason FROM documents - WHERE ocr_status = 'failed' - AND ($1::uuid IS NULL OR user_id = $1) + WHERE ($1::uuid IS NULL OR user_id = $1) ORDER BY created_at DESC "# ) @@ -465,12 +544,33 @@ async fn get_specific_documents( Some(auth_user.user.id) }; + // First let's debug what documents we're looking for and their current status + for doc_id in &document_ids { + if let Ok(Some(row)) = sqlx::query("SELECT id, filename, ocr_status FROM documents WHERE id = $1") + .bind(doc_id) + .fetch_optional(state.db.get_pool()) + .await { + let status: Option = row.get("ocr_status"); + let filename: String = row.get("filename"); + crate::debug_log!("BULK_OCR_RETRY", + "requested_document_id" => doc_id, + "filename" => &filename, + "current_ocr_status" => status.as_deref().unwrap_or("NULL"), + "message" => "Document found in database" + ); + } else { + crate::debug_log!("BULK_OCR_RETRY", + "requested_document_id" => doc_id, + "message" => "Document NOT found in database" + ); + } + } + let documents = sqlx::query_as::<_, DocumentInfo>( r#" SELECT id, filename, file_size, mime_type, ocr_failure_reason FROM documents WHERE id = ANY($1) - AND ocr_status = 'failed' AND ($2::uuid IS NULL OR user_id = $2) "# ) @@ -489,7 +589,7 @@ async fn get_filtered_documents( filter: OcrRetryFilter ) -> Result, StatusCode> { let mut query = sqlx::QueryBuilder::new( - "SELECT id, filename, file_size, mime_type, ocr_failure_reason FROM documents WHERE ocr_status = 'failed'" + "SELECT id, filename, file_size, mime_type, ocr_failure_reason FROM documents WHERE 1=1" ); // User filter @@ -585,6 +685,7 @@ async fn reset_document_ocr_status(state: &Arc, document_id: Uuid) -> ocr_text = NULL, ocr_error = NULL, ocr_failure_reason = NULL, + ocr_retry_count = NULL, ocr_confidence = NULL, ocr_word_count = NULL, ocr_processing_time_ms = NULL, diff --git a/src/services/file_service.rs b/src/services/file_service.rs index 69803de..e4e4e6e 100644 --- a/src/services/file_service.rs +++ b/src/services/file_service.rs @@ -177,6 +177,8 @@ impl FileService { ocr_status: Some("pending".to_string()), ocr_error: None, ocr_completed_at: None, + ocr_retry_count: None, + ocr_failure_reason: None, tags: Vec::new(), created_at: Utc::now(), updated_at: Utc::now(), diff --git a/src/tests/db_tests.rs b/src/tests/db_tests.rs index 123c3f7..dca299c 100644 --- a/src/tests/db_tests.rs +++ b/src/tests/db_tests.rs @@ -52,6 +52,8 @@ mod tests { original_created_at: None, original_modified_at: None, source_metadata: None, + ocr_retry_count: None, + ocr_failure_reason: None, } } diff --git a/src/tests/document_response_serialization_tests.rs b/src/tests/document_response_serialization_tests.rs index d97741f..5b69fec 100644 --- a/src/tests/document_response_serialization_tests.rs +++ b/src/tests/document_response_serialization_tests.rs @@ -103,6 +103,8 @@ mod tests { original_created_at: Some(DateTime::parse_from_rfc3339("2023-12-01T10:00:00Z").unwrap().with_timezone(&Utc)), original_modified_at: Some(DateTime::parse_from_rfc3339("2023-12-15T15:30:00Z").unwrap().with_timezone(&Utc)), source_metadata: Some(serde_json::json!({"permissions": "644", "owner": "user1"})), + ocr_retry_count: None, + ocr_failure_reason: None, }; // Convert to DocumentResponse diff --git a/src/tests/document_routes_tests.rs b/src/tests/document_routes_tests.rs index 5582a4b..3e7af69 100644 --- a/src/tests/document_routes_tests.rs +++ b/src/tests/document_routes_tests.rs @@ -63,6 +63,8 @@ mod document_routes_deletion_tests { original_created_at: None, original_modified_at: None, source_metadata: None, + ocr_retry_count: None, + ocr_failure_reason: None, } } @@ -400,6 +402,8 @@ mod document_routes_deletion_tests { original_created_at: None, original_modified_at: None, source_metadata: None, + ocr_retry_count: None, + ocr_failure_reason: None, } } diff --git a/src/tests/documents_tests.rs b/src/tests/documents_tests.rs index 2340ff9..9973974 100644 --- a/src/tests/documents_tests.rs +++ b/src/tests/documents_tests.rs @@ -29,6 +29,8 @@ fn create_test_document(user_id: Uuid) -> Document { original_created_at: None, original_modified_at: None, source_metadata: None, + ocr_retry_count: None, + ocr_failure_reason: None, } } @@ -57,6 +59,8 @@ fn create_test_document_without_ocr(user_id: Uuid) -> Document { original_created_at: None, original_modified_at: None, source_metadata: None, + ocr_retry_count: None, + ocr_failure_reason: None, } } @@ -85,6 +89,8 @@ fn create_test_document_with_ocr_error(user_id: Uuid) -> Document { original_created_at: None, original_modified_at: None, source_metadata: None, + ocr_retry_count: None, + ocr_failure_reason: None, } } @@ -1564,6 +1570,8 @@ mod deletion_error_handling_tests { original_created_at: None, original_modified_at: None, source_metadata: None, + ocr_retry_count: None, + ocr_failure_reason: None, } } diff --git a/src/tests/enhanced_search_tests.rs b/src/tests/enhanced_search_tests.rs index dc609ae..4da7afd 100644 --- a/src/tests/enhanced_search_tests.rs +++ b/src/tests/enhanced_search_tests.rs @@ -942,6 +942,8 @@ mod tests { original_created_at: None, original_modified_at: None, source_metadata: None, + ocr_retry_count: None, + ocr_failure_reason: None, }; db.create_document(document).await.unwrap(); diff --git a/src/tests/file_service_tests.rs b/src/tests/file_service_tests.rs index a1ad8b4..ba3a6b6 100644 --- a/src/tests/file_service_tests.rs +++ b/src/tests/file_service_tests.rs @@ -195,6 +195,8 @@ mod file_deletion_tests { original_created_at: None, original_modified_at: None, source_metadata: None, + ocr_retry_count: None, + ocr_failure_reason: None, }; ( @@ -333,6 +335,8 @@ mod file_deletion_tests { original_created_at: None, original_modified_at: None, source_metadata: None, + ocr_retry_count: None, + ocr_failure_reason: None, }; // Try to delete nonexistent files (should not fail) @@ -387,6 +391,8 @@ mod file_deletion_tests { original_created_at: None, original_modified_at: None, source_metadata: None, + ocr_retry_count: None, + ocr_failure_reason: None, }; // Verify files exist @@ -445,6 +451,8 @@ mod file_deletion_tests { original_created_at: None, original_modified_at: None, source_metadata: None, + ocr_retry_count: None, + ocr_failure_reason: None, }; // Verify files exist @@ -494,6 +502,8 @@ mod file_deletion_tests { original_created_at: None, original_modified_at: None, source_metadata: None, + ocr_retry_count: None, + ocr_failure_reason: None, }; // Verify file exists diff --git a/src/tests/ignored_files_tests.rs b/src/tests/ignored_files_tests.rs index 7128c25..dc93cc9 100644 --- a/src/tests/ignored_files_tests.rs +++ b/src/tests/ignored_files_tests.rs @@ -84,6 +84,8 @@ mod tests { original_created_at: None, original_modified_at: None, source_metadata: None, + ocr_retry_count: None, + ocr_failure_reason: None, }; sqlx::query("INSERT INTO documents (id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19)") diff --git a/src/utils/debug.rs b/src/utils/debug.rs new file mode 100644 index 0000000..970566e --- /dev/null +++ b/src/utils/debug.rs @@ -0,0 +1,85 @@ +use std::env; +use tracing::{debug, info, warn, error}; + +/// Check if DEBUG environment variable is set to enable verbose debug output +pub fn is_debug_enabled() -> bool { + env::var("DEBUG") + .map(|val| !val.is_empty() && val != "0" && val.to_lowercase() != "false") + .unwrap_or(false) +} + +/// Log debug message only if DEBUG environment variable is set +pub fn debug_log(message: &str) { + if is_debug_enabled() { + info!("🐛 DEBUG: {}", message); + } +} + +/// Log debug message with context only if DEBUG environment variable is set +pub fn debug_log_context(context: &str, message: &str) { + if is_debug_enabled() { + info!("🐛 DEBUG [{}]: {}", context, message); + } +} + +/// Log debug message with structured data only if DEBUG environment variable is set +pub fn debug_log_structured(context: &str, key_values: &[(&str, &dyn std::fmt::Display)]) { + if is_debug_enabled() { + let mut formatted = String::new(); + for (i, (key, value)) in key_values.iter().enumerate() { + if i > 0 { + formatted.push_str(", "); + } + formatted.push_str(&format!("{}={}", key, value)); + } + info!("🐛 DEBUG [{}]: {}", context, formatted); + } +} + +/// Log error with debug context +pub fn debug_error(context: &str, error: &dyn std::fmt::Display) { + if is_debug_enabled() { + error!("🐛 DEBUG ERROR [{}]: {}", context, error); + } else { + error!("[{}]: {}", context, error); + } +} + +/// Log warning with debug context +pub fn debug_warn(context: &str, message: &str) { + if is_debug_enabled() { + warn!("🐛 DEBUG WARN [{}]: {}", context, message); + } else { + warn!("[{}]: {}", context, message); + } +} + +/// Macro for easier debug logging with automatic context +#[macro_export] +macro_rules! debug_log { + ($msg:expr) => { + crate::utils::debug::debug_log($msg) + }; + ($context:expr, $msg:expr) => { + crate::utils::debug::debug_log_context($context, $msg) + }; + ($context:expr, $($key:expr => $value:expr),+ $(,)?) => { + crate::utils::debug::debug_log_structured($context, &[$(($key, &$value)),+]) + }; +} + +/// Macro for debug error logging +#[macro_export] +macro_rules! debug_error { + ($context:expr, $error:expr) => { + crate::utils::debug::debug_error($context, &$error) + }; +} + +/// Macro for debug warning logging +#[macro_export] +macro_rules! debug_warn { + ($context:expr, $msg:expr) => { + crate::utils::debug::debug_warn($context, $msg) + }; +} \ No newline at end of file diff --git a/src/utils/mod.rs b/src/utils/mod.rs new file mode 100644 index 0000000..477cc78 --- /dev/null +++ b/src/utils/mod.rs @@ -0,0 +1 @@ +pub mod debug; \ No newline at end of file diff --git a/tests/integration_debug_ocr_test.rs b/tests/integration_debug_ocr_test.rs index d726fbd..0ce3ad0 100644 --- a/tests/integration_debug_ocr_test.rs +++ b/tests/integration_debug_ocr_test.rs @@ -109,6 +109,13 @@ async fn debug_ocr_content() { .await .expect("Upload should work"); + println!("📤 Document 1 upload response status: {}", doc1_response.status()); + if !doc1_response.status().is_success() { + let status = doc1_response.status(); + let error_text = doc1_response.text().await.unwrap_or_else(|_| "No response body".to_string()); + panic!("Document 1 upload failed with status {}: {}", status, error_text); + } + let doc2_response = client .post(&format!("{}/api/documents", get_base_url())) .header("Authorization", format!("Bearer {}", token)) @@ -117,8 +124,15 @@ async fn debug_ocr_content() { .await .expect("Upload should work"); - let doc1: DocumentResponse = doc1_response.json().await.expect("Valid JSON"); - let doc2: DocumentResponse = doc2_response.json().await.expect("Valid JSON"); + println!("📤 Document 2 upload response status: {}", doc2_response.status()); + if !doc2_response.status().is_success() { + let status = doc2_response.status(); + let error_text = doc2_response.text().await.unwrap_or_else(|_| "No response body".to_string()); + panic!("Document 2 upload failed with status {}: {}", status, error_text); + } + + let doc1: DocumentResponse = doc1_response.json().await.expect("Valid JSON for doc1"); + let doc2: DocumentResponse = doc2_response.json().await.expect("Valid JSON for doc2"); println!("📄 Document 1: {}", doc1.id); println!("📄 Document 2: {}", doc2.id); diff --git a/tests/integration_document_upload_hash_duplicate_tests.rs b/tests/integration_document_upload_hash_duplicate_tests.rs index e387deb..80974a3 100644 --- a/tests/integration_document_upload_hash_duplicate_tests.rs +++ b/tests/integration_document_upload_hash_duplicate_tests.rs @@ -36,6 +36,8 @@ fn create_test_document(user_id: Uuid, filename: &str, file_hash: String) -> Doc ocr_status: Some("pending".to_string()), ocr_error: None, ocr_completed_at: None, + ocr_retry_count: None, + ocr_failure_reason: None, tags: Vec::new(), created_at: Utc::now(), updated_at: Utc::now(), diff --git a/tests/integration_hash_duplicate_detection_tests.rs b/tests/integration_hash_duplicate_detection_tests.rs index 9442235..1b4db3f 100644 --- a/tests/integration_hash_duplicate_detection_tests.rs +++ b/tests/integration_hash_duplicate_detection_tests.rs @@ -54,6 +54,8 @@ fn create_test_document(user_id: Uuid, filename: &str, file_hash: Option ocr_status: Some("pending".to_string()), ocr_error: None, ocr_completed_at: None, + ocr_retry_count: None, + ocr_failure_reason: None, tags: Vec::new(), created_at: Utc::now(), updated_at: Utc::now(), diff --git a/tests/integration_ignored_files_integration_tests.rs b/tests/integration_ignored_files_integration_tests.rs index 085ce53..e37c95c 100644 --- a/tests/integration_ignored_files_integration_tests.rs +++ b/tests/integration_ignored_files_integration_tests.rs @@ -351,6 +351,8 @@ async fn test_create_ignored_file_from_document() -> Result<()> { ocr_status: Some("completed".to_string()), ocr_error: None, ocr_completed_at: Some(chrono::Utc::now()), + ocr_retry_count: None, + ocr_failure_reason: None, tags: vec!["test".to_string()], created_at: chrono::Utc::now(), updated_at: chrono::Utc::now(), diff --git a/tests/integration_source_sync_hash_duplicate_tests.rs b/tests/integration_source_sync_hash_duplicate_tests.rs index 49c6666..fb9e3a3 100644 --- a/tests/integration_source_sync_hash_duplicate_tests.rs +++ b/tests/integration_source_sync_hash_duplicate_tests.rs @@ -54,6 +54,8 @@ fn create_test_document(user_id: Uuid, filename: &str, file_hash: String) -> Doc ocr_status: Some("pending".to_string()), ocr_error: None, ocr_completed_at: None, + ocr_retry_count: None, + ocr_failure_reason: None, tags: Vec::new(), created_at: Utc::now(), updated_at: Utc::now(), diff --git a/tests/integration_webdav_hash_duplicate_tests.rs b/tests/integration_webdav_hash_duplicate_tests.rs index 16b4bee..7b97e48 100644 --- a/tests/integration_webdav_hash_duplicate_tests.rs +++ b/tests/integration_webdav_hash_duplicate_tests.rs @@ -54,6 +54,8 @@ fn create_test_document(user_id: Uuid, filename: &str, file_hash: String) -> Doc ocr_status: Some("pending".to_string()), ocr_error: None, ocr_completed_at: None, + ocr_retry_count: None, + ocr_failure_reason: None, tags: Vec::new(), created_at: Utc::now(), updated_at: Utc::now(), diff --git a/tests/unit_unit_tests.rs b/tests/unit_unit_tests.rs index a7a1c17..edf7238 100644 --- a/tests/unit_unit_tests.rs +++ b/tests/unit_unit_tests.rs @@ -20,6 +20,8 @@ fn test_document_response_conversion_with_ocr() { ocr_status: Some("completed".to_string()), ocr_error: None, ocr_completed_at: Some(Utc::now()), + ocr_retry_count: None, + ocr_failure_reason: None, tags: vec!["test".to_string()], created_at: Utc::now(), updated_at: Utc::now(), @@ -57,6 +59,8 @@ fn test_document_response_conversion_without_ocr() { ocr_status: Some("pending".to_string()), ocr_error: None, ocr_completed_at: None, + ocr_retry_count: None, + ocr_failure_reason: None, tags: vec![], created_at: Utc::now(), updated_at: Utc::now(),