From 3ae542088b77804d69fe1d3967f2bb628490a844 Mon Sep 17 00:00:00 2001 From: perf3ct Date: Tue, 17 Jun 2025 02:56:59 +0000 Subject: [PATCH] feat(client/server): advanced search, along with fixing build errors --- frontend/src/App.tsx | 2 + .../AdvancedSearchPanel.tsx | 4 +- .../EnhancedSearchGuide.tsx | 11 +- frontend/src/components/Layout/AppLayout.tsx | 2 + frontend/src/pages/FailedOcrPage.tsx | 576 ++++++++++++++++++ frontend/src/services/api.ts | 10 + src/routes/documents.rs | 316 ++++++++++ 7 files changed, 916 insertions(+), 5 deletions(-) create mode 100644 frontend/src/pages/FailedOcrPage.tsx diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index ffc4678..bed1dc7 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -14,6 +14,7 @@ import DocumentDetailsPage from './pages/DocumentDetailsPage'; import SettingsPage from './pages/SettingsPage'; import SourcesPage from './pages/SourcesPage'; import WatchFolderPage from './pages/WatchFolderPage'; +import FailedOcrPage from './pages/FailedOcrPage'; function App(): JSX.Element { const { user, loading } = useAuth(); @@ -69,6 +70,7 @@ function App(): JSX.Element { } /> } /> } /> + } /> Profile Page - Coming Soon} /> diff --git a/frontend/src/components/AdvancedSearchPanel/AdvancedSearchPanel.tsx b/frontend/src/components/AdvancedSearchPanel/AdvancedSearchPanel.tsx index db7c47c..7d2608f 100644 --- a/frontend/src/components/AdvancedSearchPanel/AdvancedSearchPanel.tsx +++ b/frontend/src/components/AdvancedSearchPanel/AdvancedSearchPanel.tsx @@ -347,8 +347,8 @@ const AdvancedSearchPanel: React.FC = ({ label="Boost Recent Documents" /> - - + + )} {/* Results Display Section */} diff --git a/frontend/src/components/EnhancedSearchGuide/EnhancedSearchGuide.tsx b/frontend/src/components/EnhancedSearchGuide/EnhancedSearchGuide.tsx index b061a93..45bb3e5 100644 --- a/frontend/src/components/EnhancedSearchGuide/EnhancedSearchGuide.tsx +++ b/frontend/src/components/EnhancedSearchGuide/EnhancedSearchGuide.tsx @@ -181,6 +181,7 @@ const EnhancedSearchGuide: React.FC = ({ onExampleClic sx={{ mb: 1.5, transition: 'all 0.2s', + backgroundColor: (theme) => theme.palette.mode === 'dark' ? 'grey.800' : 'background.paper', '&:hover': { boxShadow: 2, transform: 'translateY(-2px)', @@ -200,7 +201,7 @@ const EnhancedSearchGuide: React.FC = ({ onExampleClic variant="body2" fontFamily="monospace" sx={{ - backgroundColor: 'grey.100', + backgroundColor: (theme) => theme.palette.mode === 'dark' ? 'grey.800' : 'grey.100', px: 1, py: 0.5, borderRadius: 1, @@ -273,7 +274,11 @@ const EnhancedSearchGuide: React.FC = ({ onExampleClic if (compact && !expanded) { return ( - + theme.palette.mode === 'dark' ? 'grey.900' : 'background.paper' + }}> @@ -294,7 +299,7 @@ const EnhancedSearchGuide: React.FC = ({ onExampleClic } return ( - + theme.palette.mode === 'dark' ? 'grey.900' : 'grey.50' }}> diff --git a/frontend/src/components/Layout/AppLayout.tsx b/frontend/src/components/Layout/AppLayout.tsx index ecc4760..b99ab9c 100644 --- a/frontend/src/components/Layout/AppLayout.tsx +++ b/frontend/src/components/Layout/AppLayout.tsx @@ -32,6 +32,7 @@ import { Logout as LogoutIcon, Description as DocumentIcon, Storage as StorageIcon, + Error as ErrorIcon, } from '@mui/icons-material'; import { useNavigate, useLocation } from 'react-router-dom'; import { useAuth } from '../../contexts/AuthContext'; @@ -64,6 +65,7 @@ const navigationItems: NavigationItem[] = [ { text: 'Search', icon: SearchIcon, path: '/search' }, { text: 'Sources', icon: StorageIcon, path: '/sources' }, { text: 'Watch Folder', icon: FolderIcon, path: '/watch' }, + { text: 'Failed OCR', icon: ErrorIcon, path: '/failed-ocr' }, ]; const AppLayout: React.FC = ({ children }) => { diff --git a/frontend/src/pages/FailedOcrPage.tsx b/frontend/src/pages/FailedOcrPage.tsx new file mode 100644 index 0000000..2a13e78 --- /dev/null +++ b/frontend/src/pages/FailedOcrPage.tsx @@ -0,0 +1,576 @@ +import React, { useState, useEffect } from 'react'; +import { + Box, + Typography, + Card, + CardContent, + Grid, + Button, + Chip, + Alert, + AlertTitle, + Table, + TableBody, + TableCell, + TableContainer, + TableHead, + TableRow, + Paper, + Dialog, + DialogTitle, + DialogContent, + DialogActions, + Pagination, + CircularProgress, + Tooltip, + IconButton, + Collapse, + LinearProgress, + Snackbar, +} from '@mui/material'; +import { + Refresh as RefreshIcon, + Error as ErrorIcon, + Info as InfoIcon, + ExpandMore as ExpandMoreIcon, + ExpandLess as ExpandLessIcon, + Schedule as ScheduleIcon, + Visibility as VisibilityIcon, + Download as DownloadIcon, +} from '@mui/icons-material'; +import { format } from 'date-fns'; +import { api, documentService } from '../services/api'; + +interface FailedDocument { + id: string; + filename: string; + original_filename: string; + file_size: number; + mime_type: string; + created_at: string; + updated_at: string; + tags: string[]; + ocr_status: string; + ocr_error: string; + ocr_failure_reason: string; + ocr_completed_at?: string; + retry_count: number; + last_attempt_at?: string; + can_retry: boolean; + failure_category: string; +} + +interface FailureCategory { + reason: string; + display_name: string; + count: number; +} + +interface FailedOcrResponse { + documents: FailedDocument[]; + pagination: { + total: number; + limit: number; + offset: number; + has_more: boolean; + }; + statistics: { + total_failed: number; + failure_categories: FailureCategory[]; + }; +} + +interface RetryResponse { + success: boolean; + message: string; + queue_id?: string; + estimated_wait_minutes?: number; +} + +const FailedOcrPage: React.FC = () => { + const [documents, setDocuments] = useState([]); + const [loading, setLoading] = useState(true); + const [retrying, setRetrying] = useState(null); + const [statistics, setStatistics] = useState(null); + const [pagination, setPagination] = useState({ page: 1, limit: 25 }); + const [totalPages, setTotalPages] = useState(0); + const [selectedDocument, setSelectedDocument] = useState(null); + const [detailsOpen, setDetailsOpen] = useState(false); + const [expandedRows, setExpandedRows] = useState>(new Set()); + const [snackbar, setSnackbar] = useState<{ open: boolean; message: string; severity: 'success' | 'error' }>({ + open: false, + message: '', + severity: 'success' + }); + + const fetchFailedDocuments = async () => { + try { + setLoading(true); + const offset = (pagination.page - 1) * pagination.limit; + const response = await documentService.getFailedOcrDocuments(pagination.limit, offset); + + setDocuments(response.data.documents); + setStatistics(response.data.statistics); + setTotalPages(Math.ceil(response.data.pagination.total / pagination.limit)); + } catch (error) { + console.error('Failed to fetch failed OCR documents:', error); + setSnackbar({ + open: true, + message: 'Failed to load failed OCR documents', + severity: 'error' + }); + } finally { + setLoading(false); + } + }; + + useEffect(() => { + fetchFailedDocuments(); + }, [pagination.page]); + + const handleRetryOcr = async (document: FailedDocument) => { + try { + setRetrying(document.id); + const response = await documentService.retryOcr(document.id); + + if (response.data.success) { + setSnackbar({ + open: true, + message: `OCR retry queued for "${document.filename}". Estimated wait time: ${response.data.estimated_wait_minutes || 'Unknown'} minutes.`, + severity: 'success' + }); + + // Refresh the list to update retry counts and status + await fetchFailedDocuments(); + } else { + setSnackbar({ + open: true, + message: response.data.message || 'Failed to retry OCR', + severity: 'error' + }); + } + } catch (error) { + console.error('Failed to retry OCR:', error); + setSnackbar({ + open: true, + message: 'Failed to retry OCR processing', + severity: 'error' + }); + } finally { + setRetrying(null); + } + }; + + const formatFileSize = (bytes: number): string => { + if (bytes === 0) return '0 B'; + const k = 1024; + const sizes = ['B', 'KB', 'MB', 'GB']; + const i = Math.floor(Math.log(bytes) / Math.log(k)); + return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i]; + }; + + const getFailureCategoryColor = (category: string): "error" | "warning" | "info" | "default" => { + switch (category) { + case 'PDF Font Issues': + case 'PDF Corruption': + case 'PDF Parsing Error': + return 'warning'; + case 'Timeout': + case 'Memory Limit': + return 'error'; + case 'Unknown Error': + return 'info'; + default: + return 'default'; + } + }; + + const toggleRowExpansion = (documentId: string) => { + const newExpanded = new Set(expandedRows); + if (newExpanded.has(documentId)) { + newExpanded.delete(documentId); + } else { + newExpanded.add(documentId); + } + setExpandedRows(newExpanded); + }; + + const showDocumentDetails = (document: FailedDocument) => { + setSelectedDocument(document); + setDetailsOpen(true); + }; + + if (loading && documents.length === 0) { + return ( + + + + ); + } + + return ( + + + + Failed OCR Documents + + + + + {/* Statistics Overview */} + {statistics && ( + + + + + + + Total Failed + + + {statistics.total_failed} + + + + + + + + + Failure Categories + + + {statistics.failure_categories.map((category) => ( + + ))} + + + + + + )} + + {documents.length === 0 ? ( + + Great news! + No documents have failed OCR processing. All your documents are processing successfully. + + ) : ( + <> + + OCR Failures + These documents failed OCR processing. You can retry OCR with detailed output to understand why failures occurred. + Common causes include corrupted PDFs, unsupported fonts, or memory limitations. + + + + + + + + Document + Failure Type + Retry Count + Last Failed + Actions + + + + {documents.map((document) => ( + + + + toggleRowExpansion(document.id)} + > + {expandedRows.has(document.id) ? : } + + + + + + {document.filename} + + + {formatFileSize(document.file_size)} • {document.mime_type} + + + + + + + + + {document.retry_count} attempts + + + + + {document.updated_at ? format(new Date(document.updated_at), 'MMM dd, yyyy HH:mm') : 'Unknown'} + + + + + + handleRetryOcr(document)} + disabled={retrying === document.id || !document.can_retry} + > + {retrying === document.id ? ( + + ) : ( + + )} + + + + showDocumentDetails(document)} + > + + + + + window.open(`/api/documents/${document.id}/download`, '_blank')} + > + + + + + + + + + + + + Error Details + + + + + Failure Reason: + + + {document.ocr_failure_reason || 'Not specified'} + + + + Error Message: + + + {document.ocr_error || 'No error message available'} + + + + + Last Attempt: + + + {document.last_attempt_at + ? format(new Date(document.last_attempt_at), 'PPpp') + : 'No previous attempts'} + + + + File Created: + + + {format(new Date(document.created_at), 'PPpp')} + + + + + + + + + ))} + +
+
+ + {/* Pagination */} + {totalPages > 1 && ( + + setPagination(prev => ({ ...prev, page }))} + color="primary" + /> + + )} + + )} + + {/* Document Details Dialog */} + setDetailsOpen(false)} + maxWidth="md" + fullWidth + > + + Document Details: {selectedDocument?.filename} + + + {selectedDocument && ( + + + + Original Filename: + + + {selectedDocument.original_filename} + + + + File Size: + + + {formatFileSize(selectedDocument.file_size)} + + + + MIME Type: + + + {selectedDocument.mime_type} + + + + Tags: + + + {selectedDocument.tags.length > 0 ? ( + selectedDocument.tags.map((tag) => ( + + )) + ) : ( + No tags + )} + + + + + Failure Category: + + + + + Retry Count: + + + {selectedDocument.retry_count} attempts + + + + Created: + + + {format(new Date(selectedDocument.created_at), 'PPpp')} + + + + Last Updated: + + + {format(new Date(selectedDocument.updated_at), 'PPpp')} + + + + + Full Error Message: + + + + {selectedDocument.ocr_error || 'No error message available'} + + + + + )} + + + {selectedDocument?.can_retry && ( + + )} + + + + + {/* Success/Error Snackbar */} + setSnackbar(prev => ({ ...prev, open: false }))} + > + setSnackbar(prev => ({ ...prev, open: false }))} + severity={snackbar.severity} + sx={{ width: '100%' }} + > + {snackbar.message} + + +
+ ); +}; + +export default FailedOcrPage; \ No newline at end of file diff --git a/frontend/src/services/api.ts b/frontend/src/services/api.ts index e45d142..2e2f619 100644 --- a/frontend/src/services/api.ts +++ b/frontend/src/services/api.ts @@ -186,6 +186,16 @@ export const documentService = { }) }, + retryOcr: (id: string) => { + return api.post(`/documents/${id}/retry-ocr`) + }, + + getFailedOcrDocuments: (limit = 50, offset = 0) => { + return api.get(`/documents/failed-ocr`, { + params: { limit, offset }, + }) + }, + search: (searchRequest: SearchRequest) => { return api.get('/search', { params: searchRequest, diff --git a/src/routes/documents.rs b/src/routes/documents.rs index 4216543..87c4715 100644 --- a/src/routes/documents.rs +++ b/src/routes/documents.rs @@ -9,6 +9,7 @@ use serde::Deserialize; use std::sync::Arc; use utoipa::ToSchema; use sha2::{Sha256, Digest}; +use sqlx::Row; use crate::{ auth::AuthUser, @@ -33,6 +34,8 @@ pub fn router() -> Router> { .route("/{id}/thumbnail", get(get_document_thumbnail)) .route("/{id}/ocr", get(get_document_ocr)) .route("/{id}/processed-image", get(get_processed_image)) + .route("/{id}/retry-ocr", post(retry_ocr)) + .route("/failed-ocr", get(get_failed_ocr_documents)) } #[utoipa::path( @@ -471,4 +474,317 @@ async fn get_processed_image( .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; Ok(response) +} + +#[utoipa::path( + post, + path = "/api/documents/{id}/retry-ocr", + tag = "documents", + security( + ("bearer_auth" = []) + ), + params( + ("id" = uuid::Uuid, Path, description = "Document ID") + ), + responses( + (status = 200, description = "OCR retry queued successfully", body = String), + (status = 404, description = "Document not found"), + (status = 400, description = "Document is not eligible for OCR retry"), + (status = 401, description = "Unauthorized") + ) +)] +async fn retry_ocr( + State(state): State>, + auth_user: AuthUser, + Path(document_id): Path, +) -> Result, StatusCode> { + // Check if document exists and belongs to user + let documents = state + .db + .get_documents_by_user_with_role(auth_user.user.id, auth_user.user.role, 1000, 0) + .await + .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; + + let document = documents + .into_iter() + .find(|doc| doc.id == document_id) + .ok_or(StatusCode::NOT_FOUND)?; + + // Check if document is eligible for OCR retry (failed or not processed) + let eligible = document.ocr_status.as_ref().map_or(true, |status| { + status == "failed" || status == "pending" + }); + + if !eligible { + return Ok(Json(serde_json::json!({ + "success": false, + "message": "Document is not eligible for OCR retry. Current status: {}", + "current_status": document.ocr_status + }))); + } + + // Reset document OCR fields + let reset_result = sqlx::query( + r#" + UPDATE documents + SET ocr_status = 'pending', + ocr_text = NULL, + ocr_error = NULL, + ocr_failure_reason = NULL, + ocr_confidence = NULL, + ocr_word_count = NULL, + ocr_processing_time_ms = NULL, + ocr_completed_at = NULL, + updated_at = NOW() + WHERE id = $1 + "# + ) + .bind(document_id) + .execute(state.db.get_pool()) + .await + .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; + + if reset_result.rows_affected() == 0 { + return Err(StatusCode::NOT_FOUND); + } + + // Calculate priority based on file size (higher priority for retries) + let priority = match document.file_size { + 0..=1048576 => 15, // <= 1MB: highest priority (boosted for retry) + ..=5242880 => 12, // 1-5MB: high priority + ..=10485760 => 10, // 5-10MB: medium priority + ..=52428800 => 8, // 10-50MB: low priority + _ => 6, // > 50MB: lowest priority + }; + + // Add to OCR queue with detailed logging + match state.queue_service.enqueue_document(document_id, priority, document.file_size).await { + Ok(queue_id) => { + tracing::info!( + "OCR retry queued for document {} ({}): queue_id={}, priority={}, size={}", + document_id, document.filename, queue_id, priority, document.file_size + ); + + Ok(Json(serde_json::json!({ + "success": true, + "message": "OCR retry queued successfully", + "queue_id": queue_id, + "document_id": document_id, + "priority": priority, + "estimated_wait_minutes": calculate_estimated_wait_time(priority).await + }))) + } + Err(e) => { + tracing::error!("Failed to queue OCR retry for document {}: {}", document_id, e); + Err(StatusCode::INTERNAL_SERVER_ERROR) + } + } +} + +#[utoipa::path( + get, + path = "/api/documents/failed-ocr", + tag = "documents", + security( + ("bearer_auth" = []) + ), + params( + ("limit" = Option, Query, description = "Number of documents to return (default: 50)"), + ("offset" = Option, Query, description = "Number of documents to skip (default: 0)") + ), + responses( + (status = 200, description = "List of documents with failed OCR", body = String), + (status = 401, description = "Unauthorized") + ) +)] +async fn get_failed_ocr_documents( + State(state): State>, + auth_user: AuthUser, + Query(pagination): Query, +) -> Result, StatusCode> { + let limit = pagination.limit.unwrap_or(50); + let offset = pagination.offset.unwrap_or(0); + + // Get failed OCR documents with additional failure details + let failed_docs = sqlx::query( + r#" + SELECT d.id, d.filename, d.original_filename, d.file_path, d.file_size, + d.mime_type, d.created_at, d.updated_at, d.user_id, + d.ocr_status, d.ocr_error, d.ocr_failure_reason, + d.ocr_completed_at, d.tags, + -- Count retry attempts from OCR queue + COALESCE(q.retry_count, 0) as retry_count, + q.last_attempt_at + FROM documents d + LEFT JOIN ( + SELECT document_id, + COUNT(*) as retry_count, + MAX(created_at) as last_attempt_at + FROM ocr_queue + WHERE status IN ('failed', 'completed') + GROUP BY document_id + ) q ON d.id = q.document_id + WHERE d.ocr_status = 'failed' + AND ($1 = $1 OR d.user_id = $1) -- Admin can see all, users see only their own + ORDER BY d.updated_at DESC + LIMIT $2 OFFSET $3 + "# + ) + .bind(if auth_user.user.role == crate::models::UserRole::Admin { + None + } else { + Some(auth_user.user.id) + }) + .bind(limit) + .bind(offset) + .fetch_all(state.db.get_pool()) + .await + .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; + + // Count total failed documents + let total_count: i64 = sqlx::query_scalar( + r#" + SELECT COUNT(*) + FROM documents + WHERE ocr_status = 'failed' + AND ($1 = $1 OR user_id = $1) + "# + ) + .bind(if auth_user.user.role == crate::models::UserRole::Admin { + None + } else { + Some(auth_user.user.id) + }) + .fetch_one(state.db.get_pool()) + .await + .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; + + let failed_documents: Vec = failed_docs + .into_iter() + .map(|row| { + let tags: Vec = row.get::>, _>("tags").unwrap_or_default(); + + serde_json::json!({ + "id": row.get::("id"), + "filename": row.get::("filename"), + "original_filename": row.get::("original_filename"), + "file_size": row.get::("file_size"), + "mime_type": row.get::("mime_type"), + "created_at": row.get::, _>("created_at"), + "updated_at": row.get::, _>("updated_at"), + "tags": tags, + "ocr_status": row.get::, _>("ocr_status"), + "ocr_error": row.get::, _>("ocr_error"), + "ocr_failure_reason": row.get::, _>("ocr_failure_reason"), + "ocr_completed_at": row.get::>, _>("ocr_completed_at"), + "retry_count": row.get::, _>("retry_count").unwrap_or(0), + "last_attempt_at": row.get::>, _>("last_attempt_at"), + "can_retry": true, + "failure_category": categorize_failure_reason( + row.get::, _>("ocr_failure_reason").as_deref(), + row.get::, _>("ocr_error").as_deref() + ) + }) + }) + .collect(); + + let response = serde_json::json!({ + "documents": failed_documents, + "pagination": { + "total": total_count, + "limit": limit, + "offset": offset, + "has_more": offset + limit < total_count + }, + "statistics": { + "total_failed": total_count, + "failure_categories": get_failure_statistics(&state, auth_user.user.id, auth_user.user.role.clone()).await? + } + }); + + Ok(Json(response)) +} + +async fn calculate_estimated_wait_time(priority: i32) -> i64 { + // Simple estimation based on priority - in a real implementation, + // this would check actual queue depth and processing times + match priority { + 15.. => 1, // High priority retry: ~1 minute + 10..14 => 3, // Medium priority: ~3 minutes + 5..9 => 10, // Low priority: ~10 minutes + _ => 30, // Very low priority: ~30 minutes + } +} + +fn categorize_failure_reason(failure_reason: Option<&str>, error_message: Option<&str>) -> &'static str { + match failure_reason { + Some("pdf_font_encoding") => "PDF Font Issues", + Some("pdf_corruption") => "PDF Corruption", + Some("processing_timeout") => "Timeout", + Some("memory_limit") => "Memory Limit", + Some("pdf_parsing_panic") => "PDF Parsing Error", + Some("unknown") | None => { + // Try to categorize based on error message + if let Some(error) = error_message { + let error_lower = error.to_lowercase(); + if error_lower.contains("timeout") { + "Timeout" + } else if error_lower.contains("memory") { + "Memory Limit" + } else if error_lower.contains("font") || error_lower.contains("encoding") { + "PDF Font Issues" + } else if error_lower.contains("corrupt") { + "PDF Corruption" + } else { + "Unknown Error" + } + } else { + "Unknown Error" + } + } + _ => "Other" + } +} + +async fn get_failure_statistics( + state: &Arc, + user_id: uuid::Uuid, + user_role: crate::models::UserRole +) -> Result { + let stats = sqlx::query( + r#" + SELECT + ocr_failure_reason, + COUNT(*) as count + FROM documents + WHERE ocr_status = 'failed' + AND ($1 = $1 OR user_id = $1) + GROUP BY ocr_failure_reason + ORDER BY count DESC + "# + ) + .bind(if user_role == crate::models::UserRole::Admin { + None + } else { + Some(user_id) + }) + .fetch_all(state.db.get_pool()) + .await + .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; + + let categories: Vec = stats + .into_iter() + .map(|row| { + let reason = row.get::, _>("ocr_failure_reason"); + let count = row.get::("count"); + + serde_json::json!({ + "reason": reason.clone().unwrap_or_else(|| "unknown".to_string()), + "display_name": categorize_failure_reason(reason.as_deref(), None), + "count": count + }) + }) + .collect(); + + Ok(serde_json::json!(categories)) } \ No newline at end of file