diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx
index ffc4678..bed1dc7 100644
--- a/frontend/src/App.tsx
+++ b/frontend/src/App.tsx
@@ -14,6 +14,7 @@ import DocumentDetailsPage from './pages/DocumentDetailsPage';
import SettingsPage from './pages/SettingsPage';
import SourcesPage from './pages/SourcesPage';
import WatchFolderPage from './pages/WatchFolderPage';
+import FailedOcrPage from './pages/FailedOcrPage';
function App(): JSX.Element {
const { user, loading } = useAuth();
@@ -69,6 +70,7 @@ function App(): JSX.Element {
} />
} />
} />
+ } />
Profile Page - Coming Soon} />
diff --git a/frontend/src/components/AdvancedSearchPanel/AdvancedSearchPanel.tsx b/frontend/src/components/AdvancedSearchPanel/AdvancedSearchPanel.tsx
index db7c47c..7d2608f 100644
--- a/frontend/src/components/AdvancedSearchPanel/AdvancedSearchPanel.tsx
+++ b/frontend/src/components/AdvancedSearchPanel/AdvancedSearchPanel.tsx
@@ -347,8 +347,8 @@ const AdvancedSearchPanel: React.FC = ({
label="Boost Recent Documents"
/>
-
-
+
+
)}
{/* Results Display Section */}
diff --git a/frontend/src/components/EnhancedSearchGuide/EnhancedSearchGuide.tsx b/frontend/src/components/EnhancedSearchGuide/EnhancedSearchGuide.tsx
index b061a93..45bb3e5 100644
--- a/frontend/src/components/EnhancedSearchGuide/EnhancedSearchGuide.tsx
+++ b/frontend/src/components/EnhancedSearchGuide/EnhancedSearchGuide.tsx
@@ -181,6 +181,7 @@ const EnhancedSearchGuide: React.FC = ({ onExampleClic
sx={{
mb: 1.5,
transition: 'all 0.2s',
+ backgroundColor: (theme) => theme.palette.mode === 'dark' ? 'grey.800' : 'background.paper',
'&:hover': {
boxShadow: 2,
transform: 'translateY(-2px)',
@@ -200,7 +201,7 @@ const EnhancedSearchGuide: React.FC = ({ onExampleClic
variant="body2"
fontFamily="monospace"
sx={{
- backgroundColor: 'grey.100',
+ backgroundColor: (theme) => theme.palette.mode === 'dark' ? 'grey.800' : 'grey.100',
px: 1,
py: 0.5,
borderRadius: 1,
@@ -273,7 +274,11 @@ const EnhancedSearchGuide: React.FC = ({ onExampleClic
if (compact && !expanded) {
return (
-
+ theme.palette.mode === 'dark' ? 'grey.900' : 'background.paper'
+ }}>
@@ -294,7 +299,7 @@ const EnhancedSearchGuide: React.FC = ({ onExampleClic
}
return (
-
+ theme.palette.mode === 'dark' ? 'grey.900' : 'grey.50' }}>
diff --git a/frontend/src/components/Layout/AppLayout.tsx b/frontend/src/components/Layout/AppLayout.tsx
index ecc4760..b99ab9c 100644
--- a/frontend/src/components/Layout/AppLayout.tsx
+++ b/frontend/src/components/Layout/AppLayout.tsx
@@ -32,6 +32,7 @@ import {
Logout as LogoutIcon,
Description as DocumentIcon,
Storage as StorageIcon,
+ Error as ErrorIcon,
} from '@mui/icons-material';
import { useNavigate, useLocation } from 'react-router-dom';
import { useAuth } from '../../contexts/AuthContext';
@@ -64,6 +65,7 @@ const navigationItems: NavigationItem[] = [
{ text: 'Search', icon: SearchIcon, path: '/search' },
{ text: 'Sources', icon: StorageIcon, path: '/sources' },
{ text: 'Watch Folder', icon: FolderIcon, path: '/watch' },
+ { text: 'Failed OCR', icon: ErrorIcon, path: '/failed-ocr' },
];
const AppLayout: React.FC = ({ children }) => {
diff --git a/frontend/src/pages/FailedOcrPage.tsx b/frontend/src/pages/FailedOcrPage.tsx
new file mode 100644
index 0000000..2a13e78
--- /dev/null
+++ b/frontend/src/pages/FailedOcrPage.tsx
@@ -0,0 +1,576 @@
+import React, { useState, useEffect } from 'react';
+import {
+ Box,
+ Typography,
+ Card,
+ CardContent,
+ Grid,
+ Button,
+ Chip,
+ Alert,
+ AlertTitle,
+ Table,
+ TableBody,
+ TableCell,
+ TableContainer,
+ TableHead,
+ TableRow,
+ Paper,
+ Dialog,
+ DialogTitle,
+ DialogContent,
+ DialogActions,
+ Pagination,
+ CircularProgress,
+ Tooltip,
+ IconButton,
+ Collapse,
+ LinearProgress,
+ Snackbar,
+} from '@mui/material';
+import {
+ Refresh as RefreshIcon,
+ Error as ErrorIcon,
+ Info as InfoIcon,
+ ExpandMore as ExpandMoreIcon,
+ ExpandLess as ExpandLessIcon,
+ Schedule as ScheduleIcon,
+ Visibility as VisibilityIcon,
+ Download as DownloadIcon,
+} from '@mui/icons-material';
+import { format } from 'date-fns';
+import { api, documentService } from '../services/api';
+
+/**
+ * A document whose OCR processing failed, as listed on the Failed OCR page.
+ * Date-like fields are strings that this page parses with `new Date(...)`.
+ */
+interface FailedDocument {
+ id: string;
+ filename: string;
+ original_filename: string;
+ // Size in bytes; rendered through formatFileSize.
+ file_size: number;
+ mime_type: string;
+ created_at: string;
+ // Formatted into the row's date column (falls back to 'Unknown' when empty).
+ updated_at: string;
+ tags: string[];
+ ocr_status: string;
+ // NOTE(review): the page substitutes placeholder text when ocr_error /
+ // ocr_failure_reason are falsy, so they may be empty or null at runtime — confirm.
+ ocr_error: string;
+ ocr_failure_reason: string;
+ ocr_completed_at?: string;
+ retry_count: number;
+ // When absent the page shows 'No previous attempts'.
+ last_attempt_at?: string;
+ // When false the retry button for this row is disabled.
+ can_retry: boolean;
+ failure_category: string;
+}
+
+/**
+ * One entry of `statistics.failure_categories`, rendered in the
+ * "Failure Categories" card.
+ * Presumably `count` is the number of failed documents sharing `reason` —
+ * confirm against the API.
+ */
+interface FailureCategory {
+ reason: string;
+ display_name: string;
+ count: number;
+}
+
+/**
+ * Payload shape expected from `documentService.getFailedOcrDocuments`.
+ * This page reads `documents`, `statistics` and `pagination.total`; the other
+ * pagination fields are declared but not used here.
+ */
+interface FailedOcrResponse {
+ documents: FailedDocument[];
+ pagination: {
+ // Divided by the page size to compute the number of pages.
+ total: number;
+ limit: number;
+ offset: number;
+ has_more: boolean;
+ };
+ statistics: {
+ total_failed: number;
+ failure_categories: FailureCategory[];
+ };
+}
+
+/**
+ * Payload shape expected from `documentService.retryOcr`.
+ * `success` selects the success or error snackbar, `message` is shown when the
+ * retry was refused, and `estimated_wait_minutes` is shown when it was queued.
+ */
+interface RetryResponse {
+ success: boolean;
+ message: string;
+ queue_id?: string;
+ estimated_wait_minutes?: number;
+}
+
+const FailedOcrPage: React.FC = () => {
+ // Rows of the currently loaded page of failed documents.
+ const [documents, setDocuments] = useState([]);
+ // True while a list fetch is in flight (starts true for the initial load).
+ const [loading, setLoading] = useState(true);
+ // Id of the document whose retry request is in flight, or null when idle.
+ const [retrying, setRetrying] = useState(null);
+ const [statistics, setStatistics] = useState(null);
+ // `page` is 1-based; changing it triggers a refetch via the effect below.
+ const [pagination, setPagination] = useState({ page: 1, limit: 25 });
+ const [totalPages, setTotalPages] = useState(0);
+ // Document shown in the details dialog, and whether that dialog is open.
+ const [selectedDocument, setSelectedDocument] = useState(null);
+ const [detailsOpen, setDetailsOpen] = useState(false);
+ // Ids of rows whose error-detail section is expanded.
+ // NOTE(review): the generic type arguments on these useState calls look
+ // truncated in this patch (e.g. `useState>(...)`) — confirm against the file.
+ const [expandedRows, setExpandedRows] = useState>(new Set());
+ // Transient success/error notification shown at the bottom of the page.
+ const [snackbar, setSnackbar] = useState<{ open: boolean; message: string; severity: 'success' | 'error' }>({
+ open: false,
+ message: '',
+ severity: 'success'
+ });
+
+  /**
+   * Load the current page of failed-OCR documents plus the summary statistics.
+   * Converts the 1-based page number into an offset, stores the returned rows,
+   * statistics and page count, and reports any failure through the snackbar.
+   * The loading flag is always cleared when the request settles.
+   */
+  const fetchFailedDocuments = async () => {
+    try {
+      setLoading(true);
+      const { page, limit } = pagination;
+      const { data } = await documentService.getFailedOcrDocuments(limit, (page - 1) * limit);
+
+      setDocuments(data.documents);
+      setStatistics(data.statistics);
+      setTotalPages(Math.ceil(data.pagination.total / limit));
+    } catch (err) {
+      console.error('Failed to fetch failed OCR documents:', err);
+      setSnackbar({ open: true, message: 'Failed to load failed OCR documents', severity: 'error' });
+    } finally {
+      setLoading(false);
+    }
+  };
+
+  // Fetch on mount and again whenever the selected page changes.
+  // The returned promise is deliberately not awaited; errors are handled inside the fetch.
+  useEffect(() => {
+    void fetchFailedDocuments();
+  }, [pagination.page]);
+
+  /**
+   * Ask the backend to re-queue OCR for one document and report the outcome.
+   *
+   * While the request is in flight the document id is held in `retrying`.
+   * On success the list is refetched so retry counts and status are current;
+   * a refusal (`success: false`) or a thrown error is surfaced as an error
+   * snackbar. `retrying` is always cleared afterwards.
+   *
+   * @param document - The failed document to retry.
+   */
+  const handleRetryOcr = async (document: FailedDocument) => {
+    try {
+      setRetrying(document.id);
+      const response = await documentService.retryOcr(document.id);
+
+      if (response.data.success) {
+        setSnackbar({
+          open: true,
+          // `??` rather than `||`: an estimate of 0 minutes is a real value and
+          // must not be replaced by "Unknown".
+          message: `OCR retry queued for "${document.filename}". Estimated wait time: ${response.data.estimated_wait_minutes ?? 'Unknown'} minutes.`,
+          severity: 'success'
+        });
+
+        // Refresh the list to update retry counts and status
+        await fetchFailedDocuments();
+      } else {
+        setSnackbar({
+          open: true,
+          message: response.data.message || 'Failed to retry OCR',
+          severity: 'error'
+        });
+      }
+    } catch (error) {
+      console.error('Failed to retry OCR:', error);
+      setSnackbar({
+        open: true,
+        message: 'Failed to retry OCR processing',
+        severity: 'error'
+      });
+    } finally {
+      setRetrying(null);
+    }
+  };
+
+  /**
+   * Format a byte count as a short human-readable string using 1024-based units.
+   *
+   * @param bytes - Size in bytes.
+   * @returns e.g. `'1.5 KB'`; `'0 B'` for zero, negative or non-finite input.
+   *   Values beyond the largest unit are expressed in that unit (`'1024 TB'`).
+   */
+  const formatFileSize = (bytes: number): string => {
+    // Zero, negative and non-finite values have no usable logarithm.
+    if (!Number.isFinite(bytes) || bytes <= 0) return '0 B';
+    const k = 1024;
+    const sizes = ['B', 'KB', 'MB', 'GB', 'TB'];
+    const exponent = Math.floor(Math.log(bytes) / Math.log(k));
+    // Clamp so sub-byte fractions use 'B' and huge values use the last unit
+    // instead of indexing outside `sizes` and printing "undefined".
+    const i = Math.min(sizes.length - 1, Math.max(0, exponent));
+    return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
+  };
+
+  /**
+   * Map a failure category label to the colour name used for its chip.
+   * PDF-related failures are warnings, resource exhaustion is an error,
+   * 'Unknown Error' is informational, and anything else gets the default colour.
+   */
+  const getFailureCategoryColor = (category: string): "error" | "warning" | "info" | "default" => {
+    const pdfProblems = ['PDF Font Issues', 'PDF Corruption', 'PDF Parsing Error'];
+    if (pdfProblems.includes(category)) {
+      return 'warning';
+    }
+
+    const resourceProblems = ['Timeout', 'Memory Limit'];
+    if (resourceProblems.includes(category)) {
+      return 'error';
+    }
+
+    return category === 'Unknown Error' ? 'info' : 'default';
+  };
+
+  /** Flip the expanded/collapsed state of the detail section for one row. */
+  const toggleRowExpansion = (documentId: string) => {
+    // Work on a copy so React sees a new Set instance.
+    const next = new Set(expandedRows);
+    // Set.delete returns false when the id was not present, i.e. the row was collapsed.
+    const wasExpanded = next.delete(documentId);
+    if (!wasExpanded) {
+      next.add(documentId);
+    }
+    setExpandedRows(next);
+  };
+
+  /** Select a document and open the details dialog for it. */
+  const showDocumentDetails = (failedDocument: FailedDocument) => {
+    setDetailsOpen(true);
+    setSelectedDocument(failedDocument);
+  };
+
+ if (loading && documents.length === 0) {
+ return (
+
+
+
+ );
+ }
+
+ return (
+
+
+
+ Failed OCR Documents
+
+ }
+ onClick={fetchFailedDocuments}
+ disabled={loading}
+ >
+ Refresh
+
+
+
+ {/* Statistics Overview */}
+ {statistics && (
+
+
+
+
+
+
+ Total Failed
+
+
+ {statistics.total_failed}
+
+
+
+
+
+
+
+
+ Failure Categories
+
+
+ {statistics.failure_categories.map((category) => (
+
+ ))}
+
+
+
+
+
+ )}
+
+ {documents.length === 0 ? (
+
+ Great news!
+ No documents have failed OCR processing. All your documents are processing successfully.
+
+ ) : (
+ <>
+
+ OCR Failures
+ These documents failed OCR processing. You can retry OCR with detailed output to understand why failures occurred.
+ Common causes include corrupted PDFs, unsupported fonts, or memory limitations.
+
+
+
+
+
+
+
+ Document
+ Failure Type
+ Retry Count
+ Last Failed
+ Actions
+
+
+
+ {documents.map((document) => (
+
+
+
+ toggleRowExpansion(document.id)}
+ >
+ {expandedRows.has(document.id) ? : }
+
+
+
+
+
+ {document.filename}
+
+
+ {formatFileSize(document.file_size)} • {document.mime_type}
+
+
+
+
+
+
+
+
+ {document.retry_count} attempts
+
+
+
+
+ {document.updated_at ? format(new Date(document.updated_at), 'MMM dd, yyyy HH:mm') : 'Unknown'}
+
+
+
+
+
+ handleRetryOcr(document)}
+ disabled={retrying === document.id || !document.can_retry}
+ >
+ {retrying === document.id ? (
+
+ ) : (
+
+ )}
+
+
+
+ showDocumentDetails(document)}
+ >
+
+
+
+
+ window.open(`/api/documents/${document.id}/download`, '_blank')}
+ >
+
+
+
+
+
+
+
+
+
+
+
+ Error Details
+
+
+
+
+ Failure Reason:
+
+
+ {document.ocr_failure_reason || 'Not specified'}
+
+
+
+ Error Message:
+
+
+ {document.ocr_error || 'No error message available'}
+
+
+
+
+ Last Attempt:
+
+
+ {document.last_attempt_at
+ ? format(new Date(document.last_attempt_at), 'PPpp')
+ : 'No previous attempts'}
+
+
+
+ File Created:
+
+
+ {format(new Date(document.created_at), 'PPpp')}
+
+
+
+
+
+
+
+
+ ))}
+
+
+
+
+ {/* Pagination */}
+ {totalPages > 1 && (
+
+ setPagination(prev => ({ ...prev, page }))}
+ color="primary"
+ />
+
+ )}
+ >
+ )}
+
+ {/* Document Details Dialog */}
+
+
+ {/* Success/Error Snackbar */}
+ setSnackbar(prev => ({ ...prev, open: false }))}
+ >
+ setSnackbar(prev => ({ ...prev, open: false }))}
+ severity={snackbar.severity}
+ sx={{ width: '100%' }}
+ >
+ {snackbar.message}
+
+
+
+ );
+};
+
+export default FailedOcrPage;
\ No newline at end of file
diff --git a/frontend/src/services/api.ts b/frontend/src/services/api.ts
index e45d142..2e2f619 100644
--- a/frontend/src/services/api.ts
+++ b/frontend/src/services/api.ts
@@ -186,6 +186,16 @@ export const documentService = {
})
},
+  // POSTs to `/documents/{id}/retry-ocr` through the shared `api` client and
+  // returns the pending request.
+  retryOcr: (id: string) => api.post(`/documents/${id}/retry-ocr`),
+
+  // GETs `/documents/failed-ocr`, passing `limit` and `offset` as query
+  // parameters (defaults: 50 and 0).
+  getFailedOcrDocuments: (limit = 50, offset = 0) =>
+    api.get('/documents/failed-ocr', { params: { limit, offset } }),
+
search: (searchRequest: SearchRequest) => {
return api.get('/search', {
params: searchRequest,
diff --git a/src/routes/documents.rs b/src/routes/documents.rs
index 4216543..87c4715 100644
--- a/src/routes/documents.rs
+++ b/src/routes/documents.rs
@@ -9,6 +9,7 @@ use serde::Deserialize;
use std::sync::Arc;
use utoipa::ToSchema;
use sha2::{Sha256, Digest};
+use sqlx::Row;
use crate::{
auth::AuthUser,
@@ -33,6 +34,8 @@ pub fn router() -> Router> {
.route("/{id}/thumbnail", get(get_document_thumbnail))
.route("/{id}/ocr", get(get_document_ocr))
.route("/{id}/processed-image", get(get_processed_image))
+ .route("/{id}/retry-ocr", post(retry_ocr))
+ .route("/failed-ocr", get(get_failed_ocr_documents))
}
#[utoipa::path(
@@ -471,4 +474,317 @@ async fn get_processed_image(
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
Ok(response)
+}
+
+#[utoipa::path(
+    post,
+    path = "/api/documents/{id}/retry-ocr",
+    tag = "documents",
+    security(
+        ("bearer_auth" = [])
+    ),
+    params(
+        ("id" = uuid::Uuid, Path, description = "Document ID")
+    ),
+    responses(
+        (status = 200, description = "OCR retry queued successfully", body = String),
+        (status = 404, description = "Document not found"),
+        (status = 400, description = "Document is not eligible for OCR retry"),
+        (status = 401, description = "Unauthorized")
+    )
+)]
+// Re-queue OCR for a single document.
+//
+// Flow: find the document among those visible to the caller, check that its
+// OCR status allows a retry, clear the stored OCR result columns, then enqueue
+// the document with a size-based priority.
+//
+// Returns a JSON object with `success` and `message`; on success it also
+// carries `queue_id`, `document_id`, `priority` and `estimated_wait_minutes`.
+// An ineligible document is answered with HTTP 200 and `success: false`
+// (NOTE(review): the OpenAPI annotation above advertises 400 for this case).
+// Errors: 404 when the document is not found, 500 on database/queue failure.
+async fn retry_ocr(
+    State(state): State>,
+    auth_user: AuthUser,
+    Path(document_id): Path,
+) -> Result, StatusCode> {
+    // Check if document exists and belongs to user
+    // NOTE(review): this scans one page of up to 1000 documents (arguments are
+    // presumably limit/offset), so a document outside that window is reported
+    // as 404 — consider a direct lookup by id.
+    let documents = state
+        .db
+        .get_documents_by_user_with_role(auth_user.user.id, auth_user.user.role, 1000, 0)
+        .await
+        .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
+
+    let document = documents
+        .into_iter()
+        .find(|doc| doc.id == document_id)
+        .ok_or(StatusCode::NOT_FOUND)?;
+
+    // Check if document is eligible for OCR retry (failed or not processed)
+    let eligible = document.ocr_status.as_ref().map_or(true, |status| {
+        status == "failed" || status == "pending"
+    });
+
+    if !eligible {
+        // Interpolate the status into the message; a bare "{}" inside json!
+        // is sent to the client verbatim.
+        let current_status = document.ocr_status.as_deref().unwrap_or("unknown");
+        return Ok(Json(serde_json::json!({
+            "success": false,
+            "message": format!(
+                "Document is not eligible for OCR retry. Current status: {}",
+                current_status
+            ),
+            "current_status": document.ocr_status
+        })));
+    }
+
+    // Reset document OCR fields
+    // NOTE(review): the reset and the enqueue below are not atomic; if the
+    // enqueue fails the document stays 'pending' without a queue entry.
+    let reset_result = sqlx::query(
+        r#"
+        UPDATE documents
+        SET ocr_status = 'pending',
+            ocr_text = NULL,
+            ocr_error = NULL,
+            ocr_failure_reason = NULL,
+            ocr_confidence = NULL,
+            ocr_word_count = NULL,
+            ocr_processing_time_ms = NULL,
+            ocr_completed_at = NULL,
+            updated_at = NOW()
+        WHERE id = $1
+        "#
+    )
+    .bind(document_id)
+    .execute(state.db.get_pool())
+    .await
+    .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
+
+    // The row vanished between the lookup and the update.
+    if reset_result.rows_affected() == 0 {
+        return Err(StatusCode::NOT_FOUND);
+    }
+
+    // Calculate priority based on file size (higher priority for retries)
+    let priority = match document.file_size {
+        0..=1048576 => 15,   // <= 1MB: highest priority (boosted for retry)
+        ..=5242880 => 12,    // 1-5MB: high priority
+        ..=10485760 => 10,   // 5-10MB: medium priority
+        ..=52428800 => 8,    // 10-50MB: low priority
+        _ => 6,              // > 50MB: lowest priority
+    };
+
+    // Add to OCR queue with detailed logging
+    match state.queue_service.enqueue_document(document_id, priority, document.file_size).await {
+        Ok(queue_id) => {
+            tracing::info!(
+                "OCR retry queued for document {} ({}): queue_id={}, priority={}, size={}",
+                document_id, document.filename, queue_id, priority, document.file_size
+            );
+
+            Ok(Json(serde_json::json!({
+                "success": true,
+                "message": "OCR retry queued successfully",
+                "queue_id": queue_id,
+                "document_id": document_id,
+                "priority": priority,
+                "estimated_wait_minutes": calculate_estimated_wait_time(priority).await
+            })))
+        }
+        Err(e) => {
+            tracing::error!("Failed to queue OCR retry for document {}: {}", document_id, e);
+            Err(StatusCode::INTERNAL_SERVER_ERROR)
+        }
+    }
+}
+
+#[utoipa::path(
+ get,
+ path = "/api/documents/failed-ocr",
+ tag = "documents",
+ security(
+ ("bearer_auth" = [])
+ ),
+ params(
+ ("limit" = Option, Query, description = "Number of documents to return (default: 50)"),
+ ("offset" = Option, Query, description = "Number of documents to skip (default: 0)")
+ ),
+ responses(
+ (status = 200, description = "List of documents with failed OCR", body = String),
+ (status = 401, description = "Unauthorized")
+ )
+)]
+async fn get_failed_ocr_documents(
+ State(state): State>,
+ auth_user: AuthUser,
+ Query(pagination): Query,
+) -> Result, StatusCode> {
+ let limit = pagination.limit.unwrap_or(50);
+ let offset = pagination.offset.unwrap_or(0);
+
+ // Get failed OCR documents with additional failure details
+ let failed_docs = sqlx::query(
+ r#"
+ SELECT d.id, d.filename, d.original_filename, d.file_path, d.file_size,
+ d.mime_type, d.created_at, d.updated_at, d.user_id,
+ d.ocr_status, d.ocr_error, d.ocr_failure_reason,
+ d.ocr_completed_at, d.tags,
+ -- Count retry attempts from OCR queue
+ COALESCE(q.retry_count, 0) as retry_count,
+ q.last_attempt_at
+ FROM documents d
+ LEFT JOIN (
+ SELECT document_id,
+ COUNT(*) as retry_count,
+ MAX(created_at) as last_attempt_at
+ FROM ocr_queue
+ WHERE status IN ('failed', 'completed')
+ GROUP BY document_id
+ ) q ON d.id = q.document_id
+ WHERE d.ocr_status = 'failed'
+ AND ($1 = $1 OR d.user_id = $1) -- Admin can see all, users see only their own
+ ORDER BY d.updated_at DESC
+ LIMIT $2 OFFSET $3
+ "#
+ )
+ .bind(if auth_user.user.role == crate::models::UserRole::Admin {
+ None
+ } else {
+ Some(auth_user.user.id)
+ })
+ .bind(limit)
+ .bind(offset)
+ .fetch_all(state.db.get_pool())
+ .await
+ .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
+
+ // Count total failed documents
+ let total_count: i64 = sqlx::query_scalar(
+ r#"
+ SELECT COUNT(*)
+ FROM documents
+ WHERE ocr_status = 'failed'
+ AND ($1 = $1 OR user_id = $1)
+ "#
+ )
+ .bind(if auth_user.user.role == crate::models::UserRole::Admin {
+ None
+ } else {
+ Some(auth_user.user.id)
+ })
+ .fetch_one(state.db.get_pool())
+ .await
+ .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
+
+ let failed_documents: Vec = failed_docs
+ .into_iter()
+ .map(|row| {
+ let tags: Vec = row.get::