/**
 * Document management page (default export FailedOcrPage).
 *
 * Presents five tabs: OCR failures with per-document and bulk retry, duplicate
 * file groups, a low-confidence deletion tool with preview, a failed-OCR
 * deletion tool with preview, and a filterable list of failed documents.
 *
 * NOTE(review): in this copy of the file the JSX element tags and several
 * generic type arguments appear to be missing; confirm against the repository
 * version before relying on the render section.
 */
import React, { useState, useEffect } from 'react'; import { useNavigate } from 'react-router-dom'; import { Box, Typography, Card, CardContent, Button, Chip, Alert, AlertTitle, Table, TableBody, TableCell, TableContainer, TableHead, TableRow, Paper, Dialog, DialogTitle, DialogContent, DialogActions, Pagination, CircularProgress, Tooltip, IconButton, Collapse, LinearProgress, Snackbar, Tabs, Tab, TextField, useTheme, Divider, } from '@mui/material'; import Grid from '@mui/material/GridLegacy'; import { Refresh as RefreshIcon, Error as ErrorIcon, Info as InfoIcon, ExpandMore as ExpandMoreIcon, ExpandLess as ExpandLessIcon, Schedule as ScheduleIcon, Visibility as VisibilityIcon, Download as DownloadIcon, FileCopy as FileCopyIcon, Delete as DeleteIcon, FindInPage as FindInPageIcon, OpenInNew as OpenInNewIcon, Warning as WarningIcon, } from '@mui/icons-material'; import { format } from 'date-fns'; import { api, documentService, queueService } from '../services/api'; import DocumentViewer from '../components/DocumentViewer'; /** One document whose OCR failed; the element type of FailedOcrResponse.documents and the argument of the retry and details handlers. The timestamp fields are strings that the render output passes to new Date. */ interface FailedDocument { id: string; filename: string; original_filename: string; file_size: number; mime_type: string; created_at: string; updated_at: string; tags: string[]; ocr_status: string; ocr_error: string; ocr_failure_reason: string; ocr_completed_at?: string; retry_count: number; last_attempt_at?: string; can_retry: boolean; failure_category: string; } /** A failure reason with its display label and a count. Not referenced by any visible code in this file. */ interface FailureCategory { reason: string; display_name: string; count: number; } /** Response shape for the failed-OCR listing: one page of documents, pagination totals and aggregate statistics. NOTE(review): both Record members carry no type arguments here, presumably lost in extraction; confirm the key and value types. */ interface FailedOcrResponse { documents: FailedDocument[]; pagination: { total: number; limit: number; offset: number; total_pages: number; }; statistics: { total_failed: number; by_reason: Record; by_stage: Record; }; } /** Result shape of a single retry request: success flag, message, and optional queue id and wait estimate. */ interface RetryResponse { success: boolean; message: string; queue_id?: string; estimated_wait_minutes?: number; } /** One file inside a duplicate group. */ interface DuplicateDocument { id: string; filename: string; original_filename: string; file_size: number; mime_type: string; created_at: string; user_id: string; } /** A set of documents sharing one file_hash, with the first and last upload timestamps of the group. */ interface 
DuplicateGroup { file_hash: string; duplicate_count: number; first_uploaded: string; last_uploaded: string; documents: DuplicateDocument[]; } /** Response shape for the duplicates listing: groups, pagination and the total group count. */ interface DuplicatesResponse { duplicates: DuplicateGroup[]; pagination: { total: number; limit: number; offset: number; has_more: boolean; }; statistics: { total_duplicate_groups: number; }; } /** Page component. It takes no props, keeps all state locally and talks to the backend through documentService and queueService. */ const FailedOcrPage: React.FC = () => { const theme = useTheme(); const navigate = useNavigate(); /** Index of the active tab: 0 OCR failures, 1 duplicates, 2 low-confidence tool, 3 failed-OCR deletion tool, 4 failed-documents list (see refreshCurrentTab and the render sections). */ const [currentTab, setCurrentTab] = useState(0); /** Data for tab 0 (documents), tab 1 (duplicates) and tab 4 (failedDocuments). */ const [documents, setDocuments] = useState([]); const [duplicates, setDuplicates] = useState([]); const [failedDocuments, setFailedDocuments] = useState([]); const [loading, setLoading] = useState(true); const [duplicatesLoading, setDuplicatesLoading] = useState(false); const [failedDocumentsLoading, setFailedDocumentsLoading] = useState(false); const [failedDocumentsFilters, setFailedDocumentsFilters] = useState<{ stage?: string; reason?: string }>({}); const [selectedFailedDocument, setSelectedFailedDocument] = useState(null); /** retrying holds the id of the document whose retry request is in flight, or null; retryingAll covers the bulk requeue. */ const [retrying, setRetrying] = useState(null); const [retryingAll, setRetryingAll] = useState(false); const [statistics, setStatistics] = useState(null); const [duplicateStatistics, setDuplicateStatistics] = useState(null); /** One-based page number and page size per list; each fetcher derives its offset as (page - 1) * limit. */ const [pagination, setPagination] = useState({ page: 1, limit: 25 }); const [duplicatesPagination, setDuplicatesPagination] = useState({ page: 1, limit: 25 }); const [failedDocumentsPagination, setFailedDocumentsPagination] = useState({ page: 1, limit: 25 }); const [totalPages, setTotalPages] = useState(0); const [duplicatesTotalPages, setDuplicatesTotalPages] = useState(0); const [failedDocumentsTotalPages, setFailedDocumentsTotalPages] = useState(0); const [selectedDocument, setSelectedDocument] = useState(null); const [detailsOpen, setDetailsOpen] = useState(false); /** Sets of expanded row ids and expanded duplicate group hashes. NOTE(review): the type argument looks truncated (useState followed directly by a closing angle bracket); presumably a set of strings, since string ids and hashes are toggled into them; confirm. */ const [expandedRows, setExpandedRows] = useState>(new Set()); const [expandedDuplicateGroups, setExpandedDuplicateGroups] = useState>(new Set()); /** Single shared snackbar used by every handler for success, info, warning and error feedback. */ const [snackbar, setSnackbar] = 
useState<{ open: boolean; message: string; severity: 'success' | 'error' | 'info' | 'warning' }>({ open: false, message: '', severity: 'success' }); // Low confidence documents state const [confidenceThreshold, setConfidenceThreshold] = useState(30); const [lowConfidenceLoading, setLowConfidenceLoading] = useState(false); const [previewData, setPreviewData] = useState(null); const [confirmDeleteOpen, setConfirmDeleteOpen] = useState(false); // Failed documents deletion state const [failedDocsLoading, setFailedDocsLoading] = useState(false); const [failedPreviewData, setFailedPreviewData] = useState(null); const [confirmDeleteFailedOpen, setConfirmDeleteFailedOpen] = useState(false); /** Loads the current page of OCR failures through documentService.getFailedOcrDocuments(limit, offset), stores the documents and statistics, and derives totalPages as ceil(total / limit). On error it logs and shows an error snackbar; loading is cleared in every case. */ const fetchFailedDocuments = async () => { try { setLoading(true); const offset = (pagination.page - 1) * pagination.limit; const response = await documentService.getFailedOcrDocuments(pagination.limit, offset); if (response?.data) { setDocuments(response.data.documents || []); setStatistics(response.data.statistics || null); if (response.data.pagination) { setTotalPages(Math.ceil(response.data.pagination.total / pagination.limit)); } } } catch (error) { console.error('Failed to fetch failed OCR documents:', error); setSnackbar({ open: true, message: 'Failed to load failed OCR documents', severity: 'error' }); } finally { setLoading(false); } }; /** Loads the current page of duplicate groups through documentService.getDuplicates(limit, offset), with the same store, page-count and error-handling pattern as fetchFailedDocuments. */ const fetchDuplicates = async () => { try { setDuplicatesLoading(true); const offset = (duplicatesPagination.page - 1) * duplicatesPagination.limit; const response = await documentService.getDuplicates(duplicatesPagination.limit, offset); if (response?.data) { setDuplicates(response.data.duplicates || []); setDuplicateStatistics(response.data.statistics || null); if (response.data.pagination) { setDuplicatesTotalPages(Math.ceil(response.data.pagination.total / duplicatesPagination.limit)); } } } catch (error) { console.error('Failed to fetch duplicates:', error); setSnackbar({ open: true, message: 'Failed to load duplicate documents', severity: 
'error' }); } finally { setDuplicatesLoading(false); } }; /** Fetches the OCR failure list on mount and again whenever pagination.page changes. */ useEffect(() => { fetchFailedDocuments(); }, [pagination.page]); /** While tab 1 is active this loads duplicates, while tab 4 is active it loads the failed-documents list; it reruns when the tab, either page number or the filters change. */ useEffect(() => { if (currentTab === 1) { fetchDuplicates(); } else if (currentTab === 4) { fetchFailedDocumentsList(); } }, [currentTab, duplicatesPagination.page, failedDocumentsPagination.page, failedDocumentsFilters]); /** Loads one page of failed documents through documentService.getFailedDocuments(limit, offset, stage, reason) using the current filters. NOTE(review): the request is already paged by limit and offset, yet the table in the render output slices failedDocuments by page a second time; on any page after the first that slice would presumably be empty. Confirm and drop one of the two. */ const fetchFailedDocumentsList = async () => { try { setFailedDocumentsLoading(true); const offset = (failedDocumentsPagination.page - 1) * failedDocumentsPagination.limit; const response = await documentService.getFailedDocuments( failedDocumentsPagination.limit, offset, failedDocumentsFilters.stage, failedDocumentsFilters.reason ); if (response?.data) { setFailedDocuments(response.data.documents || []); if (response.data.pagination) { setFailedDocumentsTotalPages(Math.ceil(response.data.pagination.total / failedDocumentsPagination.limit)); } } } catch (error) { console.error('Failed to fetch failed documents:', error); setSnackbar({ open: true, message: 'Failed to load failed documents', severity: 'error' }); } finally { setFailedDocumentsLoading(false); } }; /** Maps a failure reason code to a colour name. Low confidence, timeout, memory limit, PDF parsing, corruption and access or permission reasons give error; duplicate content, unsupported format and oversize files give warning; anything else gives default. The info member of the return type is never produced. */ const getFailureReasonColor = (reason: string): "error" | "warning" | "info" | "default" => { switch (reason) { case 'low_ocr_confidence': case 'ocr_timeout': case 'ocr_memory_limit': case 'pdf_parsing_error': return 'error'; case 'duplicate_content': case 'unsupported_format': case 'file_too_large': return 'warning'; case 'file_corrupted': case 'access_denied': case 'permission_denied': return 'error'; default: return 'default'; } }; /** Requests an OCR retry for one document. On success it shows the estimated wait and refetches the failure list; otherwise it shows the returned message or a generic error. retrying is set to the document id for the duration of the call. NOTE(review): the || fallback also prints Unknown when the estimate is 0. */ const handleRetryOcr = async (document: FailedDocument) => { try { setRetrying(document.id); const response = await documentService.retryOcr(document.id); if (response.data.success) { setSnackbar({ open: true, message: `OCR retry queued for "${document.filename}". 
Estimated wait time: ${response.data.estimated_wait_minutes || 'Unknown'} minutes.`, severity: 'success' }); // Refresh the list to update retry counts and status await fetchFailedDocuments(); } else { setSnackbar({ open: true, message: response.data.message || 'Failed to retry OCR', severity: 'error' }); } } catch (error) { console.error('Failed to retry OCR:', error); setSnackbar({ open: true, message: 'Failed to retry OCR processing', severity: 'error' }); } finally { setRetrying(null); } }; /** Requeues every failed document through queueService.requeueFailed(). A positive requeued_count shows a success snackbar and refetches the failure list; a zero count shows an info snackbar; a thrown error is logged and reported. */ const handleRetryAllFailed = async () => { try { setRetryingAll(true); const response = await queueService.requeueFailed(); if (response.data.requeued_count > 0) { setSnackbar({ open: true, message: `Successfully queued ${response.data.requeued_count} failed documents for OCR retry. Check the queue stats for progress.`, severity: 'success' }); // Refresh the list to update status await fetchFailedDocuments(); } else { setSnackbar({ open: true, message: 'No failed documents found to retry', severity: 'info' }); } } catch (error) { console.error('Failed to retry all failed OCR:', error); setSnackbar({ open: true, message: 'Failed to retry all failed OCR documents', severity: 'error' }); } finally { setRetryingAll(false); } }; /** Formats a byte count with base-1024 units (B, KB, MB, GB), rounded to at most two decimals; 0 returns the literal 0 B. NOTE(review): sizes has four entries, so inputs of 1024^4 bytes or more, positive inputs below 1, and negative inputs index outside the array and the unit prints as undefined. */ const formatFileSize = (bytes: number): string => { if (bytes === 0) return '0 B'; const k = 1024; const sizes = ['B', 'KB', 'MB', 'GB']; const i = Math.floor(Math.log(bytes) / Math.log(k)); return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i]; }; /** Maps a human-readable failure category to a colour name. The three PDF categories and Low OCR Confidence give warning; Timeout and Memory Limit give error; Unknown Error gives info; anything else gives default. */ const getFailureCategoryColor = (category: string): "error" | "warning" | "info" | "default" => { switch (category) { case 'PDF Font Issues': case 'PDF Corruption': case 'PDF Parsing Error': return 'warning'; case 'Timeout': case 'Memory Limit': return 'error'; case 'Low OCR Confidence': return 'warning'; case 'Unknown Error': return 'info'; default: return 'default'; } }; /** Toggles one document id in expandedRows by copying the set, adding or removing the id, and storing the copy. */ const toggleRowExpansion = (documentId: string) => { const newExpanded = new Set(expandedRows); if 
(newExpanded.has(documentId)) { newExpanded.delete(documentId); } else { newExpanded.add(documentId); } setExpandedRows(newExpanded); }; /** Selects a document and opens the details dialog. */ const showDocumentDetails = (document: FailedDocument) => { setSelectedDocument(document); setDetailsOpen(true); }; /** Toggles one group hash in expandedDuplicateGroups, using the same copy-and-store approach as toggleRowExpansion. */ const toggleDuplicateGroupExpansion = (groupHash: string) => { const newExpanded = new Set(expandedDuplicateGroups); if (newExpanded.has(groupHash)) { newExpanded.delete(groupHash); } else { newExpanded.add(groupHash); } setExpandedDuplicateGroups(newExpanded); }; /** Tab change callback; the event argument is unused and only the new index is stored. */ const handleTabChange = (event: React.SyntheticEvent, newValue: number) => { setCurrentTab(newValue); }; /** Reruns the loader or preview that belongs to the active tab index. */ const refreshCurrentTab = () => { if (currentTab === 0) { fetchFailedDocuments(); } else if (currentTab === 1) { fetchDuplicates(); } else if (currentTab === 2) { handlePreviewLowConfidence(); } else if (currentTab === 3) { handlePreviewFailedDocuments(); } else if (currentTab === 4) { fetchFailedDocumentsList(); } }; // Low confidence document handlers /** Calls documentService.deleteLowConfidence(confidenceThreshold, true), stores the response data as previewData and shows its message. The second argument is presumably a preview or dry-run flag; verify against the service. */ const handlePreviewLowConfidence = async () => { try { setLowConfidenceLoading(true); const response = await documentService.deleteLowConfidence(confidenceThreshold, true); setPreviewData(response.data); setSnackbar({ open: true, message: response.data.message, severity: 'info' }); } catch (error) { setSnackbar({ open: true, message: 'Failed to preview low confidence documents', severity: 'error' }); } finally { setLowConfidenceLoading(false); } }; /** Refuses to run without a preview that matched at least one document. Otherwise it calls deleteLowConfidence with false, reports the result, clears the preview, closes the confirmation dialog and refetches the failure list when tab 0 is active. */ const handleDeleteLowConfidence = async () => { if (!previewData || previewData.matched_count === 0) { setSnackbar({ open: true, message: 'No documents to delete', severity: 'warning' }); return; } try { setLowConfidenceLoading(true); const response = await documentService.deleteLowConfidence(confidenceThreshold, false); setSnackbar({ open: true, message: response.data.message, severity: 'success' }); setPreviewData(null); setConfirmDeleteOpen(false); // Refresh other tabs if they have data affected if (currentTab === 0) { fetchFailedDocuments(); } } catch 
(error) { setSnackbar({ open: true, message: 'Failed to delete low confidence documents', severity: 'error' }); } finally { setLowConfidenceLoading(false); } }; // Failed documents handlers /** Calls documentService.deleteFailedOcr(true) and stores the response data as failedPreviewData; the argument is presumably a preview flag, verify against the service. Only failures raise a snackbar. */ const handlePreviewFailedDocuments = async () => { try { setFailedDocsLoading(true); const response = await documentService.deleteFailedOcr(true); setFailedPreviewData(response.data); } catch (error) { setSnackbar({ open: true, message: 'Failed to preview failed documents', severity: 'error' }); } finally { setFailedDocsLoading(false); } }; /** Calls documentService.deleteFailedOcr(false), reports the result, clears the preview, closes the confirmation dialog and refetches the failure list when tab 0 is active. Unlike handleDeleteLowConfidence it does not require a prior preview. */ const handleDeleteFailedDocuments = async () => { try { setFailedDocsLoading(true); const response = await documentService.deleteFailedOcr(false); setSnackbar({ open: true, message: response.data.message, severity: 'success' }); setFailedPreviewData(null); setConfirmDeleteFailedOpen(false); // Refresh failed OCR tab if currently viewing it if (currentTab === 0) { fetchFailedDocuments(); } } catch (error) { setSnackbar({ open: true, message: 'Failed to delete failed documents', severity: 'error' }); } finally { setFailedDocsLoading(false); } }; /** Render section: an early return while the first load is pending and no documents are held, then the tabbed page. NOTE(review): the element tags appear to be missing from everything below in this copy; the remaining text and expressions are left untouched. */ if (loading && (!documents || documents.length === 0)) { return ( ); } return ( Document Management } label={`Failed Documents${statistics ? ` (${statistics.total_failed})` : ''}`} iconPosition="start" /> } label={`Duplicate Files${duplicateStatistics ? ` (${duplicateStatistics.total_duplicate_groups})` : ''}`} iconPosition="start" /> } label={`Low Quality Manager${previewData ? ` (${previewData.matched_count})` : ''}`} iconPosition="start" /> } label="Bulk Cleanup" iconPosition="start" /> } label="Failed Documents" iconPosition="start" /> {/* Failed OCR Tab Content */} {currentTab === 0 && ( <> {/* Statistics Overview */} {statistics && ( Total Failed {statistics.total_failed} Failure Categories {statistics?.by_reason ? Object.entries(statistics.by_reason).map(([reason, count]) => ( )) : ( No failure data available )} )} {(!documents || documents.length === 0) ? ( Great news! 
No documents have failed OCR processing. All your documents are processing successfully. ) : ( <> OCR Failures These documents failed OCR processing. You can retry OCR with detailed output to understand why failures occurred. Common causes include corrupted PDFs, unsupported fonts, or memory limitations. Document Failure Type Retry Count Last Failed Actions {(documents || []).map((document) => ( toggleRowExpansion(document.id)} > {expandedRows.has(document.id) ? : } {document.filename} {formatFileSize(document.file_size)} • {document.mime_type} {document.retry_count} attempts {document.updated_at ? format(new Date(document.updated_at), 'MMM dd, yyyy HH:mm') : 'Unknown'} handleRetryOcr(document)} disabled={retrying === document.id || !document.can_retry} > {retrying === document.id ? ( ) : ( )} showDocumentDetails(document)} > { try { await documentService.downloadFile(document.id, document.original_filename || document.filename); } catch (error) { console.error('Download failed:', error); } }} > theme.palette.mode === 'dark' ? 'grey.900' : 'grey.50', borderRadius: 1 }}> Error Details Failure Reason: {document.ocr_failure_reason || 'Not specified'} Error Message: theme.palette.mode === 'dark' ? 'grey.800' : 'grey.100', p: 1, borderRadius: 1, fontSize: '0.75rem', wordBreak: 'break-word' }} > {document.ocr_error || 'No error message available'} Last Attempt: {document.last_attempt_at ? format(new Date(document.last_attempt_at), 'PPpp') : 'No previous attempts'} File Created: {format(new Date(document.created_at), 'PPpp')} ))}
{/* Pagination */} {totalPages > 1 && ( setPagination(prev => ({ ...prev, page }))} color="primary" /> )} )} )} {/* Duplicates Tab Content */} {currentTab === 1 && ( <> {/* Duplicate Statistics Overview */} {duplicateStatistics && ( Total Duplicate Groups {duplicateStatistics.total_duplicate_groups} )} {duplicatesLoading ? ( ) : duplicates.length === 0 ? ( No duplicates found! You don't have any duplicate documents. All your files have unique content. ) : ( <> Duplicate Documents Found These documents have identical content but may have different filenames. You can expand each group to see all files with the same content and choose which ones to keep. What should you do?
  • Review each group: Click to expand and see all duplicate files
  • Keep the best version: Choose the file with the most descriptive name
  • Check content: Use View/Download to verify files are truly identical
  • Note for admin: Consider implementing bulk delete functionality for duplicates
  • Content Hash Duplicate Count First Uploaded Last Uploaded Actions {duplicates.map((group) => ( toggleDuplicateGroupExpansion(group.file_hash)} > {expandedDuplicateGroups.has(group.file_hash) ? : } {group.file_hash.substring(0, 16)}... {format(new Date(group.first_uploaded), 'MMM dd, yyyy')} {format(new Date(group.last_uploaded), 'MMM dd, yyyy')} View files below Duplicate Files ({group.duplicate_count} total) Storage Impact: These {group.duplicate_count} files contain identical content. Consider keeping only the best-named version to save space. {group.documents.map((doc, index) => ( {doc.filename} {index === 0 && ( )} {doc.original_filename !== doc.filename && ( Original: {doc.original_filename} )} {formatFileSize(doc.file_size)} • {doc.mime_type} Uploaded: {format(new Date(doc.created_at), 'MMM dd, yyyy HH:mm')} window.open(`/api/documents/${doc.id}/view`, '_blank')} sx={{ color: theme.palette.primary.main }} > { try { await documentService.downloadFile(doc.id, doc.original_filename || doc.filename); } catch (error) { console.error('Download failed:', error); } }} sx={{ color: theme.palette.secondary.main }} > ))} ))}
    {/* Duplicates Pagination */} {duplicatesTotalPages > 1 && ( setDuplicatesPagination(prev => ({ ...prev, page }))} color="primary" /> )} )} )} {/* Low Confidence Documents Tab Content */} {currentTab === 2 && ( <> Low Confidence Document Deletion This tool allows you to delete documents with OCR confidence below a specified threshold. Use the preview feature first to see what documents would be affected before deleting. setConfidenceThreshold(Math.max(0, Math.min(100, Number(e.target.value))))} fullWidth inputProps={{ min: 0, max: 100, step: 1 }} helperText="Documents with confidence below this value will be deleted" /> {/* Preview Results */} {previewData && ( Preview Results 0 ? 'warning.main' : 'success.main'}> {previewData.message} {previewData.matched_count > 0 && previewData.documents && ( Documents that would be deleted: Filename Size OCR Confidence Status Date {previewData.documents.slice(0, 20).map((doc: any) => ( {doc.original_filename || doc.filename} {formatFileSize(doc.file_size)} {doc.ocr_confidence ? `${doc.ocr_confidence.toFixed(1)}%` : 'N/A'} {new Date(doc.created_at).toLocaleDateString()} ))}
    {previewData.documents.length > 20 && ( ... and {previewData.documents.length - 20} more documents )}
    )}
    )} {/* Loading State */} {lowConfidenceLoading && !previewData && ( Processing request... )} )} {/* Delete Failed Documents Tab Content */} {currentTab === 3 && ( <> Delete Failed OCR Documents This tool allows you to delete all documents where OCR processing failed completely. This includes documents with NULL confidence values or explicit failure status. Use the preview feature first to see what documents would be affected before deleting. {/* Preview Results */} {failedPreviewData && ( Preview Results 0 ? 'error.main' : 'success.main'}> {failedPreviewData.message} {failedPreviewData.matched_count > 0 && ( Document IDs that would be deleted: {failedPreviewData.document_ids.slice(0, 10).join(', ')} {failedPreviewData.document_ids.length > 10 && ` ... and ${failedPreviewData.document_ids.length - 10} more`} )} )} {/* Loading State */} {failedDocsLoading && !failedPreviewData && ( Processing request... )} )} {/* Failed Documents Tab Content */} {currentTab === 4 && ( <> Failed Documents Overview This shows all documents that failed at any stage of processing: ingestion, validation, OCR, storage, etc. Use the filters to narrow down by failure stage or specific reason. {/* Filter Controls */} setFailedDocumentsFilters(prev => ({ ...prev, stage: e.target.value || undefined }))} fullWidth SelectProps={{ native: true }} > setFailedDocumentsFilters(prev => ({ ...prev, reason: e.target.value || undefined }))} fullWidth SelectProps={{ native: true }} > {/* Failed Documents List */} {failedDocuments.length > 0 && ( Failed Documents ({failedDocuments.length}) Filename Stage Reason Size Date Actions {failedDocuments.slice((failedDocumentsPagination.page - 1) * failedDocumentsPagination.limit, failedDocumentsPagination.page * failedDocumentsPagination.limit).map((doc: any) => ( {doc.original_filename || doc.filename} {doc.ingestion_source && ( )} {doc.file_size ? 
formatFileSize(doc.file_size) : 'N/A'} {new Date(doc.created_at).toLocaleDateString()} setSelectedFailedDocument(doc)} title="View Details" > {doc.existing_document_id && ( navigate(`/documents/${doc.existing_document_id}`)} title="View Existing Document" > )} ))}
    {/* Pagination */} {failedDocumentsTotalPages > 1 && ( setFailedDocumentsPagination(prev => ({ ...prev, page }))} color="primary" /> )}
    )} {/* Loading State */} {failedDocumentsLoading && ( Loading failed documents... )} {/* Empty State */} {!failedDocumentsLoading && failedDocuments.length === 0 && ( No Failed Documents Found No documents have failed processing with the current filters. This is good! )} )} {/* Confirmation Dialog */} setConfirmDeleteOpen(false)} maxWidth="sm" fullWidth > Confirm Low Confidence Document Deletion Are you sure you want to delete {previewData?.matched_count || 0} documents with OCR confidence below {confidenceThreshold}%? This action cannot be undone. The documents and their files will be permanently deleted. {/* Confirmation Dialog for Failed Documents */} setConfirmDeleteFailedOpen(false)} maxWidth="sm" fullWidth > Confirm Failed Document Deletion Are you sure you want to delete {failedPreviewData?.matched_count || 0} documents with failed OCR processing? This action cannot be undone. The documents and their files will be permanently deleted. {/* Document Details Dialog */} setDetailsOpen(false)} maxWidth="lg" fullWidth > Document Details: {selectedDocument?.filename} {selectedDocument && ( {/* File Preview Section */} File Preview { if (selectedDocument) { navigate(`/documents/${selectedDocument.id}`); } }} sx={{ cursor: 'pointer', border: '2px dashed', borderColor: 'primary.main', borderRadius: 2, p: 1, transition: 'all 0.2s ease-in-out', '&:hover': { borderColor: 'primary.dark', boxShadow: 2, }, }} > Click to open full document details page {/* Document Information Section */} Document Information Original Filename: {selectedDocument.original_filename} File Size: {formatFileSize(selectedDocument.file_size)} MIME Type: {selectedDocument.mime_type} Failure Category: Retry Count: {selectedDocument.retry_count} attempts Created: {format(new Date(selectedDocument.created_at), 'PPpp')} Last Updated: {format(new Date(selectedDocument.updated_at), 'PPpp')} Tags: {selectedDocument.tags.length > 0 ? 
( selectedDocument.tags.map((tag) => ( )) ) : ( No tags )} {/* Error Details Section */} Error Details Full Error Message: {selectedDocument.ocr_error || 'No error message available'} )} {selectedDocument?.can_retry && ( )} {/* Success/Error Snackbar */} setSnackbar(prev => ({ ...prev, open: false }))} > setSnackbar(prev => ({ ...prev, open: false }))} severity={snackbar.severity} sx={{ width: '100%' }} > {snackbar.message}
    ); }; export default FailedOcrPage;