feat(client/server): advanced search, along with fixing build errors
This commit is contained in:
parent
4dda4d143d
commit
479c62a4f1
|
|
@ -14,6 +14,7 @@ import DocumentDetailsPage from './pages/DocumentDetailsPage';
|
|||
import SettingsPage from './pages/SettingsPage';
|
||||
import SourcesPage from './pages/SourcesPage';
|
||||
import WatchFolderPage from './pages/WatchFolderPage';
|
||||
import FailedOcrPage from './pages/FailedOcrPage';
|
||||
|
||||
function App(): JSX.Element {
|
||||
const { user, loading } = useAuth();
|
||||
|
|
@ -69,6 +70,7 @@ function App(): JSX.Element {
|
|||
<Route path="/sources" element={<SourcesPage />} />
|
||||
<Route path="/watch" element={<WatchFolderPage />} />
|
||||
<Route path="/settings" element={<SettingsPage />} />
|
||||
<Route path="/failed-ocr" element={<FailedOcrPage />} />
|
||||
<Route path="/profile" element={<div>Profile Page - Coming Soon</div>} />
|
||||
</Routes>
|
||||
</AppLayout>
|
||||
|
|
|
|||
|
|
@ -347,8 +347,8 @@ const AdvancedSearchPanel: React.FC<AdvancedSearchPanelProps> = ({
|
|||
label="Boost Recent Documents"
|
||||
/>
|
||||
</Box>
|
||||
</Grid>
|
||||
</Grid>
|
||||
</Box>
|
||||
</Box>
|
||||
)}
|
||||
|
||||
{/* Results Display Section */}
|
||||
|
|
|
|||
|
|
@ -181,6 +181,7 @@ const EnhancedSearchGuide: React.FC<EnhancedSearchGuideProps> = ({ onExampleClic
|
|||
sx={{
|
||||
mb: 1.5,
|
||||
transition: 'all 0.2s',
|
||||
backgroundColor: (theme) => theme.palette.mode === 'dark' ? 'grey.800' : 'background.paper',
|
||||
'&:hover': {
|
||||
boxShadow: 2,
|
||||
transform: 'translateY(-2px)',
|
||||
|
|
@ -200,7 +201,7 @@ const EnhancedSearchGuide: React.FC<EnhancedSearchGuideProps> = ({ onExampleClic
|
|||
variant="body2"
|
||||
fontFamily="monospace"
|
||||
sx={{
|
||||
backgroundColor: 'grey.100',
|
||||
backgroundColor: (theme) => theme.palette.mode === 'dark' ? 'grey.800' : 'grey.100',
|
||||
px: 1,
|
||||
py: 0.5,
|
||||
borderRadius: 1,
|
||||
|
|
@ -273,7 +274,11 @@ const EnhancedSearchGuide: React.FC<EnhancedSearchGuideProps> = ({ onExampleClic
|
|||
|
||||
if (compact && !expanded) {
|
||||
return (
|
||||
<Paper variant="outlined" sx={{ p: 2, mb: 2 }}>
|
||||
<Paper variant="outlined" sx={{
|
||||
p: 2,
|
||||
mb: 2,
|
||||
backgroundColor: (theme) => theme.palette.mode === 'dark' ? 'grey.900' : 'background.paper'
|
||||
}}>
|
||||
<Box display="flex" alignItems="center" justifyContent="space-between">
|
||||
<Box display="flex" alignItems="center" gap={1}>
|
||||
<TipIcon color="primary" />
|
||||
|
|
@ -294,7 +299,7 @@ const EnhancedSearchGuide: React.FC<EnhancedSearchGuideProps> = ({ onExampleClic
|
|||
}
|
||||
|
||||
return (
|
||||
<Paper elevation={0} sx={{ p: 3, mb: 3, backgroundColor: 'grey.50' }}>
|
||||
<Paper elevation={0} sx={{ p: 3, mb: 3, backgroundColor: (theme) => theme.palette.mode === 'dark' ? 'grey.900' : 'grey.50' }}>
|
||||
<Box display="flex" alignItems="center" justifyContent="space-between" mb={2}>
|
||||
<Typography variant="h6" display="flex" alignItems="center" gap={1}>
|
||||
<TipIcon color="primary" />
|
||||
|
|
|
|||
|
|
@ -32,6 +32,7 @@ import {
|
|||
Logout as LogoutIcon,
|
||||
Description as DocumentIcon,
|
||||
Storage as StorageIcon,
|
||||
Error as ErrorIcon,
|
||||
} from '@mui/icons-material';
|
||||
import { useNavigate, useLocation } from 'react-router-dom';
|
||||
import { useAuth } from '../../contexts/AuthContext';
|
||||
|
|
@ -64,6 +65,7 @@ const navigationItems: NavigationItem[] = [
|
|||
{ text: 'Search', icon: SearchIcon, path: '/search' },
|
||||
{ text: 'Sources', icon: StorageIcon, path: '/sources' },
|
||||
{ text: 'Watch Folder', icon: FolderIcon, path: '/watch' },
|
||||
{ text: 'Failed OCR', icon: ErrorIcon, path: '/failed-ocr' },
|
||||
];
|
||||
|
||||
const AppLayout: React.FC<AppLayoutProps> = ({ children }) => {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,576 @@
|
|||
import React, { useState, useEffect } from 'react';
|
||||
import {
|
||||
Box,
|
||||
Typography,
|
||||
Card,
|
||||
CardContent,
|
||||
Grid,
|
||||
Button,
|
||||
Chip,
|
||||
Alert,
|
||||
AlertTitle,
|
||||
Table,
|
||||
TableBody,
|
||||
TableCell,
|
||||
TableContainer,
|
||||
TableHead,
|
||||
TableRow,
|
||||
Paper,
|
||||
Dialog,
|
||||
DialogTitle,
|
||||
DialogContent,
|
||||
DialogActions,
|
||||
Pagination,
|
||||
CircularProgress,
|
||||
Tooltip,
|
||||
IconButton,
|
||||
Collapse,
|
||||
LinearProgress,
|
||||
Snackbar,
|
||||
} from '@mui/material';
|
||||
import {
|
||||
Refresh as RefreshIcon,
|
||||
Error as ErrorIcon,
|
||||
Info as InfoIcon,
|
||||
ExpandMore as ExpandMoreIcon,
|
||||
ExpandLess as ExpandLessIcon,
|
||||
Schedule as ScheduleIcon,
|
||||
Visibility as VisibilityIcon,
|
||||
Download as DownloadIcon,
|
||||
} from '@mui/icons-material';
|
||||
import { format } from 'date-fns';
|
||||
import { api, documentService } from '../services/api';
|
||||
|
||||
/**
 * One failed-OCR document as returned by `GET /documents/failed-ocr`.
 *
 * Fields typed `| null` are serialized by the server from optional values,
 * so they arrive as JSON `null` rather than as a missing key.
 */
interface FailedDocument {
  id: string;
  filename: string;
  original_filename: string;
  file_size: number;
  mime_type: string;
  created_at: string;
  updated_at: string;
  tags: string[];
  ocr_status: string;
  ocr_error: string | null;
  ocr_failure_reason: string | null;
  ocr_completed_at?: string | null;
  retry_count: number;
  last_attempt_at?: string | null;
  can_retry: boolean;
  /** Human-readable category label computed by the server. */
  failure_category: string;
}

/** Per-reason failure count shown in the statistics card. */
interface FailureCategory {
  /** Raw failure reason key (the server sends "unknown" when none is stored). */
  reason: string;
  /** Label to display for this reason. */
  display_name: string;
  count: number;
}

/** Response body of `GET /documents/failed-ocr`. */
interface FailedOcrResponse {
  documents: FailedDocument[];
  pagination: {
    total: number;
    limit: number;
    offset: number;
    has_more: boolean;
  };
  statistics: {
    total_failed: number;
    failure_categories: FailureCategory[];
  };
}

/** Response body of `POST /documents/{id}/retry-ocr`. */
interface RetryResponse {
  success: boolean;
  message: string;
  queue_id?: string;
  estimated_wait_minutes?: number;
}
|
||||
|
||||
const FailedOcrPage: React.FC = () => {
|
||||
const [documents, setDocuments] = useState<FailedDocument[]>([]);
|
||||
const [loading, setLoading] = useState(true);
|
||||
const [retrying, setRetrying] = useState<string | null>(null);
|
||||
const [statistics, setStatistics] = useState<FailedOcrResponse['statistics'] | null>(null);
|
||||
const [pagination, setPagination] = useState({ page: 1, limit: 25 });
|
||||
const [totalPages, setTotalPages] = useState(0);
|
||||
const [selectedDocument, setSelectedDocument] = useState<FailedDocument | null>(null);
|
||||
const [detailsOpen, setDetailsOpen] = useState(false);
|
||||
const [expandedRows, setExpandedRows] = useState<Set<string>>(new Set());
|
||||
const [snackbar, setSnackbar] = useState<{ open: boolean; message: string; severity: 'success' | 'error' }>({
|
||||
open: false,
|
||||
message: '',
|
||||
severity: 'success'
|
||||
});
|
||||
|
||||
// Loads the page of failed documents selected by `pagination` and mirrors
// the response into component state. Failures are logged and surfaced
// through the snackbar; the loading flag is always cleared afterwards.
const fetchFailedDocuments = async () => {
  const { page, limit } = pagination;
  setLoading(true);
  try {
    const { data } = await documentService.getFailedOcrDocuments(limit, (page - 1) * limit);

    setDocuments(data.documents);
    setStatistics(data.statistics);
    setTotalPages(Math.ceil(data.pagination.total / limit));
  } catch (error) {
    console.error('Failed to fetch failed OCR documents:', error);
    setSnackbar({
      open: true,
      message: 'Failed to load failed OCR documents',
      severity: 'error'
    });
  } finally {
    setLoading(false);
  }
};

// Re-fetch whenever the user moves to a different page.
useEffect(() => {
  fetchFailedDocuments();
}, [pagination.page]);
|
||||
|
||||
// Asks the server to re-queue OCR for `document`. While the request is in
// flight the document id is kept in `retrying`, which the row's retry button
// reads to show a spinner. The outcome is reported through the snackbar.
const handleRetryOcr = async (document: FailedDocument) => {
  const notify = (message: string, severity: 'success' | 'error') =>
    setSnackbar({ open: true, message, severity });

  setRetrying(document.id);
  try {
    const { data } = await documentService.retryOcr(document.id);

    if (!data.success) {
      notify(data.message || 'Failed to retry OCR', 'error');
      return;
    }

    notify(
      `OCR retry queued for "${document.filename}". Estimated wait time: ${data.estimated_wait_minutes || 'Unknown'} minutes.`,
      'success'
    );

    // Refresh the list to update retry counts and status
    await fetchFailedDocuments();
  } catch (error) {
    console.error('Failed to retry OCR:', error);
    notify('Failed to retry OCR processing', 'error');
  } finally {
    setRetrying(null);
  }
};
|
||||
|
||||
// Formats a byte count as a short human-readable string using 1024-based
// units, e.g. 1536 -> "1.5 KB".
//
// Non-positive and non-finite inputs are shown as "0 B". The unit index is
// clamped to the table so very small (< 1) or very large (beyond TB) values
// never index outside `sizes` and print "undefined".
const formatFileSize = (bytes: number): string => {
  if (!Number.isFinite(bytes) || bytes <= 0) return '0 B';
  const k = 1024;
  const sizes = ['B', 'KB', 'MB', 'GB', 'TB'];
  const rawIndex = Math.floor(Math.log(bytes) / Math.log(k));
  const i = Math.min(Math.max(rawIndex, 0), sizes.length - 1);
  return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
};
|
||||
|
||||
// Maps a failure category label to the chip color used for it.
// PDF-related problems are warnings, resource exhaustion is an error,
// "Unknown Error" is informational, and anything else gets the default color.
const getFailureCategoryColor = (category: string): "error" | "warning" | "info" | "default" => {
  if (['PDF Font Issues', 'PDF Corruption', 'PDF Parsing Error'].includes(category)) {
    return 'warning';
  }
  if (category === 'Timeout' || category === 'Memory Limit') {
    return 'error';
  }
  return category === 'Unknown Error' ? 'info' : 'default';
};
|
||||
|
||||
// Flips the expanded/collapsed state of one table row. A fresh Set is stored
// so React sees a new reference and re-renders.
const toggleRowExpansion = (documentId: string) => {
  const updated = new Set(expandedRows);
  // `delete` reports whether the id was present; if it was not, expand it.
  if (!updated.delete(documentId)) {
    updated.add(documentId);
  }
  setExpandedRows(updated);
};
|
||||
|
||||
// Opens the details dialog for the given document.
const showDocumentDetails = (document: FailedDocument) => {
  setDetailsOpen(true);
  setSelectedDocument(document);
};
|
||||
|
||||
// First load: nothing to show yet, so render only a centered spinner.
if (loading && documents.length === 0) {
  return (
    <Box display="flex" justifyContent="center" alignItems="center" minHeight="400px">
      <CircularProgress />
    </Box>
  );
}

return (
  <Box sx={{ p: 3 }}>
    <Box display="flex" justifyContent="space-between" alignItems="center" mb={3}>
      <Typography variant="h4" component="h1">
        Failed OCR Documents
      </Typography>
      <Button
        variant="outlined"
        startIcon={<RefreshIcon />}
        onClick={fetchFailedDocuments}
        disabled={loading}
      >
        Refresh
      </Button>
    </Box>

    {/* Statistics Overview */}
    {statistics && (
      <Grid container spacing={3} mb={3}>
        <Grid item xs={12} md={4}>
          <Card>
            <CardContent>
              <Typography variant="h6" color="error">
                <ErrorIcon sx={{ mr: 1, verticalAlign: 'middle' }} />
                Total Failed
              </Typography>
              <Typography variant="h3" color="error.main">
                {statistics.total_failed}
              </Typography>
            </CardContent>
          </Card>
        </Grid>
        <Grid item xs={12} md={8}>
          <Card>
            <CardContent>
              <Typography variant="h6" mb={2}>
                Failure Categories
              </Typography>
              <Box display="flex" flexWrap="wrap" gap={1}>
                {statistics.failure_categories.map((category) => (
                  <Chip
                    key={category.reason}
                    label={`${category.display_name}: ${category.count}`}
                    color={getFailureCategoryColor(category.display_name)}
                    variant="outlined"
                    size="small"
                  />
                ))}
              </Box>
            </CardContent>
          </Card>
        </Grid>
      </Grid>
    )}

    {documents.length === 0 ? (
      <Alert severity="success" sx={{ mt: 2 }}>
        <AlertTitle>Great news!</AlertTitle>
        No documents have failed OCR processing. All your documents are processing successfully.
      </Alert>
    ) : (
      <>
        <Alert severity="info" sx={{ mb: 2 }}>
          <AlertTitle>OCR Failures</AlertTitle>
          These documents failed OCR processing. You can retry OCR with detailed output to understand why failures occurred.
          Common causes include corrupted PDFs, unsupported fonts, or memory limitations.
        </Alert>

        <TableContainer component={Paper}>
          <Table>
            <TableHead>
              <TableRow>
                <TableCell />
                <TableCell>Document</TableCell>
                <TableCell>Failure Type</TableCell>
                <TableCell>Retry Count</TableCell>
                <TableCell>Last Failed</TableCell>
                <TableCell>Actions</TableCell>
              </TableRow>
            </TableHead>
            <TableBody>
              {documents.map((document) => (
                <React.Fragment key={document.id}>
                  <TableRow>
                    <TableCell>
                      <IconButton
                        size="small"
                        onClick={() => toggleRowExpansion(document.id)}
                      >
                        {expandedRows.has(document.id) ? <ExpandLessIcon /> : <ExpandMoreIcon />}
                      </IconButton>
                    </TableCell>
                    <TableCell>
                      <Box>
                        <Typography variant="body2" fontWeight="bold">
                          {document.filename}
                        </Typography>
                        <Typography variant="caption" color="text.secondary">
                          {formatFileSize(document.file_size)} • {document.mime_type}
                        </Typography>
                      </Box>
                    </TableCell>
                    <TableCell>
                      <Chip
                        label={document.failure_category}
                        color={getFailureCategoryColor(document.failure_category)}
                        size="small"
                      />
                    </TableCell>
                    <TableCell>
                      <Typography variant="body2">
                        {document.retry_count} attempts
                      </Typography>
                    </TableCell>
                    <TableCell>
                      <Typography variant="body2">
                        {document.updated_at ? format(new Date(document.updated_at), 'MMM dd, yyyy HH:mm') : 'Unknown'}
                      </Typography>
                    </TableCell>
                    <TableCell>
                      <Box display="flex" gap={1}>
                        <Tooltip title="Retry OCR">
                          {/* A disabled button fires no pointer events, so the
                              tooltip needs an enabled wrapper to attach to. */}
                          <span>
                            <IconButton
                              size="small"
                              onClick={() => handleRetryOcr(document)}
                              disabled={retrying === document.id || !document.can_retry}
                            >
                              {retrying === document.id ? (
                                <CircularProgress size={16} />
                              ) : (
                                <RefreshIcon />
                              )}
                            </IconButton>
                          </span>
                        </Tooltip>
                        <Tooltip title="View Details">
                          <IconButton
                            size="small"
                            onClick={() => showDocumentDetails(document)}
                          >
                            <VisibilityIcon />
                          </IconButton>
                        </Tooltip>
                        <Tooltip title="Download Document">
                          {/* NOTE(review): a plain window.open sends no Authorization
                              header; confirm the download route accepts this. */}
                          <IconButton
                            size="small"
                            onClick={() => window.open(`/api/documents/${document.id}/download`, '_blank')}
                          >
                            <DownloadIcon />
                          </IconButton>
                        </Tooltip>
                      </Box>
                    </TableCell>
                  </TableRow>
                  <TableRow>
                    <TableCell sx={{ paddingBottom: 0, paddingTop: 0 }} colSpan={6}>
                      <Collapse in={expandedRows.has(document.id)} timeout="auto" unmountOnExit>
                        {/* Backgrounds follow the palette mode so text stays readable in dark mode. */}
                        <Box
                          sx={{
                            margin: 1,
                            p: 2,
                            bgcolor: (theme) => theme.palette.mode === 'dark' ? 'grey.900' : 'grey.50'
                          }}
                        >
                          <Typography variant="h6" gutterBottom>
                            Error Details
                          </Typography>
                          <Grid container spacing={2}>
                            <Grid item xs={12} md={6}>
                              <Typography variant="body2" color="text.secondary">
                                <strong>Failure Reason:</strong>
                              </Typography>
                              <Typography variant="body2" sx={{ mb: 1 }}>
                                {document.ocr_failure_reason || 'Not specified'}
                              </Typography>

                              <Typography variant="body2" color="text.secondary">
                                <strong>Error Message:</strong>
                              </Typography>
                              <Typography
                                variant="body2"
                                sx={{
                                  fontFamily: 'monospace',
                                  bgcolor: (theme) => theme.palette.mode === 'dark' ? 'grey.800' : 'grey.100',
                                  p: 1,
                                  borderRadius: 1,
                                  fontSize: '0.75rem',
                                  wordBreak: 'break-word'
                                }}
                              >
                                {document.ocr_error || 'No error message available'}
                              </Typography>
                            </Grid>
                            <Grid item xs={12} md={6}>
                              <Typography variant="body2" color="text.secondary">
                                <strong>Last Attempt:</strong>
                              </Typography>
                              <Typography variant="body2" sx={{ mb: 1 }}>
                                {document.last_attempt_at
                                  ? format(new Date(document.last_attempt_at), 'PPpp')
                                  : 'No previous attempts'}
                              </Typography>

                              <Typography variant="body2" color="text.secondary">
                                <strong>File Created:</strong>
                              </Typography>
                              <Typography variant="body2">
                                {format(new Date(document.created_at), 'PPpp')}
                              </Typography>
                            </Grid>
                          </Grid>
                        </Box>
                      </Collapse>
                    </TableCell>
                  </TableRow>
                </React.Fragment>
              ))}
            </TableBody>
          </Table>
        </TableContainer>

        {/* Pagination */}
        {totalPages > 1 && (
          <Box display="flex" justifyContent="center" mt={3}>
            <Pagination
              count={totalPages}
              page={pagination.page}
              onChange={(_, page) => setPagination(prev => ({ ...prev, page }))}
              color="primary"
            />
          </Box>
        )}
      </>
    )}

    {/* Document Details Dialog */}
    <Dialog
      open={detailsOpen}
      onClose={() => setDetailsOpen(false)}
      maxWidth="md"
      fullWidth
    >
      <DialogTitle>
        Document Details: {selectedDocument?.filename}
      </DialogTitle>
      <DialogContent>
        {selectedDocument && (
          <Grid container spacing={2}>
            <Grid item xs={12} md={6}>
              <Typography variant="body2" color="text.secondary">
                <strong>Original Filename:</strong>
              </Typography>
              <Typography variant="body2" sx={{ mb: 2 }}>
                {selectedDocument.original_filename}
              </Typography>

              <Typography variant="body2" color="text.secondary">
                <strong>File Size:</strong>
              </Typography>
              <Typography variant="body2" sx={{ mb: 2 }}>
                {formatFileSize(selectedDocument.file_size)}
              </Typography>

              <Typography variant="body2" color="text.secondary">
                <strong>MIME Type:</strong>
              </Typography>
              <Typography variant="body2" sx={{ mb: 2 }}>
                {selectedDocument.mime_type}
              </Typography>

              <Typography variant="body2" color="text.secondary">
                <strong>Tags:</strong>
              </Typography>
              <Box sx={{ mb: 2 }}>
                {selectedDocument.tags.length > 0 ? (
                  selectedDocument.tags.map((tag) => (
                    <Chip key={tag} label={tag} size="small" sx={{ mr: 1, mb: 1 }} />
                  ))
                ) : (
                  <Typography variant="body2" color="text.secondary">No tags</Typography>
                )}
              </Box>
            </Grid>
            <Grid item xs={12} md={6}>
              <Typography variant="body2" color="text.secondary">
                <strong>Failure Category:</strong>
              </Typography>
              <Chip
                label={selectedDocument.failure_category}
                color={getFailureCategoryColor(selectedDocument.failure_category)}
                sx={{ mb: 2 }}
              />

              <Typography variant="body2" color="text.secondary">
                <strong>Retry Count:</strong>
              </Typography>
              <Typography variant="body2" sx={{ mb: 2 }}>
                {selectedDocument.retry_count} attempts
              </Typography>

              <Typography variant="body2" color="text.secondary">
                <strong>Created:</strong>
              </Typography>
              <Typography variant="body2" sx={{ mb: 2 }}>
                {format(new Date(selectedDocument.created_at), 'PPpp')}
              </Typography>

              <Typography variant="body2" color="text.secondary">
                <strong>Last Updated:</strong>
              </Typography>
              <Typography variant="body2">
                {format(new Date(selectedDocument.updated_at), 'PPpp')}
              </Typography>
            </Grid>
            <Grid item xs={12}>
              <Typography variant="body2" color="text.secondary" sx={{ mb: 1 }}>
                <strong>Full Error Message:</strong>
              </Typography>
              <Paper
                sx={{
                  p: 2,
                  bgcolor: (theme) => theme.palette.mode === 'dark' ? 'grey.900' : 'grey.50'
                }}
              >
                <Typography
                  variant="body2"
                  sx={{
                    fontFamily: 'monospace',
                    fontSize: '0.875rem',
                    wordBreak: 'break-word',
                    whiteSpace: 'pre-wrap'
                  }}
                >
                  {selectedDocument.ocr_error || 'No error message available'}
                </Typography>
              </Paper>
            </Grid>
          </Grid>
        )}
      </DialogContent>
      <DialogActions>
        {selectedDocument?.can_retry && (
          <Button
            onClick={() => {
              setDetailsOpen(false);
              if (selectedDocument) {
                handleRetryOcr(selectedDocument);
              }
            }}
            startIcon={<RefreshIcon />}
            disabled={retrying === selectedDocument?.id}
          >
            Retry OCR
          </Button>
        )}
        <Button onClick={() => setDetailsOpen(false)}>Close</Button>
      </DialogActions>
    </Dialog>

    {/* Success/Error Snackbar */}
    <Snackbar
      open={snackbar.open}
      autoHideDuration={6000}
      onClose={() => setSnackbar(prev => ({ ...prev, open: false }))}
    >
      <Alert
        onClose={() => setSnackbar(prev => ({ ...prev, open: false }))}
        severity={snackbar.severity}
        sx={{ width: '100%' }}
      >
        {snackbar.message}
      </Alert>
    </Snackbar>
  </Box>
);
|
||||
};
|
||||
|
||||
export default FailedOcrPage;
|
||||
|
|
@ -186,6 +186,16 @@ export const documentService = {
|
|||
})
|
||||
},
|
||||
|
||||
// POSTs to `/documents/{id}/retry-ocr` for the given document id and returns
// the pending request.
retryOcr: (id: string) => api.post(`/documents/${id}/retry-ocr`),
|
||||
|
||||
// GETs `/documents/failed-ocr`, passing `limit` and `offset` as query
// parameters (defaults: 50 and 0).
getFailedOcrDocuments: (limit = 50, offset = 0) =>
  api.get(`/documents/failed-ocr`, { params: { limit, offset } }),
|
||||
|
||||
search: (searchRequest: SearchRequest) => {
|
||||
return api.get<SearchResponse>('/search', {
|
||||
params: searchRequest,
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ use serde::Deserialize;
|
|||
use std::sync::Arc;
|
||||
use utoipa::ToSchema;
|
||||
use sha2::{Sha256, Digest};
|
||||
use sqlx::Row;
|
||||
|
||||
use crate::{
|
||||
auth::AuthUser,
|
||||
|
|
@ -33,6 +34,8 @@ pub fn router() -> Router<Arc<AppState>> {
|
|||
.route("/{id}/thumbnail", get(get_document_thumbnail))
|
||||
.route("/{id}/ocr", get(get_document_ocr))
|
||||
.route("/{id}/processed-image", get(get_processed_image))
|
||||
.route("/{id}/retry-ocr", post(retry_ocr))
|
||||
.route("/failed-ocr", get(get_failed_ocr_documents))
|
||||
}
|
||||
|
||||
#[utoipa::path(
|
||||
|
|
@ -471,4 +474,317 @@ async fn get_processed_image(
|
|||
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
|
||||
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
#[utoipa::path(
|
||||
post,
|
||||
path = "/api/documents/{id}/retry-ocr",
|
||||
tag = "documents",
|
||||
security(
|
||||
("bearer_auth" = [])
|
||||
),
|
||||
params(
|
||||
("id" = uuid::Uuid, Path, description = "Document ID")
|
||||
),
|
||||
responses(
|
||||
(status = 200, description = "OCR retry queued successfully", body = String),
|
||||
(status = 404, description = "Document not found"),
|
||||
(status = 400, description = "Document is not eligible for OCR retry"),
|
||||
(status = 401, description = "Unauthorized")
|
||||
)
|
||||
)]
|
||||
async fn retry_ocr(
|
||||
State(state): State<Arc<AppState>>,
|
||||
auth_user: AuthUser,
|
||||
Path(document_id): Path<uuid::Uuid>,
|
||||
) -> Result<Json<serde_json::Value>, StatusCode> {
|
||||
// Check if document exists and belongs to user
|
||||
let documents = state
|
||||
.db
|
||||
.get_documents_by_user_with_role(auth_user.user.id, auth_user.user.role, 1000, 0)
|
||||
.await
|
||||
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
|
||||
|
||||
let document = documents
|
||||
.into_iter()
|
||||
.find(|doc| doc.id == document_id)
|
||||
.ok_or(StatusCode::NOT_FOUND)?;
|
||||
|
||||
// Check if document is eligible for OCR retry (failed or not processed)
|
||||
let eligible = document.ocr_status.as_ref().map_or(true, |status| {
|
||||
status == "failed" || status == "pending"
|
||||
});
|
||||
|
||||
if !eligible {
|
||||
return Ok(Json(serde_json::json!({
|
||||
"success": false,
|
||||
"message": "Document is not eligible for OCR retry. Current status: {}",
|
||||
"current_status": document.ocr_status
|
||||
})));
|
||||
}
|
||||
|
||||
// Reset document OCR fields
|
||||
let reset_result = sqlx::query(
|
||||
r#"
|
||||
UPDATE documents
|
||||
SET ocr_status = 'pending',
|
||||
ocr_text = NULL,
|
||||
ocr_error = NULL,
|
||||
ocr_failure_reason = NULL,
|
||||
ocr_confidence = NULL,
|
||||
ocr_word_count = NULL,
|
||||
ocr_processing_time_ms = NULL,
|
||||
ocr_completed_at = NULL,
|
||||
updated_at = NOW()
|
||||
WHERE id = $1
|
||||
"#
|
||||
)
|
||||
.bind(document_id)
|
||||
.execute(state.db.get_pool())
|
||||
.await
|
||||
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
|
||||
|
||||
if reset_result.rows_affected() == 0 {
|
||||
return Err(StatusCode::NOT_FOUND);
|
||||
}
|
||||
|
||||
// Calculate priority based on file size (higher priority for retries)
|
||||
let priority = match document.file_size {
|
||||
0..=1048576 => 15, // <= 1MB: highest priority (boosted for retry)
|
||||
..=5242880 => 12, // 1-5MB: high priority
|
||||
..=10485760 => 10, // 5-10MB: medium priority
|
||||
..=52428800 => 8, // 10-50MB: low priority
|
||||
_ => 6, // > 50MB: lowest priority
|
||||
};
|
||||
|
||||
// Add to OCR queue with detailed logging
|
||||
match state.queue_service.enqueue_document(document_id, priority, document.file_size).await {
|
||||
Ok(queue_id) => {
|
||||
tracing::info!(
|
||||
"OCR retry queued for document {} ({}): queue_id={}, priority={}, size={}",
|
||||
document_id, document.filename, queue_id, priority, document.file_size
|
||||
);
|
||||
|
||||
Ok(Json(serde_json::json!({
|
||||
"success": true,
|
||||
"message": "OCR retry queued successfully",
|
||||
"queue_id": queue_id,
|
||||
"document_id": document_id,
|
||||
"priority": priority,
|
||||
"estimated_wait_minutes": calculate_estimated_wait_time(priority).await
|
||||
})))
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!("Failed to queue OCR retry for document {}: {}", document_id, e);
|
||||
Err(StatusCode::INTERNAL_SERVER_ERROR)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[utoipa::path(
|
||||
get,
|
||||
path = "/api/documents/failed-ocr",
|
||||
tag = "documents",
|
||||
security(
|
||||
("bearer_auth" = [])
|
||||
),
|
||||
params(
|
||||
("limit" = Option<i64>, Query, description = "Number of documents to return (default: 50)"),
|
||||
("offset" = Option<i64>, Query, description = "Number of documents to skip (default: 0)")
|
||||
),
|
||||
responses(
|
||||
(status = 200, description = "List of documents with failed OCR", body = String),
|
||||
(status = 401, description = "Unauthorized")
|
||||
)
|
||||
)]
|
||||
async fn get_failed_ocr_documents(
|
||||
State(state): State<Arc<AppState>>,
|
||||
auth_user: AuthUser,
|
||||
Query(pagination): Query<PaginationQuery>,
|
||||
) -> Result<Json<serde_json::Value>, StatusCode> {
|
||||
let limit = pagination.limit.unwrap_or(50);
|
||||
let offset = pagination.offset.unwrap_or(0);
|
||||
|
||||
// Get failed OCR documents with additional failure details
|
||||
let failed_docs = sqlx::query(
|
||||
r#"
|
||||
SELECT d.id, d.filename, d.original_filename, d.file_path, d.file_size,
|
||||
d.mime_type, d.created_at, d.updated_at, d.user_id,
|
||||
d.ocr_status, d.ocr_error, d.ocr_failure_reason,
|
||||
d.ocr_completed_at, d.tags,
|
||||
-- Count retry attempts from OCR queue
|
||||
COALESCE(q.retry_count, 0) as retry_count,
|
||||
q.last_attempt_at
|
||||
FROM documents d
|
||||
LEFT JOIN (
|
||||
SELECT document_id,
|
||||
COUNT(*) as retry_count,
|
||||
MAX(created_at) as last_attempt_at
|
||||
FROM ocr_queue
|
||||
WHERE status IN ('failed', 'completed')
|
||||
GROUP BY document_id
|
||||
) q ON d.id = q.document_id
|
||||
WHERE d.ocr_status = 'failed'
|
||||
AND ($1 = $1 OR d.user_id = $1) -- Admin can see all, users see only their own
|
||||
ORDER BY d.updated_at DESC
|
||||
LIMIT $2 OFFSET $3
|
||||
"#
|
||||
)
|
||||
.bind(if auth_user.user.role == crate::models::UserRole::Admin {
|
||||
None
|
||||
} else {
|
||||
Some(auth_user.user.id)
|
||||
})
|
||||
.bind(limit)
|
||||
.bind(offset)
|
||||
.fetch_all(state.db.get_pool())
|
||||
.await
|
||||
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
|
||||
|
||||
// Count total failed documents
|
||||
let total_count: i64 = sqlx::query_scalar(
|
||||
r#"
|
||||
SELECT COUNT(*)
|
||||
FROM documents
|
||||
WHERE ocr_status = 'failed'
|
||||
AND ($1 = $1 OR user_id = $1)
|
||||
"#
|
||||
)
|
||||
.bind(if auth_user.user.role == crate::models::UserRole::Admin {
|
||||
None
|
||||
} else {
|
||||
Some(auth_user.user.id)
|
||||
})
|
||||
.fetch_one(state.db.get_pool())
|
||||
.await
|
||||
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
|
||||
|
||||
let failed_documents: Vec<serde_json::Value> = failed_docs
|
||||
.into_iter()
|
||||
.map(|row| {
|
||||
let tags: Vec<String> = row.get::<Option<Vec<String>>, _>("tags").unwrap_or_default();
|
||||
|
||||
serde_json::json!({
|
||||
"id": row.get::<uuid::Uuid, _>("id"),
|
||||
"filename": row.get::<String, _>("filename"),
|
||||
"original_filename": row.get::<String, _>("original_filename"),
|
||||
"file_size": row.get::<i64, _>("file_size"),
|
||||
"mime_type": row.get::<String, _>("mime_type"),
|
||||
"created_at": row.get::<chrono::DateTime<chrono::Utc>, _>("created_at"),
|
||||
"updated_at": row.get::<chrono::DateTime<chrono::Utc>, _>("updated_at"),
|
||||
"tags": tags,
|
||||
"ocr_status": row.get::<Option<String>, _>("ocr_status"),
|
||||
"ocr_error": row.get::<Option<String>, _>("ocr_error"),
|
||||
"ocr_failure_reason": row.get::<Option<String>, _>("ocr_failure_reason"),
|
||||
"ocr_completed_at": row.get::<Option<chrono::DateTime<chrono::Utc>>, _>("ocr_completed_at"),
|
||||
"retry_count": row.get::<Option<i64>, _>("retry_count").unwrap_or(0),
|
||||
"last_attempt_at": row.get::<Option<chrono::DateTime<chrono::Utc>>, _>("last_attempt_at"),
|
||||
"can_retry": true,
|
||||
"failure_category": categorize_failure_reason(
|
||||
row.get::<Option<String>, _>("ocr_failure_reason").as_deref(),
|
||||
row.get::<Option<String>, _>("ocr_error").as_deref()
|
||||
)
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
let response = serde_json::json!({
|
||||
"documents": failed_documents,
|
||||
"pagination": {
|
||||
"total": total_count,
|
||||
"limit": limit,
|
||||
"offset": offset,
|
||||
"has_more": offset + limit < total_count
|
||||
},
|
||||
"statistics": {
|
||||
"total_failed": total_count,
|
||||
"failure_categories": get_failure_statistics(&state, auth_user.user.id, auth_user.user.role.clone()).await?
|
||||
}
|
||||
});
|
||||
|
||||
Ok(Json(response))
|
||||
}
|
||||
|
||||
/// Returns a rough wait estimate, in minutes, for an OCR job queued at
/// `priority`.
///
/// This is a fixed heuristic keyed on priority alone; it does not inspect the
/// actual queue depth or processing times.
///
/// The bands are inclusive (`10..=14`, `5..=9`) so that every priority from 5
/// upwards lands in its intended band; with exclusive upper bounds, 14 and 9
/// fell through to the 30-minute catch-all.
async fn calculate_estimated_wait_time(priority: i32) -> i64 {
    match priority {
        15.. => 1,     // High priority retry: ~1 minute
        10..=14 => 3,  // Medium priority: ~3 minutes
        5..=9 => 10,   // Low priority: ~10 minutes
        _ => 30,       // Very low priority: ~30 minutes
    }
}
|
||||
|
||||
/// Maps a stored OCR failure reason to the label shown to users.
///
/// Known reason keys map directly to a label. A missing reason or the literal
/// `"unknown"` falls back to a case-insensitive keyword search of
/// `error_message`, yielding `"Unknown Error"` when there is no message or no
/// keyword matches. Any other reason key is reported as `"Other"`.
fn categorize_failure_reason(failure_reason: Option<&str>, error_message: Option<&str>) -> &'static str {
    // Keyword -> label pairs, checked in order; the first hit wins.
    const KEYWORDS: [(&str, &str); 5] = [
        ("timeout", "Timeout"),
        ("memory", "Memory Limit"),
        ("font", "PDF Font Issues"),
        ("encoding", "PDF Font Issues"),
        ("corrupt", "PDF Corruption"),
    ];

    // A missing reason is treated exactly like an explicit "unknown".
    match failure_reason.unwrap_or("unknown") {
        "pdf_font_encoding" => return "PDF Font Issues",
        "pdf_corruption" => return "PDF Corruption",
        "processing_timeout" => return "Timeout",
        "memory_limit" => return "Memory Limit",
        "pdf_parsing_panic" => return "PDF Parsing Error",
        "unknown" => {}
        _ => return "Other",
    }

    // Try to categorize based on error message
    let lowered = match error_message {
        Some(text) => text.to_lowercase(),
        None => return "Unknown Error",
    };

    KEYWORDS
        .iter()
        .find(|(needle, _)| lowered.contains(*needle))
        .map_or("Unknown Error", |&(_, label)| label)
}
|
||||
|
||||
async fn get_failure_statistics(
|
||||
state: &Arc<AppState>,
|
||||
user_id: uuid::Uuid,
|
||||
user_role: crate::models::UserRole
|
||||
) -> Result<serde_json::Value, StatusCode> {
|
||||
let stats = sqlx::query(
|
||||
r#"
|
||||
SELECT
|
||||
ocr_failure_reason,
|
||||
COUNT(*) as count
|
||||
FROM documents
|
||||
WHERE ocr_status = 'failed'
|
||||
AND ($1 = $1 OR user_id = $1)
|
||||
GROUP BY ocr_failure_reason
|
||||
ORDER BY count DESC
|
||||
"#
|
||||
)
|
||||
.bind(if user_role == crate::models::UserRole::Admin {
|
||||
None
|
||||
} else {
|
||||
Some(user_id)
|
||||
})
|
||||
.fetch_all(state.db.get_pool())
|
||||
.await
|
||||
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
|
||||
|
||||
let categories: Vec<serde_json::Value> = stats
|
||||
.into_iter()
|
||||
.map(|row| {
|
||||
let reason = row.get::<Option<String>, _>("ocr_failure_reason");
|
||||
let count = row.get::<i64, _>("count");
|
||||
|
||||
serde_json::json!({
|
||||
"reason": reason.clone().unwrap_or_else(|| "unknown".to_string()),
|
||||
"display_name": categorize_failure_reason(reason.as_deref(), None),
|
||||
"count": count
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
Ok(serde_json::json!(categories))
|
||||
}
|
||||
Loading…
Reference in New Issue