feat(server/client): add failed_documents table to handle failures, and move failure-handling logic

parent 881e4c5a8e
commit 34bc207e39
@@ -36,6 +36,7 @@ import {
   Label as LabelIcon,
   Block as BlockIcon,
   Api as ApiIcon,
+  ManageAccounts as ManageIcon,
 } from '@mui/icons-material';
 import { useNavigate, useLocation } from 'react-router-dom';
 import { useAuth } from '../../contexts/AuthContext';
@@ -69,7 +70,7 @@ const navigationItems: NavigationItem[] = [
   { text: 'Labels', icon: LabelIcon, path: '/labels' },
   { text: 'Sources', icon: StorageIcon, path: '/sources' },
   { text: 'Watch Folder', icon: FolderIcon, path: '/watch' },
-  { text: 'Failed OCR', icon: ErrorIcon, path: '/failed-ocr' },
+  { text: 'Document Management', icon: ManageIcon, path: '/failed-ocr' },
   { text: 'Ignored Files', icon: BlockIcon, path: '/ignored-files' },
 ];
@@ -47,6 +47,7 @@ import {
   Delete as DeleteIcon,
   FindInPage as FindInPageIcon,
   OpenInNew as OpenInNewIcon,
+  Warning as WarningIcon,
 } from '@mui/icons-material';
 import { format } from 'date-fns';
 import { api, documentService, queueService } from '../services/api';
@@ -135,16 +136,22 @@ const FailedOcrPage: React.FC = () => {
   const [currentTab, setCurrentTab] = useState(0);
   const [documents, setDocuments] = useState<FailedDocument[]>([]);
   const [duplicates, setDuplicates] = useState<DuplicateGroup[]>([]);
+  const [failedDocuments, setFailedDocuments] = useState<any[]>([]);
   const [loading, setLoading] = useState(true);
   const [duplicatesLoading, setDuplicatesLoading] = useState(false);
+  const [failedDocumentsLoading, setFailedDocumentsLoading] = useState(false);
+  const [failedDocumentsFilters, setFailedDocumentsFilters] = useState<{ stage?: string; reason?: string }>({});
+  const [selectedFailedDocument, setSelectedFailedDocument] = useState<any>(null);
   const [retrying, setRetrying] = useState<string | null>(null);
   const [retryingAll, setRetryingAll] = useState(false);
   const [statistics, setStatistics] = useState<FailedOcrResponse['statistics'] | null>(null);
   const [duplicateStatistics, setDuplicateStatistics] = useState<DuplicatesResponse['statistics'] | null>(null);
   const [pagination, setPagination] = useState({ page: 1, limit: 25 });
   const [duplicatesPagination, setDuplicatesPagination] = useState({ page: 1, limit: 25 });
+  const [failedDocumentsPagination, setFailedDocumentsPagination] = useState({ page: 1, limit: 25 });
   const [totalPages, setTotalPages] = useState(0);
   const [duplicatesTotalPages, setDuplicatesTotalPages] = useState(0);
+  const [failedDocumentsTotalPages, setFailedDocumentsTotalPages] = useState(0);
   const [selectedDocument, setSelectedDocument] = useState<FailedDocument | null>(null);
   const [detailsOpen, setDetailsOpen] = useState(false);
   const [expandedRows, setExpandedRows] = useState<Set<string>>(new Set());
@@ -223,8 +230,59 @@ const FailedOcrPage: React.FC = () => {
   useEffect(() => {
     if (currentTab === 1) {
       fetchDuplicates();
+    } else if (currentTab === 4) {
+      fetchFailedDocumentsList();
     }
-  }, [currentTab, duplicatesPagination.page]);
+  }, [currentTab, duplicatesPagination.page, failedDocumentsPagination.page, failedDocumentsFilters]);
+
+  const fetchFailedDocumentsList = async () => {
+    try {
+      setFailedDocumentsLoading(true);
+      const offset = (failedDocumentsPagination.page - 1) * failedDocumentsPagination.limit;
+      const response = await documentService.getFailedDocuments(
+        failedDocumentsPagination.limit,
+        offset,
+        failedDocumentsFilters.stage,
+        failedDocumentsFilters.reason
+      );
+
+      if (response?.data) {
+        setFailedDocuments(response.data.documents || []);
+        if (response.data.pagination) {
+          setFailedDocumentsTotalPages(Math.ceil(response.data.pagination.total / failedDocumentsPagination.limit));
+        }
+      }
+    } catch (error) {
+      console.error('Failed to fetch failed documents:', error);
+      setSnackbar({
+        open: true,
+        message: 'Failed to load failed documents',
+        severity: 'error'
+      });
+    } finally {
+      setFailedDocumentsLoading(false);
+    }
+  };
+
+  const getFailureReasonColor = (reason: string): "error" | "warning" | "info" | "default" => {
+    switch (reason) {
+      case 'low_ocr_confidence':
+      case 'ocr_timeout':
+      case 'ocr_memory_limit':
+      case 'pdf_parsing_error':
+        return 'error';
+      case 'duplicate_content':
+      case 'unsupported_format':
+      case 'file_too_large':
+        return 'warning';
+      case 'file_corrupted':
+      case 'access_denied':
+      case 'permission_denied':
+        return 'error';
+      default:
+        return 'default';
+    }
+  };
+
   const handleRetryOcr = async (document: FailedDocument) => {
     try {
@@ -309,6 +367,8 @@ const FailedOcrPage: React.FC = () => {
       case 'Timeout':
       case 'Memory Limit':
         return 'error';
+      case 'Low OCR Confidence':
+        return 'warning';
       case 'Unknown Error':
         return 'info';
       default:
@@ -354,6 +414,8 @@ const FailedOcrPage: React.FC = () => {
       handlePreviewLowConfidence();
     } else if (currentTab === 3) {
       handlePreviewFailedDocuments();
+    } else if (currentTab === 4) {
+      fetchFailedDocumentsList();
     }
   };
@@ -488,22 +550,27 @@ const FailedOcrPage: React.FC = () => {
         <Tabs value={currentTab} onChange={handleTabChange} aria-label="document management tabs">
           <Tab
             icon={<ErrorIcon />}
-            label={`Failed OCR${statistics ? ` (${statistics.total_failed})` : ''}`}
+            label={`Failed Documents${statistics ? ` (${statistics.total_failed})` : ''}`}
             iconPosition="start"
           />
           <Tab
             icon={<FileCopyIcon />}
-            label={`Duplicates${duplicateStatistics ? ` (${duplicateStatistics.total_duplicate_groups})` : ''}`}
+            label={`Duplicate Files${duplicateStatistics ? ` (${duplicateStatistics.total_duplicate_groups})` : ''}`}
             iconPosition="start"
           />
           <Tab
             icon={<FindInPageIcon />}
-            label={`Low Confidence${previewData ? ` (${previewData.matched_count})` : ''}`}
+            label={`Low Quality Manager${previewData ? ` (${previewData.matched_count})` : ''}`}
             iconPosition="start"
           />
           <Tab
             icon={<DeleteIcon />}
-            label="Delete Failed"
+            label="Bulk Cleanup"
+            iconPosition="start"
+          />
+          <Tab
+            icon={<WarningIcon />}
+            label="Failed Documents"
             iconPosition="start"
           />
         </Tabs>
@@ -1073,15 +1140,62 @@ const FailedOcrPage: React.FC = () => {
               <Typography color={previewData.matched_count > 0 ? 'warning.main' : 'success.main'}>
                 {previewData.message}
               </Typography>
-              {previewData.matched_count > 0 && (
+              {previewData.matched_count > 0 && previewData.documents && (
                 <Box sx={{ mt: 2 }}>
-                  <Typography variant="body2" color="text.secondary">
-                    Document IDs that would be deleted:
+                  <Typography variant="body2" color="text.secondary" gutterBottom>
+                    Documents that would be deleted:
                   </Typography>
-                  <Typography variant="body2" sx={{ fontFamily: 'monospace', wordBreak: 'break-all' }}>
-                    {previewData.document_ids.slice(0, 10).join(', ')}
-                    {previewData.document_ids.length > 10 && ` ... and ${previewData.document_ids.length - 10} more`}
-                  </Typography>
+                  <TableContainer component={Paper} variant="outlined" sx={{ mt: 2 }}>
+                    <Table size="small">
+                      <TableHead>
+                        <TableRow>
+                          <TableCell>Filename</TableCell>
+                          <TableCell>Size</TableCell>
+                          <TableCell>OCR Confidence</TableCell>
+                          <TableCell>Status</TableCell>
+                          <TableCell>Date</TableCell>
+                        </TableRow>
+                      </TableHead>
+                      <TableBody>
+                        {previewData.documents.slice(0, 20).map((doc: any) => (
+                          <TableRow key={doc.id}>
+                            <TableCell>
+                              <Typography variant="body2" noWrap>
+                                {doc.original_filename || doc.filename}
+                              </Typography>
+                            </TableCell>
+                            <TableCell>
+                              <Typography variant="body2">
+                                {formatFileSize(doc.file_size)}
+                              </Typography>
+                            </TableCell>
+                            <TableCell>
+                              <Typography variant="body2" color={doc.ocr_confidence ? 'warning.main' : 'error.main'}>
+                                {doc.ocr_confidence ? `${doc.ocr_confidence.toFixed(1)}%` : 'N/A'}
+                              </Typography>
+                            </TableCell>
+                            <TableCell>
+                              <Chip
+                                size="small"
+                                label={doc.ocr_status || 'Unknown'}
+                                color={doc.ocr_status === 'failed' ? 'error' : 'default'}
+                              />
+                            </TableCell>
+                            <TableCell>
+                              <Typography variant="body2">
+                                {new Date(doc.created_at).toLocaleDateString()}
+                              </Typography>
+                            </TableCell>
+                          </TableRow>
+                        ))}
+                      </TableBody>
+                    </Table>
+                  </TableContainer>
+                  {previewData.documents.length > 20 && (
+                    <Typography variant="body2" color="text.secondary" sx={{ mt: 1 }}>
+                      ... and {previewData.documents.length - 20} more documents
+                    </Typography>
+                  )}
                 </Box>
               )}
             </CardContent>
@@ -1175,6 +1289,187 @@ const FailedOcrPage: React.FC = () => {
         </>
       )}

+      {/* Failed Documents Tab Content */}
+      {currentTab === 4 && (
+        <>
+          <Alert severity="info" sx={{ mb: 3 }}>
+            <AlertTitle>Failed Documents Overview</AlertTitle>
+            <Typography>
+              This shows all documents that failed at any stage of processing: ingestion, validation, OCR, storage, etc.
+              Use the filters to narrow down by failure stage or specific reason.
+            </Typography>
+          </Alert>
+
+          {/* Filter Controls */}
+          <Card sx={{ mb: 3 }}>
+            <CardContent>
+              <Grid container spacing={3} alignItems="center">
+                <Grid item xs={12} md={3}>
+                  <TextField
+                    label="Filter by Stage"
+                    select
+                    value={failedDocumentsFilters.stage || ''}
+                    onChange={(e) => setFailedDocumentsFilters(prev => ({ ...prev, stage: e.target.value || undefined }))}
+                    fullWidth
+                    SelectProps={{ native: true }}
+                  >
+                    <option value="">All Stages</option>
+                    <option value="ocr">OCR Processing</option>
+                    <option value="ingestion">Document Ingestion</option>
+                    <option value="validation">Validation</option>
+                    <option value="storage">File Storage</option>
+                    <option value="processing">Processing</option>
+                    <option value="sync">Synchronization</option>
+                  </TextField>
+                </Grid>
+                <Grid item xs={12} md={3}>
+                  <TextField
+                    label="Filter by Reason"
+                    select
+                    value={failedDocumentsFilters.reason || ''}
+                    onChange={(e) => setFailedDocumentsFilters(prev => ({ ...prev, reason: e.target.value || undefined }))}
+                    fullWidth
+                    SelectProps={{ native: true }}
+                  >
+                    <option value="">All Reasons</option>
+                    <option value="duplicate_content">Duplicate Content</option>
+                    <option value="low_ocr_confidence">Low OCR Confidence</option>
+                    <option value="unsupported_format">Unsupported Format</option>
+                    <option value="file_too_large">File Too Large</option>
+                    <option value="file_corrupted">File Corrupted</option>
+                    <option value="ocr_timeout">OCR Timeout</option>
+                    <option value="pdf_parsing_error">PDF Parsing Error</option>
+                    <option value="other">Other</option>
+                  </TextField>
+                </Grid>
+                <Grid item xs={12} md={2}>
+                  <Button
+                    variant="outlined"
+                    onClick={fetchFailedDocumentsList}
+                    disabled={failedDocumentsLoading}
+                    startIcon={failedDocumentsLoading ? <CircularProgress size={20} /> : <RefreshIcon />}
+                    fullWidth
+                  >
+                    Apply Filters
+                  </Button>
+                </Grid>
+              </Grid>
+            </CardContent>
+          </Card>
+
+          {/* Failed Documents List */}
+          {failedDocuments.length > 0 && (
+            <Card sx={{ mb: 3 }}>
+              <CardContent>
+                <Typography variant="h6" gutterBottom>
+                  Failed Documents ({failedDocuments.length})
+                </Typography>
+                <TableContainer component={Paper} variant="outlined">
+                  <Table>
+                    <TableHead>
+                      <TableRow>
+                        <TableCell>Filename</TableCell>
+                        <TableCell>Stage</TableCell>
+                        <TableCell>Reason</TableCell>
+                        <TableCell>Size</TableCell>
+                        <TableCell>Date</TableCell>
+                        <TableCell>Actions</TableCell>
+                      </TableRow>
+                    </TableHead>
+                    <TableBody>
+                      {failedDocuments.slice((failedDocumentsPagination.page - 1) * failedDocumentsPagination.limit, failedDocumentsPagination.page * failedDocumentsPagination.limit).map((doc: any) => (
+                        <TableRow key={doc.id}>
+                          <TableCell>
+                            <Typography variant="body2" noWrap>
+                              {doc.original_filename || doc.filename}
+                            </Typography>
+                            {doc.ingestion_source && (
+                              <Chip size="small" label={doc.ingestion_source} variant="outlined" sx={{ mt: 0.5 }} />
+                            )}
+                          </TableCell>
+                          <TableCell>
+                            <Chip
+                              size="small"
+                              label={doc.source || doc.failure_stage}
+                              color={doc.failure_stage === 'ocr' ? 'error' : 'warning'}
+                            />
+                          </TableCell>
+                          <TableCell>
+                            <Chip
+                              size="small"
+                              label={doc.failure_category || doc.failure_reason}
+                              color={getFailureReasonColor(doc.failure_reason)}
+                            />
+                          </TableCell>
+                          <TableCell>
+                            <Typography variant="body2">
+                              {doc.file_size ? formatFileSize(doc.file_size) : 'N/A'}
+                            </Typography>
+                          </TableCell>
+                          <TableCell>
+                            <Typography variant="body2">
+                              {new Date(doc.created_at).toLocaleDateString()}
+                            </Typography>
+                          </TableCell>
+                          <TableCell>
+                            <IconButton
+                              size="small"
+                              onClick={() => setSelectedFailedDocument(doc)}
+                              title="View Details"
+                            >
+                              <InfoIcon />
+                            </IconButton>
+                            {doc.existing_document_id && (
+                              <IconButton
+                                size="small"
+                                onClick={() => navigate(`/documents/${doc.existing_document_id}`)}
+                                title="View Existing Document"
+                              >
+                                <OpenInNewIcon />
+                              </IconButton>
+                            )}
+                          </TableCell>
+                        </TableRow>
+                      ))}
+                    </TableBody>
+                  </Table>
+                </TableContainer>
+
+                {/* Pagination */}
+                {failedDocumentsTotalPages > 1 && (
+                  <Box display="flex" justifyContent="center" mt={2}>
+                    <Pagination
+                      count={failedDocumentsTotalPages}
+                      page={failedDocumentsPagination.page}
+                      onChange={(_, page) => setFailedDocumentsPagination(prev => ({ ...prev, page }))}
+                      color="primary"
+                    />
+                  </Box>
+                )}
+              </CardContent>
+            </Card>
+          )}
+
+          {/* Loading State */}
+          {failedDocumentsLoading && (
+            <Box display="flex" justifyContent="center" alignItems="center" minHeight="200px">
+              <CircularProgress />
+              <Typography sx={{ ml: 2 }}>Loading failed documents...</Typography>
+            </Box>
+          )}
+
+          {/* Empty State */}
+          {!failedDocumentsLoading && failedDocuments.length === 0 && (
+            <Alert severity="success">
+              <AlertTitle>No Failed Documents Found</AlertTitle>
+              <Typography>
+                No documents have failed processing with the current filters. This is good!
+              </Typography>
+            </Alert>
+          )}
+        </>
+      )}
+
       {/* Confirmation Dialog */}
       <Dialog
         open={confirmDeleteOpen}
@@ -151,6 +151,7 @@ const SourcesPage: React.FC = () => {
   const [testingConnection, setTestingConnection] = useState(false);
   const [syncingSource, setSyncingSource] = useState<string | null>(null);
   const [stoppingSync, setStoppingSync] = useState<string | null>(null);
+  const [autoRefreshing, setAutoRefreshing] = useState(false);

   useEffect(() => {
     loadSources();
@@ -159,6 +160,25 @@ const SourcesPage: React.FC = () => {
     }
   }, [user]);

+  // Auto-refresh sources when any source is syncing
+  useEffect(() => {
+    const activeSyncingSources = sources.filter(source => source.status === 'syncing');
+
+    if (activeSyncingSources.length > 0) {
+      setAutoRefreshing(true);
+      const interval = setInterval(() => {
+        loadSources();
+      }, 5000); // Poll every 5 seconds during active sync
+
+      return () => {
+        clearInterval(interval);
+        setAutoRefreshing(false);
+      };
+    } else {
+      setAutoRefreshing(false);
+    }
+  }, [sources]);
+
   // Update default folders when source type changes
   useEffect(() => {
     if (!editingSource) { // Only for new sources
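The auto-refresh effect above pairs a setInterval with its clearInterval cleanup inside a single useEffect. As a reading aid only, the same pattern factored into a reusable hook — a hypothetical sketch, not part of this commit; usePolling and its signature are invented here:

    // TypeScript (hypothetical): generalizes the poll-while-active pattern above.
    import { useEffect } from 'react';

    function usePolling(active: boolean, callback: () => void, intervalMs = 5000): void {
      useEffect(() => {
        if (!active) return;                           // nothing is syncing: no timer
        const id = setInterval(callback, intervalMs);  // poll while active
        return () => clearInterval(id);                // stop on deactivate/unmount
      }, [active, callback, intervalMs]);
    }

    // Usage sketch: usePolling(sources.some(s => s.status === 'syncing'), loadSources);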
@@ -979,8 +999,9 @@ const SourcesPage: React.FC = () => {
             <Button
               variant="outlined"
               size="large"
-              startIcon={<AutoFixHighIcon />}
+              startIcon={autoRefreshing ? <CircularProgress size={20} /> : <AutoFixHighIcon />}
               onClick={loadSources}
+              disabled={autoRefreshing}
               sx={{
                 borderRadius: 3,
                 px: 4,
@@ -993,7 +1014,7 @@ const SourcesPage: React.FC = () => {
                 transition: 'all 0.3s cubic-bezier(0.4, 0, 0.2, 1)',
               }}
             >
-              Refresh
+              {autoRefreshing ? 'Auto-refreshing...' : 'Refresh'}
             </Button>

             {/* OCR Controls for Admin Users */}
@@ -200,8 +200,8 @@ export const documentService = {
   },

   getFailedOcrDocuments: (limit = 50, offset = 0) => {
-    return api.get(`/documents/failed-ocr`, {
-      params: { limit, offset },
+    return api.get(`/documents/failed`, {
+      params: { stage: 'ocr', limit, offset },
     })
   },

@@ -253,6 +253,13 @@ export const documentService = {
       preview_only: previewOnly
     })
   },

+  getFailedDocuments: (limit = 25, offset = 0, stage?: string, reason?: string) => {
+    const params: any = { limit, offset };
+    if (stage) params.stage = stage;
+    if (reason) params.reason = reason;
+    return api.get('/documents/failed', { params })
+  },
 }

 export interface OcrStatusResponse {
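For callers outside this axios wrapper, a minimal self-contained sketch of the same request. The /api prefix and the bearer-token header are assumptions inferred from the route definition and its bearer_auth annotation elsewhere in this commit, not from this hunk:

    // TypeScript (sketch): GET /api/documents/failed with optional filters.
    async function listFailedDocuments(
      token: string,
      opts: { limit?: number; offset?: number; stage?: string; reason?: string } = {}
    ): Promise<unknown> {
      const params = new URLSearchParams({
        limit: String(opts.limit ?? 25),
        offset: String(opts.offset ?? 0),
      });
      if (opts.stage) params.set('stage', opts.stage);     // e.g. 'ocr'
      if (opts.reason) params.set('reason', opts.reason);  // e.g. 'duplicate_content'

      const res = await fetch(`/api/documents/failed?${params}`, {
        headers: { Authorization: `Bearer ${token}` },
      });
      if (!res.ok) throw new Error(`failed documents request returned ${res.status}`);
      return res.json();
    }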
@@ -0,0 +1,103 @@
+-- Add table to track documents that failed at any stage of processing
+-- This provides visibility into documents that failed during: ingestion, validation, OCR, etc.
+
+CREATE TABLE IF NOT EXISTS failed_documents (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    user_id UUID REFERENCES users(id) ON DELETE CASCADE,
+    filename TEXT NOT NULL,
+    original_filename TEXT,             -- Original name when uploaded (if available)
+    original_path TEXT,                 -- Path where file was located
+    file_path TEXT,                     -- Stored file path (if file was saved before failure)
+    file_size BIGINT,
+    file_hash VARCHAR(64),
+    mime_type TEXT,
+
+    -- Document content (if available before failure)
+    content TEXT,                       -- Raw content if extracted
+    tags TEXT[],                        -- Tags that were assigned/detected
+
+    -- OCR-related fields (for OCR stage failures)
+    ocr_text TEXT,                      -- Partial OCR text if extracted before failure
+    ocr_confidence REAL,                -- OCR confidence if calculated
+    ocr_word_count INTEGER,             -- Word count if calculated
+    ocr_processing_time_ms INTEGER,     -- Processing time before failure
+
+    -- Failure information
+    failure_reason TEXT NOT NULL,
+    failure_stage TEXT NOT NULL,        -- 'ingestion', 'validation', 'ocr', 'storage', etc.
+    existing_document_id UUID REFERENCES documents(id) ON DELETE SET NULL,
+    ingestion_source TEXT NOT NULL,     -- 'batch', 'sync', 'webdav', 'upload', etc.
+    error_message TEXT,                 -- Detailed error information
+
+    -- Retry information
+    retry_count INTEGER DEFAULT 0,
+    last_retry_at TIMESTAMPTZ,
+
+    -- Timestamps
+    created_at TIMESTAMPTZ DEFAULT NOW(),
+    updated_at TIMESTAMPTZ DEFAULT NOW(),
+
+    CONSTRAINT check_failure_reason CHECK (failure_reason IN (
+        'duplicate_content',
+        'duplicate_filename',
+        'unsupported_format',
+        'file_too_large',
+        'file_corrupted',
+        'access_denied',
+        'low_ocr_confidence',
+        'ocr_timeout',
+        'ocr_memory_limit',
+        'pdf_parsing_error',
+        'storage_quota_exceeded',
+        'network_error',
+        'permission_denied',
+        'virus_detected',
+        'invalid_structure',
+        'policy_violation',
+        'other'
+    )),
+
+    CONSTRAINT check_failure_stage CHECK (failure_stage IN (
+        'ingestion',
+        'validation',
+        'ocr',
+        'storage',
+        'processing',
+        'sync'
+    ))
+);
+
+-- Indexes for efficient querying
+CREATE INDEX IF NOT EXISTS idx_failed_documents_user_id ON failed_documents(user_id);
+CREATE INDEX IF NOT EXISTS idx_failed_documents_created_at ON failed_documents(created_at DESC);
+CREATE INDEX IF NOT EXISTS idx_failed_documents_failure_reason ON failed_documents(failure_reason);
+CREATE INDEX IF NOT EXISTS idx_failed_documents_failure_stage ON failed_documents(failure_stage);
+CREATE INDEX IF NOT EXISTS idx_failed_documents_ingestion_source ON failed_documents(ingestion_source);
+CREATE INDEX IF NOT EXISTS idx_failed_documents_file_hash ON failed_documents(file_hash) WHERE file_hash IS NOT NULL;
+
+-- Add comments for documentation
+COMMENT ON TABLE failed_documents IS 'Tracks documents that failed at any stage of processing (ingestion, validation, OCR, etc.)';
+COMMENT ON COLUMN failed_documents.failure_reason IS 'Specific reason why the document failed';
+COMMENT ON COLUMN failed_documents.failure_stage IS 'Stage at which the document failed (ingestion, validation, ocr, etc.)';
+COMMENT ON COLUMN failed_documents.existing_document_id IS 'Reference to existing document if failed due to duplicate content';
+COMMENT ON COLUMN failed_documents.ingestion_source IS 'Source of the ingestion attempt (batch, sync, webdav, upload, etc.)';
+COMMENT ON COLUMN failed_documents.error_message IS 'Detailed error message for troubleshooting';
+
+-- Create a view for failed documents summary by reason and stage
+CREATE OR REPLACE VIEW failed_documents_summary AS
+SELECT
+    failure_reason,
+    failure_stage,
+    ingestion_source,
+    COUNT(*) as document_count,
+    SUM(file_size) as total_size,
+    AVG(file_size) as avg_size,
+    MIN(created_at) as first_occurrence,
+    MAX(created_at) as last_occurrence
+FROM failed_documents
+GROUP BY failure_reason, failure_stage, ingestion_source
+ORDER BY document_count DESC;
+
+-- Grant appropriate permissions
+-- GRANT SELECT, INSERT ON failed_documents TO readur_user;
+-- GRANT SELECT ON failed_documents_summary TO readur_user;
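The failed_documents_summary view collapses the table to one row per (reason, stage, source) triple. Its row shape, hand-derived from the CREATE VIEW above and written as a TypeScript interface purely as a reading aid (the commit exposes no API over this view):

    // Row shape of failed_documents_summary (derived by hand from the view).
    interface FailedDocumentsSummaryRow {
      failure_reason: string;
      failure_stage: string;
      ingestion_source: string;
      document_count: number;      // COUNT(*)
      total_size: number | null;   // SUM(file_size); NULL when every file_size is NULL
      avg_size: number | null;     // AVG(file_size)
      first_occurrence: string;    // MIN(created_at), ISO-8601 timestamp
      last_occurrence: string;     // MAX(created_at)
    }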
@@ -0,0 +1,123 @@
+-- Migration to move existing failed OCR documents from documents table to failed_documents table
+-- This consolidates all failure tracking into a single table
+
+-- First, ensure the failed_documents table exists
+-- (This migration depends on 20250628000003_add_failed_documents_table.sql)
+
+-- Move failed OCR documents to failed_documents table
+INSERT INTO failed_documents (
+    user_id,
+    filename,
+    original_filename,
+    file_path,
+    file_size,
+    file_hash,
+    mime_type,
+    content,
+    tags,
+    ocr_text,
+    ocr_confidence,
+    ocr_word_count,
+    ocr_processing_time_ms,
+    failure_reason,
+    failure_stage,
+    ingestion_source,
+    error_message,
+    retry_count,
+    created_at,
+    updated_at
+)
+SELECT
+    d.user_id,
+    d.filename,
+    d.original_filename,
+    d.file_path,
+    d.file_size,
+    d.file_hash,
+    d.mime_type,
+    d.content,
+    d.tags,
+    d.ocr_text,
+    d.ocr_confidence,
+    d.ocr_word_count,
+    d.ocr_processing_time_ms,
+    COALESCE(d.ocr_failure_reason, 'other') as failure_reason,
+    'ocr' as failure_stage,
+    'migration' as ingestion_source, -- Mark these as migrated from existing system
+    d.ocr_error as error_message,
+    COALESCE(q.retry_count, 0) as retry_count,
+    d.created_at,
+    d.updated_at
+FROM documents d
+LEFT JOIN (
+    SELECT document_id, COUNT(*) as retry_count
+    FROM ocr_queue
+    WHERE status IN ('failed', 'completed')
+    GROUP BY document_id
+) q ON d.id = q.document_id
+WHERE d.ocr_status = 'failed';
+
+-- Log the migration for audit purposes
+INSERT INTO failed_documents (
+    user_id,
+    filename,
+    original_filename,
+    failure_reason,
+    failure_stage,
+    ingestion_source,
+    error_message,
+    created_at,
+    updated_at
+) VALUES (
+    '00000000-0000-0000-0000-000000000000'::uuid, -- System user ID
+    'migration_log',
+    'Failed OCR Migration Log',
+    'migration_completed',
+    'migration',
+    'system',
+    'Migrated ' || (SELECT COUNT(*) FROM documents WHERE ocr_status = 'failed') || ' failed OCR documents to failed_documents table',
+    NOW(),
+    NOW()
+);
+
+-- Remove failed OCR documents from documents table
+-- Note: This uses CASCADE to also clean up related records in ocr_queue table
+DELETE FROM documents WHERE ocr_status = 'failed';
+
+-- Update statistics and constraints
+ANALYZE documents;
+ANALYZE failed_documents;
+
+-- Add comment documenting the migration
+COMMENT ON TABLE failed_documents IS 'Tracks all documents that failed at any stage of processing. Consolidated from documents table (OCR failures) and new ingestion failures as of migration 20250628000004.';
+
+-- Create indexes for efficient querying of migrated data
+CREATE INDEX IF NOT EXISTS idx_failed_documents_failure_stage_reason ON failed_documents(failure_stage, failure_reason);
+CREATE INDEX IF NOT EXISTS idx_failed_documents_ocr_confidence ON failed_documents(ocr_confidence) WHERE ocr_confidence IS NOT NULL;
+
+-- Optional: Create a view for backward compatibility during transition
+CREATE OR REPLACE VIEW legacy_failed_ocr_documents AS
+SELECT
+    id,
+    user_id,
+    filename,
+    original_filename,
+    file_path,
+    file_size,
+    mime_type,
+    tags,
+    ocr_text,
+    ocr_confidence,
+    ocr_word_count,
+    ocr_processing_time_ms,
+    failure_reason as ocr_failure_reason,
+    error_message as ocr_error,
+    'failed' as ocr_status,
+    retry_count,
+    created_at,
+    updated_at
+FROM failed_documents
+WHERE failure_stage = 'ocr';
+
+-- Grant appropriate permissions
+-- GRANT SELECT ON legacy_failed_ocr_documents TO readur_user;
src/models.rs (143 changed lines)
@@ -135,6 +135,149 @@ pub struct Document {
     pub file_hash: Option<String>,
 }

+#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, ToSchema)]
+pub enum FailureReason {
+    #[serde(rename = "duplicate_content")]
+    DuplicateContent,
+    #[serde(rename = "duplicate_filename")]
+    DuplicateFilename,
+    #[serde(rename = "unsupported_format")]
+    UnsupportedFormat,
+    #[serde(rename = "file_too_large")]
+    FileTooLarge,
+    #[serde(rename = "file_corrupted")]
+    FileCorrupted,
+    #[serde(rename = "access_denied")]
+    AccessDenied,
+    #[serde(rename = "low_ocr_confidence")]
+    LowOcrConfidence,
+    #[serde(rename = "ocr_timeout")]
+    OcrTimeout,
+    #[serde(rename = "ocr_memory_limit")]
+    OcrMemoryLimit,
+    #[serde(rename = "pdf_parsing_error")]
+    PdfParsingError,
+    #[serde(rename = "storage_quota_exceeded")]
+    StorageQuotaExceeded,
+    #[serde(rename = "network_error")]
+    NetworkError,
+    #[serde(rename = "permission_denied")]
+    PermissionDenied,
+    #[serde(rename = "virus_detected")]
+    VirusDetected,
+    #[serde(rename = "invalid_structure")]
+    InvalidStructure,
+    #[serde(rename = "policy_violation")]
+    PolicyViolation,
+    #[serde(rename = "other")]
+    Other,
+}
+
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, ToSchema)]
+pub enum FailureStage {
+    #[serde(rename = "ingestion")]
+    Ingestion,
+    #[serde(rename = "validation")]
+    Validation,
+    #[serde(rename = "ocr")]
+    Ocr,
+    #[serde(rename = "storage")]
+    Storage,
+    #[serde(rename = "processing")]
+    Processing,
+    #[serde(rename = "sync")]
+    Sync,
+}
+
+impl std::fmt::Display for FailureReason {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            FailureReason::DuplicateContent => write!(f, "duplicate_content"),
+            FailureReason::DuplicateFilename => write!(f, "duplicate_filename"),
+            FailureReason::UnsupportedFormat => write!(f, "unsupported_format"),
+            FailureReason::FileTooLarge => write!(f, "file_too_large"),
+            FailureReason::FileCorrupted => write!(f, "file_corrupted"),
+            FailureReason::AccessDenied => write!(f, "access_denied"),
+            FailureReason::LowOcrConfidence => write!(f, "low_ocr_confidence"),
+            FailureReason::OcrTimeout => write!(f, "ocr_timeout"),
+            FailureReason::OcrMemoryLimit => write!(f, "ocr_memory_limit"),
+            FailureReason::PdfParsingError => write!(f, "pdf_parsing_error"),
+            FailureReason::StorageQuotaExceeded => write!(f, "storage_quota_exceeded"),
+            FailureReason::NetworkError => write!(f, "network_error"),
+            FailureReason::PermissionDenied => write!(f, "permission_denied"),
+            FailureReason::VirusDetected => write!(f, "virus_detected"),
+            FailureReason::InvalidStructure => write!(f, "invalid_structure"),
+            FailureReason::PolicyViolation => write!(f, "policy_violation"),
+            FailureReason::Other => write!(f, "other"),
+        }
+    }
+}
+
+impl std::fmt::Display for FailureStage {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            FailureStage::Ingestion => write!(f, "ingestion"),
+            FailureStage::Validation => write!(f, "validation"),
+            FailureStage::Ocr => write!(f, "ocr"),
+            FailureStage::Storage => write!(f, "storage"),
+            FailureStage::Processing => write!(f, "processing"),
+            FailureStage::Sync => write!(f, "sync"),
+        }
+    }
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, FromRow, ToSchema)]
+pub struct FailedDocument {
+    /// Unique identifier for the failed document record
+    pub id: Uuid,
+    /// User who attempted to ingest the document
+    pub user_id: Uuid,
+    /// Filename of the failed document
+    pub filename: String,
+    /// Original filename when uploaded
+    pub original_filename: Option<String>,
+    /// Original path where the file was located
+    pub original_path: Option<String>,
+    /// Stored file path (if file was saved before failure)
+    pub file_path: Option<String>,
+    /// Size of the file in bytes
+    pub file_size: Option<i64>,
+    /// SHA256 hash of the file content
+    pub file_hash: Option<String>,
+    /// MIME type of the file
+    pub mime_type: Option<String>,
+    /// Raw content if extracted before failure
+    pub content: Option<String>,
+    /// Tags that were assigned/detected
+    pub tags: Vec<String>,
+    /// Partial OCR text if extracted before failure
+    pub ocr_text: Option<String>,
+    /// OCR confidence if calculated
+    pub ocr_confidence: Option<f32>,
+    /// Word count if calculated
+    pub ocr_word_count: Option<i32>,
+    /// Processing time before failure in milliseconds
+    pub ocr_processing_time_ms: Option<i32>,
+    /// Reason why the document failed
+    pub failure_reason: String,
+    /// Stage at which the document failed
+    pub failure_stage: String,
+    /// Reference to existing document if failed due to duplicate
+    pub existing_document_id: Option<Uuid>,
+    /// Source of the ingestion attempt
+    pub ingestion_source: String,
+    /// Detailed error message
+    pub error_message: Option<String>,
+    /// Number of retry attempts
+    pub retry_count: Option<i32>,
+    /// Last retry timestamp
+    pub last_retry_at: Option<DateTime<Utc>>,
+    /// When the document failed
+    pub created_at: DateTime<Utc>,
+    /// Last update timestamp
+    pub updated_at: DateTime<Utc>,
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize, ToSchema)]
 pub struct DocumentResponse {
     /// Unique identifier for the document
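The serde renames above fix the wire strings for both enums. Transcribed by hand into TypeScript unions as an illustrative sketch only — the commit does not generate client types from these enums:

    // Hand-transcribed from the serde(rename) attributes in src/models.rs.
    type FailureStage =
      | 'ingestion' | 'validation' | 'ocr' | 'storage' | 'processing' | 'sync';

    type FailureReason =
      | 'duplicate_content' | 'duplicate_filename' | 'unsupported_format'
      | 'file_too_large' | 'file_corrupted' | 'access_denied'
      | 'low_ocr_confidence' | 'ocr_timeout' | 'ocr_memory_limit'
      | 'pdf_parsing_error' | 'storage_quota_exceeded' | 'network_error'
      | 'permission_denied' | 'virus_detected' | 'invalid_structure'
      | 'policy_violation' | 'other';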
@@ -322,11 +322,12 @@ impl OcrQueueService {
                     warn!("⚠️ OCR quality issues for '{}' | Job: {} | Document: {} | {:.1}% confidence | {} words",
                         filename, item.id, item.document_id, ocr_result.confidence, ocr_result.word_count);

-                    // Mark as failed for quality issues
+                    // Mark as failed for quality issues with proper failure reason
                     sqlx::query(
                         r#"
                         UPDATE documents
                         SET ocr_status = 'failed',
+                            ocr_failure_reason = 'low_ocr_confidence',
                             ocr_error = $2,
                             updated_at = NOW()
                         WHERE id = $1
@@ -26,6 +26,14 @@ struct PaginationQuery {
     ocr_status: Option<String>,
 }

+#[derive(Deserialize, ToSchema)]
+struct FailedDocumentsQuery {
+    limit: Option<i64>,
+    offset: Option<i64>,
+    stage: Option<String>,  // 'ocr', 'ingestion', 'validation', etc.
+    reason: Option<String>, // 'duplicate_content', 'low_ocr_confidence', etc.
+}
+
 #[derive(Deserialize, Serialize, ToSchema)]
 pub struct BulkDeleteRequest {
     pub document_ids: Vec<uuid::Uuid>,
@@ -50,8 +58,8 @@ pub fn router() -> Router<Arc<AppState>> {
         .route("/{id}/ocr", get(get_document_ocr))
         .route("/{id}/processed-image", get(get_processed_image))
         .route("/{id}/retry-ocr", post(retry_ocr))
-        .route("/failed-ocr", get(get_failed_ocr_documents))
         .route("/duplicates", get(get_user_duplicates))
+        .route("/failed", get(get_failed_documents))
         .route("/delete-low-confidence", post(delete_low_confidence_documents))
         .route("/delete-failed-ocr", post(delete_failed_ocr_documents))
 }
@@ -757,6 +765,202 @@ async fn get_failed_ocr_documents(
     Ok(Json(response))
 }

+#[utoipa::path(
+    get,
+    path = "/api/documents/failed",
+    tag = "documents",
+    security(
+        ("bearer_auth" = [])
+    ),
+    params(
+        ("limit" = Option<i64>, Query, description = "Number of documents to return"),
+        ("offset" = Option<i64>, Query, description = "Number of documents to skip"),
+        ("stage" = Option<String>, Query, description = "Filter by failure stage (ocr, ingestion, validation, etc.)"),
+        ("reason" = Option<String>, Query, description = "Filter by failure reason")
+    ),
+    responses(
+        (status = 200, description = "List of failed documents", body = String),
+        (status = 401, description = "Unauthorized")
+    )
+)]
+async fn get_failed_documents(
+    State(state): State<Arc<AppState>>,
+    auth_user: AuthUser,
+    Query(params): Query<FailedDocumentsQuery>,
+) -> Result<Json<serde_json::Value>, StatusCode> {
+    let limit = params.limit.unwrap_or(25);
+    let offset = params.offset.unwrap_or(0);
+
+    // Query the unified failed_documents table
+    let mut query_builder = sqlx::QueryBuilder::new(
+        r#"
+        SELECT id, filename, original_filename, file_path, file_size, mime_type,
+               content, tags, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms,
+               failure_reason, failure_stage, error_message, existing_document_id,
+               ingestion_source, retry_count, last_retry_at, created_at, updated_at
+        FROM failed_documents
+        WHERE ($1::uuid IS NULL OR user_id = $1)
+        "#
+    );
+
+    let mut bind_count = 1;
+
+    // Add stage filter if specified
+    if let Some(stage) = &params.stage {
+        bind_count += 1;
+        query_builder.push(&format!(" AND failure_stage = ${}", bind_count));
+    }
+
+    // Add reason filter if specified
+    if let Some(reason) = &params.reason {
+        bind_count += 1;
+        query_builder.push(&format!(" AND failure_reason = ${}", bind_count));
+    }
+
+    query_builder.push(" ORDER BY created_at DESC");
+    query_builder.push(&format!(" LIMIT ${} OFFSET ${}", bind_count + 1, bind_count + 2));
+
+    let mut query = query_builder.build();
+
+    // Bind parameters in order
+    query = query.bind(if auth_user.user.role == crate::models::UserRole::Admin {
+        None
+    } else {
+        Some(auth_user.user.id)
+    });
+
+    if let Some(stage) = &params.stage {
+        query = query.bind(stage);
+    }
+
+    if let Some(reason) = &params.reason {
+        query = query.bind(reason);
+    }
+
+    query = query.bind(limit).bind(offset);
+
+    let failed_docs = query
+        .fetch_all(state.db.get_pool())
+        .await
+        .map_err(|e| {
+            tracing::error!("Failed to fetch failed documents: {}", e);
+            StatusCode::INTERNAL_SERVER_ERROR
+        })?;
+
+    // Count total for pagination
+    let mut count_query_builder = sqlx::QueryBuilder::new(
+        "SELECT COUNT(*) FROM failed_documents WHERE ($1::uuid IS NULL OR user_id = $1)"
+    );
+
+    let mut count_bind_count = 1;
+
+    if let Some(stage) = &params.stage {
+        count_bind_count += 1;
+        count_query_builder.push(&format!(" AND failure_stage = ${}", count_bind_count));
+    }
+
+    if let Some(reason) = &params.reason {
+        count_bind_count += 1;
+        count_query_builder.push(&format!(" AND failure_reason = ${}", count_bind_count));
+    }
+
+    let mut count_query = count_query_builder.build_query_scalar::<i64>();
+
+    count_query = count_query.bind(if auth_user.user.role == crate::models::UserRole::Admin {
+        None
+    } else {
+        Some(auth_user.user.id)
+    });
+
+    if let Some(stage) = &params.stage {
+        count_query = count_query.bind(stage);
+    }
+
+    if let Some(reason) = &params.reason {
+        count_query = count_query.bind(reason);
+    }
+
+    let total_count = count_query
+        .fetch_one(state.db.get_pool())
+        .await
+        .unwrap_or(0);
+
+    // Convert to JSON response format
+    let documents: Vec<serde_json::Value> = failed_docs.iter().map(|row| {
+        serde_json::json!({
+            "id": row.get::<uuid::Uuid, _>("id"),
+            "filename": row.get::<String, _>("filename"),
+            "original_filename": row.get::<Option<String>, _>("original_filename"),
+            "file_path": row.get::<Option<String>, _>("file_path"),
+            "file_size": row.get::<Option<i64>, _>("file_size"),
+            "mime_type": row.get::<Option<String>, _>("mime_type"),
+            "content": row.get::<Option<String>, _>("content"),
+            "tags": row.get::<Vec<String>, _>("tags"),
+            "ocr_text": row.get::<Option<String>, _>("ocr_text"),
+            "ocr_confidence": row.get::<Option<f32>, _>("ocr_confidence"),
+            "ocr_word_count": row.get::<Option<i32>, _>("ocr_word_count"),
+            "ocr_processing_time_ms": row.get::<Option<i32>, _>("ocr_processing_time_ms"),
+            "failure_reason": row.get::<String, _>("failure_reason"),
+            "failure_stage": row.get::<String, _>("failure_stage"),
+            "error_message": row.get::<Option<String>, _>("error_message"),
+            "existing_document_id": row.get::<Option<uuid::Uuid>, _>("existing_document_id"),
+            "ingestion_source": row.get::<String, _>("ingestion_source"),
+            "retry_count": row.get::<Option<i32>, _>("retry_count"),
+            "last_retry_at": row.get::<Option<chrono::DateTime<chrono::Utc>>, _>("last_retry_at"),
+            "created_at": row.get::<chrono::DateTime<chrono::Utc>, _>("created_at"),
+            "updated_at": row.get::<chrono::DateTime<chrono::Utc>, _>("updated_at"),
+
+            // Computed fields for backward compatibility
+            "failure_category": categorize_failure_reason(
+                Some(&row.get::<String, _>("failure_reason")),
+                row.get::<Option<String>, _>("error_message").as_deref()
+            ),
+            "source": match row.get::<String, _>("failure_stage").as_str() {
+                "ocr" => "OCR Processing",
+                "ingestion" => "Document Ingestion",
+                "validation" => "Document Validation",
+                "storage" => "File Storage",
+                "processing" => "Document Processing",
+                "sync" => "Source Synchronization",
+                _ => "Unknown"
+            }
+        })
+    }).collect();
+
+    // Calculate statistics for the response
+    let mut stage_stats = std::collections::HashMap::new();
+    let mut reason_stats = std::collections::HashMap::new();
+
+    for doc in &documents {
+        let stage = doc["failure_stage"].as_str().unwrap_or("unknown");
+        let reason = doc["failure_reason"].as_str().unwrap_or("unknown");
+
+        *stage_stats.entry(stage).or_insert(0) += 1;
+        *reason_stats.entry(reason).or_insert(0) += 1;
+    }
+
+    let response = serde_json::json!({
+        "documents": documents,
+        "pagination": {
+            "limit": limit,
+            "offset": offset,
+            "total": total_count,
+            "total_pages": (total_count as f64 / limit as f64).ceil() as i64
+        },
+        "statistics": {
+            "total_failed": total_count,
+            "by_stage": stage_stats,
+            "by_reason": reason_stats
+        },
+        "filters": {
+            "stage": params.stage,
+            "reason": params.reason
+        }
+    });
+
+    Ok(Json(response))
+}
+
 async fn calculate_estimated_wait_time(priority: i32) -> i64 {
     // Simple estimation based on priority - in a real implementation,
     // this would check actual queue depth and processing times
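Because get_failed_documents assembles its payload with serde_json::json!, the response schema lives only in that literal (the utoipa annotation declares body = String). An approximate TypeScript rendering of the shape, transcribed by hand from the literal above — field optionality is inferred from the nullable columns and is an assumption, not generated from the server:

    // Approximate response of GET /api/documents/failed (hand-derived sketch).
    interface FailedDocumentEntry {
      id: string;
      filename: string;
      original_filename: string | null;
      file_path: string | null;
      file_size: number | null;
      mime_type: string | null;
      failure_reason: string;
      failure_stage: string;
      error_message: string | null;
      existing_document_id: string | null;
      ingestion_source: string;
      retry_count: number | null;
      last_retry_at: string | null;
      created_at: string;
      updated_at: string;
      failure_category: string;  // computed for backward compatibility
      source: string;            // human-readable stage name
    }

    interface FailedDocumentsResponse {
      documents: FailedDocumentEntry[];
      pagination: { limit: number; offset: number; total: number; total_pages: number };
      statistics: {
        total_failed: number;
        by_stage: Record<string, number>;
        by_reason: Record<string, number>;
      };
      filters: { stage: string | null; reason: string | null };
    }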
@@ -775,6 +979,7 @@ fn categorize_failure_reason(failure_reason: Option<&str>, error_message: Option
         Some("processing_timeout") => "Timeout",
         Some("memory_limit") => "Memory Limit",
         Some("pdf_parsing_panic") => "PDF Parsing Error",
+        Some("low_ocr_confidence") => "Low OCR Confidence",
         Some("unknown") | None => {
             // Try to categorize based on error message
             if let Some(error) = error_message {
@@ -787,6 +992,8 @@ fn categorize_failure_reason(failure_reason: Option<&str>, error_message: Option
                     "PDF Font Issues"
                 } else if error_lower.contains("corrupt") {
                     "PDF Corruption"
+                } else if error_lower.contains("quality below threshold") || error_lower.contains("confidence") {
+                    "Low OCR Confidence"
                 } else {
                     "Unknown Error"
                 }
@@ -1066,12 +1273,27 @@ pub async fn delete_low_confidence_documents(
     let matched_count = matched_documents.len();

     if is_preview {
+        // Convert documents to response format with key details
+        let document_details: Vec<serde_json::Value> = matched_documents.iter().map(|d| {
+            serde_json::json!({
+                "id": d.id,
+                "filename": d.filename,
+                "original_filename": d.original_filename,
+                "file_size": d.file_size,
+                "ocr_confidence": d.ocr_confidence,
+                "ocr_status": d.ocr_status,
+                "created_at": d.created_at,
+                "mime_type": d.mime_type
+            })
+        }).collect();
+
         return Ok(Json(serde_json::json!({
             "success": true,
             "message": format!("Found {} documents with OCR confidence below {}%", matched_count, request.max_confidence),
             "matched_count": matched_count,
             "preview": true,
-            "document_ids": matched_documents.iter().map(|d| d.id).collect::<Vec<_>>()
+            "document_ids": matched_documents.iter().map(|d| d.id).collect::<Vec<_>>(),
+            "documents": document_details
         })));
     }