feat(client/server): advanced search, along with fixing build errors

This commit is contained in:
perf3ct 2025-06-17 02:56:59 +00:00
parent 76529f83be
commit 3ae542088b
No known key found for this signature in database
GPG Key ID: 569C4EEC436F5232
7 changed files with 916 additions and 5 deletions

View File

@ -14,6 +14,7 @@ import DocumentDetailsPage from './pages/DocumentDetailsPage';
import SettingsPage from './pages/SettingsPage';
import SourcesPage from './pages/SourcesPage';
import WatchFolderPage from './pages/WatchFolderPage';
import FailedOcrPage from './pages/FailedOcrPage';
function App(): JSX.Element {
const { user, loading } = useAuth();
@ -69,6 +70,7 @@ function App(): JSX.Element {
<Route path="/sources" element={<SourcesPage />} />
<Route path="/watch" element={<WatchFolderPage />} />
<Route path="/settings" element={<SettingsPage />} />
<Route path="/failed-ocr" element={<FailedOcrPage />} />
<Route path="/profile" element={<div>Profile Page - Coming Soon</div>} />
</Routes>
</AppLayout>

View File

@ -347,8 +347,8 @@ const AdvancedSearchPanel: React.FC<AdvancedSearchPanelProps> = ({
label="Boost Recent Documents"
/>
</Box>
</Grid>
</Grid>
</Box>
</Box>
)}
{/* Results Display Section */}

View File

@ -181,6 +181,7 @@ const EnhancedSearchGuide: React.FC<EnhancedSearchGuideProps> = ({ onExampleClic
sx={{
mb: 1.5,
transition: 'all 0.2s',
backgroundColor: (theme) => theme.palette.mode === 'dark' ? 'grey.800' : 'background.paper',
'&:hover': {
boxShadow: 2,
transform: 'translateY(-2px)',
@ -200,7 +201,7 @@ const EnhancedSearchGuide: React.FC<EnhancedSearchGuideProps> = ({ onExampleClic
variant="body2"
fontFamily="monospace"
sx={{
backgroundColor: 'grey.100',
backgroundColor: (theme) => theme.palette.mode === 'dark' ? 'grey.800' : 'grey.100',
px: 1,
py: 0.5,
borderRadius: 1,
@ -273,7 +274,11 @@ const EnhancedSearchGuide: React.FC<EnhancedSearchGuideProps> = ({ onExampleClic
if (compact && !expanded) {
return (
<Paper variant="outlined" sx={{ p: 2, mb: 2 }}>
<Paper variant="outlined" sx={{
p: 2,
mb: 2,
backgroundColor: (theme) => theme.palette.mode === 'dark' ? 'grey.900' : 'background.paper'
}}>
<Box display="flex" alignItems="center" justifyContent="space-between">
<Box display="flex" alignItems="center" gap={1}>
<TipIcon color="primary" />
@ -294,7 +299,7 @@ const EnhancedSearchGuide: React.FC<EnhancedSearchGuideProps> = ({ onExampleClic
}
return (
<Paper elevation={0} sx={{ p: 3, mb: 3, backgroundColor: 'grey.50' }}>
<Paper elevation={0} sx={{ p: 3, mb: 3, backgroundColor: (theme) => theme.palette.mode === 'dark' ? 'grey.900' : 'grey.50' }}>
<Box display="flex" alignItems="center" justifyContent="space-between" mb={2}>
<Typography variant="h6" display="flex" alignItems="center" gap={1}>
<TipIcon color="primary" />

View File

@ -32,6 +32,7 @@ import {
Logout as LogoutIcon,
Description as DocumentIcon,
Storage as StorageIcon,
Error as ErrorIcon,
} from '@mui/icons-material';
import { useNavigate, useLocation } from 'react-router-dom';
import { useAuth } from '../../contexts/AuthContext';
@ -64,6 +65,7 @@ const navigationItems: NavigationItem[] = [
{ text: 'Search', icon: SearchIcon, path: '/search' },
{ text: 'Sources', icon: StorageIcon, path: '/sources' },
{ text: 'Watch Folder', icon: FolderIcon, path: '/watch' },
{ text: 'Failed OCR', icon: ErrorIcon, path: '/failed-ocr' },
];
const AppLayout: React.FC<AppLayoutProps> = ({ children }) => {

View File

@ -0,0 +1,576 @@
import React, { useState, useEffect } from 'react';
import {
Box,
Typography,
Card,
CardContent,
Grid,
Button,
Chip,
Alert,
AlertTitle,
Table,
TableBody,
TableCell,
TableContainer,
TableHead,
TableRow,
Paper,
Dialog,
DialogTitle,
DialogContent,
DialogActions,
Pagination,
CircularProgress,
Tooltip,
IconButton,
Collapse,
LinearProgress,
Snackbar,
} from '@mui/material';
import {
Refresh as RefreshIcon,
Error as ErrorIcon,
Info as InfoIcon,
ExpandMore as ExpandMoreIcon,
ExpandLess as ExpandLessIcon,
Schedule as ScheduleIcon,
Visibility as VisibilityIcon,
Download as DownloadIcon,
} from '@mui/icons-material';
import { format } from 'date-fns';
import { api, documentService } from '../services/api';
/**
 * One row of the failed-OCR listing returned by `GET /documents/failed-ocr`.
 *
 * Fields the server builds from optional (`Option<...>`) values are serialized
 * as JSON `null` when absent, so they are typed as nullable here instead of
 * pretending to always be strings.
 */
interface FailedDocument {
  id: string;
  filename: string;
  original_filename: string;
  /** File size in bytes. */
  file_size: number;
  mime_type: string;
  /** Timestamp string as serialized by the server. */
  created_at: string;
  /** Timestamp string as serialized by the server. */
  updated_at: string;
  tags: string[];
  ocr_status: string | null;
  /** Raw error text from the OCR attempt; `null` when none was recorded. */
  ocr_error: string | null;
  /** Machine-readable failure code (e.g. `pdf_corruption`); `null` when none was recorded. */
  ocr_failure_reason: string | null;
  ocr_completed_at?: string | null;
  /** Number of finished (failed or completed) OCR queue entries for this document. */
  retry_count: number;
  /** Creation time of the most recent finished queue entry; `null` when there is none. */
  last_attempt_at?: string | null;
  can_retry: boolean;
  /** Human-readable category derived by the server from the reason/error. */
  failure_category: string;
}
/**
 * One entry of `statistics.failure_categories` in the failed-OCR response:
 * how many failed documents share a given failure reason.
 */
interface FailureCategory {
// Raw `ocr_failure_reason` value; the server substitutes "unknown" when the column is NULL.
reason: string;
// Human-readable label the server derives from `reason`.
display_name: string;
// Number of failed documents with this reason.
count: number;
}
/**
 * Shape of the JSON body returned by `GET /documents/failed-ocr`.
 */
interface FailedOcrResponse {
// The requested page of failed documents.
documents: FailedDocument[];
pagination: {
// Total number of failed documents matching the query (not just this page).
total: number;
// Page size and starting offset that the server applied.
limit: number;
offset: number;
// True when `offset + limit` is still below `total`.
has_more: boolean;
};
statistics: {
// The server currently sends the same value as `pagination.total`.
total_failed: number;
// Failed-document counts grouped by failure reason.
failure_categories: FailureCategory[];
};
}
/**
 * Shape of the JSON body returned by `POST /documents/{id}/retry-ocr`.
 *
 * NOTE(review): the server also sends `document_id` and `priority` on success
 * and `current_status` on an ineligible document; those fields are not
 * modelled here.
 */
interface RetryResponse {
// False when the document was not eligible for a retry (still an HTTP 200).
success: boolean;
message: string;
// Only present on success. NOTE(review): assumed to serialize as a string — confirm against the queue service.
queue_id?: string;
// Only present on success; a rough estimate derived from the queue priority.
estimated_wait_minutes?: number;
}
const FailedOcrPage: React.FC = () => {
// Current page of failed documents, as last returned by the server.
const [documents, setDocuments] = useState<FailedDocument[]>([]);
// True while a list fetch is in flight; starts true so the first render shows the spinner.
const [loading, setLoading] = useState(true);
// Id of the document whose retry request is in flight, or null when none is.
const [retrying, setRetrying] = useState<string | null>(null);
// Failure statistics from the last successful fetch; null until the first one completes.
const [statistics, setStatistics] = useState<FailedOcrResponse['statistics'] | null>(null);
// 1-based page number and page size used to compute the request offset.
const [pagination, setPagination] = useState({ page: 1, limit: 25 });
// Page count derived from the server-reported total and the page size.
const [totalPages, setTotalPages] = useState(0);
// Document shown in the details dialog, and whether that dialog is open.
const [selectedDocument, setSelectedDocument] = useState<FailedDocument | null>(null);
const [detailsOpen, setDetailsOpen] = useState(false);
// Ids of table rows whose inline "Error Details" section is expanded.
const [expandedRows, setExpandedRows] = useState<Set<string>>(new Set());
// Transient success/error notification shown at the bottom of the page.
const [snackbar, setSnackbar] = useState<{ open: boolean; message: string; severity: 'success' | 'error' }>({
open: false,
message: '',
severity: 'success'
});
/**
 * Loads the current page of failed-OCR documents plus the failure statistics.
 *
 * If the requested page no longer exists (e.g. the last row of the last page
 * was just retried), the page number is moved back to the last valid page
 * instead of rendering an empty list, which would otherwise display the
 * misleading "no documents have failed" message. Changing the page re-runs
 * the effect below, which fetches again.
 */
const fetchFailedDocuments = async () => {
  try {
    setLoading(true);
    const offset = (pagination.page - 1) * pagination.limit;
    const response = await documentService.getFailedOcrDocuments(pagination.limit, offset);
    const { documents: fetched, statistics: stats, pagination: pageInfo } = response.data;
    const pageCount = Math.ceil(pageInfo.total / pagination.limit);

    setStatistics(stats);
    setTotalPages(pageCount);

    if (fetched.length === 0 && pageCount > 0 && pagination.page > pageCount) {
      // We are past the end: step back and let the effect refetch.
      setPagination(prev => ({ ...prev, page: pageCount }));
      return;
    }

    setDocuments(fetched);
  } catch (error) {
    console.error('Failed to fetch failed OCR documents:', error);
    setSnackbar({
      open: true,
      message: 'Failed to load failed OCR documents',
      severity: 'error'
    });
  } finally {
    setLoading(false);
  }
};

// Refetch whenever the page changes. The page size is never changed after
// mount, so `pagination.page` is the only input that matters here.
useEffect(() => {
  fetchFailedDocuments();
}, [pagination.page]);
/**
 * Asks the server to re-queue OCR for one document and reports the outcome
 * in the snackbar. On success the list is reloaded so retry counts and
 * statuses are current. The `retrying` marker is always cleared afterwards.
 */
const handleRetryOcr = async (doc: FailedDocument) => {
  const notify = (message: string, severity: 'success' | 'error') =>
    setSnackbar({ open: true, message, severity });

  try {
    setRetrying(doc.id);
    const { data } = await documentService.retryOcr(doc.id);

    if (!data.success) {
      // The server answered, but refused the retry.
      notify(data.message || 'Failed to retry OCR', 'error');
      return;
    }

    const wait = data.estimated_wait_minutes || 'Unknown';
    notify(
      `OCR retry queued for "${doc.filename}". Estimated wait time: ${wait} minutes.`,
      'success'
    );
    // Refresh the list to update retry counts and status
    await fetchFailedDocuments();
  } catch (error) {
    console.error('Failed to retry OCR:', error);
    notify('Failed to retry OCR processing', 'error');
  } finally {
    setRetrying(null);
  }
};
/**
 * Formats a byte count as a short human-readable string using 1024-based
 * units, e.g. `1536` -> `"1.5 KB"`.
 *
 * Non-positive or non-finite input yields `"0 B"`. The unit index is clamped
 * to the available units, so sub-byte values stay in `B` and values of
 * 1024 TB or more are still expressed in `TB` rather than producing an
 * `undefined` unit.
 */
const formatFileSize = (bytes: number): string => {
  if (!Number.isFinite(bytes) || bytes <= 0) return '0 B';
  const k = 1024;
  const sizes = ['B', 'KB', 'MB', 'GB', 'TB'];
  const rawIndex = Math.floor(Math.log(bytes) / Math.log(k));
  const i = Math.min(sizes.length - 1, Math.max(0, rawIndex));
  return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
};
/**
 * Maps a server-provided failure category label to the MUI `Chip` color used
 * to display it. PDF-related categories are warnings, resource exhaustion
 * (timeout / memory) is an error, "Unknown Error" is informational, and any
 * other label falls back to the default color.
 */
const getFailureCategoryColor = (category: string): "error" | "warning" | "info" | "default" => {
  const pdfProblems = ['PDF Font Issues', 'PDF Corruption', 'PDF Parsing Error'];
  if (pdfProblems.includes(category)) {
    return 'warning';
  }
  if (category === 'Timeout' || category === 'Memory Limit') {
    return 'error';
  }
  return category === 'Unknown Error' ? 'info' : 'default';
};
/**
 * Expands the inline error details of a table row, or collapses them if the
 * row is already expanded. A fresh Set is stored so React sees a new value.
 */
const toggleRowExpansion = (documentId: string) => {
  const next = new Set(expandedRows);
  // `delete` reports whether the id was present; if it was not, this is an expand.
  const wasExpanded = next.delete(documentId);
  if (!wasExpanded) {
    next.add(documentId);
  }
  setExpandedRows(next);
};
/** Opens the details dialog for the given failed document. */
const showDocumentDetails = (doc: FailedDocument) => {
  setDetailsOpen(true);
  setSelectedDocument(doc);
};
// Full-page spinner only for the very first load; later refreshes keep the
// existing table on screen while the request is in flight.
if (loading && documents.length === 0) {
  return (
    <Box display="flex" justifyContent="center" alignItems="center" minHeight="400px">
      <CircularProgress />
    </Box>
  );
}

return (
  <Box sx={{ p: 3 }}>
    <Box display="flex" justifyContent="space-between" alignItems="center" mb={3}>
      <Typography variant="h4" component="h1">
        Failed OCR Documents
      </Typography>
      <Button
        variant="outlined"
        startIcon={<RefreshIcon />}
        onClick={fetchFailedDocuments}
        disabled={loading}
      >
        Refresh
      </Button>
    </Box>

    {/* Statistics Overview */}
    {statistics && (
      <Grid container spacing={3} mb={3}>
        <Grid item xs={12} md={4}>
          <Card>
            <CardContent>
              <Typography variant="h6" color="error">
                <ErrorIcon sx={{ mr: 1, verticalAlign: 'middle' }} />
                Total Failed
              </Typography>
              <Typography variant="h3" color="error.main">
                {statistics.total_failed}
              </Typography>
            </CardContent>
          </Card>
        </Grid>
        <Grid item xs={12} md={8}>
          <Card>
            <CardContent>
              <Typography variant="h6" mb={2}>
                Failure Categories
              </Typography>
              <Box display="flex" flexWrap="wrap" gap={1}>
                {/* The server reports both a missing reason and the literal "unknown"
                    as reason "unknown", so the index keeps the React keys unique. */}
                {statistics.failure_categories.map((category, index) => (
                  <Chip
                    key={`${category.reason}-${index}`}
                    label={`${category.display_name}: ${category.count}`}
                    color={getFailureCategoryColor(category.display_name)}
                    variant="outlined"
                    size="small"
                  />
                ))}
              </Box>
            </CardContent>
          </Card>
        </Grid>
      </Grid>
    )}

    {documents.length === 0 ? (
      <Alert severity="success" sx={{ mt: 2 }}>
        <AlertTitle>Great news!</AlertTitle>
        No documents have failed OCR processing. All your documents are processing successfully.
      </Alert>
    ) : (
      <>
        <Alert severity="info" sx={{ mb: 2 }}>
          <AlertTitle>OCR Failures</AlertTitle>
          These documents failed OCR processing. You can retry OCR with detailed output to understand why failures occurred.
          Common causes include corrupted PDFs, unsupported fonts, or memory limitations.
        </Alert>

        <TableContainer component={Paper}>
          <Table>
            <TableHead>
              <TableRow>
                <TableCell />
                <TableCell>Document</TableCell>
                <TableCell>Failure Type</TableCell>
                <TableCell>Retry Count</TableCell>
                <TableCell>Last Failed</TableCell>
                <TableCell>Actions</TableCell>
              </TableRow>
            </TableHead>
            <TableBody>
              {/* `doc`, not `document`, so the DOM global is not shadowed inside the row. */}
              {documents.map((doc) => (
                <React.Fragment key={doc.id}>
                  <TableRow>
                    <TableCell>
                      <IconButton
                        size="small"
                        onClick={() => toggleRowExpansion(doc.id)}
                      >
                        {expandedRows.has(doc.id) ? <ExpandLessIcon /> : <ExpandMoreIcon />}
                      </IconButton>
                    </TableCell>
                    <TableCell>
                      <Box>
                        <Typography variant="body2" fontWeight="bold">
                          {doc.filename}
                        </Typography>
                        <Typography variant="caption" color="text.secondary">
                          {formatFileSize(doc.file_size)} {doc.mime_type}
                        </Typography>
                      </Box>
                    </TableCell>
                    <TableCell>
                      <Chip
                        label={doc.failure_category}
                        color={getFailureCategoryColor(doc.failure_category)}
                        size="small"
                      />
                    </TableCell>
                    <TableCell>
                      <Typography variant="body2">
                        {doc.retry_count} attempts
                      </Typography>
                    </TableCell>
                    <TableCell>
                      <Typography variant="body2">
                        {doc.updated_at ? format(new Date(doc.updated_at), 'MMM dd, yyyy HH:mm') : 'Unknown'}
                      </Typography>
                    </TableCell>
                    <TableCell>
                      <Box display="flex" gap={1}>
                        <Tooltip title="Retry OCR">
                          <IconButton
                            size="small"
                            onClick={() => handleRetryOcr(doc)}
                            disabled={retrying === doc.id || !doc.can_retry}
                          >
                            {retrying === doc.id ? (
                              <CircularProgress size={16} />
                            ) : (
                              <RefreshIcon />
                            )}
                          </IconButton>
                        </Tooltip>
                        <Tooltip title="View Details">
                          <IconButton
                            size="small"
                            onClick={() => showDocumentDetails(doc)}
                          >
                            <VisibilityIcon />
                          </IconButton>
                        </Tooltip>
                        <Tooltip title="Download Document">
                          {/* NOTE(review): a plain window.open sends no Authorization header;
                              confirm the download endpoint accepts the session some other way. */}
                          <IconButton
                            size="small"
                            onClick={() => window.open(`/api/documents/${doc.id}/download`, '_blank')}
                          >
                            <DownloadIcon />
                          </IconButton>
                        </Tooltip>
                      </Box>
                    </TableCell>
                  </TableRow>
                  <TableRow>
                    <TableCell sx={{ paddingBottom: 0, paddingTop: 0 }} colSpan={6}>
                      <Collapse in={expandedRows.has(doc.id)} timeout="auto" unmountOnExit>
                        <Box
                          sx={{
                            margin: 1,
                            p: 2,
                            // Light greys are unreadable under dark-mode text colors.
                            bgcolor: (theme) => theme.palette.mode === 'dark' ? 'grey.900' : 'grey.50'
                          }}
                        >
                          <Typography variant="h6" gutterBottom>
                            Error Details
                          </Typography>
                          <Grid container spacing={2}>
                            <Grid item xs={12} md={6}>
                              <Typography variant="body2" color="text.secondary">
                                <strong>Failure Reason:</strong>
                              </Typography>
                              <Typography variant="body2" sx={{ mb: 1 }}>
                                {doc.ocr_failure_reason || 'Not specified'}
                              </Typography>
                              <Typography variant="body2" color="text.secondary">
                                <strong>Error Message:</strong>
                              </Typography>
                              <Typography
                                variant="body2"
                                sx={{
                                  fontFamily: 'monospace',
                                  bgcolor: (theme) => theme.palette.mode === 'dark' ? 'grey.800' : 'grey.100',
                                  p: 1,
                                  borderRadius: 1,
                                  fontSize: '0.75rem',
                                  wordBreak: 'break-word'
                                }}
                              >
                                {doc.ocr_error || 'No error message available'}
                              </Typography>
                            </Grid>
                            <Grid item xs={12} md={6}>
                              <Typography variant="body2" color="text.secondary">
                                <strong>Last Attempt:</strong>
                              </Typography>
                              <Typography variant="body2" sx={{ mb: 1 }}>
                                {doc.last_attempt_at
                                  ? format(new Date(doc.last_attempt_at), 'PPpp')
                                  : 'No previous attempts'}
                              </Typography>
                              <Typography variant="body2" color="text.secondary">
                                <strong>File Created:</strong>
                              </Typography>
                              <Typography variant="body2">
                                {format(new Date(doc.created_at), 'PPpp')}
                              </Typography>
                            </Grid>
                          </Grid>
                        </Box>
                      </Collapse>
                    </TableCell>
                  </TableRow>
                </React.Fragment>
              ))}
            </TableBody>
          </Table>
        </TableContainer>

        {/* Pagination */}
        {totalPages > 1 && (
          <Box display="flex" justifyContent="center" mt={3}>
            <Pagination
              count={totalPages}
              page={pagination.page}
              onChange={(_, page) => setPagination(prev => ({ ...prev, page }))}
              color="primary"
            />
          </Box>
        )}
      </>
    )}

    {/* Document Details Dialog */}
    <Dialog
      open={detailsOpen}
      onClose={() => setDetailsOpen(false)}
      maxWidth="md"
      fullWidth
    >
      <DialogTitle>
        Document Details: {selectedDocument?.filename}
      </DialogTitle>
      <DialogContent>
        {selectedDocument && (
          <Grid container spacing={2}>
            <Grid item xs={12} md={6}>
              <Typography variant="body2" color="text.secondary">
                <strong>Original Filename:</strong>
              </Typography>
              <Typography variant="body2" sx={{ mb: 2 }}>
                {selectedDocument.original_filename}
              </Typography>
              <Typography variant="body2" color="text.secondary">
                <strong>File Size:</strong>
              </Typography>
              <Typography variant="body2" sx={{ mb: 2 }}>
                {formatFileSize(selectedDocument.file_size)}
              </Typography>
              <Typography variant="body2" color="text.secondary">
                <strong>MIME Type:</strong>
              </Typography>
              <Typography variant="body2" sx={{ mb: 2 }}>
                {selectedDocument.mime_type}
              </Typography>
              <Typography variant="body2" color="text.secondary">
                <strong>Tags:</strong>
              </Typography>
              <Box sx={{ mb: 2 }}>
                {selectedDocument.tags.length > 0 ? (
                  selectedDocument.tags.map((tag) => (
                    <Chip key={tag} label={tag} size="small" sx={{ mr: 1, mb: 1 }} />
                  ))
                ) : (
                  <Typography variant="body2" color="text.secondary">No tags</Typography>
                )}
              </Box>
            </Grid>
            <Grid item xs={12} md={6}>
              <Typography variant="body2" color="text.secondary">
                <strong>Failure Category:</strong>
              </Typography>
              <Chip
                label={selectedDocument.failure_category}
                color={getFailureCategoryColor(selectedDocument.failure_category)}
                sx={{ mb: 2 }}
              />
              <Typography variant="body2" color="text.secondary">
                <strong>Retry Count:</strong>
              </Typography>
              <Typography variant="body2" sx={{ mb: 2 }}>
                {selectedDocument.retry_count} attempts
              </Typography>
              <Typography variant="body2" color="text.secondary">
                <strong>Created:</strong>
              </Typography>
              <Typography variant="body2" sx={{ mb: 2 }}>
                {format(new Date(selectedDocument.created_at), 'PPpp')}
              </Typography>
              <Typography variant="body2" color="text.secondary">
                <strong>Last Updated:</strong>
              </Typography>
              <Typography variant="body2">
                {format(new Date(selectedDocument.updated_at), 'PPpp')}
              </Typography>
            </Grid>
            <Grid item xs={12}>
              <Typography variant="body2" color="text.secondary" sx={{ mb: 1 }}>
                <strong>Full Error Message:</strong>
              </Typography>
              <Paper
                sx={{
                  p: 2,
                  bgcolor: (theme) => theme.palette.mode === 'dark' ? 'grey.900' : 'grey.50'
                }}
              >
                <Typography
                  variant="body2"
                  sx={{
                    fontFamily: 'monospace',
                    fontSize: '0.875rem',
                    wordBreak: 'break-word',
                    whiteSpace: 'pre-wrap'
                  }}
                >
                  {selectedDocument.ocr_error || 'No error message available'}
                </Typography>
              </Paper>
            </Grid>
          </Grid>
        )}
      </DialogContent>
      <DialogActions>
        {selectedDocument?.can_retry && (
          <Button
            onClick={() => {
              setDetailsOpen(false);
              if (selectedDocument) {
                handleRetryOcr(selectedDocument);
              }
            }}
            startIcon={<RefreshIcon />}
            disabled={retrying === selectedDocument?.id}
          >
            Retry OCR
          </Button>
        )}
        <Button onClick={() => setDetailsOpen(false)}>Close</Button>
      </DialogActions>
    </Dialog>

    {/* Success/Error Snackbar */}
    <Snackbar
      open={snackbar.open}
      autoHideDuration={6000}
      onClose={() => setSnackbar(prev => ({ ...prev, open: false }))}
    >
      <Alert
        onClose={() => setSnackbar(prev => ({ ...prev, open: false }))}
        severity={snackbar.severity}
        sx={{ width: '100%' }}
      >
        {snackbar.message}
      </Alert>
    </Snackbar>
  </Box>
);
};
export default FailedOcrPage;

View File

@ -186,6 +186,16 @@ export const documentService = {
})
},
/** Requests a new OCR attempt for one document (`POST /documents/{id}/retry-ocr`). */
retryOcr: (id: string) => api.post(`/documents/${id}/retry-ocr`),
/**
 * Fetches one page of documents whose OCR failed (`GET /documents/failed-ocr`).
 * `limit` is the page size and `offset` the number of rows to skip.
 */
getFailedOcrDocuments: (limit = 50, offset = 0) => {
  const params = { limit, offset };
  return api.get('/documents/failed-ocr', { params });
},
search: (searchRequest: SearchRequest) => {
return api.get<SearchResponse>('/search', {
params: searchRequest,

View File

@ -9,6 +9,7 @@ use serde::Deserialize;
use std::sync::Arc;
use utoipa::ToSchema;
use sha2::{Sha256, Digest};
use sqlx::Row;
use crate::{
auth::AuthUser,
@ -33,6 +34,8 @@ pub fn router() -> Router<Arc<AppState>> {
.route("/{id}/thumbnail", get(get_document_thumbnail))
.route("/{id}/ocr", get(get_document_ocr))
.route("/{id}/processed-image", get(get_processed_image))
.route("/{id}/retry-ocr", post(retry_ocr))
.route("/failed-ocr", get(get_failed_ocr_documents))
}
#[utoipa::path(
@ -471,4 +474,317 @@ async fn get_processed_image(
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
Ok(response)
}
// Resets a document's OCR state and re-enqueues it for processing.
//
// The document must be visible to the caller and its `ocr_status` must be
// unset, "failed" or "pending". An ineligible document yields a 200 response
// with `"success": false` (and the current status) rather than an HTTP error.
//
// Errors:
//   404 - the document is not visible to the caller, or the UPDATE hit no row.
//   500 - a database or queue operation failed.
//
// (Plain `//` comments are used so the generated OpenAPI description is
// not affected.)
#[utoipa::path(
    post,
    path = "/api/documents/{id}/retry-ocr",
    tag = "documents",
    security(
        ("bearer_auth" = [])
    ),
    params(
        ("id" = uuid::Uuid, Path, description = "Document ID")
    ),
    responses(
        (status = 200, description = "OCR retry queued successfully", body = String),
        (status = 404, description = "Document not found"),
        (status = 400, description = "Document is not eligible for OCR retry"),
        (status = 401, description = "Unauthorized")
    )
)]
async fn retry_ocr(
    State(state): State<Arc<AppState>>,
    auth_user: AuthUser,
    Path(document_id): Path<uuid::Uuid>,
) -> Result<Json<serde_json::Value>, StatusCode> {
    // Check if document exists and belongs to user.
    // NOTE(review): only the first 1000 visible documents are scanned, so a
    // document beyond that window is reported as 404. A direct lookup by id
    // would remove that limit.
    let documents = state
        .db
        .get_documents_by_user_with_role(auth_user.user.id, auth_user.user.role, 1000, 0)
        .await
        .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;

    let document = documents
        .into_iter()
        .find(|doc| doc.id == document_id)
        .ok_or(StatusCode::NOT_FOUND)?;

    // Check if document is eligible for OCR retry (failed or not processed)
    let eligible = document.ocr_status.as_ref().map_or(true, |status| {
        status == "failed" || status == "pending"
    });

    if !eligible {
        // The message previously contained a literal "{}" placeholder that was
        // never filled in; interpolate the actual status.
        let current_status = document.ocr_status.as_deref().unwrap_or("unknown");
        return Ok(Json(serde_json::json!({
            "success": false,
            "message": format!(
                "Document is not eligible for OCR retry. Current status: {}",
                current_status
            ),
            "current_status": document.ocr_status
        })));
    }

    // Reset document OCR fields
    let reset_result = sqlx::query(
        r#"
        UPDATE documents
        SET ocr_status = 'pending',
            ocr_text = NULL,
            ocr_error = NULL,
            ocr_failure_reason = NULL,
            ocr_confidence = NULL,
            ocr_word_count = NULL,
            ocr_processing_time_ms = NULL,
            ocr_completed_at = NULL,
            updated_at = NOW()
        WHERE id = $1
        "#
    )
    .bind(document_id)
    .execute(state.db.get_pool())
    .await
    .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;

    if reset_result.rows_affected() == 0 {
        return Err(StatusCode::NOT_FOUND);
    }

    // Calculate priority based on file size (higher priority for retries)
    let priority = match document.file_size {
        0..=1048576 => 15,  // <= 1MB: highest priority (boosted for retry)
        ..=5242880 => 12,   // 1-5MB: high priority
        ..=10485760 => 10,  // 5-10MB: medium priority
        ..=52428800 => 8,   // 10-50MB: low priority
        _ => 6,             // > 50MB: lowest priority
    };

    // Add to OCR queue with detailed logging.
    // NOTE(review): if enqueueing fails the document has already been reset to
    // 'pending' above and stays that way without a queue entry.
    match state.queue_service.enqueue_document(document_id, priority, document.file_size).await {
        Ok(queue_id) => {
            tracing::info!(
                "OCR retry queued for document {} ({}): queue_id={}, priority={}, size={}",
                document_id, document.filename, queue_id, priority, document.file_size
            );

            Ok(Json(serde_json::json!({
                "success": true,
                "message": "OCR retry queued successfully",
                "queue_id": queue_id,
                "document_id": document_id,
                "priority": priority,
                "estimated_wait_minutes": calculate_estimated_wait_time(priority).await
            })))
        }
        Err(e) => {
            tracing::error!("Failed to queue OCR retry for document {}: {}", document_id, e);
            Err(StatusCode::INTERNAL_SERVER_ERROR)
        }
    }
}
#[utoipa::path(
get,
path = "/api/documents/failed-ocr",
tag = "documents",
security(
("bearer_auth" = [])
),
params(
("limit" = Option<i64>, Query, description = "Number of documents to return (default: 50)"),
("offset" = Option<i64>, Query, description = "Number of documents to skip (default: 0)")
),
responses(
(status = 200, description = "List of documents with failed OCR", body = String),
(status = 401, description = "Unauthorized")
)
)]
async fn get_failed_ocr_documents(
State(state): State<Arc<AppState>>,
auth_user: AuthUser,
Query(pagination): Query<PaginationQuery>,
) -> Result<Json<serde_json::Value>, StatusCode> {
let limit = pagination.limit.unwrap_or(50);
let offset = pagination.offset.unwrap_or(0);
// Get failed OCR documents with additional failure details
let failed_docs = sqlx::query(
r#"
SELECT d.id, d.filename, d.original_filename, d.file_path, d.file_size,
d.mime_type, d.created_at, d.updated_at, d.user_id,
d.ocr_status, d.ocr_error, d.ocr_failure_reason,
d.ocr_completed_at, d.tags,
-- Count retry attempts from OCR queue
COALESCE(q.retry_count, 0) as retry_count,
q.last_attempt_at
FROM documents d
LEFT JOIN (
SELECT document_id,
COUNT(*) as retry_count,
MAX(created_at) as last_attempt_at
FROM ocr_queue
WHERE status IN ('failed', 'completed')
GROUP BY document_id
) q ON d.id = q.document_id
WHERE d.ocr_status = 'failed'
AND ($1 = $1 OR d.user_id = $1) -- Admin can see all, users see only their own
ORDER BY d.updated_at DESC
LIMIT $2 OFFSET $3
"#
)
.bind(if auth_user.user.role == crate::models::UserRole::Admin {
None
} else {
Some(auth_user.user.id)
})
.bind(limit)
.bind(offset)
.fetch_all(state.db.get_pool())
.await
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
// Count total failed documents
let total_count: i64 = sqlx::query_scalar(
r#"
SELECT COUNT(*)
FROM documents
WHERE ocr_status = 'failed'
AND ($1 = $1 OR user_id = $1)
"#
)
.bind(if auth_user.user.role == crate::models::UserRole::Admin {
None
} else {
Some(auth_user.user.id)
})
.fetch_one(state.db.get_pool())
.await
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
let failed_documents: Vec<serde_json::Value> = failed_docs
.into_iter()
.map(|row| {
let tags: Vec<String> = row.get::<Option<Vec<String>>, _>("tags").unwrap_or_default();
serde_json::json!({
"id": row.get::<uuid::Uuid, _>("id"),
"filename": row.get::<String, _>("filename"),
"original_filename": row.get::<String, _>("original_filename"),
"file_size": row.get::<i64, _>("file_size"),
"mime_type": row.get::<String, _>("mime_type"),
"created_at": row.get::<chrono::DateTime<chrono::Utc>, _>("created_at"),
"updated_at": row.get::<chrono::DateTime<chrono::Utc>, _>("updated_at"),
"tags": tags,
"ocr_status": row.get::<Option<String>, _>("ocr_status"),
"ocr_error": row.get::<Option<String>, _>("ocr_error"),
"ocr_failure_reason": row.get::<Option<String>, _>("ocr_failure_reason"),
"ocr_completed_at": row.get::<Option<chrono::DateTime<chrono::Utc>>, _>("ocr_completed_at"),
"retry_count": row.get::<Option<i64>, _>("retry_count").unwrap_or(0),
"last_attempt_at": row.get::<Option<chrono::DateTime<chrono::Utc>>, _>("last_attempt_at"),
"can_retry": true,
"failure_category": categorize_failure_reason(
row.get::<Option<String>, _>("ocr_failure_reason").as_deref(),
row.get::<Option<String>, _>("ocr_error").as_deref()
)
})
})
.collect();
let response = serde_json::json!({
"documents": failed_documents,
"pagination": {
"total": total_count,
"limit": limit,
"offset": offset,
"has_more": offset + limit < total_count
},
"statistics": {
"total_failed": total_count,
"failure_categories": get_failure_statistics(&state, auth_user.user.id, auth_user.user.role.clone()).await?
}
});
Ok(Json(response))
}
/// Returns a rough wait estimate, in minutes, for a queue entry of the given
/// priority.
///
/// This is a fixed lookup, not a measurement: it does not inspect the actual
/// queue depth or processing times.
///
/// | priority | minutes |
/// |----------|---------|
/// | >= 15    | 1       |
/// | 10..=14  | 3       |
/// | 5..=9    | 10      |
/// | < 5      | 30      |
///
/// The bands are inclusive so that no priority falls through a gap; with the
/// previous exclusive upper bounds, priorities 14 and 9 landed in the
/// 30-minute fallback.
async fn calculate_estimated_wait_time(priority: i32) -> i64 {
    match priority {
        15.. => 1,      // High priority retry: ~1 minute
        10..=14 => 3,   // Medium priority: ~3 minutes
        5..=9 => 10,    // Low priority: ~10 minutes
        _ => 30,        // Very low priority: ~30 minutes
    }
}
/// Maps a stored OCR failure reason (and, as a fallback, the raw error text)
/// to a human-readable category label.
///
/// * A recognised `failure_reason` code maps directly to its label.
/// * A missing reason or the code `"unknown"` falls back to a
///   case-insensitive keyword search of `error_message`; the first matching
///   keyword wins, and no match (or no message) yields `"Unknown Error"`.
/// * Any other, unrecognised code yields `"Other"` without consulting the
///   error message.
fn categorize_failure_reason(failure_reason: Option<&str>, error_message: Option<&str>) -> &'static str {
    const UNKNOWN_LABEL: &str = "Unknown Error";
    // Checked in order; the first keyword contained in the message decides.
    const KEYWORD_LABELS: [(&str, &str); 5] = [
        ("timeout", "Timeout"),
        ("memory", "Memory Limit"),
        ("font", "PDF Font Issues"),
        ("encoding", "PDF Font Issues"),
        ("corrupt", "PDF Corruption"),
    ];

    // "unknown" is treated exactly like an absent reason.
    let explicit_code = failure_reason.filter(|code| *code != "unknown");
    if let Some(code) = explicit_code {
        return match code {
            "pdf_font_encoding" => "PDF Font Issues",
            "pdf_corruption" => "PDF Corruption",
            "processing_timeout" => "Timeout",
            "memory_limit" => "Memory Limit",
            "pdf_parsing_panic" => "PDF Parsing Error",
            _ => "Other",
        };
    }

    // Try to categorize based on error message
    let lowered = match error_message {
        Some(text) => text.to_lowercase(),
        None => return UNKNOWN_LABEL,
    };

    KEYWORD_LABELS
        .iter()
        .find(|(needle, _)| lowered.contains(*needle))
        .map_or(UNKNOWN_LABEL, |&(_, label)| label)
}
async fn get_failure_statistics(
state: &Arc<AppState>,
user_id: uuid::Uuid,
user_role: crate::models::UserRole
) -> Result<serde_json::Value, StatusCode> {
let stats = sqlx::query(
r#"
SELECT
ocr_failure_reason,
COUNT(*) as count
FROM documents
WHERE ocr_status = 'failed'
AND ($1 = $1 OR user_id = $1)
GROUP BY ocr_failure_reason
ORDER BY count DESC
"#
)
.bind(if user_role == crate::models::UserRole::Admin {
None
} else {
Some(user_id)
})
.fetch_all(state.db.get_pool())
.await
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
let categories: Vec<serde_json::Value> = stats
.into_iter()
.map(|row| {
let reason = row.get::<Option<String>, _>("ocr_failure_reason");
let count = row.get::<i64, _>("count");
serde_json::json!({
"reason": reason.clone().unwrap_or_else(|| "unknown".to_string()),
"display_name": categorize_failure_reason(reason.as_deref(), None),
"count": count
})
})
.collect();
Ok(serde_json::json!(categories))
}