From 2a1eeeda50e62dab15f398e436112e77a37bfe5a Mon Sep 17 00:00:00 2001 From: perf3ct Date: Tue, 1 Jul 2025 00:15:48 +0000 Subject: [PATCH] feat(debug): debug page actually works and does something --- frontend/src/App.tsx | 2 + frontend/src/components/Layout/AppLayout.tsx | 2 + frontend/src/pages/DebugPage.tsx | 1057 ++++++++++++++++++ src/routes/documents.rs | 555 +++++++++ 4 files changed, 1616 insertions(+) create mode 100644 frontend/src/pages/DebugPage.tsx diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index 3c5de8f..30b0371 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -18,6 +18,7 @@ import WatchFolderPage from './pages/WatchFolderPage'; import DocumentManagementPage from './pages/DocumentManagementPage'; import LabelsPage from './pages/LabelsPage'; import IgnoredFilesPage from './pages/IgnoredFilesPage'; +import DebugPage from './pages/DebugPage'; function App(): React.ReactElement { const { user, loading } = useAuth(); @@ -77,6 +78,7 @@ function App(): React.ReactElement { } /> } /> } /> + } /> Profile Page - Coming Soon} /> diff --git a/frontend/src/components/Layout/AppLayout.tsx b/frontend/src/components/Layout/AppLayout.tsx index 1fb91aa..5855e11 100644 --- a/frontend/src/components/Layout/AppLayout.tsx +++ b/frontend/src/components/Layout/AppLayout.tsx @@ -37,6 +37,7 @@ import { Block as BlockIcon, Api as ApiIcon, ManageAccounts as ManageIcon, + BugReport as BugReportIcon, } from '@mui/icons-material'; import { useNavigate, useLocation } from 'react-router-dom'; import { useAuth } from '../../contexts/AuthContext'; @@ -72,6 +73,7 @@ const navigationItems: NavigationItem[] = [ { text: 'Watch Folder', icon: FolderIcon, path: '/watch' }, { text: 'Document Management', icon: ManageIcon, path: '/documents/management' }, { text: 'Ignored Files', icon: BlockIcon, path: '/ignored-files' }, + { text: 'Debug', icon: BugReportIcon, path: '/debug' }, ]; const AppLayout: React.FC = ({ children }) => { diff --git a/frontend/src/pages/DebugPage.tsx b/frontend/src/pages/DebugPage.tsx new file mode 100644 index 0000000..790205e --- /dev/null +++ b/frontend/src/pages/DebugPage.tsx @@ -0,0 +1,1057 @@ +import React, { useState, useCallback, useEffect } from 'react'; +import { + Box, + Card, + CardContent, + Typography, + TextField, + Button, + Grid, + Paper, + Stepper, + Step, + StepLabel, + StepContent, + Alert, + Chip, + Table, + TableBody, + TableCell, + TableContainer, + TableHead, + TableRow, + Accordion, + AccordionSummary, + AccordionDetails, + CircularProgress, + Container, + Tabs, + Tab, + LinearProgress, + Divider, +} from '@mui/material'; +import { + ExpandMore as ExpandMoreIcon, + BugReport as BugReportIcon, + CheckCircle as CheckCircleIcon, + Error as ErrorIcon, + Warning as WarningIcon, + Pending as PendingIcon, + PlayArrow as PlayArrowIcon, + CloudUpload as UploadIcon, + Search as SearchIcon, + Refresh as RefreshIcon, + Visibility as PreviewIcon, +} from '@mui/icons-material'; +import { api } from '../services/api'; + +interface DebugStep { + step: number; + name: string; + status: string; + details: any; + success: boolean; + error?: string; +} + +interface DebugInfo { + document_id: string; + filename: string; + overall_status: string; + pipeline_steps: DebugStep[]; + failed_document_info?: any; + user_settings: any; + debug_timestamp: string; +} + +const DebugPage: React.FC = () => { + const [activeTab, setActiveTab] = useState(0); + const [documentId, setDocumentId] = useState(''); + const [debugInfo, setDebugInfo] = useState(null); + const [loading, setLoading] = useState(false); + const [error, setError] = useState(''); + + // Upload functionality + const [selectedFile, setSelectedFile] = useState(null); + const [uploading, setUploading] = useState(false); + const [uploadProgress, setUploadProgress] = useState(0); + const [uploadedDocumentId, setUploadedDocumentId] = useState(''); + const [monitoringInterval, setMonitoringInterval] = useState(null); + const [processingStatus, setProcessingStatus] = useState(''); + + const getStepIcon = (status: string, success: boolean) => { + if (status === 'processing') return ; + if (success || status === 'completed' || status === 'passed') return ; + if (status === 'failed' || status === 'error') return ; + if (status === 'pending' || status === 'not_reached') return ; + if (status === 'not_queued' || status === 'ocr_disabled') return ; + return ; + }; + + const getStatusColor = (status: string, success: boolean): "default" | "primary" | "secondary" | "error" | "info" | "success" | "warning" => { + if (status === 'processing') return 'info'; + if (success || status === 'completed' || status === 'passed') return 'success'; + if (status === 'failed' || status === 'error') return 'error'; + if (status === 'pending' || status === 'not_reached') return 'default'; + if (status === 'not_queued' || status === 'ocr_disabled') return 'warning'; + return 'primary'; + }; + + const fetchDebugInfo = useCallback(async (docId?: string, retryCount = 0) => { + const targetDocId = docId || documentId; + if (!targetDocId.trim()) { + setError('Please enter a document ID'); + return; + } + + setLoading(true); + if (retryCount === 0) { + setError(''); // Only clear error on first attempt + } + + try { + const response = await api.get(`/documents/${targetDocId}/debug`); + setDebugInfo(response.data); + setError(''); // Clear any previous errors + } catch (err: any) { + console.error('Debug fetch error:', err); + + // If it's a 404 and we haven't retried much, try again after a short delay + if (err.response?.status === 404 && retryCount < 3) { + console.log(`Document not found, retrying in ${(retryCount + 1) * 1000}ms... (attempt ${retryCount + 1})`); + setTimeout(() => { + fetchDebugInfo(docId, retryCount + 1); + }, (retryCount + 1) * 1000); + return; + } + + const errorMessage = err.response?.status === 404 + ? `Document ${targetDocId} not found. It may still be processing or may have been moved to failed documents.` + : err.response?.data?.message || `Failed to fetch debug information: ${err.message}`; + setError(errorMessage); + setDebugInfo(null); + } finally { + if (retryCount === 0) { + setLoading(false); + } + } + }, [documentId]); + + const handleFileSelect = (event: React.ChangeEvent) => { + const file = event.target.files?.[0]; + if (file) { + setSelectedFile(file); + setError(''); + } + }; + + const uploadDocument = useCallback(async () => { + if (!selectedFile) { + setError('Please select a file to upload'); + return; + } + + setUploading(true); + setUploadProgress(0); + setError(''); + setProcessingStatus('Uploading file...'); + + try { + const formData = new FormData(); + formData.append('file', selectedFile); + + const response = await api.post('/documents', formData, { + headers: { + 'Content-Type': 'multipart/form-data', + }, + onUploadProgress: (progressEvent) => { + const progress = progressEvent.total + ? Math.round((progressEvent.loaded * 100) / progressEvent.total) + : 0; + setUploadProgress(progress); + }, + }); + + const uploadedDoc = response.data; + setUploadedDocumentId(uploadedDoc.id); + setDocumentId(uploadedDoc.id); + setProcessingStatus('Document uploaded successfully. Starting OCR processing...'); + + // Start monitoring the processing + startProcessingMonitor(uploadedDoc.id); + } catch (err: any) { + setError(err.response?.data?.message || 'Failed to upload document'); + setProcessingStatus('Upload failed'); + } finally { + setUploading(false); + setUploadProgress(0); + } + }, [selectedFile]); + + const startProcessingMonitor = useCallback((docId: string) => { + // Clear any existing interval + if (monitoringInterval) { + clearInterval(monitoringInterval); + } + + const interval = setInterval(async () => { + try { + const response = await api.get(`/documents/${docId}`); + const doc = response.data; + + if (doc.ocr_status === 'completed' || doc.ocr_status === 'failed') { + setProcessingStatus(`Processing ${doc.ocr_status}!`); + clearInterval(interval); + setMonitoringInterval(null); + + // Auto-fetch debug info when processing is complete OR failed (but don't switch tabs) + setTimeout(() => { + fetchDebugInfo(docId); + // Don't auto-switch tabs - let user decide when to view debug info + }, 2000); // Give it a bit more time to ensure document is saved + } else if (doc.ocr_status === 'processing') { + setProcessingStatus('OCR processing in progress...'); + } else if (doc.ocr_status === 'pending') { + setProcessingStatus('Document queued for OCR processing...'); + } else { + setProcessingStatus('Checking processing status...'); + } + } catch (err) { + console.error('Error monitoring processing:', err); + } + }, 2000); // Check every 2 seconds + + setMonitoringInterval(interval); + + // Auto-clear monitoring after 5 minutes + setTimeout(() => { + clearInterval(interval); + setMonitoringInterval(null); + setProcessingStatus('Monitoring stopped (timeout)'); + }, 300000); + }, [monitoringInterval, fetchDebugInfo]); + + // Cleanup interval on unmount + useEffect(() => { + return () => { + if (monitoringInterval) { + clearInterval(monitoringInterval); + } + }; + }, [monitoringInterval]); + + const renderStepDetails = (step: DebugStep) => { + const details = step.details; + + return ( + + {step.error && ( + + {step.error} + + )} + + {step.step === 1 && ( // File Upload & Ingestion + + + + + File Information + Filename: {details.filename} + Original: {details.original_filename} + Size: {(details.file_size / 1024 / 1024).toFixed(2)} MB + MIME Type: {details.mime_type} + File Exists: + + + + + File Metadata + {details.file_metadata ? ( + <> + Actual Size: {(details.file_metadata.size / 1024 / 1024).toFixed(2)} MB + Is File: {details.file_metadata.is_file ? 'Yes' : 'No'} + Modified: {details.file_metadata.modified ? new Date(details.file_metadata.modified.secs_since_epoch * 1000).toLocaleString() : 'Unknown'} + + ) : ( + File metadata not available + )} + Created: {new Date(details.created_at).toLocaleString()} + + + + + {details.file_analysis && ( + + Detailed File Analysis + + + + Basic Analysis + File Type: {details.file_analysis.file_type} + Size: {(details.file_analysis.file_size_bytes / 1024 / 1024).toFixed(2)} MB + Readable: + {details.file_analysis.error_details && ( + + File Error: {details.file_analysis.error_details} + + )} + + + + {details.file_analysis.pdf_info ? ( + + PDF Analysis + Valid PDF: + PDF Version: {details.file_analysis.pdf_info.pdf_version || 'Unknown'} + Pages: {details.file_analysis.pdf_info.page_count || 'Unknown'} + Has Text: + Has Images: + Encrypted: + Font Count: {details.file_analysis.pdf_info.font_count} + Text Length: {details.file_analysis.pdf_info.estimated_text_length} chars + {details.file_analysis.pdf_info.text_extraction_error && ( + + PDF Text Extraction Error: {details.file_analysis.pdf_info.text_extraction_error} + + )} + + ) : details.file_analysis.text_preview ? ( + + Text Preview + + {details.file_analysis.text_preview} + + + ) : ( + + File Content + No preview available for this file type + + )} + + + + )} + + )} + + {step.step === 2 && ( // OCR Queue Enrollment + + + + + Queue Status + User OCR Enabled: + Queue Entries: {details.queue_entries_count} + + + + + {details.queue_history && details.queue_history.length > 0 && ( + + Queue History + + + + + Status + Priority + Created + Started + Completed + Attempts + Worker + + + + {details.queue_history.map((entry: any, index: number) => ( + + + + + {entry.priority} + {new Date(entry.created_at).toLocaleString()} + {entry.started_at ? new Date(entry.started_at).toLocaleString() : '-'} + {entry.completed_at ? new Date(entry.completed_at).toLocaleString() : '-'} + {entry.attempts} + {entry.worker_id || '-'} + + ))} + +
+
+
+ )} +
+ )} + + {step.step === 3 && ( // OCR Processing + + + + OCR Results + Text Length: {details.ocr_text_length} characters + Confidence: {details.ocr_confidence ? `${details.ocr_confidence.toFixed(1)}%` : 'N/A'} + Word Count: {details.ocr_word_count || 0} + Processing Time: {details.ocr_processing_time_ms ? `${details.ocr_processing_time_ms}ms` : 'N/A'} + Completed: {details.ocr_completed_at ? new Date(details.ocr_completed_at).toLocaleString() : 'Not completed'} + + + + + Processing Details + Has Processed Image: + {details.processed_image_info && ( + <> + Image Size: {details.processed_image_info.image_width}x{details.processed_image_info.image_height} + File Size: {(details.processed_image_info.file_size / 1024).toFixed(1)} KB + Processing Steps: {details.processed_image_info.processing_steps?.join(', ') || 'None'} + {details.processed_image_info.processing_parameters && ( + Processing Parameters: {JSON.stringify(details.processed_image_info.processing_parameters)} + )} + + )} + + + + )} + + {step.step === 4 && ( // Quality Validation + + + + + Quality Thresholds + Min Confidence: {details.quality_thresholds.min_confidence}% + Brightness: {details.quality_thresholds.brightness_threshold} + Contrast: {details.quality_thresholds.contrast_threshold} + Noise: {details.quality_thresholds.noise_threshold} + Sharpness: {details.quality_thresholds.sharpness_threshold} + + + + + Actual Values + Confidence: {details.actual_values.confidence ? `${details.actual_values.confidence.toFixed(1)}%` : 'N/A'} + Word Count: {details.actual_values.word_count || 0} + Processed Image Available: + {details.actual_values.processing_parameters && ( + Processing Parameters: {JSON.stringify(details.actual_values.processing_parameters)} + )} + + + + + + Quality Checks + + {Object.entries(details.quality_checks).map(([check, passed]: [string, any]) => ( + + : passed === false ? : } + /> + + ))} + + + + )} +
+ ); + }; + + const renderUploadTab = () => ( + + + + + Upload Document for Debug Analysis + + + Upload a PDF or image file to analyze the processing pipeline in real-time. + + + + + + + {selectedFile && ( + + + Selected: {selectedFile.name} ({(selectedFile.size / 1024 / 1024).toFixed(2)} MB) + + + )} + + {selectedFile && ( + + )} + + + {uploading && uploadProgress > 0 && ( + + + Upload Progress: {uploadProgress}% + + + + )} + + {processingStatus && ( + + {processingStatus} + {monitoringInterval && ( + + + + )} + + )} + + {uploadedDocumentId && ( + + + Document ID: {uploadedDocumentId} + + + + + + + + )} + + {selectedFile && selectedFile.type.startsWith('image/') && ( + + Preview + + + )} + + + + ); + + const renderSearchTab = () => ( + + + + + Debug Existing Document + + + Enter a document ID to analyze the processing pipeline for an existing document. + + + + setDocumentId(e.target.value)} + placeholder="e.g., 123e4567-e89b-12d3-a456-426614174000" + fullWidth + size="small" + /> + + + + {error && ( + + {error} + + )} + + + + ); + + return ( + + + + + Document Processing Debug + + + Upload documents or analyze existing ones to troubleshoot OCR processing issues. + + + + + + setActiveTab(newValue)}> + } + iconPosition="start" + /> + } + iconPosition="start" + /> + {debugInfo && ( + } + iconPosition="start" + /> + )} + + + + + {activeTab === 0 && renderUploadTab()} + {activeTab === 1 && renderSearchTab()} + + + + {error && ( + + Debug Error + {error} + + )} + + {debugInfo && activeTab === 2 && ( + + + + + Document: {debugInfo.filename} + + + + + Debug run at: {new Date(debugInfo.debug_timestamp).toLocaleString()} + + + + + + + + + Processing Pipeline + + + {debugInfo.pipeline_steps.map((step) => ( + + + + {step.name} + + + + + {renderStepDetails(step)} + + + ))} + + + + + {debugInfo.failed_document_info && ( + + + + Failed Document Information + + + + + Failure Details + Failure Reason: {debugInfo.failed_document_info.failure_reason} + Failure Stage: {debugInfo.failed_document_info.failure_stage} + Retry Count: {debugInfo.failed_document_info.retry_count || 0} + Created: {new Date(debugInfo.failed_document_info.created_at).toLocaleString()} + {debugInfo.failed_document_info.last_retry_at && ( + Last Retry: {new Date(debugInfo.failed_document_info.last_retry_at).toLocaleString()} + )} + + + + + Failed OCR Results + {debugInfo.failed_document_info.failed_ocr_text ? ( + <> + OCR Text Length: {debugInfo.failed_document_info.failed_ocr_text.length} chars + OCR Confidence: {debugInfo.failed_document_info.failed_ocr_confidence?.toFixed(1)}% + Word Count: {debugInfo.failed_document_info.failed_ocr_word_count || 0} + Processing Time: {debugInfo.failed_document_info.failed_ocr_processing_time_ms || 0}ms + + ) : ( + No OCR results available + )} + + + {debugInfo.failed_document_info.error_message && ( + + + Error Message: {debugInfo.failed_document_info.error_message} + + + )} + {debugInfo.failed_document_info.content_preview && ( + + + Content Preview + + {debugInfo.failed_document_info.content_preview} + + + + )} + + + + )} + + {debugInfo.detailed_processing_logs && debugInfo.detailed_processing_logs.length > 0 && ( + + + + Detailed Processing Logs + + + Complete history of all OCR processing attempts for this document. + + + + + + Attempt + Status + Priority + Created + Started + Completed + Duration + Wait Time + Attempts + Worker + Error + + + + {debugInfo.detailed_processing_logs.map((log: any, index: number) => ( + + {index + 1} + + + + {log.priority} + {new Date(log.created_at).toLocaleString()} + {log.started_at ? new Date(log.started_at).toLocaleString() : '-'} + {log.completed_at ? new Date(log.completed_at).toLocaleString() : '-'} + {log.processing_duration_ms ? `${log.processing_duration_ms}ms` : '-'} + {log.queue_wait_time_ms ? `${log.queue_wait_time_ms}ms` : '-'} + {log.attempts || 0} + {log.worker_id || '-'} + + {log.error_message ? ( + + {log.error_message} + + ) : '-'} + + + ))} + +
+
+
+
+ )} + + {debugInfo.file_analysis && ( + + + + File Analysis Summary + + + + + File Properties + File Type: {debugInfo.file_analysis.file_type} + Size: {(debugInfo.file_analysis.file_size_bytes / 1024 / 1024).toFixed(2)} MB + Readable: + + + + {debugInfo.file_analysis.pdf_info && ( + + PDF Properties + Valid PDF: + Has Text Content: + Text Length: {debugInfo.file_analysis.pdf_info.estimated_text_length} chars + Page Count: {debugInfo.file_analysis.pdf_info.page_count || 'Unknown'} + Encrypted: + + )} + + {debugInfo.file_analysis.pdf_info?.text_extraction_error && ( + + + PDF Text Extraction Issue: {debugInfo.file_analysis.pdf_info.text_extraction_error} + + + )} + + + + )} + + {debugInfo.pipeline_steps.some(step => step.step === 3 && step.details.has_processed_image) && ( + + + + Processed Images + + + + + Original Document + + + + + + Processed Image (OCR Input) + { + (e.target as HTMLImageElement).style.display = 'none'; + (e.target as HTMLImageElement).parentNode?.appendChild( + document.createTextNode('Processed image not available') + ); + }} + sx={{ + maxWidth: '100%', + maxHeight: '300px', + objectFit: 'contain', + border: '1px solid', + borderColor: 'divider', + borderRadius: 1 + }} + /> + + + + + + )} + + + + + }> + User Settings + + + + + + OCR Settings + Background OCR: {debugInfo.user_settings.enable_background_ocr ? 'Enabled' : 'Disabled'} + Min Confidence: {debugInfo.user_settings.ocr_min_confidence}% + Max File Size: {debugInfo.user_settings.max_file_size_mb} MB + + + + + Quality Thresholds + Brightness: {debugInfo.user_settings.quality_thresholds.brightness} + Contrast: {debugInfo.user_settings.quality_thresholds.contrast} + Noise: {debugInfo.user_settings.quality_thresholds.noise} + Sharpness: {debugInfo.user_settings.quality_thresholds.sharpness} + + + + + + + +
+ )} +
+ ); +}; + +export default DebugPage; \ No newline at end of file diff --git a/src/routes/documents.rs b/src/routes/documents.rs index f8c5a43..cec3549 100644 --- a/src/routes/documents.rs +++ b/src/routes/documents.rs @@ -58,6 +58,7 @@ pub fn router() -> Router> { .route("/{id}/ocr", get(get_document_ocr)) .route("/{id}/processed-image", get(get_processed_image)) .route("/{id}/retry-ocr", post(retry_ocr)) + .route("/{id}/debug", get(get_document_debug_info)) .route("/duplicates", get(get_user_duplicates)) .route("/failed", get(get_failed_documents)) .route("/failed/{id}/view", get(view_failed_document)) @@ -645,6 +646,560 @@ async fn retry_ocr( } } +#[utoipa::path( + get, + path = "/api/documents/{id}/debug", + tag = "documents", + security( + ("bearer_auth" = []) + ), + params( + ("id" = uuid::Uuid, Path, description = "Document ID") + ), + responses( + (status = 200, description = "Debug information for document processing pipeline", body = String), + (status = 404, description = "Document not found"), + (status = 401, description = "Unauthorized") + ) +)] +async fn get_document_debug_info( + State(state): State>, + auth_user: AuthUser, + Path(document_id): Path, +) -> Result, StatusCode> { + tracing::info!("Starting debug analysis for document {} by user {}", document_id, auth_user.user.id); + + // Get the document + let document = match state + .db + .get_document_by_id(document_id, auth_user.user.id, auth_user.user.role) + .await + { + Ok(Some(doc)) => { + tracing::info!("Found document: {} ({})", doc.filename, doc.mime_type); + doc + } + Ok(None) => { + tracing::warn!("Document {} not found for user {}", document_id, auth_user.user.id); + return Err(StatusCode::NOT_FOUND); + } + Err(e) => { + tracing::error!("Database error fetching document {}: {}", document_id, e); + return Err(StatusCode::INTERNAL_SERVER_ERROR); + } + }; + + // Get user settings + tracing::info!("Fetching user settings for user {}", auth_user.user.id); + let settings = match state + .db + .get_user_settings(auth_user.user.id) + .await + { + Ok(Some(s)) => { + tracing::info!("Found user settings: OCR enabled={}, min_confidence={}", s.enable_background_ocr, s.ocr_min_confidence); + s + } + Ok(None) => { + tracing::info!("No user settings found, using defaults"); + crate::models::Settings::default() + } + Err(e) => { + tracing::error!("Error fetching user settings: {}", e); + return Err(StatusCode::INTERNAL_SERVER_ERROR); + } + }; + + // Get OCR queue history for this document + tracing::info!("Fetching OCR queue history for document {}", document_id); + let queue_history = match sqlx::query( + r#" + SELECT id, status, priority, created_at, started_at, completed_at, + error_message, attempts, worker_id + FROM ocr_queue + WHERE document_id = $1 + ORDER BY created_at DESC + LIMIT 10 + "# + ) + .bind(document_id) + .fetch_all(state.db.get_pool()) + .await { + Ok(history) => { + tracing::info!("Queue history query successful, found {} entries", history.len()); + history + }, + Err(e) => { + tracing::error!("Queue history query error: {}", e); + return Err(StatusCode::INTERNAL_SERVER_ERROR); + } + }; + + // Get processed image info if it exists + tracing::info!("Fetching processed image for document {}", document_id); + let processed_image = match state + .db + .get_processed_image_by_document_id(document_id, auth_user.user.id) + .await { + Ok(Some(img)) => { + tracing::info!("Found processed image for document {}", document_id); + Some(img) + }, + Ok(None) => { + tracing::info!("No processed image found for document {}", document_id); + None + }, + Err(e) => { + tracing::warn!("Error fetching processed image for document {}: {}", document_id, e); + None + } + }; + + // Get failed document record if it exists + tracing::info!("Fetching failed document record for document {}", document_id); + let failed_document = match sqlx::query( + r#" + SELECT failure_reason, failure_stage, error_message, retry_count, + last_retry_at, created_at, content, ocr_text, ocr_confidence, + ocr_word_count, ocr_processing_time_ms + FROM failed_documents + WHERE id = $1 OR existing_document_id = $1 + ORDER BY created_at DESC + LIMIT 1 + "# + ) + .bind(document_id) + .fetch_optional(state.db.get_pool()) + .await { + Ok(result) => { + tracing::info!("Failed document query successful, found: {}", result.is_some()); + result + }, + Err(e) => { + tracing::error!("Failed document query error: {}", e); + return Err(StatusCode::INTERNAL_SERVER_ERROR); + } + }; + + // Get detailed OCR processing logs and attempts + tracing::info!("Fetching detailed OCR processing logs for document {}", document_id); + let ocr_processing_logs = match sqlx::query( + r#" + SELECT id, status, priority, created_at, started_at, completed_at, + error_message, attempts, worker_id, processing_time_ms, file_size + FROM ocr_queue + WHERE document_id = $1 + ORDER BY created_at ASC + "# + ) + .bind(document_id) + .fetch_all(state.db.get_pool()) + .await { + Ok(logs) => { + tracing::info!("OCR processing logs query successful, found {} entries", logs.len()); + logs + }, + Err(e) => { + tracing::error!("OCR processing logs query error: {}", e); + return Err(StatusCode::INTERNAL_SERVER_ERROR); + } + }; + + // File service for file info + let file_service = FileService::new(state.config.upload_path.clone()); + + // Check if file exists + let file_exists = tokio::fs::metadata(&document.file_path).await.is_ok(); + let file_metadata = if file_exists { + tokio::fs::metadata(&document.file_path).await.ok() + } else { + None + }; + + // Try to analyze file content for additional diagnostic info + tracing::info!("Analyzing file content for document {} (exists: {})", document_id, file_exists); + let file_analysis = if file_exists { + match analyze_file_content(&document.file_path, &document.mime_type).await { + Ok(analysis) => { + tracing::info!("File analysis successful for document {}", document_id); + analysis + }, + Err(e) => { + tracing::warn!("Failed to analyze file content for {}: {}", document_id, e); + FileAnalysis { + error_details: Some(format!("File analysis failed: {}", e)), + ..Default::default() + } + } + } + } else { + tracing::warn!("File does not exist for document {}, skipping analysis", document_id); + FileAnalysis::default() + }; + + // Pipeline steps analysis + let mut pipeline_steps = Vec::new(); + + // Step 1: File Upload & Ingestion + pipeline_steps.push(serde_json::json!({ + "step": 1, + "name": "File Upload & Ingestion", + "status": "completed", // Document exists if we got this far + "details": { + "filename": document.filename, + "original_filename": document.original_filename, + "file_size": document.file_size, + "mime_type": document.mime_type, + "file_exists": file_exists, + "file_path": document.file_path, + "created_at": document.created_at, + "file_metadata": file_metadata.as_ref().map(|m| serde_json::json!({ + "size": m.len(), + "modified": m.modified().ok(), + "is_file": m.is_file(), + "is_dir": m.is_dir() + })), + "file_analysis": file_analysis + }, + "success": true, + "error": None:: + })); + + // Step 2: OCR Queue Enrollment + let queue_enrollment_status = if queue_history.is_empty() { + if settings.enable_background_ocr { + "not_queued" + } else { + "ocr_disabled" + } + } else { + "queued" + }; + + pipeline_steps.push(serde_json::json!({ + "step": 2, + "name": "OCR Queue Enrollment", + "status": queue_enrollment_status, + "details": { + "user_ocr_enabled": settings.enable_background_ocr, + "queue_entries_count": queue_history.len(), + "queue_history": queue_history.iter().map(|row| serde_json::json!({ + "id": row.get::("id"), + "status": row.get::("status"), + "priority": row.get::("priority"), + "created_at": row.get::, _>("created_at"), + "started_at": row.get::>, _>("started_at"), + "completed_at": row.get::>, _>("completed_at"), + "error_message": row.get::, _>("error_message"), + "attempts": row.get::("attempts"), + "worker_id": row.get::, _>("worker_id") + })).collect::>() + }, + "success": !queue_history.is_empty() || !settings.enable_background_ocr, + "error": if !settings.enable_background_ocr && queue_history.is_empty() { + Some("OCR processing is disabled in user settings") + } else { None } + })); + + // Step 3: OCR Processing + let ocr_status = document.ocr_status.as_deref().unwrap_or("not_started"); + let ocr_success = matches!(ocr_status, "completed"); + + pipeline_steps.push(serde_json::json!({ + "step": 3, + "name": "OCR Text Extraction", + "status": ocr_status, + "details": { + "ocr_text_length": document.ocr_text.as_ref().map(|t| t.len()).unwrap_or(0), + "ocr_confidence": document.ocr_confidence, + "ocr_word_count": document.ocr_word_count, + "ocr_processing_time_ms": document.ocr_processing_time_ms, + "ocr_completed_at": document.ocr_completed_at, + "ocr_error": document.ocr_error, + "has_processed_image": processed_image.is_some(), + "processed_image_info": processed_image.as_ref().map(|pi| serde_json::json!({ + "image_path": pi.processed_image_path, + "image_width": pi.image_width, + "image_height": pi.image_height, + "file_size": pi.file_size, + "processing_parameters": pi.processing_parameters, + "processing_steps": pi.processing_steps, + "created_at": pi.created_at + })) + }, + "success": ocr_success, + "error": document.ocr_error.clone() + })); + + // Step 4: Quality Validation + let quality_passed = if let Some(confidence) = document.ocr_confidence { + confidence >= settings.ocr_min_confidence && document.ocr_word_count.unwrap_or(0) > 0 + } else { + false + }; + + pipeline_steps.push(serde_json::json!({ + "step": 4, + "name": "OCR Quality Validation", + "status": if ocr_success { + if quality_passed { "passed" } else { "failed" } + } else { + "not_reached" + }, + "details": { + "quality_thresholds": { + "min_confidence": settings.ocr_min_confidence, + "brightness_threshold": settings.ocr_quality_threshold_brightness, + "contrast_threshold": settings.ocr_quality_threshold_contrast, + "noise_threshold": settings.ocr_quality_threshold_noise, + "sharpness_threshold": settings.ocr_quality_threshold_sharpness + }, + "actual_values": { + "confidence": document.ocr_confidence, + "word_count": document.ocr_word_count, + "processed_image_available": processed_image.is_some(), + "processing_parameters": processed_image.as_ref().map(|pi| &pi.processing_parameters) + }, + "quality_checks": { + "confidence_check": document.ocr_confidence.map(|c| c >= settings.ocr_min_confidence), + "word_count_check": document.ocr_word_count.map(|w| w > 0), + "processed_image_available": processed_image.is_some() + } + }, + "success": quality_passed, + "error": if !quality_passed && ocr_success { + Some(format!("Quality validation failed: confidence {:.1}% (required: {:.1}%), words: {}", + document.ocr_confidence.unwrap_or(0.0), + settings.ocr_min_confidence, + document.ocr_word_count.unwrap_or(0) + )) + } else { None } + })); + + // Overall summary + let overall_status = if quality_passed { + "success" + } else if matches!(ocr_status, "failed") { + "failed" + } else if matches!(ocr_status, "processing") { + "processing" + } else if matches!(ocr_status, "pending") { + "pending" + } else { + "not_started" + }; + + Ok(Json(serde_json::json!({ + "document_id": document_id, + "filename": document.filename, + "overall_status": overall_status, + "pipeline_steps": pipeline_steps, + "failed_document_info": failed_document.as_ref().map(|row| serde_json::json!({ + "failure_reason": row.get::("failure_reason"), + "failure_stage": row.get::("failure_stage"), + "error_message": row.get::, _>("error_message"), + "retry_count": row.get::, _>("retry_count"), + "last_retry_at": row.get::>, _>("last_retry_at"), + "created_at": row.get::, _>("created_at"), + "content_preview": row.get::, _>("content").map(|c| + c.chars().take(200).collect::() + ), + "failed_ocr_text": row.get::, _>("ocr_text"), + "failed_ocr_confidence": row.get::, _>("ocr_confidence"), + "failed_ocr_word_count": row.get::, _>("ocr_word_count"), + "failed_ocr_processing_time_ms": row.get::, _>("ocr_processing_time_ms") + })), + "user_settings": { + "enable_background_ocr": settings.enable_background_ocr, + "ocr_min_confidence": settings.ocr_min_confidence, + "max_file_size_mb": settings.max_file_size_mb, + "quality_thresholds": { + "brightness": settings.ocr_quality_threshold_brightness, + "contrast": settings.ocr_quality_threshold_contrast, + "noise": settings.ocr_quality_threshold_noise, + "sharpness": settings.ocr_quality_threshold_sharpness + } + }, + "debug_timestamp": chrono::Utc::now(), + "file_analysis": file_analysis, + "detailed_processing_logs": ocr_processing_logs.iter().map(|row| serde_json::json!({ + "id": row.get::("id"), + "status": row.get::("status"), + "priority": row.get::("priority"), + "created_at": row.get::, _>("created_at"), + "started_at": row.get::>, _>("started_at"), + "completed_at": row.get::>, _>("completed_at"), + "error_message": row.get::, _>("error_message"), + "attempts": row.get::("attempts"), + "worker_id": row.get::, _>("worker_id"), + "processing_time_ms": row.get::, _>("processing_time_ms"), + "file_size": row.get::, _>("file_size"), + // Calculate processing duration if both timestamps are available + "processing_duration_ms": if let (Some(started), Some(completed)) = ( + row.get::>, _>("started_at"), + row.get::>, _>("completed_at") + ) { + Some((completed.timestamp_millis() - started.timestamp_millis()) as i32) + } else { + row.get::, _>("processing_time_ms") + }, + // Calculate queue wait time + "queue_wait_time_ms": if let Some(started) = row.get::>, _>("started_at") { + let created = row.get::, _>("created_at"); + Some((started.timestamp_millis() - created.timestamp_millis()) as i32) + } else { + None:: + } + })).collect::>() + }))) +} + +#[derive(Debug, Default, serde::Serialize)] +struct FileAnalysis { + file_type: String, + file_size_bytes: u64, + is_readable: bool, + pdf_info: Option, + text_preview: Option, + error_details: Option, +} + +#[derive(Debug, serde::Serialize)] +struct PdfAnalysis { + is_valid_pdf: bool, + page_count: Option, + has_text_content: bool, + has_images: bool, + is_encrypted: bool, + pdf_version: Option, + font_count: usize, + text_extraction_error: Option, + estimated_text_length: usize, +} + +async fn analyze_file_content(file_path: &str, mime_type: &str) -> Result> { + let mut analysis = FileAnalysis { + file_type: mime_type.to_string(), + ..Default::default() + }; + + // Try to read file size + if let Ok(metadata) = tokio::fs::metadata(file_path).await { + analysis.file_size_bytes = metadata.len(); + } + + // Try to read the file + let file_content = match tokio::fs::read(file_path).await { + Ok(content) => { + analysis.is_readable = true; + content + } + Err(e) => { + analysis.error_details = Some(format!("Failed to read file: {}", e)); + return Ok(analysis); + } + }; + + // Analyze based on file type + if mime_type.contains("pdf") { + analysis.pdf_info = Some(analyze_pdf_content(&file_content).await); + } else if mime_type.starts_with("text/") { + // For text files, show a preview + match String::from_utf8(file_content.clone()) { + Ok(text) => { + analysis.text_preview = Some(text.chars().take(500).collect()); + } + Err(e) => { + analysis.error_details = Some(format!("Failed to decode text file: {}", e)); + } + } + } + + Ok(analysis) +} + +async fn analyze_pdf_content(content: &[u8]) -> PdfAnalysis { + use std::panic; + + let mut analysis = PdfAnalysis { + is_valid_pdf: false, + page_count: None, + has_text_content: false, + has_images: false, + is_encrypted: false, + pdf_version: None, + font_count: 0, + text_extraction_error: None, + estimated_text_length: 0, + }; + + // Check PDF header + if content.len() < 8 { + analysis.text_extraction_error = Some("File too small to be a valid PDF".to_string()); + return analysis; + } + + if !content.starts_with(b"%PDF-") { + analysis.text_extraction_error = Some("File does not start with PDF header".to_string()); + return analysis; + } + + analysis.is_valid_pdf = true; + + // Extract PDF version from header + if content.len() >= 8 { + if let Ok(header) = std::str::from_utf8(&content[0..8]) { + if let Some(version) = header.strip_prefix("%PDF-") { + analysis.pdf_version = Some(version.to_string()); + } + } + } + + // Try to extract text using pdf_extract (same as the main OCR pipeline) + let text_result = panic::catch_unwind(|| { + pdf_extract::extract_text_from_mem(content) + }); + + match text_result { + Ok(Ok(text)) => { + analysis.has_text_content = !text.trim().is_empty(); + analysis.estimated_text_length = text.len(); + + // Count words for comparison with OCR results + let word_count = text.split_whitespace().count(); + if word_count == 0 && text.len() > 0 { + analysis.text_extraction_error = Some("PDF contains characters but no extractable words".to_string()); + } + } + Ok(Err(e)) => { + analysis.text_extraction_error = Some(format!("PDF text extraction failed: {}", e)); + } + Err(_) => { + analysis.text_extraction_error = Some("PDF text extraction panicked (likely corrupted PDF)".to_string()); + } + } + + // Basic PDF structure analysis + let content_str = String::from_utf8_lossy(content); + + // Check for encryption + analysis.is_encrypted = content_str.contains("/Encrypt"); + + // Check for images + analysis.has_images = content_str.contains("/Image") || content_str.contains("/XObject"); + + // Estimate page count (rough) + let page_matches = content_str.matches("/Type /Page").count(); + if page_matches > 0 { + analysis.page_count = Some(page_matches as i32); + } + + // Count fonts (rough) + analysis.font_count = content_str.matches("/Type /Font").count(); + + analysis +} + #[utoipa::path( get, path = "/api/documents/failed-ocr",