diff --git a/Dockerfile b/Dockerfile
index 7292b3c..7bbfdb0 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -20,6 +20,7 @@ RUN apt-get update && apt-get install -y \
libclang-dev \
clang \
poppler-utils \
+ ocrmypdf \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
@@ -37,6 +38,7 @@ RUN apt-get update && apt-get install -y \
tesseract-ocr-eng \
ca-certificates \
poppler-utils \
+ ocrmypdf \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
diff --git a/create_test_pdfs.py b/create_test_pdfs.py
new file mode 100644
index 0000000..d4055d3
--- /dev/null
+++ b/create_test_pdfs.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+"""
+Create proper test PDFs for debugging OCR word counting issues.
+"""
+
+try:
+ from reportlab.pdfgen import canvas
+ from reportlab.lib.pagesizes import letter
+ import os
+except ImportError:
+ print("reportlab not installed. Trying alternative method...")
+ # Alternative: create simple text files for testing
+ import os
+
def create_simple_test_files():
    """Create plain-text fixtures as a fallback when reportlab is unavailable.

    Writes one .txt file per test case into tests/test_pdfs/ (created if
    missing). Each fixture mimics text that PDF extraction would produce
    (normal spacing, multi-line, mixed content, special characters).

    Returns:
        bool: True once all files are written.

    Raises:
        OSError: if the directory or a file cannot be created.
    """
    test_dir = "tests/test_pdfs"
    os.makedirs(test_dir, exist_ok=True)

    # Test cases that would be similar to PDF extraction results
    test_cases = [
        ("normal_spacing.txt", "This is a normal document with proper word spacing and punctuation."),
        ("soclogix_sample.txt", "SOCLogix Non-Disclosure Agreement\nThis agreement is entered into between SOCLogix and the recipient for the purpose of protecting confidential information."),
        ("multiline_text.txt", "Line one with several words\nLine two with more content\nLine three continues the pattern\nFinal line ends the document"),
        ("mixed_content.txt", "Document with numbers 123 and symbols @#$ mixed with normal text."),
        ("special_chars.txt", "Text with special characters: café naïve résumé — and 'quotes' • bullets"),
    ]

    for filename, content in test_cases:
        # BUG FIX: the original wrote every case to the literal path
        # "tests/test_pdfs/(unknown)", overwriting it each iteration; the
        # intended per-case files were never created.
        with open(os.path.join(test_dir, filename), "w", encoding="utf-8") as f:
            f.write(content)

    print("Created simple text files for testing")
    return True
+
+ if not create_simple_test_files():
+ exit(1)
+ exit(0)
+
def create_test_pdfs():
    """Create test PDFs with reportlab for debugging OCR word counting.

    Generates three fixtures in tests/test_pdfs/ (created if missing):
      1. soclogix_nda_realistic.pdf  - single page, normal spacing
      2. multipage_realistic.pdf     - two pages of ordinary prose
      3. edge_cases_realistic.pdf    - word-boundary edge cases

    Returns:
        bool: True once all PDFs are written.

    Raises:
        OSError: if the directory or a PDF file cannot be written.
    """
    test_dir = "tests/test_pdfs"
    os.makedirs(test_dir, exist_ok=True)
    _, page_height = letter

    def _write_pdf(pdf_path, pages):
        """Render each list of lines in *pages* as one page of *pdf_path*."""
        c = canvas.Canvas(pdf_path, pagesize=letter)
        for page_index, lines in enumerate(pages):
            if page_index > 0:
                c.showPage()
            # showPage() resets graphics state (including the font), so the
            # font must be (re)set for every page, not once per document.
            c.setFont("Helvetica", 12)
            y_position = page_height - 100
            for line in lines:
                if line:  # Skip empty lines for positioning
                    c.drawString(72, y_position, line)
                    y_position -= 20
        c.save()
        print(f"Created: {pdf_path}")

    # Test case 1: Normal spacing (like SOCLogix NDA)
    _write_pdf(f"{test_dir}/soclogix_nda_realistic.pdf", [[
        "SOCLogix Non-Disclosure Agreement",
        "",
        "This agreement is entered into between SOCLogix and the recipient",
        "for the purpose of protecting confidential information.",
        "",
        "The recipient agrees to maintain strict confidentiality",
        "regarding all proprietary information disclosed.",
        "",
        "This includes but is not limited to technical specifications,",
        "business plans, customer lists, and financial data.",
        "",
        "Any breach of this agreement may result in legal action.",
        "The agreement remains in effect for a period of five years.",
    ]])

    # Test case 2: Multi-page document
    _write_pdf(f"{test_dir}/multipage_realistic.pdf", [
        [
            "Page 1: Document with Multiple Pages",
            "",
            "This is the first page of a multi-page document.",
            "It contains multiple sentences with proper spacing.",
            "Each line should be counted as separate words.",
            "Word boundaries are clearly defined with spaces.",
            "",
            "Numbers like 123, 456, and 789 should also count.",
            "Punctuation marks help separate thoughts.",
            "Total words on this page should be easily counted.",
        ],
        [
            "Page 2: Continuing from Previous Page",
            "",
            "This page also has normal text formatting.",
            "Word counting should work correctly here too.",
            "Mixed content: ABC123 def456 GHI789 works fine.",
            "",
            "Special characters like café, naïve, and résumé",
            "should also be handled properly by the extraction.",
            "",
            "End of document with proper word boundaries.",
        ],
    ])

    # Test case 3: Document with problematic patterns
    _write_pdf(f"{test_dir}/edge_cases_realistic.pdf", [[
        "Edge Cases for Word Counting",
        "",
        "Normal text with proper spacing works fine.",
        "TextWithoutSpacesButCamelCase should be detected.",
        "ALLCAPSTEXT might be problematic.",
        "mixed123CASE456text789 has transitions.",
        "",
        "Punctuation!!! should not count as words.",
        "But text-with-hyphens should count properly.",
        "Email@example.com and URLs http://test.com too.",
        "",
        "End with normal text to verify counting.",
    ]])

    print("\nAll test PDFs created successfully!")
    return True
+
+if __name__ == "__main__":
+ create_test_pdfs()
\ No newline at end of file
diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx
index 3c5de8f..30b0371 100644
--- a/frontend/src/App.tsx
+++ b/frontend/src/App.tsx
@@ -18,6 +18,7 @@ import WatchFolderPage from './pages/WatchFolderPage';
import DocumentManagementPage from './pages/DocumentManagementPage';
import LabelsPage from './pages/LabelsPage';
import IgnoredFilesPage from './pages/IgnoredFilesPage';
+import DebugPage from './pages/DebugPage';
function App(): React.ReactElement {
const { user, loading } = useAuth();
@@ -77,6 +78,7 @@ function App(): React.ReactElement {
} />
} />
} />
+ } />
Profile Page - Coming Soon} />
diff --git a/frontend/src/components/Layout/AppLayout.tsx b/frontend/src/components/Layout/AppLayout.tsx
index 1fb91aa..5855e11 100644
--- a/frontend/src/components/Layout/AppLayout.tsx
+++ b/frontend/src/components/Layout/AppLayout.tsx
@@ -37,6 +37,7 @@ import {
Block as BlockIcon,
Api as ApiIcon,
ManageAccounts as ManageIcon,
+ BugReport as BugReportIcon,
} from '@mui/icons-material';
import { useNavigate, useLocation } from 'react-router-dom';
import { useAuth } from '../../contexts/AuthContext';
@@ -72,6 +73,7 @@ const navigationItems: NavigationItem[] = [
{ text: 'Watch Folder', icon: FolderIcon, path: '/watch' },
{ text: 'Document Management', icon: ManageIcon, path: '/documents/management' },
{ text: 'Ignored Files', icon: BlockIcon, path: '/ignored-files' },
+ { text: 'Debug', icon: BugReportIcon, path: '/debug' },
];
const AppLayout: React.FC = ({ children }) => {
diff --git a/frontend/src/pages/DebugPage.tsx b/frontend/src/pages/DebugPage.tsx
new file mode 100644
index 0000000..2dc1b1f
--- /dev/null
+++ b/frontend/src/pages/DebugPage.tsx
@@ -0,0 +1,1069 @@
+import React, { useState, useCallback, useEffect } from 'react';
+import {
+ Box,
+ Card,
+ CardContent,
+ Typography,
+ TextField,
+ Button,
+ Grid,
+ Paper,
+ Stepper,
+ Step,
+ StepLabel,
+ StepContent,
+ Alert,
+ Chip,
+ Table,
+ TableBody,
+ TableCell,
+ TableContainer,
+ TableHead,
+ TableRow,
+ Accordion,
+ AccordionSummary,
+ AccordionDetails,
+ CircularProgress,
+ Container,
+ Tabs,
+ Tab,
+ LinearProgress,
+ Divider,
+} from '@mui/material';
+import {
+ ExpandMore as ExpandMoreIcon,
+ BugReport as BugReportIcon,
+ CheckCircle as CheckCircleIcon,
+ Error as ErrorIcon,
+ Warning as WarningIcon,
+ Pending as PendingIcon,
+ PlayArrow as PlayArrowIcon,
+ CloudUpload as UploadIcon,
+ Search as SearchIcon,
+ Refresh as RefreshIcon,
+ Visibility as PreviewIcon,
+} from '@mui/icons-material';
+import { api } from '../services/api';
+
+interface DebugStep {
+ step: number;
+ name: string;
+ status: string;
+ details: any;
+ success: boolean;
+ error?: string;
+}
+
+interface DebugInfo {
+ document_id: string;
+ filename: string;
+ overall_status: string;
+ pipeline_steps: DebugStep[];
+ failed_document_info?: any;
+ user_settings: any;
+ debug_timestamp: string;
+ detailed_processing_logs?: any[];
+ file_analysis?: {
+ file_size: number;
+ mime_type: string;
+ is_text_file: boolean;
+ is_image_file: boolean;
+ character_count: number;
+ word_count: number;
+ estimated_processing_time: number;
+ complexity_score: number;
+ [key: string]: any;
+ };
+}
+
+const DebugPage: React.FC = () => {
+ const [activeTab, setActiveTab] = useState(0);
+ const [documentId, setDocumentId] = useState('');
+ const [debugInfo, setDebugInfo] = useState(null);
+ const [loading, setLoading] = useState(false);
+ const [error, setError] = useState('');
+
+ // Upload functionality
+ const [selectedFile, setSelectedFile] = useState(null);
+ const [uploading, setUploading] = useState(false);
+ const [uploadProgress, setUploadProgress] = useState(0);
+ const [uploadedDocumentId, setUploadedDocumentId] = useState('');
+ const [monitoringInterval, setMonitoringInterval] = useState(null);
+ const [processingStatus, setProcessingStatus] = useState('');
+
+ const getStepIcon = (status: string, success: boolean) => {
+ if (status === 'processing') return ;
+ if (success || status === 'completed' || status === 'passed') return ;
+ if (status === 'failed' || status === 'error') return ;
+ if (status === 'pending' || status === 'not_reached') return ;
+ if (status === 'not_queued' || status === 'ocr_disabled') return ;
+ return ;
+ };
+
+ const getStatusColor = (status: string, success: boolean): "default" | "primary" | "secondary" | "error" | "info" | "success" | "warning" => {
+ if (status === 'processing') return 'info';
+ if (success || status === 'completed' || status === 'passed') return 'success';
+ if (status === 'failed' || status === 'error') return 'error';
+ if (status === 'pending' || status === 'not_reached') return 'default';
+ if (status === 'not_queued' || status === 'ocr_disabled') return 'warning';
+ return 'primary';
+ };
+
+ const fetchDebugInfo = useCallback(async (docId?: string, retryCount = 0) => {
+ const targetDocId = docId || documentId;
+ if (!targetDocId.trim()) {
+ setError('Please enter a document ID');
+ return;
+ }
+
+ setLoading(true);
+ if (retryCount === 0) {
+ setError(''); // Only clear error on first attempt
+ }
+
+ try {
+ const response = await api.get(`/documents/${targetDocId}/debug`);
+ setDebugInfo(response.data);
+ setError(''); // Clear any previous errors
+ } catch (err: any) {
+ console.error('Debug fetch error:', err);
+
+ // If it's a 404 and we haven't retried much, try again after a short delay
+ if (err.response?.status === 404 && retryCount < 3) {
+ console.log(`Document not found, retrying in ${(retryCount + 1) * 1000}ms... (attempt ${retryCount + 1})`);
+ setTimeout(() => {
+ fetchDebugInfo(docId, retryCount + 1);
+ }, (retryCount + 1) * 1000);
+ return;
+ }
+
+ const errorMessage = err.response?.status === 404
+ ? `Document ${targetDocId} not found. It may still be processing or may have been moved to failed documents.`
+ : err.response?.data?.message || `Failed to fetch debug information: ${err.message}`;
+ setError(errorMessage);
+ setDebugInfo(null);
+ } finally {
+ if (retryCount === 0) {
+ setLoading(false);
+ }
+ }
+ }, [documentId]);
+
+ const handleFileSelect = (event: React.ChangeEvent) => {
+ const file = event.target.files?.[0];
+ if (file) {
+ setSelectedFile(file);
+ setError('');
+ }
+ };
+
+ const uploadDocument = useCallback(async () => {
+ if (!selectedFile) {
+ setError('Please select a file to upload');
+ return;
+ }
+
+ setUploading(true);
+ setUploadProgress(0);
+ setError('');
+ setProcessingStatus('Uploading file...');
+
+ try {
+ const formData = new FormData();
+ formData.append('file', selectedFile);
+
+ const response = await api.post('/documents', formData, {
+ headers: {
+ 'Content-Type': 'multipart/form-data',
+ },
+ onUploadProgress: (progressEvent) => {
+ const progress = progressEvent.total
+ ? Math.round((progressEvent.loaded * 100) / progressEvent.total)
+ : 0;
+ setUploadProgress(progress);
+ },
+ });
+
+ const uploadedDoc = response.data;
+ setUploadedDocumentId(uploadedDoc.id);
+ setDocumentId(uploadedDoc.id);
+ setProcessingStatus('Document uploaded successfully. Starting OCR processing...');
+
+ // Start monitoring the processing
+ startProcessingMonitor(uploadedDoc.id);
+ } catch (err: any) {
+ setError(err.response?.data?.message || 'Failed to upload document');
+ setProcessingStatus('Upload failed');
+ } finally {
+ setUploading(false);
+ setUploadProgress(0);
+ }
+ }, [selectedFile]);
+
+ const startProcessingMonitor = useCallback((docId: string) => {
+ // Clear any existing interval
+ if (monitoringInterval) {
+ clearInterval(monitoringInterval);
+ }
+
+ const interval = setInterval(async () => {
+ try {
+ const response = await api.get(`/documents/${docId}`);
+ const doc = response.data;
+
+ if (doc.ocr_status === 'completed' || doc.ocr_status === 'failed') {
+ setProcessingStatus(`Processing ${doc.ocr_status}!`);
+ clearInterval(interval);
+ setMonitoringInterval(null);
+
+ // Auto-fetch debug info when processing is complete OR failed (but don't switch tabs)
+ setTimeout(() => {
+ fetchDebugInfo(docId);
+ // Don't auto-switch tabs - let user decide when to view debug info
+ }, 2000); // Give it a bit more time to ensure document is saved
+ } else if (doc.ocr_status === 'processing') {
+ setProcessingStatus('OCR processing in progress...');
+ } else if (doc.ocr_status === 'pending') {
+ setProcessingStatus('Document queued for OCR processing...');
+ } else {
+ setProcessingStatus('Checking processing status...');
+ }
+ } catch (err) {
+ console.error('Error monitoring processing:', err);
+ }
+ }, 2000); // Check every 2 seconds
+
+ setMonitoringInterval(interval);
+
+ // Auto-clear monitoring after 5 minutes
+ setTimeout(() => {
+ clearInterval(interval);
+ setMonitoringInterval(null);
+ setProcessingStatus('Monitoring stopped (timeout)');
+ }, 300000);
+ }, [monitoringInterval, fetchDebugInfo]);
+
+ // Cleanup interval on unmount
+ useEffect(() => {
+ return () => {
+ if (monitoringInterval) {
+ clearInterval(monitoringInterval);
+ }
+ };
+ }, [monitoringInterval]);
+
+ const renderStepDetails = (step: DebugStep) => {
+ const details = step.details;
+
+ return (
+
+ {step.error && (
+
+ {step.error}
+
+ )}
+
+ {step.step === 1 && ( // File Upload & Ingestion
+
+
+
+
+ File Information
+ Filename: {details.filename}
+ Original: {details.original_filename}
+ Size: {(details.file_size / 1024 / 1024).toFixed(2)} MB
+ MIME Type: {details.mime_type}
+ File Exists:
+
+
+
+
+ File Metadata
+ {details.file_metadata ? (
+ <>
+ Actual Size: {(details.file_metadata.size / 1024 / 1024).toFixed(2)} MB
+ Is File: {details.file_metadata.is_file ? 'Yes' : 'No'}
+ Modified: {details.file_metadata.modified ? new Date(details.file_metadata.modified.secs_since_epoch * 1000).toLocaleString() : 'Unknown'}
+ >
+ ) : (
+ File metadata not available
+ )}
+ Created: {new Date(details.created_at).toLocaleString()}
+
+
+
+
+ {details.file_analysis && (
+
+ Detailed File Analysis
+
+
+
+ Basic Analysis
+ File Type: {details.file_analysis.file_type}
+ Size: {(details.file_analysis.file_size_bytes / 1024 / 1024).toFixed(2)} MB
+ Readable:
+ {details.file_analysis.error_details && (
+
+ File Error: {details.file_analysis.error_details}
+
+ )}
+
+
+
+ {details.file_analysis.pdf_info ? (
+
+ PDF Analysis
+ Valid PDF:
+ PDF Version: {details.file_analysis.pdf_info.pdf_version || 'Unknown'}
+ Pages: {details.file_analysis.pdf_info.page_count || 'Unknown'}
+ Has Text:
+ Has Images:
+ Encrypted:
+ Font Count: {details.file_analysis.pdf_info.font_count}
+ Text Length: {details.file_analysis.pdf_info.estimated_text_length} chars
+ {details.file_analysis.pdf_info.text_extraction_error && (
+
+ PDF Text Extraction Error: {details.file_analysis.pdf_info.text_extraction_error}
+
+ )}
+
+ ) : details.file_analysis.text_preview ? (
+
+ Text Preview
+
+ {details.file_analysis.text_preview}
+
+
+ ) : (
+
+ File Content
+ No preview available for this file type
+
+ )}
+
+
+
+ )}
+
+ )}
+
+ {step.step === 2 && ( // OCR Queue Enrollment
+
+
+
+
+ Queue Status
+ User OCR Enabled:
+ Queue Entries: {details.queue_entries_count}
+
+
+
+
+ {details.queue_history && details.queue_history.length > 0 && (
+
+ Queue History
+
+
+
+
+ Status
+ Priority
+ Created
+ Started
+ Completed
+ Attempts
+ Worker
+
+
+
+ {details.queue_history.map((entry: any, index: number) => (
+
+
+
+
+ {entry.priority}
+ {new Date(entry.created_at).toLocaleString()}
+ {entry.started_at ? new Date(entry.started_at).toLocaleString() : '-'}
+ {entry.completed_at ? new Date(entry.completed_at).toLocaleString() : '-'}
+ {entry.attempts}
+ {entry.worker_id || '-'}
+
+ ))}
+
+
+
+
+ )}
+
+ )}
+
+ {step.step === 3 && ( // OCR Processing
+
+
+
+ OCR Results
+ Text Length: {details.ocr_text_length} characters
+ Confidence: {details.ocr_confidence ? `${details.ocr_confidence.toFixed(1)}%` : 'N/A'}
+ Word Count: {details.ocr_word_count || 0}
+ Processing Time: {details.ocr_processing_time_ms ? `${details.ocr_processing_time_ms}ms` : 'N/A'}
+ Completed: {details.ocr_completed_at ? new Date(details.ocr_completed_at).toLocaleString() : 'Not completed'}
+
+
+
+
+ Processing Details
+ Has Processed Image:
+ {details.processed_image_info && (
+ <>
+ Image Size: {details.processed_image_info.image_width}x{details.processed_image_info.image_height}
+ File Size: {(details.processed_image_info.file_size / 1024).toFixed(1)} KB
+ Processing Steps: {details.processed_image_info.processing_steps?.join(', ') || 'None'}
+ {details.processed_image_info.processing_parameters && (
+ Processing Parameters: {JSON.stringify(details.processed_image_info.processing_parameters)}
+ )}
+ >
+ )}
+
+
+
+ )}
+
+ {step.step === 4 && ( // Quality Validation
+
+
+
+
+ Quality Thresholds
+ Min Confidence: {details.quality_thresholds.min_confidence}%
+ Brightness: {details.quality_thresholds.brightness_threshold}
+ Contrast: {details.quality_thresholds.contrast_threshold}
+ Noise: {details.quality_thresholds.noise_threshold}
+ Sharpness: {details.quality_thresholds.sharpness_threshold}
+
+
+
+
+ Actual Values
+ Confidence: {details.actual_values.confidence ? `${details.actual_values.confidence.toFixed(1)}%` : 'N/A'}
+ Word Count: {details.actual_values.word_count || 0}
+ Processed Image Available:
+ {details.actual_values.processing_parameters && (
+ Processing Parameters: {JSON.stringify(details.actual_values.processing_parameters)}
+ )}
+
+
+
+
+
+ Quality Checks
+
+ {Object.entries(details.quality_checks).map(([check, passed]: [string, any]) => (
+
+ : passed === false ? : }
+ />
+
+ ))}
+
+
+
+ )}
+
+ );
+ };
+
+ const renderUploadTab = () => (
+
+
+
+
+ Upload Document for Debug Analysis
+
+
+ Upload a PDF or image file to analyze the processing pipeline in real-time.
+
+
+
+
+
+
+ {selectedFile && (
+
+
+ Selected: {selectedFile.name} ({(selectedFile.size / 1024 / 1024).toFixed(2)} MB)
+
+
+ )}
+
+ {selectedFile && (
+ : }
+ sx={{ mt: 2 }}
+ >
+ {uploading ? 'Uploading...' : 'Upload & Debug'}
+
+ )}
+
+
+ {uploading && uploadProgress > 0 && (
+
+
+ Upload Progress: {uploadProgress}%
+
+
+
+ )}
+
+ {processingStatus && (
+
+ {processingStatus}
+ {monitoringInterval && (
+
+
+
+ )}
+
+ )}
+
+ {uploadedDocumentId && (
+
+
+ Document ID: {uploadedDocumentId}
+
+
+
+
+
+
+
+ )}
+
+ {selectedFile && selectedFile.type.startsWith('image/') && (
+
+ Preview
+
+
+ )}
+
+
+
+ );
+
+ const renderSearchTab = () => (
+
+
+
+
+ Debug Existing Document
+
+
+ Enter a document ID to analyze the processing pipeline for an existing document.
+
+
+
+ setDocumentId(e.target.value)}
+ placeholder="e.g., 123e4567-e89b-12d3-a456-426614174000"
+ fullWidth
+ size="small"
+ />
+
+
+
+ {error && (
+
+ {error}
+
+ )}
+
+
+
+ );
+
+ return (
+
+
+
+
+ Document Processing Debug
+
+
+ Upload documents or analyze existing ones to troubleshoot OCR processing issues.
+
+
+
+
+
+ setActiveTab(newValue)}>
+ }
+ iconPosition="start"
+ />
+ }
+ iconPosition="start"
+ />
+ {debugInfo && (
+ }
+ iconPosition="start"
+ />
+ )}
+
+
+
+
+ {activeTab === 0 && renderUploadTab()}
+ {activeTab === 1 && renderSearchTab()}
+
+
+
+ {error && (
+
+ Debug Error
+ {error}
+
+ )}
+
+ {debugInfo && activeTab === 2 && (
+
+
+
+
+ Document: {debugInfo.filename}
+
+
+
+
+ Debug run at: {new Date(debugInfo.debug_timestamp).toLocaleString()}
+
+
+
+
+
+
+
+
+ Processing Pipeline
+
+
+ {debugInfo.pipeline_steps.map((step) => (
+
+
+
+ {step.name}
+
+
+
+
+ {renderStepDetails(step)}
+
+
+ ))}
+
+
+
+
+ {debugInfo.failed_document_info && (
+
+
+
+ Failed Document Information
+
+
+
+
+ Failure Details
+ Failure Reason: {debugInfo.failed_document_info.failure_reason}
+ Failure Stage: {debugInfo.failed_document_info.failure_stage}
+ Retry Count: {debugInfo.failed_document_info.retry_count || 0}
+ Created: {new Date(debugInfo.failed_document_info.created_at).toLocaleString()}
+ {debugInfo.failed_document_info.last_retry_at && (
+ Last Retry: {new Date(debugInfo.failed_document_info.last_retry_at).toLocaleString()}
+ )}
+
+
+
+
+ Failed OCR Results
+ {debugInfo.failed_document_info.failed_ocr_text ? (
+ <>
+ OCR Text Length: {debugInfo.failed_document_info.failed_ocr_text.length} chars
+ OCR Confidence: {debugInfo.failed_document_info.failed_ocr_confidence?.toFixed(1)}%
+ Word Count: {debugInfo.failed_document_info.failed_ocr_word_count || 0}
+ Processing Time: {debugInfo.failed_document_info.failed_ocr_processing_time_ms || 0}ms
+ >
+ ) : (
+ No OCR results available
+ )}
+
+
+ {debugInfo.failed_document_info.error_message && (
+
+
+ Error Message: {debugInfo.failed_document_info.error_message}
+
+
+ )}
+ {debugInfo.failed_document_info.content_preview && (
+
+
+ Content Preview
+
+ {debugInfo.failed_document_info.content_preview}
+
+
+
+ )}
+
+
+
+ )}
+
+ {debugInfo.detailed_processing_logs && debugInfo.detailed_processing_logs.length > 0 && (
+
+
+
+ Detailed Processing Logs
+
+
+ Complete history of all OCR processing attempts for this document.
+
+
+
+
+
+ Attempt
+ Status
+ Priority
+ Created
+ Started
+ Completed
+ Duration
+ Wait Time
+ Attempts
+ Worker
+ Error
+
+
+
+ {debugInfo.detailed_processing_logs.map((log: any, index: number) => (
+
+ {index + 1}
+
+
+
+ {log.priority}
+ {new Date(log.created_at).toLocaleString()}
+ {log.started_at ? new Date(log.started_at).toLocaleString() : '-'}
+ {log.completed_at ? new Date(log.completed_at).toLocaleString() : '-'}
+ {log.processing_duration_ms ? `${log.processing_duration_ms}ms` : '-'}
+ {log.queue_wait_time_ms ? `${log.queue_wait_time_ms}ms` : '-'}
+ {log.attempts || 0}
+ {log.worker_id || '-'}
+
+ {log.error_message ? (
+
+ {log.error_message}
+
+ ) : '-'}
+
+
+ ))}
+
+
+
+
+
+ )}
+
+ {debugInfo.file_analysis && (
+
+
+
+ File Analysis Summary
+
+
+
+
+ File Properties
+ File Type: {debugInfo.file_analysis.file_type}
+ Size: {(debugInfo.file_analysis.file_size_bytes / 1024 / 1024).toFixed(2)} MB
+ Readable:
+
+
+
+ {debugInfo.file_analysis.pdf_info && (
+
+ PDF Properties
+ Valid PDF:
+ Has Text Content:
+ Text Length: {debugInfo.file_analysis.pdf_info.estimated_text_length} chars
+ Page Count: {debugInfo.file_analysis.pdf_info.page_count || 'Unknown'}
+ Encrypted:
+
+ )}
+
+ {debugInfo.file_analysis.pdf_info?.text_extraction_error && (
+
+
+ PDF Text Extraction Issue: {debugInfo.file_analysis.pdf_info.text_extraction_error}
+
+
+ )}
+
+
+
+ )}
+
+ {debugInfo.pipeline_steps.some(step => step.step === 3 && step.details.has_processed_image) && (
+
+
+
+ Processed Images
+
+
+
+
+ Original Document
+
+
+
+
+
+ Processed Image (OCR Input)
+ {
+ (e.target as HTMLImageElement).style.display = 'none';
+ (e.target as HTMLImageElement).parentNode?.appendChild(
+ document.createTextNode('Processed image not available')
+ );
+ }}
+ sx={{
+ maxWidth: '100%',
+ maxHeight: '300px',
+ objectFit: 'contain',
+ border: '1px solid',
+ borderColor: 'divider',
+ borderRadius: 1
+ }}
+ />
+
+
+
+
+
+ )}
+
+
+
+
+ }>
+ User Settings
+
+
+
+
+
+ OCR Settings
+ Background OCR: {debugInfo.user_settings.enable_background_ocr ? 'Enabled' : 'Disabled'}
+ Min Confidence: {debugInfo.user_settings.ocr_min_confidence}%
+ Max File Size: {debugInfo.user_settings.max_file_size_mb} MB
+
+
+
+
+ Quality Thresholds
+ Brightness: {debugInfo.user_settings.quality_thresholds.brightness}
+ Contrast: {debugInfo.user_settings.quality_thresholds.contrast}
+ Noise: {debugInfo.user_settings.quality_thresholds.noise}
+ Sharpness: {debugInfo.user_settings.quality_thresholds.sharpness}
+
+
+
+
+
+
+
+
+ )}
+
+ );
+};
+
+export default DebugPage;
\ No newline at end of file
diff --git a/src/ocr/enhanced.rs b/src/ocr/enhanced.rs
index 2c1e3f7..f333a66 100644
--- a/src/ocr/enhanced.rs
+++ b/src/ocr/enhanced.rs
@@ -791,7 +791,7 @@ impl EnhancedOcrService {
/// Extract text from PDF with size and time limits
#[cfg(feature = "ocr")]
- pub async fn extract_text_from_pdf(&self, file_path: &str, _settings: &Settings) -> Result {
+ pub async fn extract_text_from_pdf(&self, file_path: &str, settings: &Settings) -> Result {
let start_time = std::time::Instant::now();
info!("Extracting text from PDF: {}", file_path);
@@ -888,16 +888,190 @@ impl EnhancedOcrService {
trimmed_text.chars().take(200).collect::()
);
+ // Smart detection: assess if text extraction quality is good enough
+ if self.is_text_extraction_quality_sufficient(&trimmed_text, word_count, file_size) {
+ info!("PDF text extraction successful for '{}', using extracted text", file_path);
+ Ok(OcrResult {
+ text: trimmed_text,
+ confidence: 95.0, // PDF text extraction is generally high confidence
+ processing_time_ms: processing_time,
+ word_count,
+ preprocessing_applied: vec!["PDF text extraction".to_string()],
+ processed_image_path: None,
+ })
+ } else {
+ info!("PDF text extraction insufficient for '{}' ({} words), falling back to OCR", file_path, word_count);
+ // Fall back to OCR using ocrmypdf
+ self.extract_text_from_pdf_with_ocr(file_path, settings, start_time).await
+ }
+ }
+
+ /// Assess if text extraction quality is sufficient or if OCR fallback is needed
+ #[cfg(feature = "ocr")]
+ fn is_text_extraction_quality_sufficient(&self, text: &str, word_count: usize, file_size: u64) -> bool {
+ // If we got no words at all, definitely need OCR
+ if word_count == 0 {
+ return false;
+ }
+
+ // For very small files, low word count might be normal
+ if file_size < 50_000 && word_count >= 1 {
+ return true;
+ }
+
+ // Calculate word density (words per KB)
+ let file_size_kb = (file_size as f64) / 1024.0;
+ let word_density = (word_count as f64) / file_size_kb;
+
+ // Reasonable thresholds based on typical PDF content:
+ // - Text-based PDFs typically have 50-200 words per KB
+ // - Below 5 words per KB suggests mostly images/scanned content
+ const MIN_WORD_DENSITY: f64 = 5.0;
+ const MIN_WORDS_FOR_LARGE_FILES: usize = 10;
+
+ if word_density < MIN_WORD_DENSITY && word_count < MIN_WORDS_FOR_LARGE_FILES {
+ debug!("PDF appears to be image-based: {} words in {:.1} KB (density: {:.2} words/KB)",
+ word_count, file_size_kb, word_density);
+ return false;
+ }
+
+ // Additional check: if text is mostly non-alphanumeric, might be extraction artifacts
+ let alphanumeric_chars = text.chars().filter(|c| c.is_alphanumeric()).count();
+ let alphanumeric_ratio = if text.len() > 0 {
+ (alphanumeric_chars as f64) / (text.len() as f64)
+ } else {
+ 0.0
+ };
+
+ // If less than 30% alphanumeric content, likely poor extraction
+ if alphanumeric_ratio < 0.3 {
+ debug!("PDF text has low alphanumeric content: {:.1}% ({} of {} chars)",
+ alphanumeric_ratio * 100.0, alphanumeric_chars, text.len());
+ return false;
+ }
+
+ debug!("PDF text extraction quality sufficient: {} words, {:.2} words/KB, {:.1}% alphanumeric",
+ word_count, word_density, alphanumeric_ratio * 100.0);
+ true
+ }
+
+ /// Extract text from PDF using OCR (ocrmypdf) for image-based or poor-quality PDFs
+ #[cfg(feature = "ocr")]
+ async fn extract_text_from_pdf_with_ocr(&self, file_path: &str, settings: &Settings, start_time: std::time::Instant) -> Result {
+ info!("Starting OCR extraction for PDF: {}", file_path);
+
+ // Check if ocrmypdf is available
+ if !self.is_ocrmypdf_available().await {
+ return Err(anyhow!(
+ "ocrmypdf is not available on this system. To extract text from image-based PDFs like '{}', please install ocrmypdf. \
+ On Ubuntu/Debian: 'apt-get install ocrmypdf'. \
+ On macOS: 'brew install ocrmypdf'. \
+ Alternatively, convert the PDF to images and upload those instead.",
+ file_path
+ ));
+ }
+
+ // Generate temporary file path for OCR'd PDF
+ let temp_ocr_filename = format!("ocr_{}_{}.pdf",
+ std::process::id(),
+ std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)?.as_millis()
+ );
+ let temp_ocr_path = format!("{}/{}", self.temp_dir, temp_ocr_filename);
+
+ // Run ocrmypdf to create searchable PDF
+ let ocrmypdf_result = tokio::time::timeout(
+ std::time::Duration::from_secs(300), // 5 minute timeout for OCR
+ tokio::task::spawn_blocking({
+ let file_path = file_path.to_string();
+ let temp_ocr_path = temp_ocr_path.clone();
+ move || {
+ std::process::Command::new("ocrmypdf")
+ .arg("--force-ocr") // OCR even if text is detected
+ .arg("-O2") // Optimize level 2 (balanced quality/speed)
+ .arg("--deskew") // Correct skewed pages
+ .arg("--clean") // Clean up artifacts
+ .arg("--language")
+ .arg("eng") // English language
+ .arg(&file_path)
+ .arg(&temp_ocr_path)
+ .output()
+ }
+ })
+ ).await;
+
+ let ocrmypdf_output = match ocrmypdf_result {
+ Ok(Ok(output)) => output?,
+ Ok(Err(e)) => return Err(anyhow!("Failed to join ocrmypdf task: {}", e)),
+ Err(_) => return Err(anyhow!("ocrmypdf timed out after 5 minutes for file '{}'", file_path)),
+ };
+
+ if !ocrmypdf_output.status.success() {
+ let stderr = String::from_utf8_lossy(&ocrmypdf_output.stderr);
+ let stdout = String::from_utf8_lossy(&ocrmypdf_output.stdout);
+ return Err(anyhow!(
+ "ocrmypdf failed for '{}': Exit code {}\nStderr: {}\nStdout: {}",
+ file_path, ocrmypdf_output.status.code().unwrap_or(-1), stderr, stdout
+ ));
+ }
+
+ // Extract text from the OCR'd PDF
+ let ocr_text_result = tokio::task::spawn_blocking({
+ let temp_ocr_path = temp_ocr_path.clone();
+ move || -> Result {
+ let bytes = std::fs::read(&temp_ocr_path)?;
+ let text = pdf_extract::extract_text_from_mem(&bytes)?;
+ Ok(text.trim().to_string())
+ }
+ }).await??;
+
+ // Clean up temporary file
+ let _ = tokio::fs::remove_file(&temp_ocr_path).await;
+
+ let processing_time = start_time.elapsed().as_millis() as u64;
+ let word_count = self.count_words_safely(&ocr_text_result);
+
+ info!("OCR extraction completed for '{}': {} words in {}ms",
+ file_path, word_count, processing_time);
+
Ok(OcrResult {
- text: trimmed_text,
- confidence: 95.0, // PDF text extraction is generally high confidence
+ text: ocr_text_result,
+ confidence: 85.0, // OCR is generally lower confidence than direct text extraction
processing_time_ms: processing_time,
word_count,
- preprocessing_applied: vec!["PDF text extraction".to_string()],
- processed_image_path: None, // No image processing for PDF text extraction
+ preprocessing_applied: vec!["OCR via ocrmypdf".to_string()],
+ processed_image_path: None,
})
}
+ /// Check if ocrmypdf is available on the system
+ #[cfg(feature = "ocr")]
+ async fn is_ocrmypdf_available(&self) -> bool {
+ match tokio::process::Command::new("ocrmypdf")
+ .arg("--version")
+ .output()
+ .await
+ {
+ Ok(output) => output.status.success(),
+ Err(_) => false,
+ }
+ }
+
+ #[cfg(not(feature = "ocr"))]
+ fn is_text_extraction_quality_sufficient(&self, _text: &str, _word_count: usize, _file_size: u64) -> bool {
+ // When OCR is disabled, always accept text extraction results
+ true
+ }
+
+ #[cfg(not(feature = "ocr"))]
+ async fn is_ocrmypdf_available(&self) -> bool {
+ false // OCR feature not enabled
+ }
+
+ #[cfg(not(feature = "ocr"))]
+ async fn extract_text_from_pdf_with_ocr(&self, file_path: &str, _settings: &Settings, _start_time: std::time::Instant) -> Result {
+ Err(anyhow::anyhow!("OCR feature not enabled - cannot process image-based PDF: {}", file_path))
+ }
+
/// Resolve file path to actual location, handling both old and new directory structures
async fn resolve_file_path(&self, file_path: &str) -> Result {
// Use the FileService's resolve_file_path method
@@ -988,7 +1162,7 @@ impl EnhancedOcrService {
/// Safely count words to prevent overflow on very large texts
#[cfg(feature = "ocr")]
- fn count_words_safely(&self, text: &str) -> usize {
+ pub fn count_words_safely(&self, text: &str) -> usize {
// For very large texts, sample to estimate word count to prevent overflow
if text.len() > 1_000_000 { // > 1MB of text
// Sample first 100KB and extrapolate
@@ -1008,31 +1182,51 @@ impl EnhancedOcrService {
fn count_words_in_text(&self, text: &str) -> usize {
let whitespace_words = text.split_whitespace().count();
- // If no whitespace-separated words found but text exists, try alternative word detection
- if whitespace_words == 0 && !text.trim().is_empty() {
- // For PDFs that extract as continuous text, estimate words based on character patterns
- // Look for transitions from letters to non-letters as potential word boundaries
- let mut word_count = 0;
- let mut in_word = false;
+ // If we have exactly 1 "word" but it's very long (likely continuous text), try enhanced detection
+ // OR if we have no whitespace words but text exists
+ let is_continuous_text = whitespace_words == 1 && text.len() > 15; // 15+ chars suggests it might be continuous
+ let is_no_words = whitespace_words == 0 && !text.trim().is_empty();
+
+ if is_continuous_text || is_no_words {
+ // Count total alphanumeric characters first
+ let alphanumeric_chars = text.chars().filter(|c| c.is_alphanumeric()).count();
- for c in text.chars() {
- if c.is_alphabetic() {
- if !in_word {
- word_count += 1;
- in_word = true;
- }
- } else {
- in_word = false;
+ // If no alphanumeric content, it's pure punctuation/symbols
+ if alphanumeric_chars == 0 {
+ return 0;
+ }
+
+ // For continuous text, look for word boundaries using multiple strategies
+ let mut word_count = 0;
+
+ // Strategy 1: Count transitions from lowercase to uppercase (camelCase detection)
+ let chars: Vec = text.chars().collect();
+ let mut camel_transitions = 0;
+
+ for i in 1..chars.len() {
+ let prev_char = chars[i-1];
+ let curr_char = chars[i];
+
+ // Count transitions from lowercase letter to uppercase letter
+ if prev_char.is_lowercase() && curr_char.is_uppercase() {
+ camel_transitions += 1;
+ }
+ // Count transitions from letter to digit or digit to letter
+ else if (prev_char.is_alphabetic() && curr_char.is_numeric()) ||
+ (prev_char.is_numeric() && curr_char.is_alphabetic()) {
+ camel_transitions += 1;
}
}
- // If still no words found but we have alphanumeric content,
- // estimate based on reasonable word length (assume ~5 chars per word)
+ // If we found camelCase transitions, estimate words
+ if camel_transitions > 0 {
+ word_count = camel_transitions + 1; // +1 for the first word
+ }
+
+ // Strategy 2: If no camelCase detected, estimate based on character count
if word_count == 0 {
- let alphanumeric_chars = text.chars().filter(|c| c.is_alphanumeric()).count();
- if alphanumeric_chars > 0 {
- word_count = (alphanumeric_chars / 5).max(1);
- }
+ // Estimate based on typical word length (4-6 characters per word)
+ word_count = (alphanumeric_chars / 5).max(1);
}
word_count
diff --git a/src/routes/documents.rs b/src/routes/documents.rs
index f8c5a43..cec3549 100644
--- a/src/routes/documents.rs
+++ b/src/routes/documents.rs
@@ -58,6 +58,7 @@ pub fn router() -> Router> {
.route("/{id}/ocr", get(get_document_ocr))
.route("/{id}/processed-image", get(get_processed_image))
.route("/{id}/retry-ocr", post(retry_ocr))
+ .route("/{id}/debug", get(get_document_debug_info))
.route("/duplicates", get(get_user_duplicates))
.route("/failed", get(get_failed_documents))
.route("/failed/{id}/view", get(view_failed_document))
@@ -645,6 +646,560 @@ async fn retry_ocr(
}
}
+#[utoipa::path(
+ get,
+ path = "/api/documents/{id}/debug",
+ tag = "documents",
+ security(
+ ("bearer_auth" = [])
+ ),
+ params(
+ ("id" = uuid::Uuid, Path, description = "Document ID")
+ ),
+ responses(
+ (status = 200, description = "Debug information for document processing pipeline", body = String),
+ (status = 404, description = "Document not found"),
+ (status = 401, description = "Unauthorized")
+ )
+)]
+async fn get_document_debug_info(
+ State(state): State>,
+ auth_user: AuthUser,
+ Path(document_id): Path,
+) -> Result, StatusCode> {
+ tracing::info!("Starting debug analysis for document {} by user {}", document_id, auth_user.user.id);
+
+ // Get the document
+ let document = match state
+ .db
+ .get_document_by_id(document_id, auth_user.user.id, auth_user.user.role)
+ .await
+ {
+ Ok(Some(doc)) => {
+ tracing::info!("Found document: {} ({})", doc.filename, doc.mime_type);
+ doc
+ }
+ Ok(None) => {
+ tracing::warn!("Document {} not found for user {}", document_id, auth_user.user.id);
+ return Err(StatusCode::NOT_FOUND);
+ }
+ Err(e) => {
+ tracing::error!("Database error fetching document {}: {}", document_id, e);
+ return Err(StatusCode::INTERNAL_SERVER_ERROR);
+ }
+ };
+
+ // Get user settings
+ tracing::info!("Fetching user settings for user {}", auth_user.user.id);
+ let settings = match state
+ .db
+ .get_user_settings(auth_user.user.id)
+ .await
+ {
+ Ok(Some(s)) => {
+ tracing::info!("Found user settings: OCR enabled={}, min_confidence={}", s.enable_background_ocr, s.ocr_min_confidence);
+ s
+ }
+ Ok(None) => {
+ tracing::info!("No user settings found, using defaults");
+ crate::models::Settings::default()
+ }
+ Err(e) => {
+ tracing::error!("Error fetching user settings: {}", e);
+ return Err(StatusCode::INTERNAL_SERVER_ERROR);
+ }
+ };
+
+ // Get OCR queue history for this document
+ tracing::info!("Fetching OCR queue history for document {}", document_id);
+ let queue_history = match sqlx::query(
+ r#"
+ SELECT id, status, priority, created_at, started_at, completed_at,
+ error_message, attempts, worker_id
+ FROM ocr_queue
+ WHERE document_id = $1
+ ORDER BY created_at DESC
+ LIMIT 10
+ "#
+ )
+ .bind(document_id)
+ .fetch_all(state.db.get_pool())
+ .await {
+ Ok(history) => {
+ tracing::info!("Queue history query successful, found {} entries", history.len());
+ history
+ },
+ Err(e) => {
+ tracing::error!("Queue history query error: {}", e);
+ return Err(StatusCode::INTERNAL_SERVER_ERROR);
+ }
+ };
+
+ // Get processed image info if it exists
+ tracing::info!("Fetching processed image for document {}", document_id);
+ let processed_image = match state
+ .db
+ .get_processed_image_by_document_id(document_id, auth_user.user.id)
+ .await {
+ Ok(Some(img)) => {
+ tracing::info!("Found processed image for document {}", document_id);
+ Some(img)
+ },
+ Ok(None) => {
+ tracing::info!("No processed image found for document {}", document_id);
+ None
+ },
+ Err(e) => {
+ tracing::warn!("Error fetching processed image for document {}: {}", document_id, e);
+ None
+ }
+ };
+
+ // Get failed document record if it exists
+ tracing::info!("Fetching failed document record for document {}", document_id);
+ let failed_document = match sqlx::query(
+ r#"
+ SELECT failure_reason, failure_stage, error_message, retry_count,
+ last_retry_at, created_at, content, ocr_text, ocr_confidence,
+ ocr_word_count, ocr_processing_time_ms
+ FROM failed_documents
+ WHERE id = $1 OR existing_document_id = $1
+ ORDER BY created_at DESC
+ LIMIT 1
+ "#
+ )
+ .bind(document_id)
+ .fetch_optional(state.db.get_pool())
+ .await {
+ Ok(result) => {
+ tracing::info!("Failed document query successful, found: {}", result.is_some());
+ result
+ },
+ Err(e) => {
+ tracing::error!("Failed document query error: {}", e);
+ return Err(StatusCode::INTERNAL_SERVER_ERROR);
+ }
+ };
+
+ // Get detailed OCR processing logs and attempts
+ tracing::info!("Fetching detailed OCR processing logs for document {}", document_id);
+ let ocr_processing_logs = match sqlx::query(
+ r#"
+ SELECT id, status, priority, created_at, started_at, completed_at,
+ error_message, attempts, worker_id, processing_time_ms, file_size
+ FROM ocr_queue
+ WHERE document_id = $1
+ ORDER BY created_at ASC
+ "#
+ )
+ .bind(document_id)
+ .fetch_all(state.db.get_pool())
+ .await {
+ Ok(logs) => {
+ tracing::info!("OCR processing logs query successful, found {} entries", logs.len());
+ logs
+ },
+ Err(e) => {
+ tracing::error!("OCR processing logs query error: {}", e);
+ return Err(StatusCode::INTERNAL_SERVER_ERROR);
+ }
+ };
+
+ // File service for file info
+ let file_service = FileService::new(state.config.upload_path.clone());
+
+ // Check if file exists
+ let file_exists = tokio::fs::metadata(&document.file_path).await.is_ok();
+ let file_metadata = if file_exists {
+ tokio::fs::metadata(&document.file_path).await.ok()
+ } else {
+ None
+ };
+
+ // Try to analyze file content for additional diagnostic info
+ tracing::info!("Analyzing file content for document {} (exists: {})", document_id, file_exists);
+ let file_analysis = if file_exists {
+ match analyze_file_content(&document.file_path, &document.mime_type).await {
+ Ok(analysis) => {
+ tracing::info!("File analysis successful for document {}", document_id);
+ analysis
+ },
+ Err(e) => {
+ tracing::warn!("Failed to analyze file content for {}: {}", document_id, e);
+ FileAnalysis {
+ error_details: Some(format!("File analysis failed: {}", e)),
+ ..Default::default()
+ }
+ }
+ }
+ } else {
+ tracing::warn!("File does not exist for document {}, skipping analysis", document_id);
+ FileAnalysis::default()
+ };
+
+ // Pipeline steps analysis
+ let mut pipeline_steps = Vec::new();
+
+ // Step 1: File Upload & Ingestion
+ pipeline_steps.push(serde_json::json!({
+ "step": 1,
+ "name": "File Upload & Ingestion",
+ "status": "completed", // Document exists if we got this far
+ "details": {
+ "filename": document.filename,
+ "original_filename": document.original_filename,
+ "file_size": document.file_size,
+ "mime_type": document.mime_type,
+ "file_exists": file_exists,
+ "file_path": document.file_path,
+ "created_at": document.created_at,
+ "file_metadata": file_metadata.as_ref().map(|m| serde_json::json!({
+ "size": m.len(),
+ "modified": m.modified().ok(),
+ "is_file": m.is_file(),
+ "is_dir": m.is_dir()
+ })),
+ "file_analysis": file_analysis
+ },
+ "success": true,
+ "error": None::
+ }));
+
+ // Step 2: OCR Queue Enrollment
+ let queue_enrollment_status = if queue_history.is_empty() {
+ if settings.enable_background_ocr {
+ "not_queued"
+ } else {
+ "ocr_disabled"
+ }
+ } else {
+ "queued"
+ };
+
+ pipeline_steps.push(serde_json::json!({
+ "step": 2,
+ "name": "OCR Queue Enrollment",
+ "status": queue_enrollment_status,
+ "details": {
+ "user_ocr_enabled": settings.enable_background_ocr,
+ "queue_entries_count": queue_history.len(),
+ "queue_history": queue_history.iter().map(|row| serde_json::json!({
+ "id": row.get::("id"),
+ "status": row.get::("status"),
+ "priority": row.get::("priority"),
+ "created_at": row.get::, _>("created_at"),
+ "started_at": row.get::