diff --git a/Dockerfile b/Dockerfile index 7292b3c..7bbfdb0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,6 +20,7 @@ RUN apt-get update && apt-get install -y \ libclang-dev \ clang \ poppler-utils \ + ocrmypdf \ && rm -rf /var/lib/apt/lists/* WORKDIR /app @@ -37,6 +38,7 @@ RUN apt-get update && apt-get install -y \ tesseract-ocr-eng \ ca-certificates \ poppler-utils \ + ocrmypdf \ && rm -rf /var/lib/apt/lists/* WORKDIR /app diff --git a/create_test_pdfs.py b/create_test_pdfs.py new file mode 100644 index 0000000..d4055d3 --- /dev/null +++ b/create_test_pdfs.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python3 +""" +Create proper test PDFs for debugging OCR word counting issues. +""" + +try: + from reportlab.pdfgen import canvas + from reportlab.lib.pagesizes import letter + import os +except ImportError: + print("reportlab not installed. Trying alternative method...") + # Alternative: create simple text files for testing + import os + + def create_simple_test_files(): + """Create simple text files as a fallback""" + test_dir = "tests/test_pdfs" + os.makedirs(test_dir, exist_ok=True) + + # Test cases that would be similar to PDF extraction results + test_cases = [ + ("normal_spacing.txt", "This is a normal document with proper word spacing and punctuation."), + ("soclogix_sample.txt", "SOCLogix Non-Disclosure Agreement\nThis agreement is entered into between SOCLogix and the recipient for the purpose of protecting confidential information."), + ("multiline_text.txt", "Line one with several words\nLine two with more content\nLine three continues the pattern\nFinal line ends the document"), + ("mixed_content.txt", "Document with numbers 123 and symbols @#$ mixed with normal text."), + ("special_chars.txt", "Text with special characters: café naïve résumé — and 'quotes' • bullets"), + ] + + for filename, content in test_cases: + with open(f"{test_dir}/{filename}", "w", encoding="utf-8") as f: + f.write(content) + + print("Created simple text files for testing") + return True + + if not create_simple_test_files(): + exit(1) + exit(0) + +def create_test_pdfs(): + """Create proper test PDFs using reportlab""" + test_dir = "tests/test_pdfs" + os.makedirs(test_dir, exist_ok=True) + + # Test case 1: Normal spacing (like SOCLogix NDA) + pdf_path = f"{test_dir}/soclogix_nda_realistic.pdf" + c = canvas.Canvas(pdf_path, pagesize=letter) + width, height = letter + + # Add text with normal spacing + c.setFont("Helvetica", 12) + y_position = height - 100 + + lines = [ + "SOCLogix Non-Disclosure Agreement", + "", + "This agreement is entered into between SOCLogix and the recipient", + "for the purpose of protecting confidential information.", + "", + "The recipient agrees to maintain strict confidentiality", + "regarding all proprietary information disclosed.", + "", + "This includes but is not limited to technical specifications,", + "business plans, customer lists, and financial data.", + "", + "Any breach of this agreement may result in legal action.", + "The agreement remains in effect for a period of five years.", + ] + + for line in lines: + if line: # Skip empty lines for positioning + c.drawString(72, y_position, line) + y_position -= 20 + + c.save() + print(f"Created: {pdf_path}") + + # Test case 2: Multi-page document + pdf_path = f"{test_dir}/multipage_realistic.pdf" + c = canvas.Canvas(pdf_path, pagesize=letter) + + # Page 1 + c.setFont("Helvetica", 12) + y_position = height - 100 + + page1_lines = [ + "Page 1: Document with Multiple Pages", + "", + "This is the first page of a multi-page document.", + "It contains multiple sentences with proper spacing.", + "Each line should be counted as separate words.", + "Word boundaries are clearly defined with spaces.", + "", + "Numbers like 123, 456, and 789 should also count.", + "Punctuation marks help separate thoughts.", + "Total words on this page should be easily counted.", + ] + + for line in page1_lines: + if line: + c.drawString(72, y_position, line) + y_position -= 20 + + # Start new page + c.showPage() + y_position = height - 100 + + page2_lines = [ + "Page 2: Continuing from Previous Page", + "", + "This page also has normal text formatting.", + "Word counting should work correctly here too.", + "Mixed content: ABC123 def456 GHI789 works fine.", + "", + "Special characters like café, naïve, and résumé", + "should also be handled properly by the extraction.", + "", + "End of document with proper word boundaries.", + ] + + for line in page2_lines: + if line: + c.drawString(72, y_position, line) + y_position -= 20 + + c.save() + print(f"Created: {pdf_path}") + + # Test case 3: Document with problematic patterns + pdf_path = f"{test_dir}/edge_cases_realistic.pdf" + c = canvas.Canvas(pdf_path, pagesize=letter) + c.setFont("Helvetica", 12) + y_position = height - 100 + + edge_case_lines = [ + "Edge Cases for Word Counting", + "", + "Normal text with proper spacing works fine.", + "TextWithoutSpacesButCamelCase should be detected.", + "ALLCAPSTEXT might be problematic.", + "mixed123CASE456text789 has transitions.", + "", + "Punctuation!!! should not count as words.", + "But text-with-hyphens should count properly.", + "Email@example.com and URLs http://test.com too.", + "", + "End with normal text to verify counting.", + ] + + for line in edge_case_lines: + if line: + c.drawString(72, y_position, line) + y_position -= 20 + + c.save() + print(f"Created: {pdf_path}") + + print("\nAll test PDFs created successfully!") + return True + +if __name__ == "__main__": + create_test_pdfs() \ No newline at end of file diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index 3c5de8f..30b0371 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -18,6 +18,7 @@ import WatchFolderPage from './pages/WatchFolderPage'; import DocumentManagementPage from './pages/DocumentManagementPage'; import LabelsPage from './pages/LabelsPage'; import IgnoredFilesPage from './pages/IgnoredFilesPage'; +import DebugPage from './pages/DebugPage'; function App(): React.ReactElement { const { user, loading } = useAuth(); @@ -77,6 +78,7 @@ function App(): React.ReactElement { } /> } /> } /> + } /> Profile Page - Coming Soon} /> diff --git a/frontend/src/components/Layout/AppLayout.tsx b/frontend/src/components/Layout/AppLayout.tsx index 1fb91aa..5855e11 100644 --- a/frontend/src/components/Layout/AppLayout.tsx +++ b/frontend/src/components/Layout/AppLayout.tsx @@ -37,6 +37,7 @@ import { Block as BlockIcon, Api as ApiIcon, ManageAccounts as ManageIcon, + BugReport as BugReportIcon, } from '@mui/icons-material'; import { useNavigate, useLocation } from 'react-router-dom'; import { useAuth } from '../../contexts/AuthContext'; @@ -72,6 +73,7 @@ const navigationItems: NavigationItem[] = [ { text: 'Watch Folder', icon: FolderIcon, path: '/watch' }, { text: 'Document Management', icon: ManageIcon, path: '/documents/management' }, { text: 'Ignored Files', icon: BlockIcon, path: '/ignored-files' }, + { text: 'Debug', icon: BugReportIcon, path: '/debug' }, ]; const AppLayout: React.FC = ({ children }) => { diff --git a/frontend/src/pages/DebugPage.tsx b/frontend/src/pages/DebugPage.tsx new file mode 100644 index 0000000..2dc1b1f --- /dev/null +++ b/frontend/src/pages/DebugPage.tsx @@ -0,0 +1,1069 @@ +import React, { useState, useCallback, useEffect } from 'react'; +import { + Box, + Card, + CardContent, + Typography, + TextField, + Button, + Grid, + Paper, + Stepper, + Step, + StepLabel, + StepContent, + Alert, + Chip, + Table, + TableBody, + TableCell, + TableContainer, + TableHead, + TableRow, + Accordion, + AccordionSummary, + AccordionDetails, + CircularProgress, + Container, + Tabs, + Tab, + LinearProgress, + Divider, +} from '@mui/material'; +import { + ExpandMore as ExpandMoreIcon, + BugReport as BugReportIcon, + CheckCircle as CheckCircleIcon, + Error as ErrorIcon, + Warning as WarningIcon, + Pending as PendingIcon, + PlayArrow as PlayArrowIcon, + CloudUpload as UploadIcon, + Search as SearchIcon, + Refresh as RefreshIcon, + Visibility as PreviewIcon, +} from '@mui/icons-material'; +import { api } from '../services/api'; + +interface DebugStep { + step: number; + name: string; + status: string; + details: any; + success: boolean; + error?: string; +} + +interface DebugInfo { + document_id: string; + filename: string; + overall_status: string; + pipeline_steps: DebugStep[]; + failed_document_info?: any; + user_settings: any; + debug_timestamp: string; + detailed_processing_logs?: any[]; + file_analysis?: { + file_size: number; + mime_type: string; + is_text_file: boolean; + is_image_file: boolean; + character_count: number; + word_count: number; + estimated_processing_time: number; + complexity_score: number; + [key: string]: any; + }; +} + +const DebugPage: React.FC = () => { + const [activeTab, setActiveTab] = useState(0); + const [documentId, setDocumentId] = useState(''); + const [debugInfo, setDebugInfo] = useState(null); + const [loading, setLoading] = useState(false); + const [error, setError] = useState(''); + + // Upload functionality + const [selectedFile, setSelectedFile] = useState(null); + const [uploading, setUploading] = useState(false); + const [uploadProgress, setUploadProgress] = useState(0); + const [uploadedDocumentId, setUploadedDocumentId] = useState(''); + const [monitoringInterval, setMonitoringInterval] = useState(null); + const [processingStatus, setProcessingStatus] = useState(''); + + const getStepIcon = (status: string, success: boolean) => { + if (status === 'processing') return ; + if (success || status === 'completed' || status === 'passed') return ; + if (status === 'failed' || status === 'error') return ; + if (status === 'pending' || status === 'not_reached') return ; + if (status === 'not_queued' || status === 'ocr_disabled') return ; + return ; + }; + + const getStatusColor = (status: string, success: boolean): "default" | "primary" | "secondary" | "error" | "info" | "success" | "warning" => { + if (status === 'processing') return 'info'; + if (success || status === 'completed' || status === 'passed') return 'success'; + if (status === 'failed' || status === 'error') return 'error'; + if (status === 'pending' || status === 'not_reached') return 'default'; + if (status === 'not_queued' || status === 'ocr_disabled') return 'warning'; + return 'primary'; + }; + + const fetchDebugInfo = useCallback(async (docId?: string, retryCount = 0) => { + const targetDocId = docId || documentId; + if (!targetDocId.trim()) { + setError('Please enter a document ID'); + return; + } + + setLoading(true); + if (retryCount === 0) { + setError(''); // Only clear error on first attempt + } + + try { + const response = await api.get(`/documents/${targetDocId}/debug`); + setDebugInfo(response.data); + setError(''); // Clear any previous errors + } catch (err: any) { + console.error('Debug fetch error:', err); + + // If it's a 404 and we haven't retried much, try again after a short delay + if (err.response?.status === 404 && retryCount < 3) { + console.log(`Document not found, retrying in ${(retryCount + 1) * 1000}ms... (attempt ${retryCount + 1})`); + setTimeout(() => { + fetchDebugInfo(docId, retryCount + 1); + }, (retryCount + 1) * 1000); + return; + } + + const errorMessage = err.response?.status === 404 + ? `Document ${targetDocId} not found. It may still be processing or may have been moved to failed documents.` + : err.response?.data?.message || `Failed to fetch debug information: ${err.message}`; + setError(errorMessage); + setDebugInfo(null); + } finally { + if (retryCount === 0) { + setLoading(false); + } + } + }, [documentId]); + + const handleFileSelect = (event: React.ChangeEvent) => { + const file = event.target.files?.[0]; + if (file) { + setSelectedFile(file); + setError(''); + } + }; + + const uploadDocument = useCallback(async () => { + if (!selectedFile) { + setError('Please select a file to upload'); + return; + } + + setUploading(true); + setUploadProgress(0); + setError(''); + setProcessingStatus('Uploading file...'); + + try { + const formData = new FormData(); + formData.append('file', selectedFile); + + const response = await api.post('/documents', formData, { + headers: { + 'Content-Type': 'multipart/form-data', + }, + onUploadProgress: (progressEvent) => { + const progress = progressEvent.total + ? Math.round((progressEvent.loaded * 100) / progressEvent.total) + : 0; + setUploadProgress(progress); + }, + }); + + const uploadedDoc = response.data; + setUploadedDocumentId(uploadedDoc.id); + setDocumentId(uploadedDoc.id); + setProcessingStatus('Document uploaded successfully. Starting OCR processing...'); + + // Start monitoring the processing + startProcessingMonitor(uploadedDoc.id); + } catch (err: any) { + setError(err.response?.data?.message || 'Failed to upload document'); + setProcessingStatus('Upload failed'); + } finally { + setUploading(false); + setUploadProgress(0); + } + }, [selectedFile]); + + const startProcessingMonitor = useCallback((docId: string) => { + // Clear any existing interval + if (monitoringInterval) { + clearInterval(monitoringInterval); + } + + const interval = setInterval(async () => { + try { + const response = await api.get(`/documents/${docId}`); + const doc = response.data; + + if (doc.ocr_status === 'completed' || doc.ocr_status === 'failed') { + setProcessingStatus(`Processing ${doc.ocr_status}!`); + clearInterval(interval); + setMonitoringInterval(null); + + // Auto-fetch debug info when processing is complete OR failed (but don't switch tabs) + setTimeout(() => { + fetchDebugInfo(docId); + // Don't auto-switch tabs - let user decide when to view debug info + }, 2000); // Give it a bit more time to ensure document is saved + } else if (doc.ocr_status === 'processing') { + setProcessingStatus('OCR processing in progress...'); + } else if (doc.ocr_status === 'pending') { + setProcessingStatus('Document queued for OCR processing...'); + } else { + setProcessingStatus('Checking processing status...'); + } + } catch (err) { + console.error('Error monitoring processing:', err); + } + }, 2000); // Check every 2 seconds + + setMonitoringInterval(interval); + + // Auto-clear monitoring after 5 minutes + setTimeout(() => { + clearInterval(interval); + setMonitoringInterval(null); + setProcessingStatus('Monitoring stopped (timeout)'); + }, 300000); + }, [monitoringInterval, fetchDebugInfo]); + + // Cleanup interval on unmount + useEffect(() => { + return () => { + if (monitoringInterval) { + clearInterval(monitoringInterval); + } + }; + }, [monitoringInterval]); + + const renderStepDetails = (step: DebugStep) => { + const details = step.details; + + return ( + + {step.error && ( + + {step.error} + + )} + + {step.step === 1 && ( // File Upload & Ingestion + + + + + File Information + Filename: {details.filename} + Original: {details.original_filename} + Size: {(details.file_size / 1024 / 1024).toFixed(2)} MB + MIME Type: {details.mime_type} + File Exists: + + + + + File Metadata + {details.file_metadata ? ( + <> + Actual Size: {(details.file_metadata.size / 1024 / 1024).toFixed(2)} MB + Is File: {details.file_metadata.is_file ? 'Yes' : 'No'} + Modified: {details.file_metadata.modified ? new Date(details.file_metadata.modified.secs_since_epoch * 1000).toLocaleString() : 'Unknown'} + + ) : ( + File metadata not available + )} + Created: {new Date(details.created_at).toLocaleString()} + + + + + {details.file_analysis && ( + + Detailed File Analysis + + + + Basic Analysis + File Type: {details.file_analysis.file_type} + Size: {(details.file_analysis.file_size_bytes / 1024 / 1024).toFixed(2)} MB + Readable: + {details.file_analysis.error_details && ( + + File Error: {details.file_analysis.error_details} + + )} + + + + {details.file_analysis.pdf_info ? ( + + PDF Analysis + Valid PDF: + PDF Version: {details.file_analysis.pdf_info.pdf_version || 'Unknown'} + Pages: {details.file_analysis.pdf_info.page_count || 'Unknown'} + Has Text: + Has Images: + Encrypted: + Font Count: {details.file_analysis.pdf_info.font_count} + Text Length: {details.file_analysis.pdf_info.estimated_text_length} chars + {details.file_analysis.pdf_info.text_extraction_error && ( + + PDF Text Extraction Error: {details.file_analysis.pdf_info.text_extraction_error} + + )} + + ) : details.file_analysis.text_preview ? ( + + Text Preview + + {details.file_analysis.text_preview} + + + ) : ( + + File Content + No preview available for this file type + + )} + + + + )} + + )} + + {step.step === 2 && ( // OCR Queue Enrollment + + + + + Queue Status + User OCR Enabled: + Queue Entries: {details.queue_entries_count} + + + + + {details.queue_history && details.queue_history.length > 0 && ( + + Queue History + + + + + Status + Priority + Created + Started + Completed + Attempts + Worker + + + + {details.queue_history.map((entry: any, index: number) => ( + + + + + {entry.priority} + {new Date(entry.created_at).toLocaleString()} + {entry.started_at ? new Date(entry.started_at).toLocaleString() : '-'} + {entry.completed_at ? new Date(entry.completed_at).toLocaleString() : '-'} + {entry.attempts} + {entry.worker_id || '-'} + + ))} + +
+
+
+ )} +
+ )} + + {step.step === 3 && ( // OCR Processing + + + + OCR Results + Text Length: {details.ocr_text_length} characters + Confidence: {details.ocr_confidence ? `${details.ocr_confidence.toFixed(1)}%` : 'N/A'} + Word Count: {details.ocr_word_count || 0} + Processing Time: {details.ocr_processing_time_ms ? `${details.ocr_processing_time_ms}ms` : 'N/A'} + Completed: {details.ocr_completed_at ? new Date(details.ocr_completed_at).toLocaleString() : 'Not completed'} + + + + + Processing Details + Has Processed Image: + {details.processed_image_info && ( + <> + Image Size: {details.processed_image_info.image_width}x{details.processed_image_info.image_height} + File Size: {(details.processed_image_info.file_size / 1024).toFixed(1)} KB + Processing Steps: {details.processed_image_info.processing_steps?.join(', ') || 'None'} + {details.processed_image_info.processing_parameters && ( + Processing Parameters: {JSON.stringify(details.processed_image_info.processing_parameters)} + )} + + )} + + + + )} + + {step.step === 4 && ( // Quality Validation + + + + + Quality Thresholds + Min Confidence: {details.quality_thresholds.min_confidence}% + Brightness: {details.quality_thresholds.brightness_threshold} + Contrast: {details.quality_thresholds.contrast_threshold} + Noise: {details.quality_thresholds.noise_threshold} + Sharpness: {details.quality_thresholds.sharpness_threshold} + + + + + Actual Values + Confidence: {details.actual_values.confidence ? `${details.actual_values.confidence.toFixed(1)}%` : 'N/A'} + Word Count: {details.actual_values.word_count || 0} + Processed Image Available: + {details.actual_values.processing_parameters && ( + Processing Parameters: {JSON.stringify(details.actual_values.processing_parameters)} + )} + + + + + + Quality Checks + + {Object.entries(details.quality_checks).map(([check, passed]: [string, any]) => ( + + : passed === false ? : } + /> + + ))} + + + + )} +
+ ); + }; + + const renderUploadTab = () => ( + + + + + Upload Document for Debug Analysis + + + Upload a PDF or image file to analyze the processing pipeline in real-time. + + + + + + + {selectedFile && ( + + + Selected: {selectedFile.name} ({(selectedFile.size / 1024 / 1024).toFixed(2)} MB) + + + )} + + {selectedFile && ( + + )} + + + {uploading && uploadProgress > 0 && ( + + + Upload Progress: {uploadProgress}% + + + + )} + + {processingStatus && ( + + {processingStatus} + {monitoringInterval && ( + + + + )} + + )} + + {uploadedDocumentId && ( + + + Document ID: {uploadedDocumentId} + + + + + + + + )} + + {selectedFile && selectedFile.type.startsWith('image/') && ( + + Preview + + + )} + + + + ); + + const renderSearchTab = () => ( + + + + + Debug Existing Document + + + Enter a document ID to analyze the processing pipeline for an existing document. + + + + setDocumentId(e.target.value)} + placeholder="e.g., 123e4567-e89b-12d3-a456-426614174000" + fullWidth + size="small" + /> + + + + {error && ( + + {error} + + )} + + + + ); + + return ( + + + + + Document Processing Debug + + + Upload documents or analyze existing ones to troubleshoot OCR processing issues. + + + + + + setActiveTab(newValue)}> + } + iconPosition="start" + /> + } + iconPosition="start" + /> + {debugInfo && ( + } + iconPosition="start" + /> + )} + + + + + {activeTab === 0 && renderUploadTab()} + {activeTab === 1 && renderSearchTab()} + + + + {error && ( + + Debug Error + {error} + + )} + + {debugInfo && activeTab === 2 && ( + + + + + Document: {debugInfo.filename} + + + + + Debug run at: {new Date(debugInfo.debug_timestamp).toLocaleString()} + + + + + + + + + Processing Pipeline + + + {debugInfo.pipeline_steps.map((step) => ( + + + + {step.name} + + + + + {renderStepDetails(step)} + + + ))} + + + + + {debugInfo.failed_document_info && ( + + + + Failed Document Information + + + + + Failure Details + Failure Reason: {debugInfo.failed_document_info.failure_reason} + Failure Stage: {debugInfo.failed_document_info.failure_stage} + Retry Count: {debugInfo.failed_document_info.retry_count || 0} + Created: {new Date(debugInfo.failed_document_info.created_at).toLocaleString()} + {debugInfo.failed_document_info.last_retry_at && ( + Last Retry: {new Date(debugInfo.failed_document_info.last_retry_at).toLocaleString()} + )} + + + + + Failed OCR Results + {debugInfo.failed_document_info.failed_ocr_text ? ( + <> + OCR Text Length: {debugInfo.failed_document_info.failed_ocr_text.length} chars + OCR Confidence: {debugInfo.failed_document_info.failed_ocr_confidence?.toFixed(1)}% + Word Count: {debugInfo.failed_document_info.failed_ocr_word_count || 0} + Processing Time: {debugInfo.failed_document_info.failed_ocr_processing_time_ms || 0}ms + + ) : ( + No OCR results available + )} + + + {debugInfo.failed_document_info.error_message && ( + + + Error Message: {debugInfo.failed_document_info.error_message} + + + )} + {debugInfo.failed_document_info.content_preview && ( + + + Content Preview + + {debugInfo.failed_document_info.content_preview} + + + + )} + + + + )} + + {debugInfo.detailed_processing_logs && debugInfo.detailed_processing_logs.length > 0 && ( + + + + Detailed Processing Logs + + + Complete history of all OCR processing attempts for this document. + + + + + + Attempt + Status + Priority + Created + Started + Completed + Duration + Wait Time + Attempts + Worker + Error + + + + {debugInfo.detailed_processing_logs.map((log: any, index: number) => ( + + {index + 1} + + + + {log.priority} + {new Date(log.created_at).toLocaleString()} + {log.started_at ? new Date(log.started_at).toLocaleString() : '-'} + {log.completed_at ? new Date(log.completed_at).toLocaleString() : '-'} + {log.processing_duration_ms ? `${log.processing_duration_ms}ms` : '-'} + {log.queue_wait_time_ms ? `${log.queue_wait_time_ms}ms` : '-'} + {log.attempts || 0} + {log.worker_id || '-'} + + {log.error_message ? ( + + {log.error_message} + + ) : '-'} + + + ))} + +
+
+
+
+ )} + + {debugInfo.file_analysis && ( + + + + File Analysis Summary + + + + + File Properties + File Type: {debugInfo.file_analysis.file_type} + Size: {(debugInfo.file_analysis.file_size_bytes / 1024 / 1024).toFixed(2)} MB + Readable: + + + + {debugInfo.file_analysis.pdf_info && ( + + PDF Properties + Valid PDF: + Has Text Content: + Text Length: {debugInfo.file_analysis.pdf_info.estimated_text_length} chars + Page Count: {debugInfo.file_analysis.pdf_info.page_count || 'Unknown'} + Encrypted: + + )} + + {debugInfo.file_analysis.pdf_info?.text_extraction_error && ( + + + PDF Text Extraction Issue: {debugInfo.file_analysis.pdf_info.text_extraction_error} + + + )} + + + + )} + + {debugInfo.pipeline_steps.some(step => step.step === 3 && step.details.has_processed_image) && ( + + + + Processed Images + + + + + Original Document + + + + + + Processed Image (OCR Input) + { + (e.target as HTMLImageElement).style.display = 'none'; + (e.target as HTMLImageElement).parentNode?.appendChild( + document.createTextNode('Processed image not available') + ); + }} + sx={{ + maxWidth: '100%', + maxHeight: '300px', + objectFit: 'contain', + border: '1px solid', + borderColor: 'divider', + borderRadius: 1 + }} + /> + + + + + + )} + + + + + }> + User Settings + + + + + + OCR Settings + Background OCR: {debugInfo.user_settings.enable_background_ocr ? 'Enabled' : 'Disabled'} + Min Confidence: {debugInfo.user_settings.ocr_min_confidence}% + Max File Size: {debugInfo.user_settings.max_file_size_mb} MB + + + + + Quality Thresholds + Brightness: {debugInfo.user_settings.quality_thresholds.brightness} + Contrast: {debugInfo.user_settings.quality_thresholds.contrast} + Noise: {debugInfo.user_settings.quality_thresholds.noise} + Sharpness: {debugInfo.user_settings.quality_thresholds.sharpness} + + + + + + + +
+ )} +
+ ); +}; + +export default DebugPage; \ No newline at end of file diff --git a/src/ocr/enhanced.rs b/src/ocr/enhanced.rs index 2c1e3f7..f333a66 100644 --- a/src/ocr/enhanced.rs +++ b/src/ocr/enhanced.rs @@ -791,7 +791,7 @@ impl EnhancedOcrService { /// Extract text from PDF with size and time limits #[cfg(feature = "ocr")] - pub async fn extract_text_from_pdf(&self, file_path: &str, _settings: &Settings) -> Result { + pub async fn extract_text_from_pdf(&self, file_path: &str, settings: &Settings) -> Result { let start_time = std::time::Instant::now(); info!("Extracting text from PDF: {}", file_path); @@ -888,16 +888,190 @@ impl EnhancedOcrService { trimmed_text.chars().take(200).collect::() ); + // Smart detection: assess if text extraction quality is good enough + if self.is_text_extraction_quality_sufficient(&trimmed_text, word_count, file_size) { + info!("PDF text extraction successful for '{}', using extracted text", file_path); + Ok(OcrResult { + text: trimmed_text, + confidence: 95.0, // PDF text extraction is generally high confidence + processing_time_ms: processing_time, + word_count, + preprocessing_applied: vec!["PDF text extraction".to_string()], + processed_image_path: None, + }) + } else { + info!("PDF text extraction insufficient for '{}' ({} words), falling back to OCR", file_path, word_count); + // Fall back to OCR using ocrmypdf + self.extract_text_from_pdf_with_ocr(file_path, settings, start_time).await + } + } + + /// Assess if text extraction quality is sufficient or if OCR fallback is needed + #[cfg(feature = "ocr")] + fn is_text_extraction_quality_sufficient(&self, text: &str, word_count: usize, file_size: u64) -> bool { + // If we got no words at all, definitely need OCR + if word_count == 0 { + return false; + } + + // For very small files, low word count might be normal + if file_size < 50_000 && word_count >= 1 { + return true; + } + + // Calculate word density (words per KB) + let file_size_kb = (file_size as f64) / 1024.0; + let word_density = (word_count as f64) / file_size_kb; + + // Reasonable thresholds based on typical PDF content: + // - Text-based PDFs typically have 50-200 words per KB + // - Below 5 words per KB suggests mostly images/scanned content + const MIN_WORD_DENSITY: f64 = 5.0; + const MIN_WORDS_FOR_LARGE_FILES: usize = 10; + + if word_density < MIN_WORD_DENSITY && word_count < MIN_WORDS_FOR_LARGE_FILES { + debug!("PDF appears to be image-based: {} words in {:.1} KB (density: {:.2} words/KB)", + word_count, file_size_kb, word_density); + return false; + } + + // Additional check: if text is mostly non-alphanumeric, might be extraction artifacts + let alphanumeric_chars = text.chars().filter(|c| c.is_alphanumeric()).count(); + let alphanumeric_ratio = if text.len() > 0 { + (alphanumeric_chars as f64) / (text.len() as f64) + } else { + 0.0 + }; + + // If less than 30% alphanumeric content, likely poor extraction + if alphanumeric_ratio < 0.3 { + debug!("PDF text has low alphanumeric content: {:.1}% ({} of {} chars)", + alphanumeric_ratio * 100.0, alphanumeric_chars, text.len()); + return false; + } + + debug!("PDF text extraction quality sufficient: {} words, {:.2} words/KB, {:.1}% alphanumeric", + word_count, word_density, alphanumeric_ratio * 100.0); + true + } + + /// Extract text from PDF using OCR (ocrmypdf) for image-based or poor-quality PDFs + #[cfg(feature = "ocr")] + async fn extract_text_from_pdf_with_ocr(&self, file_path: &str, settings: &Settings, start_time: std::time::Instant) -> Result { + info!("Starting OCR extraction for PDF: {}", file_path); + + // Check if ocrmypdf is available + if !self.is_ocrmypdf_available().await { + return Err(anyhow!( + "ocrmypdf is not available on this system. To extract text from image-based PDFs like '{}', please install ocrmypdf. \ + On Ubuntu/Debian: 'apt-get install ocrmypdf'. \ + On macOS: 'brew install ocrmypdf'. \ + Alternatively, convert the PDF to images and upload those instead.", + file_path + )); + } + + // Generate temporary file path for OCR'd PDF + let temp_ocr_filename = format!("ocr_{}_{}.pdf", + std::process::id(), + std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)?.as_millis() + ); + let temp_ocr_path = format!("{}/{}", self.temp_dir, temp_ocr_filename); + + // Run ocrmypdf to create searchable PDF + let ocrmypdf_result = tokio::time::timeout( + std::time::Duration::from_secs(300), // 5 minute timeout for OCR + tokio::task::spawn_blocking({ + let file_path = file_path.to_string(); + let temp_ocr_path = temp_ocr_path.clone(); + move || { + std::process::Command::new("ocrmypdf") + .arg("--force-ocr") // OCR even if text is detected + .arg("-O2") // Optimize level 2 (balanced quality/speed) + .arg("--deskew") // Correct skewed pages + .arg("--clean") // Clean up artifacts + .arg("--language") + .arg("eng") // English language + .arg(&file_path) + .arg(&temp_ocr_path) + .output() + } + }) + ).await; + + let ocrmypdf_output = match ocrmypdf_result { + Ok(Ok(output)) => output?, + Ok(Err(e)) => return Err(anyhow!("Failed to join ocrmypdf task: {}", e)), + Err(_) => return Err(anyhow!("ocrmypdf timed out after 5 minutes for file '{}'", file_path)), + }; + + if !ocrmypdf_output.status.success() { + let stderr = String::from_utf8_lossy(&ocrmypdf_output.stderr); + let stdout = String::from_utf8_lossy(&ocrmypdf_output.stdout); + return Err(anyhow!( + "ocrmypdf failed for '{}': Exit code {}\nStderr: {}\nStdout: {}", + file_path, ocrmypdf_output.status.code().unwrap_or(-1), stderr, stdout + )); + } + + // Extract text from the OCR'd PDF + let ocr_text_result = tokio::task::spawn_blocking({ + let temp_ocr_path = temp_ocr_path.clone(); + move || -> Result { + let bytes = std::fs::read(&temp_ocr_path)?; + let text = pdf_extract::extract_text_from_mem(&bytes)?; + Ok(text.trim().to_string()) + } + }).await??; + + // Clean up temporary file + let _ = tokio::fs::remove_file(&temp_ocr_path).await; + + let processing_time = start_time.elapsed().as_millis() as u64; + let word_count = self.count_words_safely(&ocr_text_result); + + info!("OCR extraction completed for '{}': {} words in {}ms", + file_path, word_count, processing_time); + Ok(OcrResult { - text: trimmed_text, - confidence: 95.0, // PDF text extraction is generally high confidence + text: ocr_text_result, + confidence: 85.0, // OCR is generally lower confidence than direct text extraction processing_time_ms: processing_time, word_count, - preprocessing_applied: vec!["PDF text extraction".to_string()], - processed_image_path: None, // No image processing for PDF text extraction + preprocessing_applied: vec!["OCR via ocrmypdf".to_string()], + processed_image_path: None, }) } + /// Check if ocrmypdf is available on the system + #[cfg(feature = "ocr")] + async fn is_ocrmypdf_available(&self) -> bool { + match tokio::process::Command::new("ocrmypdf") + .arg("--version") + .output() + .await + { + Ok(output) => output.status.success(), + Err(_) => false, + } + } + + #[cfg(not(feature = "ocr"))] + fn is_text_extraction_quality_sufficient(&self, _text: &str, _word_count: usize, _file_size: u64) -> bool { + // When OCR is disabled, always accept text extraction results + true + } + + #[cfg(not(feature = "ocr"))] + async fn is_ocrmypdf_available(&self) -> bool { + false // OCR feature not enabled + } + + #[cfg(not(feature = "ocr"))] + async fn extract_text_from_pdf_with_ocr(&self, file_path: &str, _settings: &Settings, _start_time: std::time::Instant) -> Result { + Err(anyhow::anyhow!("OCR feature not enabled - cannot process image-based PDF: {}", file_path)) + } + /// Resolve file path to actual location, handling both old and new directory structures async fn resolve_file_path(&self, file_path: &str) -> Result { // Use the FileService's resolve_file_path method @@ -988,7 +1162,7 @@ impl EnhancedOcrService { /// Safely count words to prevent overflow on very large texts #[cfg(feature = "ocr")] - fn count_words_safely(&self, text: &str) -> usize { + pub fn count_words_safely(&self, text: &str) -> usize { // For very large texts, sample to estimate word count to prevent overflow if text.len() > 1_000_000 { // > 1MB of text // Sample first 100KB and extrapolate @@ -1008,31 +1182,51 @@ impl EnhancedOcrService { fn count_words_in_text(&self, text: &str) -> usize { let whitespace_words = text.split_whitespace().count(); - // If no whitespace-separated words found but text exists, try alternative word detection - if whitespace_words == 0 && !text.trim().is_empty() { - // For PDFs that extract as continuous text, estimate words based on character patterns - // Look for transitions from letters to non-letters as potential word boundaries - let mut word_count = 0; - let mut in_word = false; + // If we have exactly 1 "word" but it's very long (likely continuous text), try enhanced detection + // OR if we have no whitespace words but text exists + let is_continuous_text = whitespace_words == 1 && text.len() > 15; // 15+ chars suggests it might be continuous + let is_no_words = whitespace_words == 0 && !text.trim().is_empty(); + + if is_continuous_text || is_no_words { + // Count total alphanumeric characters first + let alphanumeric_chars = text.chars().filter(|c| c.is_alphanumeric()).count(); - for c in text.chars() { - if c.is_alphabetic() { - if !in_word { - word_count += 1; - in_word = true; - } - } else { - in_word = false; + // If no alphanumeric content, it's pure punctuation/symbols + if alphanumeric_chars == 0 { + return 0; + } + + // For continuous text, look for word boundaries using multiple strategies + let mut word_count = 0; + + // Strategy 1: Count transitions from lowercase to uppercase (camelCase detection) + let chars: Vec = text.chars().collect(); + let mut camel_transitions = 0; + + for i in 1..chars.len() { + let prev_char = chars[i-1]; + let curr_char = chars[i]; + + // Count transitions from lowercase letter to uppercase letter + if prev_char.is_lowercase() && curr_char.is_uppercase() { + camel_transitions += 1; + } + // Count transitions from letter to digit or digit to letter + else if (prev_char.is_alphabetic() && curr_char.is_numeric()) || + (prev_char.is_numeric() && curr_char.is_alphabetic()) { + camel_transitions += 1; } } - // If still no words found but we have alphanumeric content, - // estimate based on reasonable word length (assume ~5 chars per word) + // If we found camelCase transitions, estimate words + if camel_transitions > 0 { + word_count = camel_transitions + 1; // +1 for the first word + } + + // Strategy 2: If no camelCase detected, estimate based on character count if word_count == 0 { - let alphanumeric_chars = text.chars().filter(|c| c.is_alphanumeric()).count(); - if alphanumeric_chars > 0 { - word_count = (alphanumeric_chars / 5).max(1); - } + // Estimate based on typical word length (4-6 characters per word) + word_count = (alphanumeric_chars / 5).max(1); } word_count diff --git a/src/routes/documents.rs b/src/routes/documents.rs index f8c5a43..cec3549 100644 --- a/src/routes/documents.rs +++ b/src/routes/documents.rs @@ -58,6 +58,7 @@ pub fn router() -> Router> { .route("/{id}/ocr", get(get_document_ocr)) .route("/{id}/processed-image", get(get_processed_image)) .route("/{id}/retry-ocr", post(retry_ocr)) + .route("/{id}/debug", get(get_document_debug_info)) .route("/duplicates", get(get_user_duplicates)) .route("/failed", get(get_failed_documents)) .route("/failed/{id}/view", get(view_failed_document)) @@ -645,6 +646,560 @@ async fn retry_ocr( } } +#[utoipa::path( + get, + path = "/api/documents/{id}/debug", + tag = "documents", + security( + ("bearer_auth" = []) + ), + params( + ("id" = uuid::Uuid, Path, description = "Document ID") + ), + responses( + (status = 200, description = "Debug information for document processing pipeline", body = String), + (status = 404, description = "Document not found"), + (status = 401, description = "Unauthorized") + ) +)] +async fn get_document_debug_info( + State(state): State>, + auth_user: AuthUser, + Path(document_id): Path, +) -> Result, StatusCode> { + tracing::info!("Starting debug analysis for document {} by user {}", document_id, auth_user.user.id); + + // Get the document + let document = match state + .db + .get_document_by_id(document_id, auth_user.user.id, auth_user.user.role) + .await + { + Ok(Some(doc)) => { + tracing::info!("Found document: {} ({})", doc.filename, doc.mime_type); + doc + } + Ok(None) => { + tracing::warn!("Document {} not found for user {}", document_id, auth_user.user.id); + return Err(StatusCode::NOT_FOUND); + } + Err(e) => { + tracing::error!("Database error fetching document {}: {}", document_id, e); + return Err(StatusCode::INTERNAL_SERVER_ERROR); + } + }; + + // Get user settings + tracing::info!("Fetching user settings for user {}", auth_user.user.id); + let settings = match state + .db + .get_user_settings(auth_user.user.id) + .await + { + Ok(Some(s)) => { + tracing::info!("Found user settings: OCR enabled={}, min_confidence={}", s.enable_background_ocr, s.ocr_min_confidence); + s + } + Ok(None) => { + tracing::info!("No user settings found, using defaults"); + crate::models::Settings::default() + } + Err(e) => { + tracing::error!("Error fetching user settings: {}", e); + return Err(StatusCode::INTERNAL_SERVER_ERROR); + } + }; + + // Get OCR queue history for this document + tracing::info!("Fetching OCR queue history for document {}", document_id); + let queue_history = match sqlx::query( + r#" + SELECT id, status, priority, created_at, started_at, completed_at, + error_message, attempts, worker_id + FROM ocr_queue + WHERE document_id = $1 + ORDER BY created_at DESC + LIMIT 10 + "# + ) + .bind(document_id) + .fetch_all(state.db.get_pool()) + .await { + Ok(history) => { + tracing::info!("Queue history query successful, found {} entries", history.len()); + history + }, + Err(e) => { + tracing::error!("Queue history query error: {}", e); + return Err(StatusCode::INTERNAL_SERVER_ERROR); + } + }; + + // Get processed image info if it exists + tracing::info!("Fetching processed image for document {}", document_id); + let processed_image = match state + .db + .get_processed_image_by_document_id(document_id, auth_user.user.id) + .await { + Ok(Some(img)) => { + tracing::info!("Found processed image for document {}", document_id); + Some(img) + }, + Ok(None) => { + tracing::info!("No processed image found for document {}", document_id); + None + }, + Err(e) => { + tracing::warn!("Error fetching processed image for document {}: {}", document_id, e); + None + } + }; + + // Get failed document record if it exists + tracing::info!("Fetching failed document record for document {}", document_id); + let failed_document = match sqlx::query( + r#" + SELECT failure_reason, failure_stage, error_message, retry_count, + last_retry_at, created_at, content, ocr_text, ocr_confidence, + ocr_word_count, ocr_processing_time_ms + FROM failed_documents + WHERE id = $1 OR existing_document_id = $1 + ORDER BY created_at DESC + LIMIT 1 + "# + ) + .bind(document_id) + .fetch_optional(state.db.get_pool()) + .await { + Ok(result) => { + tracing::info!("Failed document query successful, found: {}", result.is_some()); + result + }, + Err(e) => { + tracing::error!("Failed document query error: {}", e); + return Err(StatusCode::INTERNAL_SERVER_ERROR); + } + }; + + // Get detailed OCR processing logs and attempts + tracing::info!("Fetching detailed OCR processing logs for document {}", document_id); + let ocr_processing_logs = match sqlx::query( + r#" + SELECT id, status, priority, created_at, started_at, completed_at, + error_message, attempts, worker_id, processing_time_ms, file_size + FROM ocr_queue + WHERE document_id = $1 + ORDER BY created_at ASC + "# + ) + .bind(document_id) + .fetch_all(state.db.get_pool()) + .await { + Ok(logs) => { + tracing::info!("OCR processing logs query successful, found {} entries", logs.len()); + logs + }, + Err(e) => { + tracing::error!("OCR processing logs query error: {}", e); + return Err(StatusCode::INTERNAL_SERVER_ERROR); + } + }; + + // File service for file info + let file_service = FileService::new(state.config.upload_path.clone()); + + // Check if file exists + let file_exists = tokio::fs::metadata(&document.file_path).await.is_ok(); + let file_metadata = if file_exists { + tokio::fs::metadata(&document.file_path).await.ok() + } else { + None + }; + + // Try to analyze file content for additional diagnostic info + tracing::info!("Analyzing file content for document {} (exists: {})", document_id, file_exists); + let file_analysis = if file_exists { + match analyze_file_content(&document.file_path, &document.mime_type).await { + Ok(analysis) => { + tracing::info!("File analysis successful for document {}", document_id); + analysis + }, + Err(e) => { + tracing::warn!("Failed to analyze file content for {}: {}", document_id, e); + FileAnalysis { + error_details: Some(format!("File analysis failed: {}", e)), + ..Default::default() + } + } + } + } else { + tracing::warn!("File does not exist for document {}, skipping analysis", document_id); + FileAnalysis::default() + }; + + // Pipeline steps analysis + let mut pipeline_steps = Vec::new(); + + // Step 1: File Upload & Ingestion + pipeline_steps.push(serde_json::json!({ + "step": 1, + "name": "File Upload & Ingestion", + "status": "completed", // Document exists if we got this far + "details": { + "filename": document.filename, + "original_filename": document.original_filename, + "file_size": document.file_size, + "mime_type": document.mime_type, + "file_exists": file_exists, + "file_path": document.file_path, + "created_at": document.created_at, + "file_metadata": file_metadata.as_ref().map(|m| serde_json::json!({ + "size": m.len(), + "modified": m.modified().ok(), + "is_file": m.is_file(), + "is_dir": m.is_dir() + })), + "file_analysis": file_analysis + }, + "success": true, + "error": None:: + })); + + // Step 2: OCR Queue Enrollment + let queue_enrollment_status = if queue_history.is_empty() { + if settings.enable_background_ocr { + "not_queued" + } else { + "ocr_disabled" + } + } else { + "queued" + }; + + pipeline_steps.push(serde_json::json!({ + "step": 2, + "name": "OCR Queue Enrollment", + "status": queue_enrollment_status, + "details": { + "user_ocr_enabled": settings.enable_background_ocr, + "queue_entries_count": queue_history.len(), + "queue_history": queue_history.iter().map(|row| serde_json::json!({ + "id": row.get::("id"), + "status": row.get::("status"), + "priority": row.get::("priority"), + "created_at": row.get::, _>("created_at"), + "started_at": row.get::>, _>("started_at"), + "completed_at": row.get::>, _>("completed_at"), + "error_message": row.get::, _>("error_message"), + "attempts": row.get::("attempts"), + "worker_id": row.get::, _>("worker_id") + })).collect::>() + }, + "success": !queue_history.is_empty() || !settings.enable_background_ocr, + "error": if !settings.enable_background_ocr && queue_history.is_empty() { + Some("OCR processing is disabled in user settings") + } else { None } + })); + + // Step 3: OCR Processing + let ocr_status = document.ocr_status.as_deref().unwrap_or("not_started"); + let ocr_success = matches!(ocr_status, "completed"); + + pipeline_steps.push(serde_json::json!({ + "step": 3, + "name": "OCR Text Extraction", + "status": ocr_status, + "details": { + "ocr_text_length": document.ocr_text.as_ref().map(|t| t.len()).unwrap_or(0), + "ocr_confidence": document.ocr_confidence, + "ocr_word_count": document.ocr_word_count, + "ocr_processing_time_ms": document.ocr_processing_time_ms, + "ocr_completed_at": document.ocr_completed_at, + "ocr_error": document.ocr_error, + "has_processed_image": processed_image.is_some(), + "processed_image_info": processed_image.as_ref().map(|pi| serde_json::json!({ + "image_path": pi.processed_image_path, + "image_width": pi.image_width, + "image_height": pi.image_height, + "file_size": pi.file_size, + "processing_parameters": pi.processing_parameters, + "processing_steps": pi.processing_steps, + "created_at": pi.created_at + })) + }, + "success": ocr_success, + "error": document.ocr_error.clone() + })); + + // Step 4: Quality Validation + let quality_passed = if let Some(confidence) = document.ocr_confidence { + confidence >= settings.ocr_min_confidence && document.ocr_word_count.unwrap_or(0) > 0 + } else { + false + }; + + pipeline_steps.push(serde_json::json!({ + "step": 4, + "name": "OCR Quality Validation", + "status": if ocr_success { + if quality_passed { "passed" } else { "failed" } + } else { + "not_reached" + }, + "details": { + "quality_thresholds": { + "min_confidence": settings.ocr_min_confidence, + "brightness_threshold": settings.ocr_quality_threshold_brightness, + "contrast_threshold": settings.ocr_quality_threshold_contrast, + "noise_threshold": settings.ocr_quality_threshold_noise, + "sharpness_threshold": settings.ocr_quality_threshold_sharpness + }, + "actual_values": { + "confidence": document.ocr_confidence, + "word_count": document.ocr_word_count, + "processed_image_available": processed_image.is_some(), + "processing_parameters": processed_image.as_ref().map(|pi| &pi.processing_parameters) + }, + "quality_checks": { + "confidence_check": document.ocr_confidence.map(|c| c >= settings.ocr_min_confidence), + "word_count_check": document.ocr_word_count.map(|w| w > 0), + "processed_image_available": processed_image.is_some() + } + }, + "success": quality_passed, + "error": if !quality_passed && ocr_success { + Some(format!("Quality validation failed: confidence {:.1}% (required: {:.1}%), words: {}", + document.ocr_confidence.unwrap_or(0.0), + settings.ocr_min_confidence, + document.ocr_word_count.unwrap_or(0) + )) + } else { None } + })); + + // Overall summary + let overall_status = if quality_passed { + "success" + } else if matches!(ocr_status, "failed") { + "failed" + } else if matches!(ocr_status, "processing") { + "processing" + } else if matches!(ocr_status, "pending") { + "pending" + } else { + "not_started" + }; + + Ok(Json(serde_json::json!({ + "document_id": document_id, + "filename": document.filename, + "overall_status": overall_status, + "pipeline_steps": pipeline_steps, + "failed_document_info": failed_document.as_ref().map(|row| serde_json::json!({ + "failure_reason": row.get::("failure_reason"), + "failure_stage": row.get::("failure_stage"), + "error_message": row.get::, _>("error_message"), + "retry_count": row.get::, _>("retry_count"), + "last_retry_at": row.get::>, _>("last_retry_at"), + "created_at": row.get::, _>("created_at"), + "content_preview": row.get::, _>("content").map(|c| + c.chars().take(200).collect::() + ), + "failed_ocr_text": row.get::, _>("ocr_text"), + "failed_ocr_confidence": row.get::, _>("ocr_confidence"), + "failed_ocr_word_count": row.get::, _>("ocr_word_count"), + "failed_ocr_processing_time_ms": row.get::, _>("ocr_processing_time_ms") + })), + "user_settings": { + "enable_background_ocr": settings.enable_background_ocr, + "ocr_min_confidence": settings.ocr_min_confidence, + "max_file_size_mb": settings.max_file_size_mb, + "quality_thresholds": { + "brightness": settings.ocr_quality_threshold_brightness, + "contrast": settings.ocr_quality_threshold_contrast, + "noise": settings.ocr_quality_threshold_noise, + "sharpness": settings.ocr_quality_threshold_sharpness + } + }, + "debug_timestamp": chrono::Utc::now(), + "file_analysis": file_analysis, + "detailed_processing_logs": ocr_processing_logs.iter().map(|row| serde_json::json!({ + "id": row.get::("id"), + "status": row.get::("status"), + "priority": row.get::("priority"), + "created_at": row.get::, _>("created_at"), + "started_at": row.get::>, _>("started_at"), + "completed_at": row.get::>, _>("completed_at"), + "error_message": row.get::, _>("error_message"), + "attempts": row.get::("attempts"), + "worker_id": row.get::, _>("worker_id"), + "processing_time_ms": row.get::, _>("processing_time_ms"), + "file_size": row.get::, _>("file_size"), + // Calculate processing duration if both timestamps are available + "processing_duration_ms": if let (Some(started), Some(completed)) = ( + row.get::>, _>("started_at"), + row.get::>, _>("completed_at") + ) { + Some((completed.timestamp_millis() - started.timestamp_millis()) as i32) + } else { + row.get::, _>("processing_time_ms") + }, + // Calculate queue wait time + "queue_wait_time_ms": if let Some(started) = row.get::>, _>("started_at") { + let created = row.get::, _>("created_at"); + Some((started.timestamp_millis() - created.timestamp_millis()) as i32) + } else { + None:: + } + })).collect::>() + }))) +} + +#[derive(Debug, Default, serde::Serialize)] +struct FileAnalysis { + file_type: String, + file_size_bytes: u64, + is_readable: bool, + pdf_info: Option, + text_preview: Option, + error_details: Option, +} + +#[derive(Debug, serde::Serialize)] +struct PdfAnalysis { + is_valid_pdf: bool, + page_count: Option, + has_text_content: bool, + has_images: bool, + is_encrypted: bool, + pdf_version: Option, + font_count: usize, + text_extraction_error: Option, + estimated_text_length: usize, +} + +async fn analyze_file_content(file_path: &str, mime_type: &str) -> Result> { + let mut analysis = FileAnalysis { + file_type: mime_type.to_string(), + ..Default::default() + }; + + // Try to read file size + if let Ok(metadata) = tokio::fs::metadata(file_path).await { + analysis.file_size_bytes = metadata.len(); + } + + // Try to read the file + let file_content = match tokio::fs::read(file_path).await { + Ok(content) => { + analysis.is_readable = true; + content + } + Err(e) => { + analysis.error_details = Some(format!("Failed to read file: {}", e)); + return Ok(analysis); + } + }; + + // Analyze based on file type + if mime_type.contains("pdf") { + analysis.pdf_info = Some(analyze_pdf_content(&file_content).await); + } else if mime_type.starts_with("text/") { + // For text files, show a preview + match String::from_utf8(file_content.clone()) { + Ok(text) => { + analysis.text_preview = Some(text.chars().take(500).collect()); + } + Err(e) => { + analysis.error_details = Some(format!("Failed to decode text file: {}", e)); + } + } + } + + Ok(analysis) +} + +async fn analyze_pdf_content(content: &[u8]) -> PdfAnalysis { + use std::panic; + + let mut analysis = PdfAnalysis { + is_valid_pdf: false, + page_count: None, + has_text_content: false, + has_images: false, + is_encrypted: false, + pdf_version: None, + font_count: 0, + text_extraction_error: None, + estimated_text_length: 0, + }; + + // Check PDF header + if content.len() < 8 { + analysis.text_extraction_error = Some("File too small to be a valid PDF".to_string()); + return analysis; + } + + if !content.starts_with(b"%PDF-") { + analysis.text_extraction_error = Some("File does not start with PDF header".to_string()); + return analysis; + } + + analysis.is_valid_pdf = true; + + // Extract PDF version from header + if content.len() >= 8 { + if let Ok(header) = std::str::from_utf8(&content[0..8]) { + if let Some(version) = header.strip_prefix("%PDF-") { + analysis.pdf_version = Some(version.to_string()); + } + } + } + + // Try to extract text using pdf_extract (same as the main OCR pipeline) + let text_result = panic::catch_unwind(|| { + pdf_extract::extract_text_from_mem(content) + }); + + match text_result { + Ok(Ok(text)) => { + analysis.has_text_content = !text.trim().is_empty(); + analysis.estimated_text_length = text.len(); + + // Count words for comparison with OCR results + let word_count = text.split_whitespace().count(); + if word_count == 0 && text.len() > 0 { + analysis.text_extraction_error = Some("PDF contains characters but no extractable words".to_string()); + } + } + Ok(Err(e)) => { + analysis.text_extraction_error = Some(format!("PDF text extraction failed: {}", e)); + } + Err(_) => { + analysis.text_extraction_error = Some("PDF text extraction panicked (likely corrupted PDF)".to_string()); + } + } + + // Basic PDF structure analysis + let content_str = String::from_utf8_lossy(content); + + // Check for encryption + analysis.is_encrypted = content_str.contains("/Encrypt"); + + // Check for images + analysis.has_images = content_str.contains("/Image") || content_str.contains("/XObject"); + + // Estimate page count (rough) + let page_matches = content_str.matches("/Type /Page").count(); + if page_matches > 0 { + analysis.page_count = Some(page_matches as i32); + } + + // Count fonts (rough) + analysis.font_count = content_str.matches("/Type /Font").count(); + + analysis +} + #[utoipa::path( get, path = "/api/documents/failed-ocr", diff --git a/src/tests/enhanced_ocr_tests.rs b/src/tests/enhanced_ocr_tests.rs index 00ae2ee..28e7ded 100644 --- a/src/tests/enhanced_ocr_tests.rs +++ b/src/tests/enhanced_ocr_tests.rs @@ -38,6 +38,108 @@ mod tests { assert_eq!(stats.sharpness, 0.8); } + #[test] + fn test_count_words_safely_whitespace_separated() { + let temp_dir = create_temp_dir(); + let temp_path = temp_dir.path().to_str().unwrap().to_string(); + let service = EnhancedOcrService::new(temp_path); + + // Test normal whitespace-separated text + let text = "Hello world this is a test"; + let count = service.count_words_safely(&text); + assert_eq!(count, 6); + + // Test with extra whitespace + let text = " Hello world \n test "; + let count = service.count_words_safely(&text); + assert_eq!(count, 3); + } + + #[test] + fn test_count_words_safely_continuous_text() { + let temp_dir = create_temp_dir(); + let temp_path = temp_dir.path().to_str().unwrap().to_string(); + let service = EnhancedOcrService::new(temp_path); + + // Test continuous text without spaces (like some PDF extractions) + let text = "HelloWorldThisIsAContinuousText"; + let count = service.count_words_safely(&text); + assert!(count > 0, "Should detect words even without whitespace"); + + // Test mixed alphanumeric without spaces + let text = "ABC123DEF456GHI789"; + let count = service.count_words_safely(&text); + assert!(count > 0, "Should detect alphanumeric patterns as words"); + } + + #[test] + fn test_count_words_safely_edge_cases() { + let temp_dir = create_temp_dir(); + let temp_path = temp_dir.path().to_str().unwrap().to_string(); + let service = EnhancedOcrService::new(temp_path); + + // Test empty text + let count = service.count_words_safely(""); + assert_eq!(count, 0); + + // Test only whitespace + let count = service.count_words_safely(" \n\t "); + assert_eq!(count, 0); + + // Test only punctuation + let text = "!@#$%^&*()_+-=[]{}|;':\",./<>?"; + let count = service.count_words_safely(&text); + // Since there are no alphabetic or alphanumeric chars, should be 0 + assert_eq!(count, 0, "Pure punctuation should not count as words, got {}", count); + + // Test single character + let count = service.count_words_safely("A"); + assert_eq!(count, 1); + + // Test mixed content with low alphanumeric ratio + let text = "A!!!B@@@C###D$$$E%%%"; + let count = service.count_words_safely(&text); + assert!(count > 0, "Should detect words in mixed content"); + } + + #[test] + fn test_count_words_safely_large_text() { + let temp_dir = create_temp_dir(); + let temp_path = temp_dir.path().to_str().unwrap().to_string(); + let service = EnhancedOcrService::new(temp_path); + + // Test with large text (over 1MB) to trigger sampling + let word = "test "; + let large_text = word.repeat(250_000); // Creates ~1.25MB of text + let count = service.count_words_safely(&large_text); + + // Should estimate around 250,000 words (may vary due to sampling) + assert!(count > 200_000, "Should estimate large word count: got {}", count); + assert!(count <= 10_000_000, "Should cap at max limit: got {}", count); + } + + #[test] + fn test_count_words_safely_fallback_patterns() { + let temp_dir = create_temp_dir(); + let temp_path = temp_dir.path().to_str().unwrap().to_string(); + let service = EnhancedOcrService::new(temp_path); + + // Test letter transition detection + let text = "OneWordAnotherWordFinalWord"; + let count = service.count_words_safely(&text); + assert!(count >= 3, "Should detect at least 3 words from transitions: got {}", count); + + // Test alphanumeric estimation fallback + let text = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; // 26 chars, should estimate ~5-6 words + let count = service.count_words_safely(&text); + assert!(count >= 1 && count <= 10, "Should estimate reasonable word count: got {}", count); + + // Test mixed case with numbers + let text = "ABC123def456GHI789jkl"; + let count = service.count_words_safely(&text); + assert!(count >= 1, "Should detect words in mixed alphanumeric: got {}", count); + } + #[test] fn test_ocr_result_structure() { let result = OcrResult { diff --git a/tests/integration_pdf_word_count_tests.rs b/tests/integration_pdf_word_count_tests.rs new file mode 100644 index 0000000..88ed57e --- /dev/null +++ b/tests/integration_pdf_word_count_tests.rs @@ -0,0 +1,293 @@ +#[cfg(test)] +mod pdf_word_count_integration_tests { + use readur::ocr::enhanced::EnhancedOcrService; + use readur::models::Settings; + use std::fs::File; + use std::io::Write; + use tempfile::{NamedTempFile, TempDir}; + + fn create_test_settings() -> Settings { + Settings::default() + } + + fn create_temp_dir() -> TempDir { + TempDir::new().expect("Failed to create temp directory") + } + + /// Create a mock PDF with specific text patterns for testing + fn create_mock_pdf_file(content: &str) -> NamedTempFile { + let mut temp_file = NamedTempFile::new().expect("Failed to create temp file"); + + // Create a minimal PDF structure that pdf-extract can read + // This is a very basic PDF that contains the specified text + let pdf_content = format!( + "%PDF-1.4\n\ + 1 0 obj\n\ + <<\n\ + /Type /Catalog\n\ + /Pages 2 0 R\n\ + >>\n\ + endobj\n\ + 2 0 obj\n\ + <<\n\ + /Type /Pages\n\ + /Kids [3 0 R]\n\ + /Count 1\n\ + >>\n\ + endobj\n\ + 3 0 obj\n\ + <<\n\ + /Type /Page\n\ + /Parent 2 0 R\n\ + /Contents 4 0 R\n\ + >>\n\ + endobj\n\ + 4 0 obj\n\ + <<\n\ + /Length {}\n\ + >>\n\ + stream\n\ + BT\n\ + /F1 12 Tf\n\ + 72 720 Td\n\ + ({}) Tj\n\ + ET\n\ + endstream\n\ + endobj\n\ + xref\n\ + 0 5\n\ + 0000000000 65535 f \n\ + 0000000009 00000 n \n\ + 0000000074 00000 n \n\ + 0000000120 00000 n \n\ + 0000000179 00000 n \n\ + trailer\n\ + <<\n\ + /Size 5\n\ + /Root 1 0 R\n\ + >>\n\ + startxref\n\ + {}\n\ + %%EOF", + content.len() + 42, // Approximate content length + content, + 300 // Approximate xref position + ); + + temp_file.write_all(pdf_content.as_bytes()).expect("Failed to write PDF content"); + temp_file.flush().expect("Failed to flush temp file"); + temp_file + } + + #[tokio::test] + async fn test_pdf_extraction_with_normal_text() { + let temp_dir = create_temp_dir(); + let temp_path = temp_dir.path().to_str().unwrap().to_string(); + let service = EnhancedOcrService::new(temp_path); + let settings = create_test_settings(); + + // Create a PDF with normal spaced text + let pdf_content = "Hello world this is a test document with normal spacing"; + let pdf_file = create_mock_pdf_file(pdf_content); + + // Note: This test may fail because our mock PDF might not be perfectly formatted + // for pdf-extract, but it demonstrates the testing pattern + match service.extract_text_from_pdf(pdf_file.path().to_str().unwrap(), &settings).await { + Ok(result) => { + assert!(result.word_count > 0, "Should extract words from PDF with normal text"); + assert!(result.confidence >= 90.0, "PDF extraction should have high confidence"); + assert!(!result.text.is_empty(), "Should extract non-empty text"); + } + Err(e) => { + // Mock PDF might not work with pdf-extract, but we can still test the pattern + println!("PDF extraction failed (expected with mock PDF): {}", e); + } + } + } + + #[tokio::test] + async fn test_pdf_extraction_with_continuous_text() { + let temp_dir = create_temp_dir(); + let temp_path = temp_dir.path().to_str().unwrap().to_string(); + let service = EnhancedOcrService::new(temp_path); + let settings = create_test_settings(); + + // Create a PDF with continuous text (no spaces) + let pdf_content = "HelloWorldThisIsAContinuousTextWithoutSpaces"; + let pdf_file = create_mock_pdf_file(pdf_content); + + match service.extract_text_from_pdf(pdf_file.path().to_str().unwrap(), &settings).await { + Ok(result) => { + // The enhanced word counting should detect words even without spaces + assert!(result.word_count > 0, "Should detect words in continuous text: got {} words", result.word_count); + assert!(result.confidence >= 90.0, "PDF extraction should have high confidence"); + + // Verify the text was extracted + assert!(!result.text.is_empty(), "Should extract non-empty text"); + assert!(result.text.contains("Hello") || result.text.contains("World"), + "Should contain expected content"); + } + Err(e) => { + println!("PDF extraction failed (expected with mock PDF): {}", e); + } + } + } + + #[tokio::test] + async fn test_pdf_extraction_with_mixed_content() { + let temp_dir = create_temp_dir(); + let temp_path = temp_dir.path().to_str().unwrap().to_string(); + let service = EnhancedOcrService::new(temp_path); + let settings = create_test_settings(); + + // Create a PDF with mixed content (letters, numbers, punctuation) + let pdf_content = "ABC123xyz789!@#DefGhi456"; + let pdf_file = create_mock_pdf_file(pdf_content); + + match service.extract_text_from_pdf(pdf_file.path().to_str().unwrap(), &settings).await { + Ok(result) => { + // Should detect alphanumeric patterns as words + assert!(result.word_count > 0, "Should detect words in mixed content: got {} words", result.word_count); + assert!(result.confidence >= 90.0, "PDF extraction should have high confidence"); + } + Err(e) => { + println!("PDF extraction failed (expected with mock PDF): {}", e); + } + } + } + + #[tokio::test] + async fn test_pdf_extraction_empty_content() { + let temp_dir = create_temp_dir(); + let temp_path = temp_dir.path().to_str().unwrap().to_string(); + let service = EnhancedOcrService::new(temp_path); + let settings = create_test_settings(); + + // Create a PDF with only whitespace/empty content + let pdf_content = " \n\t "; + let pdf_file = create_mock_pdf_file(pdf_content); + + match service.extract_text_from_pdf(pdf_file.path().to_str().unwrap(), &settings).await { + Ok(result) => { + assert_eq!(result.word_count, 0, "Empty content should have 0 words"); + assert!(result.text.trim().is_empty(), "Should extract empty/whitespace text"); + } + Err(e) => { + println!("PDF extraction failed (expected with mock PDF): {}", e); + } + } + } + + #[tokio::test] + async fn test_pdf_extraction_punctuation_only() { + let temp_dir = create_temp_dir(); + let temp_path = temp_dir.path().to_str().unwrap().to_string(); + let service = EnhancedOcrService::new(temp_path); + let settings = create_test_settings(); + + // Create a PDF with only punctuation + let pdf_content = "!@#$%^&*()_+-=[]{}|;':\",./<>?"; + let pdf_file = create_mock_pdf_file(pdf_content); + + match service.extract_text_from_pdf(pdf_file.path().to_str().unwrap(), &settings).await { + Ok(result) => { + // Pure punctuation should not count as words + assert_eq!(result.word_count, 0, "Pure punctuation should have 0 words: got {} words", result.word_count); + } + Err(e) => { + println!("PDF extraction failed (expected with mock PDF): {}", e); + } + } + } + + #[tokio::test] + async fn test_pdf_quality_validation() { + let temp_dir = create_temp_dir(); + let temp_path = temp_dir.path().to_str().unwrap().to_string(); + let service = EnhancedOcrService::new(temp_path); + let settings = create_test_settings(); + + // Create a PDF with good content + let pdf_content = "This is a quality document with proper text content"; + let pdf_file = create_mock_pdf_file(pdf_content); + + match service.extract_text_from_pdf(pdf_file.path().to_str().unwrap(), &settings).await { + Ok(result) => { + // Test quality validation + let is_valid = service.validate_ocr_quality(&result, &settings); + + if result.word_count > 0 { + assert!(is_valid, "Good quality PDF should pass validation"); + } else { + assert!(!is_valid, "PDF with 0 words should fail validation"); + } + + // Verify OCR result structure + assert!(result.confidence >= 0.0 && result.confidence <= 100.0, "Confidence should be in valid range"); + assert!(result.processing_time_ms > 0, "Should have processing time"); + assert!(result.preprocessing_applied.contains(&"PDF text extraction".to_string()), + "Should indicate PDF extraction was used"); + assert!(result.processed_image_path.is_none(), "PDF extraction should not produce processed image"); + } + Err(e) => { + println!("PDF extraction failed (expected with mock PDF): {}", e); + } + } + } + + /// Test PDF extraction with actual file-like scenarios + #[tokio::test] + async fn test_pdf_file_size_validation() { + let temp_dir = create_temp_dir(); + let temp_path = temp_dir.path().to_str().unwrap().to_string(); + let service = EnhancedOcrService::new(temp_path); + let settings = create_test_settings(); + + // Create a small PDF file to test file operations + let pdf_content = "Small test document"; + let pdf_file = create_mock_pdf_file(pdf_content); + + // Test that the file exists and can be read + let file_path = pdf_file.path().to_str().unwrap(); + assert!(std::path::Path::new(file_path).exists(), "PDF file should exist"); + + // Test file size checking (this will work even if PDF extraction fails) + let metadata = tokio::fs::metadata(file_path).await.expect("Should read file metadata"); + assert!(metadata.len() > 0, "PDF file should have content"); + assert!(metadata.len() < 100 * 1024 * 1024, "Test PDF should be under size limit"); + } + + #[test] + fn test_word_counting_regression_cases() { + let temp_dir = create_temp_dir(); + let temp_path = temp_dir.path().to_str().unwrap().to_string(); + let service = EnhancedOcrService::new(temp_path); + + // Regression test cases for the specific PDF issue + let test_cases = vec![ + // Case 1: Continuous text like NDA documents + ("SOCLogixNDAConfidentialityAgreement", "SOC Logix NDA type content"), + + // Case 2: Mixed case and numbers + ("ABC123DEF456", "Mixed alphanumeric content"), + + // Case 3: Document-like text patterns + ("ThisIsATestDocumentWithCamelCase", "CamelCase document text"), + + // Case 4: All caps + ("THISISALLCAPSTEXT", "All caps text"), + + // Case 5: Mixed with punctuation + ("Text.With.Dots.Between", "Text with dot separators"), + ]; + + for (input, description) in test_cases { + let count = service.count_words_safely(input); + assert!(count > 0, "Should detect words in {}: '{}' -> {} words", description, input, count); + + // Test that the counting is consistent + let count2 = service.count_words_safely(input); + assert_eq!(count, count2, "Word counting should be consistent for {}", description); + } + } +} \ No newline at end of file diff --git a/tests/test_pdfs/continuous_text.pdf b/tests/test_pdfs/continuous_text.pdf new file mode 100644 index 0000000..ffe2364 --- /dev/null +++ b/tests/test_pdfs/continuous_text.pdf @@ -0,0 +1,58 @@ +%PDF-1.4 +1 0 obj +<< +/Type /Catalog +/Pages 2 0 R +>> +endobj +2 0 obj +<< +/Type /Pages +/Kids [3 0 R] +/Count 1 +>> +endobj +3 0 obj +<< +/Type /Page +/Parent 2 0 R +/Contents 4 0 R +/MediaBox [0 0 612 792] +/Resources << + /Font << + /F1 << + /Type /Font + /Subtype /Type1 + /BaseFont /Helvetica + >> + >> +>> +>> +endobj +4 0 obj +<< +/Length 85 +>> +stream +BT +/F1 12 Tf +72 720 Td +(HelloWorldThisIsAContinuousTextDocumentWithoutSpaces) Tj +ET +endstream +endobj +xref +0 5 +0000000000 65535 f +0000000009 00000 n +0000000074 00000 n +0000000120 00000 n +0000000324 00000 n +trailer +<< +/Size 5 +/Root 1 0 R +>> +startxref +458 +%%EOF \ No newline at end of file diff --git a/tests/test_pdfs/edge_cases_realistic.pdf b/tests/test_pdfs/edge_cases_realistic.pdf new file mode 100644 index 0000000..2e10f89 --- /dev/null +++ b/tests/test_pdfs/edge_cases_realistic.pdf @@ -0,0 +1,68 @@ +%PDF-1.3 +%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com +1 0 obj +<< +/F1 2 0 R +>> +endobj +2 0 obj +<< +/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font +>> +endobj +3 0 obj +<< +/Contents 7 0 R /MediaBox [ 0 0 612 792 ] /Parent 6 0 R /Resources << +/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] +>> /Rotate 0 /Trans << + +>> + /Type /Page +>> +endobj +4 0 obj +<< +/PageMode /UseNone /Pages 6 0 R /Type /Catalog +>> +endobj +5 0 obj +<< +/Author (anonymous) /CreationDate (D:20250701003050+00'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20250701003050+00'00') /Producer (ReportLab PDF Library - www.reportlab.com) + /Subject (unspecified) /Title (untitled) /Trapped /False +>> +endobj +6 0 obj +<< +/Count 1 /Kids [ 3 0 R ] /Type /Pages +>> +endobj +7 0 obj +<< +/Filter [ /ASCII85Decode /FlateDecode ] /Length 435 +>> +stream +Gas2FbAP0N&4Q>@`F[Y%.Ps7\=+r&.TaDN9FDI%j\;fl(SeUts3[@6I8n?$Og\5CDGQL#''qG##!t0\Z>C^)]YsU51N?FOC7&T-/BFrVn-8U.TE>KpJ67pIGcigO]?M8F/E%`kc`e_!I@T)ejA9U6WoDt7H'Vi28;J^,8m7$:9q3l'iI&$Kd?fAaqin.kYAftnQE9*#KQ/5,X!8q,?l72@d:bYAqOBaqO@fa1UiD9W26MY^GZp@Zk#,6`]A;nO).OP(V&Grg<@)LX`fc[/f)8_8ISc8#~>endstream +endobj +xref +0 8 +0000000000 65535 f +0000000073 00000 n +0000000104 00000 n +0000000211 00000 n +0000000404 00000 n +0000000472 00000 n +0000000768 00000 n +0000000827 00000 n +trailer +<< +/ID +[<4d1effad52f9f6c5ad297996ada120d6><4d1effad52f9f6c5ad297996ada120d6>] +% ReportLab generated PDF document -- digest (http://www.reportlab.com) + +/Info 5 0 R +/Root 4 0 R +/Size 8 +>> +startxref +1352 +%%EOF diff --git a/tests/test_pdfs/mixed_content.txt b/tests/test_pdfs/mixed_content.txt new file mode 100644 index 0000000..cd64050 --- /dev/null +++ b/tests/test_pdfs/mixed_content.txt @@ -0,0 +1 @@ +Document with numbers 123 and symbols @#$ mixed with normal text. \ No newline at end of file diff --git a/tests/test_pdfs/multiline_text.txt b/tests/test_pdfs/multiline_text.txt new file mode 100644 index 0000000..6cc5b89 --- /dev/null +++ b/tests/test_pdfs/multiline_text.txt @@ -0,0 +1,4 @@ +Line one with several words +Line two with more content +Line three continues the pattern +Final line ends the document \ No newline at end of file diff --git a/tests/test_pdfs/multipage_document.pdf b/tests/test_pdfs/multipage_document.pdf new file mode 100644 index 0000000..7e6b2e7 --- /dev/null +++ b/tests/test_pdfs/multipage_document.pdf @@ -0,0 +1,101 @@ +%PDF-1.4 +1 0 obj +<< +/Type /Catalog +/Pages 2 0 R +>> +endobj +2 0 obj +<< +/Type /Pages +/Kids [3 0 R 4 0 R] +/Count 2 +>> +endobj +3 0 obj +<< +/Type /Page +/Parent 2 0 R +/Contents 5 0 R +/MediaBox [0 0 612 792] +/Resources << + /Font << + /F1 << + /Type /Font + /Subtype /Type1 + /BaseFont /Helvetica + >> + >> +>> +>> +endobj +4 0 obj +<< +/Type /Page +/Parent 2 0 R +/Contents 6 0 R +/MediaBox [0 0 612 792] +/Resources << + /Font << + /F1 << + /Type /Font + /Subtype /Type1 + /BaseFont /Helvetica + >> + >> +>> +>> +endobj +5 0 obj +<< +/Length 200 +>> +stream +BT +/F1 12 Tf +72 720 Td +(Page 1: This is the first page of a multi-page document.) Tj +0 -24 Td +(It contains multiple sentences with proper spacing.) Tj +0 -24 Td +(Each line should be counted as separate words.) Tj +0 -24 Td +(Total words on this page should be easily counted.) Tj +ET +endstream +endobj +6 0 obj +<< +/Length 180 +>> +stream +BT +/F1 12 Tf +72 720 Td +(Page 2: Continuing from the previous page.) Tj +0 -24 Td +(This page also has normal text formatting.) Tj +0 -24 Td +(Word counting should work correctly here too.) Tj +0 -24 Td +(End of document with proper word boundaries.) Tj +ET +endstream +endobj +xref +0 7 +0000000000 65535 f +0000000009 00000 n +0000000074 00000 n +0000000125 00000 n +0000000369 00000 n +0000000613 00000 n +0000000863 00000 n +trailer +<< +/Size 7 +/Root 1 0 R +>> +startxref +1092 +%%EOF \ No newline at end of file diff --git a/tests/test_pdfs/multipage_realistic.pdf b/tests/test_pdfs/multipage_realistic.pdf new file mode 100644 index 0000000..a8a062f --- /dev/null +++ b/tests/test_pdfs/multipage_realistic.pdf @@ -0,0 +1,87 @@ +%PDF-1.3 +%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com +1 0 obj +<< +/F1 2 0 R +>> +endobj +2 0 obj +<< +/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font +>> +endobj +3 0 obj +<< +/Contents 8 0 R /MediaBox [ 0 0 612 792 ] /Parent 7 0 R /Resources << +/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] +>> /Rotate 0 /Trans << + +>> + /Type /Page +>> +endobj +4 0 obj +<< +/Contents 9 0 R /MediaBox [ 0 0 612 792 ] /Parent 7 0 R /Resources << +/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] +>> /Rotate 0 /Trans << + +>> + /Type /Page +>> +endobj +5 0 obj +<< +/PageMode /UseNone /Pages 7 0 R /Type /Catalog +>> +endobj +6 0 obj +<< +/Author (anonymous) /CreationDate (D:20250701003050+00'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20250701003050+00'00') /Producer (ReportLab PDF Library - www.reportlab.com) + /Subject (unspecified) /Title (untitled) /Trapped /False +>> +endobj +7 0 obj +<< +/Count 2 /Kids [ 3 0 R 4 0 R ] /Type /Pages +>> +endobj +8 0 obj +<< +/Filter [ /ASCII85Decode /FlateDecode ] /Length 406 +>> +stream +Gas30:J\kN(^BL,5/hSB1T0TCo4XYNM4,GU4MNCg`DL6Hgfua>[qrB]-MdM:E<`236A!g$1D67*\dA.-ck2're$?]L,EP&5d/rS9LEhgfJ2<>Ml"DM/7&6N2>sT:If`6;H#EbotJhg,Xt"A)9AH~>endstream +endobj +9 0 obj +<< +/Filter [ /ASCII85Decode /FlateDecode ] /Length 402 +>> +stream +Gas2D95h[$&;9NMME/*[U3.TSE%?`.UfKfsUgA@%p*OcNCh,UirdAXH6Os?'nlkA0[kkqk&.UeZU&c#im[kA!K"M)QN$cX'Z-!"6n#B#\(+M[f/P'3)&;@^>b?,'Zg*ikPX%Z_j*/r^.VR7[sI6pAgi/P96@iar(*@^!T-gg)A=tEf3L88:A:@[oOQ9EO=E/-bo]$*gRNq^(/Tn4jnEQa%`lVOW661El\S?a:p%lN+j&6ql\fQ:sDb@Pi=@diu[i.aRS>1Q";cC,d,i-KHrY6Z;u\69bu7;d$n'f;9ZdYP=endstream +endobj +xref +0 10 +0000000000 65535 f +0000000073 00000 n +0000000104 00000 n +0000000211 00000 n +0000000404 00000 n +0000000597 00000 n +0000000665 00000 n +0000000961 00000 n +0000001026 00000 n +0000001522 00000 n +trailer +<< +/ID +[<098a3737c0449fe862826c7afae59697><098a3737c0449fe862826c7afae59697>] +% ReportLab generated PDF document -- digest (http://www.reportlab.com) + +/Info 6 0 R +/Root 5 0 R +/Size 10 +>> +startxref +2014 +%%EOF diff --git a/tests/test_pdfs/normal_spacing.txt b/tests/test_pdfs/normal_spacing.txt new file mode 100644 index 0000000..8c655d1 --- /dev/null +++ b/tests/test_pdfs/normal_spacing.txt @@ -0,0 +1 @@ +This is a normal document with proper word spacing and punctuation. \ No newline at end of file diff --git a/tests/test_pdfs/normal_text.pdf b/tests/test_pdfs/normal_text.pdf new file mode 100644 index 0000000..4fb6a3c --- /dev/null +++ b/tests/test_pdfs/normal_text.pdf @@ -0,0 +1,58 @@ +%PDF-1.4 +1 0 obj +<< +/Type /Catalog +/Pages 2 0 R +>> +endobj +2 0 obj +<< +/Type /Pages +/Kids [3 0 R] +/Count 1 +>> +endobj +3 0 obj +<< +/Type /Page +/Parent 2 0 R +/Contents 4 0 R +/MediaBox [0 0 612 792] +/Resources << + /Font << + /F1 << + /Type /Font + /Subtype /Type1 + /BaseFont /Helvetica + >> + >> +>> +>> +endobj +4 0 obj +<< +/Length 75 +>> +stream +BT +/F1 12 Tf +72 720 Td +(This is a normal document with proper word spacing) Tj +ET +endstream +endobj +xref +0 5 +0000000000 65535 f +0000000009 00000 n +0000000074 00000 n +0000000120 00000 n +0000000324 00000 n +trailer +<< +/Size 5 +/Root 1 0 R +>> +startxref +448 +%%EOF \ No newline at end of file diff --git a/tests/test_pdfs/problematic_encoding.pdf b/tests/test_pdfs/problematic_encoding.pdf new file mode 100644 index 0000000..e3d0b9f --- /dev/null +++ b/tests/test_pdfs/problematic_encoding.pdf @@ -0,0 +1,64 @@ +%PDF-1.4 +1 0 obj +<< +/Type /Catalog +/Pages 2 0 R +>> +endobj +2 0 obj +<< +/Type /Pages +/Kids [3 0 R] +/Count 1 +>> +endobj +3 0 obj +<< +/Type /Page +/Parent 2 0 R +/Contents 4 0 R +/MediaBox [0 0 612 792] +/Resources << + /Font << + /F1 << + /Type /Font + /Subtype /Type1 + /BaseFont /Helvetica + >> + >> +>> +>> +endobj +4 0 obj +<< +/Length 165 +>> +stream +BT +/F1 12 Tf +72 720 Td +(Text with special characters: caf\351 na\357ve r\351sum\351) Tj +0 -24 Td +(Unicode characters: \u2022 \u2013 \u2014 \u201C \u201D) Tj +0 -24 Td +(Mixed content: ABC123 def456 GHI789) Tj +0 -24 Td +(Normal text: This should work fine.) Tj +ET +endstream +endobj +xref +0 5 +0000000000 65535 f +0000000009 00000 n +0000000074 00000 n +0000000120 00000 n +0000000324 00000 n +trailer +<< +/Size 5 +/Root 1 0 R +>> +startxref +538 +%%EOF \ No newline at end of file diff --git a/tests/test_pdfs/special_chars.txt b/tests/test_pdfs/special_chars.txt new file mode 100644 index 0000000..0e231e1 --- /dev/null +++ b/tests/test_pdfs/special_chars.txt @@ -0,0 +1 @@ +Text with special characters: café naïve résumé — and 'quotes' • bullets \ No newline at end of file