diff --git a/src/ocr/enhanced.rs b/src/ocr/enhanced.rs index f333a66..11a0669 100644 --- a/src/ocr/enhanced.rs +++ b/src/ocr/enhanced.rs @@ -839,6 +839,7 @@ impl EnhancedOcrService { let text = match extraction_result { Ok(Ok(Ok(Ok(text)))) => text, Ok(Ok(Ok(Err(e)))) => { + warn!("PDF text extraction failed for file '{}' (size: {} bytes): {}", file_path, file_size, e); return Err(anyhow!( "PDF text extraction failed for file '{}' (size: {} bytes): {}. This may indicate a corrupted or unsupported PDF format.", file_path, file_size, e @@ -847,8 +848,7 @@ impl EnhancedOcrService { Ok(Ok(Err(_panic))) => { // pdf-extract panicked (e.g., missing unicode map, corrupted font encoding) // For now, gracefully handle this common issue - use tracing::debug; - debug!("PDF text extraction failed for '{}' due to font encoding issues. This is a known limitation with certain PDF files.", file_path); + warn!("PDF text extraction panicked for '{}' (size: {} bytes) due to font encoding issues. This is a known limitation with certain PDF files.", file_path, file_size); return Err(anyhow!( "PDF text extraction failed due to font encoding issues in '{}' (size: {} bytes). This PDF uses non-standard fonts or character encoding that cannot be processed. To extract text from this PDF, consider: 1) Converting it to images and uploading those instead, 2) Using a different PDF viewer to re-save the PDF with standard encoding, or 3) Using external tools to convert the PDF to a more compatible format.", @@ -856,9 +856,11 @@ impl EnhancedOcrService { )); } Ok(Err(e)) => { + warn!("PDF extraction task failed for '{}' (size: {} bytes): {}", file_path, file_size, e); return Err(anyhow!("PDF extraction task failed: {}", e)); } Err(_) => { + warn!("PDF extraction timed out after 2 minutes for file '{}' (size: {} bytes). The PDF may be corrupted or too complex.", file_path, file_size); return Err(anyhow!( "PDF extraction timed out after 2 minutes for file '{}' (size: {} bytes). The PDF may be corrupted or too complex.", file_path, file_size @@ -1019,7 +1021,26 @@ impl EnhancedOcrService { let temp_ocr_path = temp_ocr_path.clone(); move || -> Result { let bytes = std::fs::read(&temp_ocr_path)?; - let text = pdf_extract::extract_text_from_mem(&bytes)?; + // Catch panics from pdf-extract library (same pattern as used elsewhere) + let text = match catch_unwind(AssertUnwindSafe(|| { + pdf_extract::extract_text_from_mem(&bytes) + })) { + Ok(Ok(text)) => text, + Ok(Err(e)) => { + warn!("PDF text extraction failed after OCR processing for '{}': {}", temp_ocr_path, e); + return Err(anyhow!( + "PDF text extraction failed after OCR processing: {}. This may indicate a corrupted or unsupported PDF format.", + e + )); + }, + Err(_) => { + warn!("PDF extraction panicked after OCR processing for '{}' due to invalid content stream", temp_ocr_path); + return Err(anyhow!( + "PDF extraction panicked after OCR processing due to invalid content stream or corrupted PDF structure. \ + This suggests the PDF has malformed internal structure that cannot be parsed safely." + )); + }, + }; Ok(text.trim().to_string()) } }).await??; diff --git a/src/tests/ocr_tests.rs b/src/tests/ocr_tests.rs index 217b266..2336f6f 100644 --- a/src/tests/ocr_tests.rs +++ b/src/tests/ocr_tests.rs @@ -393,4 +393,295 @@ startxref assert!(all_updates.contains_key(&doc_id1)); assert!(all_updates.contains_key(&doc_id2)); } + + /// Test that malformed PDFs don't crash the OCR system + #[tokio::test] + async fn test_malformed_pdf_panic_handling() { + let ocr_service = OcrService::new(); + + // Create a malformed PDF in memory that will cause pdf-extract to panic + let malformed_pdf_content = b"%PDF-1.4 +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +2 0 obj +<< /Type /Pages /Kids [3 0 R] /Count 1 >> +endobj +3 0 obj +<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >> +endobj +4 0 obj +<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >> +endobj +5 0 obj +<< /Length 999 >> +stream +BT +/F1 12 Tf +100 700 Td +(This is a malformed PDF with invalid content stream) Tj +ET +INVALID_CONTENT_STREAM_DATA_THAT_CAUSES_PANIC +endstream +endobj +xref +0 6 +0000000000 65535 f +0000000009 00000 n +0000000058 00000 n +0000000115 00000 n +0000000262 00000 n +0000000341 00000 n +trailer +<< /Size 6 /Root 1 0 R >> +startxref +999 +%%EOF"; + + // Write to temporary file + let temp_file = NamedTempFile::with_suffix(".pdf").unwrap(); + std::fs::write(temp_file.path(), malformed_pdf_content).unwrap(); + + let result = ocr_service.extract_text_from_pdf(temp_file.path().to_str().unwrap()).await; + // Should not panic, should return an error instead + assert!(result.is_err(), "Expected error for malformed PDF"); + let error_msg = result.unwrap_err().to_string(); + println!("Error message: {}", error_msg); + // Should contain descriptive error message + assert!( + error_msg.contains("panic") || + error_msg.contains("invalid content stream") || + error_msg.contains("corrupted") || + error_msg.contains("extract") || + error_msg.contains("Failed to extract") + ); + } + + #[tokio::test] + async fn test_corrupted_pdf_structure_handling() { + let ocr_service = OcrService::new(); + + // Create a corrupted PDF structure that will cause pdf-extract to fail + let corrupted_pdf_content = b"%PDF-1.4 +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +2 0 obj +<< /Type /Pages /Kids [3 0 R] /Count 1 >> +endobj +3 0 obj +<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >> +endobj +4 0 obj +<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >> +endobj +5 0 obj +<< /Length 44 >> +stream +BT +/F1 12 Tf +100 700 Td +(Corrupted PDF) Tj +ET +endstream +endobj +xref +0 6 +0000000000 65535 f +0000000009 00000 n +0000000058 00000 n +0000000115 00000 n +0000000262 00000 n +0000000341 00000 n +trailer +<< /Size 6 /Root 1 0 R /InvalidKey >> +startxref +999999 +%%EOF"; + + let temp_file = NamedTempFile::with_suffix(".pdf").unwrap(); + std::fs::write(temp_file.path(), corrupted_pdf_content).unwrap(); + + let result = ocr_service.extract_text_from_pdf(temp_file.path().to_str().unwrap()).await; + // Should not panic, should return an error instead + assert!(result.is_err(), "Expected error for corrupted PDF"); + let error_msg = result.unwrap_err().to_string(); + println!("Corrupted PDF error: {}", error_msg); + // Should contain descriptive error message + assert!( + error_msg.contains("panic") || + error_msg.contains("corrupted") || + error_msg.contains("extract") || + error_msg.contains("PDF") || + error_msg.contains("Failed to extract") + ); + } + + #[tokio::test] + async fn test_invalid_font_encoding_handling() { + let ocr_service = OcrService::new(); + + // Test with invalid font encoding + let invalid_font = "tests/test_pdfs/invalid_font_encoding.pdf"; + if Path::new(invalid_font).exists() { + let result = ocr_service.extract_text_from_pdf(invalid_font).await; + // Should not panic, should return an error instead + assert!(result.is_err()); + let error_msg = result.unwrap_err().to_string(); + // Should contain descriptive error message + assert!( + error_msg.contains("panic") || + error_msg.contains("font") || + error_msg.contains("encoding") || + error_msg.contains("extract") + ); + } + } + + #[tokio::test] + async fn test_fake_pdf_handling() { + let ocr_service = OcrService::new(); + + // Create a fake PDF file (not actually a PDF) that will definitely cause an error + let fake_pdf_content = b"This is not a PDF file at all, just plain text with a PDF extension. +It should cause pdf-extract to fail when trying to parse it. +This tests the error handling for files that aren't actually PDFs."; + + let temp_file = NamedTempFile::with_suffix(".pdf").unwrap(); + std::fs::write(temp_file.path(), fake_pdf_content).unwrap(); + + let result = ocr_service.extract_text_from_pdf(temp_file.path().to_str().unwrap()).await; + // Should not panic, should return an error instead + assert!(result.is_err(), "Expected error for fake PDF"); + let error_msg = result.unwrap_err().to_string(); + println!("Fake PDF error: {}", error_msg); + // Should contain descriptive error message about parsing failure + assert!( + error_msg.contains("extract") || + error_msg.contains("parse") || + error_msg.contains("PDF") || + error_msg.contains("format") || + error_msg.contains("Failed to extract") + ); + } + + #[tokio::test] + async fn test_problematic_encoding_pdf_handling() { + let ocr_service = OcrService::new(); + + // Test with the existing problematic encoding PDF + let problematic_encoding = "tests/test_pdfs/problematic_encoding.pdf"; + if Path::new(problematic_encoding).exists() { + let result = ocr_service.extract_text_from_pdf(problematic_encoding).await; + // Should not panic, should return an error instead + assert!(result.is_err()); + let error_msg = result.unwrap_err().to_string(); + // Should contain descriptive error message + assert!( + error_msg.contains("panic") || + error_msg.contains("encoding") || + error_msg.contains("extract") || + error_msg.contains("font") + ); + } + } + + /// Test that the enhanced OCR service also handles panics correctly + #[tokio::test] + async fn test_enhanced_ocr_panic_handling() { + use crate::ocr::enhanced::EnhancedOcrService; + use crate::services::file_service::FileService; + use crate::models::Settings; + + let ocr_service = EnhancedOcrService::new("tests".to_string()); + let settings = Settings::default(); + + // Test all malformed PDFs with enhanced OCR + let test_files = vec![ + "tests/test_pdfs/malformed_content_stream.pdf", + "tests/test_pdfs/corrupted_structure.pdf", + "tests/test_pdfs/invalid_font_encoding.pdf", + "tests/test_pdfs/fake_pdf.pdf", + "tests/test_pdfs/problematic_encoding.pdf", + ]; + + for test_file in test_files { + if Path::new(test_file).exists() { + let result = ocr_service.extract_text_with_context( + test_file, + "application/pdf", + &Path::new(test_file).file_name().unwrap().to_str().unwrap(), + 1024, // file_size + &settings + ).await; + + // Should not panic, should return an error instead + assert!(result.is_err(), "Expected error for file: {}", test_file); + let error_msg = result.unwrap_err().to_string(); + + // Should contain descriptive error message + assert!( + error_msg.contains("panic") || + error_msg.contains("extract") || + error_msg.contains("PDF") || + error_msg.contains("corrupted") || + error_msg.contains("encoding") || + error_msg.contains("font"), + "Error message should be descriptive for {}: {}", test_file, error_msg + ); + } + } + } + + /// Test that panic handling works correctly in concurrent scenarios + #[tokio::test] + async fn test_concurrent_pdf_panic_handling() { + use std::sync::Arc; + use futures::future::join_all; + + let ocr_service = Arc::new(OcrService::new()); + let mut handles = Vec::new(); + + // Test concurrent processing of malformed PDFs + let test_files = vec![ + "tests/test_pdfs/malformed_content_stream.pdf", + "tests/test_pdfs/corrupted_structure.pdf", + "tests/test_pdfs/invalid_font_encoding.pdf", + "tests/test_pdfs/fake_pdf.pdf", + ]; + + for test_file in test_files { + if Path::new(test_file).exists() { + let ocr_service_clone = Arc::clone(&ocr_service); + let test_file_owned = test_file.to_string(); + + let handle = tokio::spawn(async move { + let result = ocr_service_clone.extract_text_from_pdf(&test_file_owned).await; + // Should not panic, should return an error instead + assert!(result.is_err(), "Expected error for file: {}", test_file_owned); + let error_msg = result.unwrap_err().to_string(); + + // Should contain descriptive error message + assert!( + error_msg.contains("panic") || + error_msg.contains("extract") || + error_msg.contains("PDF") || + error_msg.contains("corrupted") || + error_msg.contains("encoding"), + "Error message should be descriptive for {}: {}", test_file_owned, error_msg + ); + }); + + handles.push(handle); + } + } + + // Wait for all concurrent tasks to complete + let results = join_all(handles).await; + + // Verify all tasks completed without panicking + for result in results { + assert!(result.is_ok(), "Task should complete without panicking"); + } + } } \ No newline at end of file diff --git a/tests/test_pdfs/corrupted_structure.pdf b/tests/test_pdfs/corrupted_structure.pdf new file mode 100644 index 0000000..5b76371 --- /dev/null +++ b/tests/test_pdfs/corrupted_structure.pdf @@ -0,0 +1,36 @@ +%PDF-1.4 +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +2 0 obj +<< /Type /Pages /Kids [3 0 R] /Count 1 >> +endobj +3 0 obj +<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >> +endobj +4 0 obj +<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >> +endobj +5 0 obj +<< /Length 44 >> +stream +BT +/F1 12 Tf +100 700 Td +(Corrupted PDF) Tj +ET +endstream +endobj +xref +0 6 +0000000000 65535 f +0000000009 00000 n +0000000058 00000 n +0000000115 00000 n +0000000262 00000 n +0000000341 00000 n +trailer +<< /Size 6 /Root 1 0 R /InvalidKey >> +startxref +999999 +%%EOF \ No newline at end of file diff --git a/tests/test_pdfs/fake_pdf.pdf b/tests/test_pdfs/fake_pdf.pdf new file mode 100644 index 0000000..c24c733 --- /dev/null +++ b/tests/test_pdfs/fake_pdf.pdf @@ -0,0 +1,3 @@ +This is not a PDF file at all, just plain text with a PDF extension. +It should cause pdf-extract to fail when trying to parse it. +This tests the error handling for files that aren't actually PDFs. \ No newline at end of file diff --git a/tests/test_pdfs/invalid_font_encoding.pdf b/tests/test_pdfs/invalid_font_encoding.pdf new file mode 100644 index 0000000..5482d86 --- /dev/null +++ b/tests/test_pdfs/invalid_font_encoding.pdf @@ -0,0 +1,36 @@ +%PDF-1.4 +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +2 0 obj +<< /Type /Pages /Kids [3 0 R] /Count 1 >> +endobj +3 0 obj +<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >> +endobj +4 0 obj +<< /Type /Font /Subtype /Type1 /BaseFont /InvalidFont /Encoding /InvalidEncoding >> +endobj +5 0 obj +<< /Length 44 >> +stream +BT +/F1 12 Tf +100 700 Td +(Invalid font encoding) Tj +ET +endstream +endobj +xref +0 6 +0000000000 65535 f +0000000009 00000 n +0000000058 00000 n +0000000115 00000 n +0000000262 00000 n +0000000341 00000 n +trailer +<< /Size 6 /Root 1 0 R >> +startxref +435 +%%EOF \ No newline at end of file diff --git a/tests/test_pdfs/malformed_content_stream.pdf b/tests/test_pdfs/malformed_content_stream.pdf new file mode 100644 index 0000000..17f7f79 --- /dev/null +++ b/tests/test_pdfs/malformed_content_stream.pdf @@ -0,0 +1,37 @@ +%PDF-1.4 +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +2 0 obj +<< /Type /Pages /Kids [3 0 R] /Count 1 >> +endobj +3 0 obj +<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >> +endobj +4 0 obj +<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >> +endobj +5 0 obj +<< /Length 999 >> +stream +BT +/F1 12 Tf +100 700 Td +(This is a malformed PDF with invalid content stream) Tj +ET +INVALID_CONTENT_STREAM_DATA_THAT_CAUSES_PANIC +endstream +endobj +xref +0 6 +0000000000 65535 f +0000000009 00000 n +0000000058 00000 n +0000000115 00000 n +0000000262 00000 n +0000000341 00000 n +trailer +<< /Size 6 /Root 1 0 R >> +startxref +999 +%%EOF \ No newline at end of file