feat(ocr): gracefully handle problematic PDFs (extraction errors, panics, and timeouts) and add regression tests so these failures do not recur

2025-07-14 16:36:32 +00:00 · 2025-07-14 16:36:32 +00:00 · 6165148e4d
parent ad27d99949
commit 6165148e4d
6 changed files with 427 additions and 3 deletions
--- a/src/ocr/enhanced.rs
+++ b/src/ocr/enhanced.rs
@ -839,6 +839,7 @@ impl EnhancedOcrService {
        let text = match extraction_result {
            Ok(Ok(Ok(Ok(text)))) => text,
            Ok(Ok(Ok(Err(e)))) => {
+                warn!("PDF text extraction failed for file '{}' (size: {} bytes): {}", file_path, file_size, e);
                return Err(anyhow!(
                    "PDF text extraction failed for file '{}' (size: {} bytes): {}. This may indicate a corrupted or unsupported PDF format.",
                    file_path, file_size, e
@ -847,8 +848,7 @@ impl EnhancedOcrService {
            Ok(Ok(Err(_panic))) => {
                // pdf-extract panicked (e.g., missing unicode map, corrupted font encoding)
                // For now, gracefully handle this common issue
-                use tracing::debug;
-                debug!("PDF text extraction failed for '{}' due to font encoding issues. This is a known limitation with certain PDF files.", file_path);
+                warn!("PDF text extraction panicked for '{}' (size: {} bytes) due to font encoding issues. This is a known limitation with certain PDF files.", file_path, file_size);
                
                return Err(anyhow!(
                    "PDF text extraction failed due to font encoding issues in '{}' (size: {} bytes). This PDF uses non-standard fonts or character encoding that cannot be processed. To extract text from this PDF, consider: 1) Converting it to images and uploading those instead, 2) Using a different PDF viewer to re-save the PDF with standard encoding, or 3) Using external tools to convert the PDF to a more compatible format.",
@ -856,9 +856,11 @@ impl EnhancedOcrService {
                ));
            }
            Ok(Err(e)) => {
+                warn!("PDF extraction task failed for '{}' (size: {} bytes): {}", file_path, file_size, e);
                return Err(anyhow!("PDF extraction task failed: {}", e));
            }
            Err(_) => {
+                warn!("PDF extraction timed out after 2 minutes for file '{}' (size: {} bytes). The PDF may be corrupted or too complex.", file_path, file_size);
                return Err(anyhow!(
                    "PDF extraction timed out after 2 minutes for file '{}' (size: {} bytes). The PDF may be corrupted or too complex.",
                    file_path, file_size
@ -1019,7 +1021,26 @@ impl EnhancedOcrService {
            let temp_ocr_path = temp_ocr_path.clone();
            move || -> Result<String> {
                let bytes = std::fs::read(&temp_ocr_path)?;
-                let text = pdf_extract::extract_text_from_mem(&bytes)?;
+                // Catch panics from pdf-extract library (same pattern as used elsewhere)
+                let text = match catch_unwind(AssertUnwindSafe(|| {
+                    pdf_extract::extract_text_from_mem(&bytes)
+                })) {
+                    Ok(Ok(text)) => text,
+                    Ok(Err(e)) => {
+                        warn!("PDF text extraction failed after OCR processing for '{}': {}", temp_ocr_path, e);
+                        return Err(anyhow!(
+                            "PDF text extraction failed after OCR processing: {}. This may indicate a corrupted or unsupported PDF format.",
+                            e
+                        ));
+                    },
+                    Err(_) => {
+                        warn!("PDF extraction panicked after OCR processing for '{}' due to invalid content stream", temp_ocr_path);
+                        return Err(anyhow!(
+                            "PDF extraction panicked after OCR processing due to invalid content stream or corrupted PDF structure. \
+                            This suggests the PDF has malformed internal structure that cannot be parsed safely."
+                        ));
+                    },
+                };
                Ok(text.trim().to_string())
            }
        }).await??;
--- a/src/tests/ocr_tests.rs
+++ b/src/tests/ocr_tests.rs
@ -393,4 +393,295 @@ startxref
        assert!(all_updates.contains_key(&doc_id1));
        assert!(all_updates.contains_key(&doc_id2));
    }
+
+    /// Test that malformed PDFs don't crash the OCR system.
+    ///
+    /// Writes a PDF with an inconsistent content stream to a temporary file and
+    /// checks that `extract_text_from_pdf` reports an error with a descriptive
+    /// message instead of unwinding into the test.
+    #[tokio::test]
+    async fn test_malformed_pdf_panic_handling() {
+        let ocr_service = OcrService::new();
+
+        // Malformed PDF: the stream declares `/Length 999` although the actual
+        // stream is much shorter, and it ends with a token that is not a PDF operator.
+        let malformed_pdf_content = b"%PDF-1.4
+1 0 obj
+<< /Type /Catalog /Pages 2 0 R >>
+endobj
+2 0 obj
+<< /Type /Pages /Kids [3 0 R] /Count 1 >>
+endobj
+3 0 obj
+<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >>
+endobj
+4 0 obj
+<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>
+endobj
+5 0 obj
+<< /Length 999 >>
+stream
+BT
+/F1 12 Tf
+100 700 Td
+(This is a malformed PDF with invalid content stream) Tj
+ET
+INVALID_CONTENT_STREAM_DATA_THAT_CAUSES_PANIC
+endstream
+endobj
+xref
+0 6
+0000000000 65535 f
+0000000009 00000 n
+0000000058 00000 n
+0000000115 00000 n
+0000000262 00000 n
+0000000341 00000 n
+trailer
+<< /Size 6 /Root 1 0 R >>
+startxref
+999
+%%EOF";
+
+        // Write to temporary file
+        let temp_file = NamedTempFile::with_suffix(".pdf").unwrap();
+        std::fs::write(temp_file.path(), malformed_pdf_content).unwrap();
+
+        let result = ocr_service.extract_text_from_pdf(temp_file.path().to_str().unwrap()).await;
+        // Should not panic, should return an error instead
+        assert!(result.is_err(), "Expected error for malformed PDF");
+        let error_msg = result.unwrap_err().to_string();
+        println!("Error message: {}", error_msg);
+
+        // Should contain descriptive error message. "Failed to extract" is not
+        // listed separately because any such message already contains "extract".
+        let is_descriptive = ["panic", "invalid content stream", "corrupted", "extract"]
+            .iter()
+            .any(|needle| error_msg.contains(*needle));
+        assert!(
+            is_descriptive,
+            "Error message should be descriptive for malformed PDF: {}",
+            error_msg
+        );
+    }
+
+    #[tokio::test]
+    async fn test_corrupted_pdf_structure_handling() {
+        let ocr_service = OcrService::new();
+        
+        // Create a corrupted PDF structure that will cause pdf-extract to fail
+        let corrupted_pdf_content = b"%PDF-1.4
+1 0 obj
+<< /Type /Catalog /Pages 2 0 R >>
+endobj
+2 0 obj
+<< /Type /Pages /Kids [3 0 R] /Count 1 >>
+endobj
+3 0 obj
+<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >>
+endobj
+4 0 obj
+<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>
+endobj
+5 0 obj
+<< /Length 44 >>
+stream
+BT
+/F1 12 Tf
+100 700 Td
+(Corrupted PDF) Tj
+ET
+endstream
+endobj
+xref
+0 6
+0000000000 65535 f
+0000000009 00000 n
+0000000058 00000 n
+0000000115 00000 n
+0000000262 00000 n
+0000000341 00000 n
+trailer
+<< /Size 6 /Root 1 0 R /InvalidKey >>
+startxref
+999999
+%%EOF";
+
+        let temp_file = NamedTempFile::with_suffix(".pdf").unwrap();
+        std::fs::write(temp_file.path(), corrupted_pdf_content).unwrap();
+        
+        let result = ocr_service.extract_text_from_pdf(temp_file.path().to_str().unwrap()).await;
+        // Should not panic, should return an error instead
+        assert!(result.is_err(), "Expected error for corrupted PDF");
+        let error_msg = result.unwrap_err().to_string();
+        println!("Corrupted PDF error: {}", error_msg);
+        // Should contain descriptive error message
+        assert!(
+            error_msg.contains("panic") || 
+            error_msg.contains("corrupted") ||
+            error_msg.contains("extract") ||
+            error_msg.contains("PDF") ||
+            error_msg.contains("Failed to extract")
+        );
+    }
+
+    #[tokio::test]
+    async fn test_invalid_font_encoding_handling() {
+        let ocr_service = OcrService::new();
+        
+        // Test with invalid font encoding
+        let invalid_font = "tests/test_pdfs/invalid_font_encoding.pdf";
+        if Path::new(invalid_font).exists() {
+            let result = ocr_service.extract_text_from_pdf(invalid_font).await;
+            // Should not panic, should return an error instead
+            assert!(result.is_err());
+            let error_msg = result.unwrap_err().to_string();
+            // Should contain descriptive error message
+            assert!(
+                error_msg.contains("panic") || 
+                error_msg.contains("font") ||
+                error_msg.contains("encoding") ||
+                error_msg.contains("extract")
+            );
+        }
+    }
+
+    #[tokio::test]
+    async fn test_fake_pdf_handling() {
+        let ocr_service = OcrService::new();
+        
+        // Create a fake PDF file (not actually a PDF) that will definitely cause an error
+        let fake_pdf_content = b"This is not a PDF file at all, just plain text with a PDF extension.
+It should cause pdf-extract to fail when trying to parse it.
+This tests the error handling for files that aren't actually PDFs.";
+
+        let temp_file = NamedTempFile::with_suffix(".pdf").unwrap();
+        std::fs::write(temp_file.path(), fake_pdf_content).unwrap();
+        
+        let result = ocr_service.extract_text_from_pdf(temp_file.path().to_str().unwrap()).await;
+        // Should not panic, should return an error instead
+        assert!(result.is_err(), "Expected error for fake PDF");
+        let error_msg = result.unwrap_err().to_string();
+        println!("Fake PDF error: {}", error_msg);
+        // Should contain descriptive error message about parsing failure
+        assert!(
+            error_msg.contains("extract") ||
+            error_msg.contains("parse") ||
+            error_msg.contains("PDF") ||
+            error_msg.contains("format") ||
+            error_msg.contains("Failed to extract")
+        );
+    }
+
+    #[tokio::test]
+    async fn test_problematic_encoding_pdf_handling() {
+        let ocr_service = OcrService::new();
+        
+        // Test with the existing problematic encoding PDF
+        let problematic_encoding = "tests/test_pdfs/problematic_encoding.pdf";
+        if Path::new(problematic_encoding).exists() {
+            let result = ocr_service.extract_text_from_pdf(problematic_encoding).await;
+            // Should not panic, should return an error instead
+            assert!(result.is_err());
+            let error_msg = result.unwrap_err().to_string();
+            // Should contain descriptive error message
+            assert!(
+                error_msg.contains("panic") || 
+                error_msg.contains("encoding") ||
+                error_msg.contains("extract") ||
+                error_msg.contains("font")
+            );
+        }
+    }
+
+    /// Test that the enhanced OCR service also handles panics correctly.
+    ///
+    /// Runs `extract_text_with_context` over each malformed fixture that exists
+    /// on disk and expects an `Err` with a descriptive message. Missing fixtures
+    /// are reported on stderr and skipped.
+    #[tokio::test]
+    async fn test_enhanced_ocr_panic_handling() {
+        use crate::models::Settings;
+        use crate::ocr::enhanced::EnhancedOcrService;
+
+        let ocr_service = EnhancedOcrService::new("tests".to_string());
+        let settings = Settings::default();
+
+        // Test all malformed PDFs with enhanced OCR
+        let test_files = vec![
+            "tests/test_pdfs/malformed_content_stream.pdf",
+            "tests/test_pdfs/corrupted_structure.pdf",
+            "tests/test_pdfs/invalid_font_encoding.pdf",
+            "tests/test_pdfs/fake_pdf.pdf",
+            "tests/test_pdfs/problematic_encoding.pdf",
+        ];
+
+        for test_file in test_files {
+            let path = Path::new(test_file);
+            if !path.exists() {
+                eprintln!(
+                    "Skipping '{}' in test_enhanced_ocr_panic_handling: fixture not found",
+                    test_file
+                );
+                continue;
+            }
+
+            // Every entry above is a literal UTF-8 path ending in a file name,
+            // so both unwraps hold.
+            let filename = path.file_name().unwrap().to_str().unwrap();
+            let result = ocr_service.extract_text_with_context(
+                test_file,
+                "application/pdf",
+                filename,
+                1024, // file_size
+                &settings
+            ).await;
+
+            // Should not panic, should return an error instead
+            assert!(result.is_err(), "Expected error for file: {}", test_file);
+            let error_msg = result.unwrap_err().to_string();
+
+            // Should contain descriptive error message
+            let is_descriptive = ["panic", "extract", "PDF", "corrupted", "encoding", "font"]
+                .iter()
+                .any(|needle| error_msg.contains(*needle));
+            assert!(
+                is_descriptive,
+                "Error message should be descriptive for {}: {}", test_file, error_msg
+            );
+        }
+    }
+
+    /// Test that panic handling works correctly in concurrent scenarios.
+    ///
+    /// Spawns one tokio task per malformed fixture that exists on disk; each
+    /// task expects `extract_text_from_pdf` to return a descriptive `Err`.
+    /// Missing fixtures are reported on stderr and skipped. A task that panics
+    /// (including on a failed inner assertion) fails the test, and the join
+    /// error is included in the failure message.
+    #[tokio::test]
+    async fn test_concurrent_pdf_panic_handling() {
+        use std::sync::Arc;
+        use futures::future::join_all;
+
+        let ocr_service = Arc::new(OcrService::new());
+        let mut handles = Vec::new();
+
+        // Test concurrent processing of malformed PDFs
+        let test_files = vec![
+            "tests/test_pdfs/malformed_content_stream.pdf",
+            "tests/test_pdfs/corrupted_structure.pdf",
+            "tests/test_pdfs/invalid_font_encoding.pdf",
+            "tests/test_pdfs/fake_pdf.pdf",
+        ];
+
+        for test_file in test_files {
+            if !Path::new(test_file).exists() {
+                eprintln!(
+                    "Skipping '{}' in test_concurrent_pdf_panic_handling: fixture not found",
+                    test_file
+                );
+                continue;
+            }
+
+            let ocr_service_clone = Arc::clone(&ocr_service);
+            let test_file_owned = test_file.to_string();
+
+            handles.push(tokio::spawn(async move {
+                let result = ocr_service_clone.extract_text_from_pdf(&test_file_owned).await;
+                // Should not panic, should return an error instead
+                assert!(result.is_err(), "Expected error for file: {}", test_file_owned);
+                let error_msg = result.unwrap_err().to_string();
+
+                // Should contain descriptive error message
+                let is_descriptive = ["panic", "extract", "PDF", "corrupted", "encoding"]
+                    .iter()
+                    .any(|needle| error_msg.contains(*needle));
+                assert!(
+                    is_descriptive,
+                    "Error message should be descriptive for {}: {}", test_file_owned, error_msg
+                );
+            }));
+        }
+
+        // Wait for all concurrent tasks to complete
+        let results = join_all(handles).await;
+
+        // Verify all tasks completed without panicking; include the join error
+        // so the failure output shows why a task did not complete.
+        for result in results {
+            if let Err(join_err) = result {
+                panic!("Task should complete without panicking: {}", join_err);
+            }
+        }
+    }
 }
--- a/tests/test_pdfs/corrupted_structure.pdf
+++ b/tests/test_pdfs/corrupted_structure.pdf
@ -0,0 +1,36 @@
+%PDF-1.4
+1 0 obj
+<< /Type /Catalog /Pages 2 0 R >>
+endobj
+2 0 obj
+<< /Type /Pages /Kids [3 0 R] /Count 1 >>
+endobj
+3 0 obj
+<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >>
+endobj
+4 0 obj
+<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>
+endobj
+5 0 obj
+<< /Length 44 >>
+stream
+BT
+/F1 12 Tf
+100 700 Td
+(Corrupted PDF) Tj
+ET
+endstream
+endobj
+xref
+0 6
+0000000000 65535 f
+0000000009 00000 n
+0000000058 00000 n
+0000000115 00000 n
+0000000262 00000 n
+0000000341 00000 n
+trailer
+<< /Size 6 /Root 1 0 R /InvalidKey >>
+startxref
+999999
+%%EOF
--- a/tests/test_pdfs/fake_pdf.pdf
+++ b/tests/test_pdfs/fake_pdf.pdf
@ -0,0 +1,3 @@
+This is not a PDF file at all, just plain text with a PDF extension.
+It should cause pdf-extract to fail when trying to parse it.
+This tests the error handling for files that aren't actually PDFs.
--- a/tests/test_pdfs/invalid_font_encoding.pdf
+++ b/tests/test_pdfs/invalid_font_encoding.pdf
@ -0,0 +1,36 @@
+%PDF-1.4
+1 0 obj
+<< /Type /Catalog /Pages 2 0 R >>
+endobj
+2 0 obj
+<< /Type /Pages /Kids [3 0 R] /Count 1 >>
+endobj
+3 0 obj
+<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >>
+endobj
+4 0 obj
+<< /Type /Font /Subtype /Type1 /BaseFont /InvalidFont /Encoding /InvalidEncoding >>
+endobj
+5 0 obj
+<< /Length 44 >>
+stream
+BT
+/F1 12 Tf
+100 700 Td
+(Invalid font encoding) Tj
+ET
+endstream
+endobj
+xref
+0 6
+0000000000 65535 f
+0000000009 00000 n
+0000000058 00000 n
+0000000115 00000 n
+0000000262 00000 n
+0000000341 00000 n
+trailer
+<< /Size 6 /Root 1 0 R >>
+startxref
+435
+%%EOF
--- a/tests/test_pdfs/malformed_content_stream.pdf
+++ b/tests/test_pdfs/malformed_content_stream.pdf
@ -0,0 +1,37 @@
+%PDF-1.4
+1 0 obj
+<< /Type /Catalog /Pages 2 0 R >>
+endobj
+2 0 obj
+<< /Type /Pages /Kids [3 0 R] /Count 1 >>
+endobj
+3 0 obj
+<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >>
+endobj
+4 0 obj
+<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>
+endobj
+5 0 obj
+<< /Length 999 >>
+stream
+BT
+/F1 12 Tf
+100 700 Td
+(This is a malformed PDF with invalid content stream) Tj
+ET
+INVALID_CONTENT_STREAM_DATA_THAT_CAUSES_PANIC
+endstream
+endobj
+xref
+0 6
+0000000000 65535 f
+0000000009 00000 n
+0000000058 00000 n
+0000000115 00000 n
+0000000262 00000 n
+0000000341 00000 n
+trailer
+<< /Size 6 /Root 1 0 R >>
+startxref
+999
+%%EOF