#[cfg(test)]
mod tests {
    use crate::ocr::OcrService;
    use std::fs;
    use std::path::Path;
    use tempfile::NamedTempFile;

    /// In-memory stand-in for the document database, so tests can inspect
    /// which OCR text was stored without a real database connection.
    mod mock_db {
        use anyhow::Result;
        use std::collections::HashMap;
        use std::sync::{Arc, Mutex};
        use uuid::Uuid;

        /// Mock database that remembers the most recent OCR text written for
        /// each document id.
        ///
        /// The map lives behind an `Arc`, so clones of a `MockDatabase` share
        /// the same storage.
        #[derive(Clone)]
        pub struct MockDatabase {
            ocr_updates: Arc<Mutex<HashMap<Uuid, String>>>,
        }

        impl MockDatabase {
            /// Creates a mock database with no recorded OCR updates.
            pub fn new() -> Self {
                Self {
                    ocr_updates: Arc::new(Mutex::new(HashMap::new())),
                }
            }

            /// Records `ocr_text` for document `id`, replacing any earlier
            /// text stored under the same id. Always returns `Ok(())`.
            ///
            /// # Panics
            ///
            /// Panics if the internal mutex is poisoned.
            pub async fn update_document_ocr(&self, id: Uuid, ocr_text: &str) -> Result<()> {
                // The guard is a temporary, so the lock is released before returning.
                self.ocr_updates
                    .lock()
                    .unwrap()
                    .insert(id, ocr_text.to_string());
                Ok(())
            }

            /// Returns a copy of the OCR text stored for `id`, or `None` when
            /// nothing was recorded for it.
            ///
            /// # Panics
            ///
            /// Panics if the internal mutex is poisoned.
            pub fn get_ocr_text(&self, id: &Uuid) -> Option<String> {
                self.ocr_updates.lock().unwrap().get(id).cloned()
            }

            /// Returns a snapshot (clone) of every recorded update.
            ///
            /// # Panics
            ///
            /// Panics if the internal mutex is poisoned.
            pub fn get_all_ocr_updates(&self) -> HashMap<Uuid, String> {
                self.ocr_updates.lock().unwrap().clone()
            }
        }
    }

    use mock_db::MockDatabase;

    /// Returns `true` when `haystack` contains at least one of `needles`.
    ///
    /// An empty `needles` slice yields `false`.
    fn contains_any(haystack: &str, needles: &[&str]) -> bool {
        needles.iter().any(|needle| haystack.contains(needle))
    }

    /// Returns the temp file's path as a string slice, the form in which the
    /// tests hand paths to the OCR service.
    ///
    /// # Panics
    ///
    /// Panics if the path is not valid UTF-8.
    fn path_str(file: &NamedTempFile) -> &str {
        file.path()
            .to_str()
            .expect("temp file path should be valid UTF-8")
    }

    /// `is_image_file` accepts the listed image names (including an upper-case
    /// extension) and rejects non-image names and names with no extension.
    #[test]
    fn test_is_image_file() {
        let ocr_service = OcrService::new();

        for name in [
            "image.png",
            "photo.jpg",
            "picture.JPEG",
            "scan.tiff",
            "bitmap.bmp",
            "animation.gif",
        ] {
            assert!(
                ocr_service.is_image_file(name),
                "{} should be treated as an image",
                name
            );
        }

        for name in ["document.pdf", "text.txt", "archive.zip", "noextension"] {
            assert!(
                !ocr_service.is_image_file(name),
                "{} should not be treated as an image",
                name
            );
        }
    }

    /// A `text/plain` file is returned verbatim.
    #[tokio::test]
    async fn test_extract_text_from_plain_text() {
        let ocr_service = OcrService::new();
        let temp_file = NamedTempFile::new().unwrap();
        let test_content = "This is a test text file.\nWith multiple lines.";
        fs::write(temp_file.path(), test_content).unwrap();

        let extracted_text = ocr_service
            .extract_text(path_str(&temp_file), "text/plain")
            .await
            .expect("plain text extraction should succeed");

        assert_eq!(extracted_text, test_content);
    }

    /// An unsupported MIME type is rejected with a message that says so.
    #[tokio::test]
    async fn test_extract_text_unsupported_type() {
        let ocr_service = OcrService::new();
        let temp_file = NamedTempFile::new().unwrap();
        fs::write(temp_file.path(), "some content").unwrap();

        let result = ocr_service
            .extract_text(path_str(&temp_file), "application/zip")
            .await;

        assert!(result.is_err());
        assert!(result
            .unwrap_err()
            .to_string()
            .contains("Unsupported MIME type"));
    }

    /// Extracting from a path that does not exist is an error.
    #[tokio::test]
    async fn test_extract_text_from_nonexistent_file() {
        let ocr_service = OcrService::new();

        let result = ocr_service
            .extract_text("/path/to/nonexistent/file.txt", "text/plain")
            .await;

        assert!(result.is_err());
    }

    /// Runs OCR on a real PNG when the fixture is present.
    ///
    /// Ignored by default; the test also returns early when the fixture file
    /// is missing and only logs (does not fail) when OCR itself errors.
    #[tokio::test]
    #[ignore = "Requires tesseract runtime - run with: cargo test --release -- --ignored"]
    async fn test_extract_text_with_real_image() {
        let ocr_service = OcrService::new();
        let test_image_path = "test_data/hello_ocr.png";

        // Skip test if test data doesn't exist
        if !Path::new(test_image_path).exists() {
            eprintln!("Skipping test_extract_text_with_real_image: test data not found");
            return;
        }

        match ocr_service.extract_text(test_image_path, "image/png").await {
            Ok(text) => {
                println!("OCR extracted text: '{}'", text);
                // OCR might not be perfect, so we check if it contains expected words
                let lowered = text.to_lowercase();
                assert!(lowered.contains("hello") || lowered.contains("ocr"));
            }
            Err(e) => {
                eprintln!("OCR test failed: {}", e);
                // Don't fail the test if OCR is not available
            }
        }
    }

    /// Feeds a small hand-written PDF to `extract_text_from_pdf`.
    ///
    /// Both outcomes are only logged: the test checks that the call returns
    /// rather than asserting on the extracted text.
    #[tokio::test]
    async fn test_extract_text_from_pdf_with_content() {
        let ocr_service = OcrService::new();
        let temp_file = NamedTempFile::with_suffix(".pdf").unwrap();

        // This is a minimal PDF that says "Hello"
        let pdf_content = b"%PDF-1.4 \
            1 0 obj << /Type /Catalog /Pages 2 0 R >> endobj \
            2 0 obj << /Type /Pages /Kids [3 0 R] /Count 1 >> endobj \
            3 0 obj << /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> \
            /MediaBox [0 0 612 792] /Contents 5 0 R >> endobj \
            4 0 obj << /Type /Font /Subtype /Type1 /BaseFont /Helvetica >> endobj \
            5 0 obj << /Length 44 >> stream BT /F1 12 Tf 100 700 Td (Hello) Tj ET endstream endobj \
            xref 0 6 \
            0000000000 65535 f \
            0000000009 00000 n \
            0000000058 00000 n \
            0000000115 00000 n \
            0000000262 00000 n \
            0000000341 00000 n \
            trailer << /Size 6 /Root 1 0 R >> \
            startxref 435 \
            %%EOF";
        fs::write(temp_file.path(), pdf_content).unwrap();

        let result = ocr_service
            .extract_text_from_pdf(path_str(&temp_file))
            .await;

        // The pdf-extract library might not work with our minimal PDF
        // so we just check that it attempts to process it
        match result {
            Ok(text) => println!("PDF extracted text: '{}'", text),
            Err(e) => println!("PDF extraction error (expected): {}", e),
        }
    }

    /// A `.png` file with an unknown MIME type and non-image bytes must fail.
    #[tokio::test]
    async fn test_extract_text_with_image_extension_fallback() {
        let ocr_service = OcrService::new();
        let temp_file = NamedTempFile::with_suffix(".png").unwrap();
        fs::write(temp_file.path(), "fake image data").unwrap();

        let result = ocr_service
            .extract_text(path_str(&temp_file), "unknown/type")
            .await;

        // This should try to process as image due to extension, but fail due to invalid data
        assert!(result.is_err());
    }

    /// Extracted text can be stored in, and read back from, the mock database.
    #[tokio::test]
    async fn test_ocr_with_mock_database_integration() {
        let ocr_service = OcrService::new();
        let mock_db = MockDatabase::new();
        let doc_id = uuid::Uuid::new_v4();

        // Create a simple text file to simulate OCR processing
        let temp_file = NamedTempFile::with_suffix(".txt").unwrap();
        let test_content = "This is test OCR content for mock database integration.";
        fs::write(temp_file.path(), test_content).unwrap();

        let extracted_text = ocr_service
            .extract_text(path_str(&temp_file), "text/plain")
            .await
            .expect("plain text extraction should succeed");

        mock_db
            .update_document_ocr(doc_id, &extracted_text)
            .await
            .expect("mock database update should succeed");

        // Verify the text was stored in mock database
        assert_eq!(mock_db.get_ocr_text(&doc_id).as_deref(), Some(test_content));
    }

    /// A failed extraction leaves the mock database without an entry for the
    /// document (the test never writes one after the error).
    #[tokio::test]
    async fn test_ocr_error_handling_with_mock_db() {
        let ocr_service = OcrService::new();
        let mock_db = MockDatabase::new();
        let doc_id = uuid::Uuid::new_v4();

        let result = ocr_service
            .extract_text("/nonexistent/path/file.txt", "text/plain")
            .await;
        assert!(result.is_err());

        // Verify no update was made to mock database for failed OCR
        assert!(mock_db.get_ocr_text(&doc_id).is_none());
    }

    /// Three text documents are extracted one after another and each ends up
    /// under its own id in the mock database.
    #[tokio::test]
    async fn test_batch_ocr_processing_with_mock_db() {
        let ocr_service = OcrService::new();
        let mock_db = MockDatabase::new();

        // Create multiple test files; the temp files are kept alive in the
        // vector so they are not deleted before extraction.
        let mut documents = Vec::new();
        for i in 0..3 {
            let temp_file = NamedTempFile::with_suffix(".txt").unwrap();
            let content = format!("Test document {} content for batch processing.", i + 1);
            fs::write(temp_file.path(), &content).unwrap();
            documents.push((uuid::Uuid::new_v4(), temp_file));
        }

        // Process all files
        for (doc_id, temp_file) in &documents {
            let extracted_text = ocr_service
                .extract_text(path_str(temp_file), "text/plain")
                .await
                .expect("plain text extraction should succeed");

            mock_db
                .update_document_ocr(*doc_id, &extracted_text)
                .await
                .expect("mock database update should succeed");
        }

        // Verify all documents were processed
        let all_updates = mock_db.get_all_ocr_updates();
        assert_eq!(all_updates.len(), 3);

        for (i, (doc_id, _)) in documents.iter().enumerate() {
            let stored_text = all_updates
                .get(doc_id)
                .expect("every processed document should have stored text");
            assert!(stored_text.contains(&format!("Test document {}", i + 1)));
        }
    }

    /// For a plain-text file the language code passed to
    /// `extract_text_with_lang` does not change the extracted text.
    #[tokio::test]
    async fn test_ocr_language_support() {
        let ocr_service = OcrService::new();
        let temp_file = NamedTempFile::with_suffix(".txt").unwrap();
        let test_content = "Hello world test content";
        fs::write(temp_file.path(), test_content).unwrap();

        for lang in ["eng", "spa", "fra", "deu"] {
            let extracted = ocr_service
                .extract_text_with_lang(path_str(&temp_file), "text/plain", lang)
                .await
                .expect("text extraction should succeed regardless of language");

            assert_eq!(extracted, test_content);
        }
    }

    /// Calls `extract_text` once per MIME type on a temp file carrying the
    /// matching extension.
    ///
    /// Only the `text/plain` case must succeed; the other files are empty, so
    /// their failures are logged rather than asserted on.
    #[tokio::test]
    async fn test_ocr_mime_type_detection() {
        let ocr_service = OcrService::new();

        let test_cases = [
            ("test.txt", "text/plain"),
            ("document.pdf", "application/pdf"),
            ("image.png", "image/png"),
            ("photo.jpg", "image/jpeg"),
            ("scan.tiff", "image/tiff"),
        ];

        for (filename, mime_type) in test_cases {
            let extension = Path::new(filename)
                .extension()
                .and_then(|ext| ext.to_str())
                .expect("every test filename has a UTF-8 extension");
            // `with_suffix` appends the suffix verbatim, so the dot has to be
            // supplied here for the temp file to actually get the extension.
            let temp_file = NamedTempFile::with_suffix(format!(".{}", extension)).unwrap();

            let is_plain_text = mime_type == "text/plain";
            if is_plain_text {
                fs::write(temp_file.path(), "test content").unwrap();
            }

            let result = ocr_service
                .extract_text(path_str(&temp_file), mime_type)
                .await;

            if is_plain_text {
                assert!(result.is_ok(), "Failed for mime type: {}", mime_type);
            } else if let Err(e) = result {
                // These will likely fail with our test setup, but should not panic
                println!("Expected failure for {}: {}", mime_type, e);
            }
        }
    }

    /// Sanity-checks the mock itself: empty at first, then one and two entries.
    #[tokio::test]
    async fn test_mock_database_functionality() {
        let mock_db = MockDatabase::new();
        let doc_id1 = uuid::Uuid::new_v4();
        let doc_id2 = uuid::Uuid::new_v4();

        // Test empty state
        assert!(mock_db.get_ocr_text(&doc_id1).is_none());

        // Test single update
        let result = mock_db.update_document_ocr(doc_id1, "Test OCR text").await;
        assert!(result.is_ok());
        assert_eq!(mock_db.get_ocr_text(&doc_id1).unwrap(), "Test OCR text");

        // Test multiple updates
        let result = mock_db
            .update_document_ocr(doc_id2, "Another OCR text")
            .await;
        assert!(result.is_ok());

        let all_updates = mock_db.get_all_ocr_updates();
        assert_eq!(all_updates.len(), 2);
        assert!(all_updates.contains_key(&doc_id1));
        assert!(all_updates.contains_key(&doc_id2));
    }

    /// Test that malformed PDFs don't crash the OCR system
    #[tokio::test]
    async fn test_malformed_pdf_panic_handling() {
        let ocr_service = OcrService::new();

        // A PDF whose content stream carries trailing garbage and whose
        // declared stream length and startxref offset are both 999.
        let malformed_pdf_content = b"%PDF-1.4 \
            1 0 obj << /Type /Catalog /Pages 2 0 R >> endobj \
            2 0 obj << /Type /Pages /Kids [3 0 R] /Count 1 >> endobj \
            3 0 obj << /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> \
            /MediaBox [0 0 612 792] /Contents 5 0 R >> endobj \
            4 0 obj << /Type /Font /Subtype /Type1 /BaseFont /Helvetica >> endobj \
            5 0 obj << /Length 999 >> stream BT /F1 12 Tf 100 700 Td \
            (This is a malformed PDF with invalid content stream) Tj ET \
            INVALID_CONTENT_STREAM_DATA_THAT_CAUSES_PANIC endstream endobj \
            xref 0 6 \
            0000000000 65535 f \
            0000000009 00000 n \
            0000000058 00000 n \
            0000000115 00000 n \
            0000000262 00000 n \
            0000000341 00000 n \
            trailer << /Size 6 /Root 1 0 R >> \
            startxref 999 \
            %%EOF";

        let temp_file = NamedTempFile::with_suffix(".pdf").unwrap();
        fs::write(temp_file.path(), malformed_pdf_content).unwrap();

        let result = ocr_service
            .extract_text_from_pdf(path_str(&temp_file))
            .await;

        // With the enhanced OCR service, this should either succeed or fail gracefully
        match result {
            Ok(text) => {
                println!("Successfully extracted text from malformed PDF: '{}'", text);
                // A successful extraction is expected to yield some text.
                assert!(!text.is_empty());
            }
            Err(e) => {
                println!("Error extracting from malformed PDF: {}", e);
                // Should contain descriptive error message if it fails
                let error_msg = e.to_string();
                assert!(contains_any(
                    &error_msg,
                    &[
                        "ocrmypdf",
                        "extraction",
                        "InputFileError",
                        "Failed to extract",
                        "All PDF extraction strategies failed",
                    ],
                ));
            }
        }
    }

    /// A PDF with a bogus trailer key and an out-of-range `startxref` must
    /// either yield some text or fail with a descriptive error.
    #[tokio::test]
    async fn test_corrupted_pdf_structure_handling() {
        let ocr_service = OcrService::new();

        let corrupted_pdf_content = b"%PDF-1.4 \
            1 0 obj << /Type /Catalog /Pages 2 0 R >> endobj \
            2 0 obj << /Type /Pages /Kids [3 0 R] /Count 1 >> endobj \
            3 0 obj << /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> \
            /MediaBox [0 0 612 792] /Contents 5 0 R >> endobj \
            4 0 obj << /Type /Font /Subtype /Type1 /BaseFont /Helvetica >> endobj \
            5 0 obj << /Length 44 >> stream BT /F1 12 Tf 100 700 Td (Corrupted PDF) Tj ET \
            endstream endobj \
            xref 0 6 \
            0000000000 65535 f \
            0000000009 00000 n \
            0000000058 00000 n \
            0000000115 00000 n \
            0000000262 00000 n \
            0000000341 00000 n \
            trailer << /Size 6 /Root 1 0 R /InvalidKey >> \
            startxref 999999 \
            %%EOF";

        let temp_file = NamedTempFile::with_suffix(".pdf").unwrap();
        fs::write(temp_file.path(), corrupted_pdf_content).unwrap();

        let result = ocr_service
            .extract_text_from_pdf(path_str(&temp_file))
            .await;

        // The enhanced OCR service might extract text even from corrupted PDFs
        match result {
            Ok(text) => {
                println!("Successfully extracted text from corrupted PDF: '{}'", text);
                // Any non-empty text is accepted, "Corrupted PDF" included.
                assert!(!text.is_empty());
            }
            Err(e) => {
                let error_msg = e.to_string();
                println!("Corrupted PDF error: {}", error_msg);
                // Should contain descriptive error message
                assert!(contains_any(
                    &error_msg,
                    &[
                        "panic",
                        "corrupted",
                        "extract",
                        "PDF",
                        "Failed to extract",
                        "All PDF extraction strategies failed",
                    ],
                ));
            }
        }
    }

    /// Uses the on-disk invalid-font fixture when present; the test is a
    /// no-op when the fixture file is missing.
    #[tokio::test]
    async fn test_invalid_font_encoding_handling() {
        let ocr_service = OcrService::new();
        let invalid_font = "tests/test_pdfs/invalid_font_encoding.pdf";

        if !Path::new(invalid_font).exists() {
            return;
        }

        // With the enhanced OCR service, this might succeed or fail gracefully
        match ocr_service.extract_text_from_pdf(invalid_font).await {
            Ok(text) => {
                println!(
                    "Successfully extracted text from invalid font PDF: '{}'",
                    text
                );
                // Even with invalid fonts, OCR service might extract something
            }
            Err(e) => {
                let error_msg = e.to_string();
                println!("Failed to extract from invalid font PDF: {}", error_msg);
                // Should contain descriptive error message
                assert!(contains_any(
                    &error_msg,
                    &[
                        "panic",
                        "font",
                        "encoding",
                        "extract",
                        "All PDF extraction strategies failed",
                    ],
                ));
            }
        }
    }

    /// A plain-text file with a `.pdf` extension must either come back as its
    /// own text or fail with a descriptive error.
    #[tokio::test]
    async fn test_fake_pdf_handling() {
        let ocr_service = OcrService::new();

        // Create a fake PDF file (not actually a PDF) that will definitely cause an error
        let fake_pdf_content = b"This is not a PDF file at all, just plain text with a PDF extension. \
            It should cause pdf-extract to fail when trying to parse it. \
            This tests the error handling for files that aren't actually PDFs.";

        let temp_file = NamedTempFile::with_suffix(".pdf").unwrap();
        fs::write(temp_file.path(), fake_pdf_content).unwrap();

        let result = ocr_service
            .extract_text_from_pdf(path_str(&temp_file))
            .await;

        // The enhanced OCR might extract the text content even from a fake PDF
        match result {
            Ok(text) => {
                println!("Extracted text from fake PDF: '{}'", text);
                // Should contain the actual text content
                assert!(contains_any(&text, &["This is not a PDF", "plain text"]));
            }
            Err(e) => {
                let error_msg = e.to_string();
                println!("Fake PDF error: {}", error_msg);
                // Should contain descriptive error message about parsing failure
                assert!(contains_any(
                    &error_msg,
                    &[
                        "extract",
                        "parse",
                        "PDF",
                        "format",
                        "Failed to extract",
                        "All PDF extraction strategies failed",
                    ],
                ));
            }
        }
    }

    /// Uses the on-disk problematic-encoding fixture when present; the test
    /// is a no-op when the fixture file is missing.
    #[tokio::test]
    async fn test_problematic_encoding_pdf_handling() {
        let ocr_service = OcrService::new();
        let problematic_encoding = "tests/test_pdfs/problematic_encoding.pdf";

        if !Path::new(problematic_encoding).exists() {
            return;
        }

        // With ocrmypdf, this may succeed gracefully or return descriptive error
        match ocr_service
            .extract_text_from_pdf(problematic_encoding)
            .await
        {
            Ok(text) => {
                println!(
                    "Successfully extracted text from problematic encoding PDF: '{}'",
                    text
                );
                // OCRmyPDF's robustness allows it to handle some problematic encoding PDFs
            }
            Err(e) => {
                println!("Error extracting from problematic encoding PDF: {}", e);
                let error_msg = e.to_string();
                assert!(contains_any(
                    &error_msg,
                    &["ocrmypdf", "extraction", "strategies", "Failed to extract"],
                ));
            }
        }
    }

    /// Test that the enhanced OCR service also handles panics correctly
    ///
    /// Fixture files that are missing on disk are skipped.
    #[tokio::test]
    async fn test_enhanced_ocr_panic_handling() {
        use crate::models::Settings;
        use crate::ocr::enhanced::EnhancedOcrService;

        let ocr_service = EnhancedOcrService::new("tests".to_string());
        let settings = Settings::default();

        // Test all malformed PDFs with enhanced OCR
        let test_files = [
            "tests/test_pdfs/malformed_content_stream.pdf",
            "tests/test_pdfs/corrupted_structure.pdf",
            "tests/test_pdfs/invalid_font_encoding.pdf",
            "tests/test_pdfs/fake_pdf.pdf",
            "tests/test_pdfs/problematic_encoding.pdf",
        ];

        for test_file in test_files {
            if !Path::new(test_file).exists() {
                continue;
            }

            let file_name = Path::new(test_file)
                .file_name()
                .unwrap()
                .to_str()
                .unwrap();

            let result = ocr_service
                .extract_text_with_context(
                    test_file,
                    "application/pdf",
                    &file_name,
                    1024, // file_size
                    &settings,
                )
                .await;

            // The enhanced OCR service might succeed or fail gracefully
            match result {
                Ok(ocr_result) => {
                    println!(
                        "Enhanced OCR successfully extracted from {}: '{}'",
                        test_file, ocr_result.text
                    );
                    // Even problematic PDFs might yield some text with the robust extraction
                }
                Err(e) => {
                    let error_msg = e.to_string();
                    println!("Enhanced OCR failed for {}: {}", test_file, error_msg);
                    // Should contain descriptive error message
                    assert!(
                        contains_any(
                            &error_msg,
                            &[
                                "panic",
                                "extract",
                                "PDF",
                                "corrupted",
                                "encoding",
                                "font",
                                "All PDF extraction strategies failed",
                            ],
                        ),
                        "Error message should be descriptive for {}: {}",
                        test_file,
                        error_msg
                    );
                }
            }
        }
    }

    /// Test that panic handling works correctly in concurrent scenarios
    ///
    /// One task is spawned per fixture that exists on disk. A panic or failed
    /// assertion inside a task surfaces as a join error, which fails the test.
    #[tokio::test]
    async fn test_concurrent_pdf_panic_handling() {
        use futures::future::join_all;
        use std::sync::Arc;

        let ocr_service = Arc::new(OcrService::new());
        let mut handles = Vec::new();

        // Test concurrent processing of malformed PDFs
        let test_files = [
            "tests/test_pdfs/malformed_content_stream.pdf",
            "tests/test_pdfs/corrupted_structure.pdf",
            "tests/test_pdfs/invalid_font_encoding.pdf",
            "tests/test_pdfs/fake_pdf.pdf",
        ];

        for test_file in test_files {
            if !Path::new(test_file).exists() {
                continue;
            }

            let ocr_service_clone = Arc::clone(&ocr_service);
            let test_file_owned = test_file.to_string();

            handles.push(tokio::spawn(async move {
                let result = ocr_service_clone
                    .extract_text_from_pdf(&test_file_owned)
                    .await;

                // The enhanced OCR might succeed or fail gracefully
                match result {
                    Ok(text) => {
                        println!(
                            "Concurrent test: Successfully extracted from {}: '{}'",
                            test_file_owned, text
                        );
                        // Even problematic PDFs might yield some text
                    }
                    Err(e) => {
                        let error_msg = e.to_string();
                        println!(
                            "Concurrent test: Failed for {}: {}",
                            test_file_owned, error_msg
                        );
                        // Should contain descriptive error message
                        assert!(
                            contains_any(
                                &error_msg,
                                &[
                                    "panic",
                                    "extract",
                                    "PDF",
                                    "corrupted",
                                    "encoding",
                                    "All PDF extraction strategies failed",
                                    "No such file or directory",
                                ],
                            ),
                            "Error message should be descriptive for {}: {}",
                            test_file_owned,
                            error_msg
                        );
                    }
                }
            }));
        }

        // Wait for all concurrent tasks to complete
        let results = join_all(handles).await;

        // Verify all tasks completed without panicking
        for result in results {
            assert!(result.is_ok(), "Task should complete without panicking");
        }
    }
}