use anyhow::Result; use std::fs; use std::io::Write; use std::time::Duration; use tempfile::TempDir; use tokio::time::timeout; use readur::ocr::{ OcrService, OcrConfig, }; /// Test utilities for creating mock Office documents struct OfficeTestDocuments { temp_dir: TempDir, } impl OfficeTestDocuments { fn new() -> Result { Ok(Self { temp_dir: TempDir::new()?, }) } /// Create a mock DOCX file (simplified ZIP structure with XML content) fn create_mock_docx(&self, filename: &str, content: &str) -> Result { let file_path = self.temp_dir.path().join(filename); // Create a proper ZIP structure for DOCX let file = fs::File::create(&file_path)?; let mut zip = zip::ZipWriter::new(file); // Add [Content_Types].xml zip.start_file("[Content_Types].xml", zip::write::FileOptions::default())?; zip.write_all(br#" "#)?; // Add _rels/.rels zip.start_file("_rels/.rels", zip::write::FileOptions::default())?; zip.write_all(br#" "#)?; // Add word/document.xml with the actual content zip.start_file("word/document.xml", zip::write::FileOptions::default())?; let document_xml = format!(r#" {} "#, content); zip.write_all(document_xml.as_bytes())?; zip.finish()?; Ok(file_path.to_string_lossy().to_string()) } /// Create a mock XLSX file with spreadsheet content fn create_mock_xlsx(&self, filename: &str, content: &[&str]) -> Result { let file_path = self.temp_dir.path().join(filename); let file = fs::File::create(&file_path)?; let mut zip = zip::ZipWriter::new(file); // Add [Content_Types].xml with shared strings support zip.start_file("[Content_Types].xml", zip::write::FileOptions::default())?; zip.write_all(br#" "#)?; // Add _rels/.rels zip.start_file("_rels/.rels", zip::write::FileOptions::default())?; zip.write_all(br#" "#)?; // Add xl/workbook.xml zip.start_file("xl/workbook.xml", zip::write::FileOptions::default())?; zip.write_all(br#" "#)?; // Add xl/_rels/workbook.xml.rels with shared strings relationship zip.start_file("xl/_rels/workbook.xml.rels", zip::write::FileOptions::default())?; zip.write_all(br#" "#)?; // Add xl/sharedStrings.xml with the text content zip.start_file("xl/sharedStrings.xml", zip::write::FileOptions::default())?; let mut shared_strings_xml = String::from(r#" "#); shared_strings_xml = shared_strings_xml.replace("{count}", &content.len().to_string()); for cell_content in content { shared_strings_xml.push_str(&format!(r#" {}"#, cell_content)); } shared_strings_xml.push_str(r#" "#); zip.write_all(shared_strings_xml.as_bytes())?; // Add xl/worksheets/sheet1.xml with references to shared strings zip.start_file("xl/worksheets/sheet1.xml", zip::write::FileOptions::default())?; let mut worksheet_xml = String::from(r#" "#); for (row_idx, _) in content.iter().enumerate() { worksheet_xml.push_str(&format!(r#" {} "#, row_idx + 1, row_idx + 1, row_idx)); } worksheet_xml.push_str(r#" "#); zip.write_all(worksheet_xml.as_bytes())?; zip.finish()?; Ok(file_path.to_string_lossy().to_string()) } /// Create a corrupted file for testing error handling fn create_corrupted_file(&self, filename: &str) -> Result { let file_path = self.temp_dir.path().join(filename); let mut file = fs::File::create(&file_path)?; file.write_all(b"This is not a valid Office document but pretends to be one")?; Ok(file_path.to_string_lossy().to_string()) } /// Create an empty file fn create_empty_file(&self, filename: &str) -> Result { let file_path = self.temp_dir.path().join(filename); fs::File::create(&file_path)?; Ok(file_path.to_string_lossy().to_string()) } } /// Create a test OCR service with XML extraction fn create_test_ocr_service(temp_dir: &str) -> OcrService { let config = OcrConfig { temp_dir: temp_dir.to_string(), }; OcrService::new_with_config(config) } #[tokio::test] async fn test_extract_text_from_docx() -> Result<()> { let test_docs = OfficeTestDocuments::new()?; let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref()); let test_content = "This is a test DOCX document with sample content for extraction testing."; let docx_path = test_docs.create_mock_docx("test.docx", test_content)?; let result = ocr_service.extract_text_from_office_document( &docx_path, "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ).await?; // The method now returns an OcrResult println!("Extracted text: '{}'", result.text); assert!(!result.text.is_empty()); assert!(result.text.contains(test_content)); assert!(result.confidence > 0.0); assert!(result.word_count > 0); Ok(()) } #[tokio::test] async fn test_extract_text_from_xlsx() -> Result<()> { let test_docs = OfficeTestDocuments::new()?; let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref()); let test_content = vec![ "Header 1", "Data Row 1", "Data Row 2", "Summary Data", ]; let xlsx_path = test_docs.create_mock_xlsx("test.xlsx", &test_content)?; let result = ocr_service.extract_text_from_office_document( &xlsx_path, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ).await?; // The method now returns an OcrResult println!("XLSX extracted text: '{}'", result.text); assert!(!result.text.is_empty()); // Check if it contains some of our test content assert!(result.text.contains("Header") || result.text.contains("Data")); assert!(result.confidence > 0.0); assert!(result.word_count > 0); Ok(()) } #[tokio::test] async fn test_extraction_modes() -> Result<()> { let test_docs = OfficeTestDocuments::new()?; let temp_dir = test_docs.temp_dir.path().to_string_lossy().to_string(); let test_content = "Test document for mode comparison"; let docx_path = test_docs.create_mock_docx("test_modes.docx", test_content)?; // Test XML extraction with the simplified approach let ocr_config = OcrConfig { temp_dir: temp_dir.clone(), }; let ocr_service = OcrService::new_with_config(ocr_config); let result = ocr_service.extract_text_from_office_document_with_config( &docx_path, "application/vnd.openxmlformats-officedocument.wordprocessingml.document", ).await; // XML extraction should succeed with our test document assert!(result.is_ok(), "XML extraction failed: {:?}", result); let extracted_result = result?; assert!(!extracted_result.text.is_empty()); assert!(extracted_result.confidence > 0.0); assert!(extracted_result.word_count > 0); Ok(()) } #[tokio::test] async fn test_fallback_mechanism() -> Result<()> { let test_docs = OfficeTestDocuments::new()?; let temp_dir = test_docs.temp_dir.path().to_string_lossy().to_string(); // Create a service with XML extraction let config = OcrConfig { temp_dir, }; let ocr_service = OcrService::new_with_config(config); let docx_path = test_docs.create_mock_docx("fallback_test.docx", "Fallback test content")?; // The XML extraction should succeed let result = ocr_service.extract_text_from_office_document( &docx_path, "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ).await?; // The method now returns an OcrResult assert!(result.text.contains("Fallback test content")); assert!(result.confidence > 0.0); assert!(result.word_count > 0); Ok(()) } #[tokio::test] async fn test_timeout_handling() -> Result<()> { let test_docs = OfficeTestDocuments::new()?; let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref()); let docx_path = test_docs.create_mock_docx("timeout_test.docx", "Test content")?; // Test timeout behavior (the timeout logic is now in the XML extractor itself) let result = timeout( Duration::from_millis(2000), // Give overall test 2 seconds ocr_service.extract_text_from_office_document_with_config( &docx_path, "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ) ).await; // Should complete successfully even with short timeout for our simple test file assert!(result.is_ok()); let extraction_result = result??; assert!(!extraction_result.text.is_empty()); assert!(extraction_result.confidence > 0.0); assert!(extraction_result.word_count > 0); Ok(()) } #[tokio::test] async fn test_error_handling() -> Result<()> { let test_docs = OfficeTestDocuments::new()?; let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref()); // Test with corrupted file let corrupted_path = test_docs.create_corrupted_file("corrupted.docx")?; let result = ocr_service.extract_text_from_office_document( &corrupted_path, "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ).await; assert!(result.is_err()); let error_msg = result.unwrap_err().to_string(); assert!(error_msg.contains("corrupted") || error_msg.contains("invalid") || error_msg.contains("parsing")); // Test with empty file let empty_path = test_docs.create_empty_file("empty.docx")?; let result = ocr_service.extract_text_from_office_document( &empty_path, "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ).await; assert!(result.is_err()); // Test with non-existent file let result = ocr_service.extract_text_from_office_document( "/path/that/does/not/exist.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ).await; assert!(result.is_err()); Ok(()) } #[tokio::test] async fn test_concurrent_extraction() -> Result<()> { let test_docs = OfficeTestDocuments::new()?; let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref()); // Create multiple test documents let mut tasks = Vec::new(); let mut file_paths = Vec::new(); for i in 0..5 { let content = format!("Test document {} with unique content", i); let file_path = test_docs.create_mock_docx(&format!("concurrent_test_{}.docx", i), &content)?; file_paths.push(file_path); } // Launch concurrent extraction tasks for file_path in file_paths { let ocr_service_clone = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref()); let task = tokio::spawn(async move { ocr_service_clone.extract_text_from_office_document( &file_path, "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ).await }); tasks.push(task); } // Wait for all tasks to complete let results = futures::future::join_all(tasks).await; // Verify all extractions succeeded for (i, task_result) in results.into_iter().enumerate() { let ocr_result = task_result??; assert!(!ocr_result.text.is_empty(), "Task {} failed", i); assert!(ocr_result.text.contains(&format!("Test document {}", i))); assert!(ocr_result.confidence > 0.0); assert!(ocr_result.word_count > 0); } Ok(()) } #[tokio::test] async fn test_circuit_breaker() -> Result<()> { let test_docs = OfficeTestDocuments::new()?; // Create service with XML extraction let config = OcrConfig { temp_dir: test_docs.temp_dir.path().to_string_lossy().to_string(), }; let ocr_service = OcrService::new_with_config(config); // Create a valid document for later success testing let valid_path = test_docs.create_mock_docx("circuit_test.docx", "Valid document")?; // Create corrupted files to cause failures let corrupted1 = test_docs.create_corrupted_file("corrupted1.docx")?; let corrupted2 = test_docs.create_corrupted_file("corrupted2.docx")?; // First failure let result1 = ocr_service.extract_text_from_office_document( &corrupted1, "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ).await; assert!(result1.is_err()); // Second failure - should trip circuit breaker let result2 = ocr_service.extract_text_from_office_document( &corrupted2, "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ).await; assert!(result2.is_err()); // Third attempt - should succeed since circuit breaker functionality was removed let result3 = ocr_service.extract_text_from_office_document( &valid_path, "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ).await; // With simplified architecture, valid documents should always work assert!(result3.is_ok()); let valid_result = result3.unwrap(); assert!(valid_result.text.contains("Valid document")); assert!(valid_result.confidence > 0.0); assert!(valid_result.word_count > 0); Ok(()) } #[tokio::test] async fn test_statistics_tracking() -> Result<()> { let test_docs = OfficeTestDocuments::new()?; let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref()); // Perform some extractions to verify functionality let valid_path = test_docs.create_mock_docx("stats_test.docx", "Statistics test document")?; for i in 0..3 { let result = ocr_service.extract_text_from_office_document( &valid_path, "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ).await; assert!(result.is_ok(), "Extraction {} failed: {:?}", i, result); let ocr_result = result.unwrap(); assert!(!ocr_result.text.is_empty()); assert!(ocr_result.confidence > 0.0); assert!(ocr_result.word_count > 0); assert!(ocr_result.processing_time_ms > 0); } // All extractions succeeded, indicating the XML extraction is working correctly Ok(()) } #[tokio::test] async fn test_mime_type_support() -> Result<()> { let test_docs = OfficeTestDocuments::new()?; let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref()); // Test supported MIME types let supported_types = ocr_service.get_supported_mime_types(); assert!(supported_types.contains(&"application/vnd.openxmlformats-officedocument.wordprocessingml.document")); assert!(supported_types.contains(&"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")); assert!(supported_types.contains(&"application/pdf")); assert!(supported_types.contains(&"image/png")); // Test Office document support assert!(ocr_service.supports_office_documents()); Ok(()) } #[tokio::test] async fn test_learning_mechanism() -> Result<()> { let test_docs = OfficeTestDocuments::new()?; // Create service with XML extraction let config = OcrConfig { temp_dir: test_docs.temp_dir.path().to_string_lossy().to_string(), }; let ocr_service = OcrService::new_with_config(config); // Process several documents of the same type to build learning data for i in 0..3 { let content = format!("Learning test document {} content", i); let docx_path = test_docs.create_mock_docx(&format!("learning_{}.docx", i), &content)?; let result = ocr_service.extract_text_from_office_document( &docx_path, "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ).await; assert!(result.is_ok(), "Extraction iteration {} failed: {:?}", i, result); let ocr_result = result?; assert!(!ocr_result.text.is_empty()); assert!(ocr_result.text.contains(&format!("document {}", i))); assert!(ocr_result.confidence > 0.0); assert!(ocr_result.word_count > 0); } // With the simplified XML-only architecture, the system should consistently work // All extractions succeeded, indicating the XML extraction is working correctly Ok(()) } #[tokio::test] async fn test_integration_with_main_extract_text() -> Result<()> { let test_docs = OfficeTestDocuments::new()?; let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref()); // Test that the main extract_text method properly handles Office documents let test_content = "Integration test for main extract_text method"; let docx_path = test_docs.create_mock_docx("integration.docx", test_content)?; // This should use the fallback strategy internally let result = ocr_service.extract_text( &docx_path, "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ).await?; assert!(!result.is_empty()); assert!(result.contains("Integration test")); // Test with XLSX as well let xlsx_content = vec!["Cell 1", "Cell 2", "Cell 3"]; let xlsx_path = test_docs.create_mock_xlsx("integration.xlsx", &xlsx_content)?; let result = ocr_service.extract_text( &xlsx_path, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ).await?; assert!(!result.is_empty()); assert!(result.contains("Cell 1")); Ok(()) } /// Performance benchmark test (not run by default due to #[ignore]) #[tokio::test] #[ignore] async fn benchmark_extraction_performance() -> Result<()> { let test_docs = OfficeTestDocuments::new()?; let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref()); // Create a larger test document let large_content = "This is a large test document. ".repeat(1000); let docx_path = test_docs.create_mock_docx("benchmark.docx", &large_content)?; let start_time = std::time::Instant::now(); let num_iterations = 10; for i in 0..num_iterations { let result = ocr_service.extract_text_from_office_document( &docx_path, "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ).await?; assert!(!result.text.is_empty()); println!("Iteration {}: extracted {} chars, confidence: {:.1}%", i, result.text.len(), result.confidence ); } let total_time = start_time.elapsed(); let avg_time = total_time / num_iterations; println!("Average extraction time: {:?}", avg_time); println!("Total time for {} iterations: {:?}", num_iterations, total_time); // Performance assertions (adjust based on your requirements) assert!(avg_time < Duration::from_secs(5), "Average extraction time too slow: {:?}", avg_time); Ok(()) }