// Readur/tests/integration_office_extracti...
//
// 656 lines
// 25 KiB
// Rust

use anyhow::Result;
use std::fs;
use std::io::Write;
use std::time::Duration;
use tempfile::TempDir;
use tokio::time::timeout;
use readur::ocr::{
OcrService, OcrConfig,
fallback_strategy::{FallbackConfig, CircuitBreakerConfig, LearningConfig, MethodTimeouts},
};
/// Test utilities for creating mock Office documents.
///
/// Every generated file is written into `temp_dir`, and each `create_*`
/// method returns the path of the file it wrote as a `String`.
struct OfficeTestDocuments {
    /// Scratch directory that receives every generated file.
    temp_dir: TempDir,
}

impl OfficeTestDocuments {
    /// Creates a fixture backed by a freshly created temporary directory.
    ///
    /// # Errors
    /// Returns an error if the temporary directory cannot be created.
    fn new() -> Result<Self> {
        Ok(Self {
            temp_dir: TempDir::new()?,
        })
    }

    /// Replaces the XML-reserved characters `&`, `<`, `>`, `"` and `'` with
    /// their predefined entity references.
    ///
    /// Caller-supplied text is interpolated into element content below; without
    /// escaping, a `<` or `&` in the text would yield a malformed document part.
    /// Text that contains none of these characters is returned unchanged.
    fn escape_xml(text: &str) -> String {
        let mut escaped = String::with_capacity(text.len());
        for ch in text.chars() {
            match ch {
                '&' => escaped.push_str("&amp;"),
                '<' => escaped.push_str("&lt;"),
                '>' => escaped.push_str("&gt;"),
                '"' => escaped.push_str("&quot;"),
                '\'' => escaped.push_str("&apos;"),
                other => escaped.push(other),
            }
        }
        escaped
    }

    /// Create a mock DOCX file (simplified ZIP structure with XML content).
    ///
    /// The archive contains `[Content_Types].xml`, `_rels/.rels` and
    /// `word/document.xml`; `content` is XML-escaped and placed in a single
    /// `<w:t>` run.
    ///
    /// # Errors
    /// Returns an error if the file cannot be created or any ZIP entry cannot
    /// be written.
    fn create_mock_docx(&self, filename: &str, content: &str) -> Result<String> {
        let file_path = self.temp_dir.path().join(filename);
        let file = fs::File::create(&file_path)?;
        let mut zip = zip::ZipWriter::new(file);

        // Content-type manifest for the package.
        zip.start_file("[Content_Types].xml", zip::write::FileOptions::default())?;
        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>"#)?;

        // Root relationships: point at the main document part.
        zip.start_file("_rels/.rels", zip::write::FileOptions::default())?;
        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>"#)?;

        // Main document part carrying the (escaped) caller text.
        zip.start_file("word/document.xml", zip::write::FileOptions::default())?;
        let document_xml = format!(r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:r>
<w:t>{}</w:t>
</w:r>
</w:p>
</w:body>
</w:document>"#, Self::escape_xml(content));
        zip.write_all(document_xml.as_bytes())?;

        zip.finish()?;
        Ok(file_path.to_string_lossy().into_owned())
    }

    /// Create a mock XLSX file with spreadsheet content.
    ///
    /// Each element of `content` becomes one row holding a single inline-string
    /// cell in column `A` (rows are numbered from 1). Cell text is XML-escaped.
    ///
    /// # Errors
    /// Returns an error if the file cannot be created or any ZIP entry cannot
    /// be written.
    fn create_mock_xlsx(&self, filename: &str, content: &[&str]) -> Result<String> {
        let file_path = self.temp_dir.path().join(filename);
        let file = fs::File::create(&file_path)?;
        let mut zip = zip::ZipWriter::new(file);

        // Content-type manifest for the package.
        zip.start_file("[Content_Types].xml", zip::write::FileOptions::default())?;
        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/xl/workbook.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml"/>
<Override PartName="/xl/worksheets/sheet1.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml"/>
</Types>"#)?;

        // Root relationships: point at the workbook part.
        zip.start_file("_rels/.rels", zip::write::FileOptions::default())?;
        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="xl/workbook.xml"/>
</Relationships>"#)?;

        // Workbook declaring a single sheet.
        zip.start_file("xl/workbook.xml", zip::write::FileOptions::default())?;
        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
<sheets>
<sheet name="Sheet1" sheetId="1" r:id="rId1"/>
</sheets>
</workbook>"#)?;

        // Workbook relationships: bind rId1 to the worksheet part.
        zip.start_file("xl/_rels/workbook.xml.rels", zip::write::FileOptions::default())?;
        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet" Target="worksheets/sheet1.xml"/>
</Relationships>"#)?;

        // Worksheet part: one row per entry of `content`.
        zip.start_file("xl/worksheets/sheet1.xml", zip::write::FileOptions::default())?;
        let mut worksheet_xml = String::from(r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
<sheetData>"#);
        for (row_idx, cell_content) in content.iter().enumerate() {
            // Spreadsheet rows are 1-based.
            let row_number = row_idx + 1;
            worksheet_xml.push_str(&format!(r#"
<row r="{}">
<c r="A{}" t="inlineStr">
<is><t>{}</t></is>
</c>
</row>"#, row_number, row_number, Self::escape_xml(cell_content)));
        }
        worksheet_xml.push_str(r#"
</sheetData>
</worksheet>"#);
        zip.write_all(worksheet_xml.as_bytes())?;

        zip.finish()?;
        Ok(file_path.to_string_lossy().into_owned())
    }

    /// Create a corrupted file for testing error handling.
    ///
    /// The file holds a short plain-text sentence rather than a ZIP archive.
    ///
    /// # Errors
    /// Returns an error if the file cannot be written.
    fn create_corrupted_file(&self, filename: &str) -> Result<String> {
        let file_path = self.temp_dir.path().join(filename);
        fs::write(
            &file_path,
            b"This is not a valid Office document but pretends to be one",
        )?;
        Ok(file_path.to_string_lossy().into_owned())
    }

    /// Create an empty (zero-byte) file.
    ///
    /// # Errors
    /// Returns an error if the file cannot be created.
    fn create_empty_file(&self, filename: &str) -> Result<String> {
        let file_path = self.temp_dir.path().join(filename);
        fs::File::create(&file_path)?;
        Ok(file_path.to_string_lossy().into_owned())
    }
}
/// Builds the [`OcrService`] shared by most tests in this file.
///
/// The fallback strategy, circuit breaker and learning cache are all enabled,
/// method timeouts are left at their defaults, and `temp_dir` is passed through
/// as the service's working directory.
fn create_test_ocr_service(temp_dir: &str) -> OcrService {
    let circuit_breaker = CircuitBreakerConfig {
        enabled: true,
        failure_threshold: 3,
        recovery_timeout_seconds: 5,
        success_threshold_percentage: 70,
    };
    let learning = LearningConfig {
        enabled: true,
        cache_successful_methods: true,
        cache_ttl_hours: 1,
    };
    let fallback_config = FallbackConfig {
        enabled: true,
        max_retries: 2,
        initial_retry_delay_ms: 100,
        max_retry_delay_ms: 1000,
        circuit_breaker,
        learning,
        method_timeouts: MethodTimeouts::default(),
    };

    OcrService::new_with_config(OcrConfig {
        fallback_config,
        temp_dir: temp_dir.to_owned(),
    })
}
/// Extracts text from a generated DOCX and sanity-checks the reported result.
#[tokio::test]
async fn test_extract_text_from_docx() -> Result<()> {
    const DOCX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    let fixtures = OfficeTestDocuments::new()?;
    let workspace = fixtures.temp_dir.path().to_string_lossy();
    let service = create_test_ocr_service(&workspace);

    let body_text = "This is a test DOCX document with sample content for extraction testing.";
    let docx_path = fixtures.create_mock_docx("test.docx", body_text)?;

    let outcome = service
        .extract_text_from_office_document(&docx_path, DOCX_MIME)
        .await?;
    assert!(outcome.success);

    // Printed to aid debugging; only generic properties are asserted below,
    // not the exact extracted text.
    println!("Extracted text: '{}'", outcome.text);
    println!("Method used: {}", outcome.method_name);

    assert!(!outcome.text.is_empty());
    assert!(outcome.word_count > 0);
    assert!(outcome.confidence > 0.0);
    assert!(outcome.processing_time < Duration::from_secs(30));
    // Any extraction method is accepted as long as its name says "extraction".
    assert!(outcome.method_name.contains("extraction"));
    Ok(())
}
/// Extracts text from a generated four-row XLSX and sanity-checks the result.
#[tokio::test]
async fn test_extract_text_from_xlsx() -> Result<()> {
    const XLSX_MIME: &str = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";

    let fixtures = OfficeTestDocuments::new()?;
    let workspace = fixtures.temp_dir.path().to_string_lossy();
    let service = create_test_ocr_service(&workspace);

    // One spreadsheet row per entry.
    let rows = ["Header 1", "Data Row 1", "Data Row 2", "Summary Data"];
    let xlsx_path = fixtures.create_mock_xlsx("test.xlsx", &rows)?;

    let outcome = service
        .extract_text_from_office_document(&xlsx_path, XLSX_MIME)
        .await?;
    assert!(outcome.success);

    // Printed to aid debugging; only generic properties are asserted below.
    println!("XLSX extracted text: '{}'", outcome.text);
    println!("XLSX method used: {}", outcome.method_name);

    assert!(!outcome.text.is_empty());
    assert!(outcome.word_count > 0);
    assert!(outcome.confidence > 0.0);
    Ok(())
}
/// Runs `extract_text_from_office_document_with_config` on a generated DOCX
/// using a service built from the default fallback configuration.
#[tokio::test]
async fn test_extraction_modes() -> Result<()> {
    const DOCX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    let fixtures = OfficeTestDocuments::new()?;
    let temp_dir = fixtures.temp_dir.path().to_string_lossy().to_string();
    let docx_path =
        fixtures.create_mock_docx("test_modes.docx", "Test document for mode comparison")?;

    let service = OcrService::new_with_config(OcrConfig {
        fallback_config: FallbackConfig::default(),
        temp_dir,
    });

    let result = service
        .extract_text_from_office_document_with_config(&docx_path, DOCX_MIME)
        .await;

    // The generated document is well-formed, so extraction is expected to succeed.
    assert!(result.is_ok(), "XML extraction failed: {:?}", result);

    // NOTE(review): the value is treated here as something exposing `is_empty()`,
    // whereas `test_timeout_handling` reads a `.success` field from the same
    // method's result - confirm the actual return type.
    let extracted_text = result?;
    assert!(!extracted_text.is_empty());
    Ok(())
}
/// Expects extraction to end up on the XML method when the library method is
/// given only a one-second timeout.
#[tokio::test]
async fn test_fallback_mechanism() -> Result<()> {
    const DOCX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    let fixtures = OfficeTestDocuments::new()?;
    let temp_dir = fixtures.temp_dir.path().to_string_lossy().to_string();

    // Circuit breaker is switched off so it cannot interfere with this test.
    let circuit_breaker = CircuitBreakerConfig {
        enabled: false,
        failure_threshold: 5,
        recovery_timeout_seconds: 10,
        success_threshold_percentage: 50,
    };
    let learning = LearningConfig {
        enabled: true,
        cache_successful_methods: true,
        cache_ttl_hours: 1,
    };
    // The library method gets a very short budget, intended to force a fallback.
    let method_timeouts = MethodTimeouts {
        library_timeout_seconds: 1,
        xml_timeout_seconds: 30,
        ocr_timeout_seconds: 60,
    };
    let service = OcrService::new_with_config(OcrConfig {
        fallback_config: FallbackConfig {
            enabled: true,
            max_retries: 1,
            initial_retry_delay_ms: 50,
            max_retry_delay_ms: 200,
            circuit_breaker,
            learning,
            method_timeouts,
        },
        temp_dir,
    });

    let docx_path = fixtures.create_mock_docx("fallback_test.docx", "Fallback test content")?;

    let outcome = service
        .extract_text_from_office_document(&docx_path, DOCX_MIME)
        .await?;

    assert!(outcome.success);
    assert!(outcome.text.contains("Fallback test content"));
    // The reported method name is expected to mention XML.
    assert!(outcome.method_name.contains("XML"));
    Ok(())
}
/// Checks that extracting a small DOCX finishes inside a two-second outer
/// timeout and reports success.
#[tokio::test]
async fn test_timeout_handling() -> Result<()> {
    const DOCX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    let fixtures = OfficeTestDocuments::new()?;
    let workspace = fixtures.temp_dir.path().to_string_lossy();
    let service = create_test_ocr_service(&workspace);
    let docx_path = fixtures.create_mock_docx("timeout_test.docx", "Test content")?;

    // Outer guard around the whole extraction call.
    let overall_budget = Duration::from_secs(2);
    let extraction =
        service.extract_text_from_office_document_with_config(&docx_path, DOCX_MIME);
    let result = timeout(overall_budget, extraction).await;

    // The fixture is tiny, so the outer timeout is not expected to elapse.
    assert!(result.is_ok());

    // First `?` unwraps the timeout layer, second `?` the extraction result.
    let extraction_result = result??;
    assert!(extraction_result.success);
    Ok(())
}
/// Checks that extraction returns an error for a corrupted file, an empty file
/// and a path that does not exist.
#[tokio::test]
async fn test_error_handling() -> Result<()> {
    const DOCX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    let fixtures = OfficeTestDocuments::new()?;
    let workspace = fixtures.temp_dir.path().to_string_lossy();
    let service = create_test_ocr_service(&workspace);

    // Case 1: plain text stored under a `.docx` name.
    let corrupted_path = fixtures.create_corrupted_file("corrupted.docx")?;
    let corrupted_outcome = service
        .extract_text_from_office_document(&corrupted_path, DOCX_MIME)
        .await;
    assert!(corrupted_outcome.is_err());
    let error_msg = corrupted_outcome.unwrap_err().to_string();
    // The message must mention one of these (case-sensitive) keywords.
    assert!(error_msg.contains("corrupted") || error_msg.contains("invalid") || error_msg.contains("parsing"));

    // Case 2: zero-byte file.
    let empty_path = fixtures.create_empty_file("empty.docx")?;
    let empty_outcome = service
        .extract_text_from_office_document(&empty_path, DOCX_MIME)
        .await;
    assert!(empty_outcome.is_err());

    // Case 3: path that was never created.
    let missing_outcome = service
        .extract_text_from_office_document("/path/that/does/not/exist.docx", DOCX_MIME)
        .await;
    assert!(missing_outcome.is_err());
    Ok(())
}
/// Extracts five distinct DOCX files concurrently, one spawned task per file,
/// and checks that every task returns the text of its own document.
///
/// Each task builds its own service via `create_test_ocr_service`, so no
/// service instance is shared between tasks.
#[tokio::test]
async fn test_concurrent_extraction() -> Result<()> {
    const DOCX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    let test_docs = OfficeTestDocuments::new()?;
    // Owned copy of the directory path, reused for every per-task service.
    let temp_dir = test_docs.temp_dir.path().to_string_lossy().into_owned();

    // Create all fixture files up front, stopping at the first failure.
    let file_paths = (0..5)
        .map(|i| {
            let content = format!("Test document {} with unique content", i);
            test_docs.create_mock_docx(&format!("concurrent_test_{}.docx", i), &content)
        })
        .collect::<Result<Vec<_>>>()?;

    // Launch one extraction task per file.
    let tasks: Vec<_> = file_paths
        .into_iter()
        .map(|file_path| {
            let task_service = create_test_ocr_service(&temp_dir);
            tokio::spawn(async move {
                task_service
                    .extract_text_from_office_document(&file_path, DOCX_MIME)
                    .await
            })
        })
        .collect();

    // `join_all` yields results in the order the tasks were supplied, so index
    // `i` corresponds to document `i`.
    let results = futures::future::join_all(tasks).await;
    for (i, task_result) in results.into_iter().enumerate() {
        // First `?` surfaces a join error, second `?` the extraction error.
        let extraction_result = task_result??;
        assert!(extraction_result.success, "Task {} failed", i);
        assert!(
            extraction_result.text.contains(&format!("Test document {}", i)),
            "Task {} returned unexpected text: '{}'",
            i,
            extraction_result.text
        );
        assert!(extraction_result.word_count > 0);
    }
    Ok(())
}
/// Drives two failing extractions through a service whose circuit breaker has
/// a failure threshold of 2, then expects a third call to be rejected with a
/// circuit-breaker error.
#[tokio::test]
async fn test_circuit_breaker() -> Result<()> {
    const DOCX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    let fixtures = OfficeTestDocuments::new()?;

    // Aggressive breaker: opens after two failures, one-second recovery window,
    // and a 100% success requirement.
    let circuit_breaker = CircuitBreakerConfig {
        enabled: true,
        failure_threshold: 2,
        recovery_timeout_seconds: 1,
        success_threshold_percentage: 100,
    };
    let method_timeouts = MethodTimeouts {
        library_timeout_seconds: 30,
        xml_timeout_seconds: 30,
        ocr_timeout_seconds: 30,
    };
    let service = OcrService::new_with_config(OcrConfig {
        fallback_config: FallbackConfig {
            enabled: true,
            // Retries are disabled so each bad file counts as an immediate failure.
            max_retries: 0,
            initial_retry_delay_ms: 10,
            max_retry_delay_ms: 100,
            circuit_breaker,
            learning: LearningConfig::default(),
            method_timeouts,
        },
        temp_dir: fixtures.temp_dir.path().to_string_lossy().to_string(),
    });

    // One good document plus two broken ones.
    let valid_path = fixtures.create_mock_docx("circuit_test.docx", "Valid document")?;
    let corrupted1 = fixtures.create_corrupted_file("corrupted1.docx")?;
    let corrupted2 = fixtures.create_corrupted_file("corrupted2.docx")?;

    // Failure #1.
    let first = service
        .extract_text_from_office_document(&corrupted1, DOCX_MIME)
        .await;
    assert!(first.is_err());

    // Failure #2 reaches the configured threshold.
    let second = service
        .extract_text_from_office_document(&corrupted2, DOCX_MIME)
        .await;
    assert!(second.is_err());

    // Even a valid document is now expected to be rejected.
    let third = service
        .extract_text_from_office_document(&valid_path, DOCX_MIME)
        .await;
    assert!(third.is_err());
    let error_msg = third.unwrap_err().to_string();
    assert!(error_msg.contains("circuit breaker") || error_msg.contains("open"));

    // Sleep past the one-second recovery timeout.
    tokio::time::sleep(Duration::from_secs(2)).await;

    // Either outcome is accepted after the wait, so the result is deliberately
    // not asserted.
    let _after_recovery = service
        .extract_text_from_office_document(&valid_path, DOCX_MIME)
        .await;
    Ok(())
}
/// Checks that fallback statistics start at zero after a reset and count three
/// extractions after three successful calls.
#[tokio::test]
async fn test_statistics_tracking() -> Result<()> {
    const DOCX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    let fixtures = OfficeTestDocuments::new()?;
    let workspace = fixtures.temp_dir.path().to_string_lossy();
    let service = create_test_ocr_service(&workspace);

    // Begin from a clean slate.
    service.reset_fallback_stats().await?;
    let before = service.get_fallback_stats().await.unwrap();
    assert_eq!(before.total_extractions, 0);

    // Extract the same document three times.
    let valid_path = fixtures.create_mock_docx("stats_test.docx", "Statistics test document")?;
    for i in 0..3 {
        let result = service
            .extract_text_from_office_document(&valid_path, DOCX_MIME)
            .await;
        assert!(result.is_ok(), "Extraction {} failed: {:?}", i, result);
    }

    // Counters must reflect the three calls.
    let after = service.get_fallback_stats().await.unwrap();
    assert_eq!(after.total_extractions, 3);
    assert!(after.success_rate_percentage > 0.0);
    assert!(after.average_processing_time_ms > 0.0);
    Ok(())
}
/// Checks that the service advertises DOCX, XLSX, PDF and PNG MIME types and
/// reports Office-document support.
#[tokio::test]
async fn test_mime_type_support() -> Result<()> {
    let fixtures = OfficeTestDocuments::new()?;
    let workspace = fixtures.temp_dir.path().to_string_lossy();
    let service = create_test_ocr_service(&workspace);

    // Advertised MIME types.
    let advertised = service.get_supported_mime_types();
    assert!(advertised.contains(&"application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
    assert!(advertised.contains(&"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
    assert!(advertised.contains(&"application/pdf"));
    assert!(advertised.contains(&"image/png"));

    // Office capability flag.
    assert!(service.supports_office_documents());
    Ok(())
}
/// Extracts three DOCX files in a row through a service with learning enabled
/// and checks that each extraction succeeds with the expected text.
///
/// The learning cache itself is not inspected; the test only observes that
/// repeated extractions keep working.
#[tokio::test]
async fn test_learning_mechanism() -> Result<()> {
    const DOCX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    let fixtures = OfficeTestDocuments::new()?;

    // Circuit breaker is switched off so only the learning path is in play.
    let circuit_breaker = CircuitBreakerConfig {
        enabled: false,
        failure_threshold: 10,
        recovery_timeout_seconds: 10,
        success_threshold_percentage: 50,
    };
    let learning = LearningConfig {
        enabled: true,
        cache_successful_methods: true,
        cache_ttl_hours: 1,
    };
    let service = OcrService::new_with_config(OcrConfig {
        fallback_config: FallbackConfig {
            enabled: true,
            max_retries: 1,
            initial_retry_delay_ms: 10,
            max_retry_delay_ms: 100,
            circuit_breaker,
            learning,
            method_timeouts: MethodTimeouts::default(),
        },
        temp_dir: fixtures.temp_dir.path().to_string_lossy().to_string(),
    });

    // Feed several documents of the same type through the service.
    for i in 0..3 {
        let content = format!("Learning test document {} content", i);
        let docx_path = fixtures.create_mock_docx(&format!("learning_{}.docx", i), &content)?;

        let result = service
            .extract_text_from_office_document(&docx_path, DOCX_MIME)
            .await;
        assert!(result.is_ok(), "Learning iteration {} failed: {:?}", i, result);

        let outcome = result?;
        assert!(outcome.success);
        assert!(outcome.text.contains(&format!("document {}", i)));
    }
    Ok(())
}
/// Checks that the general `extract_text` entry point returns the content of
/// generated DOCX and XLSX files.
#[tokio::test]
async fn test_integration_with_main_extract_text() -> Result<()> {
    const DOCX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
    const XLSX_MIME: &str = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";

    let fixtures = OfficeTestDocuments::new()?;
    let workspace = fixtures.temp_dir.path().to_string_lossy();
    let service = create_test_ocr_service(&workspace);

    // DOCX through the main entry point.
    let docx_path = fixtures.create_mock_docx(
        "integration.docx",
        "Integration test for main extract_text method",
    )?;
    let docx_text = service.extract_text(&docx_path, DOCX_MIME).await?;
    assert!(!docx_text.is_empty());
    assert!(docx_text.contains("Integration test"));

    // XLSX through the same entry point.
    let cells = ["Cell 1", "Cell 2", "Cell 3"];
    let xlsx_path = fixtures.create_mock_xlsx("integration.xlsx", &cells)?;
    let xlsx_text = service.extract_text(&xlsx_path, XLSX_MIME).await?;
    assert!(!xlsx_text.is_empty());
    assert!(xlsx_text.contains("Cell 1"));
    Ok(())
}
/// Performance benchmark: extracts a large generated DOCX ten times and fails
/// if the average wall-clock time per extraction reaches five seconds.
///
/// Marked `#[ignore]`, so it only runs when ignored tests are requested.
#[tokio::test]
#[ignore]
async fn benchmark_extraction_performance() -> Result<()> {
    const DOCX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
    const NUM_ITERATIONS: u32 = 10;

    let fixtures = OfficeTestDocuments::new()?;
    let workspace = fixtures.temp_dir.path().to_string_lossy();
    let service = create_test_ocr_service(&workspace);

    // One sentence repeated 1000 times gives a noticeably larger document.
    let large_content = "This is a large test document. ".repeat(1000);
    let docx_path = fixtures.create_mock_docx("benchmark.docx", &large_content)?;

    let started = std::time::Instant::now();
    for i in 0..NUM_ITERATIONS {
        let outcome = service
            .extract_text_from_office_document(&docx_path, DOCX_MIME)
            .await?;
        assert!(outcome.success);
        println!("Iteration {}: {} ms, {} words",
            i,
            outcome.processing_time.as_millis(),
            outcome.word_count
        );
    }
    let total_time = started.elapsed();
    let avg_time = total_time / NUM_ITERATIONS;

    println!("Average extraction time: {:?}", avg_time);
    println!("Total time for {} iterations: {:?}", NUM_ITERATIONS, total_time);

    // Upper bound on acceptable average latency.
    assert!(avg_time < Duration::from_secs(5), "Average extraction time too slow: {:?}", avg_time);
    Ok(())
}