860 lines
35 KiB
Rust
860 lines
35 KiB
Rust
#[cfg(test)]
|
|
mod tests {
|
|
use readur::ocr::enhanced::{EnhancedOcrService, OcrResult, ImageQualityStats};
|
|
use readur::models::Settings;
|
|
use readur::services::file_service::FileService;
|
|
use readur::storage::{StorageConfig, factory::create_storage_backend};
|
|
use std::fs;
|
|
use tempfile::{NamedTempFile, TempDir};
|
|
|
|
fn create_test_settings() -> Settings {
|
|
Settings::default()
|
|
}
|
|
|
|
fn create_temp_dir() -> TempDir {
|
|
TempDir::new().expect("Failed to create temp directory")
|
|
}
|
|
|
|
async fn create_test_file_service(temp_path: &str) -> FileService {
|
|
let storage_config = StorageConfig::Local { upload_path: temp_path.to_string() };
|
|
let storage_backend = create_storage_backend(storage_config).await.unwrap();
|
|
FileService::with_storage(temp_path.to_string(), storage_backend)
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_enhanced_ocr_service_creation() {
|
|
let temp_dir = create_temp_dir();
|
|
let temp_path = temp_dir.path().to_str().unwrap().to_string();
|
|
let file_service = create_test_file_service(&temp_path).await;
|
|
let service = EnhancedOcrService::new(temp_path.clone(), file_service);
|
|
|
|
// Service should be created successfully
|
|
assert!(!service.temp_dir.is_empty());
|
|
}
|
|
|
|
/// `ImageQualityStats` can be built from a struct literal and its fields read back
/// unchanged.
///
/// Nothing here awaits, so this is a plain synchronous `#[test]` and needs no async
/// runtime.
#[test]
fn test_image_quality_stats_creation() {
    let stats = ImageQualityStats {
        average_brightness: 128.0,
        contrast_ratio: 0.5,
        noise_level: 0.1,
        sharpness: 0.8,
    };

    // Exact float equality is intended: the values are read back without any arithmetic.
    assert_eq!(stats.average_brightness, 128.0);
    assert_eq!(stats.contrast_ratio, 0.5);
    assert_eq!(stats.noise_level, 0.1);
    assert_eq!(stats.sharpness, 0.8);
}
|
|
|
|
#[tokio::test]
|
|
async fn test_count_words_safely_whitespace_separated() {
|
|
let temp_dir = create_temp_dir();
|
|
let temp_path = temp_dir.path().to_str().unwrap().to_string();
|
|
let file_service = create_test_file_service(&temp_path).await;
|
|
let service = EnhancedOcrService::new(temp_path.clone(), file_service);
|
|
|
|
// Test normal whitespace-separated text
|
|
let text = "Hello world this is a test";
|
|
let count = service.count_words_safely(&text);
|
|
assert_eq!(count, 6);
|
|
|
|
// Test with extra whitespace
|
|
let text = " Hello world \n test ";
|
|
let count = service.count_words_safely(&text);
|
|
assert_eq!(count, 3);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_count_words_safely_continuous_text() {
|
|
let temp_dir = create_temp_dir();
|
|
let temp_path = temp_dir.path().to_str().unwrap().to_string();
|
|
let file_service = create_test_file_service(&temp_path).await;
|
|
let service = EnhancedOcrService::new(temp_path.clone(), file_service);
|
|
|
|
// Test continuous text without spaces (like some PDF extractions)
|
|
let text = "HelloWorldThisIsAContinuousText";
|
|
let count = service.count_words_safely(&text);
|
|
assert!(count > 0, "Should detect words even without whitespace");
|
|
|
|
// Test mixed alphanumeric without spaces
|
|
let text = "ABC123DEF456GHI789";
|
|
let count = service.count_words_safely(&text);
|
|
assert!(count > 0, "Should detect alphanumeric patterns as words");
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_count_words_safely_edge_cases() {
|
|
let temp_dir = create_temp_dir();
|
|
let temp_path = temp_dir.path().to_str().unwrap().to_string();
|
|
let file_service = create_test_file_service(&temp_path).await;
|
|
let service = EnhancedOcrService::new(temp_path.clone(), file_service);
|
|
|
|
// Test empty text
|
|
let count = service.count_words_safely("");
|
|
assert_eq!(count, 0);
|
|
|
|
// Test only whitespace
|
|
let count = service.count_words_safely(" \n\t ");
|
|
assert_eq!(count, 0);
|
|
|
|
// Test only punctuation
|
|
let text = "!@#$%^&*()_+-=[]{}|;':\",./<>?";
|
|
let count = service.count_words_safely(&text);
|
|
// Since there are no alphabetic or alphanumeric chars, should be 0
|
|
assert_eq!(count, 0, "Pure punctuation should not count as words, got {}", count);
|
|
|
|
// Test single character
|
|
let count = service.count_words_safely("A");
|
|
assert_eq!(count, 1);
|
|
|
|
// Test mixed content with low alphanumeric ratio
|
|
let text = "A!!!B@@@C###D$$$E%%%";
|
|
let count = service.count_words_safely(&text);
|
|
assert!(count > 0, "Should detect words in mixed content");
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_count_words_safely_large_text() {
|
|
let temp_dir = create_temp_dir();
|
|
let temp_path = temp_dir.path().to_str().unwrap().to_string();
|
|
let file_service = create_test_file_service(&temp_path).await;
|
|
let service = EnhancedOcrService::new(temp_path.clone(), file_service);
|
|
|
|
// Test with large text (over 1MB) to trigger sampling
|
|
let word = "test ";
|
|
let large_text = word.repeat(250_000); // Creates ~1.25MB of text
|
|
let count = service.count_words_safely(&large_text);
|
|
|
|
// Should estimate around 250,000 words (may vary due to sampling)
|
|
assert!(count > 200_000, "Should estimate large word count: got {}", count);
|
|
assert!(count <= 10_000_000, "Should cap at max limit: got {}", count);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_count_words_safely_fallback_patterns() {
|
|
let temp_dir = create_temp_dir();
|
|
let temp_path = temp_dir.path().to_str().unwrap().to_string();
|
|
let file_service = create_test_file_service(&temp_path).await;
|
|
let service = EnhancedOcrService::new(temp_path, file_service);
|
|
|
|
// Test letter transition detection
|
|
let text = "OneWordAnotherWordFinalWord";
|
|
let count = service.count_words_safely(&text);
|
|
assert!(count >= 3, "Should detect at least 3 words from transitions: got {}", count);
|
|
|
|
// Test alphanumeric estimation fallback
|
|
let text = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; // 26 chars, should estimate ~5-6 words
|
|
let count = service.count_words_safely(&text);
|
|
assert!(count >= 1 && count <= 10, "Should estimate reasonable word count: got {}", count);
|
|
|
|
// Test mixed case with numbers
|
|
let text = "ABC123def456GHI789jkl";
|
|
let count = service.count_words_safely(&text);
|
|
assert!(count >= 1, "Should detect words in mixed alphanumeric: got {}", count);
|
|
}
|
|
|
|
/// `OcrResult` can be built from a struct literal and every field reads back as
/// written.
///
/// Nothing here awaits, so this is a plain synchronous `#[test]` and needs no async
/// runtime.
#[test]
fn test_ocr_result_structure() {
    let result = OcrResult {
        text: "Test text".to_string(),
        confidence: 85.5,
        processing_time_ms: 1500,
        word_count: 2,
        preprocessing_applied: vec!["noise_reduction".to_string()],
        processed_image_path: Some("/tmp/processed.png".to_string()),
    };

    assert_eq!(result.text, "Test text");
    // Exact float equality is intended: the value is read back without arithmetic.
    assert_eq!(result.confidence, 85.5);
    assert_eq!(result.processing_time_ms, 1500);
    assert_eq!(result.word_count, 2);
    assert_eq!(result.preprocessing_applied.len(), 1);
    assert!(result.processed_image_path.is_some());
}
|
|
|
|
#[tokio::test]
|
|
async fn test_extract_text_from_plain_text() {
|
|
let temp_dir = create_temp_dir();
|
|
let temp_path = temp_dir.path().to_str().unwrap().to_string();
|
|
let file_service = create_test_file_service(&temp_path).await;
|
|
let service = EnhancedOcrService::new(temp_path, file_service);
|
|
let settings = create_test_settings();
|
|
|
|
let temp_file = NamedTempFile::with_suffix(".txt").unwrap();
|
|
let test_content = "This is a test text file with multiple words.";
|
|
fs::write(temp_file.path(), test_content).unwrap();
|
|
|
|
let result = service
|
|
.extract_text(temp_file.path().to_str().unwrap(), "text/plain", &settings)
|
|
.await;
|
|
|
|
assert!(result.is_ok());
|
|
let ocr_result = result.unwrap();
|
|
assert_eq!(ocr_result.text.trim(), test_content);
|
|
assert_eq!(ocr_result.confidence, 100.0); // Plain text should be 100% confident
|
|
assert_eq!(ocr_result.word_count, 9); // "This is a test text file with multiple words"
|
|
assert!(ocr_result.processing_time_ms >= 0);
|
|
assert!(ocr_result.preprocessing_applied.contains(&"Plain text read".to_string()));
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_extract_text_with_context() {
|
|
let temp_dir = create_temp_dir();
|
|
let temp_path = temp_dir.path().to_str().unwrap().to_string();
|
|
let file_service = create_test_file_service(&temp_path).await;
|
|
let service = EnhancedOcrService::new(temp_path, file_service);
|
|
let settings = create_test_settings();
|
|
|
|
let temp_file = NamedTempFile::with_suffix(".txt").unwrap();
|
|
let test_content = "Context test content";
|
|
fs::write(temp_file.path(), test_content).unwrap();
|
|
|
|
let result = service
|
|
.extract_text_with_context(
|
|
temp_file.path().to_str().unwrap(),
|
|
"text/plain",
|
|
"test_file.txt",
|
|
19, // Length of "Context test content"
|
|
&settings,
|
|
)
|
|
.await;
|
|
|
|
assert!(result.is_ok());
|
|
let ocr_result = result.unwrap();
|
|
assert_eq!(ocr_result.text.trim(), test_content);
|
|
assert_eq!(ocr_result.confidence, 100.0);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_extract_text_unsupported_mime_type() {
|
|
let temp_dir = create_temp_dir();
|
|
let temp_path = temp_dir.path().to_str().unwrap().to_string();
|
|
let file_service = create_test_file_service(&temp_path).await;
|
|
let service = EnhancedOcrService::new(temp_path, file_service);
|
|
let settings = create_test_settings();
|
|
|
|
let temp_file = NamedTempFile::new().unwrap();
|
|
fs::write(temp_file.path(), "some content").unwrap();
|
|
|
|
let result = service
|
|
.extract_text(temp_file.path().to_str().unwrap(), "application/unknown", &settings)
|
|
.await;
|
|
|
|
assert!(result.is_err());
|
|
let error_msg = result.unwrap_err().to_string();
|
|
assert!(error_msg.contains("Unsupported file type"));
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_extract_text_nonexistent_file() {
|
|
let temp_dir = create_temp_dir();
|
|
let temp_path = temp_dir.path().to_str().unwrap().to_string();
|
|
let file_service = create_test_file_service(&temp_path).await;
|
|
let service = EnhancedOcrService::new(temp_path, file_service);
|
|
let settings = create_test_settings();
|
|
|
|
let result = service
|
|
.extract_text("/nonexistent/file.txt", "text/plain", &settings)
|
|
.await;
|
|
|
|
assert!(result.is_err());
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_extract_text_large_file_truncation() {
|
|
let temp_dir = create_temp_dir();
|
|
let temp_path = temp_dir.path().to_str().unwrap().to_string();
|
|
let file_service = create_test_file_service(&temp_path).await;
|
|
let service = EnhancedOcrService::new(temp_path, file_service);
|
|
let settings = create_test_settings();
|
|
|
|
let temp_file = NamedTempFile::with_suffix(".txt").unwrap();
|
|
|
|
// Create a file larger than the limit (50MB for text files)
|
|
// Using smaller size and explicit drop for CI environments
|
|
let large_content = "A".repeat(50 * 1024 * 1024 + 1024); // 50MB + 1KB (just over the limit)
|
|
fs::write(temp_file.path(), &large_content).unwrap();
|
|
drop(large_content); // Explicitly free memory
|
|
|
|
let result = service
|
|
.extract_text(temp_file.path().to_str().unwrap(), "text/plain", &settings)
|
|
.await;
|
|
|
|
// Should fail due to size limit
|
|
assert!(result.is_err());
|
|
let error_msg = result.unwrap_err().to_string();
|
|
assert!(error_msg.contains("too large"));
|
|
}
|
|
|
|
/// A result with confidence 95 passes `validate_ocr_quality` when the configured
/// minimum confidence is 30.
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_validate_ocr_quality_high_confidence() {
    let workspace = create_temp_dir();
    let workspace_path = workspace.path().to_str().unwrap().to_owned();
    let files = create_test_file_service(&workspace_path).await;
    let ocr = EnhancedOcrService::new(workspace_path, files);

    let mut thresholds = create_test_settings();
    thresholds.ocr_min_confidence = 30.0;

    let candidate = OcrResult {
        text: String::from("This is high quality OCR text with good words."),
        confidence: 95.0,
        processing_time_ms: 1000,
        word_count: 9,
        preprocessing_applied: Vec::new(),
        processed_image_path: None,
    };

    let verdict = ocr.validate_ocr_quality(&candidate, &thresholds);
    assert!(verdict.is_ok());
}
|
|
|
|
/// A result with confidence 25 still passes `validate_ocr_quality` even though the
/// configured minimum confidence is 50.
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_validate_ocr_quality_low_confidence() {
    let workspace = create_temp_dir();
    let workspace_path = workspace.path().to_str().unwrap().to_owned();
    let files = create_test_file_service(&workspace_path).await;
    let ocr = EnhancedOcrService::new(workspace_path, files);

    let mut thresholds = create_test_settings();
    thresholds.ocr_min_confidence = 50.0;

    let candidate = OcrResult {
        text: String::from("Poor quality text"),
        // Below the configured threshold, yet expected to be accepted.
        confidence: 25.0,
        processing_time_ms: 1000,
        word_count: 3,
        preprocessing_applied: Vec::new(),
        processed_image_path: None,
    };

    // Low confidence is accepted rather than rejected (the original note says a
    // warning is emitted instead; that is not observable from this test).
    let verdict = ocr.validate_ocr_quality(&candidate, &thresholds);
    assert!(verdict.is_ok());
}
|
|
|
|
/// A result with empty text and a word count of 0 fails `validate_ocr_quality`,
/// regardless of its high confidence.
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_validate_ocr_quality_no_words() {
    let workspace = create_temp_dir();
    let workspace_path = workspace.path().to_str().unwrap().to_owned();
    let files = create_test_file_service(&workspace_path).await;
    let ocr = EnhancedOcrService::new(workspace_path, files);
    let defaults = create_test_settings();

    let candidate = OcrResult {
        text: String::new(),
        confidence: 95.0,
        processing_time_ms: 1000,
        // No words
        word_count: 0,
        preprocessing_applied: Vec::new(),
        processed_image_path: None,
    };

    let verdict = ocr.validate_ocr_quality(&candidate, &defaults);
    assert!(verdict.is_err());
}
|
|
|
|
/// Text consisting only of symbols fails `validate_ocr_quality` despite a confidence
/// of 85.
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_validate_ocr_quality_poor_character_distribution() {
    let workspace = create_temp_dir();
    let workspace_path = workspace.path().to_str().unwrap().to_owned();
    let files = create_test_file_service(&workspace_path).await;
    let ocr = EnhancedOcrService::new(workspace_path, files);
    let defaults = create_test_settings();

    let candidate = OcrResult {
        // Symbols only: no letters or digits at all, so < 10% content
        text: String::from("!!!@@@###$$$%%%^^^&&&***"),
        confidence: 85.0,
        processing_time_ms: 1000,
        word_count: 1,
        preprocessing_applied: Vec::new(),
        processed_image_path: None,
    };

    let verdict = ocr.validate_ocr_quality(&candidate, &defaults);
    assert!(verdict.is_err());
}
|
|
|
|
/// Ordinary prose with digits passes `validate_ocr_quality` at confidence 85.
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_validate_ocr_quality_good_character_distribution() {
    let workspace = create_temp_dir();
    let workspace_path = workspace.path().to_str().unwrap().to_owned();
    let files = create_test_file_service(&workspace_path).await;
    let ocr = EnhancedOcrService::new(workspace_path, files);
    let defaults = create_test_settings();

    let candidate = OcrResult {
        // Good alphanumeric ratio
        text: String::from("The quick brown fox jumps over the lazy dog. 123"),
        confidence: 85.0,
        processing_time_ms: 1000,
        word_count: 10,
        preprocessing_applied: Vec::new(),
        processed_image_path: None,
    };

    let verdict = ocr.validate_ocr_quality(&candidate, &defaults);
    assert!(verdict.is_ok());
}
|
|
|
|
#[tokio::test]
|
|
async fn test_word_count_calculation() {
|
|
let temp_dir = create_temp_dir();
|
|
let temp_path = temp_dir.path().to_str().unwrap().to_string();
|
|
let file_service = create_test_file_service(&temp_path).await;
|
|
let service = EnhancedOcrService::new(temp_path, file_service);
|
|
let settings = create_test_settings();
|
|
|
|
let test_cases = vec![
|
|
("", 0),
|
|
("word", 1),
|
|
("two words", 2),
|
|
(" spaced words ", 2),
|
|
("Multiple\nlines\nof\ntext", 4),
|
|
("punctuation, words! work? correctly.", 4),
|
|
];
|
|
|
|
for (content, expected_count) in test_cases {
|
|
let temp_file = NamedTempFile::with_suffix(".txt").unwrap();
|
|
fs::write(temp_file.path(), content).unwrap();
|
|
|
|
let result = service
|
|
.extract_text(temp_file.path().to_str().unwrap(), "text/plain", &settings)
|
|
.await;
|
|
|
|
assert!(result.is_ok());
|
|
let ocr_result = result.unwrap();
|
|
assert_eq!(ocr_result.word_count, expected_count, "Failed for content: '{}'", content);
|
|
}
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_pdf_extraction_with_invalid_pdf() {
|
|
let temp_dir = create_temp_dir();
|
|
let temp_path = temp_dir.path().to_str().unwrap().to_string();
|
|
let file_service = create_test_file_service(&temp_path).await;
|
|
let service = EnhancedOcrService::new(temp_path, file_service);
|
|
let settings = create_test_settings();
|
|
|
|
let temp_file = NamedTempFile::with_suffix(".pdf").unwrap();
|
|
fs::write(temp_file.path(), "Not a valid PDF").unwrap();
|
|
|
|
let result = service
|
|
.extract_text(temp_file.path().to_str().unwrap(), "application/pdf", &settings)
|
|
.await;
|
|
|
|
assert!(result.is_err());
|
|
let error_msg = result.unwrap_err().to_string();
|
|
assert!(error_msg.contains("Invalid PDF") || error_msg.contains("Missing") || error_msg.contains("corrupted"));
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_pdf_extraction_with_minimal_valid_pdf() {
|
|
let temp_dir = create_temp_dir();
|
|
let temp_path = temp_dir.path().to_str().unwrap().to_string();
|
|
let file_service = create_test_file_service(&temp_path).await;
|
|
let service = EnhancedOcrService::new(temp_path, file_service);
|
|
let settings = create_test_settings();
|
|
|
|
// Minimal PDF with "Hello" text
|
|
let pdf_content = b"%PDF-1.4
|
|
1 0 obj
|
|
<< /Type /Catalog /Pages 2 0 R >>
|
|
endobj
|
|
2 0 obj
|
|
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
|
|
endobj
|
|
3 0 obj
|
|
<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >>
|
|
endobj
|
|
4 0 obj
|
|
<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>
|
|
endobj
|
|
5 0 obj
|
|
<< /Length 44 >>
|
|
stream
|
|
BT
|
|
/F1 12 Tf
|
|
100 700 Td
|
|
(Hello) Tj
|
|
ET
|
|
endstream
|
|
endobj
|
|
xref
|
|
0 6
|
|
0000000000 65535 f
|
|
0000000009 00000 n
|
|
0000000058 00000 n
|
|
0000000115 00000 n
|
|
0000000262 00000 n
|
|
0000000341 00000 n
|
|
trailer
|
|
<< /Size 6 /Root 1 0 R >>
|
|
startxref
|
|
435
|
|
%%EOF";
|
|
|
|
let temp_file = NamedTempFile::with_suffix(".pdf").unwrap();
|
|
fs::write(temp_file.path(), pdf_content).unwrap();
|
|
|
|
let result = service
|
|
.extract_text(temp_file.path().to_str().unwrap(), "application/pdf", &settings)
|
|
.await;
|
|
|
|
match result {
|
|
Ok(ocr_result) => {
|
|
// PDF extraction succeeded
|
|
assert_eq!(ocr_result.confidence, 95.0); // PDF text extraction should be high confidence
|
|
// Skip processing time check for minimal PDFs as they might process too fast
|
|
// assert!(ocr_result.processing_time_ms > 0);
|
|
assert!(
|
|
ocr_result.preprocessing_applied.iter().any(|s| s.contains("PDF text extraction")) ||
|
|
ocr_result.preprocessing_applied.iter().any(|s| s.contains("OCR via ocrmypdf")),
|
|
"Expected PDF processing method in preprocessing_applied: {:?}",
|
|
ocr_result.preprocessing_applied
|
|
);
|
|
println!("PDF extracted text: '{}'", ocr_result.text);
|
|
}
|
|
Err(e) => {
|
|
// PDF extraction might fail depending on the pdf-extract library
|
|
println!("PDF extraction failed (may be expected): {}", e);
|
|
}
|
|
}
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_pdf_size_limit() {
|
|
let temp_dir = create_temp_dir();
|
|
let temp_path = temp_dir.path().to_str().unwrap().to_string();
|
|
let file_service = create_test_file_service(&temp_path).await;
|
|
let service = EnhancedOcrService::new(temp_path, file_service);
|
|
let settings = create_test_settings();
|
|
|
|
let temp_file = NamedTempFile::with_suffix(".pdf").unwrap();
|
|
|
|
// Create a file larger than the 100MB PDF limit
|
|
// Using smaller size and explicit drop for CI environments
|
|
let large_pdf_content = format!("%PDF-1.4\n{}", "A".repeat(100 * 1024 * 1024 + 1024)); // 100MB + 1KB (just over the limit)
|
|
fs::write(temp_file.path(), &large_pdf_content).unwrap();
|
|
drop(large_pdf_content); // Explicitly free memory
|
|
|
|
let result = service
|
|
.extract_text(temp_file.path().to_str().unwrap(), "application/pdf", &settings)
|
|
.await;
|
|
|
|
assert!(result.is_err());
|
|
let error_msg = result.unwrap_err().to_string();
|
|
assert!(error_msg.contains("too large"));
|
|
}
|
|
|
|
/// Pins the OCR-related defaults of `Settings::default()`.
///
/// Nothing here awaits, so this is a plain synchronous `#[test]` and needs no async
/// runtime.
#[test]
fn test_settings_default_values() {
    let settings = Settings::default();

    // Numeric OCR defaults
    assert_eq!(settings.ocr_min_confidence, 30.0);
    assert_eq!(settings.ocr_dpi, 300);
    assert_eq!(settings.ocr_page_segmentation_mode, 3);
    assert_eq!(settings.ocr_engine_mode, 3);

    // Feature toggles that are expected to be on by default
    assert!(settings.enable_background_ocr);
    assert!(settings.ocr_enhance_contrast);
    assert!(settings.ocr_remove_noise);
    assert!(settings.ocr_detect_orientation);
}
|
|
|
|
#[tokio::test]
|
|
async fn test_concurrent_ocr_processing() {
|
|
let temp_dir = create_temp_dir();
|
|
let temp_path = temp_dir.path().to_str().unwrap().to_string();
|
|
let file_service = create_test_file_service(&temp_path).await;
|
|
let service = EnhancedOcrService::new(temp_path, file_service);
|
|
let settings = create_test_settings();
|
|
|
|
let mut handles = vec![];
|
|
|
|
// Process multiple files concurrently
|
|
for i in 0..5 {
|
|
let temp_file = NamedTempFile::with_suffix(".txt").unwrap();
|
|
let content = format!("Concurrent test content {}", i);
|
|
fs::write(temp_file.path(), &content).unwrap();
|
|
|
|
let temp_path_clone = temp_dir.path().to_str().unwrap().to_string();
|
|
let file_service_clone = create_test_file_service(&temp_path_clone).await;
|
|
let service_clone = EnhancedOcrService::new(temp_path_clone, file_service_clone);
|
|
let settings_clone = settings.clone();
|
|
let file_path = temp_file.path().to_str().unwrap().to_string();
|
|
|
|
let handle = tokio::spawn(async move {
|
|
let result = service_clone
|
|
.extract_text(&file_path, "text/plain", &settings_clone)
|
|
.await;
|
|
|
|
// Keep temp_file alive until task completes
|
|
drop(temp_file);
|
|
result
|
|
});
|
|
|
|
handles.push(handle);
|
|
}
|
|
|
|
// Wait for all tasks to complete
|
|
let results = futures::future::join_all(handles).await;
|
|
|
|
// All tasks should succeed
|
|
for (i, result) in results.into_iter().enumerate() {
|
|
assert!(result.is_ok(), "Task {} failed", i);
|
|
let ocr_result = result.unwrap().unwrap();
|
|
assert!(ocr_result.text.contains(&format!("Concurrent test content {}", i)));
|
|
assert_eq!(ocr_result.confidence, 100.0);
|
|
}
|
|
}
|
|
|
|
// New validation tests for updated OCR validation logic
|
|
|
|
/// A result with confidence 4.9 fails `validate_ocr_quality` with an error mentioning
/// "critically low".
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_validate_ocr_quality_below_hard_minimum() {
    let workspace = create_temp_dir();
    let workspace_path = workspace.path().to_str().unwrap().to_owned();
    let files = create_test_file_service(&workspace_path).await;
    let ocr = EnhancedOcrService::new(workspace_path, files);
    let defaults = create_test_settings();

    // Confidence just under the 5% hard minimum: expected to be rejected as
    // critically low/corrupted.
    let candidate = OcrResult {
        text: String::from("Some text"),
        confidence: 4.9,
        processing_time_ms: 1000,
        word_count: 2,
        preprocessing_applied: Vec::new(),
        processed_image_path: None,
    };

    let verdict = ocr.validate_ocr_quality(&candidate, &defaults);
    assert!(verdict.is_err(), "Expected validation to fail for confidence below hard minimum");

    let reason = verdict.unwrap_err();
    assert!(
        reason.contains("critically low"),
        "Expected 'critically low' in error message, got: {}",
        reason
    );
}
|
|
|
|
/// A result with confidence exactly 5.0 passes `validate_ocr_quality`: the hard
/// minimum is inclusive.
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_validate_ocr_quality_at_hard_minimum_boundary() {
    let workspace = create_temp_dir();
    let workspace_path = workspace.path().to_str().unwrap().to_owned();
    let files = create_test_file_service(&workspace_path).await;
    let ocr = EnhancedOcrService::new(workspace_path, files);
    let defaults = create_test_settings();

    // Boundary case: confidence sits exactly on the 5% hard minimum and should be accepted.
    let candidate = OcrResult {
        text: String::from("Boundary test text"),
        confidence: 5.0,
        processing_time_ms: 1000,
        word_count: 3,
        preprocessing_applied: Vec::new(),
        processed_image_path: None,
    };

    let verdict = ocr.validate_ocr_quality(&candidate, &defaults);
    assert!(
        verdict.is_ok(),
        "Expected validation to pass at hard minimum boundary (5%)"
    );
}
|
|
|
|
/// An invoice-like text dominated by digits and currency symbols passes
/// `validate_ocr_quality`, because letters and digits together make up at least 10%
/// of its characters.
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_validate_ocr_quality_numeric_document() {
    let temp_dir = create_temp_dir();
    let temp_path = temp_dir.path().to_str().unwrap().to_string();
    let file_service = create_test_file_service(&temp_path).await;
    let service = EnhancedOcrService::new(temp_path, file_service);
    let settings = create_test_settings();

    // Test invoice/receipt with lots of digits
    // Should be accepted because digits count as valuable content (letters+digits >= 10%)
    let result = OcrResult {
        text: "Invoice #12345\n$1,234.56\n$2,345.67\nTotal: $3,580.23\n!!!".to_string(),
        confidence: 60.0,
        processing_time_ms: 1000,
        word_count: 5,
        preprocessing_applied: vec![],
        processed_image_path: None,
    };

    // Sanity-check the test data itself. Both counts are in characters (`str::len`
    // would count bytes), so the ratio stays meaningful for non-ASCII text too.
    let content_count = result.text.chars().filter(|c| c.is_alphanumeric()).count();
    let total_chars = result.text.chars().count();
    let content_ratio = content_count as f32 / total_chars as f32;
    assert!(content_ratio >= 0.10, "Test data should have >=10% content, got {:.1}%", content_ratio * 100.0);

    let validation_result = service.validate_ocr_quality(&result, &settings);
    assert!(validation_result.is_ok(),
        "Expected validation to pass for numeric document with {:.1}% content", content_ratio * 100.0);
}
|
|
|
|
/// A 100-character text made of 30 digits followed by 70 letters (100% content)
/// passes `validate_ocr_quality`.
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_validate_ocr_quality_numeric_document_boundary() {
    let temp_dir = create_temp_dir();
    let temp_path = temp_dir.path().to_str().unwrap().to_string();
    let file_service = create_test_file_service(&temp_path).await;
    let service = EnhancedOcrService::new(temp_path, file_service);
    let settings = create_test_settings();

    // Test document with 30% digits and 70% letters (100% content)
    // Should easily pass since content ratio = 100%
    // Built with `repeat` so the 30/70 split is visible: "1234567890" x 3, then 'A' x 70.
    let result = OcrResult {
        text: format!("{}{}", "1234567890".repeat(3), "A".repeat(70)),
        confidence: 60.0,
        processing_time_ms: 1000,
        word_count: 2,
        preprocessing_applied: vec![],
        processed_image_path: None,
    };

    // Verify 100% content (30% digits + 70% letters). Both counts are in characters
    // (`str::len` would count bytes).
    let content_count = result.text.chars().filter(|c| c.is_alphanumeric()).count();
    let total_chars = result.text.chars().count();
    let content_ratio = content_count as f32 / total_chars as f32;
    assert_eq!(content_count, 100, "Test data should have 100% content");
    assert_eq!(total_chars, 100, "Test data should have exactly 100 chars");
    assert!((content_ratio - 1.0).abs() < 0.01, "Should have 100% content, got {:.1}%", content_ratio * 100.0);

    let validation_result = service.validate_ocr_quality(&result, &settings);
    // Should pass easily with 100% content (letters + digits)
    assert!(validation_result.is_ok(),
        "Expected validation to pass with 100% content");
}
|
|
|
|
/// Text with exactly 10% letters/digits (1 letter and 9 symbols) passes
/// `validate_ocr_quality`: the content threshold is inclusive.
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_validate_ocr_quality_alphanumeric_boundary() {
    let temp_dir = create_temp_dir();
    let temp_path = temp_dir.path().to_str().unwrap().to_string();
    let file_service = create_test_file_service(&temp_path).await;
    let service = EnhancedOcrService::new(temp_path, file_service);
    let settings = create_test_settings();

    // Test text with exactly 10% content (letters+digits) - boundary case
    // 1 letter + 9 symbols = 10 total chars = 10% content
    let result = OcrResult {
        text: format!("a{}", "!".repeat(9)),
        confidence: 60.0,
        processing_time_ms: 1000,
        word_count: 1,
        preprocessing_applied: vec![],
        processed_image_path: None,
    };

    // Verify exactly 10% content (letters+digits). Both counts are in characters
    // (`str::len` would count bytes).
    let content_count = result.text.chars().filter(|c| c.is_alphanumeric()).count();
    let total_chars = result.text.chars().count();
    let content_ratio = content_count as f32 / total_chars as f32;
    assert_eq!(content_count, 1, "Test data should have exactly 1 content char");
    assert_eq!(total_chars, 10, "Test data should have exactly 10 chars");
    assert!((content_ratio - 0.1).abs() < 0.01, "Should have exactly 10% content, got {:.1}%", content_ratio * 100.0);

    let validation_result = service.validate_ocr_quality(&result, &settings);
    assert!(validation_result.is_ok(),
        "Expected validation to pass at 10% content boundary");
}
|
|
|
|
/// Text with under 10% letters/digits (1 letter and 14 symbols, about 6.7%) fails
/// `validate_ocr_quality` with an error mentioning "meaningful content".
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_validate_ocr_quality_below_alphanumeric_threshold() {
    let temp_dir = create_temp_dir();
    let temp_path = temp_dir.path().to_str().unwrap().to_string();
    let file_service = create_test_file_service(&temp_path).await;
    let service = EnhancedOcrService::new(temp_path, file_service);
    let settings = create_test_settings();

    // Test text with <10% content (letters+digits) - pure garbage
    // 1 letter + 14 symbols = 15 total chars = 6.67% content
    let result = OcrResult {
        text: format!("a{}", "!".repeat(14)),
        confidence: 60.0,
        processing_time_ms: 1000,
        word_count: 1,
        preprocessing_applied: vec![],
        processed_image_path: None,
    };

    // Verify <10% content (letters+digits). Both counts are in characters
    // (`str::len` would count bytes).
    let content_count = result.text.chars().filter(|c| c.is_alphanumeric()).count();
    let total_chars = result.text.chars().count();
    let content_ratio = content_count as f32 / total_chars as f32;
    assert!(content_ratio < 0.10, "Test data should have <10% content, got {:.1}%", content_ratio * 100.0);

    let validation_result = service.validate_ocr_quality(&result, &settings);
    assert!(validation_result.is_err(),
        "Expected validation to fail for <10% content");

    let error_msg = validation_result.unwrap_err();
    assert!(error_msg.contains("meaningful content"),
        "Expected error about meaningful content, got: {}", error_msg);
}
|
|
|
|
/// Completely empty text fails `validate_ocr_quality` with the "no characters" error
/// rather than the "no words" one.
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_validate_ocr_quality_empty_text() {
    let workspace = create_temp_dir();
    let workspace_path = workspace.path().to_str().unwrap().to_owned();
    let files = create_test_file_service(&workspace_path).await;
    let ocr = EnhancedOcrService::new(workspace_path, files);
    let defaults = create_test_settings();

    // Zero characters at all, so the character check is expected to reject the
    // result before any word check applies.
    let candidate = OcrResult {
        text: String::new(),
        confidence: 60.0,
        processing_time_ms: 1000,
        word_count: 0,
        preprocessing_applied: Vec::new(),
        processed_image_path: None,
    };

    let verdict = ocr.validate_ocr_quality(&candidate, &defaults);
    assert!(verdict.is_err(), "Expected validation to fail for empty text");

    let reason = verdict.unwrap_err();
    assert!(
        reason.contains("no characters"),
        "Expected error about 'no characters' (not 'no words'), got: {}",
        reason
    );
}
|
|
|
|
/// Whitespace-only text fails `validate_ocr_quality` with the "No words" error
/// rather than the "no characters" one.
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_validate_ocr_quality_whitespace_only() {
    let workspace = create_temp_dir();
    let workspace_path = workspace.path().to_str().unwrap().to_owned();
    let files = create_test_file_service(&workspace_path).await;
    let ocr = EnhancedOcrService::new(workspace_path, files);
    let defaults = create_test_settings();

    // The text has characters, but none of them form a word.
    let candidate = OcrResult {
        text: String::from(" \n\n\t\t"),
        confidence: 60.0,
        processing_time_ms: 1000,
        // Whitespace doesn't count as words
        word_count: 0,
        preprocessing_applied: Vec::new(),
        processed_image_path: None,
    };

    let verdict = ocr.validate_ocr_quality(&candidate, &defaults);
    assert!(verdict.is_err(), "Expected validation to fail for whitespace-only text");

    let reason = verdict.unwrap_err();
    assert!(
        reason.contains("No words"),
        "Expected error about 'No words' (not 'no characters'), got: {}",
        reason
    );
}
|
|
} |