Readur/tests/integration_test_image_ocr_...

315 lines
11 KiB
Rust

//! Integration tests for OCR processing using real test images
//!
//! This test suite uses the actual test images from tests/test_images/
//! to verify OCR functionality with known content.
use readur::ocr::OcrService;
use std::path::Path;
/// Simple test image information
#[derive(Debug, Clone)]
struct TestImage {
filename: &'static str,
path: String,
mime_type: &'static str,
expected_content: &'static str,
}
impl TestImage {
fn new(filename: &'static str, mime_type: &'static str, expected_content: &'static str) -> Self {
Self {
filename,
path: format!("tests/test_images/{}", filename),
mime_type,
expected_content,
}
}
fn exists(&self) -> bool {
Path::new(&self.path).exists()
}
async fn load_data(&self) -> Result<Vec<u8>, std::io::Error> {
tokio::fs::read(&self.path).await
}
}
/// Get available test images (only those that exist)
fn get_available_test_images() -> Vec<TestImage> {
let all_images = vec![
TestImage::new("test1.png", "image/png", "Test 1\nThis is some text from text 1"),
TestImage::new("test2.jpg", "image/jpeg", "Test 2\nThis is some text from text 2"),
TestImage::new("test3.jpeg", "image/jpeg", "Test 3\nThis is some text from text 3"),
TestImage::new("test4.png", "image/png", "Test 4\nThis is some text from text 4"),
TestImage::new("test5.jpg", "image/jpeg", "Test 5\nThis is some text from text 5"),
];
all_images.into_iter().filter(|img| img.exists()).collect()
}
#[tokio::test]
async fn test_ocr_with_all_available_test_images() {
let available_images = get_available_test_images();
if available_images.is_empty() {
println!("No test images found - skipping OCR tests");
return;
}
println!("Testing OCR with {} available test images", available_images.len());
for test_image in available_images {
println!("Testing OCR with {}", test_image.filename);
// Load the image data
let image_data = match test_image.load_data().await {
Ok(data) => data,
Err(e) => {
println!("Failed to load {}: {}", test_image.filename, e);
continue;
}
};
// Create a temporary file for OCR processing
let temp_path = format!("./temp_test_{}", test_image.filename);
if let Err(e) = tokio::fs::write(&temp_path, &image_data).await {
println!("Failed to write temp file for {}: {}", test_image.filename, e);
continue;
}
// Test OCR processing
let ocr_service = OcrService::new();
let result = ocr_service.extract_text(&temp_path, test_image.mime_type).await;
// Clean up temp file
let _ = tokio::fs::remove_file(&temp_path).await;
match result {
Ok(extracted_text) => {
println!("✅ OCR Success for {}: '{}'", test_image.filename, extracted_text);
// Verify the extracted text contains expected content
let normalized_extracted = extracted_text.trim().to_lowercase();
let normalized_expected = test_image.expected_content.trim().to_lowercase();
// Check for key parts of expected content
let test_number = test_image.filename.chars()
.filter(|c| c.is_numeric())
.collect::<String>();
if !test_number.is_empty() {
assert!(
normalized_extracted.contains(&format!("test {}", test_number)) ||
normalized_extracted.contains(&test_number),
"OCR result '{}' should contain test number '{}' for image {}",
extracted_text, test_number, test_image.filename
);
}
// Check for presence of "text" keyword
assert!(
normalized_extracted.contains("text") || normalized_extracted.contains("some"),
"OCR result '{}' should contain expected text content for image {}",
extracted_text, test_image.filename
);
}
Err(e) => {
println!("⚠️ OCR Failed for {}: {}", test_image.filename, e);
// Don't fail the test immediately - log the error but continue
// This allows us to see which images work and which don't
}
}
}
}
#[tokio::test]
async fn test_ocr_with_specific_test_images() {
// Test specific images that should definitely work
let test_cases = vec![1, 2, 3]; // Test with first 3 images
let available_images = get_available_test_images();
for test_num in test_cases {
let test_image = match available_images.get(test_num - 1) {
Some(img) => img.clone(),
None => continue,
};
if !test_image.exists() {
println!("Skipping test{}: file not found", test_num);
continue;
}
println!("Running OCR test for {}", test_image.filename);
// Load image data
let image_data = test_image.load_data().await
.expect("Should be able to load test image");
assert!(!image_data.is_empty(), "Test image should not be empty");
// Verify file format based on MIME type
match test_image.mime_type {
"image/png" => {
assert!(image_data.starts_with(&[0x89, 0x50, 0x4E, 0x47]),
"PNG file should start with PNG signature");
}
"image/jpeg" => {
assert!(image_data.starts_with(&[0xFF, 0xD8, 0xFF]),
"JPEG file should start with JPEG signature");
}
_ => {}
}
println!("Image {} loaded successfully: {} bytes, type: {}",
test_image.filename, image_data.len(), test_image.mime_type);
}
}
#[tokio::test]
async fn test_ocr_error_handling_with_corrupted_image() {
// Create a corrupted image file
let corrupted_data = vec![0xFF; 100]; // Invalid image data
let temp_path = "./temp_corrupted_test.png";
tokio::fs::write(temp_path, &corrupted_data).await
.expect("Should be able to write corrupted test file");
let ocr_service = OcrService::new();
let result = ocr_service.extract_text(temp_path, "image/png").await;
// Clean up
let _ = tokio::fs::remove_file(temp_path).await;
// Should handle the error gracefully
match result {
Ok(text) => {
println!("Unexpected success with corrupted image: '{}'", text);
// Some OCR systems might return empty text instead of error
}
Err(e) => {
println!("Expected error with corrupted image: {}", e);
// This is the expected behavior
}
}
}
#[tokio::test]
async fn test_multiple_image_formats() {
let images = get_available_test_images();
let mut png_count = 0;
let mut jpeg_count = 0;
for image in &images {
match image.mime_type {
"image/png" => png_count += 1,
"image/jpeg" => jpeg_count += 1,
_ => {}
}
}
println!("Available test images: {} PNG, {} JPEG", png_count, jpeg_count);
// Ensure we have at least one of each format for comprehensive testing
if png_count > 0 && jpeg_count > 0 {
println!("✅ Both PNG and JPEG formats available for testing");
} else {
println!("⚠️ Limited format coverage: PNG={}, JPEG={}", png_count, jpeg_count);
}
// Test at least one of each format if available
for image in images.iter().take(2) {
if image.exists() {
println!("Testing format: {} ({})", image.mime_type, image.filename);
let image_data = image.load_data().await
.expect("Should load test image");
assert!(!image_data.is_empty(), "Image data should not be empty");
assert!(image_data.len() > 100, "Image should be reasonably sized");
}
}
}
#[tokio::test]
#[ignore = "Long running test - run with: cargo test test_ocr_performance -- --ignored"]
async fn test_ocr_performance_with_test_images() {
let available_images = get_available_test_images();
if available_images.is_empty() {
println!("No test images available for performance testing");
return;
}
let start_time = std::time::Instant::now();
let mut successful_ocr = 0;
let mut failed_ocr = 0;
for test_image in available_images {
let image_start = std::time::Instant::now();
// Load image
let image_data = match test_image.load_data().await {
Ok(data) => data,
Err(_) => {
failed_ocr += 1;
continue;
}
};
// Write to temp file
let temp_path = format!("./temp_perf_{}", test_image.filename);
if tokio::fs::write(&temp_path, &image_data).await.is_err() {
failed_ocr += 1;
continue;
}
// Run OCR
let ocr_service = OcrService::new();
let result = ocr_service.extract_text(&temp_path, test_image.mime_type).await;
// Clean up
let _ = tokio::fs::remove_file(&temp_path).await;
let duration = image_start.elapsed();
match result {
Ok(text) => {
successful_ocr += 1;
println!("{} processed in {:?}: '{}'",
test_image.filename, duration, text.chars().take(50).collect::<String>());
}
Err(e) => {
failed_ocr += 1;
println!("{} failed in {:?}: {}",
test_image.filename, duration, e);
}
}
}
let total_duration = start_time.elapsed();
let total_images = successful_ocr + failed_ocr;
println!("\n📊 OCR Performance Summary:");
println!("Total images: {}", total_images);
println!("Successful: {}", successful_ocr);
println!("Failed: {}", failed_ocr);
println!("Total time: {:?}", total_duration);
if total_images > 0 {
println!("Average time per image: {:?}", total_duration / total_images);
let success_rate = (successful_ocr as f64 / total_images as f64) * 100.0;
println!("Success rate: {:.1}%", success_rate);
}
// Performance assertions
if successful_ocr > 0 {
let avg_time_per_image = total_duration / successful_ocr;
assert!(avg_time_per_image.as_secs() < 30,
"OCR should complete within 30 seconds per image on average");
}
}