701 lines
24 KiB
Rust
701 lines
24 KiB
Rust
#[cfg(test)]
|
|
mod tests {
|
|
use crate::ocr::OcrService;
|
|
use std::fs;
|
|
use std::path::Path;
|
|
use tempfile::NamedTempFile;
|
|
|
|
// Mock database for testing
|
|
mod mock_db {
|
|
use anyhow::Result;
|
|
use uuid::Uuid;
|
|
use std::sync::{Arc, Mutex};
|
|
use std::collections::HashMap;
|
|
|
|
#[derive(Clone)]
|
|
pub struct MockDatabase {
|
|
ocr_updates: Arc<Mutex<HashMap<Uuid, String>>>,
|
|
}
|
|
|
|
impl MockDatabase {
|
|
pub fn new() -> Self {
|
|
Self {
|
|
ocr_updates: Arc::new(Mutex::new(HashMap::new())),
|
|
}
|
|
}
|
|
|
|
pub async fn update_document_ocr(&self, id: Uuid, ocr_text: &str) -> Result<()> {
|
|
let mut updates = self.ocr_updates.lock().unwrap();
|
|
updates.insert(id, ocr_text.to_string());
|
|
Ok(())
|
|
}
|
|
|
|
pub fn get_ocr_text(&self, id: &Uuid) -> Option<String> {
|
|
let updates = self.ocr_updates.lock().unwrap();
|
|
updates.get(id).cloned()
|
|
}
|
|
|
|
pub fn get_all_ocr_updates(&self) -> HashMap<Uuid, String> {
|
|
let updates = self.ocr_updates.lock().unwrap();
|
|
updates.clone()
|
|
}
|
|
}
|
|
}
|
|
|
|
use mock_db::MockDatabase;
|
|
|
|
/// Checks `OcrService::is_image_file` against file names that should and
/// should not be classified as images.
#[test]
fn test_is_image_file() {
    let service = OcrService::new();

    // Names that must be recognised as images (note the upper-case "JPEG").
    let image_names = [
        "image.png",
        "photo.jpg",
        "picture.JPEG",
        "scan.tiff",
        "bitmap.bmp",
        "animation.gif",
    ];
    for &name in image_names.iter() {
        assert!(service.is_image_file(name), "expected image: {}", name);
    }

    // Names that must be rejected, including one without any extension.
    let other_names = ["document.pdf", "text.txt", "archive.zip", "noextension"];
    for &name in other_names.iter() {
        assert!(!service.is_image_file(name), "expected non-image: {}", name);
    }
}
|
|
|
|
#[tokio::test]
|
|
async fn test_extract_text_from_plain_text() {
|
|
let ocr_service = OcrService::new();
|
|
|
|
let temp_file = NamedTempFile::new().unwrap();
|
|
let test_content = "This is a test text file.\nWith multiple lines.";
|
|
fs::write(temp_file.path(), test_content).unwrap();
|
|
|
|
let result = ocr_service
|
|
.extract_text(temp_file.path().to_str().unwrap(), "text/plain")
|
|
.await;
|
|
|
|
assert!(result.is_ok());
|
|
let extracted_text = result.unwrap();
|
|
assert_eq!(extracted_text, test_content);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_extract_text_unsupported_type() {
|
|
let ocr_service = OcrService::new();
|
|
|
|
let temp_file = NamedTempFile::new().unwrap();
|
|
fs::write(temp_file.path(), "some content").unwrap();
|
|
|
|
let result = ocr_service
|
|
.extract_text(temp_file.path().to_str().unwrap(), "application/zip")
|
|
.await;
|
|
|
|
assert!(result.is_err());
|
|
assert!(result.unwrap_err().to_string().contains("Unsupported MIME type"));
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_extract_text_from_nonexistent_file() {
|
|
let ocr_service = OcrService::new();
|
|
|
|
let result = ocr_service
|
|
.extract_text("/path/to/nonexistent/file.txt", "text/plain")
|
|
.await;
|
|
|
|
assert!(result.is_err());
|
|
}
|
|
|
|
#[tokio::test]
|
|
#[ignore = "Requires tesseract runtime - run with: cargo test --release -- --ignored"]
|
|
async fn test_extract_text_with_real_image() {
|
|
let ocr_service = OcrService::new();
|
|
|
|
// Create a simple test image with text if it doesn't exist
|
|
let test_image_path = "test_data/hello_ocr.png";
|
|
|
|
// Skip test if test data doesn't exist
|
|
if !Path::new(test_image_path).exists() {
|
|
eprintln!("Skipping test_extract_text_with_real_image: test data not found");
|
|
return;
|
|
}
|
|
|
|
let result = ocr_service
|
|
.extract_text(test_image_path, "image/png")
|
|
.await;
|
|
|
|
match result {
|
|
Ok(text) => {
|
|
println!("OCR extracted text: '{}'", text);
|
|
// OCR might not be perfect, so we check if it contains expected words
|
|
assert!(text.to_lowercase().contains("hello") || text.to_lowercase().contains("ocr"));
|
|
}
|
|
Err(e) => {
|
|
eprintln!("OCR test failed: {}", e);
|
|
// Don't fail the test if OCR is not available
|
|
}
|
|
}
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_extract_text_from_pdf_with_content() {
|
|
let ocr_service = OcrService::new();
|
|
|
|
// Create a minimal valid PDF
|
|
let temp_file = NamedTempFile::with_suffix(".pdf").unwrap();
|
|
|
|
// This is a minimal PDF that says "Hello"
|
|
let pdf_content = b"%PDF-1.4
|
|
1 0 obj
|
|
<< /Type /Catalog /Pages 2 0 R >>
|
|
endobj
|
|
2 0 obj
|
|
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
|
|
endobj
|
|
3 0 obj
|
|
<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >>
|
|
endobj
|
|
4 0 obj
|
|
<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>
|
|
endobj
|
|
5 0 obj
|
|
<< /Length 44 >>
|
|
stream
|
|
BT
|
|
/F1 12 Tf
|
|
100 700 Td
|
|
(Hello) Tj
|
|
ET
|
|
endstream
|
|
endobj
|
|
xref
|
|
0 6
|
|
0000000000 65535 f
|
|
0000000009 00000 n
|
|
0000000058 00000 n
|
|
0000000115 00000 n
|
|
0000000262 00000 n
|
|
0000000341 00000 n
|
|
trailer
|
|
<< /Size 6 /Root 1 0 R >>
|
|
startxref
|
|
435
|
|
%%EOF";
|
|
|
|
fs::write(temp_file.path(), pdf_content).unwrap();
|
|
|
|
let result = ocr_service
|
|
.extract_text_from_pdf(temp_file.path().to_str().unwrap())
|
|
.await;
|
|
|
|
// The pdf-extract library might not work with our minimal PDF
|
|
// so we just check that it attempts to process it
|
|
match result {
|
|
Ok(text) => {
|
|
println!("PDF extracted text: '{}'", text);
|
|
}
|
|
Err(e) => {
|
|
println!("PDF extraction error (expected): {}", e);
|
|
}
|
|
}
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_extract_text_with_image_extension_fallback() {
|
|
let ocr_service = OcrService::new();
|
|
|
|
let temp_file = NamedTempFile::with_suffix(".png").unwrap();
|
|
fs::write(temp_file.path(), "fake image data").unwrap();
|
|
|
|
let result = ocr_service
|
|
.extract_text(temp_file.path().to_str().unwrap(), "unknown/type")
|
|
.await;
|
|
|
|
// This should try to process as image due to extension, but fail due to invalid data
|
|
assert!(result.is_err());
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_ocr_with_mock_database_integration() {
|
|
let ocr_service = OcrService::new();
|
|
let mock_db = MockDatabase::new();
|
|
let doc_id = uuid::Uuid::new_v4();
|
|
|
|
// Create a simple text file to simulate OCR processing
|
|
let temp_file = NamedTempFile::with_suffix(".txt").unwrap();
|
|
let test_content = "This is test OCR content for mock database integration.";
|
|
fs::write(temp_file.path(), test_content).unwrap();
|
|
|
|
// Extract text using OCR service
|
|
let result = ocr_service
|
|
.extract_text(temp_file.path().to_str().unwrap(), "text/plain")
|
|
.await;
|
|
|
|
assert!(result.is_ok());
|
|
let extracted_text = result.unwrap();
|
|
|
|
// Mock database update
|
|
let update_result = mock_db.update_document_ocr(doc_id, &extracted_text).await;
|
|
assert!(update_result.is_ok());
|
|
|
|
// Verify the text was stored in mock database
|
|
let stored_text = mock_db.get_ocr_text(&doc_id);
|
|
assert!(stored_text.is_some());
|
|
assert_eq!(stored_text.unwrap(), test_content);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_ocr_error_handling_with_mock_db() {
|
|
let ocr_service = OcrService::new();
|
|
let mock_db = MockDatabase::new();
|
|
let doc_id = uuid::Uuid::new_v4();
|
|
|
|
// Test with non-existent file
|
|
let result = ocr_service
|
|
.extract_text("/nonexistent/path/file.txt", "text/plain")
|
|
.await;
|
|
|
|
assert!(result.is_err());
|
|
|
|
// Verify no update was made to mock database for failed OCR
|
|
let stored_text = mock_db.get_ocr_text(&doc_id);
|
|
assert!(stored_text.is_none());
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_batch_ocr_processing_with_mock_db() {
|
|
let ocr_service = OcrService::new();
|
|
let mock_db = MockDatabase::new();
|
|
|
|
let mut doc_ids = Vec::new();
|
|
let mut temp_files = Vec::new();
|
|
|
|
// Create multiple test files
|
|
for i in 0..3 {
|
|
let temp_file = NamedTempFile::with_suffix(".txt").unwrap();
|
|
let content = format!("Test document {} content for batch processing.", i + 1);
|
|
fs::write(temp_file.path(), &content).unwrap();
|
|
|
|
let doc_id = uuid::Uuid::new_v4();
|
|
doc_ids.push(doc_id);
|
|
temp_files.push((temp_file, content));
|
|
}
|
|
|
|
// Process all files
|
|
for (i, (temp_file, _expected_content)) in temp_files.iter().enumerate() {
|
|
let result = ocr_service
|
|
.extract_text(temp_file.path().to_str().unwrap(), "text/plain")
|
|
.await;
|
|
|
|
assert!(result.is_ok());
|
|
let extracted_text = result.unwrap();
|
|
|
|
let update_result = mock_db.update_document_ocr(doc_ids[i], &extracted_text).await;
|
|
assert!(update_result.is_ok());
|
|
}
|
|
|
|
// Verify all documents were processed
|
|
let all_updates = mock_db.get_all_ocr_updates();
|
|
assert_eq!(all_updates.len(), 3);
|
|
|
|
for (i, doc_id) in doc_ids.iter().enumerate() {
|
|
let stored_text = all_updates.get(doc_id);
|
|
assert!(stored_text.is_some());
|
|
assert!(stored_text.unwrap().contains(&format!("Test document {}", i + 1)));
|
|
}
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_ocr_language_support() {
|
|
let ocr_service = OcrService::new();
|
|
|
|
let temp_file = NamedTempFile::with_suffix(".txt").unwrap();
|
|
let test_content = "Hello world test content";
|
|
fs::write(temp_file.path(), test_content).unwrap();
|
|
|
|
// Test different language codes
|
|
let languages = vec!["eng", "spa", "fra", "deu"];
|
|
|
|
for lang in languages {
|
|
let result = ocr_service
|
|
.extract_text_with_lang(temp_file.path().to_str().unwrap(), "text/plain", lang)
|
|
.await;
|
|
|
|
// Should succeed for text files regardless of language setting
|
|
assert!(result.is_ok());
|
|
let extracted = result.unwrap();
|
|
assert_eq!(extracted, test_content);
|
|
}
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_ocr_mime_type_detection() {
|
|
let ocr_service = OcrService::new();
|
|
|
|
// Test various mime types
|
|
let test_cases = vec![
|
|
("test.txt", "text/plain"),
|
|
("document.pdf", "application/pdf"),
|
|
("image.png", "image/png"),
|
|
("photo.jpg", "image/jpeg"),
|
|
("scan.tiff", "image/tiff"),
|
|
];
|
|
|
|
for (filename, mime_type) in test_cases {
|
|
let temp_file = NamedTempFile::with_suffix(&Path::new(filename).extension().unwrap().to_str().unwrap()).unwrap();
|
|
|
|
if mime_type == "text/plain" {
|
|
fs::write(temp_file.path(), "test content").unwrap();
|
|
|
|
let result = ocr_service
|
|
.extract_text(temp_file.path().to_str().unwrap(), mime_type)
|
|
.await;
|
|
|
|
assert!(result.is_ok(), "Failed for mime type: {}", mime_type);
|
|
} else {
|
|
// For non-text files, we expect either success or specific errors
|
|
let result = ocr_service
|
|
.extract_text(temp_file.path().to_str().unwrap(), mime_type)
|
|
.await;
|
|
|
|
// These will likely fail with our test setup, but should not panic
|
|
if result.is_err() {
|
|
println!("Expected failure for {}: {}", mime_type, result.unwrap_err());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Exercises `MockDatabase` on its own: empty lookup, one insert, then a
/// second insert and a full snapshot.
#[test]
fn test_mock_database_functionality() {
    let db = MockDatabase::new();
    let first_id = uuid::Uuid::new_v4();
    let second_id = uuid::Uuid::new_v4();

    // Nothing is stored yet.
    assert!(db.get_ocr_text(&first_id).is_none());

    // This is a synchronous test, so the async update is driven by a
    // dedicated runtime.
    let runtime = tokio::runtime::Runtime::new().unwrap();

    // First update.
    let first_outcome = runtime.block_on(db.update_document_ocr(first_id, "Test OCR text"));
    assert!(first_outcome.is_ok());
    assert_eq!(db.get_ocr_text(&first_id).unwrap(), "Test OCR text");

    // Second update under a different id.
    let second_outcome = runtime.block_on(db.update_document_ocr(second_id, "Another OCR text"));
    assert!(second_outcome.is_ok());

    let snapshot = db.get_all_ocr_updates();
    assert_eq!(snapshot.len(), 2);
    for id in [first_id, second_id].iter() {
        assert!(snapshot.contains_key(id));
    }
}
|
|
|
|
/// Test that malformed PDFs don't crash the OCR system
|
|
#[tokio::test]
|
|
async fn test_malformed_pdf_panic_handling() {
|
|
let ocr_service = OcrService::new();
|
|
|
|
// Create a malformed PDF in memory that will cause pdf-extract to panic
|
|
let malformed_pdf_content = b"%PDF-1.4
|
|
1 0 obj
|
|
<< /Type /Catalog /Pages 2 0 R >>
|
|
endobj
|
|
2 0 obj
|
|
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
|
|
endobj
|
|
3 0 obj
|
|
<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >>
|
|
endobj
|
|
4 0 obj
|
|
<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>
|
|
endobj
|
|
5 0 obj
|
|
<< /Length 999 >>
|
|
stream
|
|
BT
|
|
/F1 12 Tf
|
|
100 700 Td
|
|
(This is a malformed PDF with invalid content stream) Tj
|
|
ET
|
|
INVALID_CONTENT_STREAM_DATA_THAT_CAUSES_PANIC
|
|
endstream
|
|
endobj
|
|
xref
|
|
0 6
|
|
0000000000 65535 f
|
|
0000000009 00000 n
|
|
0000000058 00000 n
|
|
0000000115 00000 n
|
|
0000000262 00000 n
|
|
0000000341 00000 n
|
|
trailer
|
|
<< /Size 6 /Root 1 0 R >>
|
|
startxref
|
|
999
|
|
%%EOF";
|
|
|
|
// Write to temporary file
|
|
let temp_file = NamedTempFile::with_suffix(".pdf").unwrap();
|
|
std::fs::write(temp_file.path(), malformed_pdf_content).unwrap();
|
|
|
|
let result = ocr_service.extract_text_from_pdf(temp_file.path().to_str().unwrap()).await;
|
|
// With ocrmypdf, this should now succeed gracefully (more robust than pdf-extract)
|
|
// or return a descriptive error - either is acceptable
|
|
match result {
|
|
Ok(text) => {
|
|
println!("Successfully extracted text from malformed PDF: '{}'", text);
|
|
// OCRmyPDF is more robust and can handle some malformed PDFs
|
|
}
|
|
Err(e) => {
|
|
println!("Error extracting from malformed PDF: {}", e);
|
|
// Should contain descriptive error message if it fails
|
|
let error_msg = e.to_string();
|
|
assert!(
|
|
error_msg.contains("ocrmypdf") ||
|
|
error_msg.contains("extraction") ||
|
|
error_msg.contains("InputFileError") ||
|
|
error_msg.contains("Failed to extract")
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_corrupted_pdf_structure_handling() {
|
|
let ocr_service = OcrService::new();
|
|
|
|
// Create a corrupted PDF structure that will cause pdf-extract to fail
|
|
let corrupted_pdf_content = b"%PDF-1.4
|
|
1 0 obj
|
|
<< /Type /Catalog /Pages 2 0 R >>
|
|
endobj
|
|
2 0 obj
|
|
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
|
|
endobj
|
|
3 0 obj
|
|
<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >>
|
|
endobj
|
|
4 0 obj
|
|
<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>
|
|
endobj
|
|
5 0 obj
|
|
<< /Length 44 >>
|
|
stream
|
|
BT
|
|
/F1 12 Tf
|
|
100 700 Td
|
|
(Corrupted PDF) Tj
|
|
ET
|
|
endstream
|
|
endobj
|
|
xref
|
|
0 6
|
|
0000000000 65535 f
|
|
0000000009 00000 n
|
|
0000000058 00000 n
|
|
0000000115 00000 n
|
|
0000000262 00000 n
|
|
0000000341 00000 n
|
|
trailer
|
|
<< /Size 6 /Root 1 0 R /InvalidKey >>
|
|
startxref
|
|
999999
|
|
%%EOF";
|
|
|
|
let temp_file = NamedTempFile::with_suffix(".pdf").unwrap();
|
|
std::fs::write(temp_file.path(), corrupted_pdf_content).unwrap();
|
|
|
|
let result = ocr_service.extract_text_from_pdf(temp_file.path().to_str().unwrap()).await;
|
|
// Should not panic, should return an error instead
|
|
assert!(result.is_err(), "Expected error for corrupted PDF");
|
|
let error_msg = result.unwrap_err().to_string();
|
|
println!("Corrupted PDF error: {}", error_msg);
|
|
// Should contain descriptive error message
|
|
assert!(
|
|
error_msg.contains("panic") ||
|
|
error_msg.contains("corrupted") ||
|
|
error_msg.contains("extract") ||
|
|
error_msg.contains("PDF") ||
|
|
error_msg.contains("Failed to extract")
|
|
);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_invalid_font_encoding_handling() {
|
|
let ocr_service = OcrService::new();
|
|
|
|
// Test with invalid font encoding
|
|
let invalid_font = "tests/test_pdfs/invalid_font_encoding.pdf";
|
|
if Path::new(invalid_font).exists() {
|
|
let result = ocr_service.extract_text_from_pdf(invalid_font).await;
|
|
// Should not panic, should return an error instead
|
|
assert!(result.is_err());
|
|
let error_msg = result.unwrap_err().to_string();
|
|
// Should contain descriptive error message
|
|
assert!(
|
|
error_msg.contains("panic") ||
|
|
error_msg.contains("font") ||
|
|
error_msg.contains("encoding") ||
|
|
error_msg.contains("extract")
|
|
);
|
|
}
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_fake_pdf_handling() {
|
|
let ocr_service = OcrService::new();
|
|
|
|
// Create a fake PDF file (not actually a PDF) that will definitely cause an error
|
|
let fake_pdf_content = b"This is not a PDF file at all, just plain text with a PDF extension.
|
|
It should cause pdf-extract to fail when trying to parse it.
|
|
This tests the error handling for files that aren't actually PDFs.";
|
|
|
|
let temp_file = NamedTempFile::with_suffix(".pdf").unwrap();
|
|
std::fs::write(temp_file.path(), fake_pdf_content).unwrap();
|
|
|
|
let result = ocr_service.extract_text_from_pdf(temp_file.path().to_str().unwrap()).await;
|
|
// Should not panic, should return an error instead
|
|
assert!(result.is_err(), "Expected error for fake PDF");
|
|
let error_msg = result.unwrap_err().to_string();
|
|
println!("Fake PDF error: {}", error_msg);
|
|
// Should contain descriptive error message about parsing failure
|
|
assert!(
|
|
error_msg.contains("extract") ||
|
|
error_msg.contains("parse") ||
|
|
error_msg.contains("PDF") ||
|
|
error_msg.contains("format") ||
|
|
error_msg.contains("Failed to extract")
|
|
);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_problematic_encoding_pdf_handling() {
|
|
let ocr_service = OcrService::new();
|
|
|
|
// Test with the existing problematic encoding PDF
|
|
let problematic_encoding = "tests/test_pdfs/problematic_encoding.pdf";
|
|
if Path::new(problematic_encoding).exists() {
|
|
let result = ocr_service.extract_text_from_pdf(problematic_encoding).await;
|
|
// With ocrmypdf, this may succeed gracefully or return descriptive error
|
|
match result {
|
|
Ok(text) => {
|
|
println!("Successfully extracted text from problematic encoding PDF: '{}'", text);
|
|
// OCRmyPDF's robustness allows it to handle some problematic encoding PDFs
|
|
}
|
|
Err(e) => {
|
|
println!("Error extracting from problematic encoding PDF: {}", e);
|
|
let error_msg = e.to_string();
|
|
assert!(
|
|
error_msg.contains("ocrmypdf") ||
|
|
error_msg.contains("extraction") ||
|
|
error_msg.contains("strategies") ||
|
|
error_msg.contains("Failed to extract")
|
|
);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Test that the enhanced OCR service also handles panics correctly
|
|
#[tokio::test]
|
|
async fn test_enhanced_ocr_panic_handling() {
|
|
use crate::ocr::enhanced::EnhancedOcrService;
|
|
use crate::services::file_service::FileService;
|
|
use crate::models::Settings;
|
|
|
|
let ocr_service = EnhancedOcrService::new("tests".to_string());
|
|
let settings = Settings::default();
|
|
|
|
// Test all malformed PDFs with enhanced OCR
|
|
let test_files = vec![
|
|
"tests/test_pdfs/malformed_content_stream.pdf",
|
|
"tests/test_pdfs/corrupted_structure.pdf",
|
|
"tests/test_pdfs/invalid_font_encoding.pdf",
|
|
"tests/test_pdfs/fake_pdf.pdf",
|
|
"tests/test_pdfs/problematic_encoding.pdf",
|
|
];
|
|
|
|
for test_file in test_files {
|
|
if Path::new(test_file).exists() {
|
|
let result = ocr_service.extract_text_with_context(
|
|
test_file,
|
|
"application/pdf",
|
|
&Path::new(test_file).file_name().unwrap().to_str().unwrap(),
|
|
1024, // file_size
|
|
&settings
|
|
).await;
|
|
|
|
// Should not panic, should return an error instead
|
|
assert!(result.is_err(), "Expected error for file: {}", test_file);
|
|
let error_msg = result.unwrap_err().to_string();
|
|
|
|
// Should contain descriptive error message
|
|
assert!(
|
|
error_msg.contains("panic") ||
|
|
error_msg.contains("extract") ||
|
|
error_msg.contains("PDF") ||
|
|
error_msg.contains("corrupted") ||
|
|
error_msg.contains("encoding") ||
|
|
error_msg.contains("font"),
|
|
"Error message should be descriptive for {}: {}", test_file, error_msg
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Test that panic handling works correctly in concurrent scenarios
|
|
#[tokio::test]
|
|
async fn test_concurrent_pdf_panic_handling() {
|
|
use std::sync::Arc;
|
|
use futures::future::join_all;
|
|
|
|
let ocr_service = Arc::new(OcrService::new());
|
|
let mut handles = Vec::new();
|
|
|
|
// Test concurrent processing of malformed PDFs
|
|
let test_files = vec![
|
|
"tests/test_pdfs/malformed_content_stream.pdf",
|
|
"tests/test_pdfs/corrupted_structure.pdf",
|
|
"tests/test_pdfs/invalid_font_encoding.pdf",
|
|
"tests/test_pdfs/fake_pdf.pdf",
|
|
];
|
|
|
|
for test_file in test_files {
|
|
if Path::new(test_file).exists() {
|
|
let ocr_service_clone = Arc::clone(&ocr_service);
|
|
let test_file_owned = test_file.to_string();
|
|
|
|
let handle = tokio::spawn(async move {
|
|
let result = ocr_service_clone.extract_text_from_pdf(&test_file_owned).await;
|
|
// Should not panic, should return an error instead
|
|
assert!(result.is_err(), "Expected error for file: {}", test_file_owned);
|
|
let error_msg = result.unwrap_err().to_string();
|
|
|
|
// Should contain descriptive error message
|
|
assert!(
|
|
error_msg.contains("panic") ||
|
|
error_msg.contains("extract") ||
|
|
error_msg.contains("PDF") ||
|
|
error_msg.contains("corrupted") ||
|
|
error_msg.contains("encoding"),
|
|
"Error message should be descriptive for {}: {}", test_file_owned, error_msg
|
|
);
|
|
});
|
|
|
|
handles.push(handle);
|
|
}
|
|
}
|
|
|
|
// Wait for all concurrent tasks to complete
|
|
let results = join_all(handles).await;
|
|
|
|
// Verify all tasks completed without panicking
|
|
for result in results {
|
|
assert!(result.is_ok(), "Task should complete without panicking");
|
|
}
|
|
}
|
|
} |