feat(ocr): gracefully handle all known kinds of problematic PDFs and add regression tests to prevent recurrence
This commit is contained in:
parent
ad27d99949
commit
6165148e4d
|
|
@ -839,6 +839,7 @@ impl EnhancedOcrService {
|
|||
let text = match extraction_result {
|
||||
Ok(Ok(Ok(Ok(text)))) => text,
|
||||
Ok(Ok(Ok(Err(e)))) => {
|
||||
warn!("PDF text extraction failed for file '{}' (size: {} bytes): {}", file_path, file_size, e);
|
||||
return Err(anyhow!(
|
||||
"PDF text extraction failed for file '{}' (size: {} bytes): {}. This may indicate a corrupted or unsupported PDF format.",
|
||||
file_path, file_size, e
|
||||
|
|
@ -847,8 +848,7 @@ impl EnhancedOcrService {
|
|||
Ok(Ok(Err(_panic))) => {
|
||||
// pdf-extract panicked (e.g., missing unicode map, corrupted font encoding)
|
||||
// For now, gracefully handle this common issue
|
||||
use tracing::debug;
|
||||
debug!("PDF text extraction failed for '{}' due to font encoding issues. This is a known limitation with certain PDF files.", file_path);
|
||||
warn!("PDF text extraction panicked for '{}' (size: {} bytes) due to font encoding issues. This is a known limitation with certain PDF files.", file_path, file_size);
|
||||
|
||||
return Err(anyhow!(
|
||||
"PDF text extraction failed due to font encoding issues in '{}' (size: {} bytes). This PDF uses non-standard fonts or character encoding that cannot be processed. To extract text from this PDF, consider: 1) Converting it to images and uploading those instead, 2) Using a different PDF viewer to re-save the PDF with standard encoding, or 3) Using external tools to convert the PDF to a more compatible format.",
|
||||
|
|
@ -856,9 +856,11 @@ impl EnhancedOcrService {
|
|||
));
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
warn!("PDF extraction task failed for '{}' (size: {} bytes): {}", file_path, file_size, e);
|
||||
return Err(anyhow!("PDF extraction task failed: {}", e));
|
||||
}
|
||||
Err(_) => {
|
||||
warn!("PDF extraction timed out after 2 minutes for file '{}' (size: {} bytes). The PDF may be corrupted or too complex.", file_path, file_size);
|
||||
return Err(anyhow!(
|
||||
"PDF extraction timed out after 2 minutes for file '{}' (size: {} bytes). The PDF may be corrupted or too complex.",
|
||||
file_path, file_size
|
||||
|
|
@ -1019,7 +1021,26 @@ impl EnhancedOcrService {
|
|||
let temp_ocr_path = temp_ocr_path.clone();
|
||||
move || -> Result<String> {
    // Read the PDF at `temp_ocr_path` into memory; I/O errors propagate via `?`.
    let bytes = std::fs::read(&temp_ocr_path)?;

    // pdf-extract is invoked exactly once, and only inside `catch_unwind`.
    // An unguarded call placed before this guard would let a library panic
    // escape before it could be converted into an error, and would also run
    // the extraction twice.
    let extraction = catch_unwind(AssertUnwindSafe(|| {
        pdf_extract::extract_text_from_mem(&bytes)
    }));

    let text = match extraction {
        // Extraction finished and produced text.
        Ok(Ok(text)) => text,
        // Extraction finished but the library reported an error.
        Ok(Err(e)) => {
            warn!("PDF text extraction failed after OCR processing for '{}': {}", temp_ocr_path, e);
            return Err(anyhow!(
                "PDF text extraction failed after OCR processing: {}. This may indicate a corrupted or unsupported PDF format.",
                e
            ));
        },
        // The library panicked; report it as an ordinary error instead of unwinding further.
        Err(_) => {
            warn!("PDF extraction panicked after OCR processing for '{}' due to invalid content stream", temp_ocr_path);
            return Err(anyhow!(
                "PDF extraction panicked after OCR processing due to invalid content stream or corrupted PDF structure. \
                This suggests the PDF has malformed internal structure that cannot be parsed safely."
            ));
        },
    };

    // Callers receive the text with surrounding whitespace removed.
    Ok(text.trim().to_string())
}
|
||||
}).await??;
|
||||
|
|
|
|||
|
|
@ -393,4 +393,295 @@ startxref
|
|||
assert!(all_updates.contains_key(&doc_id1));
|
||||
assert!(all_updates.contains_key(&doc_id2));
|
||||
}
|
||||
|
||||
/// Test that malformed PDFs don't crash the OCR system
|
||||
#[tokio::test]
|
||||
async fn test_malformed_pdf_panic_handling() {
|
||||
let ocr_service = OcrService::new();
|
||||
|
||||
// Create a malformed PDF in memory that will cause pdf-extract to panic
|
||||
let malformed_pdf_content = b"%PDF-1.4
|
||||
1 0 obj
|
||||
<< /Type /Catalog /Pages 2 0 R >>
|
||||
endobj
|
||||
2 0 obj
|
||||
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
|
||||
endobj
|
||||
3 0 obj
|
||||
<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >>
|
||||
endobj
|
||||
4 0 obj
|
||||
<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>
|
||||
endobj
|
||||
5 0 obj
|
||||
<< /Length 999 >>
|
||||
stream
|
||||
BT
|
||||
/F1 12 Tf
|
||||
100 700 Td
|
||||
(This is a malformed PDF with invalid content stream) Tj
|
||||
ET
|
||||
INVALID_CONTENT_STREAM_DATA_THAT_CAUSES_PANIC
|
||||
endstream
|
||||
endobj
|
||||
xref
|
||||
0 6
|
||||
0000000000 65535 f
|
||||
0000000009 00000 n
|
||||
0000000058 00000 n
|
||||
0000000115 00000 n
|
||||
0000000262 00000 n
|
||||
0000000341 00000 n
|
||||
trailer
|
||||
<< /Size 6 /Root 1 0 R >>
|
||||
startxref
|
||||
999
|
||||
%%EOF";
|
||||
|
||||
// Write to temporary file
|
||||
let temp_file = NamedTempFile::with_suffix(".pdf").unwrap();
|
||||
std::fs::write(temp_file.path(), malformed_pdf_content).unwrap();
|
||||
|
||||
let result = ocr_service.extract_text_from_pdf(temp_file.path().to_str().unwrap()).await;
|
||||
// Should not panic, should return an error instead
|
||||
assert!(result.is_err(), "Expected error for malformed PDF");
|
||||
let error_msg = result.unwrap_err().to_string();
|
||||
println!("Error message: {}", error_msg);
|
||||
// Should contain descriptive error message
|
||||
assert!(
|
||||
error_msg.contains("panic") ||
|
||||
error_msg.contains("invalid content stream") ||
|
||||
error_msg.contains("corrupted") ||
|
||||
error_msg.contains("extract") ||
|
||||
error_msg.contains("Failed to extract")
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_corrupted_pdf_structure_handling() {
|
||||
let ocr_service = OcrService::new();
|
||||
|
||||
// Create a corrupted PDF structure that will cause pdf-extract to fail
|
||||
let corrupted_pdf_content = b"%PDF-1.4
|
||||
1 0 obj
|
||||
<< /Type /Catalog /Pages 2 0 R >>
|
||||
endobj
|
||||
2 0 obj
|
||||
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
|
||||
endobj
|
||||
3 0 obj
|
||||
<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >>
|
||||
endobj
|
||||
4 0 obj
|
||||
<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>
|
||||
endobj
|
||||
5 0 obj
|
||||
<< /Length 44 >>
|
||||
stream
|
||||
BT
|
||||
/F1 12 Tf
|
||||
100 700 Td
|
||||
(Corrupted PDF) Tj
|
||||
ET
|
||||
endstream
|
||||
endobj
|
||||
xref
|
||||
0 6
|
||||
0000000000 65535 f
|
||||
0000000009 00000 n
|
||||
0000000058 00000 n
|
||||
0000000115 00000 n
|
||||
0000000262 00000 n
|
||||
0000000341 00000 n
|
||||
trailer
|
||||
<< /Size 6 /Root 1 0 R /InvalidKey >>
|
||||
startxref
|
||||
999999
|
||||
%%EOF";
|
||||
|
||||
let temp_file = NamedTempFile::with_suffix(".pdf").unwrap();
|
||||
std::fs::write(temp_file.path(), corrupted_pdf_content).unwrap();
|
||||
|
||||
let result = ocr_service.extract_text_from_pdf(temp_file.path().to_str().unwrap()).await;
|
||||
// Should not panic, should return an error instead
|
||||
assert!(result.is_err(), "Expected error for corrupted PDF");
|
||||
let error_msg = result.unwrap_err().to_string();
|
||||
println!("Corrupted PDF error: {}", error_msg);
|
||||
// Should contain descriptive error message
|
||||
assert!(
|
||||
error_msg.contains("panic") ||
|
||||
error_msg.contains("corrupted") ||
|
||||
error_msg.contains("extract") ||
|
||||
error_msg.contains("PDF") ||
|
||||
error_msg.contains("Failed to extract")
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_invalid_font_encoding_handling() {
|
||||
let ocr_service = OcrService::new();
|
||||
|
||||
// Test with invalid font encoding
|
||||
let invalid_font = "tests/test_pdfs/invalid_font_encoding.pdf";
|
||||
if Path::new(invalid_font).exists() {
|
||||
let result = ocr_service.extract_text_from_pdf(invalid_font).await;
|
||||
// Should not panic, should return an error instead
|
||||
assert!(result.is_err());
|
||||
let error_msg = result.unwrap_err().to_string();
|
||||
// Should contain descriptive error message
|
||||
assert!(
|
||||
error_msg.contains("panic") ||
|
||||
error_msg.contains("font") ||
|
||||
error_msg.contains("encoding") ||
|
||||
error_msg.contains("extract")
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_fake_pdf_handling() {
|
||||
let ocr_service = OcrService::new();
|
||||
|
||||
// Create a fake PDF file (not actually a PDF) that will definitely cause an error
|
||||
let fake_pdf_content = b"This is not a PDF file at all, just plain text with a PDF extension.
|
||||
It should cause pdf-extract to fail when trying to parse it.
|
||||
This tests the error handling for files that aren't actually PDFs.";
|
||||
|
||||
let temp_file = NamedTempFile::with_suffix(".pdf").unwrap();
|
||||
std::fs::write(temp_file.path(), fake_pdf_content).unwrap();
|
||||
|
||||
let result = ocr_service.extract_text_from_pdf(temp_file.path().to_str().unwrap()).await;
|
||||
// Should not panic, should return an error instead
|
||||
assert!(result.is_err(), "Expected error for fake PDF");
|
||||
let error_msg = result.unwrap_err().to_string();
|
||||
println!("Fake PDF error: {}", error_msg);
|
||||
// Should contain descriptive error message about parsing failure
|
||||
assert!(
|
||||
error_msg.contains("extract") ||
|
||||
error_msg.contains("parse") ||
|
||||
error_msg.contains("PDF") ||
|
||||
error_msg.contains("format") ||
|
||||
error_msg.contains("Failed to extract")
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_problematic_encoding_pdf_handling() {
|
||||
let ocr_service = OcrService::new();
|
||||
|
||||
// Test with the existing problematic encoding PDF
|
||||
let problematic_encoding = "tests/test_pdfs/problematic_encoding.pdf";
|
||||
if Path::new(problematic_encoding).exists() {
|
||||
let result = ocr_service.extract_text_from_pdf(problematic_encoding).await;
|
||||
// Should not panic, should return an error instead
|
||||
assert!(result.is_err());
|
||||
let error_msg = result.unwrap_err().to_string();
|
||||
// Should contain descriptive error message
|
||||
assert!(
|
||||
error_msg.contains("panic") ||
|
||||
error_msg.contains("encoding") ||
|
||||
error_msg.contains("extract") ||
|
||||
error_msg.contains("font")
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// Test that the enhanced OCR service also handles panics correctly
|
||||
#[tokio::test]
|
||||
async fn test_enhanced_ocr_panic_handling() {
|
||||
use crate::ocr::enhanced::EnhancedOcrService;
|
||||
use crate::services::file_service::FileService;
|
||||
use crate::models::Settings;
|
||||
|
||||
let ocr_service = EnhancedOcrService::new("tests".to_string());
|
||||
let settings = Settings::default();
|
||||
|
||||
// Test all malformed PDFs with enhanced OCR
|
||||
let test_files = vec![
|
||||
"tests/test_pdfs/malformed_content_stream.pdf",
|
||||
"tests/test_pdfs/corrupted_structure.pdf",
|
||||
"tests/test_pdfs/invalid_font_encoding.pdf",
|
||||
"tests/test_pdfs/fake_pdf.pdf",
|
||||
"tests/test_pdfs/problematic_encoding.pdf",
|
||||
];
|
||||
|
||||
for test_file in test_files {
|
||||
if Path::new(test_file).exists() {
|
||||
let result = ocr_service.extract_text_with_context(
|
||||
test_file,
|
||||
"application/pdf",
|
||||
&Path::new(test_file).file_name().unwrap().to_str().unwrap(),
|
||||
1024, // file_size
|
||||
&settings
|
||||
).await;
|
||||
|
||||
// Should not panic, should return an error instead
|
||||
assert!(result.is_err(), "Expected error for file: {}", test_file);
|
||||
let error_msg = result.unwrap_err().to_string();
|
||||
|
||||
// Should contain descriptive error message
|
||||
assert!(
|
||||
error_msg.contains("panic") ||
|
||||
error_msg.contains("extract") ||
|
||||
error_msg.contains("PDF") ||
|
||||
error_msg.contains("corrupted") ||
|
||||
error_msg.contains("encoding") ||
|
||||
error_msg.contains("font"),
|
||||
"Error message should be descriptive for {}: {}", test_file, error_msg
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Test that panic handling works correctly in concurrent scenarios
|
||||
#[tokio::test]
|
||||
async fn test_concurrent_pdf_panic_handling() {
|
||||
use std::sync::Arc;
|
||||
use futures::future::join_all;
|
||||
|
||||
let ocr_service = Arc::new(OcrService::new());
|
||||
let mut handles = Vec::new();
|
||||
|
||||
// Test concurrent processing of malformed PDFs
|
||||
let test_files = vec![
|
||||
"tests/test_pdfs/malformed_content_stream.pdf",
|
||||
"tests/test_pdfs/corrupted_structure.pdf",
|
||||
"tests/test_pdfs/invalid_font_encoding.pdf",
|
||||
"tests/test_pdfs/fake_pdf.pdf",
|
||||
];
|
||||
|
||||
for test_file in test_files {
|
||||
if Path::new(test_file).exists() {
|
||||
let ocr_service_clone = Arc::clone(&ocr_service);
|
||||
let test_file_owned = test_file.to_string();
|
||||
|
||||
let handle = tokio::spawn(async move {
|
||||
let result = ocr_service_clone.extract_text_from_pdf(&test_file_owned).await;
|
||||
// Should not panic, should return an error instead
|
||||
assert!(result.is_err(), "Expected error for file: {}", test_file_owned);
|
||||
let error_msg = result.unwrap_err().to_string();
|
||||
|
||||
// Should contain descriptive error message
|
||||
assert!(
|
||||
error_msg.contains("panic") ||
|
||||
error_msg.contains("extract") ||
|
||||
error_msg.contains("PDF") ||
|
||||
error_msg.contains("corrupted") ||
|
||||
error_msg.contains("encoding"),
|
||||
"Error message should be descriptive for {}: {}", test_file_owned, error_msg
|
||||
);
|
||||
});
|
||||
|
||||
handles.push(handle);
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for all concurrent tasks to complete
|
||||
let results = join_all(handles).await;
|
||||
|
||||
// Verify all tasks completed without panicking
|
||||
for result in results {
|
||||
assert!(result.is_ok(), "Task should complete without panicking");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,36 @@
|
|||
%PDF-1.4
|
||||
1 0 obj
|
||||
<< /Type /Catalog /Pages 2 0 R >>
|
||||
endobj
|
||||
2 0 obj
|
||||
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
|
||||
endobj
|
||||
3 0 obj
|
||||
<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >>
|
||||
endobj
|
||||
4 0 obj
|
||||
<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>
|
||||
endobj
|
||||
5 0 obj
|
||||
<< /Length 44 >>
|
||||
stream
|
||||
BT
|
||||
/F1 12 Tf
|
||||
100 700 Td
|
||||
(Corrupted PDF) Tj
|
||||
ET
|
||||
endstream
|
||||
endobj
|
||||
xref
|
||||
0 6
|
||||
0000000000 65535 f
|
||||
0000000009 00000 n
|
||||
0000000058 00000 n
|
||||
0000000115 00000 n
|
||||
0000000262 00000 n
|
||||
0000000341 00000 n
|
||||
trailer
|
||||
<< /Size 6 /Root 1 0 R /InvalidKey >>
|
||||
startxref
|
||||
999999
|
||||
%%EOF
|
||||
|
|
@ -0,0 +1,3 @@
|
|||
This is not a PDF file at all, just plain text with a PDF extension.
|
||||
It should cause pdf-extract to fail when trying to parse it.
|
||||
This tests the error handling for files that aren't actually PDFs.
|
||||
|
|
@ -0,0 +1,36 @@
|
|||
%PDF-1.4
|
||||
1 0 obj
|
||||
<< /Type /Catalog /Pages 2 0 R >>
|
||||
endobj
|
||||
2 0 obj
|
||||
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
|
||||
endobj
|
||||
3 0 obj
|
||||
<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >>
|
||||
endobj
|
||||
4 0 obj
|
||||
<< /Type /Font /Subtype /Type1 /BaseFont /InvalidFont /Encoding /InvalidEncoding >>
|
||||
endobj
|
||||
5 0 obj
|
||||
<< /Length 44 >>
|
||||
stream
|
||||
BT
|
||||
/F1 12 Tf
|
||||
100 700 Td
|
||||
(Invalid font encoding) Tj
|
||||
ET
|
||||
endstream
|
||||
endobj
|
||||
xref
|
||||
0 6
|
||||
0000000000 65535 f
|
||||
0000000009 00000 n
|
||||
0000000058 00000 n
|
||||
0000000115 00000 n
|
||||
0000000262 00000 n
|
||||
0000000341 00000 n
|
||||
trailer
|
||||
<< /Size 6 /Root 1 0 R >>
|
||||
startxref
|
||||
435
|
||||
%%EOF
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
%PDF-1.4
|
||||
1 0 obj
|
||||
<< /Type /Catalog /Pages 2 0 R >>
|
||||
endobj
|
||||
2 0 obj
|
||||
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
|
||||
endobj
|
||||
3 0 obj
|
||||
<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >>
|
||||
endobj
|
||||
4 0 obj
|
||||
<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>
|
||||
endobj
|
||||
5 0 obj
|
||||
<< /Length 999 >>
|
||||
stream
|
||||
BT
|
||||
/F1 12 Tf
|
||||
100 700 Td
|
||||
(This is a malformed PDF with invalid content stream) Tj
|
||||
ET
|
||||
INVALID_CONTENT_STREAM_DATA_THAT_CAUSES_PANIC
|
||||
endstream
|
||||
endobj
|
||||
xref
|
||||
0 6
|
||||
0000000000 65535 f
|
||||
0000000009 00000 n
|
||||
0000000058 00000 n
|
||||
0000000115 00000 n
|
||||
0000000262 00000 n
|
||||
0000000341 00000 n
|
||||
trailer
|
||||
<< /Size 6 /Root 1 0 R >>
|
||||
startxref
|
||||
999
|
||||
%%EOF
|
||||
Loading…
Reference in New Issue