Merge branch 'feat/multiple-ocr-languages' of https://github.com/readur/readur into feat/multiple-ocr-languages

This commit is contained in:
perf3ct 2025-07-14 19:33:51 +00:00
commit 7317fd5ebb
11 changed files with 433 additions and 9 deletions

2
Cargo.lock generated
View File

@ -3741,7 +3741,7 @@ dependencies = [
[[package]]
name = "readur"
version = "0.1.0"
version = "2.4.2"
dependencies = [
"anyhow",
"aws-config",

View File

@ -1,6 +1,6 @@
[package]
name = "readur"
version = "0.1.0"
version = "2.4.2"
edition = "2021"
[[bin]]

View File

@ -1,12 +1,12 @@
{
"name": "readur-frontend",
"version": "0.1.0",
"version": "2.4.2",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "readur-frontend",
"version": "0.1.0",
"version": "2.4.2",
"dependencies": {
"@emotion/react": "^11.14.0",
"@emotion/styled": "^11.14.0",

View File

@ -1,6 +1,6 @@
{
"name": "readur-frontend",
"version": "0.1.0",
"version": "2.4.2",
"private": true,
"type": "module",
"scripts": {

View File

@ -861,6 +861,7 @@ impl EnhancedOcrService {
let text = match extraction_result {
Ok(Ok(Ok(Ok(text)))) => text,
Ok(Ok(Ok(Err(e)))) => {
warn!("PDF text extraction failed for file '{}' (size: {} bytes): {}", file_path, file_size, e);
return Err(anyhow!(
"PDF text extraction failed for file '{}' (size: {} bytes): {}. This may indicate a corrupted or unsupported PDF format.",
file_path, file_size, e
@ -869,8 +870,7 @@ impl EnhancedOcrService {
Ok(Ok(Err(_panic))) => {
// pdf-extract panicked (e.g., missing unicode map, corrupted font encoding)
// For now, gracefully handle this common issue
use tracing::debug;
debug!("PDF text extraction failed for '{}' due to font encoding issues. This is a known limitation with certain PDF files.", file_path);
warn!("PDF text extraction panicked for '{}' (size: {} bytes) due to font encoding issues. This is a known limitation with certain PDF files.", file_path, file_size);
return Err(anyhow!(
"PDF text extraction failed due to font encoding issues in '{}' (size: {} bytes). This PDF uses non-standard fonts or character encoding that cannot be processed. To extract text from this PDF, consider: 1) Converting it to images and uploading those instead, 2) Using a different PDF viewer to re-save the PDF with standard encoding, or 3) Using external tools to convert the PDF to a more compatible format.",
@ -878,9 +878,11 @@ impl EnhancedOcrService {
));
}
Ok(Err(e)) => {
warn!("PDF extraction task failed for '{}' (size: {} bytes): {}", file_path, file_size, e);
return Err(anyhow!("PDF extraction task failed: {}", e));
}
Err(_) => {
warn!("PDF extraction timed out after 2 minutes for file '{}' (size: {} bytes). The PDF may be corrupted or too complex.", file_path, file_size);
return Err(anyhow!(
"PDF extraction timed out after 2 minutes for file '{}' (size: {} bytes). The PDF may be corrupted or too complex.",
file_path, file_size
@ -1041,7 +1043,26 @@ impl EnhancedOcrService {
let temp_ocr_path = temp_ocr_path.clone();
move || -> Result<String> {
let bytes = std::fs::read(&temp_ocr_path)?;
let text = pdf_extract::extract_text_from_mem(&bytes)?;
// Catch panics from pdf-extract library (same pattern as used elsewhere)
let text = match catch_unwind(AssertUnwindSafe(|| {
pdf_extract::extract_text_from_mem(&bytes)
})) {
Ok(Ok(text)) => text,
Ok(Err(e)) => {
warn!("PDF text extraction failed after OCR processing for '{}': {}", temp_ocr_path, e);
return Err(anyhow!(
"PDF text extraction failed after OCR processing: {}. This may indicate a corrupted or unsupported PDF format.",
e
));
},
Err(_) => {
warn!("PDF extraction panicked after OCR processing for '{}' due to invalid content stream", temp_ocr_path);
return Err(anyhow!(
"PDF extraction panicked after OCR processing due to invalid content stream or corrupted PDF structure. \
This suggests the PDF has malformed internal structure that cannot be parsed safely."
));
},
};
Ok(text.trim().to_string())
}
}).await??;

View File

@ -172,7 +172,7 @@ use crate::{
modifiers(&SecurityAddon),
info(
title = "Readur API",
version = "0.1.0",
version = "2.4.2",
description = "Document management and OCR processing API",
contact(
name = "Readur Team",

View File

@ -393,4 +393,295 @@ startxref
assert!(all_updates.contains_key(&doc_id1));
assert!(all_updates.contains_key(&doc_id2));
}
/// Test that malformed PDFs don't crash the OCR system.
///
/// Writes a PDF whose content stream declares `/Length 999` and ends with a
/// bogus token to a temporary `.pdf` file, runs it through
/// `extract_text_from_pdf`, and expects an `Err` carrying a descriptive
/// message instead of a panic.
#[tokio::test]
async fn test_malformed_pdf_panic_handling() {
    let ocr_service = OcrService::new();

    // Create a malformed PDF in memory that is intended to make pdf-extract panic
    let malformed_pdf_content = b"%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >>
endobj
4 0 obj
<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>
endobj
5 0 obj
<< /Length 999 >>
stream
BT
/F1 12 Tf
100 700 Td
(This is a malformed PDF with invalid content stream) Tj
ET
INVALID_CONTENT_STREAM_DATA_THAT_CAUSES_PANIC
endstream
endobj
xref
0 6
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
0000000262 00000 n
0000000341 00000 n
trailer
<< /Size 6 /Root 1 0 R >>
startxref
999
%%EOF";

    // Write to temporary file
    let temp_file = NamedTempFile::with_suffix(".pdf")
        .expect("failed to create temporary PDF file");
    std::fs::write(temp_file.path(), malformed_pdf_content)
        .expect("failed to write malformed PDF fixture");
    let pdf_path = temp_file
        .path()
        .to_str()
        .expect("temporary file path should be valid UTF-8");

    let result = ocr_service.extract_text_from_pdf(pdf_path).await;

    // Should not panic, should return an error instead
    assert!(result.is_err(), "Expected error for malformed PDF");

    let error_msg = result.unwrap_err().to_string();
    println!("Error message: {}", error_msg);

    // Should contain descriptive error message. "Failed to extract" needs no
    // separate entry: any message containing it also contains "extract".
    let expected_fragments = ["panic", "invalid content stream", "corrupted", "extract"];
    assert!(
        expected_fragments
            .iter()
            .any(|fragment| error_msg.contains(*fragment)),
        "Error message should be descriptive for malformed PDF: {}",
        error_msg
    );
}
/// Test that a PDF with a corrupted trailer is rejected with an error.
///
/// The fixture's trailer carries a dangling `/InvalidKey` entry and a
/// `startxref` value of `999999`. The test expects `extract_text_from_pdf`
/// to return an `Err` with a descriptive message instead of panicking.
#[tokio::test]
async fn test_corrupted_pdf_structure_handling() {
    let ocr_service = OcrService::new();

    // Create a corrupted PDF structure that is intended to make pdf-extract fail
    let corrupted_pdf_content = b"%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >>
endobj
4 0 obj
<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>
endobj
5 0 obj
<< /Length 44 >>
stream
BT
/F1 12 Tf
100 700 Td
(Corrupted PDF) Tj
ET
endstream
endobj
xref
0 6
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
0000000262 00000 n
0000000341 00000 n
trailer
<< /Size 6 /Root 1 0 R /InvalidKey >>
startxref
999999
%%EOF";

    let temp_file = NamedTempFile::with_suffix(".pdf")
        .expect("failed to create temporary PDF file");
    std::fs::write(temp_file.path(), corrupted_pdf_content)
        .expect("failed to write corrupted PDF fixture");
    let pdf_path = temp_file
        .path()
        .to_str()
        .expect("temporary file path should be valid UTF-8");

    let result = ocr_service.extract_text_from_pdf(pdf_path).await;

    // Should not panic, should return an error instead
    assert!(result.is_err(), "Expected error for corrupted PDF");

    let error_msg = result.unwrap_err().to_string();
    println!("Corrupted PDF error: {}", error_msg);

    // Should contain descriptive error message. "Failed to extract" needs no
    // separate entry: any message containing it also contains "extract".
    let expected_fragments = ["panic", "corrupted", "extract", "PDF"];
    assert!(
        expected_fragments
            .iter()
            .any(|fragment| error_msg.contains(*fragment)),
        "Error message should be descriptive for corrupted PDF: {}",
        error_msg
    );
}
/// Test that a PDF with an invalid font encoding yields an error, not a panic.
///
/// Uses the on-disk fixture `tests/test_pdfs/invalid_font_encoding.pdf`. When
/// the fixture is absent the test is skipped, and the skip is reported on
/// stderr so it is visible in the test output.
#[tokio::test]
async fn test_invalid_font_encoding_handling() {
    let ocr_service = OcrService::new();

    // Test with invalid font encoding
    let invalid_font = "tests/test_pdfs/invalid_font_encoding.pdf";
    if !Path::new(invalid_font).exists() {
        eprintln!("Skipping test: PDF fixture not found: {}", invalid_font);
        return;
    }

    let result = ocr_service.extract_text_from_pdf(invalid_font).await;

    // Should not panic, should return an error instead
    assert!(result.is_err(), "Expected error for file: {}", invalid_font);

    let error_msg = result.unwrap_err().to_string();

    // Should contain descriptive error message
    let expected_fragments = ["panic", "font", "encoding", "extract"];
    assert!(
        expected_fragments
            .iter()
            .any(|fragment| error_msg.contains(*fragment)),
        "Error message should be descriptive for {}: {}",
        invalid_font,
        error_msg
    );
}
/// Test that a plain-text file with a `.pdf` extension is rejected.
///
/// Writes non-PDF text to a temporary `.pdf` file and expects
/// `extract_text_from_pdf` to return an `Err` with a descriptive message.
#[tokio::test]
async fn test_fake_pdf_handling() {
    let ocr_service = OcrService::new();

    // Create a fake PDF file (not actually a PDF) that should fail to parse
    let fake_pdf_content = b"This is not a PDF file at all, just plain text with a PDF extension.
It should cause pdf-extract to fail when trying to parse it.
This tests the error handling for files that aren't actually PDFs.";

    let temp_file = NamedTempFile::with_suffix(".pdf")
        .expect("failed to create temporary PDF file");
    std::fs::write(temp_file.path(), fake_pdf_content)
        .expect("failed to write fake PDF fixture");
    let pdf_path = temp_file
        .path()
        .to_str()
        .expect("temporary file path should be valid UTF-8");

    let result = ocr_service.extract_text_from_pdf(pdf_path).await;

    // Should not panic, should return an error instead
    assert!(result.is_err(), "Expected error for fake PDF");

    let error_msg = result.unwrap_err().to_string();
    println!("Fake PDF error: {}", error_msg);

    // Should contain descriptive error message about parsing failure.
    // "Failed to extract" needs no separate entry: any message containing it
    // also contains "extract".
    let expected_fragments = ["extract", "parse", "PDF", "format"];
    assert!(
        expected_fragments
            .iter()
            .any(|fragment| error_msg.contains(*fragment)),
        "Error message should be descriptive for fake PDF: {}",
        error_msg
    );
}
/// Test that the existing problematic-encoding PDF yields an error, not a panic.
///
/// Uses the on-disk fixture `tests/test_pdfs/problematic_encoding.pdf`. When
/// the fixture is absent the test is skipped, and the skip is reported on
/// stderr so it is visible in the test output.
#[tokio::test]
async fn test_problematic_encoding_pdf_handling() {
    let ocr_service = OcrService::new();

    // Test with the existing problematic encoding PDF
    let problematic_encoding = "tests/test_pdfs/problematic_encoding.pdf";
    if !Path::new(problematic_encoding).exists() {
        eprintln!(
            "Skipping test: PDF fixture not found: {}",
            problematic_encoding
        );
        return;
    }

    let result = ocr_service.extract_text_from_pdf(problematic_encoding).await;

    // Should not panic, should return an error instead
    assert!(
        result.is_err(),
        "Expected error for file: {}",
        problematic_encoding
    );

    let error_msg = result.unwrap_err().to_string();

    // Should contain descriptive error message
    let expected_fragments = ["panic", "encoding", "extract", "font"];
    assert!(
        expected_fragments
            .iter()
            .any(|fragment| error_msg.contains(*fragment)),
        "Error message should be descriptive for {}: {}",
        problematic_encoding,
        error_msg
    );
}
/// Test that the enhanced OCR service also handles panics correctly.
///
/// Every fixture under `tests/test_pdfs/` that exists on disk is passed to
/// `EnhancedOcrService::extract_text_with_context` as `application/pdf`. Each
/// call must return an `Err` with a descriptive message. Missing fixtures are
/// reported on stderr and skipped.
#[tokio::test]
async fn test_enhanced_ocr_panic_handling() {
    use crate::models::Settings;
    use crate::ocr::enhanced::EnhancedOcrService;

    let ocr_service = EnhancedOcrService::new("tests".to_string());
    let settings = Settings::default();

    // Test all malformed PDFs with enhanced OCR
    let test_files = [
        "tests/test_pdfs/malformed_content_stream.pdf",
        "tests/test_pdfs/corrupted_structure.pdf",
        "tests/test_pdfs/invalid_font_encoding.pdf",
        "tests/test_pdfs/fake_pdf.pdf",
        "tests/test_pdfs/problematic_encoding.pdf",
    ];
    let expected_fragments = ["panic", "extract", "PDF", "corrupted", "encoding", "font"];

    for test_file in test_files {
        let fixture = Path::new(test_file);
        if !fixture.exists() {
            eprintln!("Skipping missing PDF fixture: {}", test_file);
            continue;
        }

        // Every entry above ends in a UTF-8 file name, so neither unwrap can fail.
        let file_name = fixture.file_name().unwrap().to_str().unwrap();

        let result = ocr_service
            .extract_text_with_context(
                test_file,
                "application/pdf",
                file_name,
                1024, // file_size
                &settings,
            )
            .await;

        // Should not panic, should return an error instead
        assert!(result.is_err(), "Expected error for file: {}", test_file);

        let error_msg = result.unwrap_err().to_string();

        // Should contain descriptive error message
        assert!(
            expected_fragments
                .iter()
                .any(|fragment| error_msg.contains(*fragment)),
            "Error message should be descriptive for {}: {}",
            test_file,
            error_msg
        );
    }
}
/// Test that panic handling works correctly in concurrent scenarios.
///
/// One task is spawned per fixture that exists on disk. Each task asserts
/// that extraction fails with a descriptive error, and the test then checks
/// that every task joined successfully.
#[tokio::test]
async fn test_concurrent_pdf_panic_handling() {
    use futures::future::join_all;
    use std::sync::Arc;

    let shared_service = Arc::new(OcrService::new());

    // Malformed PDFs to process concurrently
    let fixtures = vec![
        "tests/test_pdfs/malformed_content_stream.pdf",
        "tests/test_pdfs/corrupted_structure.pdf",
        "tests/test_pdfs/invalid_font_encoding.pdf",
        "tests/test_pdfs/fake_pdf.pdf",
    ];

    // Spawn a task for each fixture that is present
    let tasks: Vec<_> = fixtures
        .into_iter()
        .filter(|fixture| Path::new(fixture).exists())
        .map(|fixture| {
            let service = Arc::clone(&shared_service);
            let fixture_path = fixture.to_string();

            tokio::spawn(async move {
                let outcome = service.extract_text_from_pdf(&fixture_path).await;

                // Extraction must report an error rather than panic
                assert!(outcome.is_err(), "Expected error for file: {}", fixture_path);

                let message = outcome.unwrap_err().to_string();
                let is_descriptive = ["panic", "extract", "PDF", "corrupted", "encoding"]
                    .iter()
                    .any(|needle| message.contains(*needle));

                // The error text must mention at least one known keyword
                assert!(
                    is_descriptive,
                    "Error message should be descriptive for {}: {}", fixture_path, message
                );
            })
        })
        .collect();

    // Wait for every task, then verify that none of them panicked
    for joined in join_all(tasks).await {
        assert!(joined.is_ok(), "Task should complete without panicking");
    }
}
}

View File

@ -0,0 +1,36 @@
%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >>
endobj
4 0 obj
<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>
endobj
5 0 obj
<< /Length 44 >>
stream
BT
/F1 12 Tf
100 700 Td
(Corrupted PDF) Tj
ET
endstream
endobj
xref
0 6
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
0000000262 00000 n
0000000341 00000 n
trailer
<< /Size 6 /Root 1 0 R /InvalidKey >>
startxref
999999
%%EOF

View File

@ -0,0 +1,3 @@
This is not a PDF file at all, just plain text with a PDF extension.
It should cause pdf-extract to fail when trying to parse it.
This tests the error handling for files that aren't actually PDFs.

View File

@ -0,0 +1,36 @@
%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >>
endobj
4 0 obj
<< /Type /Font /Subtype /Type1 /BaseFont /InvalidFont /Encoding /InvalidEncoding >>
endobj
5 0 obj
<< /Length 44 >>
stream
BT
/F1 12 Tf
100 700 Td
(Invalid font encoding) Tj
ET
endstream
endobj
xref
0 6
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
0000000262 00000 n
0000000341 00000 n
trailer
<< /Size 6 /Root 1 0 R >>
startxref
435
%%EOF

View File

@ -0,0 +1,37 @@
%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >>
endobj
4 0 obj
<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>
endobj
5 0 obj
<< /Length 999 >>
stream
BT
/F1 12 Tf
100 700 Td
(This is a malformed PDF with invalid content stream) Tj
ET
INVALID_CONTENT_STREAM_DATA_THAT_CAUSES_PANIC
endstream
endobj
xref
0 6
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
0000000262 00000 n
0000000341 00000 n
trailer
<< /Size 6 /Root 1 0 R >>
startxref
999
%%EOF