feat(dev): drop pdf_extract in favor of ocrmypdf

This commit is contained in:
perf3ct 2025-07-15 14:50:17 +00:00
parent 628fe8cb7b
commit 549c2f8a16
No known key found for this signature in database
GPG Key ID: 569C4EEC436F5232
5 changed files with 511 additions and 326 deletions

164
Cargo.lock generated
View File

@ -33,26 +33,6 @@ version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
[[package]]
name = "adobe-cmap-parser"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae8abfa9a4688de8fc9f42b3f013b6fffec18ed8a554f5f113577e0b9b3212a3"
dependencies = [
"pom",
]
[[package]]
name = "aes"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
dependencies = [
"cfg-if",
"cipher",
"cpufeatures",
]
[[package]]
name = "aho-corasick"
version = "1.1.3"
@ -256,7 +236,7 @@ dependencies = [
"anyhow",
"arrayvec",
"log",
"nom 7.1.3",
"nom",
"num-rational",
"v_frame",
]
@ -903,15 +883,6 @@ dependencies = [
"generic-array",
]
[[package]]
name = "block-padding"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8894febbff9f758034a5b8e12d87918f56dfc64a8e1fe757d65e29041538d93"
dependencies = [
"generic-array",
]
[[package]]
name = "blowfish"
version = "0.9.1"
@ -984,12 +955,6 @@ version = "3.18.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "793db76d6187cd04dff33004d8e6c9cc4e05cd330500379d2394209271b4aeee"
[[package]]
name = "bytecount"
version = "0.6.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "175812e0be2bccb6abe50bb8d566126198344f707e304f45c648fd8f2cc0365e"
[[package]]
name = "bytemuck"
version = "1.23.1"
@ -1024,15 +989,6 @@ dependencies = [
"either",
]
[[package]]
name = "cbc"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26b52a9543ae338f279b96b0b9fed9c8093744685043739079ce85cd58f289a6"
dependencies = [
"cipher",
]
[[package]]
name = "cc"
version = "1.2.27"
@ -1050,15 +1006,9 @@ version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
dependencies = [
"nom 7.1.3",
"nom",
]
[[package]]
name = "cff-parser"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "31f5b6e9141c036f3ff4ce7b2f7e432b0f00dee416ddcd4f17741d189ddc2e9d"
[[package]]
name = "cfg-expr"
version = "0.15.8"
@ -1478,15 +1428,6 @@ version = "1.0.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1c7a8fb8a9fbf66c1f703fe16184d10ca0ee9d23be5b4436400408ba54a95005"
[[package]]
name = "ecb"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a8bfa975b1aec2145850fcaa1c6fe269a16578c44705a532ae3edc92b8881c7"
dependencies = [
"cipher",
]
[[package]]
name = "ecdsa"
version = "0.14.8"
@ -1595,15 +1536,6 @@ dependencies = [
"windows-sys 0.59.0",
]
[[package]]
name = "euclid"
version = "0.20.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2bb7ef65b3777a325d1eeefefab5b6d4959da54747e33bd6258e789640f307ad"
dependencies = [
"num-traits",
]
[[package]]
name = "event-listener"
version = "5.4.0"
@ -2504,7 +2436,6 @@ version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01"
dependencies = [
"block-padding",
"generic-array",
]
@ -2777,32 +2708,6 @@ dependencies = [
"imgref",
]
[[package]]
name = "lopdf"
version = "0.36.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59fa2559e99ba0f26a12458aabc754432c805bbb8cba516c427825a997af1fb7"
dependencies = [
"aes",
"bitflags 2.9.1",
"cbc",
"ecb",
"encoding_rs",
"flate2",
"indexmap 2.9.0",
"itoa",
"log",
"md-5",
"nom 8.0.0",
"nom_locate",
"rand 0.9.1",
"rangemap",
"sha2",
"stringprep",
"thiserror 2.0.12",
"weezl",
]
[[package]]
name = "lru"
version = "0.12.5"
@ -2972,26 +2877,6 @@ dependencies = [
"minimal-lexical",
]
[[package]]
name = "nom"
version = "8.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405"
dependencies = [
"memchr",
]
[[package]]
name = "nom_locate"
version = "5.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b577e2d69827c4740cba2b52efaad1c4cc7c73042860b199710b3575c68438d"
dependencies = [
"bytecount",
"memchr",
"nom 8.0.0",
]
[[package]]
name = "noop_proc_macro"
version = "0.3.0"
@ -3355,23 +3240,6 @@ version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
[[package]]
name = "pdf-extract"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c2f44c6c642e359e2fe7f662bf5438db3811b6b4be60afc6de04b619ce51e1a"
dependencies = [
"adobe-cmap-parser",
"cff-parser",
"encoding_rs",
"euclid",
"log",
"lopdf",
"postscript",
"type1-encoding-parser",
"unicode-normalization",
]
[[package]]
name = "peeking_take_while"
version = "0.1.2"
@ -3465,18 +3333,6 @@ dependencies = [
"miniz_oxide",
]
[[package]]
name = "pom"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60f6ce597ecdcc9a098e7fddacb1065093a3d66446fa16c675e7e71d1b5c28e6"
[[package]]
name = "postscript"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78451badbdaebaf17f053fd9152b3ffb33b516104eacb45e7864aaa9c712f306"
[[package]]
name = "potential_utf"
version = "0.1.2"
@ -3648,12 +3504,6 @@ dependencies = [
"rand 0.8.5",
]
[[package]]
name = "rangemap"
version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f60fcc7d6849342eff22c4350c8b9a989ee8ceabc4b481253e8946b9fe83d684"
[[package]]
name = "rav1e"
version = "0.7.1"
@ -3763,7 +3613,6 @@ dependencies = [
"mime_guess",
"notify",
"oauth2",
"pdf-extract",
"quick-xml",
"raw-cpuid",
"readur",
@ -5437,15 +5286,6 @@ version = "0.25.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2df906b07856748fa3f6e0ad0cbaa047052d4a7dd609e231c4f72cee8c36f31"
[[package]]
name = "type1-encoding-parser"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3d6cc09e1a99c7e01f2afe4953789311a1c50baebbdac5b477ecf78e2e92a5b"
dependencies = [
"pom",
]
[[package]]
name = "typenum"
version = "1.18.0"

View File

@ -34,7 +34,6 @@ futures = "0.3"
notify = "8"
mime_guess = "2"
tesseract = { version = "0.15", optional = true }
pdf-extract = { version = "0.9", optional = true }
image = { version = "0.25", features = ["png", "jpeg", "tiff", "bmp"], optional = true }
imageproc = { version = "0.25", optional = true }
thiserror = "2.0"
@ -61,7 +60,7 @@ testcontainers-modules = { version = "0.12", features = ["postgres"], optional =
[features]
default = ["ocr", "s3"]
ocr = ["tesseract", "pdf-extract", "image", "imageproc", "raw-cpuid"]
ocr = ["tesseract", "image", "imageproc", "raw-cpuid"]
s3 = ["aws-config", "aws-sdk-s3", "aws-credential-types", "aws-types"]
test-utils = ["testcontainers", "testcontainers-modules"]

View File

@ -811,13 +811,13 @@ impl EnhancedOcrService {
Ok(closed)
}
/// Extract text from PDF with size and time limits
/// Extract text from PDF using ocrmypdf
#[cfg(feature = "ocr")]
pub async fn extract_text_from_pdf(&self, file_path: &str, settings: &Settings) -> Result<OcrResult> {
let start_time = std::time::Instant::now();
info!("Extracting text from PDF: {}", file_path);
// Check file size before loading into memory
// Check file size before processing
let metadata = tokio::fs::metadata(file_path).await?;
let file_size = metadata.len();
@ -831,103 +831,91 @@ impl EnhancedOcrService {
));
}
let bytes = tokio::fs::read(file_path).await?;
// Check if it's a valid PDF by reading first 1KB
let mut header_bytes = vec![0u8; 1024.min(file_size as usize)];
let mut file = tokio::fs::File::open(file_path).await?;
use tokio::io::AsyncReadExt;
file.read_exact(&mut header_bytes).await?;
drop(file);
// Check if it's a valid PDF (handles leading null bytes)
if !is_valid_pdf(&bytes) {
if !is_valid_pdf(&header_bytes) {
return Err(anyhow!(
"Invalid PDF file: Missing or corrupted PDF header. File size: {} bytes, Header: {:?}",
bytes.len(),
bytes.get(0..50).unwrap_or(&[]).iter().map(|&b| {
file_size,
header_bytes.get(0..50).unwrap_or(&[]).iter().map(|&b| {
if b >= 32 && b <= 126 { b as char } else { '.' }
}).collect::<String>()
));
}
// Clean the PDF data (remove leading null bytes)
let clean_bytes = clean_pdf_data(&bytes);
// Add timeout and panic recovery for PDF extraction
let extraction_result = tokio::time::timeout(
std::time::Duration::from_secs(120), // 2 minute timeout
tokio::task::spawn_blocking(move || {
// Catch panics from pdf-extract library
catch_unwind(AssertUnwindSafe(|| {
pdf_extract::extract_text_from_mem(&clean_bytes)
}))
})
).await;
let text = match extraction_result {
Ok(Ok(Ok(Ok(text)))) => text,
Ok(Ok(Ok(Err(e)))) => {
warn!("PDF text extraction failed for file '{}' (size: {} bytes): {}", file_path, file_size, e);
return Err(anyhow!(
"PDF text extraction failed for file '{}' (size: {} bytes): {}. This may indicate a corrupted or unsupported PDF format.",
file_path, file_size, e
));
}
Ok(Ok(Err(_panic))) => {
// pdf-extract panicked (e.g., missing unicode map, corrupted font encoding)
// For now, gracefully handle this common issue
warn!("PDF text extraction panicked for '{}' (size: {} bytes) due to font encoding issues. This is a known limitation with certain PDF files.", file_path, file_size);
return Err(anyhow!(
"PDF text extraction failed due to font encoding issues in '{}' (size: {} bytes). This PDF uses non-standard fonts or character encoding that cannot be processed. To extract text from this PDF, consider: 1) Converting it to images and uploading those instead, 2) Using a different PDF viewer to re-save the PDF with standard encoding, or 3) Using external tools to convert the PDF to a more compatible format.",
file_path, file_size
));
}
Ok(Err(e)) => {
warn!("PDF extraction task failed for '{}' (size: {} bytes): {}", file_path, file_size, e);
return Err(anyhow!("PDF extraction task failed: {}", e));
}
Err(_) => {
warn!("PDF extraction timed out after 2 minutes for file '{}' (size: {} bytes). The PDF may be corrupted or too complex.", file_path, file_size);
return Err(anyhow!(
"PDF extraction timed out after 2 minutes for file '{}' (size: {} bytes). The PDF may be corrupted or too complex.",
file_path, file_size
));
}
};
// Limit extracted text size to prevent memory issues
const MAX_TEXT_SIZE: usize = 10 * 1024 * 1024; // 10MB of text
let trimmed_text = if text.len() > MAX_TEXT_SIZE {
warn!("PDF text too large ({} chars), truncating to {} chars", text.len(), MAX_TEXT_SIZE);
format!("{}... [TEXT TRUNCATED DUE TO SIZE]", &text[..MAX_TEXT_SIZE])
} else {
text.trim().to_string()
};
let processing_time = start_time.elapsed().as_millis() as u64;
let word_count = self.count_words_safely(&trimmed_text);
// Debug logging to understand PDF extraction issues
debug!(
"PDF extraction debug - File: '{}' | Raw text length: {} | Trimmed text length: {} | Word count: {} | First 200 chars: {:?}",
file_path,
text.len(),
trimmed_text.len(),
word_count,
trimmed_text.chars().take(200).collect::<String>()
);
// Smart detection: assess if text extraction quality is good enough
if self.is_text_extraction_quality_sufficient(&trimmed_text, word_count, file_size) {
info!("PDF text extraction successful for '{}', using extracted text", file_path);
Ok(OcrResult {
text: trimmed_text,
confidence: 95.0, // PDF text extraction is generally high confidence
processing_time_ms: processing_time,
word_count,
preprocessing_applied: vec!["PDF text extraction".to_string()],
processed_image_path: None,
})
} else {
info!("PDF text extraction insufficient for '{}' ({} words), falling back to OCR", file_path, word_count);
// Fall back to OCR using ocrmypdf
self.extract_text_from_pdf_with_ocr(file_path, settings, start_time).await
// Check if ocrmypdf is available
if !self.is_ocrmypdf_available().await {
return Err(anyhow!(
"ocrmypdf is not available on this system. To extract text from PDFs, please install ocrmypdf. \
On Ubuntu/Debian: 'apt-get install ocrmypdf'. \
On macOS: 'brew install ocrmypdf'."
));
}
// First try to extract text without OCR for performance (using --skip-text)
let quick_extraction_result = self.extract_pdf_text_quick(file_path).await;
match quick_extraction_result {
Ok((text, extraction_time)) => {
let word_count = self.count_words_safely(&text);
// Check if quick extraction got good results
if self.is_text_extraction_quality_sufficient(&text, word_count, file_size) {
info!("PDF text extraction successful for '{}' using quick method", file_path);
return Ok(OcrResult {
text,
confidence: 95.0,
processing_time_ms: extraction_time,
word_count,
preprocessing_applied: vec!["PDF text extraction (ocrmypdf --skip-text)".to_string()],
processed_image_path: None,
});
} else {
info!("Quick PDF extraction insufficient for '{}' ({} words), using full OCR", file_path, word_count);
}
}
Err(e) => {
warn!("Quick PDF extraction failed for '{}': {}, using full OCR", file_path, e);
}
}
// If quick extraction failed or was insufficient, use full OCR
let full_ocr_result = self.extract_text_from_pdf_with_ocr(file_path, settings, start_time).await;
// If OCR also fails, try direct text extraction as last resort
if full_ocr_result.is_err() {
warn!("Full OCR failed, trying direct text extraction as last resort for: {}", file_path);
match self.extract_text_from_pdf_bytes(file_path).await {
Ok(text) if !text.trim().is_empty() => {
let processing_time = start_time.elapsed().as_millis() as u64;
let word_count = self.count_words_safely(&text);
info!("Direct text extraction succeeded as last resort for: {}", file_path);
return Ok(OcrResult {
text,
confidence: 50.0, // Lower confidence for direct extraction
processing_time_ms: processing_time,
word_count,
preprocessing_applied: vec!["Direct PDF text extraction (last resort)".to_string()],
processed_image_path: None,
});
}
Ok(_) => {
warn!("Direct text extraction returned empty text for: {}", file_path);
}
Err(e) => {
warn!("Direct text extraction also failed for {}: {}", file_path, e);
}
}
}
full_ocr_result
}
/// Assess if text extraction quality is sufficient or if OCR fallback is needed
@ -1002,14 +990,15 @@ impl EnhancedOcrService {
);
let temp_ocr_path = format!("{}/{}", self.temp_dir, temp_ocr_filename);
// Run ocrmypdf to create searchable PDF
// Run ocrmypdf with progressive fallback strategies
let ocrmypdf_result = tokio::time::timeout(
std::time::Duration::from_secs(300), // 5 minute timeout for OCR
tokio::task::spawn_blocking({
let file_path = file_path.to_string();
let temp_ocr_path = temp_ocr_path.clone();
move || {
std::process::Command::new("ocrmypdf")
// Strategy 1: Standard OCR with cleaning
let mut result = std::process::Command::new("ocrmypdf")
.arg("--force-ocr") // OCR even if text is detected
.arg("-O2") // Optimize level 2 (balanced quality/speed)
.arg("--deskew") // Correct skewed pages
@ -1018,6 +1007,38 @@ impl EnhancedOcrService {
.arg("eng") // English language
.arg(&file_path)
.arg(&temp_ocr_path)
.output();
if result.is_ok() && result.as_ref().unwrap().status.success() {
return result;
}
// Strategy 2: If standard OCR fails, try with error recovery
eprintln!("Standard OCR failed, trying recovery mode...");
result = std::process::Command::new("ocrmypdf")
.arg("--force-ocr")
.arg("--fix-metadata") // Fix metadata issues
.arg("--remove-background") // Remove background noise
.arg("-O1") // Lower optimization for problematic PDFs
.arg("--language")
.arg("eng")
.arg(&file_path)
.arg(&temp_ocr_path)
.output();
if result.is_ok() && result.as_ref().unwrap().status.success() {
return result;
}
// Strategy 3: Last resort - minimal processing (skips very large pages)
eprintln!("Recovery mode failed, trying minimal processing...");
std::process::Command::new("ocrmypdf")
.arg("--force-ocr")
.arg("--skip-big") // Skip very large pages that might cause memory issues
.arg("--language")
.arg("eng")
.arg(&file_path)
.arg(&temp_ocr_path)
.output()
}
})
@ -1044,25 +1065,28 @@ impl EnhancedOcrService {
move || -> Result<String> {
let bytes = std::fs::read(&temp_ocr_path)?;
// Catch panics from pdf-extract library (same pattern as used elsewhere)
let text = match catch_unwind(AssertUnwindSafe(|| {
pdf_extract::extract_text_from_mem(&bytes)
})) {
Ok(Ok(text)) => text,
Ok(Err(e)) => {
warn!("PDF text extraction failed after OCR processing for '{}': {}", temp_ocr_path, e);
return Err(anyhow!(
"PDF text extraction failed after OCR processing: {}. This may indicate a corrupted or unsupported PDF format.",
e
));
},
Err(_) => {
warn!("PDF extraction panicked after OCR processing for '{}' due to invalid content stream", temp_ocr_path);
return Err(anyhow!(
"PDF extraction panicked after OCR processing due to invalid content stream or corrupted PDF structure. \
This suggests the PDF has malformed internal structure that cannot be parsed safely."
));
},
};
// Extract text from the OCR'd PDF using ocrmypdf's sidecar option
let temp_text_path = format!("{}.txt", temp_ocr_path);
let extract_result = std::process::Command::new("ocrmypdf")
.arg("--sidecar") // Extract text to a sidecar file
.arg(&temp_text_path)
.arg(&temp_ocr_path)
.arg("-") // Output to stdout (dummy, required by ocrmypdf)
.output()?;
if !extract_result.status.success() {
let stderr = String::from_utf8_lossy(&extract_result.stderr);
return Err(anyhow!(
"ocrmypdf text extraction failed: {}",
stderr
));
}
// Read the extracted text from the sidecar file
let text = std::fs::read_to_string(&temp_text_path)?;
// Clean up the text file
let _ = std::fs::remove_file(&temp_text_path);
Ok(text.trim().to_string())
}
}).await??;
@ -1086,6 +1110,225 @@ impl EnhancedOcrService {
})
}
/// Progressive PDF text extraction with fallback strategies
///
/// Tries increasingly aggressive `ocrmypdf` invocations to pull the PDF's
/// *existing* text layer — no OCR is performed here, every strategy passes
/// `--skip-text` — and finally falls back to scraping readable bytes from
/// the raw file. Returns the trimmed text plus elapsed milliseconds.
///
/// # Errors
/// Returns an error only when every strategy (including the byte-level
/// fallback) yields no usable text; the message carries the stderr of the
/// last `ocrmypdf` attempt.
#[cfg(feature = "ocr")]
async fn extract_pdf_text_quick(&self, file_path: &str) -> Result<(String, u64)> {
let start_time = std::time::Instant::now();
// Generate temporary file path for text extraction
// (pid + epoch-millis keeps concurrent extractions from colliding)
let temp_text_filename = format!("quick_text_{}_{}.txt",
std::process::id(),
std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)?.as_millis()
);
let temp_text_path = format!("{}/{}", self.temp_dir, temp_text_filename);
// Strategy 1: Fast extraction with --skip-text (extracts existing text, no OCR)
let mut ocrmypdf_result = tokio::process::Command::new("ocrmypdf")
.arg("--skip-text") // Extract existing text without OCR processing
.arg("--sidecar") // Extract text to sidecar file
.arg(&temp_text_path)
.arg(file_path)
.arg("-") // Dummy output (required by ocrmypdf)
.output()
.await;
// On success the sidecar file now holds the text layer; read, clean up, return.
if ocrmypdf_result.is_ok() && ocrmypdf_result.as_ref().unwrap().status.success() {
if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
let _ = tokio::fs::remove_file(&temp_text_path).await;
let processing_time = start_time.elapsed().as_millis() as u64;
return Ok((text.trim().to_string(), processing_time));
}
}
info!("Quick extraction failed, trying recovery strategies for: {}", file_path);
// Strategy 2: Try with --fix-metadata for corrupted metadata
// This variant writes a repaired PDF, so it needs a real output path
// instead of the "-" placeholder used above.
let temp_fixed_pdf = format!("{}/fixed_{}_{}.pdf", self.temp_dir, std::process::id(),
std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)?.as_millis());
ocrmypdf_result = tokio::process::Command::new("ocrmypdf")
.arg("--fix-metadata") // Fix metadata issues
.arg("--skip-text") // Still skip OCR for speed
.arg("--sidecar")
.arg(&temp_text_path)
.arg(file_path)
.arg(&temp_fixed_pdf)
.output()
.await;
if ocrmypdf_result.is_ok() && ocrmypdf_result.as_ref().unwrap().status.success() {
if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
let _ = tokio::fs::remove_file(&temp_text_path).await;
let _ = tokio::fs::remove_file(&temp_fixed_pdf).await;
let processing_time = start_time.elapsed().as_millis() as u64;
return Ok((text.trim().to_string(), processing_time));
}
}
// Strategy 3: Try with --remove-background for scanned documents
ocrmypdf_result = tokio::process::Command::new("ocrmypdf")
.arg("--remove-background")
.arg("--skip-text")
.arg("--sidecar")
.arg(&temp_text_path)
.arg(file_path)
.arg(&temp_fixed_pdf)
.output()
.await;
if ocrmypdf_result.is_ok() && ocrmypdf_result.as_ref().unwrap().status.success() {
if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
let _ = tokio::fs::remove_file(&temp_text_path).await;
let _ = tokio::fs::remove_file(&temp_fixed_pdf).await;
let processing_time = start_time.elapsed().as_millis() as u64;
return Ok((text.trim().to_string(), processing_time));
}
}
// Clean up temporary files
// (all three strategies failed; nothing useful is left on disk)
let _ = tokio::fs::remove_file(&temp_text_path).await;
let _ = tokio::fs::remove_file(&temp_fixed_pdf).await;
// Last resort: try to extract any readable text directly from the PDF file
warn!("All ocrmypdf strategies failed, trying direct text extraction from: {}", file_path);
match self.extract_text_from_pdf_bytes(file_path).await {
Ok(text) if !text.trim().is_empty() => {
let processing_time = start_time.elapsed().as_millis() as u64;
info!("Direct text extraction succeeded for: {}", file_path);
Ok((text, processing_time))
}
Ok(_) => {
warn!("Direct text extraction returned empty text for: {}", file_path);
// If all strategies fail, return the last error
// NOTE(review): `ocrmypdf_result` is Strategy 3's outcome here — the
// earlier strategies' errors have been overwritten by the reassignments.
match ocrmypdf_result {
Ok(output) => {
let stderr = String::from_utf8_lossy(&output.stderr);
Err(anyhow!("All PDF extraction strategies failed. Last error: {}", stderr))
}
Err(e) => Err(anyhow!("Failed to run ocrmypdf: {}", e)),
}
}
Err(e) => {
warn!("Direct text extraction also failed for {}: {}", file_path, e);
// If all strategies fail, return the last error
match ocrmypdf_result {
Ok(output) => {
let stderr = String::from_utf8_lossy(&output.stderr);
Err(anyhow!("All PDF extraction strategies failed. Last error: {}", stderr))
}
Err(e) => Err(anyhow!("Failed to run ocrmypdf: {}", e)),
}
}
}
}
/// Last resort: extract readable text directly from PDF bytes
/// This can find text that's embedded in the PDF even if the structure is corrupted
///
/// Two passes over the raw bytes:
/// 1. collect string literals `(...)` that appear inside BT...ET text objects;
/// 2. harvest every printable-ASCII run longer than three characters.
/// The results are merged, whitespace-normalized, and single-character
/// tokens (mostly PDF operator noise) are dropped.
///
/// # Errors
/// Returns an error when the file cannot be read or no readable text remains
/// after cleaning.
#[cfg(feature = "ocr")]
async fn extract_text_from_pdf_bytes(&self, file_path: &str) -> Result<String> {
    let bytes = tokio::fs::read(file_path).await?;

    // Pass 1: scan for BT...ET text objects with an explicit index so the
    // markers are matched at the *current* position. (The previous
    // implementation searched the entire file for "BT"/"ET" whenever a 'B'
    // or 'E' byte was seen, which toggled the state machine incorrectly.)
    let mut extracted_text = String::new();
    let mut current_text = String::new();
    let mut in_text_object = false;
    let mut in_string = false;
    let mut escape_next = false;
    let mut i = 0;
    while i < bytes.len() {
        // Only honor the BT/ET delimiters outside string literals, so a
        // literal containing "BT"/"ET" cannot flip the state machine.
        if !in_string {
            if !in_text_object && bytes[i..].starts_with(b"BT") {
                in_text_object = true;
                i += 2;
                continue;
            }
            if in_text_object && bytes[i..].starts_with(b"ET") {
                in_text_object = false;
                if !current_text.trim().is_empty() {
                    extracted_text.push_str(&current_text);
                    extracted_text.push(' ');
                }
                current_text.clear();
                i += 2;
                continue;
            }
        }
        if in_text_object {
            let ch = bytes[i] as char;
            if escape_next {
                // Previous byte was a backslash inside a string: keep the
                // escaped character verbatim.
                current_text.push(ch);
                escape_next = false;
            } else if in_string && ch == '\\' {
                escape_next = true;
            } else if !in_string && ch == '(' {
                in_string = true;
            } else if in_string && ch == ')' {
                in_string = false;
                current_text.push(' ');
            } else if in_string {
                current_text.push(ch);
            }
        }
        i += 1;
    }

    // Pass 2: harvest printable-ASCII runs longer than 3 characters, which
    // can recover text even when the PDF structure is unparseable.
    let mut ascii_text = String::new();
    for run in bytes.split(|&b| !(32..=126).contains(&b)) {
        if run.len() > 3 {
            // Runs are printable ASCII by construction, so this always succeeds.
            if let Ok(word) = std::str::from_utf8(run) {
                ascii_text.push_str(word);
                ascii_text.push(' ');
            }
        }
    }

    // Combine both extraction methods. A real newline separates them (the
    // old code pushed a literal backslash-n via "\\n"; harmless since the
    // normalization below splits on whitespace, but wrong nonetheless).
    let mut final_text = extracted_text;
    if !ascii_text.trim().is_empty() {
        final_text.push('\n');
        final_text.push_str(&ascii_text);
    }

    // Normalize whitespace and filter out single characters.
    let cleaned_text = final_text
        .split_whitespace()
        .filter(|word| word.len() > 1)
        .collect::<Vec<_>>()
        .join(" ");

    if cleaned_text.trim().is_empty() {
        Err(anyhow!("No readable text found in PDF"))
    } else {
        Ok(cleaned_text)
    }
}
/// Check if ocrmypdf is available on the system
#[cfg(feature = "ocr")]
async fn is_ocrmypdf_available(&self) -> bool {
@ -1353,24 +1596,4 @@ fn is_valid_pdf(data: &[u8]) -> bool {
}
false
}
/// Remove leading null bytes and return clean PDF data
/// Returns the original data if no PDF header is found
fn clean_pdf_data(data: &[u8]) -> Vec<u8> {
    // A real header sits near the start of the file, so only the first 1KB
    // is scanned for the "%PDF-" magic.
    let search_limit = data.len().min(1024);
    match data[..search_limit].windows(5).position(|w| w == b"%PDF-") {
        // Drop whatever garbage precedes the header (e.g. leading nulls).
        Some(start) => data[start..].to_vec(),
        // No header found (including inputs shorter than 5 bytes): return
        // the data unchanged.
        None => data.to_vec(),
    }
}

View File

@ -8,7 +8,6 @@ pub mod tests;
use anyhow::{anyhow, Result};
use std::path::Path;
use std::panic::{catch_unwind, AssertUnwindSafe};
use crate::ocr::error::OcrError;
use crate::ocr::health::OcrHealthChecker;
@ -62,14 +61,85 @@ impl OcrService {
pub async fn extract_text_from_pdf(&self, file_path: &str) -> Result<String> {
#[cfg(feature = "ocr")]
{
let bytes = std::fs::read(file_path)?;
let text = match catch_unwind(AssertUnwindSafe(|| {
pdf_extract::extract_text_from_mem(&bytes)
})) {
Ok(Ok(text)) => text,
Ok(Err(e)) => return Err(anyhow!("Failed to extract text from PDF: {}", e)),
Err(_) => return Err(anyhow!("PDF extraction panicked due to invalid content stream in file: {}", file_path)),
};
// Check if ocrmypdf is available
let ocrmypdf_check = tokio::process::Command::new("ocrmypdf")
.arg("--version")
.output()
.await;
if ocrmypdf_check.is_err() || !ocrmypdf_check.unwrap().status.success() {
return Err(anyhow!(
"ocrmypdf is not available. Please install ocrmypdf: \
On Ubuntu/Debian: 'apt-get install ocrmypdf'. \
On macOS: 'brew install ocrmypdf'."
));
}
// Create temporary file for text extraction
let temp_dir = std::env::var("TEMP_DIR").unwrap_or_else(|_| "/tmp".to_string());
let temp_text_path = format!("{}/pdf_text_{}.txt", temp_dir, std::process::id());
// Progressive extraction with fallback strategies
let mut output = tokio::process::Command::new("ocrmypdf")
.arg("--skip-text") // Extract existing text without OCR processing
.arg("--sidecar") // Extract text to sidecar file
.arg(&temp_text_path)
.arg(file_path)
.arg("-") // Dummy output (required)
.output()
.await?;
if !output.status.success() {
// Try with metadata fixing for corrupted files
output = tokio::process::Command::new("ocrmypdf")
.arg("--fix-metadata") // Fix corrupted metadata
.arg("--skip-text") // Still extract existing text only
.arg("--sidecar")
.arg(&temp_text_path)
.arg(file_path)
.arg("-")
.output()
.await?;
if !output.status.success() {
// Final fallback: minimal processing (may skip large pages)
output = tokio::process::Command::new("ocrmypdf")
.arg("--skip-big") // Skip very large pages to avoid memory issues
.arg("--sidecar")
.arg(&temp_text_path)
.arg(file_path)
.arg("-")
.output()
.await?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
// Clean up temp file on error
let _ = tokio::fs::remove_file(&temp_text_path).await;
// Last resort: try direct text extraction
match self.extract_text_from_pdf_bytes(file_path).await {
Ok(text) if !text.trim().is_empty() => {
return Ok(text);
}
Ok(_) => {
// Empty text from direct extraction
}
Err(_) => {
// Direct extraction also failed
}
}
return Err(anyhow!("Failed to extract text from PDF after trying multiple strategies: {}", stderr));
}
}
}
// Read the extracted text
let text = tokio::fs::read_to_string(&temp_text_path).await?;
// Clean up temporary file
let _ = tokio::fs::remove_file(&temp_text_path).await;
Ok(text.trim().to_string())
}
@ -106,6 +176,45 @@ impl OcrService {
}
}
/// Last resort: extract readable text directly from PDF bytes
/// This salvages whatever printable content the raw file contains, even
/// when the PDF structure itself cannot be parsed.
async fn extract_text_from_pdf_bytes(&self, file_path: &str) -> Result<String> {
    let bytes = tokio::fs::read(file_path).await?;

    // Collect every run of printable ASCII (0x20..=0x7E) longer than three
    // characters; shorter runs are almost always binary noise.
    let mut ascii_text = String::new();
    for run in bytes.split(|&b| !(32..=126).contains(&b)) {
        if run.len() > 3 {
            // Runs are printable ASCII by construction, so this always succeeds.
            if let Ok(fragment) = std::str::from_utf8(run) {
                ascii_text.push_str(fragment);
                ascii_text.push(' ');
            }
        }
    }

    // Normalize whitespace and drop single-character tokens (mostly PDF
    // operator noise).
    let cleaned_text = ascii_text
        .split_whitespace()
        .filter(|word| word.len() > 1)
        .collect::<Vec<_>>()
        .join(" ");

    if cleaned_text.trim().is_empty() {
        Err(anyhow!("No readable text found in PDF"))
    } else {
        Ok(cleaned_text)
    }
}
pub fn is_image_file(&self, file_path: &str) -> bool {
if let Some(extension) = Path::new(file_path)
.extension()

View File

@ -443,18 +443,25 @@ startxref
std::fs::write(temp_file.path(), malformed_pdf_content).unwrap();
let result = ocr_service.extract_text_from_pdf(temp_file.path().to_str().unwrap()).await;
// Should not panic, should return an error instead
assert!(result.is_err(), "Expected error for malformed PDF");
let error_msg = result.unwrap_err().to_string();
println!("Error message: {}", error_msg);
// Should contain descriptive error message
assert!(
error_msg.contains("panic") ||
error_msg.contains("invalid content stream") ||
error_msg.contains("corrupted") ||
error_msg.contains("extract") ||
error_msg.contains("Failed to extract")
);
// With ocrmypdf, this should now succeed gracefully (more robust than pdf-extract)
// or return a descriptive error - either is acceptable
match result {
Ok(text) => {
println!("Successfully extracted text from malformed PDF: '{}'", text);
// OCRmyPDF is more robust and can handle some malformed PDFs
}
Err(e) => {
println!("Error extracting from malformed PDF: {}", e);
// Should contain descriptive error message if it fails
let error_msg = e.to_string();
assert!(
error_msg.contains("ocrmypdf") ||
error_msg.contains("extraction") ||
error_msg.contains("InputFileError") ||
error_msg.contains("Failed to extract")
);
}
}
}
#[tokio::test]
@ -573,16 +580,23 @@ This tests the error handling for files that aren't actually PDFs.";
let problematic_encoding = "tests/test_pdfs/problematic_encoding.pdf";
if Path::new(problematic_encoding).exists() {
let result = ocr_service.extract_text_from_pdf(problematic_encoding).await;
// Should not panic, should return an error instead
assert!(result.is_err());
let error_msg = result.unwrap_err().to_string();
// Should contain descriptive error message
assert!(
error_msg.contains("panic") ||
error_msg.contains("encoding") ||
error_msg.contains("extract") ||
error_msg.contains("font")
);
// With ocrmypdf, this may succeed gracefully or return descriptive error
match result {
Ok(text) => {
println!("Successfully extracted text from problematic encoding PDF: '{}'", text);
// OCRmyPDF's robustness allows it to handle some problematic encoding PDFs
}
Err(e) => {
println!("Error extracting from problematic encoding PDF: {}", e);
let error_msg = e.to_string();
assert!(
error_msg.contains("ocrmypdf") ||
error_msg.contains("extraction") ||
error_msg.contains("strategies") ||
error_msg.contains("Failed to extract")
);
}
}
}
}