feat(ocr): use ocrmypdf and pdftotext to get OCR layer if it already exists

This commit is contained in:
perf3ct 2025-07-15 15:59:29 +00:00
parent 3df5b5ef1d
commit 564c564613
No known key found for this signature in database
GPG Key ID: 569C4EEC436F5232
3 changed files with 352 additions and 76 deletions

View File

@ -0,0 +1,232 @@
use std::env;
use std::process;
use tokio;
use anyhow::Result;
/// Probe `pdftotext` (poppler-utils) on `file_path` and print diagnostics.
///
/// Extracted text is written to a per-process temp file, read back, and
/// returned as `(text, word_count)`. Any tool/read failure is reported on
/// stdout and `(String::new(), 0)` is returned so the caller can fall
/// through to the next extraction strategy.
///
/// # Errors
/// Returns `Err` only if spawning the `pdftotext` process itself fails.
async fn test_pdftotext(file_path: &str) -> Result<(String, usize)> {
    println!("=== Testing pdftotext ===");
    // Per-process temp path; pdftotext writes its output here.
    let temp_text_path = format!("/tmp/debug_pdftotext_{}.txt", std::process::id());
    let output = tokio::process::Command::new("pdftotext")
        .arg("-layout") // preserve the page layout in the extracted text
        .arg(file_path)
        .arg(&temp_text_path)
        .output()
        .await?;
    println!("pdftotext exit status: {}", output.status);
    if !output.stderr.is_empty() {
        println!("pdftotext stderr: {}", String::from_utf8_lossy(&output.stderr));
    }
    let mut result = (String::new(), 0);
    if output.status.success() {
        match tokio::fs::read_to_string(&temp_text_path).await {
            Ok(text) => {
                let word_count = text.split_whitespace().count();
                println!("pdftotext extracted {} words", word_count);
                println!("First 200 chars: {:?}", &text.chars().take(200).collect::<String>());
                result = (text, word_count);
            }
            Err(_) => println!("Failed to read pdftotext output file"),
        }
    } else {
        println!("pdftotext failed");
    }
    // Fix: always clean up the temp file. The original removed it only on
    // the full success path, leaking it when the command or the read failed
    // (pdftotext may still have created a partial output file).
    let _ = tokio::fs::remove_file(&temp_text_path).await;
    Ok(result)
}
async fn test_ocrmypdf_sidecar(file_path: &str) -> Result<(String, usize)> {
println!("\n=== Testing ocrmypdf --sidecar ===");
let temp_text_path = format!("/tmp/debug_ocrmypdf_{}.txt", std::process::id());
let output = tokio::process::Command::new("ocrmypdf")
.arg("--sidecar")
.arg(&temp_text_path)
.arg(file_path)
.arg("-") // Dummy output
.output()
.await?;
println!("ocrmypdf --sidecar exit status: {}", output.status);
if !output.stderr.is_empty() {
println!("ocrmypdf --sidecar stderr: {}", String::from_utf8_lossy(&output.stderr));
}
if output.status.success() {
if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
let word_count = text.split_whitespace().count();
println!("ocrmypdf --sidecar extracted {} words", word_count);
println!("First 200 chars: {:?}", &text.chars().take(200).collect::<String>());
// Clean up
let _ = tokio::fs::remove_file(&temp_text_path).await;
return Ok((text, word_count));
} else {
println!("Failed to read ocrmypdf sidecar output file");
}
} else {
println!("ocrmypdf --sidecar failed");
}
Ok((String::new(), 0))
}
async fn test_direct_extraction(file_path: &str) -> Result<(String, usize)> {
println!("\n=== Testing direct text extraction ===");
let bytes = tokio::fs::read(file_path).await?;
println!("PDF file size: {} bytes", bytes.len());
// Look for readable ASCII text in the PDF
let mut ascii_text = String::new();
let mut current_word = String::new();
for &byte in &bytes {
if byte >= 32 && byte <= 126 { // Printable ASCII
current_word.push(byte as char);
} else {
if current_word.len() > 3 { // Only keep words longer than 3 characters
ascii_text.push_str(&current_word);
ascii_text.push(' ');
}
current_word.clear();
}
}
// Add the last word if it's long enough
if current_word.len() > 3 {
ascii_text.push_str(&current_word);
}
// Clean up the text
let cleaned_text = ascii_text
.split_whitespace()
.filter(|word| word.len() > 1) // Filter out single characters
.collect::<Vec<_>>()
.join(" ");
let word_count = cleaned_text.split_whitespace().count();
println!("Direct extraction got {} words", word_count);
println!("First 200 chars: {:?}", &cleaned_text.chars().take(200).collect::<String>());
Ok((cleaned_text, word_count))
}
/// Replicate the production quality-assessment heuristic and print a
/// PASS/FAIL verdict for the given extraction result.
///
/// Checks, in order: non-empty word count; small-file shortcut; substantial
/// word count; word density vs. file size; alphanumeric character ratio.
/// Purely diagnostic — prints its conclusion and returns.
async fn test_quality_assessment(text: &str, word_count: usize, file_size: u64) {
    println!("\n=== Testing quality assessment ===");
    if word_count == 0 {
        println!("Quality check: FAIL - no words");
        return;
    }
    // For very small files, a low word count might be normal.
    if file_size < 50_000 && word_count >= 1 {
        println!("Quality check: PASS - small file with some text");
        return;
    }
    // Word density (words per KB) separates text-based from image-based PDFs.
    let file_size_kb = (file_size as f64) / 1024.0;
    let word_density = (word_count as f64) / file_size_kb;
    const MIN_WORD_DENSITY: f64 = 5.0;
    const MIN_WORDS_FOR_LARGE_FILES: usize = 10;
    const SUBSTANTIAL_WORD_COUNT: usize = 50;
    println!("File size: {:.1} KB", file_size_kb);
    println!("Word density: {:.2} words/KB", word_density);
    // Substantial text is accepted regardless of density.
    if word_count >= SUBSTANTIAL_WORD_COUNT {
        println!("Quality check: PASS - substantial text content ({} words)", word_count);
        return;
    }
    if word_density < MIN_WORD_DENSITY && word_count < MIN_WORDS_FOR_LARGE_FILES {
        println!("Quality check: FAIL - appears to be image-based ({} words, {:.2} words/KB)", word_count, word_density);
        return;
    }
    // Mostly non-alphanumeric text suggests extraction artifacts.
    let total_chars = text.chars().count();
    let alphanumeric_chars = text.chars().filter(|c| c.is_alphanumeric()).count();
    // Fix: divide by the CHAR count, not `text.len()` (a BYTE count). The
    // original deflated the ratio for any multibyte (non-ASCII) text, which
    // could falsely fail the 30% threshold below.
    let alphanumeric_ratio = if total_chars > 0 {
        (alphanumeric_chars as f64) / (total_chars as f64)
    } else {
        0.0
    };
    println!("Alphanumeric ratio: {:.1}%", alphanumeric_ratio * 100.0);
    // Below 30% alphanumeric content, extraction quality is likely poor.
    if alphanumeric_ratio < 0.3 {
        println!("Quality check: FAIL - low alphanumeric content ({:.1}%)", alphanumeric_ratio * 100.0);
        return;
    }
    println!("Quality check: PASS - {} words, {:.2} words/KB, {:.1}% alphanumeric",
        word_count, word_density, alphanumeric_ratio * 100.0);
}
/// Entry point: run every extraction strategy against the PDF given on the
/// command line, assess each result, and print which strategy the
/// production selection logic would choose.
#[tokio::main]
async fn main() -> Result<()> {
    let args: Vec<String> = env::args().collect();
    if args.len() != 2 {
        eprintln!("Usage: {} <pdf_file_path>", args[0]);
        process::exit(1);
    }
    let pdf_path = &args[1];
    println!("Debugging PDF extraction for: {}", pdf_path);
    // Fix: one metadata call serves as both the existence check and the
    // size query. The original stat'ed the file twice and used the
    // `!(...).is_ok()` anti-idiom instead of `.is_err()`.
    let file_size = match tokio::fs::metadata(pdf_path).await {
        Ok(meta) => meta.len(),
        Err(_) => {
            eprintln!("Error: File '{}' not found", pdf_path);
            process::exit(1);
        }
    };
    println!("File size: {} bytes ({:.2} MB)", file_size, file_size as f64 / (1024.0 * 1024.0));
    // Run each extraction strategy in turn.
    let (pdftotext_text, pdftotext_words) = test_pdftotext(pdf_path).await?;
    let (ocrmypdf_text, ocrmypdf_words) = test_ocrmypdf_sidecar(pdf_path).await?;
    let (direct_text, direct_words) = test_direct_extraction(pdf_path).await?;
    // Assess quality only for strategies that produced any words.
    if pdftotext_words > 0 {
        test_quality_assessment(&pdftotext_text, pdftotext_words, file_size).await;
    }
    if ocrmypdf_words > 0 {
        test_quality_assessment(&ocrmypdf_text, ocrmypdf_words, file_size).await;
    }
    if direct_words > 0 {
        test_quality_assessment(&direct_text, direct_words, file_size).await;
    }
    println!("\n=== Summary ===");
    println!("pdftotext: {} words", pdftotext_words);
    println!("ocrmypdf --sidecar: {} words", ocrmypdf_words);
    println!("direct extraction: {} words", direct_words);
    // Mirror the production selection order: pdftotext, then direct
    // extraction, then ocrmypdf sidecar.
    if pdftotext_words > 5 {
        println!("Expected result: Use pdftotext ({} words)", pdftotext_words);
    } else if direct_words > 5 {
        println!("Expected result: Use direct extraction ({} words)", direct_words);
    } else if ocrmypdf_words > 0 {
        println!("Expected result: Use ocrmypdf --sidecar ({} words)", ocrmypdf_words);
    } else {
        println!("Expected result: All methods failed");
    }
    Ok(())
}

View File

@ -872,7 +872,7 @@ impl EnhancedOcrService {
confidence: 95.0,
processing_time_ms: extraction_time,
word_count,
preprocessing_applied: vec!["PDF text extraction (ocrmypdf --skip-text)".to_string()],
preprocessing_applied: vec!["PDF text extraction (pdftotext)".to_string()],
processed_image_path: None,
});
} else {
@ -938,8 +938,16 @@ impl EnhancedOcrService {
// Reasonable thresholds based on typical PDF content:
// - Text-based PDFs typically have 50-200 words per KB
// - Below 5 words per KB suggests mostly images/scanned content
// - But if we have a substantial number of words (>50), accept it regardless of density
const MIN_WORD_DENSITY: f64 = 5.0;
const MIN_WORDS_FOR_LARGE_FILES: usize = 10;
const SUBSTANTIAL_WORD_COUNT: usize = 50;
// If we have substantial text, accept it regardless of density
if word_count >= SUBSTANTIAL_WORD_COUNT {
debug!("PDF has substantial text content: {} words, accepting regardless of density", word_count);
return true;
}
if word_density < MIN_WORD_DENSITY && word_count < MIN_WORDS_FOR_LARGE_FILES {
debug!("PDF appears to be image-based: {} words in {:.1} KB (density: {:.2} words/KB)",
@ -1122,102 +1130,130 @@ impl EnhancedOcrService {
);
let temp_text_path = format!("{}/{}", self.temp_dir, temp_text_filename);
// Strategy 1: Fast extraction with --skip-text (extracts existing text, no OCR)
let mut ocrmypdf_result = tokio::process::Command::new("ocrmypdf")
.arg("--skip-text") // Extract existing text without OCR processing
.arg("--sidecar") // Extract text to sidecar file
.arg(&temp_text_path)
// Strategy 1: Fast text extraction using pdftotext (for existing text)
debug!("Trying pdftotext for existing text extraction: {}", file_path);
debug!("Using temp file path: {}", temp_text_path);
let pdftotext_result = tokio::process::Command::new("pdftotext")
.arg("-layout") // Preserve layout
.arg(file_path)
.arg("-") // Dummy output (required by ocrmypdf)
.arg(&temp_text_path)
.output()
.await;
if ocrmypdf_result.is_ok() && ocrmypdf_result.as_ref().unwrap().status.success() {
if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
let _ = tokio::fs::remove_file(&temp_text_path).await;
let processing_time = start_time.elapsed().as_millis() as u64;
return Ok((text.trim().to_string(), processing_time));
if let Ok(output) = pdftotext_result {
debug!("pdftotext exit status: {}", output.status);
if !output.stderr.is_empty() {
debug!("pdftotext stderr: {}", String::from_utf8_lossy(&output.stderr));
}
if output.status.success() {
if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
let _ = tokio::fs::remove_file(&temp_text_path).await;
let word_count = text.split_whitespace().count();
debug!("pdftotext extracted {} words from temp file", word_count);
// If we got substantial text (more than a few words), use it
if word_count > 5 {
let processing_time = start_time.elapsed().as_millis() as u64;
info!("pdftotext extracted {} words from: {}", word_count, file_path);
return Ok((text.trim().to_string(), processing_time));
} else {
debug!("pdftotext only extracted {} words, will try direct extraction before OCR", word_count);
}
} else {
debug!("Failed to read pdftotext output file: {}", temp_text_path);
}
} else {
let stderr = String::from_utf8_lossy(&output.stderr);
debug!("pdftotext failed with status {}: {}", output.status, stderr);
}
} else {
debug!("Failed to execute pdftotext command");
}
info!("pdftotext extraction insufficient for '{}', trying direct extraction before OCR", file_path);
// Strategy 2: Try direct text extraction (often works when pdftotext fails)
match self.extract_text_from_pdf_bytes(file_path).await {
Ok(text) if !text.trim().is_empty() => {
let word_count = text.split_whitespace().count();
if word_count > 5 {
let processing_time = start_time.elapsed().as_millis() as u64;
info!("Direct text extraction succeeded for '{}': {} words", file_path, word_count);
return Ok((text, processing_time));
} else {
debug!("Direct extraction only got {} words, trying OCR", word_count);
}
}
Ok(_) => {
debug!("Direct text extraction returned empty text");
}
Err(e) => {
debug!("Direct text extraction failed: {}", e);
}
}
info!("Quick extraction failed, trying recovery strategies for: {}", file_path);
info!("Direct extraction insufficient for '{}', using OCR extraction", file_path);
// Strategy 2: Try with --fix-metadata for corrupted metadata
let temp_fixed_pdf = format!("{}/fixed_{}_{}.pdf", self.temp_dir, std::process::id(),
std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)?.as_millis());
ocrmypdf_result = tokio::process::Command::new("ocrmypdf")
.arg("--fix-metadata") // Fix metadata issues
.arg("--skip-text") // Still skip OCR for speed
// Strategy 3: Use ocrmypdf --sidecar to extract existing OCR text
let ocrmypdf_result = tokio::process::Command::new("ocrmypdf")
.arg("--sidecar")
.arg(&temp_text_path)
.arg(file_path)
.arg(&temp_fixed_pdf)
.arg("-") // Dummy output (we only want sidecar)
.output()
.await;
if ocrmypdf_result.is_ok() && ocrmypdf_result.as_ref().unwrap().status.success() {
if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
let _ = tokio::fs::remove_file(&temp_text_path).await;
let _ = tokio::fs::remove_file(&temp_fixed_pdf).await;
let processing_time = start_time.elapsed().as_millis() as u64;
return Ok((text.trim().to_string(), processing_time));
if let Ok(output) = &ocrmypdf_result {
if output.status.success() {
if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
let _ = tokio::fs::remove_file(&temp_text_path).await;
let word_count = text.split_whitespace().count();
if word_count > 0 {
let processing_time = start_time.elapsed().as_millis() as u64;
info!("ocrmypdf --sidecar extracted {} words from: {}", word_count, file_path);
return Ok((text.trim().to_string(), processing_time));
}
}
} else {
let stderr = String::from_utf8_lossy(&output.stderr);
debug!("ocrmypdf --sidecar failed: {}", stderr);
// Check if the error indicates the page already has text
if stderr.contains("page already has text") {
// This is good - it means there's already text, we should use pdftotext
warn!("ocrmypdf detected existing text in PDF, this should have been caught by pdftotext");
}
}
}
// Strategy 3: Try with --remove-background for scanned documents
ocrmypdf_result = tokio::process::Command::new("ocrmypdf")
.arg("--remove-background")
.arg("--skip-text")
.arg("--sidecar")
.arg(&temp_text_path)
.arg(file_path)
.arg(&temp_fixed_pdf)
.output()
.await;
if ocrmypdf_result.is_ok() && ocrmypdf_result.as_ref().unwrap().status.success() {
if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
let _ = tokio::fs::remove_file(&temp_text_path).await;
let _ = tokio::fs::remove_file(&temp_fixed_pdf).await;
let processing_time = start_time.elapsed().as_millis() as u64;
return Ok((text.trim().to_string(), processing_time));
}
}
// Clean up temporary files
let _ = tokio::fs::remove_file(&temp_text_path).await;
let _ = tokio::fs::remove_file(&temp_fixed_pdf).await;
// Last resort: try to extract any readable text directly from the PDF file
warn!("All ocrmypdf strategies failed, trying direct text extraction from: {}", file_path);
// Strategy 3: Last resort - direct byte-level text extraction
warn!("Standard extraction methods failed, trying direct text extraction from: {}", file_path);
match self.extract_text_from_pdf_bytes(file_path).await {
Ok(text) if !text.trim().is_empty() => {
let processing_time = start_time.elapsed().as_millis() as u64;
info!("Direct text extraction succeeded for: {}", file_path);
let word_count = text.split_whitespace().count();
info!("Direct text extraction succeeded for '{}': {} words", file_path, word_count);
Ok((text, processing_time))
}
Ok(_) => {
warn!("Direct text extraction returned empty text for: {}", file_path);
// If all strategies fail, return the last error
match ocrmypdf_result {
Ok(output) => {
let stderr = String::from_utf8_lossy(&output.stderr);
Err(anyhow!("All PDF extraction strategies failed. Last error: {}", stderr))
}
Err(e) => Err(anyhow!("Failed to run ocrmypdf: {}", e)),
if let Ok(ref output) = ocrmypdf_result {
let stderr = String::from_utf8_lossy(&output.stderr);
Err(anyhow!("All PDF extraction strategies failed. Last error: {}", stderr))
} else {
Err(anyhow!("All PDF extraction strategies failed"))
}
}
Err(e) => {
warn!("Direct text extraction also failed for {}: {}", file_path, e);
// If all strategies fail, return the last error
match ocrmypdf_result {
Ok(output) => {
let stderr = String::from_utf8_lossy(&output.stderr);
Err(anyhow!("All PDF extraction strategies failed. Last error: {}", stderr))
}
Err(e) => Err(anyhow!("Failed to run ocrmypdf: {}", e)),
if let Ok(ref output) = ocrmypdf_result {
let stderr = String::from_utf8_lossy(&output.stderr);
Err(anyhow!("All PDF extraction strategies failed. Last error: {}", stderr))
} else {
Err(anyhow!("All PDF extraction strategies failed: {}", e))
}
}
}

View File

@ -80,24 +80,32 @@ impl OcrService {
let temp_text_path = format!("{}/pdf_text_{}.txt", temp_dir, std::process::id());
// Progressive extraction with fallback strategies
let mut output = tokio::process::Command::new("ocrmypdf")
.arg("--skip-text") // Extract existing text without OCR processing
.arg("--sidecar") // Extract text to sidecar file
.arg(&temp_text_path)
// Strategy 1: pdftotext for existing text (fastest)
let mut output = tokio::process::Command::new("pdftotext")
.arg("-layout") // Preserve layout
.arg(file_path)
.arg("-") // Dummy output (required)
.arg(&temp_text_path)
.output()
.await?;
if output.status.success() {
// Check if we got substantial text
if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
let word_count = text.split_whitespace().count();
if word_count > 5 {
let _ = tokio::fs::remove_file(&temp_text_path).await;
return Ok(text.trim().to_string());
}
}
}
if !output.status.success() {
// Try with metadata fixing for corrupted files
// Strategy 2: ocrmypdf sidecar (when pdftotext fails)
output = tokio::process::Command::new("ocrmypdf")
.arg("--fix-metadata") // Fix corrupted metadata
.arg("--skip-text") // Still extract existing text only
.arg("--sidecar")
.arg("--sidecar") // Extract text to sidecar file
.arg(&temp_text_path)
.arg(file_path)
.arg("-")
.arg("-") // Dummy output
.output()
.await?;