From 564c5646139120fb66073ec04599fc9261ade28b Mon Sep 17 00:00:00 2001 From: perf3ct Date: Tue, 15 Jul 2025 15:59:29 +0000 Subject: [PATCH] feat(ocr): use ocrmypdf and pdftotext to get OCR layer if it already exists --- src/bin/debug_pdf_extraction.rs | 232 ++++++++++++++++++++++++++++++++ src/ocr/enhanced.rs | 168 ++++++++++++++--------- src/ocr/mod.rs | 28 ++-- 3 files changed, 352 insertions(+), 76 deletions(-) create mode 100644 src/bin/debug_pdf_extraction.rs diff --git a/src/bin/debug_pdf_extraction.rs b/src/bin/debug_pdf_extraction.rs new file mode 100644 index 0000000..3572b1a --- /dev/null +++ b/src/bin/debug_pdf_extraction.rs @@ -0,0 +1,232 @@ +use std::env; +use std::process; +use tokio; +use anyhow::Result; + +async fn test_pdftotext(file_path: &str) -> Result<(String, usize)> { + println!("=== Testing pdftotext ==="); + + let temp_text_path = format!("/tmp/debug_pdftotext_{}.txt", std::process::id()); + + let output = tokio::process::Command::new("pdftotext") + .arg("-layout") + .arg(file_path) + .arg(&temp_text_path) + .output() + .await?; + + println!("pdftotext exit status: {}", output.status); + if !output.stderr.is_empty() { + println!("pdftotext stderr: {}", String::from_utf8_lossy(&output.stderr)); + } + + if output.status.success() { + if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await { + let word_count = text.split_whitespace().count(); + println!("pdftotext extracted {} words", word_count); + println!("First 200 chars: {:?}", &text.chars().take(200).collect::<String>()); + + // Clean up + let _ = tokio::fs::remove_file(&temp_text_path).await; + return Ok((text, word_count)); + } else { + println!("Failed to read pdftotext output file"); + } + } else { + println!("pdftotext failed"); + } + + Ok((String::new(), 0)) +} + +async fn test_ocrmypdf_sidecar(file_path: &str) -> Result<(String, usize)> { + println!("\n=== Testing ocrmypdf --sidecar ==="); + + let temp_text_path = format!("/tmp/debug_ocrmypdf_{}.txt", std::process::id()); + + let output = tokio::process::Command::new("ocrmypdf") + .arg("--sidecar") + .arg(&temp_text_path) + .arg(file_path) + .arg("-") // Dummy output + .output() + .await?; + + println!("ocrmypdf --sidecar exit status: {}", output.status); + if !output.stderr.is_empty() { + println!("ocrmypdf --sidecar stderr: {}", String::from_utf8_lossy(&output.stderr)); + } + + if output.status.success() { + if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await { + let word_count = text.split_whitespace().count(); + println!("ocrmypdf --sidecar extracted {} words", word_count); + println!("First 200 chars: {:?}", &text.chars().take(200).collect::<String>()); + + // Clean up + let _ = tokio::fs::remove_file(&temp_text_path).await; + return Ok((text, word_count)); + } else { + println!("Failed to read ocrmypdf sidecar output file"); + } + } else { + println!("ocrmypdf --sidecar failed"); + } + + Ok((String::new(), 0)) +} + +async fn test_direct_extraction(file_path: &str) -> Result<(String, usize)> { + println!("\n=== Testing direct text extraction ==="); + + let bytes = tokio::fs::read(file_path).await?; + println!("PDF file size: {} bytes", bytes.len()); + + // Look for readable ASCII text in the PDF + let mut ascii_text = String::new(); + let mut current_word = String::new(); + + for &byte in &bytes { + if byte >= 32 && byte <= 126 { // Printable ASCII + current_word.push(byte as char); + } else { + if current_word.len() > 3 { // Only keep words longer than 3 characters + ascii_text.push_str(&current_word); + ascii_text.push(' '); + } +
current_word.clear(); + } + } + + // Add the last word if it's long enough + if current_word.len() > 3 { + ascii_text.push_str(&current_word); + } + + // Clean up the text + let cleaned_text = ascii_text + .split_whitespace() + .filter(|word| word.len() > 1) // Filter out single characters + .collect::<Vec<_>>() + .join(" "); + + let word_count = cleaned_text.split_whitespace().count(); + println!("Direct extraction got {} words", word_count); + println!("First 200 chars: {:?}", &cleaned_text.chars().take(200).collect::<String>()); + + Ok((cleaned_text, word_count)) +} + +async fn test_quality_assessment(text: &str, word_count: usize, file_size: u64) { + println!("\n=== Testing quality assessment ==="); + + // Replicate the quality assessment logic + if word_count == 0 { + println!("Quality check: FAIL - no words"); + return; + } + + // For very small files, low word count might be normal + if file_size < 50_000 && word_count >= 1 { + println!("Quality check: PASS - small file with some text"); + return; + } + + // Calculate word density (words per KB) + let file_size_kb = (file_size as f64) / 1024.0; + let word_density = (word_count as f64) / file_size_kb; + + const MIN_WORD_DENSITY: f64 = 5.0; + const MIN_WORDS_FOR_LARGE_FILES: usize = 10; + const SUBSTANTIAL_WORD_COUNT: usize = 50; + + println!("File size: {:.1} KB", file_size_kb); + println!("Word density: {:.2} words/KB", word_density); + + // If we have substantial text, accept it regardless of density + if word_count >= SUBSTANTIAL_WORD_COUNT { + println!("Quality check: PASS - substantial text content ({} words)", word_count); + return; + } + + if word_density < MIN_WORD_DENSITY && word_count < MIN_WORDS_FOR_LARGE_FILES { + println!("Quality check: FAIL - appears to be image-based ({} words, {:.2} words/KB)", word_count, word_density); + return; + } + + // Additional check: if text is mostly non-alphanumeric, might be extraction artifacts + let alphanumeric_chars = text.chars().filter(|c| c.is_alphanumeric()).count(); + let alphanumeric_ratio = if text.len() > 0 { + (alphanumeric_chars as f64) / (text.len() as f64) + } else { + 0.0 + }; + + println!("Alphanumeric ratio: {:.1}%", alphanumeric_ratio * 100.0); + + // If less than 30% alphanumeric content, likely poor extraction + if alphanumeric_ratio < 0.3 { + println!("Quality check: FAIL - low alphanumeric content ({:.1}%)", alphanumeric_ratio * 100.0); + return; + } + + println!("Quality check: PASS - {} words, {:.2} words/KB, {:.1}% alphanumeric", + word_count, word_density, alphanumeric_ratio * 100.0); +} + +#[tokio::main] +async fn main() -> Result<()> { + let args: Vec<String> = env::args().collect(); + if args.len() != 2 { + eprintln!("Usage: {} <pdf_file>", args[0]); + process::exit(1); + } + + let pdf_path = &args[1]; + println!("Debugging PDF extraction for: {}", pdf_path); + + // Check if file exists + if !tokio::fs::metadata(pdf_path).await.is_ok() { + eprintln!("Error: File '{}' not found", pdf_path); + process::exit(1); + } + + let file_size = tokio::fs::metadata(pdf_path).await?.len(); + println!("File size: {} bytes ({:.2} MB)", file_size, file_size as f64 / (1024.0 * 1024.0)); + + // Test each extraction method + let (pdftotext_text, pdftotext_words) = test_pdftotext(pdf_path).await?; + let (ocrmypdf_text, ocrmypdf_words) = test_ocrmypdf_sidecar(pdf_path).await?; + let (direct_text, direct_words) = test_direct_extraction(pdf_path).await?; + + // Test quality assessment on each result + if pdftotext_words > 0 { + test_quality_assessment(&pdftotext_text, pdftotext_words, file_size).await; + } + + if 
ocrmypdf_words > 0 { + test_quality_assessment(&ocrmypdf_text, ocrmypdf_words, file_size).await; + } + + if direct_words > 0 { + test_quality_assessment(&direct_text, direct_words, file_size).await; + } + + println!("\n=== Summary ==="); + println!("pdftotext: {} words", pdftotext_words); + println!("ocrmypdf --sidecar: {} words", ocrmypdf_words); + println!("direct extraction: {} words", direct_words); + + // Determine what should happen based on the logic + if pdftotext_words > 5 { + println!("Expected result: Use pdftotext ({} words)", pdftotext_words); + } else if direct_words > 5 { + println!("Expected result: Use direct extraction ({} words)", direct_words); + } else if ocrmypdf_words > 0 { + println!("Expected result: Use ocrmypdf --sidecar ({} words)", ocrmypdf_words); + } else { + println!("Expected result: All methods failed"); + } + + Ok(()) +} \ No newline at end of file diff --git a/src/ocr/enhanced.rs b/src/ocr/enhanced.rs index ac8d541..9af1667 100644 --- a/src/ocr/enhanced.rs +++ b/src/ocr/enhanced.rs @@ -872,7 +872,7 @@ impl EnhancedOcrService { confidence: 95.0, processing_time_ms: extraction_time, word_count, - preprocessing_applied: vec!["PDF text extraction (ocrmypdf --skip-text)".to_string()], + preprocessing_applied: vec!["PDF text extraction (pdftotext)".to_string()], processed_image_path: None, }); } else { @@ -938,8 +938,16 @@ impl EnhancedOcrService { // Reasonable thresholds based on typical PDF content: // - Text-based PDFs typically have 50-200 words per KB // - Below 5 words per KB suggests mostly images/scanned content + // - But if we have a substantial number of words (>50), accept it regardless of density const MIN_WORD_DENSITY: f64 = 5.0; const MIN_WORDS_FOR_LARGE_FILES: usize = 10; + const SUBSTANTIAL_WORD_COUNT: usize = 50; + + // If we have substantial text, accept it regardless of density + if word_count >= SUBSTANTIAL_WORD_COUNT { + debug!("PDF has substantial text content: {} words, accepting regardless of density", word_count); + return true; + } if word_density < MIN_WORD_DENSITY && word_count < MIN_WORDS_FOR_LARGE_FILES { debug!("PDF appears to be image-based: {} words in {:.1} KB (density: {:.2} words/KB)", @@ -1122,102 +1130,130 @@ impl EnhancedOcrService { ); let temp_text_path = format!("{}/{}", self.temp_dir, temp_text_filename); - // Strategy 1: Fast extraction with --skip-text (extracts existing text, no OCR) - let mut ocrmypdf_result = tokio::process::Command::new("ocrmypdf") - .arg("--skip-text") // Extract existing text without OCR processing - .arg("--sidecar") // Extract text to sidecar file - .arg(&temp_text_path) + // Strategy 1: Fast text extraction using pdftotext (for existing text) + debug!("Trying pdftotext for existing text extraction: {}", file_path); + debug!("Using temp file path: {}", temp_text_path); + let pdftotext_result = tokio::process::Command::new("pdftotext") + .arg("-layout") // Preserve layout .arg(file_path) - .arg("-") // Dummy output (required by ocrmypdf) + .arg(&temp_text_path) .output() .await; - if ocrmypdf_result.is_ok() && ocrmypdf_result.as_ref().unwrap().status.success() { - if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await { - let _ = tokio::fs::remove_file(&temp_text_path).await; - let processing_time = start_time.elapsed().as_millis() as u64; - return Ok((text.trim().to_string(), processing_time)); + if let Ok(output) = pdftotext_result { + debug!("pdftotext exit status: {}", output.status); + if !output.stderr.is_empty() { + debug!("pdftotext stderr: {}", 
String::from_utf8_lossy(&output.stderr)); + } + if output.status.success() { + if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await { + let _ = tokio::fs::remove_file(&temp_text_path).await; + let word_count = text.split_whitespace().count(); + debug!("pdftotext extracted {} words from temp file", word_count); + + // If we got substantial text (more than a few words), use it + if word_count > 5 { + let processing_time = start_time.elapsed().as_millis() as u64; + info!("pdftotext extracted {} words from: {}", word_count, file_path); + return Ok((text.trim().to_string(), processing_time)); + } else { + debug!("pdftotext only extracted {} words, will try direct extraction before OCR", word_count); + } + } else { + debug!("Failed to read pdftotext output file: {}", temp_text_path); + } + } else { + let stderr = String::from_utf8_lossy(&output.stderr); + debug!("pdftotext failed with status {}: {}", output.status, stderr); + } + } else { + debug!("Failed to execute pdftotext command"); + } + + info!("pdftotext extraction insufficient for '{}', trying direct extraction before OCR", file_path); + + // Strategy 2: Try direct text extraction (often works when pdftotext fails) + match self.extract_text_from_pdf_bytes(file_path).await { + Ok(text) if !text.trim().is_empty() => { + let word_count = text.split_whitespace().count(); + if word_count > 5 { + let processing_time = start_time.elapsed().as_millis() as u64; + info!("Direct text extraction succeeded for '{}': {} words", file_path, word_count); + return Ok((text, processing_time)); + } else { + debug!("Direct extraction only got {} words, trying OCR", word_count); + } + } + Ok(_) => { + debug!("Direct text extraction returned empty text"); + } + Err(e) => { + debug!("Direct text extraction failed: {}", e); } } - info!("Quick extraction failed, trying recovery strategies for: {}", file_path); + info!("Direct extraction insufficient for '{}', using OCR extraction", file_path); - // Strategy 2: Try with --fix-metadata for corrupted metadata - let temp_fixed_pdf = format!("{}/fixed_{}_{}.pdf", self.temp_dir, std::process::id(), - std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)?.as_millis()); - - ocrmypdf_result = tokio::process::Command::new("ocrmypdf") - .arg("--fix-metadata") // Fix metadata issues - .arg("--skip-text") // Still skip OCR for speed + // Strategy 3: Use ocrmypdf --sidecar to extract existing OCR text + let ocrmypdf_result = tokio::process::Command::new("ocrmypdf") .arg("--sidecar") .arg(&temp_text_path) .arg(file_path) - .arg(&temp_fixed_pdf) + .arg("-") // Dummy output (we only want sidecar) .output() .await; - if ocrmypdf_result.is_ok() && ocrmypdf_result.as_ref().unwrap().status.success() { - if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await { - let _ = tokio::fs::remove_file(&temp_text_path).await; - let _ = tokio::fs::remove_file(&temp_fixed_pdf).await; - let processing_time = start_time.elapsed().as_millis() as u64; - return Ok((text.trim().to_string(), processing_time)); + if let Ok(output) = &ocrmypdf_result { + if output.status.success() { + if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await { + let _ = tokio::fs::remove_file(&temp_text_path).await; + let word_count = text.split_whitespace().count(); + if word_count > 0 { + let processing_time = start_time.elapsed().as_millis() as u64; + info!("ocrmypdf --sidecar extracted {} words from: {}", word_count, file_path); + return Ok((text.trim().to_string(), processing_time)); + } + } + } else { + let stderr = 
String::from_utf8_lossy(&output.stderr); + debug!("ocrmypdf --sidecar failed: {}", stderr); + + // Check if the error indicates the page already has text + if stderr.contains("page already has text") { + // This is good - it means there's already text, we should use pdftotext + warn!("ocrmypdf detected existing text in PDF, this should have been caught by pdftotext"); + } } } - // Strategy 3: Try with --remove-background for scanned documents - ocrmypdf_result = tokio::process::Command::new("ocrmypdf") - .arg("--remove-background") - .arg("--skip-text") - .arg("--sidecar") - .arg(&temp_text_path) - .arg(file_path) - .arg(&temp_fixed_pdf) - .output() - .await; - - if ocrmypdf_result.is_ok() && ocrmypdf_result.as_ref().unwrap().status.success() { - if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await { - let _ = tokio::fs::remove_file(&temp_text_path).await; - let _ = tokio::fs::remove_file(&temp_fixed_pdf).await; - let processing_time = start_time.elapsed().as_millis() as u64; - return Ok((text.trim().to_string(), processing_time)); - } - } - - // Clean up temporary files - let _ = tokio::fs::remove_file(&temp_text_path).await; - let _ = tokio::fs::remove_file(&temp_fixed_pdf).await; - - // Last resort: try to extract any readable text directly from the PDF file - warn!("All ocrmypdf strategies failed, trying direct text extraction from: {}", file_path); + // Strategy 4: Last resort - direct byte-level text extraction + warn!("Standard extraction methods failed, trying direct text extraction from: {}", file_path); match self.extract_text_from_pdf_bytes(file_path).await { Ok(text) if !text.trim().is_empty() => { let processing_time = start_time.elapsed().as_millis() as u64; - info!("Direct text extraction succeeded for: {}", file_path); + let word_count = text.split_whitespace().count(); + info!("Direct text extraction succeeded for '{}': {} words", file_path, word_count); Ok((text, processing_time)) } Ok(_) => { warn!("Direct text extraction returned empty text for: {}", file_path); // If all strategies fail, return the last error - match ocrmypdf_result { - Ok(output) => { - let stderr = String::from_utf8_lossy(&output.stderr); - Err(anyhow!("All PDF extraction strategies failed. Last error: {}", stderr)) - } - Err(e) => Err(anyhow!("Failed to run ocrmypdf: {}", e)), + if let Ok(ref output) = ocrmypdf_result { + let stderr = String::from_utf8_lossy(&output.stderr); + Err(anyhow!("All PDF extraction strategies failed. Last error: {}", stderr)) + } else { + Err(anyhow!("All PDF extraction strategies failed")) } } Err(e) => { warn!("Direct text extraction also failed for {}: {}", file_path, e); // If all strategies fail, return the last error - match ocrmypdf_result { - Ok(output) => { - let stderr = String::from_utf8_lossy(&output.stderr); - Err(anyhow!("All PDF extraction strategies failed. Last error: {}", stderr)) - } - Err(e) => Err(anyhow!("Failed to run ocrmypdf: {}", e)), + if let Ok(ref output) = ocrmypdf_result { + let stderr = String::from_utf8_lossy(&output.stderr); + Err(anyhow!("All PDF extraction strategies failed. 
Last error: {}", stderr)) + } else { + Err(anyhow!("All PDF extraction strategies failed: {}", e)) } } } diff --git a/src/ocr/mod.rs b/src/ocr/mod.rs index e64a907..d521e1e 100644 --- a/src/ocr/mod.rs +++ b/src/ocr/mod.rs @@ -80,24 +80,32 @@ impl OcrService { let temp_text_path = format!("{}/pdf_text_{}.txt", temp_dir, std::process::id()); // Progressive extraction with fallback strategies - let mut output = tokio::process::Command::new("ocrmypdf") - .arg("--skip-text") // Extract existing text without OCR processing - .arg("--sidecar") // Extract text to sidecar file - .arg(&temp_text_path) + // Strategy 1: pdftotext for existing text (fastest) + let mut output = tokio::process::Command::new("pdftotext") + .arg("-layout") // Preserve layout .arg(file_path) - .arg("-") // Dummy output (required) + .arg(&temp_text_path) .output() .await?; + if output.status.success() { + // Check if we got substantial text + if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await { + let word_count = text.split_whitespace().count(); + if word_count > 5 { + let _ = tokio::fs::remove_file(&temp_text_path).await; + return Ok(text.trim().to_string()); + } + } + } + if !output.status.success() { - // Try with metadata fixing for corrupted files + // Strategy 2: ocrmypdf sidecar (when pdftotext fails) output = tokio::process::Command::new("ocrmypdf") - .arg("--fix-metadata") // Fix corrupted metadata - .arg("--skip-text") // Still extract existing text only - .arg("--sidecar") + .arg("--sidecar") // Extract text to sidecar file .arg(&temp_text_path) .arg(file_path) - .arg("-") + .arg("-") // Dummy output .output() .await?;