feat(ocr): use ocrmypdf and pdftotext to get OCR layer if it already exists

This commit is contained in:
perf3ct 2025-07-15 15:59:29 +00:00
parent 3df5b5ef1d
commit 564c564613
No known key found for this signature in database
GPG Key ID: 569C4EEC436F5232
3 changed files with 352 additions and 76 deletions

View File

@ -0,0 +1,232 @@
use std::env;
use std::process;
use tokio;
use anyhow::Result;
/// Probe `pdftotext` (poppler-utils) on `file_path` and print diagnostics.
///
/// Extracted text is written to a per-process temp file, read back, and
/// returned as `(text, word_count)`. Any tool/read failure is reported on
/// stdout and `(String::new(), 0)` is returned so the caller can fall
/// through to the next extraction strategy.
///
/// # Errors
/// Returns `Err` only if spawning the `pdftotext` process itself fails.
async fn test_pdftotext(file_path: &str) -> Result<(String, usize)> {
    println!("=== Testing pdftotext ===");
    // Per-process temp path; pdftotext writes its output here.
    let temp_text_path = format!("/tmp/debug_pdftotext_{}.txt", std::process::id());
    let output = tokio::process::Command::new("pdftotext")
        .arg("-layout") // preserve the page layout in the extracted text
        .arg(file_path)
        .arg(&temp_text_path)
        .output()
        .await?;
    println!("pdftotext exit status: {}", output.status);
    if !output.stderr.is_empty() {
        println!("pdftotext stderr: {}", String::from_utf8_lossy(&output.stderr));
    }
    let mut result = (String::new(), 0);
    if output.status.success() {
        match tokio::fs::read_to_string(&temp_text_path).await {
            Ok(text) => {
                let word_count = text.split_whitespace().count();
                println!("pdftotext extracted {} words", word_count);
                println!("First 200 chars: {:?}", &text.chars().take(200).collect::<String>());
                result = (text, word_count);
            }
            Err(_) => println!("Failed to read pdftotext output file"),
        }
    } else {
        println!("pdftotext failed");
    }
    // Fix: always clean up the temp file. The original removed it only on
    // the full success path, leaking it when the command or the read failed
    // (pdftotext may still have created a partial output file).
    let _ = tokio::fs::remove_file(&temp_text_path).await;
    Ok(result)
}
async fn test_ocrmypdf_sidecar(file_path: &str) -> Result<(String, usize)> {
println!("\n=== Testing ocrmypdf --sidecar ===");
let temp_text_path = format!("/tmp/debug_ocrmypdf_{}.txt", std::process::id());
let output = tokio::process::Command::new("ocrmypdf")
.arg("--sidecar")
.arg(&temp_text_path)
.arg(file_path)
.arg("-") // Dummy output
.output()
.await?;
println!("ocrmypdf --sidecar exit status: {}", output.status);
if !output.stderr.is_empty() {
println!("ocrmypdf --sidecar stderr: {}", String::from_utf8_lossy(&output.stderr));
}
if output.status.success() {
if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
let word_count = text.split_whitespace().count();
println!("ocrmypdf --sidecar extracted {} words", word_count);
println!("First 200 chars: {:?}", &text.chars().take(200).collect::<String>());
// Clean up
let _ = tokio::fs::remove_file(&temp_text_path).await;
return Ok((text, word_count));
} else {
println!("Failed to read ocrmypdf sidecar output file");
}
} else {
println!("ocrmypdf --sidecar failed");
}
Ok((String::new(), 0))
}
async fn test_direct_extraction(file_path: &str) -> Result<(String, usize)> {
println!("\n=== Testing direct text extraction ===");
let bytes = tokio::fs::read(file_path).await?;
println!("PDF file size: {} bytes", bytes.len());
// Look for readable ASCII text in the PDF
let mut ascii_text = String::new();
let mut current_word = String::new();
for &byte in &bytes {
if byte >= 32 && byte <= 126 { // Printable ASCII
current_word.push(byte as char);
} else {
if current_word.len() > 3 { // Only keep words longer than 3 characters
ascii_text.push_str(&current_word);
ascii_text.push(' ');
}
current_word.clear();
}
}
// Add the last word if it's long enough
if current_word.len() > 3 {
ascii_text.push_str(&current_word);
}
// Clean up the text
let cleaned_text = ascii_text
.split_whitespace()
.filter(|word| word.len() > 1) // Filter out single characters
.collect::<Vec<_>>()
.join(" ");
let word_count = cleaned_text.split_whitespace().count();
println!("Direct extraction got {} words", word_count);
println!("First 200 chars: {:?}", &cleaned_text.chars().take(200).collect::<String>());
Ok((cleaned_text, word_count))
}
/// Replicate the production quality-assessment heuristic and print a
/// PASS/FAIL verdict for the given extraction result.
///
/// Checks, in order: non-empty word count; small-file shortcut; substantial
/// word count; word density vs. file size; alphanumeric character ratio.
/// Purely diagnostic — prints its conclusion and returns.
async fn test_quality_assessment(text: &str, word_count: usize, file_size: u64) {
    println!("\n=== Testing quality assessment ===");
    if word_count == 0 {
        println!("Quality check: FAIL - no words");
        return;
    }
    // For very small files, a low word count might be normal.
    if file_size < 50_000 && word_count >= 1 {
        println!("Quality check: PASS - small file with some text");
        return;
    }
    // Word density (words per KB) separates text-based from image-based PDFs.
    let file_size_kb = (file_size as f64) / 1024.0;
    let word_density = (word_count as f64) / file_size_kb;
    const MIN_WORD_DENSITY: f64 = 5.0;
    const MIN_WORDS_FOR_LARGE_FILES: usize = 10;
    const SUBSTANTIAL_WORD_COUNT: usize = 50;
    println!("File size: {:.1} KB", file_size_kb);
    println!("Word density: {:.2} words/KB", word_density);
    // Substantial text is accepted regardless of density.
    if word_count >= SUBSTANTIAL_WORD_COUNT {
        println!("Quality check: PASS - substantial text content ({} words)", word_count);
        return;
    }
    if word_density < MIN_WORD_DENSITY && word_count < MIN_WORDS_FOR_LARGE_FILES {
        println!("Quality check: FAIL - appears to be image-based ({} words, {:.2} words/KB)", word_count, word_density);
        return;
    }
    // Mostly non-alphanumeric text suggests extraction artifacts.
    let total_chars = text.chars().count();
    let alphanumeric_chars = text.chars().filter(|c| c.is_alphanumeric()).count();
    // Fix: divide by the CHAR count, not `text.len()` (a BYTE count). The
    // original deflated the ratio for any multibyte (non-ASCII) text, which
    // could falsely fail the 30% threshold below.
    let alphanumeric_ratio = if total_chars > 0 {
        (alphanumeric_chars as f64) / (total_chars as f64)
    } else {
        0.0
    };
    println!("Alphanumeric ratio: {:.1}%", alphanumeric_ratio * 100.0);
    // Below 30% alphanumeric content, extraction quality is likely poor.
    if alphanumeric_ratio < 0.3 {
        println!("Quality check: FAIL - low alphanumeric content ({:.1}%)", alphanumeric_ratio * 100.0);
        return;
    }
    println!("Quality check: PASS - {} words, {:.2} words/KB, {:.1}% alphanumeric",
        word_count, word_density, alphanumeric_ratio * 100.0);
}
/// Entry point: run every extraction strategy against the PDF given on the
/// command line, assess each result, and print which strategy the
/// production selection logic would choose.
#[tokio::main]
async fn main() -> Result<()> {
    let args: Vec<String> = env::args().collect();
    if args.len() != 2 {
        eprintln!("Usage: {} <pdf_file_path>", args[0]);
        process::exit(1);
    }
    let pdf_path = &args[1];
    println!("Debugging PDF extraction for: {}", pdf_path);
    // Fix: one metadata call serves as both the existence check and the
    // size query. The original stat'ed the file twice and used the
    // `!(...).is_ok()` anti-idiom instead of `.is_err()`.
    let file_size = match tokio::fs::metadata(pdf_path).await {
        Ok(meta) => meta.len(),
        Err(_) => {
            eprintln!("Error: File '{}' not found", pdf_path);
            process::exit(1);
        }
    };
    println!("File size: {} bytes ({:.2} MB)", file_size, file_size as f64 / (1024.0 * 1024.0));
    // Run each extraction strategy in turn.
    let (pdftotext_text, pdftotext_words) = test_pdftotext(pdf_path).await?;
    let (ocrmypdf_text, ocrmypdf_words) = test_ocrmypdf_sidecar(pdf_path).await?;
    let (direct_text, direct_words) = test_direct_extraction(pdf_path).await?;
    // Assess quality only for strategies that produced any words.
    if pdftotext_words > 0 {
        test_quality_assessment(&pdftotext_text, pdftotext_words, file_size).await;
    }
    if ocrmypdf_words > 0 {
        test_quality_assessment(&ocrmypdf_text, ocrmypdf_words, file_size).await;
    }
    if direct_words > 0 {
        test_quality_assessment(&direct_text, direct_words, file_size).await;
    }
    println!("\n=== Summary ===");
    println!("pdftotext: {} words", pdftotext_words);
    println!("ocrmypdf --sidecar: {} words", ocrmypdf_words);
    println!("direct extraction: {} words", direct_words);
    // Mirror the production selection order: pdftotext, then direct
    // extraction, then ocrmypdf sidecar.
    if pdftotext_words > 5 {
        println!("Expected result: Use pdftotext ({} words)", pdftotext_words);
    } else if direct_words > 5 {
        println!("Expected result: Use direct extraction ({} words)", direct_words);
    } else if ocrmypdf_words > 0 {
        println!("Expected result: Use ocrmypdf --sidecar ({} words)", ocrmypdf_words);
    } else {
        println!("Expected result: All methods failed");
    }
    Ok(())
}

View File

@ -872,7 +872,7 @@ impl EnhancedOcrService {
confidence: 95.0,
processing_time_ms: extraction_time,
word_count,
preprocessing_applied: vec!["PDF text extraction (ocrmypdf --skip-text)".to_string()],
preprocessing_applied: vec!["PDF text extraction (pdftotext)".to_string()],
processed_image_path: None,
});
} else {
@ -938,8 +938,16 @@ impl EnhancedOcrService {
// Reasonable thresholds based on typical PDF content:
// - Text-based PDFs typically have 50-200 words per KB
// - Below 5 words per KB suggests mostly images/scanned content
// - But if we have a substantial number of words (>50), accept it regardless of density
const MIN_WORD_DENSITY: f64 = 5.0;
const MIN_WORDS_FOR_LARGE_FILES: usize = 10;
const SUBSTANTIAL_WORD_COUNT: usize = 50;
// If we have substantial text, accept it regardless of density
if word_count >= SUBSTANTIAL_WORD_COUNT {
debug!("PDF has substantial text content: {} words, accepting regardless of density", word_count);
return true;
}
if word_density < MIN_WORD_DENSITY && word_count < MIN_WORDS_FOR_LARGE_FILES {
debug!("PDF appears to be image-based: {} words in {:.1} KB (density: {:.2} words/KB)",
@ -1122,102 +1130,130 @@ impl EnhancedOcrService {
);
let temp_text_path = format!("{}/{}", self.temp_dir, temp_text_filename);
// Strategy 1: Fast extraction with --skip-text (extracts existing text, no OCR)
let mut ocrmypdf_result = tokio::process::Command::new("ocrmypdf")
.arg("--skip-text") // Extract existing text without OCR processing
.arg("--sidecar") // Extract text to sidecar file
.arg(&temp_text_path)
// Strategy 1: Fast text extraction using pdftotext (for existing text)
debug!("Trying pdftotext for existing text extraction: {}", file_path);
debug!("Using temp file path: {}", temp_text_path);
let pdftotext_result = tokio::process::Command::new("pdftotext")
.arg("-layout") // Preserve layout
.arg(file_path)
.arg("-") // Dummy output (required by ocrmypdf)
.arg(&temp_text_path)
.output()
.await;
if ocrmypdf_result.is_ok() && ocrmypdf_result.as_ref().unwrap().status.success() {
if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
let _ = tokio::fs::remove_file(&temp_text_path).await;
let processing_time = start_time.elapsed().as_millis() as u64;
return Ok((text.trim().to_string(), processing_time));
if let Ok(output) = pdftotext_result {
debug!("pdftotext exit status: {}", output.status);
if !output.stderr.is_empty() {
debug!("pdftotext stderr: {}", String::from_utf8_lossy(&output.stderr));
}
if output.status.success() {
if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
let _ = tokio::fs::remove_file(&temp_text_path).await;
let word_count = text.split_whitespace().count();
debug!("pdftotext extracted {} words from temp file", word_count);
// If we got substantial text (more than a few words), use it
if word_count > 5 {
let processing_time = start_time.elapsed().as_millis() as u64;
info!("pdftotext extracted {} words from: {}", word_count, file_path);
return Ok((text.trim().to_string(), processing_time));
} else {
debug!("pdftotext only extracted {} words, will try direct extraction before OCR", word_count);
}
} else {
debug!("Failed to read pdftotext output file: {}", temp_text_path);
}
} else {
let stderr = String::from_utf8_lossy(&output.stderr);
debug!("pdftotext failed with status {}: {}", output.status, stderr);
}
} else {
debug!("Failed to execute pdftotext command");
}
info!("pdftotext extraction insufficient for '{}', trying direct extraction before OCR", file_path);
// Strategy 2: Try direct text extraction (often works when pdftotext fails)
match self.extract_text_from_pdf_bytes(file_path).await {
Ok(text) if !text.trim().is_empty() => {
let word_count = text.split_whitespace().count();
if word_count > 5 {
let processing_time = start_time.elapsed().as_millis() as u64;
info!("Direct text extraction succeeded for '{}': {} words", file_path, word_count);
return Ok((text, processing_time));
} else {
debug!("Direct extraction only got {} words, trying OCR", word_count);
}
}
Ok(_) => {
debug!("Direct text extraction returned empty text");
}
Err(e) => {
debug!("Direct text extraction failed: {}", e);
}
}
info!("Quick extraction failed, trying recovery strategies for: {}", file_path);
info!("Direct extraction insufficient for '{}', using OCR extraction", file_path);
// Strategy 2: Try with --fix-metadata for corrupted metadata
let temp_fixed_pdf = format!("{}/fixed_{}_{}.pdf", self.temp_dir, std::process::id(),
std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)?.as_millis());
ocrmypdf_result = tokio::process::Command::new("ocrmypdf")
.arg("--fix-metadata") // Fix metadata issues
.arg("--skip-text") // Still skip OCR for speed
// Strategy 3: Use ocrmypdf --sidecar to extract existing OCR text
let ocrmypdf_result = tokio::process::Command::new("ocrmypdf")
.arg("--sidecar")
.arg(&temp_text_path)
.arg(file_path)
.arg(&temp_fixed_pdf)
.arg("-") // Dummy output (we only want sidecar)
.output()
.await;
if ocrmypdf_result.is_ok() && ocrmypdf_result.as_ref().unwrap().status.success() {
if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
let _ = tokio::fs::remove_file(&temp_text_path).await;
let _ = tokio::fs::remove_file(&temp_fixed_pdf).await;
let processing_time = start_time.elapsed().as_millis() as u64;
return Ok((text.trim().to_string(), processing_time));
if let Ok(output) = &ocrmypdf_result {
if output.status.success() {
if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
let _ = tokio::fs::remove_file(&temp_text_path).await;
let word_count = text.split_whitespace().count();
if word_count > 0 {
let processing_time = start_time.elapsed().as_millis() as u64;
info!("ocrmypdf --sidecar extracted {} words from: {}", word_count, file_path);
return Ok((text.trim().to_string(), processing_time));
}
}
} else {
let stderr = String::from_utf8_lossy(&output.stderr);
debug!("ocrmypdf --sidecar failed: {}", stderr);
// Check if the error indicates the page already has text
if stderr.contains("page already has text") {
// This is good - it means there's already text, we should use pdftotext
warn!("ocrmypdf detected existing text in PDF, this should have been caught by pdftotext");
}
}
}
// Strategy 3: Try with --remove-background for scanned documents
ocrmypdf_result = tokio::process::Command::new("ocrmypdf")
.arg("--remove-background")
.arg("--skip-text")
.arg("--sidecar")
.arg(&temp_text_path)
.arg(file_path)
.arg(&temp_fixed_pdf)
.output()
.await;
if ocrmypdf_result.is_ok() && ocrmypdf_result.as_ref().unwrap().status.success() {
if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
let _ = tokio::fs::remove_file(&temp_text_path).await;
let _ = tokio::fs::remove_file(&temp_fixed_pdf).await;
let processing_time = start_time.elapsed().as_millis() as u64;
return Ok((text.trim().to_string(), processing_time));
}
}
// Clean up temporary files
let _ = tokio::fs::remove_file(&temp_text_path).await;
let _ = tokio::fs::remove_file(&temp_fixed_pdf).await;
// Last resort: try to extract any readable text directly from the PDF file
warn!("All ocrmypdf strategies failed, trying direct text extraction from: {}", file_path);
// Strategy 3: Last resort - direct byte-level text extraction
warn!("Standard extraction methods failed, trying direct text extraction from: {}", file_path);
match self.extract_text_from_pdf_bytes(file_path).await {
Ok(text) if !text.trim().is_empty() => {
let processing_time = start_time.elapsed().as_millis() as u64;
info!("Direct text extraction succeeded for: {}", file_path);
let word_count = text.split_whitespace().count();
info!("Direct text extraction succeeded for '{}': {} words", file_path, word_count);
Ok((text, processing_time))
}
Ok(_) => {
warn!("Direct text extraction returned empty text for: {}", file_path);
// If all strategies fail, return the last error
match ocrmypdf_result {
Ok(output) => {
let stderr = String::from_utf8_lossy(&output.stderr);
Err(anyhow!("All PDF extraction strategies failed. Last error: {}", stderr))
}
Err(e) => Err(anyhow!("Failed to run ocrmypdf: {}", e)),
if let Ok(ref output) = ocrmypdf_result {
let stderr = String::from_utf8_lossy(&output.stderr);
Err(anyhow!("All PDF extraction strategies failed. Last error: {}", stderr))
} else {
Err(anyhow!("All PDF extraction strategies failed"))
}
}
Err(e) => {
warn!("Direct text extraction also failed for {}: {}", file_path, e);
// If all strategies fail, return the last error
match ocrmypdf_result {
Ok(output) => {
let stderr = String::from_utf8_lossy(&output.stderr);
Err(anyhow!("All PDF extraction strategies failed. Last error: {}", stderr))
}
Err(e) => Err(anyhow!("Failed to run ocrmypdf: {}", e)),
if let Ok(ref output) = ocrmypdf_result {
let stderr = String::from_utf8_lossy(&output.stderr);
Err(anyhow!("All PDF extraction strategies failed. Last error: {}", stderr))
} else {
Err(anyhow!("All PDF extraction strategies failed: {}", e))
}
}
}

View File

@ -80,24 +80,32 @@ impl OcrService {
let temp_text_path = format!("{}/pdf_text_{}.txt", temp_dir, std::process::id());
// Progressive extraction with fallback strategies
let mut output = tokio::process::Command::new("ocrmypdf")
.arg("--skip-text") // Extract existing text without OCR processing
.arg("--sidecar") // Extract text to sidecar file
.arg(&temp_text_path)
// Strategy 1: pdftotext for existing text (fastest)
let mut output = tokio::process::Command::new("pdftotext")
.arg("-layout") // Preserve layout
.arg(file_path)
.arg("-") // Dummy output (required)
.arg(&temp_text_path)
.output()
.await?;
if output.status.success() {
// Check if we got substantial text
if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
let word_count = text.split_whitespace().count();
if word_count > 5 {
let _ = tokio::fs::remove_file(&temp_text_path).await;
return Ok(text.trim().to_string());
}
}
}
if !output.status.success() {
// Try with metadata fixing for corrupted files
// Strategy 2: ocrmypdf sidecar (when pdftotext fails)
output = tokio::process::Command::new("ocrmypdf")
.arg("--fix-metadata") // Fix corrupted metadata
.arg("--skip-text") // Still extract existing text only
.arg("--sidecar")
.arg("--sidecar") // Extract text to sidecar file
.arg(&temp_text_path)
.arg(file_path)
.arg("-")
.arg("-") // Dummy output
.output()
.await?;