feat(ocr): use ocrmypdf and pdftotext to get OCR layer if it already exists
This commit is contained in:
parent
3df5b5ef1d
commit
564c564613
|
|
@ -0,0 +1,232 @@
|
||||||
|
use std::env;
|
||||||
|
use std::process;
|
||||||
|
use tokio;
|
||||||
|
use anyhow::Result;
|
||||||
|
|
||||||
|
async fn test_pdftotext(file_path: &str) -> Result<(String, usize)> {
|
||||||
|
println!("=== Testing pdftotext ===");
|
||||||
|
|
||||||
|
let temp_text_path = format!("/tmp/debug_pdftotext_{}.txt", std::process::id());
|
||||||
|
|
||||||
|
let output = tokio::process::Command::new("pdftotext")
|
||||||
|
.arg("-layout")
|
||||||
|
.arg(file_path)
|
||||||
|
.arg(&temp_text_path)
|
||||||
|
.output()
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
println!("pdftotext exit status: {}", output.status);
|
||||||
|
if !output.stderr.is_empty() {
|
||||||
|
println!("pdftotext stderr: {}", String::from_utf8_lossy(&output.stderr));
|
||||||
|
}
|
||||||
|
|
||||||
|
if output.status.success() {
|
||||||
|
if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
|
||||||
|
let word_count = text.split_whitespace().count();
|
||||||
|
println!("pdftotext extracted {} words", word_count);
|
||||||
|
println!("First 200 chars: {:?}", &text.chars().take(200).collect::<String>());
|
||||||
|
|
||||||
|
// Clean up
|
||||||
|
let _ = tokio::fs::remove_file(&temp_text_path).await;
|
||||||
|
return Ok((text, word_count));
|
||||||
|
} else {
|
||||||
|
println!("Failed to read pdftotext output file");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
println!("pdftotext failed");
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok((String::new(), 0))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn test_ocrmypdf_sidecar(file_path: &str) -> Result<(String, usize)> {
|
||||||
|
println!("\n=== Testing ocrmypdf --sidecar ===");
|
||||||
|
|
||||||
|
let temp_text_path = format!("/tmp/debug_ocrmypdf_{}.txt", std::process::id());
|
||||||
|
|
||||||
|
let output = tokio::process::Command::new("ocrmypdf")
|
||||||
|
.arg("--sidecar")
|
||||||
|
.arg(&temp_text_path)
|
||||||
|
.arg(file_path)
|
||||||
|
.arg("-") // Dummy output
|
||||||
|
.output()
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
println!("ocrmypdf --sidecar exit status: {}", output.status);
|
||||||
|
if !output.stderr.is_empty() {
|
||||||
|
println!("ocrmypdf --sidecar stderr: {}", String::from_utf8_lossy(&output.stderr));
|
||||||
|
}
|
||||||
|
|
||||||
|
if output.status.success() {
|
||||||
|
if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
|
||||||
|
let word_count = text.split_whitespace().count();
|
||||||
|
println!("ocrmypdf --sidecar extracted {} words", word_count);
|
||||||
|
println!("First 200 chars: {:?}", &text.chars().take(200).collect::<String>());
|
||||||
|
|
||||||
|
// Clean up
|
||||||
|
let _ = tokio::fs::remove_file(&temp_text_path).await;
|
||||||
|
return Ok((text, word_count));
|
||||||
|
} else {
|
||||||
|
println!("Failed to read ocrmypdf sidecar output file");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
println!("ocrmypdf --sidecar failed");
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok((String::new(), 0))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn test_direct_extraction(file_path: &str) -> Result<(String, usize)> {
|
||||||
|
println!("\n=== Testing direct text extraction ===");
|
||||||
|
|
||||||
|
let bytes = tokio::fs::read(file_path).await?;
|
||||||
|
println!("PDF file size: {} bytes", bytes.len());
|
||||||
|
|
||||||
|
// Look for readable ASCII text in the PDF
|
||||||
|
let mut ascii_text = String::new();
|
||||||
|
let mut current_word = String::new();
|
||||||
|
|
||||||
|
for &byte in &bytes {
|
||||||
|
if byte >= 32 && byte <= 126 { // Printable ASCII
|
||||||
|
current_word.push(byte as char);
|
||||||
|
} else {
|
||||||
|
if current_word.len() > 3 { // Only keep words longer than 3 characters
|
||||||
|
ascii_text.push_str(¤t_word);
|
||||||
|
ascii_text.push(' ');
|
||||||
|
}
|
||||||
|
current_word.clear();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add the last word if it's long enough
|
||||||
|
if current_word.len() > 3 {
|
||||||
|
ascii_text.push_str(¤t_word);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Clean up the text
|
||||||
|
let cleaned_text = ascii_text
|
||||||
|
.split_whitespace()
|
||||||
|
.filter(|word| word.len() > 1) // Filter out single characters
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.join(" ");
|
||||||
|
|
||||||
|
let word_count = cleaned_text.split_whitespace().count();
|
||||||
|
println!("Direct extraction got {} words", word_count);
|
||||||
|
println!("First 200 chars: {:?}", &cleaned_text.chars().take(200).collect::<String>());
|
||||||
|
|
||||||
|
Ok((cleaned_text, word_count))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Replicate the production quality-assessment heuristics and print a
/// PASS/FAIL verdict for one extraction result.
///
/// Checks, in order: non-zero word count; small-file shortcut; a
/// substantial-word-count shortcut; word density (words/KB); and an
/// alphanumeric-content ratio.
///
/// NOTE(review): the ratio divides by `text.len()`, a *byte* count, so
/// it is skewed low for multi-byte UTF-8 text — kept as-is to mirror
/// the production logic; confirm against the real implementation.
async fn test_quality_assessment(text: &str, word_count: usize, file_size: u64) {
    println!("\n=== Testing quality assessment ===");

    if word_count == 0 {
        println!("Quality check: FAIL - no words");
        return;
    }

    // For very small files, low word count might be normal.
    if file_size < 50_000 && word_count >= 1 {
        println!("Quality check: PASS - small file with some text");
        return;
    }

    // Word density in words per KB.
    let file_size_kb = (file_size as f64) / 1024.0;
    let word_density = (word_count as f64) / file_size_kb;

    const MIN_WORD_DENSITY: f64 = 5.0;
    const MIN_WORDS_FOR_LARGE_FILES: usize = 10;
    const SUBSTANTIAL_WORD_COUNT: usize = 50;

    println!("File size: {:.1} KB", file_size_kb);
    println!("Word density: {:.2} words/KB", word_density);

    // Substantial text is accepted regardless of density.
    if word_count >= SUBSTANTIAL_WORD_COUNT {
        println!("Quality check: PASS - substantial text content ({} words)", word_count);
        return;
    }

    if word_density < MIN_WORD_DENSITY && word_count < MIN_WORDS_FOR_LARGE_FILES {
        println!("Quality check: FAIL - appears to be image-based ({} words, {:.2} words/KB)", word_count, word_density);
        return;
    }

    // Mostly non-alphanumeric text usually means extraction artifacts.
    let alphanumeric_chars = text.chars().filter(|c| c.is_alphanumeric()).count();
    let alphanumeric_ratio = if !text.is_empty() {
        (alphanumeric_chars as f64) / (text.len() as f64)
    } else {
        0.0
    };

    println!("Alphanumeric ratio: {:.1}%", alphanumeric_ratio * 100.0);

    // Below 30% alphanumeric content: likely poor extraction.
    if alphanumeric_ratio < 0.3 {
        println!("Quality check: FAIL - low alphanumeric content ({:.1}%)", alphanumeric_ratio * 100.0);
        return;
    }

    println!("Quality check: PASS - {} words, {:.2} words/KB, {:.1}% alphanumeric",
        word_count, word_density, alphanumeric_ratio * 100.0);
}
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() -> Result<()> {
|
||||||
|
let args: Vec<String> = env::args().collect();
|
||||||
|
if args.len() != 2 {
|
||||||
|
eprintln!("Usage: {} <pdf_file_path>", args[0]);
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
let pdf_path = &args[1];
|
||||||
|
println!("Debugging PDF extraction for: {}", pdf_path);
|
||||||
|
|
||||||
|
// Check if file exists
|
||||||
|
if !tokio::fs::metadata(pdf_path).await.is_ok() {
|
||||||
|
eprintln!("Error: File '{}' not found", pdf_path);
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
let file_size = tokio::fs::metadata(pdf_path).await?.len();
|
||||||
|
println!("File size: {} bytes ({:.2} MB)", file_size, file_size as f64 / (1024.0 * 1024.0));
|
||||||
|
|
||||||
|
// Test each extraction method
|
||||||
|
let (pdftotext_text, pdftotext_words) = test_pdftotext(pdf_path).await?;
|
||||||
|
let (ocrmypdf_text, ocrmypdf_words) = test_ocrmypdf_sidecar(pdf_path).await?;
|
||||||
|
let (direct_text, direct_words) = test_direct_extraction(pdf_path).await?;
|
||||||
|
|
||||||
|
// Test quality assessment on each result
|
||||||
|
if pdftotext_words > 0 {
|
||||||
|
test_quality_assessment(&pdftotext_text, pdftotext_words, file_size).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ocrmypdf_words > 0 {
|
||||||
|
test_quality_assessment(&ocrmypdf_text, ocrmypdf_words, file_size).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
if direct_words > 0 {
|
||||||
|
test_quality_assessment(&direct_text, direct_words, file_size).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
println!("\n=== Summary ===");
|
||||||
|
println!("pdftotext: {} words", pdftotext_words);
|
||||||
|
println!("ocrmypdf --sidecar: {} words", ocrmypdf_words);
|
||||||
|
println!("direct extraction: {} words", direct_words);
|
||||||
|
|
||||||
|
// Determine what should happen based on the logic
|
||||||
|
if pdftotext_words > 5 {
|
||||||
|
println!("Expected result: Use pdftotext ({} words)", pdftotext_words);
|
||||||
|
} else if direct_words > 5 {
|
||||||
|
println!("Expected result: Use direct extraction ({} words)", direct_words);
|
||||||
|
} else if ocrmypdf_words > 0 {
|
||||||
|
println!("Expected result: Use ocrmypdf --sidecar ({} words)", ocrmypdf_words);
|
||||||
|
} else {
|
||||||
|
println!("Expected result: All methods failed");
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
@ -872,7 +872,7 @@ impl EnhancedOcrService {
|
||||||
confidence: 95.0,
|
confidence: 95.0,
|
||||||
processing_time_ms: extraction_time,
|
processing_time_ms: extraction_time,
|
||||||
word_count,
|
word_count,
|
||||||
preprocessing_applied: vec!["PDF text extraction (ocrmypdf --skip-text)".to_string()],
|
preprocessing_applied: vec!["PDF text extraction (pdftotext)".to_string()],
|
||||||
processed_image_path: None,
|
processed_image_path: None,
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -938,8 +938,16 @@ impl EnhancedOcrService {
|
||||||
// Reasonable thresholds based on typical PDF content:
|
// Reasonable thresholds based on typical PDF content:
|
||||||
// - Text-based PDFs typically have 50-200 words per KB
|
// - Text-based PDFs typically have 50-200 words per KB
|
||||||
// - Below 5 words per KB suggests mostly images/scanned content
|
// - Below 5 words per KB suggests mostly images/scanned content
|
||||||
|
// - But if we have a substantial number of words (>50), accept it regardless of density
|
||||||
const MIN_WORD_DENSITY: f64 = 5.0;
|
const MIN_WORD_DENSITY: f64 = 5.0;
|
||||||
const MIN_WORDS_FOR_LARGE_FILES: usize = 10;
|
const MIN_WORDS_FOR_LARGE_FILES: usize = 10;
|
||||||
|
const SUBSTANTIAL_WORD_COUNT: usize = 50;
|
||||||
|
|
||||||
|
// If we have substantial text, accept it regardless of density
|
||||||
|
if word_count >= SUBSTANTIAL_WORD_COUNT {
|
||||||
|
debug!("PDF has substantial text content: {} words, accepting regardless of density", word_count);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
if word_density < MIN_WORD_DENSITY && word_count < MIN_WORDS_FOR_LARGE_FILES {
|
if word_density < MIN_WORD_DENSITY && word_count < MIN_WORDS_FOR_LARGE_FILES {
|
||||||
debug!("PDF appears to be image-based: {} words in {:.1} KB (density: {:.2} words/KB)",
|
debug!("PDF appears to be image-based: {} words in {:.1} KB (density: {:.2} words/KB)",
|
||||||
|
|
@ -1122,102 +1130,130 @@ impl EnhancedOcrService {
|
||||||
);
|
);
|
||||||
let temp_text_path = format!("{}/{}", self.temp_dir, temp_text_filename);
|
let temp_text_path = format!("{}/{}", self.temp_dir, temp_text_filename);
|
||||||
|
|
||||||
// Strategy 1: Fast extraction with --skip-text (extracts existing text, no OCR)
|
// Strategy 1: Fast text extraction using pdftotext (for existing text)
|
||||||
let mut ocrmypdf_result = tokio::process::Command::new("ocrmypdf")
|
debug!("Trying pdftotext for existing text extraction: {}", file_path);
|
||||||
.arg("--skip-text") // Extract existing text without OCR processing
|
debug!("Using temp file path: {}", temp_text_path);
|
||||||
.arg("--sidecar") // Extract text to sidecar file
|
let pdftotext_result = tokio::process::Command::new("pdftotext")
|
||||||
.arg(&temp_text_path)
|
.arg("-layout") // Preserve layout
|
||||||
.arg(file_path)
|
.arg(file_path)
|
||||||
.arg("-") // Dummy output (required by ocrmypdf)
|
.arg(&temp_text_path)
|
||||||
.output()
|
.output()
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
if ocrmypdf_result.is_ok() && ocrmypdf_result.as_ref().unwrap().status.success() {
|
if let Ok(output) = pdftotext_result {
|
||||||
if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
|
debug!("pdftotext exit status: {}", output.status);
|
||||||
let _ = tokio::fs::remove_file(&temp_text_path).await;
|
if !output.stderr.is_empty() {
|
||||||
let processing_time = start_time.elapsed().as_millis() as u64;
|
debug!("pdftotext stderr: {}", String::from_utf8_lossy(&output.stderr));
|
||||||
return Ok((text.trim().to_string(), processing_time));
|
}
|
||||||
|
if output.status.success() {
|
||||||
|
if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
|
||||||
|
let _ = tokio::fs::remove_file(&temp_text_path).await;
|
||||||
|
let word_count = text.split_whitespace().count();
|
||||||
|
debug!("pdftotext extracted {} words from temp file", word_count);
|
||||||
|
|
||||||
|
// If we got substantial text (more than a few words), use it
|
||||||
|
if word_count > 5 {
|
||||||
|
let processing_time = start_time.elapsed().as_millis() as u64;
|
||||||
|
info!("pdftotext extracted {} words from: {}", word_count, file_path);
|
||||||
|
return Ok((text.trim().to_string(), processing_time));
|
||||||
|
} else {
|
||||||
|
debug!("pdftotext only extracted {} words, will try direct extraction before OCR", word_count);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
debug!("Failed to read pdftotext output file: {}", temp_text_path);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||||
|
debug!("pdftotext failed with status {}: {}", output.status, stderr);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
debug!("Failed to execute pdftotext command");
|
||||||
|
}
|
||||||
|
|
||||||
|
info!("pdftotext extraction insufficient for '{}', trying direct extraction before OCR", file_path);
|
||||||
|
|
||||||
|
// Strategy 2: Try direct text extraction (often works when pdftotext fails)
|
||||||
|
match self.extract_text_from_pdf_bytes(file_path).await {
|
||||||
|
Ok(text) if !text.trim().is_empty() => {
|
||||||
|
let word_count = text.split_whitespace().count();
|
||||||
|
if word_count > 5 {
|
||||||
|
let processing_time = start_time.elapsed().as_millis() as u64;
|
||||||
|
info!("Direct text extraction succeeded for '{}': {} words", file_path, word_count);
|
||||||
|
return Ok((text, processing_time));
|
||||||
|
} else {
|
||||||
|
debug!("Direct extraction only got {} words, trying OCR", word_count);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(_) => {
|
||||||
|
debug!("Direct text extraction returned empty text");
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
debug!("Direct text extraction failed: {}", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
info!("Quick extraction failed, trying recovery strategies for: {}", file_path);
|
info!("Direct extraction insufficient for '{}', using OCR extraction", file_path);
|
||||||
|
|
||||||
// Strategy 2: Try with --fix-metadata for corrupted metadata
|
// Strategy 3: Use ocrmypdf --sidecar to extract existing OCR text
|
||||||
let temp_fixed_pdf = format!("{}/fixed_{}_{}.pdf", self.temp_dir, std::process::id(),
|
let ocrmypdf_result = tokio::process::Command::new("ocrmypdf")
|
||||||
std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)?.as_millis());
|
|
||||||
|
|
||||||
ocrmypdf_result = tokio::process::Command::new("ocrmypdf")
|
|
||||||
.arg("--fix-metadata") // Fix metadata issues
|
|
||||||
.arg("--skip-text") // Still skip OCR for speed
|
|
||||||
.arg("--sidecar")
|
.arg("--sidecar")
|
||||||
.arg(&temp_text_path)
|
.arg(&temp_text_path)
|
||||||
.arg(file_path)
|
.arg(file_path)
|
||||||
.arg(&temp_fixed_pdf)
|
.arg("-") // Dummy output (we only want sidecar)
|
||||||
.output()
|
.output()
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
if ocrmypdf_result.is_ok() && ocrmypdf_result.as_ref().unwrap().status.success() {
|
if let Ok(output) = &ocrmypdf_result {
|
||||||
if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
|
if output.status.success() {
|
||||||
let _ = tokio::fs::remove_file(&temp_text_path).await;
|
if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
|
||||||
let _ = tokio::fs::remove_file(&temp_fixed_pdf).await;
|
let _ = tokio::fs::remove_file(&temp_text_path).await;
|
||||||
let processing_time = start_time.elapsed().as_millis() as u64;
|
let word_count = text.split_whitespace().count();
|
||||||
return Ok((text.trim().to_string(), processing_time));
|
if word_count > 0 {
|
||||||
|
let processing_time = start_time.elapsed().as_millis() as u64;
|
||||||
|
info!("ocrmypdf --sidecar extracted {} words from: {}", word_count, file_path);
|
||||||
|
return Ok((text.trim().to_string(), processing_time));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||||
|
debug!("ocrmypdf --sidecar failed: {}", stderr);
|
||||||
|
|
||||||
|
// Check if the error indicates the page already has text
|
||||||
|
if stderr.contains("page already has text") {
|
||||||
|
// This is good - it means there's already text, we should use pdftotext
|
||||||
|
warn!("ocrmypdf detected existing text in PDF, this should have been caught by pdftotext");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Strategy 3: Try with --remove-background for scanned documents
|
// Strategy 3: Last resort - direct byte-level text extraction
|
||||||
ocrmypdf_result = tokio::process::Command::new("ocrmypdf")
|
warn!("Standard extraction methods failed, trying direct text extraction from: {}", file_path);
|
||||||
.arg("--remove-background")
|
|
||||||
.arg("--skip-text")
|
|
||||||
.arg("--sidecar")
|
|
||||||
.arg(&temp_text_path)
|
|
||||||
.arg(file_path)
|
|
||||||
.arg(&temp_fixed_pdf)
|
|
||||||
.output()
|
|
||||||
.await;
|
|
||||||
|
|
||||||
if ocrmypdf_result.is_ok() && ocrmypdf_result.as_ref().unwrap().status.success() {
|
|
||||||
if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
|
|
||||||
let _ = tokio::fs::remove_file(&temp_text_path).await;
|
|
||||||
let _ = tokio::fs::remove_file(&temp_fixed_pdf).await;
|
|
||||||
let processing_time = start_time.elapsed().as_millis() as u64;
|
|
||||||
return Ok((text.trim().to_string(), processing_time));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Clean up temporary files
|
|
||||||
let _ = tokio::fs::remove_file(&temp_text_path).await;
|
|
||||||
let _ = tokio::fs::remove_file(&temp_fixed_pdf).await;
|
|
||||||
|
|
||||||
// Last resort: try to extract any readable text directly from the PDF file
|
|
||||||
warn!("All ocrmypdf strategies failed, trying direct text extraction from: {}", file_path);
|
|
||||||
|
|
||||||
match self.extract_text_from_pdf_bytes(file_path).await {
|
match self.extract_text_from_pdf_bytes(file_path).await {
|
||||||
Ok(text) if !text.trim().is_empty() => {
|
Ok(text) if !text.trim().is_empty() => {
|
||||||
let processing_time = start_time.elapsed().as_millis() as u64;
|
let processing_time = start_time.elapsed().as_millis() as u64;
|
||||||
info!("Direct text extraction succeeded for: {}", file_path);
|
let word_count = text.split_whitespace().count();
|
||||||
|
info!("Direct text extraction succeeded for '{}': {} words", file_path, word_count);
|
||||||
Ok((text, processing_time))
|
Ok((text, processing_time))
|
||||||
}
|
}
|
||||||
Ok(_) => {
|
Ok(_) => {
|
||||||
warn!("Direct text extraction returned empty text for: {}", file_path);
|
warn!("Direct text extraction returned empty text for: {}", file_path);
|
||||||
// If all strategies fail, return the last error
|
// If all strategies fail, return the last error
|
||||||
match ocrmypdf_result {
|
if let Ok(ref output) = ocrmypdf_result {
|
||||||
Ok(output) => {
|
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
Err(anyhow!("All PDF extraction strategies failed. Last error: {}", stderr))
|
||||||
Err(anyhow!("All PDF extraction strategies failed. Last error: {}", stderr))
|
} else {
|
||||||
}
|
Err(anyhow!("All PDF extraction strategies failed"))
|
||||||
Err(e) => Err(anyhow!("Failed to run ocrmypdf: {}", e)),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
warn!("Direct text extraction also failed for {}: {}", file_path, e);
|
warn!("Direct text extraction also failed for {}: {}", file_path, e);
|
||||||
// If all strategies fail, return the last error
|
// If all strategies fail, return the last error
|
||||||
match ocrmypdf_result {
|
if let Ok(ref output) = ocrmypdf_result {
|
||||||
Ok(output) => {
|
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
Err(anyhow!("All PDF extraction strategies failed. Last error: {}", stderr))
|
||||||
Err(anyhow!("All PDF extraction strategies failed. Last error: {}", stderr))
|
} else {
|
||||||
}
|
Err(anyhow!("All PDF extraction strategies failed: {}", e))
|
||||||
Err(e) => Err(anyhow!("Failed to run ocrmypdf: {}", e)),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -80,24 +80,32 @@ impl OcrService {
|
||||||
let temp_text_path = format!("{}/pdf_text_{}.txt", temp_dir, std::process::id());
|
let temp_text_path = format!("{}/pdf_text_{}.txt", temp_dir, std::process::id());
|
||||||
|
|
||||||
// Progressive extraction with fallback strategies
|
// Progressive extraction with fallback strategies
|
||||||
let mut output = tokio::process::Command::new("ocrmypdf")
|
// Strategy 1: pdftotext for existing text (fastest)
|
||||||
.arg("--skip-text") // Extract existing text without OCR processing
|
let mut output = tokio::process::Command::new("pdftotext")
|
||||||
.arg("--sidecar") // Extract text to sidecar file
|
.arg("-layout") // Preserve layout
|
||||||
.arg(&temp_text_path)
|
|
||||||
.arg(file_path)
|
.arg(file_path)
|
||||||
.arg("-") // Dummy output (required)
|
.arg(&temp_text_path)
|
||||||
.output()
|
.output()
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
|
if output.status.success() {
|
||||||
|
// Check if we got substantial text
|
||||||
|
if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
|
||||||
|
let word_count = text.split_whitespace().count();
|
||||||
|
if word_count > 5 {
|
||||||
|
let _ = tokio::fs::remove_file(&temp_text_path).await;
|
||||||
|
return Ok(text.trim().to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if !output.status.success() {
|
if !output.status.success() {
|
||||||
// Try with metadata fixing for corrupted files
|
// Strategy 2: ocrmypdf sidecar (when pdftotext fails)
|
||||||
output = tokio::process::Command::new("ocrmypdf")
|
output = tokio::process::Command::new("ocrmypdf")
|
||||||
.arg("--fix-metadata") // Fix corrupted metadata
|
.arg("--sidecar") // Extract text to sidecar file
|
||||||
.arg("--skip-text") // Still extract existing text only
|
|
||||||
.arg("--sidecar")
|
|
||||||
.arg(&temp_text_path)
|
.arg(&temp_text_path)
|
||||||
.arg(file_path)
|
.arg(file_path)
|
||||||
.arg("-")
|
.arg("-") // Dummy output
|
||||||
.output()
|
.output()
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue