feat(dev): drop pdf_extract in favor of ocrmypdf
This commit is contained in:
parent
628fe8cb7b
commit
549c2f8a16
|
|
@ -33,26 +33,6 @@ version = "2.0.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
|
||||
|
||||
[[package]]
|
||||
name = "adobe-cmap-parser"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ae8abfa9a4688de8fc9f42b3f013b6fffec18ed8a554f5f113577e0b9b3212a3"
|
||||
dependencies = [
|
||||
"pom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aes"
|
||||
version = "0.8.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"cipher",
|
||||
"cpufeatures",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.3"
|
||||
|
|
@ -256,7 +236,7 @@ dependencies = [
|
|||
"anyhow",
|
||||
"arrayvec",
|
||||
"log",
|
||||
"nom 7.1.3",
|
||||
"nom",
|
||||
"num-rational",
|
||||
"v_frame",
|
||||
]
|
||||
|
|
@ -903,15 +883,6 @@ dependencies = [
|
|||
"generic-array",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "block-padding"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a8894febbff9f758034a5b8e12d87918f56dfc64a8e1fe757d65e29041538d93"
|
||||
dependencies = [
|
||||
"generic-array",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "blowfish"
|
||||
version = "0.9.1"
|
||||
|
|
@ -984,12 +955,6 @@ version = "3.18.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "793db76d6187cd04dff33004d8e6c9cc4e05cd330500379d2394209271b4aeee"
|
||||
|
||||
[[package]]
|
||||
name = "bytecount"
|
||||
version = "0.6.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "175812e0be2bccb6abe50bb8d566126198344f707e304f45c648fd8f2cc0365e"
|
||||
|
||||
[[package]]
|
||||
name = "bytemuck"
|
||||
version = "1.23.1"
|
||||
|
|
@ -1024,15 +989,6 @@ dependencies = [
|
|||
"either",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cbc"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "26b52a9543ae338f279b96b0b9fed9c8093744685043739079ce85cd58f289a6"
|
||||
dependencies = [
|
||||
"cipher",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.2.27"
|
||||
|
|
@ -1050,15 +1006,9 @@ version = "0.6.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
|
||||
dependencies = [
|
||||
"nom 7.1.3",
|
||||
"nom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cff-parser"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "31f5b6e9141c036f3ff4ce7b2f7e432b0f00dee416ddcd4f17741d189ddc2e9d"
|
||||
|
||||
[[package]]
|
||||
name = "cfg-expr"
|
||||
version = "0.15.8"
|
||||
|
|
@ -1478,15 +1428,6 @@ version = "1.0.19"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1c7a8fb8a9fbf66c1f703fe16184d10ca0ee9d23be5b4436400408ba54a95005"
|
||||
|
||||
[[package]]
|
||||
name = "ecb"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1a8bfa975b1aec2145850fcaa1c6fe269a16578c44705a532ae3edc92b8881c7"
|
||||
dependencies = [
|
||||
"cipher",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ecdsa"
|
||||
version = "0.14.8"
|
||||
|
|
@ -1595,15 +1536,6 @@ dependencies = [
|
|||
"windows-sys 0.59.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "euclid"
|
||||
version = "0.20.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2bb7ef65b3777a325d1eeefefab5b6d4959da54747e33bd6258e789640f307ad"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "event-listener"
|
||||
version = "5.4.0"
|
||||
|
|
@ -2504,7 +2436,6 @@ version = "0.1.4"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01"
|
||||
dependencies = [
|
||||
"block-padding",
|
||||
"generic-array",
|
||||
]
|
||||
|
||||
|
|
@ -2777,32 +2708,6 @@ dependencies = [
|
|||
"imgref",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lopdf"
|
||||
version = "0.36.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "59fa2559e99ba0f26a12458aabc754432c805bbb8cba516c427825a997af1fb7"
|
||||
dependencies = [
|
||||
"aes",
|
||||
"bitflags 2.9.1",
|
||||
"cbc",
|
||||
"ecb",
|
||||
"encoding_rs",
|
||||
"flate2",
|
||||
"indexmap 2.9.0",
|
||||
"itoa",
|
||||
"log",
|
||||
"md-5",
|
||||
"nom 8.0.0",
|
||||
"nom_locate",
|
||||
"rand 0.9.1",
|
||||
"rangemap",
|
||||
"sha2",
|
||||
"stringprep",
|
||||
"thiserror 2.0.12",
|
||||
"weezl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lru"
|
||||
version = "0.12.5"
|
||||
|
|
@ -2972,26 +2877,6 @@ dependencies = [
|
|||
"minimal-lexical",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nom"
|
||||
version = "8.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nom_locate"
|
||||
version = "5.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b577e2d69827c4740cba2b52efaad1c4cc7c73042860b199710b3575c68438d"
|
||||
dependencies = [
|
||||
"bytecount",
|
||||
"memchr",
|
||||
"nom 8.0.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "noop_proc_macro"
|
||||
version = "0.3.0"
|
||||
|
|
@ -3355,23 +3240,6 @@ version = "1.0.15"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
|
||||
|
||||
[[package]]
|
||||
name = "pdf-extract"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6c2f44c6c642e359e2fe7f662bf5438db3811b6b4be60afc6de04b619ce51e1a"
|
||||
dependencies = [
|
||||
"adobe-cmap-parser",
|
||||
"cff-parser",
|
||||
"encoding_rs",
|
||||
"euclid",
|
||||
"log",
|
||||
"lopdf",
|
||||
"postscript",
|
||||
"type1-encoding-parser",
|
||||
"unicode-normalization",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "peeking_take_while"
|
||||
version = "0.1.2"
|
||||
|
|
@ -3465,18 +3333,6 @@ dependencies = [
|
|||
"miniz_oxide",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pom"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "60f6ce597ecdcc9a098e7fddacb1065093a3d66446fa16c675e7e71d1b5c28e6"
|
||||
|
||||
[[package]]
|
||||
name = "postscript"
|
||||
version = "0.14.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78451badbdaebaf17f053fd9152b3ffb33b516104eacb45e7864aaa9c712f306"
|
||||
|
||||
[[package]]
|
||||
name = "potential_utf"
|
||||
version = "0.1.2"
|
||||
|
|
@ -3648,12 +3504,6 @@ dependencies = [
|
|||
"rand 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rangemap"
|
||||
version = "1.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f60fcc7d6849342eff22c4350c8b9a989ee8ceabc4b481253e8946b9fe83d684"
|
||||
|
||||
[[package]]
|
||||
name = "rav1e"
|
||||
version = "0.7.1"
|
||||
|
|
@ -3763,7 +3613,6 @@ dependencies = [
|
|||
"mime_guess",
|
||||
"notify",
|
||||
"oauth2",
|
||||
"pdf-extract",
|
||||
"quick-xml",
|
||||
"raw-cpuid",
|
||||
"readur",
|
||||
|
|
@ -5437,15 +5286,6 @@ version = "0.25.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d2df906b07856748fa3f6e0ad0cbaa047052d4a7dd609e231c4f72cee8c36f31"
|
||||
|
||||
[[package]]
|
||||
name = "type1-encoding-parser"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d3d6cc09e1a99c7e01f2afe4953789311a1c50baebbdac5b477ecf78e2e92a5b"
|
||||
dependencies = [
|
||||
"pom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typenum"
|
||||
version = "1.18.0"
|
||||
|
|
|
|||
|
|
@ -34,7 +34,6 @@ futures = "0.3"
|
|||
notify = "8"
|
||||
mime_guess = "2"
|
||||
tesseract = { version = "0.15", optional = true }
|
||||
pdf-extract = { version = "0.9", optional = true }
|
||||
image = { version = "0.25", features = ["png", "jpeg", "tiff", "bmp"], optional = true }
|
||||
imageproc = { version = "0.25", optional = true }
|
||||
thiserror = "2.0"
|
||||
|
|
@ -61,7 +60,7 @@ testcontainers-modules = { version = "0.12", features = ["postgres"], optional =
|
|||
|
||||
[features]
|
||||
default = ["ocr", "s3"]
|
||||
ocr = ["tesseract", "pdf-extract", "image", "imageproc", "raw-cpuid"]
|
||||
ocr = ["tesseract", "image", "imageproc", "raw-cpuid"]
|
||||
s3 = ["aws-config", "aws-sdk-s3", "aws-credential-types", "aws-types"]
|
||||
test-utils = ["testcontainers", "testcontainers-modules"]
|
||||
|
||||
|
|
|
|||
|
|
@ -811,13 +811,13 @@ impl EnhancedOcrService {
|
|||
Ok(closed)
|
||||
}
|
||||
|
||||
/// Extract text from PDF with size and time limits
|
||||
/// Extract text from PDF using ocrmypdf
|
||||
#[cfg(feature = "ocr")]
|
||||
pub async fn extract_text_from_pdf(&self, file_path: &str, settings: &Settings) -> Result<OcrResult> {
|
||||
let start_time = std::time::Instant::now();
|
||||
info!("Extracting text from PDF: {}", file_path);
|
||||
|
||||
// Check file size before loading into memory
|
||||
// Check file size before processing
|
||||
let metadata = tokio::fs::metadata(file_path).await?;
|
||||
let file_size = metadata.len();
|
||||
|
||||
|
|
@ -831,103 +831,91 @@ impl EnhancedOcrService {
|
|||
));
|
||||
}
|
||||
|
||||
let bytes = tokio::fs::read(file_path).await?;
|
||||
// Check if it's a valid PDF by reading first 1KB
|
||||
let mut header_bytes = vec![0u8; 1024.min(file_size as usize)];
|
||||
let mut file = tokio::fs::File::open(file_path).await?;
|
||||
use tokio::io::AsyncReadExt;
|
||||
file.read_exact(&mut header_bytes).await?;
|
||||
drop(file);
|
||||
|
||||
// Check if it's a valid PDF (handles leading null bytes)
|
||||
if !is_valid_pdf(&bytes) {
|
||||
if !is_valid_pdf(&header_bytes) {
|
||||
return Err(anyhow!(
|
||||
"Invalid PDF file: Missing or corrupted PDF header. File size: {} bytes, Header: {:?}",
|
||||
bytes.len(),
|
||||
bytes.get(0..50).unwrap_or(&[]).iter().map(|&b| {
|
||||
file_size,
|
||||
header_bytes.get(0..50).unwrap_or(&[]).iter().map(|&b| {
|
||||
if b >= 32 && b <= 126 { b as char } else { '.' }
|
||||
}).collect::<String>()
|
||||
));
|
||||
}
|
||||
|
||||
// Clean the PDF data (remove leading null bytes)
|
||||
let clean_bytes = clean_pdf_data(&bytes);
|
||||
|
||||
// Add timeout and panic recovery for PDF extraction
|
||||
let extraction_result = tokio::time::timeout(
|
||||
std::time::Duration::from_secs(120), // 2 minute timeout
|
||||
tokio::task::spawn_blocking(move || {
|
||||
// Catch panics from pdf-extract library
|
||||
catch_unwind(AssertUnwindSafe(|| {
|
||||
pdf_extract::extract_text_from_mem(&clean_bytes)
|
||||
}))
|
||||
})
|
||||
).await;
|
||||
|
||||
let text = match extraction_result {
|
||||
Ok(Ok(Ok(Ok(text)))) => text,
|
||||
Ok(Ok(Ok(Err(e)))) => {
|
||||
warn!("PDF text extraction failed for file '{}' (size: {} bytes): {}", file_path, file_size, e);
|
||||
return Err(anyhow!(
|
||||
"PDF text extraction failed for file '{}' (size: {} bytes): {}. This may indicate a corrupted or unsupported PDF format.",
|
||||
file_path, file_size, e
|
||||
));
|
||||
}
|
||||
Ok(Ok(Err(_panic))) => {
|
||||
// pdf-extract panicked (e.g., missing unicode map, corrupted font encoding)
|
||||
// For now, gracefully handle this common issue
|
||||
warn!("PDF text extraction panicked for '{}' (size: {} bytes) due to font encoding issues. This is a known limitation with certain PDF files.", file_path, file_size);
|
||||
|
||||
return Err(anyhow!(
|
||||
"PDF text extraction failed due to font encoding issues in '{}' (size: {} bytes). This PDF uses non-standard fonts or character encoding that cannot be processed. To extract text from this PDF, consider: 1) Converting it to images and uploading those instead, 2) Using a different PDF viewer to re-save the PDF with standard encoding, or 3) Using external tools to convert the PDF to a more compatible format.",
|
||||
file_path, file_size
|
||||
));
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
warn!("PDF extraction task failed for '{}' (size: {} bytes): {}", file_path, file_size, e);
|
||||
return Err(anyhow!("PDF extraction task failed: {}", e));
|
||||
}
|
||||
Err(_) => {
|
||||
warn!("PDF extraction timed out after 2 minutes for file '{}' (size: {} bytes). The PDF may be corrupted or too complex.", file_path, file_size);
|
||||
return Err(anyhow!(
|
||||
"PDF extraction timed out after 2 minutes for file '{}' (size: {} bytes). The PDF may be corrupted or too complex.",
|
||||
file_path, file_size
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
// Limit extracted text size to prevent memory issues
|
||||
const MAX_TEXT_SIZE: usize = 10 * 1024 * 1024; // 10MB of text
|
||||
let trimmed_text = if text.len() > MAX_TEXT_SIZE {
|
||||
warn!("PDF text too large ({} chars), truncating to {} chars", text.len(), MAX_TEXT_SIZE);
|
||||
format!("{}... [TEXT TRUNCATED DUE TO SIZE]", &text[..MAX_TEXT_SIZE])
|
||||
} else {
|
||||
text.trim().to_string()
|
||||
};
|
||||
|
||||
let processing_time = start_time.elapsed().as_millis() as u64;
|
||||
let word_count = self.count_words_safely(&trimmed_text);
|
||||
|
||||
// Debug logging to understand PDF extraction issues
|
||||
debug!(
|
||||
"PDF extraction debug - File: '{}' | Raw text length: {} | Trimmed text length: {} | Word count: {} | First 200 chars: {:?}",
|
||||
file_path,
|
||||
text.len(),
|
||||
trimmed_text.len(),
|
||||
word_count,
|
||||
trimmed_text.chars().take(200).collect::<String>()
|
||||
);
|
||||
|
||||
// Smart detection: assess if text extraction quality is good enough
|
||||
if self.is_text_extraction_quality_sufficient(&trimmed_text, word_count, file_size) {
|
||||
info!("PDF text extraction successful for '{}', using extracted text", file_path);
|
||||
Ok(OcrResult {
|
||||
text: trimmed_text,
|
||||
confidence: 95.0, // PDF text extraction is generally high confidence
|
||||
processing_time_ms: processing_time,
|
||||
word_count,
|
||||
preprocessing_applied: vec!["PDF text extraction".to_string()],
|
||||
processed_image_path: None,
|
||||
})
|
||||
} else {
|
||||
info!("PDF text extraction insufficient for '{}' ({} words), falling back to OCR", file_path, word_count);
|
||||
// Fall back to OCR using ocrmypdf
|
||||
self.extract_text_from_pdf_with_ocr(file_path, settings, start_time).await
|
||||
// Check if ocrmypdf is available
|
||||
if !self.is_ocrmypdf_available().await {
|
||||
return Err(anyhow!(
|
||||
"ocrmypdf is not available on this system. To extract text from PDFs, please install ocrmypdf. \
|
||||
On Ubuntu/Debian: 'apt-get install ocrmypdf'. \
|
||||
On macOS: 'brew install ocrmypdf'."
|
||||
));
|
||||
}
|
||||
|
||||
// First try to extract text without OCR for performance (using --skip-text)
|
||||
let quick_extraction_result = self.extract_pdf_text_quick(file_path).await;
|
||||
|
||||
match quick_extraction_result {
|
||||
Ok((text, extraction_time)) => {
|
||||
let word_count = self.count_words_safely(&text);
|
||||
|
||||
// Check if quick extraction got good results
|
||||
if self.is_text_extraction_quality_sufficient(&text, word_count, file_size) {
|
||||
info!("PDF text extraction successful for '{}' using quick method", file_path);
|
||||
return Ok(OcrResult {
|
||||
text,
|
||||
confidence: 95.0,
|
||||
processing_time_ms: extraction_time,
|
||||
word_count,
|
||||
preprocessing_applied: vec!["PDF text extraction (ocrmypdf --skip-text)".to_string()],
|
||||
processed_image_path: None,
|
||||
});
|
||||
} else {
|
||||
info!("Quick PDF extraction insufficient for '{}' ({} words), using full OCR", file_path, word_count);
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Quick PDF extraction failed for '{}': {}, using full OCR", file_path, e);
|
||||
}
|
||||
}
|
||||
|
||||
// If quick extraction failed or was insufficient, use full OCR
|
||||
let full_ocr_result = self.extract_text_from_pdf_with_ocr(file_path, settings, start_time).await;
|
||||
|
||||
// If OCR also fails, try direct text extraction as last resort
|
||||
if full_ocr_result.is_err() {
|
||||
warn!("Full OCR failed, trying direct text extraction as last resort for: {}", file_path);
|
||||
|
||||
match self.extract_text_from_pdf_bytes(file_path).await {
|
||||
Ok(text) if !text.trim().is_empty() => {
|
||||
let processing_time = start_time.elapsed().as_millis() as u64;
|
||||
let word_count = self.count_words_safely(&text);
|
||||
info!("Direct text extraction succeeded as last resort for: {}", file_path);
|
||||
|
||||
return Ok(OcrResult {
|
||||
text,
|
||||
confidence: 50.0, // Lower confidence for direct extraction
|
||||
processing_time_ms: processing_time,
|
||||
word_count,
|
||||
preprocessing_applied: vec!["Direct PDF text extraction (last resort)".to_string()],
|
||||
processed_image_path: None,
|
||||
});
|
||||
}
|
||||
Ok(_) => {
|
||||
warn!("Direct text extraction returned empty text for: {}", file_path);
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Direct text extraction also failed for {}: {}", file_path, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
full_ocr_result
|
||||
}
|
||||
|
||||
/// Assess if text extraction quality is sufficient or if OCR fallback is needed
|
||||
|
|
@ -1002,14 +990,15 @@ impl EnhancedOcrService {
|
|||
);
|
||||
let temp_ocr_path = format!("{}/{}", self.temp_dir, temp_ocr_filename);
|
||||
|
||||
// Run ocrmypdf to create searchable PDF
|
||||
// Run ocrmypdf with progressive fallback strategies
|
||||
let ocrmypdf_result = tokio::time::timeout(
|
||||
std::time::Duration::from_secs(300), // 5 minute timeout for OCR
|
||||
tokio::task::spawn_blocking({
|
||||
let file_path = file_path.to_string();
|
||||
let temp_ocr_path = temp_ocr_path.clone();
|
||||
move || {
|
||||
std::process::Command::new("ocrmypdf")
|
||||
// Strategy 1: Standard OCR with cleaning
|
||||
let mut result = std::process::Command::new("ocrmypdf")
|
||||
.arg("--force-ocr") // OCR even if text is detected
|
||||
.arg("-O2") // Optimize level 2 (balanced quality/speed)
|
||||
.arg("--deskew") // Correct skewed pages
|
||||
|
|
@ -1018,6 +1007,38 @@ impl EnhancedOcrService {
|
|||
.arg("eng") // English language
|
||||
.arg(&file_path)
|
||||
.arg(&temp_ocr_path)
|
||||
.output();
|
||||
|
||||
if result.is_ok() && result.as_ref().unwrap().status.success() {
|
||||
return result;
|
||||
}
|
||||
|
||||
// Strategy 2: If standard OCR fails, try with error recovery
|
||||
eprintln!("Standard OCR failed, trying recovery mode...");
|
||||
result = std::process::Command::new("ocrmypdf")
|
||||
.arg("--force-ocr")
|
||||
.arg("--fix-metadata") // Fix metadata issues
|
||||
.arg("--remove-background") // Remove background noise
|
||||
.arg("-O1") // Lower optimization for problematic PDFs
|
||||
.arg("--language")
|
||||
.arg("eng")
|
||||
.arg(&file_path)
|
||||
.arg(&temp_ocr_path)
|
||||
.output();
|
||||
|
||||
if result.is_ok() && result.as_ref().unwrap().status.success() {
|
||||
return result;
|
||||
}
|
||||
|
||||
// Strategy 3: Last resort - minimal processing (skips very large pages)
|
||||
eprintln!("Recovery mode failed, trying minimal processing...");
|
||||
std::process::Command::new("ocrmypdf")
|
||||
.arg("--force-ocr")
|
||||
.arg("--skip-big") // Skip very large pages that might cause memory issues
|
||||
.arg("--language")
|
||||
.arg("eng")
|
||||
.arg(&file_path)
|
||||
.arg(&temp_ocr_path)
|
||||
.output()
|
||||
}
|
||||
})
|
||||
|
|
@ -1044,25 +1065,28 @@ impl EnhancedOcrService {
|
|||
move || -> Result<String> {
|
||||
let bytes = std::fs::read(&temp_ocr_path)?;
|
||||
// Catch panics from pdf-extract library (same pattern as used elsewhere)
|
||||
let text = match catch_unwind(AssertUnwindSafe(|| {
|
||||
pdf_extract::extract_text_from_mem(&bytes)
|
||||
})) {
|
||||
Ok(Ok(text)) => text,
|
||||
Ok(Err(e)) => {
|
||||
warn!("PDF text extraction failed after OCR processing for '{}': {}", temp_ocr_path, e);
|
||||
return Err(anyhow!(
|
||||
"PDF text extraction failed after OCR processing: {}. This may indicate a corrupted or unsupported PDF format.",
|
||||
e
|
||||
));
|
||||
},
|
||||
Err(_) => {
|
||||
warn!("PDF extraction panicked after OCR processing for '{}' due to invalid content stream", temp_ocr_path);
|
||||
return Err(anyhow!(
|
||||
"PDF extraction panicked after OCR processing due to invalid content stream or corrupted PDF structure. \
|
||||
This suggests the PDF has malformed internal structure that cannot be parsed safely."
|
||||
));
|
||||
},
|
||||
};
|
||||
// Extract text from the OCR'd PDF using ocrmypdf's sidecar option
|
||||
let temp_text_path = format!("{}.txt", temp_ocr_path);
|
||||
let extract_result = std::process::Command::new("ocrmypdf")
|
||||
.arg("--sidecar") // Extract text to a sidecar file
|
||||
.arg(&temp_text_path)
|
||||
.arg(&temp_ocr_path)
|
||||
.arg("-") // Output to stdout (dummy, required by ocrmypdf)
|
||||
.output()?;
|
||||
|
||||
if !extract_result.status.success() {
|
||||
let stderr = String::from_utf8_lossy(&extract_result.stderr);
|
||||
return Err(anyhow!(
|
||||
"ocrmypdf text extraction failed: {}",
|
||||
stderr
|
||||
));
|
||||
}
|
||||
|
||||
// Read the extracted text from the sidecar file
|
||||
let text = std::fs::read_to_string(&temp_text_path)?;
|
||||
|
||||
// Clean up the text file
|
||||
let _ = std::fs::remove_file(&temp_text_path);
|
||||
Ok(text.trim().to_string())
|
||||
}
|
||||
}).await??;
|
||||
|
|
@ -1086,6 +1110,225 @@ impl EnhancedOcrService {
|
|||
})
|
||||
}
|
||||
|
||||
/// Quick PDF text extraction through `ocrmypdf` sidecar files, with fallbacks.
///
/// Attempts, in order:
/// 1. `ocrmypdf --skip-text --sidecar <txt> <input> -`
/// 2. the same with `--fix-metadata` prepended, writing the PDF to a temp file
/// 3. the same with `--remove-background` prepended, writing the PDF to a temp file
/// 4. a raw byte scan of the input via `extract_text_from_pdf_bytes`
///
/// On success returns the extracted text together with the elapsed wall-clock
/// time in milliseconds. Sidecar text is trimmed; raw-scan text is returned as
/// produced by `extract_text_from_pdf_bytes`.
///
/// # Errors
/// Fails if the system clock is before the Unix epoch (temp file naming), or if
/// every strategy fails; in the latter case the error carries the stderr (or
/// spawn error) of the last `ocrmypdf` invocation.
///
/// NOTE(review): the inline comments inherited from the original describe
/// `--skip-text` as "extract existing text". The ocrmypdf documentation describes
/// it as skipping OCR on pages that already contain text, in which case the
/// sidecar may hold only a placeholder for those pages. Confirm that the sidecar
/// really contains the embedded text for born-digital PDFs.
/// NOTE(review): `-` as output presumably makes ocrmypdf write the PDF to stdout,
/// which `.output()` buffers in memory; verify for large documents.
#[cfg(feature = "ocr")]
async fn extract_pdf_text_quick(&self, file_path: &str) -> Result<(String, u64)> {
    /// Runs one `ocrmypdf ... --sidecar` attempt.
    ///
    /// Returns the raw process outcome plus the sidecar contents, the latter only
    /// when the process exited successfully and the sidecar file was readable.
    async fn run_sidecar_attempt(
        leading_flags: &[&str],
        input_pdf: &str,
        sidecar_path: &str,
        output_target: &str,
    ) -> (std::io::Result<std::process::Output>, Option<String>) {
        let outcome = tokio::process::Command::new("ocrmypdf")
            .args(leading_flags)
            .arg("--sidecar") // Extract text to sidecar file
            .arg(sidecar_path)
            .arg(input_pdf)
            .arg(output_target)
            .output()
            .await;

        let succeeded = matches!(&outcome, Ok(output) if output.status.success());
        let sidecar_text = if succeeded {
            tokio::fs::read_to_string(sidecar_path).await.ok()
        } else {
            None
        };
        (outcome, sidecar_text)
    }

    let started = std::time::Instant::now();

    // Generate temporary file path for text extraction
    let sidecar_filename = format!("quick_text_{}_{}.txt",
        std::process::id(),
        std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)?.as_millis()
    );
    let sidecar_path = format!("{}/{}", self.temp_dir, sidecar_filename);

    // Strategy 1: fast path, PDF output goes to "-" so only the sidecar is kept.
    let (first_outcome, first_text) =
        run_sidecar_attempt(&["--skip-text"], file_path, &sidecar_path, "-").await;
    if let Some(text) = first_text {
        let _ = tokio::fs::remove_file(&sidecar_path).await;
        let processing_time = started.elapsed().as_millis() as u64;
        return Ok((text.trim().to_string(), processing_time));
    }

    info!("Quick extraction failed, trying recovery strategies for: {}", file_path);

    // Strategies 2 and 3 write their PDF output to a throwaway file.
    let fixed_pdf_path = format!("{}/fixed_{}_{}.pdf", self.temp_dir, std::process::id(),
        std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)?.as_millis());

    let recovery_flag_sets: [&[&str]; 2] = [
        &["--fix-metadata", "--skip-text"],      // Strategy 2: corrupted metadata
        &["--remove-background", "--skip-text"], // Strategy 3: scanned documents
    ];

    // Remembered so the final error can report why the last attempt failed.
    let mut last_outcome = first_outcome;
    for flags in recovery_flag_sets.iter() {
        let (outcome, sidecar_text) =
            run_sidecar_attempt(*flags, file_path, &sidecar_path, &fixed_pdf_path).await;
        if let Some(text) = sidecar_text {
            let _ = tokio::fs::remove_file(&sidecar_path).await;
            let _ = tokio::fs::remove_file(&fixed_pdf_path).await;
            let processing_time = started.elapsed().as_millis() as u64;
            return Ok((text.trim().to_string(), processing_time));
        }
        last_outcome = outcome;
    }

    // Clean up temporary files (best effort; they may not exist)
    let _ = tokio::fs::remove_file(&sidecar_path).await;
    let _ = tokio::fs::remove_file(&fixed_pdf_path).await;

    // Last resort: try to extract any readable text directly from the PDF file
    warn!("All ocrmypdf strategies failed, trying direct text extraction from: {}", file_path);

    match self.extract_text_from_pdf_bytes(file_path).await {
        Ok(text) if !text.trim().is_empty() => {
            let processing_time = started.elapsed().as_millis() as u64;
            info!("Direct text extraction succeeded for: {}", file_path);
            return Ok((text, processing_time));
        }
        Ok(_) => {
            warn!("Direct text extraction returned empty text for: {}", file_path);
        }
        Err(e) => {
            warn!("Direct text extraction also failed for {}: {}", file_path, e);
        }
    }

    // Every strategy failed: surface the last ocrmypdf error.
    match last_outcome {
        Ok(output) => {
            let stderr = String::from_utf8_lossy(&output.stderr);
            Err(anyhow!("All PDF extraction strategies failed. Last error: {}", stderr))
        }
        Err(e) => Err(anyhow!("Failed to run ocrmypdf: {}", e)),
    }
}
|
||||
|
||||
/// Last resort: scrape readable text directly out of the raw PDF bytes.
///
/// Two heuristics are combined:
/// 1. the contents of literal strings `( ... )` found between `BT` (begin text)
///    and `ET` (end text) markers;
/// 2. every run of more than three consecutive printable ASCII bytes (32..=126)
///    anywhere in the file.
///
/// The combined text is whitespace-normalised and tokens of a single byte are
/// dropped. Bytes are mapped to characters one-to-one (`u8 as char`); escape
/// sequences inside strings are not decoded beyond dropping the backslash.
///
/// NOTE(review): this scans the file as stored; content inside compressed
/// streams presumably will not be recognised, so heuristic 2 may return mostly
/// structural keywords. Confirm this is acceptable to callers.
///
/// # Errors
/// Returns an error if the file cannot be read or no text survives the cleanup.
#[cfg(feature = "ocr")]
async fn extract_text_from_pdf_bytes(&self, file_path: &str) -> Result<String> {
    /// Collects the contents of `( ... )` literal strings located between `BT`
    /// and `ET` markers. Each string and each text object is followed by a space.
    fn scan_text_objects(bytes: &[u8]) -> String {
        let mut collected = String::new();
        let mut pending = String::new();
        let mut in_text_object = false;
        // Nesting depth of unescaped parentheses; 0 means "not inside a string".
        let mut string_depth = 0usize;
        let mut escape_next = false;

        let mut pos = 0;
        while let Some(&byte) = bytes.get(pos) {
            if string_depth > 0 {
                // Inside a literal string: operators such as "ET" are plain text here.
                if escape_next {
                    escape_next = false;
                    pending.push(byte as char);
                } else {
                    match byte {
                        b'\\' => escape_next = true,
                        b'(' => {
                            // Balanced parentheses may appear unescaped inside a string.
                            string_depth += 1;
                            pending.push('(');
                        }
                        b')' => {
                            string_depth -= 1;
                            if string_depth == 0 {
                                pending.push(' ');
                            } else {
                                pending.push(')');
                            }
                        }
                        other => pending.push(other as char),
                    }
                }
                pos += 1;
                continue;
            }

            // Match the two-byte markers at the current position only.
            let rest = &bytes[pos..];
            if !in_text_object {
                if rest.starts_with(b"BT") {
                    in_text_object = true;
                    pos += 2;
                } else {
                    pos += 1;
                }
                continue;
            }

            if rest.starts_with(b"ET") {
                in_text_object = false;
                if !pending.trim().is_empty() {
                    collected.push_str(&pending);
                    collected.push(' ');
                }
                pending.clear();
                pos += 2;
                continue;
            }

            if byte == b'(' {
                string_depth = 1;
            }
            pos += 1;
        }

        // A truncated or corrupted file may never close its last text object.
        if !pending.trim().is_empty() {
            collected.push_str(&pending);
        }
        collected
    }

    /// Joins every run of more than three printable ASCII bytes with spaces.
    fn scan_ascii_runs(bytes: &[u8]) -> String {
        bytes
            .split(|byte| !(32..=126).contains(byte))
            .filter(|run| run.len() > 3)
            .map(|run| run.iter().map(|&b| b as char).collect::<String>())
            .collect::<Vec<_>>()
            .join(" ")
    }

    let bytes = tokio::fs::read(file_path).await?;

    // Combine both extraction methods, separated by a real newline.
    let mut combined = scan_text_objects(&bytes);
    let ascii_text = scan_ascii_runs(&bytes);
    if !ascii_text.trim().is_empty() {
        combined.push('\n');
        combined.push_str(&ascii_text);
    }

    // Normalise whitespace and drop single-byte tokens.
    let cleaned_text = combined
        .split_whitespace()
        .filter(|word| word.len() > 1)
        .collect::<Vec<_>>()
        .join(" ");

    if cleaned_text.is_empty() {
        Err(anyhow!("No readable text found in PDF"))
    } else {
        Ok(cleaned_text)
    }
}
|
||||
|
||||
/// Check if ocrmypdf is available on the system
|
||||
#[cfg(feature = "ocr")]
|
||||
async fn is_ocrmypdf_available(&self) -> bool {
|
||||
|
|
@ -1353,24 +1596,4 @@ fn is_valid_pdf(data: &[u8]) -> bool {
|
|||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
/// Strip any bytes that precede the `%PDF-` header and return the rest as an
/// owned buffer.
///
/// Only the first 1024 bytes of `data` are searched, and the header must lie
/// entirely within that window. Despite the historical "null bytes" wording,
/// whatever precedes the header is removed, not just `0x00` bytes.
///
/// Returns a copy of the original data unchanged when no header is found
/// (including inputs shorter than the header itself).
fn clean_pdf_data(data: &[u8]) -> Vec<u8> {
    const PDF_MAGIC: &[u8] = b"%PDF-";
    const SEARCH_WINDOW: usize = 1024;

    let searchable = &data[..data.len().min(SEARCH_WINDOW)];
    // `windows` yields nothing for inputs shorter than the magic, so offset 0
    // (keep everything) covers both "too short" and "not found".
    let header_offset = searchable
        .windows(PDF_MAGIC.len())
        .position(|candidate| candidate == PDF_MAGIC)
        .unwrap_or(0);

    data[header_offset..].to_vec()
}
|
||||
127
src/ocr/mod.rs
127
src/ocr/mod.rs
|
|
@ -8,7 +8,6 @@ pub mod tests;
|
|||
|
||||
use anyhow::{anyhow, Result};
|
||||
use std::path::Path;
|
||||
use std::panic::{catch_unwind, AssertUnwindSafe};
|
||||
use crate::ocr::error::OcrError;
|
||||
use crate::ocr::health::OcrHealthChecker;
|
||||
|
||||
|
|
@ -62,14 +61,85 @@ impl OcrService {
|
|||
pub async fn extract_text_from_pdf(&self, file_path: &str) -> Result<String> {
|
||||
#[cfg(feature = "ocr")]
|
||||
{
|
||||
let bytes = std::fs::read(file_path)?;
|
||||
let text = match catch_unwind(AssertUnwindSafe(|| {
|
||||
pdf_extract::extract_text_from_mem(&bytes)
|
||||
})) {
|
||||
Ok(Ok(text)) => text,
|
||||
Ok(Err(e)) => return Err(anyhow!("Failed to extract text from PDF: {}", e)),
|
||||
Err(_) => return Err(anyhow!("PDF extraction panicked due to invalid content stream in file: {}", file_path)),
|
||||
};
|
||||
// Check if ocrmypdf is available
|
||||
let ocrmypdf_check = tokio::process::Command::new("ocrmypdf")
|
||||
.arg("--version")
|
||||
.output()
|
||||
.await;
|
||||
|
||||
if ocrmypdf_check.is_err() || !ocrmypdf_check.unwrap().status.success() {
|
||||
return Err(anyhow!(
|
||||
"ocrmypdf is not available. Please install ocrmypdf: \
|
||||
On Ubuntu/Debian: 'apt-get install ocrmypdf'. \
|
||||
On macOS: 'brew install ocrmypdf'."
|
||||
));
|
||||
}
|
||||
|
||||
// Create temporary file for text extraction
|
||||
let temp_dir = std::env::var("TEMP_DIR").unwrap_or_else(|_| "/tmp".to_string());
|
||||
let temp_text_path = format!("{}/pdf_text_{}.txt", temp_dir, std::process::id());
|
||||
|
||||
// Progressive extraction with fallback strategies
|
||||
let mut output = tokio::process::Command::new("ocrmypdf")
|
||||
.arg("--skip-text") // Extract existing text without OCR processing
|
||||
.arg("--sidecar") // Extract text to sidecar file
|
||||
.arg(&temp_text_path)
|
||||
.arg(file_path)
|
||||
.arg("-") // Dummy output (required)
|
||||
.output()
|
||||
.await?;
|
||||
|
||||
if !output.status.success() {
|
||||
// Try with metadata fixing for corrupted files
|
||||
output = tokio::process::Command::new("ocrmypdf")
|
||||
.arg("--fix-metadata") // Fix corrupted metadata
|
||||
.arg("--skip-text") // Still extract existing text only
|
||||
.arg("--sidecar")
|
||||
.arg(&temp_text_path)
|
||||
.arg(file_path)
|
||||
.arg("-")
|
||||
.output()
|
||||
.await?;
|
||||
|
||||
if !output.status.success() {
|
||||
// Final fallback: minimal processing (may skip large pages)
|
||||
output = tokio::process::Command::new("ocrmypdf")
|
||||
.arg("--skip-big") // Skip very large pages to avoid memory issues
|
||||
.arg("--sidecar")
|
||||
.arg(&temp_text_path)
|
||||
.arg(file_path)
|
||||
.arg("-")
|
||||
.output()
|
||||
.await?;
|
||||
|
||||
if !output.status.success() {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
// Clean up temp file on error
|
||||
let _ = tokio::fs::remove_file(&temp_text_path).await;
|
||||
|
||||
// Last resort: try direct text extraction
|
||||
match self.extract_text_from_pdf_bytes(file_path).await {
|
||||
Ok(text) if !text.trim().is_empty() => {
|
||||
return Ok(text);
|
||||
}
|
||||
Ok(_) => {
|
||||
// Empty text from direct extraction
|
||||
}
|
||||
Err(_) => {
|
||||
// Direct extraction also failed
|
||||
}
|
||||
}
|
||||
|
||||
return Err(anyhow!("Failed to extract text from PDF after trying multiple strategies: {}", stderr));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Read the extracted text
|
||||
let text = tokio::fs::read_to_string(&temp_text_path).await?;
|
||||
|
||||
// Clean up temporary file
|
||||
let _ = tokio::fs::remove_file(&temp_text_path).await;
|
||||
|
||||
Ok(text.trim().to_string())
|
||||
}
|
||||
|
|
@ -106,6 +176,45 @@ impl OcrService {
|
|||
}
|
||||
}
|
||||
|
||||
/// Last resort: extract readable text directly from PDF bytes
|
||||
async fn extract_text_from_pdf_bytes(&self, file_path: &str) -> Result<String> {
|
||||
let bytes = tokio::fs::read(file_path).await?;
|
||||
|
||||
// Look for readable ASCII text in the PDF
|
||||
let mut ascii_text = String::new();
|
||||
let mut current_word = String::new();
|
||||
|
||||
for &byte in &bytes {
|
||||
if byte >= 32 && byte <= 126 { // Printable ASCII
|
||||
current_word.push(byte as char);
|
||||
} else {
|
||||
if current_word.len() > 3 { // Only keep words longer than 3 characters
|
||||
ascii_text.push_str(¤t_word);
|
||||
ascii_text.push(' ');
|
||||
}
|
||||
current_word.clear();
|
||||
}
|
||||
}
|
||||
|
||||
// Add the last word if it's long enough
|
||||
if current_word.len() > 3 {
|
||||
ascii_text.push_str(¤t_word);
|
||||
}
|
||||
|
||||
// Clean up the text
|
||||
let cleaned_text = ascii_text
|
||||
.split_whitespace()
|
||||
.filter(|word| word.len() > 1) // Filter out single characters
|
||||
.collect::<Vec<_>>()
|
||||
.join(" ");
|
||||
|
||||
if cleaned_text.trim().is_empty() {
|
||||
Err(anyhow!("No readable text found in PDF"))
|
||||
} else {
|
||||
Ok(cleaned_text)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_image_file(&self, file_path: &str) -> bool {
|
||||
if let Some(extension) = Path::new(file_path)
|
||||
.extension()
|
||||
|
|
|
|||
|
|
@ -443,18 +443,25 @@ startxref
|
|||
std::fs::write(temp_file.path(), malformed_pdf_content).unwrap();
|
||||
|
||||
let result = ocr_service.extract_text_from_pdf(temp_file.path().to_str().unwrap()).await;
|
||||
// Should not panic, should return an error instead
|
||||
assert!(result.is_err(), "Expected error for malformed PDF");
|
||||
let error_msg = result.unwrap_err().to_string();
|
||||
println!("Error message: {}", error_msg);
|
||||
// Should contain descriptive error message
|
||||
assert!(
|
||||
error_msg.contains("panic") ||
|
||||
error_msg.contains("invalid content stream") ||
|
||||
error_msg.contains("corrupted") ||
|
||||
error_msg.contains("extract") ||
|
||||
error_msg.contains("Failed to extract")
|
||||
);
|
||||
// With ocrmypdf, this should now succeed gracefully (more robust than pdf-extract)
|
||||
// or return a descriptive error - either is acceptable
|
||||
match result {
|
||||
Ok(text) => {
|
||||
println!("Successfully extracted text from malformed PDF: '{}'", text);
|
||||
// OCRmyPDF is more robust and can handle some malformed PDFs
|
||||
}
|
||||
Err(e) => {
|
||||
println!("Error extracting from malformed PDF: {}", e);
|
||||
// Should contain descriptive error message if it fails
|
||||
let error_msg = e.to_string();
|
||||
assert!(
|
||||
error_msg.contains("ocrmypdf") ||
|
||||
error_msg.contains("extraction") ||
|
||||
error_msg.contains("InputFileError") ||
|
||||
error_msg.contains("Failed to extract")
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
|
@ -573,16 +580,23 @@ This tests the error handling for files that aren't actually PDFs.";
|
|||
let problematic_encoding = "tests/test_pdfs/problematic_encoding.pdf";
|
||||
if Path::new(problematic_encoding).exists() {
|
||||
let result = ocr_service.extract_text_from_pdf(problematic_encoding).await;
|
||||
// Should not panic, should return an error instead
|
||||
assert!(result.is_err());
|
||||
let error_msg = result.unwrap_err().to_string();
|
||||
// Should contain descriptive error message
|
||||
assert!(
|
||||
error_msg.contains("panic") ||
|
||||
error_msg.contains("encoding") ||
|
||||
error_msg.contains("extract") ||
|
||||
error_msg.contains("font")
|
||||
);
|
||||
// With ocrmypdf, this may succeed gracefully or return descriptive error
|
||||
match result {
|
||||
Ok(text) => {
|
||||
println!("Successfully extracted text from problematic encoding PDF: '{}'", text);
|
||||
// OCRmyPDF's robustness allows it to handle some problematic encoding PDFs
|
||||
}
|
||||
Err(e) => {
|
||||
println!("Error extracting from problematic encoding PDF: {}", e);
|
||||
let error_msg = e.to_string();
|
||||
assert!(
|
||||
error_msg.contains("ocrmypdf") ||
|
||||
error_msg.contains("extraction") ||
|
||||
error_msg.contains("strategies") ||
|
||||
error_msg.contains("Failed to extract")
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue