// Readur/src/ocr/enhanced.rs

use anyhow::{anyhow, Result};
use tracing::{debug, info, warn};
use std::panic::{catch_unwind, AssertUnwindSafe};
#[cfg(feature = "ocr")]
use image::{DynamicImage, ImageBuffer, Luma, GenericImageView};
#[cfg(feature = "ocr")]
use imageproc::{
contrast::adaptive_threshold,
morphology::{close, open},
filter::{median_filter, gaussian_blur_f32},
distance_transform::Norm,
};
#[cfg(feature = "ocr")]
use tesseract::{Tesseract, PageSegMode, OcrEngineMode};
use crate::models::Settings;
use crate::services::file_service::FileService;
use super::xml_extractor::XmlOfficeExtractor;
// Removed text_sanitization import - now using minimal inline sanitization
/// RAII guard that deletes a temporary file when it goes out of scope.
struct FileCleanupGuard {
    /// Path of the file to remove on drop.
    file_path: String,
}

impl FileCleanupGuard {
    /// Create a guard for `file_path`; the file is removed when the guard drops.
    fn new(file_path: &str) -> Self {
        FileCleanupGuard {
            file_path: file_path.to_string(),
        }
    }
}

impl Drop for FileCleanupGuard {
    fn drop(&mut self) {
        let path = std::path::Path::new(&self.file_path);
        // Nothing to do when the file was already removed (or never created).
        if !path.exists() {
            return;
        }
        match std::fs::remove_file(path) {
            Ok(()) => debug!("Cleaned up temporary file: {}", self.file_path),
            Err(e) => warn!("Failed to clean up temporary file '{}': {}", self.file_path, e),
        }
    }
}
/// Measured quality metrics of a grayscale image, used to decide which
/// preprocessing steps are worth applying before OCR.
#[derive(Debug, Clone)]
pub struct ImageQualityStats {
    /// Mean pixel intensity in 0.0–255.0.
    pub average_brightness: f32,
    /// Normalized standard deviation of intensity (stddev / 255), in [0, 1].
    pub contrast_ratio: f32,
    /// Mean absolute deviation from the 3x3 neighborhood mean, normalized to [0, 1].
    pub noise_level: f32,
    /// Mean gradient magnitude normalized to [0, 1]; lower means blurrier.
    pub sharpness: f32,
}
/// Outcome of a single OCR / text-extraction run.
#[derive(Debug, Clone)]
pub struct OcrResult {
    /// Extracted (trimmed) text.
    pub text: String,
    /// Confidence in percent, clamped to 0.0–100.0.
    pub confidence: f32,
    /// Wall-clock processing time in milliseconds.
    pub processing_time_ms: u64,
    /// Number of whitespace-separated words in `text`.
    pub word_count: usize,
    /// Human-readable names of the preprocessing steps that were applied.
    pub preprocessing_applied: Vec<String>,
    /// Path of the preprocessed temporary image, if preprocessing produced one.
    pub processed_image_path: Option<String>,
}
/// High-level OCR service: image preprocessing, Tesseract invocation and
/// PDF text-extraction fallbacks.
pub struct EnhancedOcrService {
    /// Directory used for temporary processed images.
    pub temp_dir: String,
    /// Project file-access service.
    // NOTE(review): not referenced in the visible portion of this file —
    // confirm its usage (e.g. path resolution) against the rest of the module.
    pub file_service: FileService,
}
impl EnhancedOcrService {
// Security limits for Office document processing
/// Maximum accepted size for any Office document (100 MB) — guards against
/// memory exhaustion during extraction.
const MAX_OFFICE_DOCUMENT_SIZE: u64 = 100 * 1024 * 1024; // 100MB for all Office documents
/// Maximum accepted length of an archive entry name.
// NOTE(review): both limits appear intended for Office/XML extraction code
// outside this chunk — confirm against the rest of the file.
const MAX_ENTRY_NAME_LENGTH: usize = 255; // Maximum length of entry names
/// Remove null bytes from text to prevent PostgreSQL errors
///
/// This is the ONLY sanitization performed — all other original content is
/// preserved verbatim. Returns the cleaned string.
fn remove_null_bytes(text: &str) -> String {
    let original_len = text.len();
    let cleaned: String = text.chars().filter(|&c| c != '\0').collect();
    let cleaned_len = cleaned.len();
    // Log if we found and removed null bytes (shouldn't happen with valid documents)
    if cleaned_len < original_len {
        // '\0' occupies exactly one byte in UTF-8, so the byte-length
        // difference equals the number of nulls removed — no second scan needed.
        let null_bytes_removed = original_len - cleaned_len;
        warn!(
            "Removed {} null bytes from extracted text (original: {} bytes, cleaned: {} bytes). \
            This indicates corrupted or malformed document data.",
            null_bytes_removed, original_len, cleaned_len
        );
    }
    cleaned
}
pub fn new(temp_dir: String, file_service: FileService) -> Self {
Self { temp_dir, file_service }
}
/// Extract text from image with high-quality OCR settings
///
/// Optionally preprocesses the image (per `settings`), then runs Tesseract on
/// the blocking thread pool. Returns the recognized text with confidence,
/// timing, word count and the list of preprocessing steps applied.
#[cfg(feature = "ocr")]
pub async fn extract_text_from_image(&self, file_path: &str, settings: &Settings) -> Result<OcrResult> {
    let start_time = std::time::Instant::now();
    info!("Starting enhanced OCR for image: {}", file_path);
    let mut preprocessing_applied = Vec::new();
    // Load and preprocess the image when enabled; otherwise OCR the original.
    let (processed_image_path, preprocess_steps) = if settings.enable_image_preprocessing {
        self.preprocess_image(file_path, settings).await?
    } else {
        (file_path.to_string(), Vec::new())
    };
    preprocessing_applied.extend(preprocess_steps);
    // Move CPU-intensive OCR work off the async runtime.
    // (The previously cloned-but-unused `temp_dir` capture was removed.)
    let processed_image_path_clone = processed_image_path.clone();
    let settings_clone = settings.clone();
    let (text, confidence) = tokio::task::spawn_blocking(move || -> Result<(String, f32)> {
        // Configure Tesseract with optimal settings
        let mut tesseract = Self::configure_tesseract_static(&processed_image_path_clone, &settings_clone)?;
        // Extract text with confidence
        let text = tesseract.get_text()?.trim().to_string();
        let confidence = Self::calculate_overall_confidence_static(&mut tesseract)?;
        Ok((text, confidence))
    }).await??;
    let processing_time = start_time.elapsed().as_millis() as u64;
    let word_count = text.split_whitespace().count();
    debug!(
        "OCR completed: {} words, {:.1}% confidence, {}ms",
        word_count, confidence, processing_time
    );
    // Report the processed image path only when preprocessing produced a new
    // file (caller will handle cleanup/saving).
    let result_processed_image_path = if processed_image_path != file_path {
        Some(processed_image_path.clone())
    } else {
        None
    };
    let result = OcrResult {
        text,
        confidence,
        processing_time_ms: processing_time,
        word_count,
        preprocessing_applied,
        processed_image_path: result_processed_image_path,
    };
    // Clean up the temporary processed image unless the user asked to keep it.
    if let Some(ref temp_path) = result.processed_image_path {
        if !settings.save_processed_images {
            let _ = tokio::fs::remove_file(temp_path).await;
        }
    }
    Ok(result)
}
/// Preprocess image for optimal OCR quality, especially for challenging conditions
///
/// Pipeline: resolve path → optional orientation fix → smart resize →
/// grayscale → quality analysis → selective enhancements → save to a temp
/// PNG. Returns `(processed_image_path, applied_step_names)`.
#[cfg(feature = "ocr")]
async fn preprocess_image(&self, input_path: &str, settings: &Settings) -> Result<(String, Vec<String>)> {
    // Resolve the file path first
    let resolved_path = self.resolve_file_path(input_path).await?;
    let img = image::open(&resolved_path)?;
    let mut processed_img = img;
    let mut preprocessing_applied = Vec::new();
    info!("Original image dimensions: {}x{}", processed_img.width(), processed_img.height());
    // Apply orientation detection and correction
    if settings.ocr_detect_orientation {
        processed_img = self.detect_and_correct_orientation(processed_img)?;
    }
    // Aggressively upscale low-resolution images for better OCR
    processed_img = self.smart_resize_for_ocr(processed_img, settings.ocr_dpi)?;
    // Convert to grayscale for better OCR
    let gray_img = processed_img.to_luma8();
    let mut processed_gray = gray_img;
    // Analyze image quality and apply appropriate enhancements
    let quality_stats = self.analyze_image_quality(&processed_gray);
    info!("Image quality analysis: brightness={:.1}, contrast={:.1}, noise_level={:.1}, sharpness={:.1}",
        quality_stats.average_brightness, quality_stats.contrast_ratio, quality_stats.noise_level, quality_stats.sharpness);
    // Determine if image needs enhancement based on quality thresholds
    let needs_enhancement = self.needs_enhancement(&quality_stats, settings);
    if !needs_enhancement {
        info!("Image quality is good, skipping enhancement steps");
    } else {
        info!("Image quality needs improvement, applying selective enhancements");
        // Apply brightness correction only for very dim images (or when the
        // user configured an explicit boost)
        if quality_stats.average_brightness < 50.0 || settings.ocr_brightness_boost > 0.0 {
            processed_gray = self.enhance_brightness_and_contrast(processed_gray, &quality_stats, settings)?;
            preprocessing_applied.push("Brightness/contrast correction".to_string());
        }
        // Apply noise removal only for very noisy images
        if quality_stats.noise_level > 0.25 || (settings.ocr_remove_noise && settings.ocr_noise_reduction_level > 1) {
            processed_gray = self.adaptive_noise_removal(processed_gray, &quality_stats, settings)?;
            preprocessing_applied.push("Noise reduction".to_string());
        }
        // Apply contrast enhancement only for very low contrast images
        if quality_stats.contrast_ratio < 0.2 || (settings.ocr_enhance_contrast && settings.ocr_adaptive_threshold_window_size > 0) {
            // Keep a copy so the fallback path can start from the same input.
            let original_gray = processed_gray.clone();
            match self.adaptive_contrast_enhancement(processed_gray, &quality_stats, settings) {
                Ok(enhanced) => {
                    processed_gray = enhanced;
                    preprocessing_applied.push("Contrast enhancement".to_string());
                }
                Err(e) => {
                    warn!("Contrast enhancement failed, using alternative method: {}", e);
                    // Fallback to basic contrast enhancement; if that also
                    // fails, continue with the unmodified image.
                    processed_gray = self.apply_alternative_contrast_enhancement(original_gray.clone(), &quality_stats, settings)
                        .unwrap_or_else(|_| {
                            warn!("Alternative contrast enhancement also failed, using original image");
                            original_gray
                        });
                    preprocessing_applied.push("Basic contrast enhancement".to_string());
                }
            }
        }
        // Apply sharpening only for very blurry images
        if quality_stats.sharpness < 0.2 || settings.ocr_sharpening_strength > 0.5 {
            processed_gray = self.sharpen_image(processed_gray, settings)?;
            preprocessing_applied.push("Image sharpening".to_string());
        }
        // Apply morphological operations only if explicitly enabled and image needs it
        if settings.ocr_morphological_operations && quality_stats.noise_level > 0.15 {
            processed_gray = self.apply_morphological_operations(processed_gray)?;
            preprocessing_applied.push("Morphological operations".to_string());
        }
    }
    // Save processed image to temporary file; PID + millisecond timestamp
    // keeps concurrent workers from colliding on the filename.
    let temp_filename = format!("processed_{}_{}.png",
        std::process::id(),
        std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)?.as_millis()
    );
    let temp_path = format!("{}/{}", self.temp_dir, temp_filename);
    let dynamic_processed = DynamicImage::ImageLuma8(processed_gray);
    dynamic_processed.save(&temp_path)?;
    info!("Processed image saved to: {}", temp_path);
    Ok((temp_path, preprocessing_applied))
}
/// Determine if image needs enhancement based on quality thresholds
#[cfg(feature = "ocr")]
fn needs_enhancement(&self, stats: &ImageQualityStats, settings: &Settings) -> bool {
    // An explicit user opt-out always wins.
    if settings.ocr_skip_enhancement {
        info!("OCR enhancement disabled by user setting");
        return false;
    }
    // Compare each measured metric against its user-configurable threshold.
    let needs_brightness_fix = stats.average_brightness < settings.ocr_quality_threshold_brightness;
    let needs_contrast_fix = stats.contrast_ratio < settings.ocr_quality_threshold_contrast;
    let needs_noise_fix = stats.noise_level > settings.ocr_quality_threshold_noise;
    let needs_sharpening = stats.sharpness < settings.ocr_quality_threshold_sharpness;
    // The user can also force enhancement by configuring aggressive values.
    let user_wants_enhancement = settings.ocr_brightness_boost > 0.0
        || settings.ocr_contrast_multiplier > 1.0
        || settings.ocr_noise_reduction_level > 1
        || settings.ocr_sharpening_strength > 0.0;
    let needs_enhancement = needs_brightness_fix
        || needs_contrast_fix
        || needs_noise_fix
        || needs_sharpening
        || user_wants_enhancement;
    info!("Enhancement decision: brightness_ok={}, contrast_ok={}, noise_ok={}, sharpness_ok={}, user_enhancement={}, needs_enhancement={}",
        !needs_brightness_fix, !needs_contrast_fix, !needs_noise_fix, !needs_sharpening, user_wants_enhancement, needs_enhancement);
    needs_enhancement
}
/// Build language combination string for Tesseract (e.g., "eng+spa")
fn build_language_combination(&self, settings: &Settings) -> String {
    match settings.preferred_languages.len() {
        // No preferred languages configured: fall back to the legacy
        // `ocr_language` field for backward compatibility.
        0 => settings.ocr_language.clone(),
        // Exactly one preferred language: use it directly.
        1 => settings.preferred_languages[0].clone(),
        // Several languages: primary language first, then the remaining
        // preferred ones, joined with '+' (Tesseract multi-language format).
        _ => {
            let mut languages = settings.preferred_languages.clone();
            languages.retain(|lang| lang != &settings.primary_language);
            languages.insert(0, settings.primary_language.clone());
            languages.join("+")
        }
    }
}
/// Configure Tesseract with optimal settings
///
/// Creates a Tesseract handle for `image_path` using the combined language
/// string derived from user settings, and applies the configured page
/// segmentation mode. The engine mode is computed but not applied (see note).
#[cfg(feature = "ocr")]
fn configure_tesseract(&self, image_path: &str, settings: &Settings) -> Result<Tesseract> {
    let language_combination = self.build_language_combination(settings);
    let mut tesseract = Tesseract::new(None, Some(&language_combination))?;
    // Set the image (the crate's builder-style API consumes and returns the handle)
    tesseract = tesseract.set_image(image_path)?;
    // Configure Page Segmentation Mode (PSM); the numeric values follow
    // Tesseract's documented PSM numbering (0-13).
    let psm = match settings.ocr_page_segmentation_mode {
        0 => PageSegMode::PsmOsdOnly,
        1 => PageSegMode::PsmAutoOsd,
        2 => PageSegMode::PsmAutoOnly,
        3 => PageSegMode::PsmAuto,
        4 => PageSegMode::PsmSingleColumn,
        5 => PageSegMode::PsmSingleBlockVertText,
        6 => PageSegMode::PsmSingleBlock,
        7 => PageSegMode::PsmSingleLine,
        8 => PageSegMode::PsmSingleWord,
        9 => PageSegMode::PsmCircleWord,
        10 => PageSegMode::PsmSingleChar,
        11 => PageSegMode::PsmSparseText,
        12 => PageSegMode::PsmSparseTextOsd,
        13 => PageSegMode::PsmRawLine,
        _ => PageSegMode::PsmAuto, // Default fallback
    };
    tesseract.set_page_seg_mode(psm);
    // Configure OCR Engine Mode (OEM) — computed but deliberately unused.
    let _oem = match settings.ocr_engine_mode {
        0 => OcrEngineMode::TesseractOnly,
        1 => OcrEngineMode::LstmOnly,
        2 => OcrEngineMode::TesseractLstmCombined,
        3 => OcrEngineMode::Default,
        _ => OcrEngineMode::Default, // Default fallback
    };
    // Note: set_engine_mode may not be available in the current tesseract crate version
    // We'll configure this differently if needed
    // Basic configuration - skip advanced settings that might cause issues
    // Only set essential variables that are widely supported
    Ok(tesseract)
}
/// Calculate overall confidence score using Tesseract's mean confidence
#[cfg(feature = "ocr")]
fn calculate_overall_confidence(&self, tesseract: &mut Tesseract) -> Result<f32> {
    // Tesseract reports mean word confidence as an integer percentage.
    let raw_confidence = tesseract.mean_text_conf();
    // Widen to f32 and clamp into the valid 0–100 range.
    let clamped_confidence = (raw_confidence as f32).max(0.0).min(100.0);
    debug!("Tesseract confidence: {} -> {:.1}%", raw_confidence, clamped_confidence);
    Ok(clamped_confidence)
}
/// Detect and correct image orientation
///
/// Heuristic only: images more than twice as wide as tall are rotated 90°.
/// A production system could use Tesseract's OSD or a real detector instead.
#[cfg(feature = "ocr")]
fn detect_and_correct_orientation(&self, img: DynamicImage) -> Result<DynamicImage> {
    let (width, height) = img.dimensions();
    let aspect_ratio = width as f32 / height as f32;
    // Very wide images are assumed to be rotated scans.
    if aspect_ratio > 2.0 {
        Ok(img.rotate90())
    } else {
        Ok(img)
    }
}
/// Smart resize for OCR - optimize image size for best OCR performance
#[cfg(feature = "ocr")]
fn smart_resize_for_ocr(&self, img: DynamicImage, _target_dpi: i32) -> Result<DynamicImage> {
    let (width, height) = img.dimensions();
    let max_dimension = width.max(height);
    let min_dimension = width.min(height);
    // Pick a scale factor: shrink overly large images (memory/perf), grow
    // tiny ones (poor OCR otherwise), leave everything in between untouched.
    let scale_factor = if max_dimension > 2048 {
        Some(2048.0 / max_dimension as f32)
    } else if min_dimension < 300 {
        Some(600.0 / min_dimension as f32)
    } else {
        None
    };
    match scale_factor {
        Some(factor) => {
            let new_width = (width as f32 * factor) as u32;
            let new_height = (height as f32 * factor) as u32;
            if max_dimension > 2048 {
                info!("Scaling down large image ({}x{}) by factor {:.2}x to {}x{} for optimal OCR",
                    width, height, factor, new_width, new_height);
            } else {
                info!("Scaling up small image ({}x{}) by factor {:.2}x to {}x{} for better OCR",
                    width, height, factor, new_width, new_height);
            }
            if new_width != width || new_height != height {
                // Lanczos3 gives the best quality for rescaling.
                Ok(img.resize(new_width, new_height, image::imageops::FilterType::Lanczos3))
            } else {
                Ok(img)
            }
        }
        None => Ok(img),
    }
}
/// Analyze image quality metrics
#[cfg(feature = "ocr")]
fn analyze_image_quality(&self, img: &ImageBuffer<Luma<u8>, Vec<u8>>) -> ImageQualityStats {
    let (width, height) = img.dimensions();
    // Images above ~4 megapixels are sampled to keep the scan cheap and to
    // avoid accumulator overflow.
    let use_sampling = (width as u64) * (height as u64) > 4_000_000;
    let (average_brightness, variance) = if use_sampling {
        self.analyze_quality_sampled(img)
    } else {
        self.analyze_quality_full(img)
    };
    ImageQualityStats {
        average_brightness,
        // Normalized standard deviation serves as the contrast metric.
        contrast_ratio: variance.sqrt() / 255.0,
        noise_level: self.estimate_noise_level(img),
        sharpness: self.estimate_sharpness(img),
    }
}
/// Analyze quality for normal-sized images (< 4 megapixels)
///
/// Returns `(average_brightness, variance)` computed over every pixel.
/// Iterates the buffer directly instead of copying all pixel values into a
/// temporary `Vec<u8>` as before (saves an allocation the size of the image).
#[cfg(feature = "ocr")]
fn analyze_quality_full(&self, img: &ImageBuffer<Luma<u8>, Vec<u8>>) -> (f32, f32) {
    let pixel_count = ((img.width() as u64) * (img.height() as u64)) as f32;
    // u64 accumulator prevents overflow for any image below the 4 MP cap.
    let sum: u64 = img.pixels().map(|p| p[0] as u64).sum();
    let average_brightness = sum as f32 / pixel_count;
    // Second pass for the variance (same traversal order as before, so the
    // f32 summation yields identical results).
    let variance: f32 = img.pixels()
        .map(|p| {
            let diff = p[0] as f32 - average_brightness;
            diff * diff
        })
        .sum::<f32>() / pixel_count;
    (average_brightness, variance)
}
/// Analyze quality for large images using sampling
///
/// Samples every 10th pixel in each dimension and returns
/// `(average_brightness, variance)` over the sampled values. The samples are
/// collected once instead of walking the image (and calling `get_pixel`)
/// twice as before; the traversal order is unchanged, so results are identical.
#[cfg(feature = "ocr")]
fn analyze_quality_sampled(&self, img: &ImageBuffer<Luma<u8>, Vec<u8>>) -> (f32, f32) {
    let (width, height) = img.dimensions();
    let mut samples: Vec<u8> =
        Vec::with_capacity(((width / 10 + 1) as usize) * ((height / 10 + 1) as usize));
    // Sample every 10th pixel to avoid overflow and improve performance.
    for y in (0..height).step_by(10) {
        for x in (0..width).step_by(10) {
            samples.push(img.get_pixel(x, y)[0]);
        }
    }
    if samples.is_empty() {
        // Degenerate (zero-sized) image: report mid-gray, zero variance.
        return (128.0, 0.0);
    }
    let sample_count = samples.len() as f32;
    // u64 accumulator prevents overflow regardless of sample count.
    let pixel_sum: u64 = samples.iter().map(|&p| p as u64).sum();
    let average_brightness = pixel_sum as f32 / sample_count;
    let variance = samples.iter()
        .map(|&p| {
            let diff = p as f32 - average_brightness;
            diff * diff
        })
        .sum::<f32>() / sample_count;
    (average_brightness, variance)
}
/// Estimate noise level in image
///
/// Samples every 10th interior pixel and averages the absolute deviation of
/// each sample from its 3x3 neighborhood mean, normalized to [0, 1].
#[cfg(feature = "ocr")]
fn estimate_noise_level(&self, img: &ImageBuffer<Luma<u8>, Vec<u8>>) -> f32 {
    let (width, height) = img.dimensions();
    // Guard against u32 underflow in the `5..dim-5` ranges below for tiny
    // images (previously `height - 5` would panic/wrap for height < 5);
    // such images have no interior samples anyway.
    if width < 5 || height < 5 {
        return 0.0;
    }
    let mut noise_sum = 0.0f32;
    let mut sample_count = 0u32;
    // Sample every 10th pixel to estimate noise
    for y in (5..height - 5).step_by(10) {
        for x in (5..width - 5).step_by(10) {
            let center = img.get_pixel(x, y)[0] as f32;
            let mut neighbor_sum = 0.0f32;
            let mut neighbor_count = 0u32;
            // Average the 8 surrounding pixels (3x3 neighborhood minus center)
            for dy in -1..=1 {
                for dx in -1..=1 {
                    if dx == 0 && dy == 0 { continue; }
                    let neighbor = img.get_pixel((x as i32 + dx) as u32, (y as i32 + dy) as u32)[0] as f32;
                    neighbor_sum += neighbor;
                    neighbor_count += 1;
                }
            }
            let neighbor_avg = neighbor_sum / neighbor_count as f32;
            // Deviation from the local mean approximates per-pixel noise.
            let local_variance = (center - neighbor_avg).abs();
            noise_sum += local_variance;
            sample_count += 1;
        }
    }
    if sample_count > 0 {
        (noise_sum / sample_count as f32) / 255.0
    } else {
        0.0
    }
}
/// Estimate image sharpness using gradient magnitude
///
/// Computes the mean central-difference gradient magnitude over interior
/// pixels (sampled for large images), normalized to [0, 1].
#[cfg(feature = "ocr")]
fn estimate_sharpness(&self, img: &ImageBuffer<Luma<u8>, Vec<u8>>) -> f32 {
    let (width, height) = img.dimensions();
    // Guard against u32 underflow in `1..dim-1` for degenerate images
    // (previously `height - 1` would underflow for a 0-height buffer);
    // below 3x3 there are no interior pixels, so sharpness is 0.
    if width < 3 || height < 3 {
        return 0.0;
    }
    let mut gradient_sum = 0.0f32;
    let mut sample_count = 0u64; // u64 prevents overflow on very large images
    // For large images, sample pixels to avoid performance issues.
    let total_pixels = (width as u64) * (height as u64);
    let step_size = if total_pixels > 4_000_000 { 10 } else { 1 };
    // Calculate gradients for interior pixels (the unused center-pixel read
    // from the original version was dropped).
    for y in (1..height - 1).step_by(step_size) {
        for x in (1..width - 1).step_by(step_size) {
            let left = img.get_pixel(x - 1, y)[0] as f32;
            let right = img.get_pixel(x + 1, y)[0] as f32;
            let top = img.get_pixel(x, y - 1)[0] as f32;
            let bottom = img.get_pixel(x, y + 1)[0] as f32;
            // Central differences in x and y.
            let grad_x = (right - left) / 2.0;
            let grad_y = (bottom - top) / 2.0;
            let gradient_magnitude = (grad_x * grad_x + grad_y * grad_y).sqrt();
            gradient_sum += gradient_magnitude;
            sample_count += 1;
        }
    }
    if sample_count > 0 {
        (gradient_sum / sample_count as f32) / 255.0
    } else {
        0.0
    }
}
/// Enhanced brightness and contrast correction for dim images
#[cfg(feature = "ocr")]
fn enhance_brightness_and_contrast(&self, img: ImageBuffer<Luma<u8>, Vec<u8>>, stats: &ImageQualityStats, settings: &Settings) -> Result<ImageBuffer<Luma<u8>, Vec<u8>>> {
    // A user-configured boost takes precedence; otherwise derive the boost
    // from how dim the image is (strongest for very dark images).
    let brightness_boost = if settings.ocr_brightness_boost > 0.0 {
        settings.ocr_brightness_boost
    } else if stats.average_brightness < 50.0 {
        60.0 - stats.average_brightness
    } else if stats.average_brightness < 80.0 {
        30.0 - (stats.average_brightness - 50.0) * 0.5
    } else {
        0.0
    };
    // Same idea for contrast: user setting first, then severity-based.
    let contrast_multiplier = if settings.ocr_contrast_multiplier > 0.0 {
        settings.ocr_contrast_multiplier
    } else if stats.contrast_ratio < 0.2 {
        2.5
    } else if stats.contrast_ratio < 0.4 {
        1.8
    } else {
        1.2
    };
    info!("Applying brightness boost: {:.1}, contrast multiplier: {:.1}", brightness_boost, contrast_multiplier);
    // value' = clamp(round((value + boost) * multiplier), 0, 255)
    let enhanced = ImageBuffer::from_fn(img.width(), img.height(), |x, y| {
        let original_value = img.get_pixel(x, y)[0] as f32;
        let enhanced_value = ((original_value + brightness_boost) * contrast_multiplier).round();
        Luma([enhanced_value.max(0.0).min(255.0) as u8])
    });
    Ok(enhanced)
}
/// Adaptive noise removal based on detected noise level
///
/// The strength comes from `settings.ocr_noise_reduction_level` when set
/// (> 0); otherwise it is derived from the measured noise statistics.
#[cfg(feature = "ocr")]
fn adaptive_noise_removal(&self, img: ImageBuffer<Luma<u8>, Vec<u8>>, stats: &ImageQualityStats, settings: &Settings) -> Result<ImageBuffer<Luma<u8>, Vec<u8>>> {
    let mut processed = img;
    let noise_level = if settings.ocr_noise_reduction_level > 0 {
        settings.ocr_noise_reduction_level
    } else if stats.noise_level > 0.2 {
        3 // Heavy noise
    } else if stats.noise_level > 0.1 {
        2 // Moderate noise
    } else {
        1 // Light noise
    };
    match noise_level {
        3 => {
            // Heavy noise - apply multiple filters
            processed = median_filter(&processed, 2, 2); // Larger median filter
            processed = gaussian_blur_f32(&processed, 0.8); // More blur
            info!("Applied heavy noise reduction");
        },
        2 => {
            // Moderate noise
            processed = median_filter(&processed, 1, 1);
            processed = gaussian_blur_f32(&processed, 0.5);
            info!("Applied moderate noise reduction");
        },
        // Level 1 and anything unexpected: light cleanup only.
        // (The original `1 | _` pattern was redundant — `_` already covers 1.)
        _ => {
            processed = median_filter(&processed, 1, 1);
            info!("Applied light noise reduction");
        }
    }
    Ok(processed)
}
/// Adaptive contrast enhancement based on image quality
///
/// Applies imageproc's adaptive threshold with a window size taken from user
/// settings or derived from the image dimensions. Large images, and any panic
/// from the thresholding (integral-image overflow), fall back to the
/// histogram-equalization-based alternative.
#[cfg(feature = "ocr")]
fn adaptive_contrast_enhancement(&self, img: ImageBuffer<Luma<u8>, Vec<u8>>, stats: &ImageQualityStats, settings: &Settings) -> Result<ImageBuffer<Luma<u8>, Vec<u8>>> {
    // Choose threshold size based on image dimensions and quality
    let (width, height) = img.dimensions();
    let min_dimension = width.min(height);
    // Check if image is too large for safe adaptive threshold processing
    // The integral image calculation can overflow with large images
    if width as u64 * height as u64 > 1_500_000 {
        info!("Image too large for adaptive threshold ({}x{}), using alternative contrast enhancement", width, height);
        return self.apply_alternative_contrast_enhancement(img, stats, settings);
    }
    let threshold_size = if settings.ocr_adaptive_threshold_window_size > 0 {
        // Use user-configured window size
        settings.ocr_adaptive_threshold_window_size as u32
    } else if stats.contrast_ratio < 0.2 {
        // Low contrast - use smaller windows for more aggressive local adaptation
        (min_dimension / 20).max(11).min(31)
    } else {
        // Good contrast - use larger windows
        (min_dimension / 15).max(15).min(41)
    };
    // Ensure odd number for threshold size
    let threshold_size = if threshold_size % 2 == 0 { threshold_size + 1 } else { threshold_size };
    info!("Applying adaptive threshold with window size: {}", threshold_size);
    // Wrap in a panic-safe block to catch overflow errors inside imageproc
    let enhanced = catch_unwind(AssertUnwindSafe(|| {
        adaptive_threshold(&img, threshold_size)
    }));
    match enhanced {
        Ok(result) => Ok(result),
        Err(_) => {
            warn!("Adaptive threshold panicked (likely overflow), using alternative method");
            self.apply_alternative_contrast_enhancement(img, stats, settings)
        }
    }
}
/// Alternative contrast enhancement for large images to avoid overflow
#[cfg(feature = "ocr")]
fn apply_alternative_contrast_enhancement(&self, img: ImageBuffer<Luma<u8>, Vec<u8>>, stats: &ImageQualityStats, settings: &Settings) -> Result<ImageBuffer<Luma<u8>, Vec<u8>>> {
    let (width, height) = img.dimensions();
    // Histogram equalization is used instead of adaptive thresholding here
    // because the latter's integral image can overflow on large inputs.
    if settings.ocr_histogram_equalization {
        info!("Applying histogram equalization for contrast enhancement (user enabled)");
    } else {
        info!("Applying histogram equalization for contrast enhancement (fallback)");
    }
    // Per-intensity pixel counts; u64 so huge images cannot overflow.
    let mut histogram = [0u64; 256];
    for pixel in img.pixels() {
        histogram[pixel[0] as usize] += 1;
    }
    // Cumulative distribution function via a running sum.
    let total_pixels = (width as u64) * (height as u64);
    let mut cdf = [0u64; 256];
    let mut running = 0u64;
    for (i, &count) in histogram.iter().enumerate() {
        running += count;
        cdf[i] = running;
    }
    // Map each intensity to its equalized value; intensities whose CDF
    // prefix is empty stay mapped to 0.
    let mut lookup = [0u8; 256];
    for (i, &cumulative) in cdf.iter().enumerate() {
        if cumulative > 0 {
            lookup[i] = ((cumulative as f64 / total_pixels as f64) * 255.0) as u8;
        }
    }
    // Remap every pixel through the lookup table.
    let mut enhanced = ImageBuffer::new(width, height);
    for (x, y, pixel) in img.enumerate_pixels() {
        enhanced.put_pixel(x, y, Luma([lookup[pixel[0] as usize]]));
    }
    // Very flat images get an extra contrast stretch on top.
    if stats.contrast_ratio < 0.3 {
        enhanced = self.apply_contrast_stretching(enhanced)?;
    }
    Ok(enhanced)
}
/// Apply contrast stretching to improve dynamic range
#[cfg(feature = "ocr")]
fn apply_contrast_stretching(&self, img: ImageBuffer<Luma<u8>, Vec<u8>>) -> Result<ImageBuffer<Luma<u8>, Vec<u8>>> {
    // Locate the darkest and brightest pixel values.
    let mut min_val = 255u8;
    let mut max_val = 0u8;
    for pixel in img.pixels() {
        let val = pixel[0];
        if val < min_val { min_val = val; }
        if val > max_val { max_val = val; }
    }
    // A flat image (single value) cannot be stretched; return it unchanged.
    if max_val == min_val {
        return Ok(img);
    }
    let range = max_val - min_val;
    // Linearly remap [min_val, max_val] onto the full [0, 255] range.
    let stretched = ImageBuffer::from_fn(img.width(), img.height(), |x, y| {
        let old_value = img.get_pixel(x, y)[0];
        Luma([(((old_value - min_val) as f32 / range as f32) * 255.0) as u8])
    });
    Ok(stretched)
}
/// Sharpen blurry images
///
/// Applies a fixed 3x3 unsharp-mask kernel to interior pixels and copies the
/// 1-pixel border unchanged.
///
/// NOTE(review): `_settings` is currently unused — the kernel strength is
/// fixed, and `ocr_sharpening_strength` is only consulted by the caller to
/// decide WHETHER to sharpen. Kept in the signature for future tuning;
/// renamed to silence the unused-variable warning.
#[cfg(feature = "ocr")]
fn sharpen_image(&self, img: ImageBuffer<Luma<u8>, Vec<u8>>, _settings: &Settings) -> Result<ImageBuffer<Luma<u8>, Vec<u8>>> {
    let (width, height) = img.dimensions();
    // Degenerate images (< 3 px in either dimension) have no interior to
    // convolve; this also guards the `1..dim-1` ranges below against u32
    // underflow for 0- or 1-pixel dimensions.
    if width < 3 || height < 3 {
        info!("Applied image sharpening");
        return Ok(img);
    }
    let mut sharpened = ImageBuffer::new(width, height);
    // Unsharp mask kernel - enhances edges; weights sum to 1, preserving
    // overall brightness.
    let kernel = [
        [0.0, -1.0, 0.0],
        [-1.0, 5.0, -1.0],
        [0.0, -1.0, 0.0],
    ];
    // Convolve interior pixels only; the kernel would read out of bounds on
    // the border.
    for y in 1..height - 1 {
        for x in 1..width - 1 {
            let mut sum = 0.0;
            for ky in 0..3 {
                for kx in 0..3 {
                    let px = img.get_pixel(x + kx - 1, y + ky - 1)[0] as f32;
                    sum += px * kernel[ky as usize][kx as usize];
                }
            }
            let sharpened_value = sum.round().max(0.0).min(255.0) as u8;
            sharpened.put_pixel(x, y, Luma([sharpened_value]));
        }
    }
    // Copy border pixels unmodified.
    for y in 0..height {
        for x in 0..width {
            if x == 0 || x == width - 1 || y == 0 || y == height - 1 {
                sharpened.put_pixel(x, y, *img.get_pixel(x, y));
            }
        }
    }
    info!("Applied image sharpening");
    Ok(sharpened)
}
/// Apply morphological operations for text clarity
#[cfg(feature = "ocr")]
fn apply_morphological_operations(&self, img: ImageBuffer<Luma<u8>, Vec<u8>>) -> Result<ImageBuffer<Luma<u8>, Vec<u8>>> {
    // Opening removes small speckle noise; the closing that follows fills
    // small gaps inside glyph strokes.
    let opened = open(&img, Norm::LInf, 1);
    Ok(close(&opened, Norm::LInf, 1))
}
/// Extract text from PDF using ocrmypdf
///
/// Strategy, in order:
/// 1. validate size (≤ 100 MB) and PDF header;
/// 2. quick text-layer extraction (pdftotext) — accepted when the quality
///    heuristics pass (confidence 95.0);
/// 3. full OCR via ocrmypdf;
/// 4. last-resort direct byte-level extraction (confidence 50.0).
#[cfg(feature = "ocr")]
pub async fn extract_text_from_pdf(&self, file_path: &str, settings: &Settings) -> Result<OcrResult> {
    let start_time = std::time::Instant::now();
    info!("Extracting text from PDF: {}", file_path);
    // Check file size before processing
    let metadata = tokio::fs::metadata(file_path).await?;
    let file_size = metadata.len();
    // Limit PDF size to 100MB to prevent memory exhaustion
    const MAX_PDF_SIZE: u64 = 100 * 1024 * 1024; // 100MB
    if file_size > MAX_PDF_SIZE {
        return Err(anyhow!(
            "PDF file too large: {:.1} MB (max: {:.1} MB). Consider splitting the PDF.",
            file_size as f64 / (1024.0 * 1024.0),
            MAX_PDF_SIZE as f64 / (1024.0 * 1024.0)
        ));
    }
    // Check if it's a valid PDF by reading first 1KB (or the whole file if smaller)
    let mut header_bytes = vec![0u8; 1024.min(file_size as usize)];
    let mut file = tokio::fs::File::open(file_path).await?;
    use tokio::io::AsyncReadExt;
    file.read_exact(&mut header_bytes).await?;
    drop(file);
    if !is_valid_pdf(&header_bytes) {
        return Err(anyhow!(
            "Invalid PDF file: Missing or corrupted PDF header. File size: {} bytes, Header: {:?}",
            file_size,
            // Render the first 50 header bytes, replacing non-printables with '.'
            header_bytes.get(0..50).unwrap_or(&[]).iter().map(|&b| {
                if b >= 32 && b <= 126 { b as char } else { '.' }
            }).collect::<String>()
        ));
    }
    // Check if ocrmypdf is available
    if !self.is_ocrmypdf_available().await {
        return Err(anyhow!(
            "ocrmypdf is not available on this system. To extract text from PDFs, please install ocrmypdf. \
            On Ubuntu/Debian: 'apt-get install ocrmypdf'. \
            On macOS: 'brew install ocrmypdf'."
        ));
    }
    // First try to extract text without OCR for performance (using --skip-text)
    let quick_extraction_result = self.extract_pdf_text_quick(file_path).await;
    match quick_extraction_result {
        Ok((text, extraction_time)) => {
            let word_count = self.count_words_safely(&text);
            // Check if quick extraction got good results
            if self.is_text_extraction_quality_sufficient(&text, word_count, file_size) {
                info!("PDF text extraction successful for '{}' using quick method", file_path);
                return Ok(OcrResult {
                    text,
                    confidence: 95.0,
                    processing_time_ms: extraction_time,
                    word_count,
                    preprocessing_applied: vec!["PDF text extraction (pdftotext)".to_string()],
                    processed_image_path: None,
                });
            } else {
                info!("Quick PDF extraction insufficient for '{}' ({} words), using full OCR", file_path, word_count);
            }
        }
        Err(e) => {
            warn!("Quick PDF extraction failed for '{}': {}, using full OCR", file_path, e);
        }
    }
    // If quick extraction failed or was insufficient, use full OCR
    let full_ocr_result = self.extract_text_from_pdf_with_ocr(file_path, settings, start_time).await;
    // If OCR also fails, try direct text extraction as last resort
    if full_ocr_result.is_err() {
        warn!("Full OCR failed, trying direct text extraction as last resort for: {}", file_path);
        match self.extract_text_from_pdf_bytes(file_path).await {
            Ok(text) if !text.trim().is_empty() => {
                let processing_time = start_time.elapsed().as_millis() as u64;
                let word_count = self.count_words_safely(&text);
                info!("Direct text extraction succeeded as last resort for: {}", file_path);
                return Ok(OcrResult {
                    text,
                    confidence: 50.0, // Lower confidence for direct extraction
                    processing_time_ms: processing_time,
                    word_count,
                    preprocessing_applied: vec!["Direct PDF text extraction (last resort)".to_string()],
                    processed_image_path: None,
                });
            }
            Ok(_) => {
                warn!("Direct text extraction returned empty text for: {}", file_path);
            }
            Err(e) => {
                warn!("Direct text extraction also failed for {}: {}", file_path, e);
            }
        }
    }
    // Either the successful OCR result or the original OCR error.
    full_ocr_result
}
/// Assess if text extraction quality is sufficient or if OCR fallback is needed
///
/// Heuristics: word count, word density relative to file size, and the share
/// of alphanumeric characters in the extracted text.
#[cfg(feature = "ocr")]
fn is_text_extraction_quality_sufficient(&self, text: &str, word_count: usize, file_size: u64) -> bool {
    // If we got no words at all, definitely need OCR
    if word_count == 0 {
        return false;
    }
    // For very small files, low word count might be normal
    if file_size < 50_000 && word_count >= 1 {
        return true;
    }
    // Calculate word density (words per KB)
    let file_size_kb = (file_size as f64) / 1024.0;
    let word_density = (word_count as f64) / file_size_kb;
    // Reasonable thresholds based on typical PDF content:
    // - Text-based PDFs typically have 50-200 words per KB
    // - Below 5 words per KB suggests mostly images/scanned content
    // - But if we have a substantial number of words (>50), accept it regardless of density
    const MIN_WORD_DENSITY: f64 = 5.0;
    const MIN_WORDS_FOR_LARGE_FILES: usize = 10;
    const SUBSTANTIAL_WORD_COUNT: usize = 50;
    // If we have substantial text, accept it regardless of density
    if word_count >= SUBSTANTIAL_WORD_COUNT {
        debug!("PDF has substantial text content: {} words, accepting regardless of density", word_count);
        return true;
    }
    if word_density < MIN_WORD_DENSITY && word_count < MIN_WORDS_FOR_LARGE_FILES {
        debug!("PDF appears to be image-based: {} words in {:.1} KB (density: {:.2} words/KB)",
            word_count, file_size_kb, word_density);
        return false;
    }
    // Additional check: if text is mostly non-alphanumeric, might be extraction artifacts.
    // Count characters (not bytes) in both numerator and denominator — the
    // previous version divided a char count by `text.len()` (a BYTE count),
    // which unfairly penalized non-ASCII (multi-byte UTF-8) documents.
    let total_chars = text.chars().count();
    let alphanumeric_chars = text.chars().filter(|c| c.is_alphanumeric()).count();
    let alphanumeric_ratio = if total_chars > 0 {
        (alphanumeric_chars as f64) / (total_chars as f64)
    } else {
        0.0
    };
    // If less than 30% alphanumeric content, likely poor extraction
    if alphanumeric_ratio < 0.3 {
        debug!("PDF text has low alphanumeric content: {:.1}% ({} of {} chars)",
            alphanumeric_ratio * 100.0, alphanumeric_chars, total_chars);
        return false;
    }
    debug!("PDF text extraction quality sufficient: {} words, {:.2} words/KB, {:.1}% alphanumeric",
        word_count, word_density, alphanumeric_ratio * 100.0);
    true
}
/// Extract text from PDF using OCR (ocrmypdf) for image-based or poor-quality PDFs
///
/// Runs `ocrmypdf` (bounded by a 5-minute timeout) with three progressively
/// simpler option sets, then re-invokes `ocrmypdf --sidecar` on the OCR'd
/// PDF to pull out the recognized text. Temporary files are removed on the
/// success path.
///
/// # Errors
/// Fails when ocrmypdf is not installed, when all three strategies fail,
/// or when the OCR run exceeds the timeout.
#[cfg(feature = "ocr")]
async fn extract_text_from_pdf_with_ocr(&self, file_path: &str, settings: &Settings, start_time: std::time::Instant) -> Result<OcrResult> {
    info!("Starting OCR extraction for PDF: {}", file_path);
    // Check if ocrmypdf is available; fail early with install instructions.
    if !self.is_ocrmypdf_available().await {
        return Err(anyhow!(
            "ocrmypdf is not available on this system. To extract text from image-based PDFs like '{}', please install ocrmypdf. \
            On Ubuntu/Debian: 'apt-get install ocrmypdf'. \
            On macOS: 'brew install ocrmypdf'. \
            Alternatively, convert the PDF to images and upload those instead.",
            file_path
        ));
    }
    // Generate temporary file path for OCR'd PDF; pid + epoch-millis keeps
    // concurrent workers from colliding on the same name.
    let temp_ocr_filename = format!("ocr_{}_{}.pdf",
        std::process::id(),
        std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)?.as_millis()
    );
    let temp_ocr_path = format!("{}/{}", self.temp_dir, temp_ocr_filename);
    // Run ocrmypdf with progressive fallback strategies on the blocking
    // thread pool, under one overall timeout.
    // NOTE(review): the OCR language is hard-coded to "eng" in all three
    // strategies; the language preferences carried in `settings` are not
    // consulted here — confirm whether multi-language OCR is intended.
    let ocrmypdf_result = tokio::time::timeout(
        std::time::Duration::from_secs(300), // 5 minute timeout for OCR
        tokio::task::spawn_blocking({
            let file_path = file_path.to_string();
            let temp_ocr_path = temp_ocr_path.clone();
            move || {
                // Strategy 1: Standard OCR with cleaning
                let mut result = std::process::Command::new("ocrmypdf")
                    .arg("--force-ocr") // OCR even if text is detected
                    .arg("-O2") // Optimize level 2 (balanced quality/speed)
                    .arg("--deskew") // Correct skewed pages
                    .arg("--clean") // Clean up artifacts
                    .arg("--language")
                    .arg("eng") // English language
                    .arg(&file_path)
                    .arg(&temp_ocr_path)
                    .output();
                if result.is_ok() && result.as_ref().unwrap().status.success() {
                    return result;
                }
                // Strategy 2: If standard OCR fails, try with error recovery
                eprintln!("Standard OCR failed, trying recovery mode...");
                result = std::process::Command::new("ocrmypdf")
                    .arg("--force-ocr")
                    .arg("--fix-metadata") // Fix metadata issues
                    .arg("--remove-background") // Remove background noise
                    .arg("-O1") // Lower optimization for problematic PDFs
                    .arg("--language")
                    .arg("eng")
                    .arg(&file_path)
                    .arg(&temp_ocr_path)
                    .output();
                if result.is_ok() && result.as_ref().unwrap().status.success() {
                    return result;
                }
                // Strategy 3: Last resort - minimal processing (skips very large pages)
                eprintln!("Recovery mode failed, trying minimal processing...");
                std::process::Command::new("ocrmypdf")
                    .arg("--force-ocr")
                    .arg("--skip-big") // Skip very large pages that might cause memory issues
                    .arg("--language")
                    .arg("eng")
                    .arg(&file_path)
                    .arg(&temp_ocr_path)
                    .output()
            }
        })
    ).await;
    // Unwrap the three layers: timeout -> join handle -> io::Result<Output>.
    let ocrmypdf_output = match ocrmypdf_result {
        Ok(Ok(output)) => output?,
        Ok(Err(e)) => return Err(anyhow!("Failed to join ocrmypdf task: {}", e)),
        Err(_) => return Err(anyhow!("ocrmypdf timed out after 5 minutes for file '{}'", file_path)),
    };
    if !ocrmypdf_output.status.success() {
        let stderr = String::from_utf8_lossy(&ocrmypdf_output.stderr);
        let stdout = String::from_utf8_lossy(&ocrmypdf_output.stdout);
        return Err(anyhow!(
            "ocrmypdf failed for '{}': Exit code {}\nStderr: {}\nStdout: {}",
            file_path, ocrmypdf_output.status.code().unwrap_or(-1), stderr, stdout
        ));
    }
    // Extract text from the OCR'd PDF
    let ocr_text_result = tokio::task::spawn_blocking({
        let temp_ocr_path = temp_ocr_path.clone();
        move || -> Result<String> {
            // NOTE(review): this reads the whole OCR'd PDF into memory only
            // to confirm it is readable; the bytes are discarded. A metadata
            // check would avoid the allocation.
            let _bytes = std::fs::read(&temp_ocr_path)?;
            // Catch panics from pdf-extract library (same pattern as used elsewhere)
            // Extract text from the OCR'd PDF using ocrmypdf's sidecar option
            let temp_text_path = format!("{}.txt", temp_ocr_path);
            let extract_result = std::process::Command::new("ocrmypdf")
                .arg("--sidecar") // Extract text to a sidecar file
                .arg(&temp_text_path)
                .arg(&temp_ocr_path)
                .arg("-") // Output to stdout (dummy, required by ocrmypdf)
                .output()?;
            if !extract_result.status.success() {
                let stderr = String::from_utf8_lossy(&extract_result.stderr);
                return Err(anyhow!(
                    "ocrmypdf text extraction failed: {}",
                    stderr
                ));
            }
            // Read the extracted text from the sidecar file
            let text = std::fs::read_to_string(&temp_text_path)?;
            // Clean up the text file
            let _ = std::fs::remove_file(&temp_text_path);
            Ok(text.trim().to_string())
        }
    }).await??;
    // Clean up temporary OCR'd PDF (best effort; errors ignored).
    let _ = tokio::fs::remove_file(&temp_ocr_path).await;
    let processing_time = start_time.elapsed().as_millis() as u64;
    let word_count = self.count_words_safely(&ocr_text_result);
    info!("OCR extraction completed for '{}': {} words in {}ms",
        file_path, word_count, processing_time);
    Ok(OcrResult {
        text: ocr_text_result,
        confidence: 85.0, // OCR is generally lower confidence than direct text extraction
        processing_time_ms: processing_time,
        word_count,
        preprocessing_applied: vec!["OCR via ocrmypdf".to_string()],
        processed_image_path: None,
    })
}
/// Progressive PDF text extraction with fallback strategies
///
/// Tries, in order: (1) `pdftotext` for PDFs with an embedded text layer,
/// (2) direct byte-level scanning of the PDF, (3) `ocrmypdf --sidecar`,
/// (4) byte-level scanning again as a last resort. Returns the extracted
/// text plus elapsed milliseconds, or an error when every strategy fails.
///
/// NOTE(review): `temp_text_path` is only deleted on the success paths;
/// failure paths can leave the file behind — the `FileCleanupGuard` defined
/// at the top of this file is not used here.
#[cfg(feature = "ocr")]
async fn extract_pdf_text_quick(&self, file_path: &str) -> Result<(String, u64)> {
    let start_time = std::time::Instant::now();
    // Generate temporary file path for text extraction; pid + epoch-millis
    // keeps concurrent workers from colliding.
    let temp_text_filename = format!("quick_text_{}_{}.txt",
        std::process::id(),
        std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)?.as_millis()
    );
    let temp_text_path = format!("{}/{}", self.temp_dir, temp_text_filename);
    // Strategy 1: Fast text extraction using pdftotext (for existing text)
    debug!("Trying pdftotext for existing text extraction: {}", file_path);
    debug!("Using temp file path: {}", temp_text_path);
    let pdftotext_result = tokio::process::Command::new("pdftotext")
        .arg("-layout") // Preserve layout
        .arg(file_path)
        .arg(&temp_text_path)
        .output()
        .await;
    if let Ok(output) = pdftotext_result {
        debug!("pdftotext exit status: {}", output.status);
        if !output.stderr.is_empty() {
            debug!("pdftotext stderr: {}", String::from_utf8_lossy(&output.stderr));
        }
        if output.status.success() {
            if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
                let _ = tokio::fs::remove_file(&temp_text_path).await;
                let word_count = text.split_whitespace().count();
                debug!("pdftotext extracted {} words from temp file", word_count);
                // If we got substantial text (more than a few words), use it
                if word_count > 5 {
                    let processing_time = start_time.elapsed().as_millis() as u64;
                    info!("pdftotext extracted {} words from: {}", word_count, file_path);
                    return Ok((text.trim().to_string(), processing_time));
                } else {
                    debug!("pdftotext only extracted {} words, will try direct extraction before OCR", word_count);
                }
            } else {
                debug!("Failed to read pdftotext output file: {}", temp_text_path);
            }
        } else {
            let stderr = String::from_utf8_lossy(&output.stderr);
            debug!("pdftotext failed with status {}: {}", output.status, stderr);
        }
    } else {
        debug!("Failed to execute pdftotext command");
    }
    info!("pdftotext extraction insufficient for '{}', trying direct extraction before OCR", file_path);
    // Strategy 2: Try direct text extraction (often works when pdftotext fails)
    match self.extract_text_from_pdf_bytes(file_path).await {
        Ok(text) if !text.trim().is_empty() => {
            let word_count = text.split_whitespace().count();
            if word_count > 5 {
                let processing_time = start_time.elapsed().as_millis() as u64;
                info!("Direct text extraction succeeded for '{}': {} words", file_path, word_count);
                return Ok((text, processing_time));
            } else {
                debug!("Direct extraction only got {} words, trying OCR", word_count);
            }
        }
        Ok(_) => {
            debug!("Direct text extraction returned empty text");
        }
        Err(e) => {
            debug!("Direct text extraction failed: {}", e);
        }
    }
    info!("Direct extraction insufficient for '{}', using OCR extraction", file_path);
    // Strategy 3: Use ocrmypdf --sidecar to extract existing OCR text
    // NOTE(review): this actually runs ocrmypdf (which performs OCR), so
    // despite this function's "quick" name the step can be slow for large
    // scans — confirm whether a time bound is needed here.
    let ocrmypdf_result = tokio::process::Command::new("ocrmypdf")
        .arg("--sidecar")
        .arg(&temp_text_path)
        .arg(file_path)
        .arg("-") // Dummy output (we only want sidecar)
        .output()
        .await;
    if let Ok(output) = &ocrmypdf_result {
        if output.status.success() {
            if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
                let _ = tokio::fs::remove_file(&temp_text_path).await;
                let word_count = text.split_whitespace().count();
                if word_count > 0 {
                    let processing_time = start_time.elapsed().as_millis() as u64;
                    info!("ocrmypdf --sidecar extracted {} words from: {}", word_count, file_path);
                    return Ok((text.trim().to_string(), processing_time));
                }
            }
        } else {
            let stderr = String::from_utf8_lossy(&output.stderr);
            debug!("ocrmypdf --sidecar failed: {}", stderr);
            // Check if the error indicates the page already has text
            if stderr.contains("page already has text") {
                // This is good - it means there's already text, we should use pdftotext
                warn!("ocrmypdf detected existing text in PDF, this should have been caught by pdftotext");
            }
        }
    }
    // Strategy 4: Last resort - direct byte-level text extraction
    // (re-runs the same scan as Strategy 2; cheap relative to OCR)
    warn!("Standard extraction methods failed, trying direct text extraction from: {}", file_path);
    match self.extract_text_from_pdf_bytes(file_path).await {
        Ok(text) if !text.trim().is_empty() => {
            let processing_time = start_time.elapsed().as_millis() as u64;
            let word_count = text.split_whitespace().count();
            info!("Direct text extraction succeeded for '{}': {} words", file_path, word_count);
            Ok((text, processing_time))
        }
        Ok(_) => {
            warn!("Direct text extraction returned empty text for: {}", file_path);
            // If all strategies fail, return the last error
            if let Ok(ref output) = ocrmypdf_result {
                let stderr = String::from_utf8_lossy(&output.stderr);
                Err(anyhow!("All PDF extraction strategies failed. Last error: {}", stderr))
            } else {
                Err(anyhow!("All PDF extraction strategies failed"))
            }
        }
        Err(e) => {
            warn!("Direct text extraction also failed for {}: {}", file_path, e);
            // If all strategies fail, return the last error
            if let Ok(ref output) = ocrmypdf_result {
                let stderr = String::from_utf8_lossy(&output.stderr);
                Err(anyhow!("All PDF extraction strategies failed. Last error: {}", stderr))
            } else {
                Err(anyhow!("All PDF extraction strategies failed: {}", e))
            }
        }
    }
}
/// Last resort: extract readable text directly from PDF bytes
///
/// Two passes over the raw bytes: first a scan for `BT`...`ET` text objects
/// that collects parenthesized string literals, then a sweep that harvests
/// any run of more than three printable ASCII characters. This can find
/// text that's embedded in the PDF even if the structure is corrupted.
///
/// # Errors
/// Returns an error when neither pass finds any readable text.
#[cfg(feature = "ocr")]
async fn extract_text_from_pdf_bytes(&self, file_path: &str) -> Result<String> {
    let bytes = tokio::fs::read(file_path).await?;
    // Pass 1: walk the byte stream tracking BT...ET text objects and
    // parenthesized string literals.
    // BUGFIX: the previous implementation ran `bytes.windows(2).find(..)`
    // over the WHOLE file whenever it saw a 'B' or 'E' byte, so text-object
    // state toggled on every such byte as long as "BT"/"ET" appeared
    // anywhere in the file. The marker is now checked at the cursor itself.
    let mut extracted_text = String::new();
    let mut current_text = String::new();
    let mut in_text_object = false;
    let mut in_string = false;
    let mut escape_next = false;
    let mut i = 0;
    while i < bytes.len() {
        // Only recognize BT/ET markers outside of string literals.
        if !in_string {
            if !in_text_object && bytes[i..].starts_with(b"BT") {
                in_text_object = true;
                i += 2;
                continue;
            }
            if in_text_object && bytes[i..].starts_with(b"ET") {
                in_text_object = false;
                if !current_text.trim().is_empty() {
                    extracted_text.push_str(&current_text);
                    extracted_text.push(' ');
                }
                current_text.clear();
                i += 2;
                continue;
            }
        }
        if in_text_object {
            let ch = bytes[i] as char;
            if in_string {
                if escape_next {
                    // Keep the escaped character verbatim (e.g. "\)" -> ')').
                    escape_next = false;
                    current_text.push(ch);
                } else if ch == '\\' {
                    escape_next = true;
                } else if ch == ')' {
                    // End of string literal; separate it from the next one.
                    in_string = false;
                    current_text.push(' ');
                } else {
                    current_text.push(ch);
                }
            } else if ch == '(' {
                in_string = true;
                escape_next = false;
            }
        }
        i += 1;
    }
    // Pass 2: harvest any long runs of printable ASCII anywhere in the file.
    let mut ascii_text = String::new();
    let mut current_word = String::new();
    for &byte in &bytes {
        if (32..=126).contains(&byte) { // Printable ASCII
            current_word.push(byte as char);
        } else {
            if current_word.len() > 3 { // Only keep words longer than 3 characters
                ascii_text.push_str(&current_word);
                ascii_text.push(' ');
            }
            current_word.clear();
        }
    }
    // Add the last word if it's long enough
    if current_word.len() > 3 {
        ascii_text.push_str(&current_word);
    }
    // Combine both extraction methods.
    // BUGFIX: the separator was previously the two-character literal
    // '\' + 'n' ("\\n"), which glued onto the first ASCII word; use a
    // real newline.
    let mut final_text = extracted_text;
    if !ascii_text.trim().is_empty() {
        final_text.push('\n');
        final_text.push_str(&ascii_text);
    }
    // Drop single-character tokens, which are usually extraction noise.
    let cleaned_text = final_text
        .split_whitespace()
        .filter(|word| word.len() > 1)
        .collect::<Vec<_>>()
        .join(" ");
    if cleaned_text.trim().is_empty() {
        Err(anyhow!("No readable text found in PDF"))
    } else {
        Ok(cleaned_text)
    }
}
/// Check if ocrmypdf is available on the system by probing `ocrmypdf --version`.
///
/// Any spawn failure or non-zero exit is treated as "not available".
#[cfg(feature = "ocr")]
async fn is_ocrmypdf_available(&self) -> bool {
    tokio::process::Command::new("ocrmypdf")
        .arg("--version")
        .output()
        .await
        .map(|output| output.status.success())
        .unwrap_or(false)
}
/// Non-OCR build: there is no OCR fallback to improve on extraction, so
/// every text extraction result is accepted as-is.
#[cfg(not(feature = "ocr"))]
fn is_text_extraction_quality_sufficient(&self, _text: &str, _word_count: usize, _file_size: u64) -> bool {
    // When OCR is disabled, always accept text extraction results
    true
}
/// Non-OCR build: ocrmypdf is never probed; always reported as unavailable.
#[cfg(not(feature = "ocr"))]
async fn is_ocrmypdf_available(&self) -> bool {
    false // OCR feature not enabled
}
/// Non-OCR build: image-based PDFs cannot be processed; fails fast with an
/// explanatory error naming the file.
#[cfg(not(feature = "ocr"))]
async fn extract_text_from_pdf_with_ocr(&self, file_path: &str, _settings: &Settings, _start_time: std::time::Instant) -> Result<OcrResult> {
    Err(anyhow::anyhow!("OCR feature not enabled - cannot process image-based PDF: {}", file_path))
}
/// Resolve file path to actual location, handling both old and new directory structures
///
/// Pure delegation to `FileService::resolve_file_path`; kept as a method so
/// callers inside this service do not need the file service directly.
async fn resolve_file_path(&self, file_path: &str) -> Result<String> {
    // Use the FileService's resolve_file_path method
    self.file_service.resolve_file_path(file_path).await
}
/// Extract text from any supported file type, logging file context up front.
///
/// Thin wrapper around [`Self::extract_text`] that first emits one info-level
/// line with filename, MIME type, size, and path for operational visibility.
pub async fn extract_text_with_context(&self, file_path: &str, mime_type: &str, filename: &str, file_size: i64, settings: &Settings) -> Result<OcrResult> {
    const BYTES_PER_MB: f64 = 1024.0 * 1024.0;
    // Report size in MB so unusually large inputs are easy to spot in logs.
    let file_size_mb = file_size as f64 / BYTES_PER_MB;
    info!(
        "Starting OCR extraction | File: '{}' | Type: {} | Size: {:.2} MB | Path: {}",
        filename, mime_type, file_size_mb, file_path
    );
    self.extract_text(file_path, mime_type, settings).await
}
/// Extract text from Office documents (DOCX, DOC, Excel, PowerPoint) using
/// XML extraction.
///
/// Rejects files larger than `MAX_OFFICE_DOCUMENT_SIZE` before any parsing
/// work, then delegates to [`XmlOfficeExtractor`] and adapts its result into
/// an [`OcrResult`] for backward compatibility with OCR-based callers.
pub async fn extract_text_from_office(&self, file_path: &str, mime_type: &str, settings: &Settings) -> Result<OcrResult> {
    let started = std::time::Instant::now();
    info!("Extracting text from Office document: {} (type: {})", file_path, mime_type);

    // Enforce the size cap up front so oversized uploads fail fast.
    let file_size = tokio::fs::metadata(file_path).await?.len();
    if file_size > Self::MAX_OFFICE_DOCUMENT_SIZE {
        let to_mb = |bytes: u64| bytes as f64 / (1024.0 * 1024.0);
        return Err(anyhow!(
            "Office document too large: {:.1} MB (max: {:.1} MB). Consider converting to PDF or splitting the document.",
            to_mb(file_size),
            to_mb(Self::MAX_OFFICE_DOCUMENT_SIZE)
        ));
    }

    // XML extraction is the primary (and only) strategy for Office formats.
    let extraction = XmlOfficeExtractor::new(self.temp_dir.clone())
        .extract_text_from_office(file_path, mime_type)
        .await?;

    info!(
        "Office document extraction completed: {} words in {}ms using XML extraction",
        extraction.word_count,
        started.elapsed().as_millis() as u64
    );

    // Adapt OfficeExtractionResult into OcrResult for backward compatibility.
    Ok(OcrResult {
        text: extraction.text,
        confidence: extraction.confidence,
        processing_time_ms: extraction.processing_time_ms,
        word_count: extraction.word_count,
        preprocessing_applied: vec![format!("XML extraction - {}", extraction.extraction_method)],
        processed_image_path: None,
    })
}
/// Extract text from any supported file type
///
/// Dispatches on MIME type: PDFs and images go through OCR (when the `ocr`
/// feature is enabled), plain text is read directly with size caps, and
/// Office formats are routed to XML extraction. The input path is first
/// resolved to its actual on-disk location.
///
/// # Errors
/// Fails for unsupported MIME types, oversized text files, missing files,
/// or when the `ocr` feature is required but not compiled in.
pub async fn extract_text(&self, file_path: &str, mime_type: &str, settings: &Settings) -> Result<OcrResult> {
    // Resolve the actual file path (handles old and new directory layouts)
    let resolved_path = self.resolve_file_path(file_path).await?;
    match mime_type {
        "application/pdf" => {
            #[cfg(feature = "ocr")]
            {
                self.extract_text_from_pdf(&resolved_path, settings).await
            }
            #[cfg(not(feature = "ocr"))]
            {
                Err(anyhow::anyhow!("OCR feature not enabled"))
            }
        }
        mime if mime.starts_with("image/") => {
            #[cfg(feature = "ocr")]
            {
                self.extract_text_from_image(&resolved_path, settings).await
            }
            #[cfg(not(feature = "ocr"))]
            {
                Err(anyhow::anyhow!("OCR feature not enabled"))
            }
        }
        "text/plain" => {
            let start_time = std::time::Instant::now();
            // Check file size before loading into memory
            let metadata = tokio::fs::metadata(&resolved_path).await?;
            let file_size = metadata.len();
            // Limit text file size to 50MB to prevent memory exhaustion
            const MAX_TEXT_FILE_SIZE: u64 = 50 * 1024 * 1024; // 50MB
            if file_size > MAX_TEXT_FILE_SIZE {
                return Err(anyhow!(
                    "Text file too large: {:.1} MB (max: {:.1} MB). Consider splitting the file.",
                    file_size as f64 / (1024.0 * 1024.0),
                    MAX_TEXT_FILE_SIZE as f64 / (1024.0 * 1024.0)
                ));
            }
            let text = tokio::fs::read_to_string(&resolved_path).await?;
            // Only remove null bytes - preserve all original formatting
            let cleaned_text = Self::remove_null_bytes(&text);
            // Limit text content size in memory
            const MAX_TEXT_CONTENT_SIZE: usize = 10 * 1024 * 1024; // 10MB of text content
            let trimmed_text = if cleaned_text.len() > MAX_TEXT_CONTENT_SIZE {
                // BUGFIX: slicing a &str at a fixed byte offset panics when
                // the offset lands inside a multi-byte UTF-8 character; back
                // off to the nearest char boundary before truncating.
                let mut cut = MAX_TEXT_CONTENT_SIZE;
                while cut > 0 && !cleaned_text.is_char_boundary(cut) {
                    cut -= 1;
                }
                warn!("Text file content too large ({} chars), truncating to {} chars", cleaned_text.len(), MAX_TEXT_CONTENT_SIZE);
                format!("{}... [TEXT TRUNCATED DUE TO SIZE]", &cleaned_text[..cut])
            } else {
                cleaned_text.trim().to_string()
            };
            let processing_time = start_time.elapsed().as_millis() as u64;
            let word_count = self.count_words_safely(&trimmed_text);
            Ok(OcrResult {
                text: trimmed_text,
                confidence: 100.0, // Plain text is 100% confident
                processing_time_ms: processing_time,
                word_count,
                preprocessing_applied: vec!["Plain text read".to_string()],
                processed_image_path: None, // No image processing for plain text
            })
        }
        // Handle Office document formats
        mime if matches!(mime,
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document" |
            "application/msword" |
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" |
            "application/vnd.openxmlformats-officedocument.presentationml.presentation"
        ) => {
            // extract_text_from_office now returns OcrResult directly
            self.extract_text_from_office(&resolved_path, mime, settings).await
        }
        _ => Err(anyhow::anyhow!("Unsupported file type: {}", mime_type)),
    }
}
/// Safely count words, sampling very large texts to bound the work.
///
/// Texts over 1 MB are estimated by counting words in the first ~100 KB and
/// extrapolating; the estimate is capped at 10M words to prevent display
/// issues downstream.
#[cfg(feature = "ocr")]
pub fn count_words_safely(&self, text: &str) -> usize {
    // For very large texts, sample to estimate word count to prevent overflow
    if text.len() > 1_000_000 { // > 1MB of text
        let sample_size = 100_000;
        // BUGFIX: slicing a &str at an arbitrary byte offset panics if the
        // offset lands inside a multi-byte UTF-8 character; back off to the
        // nearest char boundary first.
        let mut sample_end = sample_size.min(text.len());
        while sample_end > 0 && !text.is_char_boundary(sample_end) {
            sample_end -= 1;
        }
        let sample_text = &text[..sample_end];
        let sample_words = self.count_words_in_text(sample_text);
        // Extrapolate from the bytes actually sampled (sample_end can be a
        // few bytes short of sample_size after the boundary adjustment).
        let estimated_total = if sample_end == 0 {
            0
        } else {
            (sample_words as f64 * (text.len() as f64 / sample_end as f64)) as usize
        };
        // Cap at reasonable maximum to prevent display issues
        estimated_total.min(10_000_000) // Max 10M words
    } else {
        self.count_words_in_text(text)
    }
}
/// Heuristic word counter that also handles continuous (unspaced) text.
///
/// Normal text is counted by whitespace splitting. When the input looks like
/// one unbroken run of characters — a single long token, or non-empty text
/// with no whitespace-delimited words at all — word boundaries are estimated
/// from camelCase and letter<->digit transitions, falling back to an average
/// word length of five alphanumeric characters.
#[cfg(feature = "ocr")]
fn count_words_in_text(&self, text: &str) -> usize {
    let whitespace_words = text.split_whitespace().count();

    // A single very long token suggests continuous text without spaces.
    let looks_continuous = whitespace_words == 1 && text.len() > 15;
    // Non-empty text with zero whitespace words also needs the estimate.
    let no_words_but_content = whitespace_words == 0 && !text.trim().is_empty();
    if !(looks_continuous || no_words_but_content) {
        return whitespace_words;
    }

    // Pure punctuation/symbols count as zero words.
    let alphanumeric_total = text.chars().filter(|c| c.is_alphanumeric()).count();
    if alphanumeric_total == 0 {
        return 0;
    }

    // Strategy 1: count word-boundary transitions (camelCase plus
    // letter<->digit switches) over adjacent character pairs. The three
    // conditions are mutually exclusive, so a single OR matches the
    // original else-if chain exactly.
    let chars: Vec<char> = text.chars().collect();
    let boundary_transitions = chars
        .windows(2)
        .filter(|pair| {
            let (prev, curr) = (pair[0], pair[1]);
            (prev.is_lowercase() && curr.is_uppercase())
                || (prev.is_alphabetic() && curr.is_numeric())
                || (prev.is_numeric() && curr.is_alphabetic())
        })
        .count();

    if boundary_transitions > 0 {
        // Each transition separates two words; +1 for the first word.
        boundary_transitions + 1
    } else {
        // Strategy 2: assume roughly five alphanumeric characters per word.
        (alphanumeric_total / 5).max(1)
    }
}
/// Validate OCR result quality against hard and soft thresholds.
///
/// Returns `Err` with a human-readable reason when the result looks like
/// garbage: critically low confidence, empty text, zero words, or almost no
/// letters/digits. Confidence below the user-configured threshold is only
/// logged, not rejected. Check order is significant: empty text is reported
/// before zero words so callers get the more specific message.
#[cfg(feature = "ocr")]
pub fn validate_ocr_quality(&self, result: &OcrResult, settings: &Settings) -> Result<(), String> {
    // Anything below this is treated as corrupted input, not merely "bad".
    const HARD_MINIMUM_CONFIDENCE: f32 = 5.0;
    // Reject only when >90% of characters are symbols/formatting, so bills
    // and receipts full of numbers and separators still pass.
    const MIN_CONTENT_RATIO: f32 = 0.10;

    if result.confidence < HARD_MINIMUM_CONFIDENCE {
        return Err(format!(
            "OCR confidence critically low: {:.1}% (absolute minimum: {:.1}%) - likely corrupted input",
            result.confidence,
            HARD_MINIMUM_CONFIDENCE
        ));
    }

    // Below the configured threshold: warn and flag, but still accept.
    if result.confidence < settings.ocr_min_confidence {
        warn!(
            "OCR confidence below recommended threshold: {:.1}% (recommended: {:.1}%) - accepting but flagging for review",
            result.confidence,
            settings.ocr_min_confidence
        );
    }

    let total_chars = result.text.len();
    if total_chars == 0 {
        return Err("OCR result contains no characters".to_string());
    }
    if result.word_count == 0 {
        return Err("No words detected in OCR output".to_string());
    }

    // Letters and digits both count as meaningful content.
    let content_chars = result.text.chars().filter(|c| c.is_alphanumeric()).count();
    let content_ratio = content_chars as f32 / total_chars as f32;
    if content_ratio < MIN_CONTENT_RATIO {
        let symbol_ratio = 1.0 - content_ratio;
        return Err(format!(
            "OCR result has too little meaningful content: {:.1}% content (letters+digits), {:.1}% symbols/formatting (minimum content: {:.1}%)",
            content_ratio * 100.0,
            symbol_ratio * 100.0,
            MIN_CONTENT_RATIO * 100.0
        ));
    }

    debug!(
        "OCR validation passed: {:.1}% confidence, {} words, {:.1}% content (letters+digits)",
        result.confidence,
        result.word_count,
        content_ratio * 100.0
    );
    Ok(())
}
}
/// Stub implementations compiled in when the `ocr` feature is disabled.
///
/// OCR-dependent entry points fail fast with an explanatory error so callers
/// can surface a meaningful message instead of panicking.
#[cfg(not(feature = "ocr"))]
impl EnhancedOcrService {
    // Images require Tesseract, which is gated behind the `ocr` feature.
    pub async fn extract_text_from_image(&self, _file_path: &str, _settings: &Settings) -> Result<OcrResult> {
        Err(anyhow::anyhow!("OCR feature not enabled"))
    }
    // PDF processing depends on the ocrmypdf/pdftotext pipeline, also gated.
    pub async fn extract_text_from_pdf(&self, _file_path: &str, _settings: &Settings) -> Result<OcrResult> {
        Err(anyhow::anyhow!("OCR feature not enabled"))
    }
    // Always rejects: without OCR there is no result quality to validate.
    pub fn validate_ocr_quality(&self, _result: &OcrResult, _settings: &Settings) -> Result<(), String> {
        Err("OCR feature not enabled".to_string())
    }
    // Plain whitespace-based count; no large-text sampling in non-OCR builds.
    pub fn count_words_safely(&self, text: &str) -> usize {
        // Simple word count for non-OCR builds
        text.split_whitespace().count()
    }
}
/// Check if the given bytes represent a valid PDF file
///
/// Accepts PDFs whose `%PDF-` magic appears anywhere within the first 1 KB,
/// which tolerates leading null bytes or other junk before the header (the
/// whole magic must fit inside that window).
fn is_valid_pdf(data: &[u8]) -> bool {
    // Only scan the first 1KB; a real header appears near the start.
    let search_limit = data.len().min(1024);
    // windows(5) yields nothing when the slice is shorter than 5 bytes, so
    // undersized inputs are rejected without a separate length check.
    data[..search_limit].windows(5).any(|window| window == b"%PDF-")
}
/// Static (no `&self`) helpers so they can be moved into
/// `tokio::task::spawn_blocking` closures without borrowing the service.
impl EnhancedOcrService {
    /// Static version of configure_tesseract for use in spawn_blocking
    ///
    /// Builds a Tesseract instance for `image_path` using the language
    /// combination derived from `settings` and the configured page
    /// segmentation mode.
    #[cfg(feature = "ocr")]
    fn configure_tesseract_static(image_path: &str, settings: &Settings) -> Result<Tesseract> {
        let language_combination = Self::build_language_combination_static(settings);
        let mut tesseract = Tesseract::new(None, Some(&language_combination))?;
        // Set the image
        tesseract = tesseract.set_image(image_path)?;
        // Configure Page Segmentation Mode (PSM); the numeric values mirror
        // Tesseract's own PSM numbering (0-13), defaulting to fully
        // automatic segmentation for anything out of range.
        let psm = match settings.ocr_page_segmentation_mode {
            0 => PageSegMode::PsmOsdOnly,
            1 => PageSegMode::PsmAutoOsd,
            2 => PageSegMode::PsmAutoOnly,
            3 => PageSegMode::PsmAuto,
            4 => PageSegMode::PsmSingleColumn,
            5 => PageSegMode::PsmSingleBlockVertText,
            6 => PageSegMode::PsmSingleBlock,
            7 => PageSegMode::PsmSingleLine,
            8 => PageSegMode::PsmSingleWord,
            9 => PageSegMode::PsmCircleWord,
            10 => PageSegMode::PsmSingleChar,
            11 => PageSegMode::PsmSparseText,
            12 => PageSegMode::PsmSparseTextOsd,
            13 => PageSegMode::PsmRawLine,
            _ => PageSegMode::PsmAuto, // Default fallback
        };
        tesseract.set_page_seg_mode(psm);
        // Configure OCR Engine Mode (OEM)
        // NOTE(review): `_oem` is computed but never applied to the Tesseract
        // instance, so the engine always runs with its default mode regardless
        // of `settings.ocr_engine_mode` — confirm whether this is intentional
        // or whether the mode should be passed at construction time.
        let _oem = match settings.ocr_engine_mode {
            0 => OcrEngineMode::TesseractOnly,
            1 => OcrEngineMode::LstmOnly,
            2 => OcrEngineMode::TesseractOnly, // Fallback since TesseractLstm doesn't exist
            3 => OcrEngineMode::Default,
            _ => OcrEngineMode::Default, // Default fallback
        };
        Ok(tesseract)
    }
    /// Static version of calculate_overall_confidence for use in spawn_blocking
    ///
    /// Wraps Tesseract's built-in mean confidence, clamped to the 0-100 range.
    #[cfg(feature = "ocr")]
    fn calculate_overall_confidence_static(tesseract: &mut Tesseract) -> Result<f32> {
        // Use Tesseract's built-in mean confidence calculation
        let confidence = tesseract.mean_text_conf();
        // Convert from i32 to f32 and ensure it's within valid range
        let confidence_f32 = confidence as f32;
        // Clamp confidence to valid range (0.0 to 100.0)
        let clamped_confidence = confidence_f32.max(0.0).min(100.0);
        debug!("Tesseract confidence: {} -> {:.1}%", confidence, clamped_confidence);
        Ok(clamped_confidence)
    }
    /// Static version of build_language_combination for use in spawn_blocking
    ///
    /// Produces Tesseract's "+"-joined multi-language string (e.g. "eng+deu")
    /// with the primary language forced to the front; falls back to the
    /// legacy `ocr_language` field when no preferred languages are set.
    fn build_language_combination_static(settings: &Settings) -> String {
        if settings.preferred_languages.len() > 1 {
            // Use preferred_languages with primary_language first
            let mut languages = settings.preferred_languages.clone();
            // Ensure primary language is first
            languages.retain(|lang| lang != &settings.primary_language);
            languages.insert(0, settings.primary_language.clone());
            // Join with + for Tesseract multi-language format
            languages.join("+")
        } else if !settings.preferred_languages.is_empty() {
            // Single language from preferred_languages
            settings.preferred_languages[0].clone()
        } else {
            // Fallback to ocr_language field for backward compatibility
            settings.ocr_language.clone()
        }
    }
}