// (extraction artifact: original file metadata — 1846 lines, 78 KiB, Rust)
use anyhow::{anyhow, Result};
|
|
use tracing::{debug, info, warn};
|
|
use std::panic::{catch_unwind, AssertUnwindSafe};
|
|
|
|
#[cfg(feature = "ocr")]
|
|
use image::{DynamicImage, ImageBuffer, Luma, GenericImageView};
|
|
#[cfg(feature = "ocr")]
|
|
use imageproc::{
|
|
contrast::adaptive_threshold,
|
|
morphology::{close, open},
|
|
filter::{median_filter, gaussian_blur_f32},
|
|
distance_transform::Norm,
|
|
};
|
|
#[cfg(feature = "ocr")]
|
|
use tesseract::{Tesseract, PageSegMode, OcrEngineMode};
|
|
|
|
use crate::models::Settings;
|
|
use crate::services::file_service::FileService;
|
|
use super::xml_extractor::XmlOfficeExtractor;
|
|
// Removed text_sanitization import - now using minimal inline sanitization
|
|
|
|
/// RAII guard for automatic cleanup of temporary files
///
/// Holds the path of a temporary file and deletes it when the guard is
/// dropped (see the `Drop` impl), so cleanup runs on every exit path,
/// including early returns via `?` and panics during unwinding.
struct FileCleanupGuard {
    // Path of the temporary file to delete on drop.
    file_path: String,
}
|
|
|
|
impl FileCleanupGuard {
|
|
fn new(file_path: &str) -> Self {
|
|
Self {
|
|
file_path: file_path.to_string(),
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Drop for FileCleanupGuard {
|
|
fn drop(&mut self) {
|
|
if std::path::Path::new(&self.file_path).exists() {
|
|
if let Err(e) = std::fs::remove_file(&self.file_path) {
|
|
warn!("Failed to clean up temporary file '{}': {}", self.file_path, e);
|
|
} else {
|
|
debug!("Cleaned up temporary file: {}", self.file_path);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Aggregate quality metrics computed from a grayscale image, used to decide
/// which preprocessing steps an image needs before OCR.
#[derive(Debug, Clone)]
pub struct ImageQualityStats {
    /// Mean pixel luma in the 0-255 range.
    pub average_brightness: f32,
    /// Standard deviation of luma normalized by 255 (roughly 0.0-1.0).
    pub contrast_ratio: f32,
    /// Mean absolute deviation of sampled pixels from their 3x3 neighborhood
    /// average, normalized by 255 (0.0 = clean, higher = noisier).
    pub noise_level: f32,
    /// Mean gradient magnitude over interior pixels, normalized by 255
    /// (lower values indicate a blurrier image).
    pub sharpness: f32,
}
|
|
|
|
/// Outcome of a single OCR run over one image or document.
#[derive(Debug, Clone)]
pub struct OcrResult {
    /// The extracted text, trimmed of surrounding whitespace.
    pub text: String,
    /// Mean recognition confidence as a percentage (clamped to 0.0-100.0).
    pub confidence: f32,
    /// Wall-clock processing time in milliseconds.
    pub processing_time_ms: u64,
    /// Number of whitespace-separated words in `text`.
    pub word_count: usize,
    /// Human-readable names of the preprocessing steps that were applied.
    pub preprocessing_applied: Vec<String>,
    /// Path to the preprocessed temp image when preprocessing produced a new
    /// file; `None` when OCR ran directly on the original input.
    pub processed_image_path: Option<String>,
}
|
|
|
|
/// OCR service wrapping Tesseract with quality-aware image preprocessing.
pub struct EnhancedOcrService {
    /// Directory where intermediate processed images are written.
    pub temp_dir: String,
    /// File service for resolving/managing document files.
    // NOTE(review): `file_service` is not referenced in this chunk — presumably
    // used by methods elsewhere in the file; verify before changing.
    pub file_service: FileService,
}
|
|
|
|
impl EnhancedOcrService {
|
|
    // Security limits for Office document processing
    /// Upper bound on the size of any Office document accepted for extraction.
    const MAX_OFFICE_DOCUMENT_SIZE: u64 = 100 * 1024 * 1024; // 100MB for all Office documents
    /// Upper bound on archive entry-name length (defends against hostile archives).
    // NOTE(review): neither constant is referenced in this chunk — presumably
    // used by the Office/XML extraction code elsewhere in the file; verify.
    const MAX_ENTRY_NAME_LENGTH: usize = 255; // Maximum length of entry names
|
|
|
|
/// Remove null bytes from text to prevent PostgreSQL errors
|
|
/// This is the ONLY sanitization we do - preserving all other original content
|
|
fn remove_null_bytes(text: &str) -> String {
|
|
let original_len = text.len();
|
|
let cleaned: String = text.chars().filter(|&c| c != '\0').collect();
|
|
|
|
// Log if we found and removed null bytes (shouldn't happen with valid documents)
|
|
let cleaned_len = cleaned.len();
|
|
if cleaned_len < original_len {
|
|
let null_bytes_removed = text.chars().filter(|&c| c == '\0').count();
|
|
warn!(
|
|
"Removed {} null bytes from extracted text (original: {} chars, cleaned: {} chars). \
|
|
This indicates corrupted or malformed document data.",
|
|
null_bytes_removed, original_len, cleaned_len
|
|
);
|
|
}
|
|
|
|
cleaned
|
|
}
|
|
|
|
|
|
pub fn new(temp_dir: String, file_service: FileService) -> Self {
|
|
Self { temp_dir, file_service }
|
|
}
|
|
|
|
|
|
/// Extract text from image with high-quality OCR settings
|
|
#[cfg(feature = "ocr")]
|
|
pub async fn extract_text_from_image(&self, file_path: &str, settings: &Settings) -> Result<OcrResult> {
|
|
let start_time = std::time::Instant::now();
|
|
info!("Starting enhanced OCR for image: {}", file_path);
|
|
|
|
let mut preprocessing_applied = Vec::new();
|
|
|
|
// Load and preprocess the image
|
|
let (processed_image_path, preprocess_steps) = if settings.enable_image_preprocessing {
|
|
let (processed_path, steps) = self.preprocess_image(file_path, settings).await?;
|
|
(processed_path, steps)
|
|
} else {
|
|
(file_path.to_string(), Vec::new())
|
|
};
|
|
|
|
preprocessing_applied.extend(preprocess_steps);
|
|
|
|
// Move CPU-intensive OCR operations to blocking thread pool
|
|
let processed_image_path_clone = processed_image_path.clone();
|
|
let settings_clone = settings.clone();
|
|
let temp_dir = self.temp_dir.clone();
|
|
|
|
let ocr_result = tokio::task::spawn_blocking(move || -> Result<(String, f32)> {
|
|
// Configure Tesseract with optimal settings
|
|
let mut tesseract = Self::configure_tesseract_static(&processed_image_path_clone, &settings_clone)?;
|
|
|
|
// Extract text with confidence
|
|
let text = tesseract.get_text()?.trim().to_string();
|
|
let confidence = Self::calculate_overall_confidence_static(&mut tesseract)?;
|
|
|
|
Ok((text, confidence))
|
|
}).await??;
|
|
|
|
let (text, confidence) = ocr_result;
|
|
|
|
let processing_time = start_time.elapsed().as_millis() as u64;
|
|
let word_count = text.split_whitespace().count();
|
|
|
|
debug!(
|
|
"OCR completed: {} words, {:.1}% confidence, {}ms",
|
|
word_count, confidence, processing_time
|
|
);
|
|
|
|
// Return the processed image path if different from original (caller will handle cleanup/saving)
|
|
let result_processed_image_path = if processed_image_path != file_path {
|
|
Some(processed_image_path.clone())
|
|
} else {
|
|
None
|
|
};
|
|
|
|
let result = OcrResult {
|
|
text,
|
|
confidence,
|
|
processing_time_ms: processing_time,
|
|
word_count,
|
|
preprocessing_applied,
|
|
processed_image_path: result_processed_image_path,
|
|
};
|
|
|
|
// Clean up temporary files if not saved for review
|
|
if let Some(ref temp_path) = result.processed_image_path {
|
|
if !settings.save_processed_images {
|
|
let _ = tokio::fs::remove_file(temp_path).await;
|
|
}
|
|
}
|
|
|
|
Ok(result)
|
|
}
|
|
|
|
    /// Preprocess image for optimal OCR quality, especially for challenging conditions
    ///
    /// Pipeline: resolve path -> optional orientation fix -> resize for OCR ->
    /// grayscale -> quality analysis -> selective enhancement (brightness,
    /// denoise, contrast, sharpen, morphology) -> save to a temp PNG.
    ///
    /// Returns the path of the saved processed image together with the list
    /// of human-readable enhancement steps that were applied.
    #[cfg(feature = "ocr")]
    async fn preprocess_image(&self, input_path: &str, settings: &Settings) -> Result<(String, Vec<String>)> {
        // Resolve the file path first
        let resolved_path = self.resolve_file_path(input_path).await?;
        let img = image::open(&resolved_path)?;
        let mut processed_img = img;
        let mut preprocessing_applied = Vec::new();

        info!("Original image dimensions: {}x{}", processed_img.width(), processed_img.height());

        // Apply orientation detection and correction
        if settings.ocr_detect_orientation {
            processed_img = self.detect_and_correct_orientation(processed_img)?;
        }

        // Aggressively upscale low-resolution images for better OCR
        // (also downscales oversized images — see smart_resize_for_ocr)
        processed_img = self.smart_resize_for_ocr(processed_img, settings.ocr_dpi)?;

        // Convert to grayscale for better OCR
        let gray_img = processed_img.to_luma8();
        let mut processed_gray = gray_img;

        // Analyze image quality and apply appropriate enhancements
        let quality_stats = self.analyze_image_quality(&processed_gray);
        info!("Image quality analysis: brightness={:.1}, contrast={:.1}, noise_level={:.1}, sharpness={:.1}",
              quality_stats.average_brightness, quality_stats.contrast_ratio, quality_stats.noise_level, quality_stats.sharpness);

        // Determine if image needs enhancement based on quality thresholds
        let needs_enhancement = self.needs_enhancement(&quality_stats, settings);

        if !needs_enhancement {
            info!("Image quality is good, skipping enhancement steps");
        } else {
            info!("Image quality needs improvement, applying selective enhancements");

            // Apply brightness correction only for very dim images
            if quality_stats.average_brightness < 50.0 || settings.ocr_brightness_boost > 0.0 {
                processed_gray = self.enhance_brightness_and_contrast(processed_gray, &quality_stats, settings)?;
                preprocessing_applied.push("Brightness/contrast correction".to_string());
            }

            // Apply noise removal only for very noisy images
            if quality_stats.noise_level > 0.25 || (settings.ocr_remove_noise && settings.ocr_noise_reduction_level > 1) {
                processed_gray = self.adaptive_noise_removal(processed_gray, &quality_stats, settings)?;
                preprocessing_applied.push("Noise reduction".to_string());
            }

            // Apply contrast enhancement only for very low contrast images
            if quality_stats.contrast_ratio < 0.2 || (settings.ocr_enhance_contrast && settings.ocr_adaptive_threshold_window_size > 0) {
                // Keep a copy so we can fall back if the adaptive method fails.
                let original_gray = processed_gray.clone();
                match self.adaptive_contrast_enhancement(processed_gray, &quality_stats, settings) {
                    Ok(enhanced) => {
                        processed_gray = enhanced;
                        preprocessing_applied.push("Contrast enhancement".to_string());
                    }
                    Err(e) => {
                        warn!("Contrast enhancement failed, using alternative method: {}", e);
                        // Fallback to basic contrast enhancement; if that also
                        // fails, continue with the unenhanced image.
                        processed_gray = self.apply_alternative_contrast_enhancement(original_gray.clone(), &quality_stats, settings)
                            .unwrap_or_else(|_| {
                                warn!("Alternative contrast enhancement also failed, using original image");
                                original_gray
                            });
                        preprocessing_applied.push("Basic contrast enhancement".to_string());
                    }
                }
            }

            // Apply sharpening only for very blurry images
            if quality_stats.sharpness < 0.2 || settings.ocr_sharpening_strength > 0.5 {
                processed_gray = self.sharpen_image(processed_gray, settings)?;
                preprocessing_applied.push("Image sharpening".to_string());
            }

            // Apply morphological operations only if explicitly enabled and image needs it
            if settings.ocr_morphological_operations && quality_stats.noise_level > 0.15 {
                processed_gray = self.apply_morphological_operations(processed_gray)?;
                preprocessing_applied.push("Morphological operations".to_string());
            }
        }

        // Save processed image to temporary file.
        // PID + epoch-millis makes collisions between concurrent runs unlikely.
        let temp_filename = format!("processed_{}_{}.png",
            std::process::id(),
            std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)?.as_millis()
        );
        // NOTE(review): path built by string concatenation — assumes
        // `temp_dir` has no trailing slash; consider Path::join.
        let temp_path = format!("{}/{}", self.temp_dir, temp_filename);

        let dynamic_processed = DynamicImage::ImageLuma8(processed_gray);
        dynamic_processed.save(&temp_path)?;

        info!("Processed image saved to: {}", temp_path);
        Ok((temp_path, preprocessing_applied))
    }
|
|
|
|
/// Determine if image needs enhancement based on quality thresholds
|
|
#[cfg(feature = "ocr")]
|
|
fn needs_enhancement(&self, stats: &ImageQualityStats, settings: &Settings) -> bool {
|
|
// If user wants to skip enhancement entirely, respect that
|
|
if settings.ocr_skip_enhancement {
|
|
info!("OCR enhancement disabled by user setting");
|
|
return false;
|
|
}
|
|
|
|
// Use user-configurable thresholds
|
|
let brightness_threshold = settings.ocr_quality_threshold_brightness;
|
|
let contrast_threshold = settings.ocr_quality_threshold_contrast;
|
|
let noise_threshold = settings.ocr_quality_threshold_noise;
|
|
let sharpness_threshold = settings.ocr_quality_threshold_sharpness;
|
|
|
|
// Check if any metric falls below acceptable quality thresholds
|
|
let needs_brightness_fix = stats.average_brightness < brightness_threshold;
|
|
let needs_contrast_fix = stats.contrast_ratio < contrast_threshold;
|
|
let needs_noise_fix = stats.noise_level > noise_threshold;
|
|
let needs_sharpening = stats.sharpness < sharpness_threshold;
|
|
|
|
// Also check if user has explicitly enabled aggressive enhancement
|
|
let user_wants_enhancement = settings.ocr_brightness_boost > 0.0 ||
|
|
settings.ocr_contrast_multiplier > 1.0 ||
|
|
settings.ocr_noise_reduction_level > 1 ||
|
|
settings.ocr_sharpening_strength > 0.0;
|
|
|
|
let needs_enhancement = needs_brightness_fix || needs_contrast_fix || needs_noise_fix || needs_sharpening || user_wants_enhancement;
|
|
|
|
info!("Enhancement decision: brightness_ok={}, contrast_ok={}, noise_ok={}, sharpness_ok={}, user_enhancement={}, needs_enhancement={}",
|
|
!needs_brightness_fix, !needs_contrast_fix, !needs_noise_fix, !needs_sharpening, user_wants_enhancement, needs_enhancement);
|
|
|
|
needs_enhancement
|
|
}
|
|
|
|
/// Build language combination string for Tesseract (e.g., "eng+spa")
|
|
fn build_language_combination(&self, settings: &Settings) -> String {
|
|
if settings.preferred_languages.len() > 1 {
|
|
// Use preferred_languages with primary_language first
|
|
let mut languages = settings.preferred_languages.clone();
|
|
|
|
// Ensure primary language is first
|
|
languages.retain(|lang| lang != &settings.primary_language);
|
|
languages.insert(0, settings.primary_language.clone());
|
|
|
|
// Join with + for Tesseract multi-language format
|
|
languages.join("+")
|
|
} else if !settings.preferred_languages.is_empty() {
|
|
// Single language from preferred_languages
|
|
settings.preferred_languages[0].clone()
|
|
} else {
|
|
// Fallback to ocr_language field for backward compatibility
|
|
settings.ocr_language.clone()
|
|
}
|
|
}
|
|
|
|
/// Configure Tesseract with optimal settings
|
|
#[cfg(feature = "ocr")]
|
|
fn configure_tesseract(&self, image_path: &str, settings: &Settings) -> Result<Tesseract> {
|
|
let language_combination = self.build_language_combination(settings);
|
|
let mut tesseract = Tesseract::new(None, Some(&language_combination))?;
|
|
|
|
// Set the image
|
|
tesseract = tesseract.set_image(image_path)?;
|
|
|
|
// Configure Page Segmentation Mode (PSM)
|
|
let psm = match settings.ocr_page_segmentation_mode {
|
|
0 => PageSegMode::PsmOsdOnly,
|
|
1 => PageSegMode::PsmAutoOsd,
|
|
2 => PageSegMode::PsmAutoOnly,
|
|
3 => PageSegMode::PsmAuto,
|
|
4 => PageSegMode::PsmSingleColumn,
|
|
5 => PageSegMode::PsmSingleBlockVertText,
|
|
6 => PageSegMode::PsmSingleBlock,
|
|
7 => PageSegMode::PsmSingleLine,
|
|
8 => PageSegMode::PsmSingleWord,
|
|
9 => PageSegMode::PsmCircleWord,
|
|
10 => PageSegMode::PsmSingleChar,
|
|
11 => PageSegMode::PsmSparseText,
|
|
12 => PageSegMode::PsmSparseTextOsd,
|
|
13 => PageSegMode::PsmRawLine,
|
|
_ => PageSegMode::PsmAuto, // Default fallback
|
|
};
|
|
tesseract.set_page_seg_mode(psm);
|
|
|
|
// Configure OCR Engine Mode (OEM)
|
|
let _oem = match settings.ocr_engine_mode {
|
|
0 => OcrEngineMode::TesseractOnly,
|
|
1 => OcrEngineMode::LstmOnly,
|
|
2 => OcrEngineMode::TesseractLstmCombined,
|
|
3 => OcrEngineMode::Default,
|
|
_ => OcrEngineMode::Default, // Default fallback
|
|
};
|
|
|
|
// Note: set_engine_mode may not be available in the current tesseract crate version
|
|
// We'll configure this differently if needed
|
|
|
|
// Basic configuration - skip advanced settings that might cause issues
|
|
// Only set essential variables that are widely supported
|
|
|
|
Ok(tesseract)
|
|
}
|
|
|
|
/// Calculate overall confidence score using Tesseract's mean confidence
|
|
#[cfg(feature = "ocr")]
|
|
fn calculate_overall_confidence(&self, tesseract: &mut Tesseract) -> Result<f32> {
|
|
// Use Tesseract's built-in mean confidence calculation
|
|
let confidence = tesseract.mean_text_conf();
|
|
|
|
// Convert from i32 to f32 and ensure it's within valid range
|
|
let confidence_f32 = confidence as f32;
|
|
|
|
// Clamp confidence to valid range (0.0 to 100.0)
|
|
let clamped_confidence = confidence_f32.max(0.0).min(100.0);
|
|
|
|
debug!("Tesseract confidence: {} -> {:.1}%", confidence, clamped_confidence);
|
|
|
|
Ok(clamped_confidence)
|
|
}
|
|
|
|
/// Detect and correct image orientation
|
|
#[cfg(feature = "ocr")]
|
|
fn detect_and_correct_orientation(&self, img: DynamicImage) -> Result<DynamicImage> {
|
|
// For now, we'll implement basic rotation detection
|
|
// In a production system, you might want to use Tesseract's OSD or advanced algorithms
|
|
let (width, height) = img.dimensions();
|
|
|
|
// If image is wider than tall by significant margin, it might need rotation
|
|
if width as f32 / height as f32 > 2.0 {
|
|
Ok(img.rotate90())
|
|
} else {
|
|
Ok(img)
|
|
}
|
|
}
|
|
|
|
/// Smart resize for OCR - optimize image size for best OCR performance
|
|
#[cfg(feature = "ocr")]
|
|
fn smart_resize_for_ocr(&self, img: DynamicImage, _target_dpi: i32) -> Result<DynamicImage> {
|
|
let (width, height) = img.dimensions();
|
|
let max_dimension = width.max(height);
|
|
let min_dimension = width.min(height);
|
|
|
|
// Calculate optimal dimensions for OCR
|
|
let mut new_width = width;
|
|
let mut new_height = height;
|
|
|
|
// Scale DOWN large images for better OCR performance and memory efficiency
|
|
if max_dimension > 2048 {
|
|
let scale_factor = 2048.0 / max_dimension as f32;
|
|
new_width = (width as f32 * scale_factor) as u32;
|
|
new_height = (height as f32 * scale_factor) as u32;
|
|
info!("Scaling down large image ({}x{}) by factor {:.2}x to {}x{} for optimal OCR",
|
|
width, height, scale_factor, new_width, new_height);
|
|
}
|
|
// Scale UP very small images that would produce poor OCR results
|
|
else if min_dimension < 300 {
|
|
let scale_factor = 600.0 / min_dimension as f32;
|
|
new_width = (width as f32 * scale_factor) as u32;
|
|
new_height = (height as f32 * scale_factor) as u32;
|
|
info!("Scaling up small image ({}x{}) by factor {:.2}x to {}x{} for better OCR",
|
|
width, height, scale_factor, new_width, new_height);
|
|
}
|
|
|
|
if new_width != width || new_height != height {
|
|
// Use Lanczos3 for best quality upscaling
|
|
Ok(img.resize(new_width, new_height, image::imageops::FilterType::Lanczos3))
|
|
} else {
|
|
Ok(img)
|
|
}
|
|
}
|
|
|
|
/// Analyze image quality metrics
|
|
#[cfg(feature = "ocr")]
|
|
fn analyze_image_quality(&self, img: &ImageBuffer<Luma<u8>, Vec<u8>>) -> ImageQualityStats {
|
|
let (width, height) = img.dimensions();
|
|
let pixel_count = (width as u64) * (height as u64);
|
|
|
|
// For very large images, use sampling to avoid performance issues and overflow
|
|
let (average_brightness, variance) = if pixel_count > 4_000_000 { // > 4 megapixels
|
|
self.analyze_quality_sampled(img)
|
|
} else {
|
|
self.analyze_quality_full(img)
|
|
};
|
|
|
|
let contrast_ratio = variance.sqrt() / 255.0;
|
|
|
|
// Estimate noise level using local variance
|
|
let noise_level = self.estimate_noise_level(img);
|
|
|
|
// Estimate sharpness using gradient magnitude
|
|
let sharpness = self.estimate_sharpness(img);
|
|
|
|
ImageQualityStats {
|
|
average_brightness,
|
|
contrast_ratio,
|
|
noise_level,
|
|
sharpness,
|
|
}
|
|
}
|
|
|
|
/// Analyze quality for normal-sized images (< 4 megapixels)
|
|
#[cfg(feature = "ocr")]
|
|
fn analyze_quality_full(&self, img: &ImageBuffer<Luma<u8>, Vec<u8>>) -> (f32, f32) {
|
|
let pixels: Vec<u8> = img.pixels().map(|p| p[0]).collect();
|
|
let pixel_count = pixels.len() as f32;
|
|
|
|
// Calculate average brightness using u64 to prevent overflow
|
|
let sum: u64 = pixels.iter().map(|&p| p as u64).sum();
|
|
let average_brightness = sum as f32 / pixel_count;
|
|
|
|
// Calculate variance
|
|
let variance: f32 = pixels.iter()
|
|
.map(|&p| {
|
|
let diff = p as f32 - average_brightness;
|
|
diff * diff
|
|
})
|
|
.sum::<f32>() / pixel_count;
|
|
|
|
(average_brightness, variance)
|
|
}
|
|
|
|
/// Analyze quality for large images using sampling
|
|
#[cfg(feature = "ocr")]
|
|
fn analyze_quality_sampled(&self, img: &ImageBuffer<Luma<u8>, Vec<u8>>) -> (f32, f32) {
|
|
let (width, height) = img.dimensions();
|
|
let mut pixel_sum = 0u64;
|
|
let mut sample_count = 0u32;
|
|
|
|
// Sample every 10th pixel to avoid overflow and improve performance
|
|
for y in (0..height).step_by(10) {
|
|
for x in (0..width).step_by(10) {
|
|
pixel_sum += img.get_pixel(x, y)[0] as u64;
|
|
sample_count += 1;
|
|
}
|
|
}
|
|
|
|
let average_brightness = if sample_count > 0 {
|
|
pixel_sum as f32 / sample_count as f32
|
|
} else {
|
|
128.0 // Default middle brightness
|
|
};
|
|
|
|
// Calculate variance using sampled pixels
|
|
let mut variance_sum = 0.0f32;
|
|
for y in (0..height).step_by(10) {
|
|
for x in (0..width).step_by(10) {
|
|
let pixel_value = img.get_pixel(x, y)[0] as f32;
|
|
let diff = pixel_value - average_brightness;
|
|
variance_sum += diff * diff;
|
|
}
|
|
}
|
|
|
|
let variance = if sample_count > 0 {
|
|
variance_sum / sample_count as f32
|
|
} else {
|
|
0.0
|
|
};
|
|
|
|
(average_brightness, variance)
|
|
}
|
|
|
|
/// Estimate noise level in image
|
|
#[cfg(feature = "ocr")]
|
|
fn estimate_noise_level(&self, img: &ImageBuffer<Luma<u8>, Vec<u8>>) -> f32 {
|
|
let (width, height) = img.dimensions();
|
|
let mut noise_sum = 0.0f32;
|
|
let mut sample_count = 0u32;
|
|
|
|
// Sample every 10th pixel to estimate noise
|
|
for y in (5..height-5).step_by(10) {
|
|
for x in (5..width-5).step_by(10) {
|
|
let center = img.get_pixel(x, y)[0] as f32;
|
|
let mut neighbor_sum = 0.0f32;
|
|
let mut neighbor_count = 0u32;
|
|
|
|
// Check 3x3 neighborhood
|
|
for dy in -1..=1 {
|
|
for dx in -1..=1 {
|
|
if dx == 0 && dy == 0 { continue; }
|
|
let neighbor = img.get_pixel((x as i32 + dx) as u32, (y as i32 + dy) as u32)[0] as f32;
|
|
neighbor_sum += neighbor;
|
|
neighbor_count += 1;
|
|
}
|
|
}
|
|
|
|
let neighbor_avg = neighbor_sum / neighbor_count as f32;
|
|
let local_variance = (center - neighbor_avg).abs();
|
|
noise_sum += local_variance;
|
|
sample_count += 1;
|
|
}
|
|
}
|
|
|
|
if sample_count > 0 {
|
|
(noise_sum / sample_count as f32) / 255.0
|
|
} else {
|
|
0.0
|
|
}
|
|
}
|
|
|
|
/// Estimate image sharpness using gradient magnitude
|
|
#[cfg(feature = "ocr")]
|
|
fn estimate_sharpness(&self, img: &ImageBuffer<Luma<u8>, Vec<u8>>) -> f32 {
|
|
let (width, height) = img.dimensions();
|
|
let mut gradient_sum = 0.0f32;
|
|
let mut sample_count = 0u64; // Use u64 to prevent overflow
|
|
|
|
// For large images, sample pixels to avoid performance issues and overflow
|
|
let total_pixels = (width as u64) * (height as u64);
|
|
let step_size = if total_pixels > 4_000_000 { 10 } else { 1 }; // Sample every 10th pixel for large images
|
|
|
|
// Calculate gradients for interior pixels
|
|
for y in (1..height-1).step_by(step_size) {
|
|
for x in (1..width-1).step_by(step_size) {
|
|
let _center = img.get_pixel(x, y)[0] as f32;
|
|
let left = img.get_pixel(x-1, y)[0] as f32;
|
|
let right = img.get_pixel(x+1, y)[0] as f32;
|
|
let top = img.get_pixel(x, y-1)[0] as f32;
|
|
let bottom = img.get_pixel(x, y+1)[0] as f32;
|
|
|
|
let grad_x = (right - left) / 2.0;
|
|
let grad_y = (bottom - top) / 2.0;
|
|
let gradient_magnitude = (grad_x * grad_x + grad_y * grad_y).sqrt();
|
|
|
|
gradient_sum += gradient_magnitude;
|
|
sample_count += 1;
|
|
}
|
|
}
|
|
|
|
if sample_count > 0 {
|
|
(gradient_sum / sample_count as f32) / 255.0
|
|
} else {
|
|
0.0
|
|
}
|
|
}
|
|
|
|
/// Enhanced brightness and contrast correction for dim images
|
|
#[cfg(feature = "ocr")]
|
|
fn enhance_brightness_and_contrast(&self, img: ImageBuffer<Luma<u8>, Vec<u8>>, stats: &ImageQualityStats, settings: &Settings) -> Result<ImageBuffer<Luma<u8>, Vec<u8>>> {
|
|
let (width, height) = img.dimensions();
|
|
let mut enhanced = ImageBuffer::new(width, height);
|
|
|
|
// Calculate enhancement parameters based on image statistics and user settings
|
|
let brightness_boost = if settings.ocr_brightness_boost > 0.0 {
|
|
settings.ocr_brightness_boost // Use user-configured value
|
|
} else if stats.average_brightness < 50.0 {
|
|
60.0 - stats.average_brightness // Aggressive boost for very dim images
|
|
} else if stats.average_brightness < 80.0 {
|
|
30.0 - (stats.average_brightness - 50.0) * 0.5 // Moderate boost
|
|
} else {
|
|
0.0 // No boost needed
|
|
};
|
|
|
|
let contrast_multiplier = if settings.ocr_contrast_multiplier > 0.0 {
|
|
settings.ocr_contrast_multiplier // Use user-configured value
|
|
} else if stats.contrast_ratio < 0.2 {
|
|
2.5 // Aggressive contrast boost for flat images
|
|
} else if stats.contrast_ratio < 0.4 {
|
|
1.8 // Moderate contrast boost
|
|
} else {
|
|
1.2 // Slight boost
|
|
};
|
|
|
|
info!("Applying brightness boost: {:.1}, contrast multiplier: {:.1}", brightness_boost, contrast_multiplier);
|
|
|
|
for (x, y, pixel) in img.enumerate_pixels() {
|
|
let original_value = pixel[0] as f32;
|
|
|
|
// Apply brightness and contrast enhancement
|
|
let enhanced_value = ((original_value + brightness_boost) * contrast_multiplier).round();
|
|
let clamped_value = enhanced_value.max(0.0).min(255.0) as u8;
|
|
|
|
enhanced.put_pixel(x, y, Luma([clamped_value]));
|
|
}
|
|
|
|
Ok(enhanced)
|
|
}
|
|
|
|
/// Adaptive noise removal based on detected noise level
|
|
#[cfg(feature = "ocr")]
|
|
fn adaptive_noise_removal(&self, img: ImageBuffer<Luma<u8>, Vec<u8>>, stats: &ImageQualityStats, settings: &Settings) -> Result<ImageBuffer<Luma<u8>, Vec<u8>>> {
|
|
let mut processed = img;
|
|
|
|
// Use user-configured noise reduction level if specified
|
|
let noise_level = if settings.ocr_noise_reduction_level > 0 {
|
|
settings.ocr_noise_reduction_level
|
|
} else if stats.noise_level > 0.2 {
|
|
3 // Heavy noise
|
|
} else if stats.noise_level > 0.1 {
|
|
2 // Moderate noise
|
|
} else {
|
|
1 // Light noise
|
|
};
|
|
|
|
match noise_level {
|
|
3 => {
|
|
// Heavy noise - apply multiple filters
|
|
processed = median_filter(&processed, 2, 2); // Larger median filter
|
|
processed = gaussian_blur_f32(&processed, 0.8); // More blur
|
|
info!("Applied heavy noise reduction");
|
|
},
|
|
2 => {
|
|
// Moderate noise
|
|
processed = median_filter(&processed, 1, 1);
|
|
processed = gaussian_blur_f32(&processed, 0.5);
|
|
info!("Applied moderate noise reduction");
|
|
},
|
|
1 | _ => {
|
|
// Light noise or clean image
|
|
processed = median_filter(&processed, 1, 1);
|
|
info!("Applied light noise reduction");
|
|
}
|
|
}
|
|
|
|
Ok(processed)
|
|
}
|
|
|
|
/// Adaptive contrast enhancement based on image quality
|
|
#[cfg(feature = "ocr")]
|
|
fn adaptive_contrast_enhancement(&self, img: ImageBuffer<Luma<u8>, Vec<u8>>, stats: &ImageQualityStats, settings: &Settings) -> Result<ImageBuffer<Luma<u8>, Vec<u8>>> {
|
|
// Choose threshold size based on image dimensions and quality
|
|
let (width, height) = img.dimensions();
|
|
let min_dimension = width.min(height);
|
|
|
|
// Check if image is too large for safe adaptive threshold processing
|
|
// The integral image calculation can overflow with large images
|
|
if width as u64 * height as u64 > 1_500_000 {
|
|
info!("Image too large for adaptive threshold ({}x{}), using alternative contrast enhancement", width, height);
|
|
return self.apply_alternative_contrast_enhancement(img, stats, settings);
|
|
}
|
|
|
|
let threshold_size = if settings.ocr_adaptive_threshold_window_size > 0 {
|
|
// Use user-configured window size
|
|
settings.ocr_adaptive_threshold_window_size as u32
|
|
} else if stats.contrast_ratio < 0.2 {
|
|
// Low contrast - use smaller windows for more aggressive local adaptation
|
|
(min_dimension / 20).max(11).min(31)
|
|
} else {
|
|
// Good contrast - use larger windows
|
|
(min_dimension / 15).max(15).min(41)
|
|
};
|
|
|
|
// Ensure odd number for threshold size
|
|
let threshold_size = if threshold_size % 2 == 0 { threshold_size + 1 } else { threshold_size };
|
|
|
|
info!("Applying adaptive threshold with window size: {}", threshold_size);
|
|
|
|
// Wrap in panic-safe block to catch overflow errors
|
|
let enhanced = catch_unwind(AssertUnwindSafe(|| {
|
|
adaptive_threshold(&img, threshold_size)
|
|
}));
|
|
|
|
match enhanced {
|
|
Ok(result) => Ok(result),
|
|
Err(_) => {
|
|
warn!("Adaptive threshold panicked (likely overflow), using alternative method");
|
|
self.apply_alternative_contrast_enhancement(img, stats, settings)
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Alternative contrast enhancement for large images to avoid overflow
|
|
#[cfg(feature = "ocr")]
|
|
fn apply_alternative_contrast_enhancement(&self, img: ImageBuffer<Luma<u8>, Vec<u8>>, stats: &ImageQualityStats, settings: &Settings) -> Result<ImageBuffer<Luma<u8>, Vec<u8>>> {
|
|
let (width, height) = img.dimensions();
|
|
let mut enhanced = ImageBuffer::new(width, height);
|
|
|
|
// Use histogram equalization instead of adaptive threshold for large images
|
|
if settings.ocr_histogram_equalization {
|
|
info!("Applying histogram equalization for contrast enhancement (user enabled)");
|
|
} else {
|
|
info!("Applying histogram equalization for contrast enhancement (fallback)");
|
|
}
|
|
|
|
// Calculate histogram using u64 to prevent overflow
|
|
let mut histogram = [0u64; 256];
|
|
for pixel in img.pixels() {
|
|
histogram[pixel[0] as usize] += 1;
|
|
}
|
|
|
|
// Calculate cumulative distribution function
|
|
let total_pixels = (width as u64) * (height as u64);
|
|
let mut cdf = [0u64; 256];
|
|
cdf[0] = histogram[0];
|
|
for i in 1..256 {
|
|
cdf[i] = cdf[i - 1] + histogram[i];
|
|
}
|
|
|
|
// Create lookup table for histogram equalization
|
|
let mut lookup = [0u8; 256];
|
|
for i in 0..256 {
|
|
if cdf[i] > 0 {
|
|
lookup[i] = ((cdf[i] as f64 / total_pixels as f64) * 255.0) as u8;
|
|
}
|
|
}
|
|
|
|
// Apply histogram equalization
|
|
for (x, y, pixel) in img.enumerate_pixels() {
|
|
let old_value = pixel[0];
|
|
let new_value = lookup[old_value as usize];
|
|
enhanced.put_pixel(x, y, Luma([new_value]));
|
|
}
|
|
|
|
// Apply additional contrast stretching if needed
|
|
if stats.contrast_ratio < 0.3 {
|
|
enhanced = self.apply_contrast_stretching(enhanced)?;
|
|
}
|
|
|
|
Ok(enhanced)
|
|
}
|
|
|
|
/// Apply contrast stretching to improve dynamic range
|
|
#[cfg(feature = "ocr")]
|
|
fn apply_contrast_stretching(&self, img: ImageBuffer<Luma<u8>, Vec<u8>>) -> Result<ImageBuffer<Luma<u8>, Vec<u8>>> {
|
|
let (width, height) = img.dimensions();
|
|
let mut enhanced = ImageBuffer::new(width, height);
|
|
|
|
// Find min and max values
|
|
let mut min_val = 255u8;
|
|
let mut max_val = 0u8;
|
|
|
|
for pixel in img.pixels() {
|
|
let val = pixel[0];
|
|
min_val = min_val.min(val);
|
|
max_val = max_val.max(val);
|
|
}
|
|
|
|
// Avoid division by zero
|
|
if max_val == min_val {
|
|
return Ok(img);
|
|
}
|
|
|
|
let range = max_val - min_val;
|
|
|
|
// Apply contrast stretching
|
|
for (x, y, pixel) in img.enumerate_pixels() {
|
|
let old_value = pixel[0];
|
|
let new_value = (((old_value - min_val) as f32 / range as f32) * 255.0) as u8;
|
|
enhanced.put_pixel(x, y, Luma([new_value]));
|
|
}
|
|
|
|
Ok(enhanced)
|
|
}
|
|
|
|
/// Sharpen blurry images
|
|
#[cfg(feature = "ocr")]
|
|
fn sharpen_image(&self, img: ImageBuffer<Luma<u8>, Vec<u8>>, settings: &Settings) -> Result<ImageBuffer<Luma<u8>, Vec<u8>>> {
|
|
let (width, height) = img.dimensions();
|
|
let mut sharpened = ImageBuffer::new(width, height);
|
|
|
|
// Unsharp mask kernel - enhances edges
|
|
let kernel = [
|
|
[0.0, -1.0, 0.0],
|
|
[-1.0, 5.0, -1.0],
|
|
[0.0, -1.0, 0.0],
|
|
];
|
|
|
|
for y in 1..height-1 {
|
|
for x in 1..width-1 {
|
|
let mut sum = 0.0;
|
|
|
|
for ky in 0..3 {
|
|
for kx in 0..3 {
|
|
let px = img.get_pixel(x + kx - 1, y + ky - 1)[0] as f32;
|
|
sum += px * kernel[ky as usize][kx as usize];
|
|
}
|
|
}
|
|
|
|
let sharpened_value = sum.round().max(0.0).min(255.0) as u8;
|
|
sharpened.put_pixel(x, y, Luma([sharpened_value]));
|
|
}
|
|
}
|
|
|
|
// Copy border pixels
|
|
for y in 0..height {
|
|
for x in 0..width {
|
|
if x == 0 || x == width-1 || y == 0 || y == height-1 {
|
|
sharpened.put_pixel(x, y, *img.get_pixel(x, y));
|
|
}
|
|
}
|
|
}
|
|
|
|
info!("Applied image sharpening");
|
|
Ok(sharpened)
|
|
}
|
|
|
|
/// Apply morphological operations for text clarity
|
|
#[cfg(feature = "ocr")]
|
|
fn apply_morphological_operations(&self, img: ImageBuffer<Luma<u8>, Vec<u8>>) -> Result<ImageBuffer<Luma<u8>, Vec<u8>>> {
|
|
// Apply opening to remove small noise
|
|
let opened = open(&img, Norm::LInf, 1);
|
|
|
|
// Apply closing to fill small gaps in text
|
|
let closed = close(&opened, Norm::LInf, 1);
|
|
|
|
Ok(closed)
|
|
}
|
|
|
|
    /// Extract text from PDF using ocrmypdf
    ///
    /// Pipeline, cheapest first:
    ///   1. validation: reject files > 100 MB and files whose first 1 KB has no
    ///      `%PDF-` magic,
    ///   2. quick text extraction (`extract_pdf_text_quick`) for PDFs that
    ///      already carry a text layer,
    ///   3. full OCR (`extract_text_from_pdf_with_ocr`) when quick extraction
    ///      fails or its quality heuristic rejects the result,
    ///   4. raw byte-level scraping (`extract_text_from_pdf_bytes`) as a last
    ///      resort if OCR itself errors.
    ///
    /// Confidence reflects the method used: 95 (text layer), 50 (byte scrape),
    /// or whatever the OCR path reports.
    #[cfg(feature = "ocr")]
    pub async fn extract_text_from_pdf(&self, file_path: &str, settings: &Settings) -> Result<OcrResult> {
        let start_time = std::time::Instant::now();
        info!("Extracting text from PDF: {}", file_path);

        // Check file size before processing
        let metadata = tokio::fs::metadata(file_path).await?;
        let file_size = metadata.len();

        // Limit PDF size to 100MB to prevent memory exhaustion
        const MAX_PDF_SIZE: u64 = 100 * 1024 * 1024; // 100MB
        if file_size > MAX_PDF_SIZE {
            return Err(anyhow!(
                "PDF file too large: {:.1} MB (max: {:.1} MB). Consider splitting the PDF.",
                file_size as f64 / (1024.0 * 1024.0),
                MAX_PDF_SIZE as f64 / (1024.0 * 1024.0)
            ));
        }

        // Check if it's a valid PDF by reading first 1KB
        // (capped at file_size so read_exact cannot over-read a tiny file)
        let mut header_bytes = vec![0u8; 1024.min(file_size as usize)];
        let mut file = tokio::fs::File::open(file_path).await?;
        use tokio::io::AsyncReadExt;
        file.read_exact(&mut header_bytes).await?;
        drop(file); // release the handle before external tools touch the file

        if !is_valid_pdf(&header_bytes) {
            // Include a printable preview of the first 50 bytes to aid debugging.
            return Err(anyhow!(
                "Invalid PDF file: Missing or corrupted PDF header. File size: {} bytes, Header: {:?}",
                file_size,
                header_bytes.get(0..50).unwrap_or(&[]).iter().map(|&b| {
                    if b >= 32 && b <= 126 { b as char } else { '.' }
                }).collect::<String>()
            ));
        }

        // Check if ocrmypdf is available
        if !self.is_ocrmypdf_available().await {
            return Err(anyhow!(
                "ocrmypdf is not available on this system. To extract text from PDFs, please install ocrmypdf. \
                On Ubuntu/Debian: 'apt-get install ocrmypdf'. \
                On macOS: 'brew install ocrmypdf'."
            ));
        }

        // First try to extract text without OCR for performance (using --skip-text)
        let quick_extraction_result = self.extract_pdf_text_quick(file_path).await;

        match quick_extraction_result {
            Ok((text, extraction_time)) => {
                let word_count = self.count_words_safely(&text);

                // Check if quick extraction got good results
                if self.is_text_extraction_quality_sufficient(&text, word_count, file_size) {
                    info!("PDF text extraction successful for '{}' using quick method", file_path);
                    return Ok(OcrResult {
                        text,
                        confidence: 95.0,
                        processing_time_ms: extraction_time,
                        word_count,
                        preprocessing_applied: vec!["PDF text extraction (pdftotext)".to_string()],
                        processed_image_path: None,
                    });
                } else {
                    info!("Quick PDF extraction insufficient for '{}' ({} words), using full OCR", file_path, word_count);
                }
            }
            Err(e) => {
                warn!("Quick PDF extraction failed for '{}': {}, using full OCR", file_path, e);
            }
        }

        // If quick extraction failed or was insufficient, use full OCR
        let full_ocr_result = self.extract_text_from_pdf_with_ocr(file_path, settings, start_time).await;

        // If OCR also fails, try direct text extraction as last resort
        if full_ocr_result.is_err() {
            warn!("Full OCR failed, trying direct text extraction as last resort for: {}", file_path);

            match self.extract_text_from_pdf_bytes(file_path).await {
                Ok(text) if !text.trim().is_empty() => {
                    let processing_time = start_time.elapsed().as_millis() as u64;
                    let word_count = self.count_words_safely(&text);
                    info!("Direct text extraction succeeded as last resort for: {}", file_path);

                    return Ok(OcrResult {
                        text,
                        confidence: 50.0, // Lower confidence for direct extraction
                        processing_time_ms: processing_time,
                        word_count,
                        preprocessing_applied: vec!["Direct PDF text extraction (last resort)".to_string()],
                        processed_image_path: None,
                    });
                }
                Ok(_) => {
                    warn!("Direct text extraction returned empty text for: {}", file_path);
                }
                Err(e) => {
                    warn!("Direct text extraction also failed for {}: {}", file_path, e);
                }
            }
        }

        // Either the successful OCR result or the original OCR error.
        full_ocr_result
    }
|
|
|
|
/// Assess if text extraction quality is sufficient or if OCR fallback is needed
|
|
#[cfg(feature = "ocr")]
|
|
fn is_text_extraction_quality_sufficient(&self, text: &str, word_count: usize, file_size: u64) -> bool {
|
|
// If we got no words at all, definitely need OCR
|
|
if word_count == 0 {
|
|
return false;
|
|
}
|
|
|
|
// For very small files, low word count might be normal
|
|
if file_size < 50_000 && word_count >= 1 {
|
|
return true;
|
|
}
|
|
|
|
// Calculate word density (words per KB)
|
|
let file_size_kb = (file_size as f64) / 1024.0;
|
|
let word_density = (word_count as f64) / file_size_kb;
|
|
|
|
// Reasonable thresholds based on typical PDF content:
|
|
// - Text-based PDFs typically have 50-200 words per KB
|
|
// - Below 5 words per KB suggests mostly images/scanned content
|
|
// - But if we have a substantial number of words (>50), accept it regardless of density
|
|
const MIN_WORD_DENSITY: f64 = 5.0;
|
|
const MIN_WORDS_FOR_LARGE_FILES: usize = 10;
|
|
const SUBSTANTIAL_WORD_COUNT: usize = 50;
|
|
|
|
// If we have substantial text, accept it regardless of density
|
|
if word_count >= SUBSTANTIAL_WORD_COUNT {
|
|
debug!("PDF has substantial text content: {} words, accepting regardless of density", word_count);
|
|
return true;
|
|
}
|
|
|
|
if word_density < MIN_WORD_DENSITY && word_count < MIN_WORDS_FOR_LARGE_FILES {
|
|
debug!("PDF appears to be image-based: {} words in {:.1} KB (density: {:.2} words/KB)",
|
|
word_count, file_size_kb, word_density);
|
|
return false;
|
|
}
|
|
|
|
// Additional check: if text is mostly non-alphanumeric, might be extraction artifacts
|
|
let alphanumeric_chars = text.chars().filter(|c| c.is_alphanumeric()).count();
|
|
let alphanumeric_ratio = if text.len() > 0 {
|
|
(alphanumeric_chars as f64) / (text.len() as f64)
|
|
} else {
|
|
0.0
|
|
};
|
|
|
|
// If less than 30% alphanumeric content, likely poor extraction
|
|
if alphanumeric_ratio < 0.3 {
|
|
debug!("PDF text has low alphanumeric content: {:.1}% ({} of {} chars)",
|
|
alphanumeric_ratio * 100.0, alphanumeric_chars, text.len());
|
|
return false;
|
|
}
|
|
|
|
debug!("PDF text extraction quality sufficient: {} words, {:.2} words/KB, {:.1}% alphanumeric",
|
|
word_count, word_density, alphanumeric_ratio * 100.0);
|
|
true
|
|
}
|
|
|
|
/// Extract text from PDF using OCR (ocrmypdf) for image-based or poor-quality PDFs
|
|
#[cfg(feature = "ocr")]
|
|
async fn extract_text_from_pdf_with_ocr(&self, file_path: &str, settings: &Settings, start_time: std::time::Instant) -> Result<OcrResult> {
|
|
info!("Starting OCR extraction for PDF: {}", file_path);
|
|
|
|
// Check if ocrmypdf is available
|
|
if !self.is_ocrmypdf_available().await {
|
|
return Err(anyhow!(
|
|
"ocrmypdf is not available on this system. To extract text from image-based PDFs like '{}', please install ocrmypdf. \
|
|
On Ubuntu/Debian: 'apt-get install ocrmypdf'. \
|
|
On macOS: 'brew install ocrmypdf'. \
|
|
Alternatively, convert the PDF to images and upload those instead.",
|
|
file_path
|
|
));
|
|
}
|
|
|
|
// Generate temporary file path for OCR'd PDF
|
|
let temp_ocr_filename = format!("ocr_{}_{}.pdf",
|
|
std::process::id(),
|
|
std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)?.as_millis()
|
|
);
|
|
let temp_ocr_path = format!("{}/{}", self.temp_dir, temp_ocr_filename);
|
|
|
|
// Run ocrmypdf with progressive fallback strategies
|
|
let ocrmypdf_result = tokio::time::timeout(
|
|
std::time::Duration::from_secs(300), // 5 minute timeout for OCR
|
|
tokio::task::spawn_blocking({
|
|
let file_path = file_path.to_string();
|
|
let temp_ocr_path = temp_ocr_path.clone();
|
|
move || {
|
|
// Strategy 1: Standard OCR with cleaning
|
|
let mut result = std::process::Command::new("ocrmypdf")
|
|
.arg("--force-ocr") // OCR even if text is detected
|
|
.arg("-O2") // Optimize level 2 (balanced quality/speed)
|
|
.arg("--deskew") // Correct skewed pages
|
|
.arg("--clean") // Clean up artifacts
|
|
.arg("--language")
|
|
.arg("eng") // English language
|
|
.arg(&file_path)
|
|
.arg(&temp_ocr_path)
|
|
.output();
|
|
|
|
if result.is_ok() && result.as_ref().unwrap().status.success() {
|
|
return result;
|
|
}
|
|
|
|
// Strategy 2: If standard OCR fails, try with error recovery
|
|
eprintln!("Standard OCR failed, trying recovery mode...");
|
|
result = std::process::Command::new("ocrmypdf")
|
|
.arg("--force-ocr")
|
|
.arg("--fix-metadata") // Fix metadata issues
|
|
.arg("--remove-background") // Remove background noise
|
|
.arg("-O1") // Lower optimization for problematic PDFs
|
|
.arg("--language")
|
|
.arg("eng")
|
|
.arg(&file_path)
|
|
.arg(&temp_ocr_path)
|
|
.output();
|
|
|
|
if result.is_ok() && result.as_ref().unwrap().status.success() {
|
|
return result;
|
|
}
|
|
|
|
// Strategy 3: Last resort - minimal processing (skips very large pages)
|
|
eprintln!("Recovery mode failed, trying minimal processing...");
|
|
std::process::Command::new("ocrmypdf")
|
|
.arg("--force-ocr")
|
|
.arg("--skip-big") // Skip very large pages that might cause memory issues
|
|
.arg("--language")
|
|
.arg("eng")
|
|
.arg(&file_path)
|
|
.arg(&temp_ocr_path)
|
|
.output()
|
|
}
|
|
})
|
|
).await;
|
|
|
|
let ocrmypdf_output = match ocrmypdf_result {
|
|
Ok(Ok(output)) => output?,
|
|
Ok(Err(e)) => return Err(anyhow!("Failed to join ocrmypdf task: {}", e)),
|
|
Err(_) => return Err(anyhow!("ocrmypdf timed out after 5 minutes for file '{}'", file_path)),
|
|
};
|
|
|
|
if !ocrmypdf_output.status.success() {
|
|
let stderr = String::from_utf8_lossy(&ocrmypdf_output.stderr);
|
|
let stdout = String::from_utf8_lossy(&ocrmypdf_output.stdout);
|
|
return Err(anyhow!(
|
|
"ocrmypdf failed for '{}': Exit code {}\nStderr: {}\nStdout: {}",
|
|
file_path, ocrmypdf_output.status.code().unwrap_or(-1), stderr, stdout
|
|
));
|
|
}
|
|
|
|
// Extract text from the OCR'd PDF
|
|
let ocr_text_result = tokio::task::spawn_blocking({
|
|
let temp_ocr_path = temp_ocr_path.clone();
|
|
move || -> Result<String> {
|
|
let _bytes = std::fs::read(&temp_ocr_path)?;
|
|
// Catch panics from pdf-extract library (same pattern as used elsewhere)
|
|
// Extract text from the OCR'd PDF using ocrmypdf's sidecar option
|
|
let temp_text_path = format!("{}.txt", temp_ocr_path);
|
|
let extract_result = std::process::Command::new("ocrmypdf")
|
|
.arg("--sidecar") // Extract text to a sidecar file
|
|
.arg(&temp_text_path)
|
|
.arg(&temp_ocr_path)
|
|
.arg("-") // Output to stdout (dummy, required by ocrmypdf)
|
|
.output()?;
|
|
|
|
if !extract_result.status.success() {
|
|
let stderr = String::from_utf8_lossy(&extract_result.stderr);
|
|
return Err(anyhow!(
|
|
"ocrmypdf text extraction failed: {}",
|
|
stderr
|
|
));
|
|
}
|
|
|
|
// Read the extracted text from the sidecar file
|
|
let text = std::fs::read_to_string(&temp_text_path)?;
|
|
|
|
// Clean up the text file
|
|
let _ = std::fs::remove_file(&temp_text_path);
|
|
Ok(text.trim().to_string())
|
|
}
|
|
}).await??;
|
|
|
|
// Clean up temporary file
|
|
let _ = tokio::fs::remove_file(&temp_ocr_path).await;
|
|
|
|
let processing_time = start_time.elapsed().as_millis() as u64;
|
|
let word_count = self.count_words_safely(&ocr_text_result);
|
|
|
|
info!("OCR extraction completed for '{}': {} words in {}ms",
|
|
file_path, word_count, processing_time);
|
|
|
|
Ok(OcrResult {
|
|
text: ocr_text_result,
|
|
confidence: 85.0, // OCR is generally lower confidence than direct text extraction
|
|
processing_time_ms: processing_time,
|
|
word_count,
|
|
preprocessing_applied: vec!["OCR via ocrmypdf".to_string()],
|
|
processed_image_path: None,
|
|
})
|
|
}
|
|
|
|
    /// Progressive PDF text extraction with fallback strategies
    ///
    /// Tries, in order:
    ///   1. `pdftotext -layout` — cheap, works when the PDF already carries a
    ///      text layer; accepted only if it yields more than 5 words,
    ///   2. direct byte-level scraping (`extract_text_from_pdf_bytes`), same
    ///      >5-word acceptance threshold,
    ///   3. `ocrmypdf --sidecar` — pulls any existing OCR text layer,
    ///   4. byte-level scraping once more, this time accepting any non-empty
    ///      result.
    ///
    /// Returns the extracted text plus the elapsed time in milliseconds.
    #[cfg(feature = "ocr")]
    async fn extract_pdf_text_quick(&self, file_path: &str) -> Result<(String, u64)> {
        let start_time = std::time::Instant::now();

        // Generate temporary file path for text extraction
        let temp_text_filename = format!("quick_text_{}_{}.txt",
            std::process::id(),
            std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)?.as_millis()
        );
        let temp_text_path = format!("{}/{}", self.temp_dir, temp_text_filename);

        // Strategy 1: Fast text extraction using pdftotext (for existing text)
        debug!("Trying pdftotext for existing text extraction: {}", file_path);
        debug!("Using temp file path: {}", temp_text_path);
        let pdftotext_result = tokio::process::Command::new("pdftotext")
            .arg("-layout") // Preserve layout
            .arg(file_path)
            .arg(&temp_text_path)
            .output()
            .await;

        if let Ok(output) = pdftotext_result {
            debug!("pdftotext exit status: {}", output.status);
            if !output.stderr.is_empty() {
                debug!("pdftotext stderr: {}", String::from_utf8_lossy(&output.stderr));
            }
            if output.status.success() {
                if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
                    let _ = tokio::fs::remove_file(&temp_text_path).await;
                    let word_count = text.split_whitespace().count();
                    debug!("pdftotext extracted {} words from temp file", word_count);

                    // If we got substantial text (more than a few words), use it
                    if word_count > 5 {
                        let processing_time = start_time.elapsed().as_millis() as u64;
                        info!("pdftotext extracted {} words from: {}", word_count, file_path);
                        return Ok((text.trim().to_string(), processing_time));
                    } else {
                        debug!("pdftotext only extracted {} words, will try direct extraction before OCR", word_count);
                    }
                } else {
                    debug!("Failed to read pdftotext output file: {}", temp_text_path);
                }
            } else {
                let stderr = String::from_utf8_lossy(&output.stderr);
                debug!("pdftotext failed with status {}: {}", output.status, stderr);
            }
        } else {
            debug!("Failed to execute pdftotext command");
        }

        info!("pdftotext extraction insufficient for '{}', trying direct extraction before OCR", file_path);

        // Strategy 2: Try direct text extraction (often works when pdftotext fails)
        match self.extract_text_from_pdf_bytes(file_path).await {
            Ok(text) if !text.trim().is_empty() => {
                let word_count = text.split_whitespace().count();
                if word_count > 5 {
                    let processing_time = start_time.elapsed().as_millis() as u64;
                    info!("Direct text extraction succeeded for '{}': {} words", file_path, word_count);
                    return Ok((text, processing_time));
                } else {
                    debug!("Direct extraction only got {} words, trying OCR", word_count);
                }
            }
            Ok(_) => {
                debug!("Direct text extraction returned empty text");
            }
            Err(e) => {
                debug!("Direct text extraction failed: {}", e);
            }
        }

        info!("Direct extraction insufficient for '{}', using OCR extraction", file_path);

        // Strategy 3: Use ocrmypdf --sidecar to extract existing OCR text
        let ocrmypdf_result = tokio::process::Command::new("ocrmypdf")
            .arg("--sidecar")
            .arg(&temp_text_path)
            .arg(file_path)
            .arg("-") // Dummy output (we only want sidecar)
            .output()
            .await;

        if let Ok(output) = &ocrmypdf_result {
            if output.status.success() {
                if let Ok(text) = tokio::fs::read_to_string(&temp_text_path).await {
                    let _ = tokio::fs::remove_file(&temp_text_path).await;
                    let word_count = text.split_whitespace().count();
                    if word_count > 0 {
                        let processing_time = start_time.elapsed().as_millis() as u64;
                        info!("ocrmypdf --sidecar extracted {} words from: {}", word_count, file_path);
                        return Ok((text.trim().to_string(), processing_time));
                    }
                }
            } else {
                let stderr = String::from_utf8_lossy(&output.stderr);
                debug!("ocrmypdf --sidecar failed: {}", stderr);

                // Check if the error indicates the page already has text
                if stderr.contains("page already has text") {
                    // This is good - it means there's already text, we should use pdftotext
                    warn!("ocrmypdf detected existing text in PDF, this should have been caught by pdftotext");
                }
            }
        }

        // Strategy 3: Last resort - direct byte-level text extraction
        // NOTE(review): this label duplicates "Strategy 3" above, and the call
        // repeats the Strategy 2 byte-level scrape — it only differs by
        // accepting 1-5 word results. Candidate for cleanup; behavior kept
        // as-is here.
        warn!("Standard extraction methods failed, trying direct text extraction from: {}", file_path);

        match self.extract_text_from_pdf_bytes(file_path).await {
            Ok(text) if !text.trim().is_empty() => {
                let processing_time = start_time.elapsed().as_millis() as u64;
                let word_count = text.split_whitespace().count();
                info!("Direct text extraction succeeded for '{}': {} words", file_path, word_count);
                Ok((text, processing_time))
            }
            Ok(_) => {
                warn!("Direct text extraction returned empty text for: {}", file_path);
                // If all strategies fail, return the last error
                if let Ok(ref output) = ocrmypdf_result {
                    let stderr = String::from_utf8_lossy(&output.stderr);
                    Err(anyhow!("All PDF extraction strategies failed. Last error: {}", stderr))
                } else {
                    Err(anyhow!("All PDF extraction strategies failed"))
                }
            }
            Err(e) => {
                warn!("Direct text extraction also failed for {}: {}", file_path, e);
                // If all strategies fail, return the last error
                if let Ok(ref output) = ocrmypdf_result {
                    let stderr = String::from_utf8_lossy(&output.stderr);
                    Err(anyhow!("All PDF extraction strategies failed. Last error: {}", stderr))
                } else {
                    Err(anyhow!("All PDF extraction strategies failed: {}", e))
                }
            }
        }
    }
|
|
|
|
/// Last resort: extract readable text directly from PDF bytes
|
|
/// This can find text that's embedded in the PDF even if the structure is corrupted
|
|
#[cfg(feature = "ocr")]
|
|
async fn extract_text_from_pdf_bytes(&self, file_path: &str) -> Result<String> {
|
|
let bytes = tokio::fs::read(file_path).await?;
|
|
|
|
// Look for text strings in the PDF
|
|
let mut extracted_text = String::new();
|
|
let mut current_text = String::new();
|
|
let mut in_text_object = false;
|
|
let mut in_string = false;
|
|
let mut escape_next = false;
|
|
|
|
for &byte in &bytes {
|
|
let char = byte as char;
|
|
|
|
// Look for text objects (BT...ET blocks)
|
|
if !in_text_object && char == 'B' {
|
|
// Check if this might be the start of "BT" (Begin Text)
|
|
if let Some(_window) = bytes.windows(2).find(|w| w == b"BT") {
|
|
in_text_object = true;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
if in_text_object && char == 'E' {
|
|
// Check if this might be the start of "ET" (End Text)
|
|
if let Some(_window) = bytes.windows(2).find(|w| w == b"ET") {
|
|
in_text_object = false;
|
|
if !current_text.trim().is_empty() {
|
|
extracted_text.push_str(¤t_text);
|
|
extracted_text.push(' ');
|
|
current_text.clear();
|
|
}
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// Look for text strings in parentheses (text) or brackets
|
|
if in_text_object {
|
|
if char == '(' && !escape_next {
|
|
in_string = true;
|
|
continue;
|
|
}
|
|
|
|
if char == ')' && !escape_next && in_string {
|
|
in_string = false;
|
|
current_text.push(' ');
|
|
continue;
|
|
}
|
|
|
|
if in_string {
|
|
if escape_next {
|
|
escape_next = false;
|
|
current_text.push(char);
|
|
} else if char == '\\' {
|
|
escape_next = true;
|
|
} else {
|
|
current_text.push(char);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Also try to find any readable ASCII text in the PDF
|
|
let mut ascii_text = String::new();
|
|
let mut current_word = String::new();
|
|
|
|
for &byte in &bytes {
|
|
if byte >= 32 && byte <= 126 { // Printable ASCII
|
|
current_word.push(byte as char);
|
|
} else {
|
|
if current_word.len() > 3 { // Only keep words longer than 3 characters
|
|
ascii_text.push_str(¤t_word);
|
|
ascii_text.push(' ');
|
|
}
|
|
current_word.clear();
|
|
}
|
|
}
|
|
|
|
// Add the last word if it's long enough
|
|
if current_word.len() > 3 {
|
|
ascii_text.push_str(¤t_word);
|
|
}
|
|
|
|
// Combine both extraction methods
|
|
let mut final_text = extracted_text;
|
|
if !ascii_text.trim().is_empty() {
|
|
final_text.push_str("\\n");
|
|
final_text.push_str(&ascii_text);
|
|
}
|
|
|
|
// Clean up the text
|
|
let cleaned_text = final_text
|
|
.split_whitespace()
|
|
.filter(|word| word.len() > 1) // Filter out single characters
|
|
.collect::<Vec<_>>()
|
|
.join(" ");
|
|
|
|
if cleaned_text.trim().is_empty() {
|
|
Err(anyhow!("No readable text found in PDF"))
|
|
} else {
|
|
Ok(cleaned_text)
|
|
}
|
|
}
|
|
|
|
/// Check if ocrmypdf is available on the system
|
|
#[cfg(feature = "ocr")]
|
|
async fn is_ocrmypdf_available(&self) -> bool {
|
|
match tokio::process::Command::new("ocrmypdf")
|
|
.arg("--version")
|
|
.output()
|
|
.await
|
|
{
|
|
Ok(output) => output.status.success(),
|
|
Err(_) => false,
|
|
}
|
|
}
|
|
|
|
    /// Non-OCR build: extracted text is always considered sufficient, since no
    /// OCR fallback exists to improve on it.
    #[cfg(not(feature = "ocr"))]
    fn is_text_extraction_quality_sufficient(&self, _text: &str, _word_count: usize, _file_size: u64) -> bool {
        // When OCR is disabled, always accept text extraction results
        true
    }
|
|
|
|
    /// Non-OCR build: ocrmypdf is never probed, so it is reported unavailable.
    #[cfg(not(feature = "ocr"))]
    async fn is_ocrmypdf_available(&self) -> bool {
        false // OCR feature not enabled
    }
|
|
|
|
    /// Non-OCR build: image-based PDFs cannot be processed; always errors.
    #[cfg(not(feature = "ocr"))]
    async fn extract_text_from_pdf_with_ocr(&self, file_path: &str, _settings: &Settings, _start_time: std::time::Instant) -> Result<OcrResult> {
        Err(anyhow::anyhow!("OCR feature not enabled - cannot process image-based PDF: {}", file_path))
    }
|
|
|
|
    /// Resolve file path to actual location, handling both old and new directory structures
    ///
    /// Thin delegation to `FileService::resolve_file_path`; kept as a method so
    /// callers within this service don't reach into `file_service` directly.
    async fn resolve_file_path(&self, file_path: &str) -> Result<String> {
        // Use the FileService's resolve_file_path method
        self.file_service.resolve_file_path(file_path).await
    }
|
|
|
|
/// Extract text from any supported file type with enhanced logging
|
|
pub async fn extract_text_with_context(&self, file_path: &str, mime_type: &str, filename: &str, file_size: i64, settings: &Settings) -> Result<OcrResult> {
|
|
// Format file size for better readability
|
|
let file_size_mb = file_size as f64 / (1024.0 * 1024.0);
|
|
|
|
info!(
|
|
"Starting OCR extraction | File: '{}' | Type: {} | Size: {:.2} MB | Path: {}",
|
|
filename, mime_type, file_size_mb, file_path
|
|
);
|
|
|
|
self.extract_text(file_path, mime_type, settings).await
|
|
}
|
|
|
|
    /// Extract text from Office documents (DOCX, DOC, Excel) using XML extraction
    ///
    /// Rejects files larger than `MAX_OFFICE_DOCUMENT_SIZE`, then delegates to
    /// `XmlOfficeExtractor` and maps its result into an `OcrResult` so callers
    /// built around the OCR path keep working.
    pub async fn extract_text_from_office(&self, file_path: &str, mime_type: &str, settings: &Settings) -> Result<OcrResult> {
        let start_time = std::time::Instant::now();
        info!("Extracting text from Office document: {} (type: {})", file_path, mime_type);

        // Check file size before processing
        let metadata = tokio::fs::metadata(file_path).await?;
        let file_size = metadata.len();

        if file_size > Self::MAX_OFFICE_DOCUMENT_SIZE {
            return Err(anyhow!(
                "Office document too large: {:.1} MB (max: {:.1} MB). Consider converting to PDF or splitting the document.",
                file_size as f64 / (1024.0 * 1024.0),
                Self::MAX_OFFICE_DOCUMENT_SIZE as f64 / (1024.0 * 1024.0)
            ));
        }

        // Use XML extraction as the primary method
        let xml_extractor = XmlOfficeExtractor::new(self.temp_dir.clone());
        let xml_result = xml_extractor.extract_text_from_office(file_path, mime_type).await?;

        let total_time = start_time.elapsed().as_millis() as u64;

        info!(
            "Office document extraction completed: {} words in {}ms using XML extraction",
            xml_result.word_count,
            total_time
        );

        // Convert OfficeExtractionResult to OcrResult for backward compatibility
        Ok(OcrResult {
            text: xml_result.text,
            confidence: xml_result.confidence,
            processing_time_ms: xml_result.processing_time_ms,
            word_count: xml_result.word_count,
            preprocessing_applied: vec![format!("XML extraction - {}", xml_result.extraction_method)],
            processed_image_path: None,
        })
    }
|
|
|
|
    /// Extract text from any supported file type
    ///
    /// Dispatches on MIME type: PDFs and images go through the OCR pipeline
    /// (only when the `ocr` feature is compiled in), plain text is read
    /// directly with size limits, and Office formats go through the XML
    /// extractor. Any other MIME type is rejected.
    pub async fn extract_text(&self, file_path: &str, mime_type: &str, settings: &Settings) -> Result<OcrResult> {
        // Resolve the actual file path (handles old vs. new directory layouts)
        let resolved_path = self.resolve_file_path(file_path).await?;
        match mime_type {
            "application/pdf" => {
                #[cfg(feature = "ocr")]
                {
                    self.extract_text_from_pdf(&resolved_path, settings).await
                }
                #[cfg(not(feature = "ocr"))]
                {
                    Err(anyhow::anyhow!("OCR feature not enabled"))
                }
            }
            mime if mime.starts_with("image/") => {
                #[cfg(feature = "ocr")]
                {
                    self.extract_text_from_image(&resolved_path, settings).await
                }
                #[cfg(not(feature = "ocr"))]
                {
                    Err(anyhow::anyhow!("OCR feature not enabled"))
                }
            }
            "text/plain" => {
                let start_time = std::time::Instant::now();

                // Check file size before loading into memory
                let metadata = tokio::fs::metadata(&resolved_path).await?;
                let file_size = metadata.len();

                // Limit text file size to 50MB to prevent memory exhaustion
                const MAX_TEXT_FILE_SIZE: u64 = 50 * 1024 * 1024; // 50MB
                if file_size > MAX_TEXT_FILE_SIZE {
                    return Err(anyhow!(
                        "Text file too large: {:.1} MB (max: {:.1} MB). Consider splitting the file.",
                        file_size as f64 / (1024.0 * 1024.0),
                        MAX_TEXT_FILE_SIZE as f64 / (1024.0 * 1024.0)
                    ));
                }

                let text = tokio::fs::read_to_string(&resolved_path).await?;

                // Only remove null bytes - preserve all original formatting
                let cleaned_text = Self::remove_null_bytes(&text);

                // Limit text content size in memory
                const MAX_TEXT_CONTENT_SIZE: usize = 10 * 1024 * 1024; // 10MB of text content
                let trimmed_text = if cleaned_text.len() > MAX_TEXT_CONTENT_SIZE {
                    // NOTE(review): this byte-slices at MAX_TEXT_CONTENT_SIZE and
                    // would panic if that index is not a UTF-8 char boundary —
                    // worth confirming/hardening for multi-byte text.
                    warn!("Text file content too large ({} chars), truncating to {} chars", cleaned_text.len(), MAX_TEXT_CONTENT_SIZE);
                    format!("{}... [TEXT TRUNCATED DUE TO SIZE]", &cleaned_text[..MAX_TEXT_CONTENT_SIZE])
                } else {
                    cleaned_text.trim().to_string()
                };

                let processing_time = start_time.elapsed().as_millis() as u64;
                let word_count = self.count_words_safely(&trimmed_text);

                Ok(OcrResult {
                    text: trimmed_text,
                    confidence: 100.0, // Plain text is 100% confident
                    processing_time_ms: processing_time,
                    word_count,
                    preprocessing_applied: vec!["Plain text read".to_string()],
                    processed_image_path: None, // No image processing for plain text
                })
            }
            // Handle Office document formats
            mime if matches!(mime,
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document" |
                "application/msword" |
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" |
                "application/vnd.openxmlformats-officedocument.presentationml.presentation"
            ) => {
                // extract_text_from_office now returns OcrResult directly
                self.extract_text_from_office(&resolved_path, mime, settings).await
            }
            _ => Err(anyhow::anyhow!("Unsupported file type: {}", mime_type)),
        }
    }
|
|
|
|
/// Safely count words to prevent overflow on very large texts
|
|
#[cfg(feature = "ocr")]
|
|
pub fn count_words_safely(&self, text: &str) -> usize {
|
|
// For very large texts, sample to estimate word count to prevent overflow
|
|
if text.len() > 1_000_000 { // > 1MB of text
|
|
// Sample first 100KB and extrapolate
|
|
let sample_size = 100_000;
|
|
let sample_text = &text[..sample_size.min(text.len())];
|
|
let sample_words = self.count_words_in_text(sample_text);
|
|
let estimated_total = (sample_words as f64 * (text.len() as f64 / sample_size as f64)) as usize;
|
|
|
|
// Cap at reasonable maximum to prevent display issues
|
|
estimated_total.min(10_000_000) // Max 10M words
|
|
} else {
|
|
self.count_words_in_text(text)
|
|
}
|
|
}
|
|
|
|
#[cfg(feature = "ocr")]
|
|
fn count_words_in_text(&self, text: &str) -> usize {
|
|
let whitespace_words = text.split_whitespace().count();
|
|
|
|
// If we have exactly 1 "word" but it's very long (likely continuous text), try enhanced detection
|
|
// OR if we have no whitespace words but text exists
|
|
let is_continuous_text = whitespace_words == 1 && text.len() > 15; // 15+ chars suggests it might be continuous
|
|
let is_no_words = whitespace_words == 0 && !text.trim().is_empty();
|
|
|
|
if is_continuous_text || is_no_words {
|
|
// Count total alphanumeric characters first
|
|
let alphanumeric_chars = text.chars().filter(|c| c.is_alphanumeric()).count();
|
|
|
|
// If no alphanumeric content, it's pure punctuation/symbols
|
|
if alphanumeric_chars == 0 {
|
|
return 0;
|
|
}
|
|
|
|
// For continuous text, look for word boundaries using multiple strategies
|
|
let mut word_count = 0;
|
|
|
|
// Strategy 1: Count transitions from lowercase to uppercase (camelCase detection)
|
|
let chars: Vec<char> = text.chars().collect();
|
|
let mut camel_transitions = 0;
|
|
|
|
for i in 1..chars.len() {
|
|
let prev_char = chars[i-1];
|
|
let curr_char = chars[i];
|
|
|
|
// Count transitions from lowercase letter to uppercase letter
|
|
if prev_char.is_lowercase() && curr_char.is_uppercase() {
|
|
camel_transitions += 1;
|
|
}
|
|
// Count transitions from letter to digit or digit to letter
|
|
else if (prev_char.is_alphabetic() && curr_char.is_numeric()) ||
|
|
(prev_char.is_numeric() && curr_char.is_alphabetic()) {
|
|
camel_transitions += 1;
|
|
}
|
|
}
|
|
|
|
// If we found camelCase transitions, estimate words
|
|
if camel_transitions > 0 {
|
|
word_count = camel_transitions + 1; // +1 for the first word
|
|
}
|
|
|
|
// Strategy 2: If no camelCase detected, estimate based on character count
|
|
if word_count == 0 {
|
|
// Estimate based on typical word length (4-6 characters per word)
|
|
word_count = (alphanumeric_chars / 5).max(1);
|
|
}
|
|
|
|
word_count
|
|
} else {
|
|
whitespace_words
|
|
}
|
|
}
|
|
|
|
    /// Validate OCR result quality
    ///
    /// Returns `Err(reason)` only for results that are almost certainly
    /// garbage: confidence under an absolute floor of 5%, empty text, zero
    /// words, or under 10% alphanumeric content. Results merely below the
    /// configured `ocr_min_confidence` are accepted but logged for review.
    #[cfg(feature = "ocr")]
    pub fn validate_ocr_quality(&self, result: &OcrResult, settings: &Settings) -> Result<(), String> {
        // Hard reject completely unreliable OCR (likely corrupted/garbage)
        const HARD_MINIMUM_CONFIDENCE: f32 = 5.0;
        if result.confidence < HARD_MINIMUM_CONFIDENCE {
            return Err(format!(
                "OCR confidence critically low: {:.1}% (absolute minimum: {:.1}%) - likely corrupted input",
                result.confidence,
                HARD_MINIMUM_CONFIDENCE
            ));
        }

        // Log warning for low confidence instead of rejecting
        if result.confidence < settings.ocr_min_confidence {
            warn!(
                "OCR confidence below recommended threshold: {:.1}% (recommended: {:.1}%) - accepting but flagging for review",
                result.confidence,
                settings.ocr_min_confidence
            );
        }

        // Check empty text FIRST (before word count check) so the error
        // message distinguishes "no characters" from "no words".
        let total_chars = result.text.len();
        if total_chars == 0 {
            return Err("OCR result contains no characters".to_string());
        }

        // THEN check word count
        if result.word_count == 0 {
            return Err("No words detected in OCR output".to_string());
        }

        // Count valuable content: letters + digits (explicitly treating digits as good content)
        let content_chars = result.text.chars().filter(|c| c.is_alphanumeric()).count();
        let content_ratio = content_chars as f32 / total_chars as f32;

        // Only reject if >90% symbols (likely OCR garbage)
        // This allows bills/receipts/invoices with numbers and formatting characters
        const MIN_CONTENT_RATIO: f32 = 0.10;
        if content_ratio < MIN_CONTENT_RATIO {
            let symbol_ratio = 1.0 - content_ratio;
            return Err(format!(
                "OCR result has too little meaningful content: {:.1}% content (letters+digits), {:.1}% symbols/formatting (minimum content: {:.1}%)",
                content_ratio * 100.0,
                symbol_ratio * 100.0,
                MIN_CONTENT_RATIO * 100.0
            ));
        }

        // Log info for documents with reasonable content
        debug!(
            "OCR validation passed: {:.1}% confidence, {} words, {:.1}% content (letters+digits)",
            result.confidence,
            result.word_count,
            content_ratio * 100.0
        );

        Ok(())
    }
|
|
}
|
|
|
|
#[cfg(not(feature = "ocr"))]
|
|
impl EnhancedOcrService {
|
|
pub async fn extract_text_from_image(&self, _file_path: &str, _settings: &Settings) -> Result<OcrResult> {
|
|
Err(anyhow::anyhow!("OCR feature not enabled"))
|
|
}
|
|
|
|
pub async fn extract_text_from_pdf(&self, _file_path: &str, _settings: &Settings) -> Result<OcrResult> {
|
|
Err(anyhow::anyhow!("OCR feature not enabled"))
|
|
}
|
|
|
|
|
|
pub fn validate_ocr_quality(&self, _result: &OcrResult, _settings: &Settings) -> Result<(), String> {
|
|
Err("OCR feature not enabled".to_string())
|
|
}
|
|
|
|
pub fn count_words_safely(&self, text: &str) -> usize {
|
|
// Simple word count for non-OCR builds
|
|
text.split_whitespace().count()
|
|
}
|
|
}
|
|
|
|
/// Check if the given bytes represent a valid PDF file
/// Handles PDFs with leading null bytes or whitespace
///
/// The `%PDF-` magic may appear anywhere within the first 1 KB; inputs
/// shorter than the magic trivially fail (the window iterator is empty).
fn is_valid_pdf(data: &[u8]) -> bool {
    let search_limit = data.len().min(1024);
    data[..search_limit]
        .windows(5)
        .any(|window| window == b"%PDF-")
}
|
|
|
|
impl EnhancedOcrService {
|
|
/// Static version of configure_tesseract for use in spawn_blocking
|
|
#[cfg(feature = "ocr")]
|
|
fn configure_tesseract_static(image_path: &str, settings: &Settings) -> Result<Tesseract> {
|
|
let language_combination = Self::build_language_combination_static(settings);
|
|
let mut tesseract = Tesseract::new(None, Some(&language_combination))?;
|
|
|
|
// Set the image
|
|
tesseract = tesseract.set_image(image_path)?;
|
|
|
|
// Configure Page Segmentation Mode (PSM)
|
|
let psm = match settings.ocr_page_segmentation_mode {
|
|
0 => PageSegMode::PsmOsdOnly,
|
|
1 => PageSegMode::PsmAutoOsd,
|
|
2 => PageSegMode::PsmAutoOnly,
|
|
3 => PageSegMode::PsmAuto,
|
|
4 => PageSegMode::PsmSingleColumn,
|
|
5 => PageSegMode::PsmSingleBlockVertText,
|
|
6 => PageSegMode::PsmSingleBlock,
|
|
7 => PageSegMode::PsmSingleLine,
|
|
8 => PageSegMode::PsmSingleWord,
|
|
9 => PageSegMode::PsmCircleWord,
|
|
10 => PageSegMode::PsmSingleChar,
|
|
11 => PageSegMode::PsmSparseText,
|
|
12 => PageSegMode::PsmSparseTextOsd,
|
|
13 => PageSegMode::PsmRawLine,
|
|
_ => PageSegMode::PsmAuto, // Default fallback
|
|
};
|
|
tesseract.set_page_seg_mode(psm);
|
|
|
|
// Configure OCR Engine Mode (OEM)
|
|
let _oem = match settings.ocr_engine_mode {
|
|
0 => OcrEngineMode::TesseractOnly,
|
|
1 => OcrEngineMode::LstmOnly,
|
|
2 => OcrEngineMode::TesseractOnly, // Fallback since TesseractLstm doesn't exist
|
|
3 => OcrEngineMode::Default,
|
|
_ => OcrEngineMode::Default, // Default fallback
|
|
};
|
|
|
|
Ok(tesseract)
|
|
}
|
|
|
|
/// Static version of calculate_overall_confidence for use in spawn_blocking
|
|
#[cfg(feature = "ocr")]
|
|
fn calculate_overall_confidence_static(tesseract: &mut Tesseract) -> Result<f32> {
|
|
// Use Tesseract's built-in mean confidence calculation
|
|
let confidence = tesseract.mean_text_conf();
|
|
|
|
// Convert from i32 to f32 and ensure it's within valid range
|
|
let confidence_f32 = confidence as f32;
|
|
|
|
// Clamp confidence to valid range (0.0 to 100.0)
|
|
let clamped_confidence = confidence_f32.max(0.0).min(100.0);
|
|
|
|
debug!("Tesseract confidence: {} -> {:.1}%", confidence, clamped_confidence);
|
|
|
|
Ok(clamped_confidence)
|
|
}
|
|
|
|
/// Static version of build_language_combination for use in spawn_blocking
|
|
fn build_language_combination_static(settings: &Settings) -> String {
|
|
if settings.preferred_languages.len() > 1 {
|
|
// Use preferred_languages with primary_language first
|
|
let mut languages = settings.preferred_languages.clone();
|
|
|
|
// Ensure primary language is first
|
|
languages.retain(|lang| lang != &settings.primary_language);
|
|
languages.insert(0, settings.primary_language.clone());
|
|
|
|
// Join with + for Tesseract multi-language format
|
|
languages.join("+")
|
|
} else if !settings.preferred_languages.is_empty() {
|
|
// Single language from preferred_languages
|
|
settings.preferred_languages[0].clone()
|
|
} else {
|
|
// Fallback to ocr_language field for backward compatibility
|
|
settings.ocr_language.clone()
|
|
}
|
|
}
|
|
} |