1355 lines
56 KiB
Rust
1355 lines
56 KiB
Rust
use anyhow::{anyhow, Result};
|
|
use tracing::{debug, info, warn};
|
|
use std::panic::{catch_unwind, AssertUnwindSafe};
|
|
|
|
#[cfg(feature = "ocr")]
|
|
use image::{DynamicImage, ImageBuffer, Luma, GenericImageView};
|
|
#[cfg(feature = "ocr")]
|
|
use imageproc::{
|
|
contrast::adaptive_threshold,
|
|
morphology::{close, open},
|
|
filter::{median_filter, gaussian_blur_f32},
|
|
distance_transform::Norm,
|
|
};
|
|
#[cfg(feature = "ocr")]
|
|
use tesseract::{Tesseract, PageSegMode, OcrEngineMode};
|
|
|
|
use crate::models::Settings;
|
|
use crate::services::file_service::FileService;
|
|
|
|
/// Aggregate quality metrics for a grayscale image, used to decide which
/// preprocessing steps are worth applying before OCR.
#[derive(Debug, Clone)]
pub struct ImageQualityStats {
    /// Mean pixel intensity (0.0-255.0); low values indicate a dim image.
    pub average_brightness: f32,
    /// Normalized standard deviation of intensity (stddev / 255.0).
    pub contrast_ratio: f32,
    /// Mean local deviation from the 3x3 neighborhood average, / 255.0;
    /// higher means noisier.
    pub noise_level: f32,
    /// Mean gradient magnitude / 255.0; lower values indicate blur.
    pub sharpness: f32,
}
|
|
|
|
/// Outcome of a single OCR run, including extracted text and diagnostics.
#[derive(Debug, Clone)]
pub struct OcrResult {
    /// The extracted, trimmed text.
    pub text: String,
    /// Tesseract mean text confidence, clamped to 0.0-100.0.
    pub confidence: f32,
    /// Wall-clock processing time in milliseconds.
    pub processing_time_ms: u64,
    /// Number of whitespace-separated words in `text`.
    pub word_count: usize,
    /// Human-readable names of the preprocessing steps that were applied.
    pub preprocessing_applied: Vec<String>,
    /// Path to the preprocessed temp image when one was produced
    /// (i.e. preprocessing changed the input); `None` otherwise.
    pub processed_image_path: Option<String>,
}
|
|
|
|
/// OCR service that layers adaptive image preprocessing on top of Tesseract.
pub struct EnhancedOcrService {
    /// Directory where intermediate processed images are written.
    pub temp_dir: String,
    /// File service rooted at `UPLOAD_PATH`; used by path-resolution
    /// helpers (see `resolve_file_path` usage in `preprocess_image`).
    pub file_service: FileService,
}
|
|
|
|
impl EnhancedOcrService {
|
|
pub fn new(temp_dir: String) -> Self {
|
|
let upload_path = std::env::var("UPLOAD_PATH").unwrap_or_else(|_| "./uploads".to_string());
|
|
let file_service = FileService::new(upload_path);
|
|
Self { temp_dir, file_service }
|
|
}
|
|
|
|
/// Extract text from image with high-quality OCR settings.
///
/// Optionally preprocesses the image (per `settings`), then runs Tesseract
/// on a blocking thread so the async runtime is not stalled by CPU-bound
/// work. Returns the text plus confidence/timing diagnostics. The
/// preprocessed temp image is deleted unless
/// `settings.save_processed_images` is set.
#[cfg(feature = "ocr")]
pub async fn extract_text_from_image(&self, file_path: &str, settings: &Settings) -> Result<OcrResult> {
    let start_time = std::time::Instant::now();
    info!("Starting enhanced OCR for image: {}", file_path);

    let mut preprocessing_applied = Vec::new();

    // Load and preprocess the image
    let (processed_image_path, mut preprocess_steps) = if settings.enable_image_preprocessing {
        let (processed_path, steps) = self.preprocess_image(file_path, settings).await?;
        (processed_path, steps)
    } else {
        // Preprocessing disabled: OCR the original file directly.
        (file_path.to_string(), Vec::new())
    };

    preprocessing_applied.extend(preprocess_steps);

    // Move CPU-intensive OCR operations to blocking thread pool.
    // Clones are taken because the closure must be 'static and cannot
    // borrow `self` or the arguments.
    let processed_image_path_clone = processed_image_path.clone();
    let settings_clone = settings.clone();
    let temp_dir = self.temp_dir.clone();

    let ocr_result = tokio::task::spawn_blocking(move || -> Result<(String, f32)> {
        // Configure Tesseract with optimal settings. A fresh service is
        // built inside the closure since `self` cannot cross into it.
        let ocr_service = EnhancedOcrService::new(temp_dir);
        let mut tesseract = ocr_service.configure_tesseract(&processed_image_path_clone, &settings_clone)?;

        // Extract text with confidence
        let text = tesseract.get_text()?.trim().to_string();
        let confidence = ocr_service.calculate_overall_confidence(&mut tesseract)?;

        Ok((text, confidence))
    }).await??; // first `?` = task join error, second `?` = OCR error

    let (text, confidence) = ocr_result;

    let processing_time = start_time.elapsed().as_millis() as u64;
    let word_count = text.split_whitespace().count();

    debug!(
        "OCR completed: {} words, {:.1}% confidence, {}ms",
        word_count, confidence, processing_time
    );

    // Return the processed image path if different from original (caller will handle cleanup/saving)
    let result_processed_image_path = if processed_image_path != file_path {
        Some(processed_image_path.clone())
    } else {
        None
    };

    let result = OcrResult {
        text,
        confidence,
        processing_time_ms: processing_time,
        word_count,
        preprocessing_applied,
        processed_image_path: result_processed_image_path,
    };

    // Clean up temporary files if not saved for review
    if let Some(ref temp_path) = result.processed_image_path {
        if !settings.save_processed_images {
            // Best-effort delete; failing to remove a temp file is not fatal.
            let _ = tokio::fs::remove_file(temp_path).await;
        }
    }

    Ok(result)
}
|
|
|
|
/// Preprocess image for optimal OCR quality, especially for challenging conditions.
///
/// Pipeline: resolve path -> optional orientation fix -> resize for OCR ->
/// grayscale -> quality analysis -> selective enhancements (brightness,
/// noise, contrast, sharpening, morphology) -> save as a temp PNG.
/// Returns the temp file path and the list of applied step names.
#[cfg(feature = "ocr")]
async fn preprocess_image(&self, input_path: &str, settings: &Settings) -> Result<(String, Vec<String>)> {
    // Resolve the file path first
    let resolved_path = self.resolve_file_path(input_path).await?;
    let img = image::open(&resolved_path)?;
    let mut processed_img = img;
    let mut preprocessing_applied = Vec::new();

    info!("Original image dimensions: {}x{}", processed_img.width(), processed_img.height());

    // Apply orientation detection and correction
    if settings.ocr_detect_orientation {
        processed_img = self.detect_and_correct_orientation(processed_img)?;
    }

    // Aggressively upscale low-resolution images for better OCR
    processed_img = self.smart_resize_for_ocr(processed_img, settings.ocr_dpi)?;

    // Convert to grayscale for better OCR
    let gray_img = processed_img.to_luma8();
    let mut processed_gray = gray_img;

    // Analyze image quality and apply appropriate enhancements
    let quality_stats = self.analyze_image_quality(&processed_gray);
    info!("Image quality analysis: brightness={:.1}, contrast={:.1}, noise_level={:.1}, sharpness={:.1}",
        quality_stats.average_brightness, quality_stats.contrast_ratio, quality_stats.noise_level, quality_stats.sharpness);

    // Determine if image needs enhancement based on quality thresholds
    let needs_enhancement = self.needs_enhancement(&quality_stats, settings);

    if !needs_enhancement {
        info!("Image quality is good, skipping enhancement steps");
    } else {
        info!("Image quality needs improvement, applying selective enhancements");

        // Apply brightness correction only for very dim images
        // (or when the user explicitly configured a boost).
        if quality_stats.average_brightness < 50.0 || settings.ocr_brightness_boost > 0.0 {
            processed_gray = self.enhance_brightness_and_contrast(processed_gray, &quality_stats, settings)?;
            preprocessing_applied.push("Brightness/contrast correction".to_string());
        }

        // Apply noise removal only for very noisy images
        if quality_stats.noise_level > 0.25 || (settings.ocr_remove_noise && settings.ocr_noise_reduction_level > 1) {
            processed_gray = self.adaptive_noise_removal(processed_gray, &quality_stats, settings)?;
            preprocessing_applied.push("Noise reduction".to_string());
        }

        // Apply contrast enhancement only for very low contrast images
        if quality_stats.contrast_ratio < 0.2 || (settings.ocr_enhance_contrast && settings.ocr_adaptive_threshold_window_size > 0) {
            // Keep a copy so a failed enhancement can fall back cleanly.
            let original_gray = processed_gray.clone();
            match self.adaptive_contrast_enhancement(processed_gray, &quality_stats, settings) {
                Ok(enhanced) => {
                    processed_gray = enhanced;
                    preprocessing_applied.push("Contrast enhancement".to_string());
                }
                Err(e) => {
                    warn!("Contrast enhancement failed, using alternative method: {}", e);
                    // Fallback to basic contrast enhancement; if that also
                    // fails, continue with the unenhanced image.
                    processed_gray = self.apply_alternative_contrast_enhancement(original_gray.clone(), &quality_stats, settings)
                        .unwrap_or_else(|_| {
                            warn!("Alternative contrast enhancement also failed, using original image");
                            original_gray
                        });
                    preprocessing_applied.push("Basic contrast enhancement".to_string());
                }
            }
        }

        // Apply sharpening only for very blurry images
        if quality_stats.sharpness < 0.2 || settings.ocr_sharpening_strength > 0.5 {
            processed_gray = self.sharpen_image(processed_gray, settings)?;
            preprocessing_applied.push("Image sharpening".to_string());
        }

        // Apply morphological operations only if explicitly enabled and image needs it
        if settings.ocr_morphological_operations && quality_stats.noise_level > 0.15 {
            processed_gray = self.apply_morphological_operations(processed_gray)?;
            preprocessing_applied.push("Morphological operations".to_string());
        }
    }

    // Save processed image to temporary file. PID + epoch-millis keeps
    // the name unique across concurrent processes.
    let temp_filename = format!("processed_{}_{}.png",
        std::process::id(),
        std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)?.as_millis()
    );
    let temp_path = format!("{}/{}", self.temp_dir, temp_filename);

    let dynamic_processed = DynamicImage::ImageLuma8(processed_gray);
    dynamic_processed.save(&temp_path)?;

    info!("Processed image saved to: {}", temp_path);
    Ok((temp_path, preprocessing_applied))
}
|
|
|
|
/// Decide whether any enhancement pass should run, based on the measured
/// quality metrics, the user's quality thresholds, and explicit
/// enhancement overrides in settings.
#[cfg(feature = "ocr")]
fn needs_enhancement(&self, stats: &ImageQualityStats, settings: &Settings) -> bool {
    // Respect an explicit opt-out before consulting any metric.
    if settings.ocr_skip_enhancement {
        info!("OCR enhancement disabled by user setting");
        return false;
    }

    // Compare each metric against its user-configurable threshold.
    let needs_brightness_fix = stats.average_brightness < settings.ocr_quality_threshold_brightness;
    let needs_contrast_fix = stats.contrast_ratio < settings.ocr_quality_threshold_contrast;
    let needs_noise_fix = stats.noise_level > settings.ocr_quality_threshold_noise;
    let needs_sharpening = stats.sharpness < settings.ocr_quality_threshold_sharpness;

    // An explicit enhancement request in settings also forces a pass.
    let user_wants_enhancement = settings.ocr_brightness_boost > 0.0
        || settings.ocr_contrast_multiplier > 1.0
        || settings.ocr_noise_reduction_level > 1
        || settings.ocr_sharpening_strength > 0.0;

    let needs_enhancement = needs_brightness_fix
        || needs_contrast_fix
        || needs_noise_fix
        || needs_sharpening
        || user_wants_enhancement;

    info!("Enhancement decision: brightness_ok={}, contrast_ok={}, noise_ok={}, sharpness_ok={}, user_enhancement={}, needs_enhancement={}",
        !needs_brightness_fix, !needs_contrast_fix, !needs_noise_fix, !needs_sharpening, user_wants_enhancement, needs_enhancement);

    needs_enhancement
}
|
|
|
|
/// Build language combination string for Tesseract (e.g., "eng+spa")
|
|
fn build_language_combination(&self, settings: &Settings) -> String {
|
|
if settings.preferred_languages.len() > 1 {
|
|
// Use preferred_languages with primary_language first
|
|
let mut languages = settings.preferred_languages.clone();
|
|
|
|
// Ensure primary language is first
|
|
languages.retain(|lang| lang != &settings.primary_language);
|
|
languages.insert(0, settings.primary_language.clone());
|
|
|
|
// Join with + for Tesseract multi-language format
|
|
languages.join("+")
|
|
} else if !settings.preferred_languages.is_empty() {
|
|
// Single language from preferred_languages
|
|
settings.preferred_languages[0].clone()
|
|
} else {
|
|
// Fallback to ocr_language field for backward compatibility
|
|
settings.ocr_language.clone()
|
|
}
|
|
}
|
|
|
|
/// Configure Tesseract with optimal settings.
///
/// Creates a Tesseract handle for the combined language string (see
/// `build_language_combination`), binds the input image, and applies the
/// page segmentation mode from settings. Unknown PSM/OEM values fall
/// back to the defaults.
#[cfg(feature = "ocr")]
fn configure_tesseract(&self, image_path: &str, settings: &Settings) -> Result<Tesseract> {
    let language_combination = self.build_language_combination(settings);
    let mut tesseract = Tesseract::new(None, Some(&language_combination))?;

    // Set the image (builder-style API: consumes and returns the handle)
    tesseract = tesseract.set_image(image_path)?;

    // Configure Page Segmentation Mode (PSM): map the raw numeric setting
    // onto the crate's enum, one variant per Tesseract PSM value.
    let psm = match settings.ocr_page_segmentation_mode {
        0 => PageSegMode::PsmOsdOnly,
        1 => PageSegMode::PsmAutoOsd,
        2 => PageSegMode::PsmAutoOnly,
        3 => PageSegMode::PsmAuto,
        4 => PageSegMode::PsmSingleColumn,
        5 => PageSegMode::PsmSingleBlockVertText,
        6 => PageSegMode::PsmSingleBlock,
        7 => PageSegMode::PsmSingleLine,
        8 => PageSegMode::PsmSingleWord,
        9 => PageSegMode::PsmCircleWord,
        10 => PageSegMode::PsmSingleChar,
        11 => PageSegMode::PsmSparseText,
        12 => PageSegMode::PsmSparseTextOsd,
        13 => PageSegMode::PsmRawLine,
        _ => PageSegMode::PsmAuto, // Default fallback
    };
    tesseract.set_page_seg_mode(psm);

    // Configure OCR Engine Mode (OEM): decoded but deliberately unused —
    // see the note below about the crate's missing engine-mode setter.
    let _oem = match settings.ocr_engine_mode {
        0 => OcrEngineMode::TesseractOnly,
        1 => OcrEngineMode::LstmOnly,
        2 => OcrEngineMode::TesseractLstmCombined,
        3 => OcrEngineMode::Default,
        _ => OcrEngineMode::Default, // Default fallback
    };

    // Note: set_engine_mode may not be available in the current tesseract crate version
    // We'll configure this differently if needed

    // Basic configuration - skip advanced settings that might cause issues
    // Only set essential variables that are widely supported

    Ok(tesseract)
}
|
|
|
|
/// Calculate overall confidence score using Tesseract's mean confidence.
///
/// Returns the mean text confidence as a percentage, clamped to the
/// valid 0.0-100.0 range (the raw value is an `i32` that may fall
/// outside it).
#[cfg(feature = "ocr")]
fn calculate_overall_confidence(&self, tesseract: &mut Tesseract) -> Result<f32> {
    // Use Tesseract's built-in mean confidence calculation
    let confidence = tesseract.mean_text_conf();

    // `clamp` is the idiomatic form of the former `.max(0.0).min(100.0)` chain.
    let clamped_confidence = (confidence as f32).clamp(0.0, 100.0);

    debug!("Tesseract confidence: {} -> {:.1}%", confidence, clamped_confidence);

    Ok(clamped_confidence)
}
|
|
|
|
/// Detect and correct image orientation.
///
/// Heuristic only: images more than twice as wide as they are tall are
/// rotated 90 degrees; everything else is returned unchanged. A
/// production system would use Tesseract's OSD or a proper deskew
/// algorithm instead.
#[cfg(feature = "ocr")]
fn detect_and_correct_orientation(&self, img: DynamicImage) -> Result<DynamicImage> {
    let (width, height) = img.dimensions();
    let aspect_ratio = width as f32 / height as f32;

    // Very wide images are assumed to be rotated scans.
    Ok(if aspect_ratio > 2.0 { img.rotate90() } else { img })
}
|
|
|
|
/// Smart resize for OCR - optimize image size for best OCR performance.
///
/// Large images (longest side > 2048 px) are scaled down for speed and
/// memory; very small images (shortest side < 300 px) are scaled up so
/// glyphs are large enough for Tesseract. Aspect ratio is preserved and
/// Lanczos3 filtering is used. `_target_dpi` is currently unused; sizing
/// is driven purely by pixel dimensions.
#[cfg(feature = "ocr")]
fn smart_resize_for_ocr(&self, img: DynamicImage, _target_dpi: i32) -> Result<DynamicImage> {
    let (width, height) = img.dimensions();
    let max_dimension = width.max(height);
    let min_dimension = width.min(height);

    // Calculate optimal dimensions for OCR
    let mut new_width = width;
    let mut new_height = height;

    // Scale DOWN large images for better OCR performance and memory efficiency
    if max_dimension > 2048 {
        let scale_factor = 2048.0 / max_dimension as f32;
        // Clamp to >= 1 so extreme aspect ratios (e.g. 10000x1) cannot
        // round a dimension down to zero.
        new_width = ((width as f32 * scale_factor) as u32).max(1);
        new_height = ((height as f32 * scale_factor) as u32).max(1);
        info!("Scaling down large image ({}x{}) by factor {:.2}x to {}x{} for optimal OCR",
            width, height, scale_factor, new_width, new_height);
    }
    // Scale UP very small images that would produce poor OCR results.
    // The `> 0` guard avoids a division by zero (infinite scale factor)
    // for degenerate zero-sized images.
    else if min_dimension > 0 && min_dimension < 300 {
        let scale_factor = 600.0 / min_dimension as f32;
        new_width = (width as f32 * scale_factor) as u32;
        new_height = (height as f32 * scale_factor) as u32;
        info!("Scaling up small image ({}x{}) by factor {:.2}x to {}x{} for better OCR",
            width, height, scale_factor, new_width, new_height);
    }

    if new_width != width || new_height != height {
        // Lanczos3 gives the best quality for both up- and down-scaling.
        Ok(img.resize(new_width, new_height, image::imageops::FilterType::Lanczos3))
    } else {
        Ok(img)
    }
}
|
|
|
|
/// Analyze image quality metrics.
///
/// Produces the brightness/contrast/noise/sharpness statistics used to
/// pick preprocessing steps. Images above 4 megapixels are sampled
/// rather than fully scanned to keep this fast and overflow-safe.
#[cfg(feature = "ocr")]
fn analyze_image_quality(&self, img: &ImageBuffer<Luma<u8>, Vec<u8>>) -> ImageQualityStats {
    let (width, height) = img.dimensions();
    let is_large = (width as u64) * (height as u64) > 4_000_000;

    // Brightness and variance come from either a full scan or a sampled one.
    let (average_brightness, variance) = if is_large {
        self.analyze_quality_sampled(img)
    } else {
        self.analyze_quality_full(img)
    };

    ImageQualityStats {
        average_brightness,
        // Normalized standard deviation serves as the contrast measure.
        contrast_ratio: variance.sqrt() / 255.0,
        noise_level: self.estimate_noise_level(img),
        sharpness: self.estimate_sharpness(img),
    }
}
|
|
|
|
/// Analyze quality for normal-sized images (< 4 megapixels).
///
/// Returns `(average_brightness, variance)` computed over every pixel.
/// Empty (zero-sized) images return `(0.0, 0.0)` instead of dividing by
/// zero. Iterates the buffer twice instead of collecting all pixel
/// values into an intermediate `Vec`, saving a full-image allocation.
#[cfg(feature = "ocr")]
fn analyze_quality_full(&self, img: &ImageBuffer<Luma<u8>, Vec<u8>>) -> (f32, f32) {
    let pixel_count = (img.width() as u64 * img.height() as u64) as f32;
    if pixel_count == 0.0 {
        return (0.0, 0.0);
    }

    // First pass: mean brightness. Sum in u64 to prevent overflow.
    let sum: u64 = img.pixels().map(|p| p[0] as u64).sum();
    let average_brightness = sum as f32 / pixel_count;

    // Second pass: variance around the mean.
    let variance: f32 = img
        .pixels()
        .map(|p| {
            let diff = p[0] as f32 - average_brightness;
            diff * diff
        })
        .sum::<f32>()
        / pixel_count;

    (average_brightness, variance)
}
|
|
|
|
/// Analyze quality for large images using sampling.
///
/// Samples every 10th pixel along both axes to estimate
/// `(average_brightness, variance)` without scanning millions of pixels
/// or risking accumulator overflow.
#[cfg(feature = "ocr")]
fn analyze_quality_sampled(&self, img: &ImageBuffer<Luma<u8>, Vec<u8>>) -> (f32, f32) {
    let (width, height) = img.dimensions();

    // Pass 1: sum sampled intensities for the mean (u64 accumulator).
    let mut pixel_sum = 0u64;
    let mut sample_count = 0u32;
    for y in (0..height).step_by(10) {
        for x in (0..width).step_by(10) {
            pixel_sum += img.get_pixel(x, y)[0] as u64;
            sample_count += 1;
        }
    }

    if sample_count == 0 {
        // Degenerate (zero-sized) image: report mid-gray, zero variance.
        return (128.0, 0.0);
    }

    let average_brightness = pixel_sum as f32 / sample_count as f32;

    // Pass 2: squared deviations over the same sample grid.
    let mut variance_sum = 0.0f32;
    for y in (0..height).step_by(10) {
        for x in (0..width).step_by(10) {
            let diff = img.get_pixel(x, y)[0] as f32 - average_brightness;
            variance_sum += diff * diff;
        }
    }

    (average_brightness, variance_sum / sample_count as f32)
}
|
|
|
|
/// Estimate noise level in image.
///
/// Samples every 10th interior pixel (with a 5 px margin) and measures
/// how far each deviates from the mean of its 3x3 neighborhood; the
/// average deviation normalized by 255 is the noise level. Returns 0.0
/// for images too small to have a sampled interior.
#[cfg(feature = "ocr")]
fn estimate_noise_level(&self, img: &ImageBuffer<Luma<u8>, Vec<u8>>) -> f32 {
    let (width, height) = img.dimensions();

    // Guard: the loops below need a 5 px margin on every side; on images
    // narrower/shorter than that, `width - 5` / `height - 5` would
    // underflow (u32) and panic. Dimensions of 6..=10 previously yielded
    // an empty range and 0.0, so returning 0.0 here preserves behavior.
    if width <= 10 || height <= 10 {
        return 0.0;
    }

    let mut noise_sum = 0.0f32;
    let mut sample_count = 0u32;

    // Sample every 10th pixel to estimate noise
    for y in (5..height - 5).step_by(10) {
        for x in (5..width - 5).step_by(10) {
            let center = img.get_pixel(x, y)[0] as f32;
            let mut neighbor_sum = 0.0f32;
            let mut neighbor_count = 0u32;

            // Average the 8 surrounding pixels (3x3 minus the center)
            for dy in -1i32..=1 {
                for dx in -1i32..=1 {
                    if dx == 0 && dy == 0 { continue; }
                    let neighbor =
                        img.get_pixel((x as i32 + dx) as u32, (y as i32 + dy) as u32)[0] as f32;
                    neighbor_sum += neighbor;
                    neighbor_count += 1;
                }
            }

            // Deviation from the neighborhood mean approximates local noise.
            let neighbor_avg = neighbor_sum / neighbor_count as f32;
            noise_sum += (center - neighbor_avg).abs();
            sample_count += 1;
        }
    }

    if sample_count > 0 {
        (noise_sum / sample_count as f32) / 255.0
    } else {
        0.0
    }
}
|
|
|
|
/// Estimate image sharpness using gradient magnitude.
///
/// Computes central-difference gradients over interior pixels and
/// returns the mean gradient magnitude normalized by 255. Images above
/// 4 megapixels are sampled every 10th pixel; smaller images are scanned
/// fully. Returns 0.0 when there are no interior pixels; the explicit
/// guard also keeps `width - 1` / `height - 1` from underflowing (and
/// panicking) on zero-sized images.
#[cfg(feature = "ocr")]
fn estimate_sharpness(&self, img: &ImageBuffer<Luma<u8>, Vec<u8>>) -> f32 {
    let (width, height) = img.dimensions();

    // Need at least a 3x3 image to have any interior pixels.
    if width < 3 || height < 3 {
        return 0.0;
    }

    let mut gradient_sum = 0.0f32;
    let mut sample_count = 0u64; // u64 to prevent overflow on huge images

    // For large images, sample pixels to bound the work.
    let total_pixels = (width as u64) * (height as u64);
    let step_size = if total_pixels > 4_000_000 { 10 } else { 1 };

    // Central differences over interior pixels only
    for y in (1..height - 1).step_by(step_size) {
        for x in (1..width - 1).step_by(step_size) {
            let left = img.get_pixel(x - 1, y)[0] as f32;
            let right = img.get_pixel(x + 1, y)[0] as f32;
            let top = img.get_pixel(x, y - 1)[0] as f32;
            let bottom = img.get_pixel(x, y + 1)[0] as f32;

            let grad_x = (right - left) / 2.0;
            let grad_y = (bottom - top) / 2.0;
            gradient_sum += (grad_x * grad_x + grad_y * grad_y).sqrt();
            sample_count += 1;
        }
    }

    if sample_count > 0 {
        (gradient_sum / sample_count as f32) / 255.0
    } else {
        0.0
    }
}
|
|
|
|
/// Enhanced brightness and contrast correction for dim images.
///
/// Boost and multiplier come from user settings when set (> 0), otherwise
/// from the measured statistics: dimmer/flatter images get progressively
/// stronger correction. Each pixel is transformed as
/// `(value + boost) * multiplier`, clamped to 0-255.
#[cfg(feature = "ocr")]
fn enhance_brightness_and_contrast(&self, img: ImageBuffer<Luma<u8>, Vec<u8>>, stats: &ImageQualityStats, settings: &Settings) -> Result<ImageBuffer<Luma<u8>, Vec<u8>>> {
    // User-configured boost wins; otherwise derive from measured brightness.
    let brightness_boost = if settings.ocr_brightness_boost > 0.0 {
        settings.ocr_brightness_boost
    } else if stats.average_brightness < 50.0 {
        // Very dim image: aggressive lift toward a brighter baseline.
        60.0 - stats.average_brightness
    } else if stats.average_brightness < 80.0 {
        // Moderately dim: smaller boost that tapers off with brightness.
        30.0 - (stats.average_brightness - 50.0) * 0.5
    } else {
        0.0
    };

    // Same precedence for contrast: user setting first, then measured contrast.
    let contrast_multiplier = if settings.ocr_contrast_multiplier > 0.0 {
        settings.ocr_contrast_multiplier
    } else if stats.contrast_ratio < 0.2 {
        2.5 // Aggressive boost for flat images
    } else if stats.contrast_ratio < 0.4 {
        1.8 // Moderate boost
    } else {
        1.2 // Slight boost
    };

    info!("Applying brightness boost: {:.1}, contrast multiplier: {:.1}", brightness_boost, contrast_multiplier);

    let (width, height) = img.dimensions();
    let mut enhanced = ImageBuffer::new(width, height);
    for (x, y, pixel) in img.enumerate_pixels() {
        let adjusted = ((pixel[0] as f32 + brightness_boost) * contrast_multiplier).round();
        enhanced.put_pixel(x, y, Luma([adjusted.max(0.0).min(255.0) as u8]));
    }

    Ok(enhanced)
}
|
|
|
|
/// Adaptive noise removal based on detected noise level.
///
/// The strength (1 = light, 2 = moderate, 3 = heavy) is taken from the
/// user's `ocr_noise_reduction_level` when set, otherwise derived from
/// the measured noise statistics. Heavier levels combine a larger median
/// filter with a Gaussian blur.
#[cfg(feature = "ocr")]
fn adaptive_noise_removal(&self, img: ImageBuffer<Luma<u8>, Vec<u8>>, stats: &ImageQualityStats, settings: &Settings) -> Result<ImageBuffer<Luma<u8>, Vec<u8>>> {
    // An explicit user level takes precedence over the measured noise.
    let level = if settings.ocr_noise_reduction_level > 0 {
        settings.ocr_noise_reduction_level
    } else if stats.noise_level > 0.2 {
        3 // Heavy noise
    } else if stats.noise_level > 0.1 {
        2 // Moderate noise
    } else {
        1 // Light noise
    };

    let processed = match level {
        3 => {
            // Heavy noise: larger median window followed by a stronger blur.
            let filtered = median_filter(&img, 2, 2);
            let blurred = gaussian_blur_f32(&filtered, 0.8);
            info!("Applied heavy noise reduction");
            blurred
        }
        2 => {
            // Moderate noise: small median window plus a light blur.
            let filtered = median_filter(&img, 1, 1);
            let blurred = gaussian_blur_f32(&filtered, 0.5);
            info!("Applied moderate noise reduction");
            blurred
        }
        _ => {
            // Light noise (or any other value): small median filter only.
            let filtered = median_filter(&img, 1, 1);
            info!("Applied light noise reduction");
            filtered
        }
    };

    Ok(processed)
}
|
|
|
|
/// Adaptive contrast enhancement based on image quality.
///
/// Prefers imageproc's `adaptive_threshold` with a window size from user
/// settings or derived from image size and measured contrast. Images
/// over 1.5 megapixels are routed straight to the histogram-equalization
/// fallback because the adaptive threshold's integral image can overflow
/// there; a `catch_unwind` additionally converts any overflow panic on
/// smaller images into the same fallback.
#[cfg(feature = "ocr")]
fn adaptive_contrast_enhancement(&self, img: ImageBuffer<Luma<u8>, Vec<u8>>, stats: &ImageQualityStats, settings: &Settings) -> Result<ImageBuffer<Luma<u8>, Vec<u8>>> {
    // Choose threshold size based on image dimensions and quality
    let (width, height) = img.dimensions();
    let min_dimension = width.min(height);

    // Check if image is too large for safe adaptive threshold processing
    // The integral image calculation can overflow with large images
    if width as u64 * height as u64 > 1_500_000 {
        info!("Image too large for adaptive threshold ({}x{}), using alternative contrast enhancement", width, height);
        return self.apply_alternative_contrast_enhancement(img, stats, settings);
    }

    let threshold_size = if settings.ocr_adaptive_threshold_window_size > 0 {
        // Use user-configured window size
        settings.ocr_adaptive_threshold_window_size as u32
    } else if stats.contrast_ratio < 0.2 {
        // Low contrast - use smaller windows for more aggressive local adaptation
        (min_dimension / 20).max(11).min(31)
    } else {
        // Good contrast - use larger windows
        (min_dimension / 15).max(15).min(41)
    };

    // Ensure odd number for threshold size
    let threshold_size = if threshold_size % 2 == 0 { threshold_size + 1 } else { threshold_size };

    info!("Applying adaptive threshold with window size: {}", threshold_size);

    // Wrap in panic-safe block to catch overflow errors
    let enhanced = catch_unwind(AssertUnwindSafe(|| {
        adaptive_threshold(&img, threshold_size)
    }));

    match enhanced {
        Ok(result) => Ok(result),
        Err(_) => {
            // The panic payload is dropped; we only care that it failed.
            warn!("Adaptive threshold panicked (likely overflow), using alternative method");
            self.apply_alternative_contrast_enhancement(img, stats, settings)
        }
    }
}
|
|
|
|
/// Alternative contrast enhancement for large images to avoid overflow.
///
/// Uses global histogram equalization (u64 counters, safe for any image
/// size) instead of `adaptive_threshold`, whose integral image can
/// overflow on large inputs. Very flat images additionally receive a
/// contrast stretch afterwards.
#[cfg(feature = "ocr")]
fn apply_alternative_contrast_enhancement(&self, img: ImageBuffer<Luma<u8>, Vec<u8>>, stats: &ImageQualityStats, settings: &Settings) -> Result<ImageBuffer<Luma<u8>, Vec<u8>>> {
    // Same algorithm either way; the log line records why it was chosen.
    if settings.ocr_histogram_equalization {
        info!("Applying histogram equalization for contrast enhancement (user enabled)");
    } else {
        info!("Applying histogram equalization for contrast enhancement (fallback)");
    }

    // Intensity histogram with overflow-proof u64 counters.
    let mut histogram = [0u64; 256];
    for pixel in img.pixels() {
        histogram[pixel[0] as usize] += 1;
    }

    // Cumulative distribution function over the histogram.
    let mut cdf = [0u64; 256];
    let mut running = 0u64;
    for (i, &count) in histogram.iter().enumerate() {
        running += count;
        cdf[i] = running;
    }

    // Lookup table mapping each intensity through the normalized CDF.
    let (width, height) = img.dimensions();
    let total_pixels = (width as u64) * (height as u64);
    let mut lookup = [0u8; 256];
    for i in 0..256 {
        if cdf[i] > 0 {
            lookup[i] = ((cdf[i] as f64 / total_pixels as f64) * 255.0) as u8;
        }
    }

    // Remap every pixel via the lookup table.
    let mut enhanced = ImageBuffer::new(width, height);
    for (x, y, pixel) in img.enumerate_pixels() {
        enhanced.put_pixel(x, y, Luma([lookup[pixel[0] as usize]]));
    }

    // Flat images benefit from an extra dynamic-range stretch.
    if stats.contrast_ratio < 0.3 {
        enhanced = self.apply_contrast_stretching(enhanced)?;
    }

    Ok(enhanced)
}
|
|
|
|
/// Apply contrast stretching to improve dynamic range.
///
/// Linearly remaps the observed [min, max] intensity range onto the full
/// 0-255 range. Uniform images are returned unchanged. Empty (zero-pixel)
/// images are also returned unchanged — previously min/max kept their
/// 255/0 sentinels and `max_val - min_val` underflowed u8 and panicked.
#[cfg(feature = "ocr")]
fn apply_contrast_stretching(&self, img: ImageBuffer<Luma<u8>, Vec<u8>>) -> Result<ImageBuffer<Luma<u8>, Vec<u8>>> {
    let (width, height) = img.dimensions();

    // Find the actual intensity extremes.
    let mut min_val = 255u8;
    let mut max_val = 0u8;
    for pixel in img.pixels() {
        let val = pixel[0];
        min_val = min_val.min(val);
        max_val = max_val.max(val);
    }

    // No dynamic range to stretch (uniform image), or no pixels at all
    // (in which case the sentinels leave max_val < min_val).
    if max_val <= min_val {
        return Ok(img);
    }

    let range = max_val - min_val;

    // Apply the linear stretch; (pixel - min_val) never underflows since
    // min_val is the observed minimum.
    let mut enhanced = ImageBuffer::new(width, height);
    for (x, y, pixel) in img.enumerate_pixels() {
        let stretched = (((pixel[0] - min_val) as f32 / range as f32) * 255.0) as u8;
        enhanced.put_pixel(x, y, Luma([stretched]));
    }

    Ok(enhanced)
}
|
|
|
|
/// Sharpen blurry images.
///
/// Applies a 3x3 sharpening kernel (center 5, cross -1 — identity plus a
/// discrete Laplacian) to interior pixels and copies border pixels
/// through unchanged. Images smaller than 3x3 have no interior and are
/// returned as-is; the explicit guard also prevents `height - 1` from
/// underflowing (and panicking) on zero-sized images. The settings
/// parameter is currently unused — the caller gates sharpening on
/// `ocr_sharpening_strength` before invoking this.
#[cfg(feature = "ocr")]
fn sharpen_image(&self, img: ImageBuffer<Luma<u8>, Vec<u8>>, _settings: &Settings) -> Result<ImageBuffer<Luma<u8>, Vec<u8>>> {
    let (width, height) = img.dimensions();
    if width < 3 || height < 3 {
        // Too small to convolve; return the input unchanged.
        return Ok(img);
    }

    let mut sharpened = ImageBuffer::new(width, height);

    // Unsharp mask kernel - enhances edges
    let kernel = [
        [0.0, -1.0, 0.0],
        [-1.0, 5.0, -1.0],
        [0.0, -1.0, 0.0],
    ];

    // Convolve interior pixels only; the kernel needs a 1 px margin.
    for y in 1..height - 1 {
        for x in 1..width - 1 {
            let mut sum = 0.0;
            for ky in 0..3u32 {
                for kx in 0..3u32 {
                    let px = img.get_pixel(x + kx - 1, y + ky - 1)[0] as f32;
                    sum += px * kernel[ky as usize][kx as usize];
                }
            }
            sharpened.put_pixel(x, y, Luma([sum.round().max(0.0).min(255.0) as u8]));
        }
    }

    // Copy border pixels through unchanged.
    for y in 0..height {
        for x in 0..width {
            if x == 0 || x == width - 1 || y == 0 || y == height - 1 {
                sharpened.put_pixel(x, y, *img.get_pixel(x, y));
            }
        }
    }

    info!("Applied image sharpening");
    Ok(sharpened)
}
|
|
|
|
/// Apply morphological operations for text clarity.
///
/// An opening pass removes small noise, and the closing pass that
/// follows fills small gaps in text strokes. Both use the L-infinity
/// norm with radius 1.
#[cfg(feature = "ocr")]
fn apply_morphological_operations(&self, img: ImageBuffer<Luma<u8>, Vec<u8>>) -> Result<ImageBuffer<Luma<u8>, Vec<u8>>> {
    // Opening first (removes speckles), then closing (bridges gaps).
    let despeckled = open(&img, Norm::LInf, 1);
    Ok(close(&despeckled, Norm::LInf, 1))
}
|
|
|
|
/// Extract text from PDF with size and time limits
|
|
#[cfg(feature = "ocr")]
|
|
pub async fn extract_text_from_pdf(&self, file_path: &str, settings: &Settings) -> Result<OcrResult> {
|
|
let start_time = std::time::Instant::now();
|
|
info!("Extracting text from PDF: {}", file_path);
|
|
|
|
// Check file size before loading into memory
|
|
let metadata = tokio::fs::metadata(file_path).await?;
|
|
let file_size = metadata.len();
|
|
|
|
// Limit PDF size to 100MB to prevent memory exhaustion
|
|
const MAX_PDF_SIZE: u64 = 100 * 1024 * 1024; // 100MB
|
|
if file_size > MAX_PDF_SIZE {
|
|
return Err(anyhow!(
|
|
"PDF file too large: {:.1} MB (max: {:.1} MB). Consider splitting the PDF.",
|
|
file_size as f64 / (1024.0 * 1024.0),
|
|
MAX_PDF_SIZE as f64 / (1024.0 * 1024.0)
|
|
));
|
|
}
|
|
|
|
let bytes = tokio::fs::read(file_path).await?;
|
|
|
|
// Check if it's a valid PDF (handles leading null bytes)
|
|
if !is_valid_pdf(&bytes) {
|
|
return Err(anyhow!(
|
|
"Invalid PDF file: Missing or corrupted PDF header. File size: {} bytes, Header: {:?}",
|
|
bytes.len(),
|
|
bytes.get(0..50).unwrap_or(&[]).iter().map(|&b| {
|
|
if b >= 32 && b <= 126 { b as char } else { '.' }
|
|
}).collect::<String>()
|
|
));
|
|
}
|
|
|
|
// Clean the PDF data (remove leading null bytes)
|
|
let clean_bytes = clean_pdf_data(&bytes);
|
|
|
|
// Add timeout and panic recovery for PDF extraction
|
|
let extraction_result = tokio::time::timeout(
|
|
std::time::Duration::from_secs(120), // 2 minute timeout
|
|
tokio::task::spawn_blocking(move || {
|
|
// Catch panics from pdf-extract library
|
|
catch_unwind(AssertUnwindSafe(|| {
|
|
pdf_extract::extract_text_from_mem(&clean_bytes)
|
|
}))
|
|
})
|
|
).await;
|
|
|
|
let text = match extraction_result {
|
|
Ok(Ok(Ok(Ok(text)))) => text,
|
|
Ok(Ok(Ok(Err(e)))) => {
|
|
return Err(anyhow!(
|
|
"PDF text extraction failed for file '{}' (size: {} bytes): {}. This may indicate a corrupted or unsupported PDF format.",
|
|
file_path, file_size, e
|
|
));
|
|
}
|
|
Ok(Ok(Err(_panic))) => {
|
|
// pdf-extract panicked (e.g., missing unicode map, corrupted font encoding)
|
|
// For now, gracefully handle this common issue
|
|
use tracing::debug;
|
|
debug!("PDF text extraction failed for '{}' due to font encoding issues. This is a known limitation with certain PDF files.", file_path);
|
|
|
|
return Err(anyhow!(
|
|
"PDF text extraction failed due to font encoding issues in '{}' (size: {} bytes). This PDF uses non-standard fonts or character encoding that cannot be processed. To extract text from this PDF, consider: 1) Converting it to images and uploading those instead, 2) Using a different PDF viewer to re-save the PDF with standard encoding, or 3) Using external tools to convert the PDF to a more compatible format.",
|
|
file_path, file_size
|
|
));
|
|
}
|
|
Ok(Err(e)) => {
|
|
return Err(anyhow!("PDF extraction task failed: {}", e));
|
|
}
|
|
Err(_) => {
|
|
return Err(anyhow!(
|
|
"PDF extraction timed out after 2 minutes for file '{}' (size: {} bytes). The PDF may be corrupted or too complex.",
|
|
file_path, file_size
|
|
));
|
|
}
|
|
};
|
|
|
|
// Limit extracted text size to prevent memory issues
|
|
const MAX_TEXT_SIZE: usize = 10 * 1024 * 1024; // 10MB of text
|
|
let trimmed_text = if text.len() > MAX_TEXT_SIZE {
|
|
warn!("PDF text too large ({} chars), truncating to {} chars", text.len(), MAX_TEXT_SIZE);
|
|
format!("{}... [TEXT TRUNCATED DUE TO SIZE]", &text[..MAX_TEXT_SIZE])
|
|
} else {
|
|
text.trim().to_string()
|
|
};
|
|
|
|
let processing_time = start_time.elapsed().as_millis() as u64;
|
|
let word_count = self.count_words_safely(&trimmed_text);
|
|
|
|
// Debug logging to understand PDF extraction issues
|
|
debug!(
|
|
"PDF extraction debug - File: '{}' | Raw text length: {} | Trimmed text length: {} | Word count: {} | First 200 chars: {:?}",
|
|
file_path,
|
|
text.len(),
|
|
trimmed_text.len(),
|
|
word_count,
|
|
trimmed_text.chars().take(200).collect::<String>()
|
|
);
|
|
|
|
// Smart detection: assess if text extraction quality is good enough
|
|
if self.is_text_extraction_quality_sufficient(&trimmed_text, word_count, file_size) {
|
|
info!("PDF text extraction successful for '{}', using extracted text", file_path);
|
|
Ok(OcrResult {
|
|
text: trimmed_text,
|
|
confidence: 95.0, // PDF text extraction is generally high confidence
|
|
processing_time_ms: processing_time,
|
|
word_count,
|
|
preprocessing_applied: vec!["PDF text extraction".to_string()],
|
|
processed_image_path: None,
|
|
})
|
|
} else {
|
|
info!("PDF text extraction insufficient for '{}' ({} words), falling back to OCR", file_path, word_count);
|
|
// Fall back to OCR using ocrmypdf
|
|
self.extract_text_from_pdf_with_ocr(file_path, settings, start_time).await
|
|
}
|
|
}
|
|
|
|
/// Assess if text extraction quality is sufficient or if OCR fallback is needed
|
|
#[cfg(feature = "ocr")]
|
|
fn is_text_extraction_quality_sufficient(&self, text: &str, word_count: usize, file_size: u64) -> bool {
|
|
// If we got no words at all, definitely need OCR
|
|
if word_count == 0 {
|
|
return false;
|
|
}
|
|
|
|
// For very small files, low word count might be normal
|
|
if file_size < 50_000 && word_count >= 1 {
|
|
return true;
|
|
}
|
|
|
|
// Calculate word density (words per KB)
|
|
let file_size_kb = (file_size as f64) / 1024.0;
|
|
let word_density = (word_count as f64) / file_size_kb;
|
|
|
|
// Reasonable thresholds based on typical PDF content:
|
|
// - Text-based PDFs typically have 50-200 words per KB
|
|
// - Below 5 words per KB suggests mostly images/scanned content
|
|
const MIN_WORD_DENSITY: f64 = 5.0;
|
|
const MIN_WORDS_FOR_LARGE_FILES: usize = 10;
|
|
|
|
if word_density < MIN_WORD_DENSITY && word_count < MIN_WORDS_FOR_LARGE_FILES {
|
|
debug!("PDF appears to be image-based: {} words in {:.1} KB (density: {:.2} words/KB)",
|
|
word_count, file_size_kb, word_density);
|
|
return false;
|
|
}
|
|
|
|
// Additional check: if text is mostly non-alphanumeric, might be extraction artifacts
|
|
let alphanumeric_chars = text.chars().filter(|c| c.is_alphanumeric()).count();
|
|
let alphanumeric_ratio = if text.len() > 0 {
|
|
(alphanumeric_chars as f64) / (text.len() as f64)
|
|
} else {
|
|
0.0
|
|
};
|
|
|
|
// If less than 30% alphanumeric content, likely poor extraction
|
|
if alphanumeric_ratio < 0.3 {
|
|
debug!("PDF text has low alphanumeric content: {:.1}% ({} of {} chars)",
|
|
alphanumeric_ratio * 100.0, alphanumeric_chars, text.len());
|
|
return false;
|
|
}
|
|
|
|
debug!("PDF text extraction quality sufficient: {} words, {:.2} words/KB, {:.1}% alphanumeric",
|
|
word_count, word_density, alphanumeric_ratio * 100.0);
|
|
true
|
|
}
|
|
|
|
    /// Extract text from PDF using OCR (ocrmypdf) for image-based or poor-quality PDFs
    ///
    /// Runs the external `ocrmypdf` binary to produce a searchable PDF in
    /// `self.temp_dir`, then re-extracts the text layer from that file with
    /// pdf_extract. The temporary file is removed afterwards (best-effort).
    ///
    /// NOTE(review): `settings` is currently unused here — the OCR language
    /// is hard-coded to "eng"; consider wiring a language setting through.
    #[cfg(feature = "ocr")]
    async fn extract_text_from_pdf_with_ocr(&self, file_path: &str, settings: &Settings, start_time: std::time::Instant) -> Result<OcrResult> {
        info!("Starting OCR extraction for PDF: {}", file_path);

        // Fail fast with install instructions if the external tool is missing.
        if !self.is_ocrmypdf_available().await {
            return Err(anyhow!(
                "ocrmypdf is not available on this system. To extract text from image-based PDFs like '{}', please install ocrmypdf. \
                On Ubuntu/Debian: 'apt-get install ocrmypdf'. \
                On macOS: 'brew install ocrmypdf'. \
                Alternatively, convert the PDF to images and upload those instead.",
                file_path
            ));
        }

        // Unique temp name (pid + epoch millis) so concurrent jobs don't collide.
        let temp_ocr_filename = format!("ocr_{}_{}.pdf",
            std::process::id(),
            std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)?.as_millis()
        );
        let temp_ocr_path = format!("{}/{}", self.temp_dir, temp_ocr_filename);

        // Run ocrmypdf to create a searchable PDF. spawn_blocking because
        // std::process::Command::output() blocks; a 5-minute timeout bounds
        // pathological inputs.
        let ocrmypdf_result = tokio::time::timeout(
            std::time::Duration::from_secs(300), // 5 minute timeout for OCR
            tokio::task::spawn_blocking({
                let file_path = file_path.to_string();
                let temp_ocr_path = temp_ocr_path.clone();
                move || {
                    std::process::Command::new("ocrmypdf")
                        .arg("--force-ocr") // OCR even if text is detected
                        .arg("-O2") // Optimize level 2 (balanced quality/speed)
                        .arg("--deskew") // Correct skewed pages
                        .arg("--clean") // Clean up artifacts
                        .arg("--language")
                        .arg("eng") // English language
                        .arg(&file_path)
                        .arg(&temp_ocr_path)
                        .output()
                }
            })
        ).await;

        // Unwrap the timeout and join layers before inspecting the process result.
        let ocrmypdf_output = match ocrmypdf_result {
            Ok(Ok(output)) => output?,
            Ok(Err(e)) => return Err(anyhow!("Failed to join ocrmypdf task: {}", e)),
            Err(_) => return Err(anyhow!("ocrmypdf timed out after 5 minutes for file '{}'", file_path)),
        };

        if !ocrmypdf_output.status.success() {
            // Surface both streams — ocrmypdf reports diagnostics on stderr.
            let stderr = String::from_utf8_lossy(&ocrmypdf_output.stderr);
            let stdout = String::from_utf8_lossy(&ocrmypdf_output.stdout);
            return Err(anyhow!(
                "ocrmypdf failed for '{}': Exit code {}\nStderr: {}\nStdout: {}",
                file_path, ocrmypdf_output.status.code().unwrap_or(-1), stderr, stdout
            ));
        }

        // Extract text from the OCR'd PDF (blocking parse off the runtime).
        let ocr_text_result = tokio::task::spawn_blocking({
            let temp_ocr_path = temp_ocr_path.clone();
            move || -> Result<String> {
                let bytes = std::fs::read(&temp_ocr_path)?;
                let text = pdf_extract::extract_text_from_mem(&bytes)?;
                Ok(text.trim().to_string())
            }
        }).await??;

        // Clean up temporary file (best-effort; errors intentionally ignored).
        let _ = tokio::fs::remove_file(&temp_ocr_path).await;

        let processing_time = start_time.elapsed().as_millis() as u64;
        let word_count = self.count_words_safely(&ocr_text_result);

        info!("OCR extraction completed for '{}': {} words in {}ms",
            file_path, word_count, processing_time);

        Ok(OcrResult {
            text: ocr_text_result,
            confidence: 85.0, // OCR is generally lower confidence than direct text extraction
            processing_time_ms: processing_time,
            word_count,
            preprocessing_applied: vec!["OCR via ocrmypdf".to_string()],
            processed_image_path: None,
        })
    }
|
|
|
|
/// Check if ocrmypdf is available on the system
|
|
#[cfg(feature = "ocr")]
|
|
async fn is_ocrmypdf_available(&self) -> bool {
|
|
match tokio::process::Command::new("ocrmypdf")
|
|
.arg("--version")
|
|
.output()
|
|
.await
|
|
{
|
|
Ok(output) => output.status.success(),
|
|
Err(_) => false,
|
|
}
|
|
}
|
|
|
|
    #[cfg(not(feature = "ocr"))]
    fn is_text_extraction_quality_sufficient(&self, _text: &str, _word_count: usize, _file_size: u64) -> bool {
        // When OCR is disabled, always accept text extraction results
        // (there is no OCR fallback to escalate to).
        true
    }
|
|
|
|
    #[cfg(not(feature = "ocr"))]
    async fn is_ocrmypdf_available(&self) -> bool {
        // Without the "ocr" feature, ocrmypdf is never considered available.
        false // OCR feature not enabled
    }
|
|
|
|
#[cfg(not(feature = "ocr"))]
|
|
async fn extract_text_from_pdf_with_ocr(&self, file_path: &str, _settings: &Settings, _start_time: std::time::Instant) -> Result<OcrResult> {
|
|
Err(anyhow::anyhow!("OCR feature not enabled - cannot process image-based PDF: {}", file_path))
|
|
}
|
|
|
|
    /// Resolve file path to actual location, handling both old and new directory structures
    ///
    /// Pure delegation to `FileService::resolve_file_path`; the actual
    /// resolution rules live there.
    async fn resolve_file_path(&self, file_path: &str) -> Result<String> {
        // Use the FileService's resolve_file_path method
        self.file_service.resolve_file_path(file_path).await
    }
|
|
|
|
/// Extract text from any supported file type with enhanced logging
|
|
pub async fn extract_text_with_context(&self, file_path: &str, mime_type: &str, filename: &str, file_size: i64, settings: &Settings) -> Result<OcrResult> {
|
|
// Format file size for better readability
|
|
let file_size_mb = file_size as f64 / (1024.0 * 1024.0);
|
|
|
|
info!(
|
|
"Starting OCR extraction | File: '{}' | Type: {} | Size: {:.2} MB | Path: {}",
|
|
filename, mime_type, file_size_mb, file_path
|
|
);
|
|
|
|
self.extract_text(file_path, mime_type, settings).await
|
|
}
|
|
|
|
/// Extract text from any supported file type
|
|
pub async fn extract_text(&self, file_path: &str, mime_type: &str, settings: &Settings) -> Result<OcrResult> {
|
|
// Resolve the actual file path
|
|
let resolved_path = self.resolve_file_path(file_path).await?;
|
|
match mime_type {
|
|
"application/pdf" => {
|
|
#[cfg(feature = "ocr")]
|
|
{
|
|
self.extract_text_from_pdf(&resolved_path, settings).await
|
|
}
|
|
#[cfg(not(feature = "ocr"))]
|
|
{
|
|
Err(anyhow::anyhow!("OCR feature not enabled"))
|
|
}
|
|
}
|
|
mime if mime.starts_with("image/") => {
|
|
#[cfg(feature = "ocr")]
|
|
{
|
|
self.extract_text_from_image(&resolved_path, settings).await
|
|
}
|
|
#[cfg(not(feature = "ocr"))]
|
|
{
|
|
Err(anyhow::anyhow!("OCR feature not enabled"))
|
|
}
|
|
}
|
|
"text/plain" => {
|
|
let start_time = std::time::Instant::now();
|
|
|
|
// Check file size before loading into memory
|
|
let metadata = tokio::fs::metadata(&resolved_path).await?;
|
|
let file_size = metadata.len();
|
|
|
|
// Limit text file size to 50MB to prevent memory exhaustion
|
|
const MAX_TEXT_FILE_SIZE: u64 = 50 * 1024 * 1024; // 50MB
|
|
if file_size > MAX_TEXT_FILE_SIZE {
|
|
return Err(anyhow!(
|
|
"Text file too large: {:.1} MB (max: {:.1} MB). Consider splitting the file.",
|
|
file_size as f64 / (1024.0 * 1024.0),
|
|
MAX_TEXT_FILE_SIZE as f64 / (1024.0 * 1024.0)
|
|
));
|
|
}
|
|
|
|
let text = tokio::fs::read_to_string(&resolved_path).await?;
|
|
|
|
// Limit text content size in memory
|
|
const MAX_TEXT_CONTENT_SIZE: usize = 10 * 1024 * 1024; // 10MB of text content
|
|
let trimmed_text = if text.len() > MAX_TEXT_CONTENT_SIZE {
|
|
warn!("Text file content too large ({} chars), truncating to {} chars", text.len(), MAX_TEXT_CONTENT_SIZE);
|
|
format!("{}... [TEXT TRUNCATED DUE TO SIZE]", &text[..MAX_TEXT_CONTENT_SIZE])
|
|
} else {
|
|
text.trim().to_string()
|
|
};
|
|
|
|
let processing_time = start_time.elapsed().as_millis() as u64;
|
|
let word_count = self.count_words_safely(&trimmed_text);
|
|
|
|
Ok(OcrResult {
|
|
text: trimmed_text,
|
|
confidence: 100.0, // Plain text is 100% confident
|
|
processing_time_ms: processing_time,
|
|
word_count,
|
|
preprocessing_applied: vec!["Plain text read".to_string()],
|
|
processed_image_path: None, // No image processing for plain text
|
|
})
|
|
}
|
|
_ => Err(anyhow::anyhow!("Unsupported file type: {}", mime_type)),
|
|
}
|
|
}
|
|
|
|
/// Safely count words to prevent overflow on very large texts
|
|
#[cfg(feature = "ocr")]
|
|
pub fn count_words_safely(&self, text: &str) -> usize {
|
|
// For very large texts, sample to estimate word count to prevent overflow
|
|
if text.len() > 1_000_000 { // > 1MB of text
|
|
// Sample first 100KB and extrapolate
|
|
let sample_size = 100_000;
|
|
let sample_text = &text[..sample_size.min(text.len())];
|
|
let sample_words = self.count_words_in_text(sample_text);
|
|
let estimated_total = (sample_words as f64 * (text.len() as f64 / sample_size as f64)) as usize;
|
|
|
|
// Cap at reasonable maximum to prevent display issues
|
|
estimated_total.min(10_000_000) // Max 10M words
|
|
} else {
|
|
self.count_words_in_text(text)
|
|
}
|
|
}
|
|
|
|
#[cfg(feature = "ocr")]
|
|
fn count_words_in_text(&self, text: &str) -> usize {
|
|
let whitespace_words = text.split_whitespace().count();
|
|
|
|
// If we have exactly 1 "word" but it's very long (likely continuous text), try enhanced detection
|
|
// OR if we have no whitespace words but text exists
|
|
let is_continuous_text = whitespace_words == 1 && text.len() > 15; // 15+ chars suggests it might be continuous
|
|
let is_no_words = whitespace_words == 0 && !text.trim().is_empty();
|
|
|
|
if is_continuous_text || is_no_words {
|
|
// Count total alphanumeric characters first
|
|
let alphanumeric_chars = text.chars().filter(|c| c.is_alphanumeric()).count();
|
|
|
|
// If no alphanumeric content, it's pure punctuation/symbols
|
|
if alphanumeric_chars == 0 {
|
|
return 0;
|
|
}
|
|
|
|
// For continuous text, look for word boundaries using multiple strategies
|
|
let mut word_count = 0;
|
|
|
|
// Strategy 1: Count transitions from lowercase to uppercase (camelCase detection)
|
|
let chars: Vec<char> = text.chars().collect();
|
|
let mut camel_transitions = 0;
|
|
|
|
for i in 1..chars.len() {
|
|
let prev_char = chars[i-1];
|
|
let curr_char = chars[i];
|
|
|
|
// Count transitions from lowercase letter to uppercase letter
|
|
if prev_char.is_lowercase() && curr_char.is_uppercase() {
|
|
camel_transitions += 1;
|
|
}
|
|
// Count transitions from letter to digit or digit to letter
|
|
else if (prev_char.is_alphabetic() && curr_char.is_numeric()) ||
|
|
(prev_char.is_numeric() && curr_char.is_alphabetic()) {
|
|
camel_transitions += 1;
|
|
}
|
|
}
|
|
|
|
// If we found camelCase transitions, estimate words
|
|
if camel_transitions > 0 {
|
|
word_count = camel_transitions + 1; // +1 for the first word
|
|
}
|
|
|
|
// Strategy 2: If no camelCase detected, estimate based on character count
|
|
if word_count == 0 {
|
|
// Estimate based on typical word length (4-6 characters per word)
|
|
word_count = (alphanumeric_chars / 5).max(1);
|
|
}
|
|
|
|
word_count
|
|
} else {
|
|
whitespace_words
|
|
}
|
|
}
|
|
|
|
/// Validate OCR result quality
|
|
#[cfg(feature = "ocr")]
|
|
pub fn validate_ocr_quality(&self, result: &OcrResult, settings: &Settings) -> bool {
|
|
// Check minimum confidence threshold
|
|
if result.confidence < settings.ocr_min_confidence {
|
|
warn!(
|
|
"OCR result below confidence threshold: {:.1}% < {:.1}%",
|
|
result.confidence, settings.ocr_min_confidence
|
|
);
|
|
return false;
|
|
}
|
|
|
|
// Check if text is reasonable (not just noise)
|
|
if result.word_count == 0 {
|
|
warn!("OCR result contains no words");
|
|
return false;
|
|
}
|
|
|
|
// Check for reasonable character distribution
|
|
let total_chars = result.text.len();
|
|
if total_chars == 0 {
|
|
return false;
|
|
}
|
|
|
|
let alphanumeric_chars = result.text.chars().filter(|c| c.is_alphanumeric()).count();
|
|
let alphanumeric_ratio = alphanumeric_chars as f32 / total_chars as f32;
|
|
|
|
// Expect at least 30% alphanumeric characters for valid text
|
|
if alphanumeric_ratio < 0.3 {
|
|
warn!(
|
|
"OCR result has low alphanumeric ratio: {:.1}%",
|
|
alphanumeric_ratio * 100.0
|
|
);
|
|
return false;
|
|
}
|
|
|
|
true
|
|
}
|
|
}
|
|
|
|
#[cfg(not(feature = "ocr"))]
|
|
impl EnhancedOcrService {
|
|
pub async fn extract_text_from_image(&self, _file_path: &str, _settings: &Settings) -> Result<OcrResult> {
|
|
Err(anyhow::anyhow!("OCR feature not enabled"))
|
|
}
|
|
|
|
pub async fn extract_text_from_pdf(&self, _file_path: &str, _settings: &Settings) -> Result<OcrResult> {
|
|
Err(anyhow::anyhow!("OCR feature not enabled"))
|
|
}
|
|
|
|
|
|
pub fn validate_ocr_quality(&self, _result: &OcrResult, _settings: &Settings) -> bool {
|
|
false
|
|
}
|
|
}
|
|
|
|
/// Check if the given bytes represent a valid PDF file
/// Handles PDFs with leading null bytes or whitespace
///
/// Scans the first 1 KB for the "%PDF-" magic marker; anything shorter
/// than the marker (or lacking it within that window) is rejected.
fn is_valid_pdf(data: &[u8]) -> bool {
    // Some PDFs carry leading null bytes or other junk before the header,
    // so scan a bounded prefix rather than requiring the magic at offset 0.
    // `windows(5)` yields nothing for inputs shorter than 5 bytes, which
    // correctly rejects them.
    let search_limit = data.len().min(1024);
    data[..search_limit].windows(5).any(|w| w == b"%PDF-")
}
|
|
|
|
/// Remove leading null bytes and return clean PDF data
/// Returns the original data if no PDF header is found
///
/// Looks for the first "%PDF-" marker within the first 1 KB and returns
/// everything from that marker onward; inputs without a marker (including
/// inputs shorter than the marker) are returned unchanged.
fn clean_pdf_data(data: &[u8]) -> Vec<u8> {
    // Bounded scan mirrors is_valid_pdf's 1 KB search window.
    let search_limit = data.len().min(1024);
    match data[..search_limit].windows(5).position(|w| w == b"%PDF-") {
        Some(start) => data[start..].to_vec(),
        None => data.to_vec(),
    }
}