use anyhow::{anyhow, Result}; use std::path::Path; use crate::ocr_error::OcrError; use crate::ocr_health::OcrHealthChecker; #[cfg(feature = "ocr")] use tesseract::Tesseract; pub struct OcrService { health_checker: OcrHealthChecker, } impl OcrService { pub fn new() -> Self { Self { health_checker: OcrHealthChecker::new(), } } pub async fn extract_text_from_image(&self, file_path: &str) -> Result { self.extract_text_from_image_with_lang(file_path, "eng").await } pub async fn extract_text_from_image_with_lang(&self, file_path: &str, lang: &str) -> Result { #[cfg(feature = "ocr")] { // Perform health checks first self.health_checker.check_tesseract_installation() .map_err(|e: OcrError| anyhow!(e))?; self.health_checker.check_language_data(lang) .map_err(|e: OcrError| anyhow!(e))?; let mut tesseract = Tesseract::new(None, Some(lang)) .map_err(|e| anyhow!(OcrError::InitializationFailed { details: e.to_string() }))? .set_image(file_path)?; let text = tesseract.get_text() .map_err(|e| anyhow!(OcrError::InitializationFailed { details: format!("Failed to extract text: {}", e) }))?; Ok(text.trim().to_string()) } #[cfg(not(feature = "ocr"))] { Err(anyhow!(OcrError::TesseractNotInstalled)) } } pub async fn extract_text_from_pdf(&self, file_path: &str) -> Result { #[cfg(feature = "ocr")] { let bytes = std::fs::read(file_path)?; let text = pdf_extract::extract_text_from_mem(&bytes) .map_err(|e| anyhow!("Failed to extract text from PDF: {}", e))?; Ok(text.trim().to_string()) } #[cfg(not(feature = "ocr"))] { Err(anyhow!(OcrError::TesseractNotInstalled)) } } pub async fn extract_text(&self, file_path: &str, mime_type: &str) -> Result { self.extract_text_with_lang(file_path, mime_type, "eng").await } pub async fn extract_text_with_lang(&self, file_path: &str, mime_type: &str, lang: &str) -> Result { match mime_type { "application/pdf" => self.extract_text_from_pdf(file_path).await, "image/png" | "image/jpeg" | "image/jpg" | "image/tiff" | "image/bmp" => { self.extract_text_from_image_with_lang(file_path, lang).await } "text/plain" => { let text = tokio::fs::read_to_string(file_path).await?; Ok(text) } _ => { if self.is_image_file(file_path) { self.extract_text_from_image_with_lang(file_path, lang).await } else { Err(anyhow!(OcrError::InvalidImageFormat { details: format!("Unsupported MIME type: {}", mime_type) })) } } } } pub fn is_image_file(&self, file_path: &str) -> bool { if let Some(extension) = Path::new(file_path) .extension() .and_then(|ext| ext.to_str()) { let ext_lower = extension.to_lowercase(); matches!(ext_lower.as_str(), "png" | "jpg" | "jpeg" | "tiff" | "bmp" | "gif") } else { false } } }