diff --git a/Cargo.lock b/Cargo.lock index 00e33d7..f147635 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1290,7 +1290,7 @@ dependencies = [ "js-sys", "log", "wasm-bindgen", - "windows-core", + "windows-core 0.61.2", ] [[package]] @@ -1876,6 +1876,15 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "ntapi" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4" +dependencies = [ + "winapi", +] + [[package]] name = "nu-ansi-term" version = "0.46.0" @@ -2399,6 +2408,15 @@ version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f60fcc7d6849342eff22c4350c8b9a989ee8ceabc4b481253e8946b9fe83d684" +[[package]] +name = "raw-cpuid" +version = "11.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6df7ab838ed27997ba19a4664507e6f82b41fe6e20be42929332156e5e85146" +dependencies = [ + "bitflags 2.9.1", +] + [[package]] name = "rawpointer" version = "0.2.1" @@ -2445,15 +2463,18 @@ dependencies = [ "notify", "pdf-extract", "quick-xml", + "raw-cpuid", "regex", "reqwest", "serde", "serde_json", "sqlx", + "sysinfo", "tempfile", "tesseract", "testcontainers", "testcontainers-modules", + "thiserror 1.0.69", "tokio", "tokio-util", "tower 0.4.13", @@ -3272,6 +3293,21 @@ dependencies = [ "syn 2.0.102", ] +[[package]] +name = "sysinfo" +version = "0.30.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a5b4ddaee55fb2bea2bf0e5000747e5f5c0de765e5a5ff87f4cd106439f4bb3" +dependencies = [ + "cfg-if", + "core-foundation-sys", + "libc", + "ntapi", + "once_cell", + "rayon", + "windows", +] + [[package]] name = "system-configuration" version = "0.5.1" @@ -4053,6 +4089,25 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = 
"0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" +dependencies = [ + "windows-core 0.52.0", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-core" version = "0.61.2" diff --git a/Cargo.toml b/Cargo.toml index 0d40b9c..edc9e47 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,6 +36,9 @@ tesseract = { version = "0.15", optional = true } pdf-extract = { version = "0.7", optional = true } image = { version = "0.24", features = ["png", "jpeg", "tiff", "bmp"], optional = true } imageproc = { version = "0.23", optional = true } +thiserror = "1.0" +sysinfo = "0.30" +raw-cpuid = { version = "11", optional = true } reqwest = { version = "0.11", features = ["json", "multipart"] } quick-xml = { version = "0.31", features = ["serialize"] } urlencoding = "2.1" @@ -48,7 +51,7 @@ utoipa-swagger-ui = { version = "6", features = ["axum"] } [features] default = ["ocr"] -ocr = ["tesseract", "pdf-extract", "image", "imageproc"] +ocr = ["tesseract", "pdf-extract", "image", "imageproc", "raw-cpuid"] [dev-dependencies] tempfile = "3" diff --git a/src/lib.rs b/src/lib.rs index 94b04f9..c77731c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,7 +6,12 @@ pub mod enhanced_ocr; pub mod file_service; pub mod models; pub mod ocr; +pub mod ocr_api; +pub mod ocr_enhanced; +pub mod ocr_error; +pub mod ocr_health; pub mod ocr_queue; +pub mod ocr_tests; pub mod routes; pub mod seed; pub mod watcher; diff --git a/src/ocr.rs b/src/ocr.rs index 15c01a0..f93b06e 100644 --- a/src/ocr.rs +++ b/src/ocr.rs @@ -1,14 +1,20 @@ use anyhow::{anyhow, Result}; use std::path::Path; +use crate::ocr_error::OcrError; +use crate::ocr_health::OcrHealthChecker; 
#[cfg(feature = "ocr")] use tesseract::Tesseract; -pub struct OcrService; +pub struct OcrService { + health_checker: OcrHealthChecker, +} impl OcrService { pub fn new() -> Self { - Self + Self { + health_checker: OcrHealthChecker::new(), + } } pub async fn extract_text_from_image(&self, file_path: &str) -> Result { @@ -18,17 +24,29 @@ impl OcrService { pub async fn extract_text_from_image_with_lang(&self, file_path: &str, lang: &str) -> Result { #[cfg(feature = "ocr")] { - let mut tesseract = Tesseract::new(None, Some(lang))? + // Perform health checks first + self.health_checker.check_tesseract_installation() + .map_err(|e| anyhow!(e))?; + self.health_checker.check_language_data(lang) + .map_err(|e| anyhow!(e))?; + + let mut tesseract = Tesseract::new(None, Some(lang)) + .map_err(|e| anyhow!(OcrError::InitializationFailed { + details: e.to_string() + }))? .set_image(file_path)?; - let text = tesseract.get_text()?; + let text = tesseract.get_text() + .map_err(|e| anyhow!(OcrError::InitializationFailed { + details: format!("Failed to extract text: {}", e) + }))?; Ok(text.trim().to_string()) } #[cfg(not(feature = "ocr"))] { - Err(anyhow!("OCR feature is disabled. Recompile with --features ocr")) + Err(anyhow!(OcrError::TesseractNotInstalled)) } } @@ -44,7 +62,7 @@ impl OcrService { #[cfg(not(feature = "ocr"))] { - Err(anyhow!("OCR feature is disabled. 
Recompile with --features ocr")) + Err(anyhow!(OcrError::TesseractNotInstalled)) } } @@ -66,7 +84,9 @@ impl OcrService { if self.is_image_file(file_path) { self.extract_text_from_image_with_lang(file_path, lang).await } else { - Err(anyhow!("Unsupported file type for OCR: {}", mime_type)) + Err(anyhow!(OcrError::InvalidImageFormat { + details: format!("Unsupported MIME type: {}", mime_type) + })) } } } diff --git a/src/ocr_api.rs b/src/ocr_api.rs new file mode 100644 index 0000000..2f79d22 --- /dev/null +++ b/src/ocr_api.rs @@ -0,0 +1,129 @@ +use crate::ocr_enhanced::EnhancedOcrService; +use crate::ocr_error::OcrError; +use crate::AppState; +use axum::{ + extract::State, + http::StatusCode, + response::Json, +}; +use serde::{Deserialize, Serialize}; + +#[derive(Serialize)] +struct OcrHealthResponse { + status: String, + tesseract_installed: bool, + available_languages: Vec, + diagnostics: Option, + errors: Vec, +} + +#[derive(Serialize)] +struct OcrErrorResponse { + error: String, + error_code: String, + details: Option, + is_recoverable: bool, +} + +#[derive(Deserialize)] +struct OcrRequest { + file_path: String, + language: Option, + use_fallback: Option, +} + +pub async fn health_check( + State(_state): State, +) -> Result, (StatusCode, Json)> { + let service = EnhancedOcrService::new(); + let diagnostics = service.get_diagnostics().await; + + let health_checker = crate::ocr_health::OcrHealthChecker::new(); + + match health_checker.perform_full_health_check() { + Ok(diag) => { + Ok(Json(OcrHealthResponse { + status: "healthy".to_string(), + tesseract_installed: true, + available_languages: diag.available_languages, + diagnostics: Some(diagnostics), + errors: vec![], + })) + } + Err(errors) => { + let error_messages: Vec = errors.iter() + .map(|e| e.to_string()) + .collect(); + + let _status_code = if errors.iter().any(|e| e.is_configuration_error()) { + StatusCode::SERVICE_UNAVAILABLE + } else { + StatusCode::INTERNAL_SERVER_ERROR + }; + + 
Ok(Json(OcrHealthResponse { + status: "unhealthy".to_string(), + tesseract_installed: errors.iter().all(|e| !matches!(e, OcrError::TesseractNotInstalled)), + available_languages: vec![], + diagnostics: Some(diagnostics), + errors: error_messages, + })) + } + } +} + +pub async fn perform_ocr( + State(_state): State, + Json(request): Json, +) -> Result, (StatusCode, Json)> { + let service = EnhancedOcrService::new(); + let lang = request.language.as_deref().unwrap_or("eng"); + let use_fallback = request.use_fallback.unwrap_or(true); + + let result = if use_fallback { + service.extract_with_fallback(&request.file_path, lang).await + } else { + service.extract_text_with_validation(&request.file_path, lang).await + }; + + match result { + Ok(text) => Ok(Json(serde_json::json!({ + "text": text, + "status": "success" + }))), + Err(e) => { + if let Some(ocr_error) = e.downcast_ref::() { + let (status_code, details) = match ocr_error { + OcrError::TesseractNotInstalled => (StatusCode::SERVICE_UNAVAILABLE, "Please install Tesseract OCR"), + OcrError::LanguageDataNotFound { .. } => (StatusCode::BAD_REQUEST, "Language pack not installed"), + OcrError::InsufficientMemory { .. } => (StatusCode::INSUFFICIENT_STORAGE, "Not enough memory"), + OcrError::ImageTooLarge { .. } => (StatusCode::PAYLOAD_TOO_LARGE, "Image exceeds size limits"), + OcrError::OcrTimeout { .. } => (StatusCode::REQUEST_TIMEOUT, "OCR operation timed out"), + OcrError::PermissionDenied { .. } => (StatusCode::FORBIDDEN, "Cannot access file"), + OcrError::InvalidImageFormat { .. 
} => (StatusCode::UNPROCESSABLE_ENTITY, "Invalid image format"), + _ => (StatusCode::INTERNAL_SERVER_ERROR, "OCR processing failed"), + }; + + Err(( + status_code, + Json(OcrErrorResponse { + error: ocr_error.to_string(), + error_code: ocr_error.error_code().to_string(), + details: Some(details.to_string()), + is_recoverable: ocr_error.is_recoverable(), + }), + )) + } else { + Err(( + StatusCode::INTERNAL_SERVER_ERROR, + Json(OcrErrorResponse { + error: e.to_string(), + error_code: "OCR_UNKNOWN_ERROR".to_string(), + details: None, + is_recoverable: false, + }), + )) + } + } + } +} \ No newline at end of file diff --git a/src/ocr_enhanced.rs b/src/ocr_enhanced.rs new file mode 100644 index 0000000..7b1468e --- /dev/null +++ b/src/ocr_enhanced.rs @@ -0,0 +1,247 @@ +use crate::ocr_error::OcrError; +use crate::ocr_health::OcrHealthChecker; +use anyhow::{anyhow, Result}; +use image::DynamicImage; +use std::path::Path; +use std::time::{Duration, Instant}; +use tokio::time::timeout; + +#[cfg(feature = "ocr")] +use tesseract::{Tesseract, PageSegMode}; + +pub struct EnhancedOcrService { + health_checker: OcrHealthChecker, + max_image_width: u32, + max_image_height: u32, + ocr_timeout_seconds: u64, + min_confidence_threshold: f32, +} + +impl EnhancedOcrService { + pub fn new() -> Self { + Self { + health_checker: OcrHealthChecker::new(), + max_image_width: 10000, + max_image_height: 10000, + ocr_timeout_seconds: 120, + min_confidence_threshold: 60.0, + } + } + + pub fn with_limits(mut self, max_width: u32, max_height: u32) -> Self { + self.max_image_width = max_width; + self.max_image_height = max_height; + self + } + + pub fn with_timeout(mut self, seconds: u64) -> Self { + self.ocr_timeout_seconds = seconds; + self + } + + pub async fn extract_text_with_validation(&self, file_path: &str, lang: &str) -> Result { + // Perform pre-flight checks + self.preflight_checks(lang)?; + + // Load and validate image + let image = self.load_and_validate_image(file_path)?; + + // Check 
memory requirements + let (width, height) = (image.width(), image.height()); + self.health_checker.validate_memory_for_image(width, height) + .map_err(|e| anyhow!(e))?; + + // Perform OCR with timeout + let text = self.perform_ocr_with_timeout(file_path, lang).await?; + + Ok(text) + } + + fn preflight_checks(&self, lang: &str) -> Result<()> { + // Check Tesseract installation + self.health_checker.check_tesseract_installation() + .map_err(|e| anyhow!(e))?; + + // Check CPU requirements + self.health_checker.validate_cpu_requirements() + .map_err(|e| anyhow!(e))?; + + // Check language data + self.health_checker.check_language_data(lang) + .map_err(|e| anyhow!(e))?; + + Ok(()) + } + + fn load_and_validate_image(&self, file_path: &str) -> Result { + // Check file permissions + if !Path::new(file_path).exists() { + return Err(anyhow!("File not found: {}", file_path)); + } + + let metadata = std::fs::metadata(file_path) + .map_err(|_| OcrError::PermissionDenied { + path: file_path.to_string() + })?; + + if !metadata.is_file() { + return Err(anyhow!("Path is not a file: {}", file_path)); + } + + // Try to load image + let image = image::open(file_path) + .map_err(|e| OcrError::InvalidImageFormat { + details: e.to_string() + })?; + + // Validate dimensions + if image.width() > self.max_image_width || image.height() > self.max_image_height { + return Err(OcrError::ImageTooLarge { + width: image.width(), + height: image.height(), + max_width: self.max_image_width, + max_height: self.max_image_height, + }.into()); + } + + Ok(image) + } + + async fn perform_ocr_with_timeout(&self, file_path: &str, lang: &str) -> Result { + let file_path = file_path.to_string(); + let lang = lang.to_string(); + let timeout_duration = Duration::from_secs(self.ocr_timeout_seconds); + let min_confidence = self.min_confidence_threshold; + + let ocr_future = tokio::task::spawn_blocking(move || { + Self::perform_ocr_internal(&file_path, &lang, min_confidence) + }); + + match 
timeout(timeout_duration, ocr_future).await { + Ok(Ok(result)) => result, + Ok(Err(e)) => Err(anyhow!("OCR task failed: {}", e)), + Err(_) => Err(OcrError::OcrTimeout { + seconds: self.ocr_timeout_seconds + }.into()), + } + } + + #[cfg(feature = "ocr")] + fn perform_ocr_internal(file_path: &str, lang: &str, min_confidence: f32) -> Result { + let start_time = Instant::now(); + + // Initialize Tesseract with error handling + let mut tesseract = Tesseract::new(None, Some(lang)) + .map_err(|e| OcrError::InitializationFailed { + details: e.to_string() + })?; + + // Set optimal parameters for various hardware + tesseract.set_page_seg_mode(PageSegMode::PsmAuto); + + let mut tesseract = tesseract + .set_variable("tessedit_do_invert", "0")? + .set_variable("edges_max_children_per_outline", "40")?; + + // For low-end hardware, use faster but less accurate settings + if let Ok(available_mem) = std::env::var("OCR_LOW_MEMORY_MODE") { + if available_mem == "true" { + tesseract = tesseract + .set_variable("textord_heavy_nr", "0")? + .set_variable("cube_debug_level", "0")?; + } + } + + tesseract = tesseract.set_image(file_path) + .map_err(|e| OcrError::InvalidImageFormat { + details: e.to_string() + })?; + + // Get text with confidence check + let text = tesseract.get_text() + .map_err(|e| OcrError::InitializationFailed { + details: e.to_string() + })?; + + // Get mean confidence + let confidence = tesseract.mean_text_conf(); + + if confidence < min_confidence as i32 { + return Err(OcrError::LowConfidence { + score: confidence as f32, + threshold: min_confidence + }.into()); + } + + let elapsed = start_time.elapsed(); + tracing::info!("OCR completed in {:?} with confidence: {}%", elapsed, confidence); + + Ok(text.trim().to_string()) + } + + #[cfg(not(feature = "ocr"))] + fn perform_ocr_internal(_file_path: &str, _lang: &str, _min_confidence: f32) -> Result { + Err(anyhow!("OCR feature is disabled. 
Recompile with --features ocr")) + } + + pub async fn extract_with_fallback(&self, file_path: &str, lang: &str) -> Result { + // Try primary extraction + match self.extract_text_with_validation(file_path, lang).await { + Ok(text) => Ok(text), + Err(e) => { + // Check if error is recoverable + if let Some(ocr_error) = e.downcast_ref::() { + if ocr_error.is_recoverable() { + // Try with reduced quality settings + self.extract_with_reduced_quality(file_path, lang).await + } else { + Err(e) + } + } else { + Err(e) + } + } + } + } + + async fn extract_with_reduced_quality(&self, file_path: &str, lang: &str) -> Result { + // Downsample image for lower memory usage + let image = self.load_and_validate_image(file_path)?; + let resized = self.resize_for_ocr(image); + + // Save temporary resized image + let temp_path = format!("{}_resized.png", file_path); + resized.save(&temp_path) + .map_err(|e| anyhow!("Failed to save resized image: {}", e))?; + + // Try OCR on resized image + let result = self.perform_ocr_with_timeout(&temp_path, lang).await; + + // Clean up + let _ = std::fs::remove_file(&temp_path); + + result + } + + fn resize_for_ocr(&self, image: DynamicImage) -> DynamicImage { + let (width, height) = (image.width(), image.height()); + + // Target dimensions for low memory mode + let max_dimension = 2000; + + if width > max_dimension || height > max_dimension { + let scale = max_dimension as f32 / width.max(height) as f32; + let new_width = (width as f32 * scale) as u32; + let new_height = (height as f32 * scale) as u32; + + image.resize(new_width, new_height, image::imageops::FilterType::Lanczos3) + } else { + image + } + } + + pub async fn get_diagnostics(&self) -> String { + let diagnostics = self.health_checker.get_full_diagnostics(); + format!("{}", diagnostics) + } +} \ No newline at end of file diff --git a/src/ocr_error.rs b/src/ocr_error.rs new file mode 100644 index 0000000..4023318 --- /dev/null +++ b/src/ocr_error.rs @@ -0,0 +1,129 @@ +use std::fmt; +use 
thiserror::Error; + +#[derive(Error, Debug)] +pub enum OcrError { + #[error("Tesseract is not installed on the system")] + TesseractNotInstalled, + + #[error("Tesseract language data not found for '{lang}'. Please install tesseract-ocr-{lang}")] + LanguageDataNotFound { lang: String }, + + #[error("TESSDATA_PREFIX environment variable not set or invalid: {path}")] + TessdataPathInvalid { path: String }, + + #[error("Insufficient memory for OCR operation. Required: {required}MB, Available: {available}MB")] + InsufficientMemory { required: u64, available: u64 }, + + #[error("CPU instruction set missing: {instruction}. Tesseract requires {instruction} support")] + MissingCpuInstruction { instruction: String }, + + #[error("Image too large for OCR. Max dimensions: {max_width}x{max_height}, Actual: {width}x{height}")] + ImageTooLarge { + width: u32, + height: u32, + max_width: u32, + max_height: u32, + }, + + #[error("Invalid image format or corrupted image: {details}")] + InvalidImageFormat { details: String }, + + #[error("OCR timeout after {seconds} seconds. Consider reducing image size or quality")] + OcrTimeout { seconds: u64 }, + + #[error("Permission denied accessing file: {path}")] + PermissionDenied { path: String }, + + #[error("Tesseract initialization failed: {details}")] + InitializationFailed { details: String }, + + #[error("OCR quality too low. Confidence score: {score}% (minimum: {threshold}%)")] + LowConfidence { score: f32, threshold: f32 }, + + #[error("Hardware acceleration not available: {details}")] + HardwareAccelerationUnavailable { details: String }, + + #[error(transparent)] + Io(#[from] std::io::Error), + + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl OcrError { + pub fn is_recoverable(&self) -> bool { + matches!( + self, + OcrError::InsufficientMemory { .. } + | OcrError::OcrTimeout { .. } + | OcrError::LowConfidence { .. 
} + ) + } + + pub fn is_configuration_error(&self) -> bool { + matches!( + self, + OcrError::TesseractNotInstalled + | OcrError::LanguageDataNotFound { .. } + | OcrError::TessdataPathInvalid { .. } + | OcrError::MissingCpuInstruction { .. } + ) + } + + pub fn error_code(&self) -> &'static str { + match self { + OcrError::TesseractNotInstalled => "OCR_NOT_INSTALLED", + OcrError::LanguageDataNotFound { .. } => "OCR_LANG_MISSING", + OcrError::TessdataPathInvalid { .. } => "OCR_DATA_PATH_INVALID", + OcrError::InsufficientMemory { .. } => "OCR_OUT_OF_MEMORY", + OcrError::MissingCpuInstruction { .. } => "OCR_CPU_UNSUPPORTED", + OcrError::ImageTooLarge { .. } => "OCR_IMAGE_TOO_LARGE", + OcrError::InvalidImageFormat { .. } => "OCR_INVALID_FORMAT", + OcrError::OcrTimeout { .. } => "OCR_TIMEOUT", + OcrError::PermissionDenied { .. } => "OCR_PERMISSION_DENIED", + OcrError::InitializationFailed { .. } => "OCR_INIT_FAILED", + OcrError::LowConfidence { .. } => "OCR_LOW_CONFIDENCE", + OcrError::HardwareAccelerationUnavailable { .. 
} => "OCR_NO_HW_ACCEL",
+            OcrError::Io(_) => "OCR_IO_ERROR",
+            OcrError::Other(_) => "OCR_UNKNOWN_ERROR",
+        }
+    }
+}
+
+/// Point-in-time snapshot of the OCR environment, rendered through `Display`.
+#[derive(Debug, Clone)]
+pub struct OcrDiagnostics {
+    /// Version string reported by Tesseract; `None` is displayed as "Not installed".
+    pub tesseract_version: Option<String>,
+    /// Language codes for which trained data was found.
+    pub available_languages: Vec<String>,
+    /// Resolved tessdata directory; `None` is displayed as "Not set".
+    pub tessdata_path: Option<String>,
+    /// SIMD instruction sets detected on the host CPU.
+    pub cpu_features: CpuFeatures,
+    /// Available system memory, in megabytes.
+    pub memory_available_mb: u64,
+    /// Free space available for temporary files, in megabytes.
+    pub temp_space_available_mb: u64,
+}
+
+/// Presence flags for the x86 SIMD instruction sets the health checker probes.
+#[derive(Debug, Clone)]
+pub struct CpuFeatures {
+    pub sse2: bool,
+    pub sse3: bool,
+    pub sse4_1: bool,
+    pub sse4_2: bool,
+    pub avx: bool,
+    pub avx2: bool,
+}
+
+impl fmt::Display for OcrDiagnostics {
+    /// Writes a multi-line, human-readable report of every field, including
+    /// all six CPU feature flags (SSE3 and SSE4.2 were previously omitted).
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        writeln!(f, "OCR Diagnostics:")?;
+        writeln!(f, " Tesseract Version: {}", self.tesseract_version.as_deref().unwrap_or("Not installed"))?;
+        writeln!(f, " Tessdata Path: {}", self.tessdata_path.as_deref().unwrap_or("Not set"))?;
+        writeln!(f, " Available Languages: {}", self.available_languages.join(", "))?;
+        writeln!(f, " Memory Available: {} MB", self.memory_available_mb)?;
+        writeln!(f, " Temp Space: {} MB", self.temp_space_available_mb)?;
+        writeln!(f, " CPU Features:")?;
+        writeln!(f, " SSE2: {}", self.cpu_features.sse2)?;
+        writeln!(f, " SSE3: {}", self.cpu_features.sse3)?;
+        writeln!(f, " SSE4.1: {}", self.cpu_features.sse4_1)?;
+        writeln!(f, " SSE4.2: {}", self.cpu_features.sse4_2)?;
+        writeln!(f, " AVX: {}", self.cpu_features.avx)?;
+        writeln!(f, " AVX2: {}", self.cpu_features.avx2)?;
+        Ok(())
+    }
+}
\ No newline at end of file
diff --git a/src/ocr_health.rs b/src/ocr_health.rs
new file mode 100644
index 0000000..8b36757
--- /dev/null
+++ b/src/ocr_health.rs
@@ -0,0 +1,282 @@
+use crate::ocr_error::{CpuFeatures, OcrDiagnostics, OcrError};
+use std::process::Command;
+use std::env;
+use std::path::Path;
+use sysinfo::System;
+
+/// Stateless helper that probes the host for everything OCR needs:
+/// the Tesseract binary, trained language data, CPU features, memory and
+/// temporary disk space.
+#[derive(Debug, Clone, Copy, Default)]
+pub struct OcrHealthChecker;
+
+impl OcrHealthChecker {
+    /// Creates a new checker. The type holds no state, so this is free.
+    pub fn new() -> Self {
+        Self
+    }
+
+    /// Runs `tesseract --version` and returns the first line of its stdout
+    /// (or `"Unknown"` when stdout is empty).
+    ///
+    /// # Errors
+    ///
+    /// Returns [`OcrError::TesseractNotInstalled`] when the command cannot be
+    /// spawned or exits with a non-zero status.
+    pub fn check_tesseract_installation(&self) -> Result<String, OcrError> {
+        let output = Command::new("tesseract")
+            .arg("--version")
+            .output()
+            .map_err(|_| OcrError::TesseractNotInstalled)?;
+
+        if !output.status.success() {
+            
return Err(OcrError::TesseractNotInstalled);
+        }
+
+        let version_info = String::from_utf8_lossy(&output.stdout);
+        let version = version_info
+            .lines()
+            .next()
+            .map(|s| s.to_string())
+            .unwrap_or_else(|| "Unknown".to_string());
+
+        Ok(version)
+    }
+
+    /// Verifies that trained data exists for every language named in `lang`.
+    ///
+    /// `lang` may be a single code (`"eng"`) or a `+`-separated combination
+    /// (`"eng+fra"`), which is how Tesseract expects several languages to be
+    /// requested at once. Each component must have its own
+    /// `<code>.traineddata` file in the tessdata directory.
+    ///
+    /// # Errors
+    ///
+    /// * [`OcrError::TessdataPathInvalid`] when no tessdata directory can be resolved.
+    /// * [`OcrError::LanguageDataNotFound`] naming the first missing component,
+    ///   or the whole input when it contains no language code at all.
+    pub fn check_language_data(&self, lang: &str) -> Result<(), OcrError> {
+        let tessdata_path = self.get_tessdata_path()?;
+        let tessdata_dir = Path::new(&tessdata_path);
+
+        let mut components = lang.split('+').filter(|code| !code.is_empty()).peekable();
+
+        // An empty spec (or one made only of separators) names no language.
+        if components.peek().is_none() {
+            return Err(OcrError::LanguageDataNotFound {
+                lang: lang.to_string(),
+            });
+        }
+
+        for code in components {
+            let lang_path = tessdata_dir.join(format!("{}.traineddata", code));
+            if !lang_path.exists() {
+                // Report the specific component so the install hint in the
+                // error message points at the right package.
+                return Err(OcrError::LanguageDataNotFound {
+                    lang: code.to_string(),
+                });
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Resolves the directory that holds Tesseract's `*.traineddata` files.
+    ///
+    /// `TESSDATA_PREFIX` takes precedence when set; otherwise a list of
+    /// well-known install locations is probed in order.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`OcrError::TessdataPathInvalid`] when `TESSDATA_PREFIX` is set
+    /// but is not a directory, or when none of the known locations exist.
+    pub fn get_tessdata_path(&self) -> Result<String, OcrError> {
+        const COMMON_PATHS: [&str; 7] = [
+            "/usr/share/tesseract-ocr/4.00/tessdata",
+            "/usr/share/tesseract-ocr/5.00/tessdata",
+            "/usr/share/tesseract-ocr/5/tessdata",
+            "/usr/share/tessdata",
+            "/usr/local/share/tessdata",
+            "/opt/homebrew/share/tessdata",
+            "C:\\Program Files\\Tesseract-OCR\\tessdata",
+        ];
+
+        if let Ok(path) = env::var("TESSDATA_PREFIX") {
+            // An explicit override is authoritative: never fall through to the
+            // defaults when it is wrong, so misconfiguration is surfaced.
+            return if Path::new(&path).is_dir() {
+                Ok(path)
+            } else {
+                Err(OcrError::TessdataPathInvalid { path })
+            };
+        }
+
+        COMMON_PATHS
+            .iter()
+            .copied()
+            .find(|candidate| Path::new(candidate).is_dir())
+            .map(str::to_string)
+            .ok_or_else(|| OcrError::TessdataPathInvalid {
+                path: "No tessdata directory found".to_string(),
+            })
+    }
+
+    /// Lists the language codes that have trained data installed, sorted
+    /// alphabetically. Returns an empty list when the tessdata directory
+    /// cannot be resolved or read.
+    pub fn get_available_languages(&self) -> Vec<String> {
+        let tessdata_path = match self.get_tessdata_path() {
+            Ok(path) => path,
+            Err(_) => return Vec::new(),
+        };
+
+        let mut languages = Vec::new();
+        if let Ok(entries) = std::fs::read_dir(&tessdata_path) {
+            languages.extend(entries.flatten().filter_map(|entry| {
+                // `strip_suffix` removes the extension exactly once;
+                // `trim_end_matches` would strip it repeatedly.
+                entry
+                    .file_name()
+                    .to_str()?
+                    .strip_suffix(".traineddata")
+                    .map(str::to_string)
+            }));
+        }
+
+        languages.sort();
+        languages
+    }
+
+    /// Detects which SIMD instruction sets the host CPU supports.
+    /// On non-x86 targets every flag is reported as `false`.
+    pub fn check_cpu_features(&self) -> CpuFeatures {
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        {
+            use raw_cpuid::CpuId;
+            let cpuid = CpuId::new();
+
+            let features = 
cpuid.get_feature_info().map(|f| CpuFeatures {
+                sse2: f.has_sse2(),
+                sse3: f.has_sse3(),
+                sse4_1: f.has_sse41(),
+                sse4_2: f.has_sse42(),
+                avx: f.has_avx(),
+                avx2: cpuid.get_extended_feature_info()
+                    .map(|ef| ef.has_avx2())
+                    .unwrap_or(false),
+            }).unwrap_or_else(|| CpuFeatures {
+                sse2: false,
+                sse3: false,
+                sse4_1: false,
+                sse4_2: false,
+                avx: false,
+                avx2: false,
+            });
+
+            features
+        }
+
+        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+        {
+            CpuFeatures {
+                sse2: false,
+                sse3: false,
+                sse4_1: false,
+                sse4_2: false,
+                avx: false,
+                avx2: false,
+            }
+        }
+    }
+
+    /// Returns the amount of memory currently available, in megabytes.
+    pub fn check_memory_available(&self) -> u64 {
+        // Only memory statistics are needed, so start from an empty `System`
+        // instead of `new_all()`, which also enumerates processes, CPUs, etc.
+        let mut sys = System::new();
+        sys.refresh_memory();
+        sys.available_memory() / (1024 * 1024) // Convert to MB
+    }
+
+    /// Returns the free space available to the temporary directory, in megabytes.
+    ///
+    /// The value is read from the disk whose mount point is the longest prefix
+    /// of the temp directory. This replaces a hand-declared `statvfs` FFI
+    /// binding whose struct layout did not match the platform's, so the real
+    /// call could write past the end of the buffer.
+    ///
+    /// If no mounted disk can be matched, the original best-effort probe is
+    /// used: try to write a 100 MB file and report a coarse estimate
+    /// (1000 MB on success, 50 MB on failure).
+    pub fn check_temp_space(&self) -> u64 {
+        use std::fs;
+
+        const BYTES_PER_MB: u64 = 1024 * 1024;
+
+        let temp_dir = env::temp_dir();
+        // Resolve symlinks so the prefix comparison sees the real mount path.
+        let resolved = temp_dir.canonicalize().unwrap_or_else(|_| temp_dir.clone());
+
+        let disks = sysinfo::Disks::new_with_refreshed_list();
+        let matched = disks
+            .list()
+            .iter()
+            .filter(|disk| resolved.starts_with(disk.mount_point()))
+            // The deepest mount point is the file system that actually backs the directory.
+            .max_by_key(|disk| disk.mount_point().as_os_str().len())
+            .map(|disk| disk.available_space() / BYTES_PER_MB);
+
+        if let Some(available_mb) = matched {
+            return available_mb;
+        }
+
+        // Fallback: estimate available space by checking if we can create a test file
+        let test_file = temp_dir.join(".ocr_space_test");
+        let test_size = 100 * 1024 * 1024; // 100MB test
+
+        let wrote = fs::write(&test_file, vec![0u8; test_size]).is_ok();
+        // Always attempt cleanup: a failed write can still leave a partial file behind.
+        let _ = fs::remove_file(&test_file);
+
+        if wrote {
+            // If we can write 100MB, assume at least 1GB is available
+            1000
+        } else {
+            // If we can't write 100MB, report low space
+            50
+        }
+    }
+
+    /// Checks that the CPU provides the instruction sets Tesseract needs.
+    ///
+    /// SSE2 is only meaningful on x86/x86_64. On other architectures feature
+    /// detection reports every flag as `false`, so the requirement is skipped
+    /// there instead of rejecting every non-x86 host.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`OcrError::MissingCpuInstruction`] on x86/x86_64 hosts without SSE2.
+    pub fn validate_cpu_requirements(&self) -> Result<(), OcrError> {
+        if !cfg!(any(target_arch = "x86", target_arch = "x86_64")) {
+            return Ok(());
+        }
+
+        let features = self.check_cpu_features();
+
+        // Tesseract 4.x+ requires at least SSE2
+        if !features.sse2 {
+            return Err(OcrError::MissingCpuInstruction {
+                instruction: "SSE2".to_string(),
+            });
+        }
+
+        Ok(())
+    }
+
+    /// Estimates the memory, in megabytes, needed to OCR an image of the given size.
+    ///
+    /// Rough model: 4 bytes per pixel (RGBA) times 3 processing buffers, plus a
+    /// 100 MB base overhead for Tesseract. Arithmetic saturates so extreme
+    /// dimensions yield a huge estimate instead of overflowing.
+    pub fn estimate_memory_requirement(&self, image_width: u32, image_height: u32) -> u64 {
+        let pixels = u64::from(image_width).saturating_mul(u64::from(image_height));
+        let image_memory = pixels.saturating_mul(4 * 3) / (1024 * 1024); // Convert to MB
+        image_memory.saturating_add(100)
+    }
+
+    /// Compares the estimated memory need for a `width` x `height` image
+    /// against the memory currently available.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`OcrError::InsufficientMemory`] (both values in MB) when the
+    /// estimate exceeds what is available.
+    pub fn validate_memory_for_image(&self, width: u32, height: u32) -> Result<(), OcrError> {
+        let required = self.estimate_memory_requirement(width, height);
+        let available = self.check_memory_available();
+
+        if required > available {
+            return Err(OcrError::InsufficientMemory { required, available });
+        }
+
+        Ok(())
+    }
+
+    /// Collects every probe into one [`OcrDiagnostics`] snapshot. Individual
+    /// probe failures are recorded as `None` / empty values, never as errors.
+    pub fn get_full_diagnostics(&self) -> OcrDiagnostics {
+        OcrDiagnostics {
+            tesseract_version: self.check_tesseract_installation().ok(),
+            available_languages: self.get_available_languages(),
+            tessdata_path: self.get_tessdata_path().ok(),
+            cpu_features: self.check_cpu_features(),
+            memory_available_mb: self.check_memory_available(),
+            temp_space_available_mb: self.check_temp_space(),
+        }
+    }
+
+    /// Runs every environment check and returns the diagnostics snapshot when
+    /// all of them pass, or the list of failures otherwise.
+    pub fn perform_full_health_check(&self) -> Result<OcrDiagnostics, Vec<OcrError>> {
+        let mut errors = Vec::new();
+
+        // Check Tesseract 
installation
+        if let Err(e) = self.check_tesseract_installation() {
+            errors.push(e);
+        }
+
+        // Check CPU requirements
+        if let Err(e) = self.validate_cpu_requirements() {
+            errors.push(e);
+        }
+
+        // Check the tessdata path first, and the English language data only
+        // when that succeeds: the language check resolves the same path
+        // internally, so running it after a path failure would just report
+        // the identical error a second time.
+        match self.get_tessdata_path() {
+            Ok(_) => {
+                if let Err(e) = self.check_language_data("eng") {
+                    errors.push(e);
+                }
+            }
+            Err(e) => errors.push(e),
+        }
+
+        if errors.is_empty() {
+            // Diagnostics are only returned on success, so gather them here
+            // rather than paying for the probes on the error path too.
+            Ok(self.get_full_diagnostics())
+        } else {
+            Err(errors)
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/ocr_tests.rs b/src/ocr_tests.rs
new file mode 100644
index 0000000..77e5083
--- /dev/null
+++ b/src/ocr_tests.rs
@@ -0,0 +1,215 @@
+#[cfg(test)]
+mod tests {
+    use super::super::*;
+    use crate::ocr_error::{OcrError, OcrDiagnostics, CpuFeatures};
+    use crate::ocr_health::OcrHealthChecker;
+    use crate::ocr_enhanced::EnhancedOcrService;
+    use std::env;
+    use tempfile::TempDir;
+    use std::fs;
+
+    #[test]
+    fn test_ocr_error_types() {
+        // Test error creation and properties
+        let err = OcrError::TesseractNotInstalled;
+        assert_eq!(err.error_code(), "OCR_NOT_INSTALLED");
+        assert!(!err.is_recoverable());
+        assert!(err.is_configuration_error());
+
+        let err = OcrError::InsufficientMemory { required: 1000, available: 500 };
+        assert_eq!(err.error_code(), "OCR_OUT_OF_MEMORY");
+        assert!(err.is_recoverable());
+        assert!(!err.is_configuration_error());
+
+        let err = OcrError::LanguageDataNotFound { lang: "deu".to_string() };
+        assert!(err.to_string().contains("deu"));
+        assert!(err.is_configuration_error());
+    }
+
+    #[test]
+    fn test_cpu_features_display() {
+        let features = CpuFeatures {
+            sse2: true,
+            sse3: true,
+            sse4_1: false,
+            sse4_2: false,
+            avx: false,
+            avx2: false,
+        };
+
+        let diag = OcrDiagnostics {
+            tesseract_version: Some("4.1.1".to_string()),
+            available_languages: vec!["eng".to_string(), "fra".to_string()],
+            tessdata_path: Some("/usr/share/tessdata".to_string()),
+            
cpu_features: features, + memory_available_mb: 8192, + temp_space_available_mb: 50000, + }; + + let display = format!("{}", diag); + assert!(display.contains("Tesseract Version: 4.1.1")); + assert!(display.contains("SSE2: true")); + assert!(display.contains("Available Languages: eng, fra")); + } + + #[test] + fn test_health_checker_cpu_validation() { + let checker = OcrHealthChecker::new(); + let features = checker.check_cpu_features(); + + // On x86/x64, we should at least detect the presence of CPU features + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + // Modern CPUs should have at least SSE2 + // Note: This might fail on very old hardware + if std::env::var("CI").is_err() { + // Only check in non-CI environments + let _ = checker.validate_cpu_requirements(); + } + } + } + + #[test] + fn test_memory_estimation() { + let checker = OcrHealthChecker::new(); + + // Test memory estimation for different image sizes + let small_image = checker.estimate_memory_requirement(640, 480); + let medium_image = checker.estimate_memory_requirement(1920, 1080); + let large_image = checker.estimate_memory_requirement(4096, 4096); + + // Small image should need less memory than large + assert!(small_image < medium_image); + assert!(medium_image < large_image); + + // Base overhead is 100MB + assert!(small_image >= 100); + } + + #[test] + fn test_temp_space_check() { + let checker = OcrHealthChecker::new(); + let space = checker.check_temp_space(); + + // Should return some positive value + assert!(space > 0); + } + + #[test] + fn test_tessdata_path_detection() { + let checker = OcrHealthChecker::new(); + + // Set a custom TESSDATA_PREFIX for testing + let temp_dir = TempDir::new().unwrap(); + env::set_var("TESSDATA_PREFIX", temp_dir.path()); + + match checker.get_tessdata_path() { + Ok(path) => assert_eq!(path, temp_dir.path().to_string_lossy()), + Err(e) => { + // Expected if the temp directory doesn't exist + match e { + OcrError::TessdataPathInvalid { .. 
} => (), + _ => panic!("Unexpected error type"), + } + } + } + + env::remove_var("TESSDATA_PREFIX"); + } + + #[test] + fn test_language_detection() { + let checker = OcrHealthChecker::new(); + + // Create a mock tessdata directory + let temp_dir = TempDir::new().unwrap(); + let tessdata_path = temp_dir.path().join("tessdata"); + fs::create_dir(&tessdata_path).unwrap(); + + // Create mock language files + fs::write(tessdata_path.join("eng.traineddata"), b"mock").unwrap(); + fs::write(tessdata_path.join("fra.traineddata"), b"mock").unwrap(); + fs::write(tessdata_path.join("deu.traineddata"), b"mock").unwrap(); + + env::set_var("TESSDATA_PREFIX", &tessdata_path); + + let languages = checker.get_available_languages(); + assert!(languages.contains(&"eng".to_string())); + assert!(languages.contains(&"fra".to_string())); + assert!(languages.contains(&"deu".to_string())); + assert_eq!(languages.len(), 3); + + // Test language validation + assert!(checker.check_language_data("eng").is_ok()); + assert!(checker.check_language_data("jpn").is_err()); + + env::remove_var("TESSDATA_PREFIX"); + } + + #[tokio::test] + async fn test_enhanced_ocr_timeout() { + let service = EnhancedOcrService::new() + .with_timeout(1); // 1 second timeout + + // This should timeout since no actual file exists + let result = service.extract_text_with_validation("/nonexistent/file.png", "eng").await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_enhanced_ocr_image_validation() { + let service = EnhancedOcrService::new() + .with_limits(100, 100); // Very small limit + + // Create a mock large image path + let result = service.extract_text_with_validation("/path/to/large/image.png", "eng").await; + assert!(result.is_err()); + } + + #[test] + fn test_error_recovery_classification() { + // Test which errors are considered recoverable + let recoverable_errors = vec![ + OcrError::InsufficientMemory { required: 1000, available: 500 }, + OcrError::OcrTimeout { seconds: 30 }, + 
OcrError::LowConfidence { score: 40.0, threshold: 60.0 }, + ]; + + for err in recoverable_errors { + assert!(err.is_recoverable(), "Error {:?} should be recoverable", err); + } + + let non_recoverable_errors = vec![ + OcrError::TesseractNotInstalled, + OcrError::LanguageDataNotFound { lang: "eng".to_string() }, + OcrError::MissingCpuInstruction { instruction: "SSE2".to_string() }, + OcrError::PermissionDenied { path: "/test".to_string() }, + ]; + + for err in non_recoverable_errors { + assert!(!err.is_recoverable(), "Error {:?} should not be recoverable", err); + } + } + + #[test] + fn test_image_size_validation() { + let checker = OcrHealthChecker::new(); + + // Assuming we have at least 100MB available + let available = checker.check_memory_available(); + if available > 100 { + // Small image should pass + assert!(checker.validate_memory_for_image(640, 480).is_ok()); + + // Extremely large image should fail + let result = checker.validate_memory_for_image(50000, 50000); + assert!(result.is_err()); + + if let Err(OcrError::InsufficientMemory { required, available }) = result { + assert!(required > available); + } else { + panic!("Expected InsufficientMemory error"); + } + } + } +} \ No newline at end of file