feat(server): implement better ocr failure and guardrails

This commit is contained in:
perf3ct 2025-06-14 22:13:04 +00:00
parent 003d90943c
commit 9fa45f8891
9 changed files with 1094 additions and 9 deletions

57
Cargo.lock generated
View File

@ -1290,7 +1290,7 @@ dependencies = [
"js-sys",
"log",
"wasm-bindgen",
"windows-core",
"windows-core 0.61.2",
]
[[package]]
@ -1876,6 +1876,15 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "ntapi"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4"
dependencies = [
"winapi",
]
[[package]]
name = "nu-ansi-term"
version = "0.46.0"
@ -2399,6 +2408,15 @@ version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f60fcc7d6849342eff22c4350c8b9a989ee8ceabc4b481253e8946b9fe83d684"
[[package]]
name = "raw-cpuid"
version = "11.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c6df7ab838ed27997ba19a4664507e6f82b41fe6e20be42929332156e5e85146"
dependencies = [
"bitflags 2.9.1",
]
[[package]]
name = "rawpointer"
version = "0.2.1"
@ -2445,15 +2463,18 @@ dependencies = [
"notify",
"pdf-extract",
"quick-xml",
"raw-cpuid",
"regex",
"reqwest",
"serde",
"serde_json",
"sqlx",
"sysinfo",
"tempfile",
"tesseract",
"testcontainers",
"testcontainers-modules",
"thiserror 1.0.69",
"tokio",
"tokio-util",
"tower 0.4.13",
@ -3272,6 +3293,21 @@ dependencies = [
"syn 2.0.102",
]
[[package]]
name = "sysinfo"
version = "0.30.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0a5b4ddaee55fb2bea2bf0e5000747e5f5c0de765e5a5ff87f4cd106439f4bb3"
dependencies = [
"cfg-if",
"core-foundation-sys",
"libc",
"ntapi",
"once_cell",
"rayon",
"windows",
]
[[package]]
name = "system-configuration"
version = "0.5.1"
@ -4053,6 +4089,25 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be"
dependencies = [
"windows-core 0.52.0",
"windows-targets 0.52.6",
]
[[package]]
name = "windows-core"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
dependencies = [
"windows-targets 0.52.6",
]
[[package]]
name = "windows-core"
version = "0.61.2"

View File

@ -36,6 +36,9 @@ tesseract = { version = "0.15", optional = true }
pdf-extract = { version = "0.7", optional = true }
image = { version = "0.24", features = ["png", "jpeg", "tiff", "bmp"], optional = true }
imageproc = { version = "0.23", optional = true }
thiserror = "1.0"
sysinfo = "0.30"
raw-cpuid = { version = "11", optional = true }
reqwest = { version = "0.11", features = ["json", "multipart"] }
quick-xml = { version = "0.31", features = ["serialize"] }
urlencoding = "2.1"
@ -48,7 +51,7 @@ utoipa-swagger-ui = { version = "6", features = ["axum"] }
[features]
default = ["ocr"]
ocr = ["tesseract", "pdf-extract", "image", "imageproc"]
ocr = ["tesseract", "pdf-extract", "image", "imageproc", "raw-cpuid"]
[dev-dependencies]
tempfile = "3"

View File

@ -6,7 +6,12 @@ pub mod enhanced_ocr;
pub mod file_service;
pub mod models;
pub mod ocr;
pub mod ocr_api;
pub mod ocr_enhanced;
pub mod ocr_error;
pub mod ocr_health;
pub mod ocr_queue;
pub mod ocr_tests;
pub mod routes;
pub mod seed;
pub mod watcher;

View File

@ -1,14 +1,20 @@
use anyhow::{anyhow, Result};
use std::path::Path;
use crate::ocr_error::OcrError;
use crate::ocr_health::OcrHealthChecker;
#[cfg(feature = "ocr")]
use tesseract::Tesseract;
pub struct OcrService;
pub struct OcrService {
health_checker: OcrHealthChecker,
}
impl OcrService {
pub fn new() -> Self {
Self
Self {
health_checker: OcrHealthChecker::new(),
}
}
pub async fn extract_text_from_image(&self, file_path: &str) -> Result<String> {
@ -18,17 +24,29 @@ impl OcrService {
pub async fn extract_text_from_image_with_lang(&self, file_path: &str, lang: &str) -> Result<String> {
#[cfg(feature = "ocr")]
{
let mut tesseract = Tesseract::new(None, Some(lang))?
// Perform health checks first
self.health_checker.check_tesseract_installation()
.map_err(|e| anyhow!(e))?;
self.health_checker.check_language_data(lang)
.map_err(|e| anyhow!(e))?;
let mut tesseract = Tesseract::new(None, Some(lang))
.map_err(|e| anyhow!(OcrError::InitializationFailed {
details: e.to_string()
}))?
.set_image(file_path)?;
let text = tesseract.get_text()?;
let text = tesseract.get_text()
.map_err(|e| anyhow!(OcrError::InitializationFailed {
details: format!("Failed to extract text: {}", e)
}))?;
Ok(text.trim().to_string())
}
#[cfg(not(feature = "ocr"))]
{
Err(anyhow!("OCR feature is disabled. Recompile with --features ocr"))
Err(anyhow!(OcrError::TesseractNotInstalled))
}
}
@ -44,7 +62,7 @@ impl OcrService {
#[cfg(not(feature = "ocr"))]
{
Err(anyhow!("OCR feature is disabled. Recompile with --features ocr"))
Err(anyhow!(OcrError::TesseractNotInstalled))
}
}
@ -66,7 +84,9 @@ impl OcrService {
if self.is_image_file(file_path) {
self.extract_text_from_image_with_lang(file_path, lang).await
} else {
Err(anyhow!("Unsupported file type for OCR: {}", mime_type))
Err(anyhow!(OcrError::InvalidImageFormat {
details: format!("Unsupported MIME type: {}", mime_type)
}))
}
}
}

129
src/ocr_api.rs Normal file
View File

@ -0,0 +1,129 @@
use crate::ocr_enhanced::EnhancedOcrService;
use crate::ocr_error::OcrError;
use crate::AppState;
use axum::{
extract::State,
http::StatusCode,
response::Json,
};
use serde::{Deserialize, Serialize};
/// JSON body returned by the `health_check` handler.
///
/// Declared `pub` because it appears in the signature of a public handler;
/// a private type there trips the `private_interfaces` lint (and was a hard
/// error on older compilers).
#[derive(Serialize)]
pub struct OcrHealthResponse {
    /// Either `"healthy"` or `"unhealthy"`.
    status: String,
    /// `false` only when a `TesseractNotInstalled` error was reported.
    tesseract_installed: bool,
    /// Language codes found in the tessdata directory (empty when unhealthy).
    available_languages: Vec<String>,
    /// Human-readable diagnostics dump.
    diagnostics: Option<String>,
    /// Display text of every failed health check.
    errors: Vec<String>,
}
/// JSON error body returned by the OCR handlers.
///
/// Declared `pub` because it appears in the signatures of public handlers.
#[derive(Serialize)]
pub struct OcrErrorResponse {
    /// Display text of the underlying error.
    error: String,
    /// Stable machine-readable code (see `OcrError::error_code`).
    error_code: String,
    /// Short hint for the caller, when one is available.
    details: Option<String>,
    /// Whether the error is one that `OcrError::is_recoverable` reports as recoverable.
    is_recoverable: bool,
}
/// JSON request body accepted by the `perform_ocr` handler.
///
/// Declared `pub` because it appears in the signature of a public handler.
#[derive(Deserialize)]
pub struct OcrRequest {
    /// Path of the image to run OCR on.
    file_path: String,
    /// Tesseract language code; the handler falls back to `"eng"` when absent.
    language: Option<String>,
    /// Whether to retry with reduced quality on recoverable errors
    /// (the handler treats an absent value as `true`).
    use_fallback: Option<bool>,
}
/// Reports the health of the OCR subsystem.
///
/// The handler always answers with `200 OK`; callers must inspect the
/// `status` field (`"healthy"` / `"unhealthy"`) and the `errors` list.
/// The error half of the return type is never produced here.
pub async fn health_check(
    State(_state): State<AppState>,
) -> Result<Json<OcrHealthResponse>, (StatusCode, Json<OcrErrorResponse>)> {
    let diagnostics = EnhancedOcrService::new().get_diagnostics().await;
    let health_checker = crate::ocr_health::OcrHealthChecker::new();

    let response = match health_checker.perform_full_health_check() {
        Ok(diag) => OcrHealthResponse {
            status: "healthy".to_string(),
            tesseract_installed: true,
            available_languages: diag.available_languages,
            diagnostics: Some(diagnostics),
            errors: vec![],
        },
        Err(errors) => OcrHealthResponse {
            status: "unhealthy".to_string(),
            // Installed unless one of the failures says otherwise.
            tesseract_installed: !errors
                .iter()
                .any(|e| matches!(e, OcrError::TesseractNotInstalled)),
            available_languages: vec![],
            diagnostics: Some(diagnostics),
            errors: errors.iter().map(|e| e.to_string()).collect(),
        },
    };

    Ok(Json(response))
}
/// Runs OCR on the file named in the request and returns the extracted text.
///
/// On success the body is `{"text": ..., "status": "success"}`. On failure an
/// `OcrError` is mapped to a specific HTTP status and hint; any other error
/// becomes a 500 with code `OCR_UNKNOWN_ERROR`.
///
/// NOTE(review): `file_path` is used exactly as sent by the client — confirm
/// callers are trusted or that the path is restricted elsewhere.
pub async fn perform_ocr(
    State(_state): State<AppState>,
    Json(request): Json<OcrRequest>,
) -> Result<Json<serde_json::Value>, (StatusCode, Json<OcrErrorResponse>)> {
    let service = EnhancedOcrService::new();
    let language = request.language.as_deref().unwrap_or("eng");

    // Fallback is opt-out: only an explicit `false` disables it.
    let outcome = match request.use_fallback {
        Some(false) => {
            service
                .extract_text_with_validation(&request.file_path, language)
                .await
        }
        _ => service.extract_with_fallback(&request.file_path, language).await,
    };

    let failure = match outcome {
        Ok(text) => {
            return Ok(Json(serde_json::json!({
                "text": text,
                "status": "success"
            })));
        }
        Err(failure) => failure,
    };

    let (status_code, body) = match failure.downcast_ref::<OcrError>() {
        Some(ocr_error) => {
            let (status_code, hint) = match ocr_error {
                OcrError::TesseractNotInstalled => {
                    (StatusCode::SERVICE_UNAVAILABLE, "Please install Tesseract OCR")
                }
                OcrError::LanguageDataNotFound { .. } => {
                    (StatusCode::BAD_REQUEST, "Language pack not installed")
                }
                OcrError::InsufficientMemory { .. } => {
                    (StatusCode::INSUFFICIENT_STORAGE, "Not enough memory")
                }
                OcrError::ImageTooLarge { .. } => {
                    (StatusCode::PAYLOAD_TOO_LARGE, "Image exceeds size limits")
                }
                OcrError::OcrTimeout { .. } => {
                    (StatusCode::REQUEST_TIMEOUT, "OCR operation timed out")
                }
                OcrError::PermissionDenied { .. } => {
                    (StatusCode::FORBIDDEN, "Cannot access file")
                }
                OcrError::InvalidImageFormat { .. } => {
                    (StatusCode::UNPROCESSABLE_ENTITY, "Invalid image format")
                }
                _ => (StatusCode::INTERNAL_SERVER_ERROR, "OCR processing failed"),
            };
            let body = OcrErrorResponse {
                error: ocr_error.to_string(),
                error_code: ocr_error.error_code().to_string(),
                details: Some(hint.to_string()),
                is_recoverable: ocr_error.is_recoverable(),
            };
            (status_code, body)
        }
        None => {
            let body = OcrErrorResponse {
                error: failure.to_string(),
                error_code: "OCR_UNKNOWN_ERROR".to_string(),
                details: None,
                is_recoverable: false,
            };
            (StatusCode::INTERNAL_SERVER_ERROR, body)
        }
    };

    Err((status_code, Json(body)))
}

247
src/ocr_enhanced.rs Normal file
View File

@ -0,0 +1,247 @@
use crate::ocr_error::OcrError;
use crate::ocr_health::OcrHealthChecker;
use anyhow::{anyhow, Result};
use image::DynamicImage;
use std::path::Path;
use std::time::{Duration, Instant};
use tokio::time::timeout;
#[cfg(feature = "ocr")]
use tesseract::{Tesseract, PageSegMode};
/// OCR front end that adds pre-flight checks, image limits, a timeout and a
/// confidence threshold around Tesseract.
pub struct EnhancedOcrService {
/// Performs the installation, CPU, language-data and memory checks.
health_checker: OcrHealthChecker,
/// Images wider than this (in pixels) are rejected as `ImageTooLarge`.
max_image_width: u32,
/// Images taller than this (in pixels) are rejected as `ImageTooLarge`.
max_image_height: u32,
/// How long to wait for an OCR run before reporting `OcrTimeout`.
ocr_timeout_seconds: u64,
/// Mean confidence below this value is reported as `LowConfidence`.
min_confidence_threshold: f32,
}
impl EnhancedOcrService {
/// Creates a service with the default limits: 10000x10000 pixels,
/// a 120 second timeout and a 60% minimum confidence.
pub fn new() -> Self {
    let health_checker = OcrHealthChecker::new();
    Self {
        health_checker,
        max_image_width: 10000,
        max_image_height: 10000,
        ocr_timeout_seconds: 120,
        min_confidence_threshold: 60.0,
    }
}
pub fn with_limits(mut self, max_width: u32, max_height: u32) -> Self {
self.max_image_width = max_width;
self.max_image_height = max_height;
self
}
pub fn with_timeout(mut self, seconds: u64) -> Self {
self.ocr_timeout_seconds = seconds;
self
}
/// Extracts text from `file_path` after validating the environment and image.
///
/// Steps: pre-flight checks (Tesseract, CPU, language data), image load and
/// dimension check, memory check for the image size, then OCR under the
/// configured timeout.
///
/// # Errors
/// Returns the first failing check or the OCR failure, wrapped in `anyhow`.
pub async fn extract_text_with_validation(&self, file_path: &str, lang: &str) -> Result<String> {
    self.preflight_checks(lang)?;

    // Only the dimensions are needed from here on. Dropping the decoded image
    // right away keeps its pixel buffer from being held for the whole OCR run
    // and from skewing the available-memory measurement below.
    let (width, height) = {
        let image = self.load_and_validate_image(file_path)?;
        (image.width(), image.height())
    };

    self.health_checker
        .validate_memory_for_image(width, height)
        .map_err(|e| anyhow!(e))?;

    self.perform_ocr_with_timeout(file_path, lang).await
}
/// Verifies, in order, that Tesseract is installed, the CPU is supported and
/// the language data for `lang` exists. Stops at the first failure.
fn preflight_checks(&self, lang: &str) -> Result<()> {
    let checker = &self.health_checker;

    if let Err(not_installed) = checker.check_tesseract_installation() {
        return Err(anyhow!(not_installed));
    }
    if let Err(unsupported_cpu) = checker.validate_cpu_requirements() {
        return Err(anyhow!(unsupported_cpu));
    }
    if let Err(missing_language) = checker.check_language_data(lang) {
        return Err(anyhow!(missing_language));
    }

    Ok(())
}
/// Opens the image at `file_path` and checks it against the size limits.
///
/// # Errors
/// - plain message when the path does not exist or is not a regular file
/// - `OcrError::PermissionDenied` when the file metadata cannot be read
/// - `OcrError::InvalidImageFormat` when the image cannot be decoded
/// - `OcrError::ImageTooLarge` when either dimension exceeds its limit
fn load_and_validate_image(&self, file_path: &str) -> Result<DynamicImage> {
    if !Path::new(file_path).exists() {
        return Err(anyhow!("File not found: {}", file_path));
    }

    let metadata = match std::fs::metadata(file_path) {
        Ok(metadata) => metadata,
        Err(_) => {
            let denied = OcrError::PermissionDenied {
                path: file_path.to_string(),
            };
            return Err(denied.into());
        }
    };
    if !metadata.is_file() {
        return Err(anyhow!("Path is not a file: {}", file_path));
    }

    let decoded = match image::open(file_path) {
        Ok(decoded) => decoded,
        Err(decode_error) => {
            let invalid = OcrError::InvalidImageFormat {
                details: decode_error.to_string(),
            };
            return Err(invalid.into());
        }
    };

    let (width, height) = (decoded.width(), decoded.height());
    let within_limits = width <= self.max_image_width && height <= self.max_image_height;
    if within_limits {
        Ok(decoded)
    } else {
        Err(OcrError::ImageTooLarge {
            width,
            height,
            max_width: self.max_image_width,
            max_height: self.max_image_height,
        }
        .into())
    }
}
/// Runs the blocking OCR routine on Tokio's blocking pool and waits for it
/// for at most `ocr_timeout_seconds`.
///
/// # Errors
/// - `OcrError::OcrTimeout` when the wait expires
/// - `"OCR task failed: ..."` when the blocking task could not be joined
/// - whatever the OCR routine itself returned
///
/// NOTE(review): the timeout only stops waiting; a `spawn_blocking` task that
/// has already started is not interrupted — confirm this is acceptable.
async fn perform_ocr_with_timeout(&self, file_path: &str, lang: &str) -> Result<String> {
    let wait_limit = Duration::from_secs(self.ocr_timeout_seconds);
    let confidence_floor = self.min_confidence_threshold;
    let (owned_path, owned_lang) = (file_path.to_string(), lang.to_string());

    let worker = tokio::task::spawn_blocking(move || {
        Self::perform_ocr_internal(&owned_path, &owned_lang, confidence_floor)
    });

    let joined = match timeout(wait_limit, worker).await {
        Ok(joined) => joined,
        Err(_) => {
            let timed_out = OcrError::OcrTimeout {
                seconds: self.ocr_timeout_seconds,
            };
            return Err(timed_out.into());
        }
    };

    match joined {
        Ok(ocr_result) => ocr_result,
        Err(e) => Err(anyhow!("OCR task failed: {}", e)),
    }
}
/// Blocking OCR routine: initialises Tesseract for `lang`, reads `file_path`,
/// and returns the trimmed text.
///
/// # Errors
/// - `OcrError::InitializationFailed` when Tesseract cannot be created or
///   text extraction fails
/// - `OcrError::InvalidImageFormat` when Tesseract rejects the image
/// - `OcrError::LowConfidence` when the mean confidence is below
///   `min_confidence` (the threshold is truncated to an integer for the comparison)
/// - the `set_variable` error when a tuning variable is rejected
#[cfg(feature = "ocr")]
fn perform_ocr_internal(file_path: &str, lang: &str, min_confidence: f32) -> Result<String> {
let start_time = Instant::now();
// Initialize Tesseract with error handling
let mut tesseract = Tesseract::new(None, Some(lang))
.map_err(|e| OcrError::InitializationFailed {
details: e.to_string()
})?;
// Automatic page segmentation plus two fixed tuning variables
tesseract.set_page_seg_mode(PageSegMode::PsmAuto);
let mut tesseract = tesseract
.set_variable("tessedit_do_invert", "0")?
.set_variable("edges_max_children_per_outline", "40")?;
// Extra variables are applied only when OCR_LOW_MEMORY_MODE is exactly "true"
// (the local is named `available_mem` but holds the env var's value).
// NOTE(review): `cube_debug_level` looks like a setting of the old Cube engine —
// confirm the installed Tesseract still accepts it, since a rejected variable
// aborts the whole OCR run here.
if let Ok(available_mem) = std::env::var("OCR_LOW_MEMORY_MODE") {
if available_mem == "true" {
tesseract = tesseract
.set_variable("textord_heavy_nr", "0")?
.set_variable("cube_debug_level", "0")?;
}
}
tesseract = tesseract.set_image(file_path)
.map_err(|e| OcrError::InvalidImageFormat {
details: e.to_string()
})?;
// Extract the text; failures reuse the InitializationFailed variant
let text = tesseract.get_text()
.map_err(|e| OcrError::InitializationFailed {
details: e.to_string()
})?;
// Reject the result when the mean confidence is below the threshold
let confidence = tesseract.mean_text_conf();
if confidence < min_confidence as i32 {
return Err(OcrError::LowConfidence {
score: confidence as f32,
threshold: min_confidence
}.into());
}
let elapsed = start_time.elapsed();
tracing::info!("OCR completed in {:?} with confidence: {}%", elapsed, confidence);
Ok(text.trim().to_string())
}
/// Stand-in used when the crate is built without the `ocr` feature:
/// always fails with a message telling the user to enable it.
#[cfg(not(feature = "ocr"))]
fn perform_ocr_internal(_file_path: &str, _lang: &str, _min_confidence: f32) -> Result<String> {
Err(anyhow!("OCR feature is disabled. Recompile with --features ocr"))
}
/// Extracts text, retrying once on a downscaled copy of the image when the
/// first attempt fails with an error that `OcrError::is_recoverable` accepts.
///
/// Any other failure (including errors that are not an `OcrError`) is
/// returned unchanged.
pub async fn extract_with_fallback(&self, file_path: &str, lang: &str) -> Result<String> {
    let primary_error = match self.extract_text_with_validation(file_path, lang).await {
        Ok(text) => return Ok(text),
        Err(error) => error,
    };

    let worth_retrying = primary_error
        .downcast_ref::<OcrError>()
        .map_or(false, |ocr_error| ocr_error.is_recoverable());

    if worth_retrying {
        self.extract_with_reduced_quality(file_path, lang).await
    } else {
        Err(primary_error)
    }
}
/// Retries OCR on a downscaled PNG copy of the image.
///
/// The copy is written next to the source file under a name that is unique
/// per process and per call, so concurrent requests for the same file cannot
/// overwrite or delete each other's copy, and an unrelated file that happens
/// to be called `<file>_resized.png` is never clobbered. The copy is removed
/// afterwards, including when saving it fails part-way.
///
/// # Errors
/// Returns image load/validation errors, `"Failed to save resized image"`,
/// or the OCR error from the retry.
async fn extract_with_reduced_quality(&self, file_path: &str, lang: &str) -> Result<String> {
    // Distinguishes temp files created by concurrent calls in this process.
    static NEXT_TEMP_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);

    let image = self.load_and_validate_image(file_path)?;
    let resized = self.resize_for_ocr(image);

    let temp_id = NEXT_TEMP_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
    // Keep the `.png` extension: the encoder is chosen from it.
    let temp_path = format!(
        "{}.{}-{}_resized.png",
        file_path,
        std::process::id(),
        temp_id
    );

    let saved = resized.save(&temp_path);
    // The pixels are on disk (or the save failed); free them before OCR runs.
    drop(resized);
    if let Err(e) = saved {
        let _ = std::fs::remove_file(&temp_path);
        return Err(anyhow!("Failed to save resized image: {}", e));
    }

    let result = self.perform_ocr_with_timeout(&temp_path, lang).await;

    // Best-effort cleanup; the OCR result matters more than a stray temp file.
    let _ = std::fs::remove_file(&temp_path);
    result
}
/// Downscales `image` so that its longer side is at most 2000 pixels,
/// preserving the aspect ratio. Smaller images are returned unchanged.
fn resize_for_ocr(&self, image: DynamicImage) -> DynamicImage {
    // Target size of the longer side in low-memory mode.
    let max_dimension: u32 = 2000;
    let (width, height) = (image.width(), image.height());

    if width <= max_dimension && height <= max_dimension {
        return image;
    }

    let scale = max_dimension as f32 / width.max(height) as f32;
    // Truncation can produce 0 for very elongated images (e.g. 10000x2);
    // a zero bound would collapse the whole image, so clamp to one pixel.
    let new_width = ((width as f32 * scale) as u32).max(1);
    let new_height = ((height as f32 * scale) as u32).max(1);

    image.resize(new_width, new_height, image::imageops::FilterType::Lanczos3)
}
/// Returns the health checker's full diagnostics rendered as text.
pub async fn get_diagnostics(&self) -> String {
    self.health_checker.get_full_diagnostics().to_string()
}
}

129
src/ocr_error.rs Normal file
View File

@ -0,0 +1,129 @@
use std::fmt;
use thiserror::Error;
/// Errors reported by the OCR services and health checks.
#[derive(Error, Debug)]
pub enum OcrError {
/// The `tesseract` executable could not be run successfully.
#[error("Tesseract is not installed on the system")]
TesseractNotInstalled,
/// No `.traineddata` file exists for the requested language.
#[error("Tesseract language data not found for '{lang}'. Please install tesseract-ocr-{lang}")]
LanguageDataNotFound { lang: String },
/// The tessdata directory is missing; `path` is the rejected value or a description.
#[error("TESSDATA_PREFIX environment variable not set or invalid: {path}")]
TessdataPathInvalid { path: String },
/// The estimated memory need (MB) exceeds the available memory (MB).
#[error("Insufficient memory for OCR operation. Required: {required}MB, Available: {available}MB")]
InsufficientMemory { required: u64, available: u64 },
/// A required CPU instruction set is not available.
#[error("CPU instruction set missing: {instruction}. Tesseract requires {instruction} support")]
MissingCpuInstruction { instruction: String },
/// The image dimensions exceed the configured limits.
#[error("Image too large for OCR. Max dimensions: {max_width}x{max_height}, Actual: {width}x{height}")]
ImageTooLarge {
width: u32,
height: u32,
max_width: u32,
max_height: u32,
},
/// The image could not be decoded or was rejected by Tesseract.
#[error("Invalid image format or corrupted image: {details}")]
InvalidImageFormat { details: String },
/// OCR did not finish within the allowed number of seconds.
#[error("OCR timeout after {seconds} seconds. Consider reducing image size or quality")]
OcrTimeout { seconds: u64 },
/// The file's metadata could not be read.
#[error("Permission denied accessing file: {path}")]
PermissionDenied { path: String },
/// Tesseract could not be initialised (also used for text-extraction failures).
#[error("Tesseract initialization failed: {details}")]
InitializationFailed { details: String },
/// The mean recognition confidence is below the configured threshold.
#[error("OCR quality too low. Confidence score: {score}% (minimum: {threshold}%)")]
LowConfidence { score: f32, threshold: f32 },
/// Hardware acceleration was requested but is not available.
#[error("Hardware acceleration not available: {details}")]
HardwareAccelerationUnavailable { details: String },
/// Wrapped I/O error.
#[error(transparent)]
Io(#[from] std::io::Error),
/// Any other wrapped error.
#[error(transparent)]
Other(#[from] anyhow::Error),
}
impl OcrError {
pub fn is_recoverable(&self) -> bool {
matches!(
self,
OcrError::InsufficientMemory { .. }
| OcrError::OcrTimeout { .. }
| OcrError::LowConfidence { .. }
)
}
pub fn is_configuration_error(&self) -> bool {
matches!(
self,
OcrError::TesseractNotInstalled
| OcrError::LanguageDataNotFound { .. }
| OcrError::TessdataPathInvalid { .. }
| OcrError::MissingCpuInstruction { .. }
)
}
pub fn error_code(&self) -> &'static str {
match self {
OcrError::TesseractNotInstalled => "OCR_NOT_INSTALLED",
OcrError::LanguageDataNotFound { .. } => "OCR_LANG_MISSING",
OcrError::TessdataPathInvalid { .. } => "OCR_DATA_PATH_INVALID",
OcrError::InsufficientMemory { .. } => "OCR_OUT_OF_MEMORY",
OcrError::MissingCpuInstruction { .. } => "OCR_CPU_UNSUPPORTED",
OcrError::ImageTooLarge { .. } => "OCR_IMAGE_TOO_LARGE",
OcrError::InvalidImageFormat { .. } => "OCR_INVALID_FORMAT",
OcrError::OcrTimeout { .. } => "OCR_TIMEOUT",
OcrError::PermissionDenied { .. } => "OCR_PERMISSION_DENIED",
OcrError::InitializationFailed { .. } => "OCR_INIT_FAILED",
OcrError::LowConfidence { .. } => "OCR_LOW_CONFIDENCE",
OcrError::HardwareAccelerationUnavailable { .. } => "OCR_NO_HW_ACCEL",
OcrError::Io(_) => "OCR_IO_ERROR",
OcrError::Other(_) => "OCR_UNKNOWN_ERROR",
}
}
}
/// Snapshot of the host's OCR-related state, printable via `Display`.
#[derive(Debug, Clone)]
pub struct OcrDiagnostics {
/// Version line reported by Tesseract, or `None` when it is not installed.
pub tesseract_version: Option<String>,
/// Language codes for which trained data was found.
pub available_languages: Vec<String>,
/// Resolved tessdata directory, or `None` when none was found.
pub tessdata_path: Option<String>,
/// Detected CPU instruction-set extensions.
pub cpu_features: CpuFeatures,
/// Available memory in megabytes.
pub memory_available_mb: u64,
/// Free space in the temp directory in megabytes.
pub temp_space_available_mb: u64,
}
/// Presence flags for the CPU instruction-set extensions the health check inspects.
#[derive(Debug, Clone)]
pub struct CpuFeatures {
/// SSE2 is available.
pub sse2: bool,
/// SSE3 is available.
pub sse3: bool,
/// SSE4.1 is available.
pub sse4_1: bool,
/// SSE4.2 is available.
pub sse4_2: bool,
/// AVX is available.
pub avx: bool,
/// AVX2 is available.
pub avx2: bool,
}
/// Renders the diagnostics as a multi-line, human-readable report.
///
/// Every field of `CpuFeatures` is listed, so the report matches the data
/// that was collected (SSE3 and SSE4.2 used to be left out).
impl fmt::Display for OcrDiagnostics {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let cpu = &self.cpu_features;
        writeln!(f, "OCR Diagnostics:")?;
        writeln!(f, " Tesseract Version: {}", self.tesseract_version.as_deref().unwrap_or("Not installed"))?;
        writeln!(f, " Tessdata Path: {}", self.tessdata_path.as_deref().unwrap_or("Not set"))?;
        writeln!(f, " Available Languages: {}", self.available_languages.join(", "))?;
        writeln!(f, " Memory Available: {} MB", self.memory_available_mb)?;
        writeln!(f, " Temp Space: {} MB", self.temp_space_available_mb)?;
        writeln!(f, " CPU Features:")?;
        writeln!(f, " SSE2: {}", cpu.sse2)?;
        writeln!(f, " SSE3: {}", cpu.sse3)?;
        writeln!(f, " SSE4.1: {}", cpu.sse4_1)?;
        writeln!(f, " SSE4.2: {}", cpu.sse4_2)?;
        writeln!(f, " AVX: {}", cpu.avx)?;
        writeln!(f, " AVX2: {}", cpu.avx2)?;
        Ok(())
    }
}

282
src/ocr_health.rs Normal file
View File

@ -0,0 +1,282 @@
use crate::ocr_error::{CpuFeatures, OcrDiagnostics, OcrError};
use std::process::Command;
use std::env;
use std::path::Path;
use sysinfo::System;
/// Stateless helper that inspects the host for everything OCR needs:
/// the Tesseract executable, language data, CPU features, memory and temp space.
pub struct OcrHealthChecker;
impl OcrHealthChecker {
/// Creates a checker; the type carries no state.
pub fn new() -> Self {
Self
}
/// Runs `tesseract --version` and returns the first line of its standard
/// output (or `"Unknown"` when nothing was printed there).
///
/// # Errors
/// `OcrError::TesseractNotInstalled` when the command cannot be started or
/// exits unsuccessfully.
pub fn check_tesseract_installation(&self) -> Result<String, OcrError> {
    let output = match Command::new("tesseract").arg("--version").output() {
        Ok(output) if output.status.success() => output,
        _ => return Err(OcrError::TesseractNotInstalled),
    };

    let stdout = String::from_utf8_lossy(&output.stdout);
    let first_line = stdout.lines().next();

    Ok(first_line.map_or_else(|| "Unknown".to_string(), |line| line.to_string()))
}
/// Verifies that trained data exists for every language in `lang`.
///
/// `lang` may be a single code (`"eng"`) or a Tesseract multi-language
/// specification joined with `+` (`"eng+deu"`); each code must have a
/// `<code>.traineddata` file in the tessdata directory.
///
/// # Errors
/// - the error from `get_tessdata_path` when no tessdata directory is found
/// - `OcrError::LanguageDataNotFound` naming the first missing language
pub fn check_language_data(&self, lang: &str) -> Result<(), OcrError> {
    let tessdata_path = self.get_tessdata_path()?;
    let tessdata_dir = Path::new(&tessdata_path);

    for code in lang.split('+') {
        let lang_path = tessdata_dir.join(format!("{}.traineddata", code));
        if !lang_path.exists() {
            return Err(OcrError::LanguageDataNotFound {
                lang: code.to_string(),
            });
        }
    }

    Ok(())
}
/// Locates the tessdata directory.
///
/// `TESSDATA_PREFIX` wins when it is set: it is returned if it exists and
/// rejected otherwise (no fallback). Without the variable, a list of
/// well-known install locations is probed in order.
///
/// # Errors
/// `OcrError::TessdataPathInvalid` when the configured path does not exist
/// or none of the well-known locations is present.
pub fn get_tessdata_path(&self) -> Result<String, OcrError> {
    // Existing entries keep their precedence; distro locations that were
    // missing (Debian/Ubuntu Tesseract 5, Fedora, Arch) are appended.
    const COMMON_PATHS: [&str; 8] = [
        "/usr/share/tesseract-ocr/4.00/tessdata",
        "/usr/share/tesseract-ocr/5.00/tessdata",
        "/usr/local/share/tessdata",
        "/opt/homebrew/share/tessdata",
        "C:\\Program Files\\Tesseract-OCR\\tessdata",
        "/usr/share/tesseract-ocr/5/tessdata",
        "/usr/share/tesseract/tessdata",
        "/usr/share/tessdata",
    ];

    if let Ok(path) = env::var("TESSDATA_PREFIX") {
        return if Path::new(&path).exists() {
            Ok(path)
        } else {
            Err(OcrError::TessdataPathInvalid { path })
        };
    }

    COMMON_PATHS
        .iter()
        .copied()
        .find(|candidate| Path::new(candidate).exists())
        .map(|found| found.to_string())
        .ok_or_else(|| OcrError::TessdataPathInvalid {
            path: "No tessdata directory found".to_string(),
        })
}
/// Lists, in sorted order, the language codes that have a `.traineddata`
/// file in the tessdata directory.
///
/// Returns an empty list when the directory cannot be located or read;
/// unreadable entries and non-UTF-8 file names are skipped.
pub fn get_available_languages(&self) -> Vec<String> {
    let Ok(tessdata_path) = self.get_tessdata_path() else {
        return vec![];
    };

    let mut languages: Vec<String> = std::fs::read_dir(&tessdata_path)
        .into_iter()
        .flatten() // directory iterator, if it could be opened
        .flatten() // readable entries only
        .filter_map(|entry| {
            let file_name = entry.file_name();
            let name = file_name.to_str()?;
            if name.ends_with(".traineddata") {
                Some(name.trim_end_matches(".traineddata").to_string())
            } else {
                None
            }
        })
        .collect();

    languages.sort();
    languages
}
/// Detects which SIMD instruction-set extensions the current CPU offers.
///
/// On x86/x86_64 the standard library's runtime detection is used, so this
/// method compiles regardless of which optional crate features are enabled.
/// On every other architecture all flags are `false`.
pub fn check_cpu_features(&self) -> CpuFeatures {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        CpuFeatures {
            sse2: is_x86_feature_detected!("sse2"),
            sse3: is_x86_feature_detected!("sse3"),
            sse4_1: is_x86_feature_detected!("sse4.1"),
            sse4_2: is_x86_feature_detected!("sse4.2"),
            avx: is_x86_feature_detected!("avx"),
            avx2: is_x86_feature_detected!("avx2"),
        }
    }
    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
    {
        CpuFeatures {
            sse2: false,
            sse3: false,
            sse4_1: false,
            sse4_2: false,
            avx: false,
            avx2: false,
        }
    }
}
/// Returns the memory currently available on the host, in megabytes.
pub fn check_memory_available(&self) -> u64 {
    // Only memory figures are needed, so start from an empty `System` and
    // refresh just those instead of loading everything with `new_all()`.
    let mut sys = System::new();
    sys.refresh_memory();
    sys.available_memory() / (1024 * 1024) // bytes -> MB
}
pub fn check_temp_space(&self) -> u64 {
use std::fs;
let temp_dir = env::temp_dir();
// Try to get actual available space using statvfs on Unix-like systems
#[cfg(target_family = "unix")]
{
use std::mem;
#[repr(C)]
struct statvfs {
f_bsize: u64, // file system block size
f_frsize: u64, // fragment size
f_blocks: u64, // size of fs in f_frsize units
f_bfree: u64, // # free blocks
f_bavail: u64, // # free blocks for unprivileged users
f_files: u64, // # inodes
f_ffree: u64, // # free inodes
f_favail: u64, // # free inodes for unprivileged users
f_fsid: u64, // file system ID
f_flag: u64, // mount flags
f_namemax: u64, // maximum filename length
}
extern "C" {
fn statvfs(path: *const i8, buf: *mut statvfs) -> i32;
}
unsafe {
let mut buf: statvfs = mem::zeroed();
let path_cstr = format!("{}\0", temp_dir.display());
if statvfs(path_cstr.as_ptr() as *const i8, &mut buf) == 0 {
let available_bytes = buf.f_bavail * buf.f_frsize;
return available_bytes / (1024 * 1024); // Convert to MB
}
}
}
// Windows implementation
#[cfg(target_family = "windows")]
{
// For Windows, we'd need to use GetDiskFreeSpaceEx from winapi
// For now, try to estimate based on a test file write
}
// Fallback: Try to estimate available space by checking if we can create a test file
let test_file = temp_dir.join(".ocr_space_test");
let test_size = 100 * 1024 * 1024; // 100MB test
match fs::write(&test_file, vec![0u8; test_size]) {
Ok(_) => {
let _ = fs::remove_file(&test_file);
// If we can write 100MB, assume at least 1GB is available
1000
}
Err(_) => {
// If we can't write 100MB, report low space
50
}
}
}
/// Checks that the CPU offers the instruction sets OCR relies on.
///
/// The SSE2 requirement is only enforced on x86/x86_64. On other
/// architectures (for example ARM) SSE2 does not exist and
/// `check_cpu_features` reports every flag as `false`, so enforcing it there
/// would reject every such host.
///
/// # Errors
/// `OcrError::MissingCpuInstruction` when an x86 CPU lacks SSE2.
pub fn validate_cpu_requirements(&self) -> Result<(), OcrError> {
    if !cfg!(any(target_arch = "x86", target_arch = "x86_64")) {
        return Ok(());
    }

    if self.check_cpu_features().sse2 {
        Ok(())
    } else {
        Err(OcrError::MissingCpuInstruction {
            instruction: "SSE2".to_string(),
        })
    }
}
/// Estimates the memory, in megabytes, needed to OCR an image of the given size.
///
/// Rough model: 4 bytes per pixel (RGBA) times 3 processing buffers, plus a
/// fixed 100 MB overhead for Tesseract. The byte count saturates instead of
/// overflowing for absurdly large dimensions, so the estimate stays huge
/// rather than wrapping around to a small number.
pub fn estimate_memory_requirement(&self, image_width: u32, image_height: u32) -> u64 {
    const BYTES_PER_PIXEL: u64 = 4;
    const PROCESSING_BUFFERS: u64 = 3;
    const BASE_OVERHEAD_MB: u64 = 100;

    // The product of two u32 values always fits in a u64; the scaling may not.
    let pixels = (image_width as u64) * (image_height as u64);
    let image_bytes = pixels.saturating_mul(BYTES_PER_PIXEL * PROCESSING_BUFFERS);

    image_bytes / (1024 * 1024) + BASE_OVERHEAD_MB
}
/// Compares the estimated memory need for a `width` x `height` image with
/// the memory currently available.
///
/// # Errors
/// `OcrError::InsufficientMemory` (both figures in MB) when the estimate
/// exceeds what is available.
pub fn validate_memory_for_image(&self, width: u32, height: u32) -> Result<(), OcrError> {
    let required = self.estimate_memory_requirement(width, height);
    let available = self.check_memory_available();

    if available >= required {
        Ok(())
    } else {
        Err(OcrError::InsufficientMemory { required, available })
    }
}
/// Collects every diagnostic value in one snapshot. Failed probes show up as
/// `None` or an empty list rather than as errors.
pub fn get_full_diagnostics(&self) -> OcrDiagnostics {
    let tesseract_version = self.check_tesseract_installation().ok();
    let available_languages = self.get_available_languages();
    let tessdata_path = self.get_tessdata_path().ok();
    let cpu_features = self.check_cpu_features();
    let memory_available_mb = self.check_memory_available();
    let temp_space_available_mb = self.check_temp_space();

    OcrDiagnostics {
        tesseract_version,
        available_languages,
        tessdata_path,
        cpu_features,
        memory_available_mb,
        temp_space_available_mb,
    }
}
/// Runs every health check and returns either the diagnostics (all checks
/// passed) or the list of all failures.
///
/// Checks, in order: Tesseract installation, CPU requirements, tessdata
/// directory, English language data. The language check is skipped when the
/// tessdata directory is missing, because it would only repeat that same
/// error. Diagnostics are gathered only when they are actually returned.
pub fn perform_full_health_check(&self) -> Result<OcrDiagnostics, Vec<OcrError>> {
    let mut errors = Vec::new();

    if let Err(e) = self.check_tesseract_installation() {
        errors.push(e);
    }

    if let Err(e) = self.validate_cpu_requirements() {
        errors.push(e);
    }

    match self.get_tessdata_path() {
        Err(e) => errors.push(e),
        Ok(_) => {
            // Require at least the English language data.
            if let Err(e) = self.check_language_data("eng") {
                errors.push(e);
            }
        }
    }

    if errors.is_empty() {
        Ok(self.get_full_diagnostics())
    } else {
        Err(errors)
    }
}
}

215
src/ocr_tests.rs Normal file
View File

@ -0,0 +1,215 @@
#[cfg(test)]
mod tests {
    use super::super::*;
    use crate::ocr_error::{OcrError, OcrDiagnostics, CpuFeatures};
    use crate::ocr_health::OcrHealthChecker;
    use crate::ocr_enhanced::EnhancedOcrService;
    use std::env;
    use std::fs;
    use std::sync::{Mutex, MutexGuard};
    use tempfile::TempDir;

    /// Serialises the tests that modify `TESSDATA_PREFIX`. The test harness
    /// runs tests on several threads, and the environment is process-wide.
    static ENV_LOCK: Mutex<()> = Mutex::new(());

    /// Takes the environment lock, ignoring poisoning left by a failed test.
    fn lock_env() -> MutexGuard<'static, ()> {
        ENV_LOCK.lock().unwrap_or_else(|poisoned| poisoned.into_inner())
    }

    /// Error codes and classification flags of a few representative errors.
    #[test]
    fn test_ocr_error_types() {
        let err = OcrError::TesseractNotInstalled;
        assert_eq!(err.error_code(), "OCR_NOT_INSTALLED");
        assert!(!err.is_recoverable());
        assert!(err.is_configuration_error());

        let err = OcrError::InsufficientMemory { required: 1000, available: 500 };
        assert_eq!(err.error_code(), "OCR_OUT_OF_MEMORY");
        assert!(err.is_recoverable());
        assert!(!err.is_configuration_error());

        let err = OcrError::LanguageDataNotFound { lang: "deu".to_string() };
        assert!(err.to_string().contains("deu"));
        assert!(err.is_configuration_error());
    }

    /// The diagnostics report contains the version, CPU flags and languages.
    #[test]
    fn test_cpu_features_display() {
        let features = CpuFeatures {
            sse2: true,
            sse3: true,
            sse4_1: false,
            sse4_2: false,
            avx: false,
            avx2: false,
        };
        let diag = OcrDiagnostics {
            tesseract_version: Some("4.1.1".to_string()),
            available_languages: vec!["eng".to_string(), "fra".to_string()],
            tessdata_path: Some("/usr/share/tessdata".to_string()),
            cpu_features: features,
            memory_available_mb: 8192,
            temp_space_available_mb: 50000,
        };

        let display = format!("{}", diag);
        assert!(display.contains("Tesseract Version: 4.1.1"));
        assert!(display.contains("SSE2: true"));
        assert!(display.contains("Available Languages: eng, fra"));
    }

    /// Smoke test: CPU feature detection and validation run without panicking.
    #[test]
    fn test_health_checker_cpu_validation() {
        let checker = OcrHealthChecker::new();
        let _features = checker.check_cpu_features();

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            // The result is deliberately not asserted: very old hardware may
            // lack SSE2. Skipped entirely when the CI variable is set.
            if std::env::var("CI").is_err() {
                let _ = checker.validate_cpu_requirements();
            }
        }
    }

    /// Larger images need more memory; the base overhead is 100 MB.
    #[test]
    fn test_memory_estimation() {
        let checker = OcrHealthChecker::new();

        let small_image = checker.estimate_memory_requirement(640, 480);
        let medium_image = checker.estimate_memory_requirement(1920, 1080);
        let large_image = checker.estimate_memory_requirement(4096, 4096);

        assert!(small_image < medium_image);
        assert!(medium_image < large_image);
        assert!(small_image >= 100);
    }

    /// The temp-space probe reports a positive number of megabytes.
    #[test]
    fn test_temp_space_check() {
        let checker = OcrHealthChecker::new();
        let space = checker.check_temp_space();
        assert!(space > 0);
    }

    /// An existing `TESSDATA_PREFIX` directory is returned as the tessdata path.
    #[test]
    fn test_tessdata_path_detection() {
        let _env_guard = lock_env();
        let checker = OcrHealthChecker::new();

        let temp_dir = TempDir::new().unwrap();
        env::set_var("TESSDATA_PREFIX", temp_dir.path());

        let detected = checker.get_tessdata_path();
        // Restore the environment before asserting so a failure cannot leak it.
        env::remove_var("TESSDATA_PREFIX");

        match detected {
            Ok(path) => assert_eq!(path, temp_dir.path().to_string_lossy()),
            // Only acceptable if the directory vanished underneath the test.
            Err(OcrError::TessdataPathInvalid { .. }) => (),
            Err(_) => panic!("Unexpected error type"),
        }
    }

    /// Languages are discovered from `.traineddata` files in the tessdata directory.
    #[test]
    fn test_language_detection() {
        let _env_guard = lock_env();
        let checker = OcrHealthChecker::new();

        // Mock tessdata directory with three languages.
        let temp_dir = TempDir::new().unwrap();
        let tessdata_path = temp_dir.path().join("tessdata");
        fs::create_dir(&tessdata_path).unwrap();
        fs::write(tessdata_path.join("eng.traineddata"), b"mock").unwrap();
        fs::write(tessdata_path.join("fra.traineddata"), b"mock").unwrap();
        fs::write(tessdata_path.join("deu.traineddata"), b"mock").unwrap();

        env::set_var("TESSDATA_PREFIX", &tessdata_path);
        let languages = checker.get_available_languages();
        let eng_check = checker.check_language_data("eng");
        let jpn_check = checker.check_language_data("jpn");
        // Restore the environment before asserting so a failure cannot leak it.
        env::remove_var("TESSDATA_PREFIX");

        assert!(languages.contains(&"eng".to_string()));
        assert!(languages.contains(&"fra".to_string()));
        assert!(languages.contains(&"deu".to_string()));
        assert_eq!(languages.len(), 3);

        assert!(eng_check.is_ok());
        assert!(jpn_check.is_err());
    }

    /// A missing input file yields an error even with a short timeout configured.
    #[tokio::test]
    async fn test_enhanced_ocr_timeout() {
        let service = EnhancedOcrService::new()
            .with_timeout(1); // 1 second timeout

        // The file does not exist, so this fails during the pre-flight checks or
        // image loading; the timeout itself is never reached.
        let result = service.extract_text_with_validation("/nonexistent/file.png", "eng").await;
        assert!(result.is_err());
    }

    /// A missing input file yields an error when tight size limits are configured.
    #[tokio::test]
    async fn test_enhanced_ocr_image_validation() {
        let service = EnhancedOcrService::new()
            .with_limits(100, 100); // Very small limit

        // The path does not exist, so the error arises before any dimension check.
        let result = service.extract_text_with_validation("/path/to/large/image.png", "eng").await;
        assert!(result.is_err());
    }

    /// Memory, timeout and low-confidence errors are recoverable; setup and
    /// permission errors are not.
    #[test]
    fn test_error_recovery_classification() {
        let recoverable_errors = vec![
            OcrError::InsufficientMemory { required: 1000, available: 500 },
            OcrError::OcrTimeout { seconds: 30 },
            OcrError::LowConfidence { score: 40.0, threshold: 60.0 },
        ];
        for err in recoverable_errors {
            assert!(err.is_recoverable(), "Error {:?} should be recoverable", err);
        }

        let non_recoverable_errors = vec![
            OcrError::TesseractNotInstalled,
            OcrError::LanguageDataNotFound { lang: "eng".to_string() },
            OcrError::MissingCpuInstruction { instruction: "SSE2".to_string() },
            OcrError::PermissionDenied { path: "/test".to_string() },
        ];
        for err in non_recoverable_errors {
            assert!(!err.is_recoverable(), "Error {:?} should not be recoverable", err);
        }
    }

    /// Small images pass the memory check; an absurdly large one is rejected.
    #[test]
    fn test_image_size_validation() {
        let checker = OcrHealthChecker::new();

        // Only meaningful when the host has at least the 100 MB base overhead free.
        let available = checker.check_memory_available();
        if available > 100 {
            assert!(checker.validate_memory_for_image(640, 480).is_ok());

            // 10^12 pixels need roughly 11 TB by the estimate, more than any
            // realistic host has free (50000x50000 fits in ~29 GB, which
            // large machines do have).
            let result = checker.validate_memory_for_image(1_000_000, 1_000_000);
            assert!(result.is_err());

            if let Err(OcrError::InsufficientMemory { required, available }) = result {
                assert!(required > available);
            } else {
                panic!("Expected InsufficientMemory error");
            }
        }
    }
}