feat(server): implement better OCR failure handling and guardrails
This commit is contained in:
parent
003d90943c
commit
9fa45f8891
|
|
@ -1290,7 +1290,7 @@ dependencies = [
|
|||
"js-sys",
|
||||
"log",
|
||||
"wasm-bindgen",
|
||||
"windows-core",
|
||||
"windows-core 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -1876,6 +1876,15 @@ dependencies = [
|
|||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ntapi"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4"
|
||||
dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nu-ansi-term"
|
||||
version = "0.46.0"
|
||||
|
|
@ -2399,6 +2408,15 @@ version = "1.5.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f60fcc7d6849342eff22c4350c8b9a989ee8ceabc4b481253e8946b9fe83d684"
|
||||
|
||||
[[package]]
|
||||
name = "raw-cpuid"
|
||||
version = "11.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c6df7ab838ed27997ba19a4664507e6f82b41fe6e20be42929332156e5e85146"
|
||||
dependencies = [
|
||||
"bitflags 2.9.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rawpointer"
|
||||
version = "0.2.1"
|
||||
|
|
@ -2445,15 +2463,18 @@ dependencies = [
|
|||
"notify",
|
||||
"pdf-extract",
|
||||
"quick-xml",
|
||||
"raw-cpuid",
|
||||
"regex",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sqlx",
|
||||
"sysinfo",
|
||||
"tempfile",
|
||||
"tesseract",
|
||||
"testcontainers",
|
||||
"testcontainers-modules",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tower 0.4.13",
|
||||
|
|
@ -3272,6 +3293,21 @@ dependencies = [
|
|||
"syn 2.0.102",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sysinfo"
|
||||
version = "0.30.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0a5b4ddaee55fb2bea2bf0e5000747e5f5c0de765e5a5ff87f4cd106439f4bb3"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"core-foundation-sys",
|
||||
"libc",
|
||||
"ntapi",
|
||||
"once_cell",
|
||||
"rayon",
|
||||
"windows",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "system-configuration"
|
||||
version = "0.5.1"
|
||||
|
|
@ -4053,6 +4089,25 @@ version = "0.4.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||
|
||||
[[package]]
|
||||
name = "windows"
|
||||
version = "0.52.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be"
|
||||
dependencies = [
|
||||
"windows-core 0.52.0",
|
||||
"windows-targets 0.52.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-core"
|
||||
version = "0.52.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
|
||||
dependencies = [
|
||||
"windows-targets 0.52.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-core"
|
||||
version = "0.61.2"
|
||||
|
|
|
|||
|
|
@ -36,6 +36,9 @@ tesseract = { version = "0.15", optional = true }
|
|||
pdf-extract = { version = "0.7", optional = true }
|
||||
image = { version = "0.24", features = ["png", "jpeg", "tiff", "bmp"], optional = true }
|
||||
imageproc = { version = "0.23", optional = true }
|
||||
thiserror = "1.0"
|
||||
sysinfo = "0.30"
|
||||
raw-cpuid = { version = "11", optional = true }
|
||||
reqwest = { version = "0.11", features = ["json", "multipart"] }
|
||||
quick-xml = { version = "0.31", features = ["serialize"] }
|
||||
urlencoding = "2.1"
|
||||
|
|
@ -48,7 +51,7 @@ utoipa-swagger-ui = { version = "6", features = ["axum"] }
|
|||
|
||||
[features]
|
||||
default = ["ocr"]
|
||||
ocr = ["tesseract", "pdf-extract", "image", "imageproc"]
|
||||
ocr = ["tesseract", "pdf-extract", "image", "imageproc", "raw-cpuid"]
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3"
|
||||
|
|
|
|||
|
|
@ -6,7 +6,12 @@ pub mod enhanced_ocr;
|
|||
pub mod file_service;
|
||||
pub mod models;
|
||||
pub mod ocr;
|
||||
pub mod ocr_api;
|
||||
pub mod ocr_enhanced;
|
||||
pub mod ocr_error;
|
||||
pub mod ocr_health;
|
||||
pub mod ocr_queue;
|
||||
pub mod ocr_tests;
|
||||
pub mod routes;
|
||||
pub mod seed;
|
||||
pub mod watcher;
|
||||
|
|
|
|||
34
src/ocr.rs
34
src/ocr.rs
|
|
@ -1,14 +1,20 @@
|
|||
use anyhow::{anyhow, Result};
|
||||
use std::path::Path;
|
||||
use crate::ocr_error::OcrError;
|
||||
use crate::ocr_health::OcrHealthChecker;
|
||||
|
||||
#[cfg(feature = "ocr")]
|
||||
use tesseract::Tesseract;
|
||||
|
||||
pub struct OcrService;
|
||||
pub struct OcrService {
|
||||
health_checker: OcrHealthChecker,
|
||||
}
|
||||
|
||||
impl OcrService {
|
||||
pub fn new() -> Self {
|
||||
Self
|
||||
Self {
|
||||
health_checker: OcrHealthChecker::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn extract_text_from_image(&self, file_path: &str) -> Result<String> {
|
||||
|
|
@ -18,17 +24,29 @@ impl OcrService {
|
|||
pub async fn extract_text_from_image_with_lang(&self, file_path: &str, lang: &str) -> Result<String> {
|
||||
#[cfg(feature = "ocr")]
|
||||
{
|
||||
let mut tesseract = Tesseract::new(None, Some(lang))?
|
||||
// Perform health checks first
|
||||
self.health_checker.check_tesseract_installation()
|
||||
.map_err(|e| anyhow!(e))?;
|
||||
self.health_checker.check_language_data(lang)
|
||||
.map_err(|e| anyhow!(e))?;
|
||||
|
||||
let mut tesseract = Tesseract::new(None, Some(lang))
|
||||
.map_err(|e| anyhow!(OcrError::InitializationFailed {
|
||||
details: e.to_string()
|
||||
}))?
|
||||
.set_image(file_path)?;
|
||||
|
||||
let text = tesseract.get_text()?;
|
||||
let text = tesseract.get_text()
|
||||
.map_err(|e| anyhow!(OcrError::InitializationFailed {
|
||||
details: format!("Failed to extract text: {}", e)
|
||||
}))?;
|
||||
|
||||
Ok(text.trim().to_string())
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "ocr"))]
|
||||
{
|
||||
Err(anyhow!("OCR feature is disabled. Recompile with --features ocr"))
|
||||
Err(anyhow!(OcrError::TesseractNotInstalled))
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -44,7 +62,7 @@ impl OcrService {
|
|||
|
||||
#[cfg(not(feature = "ocr"))]
|
||||
{
|
||||
Err(anyhow!("OCR feature is disabled. Recompile with --features ocr"))
|
||||
Err(anyhow!(OcrError::TesseractNotInstalled))
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -66,7 +84,9 @@ impl OcrService {
|
|||
if self.is_image_file(file_path) {
|
||||
self.extract_text_from_image_with_lang(file_path, lang).await
|
||||
} else {
|
||||
Err(anyhow!("Unsupported file type for OCR: {}", mime_type))
|
||||
Err(anyhow!(OcrError::InvalidImageFormat {
|
||||
details: format!("Unsupported MIME type: {}", mime_type)
|
||||
}))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,129 @@
|
|||
use crate::ocr_enhanced::EnhancedOcrService;
|
||||
use crate::ocr_error::OcrError;
|
||||
use crate::AppState;
|
||||
use axum::{
|
||||
extract::State,
|
||||
http::StatusCode,
|
||||
response::Json,
|
||||
};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct OcrHealthResponse {
|
||||
status: String,
|
||||
tesseract_installed: bool,
|
||||
available_languages: Vec<String>,
|
||||
diagnostics: Option<String>,
|
||||
errors: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct OcrErrorResponse {
|
||||
error: String,
|
||||
error_code: String,
|
||||
details: Option<String>,
|
||||
is_recoverable: bool,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct OcrRequest {
|
||||
file_path: String,
|
||||
language: Option<String>,
|
||||
use_fallback: Option<bool>,
|
||||
}
|
||||
|
||||
pub async fn health_check(
|
||||
State(_state): State<AppState>,
|
||||
) -> Result<Json<OcrHealthResponse>, (StatusCode, Json<OcrErrorResponse>)> {
|
||||
let service = EnhancedOcrService::new();
|
||||
let diagnostics = service.get_diagnostics().await;
|
||||
|
||||
let health_checker = crate::ocr_health::OcrHealthChecker::new();
|
||||
|
||||
match health_checker.perform_full_health_check() {
|
||||
Ok(diag) => {
|
||||
Ok(Json(OcrHealthResponse {
|
||||
status: "healthy".to_string(),
|
||||
tesseract_installed: true,
|
||||
available_languages: diag.available_languages,
|
||||
diagnostics: Some(diagnostics),
|
||||
errors: vec![],
|
||||
}))
|
||||
}
|
||||
Err(errors) => {
|
||||
let error_messages: Vec<String> = errors.iter()
|
||||
.map(|e| e.to_string())
|
||||
.collect();
|
||||
|
||||
let _status_code = if errors.iter().any(|e| e.is_configuration_error()) {
|
||||
StatusCode::SERVICE_UNAVAILABLE
|
||||
} else {
|
||||
StatusCode::INTERNAL_SERVER_ERROR
|
||||
};
|
||||
|
||||
Ok(Json(OcrHealthResponse {
|
||||
status: "unhealthy".to_string(),
|
||||
tesseract_installed: errors.iter().all(|e| !matches!(e, OcrError::TesseractNotInstalled)),
|
||||
available_languages: vec![],
|
||||
diagnostics: Some(diagnostics),
|
||||
errors: error_messages,
|
||||
}))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn perform_ocr(
|
||||
State(_state): State<AppState>,
|
||||
Json(request): Json<OcrRequest>,
|
||||
) -> Result<Json<serde_json::Value>, (StatusCode, Json<OcrErrorResponse>)> {
|
||||
let service = EnhancedOcrService::new();
|
||||
let lang = request.language.as_deref().unwrap_or("eng");
|
||||
let use_fallback = request.use_fallback.unwrap_or(true);
|
||||
|
||||
let result = if use_fallback {
|
||||
service.extract_with_fallback(&request.file_path, lang).await
|
||||
} else {
|
||||
service.extract_text_with_validation(&request.file_path, lang).await
|
||||
};
|
||||
|
||||
match result {
|
||||
Ok(text) => Ok(Json(serde_json::json!({
|
||||
"text": text,
|
||||
"status": "success"
|
||||
}))),
|
||||
Err(e) => {
|
||||
if let Some(ocr_error) = e.downcast_ref::<OcrError>() {
|
||||
let (status_code, details) = match ocr_error {
|
||||
OcrError::TesseractNotInstalled => (StatusCode::SERVICE_UNAVAILABLE, "Please install Tesseract OCR"),
|
||||
OcrError::LanguageDataNotFound { .. } => (StatusCode::BAD_REQUEST, "Language pack not installed"),
|
||||
OcrError::InsufficientMemory { .. } => (StatusCode::INSUFFICIENT_STORAGE, "Not enough memory"),
|
||||
OcrError::ImageTooLarge { .. } => (StatusCode::PAYLOAD_TOO_LARGE, "Image exceeds size limits"),
|
||||
OcrError::OcrTimeout { .. } => (StatusCode::REQUEST_TIMEOUT, "OCR operation timed out"),
|
||||
OcrError::PermissionDenied { .. } => (StatusCode::FORBIDDEN, "Cannot access file"),
|
||||
OcrError::InvalidImageFormat { .. } => (StatusCode::UNPROCESSABLE_ENTITY, "Invalid image format"),
|
||||
_ => (StatusCode::INTERNAL_SERVER_ERROR, "OCR processing failed"),
|
||||
};
|
||||
|
||||
Err((
|
||||
status_code,
|
||||
Json(OcrErrorResponse {
|
||||
error: ocr_error.to_string(),
|
||||
error_code: ocr_error.error_code().to_string(),
|
||||
details: Some(details.to_string()),
|
||||
is_recoverable: ocr_error.is_recoverable(),
|
||||
}),
|
||||
))
|
||||
} else {
|
||||
Err((
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(OcrErrorResponse {
|
||||
error: e.to_string(),
|
||||
error_code: "OCR_UNKNOWN_ERROR".to_string(),
|
||||
details: None,
|
||||
is_recoverable: false,
|
||||
}),
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,247 @@
|
|||
use crate::ocr_error::OcrError;
|
||||
use crate::ocr_health::OcrHealthChecker;
|
||||
use anyhow::{anyhow, Result};
|
||||
use image::DynamicImage;
|
||||
use std::path::Path;
|
||||
use std::time::{Duration, Instant};
|
||||
use tokio::time::timeout;
|
||||
|
||||
#[cfg(feature = "ocr")]
|
||||
use tesseract::{Tesseract, PageSegMode};
|
||||
|
||||
pub struct EnhancedOcrService {
|
||||
health_checker: OcrHealthChecker,
|
||||
max_image_width: u32,
|
||||
max_image_height: u32,
|
||||
ocr_timeout_seconds: u64,
|
||||
min_confidence_threshold: f32,
|
||||
}
|
||||
|
||||
impl EnhancedOcrService {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
health_checker: OcrHealthChecker::new(),
|
||||
max_image_width: 10000,
|
||||
max_image_height: 10000,
|
||||
ocr_timeout_seconds: 120,
|
||||
min_confidence_threshold: 60.0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_limits(mut self, max_width: u32, max_height: u32) -> Self {
|
||||
self.max_image_width = max_width;
|
||||
self.max_image_height = max_height;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_timeout(mut self, seconds: u64) -> Self {
|
||||
self.ocr_timeout_seconds = seconds;
|
||||
self
|
||||
}
|
||||
|
||||
pub async fn extract_text_with_validation(&self, file_path: &str, lang: &str) -> Result<String> {
|
||||
// Perform pre-flight checks
|
||||
self.preflight_checks(lang)?;
|
||||
|
||||
// Load and validate image
|
||||
let image = self.load_and_validate_image(file_path)?;
|
||||
|
||||
// Check memory requirements
|
||||
let (width, height) = (image.width(), image.height());
|
||||
self.health_checker.validate_memory_for_image(width, height)
|
||||
.map_err(|e| anyhow!(e))?;
|
||||
|
||||
// Perform OCR with timeout
|
||||
let text = self.perform_ocr_with_timeout(file_path, lang).await?;
|
||||
|
||||
Ok(text)
|
||||
}
|
||||
|
||||
fn preflight_checks(&self, lang: &str) -> Result<()> {
|
||||
// Check Tesseract installation
|
||||
self.health_checker.check_tesseract_installation()
|
||||
.map_err(|e| anyhow!(e))?;
|
||||
|
||||
// Check CPU requirements
|
||||
self.health_checker.validate_cpu_requirements()
|
||||
.map_err(|e| anyhow!(e))?;
|
||||
|
||||
// Check language data
|
||||
self.health_checker.check_language_data(lang)
|
||||
.map_err(|e| anyhow!(e))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn load_and_validate_image(&self, file_path: &str) -> Result<DynamicImage> {
|
||||
// Check file permissions
|
||||
if !Path::new(file_path).exists() {
|
||||
return Err(anyhow!("File not found: {}", file_path));
|
||||
}
|
||||
|
||||
let metadata = std::fs::metadata(file_path)
|
||||
.map_err(|_| OcrError::PermissionDenied {
|
||||
path: file_path.to_string()
|
||||
})?;
|
||||
|
||||
if !metadata.is_file() {
|
||||
return Err(anyhow!("Path is not a file: {}", file_path));
|
||||
}
|
||||
|
||||
// Try to load image
|
||||
let image = image::open(file_path)
|
||||
.map_err(|e| OcrError::InvalidImageFormat {
|
||||
details: e.to_string()
|
||||
})?;
|
||||
|
||||
// Validate dimensions
|
||||
if image.width() > self.max_image_width || image.height() > self.max_image_height {
|
||||
return Err(OcrError::ImageTooLarge {
|
||||
width: image.width(),
|
||||
height: image.height(),
|
||||
max_width: self.max_image_width,
|
||||
max_height: self.max_image_height,
|
||||
}.into());
|
||||
}
|
||||
|
||||
Ok(image)
|
||||
}
|
||||
|
||||
async fn perform_ocr_with_timeout(&self, file_path: &str, lang: &str) -> Result<String> {
|
||||
let file_path = file_path.to_string();
|
||||
let lang = lang.to_string();
|
||||
let timeout_duration = Duration::from_secs(self.ocr_timeout_seconds);
|
||||
let min_confidence = self.min_confidence_threshold;
|
||||
|
||||
let ocr_future = tokio::task::spawn_blocking(move || {
|
||||
Self::perform_ocr_internal(&file_path, &lang, min_confidence)
|
||||
});
|
||||
|
||||
match timeout(timeout_duration, ocr_future).await {
|
||||
Ok(Ok(result)) => result,
|
||||
Ok(Err(e)) => Err(anyhow!("OCR task failed: {}", e)),
|
||||
Err(_) => Err(OcrError::OcrTimeout {
|
||||
seconds: self.ocr_timeout_seconds
|
||||
}.into()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Blocking OCR routine: initialises Tesseract for `lang`, applies the tuning
/// variables, recognises `file_path` and rejects results whose mean confidence
/// is below `min_confidence`.
///
/// Setting the environment variable `OCR_LOW_MEMORY_MODE` to `true` applies two
/// additional Tesseract variables.
///
/// NOTE(review): `cube_debug_level` looks like it belongs to the legacy "cube"
/// engine; if the installed Tesseract no longer knows it, `set_variable` would
/// presumably fail and abort low-memory runs — confirm against the Tesseract
/// version in use.
#[cfg(feature = "ocr")]
fn perform_ocr_internal(file_path: &str, lang: &str, min_confidence: f32) -> Result<String> {
    let started = Instant::now();

    let mut engine =
        Tesseract::new(None, Some(lang)).map_err(|e| OcrError::InitializationFailed {
            details: e.to_string(),
        })?;

    engine.set_page_seg_mode(PageSegMode::PsmAuto);

    // `set_variable` consumes and returns the engine, hence the rebinding.
    let mut engine = engine
        .set_variable("tessedit_do_invert", "0")?
        .set_variable("edges_max_children_per_outline", "40")?;

    let low_memory_mode = match std::env::var("OCR_LOW_MEMORY_MODE") {
        Ok(value) => value == "true",
        Err(_) => false,
    };
    if low_memory_mode {
        engine = engine
            .set_variable("textord_heavy_nr", "0")?
            .set_variable("cube_debug_level", "0")?;
    }

    let mut engine = engine
        .set_image(file_path)
        .map_err(|e| OcrError::InvalidImageFormat {
            details: e.to_string(),
        })?;

    let text = engine
        .get_text()
        .map_err(|e| OcrError::InitializationFailed {
            details: e.to_string(),
        })?;

    let confidence = engine.mean_text_conf();
    if confidence < min_confidence as i32 {
        return Err(OcrError::LowConfidence {
            score: confidence as f32,
            threshold: min_confidence,
        }
        .into());
    }

    let elapsed = started.elapsed();
    tracing::info!("OCR completed in {:?} with confidence: {}%", elapsed, confidence);

    Ok(text.trim().to_string())
}
|
||||
|
||||
#[cfg(not(feature = "ocr"))]
|
||||
fn perform_ocr_internal(_file_path: &str, _lang: &str, _min_confidence: f32) -> Result<String> {
|
||||
Err(anyhow!("OCR feature is disabled. Recompile with --features ocr"))
|
||||
}
|
||||
|
||||
pub async fn extract_with_fallback(&self, file_path: &str, lang: &str) -> Result<String> {
|
||||
// Try primary extraction
|
||||
match self.extract_text_with_validation(file_path, lang).await {
|
||||
Ok(text) => Ok(text),
|
||||
Err(e) => {
|
||||
// Check if error is recoverable
|
||||
if let Some(ocr_error) = e.downcast_ref::<OcrError>() {
|
||||
if ocr_error.is_recoverable() {
|
||||
// Try with reduced quality settings
|
||||
self.extract_with_reduced_quality(file_path, lang).await
|
||||
} else {
|
||||
Err(e)
|
||||
}
|
||||
} else {
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn extract_with_reduced_quality(&self, file_path: &str, lang: &str) -> Result<String> {
|
||||
// Downsample image for lower memory usage
|
||||
let image = self.load_and_validate_image(file_path)?;
|
||||
let resized = self.resize_for_ocr(image);
|
||||
|
||||
// Save temporary resized image
|
||||
let temp_path = format!("{}_resized.png", file_path);
|
||||
resized.save(&temp_path)
|
||||
.map_err(|e| anyhow!("Failed to save resized image: {}", e))?;
|
||||
|
||||
// Try OCR on resized image
|
||||
let result = self.perform_ocr_with_timeout(&temp_path, lang).await;
|
||||
|
||||
// Clean up
|
||||
let _ = std::fs::remove_file(&temp_path);
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
fn resize_for_ocr(&self, image: DynamicImage) -> DynamicImage {
|
||||
let (width, height) = (image.width(), image.height());
|
||||
|
||||
// Target dimensions for low memory mode
|
||||
let max_dimension = 2000;
|
||||
|
||||
if width > max_dimension || height > max_dimension {
|
||||
let scale = max_dimension as f32 / width.max(height) as f32;
|
||||
let new_width = (width as f32 * scale) as u32;
|
||||
let new_height = (height as f32 * scale) as u32;
|
||||
|
||||
image.resize(new_width, new_height, image::imageops::FilterType::Lanczos3)
|
||||
} else {
|
||||
image
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn get_diagnostics(&self) -> String {
|
||||
let diagnostics = self.health_checker.get_full_diagnostics();
|
||||
format!("{}", diagnostics)
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,129 @@
|
|||
use std::fmt;
|
||||
use thiserror::Error;
|
||||
|
||||
/// Every failure the OCR subsystem can report.
///
/// The `#[error]` text is the user-facing message; `error_code()` yields the
/// matching machine-readable code.
#[derive(Error, Debug)]
pub enum OcrError {
    /// The Tesseract installation check failed.
    #[error("Tesseract is not installed on the system")]
    TesseractNotInstalled,

    /// No trained-data file exists for the requested language.
    #[error("Tesseract language data not found for '{lang}'. Please install tesseract-ocr-{lang}")]
    LanguageDataNotFound { lang: String },

    /// No usable tessdata directory could be located.
    #[error("TESSDATA_PREFIX environment variable not set or invalid: {path}")]
    TessdataPathInvalid { path: String },

    /// The estimated memory need (MB) exceeds what is available (MB).
    #[error("Insufficient memory for OCR operation. Required: {required}MB, Available: {available}MB")]
    InsufficientMemory { required: u64, available: u64 },

    /// A required CPU instruction set was not detected.
    #[error("CPU instruction set missing: {instruction}. Tesseract requires {instruction} support")]
    MissingCpuInstruction { instruction: String },

    /// The image exceeds the configured maximum dimensions.
    #[error("Image too large for OCR. Max dimensions: {max_width}x{max_height}, Actual: {width}x{height}")]
    ImageTooLarge {
        width: u32,
        height: u32,
        max_width: u32,
        max_height: u32,
    },

    /// The image could not be decoded.
    #[error("Invalid image format or corrupted image: {details}")]
    InvalidImageFormat { details: String },

    /// The OCR run did not finish within the allowed time.
    #[error("OCR timeout after {seconds} seconds. Consider reducing image size or quality")]
    OcrTimeout { seconds: u64 },

    /// The file could not be accessed.
    #[error("Permission denied accessing file: {path}")]
    PermissionDenied { path: String },

    /// Tesseract could not be initialised.
    #[error("Tesseract initialization failed: {details}")]
    InitializationFailed { details: String },

    /// The mean confidence of the result is below the configured threshold.
    #[error("OCR quality too low. Confidence score: {score}% (minimum: {threshold}%)")]
    LowConfidence { score: f32, threshold: f32 },

    /// Hardware acceleration is unavailable.
    #[error("Hardware acceleration not available: {details}")]
    HardwareAccelerationUnavailable { details: String },

    /// Wrapped I/O error.
    #[error(transparent)]
    Io(#[from] std::io::Error),

    /// Any other wrapped error.
    #[error(transparent)]
    Other(#[from] anyhow::Error),
}
|
||||
|
||||
impl OcrError {
|
||||
pub fn is_recoverable(&self) -> bool {
|
||||
matches!(
|
||||
self,
|
||||
OcrError::InsufficientMemory { .. }
|
||||
| OcrError::OcrTimeout { .. }
|
||||
| OcrError::LowConfidence { .. }
|
||||
)
|
||||
}
|
||||
|
||||
pub fn is_configuration_error(&self) -> bool {
|
||||
matches!(
|
||||
self,
|
||||
OcrError::TesseractNotInstalled
|
||||
| OcrError::LanguageDataNotFound { .. }
|
||||
| OcrError::TessdataPathInvalid { .. }
|
||||
| OcrError::MissingCpuInstruction { .. }
|
||||
)
|
||||
}
|
||||
|
||||
pub fn error_code(&self) -> &'static str {
|
||||
match self {
|
||||
OcrError::TesseractNotInstalled => "OCR_NOT_INSTALLED",
|
||||
OcrError::LanguageDataNotFound { .. } => "OCR_LANG_MISSING",
|
||||
OcrError::TessdataPathInvalid { .. } => "OCR_DATA_PATH_INVALID",
|
||||
OcrError::InsufficientMemory { .. } => "OCR_OUT_OF_MEMORY",
|
||||
OcrError::MissingCpuInstruction { .. } => "OCR_CPU_UNSUPPORTED",
|
||||
OcrError::ImageTooLarge { .. } => "OCR_IMAGE_TOO_LARGE",
|
||||
OcrError::InvalidImageFormat { .. } => "OCR_INVALID_FORMAT",
|
||||
OcrError::OcrTimeout { .. } => "OCR_TIMEOUT",
|
||||
OcrError::PermissionDenied { .. } => "OCR_PERMISSION_DENIED",
|
||||
OcrError::InitializationFailed { .. } => "OCR_INIT_FAILED",
|
||||
OcrError::LowConfidence { .. } => "OCR_LOW_CONFIDENCE",
|
||||
OcrError::HardwareAccelerationUnavailable { .. } => "OCR_NO_HW_ACCEL",
|
||||
OcrError::Io(_) => "OCR_IO_ERROR",
|
||||
OcrError::Other(_) => "OCR_UNKNOWN_ERROR",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Snapshot of everything relevant for troubleshooting the OCR environment.
#[derive(Debug, Clone)]
pub struct OcrDiagnostics {
    /// Version line reported by Tesseract, `None` when it is not installed.
    pub tesseract_version: Option<String>,
    /// Language codes for which trained data was found.
    pub available_languages: Vec<String>,
    /// Resolved tessdata directory, `None` when none was found.
    pub tessdata_path: Option<String>,
    /// Detected CPU instruction-set extensions.
    pub cpu_features: CpuFeatures,
    /// Available memory in MB.
    pub memory_available_mb: u64,
    /// Available space for temporary files in MB.
    pub temp_space_available_mb: u64,
}

/// CPU instruction-set extensions that were probed.
#[derive(Debug, Clone)]
pub struct CpuFeatures {
    pub sse2: bool,
    pub sse3: bool,
    pub sse4_1: bool,
    pub sse4_2: bool,
    pub avx: bool,
    pub avx2: bool,
}

/// Renders the snapshot as a multi-line, human-readable report.
///
/// Every probed CPU feature is listed; SSE3 and SSE4.2 were collected but
/// previously left out of the report.
impl fmt::Display for OcrDiagnostics {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let cpu = &self.cpu_features;

        writeln!(f, "OCR Diagnostics:")?;
        writeln!(f, " Tesseract Version: {}", self.tesseract_version.as_deref().unwrap_or("Not installed"))?;
        writeln!(f, " Tessdata Path: {}", self.tessdata_path.as_deref().unwrap_or("Not set"))?;
        writeln!(f, " Available Languages: {}", self.available_languages.join(", "))?;
        writeln!(f, " Memory Available: {} MB", self.memory_available_mb)?;
        writeln!(f, " Temp Space: {} MB", self.temp_space_available_mb)?;
        writeln!(f, " CPU Features:")?;
        writeln!(f, " SSE2: {}", cpu.sse2)?;
        writeln!(f, " SSE3: {}", cpu.sse3)?;
        writeln!(f, " SSE4.1: {}", cpu.sse4_1)?;
        writeln!(f, " SSE4.2: {}", cpu.sse4_2)?;
        writeln!(f, " AVX: {}", cpu.avx)?;
        writeln!(f, " AVX2: {}", cpu.avx2)?;
        Ok(())
    }
}
|
||||
|
|
@ -0,0 +1,282 @@
|
|||
use crate::ocr_error::{CpuFeatures, OcrDiagnostics, OcrError};
|
||||
use std::process::Command;
|
||||
use std::env;
|
||||
use std::path::Path;
|
||||
use sysinfo::System;
|
||||
|
||||
/// Stateless set of probes for the OCR environment: Tesseract installation,
/// language data, CPU features, available memory and temporary disk space.
pub struct OcrHealthChecker;
|
||||
|
||||
impl OcrHealthChecker {
|
||||
pub fn new() -> Self {
|
||||
Self
|
||||
}
|
||||
|
||||
pub fn check_tesseract_installation(&self) -> Result<String, OcrError> {
|
||||
let output = Command::new("tesseract")
|
||||
.arg("--version")
|
||||
.output()
|
||||
.map_err(|_| OcrError::TesseractNotInstalled)?;
|
||||
|
||||
if !output.status.success() {
|
||||
return Err(OcrError::TesseractNotInstalled);
|
||||
}
|
||||
|
||||
let version_info = String::from_utf8_lossy(&output.stdout);
|
||||
let version = version_info
|
||||
.lines()
|
||||
.next()
|
||||
.map(|s| s.to_string())
|
||||
.unwrap_or_else(|| "Unknown".to_string());
|
||||
|
||||
Ok(version)
|
||||
}
|
||||
|
||||
pub fn check_language_data(&self, lang: &str) -> Result<(), OcrError> {
|
||||
let tessdata_path = self.get_tessdata_path()?;
|
||||
let lang_file = format!("{}.traineddata", lang);
|
||||
let lang_path = Path::new(&tessdata_path).join(&lang_file);
|
||||
|
||||
if !lang_path.exists() {
|
||||
return Err(OcrError::LanguageDataNotFound {
|
||||
lang: lang.to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn get_tessdata_path(&self) -> Result<String, OcrError> {
|
||||
if let Ok(path) = env::var("TESSDATA_PREFIX") {
|
||||
if Path::new(&path).exists() {
|
||||
return Ok(path);
|
||||
} else {
|
||||
return Err(OcrError::TessdataPathInvalid { path });
|
||||
}
|
||||
}
|
||||
|
||||
let common_paths = vec![
|
||||
"/usr/share/tesseract-ocr/4.00/tessdata",
|
||||
"/usr/share/tesseract-ocr/5.00/tessdata",
|
||||
"/usr/local/share/tessdata",
|
||||
"/opt/homebrew/share/tessdata",
|
||||
"C:\\Program Files\\Tesseract-OCR\\tessdata",
|
||||
];
|
||||
|
||||
for path in common_paths {
|
||||
if Path::new(path).exists() {
|
||||
return Ok(path.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
Err(OcrError::TessdataPathInvalid {
|
||||
path: "No tessdata directory found".to_string(),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn get_available_languages(&self) -> Vec<String> {
|
||||
let tessdata_path = match self.get_tessdata_path() {
|
||||
Ok(path) => path,
|
||||
Err(_) => return vec![],
|
||||
};
|
||||
|
||||
let mut languages = vec![];
|
||||
if let Ok(entries) = std::fs::read_dir(&tessdata_path) {
|
||||
for entry in entries.flatten() {
|
||||
if let Some(name) = entry.file_name().to_str() {
|
||||
if name.ends_with(".traineddata") {
|
||||
let lang = name.trim_end_matches(".traineddata");
|
||||
languages.push(lang.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
languages.sort();
|
||||
languages
|
||||
}
|
||||
|
||||
pub fn check_cpu_features(&self) -> CpuFeatures {
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
{
|
||||
use raw_cpuid::CpuId;
|
||||
let cpuid = CpuId::new();
|
||||
|
||||
let features = cpuid.get_feature_info().map(|f| CpuFeatures {
|
||||
sse2: f.has_sse2(),
|
||||
sse3: f.has_sse3(),
|
||||
sse4_1: f.has_sse41(),
|
||||
sse4_2: f.has_sse42(),
|
||||
avx: f.has_avx(),
|
||||
avx2: cpuid.get_extended_feature_info()
|
||||
.map(|ef| ef.has_avx2())
|
||||
.unwrap_or(false),
|
||||
}).unwrap_or_else(|| CpuFeatures {
|
||||
sse2: false,
|
||||
sse3: false,
|
||||
sse4_1: false,
|
||||
sse4_2: false,
|
||||
avx: false,
|
||||
avx2: false,
|
||||
});
|
||||
|
||||
features
|
||||
}
|
||||
|
||||
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
|
||||
{
|
||||
CpuFeatures {
|
||||
sse2: false,
|
||||
sse3: false,
|
||||
sse4_1: false,
|
||||
sse4_2: false,
|
||||
avx: false,
|
||||
avx2: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn check_memory_available(&self) -> u64 {
|
||||
let mut sys = System::new_all();
|
||||
sys.refresh_memory();
|
||||
sys.available_memory() / (1024 * 1024) // Convert to MB
|
||||
}
|
||||
|
||||
pub fn check_temp_space(&self) -> u64 {
|
||||
use std::fs;
|
||||
|
||||
let temp_dir = env::temp_dir();
|
||||
|
||||
// Try to get actual available space using statvfs on Unix-like systems
|
||||
#[cfg(target_family = "unix")]
|
||||
{
|
||||
use std::mem;
|
||||
|
||||
#[repr(C)]
|
||||
struct statvfs {
|
||||
f_bsize: u64, // file system block size
|
||||
f_frsize: u64, // fragment size
|
||||
f_blocks: u64, // size of fs in f_frsize units
|
||||
f_bfree: u64, // # free blocks
|
||||
f_bavail: u64, // # free blocks for unprivileged users
|
||||
f_files: u64, // # inodes
|
||||
f_ffree: u64, // # free inodes
|
||||
f_favail: u64, // # free inodes for unprivileged users
|
||||
f_fsid: u64, // file system ID
|
||||
f_flag: u64, // mount flags
|
||||
f_namemax: u64, // maximum filename length
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
fn statvfs(path: *const i8, buf: *mut statvfs) -> i32;
|
||||
}
|
||||
|
||||
unsafe {
|
||||
let mut buf: statvfs = mem::zeroed();
|
||||
let path_cstr = format!("{}\0", temp_dir.display());
|
||||
|
||||
if statvfs(path_cstr.as_ptr() as *const i8, &mut buf) == 0 {
|
||||
let available_bytes = buf.f_bavail * buf.f_frsize;
|
||||
return available_bytes / (1024 * 1024); // Convert to MB
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Windows implementation
|
||||
#[cfg(target_family = "windows")]
|
||||
{
|
||||
// For Windows, we'd need to use GetDiskFreeSpaceEx from winapi
|
||||
// For now, try to estimate based on a test file write
|
||||
}
|
||||
|
||||
// Fallback: Try to estimate available space by checking if we can create a test file
|
||||
let test_file = temp_dir.join(".ocr_space_test");
|
||||
let test_size = 100 * 1024 * 1024; // 100MB test
|
||||
|
||||
match fs::write(&test_file, vec![0u8; test_size]) {
|
||||
Ok(_) => {
|
||||
let _ = fs::remove_file(&test_file);
|
||||
// If we can write 100MB, assume at least 1GB is available
|
||||
1000
|
||||
}
|
||||
Err(_) => {
|
||||
// If we can't write 100MB, report low space
|
||||
50
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn validate_cpu_requirements(&self) -> Result<(), OcrError> {
|
||||
let features = self.check_cpu_features();
|
||||
|
||||
// Tesseract 4.x+ requires at least SSE2
|
||||
if !features.sse2 {
|
||||
return Err(OcrError::MissingCpuInstruction {
|
||||
instruction: "SSE2".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn estimate_memory_requirement(&self, image_width: u32, image_height: u32) -> u64 {
|
||||
// Rough estimation: 4 bytes per pixel (RGBA) * 3 (for processing buffers)
|
||||
// Plus 100MB base overhead for Tesseract
|
||||
let pixels = (image_width as u64) * (image_height as u64);
|
||||
let image_memory = (pixels * 4 * 3) / (1024 * 1024); // Convert to MB
|
||||
image_memory + 100
|
||||
}
|
||||
|
||||
pub fn validate_memory_for_image(&self, width: u32, height: u32) -> Result<(), OcrError> {
|
||||
let required = self.estimate_memory_requirement(width, height);
|
||||
let available = self.check_memory_available();
|
||||
|
||||
if required > available {
|
||||
return Err(OcrError::InsufficientMemory { required, available });
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn get_full_diagnostics(&self) -> OcrDiagnostics {
|
||||
OcrDiagnostics {
|
||||
tesseract_version: self.check_tesseract_installation().ok(),
|
||||
available_languages: self.get_available_languages(),
|
||||
tessdata_path: self.get_tessdata_path().ok(),
|
||||
cpu_features: self.check_cpu_features(),
|
||||
memory_available_mb: self.check_memory_available(),
|
||||
temp_space_available_mb: self.check_temp_space(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Runs all mandatory checks and returns the diagnostics snapshot when every
/// one of them passes, or the complete list of problems otherwise.
///
/// Checked: Tesseract installation, CPU requirements, the tessdata directory
/// and — only when that directory was found — the English language data.
/// The language check used to run even without a tessdata directory, which
/// reported the same path error twice. Diagnostics are now collected only on
/// success, because they are discarded on failure.
pub fn perform_full_health_check(&self) -> Result<OcrDiagnostics, Vec<OcrError>> {
    let mut errors = Vec::new();

    if let Err(e) = self.check_tesseract_installation() {
        errors.push(e);
    }

    if let Err(e) = self.validate_cpu_requirements() {
        errors.push(e);
    }

    match self.get_tessdata_path() {
        Err(e) => errors.push(e),
        Ok(_) => {
            // English is the baseline language every installation must provide.
            if let Err(e) = self.check_language_data("eng") {
                errors.push(e);
            }
        }
    }

    if errors.is_empty() {
        Ok(self.get_full_diagnostics())
    } else {
        Err(errors)
    }
}
|
||||
}
|
||||
|
|
@ -0,0 +1,215 @@
|
|||
#[cfg(test)]
mod tests {
    use super::super::*;
    use crate::ocr_error::{OcrError, OcrDiagnostics, CpuFeatures};
    use crate::ocr_health::OcrHealthChecker;
    use crate::ocr_enhanced::EnhancedOcrService;
    use std::env;
    use std::ffi::{OsStr, OsString};
    use std::sync::{Mutex, MutexGuard};
    use tempfile::TempDir;
    use std::fs;

    /// Serializes tests that touch `TESSDATA_PREFIX`.
    ///
    /// Environment variables are process-global and the test harness runs
    /// tests on several threads, so unsynchronized writes would let one test
    /// observe (or clobber) another test's value.
    static TESSDATA_ENV_LOCK: Mutex<()> = Mutex::new(());

    /// Sets `TESSDATA_PREFIX` for the lifetime of the guard.
    ///
    /// On drop (including during a panic unwind) the previous value is
    /// restored, or the variable is removed if it was not set before.
    struct TessdataPrefixGuard {
        previous: Option<OsString>,
        // Held until the variable has been restored in `drop`.
        _lock: MutexGuard<'static, ()>,
    }

    impl TessdataPrefixGuard {
        fn set(value: impl AsRef<OsStr>) -> Self {
            // A poisoned lock only means another test panicked; the guarded
            // state is restored by that test's guard, so it is safe to proceed.
            let lock = TESSDATA_ENV_LOCK
                .lock()
                .unwrap_or_else(|poisoned| poisoned.into_inner());
            let previous = env::var_os("TESSDATA_PREFIX");
            env::set_var("TESSDATA_PREFIX", value);
            Self { previous, _lock: lock }
        }
    }

    impl Drop for TessdataPrefixGuard {
        fn drop(&mut self) {
            match self.previous.take() {
                Some(value) => env::set_var("TESSDATA_PREFIX", value),
                None => env::remove_var("TESSDATA_PREFIX"),
            }
        }
    }

    /// Error codes and classification flags match the error variant.
    #[test]
    fn test_ocr_error_types() {
        // Test error creation and properties
        let err = OcrError::TesseractNotInstalled;
        assert_eq!(err.error_code(), "OCR_NOT_INSTALLED");
        assert!(!err.is_recoverable());
        assert!(err.is_configuration_error());

        let err = OcrError::InsufficientMemory { required: 1000, available: 500 };
        assert_eq!(err.error_code(), "OCR_OUT_OF_MEMORY");
        assert!(err.is_recoverable());
        assert!(!err.is_configuration_error());

        let err = OcrError::LanguageDataNotFound { lang: "deu".to_string() };
        assert!(err.to_string().contains("deu"));
        assert!(err.is_configuration_error());
    }

    /// The `Display` output of the diagnostics includes version, CPU flags
    /// and the language list.
    #[test]
    fn test_cpu_features_display() {
        let features = CpuFeatures {
            sse2: true,
            sse3: true,
            sse4_1: false,
            sse4_2: false,
            avx: false,
            avx2: false,
        };

        let diag = OcrDiagnostics {
            tesseract_version: Some("4.1.1".to_string()),
            available_languages: vec!["eng".to_string(), "fra".to_string()],
            tessdata_path: Some("/usr/share/tessdata".to_string()),
            cpu_features: features,
            memory_available_mb: 8192,
            temp_space_available_mb: 50000,
        };

        let display = format!("{}", diag);
        assert!(display.contains("Tesseract Version: 4.1.1"));
        assert!(display.contains("SSE2: true"));
        assert!(display.contains("Available Languages: eng, fra"));
    }

    /// CPU validation must agree with the detected SSE2 flag.
    #[test]
    fn test_health_checker_cpu_validation() {
        let checker = OcrHealthChecker::new();
        let features = checker.check_cpu_features();

        // Whether SSE2 is present depends on the host, so instead of
        // demanding a particular outcome, require that validation succeeds
        // exactly when SSE2 is reported as available.
        assert_eq!(
            checker.validate_cpu_requirements().is_ok(),
            features.sse2,
            "CPU validation should succeed exactly when SSE2 is detected"
        );
    }

    /// Larger images need more memory, and the base overhead always applies.
    #[test]
    fn test_memory_estimation() {
        let checker = OcrHealthChecker::new();

        // Test memory estimation for different image sizes
        let small_image = checker.estimate_memory_requirement(640, 480);
        let medium_image = checker.estimate_memory_requirement(1920, 1080);
        let large_image = checker.estimate_memory_requirement(4096, 4096);

        // Small image should need less memory than large
        assert!(small_image < medium_image);
        assert!(medium_image < large_image);

        // Base overhead is 100MB
        assert!(small_image >= 100);
    }

    /// The temp-space probe reports a non-zero amount.
    #[test]
    fn test_temp_space_check() {
        let checker = OcrHealthChecker::new();
        let space = checker.check_temp_space();

        // Should return some positive value
        assert!(space > 0);
    }

    /// A `TESSDATA_PREFIX` override is either honoured or rejected as invalid.
    #[test]
    fn test_tessdata_path_detection() {
        let checker = OcrHealthChecker::new();

        // Set a custom TESSDATA_PREFIX for testing; restored when the guard drops.
        let temp_dir = TempDir::new().unwrap();
        let _env = TessdataPrefixGuard::set(temp_dir.path());

        match checker.get_tessdata_path() {
            Ok(path) => assert_eq!(path, temp_dir.path().to_string_lossy()),
            // The only acceptable failure is the checker rejecting the path.
            Err(OcrError::TessdataPathInvalid { .. }) => (),
            Err(_) => panic!("Unexpected error type"),
        }
    }

    /// Languages are discovered from `*.traineddata` files in the tessdata
    /// directory.
    #[test]
    fn test_language_detection() {
        let checker = OcrHealthChecker::new();

        // Create a mock tessdata directory
        let temp_dir = TempDir::new().unwrap();
        let tessdata_path = temp_dir.path().join("tessdata");
        fs::create_dir(&tessdata_path).unwrap();

        // Create mock language files
        fs::write(tessdata_path.join("eng.traineddata"), b"mock").unwrap();
        fs::write(tessdata_path.join("fra.traineddata"), b"mock").unwrap();
        fs::write(tessdata_path.join("deu.traineddata"), b"mock").unwrap();

        // Restored when the guard drops, even if an assertion below panics.
        let _env = TessdataPrefixGuard::set(&tessdata_path);

        let languages = checker.get_available_languages();
        assert!(languages.contains(&"eng".to_string()));
        assert!(languages.contains(&"fra".to_string()));
        assert!(languages.contains(&"deu".to_string()));
        assert_eq!(languages.len(), 3);

        // Test language validation
        assert!(checker.check_language_data("eng").is_ok());
        assert!(checker.check_language_data("jpn").is_err());
    }

    /// Extraction with a short timeout configured fails for a missing file.
    #[tokio::test]
    async fn test_enhanced_ocr_timeout() {
        let service = EnhancedOcrService::new()
            .with_timeout(1); // 1 second timeout

        // The input file does not exist, so extraction must fail. This does
        // not pin down which error is produced (timeout or otherwise).
        let result = service.extract_text_with_validation("/nonexistent/file.png", "eng").await;
        assert!(result.is_err());
    }

    /// Extraction with very small limits configured fails for a missing file.
    #[tokio::test]
    async fn test_enhanced_ocr_image_validation() {
        let service = EnhancedOcrService::new()
            .with_limits(100, 100); // Very small limit

        // The path does not exist, so extraction must fail. No real oversized
        // image is created here, so this does not prove the size limit itself.
        let result = service.extract_text_with_validation("/path/to/large/image.png", "eng").await;
        assert!(result.is_err());
    }

    /// Each error variant is classified as recoverable or not as expected.
    #[test]
    fn test_error_recovery_classification() {
        // Test which errors are considered recoverable
        let recoverable_errors = vec![
            OcrError::InsufficientMemory { required: 1000, available: 500 },
            OcrError::OcrTimeout { seconds: 30 },
            OcrError::LowConfidence { score: 40.0, threshold: 60.0 },
        ];

        for err in recoverable_errors {
            assert!(err.is_recoverable(), "Error {:?} should be recoverable", err);
        }

        let non_recoverable_errors = vec![
            OcrError::TesseractNotInstalled,
            OcrError::LanguageDataNotFound { lang: "eng".to_string() },
            OcrError::MissingCpuInstruction { instruction: "SSE2".to_string() },
            OcrError::PermissionDenied { path: "/test".to_string() },
        ];

        for err in non_recoverable_errors {
            assert!(!err.is_recoverable(), "Error {:?} should not be recoverable", err);
        }
    }

    /// Memory validation accepts images that fit and rejects ones that cannot.
    #[test]
    fn test_image_size_validation() {
        let checker = OcrHealthChecker::new();

        // Only expect the small image to pass when the host actually has
        // enough memory for its estimated requirement.
        let available = checker.check_memory_available();
        let small_required = checker.estimate_memory_requirement(640, 480);
        if available >= small_required {
            // Small image should pass
            assert!(checker.validate_memory_for_image(640, 480).is_ok());
        }

        // 10^9 x 10^9 pixels is estimated at roughly 10^13 MB, far beyond any
        // real machine, so this must fail regardless of the host. The byte
        // count (1.2 * 10^19) still fits in a u64.
        match checker.validate_memory_for_image(1_000_000_000, 1_000_000_000) {
            Err(OcrError::InsufficientMemory { required, available }) => {
                assert!(required > available);
            }
            other => panic!("Expected InsufficientMemory error, got {:?}", other),
        }
    }
}
|
||||
Loading…
Reference in New Issue