351 lines
12 KiB
Rust
351 lines
12 KiB
Rust
use crate::ocr::error::{CpuFeatures, OcrDiagnostics, OcrError};
|
|
use std::process::Command;
|
|
use std::env;
|
|
use sysinfo::System;
|
|
|
|
pub struct OcrHealthChecker;
|
|
|
|
impl OcrHealthChecker {
|
|
pub fn new() -> Self {
|
|
Self
|
|
}
|
|
|
|
pub fn check_tesseract_installation(&self) -> Result<String, OcrError> {
|
|
let output = Command::new("tesseract")
|
|
.arg("--version")
|
|
.output()
|
|
.map_err(|_| OcrError::TesseractNotInstalled)?;
|
|
|
|
if !output.status.success() {
|
|
return Err(OcrError::TesseractNotInstalled);
|
|
}
|
|
|
|
let version_info = String::from_utf8_lossy(&output.stdout);
|
|
let version = version_info
|
|
.lines()
|
|
.next()
|
|
.map(|s| s.to_string())
|
|
.unwrap_or_else(|| "Unknown".to_string());
|
|
|
|
Ok(version)
|
|
}
|
|
|
|
pub fn check_language_data(&self, lang: &str) -> Result<(), OcrError> {
|
|
// Use Tesseract's built-in language validation by attempting to list languages
|
|
let output = Command::new("tesseract")
|
|
.arg("--list-langs")
|
|
.output()
|
|
.map_err(|_| OcrError::TesseractNotInstalled)?;
|
|
|
|
if !output.status.success() {
|
|
return Err(OcrError::TesseractNotInstalled);
|
|
}
|
|
|
|
let langs_output = String::from_utf8_lossy(&output.stdout);
|
|
let available_langs: Vec<&str> = langs_output.lines().skip(1).collect(); // Skip first line "List of available languages:"
|
|
|
|
if !available_langs.contains(&lang) {
|
|
return Err(OcrError::LanguageDataNotFound {
|
|
lang: lang.to_string(),
|
|
});
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
|
|
pub fn get_available_languages(&self) -> Result<Vec<String>, OcrError> {
|
|
// Use Tesseract's built-in language listing
|
|
let output = Command::new("tesseract")
|
|
.arg("--list-langs")
|
|
.output()
|
|
.map_err(|_| OcrError::TesseractNotInstalled)?;
|
|
|
|
if !output.status.success() {
|
|
return Err(OcrError::TesseractNotInstalled);
|
|
}
|
|
|
|
let langs_output = String::from_utf8_lossy(&output.stdout);
|
|
let mut languages: Vec<String> = langs_output
|
|
.lines()
|
|
.skip(1) // Skip first line "List of available languages:"
|
|
.map(|s| s.trim().to_string())
|
|
.filter(|s| !s.is_empty())
|
|
.collect();
|
|
|
|
languages.sort();
|
|
Ok(languages)
|
|
}
|
|
|
|
pub fn validate_language(&self, lang: &str) -> Result<(), OcrError> {
|
|
// Check if language is supported
|
|
let available_languages = self.get_available_languages()?;
|
|
if !available_languages.contains(&lang.to_string()) {
|
|
return Err(OcrError::LanguageDataNotFound {
|
|
lang: lang.to_string(),
|
|
});
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
/// Validate a language combination (e.g., "eng+spa")
|
|
pub fn validate_language_combination(&self, lang_combination: &str) -> Result<(), OcrError> {
|
|
if lang_combination.is_empty() {
|
|
return Err(OcrError::LanguageDataNotFound {
|
|
lang: "empty".to_string(),
|
|
});
|
|
}
|
|
|
|
// Split by '+' to handle multi-language combinations
|
|
let languages: Vec<&str> = lang_combination.split('+').collect();
|
|
|
|
// Validate each language in the combination
|
|
for lang in &languages {
|
|
self.validate_language(lang.trim())?;
|
|
}
|
|
|
|
// Limit number of languages for performance (max 4)
|
|
if languages.len() > 4 {
|
|
return Err(OcrError::LanguageDataNotFound {
|
|
lang: format!("Too many languages in combination: {}. Maximum is 4.", languages.len()),
|
|
});
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Validate a list of preferred languages
|
|
pub fn validate_preferred_languages(&self, languages: &[String]) -> Result<(), OcrError> {
|
|
if languages.is_empty() {
|
|
return Err(OcrError::LanguageDataNotFound {
|
|
lang: "No languages provided".to_string(),
|
|
});
|
|
}
|
|
|
|
// Limit number of languages for performance
|
|
if languages.len() > 4 {
|
|
return Err(OcrError::LanguageDataNotFound {
|
|
lang: format!("Too many preferred languages: {}. Maximum is 4.", languages.len()),
|
|
});
|
|
}
|
|
|
|
// Validate each language
|
|
for lang in languages {
|
|
self.validate_language(lang)?;
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
pub fn get_language_display_name(&self, lang_code: &str) -> String {
|
|
match lang_code {
|
|
"eng" => "English".to_string(),
|
|
"spa" => "Spanish".to_string(),
|
|
"fra" => "French".to_string(),
|
|
"deu" => "German".to_string(),
|
|
"ita" => "Italian".to_string(),
|
|
"por" => "Portuguese".to_string(),
|
|
"rus" => "Russian".to_string(),
|
|
"chi_sim" => "Chinese (Simplified)".to_string(),
|
|
"chi_tra" => "Chinese (Traditional)".to_string(),
|
|
"jpn" => "Japanese".to_string(),
|
|
"kor" => "Korean".to_string(),
|
|
"ara" => "Arabic".to_string(),
|
|
"hin" => "Hindi".to_string(),
|
|
"nld" => "Dutch".to_string(),
|
|
"swe" => "Swedish".to_string(),
|
|
"nor" => "Norwegian".to_string(),
|
|
"dan" => "Danish".to_string(),
|
|
"fin" => "Finnish".to_string(),
|
|
"pol" => "Polish".to_string(),
|
|
"ces" => "Czech".to_string(),
|
|
"hun" => "Hungarian".to_string(),
|
|
"tur" => "Turkish".to_string(),
|
|
"tha" => "Thai".to_string(),
|
|
"vie" => "Vietnamese".to_string(),
|
|
_ => lang_code.to_string(), // Return the code itself for unknown languages
|
|
}
|
|
}
|
|
|
|
pub fn check_cpu_features(&self) -> CpuFeatures {
|
|
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
|
{
|
|
use raw_cpuid::CpuId;
|
|
let cpuid = CpuId::new();
|
|
|
|
let features = cpuid.get_feature_info().map(|f| CpuFeatures {
|
|
sse2: f.has_sse2(),
|
|
sse3: f.has_sse3(),
|
|
sse4_1: f.has_sse41(),
|
|
sse4_2: f.has_sse42(),
|
|
avx: f.has_avx(),
|
|
avx2: cpuid.get_extended_feature_info()
|
|
.map(|ef| ef.has_avx2())
|
|
.unwrap_or(false),
|
|
}).unwrap_or_else(|| CpuFeatures {
|
|
sse2: false,
|
|
sse3: false,
|
|
sse4_1: false,
|
|
sse4_2: false,
|
|
avx: false,
|
|
avx2: false,
|
|
});
|
|
|
|
features
|
|
}
|
|
|
|
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
|
|
{
|
|
CpuFeatures {
|
|
sse2: false,
|
|
sse3: false,
|
|
sse4_1: false,
|
|
sse4_2: false,
|
|
avx: false,
|
|
avx2: false,
|
|
}
|
|
}
|
|
}
|
|
|
|
pub fn check_memory_available(&self) -> u64 {
|
|
let mut sys = System::new_all();
|
|
sys.refresh_memory();
|
|
sys.available_memory() / (1024 * 1024) // Convert to MB
|
|
}
|
|
|
|
pub fn check_temp_space(&self) -> u64 {
|
|
use std::fs;
|
|
|
|
let temp_dir = env::temp_dir();
|
|
|
|
// Try to get actual available space using statvfs on Unix-like systems
|
|
#[cfg(target_family = "unix")]
|
|
{
|
|
use std::mem;
|
|
|
|
#[repr(C)]
|
|
struct statvfs {
|
|
f_bsize: u64, // file system block size
|
|
f_frsize: u64, // fragment size
|
|
f_blocks: u64, // size of fs in f_frsize units
|
|
f_bfree: u64, // # free blocks
|
|
f_bavail: u64, // # free blocks for unprivileged users
|
|
f_files: u64, // # inodes
|
|
f_ffree: u64, // # free inodes
|
|
f_favail: u64, // # free inodes for unprivileged users
|
|
f_fsid: u64, // file system ID
|
|
f_flag: u64, // mount flags
|
|
f_namemax: u64, // maximum filename length
|
|
}
|
|
|
|
extern "C" {
|
|
fn statvfs(path: *const i8, buf: *mut statvfs) -> i32;
|
|
}
|
|
|
|
unsafe {
|
|
let mut buf: statvfs = mem::zeroed();
|
|
let path_cstr = format!("{}\0", temp_dir.display());
|
|
|
|
if statvfs(path_cstr.as_ptr() as *const i8, &mut buf) == 0 {
|
|
let available_bytes = buf.f_bavail * buf.f_frsize;
|
|
return available_bytes / (1024 * 1024); // Convert to MB
|
|
}
|
|
}
|
|
}
|
|
|
|
// Windows implementation
|
|
#[cfg(target_family = "windows")]
|
|
{
|
|
// For Windows, we'd need to use GetDiskFreeSpaceEx from winapi
|
|
// For now, try to estimate based on a test file write
|
|
}
|
|
|
|
// Fallback: Try to estimate available space by checking if we can create a test file
|
|
let test_file = temp_dir.join(".ocr_space_test");
|
|
let test_size = 100 * 1024 * 1024; // 100MB test
|
|
|
|
match fs::write(&test_file, vec![0u8; test_size]) {
|
|
Ok(_) => {
|
|
let _ = fs::remove_file(&test_file);
|
|
// If we can write 100MB, assume at least 1GB is available
|
|
1000
|
|
}
|
|
Err(_) => {
|
|
// If we can't write 100MB, report low space
|
|
50
|
|
}
|
|
}
|
|
}
|
|
|
|
pub fn validate_cpu_requirements(&self) -> Result<(), OcrError> {
|
|
let features = self.check_cpu_features();
|
|
|
|
// Tesseract 4.x+ requires at least SSE2
|
|
if !features.sse2 {
|
|
return Err(OcrError::MissingCpuInstruction {
|
|
instruction: "SSE2".to_string(),
|
|
});
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
pub fn estimate_memory_requirement(&self, image_width: u32, image_height: u32) -> u64 {
|
|
// Rough estimation: 4 bytes per pixel (RGBA) * 3 (for processing buffers)
|
|
// Plus 100MB base overhead for Tesseract
|
|
let pixels = (image_width as u64) * (image_height as u64);
|
|
let image_memory = (pixels * 4 * 3) / (1024 * 1024); // Convert to MB
|
|
image_memory + 100
|
|
}
|
|
|
|
pub fn validate_memory_for_image(&self, width: u32, height: u32) -> Result<(), OcrError> {
|
|
let required = self.estimate_memory_requirement(width, height);
|
|
let available = self.check_memory_available();
|
|
|
|
if required > available {
|
|
return Err(OcrError::InsufficientMemory { required, available });
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
pub fn get_full_diagnostics(&self) -> OcrDiagnostics {
|
|
OcrDiagnostics {
|
|
tesseract_version: self.check_tesseract_installation().ok(),
|
|
available_languages: self.get_available_languages().unwrap_or_else(|_| vec![]),
|
|
tessdata_path: None, // No longer managing tessdata paths
|
|
cpu_features: self.check_cpu_features(),
|
|
memory_available_mb: self.check_memory_available(),
|
|
temp_space_available_mb: self.check_temp_space(),
|
|
}
|
|
}
|
|
|
|
pub fn perform_full_health_check(&self) -> Result<OcrDiagnostics, Vec<OcrError>> {
|
|
let mut errors = Vec::new();
|
|
|
|
// Check Tesseract installation
|
|
if let Err(e) = self.check_tesseract_installation() {
|
|
errors.push(e);
|
|
}
|
|
|
|
// Check CPU requirements
|
|
if let Err(e) = self.validate_cpu_requirements() {
|
|
errors.push(e);
|
|
}
|
|
|
|
// Tessdata path no longer managed - Tesseract handles it automatically
|
|
|
|
// Check for at least English language data
|
|
if let Err(e) = self.check_language_data("eng") {
|
|
errors.push(e);
|
|
}
|
|
|
|
let diagnostics = self.get_full_diagnostics();
|
|
|
|
if errors.is_empty() {
|
|
Ok(diagnostics)
|
|
} else {
|
|
Err(errors)
|
|
}
|
|
}
|
|
} |