feat(ocr): fix ocr variables

This commit is contained in:
perf3ct 2025-06-13 15:24:25 +00:00
parent cd35f877b1
commit e6e2ba76f5
1 changed file with 2 additions and 40 deletions

View File

@@ -189,46 +189,8 @@ impl EnhancedOcrService {
// Note: set_engine_mode may not be available in the current tesseract crate version
// We'll configure this differently if needed
// Set DPI if specified and different from 0
if settings.ocr_dpi > 0 {
tesseract = tesseract.set_variable("user_defined_dpi", &settings.ocr_dpi.to_string())?;
}
// Configure character whitelist/blacklist
if let Some(ref whitelist) = settings.ocr_whitelist_chars {
if !whitelist.is_empty() {
tesseract = tesseract.set_variable("tessedit_char_whitelist", whitelist)?;
}
}
if let Some(ref blacklist) = settings.ocr_blacklist_chars {
if !blacklist.is_empty() {
tesseract = tesseract.set_variable("tessedit_char_blacklist", blacklist)?;
}
}
// Additional high-quality settings for challenging images
tesseract = tesseract.set_variable("preserve_interword_spaces", "1")?;
tesseract = tesseract.set_variable("tessedit_do_invert", "0")?;
tesseract = tesseract.set_variable("classify_enable_learning", "0")?;
tesseract = tesseract.set_variable("textord_really_old_xheight", "1")?;
tesseract = tesseract.set_variable("textord_min_xheight", "7")?;
// Enhanced settings for low-quality images
tesseract = tesseract.set_variable("tessedit_char_unblacklist_fraction", "0.0")?;
tesseract = tesseract.set_variable("edges_max_children_per_outline", "40")?;
tesseract = tesseract.set_variable("textord_noise_sizefraction", "10.0")?;
tesseract = tesseract.set_variable("textord_noise_translimit", "16.0")?;
tesseract = tesseract.set_variable("textord_noise_normratio", "2.0")?;
// Improve word breaking for dense text
tesseract = tesseract.set_variable("textord_tabfind_find_tables", "1")?;
tesseract = tesseract.set_variable("textord_use_cjk_fp_model", "0")?;
// Better handling of degraded images
tesseract = tesseract.set_variable("classify_adapt_feature_threshold", "230")?;
tesseract = tesseract.set_variable("classify_adapt_proto_threshold", "230")?;
tesseract = tesseract.set_variable("textord_heavy_nr", "1")?;
// Basic configuration - skip advanced settings that might cause issues
// Only set essential variables that are widely supported
Ok(tesseract)
}