feat(lang): update backend to support multiple languages at the same time during OCR

This commit is contained in:
perf3ct 2025-07-14 19:33:43 +00:00
parent dc55b2e50b
commit 849c9f91c7
10 changed files with 430 additions and 228 deletions

View File

@ -0,0 +1,56 @@
-- Migration: Add multi-language OCR support
-- This migration adds support for multiple OCR languages per user
-- Add new columns for multi-language support
-- (existing rows receive the column defaults, so right after this statement
-- every row holds preferred_languages = ["eng"] and primary_language = 'eng')
ALTER TABLE settings
ADD COLUMN preferred_languages JSONB DEFAULT '["eng"]'::jsonb,
ADD COLUMN primary_language VARCHAR(10) DEFAULT 'eng',
ADD COLUMN auto_detect_language_combination BOOLEAN DEFAULT false;
-- Migrate existing ocr_language data to new preferred_languages array
-- The WHERE clause selects rows that still carry the freshly-added default;
-- rows whose ocr_language is NULL keep 'eng' through the COALESCE.
UPDATE settings
SET preferred_languages = jsonb_build_array(COALESCE(ocr_language, 'eng')),
primary_language = COALESCE(ocr_language, 'eng')
WHERE preferred_languages = '["eng"]'::jsonb;
-- Create index for efficient querying of preferred languages
-- (GIN on the JSONB column for element/containment lookups, plain b-tree on
-- the scalar primary_language column)
CREATE INDEX IF NOT EXISTS idx_settings_preferred_languages ON settings USING gin(preferred_languages);
CREATE INDEX IF NOT EXISTS idx_settings_primary_language ON settings(primary_language);
-- Add constraint to ensure primary_language is always in preferred_languages
-- (the jsonb `?` operator tests whether the text exists as a top-level
-- element of the array)
ALTER TABLE settings
ADD CONSTRAINT check_primary_language_in_preferred
CHECK (preferred_languages ? primary_language);
-- Add constraint to limit number of preferred languages (max 4 for performance)
-- The same limit of 4 is enforced in the application by
-- OcrHealthChecker::validate_preferred_languages.
ALTER TABLE settings
ADD CONSTRAINT check_max_preferred_languages
CHECK (jsonb_array_length(preferred_languages) <= 4);
-- Add constraint to ensure valid language codes (3-letter ISO codes, optionally
-- followed by an underscore and a 2-letter upper-case suffix, e.g. "eng", "pt_BR").
--
-- PostgreSQL does not allow subqueries inside CHECK constraints ("cannot use
-- subquery in check constraint"), so the per-element validation of the JSONB
-- array lives in an IMMUTABLE helper function that the constraint calls.
--
-- Helper semantics:
--   * NULL input      -> NULL  (CHECK treats NULL as "not violated")
--   * non-array input -> false
--   * array input     -> true only if every element, rendered as JSON text,
--                        is a quoted string matching the language-code pattern
--                        (an empty array is accepted)
--
-- NOTE(review): the pattern rejects Tesseract codes such as "chi_sim" or
-- "chi_tra". If any existing row has such a value in ocr_language, the backfill
-- above copies it into the new columns and adding this constraint will fail.
-- Confirm which codes the application supports before shipping.
CREATE OR REPLACE FUNCTION settings_language_codes_valid(languages JSONB)
RETURNS BOOLEAN
LANGUAGE sql
IMMUTABLE
AS $$
    SELECT CASE
        WHEN languages IS NULL THEN NULL
        WHEN jsonb_typeof(languages) <> 'array' THEN false
        ELSE NOT EXISTS (
            SELECT 1
            FROM jsonb_array_elements(languages) AS elem(code)
            WHERE elem.code::text !~ '^"[a-z]{3}(_[A-Z]{2})?"$'
        )
    END;
$$;
ALTER TABLE settings
ADD CONSTRAINT check_valid_language_codes
CHECK (
    primary_language ~ '^[a-z]{3}(_[A-Z]{2})?$' AND
    settings_language_codes_valid(preferred_languages)
);
-- Backfill: create a settings row for every existing user that does not have
-- one yet, using the English defaults for the new language columns.
-- NOTE(review): only user_id and the three new columns are listed; all other
-- settings columns presumably rely on their table defaults. Confirm that none
-- of them is NOT NULL without a default.
INSERT INTO settings (user_id, preferred_languages, primary_language, auto_detect_language_combination)
SELECT
u.id,
'["eng"]'::jsonb,
'eng',
false
FROM users u
WHERE NOT EXISTS (
SELECT 1 FROM settings s WHERE s.user_id = u.id
);
-- Add comments for documentation (stored in the catalog alongside each column)
COMMENT ON COLUMN settings.preferred_languages IS 'Array of 3-letter ISO language codes for OCR processing, max 4 languages';
COMMENT ON COLUMN settings.primary_language IS 'Primary language code that should be listed first in OCR processing';
COMMENT ON COLUMN settings.auto_detect_language_combination IS 'Whether to automatically suggest language combinations based on document content';

View File

@ -1,14 +1,94 @@
use anyhow::Result;
use sqlx::Row;
use uuid::Uuid;
use serde_json::Value;
use super::Database;
/// Converts a JSON value read from a JSONB column into a list of strings.
///
/// * For a JSON array, every string element is copied into the result in
///   order; elements of any other JSON type are silently skipped, so the
///   result may be shorter than the input (or empty).
/// * For any non-array value, the single-entry fallback `["eng"]` is returned.
fn parse_jsonb_string_array(value: Value) -> Vec<String> {
    if let Value::Array(items) = value {
        items
            .iter()
            .filter_map(Value::as_str)
            .map(str::to_owned)
            .collect()
    } else {
        // Anything that is not an array falls back to English.
        vec![String::from("eng")]
    }
}
// Helper function to create Settings from database row
fn settings_from_row(row: &sqlx::postgres::PgRow) -> crate::models::Settings {
let preferred_languages_json: Value = row.get("preferred_languages");
let preferred_languages = parse_jsonb_string_array(preferred_languages_json);
crate::models::Settings {
id: row.get("id"),
user_id: row.get("user_id"),
ocr_language: row.get("ocr_language"),
preferred_languages,
primary_language: row.get("primary_language"),
auto_detect_language_combination: row.get("auto_detect_language_combination"),
concurrent_ocr_jobs: row.get("concurrent_ocr_jobs"),
ocr_timeout_seconds: row.get("ocr_timeout_seconds"),
max_file_size_mb: row.get("max_file_size_mb"),
allowed_file_types: row.get("allowed_file_types"),
auto_rotate_images: row.get("auto_rotate_images"),
enable_image_preprocessing: row.get("enable_image_preprocessing"),
search_results_per_page: row.get("search_results_per_page"),
search_snippet_length: row.get("search_snippet_length"),
fuzzy_search_threshold: row.get("fuzzy_search_threshold"),
retention_days: row.get("retention_days"),
enable_auto_cleanup: row.get("enable_auto_cleanup"),
enable_compression: row.get("enable_compression"),
memory_limit_mb: row.get("memory_limit_mb"),
cpu_priority: row.get("cpu_priority"),
enable_background_ocr: row.get("enable_background_ocr"),
ocr_page_segmentation_mode: row.get("ocr_page_segmentation_mode"),
ocr_engine_mode: row.get("ocr_engine_mode"),
ocr_min_confidence: row.get("ocr_min_confidence"),
ocr_dpi: row.get("ocr_dpi"),
ocr_enhance_contrast: row.get("ocr_enhance_contrast"),
ocr_remove_noise: row.get("ocr_remove_noise"),
ocr_detect_orientation: row.get("ocr_detect_orientation"),
ocr_whitelist_chars: row.get("ocr_whitelist_chars"),
ocr_blacklist_chars: row.get("ocr_blacklist_chars"),
ocr_brightness_boost: row.get("ocr_brightness_boost"),
ocr_contrast_multiplier: row.get("ocr_contrast_multiplier"),
ocr_noise_reduction_level: row.get("ocr_noise_reduction_level"),
ocr_sharpening_strength: row.get("ocr_sharpening_strength"),
ocr_morphological_operations: row.get("ocr_morphological_operations"),
ocr_adaptive_threshold_window_size: row.get("ocr_adaptive_threshold_window_size"),
ocr_histogram_equalization: row.get("ocr_histogram_equalization"),
ocr_upscale_factor: row.get("ocr_upscale_factor"),
ocr_max_image_width: row.get("ocr_max_image_width"),
ocr_max_image_height: row.get("ocr_max_image_height"),
save_processed_images: row.get("save_processed_images"),
ocr_quality_threshold_brightness: row.get("ocr_quality_threshold_brightness"),
ocr_quality_threshold_contrast: row.get("ocr_quality_threshold_contrast"),
ocr_quality_threshold_noise: row.get("ocr_quality_threshold_noise"),
ocr_quality_threshold_sharpness: row.get("ocr_quality_threshold_sharpness"),
ocr_skip_enhancement: row.get("ocr_skip_enhancement"),
webdav_enabled: row.get("webdav_enabled"),
webdav_server_url: row.get("webdav_server_url"),
webdav_username: row.get("webdav_username"),
webdav_password: row.get("webdav_password"),
webdav_watch_folders: row.get("webdav_watch_folders"),
webdav_file_extensions: row.get("webdav_file_extensions"),
webdav_auto_sync: row.get("webdav_auto_sync"),
webdav_sync_interval_minutes: row.get("webdav_sync_interval_minutes"),
created_at: row.get("created_at"),
updated_at: row.get("updated_at"),
}
}
impl Database {
pub async fn get_user_settings(&self, user_id: Uuid) -> Result<Option<crate::models::Settings>> {
self.with_retry(|| async {
let row = sqlx::query(
r#"SELECT id, user_id, ocr_language, concurrent_ocr_jobs, ocr_timeout_seconds,
r#"SELECT id, user_id, ocr_language,
COALESCE(preferred_languages, '["eng"]'::jsonb) as preferred_languages,
COALESCE(primary_language, 'eng') as primary_language,
COALESCE(auto_detect_language_combination, false) as auto_detect_language_combination,
concurrent_ocr_jobs, ocr_timeout_seconds,
max_file_size_mb, allowed_file_types, auto_rotate_images, enable_image_preprocessing,
search_results_per_page, search_snippet_length, fuzzy_search_threshold,
retention_days, enable_auto_cleanup, enable_compression, memory_limit_mb,
@ -31,61 +111,7 @@ impl Database {
.map_err(|e| anyhow::anyhow!("Database query failed: {}", e))?;
match row {
Some(row) => Ok(Some(crate::models::Settings {
id: row.get("id"),
user_id: row.get("user_id"),
ocr_language: row.get("ocr_language"),
concurrent_ocr_jobs: row.get("concurrent_ocr_jobs"),
ocr_timeout_seconds: row.get("ocr_timeout_seconds"),
max_file_size_mb: row.get("max_file_size_mb"),
allowed_file_types: row.get("allowed_file_types"),
auto_rotate_images: row.get("auto_rotate_images"),
enable_image_preprocessing: row.get("enable_image_preprocessing"),
search_results_per_page: row.get("search_results_per_page"),
search_snippet_length: row.get("search_snippet_length"),
fuzzy_search_threshold: row.get("fuzzy_search_threshold"),
retention_days: row.get("retention_days"),
enable_auto_cleanup: row.get("enable_auto_cleanup"),
enable_compression: row.get("enable_compression"),
memory_limit_mb: row.get("memory_limit_mb"),
cpu_priority: row.get("cpu_priority"),
enable_background_ocr: row.get("enable_background_ocr"),
ocr_page_segmentation_mode: row.get("ocr_page_segmentation_mode"),
ocr_engine_mode: row.get("ocr_engine_mode"),
ocr_min_confidence: row.get("ocr_min_confidence"),
ocr_dpi: row.get("ocr_dpi"),
ocr_enhance_contrast: row.get("ocr_enhance_contrast"),
ocr_remove_noise: row.get("ocr_remove_noise"),
ocr_detect_orientation: row.get("ocr_detect_orientation"),
ocr_whitelist_chars: row.get("ocr_whitelist_chars"),
ocr_blacklist_chars: row.get("ocr_blacklist_chars"),
ocr_brightness_boost: row.get("ocr_brightness_boost"),
ocr_contrast_multiplier: row.get("ocr_contrast_multiplier"),
ocr_noise_reduction_level: row.get("ocr_noise_reduction_level"),
ocr_sharpening_strength: row.get("ocr_sharpening_strength"),
ocr_morphological_operations: row.get("ocr_morphological_operations"),
ocr_adaptive_threshold_window_size: row.get("ocr_adaptive_threshold_window_size"),
ocr_histogram_equalization: row.get("ocr_histogram_equalization"),
ocr_upscale_factor: row.get("ocr_upscale_factor"),
ocr_max_image_width: row.get("ocr_max_image_width"),
ocr_max_image_height: row.get("ocr_max_image_height"),
save_processed_images: row.get("save_processed_images"),
ocr_quality_threshold_brightness: row.get("ocr_quality_threshold_brightness"),
ocr_quality_threshold_contrast: row.get("ocr_quality_threshold_contrast"),
ocr_quality_threshold_noise: row.get("ocr_quality_threshold_noise"),
ocr_quality_threshold_sharpness: row.get("ocr_quality_threshold_sharpness"),
ocr_skip_enhancement: row.get("ocr_skip_enhancement"),
webdav_enabled: row.get("webdav_enabled"),
webdav_server_url: row.get("webdav_server_url"),
webdav_username: row.get("webdav_username"),
webdav_password: row.get("webdav_password"),
webdav_watch_folders: row.get("webdav_watch_folders"),
webdav_file_extensions: row.get("webdav_file_extensions"),
webdav_auto_sync: row.get("webdav_auto_sync"),
webdav_sync_interval_minutes: row.get("webdav_sync_interval_minutes"),
created_at: row.get("created_at"),
updated_at: row.get("updated_at"),
})),
Some(row) => Ok(Some(settings_from_row(&row))),
None => Ok(None),
}
}).await
@ -93,7 +119,11 @@ impl Database {
pub async fn get_all_user_settings(&self) -> Result<Vec<crate::models::Settings>> {
let rows = sqlx::query(
r#"SELECT id, user_id, ocr_language, concurrent_ocr_jobs, ocr_timeout_seconds,
r#"SELECT id, user_id, ocr_language,
COALESCE(preferred_languages, '["eng"]'::jsonb) as preferred_languages,
COALESCE(primary_language, 'eng') as primary_language,
COALESCE(auto_detect_language_combination, false) as auto_detect_language_combination,
concurrent_ocr_jobs, ocr_timeout_seconds,
max_file_size_mb, allowed_file_types, auto_rotate_images, enable_image_preprocessing,
search_results_per_page, search_snippet_length, fuzzy_search_threshold,
retention_days, enable_auto_cleanup, enable_compression, memory_limit_mb,
@ -114,64 +144,9 @@ impl Database {
.fetch_all(&self.pool)
.await?;
let mut settings_list = Vec::new();
for row in rows {
settings_list.push(crate::models::Settings {
id: row.get("id"),
user_id: row.get("user_id"),
ocr_language: row.get("ocr_language"),
concurrent_ocr_jobs: row.get("concurrent_ocr_jobs"),
ocr_timeout_seconds: row.get("ocr_timeout_seconds"),
max_file_size_mb: row.get("max_file_size_mb"),
allowed_file_types: row.get("allowed_file_types"),
auto_rotate_images: row.get("auto_rotate_images"),
enable_image_preprocessing: row.get("enable_image_preprocessing"),
search_results_per_page: row.get("search_results_per_page"),
search_snippet_length: row.get("search_snippet_length"),
fuzzy_search_threshold: row.get("fuzzy_search_threshold"),
retention_days: row.get("retention_days"),
enable_auto_cleanup: row.get("enable_auto_cleanup"),
enable_compression: row.get("enable_compression"),
memory_limit_mb: row.get("memory_limit_mb"),
cpu_priority: row.get("cpu_priority"),
enable_background_ocr: row.get("enable_background_ocr"),
ocr_page_segmentation_mode: row.get("ocr_page_segmentation_mode"),
ocr_engine_mode: row.get("ocr_engine_mode"),
ocr_min_confidence: row.get("ocr_min_confidence"),
ocr_dpi: row.get("ocr_dpi"),
ocr_enhance_contrast: row.get("ocr_enhance_contrast"),
ocr_remove_noise: row.get("ocr_remove_noise"),
ocr_detect_orientation: row.get("ocr_detect_orientation"),
ocr_whitelist_chars: row.get("ocr_whitelist_chars"),
ocr_blacklist_chars: row.get("ocr_blacklist_chars"),
ocr_brightness_boost: row.get("ocr_brightness_boost"),
ocr_contrast_multiplier: row.get("ocr_contrast_multiplier"),
ocr_noise_reduction_level: row.get("ocr_noise_reduction_level"),
ocr_sharpening_strength: row.get("ocr_sharpening_strength"),
ocr_morphological_operations: row.get("ocr_morphological_operations"),
ocr_adaptive_threshold_window_size: row.get("ocr_adaptive_threshold_window_size"),
ocr_histogram_equalization: row.get("ocr_histogram_equalization"),
ocr_upscale_factor: row.get("ocr_upscale_factor"),
ocr_max_image_width: row.get("ocr_max_image_width"),
ocr_max_image_height: row.get("ocr_max_image_height"),
save_processed_images: row.get("save_processed_images"),
ocr_quality_threshold_brightness: row.get("ocr_quality_threshold_brightness"),
ocr_quality_threshold_contrast: row.get("ocr_quality_threshold_contrast"),
ocr_quality_threshold_noise: row.get("ocr_quality_threshold_noise"),
ocr_quality_threshold_sharpness: row.get("ocr_quality_threshold_sharpness"),
ocr_skip_enhancement: row.get("ocr_skip_enhancement"),
webdav_enabled: row.get("webdav_enabled"),
webdav_server_url: row.get("webdav_server_url"),
webdav_username: row.get("webdav_username"),
webdav_password: row.get("webdav_password"),
webdav_watch_folders: row.get("webdav_watch_folders"),
webdav_file_extensions: row.get("webdav_file_extensions"),
webdav_auto_sync: row.get("webdav_auto_sync"),
webdav_sync_interval_minutes: row.get("webdav_sync_interval_minutes"),
created_at: row.get("created_at"),
updated_at: row.get("updated_at"),
});
}
let settings_list = rows.into_iter()
.map(|row| settings_from_row(&row))
.collect();
Ok(settings_list)
}
@ -191,7 +166,7 @@ impl Database {
let row = sqlx::query(
r#"
INSERT INTO settings (
user_id, ocr_language, concurrent_ocr_jobs, ocr_timeout_seconds,
user_id, ocr_language, preferred_languages, primary_language, auto_detect_language_combination, concurrent_ocr_jobs, ocr_timeout_seconds,
max_file_size_mb, allowed_file_types, auto_rotate_images, enable_image_preprocessing,
search_results_per_page, search_snippet_length, fuzzy_search_threshold,
retention_days, enable_auto_cleanup, enable_compression, memory_limit_mb,
@ -206,59 +181,66 @@ impl Database {
webdav_enabled, webdav_server_url, webdav_username, webdav_password,
webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes
)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, $37, $38, $39, $40, $41, $42, $43, $44, $45, $46, $47, $48, $49, $50)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, $37, $38, $39, $40, $41, $42, $43, $44, $45, $46, $47, $48, $49, $50, $51, $52, $53)
ON CONFLICT (user_id) DO UPDATE SET
ocr_language = $2,
concurrent_ocr_jobs = $3,
ocr_timeout_seconds = $4,
max_file_size_mb = $5,
allowed_file_types = $6,
auto_rotate_images = $7,
enable_image_preprocessing = $8,
search_results_per_page = $9,
search_snippet_length = $10,
fuzzy_search_threshold = $11,
retention_days = $12,
enable_auto_cleanup = $13,
enable_compression = $14,
memory_limit_mb = $15,
cpu_priority = $16,
enable_background_ocr = $17,
ocr_page_segmentation_mode = $18,
ocr_engine_mode = $19,
ocr_min_confidence = $20,
ocr_dpi = $21,
ocr_enhance_contrast = $22,
ocr_remove_noise = $23,
ocr_detect_orientation = $24,
ocr_whitelist_chars = $25,
ocr_blacklist_chars = $26,
ocr_brightness_boost = $27,
ocr_contrast_multiplier = $28,
ocr_noise_reduction_level = $29,
ocr_sharpening_strength = $30,
ocr_morphological_operations = $31,
ocr_adaptive_threshold_window_size = $32,
ocr_histogram_equalization = $33,
ocr_upscale_factor = $34,
ocr_max_image_width = $35,
ocr_max_image_height = $36,
save_processed_images = $37,
ocr_quality_threshold_brightness = $38,
ocr_quality_threshold_contrast = $39,
ocr_quality_threshold_noise = $40,
ocr_quality_threshold_sharpness = $41,
ocr_skip_enhancement = $42,
webdav_enabled = $43,
webdav_server_url = $44,
webdav_username = $45,
webdav_password = $46,
webdav_watch_folders = $47,
webdav_file_extensions = $48,
webdav_auto_sync = $49,
webdav_sync_interval_minutes = $50,
preferred_languages = $3,
primary_language = $4,
auto_detect_language_combination = $5,
concurrent_ocr_jobs = $6,
ocr_timeout_seconds = $7,
max_file_size_mb = $8,
allowed_file_types = $9,
auto_rotate_images = $10,
enable_image_preprocessing = $11,
search_results_per_page = $12,
search_snippet_length = $13,
fuzzy_search_threshold = $14,
retention_days = $15,
enable_auto_cleanup = $16,
enable_compression = $17,
memory_limit_mb = $18,
cpu_priority = $19,
enable_background_ocr = $20,
ocr_page_segmentation_mode = $21,
ocr_engine_mode = $22,
ocr_min_confidence = $23,
ocr_dpi = $24,
ocr_enhance_contrast = $25,
ocr_remove_noise = $26,
ocr_detect_orientation = $27,
ocr_whitelist_chars = $28,
ocr_blacklist_chars = $29,
ocr_brightness_boost = $30,
ocr_contrast_multiplier = $31,
ocr_noise_reduction_level = $32,
ocr_sharpening_strength = $33,
ocr_morphological_operations = $34,
ocr_adaptive_threshold_window_size = $35,
ocr_histogram_equalization = $36,
ocr_upscale_factor = $37,
ocr_max_image_width = $38,
ocr_max_image_height = $39,
save_processed_images = $40,
ocr_quality_threshold_brightness = $41,
ocr_quality_threshold_contrast = $42,
ocr_quality_threshold_noise = $43,
ocr_quality_threshold_sharpness = $44,
ocr_skip_enhancement = $45,
webdav_enabled = $46,
webdav_server_url = $47,
webdav_username = $48,
webdav_password = $49,
webdav_watch_folders = $50,
webdav_file_extensions = $51,
webdav_auto_sync = $52,
webdav_sync_interval_minutes = $53,
updated_at = NOW()
RETURNING id, user_id, ocr_language, concurrent_ocr_jobs, ocr_timeout_seconds,
RETURNING id, user_id, ocr_language,
COALESCE(preferred_languages, '["eng"]'::jsonb) as preferred_languages,
COALESCE(primary_language, 'eng') as primary_language,
COALESCE(auto_detect_language_combination, false) as auto_detect_language_combination,
concurrent_ocr_jobs, ocr_timeout_seconds,
max_file_size_mb, allowed_file_types, auto_rotate_images, enable_image_preprocessing,
search_results_per_page, search_snippet_length, fuzzy_search_threshold,
retention_days, enable_auto_cleanup, enable_compression, memory_limit_mb,
@ -277,6 +259,9 @@ impl Database {
)
.bind(user_id)
.bind(settings.ocr_language.as_ref().unwrap_or(&current.ocr_language))
.bind(serde_json::to_value(settings.preferred_languages.as_ref().unwrap_or(&current.preferred_languages)).unwrap())
.bind(settings.primary_language.as_ref().unwrap_or(&current.primary_language))
.bind(settings.auto_detect_language_combination.unwrap_or(current.auto_detect_language_combination))
.bind(settings.concurrent_ocr_jobs.unwrap_or(current.concurrent_ocr_jobs))
.bind(settings.ocr_timeout_seconds.unwrap_or(current.ocr_timeout_seconds))
.bind(settings.max_file_size_mb.unwrap_or(current.max_file_size_mb))
@ -328,61 +313,7 @@ impl Database {
.fetch_one(&self.pool)
.await?;
Ok(crate::models::Settings {
id: row.get("id"),
user_id: row.get("user_id"),
ocr_language: row.get("ocr_language"),
concurrent_ocr_jobs: row.get("concurrent_ocr_jobs"),
ocr_timeout_seconds: row.get("ocr_timeout_seconds"),
max_file_size_mb: row.get("max_file_size_mb"),
allowed_file_types: row.get("allowed_file_types"),
auto_rotate_images: row.get("auto_rotate_images"),
enable_image_preprocessing: row.get("enable_image_preprocessing"),
search_results_per_page: row.get("search_results_per_page"),
search_snippet_length: row.get("search_snippet_length"),
fuzzy_search_threshold: row.get("fuzzy_search_threshold"),
retention_days: row.get("retention_days"),
enable_auto_cleanup: row.get("enable_auto_cleanup"),
enable_compression: row.get("enable_compression"),
memory_limit_mb: row.get("memory_limit_mb"),
cpu_priority: row.get("cpu_priority"),
enable_background_ocr: row.get("enable_background_ocr"),
ocr_page_segmentation_mode: row.get("ocr_page_segmentation_mode"),
ocr_engine_mode: row.get("ocr_engine_mode"),
ocr_min_confidence: row.get("ocr_min_confidence"),
ocr_dpi: row.get("ocr_dpi"),
ocr_enhance_contrast: row.get("ocr_enhance_contrast"),
ocr_remove_noise: row.get("ocr_remove_noise"),
ocr_detect_orientation: row.get("ocr_detect_orientation"),
ocr_whitelist_chars: row.get("ocr_whitelist_chars"),
ocr_blacklist_chars: row.get("ocr_blacklist_chars"),
ocr_brightness_boost: row.get("ocr_brightness_boost"),
ocr_contrast_multiplier: row.get("ocr_contrast_multiplier"),
ocr_noise_reduction_level: row.get("ocr_noise_reduction_level"),
ocr_sharpening_strength: row.get("ocr_sharpening_strength"),
ocr_morphological_operations: row.get("ocr_morphological_operations"),
ocr_adaptive_threshold_window_size: row.get("ocr_adaptive_threshold_window_size"),
ocr_histogram_equalization: row.get("ocr_histogram_equalization"),
ocr_upscale_factor: row.get("ocr_upscale_factor"),
ocr_max_image_width: row.get("ocr_max_image_width"),
ocr_max_image_height: row.get("ocr_max_image_height"),
save_processed_images: row.get("save_processed_images"),
ocr_quality_threshold_brightness: row.get("ocr_quality_threshold_brightness"),
ocr_quality_threshold_contrast: row.get("ocr_quality_threshold_contrast"),
ocr_quality_threshold_noise: row.get("ocr_quality_threshold_noise"),
ocr_quality_threshold_sharpness: row.get("ocr_quality_threshold_sharpness"),
ocr_skip_enhancement: row.get("ocr_skip_enhancement"),
webdav_enabled: row.get("webdav_enabled"),
webdav_server_url: row.get("webdav_server_url"),
webdav_username: row.get("webdav_username"),
webdav_password: row.get("webdav_password"),
webdav_watch_folders: row.get("webdav_watch_folders"),
webdav_file_extensions: row.get("webdav_file_extensions"),
webdav_auto_sync: row.get("webdav_auto_sync"),
webdav_sync_interval_minutes: row.get("webdav_sync_interval_minutes"),
created_at: row.get("created_at"),
updated_at: row.get("updated_at"),
})
Ok(settings_from_row(&row))
}
pub async fn update_user_ocr_language(&self, user_id: Uuid, language: &str) -> Result<()> {

View File

@ -9,6 +9,9 @@ pub struct Settings {
pub id: Uuid,
pub user_id: Uuid,
pub ocr_language: String,
pub preferred_languages: Vec<String>,
pub primary_language: String,
pub auto_detect_language_combination: bool,
pub concurrent_ocr_jobs: i32,
pub ocr_timeout_seconds: i32,
pub max_file_size_mb: i32,
@ -64,6 +67,9 @@ pub struct Settings {
#[derive(Debug, Serialize, Deserialize, ToSchema)]
pub struct SettingsResponse {
pub ocr_language: String,
pub preferred_languages: Vec<String>,
pub primary_language: String,
pub auto_detect_language_combination: bool,
pub concurrent_ocr_jobs: i32,
pub ocr_timeout_seconds: i32,
pub max_file_size_mb: i32,
@ -117,6 +123,9 @@ pub struct SettingsResponse {
#[derive(Debug, Serialize, Deserialize, ToSchema)]
pub struct UpdateSettings {
pub ocr_language: Option<String>,
pub preferred_languages: Option<Vec<String>>,
pub primary_language: Option<String>,
pub auto_detect_language_combination: Option<bool>,
pub concurrent_ocr_jobs: Option<i32>,
pub ocr_timeout_seconds: Option<i32>,
pub max_file_size_mb: Option<i32>,
@ -171,6 +180,9 @@ impl From<Settings> for SettingsResponse {
fn from(settings: Settings) -> Self {
Self {
ocr_language: settings.ocr_language,
preferred_languages: settings.preferred_languages,
primary_language: settings.primary_language,
auto_detect_language_combination: settings.auto_detect_language_combination,
concurrent_ocr_jobs: settings.concurrent_ocr_jobs,
ocr_timeout_seconds: settings.ocr_timeout_seconds,
max_file_size_mb: settings.max_file_size_mb,
@ -223,12 +235,79 @@ impl From<Settings> for SettingsResponse {
}
}
impl UpdateSettings {
    /// Builds an `UpdateSettings` that carries only language preferences.
    ///
    /// The three language fields are set to `Some(..)` from the arguments.
    /// Every other field, including `auto_detect_language_combination`, is
    /// `None`.
    ///
    /// * `preferred_languages` - full list of language codes to store.
    /// * `primary_language` - the code stored as the primary language.
    /// * `ocr_language` - value for the legacy single-language field.
    pub fn language_update(
        preferred_languages: Vec<String>,
        primary_language: String,
        ocr_language: String,
    ) -> Self {
        Self {
            // Language preferences: the only fields this update touches.
            ocr_language: Some(ocr_language),
            preferred_languages: Some(preferred_languages),
            primary_language: Some(primary_language),
            auto_detect_language_combination: None,

            // General processing and search: untouched.
            concurrent_ocr_jobs: None,
            ocr_timeout_seconds: None,
            max_file_size_mb: None,
            allowed_file_types: None,
            auto_rotate_images: None,
            enable_image_preprocessing: None,
            search_results_per_page: None,
            search_snippet_length: None,
            fuzzy_search_threshold: None,
            retention_days: None,
            enable_auto_cleanup: None,
            enable_compression: None,
            memory_limit_mb: None,
            cpu_priority: None,
            enable_background_ocr: None,

            // OCR engine and image enhancement: untouched.
            ocr_page_segmentation_mode: None,
            ocr_engine_mode: None,
            ocr_min_confidence: None,
            ocr_dpi: None,
            ocr_enhance_contrast: None,
            ocr_remove_noise: None,
            ocr_detect_orientation: None,
            ocr_whitelist_chars: None,
            ocr_blacklist_chars: None,
            ocr_brightness_boost: None,
            ocr_contrast_multiplier: None,
            ocr_noise_reduction_level: None,
            ocr_sharpening_strength: None,
            ocr_morphological_operations: None,
            ocr_adaptive_threshold_window_size: None,
            ocr_histogram_equalization: None,
            ocr_upscale_factor: None,
            ocr_max_image_width: None,
            ocr_max_image_height: None,
            save_processed_images: None,
            ocr_quality_threshold_brightness: None,
            ocr_quality_threshold_contrast: None,
            ocr_quality_threshold_noise: None,
            ocr_quality_threshold_sharpness: None,
            ocr_skip_enhancement: None,

            // WebDAV: untouched.
            webdav_enabled: None,
            webdav_server_url: None,
            webdav_username: None,
            webdav_password: None,
            webdav_watch_folders: None,
            webdav_file_extensions: None,
            webdav_auto_sync: None,
            webdav_sync_interval_minutes: None,
        }
    }
}
impl Default for Settings {
fn default() -> Self {
Self {
id: Uuid::new_v4(),
user_id: Uuid::nil(),
ocr_language: "eng".to_string(),
preferred_languages: vec!["eng".to_string()],
primary_language: "eng".to_string(),
auto_detect_language_combination: false,
concurrent_ocr_jobs: 4,
ocr_timeout_seconds: 300,
max_file_size_mb: 50,

View File

@ -249,10 +249,32 @@ impl EnhancedOcrService {
needs_enhancement
}
/// Produces the language argument handed to Tesseract (e.g. `"eng+spa"`).
///
/// * No preferred languages: falls back to the legacy `ocr_language` field.
/// * Exactly one preferred language: that language is returned as-is
///   (`primary_language` is not consulted).
/// * Several preferred languages: `primary_language` comes first, followed by
///   the remaining preferred languages in their stored order, joined with `+`.
///   Entries equal to the primary language are dropped from the tail.
fn build_language_combination(&self, settings: &Settings) -> String {
    match settings.preferred_languages.as_slice() {
        // Backward compatibility with the single-language setting.
        [] => settings.ocr_language.clone(),
        [only] => only.clone(),
        preferred => {
            let primary = &settings.primary_language;
            std::iter::once(primary)
                .chain(preferred.iter().filter(|lang| *lang != primary))
                .map(String::as_str)
                .collect::<Vec<_>>()
                // Tesseract's multi-language format uses '+' as separator.
                .join("+")
        }
    }
}
/// Configure Tesseract with optimal settings
#[cfg(feature = "ocr")]
fn configure_tesseract(&self, image_path: &str, settings: &Settings) -> Result<Tesseract> {
let mut tesseract = Tesseract::new(None, Some(&settings.ocr_language))?;
let language_combination = self.build_language_combination(settings);
let mut tesseract = Tesseract::new(None, Some(&language_combination))?;
// Set the image
tesseract = tesseract.set_image(image_path)?;

View File

@ -123,6 +123,55 @@ impl OcrHealthChecker {
}
Ok(())
}
/// Checks a Tesseract language combination string such as `"eng+spa"`.
///
/// The string is split on `+`; each part is trimmed and passed to
/// `validate_language`. After all parts have been validated, the number of
/// parts is checked against the limit of 4.
///
/// # Errors
///
/// Returns `OcrError::LanguageDataNotFound` when the input is empty or when
/// more than 4 languages are given, and propagates the first error reported
/// by `validate_language` for an individual part.
pub fn validate_language_combination(&self, lang_combination: &str) -> Result<(), OcrError> {
    if lang_combination.is_empty() {
        return Err(OcrError::LanguageDataNotFound {
            lang: "empty".to_string(),
        });
    }

    // Validate every '+'-separated part, counting them along the way.
    let mut language_count: usize = 0;
    for part in lang_combination.split('+') {
        self.validate_language(part.trim())?;
        language_count += 1;
    }

    // The size limit (max 4, for performance) is enforced only after each
    // part has passed validation.
    if language_count > 4 {
        return Err(OcrError::LanguageDataNotFound {
            lang: format!("Too many languages in combination: {}. Maximum is 4.", language_count),
        });
    }

    Ok(())
}
/// Checks a list of preferred OCR languages.
///
/// The list must contain between 1 and 4 entries. The size is checked first;
/// only then is each entry passed to `validate_language`, in order.
///
/// # Errors
///
/// Returns `OcrError::LanguageDataNotFound` for an empty list or a list with
/// more than 4 entries, and propagates the first error reported by
/// `validate_language` for an individual entry.
pub fn validate_preferred_languages(&self, languages: &[String]) -> Result<(), OcrError> {
    match languages.len() {
        0 => Err(OcrError::LanguageDataNotFound {
            lang: "No languages provided".to_string(),
        }),
        // Cap the number of languages for performance.
        count if count > 4 => Err(OcrError::LanguageDataNotFound {
            lang: format!("Too many preferred languages: {}. Maximum is 4.", count),
        }),
        _ => {
            for code in languages {
                self.validate_language(code)?;
            }
            Ok(())
        }
    }
}
pub fn get_language_display_name(&self, lang_code: &str) -> String {
match lang_code {

View File

@ -36,7 +36,7 @@ impl OcrService {
// Perform health checks first
self.health_checker.check_tesseract_installation()
.map_err(|e: OcrError| anyhow!(e))?;
self.health_checker.check_language_data(lang)
self.health_checker.validate_language_combination(lang)
.map_err(|e: OcrError| anyhow!(e))?;
let mut tesseract = Tesseract::new(None, Some(lang))

View File

@ -40,6 +40,7 @@ pub async fn upload_document(
) -> Result<Json<DocumentUploadResponse>, StatusCode> {
let mut uploaded_file = None;
let mut ocr_language: Option<String> = None;
let mut ocr_languages: Vec<String> = Vec::new();
// First pass: collect all multipart fields
while let Some(field) = multipart.next_field().await.map_err(|e| {
@ -65,6 +66,22 @@ pub async fn upload_document(
}
}
}
} else if name == "ocr_languages" || name.starts_with("ocr_languages[") {
let language = field.text().await.map_err(|_| StatusCode::BAD_REQUEST)?;
if !language.trim().is_empty() {
// Validate that the language is available
let health_checker = crate::ocr::health::OcrHealthChecker::new();
match health_checker.validate_language(language.trim()) {
Ok(_) => {
ocr_languages.push(language.trim().to_string());
info!("OCR language added to list: {}", language);
}
Err(e) => {
warn!("Invalid OCR language specified '{}': {}", language, e);
return Err(StatusCode::BAD_REQUEST);
}
}
}
} else if name == "file" {
let filename = field.file_name()
.ok_or_else(|| {
@ -143,8 +160,30 @@ pub async fn upload_document(
Ok(IngestionResult::Created(document)) => {
info!("Document uploaded successfully: {}", document.id);
// If a language was specified, update the user's OCR language setting
if let Some(lang) = &ocr_language {
// Update user's OCR language settings based on what was provided
if !ocr_languages.is_empty() {
// Multi-language support: update preferred languages
let health_checker = crate::ocr::health::OcrHealthChecker::new();
match health_checker.validate_preferred_languages(&ocr_languages) {
Ok(_) => {
let settings_update = crate::models::UpdateSettings::language_update(
ocr_languages.clone(),
ocr_languages[0].clone(), // First language as primary
ocr_languages[0].clone(), // Backward compatibility
);
if let Err(e) = state.db.create_or_update_settings(auth_user.user.id, &settings_update).await {
warn!("Failed to update user preferred languages to {:?}: {}", ocr_languages, e);
} else {
info!("Updated user {} preferred languages to: {:?}", auth_user.user.id, ocr_languages);
}
}
Err(e) => {
warn!("Invalid language combination provided: {}", e);
}
}
} else if let Some(lang) = &ocr_language {
// Single language (backward compatibility)
if let Err(e) = state.db.update_user_ocr_language(auth_user.user.id, lang).await {
warn!("Failed to update user OCR language to {}: {}", lang, e);
} else {

View File

@ -107,9 +107,31 @@ pub async fn retry_ocr(
}
}
// If a language was specified, validate and update the user's OCR language setting
if let Some(lang) = &request.language {
// Validate that the language is available
// Update user's OCR language settings based on what was provided
if let Some(languages) = &request.languages {
// Multi-language support: validate and update preferred languages
let health_checker = crate::ocr::health::OcrHealthChecker::new();
match health_checker.validate_preferred_languages(languages) {
Ok(_) => {
let settings_update = crate::models::UpdateSettings::language_update(
languages.clone(),
languages[0].clone(), // First language as primary
languages[0].clone(), // Backward compatibility
);
if let Err(e) = state.db.create_or_update_settings(auth_user.user.id, &settings_update).await {
warn!("Failed to update user preferred languages to {:?}: {}", languages, e);
} else {
info!("Updated user {} preferred languages to: {:?} for retry", auth_user.user.id, languages);
}
}
Err(e) => {
warn!("Invalid language combination provided: {}", e);
return Err(StatusCode::BAD_REQUEST);
}
}
} else if let Some(lang) = &request.language {
// Single language (backward compatibility)
let health_checker = crate::ocr::health::OcrHealthChecker::new();
match health_checker.validate_language(lang) {
Ok(_) => {

View File

@ -30,6 +30,7 @@ pub struct DeleteLowConfidenceRequest {
/// Request body for retrying OCR on a document.
///
/// Both fields are optional. When `languages` is present, the `retry_ocr`
/// handler uses it and ignores `language`; `language` is only consulted when
/// `languages` is absent.
#[derive(Deserialize, ToSchema)]
pub struct RetryOcrRequest {
/// Single OCR language code (kept for backward compatibility).
pub language: Option<String>,
/// OCR language codes for multi-language processing. The handler validates
/// the list with `validate_preferred_languages` (1 to 4 entries) and stores
/// the first entry as the primary language.
pub languages: Option<Vec<String>>,
}
#[derive(Deserialize, Serialize, ToSchema)]

View File

@ -49,6 +49,9 @@ async fn get_settings(
let default = crate::models::Settings::default();
SettingsResponse {
ocr_language: default.ocr_language,
preferred_languages: default.preferred_languages,
primary_language: default.primary_language,
auto_detect_language_combination: default.auto_detect_language_combination,
concurrent_ocr_jobs: default.concurrent_ocr_jobs,
ocr_timeout_seconds: default.ocr_timeout_seconds,
max_file_size_mb: default.max_file_size_mb,