diff --git a/migrations/20250714000000_add_multi_language_ocr_support.sql b/migrations/20250714000000_add_multi_language_ocr_support.sql new file mode 100644 index 0000000..5e8ca49 --- /dev/null +++ b/migrations/20250714000000_add_multi_language_ocr_support.sql @@ -0,0 +1,56 @@ +-- Migration: Add multi-language OCR support +-- This migration adds support for multiple OCR languages per user + +-- Add new columns for multi-language support +ALTER TABLE settings +ADD COLUMN preferred_languages JSONB DEFAULT '["eng"]'::jsonb, +ADD COLUMN primary_language VARCHAR(10) DEFAULT 'eng', +ADD COLUMN auto_detect_language_combination BOOLEAN DEFAULT false; + +-- Migrate existing ocr_language data to new preferred_languages array +UPDATE settings +SET preferred_languages = jsonb_build_array(COALESCE(ocr_language, 'eng')), + primary_language = COALESCE(ocr_language, 'eng') +WHERE preferred_languages = '["eng"]'::jsonb; + +-- Create index for efficient querying of preferred languages +CREATE INDEX IF NOT EXISTS idx_settings_preferred_languages ON settings USING gin(preferred_languages); +CREATE INDEX IF NOT EXISTS idx_settings_primary_language ON settings(primary_language); + +-- Add constraint to ensure primary_language is always in preferred_languages +ALTER TABLE settings +ADD CONSTRAINT check_primary_language_in_preferred +CHECK (preferred_languages ? 
primary_language); + +-- Add constraint to limit number of preferred languages (max 4 for performance) +ALTER TABLE settings +ADD CONSTRAINT check_max_preferred_languages +CHECK (jsonb_array_length(preferred_languages) <= 4); + +-- Add constraint to ensure valid language codes (3-letter ISO codes) +ALTER TABLE settings +ADD CONSTRAINT check_valid_language_codes +CHECK ( + primary_language ~ '^[a-z]{3}(_[A-Z]{2})?$' AND + ( + SELECT bool_and(value::text ~ '^"[a-z]{3}(_[A-Z]{2})?"$') + FROM jsonb_array_elements(preferred_languages) + ) +); + +-- Update existing users who don't have settings yet +INSERT INTO settings (user_id, preferred_languages, primary_language, auto_detect_language_combination) +SELECT + u.id, + '["eng"]'::jsonb, + 'eng', + false +FROM users u +WHERE NOT EXISTS ( + SELECT 1 FROM settings s WHERE s.user_id = u.id +); + +-- Add comments for documentation +COMMENT ON COLUMN settings.preferred_languages IS 'Array of 3-letter ISO language codes for OCR processing, max 4 languages'; +COMMENT ON COLUMN settings.primary_language IS 'Primary language code that should be listed first in OCR processing'; +COMMENT ON COLUMN settings.auto_detect_language_combination IS 'Whether to automatically suggest language combinations based on document content'; \ No newline at end of file diff --git a/src/db/settings.rs b/src/db/settings.rs index e9167f9..abf69df 100644 --- a/src/db/settings.rs +++ b/src/db/settings.rs @@ -1,14 +1,94 @@ use anyhow::Result; use sqlx::Row; use uuid::Uuid; +use serde_json::Value; use super::Database; +// Helper function to parse JSONB array to Vec +fn parse_jsonb_string_array(value: Value) -> Vec { + match value { + Value::Array(arr) => arr.into_iter() + .filter_map(|v| v.as_str().map(|s| s.to_string())) + .collect(), + _ => vec!["eng".to_string()], // fallback to English + } +} + +// Helper function to create Settings from database row +fn settings_from_row(row: &sqlx::postgres::PgRow) -> crate::models::Settings { + let 
preferred_languages_json: Value = row.get("preferred_languages"); + let preferred_languages = parse_jsonb_string_array(preferred_languages_json); + + crate::models::Settings { + id: row.get("id"), + user_id: row.get("user_id"), + ocr_language: row.get("ocr_language"), + preferred_languages, + primary_language: row.get("primary_language"), + auto_detect_language_combination: row.get("auto_detect_language_combination"), + concurrent_ocr_jobs: row.get("concurrent_ocr_jobs"), + ocr_timeout_seconds: row.get("ocr_timeout_seconds"), + max_file_size_mb: row.get("max_file_size_mb"), + allowed_file_types: row.get("allowed_file_types"), + auto_rotate_images: row.get("auto_rotate_images"), + enable_image_preprocessing: row.get("enable_image_preprocessing"), + search_results_per_page: row.get("search_results_per_page"), + search_snippet_length: row.get("search_snippet_length"), + fuzzy_search_threshold: row.get("fuzzy_search_threshold"), + retention_days: row.get("retention_days"), + enable_auto_cleanup: row.get("enable_auto_cleanup"), + enable_compression: row.get("enable_compression"), + memory_limit_mb: row.get("memory_limit_mb"), + cpu_priority: row.get("cpu_priority"), + enable_background_ocr: row.get("enable_background_ocr"), + ocr_page_segmentation_mode: row.get("ocr_page_segmentation_mode"), + ocr_engine_mode: row.get("ocr_engine_mode"), + ocr_min_confidence: row.get("ocr_min_confidence"), + ocr_dpi: row.get("ocr_dpi"), + ocr_enhance_contrast: row.get("ocr_enhance_contrast"), + ocr_remove_noise: row.get("ocr_remove_noise"), + ocr_detect_orientation: row.get("ocr_detect_orientation"), + ocr_whitelist_chars: row.get("ocr_whitelist_chars"), + ocr_blacklist_chars: row.get("ocr_blacklist_chars"), + ocr_brightness_boost: row.get("ocr_brightness_boost"), + ocr_contrast_multiplier: row.get("ocr_contrast_multiplier"), + ocr_noise_reduction_level: row.get("ocr_noise_reduction_level"), + ocr_sharpening_strength: row.get("ocr_sharpening_strength"), + ocr_morphological_operations: 
row.get("ocr_morphological_operations"), + ocr_adaptive_threshold_window_size: row.get("ocr_adaptive_threshold_window_size"), + ocr_histogram_equalization: row.get("ocr_histogram_equalization"), + ocr_upscale_factor: row.get("ocr_upscale_factor"), + ocr_max_image_width: row.get("ocr_max_image_width"), + ocr_max_image_height: row.get("ocr_max_image_height"), + save_processed_images: row.get("save_processed_images"), + ocr_quality_threshold_brightness: row.get("ocr_quality_threshold_brightness"), + ocr_quality_threshold_contrast: row.get("ocr_quality_threshold_contrast"), + ocr_quality_threshold_noise: row.get("ocr_quality_threshold_noise"), + ocr_quality_threshold_sharpness: row.get("ocr_quality_threshold_sharpness"), + ocr_skip_enhancement: row.get("ocr_skip_enhancement"), + webdav_enabled: row.get("webdav_enabled"), + webdav_server_url: row.get("webdav_server_url"), + webdav_username: row.get("webdav_username"), + webdav_password: row.get("webdav_password"), + webdav_watch_folders: row.get("webdav_watch_folders"), + webdav_file_extensions: row.get("webdav_file_extensions"), + webdav_auto_sync: row.get("webdav_auto_sync"), + webdav_sync_interval_minutes: row.get("webdav_sync_interval_minutes"), + created_at: row.get("created_at"), + updated_at: row.get("updated_at"), + } +} + impl Database { pub async fn get_user_settings(&self, user_id: Uuid) -> Result> { self.with_retry(|| async { let row = sqlx::query( - r#"SELECT id, user_id, ocr_language, concurrent_ocr_jobs, ocr_timeout_seconds, + r#"SELECT id, user_id, ocr_language, + COALESCE(preferred_languages, '["eng"]'::jsonb) as preferred_languages, + COALESCE(primary_language, 'eng') as primary_language, + COALESCE(auto_detect_language_combination, false) as auto_detect_language_combination, + concurrent_ocr_jobs, ocr_timeout_seconds, max_file_size_mb, allowed_file_types, auto_rotate_images, enable_image_preprocessing, search_results_per_page, search_snippet_length, fuzzy_search_threshold, retention_days, 
enable_auto_cleanup, enable_compression, memory_limit_mb, @@ -31,61 +111,7 @@ impl Database { .map_err(|e| anyhow::anyhow!("Database query failed: {}", e))?; match row { - Some(row) => Ok(Some(crate::models::Settings { - id: row.get("id"), - user_id: row.get("user_id"), - ocr_language: row.get("ocr_language"), - concurrent_ocr_jobs: row.get("concurrent_ocr_jobs"), - ocr_timeout_seconds: row.get("ocr_timeout_seconds"), - max_file_size_mb: row.get("max_file_size_mb"), - allowed_file_types: row.get("allowed_file_types"), - auto_rotate_images: row.get("auto_rotate_images"), - enable_image_preprocessing: row.get("enable_image_preprocessing"), - search_results_per_page: row.get("search_results_per_page"), - search_snippet_length: row.get("search_snippet_length"), - fuzzy_search_threshold: row.get("fuzzy_search_threshold"), - retention_days: row.get("retention_days"), - enable_auto_cleanup: row.get("enable_auto_cleanup"), - enable_compression: row.get("enable_compression"), - memory_limit_mb: row.get("memory_limit_mb"), - cpu_priority: row.get("cpu_priority"), - enable_background_ocr: row.get("enable_background_ocr"), - ocr_page_segmentation_mode: row.get("ocr_page_segmentation_mode"), - ocr_engine_mode: row.get("ocr_engine_mode"), - ocr_min_confidence: row.get("ocr_min_confidence"), - ocr_dpi: row.get("ocr_dpi"), - ocr_enhance_contrast: row.get("ocr_enhance_contrast"), - ocr_remove_noise: row.get("ocr_remove_noise"), - ocr_detect_orientation: row.get("ocr_detect_orientation"), - ocr_whitelist_chars: row.get("ocr_whitelist_chars"), - ocr_blacklist_chars: row.get("ocr_blacklist_chars"), - ocr_brightness_boost: row.get("ocr_brightness_boost"), - ocr_contrast_multiplier: row.get("ocr_contrast_multiplier"), - ocr_noise_reduction_level: row.get("ocr_noise_reduction_level"), - ocr_sharpening_strength: row.get("ocr_sharpening_strength"), - ocr_morphological_operations: row.get("ocr_morphological_operations"), - ocr_adaptive_threshold_window_size: 
row.get("ocr_adaptive_threshold_window_size"), - ocr_histogram_equalization: row.get("ocr_histogram_equalization"), - ocr_upscale_factor: row.get("ocr_upscale_factor"), - ocr_max_image_width: row.get("ocr_max_image_width"), - ocr_max_image_height: row.get("ocr_max_image_height"), - save_processed_images: row.get("save_processed_images"), - ocr_quality_threshold_brightness: row.get("ocr_quality_threshold_brightness"), - ocr_quality_threshold_contrast: row.get("ocr_quality_threshold_contrast"), - ocr_quality_threshold_noise: row.get("ocr_quality_threshold_noise"), - ocr_quality_threshold_sharpness: row.get("ocr_quality_threshold_sharpness"), - ocr_skip_enhancement: row.get("ocr_skip_enhancement"), - webdav_enabled: row.get("webdav_enabled"), - webdav_server_url: row.get("webdav_server_url"), - webdav_username: row.get("webdav_username"), - webdav_password: row.get("webdav_password"), - webdav_watch_folders: row.get("webdav_watch_folders"), - webdav_file_extensions: row.get("webdav_file_extensions"), - webdav_auto_sync: row.get("webdav_auto_sync"), - webdav_sync_interval_minutes: row.get("webdav_sync_interval_minutes"), - created_at: row.get("created_at"), - updated_at: row.get("updated_at"), - })), + Some(row) => Ok(Some(settings_from_row(&row))), None => Ok(None), } }).await @@ -93,7 +119,11 @@ impl Database { pub async fn get_all_user_settings(&self) -> Result> { let rows = sqlx::query( - r#"SELECT id, user_id, ocr_language, concurrent_ocr_jobs, ocr_timeout_seconds, + r#"SELECT id, user_id, ocr_language, + COALESCE(preferred_languages, '["eng"]'::jsonb) as preferred_languages, + COALESCE(primary_language, 'eng') as primary_language, + COALESCE(auto_detect_language_combination, false) as auto_detect_language_combination, + concurrent_ocr_jobs, ocr_timeout_seconds, max_file_size_mb, allowed_file_types, auto_rotate_images, enable_image_preprocessing, search_results_per_page, search_snippet_length, fuzzy_search_threshold, retention_days, enable_auto_cleanup, 
enable_compression, memory_limit_mb, @@ -114,64 +144,9 @@ impl Database { .fetch_all(&self.pool) .await?; - let mut settings_list = Vec::new(); - for row in rows { - settings_list.push(crate::models::Settings { - id: row.get("id"), - user_id: row.get("user_id"), - ocr_language: row.get("ocr_language"), - concurrent_ocr_jobs: row.get("concurrent_ocr_jobs"), - ocr_timeout_seconds: row.get("ocr_timeout_seconds"), - max_file_size_mb: row.get("max_file_size_mb"), - allowed_file_types: row.get("allowed_file_types"), - auto_rotate_images: row.get("auto_rotate_images"), - enable_image_preprocessing: row.get("enable_image_preprocessing"), - search_results_per_page: row.get("search_results_per_page"), - search_snippet_length: row.get("search_snippet_length"), - fuzzy_search_threshold: row.get("fuzzy_search_threshold"), - retention_days: row.get("retention_days"), - enable_auto_cleanup: row.get("enable_auto_cleanup"), - enable_compression: row.get("enable_compression"), - memory_limit_mb: row.get("memory_limit_mb"), - cpu_priority: row.get("cpu_priority"), - enable_background_ocr: row.get("enable_background_ocr"), - ocr_page_segmentation_mode: row.get("ocr_page_segmentation_mode"), - ocr_engine_mode: row.get("ocr_engine_mode"), - ocr_min_confidence: row.get("ocr_min_confidence"), - ocr_dpi: row.get("ocr_dpi"), - ocr_enhance_contrast: row.get("ocr_enhance_contrast"), - ocr_remove_noise: row.get("ocr_remove_noise"), - ocr_detect_orientation: row.get("ocr_detect_orientation"), - ocr_whitelist_chars: row.get("ocr_whitelist_chars"), - ocr_blacklist_chars: row.get("ocr_blacklist_chars"), - ocr_brightness_boost: row.get("ocr_brightness_boost"), - ocr_contrast_multiplier: row.get("ocr_contrast_multiplier"), - ocr_noise_reduction_level: row.get("ocr_noise_reduction_level"), - ocr_sharpening_strength: row.get("ocr_sharpening_strength"), - ocr_morphological_operations: row.get("ocr_morphological_operations"), - ocr_adaptive_threshold_window_size: 
row.get("ocr_adaptive_threshold_window_size"), - ocr_histogram_equalization: row.get("ocr_histogram_equalization"), - ocr_upscale_factor: row.get("ocr_upscale_factor"), - ocr_max_image_width: row.get("ocr_max_image_width"), - ocr_max_image_height: row.get("ocr_max_image_height"), - save_processed_images: row.get("save_processed_images"), - ocr_quality_threshold_brightness: row.get("ocr_quality_threshold_brightness"), - ocr_quality_threshold_contrast: row.get("ocr_quality_threshold_contrast"), - ocr_quality_threshold_noise: row.get("ocr_quality_threshold_noise"), - ocr_quality_threshold_sharpness: row.get("ocr_quality_threshold_sharpness"), - ocr_skip_enhancement: row.get("ocr_skip_enhancement"), - webdav_enabled: row.get("webdav_enabled"), - webdav_server_url: row.get("webdav_server_url"), - webdav_username: row.get("webdav_username"), - webdav_password: row.get("webdav_password"), - webdav_watch_folders: row.get("webdav_watch_folders"), - webdav_file_extensions: row.get("webdav_file_extensions"), - webdav_auto_sync: row.get("webdav_auto_sync"), - webdav_sync_interval_minutes: row.get("webdav_sync_interval_minutes"), - created_at: row.get("created_at"), - updated_at: row.get("updated_at"), - }); - } + let settings_list = rows.into_iter() + .map(|row| settings_from_row(&row)) + .collect(); Ok(settings_list) } @@ -191,7 +166,7 @@ impl Database { let row = sqlx::query( r#" INSERT INTO settings ( - user_id, ocr_language, concurrent_ocr_jobs, ocr_timeout_seconds, + user_id, ocr_language, preferred_languages, primary_language, auto_detect_language_combination, concurrent_ocr_jobs, ocr_timeout_seconds, max_file_size_mb, allowed_file_types, auto_rotate_images, enable_image_preprocessing, search_results_per_page, search_snippet_length, fuzzy_search_threshold, retention_days, enable_auto_cleanup, enable_compression, memory_limit_mb, @@ -206,59 +181,66 @@ impl Database { webdav_enabled, webdav_server_url, webdav_username, webdav_password, webdav_watch_folders, 
webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes ) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, $37, $38, $39, $40, $41, $42, $43, $44, $45, $46, $47, $48, $49, $50) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, $37, $38, $39, $40, $41, $42, $43, $44, $45, $46, $47, $48, $49, $50, $51, $52, $53) ON CONFLICT (user_id) DO UPDATE SET ocr_language = $2, - concurrent_ocr_jobs = $3, - ocr_timeout_seconds = $4, - max_file_size_mb = $5, - allowed_file_types = $6, - auto_rotate_images = $7, - enable_image_preprocessing = $8, - search_results_per_page = $9, - search_snippet_length = $10, - fuzzy_search_threshold = $11, - retention_days = $12, - enable_auto_cleanup = $13, - enable_compression = $14, - memory_limit_mb = $15, - cpu_priority = $16, - enable_background_ocr = $17, - ocr_page_segmentation_mode = $18, - ocr_engine_mode = $19, - ocr_min_confidence = $20, - ocr_dpi = $21, - ocr_enhance_contrast = $22, - ocr_remove_noise = $23, - ocr_detect_orientation = $24, - ocr_whitelist_chars = $25, - ocr_blacklist_chars = $26, - ocr_brightness_boost = $27, - ocr_contrast_multiplier = $28, - ocr_noise_reduction_level = $29, - ocr_sharpening_strength = $30, - ocr_morphological_operations = $31, - ocr_adaptive_threshold_window_size = $32, - ocr_histogram_equalization = $33, - ocr_upscale_factor = $34, - ocr_max_image_width = $35, - ocr_max_image_height = $36, - save_processed_images = $37, - ocr_quality_threshold_brightness = $38, - ocr_quality_threshold_contrast = $39, - ocr_quality_threshold_noise = $40, - ocr_quality_threshold_sharpness = $41, - ocr_skip_enhancement = $42, - webdav_enabled = $43, - webdav_server_url = $44, - webdav_username = $45, - webdav_password = $46, - 
webdav_watch_folders = $47, - webdav_file_extensions = $48, - webdav_auto_sync = $49, - webdav_sync_interval_minutes = $50, + preferred_languages = $3, + primary_language = $4, + auto_detect_language_combination = $5, + concurrent_ocr_jobs = $6, + ocr_timeout_seconds = $7, + max_file_size_mb = $8, + allowed_file_types = $9, + auto_rotate_images = $10, + enable_image_preprocessing = $11, + search_results_per_page = $12, + search_snippet_length = $13, + fuzzy_search_threshold = $14, + retention_days = $15, + enable_auto_cleanup = $16, + enable_compression = $17, + memory_limit_mb = $18, + cpu_priority = $19, + enable_background_ocr = $20, + ocr_page_segmentation_mode = $21, + ocr_engine_mode = $22, + ocr_min_confidence = $23, + ocr_dpi = $24, + ocr_enhance_contrast = $25, + ocr_remove_noise = $26, + ocr_detect_orientation = $27, + ocr_whitelist_chars = $28, + ocr_blacklist_chars = $29, + ocr_brightness_boost = $30, + ocr_contrast_multiplier = $31, + ocr_noise_reduction_level = $32, + ocr_sharpening_strength = $33, + ocr_morphological_operations = $34, + ocr_adaptive_threshold_window_size = $35, + ocr_histogram_equalization = $36, + ocr_upscale_factor = $37, + ocr_max_image_width = $38, + ocr_max_image_height = $39, + save_processed_images = $40, + ocr_quality_threshold_brightness = $41, + ocr_quality_threshold_contrast = $42, + ocr_quality_threshold_noise = $43, + ocr_quality_threshold_sharpness = $44, + ocr_skip_enhancement = $45, + webdav_enabled = $46, + webdav_server_url = $47, + webdav_username = $48, + webdav_password = $49, + webdav_watch_folders = $50, + webdav_file_extensions = $51, + webdav_auto_sync = $52, + webdav_sync_interval_minutes = $53, updated_at = NOW() - RETURNING id, user_id, ocr_language, concurrent_ocr_jobs, ocr_timeout_seconds, + RETURNING id, user_id, ocr_language, + COALESCE(preferred_languages, '["eng"]'::jsonb) as preferred_languages, + COALESCE(primary_language, 'eng') as primary_language, + COALESCE(auto_detect_language_combination, 
false) as auto_detect_language_combination, + concurrent_ocr_jobs, ocr_timeout_seconds, max_file_size_mb, allowed_file_types, auto_rotate_images, enable_image_preprocessing, search_results_per_page, search_snippet_length, fuzzy_search_threshold, retention_days, enable_auto_cleanup, enable_compression, memory_limit_mb, @@ -277,6 +259,9 @@ impl Database { ) .bind(user_id) .bind(settings.ocr_language.as_ref().unwrap_or(&current.ocr_language)) + .bind(serde_json::to_value(settings.preferred_languages.as_ref().unwrap_or(&current.preferred_languages)).unwrap()) + .bind(settings.primary_language.as_ref().unwrap_or(&current.primary_language)) + .bind(settings.auto_detect_language_combination.unwrap_or(current.auto_detect_language_combination)) .bind(settings.concurrent_ocr_jobs.unwrap_or(current.concurrent_ocr_jobs)) .bind(settings.ocr_timeout_seconds.unwrap_or(current.ocr_timeout_seconds)) .bind(settings.max_file_size_mb.unwrap_or(current.max_file_size_mb)) @@ -328,61 +313,7 @@ impl Database { .fetch_one(&self.pool) .await?; - Ok(crate::models::Settings { - id: row.get("id"), - user_id: row.get("user_id"), - ocr_language: row.get("ocr_language"), - concurrent_ocr_jobs: row.get("concurrent_ocr_jobs"), - ocr_timeout_seconds: row.get("ocr_timeout_seconds"), - max_file_size_mb: row.get("max_file_size_mb"), - allowed_file_types: row.get("allowed_file_types"), - auto_rotate_images: row.get("auto_rotate_images"), - enable_image_preprocessing: row.get("enable_image_preprocessing"), - search_results_per_page: row.get("search_results_per_page"), - search_snippet_length: row.get("search_snippet_length"), - fuzzy_search_threshold: row.get("fuzzy_search_threshold"), - retention_days: row.get("retention_days"), - enable_auto_cleanup: row.get("enable_auto_cleanup"), - enable_compression: row.get("enable_compression"), - memory_limit_mb: row.get("memory_limit_mb"), - cpu_priority: row.get("cpu_priority"), - enable_background_ocr: row.get("enable_background_ocr"), - ocr_page_segmentation_mode: 
row.get("ocr_page_segmentation_mode"), - ocr_engine_mode: row.get("ocr_engine_mode"), - ocr_min_confidence: row.get("ocr_min_confidence"), - ocr_dpi: row.get("ocr_dpi"), - ocr_enhance_contrast: row.get("ocr_enhance_contrast"), - ocr_remove_noise: row.get("ocr_remove_noise"), - ocr_detect_orientation: row.get("ocr_detect_orientation"), - ocr_whitelist_chars: row.get("ocr_whitelist_chars"), - ocr_blacklist_chars: row.get("ocr_blacklist_chars"), - ocr_brightness_boost: row.get("ocr_brightness_boost"), - ocr_contrast_multiplier: row.get("ocr_contrast_multiplier"), - ocr_noise_reduction_level: row.get("ocr_noise_reduction_level"), - ocr_sharpening_strength: row.get("ocr_sharpening_strength"), - ocr_morphological_operations: row.get("ocr_morphological_operations"), - ocr_adaptive_threshold_window_size: row.get("ocr_adaptive_threshold_window_size"), - ocr_histogram_equalization: row.get("ocr_histogram_equalization"), - ocr_upscale_factor: row.get("ocr_upscale_factor"), - ocr_max_image_width: row.get("ocr_max_image_width"), - ocr_max_image_height: row.get("ocr_max_image_height"), - save_processed_images: row.get("save_processed_images"), - ocr_quality_threshold_brightness: row.get("ocr_quality_threshold_brightness"), - ocr_quality_threshold_contrast: row.get("ocr_quality_threshold_contrast"), - ocr_quality_threshold_noise: row.get("ocr_quality_threshold_noise"), - ocr_quality_threshold_sharpness: row.get("ocr_quality_threshold_sharpness"), - ocr_skip_enhancement: row.get("ocr_skip_enhancement"), - webdav_enabled: row.get("webdav_enabled"), - webdav_server_url: row.get("webdav_server_url"), - webdav_username: row.get("webdav_username"), - webdav_password: row.get("webdav_password"), - webdav_watch_folders: row.get("webdav_watch_folders"), - webdav_file_extensions: row.get("webdav_file_extensions"), - webdav_auto_sync: row.get("webdav_auto_sync"), - webdav_sync_interval_minutes: row.get("webdav_sync_interval_minutes"), - created_at: row.get("created_at"), - updated_at: 
row.get("updated_at"), - }) + Ok(settings_from_row(&row)) } pub async fn update_user_ocr_language(&self, user_id: Uuid, language: &str) -> Result<()> { diff --git a/src/models/settings.rs b/src/models/settings.rs index 446c17d..886f676 100644 --- a/src/models/settings.rs +++ b/src/models/settings.rs @@ -9,6 +9,9 @@ pub struct Settings { pub id: Uuid, pub user_id: Uuid, pub ocr_language: String, + pub preferred_languages: Vec<String>, + pub primary_language: String, + pub auto_detect_language_combination: bool, pub concurrent_ocr_jobs: i32, pub ocr_timeout_seconds: i32, pub max_file_size_mb: i32, @@ -64,6 +67,9 @@ pub struct Settings { #[derive(Debug, Serialize, Deserialize, ToSchema)] pub struct SettingsResponse { pub ocr_language: String, + pub preferred_languages: Vec<String>, + pub primary_language: String, + pub auto_detect_language_combination: bool, pub concurrent_ocr_jobs: i32, pub ocr_timeout_seconds: i32, pub max_file_size_mb: i32, @@ -117,6 +123,9 @@ pub struct SettingsResponse { #[derive(Debug, Serialize, Deserialize, ToSchema)] pub struct UpdateSettings { pub ocr_language: Option<String>, + pub preferred_languages: Option<Vec<String>>, + pub primary_language: Option<String>, + pub auto_detect_language_combination: Option<bool>, pub concurrent_ocr_jobs: Option<i32>, pub ocr_timeout_seconds: Option<i32>, pub max_file_size_mb: Option<i32>, @@ -171,6 +180,9 @@ impl From<Settings> for SettingsResponse { fn from(settings: Settings) -> Self { Self { ocr_language: settings.ocr_language, + preferred_languages: settings.preferred_languages, + primary_language: settings.primary_language, + auto_detect_language_combination: settings.auto_detect_language_combination, concurrent_ocr_jobs: settings.concurrent_ocr_jobs, ocr_timeout_seconds: settings.ocr_timeout_seconds, max_file_size_mb: settings.max_file_size_mb, @@ -223,12 +235,79 @@ impl From<Settings> for SettingsResponse { } } +impl UpdateSettings { + /// Create an UpdateSettings that only updates language preferences + pub fn language_update( + preferred_languages: Vec<String>, + primary_language: 
String, + ocr_language: String, + ) -> Self { + Self { + preferred_languages: Some(preferred_languages), + primary_language: Some(primary_language), + ocr_language: Some(ocr_language), + auto_detect_language_combination: None, + concurrent_ocr_jobs: None, + ocr_timeout_seconds: None, + max_file_size_mb: None, + allowed_file_types: None, + auto_rotate_images: None, + enable_image_preprocessing: None, + search_results_per_page: None, + search_snippet_length: None, + fuzzy_search_threshold: None, + retention_days: None, + enable_auto_cleanup: None, + enable_compression: None, + memory_limit_mb: None, + cpu_priority: None, + enable_background_ocr: None, + ocr_page_segmentation_mode: None, + ocr_engine_mode: None, + ocr_min_confidence: None, + ocr_dpi: None, + ocr_enhance_contrast: None, + ocr_remove_noise: None, + ocr_detect_orientation: None, + ocr_whitelist_chars: None, + ocr_blacklist_chars: None, + ocr_brightness_boost: None, + ocr_contrast_multiplier: None, + ocr_noise_reduction_level: None, + ocr_sharpening_strength: None, + ocr_morphological_operations: None, + ocr_adaptive_threshold_window_size: None, + ocr_histogram_equalization: None, + ocr_upscale_factor: None, + ocr_max_image_width: None, + ocr_max_image_height: None, + save_processed_images: None, + ocr_quality_threshold_brightness: None, + ocr_quality_threshold_contrast: None, + ocr_quality_threshold_noise: None, + ocr_quality_threshold_sharpness: None, + ocr_skip_enhancement: None, + webdav_enabled: None, + webdav_server_url: None, + webdav_username: None, + webdav_password: None, + webdav_watch_folders: None, + webdav_file_extensions: None, + webdav_auto_sync: None, + webdav_sync_interval_minutes: None, + } + } +} + impl Default for Settings { fn default() -> Self { Self { id: Uuid::new_v4(), user_id: Uuid::nil(), ocr_language: "eng".to_string(), + preferred_languages: vec!["eng".to_string()], + primary_language: "eng".to_string(), + auto_detect_language_combination: false, concurrent_ocr_jobs: 4, 
ocr_timeout_seconds: 300, max_file_size_mb: 50, diff --git a/src/ocr/enhanced.rs b/src/ocr/enhanced.rs index f333a66..2842841 100644 --- a/src/ocr/enhanced.rs +++ b/src/ocr/enhanced.rs @@ -249,10 +249,32 @@ impl EnhancedOcrService { needs_enhancement } + /// Build language combination string for Tesseract (e.g., "eng+spa") + fn build_language_combination(&self, settings: &Settings) -> String { + if settings.preferred_languages.len() > 1 { + // Use preferred_languages with primary_language first + let mut languages = settings.preferred_languages.clone(); + + // Ensure primary language is first + languages.retain(|lang| lang != &settings.primary_language); + languages.insert(0, settings.primary_language.clone()); + + // Join with + for Tesseract multi-language format + languages.join("+") + } else if !settings.preferred_languages.is_empty() { + // Single language from preferred_languages + settings.preferred_languages[0].clone() + } else { + // Fallback to ocr_language field for backward compatibility + settings.ocr_language.clone() + } + } + /// Configure Tesseract with optimal settings #[cfg(feature = "ocr")] fn configure_tesseract(&self, image_path: &str, settings: &Settings) -> Result { - let mut tesseract = Tesseract::new(None, Some(&settings.ocr_language))?; + let language_combination = self.build_language_combination(settings); + let mut tesseract = Tesseract::new(None, Some(&language_combination))?; // Set the image tesseract = tesseract.set_image(image_path)?; diff --git a/src/ocr/health.rs b/src/ocr/health.rs index 0011055..e5fa1ba 100644 --- a/src/ocr/health.rs +++ b/src/ocr/health.rs @@ -123,6 +123,55 @@ impl OcrHealthChecker { } Ok(()) } + + /// Validate a language combination (e.g., "eng+spa") + pub fn validate_language_combination(&self, lang_combination: &str) -> Result<(), OcrError> { + if lang_combination.is_empty() { + return Err(OcrError::LanguageDataNotFound { + lang: "empty".to_string(), + }); + } + + // Split by '+' to handle multi-language 
combinations + let languages: Vec<&str> = lang_combination.split('+').collect(); + + // Validate each language in the combination + for lang in &languages { + self.validate_language(lang.trim())?; + } + + // Limit number of languages for performance (max 4) + if languages.len() > 4 { + return Err(OcrError::LanguageDataNotFound { + lang: format!("Too many languages in combination: {}. Maximum is 4.", languages.len()), + }); + } + + Ok(()) + } + + /// Validate a list of preferred languages + pub fn validate_preferred_languages(&self, languages: &[String]) -> Result<(), OcrError> { + if languages.is_empty() { + return Err(OcrError::LanguageDataNotFound { + lang: "No languages provided".to_string(), + }); + } + + // Limit number of languages for performance + if languages.len() > 4 { + return Err(OcrError::LanguageDataNotFound { + lang: format!("Too many preferred languages: {}. Maximum is 4.", languages.len()), + }); + } + + // Validate each language + for lang in languages { + self.validate_language(lang)?; + } + + Ok(()) + } pub fn get_language_display_name(&self, lang_code: &str) -> String { match lang_code { diff --git a/src/ocr/mod.rs b/src/ocr/mod.rs index 87b5080..d955979 100644 --- a/src/ocr/mod.rs +++ b/src/ocr/mod.rs @@ -36,7 +36,7 @@ impl OcrService { // Perform health checks first self.health_checker.check_tesseract_installation() .map_err(|e: OcrError| anyhow!(e))?; - self.health_checker.check_language_data(lang) + self.health_checker.validate_language_combination(lang) .map_err(|e: OcrError| anyhow!(e))?; let mut tesseract = Tesseract::new(None, Some(lang)) diff --git a/src/routes/documents/crud.rs b/src/routes/documents/crud.rs index dd324bc..dbf7824 100644 --- a/src/routes/documents/crud.rs +++ b/src/routes/documents/crud.rs @@ -40,6 +40,7 @@ pub async fn upload_document( ) -> Result, StatusCode> { let mut uploaded_file = None; let mut ocr_language: Option = None; + let mut ocr_languages: Vec = Vec::new(); // First pass: collect all multipart fields 
while let Some(field) = multipart.next_field().await.map_err(|e| { @@ -65,6 +66,22 @@ pub async fn upload_document( } } } + } else if name == "ocr_languages" || name.starts_with("ocr_languages[") { + let language = field.text().await.map_err(|_| StatusCode::BAD_REQUEST)?; + if !language.trim().is_empty() { + // Validate that the language is available + let health_checker = crate::ocr::health::OcrHealthChecker::new(); + match health_checker.validate_language(language.trim()) { + Ok(_) => { + ocr_languages.push(language.trim().to_string()); + info!("OCR language added to list: {}", language); + } + Err(e) => { + warn!("Invalid OCR language specified '{}': {}", language, e); + return Err(StatusCode::BAD_REQUEST); + } + } + } } else if name == "file" { let filename = field.file_name() .ok_or_else(|| { @@ -143,8 +160,30 @@ pub async fn upload_document( Ok(IngestionResult::Created(document)) => { info!("Document uploaded successfully: {}", document.id); - // If a language was specified, update the user's OCR language setting - if let Some(lang) = &ocr_language { + // Update user's OCR language settings based on what was provided + if !ocr_languages.is_empty() { + // Multi-language support: update preferred languages + let health_checker = crate::ocr::health::OcrHealthChecker::new(); + match health_checker.validate_preferred_languages(&ocr_languages) { + Ok(_) => { + let settings_update = crate::models::UpdateSettings::language_update( + ocr_languages.clone(), + ocr_languages[0].clone(), // First language as primary + ocr_languages[0].clone(), // Backward compatibility + ); + + if let Err(e) = state.db.create_or_update_settings(auth_user.user.id, &settings_update).await { + warn!("Failed to update user preferred languages to {:?}: {}", ocr_languages, e); + } else { + info!("Updated user {} preferred languages to: {:?}", auth_user.user.id, ocr_languages); + } + } + Err(e) => { + warn!("Invalid language combination provided: {}", e); + } + } + } else if let Some(lang) = 
&ocr_language { + // Single language (backward compatibility) if let Err(e) = state.db.update_user_ocr_language(auth_user.user.id, lang).await { warn!("Failed to update user OCR language to {}: {}", lang, e); } else { diff --git a/src/routes/documents/ocr.rs b/src/routes/documents/ocr.rs index c9a0ce9..b2a5b8c 100644 --- a/src/routes/documents/ocr.rs +++ b/src/routes/documents/ocr.rs @@ -107,9 +107,31 @@ pub async fn retry_ocr( } } - // If a language was specified, validate and update the user's OCR language setting - if let Some(lang) = &request.language { - // Validate that the language is available + // Update user's OCR language settings based on what was provided + if let Some(languages) = &request.languages { + // Multi-language support: validate and update preferred languages + let health_checker = crate::ocr::health::OcrHealthChecker::new(); + match health_checker.validate_preferred_languages(languages) { + Ok(_) => { + let settings_update = crate::models::UpdateSettings::language_update( + languages.clone(), + languages[0].clone(), // First language as primary + languages[0].clone(), // Backward compatibility + ); + + if let Err(e) = state.db.create_or_update_settings(auth_user.user.id, &settings_update).await { + warn!("Failed to update user preferred languages to {:?}: {}", languages, e); + } else { + info!("Updated user {} preferred languages to: {:?} for retry", auth_user.user.id, languages); + } + } + Err(e) => { + warn!("Invalid language combination provided: {}", e); + return Err(StatusCode::BAD_REQUEST); + } + } + } else if let Some(lang) = &request.language { + // Single language (backward compatibility) let health_checker = crate::ocr::health::OcrHealthChecker::new(); match health_checker.validate_language(lang) { Ok(_) => { diff --git a/src/routes/documents/types.rs b/src/routes/documents/types.rs index 8f64848..a63b151 100644 --- a/src/routes/documents/types.rs +++ b/src/routes/documents/types.rs @@ -30,6 +30,7 @@ pub struct 
DeleteLowConfidenceRequest { #[derive(Deserialize, ToSchema)] pub struct RetryOcrRequest { pub language: Option<String>, + pub languages: Option<Vec<String>>, } #[derive(Deserialize, Serialize, ToSchema)] diff --git a/src/routes/settings.rs b/src/routes/settings.rs index 8e6d42d..f11a006 100644 --- a/src/routes/settings.rs +++ b/src/routes/settings.rs @@ -49,6 +49,9 @@ async fn get_settings( let default = crate::models::Settings::default(); SettingsResponse { ocr_language: default.ocr_language, + preferred_languages: default.preferred_languages, + primary_language: default.primary_language, + auto_detect_language_combination: default.auto_detect_language_combination, concurrent_ocr_jobs: default.concurrent_ocr_jobs, ocr_timeout_seconds: default.ocr_timeout_seconds, max_file_size_mb: default.max_file_size_mb,