feat(lang): update backend to support multiple languages at the same time during OCR
This commit is contained in:
parent
dc55b2e50b
commit
849c9f91c7
|
|
@ -0,0 +1,56 @@
|
|||
-- Migration: Add multi-language OCR support
-- This migration adds support for multiple OCR languages per user:
--   * preferred_languages: JSONB array of language codes (at most 4)
--   * primary_language: the code that is listed first during OCR
--   * auto_detect_language_combination: opt-in flag for language suggestions
-- Requires PostgreSQL 12+ (jsonb_path_exists is used by the validation constraint).

-- Add new columns for multi-language support
ALTER TABLE settings
ADD COLUMN preferred_languages JSONB DEFAULT '["eng"]'::jsonb,
ADD COLUMN primary_language VARCHAR(10) DEFAULT 'eng',
ADD COLUMN auto_detect_language_combination BOOLEAN DEFAULT false;

-- Migrate existing ocr_language data to the new columns.
-- Every row still carries the column default at this point, so the WHERE
-- clause only guards against re-processing rows that were already migrated.
UPDATE settings
SET preferred_languages = jsonb_build_array(COALESCE(ocr_language, 'eng')),
    primary_language = COALESCE(ocr_language, 'eng')
WHERE preferred_languages = '["eng"]'::jsonb;

-- Create indexes for efficient querying of language preferences
CREATE INDEX IF NOT EXISTS idx_settings_preferred_languages ON settings USING gin(preferred_languages);
CREATE INDEX IF NOT EXISTS idx_settings_primary_language ON settings(primary_language);

-- Add constraint to ensure primary_language is always in preferred_languages
ALTER TABLE settings
ADD CONSTRAINT check_primary_language_in_preferred
CHECK (preferred_languages ? primary_language);

-- Add constraint to limit number of preferred languages (max 4 for performance)
ALTER TABLE settings
ADD CONSTRAINT check_max_preferred_languages
CHECK (jsonb_array_length(preferred_languages) <= 4);

-- Add constraint to ensure well-formed language codes.
-- PostgreSQL does not allow subqueries inside CHECK constraints, so the array
-- elements are validated with a SQL/JSON path expression instead of
-- SELECT ... FROM jsonb_array_elements(...).
-- The pattern accepts a 3-letter code followed by optional underscore-separated
-- alphabetic suffixes, so codes such as "chi_sim" copied over from the legacy
-- ocr_language column do not abort the migration. It is a superset of the
-- earlier "xxx" / "xxx_XX" format.
ALTER TABLE settings
ADD CONSTRAINT check_valid_language_codes
CHECK (
    primary_language ~ '^[a-z]{3}(_[A-Za-z]+)*$' AND
    jsonb_typeof(preferred_languages) = 'array' AND
    NOT jsonb_path_exists(
        preferred_languages,
        '$[*] ? (@.type() != "string" || !(@ like_regex "^[a-z]{3}(_[A-Za-z]+)*$"))'
    )
);

-- Create default settings rows for existing users who don't have settings yet
INSERT INTO settings (user_id, preferred_languages, primary_language, auto_detect_language_combination)
SELECT
    u.id,
    '["eng"]'::jsonb,
    'eng',
    false
FROM users u
WHERE NOT EXISTS (
    SELECT 1 FROM settings s WHERE s.user_id = u.id
);

-- Add comments for documentation
COMMENT ON COLUMN settings.preferred_languages IS 'Array of OCR language codes (e.g. eng, chi_sim) for OCR processing, max 4 languages';
COMMENT ON COLUMN settings.primary_language IS 'Primary language code that should be listed first in OCR processing';
COMMENT ON COLUMN settings.auto_detect_language_combination IS 'Whether to automatically suggest language combinations based on document content';
|
||||
|
|
@ -1,14 +1,94 @@
|
|||
use anyhow::Result;
|
||||
use sqlx::Row;
|
||||
use uuid::Uuid;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::Database;
|
||||
|
||||
// Helper function to parse JSONB array to Vec<String>
|
||||
fn parse_jsonb_string_array(value: Value) -> Vec<String> {
|
||||
match value {
|
||||
Value::Array(arr) => arr.into_iter()
|
||||
.filter_map(|v| v.as_str().map(|s| s.to_string()))
|
||||
.collect(),
|
||||
_ => vec!["eng".to_string()], // fallback to English
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to create Settings from database row
|
||||
fn settings_from_row(row: &sqlx::postgres::PgRow) -> crate::models::Settings {
|
||||
let preferred_languages_json: Value = row.get("preferred_languages");
|
||||
let preferred_languages = parse_jsonb_string_array(preferred_languages_json);
|
||||
|
||||
crate::models::Settings {
|
||||
id: row.get("id"),
|
||||
user_id: row.get("user_id"),
|
||||
ocr_language: row.get("ocr_language"),
|
||||
preferred_languages,
|
||||
primary_language: row.get("primary_language"),
|
||||
auto_detect_language_combination: row.get("auto_detect_language_combination"),
|
||||
concurrent_ocr_jobs: row.get("concurrent_ocr_jobs"),
|
||||
ocr_timeout_seconds: row.get("ocr_timeout_seconds"),
|
||||
max_file_size_mb: row.get("max_file_size_mb"),
|
||||
allowed_file_types: row.get("allowed_file_types"),
|
||||
auto_rotate_images: row.get("auto_rotate_images"),
|
||||
enable_image_preprocessing: row.get("enable_image_preprocessing"),
|
||||
search_results_per_page: row.get("search_results_per_page"),
|
||||
search_snippet_length: row.get("search_snippet_length"),
|
||||
fuzzy_search_threshold: row.get("fuzzy_search_threshold"),
|
||||
retention_days: row.get("retention_days"),
|
||||
enable_auto_cleanup: row.get("enable_auto_cleanup"),
|
||||
enable_compression: row.get("enable_compression"),
|
||||
memory_limit_mb: row.get("memory_limit_mb"),
|
||||
cpu_priority: row.get("cpu_priority"),
|
||||
enable_background_ocr: row.get("enable_background_ocr"),
|
||||
ocr_page_segmentation_mode: row.get("ocr_page_segmentation_mode"),
|
||||
ocr_engine_mode: row.get("ocr_engine_mode"),
|
||||
ocr_min_confidence: row.get("ocr_min_confidence"),
|
||||
ocr_dpi: row.get("ocr_dpi"),
|
||||
ocr_enhance_contrast: row.get("ocr_enhance_contrast"),
|
||||
ocr_remove_noise: row.get("ocr_remove_noise"),
|
||||
ocr_detect_orientation: row.get("ocr_detect_orientation"),
|
||||
ocr_whitelist_chars: row.get("ocr_whitelist_chars"),
|
||||
ocr_blacklist_chars: row.get("ocr_blacklist_chars"),
|
||||
ocr_brightness_boost: row.get("ocr_brightness_boost"),
|
||||
ocr_contrast_multiplier: row.get("ocr_contrast_multiplier"),
|
||||
ocr_noise_reduction_level: row.get("ocr_noise_reduction_level"),
|
||||
ocr_sharpening_strength: row.get("ocr_sharpening_strength"),
|
||||
ocr_morphological_operations: row.get("ocr_morphological_operations"),
|
||||
ocr_adaptive_threshold_window_size: row.get("ocr_adaptive_threshold_window_size"),
|
||||
ocr_histogram_equalization: row.get("ocr_histogram_equalization"),
|
||||
ocr_upscale_factor: row.get("ocr_upscale_factor"),
|
||||
ocr_max_image_width: row.get("ocr_max_image_width"),
|
||||
ocr_max_image_height: row.get("ocr_max_image_height"),
|
||||
save_processed_images: row.get("save_processed_images"),
|
||||
ocr_quality_threshold_brightness: row.get("ocr_quality_threshold_brightness"),
|
||||
ocr_quality_threshold_contrast: row.get("ocr_quality_threshold_contrast"),
|
||||
ocr_quality_threshold_noise: row.get("ocr_quality_threshold_noise"),
|
||||
ocr_quality_threshold_sharpness: row.get("ocr_quality_threshold_sharpness"),
|
||||
ocr_skip_enhancement: row.get("ocr_skip_enhancement"),
|
||||
webdav_enabled: row.get("webdav_enabled"),
|
||||
webdav_server_url: row.get("webdav_server_url"),
|
||||
webdav_username: row.get("webdav_username"),
|
||||
webdav_password: row.get("webdav_password"),
|
||||
webdav_watch_folders: row.get("webdav_watch_folders"),
|
||||
webdav_file_extensions: row.get("webdav_file_extensions"),
|
||||
webdav_auto_sync: row.get("webdav_auto_sync"),
|
||||
webdav_sync_interval_minutes: row.get("webdav_sync_interval_minutes"),
|
||||
created_at: row.get("created_at"),
|
||||
updated_at: row.get("updated_at"),
|
||||
}
|
||||
}
|
||||
|
||||
impl Database {
|
||||
pub async fn get_user_settings(&self, user_id: Uuid) -> Result<Option<crate::models::Settings>> {
|
||||
self.with_retry(|| async {
|
||||
let row = sqlx::query(
|
||||
r#"SELECT id, user_id, ocr_language, concurrent_ocr_jobs, ocr_timeout_seconds,
|
||||
r#"SELECT id, user_id, ocr_language,
|
||||
COALESCE(preferred_languages, '["eng"]'::jsonb) as preferred_languages,
|
||||
COALESCE(primary_language, 'eng') as primary_language,
|
||||
COALESCE(auto_detect_language_combination, false) as auto_detect_language_combination,
|
||||
concurrent_ocr_jobs, ocr_timeout_seconds,
|
||||
max_file_size_mb, allowed_file_types, auto_rotate_images, enable_image_preprocessing,
|
||||
search_results_per_page, search_snippet_length, fuzzy_search_threshold,
|
||||
retention_days, enable_auto_cleanup, enable_compression, memory_limit_mb,
|
||||
|
|
@ -31,61 +111,7 @@ impl Database {
|
|||
.map_err(|e| anyhow::anyhow!("Database query failed: {}", e))?;
|
||||
|
||||
match row {
|
||||
Some(row) => Ok(Some(crate::models::Settings {
|
||||
id: row.get("id"),
|
||||
user_id: row.get("user_id"),
|
||||
ocr_language: row.get("ocr_language"),
|
||||
concurrent_ocr_jobs: row.get("concurrent_ocr_jobs"),
|
||||
ocr_timeout_seconds: row.get("ocr_timeout_seconds"),
|
||||
max_file_size_mb: row.get("max_file_size_mb"),
|
||||
allowed_file_types: row.get("allowed_file_types"),
|
||||
auto_rotate_images: row.get("auto_rotate_images"),
|
||||
enable_image_preprocessing: row.get("enable_image_preprocessing"),
|
||||
search_results_per_page: row.get("search_results_per_page"),
|
||||
search_snippet_length: row.get("search_snippet_length"),
|
||||
fuzzy_search_threshold: row.get("fuzzy_search_threshold"),
|
||||
retention_days: row.get("retention_days"),
|
||||
enable_auto_cleanup: row.get("enable_auto_cleanup"),
|
||||
enable_compression: row.get("enable_compression"),
|
||||
memory_limit_mb: row.get("memory_limit_mb"),
|
||||
cpu_priority: row.get("cpu_priority"),
|
||||
enable_background_ocr: row.get("enable_background_ocr"),
|
||||
ocr_page_segmentation_mode: row.get("ocr_page_segmentation_mode"),
|
||||
ocr_engine_mode: row.get("ocr_engine_mode"),
|
||||
ocr_min_confidence: row.get("ocr_min_confidence"),
|
||||
ocr_dpi: row.get("ocr_dpi"),
|
||||
ocr_enhance_contrast: row.get("ocr_enhance_contrast"),
|
||||
ocr_remove_noise: row.get("ocr_remove_noise"),
|
||||
ocr_detect_orientation: row.get("ocr_detect_orientation"),
|
||||
ocr_whitelist_chars: row.get("ocr_whitelist_chars"),
|
||||
ocr_blacklist_chars: row.get("ocr_blacklist_chars"),
|
||||
ocr_brightness_boost: row.get("ocr_brightness_boost"),
|
||||
ocr_contrast_multiplier: row.get("ocr_contrast_multiplier"),
|
||||
ocr_noise_reduction_level: row.get("ocr_noise_reduction_level"),
|
||||
ocr_sharpening_strength: row.get("ocr_sharpening_strength"),
|
||||
ocr_morphological_operations: row.get("ocr_morphological_operations"),
|
||||
ocr_adaptive_threshold_window_size: row.get("ocr_adaptive_threshold_window_size"),
|
||||
ocr_histogram_equalization: row.get("ocr_histogram_equalization"),
|
||||
ocr_upscale_factor: row.get("ocr_upscale_factor"),
|
||||
ocr_max_image_width: row.get("ocr_max_image_width"),
|
||||
ocr_max_image_height: row.get("ocr_max_image_height"),
|
||||
save_processed_images: row.get("save_processed_images"),
|
||||
ocr_quality_threshold_brightness: row.get("ocr_quality_threshold_brightness"),
|
||||
ocr_quality_threshold_contrast: row.get("ocr_quality_threshold_contrast"),
|
||||
ocr_quality_threshold_noise: row.get("ocr_quality_threshold_noise"),
|
||||
ocr_quality_threshold_sharpness: row.get("ocr_quality_threshold_sharpness"),
|
||||
ocr_skip_enhancement: row.get("ocr_skip_enhancement"),
|
||||
webdav_enabled: row.get("webdav_enabled"),
|
||||
webdav_server_url: row.get("webdav_server_url"),
|
||||
webdav_username: row.get("webdav_username"),
|
||||
webdav_password: row.get("webdav_password"),
|
||||
webdav_watch_folders: row.get("webdav_watch_folders"),
|
||||
webdav_file_extensions: row.get("webdav_file_extensions"),
|
||||
webdav_auto_sync: row.get("webdav_auto_sync"),
|
||||
webdav_sync_interval_minutes: row.get("webdav_sync_interval_minutes"),
|
||||
created_at: row.get("created_at"),
|
||||
updated_at: row.get("updated_at"),
|
||||
})),
|
||||
Some(row) => Ok(Some(settings_from_row(&row))),
|
||||
None => Ok(None),
|
||||
}
|
||||
}).await
|
||||
|
|
@ -93,7 +119,11 @@ impl Database {
|
|||
|
||||
pub async fn get_all_user_settings(&self) -> Result<Vec<crate::models::Settings>> {
|
||||
let rows = sqlx::query(
|
||||
r#"SELECT id, user_id, ocr_language, concurrent_ocr_jobs, ocr_timeout_seconds,
|
||||
r#"SELECT id, user_id, ocr_language,
|
||||
COALESCE(preferred_languages, '["eng"]'::jsonb) as preferred_languages,
|
||||
COALESCE(primary_language, 'eng') as primary_language,
|
||||
COALESCE(auto_detect_language_combination, false) as auto_detect_language_combination,
|
||||
concurrent_ocr_jobs, ocr_timeout_seconds,
|
||||
max_file_size_mb, allowed_file_types, auto_rotate_images, enable_image_preprocessing,
|
||||
search_results_per_page, search_snippet_length, fuzzy_search_threshold,
|
||||
retention_days, enable_auto_cleanup, enable_compression, memory_limit_mb,
|
||||
|
|
@ -114,64 +144,9 @@ impl Database {
|
|||
.fetch_all(&self.pool)
|
||||
.await?;
|
||||
|
||||
let mut settings_list = Vec::new();
|
||||
for row in rows {
|
||||
settings_list.push(crate::models::Settings {
|
||||
id: row.get("id"),
|
||||
user_id: row.get("user_id"),
|
||||
ocr_language: row.get("ocr_language"),
|
||||
concurrent_ocr_jobs: row.get("concurrent_ocr_jobs"),
|
||||
ocr_timeout_seconds: row.get("ocr_timeout_seconds"),
|
||||
max_file_size_mb: row.get("max_file_size_mb"),
|
||||
allowed_file_types: row.get("allowed_file_types"),
|
||||
auto_rotate_images: row.get("auto_rotate_images"),
|
||||
enable_image_preprocessing: row.get("enable_image_preprocessing"),
|
||||
search_results_per_page: row.get("search_results_per_page"),
|
||||
search_snippet_length: row.get("search_snippet_length"),
|
||||
fuzzy_search_threshold: row.get("fuzzy_search_threshold"),
|
||||
retention_days: row.get("retention_days"),
|
||||
enable_auto_cleanup: row.get("enable_auto_cleanup"),
|
||||
enable_compression: row.get("enable_compression"),
|
||||
memory_limit_mb: row.get("memory_limit_mb"),
|
||||
cpu_priority: row.get("cpu_priority"),
|
||||
enable_background_ocr: row.get("enable_background_ocr"),
|
||||
ocr_page_segmentation_mode: row.get("ocr_page_segmentation_mode"),
|
||||
ocr_engine_mode: row.get("ocr_engine_mode"),
|
||||
ocr_min_confidence: row.get("ocr_min_confidence"),
|
||||
ocr_dpi: row.get("ocr_dpi"),
|
||||
ocr_enhance_contrast: row.get("ocr_enhance_contrast"),
|
||||
ocr_remove_noise: row.get("ocr_remove_noise"),
|
||||
ocr_detect_orientation: row.get("ocr_detect_orientation"),
|
||||
ocr_whitelist_chars: row.get("ocr_whitelist_chars"),
|
||||
ocr_blacklist_chars: row.get("ocr_blacklist_chars"),
|
||||
ocr_brightness_boost: row.get("ocr_brightness_boost"),
|
||||
ocr_contrast_multiplier: row.get("ocr_contrast_multiplier"),
|
||||
ocr_noise_reduction_level: row.get("ocr_noise_reduction_level"),
|
||||
ocr_sharpening_strength: row.get("ocr_sharpening_strength"),
|
||||
ocr_morphological_operations: row.get("ocr_morphological_operations"),
|
||||
ocr_adaptive_threshold_window_size: row.get("ocr_adaptive_threshold_window_size"),
|
||||
ocr_histogram_equalization: row.get("ocr_histogram_equalization"),
|
||||
ocr_upscale_factor: row.get("ocr_upscale_factor"),
|
||||
ocr_max_image_width: row.get("ocr_max_image_width"),
|
||||
ocr_max_image_height: row.get("ocr_max_image_height"),
|
||||
save_processed_images: row.get("save_processed_images"),
|
||||
ocr_quality_threshold_brightness: row.get("ocr_quality_threshold_brightness"),
|
||||
ocr_quality_threshold_contrast: row.get("ocr_quality_threshold_contrast"),
|
||||
ocr_quality_threshold_noise: row.get("ocr_quality_threshold_noise"),
|
||||
ocr_quality_threshold_sharpness: row.get("ocr_quality_threshold_sharpness"),
|
||||
ocr_skip_enhancement: row.get("ocr_skip_enhancement"),
|
||||
webdav_enabled: row.get("webdav_enabled"),
|
||||
webdav_server_url: row.get("webdav_server_url"),
|
||||
webdav_username: row.get("webdav_username"),
|
||||
webdav_password: row.get("webdav_password"),
|
||||
webdav_watch_folders: row.get("webdav_watch_folders"),
|
||||
webdav_file_extensions: row.get("webdav_file_extensions"),
|
||||
webdav_auto_sync: row.get("webdav_auto_sync"),
|
||||
webdav_sync_interval_minutes: row.get("webdav_sync_interval_minutes"),
|
||||
created_at: row.get("created_at"),
|
||||
updated_at: row.get("updated_at"),
|
||||
});
|
||||
}
|
||||
let settings_list = rows.into_iter()
|
||||
.map(|row| settings_from_row(&row))
|
||||
.collect();
|
||||
|
||||
Ok(settings_list)
|
||||
}
|
||||
|
|
@ -191,7 +166,7 @@ impl Database {
|
|||
let row = sqlx::query(
|
||||
r#"
|
||||
INSERT INTO settings (
|
||||
user_id, ocr_language, concurrent_ocr_jobs, ocr_timeout_seconds,
|
||||
user_id, ocr_language, preferred_languages, primary_language, auto_detect_language_combination, concurrent_ocr_jobs, ocr_timeout_seconds,
|
||||
max_file_size_mb, allowed_file_types, auto_rotate_images, enable_image_preprocessing,
|
||||
search_results_per_page, search_snippet_length, fuzzy_search_threshold,
|
||||
retention_days, enable_auto_cleanup, enable_compression, memory_limit_mb,
|
||||
|
|
@ -206,59 +181,66 @@ impl Database {
|
|||
webdav_enabled, webdav_server_url, webdav_username, webdav_password,
|
||||
webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes
|
||||
)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, $37, $38, $39, $40, $41, $42, $43, $44, $45, $46, $47, $48, $49, $50)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, $37, $38, $39, $40, $41, $42, $43, $44, $45, $46, $47, $48, $49, $50, $51, $52, $53)
|
||||
ON CONFLICT (user_id) DO UPDATE SET
|
||||
ocr_language = $2,
|
||||
concurrent_ocr_jobs = $3,
|
||||
ocr_timeout_seconds = $4,
|
||||
max_file_size_mb = $5,
|
||||
allowed_file_types = $6,
|
||||
auto_rotate_images = $7,
|
||||
enable_image_preprocessing = $8,
|
||||
search_results_per_page = $9,
|
||||
search_snippet_length = $10,
|
||||
fuzzy_search_threshold = $11,
|
||||
retention_days = $12,
|
||||
enable_auto_cleanup = $13,
|
||||
enable_compression = $14,
|
||||
memory_limit_mb = $15,
|
||||
cpu_priority = $16,
|
||||
enable_background_ocr = $17,
|
||||
ocr_page_segmentation_mode = $18,
|
||||
ocr_engine_mode = $19,
|
||||
ocr_min_confidence = $20,
|
||||
ocr_dpi = $21,
|
||||
ocr_enhance_contrast = $22,
|
||||
ocr_remove_noise = $23,
|
||||
ocr_detect_orientation = $24,
|
||||
ocr_whitelist_chars = $25,
|
||||
ocr_blacklist_chars = $26,
|
||||
ocr_brightness_boost = $27,
|
||||
ocr_contrast_multiplier = $28,
|
||||
ocr_noise_reduction_level = $29,
|
||||
ocr_sharpening_strength = $30,
|
||||
ocr_morphological_operations = $31,
|
||||
ocr_adaptive_threshold_window_size = $32,
|
||||
ocr_histogram_equalization = $33,
|
||||
ocr_upscale_factor = $34,
|
||||
ocr_max_image_width = $35,
|
||||
ocr_max_image_height = $36,
|
||||
save_processed_images = $37,
|
||||
ocr_quality_threshold_brightness = $38,
|
||||
ocr_quality_threshold_contrast = $39,
|
||||
ocr_quality_threshold_noise = $40,
|
||||
ocr_quality_threshold_sharpness = $41,
|
||||
ocr_skip_enhancement = $42,
|
||||
webdav_enabled = $43,
|
||||
webdav_server_url = $44,
|
||||
webdav_username = $45,
|
||||
webdav_password = $46,
|
||||
webdav_watch_folders = $47,
|
||||
webdav_file_extensions = $48,
|
||||
webdav_auto_sync = $49,
|
||||
webdav_sync_interval_minutes = $50,
|
||||
preferred_languages = $3,
|
||||
primary_language = $4,
|
||||
auto_detect_language_combination = $5,
|
||||
concurrent_ocr_jobs = $6,
|
||||
ocr_timeout_seconds = $7,
|
||||
max_file_size_mb = $8,
|
||||
allowed_file_types = $9,
|
||||
auto_rotate_images = $10,
|
||||
enable_image_preprocessing = $11,
|
||||
search_results_per_page = $12,
|
||||
search_snippet_length = $13,
|
||||
fuzzy_search_threshold = $14,
|
||||
retention_days = $15,
|
||||
enable_auto_cleanup = $16,
|
||||
enable_compression = $17,
|
||||
memory_limit_mb = $18,
|
||||
cpu_priority = $19,
|
||||
enable_background_ocr = $20,
|
||||
ocr_page_segmentation_mode = $21,
|
||||
ocr_engine_mode = $22,
|
||||
ocr_min_confidence = $23,
|
||||
ocr_dpi = $24,
|
||||
ocr_enhance_contrast = $25,
|
||||
ocr_remove_noise = $26,
|
||||
ocr_detect_orientation = $27,
|
||||
ocr_whitelist_chars = $28,
|
||||
ocr_blacklist_chars = $29,
|
||||
ocr_brightness_boost = $30,
|
||||
ocr_contrast_multiplier = $31,
|
||||
ocr_noise_reduction_level = $32,
|
||||
ocr_sharpening_strength = $33,
|
||||
ocr_morphological_operations = $34,
|
||||
ocr_adaptive_threshold_window_size = $35,
|
||||
ocr_histogram_equalization = $36,
|
||||
ocr_upscale_factor = $37,
|
||||
ocr_max_image_width = $38,
|
||||
ocr_max_image_height = $39,
|
||||
save_processed_images = $40,
|
||||
ocr_quality_threshold_brightness = $41,
|
||||
ocr_quality_threshold_contrast = $42,
|
||||
ocr_quality_threshold_noise = $43,
|
||||
ocr_quality_threshold_sharpness = $44,
|
||||
ocr_skip_enhancement = $45,
|
||||
webdav_enabled = $46,
|
||||
webdav_server_url = $47,
|
||||
webdav_username = $48,
|
||||
webdav_password = $49,
|
||||
webdav_watch_folders = $50,
|
||||
webdav_file_extensions = $51,
|
||||
webdav_auto_sync = $52,
|
||||
webdav_sync_interval_minutes = $53,
|
||||
updated_at = NOW()
|
||||
RETURNING id, user_id, ocr_language, concurrent_ocr_jobs, ocr_timeout_seconds,
|
||||
RETURNING id, user_id, ocr_language,
|
||||
COALESCE(preferred_languages, '["eng"]'::jsonb) as preferred_languages,
|
||||
COALESCE(primary_language, 'eng') as primary_language,
|
||||
COALESCE(auto_detect_language_combination, false) as auto_detect_language_combination,
|
||||
concurrent_ocr_jobs, ocr_timeout_seconds,
|
||||
max_file_size_mb, allowed_file_types, auto_rotate_images, enable_image_preprocessing,
|
||||
search_results_per_page, search_snippet_length, fuzzy_search_threshold,
|
||||
retention_days, enable_auto_cleanup, enable_compression, memory_limit_mb,
|
||||
|
|
@ -277,6 +259,9 @@ impl Database {
|
|||
)
|
||||
.bind(user_id)
|
||||
.bind(settings.ocr_language.as_ref().unwrap_or(¤t.ocr_language))
|
||||
.bind(serde_json::to_value(settings.preferred_languages.as_ref().unwrap_or(¤t.preferred_languages)).unwrap())
|
||||
.bind(settings.primary_language.as_ref().unwrap_or(¤t.primary_language))
|
||||
.bind(settings.auto_detect_language_combination.unwrap_or(current.auto_detect_language_combination))
|
||||
.bind(settings.concurrent_ocr_jobs.unwrap_or(current.concurrent_ocr_jobs))
|
||||
.bind(settings.ocr_timeout_seconds.unwrap_or(current.ocr_timeout_seconds))
|
||||
.bind(settings.max_file_size_mb.unwrap_or(current.max_file_size_mb))
|
||||
|
|
@ -328,61 +313,7 @@ impl Database {
|
|||
.fetch_one(&self.pool)
|
||||
.await?;
|
||||
|
||||
Ok(crate::models::Settings {
|
||||
id: row.get("id"),
|
||||
user_id: row.get("user_id"),
|
||||
ocr_language: row.get("ocr_language"),
|
||||
concurrent_ocr_jobs: row.get("concurrent_ocr_jobs"),
|
||||
ocr_timeout_seconds: row.get("ocr_timeout_seconds"),
|
||||
max_file_size_mb: row.get("max_file_size_mb"),
|
||||
allowed_file_types: row.get("allowed_file_types"),
|
||||
auto_rotate_images: row.get("auto_rotate_images"),
|
||||
enable_image_preprocessing: row.get("enable_image_preprocessing"),
|
||||
search_results_per_page: row.get("search_results_per_page"),
|
||||
search_snippet_length: row.get("search_snippet_length"),
|
||||
fuzzy_search_threshold: row.get("fuzzy_search_threshold"),
|
||||
retention_days: row.get("retention_days"),
|
||||
enable_auto_cleanup: row.get("enable_auto_cleanup"),
|
||||
enable_compression: row.get("enable_compression"),
|
||||
memory_limit_mb: row.get("memory_limit_mb"),
|
||||
cpu_priority: row.get("cpu_priority"),
|
||||
enable_background_ocr: row.get("enable_background_ocr"),
|
||||
ocr_page_segmentation_mode: row.get("ocr_page_segmentation_mode"),
|
||||
ocr_engine_mode: row.get("ocr_engine_mode"),
|
||||
ocr_min_confidence: row.get("ocr_min_confidence"),
|
||||
ocr_dpi: row.get("ocr_dpi"),
|
||||
ocr_enhance_contrast: row.get("ocr_enhance_contrast"),
|
||||
ocr_remove_noise: row.get("ocr_remove_noise"),
|
||||
ocr_detect_orientation: row.get("ocr_detect_orientation"),
|
||||
ocr_whitelist_chars: row.get("ocr_whitelist_chars"),
|
||||
ocr_blacklist_chars: row.get("ocr_blacklist_chars"),
|
||||
ocr_brightness_boost: row.get("ocr_brightness_boost"),
|
||||
ocr_contrast_multiplier: row.get("ocr_contrast_multiplier"),
|
||||
ocr_noise_reduction_level: row.get("ocr_noise_reduction_level"),
|
||||
ocr_sharpening_strength: row.get("ocr_sharpening_strength"),
|
||||
ocr_morphological_operations: row.get("ocr_morphological_operations"),
|
||||
ocr_adaptive_threshold_window_size: row.get("ocr_adaptive_threshold_window_size"),
|
||||
ocr_histogram_equalization: row.get("ocr_histogram_equalization"),
|
||||
ocr_upscale_factor: row.get("ocr_upscale_factor"),
|
||||
ocr_max_image_width: row.get("ocr_max_image_width"),
|
||||
ocr_max_image_height: row.get("ocr_max_image_height"),
|
||||
save_processed_images: row.get("save_processed_images"),
|
||||
ocr_quality_threshold_brightness: row.get("ocr_quality_threshold_brightness"),
|
||||
ocr_quality_threshold_contrast: row.get("ocr_quality_threshold_contrast"),
|
||||
ocr_quality_threshold_noise: row.get("ocr_quality_threshold_noise"),
|
||||
ocr_quality_threshold_sharpness: row.get("ocr_quality_threshold_sharpness"),
|
||||
ocr_skip_enhancement: row.get("ocr_skip_enhancement"),
|
||||
webdav_enabled: row.get("webdav_enabled"),
|
||||
webdav_server_url: row.get("webdav_server_url"),
|
||||
webdav_username: row.get("webdav_username"),
|
||||
webdav_password: row.get("webdav_password"),
|
||||
webdav_watch_folders: row.get("webdav_watch_folders"),
|
||||
webdav_file_extensions: row.get("webdav_file_extensions"),
|
||||
webdav_auto_sync: row.get("webdav_auto_sync"),
|
||||
webdav_sync_interval_minutes: row.get("webdav_sync_interval_minutes"),
|
||||
created_at: row.get("created_at"),
|
||||
updated_at: row.get("updated_at"),
|
||||
})
|
||||
Ok(settings_from_row(&row))
|
||||
}
|
||||
|
||||
pub async fn update_user_ocr_language(&self, user_id: Uuid, language: &str) -> Result<()> {
|
||||
|
|
|
|||
|
|
@ -9,6 +9,9 @@ pub struct Settings {
|
|||
pub id: Uuid,
|
||||
pub user_id: Uuid,
|
||||
pub ocr_language: String,
|
||||
pub preferred_languages: Vec<String>,
|
||||
pub primary_language: String,
|
||||
pub auto_detect_language_combination: bool,
|
||||
pub concurrent_ocr_jobs: i32,
|
||||
pub ocr_timeout_seconds: i32,
|
||||
pub max_file_size_mb: i32,
|
||||
|
|
@ -64,6 +67,9 @@ pub struct Settings {
|
|||
#[derive(Debug, Serialize, Deserialize, ToSchema)]
|
||||
pub struct SettingsResponse {
|
||||
pub ocr_language: String,
|
||||
pub preferred_languages: Vec<String>,
|
||||
pub primary_language: String,
|
||||
pub auto_detect_language_combination: bool,
|
||||
pub concurrent_ocr_jobs: i32,
|
||||
pub ocr_timeout_seconds: i32,
|
||||
pub max_file_size_mb: i32,
|
||||
|
|
@ -117,6 +123,9 @@ pub struct SettingsResponse {
|
|||
#[derive(Debug, Serialize, Deserialize, ToSchema)]
|
||||
pub struct UpdateSettings {
|
||||
pub ocr_language: Option<String>,
|
||||
pub preferred_languages: Option<Vec<String>>,
|
||||
pub primary_language: Option<String>,
|
||||
pub auto_detect_language_combination: Option<bool>,
|
||||
pub concurrent_ocr_jobs: Option<i32>,
|
||||
pub ocr_timeout_seconds: Option<i32>,
|
||||
pub max_file_size_mb: Option<i32>,
|
||||
|
|
@ -171,6 +180,9 @@ impl From<Settings> for SettingsResponse {
|
|||
fn from(settings: Settings) -> Self {
|
||||
Self {
|
||||
ocr_language: settings.ocr_language,
|
||||
preferred_languages: settings.preferred_languages,
|
||||
primary_language: settings.primary_language,
|
||||
auto_detect_language_combination: settings.auto_detect_language_combination,
|
||||
concurrent_ocr_jobs: settings.concurrent_ocr_jobs,
|
||||
ocr_timeout_seconds: settings.ocr_timeout_seconds,
|
||||
max_file_size_mb: settings.max_file_size_mb,
|
||||
|
|
@ -223,12 +235,79 @@ impl From<Settings> for SettingsResponse {
|
|||
}
|
||||
}
|
||||
|
||||
impl UpdateSettings {
|
||||
/// Create an UpdateSettings that only updates language preferences
|
||||
pub fn language_update(
|
||||
preferred_languages: Vec<String>,
|
||||
primary_language: String,
|
||||
ocr_language: String,
|
||||
) -> Self {
|
||||
Self {
|
||||
preferred_languages: Some(preferred_languages),
|
||||
primary_language: Some(primary_language),
|
||||
ocr_language: Some(ocr_language),
|
||||
auto_detect_language_combination: None,
|
||||
concurrent_ocr_jobs: None,
|
||||
ocr_timeout_seconds: None,
|
||||
max_file_size_mb: None,
|
||||
allowed_file_types: None,
|
||||
auto_rotate_images: None,
|
||||
enable_image_preprocessing: None,
|
||||
search_results_per_page: None,
|
||||
search_snippet_length: None,
|
||||
fuzzy_search_threshold: None,
|
||||
retention_days: None,
|
||||
enable_auto_cleanup: None,
|
||||
enable_compression: None,
|
||||
memory_limit_mb: None,
|
||||
cpu_priority: None,
|
||||
enable_background_ocr: None,
|
||||
ocr_page_segmentation_mode: None,
|
||||
ocr_engine_mode: None,
|
||||
ocr_min_confidence: None,
|
||||
ocr_dpi: None,
|
||||
ocr_enhance_contrast: None,
|
||||
ocr_remove_noise: None,
|
||||
ocr_detect_orientation: None,
|
||||
ocr_whitelist_chars: None,
|
||||
ocr_blacklist_chars: None,
|
||||
ocr_brightness_boost: None,
|
||||
ocr_contrast_multiplier: None,
|
||||
ocr_noise_reduction_level: None,
|
||||
ocr_sharpening_strength: None,
|
||||
ocr_morphological_operations: None,
|
||||
ocr_adaptive_threshold_window_size: None,
|
||||
ocr_histogram_equalization: None,
|
||||
ocr_upscale_factor: None,
|
||||
ocr_max_image_width: None,
|
||||
ocr_max_image_height: None,
|
||||
save_processed_images: None,
|
||||
ocr_quality_threshold_brightness: None,
|
||||
ocr_quality_threshold_contrast: None,
|
||||
ocr_quality_threshold_noise: None,
|
||||
ocr_quality_threshold_sharpness: None,
|
||||
ocr_skip_enhancement: None,
|
||||
webdav_enabled: None,
|
||||
webdav_server_url: None,
|
||||
webdav_username: None,
|
||||
webdav_password: None,
|
||||
webdav_watch_folders: None,
|
||||
webdav_file_extensions: None,
|
||||
webdav_auto_sync: None,
|
||||
webdav_sync_interval_minutes: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Settings {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
id: Uuid::new_v4(),
|
||||
user_id: Uuid::nil(),
|
||||
ocr_language: "eng".to_string(),
|
||||
preferred_languages: vec!["eng".to_string()],
|
||||
primary_language: "eng".to_string(),
|
||||
auto_detect_language_combination: false,
|
||||
concurrent_ocr_jobs: 4,
|
||||
ocr_timeout_seconds: 300,
|
||||
max_file_size_mb: 50,
|
||||
|
|
|
|||
|
|
@ -249,10 +249,32 @@ impl EnhancedOcrService {
|
|||
needs_enhancement
|
||||
}
|
||||
|
||||
/// Build language combination string for Tesseract (e.g., "eng+spa")
|
||||
fn build_language_combination(&self, settings: &Settings) -> String {
|
||||
if settings.preferred_languages.len() > 1 {
|
||||
// Use preferred_languages with primary_language first
|
||||
let mut languages = settings.preferred_languages.clone();
|
||||
|
||||
// Ensure primary language is first
|
||||
languages.retain(|lang| lang != &settings.primary_language);
|
||||
languages.insert(0, settings.primary_language.clone());
|
||||
|
||||
// Join with + for Tesseract multi-language format
|
||||
languages.join("+")
|
||||
} else if !settings.preferred_languages.is_empty() {
|
||||
// Single language from preferred_languages
|
||||
settings.preferred_languages[0].clone()
|
||||
} else {
|
||||
// Fallback to ocr_language field for backward compatibility
|
||||
settings.ocr_language.clone()
|
||||
}
|
||||
}
|
||||
|
||||
/// Configure Tesseract with optimal settings
|
||||
#[cfg(feature = "ocr")]
|
||||
fn configure_tesseract(&self, image_path: &str, settings: &Settings) -> Result<Tesseract> {
|
||||
let mut tesseract = Tesseract::new(None, Some(&settings.ocr_language))?;
|
||||
let language_combination = self.build_language_combination(settings);
|
||||
let mut tesseract = Tesseract::new(None, Some(&language_combination))?;
|
||||
|
||||
// Set the image
|
||||
tesseract = tesseract.set_image(image_path)?;
|
||||
|
|
|
|||
|
|
@ -123,6 +123,55 @@ impl OcrHealthChecker {
|
|||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Validate a language combination (e.g., "eng+spa")
|
||||
pub fn validate_language_combination(&self, lang_combination: &str) -> Result<(), OcrError> {
|
||||
if lang_combination.is_empty() {
|
||||
return Err(OcrError::LanguageDataNotFound {
|
||||
lang: "empty".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
// Split by '+' to handle multi-language combinations
|
||||
let languages: Vec<&str> = lang_combination.split('+').collect();
|
||||
|
||||
// Validate each language in the combination
|
||||
for lang in &languages {
|
||||
self.validate_language(lang.trim())?;
|
||||
}
|
||||
|
||||
// Limit number of languages for performance (max 4)
|
||||
if languages.len() > 4 {
|
||||
return Err(OcrError::LanguageDataNotFound {
|
||||
lang: format!("Too many languages in combination: {}. Maximum is 4.", languages.len()),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Validate a list of preferred languages
|
||||
pub fn validate_preferred_languages(&self, languages: &[String]) -> Result<(), OcrError> {
|
||||
if languages.is_empty() {
|
||||
return Err(OcrError::LanguageDataNotFound {
|
||||
lang: "No languages provided".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
// Limit number of languages for performance
|
||||
if languages.len() > 4 {
|
||||
return Err(OcrError::LanguageDataNotFound {
|
||||
lang: format!("Too many preferred languages: {}. Maximum is 4.", languages.len()),
|
||||
});
|
||||
}
|
||||
|
||||
// Validate each language
|
||||
for lang in languages {
|
||||
self.validate_language(lang)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn get_language_display_name(&self, lang_code: &str) -> String {
|
||||
match lang_code {
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ impl OcrService {
|
|||
// Perform health checks first
|
||||
self.health_checker.check_tesseract_installation()
|
||||
.map_err(|e: OcrError| anyhow!(e))?;
|
||||
self.health_checker.check_language_data(lang)
|
||||
self.health_checker.validate_language_combination(lang)
|
||||
.map_err(|e: OcrError| anyhow!(e))?;
|
||||
|
||||
let mut tesseract = Tesseract::new(None, Some(lang))
|
||||
|
|
|
|||
|
|
@ -40,6 +40,7 @@ pub async fn upload_document(
|
|||
) -> Result<Json<DocumentUploadResponse>, StatusCode> {
|
||||
let mut uploaded_file = None;
|
||||
let mut ocr_language: Option<String> = None;
|
||||
let mut ocr_languages: Vec<String> = Vec::new();
|
||||
|
||||
// First pass: collect all multipart fields
|
||||
while let Some(field) = multipart.next_field().await.map_err(|e| {
|
||||
|
|
@ -65,6 +66,22 @@ pub async fn upload_document(
|
|||
}
|
||||
}
|
||||
}
|
||||
} else if name == "ocr_languages" || name.starts_with("ocr_languages[") {
|
||||
let language = field.text().await.map_err(|_| StatusCode::BAD_REQUEST)?;
|
||||
if !language.trim().is_empty() {
|
||||
// Validate that the language is available
|
||||
let health_checker = crate::ocr::health::OcrHealthChecker::new();
|
||||
match health_checker.validate_language(language.trim()) {
|
||||
Ok(_) => {
|
||||
ocr_languages.push(language.trim().to_string());
|
||||
info!("OCR language added to list: {}", language);
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Invalid OCR language specified '{}': {}", language, e);
|
||||
return Err(StatusCode::BAD_REQUEST);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if name == "file" {
|
||||
let filename = field.file_name()
|
||||
.ok_or_else(|| {
|
||||
|
|
@ -143,8 +160,30 @@ pub async fn upload_document(
|
|||
Ok(IngestionResult::Created(document)) => {
|
||||
info!("Document uploaded successfully: {}", document.id);
|
||||
|
||||
// If a language was specified, update the user's OCR language setting
|
||||
if let Some(lang) = &ocr_language {
|
||||
// Update user's OCR language settings based on what was provided
|
||||
if !ocr_languages.is_empty() {
|
||||
// Multi-language support: update preferred languages
|
||||
let health_checker = crate::ocr::health::OcrHealthChecker::new();
|
||||
match health_checker.validate_preferred_languages(&ocr_languages) {
|
||||
Ok(_) => {
|
||||
let settings_update = crate::models::UpdateSettings::language_update(
|
||||
ocr_languages.clone(),
|
||||
ocr_languages[0].clone(), // First language as primary
|
||||
ocr_languages[0].clone(), // Backward compatibility
|
||||
);
|
||||
|
||||
if let Err(e) = state.db.create_or_update_settings(auth_user.user.id, &settings_update).await {
|
||||
warn!("Failed to update user preferred languages to {:?}: {}", ocr_languages, e);
|
||||
} else {
|
||||
info!("Updated user {} preferred languages to: {:?}", auth_user.user.id, ocr_languages);
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Invalid language combination provided: {}", e);
|
||||
}
|
||||
}
|
||||
} else if let Some(lang) = &ocr_language {
|
||||
// Single language (backward compatibility)
|
||||
if let Err(e) = state.db.update_user_ocr_language(auth_user.user.id, lang).await {
|
||||
warn!("Failed to update user OCR language to {}: {}", lang, e);
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -107,9 +107,31 @@ pub async fn retry_ocr(
|
|||
}
|
||||
}
|
||||
|
||||
// If a language was specified, validate and update the user's OCR language setting
|
||||
if let Some(lang) = &request.language {
|
||||
// Validate that the language is available
|
||||
// Update user's OCR language settings based on what was provided
|
||||
if let Some(languages) = &request.languages {
|
||||
// Multi-language support: validate and update preferred languages
|
||||
let health_checker = crate::ocr::health::OcrHealthChecker::new();
|
||||
match health_checker.validate_preferred_languages(languages) {
|
||||
Ok(_) => {
|
||||
let settings_update = crate::models::UpdateSettings::language_update(
|
||||
languages.clone(),
|
||||
languages[0].clone(), // First language as primary
|
||||
languages[0].clone(), // Backward compatibility
|
||||
);
|
||||
|
||||
if let Err(e) = state.db.create_or_update_settings(auth_user.user.id, &settings_update).await {
|
||||
warn!("Failed to update user preferred languages to {:?}: {}", languages, e);
|
||||
} else {
|
||||
info!("Updated user {} preferred languages to: {:?} for retry", auth_user.user.id, languages);
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Invalid language combination provided: {}", e);
|
||||
return Err(StatusCode::BAD_REQUEST);
|
||||
}
|
||||
}
|
||||
} else if let Some(lang) = &request.language {
|
||||
// Single language (backward compatibility)
|
||||
let health_checker = crate::ocr::health::OcrHealthChecker::new();
|
||||
match health_checker.validate_language(lang) {
|
||||
Ok(_) => {
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ pub struct DeleteLowConfidenceRequest {
|
|||
/// Request payload carrying the optional OCR language selection for a retry.
///
/// Both fields are optional. In the `retry_ocr` handler visible in this change,
/// `languages` is examined first; `language` is only consulted when `languages`
/// is absent.
#[derive(Deserialize, ToSchema)]
|
||||
pub struct RetryOcrRequest {
|
||||
/// Single OCR language code; the handler treats this as the
/// backward-compatible single-language path.
pub language: Option<String>,
|
||||
/// Multiple OCR language codes. When present, the handler validates the list
/// and uses its first entry as the primary language.
pub languages: Option<Vec<String>>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, ToSchema)]
|
||||
|
|
|
|||
|
|
@ -49,6 +49,9 @@ async fn get_settings(
|
|||
let default = crate::models::Settings::default();
|
||||
SettingsResponse {
|
||||
ocr_language: default.ocr_language,
|
||||
preferred_languages: default.preferred_languages,
|
||||
primary_language: default.primary_language,
|
||||
auto_detect_language_combination: default.auto_detect_language_combination,
|
||||
concurrent_ocr_jobs: default.concurrent_ocr_jobs,
|
||||
ocr_timeout_seconds: default.ocr_timeout_seconds,
|
||||
max_file_size_mb: default.max_file_size_mb,
|
||||
|
|
|
|||
Loading…
Reference in New Issue