feat(client/server): update search tests, and upgrade OCR

2025-06-12 22:02:26 -07:00 · 2025-06-12 22:02:26 -07:00 · d5f419ca18
parent 1a1f886f04
commit d5f419ca18
3 changed files with 173 additions and 18 deletions
--- a/apply_migrations.sh
+++ b/apply_migrations.sh
@ -0,0 +1,57 @@
 #!/bin/bash
 # Apply database migrations for enhanced OCR.
 #
 # Usage: ./apply_migrations.sh [database_url]
 #
 # The database URL comes from the first argument, then the DATABASE_URL
 # environment variable, then defaults to postgresql://localhost/readur.
 # Exits non-zero when psql fails, when any statement in the migration raises an
 # error, or when the post-migration column check comes up short.
 set -euo pipefail
 # Default database URL from environment or use provided argument
 DATABASE_URL=${1:-${DATABASE_URL:-"postgresql://localhost/readur"}}
 MIGRATION_NAME="002_add_enhanced_ocr_fields.sql"
 # Migration 002 adds 3 columns to documents and 9 columns to settings.
 EXPECTED_COLUMNS=12
 # Mask the password part of the URL (user:password@host) before printing it,
 # so credentials do not end up in terminals or CI logs.
 SAFE_DATABASE_URL=$(printf '%s\n' "$DATABASE_URL" | sed -E 's#(://[^:/@]*):[^@]*@#\1:****@#')
 echo "Applying migrations to: $SAFE_DATABASE_URL"
 # Print how many of the columns added by migration 002 currently exist.
 # -X ignores ~/.psqlrc, -q/-A/-t make psql print just the bare number.
 # The lookup is limited to schemas on the search path so that same-named
 # tables in unrelated schemas cannot skew the count.
 count_enhanced_ocr_columns() {
    psql "$DATABASE_URL" -X -q -A -t -v ON_ERROR_STOP=1 -c "
 SELECT COUNT(*)
 FROM information_schema.columns
 WHERE table_schema = ANY (current_schemas(false))
   AND (
        (table_name = 'documents'
         AND column_name IN ('ocr_confidence', 'ocr_word_count', 'ocr_processing_time_ms'))
     OR (table_name = 'settings'
         AND column_name IN ('ocr_page_segmentation_mode', 'ocr_engine_mode', 'ocr_min_confidence',
                             'ocr_dpi', 'ocr_enhance_contrast', 'ocr_remove_noise',
                             'ocr_detect_orientation', 'ocr_whitelist_chars', 'ocr_blacklist_chars'))
   );
 "
 }
 # Apply migration 002 if it hasn't been applied yet
 echo "Checking if migration $MIGRATION_NAME needs to be applied..."
 # Check if the new columns exist (on BOTH tables, not only documents)
 COLUMNS_EXIST=$(count_enhanced_ocr_columns)
 if [[ "$COLUMNS_EXIST" -eq "$EXPECTED_COLUMNS" ]]; then
    echo "Enhanced OCR fields already exist. Migration already applied."
 else
    # Prefer the path relative to the current directory (original behavior) and
    # fall back to the script's own directory so the script works from anywhere.
    MIGRATION_FILE="migrations/$MIGRATION_NAME"
    if [[ ! -f "$MIGRATION_FILE" ]]; then
       MIGRATION_FILE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/migrations/$MIGRATION_NAME"
    fi
    if [[ ! -f "$MIGRATION_FILE" ]]; then
       echo "❌ Migration file not found: migrations/$MIGRATION_NAME" >&2
       exit 1
    fi
    echo "Applying migration $MIGRATION_NAME..."
    # ON_ERROR_STOP makes psql exit non-zero on the first failing statement
    # (by default it keeps going and exits 0); --single-transaction rolls the
    # whole file back on failure so the schema is never left half-migrated.
    psql "$DATABASE_URL" -X -v ON_ERROR_STOP=1 --single-transaction -f "$MIGRATION_FILE"
    echo "Migration 002 applied successfully!"
 fi
 # Verify the migration was successful
 echo "Verifying migration..."
 VERIFICATION=$(count_enhanced_ocr_columns)
 echo "Migration verification: $VERIFICATION of $EXPECTED_COLUMNS enhanced OCR columns present"
 # Numeric comparison instead of pattern-matching psql's formatted output.
 if [[ "$VERIFICATION" -eq "$EXPECTED_COLUMNS" ]]; then
    echo "✅ Enhanced OCR migration completed successfully!"
    echo ""
    echo "New features available:"
    echo "- OCR confidence scoring and quality validation"
    echo "- Advanced image preprocessing for challenging images"
    echo "- Configurable Tesseract PSM and OEM settings"
    echo "- Intelligent brightness/contrast enhancement"
    echo "- Adaptive noise removal and sharpening"
    echo "- OCR analytics and monitoring"
    echo ""
    echo "You can now restart your Readur server to use the enhanced OCR features."
 else
    echo "❌ Migration verification failed. Please check the logs above."
    exit 1
 fi
--- a/migrations/002_add_enhanced_ocr_fields.sql
+++ b/migrations/002_add_enhanced_ocr_fields.sql
@ -0,0 +1,70 @@
 -- Migration 002: enhanced OCR metadata (documents) and OCR configuration (settings).
 -- All DDL below is guarded with IF NOT EXISTS / OR REPLACE.
 --
 -- documents: per-document OCR result metadata
 ALTER TABLE documents
     ADD COLUMN IF NOT EXISTS ocr_confidence REAL,
     ADD COLUMN IF NOT EXISTS ocr_word_count INT,
     ADD COLUMN IF NOT EXISTS ocr_processing_time_ms INT;
 -- settings: per-settings-row OCR tuning knobs, with their defaults
 ALTER TABLE settings
     ADD COLUMN IF NOT EXISTS ocr_page_segmentation_mode INT DEFAULT 3,
     ADD COLUMN IF NOT EXISTS ocr_engine_mode INT DEFAULT 3,
     ADD COLUMN IF NOT EXISTS ocr_min_confidence REAL DEFAULT 30.0,
     ADD COLUMN IF NOT EXISTS ocr_dpi INT DEFAULT 300,
     ADD COLUMN IF NOT EXISTS ocr_enhance_contrast BOOLEAN DEFAULT true,
     ADD COLUMN IF NOT EXISTS ocr_remove_noise BOOLEAN DEFAULT true,
     ADD COLUMN IF NOT EXISTS ocr_detect_orientation BOOLEAN DEFAULT true,
     ADD COLUMN IF NOT EXISTS ocr_whitelist_chars TEXT,
     ADD COLUMN IF NOT EXISTS ocr_blacklist_chars TEXT;
 -- Catalog documentation: documents columns
 COMMENT ON COLUMN documents.ocr_confidence IS 'OCR confidence score (0-100)';
 COMMENT ON COLUMN documents.ocr_word_count IS 'Number of words extracted by OCR';
 COMMENT ON COLUMN documents.ocr_processing_time_ms IS 'Time taken for OCR processing in milliseconds';
 -- Catalog documentation: settings columns
 COMMENT ON COLUMN settings.ocr_page_segmentation_mode IS 'Tesseract Page Segmentation Mode (0-13), default 3=PSM_AUTO';
 COMMENT ON COLUMN settings.ocr_engine_mode IS 'Tesseract OCR Engine Mode (0-3), default 3=OEM_DEFAULT';
 COMMENT ON COLUMN settings.ocr_min_confidence IS 'Minimum OCR confidence threshold (0-100)';
 COMMENT ON COLUMN settings.ocr_dpi IS 'Target DPI for OCR processing, 0=auto';
 COMMENT ON COLUMN settings.ocr_enhance_contrast IS 'Enable adaptive contrast enhancement';
 COMMENT ON COLUMN settings.ocr_remove_noise IS 'Enable image noise removal';
 COMMENT ON COLUMN settings.ocr_detect_orientation IS 'Enable automatic orientation detection';
 COMMENT ON COLUMN settings.ocr_whitelist_chars IS 'Characters to allow in OCR (null=all)';
 COMMENT ON COLUMN settings.ocr_blacklist_chars IS 'Characters to exclude from OCR (null=none)';
 -- Partial indexes: only rows that actually carry a value are indexed.
 -- Confidence index supports quality filtering; word-count index supports analytics.
 CREATE INDEX IF NOT EXISTS idx_documents_ocr_confidence
     ON documents (ocr_confidence)
     WHERE ocr_confidence IS NOT NULL;
 CREATE INDEX IF NOT EXISTS idx_documents_ocr_word_count
     ON documents (ocr_word_count)
     WHERE ocr_word_count IS NOT NULL;
 -- Backfill the defaults on any settings row that still has no segmentation mode
 UPDATE settings
 SET (
         ocr_page_segmentation_mode,
         ocr_engine_mode,
         ocr_min_confidence,
         ocr_dpi,
         ocr_enhance_contrast,
         ocr_remove_noise,
         ocr_detect_orientation
     ) = (3, 3, 30.0, 300, true, true, true)
 WHERE ocr_page_segmentation_mode IS NULL;
 -- Daily roll-up of OCR volume, quality and timing over the trailing 30 days
 CREATE OR REPLACE VIEW ocr_analytics AS
 SELECT
     DATE(created_at)                                   AS date,
     COUNT(*)                                           AS total_documents,
     COUNT(ocr_text)                                    AS documents_with_ocr,
     COUNT(ocr_confidence)                              AS documents_with_confidence,
     AVG(ocr_confidence)                                AS avg_confidence,
     MIN(ocr_confidence)                                AS min_confidence,
     MAX(ocr_confidence)                                AS max_confidence,
     AVG(ocr_word_count)                                AS avg_word_count,
     SUM(ocr_word_count)                                AS total_words_extracted,
     AVG(ocr_processing_time_ms)                        AS avg_processing_time_ms,
     COUNT(*) FILTER (WHERE ocr_confidence < 50)        AS low_confidence_count,
     COUNT(*) FILTER (WHERE ocr_confidence >= 80)       AS high_confidence_count,
     COUNT(*) FILTER (WHERE ocr_status = 'failed')      AS failed_ocr_count
 FROM documents
 WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
 GROUP BY DATE(created_at)
 ORDER BY date DESC;
 COMMENT ON VIEW ocr_analytics IS 'Daily OCR analytics for monitoring quality and performance';
--- a/src/ocr_queue.rs
+++ b/src/ocr_queue.rs
@ -8,7 +8,7 @@ use tokio::time::{sleep, Duration};
 use tracing::{error, info, warn};
 use uuid::Uuid;
-use crate::{db::Database, ocr::OcrService};
+use crate::{db::Database, enhanced_ocr::EnhancedOcrService};
 #[derive(Debug, Clone, Serialize, Deserialize, FromRow)]
 pub struct OcrQueueItem {
@ -204,7 +204,7 @@ impl OcrQueueService {
    }
    /// Process a single queue item
-    async fn process_item(&self, item: OcrQueueItem, ocr_service: &OcrService) -> Result<()> {
+    async fn process_item(&self, item: OcrQueueItem, ocr_service: &EnhancedOcrService) -> Result<()> {
        let start_time = std::time::Instant::now();
        info!("Processing OCR job {} for document {}", item.id, item.document_id);
@ -226,35 +226,62 @@ impl OcrQueueService {
                let file_path: String = row.get("file_path");
                let mime_type: String = row.get("mime_type");
                let user_id: Option<Uuid> = row.get("user_id");
-                // Get user's OCR settings
+                // Get user's OCR settings or use defaults
                let settings = if let Some(user_id) = user_id {
                    self.db.get_user_settings(user_id).await.ok().flatten()
                        .unwrap_or_else(|| crate::models::Settings::default())
                } else {
-                    None
+                    crate::models::Settings::default()
                };
-                let ocr_language = settings
+                // Perform enhanced OCR
-                    .as_ref()
+                match ocr_service.extract_text(&file_path, &mime_type, &settings).await {
-                    .map(|s| s.ocr_language.clone())
+                    Ok(ocr_result) => {
-                    .unwrap_or_else(|| "eng".to_string());
+                        // Validate OCR quality
-
+                        if !ocr_service.validate_ocr_quality(&ocr_result, &settings) {
-                // Perform OCR
+                            let error_msg = format!("OCR quality below threshold: {:.1}% confidence, {} words", 
-                match ocr_service.extract_text_with_lang(&file_path, &mime_type, &ocr_language).await {
+                                                   ocr_result.confidence, ocr_result.word_count);
-                    Ok(text) => {
+                            warn!("{}", error_msg);
-                        if !text.is_empty() {
+                            
-                            // Update document with OCR text
+                            // Mark as failed for quality issues
                            sqlx::query(
                                r#"
                                UPDATE documents
                                SET ocr_status = 'failed',
                                    ocr_error = $2,
                                    updated_at = NOW()
                                WHERE id = $1
                                "#
                            )
                            .bind(item.document_id)
                            .bind(&error_msg)
                            .execute(&self.pool)
                            .await?;
                            self.mark_failed(item.id, &error_msg).await?;
                            return Ok(());
                        }
                        if !ocr_result.text.is_empty() {
                            // Update document with enhanced OCR text and metadata
                            sqlx::query(
                                r#"
                                UPDATE documents
                                SET ocr_text = $2,
                                    ocr_status = 'completed',
                                    ocr_completed_at = NOW(),
                                    ocr_confidence = $3,
                                    ocr_word_count = $4,
                                    ocr_processing_time_ms = $5,
                                    updated_at = NOW()
                                WHERE id = $1
                                "#
                            )
                            .bind(item.document_id)
-                            .bind(text)
+                            .bind(&ocr_result.text)
                            .bind(ocr_result.confidence)
                            .bind(ocr_result.word_count as i32)
                            .bind(ocr_result.processing_time_ms as i32)
                            .execute(&self.pool)
                            .await?;
                        }
@ -263,8 +290,9 @@ impl OcrQueueService {
                        self.mark_completed(item.id, processing_time_ms).await?;
                        info!(
-                            "Successfully processed OCR job {} for document {} in {}ms",
+                            "Successfully processed OCR job {} for document {} in {}ms - Enhanced OCR: {:.1}% confidence, {} words, Preprocessing: {:?}",
-                            item.id, item.document_id, processing_time_ms
+                            item.id, item.document_id, processing_time_ms, 
                            ocr_result.confidence, ocr_result.word_count, ocr_result.preprocessing_applied
                        );
                    }
                    Err(e) => {
@ -302,7 +330,7 @@ impl OcrQueueService {
    /// Start the worker loop
    pub async fn start_worker(self: Arc<Self>) -> Result<()> {
        let semaphore = Arc::new(Semaphore::new(self.max_concurrent_jobs));
-        let ocr_service = Arc::new(OcrService::new());
+        let ocr_service = Arc::new(EnhancedOcrService::new("/tmp".to_string()));
        info!(
            "Starting OCR worker {} with {} concurrent jobs",