feat(client/server): update search tests and upgrade OCR

2025-06-12 22:02:26 -07:00 · 2025-06-12 22:02:26 -07:00 · d5f419ca18
parent 1a1f886f04
commit d5f419ca18
3 changed files with 173 additions and 18 deletions
--- a/apply_migrations.sh
+++ b/apply_migrations.sh
@ -0,0 +1,57 @@
+#!/bin/bash
+
+# Apply the enhanced OCR database migration (002) with psql, then verify it.
+# Usage: ./apply_migrations.sh [database_url]
+# Run from the directory that contains migrations/ (the path below is relative).
+# Exits non-zero if psql reports an error or the verification does not match.
+
+set -euo pipefail
+
+# Database URL: first argument, else $DATABASE_URL, else a local default.
+DATABASE_URL=${1:-${DATABASE_URL:-"postgresql://localhost/readur"}}
+
+# Shared psql invocation. -X skips ~/.psqlrc so output formatting is predictable;
+# ON_ERROR_STOP=1 makes psql stop and exit non-zero on the first SQL error
+# (without it, "psql -f" keeps going and exits 0 even when statements fail).
+run_psql() {
+    psql -X -v ON_ERROR_STOP=1 -d "$DATABASE_URL" "$@"
+}
+
+echo "Applying migrations to: $DATABASE_URL"
+
+# Apply migration 002 if it hasn't been applied yet
+echo "Checking if migration 002_add_enhanced_ocr_fields.sql needs to be applied..."
+
+# Count every column the migration adds (3 on documents, 9 on settings) so that
+# a partially applied migration is re-run instead of being skipped.
+# -A (unaligned) and -t (tuples only) make psql print the bare number.
+COLUMNS_EXIST=$(run_psql -At -c "
+SELECT COUNT(*)
+FROM information_schema.columns
+WHERE (table_name = 'documents'
+       AND column_name IN ('ocr_confidence', 'ocr_word_count', 'ocr_processing_time_ms'))
+   OR (table_name = 'settings'
+       AND column_name IN ('ocr_page_segmentation_mode', 'ocr_engine_mode',
+                           'ocr_min_confidence', 'ocr_dpi', 'ocr_enhance_contrast',
+                           'ocr_remove_noise', 'ocr_detect_orientation',
+                           'ocr_whitelist_chars', 'ocr_blacklist_chars'));
+")
+
+if [[ "$COLUMNS_EXIST" -eq 12 ]]; then
+    echo "Enhanced OCR fields already exist. Migration already applied."
+else
+    echo "Applying migration 002_add_enhanced_ocr_fields.sql..."
+    # --single-transaction wraps the file in BEGIN/COMMIT; combined with
+    # ON_ERROR_STOP a failing statement rolls the whole migration back.
+    run_psql --single-transaction -f migrations/002_add_enhanced_ocr_fields.sql
+    echo "Migration 002 applied successfully!"
+fi
+
+# Verify the migration was successful: one representative column per table.
+# With -At the two counts are printed as a single line, "1|1" on success.
+echo "Verifying migration..."
+VERIFICATION=$(run_psql -At -c "
+SELECT 
+    (SELECT COUNT(*) FROM information_schema.columns WHERE table_name = 'documents' AND column_name = 'ocr_confidence') as doc_cols,
+    (SELECT COUNT(*) FROM information_schema.columns WHERE table_name = 'settings' AND column_name = 'ocr_page_segmentation_mode') as settings_cols;
+")
+
+echo "Migration verification: $VERIFICATION"
+
+# Exact comparison instead of a loose regex, so only one match per table passes.
+if [[ "$VERIFICATION" == "1|1" ]]; then
+    echo "✅ Enhanced OCR migration completed successfully!"
+    echo ""
+    echo "New features available:"
+    echo "- OCR confidence scoring and quality validation"
+    echo "- Advanced image preprocessing for challenging images"
+    echo "- Configurable Tesseract PSM and OEM settings"
+    echo "- Intelligent brightness/contrast enhancement"
+    echo "- Adaptive noise removal and sharpening"
+    echo "- OCR analytics and monitoring"
+    echo ""
+    echo "You can now restart your Readur server to use the enhanced OCR features."
+else
+    echo "❌ Migration verification failed. Please check the logs above."
+    exit 1
+fi
--- a/migrations/002_add_enhanced_ocr_fields.sql
+++ b/migrations/002_add_enhanced_ocr_fields.sql
@ -0,0 +1,70 @@
+-- Migration 002: enhanced OCR fields.
+-- Adds OCR result metadata columns to documents, OCR tuning columns to settings,
+-- two partial indexes on documents, and the ocr_analytics view.
+-- Columns and indexes use IF NOT EXISTS and the view uses OR REPLACE, so running
+-- this file again skips objects that are already present.
+
+-- Add enhanced OCR metadata fields to documents table
+-- (no DEFAULT is declared, so existing rows hold NULL in all three columns)
+ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_confidence REAL;
+ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_word_count INT;
+ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_processing_time_ms INT;
+
+-- Add enhanced OCR configuration fields to settings table
+-- (columns with a DEFAULT are filled with it for existing rows; the two
+-- character-list columns declare no DEFAULT and therefore start as NULL)
+ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_page_segmentation_mode INT DEFAULT 3;
+ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_engine_mode INT DEFAULT 3;
+ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_min_confidence REAL DEFAULT 30.0;
+ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_dpi INT DEFAULT 300;
+ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_enhance_contrast BOOLEAN DEFAULT true;
+ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_remove_noise BOOLEAN DEFAULT true;
+ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_detect_orientation BOOLEAN DEFAULT true;
+ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_whitelist_chars TEXT;
+ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_blacklist_chars TEXT;
+
+-- Add comments for documentation
+COMMENT ON COLUMN documents.ocr_confidence IS 'OCR confidence score (0-100)';
+COMMENT ON COLUMN documents.ocr_word_count IS 'Number of words extracted by OCR';
+COMMENT ON COLUMN documents.ocr_processing_time_ms IS 'Time taken for OCR processing in milliseconds';
+
+COMMENT ON COLUMN settings.ocr_page_segmentation_mode IS 'Tesseract Page Segmentation Mode (0-13), default 3=PSM_AUTO';
+COMMENT ON COLUMN settings.ocr_engine_mode IS 'Tesseract OCR Engine Mode (0-3), default 3=OEM_DEFAULT';
+COMMENT ON COLUMN settings.ocr_min_confidence IS 'Minimum OCR confidence threshold (0-100)';
+COMMENT ON COLUMN settings.ocr_dpi IS 'Target DPI for OCR processing, 0=auto';
+COMMENT ON COLUMN settings.ocr_enhance_contrast IS 'Enable adaptive contrast enhancement';
+COMMENT ON COLUMN settings.ocr_remove_noise IS 'Enable image noise removal';
+COMMENT ON COLUMN settings.ocr_detect_orientation IS 'Enable automatic orientation detection';
+COMMENT ON COLUMN settings.ocr_whitelist_chars IS 'Characters to allow in OCR (null=all)';
+COMMENT ON COLUMN settings.ocr_blacklist_chars IS 'Characters to exclude from OCR (null=none)';
+
+-- Create index on OCR confidence for quality filtering
+-- (partial index: rows whose ocr_confidence is NULL are not indexed)
+CREATE INDEX IF NOT EXISTS idx_documents_ocr_confidence ON documents(ocr_confidence) WHERE ocr_confidence IS NOT NULL;
+
+-- Create index on word count for analytics
+-- (partial index: rows whose ocr_word_count is NULL are not indexed)
+CREATE INDEX IF NOT EXISTS idx_documents_ocr_word_count ON documents(ocr_word_count) WHERE ocr_word_count IS NOT NULL;
+
+-- Backfill defaults on settings rows that still have no page segmentation mode.
+-- Only rows where ocr_page_segmentation_mode IS NULL are touched; for those rows
+-- all seven columns listed below are overwritten, including any that already
+-- held a non-NULL value.
+UPDATE settings SET 
+    ocr_page_segmentation_mode = 3,
+    ocr_engine_mode = 3,
+    ocr_min_confidence = 30.0,
+    ocr_dpi = 300,
+    ocr_enhance_contrast = true,
+    ocr_remove_noise = true,
+    ocr_detect_orientation = true
+WHERE ocr_page_segmentation_mode IS NULL;
+
+-- Create a view for enhanced OCR analytics
+-- One row per calendar day of documents.created_at, newest day first, limited to
+-- documents created within 30 days of CURRENT_DATE at the time the view is queried.
+-- COUNT(column), AVG, MIN, MAX and SUM skip NULLs, so documents without a
+-- confidence value count towards total_documents but towards neither
+-- low_confidence_count (< 50) nor high_confidence_count (>= 80); values from
+-- 50 up to but not including 80 are in neither bucket as well.
+-- NOTE(review): created_at, ocr_text and ocr_status are not added by this
+-- migration; presumably they come from an earlier one. Confirm.
+CREATE OR REPLACE VIEW ocr_analytics AS
+SELECT 
+    DATE(created_at) as date,
+    COUNT(*) as total_documents,
+    COUNT(ocr_text) as documents_with_ocr,
+    COUNT(ocr_confidence) as documents_with_confidence,
+    AVG(ocr_confidence) as avg_confidence,
+    MIN(ocr_confidence) as min_confidence,
+    MAX(ocr_confidence) as max_confidence,
+    AVG(ocr_word_count) as avg_word_count,
+    SUM(ocr_word_count) as total_words_extracted,
+    AVG(ocr_processing_time_ms) as avg_processing_time_ms,
+    COUNT(*) FILTER (WHERE ocr_confidence < 50) as low_confidence_count,
+    COUNT(*) FILTER (WHERE ocr_confidence >= 80) as high_confidence_count,
+    COUNT(*) FILTER (WHERE ocr_status = 'failed') as failed_ocr_count
+FROM documents 
+WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
+GROUP BY DATE(created_at)
+ORDER BY date DESC;
+
+COMMENT ON VIEW ocr_analytics IS 'Daily OCR analytics for monitoring quality and performance';
--- a/src/ocr_queue.rs
+++ b/src/ocr_queue.rs
@ -8,7 +8,7 @@ use tokio::time::{sleep, Duration};
 use tracing::{error, info, warn};
 use uuid::Uuid;

-use crate::{db::Database, ocr::OcrService};
+use crate::{db::Database, enhanced_ocr::EnhancedOcrService};

 #[derive(Debug, Clone, Serialize, Deserialize, FromRow)]
 pub struct OcrQueueItem {
@ -204,7 +204,7 @@ impl OcrQueueService {
    }

    /// Process a single queue item
-    async fn process_item(&self, item: OcrQueueItem, ocr_service: &OcrService) -> Result<()> {
+    async fn process_item(&self, item: OcrQueueItem, ocr_service: &EnhancedOcrService) -> Result<()> {
        let start_time = std::time::Instant::now();
        
        info!("Processing OCR job {} for document {}", item.id, item.document_id);
@ -226,35 +226,62 @@ impl OcrQueueService {
                let file_path: String = row.get("file_path");
                let mime_type: String = row.get("mime_type");
                let user_id: Option<Uuid> = row.get("user_id");
-                // Get user's OCR settings
+                // Get user's OCR settings or use defaults
                let settings = if let Some(user_id) = user_id {
                    self.db.get_user_settings(user_id).await.ok().flatten()
+                        .unwrap_or_else(|| crate::models::Settings::default())
                } else {
-                    None
+                    crate::models::Settings::default()
                };

-                let ocr_language = settings
-                    .as_ref()
-                    .map(|s| s.ocr_language.clone())
-                    .unwrap_or_else(|| "eng".to_string());
-
-                // Perform OCR
-                match ocr_service.extract_text_with_lang(&file_path, &mime_type, &ocr_language).await {
-                    Ok(text) => {
-                        if !text.is_empty() {
-                            // Update document with OCR text
+                // Perform enhanced OCR
+                match ocr_service.extract_text(&file_path, &mime_type, &settings).await {
+                    Ok(ocr_result) => {
+                        // Validate OCR quality
+                        if !ocr_service.validate_ocr_quality(&ocr_result, &settings) {
+                            let error_msg = format!("OCR quality below threshold: {:.1}% confidence, {} words", 
+                                                   ocr_result.confidence, ocr_result.word_count);
+                            warn!("{}", error_msg);
+                            
+                            // Mark as failed for quality issues
+                            sqlx::query(
+                                r#"
+                                UPDATE documents
+                                SET ocr_status = 'failed',
+                                    ocr_error = $2,
+                                    updated_at = NOW()
+                                WHERE id = $1
+                                "#
+                            )
+                            .bind(item.document_id)
+                            .bind(&error_msg)
+                            .execute(&self.pool)
+                            .await?;
+                            
+                            self.mark_failed(item.id, &error_msg).await?;
+                            return Ok(());
+                        }
+                        
+                        if !ocr_result.text.is_empty() {
+                            // Update document with enhanced OCR text and metadata
                            sqlx::query(
                                r#"
                                UPDATE documents
                                SET ocr_text = $2,
                                    ocr_status = 'completed',
                                    ocr_completed_at = NOW(),
+                                    ocr_confidence = $3,
+                                    ocr_word_count = $4,
+                                    ocr_processing_time_ms = $5,
                                    updated_at = NOW()
                                WHERE id = $1
                                "#
                            )
                            .bind(item.document_id)
-                            .bind(text)
+                            .bind(&ocr_result.text)
+                            .bind(ocr_result.confidence)
+                            .bind(ocr_result.word_count as i32)
+                            .bind(ocr_result.processing_time_ms as i32)
                            .execute(&self.pool)
                            .await?;
                        }
@ -263,8 +290,9 @@ impl OcrQueueService {
                        self.mark_completed(item.id, processing_time_ms).await?;
                        
                        info!(
-                            "Successfully processed OCR job {} for document {} in {}ms",
-                            item.id, item.document_id, processing_time_ms
+                            "Successfully processed OCR job {} for document {} in {}ms - Enhanced OCR: {:.1}% confidence, {} words, Preprocessing: {:?}",
+                            item.id, item.document_id, processing_time_ms, 
+                            ocr_result.confidence, ocr_result.word_count, ocr_result.preprocessing_applied
                        );
                    }
                    Err(e) => {
@ -302,7 +330,7 @@ impl OcrQueueService {
    /// Start the worker loop
    pub async fn start_worker(self: Arc<Self>) -> Result<()> {
        let semaphore = Arc::new(Semaphore::new(self.max_concurrent_jobs));
-        let ocr_service = Arc::new(OcrService::new());
+        let ocr_service = Arc::new(EnhancedOcrService::new("/tmp".to_string()));
        
        info!(
            "Starting OCR worker {} with {} concurrent jobs",