feat(migrations): improve migrations and split large SQL statements into smaller ones

This commit is contained in:
perfectra1n 2025-06-12 22:27:04 -07:00
parent 4fa74a6333
commit 3dcf753ff3
6 changed files with 107 additions and 121 deletions

View File

@ -16,18 +16,20 @@ CREATE TABLE IF NOT EXISTS ocr_queue (
CONSTRAINT check_status CHECK (status IN ('pending', 'processing', 'completed', 'failed', 'cancelled'))
);
-- Indexes for efficient queue operations
-- Composite index drives the dequeue query: filter by status, then highest
-- priority first, oldest first within a priority.
CREATE INDEX IF NOT EXISTS idx_ocr_queue_status ON ocr_queue(status, priority DESC, created_at);
-- Lookup of queue entries by the document they belong to.
CREATE INDEX IF NOT EXISTS idx_ocr_queue_document_id ON ocr_queue(document_id);
-- Partial index: only rows currently claimed by a worker, so finding a
-- worker's in-flight jobs stays cheap regardless of total queue history.
CREATE INDEX IF NOT EXISTS idx_ocr_queue_worker ON ocr_queue(worker_id) WHERE status = 'processing';
-- Partial index over pending rows only; supports age-of-queue queries
-- (e.g. oldest_pending in get_ocr_queue_stats) without scanning completed rows.
CREATE INDEX IF NOT EXISTS idx_ocr_queue_created_at ON ocr_queue(created_at) WHERE status = 'pending';
-- Add processing status to documents
-- ocr_status mirrors the queue lifecycle; see check_status constraint above
-- for the value set used by ocr_queue. NOTE(review): documents.ocr_status has
-- no CHECK constraint of its own -- confirm writers only use the same values.
ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_status VARCHAR(20) DEFAULT 'pending';
-- Last OCR failure message for this document, if any.
ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_error TEXT;
-- When OCR finished for this document (UTC instant; TIMESTAMPTZ).
ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_completed_at TIMESTAMPTZ;
-- Metrics table for monitoring
CREATE TABLE IF NOT EXISTS ocr_metrics (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
date DATE DEFAULT CURRENT_DATE,
@ -41,27 +43,4 @@ CREATE TABLE IF NOT EXISTS ocr_metrics (
queue_depth INT,
active_workers INT,
UNIQUE(date, hour)
);
-- Function to get queue statistics
-- Returns a single row of aggregate health metrics for the OCR queue,
-- intended for monitoring dashboards.
-- FIX: removed the stray trailing ");" that followed "$$ LANGUAGE plpgsql;"
-- in the original file -- a syntax error that would abort the migration.
-- Declared STABLE: the function only reads ocr_queue and performs no writes,
-- so the planner may treat repeated calls within one statement as constant.
CREATE OR REPLACE FUNCTION get_ocr_queue_stats()
RETURNS TABLE (
    pending_count BIGINT,                    -- jobs waiting to be picked up
    processing_count BIGINT,                 -- jobs currently claimed by a worker
    failed_count BIGINT,                     -- jobs with no retries left
    completed_today BIGINT,                  -- jobs finished since midnight
    avg_wait_time_minutes DOUBLE PRECISION,  -- mean minutes from enqueue to start
    oldest_pending_minutes DOUBLE PRECISION  -- age of the longest-waiting pending job
) AS $$
BEGIN
RETURN QUERY
SELECT
    COUNT(*) FILTER (WHERE status = 'pending') as pending_count,
    COUNT(*) FILTER (WHERE status = 'processing') as processing_count,
    -- Only count failures that have exhausted their retry budget.
    COUNT(*) FILTER (WHERE status = 'failed' AND attempts >= max_attempts) as failed_count,
    COUNT(*) FILTER (WHERE status = 'completed' AND completed_at >= CURRENT_DATE) as completed_today,
    -- COALESCE(started_at, NOW()): rows without a start time contribute their
    -- wait-so-far instead of being dropped from the average.
    AVG(EXTRACT(EPOCH FROM (COALESCE(started_at, NOW()) - created_at))/60) FILTER (WHERE status IN ('processing', 'completed')) as avg_wait_time_minutes,
    MAX(EXTRACT(EPOCH FROM (NOW() - created_at))/60) FILTER (WHERE status = 'pending') as oldest_pending_minutes
FROM ocr_queue;
END;
$$ LANGUAGE plpgsql STABLE;

View File

@ -1,70 +1,27 @@
-- Add enhanced OCR metadata fields to documents table
-- All statements below use IF NOT EXISTS so the migration is idempotent
-- and safe to re-run.
ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_confidence REAL;
ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_word_count INT;
ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_processing_time_ms INT;
-- Add enhanced OCR configuration fields to settings table
-- Defaults mirror Tesseract's own defaults (PSM 3, OEM 3, 300 DPI); the
-- COMMENT ON COLUMN statements below document each value's meaning.
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_page_segmentation_mode INT DEFAULT 3;
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_engine_mode INT DEFAULT 3;
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_min_confidence REAL DEFAULT 30.0;
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_dpi INT DEFAULT 300;
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_enhance_contrast BOOLEAN DEFAULT true;
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_remove_noise BOOLEAN DEFAULT true;
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_detect_orientation BOOLEAN DEFAULT true;
-- NULL whitelist/blacklist means "no character restriction" (see COMMENTs).
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_whitelist_chars TEXT;
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_blacklist_chars TEXT;
-- Add comments for documentation
COMMENT ON COLUMN documents.ocr_confidence IS 'OCR confidence score (0-100)';
COMMENT ON COLUMN documents.ocr_word_count IS 'Number of words extracted by OCR';
COMMENT ON COLUMN documents.ocr_processing_time_ms IS 'Time taken for OCR processing in milliseconds';
COMMENT ON COLUMN settings.ocr_page_segmentation_mode IS 'Tesseract Page Segmentation Mode (0-13), default 3=PSM_AUTO';
COMMENT ON COLUMN settings.ocr_engine_mode IS 'Tesseract OCR Engine Mode (0-3), default 3=OEM_DEFAULT';
COMMENT ON COLUMN settings.ocr_min_confidence IS 'Minimum OCR confidence threshold (0-100)';
COMMENT ON COLUMN settings.ocr_dpi IS 'Target DPI for OCR processing, 0=auto';
COMMENT ON COLUMN settings.ocr_enhance_contrast IS 'Enable adaptive contrast enhancement';
COMMENT ON COLUMN settings.ocr_remove_noise IS 'Enable image noise removal';
COMMENT ON COLUMN settings.ocr_detect_orientation IS 'Enable automatic orientation detection';
COMMENT ON COLUMN settings.ocr_whitelist_chars IS 'Characters to allow in OCR (null=all)';
COMMENT ON COLUMN settings.ocr_blacklist_chars IS 'Characters to exclude from OCR (null=none)';
-- Create index on OCR confidence for quality filtering
-- Partial (IS NOT NULL) so documents that never ran OCR don't bloat the index.
CREATE INDEX IF NOT EXISTS idx_documents_ocr_confidence ON documents(ocr_confidence) WHERE ocr_confidence IS NOT NULL;
-- Create index on word count for analytics
CREATE INDEX IF NOT EXISTS idx_documents_ocr_word_count ON documents(ocr_word_count) WHERE ocr_word_count IS NOT NULL;
-- Update existing settings to have the new defaults
-- Backfill: only touches rows where the new column is still NULL, so the
-- statement is idempotent. NOTE(review): ADD COLUMN ... DEFAULT normally
-- populates existing rows as well, so this is a safety net expected to
-- match few or no rows -- confirm against the target PostgreSQL version.
UPDATE settings SET
    ocr_page_segmentation_mode = 3,
    ocr_engine_mode = 3,
    ocr_min_confidence = 30.0,
    ocr_dpi = 300,
    ocr_enhance_contrast = true,
    ocr_remove_noise = true,
    ocr_detect_orientation = true
WHERE ocr_page_segmentation_mode IS NULL;
-- Create a view for enhanced OCR analytics
-- One row per calendar day over the trailing 30 days, newest first.
CREATE OR REPLACE VIEW ocr_analytics AS
SELECT
    DATE(created_at) as date,
    COUNT(*) as total_documents,
    -- COUNT(col) counts non-NULL values only, so these measure coverage.
    COUNT(ocr_text) as documents_with_ocr,
    COUNT(ocr_confidence) as documents_with_confidence,
    AVG(ocr_confidence) as avg_confidence,
    MIN(ocr_confidence) as min_confidence,
    MAX(ocr_confidence) as max_confidence,
    AVG(ocr_word_count) as avg_word_count,
    SUM(ocr_word_count) as total_words_extracted,
    AVG(ocr_processing_time_ms) as avg_processing_time_ms,
    -- Quality buckets: <50 is "low", >=80 is "high"; 50-79 intentionally
    -- falls in neither bucket.
    COUNT(*) FILTER (WHERE ocr_confidence < 50) as low_confidence_count,
    COUNT(*) FILTER (WHERE ocr_confidence >= 80) as high_confidence_count,
    COUNT(*) FILTER (WHERE ocr_status = 'failed') as failed_ocr_count
FROM documents
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY DATE(created_at)
ORDER BY date DESC;
COMMENT ON VIEW ocr_analytics IS 'Daily OCR analytics for monitoring quality and performance';
-- NOTE(review): duplicate of idx_documents_ocr_word_count created earlier in
-- this migration; harmless because of IF NOT EXISTS, but should be removed.
CREATE INDEX IF NOT EXISTS idx_documents_ocr_word_count ON documents(ocr_word_count) WHERE ocr_word_count IS NOT NULL;

View File

@ -0,0 +1,21 @@
-- Aggregate health metrics for the OCR queue, one row per call, intended
-- for monitoring dashboards.
-- Declared STABLE (was implicitly VOLATILE): the function only reads
-- ocr_queue and performs no writes, so the planner may treat repeated calls
-- within a single statement as constant.
CREATE OR REPLACE FUNCTION get_ocr_queue_stats()
RETURNS TABLE (
    pending_count BIGINT,                    -- jobs waiting to be picked up
    processing_count BIGINT,                 -- jobs currently claimed by a worker
    failed_count BIGINT,                     -- jobs with no retries left
    completed_today BIGINT,                  -- jobs finished since midnight
    avg_wait_time_minutes DOUBLE PRECISION,  -- mean minutes from enqueue to start
    oldest_pending_minutes DOUBLE PRECISION  -- age of the longest-waiting pending job
) AS $$
BEGIN
RETURN QUERY
SELECT
    COUNT(*) FILTER (WHERE status = 'pending') as pending_count,
    COUNT(*) FILTER (WHERE status = 'processing') as processing_count,
    -- Only count failures that have exhausted their retry budget.
    COUNT(*) FILTER (WHERE status = 'failed' AND attempts >= max_attempts) as failed_count,
    COUNT(*) FILTER (WHERE status = 'completed' AND completed_at >= CURRENT_DATE) as completed_today,
    -- COALESCE(started_at, NOW()): rows without a start time contribute their
    -- wait-so-far instead of being dropped from the average.
    AVG(EXTRACT(EPOCH FROM (COALESCE(started_at, NOW()) - created_at))/60) FILTER (WHERE status IN ('processing', 'completed')) as avg_wait_time_minutes,
    MAX(EXTRACT(EPOCH FROM (NOW() - created_at))/60) FILTER (WHERE status = 'pending') as oldest_pending_minutes
FROM ocr_queue;
END;
$$ LANGUAGE plpgsql STABLE;

View File

@ -0,0 +1,9 @@
-- Backfill OCR defaults for settings rows that predate the new columns.
-- Guarded by IS NULL so the statement is idempotent and safe to re-run.
-- NOTE(review): ADD COLUMN ... DEFAULT normally populates existing rows as
-- well, so this is expected to match few or no rows -- confirm against the
-- target PostgreSQL version.
UPDATE settings SET
    ocr_page_segmentation_mode = 3,
    ocr_engine_mode = 3,
    ocr_min_confidence = 30.0,
    ocr_dpi = 300,
    ocr_enhance_contrast = true,
    ocr_remove_noise = true,
    ocr_detect_orientation = true
WHERE ocr_page_segmentation_mode IS NULL;

View File

@ -0,0 +1,19 @@
-- Daily OCR analytics: one row per calendar day over the trailing 30 days,
-- newest first. Used for monitoring OCR quality and throughput.
CREATE OR REPLACE VIEW ocr_analytics AS
SELECT
    DATE(created_at) as date,
    COUNT(*) as total_documents,
    -- COUNT(col) counts non-NULL values only, so these measure coverage.
    COUNT(ocr_text) as documents_with_ocr,
    COUNT(ocr_confidence) as documents_with_confidence,
    AVG(ocr_confidence) as avg_confidence,
    MIN(ocr_confidence) as min_confidence,
    MAX(ocr_confidence) as max_confidence,
    AVG(ocr_word_count) as avg_word_count,
    SUM(ocr_word_count) as total_words_extracted,
    AVG(ocr_processing_time_ms) as avg_processing_time_ms,
    -- Quality buckets: <50 is "low", >=80 is "high"; 50-79 is intentionally
    -- in neither bucket.
    COUNT(*) FILTER (WHERE ocr_confidence < 50) as low_confidence_count,
    COUNT(*) FILTER (WHERE ocr_confidence >= 80) as high_confidence_count,
    COUNT(*) FILTER (WHERE ocr_status = 'failed') as failed_ocr_count
FROM documents
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY DATE(created_at)
ORDER BY date DESC;
-- Restored: this COMMENT existed in the pre-split migration but was dropped
-- when the view was moved into its own file.
COMMENT ON VIEW ocr_analytics IS 'Daily OCR analytics for monitoring quality and performance';

View File

@ -114,11 +114,12 @@ impl MigrationRunner {
// Start a transaction
let mut tx = self.pool.begin().await?;
// Split SQL into individual statements and execute each one
let statements = self.split_sql_statements(&migration.sql);
// Simple approach: split on semicolons and execute each statement
let statements = self.split_simple(&migration.sql);
for (i, statement) in statements.iter().enumerate() {
if statement.trim().is_empty() {
let statement = statement.trim();
if statement.is_empty() {
continue;
}
@ -126,8 +127,8 @@ impl MigrationRunner {
.execute(&mut *tx)
.await
.map_err(|e| {
error!("Failed to apply migration {} statement {}: {}\nStatement: {}",
migration.version, i + 1, e, statement);
error!("Failed to execute statement {} in migration {}: {}\nStatement: {}",
i + 1, migration.version, e, statement);
e
})?;
}
@ -148,56 +149,56 @@ impl MigrationRunner {
Ok(())
}
/// Split SQL content into individual statements
fn split_sql_statements(&self, sql: &str) -> Vec<String> {
/// Simple SQL splitting - handle dollar-quoted strings properly
fn split_simple(&self, sql: &str) -> Vec<String> {
let mut statements = Vec::new();
let mut current_statement = String::new();
let mut in_function = false;
let mut function_depth = 0;
let mut current = String::new();
let mut in_dollar_quote = false;
let mut dollar_tag = String::new();
for line in sql.lines() {
let trimmed = line.trim();
// Skip comments and empty lines
if trimmed.is_empty() || trimmed.starts_with("--") {
// Skip empty lines and comments when not in a dollar quote
if !in_dollar_quote && (trimmed.is_empty() || trimmed.starts_with("--")) {
continue;
}
// Detect function definitions (PostgreSQL functions can contain semicolons)
if trimmed.contains("CREATE OR REPLACE FUNCTION") || trimmed.contains("CREATE FUNCTION") {
in_function = true;
function_depth = 0;
}
current_statement.push_str(line);
current_statement.push('\n');
// Track function block depth
if in_function {
if trimmed.contains("BEGIN") {
function_depth += 1;
}
if trimmed.contains("END;") {
function_depth -= 1;
if function_depth <= 0 {
in_function = false;
statements.push(current_statement.trim().to_string());
current_statement.clear();
continue;
// Check for dollar quote start/end
if let Some(tag_start) = line.find("$$") {
if !in_dollar_quote {
// Starting a dollar quote
in_dollar_quote = true;
// Extract the tag (if any) between the $$
if let Some(tag_end) = line[tag_start + 2..].find("$$") {
// This line both starts and ends the quote - shouldn't happen with functions
in_dollar_quote = false;
}
} else {
// Might be ending the dollar quote
if line.contains("$$") {
in_dollar_quote = false;
}
}
}
// For non-function statements, split on semicolon
if !in_function && trimmed.ends_with(';') {
statements.push(current_statement.trim().to_string());
current_statement.clear();
current.push_str(line);
current.push('\n');
// If not in dollar quote and line ends with semicolon, this is a complete statement
if !in_dollar_quote && trimmed.ends_with(';') {
let statement = current.trim();
if !statement.is_empty() {
statements.push(statement.to_string());
}
current.clear();
}
}
// Add any remaining statement
if !current_statement.trim().is_empty() {
statements.push(current_statement.trim().to_string());
// Add any remaining content as final statement
let final_statement = current.trim();
if !final_statement.is_empty() {
statements.push(final_statement.to_string());
}
statements