diff --git a/migrations/001_add_ocr_queue.sql b/migrations/001_add_ocr_queue.sql index c8ea217..3f4a19f 100644 --- a/migrations/001_add_ocr_queue.sql +++ b/migrations/001_add_ocr_queue.sql @@ -16,18 +16,20 @@ CREATE TABLE IF NOT EXISTS ocr_queue ( CONSTRAINT check_status CHECK (status IN ('pending', 'processing', 'completed', 'failed', 'cancelled')) ); --- Indexes for efficient queue operations CREATE INDEX IF NOT EXISTS idx_ocr_queue_status ON ocr_queue(status, priority DESC, created_at); + CREATE INDEX IF NOT EXISTS idx_ocr_queue_document_id ON ocr_queue(document_id); + CREATE INDEX IF NOT EXISTS idx_ocr_queue_worker ON ocr_queue(worker_id) WHERE status = 'processing'; + CREATE INDEX IF NOT EXISTS idx_ocr_queue_created_at ON ocr_queue(created_at) WHERE status = 'pending'; --- Add processing status to documents ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_status VARCHAR(20) DEFAULT 'pending'; + ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_error TEXT; + ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_completed_at TIMESTAMPTZ; --- Metrics table for monitoring CREATE TABLE IF NOT EXISTS ocr_metrics ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), date DATE DEFAULT CURRENT_DATE, @@ -41,27 +43,4 @@ CREATE TABLE IF NOT EXISTS ocr_metrics ( queue_depth INT, active_workers INT, UNIQUE(date, hour) -); - --- Function to get queue statistics -CREATE OR REPLACE FUNCTION get_ocr_queue_stats() -RETURNS TABLE ( - pending_count BIGINT, - processing_count BIGINT, - failed_count BIGINT, - completed_today BIGINT, - avg_wait_time_minutes DOUBLE PRECISION, - oldest_pending_minutes DOUBLE PRECISION -) AS $$ -BEGIN - RETURN QUERY - SELECT - COUNT(*) FILTER (WHERE status = 'pending') as pending_count, - COUNT(*) FILTER (WHERE status = 'processing') as processing_count, - COUNT(*) FILTER (WHERE status = 'failed' AND attempts >= max_attempts) as failed_count, - COUNT(*) FILTER (WHERE status = 'completed' AND completed_at >= CURRENT_DATE) as completed_today, - AVG(EXTRACT(EPOCH FROM (COALESCE(started_at, NOW()) - created_at))/60) FILTER (WHERE status IN ('processing', 'completed')) as avg_wait_time_minutes, - MAX(EXTRACT(EPOCH FROM (NOW() - created_at))/60) FILTER (WHERE status = 'pending') as oldest_pending_minutes - FROM ocr_queue; -END; -$$ LANGUAGE plpgsql; \ No newline at end of file +); \ No newline at end of file diff --git a/migrations/002_add_enhanced_ocr_fields.sql b/migrations/002_add_enhanced_ocr_fields.sql index 4567b54..cbf0fd0 100644 --- a/migrations/002_add_enhanced_ocr_fields.sql +++ b/migrations/002_add_enhanced_ocr_fields.sql @@ -1,70 +1,27 @@ --- Add enhanced OCR metadata fields to documents table ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_confidence REAL; + ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_word_count INT; + ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_processing_time_ms INT; --- Add enhanced OCR configuration fields to settings table ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_page_segmentation_mode INT DEFAULT 3; + ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_engine_mode INT DEFAULT 3; + ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_min_confidence REAL DEFAULT 30.0; + ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_dpi INT DEFAULT 300; + ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_enhance_contrast BOOLEAN DEFAULT true; + ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_remove_noise BOOLEAN DEFAULT true; + ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_detect_orientation BOOLEAN DEFAULT true; + ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_whitelist_chars TEXT; + ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_blacklist_chars TEXT; --- Add comments for documentation -COMMENT ON COLUMN documents.ocr_confidence IS 'OCR confidence score (0-100)'; -COMMENT ON COLUMN documents.ocr_word_count IS 'Number of words extracted by OCR'; -COMMENT ON COLUMN documents.ocr_processing_time_ms IS 'Time taken for OCR processing in milliseconds'; - -COMMENT ON COLUMN settings.ocr_page_segmentation_mode IS 'Tesseract Page Segmentation Mode (0-13), default 3=PSM_AUTO'; -COMMENT ON COLUMN settings.ocr_engine_mode IS 'Tesseract OCR Engine Mode (0-3), default 3=OEM_DEFAULT'; -COMMENT ON COLUMN settings.ocr_min_confidence IS 'Minimum OCR confidence threshold (0-100)'; -COMMENT ON COLUMN settings.ocr_dpi IS 'Target DPI for OCR processing, 0=auto'; -COMMENT ON COLUMN settings.ocr_enhance_contrast IS 'Enable adaptive contrast enhancement'; -COMMENT ON COLUMN settings.ocr_remove_noise IS 'Enable image noise removal'; -COMMENT ON COLUMN settings.ocr_detect_orientation IS 'Enable automatic orientation detection'; -COMMENT ON COLUMN settings.ocr_whitelist_chars IS 'Characters to allow in OCR (null=all)'; -COMMENT ON COLUMN settings.ocr_blacklist_chars IS 'Characters to exclude from OCR (null=none)'; - --- Create index on OCR confidence for quality filtering CREATE INDEX IF NOT EXISTS idx_documents_ocr_confidence ON documents(ocr_confidence) WHERE ocr_confidence IS NOT NULL; --- Create index on word count for analytics -CREATE INDEX IF NOT EXISTS idx_documents_ocr_word_count ON documents(ocr_word_count) WHERE ocr_word_count IS NOT NULL; - --- Update existing settings to have the new defaults -UPDATE settings SET - ocr_page_segmentation_mode = 3, - ocr_engine_mode = 3, - ocr_min_confidence = 30.0, - ocr_dpi = 300, - ocr_enhance_contrast = true, - ocr_remove_noise = true, - ocr_detect_orientation = true -WHERE ocr_page_segmentation_mode IS NULL; - --- Create a view for enhanced OCR analytics -CREATE OR REPLACE VIEW ocr_analytics AS -SELECT - DATE(created_at) as date, - COUNT(*) as total_documents, - COUNT(ocr_text) as documents_with_ocr, - COUNT(ocr_confidence) as documents_with_confidence, - AVG(ocr_confidence) as avg_confidence, - MIN(ocr_confidence) as min_confidence, - MAX(ocr_confidence) as max_confidence, - AVG(ocr_word_count) as avg_word_count, - SUM(ocr_word_count) as total_words_extracted, - AVG(ocr_processing_time_ms) as avg_processing_time_ms, - COUNT(*) FILTER (WHERE ocr_confidence < 50) as low_confidence_count, - COUNT(*) FILTER (WHERE ocr_confidence >= 80) as high_confidence_count, - COUNT(*) FILTER (WHERE ocr_status = 'failed') as failed_ocr_count -FROM documents -WHERE created_at >= CURRENT_DATE - INTERVAL '30 days' -GROUP BY DATE(created_at) -ORDER BY date DESC; - -COMMENT ON VIEW ocr_analytics IS 'Daily OCR analytics for monitoring quality and performance'; \ No newline at end of file +CREATE INDEX IF NOT EXISTS idx_documents_ocr_word_count ON documents(ocr_word_count) WHERE ocr_word_count IS NOT NULL; \ No newline at end of file diff --git a/migrations/003_add_ocr_queue_function.sql b/migrations/003_add_ocr_queue_function.sql new file mode 100644 index 0000000..c894256 --- /dev/null +++ b/migrations/003_add_ocr_queue_function.sql @@ -0,0 +1,21 @@ +CREATE OR REPLACE FUNCTION get_ocr_queue_stats() +RETURNS TABLE ( + pending_count BIGINT, + processing_count BIGINT, + failed_count BIGINT, + completed_today BIGINT, + avg_wait_time_minutes DOUBLE PRECISION, + oldest_pending_minutes DOUBLE PRECISION +) AS $$ +BEGIN + RETURN QUERY + SELECT + COUNT(*) FILTER (WHERE status = 'pending') as pending_count, + COUNT(*) FILTER (WHERE status = 'processing') as processing_count, + COUNT(*) FILTER (WHERE status = 'failed' AND attempts >= max_attempts) as failed_count, + COUNT(*) FILTER (WHERE status = 'completed' AND completed_at >= CURRENT_DATE) as completed_today, + AVG(EXTRACT(EPOCH FROM (COALESCE(started_at, NOW()) - created_at))/60) FILTER (WHERE status IN ('processing', 'completed')) as avg_wait_time_minutes, + MAX(EXTRACT(EPOCH FROM (NOW() - created_at))/60) FILTER (WHERE status = 'pending') as oldest_pending_minutes + FROM ocr_queue; +END; +$$ LANGUAGE plpgsql; \ No newline at end of file diff --git a/migrations/004_update_settings_defaults.sql b/migrations/004_update_settings_defaults.sql new file mode 100644 index 0000000..f966729 --- /dev/null +++ b/migrations/004_update_settings_defaults.sql @@ -0,0 +1,9 @@ +UPDATE settings SET + ocr_page_segmentation_mode = 3, + ocr_engine_mode = 3, + ocr_min_confidence = 30.0, + ocr_dpi = 300, + ocr_enhance_contrast = true, + ocr_remove_noise = true, + ocr_detect_orientation = true +WHERE ocr_page_segmentation_mode IS NULL; \ No newline at end of file diff --git a/migrations/005_create_ocr_analytics_view.sql b/migrations/005_create_ocr_analytics_view.sql new file mode 100644 index 0000000..08720ce --- /dev/null +++ b/migrations/005_create_ocr_analytics_view.sql @@ -0,0 +1,19 @@ +CREATE OR REPLACE VIEW ocr_analytics AS +SELECT + DATE(created_at) as date, + COUNT(*) as total_documents, + COUNT(ocr_text) as documents_with_ocr, + COUNT(ocr_confidence) as documents_with_confidence, + AVG(ocr_confidence) as avg_confidence, + MIN(ocr_confidence) as min_confidence, + MAX(ocr_confidence) as max_confidence, + AVG(ocr_word_count) as avg_word_count, + SUM(ocr_word_count) as total_words_extracted, + AVG(ocr_processing_time_ms) as avg_processing_time_ms, + COUNT(*) FILTER (WHERE ocr_confidence < 50) as low_confidence_count, + COUNT(*) FILTER (WHERE ocr_confidence >= 80) as high_confidence_count, + COUNT(*) FILTER (WHERE ocr_status = 'failed') as failed_ocr_count +FROM documents +WHERE created_at >= CURRENT_DATE - INTERVAL '30 days' +GROUP BY DATE(created_at) +ORDER BY date DESC; \ No newline at end of file diff --git a/src/migrations.rs b/src/migrations.rs index f2c40a6..6421607 100644 --- a/src/migrations.rs +++ b/src/migrations.rs @@ -114,11 +114,12 @@ impl MigrationRunner { // Start a transaction let mut tx = self.pool.begin().await?; - // Split SQL into individual statements and execute each one - let statements = self.split_sql_statements(&migration.sql); + // Simple approach: split on semicolons and execute each statement + let statements = self.split_simple(&migration.sql); for (i, statement) in statements.iter().enumerate() { - if statement.trim().is_empty() { + let statement = statement.trim(); + if statement.is_empty() { continue; } @@ -126,8 +127,8 @@ impl MigrationRunner { .execute(&mut *tx) .await .map_err(|e| { - error!("Failed to apply migration {} statement {}: {}\nStatement: {}", - migration.version, i + 1, e, statement); + error!("Failed to execute statement {} in migration {}: {}\nStatement: {}", + i + 1, migration.version, e, statement); e })?; } @@ -148,56 +149,56 @@ impl MigrationRunner { Ok(()) } - /// Split SQL content into individual statements - fn split_sql_statements(&self, sql: &str) -> Vec { + /// Simple SQL splitting - handle dollar-quoted strings properly + fn split_simple(&self, sql: &str) -> Vec { let mut statements = Vec::new(); - let mut current_statement = String::new(); - let mut in_function = false; - let mut function_depth = 0; + let mut current = String::new(); + let mut in_dollar_quote = false; + let mut dollar_tag = String::new(); for line in sql.lines() { let trimmed = line.trim(); - // Skip comments and empty lines - if trimmed.is_empty() || trimmed.starts_with("--") { + // Skip empty lines and comments when not in a dollar quote + if !in_dollar_quote && (trimmed.is_empty() || trimmed.starts_with("--")) { continue; } - // Detect function definitions (PostgreSQL functions can contain semicolons) - if trimmed.contains("CREATE OR REPLACE FUNCTION") || trimmed.contains("CREATE FUNCTION") { - in_function = true; - function_depth = 0; - } - - current_statement.push_str(line); - current_statement.push('\n'); - - // Track function block depth - if in_function { - if trimmed.contains("BEGIN") { - function_depth += 1; - } - if trimmed.contains("END;") { - function_depth -= 1; - if function_depth <= 0 { - in_function = false; - statements.push(current_statement.trim().to_string()); - current_statement.clear(); - continue; + // Check for dollar quote start/end + if let Some(tag_start) = line.find("$$") { + if !in_dollar_quote { + // Starting a dollar quote + in_dollar_quote = true; + // Extract the tag (if any) between the $$ + if let Some(tag_end) = line[tag_start + 2..].find("$$") { + // This line both starts and ends the quote - shouldn't happen with functions + in_dollar_quote = false; + } + } else { + // Might be ending the dollar quote + if line.contains("$$") { + in_dollar_quote = false; } } } - // For non-function statements, split on semicolon - if !in_function && trimmed.ends_with(';') { - statements.push(current_statement.trim().to_string()); - current_statement.clear(); + current.push_str(line); + current.push('\n'); + + // If not in dollar quote and line ends with semicolon, this is a complete statement + if !in_dollar_quote && trimmed.ends_with(';') { + let statement = current.trim(); + if !statement.is_empty() { + statements.push(statement.to_string()); + } + current.clear(); } } - // Add any remaining statement - if !current_statement.trim().is_empty() { - statements.push(current_statement.trim().to_string()); + // Add any remaining content as final statement + let final_statement = current.trim(); + if !final_statement.is_empty() { + statements.push(final_statement.to_string()); } statements