feat(migrations): improve migrations and split large SQL statements into smaller ones

This commit is contained in:
perfectra1n 2025-06-12 22:27:04 -07:00
parent 4fa74a6333
commit 3dcf753ff3
6 changed files with 107 additions and 121 deletions

View File

@ -16,18 +16,20 @@ CREATE TABLE IF NOT EXISTS ocr_queue (
CONSTRAINT check_status CHECK (status IN ('pending', 'processing', 'completed', 'failed', 'cancelled'))
);
-- Indexes for efficient queue operations
-- Composite index drives the dequeue query: filter by status, then highest
-- priority first, oldest first within a priority.
CREATE INDEX IF NOT EXISTS idx_ocr_queue_status ON ocr_queue(status, priority DESC, created_at);
-- Lookup of queue entries by the document they belong to.
CREATE INDEX IF NOT EXISTS idx_ocr_queue_document_id ON ocr_queue(document_id);
-- Partial index: only rows currently claimed by a worker, so finding a
-- worker's in-flight jobs stays cheap regardless of total queue history.
CREATE INDEX IF NOT EXISTS idx_ocr_queue_worker ON ocr_queue(worker_id) WHERE status = 'processing';
-- Partial index over pending rows only; supports age-of-queue queries
-- (e.g. oldest_pending in get_ocr_queue_stats) without scanning completed rows.
CREATE INDEX IF NOT EXISTS idx_ocr_queue_created_at ON ocr_queue(created_at) WHERE status = 'pending';
-- Add processing status to documents
-- ocr_status mirrors the queue lifecycle; see check_status constraint above
-- for the value set used by ocr_queue. NOTE(review): documents.ocr_status has
-- no CHECK constraint of its own -- confirm writers only use the same values.
ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_status VARCHAR(20) DEFAULT 'pending';
-- Last OCR failure message for this document, if any.
ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_error TEXT;
-- When OCR finished for this document (UTC instant; TIMESTAMPTZ).
ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_completed_at TIMESTAMPTZ;
-- Metrics table for monitoring
CREATE TABLE IF NOT EXISTS ocr_metrics (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
date DATE DEFAULT CURRENT_DATE,
@ -41,27 +43,4 @@ CREATE TABLE IF NOT EXISTS ocr_metrics (
queue_depth INT,
active_workers INT,
UNIQUE(date, hour)
);
-- Function to get queue statistics
-- Returns a single row of aggregate health metrics for the OCR queue,
-- intended for monitoring dashboards.
-- FIX: removed the stray trailing ");" that followed "$$ LANGUAGE plpgsql;"
-- in the original file -- a syntax error that would abort the migration.
-- Declared STABLE: the function only reads ocr_queue and performs no writes,
-- so the planner may treat repeated calls within one statement as constant.
CREATE OR REPLACE FUNCTION get_ocr_queue_stats()
RETURNS TABLE (
    pending_count BIGINT,                    -- jobs waiting to be picked up
    processing_count BIGINT,                 -- jobs currently claimed by a worker
    failed_count BIGINT,                     -- jobs with no retries left
    completed_today BIGINT,                  -- jobs finished since midnight
    avg_wait_time_minutes DOUBLE PRECISION,  -- mean minutes from enqueue to start
    oldest_pending_minutes DOUBLE PRECISION  -- age of the longest-waiting pending job
) AS $$
BEGIN
RETURN QUERY
SELECT
    COUNT(*) FILTER (WHERE status = 'pending') as pending_count,
    COUNT(*) FILTER (WHERE status = 'processing') as processing_count,
    -- Only count failures that have exhausted their retry budget.
    COUNT(*) FILTER (WHERE status = 'failed' AND attempts >= max_attempts) as failed_count,
    COUNT(*) FILTER (WHERE status = 'completed' AND completed_at >= CURRENT_DATE) as completed_today,
    -- COALESCE(started_at, NOW()): rows without a start time contribute their
    -- wait-so-far instead of being dropped from the average.
    AVG(EXTRACT(EPOCH FROM (COALESCE(started_at, NOW()) - created_at))/60) FILTER (WHERE status IN ('processing', 'completed')) as avg_wait_time_minutes,
    MAX(EXTRACT(EPOCH FROM (NOW() - created_at))/60) FILTER (WHERE status = 'pending') as oldest_pending_minutes
FROM ocr_queue;
END;
$$ LANGUAGE plpgsql STABLE;

View File

@ -1,70 +1,27 @@
-- Add enhanced OCR metadata fields to documents table
-- All statements below use IF NOT EXISTS so the migration is idempotent
-- and safe to re-run.
ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_confidence REAL;
ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_word_count INT;
ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_processing_time_ms INT;
-- Add enhanced OCR configuration fields to settings table
-- Defaults mirror Tesseract's own defaults (PSM 3, OEM 3, 300 DPI); the
-- COMMENT ON COLUMN statements below document each value's meaning.
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_page_segmentation_mode INT DEFAULT 3;
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_engine_mode INT DEFAULT 3;
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_min_confidence REAL DEFAULT 30.0;
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_dpi INT DEFAULT 300;
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_enhance_contrast BOOLEAN DEFAULT true;
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_remove_noise BOOLEAN DEFAULT true;
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_detect_orientation BOOLEAN DEFAULT true;
-- NULL whitelist/blacklist means "no character restriction" (see COMMENTs).
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_whitelist_chars TEXT;
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_blacklist_chars TEXT;
-- Add comments for documentation
COMMENT ON COLUMN documents.ocr_confidence IS 'OCR confidence score (0-100)';
COMMENT ON COLUMN documents.ocr_word_count IS 'Number of words extracted by OCR';
COMMENT ON COLUMN documents.ocr_processing_time_ms IS 'Time taken for OCR processing in milliseconds';
COMMENT ON COLUMN settings.ocr_page_segmentation_mode IS 'Tesseract Page Segmentation Mode (0-13), default 3=PSM_AUTO';
COMMENT ON COLUMN settings.ocr_engine_mode IS 'Tesseract OCR Engine Mode (0-3), default 3=OEM_DEFAULT';
COMMENT ON COLUMN settings.ocr_min_confidence IS 'Minimum OCR confidence threshold (0-100)';
COMMENT ON COLUMN settings.ocr_dpi IS 'Target DPI for OCR processing, 0=auto';
COMMENT ON COLUMN settings.ocr_enhance_contrast IS 'Enable adaptive contrast enhancement';
COMMENT ON COLUMN settings.ocr_remove_noise IS 'Enable image noise removal';
COMMENT ON COLUMN settings.ocr_detect_orientation IS 'Enable automatic orientation detection';
COMMENT ON COLUMN settings.ocr_whitelist_chars IS 'Characters to allow in OCR (null=all)';
COMMENT ON COLUMN settings.ocr_blacklist_chars IS 'Characters to exclude from OCR (null=none)';
-- Create index on OCR confidence for quality filtering
-- Partial (IS NOT NULL) so documents that never ran OCR don't bloat the index.
CREATE INDEX IF NOT EXISTS idx_documents_ocr_confidence ON documents(ocr_confidence) WHERE ocr_confidence IS NOT NULL;
-- Create index on word count for analytics
CREATE INDEX IF NOT EXISTS idx_documents_ocr_word_count ON documents(ocr_word_count) WHERE ocr_word_count IS NOT NULL;
-- Update existing settings to have the new defaults
-- Backfill: only touches rows where the new column is still NULL, so the
-- statement is idempotent. NOTE(review): ADD COLUMN ... DEFAULT normally
-- populates existing rows as well, so this is a safety net expected to
-- match few or no rows -- confirm against the target PostgreSQL version.
UPDATE settings SET
    ocr_page_segmentation_mode = 3,
    ocr_engine_mode = 3,
    ocr_min_confidence = 30.0,
    ocr_dpi = 300,
    ocr_enhance_contrast = true,
    ocr_remove_noise = true,
    ocr_detect_orientation = true
WHERE ocr_page_segmentation_mode IS NULL;
-- Create a view for enhanced OCR analytics
-- One row per calendar day over the trailing 30 days, newest first.
CREATE OR REPLACE VIEW ocr_analytics AS
SELECT
    DATE(created_at) as date,
    COUNT(*) as total_documents,
    -- COUNT(col) counts non-NULL values only, so these measure coverage.
    COUNT(ocr_text) as documents_with_ocr,
    COUNT(ocr_confidence) as documents_with_confidence,
    AVG(ocr_confidence) as avg_confidence,
    MIN(ocr_confidence) as min_confidence,
    MAX(ocr_confidence) as max_confidence,
    AVG(ocr_word_count) as avg_word_count,
    SUM(ocr_word_count) as total_words_extracted,
    AVG(ocr_processing_time_ms) as avg_processing_time_ms,
    -- Quality buckets: <50 is "low", >=80 is "high"; 50-79 intentionally
    -- falls in neither bucket.
    COUNT(*) FILTER (WHERE ocr_confidence < 50) as low_confidence_count,
    COUNT(*) FILTER (WHERE ocr_confidence >= 80) as high_confidence_count,
    COUNT(*) FILTER (WHERE ocr_status = 'failed') as failed_ocr_count
FROM documents
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY DATE(created_at)
ORDER BY date DESC;
COMMENT ON VIEW ocr_analytics IS 'Daily OCR analytics for monitoring quality and performance';
-- NOTE(review): duplicate of idx_documents_ocr_word_count created earlier in
-- this migration; harmless because of IF NOT EXISTS, but should be removed.
CREATE INDEX IF NOT EXISTS idx_documents_ocr_word_count ON documents(ocr_word_count) WHERE ocr_word_count IS NOT NULL;

View File

@ -0,0 +1,21 @@
-- Aggregate health metrics for the OCR queue, one row per call, intended
-- for monitoring dashboards.
-- Declared STABLE (was implicitly VOLATILE): the function only reads
-- ocr_queue and performs no writes, so the planner may treat repeated calls
-- within a single statement as constant.
CREATE OR REPLACE FUNCTION get_ocr_queue_stats()
RETURNS TABLE (
    pending_count BIGINT,                    -- jobs waiting to be picked up
    processing_count BIGINT,                 -- jobs currently claimed by a worker
    failed_count BIGINT,                     -- jobs with no retries left
    completed_today BIGINT,                  -- jobs finished since midnight
    avg_wait_time_minutes DOUBLE PRECISION,  -- mean minutes from enqueue to start
    oldest_pending_minutes DOUBLE PRECISION  -- age of the longest-waiting pending job
) AS $$
BEGIN
RETURN QUERY
SELECT
    COUNT(*) FILTER (WHERE status = 'pending') as pending_count,
    COUNT(*) FILTER (WHERE status = 'processing') as processing_count,
    -- Only count failures that have exhausted their retry budget.
    COUNT(*) FILTER (WHERE status = 'failed' AND attempts >= max_attempts) as failed_count,
    COUNT(*) FILTER (WHERE status = 'completed' AND completed_at >= CURRENT_DATE) as completed_today,
    -- COALESCE(started_at, NOW()): rows without a start time contribute their
    -- wait-so-far instead of being dropped from the average.
    AVG(EXTRACT(EPOCH FROM (COALESCE(started_at, NOW()) - created_at))/60) FILTER (WHERE status IN ('processing', 'completed')) as avg_wait_time_minutes,
    MAX(EXTRACT(EPOCH FROM (NOW() - created_at))/60) FILTER (WHERE status = 'pending') as oldest_pending_minutes
FROM ocr_queue;
END;
$$ LANGUAGE plpgsql STABLE;

View File

@ -0,0 +1,9 @@
-- Backfill OCR defaults for settings rows that predate the new columns.
-- Guarded by IS NULL so the statement is idempotent and safe to re-run.
-- NOTE(review): ADD COLUMN ... DEFAULT normally populates existing rows as
-- well, so this is expected to match few or no rows -- confirm against the
-- target PostgreSQL version.
UPDATE settings SET
    ocr_page_segmentation_mode = 3,
    ocr_engine_mode = 3,
    ocr_min_confidence = 30.0,
    ocr_dpi = 300,
    ocr_enhance_contrast = true,
    ocr_remove_noise = true,
    ocr_detect_orientation = true
WHERE ocr_page_segmentation_mode IS NULL;

View File

@ -0,0 +1,19 @@
-- Daily OCR analytics: one row per calendar day over the trailing 30 days,
-- newest first. Used for monitoring OCR quality and throughput.
CREATE OR REPLACE VIEW ocr_analytics AS
SELECT
    DATE(created_at) as date,
    COUNT(*) as total_documents,
    -- COUNT(col) counts non-NULL values only, so these measure coverage.
    COUNT(ocr_text) as documents_with_ocr,
    COUNT(ocr_confidence) as documents_with_confidence,
    AVG(ocr_confidence) as avg_confidence,
    MIN(ocr_confidence) as min_confidence,
    MAX(ocr_confidence) as max_confidence,
    AVG(ocr_word_count) as avg_word_count,
    SUM(ocr_word_count) as total_words_extracted,
    AVG(ocr_processing_time_ms) as avg_processing_time_ms,
    -- Quality buckets: <50 is "low", >=80 is "high"; 50-79 is intentionally
    -- in neither bucket.
    COUNT(*) FILTER (WHERE ocr_confidence < 50) as low_confidence_count,
    COUNT(*) FILTER (WHERE ocr_confidence >= 80) as high_confidence_count,
    COUNT(*) FILTER (WHERE ocr_status = 'failed') as failed_ocr_count
FROM documents
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY DATE(created_at)
ORDER BY date DESC;
-- Restored: this COMMENT existed in the pre-split migration but was dropped
-- when the view was moved into its own file.
COMMENT ON VIEW ocr_analytics IS 'Daily OCR analytics for monitoring quality and performance';

View File

@ -114,11 +114,12 @@ impl MigrationRunner {
// Start a transaction
let mut tx = self.pool.begin().await?;
// Split SQL into individual statements and execute each one
let statements = self.split_sql_statements(&migration.sql);
// Simple approach: split on semicolons and execute each statement
let statements = self.split_simple(&migration.sql);
for (i, statement) in statements.iter().enumerate() {
if statement.trim().is_empty() {
let statement = statement.trim();
if statement.is_empty() {
continue;
}
@ -126,8 +127,8 @@ impl MigrationRunner {
.execute(&mut *tx)
.await
.map_err(|e| {
error!("Failed to apply migration {} statement {}: {}\nStatement: {}",
migration.version, i + 1, e, statement);
error!("Failed to execute statement {} in migration {}: {}\nStatement: {}",
i + 1, migration.version, e, statement);
e
})?;
}
@ -148,56 +149,56 @@ impl MigrationRunner {
Ok(())
}
/// Split SQL content into individual statements
fn split_sql_statements(&self, sql: &str) -> Vec<String> {
/// Simple SQL splitting - handle dollar-quoted strings properly
fn split_simple(&self, sql: &str) -> Vec<String> {
let mut statements = Vec::new();
let mut current_statement = String::new();
let mut in_function = false;
let mut function_depth = 0;
let mut current = String::new();
let mut in_dollar_quote = false;
let mut dollar_tag = String::new();
for line in sql.lines() {
let trimmed = line.trim();
// Skip comments and empty lines
if trimmed.is_empty() || trimmed.starts_with("--") {
// Skip empty lines and comments when not in a dollar quote
if !in_dollar_quote && (trimmed.is_empty() || trimmed.starts_with("--")) {
continue;
}
// Detect function definitions (PostgreSQL functions can contain semicolons)
if trimmed.contains("CREATE OR REPLACE FUNCTION") || trimmed.contains("CREATE FUNCTION") {
in_function = true;
function_depth = 0;
}
current_statement.push_str(line);
current_statement.push('\n');
// Track function block depth
if in_function {
if trimmed.contains("BEGIN") {
function_depth += 1;
}
if trimmed.contains("END;") {
function_depth -= 1;
if function_depth <= 0 {
in_function = false;
statements.push(current_statement.trim().to_string());
current_statement.clear();
continue;
// Check for dollar quote start/end
if let Some(tag_start) = line.find("$$") {
if !in_dollar_quote {
// Starting a dollar quote
in_dollar_quote = true;
// Extract the tag (if any) between the $$
if let Some(tag_end) = line[tag_start + 2..].find("$$") {
// This line both starts and ends the quote - shouldn't happen with functions
in_dollar_quote = false;
}
} else {
// Might be ending the dollar quote
if line.contains("$$") {
in_dollar_quote = false;
}
}
}
// For non-function statements, split on semicolon
if !in_function && trimmed.ends_with(';') {
statements.push(current_statement.trim().to_string());
current_statement.clear();
current.push_str(line);
current.push('\n');
// If not in dollar quote and line ends with semicolon, this is a complete statement
if !in_dollar_quote && trimmed.ends_with(';') {
let statement = current.trim();
if !statement.is_empty() {
statements.push(statement.to_string());
}
current.clear();
}
}
// Add any remaining statement
if !current_statement.trim().is_empty() {
statements.push(current_statement.trim().to_string());
// Add any remaining content as final statement
let final_statement = current.trim();
if !final_statement.is_empty() {
statements.push(final_statement.to_string());
}
statements