feat(migrations): improve migrations and split large SQL statements into smaller ones
parent 4fa74a6333
commit 3dcf753ff3
@@ -16,18 +16,20 @@ CREATE TABLE IF NOT EXISTS ocr_queue (
    CONSTRAINT check_status CHECK (status IN ('pending', 'processing', 'completed', 'failed', 'cancelled'))
);

-- Indexes for efficient queue operations
CREATE INDEX IF NOT EXISTS idx_ocr_queue_status ON ocr_queue(status, priority DESC, created_at);

CREATE INDEX IF NOT EXISTS idx_ocr_queue_document_id ON ocr_queue(document_id);

CREATE INDEX IF NOT EXISTS idx_ocr_queue_worker ON ocr_queue(worker_id) WHERE status = 'processing';

CREATE INDEX IF NOT EXISTS idx_ocr_queue_created_at ON ocr_queue(created_at) WHERE status = 'pending';

-- Add processing status to documents
ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_status VARCHAR(20) DEFAULT 'pending';

ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_error TEXT;

ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_completed_at TIMESTAMPTZ;

-- Metrics table for monitoring
CREATE TABLE IF NOT EXISTS ocr_metrics (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    date DATE DEFAULT CURRENT_DATE,
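For context only, a minimal sketch (not part of this diff) of how a worker could claim the next pending job against this schema, using sqlx as the migration runner appears to. It relies only on columns referenced by the indexes above (status, priority, created_at, worker_id, started_at); the `id` primary key, the uuid feature, and the helper name are assumptions.

// Hypothetical sketch - not committed code.
use sqlx::PgPool;
use uuid::Uuid;

async fn claim_next_job(pool: &PgPool, worker_id: &str) -> Result<Option<Uuid>, sqlx::Error> {
    // idx_ocr_queue_status (status, priority DESC, created_at) backs this ORDER BY;
    // FOR UPDATE SKIP LOCKED lets several workers poll without blocking each other.
    let row: Option<(Uuid,)> = sqlx::query_as(
        r#"
        UPDATE ocr_queue
        SET status = 'processing', worker_id = $1, started_at = NOW()
        WHERE id = (
            SELECT id FROM ocr_queue
            WHERE status = 'pending'
            ORDER BY priority DESC, created_at
            LIMIT 1
            FOR UPDATE SKIP LOCKED
        )
        RETURNING id
        "#,
    )
    .bind(worker_id)
    .fetch_optional(pool)
    .await?;
    Ok(row.map(|r| r.0))
}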
@@ -41,27 +43,4 @@ CREATE TABLE IF NOT EXISTS ocr_metrics (
    queue_depth INT,
    active_workers INT,
    UNIQUE(date, hour)
);

-- Function to get queue statistics
CREATE OR REPLACE FUNCTION get_ocr_queue_stats()
RETURNS TABLE (
    pending_count BIGINT,
    processing_count BIGINT,
    failed_count BIGINT,
    completed_today BIGINT,
    avg_wait_time_minutes DOUBLE PRECISION,
    oldest_pending_minutes DOUBLE PRECISION
) AS $$
BEGIN
    RETURN QUERY
    SELECT
        COUNT(*) FILTER (WHERE status = 'pending') as pending_count,
        COUNT(*) FILTER (WHERE status = 'processing') as processing_count,
        COUNT(*) FILTER (WHERE status = 'failed' AND attempts >= max_attempts) as failed_count,
        COUNT(*) FILTER (WHERE status = 'completed' AND completed_at >= CURRENT_DATE) as completed_today,
        AVG(EXTRACT(EPOCH FROM (COALESCE(started_at, NOW()) - created_at))/60) FILTER (WHERE status IN ('processing', 'completed')) as avg_wait_time_minutes,
        MAX(EXTRACT(EPOCH FROM (NOW() - created_at))/60) FILTER (WHERE status = 'pending') as oldest_pending_minutes
    FROM ocr_queue;
END;
$$ LANGUAGE plpgsql;
);
@@ -1,70 +1,27 @@
-- Add enhanced OCR metadata fields to documents table
ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_confidence REAL;

ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_word_count INT;

ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_processing_time_ms INT;

-- Add enhanced OCR configuration fields to settings table
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_page_segmentation_mode INT DEFAULT 3;

ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_engine_mode INT DEFAULT 3;

ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_min_confidence REAL DEFAULT 30.0;

ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_dpi INT DEFAULT 300;

ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_enhance_contrast BOOLEAN DEFAULT true;

ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_remove_noise BOOLEAN DEFAULT true;

ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_detect_orientation BOOLEAN DEFAULT true;

ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_whitelist_chars TEXT;

ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_blacklist_chars TEXT;

-- Add comments for documentation
COMMENT ON COLUMN documents.ocr_confidence IS 'OCR confidence score (0-100)';
COMMENT ON COLUMN documents.ocr_word_count IS 'Number of words extracted by OCR';
COMMENT ON COLUMN documents.ocr_processing_time_ms IS 'Time taken for OCR processing in milliseconds';

COMMENT ON COLUMN settings.ocr_page_segmentation_mode IS 'Tesseract Page Segmentation Mode (0-13), default 3=PSM_AUTO';
COMMENT ON COLUMN settings.ocr_engine_mode IS 'Tesseract OCR Engine Mode (0-3), default 3=OEM_DEFAULT';
COMMENT ON COLUMN settings.ocr_min_confidence IS 'Minimum OCR confidence threshold (0-100)';
COMMENT ON COLUMN settings.ocr_dpi IS 'Target DPI for OCR processing, 0=auto';
COMMENT ON COLUMN settings.ocr_enhance_contrast IS 'Enable adaptive contrast enhancement';
COMMENT ON COLUMN settings.ocr_remove_noise IS 'Enable image noise removal';
COMMENT ON COLUMN settings.ocr_detect_orientation IS 'Enable automatic orientation detection';
COMMENT ON COLUMN settings.ocr_whitelist_chars IS 'Characters to allow in OCR (null=all)';
COMMENT ON COLUMN settings.ocr_blacklist_chars IS 'Characters to exclude from OCR (null=none)';

-- Create index on OCR confidence for quality filtering
CREATE INDEX IF NOT EXISTS idx_documents_ocr_confidence ON documents(ocr_confidence) WHERE ocr_confidence IS NOT NULL;

-- Create index on word count for analytics
CREATE INDEX IF NOT EXISTS idx_documents_ocr_word_count ON documents(ocr_word_count) WHERE ocr_word_count IS NOT NULL;

-- Update existing settings to have the new defaults
UPDATE settings SET
    ocr_page_segmentation_mode = 3,
    ocr_engine_mode = 3,
    ocr_min_confidence = 30.0,
    ocr_dpi = 300,
    ocr_enhance_contrast = true,
    ocr_remove_noise = true,
    ocr_detect_orientation = true
WHERE ocr_page_segmentation_mode IS NULL;

-- Create a view for enhanced OCR analytics
CREATE OR REPLACE VIEW ocr_analytics AS
SELECT
    DATE(created_at) as date,
    COUNT(*) as total_documents,
    COUNT(ocr_text) as documents_with_ocr,
    COUNT(ocr_confidence) as documents_with_confidence,
    AVG(ocr_confidence) as avg_confidence,
    MIN(ocr_confidence) as min_confidence,
    MAX(ocr_confidence) as max_confidence,
    AVG(ocr_word_count) as avg_word_count,
    SUM(ocr_word_count) as total_words_extracted,
    AVG(ocr_processing_time_ms) as avg_processing_time_ms,
    COUNT(*) FILTER (WHERE ocr_confidence < 50) as low_confidence_count,
    COUNT(*) FILTER (WHERE ocr_confidence >= 80) as high_confidence_count,
    COUNT(*) FILTER (WHERE ocr_status = 'failed') as failed_ocr_count
FROM documents
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY DATE(created_at)
ORDER BY date DESC;

COMMENT ON VIEW ocr_analytics IS 'Daily OCR analytics for monitoring quality and performance';
CREATE INDEX IF NOT EXISTS idx_documents_ocr_word_count ON documents(ocr_word_count) WHERE ocr_word_count IS NOT NULL;
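Purely as an illustration (not part of this commit), the new settings columns line up with Tesseract's CLI flags (--psm, --oem, --dpi, and the tessedit_char_whitelist config variable); the struct and function below are hypothetical and only mirror the column names.

// Hypothetical sketch - the application may use a library binding instead of the CLI.
use std::process::Command;

struct OcrSettings {
    ocr_page_segmentation_mode: i32, // --psm, 0-13
    ocr_engine_mode: i32,            // --oem, 0-3
    ocr_dpi: i32,                    // --dpi, 0 = auto
    ocr_whitelist_chars: Option<String>,
}

fn tesseract_command(input: &str, output_base: &str, s: &OcrSettings) -> Command {
    let mut cmd = Command::new("tesseract");
    cmd.arg(input).arg(output_base);
    cmd.arg("--psm").arg(s.ocr_page_segmentation_mode.to_string());
    cmd.arg("--oem").arg(s.ocr_engine_mode.to_string());
    if s.ocr_dpi > 0 {
        // 0 means "auto", so only pass an explicit DPI when one is configured.
        cmd.arg("--dpi").arg(s.ocr_dpi.to_string());
    }
    if let Some(chars) = &s.ocr_whitelist_chars {
        cmd.arg("-c").arg(format!("tessedit_char_whitelist={chars}"));
    }
    cmd
}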
@@ -0,0 +1,21 @@
CREATE OR REPLACE FUNCTION get_ocr_queue_stats()
RETURNS TABLE (
    pending_count BIGINT,
    processing_count BIGINT,
    failed_count BIGINT,
    completed_today BIGINT,
    avg_wait_time_minutes DOUBLE PRECISION,
    oldest_pending_minutes DOUBLE PRECISION
) AS $$
BEGIN
    RETURN QUERY
    SELECT
        COUNT(*) FILTER (WHERE status = 'pending') as pending_count,
        COUNT(*) FILTER (WHERE status = 'processing') as processing_count,
        COUNT(*) FILTER (WHERE status = 'failed' AND attempts >= max_attempts) as failed_count,
        COUNT(*) FILTER (WHERE status = 'completed' AND completed_at >= CURRENT_DATE) as completed_today,
        AVG(EXTRACT(EPOCH FROM (COALESCE(started_at, NOW()) - created_at))/60) FILTER (WHERE status IN ('processing', 'completed')) as avg_wait_time_minutes,
        MAX(EXTRACT(EPOCH FROM (NOW() - created_at))/60) FILTER (WHERE status = 'pending') as oldest_pending_minutes
    FROM ocr_queue;
END;
$$ LANGUAGE plpgsql;
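A minimal sketch (not part of this diff) of consuming the new function from the Rust side with sqlx; the OcrQueueStats struct and fetch_queue_stats helper are assumed names, and the two averages are Option because they are NULL when the queue is empty.

// Hypothetical sketch - not committed code.
use sqlx::PgPool;

#[derive(Debug, sqlx::FromRow)]
struct OcrQueueStats {
    pending_count: i64,
    processing_count: i64,
    failed_count: i64,
    completed_today: i64,
    avg_wait_time_minutes: Option<f64>,
    oldest_pending_minutes: Option<f64>,
}

async fn fetch_queue_stats(pool: &PgPool) -> Result<OcrQueueStats, sqlx::Error> {
    // The function always returns exactly one row, so fetch_one is enough.
    sqlx::query_as::<_, OcrQueueStats>("SELECT * FROM get_ocr_queue_stats()")
        .fetch_one(pool)
        .await
}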
@@ -0,0 +1,9 @@
UPDATE settings SET
    ocr_page_segmentation_mode = 3,
    ocr_engine_mode = 3,
    ocr_min_confidence = 30.0,
    ocr_dpi = 300,
    ocr_enhance_contrast = true,
    ocr_remove_noise = true,
    ocr_detect_orientation = true
WHERE ocr_page_segmentation_mode IS NULL;
@@ -0,0 +1,19 @@
CREATE OR REPLACE VIEW ocr_analytics AS
SELECT
    DATE(created_at) as date,
    COUNT(*) as total_documents,
    COUNT(ocr_text) as documents_with_ocr,
    COUNT(ocr_confidence) as documents_with_confidence,
    AVG(ocr_confidence) as avg_confidence,
    MIN(ocr_confidence) as min_confidence,
    MAX(ocr_confidence) as max_confidence,
    AVG(ocr_word_count) as avg_word_count,
    SUM(ocr_word_count) as total_words_extracted,
    AVG(ocr_processing_time_ms) as avg_processing_time_ms,
    COUNT(*) FILTER (WHERE ocr_confidence < 50) as low_confidence_count,
    COUNT(*) FILTER (WHERE ocr_confidence >= 80) as high_confidence_count,
    COUNT(*) FILTER (WHERE ocr_status = 'failed') as failed_ocr_count
FROM documents
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY DATE(created_at)
ORDER BY date DESC;
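In the same spirit, a hypothetical read of the ocr_analytics view (not part of this diff); it maps only a few of the view's columns and assumes sqlx's chrono feature for the DATE column.

// Hypothetical sketch - not committed code.
use chrono::NaiveDate;
use sqlx::PgPool;

#[derive(Debug, sqlx::FromRow)]
struct OcrAnalyticsRow {
    date: NaiveDate,
    total_documents: i64,
    documents_with_ocr: i64,
    avg_confidence: Option<f64>, // NULL on days with no confidence data
    failed_ocr_count: i64,
}

async fn recent_ocr_analytics(pool: &PgPool) -> Result<Vec<OcrAnalyticsRow>, sqlx::Error> {
    sqlx::query_as::<_, OcrAnalyticsRow>(
        "SELECT date, total_documents, documents_with_ocr, avg_confidence, failed_ocr_count
         FROM ocr_analytics",
    )
    .fetch_all(pool)
    .await
}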
@@ -114,11 +114,12 @@ impl MigrationRunner {
        // Start a transaction
        let mut tx = self.pool.begin().await?;

        // Split SQL into individual statements and execute each one
        let statements = self.split_sql_statements(&migration.sql);
        // Simple approach: split on semicolons and execute each statement
        let statements = self.split_simple(&migration.sql);

        for (i, statement) in statements.iter().enumerate() {
            if statement.trim().is_empty() {
            let statement = statement.trim();
            if statement.is_empty() {
                continue;
            }
@@ -126,8 +127,8 @@ impl MigrationRunner {
                .execute(&mut *tx)
                .await
                .map_err(|e| {
                    error!("Failed to apply migration {} statement {}: {}\nStatement: {}",
                        migration.version, i + 1, e, statement);
                    error!("Failed to execute statement {} in migration {}: {}\nStatement: {}",
                        i + 1, migration.version, e, statement);
                    e
                })?;
        }
@@ -148,56 +149,56 @@ impl MigrationRunner {
        Ok(())
    }

    /// Split SQL content into individual statements
    fn split_sql_statements(&self, sql: &str) -> Vec<String> {
    /// Simple SQL splitting - handle dollar-quoted strings properly
    fn split_simple(&self, sql: &str) -> Vec<String> {
        let mut statements = Vec::new();
        let mut current_statement = String::new();
        let mut in_function = false;
        let mut function_depth = 0;
        let mut current = String::new();
        let mut in_dollar_quote = false;
        let mut dollar_tag = String::new();

        for line in sql.lines() {
            let trimmed = line.trim();

            // Skip comments and empty lines
            if trimmed.is_empty() || trimmed.starts_with("--") {
            // Skip empty lines and comments when not in a dollar quote
            if !in_dollar_quote && (trimmed.is_empty() || trimmed.starts_with("--")) {
                continue;
            }

            // Detect function definitions (PostgreSQL functions can contain semicolons)
            if trimmed.contains("CREATE OR REPLACE FUNCTION") || trimmed.contains("CREATE FUNCTION") {
                in_function = true;
                function_depth = 0;
            }

            current_statement.push_str(line);
            current_statement.push('\n');

            // Track function block depth
            if in_function {
                if trimmed.contains("BEGIN") {
                    function_depth += 1;
                }
                if trimmed.contains("END;") {
                    function_depth -= 1;
                    if function_depth <= 0 {
                        in_function = false;
                        statements.push(current_statement.trim().to_string());
                        current_statement.clear();
                        continue;
            // Check for dollar quote start/end
            if let Some(tag_start) = line.find("$$") {
                if !in_dollar_quote {
                    // Starting a dollar quote
                    in_dollar_quote = true;
                    // Extract the tag (if any) between the $$
                    if let Some(tag_end) = line[tag_start + 2..].find("$$") {
                        // This line both starts and ends the quote - shouldn't happen with functions
                        in_dollar_quote = false;
                    }
                } else {
                    // Might be ending the dollar quote
                    if line.contains("$$") {
                        in_dollar_quote = false;
                    }
                }
            }

            // For non-function statements, split on semicolon
            if !in_function && trimmed.ends_with(';') {
                statements.push(current_statement.trim().to_string());
                current_statement.clear();
            current.push_str(line);
            current.push('\n');

            // If not in dollar quote and line ends with semicolon, this is a complete statement
            if !in_dollar_quote && trimmed.ends_with(';') {
                let statement = current.trim();
                if !statement.is_empty() {
                    statements.push(statement.to_string());
                }
                current.clear();
            }
        }

        // Add any remaining statement
        if !current_statement.trim().is_empty() {
            statements.push(current_statement.trim().to_string());
        // Add any remaining content as final statement
        let final_statement = current.trim();
        if !final_statement.is_empty() {
            statements.push(final_statement.to_string());
        }

        statements
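To make the intended splitting behaviour concrete, here is a self-contained, simplified sketch of the dollar-quote idea together with a test. This is not the committed MigrationRunner::split_simple (it ignores tagged dollar quotes such as $tag$ ... $tag$); it only shows that plain statements split on semicolons while a $$-quoted function body stays whole.

// Hypothetical, standalone sketch - not committed code.
fn split_sql(sql: &str) -> Vec<String> {
    let mut statements = Vec::new();
    let mut current = String::new();
    let mut in_dollar_quote = false;

    for line in sql.lines() {
        let trimmed = line.trim();
        // Skip blank lines and comments outside of dollar quotes.
        if !in_dollar_quote && (trimmed.is_empty() || trimmed.starts_with("--")) {
            continue;
        }
        // An odd number of "$$" on a line toggles the dollar-quote state.
        if line.matches("$$").count() % 2 == 1 {
            in_dollar_quote = !in_dollar_quote;
        }
        current.push_str(line);
        current.push('\n');
        // Outside a dollar quote, a trailing semicolon completes the statement.
        if !in_dollar_quote && trimmed.ends_with(';') {
            statements.push(current.trim().to_string());
            current.clear();
        }
    }
    if !current.trim().is_empty() {
        statements.push(current.trim().to_string());
    }
    statements
}

#[test]
fn keeps_function_bodies_whole() {
    let sql = "CREATE TABLE t (id INT);\n\
               CREATE FUNCTION f() RETURNS INT AS $$\n\
               BEGIN\n  RETURN 1;\nEND;\n\
               $$ LANGUAGE plpgsql;";
    let stmts = split_sql(sql);
    // Two statements: the CREATE TABLE, and the entire function definition,
    // because semicolons inside the $$ ... $$ body are ignored.
    assert_eq!(stmts.len(), 2);
    assert!(stmts[1].contains("LANGUAGE plpgsql"));
}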