Readur/migrations/002_add_enhanced_ocr_fields...

70 lines
3.7 KiB
SQL

-- Per-document OCR result metadata.
-- All three columns are nullable and have no default, so rows that already
-- exist keep NULL in them until something writes a value.
ALTER TABLE documents
    ADD COLUMN IF NOT EXISTS ocr_confidence REAL,
    ADD COLUMN IF NOT EXISTS ocr_word_count INTEGER,
    ADD COLUMN IF NOT EXISTS ocr_processing_time_ms INTEGER;
-- OCR tuning knobs stored on the settings table.
-- Numeric and boolean options carry a DEFAULT; the two character-list
-- columns have none and therefore start out NULL.
ALTER TABLE settings
    ADD COLUMN IF NOT EXISTS ocr_page_segmentation_mode INTEGER DEFAULT 3,
    ADD COLUMN IF NOT EXISTS ocr_engine_mode INTEGER DEFAULT 3,
    ADD COLUMN IF NOT EXISTS ocr_min_confidence REAL DEFAULT 30.0,
    ADD COLUMN IF NOT EXISTS ocr_dpi INTEGER DEFAULT 300,
    ADD COLUMN IF NOT EXISTS ocr_enhance_contrast BOOLEAN DEFAULT TRUE,
    ADD COLUMN IF NOT EXISTS ocr_remove_noise BOOLEAN DEFAULT TRUE,
    ADD COLUMN IF NOT EXISTS ocr_detect_orientation BOOLEAN DEFAULT TRUE,
    ADD COLUMN IF NOT EXISTS ocr_whitelist_chars TEXT,
    ADD COLUMN IF NOT EXISTS ocr_blacklist_chars TEXT;
-- Catalog descriptions for the new columns (visible via psql \d+ and
-- other schema-introspection tools).

-- documents: per-document OCR results
COMMENT ON COLUMN documents.ocr_confidence
    IS 'OCR confidence score (0-100)';
COMMENT ON COLUMN documents.ocr_word_count
    IS 'Number of words extracted by OCR';
COMMENT ON COLUMN documents.ocr_processing_time_ms
    IS 'Time taken for OCR processing in milliseconds';

-- settings: OCR configuration
COMMENT ON COLUMN settings.ocr_page_segmentation_mode
    IS 'Tesseract Page Segmentation Mode (0-13), default 3=PSM_AUTO';
COMMENT ON COLUMN settings.ocr_engine_mode
    IS 'Tesseract OCR Engine Mode (0-3), default 3=OEM_DEFAULT';
COMMENT ON COLUMN settings.ocr_min_confidence
    IS 'Minimum OCR confidence threshold (0-100)';
COMMENT ON COLUMN settings.ocr_dpi
    IS 'Target DPI for OCR processing, 0=auto';
COMMENT ON COLUMN settings.ocr_enhance_contrast
    IS 'Enable adaptive contrast enhancement';
COMMENT ON COLUMN settings.ocr_remove_noise
    IS 'Enable image noise removal';
COMMENT ON COLUMN settings.ocr_detect_orientation
    IS 'Enable automatic orientation detection';
COMMENT ON COLUMN settings.ocr_whitelist_chars
    IS 'Characters to allow in OCR (null=all)';
COMMENT ON COLUMN settings.ocr_blacklist_chars
    IS 'Characters to exclude from OCR (null=none)';
-- Partial indexes on the OCR metrics. Each one covers only rows where the
-- indexed column is populated, so rows with a NULL value are left out.

-- Confidence: supports quality filtering.
CREATE INDEX IF NOT EXISTS idx_documents_ocr_confidence
    ON documents (ocr_confidence)
    WHERE ocr_confidence IS NOT NULL;

-- Word count: supports analytics.
CREATE INDEX IF NOT EXISTS idx_documents_ocr_word_count
    ON documents (ocr_word_count)
    WHERE ocr_word_count IS NOT NULL;
-- Backfill the new defaults on settings rows that still hold NULL in any of
-- the defaulted OCR columns.
--
-- Each column is filled independently with COALESCE so that:
--   * a value already stored in a row is never overwritten, and
--   * a row is repaired even when only some of its columns are NULL
--     (not just when ocr_page_segmentation_mode happens to be NULL).
-- ocr_whitelist_chars / ocr_blacklist_chars are intentionally left alone:
-- NULL is a meaningful value for them (null=all / null=none).
-- The statement is idempotent: a second run matches no rows.
UPDATE settings
SET
    ocr_page_segmentation_mode = COALESCE(ocr_page_segmentation_mode, 3),
    ocr_engine_mode            = COALESCE(ocr_engine_mode, 3),
    ocr_min_confidence         = COALESCE(ocr_min_confidence, 30.0),
    ocr_dpi                    = COALESCE(ocr_dpi, 300),
    ocr_enhance_contrast       = COALESCE(ocr_enhance_contrast, true),
    ocr_remove_noise           = COALESCE(ocr_remove_noise, true),
    ocr_detect_orientation     = COALESCE(ocr_detect_orientation, true)
WHERE ocr_page_segmentation_mode IS NULL
   OR ocr_engine_mode IS NULL
   OR ocr_min_confidence IS NULL
   OR ocr_dpi IS NULL
   OR ocr_enhance_contrast IS NULL
   OR ocr_remove_noise IS NULL
   OR ocr_detect_orientation IS NULL;
-- ocr_analytics: one row per calendar day of documents.created_at, limited
-- to documents created on or after (CURRENT_DATE - 30 days); the cutoff is
-- evaluated each time the view is queried.
--
-- Notes on the aggregates:
--   * COUNT(col) counts only rows where col is non-NULL; AVG/MIN/MAX/SUM
--     likewise skip NULL inputs.
--   * low_confidence_count  = rows with ocr_confidence < 50
--     high_confidence_count = rows with ocr_confidence >= 80
--     Rows with NULL confidence, or with confidence in [50, 80), fall into
--     neither bucket.
--   * failed_ocr_count counts rows whose ocr_status is 'failed'.
CREATE OR REPLACE VIEW ocr_analytics AS
SELECT
    DATE(created_at)                                   AS date,
    COUNT(*)                                           AS total_documents,
    COUNT(ocr_text)                                    AS documents_with_ocr,
    COUNT(ocr_confidence)                              AS documents_with_confidence,
    AVG(ocr_confidence)                                AS avg_confidence,
    MIN(ocr_confidence)                                AS min_confidence,
    MAX(ocr_confidence)                                AS max_confidence,
    AVG(ocr_word_count)                                AS avg_word_count,
    SUM(ocr_word_count)                                AS total_words_extracted,
    AVG(ocr_processing_time_ms)                        AS avg_processing_time_ms,
    COUNT(*) FILTER (WHERE ocr_confidence < 50)        AS low_confidence_count,
    COUNT(*) FILTER (WHERE ocr_confidence >= 80)       AS high_confidence_count,
    COUNT(*) FILTER (WHERE ocr_status = 'failed')      AS failed_ocr_count
FROM documents
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY DATE(created_at)
ORDER BY date DESC;

COMMENT ON VIEW ocr_analytics
    IS 'Daily OCR analytics for monitoring quality and performance';