-- Migration: enhanced OCR metadata, configuration, indexes, and analytics view.
-- Add per-document OCR result metadata to the documents table.
-- All three columns are added by one ALTER TABLE statement; IF NOT EXISTS
-- skips any column that is already present. No defaults are declared, so
-- the columns are NULL until a value is written.
ALTER TABLE documents
    ADD COLUMN IF NOT EXISTS ocr_confidence         REAL,
    ADD COLUMN IF NOT EXISTS ocr_word_count         INT,
    ADD COLUMN IF NOT EXISTS ocr_processing_time_ms INT;

-- Add OCR configuration options to the settings table.
-- The numeric and boolean options declare a DEFAULT; the two character-list
-- columns declare none and are therefore NULL unless explicitly set.
-- IF NOT EXISTS skips any column that is already present.
ALTER TABLE settings
    ADD COLUMN IF NOT EXISTS ocr_page_segmentation_mode INT     DEFAULT 3,
    ADD COLUMN IF NOT EXISTS ocr_engine_mode            INT     DEFAULT 3,
    ADD COLUMN IF NOT EXISTS ocr_min_confidence         REAL    DEFAULT 30.0,
    ADD COLUMN IF NOT EXISTS ocr_dpi                    INT     DEFAULT 300,
    ADD COLUMN IF NOT EXISTS ocr_enhance_contrast       BOOLEAN DEFAULT true,
    ADD COLUMN IF NOT EXISTS ocr_remove_noise           BOOLEAN DEFAULT true,
    ADD COLUMN IF NOT EXISTS ocr_detect_orientation     BOOLEAN DEFAULT true,
    ADD COLUMN IF NOT EXISTS ocr_whitelist_chars        TEXT,
    ADD COLUMN IF NOT EXISTS ocr_blacklist_chars        TEXT;

-- Attach catalog descriptions to the OCR result columns on documents.
COMMENT ON COLUMN documents.ocr_confidence
    IS 'OCR confidence score (0-100)';
COMMENT ON COLUMN documents.ocr_word_count
    IS 'Number of words extracted by OCR';
COMMENT ON COLUMN documents.ocr_processing_time_ms
    IS 'Time taken for OCR processing in milliseconds';

-- Attach catalog descriptions to the OCR configuration columns on settings.
COMMENT ON COLUMN settings.ocr_page_segmentation_mode
    IS 'Tesseract Page Segmentation Mode (0-13), default 3=PSM_AUTO';
COMMENT ON COLUMN settings.ocr_engine_mode
    IS 'Tesseract OCR Engine Mode (0-3), default 3=OEM_DEFAULT';
COMMENT ON COLUMN settings.ocr_min_confidence
    IS 'Minimum OCR confidence threshold (0-100)';
COMMENT ON COLUMN settings.ocr_dpi
    IS 'Target DPI for OCR processing, 0=auto';
COMMENT ON COLUMN settings.ocr_enhance_contrast
    IS 'Enable adaptive contrast enhancement';
COMMENT ON COLUMN settings.ocr_remove_noise
    IS 'Enable image noise removal';
COMMENT ON COLUMN settings.ocr_detect_orientation
    IS 'Enable automatic orientation detection';
COMMENT ON COLUMN settings.ocr_whitelist_chars
    IS 'Characters to allow in OCR (null=all)';
COMMENT ON COLUMN settings.ocr_blacklist_chars
    IS 'Characters to exclude from OCR (null=none)';

-- Partial index on OCR confidence for quality filtering.
-- The WHERE clause keeps rows with a NULL confidence out of the index.
CREATE INDEX IF NOT EXISTS idx_documents_ocr_confidence
    ON documents (ocr_confidence)
    WHERE ocr_confidence IS NOT NULL;

-- Partial index on OCR word count for analytics.
-- The WHERE clause keeps rows with a NULL word count out of the index.
CREATE INDEX IF NOT EXISTS idx_documents_ocr_word_count
    ON documents (ocr_word_count)
    WHERE ocr_word_count IS NOT NULL;

-- Backfill OCR defaults on existing settings rows whose page segmentation
-- mode is still NULL.
-- The selected rows always have a NULL page segmentation mode, so it is set
-- directly. Every other column goes through COALESCE so a value that is
-- already present is kept and only NULL columns receive the default.
UPDATE settings
SET
    ocr_page_segmentation_mode = 3,
    ocr_engine_mode            = COALESCE(ocr_engine_mode, 3),
    ocr_min_confidence         = COALESCE(ocr_min_confidence, 30.0),
    ocr_dpi                    = COALESCE(ocr_dpi, 300),
    ocr_enhance_contrast       = COALESCE(ocr_enhance_contrast, true),
    ocr_remove_noise           = COALESCE(ocr_remove_noise, true),
    ocr_detect_orientation     = COALESCE(ocr_detect_orientation, true)
WHERE ocr_page_segmentation_mode IS NULL;

-- Daily OCR analytics view over the documents table.
-- Covers documents created within the last 30 days and returns one row per
-- calendar day of created_at, newest day first.
CREATE OR REPLACE VIEW ocr_analytics AS
SELECT
    CAST(created_at AS DATE)                               AS date,
    COUNT(*)                                               AS total_documents,
    -- COUNT(column) counts only rows where the column is non-NULL.
    COUNT(ocr_text)                                        AS documents_with_ocr,
    COUNT(ocr_confidence)                                  AS documents_with_confidence,
    AVG(ocr_confidence)                                    AS avg_confidence,
    MIN(ocr_confidence)                                    AS min_confidence,
    MAX(ocr_confidence)                                    AS max_confidence,
    AVG(ocr_word_count)                                    AS avg_word_count,
    SUM(ocr_word_count)                                    AS total_words_extracted,
    AVG(ocr_processing_time_ms)                            AS avg_processing_time_ms,
    -- Confidence buckets: below 50 is counted as low, 80 and above as high.
    COUNT(*) FILTER (WHERE ocr_confidence < 50)            AS low_confidence_count,
    COUNT(*) FILTER (WHERE ocr_confidence >= 80)           AS high_confidence_count,
    COUNT(*) FILTER (WHERE ocr_status = 'failed')          AS failed_ocr_count
FROM documents
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY CAST(created_at AS DATE)
ORDER BY date DESC;

-- Attach a catalog description to the analytics view.
COMMENT ON VIEW ocr_analytics
    IS 'Daily OCR analytics for monitoring quality and performance';