-- Readur/migrations/20250628000002_populate_ocr...

-- 36 lines
-- 1.5 KiB
-- SQL

-- Populate OCR queue with documents that have pending OCR status
-- This migration addresses the issue where documents marked as pending
-- by the confidence backfill migration are not in the processing queue
-- Insert pending documents into the OCR queue and log how many were added.
--
-- The INSERT deliberately runs inside the DO block: GET DIAGNOSTICS ROW_COUNT
-- reports on the most recent SQL command executed within the same PL/pgSQL
-- block, so reading it from a separate DO block after a top-level INSERT
-- would not reflect that INSERT's row count.
DO $$
DECLARE
    queued_count INTEGER;  -- number of rows inserted into ocr_queue
BEGIN
    INSERT INTO ocr_queue (document_id, priority, file_size, created_at)
    SELECT
        d.id,
        -- Calculate priority based on file size (smaller files get higher priority).
        -- A NULL file_size matches no WHEN branch and falls through to ELSE.
        CASE
            WHEN d.file_size <= 1048576  THEN 10  -- <= 1MB: highest priority
            WHEN d.file_size <= 5242880  THEN 8   -- 1-5MB: high priority
            WHEN d.file_size <= 10485760 THEN 6   -- 5-10MB: medium priority
            WHEN d.file_size <= 52428800 THEN 4   -- 10-50MB: low priority
            ELSE 2                                -- > 50MB: lowest priority
        END AS priority,
        d.file_size,
        NOW() AS created_at
    FROM documents AS d
    WHERE d.ocr_status = 'pending'
      AND d.file_path IS NOT NULL  -- Only queue documents with files
      -- Only OCR-able types
      AND (
          d.mime_type LIKE 'image/%'
          OR d.mime_type = 'application/pdf'
          OR d.mime_type = 'text/plain'
      )
      -- Avoid duplicates. NOT EXISTS is used instead of NOT IN because
      -- NOT IN yields no rows at all if the subquery returns any NULL.
      AND NOT EXISTS (
          SELECT 1
          FROM ocr_queue AS q
          WHERE q.document_id = d.id
      );

    -- Log the result
    GET DIAGNOSTICS queued_count = ROW_COUNT;
    RAISE NOTICE 'Added % pending documents to OCR queue for processing', queued_count;
END $$;
-- Supporting index for queue monitoring, keyed on document, status, and
-- creation time. IF NOT EXISTS makes this a no-op when the index is already present.
CREATE INDEX IF NOT EXISTS idx_ocr_queue_document_status
    ON ocr_queue (
        document_id,
        status,
        created_at
    );