From dc5cb994626c2680f7416a1c9aa4b037510896a5 Mon Sep 17 00:00:00 2001
From: perf3ct <jonfuller2012@gmail.com>
Date: Sat, 28 Jun 2025 14:53:45 +0000
Subject: [PATCH] feat(migration): re-queue documents with placeholder OCR
 confidence instead of guessing a value

---
 ...20250628000001_backfill_ocr_confidence.sql | 83 +++++++------------
 1 file changed, 31 insertions(+), 52 deletions(-)

diff --git a/migrations/20250628000001_backfill_ocr_confidence.sql b/migrations/20250628000001_backfill_ocr_confidence.sql
index 6a095a6..829d5cd 100644
--- a/migrations/20250628000001_backfill_ocr_confidence.sql
+++ b/migrations/20250628000001_backfill_ocr_confidence.sql
@@ -1,59 +1,38 @@
--- Backfill OCR confidence scores for existing documents
--- Since OCR confidence was previously hardcoded to 85%, we need to recalculate
--- actual confidence for documents that currently have this placeholder value
+-- Re-queue documents with placeholder OCR confidence for reprocessing
+-- Since OCR confidence was previously hardcoded to 85%, we need to reprocess
+-- these documents to get accurate confidence scores
 
--- First, let's identify documents that likely have placeholder confidence
--- (85% exactly, which was the hardcoded value)
-CREATE TEMP TABLE documents_to_update AS
-SELECT id, ocr_text, ocr_status 
-FROM documents 
-WHERE ocr_confidence = 85.0 
-  AND ocr_status = 'completed' 
-  AND ocr_text IS NOT NULL 
-  AND length(trim(ocr_text)) > 0;
-
--- For now, we'll estimate confidence based on text quality metrics
--- This is a rough approximation until we can re-run OCR with actual confidence
-UPDATE documents 
-SET ocr_confidence = CASE
-    -- High quality text: good length, reasonable character distribution
-    WHEN length(trim(ocr_text)) > 1000 
-         AND (length(ocr_text) - length(replace(replace(ocr_text, ' ', ''), chr(10), ''))) * 100.0 / length(ocr_text) > 10.0  -- > 10% whitespace
-         AND length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 70.0  -- > 70% non-whitespace chars
-    THEN 90.0 + (random() * 8.0)  -- 90-98%
-    
-    -- Medium quality text: decent length, some structure
-    WHEN length(trim(ocr_text)) > 100 
-         AND (length(ocr_text) - length(replace(replace(ocr_text, ' ', ''), chr(10), ''))) * 100.0 / length(ocr_text) > 5.0   -- > 5% whitespace
-         AND length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 50.0  -- > 50% non-whitespace chars
-    THEN 70.0 + (random() * 15.0)  -- 70-85%
-    
-    -- Low quality text: short or poor structure
-    WHEN length(trim(ocr_text)) > 10
-         AND length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 30.0  -- > 30% non-whitespace chars
-    THEN 40.0 + (random() * 25.0)  -- 40-65%
-    
-    -- Very poor quality: very short or mostly garbage
-    ELSE 20.0 + (random() * 15.0)  -- 20-35%
-END
-WHERE id IN (SELECT id FROM documents_to_update);
+-- Mark documents with exactly 85% confidence as pending OCR reprocessing and
+-- report how many rows were re-queued.
+DO $$
+DECLARE
+    updated_count INTEGER;
+BEGIN
+    UPDATE documents
+    SET ocr_status = 'pending',
+        ocr_confidence = NULL,
+        ocr_error = NULL,
+        updated_at = CURRENT_TIMESTAMP
+    WHERE ocr_confidence = 85.0
+      AND ocr_status = 'completed'
+      AND ocr_text IS NOT NULL;
+
+    -- GET DIAGNOSTICS reads the ROW_COUNT of the most recent SQL command run
+    -- by this PL/pgSQL block; an UPDATE executed as a separate top-level
+    -- statement before the block would be reported as 0 rows.
+    GET DIAGNOSTICS updated_count = ROW_COUNT;
+    RAISE NOTICE 'Marked % documents with placeholder 85%% confidence for OCR reprocessing', updated_count;
+END $$;
 
 -- Add a comment explaining what we did
-COMMENT ON COLUMN documents.ocr_confidence IS 'OCR confidence percentage (0-100). Values may be estimated for documents processed before real confidence calculation was implemented.';
+COMMENT ON COLUMN documents.ocr_confidence IS 'OCR confidence percentage (0-100) from Tesseract. Documents with NULL confidence and pending status will be reprocessed.';
 
--- Log the update
-DO $$
-DECLARE
-    updated_count INTEGER;
-BEGIN
-    SELECT COUNT(*) INTO updated_count FROM documents_to_update;
-    RAISE NOTICE 'Backfilled OCR confidence for % documents that had placeholder 85%% confidence', updated_count;
-END $$;
-
--- Clean up
-DROP TABLE documents_to_update;
-
 -- Create an index to help with confidence-based queries
 CREATE INDEX IF NOT EXISTS idx_documents_ocr_confidence_range 
 ON documents(ocr_confidence) 
-WHERE ocr_confidence IS NOT NULL;
\ No newline at end of file
+WHERE ocr_confidence IS NOT NULL;
+
+-- Create an index to help the OCR queue find pending documents efficiently
+CREATE INDEX IF NOT EXISTS idx_documents_ocr_pending 
+ON documents(created_at) 
+WHERE ocr_status = 'pending';
\ No newline at end of file