feat(migration): instead of hardcoded guessing, re-enter those documents into the queue
This commit is contained in:
parent
e995653d69
commit
69425b2201
|
|
@ -1,59 +1,35 @@
|
||||||
-- Backfill OCR confidence scores for existing documents
|
-- Re-queue documents with placeholder OCR confidence for reprocessing
|
||||||
-- Since OCR confidence was previously hardcoded to 85%, we need to recalculate
|
-- Since OCR confidence was previously hardcoded to 85%, we need to reprocess
|
||||||
-- actual confidence for documents that currently have this placeholder value
|
-- these documents to get accurate confidence scores
|
||||||
|
|
||||||
-- First, let's identify documents that likely have placeholder confidence
|
-- Mark documents with exactly 85% confidence as pending OCR reprocessing
|
||||||
-- (85% exactly, which was the hardcoded value)
|
UPDATE documents
|
||||||
CREATE TEMP TABLE documents_to_update AS
|
SET ocr_status = 'pending',
|
||||||
SELECT id, ocr_text, ocr_status
|
ocr_confidence = NULL,
|
||||||
FROM documents
|
ocr_error = NULL,
|
||||||
|
updated_at = CURRENT_TIMESTAMP
|
||||||
WHERE ocr_confidence = 85.0
|
WHERE ocr_confidence = 85.0
|
||||||
AND ocr_status = 'completed'
|
AND ocr_status = 'completed'
|
||||||
AND ocr_text IS NOT NULL
|
AND ocr_text IS NOT NULL;
|
||||||
AND length(trim(ocr_text)) > 0;
|
|
||||||
|
|
||||||
-- For now, we'll estimate confidence based on text quality metrics
|
|
||||||
-- This is a rough approximation until we can re-run OCR with actual confidence
|
|
||||||
UPDATE documents
|
|
||||||
SET ocr_confidence = CASE
|
|
||||||
-- High quality text: good length, reasonable character distribution
|
|
||||||
WHEN length(trim(ocr_text)) > 1000
|
|
||||||
AND (length(ocr_text) - length(replace(replace(ocr_text, ' ', ''), chr(10), ''))) * 100.0 / length(ocr_text) > 10.0 -- > 10% whitespace
|
|
||||||
AND length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 70.0 -- > 70% non-whitespace chars
|
|
||||||
THEN 90.0 + (random() * 8.0) -- 90-98%
|
|
||||||
|
|
||||||
-- Medium quality text: decent length, some structure
|
|
||||||
WHEN length(trim(ocr_text)) > 100
|
|
||||||
AND (length(ocr_text) - length(replace(replace(ocr_text, ' ', ''), chr(10), ''))) * 100.0 / length(ocr_text) > 5.0 -- > 5% whitespace
|
|
||||||
AND length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 50.0 -- > 50% non-whitespace chars
|
|
||||||
THEN 70.0 + (random() * 15.0) -- 70-85%
|
|
||||||
|
|
||||||
-- Low quality text: short or poor structure
|
|
||||||
WHEN length(trim(ocr_text)) > 10
|
|
||||||
AND length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 30.0 -- > 30% non-whitespace chars
|
|
||||||
THEN 40.0 + (random() * 25.0) -- 40-65%
|
|
||||||
|
|
||||||
-- Very poor quality: very short or mostly garbage
|
|
||||||
ELSE 20.0 + (random() * 15.0) -- 20-35%
|
|
||||||
END
|
|
||||||
WHERE id IN (SELECT id FROM documents_to_update);
|
|
||||||
|
|
||||||
-- Add a comment explaining what we did
|
-- Add a comment explaining what we did
|
||||||
COMMENT ON COLUMN documents.ocr_confidence IS 'OCR confidence percentage (0-100). Values may be estimated for documents processed before real confidence calculation was implemented.';
|
COMMENT ON COLUMN documents.ocr_confidence IS 'OCR confidence percentage (0-100) from Tesseract. Documents with NULL confidence and pending status will be reprocessed.';
|
||||||
|
|
||||||
-- Log the update
|
-- Log the update
|
||||||
DO $$
|
DO $$
|
||||||
DECLARE
|
DECLARE
|
||||||
updated_count INTEGER;
|
updated_count INTEGER;
|
||||||
BEGIN
|
BEGIN
|
||||||
SELECT COUNT(*) INTO updated_count FROM documents_to_update;
|
GET DIAGNOSTICS updated_count = ROW_COUNT;
|
||||||
RAISE NOTICE 'Backfilled OCR confidence for % documents that had placeholder 85%% confidence', updated_count;
|
RAISE NOTICE 'Marked % documents with placeholder 85%% confidence for OCR reprocessing', updated_count;
|
||||||
END $$;
|
END $$;
|
||||||
|
|
||||||
-- Clean up
|
|
||||||
DROP TABLE documents_to_update;
|
|
||||||
|
|
||||||
-- Create an index to help with confidence-based queries
|
-- Create an index to help with confidence-based queries
|
||||||
CREATE INDEX IF NOT EXISTS idx_documents_ocr_confidence_range
|
CREATE INDEX IF NOT EXISTS idx_documents_ocr_confidence_range
|
||||||
ON documents(ocr_confidence)
|
ON documents(ocr_confidence)
|
||||||
WHERE ocr_confidence IS NOT NULL;
|
WHERE ocr_confidence IS NOT NULL;
|
||||||
|
|
||||||
|
-- Create an index to help the OCR queue find pending documents efficiently
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_documents_ocr_pending
|
||||||
|
ON documents(created_at)
|
||||||
|
WHERE ocr_status = 'pending';
|
||||||
Loading…
Reference in New Issue