feat(dev): trigger re-ocr on doc and docx
This commit is contained in:
parent
43b679f59b
commit
036941b3dc
|
|
@ -19,3 +19,82 @@ COMMENT ON COLUMN settings.office_extraction_enable_detailed_logging IS
|
||||||
|
|
||||||
-- The default values are already set in the column definitions above
|
-- The default values are already set in the column definitions above
|
||||||
-- No need to insert default settings as they should be created when users are created
|
-- No need to insert default settings as they should be created when users are created
|
||||||
|
|
||||||
|
-- ============================================================================
-- TRIGGER RE-OCR FOR .doc AND .docx FILES
-- ============================================================================
-- This section marks all existing .doc and .docx files for re-processing
-- to take advantage of improved Office document extraction capabilities.
|
||||||
|
|
||||||
|
-- Pre-flight report: tally the existing Word documents that the statements
-- below will touch, and emit the numbers as a NOTICE. Read-only.
DO $$
DECLARE
    legacy_doc_total INTEGER;
    docx_total INTEGER;
BEGIN
    -- Single scan of documents; each COUNT only sees rows whose CASE yields
    -- a non-NULL value. A name matching '%.doc' necessarily ends in ".doc",
    -- so it can never also match '%.docx' and no extra exclusion is needed.
    SELECT
        COUNT(CASE WHEN LOWER(original_filename) LIKE '%.doc' THEN 1 END),
        COUNT(CASE WHEN LOWER(original_filename) LIKE '%.docx' THEN 1 END)
    INTO legacy_doc_total, docx_total
    FROM documents;

    RAISE NOTICE 'Found % .doc files and % .docx files (% total) that will be queued for re-OCR',
        legacy_doc_total, docx_total, legacy_doc_total + docx_total;
END $$;
|
||||||
|
|
||||||
|
-- Reset the OCR state of every .doc/.docx document so it is treated as not yet
-- processed: status goes back to 'pending' and all previous OCR results
-- (text, confidence, word count, timing, error, completion time) are cleared.
-- Rows that are already 'pending' are left untouched.
UPDATE documents
SET
    ocr_status = 'pending',
    ocr_text = NULL,
    ocr_confidence = NULL,
    ocr_word_count = NULL,
    ocr_processing_time_ms = NULL,
    ocr_error = NULL,
    ocr_completed_at = NULL,
    updated_at = NOW()
WHERE
    (LOWER(original_filename) LIKE '%.doc' OR LOWER(original_filename) LIKE '%.docx')
    -- IS DISTINCT FROM is NULL-safe: a row whose ocr_status is NULL is reset
    -- as well. A plain `!= 'pending'` evaluates to NULL for such a row and
    -- the WHERE clause would silently skip it.
    AND ocr_status IS DISTINCT FROM 'pending';
|
||||||
|
|
||||||
|
-- Queue every .doc/.docx document that does not already have an active
-- (pending or processing) entry in ocr_queue, then report exactly how many
-- rows this statement inserted.
--
-- The INSERT and the logging share one PL/pgSQL block so the count can be read
-- from GET DIAGNOSTICS ROW_COUNT. Inferring it afterwards from a created_at
-- time window would also count entries queued by other writers and could miss
-- rows if the script takes longer than the window.
DO $$
DECLARE
    queued_count BIGINT;
BEGIN
    INSERT INTO ocr_queue (document_id, status, priority, attempts, max_attempts, created_at, file_size)
    SELECT
        d.id,
        'pending',
        3,        -- Medium-high priority for office documents
        0,        -- Reset attempts
        3,        -- Standard max attempts
        NOW(),
        d.file_size
    FROM documents d
    WHERE
        (LOWER(d.original_filename) LIKE '%.doc' OR LOWER(d.original_filename) LIKE '%.docx')
        -- Skip documents that already have work queued or in flight.
        AND NOT EXISTS (
            SELECT 1
            FROM ocr_queue oq
            WHERE oq.document_id = d.id
              AND oq.status IN ('pending', 'processing')
        );

    -- Number of rows inserted by the statement directly above.
    GET DIAGNOSTICS queued_count = ROW_COUNT;

    RAISE NOTICE 'Successfully queued % .doc/.docx files for re-OCR processing', queued_count;
    RAISE NOTICE 'Office document re-OCR migration completed successfully';
END $$;
|
||||||
Loading…
Reference in New Issue