feat(dev): trigger re-ocr on doc and docx

This commit is contained in:
perf3ct 2025-09-02 23:04:31 +00:00
parent 43b679f59b
commit 036941b3dc
No known key found for this signature in database
GPG Key ID: 569C4EEC436F5232
1 changed files with 80 additions and 1 deletions

View File

@ -18,4 +18,83 @@ COMMENT ON COLUMN settings.office_extraction_enable_detailed_logging IS
'Enable detailed logging for office document extraction operations (default: false)';
-- The default values are already set in the column definitions above
-- No need to insert default settings as they should be created when users are created
-- ============================================================================
-- TRIGGER RE-OCR FOR .doc AND .docx FILES
-- ============================================================================
-- This section will mark all existing .doc and .docx files for re-processing
-- to take advantage of improved Office document extraction capabilities
-- Report, before touching anything, how many documents carry a .doc or .docx
-- extension (compared case-insensitively on original_filename). This block is
-- read-only: it emits a single NOTICE with the per-extension and combined counts.
DO $$
DECLARE
    n_doc  INTEGER;
    n_docx INTEGER;
BEGIN
    -- One pass over documents. A name ending in '.doc' can never also end in
    -- '.docx', so the two filters are mutually exclusive without an extra
    -- NOT LIKE guard.
    SELECT
        COUNT(*) FILTER (WHERE names.fname LIKE '%.doc'),
        COUNT(*) FILTER (WHERE names.fname LIKE '%.docx')
    INTO n_doc, n_docx
    FROM (
        SELECT LOWER(original_filename) AS fname
        FROM documents
    ) AS names;

    RAISE NOTICE 'Found % .doc files and % .docx files (% total) that will be queued for re-OCR',
        n_doc, n_docx, n_doc + n_docx;
END $$;
-- Reset OCR state on every .doc/.docx document so it gets re-extracted.
-- The status goes back to 'pending', all stored OCR results and errors are
-- cleared, and updated_at is bumped. Rows that are already 'pending' are left
-- untouched.
--
-- IS DISTINCT FROM is used instead of != so that rows whose ocr_status is NULL
-- are reset as well: NULL != 'pending' evaluates to NULL, which WHERE treats
-- as false, and such rows would otherwise be skipped silently.
UPDATE documents
SET
    ocr_status = 'pending',
    ocr_text = NULL,
    ocr_confidence = NULL,
    ocr_word_count = NULL,
    ocr_processing_time_ms = NULL,
    ocr_error = NULL,
    ocr_completed_at = NULL,
    updated_at = NOW()
WHERE
    (LOWER(original_filename) LIKE '%.doc' OR LOWER(original_filename) LIKE '%.docx')
    AND ocr_status IS DISTINCT FROM 'pending';
-- Queue every .doc/.docx document for OCR unless it already has a queue entry
-- in 'pending' or 'processing' state, then log how many entries were added.
--
-- The INSERT runs inside the DO block so the NOTICE can report the exact
-- ROW_COUNT of this statement. Estimating the number from created_at within a
-- recent time window would also pick up unrelated pending entries and could
-- miss rows if the migration runs for longer than the window.
DO $$
DECLARE
    queued_count BIGINT;
BEGIN
    INSERT INTO ocr_queue (document_id, status, priority, attempts, max_attempts, created_at, file_size)
    SELECT
        d.id,
        'pending',
        3,  -- Medium-high priority for office documents
        0,  -- Reset attempts
        3,  -- Standard max attempts
        NOW(),
        d.file_size
    FROM documents AS d
    WHERE
        (LOWER(d.original_filename) LIKE '%.doc' OR LOWER(d.original_filename) LIKE '%.docx')
        -- Skip documents that already have active work queued for them
        AND NOT EXISTS (
            SELECT 1
            FROM ocr_queue AS oq
            WHERE oq.document_id = d.id
                AND oq.status IN ('pending', 'processing')
        );

    -- Number of rows inserted by the statement directly above
    GET DIAGNOSTICS queued_count = ROW_COUNT;

    RAISE NOTICE 'Successfully queued % .doc/.docx files for re-OCR processing', queued_count;
    RAISE NOTICE 'Office document re-OCR migration completed successfully';
END $$;