feat(dev): trigger re-ocr on doc and docx

2025-09-02 23:04:31 +00:00 · 2025-09-02 23:04:31 +00:00 · 036941b3dc
parent 43b679f59b
commit 036941b3dc
1 changed files with 80 additions and 1 deletions
--- a/migrations/20250901000001_add_office_extraction_settings.sql
+++ b/migrations/20250901000001_add_office_extraction_settings.sql
@ -18,4 +18,83 @@ COMMENT ON COLUMN settings.office_extraction_enable_detailed_logging IS
 'Enable detailed logging for office document extraction operations (default: false)';

 -- The default values are already set in the column definitions above
-- No need to insert default settings as they should be created when users are created
+-- No need to insert default settings as they should be created when users are created
+
+-- ============================================================================
+-- TRIGGER RE-OCR FOR .doc AND .docx FILES
+-- ============================================================================
+-- This section will mark all existing .doc and .docx files for re-processing
+-- to take advantage of improved Office document extraction capabilities
+
+-- Count the .doc and .docx documents up front and report the totals
+DO $$
+DECLARE
+    n_doc  INTEGER;  -- filenames ending in .doc
+    n_docx INTEGER;  -- filenames ending in .docx
+BEGIN
+    -- Both tallies come from a single pass over documents. The suffixes are
+    -- mutually exclusive: a name ending in '.doc' cannot also end in '.docx',
+    -- so no extra exclusion is needed on the .doc filter.
+    SELECT
+        COUNT(*) FILTER (WHERE LOWER(original_filename) LIKE '%.doc'),
+        COUNT(*) FILTER (WHERE LOWER(original_filename) LIKE '%.docx')
+    INTO
+        n_doc,
+        n_docx
+    FROM documents;
+
+    RAISE NOTICE 'Found % .doc files and % .docx files (% total) that will be queued for re-OCR',
+        n_doc, n_docx, n_doc + n_docx;
+END $$;
+
+-- Reset OCR state (status, text, metrics, error) for .doc/.docx documents that are not already pending
+UPDATE documents
+SET
+    ocr_status = 'pending',
+    ocr_text = NULL,
+    ocr_confidence = NULL,
+    ocr_word_count = NULL,
+    ocr_processing_time_ms = NULL,
+    ocr_error = NULL,
+    ocr_completed_at = NULL,
+    updated_at = NOW()
+WHERE
+    (LOWER(original_filename) LIKE '%.doc' OR LOWER(original_filename) LIKE '%.docx')
+    AND ocr_status IS DISTINCT FROM 'pending';  -- NULL-safe: plain != is UNKNOWN for a NULL status and would skip the row
+
+-- Queue each .doc/.docx document unless it already has a pending or processing queue entry
+INSERT INTO ocr_queue (document_id, status, priority, attempts, max_attempts, created_at, file_size)
+SELECT
+    doc.id        AS document_id,
+    'pending'     AS status,
+    3             AS priority,      -- medium-high priority for office documents
+    0             AS attempts,      -- start the attempt counter from zero
+    3             AS max_attempts,  -- standard max attempts
+    NOW()         AS created_at,
+    doc.file_size AS file_size
+FROM documents AS doc
+WHERE
+    LOWER(doc.original_filename) LIKE ANY (ARRAY['%.doc', '%.docx'])
+    AND NOT EXISTS (
+        SELECT 1
+        FROM ocr_queue AS queued
+        WHERE queued.document_id = doc.id
+          AND queued.status IN ('pending', 'processing')
+    );
+
+-- Log how many pending .doc/.docx queue entries were created within the last minute
+DO $$
+DECLARE
+    n_queued INTEGER;
+BEGIN
+    n_queued := (
+        SELECT COUNT(*)
+        FROM ocr_queue oq
+        INNER JOIN documents d ON d.id = oq.document_id
+        WHERE oq.status = 'pending'
+          AND oq.created_at >= NOW() - INTERVAL '1 minute'  -- recency window, not tied to a specific INSERT
+          AND LOWER(d.original_filename) LIKE ANY (ARRAY['%.doc', '%.docx'])
+    );
+    RAISE NOTICE 'Successfully queued % .doc/.docx files for re-OCR processing', n_queued;
+    RAISE NOTICE 'Office document re-OCR migration completed successfully';
+END $$;