feat(dev): trigger re-ocr on doc and docx

2025-09-02 23:04:31 +00:00 · 2025-09-02 23:04:31 +00:00 · 036941b3dc
parent 43b679f59b
commit 036941b3dc
1 changed files with 80 additions and 1 deletions
--- a/migrations/20250901000001_add_office_extraction_settings.sql
+++ b/migrations/20250901000001_add_office_extraction_settings.sql
@ -19,3 +19,82 @@ COMMENT ON COLUMN settings.office_extraction_enable_detailed_logging IS
 -- The default values are already set in the column definitions above
 -- No need to insert default settings as they should be created when users are created
 -- ============================================================================
 -- TRIGGER RE-OCR FOR .doc AND .docx FILES
 -- ============================================================================
 -- This section will mark all existing .doc and .docx files for re-processing
 -- to take advantage of improved Office document extraction capabilities
 -- Report how many Word documents exist before anything is modified.
 -- Informational only: this block reads `documents` and emits a single NOTICE.
 DO $$
 DECLARE
    n_doc INTEGER;
    n_docx INTEGER;
 BEGIN
    -- Single pass over documents. A name ending in '.doc' can never also end in
    -- '.docx', so the two filters are disjoint without an extra NOT LIKE guard.
    SELECT
        COUNT(*) FILTER (WHERE LOWER(original_filename) LIKE '%.doc'),
        COUNT(*) FILTER (WHERE LOWER(original_filename) LIKE '%.docx')
    INTO n_doc, n_docx
    FROM documents;
    RAISE NOTICE 'Found % .doc files and % .docx files (% total) that will be queued for re-OCR', 
        n_doc, n_docx, n_doc + n_docx;
 END $$;
 -- Update documents table: put every .doc/.docx document back into the 'pending'
 -- OCR state and clear all previously stored OCR output (text, confidence,
 -- word count, processing time, error, completion timestamp).
 -- Matching is case-insensitive on the trailing extension of original_filename.
 UPDATE documents 
 SET 
    ocr_status = 'pending',
    ocr_text = NULL,
    ocr_confidence = NULL,
    ocr_word_count = NULL,
    ocr_processing_time_ms = NULL,
    ocr_error = NULL,
    ocr_completed_at = NULL,
    updated_at = NOW()
 WHERE 
    (LOWER(original_filename) LIKE '%.doc' OR LOWER(original_filename) LIKE '%.docx')
    -- Only touch rows that are not already pending. IS DISTINCT FROM is NULL-safe:
    -- a plain `!=` evaluates to NULL when ocr_status is NULL, which would silently
    -- leave such rows un-reset. (If ocr_status is declared NOT NULL this is
    -- equivalent to `!=` -- NOTE(review): confirm column nullability.)
    AND ocr_status IS DISTINCT FROM 'pending';
 -- Add an OCR queue entry for every .doc/.docx document that does not already
 -- have an active ('pending' or 'processing') entry, then log how many entries
 -- were actually inserted.
 -- The INSERT runs inside the DO block so the reported number comes from
 -- GET DIAGNOSTICS ROW_COUNT (the exact rows inserted by this statement) rather
 -- than from a time-window query over ocr_queue, which could include entries
 -- created by something else or miss entries if the statements run far apart.
 DO $$
 DECLARE
    queued_count BIGINT;
 BEGIN
    INSERT INTO ocr_queue (document_id, status, priority, attempts, max_attempts, created_at, file_size)
    SELECT 
        d.id,
        'pending',
        3,  -- Medium-high priority for office documents
        0,  -- Reset attempts
        3,  -- Standard max attempts
        NOW(),
        d.file_size
    FROM documents d
    WHERE 
        (LOWER(d.original_filename) LIKE '%.doc' OR LOWER(d.original_filename) LIKE '%.docx')
        -- Skip documents that are already waiting in, or being worked from, the queue
        AND NOT EXISTS (
            SELECT 1 
            FROM ocr_queue oq 
            WHERE oq.document_id = d.id 
            AND oq.status IN ('pending', 'processing')
        );
    -- Number of rows inserted by the statement immediately above
    GET DIAGNOSTICS queued_count = ROW_COUNT;
    RAISE NOTICE 'Successfully queued % .doc/.docx files for re-OCR processing', queued_count;
    RAISE NOTICE 'Office document re-OCR migration completed successfully';
 END $$;