Readur/migrations/20250901000001_add_office_e...

-- Add office document extraction settings to the settings table
-- This migration adds a timeout control and a detailed-logging flag for Office document
-- extraction using XML parsing, then re-queues existing .doc/.docx documents for OCR

-- Office extraction timeout, in seconds. Defaults to 120 and is constrained to 1..600.
-- IF NOT EXISTS makes the statement a no-op when the column is already present.
ALTER TABLE settings
    ADD COLUMN IF NOT EXISTS office_extraction_timeout_seconds INTEGER
        NOT NULL
        DEFAULT 120
        CHECK (office_extraction_timeout_seconds BETWEEN 1 AND 600);

-- Detailed-logging switch for office extraction. Defaults to off.
ALTER TABLE settings
    ADD COLUMN IF NOT EXISTS office_extraction_enable_detailed_logging BOOLEAN
        NOT NULL
        DEFAULT FALSE;

-- Catalog descriptions for the two columns added above
COMMENT ON COLUMN settings.office_extraction_timeout_seconds
    IS 'Timeout in seconds for office document extraction (1-600 seconds, default: 120)';

COMMENT ON COLUMN settings.office_extraction_enable_detailed_logging
    IS 'Enable detailed logging for office document extraction operations (default: false)';

-- The default values are already set in the column definitions above
-- No need to insert default settings as they should be created when users are created

-- ============================================================================
-- TRIGGER RE-OCR FOR .doc AND .docx FILES
-- ============================================================================
-- This section will mark all existing .doc and .docx files for re-processing
-- to take advantage of improved Office document extraction capabilities

-- Report how many .doc and .docx documents exist before anything is changed.
-- Purely informational: this block only reads from documents and raises a NOTICE.
DO $$
DECLARE
    n_doc  INTEGER;
    n_docx INTEGER;
BEGIN
    -- Single pass over documents. A name ending in '.doc' can never also end in
    -- '.docx', so the two filters select disjoint sets of rows.
    SELECT
        COUNT(*) FILTER (WHERE LOWER(original_filename) LIKE '%.doc'),
        COUNT(*) FILTER (WHERE LOWER(original_filename) LIKE '%.docx')
    INTO n_doc, n_docx
    FROM documents;

    RAISE NOTICE 'Found % .doc files and % .docx files (% total) that will be queued for re-OCR',
        n_doc, n_docx, n_doc + n_docx;
END $$;

-- Update documents table: reset OCR state for .doc and .docx files.
-- Every stored OCR result field is cleared and the status is set back to 'pending'
-- so the documents are picked up for re-processing. Rows that are already 'pending'
-- are left untouched (including their updated_at).
UPDATE documents
SET
    ocr_status = 'pending',
    ocr_text = NULL,
    ocr_confidence = NULL,
    ocr_word_count = NULL,
    ocr_processing_time_ms = NULL,
    ocr_error = NULL,
    ocr_completed_at = NULL,
    updated_at = NOW()
WHERE
    (LOWER(original_filename) LIKE '%.doc' OR LOWER(original_filename) LIKE '%.docx')
    -- IS DISTINCT FROM is the NULL-safe form of `!=`: a row whose ocr_status is NULL
    -- is also reset here, whereas `ocr_status != 'pending'` evaluates to NULL for such
    -- a row and would silently skip it.
    AND ocr_status IS DISTINCT FROM 'pending';

-- Queue every .doc / .docx document for OCR, skipping any document that already has
-- an active ('pending' or 'processing') entry in ocr_queue.
WITH office_docs AS (
    -- All documents whose filename ends in .doc or .docx (case-insensitive)
    SELECT
        doc.id,
        doc.file_size
    FROM documents AS doc
    WHERE LOWER(doc.original_filename) LIKE '%.doc'
       OR LOWER(doc.original_filename) LIKE '%.docx'
)
INSERT INTO ocr_queue (document_id, status, priority, attempts, max_attempts, created_at, file_size)
SELECT
    od.id,
    'pending',
    3,      -- priority: medium-high for office documents
    0,      -- attempts: start from zero
    3,      -- max_attempts: standard limit
    NOW(),
    od.file_size
FROM office_docs AS od
WHERE NOT EXISTS (
    SELECT 1
    FROM ocr_queue AS active
    WHERE active.document_id = od.id
      AND active.status IN ('pending', 'processing')
);

-- Report how many office-document queue entries are now pending.
-- The figure is an approximation: it counts every pending ocr_queue row for a
-- .doc/.docx document whose created_at falls within the last minute, not strictly
-- the rows inserted by this migration.
DO $$
DECLARE
    n_queued INTEGER;
BEGIN
    SELECT COUNT(*)
    INTO n_queued
    FROM ocr_queue AS q
    INNER JOIN documents AS doc
        ON doc.id = q.document_id
    WHERE q.status = 'pending'
      AND q.created_at >= NOW() - INTERVAL '1 minute'
      AND (
            LOWER(doc.original_filename) LIKE '%.doc'
         OR LOWER(doc.original_filename) LIKE '%.docx'
      );

    RAISE NOTICE 'Successfully queued % .doc/.docx files for re-OCR processing', n_queued;
    RAISE NOTICE 'Office document re-OCR migration completed successfully';
END $$;