diff --git a/migrations/20250901000001_add_office_extraction_settings.sql b/migrations/20250901000001_add_office_extraction_settings.sql
index 5cf5cc1..c68cac7 100644
--- a/migrations/20250901000001_add_office_extraction_settings.sql
+++ b/migrations/20250901000001_add_office_extraction_settings.sql
@@ -18,4 +18,82 @@
 COMMENT ON COLUMN settings.office_extraction_enable_detailed_logging IS 'Enable detailed logging for office document extraction operations (default: false)';
 
 -- The default values are already set in the column definitions above
--- No need to insert default settings as they should be created when users are created
\ No newline at end of file
+-- No need to insert default settings as they should be created when users are created
+
+-- ============================================================================
+-- TRIGGER RE-OCR FOR .doc AND .docx FILES
+-- ============================================================================
+-- This section marks all existing .doc and .docx files for re-processing
+-- to take advantage of improved Office document extraction capabilities.
+-- NOTE(review): this appends to an existing migration file; if that migration
+-- has already been applied anywhere, confirm the migration tool tolerates the
+-- changed contents or move this section into a new migration.
+
+-- Report how many documents match before anything is modified.
+DO $$
+DECLARE
+    doc_count INTEGER;
+    docx_count INTEGER;
+BEGIN
+    -- A name ending in '.doc' can never also end in '.docx', so the two
+    -- filters are disjoint and a single scan can produce both counts.
+    SELECT
+        COUNT(*) FILTER (WHERE LOWER(original_filename) LIKE '%.doc'),
+        COUNT(*) FILTER (WHERE LOWER(original_filename) LIKE '%.docx')
+    INTO doc_count, docx_count
+    FROM documents;
+
+    RAISE NOTICE 'Found % .doc files and % .docx files (% total) that will be queued for re-OCR',
+        doc_count, docx_count, doc_count + docx_count;
+END $$;
+
+-- Reset OCR state for .doc and .docx documents so they are processed again.
+UPDATE documents
+SET
+    ocr_status = 'pending',
+    ocr_text = NULL,
+    ocr_confidence = NULL,
+    ocr_word_count = NULL,
+    ocr_processing_time_ms = NULL,
+    ocr_error = NULL,
+    ocr_completed_at = NULL,
+    updated_at = NOW()
+WHERE
+    (LOWER(original_filename) LIKE '%.doc' OR LOWER(original_filename) LIKE '%.docx')
+    -- IS DISTINCT FROM also matches rows whose ocr_status is NULL; a plain
+    -- != comparison would evaluate to NULL for them and skip the reset.
+    AND ocr_status IS DISTINCT FROM 'pending';
+
+-- Queue every .doc and .docx document that has no active queue entry, then
+-- log exactly how many rows this statement inserted.
+DO $$
+DECLARE
+    queued_count INTEGER;
+BEGIN
+    INSERT INTO ocr_queue (document_id, status, priority, attempts, max_attempts, created_at, file_size)
+    SELECT
+        d.id,
+        'pending',
+        3,  -- Medium-high priority for office documents
+        0,  -- Reset attempts
+        3,  -- Standard max attempts
+        NOW(),
+        d.file_size
+    FROM documents d
+    WHERE
+        (LOWER(d.original_filename) LIKE '%.doc' OR LOWER(d.original_filename) LIKE '%.docx')
+        -- Skip documents that already have a pending or in-progress entry.
+        AND NOT EXISTS (
+            SELECT 1
+            FROM ocr_queue oq
+            WHERE oq.document_id = d.id
+                AND oq.status IN ('pending', 'processing')
+        );
+
+    -- ROW_COUNT reflects only the INSERT above, unlike a time-window query
+    -- on created_at, which would also count entries queued by other sessions.
+    GET DIAGNOSTICS queued_count = ROW_COUNT;
+
+    RAISE NOTICE 'Successfully queued % .doc/.docx files for re-OCR processing', queued_count;
+    RAISE NOTICE 'Office document re-OCR migration completed successfully';
+END $$;
\ No newline at end of file