feat(dev): trigger re-ocr on doc and docx
This commit is contained in:
parent
43b679f59b
commit
036941b3dc
|
|
@ -18,4 +18,83 @@ COMMENT ON COLUMN settings.office_extraction_enable_detailed_logging IS
|
|||
'Enable detailed logging for office document extraction operations (default: false)';
|
||||
|
||||
-- The default values are already set in the column definitions above
|
||||
-- No need to insert default settings as they should be created when users are created
|
||||
-- No need to insert default settings as they should be created when users are created
|
||||
|
||||
-- ============================================================================
|
||||
-- TRIGGER RE-OCR FOR .doc AND .docx FILES
|
||||
-- ============================================================================
|
||||
-- This section will mark all existing .doc and .docx files for re-processing
|
||||
-- to take advantage of improved Office document extraction capabilities
|
||||
|
||||
-- First, let's count how many documents will be affected
|
||||
DO $$
DECLARE
    n_doc  INTEGER;
    n_docx INTEGER;
BEGIN
    -- Tally both extensions in a single pass over documents. The two suffix
    -- patterns cannot overlap: neither ends in a wildcard, so a name ending
    -- in ".docx" never matches '%.doc'.
    SELECT
        COUNT(*) FILTER (WHERE LOWER(original_filename) LIKE '%.doc'),
        COUNT(*) FILTER (WHERE LOWER(original_filename) LIKE '%.docx')
    INTO n_doc, n_docx
    FROM documents;

    -- Informational only; nothing is modified by this block.
    RAISE NOTICE 'Found % .doc files and % .docx files (% total) that will be queued for re-OCR',
        n_doc, n_docx, n_doc + n_docx;
END $$;
|
||||
|
||||
-- Update documents table: Reset OCR status for .doc and .docx files
|
||||
-- Clear all stored OCR results for .doc/.docx documents and flag them as
-- 'pending' so they are marked for re-processing.
UPDATE documents
SET
    ocr_status = 'pending',
    ocr_text = NULL,
    ocr_confidence = NULL,
    ocr_word_count = NULL,
    ocr_processing_time_ms = NULL,
    ocr_error = NULL,
    ocr_completed_at = NULL,
    updated_at = NOW()
WHERE
    (LOWER(original_filename) LIKE '%.doc' OR LOWER(original_filename) LIKE '%.docx')
    -- Only update if not already pending. IS DISTINCT FROM is used instead of
    -- != because `NULL != 'pending'` evaluates to NULL, which would silently
    -- skip rows whose ocr_status has never been set.
    AND ocr_status IS DISTINCT FROM 'pending';
|
||||
|
||||
-- Add entries to OCR queue for all .doc and .docx files that don't already have a pending or processing queue entry
|
||||
-- Queue every .doc/.docx document that has no active (pending or processing)
-- queue entry, then log exactly how many rows this statement inserted.
-- The INSERT runs inside the DO block so the count can be read from
-- ROW_COUNT rather than estimated from created_at timestamps, which would
-- also pick up entries queued by anything else in the same time window.
DO $$
DECLARE
    queued_count BIGINT;
BEGIN
    INSERT INTO ocr_queue (document_id, status, priority, attempts, max_attempts, created_at, file_size)
    SELECT
        d.id,
        'pending',
        3,      -- Medium-high priority for office documents
        0,      -- Reset attempts
        3,      -- Standard max attempts
        NOW(),
        d.file_size
    FROM documents d
    WHERE
        (LOWER(d.original_filename) LIKE '%.doc' OR LOWER(d.original_filename) LIKE '%.docx')
        AND NOT EXISTS (
            SELECT 1
            FROM ocr_queue oq
            WHERE oq.document_id = d.id
              AND oq.status IN ('pending', 'processing')
        );

    -- Number of rows inserted by the statement immediately above.
    GET DIAGNOSTICS queued_count = ROW_COUNT;

    RAISE NOTICE 'Successfully queued % .doc/.docx files for re-OCR processing', queued_count;
    RAISE NOTICE 'Office document re-OCR migration completed successfully';
END $$;
|
||||
Loading…
Reference in New Issue