100 lines
3.7 KiB
SQL
100 lines
3.7 KiB
SQL
-- Add office document extraction settings to the settings table.
-- This migration adds timeout and detailed-logging controls for Office document
-- extraction using XML parsing, then re-queues existing .doc/.docx files for OCR.
|
|
|
|
-- Office extraction settings, added in a single ALTER TABLE:
--   * office_extraction_timeout_seconds: timeout in seconds, restricted to
--     1-600 by the CHECK constraint (default: 120)
--   * office_extraction_enable_detailed_logging: detailed-logging switch
--     (default: false)
-- Each ADD COLUMN carries its own IF NOT EXISTS, so a column that is already
-- present is skipped without affecting the other one.
ALTER TABLE settings
    ADD COLUMN IF NOT EXISTS office_extraction_timeout_seconds INTEGER NOT NULL DEFAULT 120
        CHECK (office_extraction_timeout_seconds BETWEEN 1 AND 600),
    ADD COLUMN IF NOT EXISTS office_extraction_enable_detailed_logging BOOLEAN NOT NULL DEFAULT FALSE;
|
|
|
|
-- Attach catalog descriptions to the two office extraction settings columns
COMMENT ON COLUMN settings.office_extraction_timeout_seconds
    IS 'Timeout in seconds for office document extraction (1-600 seconds, default: 120)';

COMMENT ON COLUMN settings.office_extraction_enable_detailed_logging
    IS 'Enable detailed logging for office document extraction operations (default: false)';
|
|
|
|
-- The column DEFAULTs above already supply values for existing rows, so no
-- backfill is needed here. Settings rows themselves should be created when
-- users are created.
|
|
|
|
-- ============================================================================
-- TRIGGER RE-OCR FOR .doc AND .docx FILES
-- ============================================================================
-- This section marks all existing .doc and .docx files for re-processing
-- to take advantage of improved Office document extraction capabilities.
|
|
|
|
-- Report how many .doc and .docx documents exist before any rows are changed
DO $$
DECLARE
    n_doc  INTEGER;
    n_docx INTEGER;
BEGIN
    -- A name ending in '.doc' can never also end in '.docx', so the two FILTER
    -- clauses are mutually exclusive and one pass over documents yields both
    -- counts.
    SELECT
        COUNT(*) FILTER (WHERE LOWER(original_filename) LIKE '%.doc'),
        COUNT(*) FILTER (WHERE LOWER(original_filename) LIKE '%.docx')
    INTO n_doc, n_docx
    FROM documents;

    RAISE NOTICE 'Found % .doc files and % .docx files (% total) that will be queued for re-OCR',
        n_doc, n_docx, n_doc + n_docx;
END $$;
|
|
|
|
-- Update documents table: reset OCR state for .doc and .docx files so they are
-- extracted again. All stored OCR results are cleared and ocr_status is set
-- back to 'pending'; rows that are already 'pending' are left untouched.
--
-- IS DISTINCT FROM is used instead of != because `NULL != 'pending'` evaluates
-- to NULL, which WHERE treats as false: rows whose ocr_status is NULL would be
-- skipped by the reset even though the queue INSERT later in this migration
-- does not filter on ocr_status.
--
-- NOTE(review): this also resets rows whose ocr_status is 'processing' --
-- confirm that is intended.
UPDATE documents
SET
    ocr_status = 'pending',
    ocr_text = NULL,
    ocr_confidence = NULL,
    ocr_word_count = NULL,
    ocr_processing_time_ms = NULL,
    ocr_error = NULL,
    ocr_completed_at = NULL,
    updated_at = NOW()
WHERE
    (LOWER(original_filename) LIKE '%.doc' OR LOWER(original_filename) LIKE '%.docx')
    AND ocr_status IS DISTINCT FROM 'pending';
|
|
|
|
-- Add an ocr_queue entry for every .doc/.docx document that does not already
-- have a pending or processing entry, then log how many entries were added.
--
-- The INSERT runs inside the DO block so the logged number can be taken from
-- GET DIAGNOSTICS ... ROW_COUNT, i.e. the exact number of rows this statement
-- inserted. Re-counting afterwards with a created_at time window would also
-- pick up pending entries for the same documents that were queued by anything
-- else inside that window.
DO $$
DECLARE
    queued_count BIGINT;  -- rows inserted by the INSERT below
BEGIN
    INSERT INTO ocr_queue (document_id, status, priority, attempts, max_attempts, created_at, file_size)
    SELECT
        d.id,
        'pending',
        3,      -- Medium-high priority for office documents
        0,      -- Reset attempts
        3,      -- Standard max attempts
        NOW(),
        d.file_size
    FROM documents AS d
    WHERE
        (LOWER(d.original_filename) LIKE '%.doc' OR LOWER(d.original_filename) LIKE '%.docx')
        -- Skip documents that already have an active queue entry
        AND NOT EXISTS (
            SELECT 1
            FROM ocr_queue AS oq
            WHERE oq.document_id = d.id
              AND oq.status IN ('pending', 'processing')
        );

    GET DIAGNOSTICS queued_count = ROW_COUNT;

    RAISE NOTICE 'Successfully queued % .doc/.docx files for re-OCR processing', queued_count;
    RAISE NOTICE 'Office document re-OCR migration completed successfully';
END $$;