Readur/migrations/20250628000004_migrate_fail...

123 lines
3.4 KiB
SQL

-- Migration to move existing failed OCR documents from documents table to failed_documents table
-- This consolidates all failure tracking into a single table
-- First, ensure the failed_documents table exists
-- (This migration depends on 20250628000003_add_failed_documents_table.sql)
-- Move failed OCR documents to failed_documents table
INSERT INTO failed_documents (
user_id,
filename,
original_filename,
file_path,
file_size,
file_hash,
mime_type,
content,
tags,
ocr_text,
ocr_confidence,
ocr_word_count,
ocr_processing_time_ms,
failure_reason,
failure_stage,
ingestion_source,
error_message,
retry_count,
created_at,
updated_at
)
SELECT
d.user_id,
d.filename,
d.original_filename,
d.file_path,
d.file_size,
d.file_hash,
d.mime_type,
d.content,
d.tags,
d.ocr_text,
d.ocr_confidence,
d.ocr_word_count,
d.ocr_processing_time_ms,
COALESCE(d.ocr_failure_reason, 'other') as failure_reason,
'ocr' as failure_stage,
'migration' as ingestion_source, -- Mark these as migrated from existing system
d.ocr_error as error_message,
COALESCE(q.retry_count, 0) as retry_count,
d.created_at,
d.updated_at
FROM documents d
LEFT JOIN (
SELECT document_id, COUNT(*) as retry_count
FROM ocr_queue
WHERE status IN ('failed', 'completed')
GROUP BY document_id
) q ON d.id = q.document_id
WHERE d.ocr_status = 'failed';
-- Log the migration for audit purposes
INSERT INTO failed_documents (
user_id,
filename,
original_filename,
failure_reason,
failure_stage,
ingestion_source,
error_message,
created_at,
updated_at
) VALUES (
'00000000-0000-0000-0000-000000000000'::uuid, -- System user ID
'migration_log',
'Failed OCR Migration Log',
'migration_completed',
'migration',
'system',
'Migrated ' || (SELECT COUNT(*) FROM documents WHERE ocr_status = 'failed') || ' failed OCR documents to failed_documents table',
NOW(),
NOW()
);
-- Remove failed OCR documents from documents table
-- Note: This uses CASCADE to also clean up related records in ocr_queue table
DELETE FROM documents WHERE ocr_status = 'failed';
-- Update statistics and constraints
ANALYZE documents;
ANALYZE failed_documents;
-- Add comment documenting the migration
COMMENT ON TABLE failed_documents IS 'Tracks all documents that failed at any stage of processing. Consolidated from documents table (OCR failures) and new ingestion failures as of migration 20250628000004.';
-- Create indexes for efficient querying of migrated data
CREATE INDEX IF NOT EXISTS idx_failed_documents_failure_stage_reason ON failed_documents(failure_stage, failure_reason);
CREATE INDEX IF NOT EXISTS idx_failed_documents_ocr_confidence ON failed_documents(ocr_confidence) WHERE ocr_confidence IS NOT NULL;
-- Optional: Create a view for backward compatibility during transition
CREATE OR REPLACE VIEW legacy_failed_ocr_documents AS
SELECT
id,
user_id,
filename,
original_filename,
file_path,
file_size,
mime_type,
tags,
ocr_text,
ocr_confidence,
ocr_word_count,
ocr_processing_time_ms,
failure_reason as ocr_failure_reason,
error_message as ocr_error,
'failed' as ocr_status,
retry_count,
created_at,
updated_at
FROM failed_documents
WHERE failure_stage = 'ocr';
-- Grant appropriate permissions
-- GRANT SELECT ON legacy_failed_ocr_documents TO readur_user;