Readur/migrations/20240618000001_add_file_has...

36 lines
1.4 KiB
SQL

-- Add file_hash field to documents table for efficient duplicate detection
-- This will store SHA256 hash of file content to prevent duplicates
-- Add the file_hash column to documents table
ALTER TABLE documents
ADD COLUMN IF NOT EXISTS file_hash VARCHAR(64);
-- Create unique index to prevent hash duplicates per user
-- This enforces that each user cannot have duplicate file content
CREATE UNIQUE INDEX IF NOT EXISTS idx_documents_user_file_hash
ON documents(user_id, file_hash)
WHERE file_hash IS NOT NULL;
-- Create additional index for efficient hash lookups
CREATE INDEX IF NOT EXISTS idx_documents_file_hash
ON documents(file_hash)
WHERE file_hash IS NOT NULL;
-- Add helpful comments
COMMENT ON COLUMN documents.file_hash IS 'SHA256 hash of file content for duplicate detection - prevents same content from being stored multiple times per user';
-- Create a view for duplicate analysis
CREATE OR REPLACE VIEW document_duplicates_analysis AS
SELECT
file_hash,
COUNT(*) as duplicate_count,
array_agg(DISTINCT user_id ORDER BY user_id) as users_with_duplicates,
array_agg(filename ORDER BY created_at) as filenames,
MIN(created_at) as first_upload,
MAX(created_at) as last_upload,
SUM(file_size) as total_storage_used
FROM documents
WHERE file_hash IS NOT NULL
GROUP BY file_hash
HAVING COUNT(*) > 1
ORDER BY duplicate_count DESC, total_storage_used DESC;