36 lines
1.4 KiB
SQL
36 lines
1.4 KiB
SQL
-- Add file_hash field to documents table for efficient duplicate detection
|
|
-- This will store SHA256 hash of file content to prevent duplicates
|
|
|
|
-- Add the file_hash column to documents table
|
|
ALTER TABLE documents
|
|
ADD COLUMN IF NOT EXISTS file_hash VARCHAR(64);
|
|
|
|
-- Create unique index to prevent hash duplicates per user
|
|
-- This enforces that each user cannot have duplicate file content
|
|
CREATE UNIQUE INDEX IF NOT EXISTS idx_documents_user_file_hash
|
|
ON documents(user_id, file_hash)
|
|
WHERE file_hash IS NOT NULL;
|
|
|
|
-- Create additional index for efficient hash lookups
|
|
CREATE INDEX IF NOT EXISTS idx_documents_file_hash
|
|
ON documents(file_hash)
|
|
WHERE file_hash IS NOT NULL;
|
|
|
|
-- Add helpful comments
|
|
COMMENT ON COLUMN documents.file_hash IS 'SHA256 hash of file content for duplicate detection - prevents same content from being stored multiple times per user';
|
|
|
|
-- Create a view for duplicate analysis
|
|
CREATE OR REPLACE VIEW document_duplicates_analysis AS
|
|
SELECT
|
|
file_hash,
|
|
COUNT(*) as duplicate_count,
|
|
array_agg(DISTINCT user_id ORDER BY user_id) as users_with_duplicates,
|
|
array_agg(filename ORDER BY created_at) as filenames,
|
|
MIN(created_at) as first_upload,
|
|
MAX(created_at) as last_upload,
|
|
SUM(file_size) as total_storage_used
|
|
FROM documents
|
|
WHERE file_hash IS NOT NULL
|
|
GROUP BY file_hash
|
|
HAVING COUNT(*) > 1
|
|
ORDER BY duplicate_count DESC, total_storage_used DESC; |