feat(server): add hash for documents
This commit is contained in:
parent 4d32163ccc
commit 58aaedf4a6
@ -0,0 +1,36 @@
+-- Add file_hash field to documents table for efficient duplicate detection
+-- This will store the SHA256 hash of file content to prevent duplicates
+
+-- Add the file_hash column to the documents table
+ALTER TABLE documents
+ADD COLUMN IF NOT EXISTS file_hash VARCHAR(64);
+
+-- Create unique index to prevent hash duplicates per user
+-- This enforces that each user cannot have duplicate file content
+CREATE UNIQUE INDEX IF NOT EXISTS idx_documents_user_file_hash
+ON documents(user_id, file_hash)
+WHERE file_hash IS NOT NULL;
+
+-- Create additional index for efficient hash lookups
+CREATE INDEX IF NOT EXISTS idx_documents_file_hash
+ON documents(file_hash)
+WHERE file_hash IS NOT NULL;
+
+-- Add helpful comments
+COMMENT ON COLUMN documents.file_hash IS 'SHA256 hash of file content for duplicate detection - prevents same content from being stored multiple times per user';
+
+-- Create a view for duplicate analysis
+CREATE OR REPLACE VIEW document_duplicates_analysis AS
+SELECT
+    file_hash,
+    COUNT(*) as duplicate_count,
+    array_agg(DISTINCT user_id ORDER BY user_id) as users_with_duplicates,
+    array_agg(filename ORDER BY created_at) as filenames,
+    MIN(created_at) as first_upload,
+    MAX(created_at) as last_upload,
+    SUM(file_size) as total_storage_used
+FROM documents
+WHERE file_hash IS NOT NULL
+GROUP BY file_hash
+HAVING COUNT(*) > 1
+ORDER BY duplicate_count DESC, total_storage_used DESC;
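Nothing in this commit reads the new document_duplicates_analysis view yet; it exists for ad-hoc analysis. As a minimal sketch of how it could be consumed from the same codebase (assuming a sqlx PgPool is in scope; the helper name here is made up), note the BIGINT cast, since SUM over a BIGINT column comes back as NUMERIC in PostgreSQL:

    use sqlx::{PgPool, Row};

    // Hypothetical helper, not part of this commit: list the largest duplicate groups.
    async fn print_duplicate_report(pool: &PgPool) -> Result<(), sqlx::Error> {
        let rows = sqlx::query(
            "SELECT file_hash, duplicate_count, total_storage_used::BIGINT AS total_storage_used \
             FROM document_duplicates_analysis \
             ORDER BY total_storage_used DESC \
             LIMIT 10",
        )
        .fetch_all(pool)
        .await?;

        for row in rows {
            let hash: String = row.get("file_hash");
            let count: i64 = row.get("duplicate_count");
            let bytes: i64 = row.get("total_storage_used");
            println!("{} copies of {} using {} bytes", count, &hash[..8], bytes);
        }
        Ok(())
    }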
@ -6,6 +6,7 @@ use tokio::sync::Semaphore;
use tracing::{error, info, warn};
use uuid::Uuid;
use walkdir::WalkDir;
+use sha2::{Sha256, Digest};

use crate::{
    config::Config,
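Note that the new sha2 import implies a matching Cargo.toml dependency that is not shown in this diff; presumably something along the lines of sha2 = "0.10" under [dependencies] (the exact version is an assumption, not part of the commit).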
@ -188,6 +189,25 @@ async fn process_single_file(
    // Read file data
    let file_data = fs::read(&path).await?;

+    // Calculate file hash for deduplication
+    let file_hash = calculate_file_hash(&file_data);
+
+    // Check for duplicate content using efficient hash lookup
+    match db.get_document_by_user_and_hash(user_id, &file_hash).await {
+        Ok(Some(existing_doc)) => {
+            info!("Skipping duplicate file: {} matches existing document {} (hash: {})",
+                filename, existing_doc.original_filename, &file_hash[..8]);
+            return Ok(None); // Skip processing duplicate
+        }
+        Ok(None) => {
+            info!("No duplicate content found for hash {}, proceeding with file processing", &file_hash[..8]);
+        }
+        Err(e) => {
+            warn!("Error checking for duplicate hash {}: {}", &file_hash[..8], e);
+            // Continue processing even if duplicate check fails
+        }
+    }
+
    let mime_type = mime_guess::from_path(&filename)
        .first_or_octet_stream()
        .to_string();
@ -195,7 +215,7 @@ async fn process_single_file(
    // Save file
    let file_path = file_service.save_file(&filename, &file_data).await?;

-    // Create document
+    // Create document with hash
    let document = file_service.create_document(
        &filename,
        &filename,

@ -203,6 +223,7 @@ async fn process_single_file(
        file_size,
        &mime_type,
        user_id,
+        Some(file_hash),
    );

    // Save to database (without OCR)
@ -225,3 +246,10 @@ fn calculate_priority(file_size: i64) -> i32 {
        _ => 2, // > 50MB: lowest priority
    }
}
+
+fn calculate_file_hash(data: &[u8]) -> String {
+    let mut hasher = Sha256::new();
+    hasher.update(data);
+    let result = hasher.finalize();
+    format!("{:x}", result)
+}
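calculate_file_hash feeds both the duplicate lookups and the new VARCHAR(64) column, so its output format is worth pinning down: a 64-character lowercase hex SHA256 digest. A small test sketch, not part of this commit; it assumes it sits in the same module as calculate_file_hash, and the expected value is the well-known SHA-256 of the bytes "hello":

    #[cfg(test)]
    mod file_hash_tests {
        use super::calculate_file_hash;

        #[test]
        fn produces_lowercase_hex_sha256() {
            let digest = calculate_file_hash(b"hello");
            assert_eq!(digest.len(), 64); // fits the VARCHAR(64) column
            assert_eq!(
                digest,
                "2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824"
            );
        }
    }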
@ -9,9 +9,9 @@ impl Database {
    pub async fn create_document(&self, document: Document) -> Result<Document> {
        let row = sqlx::query(
            r#"
-            INSERT INTO documents (id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id)
-            VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)
-            RETURNING id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id
+            INSERT INTO documents (id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash)
+            VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19)
+            RETURNING id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
            "#
        )
        .bind(document.id)

@ -32,6 +32,7 @@ impl Database {
        .bind(document.created_at)
        .bind(document.updated_at)
        .bind(document.user_id)
+        .bind(&document.file_hash)
        .fetch_one(&self.pool)
        .await?;

@ -54,6 +55,7 @@ impl Database {
            created_at: row.get("created_at"),
            updated_at: row.get("updated_at"),
            user_id: row.get("user_id"),
+            file_hash: row.get("file_hash"),
        })
    }

@ -61,7 +63,7 @@ impl Database {
        let query = if user_role == crate::models::UserRole::Admin {
            // Admins can see all documents
            r#"
-            SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id
+            SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
            FROM documents
            ORDER BY created_at DESC
            LIMIT $1 OFFSET $2

@ -69,7 +71,7 @@ impl Database {
        } else {
            // Regular users can only see their own documents
            r#"
-            SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id
+            SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
            FROM documents
            WHERE user_id = $3
            ORDER BY created_at DESC
@ -113,6 +115,7 @@ impl Database {
            created_at: row.get("created_at"),
            updated_at: row.get("updated_at"),
            user_id: row.get("user_id"),
+            file_hash: row.get("file_hash"),
        })
        .collect();

@ -211,6 +214,7 @@ impl Database {
            created_at: row.get("created_at"),
            updated_at: row.get("updated_at"),
            user_id: row.get("user_id"),
+            file_hash: row.get("file_hash"),
        })
        .collect();

@ -297,6 +301,7 @@ impl Database {
            created_at: row.get("created_at"),
            updated_at: row.get("updated_at"),
            user_id: row.get("user_id"),
+            file_hash: row.get("file_hash"),
        })
        .collect();

@ -337,6 +342,7 @@ impl Database {
            created_at: row.get("created_at"),
            updated_at: row.get("updated_at"),
            user_id: row.get("user_id"),
+            file_hash: row.get("file_hash"),
        })
        .collect();

@ -407,6 +413,7 @@ impl Database {
            created_at: row.get("created_at"),
            updated_at: row.get("updated_at"),
            user_id: row.get("user_id"),
+            file_hash: row.get("file_hash"),
        })
        .collect();
@ -1122,4 +1129,45 @@ impl Database {
            None => Ok(None),
        }
    }
+
+    /// Check if a document with the given file hash already exists for the user
+    pub async fn get_document_by_user_and_hash(&self, user_id: Uuid, file_hash: &str) -> Result<Option<Document>> {
+        let row = sqlx::query(
+            r#"
+            SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
+            FROM documents
+            WHERE user_id = $1 AND file_hash = $2
+            LIMIT 1
+            "#
+        )
+        .bind(user_id)
+        .bind(file_hash)
+        .fetch_optional(&self.pool)
+        .await?;
+
+        match row {
+            Some(row) => Ok(Some(Document {
+                id: row.get("id"),
+                filename: row.get("filename"),
+                original_filename: row.get("original_filename"),
+                file_path: row.get("file_path"),
+                file_size: row.get("file_size"),
+                mime_type: row.get("mime_type"),
+                content: row.get("content"),
+                ocr_text: row.get("ocr_text"),
+                ocr_confidence: row.get("ocr_confidence"),
+                ocr_word_count: row.get("ocr_word_count"),
+                ocr_processing_time_ms: row.get("ocr_processing_time_ms"),
+                ocr_status: row.get("ocr_status"),
+                ocr_error: row.get("ocr_error"),
+                ocr_completed_at: row.get("ocr_completed_at"),
+                tags: row.get("tags"),
+                created_at: row.get("created_at"),
+                updated_at: row.get("updated_at"),
+                user_id: row.get("user_id"),
+                file_hash: row.get("file_hash"),
+            })),
+            None => Ok(None),
+        }
+    }
}
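The lookup above is used by callers in a check-then-insert pattern, which can race when two uploads of identical content land at the same time; the partial unique index from the migration is the backstop, and the losing insert surfaces as a unique violation. A hedged sketch of how a caller could recognize that case, assuming the underlying sqlx::Error is reachable ("23505" is PostgreSQL's unique_violation SQLSTATE; the helper name is hypothetical):

    // Hypothetical helper, not part of this commit.
    fn is_duplicate_hash_violation(err: &sqlx::Error) -> bool {
        match err {
            // idx_documents_user_file_hash rejects a second insert of the same
            // (user_id, file_hash) pair for rows where file_hash is non-null.
            sqlx::Error::Database(db_err) => db_err.code().as_deref() == Some("23505"),
            _ => false,
        }
    }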
@ -156,6 +156,7 @@ impl FileService {
        file_size: i64,
        mime_type: &str,
        user_id: Uuid,
+        file_hash: Option<String>,
    ) -> Document {
        Document {
            id: Uuid::new_v4(),

@ -176,6 +177,7 @@ impl FileService {
            created_at: Utc::now(),
            updated_at: Utc::now(),
            user_id,
+            file_hash,
        }
    }

@ -98,6 +98,7 @@ pub struct Document {
    pub created_at: DateTime<Utc>,
    pub updated_at: DateTime<Utc>,
    pub user_id: Uuid,
+    pub file_hash: Option<String>,
}

#[derive(Debug, Serialize, Deserialize, ToSchema)]
@ -142,21 +142,17 @@ async fn upload_document(
    // Calculate file hash for deduplication
    let file_hash = calculate_file_hash(&data);

-    // Check if this exact file content already exists in the system
-    // This prevents uploading and processing duplicate files
-    if let Ok(existing_docs) = state.db.get_documents_by_user_with_role(auth_user.user.id, auth_user.user.role, 1000, 0).await {
-        for existing_doc in existing_docs {
-            // Quick size check first (much faster than hash comparison)
-            if existing_doc.file_size == file_size {
-                // Read the existing file and compare hashes
-                if let Ok(existing_file_data) = tokio::fs::read(&existing_doc.file_path).await {
-                    let existing_hash = calculate_file_hash(&existing_file_data);
-                    if file_hash == existing_hash {
-                        // Return the existing document instead of creating a duplicate
-                        return Ok(Json(existing_doc.into()));
-                    }
-                }
-            }
-        }
-    }
+    // Check if this exact file content already exists using efficient hash lookup
+    match state.db.get_document_by_user_and_hash(auth_user.user.id, &file_hash).await {
+        Ok(Some(existing_doc)) => {
+            // Return the existing document instead of creating a duplicate
+            return Ok(Json(existing_doc.into()));
+        }
+        Ok(None) => {
+            // No duplicate found, proceed with upload
+        }
+        Err(_) => {
+            // Continue even if duplicate check fails
+        }
+    }

@ -176,6 +172,7 @@ async fn upload_document(
        file_size,
        &mime_type,
        auth_user.user.id,
+        Some(file_hash),
    );

    let saved_document = state
@ -276,51 +276,42 @@ async fn process_single_file(
    // Calculate file hash for deduplication
    let file_hash = calculate_file_hash(&file_data);

-    // Check if this exact file content already exists for this user
-    // This prevents downloading and processing duplicate files from WebDAV
+    // Check if this exact file content already exists for this user using efficient hash lookup
    info!("Checking for duplicate content for user {}: {} (hash: {}, size: {} bytes)",
        user_id, file_info.name, &file_hash[..8], file_data.len());

-    // Query documents with the same file size for this user only
-    let size_filter = file_data.len() as i64;
-    if let Ok(existing_docs) = state.db.get_documents_by_user_with_role(user_id, crate::models::UserRole::User, 1000, 0).await {
-        let matching_docs: Vec<_> = existing_docs.into_iter()
-            .filter(|doc| doc.file_size == size_filter)
-            .collect();
-
-        info!("Found {} documents with same size for user {}", matching_docs.len(), user_id);
-
-        for existing_doc in matching_docs {
-            // Read the existing file and compare hashes
-            if let Ok(existing_file_data) = tokio::fs::read(&existing_doc.file_path).await {
-                let existing_hash = calculate_file_hash(&existing_file_data);
-                if file_hash == existing_hash {
-                    info!("Found duplicate content for user {}: {} matches existing document {}",
-                        user_id, file_info.name, existing_doc.original_filename);
-
-                    // Record this WebDAV file as a duplicate but link to existing document
-                    let webdav_file = CreateWebDAVFile {
-                        user_id,
-                        webdav_path: file_info.path.clone(),
-                        etag: file_info.etag.clone(),
-                        last_modified: file_info.last_modified,
-                        file_size: file_info.size,
-                        mime_type: file_info.mime_type.clone(),
-                        document_id: Some(existing_doc.id), // Link to existing document
-                        sync_status: "duplicate_content".to_string(),
-                        sync_error: None,
-                    };
-
-                    if let Err(e) = state.db.create_or_update_webdav_file(&webdav_file).await {
-                        error!("Failed to record duplicate WebDAV file: {}", e);
-                    }
-
-                    info!("WebDAV file marked as duplicate_content, skipping processing");
-                    return Ok(false); // Not processed (duplicate)
-                }
-            } else {
-                warn!("Could not read existing file for hash comparison: {}", existing_doc.file_path);
-            }
-        }
-    }
+    // Use efficient database hash lookup instead of reading all documents
+    match state.db.get_document_by_user_and_hash(user_id, &file_hash).await {
+        Ok(Some(existing_doc)) => {
+            info!("Found duplicate content for user {}: {} matches existing document {} (hash: {})",
+                user_id, file_info.name, existing_doc.original_filename, &file_hash[..8]);
+
+            // Record this WebDAV file as a duplicate but link to existing document
+            let webdav_file = CreateWebDAVFile {
+                user_id,
+                webdav_path: file_info.path.clone(),
+                etag: file_info.etag.clone(),
+                last_modified: file_info.last_modified,
+                file_size: file_info.size,
+                mime_type: file_info.mime_type.clone(),
+                document_id: Some(existing_doc.id), // Link to existing document
+                sync_status: "duplicate_content".to_string(),
+                sync_error: None,
+            };
+
+            if let Err(e) = state.db.create_or_update_webdav_file(&webdav_file).await {
+                error!("Failed to record duplicate WebDAV file: {}", e);
+            }
+
+            info!("WebDAV file marked as duplicate_content, skipping processing");
+            return Ok(false); // Not processed (duplicate)
+        }
+        Ok(None) => {
+            info!("No duplicate content found for hash {}, proceeding with file processing", &file_hash[..8]);
+        }
+        Err(e) => {
+            warn!("Error checking for duplicate hash {}: {}", &file_hash[..8], e);
+            // Continue processing even if duplicate check fails
+        }
+    }
@ -330,7 +321,7 @@ async fn process_single_file(
    let saved_file_path = file_service.save_file(&file_info.name, &file_data).await
        .map_err(|e| format!("Failed to save {}: {}", file_info.name, e))?;

-    // Create document record
+    // Create document record with hash
    let file_service = FileService::new(state.config.upload_path.clone());
    let document = file_service.create_document(
        &file_info.name,

@ -339,6 +330,7 @@ async fn process_single_file(
        file_data.len() as i64,
        &file_info.mime_type,
        user_id,
+        Some(file_hash.clone()), // Store the calculated hash
    );

    // Save document to database
@ -534,25 +534,19 @@ impl SourceSyncService {
    // Calculate file hash for deduplication
    let file_hash = Self::calculate_file_hash(&file_data);

-    // Check for duplicate content
-    if let Ok(existing_docs) = state.db.get_documents_by_user_with_role(
-        user_id,
-        crate::models::UserRole::User,
-        1000,
-        0
-    ).await {
-        let matching_docs: Vec<_> = existing_docs.into_iter()
-            .filter(|doc| doc.file_size == file_data.len() as i64)
-            .collect();
-
-        for existing_doc in matching_docs {
-            if let Ok(existing_file_data) = tokio::fs::read(&existing_doc.file_path).await {
-                let existing_hash = Self::calculate_file_hash(&existing_file_data);
-                if file_hash == existing_hash {
-                    info!("File content already exists, skipping: {}", file_info.path);
-                    return Ok(false);
-                }
-            }
-        }
-    }
+    // Check for duplicate content using efficient hash lookup
+    match state.db.get_document_by_user_and_hash(user_id, &file_hash).await {
+        Ok(Some(existing_doc)) => {
+            info!("File content already exists for user {}: {} matches existing document {} (hash: {})",
+                user_id, file_info.name, existing_doc.original_filename, &file_hash[..8]);
+            return Ok(false); // Skip processing duplicate
+        }
+        Ok(None) => {
+            info!("No duplicate content found for hash {}, proceeding with file processing", &file_hash[..8]);
+        }
+        Err(e) => {
+            warn!("Error checking for duplicate hash {}: {}", &file_hash[..8], e);
+            // Continue processing even if duplicate check fails
+        }
+    }

@ -561,7 +555,7 @@ impl SourceSyncService {
    let saved_file_path = file_service.save_file(&file_info.name, &file_data).await
        .map_err(|e| anyhow!("Failed to save {}: {}", file_info.name, e))?;

-    // Create document record
+    // Create document record with hash
    let document = file_service.create_document(
        &file_info.name,
        &file_info.name,

@ -569,6 +563,7 @@ impl SourceSyncService {
        file_data.len() as i64,
        &file_info.mime_type,
        user_id,
+        Some(file_hash.clone()), // Store the calculated hash
    );

    let created_document = state.db.create_document(document).await
@ -655,25 +650,19 @@ impl SourceSyncService {
    // Calculate file hash for deduplication
    let file_hash = Self::calculate_file_hash(&file_data);

-    // Check for duplicate content
-    if let Ok(existing_docs) = state.db.get_documents_by_user_with_role(
-        user_id,
-        crate::models::UserRole::User,
-        1000,
-        0
-    ).await {
-        let matching_docs: Vec<_> = existing_docs.into_iter()
-            .filter(|doc| doc.file_size == file_data.len() as i64)
-            .collect();
-
-        for existing_doc in matching_docs {
-            if let Ok(existing_file_data) = tokio::fs::read(&existing_doc.file_path).await {
-                let existing_hash = Self::calculate_file_hash(&existing_file_data);
-                if file_hash == existing_hash {
-                    info!("File content already exists, skipping: {}", file_info.path);
-                    return Ok(false);
-                }
-            }
-        }
-    }
+    // Check for duplicate content using efficient hash lookup
+    match state.db.get_document_by_user_and_hash(user_id, &file_hash).await {
+        Ok(Some(existing_doc)) => {
+            info!("File content already exists for user {}: {} matches existing document {} (hash: {})",
+                user_id, file_info.name, existing_doc.original_filename, &file_hash[..8]);
+            return Ok(false); // Skip processing duplicate
+        }
+        Ok(None) => {
+            info!("No duplicate content found for hash {}, proceeding with file processing", &file_hash[..8]);
+        }
+        Err(e) => {
+            warn!("Error checking for duplicate hash {}: {}", &file_hash[..8], e);
+            // Continue processing even if duplicate check fails
+        }
+    }

@ -688,7 +677,7 @@ impl SourceSyncService {
    let saved_file_path = file_service.save_file(&file_info.name, &file_data).await
        .map_err(|e| anyhow!("Failed to save {}: {}", file_info.name, e))?;

-    // Create document record
+    // Create document record with hash
    let document = file_service.create_document(
        &file_info.name,
        &file_info.name,

@ -696,6 +685,7 @@ impl SourceSyncService {
        file_data.len() as i64,
        &file_info.mime_type,
        user_id,
+        Some(file_hash.clone()), // Store the calculated hash
    );

    let created_document = state.db.create_document(document).await
@ -72,6 +72,7 @@ mod tests {
        1024,
        "application/pdf",
        user_id,
+        Some("abcd1234hash".to_string()),
    );

    assert_eq!(document.filename, "saved_file.pdf");

@ -80,6 +81,7 @@ mod tests {
    assert_eq!(document.file_size, 1024);
    assert_eq!(document.mime_type, "application/pdf");
    assert_eq!(document.user_id, user_id);
+    assert_eq!(document.file_hash, Some("abcd1234hash".to_string()));
    assert!(document.content.is_none());
    assert!(document.ocr_text.is_none());
    assert!(document.tags.is_empty());
@ -362,6 +362,9 @@ async fn process_file(
    let saved_file_path = file_service.save_file(&filename, &file_data).await?;

+    // Calculate file hash for deduplication
+    let file_hash = calculate_file_hash(&file_data);
+
    let document = file_service.create_document(
        &filename,
        &filename,

@ -369,6 +372,7 @@ async fn process_file(
        file_size,
        &mime_type,
        admin_user_id,
+        Some(file_hash),
    );

    let created_doc = db.create_document(document).await?;