diff --git a/frontend/src/pages/FailedOcrPage.tsx b/frontend/src/pages/FailedOcrPage.tsx index 2a13e78..d6f8a10 100644 --- a/frontend/src/pages/FailedOcrPage.tsx +++ b/frontend/src/pages/FailedOcrPage.tsx @@ -27,6 +27,8 @@ import { Collapse, LinearProgress, Snackbar, + Tabs, + Tab, } from '@mui/material'; import { Refresh as RefreshIcon, @@ -37,6 +39,7 @@ import { Schedule as ScheduleIcon, Visibility as VisibilityIcon, Download as DownloadIcon, + FileCopy as FileCopyIcon, } from '@mui/icons-material'; import { format } from 'date-fns'; import { api, documentService } from '../services/api'; @@ -87,16 +90,54 @@ interface RetryResponse { estimated_wait_minutes?: number; } +interface DuplicateDocument { + id: string; + filename: string; + original_filename: string; + file_size: number; + mime_type: string; + created_at: string; + user_id: string; +} + +interface DuplicateGroup { + file_hash: string; + duplicate_count: number; + first_uploaded: string; + last_uploaded: string; + documents: DuplicateDocument[]; +} + +interface DuplicatesResponse { + duplicates: DuplicateGroup[]; + pagination: { + total: number; + limit: number; + offset: number; + has_more: boolean; + }; + statistics: { + total_duplicate_groups: number; + }; +} + const FailedOcrPage: React.FC = () => { + const [currentTab, setCurrentTab] = useState(0); const [documents, setDocuments] = useState([]); + const [duplicates, setDuplicates] = useState([]); const [loading, setLoading] = useState(true); + const [duplicatesLoading, setDuplicatesLoading] = useState(false); const [retrying, setRetrying] = useState(null); const [statistics, setStatistics] = useState(null); + const [duplicateStatistics, setDuplicateStatistics] = useState(null); const [pagination, setPagination] = useState({ page: 1, limit: 25 }); + const [duplicatesPagination, setDuplicatesPagination] = useState({ page: 1, limit: 25 }); const [totalPages, setTotalPages] = useState(0); + const [duplicatesTotalPages, setDuplicatesTotalPages] = useState(0); const [selectedDocument, setSelectedDocument] = useState(null); const [detailsOpen, setDetailsOpen] = useState(false); const [expandedRows, setExpandedRows] = useState>(new Set()); + const [expandedDuplicateGroups, setExpandedDuplicateGroups] = useState>(new Set()); const [snackbar, setSnackbar] = useState<{ open: boolean; message: string; severity: 'success' | 'error' }>({ open: false, message: '', @@ -124,10 +165,37 @@ const FailedOcrPage: React.FC = () => { } }; + const fetchDuplicates = async () => { + try { + setDuplicatesLoading(true); + const offset = (duplicatesPagination.page - 1) * duplicatesPagination.limit; + const response = await documentService.getDuplicates(duplicatesPagination.limit, offset); + + setDuplicates(response.data.duplicates); + setDuplicateStatistics(response.data.statistics); + setDuplicatesTotalPages(Math.ceil(response.data.pagination.total / duplicatesPagination.limit)); + } catch (error) { + console.error('Failed to fetch duplicates:', error); + setSnackbar({ + open: true, + message: 'Failed to load duplicate documents', + severity: 'error' + }); + } finally { + setDuplicatesLoading(false); + } + }; + useEffect(() => { fetchFailedDocuments(); }, [pagination.page]); + useEffect(() => { + if (currentTab === 1) { + fetchDuplicates(); + } + }, [currentTab, duplicatesPagination.page]); + const handleRetryOcr = async (document: FailedDocument) => { try { setRetrying(document.id); @@ -200,6 +268,28 @@ const FailedOcrPage: React.FC = () => { setDetailsOpen(true); }; + const 
toggleDuplicateGroupExpansion = (groupHash: string) => { + const newExpanded = new Set(expandedDuplicateGroups); + if (newExpanded.has(groupHash)) { + newExpanded.delete(groupHash); + } else { + newExpanded.add(groupHash); + } + setExpandedDuplicateGroups(newExpanded); + }; + + const handleTabChange = (event: React.SyntheticEvent, newValue: number) => { + setCurrentTab(newValue); + }; + + const refreshCurrentTab = () => { + if (currentTab === 0) { + fetchFailedDocuments(); + } else { + fetchDuplicates(); + } + }; + if (loading && documents.length === 0) { return ( @@ -212,20 +302,38 @@ const FailedOcrPage: React.FC = () => { - Failed OCR Documents + Failed OCR & Duplicates - {/* Statistics Overview */} - {statistics && ( + + + } + label={`Failed OCR${statistics ? ` (${statistics.total_failed})` : ''}`} + iconPosition="start" + /> + } + label={`Duplicates${duplicateStatistics ? ` (${duplicateStatistics.total_duplicate_groups})` : ''}`} + iconPosition="start" + /> + + + + {/* Failed OCR Tab Content */} + {currentTab === 0 && ( + <> + {/* Statistics Overview */} + {statistics && ( @@ -435,6 +543,174 @@ const FailedOcrPage: React.FC = () => { )} )} + + )} + + {/* Duplicates Tab Content */} + {currentTab === 1 && ( + <> + {/* Duplicate Statistics Overview */} + {duplicateStatistics && ( + + + + + + + Total Duplicate Groups + + + {duplicateStatistics.total_duplicate_groups} + + + + + + )} + + {duplicatesLoading ? ( + + + + ) : duplicates.length === 0 ? ( + + No duplicates found! + You don't have any duplicate documents. All your files have unique content. + + ) : ( + <> + + Duplicate Documents + These documents have identical content but may have different filenames. + You can click on each group to see all the documents with the same content. + + + + + + + + Content Hash + Duplicate Count + First Uploaded + Last Uploaded + Actions + + + + {duplicates.map((group) => ( + + + + toggleDuplicateGroupExpansion(group.file_hash)} + > + {expandedDuplicateGroups.has(group.file_hash) ? : } + + + + + {group.file_hash.substring(0, 16)}... + + + + + + + + {format(new Date(group.first_uploaded), 'MMM dd, yyyy')} + + + + + {format(new Date(group.last_uploaded), 'MMM dd, yyyy')} + + + + + View files below + + + + + + + + + Duplicate Files ({group.duplicate_count} total) + + + {group.documents.map((doc, index) => ( + + + + + {doc.filename} + + {doc.original_filename !== doc.filename && ( + + Original: {doc.original_filename} + + )} + + {formatFileSize(doc.file_size)} • {doc.mime_type} + + + Uploaded: {format(new Date(doc.created_at), 'MMM dd, yyyy HH:mm')} + + + + window.open(`/api/documents/${doc.id}/view`, '_blank')} + > + + + + + window.open(`/api/documents/${doc.id}/download`, '_blank')} + > + + + + + + + + ))} + + + + + + + ))} + +
+
+ + {/* Duplicates Pagination */} + {duplicatesTotalPages > 1 && ( + + setDuplicatesPagination(prev => ({ ...prev, page }))} + color="primary" + /> + + )} + + )} + + )} {/* Document Details Dialog */} { + return api.get(`/documents/duplicates`, { + params: { limit, offset }, + }) + }, + search: (searchRequest: SearchRequest) => { return api.get('/search', { params: searchRequest, diff --git a/src/db/documents.rs b/src/db/documents.rs index a497741..12d946d 100644 --- a/src/db/documents.rs +++ b/src/db/documents.rs @@ -991,6 +991,7 @@ impl Database { created_at: row.get("created_at"), updated_at: row.get("updated_at"), user_id: row.get("user_id"), + file_hash: row.get("file_hash"), }); } @@ -1125,6 +1126,7 @@ impl Database { created_at: row.get("created_at"), updated_at: row.get("updated_at"), user_id: row.get("user_id"), + file_hash: row.get("file_hash"), })), None => Ok(None), } @@ -1170,4 +1172,124 @@ impl Database { None => Ok(None), } } + + /// Get documents grouped by duplicate hashes for a user + pub async fn get_user_duplicates(&self, user_id: Uuid, user_role: crate::models::UserRole, limit: i64, offset: i64) -> Result<(Vec, i64)> { + let (docs_query, count_query) = if user_role == crate::models::UserRole::Admin { + // Admins can see all duplicates + ( + r#" + SELECT + file_hash, + COUNT(*) as duplicate_count, + MIN(created_at) as first_uploaded, + MAX(created_at) as last_uploaded, + json_agg( + json_build_object( + 'id', id, + 'filename', filename, + 'original_filename', original_filename, + 'file_size', file_size, + 'mime_type', mime_type, + 'created_at', created_at, + 'user_id', user_id + ) ORDER BY created_at + ) as documents + FROM documents + WHERE file_hash IS NOT NULL + GROUP BY file_hash + HAVING COUNT(*) > 1 + ORDER BY duplicate_count DESC, first_uploaded DESC + LIMIT $1 OFFSET $2 + "#, + r#" + SELECT COUNT(*) as total FROM ( + SELECT file_hash + FROM documents + WHERE file_hash IS NOT NULL + GROUP BY file_hash + HAVING COUNT(*) > 1 + ) as duplicate_groups + "# + ) + } else { + // Regular users see only their own duplicates + ( + r#" + SELECT + file_hash, + COUNT(*) as duplicate_count, + MIN(created_at) as first_uploaded, + MAX(created_at) as last_uploaded, + json_agg( + json_build_object( + 'id', id, + 'filename', filename, + 'original_filename', original_filename, + 'file_size', file_size, + 'mime_type', mime_type, + 'created_at', created_at, + 'user_id', user_id + ) ORDER BY created_at + ) as documents + FROM documents + WHERE user_id = $3 AND file_hash IS NOT NULL + GROUP BY file_hash + HAVING COUNT(*) > 1 + ORDER BY duplicate_count DESC, first_uploaded DESC + LIMIT $1 OFFSET $2 + "#, + r#" + SELECT COUNT(*) as total FROM ( + SELECT file_hash + FROM documents + WHERE user_id = $1 AND file_hash IS NOT NULL + GROUP BY file_hash + HAVING COUNT(*) > 1 + ) as duplicate_groups + "# + ) + }; + + let rows = if user_role == crate::models::UserRole::Admin { + sqlx::query(docs_query) + .bind(limit) + .bind(offset) + .fetch_all(&self.pool) + .await? + } else { + sqlx::query(docs_query) + .bind(limit) + .bind(offset) + .bind(user_id) + .fetch_all(&self.pool) + .await? 
+ }; + + let duplicates: Vec = rows + .into_iter() + .map(|row| { + serde_json::json!({ + "file_hash": row.get::("file_hash"), + "duplicate_count": row.get::("duplicate_count"), + "first_uploaded": row.get::, _>("first_uploaded"), + "last_uploaded": row.get::, _>("last_uploaded"), + "documents": row.get::("documents") + }) + }) + .collect(); + + let total = if user_role == crate::models::UserRole::Admin { + sqlx::query_scalar::<_, i64>(count_query) + .fetch_one(&self.pool) + .await? + } else { + sqlx::query_scalar::<_, i64>(count_query) + .bind(user_id) + .fetch_one(&self.pool) + .await? + }; + + Ok((duplicates, total)) + } } \ No newline at end of file diff --git a/src/routes/documents.rs b/src/routes/documents.rs index 52a4491..316b1dd 100644 --- a/src/routes/documents.rs +++ b/src/routes/documents.rs @@ -37,6 +37,7 @@ pub fn router() -> Router> { .route("/{id}/processed-image", get(get_processed_image)) .route("/{id}/retry-ocr", post(retry_ocr)) .route("/failed-ocr", get(get_failed_ocr_documents)) + .route("/duplicates", get(get_user_duplicates)) } #[utoipa::path( @@ -226,7 +227,7 @@ fn calculate_file_hash(data: &[u8]) -> String { ("ocr_status" = Option, Query, description = "Filter by OCR status (pending, processing, completed, failed)") ), responses( - (status = 200, description = "List of user documents", body = Vec), + (status = 200, description = "Paginated list of user documents with metadata", body = String), (status = 401, description = "Unauthorized") ) )] @@ -809,4 +810,50 @@ async fn get_failure_statistics( .collect(); Ok(serde_json::json!(categories)) +} + +#[utoipa::path( + get, + path = "/api/documents/duplicates", + tag = "documents", + security( + ("bearer_auth" = []) + ), + params( + ("limit" = Option, Query, description = "Number of duplicate groups to return per page"), + ("offset" = Option, Query, description = "Number of duplicate groups to skip") + ), + responses( + (status = 200, description = "User's duplicate documents grouped by hash", body = String), + (status = 401, description = "Unauthorized") + ) +)] +async fn get_user_duplicates( + State(state): State>, + auth_user: AuthUser, + Query(query): Query, +) -> Result, StatusCode> { + let limit = query.limit.unwrap_or(25); + let offset = query.offset.unwrap_or(0); + + let (duplicates, total_count) = state + .db + .get_user_duplicates(auth_user.user.id, auth_user.user.role, limit, offset) + .await + .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; + + let response = serde_json::json!({ + "duplicates": duplicates, + "pagination": { + "total": total_count, + "limit": limit, + "offset": offset, + "has_more": offset + limit < total_count + }, + "statistics": { + "total_duplicate_groups": total_count + } + }); + + Ok(Json(response)) } \ No newline at end of file diff --git a/src/swagger.rs b/src/swagger.rs index dd74b3d..532c457 100644 --- a/src/swagger.rs +++ b/src/swagger.rs @@ -8,7 +8,12 @@ use crate::{ models::{ CreateUser, LoginRequest, LoginResponse, UserResponse, UpdateUser, DocumentResponse, SearchRequest, SearchResponse, EnhancedDocumentResponse, - SettingsResponse, UpdateSettings, SearchMode, SearchSnippet, HighlightRange + SettingsResponse, UpdateSettings, SearchMode, SearchSnippet, HighlightRange, + FacetItem, SearchFacetsResponse, Notification, NotificationSummary, CreateNotification, + Source, SourceResponse, CreateSource, UpdateSource, SourceWithStats, + WebDAVSourceConfig, LocalFolderSourceConfig, S3SourceConfig, + WebDAVCrawlEstimate, WebDAVTestConnection, WebDAVConnectionResult, WebDAVSyncStatus, + 
ProcessedImage, CreateProcessedImage }, routes::metrics::{ SystemMetrics, DatabaseMetrics, OcrMetrics, DocumentMetrics, UserMetrics, GeneralSystemMetrics @@ -26,10 +31,19 @@ use crate::{ // Document endpoints crate::routes::documents::upload_document, crate::routes::documents::list_documents, + crate::routes::documents::get_document_by_id, crate::routes::documents::download_document, + crate::routes::documents::view_document, + crate::routes::documents::get_document_thumbnail, + crate::routes::documents::get_document_ocr, + crate::routes::documents::get_processed_image, + crate::routes::documents::retry_ocr, + crate::routes::documents::get_failed_ocr_documents, + crate::routes::documents::get_user_duplicates, // Search endpoints crate::routes::search::search_documents, crate::routes::search::enhanced_search_documents, + crate::routes::search::get_search_facets, // Settings endpoints crate::routes::settings::get_settings, crate::routes::settings::update_settings, @@ -42,14 +56,46 @@ use crate::{ // Queue endpoints crate::routes::queue::get_queue_stats, crate::routes::queue::requeue_failed, + crate::routes::queue::get_ocr_status, + crate::routes::queue::pause_ocr_processing, + crate::routes::queue::resume_ocr_processing, // Metrics endpoints crate::routes::metrics::get_system_metrics, + // Notifications endpoints + crate::routes::notifications::get_notifications, + crate::routes::notifications::get_notification_summary, + crate::routes::notifications::mark_notification_read, + crate::routes::notifications::mark_all_notifications_read, + crate::routes::notifications::delete_notification, + // Sources endpoints + crate::routes::sources::list_sources, + crate::routes::sources::create_source, + crate::routes::sources::get_source, + crate::routes::sources::update_source, + crate::routes::sources::delete_source, + crate::routes::sources::trigger_sync, + crate::routes::sources::stop_sync, + crate::routes::sources::test_connection, + crate::routes::sources::estimate_crawl, + crate::routes::sources::estimate_crawl_with_config, + crate::routes::sources::test_connection_with_config, + // WebDAV endpoints + crate::routes::webdav::start_webdav_sync, + crate::routes::webdav::cancel_webdav_sync, + crate::routes::webdav::get_webdav_sync_status, + crate::routes::webdav::test_webdav_connection, + crate::routes::webdav::estimate_webdav_crawl, ), components( schemas( CreateUser, LoginRequest, LoginResponse, UserResponse, UpdateUser, DocumentResponse, SearchRequest, SearchResponse, EnhancedDocumentResponse, SettingsResponse, UpdateSettings, SearchMode, SearchSnippet, HighlightRange, + FacetItem, SearchFacetsResponse, Notification, NotificationSummary, CreateNotification, + Source, SourceResponse, CreateSource, UpdateSource, SourceWithStats, + WebDAVSourceConfig, LocalFolderSourceConfig, S3SourceConfig, + WebDAVCrawlEstimate, WebDAVTestConnection, WebDAVConnectionResult, WebDAVSyncStatus, + ProcessedImage, CreateProcessedImage, SystemMetrics, DatabaseMetrics, OcrMetrics, DocumentMetrics, UserMetrics, GeneralSystemMetrics ) ), @@ -61,6 +107,9 @@ use crate::{ (name = "users", description = "User management endpoints"), (name = "queue", description = "OCR queue management endpoints"), (name = "metrics", description = "System metrics and monitoring endpoints"), + (name = "notifications", description = "User notification endpoints"), + (name = "sources", description = "Document source management endpoints"), + (name = "webdav", description = "WebDAV synchronization endpoints"), ), modifiers(&SecurityAddon), info( diff 
--git a/src/tests/db_tests.rs b/src/tests/db_tests.rs index 449da94..2526a2b 100644 --- a/src/tests/db_tests.rs +++ b/src/tests/db_tests.rs @@ -48,6 +48,7 @@ mod tests { created_at: Utc::now(), updated_at: Utc::now(), user_id, + file_hash: Some("abcd1234567890123456789012345678901234567890123456789012345678".to_string()), } } diff --git a/src/tests/documents_tests.rs b/src/tests/documents_tests.rs index d299d14..095d159 100644 --- a/src/tests/documents_tests.rs +++ b/src/tests/documents_tests.rs @@ -25,6 +25,7 @@ mod tests { created_at: Utc::now(), updated_at: Utc::now(), user_id, + file_hash: Some("1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef".to_string()), } } @@ -48,6 +49,7 @@ mod tests { created_at: Utc::now(), updated_at: Utc::now(), user_id, + file_hash: Some("fedcba0987654321fedcba0987654321fedcba0987654321fedcba0987654321".to_string()), } } @@ -71,6 +73,7 @@ mod tests { created_at: Utc::now(), updated_at: Utc::now(), user_id, + file_hash: Some("abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890".to_string()), } } diff --git a/src/tests/enhanced_search_tests.rs b/src/tests/enhanced_search_tests.rs index 8e00527..d072074 100644 --- a/src/tests/enhanced_search_tests.rs +++ b/src/tests/enhanced_search_tests.rs @@ -938,6 +938,7 @@ mod tests { created_at: Utc::now(), updated_at: Utc::now(), user_id: user.id, + file_hash: Some("0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef".to_string()), }; db.create_document(document).await.unwrap(); diff --git a/tests/document_upload_hash_duplicate_tests.rs b/tests/document_upload_hash_duplicate_tests.rs new file mode 100644 index 0000000..6fe8d9d --- /dev/null +++ b/tests/document_upload_hash_duplicate_tests.rs @@ -0,0 +1,391 @@ +use anyhow::Result; +use chrono::Utc; +use std::sync::Arc; +use uuid::Uuid; +use sha2::{Sha256, Digest}; + +use readur::{ + AppState, + db::Database, + config::Config, + models::{Document, CreateUser, UserRole}, +}; + +// Helper function to calculate file hash +fn calculate_file_hash(data: &[u8]) -> String { + let mut hasher = Sha256::new(); + hasher.update(data); + let result = hasher.finalize(); + format!("{:x}", result) +} + +// Helper function to create test document +fn create_test_document(user_id: Uuid, filename: &str, file_hash: String) -> Document { + Document { + id: Uuid::new_v4(), + filename: filename.to_string(), + original_filename: filename.to_string(), + file_path: format!("/tmp/{}", filename), + file_size: 1024, + mime_type: "application/pdf".to_string(), + content: None, + ocr_text: None, + ocr_confidence: None, + ocr_word_count: None, + ocr_processing_time_ms: None, + ocr_status: Some("pending".to_string()), + ocr_error: None, + ocr_completed_at: None, + tags: Vec::new(), + created_at: Utc::now(), + updated_at: Utc::now(), + user_id, + file_hash: Some(file_hash), + } +} + +// Helper function to create test user with unique identifier +fn create_test_user_with_suffix(suffix: &str) -> CreateUser { + CreateUser { + username: format!("testuser_{}", suffix), + email: format!("test_{}@example.com", suffix), + password: "test_password".to_string(), + role: Some(UserRole::User), + } +} + +async fn create_test_app_state() -> Result> { + let config = Config::from_env().unwrap_or_else(|_| { + // Create a test config if env fails + Config { + database_url: "postgresql://readur:readur@localhost:5432/readur".to_string(), + server_address: "127.0.0.1:8000".to_string(), + jwt_secret: "test-secret".to_string(), + upload_path: "./test-uploads".to_string(), + watch_folder: 
"./test-watch".to_string(), + allowed_file_types: vec!["pdf".to_string(), "txt".to_string()], + watch_interval_seconds: Some(30), + file_stability_check_ms: Some(500), + max_file_age_hours: None, + ocr_language: "eng".to_string(), + concurrent_ocr_jobs: 2, + ocr_timeout_seconds: 60, + max_file_size_mb: 10, + memory_limit_mb: 256, + cpu_priority: "normal".to_string(), + } + }); + let db = Database::new("postgresql://readur:readur@localhost:5432/readur").await?; + let queue_service = std::sync::Arc::new( + readur::ocr_queue::OcrQueueService::new(db.clone(), db.get_pool().clone(), 1) + ); + + Ok(Arc::new(AppState { + db, + config, + webdav_scheduler: None, + source_scheduler: None, + queue_service, + })) +} + +#[tokio::test] +async fn test_document_upload_duplicate_detection_returns_existing() -> Result<()> { + let state = create_test_app_state().await?; + let user = create_test_user_with_suffix(&format!("upload_{}", uuid::Uuid::new_v4().simple())); + + // Create user in database + let created_user = state.db.create_user(user).await?; + let user_id = created_user.id; + + // Test content + let test_content = b"This is test PDF content for upload duplicate detection"; + let file_hash = calculate_file_hash(test_content); + + // Create existing document with same hash + let existing_doc = create_test_document(user_id, "existing.pdf", file_hash.clone()); + let created_doc = state.db.create_document(existing_doc).await?; + + // Test that the hash lookup would find the existing document + let duplicate_check = state.db.get_document_by_user_and_hash(user_id, &file_hash).await?; + assert!(duplicate_check.is_some(), "Should find existing document with same hash"); + + let found_doc = duplicate_check.unwrap(); + assert_eq!(found_doc.id, created_doc.id); + assert_eq!(found_doc.file_hash, Some(file_hash)); + + Ok(()) +} + +#[tokio::test] +async fn test_document_upload_unique_content_processed() -> Result<()> { + let state = create_test_app_state().await?; + let user = create_test_user_with_suffix(&format!("upload_{}", uuid::Uuid::new_v4().simple())); + + // Create user in database + let created_user = state.db.create_user(user).await?; + let user_id = created_user.id; + + // Test content + let test_content = b"This is unique PDF content for upload processing"; + let file_hash = calculate_file_hash(test_content); + + // Verify no existing document with this hash + let duplicate_check = state.db.get_document_by_user_and_hash(user_id, &file_hash).await?; + assert!(duplicate_check.is_none(), "Should not find any existing document with this hash"); + + Ok(()) +} + +#[tokio::test] +async fn test_document_upload_different_users_same_content() -> Result<()> { + let state = create_test_app_state().await?; + + // Create two users + let user1 = create_test_user_with_suffix(&format!("different_users_1_{}", Uuid::new_v4().simple())); + let created_user1 = state.db.create_user(user1).await?; + let user1_id = created_user1.id; + + let user2 = create_test_user_with_suffix(&format!("different_users_2_{}", Uuid::new_v4().simple())); + let created_user2 = state.db.create_user(user2).await?; + let user2_id = created_user2.id; + + // Test content + let test_content = b"Shared content between different users for upload"; + let file_hash = calculate_file_hash(test_content); + + // Create document for user1 with this hash + let user1_doc = create_test_document(user1_id, "user1.pdf", file_hash.clone()); + state.db.create_document(user1_doc).await?; + + // Check that user2 doesn't see user1's document as duplicate + let 
duplicate_check = state.db.get_document_by_user_and_hash(user2_id, &file_hash).await?; + assert!(duplicate_check.is_none(), "User2 should not see user1's document as duplicate"); + + // User2 should be able to create their own document with same hash + let user2_doc = create_test_document(user2_id, "user2.pdf", file_hash.clone()); + let result = state.db.create_document(user2_doc).await; + assert!(result.is_ok(), "User2 should be able to create document with same hash"); + + Ok(()) +} + +#[tokio::test] +async fn test_document_upload_hash_calculation_accuracy() -> Result<()> { + // Test various file contents and ensure hash calculation is accurate + let test_cases = vec![ + (b"" as &[u8], "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"), // Empty + (b"a", "ca978112ca1bbdcafac231b39a23dc4da786eff8147c4e72b9807785afee48bb"), // Single char + (b"Hello, World!", "dffd6021bb2bd5b0af676290809ec3a53191dd81c7f70a4b28688a362182986f"), // Text + ]; + + for (content, expected_hash) in test_cases { + let calculated_hash = calculate_file_hash(content); + assert_eq!(calculated_hash, expected_hash, "Hash mismatch for content: {:?}", content); + } + + Ok(()) +} + +#[tokio::test] +async fn test_document_upload_large_file_hash() -> Result<()> { + // Test hash calculation for larger files + let large_content = vec![b'X'; 1_000_000]; // 1MB of 'X' characters + + let hash1 = calculate_file_hash(&large_content); + let hash2 = calculate_file_hash(&large_content); + + // Hash should be consistent + assert_eq!(hash1, hash2); + assert_eq!(hash1.len(), 64); // SHA256 hex length + assert!(hash1.chars().all(|c| c.is_ascii_hexdigit())); + + Ok(()) +} + +#[tokio::test] +async fn test_document_upload_binary_content_hash() -> Result<()> { + // Test hash calculation for binary content + let mut binary_content = Vec::new(); + for i in 0..256 { + binary_content.push(i as u8); + } + + let hash = calculate_file_hash(&binary_content); + + assert_eq!(hash.len(), 64); + assert!(hash.chars().all(|c| c.is_ascii_hexdigit())); + + // Same binary content should produce same hash + let hash2 = calculate_file_hash(&binary_content); + assert_eq!(hash, hash2); + + Ok(()) +} + +#[tokio::test] +async fn test_document_upload_duplicate_prevention_database_constraint() -> Result<()> { + let state = create_test_app_state().await?; + let user = create_test_user_with_suffix(&format!("upload_{}", uuid::Uuid::new_v4().simple())); + // Create user in database and get the created user + let created_user = state.db.create_user(user).await?; + let user_id = created_user.id; + + let test_hash = "duplicate_upload_test_hash_123456789012345678901234567890123456"; + + // Create first document with the hash + let doc1 = create_test_document(user_id, "test1.pdf", test_hash.to_string()); + let result1 = state.db.create_document(doc1).await; + assert!(result1.is_ok(), "First document should be created successfully"); + + // Try to create second document with same hash for same user + let doc2 = create_test_document(user_id, "test2.pdf", test_hash.to_string()); + let result2 = state.db.create_document(doc2).await; + + // This should fail due to unique constraint + assert!(result2.is_err(), "Second document with same hash should fail"); + + Ok(()) +} + +#[tokio::test] +async fn test_document_upload_filename_vs_content_duplicate() -> Result<()> { + let state = create_test_app_state().await?; + let user = create_test_user_with_suffix(&format!("upload_{}", uuid::Uuid::new_v4().simple())); + // Create user in database and get the created user + let 
created_user = state.db.create_user(user).await?; + let user_id = created_user.id; + + // Same content, different filenames + let content = b"Same content, different names"; + let hash = calculate_file_hash(content); + + // Create first document + let doc1 = create_test_document(user_id, "document_v1.pdf", hash.clone()); + state.db.create_document(doc1).await?; + + // Check that same content is detected as duplicate regardless of filename + let duplicate_check = state.db.get_document_by_user_and_hash(user_id, &hash).await?; + assert!(duplicate_check.is_some(), "Same content should be detected as duplicate regardless of filename"); + + Ok(()) +} + +#[tokio::test] +async fn test_document_upload_unicode_content_hash() -> Result<()> { + // Test hash calculation with unicode content + let unicode_content = "Hello 世界 🌍 café naïve résumé".as_bytes(); + + let hash1 = calculate_file_hash(unicode_content); + let hash2 = calculate_file_hash(unicode_content); + + // Hash should be consistent for unicode content + assert_eq!(hash1, hash2); + assert_eq!(hash1.len(), 64); + assert!(hash1.chars().all(|c| c.is_ascii_hexdigit())); + + Ok(()) +} + +#[tokio::test] +async fn test_document_upload_concurrent_same_content() -> Result<()> { + let state = create_test_app_state().await?; + let user = create_test_user_with_suffix(&format!("upload_{}", uuid::Uuid::new_v4().simple())); + // Create user in database and get the created user + let created_user = state.db.create_user(user).await?; + let user_id = created_user.id; + + let test_content = b"Concurrent upload test content"; + let file_hash = calculate_file_hash(test_content); + + // Simulate concurrent uploads of same content + let mut handles = Vec::new(); + + for i in 0..5 { + let state_clone = state.clone(); + let hash_clone = file_hash.clone(); + + let handle = tokio::spawn(async move { + let doc = create_test_document(user_id, &format!("concurrent{}.pdf", i), hash_clone); + state_clone.db.create_document(doc).await + }); + + handles.push(handle); + } + + // Wait for all operations and count results + let mut success_count = 0; + let mut error_count = 0; + + for handle in handles { + match handle.await? 
{ + Ok(_) => success_count += 1, + Err(_) => error_count += 1, + } + } + + // Only one should succeed due to unique constraint + assert_eq!(success_count, 1, "Only one document should be created successfully"); + assert_eq!(error_count, 4, "Four operations should fail due to duplicate hash"); + + Ok(()) +} + +#[tokio::test] +async fn test_document_upload_mime_type_independence() -> Result<()> { + let state = create_test_app_state().await?; + let user = create_test_user_with_suffix(&format!("upload_{}", uuid::Uuid::new_v4().simple())); + // Create user in database and get the created user + let created_user = state.db.create_user(user).await?; + let user_id = created_user.id; + + let content = b"Same content, different perceived types"; + let hash = calculate_file_hash(content); + + // Create document as PDF + let mut pdf_doc = create_test_document(user_id, "test.pdf", hash.clone()); + pdf_doc.mime_type = "application/pdf".to_string(); + state.db.create_document(pdf_doc).await?; + + // Try to upload same content as text file - should be detected as duplicate + let duplicate_check = state.db.get_document_by_user_and_hash(user_id, &hash).await?; + assert!(duplicate_check.is_some(), "Same content should be detected as duplicate regardless of MIME type"); + + Ok(()) +} + +#[tokio::test] +async fn test_document_upload_performance_hash_lookup() -> Result<()> { + let state = create_test_app_state().await?; + let user = create_test_user_with_suffix(&format!("upload_{}", uuid::Uuid::new_v4().simple())); + // Create user in database and get the created user + let created_user = state.db.create_user(user).await?; + let user_id = created_user.id; + + // Create multiple documents with different hashes + let mut test_hashes = Vec::new(); + + for i in 0..50 { + let content = format!("Performance test content {}", i); + let hash = calculate_file_hash(content.as_bytes()); + test_hashes.push(hash.clone()); + + let doc = create_test_document(user_id, &format!("perf_test_{}.pdf", i), hash); + state.db.create_document(doc).await?; + } + + // Measure hash lookup performance + let start = std::time::Instant::now(); + + for hash in &test_hashes { + let result = state.db.get_document_by_user_and_hash(user_id, hash).await?; + assert!(result.is_some(), "Should find document with hash: {}", hash); + } + + let duration = start.elapsed(); + + // Hash lookups should be very fast + assert!(duration.as_millis() < 2000, "Hash lookups should be fast even with many documents: {:?}", duration); + + Ok(()) +} \ No newline at end of file diff --git a/tests/hash_duplicate_detection_tests.rs b/tests/hash_duplicate_detection_tests.rs new file mode 100644 index 0000000..c43f1ff --- /dev/null +++ b/tests/hash_duplicate_detection_tests.rs @@ -0,0 +1,276 @@ +use anyhow::Result; +use chrono::Utc; +use uuid::Uuid; +use sha2::{Sha256, Digest}; +use tempfile::TempDir; + +use readur::{ + db::Database, + file_service::FileService, + models::{Document, CreateUser, UserRole}, +}; + +const TEST_DB_URL: &str = "postgresql://readur:readur@localhost:5432/readur"; + +// Helper function to create a test user with unique identifier +async fn create_test_user(db: &Database, username: &str) -> Result { + let unique_suffix = Uuid::new_v4().simple(); + let user = CreateUser { + username: format!("{}_{}", username, unique_suffix), + email: format!("{}_{}@example.com", username, unique_suffix), + password: "password123".to_string(), + role: Some(UserRole::User), + }; + let created_user = db.create_user(user).await?; + Ok(created_user.id) +} + +// Helper 
function to calculate file hash +fn calculate_file_hash(data: &[u8]) -> String { + let mut hasher = Sha256::new(); + hasher.update(data); + let result = hasher.finalize(); + format!("{:x}", result) +} + +// Helper function to create a test document +fn create_test_document(user_id: Uuid, filename: &str, file_hash: Option) -> Document { + Document { + id: Uuid::new_v4(), + filename: filename.to_string(), + original_filename: filename.to_string(), + file_path: format!("/tmp/{}", filename), + file_size: 1024, + mime_type: "application/pdf".to_string(), + content: None, + ocr_text: None, + ocr_confidence: None, + ocr_word_count: None, + ocr_processing_time_ms: None, + ocr_status: Some("pending".to_string()), + ocr_error: None, + ocr_completed_at: None, + tags: Vec::new(), + created_at: Utc::now(), + updated_at: Utc::now(), + user_id, + file_hash, + } +} + +#[tokio::test] +async fn test_get_document_by_user_and_hash_found() -> Result<()> { + let db = Database::new(TEST_DB_URL).await?; + let user_id = create_test_user(&db, "testuser1").await?; + let file_hash = "abcd1234567890"; + + // Create a document with the hash + let document = create_test_document(user_id, "test.pdf", Some(file_hash.to_string())); + let created_doc = db.create_document(document).await?; + + // Test finding the document by hash + let found_doc = db.get_document_by_user_and_hash(user_id, file_hash).await?; + + assert!(found_doc.is_some()); + let found_doc = found_doc.unwrap(); + assert_eq!(found_doc.id, created_doc.id); + assert_eq!(found_doc.file_hash, Some(file_hash.to_string())); + assert_eq!(found_doc.user_id, user_id); + + Ok(()) +} + +#[tokio::test] +async fn test_get_document_by_user_and_hash_not_found() -> Result<()> { + let db = Database::new(TEST_DB_URL).await?; + let user_id = Uuid::new_v4(); + let non_existent_hash = "nonexistent1234567890"; + + // Test finding a non-existent hash + let found_doc = db.get_document_by_user_and_hash(user_id, non_existent_hash).await?; + + assert!(found_doc.is_none()); + + Ok(()) +} + +#[tokio::test] +async fn test_get_document_by_user_and_hash_different_user() -> Result<()> { + let db = Database::new(TEST_DB_URL).await?; + let user1_id = create_test_user(&db, "testuser2").await?; + let user2_id = create_test_user(&db, "testuser3").await?; + let file_hash = "shared_hash_1234567890"; + + // Create a document for user1 with the hash + let document = create_test_document(user1_id, "test.pdf", Some(file_hash.to_string())); + db.create_document(document).await?; + + // Test that user2 cannot find user1's document by hash + let found_doc = db.get_document_by_user_and_hash(user2_id, file_hash).await?; + + assert!(found_doc.is_none(), "User should not be able to access another user's documents"); + + Ok(()) +} + +#[tokio::test] +async fn test_duplicate_hash_prevention_same_user() -> Result<()> { + let db = Database::new(TEST_DB_URL).await?; + let user_id = create_test_user(&db, "testuser4").await?; + let file_hash = "duplicate_hash_1234567890"; + + // Create first document with the hash + let document1 = create_test_document(user_id, "test1.pdf", Some(file_hash.to_string())); + let result1 = db.create_document(document1).await; + assert!(result1.is_ok(), "First document with hash should be created successfully"); + + // Try to create second document with same hash for same user + let document2 = create_test_document(user_id, "test2.pdf", Some(file_hash.to_string())); + let result2 = db.create_document(document2).await; + + // This should fail due to unique constraint + 
assert!(result2.is_err(), "Second document with same hash for same user should fail"); + + Ok(()) +} + +#[tokio::test] +async fn test_same_hash_different_users_allowed() -> Result<()> { + let db = Database::new(TEST_DB_URL).await?; + let user1_id = create_test_user(&db, "testuser5").await?; + let user2_id = create_test_user(&db, "testuser6").await?; + let file_hash = "shared_content_hash_1234567890"; + + // Create document for user1 with the hash + let document1 = create_test_document(user1_id, "test1.pdf", Some(file_hash.to_string())); + let result1 = db.create_document(document1).await; + assert!(result1.is_ok(), "First user's document should be created successfully"); + + // Create document for user2 with same hash + let document2 = create_test_document(user2_id, "test2.pdf", Some(file_hash.to_string())); + let result2 = db.create_document(document2).await; + assert!(result2.is_ok(), "Second user's document with same hash should be allowed"); + + // Verify both users can find their respective documents + let found_doc1 = db.get_document_by_user_and_hash(user1_id, file_hash).await?; + let found_doc2 = db.get_document_by_user_and_hash(user2_id, file_hash).await?; + + assert!(found_doc1.is_some()); + assert!(found_doc2.is_some()); + assert_ne!(found_doc1.unwrap().id, found_doc2.unwrap().id); + + Ok(()) +} + +#[tokio::test] +async fn test_null_hash_allowed_multiple() -> Result<()> { + let db = Database::new(TEST_DB_URL).await?; + let user_id = create_test_user(&db, "testuser7").await?; + + // Create multiple documents with null hash (should be allowed) + let document1 = create_test_document(user_id, "test1.pdf", None); + let result1 = db.create_document(document1).await; + assert!(result1.is_ok(), "First document with null hash should be created"); + + let document2 = create_test_document(user_id, "test2.pdf", None); + let result2 = db.create_document(document2).await; + assert!(result2.is_ok(), "Second document with null hash should be created"); + + Ok(()) +} + +#[test] +fn test_calculate_file_hash_consistency() { + let test_data = b"Hello, World! 
This is test content for hash calculation."; + + // Calculate hash multiple times + let hash1 = calculate_file_hash(test_data); + let hash2 = calculate_file_hash(test_data); + let hash3 = calculate_file_hash(test_data); + + // All hashes should be identical + assert_eq!(hash1, hash2); + assert_eq!(hash2, hash3); + + // Hash should be 64 characters (SHA256 hex) + assert_eq!(hash1.len(), 64); + + // Should be valid hex + assert!(hash1.chars().all(|c| c.is_ascii_hexdigit())); +} + +#[test] +fn test_calculate_file_hash_different_content() { + let data1 = b"Content 1"; + let data2 = b"Content 2"; + let data3 = b"content 1"; // Different case + + let hash1 = calculate_file_hash(data1); + let hash2 = calculate_file_hash(data2); + let hash3 = calculate_file_hash(data3); + + // All hashes should be different + assert_ne!(hash1, hash2); + assert_ne!(hash1, hash3); + assert_ne!(hash2, hash3); +} + +#[test] +fn test_calculate_file_hash_empty_content() { + let empty_data = b""; + let hash = calculate_file_hash(empty_data); + + // Should produce a valid hash even for empty content + assert_eq!(hash.len(), 64); + assert!(hash.chars().all(|c| c.is_ascii_hexdigit())); + + // Known SHA256 hash of empty string + assert_eq!(hash, "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"); +} + +#[tokio::test] +async fn test_file_service_create_document_with_hash() { + let temp_dir = TempDir::new().unwrap(); + let upload_path = temp_dir.path().to_string_lossy().to_string(); + let file_service = FileService::new(upload_path); + let user_id = Uuid::new_v4(); + let test_hash = "test_hash_1234567890"; + + let document = file_service.create_document( + "test.pdf", + "original.pdf", + "/path/to/file.pdf", + 1024, + "application/pdf", + user_id, + Some(test_hash.to_string()), + ); + + assert_eq!(document.filename, "test.pdf"); + assert_eq!(document.original_filename, "original.pdf"); + assert_eq!(document.file_hash, Some(test_hash.to_string())); + assert_eq!(document.user_id, user_id); +} + +#[tokio::test] +async fn test_file_service_create_document_without_hash() { + let temp_dir = TempDir::new().unwrap(); + let upload_path = temp_dir.path().to_string_lossy().to_string(); + let file_service = FileService::new(upload_path); + let user_id = Uuid::new_v4(); + + let document = file_service.create_document( + "test.pdf", + "original.pdf", + "/path/to/file.pdf", + 1024, + "application/pdf", + user_id, + None, + ); + + assert_eq!(document.filename, "test.pdf"); + assert_eq!(document.original_filename, "original.pdf"); + assert_eq!(document.file_hash, None); + assert_eq!(document.user_id, user_id); +} \ No newline at end of file diff --git a/tests/source_sync_hash_duplicate_tests.rs b/tests/source_sync_hash_duplicate_tests.rs new file mode 100644 index 0000000..23af08c --- /dev/null +++ b/tests/source_sync_hash_duplicate_tests.rs @@ -0,0 +1,440 @@ +use anyhow::Result; +use chrono::Utc; +use std::sync::Arc; +use uuid::Uuid; +use sha2::{Sha256, Digest}; + +use readur::{ + AppState, + db::Database, + config::Config, + models::{FileInfo, Document, Source, SourceType, SourceStatus}, +}; + +// Helper function to calculate file hash +fn calculate_file_hash(data: &[u8]) -> String { + let mut hasher = Sha256::new(); + hasher.update(data); + let result = hasher.finalize(); + format!("{:x}", result) +} + +// Helper function to create test file info +fn create_test_file_info(name: &str, path: &str, content: &[u8]) -> FileInfo { + FileInfo { + name: name.to_string(), + path: path.to_string(), + size: content.len() as i64, + 
last_modified: Some(Utc::now()), + etag: "test-etag".to_string(), + mime_type: "application/pdf".to_string(), + is_directory: false, + } +} + +// Helper function to create test document +fn create_test_document(user_id: Uuid, filename: &str, file_hash: String) -> Document { + Document { + id: Uuid::new_v4(), + filename: filename.to_string(), + original_filename: filename.to_string(), + file_path: format!("/tmp/{}", filename), + file_size: 1024, + mime_type: "application/pdf".to_string(), + content: None, + ocr_text: None, + ocr_confidence: None, + ocr_word_count: None, + ocr_processing_time_ms: None, + ocr_status: Some("pending".to_string()), + ocr_error: None, + ocr_completed_at: None, + tags: Vec::new(), + created_at: Utc::now(), + updated_at: Utc::now(), + user_id, + file_hash: Some(file_hash), + } +} + +// Helper function to create test source +fn create_test_source(user_id: Uuid, source_type: SourceType) -> Source { + Source { + id: Uuid::new_v4(), + user_id, + name: "Test Source".to_string(), + source_type, + config: serde_json::json!({}), + status: SourceStatus::Idle, + enabled: true, + last_sync_at: None, + last_error: None, + last_error_at: None, + total_files_synced: 0, + total_files_pending: 0, + total_size_bytes: 0, + created_at: Utc::now(), + updated_at: Utc::now(), + } +} + +// Helper function to create a test user with unique identifier +async fn create_test_user(db: &Database, username: &str) -> Result { + use readur::models::{CreateUser, UserRole}; + let unique_suffix = Uuid::new_v4().simple(); + let user = CreateUser { + username: format!("{}_{}", username, unique_suffix), + email: format!("{}_{}@example.com", username, unique_suffix), + password: "password123".to_string(), + role: Some(UserRole::User), + }; + let created_user = db.create_user(user).await?; + Ok(created_user.id) +} + +async fn create_test_app_state() -> Result> { + let config = Config::from_env().unwrap_or_else(|_| { + Config { + database_url: "postgresql://readur:readur@localhost:5432/readur".to_string(), + server_address: "127.0.0.1:8000".to_string(), + jwt_secret: "test-secret".to_string(), + upload_path: "./test-uploads".to_string(), + watch_folder: "./test-watch".to_string(), + allowed_file_types: vec!["pdf".to_string(), "txt".to_string()], + watch_interval_seconds: Some(30), + file_stability_check_ms: Some(500), + max_file_age_hours: None, + ocr_language: "eng".to_string(), + concurrent_ocr_jobs: 2, + ocr_timeout_seconds: 60, + max_file_size_mb: 10, + memory_limit_mb: 256, + cpu_priority: "normal".to_string(), + } + }); + let db = Database::new("postgresql://readur:readur@localhost:5432/readur").await?; + let queue_service = std::sync::Arc::new( + readur::ocr_queue::OcrQueueService::new(db.clone(), db.get_pool().clone(), 1) + ); + + Ok(Arc::new(AppState { + db, + config, + webdav_scheduler: None, + source_scheduler: None, + queue_service, + })) +} + +#[tokio::test] +async fn test_source_sync_duplicate_detection_skips_duplicate() -> Result<()> { + let state = create_test_app_state().await?; + let user_id = create_test_user(&state.db, "source_sync_test").await?; + + // Test content + let test_content = b"This is test content for source sync duplicate detection"; + let file_hash = calculate_file_hash(test_content); + + // Create existing document with same hash + let existing_doc = create_test_document(user_id, "existing.pdf", file_hash.clone()); + state.db.create_document(existing_doc).await?; + + // Check if duplicate exists using the efficient method + let duplicate_check = 
state.db.get_document_by_user_and_hash(user_id, &file_hash).await?; + + assert!(duplicate_check.is_some(), "Should find existing document with same hash"); + + let found_doc = duplicate_check.unwrap(); + assert_eq!(found_doc.file_hash, Some(file_hash)); + assert_eq!(found_doc.user_id, user_id); + + Ok(()) +} + +#[tokio::test] +async fn test_source_sync_duplicate_detection_processes_unique() -> Result<()> { + let state = create_test_app_state().await?; + let user_id = create_test_user(&state.db, "source_sync_test").await?; + + // Test content + let test_content = b"This is unique content that should be processed by source sync"; + let file_hash = calculate_file_hash(test_content); + + // Verify no existing document with this hash + let duplicate_check = state.db.get_document_by_user_and_hash(user_id, &file_hash).await?; + assert!(duplicate_check.is_none(), "Should not find any existing document with this hash"); + + // This indicates the file would be processed normally + Ok(()) +} + +#[tokio::test] +async fn test_source_sync_duplicate_different_users() -> Result<()> { + let state = create_test_app_state().await?; + let user1_id = create_test_user(&state.db, "source_sync_user1").await?; + let user2_id = create_test_user(&state.db, "source_sync_user2").await?; + + // Test content + let test_content = b"Shared content between different users in source sync"; + let file_hash = calculate_file_hash(test_content); + + // Create document for user1 with this hash + let user1_doc = create_test_document(user1_id, "user1.pdf", file_hash.clone()); + state.db.create_document(user1_doc).await?; + + // Check that user2 doesn't see user1's document as duplicate + let duplicate_check = state.db.get_document_by_user_and_hash(user2_id, &file_hash).await?; + assert!(duplicate_check.is_none(), "User2 should not see user1's document as duplicate"); + + // User2 should be able to create their own document with same hash + let user2_doc = create_test_document(user2_id, "user2.pdf", file_hash.clone()); + let result = state.db.create_document(user2_doc).await; + assert!(result.is_ok(), "User2 should be able to create document with same hash"); + + Ok(()) +} + +#[tokio::test] +async fn test_source_sync_hash_calculation_consistency() -> Result<()> { + let test_content = b"Test content for hash consistency in source sync"; + + // Calculate hash multiple times + let hash1 = calculate_file_hash(test_content); + let hash2 = calculate_file_hash(test_content); + let hash3 = calculate_file_hash(test_content); + + // All hashes should be identical + assert_eq!(hash1, hash2); + assert_eq!(hash2, hash3); + + // Hash should be 64 characters (SHA256 hex) + assert_eq!(hash1.len(), 64); + + // Should be valid hex + assert!(hash1.chars().all(|c| c.is_ascii_hexdigit())); + + Ok(()) +} + +#[tokio::test] +async fn test_source_sync_duplicate_detection_performance() -> Result<()> { + let state = create_test_app_state().await?; + let user_id = create_test_user(&state.db, "source_sync_test").await?; + + // Create multiple documents with different hashes + let mut created_hashes = Vec::new(); + + for i in 0..10 { + let content = format!("Test content number {}", i); + let hash = calculate_file_hash(content.as_bytes()); + created_hashes.push(hash.clone()); + + let doc = create_test_document(user_id, &format!("test{}.pdf", i), hash); + state.db.create_document(doc).await?; + } + + // Test lookup performance - should be fast even with multiple documents + let start = std::time::Instant::now(); + + for hash in &created_hashes { + let result = 
state.db.get_document_by_user_and_hash(user_id, hash).await?; + assert!(result.is_some(), "Should find document with hash: {}", hash); + } + + let duration = start.elapsed(); + assert!(duration.as_millis() < 1000, "Hash lookups should be fast: {:?}", duration); + + Ok(()) +} + +#[tokio::test] +async fn test_source_sync_file_modification_detection() -> Result<()> { + let state = create_test_app_state().await?; + let user_id = create_test_user(&state.db, "source_sync_test").await?; + + // Original content + let original_content = b"Original file content"; + let original_hash = calculate_file_hash(original_content); + + // Modified content (same file, different content) + let modified_content = b"Modified file content"; + let modified_hash = calculate_file_hash(modified_content); + + // Create document with original content + let original_doc = create_test_document(user_id, "test.pdf", original_hash.clone()); + state.db.create_document(original_doc).await?; + + // Check original content is found + let original_check = state.db.get_document_by_user_and_hash(user_id, &original_hash).await?; + assert!(original_check.is_some(), "Should find document with original hash"); + + // Check modified content is not found (different hash) + let modified_check = state.db.get_document_by_user_and_hash(user_id, &modified_hash).await?; + assert!(modified_check.is_none(), "Should not find document with modified hash"); + + // Verify hashes are actually different + assert_ne!(original_hash, modified_hash, "Original and modified content should have different hashes"); + + Ok(()) +} + +#[tokio::test] +async fn test_source_sync_edge_case_empty_files() -> Result<()> { + let state = create_test_app_state().await?; + let user_id = create_test_user(&state.db, "source_sync_test").await?; + + // Empty file content + let empty_content = b""; + let empty_hash = calculate_file_hash(empty_content); + + // Create document with empty content + let empty_doc = create_test_document(user_id, "empty.pdf", empty_hash.clone()); + state.db.create_document(empty_doc).await?; + + // Check empty file is found + let empty_check = state.db.get_document_by_user_and_hash(user_id, &empty_hash).await?; + assert!(empty_check.is_some(), "Should find document with empty content hash"); + + // Verify empty hash is the known SHA256 empty string hash + assert_eq!(empty_hash, "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"); + + Ok(()) +} + +#[tokio::test] +async fn test_source_sync_large_file_hash_consistency() -> Result<()> { + // Simulate large file content + let large_content = vec![b'A'; 10_000_000]; // 10MB of 'A' characters + + // Calculate hash + let hash = calculate_file_hash(&large_content); + + // Hash should still be 64 characters + assert_eq!(hash.len(), 64); + assert!(hash.chars().all(|c| c.is_ascii_hexdigit())); + + // Calculate same hash again to ensure consistency + let hash2 = calculate_file_hash(&large_content); + assert_eq!(hash, hash2); + + Ok(()) +} + +#[tokio::test] +async fn test_source_sync_binary_file_handling() -> Result<()> { + let state = create_test_app_state().await?; + let user_id = create_test_user(&state.db, "source_sync_test").await?; + + // Binary content (PDF header + some binary data) + let mut binary_content = b"%PDF-1.4\n".to_vec(); + binary_content.extend_from_slice(&[0u8, 1u8, 2u8, 3u8, 255u8, 254u8, 253u8]); + + let binary_hash = calculate_file_hash(&binary_content); + + // Create document with binary content + let binary_doc = create_test_document(user_id, "binary.pdf", 
binary_hash.clone()); + state.db.create_document(binary_doc).await?; + + // Check binary file is found + let binary_check = state.db.get_document_by_user_and_hash(user_id, &binary_hash).await?; + assert!(binary_check.is_some(), "Should find document with binary content hash"); + + Ok(()) +} + +#[tokio::test] +async fn test_source_sync_unicode_filename_handling() -> Result<()> { + let state = create_test_app_state().await?; + let user_id = create_test_user(&state.db, "source_sync_test").await?; + + // Unicode content and filename + let unicode_content = "Test content with unicode: 测试内容 🚀 café".as_bytes(); + let unicode_hash = calculate_file_hash(unicode_content); + + // Create document with unicode filename + let unicode_doc = create_test_document(user_id, "测试文档🚀.pdf", unicode_hash.clone()); + state.db.create_document(unicode_doc).await?; + + // Check unicode file is found + let unicode_check = state.db.get_document_by_user_and_hash(user_id, &unicode_hash).await?; + assert!(unicode_check.is_some(), "Should find document with unicode content hash"); + + let found_doc = unicode_check.unwrap(); + assert_eq!(found_doc.filename, "测试文档🚀.pdf"); + + Ok(()) +} + +#[tokio::test] +async fn test_source_sync_concurrent_hash_operations() -> Result<()> { + let state = create_test_app_state().await?; + let user_id = create_test_user(&state.db, "source_sync_test").await?; + + // Create multiple concurrent hash lookup operations + let mut handles = Vec::new(); + + for i in 0..20 { + let state_clone = state.clone(); + let hash = format!("{}test_hash_concurrent_{}", "a".repeat(40), i); + + let handle = tokio::spawn(async move { + state_clone.db.get_document_by_user_and_hash(user_id, &hash).await + }); + + handles.push(handle); + } + + // Wait for all concurrent operations + let mut results = Vec::new(); + for handle in handles { + let result = handle.await??; + results.push(result); + } + + // All should return None (no documents exist with these hashes) + for (i, result) in results.iter().enumerate() { + assert!(result.is_none(), "Concurrent operation {} should return None", i); + } + + Ok(()) +} + +#[tokio::test] +async fn test_source_sync_duplicate_prevention_race_condition() -> Result<()> { + let state = create_test_app_state().await?; + let user_id = create_test_user(&state.db, "source_sync_test").await?; + + let test_hash = "race_condition_test_hash_123456789012345678901234567890123456"; + + // Try to create multiple documents with same hash concurrently + let mut handles = Vec::new(); + + for i in 0..5 { + let state_clone = state.clone(); + let hash_clone = test_hash.to_string(); + + let handle = tokio::spawn(async move { + let doc = create_test_document(user_id, &format!("test{}.pdf", i), hash_clone); + state_clone.db.create_document(doc).await + }); + + handles.push(handle); + } + + // Wait for all operations and count successes + let mut success_count = 0; + let mut error_count = 0; + + for handle in handles { + match handle.await? 
{ + Ok(_) => success_count += 1, + Err(_) => error_count += 1, + } + } + + // Only one should succeed due to unique constraint + assert_eq!(success_count, 1, "Only one document should be created successfully"); + assert_eq!(error_count, 4, "Four operations should fail due to duplicate hash"); + + Ok(()) +} \ No newline at end of file diff --git a/tests/webdav_hash_duplicate_tests.rs b/tests/webdav_hash_duplicate_tests.rs new file mode 100644 index 0000000..4af34e4 --- /dev/null +++ b/tests/webdav_hash_duplicate_tests.rs @@ -0,0 +1,389 @@ +use anyhow::Result; +use chrono::Utc; +use std::sync::Arc; +use uuid::Uuid; +use sha2::{Sha256, Digest}; + +use readur::{ + AppState, + db::Database, + config::Config, + models::{FileInfo, CreateWebDAVFile, Document}, +}; + +// Helper function to calculate file hash +fn calculate_file_hash(data: &[u8]) -> String { + let mut hasher = Sha256::new(); + hasher.update(data); + let result = hasher.finalize(); + format!("{:x}", result) +} + +// Helper function to create test file info +fn create_test_file_info(name: &str, path: &str, size: i64) -> FileInfo { + FileInfo { + name: name.to_string(), + path: path.to_string(), + size, + last_modified: Some(Utc::now()), + etag: "test-etag".to_string(), + mime_type: "application/pdf".to_string(), + is_directory: false, + } +} + +// Helper function to create test document +fn create_test_document(user_id: Uuid, filename: &str, file_hash: String) -> Document { + Document { + id: Uuid::new_v4(), + filename: filename.to_string(), + original_filename: filename.to_string(), + file_path: format!("/tmp/{}", filename), + file_size: 1024, + mime_type: "application/pdf".to_string(), + content: None, + ocr_text: None, + ocr_confidence: None, + ocr_word_count: None, + ocr_processing_time_ms: None, + ocr_status: Some("pending".to_string()), + ocr_error: None, + ocr_completed_at: None, + tags: Vec::new(), + created_at: Utc::now(), + updated_at: Utc::now(), + user_id, + file_hash: Some(file_hash), + } +} + +// Mock WebDAV service for testing +#[derive(Clone)] +struct MockWebDAVService { + pub test_files: std::collections::HashMap>, +} + +impl MockWebDAVService { + fn new() -> Self { + Self { + test_files: std::collections::HashMap::new(), + } + } + + fn add_test_file(&mut self, path: &str, content: Vec) { + self.test_files.insert(path.to_string(), content); + } + + async fn download_file(&self, path: &str) -> Result> { + self.test_files + .get(path) + .cloned() + .ok_or_else(|| anyhow::anyhow!("File not found: {}", path)) + } +} + +// Helper function to create a test user with unique identifier +async fn create_test_user(db: &Database, username: &str) -> Result { + use readur::models::{CreateUser, UserRole}; + let unique_suffix = Uuid::new_v4().simple(); + let user = CreateUser { + username: format!("{}_{}", username, unique_suffix), + email: format!("{}_{}@example.com", username, unique_suffix), + password: "password123".to_string(), + role: Some(UserRole::User), + }; + let created_user = db.create_user(user).await?; + Ok(created_user.id) +} + +async fn create_test_app_state() -> Result> { + let config = Config::from_env().unwrap_or_else(|_| { + Config { + database_url: "postgresql://readur:readur@localhost:5432/readur".to_string(), + server_address: "127.0.0.1:8000".to_string(), + jwt_secret: "test-secret".to_string(), + upload_path: "./test-uploads".to_string(), + watch_folder: "./test-watch".to_string(), + allowed_file_types: vec!["pdf".to_string(), "txt".to_string()], + watch_interval_seconds: Some(30), + file_stability_check_ms: 
+            max_file_age_hours: None,
+            ocr_language: "eng".to_string(),
+            concurrent_ocr_jobs: 2,
+            ocr_timeout_seconds: 60,
+            max_file_size_mb: 10,
+            memory_limit_mb: 256,
+            cpu_priority: "normal".to_string(),
+        }
+    });
+    let db = Database::new("postgresql://readur:readur@localhost:5432/readur").await?;
+    let queue_service = std::sync::Arc::new(
+        readur::ocr_queue::OcrQueueService::new(db.clone(), db.get_pool().clone(), 1)
+    );
+
+    Ok(Arc::new(AppState {
+        db,
+        config,
+        webdav_scheduler: None,
+        source_scheduler: None,
+        queue_service,
+    }))
+}
+
+#[tokio::test]
+async fn test_webdav_sync_duplicate_detection_skips_duplicate() -> Result<()> {
+    let state = create_test_app_state().await?;
+    let user_id = create_test_user(&state.db, "webdav_test").await?;
+
+    // Test content
+    let test_content = b"This is test PDF content for duplicate detection";
+    let file_hash = calculate_file_hash(test_content);
+
+    // Create existing document with same hash
+    let existing_doc = create_test_document(user_id, "existing.pdf", file_hash.clone());
+    state.db.create_document(existing_doc).await?;
+
+    // Setup mock WebDAV service
+    let mut webdav_service = MockWebDAVService::new();
+    webdav_service.add_test_file("/test/duplicate.pdf", test_content.to_vec());
+
+    // Create file info for the duplicate file
+    let file_info = create_test_file_info("duplicate.pdf", "/test/duplicate.pdf", test_content.len() as i64);
+
+    // Create a mock process_single_file function (since the actual one is private)
+    // We'll test the duplicate detection logic directly
+
+    // Check if duplicate exists using the new efficient method
+    let duplicate_check = state.db.get_document_by_user_and_hash(user_id, &file_hash).await?;
+
+    assert!(duplicate_check.is_some(), "Should find existing document with same hash");
+
+    let found_doc = duplicate_check.unwrap();
+    assert_eq!(found_doc.file_hash, Some(file_hash));
+    assert_eq!(found_doc.user_id, user_id);
+
+    // Verify that WebDAV tracking would record this as a duplicate
+    let webdav_file = CreateWebDAVFile {
+        user_id,
+        webdav_path: file_info.path.clone(),
+        etag: file_info.etag.clone(),
+        last_modified: file_info.last_modified,
+        file_size: file_info.size,
+        mime_type: file_info.mime_type.clone(),
+        document_id: Some(found_doc.id),
+        sync_status: "duplicate_content".to_string(),
+        sync_error: None,
+    };
+
+    let created_webdav_file = state.db.create_or_update_webdav_file(&webdav_file).await?;
+    assert_eq!(created_webdav_file.sync_status, "duplicate_content");
+    assert_eq!(created_webdav_file.document_id, Some(found_doc.id));
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_webdav_sync_duplicate_detection_processes_unique() -> Result<()> {
+    let state = create_test_app_state().await?;
+    let user_id = create_test_user(&state.db, "webdav_test").await?;
+
+    // Test content
+    let test_content = b"This is unique PDF content that should be processed";
+    let file_hash = calculate_file_hash(test_content);
+
+    // Verify no existing document with this hash
+    let duplicate_check = state.db.get_document_by_user_and_hash(user_id, &file_hash).await?;
+    assert!(duplicate_check.is_none(), "Should not find any existing document with this hash");
+
+    // This indicates the file would be processed normally
+    // In the actual sync, this would proceed to save the file and create a new document
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_webdav_sync_duplicate_different_users() -> Result<()> {
+    let state = create_test_app_state().await?;
+    let user1_id = create_test_user(&state.db, "webdav_user1").await?;
+    let user2_id = create_test_user(&state.db, "webdav_user2").await?;
+
+    // Test content
+    let test_content = b"Shared content between different users";
+    let file_hash = calculate_file_hash(test_content);
+
+    // Create document for user1 with this hash
+    let user1_doc = create_test_document(user1_id, "user1.pdf", file_hash.clone());
+    state.db.create_document(user1_doc).await?;
+
+    // Check that user2 doesn't see user1's document as duplicate
+    let duplicate_check = state.db.get_document_by_user_and_hash(user2_id, &file_hash).await?;
+    assert!(duplicate_check.is_none(), "User2 should not see user1's document as duplicate");
+
+    // User2 should be able to create their own document with same hash
+    let user2_doc = create_test_document(user2_id, "user2.pdf", file_hash.clone());
+    let result = state.db.create_document(user2_doc).await;
+    assert!(result.is_ok(), "User2 should be able to create document with same hash");
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_webdav_sync_etag_change_detection() -> Result<()> {
+    let state = create_test_app_state().await?;
+    let user_id = create_test_user(&state.db, "webdav_test").await?;
+
+    let webdav_path = "/test/updated.pdf";
+    let old_etag = "old-etag-123";
+    let new_etag = "new-etag-456";
+
+    // Create a document first
+    let test_doc = create_test_document(user_id, "updated.pdf", "etag_test_hash_1234567890".to_string());
+    let created_doc = state.db.create_document(test_doc).await?;
+
+    // Create initial WebDAV file record
+    let initial_webdav_file = CreateWebDAVFile {
+        user_id,
+        webdav_path: webdav_path.to_string(),
+        etag: old_etag.to_string(),
+        last_modified: Some(Utc::now()),
+        file_size: 1024,
+        mime_type: "application/pdf".to_string(),
+        document_id: Some(created_doc.id),
+        sync_status: "synced".to_string(),
+        sync_error: None,
+    };
+
+    state.db.create_or_update_webdav_file(&initial_webdav_file).await?;
+
+    // Check existing WebDAV file
+    let existing_file = state.db.get_webdav_file_by_path(user_id, webdav_path).await?;
+    assert!(existing_file.is_some());
+
+    let existing_file = existing_file.unwrap();
+    assert_eq!(existing_file.etag, old_etag);
+
+    // Simulate file with new ETag (indicating change)
+    let file_info = FileInfo {
+        name: "updated.pdf".to_string(),
+        path: webdav_path.to_string(),
+        size: 1024,
+        last_modified: Some(Utc::now()),
+        etag: new_etag.to_string(),
+        mime_type: "application/pdf".to_string(),
+        is_directory: false,
+    };
+
+    // ETag comparison should detect change
+    assert_ne!(existing_file.etag, file_info.etag, "ETag change should be detected");
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_webdav_sync_hash_collision_prevention() -> Result<()> {
+    let state = create_test_app_state().await?;
+    let user_id = create_test_user(&state.db, "webdav_test").await?;
+
+    // Create document with specific hash
+    let test_hash = "abcd1234567890123456789012345678901234567890123456789012345678";
+    let document = create_test_document(user_id, "original.pdf", test_hash.to_string());
+    state.db.create_document(document).await?;
+
+    // Try to create another document with same hash (should fail due to unique constraint)
+    let duplicate_document = create_test_document(user_id, "duplicate.pdf", test_hash.to_string());
+    let result = state.db.create_document(duplicate_document).await;
+
+    assert!(result.is_err(), "Should not be able to create duplicate hash for same user");
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_webdav_sync_file_content_vs_metadata_change() -> Result<()> {
+    let state = create_test_app_state().await?;
+    let user_id = create_test_user(&state.db, "webdav_test").await?;
+
+    // Original content and hash
+    let original_content = b"Original file content";
+    let original_hash = calculate_file_hash(original_content);
+
+    // Create original document
+    let original_doc = create_test_document(user_id, "test.pdf", original_hash.clone());
+    state.db.create_document(original_doc).await?;
+
+    // Same content but different metadata (name, etc.) - should still be detected as duplicate
+    let duplicate_check = state.db.get_document_by_user_and_hash(user_id, &original_hash).await?;
+    assert!(duplicate_check.is_some(), "Same content should be detected as duplicate regardless of filename");
+
+    // Different content - should not be duplicate
+    let different_content = b"Different file content";
+    let different_hash = calculate_file_hash(different_content);
+
+    let unique_check = state.db.get_document_by_user_and_hash(user_id, &different_hash).await?;
+    assert!(unique_check.is_none(), "Different content should not be detected as duplicate");
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_webdav_sync_error_handling_invalid_hash() -> Result<()> {
+    let state = create_test_app_state().await?;
+    let user_id = create_test_user(&state.db, "webdav_test").await?;
+
+    // Test with invalid hash formats
+    let invalid_g_hash = "g".repeat(64);
+    let invalid_hashes = vec![
+        "",                         // Empty
+        "short",                    // Too short
+        "invalid_characters_!@#$",  // Invalid characters
+        &invalid_g_hash,            // Invalid hex (contains 'g')
+    ];
+
+    for invalid_hash in invalid_hashes {
+        let result = state.db.get_document_by_user_and_hash(user_id, invalid_hash).await;
+        // Should handle gracefully - either return None or proper error
+        match result {
+            Ok(doc) => assert!(doc.is_none(), "Invalid hash should not match any document"),
+            Err(_) => {} // Acceptable to return error for invalid input
+        }
+    }
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_webdav_sync_concurrent_duplicate_detection() -> Result<()> {
+    let state = create_test_app_state().await?;
+    let user_id = create_test_user(&state.db, "webdav_test").await?;
+
+    let test_content = b"Concurrent test content";
+    let file_hash = calculate_file_hash(test_content);
+
+    // Simulate concurrent duplicate checks
+    let mut handles = Vec::new();
+
+    for i in 0..5 {
+        let state_clone = state.clone();
+        let hash_clone = file_hash.clone();
+
+        let handle = tokio::spawn(async move {
+            state_clone.db.get_document_by_user_and_hash(user_id, &hash_clone).await
+        });
+
+        handles.push(handle);
+    }
+
+    // Wait for all concurrent operations
+    let mut all_none = true;
+    for handle in handles {
+        let result = handle.await??;
+        if result.is_some() {
+            all_none = false;
+        }
+    }
+
+    // Since no document exists with this hash, all should return None
+    assert!(all_none, "All concurrent checks should return None for non-existent hash");
+
+    Ok(())
+}
\ No newline at end of file
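
For orientation, a rough sketch of how the pieces exercised above could fit together in a sync path follows. It is illustrative only and not part of this patch: the ingest_webdav_file helper is hypothetical, and only get_document_by_user_and_hash, CreateWebDAVFile, and the "duplicate_content" status are taken from the tests.

    // Hypothetical sketch: hash the downloaded bytes, skip ingestion when the user
    // already has a document with the same content hash, and record the WebDAV path
    // as "duplicate_content" so it is not re-processed on later syncs.
    use anyhow::Result;
    use readur::{AppState, models::{CreateWebDAVFile, FileInfo}};
    use sha2::{Digest, Sha256};
    use uuid::Uuid;

    async fn ingest_webdav_file(
        state: &AppState,
        user_id: Uuid,
        file_info: &FileInfo,
        content: &[u8],
    ) -> Result<()> {
        // Same SHA-256 content hash the tests compute via calculate_file_hash().
        let file_hash = format!("{:x}", Sha256::digest(content));

        // Same per-user lookup the tests call directly.
        if let Some(existing) = state.db.get_document_by_user_and_hash(user_id, &file_hash).await? {
            // Known content: point the WebDAV record at the existing document
            // instead of creating a new one.
            let webdav_file = CreateWebDAVFile {
                user_id,
                webdav_path: file_info.path.clone(),
                etag: file_info.etag.clone(),
                last_modified: file_info.last_modified,
                file_size: file_info.size,
                mime_type: file_info.mime_type.clone(),
                document_id: Some(existing.id),
                sync_status: "duplicate_content".to_string(),
                sync_error: None,
            };
            state.db.create_or_update_webdav_file(&webdav_file).await?;
            return Ok(());
        }

        // Unique content: continue with normal ingestion (saving the file and
        // creating a document record), which is outside the scope of these tests.
        Ok(())
    }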