diff --git a/Cargo.lock b/Cargo.lock
index 63b5984..4c18977 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2929,6 +2929,7 @@ dependencies = [
  "reqwest",
  "serde",
  "serde_json",
+ "sha2",
  "sqlx",
  "sysinfo",
  "tempfile",
diff --git a/Cargo.toml b/Cargo.toml
index b737f62..2e4338e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -48,6 +48,7 @@ hostname = "0.4"
 walkdir = "2"
 clap = { version = "4", features = ["derive"] }
 utoipa = { version = "5", features = ["axum_extras", "chrono", "uuid"] }
+sha2 = "0.10"
 utoipa-swagger-ui = { version = "9", features = ["axum"] }
 
 [features]
diff --git a/frontend/src/components/GlobalSearchBar/GlobalSearchBar.tsx b/frontend/src/components/GlobalSearchBar/GlobalSearchBar.tsx
index bfe35b4..0b6a221 100644
--- a/frontend/src/components/GlobalSearchBar/GlobalSearchBar.tsx
+++ b/frontend/src/components/GlobalSearchBar/GlobalSearchBar.tsx
@@ -33,35 +33,18 @@ import {
   AccessTime as TimeIcon,
 } from '@mui/icons-material';
 import { useNavigate } from 'react-router-dom';
-import { documentService, SearchRequest } from '../../services/api';
+import { documentService, SearchRequest, EnhancedDocument, SearchResponse } from '../../services/api';
 
 interface GlobalSearchBarProps {
   sx?: SxProps;
   [key: string]: any;
 }
 
-interface Document {
-  id: string;
-  original_filename: string;
-  filename?: string;
-  file_size: number;
-  mime_type: string;
-  has_ocr_text?: boolean;
-  search_rank?: number;
-  snippets?: Array<{ text: string }>;
-}
-
-interface SearchResponse {
-  documents: Document[];
-  total_count: number;
-  search_time_ms: number;
-}
-
 const GlobalSearchBar: React.FC = ({ sx, ...props }) => {
   const navigate = useNavigate();
   const theme = useTheme();
   const [query, setQuery] = useState('');
-  const [results, setResults] = useState<Document[]>([]);
+  const [results, setResults] = useState<EnhancedDocument[]>([]);
   const [loading, setLoading] = useState(false);
   const [showResults, setShowResults] = useState(false);
   const [recentSearches, setRecentSearches] = useState([]);
@@ -221,7 +204,7 @@ const GlobalSearchBar: React.FC = ({ sx, ...props }) => {
     setSearchProgress(0);
   };
 
-  const handleDocumentClick = (doc: Document): void => {
+  const handleDocumentClick = (doc: EnhancedDocument): void => {
     saveRecentSearch(query);
     setShowResults(false);
     navigate(`/documents/${doc.id}`);
@@ -661,7 +644,7 @@ const GlobalSearchBar: React.FC = ({ sx, ...props }) => {
                         flex: 1,
                       }}
                     >
-                      {highlightText(generateContextSnippet(doc.original_filename, query), query)}
+                      {highlightText(doc.original_filename || doc.filename, query)}
                     </Typography>
                   }
                   secondary={
@@ -725,7 +708,7 @@ const GlobalSearchBar: React.FC = ({ sx, ...props }) => {
                         flex: 1,
                       }}
                     >
-                      {highlightText(doc.snippets[0].text.substring(0, 80) + '...', query)}
+                      {highlightText(doc.snippets[0]?.text?.substring(0, 80) + '...' || '', query)}
                     </Typography>
                   )}
diff --git a/frontend/src/pages/DocumentDetailsPage.tsx b/frontend/src/pages/DocumentDetailsPage.tsx
index 56ccdac..2db276e 100644
--- a/frontend/src/pages/DocumentDetailsPage.tsx
+++ b/frontend/src/pages/DocumentDetailsPage.tsx
@@ -80,23 +80,22 @@ const DocumentDetailsPage: React.FC = () => {
   }, [document]);
 
   const fetchDocumentDetails = async (): Promise<void> => {
+    if (!id) {
+      setError('No document ID provided');
+      setLoading(false);
+      return;
+    }
+
     try {
       setLoading(true);
       setError(null);
 
-      // Since we don't have a direct document details endpoint,
-      // we'll fetch the document from the list and find the matching one
-      const response = await documentService.list(1000, 0);
-      const foundDoc = response.data.find(doc => doc.id === id);
-
-      if (foundDoc) {
-        setDocument(foundDoc);
-      } else {
-        setError('Document not found');
-      }
-    } catch (err) {
-      setError('Failed to load document details');
-      console.error(err);
+      const response = await documentService.getById(id);
+      setDocument(response.data);
+    } catch (err: any) {
+      const errorMessage = err.message || 'Failed to load document details';
+      setError(errorMessage);
+      console.error('Failed to fetch document details:', err);
     } finally {
       setLoading(false);
     }
diff --git a/frontend/src/services/api.ts b/frontend/src/services/api.ts
index f4c9ca2..10e5e46 100644
--- a/frontend/src/services/api.ts
+++ b/frontend/src/services/api.ts
@@ -111,6 +111,23 @@ export const documentService = {
     })
   },
 
+  getById: (id: string) => {
+    // Use the document list endpoint with pagination to find the specific document
+    // This is a temporary solution until we have a proper document details endpoint
+    return api.get('/documents', {
+      params: {
+        limit: 1000, // Fetch a reasonable amount to find our document
+        offset: 0
+      }
+    }).then(response => {
+      const document = response.data.find(doc => doc.id === id);
+      if (!document) {
+        throw new Error('Document not found');
+      }
+      return { data: document };
+    })
+  },
+
   download: (id: string) => {
     return api.get(`/documents/${id}/download`, {
       responseType: 'blob',
diff --git a/src/config.rs b/src/config.rs
index 9af6a09..c0c6f97 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -28,7 +28,7 @@ impl Config {
     pub fn from_env() -> Result<Self> {
         dotenvy::dotenv().ok();
 
-        Ok(Config {
+        let config = Config {
             database_url: env::var("DATABASE_URL")
                 .unwrap_or_else(|_| "postgresql://readur:readur@localhost/readur".to_string()),
             server_address: {
@@ -85,6 +85,68 @@ impl Config {
                 .unwrap_or(512),
             cpu_priority: env::var("CPU_PRIORITY")
                 .unwrap_or_else(|_| "normal".to_string()),
-        })
+        };
+
+        // Validate configuration to prevent recursion issues
+        config.validate_paths()?;
+
+        Ok(config)
+    }
+
+    fn validate_paths(&self) -> Result<()> {
+        use std::path::Path;
+
+        let upload_path = Path::new(&self.upload_path);
+        let watch_path = Path::new(&self.watch_folder);
+
+        // Normalize paths to handle relative paths and symlinks
+        let upload_canonical = upload_path.canonicalize()
+            .unwrap_or_else(|_| upload_path.to_path_buf());
+        let watch_canonical = watch_path.canonicalize()
+            .unwrap_or_else(|_| watch_path.to_path_buf());
+
+        // Check if paths are the same
+        if upload_canonical == watch_canonical {
+            return Err(anyhow::anyhow!(
+                "Configuration Error: UPLOAD_PATH and WATCH_FOLDER cannot be the same directory.\n\
+                 This would cause infinite recursion where WebDAV files are downloaded to the upload \n\
+                 directory and then immediately reprocessed by the watcher.\n\
+                 Current config:\n\
+                 - UPLOAD_PATH: {}\n\
+                 - WATCH_FOLDER: {}\n\
+                 Please set them to different directories.",
+                self.upload_path, self.watch_folder
+            ));
+        }
+
+        // Check if watch folder is inside upload folder
+        if watch_canonical.starts_with(&upload_canonical) {
+            return Err(anyhow::anyhow!(
+                "Configuration Error: WATCH_FOLDER cannot be inside UPLOAD_PATH.\n\
+                 This would cause recursion where WebDAV files downloaded to uploads are \n\
+                 detected by the watcher as new files.\n\
+                 Current config:\n\
+                 - UPLOAD_PATH: {}\n\
+                 - WATCH_FOLDER: {}\n\
+                 Please move the watch folder outside the upload directory.",
+                self.upload_path, self.watch_folder
+            ));
+        }
+
+        // Check if upload folder is inside watch folder
+        if upload_canonical.starts_with(&watch_canonical) {
+            return Err(anyhow::anyhow!(
+                "Configuration Error: UPLOAD_PATH cannot be inside WATCH_FOLDER.\n\
+                 This would cause recursion where files from the watch folder are \n\
+                 copied to uploads (inside the watch folder) and reprocessed.\n\
+                 Current config:\n\
+                 - UPLOAD_PATH: {}\n\
+                 - WATCH_FOLDER: {}\n\
+                 Please move the upload directory outside the watch folder.",
+                self.upload_path, self.watch_folder
+            ));
+        }
+
+        Ok(())
     }
 }
\ No newline at end of file
diff --git a/src/routes/documents.rs b/src/routes/documents.rs
index 913cd14..3803c70 100644
--- a/src/routes/documents.rs
+++ b/src/routes/documents.rs
@@ -8,6 +8,7 @@ use axum::{
 };
 use serde::Deserialize;
 use std::sync::Arc;
 use utoipa::ToSchema;
+use sha2::{Sha256, Digest};
 use crate::{
     auth::AuthUser,
@@ -85,6 +86,27 @@ async fn upload_document(
             return Err(StatusCode::PAYLOAD_TOO_LARGE);
         }
 
+        // Calculate file hash for deduplication
+        let file_hash = calculate_file_hash(&data);
+
+        // Check if this exact file content already exists in the system
+        // This prevents uploading and processing duplicate files
+        if let Ok(existing_docs) = state.db.get_documents_by_user_with_role(auth_user.user.id, auth_user.user.role, 1000, 0).await {
+            for existing_doc in existing_docs {
+                // Quick size check first (much faster than hash comparison)
+                if existing_doc.file_size == file_size {
+                    // Read the existing file and compare hashes
+                    if let Ok(existing_file_data) = tokio::fs::read(&existing_doc.file_path).await {
+                        let existing_hash = calculate_file_hash(&existing_file_data);
+                        if file_hash == existing_hash {
+                            // Return the existing document instead of creating a duplicate
+                            return Ok(Json(existing_doc.into()));
+                        }
+                    }
+                }
+            }
+        }
+
         let mime_type = mime_guess::from_path(&filename)
             .first_or_octet_stream()
             .to_string();
@@ -135,6 +157,13 @@
     Err(StatusCode::BAD_REQUEST)
 }
 
+fn calculate_file_hash(data: &[u8]) -> String {
+    let mut hasher = Sha256::new();
+    hasher.update(data);
+    let result = hasher.finalize();
+    format!("{:x}", result)
+}
+
 #[utoipa::path(
     get,
     path = "/api/documents",
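
Note: both backend hunks above key deduplication off `calculate_file_hash`, a hex-encoded SHA-256 digest of the raw file bytes; two uploads are treated as the same document exactly when their digests match. A minimal sketch of a unit test for that property (the test module name and placement are illustrative assumptions, not part of this diff):

#[cfg(test)]
mod file_hash_tests {
    use sha2::{Digest, Sha256};

    // Local copy of the private helper introduced above, for illustration only.
    fn calculate_file_hash(data: &[u8]) -> String {
        let mut hasher = Sha256::new();
        hasher.update(data);
        let result = hasher.finalize();
        format!("{:x}", result)
    }

    #[test]
    fn identical_bytes_hash_identically() {
        let a = calculate_file_hash(b"same bytes");
        let b = calculate_file_hash(b"same bytes");
        assert_eq!(a, b);
        assert_eq!(a.len(), 64); // hex encoding of a 32-byte digest
    }

    #[test]
    fn different_bytes_hash_differently() {
        assert_ne!(calculate_file_hash(b"file one"), calculate_file_hash(b"file two"));
    }
}
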
diff --git a/src/routes/webdav/webdav_sync.rs b/src/routes/webdav/webdav_sync.rs
index 1f266c8..81ecff8 100644
--- a/src/routes/webdav/webdav_sync.rs
+++ b/src/routes/webdav/webdav_sync.rs
@@ -4,6 +4,7 @@ use tracing::{error, info, warn};
 use chrono::Utc;
 use tokio::sync::Semaphore;
 use futures::stream::{FuturesUnordered, StreamExt};
+use sha2::{Sha256, Digest};
 
 use crate::{
     AppState,
@@ -241,6 +242,46 @@ async fn process_single_file(
 
     info!("Downloaded file: {} ({} bytes)", file_info.name, file_data.len());
 
+    // Calculate file hash for deduplication
+    let file_hash = calculate_file_hash(&file_data);
+
+    // Check if this exact file content already exists in the system
+    // This prevents downloading and processing duplicate files from WebDAV
+    if let Ok(existing_docs) = state.db.get_documents_by_user_with_role(user_id, crate::models::UserRole::User, 1000, 0).await {
+        for existing_doc in existing_docs {
+            // Quick size check first (much faster than hash comparison)
+            if existing_doc.file_size == file_data.len() as i64 {
+                // Read the existing file and compare hashes
+                if let Ok(existing_file_data) = tokio::fs::read(&existing_doc.file_path).await {
+                    let existing_hash = calculate_file_hash(&existing_file_data);
+                    if file_hash == existing_hash {
+                        info!("Skipping duplicate WebDAV file content: {} (hash: {}, already exists as: {})",
+                              file_info.name, &file_hash[..8], existing_doc.original_filename);
+
+                        // Still record this WebDAV file in the tracking table to prevent re-downloading
+                        let webdav_file = CreateWebDAVFile {
+                            user_id,
+                            webdav_path: file_info.path.clone(),
+                            etag: file_info.etag.clone(),
+                            last_modified: file_info.last_modified,
+                            file_size: file_info.size,
+                            mime_type: file_info.mime_type.clone(),
+                            document_id: Some(existing_doc.id), // Link to existing document
+                            sync_status: "duplicate_content".to_string(),
+                            sync_error: None,
+                        };
+
+                        if let Err(e) = state.db.create_or_update_webdav_file(&webdav_file).await {
+                            error!("Failed to record duplicate WebDAV file: {}", e);
+                        }
+
+                        return Ok(false); // Not processed (duplicate)
+                    }
+                }
+            }
+        }
+    }
+
     // Create file service and save file to disk
     let file_service = FileService::new(state.config.upload_path.clone());
 
@@ -312,4 +353,11 @@ async fn process_single_file(
     }
 
     Ok(true) // Successfully processed
+}
+
+fn calculate_file_hash(data: &[u8]) -> String {
+    let mut hasher = Sha256::new();
+    hasher.update(data);
+    let result = hasher.finalize();
+    format!("{:x}", result)
 }
\ No newline at end of file
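
Note: with this change the same private `calculate_file_hash` helper exists in src/routes/documents.rs, src/routes/webdav/webdav_sync.rs, and (below) src/watcher.rs. One possible follow-up, sketched here under the assumption of a new shared module (the `file_hash` module name is hypothetical, not part of this diff), is to define it once and import it at all three call sites:

// src/file_hash.rs (hypothetical shared module)
use sha2::{Digest, Sha256};

/// Hex-encoded SHA-256 digest of a file's raw bytes, used for content-level deduplication.
pub fn calculate_file_hash(data: &[u8]) -> String {
    let mut hasher = Sha256::new();
    hasher.update(data);
    format!("{:x}", hasher.finalize())
}

Each caller would then replace its private copy with `use crate::file_hash::calculate_file_hash;`.
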
diff --git a/src/watcher.rs b/src/watcher.rs
index 2903155..e046571 100644
--- a/src/watcher.rs
+++ b/src/watcher.rs
@@ -7,6 +7,7 @@ use tokio::sync::mpsc;
 use tokio::time::{interval, sleep};
 use tracing::{debug, error, info, warn};
 use walkdir::WalkDir;
+use sha2::{Sha256, Digest};
 
 use crate::{config::Config, db::Database, file_service::FileService, ocr_queue::OcrQueueService};
 
@@ -134,7 +135,9 @@ async fn start_polling_watcher(
     let mut interval = interval(Duration::from_secs(config.watch_interval_seconds.unwrap_or(30)));
 
     // Initial scan
+    info!("Starting initial scan of watch directory: {}", config.watch_folder);
     scan_directory(&config.watch_folder, &mut known_files, &db, &file_service, &queue_service, &config).await?;
+    info!("Initial scan completed. Found {} files to track", known_files.len());
 
     loop {
         interval.tick().await;
@@ -242,6 +245,19 @@ async fn process_file(
         return Ok(());
     }
 
+    // CRITICAL: Skip files that are in the upload directory - these are managed by WebDAV/manual uploads
+    let path_str = path.to_string_lossy();
+    let upload_path_normalized = std::path::Path::new(&config.upload_path)
+        .canonicalize()
+        .unwrap_or_else(|_| std::path::PathBuf::from(&config.upload_path));
+
+    if let Ok(file_canonical) = path.canonicalize() {
+        if file_canonical.starts_with(&upload_path_normalized) {
+            debug!("Skipping file in upload directory (managed by WebDAV/manual upload): {}", filename);
+            return Ok(());
+        }
+    }
+
     // Check file age if configured
     if let Some(max_age_hours) = config.max_file_age_hours {
         if let Ok(metadata) = tokio::fs::metadata(path).await {
@@ -255,7 +271,7 @@ async fn process_file(
         }
     }
 
-    info!("Processing new file: {:?}", path);
+    info!("Processing new file: {:?} (from watch directory: {})", path, config.watch_folder);
 
     let file_data = tokio::fs::read(path).await?;
     let file_size = file_data.len() as i64;
@@ -283,16 +299,35 @@ async fn process_file(
         return Ok(());
     }
 
-    // Check for duplicate files (same filename and size)
-    if let Ok(existing_docs) = db.find_documents_by_filename(&filename).await {
-        for doc in existing_docs {
-            if doc.file_size == file_size {
-                info!("Skipping duplicate file: {} (already exists with same size)", filename);
-                return Ok(());
+    // Fetch admin user ID from database for watch folder documents
+    let admin_user = db.get_user_by_username("admin").await?
+        .ok_or_else(|| anyhow::anyhow!("Admin user not found. Please ensure the admin user is created."))?;
+    let admin_user_id = admin_user.id;
+
+    // Calculate file hash for deduplication
+    let file_hash = calculate_file_hash(&file_data);
+
+    // Check if this exact file content already exists in the system by comparing
+    // against existing files with the same size (performance optimization)
+    if let Ok(existing_docs) = db.get_documents_by_user_with_role(admin_user_id, crate::models::UserRole::Admin, 1000, 0).await {
+        for existing_doc in existing_docs {
+            // Quick size check first (much faster than hash comparison)
+            if existing_doc.file_size == file_size {
+                // Read the existing file and compare hashes
+                if let Ok(existing_file_data) = tokio::fs::read(&existing_doc.file_path).await {
+                    let existing_hash = calculate_file_hash(&existing_file_data);
+                    if file_hash == existing_hash {
+                        info!("Skipping duplicate file content: {} (hash: {}, already exists as: {})",
+                              filename, &file_hash[..8], existing_doc.original_filename);
+                        return Ok(());
+                    }
+                }
             }
         }
     }
+
+    debug!("File content is unique: {} (hash: {})", filename, &file_hash[..8]);
 
     // Validate PDF files before processing
     if mime_type == "application/pdf" {
         if !is_valid_pdf(&file_data) {
@@ -310,11 +345,6 @@
 
     let saved_file_path = file_service.save_file(&filename, &file_data).await?;
 
-    // Fetch admin user ID from database for watch folder documents
-    let admin_user = db.get_user_by_username("admin").await?
-        .ok_or_else(|| anyhow::anyhow!("Admin user not found. Please ensure the admin user is created."))?;
-    let admin_user_id = admin_user.id;
-
     let document = file_service.create_document(
         &filename,
         &filename,
@@ -410,4 +440,11 @@ fn clean_pdf_data(data: &[u8]) -> &[u8] {
 
     // If no PDF header found, return original data
     data
+}
+
+fn calculate_file_hash(data: &[u8]) -> String {
+    let mut hasher = Sha256::new();
+    hasher.update(data);
+    let result = hasher.finalize();
+    format!("{:x}", result)
 }
\ No newline at end of file
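
Note: the `validate_paths` check added in src/config.rs rejects UPLOAD_PATH and WATCH_FOLDER when, after canonicalization, they are equal or one contains the other. A minimal sketch of that containment rule in isolation (the helper and tests below are illustrative assumptions, not code from this diff):

use std::path::Path;

// Mirrors the rule enforced by Config::validate_paths: the two directories may not
// be equal, and neither may be nested inside the other.
fn paths_conflict(upload: &Path, watch: &Path) -> bool {
    upload == watch || upload.starts_with(watch) || watch.starts_with(upload)
}

#[cfg(test)]
mod path_validation_tests {
    use super::paths_conflict;
    use std::path::Path;

    #[test]
    fn separate_directories_are_accepted() {
        assert!(!paths_conflict(
            Path::new("/srv/readur/uploads"),
            Path::new("/srv/readur/watch"),
        ));
    }

    #[test]
    fn equal_or_nested_directories_are_rejected() {
        assert!(paths_conflict(Path::new("/data/uploads"), Path::new("/data/uploads")));
        assert!(paths_conflict(Path::new("/data/uploads"), Path::new("/data/uploads/watch")));
        assert!(paths_conflict(Path::new("/data/uploads/incoming"), Path::new("/data/uploads")));
    }
}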