feat(server): fix recursive scanning of the uploads folder and the quick search bar
parent 99521b4ca0
commit 4aa3d77e40
@@ -2929,6 +2929,7 @@ dependencies = [
  "reqwest",
  "serde",
  "serde_json",
+ "sha2",
  "sqlx",
  "sysinfo",
  "tempfile",
@@ -48,6 +48,7 @@ hostname = "0.4"
 walkdir = "2"
 clap = { version = "4", features = ["derive"] }
 utoipa = { version = "5", features = ["axum_extras", "chrono", "uuid"] }
+sha2 = "0.10"
 utoipa-swagger-ui = { version = "9", features = ["axum"] }

 [features]
@@ -33,35 +33,18 @@ import {
   AccessTime as TimeIcon,
 } from '@mui/icons-material';
 import { useNavigate } from 'react-router-dom';
-import { documentService, SearchRequest } from '../../services/api';
+import { documentService, SearchRequest, EnhancedDocument, SearchResponse } from '../../services/api';

 interface GlobalSearchBarProps {
   sx?: SxProps<Theme>;
   [key: string]: any;
 }

-interface Document {
-  id: string;
-  original_filename: string;
-  filename?: string;
-  file_size: number;
-  mime_type: string;
-  has_ocr_text?: boolean;
-  search_rank?: number;
-  snippets?: Array<{ text: string }>;
-}
-
-interface SearchResponse {
-  documents: Document[];
-  total_count: number;
-  search_time_ms: number;
-}
-
 const GlobalSearchBar: React.FC<GlobalSearchBarProps> = ({ sx, ...props }) => {
   const navigate = useNavigate();
   const theme = useTheme();
   const [query, setQuery] = useState<string>('');
-  const [results, setResults] = useState<Document[]>([]);
+  const [results, setResults] = useState<EnhancedDocument[]>([]);
   const [loading, setLoading] = useState<boolean>(false);
   const [showResults, setShowResults] = useState<boolean>(false);
   const [recentSearches, setRecentSearches] = useState<string[]>([]);
@@ -221,7 +204,7 @@ const GlobalSearchBar: React.FC<GlobalSearchBarProps> = ({ sx, ...props }) => {
     setSearchProgress(0);
   };

-  const handleDocumentClick = (doc: Document): void => {
+  const handleDocumentClick = (doc: EnhancedDocument): void => {
     saveRecentSearch(query);
     setShowResults(false);
     navigate(`/documents/${doc.id}`);
@@ -661,7 +644,7 @@ const GlobalSearchBar: React.FC<GlobalSearchBarProps> = ({ sx, ...props }) => {
                     flex: 1,
                   }}
                 >
-                  {highlightText(generateContextSnippet(doc.original_filename, query), query)}
+                  {highlightText(doc.original_filename || doc.filename, query)}
                 </Typography>
               }
               secondary={
@@ -725,7 +708,7 @@ const GlobalSearchBar: React.FC<GlobalSearchBarProps> = ({ sx, ...props }) => {
                     flex: 1,
                   }}
                 >
-                  {highlightText(doc.snippets[0].text.substring(0, 80) + '...', query)}
+                  {highlightText(doc.snippets[0]?.text?.substring(0, 80) + '...' || '', query)}
                 </Typography>
               )}
             </Box>
@@ -80,23 +80,22 @@ const DocumentDetailsPage: React.FC = () => {
   }, [document]);

   const fetchDocumentDetails = async (): Promise<void> => {
     if (!id) {
       setError('No document ID provided');
       setLoading(false);
       return;
     }

     try {
       setLoading(true);
       setError(null);

-      // Since we don't have a direct document details endpoint,
-      // we'll fetch the document from the list and find the matching one
-      const response = await documentService.list(1000, 0);
-      const foundDoc = response.data.find(doc => doc.id === id);
-
-      if (foundDoc) {
-        setDocument(foundDoc);
-      } else {
-        setError('Document not found');
-      }
-    } catch (err) {
-      setError('Failed to load document details');
-      console.error(err);
+      const response = await documentService.getById(id);
+      setDocument(response.data);
+    } catch (err: any) {
+      const errorMessage = err.message || 'Failed to load document details';
+      setError(errorMessage);
+      console.error('Failed to fetch document details:', err);
     } finally {
       setLoading(false);
     }
@@ -111,6 +111,23 @@ export const documentService = {
     })
   },

+  getById: (id: string) => {
+    // Use the document list endpoint with pagination to find the specific document
+    // This is a temporary solution until we have a proper document details endpoint
+    return api.get<Document[]>('/documents', {
+      params: {
+        limit: 1000, // Fetch a reasonable amount to find our document
+        offset: 0
+      }
+    }).then(response => {
+      const document = response.data.find(doc => doc.id === id);
+      if (!document) {
+        throw new Error('Document not found');
+      }
+      return { data: document };
+    })
+  },
+
   download: (id: string) => {
     return api.get(`/documents/${id}/download`, {
       responseType: 'blob',
@@ -28,7 +28,7 @@ impl Config {
     pub fn from_env() -> Result<Self> {
         dotenvy::dotenv().ok();

-        Ok(Config {
+        let config = Config {
            database_url: env::var("DATABASE_URL")
                .unwrap_or_else(|_| "postgresql://readur:readur@localhost/readur".to_string()),
            server_address: {
@@ -85,6 +85,68 @@ impl Config {
                .unwrap_or(512),
            cpu_priority: env::var("CPU_PRIORITY")
                .unwrap_or_else(|_| "normal".to_string()),
-        })
+        };
+
+        // Validate configuration to prevent recursion issues
+        config.validate_paths()?;
+
+        Ok(config)
     }
+
+    fn validate_paths(&self) -> Result<()> {
+        use std::path::Path;
+
+        let upload_path = Path::new(&self.upload_path);
+        let watch_path = Path::new(&self.watch_folder);
+
+        // Normalize paths to handle relative paths and symlinks
+        let upload_canonical = upload_path.canonicalize()
+            .unwrap_or_else(|_| upload_path.to_path_buf());
+        let watch_canonical = watch_path.canonicalize()
+            .unwrap_or_else(|_| watch_path.to_path_buf());
+
+        // Check if paths are the same
+        if upload_canonical == watch_canonical {
+            return Err(anyhow::anyhow!(
+                "Configuration Error: UPLOAD_PATH and WATCH_FOLDER cannot be the same directory.\n\
+                This would cause infinite recursion where WebDAV files are downloaded to the upload \n\
+                directory and then immediately reprocessed by the watcher.\n\
+                Current config:\n\
+                - UPLOAD_PATH: {}\n\
+                - WATCH_FOLDER: {}\n\
+                Please set them to different directories.",
+                self.upload_path, self.watch_folder
+            ));
+        }
+
+        // Check if watch folder is inside upload folder
+        if watch_canonical.starts_with(&upload_canonical) {
+            return Err(anyhow::anyhow!(
+                "Configuration Error: WATCH_FOLDER cannot be inside UPLOAD_PATH.\n\
+                This would cause recursion where WebDAV files downloaded to uploads are \n\
+                detected by the watcher as new files.\n\
+                Current config:\n\
+                - UPLOAD_PATH: {}\n\
+                - WATCH_FOLDER: {}\n\
+                Please move the watch folder outside the upload directory.",
+                self.upload_path, self.watch_folder
+            ));
+        }
+
+        // Check if upload folder is inside watch folder
+        if upload_canonical.starts_with(&watch_canonical) {
+            return Err(anyhow::anyhow!(
+                "Configuration Error: UPLOAD_PATH cannot be inside WATCH_FOLDER.\n\
+                This would cause recursion where files from the watch folder are \n\
+                copied to uploads (inside the watch folder) and reprocessed.\n\
+                Current config:\n\
+                - UPLOAD_PATH: {}\n\
+                - WATCH_FOLDER: {}\n\
+                Please move the upload directory outside the watch folder.",
+                self.upload_path, self.watch_folder
+            ));
+        }
+
+        Ok(())
+    }
 }
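To make the rejected configurations concrete, here is a minimal standalone sketch of the same containment rule; the `paths_conflict` helper name and the example paths are illustrative, not part of the codebase. Note that `Path::starts_with` compares whole path components, so a sibling directory such as `/data/uploads-archive` does not count as being inside `/data/uploads`.

use std::path::Path;

/// Hypothetical helper mirroring the validate_paths() logic above:
/// returns true when the two directories are identical or nested.
fn paths_conflict(upload: &str, watch: &str) -> bool {
    let upload = Path::new(upload);
    let watch = Path::new(watch);
    // Fall back to the raw path when canonicalize() fails (e.g. the
    // directory does not exist yet), matching the unwrap_or_else in the diff.
    let upload = upload.canonicalize().unwrap_or_else(|_| upload.to_path_buf());
    let watch = watch.canonicalize().unwrap_or_else(|_| watch.to_path_buf());
    upload == watch || watch.starts_with(&upload) || upload.starts_with(&watch)
}

fn main() {
    assert!(paths_conflict("/data/uploads", "/data/uploads"));       // identical
    assert!(paths_conflict("/data/uploads", "/data/uploads/watch")); // watch inside uploads
    assert!(paths_conflict("/data/watch/uploads", "/data/watch"));   // uploads inside watch
    assert!(!paths_conflict("/data/uploads", "/data/watch"));        // disjoint: accepted
}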
@@ -8,6 +8,7 @@ use axum::{
 use serde::Deserialize;
 use std::sync::Arc;
 use utoipa::ToSchema;
+use sha2::{Sha256, Digest};

 use crate::{
     auth::AuthUser,
@@ -85,6 +86,27 @@ async fn upload_document(
         return Err(StatusCode::PAYLOAD_TOO_LARGE);
     }

+    // Calculate file hash for deduplication
+    let file_hash = calculate_file_hash(&data);
+
+    // Check if this exact file content already exists in the system
+    // This prevents uploading and processing duplicate files
+    if let Ok(existing_docs) = state.db.get_documents_by_user_with_role(auth_user.user.id, auth_user.user.role, 1000, 0).await {
+        for existing_doc in existing_docs {
+            // Quick size check first (much faster than hash comparison)
+            if existing_doc.file_size == file_size {
+                // Read the existing file and compare hashes
+                if let Ok(existing_file_data) = tokio::fs::read(&existing_doc.file_path).await {
+                    let existing_hash = calculate_file_hash(&existing_file_data);
+                    if file_hash == existing_hash {
+                        // Return the existing document instead of creating a duplicate
+                        return Ok(Json(existing_doc.into()));
+                    }
+                }
+            }
+        }
+    }
+
     let mime_type = mime_guess::from_path(&filename)
         .first_or_octet_stream()
         .to_string();
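The upload-time deduplication above follows a two-stage pattern: a cheap file-size comparison first, and a SHA-256 content comparison only when sizes collide. Below is a hedged, self-contained sketch of that pattern; the `StoredDoc` struct and `is_duplicate` helper are illustrative stand-ins for the real document model and handler code.

use sha2::{Digest, Sha256};

// Illustrative stand-in for the fields the dedup check actually needs;
// the real code works with the full document record from the database.
struct StoredDoc {
    file_size: i64,
    file_path: String,
}

fn calculate_file_hash(data: &[u8]) -> String {
    let mut hasher = Sha256::new();
    hasher.update(data);
    format!("{:x}", hasher.finalize())
}

/// Hypothetical helper: returns true if `data` matches the content of any
/// existing document. The size check runs first because it is a plain
/// integer compare; the hash is only computed when sizes collide.
async fn is_duplicate(data: &[u8], existing: &[StoredDoc]) -> bool {
    let new_hash = calculate_file_hash(data);
    for doc in existing {
        if doc.file_size != data.len() as i64 {
            continue; // cheap filter: a different size can never be a duplicate
        }
        if let Ok(existing_data) = tokio::fs::read(&doc.file_path).await {
            if calculate_file_hash(&existing_data) == new_hash {
                return true;
            }
        }
    }
    false
}

One trade-off worth noting: because the hash is not stored with the document record, every same-sized candidate has to be re-read from disk and re-hashed on each upload.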
@@ -135,6 +157,13 @@ async fn upload_document(
     Err(StatusCode::BAD_REQUEST)
 }

+fn calculate_file_hash(data: &[u8]) -> String {
+    let mut hasher = Sha256::new();
+    hasher.update(data);
+    let result = hasher.finalize();
+    format!("{:x}", result)
+}
+
 #[utoipa::path(
     get,
     path = "/api/documents",
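For reference, `calculate_file_hash` returns a 64-character lowercase hex digest, and later hunks log only its first eight characters via `&file_hash[..8]`. A small usage sketch (standalone, duplicating the helper so it runs on its own):

use sha2::{Digest, Sha256};

fn calculate_file_hash(data: &[u8]) -> String {
    let mut hasher = Sha256::new();
    hasher.update(data);
    format!("{:x}", hasher.finalize())
}

fn main() {
    let a = calculate_file_hash(b"same bytes");
    let b = calculate_file_hash(b"same bytes");
    assert_eq!(a, b);          // identical content -> identical digest
    assert_eq!(a.len(), 64);   // SHA-256 -> 32 bytes -> 64 hex characters
    println!("short hash: {}", &a[..8]); // the prefix used in the log messages
}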
@@ -4,6 +4,7 @@ use tracing::{error, info, warn};
 use chrono::Utc;
 use tokio::sync::Semaphore;
 use futures::stream::{FuturesUnordered, StreamExt};
+use sha2::{Sha256, Digest};

 use crate::{
     AppState,
@@ -241,6 +242,46 @@ async fn process_single_file(

     info!("Downloaded file: {} ({} bytes)", file_info.name, file_data.len());

+    // Calculate file hash for deduplication
+    let file_hash = calculate_file_hash(&file_data);
+
+    // Check if this exact file content already exists in the system
+    // This prevents downloading and processing duplicate files from WebDAV
+    if let Ok(existing_docs) = state.db.get_documents_by_user_with_role(user_id, crate::models::UserRole::User, 1000, 0).await {
+        for existing_doc in existing_docs {
+            // Quick size check first (much faster than hash comparison)
+            if existing_doc.file_size == file_data.len() as i64 {
+                // Read the existing file and compare hashes
+                if let Ok(existing_file_data) = tokio::fs::read(&existing_doc.file_path).await {
+                    let existing_hash = calculate_file_hash(&existing_file_data);
+                    if file_hash == existing_hash {
+                        info!("Skipping duplicate WebDAV file content: {} (hash: {}, already exists as: {})",
+                              file_info.name, &file_hash[..8], existing_doc.original_filename);
+
+                        // Still record this WebDAV file in the tracking table to prevent re-downloading
+                        let webdav_file = CreateWebDAVFile {
+                            user_id,
+                            webdav_path: file_info.path.clone(),
+                            etag: file_info.etag.clone(),
+                            last_modified: file_info.last_modified,
+                            file_size: file_info.size,
+                            mime_type: file_info.mime_type.clone(),
+                            document_id: Some(existing_doc.id), // Link to existing document
+                            sync_status: "duplicate_content".to_string(),
+                            sync_error: None,
+                        };
+
+                        if let Err(e) = state.db.create_or_update_webdav_file(&webdav_file).await {
+                            error!("Failed to record duplicate WebDAV file: {}", e);
+                        }
+
+                        return Ok(false); // Not processed (duplicate)
+                    }
+                }
+            }
+        }
+    }
+
     // Create file service and save file to disk
     let file_service = FileService::new(state.config.upload_path.clone());
@@ -312,4 +353,11 @@ async fn process_single_file(
     }

     Ok(true) // Successfully processed
 }
+
+fn calculate_file_hash(data: &[u8]) -> String {
+    let mut hasher = Sha256::new();
+    hasher.update(data);
+    let result = hasher.finalize();
+    format!("{:x}", result)
+}
@@ -7,6 +7,7 @@ use tokio::sync::mpsc;
 use tokio::time::{interval, sleep};
 use tracing::{debug, error, info, warn};
 use walkdir::WalkDir;
+use sha2::{Sha256, Digest};

 use crate::{config::Config, db::Database, file_service::FileService, ocr_queue::OcrQueueService};

@@ -134,7 +135,9 @@ async fn start_polling_watcher(
     let mut interval = interval(Duration::from_secs(config.watch_interval_seconds.unwrap_or(30)));

     // Initial scan
+    info!("Starting initial scan of watch directory: {}", config.watch_folder);
     scan_directory(&config.watch_folder, &mut known_files, &db, &file_service, &queue_service, &config).await?;
+    info!("Initial scan completed. Found {} files to track", known_files.len());

     loop {
         interval.tick().await;
@@ -242,6 +245,19 @@ async fn process_file(
         return Ok(());
     }

+    // CRITICAL: Skip files that are in the upload directory - these are managed by WebDAV/manual uploads
+    let path_str = path.to_string_lossy();
+    let upload_path_normalized = std::path::Path::new(&config.upload_path)
+        .canonicalize()
+        .unwrap_or_else(|_| std::path::PathBuf::from(&config.upload_path));
+
+    if let Ok(file_canonical) = path.canonicalize() {
+        if file_canonical.starts_with(&upload_path_normalized) {
+            debug!("Skipping file in upload directory (managed by WebDAV/manual upload): {}", filename);
+            return Ok(());
+        }
+    }
+
     // Check file age if configured
     if let Some(max_age_hours) = config.max_file_age_hours {
         if let Ok(metadata) = tokio::fs::metadata(path).await {
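This per-file guard is the runtime counterpart of the config-level validation above: anything that canonicalizes to a location under UPLOAD_PATH is ignored, even if it was reached through a symlink or relative path inside the watch folder. Below is a hedged sketch of the same idea applied during a recursive walkdir scan; the `collect_candidates` function and the example paths are illustrative, not the project's `scan_directory`.

use std::path::{Path, PathBuf};
use walkdir::WalkDir;

/// Illustrative sketch: walk the watch folder recursively and collect
/// candidate files, skipping anything that resolves into the upload directory.
fn collect_candidates(watch_folder: &str, upload_path: &str) -> Vec<PathBuf> {
    let upload_canonical = Path::new(upload_path)
        .canonicalize()
        .unwrap_or_else(|_| PathBuf::from(upload_path));

    WalkDir::new(watch_folder)
        .into_iter()
        .filter_map(Result::ok)              // ignore unreadable entries
        .filter(|e| e.file_type().is_file()) // directories are only traversed
        .map(|e| e.into_path())
        .filter(|p| {
            // canonicalize so symlinks or relative paths cannot sneak an
            // uploaded file back into the processing pipeline
            let canonical = p.canonicalize().unwrap_or_else(|_| p.clone());
            !canonical.starts_with(&upload_canonical)
        })
        .collect()
}

fn main() {
    for path in collect_candidates("./watch", "./uploads") {
        println!("would process: {}", path.display());
    }
}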
@@ -255,7 +271,7 @@ async fn process_file(
         }
     }

-    info!("Processing new file: {:?}", path);
+    info!("Processing new file: {:?} (from watch directory: {})", path, config.watch_folder);

     let file_data = tokio::fs::read(path).await?;
     let file_size = file_data.len() as i64;
@@ -283,16 +299,35 @@ async fn process_file(
         return Ok(());
     }

-    // Check for duplicate files (same filename and size)
-    if let Ok(existing_docs) = db.find_documents_by_filename(&filename).await {
-        for doc in existing_docs {
-            if doc.file_size == file_size {
-                info!("Skipping duplicate file: {} (already exists with same size)", filename);
-                return Ok(());
+    // Fetch admin user ID from database for watch folder documents
+    let admin_user = db.get_user_by_username("admin").await?
+        .ok_or_else(|| anyhow::anyhow!("Admin user not found. Please ensure the admin user is created."))?;
+    let admin_user_id = admin_user.id;
+
+    // Calculate file hash for deduplication
+    let file_hash = calculate_file_hash(&file_data);
+
+    // Check if this exact file content already exists in the system by comparing
+    // against existing files with the same size (performance optimization)
+    if let Ok(existing_docs) = db.get_documents_by_user_with_role(admin_user_id, crate::models::UserRole::Admin, 1000, 0).await {
+        for existing_doc in existing_docs {
+            // Quick size check first (much faster than hash comparison)
+            if existing_doc.file_size == file_size {
+                // Read the existing file and compare hashes
+                if let Ok(existing_file_data) = tokio::fs::read(&existing_doc.file_path).await {
+                    let existing_hash = calculate_file_hash(&existing_file_data);
+                    if file_hash == existing_hash {
+                        info!("Skipping duplicate file content: {} (hash: {}, already exists as: {})",
+                              filename, &file_hash[..8], existing_doc.original_filename);
+                        return Ok(());
+                    }
+                }
             }
         }
     }
+
+    debug!("File content is unique: {} (hash: {})", filename, &file_hash[..8]);

     // Validate PDF files before processing
     if mime_type == "application/pdf" {
         if !is_valid_pdf(&file_data) {
@@ -310,11 +345,6 @@ async fn process_file(

     let saved_file_path = file_service.save_file(&filename, &file_data).await?;

-    // Fetch admin user ID from database for watch folder documents
-    let admin_user = db.get_user_by_username("admin").await?
-        .ok_or_else(|| anyhow::anyhow!("Admin user not found. Please ensure the admin user is created."))?;
-    let admin_user_id = admin_user.id;
-
     let document = file_service.create_document(
         &filename,
         &filename,
@@ -410,4 +440,11 @@ fn clean_pdf_data(data: &[u8]) -> &[u8] {

     // If no PDF header found, return original data
     data
 }
+
+fn calculate_file_hash(data: &[u8]) -> String {
+    let mut hasher = Sha256::new();
+    hasher.update(data);
+    let result = hasher.finalize();
+    format!("{:x}", result)
+}