feat(server): I feel like I'm going to have to come back and fix this later

perf3ct 2025-06-15 04:58:22 +00:00
parent 7feec817d0
commit a8baa671ec
3 changed files with 95 additions and 49 deletions

View File

@@ -218,21 +218,25 @@ async fn process_single_file(
info!("Processing file: {}", file_info.path);
// Check if we've already processed this file
info!("Checking WebDAV tracking for: {}", file_info.path);
match state.db.get_webdav_file_by_path(user_id, &file_info.path).await {
Ok(Some(existing_file)) => {
info!("Found existing WebDAV file record: {} (current ETag: {}, remote ETag: {})",
file_info.path, existing_file.etag, file_info.etag);
// Check if file has changed (compare ETags)
if existing_file.etag == file_info.etag {
info!("Skipping unchanged file: {} (ETag: {})", file_info.path, file_info.etag);
info!("Skipping unchanged WebDAV file: {} (ETag: {})", file_info.path, file_info.etag);
return Ok(false); // Not processed (no change)
}
info!("File has changed: {} (old ETag: {}, new ETag: {})",
info!("WebDAV file has changed: {} (old ETag: {}, new ETag: {})",
file_info.path, existing_file.etag, file_info.etag);
}
Ok(None) => {
info!("New file found: {}", file_info.path);
info!("New WebDAV file detected: {}", file_info.path);
}
Err(e) => {
warn!("Error checking existing file {}: {}", file_info.path, e);
warn!("Error checking existing WebDAV file {}: {}", file_info.path, e);
}
}
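
For reference, the change-detection rule this hunk implements reduces to a few lines. A minimal sketch, assuming only that the tracking table hands back the previously recorded ETag (None when the file has never been synced) and that the info! macro comes from tracing, as the surrounding code suggests:

use tracing::info;

/// Returns true when a remote file should be downloaded: either we have
/// never seen it, or the server-side ETag no longer matches the one we
/// recorded on the last sync.
fn needs_download(tracked_etag: Option<&str>, remote_etag: &str, path: &str) -> bool {
    match tracked_etag {
        Some(etag) if etag == remote_etag => {
            info!("Skipping unchanged WebDAV file: {} (ETag: {})", path, etag);
            false
        }
        Some(etag) => {
            info!("WebDAV file has changed: {} (old ETag: {}, new ETag: {})", path, etag, remote_etag);
            true
        }
        None => {
            info!("New WebDAV file detected: {}", path);
            true
        }
    }
}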
@@ -245,39 +249,50 @@ async fn process_single_file(
     // Calculate file hash for deduplication
     let file_hash = calculate_file_hash(&file_data);
-    // Check if this exact file content already exists in the system
+    // Check if this exact file content already exists for this user
+    // This prevents downloading and processing duplicate files from WebDAV
+    info!("Checking for duplicate content for user {}: {} (hash: {}, size: {} bytes)",
+        user_id, file_info.name, &file_hash[..8], file_data.len());
+    // Query documents with the same file size for this user only
+    let size_filter = file_data.len() as i64;
     if let Ok(existing_docs) = state.db.get_documents_by_user_with_role(user_id, crate::models::UserRole::User, 1000, 0).await {
-        for existing_doc in existing_docs {
-            // Quick size check first (much faster than hash comparison)
-            if existing_doc.file_size == file_data.len() as i64 {
-                // Read the existing file and compare hashes
-                if let Ok(existing_file_data) = tokio::fs::read(&existing_doc.file_path).await {
-                    let existing_hash = calculate_file_hash(&existing_file_data);
-                    if file_hash == existing_hash {
-                        info!("Skipping duplicate WebDAV file content: {} (hash: {}, already exists as: {})",
-                            file_info.name, &file_hash[..8], existing_doc.original_filename);
-                        // Still record this WebDAV file in the tracking table to prevent re-downloading
-                        let webdav_file = CreateWebDAVFile {
-                            user_id,
-                            webdav_path: file_info.path.clone(),
-                            etag: file_info.etag.clone(),
-                            last_modified: file_info.last_modified,
-                            file_size: file_info.size,
-                            mime_type: file_info.mime_type.clone(),
-                            document_id: Some(existing_doc.id), // Link to existing document
-                            sync_status: "duplicate_content".to_string(),
-                            sync_error: None,
-                        };
-                        if let Err(e) = state.db.create_or_update_webdav_file(&webdav_file).await {
-                            error!("Failed to record duplicate WebDAV file: {}", e);
-                        }
-                        return Ok(false); // Not processed (duplicate)
+        let matching_docs: Vec<_> = existing_docs.into_iter()
+            .filter(|doc| doc.file_size == size_filter)
+            .collect();
+        info!("Found {} documents with same size for user {}", matching_docs.len(), user_id);
+        for existing_doc in matching_docs {
+            // Read the existing file and compare hashes
+            if let Ok(existing_file_data) = tokio::fs::read(&existing_doc.file_path).await {
+                let existing_hash = calculate_file_hash(&existing_file_data);
+                if file_hash == existing_hash {
+                    info!("Found duplicate content for user {}: {} matches existing document {}",
+                        user_id, file_info.name, existing_doc.original_filename);
+                    // Record this WebDAV file as a duplicate but link to existing document
+                    let webdav_file = CreateWebDAVFile {
+                        user_id,
+                        webdav_path: file_info.path.clone(),
+                        etag: file_info.etag.clone(),
+                        last_modified: file_info.last_modified,
+                        file_size: file_info.size,
+                        mime_type: file_info.mime_type.clone(),
+                        document_id: Some(existing_doc.id), // Link to existing document
+                        sync_status: "duplicate_content".to_string(),
+                        sync_error: None,
+                    };
+                    if let Err(e) = state.db.create_or_update_webdav_file(&webdav_file).await {
+                        error!("Failed to record duplicate WebDAV file: {}", e);
                    }
+                    info!("WebDAV file marked as duplicate_content, skipping processing");
+                    return Ok(false); // Not processed (duplicate)
                }
+            } else {
+                warn!("Could not read existing file for hash comparison: {}", existing_doc.file_path);
            }
        }
     }
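
The dedup logic above leans on calculate_file_hash, whose body is not part of this diff. One plausible shape for it, assuming the sha2 crate and the lowercase-hex form implied by the &file_hash[..8] log slices:

use sha2::{Digest, Sha256};

/// Hypothetical stand-in for the calculate_file_hash helper referenced in
/// the diff: SHA-256 over the raw bytes, rendered as lowercase hex so a
/// short prefix like &hash[..8] is safe to log.
fn calculate_file_hash(data: &[u8]) -> String {
    Sha256::digest(data)
        .iter()
        .map(|b| format!("{:02x}", b))
        .collect()
}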
@@ -325,8 +340,10 @@ async fn process_single_file(
     // Queue for OCR processing if enabled
     if enable_background_ocr {
+        info!("Background OCR is enabled, queueing document {} for processing", created_document.id);
         match state.db.pool.acquire().await {
-            Ok(conn) => {
+            Ok(_conn) => {
                 let queue_service = crate::ocr_queue::OcrQueueService::new(
                     state.db.clone(),
                     state.db.pool.clone(),
@@ -350,6 +367,8 @@ async fn process_single_file(
error!("Failed to connect to database for OCR queueing: {}", e);
}
}
} else {
info!("Background OCR is disabled, skipping OCR queue for document {}", created_document.id);
}
Ok(true) // Successfully processed
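
The OCR hunks make two small fixes: the pooled connection is bound as _conn because acquire() is used only as a connectivity probe, and the disabled path now logs instead of falling through silently. A sketch of that gating under stated assumptions (an sqlx PgPool, and a hypothetical queue_document stand-in for the real OcrQueueService call):

use sqlx::PgPool;
use tracing::{error, info};

/// Hypothetical stand-in for the real enqueue call on OcrQueueService.
async fn queue_document(document_id: i64) {
    info!("queued document {} for OCR", document_id);
}

async fn maybe_queue_ocr(pool: &PgPool, enable_background_ocr: bool, document_id: i64) {
    if enable_background_ocr {
        info!("Background OCR is enabled, queueing document {} for processing", document_id);
        match pool.acquire().await {
            // The connection itself is unused; acquiring it just proves the
            // database is reachable before we enqueue.
            Ok(_conn) => queue_document(document_id).await,
            Err(e) => error!("Failed to connect to database for OCR queueing: {}", e),
        }
    } else {
        info!("Background OCR is disabled, skipping OCR queue for document {}", document_id);
    }
}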

View File

@@ -13,6 +13,16 @@ use crate::{config::Config, db::Database, file_service::FileService, ocr_queue::
 pub async fn start_folder_watcher(config: Config, db: Database) -> Result<()> {
     info!("Starting hybrid folder watcher on: {}", config.watch_folder);
+    info!("Upload path configured as: {}", config.upload_path);
+    // Debug: Check if paths resolve correctly
+    let watch_canonical = std::path::Path::new(&config.watch_folder).canonicalize()
+        .unwrap_or_else(|_| std::path::PathBuf::from(&config.watch_folder));
+    let upload_canonical = std::path::Path::new(&config.upload_path).canonicalize()
+        .unwrap_or_else(|_| std::path::PathBuf::from(&config.upload_path));
+    info!("Watch folder canonical path: {:?}", watch_canonical);
+    info!("Upload folder canonical path: {:?}", upload_canonical);
     // Initialize services with shared database
     let file_service = FileService::new(config.upload_path.clone());
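
The new startup debugging boils down to one pattern: try to canonicalize the configured path, and if that fails (missing mount, dangling symlink), fall back to the raw configured string so logging never aborts startup. Extracted as a tiny helper for clarity:

use std::path::{Path, PathBuf};

/// Resolve symlinks and relative components when the path exists;
/// otherwise return the configured value unchanged.
fn canonical_or_raw(configured: &str) -> PathBuf {
    Path::new(configured)
        .canonicalize()
        .unwrap_or_else(|_| PathBuf::from(configured))
}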
@@ -167,6 +177,7 @@ async fn scan_directory(
     {
         if entry.file_type().is_file() {
             let path = entry.path().to_path_buf();
+            debug!("Found file during scan: {:?}", path);
             if let Ok(metadata) = entry.metadata() {
                 if let Ok(modified) = metadata.modified() {
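
Only a fragment of the scan loop is visible here. Its shape is consistent with the walkdir crate (an assumption; entry.file_type().is_file() and entry.metadata() match that API), so a self-contained sketch of the whole pass might look like this:

use std::path::PathBuf;
use std::time::SystemTime;
use walkdir::WalkDir;

/// Walk the watch folder, keep regular files, and capture each file's
/// mtime for the later change-detection pass.
fn collect_files(root: &str) -> Vec<(PathBuf, SystemTime)> {
    WalkDir::new(root)
        .into_iter()
        .filter_map(Result::ok)
        .filter(|entry| entry.file_type().is_file())
        .filter_map(|entry| {
            let modified = entry.metadata().ok()?.modified().ok()?;
            Some((entry.path().to_path_buf(), modified))
        })
        .collect()
}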
@@ -307,20 +318,26 @@ async fn process_file(
     // Calculate file hash for deduplication
     let file_hash = calculate_file_hash(&file_data);
-    // Check if this exact file content already exists in the system by comparing
-    // against existing files with the same size (performance optimization)
+    // Check if this exact file content already exists for the admin user
+    debug!("Checking for duplicate content for admin user: {} (hash: {}, size: {} bytes)",
+        filename, &file_hash[..8], file_size);
+    // Query documents with the same file size for the admin user only
     if let Ok(existing_docs) = db.get_documents_by_user_with_role(admin_user_id, crate::models::UserRole::Admin, 1000, 0).await {
-        for existing_doc in existing_docs {
-            // Quick size check first (much faster than hash comparison)
-            if existing_doc.file_size == file_size {
-                // Read the existing file and compare hashes
-                if let Ok(existing_file_data) = tokio::fs::read(&existing_doc.file_path).await {
-                    let existing_hash = calculate_file_hash(&existing_file_data);
-                    if file_hash == existing_hash {
-                        info!("Skipping duplicate file content: {} (hash: {}, already exists as: {})",
-                            filename, &file_hash[..8], existing_doc.original_filename);
-                        return Ok(());
-                    }
+        let matching_docs: Vec<_> = existing_docs.into_iter()
+            .filter(|doc| doc.file_size == file_size)
+            .collect();
+        debug!("Found {} documents with same size for admin user", matching_docs.len());
+        for existing_doc in matching_docs {
+            // Read the existing file and compare hashes
+            if let Ok(existing_file_data) = tokio::fs::read(&existing_doc.file_path).await {
+                let existing_hash = calculate_file_hash(&existing_file_data);
+                if file_hash == existing_hash {
+                    info!("Skipping duplicate file content: {} (hash: {}, already exists as: {})",
+                        filename, &file_hash[..8], existing_doc.original_filename);
+                    return Ok(());
+                }
+            }
         }
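
Both sync paths now share the same dedup shape: filter candidates by file size first (a cheap integer comparison), and only hash the survivors. Reduced to its core over in-memory bytes (the real code reads each candidate from disk with tokio::fs::read, and calculate_file_hash here is the sha2 sketch shown earlier):

/// True when new_data's content already exists among (size, bytes) pairs.
/// Size is checked first because hashing is by far the expensive step.
fn is_duplicate(new_data: &[u8], existing: &[(i64, Vec<u8>)]) -> bool {
    let new_hash = calculate_file_hash(new_data);
    existing
        .iter()
        .filter(|(size, _)| *size == new_data.len() as i64)
        .any(|(_, data)| calculate_file_hash(data) == new_hash)
}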

View File

@@ -143,11 +143,21 @@ impl WebDAVScheduler {
                 let elapsed_minutes = elapsed.num_minutes();
                 if elapsed_minutes < sync_interval_minutes as i64 {
-                    info!("Sync not due for user {} (last sync {} minutes ago, interval {} minutes)",
-                        user_settings.user_id, elapsed_minutes, sync_interval_minutes);
+                    // Only log this occasionally to avoid spam
+                    if elapsed_minutes % 10 == 0 {
+                        info!("Sync not due for user {} (last sync {} minutes ago, interval {} minutes)",
+                            user_settings.user_id, elapsed_minutes, sync_interval_minutes);
+                    }
                     return Ok(false);
                 }
+                info!("Sync is due for user {} (last sync {} minutes ago, interval {} minutes)",
+                    user_settings.user_id, elapsed_minutes, sync_interval_minutes);
+            } else {
+                info!("No previous sync found for user {}, sync is due", user_settings.user_id);
             }
+        } else {
+            info!("No sync state found for user {}, sync is due", user_settings.user_id);
         }
         // Sync is due
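
The scheduler change throttles the "not due" message with a simple modulo guard: on a tick-based scheduler the elapsed-minutes value repeats across ticks, so logging only when it is divisible by 10 turns a per-tick log line into roughly one line every ten minutes. A sketch of the whole decision, assuming chrono timestamps and an i64 user id (types are not shown in the diff):

use chrono::{DateTime, Utc};
use tracing::info;

/// Returns true when a sync should run now. last_sync is None when the
/// user has no recorded sync state yet.
fn sync_is_due(user_id: i64, last_sync: Option<DateTime<Utc>>, sync_interval_minutes: i64) -> bool {
    match last_sync {
        Some(t) => {
            let elapsed_minutes = (Utc::now() - t).num_minutes();
            if elapsed_minutes < sync_interval_minutes {
                // Only log this occasionally to avoid spam
                if elapsed_minutes % 10 == 0 {
                    info!("Sync not due for user {} (last sync {} minutes ago, interval {} minutes)",
                        user_id, elapsed_minutes, sync_interval_minutes);
                }
                return false;
            }
            info!("Sync is due for user {} (last sync {} minutes ago, interval {} minutes)",
                user_id, elapsed_minutes, sync_interval_minutes);
            true
        }
        None => {
            info!("No previous sync found for user {}, sync is due", user_id);
            true
        }
    }
}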