From c656a96d912d858202aa54daddbd2bff8808ab16 Mon Sep 17 00:00:00 2001 From: perf3ct Date: Mon, 16 Jun 2025 21:24:46 +0000 Subject: [PATCH] feat(server): create folders within 'upload' path to manage thumbnails, processed images, etc. --- ...1_update_file_paths_to_structured_dirs.sql | 17 ++ src/db.rs | 2 +- src/enhanced_ocr.rs | 53 ++++-- src/file_service.rs | 159 ++++++++++++++++-- src/main.rs | 17 ++ src/ocr_queue.rs | 119 +++++++++++++ 6 files changed, 344 insertions(+), 23 deletions(-) create mode 100644 migrations/20240616000001_update_file_paths_to_structured_dirs.sql diff --git a/migrations/20240616000001_update_file_paths_to_structured_dirs.sql b/migrations/20240616000001_update_file_paths_to_structured_dirs.sql new file mode 100644 index 0000000..73f007a --- /dev/null +++ b/migrations/20240616000001_update_file_paths_to_structured_dirs.sql @@ -0,0 +1,17 @@ +-- Update existing file paths to use the new structured directory layout +-- This migration moves file paths from ./uploads/filename to ./uploads/documents/filename + +UPDATE documents +SET file_path = CASE + -- Update file paths that start with ./uploads/ but don't already have /documents/ + WHEN file_path LIKE './uploads/%' AND file_path NOT LIKE './uploads/documents/%' THEN + REPLACE(file_path, './uploads/', './uploads/documents/') + -- Update file paths that start with uploads/ but don't already have /documents/ + WHEN file_path LIKE 'uploads/%' AND file_path NOT LIKE 'uploads/documents/%' THEN + REPLACE(file_path, 'uploads/', 'uploads/documents/') + ELSE file_path +END +WHERE + (file_path LIKE './uploads/%' AND file_path NOT LIKE './uploads/documents/%') + OR + (file_path LIKE 'uploads/%' AND file_path NOT LIKE 'uploads/documents/%'); \ No newline at end of file diff --git a/src/db.rs b/src/db.rs index 1500b83..68ef522 100644 --- a/src/db.rs +++ b/src/db.rs @@ -2195,7 +2195,7 @@ impl Database { pub async fn update_source(&self, user_id: Uuid, source_id: Uuid, update: &crate::models::UpdateSource) -> Result { let mut query = String::from("UPDATE sources SET updated_at = NOW()"); - let mut bind_count = 1; + let mut bind_count = 0; if update.name.is_some() { bind_count += 1; diff --git a/src/enhanced_ocr.rs b/src/enhanced_ocr.rs index 51d6aae..357f4b5 100644 --- a/src/enhanced_ocr.rs +++ b/src/enhanced_ocr.rs @@ -15,6 +15,7 @@ use imageproc::{ use tesseract::{Tesseract, PageSegMode, OcrEngineMode}; use crate::models::Settings; +use crate::file_service::FileService; #[derive(Debug, Clone)] pub struct ImageQualityStats { @@ -31,15 +32,19 @@ pub struct OcrResult { pub processing_time_ms: u64, pub word_count: usize, pub preprocessing_applied: Vec, + pub processed_image_path: Option, } pub struct EnhancedOcrService { pub temp_dir: String, + pub file_service: FileService, } impl EnhancedOcrService { pub fn new(temp_dir: String) -> Self { - Self { temp_dir } + let upload_path = std::env::var("UPLOAD_PATH").unwrap_or_else(|_| "./uploads".to_string()); + let file_service = FileService::new(upload_path); + Self { temp_dir, file_service } } /// Extract text from image with high-quality OCR settings @@ -79,11 +84,6 @@ impl EnhancedOcrService { let (text, confidence) = ocr_result; - // Clean up temporary files if created - if processed_image_path != file_path { - let _ = tokio::fs::remove_file(&processed_image_path).await; - } - let processing_time = start_time.elapsed().as_millis() as u64; let word_count = text.split_whitespace().count(); @@ -92,19 +92,38 @@ impl EnhancedOcrService { word_count, confidence, processing_time ); - Ok(OcrResult { + // Return the processed image path if different from original (caller will handle cleanup/saving) + let result_processed_image_path = if processed_image_path != file_path { + Some(processed_image_path.clone()) + } else { + None + }; + + let result = OcrResult { text, confidence, processing_time_ms: processing_time, word_count, preprocessing_applied, - }) + processed_image_path: result_processed_image_path, + }; + + // Clean up temporary files if not saved for review + if let Some(ref temp_path) = result.processed_image_path { + if !settings.save_processed_images { + let _ = tokio::fs::remove_file(temp_path).await; + } + } + + Ok(result) } /// Preprocess image for optimal OCR quality, especially for challenging conditions #[cfg(feature = "ocr")] async fn preprocess_image(&self, input_path: &str, settings: &Settings) -> Result<(String, Vec)> { - let img = image::open(input_path)?; + // Resolve the file path first + let resolved_path = self.resolve_file_path(input_path).await?; + let img = image::open(&resolved_path)?; let mut processed_img = img; let mut preprocessing_applied = Vec::new(); @@ -741,16 +760,25 @@ impl EnhancedOcrService { processing_time_ms: processing_time, word_count, preprocessing_applied: vec!["PDF text extraction".to_string()], + processed_image_path: None, // No image processing for PDF text extraction }) } + /// Resolve file path to actual location, handling both old and new directory structures + async fn resolve_file_path(&self, file_path: &str) -> Result { + // Use the FileService's resolve_file_path method + self.file_service.resolve_file_path(file_path).await + } + /// Extract text from any supported file type pub async fn extract_text(&self, file_path: &str, mime_type: &str, settings: &Settings) -> Result { + // Resolve the actual file path + let resolved_path = self.resolve_file_path(file_path).await?; match mime_type { "application/pdf" => { #[cfg(feature = "ocr")] { - self.extract_text_from_pdf(file_path, settings).await + self.extract_text_from_pdf(&resolved_path, settings).await } #[cfg(not(feature = "ocr"))] { @@ -760,7 +788,7 @@ impl EnhancedOcrService { mime if mime.starts_with("image/") => { #[cfg(feature = "ocr")] { - self.extract_text_from_image(file_path, settings).await + self.extract_text_from_image(&resolved_path, settings).await } #[cfg(not(feature = "ocr"))] { @@ -769,7 +797,7 @@ impl EnhancedOcrService { } "text/plain" => { let start_time = std::time::Instant::now(); - let text = tokio::fs::read_to_string(file_path).await?; + let text = tokio::fs::read_to_string(&resolved_path).await?; let processing_time = start_time.elapsed().as_millis() as u64; let word_count = text.split_whitespace().count(); @@ -779,6 +807,7 @@ impl EnhancedOcrService { processing_time_ms: processing_time, word_count, preprocessing_applied: vec!["Plain text read".to_string()], + processed_image_path: None, // No image processing for plain text }) } _ => Err(anyhow::anyhow!("Unsupported file type: {}", mime_type)), diff --git a/src/file_service.rs b/src/file_service.rs index 08f75cc..04953ea 100644 --- a/src/file_service.rs +++ b/src/file_service.rs @@ -1,8 +1,9 @@ use anyhow::Result; use chrono::Utc; -use std::path::Path; +use std::path::{Path, PathBuf}; use tokio::fs; use uuid::Uuid; +use tracing::{info, warn, error}; use crate::models::Document; @@ -19,6 +20,106 @@ impl FileService { Self { upload_path } } + /// Initialize the upload directory structure + pub async fn initialize_directory_structure(&self) -> Result<()> { + let base_path = Path::new(&self.upload_path); + + // Create subdirectories for organized file storage + let directories = [ + "documents", // Final uploaded documents + "thumbnails", // Document thumbnails + "processed_images", // OCR processed images for review + "temp", // Temporary files during processing + "backups", // Document backups + ]; + + for dir in directories.iter() { + let dir_path = base_path.join(dir); + if let Err(e) = fs::create_dir_all(&dir_path).await { + error!("Failed to create directory {:?}: {}", dir_path, e); + return Err(anyhow::anyhow!("Failed to create directory structure: {}", e)); + } + info!("Ensured directory exists: {:?}", dir_path); + } + + Ok(()) + } + + /// Get the path for a specific subdirectory + pub fn get_subdirectory_path(&self, subdir: &str) -> PathBuf { + Path::new(&self.upload_path).join(subdir) + } + + /// Get the documents directory path + pub fn get_documents_path(&self) -> PathBuf { + self.get_subdirectory_path("documents") + } + + /// Get the thumbnails directory path + pub fn get_thumbnails_path(&self) -> PathBuf { + self.get_subdirectory_path("thumbnails") + } + + /// Get the processed images directory path + pub fn get_processed_images_path(&self) -> PathBuf { + self.get_subdirectory_path("processed_images") + } + + /// Get the temp directory path + pub fn get_temp_path(&self) -> PathBuf { + self.get_subdirectory_path("temp") + } + + /// Migrate existing files from the root upload directory to the structured format + pub async fn migrate_existing_files(&self) -> Result<()> { + let base_path = Path::new(&self.upload_path); + let documents_dir = self.get_documents_path(); + let thumbnails_dir = self.get_thumbnails_path(); + + info!("Starting migration of existing files to structured directories..."); + let mut migrated_count = 0; + let mut thumbnail_count = 0; + + // Read all files in the base upload directory + let mut entries = fs::read_dir(base_path).await?; + + while let Some(entry) = entries.next_entry().await? { + let file_path = entry.path(); + + // Skip directories and already structured subdirectories + if file_path.is_dir() { + continue; + } + + if let Some(filename) = file_path.file_name().and_then(|n| n.to_str()) { + // Handle thumbnail files + if filename.ends_with("_thumb.jpg") { + let new_path = thumbnails_dir.join(filename); + if let Err(e) = fs::rename(&file_path, &new_path).await { + warn!("Failed to migrate thumbnail {}: {}", filename, e); + } else { + thumbnail_count += 1; + info!("Migrated thumbnail: {} -> {:?}", filename, new_path); + } + } + // Handle regular document files + else { + let new_path = documents_dir.join(filename); + if let Err(e) = fs::rename(&file_path, &new_path).await { + warn!("Failed to migrate document {}: {}", filename, e); + } else { + migrated_count += 1; + info!("Migrated document: {} -> {:?}", filename, new_path); + } + } + } + } + + info!("Migration completed: {} documents, {} thumbnails moved to structured directories", + migrated_count, thumbnail_count); + Ok(()) + } + pub async fn save_file(&self, filename: &str, data: &[u8]) -> Result { let file_id = Uuid::new_v4(); let extension = Path::new(filename) @@ -32,10 +133,14 @@ impl FileService { format!("{}.{}", file_id, extension) }; - let file_path = Path::new(&self.upload_path).join(&saved_filename); + // Save to documents subdirectory + let documents_dir = self.get_documents_path(); + let file_path = documents_dir.join(&saved_filename); - if let Some(parent) = file_path.parent() { - fs::create_dir_all(parent).await?; + // Ensure the documents directory exists + if let Err(e) = fs::create_dir_all(&documents_dir).await { + error!("Failed to create documents directory: {}", e); + return Err(anyhow::anyhow!("Failed to create documents directory: {}", e)); } fs::write(&file_path, data).await?; @@ -86,17 +191,50 @@ impl FileService { } } + /// Resolve file path to actual location, handling both old and new directory structures + pub async fn resolve_file_path(&self, file_path: &str) -> Result { + // If the file exists at the given path, use it + if Path::new(file_path).exists() { + return Ok(file_path.to_string()); + } + + // Try to find the file in the new structured directory + if file_path.starts_with("./uploads/") && !file_path.contains("/documents/") { + let new_path = file_path.replace("./uploads/", "./uploads/documents/"); + if Path::new(&new_path).exists() { + info!("Found file in new structured directory: {} -> {}", file_path, new_path); + return Ok(new_path); + } + } + + // Try without the ./ prefix + if file_path.starts_with("uploads/") && !file_path.contains("/documents/") { + let new_path = file_path.replace("uploads/", "uploads/documents/"); + if Path::new(&new_path).exists() { + info!("Found file in new structured directory: {} -> {}", file_path, new_path); + return Ok(new_path); + } + } + + // File not found in any expected location + Err(anyhow::anyhow!("File not found: {} (checked original path and structured directory)", file_path)) + } + pub async fn read_file(&self, file_path: &str) -> Result> { - let data = fs::read(file_path).await?; + let resolved_path = self.resolve_file_path(file_path).await?; + let data = fs::read(&resolved_path).await?; Ok(data) } #[cfg(feature = "ocr")] pub async fn get_or_generate_thumbnail(&self, file_path: &str, filename: &str) -> Result> { - // Create thumbnails directory if it doesn't exist - let thumbnails_dir = Path::new(&self.upload_path).join("thumbnails"); + // Use the structured thumbnails directory + let thumbnails_dir = self.get_thumbnails_path(); if !thumbnails_dir.exists() { - fs::create_dir_all(&thumbnails_dir).await?; + if let Err(e) = fs::create_dir_all(&thumbnails_dir).await { + error!("Failed to create thumbnails directory: {}", e); + return Err(anyhow::anyhow!("Failed to create thumbnails directory: {}", e)); + } } // Generate thumbnail filename based on original file path @@ -111,8 +249,9 @@ impl FileService { return self.read_file(&thumbnail_path.to_string_lossy()).await; } - // Generate thumbnail - let thumbnail_data = self.generate_thumbnail(file_path, filename).await?; + // Resolve file path and generate thumbnail + let resolved_path = self.resolve_file_path(file_path).await?; + let thumbnail_data = self.generate_thumbnail(&resolved_path, filename).await?; // Save thumbnail to cache fs::write(&thumbnail_path, &thumbnail_data).await?; diff --git a/src/main.rs b/src/main.rs index dd2fd56..89dc635 100644 --- a/src/main.rs +++ b/src/main.rs @@ -20,6 +20,23 @@ async fn main() -> Result<(), Box> { tracing_subscriber::fmt::init(); let config = Config::from_env()?; + + // Initialize upload directory structure + info!("Initializing upload directory structure..."); + let file_service = readur::file_service::FileService::new(config.upload_path.clone()); + if let Err(e) = file_service.initialize_directory_structure().await { + error!("Failed to initialize directory structure: {}", e); + return Err(e.into()); + } + info!("✅ Upload directory structure initialized"); + + // Migrate existing files to new structure (one-time operation) + info!("Migrating existing files to structured directories..."); + if let Err(e) = file_service.migrate_existing_files().await { + warn!("Failed to migrate some existing files: {}", e); + // Don't fail startup for migration issues + } + // Create separate database pools for different workloads let web_db = Database::new_with_pool_config(&config.database_url, 20, 2).await?; // Web UI pool let background_db = Database::new_with_pool_config(&config.database_url, 30, 3).await?; // Background operations pool diff --git a/src/ocr_queue.rs b/src/ocr_queue.rs index 8c73325..4f5cd2c 100644 --- a/src/ocr_queue.rs +++ b/src/ocr_queue.rs @@ -363,6 +363,31 @@ impl OcrQueueService { } } + // Save processed image if setting is enabled and image was processed + if settings.save_processed_images { + if let Some(ref processed_image_path) = ocr_result.processed_image_path { + match self.save_processed_image_for_review( + item.document_id, + user_id.unwrap_or_default(), + &file_path, + processed_image_path, + &ocr_result.preprocessing_applied, + ).await { + Ok(_) => { + info!("✅ Saved processed image for document {} for review", item.document_id); + } + Err(e) => { + warn!("Failed to save processed image for document {}: {}", item.document_id, e); + } + } + } + } + + // Clean up temporary processed image file if it exists + if let Some(ref temp_path) = ocr_result.processed_image_path { + let _ = tokio::fs::remove_file(temp_path).await; + } + let processing_time_ms = start_time.elapsed().as_millis() as i32; self.mark_completed(item.id, processing_time_ms).await?; @@ -481,6 +506,100 @@ impl OcrQueueService { } } + /// Save processed image for review when the setting is enabled + async fn save_processed_image_for_review( + &self, + document_id: Uuid, + user_id: Uuid, + original_image_path: &str, + processed_image_path: &str, + processing_steps: &[String], + ) -> Result<()> { + use std::path::Path; + + // Use the FileService to get the proper processed images directory + use crate::file_service::FileService; + let base_upload_dir = std::env::var("UPLOAD_PATH").unwrap_or_else(|_| "uploads".to_string()); + let file_service = FileService::new(base_upload_dir); + let processed_images_dir = file_service.get_processed_images_path(); + + // Ensure the directory exists with proper error handling + if let Err(e) = tokio::fs::create_dir_all(&processed_images_dir).await { + error!("Failed to create processed images directory {:?}: {}", processed_images_dir, e); + return Err(anyhow::anyhow!("Failed to create processed images directory: {}", e)); + } + + info!("Ensured processed images directory exists: {:?}", processed_images_dir); + + // Generate a unique filename for the processed image + let file_stem = Path::new(processed_image_path) + .file_stem() + .and_then(|s| s.to_str()) + .unwrap_or("processed"); + let extension = Path::new(processed_image_path) + .extension() + .and_then(|s| s.to_str()) + .unwrap_or("jpg"); + + let permanent_filename = format!("{}_processed_{}.{}", document_id, chrono::Utc::now().timestamp(), extension); + let permanent_path = processed_images_dir.join(&permanent_filename); + + // Verify source file exists before copying + if !Path::new(processed_image_path).exists() { + return Err(anyhow::anyhow!("Source processed image file does not exist: {}", processed_image_path)); + } + + // Copy the processed image to permanent location with error handling + if let Err(e) = tokio::fs::copy(processed_image_path, &permanent_path).await { + error!("Failed to copy processed image from {} to {:?}: {}", processed_image_path, permanent_path, e); + return Err(anyhow::anyhow!("Failed to copy processed image: {}", e)); + } + + info!("Successfully copied processed image to: {:?}", permanent_path); + + // Save to database + let processing_parameters = serde_json::json!({ + "steps": processing_steps, + "timestamp": chrono::Utc::now(), + "original_path": original_image_path, + }); + + // Save metadata to database with error handling + if let Err(e) = sqlx::query( + r#" + INSERT INTO processed_images (document_id, user_id, original_image_path, processed_image_path, processing_parameters, processing_steps, created_at) + VALUES ($1, $2, $3, $4, $5, $6, NOW()) + ON CONFLICT (document_id) + DO UPDATE SET + processed_image_path = EXCLUDED.processed_image_path, + processing_parameters = EXCLUDED.processing_parameters, + processing_steps = EXCLUDED.processing_steps, + created_at = NOW() + "# + ) + .bind(document_id) + .bind(user_id) + .bind(original_image_path) + .bind(permanent_path.to_string_lossy().as_ref()) + .bind(&processing_parameters) + .bind(processing_steps) + .execute(&self.pool) + .await { + error!("Failed to save processed image metadata to database for document {}: {}", document_id, e); + + // Clean up the copied file if database save fails + if let Err(cleanup_err) = tokio::fs::remove_file(&permanent_path).await { + warn!("Failed to clean up processed image file after database error: {}", cleanup_err); + } + + return Err(anyhow::anyhow!("Failed to save processed image metadata: {}", e)); + } + + info!("Successfully saved processed image metadata for document {} to database", document_id); + + Ok(()) + } + /// Get queue statistics pub async fn get_stats(&self) -> Result { let stats = sqlx::query(