feat(server): create folders within 'upload' path to manage thumbnails, processed images, etc.

This commit is contained in:
perf3ct 2025-06-16 21:24:46 +00:00
parent 54a5e88ae3
commit 0ccceb768a
No known key found for this signature in database
GPG Key ID: 569C4EEC436F5232
6 changed files with 344 additions and 23 deletions

View File

@ -0,0 +1,17 @@
-- Update existing file paths to use the new structured directory layout.
-- This migration moves file paths from ./uploads/filename to ./uploads/documents/filename
-- (and from uploads/filename to uploads/documents/filename).
--
-- Only the leading prefix is rewritten: the new prefix is concatenated with the
-- remainder of the path. REPLACE() is deliberately avoided because it substitutes
-- every occurrence of the pattern, which would corrupt a path such as
-- 'uploads/old_uploads/scan.pdf' into 'uploads/documents/old_uploads/documents/scan.pdf'.
UPDATE documents
SET file_path = CASE
    -- Paths that start with ./uploads/ but are not already under ./uploads/documents/
    WHEN file_path LIKE './uploads/%' AND file_path NOT LIKE './uploads/documents/%' THEN
        './uploads/documents/' || SUBSTRING(file_path FROM LENGTH('./uploads/') + 1)
    -- Paths that start with uploads/ but are not already under uploads/documents/
    WHEN file_path LIKE 'uploads/%' AND file_path NOT LIKE 'uploads/documents/%' THEN
        'uploads/documents/' || SUBSTRING(file_path FROM LENGTH('uploads/') + 1)
    ELSE file_path
END
-- Restrict the UPDATE to rows that actually need rewriting so untouched rows are not re-written.
WHERE
    (file_path LIKE './uploads/%' AND file_path NOT LIKE './uploads/documents/%')
    OR
    (file_path LIKE 'uploads/%' AND file_path NOT LIKE 'uploads/documents/%');

View File

@ -2195,7 +2195,7 @@ impl Database {
pub async fn update_source(&self, user_id: Uuid, source_id: Uuid, update: &crate::models::UpdateSource) -> Result<crate::models::Source> {
let mut query = String::from("UPDATE sources SET updated_at = NOW()");
let mut bind_count = 1;
let mut bind_count = 0;
if update.name.is_some() {
bind_count += 1;

View File

@ -15,6 +15,7 @@ use imageproc::{
use tesseract::{Tesseract, PageSegMode, OcrEngineMode};
use crate::models::Settings;
use crate::file_service::FileService;
#[derive(Debug, Clone)]
pub struct ImageQualityStats {
@ -31,15 +32,19 @@ pub struct OcrResult {
pub processing_time_ms: u64,
pub word_count: usize,
pub preprocessing_applied: Vec<String>,
pub processed_image_path: Option<String>,
}
/// OCR service that extracts text from files and carries the helpers it needs to locate them.
pub struct EnhancedOcrService {
    /// Directory path handed to `new` by the caller.
    /// NOTE(review): presumably where intermediate/preprocessed images are written — confirm.
    pub temp_dir: String,
    /// File-storage helper used to resolve stored file paths to their on-disk location.
    pub file_service: FileService,
}
impl EnhancedOcrService {
pub fn new(temp_dir: String) -> Self {
Self { temp_dir }
let upload_path = std::env::var("UPLOAD_PATH").unwrap_or_else(|_| "./uploads".to_string());
let file_service = FileService::new(upload_path);
Self { temp_dir, file_service }
}
/// Extract text from image with high-quality OCR settings
@ -79,11 +84,6 @@ impl EnhancedOcrService {
let (text, confidence) = ocr_result;
// Clean up temporary files if created
if processed_image_path != file_path {
let _ = tokio::fs::remove_file(&processed_image_path).await;
}
let processing_time = start_time.elapsed().as_millis() as u64;
let word_count = text.split_whitespace().count();
@ -92,19 +92,38 @@ impl EnhancedOcrService {
word_count, confidence, processing_time
);
Ok(OcrResult {
// Return the processed image path if different from original (caller will handle cleanup/saving)
let result_processed_image_path = if processed_image_path != file_path {
Some(processed_image_path.clone())
} else {
None
};
let result = OcrResult {
text,
confidence,
processing_time_ms: processing_time,
word_count,
preprocessing_applied,
})
processed_image_path: result_processed_image_path,
};
// Clean up temporary files if not saved for review
if let Some(ref temp_path) = result.processed_image_path {
if !settings.save_processed_images {
let _ = tokio::fs::remove_file(temp_path).await;
}
}
Ok(result)
}
/// Preprocess image for optimal OCR quality, especially for challenging conditions
#[cfg(feature = "ocr")]
async fn preprocess_image(&self, input_path: &str, settings: &Settings) -> Result<(String, Vec<String>)> {
let img = image::open(input_path)?;
// Resolve the file path first
let resolved_path = self.resolve_file_path(input_path).await?;
let img = image::open(&resolved_path)?;
let mut processed_img = img;
let mut preprocessing_applied = Vec::new();
@ -741,16 +760,25 @@ impl EnhancedOcrService {
processing_time_ms: processing_time,
word_count,
preprocessing_applied: vec!["PDF text extraction".to_string()],
processed_image_path: None, // No image processing for PDF text extraction
})
}
/// Map a stored file path onto the location where the file currently lives.
///
/// This is a thin wrapper: the lookup itself is delegated to the embedded
/// `FileService`, and any error it reports is passed through unchanged.
async fn resolve_file_path(&self, file_path: &str) -> Result<String> {
    let located = self.file_service.resolve_file_path(file_path).await?;
    Ok(located)
}
/// Extract text from any supported file type
pub async fn extract_text(&self, file_path: &str, mime_type: &str, settings: &Settings) -> Result<OcrResult> {
// Resolve the actual file path
let resolved_path = self.resolve_file_path(file_path).await?;
match mime_type {
"application/pdf" => {
#[cfg(feature = "ocr")]
{
self.extract_text_from_pdf(file_path, settings).await
self.extract_text_from_pdf(&resolved_path, settings).await
}
#[cfg(not(feature = "ocr"))]
{
@ -760,7 +788,7 @@ impl EnhancedOcrService {
mime if mime.starts_with("image/") => {
#[cfg(feature = "ocr")]
{
self.extract_text_from_image(file_path, settings).await
self.extract_text_from_image(&resolved_path, settings).await
}
#[cfg(not(feature = "ocr"))]
{
@ -769,7 +797,7 @@ impl EnhancedOcrService {
}
"text/plain" => {
let start_time = std::time::Instant::now();
let text = tokio::fs::read_to_string(file_path).await?;
let text = tokio::fs::read_to_string(&resolved_path).await?;
let processing_time = start_time.elapsed().as_millis() as u64;
let word_count = text.split_whitespace().count();
@ -779,6 +807,7 @@ impl EnhancedOcrService {
processing_time_ms: processing_time,
word_count,
preprocessing_applied: vec!["Plain text read".to_string()],
processed_image_path: None, // No image processing for plain text
})
}
_ => Err(anyhow::anyhow!("Unsupported file type: {}", mime_type)),

View File

@ -1,8 +1,9 @@
use anyhow::Result;
use chrono::Utc;
use std::path::Path;
use std::path::{Path, PathBuf};
use tokio::fs;
use uuid::Uuid;
use tracing::{info, warn, error};
use crate::models::Document;
@ -19,6 +20,106 @@ impl FileService {
Self { upload_path }
}
/// Initialize the upload directory structure
pub async fn initialize_directory_structure(&self) -> Result<()> {
let base_path = Path::new(&self.upload_path);
// Create subdirectories for organized file storage
let directories = [
"documents", // Final uploaded documents
"thumbnails", // Document thumbnails
"processed_images", // OCR processed images for review
"temp", // Temporary files during processing
"backups", // Document backups
];
for dir in directories.iter() {
let dir_path = base_path.join(dir);
if let Err(e) = fs::create_dir_all(&dir_path).await {
error!("Failed to create directory {:?}: {}", dir_path, e);
return Err(anyhow::anyhow!("Failed to create directory structure: {}", e));
}
info!("Ensured directory exists: {:?}", dir_path);
}
Ok(())
}
/// Get the path for a specific subdirectory
pub fn get_subdirectory_path(&self, subdir: &str) -> PathBuf {
Path::new(&self.upload_path).join(subdir)
}
/// Get the documents directory path
pub fn get_documents_path(&self) -> PathBuf {
self.get_subdirectory_path("documents")
}
/// Get the thumbnails directory path
pub fn get_thumbnails_path(&self) -> PathBuf {
self.get_subdirectory_path("thumbnails")
}
/// Get the processed images directory path
pub fn get_processed_images_path(&self) -> PathBuf {
self.get_subdirectory_path("processed_images")
}
/// Get the temp directory path
pub fn get_temp_path(&self) -> PathBuf {
self.get_subdirectory_path("temp")
}
/// Migrate existing files from the root upload directory to the structured format.
///
/// Every non-directory entry found directly in the upload root is moved:
/// names ending in `_thumb.jpg` go to the thumbnails directory, everything else
/// goes to the documents directory. Subdirectories are left alone, and entries
/// whose name is not valid UTF-8 are skipped.
///
/// A file is never moved on top of an existing destination file; such entries
/// are logged and left in place. Individual move failures are logged and do not
/// abort the migration.
///
/// # Errors
/// Returns an error only if the upload root cannot be opened or iterated.
pub async fn migrate_existing_files(&self) -> Result<()> {
    let base_path = Path::new(&self.upload_path);
    let documents_dir = self.get_documents_path();
    let thumbnails_dir = self.get_thumbnails_path();

    info!("Starting migration of existing files to structured directories...");

    let mut migrated_count = 0;
    let mut thumbnail_count = 0;

    // Read all entries in the base upload directory
    let mut entries = fs::read_dir(base_path).await?;
    while let Some(entry) = entries.next_entry().await? {
        let file_path = entry.path();

        // Skip directories (including the structured subdirectories themselves).
        // `fs::metadata` follows symlinks like `Path::is_dir` does, but does not
        // block the async runtime; an unreadable entry is treated as "not a directory".
        let is_dir = fs::metadata(&file_path)
            .await
            .map(|meta| meta.is_dir())
            .unwrap_or(false);
        if is_dir {
            continue;
        }

        let filename = match file_path.file_name().and_then(|n| n.to_str()) {
            Some(name) => name,
            None => continue,
        };

        let is_thumbnail = filename.ends_with("_thumb.jpg");
        let new_path = if is_thumbnail {
            thumbnails_dir.join(filename)
        } else {
            documents_dir.join(filename)
        };

        // `rename` silently replaces an existing destination on most platforms,
        // which would destroy a file that is already in the structured layout.
        if fs::metadata(&new_path).await.is_ok() {
            warn!("Skipping migration of {}: destination {:?} already exists", filename, new_path);
            continue;
        }

        match (fs::rename(&file_path, &new_path).await, is_thumbnail) {
            (Err(e), true) => warn!("Failed to migrate thumbnail {}: {}", filename, e),
            (Err(e), false) => warn!("Failed to migrate document {}: {}", filename, e),
            (Ok(()), true) => {
                thumbnail_count += 1;
                info!("Migrated thumbnail: {} -> {:?}", filename, new_path);
            }
            (Ok(()), false) => {
                migrated_count += 1;
                info!("Migrated document: {} -> {:?}", filename, new_path);
            }
        }
    }

    info!("Migration completed: {} documents, {} thumbnails moved to structured directories",
        migrated_count, thumbnail_count);
    Ok(())
}
pub async fn save_file(&self, filename: &str, data: &[u8]) -> Result<String> {
let file_id = Uuid::new_v4();
let extension = Path::new(filename)
@ -32,10 +133,14 @@ impl FileService {
format!("{}.{}", file_id, extension)
};
let file_path = Path::new(&self.upload_path).join(&saved_filename);
// Save to documents subdirectory
let documents_dir = self.get_documents_path();
let file_path = documents_dir.join(&saved_filename);
if let Some(parent) = file_path.parent() {
fs::create_dir_all(parent).await?;
// Ensure the documents directory exists
if let Err(e) = fs::create_dir_all(&documents_dir).await {
error!("Failed to create documents directory: {}", e);
return Err(anyhow::anyhow!("Failed to create documents directory: {}", e));
}
fs::write(&file_path, data).await?;
@ -86,17 +191,50 @@ impl FileService {
}
}
/// Resolve file path to actual location, handling both old and new directory structures
pub async fn resolve_file_path(&self, file_path: &str) -> Result<String> {
// If the file exists at the given path, use it
if Path::new(file_path).exists() {
return Ok(file_path.to_string());
}
// Try to find the file in the new structured directory
if file_path.starts_with("./uploads/") && !file_path.contains("/documents/") {
let new_path = file_path.replace("./uploads/", "./uploads/documents/");
if Path::new(&new_path).exists() {
info!("Found file in new structured directory: {} -> {}", file_path, new_path);
return Ok(new_path);
}
}
// Try without the ./ prefix
if file_path.starts_with("uploads/") && !file_path.contains("/documents/") {
let new_path = file_path.replace("uploads/", "uploads/documents/");
if Path::new(&new_path).exists() {
info!("Found file in new structured directory: {} -> {}", file_path, new_path);
return Ok(new_path);
}
}
// File not found in any expected location
Err(anyhow::anyhow!("File not found: {} (checked original path and structured directory)", file_path))
}
pub async fn read_file(&self, file_path: &str) -> Result<Vec<u8>> {
let data = fs::read(file_path).await?;
let resolved_path = self.resolve_file_path(file_path).await?;
let data = fs::read(&resolved_path).await?;
Ok(data)
}
#[cfg(feature = "ocr")]
pub async fn get_or_generate_thumbnail(&self, file_path: &str, filename: &str) -> Result<Vec<u8>> {
// Create thumbnails directory if it doesn't exist
let thumbnails_dir = Path::new(&self.upload_path).join("thumbnails");
// Use the structured thumbnails directory
let thumbnails_dir = self.get_thumbnails_path();
if !thumbnails_dir.exists() {
fs::create_dir_all(&thumbnails_dir).await?;
if let Err(e) = fs::create_dir_all(&thumbnails_dir).await {
error!("Failed to create thumbnails directory: {}", e);
return Err(anyhow::anyhow!("Failed to create thumbnails directory: {}", e));
}
}
// Generate thumbnail filename based on original file path
@ -111,8 +249,9 @@ impl FileService {
return self.read_file(&thumbnail_path.to_string_lossy()).await;
}
// Generate thumbnail
let thumbnail_data = self.generate_thumbnail(file_path, filename).await?;
// Resolve file path and generate thumbnail
let resolved_path = self.resolve_file_path(file_path).await?;
let thumbnail_data = self.generate_thumbnail(&resolved_path, filename).await?;
// Save thumbnail to cache
fs::write(&thumbnail_path, &thumbnail_data).await?;

View File

@ -20,6 +20,23 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
tracing_subscriber::fmt::init();
let config = Config::from_env()?;
// Initialize upload directory structure
info!("Initializing upload directory structure...");
let file_service = readur::file_service::FileService::new(config.upload_path.clone());
if let Err(e) = file_service.initialize_directory_structure().await {
error!("Failed to initialize directory structure: {}", e);
return Err(e.into());
}
info!("✅ Upload directory structure initialized");
// Migrate existing files to new structure (one-time operation)
info!("Migrating existing files to structured directories...");
if let Err(e) = file_service.migrate_existing_files().await {
warn!("Failed to migrate some existing files: {}", e);
// Don't fail startup for migration issues
}
// Create separate database pools for different workloads
let web_db = Database::new_with_pool_config(&config.database_url, 20, 2).await?; // Web UI pool
let background_db = Database::new_with_pool_config(&config.database_url, 30, 3).await?; // Background operations pool

View File

@ -363,6 +363,31 @@ impl OcrQueueService {
}
}
// Save processed image if setting is enabled and image was processed
if settings.save_processed_images {
if let Some(ref processed_image_path) = ocr_result.processed_image_path {
match self.save_processed_image_for_review(
item.document_id,
user_id.unwrap_or_default(),
&file_path,
processed_image_path,
&ocr_result.preprocessing_applied,
).await {
Ok(_) => {
info!("✅ Saved processed image for document {} for review", item.document_id);
}
Err(e) => {
warn!("Failed to save processed image for document {}: {}", item.document_id, e);
}
}
}
}
// Clean up temporary processed image file if it exists
if let Some(ref temp_path) = ocr_result.processed_image_path {
let _ = tokio::fs::remove_file(temp_path).await;
}
let processing_time_ms = start_time.elapsed().as_millis() as i32;
self.mark_completed(item.id, processing_time_ms).await?;
@ -481,6 +506,100 @@ impl OcrQueueService {
}
}
/// Save processed image for review when the setting is enabled.
///
/// Copies the preprocessed image into the `processed_images` directory beneath the
/// upload root and upserts a row in the `processed_images` table keyed by
/// `document_id`. If the database write fails, the freshly copied file is removed
/// again on a best-effort basis. When an existing row is updated, the file it
/// previously pointed to is not deleted by this function.
///
/// # Parameters
/// - `document_id`: document the image belongs to; also used in the stored filename.
/// - `user_id`: owner recorded alongside the image.
/// - `original_image_path`: path of the source image, stored as metadata only.
/// - `processed_image_path`: path of the preprocessed image to copy.
/// - `processing_steps`: names of the preprocessing steps that were applied.
///
/// # Errors
/// Returns an error if the target directory cannot be created, the source image
/// is missing, the copy fails, or the metadata cannot be written to the database.
async fn save_processed_image_for_review(
    &self,
    document_id: Uuid,
    user_id: Uuid,
    original_image_path: &str,
    processed_image_path: &str,
    processing_steps: &[String],
) -> Result<()> {
    use std::path::Path;

    // Use the FileService to get the proper processed images directory
    use crate::file_service::FileService;
    // NOTE(review): the fallback here is "uploads", whereas other call sites fall back
    // to "./uploads" — confirm which default is intended and unify them.
    let base_upload_dir = std::env::var("UPLOAD_PATH").unwrap_or_else(|_| "uploads".to_string());
    let file_service = FileService::new(base_upload_dir);
    let processed_images_dir = file_service.get_processed_images_path();

    // Ensure the directory exists with proper error handling
    if let Err(e) = tokio::fs::create_dir_all(&processed_images_dir).await {
        error!("Failed to create processed images directory {:?}: {}", processed_images_dir, e);
        return Err(anyhow::anyhow!("Failed to create processed images directory: {}", e));
    }

    info!("Ensured processed images directory exists: {:?}", processed_images_dir);

    // Generate a unique filename for the processed image, keeping the source extension
    let extension = Path::new(processed_image_path)
        .extension()
        .and_then(|s| s.to_str())
        .unwrap_or("jpg");

    let permanent_filename = format!("{}_processed_{}.{}", document_id, chrono::Utc::now().timestamp(), extension);
    let permanent_path = processed_images_dir.join(&permanent_filename);

    // Verify source file exists before copying (async check; does not block the runtime)
    if tokio::fs::metadata(processed_image_path).await.is_err() {
        return Err(anyhow::anyhow!("Source processed image file does not exist: {}", processed_image_path));
    }

    // Copy the processed image to permanent location with error handling
    if let Err(e) = tokio::fs::copy(processed_image_path, &permanent_path).await {
        error!("Failed to copy processed image from {} to {:?}: {}", processed_image_path, permanent_path, e);
        return Err(anyhow::anyhow!("Failed to copy processed image: {}", e));
    }

    info!("Successfully copied processed image to: {:?}", permanent_path);

    // Metadata stored alongside the image
    let processing_parameters = serde_json::json!({
        "steps": processing_steps,
        "timestamp": chrono::Utc::now(),
        "original_path": original_image_path,
    });

    // Save metadata to database with error handling
    if let Err(e) = sqlx::query(
        r#"
        INSERT INTO processed_images (document_id, user_id, original_image_path, processed_image_path, processing_parameters, processing_steps, created_at)
        VALUES ($1, $2, $3, $4, $5, $6, NOW())
        ON CONFLICT (document_id)
        DO UPDATE SET
        processed_image_path = EXCLUDED.processed_image_path,
        processing_parameters = EXCLUDED.processing_parameters,
        processing_steps = EXCLUDED.processing_steps,
        created_at = NOW()
        "#
    )
    .bind(document_id)
    .bind(user_id)
    .bind(original_image_path)
    .bind(permanent_path.to_string_lossy().as_ref())
    .bind(&processing_parameters)
    .bind(processing_steps)
    .execute(&self.pool)
    .await {
        error!("Failed to save processed image metadata to database for document {}: {}", document_id, e);

        // Clean up the copied file if database save fails
        if let Err(cleanup_err) = tokio::fs::remove_file(&permanent_path).await {
            warn!("Failed to clean up processed image file after database error: {}", cleanup_err);
        }

        return Err(anyhow::anyhow!("Failed to save processed image metadata: {}", e));
    }

    info!("Successfully saved processed image metadata for document {} to database", document_id);
    Ok(())
}
/// Get queue statistics
pub async fn get_stats(&self) -> Result<QueueStats> {
let stats = sqlx::query(