feat(server): create folders within 'upload' path to manage thumbnails, processed images, etc.
This commit is contained in:
parent
f361dd4e7c
commit
c656a96d91
|
|
@ -0,0 +1,17 @@
|
|||
-- Update existing file paths to use the new structured directory layout
-- This migration moves file paths from ./uploads/filename to ./uploads/documents/filename
--
-- Only the LEADING directory prefix is rewritten. REPLACE() substitutes every
-- occurrence of the search text, so a path such as 'uploads/my_uploads/a.pdf'
-- would be corrupted into 'uploads/documents/my_uploads/documents/a.pdf'.
-- Re-assembling the value from the new prefix plus the remainder of the
-- original path avoids that.
UPDATE documents
SET file_path = CASE
    -- Update file paths that start with ./uploads/ but don't already have /documents/
    WHEN file_path LIKE './uploads/%' AND file_path NOT LIKE './uploads/documents/%' THEN
        './uploads/documents/' || SUBSTR(file_path, LENGTH('./uploads/') + 1)
    -- Update file paths that start with uploads/ but don't already have /documents/
    WHEN file_path LIKE 'uploads/%' AND file_path NOT LIKE 'uploads/documents/%' THEN
        'uploads/documents/' || SUBSTR(file_path, LENGTH('uploads/') + 1)
    ELSE file_path
END
WHERE
    (file_path LIKE './uploads/%' AND file_path NOT LIKE './uploads/documents/%')
    OR
    (file_path LIKE 'uploads/%' AND file_path NOT LIKE 'uploads/documents/%');
|
||||
|
|
@ -2195,7 +2195,7 @@ impl Database {
|
|||
|
||||
pub async fn update_source(&self, user_id: Uuid, source_id: Uuid, update: &crate::models::UpdateSource) -> Result<crate::models::Source> {
|
||||
let mut query = String::from("UPDATE sources SET updated_at = NOW()");
|
||||
let mut bind_count = 1;
|
||||
let mut bind_count = 0;
|
||||
|
||||
if update.name.is_some() {
|
||||
bind_count += 1;
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ use imageproc::{
|
|||
use tesseract::{Tesseract, PageSegMode, OcrEngineMode};
|
||||
|
||||
use crate::models::Settings;
|
||||
use crate::file_service::FileService;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ImageQualityStats {
|
||||
|
|
@ -31,15 +32,19 @@ pub struct OcrResult {
|
|||
pub processing_time_ms: u64,
|
||||
pub word_count: usize,
|
||||
pub preprocessing_applied: Vec<String>,
|
||||
pub processed_image_path: Option<String>,
|
||||
}
|
||||
|
||||
pub struct EnhancedOcrService {
|
||||
pub temp_dir: String,
|
||||
pub file_service: FileService,
|
||||
}
|
||||
|
||||
impl EnhancedOcrService {
|
||||
pub fn new(temp_dir: String) -> Self {
|
||||
Self { temp_dir }
|
||||
let upload_path = std::env::var("UPLOAD_PATH").unwrap_or_else(|_| "./uploads".to_string());
|
||||
let file_service = FileService::new(upload_path);
|
||||
Self { temp_dir, file_service }
|
||||
}
|
||||
|
||||
/// Extract text from image with high-quality OCR settings
|
||||
|
|
@ -79,11 +84,6 @@ impl EnhancedOcrService {
|
|||
|
||||
let (text, confidence) = ocr_result;
|
||||
|
||||
// Clean up temporary files if created
|
||||
if processed_image_path != file_path {
|
||||
let _ = tokio::fs::remove_file(&processed_image_path).await;
|
||||
}
|
||||
|
||||
let processing_time = start_time.elapsed().as_millis() as u64;
|
||||
let word_count = text.split_whitespace().count();
|
||||
|
||||
|
|
@ -92,19 +92,38 @@ impl EnhancedOcrService {
|
|||
word_count, confidence, processing_time
|
||||
);
|
||||
|
||||
Ok(OcrResult {
|
||||
// Return the processed image path if different from original (caller will handle cleanup/saving)
|
||||
let result_processed_image_path = if processed_image_path != file_path {
|
||||
Some(processed_image_path.clone())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let result = OcrResult {
|
||||
text,
|
||||
confidence,
|
||||
processing_time_ms: processing_time,
|
||||
word_count,
|
||||
preprocessing_applied,
|
||||
})
|
||||
processed_image_path: result_processed_image_path,
|
||||
};
|
||||
|
||||
// Clean up temporary files if not saved for review
|
||||
if let Some(ref temp_path) = result.processed_image_path {
|
||||
if !settings.save_processed_images {
|
||||
let _ = tokio::fs::remove_file(temp_path).await;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Preprocess image for optimal OCR quality, especially for challenging conditions
|
||||
#[cfg(feature = "ocr")]
|
||||
async fn preprocess_image(&self, input_path: &str, settings: &Settings) -> Result<(String, Vec<String>)> {
|
||||
let img = image::open(input_path)?;
|
||||
// Resolve the file path first
|
||||
let resolved_path = self.resolve_file_path(input_path).await?;
|
||||
let img = image::open(&resolved_path)?;
|
||||
let mut processed_img = img;
|
||||
let mut preprocessing_applied = Vec::new();
|
||||
|
||||
|
|
@ -741,16 +760,25 @@ impl EnhancedOcrService {
|
|||
processing_time_ms: processing_time,
|
||||
word_count,
|
||||
preprocessing_applied: vec!["PDF text extraction".to_string()],
|
||||
processed_image_path: None, // No image processing for PDF text extraction
|
||||
})
|
||||
}
|
||||
|
||||
/// Resolve file path to actual location, handling both old and new directory structures
|
||||
async fn resolve_file_path(&self, file_path: &str) -> Result<String> {
|
||||
// Use the FileService's resolve_file_path method
|
||||
self.file_service.resolve_file_path(file_path).await
|
||||
}
|
||||
|
||||
/// Extract text from any supported file type
|
||||
pub async fn extract_text(&self, file_path: &str, mime_type: &str, settings: &Settings) -> Result<OcrResult> {
|
||||
// Resolve the actual file path
|
||||
let resolved_path = self.resolve_file_path(file_path).await?;
|
||||
match mime_type {
|
||||
"application/pdf" => {
|
||||
#[cfg(feature = "ocr")]
|
||||
{
|
||||
self.extract_text_from_pdf(file_path, settings).await
|
||||
self.extract_text_from_pdf(&resolved_path, settings).await
|
||||
}
|
||||
#[cfg(not(feature = "ocr"))]
|
||||
{
|
||||
|
|
@ -760,7 +788,7 @@ impl EnhancedOcrService {
|
|||
mime if mime.starts_with("image/") => {
|
||||
#[cfg(feature = "ocr")]
|
||||
{
|
||||
self.extract_text_from_image(file_path, settings).await
|
||||
self.extract_text_from_image(&resolved_path, settings).await
|
||||
}
|
||||
#[cfg(not(feature = "ocr"))]
|
||||
{
|
||||
|
|
@ -769,7 +797,7 @@ impl EnhancedOcrService {
|
|||
}
|
||||
"text/plain" => {
|
||||
let start_time = std::time::Instant::now();
|
||||
let text = tokio::fs::read_to_string(file_path).await?;
|
||||
let text = tokio::fs::read_to_string(&resolved_path).await?;
|
||||
let processing_time = start_time.elapsed().as_millis() as u64;
|
||||
let word_count = text.split_whitespace().count();
|
||||
|
||||
|
|
@ -779,6 +807,7 @@ impl EnhancedOcrService {
|
|||
processing_time_ms: processing_time,
|
||||
word_count,
|
||||
preprocessing_applied: vec!["Plain text read".to_string()],
|
||||
processed_image_path: None, // No image processing for plain text
|
||||
})
|
||||
}
|
||||
_ => Err(anyhow::anyhow!("Unsupported file type: {}", mime_type)),
|
||||
|
|
|
|||
|
|
@ -1,8 +1,9 @@
|
|||
use anyhow::Result;
|
||||
use chrono::Utc;
|
||||
use std::path::Path;
|
||||
use std::path::{Path, PathBuf};
|
||||
use tokio::fs;
|
||||
use uuid::Uuid;
|
||||
use tracing::{info, warn, error};
|
||||
|
||||
use crate::models::Document;
|
||||
|
||||
|
|
@ -19,6 +20,106 @@ impl FileService {
|
|||
Self { upload_path }
|
||||
}
|
||||
|
||||
/// Initialize the upload directory structure
|
||||
pub async fn initialize_directory_structure(&self) -> Result<()> {
|
||||
let base_path = Path::new(&self.upload_path);
|
||||
|
||||
// Create subdirectories for organized file storage
|
||||
let directories = [
|
||||
"documents", // Final uploaded documents
|
||||
"thumbnails", // Document thumbnails
|
||||
"processed_images", // OCR processed images for review
|
||||
"temp", // Temporary files during processing
|
||||
"backups", // Document backups
|
||||
];
|
||||
|
||||
for dir in directories.iter() {
|
||||
let dir_path = base_path.join(dir);
|
||||
if let Err(e) = fs::create_dir_all(&dir_path).await {
|
||||
error!("Failed to create directory {:?}: {}", dir_path, e);
|
||||
return Err(anyhow::anyhow!("Failed to create directory structure: {}", e));
|
||||
}
|
||||
info!("Ensured directory exists: {:?}", dir_path);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get the path for a specific subdirectory
|
||||
pub fn get_subdirectory_path(&self, subdir: &str) -> PathBuf {
|
||||
Path::new(&self.upload_path).join(subdir)
|
||||
}
|
||||
|
||||
/// Get the documents directory path
|
||||
pub fn get_documents_path(&self) -> PathBuf {
|
||||
self.get_subdirectory_path("documents")
|
||||
}
|
||||
|
||||
/// Get the thumbnails directory path
|
||||
pub fn get_thumbnails_path(&self) -> PathBuf {
|
||||
self.get_subdirectory_path("thumbnails")
|
||||
}
|
||||
|
||||
/// Get the processed images directory path
|
||||
pub fn get_processed_images_path(&self) -> PathBuf {
|
||||
self.get_subdirectory_path("processed_images")
|
||||
}
|
||||
|
||||
/// Get the temp directory path
|
||||
pub fn get_temp_path(&self) -> PathBuf {
|
||||
self.get_subdirectory_path("temp")
|
||||
}
|
||||
|
||||
/// Migrate existing files from the root upload directory to the structured format
|
||||
pub async fn migrate_existing_files(&self) -> Result<()> {
|
||||
let base_path = Path::new(&self.upload_path);
|
||||
let documents_dir = self.get_documents_path();
|
||||
let thumbnails_dir = self.get_thumbnails_path();
|
||||
|
||||
info!("Starting migration of existing files to structured directories...");
|
||||
let mut migrated_count = 0;
|
||||
let mut thumbnail_count = 0;
|
||||
|
||||
// Read all files in the base upload directory
|
||||
let mut entries = fs::read_dir(base_path).await?;
|
||||
|
||||
while let Some(entry) = entries.next_entry().await? {
|
||||
let file_path = entry.path();
|
||||
|
||||
// Skip directories and already structured subdirectories
|
||||
if file_path.is_dir() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(filename) = file_path.file_name().and_then(|n| n.to_str()) {
|
||||
// Handle thumbnail files
|
||||
if filename.ends_with("_thumb.jpg") {
|
||||
let new_path = thumbnails_dir.join(filename);
|
||||
if let Err(e) = fs::rename(&file_path, &new_path).await {
|
||||
warn!("Failed to migrate thumbnail {}: {}", filename, e);
|
||||
} else {
|
||||
thumbnail_count += 1;
|
||||
info!("Migrated thumbnail: {} -> {:?}", filename, new_path);
|
||||
}
|
||||
}
|
||||
// Handle regular document files
|
||||
else {
|
||||
let new_path = documents_dir.join(filename);
|
||||
if let Err(e) = fs::rename(&file_path, &new_path).await {
|
||||
warn!("Failed to migrate document {}: {}", filename, e);
|
||||
} else {
|
||||
migrated_count += 1;
|
||||
info!("Migrated document: {} -> {:?}", filename, new_path);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
info!("Migration completed: {} documents, {} thumbnails moved to structured directories",
|
||||
migrated_count, thumbnail_count);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn save_file(&self, filename: &str, data: &[u8]) -> Result<String> {
|
||||
let file_id = Uuid::new_v4();
|
||||
let extension = Path::new(filename)
|
||||
|
|
@ -32,10 +133,14 @@ impl FileService {
|
|||
format!("{}.{}", file_id, extension)
|
||||
};
|
||||
|
||||
let file_path = Path::new(&self.upload_path).join(&saved_filename);
|
||||
// Save to documents subdirectory
|
||||
let documents_dir = self.get_documents_path();
|
||||
let file_path = documents_dir.join(&saved_filename);
|
||||
|
||||
if let Some(parent) = file_path.parent() {
|
||||
fs::create_dir_all(parent).await?;
|
||||
// Ensure the documents directory exists
|
||||
if let Err(e) = fs::create_dir_all(&documents_dir).await {
|
||||
error!("Failed to create documents directory: {}", e);
|
||||
return Err(anyhow::anyhow!("Failed to create documents directory: {}", e));
|
||||
}
|
||||
|
||||
fs::write(&file_path, data).await?;
|
||||
|
|
@ -86,17 +191,50 @@ impl FileService {
|
|||
}
|
||||
}
|
||||
|
||||
/// Resolve file path to actual location, handling both old and new directory structures
|
||||
pub async fn resolve_file_path(&self, file_path: &str) -> Result<String> {
|
||||
// If the file exists at the given path, use it
|
||||
if Path::new(file_path).exists() {
|
||||
return Ok(file_path.to_string());
|
||||
}
|
||||
|
||||
// Try to find the file in the new structured directory
|
||||
if file_path.starts_with("./uploads/") && !file_path.contains("/documents/") {
|
||||
let new_path = file_path.replace("./uploads/", "./uploads/documents/");
|
||||
if Path::new(&new_path).exists() {
|
||||
info!("Found file in new structured directory: {} -> {}", file_path, new_path);
|
||||
return Ok(new_path);
|
||||
}
|
||||
}
|
||||
|
||||
// Try without the ./ prefix
|
||||
if file_path.starts_with("uploads/") && !file_path.contains("/documents/") {
|
||||
let new_path = file_path.replace("uploads/", "uploads/documents/");
|
||||
if Path::new(&new_path).exists() {
|
||||
info!("Found file in new structured directory: {} -> {}", file_path, new_path);
|
||||
return Ok(new_path);
|
||||
}
|
||||
}
|
||||
|
||||
// File not found in any expected location
|
||||
Err(anyhow::anyhow!("File not found: {} (checked original path and structured directory)", file_path))
|
||||
}
|
||||
|
||||
pub async fn read_file(&self, file_path: &str) -> Result<Vec<u8>> {
|
||||
let data = fs::read(file_path).await?;
|
||||
let resolved_path = self.resolve_file_path(file_path).await?;
|
||||
let data = fs::read(&resolved_path).await?;
|
||||
Ok(data)
|
||||
}
|
||||
|
||||
#[cfg(feature = "ocr")]
|
||||
pub async fn get_or_generate_thumbnail(&self, file_path: &str, filename: &str) -> Result<Vec<u8>> {
|
||||
// Create thumbnails directory if it doesn't exist
|
||||
let thumbnails_dir = Path::new(&self.upload_path).join("thumbnails");
|
||||
// Use the structured thumbnails directory
|
||||
let thumbnails_dir = self.get_thumbnails_path();
|
||||
if !thumbnails_dir.exists() {
|
||||
fs::create_dir_all(&thumbnails_dir).await?;
|
||||
if let Err(e) = fs::create_dir_all(&thumbnails_dir).await {
|
||||
error!("Failed to create thumbnails directory: {}", e);
|
||||
return Err(anyhow::anyhow!("Failed to create thumbnails directory: {}", e));
|
||||
}
|
||||
}
|
||||
|
||||
// Generate thumbnail filename based on original file path
|
||||
|
|
@ -111,8 +249,9 @@ impl FileService {
|
|||
return self.read_file(&thumbnail_path.to_string_lossy()).await;
|
||||
}
|
||||
|
||||
// Generate thumbnail
|
||||
let thumbnail_data = self.generate_thumbnail(file_path, filename).await?;
|
||||
// Resolve file path and generate thumbnail
|
||||
let resolved_path = self.resolve_file_path(file_path).await?;
|
||||
let thumbnail_data = self.generate_thumbnail(&resolved_path, filename).await?;
|
||||
|
||||
// Save thumbnail to cache
|
||||
fs::write(&thumbnail_path, &thumbnail_data).await?;
|
||||
|
|
|
|||
17
src/main.rs
17
src/main.rs
|
|
@ -20,6 +20,23 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
tracing_subscriber::fmt::init();
|
||||
|
||||
let config = Config::from_env()?;
|
||||
|
||||
// Initialize upload directory structure
|
||||
info!("Initializing upload directory structure...");
|
||||
let file_service = readur::file_service::FileService::new(config.upload_path.clone());
|
||||
if let Err(e) = file_service.initialize_directory_structure().await {
|
||||
error!("Failed to initialize directory structure: {}", e);
|
||||
return Err(e.into());
|
||||
}
|
||||
info!("✅ Upload directory structure initialized");
|
||||
|
||||
// Migrate existing files to new structure (one-time operation)
|
||||
info!("Migrating existing files to structured directories...");
|
||||
if let Err(e) = file_service.migrate_existing_files().await {
|
||||
warn!("Failed to migrate some existing files: {}", e);
|
||||
// Don't fail startup for migration issues
|
||||
}
|
||||
|
||||
// Create separate database pools for different workloads
|
||||
let web_db = Database::new_with_pool_config(&config.database_url, 20, 2).await?; // Web UI pool
|
||||
let background_db = Database::new_with_pool_config(&config.database_url, 30, 3).await?; // Background operations pool
|
||||
|
|
|
|||
119
src/ocr_queue.rs
119
src/ocr_queue.rs
|
|
@ -363,6 +363,31 @@ impl OcrQueueService {
|
|||
}
|
||||
}
|
||||
|
||||
// Save processed image if setting is enabled and image was processed
|
||||
if settings.save_processed_images {
|
||||
if let Some(ref processed_image_path) = ocr_result.processed_image_path {
|
||||
match self.save_processed_image_for_review(
|
||||
item.document_id,
|
||||
user_id.unwrap_or_default(),
|
||||
&file_path,
|
||||
processed_image_path,
|
||||
&ocr_result.preprocessing_applied,
|
||||
).await {
|
||||
Ok(_) => {
|
||||
info!("✅ Saved processed image for document {} for review", item.document_id);
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Failed to save processed image for document {}: {}", item.document_id, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up temporary processed image file if it exists
|
||||
if let Some(ref temp_path) = ocr_result.processed_image_path {
|
||||
let _ = tokio::fs::remove_file(temp_path).await;
|
||||
}
|
||||
|
||||
let processing_time_ms = start_time.elapsed().as_millis() as i32;
|
||||
self.mark_completed(item.id, processing_time_ms).await?;
|
||||
|
||||
|
|
@ -481,6 +506,100 @@ impl OcrQueueService {
|
|||
}
|
||||
}
|
||||
|
||||
/// Save processed image for review when the setting is enabled
|
||||
async fn save_processed_image_for_review(
|
||||
&self,
|
||||
document_id: Uuid,
|
||||
user_id: Uuid,
|
||||
original_image_path: &str,
|
||||
processed_image_path: &str,
|
||||
processing_steps: &[String],
|
||||
) -> Result<()> {
|
||||
use std::path::Path;
|
||||
|
||||
// Use the FileService to get the proper processed images directory
|
||||
use crate::file_service::FileService;
|
||||
let base_upload_dir = std::env::var("UPLOAD_PATH").unwrap_or_else(|_| "uploads".to_string());
|
||||
let file_service = FileService::new(base_upload_dir);
|
||||
let processed_images_dir = file_service.get_processed_images_path();
|
||||
|
||||
// Ensure the directory exists with proper error handling
|
||||
if let Err(e) = tokio::fs::create_dir_all(&processed_images_dir).await {
|
||||
error!("Failed to create processed images directory {:?}: {}", processed_images_dir, e);
|
||||
return Err(anyhow::anyhow!("Failed to create processed images directory: {}", e));
|
||||
}
|
||||
|
||||
info!("Ensured processed images directory exists: {:?}", processed_images_dir);
|
||||
|
||||
// Generate a unique filename for the processed image
|
||||
let file_stem = Path::new(processed_image_path)
|
||||
.file_stem()
|
||||
.and_then(|s| s.to_str())
|
||||
.unwrap_or("processed");
|
||||
let extension = Path::new(processed_image_path)
|
||||
.extension()
|
||||
.and_then(|s| s.to_str())
|
||||
.unwrap_or("jpg");
|
||||
|
||||
let permanent_filename = format!("{}_processed_{}.{}", document_id, chrono::Utc::now().timestamp(), extension);
|
||||
let permanent_path = processed_images_dir.join(&permanent_filename);
|
||||
|
||||
// Verify source file exists before copying
|
||||
if !Path::new(processed_image_path).exists() {
|
||||
return Err(anyhow::anyhow!("Source processed image file does not exist: {}", processed_image_path));
|
||||
}
|
||||
|
||||
// Copy the processed image to permanent location with error handling
|
||||
if let Err(e) = tokio::fs::copy(processed_image_path, &permanent_path).await {
|
||||
error!("Failed to copy processed image from {} to {:?}: {}", processed_image_path, permanent_path, e);
|
||||
return Err(anyhow::anyhow!("Failed to copy processed image: {}", e));
|
||||
}
|
||||
|
||||
info!("Successfully copied processed image to: {:?}", permanent_path);
|
||||
|
||||
// Save to database
|
||||
let processing_parameters = serde_json::json!({
|
||||
"steps": processing_steps,
|
||||
"timestamp": chrono::Utc::now(),
|
||||
"original_path": original_image_path,
|
||||
});
|
||||
|
||||
// Save metadata to database with error handling
|
||||
if let Err(e) = sqlx::query(
|
||||
r#"
|
||||
INSERT INTO processed_images (document_id, user_id, original_image_path, processed_image_path, processing_parameters, processing_steps, created_at)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, NOW())
|
||||
ON CONFLICT (document_id)
|
||||
DO UPDATE SET
|
||||
processed_image_path = EXCLUDED.processed_image_path,
|
||||
processing_parameters = EXCLUDED.processing_parameters,
|
||||
processing_steps = EXCLUDED.processing_steps,
|
||||
created_at = NOW()
|
||||
"#
|
||||
)
|
||||
.bind(document_id)
|
||||
.bind(user_id)
|
||||
.bind(original_image_path)
|
||||
.bind(permanent_path.to_string_lossy().as_ref())
|
||||
.bind(&processing_parameters)
|
||||
.bind(processing_steps)
|
||||
.execute(&self.pool)
|
||||
.await {
|
||||
error!("Failed to save processed image metadata to database for document {}: {}", document_id, e);
|
||||
|
||||
// Clean up the copied file if database save fails
|
||||
if let Err(cleanup_err) = tokio::fs::remove_file(&permanent_path).await {
|
||||
warn!("Failed to clean up processed image file after database error: {}", cleanup_err);
|
||||
}
|
||||
|
||||
return Err(anyhow::anyhow!("Failed to save processed image metadata: {}", e));
|
||||
}
|
||||
|
||||
info!("Successfully saved processed image metadata for document {} to database", document_id);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get queue statistics
|
||||
pub async fn get_stats(&self) -> Result<QueueStats> {
|
||||
let stats = sqlx::query(
|
||||
|
|
|
|||
Loading…
Reference in New Issue