561 lines
22 KiB
Rust
561 lines
22 KiB
Rust
use anyhow::Result;
|
|
use chrono::Utc;
|
|
use std::path::{Path, PathBuf};
|
|
use tokio::fs;
|
|
use uuid::Uuid;
|
|
use tracing::{info, warn, error};
|
|
|
|
use crate::models::Document;
|
|
|
|
#[cfg(feature = "ocr")]
|
|
use image::{DynamicImage, ImageFormat, imageops::FilterType};
|
|
|
|
/// Filesystem-backed storage service for uploaded documents and the files
/// derived from them (thumbnails, processed images, temporary files, backups).
///
/// Every path the service hands out is rooted at `upload_path`. The type is
/// cheap to clone: it only carries the root path.
#[derive(Debug, Clone)]
pub struct FileService {
    /// Root directory under which all storage subdirectories live.
    upload_path: String,
}
|
|
|
|
impl FileService {
|
|
pub fn new(upload_path: String) -> Self {
|
|
Self { upload_path }
|
|
}
|
|
|
|
/// Initialize the upload directory structure
|
|
pub async fn initialize_directory_structure(&self) -> Result<()> {
|
|
let base_path = Path::new(&self.upload_path);
|
|
|
|
// Create subdirectories for organized file storage
|
|
let directories = [
|
|
"documents", // Final uploaded documents
|
|
"thumbnails", // Document thumbnails
|
|
"processed_images", // OCR processed images for review
|
|
"temp", // Temporary files during processing
|
|
"backups", // Document backups
|
|
];
|
|
|
|
for dir in directories.iter() {
|
|
let dir_path = base_path.join(dir);
|
|
if let Err(e) = fs::create_dir_all(&dir_path).await {
|
|
error!("Failed to create directory {:?}: {}", dir_path, e);
|
|
return Err(anyhow::anyhow!("Failed to create directory structure: {}", e));
|
|
}
|
|
info!("Ensured directory exists: {:?}", dir_path);
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Get the path for a specific subdirectory
|
|
pub fn get_subdirectory_path(&self, subdir: &str) -> PathBuf {
|
|
Path::new(&self.upload_path).join(subdir)
|
|
}
|
|
|
|
/// Get the documents directory path
|
|
pub fn get_documents_path(&self) -> PathBuf {
|
|
self.get_subdirectory_path("documents")
|
|
}
|
|
|
|
/// Get the thumbnails directory path
|
|
pub fn get_thumbnails_path(&self) -> PathBuf {
|
|
self.get_subdirectory_path("thumbnails")
|
|
}
|
|
|
|
/// Get the processed images directory path
|
|
pub fn get_processed_images_path(&self) -> PathBuf {
|
|
self.get_subdirectory_path("processed_images")
|
|
}
|
|
|
|
/// Get the temp directory path
|
|
pub fn get_temp_path(&self) -> PathBuf {
|
|
self.get_subdirectory_path("temp")
|
|
}
|
|
|
|
/// Migrate existing files from the root upload directory to the structured format
|
|
pub async fn migrate_existing_files(&self) -> Result<()> {
|
|
let base_path = Path::new(&self.upload_path);
|
|
let documents_dir = self.get_documents_path();
|
|
let thumbnails_dir = self.get_thumbnails_path();
|
|
|
|
info!("Starting migration of existing files to structured directories...");
|
|
let mut migrated_count = 0;
|
|
let mut thumbnail_count = 0;
|
|
|
|
// Read all files in the base upload directory
|
|
let mut entries = fs::read_dir(base_path).await?;
|
|
|
|
while let Some(entry) = entries.next_entry().await? {
|
|
let file_path = entry.path();
|
|
|
|
// Skip directories and already structured subdirectories
|
|
if file_path.is_dir() {
|
|
continue;
|
|
}
|
|
|
|
if let Some(filename) = file_path.file_name().and_then(|n| n.to_str()) {
|
|
// Handle thumbnail files
|
|
if filename.ends_with("_thumb.jpg") {
|
|
let new_path = thumbnails_dir.join(filename);
|
|
if let Err(e) = fs::rename(&file_path, &new_path).await {
|
|
warn!("Failed to migrate thumbnail {}: {}", filename, e);
|
|
} else {
|
|
thumbnail_count += 1;
|
|
info!("Migrated thumbnail: {} -> {:?}", filename, new_path);
|
|
}
|
|
}
|
|
// Handle regular document files
|
|
else {
|
|
let new_path = documents_dir.join(filename);
|
|
if let Err(e) = fs::rename(&file_path, &new_path).await {
|
|
warn!("Failed to migrate document {}: {}", filename, e);
|
|
} else {
|
|
migrated_count += 1;
|
|
info!("Migrated document: {} -> {:?}", filename, new_path);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
info!("Migration completed: {} documents, {} thumbnails moved to structured directories",
|
|
migrated_count, thumbnail_count);
|
|
Ok(())
|
|
}
|
|
|
|
pub async fn save_file(&self, filename: &str, data: &[u8]) -> Result<String> {
|
|
let file_id = Uuid::new_v4();
|
|
let extension = Path::new(filename)
|
|
.extension()
|
|
.and_then(|ext| ext.to_str())
|
|
.unwrap_or("");
|
|
|
|
let saved_filename = if extension.is_empty() {
|
|
file_id.to_string()
|
|
} else {
|
|
format!("{}.{}", file_id, extension)
|
|
};
|
|
|
|
// Save to documents subdirectory
|
|
let documents_dir = self.get_documents_path();
|
|
let file_path = documents_dir.join(&saved_filename);
|
|
|
|
// Ensure the documents directory exists
|
|
if let Err(e) = fs::create_dir_all(&documents_dir).await {
|
|
error!("Failed to create documents directory: {}", e);
|
|
return Err(anyhow::anyhow!("Failed to create documents directory: {}", e));
|
|
}
|
|
|
|
fs::write(&file_path, data).await?;
|
|
|
|
Ok(file_path.to_string_lossy().to_string())
|
|
}
|
|
|
|
pub fn create_document(
|
|
&self,
|
|
filename: &str,
|
|
original_filename: &str,
|
|
file_path: &str,
|
|
file_size: i64,
|
|
mime_type: &str,
|
|
user_id: Uuid,
|
|
file_hash: Option<String>,
|
|
original_created_at: Option<chrono::DateTime<chrono::Utc>>,
|
|
original_modified_at: Option<chrono::DateTime<chrono::Utc>>,
|
|
source_metadata: Option<serde_json::Value>,
|
|
) -> Document {
|
|
Document {
|
|
id: Uuid::new_v4(),
|
|
filename: filename.to_string(),
|
|
original_filename: original_filename.to_string(),
|
|
file_path: file_path.to_string(),
|
|
file_size,
|
|
mime_type: mime_type.to_string(),
|
|
content: None,
|
|
ocr_text: None,
|
|
ocr_confidence: None,
|
|
ocr_word_count: None,
|
|
ocr_processing_time_ms: None,
|
|
ocr_status: Some("pending".to_string()),
|
|
ocr_error: None,
|
|
ocr_completed_at: None,
|
|
ocr_retry_count: None,
|
|
ocr_failure_reason: None,
|
|
tags: Vec::new(),
|
|
created_at: Utc::now(),
|
|
updated_at: Utc::now(),
|
|
user_id,
|
|
file_hash,
|
|
original_created_at,
|
|
original_modified_at,
|
|
source_metadata,
|
|
}
|
|
}
|
|
|
|
pub fn is_allowed_file_type(&self, filename: &str, allowed_types: &[String]) -> bool {
|
|
if let Some(extension) = Path::new(filename)
|
|
.extension()
|
|
.and_then(|ext| ext.to_str())
|
|
{
|
|
let ext_lower = extension.to_lowercase();
|
|
allowed_types.contains(&ext_lower)
|
|
} else {
|
|
false
|
|
}
|
|
}
|
|
|
|
/// Resolve file path to actual location, handling both old and new directory structures
|
|
pub async fn resolve_file_path(&self, file_path: &str) -> Result<String> {
|
|
// If the file exists at the given path, use it
|
|
if Path::new(file_path).exists() {
|
|
return Ok(file_path.to_string());
|
|
}
|
|
|
|
// Try to find the file in the new structured directory
|
|
if file_path.starts_with("./uploads/") && !file_path.contains("/documents/") {
|
|
let new_path = file_path.replace("./uploads/", "./uploads/documents/");
|
|
if Path::new(&new_path).exists() {
|
|
info!("Found file in new structured directory: {} -> {}", file_path, new_path);
|
|
return Ok(new_path);
|
|
}
|
|
}
|
|
|
|
// Try without the ./ prefix
|
|
if file_path.starts_with("uploads/") && !file_path.contains("/documents/") {
|
|
let new_path = file_path.replace("uploads/", "uploads/documents/");
|
|
if Path::new(&new_path).exists() {
|
|
info!("Found file in new structured directory: {} -> {}", file_path, new_path);
|
|
return Ok(new_path);
|
|
}
|
|
}
|
|
|
|
// File not found in any expected location
|
|
Err(anyhow::anyhow!("File not found: {} (checked original path and structured directory)", file_path))
|
|
}
|
|
|
|
pub async fn read_file(&self, file_path: &str) -> Result<Vec<u8>> {
|
|
let resolved_path = self.resolve_file_path(file_path).await?;
|
|
let data = fs::read(&resolved_path).await?;
|
|
Ok(data)
|
|
}
|
|
|
|
/// Return the JPEG thumbnail for a stored file, generating and caching it on
/// first use.
///
/// The cache entry lives in the thumbnails directory and is named
/// `<file stem of file_path>_thumb.jpg` (`unknown_thumb.jpg` when the path has
/// no UTF-8 stem). `filename` is only used to pick the thumbnail generator by
/// extension.
///
/// # Errors
/// Fails when the thumbnails directory cannot be created, the source file
/// cannot be resolved, generation fails, or the cache file cannot be written.
#[cfg(feature = "ocr")]
pub async fn get_or_generate_thumbnail(&self, file_path: &str, filename: &str) -> Result<Vec<u8>> {
    let thumbnails_dir = self.get_thumbnails_path();
    if !thumbnails_dir.exists() {
        fs::create_dir_all(&thumbnails_dir).await.map_err(|e| {
            error!("Failed to create thumbnails directory: {}", e);
            anyhow::anyhow!("Failed to create thumbnails directory: {}", e)
        })?;
    }

    // The cache key is derived from the stored file's name, not from `filename`.
    let stem = Path::new(file_path)
        .file_stem()
        .and_then(std::ffi::OsStr::to_str)
        .unwrap_or("unknown");
    let thumbnail_path = thumbnails_dir.join(format!("{}_thumb.jpg", stem));

    // Cache hit: serve the stored thumbnail.
    if thumbnail_path.exists() {
        let cached = thumbnail_path.to_string_lossy();
        return self.read_file(&cached).await;
    }

    // Cache miss: render, store, then return the fresh bytes.
    let resolved_path = self.resolve_file_path(file_path).await?;
    let thumbnail_data = self.generate_thumbnail(&resolved_path, filename).await?;
    fs::write(&thumbnail_path, &thumbnail_data).await?;

    Ok(thumbnail_data)
}
|
|
|
|
/// Render a thumbnail for the file at `file_path`, choosing the generator
/// from the (case-insensitive) extension of `filename`.
///
/// Images, PDFs and plain text get a rendered preview; Word documents and
/// every other type get a flat placeholder. The file is read up front
/// regardless of which generator ends up being used.
#[cfg(feature = "ocr")]
async fn generate_thumbnail(&self, file_path: &str, filename: &str) -> Result<Vec<u8>> {
    let file_data = self.read_file(file_path).await?;

    let extension = Path::new(filename)
        .extension()
        .and_then(|ext| ext.to_str())
        .map(str::to_lowercase)
        .unwrap_or_default();

    match extension.as_str() {
        "pdf" => self.generate_pdf_thumbnail(&file_data).await,
        "txt" => self.generate_text_thumbnail(&file_data).await,
        "doc" | "docx" => self.generate_placeholder_thumbnail("DOC").await,
        "jpg" | "jpeg" | "png" | "bmp" | "tiff" | "gif" => {
            self.generate_image_thumbnail(&file_data).await
        }
        // Anything else is labelled with its own upper-cased extension.
        other => self.generate_placeholder_thumbnail(&other.to_uppercase()).await,
    }
}
|
|
|
|
/// Decode an image and return it as a JPEG thumbnail that fits within
/// 200x200 pixels, preserving the aspect ratio.
///
/// # Errors
/// Fails when the bytes cannot be decoded as an image or JPEG encoding fails.
#[cfg(feature = "ocr")]
async fn generate_image_thumbnail(&self, file_data: &[u8]) -> Result<Vec<u8>> {
    let img = image::load_from_memory(file_data)?;
    let thumbnail = img.resize(200, 200, FilterType::Lanczos3);

    // JPEG can only carry 8-bit grayscale or 8-bit RGB. Everything else
    // (alpha channels, 16-bit or float samples) is converted to 8-bit RGB.
    // Note that `to_rgb8` simply drops the alpha channel; it does not
    // composite against a background colour.
    let encodable = match thumbnail {
        DynamicImage::ImageLuma8(_) | DynamicImage::ImageRgb8(_) => thumbnail,
        _ => DynamicImage::ImageRgb8(thumbnail.to_rgb8()),
    };

    let mut buffer = Vec::new();
    let mut cursor = std::io::Cursor::new(&mut buffer);
    encodable.write_to(&mut cursor, ImageFormat::Jpeg)?;

    Ok(buffer)
}
|
|
|
|
/// Render the first page of a PDF as a JPEG thumbnail using the external
/// `pdftoppm` tool.
///
/// The PDF bytes are written to a uniquely named file in the system temp
/// directory, converted to PNG by `pdftoppm`, and re-encoded as JPEG. Both
/// temporary files are removed afterwards. If any step fails (tool missing,
/// conversion error, unreadable output) the reason is logged and a "PDF"
/// placeholder thumbnail is returned instead, so this only errors when even
/// the placeholder cannot be produced.
#[cfg(feature = "ocr")]
async fn generate_pdf_thumbnail(&self, file_data: &[u8]) -> Result<Vec<u8>> {
    use std::process::Command;
    use tokio::fs;
    use uuid::Uuid;

    // Unique names so concurrent conversions never collide.
    let temp_id = Uuid::new_v4();
    let temp_dir = std::env::temp_dir();
    let temp_pdf_path = temp_dir.join(format!("pdf_thumb_{}.pdf", temp_id));
    let output_prefix = temp_dir.join(format!("pdf_thumb_{}", temp_id));
    // With `-singlefile`, pdftoppm writes exactly `<prefix>.png`.
    let temp_png_path = temp_dir.join(format!("pdf_thumb_{}.png", temp_id));

    if let Err(e) = fs::write(&temp_pdf_path, file_data).await {
        error!("Failed to write temporary PDF file: {}", e);
        // A partially written file may have been left behind.
        let _ = fs::remove_file(&temp_pdf_path).await;
        return self.generate_placeholder_thumbnail("PDF").await;
    }

    // Run the external tool on the blocking pool: `Command::output` waits for
    // the child process and must not stall the async executor.
    let conversion = {
        let pdf_arg = temp_pdf_path.clone();
        let prefix_arg = output_prefix.clone();
        tokio::task::spawn_blocking(move || {
            Command::new("pdftoppm")
                .arg("-f").arg("1") // First page only
                .arg("-l").arg("1") // Last page (same as first)
                // Without this flag pdftoppm appends a page-number suffix that
                // is zero-padded according to the page count ("-1", "-01",
                // "-001", ...), which makes the output name unpredictable.
                .arg("-singlefile")
                .arg("-scale-to").arg("200") // Fit the page in a 200x200 pixel box
                .arg("-png") // Output as PNG
                .arg(&pdf_arg)
                .arg(&prefix_arg) // Output prefix
                .output()
        })
        .await
    };

    // Clean up temporary PDF file
    let _ = fs::remove_file(&temp_pdf_path).await;

    let png_data = match conversion {
        Ok(Ok(result)) if result.status.success() => match fs::read(&temp_png_path).await {
            Ok(bytes) => Some(bytes),
            Err(e) => {
                warn!("pdftoppm succeeded but its output could not be read: {}", e);
                None
            }
        },
        Ok(Ok(result)) => {
            warn!(
                "pdftoppm exited with {}: {}",
                result.status,
                String::from_utf8_lossy(&result.stderr).trim()
            );
            None
        }
        Ok(Err(e)) => {
            warn!("Failed to run pdftoppm: {}", e);
            None
        }
        Err(e) => {
            warn!("pdftoppm task did not complete: {}", e);
            None
        }
    };

    // Clean up temporary PNG file (it may not exist on the failure paths).
    let _ = fs::remove_file(&temp_png_path).await;

    // Convert PNG to a JPEG thumbnail; any failure falls through to `None`.
    let jpeg = png_data.and_then(|png| {
        let img = image::load_from_memory(&png).ok()?;
        // Resize to fit 200x200 (aspect ratio preserved) and force 8-bit RGB,
        // which JPEG can always encode.
        let resized = img.resize(200, 200, image::imageops::FilterType::Lanczos3);
        let thumbnail = DynamicImage::ImageRgb8(resized.to_rgb8());

        let mut buffer = Vec::new();
        thumbnail
            .write_to(&mut std::io::Cursor::new(&mut buffer), ImageFormat::Jpeg)
            .ok()?;
        Some(buffer)
    });

    match jpeg {
        Some(buffer) => Ok(buffer),
        None => self.generate_placeholder_thumbnail("PDF").await,
    }
}
|
|
|
|
/// Build a thumbnail for a plain-text file.
///
/// The bytes are decoded as UTF-8 (invalid sequences are replaced) and
/// rendered on a green background with the label "TXT".
#[cfg(feature = "ocr")]
async fn generate_text_thumbnail(&self, file_data: &[u8]) -> Result<Vec<u8>> {
    let background = image::Rgb([34, 139, 34]);
    let text = String::from_utf8_lossy(file_data);
    self.generate_text_based_thumbnail(&text, "TXT", background).await
}
|
|
|
|
/// Draw a 200x200 JPEG that hints at the layout of `text`.
///
/// The canvas is filled with `bg_color`. The preview consists of `file_type`
/// on the first line followed by at most the first 300 bytes of `text`
/// (trimmed). No glyphs are rendered: each character of the first 15 lines
/// (25 characters per line) is marked by a two-pixel-wide white dash.
///
/// # Errors
/// Fails only when JPEG encoding fails.
#[cfg(feature = "ocr")]
async fn generate_text_based_thumbnail(&self, text: &str, file_type: &str, bg_color: image::Rgb<u8>) -> Result<Vec<u8>> {
    use image::{RgbImage, Rgb, DynamicImage, ImageFormat};

    const WIDTH: u32 = 200;
    const HEIGHT: u32 = 200;
    const MAX_PREVIEW_BYTES: usize = 300;

    // Canvas pre-filled with the background colour.
    let mut img = RgbImage::from_pixel(WIDTH, HEIGHT, bg_color);
    let text_color = Rgb([255, 255, 255]); // White text

    // Cut the preview at a character boundary: slicing a `str` in the middle
    // of a multi-byte UTF-8 character panics.
    let mut end = text.len().min(MAX_PREVIEW_BYTES);
    while !text.is_char_boundary(end) {
        end -= 1; // terminates: index 0 is always a boundary
    }
    let preview_text = format!("{}\n{}", file_type, text[..end].trim());

    // Simple text rendering - just place marks as visual indicators.
    // For a more sophisticated approach, you'd use a text rendering library.
    for (row, line) in (0u32..).zip(preview_text.lines().take(15)) {
        let y = 20 + row * 12;
        if y >= HEIGHT - 10 {
            break;
        }

        for (col, _) in (0u32..).zip(line.chars().take(25)) {
            let x = 10 + col * 7;
            if x >= WIDTH - 10 {
                break;
            }

            // One mark per character, two pixels wide for some thickness.
            for dx in 0..2 {
                if let Some(pixel) = img.get_pixel_mut_checked(x + dx, y) {
                    *pixel = text_color;
                }
            }
        }
    }

    let dynamic_img = DynamicImage::ImageRgb8(img);
    let mut buffer = Vec::new();
    let mut cursor = std::io::Cursor::new(&mut buffer);
    dynamic_img.write_to(&mut cursor, ImageFormat::Jpeg)?;

    Ok(buffer)
}
|
|
|
|
/// Produce a flat 200x200 JPEG whose colour encodes `file_type`.
///
/// `"PDF"` is red, `"TXT"` green, `"DOC"`/`"DOCX"` blue and anything else
/// gray. The match is case-sensitive; no text is drawn on the image.
#[cfg(feature = "ocr")]
async fn generate_placeholder_thumbnail(&self, file_type: &str) -> Result<Vec<u8>> {
    use image::{RgbImage, Rgb};

    // Colour code per file type
    let color = match file_type {
        "PDF" => Rgb([220, 38, 27]),           // Red for PDF
        "TXT" => Rgb([34, 139, 34]),           // Green for text
        "DOC" | "DOCX" => Rgb([41, 128, 185]), // Blue for Word docs
        _ => Rgb([108, 117, 125]),             // Gray for unknown
    };

    // Paint the whole canvas with that colour.
    let mut canvas = RgbImage::new(200, 200);
    canvas.pixels_mut().for_each(|pixel| *pixel = color);

    let mut buffer = Vec::new();
    DynamicImage::ImageRgb8(canvas)
        .write_to(&mut std::io::Cursor::new(&mut buffer), ImageFormat::Jpeg)?;

    Ok(buffer)
}
|
|
|
|
#[cfg(not(feature = "ocr"))]
|
|
pub async fn get_or_generate_thumbnail(&self, _file_path: &str, _filename: &str) -> Result<Vec<u8>> {
|
|
anyhow::bail!("Thumbnail generation requires OCR feature")
|
|
}
|
|
|
|
pub async fn delete_document_files(&self, document: &Document) -> Result<()> {
|
|
let mut deleted_files = Vec::new();
|
|
let mut serious_errors = Vec::new();
|
|
|
|
// Helper function to safely delete a file, handling concurrent deletion scenarios
|
|
async fn safe_delete(path: &Path, serious_errors: &mut Vec<String>) -> Option<String> {
|
|
match fs::remove_file(path).await {
|
|
Ok(_) => {
|
|
info!("Deleted file: {}", path.display());
|
|
Some(path.to_string_lossy().to_string())
|
|
}
|
|
Err(e) => {
|
|
match e.kind() {
|
|
std::io::ErrorKind::NotFound => {
|
|
// File already deleted (possibly by concurrent request) - this is fine
|
|
info!("File already deleted: {}", path.display());
|
|
None
|
|
}
|
|
_ => {
|
|
// Other errors (permissions, I/O errors, etc.) are serious
|
|
warn!("Failed to delete file {}: {}", path.display(), e);
|
|
serious_errors.push(format!("Failed to delete file {}: {}", path.display(), e));
|
|
None
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Delete main document file
|
|
let main_file = Path::new(&document.file_path);
|
|
if let Some(deleted_path) = safe_delete(&main_file, &mut serious_errors).await {
|
|
deleted_files.push(deleted_path);
|
|
}
|
|
|
|
// Delete thumbnail if it exists
|
|
let thumbnail_filename = format!("{}_thumb.jpg", document.id);
|
|
let thumbnail_path = self.get_thumbnails_path().join(&thumbnail_filename);
|
|
if let Some(deleted_path) = safe_delete(&thumbnail_path, &mut serious_errors).await {
|
|
deleted_files.push(deleted_path);
|
|
}
|
|
|
|
// Delete processed image if it exists
|
|
let processed_image_filename = format!("{}_processed.png", document.id);
|
|
let processed_image_path = self.get_processed_images_path().join(&processed_image_filename);
|
|
if let Some(deleted_path) = safe_delete(&processed_image_path, &mut serious_errors).await {
|
|
deleted_files.push(deleted_path);
|
|
}
|
|
|
|
// Only fail if there were serious errors (not "file not found")
|
|
if !serious_errors.is_empty() {
|
|
error!("Serious errors occurred while deleting files for document {}: {}", document.id, serious_errors.join("; "));
|
|
return Err(anyhow::anyhow!("File deletion errors: {}", serious_errors.join("; ")));
|
|
}
|
|
|
|
if deleted_files.is_empty() {
|
|
info!("No files needed deletion for document {} (all files already removed)", document.id);
|
|
} else {
|
|
info!("Successfully deleted {} files for document {}", deleted_files.len(), document.id);
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
} |