use anyhow::{anyhow, Result};
use async_trait::async_trait;
use chrono::{DateTime, Datelike};
use tracing::{debug, info, warn, error};
use serde_json;
use std::collections::HashMap;
use std::time::Duration;
use uuid::Uuid;
use futures::stream::StreamExt;
use tokio::io::AsyncReadExt;

#[cfg(feature = "s3")]
use aws_sdk_s3::Client;
#[cfg(feature = "s3")]
use aws_credential_types::Credentials;
#[cfg(feature = "s3")]
use aws_types::region::Region as AwsRegion;
#[cfg(feature = "s3")]
use aws_sdk_s3::primitives::ByteStream;
#[cfg(feature = "s3")]
use aws_sdk_s3::types::{CompletedPart, CompletedMultipartUpload};

use crate::models::{FileIngestionInfo, S3SourceConfig};
use crate::storage::StorageBackend;

/// Threshold for using streaming multipart uploads (100MB)
const STREAMING_THRESHOLD: usize = 100 * 1024 * 1024;

/// Multipart upload chunk size (16MB - AWS minimum is 5MB, we use 16MB for better performance)
const MULTIPART_CHUNK_SIZE: usize = 16 * 1024 * 1024;
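// Example: a file just over the 100MB threshold is uploaded in 7 parts
// (6 full 16MB chunks plus a small remainder, via ceiling division).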

#[derive(Debug, Clone)]
pub struct S3Service {
    #[cfg(feature = "s3")]
    client: Client,
    config: S3SourceConfig,
}

impl S3Service {
    pub async fn new(config: S3SourceConfig) -> Result<Self> {
        #[cfg(not(feature = "s3"))]
        {
            return Err(anyhow!("S3 support not compiled in. Enable the 's3' feature to use S3 sources."));
        }

        #[cfg(feature = "s3")]
        {
            // Validate required fields
            if config.bucket_name.is_empty() {
                return Err(anyhow!("Bucket name is required"));
            }
            if config.access_key_id.is_empty() {
                return Err(anyhow!("Access key ID is required"));
            }
            if config.secret_access_key.is_empty() {
                return Err(anyhow!("Secret access key is required"));
            }

            // Create S3 client with custom configuration
            let credentials = Credentials::new(
                &config.access_key_id,
                &config.secret_access_key,
                None, // session token
                None, // expiry
                "readur-s3-source"
            );

            let region = if config.region.is_empty() {
                "us-east-1".to_string()
            } else {
                config.region.clone()
            };

            let mut s3_config_builder = aws_sdk_s3::config::Builder::new()
                .region(AwsRegion::new(region))
                .credentials_provider(credentials)
                .behavior_version_latest();

            // Set custom endpoint if provided (for S3-compatible services)
            if let Some(endpoint_url) = &config.endpoint_url {
                if !endpoint_url.is_empty() {
                    s3_config_builder = s3_config_builder.endpoint_url(endpoint_url);
                    info!("Using custom S3 endpoint: {}", endpoint_url);
                }
            }

            let s3_config = s3_config_builder.build();
            let client = Client::from_conf(s3_config);

            Ok(Self {
                #[cfg(feature = "s3")]
                client,
                config
            })
        }
    }

    /// Discover files in a specific S3 prefix (folder)
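    /// Listing is paginated internally via ListObjectsV2 continuation tokens, so all matching objects are returned.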
    pub async fn discover_files_in_folder(&self, folder_path: &str) -> Result<Vec<FileIngestionInfo>> {
        #[cfg(not(feature = "s3"))]
        {
            return Err(anyhow!("S3 support not compiled in"));
        }

        #[cfg(feature = "s3")]
        {
            info!("Scanning S3 bucket: {} prefix: {}", self.config.bucket_name, folder_path);

            let mut files = Vec::new();
            let mut continuation_token: Option<String> = None;

            loop {
                let mut list_request = self.client
                    .list_objects_v2()
                    .bucket(&self.config.bucket_name)
                    .prefix(folder_path);

                if let Some(token) = &continuation_token {
                    list_request = list_request.continuation_token(token);
                }

                match list_request.send().await {
                    Ok(response) => {
                        if let Some(contents) = response.contents {
                            for object in contents {
                                if let Some(key) = object.key {
                                    // Skip "directories" (keys ending with /)
                                    if key.ends_with('/') {
                                        continue;
                                    }

                                    // Check file extension
                                    let extension = std::path::Path::new(&key)
                                        .extension()
                                        .and_then(|ext| ext.to_str())
                                        .unwrap_or("")
                                        .to_lowercase();

                                    if !self.config.file_extensions.contains(&extension) {
                                        debug!("Skipping S3 object with unsupported extension: {}", key);
                                        continue;
                                    }

                                    let file_name = std::path::Path::new(&key)
                                        .file_name()
                                        .and_then(|name| name.to_str())
                                        .unwrap_or(&key)
                                        .to_string();

                                    let size = object.size.unwrap_or(0);
                                    let last_modified = object.last_modified
                                        .and_then(|dt| {
                                            // Convert AWS DateTime to chrono DateTime
                                            let timestamp = dt.secs();
                                            DateTime::from_timestamp(timestamp, 0)
                                        });

                                    let etag = object.e_tag.unwrap_or_else(|| {
                                        // Generate a fallback ETag if none provided
                                        format!("fallback-{}", &key.chars().take(16).collect::<String>())
                                    });

                                    // Remove quotes from ETag if present
                                    let etag = etag.trim_matches('"').to_string();

                                    let mime_type = Self::get_mime_type(&extension);

                                    // Build additional metadata from S3 object properties
                                    let mut metadata_map = serde_json::Map::new();

                                    // Add S3-specific metadata
                                    if let Some(storage_class) = &object.storage_class {
                                        metadata_map.insert("storage_class".to_string(), serde_json::Value::String(storage_class.as_str().to_string()));
                                    }

                                    if let Some(owner) = &object.owner {
                                        if let Some(display_name) = &owner.display_name {
                                            metadata_map.insert("owner_display_name".to_string(), serde_json::Value::String(display_name.clone()));
                                        }
                                        if let Some(id) = &owner.id {
                                            metadata_map.insert("owner_id".to_string(), serde_json::Value::String(id.clone()));
                                        }
                                    }

                                    // Store the S3 key for reference
                                    metadata_map.insert("s3_key".to_string(), serde_json::Value::String(key.clone()));

                                    // Add bucket name for reference
                                    metadata_map.insert("s3_bucket".to_string(), serde_json::Value::String(self.config.bucket_name.clone()));

                                    // If we have region info, add it
                                    metadata_map.insert("s3_region".to_string(), serde_json::Value::String(self.config.region.clone()));

                                    let file_info = FileIngestionInfo {
                                        relative_path: key.clone(),
                                        full_path: format!("s3://{}/{}", self.config.bucket_name, key), // S3 full path includes bucket
                                        #[allow(deprecated)]
                                        path: key.clone(),
                                        name: file_name,
                                        size,
                                        mime_type,
                                        last_modified,
                                        etag,
                                        is_directory: false,
                                        created_at: None, // S3 doesn't provide creation time, only last modified
                                        permissions: None, // S3 uses different permission model (ACLs/policies)
                                        owner: object.owner.as_ref().and_then(|o| o.display_name.clone()),
                                        group: None, // S3 doesn't have Unix-style groups
                                        metadata: if metadata_map.is_empty() { None } else { Some(serde_json::Value::Object(metadata_map)) },
                                    };

                                    files.push(file_info);
                                }
                            }
                        }

                        // Check if there are more results
                        if response.is_truncated == Some(true) {
                            continuation_token = response.next_continuation_token;
                        } else {
                            break;
                        }
                    }
                    Err(e) => {
                        return Err(anyhow!("Failed to list S3 objects: {}", e));
                    }
                }
            }

            info!("Found {} files in S3 bucket {} prefix {}", files.len(), self.config.bucket_name, folder_path);
            Ok(files)
        }
    }

    /// Download file content from S3
    pub async fn download_file(&self, object_key: &str) -> Result<Vec<u8>> {
        #[cfg(not(feature = "s3"))]
        {
            return Err(anyhow!("S3 support not compiled in"));
        }

        #[cfg(feature = "s3")]
        {
            info!("Downloading S3 object: {}/{}", self.config.bucket_name, object_key);

            let response = self.client
                .get_object()
                .bucket(&self.config.bucket_name)
                .key(object_key)
                .send()
                .await
                .map_err(|e| anyhow!("Failed to download S3 object {}: {}", object_key, e))?;

            let body = response.body.collect().await
                .map_err(|e| anyhow!("Failed to read S3 object body: {}", e))?;

            let bytes = body.into_bytes().to_vec();
            info!("Downloaded S3 object {} ({} bytes)", object_key, bytes.len());

            Ok(bytes)
        }
    }

    /// Test S3 connection and access to bucket
    pub async fn test_connection(&self) -> Result<String> {
        #[cfg(not(feature = "s3"))]
        {
            return Err(anyhow!("S3 support not compiled in"));
        }

        #[cfg(feature = "s3")]
        {
            info!("Testing S3 connection to bucket: {}", self.config.bucket_name);

            // Test bucket access by listing objects with a limit
            let response = self.client
                .list_objects_v2()
                .bucket(&self.config.bucket_name)
                .max_keys(1)
                .send()
                .await
                .map_err(|e| anyhow!("Failed to access S3 bucket {}: {}", self.config.bucket_name, e))?;

            // Test if we can get bucket region (additional validation)
            let _head_bucket_response = self.client
                .head_bucket()
                .bucket(&self.config.bucket_name)
                .send()
                .await
                .map_err(|e| anyhow!("Cannot access bucket {}: {}", self.config.bucket_name, e))?;

            let object_count = response.key_count.unwrap_or(0);

            Ok(format!(
                "Successfully connected to S3 bucket '{}' (found {} objects)",
                self.config.bucket_name, object_count
            ))
        }
    }

    /// Get estimated file count and size for all watch folders
    pub async fn estimate_sync(&self) -> Result<(usize, i64)> {
        let mut total_files = 0;
        let mut total_size = 0i64;

        for folder in &self.config.watch_folders {
            match self.discover_files_in_folder(folder).await {
                Ok(files) => {
                    total_files += files.len();
                    total_size += files.iter().map(|f| f.size).sum::<i64>();
                }
                Err(e) => {
                    warn!("Failed to estimate folder {}: {}", folder, e);
                }
            }
        }

        Ok((total_files, total_size))
    }

    /// Get MIME type based on file extension
    fn get_mime_type(extension: &str) -> String {
        match extension {
            "pdf" => "application/pdf",
            "txt" => "text/plain",
            "png" => "image/png",
            "jpg" | "jpeg" => "image/jpeg",
            "tiff" | "tif" => "image/tiff",
            "bmp" => "image/bmp",
            "gif" => "image/gif",
            "webp" => "image/webp",
            "doc" => "application/msword",
            "docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            "xls" => "application/vnd.ms-excel",
            "xlsx" => "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "ppt" => "application/vnd.ms-powerpoint",
            "pptx" => "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            _ => "application/octet-stream",
        }.to_string()
    }

    pub fn get_config(&self) -> &S3SourceConfig {
        &self.config
    }

    // ========================================
    // DIRECT STORAGE OPERATIONS
    // ========================================

    /// Store a file directly to S3 with structured path
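    /// Returns the generated S3 key on success (see `generate_document_key`).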
    pub async fn store_document(&self, user_id: Uuid, document_id: Uuid, filename: &str, data: &[u8]) -> Result<String> {
        #[cfg(not(feature = "s3"))]
        {
            return Err(anyhow!("S3 support not compiled in"));
        }

        #[cfg(feature = "s3")]
        {
            let key = self.generate_document_key(user_id, document_id, filename);

            // Use streaming upload for large files
            if data.len() > STREAMING_THRESHOLD {
                info!("Using streaming multipart upload for large file: {} ({} bytes)", key, data.len());
                self.store_file_multipart(&key, data, None).await?;
            } else {
                self.store_file(&key, data, None).await?;
            }

            Ok(key)
        }
    }

    /// Store a thumbnail to S3
    pub async fn store_thumbnail(&self, user_id: Uuid, document_id: Uuid, data: &[u8]) -> Result<String> {
        #[cfg(not(feature = "s3"))]
        {
            return Err(anyhow!("S3 support not compiled in"));
        }

        #[cfg(feature = "s3")]
        {
            let key = format!("thumbnails/{}/{}_thumb.jpg", user_id, document_id);
            self.store_file(&key, data, Some(self.get_image_metadata())).await?;
            Ok(key)
        }
    }

    /// Store a processed image to S3
    pub async fn store_processed_image(&self, user_id: Uuid, document_id: Uuid, data: &[u8]) -> Result<String> {
        #[cfg(not(feature = "s3"))]
        {
            return Err(anyhow!("S3 support not compiled in"));
        }

        #[cfg(feature = "s3")]
        {
            let key = format!("processed_images/{}/{}_processed.png", user_id, document_id);
            self.store_file(&key, data, Some(self.get_image_metadata())).await?;
            Ok(key)
        }
    }

    /// Generic file storage method
    async fn store_file(&self, key: &str, data: &[u8], metadata: Option<HashMap<String, String>>) -> Result<()> {
        #[cfg(not(feature = "s3"))]
        {
            return Err(anyhow!("S3 support not compiled in"));
        }

        #[cfg(feature = "s3")]
        {
            info!("Storing file to S3: {}/{}", self.config.bucket_name, key);

            let key_owned = key.to_string();
            let data_owned = data.to_vec();
            let metadata_owned = metadata.clone();
            let bucket_name = self.config.bucket_name.clone();
            let client = self.client.clone();

            self.retry_operation(&format!("store_file: {}", key), || {
                let key = key_owned.clone();
                let data = data_owned.clone();
                let metadata = metadata_owned.clone();
                let bucket_name = bucket_name.clone();
                let client = client.clone();
                let content_type = self.get_content_type_from_key(&key);

                async move {
                    let mut put_request = client
                        .put_object()
                        .bucket(&bucket_name)
                        .key(&key)
                        .body(ByteStream::from(data));

                    // Add metadata if provided
                    if let Some(meta) = metadata {
                        for (k, v) in meta {
                            put_request = put_request.metadata(k, v);
                        }
                    }

                    // Set content type based on file extension
                    if let Some(ct) = content_type {
                        put_request = put_request.content_type(ct);
                    }

                    put_request.send().await
                        .map_err(|e| anyhow!("Failed to store file {}: {}", key, e))?;

                    Ok(())
                }
            }).await?;

            info!("Successfully stored file: {}", key);
            Ok(())
        }
    }

    /// Store large files using multipart upload for better performance and memory usage
    async fn store_file_multipart(&self, key: &str, data: &[u8], metadata: Option<HashMap<String, String>>) -> Result<()> {
        #[cfg(not(feature = "s3"))]
        {
            return Err(anyhow!("S3 support not compiled in"));
        }

        #[cfg(feature = "s3")]
        {
            info!("Starting multipart upload for file: {}/{} ({} bytes)", self.config.bucket_name, key, data.len());

            let key_owned = key.to_string();
            let data_owned = data.to_vec();
            let metadata_owned = metadata.clone();
            let bucket_name = self.config.bucket_name.clone();
            let client = self.client.clone();

            self.retry_operation(&format!("store_file_multipart: {}", key), || {
                let key = key_owned.clone();
                let data = data_owned.clone();
                let metadata = metadata_owned.clone();
                let bucket_name = bucket_name.clone();
                let client = client.clone();
                let content_type = self.get_content_type_from_key(&key);

                async move {
                    // Step 1: Initiate multipart upload
                    let mut create_request = client
                        .create_multipart_upload()
                        .bucket(&bucket_name)
                        .key(&key);

                    // Add metadata if provided
                    if let Some(meta) = metadata {
                        for (k, v) in meta {
                            create_request = create_request.metadata(k, v);
                        }
                    }

                    // Set content type based on file extension
                    if let Some(ct) = content_type {
                        create_request = create_request.content_type(ct);
                    }

                    let create_response = create_request.send().await
                        .map_err(|e| anyhow!("Failed to initiate multipart upload for {}: {}", key, e))?;

                    let upload_id = create_response.upload_id()
                        .ok_or_else(|| anyhow!("Missing upload ID in multipart upload response"))?;

                    info!("Initiated multipart upload for {}: {}", key, upload_id);

                    // Step 2: Upload parts in chunks
                    let mut completed_parts = Vec::new();
                    let total_chunks = (data.len() + MULTIPART_CHUNK_SIZE - 1) / MULTIPART_CHUNK_SIZE;

                    for (chunk_index, chunk) in data.chunks(MULTIPART_CHUNK_SIZE).enumerate() {
                        let part_number = (chunk_index + 1) as i32;

                        debug!("Uploading part {} of {} for {} ({} bytes)",
                               part_number, total_chunks, key, chunk.len());

                        let upload_part_response = client
                            .upload_part()
                            .bucket(&bucket_name)
                            .key(&key)
                            .upload_id(upload_id)
                            .part_number(part_number)
                            .body(ByteStream::from(chunk.to_vec()))
                            .send()
                            .await
                            .map_err(|e| anyhow!("Failed to upload part {} for {}: {}", part_number, key, e))?;

                        let etag = upload_part_response.e_tag()
                            .ok_or_else(|| anyhow!("Missing ETag in upload part response"))?;

                        completed_parts.push(
                            CompletedPart::builder()
                                .part_number(part_number)
                                .e_tag(etag)
                                .build()
                        );

                        debug!("Successfully uploaded part {} for {}", part_number, key);
                    }

                    // Step 3: Complete multipart upload
                    let completed_multipart_upload = CompletedMultipartUpload::builder()
                        .set_parts(Some(completed_parts))
                        .build();

                    client
                        .complete_multipart_upload()
                        .bucket(&bucket_name)
                        .key(&key)
                        .upload_id(upload_id)
                        .multipart_upload(completed_multipart_upload)
                        .send()
                        .await
                        .map_err(|e| {
                            // If completion fails, try to abort the multipart upload
                            let abort_client = client.clone();
                            let abort_bucket = bucket_name.clone();
                            let abort_key = key.clone();
                            let abort_upload_id = upload_id.to_string();

                            tokio::spawn(async move {
                                if let Err(abort_err) = abort_client
                                    .abort_multipart_upload()
                                    .bucket(abort_bucket)
                                    .key(abort_key)
                                    .upload_id(abort_upload_id)
                                    .send()
                                    .await
                                {
                                    error!("Failed to abort multipart upload: {}", abort_err);
                                }
                            });

                            anyhow!("Failed to complete multipart upload for {}: {}", key, e)
                        })?;

                    info!("Successfully completed multipart upload for {}", key);
                    Ok(())
                }
            }).await?;

            Ok(())
        }
    }

    /// Retrieve a file from S3
    pub async fn retrieve_file(&self, key: &str) -> Result<Vec<u8>> {
        #[cfg(not(feature = "s3"))]
        {
            return Err(anyhow!("S3 support not compiled in"));
        }

        #[cfg(feature = "s3")]
        {
            info!("Retrieving file from S3: {}/{}", self.config.bucket_name, key);

            let key_owned = key.to_string();
            let bucket_name = self.config.bucket_name.clone();
            let client = self.client.clone();

            let bytes = self.retry_operation(&format!("retrieve_file: {}", key), || {
                let key = key_owned.clone();
                let bucket_name = bucket_name.clone();
                let client = client.clone();

                async move {
                    let response = client
                        .get_object()
                        .bucket(&bucket_name)
                        .key(&key)
                        .send()
                        .await
                        .map_err(|e| anyhow!("Failed to retrieve file {}: {}", key, e))?;

                    let body = response.body.collect().await
                        .map_err(|e| anyhow!("Failed to read file body: {}", e))?;

                    Ok(body.into_bytes().to_vec())
                }
            }).await?;

            info!("Successfully retrieved file: {} ({} bytes)", key, bytes.len());
            Ok(bytes)
        }
    }

    /// Delete a file from S3
    pub async fn delete_file(&self, key: &str) -> Result<()> {
        #[cfg(not(feature = "s3"))]
        {
            return Err(anyhow!("S3 support not compiled in"));
        }

        #[cfg(feature = "s3")]
        {
            info!("Deleting file from S3: {}/{}", self.config.bucket_name, key);

            self.client
                .delete_object()
                .bucket(&self.config.bucket_name)
                .key(key)
                .send()
                .await
                .map_err(|e| anyhow!("Failed to delete file {}: {}", key, e))?;

            info!("Successfully deleted file: {}", key);
            Ok(())
        }
    }

    /// Check if a file exists in S3
    pub async fn file_exists(&self, key: &str) -> Result<bool> {
        #[cfg(not(feature = "s3"))]
        {
            return Err(anyhow!("S3 support not compiled in"));
        }

        #[cfg(feature = "s3")]
        {
            match self.client
                .head_object()
                .bucket(&self.config.bucket_name)
                .key(key)
                .send()
                .await
            {
                Ok(_) => Ok(true),
                Err(e) => {
                    let error_msg = e.to_string();
                    if error_msg.contains("NotFound") || error_msg.contains("404") {
                        Ok(false)
                    } else {
                        Err(anyhow!("Failed to check file existence {}: {}", key, e))
                    }
                }
            }
        }
    }

    /// Delete all files for a document (document, thumbnail, processed image)
    pub async fn delete_document_files(&self, user_id: Uuid, document_id: Uuid, filename: &str) -> Result<()> {
        #[cfg(not(feature = "s3"))]
        {
            return Err(anyhow!("S3 support not compiled in"));
        }

        #[cfg(feature = "s3")]
        {
            let document_key = self.generate_document_key(user_id, document_id, filename);
            let thumbnail_key = format!("thumbnails/{}/{}_thumb.jpg", user_id, document_id);
            let processed_key = format!("processed_images/{}/{}_processed.png", user_id, document_id);

            let mut errors = Vec::new();

            // Delete document file
            if let Err(e) = self.delete_file(&document_key).await {
                if !e.to_string().contains("NotFound") {
                    errors.push(format!("Document: {}", e));
                }
            }

            // Delete thumbnail
            if let Err(e) = self.delete_file(&thumbnail_key).await {
                if !e.to_string().contains("NotFound") {
                    errors.push(format!("Thumbnail: {}", e));
                }
            }

            // Delete processed image
            if let Err(e) = self.delete_file(&processed_key).await {
                if !e.to_string().contains("NotFound") {
                    errors.push(format!("Processed image: {}", e));
                }
            }

            if !errors.is_empty() {
                return Err(anyhow!("Failed to delete some files: {}", errors.join("; ")));
            }

            info!("Successfully deleted all files for document {}", document_id);
            Ok(())
        }
    }

    // ========================================
    // HELPER METHODS
    // ========================================

    /// Generate a structured S3 key for a document
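    /// For example, `documents/<user_id>/2024/07/<document_id>.pdf`; year and month come from the current UTC date.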
    fn generate_document_key(&self, user_id: Uuid, document_id: Uuid, filename: &str) -> String {
        let now = chrono::Utc::now();
        let year = now.year();
        let month = now.month();

        // Extract file extension
        let extension = std::path::Path::new(filename)
            .extension()
            .and_then(|ext| ext.to_str())
            .unwrap_or("");

        if extension.is_empty() {
            format!("documents/{}/{:04}/{:02}/{}", user_id, year, month, document_id)
        } else {
            format!("documents/{}/{:04}/{:02}/{}.{}", user_id, year, month, document_id, extension)
        }
    }

    /// Get content type from S3 key/filename
    fn get_content_type_from_key(&self, key: &str) -> Option<String> {
        let extension = std::path::Path::new(key)
            .extension()
            .and_then(|ext| ext.to_str())
            .unwrap_or("")
            .to_lowercase();

        Some(Self::get_mime_type(&extension))
    }

    /// Get metadata for image files
    fn get_image_metadata(&self) -> HashMap<String, String> {
        let mut metadata = HashMap::new();
        metadata.insert("generated-by".to_string(), "readur".to_string());
        metadata.insert("created-at".to_string(), chrono::Utc::now().to_rfc3339());
        metadata
    }

    /// Retry wrapper for S3 operations with exponential backoff
    async fn retry_operation<T, F, Fut>(&self, operation_name: &str, operation: F) -> Result<T>
    where
        F: Fn() -> Fut,
        Fut: std::future::Future<Output = Result<T>>,
    {
        const MAX_RETRIES: u32 = 3;
        const BASE_DELAY_MS: u64 = 100;
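        // With these values a failed call is retried after 100ms, 200ms, and 400ms;
        // if the fourth attempt also fails, its error is returned.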

        let mut last_error = None;

        for attempt in 0..=MAX_RETRIES {
            match operation().await {
                Ok(result) => {
                    if attempt > 0 {
                        info!("S3 operation '{}' succeeded after {} retries", operation_name, attempt);
                    }
                    return Ok(result);
                }
                Err(e) => {
                    last_error = Some(e);

                    if attempt < MAX_RETRIES {
                        let delay_ms = BASE_DELAY_MS * 2u64.pow(attempt);
                        warn!("S3 operation '{}' failed (attempt {}/{}), retrying in {}ms: {}",
                              operation_name, attempt + 1, MAX_RETRIES + 1, delay_ms, last_error.as_ref().unwrap());
                        tokio::time::sleep(Duration::from_millis(delay_ms)).await;
                    }
                }
            }
        }

        error!("S3 operation '{}' failed after {} attempts: {}",
               operation_name, MAX_RETRIES + 1, last_error.as_ref().unwrap());
        Err(last_error.unwrap())
    }
}

// Implement StorageBackend trait for S3Service
#[async_trait]
impl StorageBackend for S3Service {
    fn as_any(&self) -> Option<&dyn std::any::Any> {
        Some(self)
    }

    async fn store_document(&self, user_id: Uuid, document_id: Uuid, filename: &str, data: &[u8]) -> Result<String> {
        // Generate S3 key
        let key = self.generate_document_key(user_id, document_id, filename);

        // Use streaming upload for large files
        if data.len() > STREAMING_THRESHOLD {
            info!("Using streaming multipart upload for large file: {} ({} bytes)", key, data.len());
            self.store_file_multipart(&key, data, None).await?;
        } else {
            self.store_file(&key, data, None).await?;
        }
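
        // Note: the trait methods return an `s3://`-prefixed path; `retrieve_file`
        // and `file_exists` below strip that prefix before calling S3.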
        Ok(format!("s3://{}", key))
    }

    async fn store_thumbnail(&self, user_id: Uuid, document_id: Uuid, data: &[u8]) -> Result<String> {
        let key = format!("thumbnails/{}/{}_thumb.jpg", user_id, document_id);
        self.store_file(&key, data, Some(self.get_image_metadata())).await?;
        Ok(format!("s3://{}", key))
    }

    async fn store_processed_image(&self, user_id: Uuid, document_id: Uuid, data: &[u8]) -> Result<String> {
        let key = format!("processed_images/{}/{}_processed.png", user_id, document_id);
        self.store_file(&key, data, Some(self.get_image_metadata())).await?;
        Ok(format!("s3://{}", key))
    }

    async fn retrieve_file(&self, path: &str) -> Result<Vec<u8>> {
        // Handle s3:// prefix if present
        let key = if path.starts_with("s3://") {
            path.strip_prefix("s3://").unwrap_or(path)
        } else {
            path
        };

        #[cfg(not(feature = "s3"))]
        {
            return Err(anyhow!("S3 support not compiled in"));
        }

        #[cfg(feature = "s3")]
        {
            info!("Retrieving file from S3: {}/{}", self.config.bucket_name, key);

            let key_owned = key.to_string();
            let bucket_name = self.config.bucket_name.clone();
            let client = self.client.clone();

            let bytes = self.retry_operation(&format!("retrieve_file: {}", key), || {
                let key = key_owned.clone();
                let bucket_name = bucket_name.clone();
                let client = client.clone();

                async move {
                    let response = client
                        .get_object()
                        .bucket(&bucket_name)
                        .key(&key)
                        .send()
                        .await
                        .map_err(|e| anyhow!("Failed to retrieve file {}: {}", key, e))?;

                    let body = response.body.collect().await
                        .map_err(|e| anyhow!("Failed to read file body: {}", e))?;

                    Ok(body.into_bytes().to_vec())
                }
            }).await?;

            info!("Successfully retrieved file: {} ({} bytes)", key, bytes.len());
            Ok(bytes)
        }
    }

    async fn delete_document_files(&self, user_id: Uuid, document_id: Uuid, filename: &str) -> Result<()> {
        #[cfg(not(feature = "s3"))]
        {
            return Err(anyhow!("S3 support not compiled in"));
        }

        #[cfg(feature = "s3")]
        {
            let document_key = self.generate_document_key(user_id, document_id, filename);
            let thumbnail_key = format!("thumbnails/{}/{}_thumb.jpg", user_id, document_id);
            let processed_key = format!("processed_images/{}/{}_processed.png", user_id, document_id);

            let mut errors = Vec::new();

            // Delete document file
            if let Err(e) = self.delete_file(&document_key).await {
                if !e.to_string().contains("NotFound") {
                    errors.push(format!("Document: {}", e));
                }
            }

            // Delete thumbnail
            if let Err(e) = self.delete_file(&thumbnail_key).await {
                if !e.to_string().contains("NotFound") {
                    errors.push(format!("Thumbnail: {}", e));
                }
            }

            // Delete processed image
            if let Err(e) = self.delete_file(&processed_key).await {
                if !e.to_string().contains("NotFound") {
                    errors.push(format!("Processed image: {}", e));
                }
            }

            if !errors.is_empty() {
                return Err(anyhow!("Failed to delete some files: {}", errors.join("; ")));
            }

            info!("Successfully deleted all files for document {}", document_id);
            Ok(())
        }
    }

    async fn file_exists(&self, path: &str) -> Result<bool> {
        // Handle s3:// prefix if present
        let key = if path.starts_with("s3://") {
            path.strip_prefix("s3://").unwrap_or(path)
        } else {
            path
        };

        #[cfg(not(feature = "s3"))]
        {
            return Err(anyhow!("S3 support not compiled in"));
        }

        #[cfg(feature = "s3")]
        {
            match self.client
                .head_object()
                .bucket(&self.config.bucket_name)
                .key(key)
                .send()
                .await
            {
                Ok(_) => Ok(true),
                Err(e) => {
                    let error_msg = e.to_string();
                    if error_msg.contains("NotFound") || error_msg.contains("404") {
                        Ok(false)
                    } else {
                        Err(anyhow!("Failed to check file existence {}: {}", key, e))
                    }
                }
            }
        }
    }

    fn storage_type(&self) -> &'static str {
        "s3"
    }

    async fn initialize(&self) -> Result<()> {
        self.test_connection().await?;
        info!("S3 storage backend initialized successfully");
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_s3_config_creation() {
        let config = S3SourceConfig {
            bucket_name: "test-bucket".to_string(),
            region: "us-east-1".to_string(),
            access_key_id: "test-key".to_string(),
            secret_access_key: "test-secret".to_string(),
            endpoint_url: None,
            prefix: None,
            watch_folders: vec!["documents/".to_string()],
            file_extensions: vec!["pdf".to_string(), "txt".to_string()],
            auto_sync: true,
            sync_interval_minutes: 60,
        };

        // This will create the client but won't test actual S3 access
        let service = S3Service::new(config).await;
        #[cfg(feature = "s3")]
        assert!(service.is_ok());
        #[cfg(not(feature = "s3"))]
        assert!(service.is_err());
    }

    #[test]
    fn test_mime_type_detection() {
        assert_eq!(S3Service::get_mime_type("pdf"), "application/pdf");
        assert_eq!(S3Service::get_mime_type("jpg"), "image/jpeg");
        assert_eq!(S3Service::get_mime_type("txt"), "text/plain");
        assert_eq!(S3Service::get_mime_type("unknown"), "application/octet-stream");
    }
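
    // Illustrative check of the ceiling-division part-count arithmetic that
    // store_file_multipart uses with MULTIPART_CHUNK_SIZE; STREAMING_THRESHOLD
    // serves as a representative large input.
    #[test]
    fn test_multipart_chunk_count_arithmetic() {
        let cases = [
            (MULTIPART_CHUNK_SIZE - 1, 1),  // just under one chunk
            (MULTIPART_CHUNK_SIZE, 1),      // exactly one chunk
            (MULTIPART_CHUNK_SIZE + 1, 2),  // spills into a second part
            (STREAMING_THRESHOLD, 7),       // 100MB / 16MB -> 6 full parts + 1 remainder
        ];

        for (size, expected_chunks) in cases {
            let total_chunks = (size + MULTIPART_CHUNK_SIZE - 1) / MULTIPART_CHUNK_SIZE;
            assert_eq!(total_chunks, expected_chunks, "unexpected part count for {} bytes", size);
        }
    }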
} |