use anyhow::{anyhow, Result}; use async_trait::async_trait; use chrono::{DateTime, Datelike}; use tracing::{debug, info, warn, error}; use serde_json; use std::collections::HashMap; use std::time::Duration; use uuid::Uuid; use futures::stream::StreamExt; use tokio::io::AsyncReadExt; #[cfg(feature = "s3")] use aws_sdk_s3::Client; #[cfg(feature = "s3")] use aws_credential_types::Credentials; #[cfg(feature = "s3")] use aws_types::region::Region as AwsRegion; #[cfg(feature = "s3")] use aws_sdk_s3::primitives::ByteStream; #[cfg(feature = "s3")] use aws_sdk_s3::types::{CompletedPart, CompletedMultipartUpload}; use crate::models::{FileIngestionInfo, S3SourceConfig}; use crate::storage::StorageBackend; /// Threshold for using streaming multipart uploads (100MB) const STREAMING_THRESHOLD: usize = 100 * 1024 * 1024; /// Multipart upload chunk size (16MB - AWS minimum is 5MB, we use 16MB for better performance) const MULTIPART_CHUNK_SIZE: usize = 16 * 1024 * 1024; #[derive(Debug, Clone)] pub struct S3Service { #[cfg(feature = "s3")] client: Client, config: S3SourceConfig, } impl S3Service { pub async fn new(config: S3SourceConfig) -> Result { #[cfg(not(feature = "s3"))] { return Err(anyhow!("S3 support not compiled in. Enable the 's3' feature to use S3 sources.")); } #[cfg(feature = "s3")] { // Validate required fields if config.bucket_name.is_empty() { return Err(anyhow!("Bucket name is required")); } if config.access_key_id.is_empty() { return Err(anyhow!("Access key ID is required")); } if config.secret_access_key.is_empty() { return Err(anyhow!("Secret access key is required")); } // Create S3 client with custom configuration let credentials = Credentials::new( &config.access_key_id, &config.secret_access_key, None, // session token None, // expiry "readur-s3-source" ); let region = if config.region.is_empty() { "us-east-1".to_string() } else { config.region.clone() }; let mut s3_config_builder = aws_sdk_s3::config::Builder::new() .region(AwsRegion::new(region)) .credentials_provider(credentials) .behavior_version_latest(); // Set custom endpoint if provided (for S3-compatible services) if let Some(endpoint_url) = &config.endpoint_url { if !endpoint_url.is_empty() { s3_config_builder = s3_config_builder.endpoint_url(endpoint_url); info!("Using custom S3 endpoint: {}", endpoint_url); } } let s3_config = s3_config_builder.build(); let client = Client::from_conf(s3_config); Ok(Self { #[cfg(feature = "s3")] client, config }) } } /// Discover files in a specific S3 prefix (folder) pub async fn discover_files_in_folder(&self, folder_path: &str) -> Result> { #[cfg(not(feature = "s3"))] { return Err(anyhow!("S3 support not compiled in")); } #[cfg(feature = "s3")] { info!("Scanning S3 bucket: {} prefix: {}", self.config.bucket_name, folder_path); let mut files = Vec::new(); let mut continuation_token: Option = None; loop { let mut list_request = self.client .list_objects_v2() .bucket(&self.config.bucket_name) .prefix(folder_path); if let Some(token) = &continuation_token { list_request = list_request.continuation_token(token); } match list_request.send().await { Ok(response) => { if let Some(contents) = response.contents { for object in contents { if let Some(key) = object.key { // Skip "directories" (keys ending with /) if key.ends_with('/') { continue; } // Check file extension let extension = std::path::Path::new(&key) .extension() .and_then(|ext| ext.to_str()) .unwrap_or("") .to_lowercase(); if !self.config.file_extensions.contains(&extension) { debug!("Skipping S3 object with unsupported extension: {}", key); continue; } let file_name = std::path::Path::new(&key) .file_name() .and_then(|name| name.to_str()) .unwrap_or(&key) .to_string(); let size = object.size.unwrap_or(0); let last_modified = object.last_modified .and_then(|dt| { // Convert AWS DateTime to chrono DateTime let timestamp = dt.secs(); DateTime::from_timestamp(timestamp, 0) }); let etag = object.e_tag.unwrap_or_else(|| { // Generate a fallback ETag if none provided format!("fallback-{}", &key.chars().take(16).collect::()) }); // Remove quotes from ETag if present let etag = etag.trim_matches('"').to_string(); let mime_type = Self::get_mime_type(&extension); // Build additional metadata from S3 object properties let mut metadata_map = serde_json::Map::new(); // Add S3-specific metadata if let Some(storage_class) = &object.storage_class { metadata_map.insert("storage_class".to_string(), serde_json::Value::String(storage_class.as_str().to_string())); } if let Some(owner) = &object.owner { if let Some(display_name) = &owner.display_name { metadata_map.insert("owner_display_name".to_string(), serde_json::Value::String(display_name.clone())); } if let Some(id) = &owner.id { metadata_map.insert("owner_id".to_string(), serde_json::Value::String(id.clone())); } } // Store the S3 key for reference metadata_map.insert("s3_key".to_string(), serde_json::Value::String(key.clone())); // Add bucket name for reference metadata_map.insert("s3_bucket".to_string(), serde_json::Value::String(self.config.bucket_name.clone())); // If we have region info, add it metadata_map.insert("s3_region".to_string(), serde_json::Value::String(self.config.region.clone())); let file_info = FileIngestionInfo { relative_path: key.clone(), full_path: format!("s3://{}/{}", self.config.bucket_name, key), // S3 full path includes bucket #[allow(deprecated)] path: key.clone(), name: file_name, size, mime_type, last_modified, etag, is_directory: false, created_at: None, // S3 doesn't provide creation time, only last modified permissions: None, // S3 uses different permission model (ACLs/policies) owner: object.owner.as_ref().and_then(|o| o.display_name.clone()), group: None, // S3 doesn't have Unix-style groups metadata: if metadata_map.is_empty() { None } else { Some(serde_json::Value::Object(metadata_map)) }, }; files.push(file_info); } } } // Check if there are more results if response.is_truncated == Some(true) { continuation_token = response.next_continuation_token; } else { break; } } Err(e) => { return Err(anyhow!("Failed to list S3 objects: {}", e)); } } } info!("Found {} files in S3 bucket {} prefix {}", files.len(), self.config.bucket_name, folder_path); Ok(files) } } /// Download file content from S3 pub async fn download_file(&self, object_key: &str) -> Result> { #[cfg(not(feature = "s3"))] { return Err(anyhow!("S3 support not compiled in")); } #[cfg(feature = "s3")] { info!("Downloading S3 object: {}/{}", self.config.bucket_name, object_key); let response = self.client .get_object() .bucket(&self.config.bucket_name) .key(object_key) .send() .await .map_err(|e| anyhow!("Failed to download S3 object {}: {}", object_key, e))?; let body = response.body.collect().await .map_err(|e| anyhow!("Failed to read S3 object body: {}", e))?; let bytes = body.into_bytes().to_vec(); info!("Downloaded S3 object {} ({} bytes)", object_key, bytes.len()); Ok(bytes) } } /// Test S3 connection and access to bucket pub async fn test_connection(&self) -> Result { #[cfg(not(feature = "s3"))] { return Err(anyhow!("S3 support not compiled in")); } #[cfg(feature = "s3")] { info!("Testing S3 connection to bucket: {}", self.config.bucket_name); // Test bucket access by listing objects with a limit let response = self.client .list_objects_v2() .bucket(&self.config.bucket_name) .max_keys(1) .send() .await .map_err(|e| anyhow!("Failed to access S3 bucket {}: {}", self.config.bucket_name, e))?; // Test if we can get bucket region (additional validation) let _head_bucket_response = self.client .head_bucket() .bucket(&self.config.bucket_name) .send() .await .map_err(|e| anyhow!("Cannot access bucket {}: {}", self.config.bucket_name, e))?; let object_count = response.key_count.unwrap_or(0); Ok(format!( "Successfully connected to S3 bucket '{}' (found {} objects)", self.config.bucket_name, object_count )) } } /// Get estimated file count and size for all watch folders pub async fn estimate_sync(&self) -> Result<(usize, i64)> { let mut total_files = 0; let mut total_size = 0i64; for folder in &self.config.watch_folders { match self.discover_files_in_folder(folder).await { Ok(files) => { total_files += files.len(); total_size += files.iter().map(|f| f.size).sum::(); } Err(e) => { warn!("Failed to estimate folder {}: {}", folder, e); } } } Ok((total_files, total_size)) } /// Get MIME type based on file extension fn get_mime_type(extension: &str) -> String { match extension { "pdf" => "application/pdf", "txt" => "text/plain", "png" => "image/png", "jpg" | "jpeg" => "image/jpeg", "tiff" | "tif" => "image/tiff", "bmp" => "image/bmp", "gif" => "image/gif", "webp" => "image/webp", "doc" => "application/msword", "docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "xls" => "application/vnd.ms-excel", "xlsx" => "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "ppt" => "application/vnd.ms-powerpoint", "pptx" => "application/vnd.openxmlformats-officedocument.presentationml.presentation", _ => "application/octet-stream", }.to_string() } pub fn get_config(&self) -> &S3SourceConfig { &self.config } // ======================================== // DIRECT STORAGE OPERATIONS // ======================================== /// Store a file directly to S3 with structured path pub async fn store_document(&self, user_id: Uuid, document_id: Uuid, filename: &str, data: &[u8]) -> Result { #[cfg(not(feature = "s3"))] { return Err(anyhow!("S3 support not compiled in")); } #[cfg(feature = "s3")] { let key = self.generate_document_key(user_id, document_id, filename); // Use streaming upload for large files if data.len() > STREAMING_THRESHOLD { info!("Using streaming multipart upload for large file: {} ({} bytes)", key, data.len()); self.store_file_multipart(&key, data, None).await?; } else { self.store_file(&key, data, None).await?; } Ok(key) } } /// Store a thumbnail to S3 pub async fn store_thumbnail(&self, user_id: Uuid, document_id: Uuid, data: &[u8]) -> Result { #[cfg(not(feature = "s3"))] { return Err(anyhow!("S3 support not compiled in")); } #[cfg(feature = "s3")] { let key = format!("thumbnails/{}/{}_thumb.jpg", user_id, document_id); self.store_file(&key, data, Some(self.get_image_metadata())).await?; Ok(key) } } /// Store a processed image to S3 pub async fn store_processed_image(&self, user_id: Uuid, document_id: Uuid, data: &[u8]) -> Result { #[cfg(not(feature = "s3"))] { return Err(anyhow!("S3 support not compiled in")); } #[cfg(feature = "s3")] { let key = format!("processed_images/{}/{}_processed.png", user_id, document_id); self.store_file(&key, data, Some(self.get_image_metadata())).await?; Ok(key) } } /// Generic file storage method async fn store_file(&self, key: &str, data: &[u8], metadata: Option>) -> Result<()> { #[cfg(not(feature = "s3"))] { return Err(anyhow!("S3 support not compiled in")); } #[cfg(feature = "s3")] { info!("Storing file to S3: {}/{}", self.config.bucket_name, key); let key_owned = key.to_string(); let data_owned = data.to_vec(); let metadata_owned = metadata.clone(); let bucket_name = self.config.bucket_name.clone(); let client = self.client.clone(); self.retry_operation(&format!("store_file: {}", key), || { let key = key_owned.clone(); let data = data_owned.clone(); let metadata = metadata_owned.clone(); let bucket_name = bucket_name.clone(); let client = client.clone(); let content_type = self.get_content_type_from_key(&key); async move { let mut put_request = client .put_object() .bucket(&bucket_name) .key(&key) .body(ByteStream::from(data)); // Add metadata if provided if let Some(meta) = metadata { for (k, v) in meta { put_request = put_request.metadata(k, v); } } // Set content type based on file extension if let Some(ct) = content_type { put_request = put_request.content_type(ct); } put_request.send().await .map_err(|e| anyhow!("Failed to store file {}: {}", key, e))?; Ok(()) } }).await?; info!("Successfully stored file: {}", key); Ok(()) } } /// Store large files using multipart upload for better performance and memory usage async fn store_file_multipart(&self, key: &str, data: &[u8], metadata: Option>) -> Result<()> { #[cfg(not(feature = "s3"))] { return Err(anyhow!("S3 support not compiled in")); } #[cfg(feature = "s3")] { info!("Starting multipart upload for file: {}/{} ({} bytes)", self.config.bucket_name, key, data.len()); let key_owned = key.to_string(); let data_owned = data.to_vec(); let metadata_owned = metadata.clone(); let bucket_name = self.config.bucket_name.clone(); let client = self.client.clone(); self.retry_operation(&format!("store_file_multipart: {}", key), || { let key = key_owned.clone(); let data = data_owned.clone(); let metadata = metadata_owned.clone(); let bucket_name = bucket_name.clone(); let client = client.clone(); let content_type = self.get_content_type_from_key(&key); async move { // Step 1: Initiate multipart upload let mut create_request = client .create_multipart_upload() .bucket(&bucket_name) .key(&key); // Add metadata if provided if let Some(meta) = metadata { for (k, v) in meta { create_request = create_request.metadata(k, v); } } // Set content type based on file extension if let Some(ct) = content_type { create_request = create_request.content_type(ct); } let create_response = create_request.send().await .map_err(|e| anyhow!("Failed to initiate multipart upload for {}: {}", key, e))?; let upload_id = create_response.upload_id() .ok_or_else(|| anyhow!("Missing upload ID in multipart upload response"))?; info!("Initiated multipart upload for {}: {}", key, upload_id); // Step 2: Upload parts in chunks let mut completed_parts = Vec::new(); let total_chunks = (data.len() + MULTIPART_CHUNK_SIZE - 1) / MULTIPART_CHUNK_SIZE; for (chunk_index, chunk) in data.chunks(MULTIPART_CHUNK_SIZE).enumerate() { let part_number = (chunk_index + 1) as i32; debug!("Uploading part {} of {} for {} ({} bytes)", part_number, total_chunks, key, chunk.len()); let upload_part_response = client .upload_part() .bucket(&bucket_name) .key(&key) .upload_id(upload_id) .part_number(part_number) .body(ByteStream::from(chunk.to_vec())) .send() .await .map_err(|e| anyhow!("Failed to upload part {} for {}: {}", part_number, key, e))?; let etag = upload_part_response.e_tag() .ok_or_else(|| anyhow!("Missing ETag in upload part response"))?; completed_parts.push( CompletedPart::builder() .part_number(part_number) .e_tag(etag) .build() ); debug!("Successfully uploaded part {} for {}", part_number, key); } // Step 3: Complete multipart upload let completed_multipart_upload = CompletedMultipartUpload::builder() .set_parts(Some(completed_parts)) .build(); client .complete_multipart_upload() .bucket(&bucket_name) .key(&key) .upload_id(upload_id) .multipart_upload(completed_multipart_upload) .send() .await .map_err(|e| { // If completion fails, try to abort the multipart upload let abort_client = client.clone(); let abort_bucket = bucket_name.clone(); let abort_key = key.clone(); let abort_upload_id = upload_id.to_string(); tokio::spawn(async move { if let Err(abort_err) = abort_client .abort_multipart_upload() .bucket(abort_bucket) .key(abort_key) .upload_id(abort_upload_id) .send() .await { error!("Failed to abort multipart upload: {}", abort_err); } }); anyhow!("Failed to complete multipart upload for {}: {}", key, e) })?; info!("Successfully completed multipart upload for {}", key); Ok(()) } }).await?; Ok(()) } } /// Retrieve a file from S3 pub async fn retrieve_file(&self, key: &str) -> Result> { #[cfg(not(feature = "s3"))] { return Err(anyhow!("S3 support not compiled in")); } #[cfg(feature = "s3")] { info!("Retrieving file from S3: {}/{}", self.config.bucket_name, key); let key_owned = key.to_string(); let bucket_name = self.config.bucket_name.clone(); let client = self.client.clone(); let bytes = self.retry_operation(&format!("retrieve_file: {}", key), || { let key = key_owned.clone(); let bucket_name = bucket_name.clone(); let client = client.clone(); async move { let response = client .get_object() .bucket(&bucket_name) .key(&key) .send() .await .map_err(|e| anyhow!("Failed to retrieve file {}: {}", key, e))?; let body = response.body.collect().await .map_err(|e| anyhow!("Failed to read file body: {}", e))?; Ok(body.into_bytes().to_vec()) } }).await?; info!("Successfully retrieved file: {} ({} bytes)", key, bytes.len()); Ok(bytes) } } /// Delete a file from S3 pub async fn delete_file(&self, key: &str) -> Result<()> { #[cfg(not(feature = "s3"))] { return Err(anyhow!("S3 support not compiled in")); } #[cfg(feature = "s3")] { info!("Deleting file from S3: {}/{}", self.config.bucket_name, key); self.client .delete_object() .bucket(&self.config.bucket_name) .key(key) .send() .await .map_err(|e| anyhow!("Failed to delete file {}: {}", key, e))?; info!("Successfully deleted file: {}", key); Ok(()) } } /// Check if a file exists in S3 pub async fn file_exists(&self, key: &str) -> Result { #[cfg(not(feature = "s3"))] { return Err(anyhow!("S3 support not compiled in")); } #[cfg(feature = "s3")] { match self.client .head_object() .bucket(&self.config.bucket_name) .key(key) .send() .await { Ok(_) => Ok(true), Err(e) => { let error_msg = e.to_string(); if error_msg.contains("NotFound") || error_msg.contains("404") { Ok(false) } else { Err(anyhow!("Failed to check file existence {}: {}", key, e)) } } } } } /// Delete all files for a document (document, thumbnail, processed image) pub async fn delete_document_files(&self, user_id: Uuid, document_id: Uuid, filename: &str) -> Result<()> { #[cfg(not(feature = "s3"))] { return Err(anyhow!("S3 support not compiled in")); } #[cfg(feature = "s3")] { let document_key = self.generate_document_key(user_id, document_id, filename); let thumbnail_key = format!("thumbnails/{}/{}_thumb.jpg", user_id, document_id); let processed_key = format!("processed_images/{}/{}_processed.png", user_id, document_id); let mut errors = Vec::new(); // Delete document file if let Err(e) = self.delete_file(&document_key).await { if !e.to_string().contains("NotFound") { errors.push(format!("Document: {}", e)); } } // Delete thumbnail if let Err(e) = self.delete_file(&thumbnail_key).await { if !e.to_string().contains("NotFound") { errors.push(format!("Thumbnail: {}", e)); } } // Delete processed image if let Err(e) = self.delete_file(&processed_key).await { if !e.to_string().contains("NotFound") { errors.push(format!("Processed image: {}", e)); } } if !errors.is_empty() { return Err(anyhow!("Failed to delete some files: {}", errors.join("; "))); } info!("Successfully deleted all files for document {}", document_id); Ok(()) } } // ======================================== // HELPER METHODS // ======================================== /// Generate a structured S3 key for a document fn generate_document_key(&self, user_id: Uuid, document_id: Uuid, filename: &str) -> String { let now = chrono::Utc::now(); let year = now.year(); let month = now.month(); // Extract file extension let extension = std::path::Path::new(filename) .extension() .and_then(|ext| ext.to_str()) .unwrap_or(""); if extension.is_empty() { format!("documents/{}/{:04}/{:02}/{}", user_id, year, month, document_id) } else { format!("documents/{}/{:04}/{:02}/{}.{}", user_id, year, month, document_id, extension) } } /// Get content type from S3 key/filename fn get_content_type_from_key(&self, key: &str) -> Option { let extension = std::path::Path::new(key) .extension() .and_then(|ext| ext.to_str()) .unwrap_or("") .to_lowercase(); Some(Self::get_mime_type(&extension)) } /// Get metadata for image files fn get_image_metadata(&self) -> HashMap { let mut metadata = HashMap::new(); metadata.insert("generated-by".to_string(), "readur".to_string()); metadata.insert("created-at".to_string(), chrono::Utc::now().to_rfc3339()); metadata } /// Retry wrapper for S3 operations with exponential backoff async fn retry_operation(&self, operation_name: &str, operation: F) -> Result where F: Fn() -> Fut, Fut: std::future::Future>, { const MAX_RETRIES: u32 = 3; const BASE_DELAY_MS: u64 = 100; let mut last_error = None; for attempt in 0..=MAX_RETRIES { match operation().await { Ok(result) => { if attempt > 0 { info!("S3 operation '{}' succeeded after {} retries", operation_name, attempt); } return Ok(result); } Err(e) => { last_error = Some(e); if attempt < MAX_RETRIES { let delay_ms = BASE_DELAY_MS * 2u64.pow(attempt); warn!("S3 operation '{}' failed (attempt {}/{}), retrying in {}ms: {}", operation_name, attempt + 1, MAX_RETRIES + 1, delay_ms, last_error.as_ref().unwrap()); tokio::time::sleep(Duration::from_millis(delay_ms)).await; } } } } error!("S3 operation '{}' failed after {} attempts: {}", operation_name, MAX_RETRIES + 1, last_error.as_ref().unwrap()); Err(last_error.unwrap()) } } // Implement StorageBackend trait for S3Service #[async_trait] impl StorageBackend for S3Service { fn as_any(&self) -> Option<&dyn std::any::Any> { Some(self) } async fn store_document(&self, user_id: Uuid, document_id: Uuid, filename: &str, data: &[u8]) -> Result { // Generate S3 key let key = self.generate_document_key(user_id, document_id, filename); // Use streaming upload for large files if data.len() > STREAMING_THRESHOLD { info!("Using streaming multipart upload for large file: {} ({} bytes)", key, data.len()); self.store_file_multipart(&key, data, None).await?; } else { self.store_file(&key, data, None).await?; } Ok(format!("s3://{}", key)) } async fn store_thumbnail(&self, user_id: Uuid, document_id: Uuid, data: &[u8]) -> Result { let key = format!("thumbnails/{}/{}_thumb.jpg", user_id, document_id); self.store_file(&key, data, Some(self.get_image_metadata())).await?; Ok(format!("s3://{}", key)) } async fn store_processed_image(&self, user_id: Uuid, document_id: Uuid, data: &[u8]) -> Result { let key = format!("processed_images/{}/{}_processed.png", user_id, document_id); self.store_file(&key, data, Some(self.get_image_metadata())).await?; Ok(format!("s3://{}", key)) } async fn retrieve_file(&self, path: &str) -> Result> { // Handle s3:// prefix if present let key = if path.starts_with("s3://") { path.strip_prefix("s3://").unwrap_or(path) } else { path }; #[cfg(not(feature = "s3"))] { return Err(anyhow!("S3 support not compiled in")); } #[cfg(feature = "s3")] { info!("Retrieving file from S3: {}/{}", self.config.bucket_name, key); let key_owned = key.to_string(); let bucket_name = self.config.bucket_name.clone(); let client = self.client.clone(); let bytes = self.retry_operation(&format!("retrieve_file: {}", key), || { let key = key_owned.clone(); let bucket_name = bucket_name.clone(); let client = client.clone(); async move { let response = client .get_object() .bucket(&bucket_name) .key(&key) .send() .await .map_err(|e| anyhow!("Failed to retrieve file {}: {}", key, e))?; let body = response.body.collect().await .map_err(|e| anyhow!("Failed to read file body: {}", e))?; Ok(body.into_bytes().to_vec()) } }).await?; info!("Successfully retrieved file: {} ({} bytes)", key, bytes.len()); Ok(bytes) } } async fn delete_document_files(&self, user_id: Uuid, document_id: Uuid, filename: &str) -> Result<()> { #[cfg(not(feature = "s3"))] { return Err(anyhow!("S3 support not compiled in")); } #[cfg(feature = "s3")] { let document_key = self.generate_document_key(user_id, document_id, filename); let thumbnail_key = format!("thumbnails/{}/{}_thumb.jpg", user_id, document_id); let processed_key = format!("processed_images/{}/{}_processed.png", user_id, document_id); let mut errors = Vec::new(); // Delete document file if let Err(e) = self.delete_file(&document_key).await { if !e.to_string().contains("NotFound") { errors.push(format!("Document: {}", e)); } } // Delete thumbnail if let Err(e) = self.delete_file(&thumbnail_key).await { if !e.to_string().contains("NotFound") { errors.push(format!("Thumbnail: {}", e)); } } // Delete processed image if let Err(e) = self.delete_file(&processed_key).await { if !e.to_string().contains("NotFound") { errors.push(format!("Processed image: {}", e)); } } if !errors.is_empty() { return Err(anyhow!("Failed to delete some files: {}", errors.join("; "))); } info!("Successfully deleted all files for document {}", document_id); Ok(()) } } async fn file_exists(&self, path: &str) -> Result { // Handle s3:// prefix if present let key = if path.starts_with("s3://") { path.strip_prefix("s3://").unwrap_or(path) } else { path }; #[cfg(not(feature = "s3"))] { return Err(anyhow!("S3 support not compiled in")); } #[cfg(feature = "s3")] { match self.client .head_object() .bucket(&self.config.bucket_name) .key(key) .send() .await { Ok(_) => Ok(true), Err(e) => { let error_msg = e.to_string(); if error_msg.contains("NotFound") || error_msg.contains("404") { Ok(false) } else { Err(anyhow!("Failed to check file existence {}: {}", key, e)) } } } } } fn storage_type(&self) -> &'static str { "s3" } async fn initialize(&self) -> Result<()> { self.test_connection().await?; info!("S3 storage backend initialized successfully"); Ok(()) } } #[cfg(test)] mod tests { use super::*; #[tokio::test] async fn test_s3_config_creation() { let config = S3SourceConfig { bucket_name: "test-bucket".to_string(), region: "us-east-1".to_string(), access_key_id: "test-key".to_string(), secret_access_key: "test-secret".to_string(), endpoint_url: None, prefix: None, watch_folders: vec!["documents/".to_string()], file_extensions: vec!["pdf".to_string(), "txt".to_string()], auto_sync: true, sync_interval_minutes: 60, }; // This will create the client but won't test actual S3 access let service = S3Service::new(config).await; #[cfg(feature = "s3")] assert!(service.is_ok()); #[cfg(not(feature = "s3"))] assert!(service.is_err()); } #[test] fn test_mime_type_detection() { assert_eq!(S3Service::get_mime_type("pdf"), "application/pdf"); assert_eq!(S3Service::get_mime_type("jpg"), "image/jpeg"); assert_eq!(S3Service::get_mime_type("txt"), "text/plain"); assert_eq!(S3Service::get_mime_type("unknown"), "application/octet-stream"); } }