use anyhow::Result;
use reqwest::Method;
use std::collections::HashSet;
use tokio::sync::Semaphore;
use futures_util::stream::{self, StreamExt};
use tracing::{debug, info, warn};

use crate::models::{FileIngestionInfo, WebDAVCrawlEstimate, WebDAVFolderInfo};
use crate::webdav_xml_parser::{parse_propfind_response, parse_propfind_response_with_directories};
use super::config::{WebDAVConfig, ConcurrencyConfig};
use super::connection::WebDAVConnection;
use super::url_management::WebDAVUrlManager;

/// Results from WebDAV discovery including both files and directories
#[derive(Debug, Clone)]
pub struct WebDAVDiscoveryResult {
    pub files: Vec<FileIngestionInfo>,
    pub directories: Vec<FileIngestionInfo>,
}

pub struct WebDAVDiscovery {
    connection: WebDAVConnection,
    config: WebDAVConfig,
    concurrency_config: ConcurrencyConfig,
    url_manager: WebDAVUrlManager,
}

impl WebDAVDiscovery {
    pub fn new(
        connection: WebDAVConnection,
        config: WebDAVConfig,
        concurrency_config: ConcurrencyConfig,
    ) -> Self {
        let url_manager = WebDAVUrlManager::new(config.clone());
        Self {
            connection,
            config,
            concurrency_config,
            url_manager,
        }
    }

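    // Illustrative wiring sketch (not part of this module's API surface).
    // `WebDAVConnection::new`, the struct-literal config, and
    // `ConcurrencyConfig::default()` are assumptions for the example; only
    // `WebDAVDiscovery::new` above is taken from this file:
    //
    //     let config = WebDAVConfig { /* server_url, credentials, watch_folders, ... */ };
    //     let connection = WebDAVConnection::new(config.clone());
    //     let discovery = WebDAVDiscovery::new(connection, config, ConcurrencyConfig::default());
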
    /// Discovers files in a directory, either recursively through the whole
    /// subtree or limited to the single directory, with extension filtering
    pub async fn discover_files(&self, directory_path: &str, recursive: bool) -> Result<Vec<FileIngestionInfo>> {
        info!("🔍 Discovering files in directory: {}", directory_path);

        if recursive {
            self.discover_files_recursive(directory_path).await
        } else {
            self.discover_files_single_directory(directory_path).await
        }
    }

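    // A minimal calling sketch, assuming `discovery` was built as in the
    // wiring example above. `relative_path` and `size` are fields used
    // elsewhere in this file; no other fields are assumed:
    //
    //     let files = discovery.discover_files("/Documents", true).await?;
    //     for f in &files {
    //         println!("{} ({} bytes)", f.relative_path, f.size);
    //     }
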
    /// Discovers both files and directories with their ETags for directory tracking
    pub async fn discover_files_and_directories(&self, directory_path: &str, recursive: bool) -> Result<WebDAVDiscoveryResult> {
        info!("🔍 Discovering files and directories in: {}", directory_path);

        if recursive {
            self.discover_files_and_directories_recursive(directory_path).await
        } else {
            self.discover_files_and_directories_single(directory_path).await
        }
    }

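    // Sketch of consuming the combined result (hypothetical caller code):
    //
    //     let WebDAVDiscoveryResult { files, directories } = discovery
    //         .discover_files_and_directories("/Documents", false)
    //         .await?;
    //
    // Directory entries carry ETags, so a sync layer could compare them
    // against stored values to skip unchanged subtrees.
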
    /// Discovers files in a single directory (non-recursive)
    async fn discover_files_single_directory(&self, directory_path: &str) -> Result<Vec<FileIngestionInfo>> {
        let url = self.connection.get_url_for_path(directory_path);

        let propfind_body = r#"<?xml version="1.0" encoding="utf-8"?>
<D:propfind xmlns:D="DAV:">
    <D:prop>
        <D:displayname/>
        <D:getcontentlength/>
        <D:getlastmodified/>
        <D:getetag/>
        <D:resourcetype/>
        <D:creationdate/>
    </D:prop>
</D:propfind>"#;

        let response = self.connection
            .authenticated_request(
                Method::from_bytes(b"PROPFIND")?,
                &url,
                Some(propfind_body.to_string()),
                Some(vec![
                    ("Depth", "1"),
                    ("Content-Type", "application/xml"),
                ]),
            )
            .await?;

        let body = response.text().await?;
        let files = parse_propfind_response(&body)?;

        // Process file paths using the centralized URL manager
        let files = self.url_manager.process_file_infos(files);

        // Filter files based on supported extensions
        let filtered_files: Vec<FileIngestionInfo> = files
            .into_iter()
            .filter(|file| {
                !file.is_directory && self.config.is_supported_extension(&file.name)
            })
            .collect();

        debug!("Found {} supported files in directory: {}", filtered_files.len(), directory_path);
        Ok(filtered_files)
    }

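    // For reference, a `Depth: 1` PROPFIND like the one above comes back as a
    // 207 Multi-Status document that `parse_propfind_response` consumes. A
    // trimmed, illustrative response (shape per RFC 4918; the values are made
    // up) looks like:
    //
    //     <D:multistatus xmlns:D="DAV:">
    //       <D:response>
    //         <D:href>/remote.php/dav/files/user/Documents/report.pdf</D:href>
    //         <D:propstat>
    //           <D:prop>
    //             <D:getcontentlength>48231</D:getcontentlength>
    //             <D:getetag>"5f3e2a"</D:getetag>
    //             <D:resourcetype/>
    //           </D:prop>
    //           <D:status>HTTP/1.1 200 OK</D:status>
    //         </D:propstat>
    //       </D:response>
    //     </D:multistatus>
    //
    // Servers typically include the requested collection itself as one of the
    // <D:response> entries, alongside its immediate members.
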
    /// Discovers both files and directories in a single directory (non-recursive)
    async fn discover_files_and_directories_single(&self, directory_path: &str) -> Result<WebDAVDiscoveryResult> {
        let url = self.connection.get_url_for_path(directory_path);

        let propfind_body = r#"<?xml version="1.0" encoding="utf-8"?>
<D:propfind xmlns:D="DAV:">
    <D:prop>
        <D:displayname/>
        <D:getcontentlength/>
        <D:getlastmodified/>
        <D:getetag/>
        <D:resourcetype/>
        <D:creationdate/>
    </D:prop>
</D:propfind>"#;

        let response = self.connection
            .authenticated_request(
                Method::from_bytes(b"PROPFIND")?,
                &url,
                Some(propfind_body.to_string()),
                Some(vec![
                    ("Depth", "1"),
                    ("Content-Type", "application/xml"),
                ]),
            )
            .await?;

        let body = response.text().await?;
        let all_items = parse_propfind_response_with_directories(&body)?;

        // Process file paths using the centralized URL manager
        let all_items = self.url_manager.process_file_infos(all_items);

        // Separate files and directories
        let mut files = Vec::new();
        let mut directories = Vec::new();

        for item in all_items {
            if item.is_directory {
                directories.push(item);
            } else if self.config.is_supported_extension(&item.name) {
                files.push(item);
            }
        }

        debug!("Single directory '{}': {} files, {} directories",
            directory_path, files.len(), directories.len());

        Ok(WebDAVDiscoveryResult { files, directories })
    }

    /// Discovers files recursively in directory tree
    async fn discover_files_recursive(&self, root_directory: &str) -> Result<Vec<FileIngestionInfo>> {
        let mut all_files = Vec::new();
        let mut directories_to_scan = vec![root_directory.to_string()];
        let semaphore = Semaphore::new(self.concurrency_config.max_concurrent_scans);

        while !directories_to_scan.is_empty() {
            // Take up to `max_concurrent_scans` directories for this batch.
            // Draining an explicit range keeps the rest of the queue intact;
            // `drain(..).take(n)` would discard the un-taken tail when the
            // `Drain` guard is dropped (see the sketch after this function).
            let batch_size = directories_to_scan
                .len()
                .min(self.concurrency_config.max_concurrent_scans);
            let current_batch: Vec<String> = directories_to_scan.drain(..batch_size).collect();

            let tasks = current_batch.into_iter().map(|dir| {
                let semaphore = &semaphore;
                async move {
                    // The semaphore is never closed, so acquire() cannot fail here
                    let _permit = semaphore.acquire().await.unwrap();
                    self.scan_directory_with_subdirs(&dir).await
                }
            });

            let results = stream::iter(tasks)
                .buffer_unordered(self.concurrency_config.max_concurrent_scans)
                .collect::<Vec<_>>()
                .await;

            for result in results {
                match result {
                    Ok((files, subdirs)) => {
                        all_files.extend(files);
                        directories_to_scan.extend(subdirs);
                    }
                    Err(e) => {
                        warn!("Failed to scan directory: {}", e);
                    }
                }
            }
        }

        info!("Recursive discovery found {} total files", all_files.len());
        Ok(all_files)
    }

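    // Why the explicit drain range above matters — a minimal sketch of the
    // `Vec::drain(..).take(n)` pitfall (standard-library behavior, runnable
    // as-is in any Rust program):
    //
    //     let mut queue = vec!["a", "b", "c", "d"];
    //     let batch: Vec<_> = queue.drain(..).take(2).collect();
    //     assert_eq!(batch, vec!["a", "b"]);
    //     assert!(queue.is_empty()); // "c" and "d" were removed, not kept
    //
    // Dropping the `Drain` guard removes the whole drained range, so the
    // breadth-first queue would silently lose directories between batches.
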
    /// Discovers both files and directories recursively in directory tree
    async fn discover_files_and_directories_recursive(&self, root_directory: &str) -> Result<WebDAVDiscoveryResult> {
        let mut all_files = Vec::new();
        let mut all_directories = Vec::new();
        let mut directories_to_scan = vec![root_directory.to_string()];
        let semaphore = Semaphore::new(self.concurrency_config.max_concurrent_scans);

        while !directories_to_scan.is_empty() {
            // Drain an explicit range so directories beyond this batch stay
            // queued; `drain(..).take(n)` would discard them (see the sketch
            // after `discover_files_recursive`).
            let batch_size = directories_to_scan
                .len()
                .min(self.concurrency_config.max_concurrent_scans);
            let current_batch: Vec<String> = directories_to_scan.drain(..batch_size).collect();

            let tasks = current_batch.into_iter().map(|dir| {
                let semaphore = &semaphore;
                async move {
                    // The semaphore is never closed, so acquire() cannot fail here
                    let _permit = semaphore.acquire().await.unwrap();
                    self.scan_directory_with_all_info(&dir).await
                }
            });

            let results = stream::iter(tasks)
                .buffer_unordered(self.concurrency_config.max_concurrent_scans)
                .collect::<Vec<_>>()
                .await;

            for result in results {
                match result {
                    Ok((files, directories, subdirs_to_scan)) => {
                        all_files.extend(files);
                        all_directories.extend(directories);
                        directories_to_scan.extend(subdirs_to_scan);
                    }
                    Err(e) => {
                        warn!("Failed to scan directory: {}", e);
                    }
                }
            }
        }

        info!("Recursive discovery found {} total files and {} directories",
            all_files.len(), all_directories.len());
        Ok(WebDAVDiscoveryResult {
            files: all_files,
            directories: all_directories,
        })
    }

    /// Scans a directory and returns both files and subdirectories
    async fn scan_directory_with_subdirs(&self, directory_path: &str) -> Result<(Vec<FileIngestionInfo>, Vec<String>)> {
        let url = self.connection.get_url_for_path(directory_path);

        let propfind_body = r#"<?xml version="1.0" encoding="utf-8"?>
<D:propfind xmlns:D="DAV:">
    <D:prop>
        <D:displayname/>
        <D:getcontentlength/>
        <D:getlastmodified/>
        <D:getetag/>
        <D:resourcetype/>
        <D:creationdate/>
    </D:prop>
</D:propfind>"#;

        let response = self.connection
            .authenticated_request(
                Method::from_bytes(b"PROPFIND")?,
                &url,
                Some(propfind_body.to_string()),
                Some(vec![
                    ("Depth", "1"),
                    ("Content-Type", "application/xml"),
                ]),
            )
            .await?;

        let body = response.text().await?;
        let all_items = parse_propfind_response_with_directories(&body)?;

        // Process file paths using the centralized URL manager
        let all_items = self.url_manager.process_file_infos(all_items);

        // Separate files and directories
        let mut filtered_files = Vec::new();
        let mut subdirectory_paths = Vec::new();

        for item in all_items {
            if item.is_directory {
                // Use the relative_path, which is set by the url_manager above
                subdirectory_paths.push(item.relative_path);
            } else if self.config.is_supported_extension(&item.name) {
                filtered_files.push(item);
            }
        }

        debug!("Directory '{}': {} files, {} subdirectories",
            directory_path, filtered_files.len(), subdirectory_paths.len());

        Ok((filtered_files, subdirectory_paths))
    }

    /// Scans a directory and returns files, directories, and subdirectory paths for the scan queue
    async fn scan_directory_with_all_info(&self, directory_path: &str) -> Result<(Vec<FileIngestionInfo>, Vec<FileIngestionInfo>, Vec<String>)> {
        let url = self.connection.get_url_for_path(directory_path);

        let propfind_body = r#"<?xml version="1.0" encoding="utf-8"?>
<D:propfind xmlns:D="DAV:">
    <D:prop>
        <D:displayname/>
        <D:getcontentlength/>
        <D:getlastmodified/>
        <D:getetag/>
        <D:resourcetype/>
        <D:creationdate/>
    </D:prop>
</D:propfind>"#;

        let response = self.connection
            .authenticated_request(
                Method::from_bytes(b"PROPFIND")?,
                &url,
                Some(propfind_body.to_string()),
                Some(vec![
                    ("Depth", "1"),
                    ("Content-Type", "application/xml"),
                ]),
            )
            .await?;

        let body = response.text().await?;
        let all_items = parse_propfind_response_with_directories(&body)?;

        // Process file paths using the centralized URL manager
        let all_items = self.url_manager.process_file_infos(all_items);

        // Separate files and directories
        let mut filtered_files = Vec::new();
        let mut directories = Vec::new();
        let mut subdirectory_paths = Vec::new();

        for item in all_items {
            if item.is_directory {
                // Use the relative_path, which is set by the url_manager above
                subdirectory_paths.push(item.relative_path.clone());
                directories.push(item);
            } else if self.config.is_supported_extension(&item.name) {
                filtered_files.push(item);
            }
        }

        debug!("Directory '{}': {} files, {} directories, {} paths to scan",
            directory_path, filtered_files.len(), directories.len(), subdirectory_paths.len());

        Ok((filtered_files, directories, subdirectory_paths))
    }

    /// Estimates crawl time and file counts for watch folders
    pub async fn estimate_crawl(&self) -> Result<WebDAVCrawlEstimate> {
        info!("📊 Estimating crawl for WebDAV watch folders");

        let mut folders = Vec::new();
        let mut total_files = 0;
        let mut total_supported_files = 0;
        let mut total_size_mb = 0.0;

        for watch_folder in &self.config.watch_folders {
            match self.estimate_folder(watch_folder).await {
                Ok(folder_info) => {
                    total_files += folder_info.total_files;
                    total_supported_files += folder_info.supported_files;
                    total_size_mb += folder_info.total_size_mb;
                    folders.push(folder_info);
                }
                Err(e) => {
                    warn!("Failed to estimate folder '{}': {}", watch_folder, e);
                    // Add empty folder info for failed estimates
                    folders.push(WebDAVFolderInfo {
                        path: watch_folder.clone(),
                        total_files: 0,
                        supported_files: 0,
                        estimated_time_hours: 0.0,
                        total_size_mb: 0.0,
                    });
                }
            }
        }

        // Estimate total time based on file count and average processing time
        let avg_time_per_file_seconds = 2.0; // Conservative estimate
        let total_estimated_time_hours = (total_supported_files as f32 * avg_time_per_file_seconds) / 3600.0;

        Ok(WebDAVCrawlEstimate {
            folders,
            total_files,
            total_supported_files,
            total_estimated_time_hours,
            total_size_mb,
        })
    }

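    // Worked example of the time estimate above: with the assumed average of
    // 2 seconds per file, 1,800 supported files come out to
    // (1_800.0 * 2.0) / 3600.0 = 1.0 hour, and 900 files to half an hour.
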
    /// Estimates file count and size for a specific folder
    async fn estimate_folder(&self, folder_path: &str) -> Result<WebDAVFolderInfo> {
        debug!("Estimating folder: {}", folder_path);

        // List the folder's immediate files as the sample baseline
        let sample_files = self.discover_files_single_directory(folder_path).await?;

        // Get subdirectories for deeper estimation
        let subdirs = self.get_subdirectories(folder_path).await?;

        let mut total_files = sample_files.len() as i64;
        // Size is summed from the sampled root files only; extrapolated
        // subdirectory files do not contribute to the size estimate.
        let total_size: i64 = sample_files.iter().map(|f| f.size).sum();

        // Sample a few subdirectories to extrapolate
        let sample_size = std::cmp::min(5, subdirs.len());
        if sample_size > 0 {
            let mut sample_total = 0i64;

            for subdir in subdirs.iter().take(sample_size) {
                if let Ok(subdir_files) = self.discover_files_single_directory(subdir).await {
                    sample_total += subdir_files.len() as i64;
                }
            }

            // Extrapolate based on sample
            if sample_total > 0 {
                let avg_files_per_subdir = sample_total as f64 / sample_size as f64;
                total_files += (avg_files_per_subdir * subdirs.len() as f64) as i64;
            }
        }

        // Filter for supported files
        let supported_files = (total_files as f64 * self.calculate_support_ratio(&sample_files)) as i64;

        let total_size_mb = total_size as f64 / (1024.0 * 1024.0);
        let estimated_time_hours = (supported_files as f32 * 2.0) / 3600.0; // 2 seconds per file

        Ok(WebDAVFolderInfo {
            path: folder_path.to_string(),
            total_files,
            supported_files,
            estimated_time_hours,
            total_size_mb,
        })
    }

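    // Extrapolation example (made-up numbers): a folder root holding 40 files
    // with 12 subdirectories samples min(5, 12) = 5 of them; if those contain
    // 10, 10, 20, 20, and 40 files, the average is 20 per subdirectory, so
    // total_files becomes 40 + 20 * 12 = 280. With a support ratio of 0.5
    // that yields 140 supported files and (140.0 * 2.0) / 3600.0 ≈ 0.078
    // estimated hours.
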
    /// Gets subdirectories for a given path
    async fn get_subdirectories(&self, directory_path: &str) -> Result<Vec<String>> {
        let url = self.connection.get_url_for_path(directory_path);

        let propfind_body = r#"<?xml version="1.0" encoding="utf-8"?>
<D:propfind xmlns:D="DAV:">
    <D:prop>
        <D:resourcetype/>
    </D:prop>
</D:propfind>"#;

        let response = self.connection
            .authenticated_request(
                Method::from_bytes(b"PROPFIND")?,
                &url,
                Some(propfind_body.to_string()),
                Some(vec![
                    ("Depth", "1"),
                    ("Content-Type", "application/xml"),
                ]),
            )
            .await?;

        let body = response.text().await?;
        let all_items = parse_propfind_response_with_directories(&body)?;

        // Process file paths using the centralized URL manager
        let all_items = self.url_manager.process_file_infos(all_items);

        // Keep only directories and extract their paths
        let directory_paths: Vec<String> = all_items
            .into_iter()
            .filter(|item| item.is_directory)
            .map(|item| item.relative_path)
            .collect();

        Ok(directory_paths)
    }

    /// Calculates the ratio of supported files in a sample
    fn calculate_support_ratio(&self, sample_files: &[FileIngestionInfo]) -> f64 {
        if sample_files.is_empty() {
            return 1.0; // Assume all files are supported if no sample
        }

        let supported_count = sample_files
            .iter()
            .filter(|file| self.config.is_supported_extension(&file.name))
            .count();

        supported_count as f64 / sample_files.len() as f64
    }

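    // Example: a sample of 8 files of which 6 have supported extensions gives
    // a ratio of 6.0 / 8.0 = 0.75, while an empty sample returns 1.0,
    // matching the optimistic assumption noted in the code above.
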
    /// Filters files by last modified date (for incremental syncs)
    pub fn filter_files_by_date(&self, files: Vec<FileIngestionInfo>, since: chrono::DateTime<chrono::Utc>) -> Vec<FileIngestionInfo> {
        files
            .into_iter()
            .filter(|file| {
                file.last_modified
                    .map(|modified| modified > since)
                    .unwrap_or(true) // Include files without modification date
            })
            .collect()
    }

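    // Incremental-sync sketch (hypothetical caller; `last_sync` would come
    // from persisted sync state):
    //
    //     let last_sync = chrono::Utc::now() - chrono::Duration::hours(24);
    //     let changed = discovery.filter_files_by_date(all_files, last_sync);
    //
    // Files with no `last_modified` value are deliberately kept, trading a
    // possible re-download for never missing a changed file.
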
    /// Deduplicates files by ETag or, when the ETag is empty, by relative path
    pub fn deduplicate_files(&self, files: Vec<FileIngestionInfo>) -> Vec<FileIngestionInfo> {
        let mut seen_etags = HashSet::new();
        let mut seen_paths = HashSet::new();
        let mut deduplicated = Vec::new();

        for file in files {
            let is_duplicate = if !file.etag.is_empty() {
                !seen_etags.insert(file.etag.clone())
            } else {
                !seen_paths.insert(file.relative_path.clone())
            };

            if !is_duplicate {
                deduplicated.push(file);
            }
        }

        debug!("{} files remain after deduplication", deduplicated.len());
        deduplicated
    }
}
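
// Deduplication behavior, illustrated with made-up values: two entries that
// share the same non-empty ETag "abc123" collapse to one even if their paths
// differ (e.g. the same resource reached via two watch folders), whereas
// entries with empty ETags fall back to `relative_path` comparison, so
// distinct paths are both kept.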