Readur/src/services/webdav/discovery.rs

use anyhow::Result;
use reqwest::Method;
use std::collections::HashSet;
use tokio::sync::Semaphore;
use futures_util::stream::{self, StreamExt};
use tracing::{debug, info, warn};
use crate::models::{FileIngestionInfo, WebDAVCrawlEstimate, WebDAVFolderInfo};
use crate::webdav_xml_parser::{parse_propfind_response, parse_propfind_response_with_directories};
use super::config::{WebDAVConfig, ConcurrencyConfig};
use super::connection::WebDAVConnection;
use super::url_management::WebDAVUrlManager;

/// Results from WebDAV discovery including both files and directories
#[derive(Debug, Clone)]
pub struct WebDAVDiscoveryResult {
    pub files: Vec<FileIngestionInfo>,
    pub directories: Vec<FileIngestionInfo>,
}

pub struct WebDAVDiscovery {
    connection: WebDAVConnection,
    config: WebDAVConfig,
    concurrency_config: ConcurrencyConfig,
    url_manager: WebDAVUrlManager,
}

impl WebDAVDiscovery {
    pub fn new(
        connection: WebDAVConnection,
        config: WebDAVConfig,
        concurrency_config: ConcurrencyConfig,
    ) -> Self {
        let url_manager = WebDAVUrlManager::new(config.clone());
        Self {
            connection,
            config,
            concurrency_config,
            url_manager,
        }
    }

    /// Discovers files in a directory, filtered to supported extensions,
    /// optionally recursing into subdirectories
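    ///
    /// Illustrative usage sketch (hypothetical caller; assumes a constructed
    /// `WebDAVDiscovery` named `discovery`):
    /// ```ignore
    /// let files = discovery.discover_files("/Documents", true).await?;
    /// for file in &files {
    ///     println!("{} ({} bytes)", file.relative_path, file.size);
    /// }
    /// ```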
    pub async fn discover_files(&self, directory_path: &str, recursive: bool) -> Result<Vec<FileIngestionInfo>> {
        info!("🔍 Discovering files in directory: {}", directory_path);

        if recursive {
            self.discover_files_recursive(directory_path).await
        } else {
            self.discover_files_single_directory(directory_path).await
        }
    }

    /// Discovers both files and directories with their ETags for directory tracking
    pub async fn discover_files_and_directories(&self, directory_path: &str, recursive: bool) -> Result<WebDAVDiscoveryResult> {
        info!("🔍 Discovering files and directories in: {}", directory_path);

        if recursive {
            self.discover_files_and_directories_recursive(directory_path).await
        } else {
            self.discover_files_and_directories_single(directory_path).await
        }
    }

    /// Discovers files in a single directory (non-recursive)
    async fn discover_files_single_directory(&self, directory_path: &str) -> Result<Vec<FileIngestionInfo>> {
        let url = self.connection.get_url_for_path(directory_path);

        let propfind_body = r#"<?xml version="1.0" encoding="utf-8"?>
            <D:propfind xmlns:D="DAV:">
                <D:prop>
                    <D:displayname/>
                    <D:getcontentlength/>
                    <D:getlastmodified/>
                    <D:getetag/>
                    <D:resourcetype/>
                    <D:creationdate/>
                </D:prop>
            </D:propfind>"#;
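        // NOTE: the "Depth: 1" header below asks the server for the target
        // collection plus its immediate children only (RFC 4918); deeper
        // traversal is driven client-side by the recursive discovery methods.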
        let response = self.connection
            .authenticated_request(
                Method::from_bytes(b"PROPFIND")?,
                &url,
                Some(propfind_body.to_string()),
                Some(vec![
                    ("Depth", "1"),
                    ("Content-Type", "application/xml"),
                ]),
            )
            .await?;

        let body = response.text().await?;
        let files = parse_propfind_response(&body)?;

        // Process file paths using the centralized URL manager
        let files = self.url_manager.process_file_infos(files);

        // Filter files based on supported extensions
        let filtered_files: Vec<FileIngestionInfo> = files
            .into_iter()
            .filter(|file| !file.is_directory && self.config.is_supported_extension(&file.name))
            .collect();

        debug!("Found {} supported files in directory: {}", filtered_files.len(), directory_path);
        Ok(filtered_files)
    }

    /// Discovers both files and directories in a single directory (non-recursive)
    async fn discover_files_and_directories_single(&self, directory_path: &str) -> Result<WebDAVDiscoveryResult> {
        let url = self.connection.get_url_for_path(directory_path);

        let propfind_body = r#"<?xml version="1.0" encoding="utf-8"?>
            <D:propfind xmlns:D="DAV:">
                <D:prop>
                    <D:displayname/>
                    <D:getcontentlength/>
                    <D:getlastmodified/>
                    <D:getetag/>
                    <D:resourcetype/>
                    <D:creationdate/>
                </D:prop>
            </D:propfind>"#;

        let response = self.connection
            .authenticated_request(
                Method::from_bytes(b"PROPFIND")?,
                &url,
                Some(propfind_body.to_string()),
                Some(vec![
                    ("Depth", "1"),
                    ("Content-Type", "application/xml"),
                ]),
            )
            .await?;

        let body = response.text().await?;
        let all_items = parse_propfind_response_with_directories(&body)?;

        // Process file paths using the centralized URL manager
        let all_items = self.url_manager.process_file_infos(all_items);

        // Separate files and directories
        let mut files = Vec::new();
        let mut directories = Vec::new();
        for item in all_items {
            if item.is_directory {
                directories.push(item);
            } else if self.config.is_supported_extension(&item.name) {
                files.push(item);
            }
        }

        debug!("Single directory '{}': {} files, {} directories",
               directory_path, files.len(), directories.len());

        Ok(WebDAVDiscoveryResult { files, directories })
    }

    /// Discovers files recursively in directory tree
    async fn discover_files_recursive(&self, root_directory: &str) -> Result<Vec<FileIngestionInfo>> {
        let mut all_files = Vec::new();
        let mut directories_to_scan = vec![root_directory.to_string()];
        let semaphore = Semaphore::new(self.concurrency_config.max_concurrent_scans);
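        // The semaphore caps concurrent PROPFIND requests; `buffer_unordered`
        // below applies the same bound, so the permit acts as a second guard.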
        while !directories_to_scan.is_empty() {
            // Drain only this batch; `drain(..).take(n)` would silently drop
            // the un-taken remainder of the queue when the `Drain` guard is
            // dropped, losing discovered subdirectories.
            let batch_size = directories_to_scan
                .len()
                .min(self.concurrency_config.max_concurrent_scans);
            let current_batch: Vec<String> = directories_to_scan.drain(..batch_size).collect();

            let tasks = current_batch.into_iter().map(|dir| {
                let semaphore = &semaphore;
                async move {
                    let _permit = semaphore.acquire().await.unwrap();
                    self.scan_directory_with_subdirs(&dir).await
                }
            });

            let results = stream::iter(tasks)
                .buffer_unordered(self.concurrency_config.max_concurrent_scans)
                .collect::<Vec<_>>()
                .await;

            for result in results {
                match result {
                    Ok((files, subdirs)) => {
                        all_files.extend(files);
                        directories_to_scan.extend(subdirs);
                    }
                    Err(e) => {
                        warn!("Failed to scan directory: {}", e);
                    }
                }
            }
        }

        info!("Recursive discovery found {} total files", all_files.len());
        Ok(all_files)
    }

    /// Discovers both files and directories recursively in directory tree
    async fn discover_files_and_directories_recursive(&self, root_directory: &str) -> Result<WebDAVDiscoveryResult> {
        let mut all_files = Vec::new();
        let mut all_directories = Vec::new();
        let mut directories_to_scan = vec![root_directory.to_string()];
        let semaphore = Semaphore::new(self.concurrency_config.max_concurrent_scans);
        while !directories_to_scan.is_empty() {
            // As above, drain only this batch; `drain(..).take(n)` would
            // silently drop the un-taken remainder of the queue.
            let batch_size = directories_to_scan
                .len()
                .min(self.concurrency_config.max_concurrent_scans);
            let current_batch: Vec<String> = directories_to_scan.drain(..batch_size).collect();

            let tasks = current_batch.into_iter().map(|dir| {
                let semaphore = &semaphore;
                async move {
                    let _permit = semaphore.acquire().await.unwrap();
                    self.scan_directory_with_all_info(&dir).await
                }
            });

            let results = stream::iter(tasks)
                .buffer_unordered(self.concurrency_config.max_concurrent_scans)
                .collect::<Vec<_>>()
                .await;

            for result in results {
                match result {
                    Ok((files, directories, subdirs_to_scan)) => {
                        all_files.extend(files);
                        all_directories.extend(directories);
                        directories_to_scan.extend(subdirs_to_scan);
                    }
                    Err(e) => {
                        warn!("Failed to scan directory: {}", e);
                    }
                }
            }
        }

        info!("Recursive discovery found {} total files and {} directories",
              all_files.len(), all_directories.len());

        Ok(WebDAVDiscoveryResult {
            files: all_files,
            directories: all_directories,
        })
    }

    /// Scans a directory and returns both files and subdirectories
    async fn scan_directory_with_subdirs(&self, directory_path: &str) -> Result<(Vec<FileIngestionInfo>, Vec<String>)> {
        let url = self.connection.get_url_for_path(directory_path);

        let propfind_body = r#"<?xml version="1.0" encoding="utf-8"?>
            <D:propfind xmlns:D="DAV:">
                <D:prop>
                    <D:displayname/>
                    <D:getcontentlength/>
                    <D:getlastmodified/>
                    <D:getetag/>
                    <D:resourcetype/>
                    <D:creationdate/>
                </D:prop>
            </D:propfind>"#;

        let response = self.connection
            .authenticated_request(
                Method::from_bytes(b"PROPFIND")?,
                &url,
                Some(propfind_body.to_string()),
                Some(vec![
                    ("Depth", "1"),
                    ("Content-Type", "application/xml"),
                ]),
            )
            .await?;

        let body = response.text().await?;
        let all_items = parse_propfind_response_with_directories(&body)?;

        // Process file paths using the centralized URL manager
        let all_items = self.url_manager.process_file_infos(all_items);

        // Separate files and directories
        let mut filtered_files = Vec::new();
        let mut subdirectory_paths = Vec::new();
        for item in all_items {
            if item.is_directory {
                // Use the relative_path, which is set by the URL manager
                subdirectory_paths.push(item.relative_path);
            } else if self.config.is_supported_extension(&item.name) {
                filtered_files.push(item);
            }
        }

        debug!("Directory '{}': {} files, {} subdirectories",
               directory_path, filtered_files.len(), subdirectory_paths.len());

        Ok((filtered_files, subdirectory_paths))
    }

    /// Scans a directory and returns files, directories, and subdirectory paths for the scan queue
    async fn scan_directory_with_all_info(&self, directory_path: &str) -> Result<(Vec<FileIngestionInfo>, Vec<FileIngestionInfo>, Vec<String>)> {
        let url = self.connection.get_url_for_path(directory_path);

        let propfind_body = r#"<?xml version="1.0" encoding="utf-8"?>
            <D:propfind xmlns:D="DAV:">
                <D:prop>
                    <D:displayname/>
                    <D:getcontentlength/>
                    <D:getlastmodified/>
                    <D:getetag/>
                    <D:resourcetype/>
                    <D:creationdate/>
                </D:prop>
            </D:propfind>"#;

        let response = self.connection
            .authenticated_request(
                Method::from_bytes(b"PROPFIND")?,
                &url,
                Some(propfind_body.to_string()),
                Some(vec![
                    ("Depth", "1"),
                    ("Content-Type", "application/xml"),
                ]),
            )
            .await?;

        let body = response.text().await?;
        let all_items = parse_propfind_response_with_directories(&body)?;

        // Process file paths using the centralized URL manager
        let all_items = self.url_manager.process_file_infos(all_items);

        // Separate files and directories
        let mut filtered_files = Vec::new();
        let mut directories = Vec::new();
        let mut subdirectory_paths = Vec::new();
        for item in all_items {
            if item.is_directory {
                // Queue the relative_path (set by the URL manager) for
                // scanning, then keep the full directory info
                subdirectory_paths.push(item.relative_path.clone());
                directories.push(item);
            } else if self.config.is_supported_extension(&item.name) {
                filtered_files.push(item);
            }
        }

        debug!("Directory '{}': {} files, {} directories, {} paths to scan",
               directory_path, filtered_files.len(), directories.len(), subdirectory_paths.len());

        Ok((filtered_files, directories, subdirectory_paths))
    }

    /// Estimates crawl time and file counts for watch folders
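    ///
    /// Illustrative only (hypothetical caller):
    /// ```ignore
    /// let estimate = discovery.estimate_crawl().await?;
    /// println!("{} supported files, ~{:.1} h",
    ///          estimate.total_supported_files, estimate.total_estimated_time_hours);
    /// ```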
    pub async fn estimate_crawl(&self) -> Result<WebDAVCrawlEstimate> {
        info!("📊 Estimating crawl for WebDAV watch folders");

        let mut folders = Vec::new();
        let mut total_files = 0;
        let mut total_supported_files = 0;
        let mut total_size_mb = 0.0;

        for watch_folder in &self.config.watch_folders {
            match self.estimate_folder(watch_folder).await {
                Ok(folder_info) => {
                    total_files += folder_info.total_files;
                    total_supported_files += folder_info.supported_files;
                    total_size_mb += folder_info.total_size_mb;
                    folders.push(folder_info);
                }
                Err(e) => {
                    warn!("Failed to estimate folder '{}': {}", watch_folder, e);
                    // Add empty folder info for failed estimates
                    folders.push(WebDAVFolderInfo {
                        path: watch_folder.clone(),
                        total_files: 0,
                        supported_files: 0,
                        estimated_time_hours: 0.0,
                        total_size_mb: 0.0,
                    });
                }
            }
        }

        // Estimate total time based on file count and average processing time
        let avg_time_per_file_seconds = 2.0; // Conservative estimate
        let total_estimated_time_hours = (total_supported_files as f32 * avg_time_per_file_seconds) / 3600.0;
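        // e.g. 1,800 supported files * 2 s / 3,600 s/h = 1.0 hour.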

        Ok(WebDAVCrawlEstimate {
            folders,
            total_files,
            total_supported_files,
            total_estimated_time_hours,
            total_size_mb,
        })
    }

    /// Estimates file count and size for a specific folder
    async fn estimate_folder(&self, folder_path: &str) -> Result<WebDAVFolderInfo> {
        debug!("Estimating folder: {}", folder_path);

        // List the folder's own files first
        let sample_files = self.discover_files_single_directory(folder_path).await?;

        // Get subdirectories for deeper estimation
        let subdirs = self.get_subdirectories(folder_path).await?;

        let mut total_files = sample_files.len() as i64;
        let total_size: i64 = sample_files.iter().map(|f| f.size).sum();

        // Sample a few subdirectories to extrapolate the file count. Note that
        // this looks only one level deep and does not extrapolate sizes, so
        // total_size_mb reflects just the top-level files.
        let sample_size = std::cmp::min(5, subdirs.len());
        if sample_size > 0 {
            let mut sample_total = 0i64;
            for subdir in subdirs.iter().take(sample_size) {
                if let Ok(subdir_files) = self.discover_files_single_directory(subdir).await {
                    sample_total += subdir_files.len() as i64;
                }
            }

            // Extrapolate based on sample
            if sample_total > 0 {
                let avg_files_per_subdir = sample_total as f64 / sample_size as f64;
                total_files += (avg_files_per_subdir * subdirs.len() as f64) as i64;
            }
        }

        // Apply the sampled supported-extension ratio to the extrapolated total
        let supported_files = (total_files as f64 * self.calculate_support_ratio(&sample_files)) as i64;
        let total_size_mb = total_size as f64 / (1024.0 * 1024.0);
        let estimated_time_hours = (supported_files as f32 * 2.0) / 3600.0; // 2 seconds per file

        Ok(WebDAVFolderInfo {
            path: folder_path.to_string(),
            total_files,
            supported_files,
            estimated_time_hours,
            total_size_mb,
        })
    }

    /// Gets subdirectories for a given path
    async fn get_subdirectories(&self, directory_path: &str) -> Result<Vec<String>> {
        let url = self.connection.get_url_for_path(directory_path);

        let propfind_body = r#"<?xml version="1.0" encoding="utf-8"?>
            <D:propfind xmlns:D="DAV:">
                <D:prop>
                    <D:resourcetype/>
                </D:prop>
            </D:propfind>"#;

        let response = self.connection
            .authenticated_request(
                Method::from_bytes(b"PROPFIND")?,
                &url,
                Some(propfind_body.to_string()),
                Some(vec![
                    ("Depth", "1"),
                    ("Content-Type", "application/xml"),
                ]),
            )
            .await?;

        let body = response.text().await?;
        let all_items = parse_propfind_response_with_directories(&body)?;

        // Process file paths using the centralized URL manager
        let all_items = self.url_manager.process_file_infos(all_items);

        // Keep only directories and extract their paths
        let directory_paths: Vec<String> = all_items
            .into_iter()
            .filter(|item| item.is_directory)
            .map(|item| item.relative_path)
            .collect();

        Ok(directory_paths)
    }

    /// Calculates the ratio of supported files in a sample
    fn calculate_support_ratio(&self, sample_files: &[FileIngestionInfo]) -> f64 {
        if sample_files.is_empty() {
            return 1.0; // Assume all files are supported if no sample
        }
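        // e.g. 3 supported files in a 4-file sample yields a ratio of 0.75.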
        let supported_count = sample_files
            .iter()
            .filter(|file| self.config.is_supported_extension(&file.name))
            .count();

        supported_count as f64 / sample_files.len() as f64
    }

    /// Filters files by last modified date (for incremental syncs)
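    ///
    /// Hypothetical incremental-sync sketch:
    /// ```ignore
    /// let cutoff = chrono::Utc::now() - chrono::Duration::days(1);
    /// let recent = discovery.filter_files_by_date(all_files, cutoff);
    /// ```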
    pub fn filter_files_by_date(&self, files: Vec<FileIngestionInfo>, since: chrono::DateTime<chrono::Utc>) -> Vec<FileIngestionInfo> {
        files
            .into_iter()
            .filter(|file| {
                file.last_modified
                    .map(|modified| modified > since)
                    .unwrap_or(true) // Include files without modification date
            })
            .collect()
    }

    /// Deduplicates files by ETag or path
    pub fn deduplicate_files(&self, files: Vec<FileIngestionInfo>) -> Vec<FileIngestionInfo> {
        let mut seen_etags = HashSet::new();
        let mut seen_paths = HashSet::new();
        let mut deduplicated = Vec::new();

        for file in files {
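            // HashSet::insert returns false when the value was already
            // present, so a false result here marks the file as a duplicate.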
            let is_duplicate = if !file.etag.is_empty() {
                !seen_etags.insert(file.etag.clone())
            } else {
                !seen_paths.insert(file.relative_path.clone())
            };

            if !is_duplicate {
                deduplicated.push(file);
            }
        }

        debug!("{} files remaining after deduplication", deduplicated.len());
        deduplicated
    }
}