feat(server): do a *much* better job at determining file types thanks to infer rust package

This commit is contained in:
perf3ct 2025-07-29 21:28:33 +00:00
parent aff7b907c7
commit d7a0a1f294
15 changed files with 3203 additions and 67 deletions

2
.gitignore vendored
View File

@ -4,7 +4,7 @@ node_modules/
.env
assets/
frontend/dist/
.claude/
# This file is used to store the local Claude settings.
.claude/settings.local.json
readur_uploads/
readur_watch/
test-results/

25
Cargo.lock generated
View File

@ -1009,6 +1009,17 @@ dependencies = [
"nom",
]
[[package]]
name = "cfb"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d38f2da7a0a2c4ccf0065be06397cc26a81f4e528be095826eee9d4adbb8c60f"
dependencies = [
"byteorder",
"fnv",
"uuid",
]
[[package]]
name = "cfg-expr"
version = "0.15.8"
@ -2410,6 +2421,15 @@ dependencies = [
"serde",
]
[[package]]
name = "infer"
version = "0.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb33622da908807a06f9513c19b3c1ad50fab3e4137d82a78107d502075aa199"
dependencies = [
"cfb",
]
[[package]]
name = "inotify"
version = "0.11.0"
@ -2625,7 +2645,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667"
dependencies = [
"cfg-if",
"windows-targets 0.48.5",
"windows-targets 0.53.2",
]
[[package]]
@ -3610,6 +3630,7 @@ dependencies = [
"hostname",
"image",
"imageproc",
"infer",
"jsonwebtoken",
"mime_guess",
"notify",
@ -5674,7 +5695,7 @@ version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
dependencies = [
"windows-sys 0.48.0",
"windows-sys 0.59.0",
]
[[package]]

View File

@ -11,6 +11,7 @@ path = "src/main.rs"
name = "test_runner"
path = "src/bin/test_runner.rs"
[dependencies]
tokio = { version = "1", features = ["full"] }
axum = { version = "0.8", features = ["multipart"] }
@ -33,6 +34,7 @@ futures-util = "0.3"
futures = "0.3"
notify = "8"
mime_guess = "2"
infer = "0.15"
tesseract = { version = "0.15", optional = true }
image = { version = "0.25", features = ["png", "jpeg", "tiff", "bmp"], optional = true }
imageproc = { version = "0.25", optional = true }

View File

@ -1,12 +1,12 @@
{
"name": "readur-frontend",
"version": "2.4.2",
"version": "2.5.3",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "readur-frontend",
"version": "2.4.2",
"version": "2.5.3",
"dependencies": {
"@emotion/react": "^11.14.0",
"@emotion/styled": "^11.14.0",

View File

@ -5,6 +5,7 @@ pub mod db_guardrails_simple;
pub mod errors;
pub mod ingestion;
pub mod metadata_extraction;
pub mod mime_detection;
pub mod models;
pub mod monitoring;
pub mod ocr;

431
src/mime_detection.rs Normal file
View File

@ -0,0 +1,431 @@
/// MIME type detection module for improved file type identification
///
/// This module provides functions for detecting file MIME types using multiple methods:
/// 1. Content-based detection using magic bytes (most reliable)
/// 2. Server-provided MIME type (when available and trusted)
/// 3. Extension-based fallback (least reliable, but covers edge cases)
///
/// The goal is to provide accurate MIME type detection that's particularly important
/// for OCR processing where incorrectly classified image files can cause issues.
use std::path::Path;
use tracing::{debug, warn};
/// Strategy for MIME type detection
///
/// Chooses how much to trust each available signal (file content,
/// server-provided type, filename extension) when classifying a file.
/// Fieldless, so it is cheap to copy and compare.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DetectionStrategy {
    /// Use content-based detection (magic bytes) - most reliable
    ContentBased,
    /// Trust server-provided MIME type if available, fallback to content
    TrustServer,
    /// Use extension-based detection - least reliable but fastest
    ExtensionOnly,
    /// Comprehensive strategy: server -> content -> extension -> fallback
    Comprehensive,
}
/// Result of MIME type detection with metadata about the detection method used
#[derive(Debug, Clone)]
pub struct MimeDetectionResult {
    /// The final MIME type chosen for the file (e.g. "application/pdf").
    pub mime_type: String,
    /// How reliable `mime_type` is, based on the detection path taken.
    pub confidence: MimeConfidence,
    /// Which detection path produced `mime_type`.
    pub detection_method: DetectionMethod,
    /// MIME type originally reported by the server, when one was provided.
    pub original_server_type: Option<String>,
    /// File extension used for extension-based detection, when applicable.
    pub detected_extension: Option<String>,
}
/// Confidence level of the MIME type detection
///
/// Variants are declared from least to most reliable; the derived `Ord`
/// uses declaration order, so confidences can be compared with `<` / `>`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum MimeConfidence {
    /// Low confidence - extension-based or fallback detection
    Low,
    /// Medium confidence - mime_guess library detection
    Medium,
    /// High confidence - magic byte detection or trusted server
    High,
    /// Very high confidence - content analysis confirms server type
    VeryHigh,
}
/// Method used for MIME type detection
///
/// Recorded on every [`MimeDetectionResult`] so callers can audit how a
/// file's type was determined.
#[derive(Debug, Clone, PartialEq)]
pub enum DetectionMethod {
    /// Detected using magic bytes/file signature
    MagicBytes,
    /// Provided by the server and trusted
    ServerProvided,
    /// Detected using file extension
    Extension,
    /// Fallback to default type
    Fallback,
    /// Hybrid approach using multiple methods
    Hybrid,
}
impl MimeDetectionResult {
/// Create a result for server-provided MIME type
pub fn from_server(mime_type: String) -> Self {
Self {
mime_type,
confidence: MimeConfidence::High,
detection_method: DetectionMethod::ServerProvided,
original_server_type: None,
detected_extension: None,
}
}
/// Create a result for content-based detection
pub fn from_content(mime_type: String, server_type: Option<String>) -> Self {
Self {
mime_type,
confidence: MimeConfidence::High,
detection_method: DetectionMethod::MagicBytes,
original_server_type: server_type,
detected_extension: None,
}
}
/// Create a result for extension-based detection
pub fn from_extension(mime_type: String, extension: String) -> Self {
Self {
mime_type,
confidence: MimeConfidence::Medium,
detection_method: DetectionMethod::Extension,
original_server_type: None,
detected_extension: Some(extension),
}
}
/// Create a fallback result
pub fn fallback() -> Self {
Self {
mime_type: "application/octet-stream".to_string(),
confidence: MimeConfidence::Low,
detection_method: DetectionMethod::Fallback,
original_server_type: None,
detected_extension: None,
}
}
/// Check if the detected MIME type indicates an image file
pub fn is_image(&self) -> bool {
self.mime_type.starts_with("image/")
}
/// Check if the detected MIME type indicates a document file
pub fn is_document(&self) -> bool {
matches!(self.mime_type.as_str(),
"application/pdf" |
"application/msword" |
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" |
"application/vnd.ms-excel" |
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" |
"application/vnd.ms-powerpoint" |
"application/vnd.openxmlformats-officedocument.presentationml.presentation" |
"text/plain" |
"text/rtf" |
"application/rtf"
)
}
/// Check if this MIME type is suitable for OCR processing
pub fn is_ocr_suitable(&self) -> bool {
self.is_image() || self.mime_type == "application/pdf"
}
}
/// Detect MIME type for WebDAV discovery phase (when we only have file metadata)
///
/// This function is called during the initial WebDAV XML parsing when we don't
/// have access to the actual file content yet.
///
/// # Arguments
/// * `filename` - The filename/path of the file
/// * `server_mime_type` - MIME type provided by the WebDAV server, if any
/// * `strategy` - Detection strategy to use
///
/// # Returns
/// A `MimeDetectionResult` with the best available MIME type determination
pub fn detect_mime_for_discovery(
    filename: &str,
    server_mime_type: Option<&str>,
    strategy: DetectionStrategy,
) -> MimeDetectionResult {
    debug!("Detecting MIME type for discovery: filename={}, server_type={:?}, strategy={:?}",
        filename, server_mime_type, strategy);

    match strategy {
        // During discovery we have no file content, so the content-based
        // strategy reduces to extension detection, same as ExtensionOnly.
        DetectionStrategy::ContentBased | DetectionStrategy::ExtensionOnly => {
            detect_from_extension(filename, server_mime_type)
        }
        // TrustServer and Comprehensive are identical at this phase: honor a
        // trusted server-provided type, otherwise fall back to the extension.
        // (Previously three arms carried duplicated copies of this logic.)
        DetectionStrategy::TrustServer | DetectionStrategy::Comprehensive => {
            match server_mime_type {
                Some(server_type) if is_trusted_server_mime_type(server_type) => {
                    MimeDetectionResult::from_server(server_type.to_string())
                }
                _ => detect_from_extension(filename, server_mime_type),
            }
        }
    }
}
/// Detect MIME type when file content is available (during file download/processing)
///
/// This provides the most accurate detection using magic bytes from the actual file content.
///
/// # Arguments
/// * `content` - The first few bytes of the file content (at least 512 bytes recommended)
/// * `filename` - The filename for fallback detection
/// * `server_mime_type` - MIME type provided by the server, if any
///
/// # Returns
/// A `MimeDetectionResult` with high-confidence MIME type detection
pub fn detect_mime_from_content(
    content: &[u8],
    filename: &str,
    server_mime_type: Option<&str>,
) -> MimeDetectionResult {
    debug!("Detecting MIME type from content: filename={}, server_type={:?}, content_len={}",
        filename, server_mime_type, content.len());

    // Magic-byte inspection is the strongest signal when it succeeds.
    match infer::get(content) {
        Some(kind) => {
            let detected = kind.mime_type().to_string();
            debug!("Magic bytes detected MIME type: {}", detected);

            match server_mime_type {
                // Server and content agree: upgrade to the highest confidence.
                Some(server) if are_mime_types_compatible(&detected, server) => {
                    let mut result =
                        MimeDetectionResult::from_content(detected, Some(server.to_string()));
                    result.confidence = MimeConfidence::VeryHigh;
                    result.detection_method = DetectionMethod::Hybrid;
                    result
                }
                // Disagreement: the bytes win, but the server's claim is kept
                // on the result for diagnostics.
                Some(server) => {
                    warn!("MIME type mismatch: server={}, content={} for file {}",
                        server, detected, filename);
                    MimeDetectionResult::from_content(detected, Some(server.to_string()))
                }
                None => MimeDetectionResult::from_content(detected, None),
            }
        }
        None => {
            // No magic-byte match: prefer a trusted server type next.
            if let Some(server) = server_mime_type {
                if is_trusted_server_mime_type(server) {
                    debug!("Using trusted server MIME type: {}", server);
                    return MimeDetectionResult::from_server(server.to_string());
                }
            }
            // Last resort: classify by filename extension.
            debug!("Content detection failed, falling back to extension detection");
            detect_from_extension(filename, server_mime_type)
        }
    }
}
/// Update an existing MIME type with content-based detection if available
///
/// This function is useful for re-detecting MIME types when file content becomes
/// available after initial discovery.
///
/// # Arguments
/// * `current_mime_type` - The currently assigned MIME type
/// * `content` - File content for analysis
/// * `filename` - Filename for context
///
/// # Returns
/// A new `MimeDetectionResult` if detection improves confidence, or None if no change needed
pub fn update_mime_type_with_content(
    current_mime_type: &str,
    content: &[u8],
    filename: &str,
) -> Option<MimeDetectionResult> {
    let candidate = detect_mime_from_content(content, filename, Some(current_mime_type));

    // Report the new result only when it either raises confidence beyond
    // the extension-based level or changes the type outright.
    let more_confident = candidate.confidence > MimeConfidence::Medium;
    let type_changed = candidate.mime_type != current_mime_type;
    (more_confident || type_changed).then_some(candidate)
}
/// Detect MIME type from file extension using mime_guess library
fn detect_from_extension(filename: &str, server_mime_type: Option<&str>) -> MimeDetectionResult {
let path = Path::new(filename);
if let Some(mime_type) = mime_guess::from_path(path).first() {
let mime_str = mime_type.to_string();
debug!("Extension-based detection: {} -> {}", filename, mime_str);
let mut result = MimeDetectionResult::from_extension(
mime_str,
path.extension()
.and_then(|ext| ext.to_str())
.unwrap_or("")
.to_string()
);
result.original_server_type = server_mime_type.map(|s| s.to_string());
result
} else {
debug!("Extension-based detection failed for: {}", filename);
let mut result = MimeDetectionResult::fallback();
result.original_server_type = server_mime_type.map(|s| s.to_string());
result
}
}
/// Check if a server-provided MIME type should be trusted
///
/// Some servers return generic types like "application/octet-stream" which
/// aren't useful, while others provide accurate information.
///
/// MIME types are case-insensitive (RFC 2045) and may carry parameters
/// such as "; charset=binary", so the value is normalized before the
/// untrusted-type check — otherwise "Application/Octet-Stream" would
/// slip through and be treated as trustworthy.
fn is_trusted_server_mime_type(mime_type: &str) -> bool {
    // Reduce "Type/Subtype; param=value" to a lowercase "type/subtype".
    let essence = mime_type
        .split(';')
        .next()
        .unwrap_or("")
        .trim()
        .to_ascii_lowercase();
    !matches!(
        essence.as_str(),
        "application/octet-stream" | "application/binary" | "binary/octet-stream" | "" | "unknown"
    )
}
/// Check if two MIME types are compatible/equivalent
///
/// Some servers might return slightly different but equivalent MIME types
/// (e.g., "image/jpg" vs "image/jpeg")
fn are_mime_types_compatible(type1: &str, type2: &str) -> bool {
    // Identical strings are trivially compatible.
    if type1 == type2 {
        return true;
    }

    // Known alternative spellings of the same concrete type.
    const ALIASES: [(&str, &str); 3] = [
        ("image/jpeg", "image/jpg"),
        ("image/tiff", "image/tif"),
        ("text/plain", "text/txt"),
    ];
    if ALIASES
        .iter()
        .any(|&(a, b)| (type1 == a && type2 == b) || (type1 == b && type2 == a))
    {
        return true;
    }

    // Otherwise two well-formed types sharing the same primary type
    // (e.g. both "image/*") are considered compatible.
    let parts1: Vec<&str> = type1.split('/').collect();
    let parts2: Vec<&str> = type2.split('/').collect();
    parts1.len() == 2 && parts2.len() == 2 && parts1[0] == parts2[0]
}
/// Legacy function for backward compatibility
///
/// This maintains the same interface as the original `get_mime_type_from_extension`
/// function but uses the new detection system.
pub fn get_mime_type_from_extension(extension: &str) -> String {
    // Build a synthetic filename so the path-based detector can be reused.
    detect_from_extension(&format!("file.{}", extension), None).mime_type
}
#[cfg(test)]
mod tests {
    use super::*;

    // Discovery with ExtensionOnly should classify purely by filename.
    #[test]
    fn test_mime_detection_from_extension() {
        let result = detect_mime_for_discovery(
            "test.pdf",
            None,
            DetectionStrategy::ExtensionOnly
        );
        assert_eq!(result.mime_type, "application/pdf");
        assert_eq!(result.detection_method, DetectionMethod::Extension);
    }

    // TrustServer honors a usable server type but rejects generic ones.
    #[test]
    fn test_server_type_trust() {
        // Trusted server type
        let result = detect_mime_for_discovery(
            "test.pdf",
            Some("application/pdf"),
            DetectionStrategy::TrustServer
        );
        assert_eq!(result.mime_type, "application/pdf");
        assert_eq!(result.detection_method, DetectionMethod::ServerProvided);

        // Untrusted server type should fall back
        let result = detect_mime_for_discovery(
            "test.pdf",
            Some("application/octet-stream"),
            DetectionStrategy::TrustServer
        );
        assert_eq!(result.mime_type, "application/pdf");
        assert_eq!(result.detection_method, DetectionMethod::Extension);
    }

    #[test]
    fn test_mime_type_compatibility() {
        assert!(are_mime_types_compatible("image/jpeg", "image/jpg"));
        assert!(are_mime_types_compatible("image/jpg", "image/jpeg"));
        assert!(are_mime_types_compatible("text/plain", "text/plain"));
        assert!(!are_mime_types_compatible("image/jpeg", "text/plain"));
    }

    // Magic-byte detection via the `infer` crate on real file signatures.
    #[test]
    fn test_content_based_detection() {
        // PDF magic bytes
        let pdf_header = b"%PDF-1.4";
        let result = detect_mime_from_content(pdf_header, "test.pdf", None);
        assert_eq!(result.mime_type, "application/pdf");
        assert_eq!(result.detection_method, DetectionMethod::MagicBytes);
        assert_eq!(result.confidence, MimeConfidence::High);

        // JPEG magic bytes
        let jpeg_header = [0xFF, 0xD8, 0xFF];
        let result = detect_mime_from_content(&jpeg_header, "test.jpg", None);
        assert_eq!(result.mime_type, "image/jpeg");
    }

    // Agreement between content and server yields Hybrid/VeryHigh.
    #[test]
    fn test_hybrid_detection() {
        // Content and server agree
        let pdf_header = b"%PDF-1.4";
        let result = detect_mime_from_content(pdf_header, "test.pdf", Some("application/pdf"));
        assert_eq!(result.mime_type, "application/pdf");
        assert_eq!(result.detection_method, DetectionMethod::Hybrid);
        assert_eq!(result.confidence, MimeConfidence::VeryHigh);
    }

    // The legacy extension helper must keep returning the same values.
    #[test]
    fn test_legacy_compatibility() {
        assert_eq!(get_mime_type_from_extension("pdf"), "application/pdf");
        assert_eq!(get_mime_type_from_extension("jpg"), "image/jpeg");
        assert_eq!(get_mime_type_from_extension("png"), "image/png");
    }

    // OCR suitability: images and PDFs yes, plain text no.
    #[test]
    fn test_ocr_suitability() {
        let pdf_result = MimeDetectionResult::from_content("application/pdf".to_string(), None);
        assert!(pdf_result.is_ocr_suitable());

        let image_result = MimeDetectionResult::from_content("image/jpeg".to_string(), None);
        assert!(image_result.is_ocr_suitable());

        let text_result = MimeDetectionResult::from_content("text/plain".to_string(), None);
        assert!(!text_result.is_ocr_suitable());
    }
}

View File

@ -570,27 +570,17 @@ impl SourceScheduler {
return Err(format!("WebDAV server_url is empty"));
}
// Check if URL starts with a valid scheme
if !server_url.starts_with("http://") && !server_url.starts_with("https://") {
return Err(format!(
"WebDAV server_url must start with 'http://' or 'https://'. \
Current value: '{}'. \
Examples of valid URLs: \
- https://cloud.example.com \
- http://192.168.1.100:8080 \
- https://nextcloud.mydomain.com:443",
server_url
));
}
// Normalize URL by adding protocol if missing (consistent with WebDAVConfig)
let normalized_url = crate::services::webdav::config::WebDAVConfig::normalize_server_url(server_url);
// Try to parse as URL to catch other issues
match reqwest::Url::parse(server_url) {
// Try to parse the normalized URL to catch other issues
match reqwest::Url::parse(&normalized_url) {
Ok(url) => {
if url.scheme() != "http" && url.scheme() != "https" {
return Err(format!(
"WebDAV server_url has invalid scheme '{}'. Only 'http' and 'https' are supported. \
Current URL: '{}'",
url.scheme(), server_url
url.scheme(), normalized_url
));
}
@ -599,23 +589,23 @@ impl SourceScheduler {
"WebDAV server_url is missing hostname. \
Current URL: '{}'. \
Example: https://cloud.example.com",
server_url
normalized_url
));
}
crate::debug_log!("SOURCE_SCHEDULER", "✅ WebDAV URL validation passed for source '{}': {}", source_name, server_url);
crate::debug_log!("SOURCE_SCHEDULER", "✅ WebDAV URL validation passed for source '{}': {} (normalized to: {})", source_name, server_url, normalized_url);
Ok(())
}
Err(e) => {
Err(format!(
"WebDAV server_url is not a valid URL: {}. \
Current value: '{}'. \
Current value: '{}' (normalized to: '{}'). \
The URL must be absolute and include the full domain. \
Examples: \
- https://cloud.example.com \
- http://192.168.1.100:8080/webdav \
- https://nextcloud.mydomain.com",
e, server_url
e, server_url, normalized_url
))
}
}

View File

@ -103,6 +103,32 @@ impl WebDAVConfig {
}
}
/// Normalizes a server URL by adding protocol if missing
/// Prefers HTTPS over HTTP for security reasons
pub fn normalize_server_url(url: &str) -> String {
let trimmed = url.trim();
// If protocol is already specified, return as-is
if trimmed.starts_with("http://") || trimmed.starts_with("https://") {
return trimmed.to_string();
}
// If no protocol specified, default to HTTPS for security
format!("https://{}", trimmed)
}
/// Generates alternative protocol URL for fallback attempts
/// If input has HTTPS, returns HTTP version and vice versa
pub fn get_alternative_protocol_url(url: &str) -> Option<String> {
if url.starts_with("https://") {
Some(url.replacen("https://", "http://", 1))
} else if url.starts_with("http://") {
Some(url.replacen("http://", "https://", 1))
} else {
None
}
}
/// Validates the configuration
pub fn validate(&self) -> anyhow::Result<()> {
if self.server_url.is_empty() {
@ -121,9 +147,22 @@ impl WebDAVConfig {
return Err(anyhow::anyhow!("At least one watch folder must be specified"));
}
// Validate URL format
if !self.server_url.starts_with("http://") && !self.server_url.starts_with("https://") {
return Err(anyhow::anyhow!("Server URL must start with http:// or https://"));
// Validate URL format - now accepts URLs without protocol
// Protocol detection and fallback will be handled during connection testing
let normalized_url = Self::normalize_server_url(&self.server_url);
// Basic URL validation - check if it looks like a valid domain/IP
let url_without_protocol = normalized_url
.trim_start_matches("https://")
.trim_start_matches("http://");
if url_without_protocol.is_empty() {
return Err(anyhow::anyhow!("Server URL must contain a valid domain or IP address"));
}
// Check for obviously invalid URLs
if url_without_protocol.contains("://") {
return Err(anyhow::anyhow!("Invalid URL format: contains multiple protocols"));
}
Ok(())
@ -131,8 +170,8 @@ impl WebDAVConfig {
/// Returns the base URL for WebDAV operations
pub fn webdav_url(&self) -> String {
// Normalize the server URL by removing trailing slashes
let normalized_url = self.server_url.trim_end_matches('/').to_string();
// Normalize the server URL by adding protocol if missing and removing trailing slashes
let normalized_url = Self::normalize_server_url(&self.server_url).trim_end_matches('/').to_string();
// Add WebDAV path based on server type
match self.server_type.as_deref() {
@ -160,7 +199,7 @@ impl WebDAVConfig {
/// Returns alternative WebDAV URLs to try if the primary one fails
/// This is used for fallback mechanisms when encountering 405 errors
pub fn webdav_fallback_urls(&self) -> Vec<String> {
let normalized_url = self.server_url.trim_end_matches('/').to_string();
let normalized_url = Self::normalize_server_url(&self.server_url).trim_end_matches('/').to_string();
let mut fallback_urls = Vec::new();
match self.server_type.as_deref() {

View File

@ -23,4 +23,6 @@ mod url_construction_tests;
#[cfg(test)]
mod subdirectory_edge_cases_tests;
#[cfg(test)]
mod protocol_detection_tests;
#[cfg(test)]
mod tests;

View File

@ -0,0 +1,233 @@
#[cfg(test)]
mod tests {
    //! Tests for smart protocol detection: config validation, URL
    //! normalization, and WebDAVService behavior when the server URL
    //! omits the protocol.
    use super::super::{WebDAVService, WebDAVConfig};

    /// Helper function to create test WebDAV config without protocol
    fn create_test_config_without_protocol() -> WebDAVConfig {
        WebDAVConfig {
            server_url: "nas.example.com".to_string(), // No protocol
            username: "testuser".to_string(),
            password: "testpass".to_string(),
            watch_folders: vec!["/Documents".to_string()],
            file_extensions: vec!["pdf".to_string(), "txt".to_string()],
            timeout_seconds: 30,
            server_type: Some("nextcloud".to_string()),
        }
    }

    /// Helper function to create test WebDAV config with HTTPS protocol
    fn create_test_config_with_https() -> WebDAVConfig {
        WebDAVConfig {
            server_url: "https://nas.example.com".to_string(),
            username: "testuser".to_string(),
            password: "testpass".to_string(),
            watch_folders: vec!["/Documents".to_string()],
            file_extensions: vec!["pdf".to_string(), "txt".to_string()],
            timeout_seconds: 30,
            server_type: Some("nextcloud".to_string()),
        }
    }

    /// Helper function to create test WebDAV config with HTTP protocol
    fn create_test_config_with_http() -> WebDAVConfig {
        WebDAVConfig {
            server_url: "http://nas.example.com".to_string(),
            username: "testuser".to_string(),
            password: "testpass".to_string(),
            watch_folders: vec!["/Documents".to_string()],
            file_extensions: vec!["pdf".to_string(), "txt".to_string()],
            timeout_seconds: 30,
            server_type: Some("nextcloud".to_string()),
        }
    }

    // Protocol-less URLs are now legal; validation must accept them.
    #[tokio::test]
    async fn test_config_validation_accepts_url_without_protocol() {
        let config = create_test_config_without_protocol();
        // Should not fail validation
        assert!(config.validate().is_ok());
    }

    #[tokio::test]
    async fn test_config_validation_accepts_url_with_https() {
        let config = create_test_config_with_https();
        // Should not fail validation
        assert!(config.validate().is_ok());
    }

    #[tokio::test]
    async fn test_config_validation_accepts_url_with_http() {
        let config = create_test_config_with_http();
        // Should not fail validation
        assert!(config.validate().is_ok());
    }

    #[tokio::test]
    async fn test_normalize_server_url_adds_https_by_default() {
        let normalized = WebDAVConfig::normalize_server_url("nas.example.com");
        assert_eq!(normalized, "https://nas.example.com");
    }

    #[tokio::test]
    async fn test_normalize_server_url_preserves_existing_protocol() {
        let https_url = WebDAVConfig::normalize_server_url("https://nas.example.com");
        assert_eq!(https_url, "https://nas.example.com");

        let http_url = WebDAVConfig::normalize_server_url("http://nas.example.com");
        assert_eq!(http_url, "http://nas.example.com");
    }

    #[tokio::test]
    async fn test_get_alternative_protocol_url() {
        // HTTPS to HTTP
        let alt_http = WebDAVConfig::get_alternative_protocol_url("https://nas.example.com");
        assert_eq!(alt_http, Some("http://nas.example.com".to_string()));

        // HTTP to HTTPS
        let alt_https = WebDAVConfig::get_alternative_protocol_url("http://nas.example.com");
        assert_eq!(alt_https, Some("https://nas.example.com".to_string()));

        // No protocol - should return None
        let no_protocol = WebDAVConfig::get_alternative_protocol_url("nas.example.com");
        assert_eq!(no_protocol, None);
    }

    // webdav_url() must build on the normalized (HTTPS-default) base URL.
    #[tokio::test]
    async fn test_webdav_url_uses_normalized_url() {
        let config = create_test_config_without_protocol();
        let webdav_url = config.webdav_url();

        // Should start with https:// (normalized)
        assert!(webdav_url.starts_with("https://"));
        assert_eq!(webdav_url, "https://nas.example.com/remote.php/dav/files/testuser");
    }

    #[tokio::test]
    async fn test_service_creation_with_protocol_detection() {
        let config = create_test_config_without_protocol();

        // Should be able to create service without errors
        let service = WebDAVService::new(config);
        assert!(service.is_ok());
    }

    #[tokio::test]
    async fn test_effective_server_url_defaults_to_normalized() {
        let config = create_test_config_without_protocol();
        let service = WebDAVService::new(config).unwrap();

        let effective_url = service.get_effective_server_url();
        assert_eq!(effective_url, "https://nas.example.com");
    }

    #[tokio::test]
    async fn test_effective_server_url_with_existing_protocol() {
        let config = create_test_config_with_http();
        let service = WebDAVService::new(config).unwrap();

        let effective_url = service.get_effective_server_url();
        assert_eq!(effective_url, "http://nas.example.com");
    }

    // Detection only runs on first connection, so a fresh service has no
    // cached working protocol yet.
    #[tokio::test]
    async fn test_working_protocol_initially_none() {
        let config = create_test_config_without_protocol();
        let service = WebDAVService::new(config).unwrap();

        // Initially, no working protocol should be detected
        assert!(service.get_working_protocol().is_none());
    }

    // Connection-level failures trigger HTTP fallback; auth failures must not.
    #[tokio::test]
    async fn test_is_connection_error_detection() {
        let config = create_test_config_without_protocol();
        let service = WebDAVService::new(config).unwrap();

        // Test various connection error patterns
        let connection_errors = vec![
            anyhow::anyhow!("connection refused"),
            anyhow::anyhow!("timeout occurred"),
            anyhow::anyhow!("DNS resolution failed"),
            anyhow::anyhow!("TLS handshake failed"),
            anyhow::anyhow!("SSL certificate error"),
        ];

        for error in connection_errors {
            assert!(service.is_connection_error(&error), "Should detect '{}' as connection error", error);
        }

        // Test non-connection errors
        let non_connection_errors = vec![
            anyhow::anyhow!("401 Unauthorized"),
            anyhow::anyhow!("403 Forbidden"),
            anyhow::anyhow!("invalid credentials"),
        ];

        for error in non_connection_errors {
            assert!(!service.is_connection_error(&error), "Should NOT detect '{}' as connection error", error);
        }
    }

    #[tokio::test]
    async fn test_config_validation_rejects_empty_url() {
        let mut config = create_test_config_without_protocol();
        config.server_url = "".to_string();

        assert!(config.validate().is_err());
    }

    #[tokio::test]
    async fn test_config_validation_rejects_invalid_url() {
        let mut config = create_test_config_without_protocol();
        config.server_url = "http://https://invalid".to_string();

        assert!(config.validate().is_err());
    }

    #[tokio::test]
    async fn test_webdav_fallback_urls_use_normalized_url() {
        let config = create_test_config_without_protocol();
        let fallback_urls = config.webdav_fallback_urls();

        // All fallback URLs should start with https:// (normalized)
        for url in fallback_urls {
            assert!(url.starts_with("https://"), "Fallback URL should be normalized: {}", url);
        }
    }

    #[tokio::test]
    async fn test_backward_compatibility_with_existing_protocols() {
        // Existing URLs with protocols should work unchanged
        let https_config = create_test_config_with_https();
        let http_config = create_test_config_with_http();

        let https_service = WebDAVService::new(https_config).unwrap();
        let http_service = WebDAVService::new(http_config).unwrap();

        assert_eq!(https_service.get_effective_server_url(), "https://nas.example.com");
        assert_eq!(http_service.get_effective_server_url(), "http://nas.example.com");
    }

    #[tokio::test]
    async fn test_url_construction_with_protocol_detection() {
        let config = create_test_config_without_protocol();
        let service = WebDAVService::new(config).unwrap();

        // Test URL construction for different paths
        let test_paths = vec![
            "/Documents/file.pdf",
            "Photos/image.jpg",
            "/",
            "",
        ];

        for path in test_paths {
            let url = service.get_url_for_path(path);
            // Should start with https:// (normalized default)
            assert!(url.starts_with("https://"), "URL should be normalized for path '{}': {}", path, url);
        }
    }
}

View File

@ -14,6 +14,7 @@ use crate::models::{
WebDAVFolderInfo,
};
use crate::webdav_xml_parser::{parse_propfind_response, parse_propfind_response_with_directories};
use crate::mime_detection::{detect_mime_from_content, update_mime_type_with_content, MimeDetectionResult};
use super::{config::{WebDAVConfig, RetryConfig, ConcurrencyConfig}, SyncProgress};
@ -24,6 +25,15 @@ pub struct WebDAVDiscoveryResult {
pub directories: Vec<FileIngestionInfo>,
}
/// Result of downloading a file with MIME type detection
#[derive(Debug, Clone)]
pub struct WebDAVDownloadResult {
pub content: Vec<u8>,
pub file_info: FileIngestionInfo,
pub mime_detection: MimeDetectionResult,
pub mime_type_updated: bool,
}
/// Server capabilities information
#[derive(Debug, Clone)]
pub struct ServerCapabilities {
@ -135,6 +145,8 @@ pub struct WebDAVService {
concurrency_config: ConcurrencyConfig,
scan_semaphore: Arc<Semaphore>,
download_semaphore: Arc<Semaphore>,
/// Stores the working protocol (updated after successful protocol detection)
working_protocol: Arc<std::sync::RwLock<Option<String>>>,
}
impl WebDAVService {
@ -173,9 +185,156 @@ impl WebDAVService {
concurrency_config,
scan_semaphore,
download_semaphore,
working_protocol: Arc::new(std::sync::RwLock::new(None)),
})
}
// ============================================================================
// Protocol Detection Methods
// ============================================================================
    /// Detects the working protocol by trying HTTPS first, then HTTP
    /// This method handles smart protocol detection for URLs without explicit protocols
    ///
    /// Returns the working scheme name ("https" or "http"). On success the
    /// scheme is cached in `self.working_protocol` for subsequent callers.
    /// HTTP is only attempted when the HTTPS failure looks like a transport
    /// problem (per `is_connection_error`); auth/server errors abort early so
    /// credentials are not retried over plaintext.
    async fn detect_working_protocol(&self) -> Result<String> {
        info!("🔍 Starting smart protocol detection for: {}", self.config.server_url);

        // If URL already has a protocol, use it directly
        if self.config.server_url.starts_with("http://") || self.config.server_url.starts_with("https://") {
            let protocol = if self.config.server_url.starts_with("https://") { "https" } else { "http" };
            info!("✅ Protocol already specified: {}", protocol);
            return Ok(protocol.to_string());
        }

        // Try HTTPS first (more secure default)
        let https_url = format!("https://{}", self.config.server_url.trim());
        info!("🔐 Trying HTTPS first: {}", https_url);

        match self.test_protocol_connection(&https_url).await {
            Ok(()) => {
                info!("✅ HTTPS connection successful");
                // Store the working protocol for future use
                // NOTE(review): if the RwLock is poisoned the cache write is
                // silently skipped and detection repeats next call — confirm
                // this is acceptable.
                if let Ok(mut working_protocol) = self.working_protocol.write() {
                    *working_protocol = Some("https".to_string());
                }
                return Ok("https".to_string());
            }
            Err(https_error) => {
                warn!("❌ HTTPS connection failed: {}", https_error);

                // Check if this is a connection-related error (not auth error)
                if self.is_connection_error(&https_error) {
                    info!("🔄 HTTPS failed with connection error, trying HTTP fallback");

                    // Try HTTP fallback
                    let http_url = format!("http://{}", self.config.server_url.trim());
                    info!("🔓 Trying HTTP fallback: {}", http_url);

                    match self.test_protocol_connection(&http_url).await {
                        Ok(()) => {
                            warn!("⚠️ HTTP connection successful - consider configuring HTTPS for security");
                            // Store the working protocol for future use
                            if let Ok(mut working_protocol) = self.working_protocol.write() {
                                *working_protocol = Some("http".to_string());
                            }
                            return Ok("http".to_string());
                        }
                        Err(http_error) => {
                            // Neither scheme reachable: surface both errors to
                            // the caller for diagnosis.
                            error!("❌ Both HTTPS and HTTP failed");
                            error!("   HTTPS error: {}", https_error);
                            error!("   HTTP error: {}", http_error);
                            return Err(anyhow!(
                                "Protocol detection failed. Both HTTPS and HTTP connections failed. \
                                HTTPS error: {}. HTTP error: {}. \
                                Please verify the server URL and ensure WebDAV is properly configured.",
                                https_error, http_error
                            ));
                        }
                    }
                } else {
                    // Auth or other non-connection error with HTTPS - don't try HTTP
                    error!("❌ HTTPS failed with non-connection error (likely auth or server config): {}", https_error);
                    return Err(anyhow!(
                        "HTTPS connection failed with authentication or server configuration error: {}. \
                        Please check your credentials and server settings.",
                        https_error
                    ));
                }
            }
        }
    }
/// Tests connection with a specific protocol URL
///
/// Builds a throwaway config pointing at `full_url` (so `webdav_url()` is
/// computed against the candidate scheme) and issues a single authenticated
/// OPTIONS request. Succeeds on any 2xx status.
async fn test_protocol_connection(&self, full_url: &str) -> Result<()> {
    debug!("🧪 Testing protocol connection to: {}", full_url);

    // Probe configuration: same credentials and settings as the service,
    // but rooted at the URL (and protocol) under test.
    let probe_config = WebDAVConfig {
        server_url: full_url.to_string(),
        username: self.config.username.clone(),
        password: self.config.password.clone(),
        watch_folders: self.config.watch_folders.clone(),
        file_extensions: self.config.file_extensions.clone(),
        timeout_seconds: self.config.timeout_seconds,
        server_type: self.config.server_type.clone(),
    };

    let webdav_url = probe_config.webdav_url();
    debug!("📍 Testing WebDAV URL: {}", webdav_url);

    let response = self
        .client
        .request(Method::OPTIONS, &webdav_url)
        .basic_auth(&self.config.username, Some(&self.config.password))
        .send()
        .await
        .map_err(|e| anyhow!("Connection failed: {}", e))?;

    let status = response.status();
    if status.is_success() {
        debug!("✅ Protocol connection test successful");
        Ok(())
    } else {
        Err(anyhow!(
            "Protocol test failed with status: {} - {}",
            status,
            response.text().await.unwrap_or_default()
        ))
    }
}
/// Determines if an error is connection-related (vs auth or other errors)
///
/// Matches the lowercased error text against a fixed set of transport-level
/// markers. TLS/certificate failures are deliberately included so that an
/// HTTPS handshake problem triggers the HTTP fallback path.
pub fn is_connection_error(&self, error: &anyhow::Error) -> bool {
    // Substrings that indicate a transport failure rather than an
    // authentication or server-configuration problem.
    const CONNECTION_MARKERS: [&str; 9] = [
        "connection refused",
        "timeout",
        "dns",
        "network",
        "unreachable",
        "tls",
        "ssl",
        "certificate",
        "handshake",
    ];

    let message = error.to_string().to_lowercase();
    CONNECTION_MARKERS
        .iter()
        .any(|marker| message.contains(marker))
}
/// Gets the currently working protocol (if detected)
///
/// Returns `None` both when detection has not run yet and when the lock is
/// poisoned (a writer panicked).
pub fn get_working_protocol(&self) -> Option<String> {
    match self.working_protocol.read() {
        Ok(guard) => guard.clone(),
        Err(_) => None,
    }
}
/// Gets the effective server URL with the working protocol
///
/// When the configured URL has no explicit scheme and a protocol has been
/// detected, the detected scheme is prepended; otherwise the configured URL
/// is returned in normalized form.
pub fn get_effective_server_url(&self) -> String {
    let configured = self.config.server_url.as_str();
    let has_explicit_scheme =
        configured.starts_with("http://") || configured.starts_with("https://");

    // Prefer a previously detected protocol for scheme-less URLs.
    if !has_explicit_scheme {
        if let Some(protocol) = self.get_working_protocol() {
            return format!("{}://{}", protocol, configured.trim());
        }
    }

    // Otherwise use the configured URL (normalized)
    WebDAVConfig::normalize_server_url(configured)
}
// ============================================================================
// Connection and Testing Methods
// ============================================================================
@ -194,13 +353,31 @@ impl WebDAVService {
});
}
// Test basic connectivity with OPTIONS request
// Perform protocol detection if needed
let working_protocol = match self.detect_working_protocol().await {
Ok(protocol) => {
info!("✅ Protocol detection successful: {}", protocol);
protocol
}
Err(e) => {
error!("❌ Protocol detection failed: {}", e);
return Ok(WebDAVConnectionResult {
success: false,
message: format!("Protocol detection failed: {}", e),
server_version: None,
server_type: None,
});
}
};
// Test basic connectivity with OPTIONS request using detected protocol
match self.test_options_request().await {
Ok((server_version, server_type)) => {
info!("✅ WebDAV connection successful");
let effective_url = self.get_effective_server_url();
info!("✅ WebDAV connection successful using {} ({})", working_protocol.to_uppercase(), effective_url);
Ok(WebDAVConnectionResult {
success: true,
message: "Connection successful".to_string(),
message: format!("Connection successful using {}", working_protocol.to_uppercase()),
server_version,
server_type,
})
@ -235,7 +412,18 @@ impl WebDAVService {
/// Performs OPTIONS request to test basic connectivity
async fn test_options_request(&self) -> Result<(Option<String>, Option<String>)> {
let webdav_url = self.config.webdav_url();
// Create a temporary config with the effective server URL for WebDAV operations
let effective_server_url = self.get_effective_server_url();
let temp_config = WebDAVConfig {
server_url: effective_server_url,
username: self.config.username.clone(),
password: self.config.password.clone(),
watch_folders: self.config.watch_folders.clone(),
file_extensions: self.config.file_extensions.clone(),
timeout_seconds: self.config.timeout_seconds,
server_type: self.config.server_type.clone(),
};
let webdav_url = temp_config.webdav_url();
let response = self.client
.request(Method::OPTIONS, &webdav_url)
@ -304,8 +492,9 @@ impl WebDAVService {
/// Tests for Nextcloud-specific capabilities
async fn test_nextcloud_capabilities(&self) -> Result<()> {
let effective_server_url = self.get_effective_server_url();
let capabilities_url = format!("{}/ocs/v1.php/cloud/capabilities",
self.config.server_url.trim_end_matches('/'));
effective_server_url.trim_end_matches('/'));
let response = self.client
.get(&capabilities_url)
@ -592,7 +781,18 @@ impl WebDAVService {
/// Gets the WebDAV URL for a specific path
pub fn get_url_for_path(&self, path: &str) -> String {
let base_url = self.config.webdav_url();
// Create a temporary config with the effective server URL
let effective_server_url = self.get_effective_server_url();
let temp_config = WebDAVConfig {
server_url: effective_server_url,
username: self.config.username.clone(),
password: self.config.password.clone(),
watch_folders: self.config.watch_folders.clone(),
file_extensions: self.config.file_extensions.clone(),
timeout_seconds: self.config.timeout_seconds,
server_type: self.config.server_type.clone(),
};
let base_url = temp_config.webdav_url();
let clean_path = path.trim_start_matches('/');
let final_url = if clean_path.is_empty() {
@ -652,7 +852,18 @@ impl WebDAVService {
/// Convert file paths to the proper URL format for the server
pub fn path_to_url(&self, relative_path: &str) -> String {
let clean_path = relative_path.trim_start_matches('/');
let base_url = self.config.webdav_url();
// Create a temporary config with the effective server URL
let effective_server_url = self.get_effective_server_url();
let temp_config = WebDAVConfig {
server_url: effective_server_url,
username: self.config.username.clone(),
password: self.config.password.clone(),
watch_folders: self.config.watch_folders.clone(),
file_extensions: self.config.file_extensions.clone(),
timeout_seconds: self.config.timeout_seconds,
server_type: self.config.server_type.clone(),
};
let base_url = temp_config.webdav_url();
if clean_path.is_empty() {
base_url
@ -777,42 +988,64 @@ impl WebDAVService {
async fn discover_files_recursive(&self, directory_path: &str) -> Result<Vec<FileIngestionInfo>> {
let mut all_files = Vec::new();
let mut directories_to_scan = vec![directory_path.to_string()];
let mut scanned_directories = std::collections::HashSet::new();
let semaphore = Arc::new(Semaphore::new(self.concurrency_config.max_concurrent_scans));
debug!("Starting recursive file scan from: {}", directory_path);
while !directories_to_scan.is_empty() {
let current_directories = directories_to_scan.clone();
directories_to_scan.clear();
// Take a batch of directories to process
let batch_size = std::cmp::min(directories_to_scan.len(), self.concurrency_config.max_concurrent_scans);
let current_batch: Vec<String> = directories_to_scan.drain(..batch_size).collect();
debug!("Processing batch of {} directories, {} remaining in queue",
current_batch.len(), directories_to_scan.len());
// Process directories concurrently
let tasks = current_directories.into_iter().map(|dir| {
let tasks = current_batch.into_iter().filter_map(|dir| {
// Skip if already scanned
if scanned_directories.contains(&dir) {
debug!("Skipping already scanned directory: {}", dir);
return None;
}
scanned_directories.insert(dir.clone());
let permit = semaphore.clone();
let service = self.clone();
async move {
Some(async move {
let _permit = permit.acquire().await.unwrap();
service.discover_files_and_directories_single(&dir).await
}
let result = service.discover_files_and_directories_single(&dir).await;
(dir, result)
})
});
let results = futures_util::future::join_all(tasks).await;
for result in results {
for (scanned_dir, result) in results {
match result {
Ok(discovery_result) => {
debug!("Directory '{}' scan complete: {} files, {} subdirectories",
scanned_dir, discovery_result.files.len(), discovery_result.directories.len());
all_files.extend(discovery_result.files);
// Add subdirectories to the queue for the next iteration
for dir in discovery_result.directories {
if dir.is_directory {
directories_to_scan.push(dir.relative_path);
if dir.is_directory && !scanned_directories.contains(&dir.relative_path) {
directories_to_scan.push(dir.relative_path.clone());
debug!("Added subdirectory to scan queue: {}", dir.relative_path);
}
}
}
Err(e) => {
warn!("Failed to scan directory: {}", e);
warn!("Failed to scan directory '{}': {}", scanned_dir, e);
}
}
}
debug!("Batch complete. Total files found: {}. Queue size: {}",
all_files.len(), directories_to_scan.len());
}
info!("Recursive scan completed. Found {} files total", all_files.len());
@ -908,12 +1141,19 @@ impl WebDAVService {
let body = response.text().await?;
let all_items = parse_propfind_response_with_directories(&body)?;
// Process the items to convert href to relative paths
let processed_items = self.process_file_infos(all_items);
// Separate files and directories, excluding the parent directory itself
let mut files = Vec::new();
let mut directories = Vec::new();
for item in all_items {
if item.relative_path == directory_path {
for item in processed_items {
// Skip the directory itself (handle both with and without trailing slash)
let normalized_item_path = item.relative_path.trim_end_matches('/');
let normalized_directory_path = directory_path.trim_end_matches('/');
if normalized_item_path == normalized_directory_path {
continue; // Skip the directory itself
}
@ -933,41 +1173,69 @@ impl WebDAVService {
let mut all_files = Vec::new();
let mut all_directories = Vec::new();
let mut directories_to_scan = vec![directory_path.to_string()];
let mut scanned_directories = std::collections::HashSet::new();
let semaphore = Arc::new(Semaphore::new(self.concurrency_config.max_concurrent_scans));
debug!("Starting recursive scan from: {}", directory_path);
while !directories_to_scan.is_empty() {
let current_directories = directories_to_scan.clone();
directories_to_scan.clear();
// Take a batch of directories to process (limit batch size for better progress tracking)
let batch_size = std::cmp::min(directories_to_scan.len(), self.concurrency_config.max_concurrent_scans);
let current_batch: Vec<String> = directories_to_scan.drain(..batch_size).collect();
debug!("Processing batch of {} directories, {} remaining in queue",
current_batch.len(), directories_to_scan.len());
// Process directories concurrently
let tasks = current_directories.into_iter().map(|dir| {
let tasks = current_batch.into_iter().filter_map(|dir| {
// Skip if already scanned (prevent infinite loops)
if scanned_directories.contains(&dir) {
debug!("Skipping already scanned directory: {}", dir);
return None;
}
scanned_directories.insert(dir.clone());
let permit = semaphore.clone();
let service = self.clone();
async move {
Some(async move {
let _permit = permit.acquire().await.unwrap();
service.discover_files_and_directories_single(&dir).await
}
let result = service.discover_files_and_directories_single(&dir).await;
(dir, result)
})
});
let results = futures_util::future::join_all(tasks).await;
for result in results {
for (scanned_dir, result) in results {
match result {
Ok(discovery_result) => {
debug!("Directory '{}' scan complete: {} files, {} subdirectories",
scanned_dir, discovery_result.files.len(), discovery_result.directories.len());
all_files.extend(discovery_result.files);
// Add directories to our results and to the scan queue
for dir in discovery_result.directories {
// Only add to scan queue if not already scanned
if !scanned_directories.contains(&dir.relative_path) {
directories_to_scan.push(dir.relative_path.clone());
debug!("Added subdirectory to scan queue: {} (scanned set size: {})",
dir.relative_path, scanned_directories.len());
} else {
debug!("Skipping already scanned directory: {} (already in scanned set)", dir.relative_path);
}
all_directories.push(dir);
}
}
Err(e) => {
warn!("Failed to scan directory: {}", e);
warn!("Failed to scan directory '{}': {}", scanned_dir, e);
}
}
}
debug!("Batch complete. Total progress: {} files, {} directories found. Queue size: {}",
all_files.len(), all_directories.len(), directories_to_scan.len());
}
info!("Recursive scan completed. Found {} files and {} directories", all_files.len(), all_directories.len());
@ -1172,6 +1440,131 @@ impl WebDAVService {
Ok(results)
}
/// Downloads a file with enhanced MIME type detection based on content
///
/// This method downloads the file and performs content-based MIME type detection
/// using magic bytes, providing more accurate type identification than the initial
/// discovery phase which only has access to filenames and server-provided types.
///
/// # Arguments
/// * `file_info` - The file information from WebDAV discovery
///
/// # Returns
/// A `WebDAVDownloadResult` containing the file content, updated file info, and MIME detection details
///
/// # Errors
/// Fails if the download semaphore is closed, the request cannot be sent, or
/// the server responds with a non-success status.
pub async fn download_file_with_mime_detection(&self, file_info: &FileIngestionInfo) -> Result<WebDAVDownloadResult> {
    // Throttle concurrent downloads; the permit is held for the whole body.
    let _permit = self.download_semaphore.acquire().await?;

    debug!("⬇️🔍 Downloading file with MIME detection: {}", file_info.relative_path);

    // Use the relative path directly since it's already processed.
    // (Fixed: previously passed `&relative_path` where `relative_path` was
    // already a `&String`, creating a needless `&&String`.)
    let url = self.get_url_for_path(&file_info.relative_path);

    let response = self.authenticated_request(
        reqwest::Method::GET,
        &url,
        None,
        None,
    ).await?;

    if !response.status().is_success() {
        return Err(anyhow!(
            "Failed to download file '{}': HTTP {}",
            file_info.relative_path,
            response.status()
        ));
    }

    // Server-provided content type from the response headers, with any
    // charset suffix (e.g. "; charset=utf-8") stripped, as an owned String.
    let server_content_type = response
        .headers()
        .get("content-type")
        .and_then(|header| header.to_str().ok())
        .map(|s| s.split(';').next().unwrap_or(s).trim().to_string());

    let content = response.bytes().await?;
    debug!("✅ Downloaded {} bytes for file: {}", content.len(), file_info.relative_path);

    // Perform content-based (magic-byte) MIME type detection, with the
    // filename and server header available as secondary signals.
    let mime_detection_result = detect_mime_from_content(
        &content,
        &file_info.name,
        server_content_type.as_deref()
    );

    // Check if MIME type should be updated
    let mime_type_updated = mime_detection_result.mime_type != file_info.mime_type;

    if mime_type_updated {
        debug!("🔄 MIME type updated for {}: '{}' -> '{}' (method: {:?}, confidence: {:?})",
            file_info.name,
            file_info.mime_type,
            mime_detection_result.mime_type,
            mime_detection_result.detection_method,
            mime_detection_result.confidence);
    } else {
        debug!("✅ MIME type confirmed for {}: '{}' (method: {:?}, confidence: {:?})",
            file_info.name,
            mime_detection_result.mime_type,
            mime_detection_result.detection_method,
            mime_detection_result.confidence);
    }

    // Clone once and patch the MIME type in place when it changed.
    // (Fixed: previously cloned in both branches of an if/else.)
    let mut updated_file_info = file_info.clone();
    if mime_type_updated {
        updated_file_info.mime_type = mime_detection_result.mime_type.clone();
    }

    Ok(WebDAVDownloadResult {
        content: content.to_vec(),
        file_info: updated_file_info,
        mime_detection: mime_detection_result,
        mime_type_updated,
    })
}
/// Downloads multiple files with MIME type detection concurrently
///
/// Similar to `download_files` but includes content-based MIME type detection
/// for each downloaded file.
///
/// # Arguments
/// * `files` - The files to download
///
/// # Returns
/// A vector of tuples containing the original file info and download result
pub async fn download_files_with_mime_detection(&self, files: &[FileIngestionInfo]) -> Result<Vec<(FileIngestionInfo, Result<WebDAVDownloadResult>)>> {
    info!("⬇️🔍 Downloading {} files with MIME detection concurrently", files.len());

    // One future per file; per-download throttling happens inside
    // `download_file_with_mime_detection` via the download semaphore.
    let downloads = files.iter().map(|file| {
        let file_info = file.clone();
        let service = self.clone();
        async move {
            let outcome = service.download_file_with_mime_detection(&file_info).await;
            (file_info, outcome)
        }
    });

    let results = futures_util::future::join_all(downloads).await;

    // Tally outcomes in a single pass over the results.
    let mut success_count = 0usize;
    let mut mime_updated_count = 0usize;
    for (_, outcome) in &results {
        if let Ok(download) = outcome {
            success_count += 1;
            if download.mime_type_updated {
                mime_updated_count += 1;
            }
        }
    }
    let failure_count = results.len() - success_count;

    info!("📊 Download with MIME detection completed: {} successful, {} failed, {} MIME types updated",
        success_count, failure_count, mime_updated_count);

    Ok(results)
}
/// Gets file metadata without downloading content
pub async fn get_file_metadata(&self, file_path: &str) -> Result<FileIngestionInfo> {
debug!("📋 Getting metadata for file: {}", file_path);
@ -1226,9 +1619,21 @@ impl WebDAVService {
pub async fn get_server_capabilities(&self) -> Result<ServerCapabilities> {
debug!("🔍 Checking server capabilities");
// Create a temporary config with the effective server URL
let effective_server_url = self.get_effective_server_url();
let temp_config = WebDAVConfig {
server_url: effective_server_url,
username: self.config.username.clone(),
password: self.config.password.clone(),
watch_folders: self.config.watch_folders.clone(),
file_extensions: self.config.file_extensions.clone(),
timeout_seconds: self.config.timeout_seconds,
server_type: self.config.server_type.clone(),
};
let options_response = self.authenticated_request(
reqwest::Method::OPTIONS,
&self.config.webdav_url(),
&temp_config.webdav_url(),
None,
None,
).await?;
@ -1550,6 +1955,7 @@ impl WebDAVService {
}
}
// Implement Clone to allow sharing the service
impl Clone for WebDAVService {
fn clone(&self) -> Self {
@ -1560,6 +1966,7 @@ impl Clone for WebDAVService {
concurrency_config: self.concurrency_config.clone(),
scan_semaphore: Arc::clone(&self.scan_semaphore),
download_semaphore: Arc::clone(&self.download_semaphore),
working_protocol: Arc::clone(&self.working_protocol),
}
}
}

View File

@ -1,2 +1,3 @@
pub mod etag_comparison_tests;
pub mod deletion_detection_tests;
pub mod path_processing_tests;

View File

@ -0,0 +1,452 @@
#[cfg(test)]
mod path_processing_tests {
use crate::models::FileIngestionInfo;
use crate::services::webdav::{WebDAVConfig, WebDAVService};
use crate::webdav_xml_parser::parse_propfind_response_with_directories;
use wiremock::{
matchers::{method, path, header},
Mock, MockServer, ResponseTemplate,
};
/// Creates a test WebDAV service with mock server
///
/// The config targets a Nextcloud-style layout under `/TestDocuments` with a
/// fixed `testuser`/`testpass` credential pair.
fn create_test_service(mock_server_url: &str) -> WebDAVService {
    let cfg = WebDAVConfig {
        server_url: String::from(mock_server_url),
        username: String::from("testuser"),
        password: String::from("testpass"),
        watch_folders: vec![String::from("/TestDocuments")],
        file_extensions: vec![String::from("pdf"), String::from("txt")],
        timeout_seconds: 30,
        server_type: Some(String::from("nextcloud")),
    };

    WebDAVService::new(cfg).expect("Failed to create test service")
}
/// Mock WebDAV PROPFIND response with directories and files
///
/// Simulates a Depth: 1 listing of `/TestDocuments` as Nextcloud would return
/// it: the collection itself (which discovery must filter out), two
/// subdirectory collections, and one plain file with a content length.
fn mock_propfind_response() -> String {
    r#"<?xml version="1.0"?>
<d:multistatus xmlns:d="DAV:" xmlns:s="http://sabredav.org/ns" xmlns:oc="http://owncloud.org/ns" xmlns:nc="http://nextcloud.org/ns">
    <d:response>
        <d:href>/remote.php/dav/files/testuser/TestDocuments/</d:href>
        <d:propstat>
            <d:prop>
                <d:displayname>TestDocuments</d:displayname>
                <d:getlastmodified>Tue, 29 Jul 2025 01:34:17 GMT</d:getlastmodified>
                <d:getetag>"parent123etag"</d:getetag>
                <d:resourcetype><d:collection/></d:resourcetype>
            </d:prop>
            <d:status>HTTP/1.1 200 OK</d:status>
        </d:propstat>
    </d:response>
    <d:response>
        <d:href>/remote.php/dav/files/testuser/TestDocuments/SubDir1/</d:href>
        <d:propstat>
            <d:prop>
                <d:displayname>SubDir1</d:displayname>
                <d:getlastmodified>Fri, 20 Jun 2025 23:35:17 GMT</d:getlastmodified>
                <d:getetag>"subdir1etag"</d:getetag>
                <d:resourcetype><d:collection/></d:resourcetype>
            </d:prop>
            <d:status>HTTP/1.1 200 OK</d:status>
        </d:propstat>
    </d:response>
    <d:response>
        <d:href>/remote.php/dav/files/testuser/TestDocuments/SubDir2/</d:href>
        <d:propstat>
            <d:prop>
                <d:displayname>SubDir2</d:displayname>
                <d:getlastmodified>Tue, 29 Jul 2025 01:34:17 GMT</d:getlastmodified>
                <d:getetag>"subdir2etag"</d:getetag>
                <d:resourcetype><d:collection/></d:resourcetype>
            </d:prop>
            <d:status>HTTP/1.1 200 OK</d:status>
        </d:propstat>
    </d:response>
    <d:response>
        <d:href>/remote.php/dav/files/testuser/TestDocuments/test.pdf</d:href>
        <d:propstat>
            <d:prop>
                <d:displayname>test.pdf</d:displayname>
                <d:getlastmodified>Thu, 24 Jul 2025 19:16:19 GMT</d:getlastmodified>
                <d:getetag>"fileetag123"</d:getetag>
                <d:getcontentlength>1234567</d:getcontentlength>
                <d:resourcetype/>
            </d:prop>
            <d:status>HTTP/1.1 200 OK</d:status>
        </d:propstat>
    </d:response>
</d:multistatus>"#.to_string()
}
/// Mock WebDAV response for empty directory
///
/// A Depth: 1 PROPFIND on an empty collection still returns one
/// `<d:response>` entry — the collection itself — which the discovery layer
/// must filter out. This fixture exercises exactly that case.
fn mock_empty_directory_response() -> String {
    r#"<?xml version="1.0"?>
<d:multistatus xmlns:d="DAV:" xmlns:s="http://sabredav.org/ns" xmlns:oc="http://owncloud.org/ns" xmlns:nc="http://nextcloud.org/ns">
    <d:response>
        <d:href>/remote.php/dav/files/testuser/TestDocuments/SubDir1/</d:href>
        <d:propstat>
            <d:prop>
                <d:displayname>SubDir1</d:displayname>
                <d:getlastmodified>Fri, 20 Jun 2025 23:35:17 GMT</d:getlastmodified>
                <d:getetag>"subdir1etag"</d:getetag>
                <d:resourcetype><d:collection/></d:resourcetype>
            </d:prop>
            <d:status>HTTP/1.1 200 OK</d:status>
        </d:propstat>
    </d:response>
</d:multistatus>"#.to_string()
}
#[test]
fn test_xml_parser_returns_temp_paths() {
    // This test ensures the XML parser behavior is documented: the parser
    // leaves relative_path as the "TEMP" sentinel for the discovery layer.
    let xml_response = mock_propfind_response();
    let parsed_items = parse_propfind_response_with_directories(&xml_response)
        .expect("Failed to parse XML response");

    // All parsed items should have relative_path as "TEMP" initially
    for item in &parsed_items {
        assert_eq!(
            item.relative_path, "TEMP",
            "XML parser should set relative_path to TEMP for processing by discovery layer"
        );
    }

    // Should find the correct number of items
    assert_eq!(parsed_items.len(), 4, "Should parse all 4 items from XML");

    // Verify we get both directories and files in one pass
    let (directories, files): (Vec<_>, Vec<_>) =
        parsed_items.iter().partition(|item| item.is_directory);

    assert_eq!(directories.len(), 3, "Should find 3 directories");
    assert_eq!(files.len(), 1, "Should find 1 file");
}
#[test]
fn test_path_processing_converts_temp_to_relative_paths() {
    // Verifies that process_file_infos replaces the parser's "TEMP" sentinel
    // with a relative path derived from full_path, leaving full_path intact.
    let service = create_test_service("http://test.example.com");

    // Create mock parsed items with TEMP paths (simulating XML parser output)
    let mock_items = vec![
        FileIngestionInfo {
            relative_path: "TEMP".to_string(),
            full_path: "/remote.php/dav/files/testuser/TestDocuments/".to_string(),
            #[allow(deprecated)]
            path: "/remote.php/dav/files/testuser/TestDocuments/".to_string(),
            name: "TestDocuments".to_string(),
            size: 0,
            mime_type: "application/octet-stream".to_string(),
            last_modified: None,
            etag: "parent123etag".to_string(),
            is_directory: true,
            created_at: None,
            permissions: None,
            owner: None,
            group: None,
            metadata: None,
        },
        FileIngestionInfo {
            relative_path: "TEMP".to_string(),
            full_path: "/remote.php/dav/files/testuser/TestDocuments/SubDir1/".to_string(),
            #[allow(deprecated)]
            path: "/remote.php/dav/files/testuser/TestDocuments/SubDir1/".to_string(),
            name: "SubDir1".to_string(),
            size: 0,
            mime_type: "application/octet-stream".to_string(),
            last_modified: None,
            etag: "subdir1etag".to_string(),
            is_directory: true,
            created_at: None,
            permissions: None,
            owner: None,
            group: None,
            metadata: None,
        },
    ];

    // Process the items
    let processed_items = service.process_file_infos(mock_items);

    // Verify paths are correctly converted (Nextcloud user prefix stripped)
    assert_eq!(processed_items[0].relative_path, "/TestDocuments/");
    assert_eq!(processed_items[1].relative_path, "/TestDocuments/SubDir1/");

    // Verify full_path remains unchanged
    assert_eq!(processed_items[0].full_path, "/remote.php/dav/files/testuser/TestDocuments/");
    assert_eq!(processed_items[1].full_path, "/remote.php/dav/files/testuser/TestDocuments/SubDir1/");
}
#[test]
fn test_directory_filtering_excludes_parent() {
    // Verifies (in isolation) the filtering logic that removes the scanned
    // directory's own entry from a Depth: 1 PROPFIND result while keeping
    // its children. Mirrors discover_files_and_directories_single_with_url.
    // Create processed items including parent directory
    let processed_items = vec![
        FileIngestionInfo {
            relative_path: "/TestDocuments/".to_string(),
            full_path: "/remote.php/dav/files/testuser/TestDocuments/".to_string(),
            #[allow(deprecated)]
            path: "/TestDocuments/".to_string(),
            name: "TestDocuments".to_string(),
            size: 0,
            mime_type: "application/octet-stream".to_string(),
            last_modified: None,
            etag: "parent123etag".to_string(),
            is_directory: true,
            created_at: None,
            permissions: None,
            owner: None,
            group: None,
            metadata: None,
        },
        FileIngestionInfo {
            relative_path: "/TestDocuments/SubDir1/".to_string(),
            full_path: "/remote.php/dav/files/testuser/TestDocuments/SubDir1/".to_string(),
            #[allow(deprecated)]
            path: "/TestDocuments/SubDir1/".to_string(),
            name: "SubDir1".to_string(),
            size: 0,
            mime_type: "application/octet-stream".to_string(),
            last_modified: None,
            etag: "subdir1etag".to_string(),
            is_directory: true,
            created_at: None,
            permissions: None,
            owner: None,
            group: None,
            metadata: None,
        },
    ];

    // Simulate the filtering logic from discover_files_and_directories_single_with_url
    let directory_path = "/TestDocuments";
    let mut files = Vec::new();
    let mut directories = Vec::new();

    for item in processed_items {
        // Skip the directory itself (handle both with and without trailing slash)
        let normalized_item_path = item.relative_path.trim_end_matches('/');
        let normalized_directory_path = directory_path.trim_end_matches('/');

        if normalized_item_path == normalized_directory_path {
            continue; // Skip the directory itself
        }

        if item.is_directory {
            directories.push(item);
        } else {
            files.push(item);
        }
    }

    // Should exclude parent directory but include subdirectory
    assert_eq!(files.len(), 0);
    assert_eq!(directories.len(), 1);
    assert_eq!(directories[0].relative_path, "/TestDocuments/SubDir1/");
}
#[tokio::test]
async fn test_single_directory_discovery_integration() {
    // End-to-end check of non-recursive discovery against a wiremock server:
    // paths must be processed (no "TEMP") and the parent entry filtered out.
    let mock_server = MockServer::start().await;

    // Mock the PROPFIND request (Depth: 1 listing of /TestDocuments)
    Mock::given(method("PROPFIND"))
        .and(path("/remote.php/dav/files/testuser/TestDocuments"))
        .and(header("depth", "1"))
        .and(header("content-type", "application/xml"))
        .respond_with(
            ResponseTemplate::new(207)
                .set_body_string(mock_propfind_response())
                .insert_header("content-type", "application/xml")
        )
        .mount(&mock_server)
        .await;

    // mock_server.uri() includes an explicit http:// scheme, so protocol
    // detection is bypassed and requests go straight to the mock.
    let service = create_test_service(&mock_server.uri());

    // Test single directory discovery (recursive = false)
    let result = service.discover_files_and_directories("/TestDocuments", false).await
        .expect("Single directory discovery should succeed");

    // Verify results
    assert_eq!(result.files.len(), 1, "Should find 1 file");
    assert_eq!(result.directories.len(), 2, "Should find 2 directories (excluding parent)");

    // Verify directory paths are correct (not TEMP)
    let dir_paths: Vec<&String> = result.directories.iter().map(|d| &d.relative_path).collect();
    assert!(dir_paths.contains(&&"/TestDocuments/SubDir1/".to_string()));
    assert!(dir_paths.contains(&&"/TestDocuments/SubDir2/".to_string()));

    // Verify no directory has TEMP path
    for dir in &result.directories {
        assert_ne!(dir.relative_path, "TEMP", "Directory path should not be TEMP");
    }

    // Verify file path is correct
    assert_eq!(result.files[0].relative_path, "/TestDocuments/test.pdf");
    assert_ne!(result.files[0].relative_path, "TEMP", "File path should not be TEMP");
}
#[tokio::test]
async fn test_recursive_directory_discovery_integration() {
    // End-to-end check of recursive discovery: the root listing yields two
    // subdirectories, each of which is then scanned (and returns only its
    // own self-entry, i.e. is empty). All resulting paths must be processed.
    let mock_server = MockServer::start().await;

    // Mock the initial PROPFIND request for root directory
    Mock::given(method("PROPFIND"))
        .and(path("/remote.php/dav/files/testuser/TestDocuments"))
        .and(header("depth", "1"))
        .and(header("content-type", "application/xml"))
        .respond_with(
            ResponseTemplate::new(207)
                .set_body_string(mock_propfind_response())
                .insert_header("content-type", "application/xml")
        )
        .mount(&mock_server)
        .await;

    // Mock PROPFIND requests for subdirectories (return empty for simplicity)
    Mock::given(method("PROPFIND"))
        .and(path("/remote.php/dav/files/testuser/TestDocuments/SubDir1"))
        .and(header("depth", "1"))
        .and(header("content-type", "application/xml"))
        .respond_with(
            ResponseTemplate::new(207)
                .set_body_string(mock_empty_directory_response())
                .insert_header("content-type", "application/xml")
        )
        .mount(&mock_server)
        .await;

    Mock::given(method("PROPFIND"))
        .and(path("/remote.php/dav/files/testuser/TestDocuments/SubDir2"))
        .and(header("depth", "1"))
        .and(header("content-type", "application/xml"))
        .respond_with(
            ResponseTemplate::new(207)
                .set_body_string(mock_empty_directory_response())
                .insert_header("content-type", "application/xml")
        )
        .mount(&mock_server)
        .await;

    let service = create_test_service(&mock_server.uri());

    // Test recursive directory discovery (recursive = true)
    let result = service.discover_files_and_directories("/TestDocuments", true).await
        .expect("Recursive directory discovery should succeed");

    // Verify results
    assert_eq!(result.files.len(), 1, "Should find 1 file");
    assert_eq!(result.directories.len(), 2, "Should find 2 directories (excluding parents)");

    // Verify no paths are TEMP
    for item in result.files.iter().chain(result.directories.iter()) {
        assert_ne!(item.relative_path, "TEMP", "Paths should be processed, not TEMP");
        assert!(item.relative_path.starts_with("/TestDocuments"),
            "All paths should start with /TestDocuments, got: {}", item.relative_path);
    }
}
#[test]
fn test_href_to_relative_path_conversion() {
    let service = create_test_service("http://test.example.com");

    // Table of (server href, expected relative path) pairs covering a file,
    // the user root, and a nested collection for the Nextcloud layout.
    let cases = [
        (
            "/remote.php/dav/files/testuser/Documents/file.pdf",
            "/Documents/file.pdf",
        ),
        ("/remote.php/dav/files/testuser/", "/"),
        (
            "/remote.php/dav/files/testuser/Deep/Nested/Path/",
            "/Deep/Nested/Path/",
        ),
    ];

    for (href, expected) in cases {
        assert_eq!(service.href_to_relative_path(href), expected);
    }
}
#[test]
fn test_url_construction() {
    let service = create_test_service("http://test.example.com");

    // Table of (input path, expected WebDAV URL) pairs; the root path maps
    // to the bare user endpoint with no trailing slash.
    let cases = [
        (
            "/TestDocuments",
            "http://test.example.com/remote.php/dav/files/testuser/TestDocuments",
        ),
        (
            "/TestDocuments/SubDir",
            "http://test.example.com/remote.php/dav/files/testuser/TestDocuments/SubDir",
        ),
        (
            "/",
            "http://test.example.com/remote.php/dav/files/testuser",
        ),
    ];

    for (input_path, expected_url) in cases {
        assert_eq!(service.get_url_for_path(input_path), expected_url);
    }
}
#[test]
fn test_regression_temp_paths_are_processed() {
    // Regression test: Ensure TEMP paths from XML parser are always processed
    // (previously a directory could survive discovery with the sentinel
    // "TEMP" as its relative_path and break downstream path handling).
    let service = create_test_service("http://test.example.com");

    // Simulate the exact scenario that caused the bug
    let raw_xml_items = vec![
        FileIngestionInfo {
            relative_path: "TEMP".to_string(), // This is what XML parser returns
            full_path: "/remote.php/dav/files/testuser/TestDocuments/ImportantFolder/".to_string(),
            #[allow(deprecated)]
            path: "/remote.php/dav/files/testuser/TestDocuments/ImportantFolder/".to_string(),
            name: "ImportantFolder".to_string(),
            size: 0,
            mime_type: "application/octet-stream".to_string(),
            last_modified: None,
            etag: "folder123etag".to_string(),
            is_directory: true,
            created_at: None,
            permissions: None,
            owner: None,
            group: None,
            metadata: None,
        }
    ];

    // Process items as the service should do
    let processed_items = service.process_file_infos(raw_xml_items);

    // Verify the bug is fixed
    assert_eq!(processed_items.len(), 1);
    assert_ne!(processed_items[0].relative_path, "TEMP",
        "REGRESSION: relative_path should not remain as TEMP after processing");
    assert_eq!(processed_items[0].relative_path, "/TestDocuments/ImportantFolder/",
        "relative_path should be properly converted from href");
}
#[tokio::test]
async fn test_discover_files_and_directories_processes_paths() {
    // Integration test: discovery must never leak the parser's "TEMP"
    // placeholder paths to callers.
    let server = MockServer::start().await;

    // Serve a canned 207 Multi-Status PROPFIND response for the test folder.
    let propfind_mock = Mock::given(method("PROPFIND"))
        .and(path("/remote.php/dav/files/testuser/TestDocuments"))
        .respond_with(
            ResponseTemplate::new(207)
                .insert_header("content-type", "application/xml")
                .set_body_string(mock_propfind_response()),
        );
    propfind_mock.mount(&server).await;

    let service = create_test_service(&server.uri());
    let discovered = service
        .discover_files_and_directories("/TestDocuments", false)
        .await
        .expect("Discovery should succeed");

    // Regression guard: check every file and directory entry.
    let entries = discovered.files.iter().chain(discovered.directories.iter());
    for entry in entries {
        assert_ne!(
            entry.relative_path, "TEMP",
            "REGRESSION: No items should have TEMP paths after discovery"
        );
    }
}
}

View File

@ -6,6 +6,7 @@ use std::str;
use serde_json;
use crate::models::FileIngestionInfo;
use crate::mime_detection::{detect_mime_for_discovery, DetectionStrategy};
#[derive(Debug, Default)]
struct PropFindResponse {
@ -200,6 +201,14 @@ pub fn parse_propfind_response(xml_text: &str) -> Result<Vec<FileIngestionInfo>>
// Use the metadata collected during parsing
let metadata = resp.metadata;
// Determine MIME type using improved detection
let mime_detection_result = detect_mime_for_discovery(
&name,
resp.content_type.as_deref(),
DetectionStrategy::Comprehensive
);
let mime_type = mime_detection_result.mime_type;
let file_info = FileIngestionInfo {
relative_path: "TEMP".to_string(), // Will be set by discovery layer
full_path: resp.href.clone(),
@ -207,7 +216,7 @@ pub fn parse_propfind_response(xml_text: &str) -> Result<Vec<FileIngestionInfo>>
path: resp.href.clone(), // Legacy field - keep for compatibility
name,
size: resp.content_length.unwrap_or(0),
mime_type: resp.content_type.unwrap_or_else(|| "application/octet-stream".to_string()),
mime_type,
last_modified: parse_http_date(&resp.last_modified.unwrap_or_default()),
etag: resp.etag.unwrap_or_else(|| format!("\"{}\"", uuid::Uuid::new_v4())),
is_directory: false,
@ -418,6 +427,18 @@ pub fn parse_propfind_response_with_directories(xml_text: &str) -> Result<Vec<Fi
}
});
// Determine MIME type for files (directories get empty string)
let mime_type = if resp.is_collection {
"".to_string()
} else {
let mime_detection_result = detect_mime_for_discovery(
&name,
resp.content_type.as_deref(),
DetectionStrategy::Comprehensive
);
mime_detection_result.mime_type
};
let file_info = FileIngestionInfo {
relative_path: "TEMP".to_string(), // Will be set by discovery layer
full_path: resp.href.clone(),
@ -425,11 +446,7 @@ pub fn parse_propfind_response_with_directories(xml_text: &str) -> Result<Vec<Fi
path: resp.href.clone(), // Legacy field - keep for compatibility
name,
size: resp.content_length.unwrap_or(0),
mime_type: if resp.is_collection {
"".to_string()
} else {
resp.content_type.unwrap_or_else(|| "application/octet-stream".to_string())
},
mime_type,
last_modified: parse_http_date(&resp.last_modified.unwrap_or_default()),
etag: resp.etag.unwrap_or_else(|| format!("\"{}\"", uuid::Uuid::new_v4())),
is_directory: resp.is_collection,
@ -944,3 +961,4 @@ mod tests {
assert!(strong_compare_etags("\"1\"", "\"1\""));
}
}

File diff suppressed because it is too large Load Diff