feat(server): do a *much* better job at determining file types thanks to infer rust package
This commit is contained in:
parent
aff7b907c7
commit
d7a0a1f294
|
|
@ -4,7 +4,7 @@ node_modules/
|
|||
.env
|
||||
assets/
|
||||
frontend/dist/
|
||||
.claude/
|
||||
.claude/settings.local.json # This file is used to store the local Claude settings.
|
||||
readur_uploads/
|
||||
readur_watch/
|
||||
test-results/
|
||||
|
|
|
|||
|
|
@ -1009,6 +1009,17 @@ dependencies = [
|
|||
"nom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cfb"
|
||||
version = "0.7.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d38f2da7a0a2c4ccf0065be06397cc26a81f4e528be095826eee9d4adbb8c60f"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"fnv",
|
||||
"uuid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cfg-expr"
|
||||
version = "0.15.8"
|
||||
|
|
@ -2410,6 +2421,15 @@ dependencies = [
|
|||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "infer"
|
||||
version = "0.15.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cb33622da908807a06f9513c19b3c1ad50fab3e4137d82a78107d502075aa199"
|
||||
dependencies = [
|
||||
"cfb",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "inotify"
|
||||
version = "0.11.0"
|
||||
|
|
@ -2625,7 +2645,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"windows-targets 0.48.5",
|
||||
"windows-targets 0.53.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -3610,6 +3630,7 @@ dependencies = [
|
|||
"hostname",
|
||||
"image",
|
||||
"imageproc",
|
||||
"infer",
|
||||
"jsonwebtoken",
|
||||
"mime_guess",
|
||||
"notify",
|
||||
|
|
@ -5674,7 +5695,7 @@ version = "0.1.9"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
|
||||
dependencies = [
|
||||
"windows-sys 0.48.0",
|
||||
"windows-sys 0.59.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ path = "src/main.rs"
|
|||
name = "test_runner"
|
||||
path = "src/bin/test_runner.rs"
|
||||
|
||||
|
||||
[dependencies]
|
||||
tokio = { version = "1", features = ["full"] }
|
||||
axum = { version = "0.8", features = ["multipart"] }
|
||||
|
|
@ -33,6 +34,7 @@ futures-util = "0.3"
|
|||
futures = "0.3"
|
||||
notify = "8"
|
||||
mime_guess = "2"
|
||||
infer = "0.15"
|
||||
tesseract = { version = "0.15", optional = true }
|
||||
image = { version = "0.25", features = ["png", "jpeg", "tiff", "bmp"], optional = true }
|
||||
imageproc = { version = "0.25", optional = true }
|
||||
|
|
|
|||
|
|
@ -1,12 +1,12 @@
|
|||
{
|
||||
"name": "readur-frontend",
|
||||
"version": "2.4.2",
|
||||
"version": "2.5.3",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "readur-frontend",
|
||||
"version": "2.4.2",
|
||||
"version": "2.5.3",
|
||||
"dependencies": {
|
||||
"@emotion/react": "^11.14.0",
|
||||
"@emotion/styled": "^11.14.0",
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ pub mod db_guardrails_simple;
|
|||
pub mod errors;
|
||||
pub mod ingestion;
|
||||
pub mod metadata_extraction;
|
||||
pub mod mime_detection;
|
||||
pub mod models;
|
||||
pub mod monitoring;
|
||||
pub mod ocr;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,431 @@
|
|||
/// MIME type detection module for improved file type identification
|
||||
///
|
||||
/// This module provides functions for detecting file MIME types using multiple methods:
|
||||
/// 1. Content-based detection using magic bytes (most reliable)
|
||||
/// 2. Server-provided MIME type (when available and trusted)
|
||||
/// 3. Extension-based fallback (least reliable, but covers edge cases)
|
||||
///
|
||||
/// The goal is to provide accurate MIME type detection that's particularly important
|
||||
/// for OCR processing where incorrectly classified image files can cause issues.
|
||||
|
||||
use std::path::Path;
|
||||
use tracing::{debug, warn};
|
||||
|
||||
/// Strategy for MIME type detection
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub enum DetectionStrategy {
|
||||
/// Use content-based detection (magic bytes) - most reliable
|
||||
ContentBased,
|
||||
/// Trust server-provided MIME type if available, fallback to content
|
||||
TrustServer,
|
||||
/// Use extension-based detection - least reliable but fastest
|
||||
ExtensionOnly,
|
||||
/// Comprehensive strategy: server -> content -> extension -> fallback
|
||||
Comprehensive,
|
||||
}
|
||||
|
||||
/// Result of MIME type detection with metadata about the detection method used.
#[derive(Debug, Clone)]
pub struct MimeDetectionResult {
    pub mime_type: String,
    pub confidence: MimeConfidence,
    pub detection_method: DetectionMethod,
    pub original_server_type: Option<String>,
    pub detected_extension: Option<String>,
}

/// Confidence level of the MIME type detection.
///
/// Variants are declared in ascending order so that `Ord` comparisons
/// (`Low < Medium < High < VeryHigh`) can be used directly.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum MimeConfidence {
    /// Low confidence - extension-based or fallback detection
    Low,
    /// Medium confidence - mime_guess library detection
    Medium,
    /// High confidence - magic byte detection or trusted server
    High,
    /// Very high confidence - content analysis confirms server type
    VeryHigh,
}

/// Method used for MIME type detection.
#[derive(Debug, Clone, PartialEq)]
pub enum DetectionMethod {
    /// Detected using magic bytes/file signature
    MagicBytes,
    /// Provided by the server and trusted
    ServerProvided,
    /// Detected using file extension
    Extension,
    /// Fallback to default type
    Fallback,
    /// Hybrid approach using multiple methods
    Hybrid,
}

impl MimeDetectionResult {
    /// Internal constructor shared by the public factory methods.
    fn with_parts(
        mime_type: String,
        confidence: MimeConfidence,
        detection_method: DetectionMethod,
        original_server_type: Option<String>,
        detected_extension: Option<String>,
    ) -> Self {
        Self {
            mime_type,
            confidence,
            detection_method,
            original_server_type,
            detected_extension,
        }
    }

    /// Create a result for a server-provided MIME type (high confidence).
    pub fn from_server(mime_type: String) -> Self {
        Self::with_parts(
            mime_type,
            MimeConfidence::High,
            DetectionMethod::ServerProvided,
            None,
            None,
        )
    }

    /// Create a result for content-based (magic byte) detection,
    /// recording the server-supplied type for later comparison.
    pub fn from_content(mime_type: String, server_type: Option<String>) -> Self {
        Self::with_parts(
            mime_type,
            MimeConfidence::High,
            DetectionMethod::MagicBytes,
            server_type,
            None,
        )
    }

    /// Create a result for extension-based detection (medium confidence).
    pub fn from_extension(mime_type: String, extension: String) -> Self {
        Self::with_parts(
            mime_type,
            MimeConfidence::Medium,
            DetectionMethod::Extension,
            None,
            Some(extension),
        )
    }

    /// Create the low-confidence fallback result
    /// (`application/octet-stream`).
    pub fn fallback() -> Self {
        Self::with_parts(
            "application/octet-stream".to_string(),
            MimeConfidence::Low,
            DetectionMethod::Fallback,
            None,
            None,
        )
    }

    /// Check if the detected MIME type indicates an image file.
    pub fn is_image(&self) -> bool {
        self.mime_type.starts_with("image/")
    }

    /// Check if the detected MIME type is one of the known document formats
    /// (PDF, Office formats, plain text, RTF).
    pub fn is_document(&self) -> bool {
        matches!(self.mime_type.as_str(),
            "application/pdf" |
            "application/msword" |
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document" |
            "application/vnd.ms-excel" |
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" |
            "application/vnd.ms-powerpoint" |
            "application/vnd.openxmlformats-officedocument.presentationml.presentation" |
            "text/plain" |
            "text/rtf" |
            "application/rtf"
        )
    }

    /// Check if this MIME type is suitable for OCR processing
    /// (any image, or a PDF).
    pub fn is_ocr_suitable(&self) -> bool {
        self.is_image() || self.mime_type == "application/pdf"
    }
}
|
||||
|
||||
/// Detect MIME type for WebDAV discovery phase (when we only have file metadata)
|
||||
///
|
||||
/// This function is called during the initial WebDAV XML parsing when we don't
|
||||
/// have access to the actual file content yet.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `filename` - The filename/path of the file
|
||||
/// * `server_mime_type` - MIME type provided by the WebDAV server, if any
|
||||
/// * `strategy` - Detection strategy to use
|
||||
///
|
||||
/// # Returns
|
||||
/// A `MimeDetectionResult` with the best available MIME type determination
|
||||
pub fn detect_mime_for_discovery(
|
||||
filename: &str,
|
||||
server_mime_type: Option<&str>,
|
||||
strategy: DetectionStrategy,
|
||||
) -> MimeDetectionResult {
|
||||
debug!("Detecting MIME type for discovery: filename={}, server_type={:?}, strategy={:?}",
|
||||
filename, server_mime_type, strategy);
|
||||
|
||||
match strategy {
|
||||
DetectionStrategy::ContentBased => {
|
||||
// During discovery, we can't analyze content, so fall back to extension
|
||||
detect_from_extension(filename, server_mime_type)
|
||||
}
|
||||
DetectionStrategy::TrustServer => {
|
||||
if let Some(server_type) = server_mime_type {
|
||||
if is_trusted_server_mime_type(server_type) {
|
||||
return MimeDetectionResult::from_server(server_type.to_string());
|
||||
}
|
||||
}
|
||||
// Fallback to extension-based detection
|
||||
detect_from_extension(filename, server_mime_type)
|
||||
}
|
||||
DetectionStrategy::ExtensionOnly => {
|
||||
detect_from_extension(filename, server_mime_type)
|
||||
}
|
||||
DetectionStrategy::Comprehensive => {
|
||||
// Use server type if trusted, otherwise extension-based
|
||||
if let Some(server_type) = server_mime_type {
|
||||
if is_trusted_server_mime_type(server_type) {
|
||||
return MimeDetectionResult::from_server(server_type.to_string());
|
||||
}
|
||||
}
|
||||
detect_from_extension(filename, server_mime_type)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Detect MIME type when file content is available (during file download/processing)
|
||||
///
|
||||
/// This provides the most accurate detection using magic bytes from the actual file content.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `content` - The first few bytes of the file content (at least 512 bytes recommended)
|
||||
/// * `filename` - The filename for fallback detection
|
||||
/// * `server_mime_type` - MIME type provided by the server, if any
|
||||
///
|
||||
/// # Returns
|
||||
/// A `MimeDetectionResult` with high-confidence MIME type detection
|
||||
pub fn detect_mime_from_content(
|
||||
content: &[u8],
|
||||
filename: &str,
|
||||
server_mime_type: Option<&str>,
|
||||
) -> MimeDetectionResult {
|
||||
debug!("Detecting MIME type from content: filename={}, server_type={:?}, content_len={}",
|
||||
filename, server_mime_type, content.len());
|
||||
|
||||
// First, try magic byte detection
|
||||
if let Some(detected_type) = infer::get(content) {
|
||||
let mime_type = detected_type.mime_type().to_string();
|
||||
debug!("Magic bytes detected MIME type: {}", mime_type);
|
||||
|
||||
// If server provided a type, check for consistency
|
||||
if let Some(server_type) = server_mime_type {
|
||||
if are_mime_types_compatible(&mime_type, server_type) {
|
||||
// Both agree - very high confidence
|
||||
let mut result = MimeDetectionResult::from_content(mime_type, Some(server_type.to_string()));
|
||||
result.confidence = MimeConfidence::VeryHigh;
|
||||
result.detection_method = DetectionMethod::Hybrid;
|
||||
return result;
|
||||
} else {
|
||||
// Content detection overrides server type - trust the bytes
|
||||
warn!("MIME type mismatch: server={}, content={} for file {}",
|
||||
server_type, mime_type, filename);
|
||||
return MimeDetectionResult::from_content(mime_type, Some(server_type.to_string()));
|
||||
}
|
||||
} else {
|
||||
// Only content detection available
|
||||
return MimeDetectionResult::from_content(mime_type, None);
|
||||
}
|
||||
}
|
||||
|
||||
// Magic bytes detection failed, fall back to server type if trusted
|
||||
if let Some(server_type) = server_mime_type {
|
||||
if is_trusted_server_mime_type(server_type) {
|
||||
debug!("Using trusted server MIME type: {}", server_type);
|
||||
return MimeDetectionResult::from_server(server_type.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
// Fall back to extension-based detection
|
||||
debug!("Content detection failed, falling back to extension detection");
|
||||
detect_from_extension(filename, server_mime_type)
|
||||
}
|
||||
|
||||
/// Update an existing MIME type with content-based detection if available
|
||||
///
|
||||
/// This function is useful for re-detecting MIME types when file content becomes
|
||||
/// available after initial discovery.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `current_mime_type` - The currently assigned MIME type
|
||||
/// * `content` - File content for analysis
|
||||
/// * `filename` - Filename for context
|
||||
///
|
||||
/// # Returns
|
||||
/// A new `MimeDetectionResult` if detection improves confidence, or None if no change needed
|
||||
pub fn update_mime_type_with_content(
|
||||
current_mime_type: &str,
|
||||
content: &[u8],
|
||||
filename: &str,
|
||||
) -> Option<MimeDetectionResult> {
|
||||
let new_result = detect_mime_from_content(content, filename, Some(current_mime_type));
|
||||
|
||||
// Only update if we have higher confidence or detected a different type
|
||||
if new_result.confidence > MimeConfidence::Medium ||
|
||||
new_result.mime_type != current_mime_type {
|
||||
Some(new_result)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Detect MIME type from file extension using mime_guess library
|
||||
fn detect_from_extension(filename: &str, server_mime_type: Option<&str>) -> MimeDetectionResult {
|
||||
let path = Path::new(filename);
|
||||
|
||||
if let Some(mime_type) = mime_guess::from_path(path).first() {
|
||||
let mime_str = mime_type.to_string();
|
||||
debug!("Extension-based detection: {} -> {}", filename, mime_str);
|
||||
|
||||
let mut result = MimeDetectionResult::from_extension(
|
||||
mime_str,
|
||||
path.extension()
|
||||
.and_then(|ext| ext.to_str())
|
||||
.unwrap_or("")
|
||||
.to_string()
|
||||
);
|
||||
result.original_server_type = server_mime_type.map(|s| s.to_string());
|
||||
result
|
||||
} else {
|
||||
debug!("Extension-based detection failed for: {}", filename);
|
||||
let mut result = MimeDetectionResult::fallback();
|
||||
result.original_server_type = server_mime_type.map(|s| s.to_string());
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if a server-provided MIME type should be trusted
///
/// Some servers return generic types like "application/octet-stream" which
/// aren't useful, while others provide accurate information. A type is
/// trusted unless it is one of the known generic/placeholder values.
fn is_trusted_server_mime_type(mime_type: &str) -> bool {
    // Placeholder values that carry no real type information.
    const UNTRUSTED: [&str; 5] = [
        "application/octet-stream",
        "application/binary",
        "binary/octet-stream",
        "",
        "unknown",
    ];

    !UNTRUSTED.contains(&mime_type)
}
|
||||
|
||||
/// Check if two MIME types are compatible/equivalent
///
/// Some servers might return slightly different but equivalent MIME types
/// (e.g., "image/jpg" vs "image/jpeg"). As a deliberate leniency, two types
/// sharing the same primary category (e.g. both "image/*") are also treated
/// as compatible.
fn are_mime_types_compatible(type1: &str, type2: &str) -> bool {
    if type1 == type2 {
        return true;
    }

    // Known alias pairs used interchangeably by some servers.
    let aliases = [
        ("image/jpeg", "image/jpg"),
        ("image/tiff", "image/tif"),
        ("text/plain", "text/txt"),
    ];
    let is_alias = aliases
        .iter()
        .any(|&(a, b)| (type1 == a && type2 == b) || (type1 == b && type2 == a));
    if is_alias {
        return true;
    }

    // Fall back to comparing the primary type of well-formed "primary/sub"
    // values; a subtype containing another '/' means the value was not a
    // valid two-part MIME type, so it is rejected (matches the original
    // "exactly two '/'-separated parts" rule).
    match (type1.split_once('/'), type2.split_once('/')) {
        (Some((p1, s1)), Some((p2, s2))) => {
            p1 == p2 && !s1.contains('/') && !s2.contains('/')
        }
        _ => false,
    }
}
|
||||
|
||||
/// Legacy function for backward compatibility
|
||||
///
|
||||
/// This maintains the same interface as the original `get_mime_type_from_extension`
|
||||
/// function but uses the new detection system.
|
||||
pub fn get_mime_type_from_extension(extension: &str) -> String {
|
||||
let fake_filename = format!("file.{}", extension);
|
||||
let result = detect_from_extension(&fake_filename, None);
|
||||
result.mime_type
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Discovery with ExtensionOnly must classify by filename extension.
    #[test]
    fn test_mime_detection_from_extension() {
        let result = detect_mime_for_discovery(
            "test.pdf",
            None,
            DetectionStrategy::ExtensionOnly
        );
        assert_eq!(result.mime_type, "application/pdf");
        assert_eq!(result.detection_method, DetectionMethod::Extension);
    }

    // TrustServer uses the server type when specific, but falls back to
    // the extension when the server sends a generic placeholder.
    #[test]
    fn test_server_type_trust() {
        // Trusted server type
        let result = detect_mime_for_discovery(
            "test.pdf",
            Some("application/pdf"),
            DetectionStrategy::TrustServer
        );
        assert_eq!(result.mime_type, "application/pdf");
        assert_eq!(result.detection_method, DetectionMethod::ServerProvided);

        // Untrusted server type should fall back
        let result = detect_mime_for_discovery(
            "test.pdf",
            Some("application/octet-stream"),
            DetectionStrategy::TrustServer
        );
        assert_eq!(result.mime_type, "application/pdf");
        assert_eq!(result.detection_method, DetectionMethod::Extension);
    }

    // Alias handling ("jpg"/"jpeg") must be symmetric.
    #[test]
    fn test_mime_type_compatibility() {
        assert!(are_mime_types_compatible("image/jpeg", "image/jpg"));
        assert!(are_mime_types_compatible("image/jpg", "image/jpeg"));
        assert!(are_mime_types_compatible("text/plain", "text/plain"));
        assert!(!are_mime_types_compatible("image/jpeg", "text/plain"));
    }

    // Magic-byte detection from real file signatures (PDF and JPEG headers).
    #[test]
    fn test_content_based_detection() {
        // PDF magic bytes
        let pdf_header = b"%PDF-1.4";
        let result = detect_mime_from_content(pdf_header, "test.pdf", None);
        assert_eq!(result.mime_type, "application/pdf");
        assert_eq!(result.detection_method, DetectionMethod::MagicBytes);
        assert_eq!(result.confidence, MimeConfidence::High);

        // JPEG magic bytes
        let jpeg_header = [0xFF, 0xD8, 0xFF];
        let result = detect_mime_from_content(&jpeg_header, "test.jpg", None);
        assert_eq!(result.mime_type, "image/jpeg");
    }

    // When content and server agree, confidence is upgraded to VeryHigh
    // and the method reported as Hybrid.
    #[test]
    fn test_hybrid_detection() {
        // Content and server agree
        let pdf_header = b"%PDF-1.4";
        let result = detect_mime_from_content(pdf_header, "test.pdf", Some("application/pdf"));
        assert_eq!(result.mime_type, "application/pdf");
        assert_eq!(result.detection_method, DetectionMethod::Hybrid);
        assert_eq!(result.confidence, MimeConfidence::VeryHigh);
    }

    // The legacy extension->MIME helper must keep returning the same values.
    #[test]
    fn test_legacy_compatibility() {
        assert_eq!(get_mime_type_from_extension("pdf"), "application/pdf");
        assert_eq!(get_mime_type_from_extension("jpg"), "image/jpeg");
        assert_eq!(get_mime_type_from_extension("png"), "image/png");
    }

    // OCR is applicable to images and PDFs, but not plain text.
    #[test]
    fn test_ocr_suitability() {
        let pdf_result = MimeDetectionResult::from_content("application/pdf".to_string(), None);
        assert!(pdf_result.is_ocr_suitable());

        let image_result = MimeDetectionResult::from_content("image/jpeg".to_string(), None);
        assert!(image_result.is_ocr_suitable());

        let text_result = MimeDetectionResult::from_content("text/plain".to_string(), None);
        assert!(!text_result.is_ocr_suitable());
    }
}
|
||||
|
|
@ -570,27 +570,17 @@ impl SourceScheduler {
|
|||
return Err(format!("WebDAV server_url is empty"));
|
||||
}
|
||||
|
||||
// Check if URL starts with a valid scheme
|
||||
if !server_url.starts_with("http://") && !server_url.starts_with("https://") {
|
||||
return Err(format!(
|
||||
"WebDAV server_url must start with 'http://' or 'https://'. \
|
||||
Current value: '{}'. \
|
||||
Examples of valid URLs: \
|
||||
- https://cloud.example.com \
|
||||
- http://192.168.1.100:8080 \
|
||||
- https://nextcloud.mydomain.com:443",
|
||||
server_url
|
||||
));
|
||||
}
|
||||
// Normalize URL by adding protocol if missing (consistent with WebDAVConfig)
|
||||
let normalized_url = crate::services::webdav::config::WebDAVConfig::normalize_server_url(server_url);
|
||||
|
||||
// Try to parse as URL to catch other issues
|
||||
match reqwest::Url::parse(server_url) {
|
||||
// Try to parse the normalized URL to catch other issues
|
||||
match reqwest::Url::parse(&normalized_url) {
|
||||
Ok(url) => {
|
||||
if url.scheme() != "http" && url.scheme() != "https" {
|
||||
return Err(format!(
|
||||
"WebDAV server_url has invalid scheme '{}'. Only 'http' and 'https' are supported. \
|
||||
Current URL: '{}'",
|
||||
url.scheme(), server_url
|
||||
url.scheme(), normalized_url
|
||||
));
|
||||
}
|
||||
|
||||
|
|
@ -599,23 +589,23 @@ impl SourceScheduler {
|
|||
"WebDAV server_url is missing hostname. \
|
||||
Current URL: '{}'. \
|
||||
Example: https://cloud.example.com",
|
||||
server_url
|
||||
normalized_url
|
||||
));
|
||||
}
|
||||
|
||||
crate::debug_log!("SOURCE_SCHEDULER", "✅ WebDAV URL validation passed for source '{}': {}", source_name, server_url);
|
||||
crate::debug_log!("SOURCE_SCHEDULER", "✅ WebDAV URL validation passed for source '{}': {} (normalized to: {})", source_name, server_url, normalized_url);
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
Err(format!(
|
||||
"WebDAV server_url is not a valid URL: {}. \
|
||||
Current value: '{}'. \
|
||||
Current value: '{}' (normalized to: '{}'). \
|
||||
The URL must be absolute and include the full domain. \
|
||||
Examples: \
|
||||
- https://cloud.example.com \
|
||||
- http://192.168.1.100:8080/webdav \
|
||||
- https://nextcloud.mydomain.com",
|
||||
e, server_url
|
||||
e, server_url, normalized_url
|
||||
))
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -103,6 +103,32 @@ impl WebDAVConfig {
|
|||
}
|
||||
}
|
||||
|
||||
/// Normalizes a server URL by adding protocol if missing
/// Prefers HTTPS over HTTP for security reasons
///
/// Surrounding whitespace is stripped; an explicit `http://` or `https://`
/// prefix is left untouched.
pub fn normalize_server_url(url: &str) -> String {
    let trimmed = url.trim();

    let has_protocol =
        trimmed.starts_with("http://") || trimmed.starts_with("https://");

    if has_protocol {
        trimmed.to_string()
    } else {
        // Default to HTTPS for security when no protocol is given.
        format!("https://{}", trimmed)
    }
}
|
||||
|
||||
/// Generates alternative protocol URL for fallback attempts
/// If input has HTTPS, returns HTTP version and vice versa
///
/// Returns `None` when the URL has no recognized protocol prefix.
pub fn get_alternative_protocol_url(url: &str) -> Option<String> {
    url.strip_prefix("https://")
        .map(|rest| format!("http://{}", rest))
        .or_else(|| {
            url.strip_prefix("http://")
                .map(|rest| format!("https://{}", rest))
        })
}
|
||||
|
||||
/// Validates the configuration
|
||||
pub fn validate(&self) -> anyhow::Result<()> {
|
||||
if self.server_url.is_empty() {
|
||||
|
|
@ -121,9 +147,22 @@ impl WebDAVConfig {
|
|||
return Err(anyhow::anyhow!("At least one watch folder must be specified"));
|
||||
}
|
||||
|
||||
// Validate URL format
|
||||
if !self.server_url.starts_with("http://") && !self.server_url.starts_with("https://") {
|
||||
return Err(anyhow::anyhow!("Server URL must start with http:// or https://"));
|
||||
// Validate URL format - now accepts URLs without protocol
|
||||
// Protocol detection and fallback will be handled during connection testing
|
||||
let normalized_url = Self::normalize_server_url(&self.server_url);
|
||||
|
||||
// Basic URL validation - check if it looks like a valid domain/IP
|
||||
let url_without_protocol = normalized_url
|
||||
.trim_start_matches("https://")
|
||||
.trim_start_matches("http://");
|
||||
|
||||
if url_without_protocol.is_empty() {
|
||||
return Err(anyhow::anyhow!("Server URL must contain a valid domain or IP address"));
|
||||
}
|
||||
|
||||
// Check for obviously invalid URLs
|
||||
if url_without_protocol.contains("://") {
|
||||
return Err(anyhow::anyhow!("Invalid URL format: contains multiple protocols"));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
|
@ -131,8 +170,8 @@ impl WebDAVConfig {
|
|||
|
||||
/// Returns the base URL for WebDAV operations
|
||||
pub fn webdav_url(&self) -> String {
|
||||
// Normalize the server URL by removing trailing slashes
|
||||
let normalized_url = self.server_url.trim_end_matches('/').to_string();
|
||||
// Normalize the server URL by adding protocol if missing and removing trailing slashes
|
||||
let normalized_url = Self::normalize_server_url(&self.server_url).trim_end_matches('/').to_string();
|
||||
|
||||
// Add WebDAV path based on server type
|
||||
match self.server_type.as_deref() {
|
||||
|
|
@ -160,7 +199,7 @@ impl WebDAVConfig {
|
|||
/// Returns alternative WebDAV URLs to try if the primary one fails
|
||||
/// This is used for fallback mechanisms when encountering 405 errors
|
||||
pub fn webdav_fallback_urls(&self) -> Vec<String> {
|
||||
let normalized_url = self.server_url.trim_end_matches('/').to_string();
|
||||
let normalized_url = Self::normalize_server_url(&self.server_url).trim_end_matches('/').to_string();
|
||||
let mut fallback_urls = Vec::new();
|
||||
|
||||
match self.server_type.as_deref() {
|
||||
|
|
|
|||
|
|
@ -23,4 +23,6 @@ mod url_construction_tests;
|
|||
#[cfg(test)]
|
||||
mod subdirectory_edge_cases_tests;
|
||||
#[cfg(test)]
|
||||
mod protocol_detection_tests;
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
|
@ -0,0 +1,233 @@
|
|||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::super::{WebDAVService, WebDAVConfig};
|
||||
|
||||
/// Helper function to create test WebDAV config without protocol
|
||||
fn create_test_config_without_protocol() -> WebDAVConfig {
|
||||
WebDAVConfig {
|
||||
server_url: "nas.example.com".to_string(), // No protocol
|
||||
username: "testuser".to_string(),
|
||||
password: "testpass".to_string(),
|
||||
watch_folders: vec!["/Documents".to_string()],
|
||||
file_extensions: vec!["pdf".to_string(), "txt".to_string()],
|
||||
timeout_seconds: 30,
|
||||
server_type: Some("nextcloud".to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper function to create test WebDAV config with HTTPS protocol
|
||||
fn create_test_config_with_https() -> WebDAVConfig {
|
||||
WebDAVConfig {
|
||||
server_url: "https://nas.example.com".to_string(),
|
||||
username: "testuser".to_string(),
|
||||
password: "testpass".to_string(),
|
||||
watch_folders: vec!["/Documents".to_string()],
|
||||
file_extensions: vec!["pdf".to_string(), "txt".to_string()],
|
||||
timeout_seconds: 30,
|
||||
server_type: Some("nextcloud".to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper function to create test WebDAV config with HTTP protocol
|
||||
fn create_test_config_with_http() -> WebDAVConfig {
|
||||
WebDAVConfig {
|
||||
server_url: "http://nas.example.com".to_string(),
|
||||
username: "testuser".to_string(),
|
||||
password: "testpass".to_string(),
|
||||
watch_folders: vec!["/Documents".to_string()],
|
||||
file_extensions: vec!["pdf".to_string(), "txt".to_string()],
|
||||
timeout_seconds: 30,
|
||||
server_type: Some("nextcloud".to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_config_validation_accepts_url_without_protocol() {
|
||||
let config = create_test_config_without_protocol();
|
||||
|
||||
// Should not fail validation
|
||||
assert!(config.validate().is_ok());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_config_validation_accepts_url_with_https() {
|
||||
let config = create_test_config_with_https();
|
||||
|
||||
// Should not fail validation
|
||||
assert!(config.validate().is_ok());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_config_validation_accepts_url_with_http() {
|
||||
let config = create_test_config_with_http();
|
||||
|
||||
// Should not fail validation
|
||||
assert!(config.validate().is_ok());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_normalize_server_url_adds_https_by_default() {
|
||||
let normalized = WebDAVConfig::normalize_server_url("nas.example.com");
|
||||
assert_eq!(normalized, "https://nas.example.com");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_normalize_server_url_preserves_existing_protocol() {
|
||||
let https_url = WebDAVConfig::normalize_server_url("https://nas.example.com");
|
||||
assert_eq!(https_url, "https://nas.example.com");
|
||||
|
||||
let http_url = WebDAVConfig::normalize_server_url("http://nas.example.com");
|
||||
assert_eq!(http_url, "http://nas.example.com");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_get_alternative_protocol_url() {
|
||||
// HTTPS to HTTP
|
||||
let alt_http = WebDAVConfig::get_alternative_protocol_url("https://nas.example.com");
|
||||
assert_eq!(alt_http, Some("http://nas.example.com".to_string()));
|
||||
|
||||
// HTTP to HTTPS
|
||||
let alt_https = WebDAVConfig::get_alternative_protocol_url("http://nas.example.com");
|
||||
assert_eq!(alt_https, Some("https://nas.example.com".to_string()));
|
||||
|
||||
// No protocol - should return None
|
||||
let no_protocol = WebDAVConfig::get_alternative_protocol_url("nas.example.com");
|
||||
assert_eq!(no_protocol, None);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_webdav_url_uses_normalized_url() {
|
||||
let config = create_test_config_without_protocol();
|
||||
let webdav_url = config.webdav_url();
|
||||
|
||||
// Should start with https:// (normalized)
|
||||
assert!(webdav_url.starts_with("https://"));
|
||||
assert_eq!(webdav_url, "https://nas.example.com/remote.php/dav/files/testuser");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_service_creation_with_protocol_detection() {
|
||||
let config = create_test_config_without_protocol();
|
||||
|
||||
// Should be able to create service without errors
|
||||
let service = WebDAVService::new(config);
|
||||
assert!(service.is_ok());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_effective_server_url_defaults_to_normalized() {
|
||||
let config = create_test_config_without_protocol();
|
||||
let service = WebDAVService::new(config).unwrap();
|
||||
|
||||
let effective_url = service.get_effective_server_url();
|
||||
assert_eq!(effective_url, "https://nas.example.com");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_effective_server_url_with_existing_protocol() {
|
||||
let config = create_test_config_with_http();
|
||||
let service = WebDAVService::new(config).unwrap();
|
||||
|
||||
let effective_url = service.get_effective_server_url();
|
||||
assert_eq!(effective_url, "http://nas.example.com");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_working_protocol_initially_none() {
|
||||
let config = create_test_config_without_protocol();
|
||||
let service = WebDAVService::new(config).unwrap();
|
||||
|
||||
// Initially, no working protocol should be detected
|
||||
assert!(service.get_working_protocol().is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_is_connection_error_detection() {
|
||||
let config = create_test_config_without_protocol();
|
||||
let service = WebDAVService::new(config).unwrap();
|
||||
|
||||
// Test various connection error patterns
|
||||
let connection_errors = vec![
|
||||
anyhow::anyhow!("connection refused"),
|
||||
anyhow::anyhow!("timeout occurred"),
|
||||
anyhow::anyhow!("DNS resolution failed"),
|
||||
anyhow::anyhow!("TLS handshake failed"),
|
||||
anyhow::anyhow!("SSL certificate error"),
|
||||
];
|
||||
|
||||
for error in connection_errors {
|
||||
assert!(service.is_connection_error(&error), "Should detect '{}' as connection error", error);
|
||||
}
|
||||
|
||||
// Test non-connection errors
|
||||
let non_connection_errors = vec![
|
||||
anyhow::anyhow!("401 Unauthorized"),
|
||||
anyhow::anyhow!("403 Forbidden"),
|
||||
anyhow::anyhow!("invalid credentials"),
|
||||
];
|
||||
|
||||
for error in non_connection_errors {
|
||||
assert!(!service.is_connection_error(&error), "Should NOT detect '{}' as connection error", error);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_config_validation_rejects_empty_url() {
|
||||
let mut config = create_test_config_without_protocol();
|
||||
config.server_url = "".to_string();
|
||||
|
||||
assert!(config.validate().is_err());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_config_validation_rejects_invalid_url() {
|
||||
let mut config = create_test_config_without_protocol();
|
||||
config.server_url = "http://https://invalid".to_string();
|
||||
|
||||
assert!(config.validate().is_err());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_webdav_fallback_urls_use_normalized_url() {
|
||||
let config = create_test_config_without_protocol();
|
||||
let fallback_urls = config.webdav_fallback_urls();
|
||||
|
||||
// All fallback URLs should start with https:// (normalized)
|
||||
for url in fallback_urls {
|
||||
assert!(url.starts_with("https://"), "Fallback URL should be normalized: {}", url);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_backward_compatibility_with_existing_protocols() {
|
||||
// Existing URLs with protocols should work unchanged
|
||||
let https_config = create_test_config_with_https();
|
||||
let http_config = create_test_config_with_http();
|
||||
|
||||
let https_service = WebDAVService::new(https_config).unwrap();
|
||||
let http_service = WebDAVService::new(http_config).unwrap();
|
||||
|
||||
assert_eq!(https_service.get_effective_server_url(), "https://nas.example.com");
|
||||
assert_eq!(http_service.get_effective_server_url(), "http://nas.example.com");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_url_construction_with_protocol_detection() {
|
||||
let config = create_test_config_without_protocol();
|
||||
let service = WebDAVService::new(config).unwrap();
|
||||
|
||||
// Test URL construction for different paths
|
||||
let test_paths = vec![
|
||||
"/Documents/file.pdf",
|
||||
"Photos/image.jpg",
|
||||
"/",
|
||||
"",
|
||||
];
|
||||
|
||||
for path in test_paths {
|
||||
let url = service.get_url_for_path(path);
|
||||
// Should start with https:// (normalized default)
|
||||
assert!(url.starts_with("https://"), "URL should be normalized for path '{}': {}", path, url);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -14,6 +14,7 @@ use crate::models::{
|
|||
WebDAVFolderInfo,
|
||||
};
|
||||
use crate::webdav_xml_parser::{parse_propfind_response, parse_propfind_response_with_directories};
|
||||
use crate::mime_detection::{detect_mime_from_content, update_mime_type_with_content, MimeDetectionResult};
|
||||
|
||||
use super::{config::{WebDAVConfig, RetryConfig, ConcurrencyConfig}, SyncProgress};
|
||||
|
||||
|
|
@ -24,6 +25,15 @@ pub struct WebDAVDiscoveryResult {
|
|||
pub directories: Vec<FileIngestionInfo>,
|
||||
}
|
||||
|
||||
/// Result of downloading a file with MIME type detection
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct WebDAVDownloadResult {
|
||||
pub content: Vec<u8>,
|
||||
pub file_info: FileIngestionInfo,
|
||||
pub mime_detection: MimeDetectionResult,
|
||||
pub mime_type_updated: bool,
|
||||
}
|
||||
|
||||
/// Server capabilities information
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ServerCapabilities {
|
||||
|
|
@ -135,6 +145,8 @@ pub struct WebDAVService {
|
|||
concurrency_config: ConcurrencyConfig,
|
||||
scan_semaphore: Arc<Semaphore>,
|
||||
download_semaphore: Arc<Semaphore>,
|
||||
/// Stores the working protocol (updated after successful protocol detection)
|
||||
working_protocol: Arc<std::sync::RwLock<Option<String>>>,
|
||||
}
|
||||
|
||||
impl WebDAVService {
|
||||
|
|
@ -173,9 +185,156 @@ impl WebDAVService {
|
|||
concurrency_config,
|
||||
scan_semaphore,
|
||||
download_semaphore,
|
||||
working_protocol: Arc::new(std::sync::RwLock::new(None)),
|
||||
})
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Protocol Detection Methods
|
||||
// ============================================================================
|
||||
|
||||
/// Detects the working protocol by trying HTTPS first, then HTTP
|
||||
/// This method handles smart protocol detection for URLs without explicit protocols
|
||||
async fn detect_working_protocol(&self) -> Result<String> {
|
||||
info!("🔍 Starting smart protocol detection for: {}", self.config.server_url);
|
||||
|
||||
// If URL already has a protocol, use it directly
|
||||
if self.config.server_url.starts_with("http://") || self.config.server_url.starts_with("https://") {
|
||||
let protocol = if self.config.server_url.starts_with("https://") { "https" } else { "http" };
|
||||
info!("✅ Protocol already specified: {}", protocol);
|
||||
return Ok(protocol.to_string());
|
||||
}
|
||||
|
||||
// Try HTTPS first (more secure default)
|
||||
let https_url = format!("https://{}", self.config.server_url.trim());
|
||||
info!("🔐 Trying HTTPS first: {}", https_url);
|
||||
|
||||
match self.test_protocol_connection(&https_url).await {
|
||||
Ok(()) => {
|
||||
info!("✅ HTTPS connection successful");
|
||||
// Store the working protocol for future use
|
||||
if let Ok(mut working_protocol) = self.working_protocol.write() {
|
||||
*working_protocol = Some("https".to_string());
|
||||
}
|
||||
return Ok("https".to_string());
|
||||
}
|
||||
Err(https_error) => {
|
||||
warn!("❌ HTTPS connection failed: {}", https_error);
|
||||
|
||||
// Check if this is a connection-related error (not auth error)
|
||||
if self.is_connection_error(&https_error) {
|
||||
info!("🔄 HTTPS failed with connection error, trying HTTP fallback");
|
||||
|
||||
// Try HTTP fallback
|
||||
let http_url = format!("http://{}", self.config.server_url.trim());
|
||||
info!("🔓 Trying HTTP fallback: {}", http_url);
|
||||
|
||||
match self.test_protocol_connection(&http_url).await {
|
||||
Ok(()) => {
|
||||
warn!("⚠️ HTTP connection successful - consider configuring HTTPS for security");
|
||||
// Store the working protocol for future use
|
||||
if let Ok(mut working_protocol) = self.working_protocol.write() {
|
||||
*working_protocol = Some("http".to_string());
|
||||
}
|
||||
return Ok("http".to_string());
|
||||
}
|
||||
Err(http_error) => {
|
||||
error!("❌ Both HTTPS and HTTP failed");
|
||||
error!(" HTTPS error: {}", https_error);
|
||||
error!(" HTTP error: {}", http_error);
|
||||
return Err(anyhow!(
|
||||
"Protocol detection failed. Both HTTPS and HTTP connections failed. \
|
||||
HTTPS error: {}. HTTP error: {}. \
|
||||
Please verify the server URL and ensure WebDAV is properly configured.",
|
||||
https_error, http_error
|
||||
));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Auth or other non-connection error with HTTPS - don't try HTTP
|
||||
error!("❌ HTTPS failed with non-connection error (likely auth or server config): {}", https_error);
|
||||
return Err(anyhow!(
|
||||
"HTTPS connection failed with authentication or server configuration error: {}. \
|
||||
Please check your credentials and server settings.",
|
||||
https_error
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Tests connection with a specific protocol URL
|
||||
async fn test_protocol_connection(&self, full_url: &str) -> Result<()> {
|
||||
debug!("🧪 Testing protocol connection to: {}", full_url);
|
||||
|
||||
// Create a temporary config with the full URL for testing
|
||||
let temp_config = WebDAVConfig {
|
||||
server_url: full_url.to_string(),
|
||||
username: self.config.username.clone(),
|
||||
password: self.config.password.clone(),
|
||||
watch_folders: self.config.watch_folders.clone(),
|
||||
file_extensions: self.config.file_extensions.clone(),
|
||||
timeout_seconds: self.config.timeout_seconds,
|
||||
server_type: self.config.server_type.clone(),
|
||||
};
|
||||
|
||||
// Test basic OPTIONS request
|
||||
let webdav_url = temp_config.webdav_url();
|
||||
debug!("📍 Testing WebDAV URL: {}", webdav_url);
|
||||
|
||||
let response = self.client
|
||||
.request(Method::OPTIONS, &webdav_url)
|
||||
.basic_auth(&self.config.username, Some(&self.config.password))
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| anyhow!("Connection failed: {}", e))?;
|
||||
|
||||
if !response.status().is_success() {
|
||||
return Err(anyhow!(
|
||||
"Protocol test failed with status: {} - {}",
|
||||
response.status(),
|
||||
response.text().await.unwrap_or_default()
|
||||
));
|
||||
}
|
||||
|
||||
debug!("✅ Protocol connection test successful");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Determines if an error is connection-related (vs auth or other errors)
|
||||
pub fn is_connection_error(&self, error: &anyhow::Error) -> bool {
|
||||
let error_str = error.to_string().to_lowercase();
|
||||
|
||||
// Connection-related errors that suggest trying different protocol
|
||||
error_str.contains("connection refused") ||
|
||||
error_str.contains("timeout") ||
|
||||
error_str.contains("dns") ||
|
||||
error_str.contains("network") ||
|
||||
error_str.contains("unreachable") ||
|
||||
error_str.contains("tls") ||
|
||||
error_str.contains("ssl") ||
|
||||
error_str.contains("certificate") ||
|
||||
error_str.contains("handshake")
|
||||
}
|
||||
|
||||
/// Gets the currently working protocol (if detected)
|
||||
pub fn get_working_protocol(&self) -> Option<String> {
|
||||
self.working_protocol.read().ok().and_then(|p| p.clone())
|
||||
}
|
||||
|
||||
/// Gets the effective server URL with the working protocol
|
||||
pub fn get_effective_server_url(&self) -> String {
|
||||
// If we have a detected working protocol, use it
|
||||
if let Some(protocol) = self.get_working_protocol() {
|
||||
if !self.config.server_url.starts_with("http://") && !self.config.server_url.starts_with("https://") {
|
||||
return format!("{}://{}", protocol, self.config.server_url.trim());
|
||||
}
|
||||
}
|
||||
|
||||
// Otherwise use the configured URL (normalized)
|
||||
WebDAVConfig::normalize_server_url(&self.config.server_url)
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Connection and Testing Methods
|
||||
// ============================================================================
|
||||
|
|
@ -194,13 +353,31 @@ impl WebDAVService {
|
|||
});
|
||||
}
|
||||
|
||||
// Test basic connectivity with OPTIONS request
|
||||
// Perform protocol detection if needed
|
||||
let working_protocol = match self.detect_working_protocol().await {
|
||||
Ok(protocol) => {
|
||||
info!("✅ Protocol detection successful: {}", protocol);
|
||||
protocol
|
||||
}
|
||||
Err(e) => {
|
||||
error!("❌ Protocol detection failed: {}", e);
|
||||
return Ok(WebDAVConnectionResult {
|
||||
success: false,
|
||||
message: format!("Protocol detection failed: {}", e),
|
||||
server_version: None,
|
||||
server_type: None,
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
// Test basic connectivity with OPTIONS request using detected protocol
|
||||
match self.test_options_request().await {
|
||||
Ok((server_version, server_type)) => {
|
||||
info!("✅ WebDAV connection successful");
|
||||
let effective_url = self.get_effective_server_url();
|
||||
info!("✅ WebDAV connection successful using {} ({})", working_protocol.to_uppercase(), effective_url);
|
||||
Ok(WebDAVConnectionResult {
|
||||
success: true,
|
||||
message: "Connection successful".to_string(),
|
||||
message: format!("Connection successful using {}", working_protocol.to_uppercase()),
|
||||
server_version,
|
||||
server_type,
|
||||
})
|
||||
|
|
@ -235,7 +412,18 @@ impl WebDAVService {
|
|||
|
||||
/// Performs OPTIONS request to test basic connectivity
|
||||
async fn test_options_request(&self) -> Result<(Option<String>, Option<String>)> {
|
||||
let webdav_url = self.config.webdav_url();
|
||||
// Create a temporary config with the effective server URL for WebDAV operations
|
||||
let effective_server_url = self.get_effective_server_url();
|
||||
let temp_config = WebDAVConfig {
|
||||
server_url: effective_server_url,
|
||||
username: self.config.username.clone(),
|
||||
password: self.config.password.clone(),
|
||||
watch_folders: self.config.watch_folders.clone(),
|
||||
file_extensions: self.config.file_extensions.clone(),
|
||||
timeout_seconds: self.config.timeout_seconds,
|
||||
server_type: self.config.server_type.clone(),
|
||||
};
|
||||
let webdav_url = temp_config.webdav_url();
|
||||
|
||||
let response = self.client
|
||||
.request(Method::OPTIONS, &webdav_url)
|
||||
|
|
@ -304,8 +492,9 @@ impl WebDAVService {
|
|||
|
||||
/// Tests for Nextcloud-specific capabilities
|
||||
async fn test_nextcloud_capabilities(&self) -> Result<()> {
|
||||
let effective_server_url = self.get_effective_server_url();
|
||||
let capabilities_url = format!("{}/ocs/v1.php/cloud/capabilities",
|
||||
self.config.server_url.trim_end_matches('/'));
|
||||
effective_server_url.trim_end_matches('/'));
|
||||
|
||||
let response = self.client
|
||||
.get(&capabilities_url)
|
||||
|
|
@ -592,7 +781,18 @@ impl WebDAVService {
|
|||
|
||||
/// Gets the WebDAV URL for a specific path
|
||||
pub fn get_url_for_path(&self, path: &str) -> String {
|
||||
let base_url = self.config.webdav_url();
|
||||
// Create a temporary config with the effective server URL
|
||||
let effective_server_url = self.get_effective_server_url();
|
||||
let temp_config = WebDAVConfig {
|
||||
server_url: effective_server_url,
|
||||
username: self.config.username.clone(),
|
||||
password: self.config.password.clone(),
|
||||
watch_folders: self.config.watch_folders.clone(),
|
||||
file_extensions: self.config.file_extensions.clone(),
|
||||
timeout_seconds: self.config.timeout_seconds,
|
||||
server_type: self.config.server_type.clone(),
|
||||
};
|
||||
let base_url = temp_config.webdav_url();
|
||||
let clean_path = path.trim_start_matches('/');
|
||||
|
||||
let final_url = if clean_path.is_empty() {
|
||||
|
|
@ -652,7 +852,18 @@ impl WebDAVService {
|
|||
/// Convert file paths to the proper URL format for the server
|
||||
pub fn path_to_url(&self, relative_path: &str) -> String {
|
||||
let clean_path = relative_path.trim_start_matches('/');
|
||||
let base_url = self.config.webdav_url();
|
||||
// Create a temporary config with the effective server URL
|
||||
let effective_server_url = self.get_effective_server_url();
|
||||
let temp_config = WebDAVConfig {
|
||||
server_url: effective_server_url,
|
||||
username: self.config.username.clone(),
|
||||
password: self.config.password.clone(),
|
||||
watch_folders: self.config.watch_folders.clone(),
|
||||
file_extensions: self.config.file_extensions.clone(),
|
||||
timeout_seconds: self.config.timeout_seconds,
|
||||
server_type: self.config.server_type.clone(),
|
||||
};
|
||||
let base_url = temp_config.webdav_url();
|
||||
|
||||
if clean_path.is_empty() {
|
||||
base_url
|
||||
|
|
@ -777,42 +988,64 @@ impl WebDAVService {
|
|||
async fn discover_files_recursive(&self, directory_path: &str) -> Result<Vec<FileIngestionInfo>> {
|
||||
let mut all_files = Vec::new();
|
||||
let mut directories_to_scan = vec![directory_path.to_string()];
|
||||
let mut scanned_directories = std::collections::HashSet::new();
|
||||
let semaphore = Arc::new(Semaphore::new(self.concurrency_config.max_concurrent_scans));
|
||||
|
||||
debug!("Starting recursive file scan from: {}", directory_path);
|
||||
|
||||
while !directories_to_scan.is_empty() {
|
||||
let current_directories = directories_to_scan.clone();
|
||||
directories_to_scan.clear();
|
||||
// Take a batch of directories to process
|
||||
let batch_size = std::cmp::min(directories_to_scan.len(), self.concurrency_config.max_concurrent_scans);
|
||||
let current_batch: Vec<String> = directories_to_scan.drain(..batch_size).collect();
|
||||
|
||||
debug!("Processing batch of {} directories, {} remaining in queue",
|
||||
current_batch.len(), directories_to_scan.len());
|
||||
|
||||
// Process directories concurrently
|
||||
let tasks = current_directories.into_iter().map(|dir| {
|
||||
let tasks = current_batch.into_iter().filter_map(|dir| {
|
||||
// Skip if already scanned
|
||||
if scanned_directories.contains(&dir) {
|
||||
debug!("Skipping already scanned directory: {}", dir);
|
||||
return None;
|
||||
}
|
||||
scanned_directories.insert(dir.clone());
|
||||
|
||||
let permit = semaphore.clone();
|
||||
let service = self.clone();
|
||||
|
||||
async move {
|
||||
Some(async move {
|
||||
let _permit = permit.acquire().await.unwrap();
|
||||
service.discover_files_and_directories_single(&dir).await
|
||||
}
|
||||
let result = service.discover_files_and_directories_single(&dir).await;
|
||||
(dir, result)
|
||||
})
|
||||
});
|
||||
|
||||
let results = futures_util::future::join_all(tasks).await;
|
||||
|
||||
for result in results {
|
||||
for (scanned_dir, result) in results {
|
||||
match result {
|
||||
Ok(discovery_result) => {
|
||||
debug!("Directory '{}' scan complete: {} files, {} subdirectories",
|
||||
scanned_dir, discovery_result.files.len(), discovery_result.directories.len());
|
||||
|
||||
all_files.extend(discovery_result.files);
|
||||
|
||||
// Add subdirectories to the queue for the next iteration
|
||||
for dir in discovery_result.directories {
|
||||
if dir.is_directory {
|
||||
directories_to_scan.push(dir.relative_path);
|
||||
if dir.is_directory && !scanned_directories.contains(&dir.relative_path) {
|
||||
directories_to_scan.push(dir.relative_path.clone());
|
||||
debug!("Added subdirectory to scan queue: {}", dir.relative_path);
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Failed to scan directory: {}", e);
|
||||
warn!("Failed to scan directory '{}': {}", scanned_dir, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
debug!("Batch complete. Total files found: {}. Queue size: {}",
|
||||
all_files.len(), directories_to_scan.len());
|
||||
}
|
||||
|
||||
info!("Recursive scan completed. Found {} files total", all_files.len());
|
||||
|
|
@ -908,12 +1141,19 @@ impl WebDAVService {
|
|||
let body = response.text().await?;
|
||||
let all_items = parse_propfind_response_with_directories(&body)?;
|
||||
|
||||
// Process the items to convert href to relative paths
|
||||
let processed_items = self.process_file_infos(all_items);
|
||||
|
||||
// Separate files and directories, excluding the parent directory itself
|
||||
let mut files = Vec::new();
|
||||
let mut directories = Vec::new();
|
||||
|
||||
for item in all_items {
|
||||
if item.relative_path == directory_path {
|
||||
for item in processed_items {
|
||||
// Skip the directory itself (handle both with and without trailing slash)
|
||||
let normalized_item_path = item.relative_path.trim_end_matches('/');
|
||||
let normalized_directory_path = directory_path.trim_end_matches('/');
|
||||
|
||||
if normalized_item_path == normalized_directory_path {
|
||||
continue; // Skip the directory itself
|
||||
}
|
||||
|
||||
|
|
@ -933,41 +1173,69 @@ impl WebDAVService {
|
|||
let mut all_files = Vec::new();
|
||||
let mut all_directories = Vec::new();
|
||||
let mut directories_to_scan = vec![directory_path.to_string()];
|
||||
let mut scanned_directories = std::collections::HashSet::new();
|
||||
let semaphore = Arc::new(Semaphore::new(self.concurrency_config.max_concurrent_scans));
|
||||
|
||||
debug!("Starting recursive scan from: {}", directory_path);
|
||||
|
||||
while !directories_to_scan.is_empty() {
|
||||
let current_directories = directories_to_scan.clone();
|
||||
directories_to_scan.clear();
|
||||
// Take a batch of directories to process (limit batch size for better progress tracking)
|
||||
let batch_size = std::cmp::min(directories_to_scan.len(), self.concurrency_config.max_concurrent_scans);
|
||||
let current_batch: Vec<String> = directories_to_scan.drain(..batch_size).collect();
|
||||
|
||||
debug!("Processing batch of {} directories, {} remaining in queue",
|
||||
current_batch.len(), directories_to_scan.len());
|
||||
|
||||
// Process directories concurrently
|
||||
let tasks = current_directories.into_iter().map(|dir| {
|
||||
let tasks = current_batch.into_iter().filter_map(|dir| {
|
||||
// Skip if already scanned (prevent infinite loops)
|
||||
if scanned_directories.contains(&dir) {
|
||||
debug!("Skipping already scanned directory: {}", dir);
|
||||
return None;
|
||||
}
|
||||
scanned_directories.insert(dir.clone());
|
||||
|
||||
let permit = semaphore.clone();
|
||||
let service = self.clone();
|
||||
|
||||
async move {
|
||||
Some(async move {
|
||||
let _permit = permit.acquire().await.unwrap();
|
||||
service.discover_files_and_directories_single(&dir).await
|
||||
}
|
||||
let result = service.discover_files_and_directories_single(&dir).await;
|
||||
(dir, result)
|
||||
})
|
||||
});
|
||||
|
||||
let results = futures_util::future::join_all(tasks).await;
|
||||
|
||||
for result in results {
|
||||
for (scanned_dir, result) in results {
|
||||
match result {
|
||||
Ok(discovery_result) => {
|
||||
debug!("Directory '{}' scan complete: {} files, {} subdirectories",
|
||||
scanned_dir, discovery_result.files.len(), discovery_result.directories.len());
|
||||
|
||||
all_files.extend(discovery_result.files);
|
||||
|
||||
// Add directories to our results and to the scan queue
|
||||
for dir in discovery_result.directories {
|
||||
// Only add to scan queue if not already scanned
|
||||
if !scanned_directories.contains(&dir.relative_path) {
|
||||
directories_to_scan.push(dir.relative_path.clone());
|
||||
debug!("Added subdirectory to scan queue: {} (scanned set size: {})",
|
||||
dir.relative_path, scanned_directories.len());
|
||||
} else {
|
||||
debug!("Skipping already scanned directory: {} (already in scanned set)", dir.relative_path);
|
||||
}
|
||||
all_directories.push(dir);
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Failed to scan directory: {}", e);
|
||||
warn!("Failed to scan directory '{}': {}", scanned_dir, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
debug!("Batch complete. Total progress: {} files, {} directories found. Queue size: {}",
|
||||
all_files.len(), all_directories.len(), directories_to_scan.len());
|
||||
}
|
||||
|
||||
info!("Recursive scan completed. Found {} files and {} directories", all_files.len(), all_directories.len());
|
||||
|
|
@ -1172,6 +1440,131 @@ impl WebDAVService {
|
|||
Ok(results)
|
||||
}
|
||||
|
||||
/// Downloads a file with enhanced MIME type detection based on content
|
||||
///
|
||||
/// This method downloads the file and performs content-based MIME type detection
|
||||
/// using magic bytes, providing more accurate type identification than the initial
|
||||
/// discovery phase which only has access to filenames and server-provided types.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `file_info` - The file information from WebDAV discovery
|
||||
///
|
||||
/// # Returns
|
||||
/// A `WebDAVDownloadResult` containing the file content, updated file info, and MIME detection details
|
||||
pub async fn download_file_with_mime_detection(&self, file_info: &FileIngestionInfo) -> Result<WebDAVDownloadResult> {
|
||||
let _permit = self.download_semaphore.acquire().await?;
|
||||
|
||||
debug!("⬇️🔍 Downloading file with MIME detection: {}", file_info.relative_path);
|
||||
|
||||
// Use the relative path directly since it's already processed
|
||||
let relative_path = &file_info.relative_path;
|
||||
let url = self.get_url_for_path(&relative_path);
|
||||
|
||||
let response = self.authenticated_request(
|
||||
reqwest::Method::GET,
|
||||
&url,
|
||||
None,
|
||||
None,
|
||||
).await?;
|
||||
|
||||
if !response.status().is_success() {
|
||||
return Err(anyhow!(
|
||||
"Failed to download file '{}': HTTP {}",
|
||||
file_info.relative_path,
|
||||
response.status()
|
||||
));
|
||||
}
|
||||
|
||||
// Get server-provided content type from response headers
|
||||
let server_content_type = response
|
||||
.headers()
|
||||
.get("content-type")
|
||||
.and_then(|header| header.to_str().ok())
|
||||
.map(|s| s.split(';').next().unwrap_or(s).trim().to_string()); // Remove charset info and convert to owned
|
||||
|
||||
let content = response.bytes().await?;
|
||||
debug!("✅ Downloaded {} bytes for file: {}", content.len(), file_info.relative_path);
|
||||
|
||||
// Perform content-based MIME type detection
|
||||
let mime_detection_result = detect_mime_from_content(
|
||||
&content,
|
||||
&file_info.name,
|
||||
server_content_type.as_deref()
|
||||
);
|
||||
|
||||
// Check if MIME type should be updated
|
||||
let mime_type_updated = mime_detection_result.mime_type != file_info.mime_type;
|
||||
|
||||
if mime_type_updated {
|
||||
debug!("🔄 MIME type updated for {}: '{}' -> '{}' (method: {:?}, confidence: {:?})",
|
||||
file_info.name,
|
||||
file_info.mime_type,
|
||||
mime_detection_result.mime_type,
|
||||
mime_detection_result.detection_method,
|
||||
mime_detection_result.confidence);
|
||||
} else {
|
||||
debug!("✅ MIME type confirmed for {}: '{}' (method: {:?}, confidence: {:?})",
|
||||
file_info.name,
|
||||
mime_detection_result.mime_type,
|
||||
mime_detection_result.detection_method,
|
||||
mime_detection_result.confidence);
|
||||
}
|
||||
|
||||
// Create updated file info if MIME type changed
|
||||
let updated_file_info = if mime_type_updated {
|
||||
let mut updated = file_info.clone();
|
||||
updated.mime_type = mime_detection_result.mime_type.clone();
|
||||
updated
|
||||
} else {
|
||||
file_info.clone()
|
||||
};
|
||||
|
||||
Ok(WebDAVDownloadResult {
|
||||
content: content.to_vec(),
|
||||
file_info: updated_file_info,
|
||||
mime_detection: mime_detection_result,
|
||||
mime_type_updated,
|
||||
})
|
||||
}
|
||||
|
||||
/// Downloads multiple files with MIME type detection concurrently
|
||||
///
|
||||
/// Similar to `download_files` but includes content-based MIME type detection
|
||||
/// for each downloaded file.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `files` - The files to download
|
||||
///
|
||||
/// # Returns
|
||||
/// A vector of tuples containing the original file info and download result
|
||||
pub async fn download_files_with_mime_detection(&self, files: &[FileIngestionInfo]) -> Result<Vec<(FileIngestionInfo, Result<WebDAVDownloadResult>)>> {
|
||||
info!("⬇️🔍 Downloading {} files with MIME detection concurrently", files.len());
|
||||
|
||||
let tasks = files.iter().map(|file| {
|
||||
let file_clone = file.clone();
|
||||
let service_clone = self.clone();
|
||||
|
||||
async move {
|
||||
let result = service_clone.download_file_with_mime_detection(&file_clone).await;
|
||||
(file_clone, result)
|
||||
}
|
||||
});
|
||||
|
||||
let results = futures_util::future::join_all(tasks).await;
|
||||
|
||||
let success_count = results.iter().filter(|(_, result)| result.is_ok()).count();
|
||||
let failure_count = results.len() - success_count;
|
||||
let mime_updated_count = results.iter()
|
||||
.filter_map(|(_, result)| result.as_ref().ok())
|
||||
.filter(|download_result| download_result.mime_type_updated)
|
||||
.count();
|
||||
|
||||
info!("📊 Download with MIME detection completed: {} successful, {} failed, {} MIME types updated",
|
||||
success_count, failure_count, mime_updated_count);
|
||||
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
/// Gets file metadata without downloading content
|
||||
pub async fn get_file_metadata(&self, file_path: &str) -> Result<FileIngestionInfo> {
|
||||
debug!("📋 Getting metadata for file: {}", file_path);
|
||||
|
|
@ -1226,9 +1619,21 @@ impl WebDAVService {
|
|||
pub async fn get_server_capabilities(&self) -> Result<ServerCapabilities> {
|
||||
debug!("🔍 Checking server capabilities");
|
||||
|
||||
// Create a temporary config with the effective server URL
|
||||
let effective_server_url = self.get_effective_server_url();
|
||||
let temp_config = WebDAVConfig {
|
||||
server_url: effective_server_url,
|
||||
username: self.config.username.clone(),
|
||||
password: self.config.password.clone(),
|
||||
watch_folders: self.config.watch_folders.clone(),
|
||||
file_extensions: self.config.file_extensions.clone(),
|
||||
timeout_seconds: self.config.timeout_seconds,
|
||||
server_type: self.config.server_type.clone(),
|
||||
};
|
||||
|
||||
let options_response = self.authenticated_request(
|
||||
reqwest::Method::OPTIONS,
|
||||
&self.config.webdav_url(),
|
||||
&temp_config.webdav_url(),
|
||||
None,
|
||||
None,
|
||||
).await?;
|
||||
|
|
@ -1550,6 +1955,7 @@ impl WebDAVService {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
// Implement Clone to allow sharing the service
|
||||
impl Clone for WebDAVService {
|
||||
fn clone(&self) -> Self {
|
||||
|
|
@ -1560,6 +1966,7 @@ impl Clone for WebDAVService {
|
|||
concurrency_config: self.concurrency_config.clone(),
|
||||
scan_semaphore: Arc::clone(&self.scan_semaphore),
|
||||
download_semaphore: Arc::clone(&self.download_semaphore),
|
||||
working_protocol: Arc::clone(&self.working_protocol),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,2 +1,3 @@
|
|||
pub mod etag_comparison_tests;
|
||||
pub mod deletion_detection_tests;
|
||||
pub mod path_processing_tests;
|
||||
|
|
@ -0,0 +1,452 @@
|
|||
#[cfg(test)]
mod path_processing_tests {
    //! Tests for WebDAV path processing.
    //!
    //! The XML parser deliberately sets `relative_path` to the placeholder
    //! string `"TEMP"`; the discovery layer is responsible for rewriting it
    //! into a real relative path derived from the href. These tests document
    //! that contract and guard against a regression where `"TEMP"` leaked
    //! through to callers.

    use crate::models::FileIngestionInfo;
    use crate::services::webdav::{WebDAVConfig, WebDAVService};
    use crate::webdav_xml_parser::parse_propfind_response_with_directories;
    use wiremock::{
        matchers::{method, path, header},
        Mock, MockServer, ResponseTemplate,
    };

    /// Creates a test WebDAV service with mock server.
    ///
    /// The config mimics a Nextcloud server, so the service builds URLs with
    /// the `/remote.php/dav/files/<user>` prefix used by the fixtures below.
    fn create_test_service(mock_server_url: &str) -> WebDAVService {
        let config = WebDAVConfig {
            server_url: mock_server_url.to_string(),
            username: "testuser".to_string(),
            password: "testpass".to_string(),
            watch_folders: vec!["/TestDocuments".to_string()],
            file_extensions: vec!["pdf".to_string(), "txt".to_string()],
            timeout_seconds: 30,
            server_type: Some("nextcloud".to_string()),
        };
        WebDAVService::new(config).expect("Failed to create test service")
    }

    /// Mock WebDAV PROPFIND response with directories and files.
    ///
    /// Contains the parent collection, two subdirectory collections, and one
    /// PDF file (4 `<d:response>` entries total).
    fn mock_propfind_response() -> String {
        r#"<?xml version="1.0"?>
<d:multistatus xmlns:d="DAV:" xmlns:s="http://sabredav.org/ns" xmlns:oc="http://owncloud.org/ns" xmlns:nc="http://nextcloud.org/ns">
    <d:response>
        <d:href>/remote.php/dav/files/testuser/TestDocuments/</d:href>
        <d:propstat>
            <d:prop>
                <d:displayname>TestDocuments</d:displayname>
                <d:getlastmodified>Tue, 29 Jul 2025 01:34:17 GMT</d:getlastmodified>
                <d:getetag>"parent123etag"</d:getetag>
                <d:resourcetype><d:collection/></d:resourcetype>
            </d:prop>
            <d:status>HTTP/1.1 200 OK</d:status>
        </d:propstat>
    </d:response>
    <d:response>
        <d:href>/remote.php/dav/files/testuser/TestDocuments/SubDir1/</d:href>
        <d:propstat>
            <d:prop>
                <d:displayname>SubDir1</d:displayname>
                <d:getlastmodified>Fri, 20 Jun 2025 23:35:17 GMT</d:getlastmodified>
                <d:getetag>"subdir1etag"</d:getetag>
                <d:resourcetype><d:collection/></d:resourcetype>
            </d:prop>
            <d:status>HTTP/1.1 200 OK</d:status>
        </d:propstat>
    </d:response>
    <d:response>
        <d:href>/remote.php/dav/files/testuser/TestDocuments/SubDir2/</d:href>
        <d:propstat>
            <d:prop>
                <d:displayname>SubDir2</d:displayname>
                <d:getlastmodified>Tue, 29 Jul 2025 01:34:17 GMT</d:getlastmodified>
                <d:getetag>"subdir2etag"</d:getetag>
                <d:resourcetype><d:collection/></d:resourcetype>
            </d:prop>
            <d:status>HTTP/1.1 200 OK</d:status>
        </d:propstat>
    </d:response>
    <d:response>
        <d:href>/remote.php/dav/files/testuser/TestDocuments/test.pdf</d:href>
        <d:propstat>
            <d:prop>
                <d:displayname>test.pdf</d:displayname>
                <d:getlastmodified>Thu, 24 Jul 2025 19:16:19 GMT</d:getlastmodified>
                <d:getetag>"fileetag123"</d:getetag>
                <d:getcontentlength>1234567</d:getcontentlength>
                <d:resourcetype/>
            </d:prop>
            <d:status>HTTP/1.1 200 OK</d:status>
        </d:propstat>
    </d:response>
</d:multistatus>"#.to_string()
    }

    /// Mock WebDAV response for empty directory.
    ///
    /// A PROPFIND on an empty directory still returns one entry: the
    /// directory itself.
    fn mock_empty_directory_response() -> String {
        r#"<?xml version="1.0"?>
<d:multistatus xmlns:d="DAV:" xmlns:s="http://sabredav.org/ns" xmlns:oc="http://owncloud.org/ns" xmlns:nc="http://nextcloud.org/ns">
    <d:response>
        <d:href>/remote.php/dav/files/testuser/TestDocuments/SubDir1/</d:href>
        <d:propstat>
            <d:prop>
                <d:displayname>SubDir1</d:displayname>
                <d:getlastmodified>Fri, 20 Jun 2025 23:35:17 GMT</d:getlastmodified>
                <d:getetag>"subdir1etag"</d:getetag>
                <d:resourcetype><d:collection/></d:resourcetype>
            </d:prop>
            <d:status>HTTP/1.1 200 OK</d:status>
        </d:propstat>
    </d:response>
</d:multistatus>"#.to_string()
    }

    /// Documents the parser contract: every parsed item has the placeholder
    /// relative path `"TEMP"` until the discovery layer rewrites it.
    #[test]
    fn test_xml_parser_returns_temp_paths() {
        // This test ensures the XML parser behavior is documented
        let xml_response = mock_propfind_response();
        let parsed_items = parse_propfind_response_with_directories(&xml_response)
            .expect("Failed to parse XML response");

        // All parsed items should have relative_path as "TEMP" initially
        for item in &parsed_items {
            assert_eq!(item.relative_path, "TEMP",
                "XML parser should set relative_path to TEMP for processing by discovery layer");
        }

        // Should find the correct number of items
        assert_eq!(parsed_items.len(), 4, "Should parse all 4 items from XML");

        // Verify we get both directories and files
        let directories: Vec<_> = parsed_items.iter().filter(|i| i.is_directory).collect();
        let files: Vec<_> = parsed_items.iter().filter(|i| !i.is_directory).collect();

        assert_eq!(directories.len(), 3, "Should find 3 directories");
        assert_eq!(files.len(), 1, "Should find 1 file");
    }

    /// `process_file_infos` must replace `"TEMP"` with the relative path
    /// derived from the full Nextcloud href, leaving `full_path` untouched.
    #[test]
    fn test_path_processing_converts_temp_to_relative_paths() {
        let service = create_test_service("http://test.example.com");

        // Create mock parsed items with TEMP paths (simulating XML parser output)
        let mock_items = vec![
            FileIngestionInfo {
                relative_path: "TEMP".to_string(),
                full_path: "/remote.php/dav/files/testuser/TestDocuments/".to_string(),
                #[allow(deprecated)]
                path: "/remote.php/dav/files/testuser/TestDocuments/".to_string(),
                name: "TestDocuments".to_string(),
                size: 0,
                mime_type: "application/octet-stream".to_string(),
                last_modified: None,
                etag: "parent123etag".to_string(),
                is_directory: true,
                created_at: None,
                permissions: None,
                owner: None,
                group: None,
                metadata: None,
            },
            FileIngestionInfo {
                relative_path: "TEMP".to_string(),
                full_path: "/remote.php/dav/files/testuser/TestDocuments/SubDir1/".to_string(),
                #[allow(deprecated)]
                path: "/remote.php/dav/files/testuser/TestDocuments/SubDir1/".to_string(),
                name: "SubDir1".to_string(),
                size: 0,
                mime_type: "application/octet-stream".to_string(),
                last_modified: None,
                etag: "subdir1etag".to_string(),
                is_directory: true,
                created_at: None,
                permissions: None,
                owner: None,
                group: None,
                metadata: None,
            },
        ];

        // Process the items
        let processed_items = service.process_file_infos(mock_items);

        // Verify paths are correctly converted
        assert_eq!(processed_items[0].relative_path, "/TestDocuments/");
        assert_eq!(processed_items[1].relative_path, "/TestDocuments/SubDir1/");

        // Verify full_path remains unchanged
        assert_eq!(processed_items[0].full_path, "/remote.php/dav/files/testuser/TestDocuments/");
        assert_eq!(processed_items[1].full_path, "/remote.php/dav/files/testuser/TestDocuments/SubDir1/");
    }

    /// Replays the filtering step of discovery in isolation: the queried
    /// directory itself must be excluded, children kept, regardless of
    /// trailing slashes.
    #[test]
    fn test_directory_filtering_excludes_parent() {
        // Create processed items including parent directory
        let processed_items = vec![
            FileIngestionInfo {
                relative_path: "/TestDocuments/".to_string(),
                full_path: "/remote.php/dav/files/testuser/TestDocuments/".to_string(),
                #[allow(deprecated)]
                path: "/TestDocuments/".to_string(),
                name: "TestDocuments".to_string(),
                size: 0,
                mime_type: "application/octet-stream".to_string(),
                last_modified: None,
                etag: "parent123etag".to_string(),
                is_directory: true,
                created_at: None,
                permissions: None,
                owner: None,
                group: None,
                metadata: None,
            },
            FileIngestionInfo {
                relative_path: "/TestDocuments/SubDir1/".to_string(),
                full_path: "/remote.php/dav/files/testuser/TestDocuments/SubDir1/".to_string(),
                #[allow(deprecated)]
                path: "/TestDocuments/SubDir1/".to_string(),
                name: "SubDir1".to_string(),
                size: 0,
                mime_type: "application/octet-stream".to_string(),
                last_modified: None,
                etag: "subdir1etag".to_string(),
                is_directory: true,
                created_at: None,
                permissions: None,
                owner: None,
                group: None,
                metadata: None,
            },
        ];

        // Simulate the filtering logic from discover_files_and_directories_single_with_url
        let directory_path = "/TestDocuments";
        let mut files = Vec::new();
        let mut directories = Vec::new();

        for item in processed_items {
            // Skip the directory itself (handle both with and without trailing slash)
            let normalized_item_path = item.relative_path.trim_end_matches('/');
            let normalized_directory_path = directory_path.trim_end_matches('/');

            if normalized_item_path == normalized_directory_path {
                continue; // Skip the directory itself
            }

            if item.is_directory {
                directories.push(item);
            } else {
                files.push(item);
            }
        }

        // Should exclude parent directory but include subdirectory
        assert_eq!(files.len(), 0);
        assert_eq!(directories.len(), 1);
        assert_eq!(directories[0].relative_path, "/TestDocuments/SubDir1/");
    }

    /// End-to-end single-level discovery against a wiremock server: paths
    /// come back fully processed (no "TEMP"), parent excluded.
    #[tokio::test]
    async fn test_single_directory_discovery_integration() {
        let mock_server = MockServer::start().await;

        // Mock the PROPFIND request
        Mock::given(method("PROPFIND"))
            .and(path("/remote.php/dav/files/testuser/TestDocuments"))
            .and(header("depth", "1"))
            .and(header("content-type", "application/xml"))
            .respond_with(
                ResponseTemplate::new(207)
                    .set_body_string(mock_propfind_response())
                    .insert_header("content-type", "application/xml")
            )
            .mount(&mock_server)
            .await;

        let service = create_test_service(&mock_server.uri());

        // Test single directory discovery
        let result = service.discover_files_and_directories("/TestDocuments", false).await
            .expect("Single directory discovery should succeed");

        // Verify results
        assert_eq!(result.files.len(), 1, "Should find 1 file");
        assert_eq!(result.directories.len(), 2, "Should find 2 directories (excluding parent)");

        // Verify directory paths are correct (not TEMP)
        let dir_paths: Vec<&String> = result.directories.iter().map(|d| &d.relative_path).collect();
        assert!(dir_paths.contains(&&"/TestDocuments/SubDir1/".to_string()));
        assert!(dir_paths.contains(&&"/TestDocuments/SubDir2/".to_string()));

        // Verify no directory has TEMP path
        for dir in &result.directories {
            assert_ne!(dir.relative_path, "TEMP", "Directory path should not be TEMP");
        }

        // Verify file path is correct
        assert_eq!(result.files[0].relative_path, "/TestDocuments/test.pdf");
        assert_ne!(result.files[0].relative_path, "TEMP", "File path should not be TEMP");
    }

    /// Recursive discovery: root PROPFIND plus one PROPFIND per discovered
    /// subdirectory (each mocked as empty here).
    #[tokio::test]
    async fn test_recursive_directory_discovery_integration() {
        let mock_server = MockServer::start().await;

        // Mock the initial PROPFIND request for root directory
        Mock::given(method("PROPFIND"))
            .and(path("/remote.php/dav/files/testuser/TestDocuments"))
            .and(header("depth", "1"))
            .and(header("content-type", "application/xml"))
            .respond_with(
                ResponseTemplate::new(207)
                    .set_body_string(mock_propfind_response())
                    .insert_header("content-type", "application/xml")
            )
            .mount(&mock_server)
            .await;

        // Mock PROPFIND requests for subdirectories (return empty for simplicity)
        Mock::given(method("PROPFIND"))
            .and(path("/remote.php/dav/files/testuser/TestDocuments/SubDir1"))
            .and(header("depth", "1"))
            .and(header("content-type", "application/xml"))
            .respond_with(
                ResponseTemplate::new(207)
                    .set_body_string(mock_empty_directory_response())
                    .insert_header("content-type", "application/xml")
            )
            .mount(&mock_server)
            .await;

        Mock::given(method("PROPFIND"))
            .and(path("/remote.php/dav/files/testuser/TestDocuments/SubDir2"))
            .and(header("depth", "1"))
            .and(header("content-type", "application/xml"))
            .respond_with(
                ResponseTemplate::new(207)
                    .set_body_string(mock_empty_directory_response())
                    .insert_header("content-type", "application/xml")
            )
            .mount(&mock_server)
            .await;

        let service = create_test_service(&mock_server.uri());

        // Test recursive directory discovery
        let result = service.discover_files_and_directories("/TestDocuments", true).await
            .expect("Recursive directory discovery should succeed");

        // Verify results
        assert_eq!(result.files.len(), 1, "Should find 1 file");
        assert_eq!(result.directories.len(), 2, "Should find 2 directories (excluding parents)");

        // Verify no paths are TEMP
        for item in result.files.iter().chain(result.directories.iter()) {
            assert_ne!(item.relative_path, "TEMP", "Paths should be processed, not TEMP");
            assert!(item.relative_path.starts_with("/TestDocuments"),
                "All paths should start with /TestDocuments, got: {}", item.relative_path);
        }
    }

    /// href -> relative-path conversion strips the Nextcloud user prefix
    /// and preserves trailing slashes.
    #[test]
    fn test_href_to_relative_path_conversion() {
        let service = create_test_service("http://test.example.com");

        // Test Nextcloud path conversion
        assert_eq!(
            service.href_to_relative_path("/remote.php/dav/files/testuser/Documents/file.pdf"),
            "/Documents/file.pdf"
        );

        assert_eq!(
            service.href_to_relative_path("/remote.php/dav/files/testuser/"),
            "/"
        );

        assert_eq!(
            service.href_to_relative_path("/remote.php/dav/files/testuser/Deep/Nested/Path/"),
            "/Deep/Nested/Path/"
        );
    }

    /// URL construction is the inverse mapping: relative path -> full
    /// Nextcloud DAV URL. Note "/" maps to the user root with no trailing
    /// slash.
    #[test]
    fn test_url_construction() {
        let service = create_test_service("http://test.example.com");

        // Test URL construction for different paths
        assert_eq!(
            service.get_url_for_path("/TestDocuments"),
            "http://test.example.com/remote.php/dav/files/testuser/TestDocuments"
        );

        assert_eq!(
            service.get_url_for_path("/TestDocuments/SubDir"),
            "http://test.example.com/remote.php/dav/files/testuser/TestDocuments/SubDir"
        );

        assert_eq!(
            service.get_url_for_path("/"),
            "http://test.example.com/remote.php/dav/files/testuser"
        );
    }

    /// Regression guard for the original bug: a raw parser item with
    /// relative_path == "TEMP" must never survive processing.
    #[test]
    fn test_regression_temp_paths_are_processed() {
        // Regression test: Ensure TEMP paths from XML parser are always processed
        let service = create_test_service("http://test.example.com");

        // Simulate the exact scenario that caused the bug
        let raw_xml_items = vec![
            FileIngestionInfo {
                relative_path: "TEMP".to_string(), // This is what XML parser returns
                full_path: "/remote.php/dav/files/testuser/TestDocuments/ImportantFolder/".to_string(),
                #[allow(deprecated)]
                path: "/remote.php/dav/files/testuser/TestDocuments/ImportantFolder/".to_string(),
                name: "ImportantFolder".to_string(),
                size: 0,
                mime_type: "application/octet-stream".to_string(),
                last_modified: None,
                etag: "folder123etag".to_string(),
                is_directory: true,
                created_at: None,
                permissions: None,
                owner: None,
                group: None,
                metadata: None,
            }
        ];

        // Process items as the service should do
        let processed_items = service.process_file_infos(raw_xml_items);

        // Verify the bug is fixed
        assert_eq!(processed_items.len(), 1);
        assert_ne!(processed_items[0].relative_path, "TEMP",
            "REGRESSION: relative_path should not remain as TEMP after processing");
        assert_eq!(processed_items[0].relative_path, "/TestDocuments/ImportantFolder/",
            "relative_path should be properly converted from href");
    }

    /// Same regression guard, but through the full discovery entry point
    /// (looser mock: no depth/content-type matchers).
    #[tokio::test]
    async fn test_discover_files_and_directories_processes_paths() {
        // Integration test to ensure discover_files_and_directories always processes paths
        let mock_server = MockServer::start().await;

        Mock::given(method("PROPFIND"))
            .and(path("/remote.php/dav/files/testuser/TestDocuments"))
            .respond_with(
                ResponseTemplate::new(207)
                    .set_body_string(mock_propfind_response())
                    .insert_header("content-type", "application/xml")
            )
            .mount(&mock_server)
            .await;

        let service = create_test_service(&mock_server.uri());

        let result = service.discover_files_and_directories("/TestDocuments", false).await
            .expect("Discovery should succeed");

        // Ensure no items have TEMP paths (regression test)
        for item in result.files.iter().chain(result.directories.iter()) {
            assert_ne!(item.relative_path, "TEMP",
                "REGRESSION: No items should have TEMP paths after discovery");
        }
    }
}
|
||||
|
|
@ -6,6 +6,7 @@ use std::str;
|
|||
use serde_json;
|
||||
|
||||
use crate::models::FileIngestionInfo;
|
||||
use crate::mime_detection::{detect_mime_for_discovery, DetectionStrategy};
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
struct PropFindResponse {
|
||||
|
|
@ -200,6 +201,14 @@ pub fn parse_propfind_response(xml_text: &str) -> Result<Vec<FileIngestionInfo>>
|
|||
// Use the metadata collected during parsing
|
||||
let metadata = resp.metadata;
|
||||
|
||||
// Determine MIME type using improved detection
|
||||
let mime_detection_result = detect_mime_for_discovery(
|
||||
&name,
|
||||
resp.content_type.as_deref(),
|
||||
DetectionStrategy::Comprehensive
|
||||
);
|
||||
let mime_type = mime_detection_result.mime_type;
|
||||
|
||||
let file_info = FileIngestionInfo {
|
||||
relative_path: "TEMP".to_string(), // Will be set by discovery layer
|
||||
full_path: resp.href.clone(),
|
||||
|
|
@ -207,7 +216,7 @@ pub fn parse_propfind_response(xml_text: &str) -> Result<Vec<FileIngestionInfo>>
|
|||
path: resp.href.clone(), // Legacy field - keep for compatibility
|
||||
name,
|
||||
size: resp.content_length.unwrap_or(0),
|
||||
mime_type: resp.content_type.unwrap_or_else(|| "application/octet-stream".to_string()),
|
||||
mime_type,
|
||||
last_modified: parse_http_date(&resp.last_modified.unwrap_or_default()),
|
||||
etag: resp.etag.unwrap_or_else(|| format!("\"{}\"", uuid::Uuid::new_v4())),
|
||||
is_directory: false,
|
||||
|
|
@ -418,6 +427,18 @@ pub fn parse_propfind_response_with_directories(xml_text: &str) -> Result<Vec<Fi
|
|||
}
|
||||
});
|
||||
|
||||
// Determine MIME type for files (directories get empty string)
|
||||
let mime_type = if resp.is_collection {
|
||||
"".to_string()
|
||||
} else {
|
||||
let mime_detection_result = detect_mime_for_discovery(
|
||||
&name,
|
||||
resp.content_type.as_deref(),
|
||||
DetectionStrategy::Comprehensive
|
||||
);
|
||||
mime_detection_result.mime_type
|
||||
};
|
||||
|
||||
let file_info = FileIngestionInfo {
|
||||
relative_path: "TEMP".to_string(), // Will be set by discovery layer
|
||||
full_path: resp.href.clone(),
|
||||
|
|
@ -425,11 +446,7 @@ pub fn parse_propfind_response_with_directories(xml_text: &str) -> Result<Vec<Fi
|
|||
path: resp.href.clone(), // Legacy field - keep for compatibility
|
||||
name,
|
||||
size: resp.content_length.unwrap_or(0),
|
||||
mime_type: if resp.is_collection {
|
||||
"".to_string()
|
||||
} else {
|
||||
resp.content_type.unwrap_or_else(|| "application/octet-stream".to_string())
|
||||
},
|
||||
mime_type,
|
||||
last_modified: parse_http_date(&resp.last_modified.unwrap_or_default()),
|
||||
etag: resp.etag.unwrap_or_else(|| format!("\"{}\"", uuid::Uuid::new_v4())),
|
||||
is_directory: resp.is_collection,
|
||||
|
|
@ -944,3 +961,4 @@ mod tests {
|
|||
assert!(strong_compare_etags("\"1\"", "\"1\""));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue