feat(server): do a *much* better job at determining file types thanks to infer rust package
This commit is contained in:
parent
aff7b907c7
commit
d7a0a1f294
|
|
@ -4,7 +4,7 @@ node_modules/
|
||||||
.env
|
.env
|
||||||
assets/
|
assets/
|
||||||
frontend/dist/
|
frontend/dist/
|
||||||
.claude/
|
# Local Claude settings (gitignore comments must be on their own line)
.claude/settings.local.json
|
||||||
readur_uploads/
|
readur_uploads/
|
||||||
readur_watch/
|
readur_watch/
|
||||||
test-results/
|
test-results/
|
||||||
|
|
|
||||||
|
|
@ -1009,6 +1009,17 @@ dependencies = [
|
||||||
"nom",
|
"nom",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cfb"
|
||||||
|
version = "0.7.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d38f2da7a0a2c4ccf0065be06397cc26a81f4e528be095826eee9d4adbb8c60f"
|
||||||
|
dependencies = [
|
||||||
|
"byteorder",
|
||||||
|
"fnv",
|
||||||
|
"uuid",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cfg-expr"
|
name = "cfg-expr"
|
||||||
version = "0.15.8"
|
version = "0.15.8"
|
||||||
|
|
@ -2410,6 +2421,15 @@ dependencies = [
|
||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "infer"
|
||||||
|
version = "0.15.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "cb33622da908807a06f9513c19b3c1ad50fab3e4137d82a78107d502075aa199"
|
||||||
|
dependencies = [
|
||||||
|
"cfb",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "inotify"
|
name = "inotify"
|
||||||
version = "0.11.0"
|
version = "0.11.0"
|
||||||
|
|
@ -2625,7 +2645,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667"
|
checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
"windows-targets 0.48.5",
|
"windows-targets 0.53.2",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
@ -3610,6 +3630,7 @@ dependencies = [
|
||||||
"hostname",
|
"hostname",
|
||||||
"image",
|
"image",
|
||||||
"imageproc",
|
"imageproc",
|
||||||
|
"infer",
|
||||||
"jsonwebtoken",
|
"jsonwebtoken",
|
||||||
"mime_guess",
|
"mime_guess",
|
||||||
"notify",
|
"notify",
|
||||||
|
|
@ -5674,7 +5695,7 @@ version = "0.1.9"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
|
checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"windows-sys 0.48.0",
|
"windows-sys 0.59.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@ path = "src/main.rs"
|
||||||
name = "test_runner"
|
name = "test_runner"
|
||||||
path = "src/bin/test_runner.rs"
|
path = "src/bin/test_runner.rs"
|
||||||
|
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
tokio = { version = "1", features = ["full"] }
|
tokio = { version = "1", features = ["full"] }
|
||||||
axum = { version = "0.8", features = ["multipart"] }
|
axum = { version = "0.8", features = ["multipart"] }
|
||||||
|
|
@ -33,6 +34,7 @@ futures-util = "0.3"
|
||||||
futures = "0.3"
|
futures = "0.3"
|
||||||
notify = "8"
|
notify = "8"
|
||||||
mime_guess = "2"
|
mime_guess = "2"
|
||||||
|
infer = "0.15"
|
||||||
tesseract = { version = "0.15", optional = true }
|
tesseract = { version = "0.15", optional = true }
|
||||||
image = { version = "0.25", features = ["png", "jpeg", "tiff", "bmp"], optional = true }
|
image = { version = "0.25", features = ["png", "jpeg", "tiff", "bmp"], optional = true }
|
||||||
imageproc = { version = "0.25", optional = true }
|
imageproc = { version = "0.25", optional = true }
|
||||||
|
|
|
||||||
|
|
@ -1,12 +1,12 @@
|
||||||
{
|
{
|
||||||
"name": "readur-frontend",
|
"name": "readur-frontend",
|
||||||
"version": "2.4.2",
|
"version": "2.5.3",
|
||||||
"lockfileVersion": 3,
|
"lockfileVersion": 3,
|
||||||
"requires": true,
|
"requires": true,
|
||||||
"packages": {
|
"packages": {
|
||||||
"": {
|
"": {
|
||||||
"name": "readur-frontend",
|
"name": "readur-frontend",
|
||||||
"version": "2.4.2",
|
"version": "2.5.3",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@emotion/react": "^11.14.0",
|
"@emotion/react": "^11.14.0",
|
||||||
"@emotion/styled": "^11.14.0",
|
"@emotion/styled": "^11.14.0",
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@ pub mod db_guardrails_simple;
|
||||||
pub mod errors;
|
pub mod errors;
|
||||||
pub mod ingestion;
|
pub mod ingestion;
|
||||||
pub mod metadata_extraction;
|
pub mod metadata_extraction;
|
||||||
|
pub mod mime_detection;
|
||||||
pub mod models;
|
pub mod models;
|
||||||
pub mod monitoring;
|
pub mod monitoring;
|
||||||
pub mod ocr;
|
pub mod ocr;
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,431 @@
|
||||||
|
//! MIME type detection for more reliable file type identification.
//!
//! This module detects a file's MIME type using several methods:
//! 1. Content-based detection using magic bytes (most reliable)
//! 2. Server-provided MIME type (when available and trusted)
//! 3. Extension-based fallback (least reliable, but covers edge cases)
//!
//! Accurate detection is particularly important for OCR processing, where
//! incorrectly classified image files can cause issues.
|
||||||
|
|
||||||
|
use std::path::Path;
|
||||||
|
use tracing::{debug, warn};
|
||||||
|
|
||||||
|
/// Selects which sources of evidence a MIME detection call should rely on.
#[derive(Debug, Clone, PartialEq)]
pub enum DetectionStrategy {
    /// Rely on the file's magic bytes; the most reliable option.
    ContentBased,
    /// Accept the server-provided MIME type when one is available, otherwise
    /// fall back to content.
    TrustServer,
    /// Look only at the file extension; least reliable, but fastest.
    ExtensionOnly,
    /// Try every source in order: server -> content -> extension -> fallback.
    Comprehensive,
}
|
||||||
|
|
||||||
|
/// Result of MIME type detection with metadata about the detection method used
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct MimeDetectionResult {
|
||||||
|
pub mime_type: String,
|
||||||
|
pub confidence: MimeConfidence,
|
||||||
|
pub detection_method: DetectionMethod,
|
||||||
|
pub original_server_type: Option<String>,
|
||||||
|
pub detected_extension: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// How much a detected MIME type can be relied upon.
///
/// Variants are declared from least to most confident; the derived ordering
/// follows declaration order, so `Low < Medium < High < VeryHigh`.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum MimeConfidence {
    /// Extension-based or fallback detection.
    Low,
    /// Detection via the `mime_guess` library.
    Medium,
    /// Magic-byte detection or a trusted server-provided type.
    High,
    /// Content analysis confirms the server-provided type.
    VeryHigh,
}
|
||||||
|
|
||||||
|
/// Identifies which mechanism produced a detected MIME type.
#[derive(Debug, Clone, PartialEq)]
pub enum DetectionMethod {
    /// The file's magic bytes / signature were recognised.
    MagicBytes,
    /// The server supplied the type and it was trusted.
    ServerProvided,
    /// The type was derived from the file extension.
    Extension,
    /// Nothing matched, so the default type was used.
    Fallback,
    /// More than one method contributed to the result.
    Hybrid,
}
|
||||||
|
|
||||||
|
impl MimeDetectionResult {
|
||||||
|
/// Create a result for server-provided MIME type
|
||||||
|
pub fn from_server(mime_type: String) -> Self {
|
||||||
|
Self {
|
||||||
|
mime_type,
|
||||||
|
confidence: MimeConfidence::High,
|
||||||
|
detection_method: DetectionMethod::ServerProvided,
|
||||||
|
original_server_type: None,
|
||||||
|
detected_extension: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a result for content-based detection
|
||||||
|
pub fn from_content(mime_type: String, server_type: Option<String>) -> Self {
|
||||||
|
Self {
|
||||||
|
mime_type,
|
||||||
|
confidence: MimeConfidence::High,
|
||||||
|
detection_method: DetectionMethod::MagicBytes,
|
||||||
|
original_server_type: server_type,
|
||||||
|
detected_extension: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a result for extension-based detection
|
||||||
|
pub fn from_extension(mime_type: String, extension: String) -> Self {
|
||||||
|
Self {
|
||||||
|
mime_type,
|
||||||
|
confidence: MimeConfidence::Medium,
|
||||||
|
detection_method: DetectionMethod::Extension,
|
||||||
|
original_server_type: None,
|
||||||
|
detected_extension: Some(extension),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a fallback result
|
||||||
|
pub fn fallback() -> Self {
|
||||||
|
Self {
|
||||||
|
mime_type: "application/octet-stream".to_string(),
|
||||||
|
confidence: MimeConfidence::Low,
|
||||||
|
detection_method: DetectionMethod::Fallback,
|
||||||
|
original_server_type: None,
|
||||||
|
detected_extension: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if the detected MIME type indicates an image file
|
||||||
|
pub fn is_image(&self) -> bool {
|
||||||
|
self.mime_type.starts_with("image/")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if the detected MIME type indicates a document file
|
||||||
|
pub fn is_document(&self) -> bool {
|
||||||
|
matches!(self.mime_type.as_str(),
|
||||||
|
"application/pdf" |
|
||||||
|
"application/msword" |
|
||||||
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" |
|
||||||
|
"application/vnd.ms-excel" |
|
||||||
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" |
|
||||||
|
"application/vnd.ms-powerpoint" |
|
||||||
|
"application/vnd.openxmlformats-officedocument.presentationml.presentation" |
|
||||||
|
"text/plain" |
|
||||||
|
"text/rtf" |
|
||||||
|
"application/rtf"
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if this MIME type is suitable for OCR processing
|
||||||
|
pub fn is_ocr_suitable(&self) -> bool {
|
||||||
|
self.is_image() || self.mime_type == "application/pdf"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Detect MIME type for WebDAV discovery phase (when we only have file metadata)
|
||||||
|
///
|
||||||
|
/// This function is called during the initial WebDAV XML parsing when we don't
|
||||||
|
/// have access to the actual file content yet.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
/// * `filename` - The filename/path of the file
|
||||||
|
/// * `server_mime_type` - MIME type provided by the WebDAV server, if any
|
||||||
|
/// * `strategy` - Detection strategy to use
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
/// A `MimeDetectionResult` with the best available MIME type determination
|
||||||
|
pub fn detect_mime_for_discovery(
|
||||||
|
filename: &str,
|
||||||
|
server_mime_type: Option<&str>,
|
||||||
|
strategy: DetectionStrategy,
|
||||||
|
) -> MimeDetectionResult {
|
||||||
|
debug!("Detecting MIME type for discovery: filename={}, server_type={:?}, strategy={:?}",
|
||||||
|
filename, server_mime_type, strategy);
|
||||||
|
|
||||||
|
match strategy {
|
||||||
|
DetectionStrategy::ContentBased => {
|
||||||
|
// During discovery, we can't analyze content, so fall back to extension
|
||||||
|
detect_from_extension(filename, server_mime_type)
|
||||||
|
}
|
||||||
|
DetectionStrategy::TrustServer => {
|
||||||
|
if let Some(server_type) = server_mime_type {
|
||||||
|
if is_trusted_server_mime_type(server_type) {
|
||||||
|
return MimeDetectionResult::from_server(server_type.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Fallback to extension-based detection
|
||||||
|
detect_from_extension(filename, server_mime_type)
|
||||||
|
}
|
||||||
|
DetectionStrategy::ExtensionOnly => {
|
||||||
|
detect_from_extension(filename, server_mime_type)
|
||||||
|
}
|
||||||
|
DetectionStrategy::Comprehensive => {
|
||||||
|
// Use server type if trusted, otherwise extension-based
|
||||||
|
if let Some(server_type) = server_mime_type {
|
||||||
|
if is_trusted_server_mime_type(server_type) {
|
||||||
|
return MimeDetectionResult::from_server(server_type.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
detect_from_extension(filename, server_mime_type)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Detect MIME type when file content is available (during file download/processing)
|
||||||
|
///
|
||||||
|
/// This provides the most accurate detection using magic bytes from the actual file content.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
/// * `content` - The first few bytes of the file content (at least 512 bytes recommended)
|
||||||
|
/// * `filename` - The filename for fallback detection
|
||||||
|
/// * `server_mime_type` - MIME type provided by the server, if any
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
/// A `MimeDetectionResult` with high-confidence MIME type detection
|
||||||
|
pub fn detect_mime_from_content(
|
||||||
|
content: &[u8],
|
||||||
|
filename: &str,
|
||||||
|
server_mime_type: Option<&str>,
|
||||||
|
) -> MimeDetectionResult {
|
||||||
|
debug!("Detecting MIME type from content: filename={}, server_type={:?}, content_len={}",
|
||||||
|
filename, server_mime_type, content.len());
|
||||||
|
|
||||||
|
// First, try magic byte detection
|
||||||
|
if let Some(detected_type) = infer::get(content) {
|
||||||
|
let mime_type = detected_type.mime_type().to_string();
|
||||||
|
debug!("Magic bytes detected MIME type: {}", mime_type);
|
||||||
|
|
||||||
|
// If server provided a type, check for consistency
|
||||||
|
if let Some(server_type) = server_mime_type {
|
||||||
|
if are_mime_types_compatible(&mime_type, server_type) {
|
||||||
|
// Both agree - very high confidence
|
||||||
|
let mut result = MimeDetectionResult::from_content(mime_type, Some(server_type.to_string()));
|
||||||
|
result.confidence = MimeConfidence::VeryHigh;
|
||||||
|
result.detection_method = DetectionMethod::Hybrid;
|
||||||
|
return result;
|
||||||
|
} else {
|
||||||
|
// Content detection overrides server type - trust the bytes
|
||||||
|
warn!("MIME type mismatch: server={}, content={} for file {}",
|
||||||
|
server_type, mime_type, filename);
|
||||||
|
return MimeDetectionResult::from_content(mime_type, Some(server_type.to_string()));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Only content detection available
|
||||||
|
return MimeDetectionResult::from_content(mime_type, None);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Magic bytes detection failed, fall back to server type if trusted
|
||||||
|
if let Some(server_type) = server_mime_type {
|
||||||
|
if is_trusted_server_mime_type(server_type) {
|
||||||
|
debug!("Using trusted server MIME type: {}", server_type);
|
||||||
|
return MimeDetectionResult::from_server(server_type.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fall back to extension-based detection
|
||||||
|
debug!("Content detection failed, falling back to extension detection");
|
||||||
|
detect_from_extension(filename, server_mime_type)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Update an existing MIME type with content-based detection if available
|
||||||
|
///
|
||||||
|
/// This function is useful for re-detecting MIME types when file content becomes
|
||||||
|
/// available after initial discovery.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
/// * `current_mime_type` - The currently assigned MIME type
|
||||||
|
/// * `content` - File content for analysis
|
||||||
|
/// * `filename` - Filename for context
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
/// A new `MimeDetectionResult` if detection improves confidence, or None if no change needed
|
||||||
|
pub fn update_mime_type_with_content(
|
||||||
|
current_mime_type: &str,
|
||||||
|
content: &[u8],
|
||||||
|
filename: &str,
|
||||||
|
) -> Option<MimeDetectionResult> {
|
||||||
|
let new_result = detect_mime_from_content(content, filename, Some(current_mime_type));
|
||||||
|
|
||||||
|
// Only update if we have higher confidence or detected a different type
|
||||||
|
if new_result.confidence > MimeConfidence::Medium ||
|
||||||
|
new_result.mime_type != current_mime_type {
|
||||||
|
Some(new_result)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Detect MIME type from file extension using mime_guess library
|
||||||
|
fn detect_from_extension(filename: &str, server_mime_type: Option<&str>) -> MimeDetectionResult {
|
||||||
|
let path = Path::new(filename);
|
||||||
|
|
||||||
|
if let Some(mime_type) = mime_guess::from_path(path).first() {
|
||||||
|
let mime_str = mime_type.to_string();
|
||||||
|
debug!("Extension-based detection: {} -> {}", filename, mime_str);
|
||||||
|
|
||||||
|
let mut result = MimeDetectionResult::from_extension(
|
||||||
|
mime_str,
|
||||||
|
path.extension()
|
||||||
|
.and_then(|ext| ext.to_str())
|
||||||
|
.unwrap_or("")
|
||||||
|
.to_string()
|
||||||
|
);
|
||||||
|
result.original_server_type = server_mime_type.map(|s| s.to_string());
|
||||||
|
result
|
||||||
|
} else {
|
||||||
|
debug!("Extension-based detection failed for: {}", filename);
|
||||||
|
let mut result = MimeDetectionResult::fallback();
|
||||||
|
result.original_server_type = server_mime_type.map(|s| s.to_string());
|
||||||
|
result
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decides whether a server-provided MIME type carries useful information.
///
/// Generic placeholders such as `application/octet-stream`, `unknown`, or an
/// empty value are not trusted. The check ignores ASCII case, surrounding
/// whitespace and any parameters after `;`, so
/// `Application/Octet-Stream; charset=binary` is rejected just like the
/// plain lowercase form.
fn is_trusted_server_mime_type(mime_type: &str) -> bool {
    const UNINFORMATIVE: [&str; 4] = [
        "application/octet-stream",
        "application/binary",
        "binary/octet-stream",
        "unknown",
    ];

    // Compare only the `type/subtype` part; parameters do not make a generic type useful.
    let essence = mime_type.split(';').next().unwrap_or("").trim();

    let is_generic = essence.is_empty()
        || UNINFORMATIVE
            .iter()
            .any(|generic| essence.eq_ignore_ascii_case(generic));

    !is_generic
}
|
||||||
|
|
||||||
|
/// Decides whether two MIME types should be treated as compatible/equivalent.
///
/// Types are compared on their `type/subtype` part only: parameters after `;`
/// and surrounding whitespace are dropped, and ASCII case is ignored. Two
/// types are compatible when, after that normalisation:
/// - they are equal, or
/// - they are known aliases of each other (`image/jpg` ~ `image/jpeg`,
///   `image/tif` ~ `image/tiff`, `text/txt` ~ `text/plain`), or
/// - both have exactly one `/` and share the same primary type
///   (e.g. both are `image/*`).
///
/// NOTE(review): the shared-primary-type rule is broad — it also makes
/// `application/pdf` compatible with `application/octet-stream`. Kept as in the
/// original; confirm this is intended.
fn are_mime_types_compatible(type1: &str, type2: &str) -> bool {
    /// Strips parameters and whitespace and lowercases the remainder.
    fn essence(raw: &str) -> String {
        raw.split(';').next().unwrap_or("").trim().to_ascii_lowercase()
    }

    /// Maps common non-standard spellings onto their usual form.
    fn canonical(essence: &str) -> &str {
        match essence {
            "image/jpg" => "image/jpeg",
            "image/tif" => "image/tiff",
            "text/txt" => "text/plain",
            other => other,
        }
    }

    let first = essence(type1);
    let second = essence(type2);

    if canonical(&first) == canonical(&second) {
        return true;
    }

    // Same primary type, provided both are well-formed `type/subtype` pairs.
    match (first.split_once('/'), second.split_once('/')) {
        (Some((primary1, sub1)), Some((primary2, sub2))) => {
            primary1 == primary2 && !sub1.contains('/') && !sub2.contains('/')
        }
        _ => false,
    }
}
|
||||||
|
|
||||||
|
/// Legacy function for backward compatibility
|
||||||
|
///
|
||||||
|
/// This maintains the same interface as the original `get_mime_type_from_extension`
|
||||||
|
/// function but uses the new detection system.
|
||||||
|
pub fn get_mime_type_from_extension(extension: &str) -> String {
|
||||||
|
let fake_filename = format!("file.{}", extension);
|
||||||
|
let result = detect_from_extension(&fake_filename, None);
|
||||||
|
result.mime_type
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Extension-only discovery maps `.pdf` to the PDF MIME type.
    #[test]
    fn test_mime_detection_from_extension() {
        let detected =
            detect_mime_for_discovery("test.pdf", None, DetectionStrategy::ExtensionOnly);

        assert_eq!(detected.mime_type, "application/pdf");
        assert_eq!(detected.detection_method, DetectionMethod::Extension);
    }

    /// A specific server type is trusted; a generic one falls back to the extension.
    #[test]
    fn test_server_type_trust() {
        let cases = [
            // Trusted server type
            ("application/pdf", DetectionMethod::ServerProvided),
            // Untrusted server type should fall back
            ("application/octet-stream", DetectionMethod::Extension),
        ];

        for (server_type, expected_method) in cases.iter() {
            let detected = detect_mime_for_discovery(
                "test.pdf",
                Some(*server_type),
                DetectionStrategy::TrustServer,
            );
            assert_eq!(detected.mime_type, "application/pdf");
            assert_eq!(&detected.detection_method, expected_method);
        }
    }

    /// Aliases and identical types are compatible; different primary types are not.
    #[test]
    fn test_mime_type_compatibility() {
        let compatible = [
            ("image/jpeg", "image/jpg"),
            ("image/jpg", "image/jpeg"),
            ("text/plain", "text/plain"),
        ];
        for (left, right) in compatible.iter() {
            assert!(are_mime_types_compatible(left, right));
        }

        assert!(!are_mime_types_compatible("image/jpeg", "text/plain"));
    }

    /// Magic bytes alone identify PDF and JPEG content.
    #[test]
    fn test_content_based_detection() {
        // PDF magic bytes
        let pdf = detect_mime_from_content(b"%PDF-1.4", "test.pdf", None);
        assert_eq!(pdf.mime_type, "application/pdf");
        assert_eq!(pdf.detection_method, DetectionMethod::MagicBytes);
        assert_eq!(pdf.confidence, MimeConfidence::High);

        // JPEG magic bytes
        let jpeg = detect_mime_from_content(&[0xFF, 0xD8, 0xFF], "test.jpg", None);
        assert_eq!(jpeg.mime_type, "image/jpeg");
    }

    /// When content and server agree, the result is hybrid with very high confidence.
    #[test]
    fn test_hybrid_detection() {
        let agreed = detect_mime_from_content(b"%PDF-1.4", "test.pdf", Some("application/pdf"));

        assert_eq!(agreed.mime_type, "application/pdf");
        assert_eq!(agreed.detection_method, DetectionMethod::Hybrid);
        assert_eq!(agreed.confidence, MimeConfidence::VeryHigh);
    }

    /// The legacy extension lookup still returns the expected MIME strings.
    #[test]
    fn test_legacy_compatibility() {
        let expectations = [
            ("pdf", "application/pdf"),
            ("jpg", "image/jpeg"),
            ("png", "image/png"),
        ];
        for (extension, expected) in expectations.iter() {
            assert_eq!(get_mime_type_from_extension(extension), *expected);
        }
    }

    /// PDFs and images are OCR candidates; plain text is not.
    #[test]
    fn test_ocr_suitability() {
        let suitable_for = |mime: &str| {
            MimeDetectionResult::from_content(mime.to_string(), None).is_ocr_suitable()
        };

        assert!(suitable_for("application/pdf"));
        assert!(suitable_for("image/jpeg"));
        assert!(!suitable_for("text/plain"));
    }
}
|
||||||
|
|
@ -570,27 +570,17 @@ impl SourceScheduler {
|
||||||
return Err(format!("WebDAV server_url is empty"));
|
return Err(format!("WebDAV server_url is empty"));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if URL starts with a valid scheme
|
// Normalize URL by adding protocol if missing (consistent with WebDAVConfig)
|
||||||
if !server_url.starts_with("http://") && !server_url.starts_with("https://") {
|
let normalized_url = crate::services::webdav::config::WebDAVConfig::normalize_server_url(server_url);
|
||||||
return Err(format!(
|
|
||||||
"WebDAV server_url must start with 'http://' or 'https://'. \
|
|
||||||
Current value: '{}'. \
|
|
||||||
Examples of valid URLs: \
|
|
||||||
- https://cloud.example.com \
|
|
||||||
- http://192.168.1.100:8080 \
|
|
||||||
- https://nextcloud.mydomain.com:443",
|
|
||||||
server_url
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Try to parse as URL to catch other issues
|
// Try to parse the normalized URL to catch other issues
|
||||||
match reqwest::Url::parse(server_url) {
|
match reqwest::Url::parse(&normalized_url) {
|
||||||
Ok(url) => {
|
Ok(url) => {
|
||||||
if url.scheme() != "http" && url.scheme() != "https" {
|
if url.scheme() != "http" && url.scheme() != "https" {
|
||||||
return Err(format!(
|
return Err(format!(
|
||||||
"WebDAV server_url has invalid scheme '{}'. Only 'http' and 'https' are supported. \
|
"WebDAV server_url has invalid scheme '{}'. Only 'http' and 'https' are supported. \
|
||||||
Current URL: '{}'",
|
Current URL: '{}'",
|
||||||
url.scheme(), server_url
|
url.scheme(), normalized_url
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -599,23 +589,23 @@ impl SourceScheduler {
|
||||||
"WebDAV server_url is missing hostname. \
|
"WebDAV server_url is missing hostname. \
|
||||||
Current URL: '{}'. \
|
Current URL: '{}'. \
|
||||||
Example: https://cloud.example.com",
|
Example: https://cloud.example.com",
|
||||||
server_url
|
normalized_url
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
crate::debug_log!("SOURCE_SCHEDULER", "✅ WebDAV URL validation passed for source '{}': {}", source_name, server_url);
|
crate::debug_log!("SOURCE_SCHEDULER", "✅ WebDAV URL validation passed for source '{}': {} (normalized to: {})", source_name, server_url, normalized_url);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
Err(format!(
|
Err(format!(
|
||||||
"WebDAV server_url is not a valid URL: {}. \
|
"WebDAV server_url is not a valid URL: {}. \
|
||||||
Current value: '{}'. \
|
Current value: '{}' (normalized to: '{}'). \
|
||||||
The URL must be absolute and include the full domain. \
|
The URL must be absolute and include the full domain. \
|
||||||
Examples: \
|
Examples: \
|
||||||
- https://cloud.example.com \
|
- https://cloud.example.com \
|
||||||
- http://192.168.1.100:8080/webdav \
|
- http://192.168.1.100:8080/webdav \
|
||||||
- https://nextcloud.mydomain.com",
|
- https://nextcloud.mydomain.com",
|
||||||
e, server_url
|
e, server_url, normalized_url
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -103,6 +103,32 @@ impl WebDAVConfig {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Normalizes a server URL by adding a protocol if it is missing.
///
/// Surrounding whitespace is trimmed. If the URL already starts with
/// `http://` or `https://` (in any letter case, since URL schemes are
/// case-insensitive), the rest of the URL is kept and the scheme is returned
/// in lowercase. Otherwise `https://` is prepended, preferring HTTPS over
/// HTTP for security reasons.
pub fn normalize_server_url(url: &str) -> String {
    const SCHEMES: [&str; 2] = ["https://", "http://"];

    let trimmed = url.trim();

    for scheme in SCHEMES.iter().copied() {
        // `get` yields None when the input is too short or the cut would fall
        // inside a multi-byte character, so the slice below cannot panic.
        if let Some(prefix) = trimmed.get(..scheme.len()) {
            if prefix.eq_ignore_ascii_case(scheme) {
                return format!("{}{}", scheme, &trimmed[scheme.len()..]);
            }
        }
    }

    // If no protocol specified, default to HTTPS for security
    format!("https://{}", trimmed)
}
|
||||||
|
|
||||||
|
/// Produces the same URL with the other protocol, for fallback attempts.
///
/// An `https://` URL yields its `http://` counterpart and vice versa. URLs
/// that start with neither prefix yield `None`.
pub fn get_alternative_protocol_url(url: &str) -> Option<String> {
    if let Some(rest) = url.strip_prefix("https://") {
        return Some(format!("http://{}", rest));
    }

    url.strip_prefix("http://")
        .map(|rest| format!("https://{}", rest))
}
|
||||||
|
|
||||||
/// Validates the configuration
|
/// Validates the configuration
|
||||||
pub fn validate(&self) -> anyhow::Result<()> {
|
pub fn validate(&self) -> anyhow::Result<()> {
|
||||||
if self.server_url.is_empty() {
|
if self.server_url.is_empty() {
|
||||||
|
|
@ -121,9 +147,22 @@ impl WebDAVConfig {
|
||||||
return Err(anyhow::anyhow!("At least one watch folder must be specified"));
|
return Err(anyhow::anyhow!("At least one watch folder must be specified"));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Validate URL format
|
// Validate URL format - now accepts URLs without protocol
|
||||||
if !self.server_url.starts_with("http://") && !self.server_url.starts_with("https://") {
|
// Protocol detection and fallback will be handled during connection testing
|
||||||
return Err(anyhow::anyhow!("Server URL must start with http:// or https://"));
|
let normalized_url = Self::normalize_server_url(&self.server_url);
|
||||||
|
|
||||||
|
// Basic URL validation - check if it looks like a valid domain/IP
|
||||||
|
let url_without_protocol = normalized_url
|
||||||
|
.trim_start_matches("https://")
|
||||||
|
.trim_start_matches("http://");
|
||||||
|
|
||||||
|
if url_without_protocol.is_empty() {
|
||||||
|
return Err(anyhow::anyhow!("Server URL must contain a valid domain or IP address"));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for obviously invalid URLs
|
||||||
|
if url_without_protocol.contains("://") {
|
||||||
|
return Err(anyhow::anyhow!("Invalid URL format: contains multiple protocols"));
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|
@ -131,8 +170,8 @@ impl WebDAVConfig {
|
||||||
|
|
||||||
/// Returns the base URL for WebDAV operations
|
/// Returns the base URL for WebDAV operations
|
||||||
pub fn webdav_url(&self) -> String {
|
pub fn webdav_url(&self) -> String {
|
||||||
// Normalize the server URL by removing trailing slashes
|
// Normalize the server URL by adding protocol if missing and removing trailing slashes
|
||||||
let normalized_url = self.server_url.trim_end_matches('/').to_string();
|
let normalized_url = Self::normalize_server_url(&self.server_url).trim_end_matches('/').to_string();
|
||||||
|
|
||||||
// Add WebDAV path based on server type
|
// Add WebDAV path based on server type
|
||||||
match self.server_type.as_deref() {
|
match self.server_type.as_deref() {
|
||||||
|
|
@ -160,7 +199,7 @@ impl WebDAVConfig {
|
||||||
/// Returns alternative WebDAV URLs to try if the primary one fails
|
/// Returns alternative WebDAV URLs to try if the primary one fails
|
||||||
/// This is used for fallback mechanisms when encountering 405 errors
|
/// This is used for fallback mechanisms when encountering 405 errors
|
||||||
pub fn webdav_fallback_urls(&self) -> Vec<String> {
|
pub fn webdav_fallback_urls(&self) -> Vec<String> {
|
||||||
let normalized_url = self.server_url.trim_end_matches('/').to_string();
|
let normalized_url = Self::normalize_server_url(&self.server_url).trim_end_matches('/').to_string();
|
||||||
let mut fallback_urls = Vec::new();
|
let mut fallback_urls = Vec::new();
|
||||||
|
|
||||||
match self.server_type.as_deref() {
|
match self.server_type.as_deref() {
|
||||||
|
|
|
||||||
|
|
@ -23,4 +23,6 @@ mod url_construction_tests;
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod subdirectory_edge_cases_tests;
|
mod subdirectory_edge_cases_tests;
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
mod protocol_detection_tests;
|
||||||
|
#[cfg(test)]
|
||||||
mod tests;
|
mod tests;
|
||||||
|
|
@ -0,0 +1,233 @@
|
||||||
|
#[cfg(test)]
mod tests {
    use super::super::{WebDAVService, WebDAVConfig};

    /// Builds the Nextcloud test configuration used throughout this module; only the
    /// server URL differs between the scenarios.
    fn config_for_server_url(server_url: &str) -> WebDAVConfig {
        WebDAVConfig {
            server_url: String::from(server_url),
            username: String::from("testuser"),
            password: String::from("testpass"),
            watch_folders: vec![String::from("/Documents")],
            file_extensions: vec![String::from("pdf"), String::from("txt")],
            timeout_seconds: 30,
            server_type: Some(String::from("nextcloud")),
        }
    }

    /// Test configuration whose server URL carries no scheme at all.
    fn create_test_config_without_protocol() -> WebDAVConfig {
        config_for_server_url("nas.example.com")
    }

    /// Test configuration whose server URL explicitly uses `https://`.
    fn create_test_config_with_https() -> WebDAVConfig {
        config_for_server_url("https://nas.example.com")
    }

    /// Test configuration whose server URL explicitly uses `http://`.
    fn create_test_config_with_http() -> WebDAVConfig {
        config_for_server_url("http://nas.example.com")
    }

    /// Validation must accept a bare host name.
    #[tokio::test]
    async fn test_config_validation_accepts_url_without_protocol() {
        assert!(create_test_config_without_protocol().validate().is_ok());
    }

    /// Validation must accept an explicit HTTPS URL.
    #[tokio::test]
    async fn test_config_validation_accepts_url_with_https() {
        assert!(create_test_config_with_https().validate().is_ok());
    }

    /// Validation must accept an explicit HTTP URL.
    #[tokio::test]
    async fn test_config_validation_accepts_url_with_http() {
        assert!(create_test_config_with_http().validate().is_ok());
    }

    /// A URL without a scheme is normalized to HTTPS.
    #[tokio::test]
    async fn test_normalize_server_url_adds_https_by_default() {
        assert_eq!(
            WebDAVConfig::normalize_server_url("nas.example.com"),
            "https://nas.example.com"
        );
    }

    /// A URL that already has a scheme passes through normalization untouched.
    #[tokio::test]
    async fn test_normalize_server_url_preserves_existing_protocol() {
        assert_eq!(
            WebDAVConfig::normalize_server_url("https://nas.example.com"),
            "https://nas.example.com"
        );
        assert_eq!(
            WebDAVConfig::normalize_server_url("http://nas.example.com"),
            "http://nas.example.com"
        );
    }

    /// The alternative-protocol helper swaps the scheme and yields nothing for bare hosts.
    #[tokio::test]
    async fn test_get_alternative_protocol_url() {
        // HTTPS flips to HTTP.
        let from_https = WebDAVConfig::get_alternative_protocol_url("https://nas.example.com");
        assert_eq!(from_https.as_deref(), Some("http://nas.example.com"));

        // HTTP flips to HTTPS.
        let from_http = WebDAVConfig::get_alternative_protocol_url("http://nas.example.com");
        assert_eq!(from_http.as_deref(), Some("https://nas.example.com"));

        // Without a scheme there is nothing to flip.
        assert!(WebDAVConfig::get_alternative_protocol_url("nas.example.com").is_none());
    }

    /// The WebDAV endpoint is built on top of the normalized (HTTPS) server URL.
    #[tokio::test]
    async fn test_webdav_url_uses_normalized_url() {
        let endpoint = create_test_config_without_protocol().webdav_url();

        assert!(endpoint.starts_with("https://"));
        assert_eq!(endpoint, "https://nas.example.com/remote.php/dav/files/testuser");
    }

    /// A service can be constructed from a configuration that lacks a scheme.
    #[tokio::test]
    async fn test_service_creation_with_protocol_detection() {
        assert!(WebDAVService::new(create_test_config_without_protocol()).is_ok());
    }

    /// Before any detection ran, the effective URL is the normalized one.
    #[tokio::test]
    async fn test_effective_server_url_defaults_to_normalized() {
        let webdav = WebDAVService::new(create_test_config_without_protocol()).unwrap();

        assert_eq!(webdav.get_effective_server_url(), "https://nas.example.com");
    }

    /// An explicitly configured scheme is kept in the effective URL.
    #[tokio::test]
    async fn test_effective_server_url_with_existing_protocol() {
        let webdav = WebDAVService::new(create_test_config_with_http()).unwrap();

        assert_eq!(webdav.get_effective_server_url(), "http://nas.example.com");
    }

    /// A freshly created service has not detected any working protocol yet.
    #[tokio::test]
    async fn test_working_protocol_initially_none() {
        let webdav = WebDAVService::new(create_test_config_without_protocol()).unwrap();

        assert_eq!(webdav.get_working_protocol(), None);
    }

    /// Transport failures are classified as connection errors; auth failures are not.
    #[tokio::test]
    async fn test_is_connection_error_detection() {
        let webdav = WebDAVService::new(create_test_config_without_protocol()).unwrap();

        // Messages that must be recognized as connection problems.
        let connection_messages = [
            "connection refused",
            "timeout occurred",
            "DNS resolution failed",
            "TLS handshake failed",
            "SSL certificate error",
        ];
        for message in connection_messages {
            let error = anyhow::anyhow!("{}", message);
            assert!(webdav.is_connection_error(&error), "Should detect '{}' as connection error", error);
        }

        // Messages that must NOT be recognized as connection problems.
        let other_messages = ["401 Unauthorized", "403 Forbidden", "invalid credentials"];
        for message in other_messages {
            let error = anyhow::anyhow!("{}", message);
            assert!(!webdav.is_connection_error(&error), "Should NOT detect '{}' as connection error", error);
        }
    }

    /// An empty server URL fails validation.
    #[tokio::test]
    async fn test_config_validation_rejects_empty_url() {
        let mut config = create_test_config_without_protocol();
        config.server_url = String::new();

        assert!(config.validate().is_err());
    }

    /// A URL with two stacked schemes fails validation.
    #[tokio::test]
    async fn test_config_validation_rejects_invalid_url() {
        let mut config = create_test_config_without_protocol();
        config.server_url = String::from("http://https://invalid");

        assert!(config.validate().is_err());
    }

    /// Every fallback URL is derived from the normalized (HTTPS) server URL.
    #[tokio::test]
    async fn test_webdav_fallback_urls_use_normalized_url() {
        let candidates = create_test_config_without_protocol().webdav_fallback_urls();

        for url in &candidates {
            assert!(url.starts_with("https://"), "Fallback URL should be normalized: {}", url);
        }
    }

    /// Configurations that already specify a scheme keep working unchanged.
    #[tokio::test]
    async fn test_backward_compatibility_with_existing_protocols() {
        let secure = WebDAVService::new(create_test_config_with_https()).unwrap();
        let plain = WebDAVService::new(create_test_config_with_http()).unwrap();

        assert_eq!(secure.get_effective_server_url(), "https://nas.example.com");
        assert_eq!(plain.get_effective_server_url(), "http://nas.example.com");
    }

    /// URLs built for individual paths start from the normalized (HTTPS) server URL.
    #[tokio::test]
    async fn test_url_construction_with_protocol_detection() {
        let webdav = WebDAVService::new(create_test_config_without_protocol()).unwrap();

        // Absolute, relative, root and empty paths.
        for path in ["/Documents/file.pdf", "Photos/image.jpg", "/", ""] {
            let url = webdav.get_url_for_path(path);
            assert!(url.starts_with("https://"), "URL should be normalized for path '{}': {}", path, url);
        }
    }
}
|
||||||
|
|
@ -14,6 +14,7 @@ use crate::models::{
|
||||||
WebDAVFolderInfo,
|
WebDAVFolderInfo,
|
||||||
};
|
};
|
||||||
use crate::webdav_xml_parser::{parse_propfind_response, parse_propfind_response_with_directories};
|
use crate::webdav_xml_parser::{parse_propfind_response, parse_propfind_response_with_directories};
|
||||||
|
use crate::mime_detection::{detect_mime_from_content, update_mime_type_with_content, MimeDetectionResult};
|
||||||
|
|
||||||
use super::{config::{WebDAVConfig, RetryConfig, ConcurrencyConfig}, SyncProgress};
|
use super::{config::{WebDAVConfig, RetryConfig, ConcurrencyConfig}, SyncProgress};
|
||||||
|
|
||||||
|
|
@ -24,6 +25,15 @@ pub struct WebDAVDiscoveryResult {
|
||||||
pub directories: Vec<FileIngestionInfo>,
|
pub directories: Vec<FileIngestionInfo>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Result of downloading a file with MIME type detection.
///
/// Bundles the downloaded bytes with the file's ingestion metadata and the outcome of
/// the MIME detection step.
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct WebDAVDownloadResult {
|
||||||
|
/// Downloaded bytes (presumably the complete file body; confirm against the download routine).
pub content: Vec<u8>,
|
||||||
|
/// Ingestion metadata describing the downloaded file.
pub file_info: FileIngestionInfo,
|
||||||
|
/// Outcome of MIME type detection for this download.
pub mime_detection: MimeDetectionResult,
|
||||||
|
/// NOTE(review): presumably `true` when detection changed the MIME type recorded in
/// `file_info`; confirm against the code that sets this flag.
pub mime_type_updated: bool,
|
||||||
|
}
|
||||||
|
|
||||||
/// Server capabilities information
|
/// Server capabilities information
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct ServerCapabilities {
|
pub struct ServerCapabilities {
|
||||||
|
|
@ -135,6 +145,8 @@ pub struct WebDAVService {
|
||||||
concurrency_config: ConcurrencyConfig,
|
concurrency_config: ConcurrencyConfig,
|
||||||
scan_semaphore: Arc<Semaphore>,
|
scan_semaphore: Arc<Semaphore>,
|
||||||
download_semaphore: Arc<Semaphore>,
|
download_semaphore: Arc<Semaphore>,
|
||||||
|
/// Stores the working protocol (updated after successful protocol detection)
|
||||||
|
working_protocol: Arc<std::sync::RwLock<Option<String>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl WebDAVService {
|
impl WebDAVService {
|
||||||
|
|
@ -173,9 +185,156 @@ impl WebDAVService {
|
||||||
concurrency_config,
|
concurrency_config,
|
||||||
scan_semaphore,
|
scan_semaphore,
|
||||||
download_semaphore,
|
download_semaphore,
|
||||||
|
working_protocol: Arc::new(std::sync::RwLock::new(None)),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Protocol Detection Methods
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
/// Detects the working protocol by trying HTTPS first, then HTTP
|
||||||
|
/// This method handles smart protocol detection for URLs without explicit protocols
|
||||||
|
async fn detect_working_protocol(&self) -> Result<String> {
|
||||||
|
info!("🔍 Starting smart protocol detection for: {}", self.config.server_url);
|
||||||
|
|
||||||
|
// If URL already has a protocol, use it directly
|
||||||
|
if self.config.server_url.starts_with("http://") || self.config.server_url.starts_with("https://") {
|
||||||
|
let protocol = if self.config.server_url.starts_with("https://") { "https" } else { "http" };
|
||||||
|
info!("✅ Protocol already specified: {}", protocol);
|
||||||
|
return Ok(protocol.to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try HTTPS first (more secure default)
|
||||||
|
let https_url = format!("https://{}", self.config.server_url.trim());
|
||||||
|
info!("🔐 Trying HTTPS first: {}", https_url);
|
||||||
|
|
||||||
|
match self.test_protocol_connection(&https_url).await {
|
||||||
|
Ok(()) => {
|
||||||
|
info!("✅ HTTPS connection successful");
|
||||||
|
// Store the working protocol for future use
|
||||||
|
if let Ok(mut working_protocol) = self.working_protocol.write() {
|
||||||
|
*working_protocol = Some("https".to_string());
|
||||||
|
}
|
||||||
|
return Ok("https".to_string());
|
||||||
|
}
|
||||||
|
Err(https_error) => {
|
||||||
|
warn!("❌ HTTPS connection failed: {}", https_error);
|
||||||
|
|
||||||
|
// Check if this is a connection-related error (not auth error)
|
||||||
|
if self.is_connection_error(&https_error) {
|
||||||
|
info!("🔄 HTTPS failed with connection error, trying HTTP fallback");
|
||||||
|
|
||||||
|
// Try HTTP fallback
|
||||||
|
let http_url = format!("http://{}", self.config.server_url.trim());
|
||||||
|
info!("🔓 Trying HTTP fallback: {}", http_url);
|
||||||
|
|
||||||
|
match self.test_protocol_connection(&http_url).await {
|
||||||
|
Ok(()) => {
|
||||||
|
warn!("⚠️ HTTP connection successful - consider configuring HTTPS for security");
|
||||||
|
// Store the working protocol for future use
|
||||||
|
if let Ok(mut working_protocol) = self.working_protocol.write() {
|
||||||
|
*working_protocol = Some("http".to_string());
|
||||||
|
}
|
||||||
|
return Ok("http".to_string());
|
||||||
|
}
|
||||||
|
Err(http_error) => {
|
||||||
|
error!("❌ Both HTTPS and HTTP failed");
|
||||||
|
error!(" HTTPS error: {}", https_error);
|
||||||
|
error!(" HTTP error: {}", http_error);
|
||||||
|
return Err(anyhow!(
|
||||||
|
"Protocol detection failed. Both HTTPS and HTTP connections failed. \
|
||||||
|
HTTPS error: {}. HTTP error: {}. \
|
||||||
|
Please verify the server URL and ensure WebDAV is properly configured.",
|
||||||
|
https_error, http_error
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Auth or other non-connection error with HTTPS - don't try HTTP
|
||||||
|
error!("❌ HTTPS failed with non-connection error (likely auth or server config): {}", https_error);
|
||||||
|
return Err(anyhow!(
|
||||||
|
"HTTPS connection failed with authentication or server configuration error: {}. \
|
||||||
|
Please check your credentials and server settings.",
|
||||||
|
https_error
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Tests connection with a specific protocol URL
|
||||||
|
async fn test_protocol_connection(&self, full_url: &str) -> Result<()> {
|
||||||
|
debug!("🧪 Testing protocol connection to: {}", full_url);
|
||||||
|
|
||||||
|
// Create a temporary config with the full URL for testing
|
||||||
|
let temp_config = WebDAVConfig {
|
||||||
|
server_url: full_url.to_string(),
|
||||||
|
username: self.config.username.clone(),
|
||||||
|
password: self.config.password.clone(),
|
||||||
|
watch_folders: self.config.watch_folders.clone(),
|
||||||
|
file_extensions: self.config.file_extensions.clone(),
|
||||||
|
timeout_seconds: self.config.timeout_seconds,
|
||||||
|
server_type: self.config.server_type.clone(),
|
||||||
|
};
|
||||||
|
|
||||||
|
// Test basic OPTIONS request
|
||||||
|
let webdav_url = temp_config.webdav_url();
|
||||||
|
debug!("📍 Testing WebDAV URL: {}", webdav_url);
|
||||||
|
|
||||||
|
let response = self.client
|
||||||
|
.request(Method::OPTIONS, &webdav_url)
|
||||||
|
.basic_auth(&self.config.username, Some(&self.config.password))
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(|e| anyhow!("Connection failed: {}", e))?;
|
||||||
|
|
||||||
|
if !response.status().is_success() {
|
||||||
|
return Err(anyhow!(
|
||||||
|
"Protocol test failed with status: {} - {}",
|
||||||
|
response.status(),
|
||||||
|
response.text().await.unwrap_or_default()
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
debug!("✅ Protocol connection test successful");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Determines if an error is connection-related (vs auth or other errors)
|
||||||
|
pub fn is_connection_error(&self, error: &anyhow::Error) -> bool {
|
||||||
|
let error_str = error.to_string().to_lowercase();
|
||||||
|
|
||||||
|
// Connection-related errors that suggest trying different protocol
|
||||||
|
error_str.contains("connection refused") ||
|
||||||
|
error_str.contains("timeout") ||
|
||||||
|
error_str.contains("dns") ||
|
||||||
|
error_str.contains("network") ||
|
||||||
|
error_str.contains("unreachable") ||
|
||||||
|
error_str.contains("tls") ||
|
||||||
|
error_str.contains("ssl") ||
|
||||||
|
error_str.contains("certificate") ||
|
||||||
|
error_str.contains("handshake")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets the currently working protocol (if detected)
|
||||||
|
pub fn get_working_protocol(&self) -> Option<String> {
|
||||||
|
self.working_protocol.read().ok().and_then(|p| p.clone())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets the effective server URL with the working protocol
|
||||||
|
pub fn get_effective_server_url(&self) -> String {
|
||||||
|
// If we have a detected working protocol, use it
|
||||||
|
if let Some(protocol) = self.get_working_protocol() {
|
||||||
|
if !self.config.server_url.starts_with("http://") && !self.config.server_url.starts_with("https://") {
|
||||||
|
return format!("{}://{}", protocol, self.config.server_url.trim());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Otherwise use the configured URL (normalized)
|
||||||
|
WebDAVConfig::normalize_server_url(&self.config.server_url)
|
||||||
|
}
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
// Connection and Testing Methods
|
// Connection and Testing Methods
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
|
|
@ -194,13 +353,31 @@ impl WebDAVService {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// Test basic connectivity with OPTIONS request
|
// Perform protocol detection if needed
|
||||||
|
let working_protocol = match self.detect_working_protocol().await {
|
||||||
|
Ok(protocol) => {
|
||||||
|
info!("✅ Protocol detection successful: {}", protocol);
|
||||||
|
protocol
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
error!("❌ Protocol detection failed: {}", e);
|
||||||
|
return Ok(WebDAVConnectionResult {
|
||||||
|
success: false,
|
||||||
|
message: format!("Protocol detection failed: {}", e),
|
||||||
|
server_version: None,
|
||||||
|
server_type: None,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Test basic connectivity with OPTIONS request using detected protocol
|
||||||
match self.test_options_request().await {
|
match self.test_options_request().await {
|
||||||
Ok((server_version, server_type)) => {
|
Ok((server_version, server_type)) => {
|
||||||
info!("✅ WebDAV connection successful");
|
let effective_url = self.get_effective_server_url();
|
||||||
|
info!("✅ WebDAV connection successful using {} ({})", working_protocol.to_uppercase(), effective_url);
|
||||||
Ok(WebDAVConnectionResult {
|
Ok(WebDAVConnectionResult {
|
||||||
success: true,
|
success: true,
|
||||||
message: "Connection successful".to_string(),
|
message: format!("Connection successful using {}", working_protocol.to_uppercase()),
|
||||||
server_version,
|
server_version,
|
||||||
server_type,
|
server_type,
|
||||||
})
|
})
|
||||||
|
|
@ -235,7 +412,18 @@ impl WebDAVService {
|
||||||
|
|
||||||
/// Performs OPTIONS request to test basic connectivity
|
/// Performs OPTIONS request to test basic connectivity
|
||||||
async fn test_options_request(&self) -> Result<(Option<String>, Option<String>)> {
|
async fn test_options_request(&self) -> Result<(Option<String>, Option<String>)> {
|
||||||
let webdav_url = self.config.webdav_url();
|
// Create a temporary config with the effective server URL for WebDAV operations
|
||||||
|
let effective_server_url = self.get_effective_server_url();
|
||||||
|
let temp_config = WebDAVConfig {
|
||||||
|
server_url: effective_server_url,
|
||||||
|
username: self.config.username.clone(),
|
||||||
|
password: self.config.password.clone(),
|
||||||
|
watch_folders: self.config.watch_folders.clone(),
|
||||||
|
file_extensions: self.config.file_extensions.clone(),
|
||||||
|
timeout_seconds: self.config.timeout_seconds,
|
||||||
|
server_type: self.config.server_type.clone(),
|
||||||
|
};
|
||||||
|
let webdav_url = temp_config.webdav_url();
|
||||||
|
|
||||||
let response = self.client
|
let response = self.client
|
||||||
.request(Method::OPTIONS, &webdav_url)
|
.request(Method::OPTIONS, &webdav_url)
|
||||||
|
|
@ -304,8 +492,9 @@ impl WebDAVService {
|
||||||
|
|
||||||
/// Tests for Nextcloud-specific capabilities
|
/// Tests for Nextcloud-specific capabilities
|
||||||
async fn test_nextcloud_capabilities(&self) -> Result<()> {
|
async fn test_nextcloud_capabilities(&self) -> Result<()> {
|
||||||
|
let effective_server_url = self.get_effective_server_url();
|
||||||
let capabilities_url = format!("{}/ocs/v1.php/cloud/capabilities",
|
let capabilities_url = format!("{}/ocs/v1.php/cloud/capabilities",
|
||||||
self.config.server_url.trim_end_matches('/'));
|
effective_server_url.trim_end_matches('/'));
|
||||||
|
|
||||||
let response = self.client
|
let response = self.client
|
||||||
.get(&capabilities_url)
|
.get(&capabilities_url)
|
||||||
|
|
@ -592,7 +781,18 @@ impl WebDAVService {
|
||||||
|
|
||||||
/// Gets the WebDAV URL for a specific path
|
/// Gets the WebDAV URL for a specific path
|
||||||
pub fn get_url_for_path(&self, path: &str) -> String {
|
pub fn get_url_for_path(&self, path: &str) -> String {
|
||||||
let base_url = self.config.webdav_url();
|
// Create a temporary config with the effective server URL
|
||||||
|
let effective_server_url = self.get_effective_server_url();
|
||||||
|
let temp_config = WebDAVConfig {
|
||||||
|
server_url: effective_server_url,
|
||||||
|
username: self.config.username.clone(),
|
||||||
|
password: self.config.password.clone(),
|
||||||
|
watch_folders: self.config.watch_folders.clone(),
|
||||||
|
file_extensions: self.config.file_extensions.clone(),
|
||||||
|
timeout_seconds: self.config.timeout_seconds,
|
||||||
|
server_type: self.config.server_type.clone(),
|
||||||
|
};
|
||||||
|
let base_url = temp_config.webdav_url();
|
||||||
let clean_path = path.trim_start_matches('/');
|
let clean_path = path.trim_start_matches('/');
|
||||||
|
|
||||||
let final_url = if clean_path.is_empty() {
|
let final_url = if clean_path.is_empty() {
|
||||||
|
|
@ -652,7 +852,18 @@ impl WebDAVService {
|
||||||
/// Convert file paths to the proper URL format for the server
|
/// Convert file paths to the proper URL format for the server
|
||||||
pub fn path_to_url(&self, relative_path: &str) -> String {
|
pub fn path_to_url(&self, relative_path: &str) -> String {
|
||||||
let clean_path = relative_path.trim_start_matches('/');
|
let clean_path = relative_path.trim_start_matches('/');
|
||||||
let base_url = self.config.webdav_url();
|
// Create a temporary config with the effective server URL
|
||||||
|
let effective_server_url = self.get_effective_server_url();
|
||||||
|
let temp_config = WebDAVConfig {
|
||||||
|
server_url: effective_server_url,
|
||||||
|
username: self.config.username.clone(),
|
||||||
|
password: self.config.password.clone(),
|
||||||
|
watch_folders: self.config.watch_folders.clone(),
|
||||||
|
file_extensions: self.config.file_extensions.clone(),
|
||||||
|
timeout_seconds: self.config.timeout_seconds,
|
||||||
|
server_type: self.config.server_type.clone(),
|
||||||
|
};
|
||||||
|
let base_url = temp_config.webdav_url();
|
||||||
|
|
||||||
if clean_path.is_empty() {
|
if clean_path.is_empty() {
|
||||||
base_url
|
base_url
|
||||||
|
|
@ -777,42 +988,64 @@ impl WebDAVService {
|
||||||
async fn discover_files_recursive(&self, directory_path: &str) -> Result<Vec<FileIngestionInfo>> {
|
async fn discover_files_recursive(&self, directory_path: &str) -> Result<Vec<FileIngestionInfo>> {
|
||||||
let mut all_files = Vec::new();
|
let mut all_files = Vec::new();
|
||||||
let mut directories_to_scan = vec![directory_path.to_string()];
|
let mut directories_to_scan = vec![directory_path.to_string()];
|
||||||
|
let mut scanned_directories = std::collections::HashSet::new();
|
||||||
let semaphore = Arc::new(Semaphore::new(self.concurrency_config.max_concurrent_scans));
|
let semaphore = Arc::new(Semaphore::new(self.concurrency_config.max_concurrent_scans));
|
||||||
|
|
||||||
|
debug!("Starting recursive file scan from: {}", directory_path);
|
||||||
|
|
||||||
while !directories_to_scan.is_empty() {
|
while !directories_to_scan.is_empty() {
|
||||||
let current_directories = directories_to_scan.clone();
|
// Take a batch of directories to process
|
||||||
directories_to_scan.clear();
|
let batch_size = std::cmp::min(directories_to_scan.len(), self.concurrency_config.max_concurrent_scans);
|
||||||
|
let current_batch: Vec<String> = directories_to_scan.drain(..batch_size).collect();
|
||||||
|
|
||||||
|
debug!("Processing batch of {} directories, {} remaining in queue",
|
||||||
|
current_batch.len(), directories_to_scan.len());
|
||||||
|
|
||||||
// Process directories concurrently
|
// Process directories concurrently
|
||||||
let tasks = current_directories.into_iter().map(|dir| {
|
let tasks = current_batch.into_iter().filter_map(|dir| {
|
||||||
|
// Skip if already scanned
|
||||||
|
if scanned_directories.contains(&dir) {
|
||||||
|
debug!("Skipping already scanned directory: {}", dir);
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
scanned_directories.insert(dir.clone());
|
||||||
|
|
||||||
let permit = semaphore.clone();
|
let permit = semaphore.clone();
|
||||||
let service = self.clone();
|
let service = self.clone();
|
||||||
|
|
||||||
async move {
|
Some(async move {
|
||||||
let _permit = permit.acquire().await.unwrap();
|
let _permit = permit.acquire().await.unwrap();
|
||||||
service.discover_files_and_directories_single(&dir).await
|
let result = service.discover_files_and_directories_single(&dir).await;
|
||||||
}
|
(dir, result)
|
||||||
|
})
|
||||||
});
|
});
|
||||||
|
|
||||||
let results = futures_util::future::join_all(tasks).await;
|
let results = futures_util::future::join_all(tasks).await;
|
||||||
|
|
||||||
for result in results {
|
for (scanned_dir, result) in results {
|
||||||
match result {
|
match result {
|
||||||
Ok(discovery_result) => {
|
Ok(discovery_result) => {
|
||||||
|
debug!("Directory '{}' scan complete: {} files, {} subdirectories",
|
||||||
|
scanned_dir, discovery_result.files.len(), discovery_result.directories.len());
|
||||||
|
|
||||||
all_files.extend(discovery_result.files);
|
all_files.extend(discovery_result.files);
|
||||||
|
|
||||||
// Add subdirectories to the queue for the next iteration
|
// Add subdirectories to the queue for the next iteration
|
||||||
for dir in discovery_result.directories {
|
for dir in discovery_result.directories {
|
||||||
if dir.is_directory {
|
if dir.is_directory && !scanned_directories.contains(&dir.relative_path) {
|
||||||
directories_to_scan.push(dir.relative_path);
|
directories_to_scan.push(dir.relative_path.clone());
|
||||||
|
debug!("Added subdirectory to scan queue: {}", dir.relative_path);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
warn!("Failed to scan directory: {}", e);
|
warn!("Failed to scan directory '{}': {}", scanned_dir, e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
debug!("Batch complete. Total files found: {}. Queue size: {}",
|
||||||
|
all_files.len(), directories_to_scan.len());
|
||||||
}
|
}
|
||||||
|
|
||||||
info!("Recursive scan completed. Found {} files total", all_files.len());
|
info!("Recursive scan completed. Found {} files total", all_files.len());
|
||||||
|
|
@ -908,12 +1141,19 @@ impl WebDAVService {
|
||||||
let body = response.text().await?;
|
let body = response.text().await?;
|
||||||
let all_items = parse_propfind_response_with_directories(&body)?;
|
let all_items = parse_propfind_response_with_directories(&body)?;
|
||||||
|
|
||||||
|
// Process the items to convert href to relative paths
|
||||||
|
let processed_items = self.process_file_infos(all_items);
|
||||||
|
|
||||||
// Separate files and directories, excluding the parent directory itself
|
// Separate files and directories, excluding the parent directory itself
|
||||||
let mut files = Vec::new();
|
let mut files = Vec::new();
|
||||||
let mut directories = Vec::new();
|
let mut directories = Vec::new();
|
||||||
|
|
||||||
for item in all_items {
|
for item in processed_items {
|
||||||
if item.relative_path == directory_path {
|
// Skip the directory itself (handle both with and without trailing slash)
|
||||||
|
let normalized_item_path = item.relative_path.trim_end_matches('/');
|
||||||
|
let normalized_directory_path = directory_path.trim_end_matches('/');
|
||||||
|
|
||||||
|
if normalized_item_path == normalized_directory_path {
|
||||||
continue; // Skip the directory itself
|
continue; // Skip the directory itself
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -933,41 +1173,69 @@ impl WebDAVService {
|
||||||
let mut all_files = Vec::new();
|
let mut all_files = Vec::new();
|
||||||
let mut all_directories = Vec::new();
|
let mut all_directories = Vec::new();
|
||||||
let mut directories_to_scan = vec![directory_path.to_string()];
|
let mut directories_to_scan = vec![directory_path.to_string()];
|
||||||
|
let mut scanned_directories = std::collections::HashSet::new();
|
||||||
let semaphore = Arc::new(Semaphore::new(self.concurrency_config.max_concurrent_scans));
|
let semaphore = Arc::new(Semaphore::new(self.concurrency_config.max_concurrent_scans));
|
||||||
|
|
||||||
|
debug!("Starting recursive scan from: {}", directory_path);
|
||||||
|
|
||||||
while !directories_to_scan.is_empty() {
|
while !directories_to_scan.is_empty() {
|
||||||
let current_directories = directories_to_scan.clone();
|
// Take a batch of directories to process (limit batch size for better progress tracking)
|
||||||
directories_to_scan.clear();
|
let batch_size = std::cmp::min(directories_to_scan.len(), self.concurrency_config.max_concurrent_scans);
|
||||||
|
let current_batch: Vec<String> = directories_to_scan.drain(..batch_size).collect();
|
||||||
|
|
||||||
|
debug!("Processing batch of {} directories, {} remaining in queue",
|
||||||
|
current_batch.len(), directories_to_scan.len());
|
||||||
|
|
||||||
// Process directories concurrently
|
// Process directories concurrently
|
||||||
let tasks = current_directories.into_iter().map(|dir| {
|
let tasks = current_batch.into_iter().filter_map(|dir| {
|
||||||
|
// Skip if already scanned (prevent infinite loops)
|
||||||
|
if scanned_directories.contains(&dir) {
|
||||||
|
debug!("Skipping already scanned directory: {}", dir);
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
scanned_directories.insert(dir.clone());
|
||||||
|
|
||||||
let permit = semaphore.clone();
|
let permit = semaphore.clone();
|
||||||
let service = self.clone();
|
let service = self.clone();
|
||||||
|
|
||||||
async move {
|
Some(async move {
|
||||||
let _permit = permit.acquire().await.unwrap();
|
let _permit = permit.acquire().await.unwrap();
|
||||||
service.discover_files_and_directories_single(&dir).await
|
let result = service.discover_files_and_directories_single(&dir).await;
|
||||||
}
|
(dir, result)
|
||||||
|
})
|
||||||
});
|
});
|
||||||
|
|
||||||
let results = futures_util::future::join_all(tasks).await;
|
let results = futures_util::future::join_all(tasks).await;
|
||||||
|
|
||||||
for result in results {
|
for (scanned_dir, result) in results {
|
||||||
match result {
|
match result {
|
||||||
Ok(discovery_result) => {
|
Ok(discovery_result) => {
|
||||||
|
debug!("Directory '{}' scan complete: {} files, {} subdirectories",
|
||||||
|
scanned_dir, discovery_result.files.len(), discovery_result.directories.len());
|
||||||
|
|
||||||
all_files.extend(discovery_result.files);
|
all_files.extend(discovery_result.files);
|
||||||
|
|
||||||
// Add directories to our results and to the scan queue
|
// Add directories to our results and to the scan queue
|
||||||
for dir in discovery_result.directories {
|
for dir in discovery_result.directories {
|
||||||
directories_to_scan.push(dir.relative_path.clone());
|
// Only add to scan queue if not already scanned
|
||||||
|
if !scanned_directories.contains(&dir.relative_path) {
|
||||||
|
directories_to_scan.push(dir.relative_path.clone());
|
||||||
|
debug!("Added subdirectory to scan queue: {} (scanned set size: {})",
|
||||||
|
dir.relative_path, scanned_directories.len());
|
||||||
|
} else {
|
||||||
|
debug!("Skipping already scanned directory: {} (already in scanned set)", dir.relative_path);
|
||||||
|
}
|
||||||
all_directories.push(dir);
|
all_directories.push(dir);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
warn!("Failed to scan directory: {}", e);
|
warn!("Failed to scan directory '{}': {}", scanned_dir, e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
debug!("Batch complete. Total progress: {} files, {} directories found. Queue size: {}",
|
||||||
|
all_files.len(), all_directories.len(), directories_to_scan.len());
|
||||||
}
|
}
|
||||||
|
|
||||||
info!("Recursive scan completed. Found {} files and {} directories", all_files.len(), all_directories.len());
|
info!("Recursive scan completed. Found {} files and {} directories", all_files.len(), all_directories.len());
|
||||||
|
|
@ -1172,6 +1440,131 @@ impl WebDAVService {
|
||||||
Ok(results)
|
Ok(results)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Downloads a file with enhanced MIME type detection based on content
|
||||||
|
///
|
||||||
|
/// This method downloads the file and performs content-based MIME type detection
|
||||||
|
/// using magic bytes, providing more accurate type identification than the initial
|
||||||
|
/// discovery phase which only has access to filenames and server-provided types.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
/// * `file_info` - The file information from WebDAV discovery
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
/// A `WebDAVDownloadResult` containing the file content, updated file info, and MIME detection details
|
||||||
|
pub async fn download_file_with_mime_detection(&self, file_info: &FileIngestionInfo) -> Result<WebDAVDownloadResult> {
|
||||||
|
let _permit = self.download_semaphore.acquire().await?;
|
||||||
|
|
||||||
|
debug!("⬇️🔍 Downloading file with MIME detection: {}", file_info.relative_path);
|
||||||
|
|
||||||
|
// Use the relative path directly since it's already processed
|
||||||
|
let relative_path = &file_info.relative_path;
|
||||||
|
let url = self.get_url_for_path(&relative_path);
|
||||||
|
|
||||||
|
let response = self.authenticated_request(
|
||||||
|
reqwest::Method::GET,
|
||||||
|
&url,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
).await?;
|
||||||
|
|
||||||
|
if !response.status().is_success() {
|
||||||
|
return Err(anyhow!(
|
||||||
|
"Failed to download file '{}': HTTP {}",
|
||||||
|
file_info.relative_path,
|
||||||
|
response.status()
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get server-provided content type from response headers
|
||||||
|
let server_content_type = response
|
||||||
|
.headers()
|
||||||
|
.get("content-type")
|
||||||
|
.and_then(|header| header.to_str().ok())
|
||||||
|
.map(|s| s.split(';').next().unwrap_or(s).trim().to_string()); // Remove charset info and convert to owned
|
||||||
|
|
||||||
|
let content = response.bytes().await?;
|
||||||
|
debug!("✅ Downloaded {} bytes for file: {}", content.len(), file_info.relative_path);
|
||||||
|
|
||||||
|
// Perform content-based MIME type detection
|
||||||
|
let mime_detection_result = detect_mime_from_content(
|
||||||
|
&content,
|
||||||
|
&file_info.name,
|
||||||
|
server_content_type.as_deref()
|
||||||
|
);
|
||||||
|
|
||||||
|
// Check if MIME type should be updated
|
||||||
|
let mime_type_updated = mime_detection_result.mime_type != file_info.mime_type;
|
||||||
|
|
||||||
|
if mime_type_updated {
|
||||||
|
debug!("🔄 MIME type updated for {}: '{}' -> '{}' (method: {:?}, confidence: {:?})",
|
||||||
|
file_info.name,
|
||||||
|
file_info.mime_type,
|
||||||
|
mime_detection_result.mime_type,
|
||||||
|
mime_detection_result.detection_method,
|
||||||
|
mime_detection_result.confidence);
|
||||||
|
} else {
|
||||||
|
debug!("✅ MIME type confirmed for {}: '{}' (method: {:?}, confidence: {:?})",
|
||||||
|
file_info.name,
|
||||||
|
mime_detection_result.mime_type,
|
||||||
|
mime_detection_result.detection_method,
|
||||||
|
mime_detection_result.confidence);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create updated file info if MIME type changed
|
||||||
|
let updated_file_info = if mime_type_updated {
|
||||||
|
let mut updated = file_info.clone();
|
||||||
|
updated.mime_type = mime_detection_result.mime_type.clone();
|
||||||
|
updated
|
||||||
|
} else {
|
||||||
|
file_info.clone()
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(WebDAVDownloadResult {
|
||||||
|
content: content.to_vec(),
|
||||||
|
file_info: updated_file_info,
|
||||||
|
mime_detection: mime_detection_result,
|
||||||
|
mime_type_updated,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Downloads multiple files with MIME type detection concurrently
|
||||||
|
///
|
||||||
|
/// Similar to `download_files` but includes content-based MIME type detection
|
||||||
|
/// for each downloaded file.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
/// * `files` - The files to download
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
/// A vector of tuples containing the original file info and download result
|
||||||
|
pub async fn download_files_with_mime_detection(&self, files: &[FileIngestionInfo]) -> Result<Vec<(FileIngestionInfo, Result<WebDAVDownloadResult>)>> {
|
||||||
|
info!("⬇️🔍 Downloading {} files with MIME detection concurrently", files.len());
|
||||||
|
|
||||||
|
let tasks = files.iter().map(|file| {
|
||||||
|
let file_clone = file.clone();
|
||||||
|
let service_clone = self.clone();
|
||||||
|
|
||||||
|
async move {
|
||||||
|
let result = service_clone.download_file_with_mime_detection(&file_clone).await;
|
||||||
|
(file_clone, result)
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
let results = futures_util::future::join_all(tasks).await;
|
||||||
|
|
||||||
|
let success_count = results.iter().filter(|(_, result)| result.is_ok()).count();
|
||||||
|
let failure_count = results.len() - success_count;
|
||||||
|
let mime_updated_count = results.iter()
|
||||||
|
.filter_map(|(_, result)| result.as_ref().ok())
|
||||||
|
.filter(|download_result| download_result.mime_type_updated)
|
||||||
|
.count();
|
||||||
|
|
||||||
|
info!("📊 Download with MIME detection completed: {} successful, {} failed, {} MIME types updated",
|
||||||
|
success_count, failure_count, mime_updated_count);
|
||||||
|
|
||||||
|
Ok(results)
|
||||||
|
}
|
||||||
|
|
||||||
/// Gets file metadata without downloading content
|
/// Gets file metadata without downloading content
|
||||||
pub async fn get_file_metadata(&self, file_path: &str) -> Result<FileIngestionInfo> {
|
pub async fn get_file_metadata(&self, file_path: &str) -> Result<FileIngestionInfo> {
|
||||||
debug!("📋 Getting metadata for file: {}", file_path);
|
debug!("📋 Getting metadata for file: {}", file_path);
|
||||||
|
|
@ -1226,9 +1619,21 @@ impl WebDAVService {
|
||||||
pub async fn get_server_capabilities(&self) -> Result<ServerCapabilities> {
|
pub async fn get_server_capabilities(&self) -> Result<ServerCapabilities> {
|
||||||
debug!("🔍 Checking server capabilities");
|
debug!("🔍 Checking server capabilities");
|
||||||
|
|
||||||
|
// Create a temporary config with the effective server URL
|
||||||
|
let effective_server_url = self.get_effective_server_url();
|
||||||
|
let temp_config = WebDAVConfig {
|
||||||
|
server_url: effective_server_url,
|
||||||
|
username: self.config.username.clone(),
|
||||||
|
password: self.config.password.clone(),
|
||||||
|
watch_folders: self.config.watch_folders.clone(),
|
||||||
|
file_extensions: self.config.file_extensions.clone(),
|
||||||
|
timeout_seconds: self.config.timeout_seconds,
|
||||||
|
server_type: self.config.server_type.clone(),
|
||||||
|
};
|
||||||
|
|
||||||
let options_response = self.authenticated_request(
|
let options_response = self.authenticated_request(
|
||||||
reqwest::Method::OPTIONS,
|
reqwest::Method::OPTIONS,
|
||||||
&self.config.webdav_url(),
|
&temp_config.webdav_url(),
|
||||||
None,
|
None,
|
||||||
None,
|
None,
|
||||||
).await?;
|
).await?;
|
||||||
|
|
@ -1550,6 +1955,7 @@ impl WebDAVService {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Implement Clone to allow sharing the service
|
// Implement Clone to allow sharing the service
|
||||||
impl Clone for WebDAVService {
|
impl Clone for WebDAVService {
|
||||||
fn clone(&self) -> Self {
|
fn clone(&self) -> Self {
|
||||||
|
|
@ -1560,6 +1966,7 @@ impl Clone for WebDAVService {
|
||||||
concurrency_config: self.concurrency_config.clone(),
|
concurrency_config: self.concurrency_config.clone(),
|
||||||
scan_semaphore: Arc::clone(&self.scan_semaphore),
|
scan_semaphore: Arc::clone(&self.scan_semaphore),
|
||||||
download_semaphore: Arc::clone(&self.download_semaphore),
|
download_semaphore: Arc::clone(&self.download_semaphore),
|
||||||
|
working_protocol: Arc::clone(&self.working_protocol),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,2 +1,3 @@
|
||||||
pub mod etag_comparison_tests;
|
pub mod etag_comparison_tests;
|
||||||
pub mod deletion_detection_tests;
|
pub mod deletion_detection_tests;
|
||||||
|
pub mod path_processing_tests;
|
||||||
|
|
@ -0,0 +1,452 @@
|
||||||
|
#[cfg(test)]
|
||||||
|
mod path_processing_tests {
|
||||||
|
use crate::models::FileIngestionInfo;
|
||||||
|
use crate::services::webdav::{WebDAVConfig, WebDAVService};
|
||||||
|
use crate::webdav_xml_parser::parse_propfind_response_with_directories;
|
||||||
|
use wiremock::{
|
||||||
|
matchers::{method, path, header},
|
||||||
|
Mock, MockServer, ResponseTemplate,
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Creates a test WebDAV service with mock server
|
||||||
|
fn create_test_service(mock_server_url: &str) -> WebDAVService {
|
||||||
|
let config = WebDAVConfig {
|
||||||
|
server_url: mock_server_url.to_string(),
|
||||||
|
username: "testuser".to_string(),
|
||||||
|
password: "testpass".to_string(),
|
||||||
|
watch_folders: vec!["/TestDocuments".to_string()],
|
||||||
|
file_extensions: vec!["pdf".to_string(), "txt".to_string()],
|
||||||
|
timeout_seconds: 30,
|
||||||
|
server_type: Some("nextcloud".to_string()),
|
||||||
|
};
|
||||||
|
WebDAVService::new(config).expect("Failed to create test service")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Canned PROPFIND multistatus body for `/TestDocuments`.
///
/// Contains four `<d:response>` entries: the `TestDocuments` collection itself,
/// the `SubDir1` and `SubDir2` collections, and the file `test.pdf`.
fn mock_propfind_response() -> String {
    String::from(r#"<?xml version="1.0"?>
<d:multistatus xmlns:d="DAV:" xmlns:s="http://sabredav.org/ns" xmlns:oc="http://owncloud.org/ns" xmlns:nc="http://nextcloud.org/ns">
    <d:response>
        <d:href>/remote.php/dav/files/testuser/TestDocuments/</d:href>
        <d:propstat>
            <d:prop>
                <d:displayname>TestDocuments</d:displayname>
                <d:getlastmodified>Tue, 29 Jul 2025 01:34:17 GMT</d:getlastmodified>
                <d:getetag>"parent123etag"</d:getetag>
                <d:resourcetype><d:collection/></d:resourcetype>
            </d:prop>
            <d:status>HTTP/1.1 200 OK</d:status>
        </d:propstat>
    </d:response>
    <d:response>
        <d:href>/remote.php/dav/files/testuser/TestDocuments/SubDir1/</d:href>
        <d:propstat>
            <d:prop>
                <d:displayname>SubDir1</d:displayname>
                <d:getlastmodified>Fri, 20 Jun 2025 23:35:17 GMT</d:getlastmodified>
                <d:getetag>"subdir1etag"</d:getetag>
                <d:resourcetype><d:collection/></d:resourcetype>
            </d:prop>
            <d:status>HTTP/1.1 200 OK</d:status>
        </d:propstat>
    </d:response>
    <d:response>
        <d:href>/remote.php/dav/files/testuser/TestDocuments/SubDir2/</d:href>
        <d:propstat>
            <d:prop>
                <d:displayname>SubDir2</d:displayname>
                <d:getlastmodified>Tue, 29 Jul 2025 01:34:17 GMT</d:getlastmodified>
                <d:getetag>"subdir2etag"</d:getetag>
                <d:resourcetype><d:collection/></d:resourcetype>
            </d:prop>
            <d:status>HTTP/1.1 200 OK</d:status>
        </d:propstat>
    </d:response>
    <d:response>
        <d:href>/remote.php/dav/files/testuser/TestDocuments/test.pdf</d:href>
        <d:propstat>
            <d:prop>
                <d:displayname>test.pdf</d:displayname>
                <d:getlastmodified>Thu, 24 Jul 2025 19:16:19 GMT</d:getlastmodified>
                <d:getetag>"fileetag123"</d:getetag>
                <d:getcontentlength>1234567</d:getcontentlength>
                <d:resourcetype/>
            </d:prop>
            <d:status>HTTP/1.1 200 OK</d:status>
        </d:propstat>
    </d:response>
</d:multistatus>"#)
}
|
||||||
|
|
||||||
|
/// Canned PROPFIND multistatus body for a directory without children.
///
/// The only `<d:response>` entry is the `SubDir1` collection itself.
fn mock_empty_directory_response() -> String {
    String::from(r#"<?xml version="1.0"?>
<d:multistatus xmlns:d="DAV:" xmlns:s="http://sabredav.org/ns" xmlns:oc="http://owncloud.org/ns" xmlns:nc="http://nextcloud.org/ns">
    <d:response>
        <d:href>/remote.php/dav/files/testuser/TestDocuments/SubDir1/</d:href>
        <d:propstat>
            <d:prop>
                <d:displayname>SubDir1</d:displayname>
                <d:getlastmodified>Fri, 20 Jun 2025 23:35:17 GMT</d:getlastmodified>
                <d:getetag>"subdir1etag"</d:getetag>
                <d:resourcetype><d:collection/></d:resourcetype>
            </d:prop>
            <d:status>HTTP/1.1 200 OK</d:status>
        </d:propstat>
    </d:response>
</d:multistatus>"#)
}
|
||||||
|
|
||||||
|
/// Documents the parser contract: every parsed item carries the placeholder
/// `relative_path` "TEMP", and the sample listing yields 3 directories and 1 file.
#[test]
fn test_xml_parser_returns_temp_paths() {
    let parsed_items = parse_propfind_response_with_directories(&mock_propfind_response())
        .expect("Failed to parse XML response");

    // The parser leaves path resolution to the discovery layer.
    parsed_items.iter().for_each(|item| {
        assert_eq!(item.relative_path, "TEMP",
            "XML parser should set relative_path to TEMP for processing by discovery layer");
    });

    assert_eq!(parsed_items.len(), 4, "Should parse all 4 items from XML");

    // Both kinds of entries must be present.
    let directory_count = parsed_items.iter().filter(|item| item.is_directory).count();
    let file_count = parsed_items.iter().filter(|item| !item.is_directory).count();

    assert_eq!(directory_count, 3, "Should find 3 directories");
    assert_eq!(file_count, 1, "Should find 1 file");
}
|
||||||
|
|
||||||
|
/// `process_file_infos` must replace the "TEMP" placeholder with a path derived
/// from the href while leaving `full_path` untouched.
#[test]
fn test_path_processing_converts_temp_to_relative_paths() {
    let service = create_test_service("http://test.example.com");

    // Directory entry shaped like raw XML parser output: placeholder relative path, href in both path fields.
    let temp_directory = |href: &str, name: &str, etag: &str| FileIngestionInfo {
        relative_path: "TEMP".to_string(),
        full_path: href.to_string(),
        #[allow(deprecated)]
        path: href.to_string(),
        name: name.to_string(),
        size: 0,
        mime_type: "application/octet-stream".to_string(),
        last_modified: None,
        etag: etag.to_string(),
        is_directory: true,
        created_at: None,
        permissions: None,
        owner: None,
        group: None,
        metadata: None,
    };

    let mock_items = vec![
        temp_directory("/remote.php/dav/files/testuser/TestDocuments/", "TestDocuments", "parent123etag"),
        temp_directory("/remote.php/dav/files/testuser/TestDocuments/SubDir1/", "SubDir1", "subdir1etag"),
    ];

    let processed_items = service.process_file_infos(mock_items);

    // Relative paths are derived from the hrefs.
    assert_eq!(processed_items[0].relative_path, "/TestDocuments/");
    assert_eq!(processed_items[1].relative_path, "/TestDocuments/SubDir1/");

    // The original hrefs survive in full_path.
    assert_eq!(processed_items[0].full_path, "/remote.php/dav/files/testuser/TestDocuments/");
    assert_eq!(processed_items[1].full_path, "/remote.php/dav/files/testuser/TestDocuments/SubDir1/");
}
|
||||||
|
|
||||||
|
/// Re-creates the filtering step described for
/// `discover_files_and_directories_single_with_url`: the listed directory itself
/// is dropped (with or without a trailing slash) and its subdirectory is kept.
#[test]
fn test_directory_filtering_excludes_parent() {
    // Already-processed directory entry: relative path in `relative_path`/`path`, href in `full_path`.
    let processed_directory = |relative: &str, href: &str, name: &str, etag: &str| FileIngestionInfo {
        relative_path: relative.to_string(),
        full_path: href.to_string(),
        #[allow(deprecated)]
        path: relative.to_string(),
        name: name.to_string(),
        size: 0,
        mime_type: "application/octet-stream".to_string(),
        last_modified: None,
        etag: etag.to_string(),
        is_directory: true,
        created_at: None,
        permissions: None,
        owner: None,
        group: None,
        metadata: None,
    };

    let processed_items = vec![
        processed_directory(
            "/TestDocuments/",
            "/remote.php/dav/files/testuser/TestDocuments/",
            "TestDocuments",
            "parent123etag",
        ),
        processed_directory(
            "/TestDocuments/SubDir1/",
            "/remote.php/dav/files/testuser/TestDocuments/SubDir1/",
            "SubDir1",
            "subdir1etag",
        ),
    ];

    let directory_path = "/TestDocuments";
    let listed_directory = directory_path.trim_end_matches('/');

    // Drop the listed directory itself, then split the rest by kind.
    let (directories, files): (Vec<FileIngestionInfo>, Vec<FileIngestionInfo>) = processed_items
        .into_iter()
        .filter(|item| item.relative_path.trim_end_matches('/') != listed_directory)
        .partition(|item| item.is_directory);

    // Parent excluded, subdirectory retained.
    assert_eq!(files.len(), 0);
    assert_eq!(directories.len(), 1);
    assert_eq!(directories[0].relative_path, "/TestDocuments/SubDir1/");
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_single_directory_discovery_integration() {
|
||||||
|
let mock_server = MockServer::start().await;
|
||||||
|
|
||||||
|
// Mock the PROPFIND request
|
||||||
|
Mock::given(method("PROPFIND"))
|
||||||
|
.and(path("/remote.php/dav/files/testuser/TestDocuments"))
|
||||||
|
.and(header("depth", "1"))
|
||||||
|
.and(header("content-type", "application/xml"))
|
||||||
|
.respond_with(
|
||||||
|
ResponseTemplate::new(207)
|
||||||
|
.set_body_string(mock_propfind_response())
|
||||||
|
.insert_header("content-type", "application/xml")
|
||||||
|
)
|
||||||
|
.mount(&mock_server)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
let service = create_test_service(&mock_server.uri());
|
||||||
|
|
||||||
|
// Test single directory discovery
|
||||||
|
let result = service.discover_files_and_directories("/TestDocuments", false).await
|
||||||
|
.expect("Single directory discovery should succeed");
|
||||||
|
|
||||||
|
// Verify results
|
||||||
|
assert_eq!(result.files.len(), 1, "Should find 1 file");
|
||||||
|
assert_eq!(result.directories.len(), 2, "Should find 2 directories (excluding parent)");
|
||||||
|
|
||||||
|
// Verify directory paths are correct (not TEMP)
|
||||||
|
let dir_paths: Vec<&String> = result.directories.iter().map(|d| &d.relative_path).collect();
|
||||||
|
assert!(dir_paths.contains(&&"/TestDocuments/SubDir1/".to_string()));
|
||||||
|
assert!(dir_paths.contains(&&"/TestDocuments/SubDir2/".to_string()));
|
||||||
|
|
||||||
|
// Verify no directory has TEMP path
|
||||||
|
for dir in &result.directories {
|
||||||
|
assert_ne!(dir.relative_path, "TEMP", "Directory path should not be TEMP");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify file path is correct
|
||||||
|
assert_eq!(result.files[0].relative_path, "/TestDocuments/test.pdf");
|
||||||
|
assert_ne!(result.files[0].relative_path, "TEMP", "File path should not be TEMP");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_recursive_directory_discovery_integration() {
|
||||||
|
let mock_server = MockServer::start().await;
|
||||||
|
|
||||||
|
// Mock the initial PROPFIND request for root directory
|
||||||
|
Mock::given(method("PROPFIND"))
|
||||||
|
.and(path("/remote.php/dav/files/testuser/TestDocuments"))
|
||||||
|
.and(header("depth", "1"))
|
||||||
|
.and(header("content-type", "application/xml"))
|
||||||
|
.respond_with(
|
||||||
|
ResponseTemplate::new(207)
|
||||||
|
.set_body_string(mock_propfind_response())
|
||||||
|
.insert_header("content-type", "application/xml")
|
||||||
|
)
|
||||||
|
.mount(&mock_server)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
// Mock PROPFIND requests for subdirectories (return empty for simplicity)
|
||||||
|
Mock::given(method("PROPFIND"))
|
||||||
|
.and(path("/remote.php/dav/files/testuser/TestDocuments/SubDir1"))
|
||||||
|
.and(header("depth", "1"))
|
||||||
|
.and(header("content-type", "application/xml"))
|
||||||
|
.respond_with(
|
||||||
|
ResponseTemplate::new(207)
|
||||||
|
.set_body_string(mock_empty_directory_response())
|
||||||
|
.insert_header("content-type", "application/xml")
|
||||||
|
)
|
||||||
|
.mount(&mock_server)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
Mock::given(method("PROPFIND"))
|
||||||
|
.and(path("/remote.php/dav/files/testuser/TestDocuments/SubDir2"))
|
||||||
|
.and(header("depth", "1"))
|
||||||
|
.and(header("content-type", "application/xml"))
|
||||||
|
.respond_with(
|
||||||
|
ResponseTemplate::new(207)
|
||||||
|
.set_body_string(mock_empty_directory_response())
|
||||||
|
.insert_header("content-type", "application/xml")
|
||||||
|
)
|
||||||
|
.mount(&mock_server)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
let service = create_test_service(&mock_server.uri());
|
||||||
|
|
||||||
|
// Test recursive directory discovery
|
||||||
|
let result = service.discover_files_and_directories("/TestDocuments", true).await
|
||||||
|
.expect("Recursive directory discovery should succeed");
|
||||||
|
|
||||||
|
// Verify results
|
||||||
|
assert_eq!(result.files.len(), 1, "Should find 1 file");
|
||||||
|
assert_eq!(result.directories.len(), 2, "Should find 2 directories (excluding parents)");
|
||||||
|
|
||||||
|
// Verify no paths are TEMP
|
||||||
|
for item in result.files.iter().chain(result.directories.iter()) {
|
||||||
|
assert_ne!(item.relative_path, "TEMP", "Paths should be processed, not TEMP");
|
||||||
|
assert!(item.relative_path.starts_with("/TestDocuments"),
|
||||||
|
"All paths should start with /TestDocuments, got: {}", item.relative_path);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `href_to_relative_path` strips the Nextcloud DAV prefix for the test user and
/// keeps the remainder, including trailing slashes.
#[test]
fn test_href_to_relative_path_conversion() {
    let service = create_test_service("http://test.example.com");

    // (href as returned by the server, expected relative path)
    let cases = [
        ("/remote.php/dav/files/testuser/Documents/file.pdf", "/Documents/file.pdf"),
        ("/remote.php/dav/files/testuser/", "/"),
        ("/remote.php/dav/files/testuser/Deep/Nested/Path/", "/Deep/Nested/Path/"),
    ];

    for (href, expected) in cases.iter() {
        assert_eq!(service.href_to_relative_path(*href), *expected);
    }
}
|
||||||
|
|
||||||
|
/// `get_url_for_path` prepends the server URL and the Nextcloud DAV prefix for
/// the test user; the root path maps to the prefix without a trailing slash.
#[test]
fn test_url_construction() {
    let service = create_test_service("http://test.example.com");

    // (relative path, expected absolute URL)
    let cases = [
        ("/TestDocuments", "http://test.example.com/remote.php/dav/files/testuser/TestDocuments"),
        ("/TestDocuments/SubDir", "http://test.example.com/remote.php/dav/files/testuser/TestDocuments/SubDir"),
        ("/", "http://test.example.com/remote.php/dav/files/testuser"),
    ];

    for (relative, expected_url) in cases.iter() {
        assert_eq!(service.get_url_for_path(*relative), *expected_url);
    }
}
|
||||||
|
|
||||||
|
/// Regression guard: an item straight from the XML parser (placeholder
/// `relative_path` "TEMP") must come out of `process_file_infos` with a path
/// derived from its href.
#[test]
fn test_regression_temp_paths_are_processed() {
    let service = create_test_service("http://test.example.com");

    let folder_href = "/remote.php/dav/files/testuser/TestDocuments/ImportantFolder/";

    // Exactly the shape that triggered the original bug.
    let unprocessed_folder = FileIngestionInfo {
        relative_path: "TEMP".to_string(), // what the XML parser emits
        full_path: folder_href.to_string(),
        #[allow(deprecated)]
        path: folder_href.to_string(),
        name: "ImportantFolder".to_string(),
        size: 0,
        mime_type: "application/octet-stream".to_string(),
        last_modified: None,
        etag: "folder123etag".to_string(),
        is_directory: true,
        created_at: None,
        permissions: None,
        owner: None,
        group: None,
        metadata: None,
    };

    let processed_items = service.process_file_infos(vec![unprocessed_folder]);

    assert_eq!(processed_items.len(), 1);

    let processed_folder = &processed_items[0];
    assert_ne!(processed_folder.relative_path, "TEMP",
        "REGRESSION: relative_path should not remain as TEMP after processing");
    assert_eq!(processed_folder.relative_path, "/TestDocuments/ImportantFolder/",
        "relative_path should be properly converted from href");
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_discover_files_and_directories_processes_paths() {
|
||||||
|
// Integration test to ensure discover_files_and_directories always processes paths
|
||||||
|
let mock_server = MockServer::start().await;
|
||||||
|
|
||||||
|
Mock::given(method("PROPFIND"))
|
||||||
|
.and(path("/remote.php/dav/files/testuser/TestDocuments"))
|
||||||
|
.respond_with(
|
||||||
|
ResponseTemplate::new(207)
|
||||||
|
.set_body_string(mock_propfind_response())
|
||||||
|
.insert_header("content-type", "application/xml")
|
||||||
|
)
|
||||||
|
.mount(&mock_server)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
let service = create_test_service(&mock_server.uri());
|
||||||
|
|
||||||
|
let result = service.discover_files_and_directories("/TestDocuments", false).await
|
||||||
|
.expect("Discovery should succeed");
|
||||||
|
|
||||||
|
// Ensure no items have TEMP paths (regression test)
|
||||||
|
for item in result.files.iter().chain(result.directories.iter()) {
|
||||||
|
assert_ne!(item.relative_path, "TEMP",
|
||||||
|
"REGRESSION: No items should have TEMP paths after discovery");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -6,6 +6,7 @@ use std::str;
|
||||||
use serde_json;
|
use serde_json;
|
||||||
|
|
||||||
use crate::models::FileIngestionInfo;
|
use crate::models::FileIngestionInfo;
|
||||||
|
use crate::mime_detection::{detect_mime_for_discovery, DetectionStrategy};
|
||||||
|
|
||||||
#[derive(Debug, Default)]
|
#[derive(Debug, Default)]
|
||||||
struct PropFindResponse {
|
struct PropFindResponse {
|
||||||
|
|
@ -200,6 +201,14 @@ pub fn parse_propfind_response(xml_text: &str) -> Result<Vec<FileIngestionInfo>>
|
||||||
// Use the metadata collected during parsing
|
// Use the metadata collected during parsing
|
||||||
let metadata = resp.metadata;
|
let metadata = resp.metadata;
|
||||||
|
|
||||||
|
// Determine MIME type using improved detection
|
||||||
|
let mime_detection_result = detect_mime_for_discovery(
|
||||||
|
&name,
|
||||||
|
resp.content_type.as_deref(),
|
||||||
|
DetectionStrategy::Comprehensive
|
||||||
|
);
|
||||||
|
let mime_type = mime_detection_result.mime_type;
|
||||||
|
|
||||||
let file_info = FileIngestionInfo {
|
let file_info = FileIngestionInfo {
|
||||||
relative_path: "TEMP".to_string(), // Will be set by discovery layer
|
relative_path: "TEMP".to_string(), // Will be set by discovery layer
|
||||||
full_path: resp.href.clone(),
|
full_path: resp.href.clone(),
|
||||||
|
|
@ -207,7 +216,7 @@ pub fn parse_propfind_response(xml_text: &str) -> Result<Vec<FileIngestionInfo>>
|
||||||
path: resp.href.clone(), // Legacy field - keep for compatibility
|
path: resp.href.clone(), // Legacy field - keep for compatibility
|
||||||
name,
|
name,
|
||||||
size: resp.content_length.unwrap_or(0),
|
size: resp.content_length.unwrap_or(0),
|
||||||
mime_type: resp.content_type.unwrap_or_else(|| "application/octet-stream".to_string()),
|
mime_type,
|
||||||
last_modified: parse_http_date(&resp.last_modified.unwrap_or_default()),
|
last_modified: parse_http_date(&resp.last_modified.unwrap_or_default()),
|
||||||
etag: resp.etag.unwrap_or_else(|| format!("\"{}\"", uuid::Uuid::new_v4())),
|
etag: resp.etag.unwrap_or_else(|| format!("\"{}\"", uuid::Uuid::new_v4())),
|
||||||
is_directory: false,
|
is_directory: false,
|
||||||
|
|
@ -418,6 +427,18 @@ pub fn parse_propfind_response_with_directories(xml_text: &str) -> Result<Vec<Fi
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Determine MIME type for files (directories get empty string)
|
||||||
|
let mime_type = if resp.is_collection {
|
||||||
|
"".to_string()
|
||||||
|
} else {
|
||||||
|
let mime_detection_result = detect_mime_for_discovery(
|
||||||
|
&name,
|
||||||
|
resp.content_type.as_deref(),
|
||||||
|
DetectionStrategy::Comprehensive
|
||||||
|
);
|
||||||
|
mime_detection_result.mime_type
|
||||||
|
};
|
||||||
|
|
||||||
let file_info = FileIngestionInfo {
|
let file_info = FileIngestionInfo {
|
||||||
relative_path: "TEMP".to_string(), // Will be set by discovery layer
|
relative_path: "TEMP".to_string(), // Will be set by discovery layer
|
||||||
full_path: resp.href.clone(),
|
full_path: resp.href.clone(),
|
||||||
|
|
@ -425,11 +446,7 @@ pub fn parse_propfind_response_with_directories(xml_text: &str) -> Result<Vec<Fi
|
||||||
path: resp.href.clone(), // Legacy field - keep for compatibility
|
path: resp.href.clone(), // Legacy field - keep for compatibility
|
||||||
name,
|
name,
|
||||||
size: resp.content_length.unwrap_or(0),
|
size: resp.content_length.unwrap_or(0),
|
||||||
mime_type: if resp.is_collection {
|
mime_type,
|
||||||
"".to_string()
|
|
||||||
} else {
|
|
||||||
resp.content_type.unwrap_or_else(|| "application/octet-stream".to_string())
|
|
||||||
},
|
|
||||||
last_modified: parse_http_date(&resp.last_modified.unwrap_or_default()),
|
last_modified: parse_http_date(&resp.last_modified.unwrap_or_default()),
|
||||||
etag: resp.etag.unwrap_or_else(|| format!("\"{}\"", uuid::Uuid::new_v4())),
|
etag: resp.etag.unwrap_or_else(|| format!("\"{}\"", uuid::Uuid::new_v4())),
|
||||||
is_directory: resp.is_collection,
|
is_directory: resp.is_collection,
|
||||||
|
|
@ -943,4 +960,5 @@ mod tests {
|
||||||
assert!(weak_compare_etags("\"1\"", "\"1\""));
|
assert!(weak_compare_etags("\"1\"", "\"1\""));
|
||||||
assert!(strong_compare_etags("\"1\"", "\"1\""));
|
assert!(strong_compare_etags("\"1\"", "\"1\""));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue