From aa45cd06e09fc0103ba596401b1e663d2086b6d1 Mon Sep 17 00:00:00 2001 From: perf3ct Date: Sat, 14 Jun 2025 16:21:28 +0000 Subject: [PATCH] feat(server): webdav integration nearly done --- Cargo.lock | 12 ++ Cargo.toml | 2 + src/lib.rs | 1 + src/main.rs | 1 + src/models.rs | 11 ++ src/webdav_service.rs | 135 +--------------- src/webdav_xml_parser.rs | 340 +++++++++++++++++++++++++++++++++++++++ 7 files changed, 371 insertions(+), 131 deletions(-) create mode 100644 src/webdav_xml_parser.rs diff --git a/Cargo.lock b/Cargo.lock index 553a468..00e33d7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2288,6 +2288,16 @@ dependencies = [ "bytemuck", ] +[[package]] +name = "quick-xml" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "quote" version = "1.0.40" @@ -2434,6 +2444,7 @@ dependencies = [ "mime_guess", "notify", "pdf-extract", + "quick-xml", "regex", "reqwest", "serde", @@ -2449,6 +2460,7 @@ dependencies = [ "tower-http", "tracing", "tracing-subscriber", + "urlencoding", "utoipa", "utoipa-swagger-ui", "uuid", diff --git a/Cargo.toml b/Cargo.toml index 3568114..0d40b9c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,6 +37,8 @@ pdf-extract = { version = "0.7", optional = true } image = { version = "0.24", features = ["png", "jpeg", "tiff", "bmp"], optional = true } imageproc = { version = "0.23", optional = true } reqwest = { version = "0.11", features = ["json", "multipart"] } +quick-xml = { version = "0.31", features = ["serialize"] } +urlencoding = "2.1" dotenvy = "0.15" hostname = "0.4" walkdir = "2" diff --git a/src/lib.rs b/src/lib.rs index 1794e87..94b04f9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,6 +12,7 @@ pub mod seed; pub mod watcher; pub mod webdav_service; pub mod webdav_scheduler; +pub mod webdav_xml_parser; #[cfg(test)] mod tests; diff --git a/src/main.rs b/src/main.rs index c86118b..742ac38 100644 --- a/src/main.rs +++ b/src/main.rs @@ -24,6 +24,7 @@ mod swagger; mod watcher; mod webdav_service; mod webdav_scheduler; +mod webdav_xml_parser; #[cfg(test)] mod tests; diff --git a/src/models.rs b/src/models.rs index 13c6e85..3cd2500 100644 --- a/src/models.rs +++ b/src/models.rs @@ -609,4 +609,15 @@ pub struct CreateWebDAVFile { pub document_id: Option, pub sync_status: String, pub sync_error: Option, +} + +#[derive(Debug, Clone)] +pub struct FileInfo { + pub path: String, + pub name: String, + pub size: i64, + pub mime_type: String, + pub last_modified: Option>, + pub etag: String, + pub is_directory: bool, } \ No newline at end of file diff --git a/src/webdav_service.rs b/src/webdav_service.rs index dc3e06c..dd0e9a6 100644 --- a/src/webdav_service.rs +++ b/src/webdav_service.rs @@ -1,16 +1,16 @@ use anyhow::{anyhow, Result}; use chrono::{DateTime, Utc}; use reqwest::{Client, Method}; -use serde::{Deserialize, Serialize}; use std::collections::HashSet; use std::time::Duration; use tokio::time::sleep; use tracing::{debug, error, info, warn}; use crate::models::{ - WebDAVConnectionResult, WebDAVCrawlEstimate, WebDAVFolderInfo, - WebDAVSyncStatus, WebDAVTestConnection, + FileInfo, WebDAVConnectionResult, WebDAVCrawlEstimate, WebDAVFolderInfo, + WebDAVTestConnection, }; +use crate::webdav_xml_parser::parse_propfind_response; #[derive(Debug, Clone)] pub struct WebDAVConfig { @@ -44,66 +44,7 @@ impl Default for RetryConfig { } } -#[derive(Debug, Serialize, Deserialize)] -struct WebDAVResponse { - #[serde(rename = "d:multistatus")] - multistatus: MultiStatus, -} -#[derive(Debug, Serialize, Deserialize)] -struct MultiStatus { - #[serde(rename = "d:response")] - responses: Vec, -} - -#[derive(Debug, Serialize, Deserialize)] -struct DAVResponse { - #[serde(rename = "d:href")] - href: String, - #[serde(rename = "d:propstat")] - propstat: PropStat, -} - -#[derive(Debug, Serialize, Deserialize)] -struct PropStat { - #[serde(rename = "d:prop")] - prop: DAVProperties, - #[serde(rename = "d:status")] - status: String, -} - -#[derive(Debug, Serialize, Deserialize)] -struct DAVProperties { - #[serde(rename = "d:displayname")] - displayname: Option, - #[serde(rename = "d:getcontentlength")] - contentlength: Option, - #[serde(rename = "d:getlastmodified")] - lastmodified: Option, - #[serde(rename = "d:getcontenttype")] - contenttype: Option, - #[serde(rename = "d:getetag")] - etag: Option, - #[serde(rename = "d:resourcetype")] - resourcetype: Option, -} - -#[derive(Debug, Serialize, Deserialize)] -struct ResourceType { - #[serde(rename = "d:collection")] - collection: Option, -} - -#[derive(Debug, Clone)] -pub struct FileInfo { - pub path: String, - pub name: String, - pub size: i64, - pub mime_type: String, - pub last_modified: Option>, - pub etag: String, - pub is_directory: bool, -} pub struct WebDAVService { client: Client, @@ -438,75 +379,7 @@ impl WebDAVService { } pub fn parse_webdav_response(&self, xml_text: &str) -> Result> { - // For now, we'll do simple string parsing - // In a production system, you'd want to use a proper XML parser like quick-xml - let mut files = Vec::new(); - - // This is a simplified parser - in practice you'd want proper XML parsing - let lines: Vec<&str> = xml_text.lines().collect(); - let mut current_file: Option = None; - let mut in_response = false; - - for line in lines { - let line = line.trim(); - - if line.contains("") { - in_response = true; - current_file = Some(FileInfo { - path: String::new(), - name: String::new(), - size: 0, - mime_type: String::new(), - last_modified: None, - etag: String::new(), - is_directory: false, - }); - } else if line.contains("") && in_response { - if let Some(file) = current_file.take() { - if !file.path.is_empty() && !file.path.ends_with('/') { - files.push(file); - } - } - in_response = false; - } else if in_response { - if let Some(ref mut file) = current_file { - if line.contains("") { - if let Some(start) = line.find("") { - if let Some(end) = line.find("") { - let href = &line[start + 8..end]; - file.path = href.to_string(); - file.name = href.split('/').last().unwrap_or("").to_string(); - } - } - } else if line.contains("") { - if let Some(start) = line.find("") { - if let Some(end) = line.find("") { - if let Ok(size) = line[start + 20..end].parse::() { - file.size = size; - } - } - } - } else if line.contains("") { - if let Some(start) = line.find("") { - if let Some(end) = line.find("") { - file.mime_type = line[start + 18..end].to_string(); - } - } - } else if line.contains("") { - if let Some(start) = line.find("") { - if let Some(end) = line.find("") { - file.etag = line[start + 11..end].to_string(); - } - } - } else if line.contains(" Result> { diff --git a/src/webdav_xml_parser.rs b/src/webdav_xml_parser.rs new file mode 100644 index 0000000..a90f3d0 --- /dev/null +++ b/src/webdav_xml_parser.rs @@ -0,0 +1,340 @@ +use anyhow::{anyhow, Result}; +use chrono::{DateTime, Utc}; +use quick_xml::events::{BytesStart, Event}; +use quick_xml::reader::Reader; +use std::str; + +use crate::models::FileInfo; + +#[derive(Debug, Default)] +struct PropFindResponse { + href: String, + displayname: String, + content_length: Option, + last_modified: Option, + content_type: Option, + etag: Option, + is_collection: bool, +} + +pub fn parse_propfind_response(xml_text: &str) -> Result> { + let mut reader = Reader::from_str(xml_text); + reader.trim_text(true); + + let mut files = Vec::new(); + let mut current_response: Option = None; + let mut current_element = String::new(); + let mut in_response = false; + let mut in_propstat = false; + let mut in_prop = false; + let mut in_resourcetype = false; + let mut status_ok = false; + + let mut buf = Vec::new(); + + loop { + match reader.read_event_into(&mut buf) { + Ok(Event::Start(e)) | Ok(Event::Empty(e)) => { + let name = get_local_name(&e)?; + + match name.as_str() { + "response" => { + in_response = true; + current_response = Some(PropFindResponse::default()); + } + "propstat" => { + in_propstat = true; + } + "prop" => { + in_prop = true; + } + "resourcetype" => { + in_resourcetype = true; + } + "collection" if in_resourcetype => { + if let Some(ref mut resp) = current_response { + resp.is_collection = true; + } + } + _ => { + current_element = name; + } + } + } + Ok(Event::Text(e)) => { + let text = e.unescape()?.to_string(); + + if in_response && !text.trim().is_empty() { + if let Some(ref mut resp) = current_response { + match current_element.as_str() { + "href" => { + resp.href = text.trim().to_string(); + } + "displayname" => { + resp.displayname = text.trim().to_string(); + } + "getcontentlength" => { + resp.content_length = text.trim().parse().ok(); + } + "getlastmodified" => { + resp.last_modified = Some(text.trim().to_string()); + } + "getcontenttype" => { + resp.content_type = Some(text.trim().to_string()); + } + "getetag" => { + resp.etag = Some(text.trim().to_string()); + } + "status" if in_propstat => { + // Check if status is 200 OK + if text.contains("200") { + status_ok = true; + } + } + _ => {} + } + } + } + } + Ok(Event::End(e)) => { + let name = get_local_name_from_end(&e)?; + + match name.as_str() { + "response" => { + if let Some(resp) = current_response.take() { + // Only add files (not directories) with valid properties + if !resp.is_collection && status_ok && !resp.href.is_empty() { + // Extract filename from href + let name = if resp.displayname.is_empty() { + resp.href + .split('/') + .last() + .unwrap_or("") + .to_string() + } else { + resp.displayname.clone() + }; + + // Decode URL-encoded characters + let name = urlencoding::decode(&name) + .unwrap_or_else(|_| std::borrow::Cow::Borrowed(&name)) + .to_string(); + + let file_info = FileInfo { + path: resp.href.clone(), + name, + size: resp.content_length.unwrap_or(0), + mime_type: resp.content_type.unwrap_or_else(|| "application/octet-stream".to_string()), + last_modified: parse_http_date(&resp.last_modified.unwrap_or_default()), + etag: resp.etag.unwrap_or_else(|| format!("\"{}\"", uuid::Uuid::new_v4())), + is_directory: false, + }; + + files.push(file_info); + } + } + in_response = false; + status_ok = false; + } + "propstat" => { + in_propstat = false; + } + "prop" => { + in_prop = false; + } + "resourcetype" => { + in_resourcetype = false; + } + _ => {} + } + + current_element.clear(); + } + Ok(Event::Eof) => break, + Err(e) => return Err(anyhow!("XML parsing error: {}", e)), + _ => {} + } + + buf.clear(); + } + + Ok(files) +} + +fn get_local_name(e: &BytesStart) -> Result { + let qname = e.name(); + let local = qname.local_name(); + let name = str::from_utf8(local.as_ref()) + .map_err(|e| anyhow!("Invalid UTF-8 in element name: {}", e))?; + Ok(name.to_string()) +} + +fn get_local_name_from_end(e: &quick_xml::events::BytesEnd) -> Result { + let qname = e.name(); + let local = qname.local_name(); + let name = str::from_utf8(local.as_ref()) + .map_err(|e| anyhow!("Invalid UTF-8 in element name: {}", e))?; + Ok(name.to_string()) +} + +fn parse_http_date(date_str: &str) -> Option> { + if date_str.is_empty() { + return None; + } + + // Try to parse RFC 2822 format (used by WebDAV) + DateTime::parse_from_rfc2822(date_str) + .ok() + .map(|dt| dt.with_timezone(&Utc)) + .or_else(|| { + // Try RFC 3339 as fallback + DateTime::parse_from_rfc3339(date_str) + .ok() + .map(|dt| dt.with_timezone(&Utc)) + }) + .or_else(|| { + // Try a custom format as last resort + chrono::NaiveDateTime::parse_from_str(date_str, "%a, %d %b %Y %H:%M:%S GMT") + .ok() + .map(|ndt| DateTime::from_naive_utc_and_offset(ndt, Utc)) + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_simple_propfind() { + let xml = r#" + + + /webdav/test.pdf + + + test.pdf + 1024 + Mon, 01 Jan 2024 12:00:00 GMT + application/pdf + "abc123" + + + HTTP/1.1 200 OK + + + "#; + + let files = parse_propfind_response(xml).unwrap(); + assert_eq!(files.len(), 1); + + let file = &files[0]; + assert_eq!(file.name, "test.pdf"); + assert_eq!(file.size, 1024); + assert_eq!(file.mime_type, "application/pdf"); + assert_eq!(file.etag, "\"abc123\""); + assert!(!file.is_directory); + } + + #[test] + fn test_parse_propfind_with_directory() { + let xml = r#" + + + /webdav/Documents/ + + + Documents + + + + + HTTP/1.1 200 OK + + + + /webdav/Documents/file.txt + + + file.txt + 256 + text/plain + + + HTTP/1.1 200 OK + + + "#; + + let files = parse_propfind_response(xml).unwrap(); + assert_eq!(files.len(), 1); // Only the file, not the directory + + let file = &files[0]; + assert_eq!(file.name, "file.txt"); + assert_eq!(file.size, 256); + } + + #[test] + fn test_parse_nextcloud_response() { + let xml = r#" + + + /remote.php/dav/files/admin/Documents/report.pdf + + + report.pdf + 2048000 + Mon, 15 Jan 2024 14:30:00 GMT + application/pdf + "pdf123" + + + HTTP/1.1 200 OK + + + "#; + + let files = parse_propfind_response(xml).unwrap(); + assert_eq!(files.len(), 1); + + let file = &files[0]; + assert_eq!(file.name, "report.pdf"); + assert_eq!(file.path, "/remote.php/dav/files/admin/Documents/report.pdf"); + assert_eq!(file.size, 2048000); + assert!(file.last_modified.is_some()); + } + + #[test] + fn test_parse_url_encoded_filenames() { + let xml = r#" + + + /webdav/File%20with%20spaces.pdf + + + File with spaces.pdf + 1024 + application/pdf + + + HTTP/1.1 200 OK + + + "#; + + let files = parse_propfind_response(xml).unwrap(); + assert_eq!(files.len(), 1); + + let file = &files[0]; + assert_eq!(file.name, "File with spaces.pdf"); + } + + #[test] + fn test_empty_response() { + let xml = r#" + + "#; + + let files = parse_propfind_response(xml).unwrap(); + assert_eq!(files.len(), 0); + } +} \ No newline at end of file