feat(server): webdav integration nearly done

This commit is contained in:
perf3ct 2025-06-14 16:21:28 +00:00
parent 5b67232266
commit aa45cd06e0
7 changed files with 371 additions and 131 deletions

12
Cargo.lock generated
View File

@ -2288,6 +2288,16 @@ dependencies = [
"bytemuck",
]
[[package]]
name = "quick-xml"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33"
dependencies = [
"memchr",
"serde",
]
[[package]]
name = "quote"
version = "1.0.40"
@ -2434,6 +2444,7 @@ dependencies = [
"mime_guess",
"notify",
"pdf-extract",
"quick-xml",
"regex",
"reqwest",
"serde",
@ -2449,6 +2460,7 @@ dependencies = [
"tower-http",
"tracing",
"tracing-subscriber",
"urlencoding",
"utoipa",
"utoipa-swagger-ui",
"uuid",

View File

@ -37,6 +37,8 @@ pdf-extract = { version = "0.7", optional = true }
image = { version = "0.24", features = ["png", "jpeg", "tiff", "bmp"], optional = true }
imageproc = { version = "0.23", optional = true }
reqwest = { version = "0.11", features = ["json", "multipart"] }
quick-xml = { version = "0.31", features = ["serialize"] }
urlencoding = "2.1"
dotenvy = "0.15"
hostname = "0.4"
walkdir = "2"

View File

@ -12,6 +12,7 @@ pub mod seed;
pub mod watcher;
pub mod webdav_service;
pub mod webdav_scheduler;
pub mod webdav_xml_parser;
#[cfg(test)]
mod tests;

View File

@ -24,6 +24,7 @@ mod swagger;
mod watcher;
mod webdav_service;
mod webdav_scheduler;
mod webdav_xml_parser;
#[cfg(test)]
mod tests;

View File

@ -610,3 +610,14 @@ pub struct CreateWebDAVFile {
pub sync_status: String,
pub sync_error: Option<String>,
}
/// Metadata describing one entry listed by a WebDAV server.
#[derive(Debug, Clone)]
pub struct FileInfo {
/// Location of the entry; the PROPFIND parser stores the response `href` here.
pub path: String,
/// Display name of the entry.
pub name: String,
/// Content length as reported by the server; the PROPFIND parser uses 0 when absent.
pub size: i64,
/// Content type reported by the server.
pub mime_type: String,
/// Last-modified timestamp in UTC, or `None` when missing or unparseable.
pub last_modified: Option<DateTime<Utc>>,
/// Entity tag reported for the entry.
pub etag: String,
/// Whether the entry is a directory (collection) rather than a file.
pub is_directory: bool,
}

View File

@ -1,16 +1,16 @@
use anyhow::{anyhow, Result};
use chrono::{DateTime, Utc};
use reqwest::{Client, Method};
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::time::Duration;
use tokio::time::sleep;
use tracing::{debug, error, info, warn};
use crate::models::{
WebDAVConnectionResult, WebDAVCrawlEstimate, WebDAVFolderInfo,
WebDAVSyncStatus, WebDAVTestConnection,
FileInfo, WebDAVConnectionResult, WebDAVCrawlEstimate, WebDAVFolderInfo,
WebDAVTestConnection,
};
use crate::webdav_xml_parser::parse_propfind_response;
#[derive(Debug, Clone)]
pub struct WebDAVConfig {
@ -44,66 +44,7 @@ impl Default for RetryConfig {
}
}
/// Serde wrapper whose single field is (de)serialized under the name `d:multistatus`.
#[derive(Debug, Serialize, Deserialize)]
struct WebDAVResponse {
// The multistatus payload carrying the individual responses.
#[serde(rename = "d:multistatus")]
multistatus: MultiStatus,
}
/// Serde mapping for a multistatus body: a list of `d:response` entries.
#[derive(Debug, Serialize, Deserialize)]
struct MultiStatus {
// One element per `d:response`.
#[serde(rename = "d:response")]
responses: Vec<DAVResponse>,
}
/// Serde mapping for one `d:response`: its `d:href` plus a single `d:propstat`.
#[derive(Debug, Serialize, Deserialize)]
struct DAVResponse {
// Text of the `d:href` element.
#[serde(rename = "d:href")]
href: String,
// Exactly one propstat is modelled here (not a list).
#[serde(rename = "d:propstat")]
propstat: PropStat,
}
/// Serde mapping for a `d:propstat`: the property set and its status text.
#[derive(Debug, Serialize, Deserialize)]
struct PropStat {
// Properties listed under `d:prop`.
#[serde(rename = "d:prop")]
prop: DAVProperties,
// Raw text of the `d:status` element.
#[serde(rename = "d:status")]
status: String,
}
/// Serde mapping for the properties inside `d:prop`.
///
/// Every property is optional and, apart from `resourcetype`, kept as the raw string.
#[derive(Debug, Serialize, Deserialize)]
struct DAVProperties {
// `d:displayname`
#[serde(rename = "d:displayname")]
displayname: Option<String>,
// `d:getcontentlength`, kept as text (not parsed to a number here).
#[serde(rename = "d:getcontentlength")]
contentlength: Option<String>,
// `d:getlastmodified`, kept as text (not parsed to a timestamp here).
#[serde(rename = "d:getlastmodified")]
lastmodified: Option<String>,
// `d:getcontenttype`
#[serde(rename = "d:getcontenttype")]
contenttype: Option<String>,
// `d:getetag`
#[serde(rename = "d:getetag")]
etag: Option<String>,
// `d:resourcetype`
#[serde(rename = "d:resourcetype")]
resourcetype: Option<ResourceType>,
}
/// Serde mapping for `d:resourcetype`.
#[derive(Debug, Serialize, Deserialize)]
struct ResourceType {
// Optional `d:collection` child, held as an untyped JSON value.
#[serde(rename = "d:collection")]
collection: Option<serde_json::Value>,
}
/// Metadata describing one entry listed by a WebDAV server.
#[derive(Debug, Clone)]
pub struct FileInfo {
/// Location of the entry as reported by the server.
pub path: String,
/// Display name of the entry.
pub name: String,
/// Content length reported by the server.
pub size: i64,
/// Content type reported by the server.
pub mime_type: String,
/// Last-modified timestamp in UTC, if one is available.
pub last_modified: Option<DateTime<Utc>>,
/// Entity tag reported for the entry.
pub etag: String,
/// Whether the entry is a directory (collection) rather than a file.
pub is_directory: bool,
}
pub struct WebDAVService {
client: Client,
@ -438,75 +379,7 @@ impl WebDAVService {
}
/// Parses the XML body of a WebDAV `PROPFIND` response into the files it lists.
///
/// This is a thin wrapper around [`parse_propfind_response`], which does the actual
/// XML parsing. It replaces the earlier line-by-line string scanner, which only
/// recognised literal `d:`-prefixed tags that opened and closed on the same line.
///
/// # Errors
///
/// Propagates any error returned by [`parse_propfind_response`].
pub fn parse_webdav_response(&self, xml_text: &str) -> Result<Vec<FileInfo>> {
    parse_propfind_response(xml_text)
}
pub async fn download_file(&self, file_path: &str) -> Result<Vec<u8>> {

340
src/webdav_xml_parser.rs Normal file
View File

@ -0,0 +1,340 @@
use anyhow::{anyhow, Result};
use chrono::{DateTime, Utc};
use quick_xml::events::{BytesStart, Event};
use quick_xml::reader::Reader;
use std::str;
use crate::models::FileInfo;
/// Accumulates the values seen while parsing one `response` element of a PROPFIND reply.
///
/// `Default` yields an empty accumulator: empty strings, `None`, and `is_collection == false`.
#[derive(Debug, Default)]
struct PropFindResponse {
// Trimmed text of the `href` element.
href: String,
// Trimmed text of the `displayname` element; empty when none was seen.
displayname: String,
// `getcontentlength` parsed as an integer; `None` when absent or not a number.
content_length: Option<i64>,
// Raw `getlastmodified` text; parsed into a timestamp only when the entry is finalized.
last_modified: Option<String>,
// Raw `getcontenttype` text.
content_type: Option<String>,
// Raw `getetag` text.
etag: Option<String>,
// Set when a `collection` element is seen inside `resourcetype`.
is_collection: bool,
}
/// Parses the XML body of a WebDAV `PROPFIND` multistatus reply into file entries.
///
/// Elements are matched by their local name (the namespace prefix is ignored).
/// A `response` element produces a [`FileInfo`] only when it
///
/// - was not marked as a collection (`<collection>` inside `<resourcetype>`),
/// - has a non-empty `href`, and
/// - contains a `propstat` whose `status` text contains `200`.
///
/// Directories and responses without a successful `propstat` are skipped.
///
/// # Errors
///
/// Returns an error when the XML reader reports malformed input, when a text node
/// cannot be unescaped, or when an element name is not valid UTF-8.
pub fn parse_propfind_response(xml_text: &str) -> Result<Vec<FileInfo>> {
    let mut reader = Reader::from_str(xml_text);
    reader.trim_text(true);

    let mut files = Vec::new();
    // `Some` exactly while the reader is inside a `response` element.
    let mut current_response: Option<PropFindResponse> = None;
    // Local name of the innermost open element that may carry text.
    let mut current_element = String::new();
    let mut in_propstat = false;
    let mut in_resourcetype = false;
    let mut status_ok = false;
    let mut buf = Vec::new();

    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(e)) => {
                let name = get_local_name(&e)?;
                match name.as_str() {
                    "response" => {
                        current_response = Some(PropFindResponse::default());
                    }
                    "propstat" => in_propstat = true,
                    "resourcetype" => in_resourcetype = true,
                    "collection" if in_resourcetype => {
                        if let Some(resp) = current_response.as_mut() {
                            resp.is_collection = true;
                        }
                    }
                    // Structural container only; it never holds text directly.
                    "prop" => {}
                    _ => current_element = name,
                }
            }
            Ok(Event::Empty(e)) => {
                // A self-closing element is never followed by an `End` event, so it
                // must not set any `in_*` flag (nothing would reset it) and cannot be
                // the target of a later text node. The only self-closing element that
                // carries information is `<collection/>` inside `<resourcetype>`.
                let name = get_local_name(&e)?;
                if in_resourcetype && name == "collection" {
                    if let Some(resp) = current_response.as_mut() {
                        resp.is_collection = true;
                    }
                }
            }
            Ok(Event::Text(e)) => {
                let text = e.unescape()?;
                let text = text.trim();
                if !text.is_empty() {
                    if let Some(resp) = current_response.as_mut() {
                        match current_element.as_str() {
                            "href" => resp.href = text.to_string(),
                            "displayname" => resp.displayname = text.to_string(),
                            // A non-numeric length is treated as "unknown".
                            "getcontentlength" => resp.content_length = text.parse().ok(),
                            "getlastmodified" => resp.last_modified = Some(text.to_string()),
                            "getcontenttype" => resp.content_type = Some(text.to_string()),
                            "getetag" => resp.etag = Some(text.to_string()),
                            // One successful propstat is enough to accept the response.
                            "status" if in_propstat && text.contains("200") => status_ok = true,
                            _ => {}
                        }
                    }
                }
            }
            Ok(Event::End(e)) => {
                let name = get_local_name_from_end(&e)?;
                match name.as_str() {
                    "response" => {
                        if let Some(resp) = current_response.take() {
                            // Only files (not directories) with valid properties are kept.
                            if !resp.is_collection && status_ok && !resp.href.is_empty() {
                                files.push(file_info_from_response(resp));
                            }
                        }
                        status_ok = false;
                    }
                    "propstat" => in_propstat = false,
                    "resourcetype" => in_resourcetype = false,
                    _ => {}
                }
                current_element.clear();
            }
            Ok(Event::Eof) => break,
            Err(e) => return Err(anyhow!("XML parsing error: {}", e)),
            _ => {}
        }
        buf.clear();
    }

    Ok(files)
}

/// Converts one fully parsed, accepted `response` into a [`FileInfo`].
fn file_info_from_response(resp: PropFindResponse) -> FileInfo {
    let name = if resp.displayname.is_empty() {
        // No display name: fall back to the last path segment of the href. The href
        // is a URL, so this segment is percent-decoded; if decoding fails the raw
        // segment is used unchanged.
        let segment = resp.href.rsplit('/').next().unwrap_or("");
        urlencoding::decode(segment)
            .map(|decoded| decoded.into_owned())
            .unwrap_or_else(|_| segment.to_string())
    } else {
        // A display name is plain text, not a URL: it is used verbatim so that a
        // literal `%` sequence in a file name is not mangled.
        resp.displayname
    };

    FileInfo {
        path: resp.href,
        name,
        size: resp.content_length.unwrap_or(0),
        mime_type: resp
            .content_type
            .unwrap_or_else(|| "application/octet-stream".to_string()),
        last_modified: parse_http_date(&resp.last_modified.unwrap_or_default()),
        // Without a server-provided ETag a random quoted UUID is generated, so such
        // an entry gets a different ETag on every parse.
        etag: resp
            .etag
            .unwrap_or_else(|| format!("\"{}\"", uuid::Uuid::new_v4())),
        is_directory: false,
    }
}
/// Returns the local part (namespace prefix removed) of a start/empty tag's name.
///
/// # Errors
///
/// Fails when the local name is not valid UTF-8.
fn get_local_name(e: &BytesStart) -> Result<String> {
    let local = e.name().local_name();
    match str::from_utf8(local.as_ref()) {
        Ok(text) => Ok(text.to_owned()),
        Err(err) => Err(anyhow!("Invalid UTF-8 in element name: {}", err)),
    }
}
/// Returns the local part (namespace prefix removed) of an end tag's name.
///
/// # Errors
///
/// Fails when the local name is not valid UTF-8.
fn get_local_name_from_end(e: &quick_xml::events::BytesEnd) -> Result<String> {
    let local = e.name().local_name();
    match str::from_utf8(local.as_ref()) {
        Ok(text) => Ok(text.to_owned()),
        Err(err) => Err(anyhow!("Invalid UTF-8 in element name: {}", err)),
    }
}
/// Converts a date string into a UTC timestamp, trying several formats in order.
///
/// The formats attempted are RFC 2822, then RFC 3339, then the literal pattern
/// `%a, %d %b %Y %H:%M:%S GMT` interpreted as UTC. Returns `None` for an empty
/// string or when no format matches.
fn parse_http_date(date_str: &str) -> Option<DateTime<Utc>> {
    if date_str.is_empty() {
        return None;
    }

    // First choice: RFC 2822.
    if let Ok(parsed) = DateTime::parse_from_rfc2822(date_str) {
        return Some(parsed.with_timezone(&Utc));
    }

    // Second choice: RFC 3339.
    if let Ok(parsed) = DateTime::parse_from_rfc3339(date_str) {
        return Some(parsed.with_timezone(&Utc));
    }

    // Last resort: a fixed pattern ending in a literal "GMT", taken as UTC.
    match chrono::NaiveDateTime::parse_from_str(date_str, "%a, %d %b %Y %H:%M:%S GMT") {
        Ok(naive) => Some(DateTime::from_naive_utc_and_offset(naive, Utc)),
        Err(_) => None,
    }
}
// Unit tests for `parse_propfind_response`, driven by inline multistatus fixtures.
#[cfg(test)]
mod tests {
use super::*;
// A single file response with every property present yields one entry whose
// name, size, MIME type and ETag match the fixture.
#[test]
fn test_parse_simple_propfind() {
let xml = r#"<?xml version="1.0"?>
<d:multistatus xmlns:d="DAV:">
<d:response>
<d:href>/webdav/test.pdf</d:href>
<d:propstat>
<d:prop>
<d:displayname>test.pdf</d:displayname>
<d:getcontentlength>1024</d:getcontentlength>
<d:getlastmodified>Mon, 01 Jan 2024 12:00:00 GMT</d:getlastmodified>
<d:getcontenttype>application/pdf</d:getcontenttype>
<d:getetag>"abc123"</d:getetag>
<d:resourcetype/>
</d:prop>
<d:status>HTTP/1.1 200 OK</d:status>
</d:propstat>
</d:response>
</d:multistatus>"#;
let files = parse_propfind_response(xml).unwrap();
assert_eq!(files.len(), 1);
let file = &files[0];
assert_eq!(file.name, "test.pdf");
assert_eq!(file.size, 1024);
assert_eq!(file.mime_type, "application/pdf");
assert_eq!(file.etag, "\"abc123\"");
assert!(!file.is_directory);
}
// A collection response followed by a file response: only the file is returned.
#[test]
fn test_parse_propfind_with_directory() {
let xml = r#"<?xml version="1.0"?>
<d:multistatus xmlns:d="DAV:">
<d:response>
<d:href>/webdav/Documents/</d:href>
<d:propstat>
<d:prop>
<d:displayname>Documents</d:displayname>
<d:resourcetype>
<d:collection/>
</d:resourcetype>
</d:prop>
<d:status>HTTP/1.1 200 OK</d:status>
</d:propstat>
</d:response>
<d:response>
<d:href>/webdav/Documents/file.txt</d:href>
<d:propstat>
<d:prop>
<d:displayname>file.txt</d:displayname>
<d:getcontentlength>256</d:getcontentlength>
<d:getcontenttype>text/plain</d:getcontenttype>
<d:resourcetype/>
</d:prop>
<d:status>HTTP/1.1 200 OK</d:status>
</d:propstat>
</d:response>
</d:multistatus>"#;
let files = parse_propfind_response(xml).unwrap();
assert_eq!(files.len(), 1); // Only the file, not the directory
let file = &files[0];
assert_eq!(file.name, "file.txt");
assert_eq!(file.size, 256);
}
// A fixture declaring additional namespaces on the root element: the href is
// kept as the path and the last-modified date is parsed.
#[test]
fn test_parse_nextcloud_response() {
let xml = r#"<?xml version="1.0"?>
<d:multistatus xmlns:d="DAV:" xmlns:s="http://sabredav.org/ns" xmlns:oc="http://owncloud.org/ns">
<d:response>
<d:href>/remote.php/dav/files/admin/Documents/report.pdf</d:href>
<d:propstat>
<d:prop>
<d:displayname>report.pdf</d:displayname>
<d:getcontentlength>2048000</d:getcontentlength>
<d:getlastmodified>Mon, 15 Jan 2024 14:30:00 GMT</d:getlastmodified>
<d:getcontenttype>application/pdf</d:getcontenttype>
<d:getetag>"pdf123"</d:getetag>
<d:resourcetype/>
</d:prop>
<d:status>HTTP/1.1 200 OK</d:status>
</d:propstat>
</d:response>
</d:multistatus>"#;
let files = parse_propfind_response(xml).unwrap();
assert_eq!(files.len(), 1);
let file = &files[0];
assert_eq!(file.name, "report.pdf");
assert_eq!(file.path, "/remote.php/dav/files/admin/Documents/report.pdf");
assert_eq!(file.size, 2048000);
assert!(file.last_modified.is_some());
}
// The href is percent-encoded while the displayname contains plain spaces; the
// resulting name equals the displayname given in the fixture.
#[test]
fn test_parse_url_encoded_filenames() {
let xml = r#"<?xml version="1.0"?>
<d:multistatus xmlns:d="DAV:">
<d:response>
<d:href>/webdav/File%20with%20spaces.pdf</d:href>
<d:propstat>
<d:prop>
<d:displayname>File with spaces.pdf</d:displayname>
<d:getcontentlength>1024</d:getcontentlength>
<d:getcontenttype>application/pdf</d:getcontenttype>
<d:resourcetype/>
</d:prop>
<d:status>HTTP/1.1 200 OK</d:status>
</d:propstat>
</d:response>
</d:multistatus>"#;
let files = parse_propfind_response(xml).unwrap();
assert_eq!(files.len(), 1);
let file = &files[0];
assert_eq!(file.name, "File with spaces.pdf");
}
// A multistatus element with no responses parses successfully to an empty list.
#[test]
fn test_empty_response() {
let xml = r#"<?xml version="1.0"?>
<d:multistatus xmlns:d="DAV:">
</d:multistatus>"#;
let files = parse_propfind_response(xml).unwrap();
assert_eq!(files.len(), 0);
}
}