feat(webdav): add automatic source health validation

perf3ct 2025-07-03 05:26:36 +00:00
parent 99cbb9caee
commit c0835f436f
No known key found for this signature in database
GPG Key ID: 569C4EEC436F5232
7 changed files with 751 additions and 288 deletions

View File

@ -69,6 +69,9 @@ import {
Visibility as OcrIcon,
Block as BlockIcon,
FindInPage as DeepScanIcon,
HealthAndSafety as HealthIcon,
Warning as WarningIcon,
Error as CriticalIcon,
} from '@mui/icons-material';
import { useNavigate } from 'react-router-dom';
import api, { queueService } from '../services/api';
@ -92,6 +95,11 @@ interface Source {
total_documents_ocr: number;
created_at: string;
updated_at: string;
// Validation fields
validation_status?: string | null;
last_validation_at?: string | null;
validation_score?: number | null;
validation_issues?: string | null;
}
interface SnackbarState {
@ -152,7 +160,7 @@ const SourcesPage: React.FC = () => {
const [testingConnection, setTestingConnection] = useState(false);
const [syncingSource, setSyncingSource] = useState<string | null>(null);
const [stoppingSync, setStoppingSync] = useState<string | null>(null);
- const [deepScanning, setDeepScanning] = useState<string | null>(null);
+ const [validating, setValidating] = useState<string | null>(null);
const [autoRefreshing, setAutoRefreshing] = useState(false);
useEffect(() => {
@ -490,31 +498,83 @@ const SourcesPage: React.FC = () => {
}
};
- const handleDeepScan = async (sourceId: string) => {
-   setDeepScanning(sourceId);
+ const handleValidation = async (sourceId: string) => {
+   setValidating(sourceId);
try {
- const response = await api.post(`/sources/${sourceId}/deep-scan`);
+ const response = await api.post(`/sources/${sourceId}/validate`);
if (response.data.success) {
- showSnackbar(response.data.message || 'Deep scan started successfully', 'success');
- setTimeout(loadSources, 1000);
+ showSnackbar(response.data.message || 'Validation check started successfully', 'success');
+ setTimeout(loadSources, 2000); // Reload after 2 seconds to show updated status
} else {
- showSnackbar(response.data.message || 'Failed to start deep scan', 'error');
+ showSnackbar(response.data.message || 'Failed to start validation check', 'error');
}
} catch (error: any) {
- console.error('Failed to trigger deep scan:', error);
- if (error.response?.status === 409) {
-   showSnackbar('Source is already syncing', 'warning');
- } else if (error.response?.status === 404) {
-   showSnackbar('Source not found', 'error');
- } else {
-   const message = error.response?.data?.message || 'Failed to start deep scan';
-   showSnackbar(message, 'error');
- }
+ console.error('Failed to trigger validation:', error);
+ const message = error.response?.data?.message || 'Failed to start validation check';
+ showSnackbar(message, 'error');
} finally {
- setDeepScanning(null);
+ setValidating(null);
}
};
// Helper function to render validation status
const renderValidationStatus = (source: Source) => {
const validationStatus = source.validation_status;
const validationScore = source.validation_score;
const lastValidationAt = source.last_validation_at;
let statusColor = theme.palette.grey[500];
let StatusIcon = HealthIcon;
let statusText = 'Unknown';
let tooltipText = 'Validation status unknown';
if (validationStatus === 'healthy') {
statusColor = theme.palette.success.main;
StatusIcon = CheckCircleIcon;
statusText = 'Healthy';
tooltipText = `Health score: ${validationScore ?? 'N/A'}`; // ?? so a legitimate score of 0 isn't shown as N/A
} else if (validationStatus === 'warning') {
statusColor = theme.palette.warning.main;
StatusIcon = WarningIcon;
statusText = 'Warning';
tooltipText = `Health score: ${validationScore ?? 'N/A'} - Issues detected`;
} else if (validationStatus === 'critical') {
statusColor = theme.palette.error.main;
StatusIcon = CriticalIcon;
statusText = 'Critical';
tooltipText = `Health score: ${validationScore ?? 'N/A'} - Critical issues`;
} else if (validationStatus === 'validating') {
statusColor = theme.palette.info.main;
StatusIcon = HealthIcon;
statusText = 'Validating';
tooltipText = 'Validation check in progress';
}
if (lastValidationAt) {
const lastValidation = new Date(lastValidationAt);
tooltipText += `\nLast checked: ${formatDistanceToNow(lastValidation)} ago`;
}
return (
<Tooltip title={tooltipText}>
<Chip
icon={<StatusIcon />}
label={statusText}
size="small"
sx={{
bgcolor: alpha(statusColor, 0.1),
color: statusColor,
borderColor: statusColor,
border: '1px solid',
'& .MuiChip-icon': {
color: statusColor,
},
}}
/>
</Tooltip>
);
};
// Utility functions for folder management
const addFolder = () => {
if (newFolder && !formData.watch_folders.includes(newFolder)) {
@ -864,25 +924,28 @@ const SourcesPage: React.FC = () => {
</span>
</Tooltip>
)}
<Tooltip title="Deep Scan">
<span>
{/* Validation Status Display */}
<Box sx={{ display: 'flex', alignItems: 'center', gap: 1, minWidth: 120 }}>
{renderValidationStatus(source)}
<Tooltip title="Run Validation Check">
<IconButton
onClick={() => handleDeepScan(source.id)}
disabled={deepScanning === source.id || source.status === 'syncing' || !source.enabled}
onClick={() => handleValidation(source.id)}
disabled={validating === source.id || source.status === 'syncing' || !source.enabled}
size="small"
sx={{
bgcolor: alpha(theme.palette.secondary.main, 0.1),
'&:hover': { bgcolor: alpha(theme.palette.secondary.main, 0.2) },
color: theme.palette.secondary.main,
bgcolor: alpha(theme.palette.info.main, 0.1),
'&:hover': { bgcolor: alpha(theme.palette.info.main, 0.2) },
color: theme.palette.info.main,
}}
>
{deepScanning === source.id ? (
<CircularProgress size={20} />
{validating === source.id ? (
<CircularProgress size={16} />
) : (
<DeepScanIcon />
<HealthIcon />
)}
</IconButton>
</span>
</Tooltip>
</Tooltip>
</Box>
<Tooltip title="Edit Source">
<IconButton
onClick={() => handleEditSource(source)}

View File

@ -0,0 +1,16 @@
-- Add validation status fields to sources table
ALTER TABLE sources
ADD COLUMN validation_status TEXT DEFAULT NULL,
ADD COLUMN last_validation_at TIMESTAMP WITH TIME ZONE DEFAULT NULL,
ADD COLUMN validation_score INTEGER DEFAULT NULL CHECK (validation_score >= 0 AND validation_score <= 100),
ADD COLUMN validation_issues TEXT DEFAULT NULL;
-- Create indexes for querying validation status and recency
CREATE INDEX idx_sources_validation_status ON sources (validation_status);
CREATE INDEX idx_sources_last_validation_at ON sources (last_validation_at);
-- Add comments for documentation
COMMENT ON COLUMN sources.validation_status IS 'Current validation status: "healthy", "warning", "critical", "validating", or NULL';
COMMENT ON COLUMN sources.last_validation_at IS 'Timestamp of the last validation check';
COMMENT ON COLUMN sources.validation_score IS 'Health score from 0-100, where 100 is perfect health';
COMMENT ON COLUMN sources.validation_issues IS 'JSON array of validation issues and recommendations';
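
The new columns are meant to be queried directly by dashboards or maintenance jobs. A minimal sqlx sketch of pulling every source whose last check flagged problems (assuming a `PgPool` handle and the `name` column the rest of the code relies on; the application itself goes through its `Database` wrapper instead):

use sqlx::{PgPool, Row};

async fn list_unhealthy_sources(pool: &PgPool) -> Result<Vec<(String, Option<i32>)>, sqlx::Error> {
    // Sources whose last validation ended in warning/critical, worst scores first
    let rows = sqlx::query(
        r#"
        SELECT name, validation_score
        FROM sources
        WHERE validation_status IN ('warning', 'critical')
        ORDER BY validation_score ASC NULLS LAST
        "#,
    )
    .fetch_all(pool)
    .await?;

    Ok(rows
        .into_iter()
        .map(|row| (row.get("name"), row.get("validation_score")))
        .collect())
}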

View File

@ -43,6 +43,10 @@ impl Database {
total_size_bytes: row.get("total_size_bytes"),
created_at: row.get("created_at"),
updated_at: row.get("updated_at"),
validation_status: row.get("validation_status"),
last_validation_at: row.get("last_validation_at"),
validation_score: row.get("validation_score"),
validation_issues: row.get("validation_issues"),
})
}
@ -103,6 +107,10 @@ impl Database {
total_size_bytes: row.get("total_size_bytes"),
created_at: row.get("created_at"),
updated_at: row.get("updated_at"),
validation_status: row.get("validation_status"),
last_validation_at: row.get("last_validation_at"),
validation_score: row.get("validation_score"),
validation_issues: row.get("validation_issues"),
});
}
@ -164,6 +172,10 @@ impl Database {
total_size_bytes: row.get("total_size_bytes"),
created_at: row.get("created_at"),
updated_at: row.get("updated_at"),
validation_status: row.get("validation_status"),
last_validation_at: row.get("last_validation_at"),
validation_score: row.get("validation_score"),
validation_issues: row.get("validation_issues"),
})
}
@ -254,6 +266,10 @@ impl Database {
total_size_bytes: row.get("total_size_bytes"),
created_at: row.get("created_at"),
updated_at: row.get("updated_at"),
validation_status: row.get("validation_status"),
last_validation_at: row.get("last_validation_at"),
validation_score: row.get("validation_score"),
validation_issues: row.get("validation_issues"),
});
}

View File

@ -1048,6 +1048,15 @@ pub struct Source {
pub total_size_bytes: i64,
pub created_at: DateTime<Utc>,
pub updated_at: DateTime<Utc>,
// Validation status tracking
#[sqlx(default)]
pub validation_status: Option<String>,
#[sqlx(default)]
pub last_validation_at: Option<DateTime<Utc>>,
#[sqlx(default)]
pub validation_score: Option<i32>, // 0-100 health score
#[sqlx(default)]
pub validation_issues: Option<String>, // JSON array of validation issues
}
#[derive(Debug, Serialize, Deserialize, ToSchema)]
@ -1072,6 +1081,15 @@ pub struct SourceResponse {
/// Total number of documents that have been OCR'd from this source
#[serde(default)]
pub total_documents_ocr: i64,
/// Validation status and health score
#[serde(default)]
pub validation_status: Option<String>,
#[serde(default)]
pub last_validation_at: Option<DateTime<Utc>>,
#[serde(default)]
pub validation_score: Option<i32>,
#[serde(default)]
pub validation_issues: Option<String>,
}
#[derive(Debug, Serialize, Deserialize, ToSchema)]
@ -1116,6 +1134,11 @@ impl From<Source> for SourceResponse {
// These will be populated separately when needed
total_documents: 0,
total_documents_ocr: 0,
// Validation fields
validation_status: source.validation_status,
last_validation_at: source.last_validation_at,
validation_score: source.validation_score,
validation_issues: source.validation_issues,
}
}
}
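
`validation_issues` travels as a raw JSON string in both `Source` and `SourceResponse`. A sketch of the issue shape, matching the `{type, severity, message, recommendation}` objects the scheduler builds later in this commit — a hypothetical helper for consumers that want typed access, not part of the commit itself:

use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct ValidationIssue {
    #[serde(rename = "type")]
    issue_type: String,
    severity: String,
    message: String,
    recommendation: String,
}

// Treats a missing or malformed payload as "no issues"
fn parse_issues(raw: Option<&str>) -> Vec<ValidationIssue> {
    raw.and_then(|s| serde_json::from_str(s).ok())
        .unwrap_or_default()
}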

View File

@ -23,6 +23,7 @@ pub fn router() -> Router<Arc<AppState>> {
.route("/{id}/sync", post(trigger_sync))
.route("/{id}/sync/stop", post(stop_sync))
.route("/{id}/deep-scan", post(trigger_deep_scan))
.route("/{id}/validate", post(validate_source))
.route("/{id}/test", post(test_connection))
.route("/{id}/estimate", post(estimate_crawl))
.route("/estimate", post(estimate_crawl_with_config))
@ -642,6 +643,52 @@ async fn trigger_deep_scan(
}
}
#[utoipa::path(
post,
path = "/api/sources/{id}/validate",
tag = "sources",
security(
("bearer_auth" = [])
),
params(
("id" = Uuid, Path, description = "Source ID")
),
responses(
(status = 200, description = "Validation started successfully"),
(status = 401, description = "Unauthorized"),
(status = 404, description = "Source not found"),
(status = 500, description = "Internal server error")
)
)]
async fn validate_source(
auth_user: AuthUser,
Path(source_id): Path<Uuid>,
State(state): State<Arc<AppState>>,
) -> Result<Json<serde_json::Value>, StatusCode> {
info!("Starting validation check for source {} by user {}", source_id, auth_user.user.username);
let source = state
.db
.get_source(auth_user.user.id, source_id)
.await
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?
.ok_or(StatusCode::NOT_FOUND)?;
// Start validation in background
let state_clone = state.clone();
let source_clone = source.clone();
tokio::spawn(async move {
if let Err(e) = crate::scheduling::source_scheduler::SourceScheduler::validate_source_health(&source_clone, &state_clone).await {
error!("Manual validation check failed for source {}: {}", source_clone.name, e);
}
});
Ok(Json(serde_json::json!({
"success": true,
"message": format!("Validation check started for source '{}'", source.name)
})))
}
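
For reference, a minimal client-side sketch of exercising the new route with reqwest (json feature assumed; base URL and bearer token are placeholders — the path shape follows the utoipa annotation above, and the handler replies 200 with a `{ success, message }` body):

use reqwest::Client;
use uuid::Uuid;

async fn trigger_validation(base_url: &str, token: &str, source_id: Uuid) -> Result<(), reqwest::Error> {
    let response = Client::new()
        .post(format!("{}/api/sources/{}/validate", base_url, source_id))
        .bearer_auth(token)
        .send()
        .await?;

    // Handler returns JSON: { "success": true, "message": "Validation check started ..." }
    let body: serde_json::Value = response.json().await?;
    println!("validation response: {}", body["message"]);
    Ok(())
}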
#[utoipa::path(
post,
path = "/api/sources/{id}/sync/stop",

View File

@ -7,6 +7,7 @@ use tokio_util::sync::CancellationToken;
use tracing::{error, info, warn};
use chrono::Utc;
use uuid::Uuid;
use sqlx::Row;
use crate::{
AppState,
@ -14,6 +15,16 @@ use crate::{
};
use super::source_sync::SourceSyncService;
struct SyncHealthAnalysis {
score_penalty: i32,
issues: Vec<serde_json::Value>,
}
struct ErrorAnalysis {
score_penalty: i32,
issues: Vec<serde_json::Value>,
}
pub struct SourceScheduler {
state: Arc<AppState>,
sync_service: SourceSyncService,
@ -201,6 +212,11 @@ impl SourceScheduler {
info!("Background sync completed for source {}: {} files processed",
source_clone.name, files_processed);
// Perform automatic validation check after sync completion
if let Err(e) = Self::validate_source_health(&source_clone, &state_clone).await {
error!("Failed to perform validation check: {}", e);
}
// Update last sync time
if let Err(e) = sqlx::query(
r#"UPDATE sources
@ -516,4 +532,545 @@ impl SourceScheduler {
}
}
}
/// Check if a deep scan should be triggered based on sync results
async fn check_and_trigger_deep_scan(
source: &crate::models::Source,
files_processed: usize,
state: &Arc<AppState>,
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
// Get sync history for intelligent decision making
let recent_syncs = sqlx::query(
r#"
SELECT
COUNT(*) as sync_count,
SUM(CASE WHEN total_files_synced = 0 THEN 1 ELSE 0 END) as empty_sync_count,
MAX(last_sync_at) as last_sync,
MIN(last_sync_at) as first_sync
FROM (
SELECT total_files_synced, last_sync_at
FROM sources
WHERE id = $1
ORDER BY last_sync_at DESC
LIMIT 10
) recent_syncs
"#
)
.bind(source.id)
.fetch_one(state.db.get_pool())
.await?;
// Get last deep scan time
let last_deep_scan = sqlx::query(
r#"
SELECT MAX(created_at) as last_deep_scan
FROM notifications
WHERE user_id = $1
AND metadata->>'source_id' = $2
AND metadata->>'scan_type' = 'deep_scan'
AND notification_type = 'success'
"#
)
.bind(source.user_id)
.bind(source.id.to_string())
.fetch_one(state.db.get_pool())
.await?;
let mut should_trigger_deep_scan = false;
let mut reason = String::new();
// Trigger conditions:
// 1. If the last 5+ syncs found no files, something might be wrong
let empty_sync_count: i64 = recent_syncs.try_get("empty_sync_count").unwrap_or(0);
if empty_sync_count >= 5 {
should_trigger_deep_scan = true;
reason = "Multiple consecutive syncs found no files - deep scan needed to verify directory structure".to_string();
}
// 2. If we haven't done a deep scan in over 7 days
let last_deep_time: Option<chrono::DateTime<chrono::Utc>> = last_deep_scan.try_get("last_deep_scan").ok();
if let Some(last_deep) = last_deep_time {
let days_since_deep_scan = (chrono::Utc::now() - last_deep).num_days();
if days_since_deep_scan > 7 {
should_trigger_deep_scan = true;
reason = format!("No deep scan in {} days - periodic verification needed", days_since_deep_scan);
}
}
// 3. If this is the first sync ever (no deep scan history)
let sync_count: i64 = recent_syncs.try_get("sync_count").unwrap_or(0);
if last_deep_time.is_none() && sync_count <= 1 {
should_trigger_deep_scan = true;
reason = "First sync completed - deep scan recommended for initial directory discovery".to_string();
}
// 4. If sync found files but we've been getting inconsistent results
else if files_processed > 0 {
// Check for erratic sync patterns (alternating between finding files and not)
let erratic_check = sqlx::query(
r#"
SELECT
COUNT(DISTINCT CASE WHEN total_files_synced > 0 THEN 1 ELSE 0 END) as distinct_states
FROM (
SELECT total_files_synced
FROM sources
WHERE id = $1
ORDER BY last_sync_at DESC
LIMIT 5
) recent
"#
)
.bind(source.id)
.fetch_one(state.db.get_pool())
.await?;
let distinct_states: i64 = erratic_check.try_get("distinct_states").unwrap_or(0);
if distinct_states > 1 {
should_trigger_deep_scan = true;
reason = "Inconsistent sync results detected - deep scan needed for stability".to_string();
}
}
if should_trigger_deep_scan {
info!("🎯 Intelligent deep scan trigger activated for source {}: {}", source.name, reason);
// Create notification about automatic deep scan
let notification = crate::models::CreateNotification {
notification_type: "info".to_string(),
title: "Automatic Deep Scan Triggered".to_string(),
message: format!("Starting deep scan for {}: {}", source.name, reason),
action_url: Some("/sources".to_string()),
metadata: Some(serde_json::json!({
"source_type": source.source_type.to_string(),
"source_id": source.id,
"scan_type": "deep_scan",
"trigger_reason": reason,
"automatic": true
})),
};
if let Err(e) = state.db.create_notification(source.user_id, &notification).await {
error!("Failed to create deep scan notification: {}", e);
}
// Trigger the deep scan via the API endpoint
// We'll reuse the existing deep scan logic from the sources route
let webdav_config: WebDAVSourceConfig = serde_json::from_value(source.config.clone())?;
let webdav_service = crate::services::webdav_service::WebDAVService::new(
crate::services::webdav_service::WebDAVConfig {
server_url: webdav_config.server_url.clone(),
username: webdav_config.username.clone(),
password: webdav_config.password.clone(),
watch_folders: webdav_config.watch_folders.clone(),
file_extensions: webdav_config.file_extensions.clone(),
timeout_seconds: 600, // 10 minutes for deep scan
server_type: webdav_config.server_type.clone(),
}
)?;
// Run deep scan in background
let source_clone = source.clone();
let state_clone = state.clone();
tokio::spawn(async move {
match webdav_service.deep_scan_with_guaranteed_completeness(source_clone.user_id, &state_clone).await {
Ok(files) => {
info!("🎉 Automatic deep scan completed for {}: {} files found", source_clone.name, files.len());
// Process the files if any were found
let files_processed = if !files.is_empty() {
let total_files = files.len();
// Filter and process files as in the manual deep scan
let files_to_process: Vec<_> = files.into_iter()
.filter(|file_info| {
if file_info.is_directory {
return false;
}
let file_extension = std::path::Path::new(&file_info.name)
.extension()
.and_then(|ext| ext.to_str())
.unwrap_or("")
.to_lowercase();
webdav_config.file_extensions.contains(&file_extension)
})
.collect();
let processed_count = files_to_process.len();
if let Err(e) = crate::routes::webdav::webdav_sync::process_files_for_deep_scan(
state_clone.clone(),
source_clone.user_id,
&webdav_service,
&files_to_process,
true, // enable background OCR
Some(source_clone.id)
).await {
error!("Failed to process files from automatic deep scan: {}", e);
}
processed_count
} else {
0
};
// Success notification
let notification = crate::models::CreateNotification {
notification_type: "success".to_string(),
title: "Automatic Deep Scan Completed".to_string(),
message: format!("Deep scan of {} completed successfully", source_clone.name),
action_url: Some("/documents".to_string()),
metadata: Some(serde_json::json!({
"source_type": source_clone.source_type.to_string(),
"source_id": source_clone.id,
"scan_type": "deep_scan",
"automatic": true,
"files_found": files_processed
})),
};
if let Err(e) = state_clone.db.create_notification(source_clone.user_id, &notification).await {
error!("Failed to create success notification: {}", e);
}
}
Err(e) => {
error!("Automatic deep scan failed for {}: {}", source_clone.name, e);
// Error notification
let notification = crate::models::CreateNotification {
notification_type: "error".to_string(),
title: "Automatic Deep Scan Failed".to_string(),
message: format!("Deep scan of {} failed: {}", source_clone.name, e),
action_url: Some("/sources".to_string()),
metadata: Some(serde_json::json!({
"source_type": source_clone.source_type.to_string(),
"source_id": source_clone.id,
"scan_type": "deep_scan",
"automatic": true,
"error": e.to_string()
})),
};
if let Err(e) = state_clone.db.create_notification(source_clone.user_id, &notification).await {
error!("Failed to create error notification: {}", e);
}
}
}
});
}
Ok(())
}
/// Perform automatic validation of source health and connectivity
pub async fn validate_source_health(
source: &crate::models::Source,
state: &Arc<AppState>,
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
info!("🔍 Starting validation check for source: {}", source.name);
let mut validation_score = 100;
let mut validation_issues = Vec::<serde_json::Value>::new();
let mut validation_status = "healthy";
// 1. Configuration validation
if let Err(config_error) = Self::validate_source_config_detailed(source) {
validation_score -= 30;
validation_status = "critical";
validation_issues.push(serde_json::json!({
"type": "configuration",
"severity": "critical",
"message": format!("Configuration error: {}", config_error),
"recommendation": "Check and fix source configuration in settings"
}));
}
// 2. Connectivity validation
match source.source_type {
crate::models::SourceType::WebDAV => {
if let Err(e) = Self::validate_webdav_connectivity(source).await {
validation_score -= 25;
if validation_status == "healthy" { validation_status = "warning"; }
validation_issues.push(serde_json::json!({
"type": "connectivity",
"severity": "warning",
"message": format!("WebDAV connectivity issue: {}", e),
"recommendation": "Check server URL, credentials, and network connectivity"
}));
}
}
crate::models::SourceType::LocalFolder => {
if let Err(e) = Self::validate_local_folder_access(source).await {
validation_score -= 25;
if validation_status == "healthy" { validation_status = "warning"; }
validation_issues.push(serde_json::json!({
"type": "connectivity",
"severity": "warning",
"message": format!("Local folder access issue: {}", e),
"recommendation": "Check folder permissions and path accessibility"
}));
}
}
crate::models::SourceType::S3 => {
if let Err(e) = Self::validate_s3_connectivity(source).await {
validation_score -= 25;
if validation_status == "healthy" { validation_status = "warning"; }
validation_issues.push(serde_json::json!({
"type": "connectivity",
"severity": "warning",
"message": format!("S3 connectivity issue: {}", e),
"recommendation": "Check AWS credentials, bucket access, and permissions"
}));
}
}
}
// 3. Sync pattern analysis
if let Ok(sync_health) = Self::analyze_sync_patterns(source, state).await {
validation_score -= sync_health.score_penalty;
if sync_health.score_penalty > 15 && validation_status == "healthy" {
validation_status = "warning";
}
for issue in sync_health.issues {
validation_issues.push(issue);
}
}
// 4. Error rate analysis
if let Ok(error_analysis) = Self::analyze_error_patterns(source, state).await {
validation_score -= error_analysis.score_penalty;
if error_analysis.score_penalty > 20 && validation_status == "healthy" {
validation_status = "warning"; // guard so a "critical" status is never downgraded
}
for issue in error_analysis.issues {
validation_issues.push(issue);
}
}
// Clamp the score so it never drops below 0
validation_score = validation_score.max(0);
// Update validation status in database
let validation_issues_json = serde_json::to_string(&validation_issues)
.unwrap_or_else(|_| "[]".to_string());
if let Err(e) = sqlx::query(
r#"
UPDATE sources
SET validation_status = $1,
last_validation_at = NOW(),
validation_score = $2,
validation_issues = $3,
updated_at = NOW()
WHERE id = $4
"#
)
.bind(validation_status)
.bind(validation_score)
.bind(validation_issues_json)
.bind(source.id)
.execute(state.db.get_pool())
.await {
error!("Failed to update validation status: {}", e);
}
// Send notification if there are critical issues
if validation_status == "critical" || validation_score < 50 {
let notification = crate::models::CreateNotification {
notification_type: if validation_status == "critical" { "error" } else { "warning" }.to_string(),
title: format!("Source Validation {}", if validation_status == "critical" { "Failed" } else { "Warning" }),
message: format!("Source {} has validation issues (score: {})", source.name, validation_score),
action_url: Some("/sources".to_string()),
metadata: Some(serde_json::json!({
"source_type": source.source_type.to_string(),
"source_id": source.id,
"validation_type": "health_check",
"validation_score": validation_score,
"validation_status": validation_status,
"issue_count": validation_issues.len()
})),
};
if let Err(e) = state.db.create_notification(source.user_id, &notification).await {
error!("Failed to create validation notification: {}", e);
}
}
info!("✅ Validation completed for {}: {} (score: {})", source.name, validation_status, validation_score);
Ok(())
}
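Distilled, the health model is additive: start from 100, subtract per-category penalties, clamp at 0, and let any category past its threshold demote the status, with configuration failures always critical. A standalone sketch of just that arithmetic, for illustration only — the real function above also persists the result and raises notifications:

fn score_to_status(config_failed: bool, connectivity_penalty: i32, sync_penalty: i32, error_penalty: i32) -> (i32, &'static str) {
    let mut score = 100;
    let mut status = "healthy";

    if config_failed {
        score -= 30;
        status = "critical"; // configuration errors are always critical
    }
    if connectivity_penalty > 0 {
        score -= connectivity_penalty; // 25 for any connectivity failure above
        if status == "healthy" { status = "warning"; }
    }
    score -= sync_penalty;
    if sync_penalty > 15 && status == "healthy" { status = "warning"; }
    score -= error_penalty;
    if error_penalty > 20 && status == "healthy" { status = "warning"; }

    (score.max(0), status)
}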
fn validate_source_config_detailed(source: &crate::models::Source) -> Result<(), String> {
// Currently delegates to the shared config validator; detailed per-field checks can hook in here
Self::validate_source_config_static(source)
}
fn validate_source_config_static(source: &crate::models::Source) -> Result<(), String> {
use crate::models::{SourceType, WebDAVSourceConfig, S3SourceConfig, LocalFolderSourceConfig};
match source.source_type {
SourceType::WebDAV => {
let config: WebDAVSourceConfig = serde_json::from_value(source.config.clone())
.map_err(|e| format!("Failed to parse WebDAV configuration: {}", e))?;
if config.server_url.trim().is_empty() {
return Err("WebDAV server URL is empty".to_string());
}
if config.username.trim().is_empty() {
return Err("WebDAV username is empty".to_string());
}
if config.password.trim().is_empty() {
return Err("WebDAV password is empty".to_string());
}
if config.watch_folders.is_empty() {
return Err("WebDAV watch folders list is empty".to_string());
}
Ok(())
}
SourceType::S3 => {
let _config: S3SourceConfig = serde_json::from_value(source.config.clone())
.map_err(|e| format!("Failed to parse S3 configuration: {}", e))?;
Ok(())
}
SourceType::LocalFolder => {
let _config: LocalFolderSourceConfig = serde_json::from_value(source.config.clone())
.map_err(|e| format!("Failed to parse Local Folder configuration: {}", e))?;
Ok(())
}
}
}
async fn validate_webdav_connectivity(source: &crate::models::Source) -> Result<(), String> {
use crate::models::WebDAVSourceConfig;
let config: WebDAVSourceConfig = serde_json::from_value(source.config.clone())
.map_err(|e| format!("Config parse error: {}", e))?;
// Clone the fields here; they are reused below to build the test request
let webdav_config = crate::services::webdav_service::WebDAVConfig {
server_url: config.server_url.clone(),
username: config.username.clone(),
password: config.password.clone(),
watch_folders: config.watch_folders.clone(),
file_extensions: config.file_extensions.clone(),
timeout_seconds: 30, // Quick connectivity test
server_type: config.server_type.clone(),
};
let webdav_service = crate::services::webdav_service::WebDAVService::new(webdav_config)
.map_err(|e| format!("Service creation failed: {}", e))?;
let test_config = crate::models::WebDAVTestConnection {
server_url: config.server_url.clone(),
username: config.username.clone(),
password: config.password.clone(),
server_type: config.server_type.clone(),
};
webdav_service.test_connection(test_config).await
.map_err(|e| format!("Connection test failed: {}", e.message))?;
Ok(())
}
async fn validate_local_folder_access(_source: &crate::models::Source) -> Result<(), String> {
// Simplified local folder validation - could be enhanced
// For now, just return OK as local folders are validated differently
Ok(())
}
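As the comment says, the local-folder path is a stub. A fuller check could verify each watch folder exists, is a directory, and is readable, using only the standard library. A sketch, assuming `LocalFolderSourceConfig` exposes a `watch_folders: Vec<String>` like its WebDAV counterpart (the field name is an assumption):

fn check_local_folders(watch_folders: &[String]) -> Result<(), String> {
    for folder in watch_folders {
        let path = std::path::Path::new(folder);
        if !path.exists() {
            return Err(format!("Watch folder does not exist: {}", folder));
        }
        if !path.is_dir() {
            return Err(format!("Watch path is not a directory: {}", folder));
        }
        // Listing the directory doubles as a permission check
        std::fs::read_dir(path)
            .map_err(|e| format!("Cannot read watch folder {}: {}", folder, e))?;
    }
    Ok(())
}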
async fn validate_s3_connectivity(_source: &crate::models::Source) -> Result<(), String> {
// Simplified S3 validation - could be enhanced with actual AWS SDK calls
// For now, just return OK as S3 validation requires more complex setup
Ok(())
}
async fn analyze_sync_patterns(
source: &crate::models::Source,
state: &Arc<AppState>
) -> Result<SyncHealthAnalysis, Box<dyn std::error::Error + Send + Sync>> {
let mut score_penalty = 0;
let mut issues = Vec::new();
// Check recent sync history
let sync_stats = sqlx::query(
r#"
SELECT
COUNT(*) as total_syncs,
SUM(CASE WHEN total_files_synced = 0 THEN 1 ELSE 0 END) as empty_syncs,
MAX(last_sync_at) as last_sync,
AVG(total_files_synced) as avg_files_per_sync
FROM sources
WHERE id = $1 AND last_sync_at >= NOW() - INTERVAL '7 days'
"#
)
.bind(source.id)
.fetch_one(state.db.get_pool())
.await?;
let total_syncs: i64 = sync_stats.try_get("total_syncs").unwrap_or(0);
let empty_syncs: i64 = sync_stats.try_get("empty_syncs").unwrap_or(0);
if total_syncs > 0 {
let empty_sync_ratio = (empty_syncs as f64) / (total_syncs as f64);
if empty_sync_ratio > 0.8 {
score_penalty += 20;
issues.push(serde_json::json!({
"type": "sync_pattern",
"severity": "warning",
"message": format!("High empty sync ratio: {:.1}% of recent syncs found no files", empty_sync_ratio * 100.0),
"recommendation": "This may indicate connectivity issues or that the source has no new content"
}));
}
if total_syncs < 2 && chrono::Utc::now().signed_duration_since(source.created_at).num_days() > 1 {
score_penalty += 10;
issues.push(serde_json::json!({
"type": "sync_pattern",
"severity": "info",
"message": "Very few syncs performed since source creation",
"recommendation": "Consider enabling auto-sync or manually triggering sync to ensure content is up to date"
}));
}
}
Ok(SyncHealthAnalysis { score_penalty, issues })
}
async fn analyze_error_patterns(
source: &crate::models::Source,
_state: &Arc<AppState>
) -> Result<ErrorAnalysis, Box<dyn std::error::Error + Send + Sync>> {
let mut score_penalty = 0;
let mut issues = Vec::new();
// Check if source has recent errors
if let Some(last_error_at) = source.last_error_at {
let hours_since_error = chrono::Utc::now().signed_duration_since(last_error_at).num_hours();
if hours_since_error < 24 {
score_penalty += 15;
issues.push(serde_json::json!({
"type": "error_pattern",
"severity": "warning",
"message": format!("Recent error occurred {} hours ago", hours_since_error),
"recommendation": format!("Last error: {}", source.last_error.as_deref().unwrap_or("Unknown error"))
}));
}
}
// Check if source is in error state
if source.status == crate::models::SourceStatus::Error {
score_penalty += 25;
issues.push(serde_json::json!({
"type": "error_pattern",
"severity": "critical",
"message": "Source is currently in error state",
"recommendation": "Review and fix the configuration or connectivity issues"
}));
}
Ok(ErrorAnalysis { score_penalty, issues })
}
}

View File

@ -948,179 +948,6 @@ impl WebDAVService {
}
}
}
/// Get a list of directories that need targeted scanning based on recent changes
pub async fn get_directories_needing_scan(&self, user_id: uuid::Uuid, state: &crate::AppState, max_age_hours: i64) -> Result<Vec<String>> {
let cutoff_time = chrono::Utc::now() - chrono::Duration::hours(max_age_hours);
match state.db.list_webdav_directories(user_id).await {
Ok(directories) => {
let stale_dirs: Vec<String> = directories.iter()
.filter(|dir| dir.last_scanned_at < cutoff_time)
.map(|dir| dir.directory_path.clone())
.collect();
debug!("🕒 Found {} directories not scanned in last {} hours", stale_dirs.len(), max_age_hours);
Ok(stale_dirs)
}
Err(e) => {
error!("Failed to get directories needing scan: {}", e);
Err(e.into())
}
}
}
/// Smart sync mode that combines multiple optimization strategies
pub async fn discover_files_smart_sync(&self, watch_folders: &[String], user_id: uuid::Uuid, state: &crate::AppState) -> Result<Vec<FileInfo>> {
debug!("🧠 Starting smart sync for {} watch folders", watch_folders.len());
let mut all_files = Vec::new();
for folder_path in watch_folders {
debug!("🔍 Smart sync processing folder: {}", folder_path);
// Step 1: Try optimized discovery first (checks directory ETag)
let optimized_result = self.discover_files_in_folder_optimized(folder_path, user_id, state).await;
match optimized_result {
Ok(files) => {
if !files.is_empty() {
debug!("✅ Optimized discovery found {} files in {}", files.len(), folder_path);
all_files.extend(files);
} else {
debug!("🔍 Directory {} unchanged, checking for stale subdirectories", folder_path);
// Step 2: Check for stale subdirectories that need targeted scanning
let stale_dirs = self.get_stale_subdirectories(folder_path, user_id, state, 24).await?;
if !stale_dirs.is_empty() {
debug!("🎯 Found {} stale subdirectories, performing targeted scan", stale_dirs.len());
let targeted_files = self.discover_files_targeted_rescan(&stale_dirs, user_id, state).await?;
all_files.extend(targeted_files);
} else {
debug!("✅ All subdirectories of {} are fresh, no scan needed", folder_path);
}
}
}
Err(e) => {
warn!("Optimized discovery failed for {}, falling back to full scan: {}", folder_path, e);
// Fallback to traditional full scan
match self.discover_files_in_folder(folder_path).await {
Ok(files) => {
debug!("📂 Fallback scan found {} files in {}", files.len(), folder_path);
all_files.extend(files);
}
Err(fallback_error) => {
error!("Both optimized and fallback scans failed for {}: {}", folder_path, fallback_error);
return Err(fallback_error);
}
}
}
}
}
debug!("🧠 Smart sync completed: {} total files discovered", all_files.len());
Ok(all_files)
}
/// Get subdirectories of a parent that haven't been scanned recently
async fn get_stale_subdirectories(&self, parent_path: &str, user_id: uuid::Uuid, state: &crate::AppState, max_age_hours: i64) -> Result<Vec<String>> {
let cutoff_time = chrono::Utc::now() - chrono::Duration::hours(max_age_hours);
match state.db.list_webdav_directories(user_id).await {
Ok(directories) => {
let stale_subdirs: Vec<String> = directories.iter()
.filter(|dir| {
dir.directory_path.starts_with(parent_path) &&
dir.directory_path != parent_path &&
dir.last_scanned_at < cutoff_time
})
.map(|dir| dir.directory_path.clone())
.collect();
debug!("🕒 Found {} stale subdirectories under {} (not scanned in {} hours)",
stale_subdirs.len(), parent_path, max_age_hours);
Ok(stale_subdirs)
}
Err(e) => {
error!("Failed to get stale subdirectories: {}", e);
Err(e.into())
}
}
}
/// Perform incremental sync - only scan directories that have actually changed
pub async fn discover_files_incremental(&self, watch_folders: &[String], user_id: uuid::Uuid, state: &crate::AppState) -> Result<Vec<FileInfo>> {
debug!("⚡ Starting incremental sync for {} watch folders", watch_folders.len());
let mut changed_files = Vec::new();
let mut unchanged_count = 0;
let mut changed_count = 0;
for folder_path in watch_folders {
// Check directory ETag to see if it changed
match self.check_directory_etag(folder_path).await {
Ok(current_etag) => {
let needs_scan = match state.db.get_webdav_directory(user_id, folder_path).await {
Ok(Some(stored_dir)) => {
if stored_dir.directory_etag != current_etag {
debug!("🔄 Directory {} changed (ETag: {} → {})", folder_path, stored_dir.directory_etag, current_etag);
changed_count += 1;
true
} else {
debug!("✅ Directory {} unchanged (ETag: {})", folder_path, current_etag);
unchanged_count += 1;
false
}
}
Ok(None) => {
debug!("🆕 New directory {} detected", folder_path);
changed_count += 1;
true
}
Err(e) => {
warn!("Database error for {}: {}, scanning to be safe", folder_path, e);
changed_count += 1;
true
}
};
if needs_scan {
// Directory changed - perform targeted scan
match self.discover_files_in_folder_optimized(folder_path, user_id, state).await {
Ok(mut files) => {
debug!("📂 Incremental scan found {} files in changed directory {}", files.len(), folder_path);
changed_files.append(&mut files);
}
Err(e) => {
error!("Failed incremental scan of {}: {}", folder_path, e);
}
}
} else {
// Directory unchanged - just update scan timestamp
let update = crate::models::UpdateWebDAVDirectory {
directory_etag: current_etag,
last_scanned_at: chrono::Utc::now(),
file_count: 0, // Will be updated by the database layer
total_size_bytes: 0,
};
if let Err(e) = state.db.update_webdav_directory(user_id, folder_path, &update).await {
warn!("Failed to update scan timestamp for {}: {}", folder_path, e);
}
}
}
Err(e) => {
error!("Failed to check directory ETag for {}: {}", folder_path, e);
}
}
}
debug!("⚡ Incremental sync completed: {} unchanged, {} changed, {} total files found",
unchanged_count, changed_count, changed_files.len());
Ok(changed_files)
}
/// Check subdirectories individually for changes when parent directory is unchanged
async fn check_subdirectories_for_changes(&self, parent_path: &str, user_id: uuid::Uuid, state: &crate::AppState) -> Result<Vec<FileInfo>> {
@ -1802,10 +1629,6 @@ impl WebDAVService {
state.db.mark_webdav_scan_complete(user_id, path).await
}
/// Resume a deep scan from a checkpoint after server restart/interruption
pub async fn resume_deep_scan(&self, checkpoint_path: &str, user_id: uuid::Uuid, state: &crate::AppState) -> Result<Vec<FileInfo>> {
self.resume_deep_scan_internal(checkpoint_path, user_id, state).await
}
/// Internal resume function that doesn't trigger crash recovery detection (to avoid recursion)
async fn resume_deep_scan_internal(&self, checkpoint_path: &str, user_id: uuid::Uuid, state: &crate::AppState) -> Result<Vec<FileInfo>> {
@ -1857,88 +1680,6 @@ impl WebDAVService {
}
}
/// Discover files in multiple folders concurrently with rate limiting
pub async fn discover_files_concurrent(&self, folders: &[String], user_id: uuid::Uuid, state: &crate::AppState) -> Result<Vec<FileInfo>> {
if folders.is_empty() {
return Ok(Vec::new());
}
info!("🚀 Starting concurrent discovery for {} folders", folders.len());
let semaphore = std::sync::Arc::new(Semaphore::new(self.concurrency_config.max_concurrent_scans));
let folders_stream = stream::iter(folders.iter())
.map(|folder_path| {
let semaphore = semaphore.clone();
let service = self.clone();
let folder_path = folder_path.clone();
async move {
let _permit = semaphore.acquire().await.map_err(|e| anyhow!("Semaphore error: {}", e))?;
info!("📂 Scanning folder: {}", folder_path);
let start_time = std::time::Instant::now();
// Save checkpoint for resumption after interruption
let checkpoint_record = crate::models::CreateWebDAVDirectory {
user_id,
directory_path: folder_path.clone(),
directory_etag: "scanning".to_string(), // Temporary marker
file_count: 0,
total_size_bytes: 0,
};
if let Err(e) = state.db.create_or_update_webdav_directory(&checkpoint_record).await {
warn!("Failed to save scan checkpoint for {}: {}", folder_path, e);
}
let result = service.discover_files_in_folder_optimized(&folder_path, user_id, state).await;
match &result {
Ok(files) => {
let duration = start_time.elapsed();
info!("✅ Completed folder {} in {:?}: {} files found",
folder_path, duration, files.len());
}
Err(e) => {
// Check if this was a server restart/connection issue
if service.is_server_restart_error(e) {
warn!("🔄 Server restart detected during scan of {}, will resume later", folder_path);
// Keep checkpoint for resumption
return Err(anyhow!("Server restart detected: {}", e));
} else {
error!("❌ Failed to scan folder {}: {}", folder_path, e);
}
}
}
result.map(|files| (folder_path, files))
}
})
.buffer_unordered(self.concurrency_config.max_concurrent_scans);
let mut all_files = Vec::new();
let mut success_count = 0;
let mut error_count = 0;
let mut folders_stream = std::pin::pin!(folders_stream);
while let Some(result) = folders_stream.next().await {
match result {
Ok((folder_path, mut files)) => {
debug!("📁 Folder {} contributed {} files", folder_path, files.len());
all_files.append(&mut files);
success_count += 1;
}
Err(e) => {
warn!("Folder scan error: {}", e);
error_count += 1;
}
}
}
info!("🎯 Concurrent discovery completed: {} folders successful, {} failed, {} total files",
success_count, error_count, all_files.len());
Ok(all_files)
}
pub async fn download_file(&self, file_path: &str) -> Result<Vec<u8>> {
self.retry_with_backoff("download_file", || {