From b356017484631bb7eac081fb4368bc8bd2b33166 Mon Sep 17 00:00:00 2001 From: perf3ct Date: Mon, 7 Jul 2025 19:10:45 +0000 Subject: [PATCH] feat(server): implement better error checking for sources --- src/db/sources.rs | 3 +- src/routes/documents/types.rs | 2 +- src/scheduling/source_scheduler.rs | 11 ++++++++ tests/integration_debug_ocr_test.rs | 43 +++++++++++++++++++++-------- 4 files changed, 45 insertions(+), 14 deletions(-) diff --git a/src/db/sources.rs b/src/db/sources.rs index ee349fc..0ba1c65 100644 --- a/src/db/sources.rs +++ b/src/db/sources.rs @@ -286,7 +286,8 @@ impl Database { let rows = sqlx::query( r#"SELECT id, user_id, name, source_type, enabled, config, status, last_sync_at, last_error, last_error_at, total_files_synced, - total_files_pending, total_size_bytes, created_at, updated_at + total_files_pending, total_size_bytes, created_at, updated_at, + validation_status, last_validation_at, validation_score, validation_issues FROM sources WHERE enabled = true AND status != 'syncing' ORDER BY last_sync_at ASC NULLS FIRST"# diff --git a/src/routes/documents/types.rs b/src/routes/documents/types.rs index 17639b6..4e36e26 100644 --- a/src/routes/documents/types.rs +++ b/src/routes/documents/types.rs @@ -27,7 +27,7 @@ pub struct DeleteLowConfidenceRequest { pub preview_only: Option, } -#[derive(Serialize, ToSchema)] +#[derive(Deserialize, Serialize, ToSchema)] pub struct DocumentUploadResponse { pub document_id: uuid::Uuid, pub filename: String, diff --git a/src/scheduling/source_scheduler.rs b/src/scheduling/source_scheduler.rs index 91d6606..5558fa1 100644 --- a/src/scheduling/source_scheduler.rs +++ b/src/scheduling/source_scheduler.rs @@ -168,6 +168,17 @@ impl SourceScheduler { let sources = self.state.db.get_sources_for_sync().await?; for source in sources { + // Skip sources that are already in error status due to configuration issues + if source.status == crate::models::SourceStatus::Error && + source.last_error.as_ref().map(|e| e.contains("Configuration error")).unwrap_or(false) { + // Only log this once every hour to reduce spam + if source.last_error_at.map(|t| chrono::Utc::now() - t > chrono::Duration::hours(1)).unwrap_or(true) { + warn!("⚠️ Skipping source '{}' (ID: {}) due to persistent configuration error: {}", + source.name, source.id, source.last_error.as_ref().unwrap_or(&"Unknown error".to_string())); + } + continue; + } + // Validate source configuration before checking if sync is due if let Err(e) = self.validate_source_config(&source) { error!("❌ CONFIGURATION ERROR during background sync check for source '{}' (ID: {}): {}", diff --git a/tests/integration_debug_ocr_test.rs b/tests/integration_debug_ocr_test.rs index 0ce3ad0..9e133e8 100644 --- a/tests/integration_debug_ocr_test.rs +++ b/tests/integration_debug_ocr_test.rs @@ -6,9 +6,9 @@ use reqwest::Client; use serde_json::Value; use std::time::{Duration, Instant}; use tokio::time::sleep; -use uuid::Uuid; -use readur::models::{DocumentResponse, CreateUser, LoginRequest, LoginResponse}; +use readur::models::{CreateUser, LoginRequest, LoginResponse}; +use readur::routes::documents::types::DocumentUploadResponse; fn get_base_url() -> String { std::env::var("API_URL").unwrap_or_else(|_| "http://localhost:8000".to_string()) @@ -131,22 +131,29 @@ async fn debug_ocr_content() { panic!("Document 2 upload failed with status {}: {}", status, error_text); } - let doc1: DocumentResponse = doc1_response.json().await.expect("Valid JSON for doc1"); - let doc2: DocumentResponse = doc2_response.json().await.expect("Valid JSON for doc2"); + let doc1: DocumentUploadResponse = doc1_response.json().await.expect("Valid JSON for doc1"); + let doc2: DocumentUploadResponse = doc2_response.json().await.expect("Valid JSON for doc2"); - println!("📄 Document 1: {}", doc1.id); - println!("📄 Document 2: {}", doc2.id); + println!("📄 Document 1: {}", doc1.document_id); + println!("📄 Document 2: {}", doc2.document_id); // Wait for OCR to complete let start = Instant::now(); let mut doc1_completed = false; let mut doc2_completed = false; + let mut last_status_print = Instant::now(); while start.elapsed() < TIMEOUT && (!doc1_completed || !doc2_completed) { + // Print progress every 10 seconds + if last_status_print.elapsed() >= Duration::from_secs(10) { + println!("⏳ OCR processing... elapsed: {:?}, Doc1: {}, Doc2: {}", + start.elapsed(), doc1_completed, doc2_completed); + last_status_print = Instant::now(); + } // Check document 1 if !doc1_completed { let response = client - .get(&format!("{}/api/documents/{}/ocr", get_base_url(), doc1.id)) + .get(&format!("{}/api/documents/{}/ocr", get_base_url(), doc1.document_id)) .header("Authorization", format!("Bearer {}", token)) .send() .await @@ -154,17 +161,21 @@ async fn debug_ocr_content() { if response.status().is_success() { let ocr_data: Value = response.json().await.expect("Valid JSON"); - if ocr_data["ocr_status"].as_str() == Some("completed") { + let current_status = ocr_data["ocr_status"].as_str().unwrap_or("unknown"); + println!("📊 Document 1 OCR status: {}", current_status); + if current_status == "completed" { doc1_completed = true; println!("✅ Document 1 OCR completed"); } + } else { + println!("❌ Document 1 OCR endpoint returned: {}", response.status()); } } // Check document 2 if !doc2_completed { let response = client - .get(&format!("{}/api/documents/{}/ocr", get_base_url(), doc2.id)) + .get(&format!("{}/api/documents/{}/ocr", get_base_url(), doc2.document_id)) .header("Authorization", format!("Bearer {}", token)) .send() .await @@ -172,10 +183,14 @@ async fn debug_ocr_content() { if response.status().is_success() { let ocr_data: Value = response.json().await.expect("Valid JSON"); - if ocr_data["ocr_status"].as_str() == Some("completed") { + let current_status = ocr_data["ocr_status"].as_str().unwrap_or("unknown"); + println!("📊 Document 2 OCR status: {}", current_status); + if current_status == "completed" { doc2_completed = true; println!("✅ Document 2 OCR completed"); } + } else { + println!("❌ Document 2 OCR endpoint returned: {}", response.status()); } } @@ -183,19 +198,23 @@ async fn debug_ocr_content() { } if !doc1_completed || !doc2_completed { + println!("❌ OCR TIMEOUT DETAILS:"); + println!(" ⏱️ Total elapsed time: {:?}", start.elapsed()); + println!(" 📄 Document 1 completed: {}", doc1_completed); + println!(" 📄 Document 2 completed: {}", doc2_completed); panic!("OCR did not complete within timeout"); } // Now get the actual OCR content and analyze it let doc1_ocr_response = client - .get(&format!("{}/api/documents/{}/ocr", get_base_url(), doc1.id)) + .get(&format!("{}/api/documents/{}/ocr", get_base_url(), doc1.document_id)) .header("Authorization", format!("Bearer {}", token)) .send() .await .expect("OCR endpoint should work"); let doc2_ocr_response = client - .get(&format!("{}/api/documents/{}/ocr", get_base_url(), doc2.id)) + .get(&format!("{}/api/documents/{}/ocr", get_base_url(), doc2.document_id)) .header("Authorization", format!("Bearer {}", token)) .send() .await