/*! * Debug OCR Pipeline Test - Trace every step to find corruption source */ use reqwest::Client; use serde_json::Value; use std::time::{Duration, Instant}; use tokio::time::sleep; use uuid::Uuid; use futures; use readur::models::{DocumentResponse, CreateUser, LoginRequest, LoginResponse}; fn get_base_url() -> String { std::env::var("API_URL").unwrap_or_else(|_| "http://localhost:8000".to_string()) } const TIMEOUT: Duration = Duration::from_secs(120); struct PipelineDebugger { client: Client, token: String, } impl PipelineDebugger { async fn new() -> Self { let client = Client::new(); // Debug: Print the base URL we're trying to connect to let base_url = get_base_url(); println!("šŸ” DEBUG: Attempting to connect to server at: {}", base_url); // Check server health with better error handling println!("šŸ” DEBUG: Checking server health at: {}/api/health", base_url); let health_check_result = client .get(&format!("{}/api/health", base_url)) .timeout(Duration::from_secs(5)) .send() .await; match health_check_result { Ok(response) => { println!("šŸ” DEBUG: Health check response status: {}", response.status()); if !response.status().is_success() { let status = response.status(); let body = response.text().await.unwrap_or_else(|_| "Unable to read response body".to_string()); panic!("Server not healthy. Status: {}, Body: {}", status, body); } println!("āœ… DEBUG: Server health check passed"); } Err(e) => { println!("āŒ DEBUG: Failed to connect to server health endpoint"); println!("āŒ DEBUG: Error type: {:?}", e); if e.is_timeout() { panic!("Health check timed out after 5 seconds"); } else if e.is_connect() { panic!("Could not connect to server at {}. Is the server running?", base_url); } else { panic!("Health check failed with error: {}", e); } } } // Create test user let timestamp = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .unwrap() .as_millis(); let username = format!("pipeline_debug_{}", timestamp); let email = format!("pipeline_debug_{}@test.com", timestamp); // Register user let user_data = CreateUser { username: username.clone(), email: email.clone(), password: "testpass123".to_string(), role: Some(readur::models::UserRole::User), }; let register_response = client .post(&format!("{}/api/auth/register", get_base_url())) .json(&user_data) .send() .await .expect("Registration should work"); if !register_response.status().is_success() { panic!("Registration failed: {}", register_response.text().await.unwrap_or_default()); } // Login let login_data = LoginRequest { username: username.clone(), password: "testpass123".to_string(), }; let login_response = client .post(&format!("{}/api/auth/login", get_base_url())) .json(&login_data) .send() .await .expect("Login should work"); if !login_response.status().is_success() { panic!("Login failed: {}", login_response.text().await.unwrap_or_default()); } let login_result: LoginResponse = login_response.json().await.expect("Login should return JSON"); let token = login_result.token; println!("āœ… Pipeline debugger initialized for user: {}", username); Self { client, token } } async fn upload_document_with_debug(&self, content: &str, filename: &str) -> DocumentResponse { println!("\nšŸ“¤ UPLOAD PHASE - Starting upload for: {}", filename); println!(" Content: {}", content); println!(" Content Length: {} bytes", content.len()); let part = reqwest::multipart::Part::text(content.to_string()) .file_name(filename.to_string()) .mime_str("text/plain") .expect("Valid mime type"); let form = reqwest::multipart::Form::new().part("file", part); let upload_start = Instant::now(); let upload_url = format!("{}/api/documents", get_base_url()); println!(" šŸ” DEBUG: Uploading to URL: {}", upload_url); println!(" šŸ” DEBUG: Using token (first 10 chars): {}...", &self.token[..10.min(self.token.len())]); let response_result = self.client .post(&upload_url) .header("Authorization", format!("Bearer {}", self.token)) .multipart(form) .send() .await; let response = match response_result { Ok(resp) => { println!(" šŸ” DEBUG: Upload request sent successfully"); resp } Err(e) => { println!(" āŒ DEBUG: Upload request failed"); println!(" āŒ DEBUG: Error type: {:?}", e); if e.is_timeout() { panic!("Upload request timed out"); } else if e.is_connect() { panic!("Could not connect to server for upload. Error: {}", e); } else if e.is_request() { panic!("Request building failed: {}", e); } else { panic!("Upload failed with network error: {}", e); } } }; let upload_duration = upload_start.elapsed(); println!(" šŸ” DEBUG: Upload response received. Status: {}", response.status()); if !response.status().is_success() { let status = response.status(); let headers = response.headers().clone(); let body = response.text().await.unwrap_or_else(|_| "Unable to read response body".to_string()); println!(" āŒ DEBUG: Upload failed with status: {}", status); println!(" āŒ DEBUG: Response headers: {:?}", headers); println!(" āŒ DEBUG: Response body: {}", body); panic!("Upload failed with status {}: {}", status, body); } let document: DocumentResponse = response.json().await.expect("Valid JSON"); println!(" āœ… Upload completed in {:?}", upload_duration); println!(" šŸ“„ Document ID: {}", document.id); println!(" šŸ“‚ Filename: {}", document.filename); println!(" šŸ“ File Size: {} bytes", document.file_size); println!(" šŸ·ļø MIME Type: {}", document.mime_type); println!(" šŸ”„ Initial OCR Status: {:?}", document.ocr_status); document } async fn trace_ocr_processing(&self, document_id: Uuid, expected_content: &str) -> Value { println!("\nšŸ” OCR PROCESSING PHASE - Tracing for document: {}", document_id); let start = Instant::now(); let mut last_status = String::new(); let mut status_changes = Vec::new(); let mut poll_count = 0; while start.elapsed() < TIMEOUT { poll_count += 1; let response = self.client .get(&format!("{}/api/documents/{}/ocr", get_base_url(), document_id)) .header("Authorization", format!("Bearer {}", self.token)) .send() .await .expect("OCR endpoint should work"); if !response.status().is_success() { println!(" āŒ OCR endpoint error: {}", response.status()); sleep(Duration::from_millis(100)).await; continue; } let ocr_data: Value = response.json().await.expect("Valid JSON"); let current_status = ocr_data["ocr_status"].as_str().unwrap_or("unknown").to_string(); // Track status changes if current_status != last_status { let elapsed = start.elapsed(); status_changes.push((elapsed, current_status.clone())); println!(" šŸ“‹ Status Change #{}: {} -> {} (after {:?})", status_changes.len(), last_status, current_status, elapsed); last_status = current_status.clone(); } // Detailed logging every 10 polls or on status change if poll_count % 10 == 0 || status_changes.len() > 0 { println!(" šŸ”„ Poll #{}: Status={}, HasText={}, TextLen={}", poll_count, current_status, ocr_data["has_ocr_text"].as_bool().unwrap_or(false), ocr_data["ocr_text"].as_str().unwrap_or("").len() ); if let Some(confidence) = ocr_data["ocr_confidence"].as_f64() { println!(" šŸ“Š Confidence: {:.1}%", confidence); } if let Some(word_count) = ocr_data["ocr_word_count"].as_i64() { println!(" šŸ“ Word Count: {}", word_count); } if let Some(error) = ocr_data["ocr_error"].as_str() { println!(" āŒ Error: {}", error); } } // Check if processing is complete match current_status.as_str() { "completed" => { println!(" āœ… OCR Processing completed after {:?} and {} polls", start.elapsed(), poll_count); // Detailed final analysis let ocr_text = ocr_data["ocr_text"].as_str().unwrap_or(""); println!("\n šŸ”¬ FINAL CONTENT ANALYSIS:"); println!(" Expected: {}", expected_content); println!(" Actual: {}", ocr_text); println!(" Match: {}", ocr_text == expected_content); println!(" Expected Length: {} chars", expected_content.len()); println!(" Actual Length: {} chars", ocr_text.len()); if ocr_text != expected_content { println!(" āš ļø CONTENT MISMATCH DETECTED!"); // Character-by-character comparison let expected_chars: Vec = expected_content.chars().collect(); let actual_chars: Vec = ocr_text.chars().collect(); for (i, (e, a)) in expected_chars.iter().zip(actual_chars.iter()).enumerate() { if e != a { println!(" Diff at position {}: expected '{}' got '{}'", i, e, a); break; } } } return ocr_data; } "failed" => { println!(" āŒ OCR Processing failed after {:?} and {} polls", start.elapsed(), poll_count); return ocr_data; } _ => { // Continue polling } } sleep(Duration::from_millis(50)).await; } panic!("OCR processing did not complete within {:?}", TIMEOUT); } async fn get_all_documents(&self) -> Vec { let response = self.client .get(&format!("{}/api/documents", get_base_url())) .header("Authorization", format!("Bearer {}", self.token)) .send() .await .expect("Documents endpoint should work"); if !response.status().is_success() { panic!("Failed to get documents: {}", response.status()); } let data: Value = response.json().await.expect("Valid JSON"); // Handle both paginated and non-paginated response formats match data { Value::Object(obj) if obj.contains_key("documents") => { obj["documents"].as_array().unwrap_or(&vec![]).clone() } Value::Array(arr) => arr, _ => vec![] } } } #[tokio::test] #[ignore = "Debug test - run manually when needed"] async fn debug_high_concurrency_pipeline() { println!("šŸš€ STARTING HIGH-CONCURRENCY PIPELINE DEBUG"); println!("============================================"); let debugger = PipelineDebugger::new().await; // Create 5 documents with unique, easily identifiable content let documents = vec![ ("DOC-ALPHA-001-UNIQUE-SIGNATURE-ALPHA", "debug_alpha.txt"), ("DOC-BRAVO-002-UNIQUE-SIGNATURE-BRAVO", "debug_bravo.txt"), ("DOC-CHARLIE-003-UNIQUE-SIGNATURE-CHARLIE", "debug_charlie.txt"), ("DOC-DELTA-004-UNIQUE-SIGNATURE-DELTA", "debug_delta.txt"), ("DOC-ECHO-005-UNIQUE-SIGNATURE-ECHO", "debug_echo.txt"), ]; println!("\nšŸ“ TEST DOCUMENTS:"); for (i, (content, filename)) in documents.iter().enumerate() { println!(" {}: {} -> {}", i+1, filename, content); } // Phase 1: Upload all documents simultaneously println!("\nšŸ PHASE 1: SIMULTANEOUS UPLOAD"); println!("================================"); let upload_start = Instant::now(); // Execute all uploads concurrently let uploaded_docs = futures::future::join_all( documents.iter().map(|(content, filename)| { debugger.upload_document_with_debug(content, filename) }).collect::>() ).await; let upload_duration = upload_start.elapsed(); println!("\nāœ… ALL UPLOADS COMPLETED in {:?}", upload_duration); // Phase 2: Trace OCR processing for each document println!("\nšŸ”¬ PHASE 2: OCR PROCESSING TRACE"); println!("================================"); let mut ocr_tasks = Vec::new(); for (i, doc) in uploaded_docs.iter().enumerate() { let doc_id = doc.id; let expected_content = documents[i].0.to_string(); let debugger_ref = &debugger; let task = async move { let result = debugger_ref.trace_ocr_processing(doc_id, &expected_content).await; (doc_id, expected_content, result) }; ocr_tasks.push(task); } // Process all OCR traces concurrently let ocr_results = futures::future::join_all(ocr_tasks).await; // Phase 3: Comprehensive analysis println!("\nšŸ“Š PHASE 3: COMPREHENSIVE ANALYSIS"); println!("==================================="); let mut corrupted_docs = Vec::new(); let mut successful_docs = Vec::new(); for (doc_id, expected_content, ocr_result) in ocr_results { let actual_text = ocr_result["ocr_text"].as_str().unwrap_or(""); let status = ocr_result["ocr_status"].as_str().unwrap_or("unknown"); println!("\nšŸ“„ Document Analysis: {}", doc_id); println!(" Status: {}", status); println!(" Expected: {}", expected_content); println!(" Actual: {}", actual_text); if status == "completed" { if actual_text == expected_content { println!(" āœ… CONTENT CORRECT"); successful_docs.push(doc_id); } else { println!(" āŒ CONTENT CORRUPTED"); corrupted_docs.push((doc_id, expected_content.clone(), actual_text.to_string())); // Check if it contains any other document's content for (other_expected, _) in &documents { if other_expected != &expected_content && actual_text.contains(other_expected) { println!(" šŸ”„ Contains content from: {}", other_expected); } } } } else { println!(" āš ļø NON-COMPLETED STATUS: {}", status); } } // Phase 4: System state analysis println!("\nšŸ—ļø PHASE 4: SYSTEM STATE ANALYSIS"); println!("==================================="); let all_docs = debugger.get_all_documents().await; println!("šŸ“‹ Total documents in system: {}", all_docs.len()); for doc in &all_docs { if let (Some(id), Some(filename), Some(status)) = ( doc["id"].as_str(), doc["filename"].as_str(), doc["ocr_status"].as_str() ) { println!(" šŸ“„ {}: {} -> {}", id, filename, status); } } // Final verdict println!("\nšŸ† FINAL VERDICT"); println!("================"); println!("āœ… Successful: {}", successful_docs.len()); println!("āŒ Corrupted: {}", corrupted_docs.len()); if corrupted_docs.is_empty() { println!("šŸŽ‰ NO CORRUPTION DETECTED!"); } else { println!("🚨 CORRUPTION DETECTED IN {} DOCUMENTS:", corrupted_docs.len()); for (doc_id, expected, actual) in &corrupted_docs { println!(" šŸ“„ {}: expected '{}' got '{}'", doc_id, expected, actual); } // Try to identify patterns if corrupted_docs.iter().all(|(_, _, actual)| actual.is_empty()) { println!("šŸ” PATTERN: All corrupted documents have EMPTY content"); } else if corrupted_docs.iter().all(|(_, _, actual)| actual == &corrupted_docs[0].2) { println!("šŸ” PATTERN: All corrupted documents have IDENTICAL content: '{}'", corrupted_docs[0].2); } else { println!("šŸ” PATTERN: Mixed corruption types detected"); } panic!("CORRUPTION DETECTED - see analysis above"); } } #[tokio::test] #[ignore = "Debug test - run manually when needed"] async fn debug_extreme_high_concurrency_pipeline() { println!("šŸš€ STARTING EXTREME HIGH-CONCURRENCY PIPELINE STRESS TEST"); println!("========================================================"); let debugger = PipelineDebugger::new().await; // Create 50+ documents with unique, easily identifiable content let mut documents = Vec::new(); for i in 1..=55 { let content = format!("STRESS-TEST-DOCUMENT-{:03}-UNIQUE-SIGNATURE-{:03}", i, i); let filename = format!("stress_test_{:03}.txt", i); documents.push((content, filename)); } println!("\nšŸ“ STRESS TEST SETUP:"); println!(" šŸ“Š Total Documents: {}", documents.len()); println!(" šŸ”„ Concurrent Processing: All {} documents simultaneously", documents.len()); println!(" šŸŽÆ Goal: Zero corruption across all documents"); // Phase 1: Upload all documents simultaneously println!("\nšŸ PHASE 1: SIMULTANEOUS UPLOAD"); println!("================================"); let upload_start = Instant::now(); // Execute all uploads concurrently let uploaded_docs = futures::future::join_all( documents.iter().map(|(content, filename)| { debugger.upload_document_with_debug(content, filename) }).collect::>() ).await; let upload_duration = upload_start.elapsed(); println!("\nāœ… ALL UPLOADS COMPLETED in {:?}", upload_duration); // Phase 2: Trace OCR processing for each document println!("\nšŸ”¬ PHASE 2: OCR PROCESSING TRACE"); println!("================================"); let mut ocr_tasks = Vec::new(); for (i, doc) in uploaded_docs.iter().enumerate() { let doc_id = doc.id; let expected_content = documents[i].0.to_string(); let debugger_ref = &debugger; let task = async move { let result = debugger_ref.trace_ocr_processing(doc_id, &expected_content).await; (doc_id, expected_content, result) }; ocr_tasks.push(task); } // Process all OCR traces concurrently let ocr_results = futures::future::join_all(ocr_tasks).await; // Phase 3: Comprehensive analysis println!("\nšŸ“Š PHASE 3: COMPREHENSIVE ANALYSIS"); println!("==================================="); let mut corrupted_docs = Vec::new(); let mut successful_docs = Vec::new(); for (doc_id, expected_content, ocr_result) in ocr_results { let actual_text = ocr_result["ocr_text"].as_str().unwrap_or(""); let status = ocr_result["ocr_status"].as_str().unwrap_or("unknown"); println!("\nšŸ“„ Document Analysis: {}", doc_id); println!(" Status: {}", status); println!(" Expected: {}", expected_content); println!(" Actual: {}", actual_text); if status == "completed" { if actual_text == expected_content { println!(" āœ… CONTENT CORRECT"); successful_docs.push(doc_id); } else { println!(" āŒ CONTENT CORRUPTED"); corrupted_docs.push((doc_id, expected_content.clone(), actual_text.to_string())); // Check if it contains any other document's content for (other_expected, _) in &documents { if other_expected != &expected_content && actual_text.contains(other_expected) { println!(" šŸ”„ Contains content from: {}", other_expected); } } } } else { println!(" āš ļø NON-COMPLETED STATUS: {}", status); } } // Phase 4: System state analysis println!("\nšŸ—ļø PHASE 4: SYSTEM STATE ANALYSIS"); println!("==================================="); let all_docs = debugger.get_all_documents().await; println!("šŸ“‹ Total documents in system: {}", all_docs.len()); for doc in &all_docs { if let (Some(id), Some(filename), Some(status)) = ( doc["id"].as_str(), doc["filename"].as_str(), doc["ocr_status"].as_str() ) { println!(" šŸ“„ {}: {} -> {}", id, filename, status); } } // Final verdict println!("\nšŸ† FINAL VERDICT"); println!("================"); println!("āœ… Successful: {}", successful_docs.len()); println!("āŒ Corrupted: {}", corrupted_docs.len()); if corrupted_docs.is_empty() { println!("šŸŽ‰ NO CORRUPTION DETECTED!"); } else { println!("🚨 CORRUPTION DETECTED IN {} DOCUMENTS:", corrupted_docs.len()); for (doc_id, expected, actual) in &corrupted_docs { println!(" šŸ“„ {}: expected '{}' got '{}'", doc_id, expected, actual); } // Try to identify patterns if corrupted_docs.iter().all(|(_, _, actual)| actual.is_empty()) { println!("šŸ” PATTERN: All corrupted documents have EMPTY content"); } else if corrupted_docs.iter().all(|(_, _, actual)| actual == &corrupted_docs[0].2) { println!("šŸ” PATTERN: All corrupted documents have IDENTICAL content: '{}'", corrupted_docs[0].2); } else { println!("šŸ” PATTERN: Mixed corruption types detected"); } panic!("CORRUPTION DETECTED - see analysis above"); } } #[tokio::test] #[ignore = "Debug test - run manually when needed"] async fn debug_document_upload_race_conditions() { println!("šŸ” DEBUGGING DOCUMENT UPLOAD PROCESS"); println!("===================================="); // First, let's do a basic connectivity test println!("šŸ” DEBUG: Testing basic network connectivity..."); let test_client = reqwest::Client::new(); let base_url = get_base_url(); println!("šŸ” DEBUG: Base URL from environment: {}", base_url); // Try a simple GET request first match test_client.get(&base_url).send().await { Ok(resp) => { println!("āœ… DEBUG: Basic connectivity test passed. Status: {}", resp.status()); } Err(e) => { println!("āŒ DEBUG: Basic connectivity test failed"); println!("āŒ DEBUG: Error: {:?}", e); panic!("Cannot connect to server at {}. Error: {}", base_url, e); } } let debugger = PipelineDebugger::new().await; // Upload same content with different filenames to test: // 1. Concurrent upload race condition handling (no 500 errors) // 2. Proper deduplication (identical content = same document ID) let same_content = "IDENTICAL-CONTENT-FOR-RACE-CONDITION-TEST"; let task1 = debugger.upload_document_with_debug(same_content, "race1.txt"); let task2 = debugger.upload_document_with_debug(same_content, "race2.txt"); let task3 = debugger.upload_document_with_debug(same_content, "race3.txt"); let (doc1, doc2, doc3) = futures::future::join3(task1, task2, task3).await; let docs = vec![doc1, doc2, doc3]; println!("\nšŸ“Š UPLOAD RACE CONDITION ANALYSIS:"); for (i, doc) in docs.iter().enumerate() { println!(" Doc {}: ID={}, Filename={}, Size={}", i+1, doc.id, doc.filename, doc.file_size); } // Check deduplication behavior: identical content should result in same document ID let mut ids: Vec<_> = docs.iter().map(|d| d.id).collect(); ids.sort(); ids.dedup(); if ids.len() == 1 { println!("āœ… Correct deduplication: All identical content maps to same document ID"); println!("āœ… Race condition handled properly: No 500 errors during concurrent uploads"); } else if ids.len() == docs.len() { println!("āŒ UNEXPECTED: All documents have unique IDs despite identical content"); panic!("Deduplication not working - identical content should map to same document"); } else { println!("āŒ PARTIAL DEDUPLICATION: Some duplicates detected but not all"); panic!("Inconsistent deduplication behavior"); } // Verify all documents have the same content hash (should be identical) let content_hashes: Vec<_> = docs.iter().map(|d| { // We can't directly access file_hash from DocumentResponse, but we can verify // they all have the same file size as a proxy for same content d.file_size }).collect(); if content_hashes.iter().all(|&size| size == content_hashes[0]) { println!("āœ… All documents have same file size (content verification)"); } else { println!("āŒ Documents have different file sizes - test setup error"); } }