Readur/tests/debug_pipeline_test.rs.disa...

/*!
* Debug OCR Pipeline Test - Trace every step to find corruption source
*/
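
// NOTE: Every test in this file is marked #[ignore] and is meant to be run
// manually against a live server. A typical invocation (adjust the test name
// and URL for your setup):
//
//   API_URL=http://localhost:8000 cargo test debug_high_concurrency_pipeline -- --ignored --nocapture
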
use reqwest::Client;
use serde_json::Value;
use std::time::{Duration, Instant};
use tokio::time::sleep;
use uuid::Uuid;
use futures;
use readur::models::{DocumentResponse, CreateUser, LoginRequest, LoginResponse};

fn get_base_url() -> String {
    std::env::var("API_URL").unwrap_or_else(|_| "http://localhost:8000".to_string())
}
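
/// Upper bound on how long OCR polling runs before a test is failed.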
const TIMEOUT: Duration = Duration::from_secs(120);
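
/// Authenticated API client shared by all the debug tests below; `new()`
/// checks server health, registers a throwaway user, and logs in.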
struct PipelineDebugger {
    client: Client,
    token: String,
}
impl PipelineDebugger {
    async fn new() -> Self {
        let client = Client::new();

        // Debug: Print the base URL we're trying to connect to
        let base_url = get_base_url();
        println!("🔍 DEBUG: Attempting to connect to server at: {}", base_url);

        // Check server health with better error handling
        println!("🔍 DEBUG: Checking server health at: {}/api/health", base_url);
        let health_check_result = client
            .get(&format!("{}/api/health", base_url))
            .timeout(Duration::from_secs(5))
            .send()
            .await;

        match health_check_result {
            Ok(response) => {
                println!("🔍 DEBUG: Health check response status: {}", response.status());
                if !response.status().is_success() {
                    let status = response.status();
                    let body = response.text().await.unwrap_or_else(|_| "Unable to read response body".to_string());
                    panic!("Server not healthy. Status: {}, Body: {}", status, body);
                }
                println!("✅ DEBUG: Server health check passed");
            }
            Err(e) => {
                println!("❌ DEBUG: Failed to connect to server health endpoint");
                println!("❌ DEBUG: Error type: {:?}", e);
                if e.is_timeout() {
                    panic!("Health check timed out after 5 seconds");
                } else if e.is_connect() {
                    panic!("Could not connect to server at {}. Is the server running?", base_url);
                } else {
                    panic!("Health check failed with error: {}", e);
                }
            }
        }

        // Create test user
        let timestamp = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap()
            .as_millis();
        let username = format!("pipeline_debug_{}", timestamp);
        let email = format!("pipeline_debug_{}@test.com", timestamp);

        // Register user
        let user_data = CreateUser {
            username: username.clone(),
            email: email.clone(),
            password: "testpass123".to_string(),
            role: Some(readur::models::UserRole::User),
        };
        let register_response = client
            .post(&format!("{}/api/auth/register", base_url))
            .json(&user_data)
            .send()
            .await
            .expect("Registration should work");
        if !register_response.status().is_success() {
            panic!("Registration failed: {}", register_response.text().await.unwrap_or_default());
        }

        // Login
        let login_data = LoginRequest {
            username: username.clone(),
            password: "testpass123".to_string(),
        };
        let login_response = client
            .post(&format!("{}/api/auth/login", base_url))
            .json(&login_data)
            .send()
            .await
            .expect("Login should work");
        if !login_response.status().is_success() {
            panic!("Login failed: {}", login_response.text().await.unwrap_or_default());
        }
        let login_result: LoginResponse = login_response.json().await.expect("Login should return JSON");
        let token = login_result.token;

        println!("✅ Pipeline debugger initialized for user: {}", username);
        Self { client, token }
    }
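
    /// Upload `content` as a multipart text/plain part named `filename` and
    /// return the created document, logging each step of the request.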
    async fn upload_document_with_debug(&self, content: &str, filename: &str) -> DocumentResponse {
        println!("\n📤 UPLOAD PHASE - Starting upload for: {}", filename);
        println!(" Content: {}", content);
        println!(" Content Length: {} bytes", content.len());

        let part = reqwest::multipart::Part::text(content.to_string())
            .file_name(filename.to_string())
            .mime_str("text/plain")
            .expect("Valid mime type");
        let form = reqwest::multipart::Form::new().part("file", part);

        let upload_start = Instant::now();
        let upload_url = format!("{}/api/documents", get_base_url());
        println!(" 🔍 DEBUG: Uploading to URL: {}", upload_url);
        println!(" 🔍 DEBUG: Using token (first 10 chars): {}...", &self.token[..10.min(self.token.len())]);

        let response_result = self.client
            .post(&upload_url)
            .header("Authorization", format!("Bearer {}", self.token))
            .multipart(form)
            .send()
            .await;

        let response = match response_result {
            Ok(resp) => {
                println!(" 🔍 DEBUG: Upload request sent successfully");
                resp
            }
            Err(e) => {
                println!(" ❌ DEBUG: Upload request failed");
                println!(" ❌ DEBUG: Error type: {:?}", e);
                if e.is_timeout() {
                    panic!("Upload request timed out");
                } else if e.is_connect() {
                    panic!("Could not connect to server for upload. Error: {}", e);
                } else if e.is_request() {
                    panic!("Request building failed: {}", e);
                } else {
                    panic!("Upload failed with network error: {}", e);
                }
            }
        };

        let upload_duration = upload_start.elapsed();
        println!(" 🔍 DEBUG: Upload response received. Status: {}", response.status());

        if !response.status().is_success() {
            let status = response.status();
            let headers = response.headers().clone();
            let body = response.text().await.unwrap_or_else(|_| "Unable to read response body".to_string());
            println!(" ❌ DEBUG: Upload failed with status: {}", status);
            println!(" ❌ DEBUG: Response headers: {:?}", headers);
            println!(" ❌ DEBUG: Response body: {}", body);
            panic!("Upload failed with status {}: {}", status, body);
        }

        let document: DocumentResponse = response.json().await.expect("Valid JSON");
        println!(" ✅ Upload completed in {:?}", upload_duration);
        println!(" 📄 Document ID: {}", document.id);
        println!(" 📂 Filename: {}", document.filename);
        println!(" 📏 File Size: {} bytes", document.file_size);
        println!(" 🏷️ MIME Type: {}", document.mime_type);
        println!(" 🔄 Initial OCR Status: {:?}", document.ocr_status);
        document
    }
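
    /// Poll the OCR status endpoint for `document_id` until it reports a
    /// terminal state ("completed" or "failed"), logging every status
    /// transition and comparing the final text against `expected_content`.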
    async fn trace_ocr_processing(&self, document_id: Uuid, expected_content: &str) -> Value {
        println!("\n🔍 OCR PROCESSING PHASE - Tracing for document: {}", document_id);

        let start = Instant::now();
        let mut last_status = String::new();
        let mut status_changes = Vec::new();
        let mut poll_count = 0;

        while start.elapsed() < TIMEOUT {
            poll_count += 1;
            let response = self.client
                .get(&format!("{}/api/documents/{}/ocr", get_base_url(), document_id))
                .header("Authorization", format!("Bearer {}", self.token))
                .send()
                .await
                .expect("OCR endpoint should work");

            if !response.status().is_success() {
                println!(" ❌ OCR endpoint error: {}", response.status());
                sleep(Duration::from_millis(100)).await;
                continue;
            }

            let ocr_data: Value = response.json().await.expect("Valid JSON");
            let current_status = ocr_data["ocr_status"].as_str().unwrap_or("unknown").to_string();
            // Track status changes (capture the flag before updating
            // `last_status` so the logging below can reuse it)
            let status_changed = current_status != last_status;
            if status_changed {
                let elapsed = start.elapsed();
                status_changes.push((elapsed, current_status.clone()));
                println!(" 📋 Status Change #{}: {} -> {} (after {:?})",
                    status_changes.len(), last_status, current_status, elapsed);
                last_status = current_status.clone();
            }

            // Detailed logging every 10 polls or on a status change
            if poll_count % 10 == 0 || status_changed {
                println!(" 🔄 Poll #{}: Status={}, HasText={}, TextLen={}",
                    poll_count,
                    current_status,
                    ocr_data["has_ocr_text"].as_bool().unwrap_or(false),
                    ocr_data["ocr_text"].as_str().unwrap_or("").len()
                );
                if let Some(confidence) = ocr_data["ocr_confidence"].as_f64() {
                    println!(" 📊 Confidence: {:.1}%", confidence);
                }
                if let Some(word_count) = ocr_data["ocr_word_count"].as_i64() {
                    println!(" 📝 Word Count: {}", word_count);
                }
                if let Some(error) = ocr_data["ocr_error"].as_str() {
                    println!(" ❌ Error: {}", error);
                }
            }
            // Check if processing is complete
            match current_status.as_str() {
                "completed" => {
                    println!(" ✅ OCR Processing completed after {:?} and {} polls", start.elapsed(), poll_count);

                    // Detailed final analysis
                    let ocr_text = ocr_data["ocr_text"].as_str().unwrap_or("");
                    println!("\n 🔬 FINAL CONTENT ANALYSIS:");
                    println!(" Expected: {}", expected_content);
                    println!(" Actual: {}", ocr_text);
                    println!(" Match: {}", ocr_text == expected_content);
                    println!(" Expected Length: {} chars", expected_content.len());
                    println!(" Actual Length: {} chars", ocr_text.len());

                    if ocr_text != expected_content {
                        println!(" ⚠️ CONTENT MISMATCH DETECTED!");
                        // Character-by-character comparison (zip stops at the
                        // shorter string, so this reports only the first
                        // differing position within the common prefix)
                        let expected_chars: Vec<char> = expected_content.chars().collect();
                        let actual_chars: Vec<char> = ocr_text.chars().collect();
                        for (i, (e, a)) in expected_chars.iter().zip(actual_chars.iter()).enumerate() {
                            if e != a {
                                println!(" Diff at position {}: expected '{}' got '{}'", i, e, a);
                                break;
                            }
                        }
                    }
                    return ocr_data;
                }
                "failed" => {
                    println!(" ❌ OCR Processing failed after {:?} and {} polls", start.elapsed(), poll_count);
                    return ocr_data;
                }
                _ => {
                    // Continue polling
                }
            }
            sleep(Duration::from_millis(50)).await;
        }
        panic!("OCR processing did not complete within {:?}", TIMEOUT);
    }
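
    /// Fetch every document visible to this user, tolerating both the
    /// paginated ({"documents": [...]}) and bare-array response shapes.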
    async fn get_all_documents(&self) -> Vec<Value> {
        let response = self.client
            .get(&format!("{}/api/documents", get_base_url()))
            .header("Authorization", format!("Bearer {}", self.token))
            .send()
            .await
            .expect("Documents endpoint should work");

        if !response.status().is_success() {
            panic!("Failed to get documents: {}", response.status());
        }

        let data: Value = response.json().await.expect("Valid JSON");
        // Handle both paginated and non-paginated response formats
        match data {
            Value::Object(obj) if obj.contains_key("documents") => {
                obj["documents"].as_array().cloned().unwrap_or_default()
            }
            Value::Array(arr) => arr,
            _ => vec![],
        }
    }
}
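
/// Upload five uniquely-tagged documents at once, trace each one's OCR run to
/// completion, and verify that no document ends up with another's content.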
#[tokio::test]
#[ignore = "Debug test - run manually when needed"]
async fn debug_high_concurrency_pipeline() {
    println!("🚀 STARTING HIGH-CONCURRENCY PIPELINE DEBUG");
    println!("============================================");

    let debugger = PipelineDebugger::new().await;

    // Create 5 documents with unique, easily identifiable content
    let documents = vec![
        ("DOC-ALPHA-001-UNIQUE-SIGNATURE-ALPHA", "debug_alpha.txt"),
        ("DOC-BRAVO-002-UNIQUE-SIGNATURE-BRAVO", "debug_bravo.txt"),
        ("DOC-CHARLIE-003-UNIQUE-SIGNATURE-CHARLIE", "debug_charlie.txt"),
        ("DOC-DELTA-004-UNIQUE-SIGNATURE-DELTA", "debug_delta.txt"),
        ("DOC-ECHO-005-UNIQUE-SIGNATURE-ECHO", "debug_echo.txt"),
    ];

    println!("\n📝 TEST DOCUMENTS:");
    for (i, (content, filename)) in documents.iter().enumerate() {
        println!(" {}: {} -> {}", i + 1, filename, content);
    }

    // Phase 1: Upload all documents simultaneously
    println!("\n🏁 PHASE 1: SIMULTANEOUS UPLOAD");
    println!("================================");
    let upload_start = Instant::now();

    // Execute all uploads concurrently
    let uploaded_docs = futures::future::join_all(
        documents.iter().map(|(content, filename)| {
            debugger.upload_document_with_debug(content, filename)
        }).collect::<Vec<_>>()
    ).await;

    let upload_duration = upload_start.elapsed();
    println!("\n✅ ALL UPLOADS COMPLETED in {:?}", upload_duration);

    // Phase 2: Trace OCR processing for each document
    println!("\n🔬 PHASE 2: OCR PROCESSING TRACE");
    println!("================================");
    let mut ocr_tasks = Vec::new();
    for (i, doc) in uploaded_docs.iter().enumerate() {
        let doc_id = doc.id;
        let expected_content = documents[i].0.to_string();
        let debugger_ref = &debugger;
        let task = async move {
            let result = debugger_ref.trace_ocr_processing(doc_id, &expected_content).await;
            (doc_id, expected_content, result)
        };
        ocr_tasks.push(task);
    }

    // Process all OCR traces concurrently
    let ocr_results = futures::future::join_all(ocr_tasks).await;

    // Phase 3: Comprehensive analysis
    println!("\n📊 PHASE 3: COMPREHENSIVE ANALYSIS");
    println!("===================================");
    let mut corrupted_docs = Vec::new();
    let mut successful_docs = Vec::new();

    for (doc_id, expected_content, ocr_result) in ocr_results {
        let actual_text = ocr_result["ocr_text"].as_str().unwrap_or("");
        let status = ocr_result["ocr_status"].as_str().unwrap_or("unknown");

        println!("\n📄 Document Analysis: {}", doc_id);
        println!(" Status: {}", status);
        println!(" Expected: {}", expected_content);
        println!(" Actual: {}", actual_text);

        if status == "completed" {
            if actual_text == expected_content {
                println!(" ✅ CONTENT CORRECT");
                successful_docs.push(doc_id);
            } else {
                println!(" ❌ CONTENT CORRUPTED");
                corrupted_docs.push((doc_id, expected_content.clone(), actual_text.to_string()));

                // Check if it contains any other document's content
                for (other_expected, _) in &documents {
                    if other_expected != &expected_content && actual_text.contains(other_expected) {
                        println!(" 🔄 Contains content from: {}", other_expected);
                    }
                }
            }
        } else {
            println!(" ⚠️ NON-COMPLETED STATUS: {}", status);
        }
    }

    // Phase 4: System state analysis
    println!("\n🏗 PHASE 4: SYSTEM STATE ANALYSIS");
    println!("===================================");
    let all_docs = debugger.get_all_documents().await;
    println!("📋 Total documents in system: {}", all_docs.len());
    for doc in &all_docs {
        if let (Some(id), Some(filename), Some(status)) = (
            doc["id"].as_str(),
            doc["filename"].as_str(),
            doc["ocr_status"].as_str(),
        ) {
            println!(" 📄 {}: {} -> {}", id, filename, status);
        }
    }

    // Final verdict
    println!("\n🏆 FINAL VERDICT");
    println!("================");
    println!("✅ Successful: {}", successful_docs.len());
    println!("❌ Corrupted: {}", corrupted_docs.len());

    if corrupted_docs.is_empty() {
        println!("🎉 NO CORRUPTION DETECTED!");
    } else {
        println!("🚨 CORRUPTION DETECTED IN {} DOCUMENTS:", corrupted_docs.len());
        for (doc_id, expected, actual) in &corrupted_docs {
            println!(" 📄 {}: expected '{}' got '{}'", doc_id, expected, actual);
        }

        // Try to identify patterns
        if corrupted_docs.iter().all(|(_, _, actual)| actual.is_empty()) {
            println!("🔍 PATTERN: All corrupted documents have EMPTY content");
        } else if corrupted_docs.iter().all(|(_, _, actual)| actual == &corrupted_docs[0].2) {
            println!("🔍 PATTERN: All corrupted documents have IDENTICAL content: '{}'", corrupted_docs[0].2);
        } else {
            println!("🔍 PATTERN: Mixed corruption types detected");
        }
        panic!("CORRUPTION DETECTED - see analysis above");
    }
}
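
/// The same pipeline trace as `debug_high_concurrency_pipeline`, scaled up to
/// 55 documents uploaded and OCR-traced simultaneously.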
#[tokio::test]
#[ignore = "Debug test - run manually when needed"]
async fn debug_extreme_high_concurrency_pipeline() {
    println!("🚀 STARTING EXTREME HIGH-CONCURRENCY PIPELINE STRESS TEST");
    println!("========================================================");

    let debugger = PipelineDebugger::new().await;

    // Create 50+ documents with unique, easily identifiable content
    let mut documents = Vec::new();
    for i in 1..=55 {
        let content = format!("STRESS-TEST-DOCUMENT-{:03}-UNIQUE-SIGNATURE-{:03}", i, i);
        let filename = format!("stress_test_{:03}.txt", i);
        documents.push((content, filename));
    }

    println!("\n📝 STRESS TEST SETUP:");
    println!(" 📊 Total Documents: {}", documents.len());
    println!(" 🔄 Concurrent Processing: All {} documents simultaneously", documents.len());
    println!(" 🎯 Goal: Zero corruption across all documents");

    // Phase 1: Upload all documents simultaneously
    println!("\n🏁 PHASE 1: SIMULTANEOUS UPLOAD");
    println!("================================");
    let upload_start = Instant::now();

    // Execute all uploads concurrently
    let uploaded_docs = futures::future::join_all(
        documents.iter().map(|(content, filename)| {
            debugger.upload_document_with_debug(content, filename)
        }).collect::<Vec<_>>()
    ).await;

    let upload_duration = upload_start.elapsed();
    println!("\n✅ ALL UPLOADS COMPLETED in {:?}", upload_duration);

    // Phase 2: Trace OCR processing for each document
    println!("\n🔬 PHASE 2: OCR PROCESSING TRACE");
    println!("================================");
    let mut ocr_tasks = Vec::new();
    for (i, doc) in uploaded_docs.iter().enumerate() {
        let doc_id = doc.id;
        let expected_content = documents[i].0.to_string();
        let debugger_ref = &debugger;
        let task = async move {
            let result = debugger_ref.trace_ocr_processing(doc_id, &expected_content).await;
            (doc_id, expected_content, result)
        };
        ocr_tasks.push(task);
    }

    // Process all OCR traces concurrently
    let ocr_results = futures::future::join_all(ocr_tasks).await;

    // Phase 3: Comprehensive analysis
    println!("\n📊 PHASE 3: COMPREHENSIVE ANALYSIS");
    println!("===================================");
    let mut corrupted_docs = Vec::new();
    let mut successful_docs = Vec::new();

    for (doc_id, expected_content, ocr_result) in ocr_results {
        let actual_text = ocr_result["ocr_text"].as_str().unwrap_or("");
        let status = ocr_result["ocr_status"].as_str().unwrap_or("unknown");

        println!("\n📄 Document Analysis: {}", doc_id);
        println!(" Status: {}", status);
        println!(" Expected: {}", expected_content);
        println!(" Actual: {}", actual_text);

        if status == "completed" {
            if actual_text == expected_content {
                println!(" ✅ CONTENT CORRECT");
                successful_docs.push(doc_id);
            } else {
                println!(" ❌ CONTENT CORRUPTED");
                corrupted_docs.push((doc_id, expected_content.clone(), actual_text.to_string()));

                // Check if it contains any other document's content
                for (other_expected, _) in &documents {
                    if other_expected != &expected_content && actual_text.contains(other_expected) {
                        println!(" 🔄 Contains content from: {}", other_expected);
                    }
                }
            }
        } else {
            println!(" ⚠️ NON-COMPLETED STATUS: {}", status);
        }
    }

    // Phase 4: System state analysis
    println!("\n🏗 PHASE 4: SYSTEM STATE ANALYSIS");
    println!("===================================");
    let all_docs = debugger.get_all_documents().await;
    println!("📋 Total documents in system: {}", all_docs.len());
    for doc in &all_docs {
        if let (Some(id), Some(filename), Some(status)) = (
            doc["id"].as_str(),
            doc["filename"].as_str(),
            doc["ocr_status"].as_str(),
        ) {
            println!(" 📄 {}: {} -> {}", id, filename, status);
        }
    }

    // Final verdict
    println!("\n🏆 FINAL VERDICT");
    println!("================");
    println!("✅ Successful: {}", successful_docs.len());
    println!("❌ Corrupted: {}", corrupted_docs.len());

    if corrupted_docs.is_empty() {
        println!("🎉 NO CORRUPTION DETECTED!");
    } else {
        println!("🚨 CORRUPTION DETECTED IN {} DOCUMENTS:", corrupted_docs.len());
        for (doc_id, expected, actual) in &corrupted_docs {
            println!(" 📄 {}: expected '{}' got '{}'", doc_id, expected, actual);
        }

        // Try to identify patterns
        if corrupted_docs.iter().all(|(_, _, actual)| actual.is_empty()) {
            println!("🔍 PATTERN: All corrupted documents have EMPTY content");
        } else if corrupted_docs.iter().all(|(_, _, actual)| actual == &corrupted_docs[0].2) {
            println!("🔍 PATTERN: All corrupted documents have IDENTICAL content: '{}'", corrupted_docs[0].2);
        } else {
            println!("🔍 PATTERN: Mixed corruption types detected");
        }
        panic!("CORRUPTION DETECTED - see analysis above");
    }
}
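
/// Upload identical content under three different filenames concurrently and
/// verify that deduplication maps all three uploads to a single document
/// without any server errors.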
#[tokio::test]
#[ignore = "Debug test - run manually when needed"]
async fn debug_document_upload_race_conditions() {
    println!("🔍 DEBUGGING DOCUMENT UPLOAD PROCESS");
    println!("====================================");

    // First, let's do a basic connectivity test
    println!("🔍 DEBUG: Testing basic network connectivity...");
    let test_client = reqwest::Client::new();
    let base_url = get_base_url();
    println!("🔍 DEBUG: Base URL from environment: {}", base_url);

    // Try a simple GET request first
    match test_client.get(&base_url).send().await {
        Ok(resp) => {
            println!("✅ DEBUG: Basic connectivity test passed. Status: {}", resp.status());
        }
        Err(e) => {
            println!("❌ DEBUG: Basic connectivity test failed");
            println!("❌ DEBUG: Error: {:?}", e);
            panic!("Cannot connect to server at {}. Error: {}", base_url, e);
        }
    }

    let debugger = PipelineDebugger::new().await;

    // Upload same content with different filenames to test:
    // 1. Concurrent upload race condition handling (no 500 errors)
    // 2. Proper deduplication (identical content = same document ID)
    let same_content = "IDENTICAL-CONTENT-FOR-RACE-CONDITION-TEST";
    let task1 = debugger.upload_document_with_debug(same_content, "race1.txt");
    let task2 = debugger.upload_document_with_debug(same_content, "race2.txt");
    let task3 = debugger.upload_document_with_debug(same_content, "race3.txt");
    let (doc1, doc2, doc3) = futures::future::join3(task1, task2, task3).await;
    let docs = vec![doc1, doc2, doc3];

    println!("\n📊 UPLOAD RACE CONDITION ANALYSIS:");
    for (i, doc) in docs.iter().enumerate() {
        println!(" Doc {}: ID={}, Filename={}, Size={}",
            i + 1, doc.id, doc.filename, doc.file_size);
    }

    // Check deduplication behavior: identical content should result in same document ID
    let mut ids: Vec<_> = docs.iter().map(|d| d.id).collect();
    ids.sort();
    ids.dedup();

    if ids.len() == 1 {
        println!("✅ Correct deduplication: All identical content maps to same document ID");
        println!("✅ Race condition handled properly: No 500 errors during concurrent uploads");
    } else if ids.len() == docs.len() {
        println!("❌ UNEXPECTED: All documents have unique IDs despite identical content");
        panic!("Deduplication not working - identical content should map to same document");
    } else {
        println!("❌ PARTIAL DEDUPLICATION: Some duplicates detected but not all");
        panic!("Inconsistent deduplication behavior");
    }

    // We can't access file_hash directly from DocumentResponse, so use the
    // file size as a rough proxy for identical content.
    let file_sizes: Vec<_> = docs.iter().map(|d| d.file_size).collect();
    if file_sizes.iter().all(|&size| size == file_sizes[0]) {
        println!("✅ All documents have same file size (content verification)");
    } else {
        println!("❌ Documents have different file sizes - test setup error");
    }
}