Readur/tests/debug_pipeline_test.rs.disa...

/*!
* Debug OCR Pipeline Test - Trace every step to find corruption source
*/
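
// NOTE: Every test in this file is marked #[ignore] and is meant to be run
// manually against a live server. A typical invocation (adjust the test name
// and URL for your setup):
//
//   API_URL=http://localhost:8000 cargo test debug_high_concurrency_pipeline -- --ignored --nocapture
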
use reqwest::Client;
use serde_json::Value;
use std::time::{Duration, Instant};
use tokio::time::sleep;
use uuid::Uuid;
use futures;
use readur::models::{DocumentResponse, CreateUser, LoginRequest, LoginResponse};

fn get_base_url() -> String {
    std::env::var("API_URL").unwrap_or_else(|_| "http://localhost:8000".to_string())
}
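
/// Upper bound on how long OCR polling runs before a test is failed.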
const TIMEOUT: Duration = Duration::from_secs(120);
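
/// Authenticated API client shared by all the debug tests below; `new()`
/// checks server health, registers a throwaway user, and logs in.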
struct PipelineDebugger {
    client: Client,
    token: String,
}
impl PipelineDebugger {
    async fn new() -> Self {
        let client = Client::new();

        // Debug: Print the base URL we're trying to connect to
        let base_url = get_base_url();
        println!("🔍 DEBUG: Attempting to connect to server at: {}", base_url);

        // Check server health with better error handling
        println!("🔍 DEBUG: Checking server health at: {}/api/health", base_url);
        let health_check_result = client
            .get(&format!("{}/api/health", base_url))
            .timeout(Duration::from_secs(5))
            .send()
            .await;

        match health_check_result {
            Ok(response) => {
                println!("🔍 DEBUG: Health check response status: {}", response.status());
                if !response.status().is_success() {
                    let status = response.status();
                    let body = response.text().await.unwrap_or_else(|_| "Unable to read response body".to_string());
                    panic!("Server not healthy. Status: {}, Body: {}", status, body);
                }
                println!("✅ DEBUG: Server health check passed");
            }
            Err(e) => {
                println!("❌ DEBUG: Failed to connect to server health endpoint");
                println!("❌ DEBUG: Error type: {:?}", e);
                if e.is_timeout() {
                    panic!("Health check timed out after 5 seconds");
                } else if e.is_connect() {
                    panic!("Could not connect to server at {}. Is the server running?", base_url);
                } else {
                    panic!("Health check failed with error: {}", e);
                }
            }
        }

        // Create test user
        let timestamp = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap()
            .as_millis();
        let username = format!("pipeline_debug_{}", timestamp);
        let email = format!("pipeline_debug_{}@test.com", timestamp);

        // Register user
        let user_data = CreateUser {
            username: username.clone(),
            email: email.clone(),
            password: "testpass123".to_string(),
            role: Some(readur::models::UserRole::User),
        };
        let register_response = client
            .post(&format!("{}/api/auth/register", base_url))
            .json(&user_data)
            .send()
            .await
            .expect("Registration should work");
        if !register_response.status().is_success() {
            panic!("Registration failed: {}", register_response.text().await.unwrap_or_default());
        }

        // Login
        let login_data = LoginRequest {
            username: username.clone(),
            password: "testpass123".to_string(),
        };
        let login_response = client
            .post(&format!("{}/api/auth/login", base_url))
            .json(&login_data)
            .send()
            .await
            .expect("Login should work");
        if !login_response.status().is_success() {
            panic!("Login failed: {}", login_response.text().await.unwrap_or_default());
        }
        let login_result: LoginResponse = login_response.json().await.expect("Login should return JSON");
        let token = login_result.token;

        println!("✅ Pipeline debugger initialized for user: {}", username);
        Self { client, token }
    }
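
    /// Upload `content` as a multipart text/plain part named `filename` and
    /// return the created document, logging each step of the request.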
    async fn upload_document_with_debug(&self, content: &str, filename: &str) -> DocumentResponse {
        println!("\n📤 UPLOAD PHASE - Starting upload for: {}", filename);
        println!(" Content: {}", content);
        println!(" Content Length: {} bytes", content.len());

        let part = reqwest::multipart::Part::text(content.to_string())
            .file_name(filename.to_string())
            .mime_str("text/plain")
            .expect("Valid mime type");
        let form = reqwest::multipart::Form::new().part("file", part);

        let upload_start = Instant::now();
        let upload_url = format!("{}/api/documents", get_base_url());
        println!(" 🔍 DEBUG: Uploading to URL: {}", upload_url);
        println!(" 🔍 DEBUG: Using token (first 10 chars): {}...", &self.token[..10.min(self.token.len())]);

        let response_result = self.client
            .post(&upload_url)
            .header("Authorization", format!("Bearer {}", self.token))
            .multipart(form)
            .send()
            .await;

        let response = match response_result {
            Ok(resp) => {
                println!(" 🔍 DEBUG: Upload request sent successfully");
                resp
            }
            Err(e) => {
                println!(" ❌ DEBUG: Upload request failed");
                println!(" ❌ DEBUG: Error type: {:?}", e);
                if e.is_timeout() {
                    panic!("Upload request timed out");
                } else if e.is_connect() {
                    panic!("Could not connect to server for upload. Error: {}", e);
                } else if e.is_request() {
                    panic!("Request building failed: {}", e);
                } else {
                    panic!("Upload failed with network error: {}", e);
                }
            }
        };

        let upload_duration = upload_start.elapsed();
        println!(" 🔍 DEBUG: Upload response received. Status: {}", response.status());

        if !response.status().is_success() {
            let status = response.status();
            let headers = response.headers().clone();
            let body = response.text().await.unwrap_or_else(|_| "Unable to read response body".to_string());
            println!(" ❌ DEBUG: Upload failed with status: {}", status);
            println!(" ❌ DEBUG: Response headers: {:?}", headers);
            println!(" ❌ DEBUG: Response body: {}", body);
            panic!("Upload failed with status {}: {}", status, body);
        }

        let document: DocumentResponse = response.json().await.expect("Valid JSON");
        println!(" ✅ Upload completed in {:?}", upload_duration);
        println!(" 📄 Document ID: {}", document.id);
        println!(" 📂 Filename: {}", document.filename);
        println!(" 📏 File Size: {} bytes", document.file_size);
        println!(" 🏷️ MIME Type: {}", document.mime_type);
        println!(" 🔄 Initial OCR Status: {:?}", document.ocr_status);
        document
    }
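
    /// Poll the OCR status endpoint for `document_id` until it reports a
    /// terminal state ("completed" or "failed"), logging every status
    /// transition and comparing the final text against `expected_content`.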
    async fn trace_ocr_processing(&self, document_id: Uuid, expected_content: &str) -> Value {
        println!("\n🔍 OCR PROCESSING PHASE - Tracing for document: {}", document_id);

        let start = Instant::now();
        let mut last_status = String::new();
        let mut status_changes = Vec::new();
        let mut poll_count = 0;

        while start.elapsed() < TIMEOUT {
            poll_count += 1;
            let response = self.client
                .get(&format!("{}/api/documents/{}/ocr", get_base_url(), document_id))
                .header("Authorization", format!("Bearer {}", self.token))
                .send()
                .await
                .expect("OCR endpoint should work");

            if !response.status().is_success() {
                println!(" ❌ OCR endpoint error: {}", response.status());
                sleep(Duration::from_millis(100)).await;
                continue;
            }

            let ocr_data: Value = response.json().await.expect("Valid JSON");
            let current_status = ocr_data["ocr_status"].as_str().unwrap_or("unknown").to_string();
            // Track status changes (capture the flag before updating
            // `last_status` so the logging below can reuse it)
            let status_changed = current_status != last_status;
            if status_changed {
                let elapsed = start.elapsed();
                status_changes.push((elapsed, current_status.clone()));
                println!(" 📋 Status Change #{}: {} -> {} (after {:?})",
                    status_changes.len(), last_status, current_status, elapsed);
                last_status = current_status.clone();
            }

            // Detailed logging every 10 polls or on a status change
            if poll_count % 10 == 0 || status_changed {
                println!(" 🔄 Poll #{}: Status={}, HasText={}, TextLen={}",
                    poll_count,
                    current_status,
                    ocr_data["has_ocr_text"].as_bool().unwrap_or(false),
                    ocr_data["ocr_text"].as_str().unwrap_or("").len()
                );
                if let Some(confidence) = ocr_data["ocr_confidence"].as_f64() {
                    println!(" 📊 Confidence: {:.1}%", confidence);
                }
                if let Some(word_count) = ocr_data["ocr_word_count"].as_i64() {
                    println!(" 📝 Word Count: {}", word_count);
                }
                if let Some(error) = ocr_data["ocr_error"].as_str() {
                    println!(" ❌ Error: {}", error);
                }
            }
            // Check if processing is complete
            match current_status.as_str() {
                "completed" => {
                    println!(" ✅ OCR Processing completed after {:?} and {} polls", start.elapsed(), poll_count);

                    // Detailed final analysis
                    let ocr_text = ocr_data["ocr_text"].as_str().unwrap_or("");
                    println!("\n 🔬 FINAL CONTENT ANALYSIS:");
                    println!(" Expected: {}", expected_content);
                    println!(" Actual: {}", ocr_text);
                    println!(" Match: {}", ocr_text == expected_content);
                    println!(" Expected Length: {} chars", expected_content.len());
                    println!(" Actual Length: {} chars", ocr_text.len());

                    if ocr_text != expected_content {
                        println!(" ⚠️ CONTENT MISMATCH DETECTED!");
                        // Character-by-character comparison (zip stops at the
                        // shorter string, so this reports only the first
                        // differing position within the common prefix)
                        let expected_chars: Vec<char> = expected_content.chars().collect();
                        let actual_chars: Vec<char> = ocr_text.chars().collect();
                        for (i, (e, a)) in expected_chars.iter().zip(actual_chars.iter()).enumerate() {
                            if e != a {
                                println!(" Diff at position {}: expected '{}' got '{}'", i, e, a);
                                break;
                            }
                        }
                    }
                    return ocr_data;
                }
                "failed" => {
                    println!(" ❌ OCR Processing failed after {:?} and {} polls", start.elapsed(), poll_count);
                    return ocr_data;
                }
                _ => {
                    // Continue polling
                }
            }
            sleep(Duration::from_millis(50)).await;
        }
        panic!("OCR processing did not complete within {:?}", TIMEOUT);
    }
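
    /// Fetch every document visible to this user, tolerating both the
    /// paginated ({"documents": [...]}) and bare-array response shapes.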
    async fn get_all_documents(&self) -> Vec<Value> {
        let response = self.client
            .get(&format!("{}/api/documents", get_base_url()))
            .header("Authorization", format!("Bearer {}", self.token))
            .send()
            .await
            .expect("Documents endpoint should work");

        if !response.status().is_success() {
            panic!("Failed to get documents: {}", response.status());
        }

        let data: Value = response.json().await.expect("Valid JSON");
        // Handle both paginated and non-paginated response formats
        match data {
            Value::Object(obj) if obj.contains_key("documents") => {
                obj["documents"].as_array().cloned().unwrap_or_default()
            }
            Value::Array(arr) => arr,
            _ => vec![],
        }
    }
}
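
/// Upload five uniquely-tagged documents at once, trace each one's OCR run to
/// completion, and verify that no document ends up with another's content.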
#[tokio::test]
#[ignore = "Debug test - run manually when needed"]
async fn debug_high_concurrency_pipeline() {
    println!("🚀 STARTING HIGH-CONCURRENCY PIPELINE DEBUG");
    println!("============================================");

    let debugger = PipelineDebugger::new().await;

    // Create 5 documents with unique, easily identifiable content
    let documents = vec![
        ("DOC-ALPHA-001-UNIQUE-SIGNATURE-ALPHA", "debug_alpha.txt"),
        ("DOC-BRAVO-002-UNIQUE-SIGNATURE-BRAVO", "debug_bravo.txt"),
        ("DOC-CHARLIE-003-UNIQUE-SIGNATURE-CHARLIE", "debug_charlie.txt"),
        ("DOC-DELTA-004-UNIQUE-SIGNATURE-DELTA", "debug_delta.txt"),
        ("DOC-ECHO-005-UNIQUE-SIGNATURE-ECHO", "debug_echo.txt"),
    ];

    println!("\n📝 TEST DOCUMENTS:");
    for (i, (content, filename)) in documents.iter().enumerate() {
        println!(" {}: {} -> {}", i + 1, filename, content);
    }

    // Phase 1: Upload all documents simultaneously
    println!("\n🏁 PHASE 1: SIMULTANEOUS UPLOAD");
    println!("================================");
    let upload_start = Instant::now();

    // Execute all uploads concurrently
    let uploaded_docs = futures::future::join_all(
        documents.iter().map(|(content, filename)| {
            debugger.upload_document_with_debug(content, filename)
        }).collect::<Vec<_>>()
    ).await;

    let upload_duration = upload_start.elapsed();
    println!("\n✅ ALL UPLOADS COMPLETED in {:?}", upload_duration);

    // Phase 2: Trace OCR processing for each document
    println!("\n🔬 PHASE 2: OCR PROCESSING TRACE");
    println!("================================");
    let mut ocr_tasks = Vec::new();
    for (i, doc) in uploaded_docs.iter().enumerate() {
        let doc_id = doc.id;
        let expected_content = documents[i].0.to_string();
        let debugger_ref = &debugger;
        let task = async move {
            let result = debugger_ref.trace_ocr_processing(doc_id, &expected_content).await;
            (doc_id, expected_content, result)
        };
        ocr_tasks.push(task);
    }

    // Process all OCR traces concurrently
    let ocr_results = futures::future::join_all(ocr_tasks).await;

    // Phase 3: Comprehensive analysis
    println!("\n📊 PHASE 3: COMPREHENSIVE ANALYSIS");
    println!("===================================");
    let mut corrupted_docs = Vec::new();
    let mut successful_docs = Vec::new();

    for (doc_id, expected_content, ocr_result) in ocr_results {
        let actual_text = ocr_result["ocr_text"].as_str().unwrap_or("");
        let status = ocr_result["ocr_status"].as_str().unwrap_or("unknown");

        println!("\n📄 Document Analysis: {}", doc_id);
        println!(" Status: {}", status);
        println!(" Expected: {}", expected_content);
        println!(" Actual: {}", actual_text);

        if status == "completed" {
            if actual_text == expected_content {
                println!(" ✅ CONTENT CORRECT");
                successful_docs.push(doc_id);
            } else {
                println!(" ❌ CONTENT CORRUPTED");
                corrupted_docs.push((doc_id, expected_content.clone(), actual_text.to_string()));

                // Check if it contains any other document's content
                for (other_expected, _) in &documents {
                    if other_expected != &expected_content && actual_text.contains(other_expected) {
                        println!(" 🔄 Contains content from: {}", other_expected);
                    }
                }
            }
        } else {
            println!(" ⚠️ NON-COMPLETED STATUS: {}", status);
        }
    }

    // Phase 4: System state analysis
    println!("\n🏗 PHASE 4: SYSTEM STATE ANALYSIS");
    println!("===================================");
    let all_docs = debugger.get_all_documents().await;
    println!("📋 Total documents in system: {}", all_docs.len());
    for doc in &all_docs {
        if let (Some(id), Some(filename), Some(status)) = (
            doc["id"].as_str(),
            doc["filename"].as_str(),
            doc["ocr_status"].as_str(),
        ) {
            println!(" 📄 {}: {} -> {}", id, filename, status);
        }
    }

    // Final verdict
    println!("\n🏆 FINAL VERDICT");
    println!("================");
    println!("✅ Successful: {}", successful_docs.len());
    println!("❌ Corrupted: {}", corrupted_docs.len());

    if corrupted_docs.is_empty() {
        println!("🎉 NO CORRUPTION DETECTED!");
    } else {
        println!("🚨 CORRUPTION DETECTED IN {} DOCUMENTS:", corrupted_docs.len());
        for (doc_id, expected, actual) in &corrupted_docs {
            println!(" 📄 {}: expected '{}' got '{}'", doc_id, expected, actual);
        }

        // Try to identify patterns
        if corrupted_docs.iter().all(|(_, _, actual)| actual.is_empty()) {
            println!("🔍 PATTERN: All corrupted documents have EMPTY content");
        } else if corrupted_docs.iter().all(|(_, _, actual)| actual == &corrupted_docs[0].2) {
            println!("🔍 PATTERN: All corrupted documents have IDENTICAL content: '{}'", corrupted_docs[0].2);
        } else {
            println!("🔍 PATTERN: Mixed corruption types detected");
        }
        panic!("CORRUPTION DETECTED - see analysis above");
    }
}
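
/// The same pipeline trace as `debug_high_concurrency_pipeline`, scaled up to
/// 55 documents uploaded and OCR-traced simultaneously.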
#[tokio::test]
#[ignore = "Debug test - run manually when needed"]
async fn debug_extreme_high_concurrency_pipeline() {
    println!("🚀 STARTING EXTREME HIGH-CONCURRENCY PIPELINE STRESS TEST");
    println!("========================================================");

    let debugger = PipelineDebugger::new().await;

    // Create 50+ documents with unique, easily identifiable content
    let mut documents = Vec::new();
    for i in 1..=55 {
        let content = format!("STRESS-TEST-DOCUMENT-{:03}-UNIQUE-SIGNATURE-{:03}", i, i);
        let filename = format!("stress_test_{:03}.txt", i);
        documents.push((content, filename));
    }

    println!("\n📝 STRESS TEST SETUP:");
    println!(" 📊 Total Documents: {}", documents.len());
    println!(" 🔄 Concurrent Processing: All {} documents simultaneously", documents.len());
    println!(" 🎯 Goal: Zero corruption across all documents");

    // Phase 1: Upload all documents simultaneously
    println!("\n🏁 PHASE 1: SIMULTANEOUS UPLOAD");
    println!("================================");
    let upload_start = Instant::now();

    // Execute all uploads concurrently
    let uploaded_docs = futures::future::join_all(
        documents.iter().map(|(content, filename)| {
            debugger.upload_document_with_debug(content, filename)
        }).collect::<Vec<_>>()
    ).await;

    let upload_duration = upload_start.elapsed();
    println!("\n✅ ALL UPLOADS COMPLETED in {:?}", upload_duration);

    // Phase 2: Trace OCR processing for each document
    println!("\n🔬 PHASE 2: OCR PROCESSING TRACE");
    println!("================================");
    let mut ocr_tasks = Vec::new();
    for (i, doc) in uploaded_docs.iter().enumerate() {
        let doc_id = doc.id;
        let expected_content = documents[i].0.to_string();
        let debugger_ref = &debugger;
        let task = async move {
            let result = debugger_ref.trace_ocr_processing(doc_id, &expected_content).await;
            (doc_id, expected_content, result)
        };
        ocr_tasks.push(task);
    }

    // Process all OCR traces concurrently
    let ocr_results = futures::future::join_all(ocr_tasks).await;

    // Phase 3: Comprehensive analysis
    println!("\n📊 PHASE 3: COMPREHENSIVE ANALYSIS");
    println!("===================================");
    let mut corrupted_docs = Vec::new();
    let mut successful_docs = Vec::new();

    for (doc_id, expected_content, ocr_result) in ocr_results {
        let actual_text = ocr_result["ocr_text"].as_str().unwrap_or("");
        let status = ocr_result["ocr_status"].as_str().unwrap_or("unknown");

        println!("\n📄 Document Analysis: {}", doc_id);
        println!(" Status: {}", status);
        println!(" Expected: {}", expected_content);
        println!(" Actual: {}", actual_text);

        if status == "completed" {
            if actual_text == expected_content {
                println!(" ✅ CONTENT CORRECT");
                successful_docs.push(doc_id);
            } else {
                println!(" ❌ CONTENT CORRUPTED");
                corrupted_docs.push((doc_id, expected_content.clone(), actual_text.to_string()));

                // Check if it contains any other document's content
                for (other_expected, _) in &documents {
                    if other_expected != &expected_content && actual_text.contains(other_expected) {
                        println!(" 🔄 Contains content from: {}", other_expected);
                    }
                }
            }
        } else {
            println!(" ⚠️ NON-COMPLETED STATUS: {}", status);
        }
    }

    // Phase 4: System state analysis
    println!("\n🏗 PHASE 4: SYSTEM STATE ANALYSIS");
    println!("===================================");
    let all_docs = debugger.get_all_documents().await;
    println!("📋 Total documents in system: {}", all_docs.len());
    for doc in &all_docs {
        if let (Some(id), Some(filename), Some(status)) = (
            doc["id"].as_str(),
            doc["filename"].as_str(),
            doc["ocr_status"].as_str(),
        ) {
            println!(" 📄 {}: {} -> {}", id, filename, status);
        }
    }

    // Final verdict
    println!("\n🏆 FINAL VERDICT");
    println!("================");
    println!("✅ Successful: {}", successful_docs.len());
    println!("❌ Corrupted: {}", corrupted_docs.len());

    if corrupted_docs.is_empty() {
        println!("🎉 NO CORRUPTION DETECTED!");
    } else {
        println!("🚨 CORRUPTION DETECTED IN {} DOCUMENTS:", corrupted_docs.len());
        for (doc_id, expected, actual) in &corrupted_docs {
            println!(" 📄 {}: expected '{}' got '{}'", doc_id, expected, actual);
        }

        // Try to identify patterns
        if corrupted_docs.iter().all(|(_, _, actual)| actual.is_empty()) {
            println!("🔍 PATTERN: All corrupted documents have EMPTY content");
        } else if corrupted_docs.iter().all(|(_, _, actual)| actual == &corrupted_docs[0].2) {
            println!("🔍 PATTERN: All corrupted documents have IDENTICAL content: '{}'", corrupted_docs[0].2);
        } else {
            println!("🔍 PATTERN: Mixed corruption types detected");
        }
        panic!("CORRUPTION DETECTED - see analysis above");
    }
}
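
/// Upload identical content under three different filenames concurrently and
/// verify that deduplication maps all three uploads to a single document
/// without any server errors.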
#[tokio::test]
#[ignore = "Debug test - run manually when needed"]
async fn debug_document_upload_race_conditions() {
    println!("🔍 DEBUGGING DOCUMENT UPLOAD PROCESS");
    println!("====================================");

    // First, let's do a basic connectivity test
    println!("🔍 DEBUG: Testing basic network connectivity...");
    let test_client = reqwest::Client::new();
    let base_url = get_base_url();
    println!("🔍 DEBUG: Base URL from environment: {}", base_url);

    // Try a simple GET request first
    match test_client.get(&base_url).send().await {
        Ok(resp) => {
            println!("✅ DEBUG: Basic connectivity test passed. Status: {}", resp.status());
        }
        Err(e) => {
            println!("❌ DEBUG: Basic connectivity test failed");
            println!("❌ DEBUG: Error: {:?}", e);
            panic!("Cannot connect to server at {}. Error: {}", base_url, e);
        }
    }

    let debugger = PipelineDebugger::new().await;

    // Upload same content with different filenames to test:
    // 1. Concurrent upload race condition handling (no 500 errors)
    // 2. Proper deduplication (identical content = same document ID)
    let same_content = "IDENTICAL-CONTENT-FOR-RACE-CONDITION-TEST";
    let task1 = debugger.upload_document_with_debug(same_content, "race1.txt");
    let task2 = debugger.upload_document_with_debug(same_content, "race2.txt");
    let task3 = debugger.upload_document_with_debug(same_content, "race3.txt");
    let (doc1, doc2, doc3) = futures::future::join3(task1, task2, task3).await;
    let docs = vec![doc1, doc2, doc3];

    println!("\n📊 UPLOAD RACE CONDITION ANALYSIS:");
    for (i, doc) in docs.iter().enumerate() {
        println!(" Doc {}: ID={}, Filename={}, Size={}",
            i + 1, doc.id, doc.filename, doc.file_size);
    }

    // Check deduplication behavior: identical content should result in same document ID
    let mut ids: Vec<_> = docs.iter().map(|d| d.id).collect();
    ids.sort();
    ids.dedup();

    if ids.len() == 1 {
        println!("✅ Correct deduplication: All identical content maps to same document ID");
        println!("✅ Race condition handled properly: No 500 errors during concurrent uploads");
    } else if ids.len() == docs.len() {
        println!("❌ UNEXPECTED: All documents have unique IDs despite identical content");
        panic!("Deduplication not working - identical content should map to same document");
    } else {
        println!("❌ PARTIAL DEDUPLICATION: Some duplicates detected but not all");
        panic!("Inconsistent deduplication behavior");
    }

    // We can't access file_hash directly from DocumentResponse, so use the
    // file size as a rough proxy for identical content.
    let file_sizes: Vec<_> = docs.iter().map(|d| d.file_size).collect();
    if file_sizes.iter().all(|&size| size == file_sizes[0]) {
        println!("✅ All documents have same file size (content verification)");
    } else {
        println!("❌ Documents have different file sizes - test setup error");
    }
}