/*! * Investigate why high document volumes return empty OCR content */ use reqwest::Client; use serde_json::Value; use std::time::{Duration, Instant}; use tokio::time::sleep; use futures; use readur::models::{CreateUser, LoginRequest, LoginResponse}; use readur::routes::documents::types::DocumentUploadResponse; fn get_base_url() -> String { std::env::var("API_URL").unwrap_or_else(|_| "http://localhost:8000".to_string()) } struct Investigator { client: Client, token: String, } impl Investigator { async fn new() -> Self { let client = Client::new(); let timestamp = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .unwrap() .as_millis(); let username = format!("investigator_{}", timestamp); let email = format!("investigator_{}@test.com", timestamp); // Register and login let user_data = CreateUser { username: username.clone(), email: email.clone(), password: "testpass123".to_string(), role: Some(readur::models::UserRole::User), }; client.post(&format!("{}/api/auth/register", get_base_url())) .json(&user_data) .send() .await .expect("Registration should work"); let login_data = LoginRequest { username: username.clone(), password: "testpass123".to_string(), }; let login_response = client .post(&format!("{}/api/auth/login", get_base_url())) .json(&login_data) .send() .await .expect("Login should work"); let login_result: LoginResponse = login_response.json().await.expect("Login should return JSON"); let token = login_result.token; Self { client, token } } async fn upload_document(&self, content: &str, filename: &str) -> DocumentUploadResponse { let part = reqwest::multipart::Part::text(content.to_string()) .file_name(filename.to_string()) .mime_str("text/plain") .expect("Valid mime type"); let form = reqwest::multipart::Form::new().part("file", part); let response = self.client .post(&format!("{}/api/documents", get_base_url())) .header("Authorization", format!("Bearer {}", self.token)) .multipart(form) .send() .await .expect("Upload should work"); response.json().await.expect("Valid JSON") } async fn get_document_details(&self, doc_id: &str) -> Value { let response = self.client .get(&format!("{}/api/documents/{}/ocr", get_base_url(), doc_id)) .header("Authorization", format!("Bearer {}", self.token)) .send() .await .expect("Should get document details"); response.json().await.expect("Valid JSON") } async fn get_queue_stats(&self) -> Value { let response = self.client .get(&format!("{}/api/queue/stats", get_base_url())) .header("Authorization", format!("Bearer {}", self.token)) .send() .await; match response { Ok(resp) => resp.json().await.unwrap_or_else(|_| serde_json::json!({"error": "Failed to parse"})), Err(_) => serde_json::json!({"error": "Failed to get queue stats"}) } } } #[tokio::test] async fn investigate_empty_content_issue() { println!("šŸ” INVESTIGATING EMPTY CONTENT ISSUE"); println!("==================================="); let investigator = Investigator::new().await; // Test with different document counts to find the threshold let test_cases = vec![ ("Low concurrency", 3), ("Medium concurrency", 10), ("High concurrency", 20), ]; for (test_name, doc_count) in test_cases { println!("\nšŸ“Š TEST: {} ({} documents)", test_name, doc_count); println!("{}=", "=".repeat(50)); // Upload documents let mut documents = Vec::new(); for i in 1..=doc_count { let content = format!("TEST-{}-CONTENT-{:02}", test_name.replace(" ", "_").to_uppercase(), i); let filename = format!("test_{}_{:02}.txt", test_name.replace(" ", "_"), i); documents.push((content, filename)); } println!("šŸ“¤ Uploading {} documents...", doc_count); let upload_start = Instant::now(); let uploaded_docs = futures::future::join_all( documents.iter().map(|(content, filename)| { investigator.upload_document(content, filename) }).collect::>() ).await; let upload_time = upload_start.elapsed(); println!("āœ… Upload completed in {:?}", upload_time); // Check queue stats immediately after upload let queue_stats = investigator.get_queue_stats().await; println!("šŸ“Š Queue stats after upload: {}", serde_json::to_string_pretty(&queue_stats).unwrap_or_default()); // Wait for processing with detailed monitoring println!("šŸ”„ Monitoring OCR processing..."); let mut completed_count = 0; let process_start = Instant::now(); while completed_count < doc_count && process_start.elapsed() < Duration::from_secs(60) { sleep(Duration::from_secs(2)).await; let mut current_completed = 0; let mut sample_results = Vec::new(); for (i, doc) in uploaded_docs.iter().enumerate().take(3) { // Sample first 3 docs let details = investigator.get_document_details(&doc.id.to_string()).await; let status = details["ocr_status"].as_str().unwrap_or("unknown"); let ocr_text = details["ocr_text"].as_str().unwrap_or(""); let expected = &documents[i].0; if status == "completed" { current_completed += 1; } sample_results.push((doc.id.to_string(), status.to_string(), expected.clone(), ocr_text.to_string())); } // Estimate total completed (this is rough but gives us an idea) let estimated_total_completed = if current_completed > 0 { (current_completed as f64 / 3.0 * doc_count as f64) as usize } else { 0 }; if estimated_total_completed != completed_count { completed_count = estimated_total_completed; println!(" šŸ“ˆ Progress: ~{}/{} completed", completed_count, doc_count); // Show sample results for (doc_id, status, expected, actual) in sample_results { if status == "completed" { let is_correct = actual == expected; let result_icon = if is_correct { "āœ…" } else if actual.is_empty() { "āŒšŸ“„" } else { "āŒšŸ”„" }; println!(" {} {}: expected='{}' actual='{}'", result_icon, &doc_id[..8], expected, actual); } } } if estimated_total_completed >= doc_count { break; } } let process_time = process_start.elapsed(); println!("ā±ļø Processing time: {:?}", process_time); // Final analysis let mut success_count = 0; let mut empty_count = 0; let mut other_corruption = 0; for (i, doc) in uploaded_docs.iter().enumerate() { let details = investigator.get_document_details(&doc.id.to_string()).await; let status = details["ocr_status"].as_str().unwrap_or("unknown"); let ocr_text = details["ocr_text"].as_str().unwrap_or(""); let expected = &documents[i].0; if status == "completed" { if ocr_text == expected { success_count += 1; } else if ocr_text.is_empty() { empty_count += 1; } else { other_corruption += 1; } } } println!("\nšŸ“Š RESULTS for {} documents:", doc_count); println!(" āœ… Successful: {}", success_count); println!(" āŒ Empty content: {}", empty_count); println!(" šŸ”„ Other corruption: {}", other_corruption); println!(" šŸ“ˆ Success rate: {:.1}%", (success_count as f64 / doc_count as f64) * 100.0); // Get final queue stats let final_queue_stats = investigator.get_queue_stats().await; println!("šŸ“Š Final queue stats: {}", serde_json::to_string_pretty(&final_queue_stats).unwrap_or_default()); if empty_count > 0 { println!("āš ļø EMPTY CONTENT THRESHOLD FOUND AT {} DOCUMENTS", doc_count); } } }