Readur/tests/investigate_empty_content.rs

240 lines
9.0 KiB
Rust

/*!
* Investigate why high document volumes return empty OCR content
*/
use reqwest::Client;
use serde_json::Value;
use std::time::{Duration, Instant};
use tokio::time::sleep;
use uuid::Uuid;
use futures;
use readur::models::{DocumentResponse, CreateUser, LoginRequest, LoginResponse};
/// Returns the base URL of the API under test.
///
/// The value of the `API_URL` environment variable is used when it can be
/// read; otherwise the function falls back to `http://localhost:8000`.
fn get_base_url() -> String {
    match std::env::var("API_URL") {
        Ok(configured) => configured,
        Err(_) => String::from("http://localhost:8000"),
    }
}
/// Authenticated API client used by the investigation test in this file.
///
/// Built by `Investigator::new`, which registers a throwaway user and logs in.
struct Investigator {
/// HTTP client that every request of this helper is sent through.
client: Client,
/// Token taken from the login response; sent as `Authorization: Bearer <token>`.
token: String,
}
impl Investigator {
async fn new() -> Self {
let client = Client::new();
let timestamp = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap()
.as_millis();
let username = format!("investigator_{}", timestamp);
let email = format!("investigator_{}@test.com", timestamp);
// Register and login
let user_data = CreateUser {
username: username.clone(),
email: email.clone(),
password: "testpass123".to_string(),
role: Some(readur::models::UserRole::User),
};
client.post(&format!("{}/api/auth/register", get_base_url()))
.json(&user_data)
.send()
.await
.expect("Registration should work");
let login_data = LoginRequest {
username: username.clone(),
password: "testpass123".to_string(),
};
let login_response = client
.post(&format!("{}/api/auth/login", get_base_url()))
.json(&login_data)
.send()
.await
.expect("Login should work");
let login_result: LoginResponse = login_response.json().await.expect("Login should return JSON");
let token = login_result.token;
Self { client, token }
}
async fn upload_document(&self, content: &str, filename: &str) -> DocumentResponse {
let part = reqwest::multipart::Part::text(content.to_string())
.file_name(filename.to_string())
.mime_str("text/plain")
.expect("Valid mime type");
let form = reqwest::multipart::Form::new().part("file", part);
let response = self.client
.post(&format!("{}/api/documents", get_base_url()))
.header("Authorization", format!("Bearer {}", self.token))
.multipart(form)
.send()
.await
.expect("Upload should work");
response.json().await.expect("Valid JSON")
}
async fn get_document_details(&self, doc_id: &str) -> Value {
let response = self.client
.get(&format!("{}/api/documents/{}/ocr", get_base_url(), doc_id))
.header("Authorization", format!("Bearer {}", self.token))
.send()
.await
.expect("Should get document details");
response.json().await.expect("Valid JSON")
}
async fn get_queue_stats(&self) -> Value {
let response = self.client
.get(&format!("{}/api/queue/stats", get_base_url()))
.header("Authorization", format!("Bearer {}", self.token))
.send()
.await;
match response {
Ok(resp) => resp.json().await.unwrap_or_else(|_| serde_json::json!({"error": "Failed to parse"})),
Err(_) => serde_json::json!({"error": "Failed to get queue stats"})
}
}
}
/// Uploads batches of 3, 10 and 20 text documents and reports, per batch, how
/// many come back from OCR with the expected text, with empty text, or with
/// different text.
///
/// This is a diagnostic test: it prints its findings and makes no assertions
/// about the OCR results. It needs a running API reachable at the URL returned
/// by `get_base_url`.
///
/// Each batch is given up to 60 seconds to finish, polling every 2 seconds.
/// Every uploaded document is polled, so the wait only ends early once all of
/// them report `completed`; documents that never reach that status are counted
/// separately in the final report.
#[tokio::test]
async fn investigate_empty_content_issue() {
    // Upper bound on how long one batch may take before the analysis runs anyway.
    const PROCESSING_TIMEOUT: Duration = Duration::from_secs(60);
    // Pause between two polling rounds.
    const POLL_INTERVAL: Duration = Duration::from_secs(2);
    // Number of leading documents whose OCR text is echoed on a progress update.
    const SAMPLE_SIZE: usize = 3;

    println!("🔍 INVESTIGATING EMPTY CONTENT ISSUE");
    println!("===================================");

    let investigator = Investigator::new().await;

    // Test with different document counts to find the threshold
    let test_cases = vec![
        ("Low concurrency", 3),
        ("Medium concurrency", 10),
        ("High concurrency", 20),
    ];

    for (test_name, doc_count) in test_cases {
        println!("\n📊 TEST: {} ({} documents)", test_name, doc_count);
        println!("{}=", "=".repeat(50));

        // Each document carries unique, predictable content so that the OCR
        // result can be compared against it verbatim.
        let slug = test_name.replace(" ", "_");
        let tag = slug.to_uppercase();
        let documents: Vec<(String, String)> = (1..=doc_count)
            .map(|i| {
                (
                    format!("TEST-{}-CONTENT-{:02}", tag, i),
                    format!("test_{}_{:02}.txt", slug, i),
                )
            })
            .collect();

        println!("📤 Uploading {} documents...", doc_count);
        let upload_start = Instant::now();
        // All uploads of a batch are driven concurrently; `join_all` returns
        // the responses in the same order as `documents`.
        let uploaded_docs = futures::future::join_all(
            documents
                .iter()
                .map(|(content, filename)| investigator.upload_document(content, filename)),
        )
        .await;
        let upload_time = upload_start.elapsed();
        println!("✅ Upload completed in {:?}", upload_time);

        // Check queue stats immediately after upload
        let queue_stats = investigator.get_queue_stats().await;
        println!(
            "📊 Queue stats after upload: {}",
            serde_json::to_string_pretty(&queue_stats).unwrap_or_default()
        );

        // Wait for processing with detailed monitoring
        println!("🔄 Monitoring OCR processing...");
        let mut completed_count: usize = 0;
        let process_start = Instant::now();
        while completed_count < doc_count && process_start.elapsed() < PROCESSING_TIMEOUT {
            sleep(POLL_INTERVAL).await;

            // Count what has really completed instead of extrapolating from
            // the first few documents; an extrapolation ends the wait as soon
            // as the sampled documents finish, even if the rest are pending.
            let mut current_completed: usize = 0;
            let mut completed_samples = Vec::with_capacity(SAMPLE_SIZE);
            for (index, (doc, (expected, _))) in
                uploaded_docs.iter().zip(&documents).enumerate()
            {
                let doc_id = doc.id.to_string();
                let details = investigator.get_document_details(&doc_id).await;
                let status = details["ocr_status"].as_str().unwrap_or("unknown");
                if status != "completed" {
                    continue;
                }
                current_completed += 1;
                if index < SAMPLE_SIZE {
                    let ocr_text = details["ocr_text"].as_str().unwrap_or("").to_string();
                    completed_samples.push((doc_id, expected, ocr_text));
                }
            }

            if current_completed != completed_count {
                completed_count = current_completed;
                println!(" 📈 Progress: {}/{} completed", completed_count, doc_count);

                // Show sample results
                for (doc_id, expected, actual) in completed_samples {
                    let is_correct = actual.as_str() == expected.as_str();
                    let result_icon = if is_correct {
                        ""
                    } else if actual.is_empty() {
                        "❌📄"
                    } else {
                        "❌🔄"
                    };
                    // Checked slicing: show the whole id if it is shorter than 8 bytes.
                    let short_id = doc_id.get(..8).unwrap_or(doc_id.as_str());
                    println!(
                        " {} {}: expected='{}' actual='{}'",
                        result_icon, short_id, expected, actual
                    );
                }
            }
        }
        let process_time = process_start.elapsed();
        println!("⏱️ Processing time: {:?}", process_time);

        // Final analysis
        let mut success_count = 0;
        let mut empty_count = 0;
        let mut other_corruption = 0;
        let mut unfinished_count = 0;
        for (doc, (expected, _)) in uploaded_docs.iter().zip(&documents) {
            let details = investigator.get_document_details(&doc.id.to_string()).await;
            let status = details["ocr_status"].as_str().unwrap_or("unknown");
            let ocr_text = details["ocr_text"].as_str().unwrap_or("");

            if status != "completed" {
                // Still pending (or in any other state) when the wait ended.
                unfinished_count += 1;
            } else if ocr_text == expected.as_str() {
                success_count += 1;
            } else if ocr_text.is_empty() {
                empty_count += 1;
            } else {
                other_corruption += 1;
            }
        }

        println!("\n📊 RESULTS for {} documents:", doc_count);
        println!(" ✅ Successful: {}", success_count);
        println!(" ❌ Empty content: {}", empty_count);
        println!(" 🔄 Other corruption: {}", other_corruption);
        println!(" ⏳ Not completed: {}", unfinished_count);
        println!(
            " 📈 Success rate: {:.1}%",
            (success_count as f64 / doc_count as f64) * 100.0
        );

        // Get final queue stats
        let final_queue_stats = investigator.get_queue_stats().await;
        println!(
            "📊 Final queue stats: {}",
            serde_json::to_string_pretty(&final_queue_stats).unwrap_or_default()
        );

        if empty_count > 0 {
            println!("⚠️ EMPTY CONTENT THRESHOLD FOUND AT {} DOCUMENTS", doc_count);
        }
    }
}