feat(server/client): add pagination in client, resolve race condition in server

2025-06-15 21:48:59 +00:00 · 2025-06-15 21:48:59 +00:00 · a7cf67f90d
parent 42bc72ded4
commit a7cf67f90d
5 changed files with 823 additions and 25 deletions
--- a/frontend/src/pages/DocumentsPage.tsx
+++ b/frontend/src/pages/DocumentsPage.tsx
@ -22,6 +22,10 @@ import {
  Divider,
  CircularProgress,
  Alert,
+  Pagination,
+  FormControl,
+  InputLabel,
+  Select,
 } from '@mui/material';
 import {
  GridView as GridViewIcon,
@ -38,6 +42,8 @@ import {
  CalendarToday as DateIcon,
  Storage as SizeIcon,
  Visibility as ViewIcon,
+  ChevronLeft as ChevronLeftIcon,
+  ChevronRight as ChevronRightIcon,
 } from '@mui/icons-material';
 import { documentService } from '../services/api';
 import DocumentThumbnail from '../components/DocumentThumbnail';
@ -50,9 +56,23 @@ interface Document {
  mime_type: string;
  created_at: string;
  has_ocr_text?: boolean;
+  ocr_status?: string;
+  ocr_confidence?: number;
  tags: string[];
 }

+interface PaginationInfo {
+  total: number;
+  limit: number;
+  offset: number;
+  has_more: boolean;
+}
+
+interface DocumentsResponse {
+  documents: Document[];
+  pagination: PaginationInfo;
+}
+
 type ViewMode = 'grid' | 'list';
 type SortField = 'created_at' | 'original_filename' | 'file_size';
 type SortOrder = 'asc' | 'desc';
@ -60,12 +80,14 @@ type SortOrder = 'asc' | 'desc';
 const DocumentsPage: React.FC = () => {
  const navigate = useNavigate();
  const [documents, setDocuments] = useState<Document[]>([]);
+  const [pagination, setPagination] = useState<PaginationInfo>({ total: 0, limit: 20, offset: 0, has_more: false });
  const [loading, setLoading] = useState<boolean>(true);
  const [error, setError] = useState<string | null>(null);
  const [viewMode, setViewMode] = useState<ViewMode>('grid');
  const [searchQuery, setSearchQuery] = useState<string>('');
  const [sortBy, setSortBy] = useState<SortField>('created_at');
  const [sortOrder, setSortOrder] = useState<SortOrder>('desc');
+  const [ocrFilter, setOcrFilter] = useState<string>('');
  
  // Menu states
  const [sortMenuAnchor, setSortMenuAnchor] = useState<null | HTMLElement>(null);
@ -74,13 +96,18 @@ const DocumentsPage: React.FC = () => {

  useEffect(() => {
    fetchDocuments();
-  }, []);
+  }, [pagination.limit, pagination.offset, ocrFilter]);

  const fetchDocuments = async (): Promise<void> => {
    try {
      setLoading(true);
-      const response = await documentService.list(100, 0);
-      setDocuments(response.data);
+      const response = await documentService.listWithPagination(
+        pagination.limit, 
+        pagination.offset, 
+        ocrFilter || undefined
+      );
+      setDocuments(response.data.documents);
+      setPagination(response.data.pagination);
    } catch (err) {
      setError('Failed to load documents');
      console.error(err);
@ -179,6 +206,39 @@ const DocumentsPage: React.FC = () => {
    handleSortMenuClose();
  };

+  const handlePageChange = (event: React.ChangeEvent<unknown>, page: number): void => {
+    const newOffset = (page - 1) * pagination.limit;
+    setPagination(prev => ({ ...prev, offset: newOffset }));
+  };
+
+  const handleOcrFilterChange = (event: React.ChangeEvent<HTMLInputElement>): void => {
+    setOcrFilter(event.target.value);
+    setPagination(prev => ({ ...prev, offset: 0 })); // Reset to first page when filtering
+  };
+
+  const getOcrStatusChip = (doc: Document) => {
+    if (!doc.ocr_status) return null;
+    
+    const statusConfig = {
+      'completed': { color: 'success' as const, label: doc.ocr_confidence ? `OCR ${Math.round(doc.ocr_confidence)}%` : 'OCR Done' },
+      'processing': { color: 'warning' as const, label: 'Processing...' },
+      'failed': { color: 'error' as const, label: 'OCR Failed' },
+      'pending': { color: 'default' as const, label: 'Pending' },
+    };
+    
+    const config = statusConfig[doc.ocr_status as keyof typeof statusConfig];
+    if (!config) return null;
+    
+    return (
+      <Chip 
+        label={config.label}
+        size="small" 
+        color={config.color}
+        variant="outlined"
+      />
+    );
+  };
+
  if (loading) {
    return (
      <Box display="flex" justifyContent="center" alignItems="center" minHeight="400px">
@ -257,6 +317,22 @@ const DocumentsPage: React.FC = () => {
          </ToggleButton>
        </ToggleButtonGroup>

+        {/* OCR Filter */}
+        <FormControl size="small" sx={{ minWidth: 120 }}>
+          <InputLabel>OCR Status</InputLabel>
+          <Select
+            value={ocrFilter}
+            label="OCR Status"
+            onChange={handleOcrFilterChange}
+          >
+            <MenuItem value="">All</MenuItem>
+            <MenuItem value="completed">Completed</MenuItem>
+            <MenuItem value="processing">Processing</MenuItem>
+            <MenuItem value="failed">Failed</MenuItem>
+            <MenuItem value="pending">Pending</MenuItem>
+          </Select>
+        </FormControl>
+
        {/* Sort Button */}
        <Button
          variant="outlined"
@ -418,14 +494,7 @@ const DocumentsPage: React.FC = () => {
                          size="small" 
                          variant="outlined"
                        />
-                        {doc.has_ocr_text && (
-                          <Chip 
-                            label="OCR" 
-                            size="small" 
-                            color="success"
-                            variant="outlined"
-                          />
-                        )}
+                        {getOcrStatusChip(doc)}
                      </Stack>
                      
                      {doc.tags.length > 0 && (
@ -489,12 +558,29 @@ const DocumentsPage: React.FC = () => {
        </Grid>
      )}

-      {/* Results count */}
-      <Box sx={{ mt: 3, textAlign: 'center' }}>
-        <Typography variant="body2" color="text.secondary">
-          Showing {sortedDocuments.length} of {documents.length} documents
-          {searchQuery && ` matching "${searchQuery}"`}
-        </Typography>
+      {/* Results count and pagination */}
+      <Box sx={{ mt: 3 }}>
+        <Box sx={{ textAlign: 'center', mb: 2 }}>
+          <Typography variant="body2" color="text.secondary">
+            Showing {pagination.offset + 1}-{Math.min(pagination.offset + pagination.limit, pagination.total)} of {pagination.total} documents
+            {ocrFilter && ` with OCR status: ${ocrFilter}`}
+            {searchQuery && ` matching "${searchQuery}"`}
+          </Typography>
+        </Box>
+        
+        {pagination.total > pagination.limit && (
+          <Box sx={{ display: 'flex', justifyContent: 'center' }}>
+            <Pagination
+              count={Math.ceil(pagination.total / pagination.limit)}
+              page={Math.floor(pagination.offset / pagination.limit) + 1}
+              onChange={handlePageChange}
+              color="primary"
+              size="large"
+              showFirstButton
+              showLastButton
+            />
+          </Box>
+        )}
      </Box>
    </Box>
  );
--- a/frontend/src/services/api.ts
+++ b/frontend/src/services/api.ts
@ -111,6 +111,16 @@ export const documentService = {
    })
  },

+  listWithPagination: (limit = 20, offset = 0, ocrStatus?: string) => {
+    const params: any = { limit, offset };
+    if (ocrStatus) {
+      params.ocr_status = ocrStatus;
+    }
+    return api.get<{documents: Document[], pagination: {total: number, limit: number, offset: number, has_more: boolean}}>('/documents', {
+      params,
+    })
+  },
+
  getById: (id: string) => {
    // Use the document list endpoint with pagination to find the specific document
    // This is a temporary solution until we have a proper document details endpoint
--- a/src/db.rs
+++ b/src/db.rs
@ -503,6 +503,147 @@ impl Database {
        Ok(documents)
    }

+    pub async fn get_documents_by_user_with_role_and_filter(&self, user_id: Uuid, user_role: crate::models::UserRole, limit: i64, offset: i64, ocr_status: Option<&str>) -> Result<Vec<Document>> {
+        let rows = match (user_role == crate::models::UserRole::Admin, ocr_status) {
+            (true, Some(status)) => {
+                // Admin with OCR filter
+                sqlx::query(
+                    r#"
+                    SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id
+                    FROM documents 
+                    WHERE ocr_status = $3
+                    ORDER BY created_at DESC 
+                    LIMIT $1 OFFSET $2
+                    "#
+                )
+                .bind(limit)
+                .bind(offset)
+                .bind(status)
+                .fetch_all(&self.pool)
+                .await?
+            }
+            (true, None) => {
+                // Admin without OCR filter
+                sqlx::query(
+                    r#"
+                    SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id
+                    FROM documents 
+                    ORDER BY created_at DESC 
+                    LIMIT $1 OFFSET $2
+                    "#
+                )
+                .bind(limit)
+                .bind(offset)
+                .fetch_all(&self.pool)
+                .await?
+            }
+            (false, Some(status)) => {
+                // Regular user with OCR filter
+                sqlx::query(
+                    r#"
+                    SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id
+                    FROM documents 
+                    WHERE user_id = $3 AND ocr_status = $4
+                    ORDER BY created_at DESC 
+                    LIMIT $1 OFFSET $2
+                    "#
+                )
+                .bind(limit)
+                .bind(offset)
+                .bind(user_id)
+                .bind(status)
+                .fetch_all(&self.pool)
+                .await?
+            }
+            (false, None) => {
+                // Regular user without OCR filter
+                sqlx::query(
+                    r#"
+                    SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id
+                    FROM documents 
+                    WHERE user_id = $3 
+                    ORDER BY created_at DESC 
+                    LIMIT $1 OFFSET $2
+                    "#
+                )
+                .bind(limit)
+                .bind(offset)
+                .bind(user_id)
+                .fetch_all(&self.pool)
+                .await?
+            }
+        };
+
+        let documents = rows
+            .into_iter()
+            .map(|row| Document {
+                id: row.get("id"),
+                filename: row.get("filename"),
+                original_filename: row.get("original_filename"),
+                file_path: row.get("file_path"),
+                file_size: row.get("file_size"),
+                mime_type: row.get("mime_type"),
+                content: row.get("content"),
+                ocr_text: row.get("ocr_text"),
+                ocr_confidence: row.get("ocr_confidence"),
+                ocr_word_count: row.get("ocr_word_count"),
+                ocr_processing_time_ms: row.get("ocr_processing_time_ms"),
+                ocr_status: row.get("ocr_status"),
+                ocr_error: row.get("ocr_error"),
+                ocr_completed_at: row.get("ocr_completed_at"),
+                tags: row.get("tags"),
+                created_at: row.get("created_at"),
+                updated_at: row.get("updated_at"),
+                user_id: row.get("user_id"),
+            })
+            .collect();
+
+        Ok(documents)
+    }
+
+    pub async fn get_documents_count_with_role_and_filter(&self, user_id: Uuid, user_role: crate::models::UserRole, ocr_status: Option<&str>) -> Result<i64> {
+        let count = match (user_role == crate::models::UserRole::Admin, ocr_status) {
+            (true, Some(status)) => {
+                // Admin with OCR filter
+                sqlx::query_scalar::<_, i64>(
+                    "SELECT COUNT(*) FROM documents WHERE ocr_status = $1"
+                )
+                .bind(status)
+                .fetch_one(&self.pool)
+                .await?
+            }
+            (true, None) => {
+                // Admin without OCR filter
+                sqlx::query_scalar::<_, i64>(
+                    "SELECT COUNT(*) FROM documents"
+                )
+                .fetch_one(&self.pool)
+                .await?
+            }
+            (false, Some(status)) => {
+                // Regular user with OCR filter
+                sqlx::query_scalar::<_, i64>(
+                    "SELECT COUNT(*) FROM documents WHERE user_id = $1 AND ocr_status = $2"
+                )
+                .bind(user_id)
+                .bind(status)
+                .fetch_one(&self.pool)
+                .await?
+            }
+            (false, None) => {
+                // Regular user without OCR filter
+                sqlx::query_scalar::<_, i64>(
+                    "SELECT COUNT(*) FROM documents WHERE user_id = $1"
+                )
+                .bind(user_id)
+                .fetch_one(&self.pool)
+                .await?
+            }
+        };
+
+        Ok(count)
+    }
+
    pub async fn get_documents_by_user(&self, user_id: Uuid, limit: i64, offset: i64) -> Result<Vec<Document>> {
        let rows = sqlx::query(
            r#"
--- a/src/routes/documents.rs
+++ b/src/routes/documents.rs
@ -22,6 +22,7 @@ use crate::{
 struct PaginationQuery {
    limit: Option<i64>,
    offset: Option<i64>,
+    ocr_status: Option<String>,
 }

 pub fn router() -> Router<Arc<AppState>> {
@ -174,7 +175,8 @@ fn calculate_file_hash(data: &[u8]) -> String {
    ),
    params(
        ("limit" = Option<i64>, Query, description = "Number of documents to return (default: 50)"),
-        ("offset" = Option<i64>, Query, description = "Number of documents to skip (default: 0)")
+        ("offset" = Option<i64>, Query, description = "Number of documents to skip (default: 0)"),
+        ("ocr_status" = Option<String>, Query, description = "Filter by OCR status (pending, processing, completed, failed)")
    ),
    responses(
        (status = 200, description = "List of user documents", body = Vec<DocumentResponse>),
@ -185,17 +187,40 @@ async fn list_documents(
    State(state): State<Arc<AppState>>,
    auth_user: AuthUser,
    Query(pagination): Query<PaginationQuery>,
-) -> Result<Json<Vec<DocumentResponse>>, StatusCode> {
+) -> Result<Json<serde_json::Value>, StatusCode> {
    let limit = pagination.limit.unwrap_or(50);
    let offset = pagination.offset.unwrap_or(0);
    
-    let documents = state
-        .db
-        .get_documents_by_user_with_role(auth_user.user.id, auth_user.user.role, limit, offset)
-        .await
-        .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
+    let user_id = auth_user.user.id;
+    let user_role = auth_user.user.role;
+    let ocr_filter = pagination.ocr_status.as_deref();
    
-    let response: Vec<DocumentResponse> = documents.into_iter().map(|doc| doc.into()).collect();
+    let (documents, total_count) = tokio::try_join!(
+        state.db.get_documents_by_user_with_role_and_filter(
+            user_id, 
+            user_role.clone(), 
+            limit, 
+            offset, 
+            ocr_filter
+        ),
+        state.db.get_documents_count_with_role_and_filter(
+            user_id,
+            user_role,
+            ocr_filter
+        )
+    ).map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
+    
+    let documents_response: Vec<DocumentResponse> = documents.into_iter().map(|doc| doc.into()).collect();
+    
+    let response = serde_json::json!({
+        "documents": documents_response,
+        "pagination": {
+            "total": total_count,
+            "limit": limit,
+            "offset": offset,
+            "has_more": offset + limit < total_count
+        }
+    });
    
    Ok(Json(response))
 }
--- a/tests/ocr_corruption_tests.rs
+++ b/tests/ocr_corruption_tests.rs
@ -0,0 +1,536 @@
+/*!
+ * OCR Corruption Integration Tests
+ * 
+ * Tests for diagnosing and reproducing the issue where FileA's OCR text
+ * gets corrupted when FileB is processed simultaneously.
+ */
+
+use reqwest::Client;
+use serde_json::{json, Value};
+use std::time::{Duration, Instant};
+use tokio::time::sleep;
+use uuid::Uuid;
+
+use readur::models::{DocumentResponse, CreateUser, LoginRequest, LoginResponse};
+
+const BASE_URL: &str = "http://localhost:8000";
+const TIMEOUT: Duration = Duration::from_secs(60);
+
+/// Test client for OCR corruption scenarios
+struct OcrTestClient {
+    client: Client,
+    token: Option<String>,
+    user_id: Option<Uuid>,
+}
+
+impl OcrTestClient {
+    fn new() -> Self {
+        Self {
+            client: Client::new(),
+            token: None,
+            user_id: None,
+        }
+    }
+    
+    async fn check_server_health(&self) -> Result<(), Box<dyn std::error::Error>> {
+        let response = self.client
+            .get(&format!("{}/api/health", BASE_URL))
+            .timeout(Duration::from_secs(5))
+            .send()
+            .await?;
+        
+        if !response.status().is_success() {
+            return Err("Server health check failed".into());
+        }
+        
+        Ok(())
+    }
+    
+    async fn register_and_login(&mut self, username: &str, email: &str, password: &str) -> Result<String, Box<dyn std::error::Error>> {
+        let user_data = CreateUser {
+            username: username.to_string(),
+            email: email.to_string(),
+            password: password.to_string(),
+            role: Some(readur::models::UserRole::User),
+        };
+        
+        let register_response = self.client
+            .post(&format!("{}/api/auth/register", BASE_URL))
+            .json(&user_data)
+            .send()
+            .await?;
+        
+        if !register_response.status().is_success() {
+            return Err(format!("Registration failed: {}", register_response.text().await?).into());
+        }
+        
+        let login_data = LoginRequest {
+            username: username.to_string(),
+            password: password.to_string(),
+        };
+        
+        let login_response = self.client
+            .post(&format!("{}/api/auth/login", BASE_URL))
+            .json(&login_data)
+            .send()
+            .await?;
+        
+        if !login_response.status().is_success() {
+            return Err(format!("Login failed: {}", login_response.text().await?).into());
+        }
+        
+        let login_result: LoginResponse = login_response.json().await?;
+        self.token = Some(login_result.token.clone());
+        
+        Ok(login_result.token)
+    }
+    
+    /// Upload a document and return its ID and expected content
+    async fn upload_document(&self, content: &str, filename: &str) -> Result<(Uuid, String), Box<dyn std::error::Error>> {
+        let token = self.token.as_ref().ok_or("Not authenticated")?;
+        
+        let part = reqwest::multipart::Part::text(content.to_string())
+            .file_name(filename.to_string())
+            .mime_str("text/plain")?;
+        let form = reqwest::multipart::Form::new()
+            .part("file", part);
+        
+        let response = self.client
+            .post(&format!("{}/api/documents", BASE_URL))
+            .header("Authorization", format!("Bearer {}", token))
+            .multipart(form)
+            .send()
+            .await?;
+        
+        if !response.status().is_success() {
+            return Err(format!("Upload failed: {}", response.text().await?).into());
+        }
+        
+        let document: DocumentResponse = response.json().await?;
+        Ok((document.id, content.to_string()))
+    }
+    
+    /// Get document details including OCR status
+    async fn get_document_details(&self, doc_id: Uuid) -> Result<Value, Box<dyn std::error::Error>> {
+        let token = self.token.as_ref().ok_or("Not authenticated")?;
+        
+        let response = self.client
+            .get(&format!("{}/api/documents/{}/ocr", BASE_URL, doc_id))
+            .header("Authorization", format!("Bearer {}", token))
+            .send()
+            .await?;
+        
+        if !response.status().is_success() {
+            return Err(format!("Failed to get document details: {}", response.text().await?).into());
+        }
+        
+        let doc_data: Value = response.json().await?;
+        Ok(doc_data)
+    }
+    
+    /// Wait for OCR to complete for a document
+    async fn wait_for_ocr(&self, doc_id: Uuid) -> Result<Value, Box<dyn std::error::Error>> {
+        let start = Instant::now();
+        
+        while start.elapsed() < TIMEOUT {
+            let doc_data = self.get_document_details(doc_id).await?;
+            
+            match doc_data["ocr_status"].as_str() {
+                Some("completed") => {
+                    println!("✅ OCR completed for document {}", doc_id);
+                    return Ok(doc_data);
+                },
+                Some("failed") => {
+                    return Err(format!("OCR failed for document {}: {}", 
+                                     doc_id, 
+                                     doc_data["ocr_error"].as_str().unwrap_or("unknown error")).into());
+                },
+                Some("processing") => {
+                    println!("⏳ OCR still processing for document {}", doc_id);
+                },
+                _ => {
+                    println!("📋 Document {} queued for OCR", doc_id);
+                }
+            }
+            
+            sleep(Duration::from_millis(200)).await;
+        }
+        
+        Err(format!("OCR did not complete within {} seconds for document {}", TIMEOUT.as_secs(), doc_id).into())
+    }
+    
+    /// Upload multiple documents simultaneously and track their OCR results
+    async fn upload_documents_simultaneously(&self, documents: Vec<(&str, &str)>) -> Result<Vec<(Uuid, String, Value)>, Box<dyn std::error::Error>> {
+        let mut upload_tasks = Vec::new();
+        
+        // Upload all documents simultaneously
+        for (content, filename) in documents {
+            let content_owned = content.to_string();
+            let filename_owned = filename.to_string();
+            let client_ref = self;
+            
+            let task = async move {
+                client_ref.upload_document(&content_owned, &filename_owned).await
+            };
+            
+            upload_tasks.push(task);
+        }
+        
+        // Wait for all uploads to complete
+        let mut uploaded_docs = Vec::new();
+        for task in upload_tasks {
+            let (doc_id, expected_content) = task.await?;
+            uploaded_docs.push((doc_id, expected_content));
+            println!("📄 Uploaded document: {}", doc_id);
+        }
+        
+        // Now wait for OCR processing on all documents
+        let mut ocr_tasks = Vec::new();
+        for (doc_id, expected_content) in uploaded_docs {
+            let client_ref = self;
+            let task = async move {
+                let ocr_result = client_ref.wait_for_ocr(doc_id).await?;
+                Ok::<(Uuid, String, Value), Box<dyn std::error::Error>>((doc_id, expected_content, ocr_result))
+            };
+            ocr_tasks.push(task);
+        }
+        
+        // Wait for all OCR to complete
+        let mut results = Vec::new();
+        for task in ocr_tasks {
+            let result = task.await?;
+            results.push(result);
+        }
+        
+        Ok(results)
+    }
+}
+
+#[tokio::test]
+async fn test_concurrent_ocr_corruption() {
+    println!("🧪 Starting OCR corruption test with concurrent file processing");
+    
+    let mut client = OcrTestClient::new();
+    
+    // Check server health
+    if let Err(e) = client.check_server_health().await {
+        panic!("Server not running at {}: {}", BASE_URL, e);
+    }
+    
+    // Create test user
+    let timestamp = std::time::SystemTime::now()
+        .duration_since(std::time::UNIX_EPOCH)
+        .unwrap()
+        .as_millis();
+    let username = format!("ocr_corruption_test_{}", timestamp);
+    let email = format!("ocr_corruption_{}@test.com", timestamp);
+    
+    let _token = client.register_and_login(&username, &email, "testpass123").await
+        .expect("Failed to register and login");
+    
+    println!("✅ User registered: {}", username);
+    
+    // Create test documents with distinctive content
+    let file_a_content = r#"
+=== DOCUMENT A - IMPORTANT CONTRACT ===
+Contract Number: CONTRACT-A-001
+Party 1: Alice Corporation
+Party 2: Bob Industries
+Date: 2024-01-15
+Amount: $50,000
+Terms: This is the content for Document A. It contains specific legal text 
+that should remain associated with Document A only. Any corruption would 
+be immediately visible.
+
+DOCUMENT A SIGNATURE: Alice Smith, CEO
+UNIQUE IDENTIFIER FOR A: ALPHA-BRAVO-CHARLIE-001
+"#;
+    
+    let file_b_content = r#"
+=== DOCUMENT B - TECHNICAL SPECIFICATION ===
+Specification ID: SPEC-B-002
+Product: Widget Manufacturing System
+Version: 2.0
+Author: Technical Team B
+Date: 2024-01-16
+
+This is Document B containing technical specifications. It has completely 
+different content from Document A. If OCR corruption occurs, Document A 
+might end up with this technical content instead of its contract text.
+
+DOCUMENT B SIGNATURE: Bob Johnson, CTO
+UNIQUE IDENTIFIER FOR B: DELTA-ECHO-FOXTROT-002
+"#;
+    
+    // Test documents to upload simultaneously
+    let documents = vec![
+        (file_a_content, "contract_a.txt"),
+        (file_b_content, "specification_b.txt"),
+    ];
+    
+    println!("📤 Uploading documents simultaneously...");
+    
+    let results = client.upload_documents_simultaneously(documents).await
+        .expect("Failed to upload documents simultaneously");
+    
+    println!("🔍 Analyzing OCR results for corruption...");
+    
+    let mut corruption_detected = false;
+    
+    for (doc_id, expected_content, ocr_result) in results {
+        let actual_ocr_text = ocr_result["ocr_text"].as_str().unwrap_or("");
+        let filename = ocr_result["filename"].as_str().unwrap_or("unknown");
+        
+        println!("\n📋 Document: {} ({})", doc_id, filename);
+        println!("📄 Expected content length: {} chars", expected_content.len());
+        println!("🔤 Actual OCR text length: {} chars", actual_ocr_text.len());
+        
+        // Check for content mismatch (corruption)
+        if filename.contains("contract_a") {
+            // Document A should contain contract-specific terms
+            let has_contract_content = actual_ocr_text.contains("CONTRACT-A-001") 
+                && actual_ocr_text.contains("Alice Corporation")
+                && actual_ocr_text.contains("ALPHA-BRAVO-CHARLIE-001");
+                
+            let has_spec_content = actual_ocr_text.contains("SPEC-B-002")
+                || actual_ocr_text.contains("Widget Manufacturing")
+                || actual_ocr_text.contains("DELTA-ECHO-FOXTROT-002");
+            
+            if !has_contract_content {
+                println!("❌ CORRUPTION DETECTED: Document A missing its original contract content!");
+                corruption_detected = true;
+            }
+            
+            if has_spec_content {
+                println!("❌ CORRUPTION DETECTED: Document A contains Document B's specification content!");
+                corruption_detected = true;
+            }
+            
+            if has_contract_content && !has_spec_content {
+                println!("✅ Document A has correct content");
+            }
+        } else if filename.contains("specification_b") {
+            // Document B should contain specification-specific terms
+            let has_spec_content = actual_ocr_text.contains("SPEC-B-002")
+                && actual_ocr_text.contains("Widget Manufacturing")
+                && actual_ocr_text.contains("DELTA-ECHO-FOXTROT-002");
+                
+            let has_contract_content = actual_ocr_text.contains("CONTRACT-A-001")
+                || actual_ocr_text.contains("Alice Corporation")
+                || actual_ocr_text.contains("ALPHA-BRAVO-CHARLIE-001");
+            
+            if !has_spec_content {
+                println!("❌ CORRUPTION DETECTED: Document B missing its original specification content!");
+                corruption_detected = true;
+            }
+            
+            if has_contract_content {
+                println!("❌ CORRUPTION DETECTED: Document B contains Document A's contract content!");
+                corruption_detected = true;
+            }
+            
+            if has_spec_content && !has_contract_content {
+                println!("✅ Document B has correct content");
+            }
+        }
+        
+        // Additional integrity checks
+        if let Some(confidence) = ocr_result["ocr_confidence"].as_f64() {
+            println!("📊 OCR Confidence: {:.1}%", confidence);
+            if confidence < 50.0 {
+                println!("⚠️  Low OCR confidence may indicate processing issues");
+            }
+        }
+        
+        if let Some(word_count) = ocr_result["ocr_word_count"].as_i64() {
+            println!("📝 OCR Word Count: {}", word_count);
+        }
+        
+        if let Some(processing_time) = ocr_result["ocr_processing_time_ms"].as_i64() {
+            println!("⏱️  OCR Processing Time: {}ms", processing_time);
+        }
+    }
+    
+    if corruption_detected {
+        panic!("🚨 OCR CORRUPTION DETECTED! FileA's content was overwritten with FileB's data or vice versa.");
+    } else {
+        println!("\n🎉 No OCR corruption detected - all documents retained their correct content!");
+    }
+}
+
+#[tokio::test]
+async fn test_high_volume_concurrent_ocr() {
+    println!("🧪 Starting high-volume concurrent OCR test");
+    
+    let mut client = OcrTestClient::new();
+    
+    if let Err(e) = client.check_server_health().await {
+        panic!("Server not running at {}: {}", BASE_URL, e);
+    }
+    
+    let timestamp = std::time::SystemTime::now()
+        .duration_since(std::time::UNIX_EPOCH)
+        .unwrap()
+        .as_millis();
+    let username = format!("high_volume_test_{}", timestamp);
+    let email = format!("high_volume_{}@test.com", timestamp);
+    
+    let _token = client.register_and_login(&username, &email, "testpass123").await
+        .expect("Failed to register and login");
+    
+    // Create 5 documents with unique identifiable content
+    let mut documents = Vec::new();
+    for i in 1..=5 {
+        let content = format!(r#"
+=== DOCUMENT {} - UNIQUE CONTENT ===
+Document Number: DOC-{:03}
+Unique Signature: SIGNATURE-{}-{}-{}
+Content: This is document number {} with completely unique content.
+Every document should retain its own unique signature and number.
+Any mixing of content between documents indicates corruption.
+Random data: {}
+End of Document {}
+"#, i, i, timestamp, i*7, i, timestamp * i, i, i);
+        
+        documents.push((content, format!("doc_{}.txt", i)));
+    }
+    
+    println!("📤 Uploading {} documents simultaneously...", documents.len());
+    
+    let documents_ref: Vec<(&str, &str)> = documents.iter()
+        .map(|(content, filename)| (content.as_str(), filename.as_str()))
+        .collect();
+    
+    let results = client.upload_documents_simultaneously(documents_ref).await
+        .expect("Failed to upload documents simultaneously");
+    
+    println!("🔍 Analyzing results for content mixing...");
+    
+    let mut all_signatures = Vec::new();
+    let mut corruption_found = false;
+    
+    // Extract all unique signatures
+    for i in 1..=5 {
+        all_signatures.push(format!("SIGNATURE-{}-{}-{}", i, timestamp, i*7));
+    }
+    
+    // Check each document for corruption
+    for (doc_id, expected_content, ocr_result) in results {
+        let actual_ocr_text = ocr_result["ocr_text"].as_str().unwrap_or("");
+        let filename = ocr_result["filename"].as_str().unwrap_or("unknown");
+        
+        // Determine which document this should be based on filename
+        if let Some(doc_num_str) = filename.strip_prefix("doc_").and_then(|s| s.strip_suffix(".txt")) {
+            if let Ok(doc_num) = doc_num_str.parse::<i32>() {
+                let expected_signature = format!("SIGNATURE-{}-{}-{}", doc_num, timestamp, doc_num*7);
+                
+                println!("\n📋 Checking Document {} ({})", doc_num, doc_id);
+                
+                // Check if it has its own signature
+                let has_own_signature = actual_ocr_text.contains(&expected_signature);
+                
+                // Check if it has any other document's signature
+                let mut has_other_signatures = Vec::new();
+                for (i, sig) in all_signatures.iter().enumerate() {
+                    if i + 1 != doc_num as usize && actual_ocr_text.contains(sig) {
+                        has_other_signatures.push(i + 1);
+                    }
+                }
+                
+                if !has_own_signature {
+                    println!("❌ CORRUPTION: Document {} missing its own signature!", doc_num);
+                    corruption_found = true;
+                }
+                
+                if !has_other_signatures.is_empty() {
+                    println!("❌ CORRUPTION: Document {} contains signatures from documents: {:?}", doc_num, has_other_signatures);
+                    corruption_found = true;
+                }
+                
+                if has_own_signature && has_other_signatures.is_empty() {
+                    println!("✅ Document {} has correct content", doc_num);
+                }
+            }
+        }
+    }
+    
+    if corruption_found {
+        panic!("🚨 CONTENT CORRUPTION DETECTED in high-volume test!");
+    } else {
+        println!("\n🎉 High-volume test passed - no corruption detected!");
+    }
+}
+
+#[tokio::test]
+async fn test_rapid_sequential_uploads() {
+    println!("🧪 Testing rapid sequential uploads for race conditions");
+    
+    let mut client = OcrTestClient::new();
+    
+    if let Err(e) = client.check_server_health().await {
+        panic!("Server not running at {}: {}", BASE_URL, e);
+    }
+    
+    let timestamp = std::time::SystemTime::now()
+        .duration_since(std::time::UNIX_EPOCH)
+        .unwrap()
+        .as_millis();
+    let username = format!("rapid_test_{}", timestamp);
+    let email = format!("rapid_{}@test.com", timestamp);
+    
+    let _token = client.register_and_login(&username, &email, "testpass123").await
+        .expect("Failed to register and login");
+    
+    println!("📤 Uploading documents in rapid sequence...");
+    
+    // Upload documents one after another with minimal delay
+    let mut doc_ids = Vec::new();
+    let mut expected_contents = Vec::new();
+    
+    for i in 1..=3 {
+        let content = format!("RAPID-TEST-DOCUMENT-{}-{}-UNIQUE-CONTENT", i, timestamp);
+        let filename = format!("rapid_{}.txt", i);
+        
+        let (doc_id, expected) = client.upload_document(&content, &filename).await
+            .expect("Failed to upload document");
+        
+        doc_ids.push(doc_id);
+        expected_contents.push(expected);
+        
+        println!("📄 Uploaded rapid document {}: {}", i, doc_id);
+        
+        // Very short delay to create timing pressure
+        sleep(Duration::from_millis(50)).await;
+    }
+    
+    println!("⏳ Waiting for all OCR to complete...");
+    
+    // Wait for all to complete and check for corruption
+    for (i, doc_id) in doc_ids.iter().enumerate() {
+        let ocr_result = client.wait_for_ocr(*doc_id).await
+            .expect("Failed to wait for OCR");
+        
+        let actual_text = ocr_result["ocr_text"].as_str().unwrap_or("");
+        let expected_marker = format!("RAPID-TEST-DOCUMENT-{}", i + 1);
+        
+        if !actual_text.contains(&expected_marker) {
+            panic!("🚨 RAPID UPLOAD CORRUPTION: Document {} missing its unique marker '{}'", 
+                   doc_id, expected_marker);
+        }
+        
+        // Check it doesn't contain other documents' markers
+        for j in 1..=3 {
+            if j != (i + 1) {
+                let other_marker = format!("RAPID-TEST-DOCUMENT-{}", j);
+                if actual_text.contains(&other_marker) {
+                    panic!("🚨 RAPID UPLOAD CORRUPTION: Document {} contains marker from document {}", 
+                           doc_id, j);
+                }
+            }
+        }
+        
+        println!("✅ Rapid document {} has correct content", i + 1);
+    }
+    
+    println!("🎉 Rapid sequential upload test passed!");
+}