feat(server/client): add pagination in client, resolve race condition in server
This commit is contained in:
parent
42bc72ded4
commit
a7cf67f90d
|
|
@ -22,6 +22,10 @@ import {
|
|||
Divider,
|
||||
CircularProgress,
|
||||
Alert,
|
||||
Pagination,
|
||||
FormControl,
|
||||
InputLabel,
|
||||
Select,
|
||||
} from '@mui/material';
|
||||
import {
|
||||
GridView as GridViewIcon,
|
||||
|
|
@ -38,6 +42,8 @@ import {
|
|||
CalendarToday as DateIcon,
|
||||
Storage as SizeIcon,
|
||||
Visibility as ViewIcon,
|
||||
ChevronLeft as ChevronLeftIcon,
|
||||
ChevronRight as ChevronRightIcon,
|
||||
} from '@mui/icons-material';
|
||||
import { documentService } from '../services/api';
|
||||
import DocumentThumbnail from '../components/DocumentThumbnail';
|
||||
|
|
@ -50,9 +56,23 @@ interface Document {
|
|||
mime_type: string;
|
||||
created_at: string;
|
||||
has_ocr_text?: boolean;
|
||||
ocr_status?: string;
|
||||
ocr_confidence?: number;
|
||||
tags: string[];
|
||||
}
|
||||
|
||||
interface PaginationInfo {
|
||||
total: number;
|
||||
limit: number;
|
||||
offset: number;
|
||||
has_more: boolean;
|
||||
}
|
||||
|
||||
interface DocumentsResponse {
|
||||
documents: Document[];
|
||||
pagination: PaginationInfo;
|
||||
}
|
||||
|
||||
type ViewMode = 'grid' | 'list';
|
||||
type SortField = 'created_at' | 'original_filename' | 'file_size';
|
||||
type SortOrder = 'asc' | 'desc';
|
||||
|
|
@ -60,12 +80,14 @@ type SortOrder = 'asc' | 'desc';
|
|||
const DocumentsPage: React.FC = () => {
|
||||
const navigate = useNavigate();
|
||||
const [documents, setDocuments] = useState<Document[]>([]);
|
||||
const [pagination, setPagination] = useState<PaginationInfo>({ total: 0, limit: 20, offset: 0, has_more: false });
|
||||
const [loading, setLoading] = useState<boolean>(true);
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
const [viewMode, setViewMode] = useState<ViewMode>('grid');
|
||||
const [searchQuery, setSearchQuery] = useState<string>('');
|
||||
const [sortBy, setSortBy] = useState<SortField>('created_at');
|
||||
const [sortOrder, setSortOrder] = useState<SortOrder>('desc');
|
||||
const [ocrFilter, setOcrFilter] = useState<string>('');
|
||||
|
||||
// Menu states
|
||||
const [sortMenuAnchor, setSortMenuAnchor] = useState<null | HTMLElement>(null);
|
||||
|
|
@ -74,13 +96,18 @@ const DocumentsPage: React.FC = () => {
|
|||
|
||||
useEffect(() => {
|
||||
fetchDocuments();
|
||||
}, []);
|
||||
}, [pagination.limit, pagination.offset, ocrFilter]);
|
||||
|
||||
const fetchDocuments = async (): Promise<void> => {
|
||||
try {
|
||||
setLoading(true);
|
||||
const response = await documentService.list(100, 0);
|
||||
setDocuments(response.data);
|
||||
const response = await documentService.listWithPagination(
|
||||
pagination.limit,
|
||||
pagination.offset,
|
||||
ocrFilter || undefined
|
||||
);
|
||||
setDocuments(response.data.documents);
|
||||
setPagination(response.data.pagination);
|
||||
} catch (err) {
|
||||
setError('Failed to load documents');
|
||||
console.error(err);
|
||||
|
|
@ -179,6 +206,39 @@ const DocumentsPage: React.FC = () => {
|
|||
handleSortMenuClose();
|
||||
};
|
||||
|
||||
const handlePageChange = (event: React.ChangeEvent<unknown>, page: number): void => {
|
||||
const newOffset = (page - 1) * pagination.limit;
|
||||
setPagination(prev => ({ ...prev, offset: newOffset }));
|
||||
};
|
||||
|
||||
const handleOcrFilterChange = (event: React.ChangeEvent<HTMLInputElement>): void => {
|
||||
setOcrFilter(event.target.value);
|
||||
setPagination(prev => ({ ...prev, offset: 0 })); // Reset to first page when filtering
|
||||
};
|
||||
|
||||
const getOcrStatusChip = (doc: Document) => {
|
||||
if (!doc.ocr_status) return null;
|
||||
|
||||
const statusConfig = {
|
||||
'completed': { color: 'success' as const, label: doc.ocr_confidence ? `OCR ${Math.round(doc.ocr_confidence)}%` : 'OCR Done' },
|
||||
'processing': { color: 'warning' as const, label: 'Processing...' },
|
||||
'failed': { color: 'error' as const, label: 'OCR Failed' },
|
||||
'pending': { color: 'default' as const, label: 'Pending' },
|
||||
};
|
||||
|
||||
const config = statusConfig[doc.ocr_status as keyof typeof statusConfig];
|
||||
if (!config) return null;
|
||||
|
||||
return (
|
||||
<Chip
|
||||
label={config.label}
|
||||
size="small"
|
||||
color={config.color}
|
||||
variant="outlined"
|
||||
/>
|
||||
);
|
||||
};
|
||||
|
||||
if (loading) {
|
||||
return (
|
||||
<Box display="flex" justifyContent="center" alignItems="center" minHeight="400px">
|
||||
|
|
@ -257,6 +317,22 @@ const DocumentsPage: React.FC = () => {
|
|||
</ToggleButton>
|
||||
</ToggleButtonGroup>
|
||||
|
||||
{/* OCR Filter */}
|
||||
<FormControl size="small" sx={{ minWidth: 120 }}>
|
||||
<InputLabel>OCR Status</InputLabel>
|
||||
<Select
|
||||
value={ocrFilter}
|
||||
label="OCR Status"
|
||||
onChange={handleOcrFilterChange}
|
||||
>
|
||||
<MenuItem value="">All</MenuItem>
|
||||
<MenuItem value="completed">Completed</MenuItem>
|
||||
<MenuItem value="processing">Processing</MenuItem>
|
||||
<MenuItem value="failed">Failed</MenuItem>
|
||||
<MenuItem value="pending">Pending</MenuItem>
|
||||
</Select>
|
||||
</FormControl>
|
||||
|
||||
{/* Sort Button */}
|
||||
<Button
|
||||
variant="outlined"
|
||||
|
|
@ -418,14 +494,7 @@ const DocumentsPage: React.FC = () => {
|
|||
size="small"
|
||||
variant="outlined"
|
||||
/>
|
||||
{doc.has_ocr_text && (
|
||||
<Chip
|
||||
label="OCR"
|
||||
size="small"
|
||||
color="success"
|
||||
variant="outlined"
|
||||
/>
|
||||
)}
|
||||
{getOcrStatusChip(doc)}
|
||||
</Stack>
|
||||
|
||||
{doc.tags.length > 0 && (
|
||||
|
|
@ -489,12 +558,29 @@ const DocumentsPage: React.FC = () => {
|
|||
</Grid>
|
||||
)}
|
||||
|
||||
{/* Results count */}
|
||||
<Box sx={{ mt: 3, textAlign: 'center' }}>
|
||||
<Typography variant="body2" color="text.secondary">
|
||||
Showing {sortedDocuments.length} of {documents.length} documents
|
||||
{searchQuery && ` matching "${searchQuery}"`}
|
||||
</Typography>
|
||||
{/* Results count and pagination */}
|
||||
<Box sx={{ mt: 3 }}>
|
||||
<Box sx={{ textAlign: 'center', mb: 2 }}>
|
||||
<Typography variant="body2" color="text.secondary">
|
||||
Showing {pagination.offset + 1}-{Math.min(pagination.offset + pagination.limit, pagination.total)} of {pagination.total} documents
|
||||
{ocrFilter && ` with OCR status: ${ocrFilter}`}
|
||||
{searchQuery && ` matching "${searchQuery}"`}
|
||||
</Typography>
|
||||
</Box>
|
||||
|
||||
{pagination.total > pagination.limit && (
|
||||
<Box sx={{ display: 'flex', justifyContent: 'center' }}>
|
||||
<Pagination
|
||||
count={Math.ceil(pagination.total / pagination.limit)}
|
||||
page={Math.floor(pagination.offset / pagination.limit) + 1}
|
||||
onChange={handlePageChange}
|
||||
color="primary"
|
||||
size="large"
|
||||
showFirstButton
|
||||
showLastButton
|
||||
/>
|
||||
</Box>
|
||||
)}
|
||||
</Box>
|
||||
</Box>
|
||||
);
|
||||
|
|
|
|||
|
|
@ -111,6 +111,16 @@ export const documentService = {
|
|||
})
|
||||
},
|
||||
|
||||
listWithPagination: (limit = 20, offset = 0, ocrStatus?: string) => {
|
||||
const params: any = { limit, offset };
|
||||
if (ocrStatus) {
|
||||
params.ocr_status = ocrStatus;
|
||||
}
|
||||
return api.get<{documents: Document[], pagination: {total: number, limit: number, offset: number, has_more: boolean}}>('/documents', {
|
||||
params,
|
||||
})
|
||||
},
|
||||
|
||||
getById: (id: string) => {
|
||||
// Use the document list endpoint with pagination to find the specific document
|
||||
// This is a temporary solution until we have a proper document details endpoint
|
||||
|
|
|
|||
141
src/db.rs
141
src/db.rs
|
|
@ -503,6 +503,147 @@ impl Database {
|
|||
Ok(documents)
|
||||
}
|
||||
|
||||
pub async fn get_documents_by_user_with_role_and_filter(&self, user_id: Uuid, user_role: crate::models::UserRole, limit: i64, offset: i64, ocr_status: Option<&str>) -> Result<Vec<Document>> {
|
||||
let rows = match (user_role == crate::models::UserRole::Admin, ocr_status) {
|
||||
(true, Some(status)) => {
|
||||
// Admin with OCR filter
|
||||
sqlx::query(
|
||||
r#"
|
||||
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id
|
||||
FROM documents
|
||||
WHERE ocr_status = $3
|
||||
ORDER BY created_at DESC
|
||||
LIMIT $1 OFFSET $2
|
||||
"#
|
||||
)
|
||||
.bind(limit)
|
||||
.bind(offset)
|
||||
.bind(status)
|
||||
.fetch_all(&self.pool)
|
||||
.await?
|
||||
}
|
||||
(true, None) => {
|
||||
// Admin without OCR filter
|
||||
sqlx::query(
|
||||
r#"
|
||||
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id
|
||||
FROM documents
|
||||
ORDER BY created_at DESC
|
||||
LIMIT $1 OFFSET $2
|
||||
"#
|
||||
)
|
||||
.bind(limit)
|
||||
.bind(offset)
|
||||
.fetch_all(&self.pool)
|
||||
.await?
|
||||
}
|
||||
(false, Some(status)) => {
|
||||
// Regular user with OCR filter
|
||||
sqlx::query(
|
||||
r#"
|
||||
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id
|
||||
FROM documents
|
||||
WHERE user_id = $3 AND ocr_status = $4
|
||||
ORDER BY created_at DESC
|
||||
LIMIT $1 OFFSET $2
|
||||
"#
|
||||
)
|
||||
.bind(limit)
|
||||
.bind(offset)
|
||||
.bind(user_id)
|
||||
.bind(status)
|
||||
.fetch_all(&self.pool)
|
||||
.await?
|
||||
}
|
||||
(false, None) => {
|
||||
// Regular user without OCR filter
|
||||
sqlx::query(
|
||||
r#"
|
||||
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id
|
||||
FROM documents
|
||||
WHERE user_id = $3
|
||||
ORDER BY created_at DESC
|
||||
LIMIT $1 OFFSET $2
|
||||
"#
|
||||
)
|
||||
.bind(limit)
|
||||
.bind(offset)
|
||||
.bind(user_id)
|
||||
.fetch_all(&self.pool)
|
||||
.await?
|
||||
}
|
||||
};
|
||||
|
||||
let documents = rows
|
||||
.into_iter()
|
||||
.map(|row| Document {
|
||||
id: row.get("id"),
|
||||
filename: row.get("filename"),
|
||||
original_filename: row.get("original_filename"),
|
||||
file_path: row.get("file_path"),
|
||||
file_size: row.get("file_size"),
|
||||
mime_type: row.get("mime_type"),
|
||||
content: row.get("content"),
|
||||
ocr_text: row.get("ocr_text"),
|
||||
ocr_confidence: row.get("ocr_confidence"),
|
||||
ocr_word_count: row.get("ocr_word_count"),
|
||||
ocr_processing_time_ms: row.get("ocr_processing_time_ms"),
|
||||
ocr_status: row.get("ocr_status"),
|
||||
ocr_error: row.get("ocr_error"),
|
||||
ocr_completed_at: row.get("ocr_completed_at"),
|
||||
tags: row.get("tags"),
|
||||
created_at: row.get("created_at"),
|
||||
updated_at: row.get("updated_at"),
|
||||
user_id: row.get("user_id"),
|
||||
})
|
||||
.collect();
|
||||
|
||||
Ok(documents)
|
||||
}
|
||||
|
||||
pub async fn get_documents_count_with_role_and_filter(&self, user_id: Uuid, user_role: crate::models::UserRole, ocr_status: Option<&str>) -> Result<i64> {
|
||||
let count = match (user_role == crate::models::UserRole::Admin, ocr_status) {
|
||||
(true, Some(status)) => {
|
||||
// Admin with OCR filter
|
||||
sqlx::query_scalar::<_, i64>(
|
||||
"SELECT COUNT(*) FROM documents WHERE ocr_status = $1"
|
||||
)
|
||||
.bind(status)
|
||||
.fetch_one(&self.pool)
|
||||
.await?
|
||||
}
|
||||
(true, None) => {
|
||||
// Admin without OCR filter
|
||||
sqlx::query_scalar::<_, i64>(
|
||||
"SELECT COUNT(*) FROM documents"
|
||||
)
|
||||
.fetch_one(&self.pool)
|
||||
.await?
|
||||
}
|
||||
(false, Some(status)) => {
|
||||
// Regular user with OCR filter
|
||||
sqlx::query_scalar::<_, i64>(
|
||||
"SELECT COUNT(*) FROM documents WHERE user_id = $1 AND ocr_status = $2"
|
||||
)
|
||||
.bind(user_id)
|
||||
.bind(status)
|
||||
.fetch_one(&self.pool)
|
||||
.await?
|
||||
}
|
||||
(false, None) => {
|
||||
// Regular user without OCR filter
|
||||
sqlx::query_scalar::<_, i64>(
|
||||
"SELECT COUNT(*) FROM documents WHERE user_id = $1"
|
||||
)
|
||||
.bind(user_id)
|
||||
.fetch_one(&self.pool)
|
||||
.await?
|
||||
}
|
||||
};
|
||||
|
||||
Ok(count)
|
||||
}
|
||||
|
||||
pub async fn get_documents_by_user(&self, user_id: Uuid, limit: i64, offset: i64) -> Result<Vec<Document>> {
|
||||
let rows = sqlx::query(
|
||||
r#"
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ use crate::{
|
|||
struct PaginationQuery {
|
||||
limit: Option<i64>,
|
||||
offset: Option<i64>,
|
||||
ocr_status: Option<String>,
|
||||
}
|
||||
|
||||
pub fn router() -> Router<Arc<AppState>> {
|
||||
|
|
@ -174,7 +175,8 @@ fn calculate_file_hash(data: &[u8]) -> String {
|
|||
),
|
||||
params(
|
||||
("limit" = Option<i64>, Query, description = "Number of documents to return (default: 50)"),
|
||||
("offset" = Option<i64>, Query, description = "Number of documents to skip (default: 0)")
|
||||
("offset" = Option<i64>, Query, description = "Number of documents to skip (default: 0)"),
|
||||
("ocr_status" = Option<String>, Query, description = "Filter by OCR status (pending, processing, completed, failed)")
|
||||
),
|
||||
responses(
|
||||
(status = 200, description = "List of user documents", body = Vec<DocumentResponse>),
|
||||
|
|
@ -185,17 +187,40 @@ async fn list_documents(
|
|||
State(state): State<Arc<AppState>>,
|
||||
auth_user: AuthUser,
|
||||
Query(pagination): Query<PaginationQuery>,
|
||||
) -> Result<Json<Vec<DocumentResponse>>, StatusCode> {
|
||||
) -> Result<Json<serde_json::Value>, StatusCode> {
|
||||
let limit = pagination.limit.unwrap_or(50);
|
||||
let offset = pagination.offset.unwrap_or(0);
|
||||
|
||||
let documents = state
|
||||
.db
|
||||
.get_documents_by_user_with_role(auth_user.user.id, auth_user.user.role, limit, offset)
|
||||
.await
|
||||
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
|
||||
let user_id = auth_user.user.id;
|
||||
let user_role = auth_user.user.role;
|
||||
let ocr_filter = pagination.ocr_status.as_deref();
|
||||
|
||||
let response: Vec<DocumentResponse> = documents.into_iter().map(|doc| doc.into()).collect();
|
||||
let (documents, total_count) = tokio::try_join!(
|
||||
state.db.get_documents_by_user_with_role_and_filter(
|
||||
user_id,
|
||||
user_role.clone(),
|
||||
limit,
|
||||
offset,
|
||||
ocr_filter
|
||||
),
|
||||
state.db.get_documents_count_with_role_and_filter(
|
||||
user_id,
|
||||
user_role,
|
||||
ocr_filter
|
||||
)
|
||||
).map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
|
||||
|
||||
let documents_response: Vec<DocumentResponse> = documents.into_iter().map(|doc| doc.into()).collect();
|
||||
|
||||
let response = serde_json::json!({
|
||||
"documents": documents_response,
|
||||
"pagination": {
|
||||
"total": total_count,
|
||||
"limit": limit,
|
||||
"offset": offset,
|
||||
"has_more": offset + limit < total_count
|
||||
}
|
||||
});
|
||||
|
||||
Ok(Json(response))
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,536 @@
|
|||
/*!
|
||||
* OCR Corruption Integration Tests
|
||||
*
|
||||
* Tests for diagnosing and reproducing the issue where FileA's OCR text
|
||||
* gets corrupted when FileB is processed simultaneously.
|
||||
*/
|
||||
|
||||
use reqwest::Client;
|
||||
use serde_json::{json, Value};
|
||||
use std::time::{Duration, Instant};
|
||||
use tokio::time::sleep;
|
||||
use uuid::Uuid;
|
||||
|
||||
use readur::models::{DocumentResponse, CreateUser, LoginRequest, LoginResponse};
|
||||
|
||||
const BASE_URL: &str = "http://localhost:8000";
|
||||
const TIMEOUT: Duration = Duration::from_secs(60);
|
||||
|
||||
/// Test client for OCR corruption scenarios
|
||||
struct OcrTestClient {
|
||||
client: Client,
|
||||
token: Option<String>,
|
||||
user_id: Option<Uuid>,
|
||||
}
|
||||
|
||||
impl OcrTestClient {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
client: Client::new(),
|
||||
token: None,
|
||||
user_id: None,
|
||||
}
|
||||
}
|
||||
|
||||
async fn check_server_health(&self) -> Result<(), Box<dyn std::error::Error>> {
|
||||
let response = self.client
|
||||
.get(&format!("{}/api/health", BASE_URL))
|
||||
.timeout(Duration::from_secs(5))
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
if !response.status().is_success() {
|
||||
return Err("Server health check failed".into());
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn register_and_login(&mut self, username: &str, email: &str, password: &str) -> Result<String, Box<dyn std::error::Error>> {
|
||||
let user_data = CreateUser {
|
||||
username: username.to_string(),
|
||||
email: email.to_string(),
|
||||
password: password.to_string(),
|
||||
role: Some(readur::models::UserRole::User),
|
||||
};
|
||||
|
||||
let register_response = self.client
|
||||
.post(&format!("{}/api/auth/register", BASE_URL))
|
||||
.json(&user_data)
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
if !register_response.status().is_success() {
|
||||
return Err(format!("Registration failed: {}", register_response.text().await?).into());
|
||||
}
|
||||
|
||||
let login_data = LoginRequest {
|
||||
username: username.to_string(),
|
||||
password: password.to_string(),
|
||||
};
|
||||
|
||||
let login_response = self.client
|
||||
.post(&format!("{}/api/auth/login", BASE_URL))
|
||||
.json(&login_data)
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
if !login_response.status().is_success() {
|
||||
return Err(format!("Login failed: {}", login_response.text().await?).into());
|
||||
}
|
||||
|
||||
let login_result: LoginResponse = login_response.json().await?;
|
||||
self.token = Some(login_result.token.clone());
|
||||
|
||||
Ok(login_result.token)
|
||||
}
|
||||
|
||||
/// Upload a document and return its ID and expected content
|
||||
async fn upload_document(&self, content: &str, filename: &str) -> Result<(Uuid, String), Box<dyn std::error::Error>> {
|
||||
let token = self.token.as_ref().ok_or("Not authenticated")?;
|
||||
|
||||
let part = reqwest::multipart::Part::text(content.to_string())
|
||||
.file_name(filename.to_string())
|
||||
.mime_str("text/plain")?;
|
||||
let form = reqwest::multipart::Form::new()
|
||||
.part("file", part);
|
||||
|
||||
let response = self.client
|
||||
.post(&format!("{}/api/documents", BASE_URL))
|
||||
.header("Authorization", format!("Bearer {}", token))
|
||||
.multipart(form)
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
if !response.status().is_success() {
|
||||
return Err(format!("Upload failed: {}", response.text().await?).into());
|
||||
}
|
||||
|
||||
let document: DocumentResponse = response.json().await?;
|
||||
Ok((document.id, content.to_string()))
|
||||
}
|
||||
|
||||
/// Get document details including OCR status
|
||||
async fn get_document_details(&self, doc_id: Uuid) -> Result<Value, Box<dyn std::error::Error>> {
|
||||
let token = self.token.as_ref().ok_or("Not authenticated")?;
|
||||
|
||||
let response = self.client
|
||||
.get(&format!("{}/api/documents/{}/ocr", BASE_URL, doc_id))
|
||||
.header("Authorization", format!("Bearer {}", token))
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
if !response.status().is_success() {
|
||||
return Err(format!("Failed to get document details: {}", response.text().await?).into());
|
||||
}
|
||||
|
||||
let doc_data: Value = response.json().await?;
|
||||
Ok(doc_data)
|
||||
}
|
||||
|
||||
/// Wait for OCR to complete for a document
|
||||
async fn wait_for_ocr(&self, doc_id: Uuid) -> Result<Value, Box<dyn std::error::Error>> {
|
||||
let start = Instant::now();
|
||||
|
||||
while start.elapsed() < TIMEOUT {
|
||||
let doc_data = self.get_document_details(doc_id).await?;
|
||||
|
||||
match doc_data["ocr_status"].as_str() {
|
||||
Some("completed") => {
|
||||
println!("✅ OCR completed for document {}", doc_id);
|
||||
return Ok(doc_data);
|
||||
},
|
||||
Some("failed") => {
|
||||
return Err(format!("OCR failed for document {}: {}",
|
||||
doc_id,
|
||||
doc_data["ocr_error"].as_str().unwrap_or("unknown error")).into());
|
||||
},
|
||||
Some("processing") => {
|
||||
println!("⏳ OCR still processing for document {}", doc_id);
|
||||
},
|
||||
_ => {
|
||||
println!("📋 Document {} queued for OCR", doc_id);
|
||||
}
|
||||
}
|
||||
|
||||
sleep(Duration::from_millis(200)).await;
|
||||
}
|
||||
|
||||
Err(format!("OCR did not complete within {} seconds for document {}", TIMEOUT.as_secs(), doc_id).into())
|
||||
}
|
||||
|
||||
/// Upload multiple documents simultaneously and track their OCR results
|
||||
async fn upload_documents_simultaneously(&self, documents: Vec<(&str, &str)>) -> Result<Vec<(Uuid, String, Value)>, Box<dyn std::error::Error>> {
|
||||
let mut upload_tasks = Vec::new();
|
||||
|
||||
// Upload all documents simultaneously
|
||||
for (content, filename) in documents {
|
||||
let content_owned = content.to_string();
|
||||
let filename_owned = filename.to_string();
|
||||
let client_ref = self;
|
||||
|
||||
let task = async move {
|
||||
client_ref.upload_document(&content_owned, &filename_owned).await
|
||||
};
|
||||
|
||||
upload_tasks.push(task);
|
||||
}
|
||||
|
||||
// Wait for all uploads to complete
|
||||
let mut uploaded_docs = Vec::new();
|
||||
for task in upload_tasks {
|
||||
let (doc_id, expected_content) = task.await?;
|
||||
uploaded_docs.push((doc_id, expected_content));
|
||||
println!("📄 Uploaded document: {}", doc_id);
|
||||
}
|
||||
|
||||
// Now wait for OCR processing on all documents
|
||||
let mut ocr_tasks = Vec::new();
|
||||
for (doc_id, expected_content) in uploaded_docs {
|
||||
let client_ref = self;
|
||||
let task = async move {
|
||||
let ocr_result = client_ref.wait_for_ocr(doc_id).await?;
|
||||
Ok::<(Uuid, String, Value), Box<dyn std::error::Error>>((doc_id, expected_content, ocr_result))
|
||||
};
|
||||
ocr_tasks.push(task);
|
||||
}
|
||||
|
||||
// Wait for all OCR to complete
|
||||
let mut results = Vec::new();
|
||||
for task in ocr_tasks {
|
||||
let result = task.await?;
|
||||
results.push(result);
|
||||
}
|
||||
|
||||
Ok(results)
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_concurrent_ocr_corruption() {
|
||||
println!("🧪 Starting OCR corruption test with concurrent file processing");
|
||||
|
||||
let mut client = OcrTestClient::new();
|
||||
|
||||
// Check server health
|
||||
if let Err(e) = client.check_server_health().await {
|
||||
panic!("Server not running at {}: {}", BASE_URL, e);
|
||||
}
|
||||
|
||||
// Create test user
|
||||
let timestamp = std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_millis();
|
||||
let username = format!("ocr_corruption_test_{}", timestamp);
|
||||
let email = format!("ocr_corruption_{}@test.com", timestamp);
|
||||
|
||||
let _token = client.register_and_login(&username, &email, "testpass123").await
|
||||
.expect("Failed to register and login");
|
||||
|
||||
println!("✅ User registered: {}", username);
|
||||
|
||||
// Create test documents with distinctive content
|
||||
let file_a_content = r#"
|
||||
=== DOCUMENT A - IMPORTANT CONTRACT ===
|
||||
Contract Number: CONTRACT-A-001
|
||||
Party 1: Alice Corporation
|
||||
Party 2: Bob Industries
|
||||
Date: 2024-01-15
|
||||
Amount: $50,000
|
||||
Terms: This is the content for Document A. It contains specific legal text
|
||||
that should remain associated with Document A only. Any corruption would
|
||||
be immediately visible.
|
||||
|
||||
DOCUMENT A SIGNATURE: Alice Smith, CEO
|
||||
UNIQUE IDENTIFIER FOR A: ALPHA-BRAVO-CHARLIE-001
|
||||
"#;
|
||||
|
||||
let file_b_content = r#"
|
||||
=== DOCUMENT B - TECHNICAL SPECIFICATION ===
|
||||
Specification ID: SPEC-B-002
|
||||
Product: Widget Manufacturing System
|
||||
Version: 2.0
|
||||
Author: Technical Team B
|
||||
Date: 2024-01-16
|
||||
|
||||
This is Document B containing technical specifications. It has completely
|
||||
different content from Document A. If OCR corruption occurs, Document A
|
||||
might end up with this technical content instead of its contract text.
|
||||
|
||||
DOCUMENT B SIGNATURE: Bob Johnson, CTO
|
||||
UNIQUE IDENTIFIER FOR B: DELTA-ECHO-FOXTROT-002
|
||||
"#;
|
||||
|
||||
// Test documents to upload simultaneously
|
||||
let documents = vec![
|
||||
(file_a_content, "contract_a.txt"),
|
||||
(file_b_content, "specification_b.txt"),
|
||||
];
|
||||
|
||||
println!("📤 Uploading documents simultaneously...");
|
||||
|
||||
let results = client.upload_documents_simultaneously(documents).await
|
||||
.expect("Failed to upload documents simultaneously");
|
||||
|
||||
println!("🔍 Analyzing OCR results for corruption...");
|
||||
|
||||
let mut corruption_detected = false;
|
||||
|
||||
for (doc_id, expected_content, ocr_result) in results {
|
||||
let actual_ocr_text = ocr_result["ocr_text"].as_str().unwrap_or("");
|
||||
let filename = ocr_result["filename"].as_str().unwrap_or("unknown");
|
||||
|
||||
println!("\n📋 Document: {} ({})", doc_id, filename);
|
||||
println!("📄 Expected content length: {} chars", expected_content.len());
|
||||
println!("🔤 Actual OCR text length: {} chars", actual_ocr_text.len());
|
||||
|
||||
// Check for content mismatch (corruption)
|
||||
if filename.contains("contract_a") {
|
||||
// Document A should contain contract-specific terms
|
||||
let has_contract_content = actual_ocr_text.contains("CONTRACT-A-001")
|
||||
&& actual_ocr_text.contains("Alice Corporation")
|
||||
&& actual_ocr_text.contains("ALPHA-BRAVO-CHARLIE-001");
|
||||
|
||||
let has_spec_content = actual_ocr_text.contains("SPEC-B-002")
|
||||
|| actual_ocr_text.contains("Widget Manufacturing")
|
||||
|| actual_ocr_text.contains("DELTA-ECHO-FOXTROT-002");
|
||||
|
||||
if !has_contract_content {
|
||||
println!("❌ CORRUPTION DETECTED: Document A missing its original contract content!");
|
||||
corruption_detected = true;
|
||||
}
|
||||
|
||||
if has_spec_content {
|
||||
println!("❌ CORRUPTION DETECTED: Document A contains Document B's specification content!");
|
||||
corruption_detected = true;
|
||||
}
|
||||
|
||||
if has_contract_content && !has_spec_content {
|
||||
println!("✅ Document A has correct content");
|
||||
}
|
||||
} else if filename.contains("specification_b") {
|
||||
// Document B should contain specification-specific terms
|
||||
let has_spec_content = actual_ocr_text.contains("SPEC-B-002")
|
||||
&& actual_ocr_text.contains("Widget Manufacturing")
|
||||
&& actual_ocr_text.contains("DELTA-ECHO-FOXTROT-002");
|
||||
|
||||
let has_contract_content = actual_ocr_text.contains("CONTRACT-A-001")
|
||||
|| actual_ocr_text.contains("Alice Corporation")
|
||||
|| actual_ocr_text.contains("ALPHA-BRAVO-CHARLIE-001");
|
||||
|
||||
if !has_spec_content {
|
||||
println!("❌ CORRUPTION DETECTED: Document B missing its original specification content!");
|
||||
corruption_detected = true;
|
||||
}
|
||||
|
||||
if has_contract_content {
|
||||
println!("❌ CORRUPTION DETECTED: Document B contains Document A's contract content!");
|
||||
corruption_detected = true;
|
||||
}
|
||||
|
||||
if has_spec_content && !has_contract_content {
|
||||
println!("✅ Document B has correct content");
|
||||
}
|
||||
}
|
||||
|
||||
// Additional integrity checks
|
||||
if let Some(confidence) = ocr_result["ocr_confidence"].as_f64() {
|
||||
println!("📊 OCR Confidence: {:.1}%", confidence);
|
||||
if confidence < 50.0 {
|
||||
println!("⚠️ Low OCR confidence may indicate processing issues");
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(word_count) = ocr_result["ocr_word_count"].as_i64() {
|
||||
println!("📝 OCR Word Count: {}", word_count);
|
||||
}
|
||||
|
||||
if let Some(processing_time) = ocr_result["ocr_processing_time_ms"].as_i64() {
|
||||
println!("⏱️ OCR Processing Time: {}ms", processing_time);
|
||||
}
|
||||
}
|
||||
|
||||
if corruption_detected {
|
||||
panic!("🚨 OCR CORRUPTION DETECTED! FileA's content was overwritten with FileB's data or vice versa.");
|
||||
} else {
|
||||
println!("\n🎉 No OCR corruption detected - all documents retained their correct content!");
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_high_volume_concurrent_ocr() {
|
||||
println!("🧪 Starting high-volume concurrent OCR test");
|
||||
|
||||
let mut client = OcrTestClient::new();
|
||||
|
||||
if let Err(e) = client.check_server_health().await {
|
||||
panic!("Server not running at {}: {}", BASE_URL, e);
|
||||
}
|
||||
|
||||
let timestamp = std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_millis();
|
||||
let username = format!("high_volume_test_{}", timestamp);
|
||||
let email = format!("high_volume_{}@test.com", timestamp);
|
||||
|
||||
let _token = client.register_and_login(&username, &email, "testpass123").await
|
||||
.expect("Failed to register and login");
|
||||
|
||||
// Create 5 documents with unique identifiable content
|
||||
let mut documents = Vec::new();
|
||||
for i in 1..=5 {
|
||||
let content = format!(r#"
|
||||
=== DOCUMENT {} - UNIQUE CONTENT ===
|
||||
Document Number: DOC-{:03}
|
||||
Unique Signature: SIGNATURE-{}-{}-{}
|
||||
Content: This is document number {} with completely unique content.
|
||||
Every document should retain its own unique signature and number.
|
||||
Any mixing of content between documents indicates corruption.
|
||||
Random data: {}
|
||||
End of Document {}
|
||||
"#, i, i, timestamp, i*7, i, timestamp * i, i, i);
|
||||
|
||||
documents.push((content, format!("doc_{}.txt", i)));
|
||||
}
|
||||
|
||||
println!("📤 Uploading {} documents simultaneously...", documents.len());
|
||||
|
||||
let documents_ref: Vec<(&str, &str)> = documents.iter()
|
||||
.map(|(content, filename)| (content.as_str(), filename.as_str()))
|
||||
.collect();
|
||||
|
||||
let results = client.upload_documents_simultaneously(documents_ref).await
|
||||
.expect("Failed to upload documents simultaneously");
|
||||
|
||||
println!("🔍 Analyzing results for content mixing...");
|
||||
|
||||
let mut all_signatures = Vec::new();
|
||||
let mut corruption_found = false;
|
||||
|
||||
// Extract all unique signatures
|
||||
for i in 1..=5 {
|
||||
all_signatures.push(format!("SIGNATURE-{}-{}-{}", i, timestamp, i*7));
|
||||
}
|
||||
|
||||
// Check each document for corruption
|
||||
for (doc_id, expected_content, ocr_result) in results {
|
||||
let actual_ocr_text = ocr_result["ocr_text"].as_str().unwrap_or("");
|
||||
let filename = ocr_result["filename"].as_str().unwrap_or("unknown");
|
||||
|
||||
// Determine which document this should be based on filename
|
||||
if let Some(doc_num_str) = filename.strip_prefix("doc_").and_then(|s| s.strip_suffix(".txt")) {
|
||||
if let Ok(doc_num) = doc_num_str.parse::<i32>() {
|
||||
let expected_signature = format!("SIGNATURE-{}-{}-{}", doc_num, timestamp, doc_num*7);
|
||||
|
||||
println!("\n📋 Checking Document {} ({})", doc_num, doc_id);
|
||||
|
||||
// Check if it has its own signature
|
||||
let has_own_signature = actual_ocr_text.contains(&expected_signature);
|
||||
|
||||
// Check if it has any other document's signature
|
||||
let mut has_other_signatures = Vec::new();
|
||||
for (i, sig) in all_signatures.iter().enumerate() {
|
||||
if i + 1 != doc_num as usize && actual_ocr_text.contains(sig) {
|
||||
has_other_signatures.push(i + 1);
|
||||
}
|
||||
}
|
||||
|
||||
if !has_own_signature {
|
||||
println!("❌ CORRUPTION: Document {} missing its own signature!", doc_num);
|
||||
corruption_found = true;
|
||||
}
|
||||
|
||||
if !has_other_signatures.is_empty() {
|
||||
println!("❌ CORRUPTION: Document {} contains signatures from documents: {:?}", doc_num, has_other_signatures);
|
||||
corruption_found = true;
|
||||
}
|
||||
|
||||
if has_own_signature && has_other_signatures.is_empty() {
|
||||
println!("✅ Document {} has correct content", doc_num);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if corruption_found {
|
||||
panic!("🚨 CONTENT CORRUPTION DETECTED in high-volume test!");
|
||||
} else {
|
||||
println!("\n🎉 High-volume test passed - no corruption detected!");
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_rapid_sequential_uploads() {
|
||||
println!("🧪 Testing rapid sequential uploads for race conditions");
|
||||
|
||||
let mut client = OcrTestClient::new();
|
||||
|
||||
if let Err(e) = client.check_server_health().await {
|
||||
panic!("Server not running at {}: {}", BASE_URL, e);
|
||||
}
|
||||
|
||||
let timestamp = std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_millis();
|
||||
let username = format!("rapid_test_{}", timestamp);
|
||||
let email = format!("rapid_{}@test.com", timestamp);
|
||||
|
||||
let _token = client.register_and_login(&username, &email, "testpass123").await
|
||||
.expect("Failed to register and login");
|
||||
|
||||
println!("📤 Uploading documents in rapid sequence...");
|
||||
|
||||
// Upload documents one after another with minimal delay
|
||||
let mut doc_ids = Vec::new();
|
||||
let mut expected_contents = Vec::new();
|
||||
|
||||
for i in 1..=3 {
|
||||
let content = format!("RAPID-TEST-DOCUMENT-{}-{}-UNIQUE-CONTENT", i, timestamp);
|
||||
let filename = format!("rapid_{}.txt", i);
|
||||
|
||||
let (doc_id, expected) = client.upload_document(&content, &filename).await
|
||||
.expect("Failed to upload document");
|
||||
|
||||
doc_ids.push(doc_id);
|
||||
expected_contents.push(expected);
|
||||
|
||||
println!("📄 Uploaded rapid document {}: {}", i, doc_id);
|
||||
|
||||
// Very short delay to create timing pressure
|
||||
sleep(Duration::from_millis(50)).await;
|
||||
}
|
||||
|
||||
println!("⏳ Waiting for all OCR to complete...");
|
||||
|
||||
// Wait for all to complete and check for corruption
|
||||
for (i, doc_id) in doc_ids.iter().enumerate() {
|
||||
let ocr_result = client.wait_for_ocr(*doc_id).await
|
||||
.expect("Failed to wait for OCR");
|
||||
|
||||
let actual_text = ocr_result["ocr_text"].as_str().unwrap_or("");
|
||||
let expected_marker = format!("RAPID-TEST-DOCUMENT-{}", i + 1);
|
||||
|
||||
if !actual_text.contains(&expected_marker) {
|
||||
panic!("🚨 RAPID UPLOAD CORRUPTION: Document {} missing its unique marker '{}'",
|
||||
doc_id, expected_marker);
|
||||
}
|
||||
|
||||
// Check it doesn't contain other documents' markers
|
||||
for j in 1..=3 {
|
||||
if j != (i + 1) {
|
||||
let other_marker = format!("RAPID-TEST-DOCUMENT-{}", j);
|
||||
if actual_text.contains(&other_marker) {
|
||||
panic!("🚨 RAPID UPLOAD CORRUPTION: Document {} contains marker from document {}",
|
||||
doc_id, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
println!("✅ Rapid document {} has correct content", i + 1);
|
||||
}
|
||||
|
||||
println!("🎉 Rapid sequential upload test passed!");
|
||||
}
|
||||
Loading…
Reference in New Issue