From 582617ab88fdc20eef1d0006709e83fa0d52c462 Mon Sep 17 00:00:00 2001 From: perfectra1n Date: Fri, 27 Jun 2025 20:23:59 -0700 Subject: [PATCH 1/4] fix(server/client): fix incorrect OCR measurements --- Cargo.toml | 1 + frontend/src/pages/FailedOcrPage.tsx | 169 +++++++ frontend/src/services/api.ts | 5 + ...20250628000001_backfill_ocr_confidence.sql | 59 +++ src/db/documents.rs | 159 ++++++ src/ocr/enhanced.rs | 20 +- src/routes/documents.rs | 101 +++- src/tests/document_routes_tests.rs | 300 ++++++++++++ src/tests/documents_tests.rs | 394 +++++++++++++++ src/tests/enhanced_ocr_tests.rs | 455 ++++++++++++++++++ src/tests/mod.rs | 1 + ...ion_document_deletion_integration_tests.rs | 271 +++++++++++ 12 files changed, 1926 insertions(+), 9 deletions(-) create mode 100644 migrations/20250628000001_backfill_ocr_confidence.sql create mode 100644 src/tests/enhanced_ocr_tests.rs diff --git a/Cargo.toml b/Cargo.toml index e6a62c2..902f113 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -71,6 +71,7 @@ testcontainers = "0.24" testcontainers-modules = { version = "0.12", features = ["postgres"] } wiremock = "0.6" tokio-test = "0.4" +futures = "0.3" [profile.test] incremental = false diff --git a/frontend/src/pages/FailedOcrPage.tsx b/frontend/src/pages/FailedOcrPage.tsx index 2dad6da..7b33a89 100644 --- a/frontend/src/pages/FailedOcrPage.tsx +++ b/frontend/src/pages/FailedOcrPage.tsx @@ -155,6 +155,11 @@ const FailedOcrPage: React.FC = () => { const [previewData, setPreviewData] = useState(null); const [confirmDeleteOpen, setConfirmDeleteOpen] = useState(false); + // Failed documents deletion state + const [failedDocsLoading, setFailedDocsLoading] = useState(false); + const [failedPreviewData, setFailedPreviewData] = useState(null); + const [confirmDeleteFailedOpen, setConfirmDeleteFailedOpen] = useState(false); + const fetchFailedDocuments = async () => { try { setLoading(true); @@ -308,6 +313,8 @@ const FailedOcrPage: React.FC = () => { fetchDuplicates(); } else if (currentTab === 2) { handlePreviewLowConfidence(); + } else if (currentTab === 3) { + handlePreviewFailedDocuments(); } }; @@ -369,6 +376,51 @@ const FailedOcrPage: React.FC = () => { } }; + // Failed documents handlers + const handlePreviewFailedDocuments = async () => { + try { + setFailedDocsLoading(true); + const response = await documentService.deleteFailedOcr(true); + setFailedPreviewData(response.data); + } catch (error) { + setSnackbar({ + open: true, + message: 'Failed to preview failed documents', + severity: 'error' + }); + } finally { + setFailedDocsLoading(false); + } + }; + + const handleDeleteFailedDocuments = async () => { + try { + setFailedDocsLoading(true); + const response = await documentService.deleteFailedOcr(false); + + setSnackbar({ + open: true, + message: response.data.message, + severity: 'success' + }); + setFailedPreviewData(null); + setConfirmDeleteFailedOpen(false); + + // Refresh failed OCR tab if currently viewing it + if (currentTab === 0) { + fetchFailedDocuments(); + } + } catch (error) { + setSnackbar({ + open: true, + message: 'Failed to delete failed documents', + severity: 'error' + }); + } finally { + setFailedDocsLoading(false); + } + }; + if (loading && (!documents || documents.length === 0)) { return ( @@ -410,6 +462,11 @@ const FailedOcrPage: React.FC = () => { label={`Low Confidence${previewData ? 
` (${previewData.matched_count})` : ''}`}
            iconPosition="start"
          />
+          }
+            label="Delete Failed"
+            iconPosition="start"
+          />
 
@@ -989,6 +1046,83 @@ const FailedOcrPage: React.FC = () => {
         )}
 
+      {/* Delete Failed Documents Tab Content */}
+      {currentTab === 3 && (
+        <>
+          
+            Delete Failed OCR Documents
+          
+            This tool allows you to delete all documents where OCR processing failed completely.
+            This includes documents with NULL confidence values or explicit failure status.
+            Use the preview feature first to see what documents would be affected before deleting.
+          
+
+
+
+
+
+
+
+
+
+
+
+
+          {/* Preview Results */}
+          {failedPreviewData && (
+            
+              Preview Results
+            
+             0 ? 'error.main' : 'success.main'}>
+              {failedPreviewData.message}
+            
+            {failedPreviewData.matched_count > 0 && (
+              
+                Document IDs that would be deleted:
+              
+                {failedPreviewData.document_ids.slice(0, 10).join(', ')}
+                {failedPreviewData.document_ids.length > 10 && ` ... and ${failedPreviewData.document_ids.length - 10} more`}
+              
+            )}
+          )}
+
+          {/* Loading State */}
+          {failedDocsLoading && !failedPreviewData && (
+            
+              Processing request...
+            
+          )}
+        
+      )}
+
       {/* Confirmation Dialog */}
       {
+
+      {/* Confirmation Dialog for Failed Documents */}
+       setConfirmDeleteFailedOpen(false)}
+        maxWidth="sm"
+        fullWidth
+      >
+        
+          Confirm Failed Document Deletion
+        
+          
+            Are you sure you want to delete {failedPreviewData?.matched_count || 0} documents with failed OCR processing?
+          
+          
+            This action cannot be undone. The documents and their files will be permanently deleted.
+          
+        
+      
+
       {/* Document Details Dialog */}
diff --git a/frontend/src/services/api.ts b/frontend/src/services/api.ts
--- a/frontend/src/services/api.ts
+++ b/frontend/src/services/api.ts
+  deleteFailedOcr: (previewOnly: boolean = false) => {
+    return api.post('/documents/delete-failed-ocr', {
+      preview_only: previewOnly
+    })
+  },
 }
 
 export interface OcrStatusResponse {
diff --git a/migrations/20250628000001_backfill_ocr_confidence.sql b/migrations/20250628000001_backfill_ocr_confidence.sql
new file mode 100644
index 0000000..0005371
--- /dev/null
+++ b/migrations/20250628000001_backfill_ocr_confidence.sql
@@ -0,0 +1,59 @@
+-- Backfill OCR confidence scores for existing documents
+-- Since OCR confidence was previously hardcoded to 85%, we need to recalculate
+-- actual confidence for documents that currently have this placeholder value
+
+-- First, identify documents that likely have placeholder confidence
+-- (85% exactly, which was the hardcoded value)
+CREATE TEMP TABLE documents_to_update AS
+SELECT id, ocr_text, ocr_status
+FROM documents
+WHERE ocr_confidence = 85.0
+  AND ocr_status = 'completed'
+  AND ocr_text IS NOT NULL
+  AND length(trim(ocr_text)) > 0;
+
+-- For now, estimate confidence from text quality metrics.
+-- This is a rough approximation until OCR can be re-run with actual confidence.
+UPDATE documents
+SET ocr_confidence = CASE
+    -- High quality text: good length, reasonable character distribution
+    WHEN length(trim(ocr_text)) > 1000
+         AND (length(ocr_text) - length(replace(replace(ocr_text, ' ', ''), chr(10), ''))) * 100.0 / length(ocr_text) > 10.0 -- > 10% whitespace
+         AND length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 70.0 -- > 70% non-whitespace chars
+    THEN 90.0 + (random() * 8.0) -- 90-98%
+
+    -- Medium quality text: decent length, some structure
+    WHEN length(trim(ocr_text)) > 100
+         AND (length(ocr_text) - length(replace(replace(ocr_text, ' ', ''), chr(10), ''))) * 100.0 / length(ocr_text) > 5.0 -- > 5% whitespace
+         AND length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 50.0 -- > 50% non-whitespace chars
+    THEN 70.0 + (random() * 15.0) -- 70-85%
+
+    -- Low quality text: short or poor structure
+    WHEN length(trim(ocr_text)) > 10
+         AND length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 30.0 -- > 30% non-whitespace chars
+    THEN 40.0 + (random() * 25.0) -- 40-65%
+
+    -- Very poor quality: very short or mostly garbage
+    ELSE 20.0 + (random() * 15.0) -- 20-35%
+END
+WHERE id IN (SELECT id FROM documents_to_update);
+
+-- Record the estimation caveat in the column comment
+COMMENT ON COLUMN documents.ocr_confidence IS 'OCR confidence percentage (0-100). Values may be estimated for documents processed before real confidence calculation was implemented.';
+
+-- Log the update
+DO $$
+DECLARE
+    updated_count INTEGER;
+BEGIN
+    SELECT COUNT(*) INTO updated_count FROM documents_to_update;
+    RAISE NOTICE 'Backfilled OCR confidence for % documents that had placeholder 85%% confidence', updated_count;
+END $$;
+
+-- Clean up
+DROP TABLE documents_to_update;
+
+-- Create an index to help with confidence-based queries
+CREATE INDEX IF NOT EXISTS idx_documents_ocr_confidence_range
+ON documents(ocr_confidence)
+WHERE ocr_confidence IS NOT NULL;
\ No newline at end of file
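The CASE expression above is the entire backfill heuristic: band membership is decided by trimmed length, whitespace share, and non-whitespace share, and a random offset inside the band stands in for a real confidence score. For reference, a minimal Rust sketch of the same banding (deterministic `(min, max)` band bounds instead of `random()`; the function name and return shape are illustrative, not part of this patch):

```rust
/// Mirrors the migration's quality bands. Whitespace is counted roughly the
/// way the SQL does it via nested replace(): spaces, LF, and CR.
fn estimated_confidence_band(text: &str) -> (f32, f32) {
    let total = text.len().max(1) as f32;
    let ws = text.chars().filter(|c| matches!(c, ' ' | '\n' | '\r')).count() as f32;
    let ws_pct = ws * 100.0 / total;
    let non_ws_pct = (total - ws) * 100.0 / total;
    let trimmed_len = text.trim().len();

    if trimmed_len > 1000 && ws_pct > 10.0 && non_ws_pct > 70.0 {
        (90.0, 98.0) // high quality
    } else if trimmed_len > 100 && ws_pct > 5.0 && non_ws_pct > 50.0 {
        (70.0, 85.0) // medium quality
    } else if trimmed_len > 10 && non_ws_pct > 30.0 {
        (40.0, 65.0) // low quality
    } else {
        (20.0, 35.0) // very poor quality
    }
}
```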
diff --git a/src/db/documents.rs b/src/db/documents.rs
index 3f7294b..5217d97 100644
--- a/src/db/documents.rs
+++ b/src/db/documents.rs
@@ -1586,6 +1586,165 @@ impl Database {
         Ok(documents)
     }
 
+    /// Find documents with failed OCR processing
+    pub async fn find_failed_ocr_documents(&self, user_id: uuid::Uuid, user_role: crate::models::UserRole) -> Result<Vec<Document>> {
+        let documents = if user_role == crate::models::UserRole::Admin {
+            let rows = sqlx::query(
+                r#"
+                SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
+                FROM documents
+                WHERE ocr_status = 'failed' OR (ocr_confidence IS NULL AND ocr_status != 'pending' AND ocr_status != 'processing')
+                ORDER BY created_at DESC
+                "#,
+            )
+            .fetch_all(&self.pool)
+            .await?;
+
+            rows.into_iter().map(|r| Document {
+                id: r.get("id"),
+                filename: r.get("filename"),
+                original_filename: r.get("original_filename"),
+                file_path: r.get("file_path"),
+                file_size: r.get("file_size"),
+                mime_type: r.get("mime_type"),
+                content: r.get("content"),
+                ocr_text: r.get("ocr_text"),
+                ocr_confidence: r.get("ocr_confidence"),
+                ocr_word_count: r.get("ocr_word_count"),
+                ocr_processing_time_ms: r.get("ocr_processing_time_ms"),
+                ocr_status: r.get("ocr_status"),
+                ocr_error: r.get("ocr_error"),
+                ocr_completed_at: r.get("ocr_completed_at"),
+                tags: r.get("tags"),
+                created_at: r.get("created_at"),
+                updated_at: r.get("updated_at"),
+                user_id: r.get("user_id"),
+                file_hash: r.get("file_hash"),
+            }).collect()
+        } else {
+            let rows = sqlx::query(
+                r#"
+                SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
+                FROM documents
+                WHERE (ocr_status = 'failed' OR (ocr_confidence IS NULL AND ocr_status != 'pending' AND ocr_status != 'processing')) AND user_id = $1
+                ORDER BY created_at DESC
+                "#,
+            )
+            .bind(user_id)
+            .fetch_all(&self.pool)
+            .await?;
+
+            rows.into_iter().map(|r| Document {
+                id: r.get("id"),
+                filename: r.get("filename"),
+                original_filename: r.get("original_filename"),
+                file_path: r.get("file_path"),
+                file_size: r.get("file_size"),
+                mime_type: r.get("mime_type"),
+                content: r.get("content"),
+                ocr_text: r.get("ocr_text"),
+                ocr_confidence: r.get("ocr_confidence"),
+                ocr_word_count: r.get("ocr_word_count"),
+                ocr_processing_time_ms: r.get("ocr_processing_time_ms"),
+                ocr_status: r.get("ocr_status"),
+                ocr_error: r.get("ocr_error"),
+                ocr_completed_at: r.get("ocr_completed_at"),
+                tags: r.get("tags"),
+                created_at: r.get("created_at"),
+                updated_at: r.get("updated_at"),
+                user_id: r.get("user_id"),
+                file_hash: r.get("file_hash"),
+            }).collect()
+        };
+
+        Ok(documents)
+    }
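Note the failure criterion this query encodes: a document counts as failed if `ocr_status = 'failed'`, or if its confidence is NULL while the status is past the queue. Because `NULL != 'pending'` evaluates to NULL in SQL, rows with a NULL status fall out of the second disjunct entirely. A small in-memory mirror of the predicate (the helper name is illustrative, not part of this patch):

```rust
/// In-memory mirror of the WHERE clause above, including its SQL
/// three-valued logic: a NULL ocr_status matches neither disjunct.
fn is_failed_ocr(ocr_status: Option<&str>, ocr_confidence: Option<f32>) -> bool {
    match ocr_status {
        Some("failed") => true,                        // explicit failure
        Some("pending") | Some("processing") => false, // still in the queue
        Some(_) => ocr_confidence.is_none(),           // e.g. 'completed' with NULL confidence
        None => false,                                 // NULL status: excluded by SQL NULL semantics
    }
}
```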
r.get("file_size"), + mime_type: r.get("mime_type"), + content: r.get("content"), + ocr_text: r.get("ocr_text"), + ocr_confidence: r.get("ocr_confidence"), + ocr_word_count: r.get("ocr_word_count"), + ocr_processing_time_ms: r.get("ocr_processing_time_ms"), + ocr_status: r.get("ocr_status"), + ocr_error: r.get("ocr_error"), + ocr_completed_at: r.get("ocr_completed_at"), + tags: r.get("tags"), + created_at: r.get("created_at"), + updated_at: r.get("updated_at"), + user_id: r.get("user_id"), + file_hash: r.get("file_hash"), + }).collect() + }; + + Ok(documents) + } + + /// Find documents with low confidence or failed OCR (combined) + pub async fn find_low_confidence_and_failed_documents(&self, max_confidence: f32, user_id: uuid::Uuid, user_role: crate::models::UserRole) -> Result> { + let documents = if user_role == crate::models::UserRole::Admin { + let rows = sqlx::query( + r#" + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash + FROM documents + WHERE (ocr_confidence IS NOT NULL AND ocr_confidence < $1) + OR ocr_status = 'failed' + OR (ocr_confidence IS NULL AND ocr_status != 'pending' AND ocr_status != 'processing') + ORDER BY + CASE WHEN ocr_confidence IS NOT NULL THEN ocr_confidence ELSE -1 END ASC, + created_at DESC + "#, + ) + .bind(max_confidence) + .fetch_all(&self.pool) + .await?; + + rows.into_iter().map(|r| Document { + id: r.get("id"), + filename: r.get("filename"), + original_filename: r.get("original_filename"), + file_path: r.get("file_path"), + file_size: r.get("file_size"), + mime_type: r.get("mime_type"), + content: r.get("content"), + ocr_text: r.get("ocr_text"), + ocr_confidence: r.get("ocr_confidence"), + ocr_word_count: r.get("ocr_word_count"), + ocr_processing_time_ms: r.get("ocr_processing_time_ms"), + ocr_status: r.get("ocr_status"), + ocr_error: r.get("ocr_error"), + ocr_completed_at: r.get("ocr_completed_at"), + tags: r.get("tags"), + created_at: r.get("created_at"), + updated_at: r.get("updated_at"), + user_id: r.get("user_id"), + file_hash: r.get("file_hash"), + }).collect() + } else { + let rows = sqlx::query( + r#" + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash + FROM documents + WHERE ((ocr_confidence IS NOT NULL AND ocr_confidence < $1) + OR ocr_status = 'failed' + OR (ocr_confidence IS NULL AND ocr_status != 'pending' AND ocr_status != 'processing')) + AND user_id = $2 + ORDER BY + CASE WHEN ocr_confidence IS NOT NULL THEN ocr_confidence ELSE -1 END ASC, + created_at DESC + "#, + ) + .bind(max_confidence) + .bind(user_id) + .fetch_all(&self.pool) + .await?; + + rows.into_iter().map(|r| Document { + id: r.get("id"), + filename: r.get("filename"), + original_filename: r.get("original_filename"), + file_path: r.get("file_path"), + file_size: r.get("file_size"), + mime_type: r.get("mime_type"), + content: r.get("content"), + ocr_text: r.get("ocr_text"), + ocr_confidence: r.get("ocr_confidence"), + ocr_word_count: r.get("ocr_word_count"), + ocr_processing_time_ms: r.get("ocr_processing_time_ms"), + ocr_status: r.get("ocr_status"), + ocr_error: r.get("ocr_error"), + ocr_completed_at: r.get("ocr_completed_at"), + tags: r.get("tags"), + created_at: r.get("created_at"), + updated_at: 
r.get("updated_at"), + user_id: r.get("user_id"), + file_hash: r.get("file_hash"), + }).collect() + }; + + Ok(documents) + } + pub async fn count_documents_for_source(&self, source_id: Uuid) -> Result<(i64, i64)> { let row = sqlx::query( r#" diff --git a/src/ocr/enhanced.rs b/src/ocr/enhanced.rs index 87e690d..4531f7d 100644 --- a/src/ocr/enhanced.rs +++ b/src/ocr/enhanced.rs @@ -295,15 +295,21 @@ impl EnhancedOcrService { Ok(tesseract) } - /// Calculate overall confidence score + /// Calculate overall confidence score using Tesseract's mean confidence #[cfg(feature = "ocr")] - fn calculate_overall_confidence(&self, _tesseract: &mut Tesseract) -> Result { - // Note: get_word_confidences may not be available in current tesseract crate version - // For now, we'll estimate confidence based on text quality - // This can be enhanced when the API is available or with alternative methods + fn calculate_overall_confidence(&self, tesseract: &mut Tesseract) -> Result { + // Use Tesseract's built-in mean confidence calculation + let confidence = tesseract.mean_text_conf(); - // Return a reasonable default confidence for now - Ok(85.0) + // Convert from i32 to f32 and ensure it's within valid range + let confidence_f32 = confidence as f32; + + // Clamp confidence to valid range (0.0 to 100.0) + let clamped_confidence = confidence_f32.max(0.0).min(100.0); + + debug!("Tesseract confidence: {} -> {:.1}%", confidence, clamped_confidence); + + Ok(clamped_confidence) } /// Detect and correct image orientation diff --git a/src/routes/documents.rs b/src/routes/documents.rs index 010cc1b..048899b 100644 --- a/src/routes/documents.rs +++ b/src/routes/documents.rs @@ -53,6 +53,7 @@ pub fn router() -> Router> { .route("/failed-ocr", get(get_failed_ocr_documents)) .route("/duplicates", get(get_user_duplicates)) .route("/delete-low-confidence", post(delete_low_confidence_documents)) + .route("/delete-failed-ocr", post(delete_failed_ocr_documents)) } #[utoipa::path( @@ -1055,10 +1056,10 @@ pub async fn delete_low_confidence_documents( let is_preview = request.preview_only.unwrap_or(false); - // Find documents with confidence below threshold + // Find documents with confidence below threshold OR failed OCR let matched_documents = state .db - .find_documents_by_confidence_threshold(request.max_confidence, auth_user.user.id, auth_user.user.role) + .find_low_confidence_and_failed_documents(request.max_confidence, auth_user.user.id, auth_user.user.role) .await .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; @@ -1136,4 +1137,100 @@ pub async fn delete_low_confidence_documents( "ignored_file_creation_failures": ignored_file_creation_failures, "deleted_document_ids": deleted_documents.iter().map(|d| d.id).collect::>() }))) +} + +/// Delete all documents with failed OCR processing +pub async fn delete_failed_ocr_documents( + State(state): State>, + auth_user: AuthUser, + Json(request): Json, +) -> Result, StatusCode> { + let is_preview = request.get("preview_only").and_then(|v| v.as_bool()).unwrap_or(false); + + // Find documents with failed OCR + let matched_documents = state + .db + .find_failed_ocr_documents(auth_user.user.id, auth_user.user.role) + .await + .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; + + let matched_count = matched_documents.len(); + + if is_preview { + return Ok(Json(serde_json::json!({ + "success": true, + "message": format!("Found {} documents with failed OCR processing", matched_count), + "matched_count": matched_count, + "preview": true, + "document_ids": matched_documents.iter().map(|d| 
diff --git a/src/routes/documents.rs b/src/routes/documents.rs
index 010cc1b..048899b 100644
--- a/src/routes/documents.rs
+++ b/src/routes/documents.rs
@@ -53,6 +53,7 @@ pub fn router() -> Router<Arc<AppState>> {
         .route("/failed-ocr", get(get_failed_ocr_documents))
         .route("/duplicates", get(get_user_duplicates))
         .route("/delete-low-confidence", post(delete_low_confidence_documents))
+        .route("/delete-failed-ocr", post(delete_failed_ocr_documents))
 }
 
 #[utoipa::path(
@@ -1055,10 +1056,10 @@ pub async fn delete_low_confidence_documents(
 
     let is_preview = request.preview_only.unwrap_or(false);
 
-    // Find documents with confidence below threshold
+    // Find documents with confidence below threshold OR failed OCR
     let matched_documents = state
         .db
-        .find_documents_by_confidence_threshold(request.max_confidence, auth_user.user.id, auth_user.user.role)
+        .find_low_confidence_and_failed_documents(request.max_confidence, auth_user.user.id, auth_user.user.role)
         .await
         .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
 
@@ -1136,4 +1137,100 @@ pub async fn delete_low_confidence_documents(
         "ignored_file_creation_failures": ignored_file_creation_failures,
         "deleted_document_ids": deleted_documents.iter().map(|d| d.id).collect::<Vec<Uuid>>()
     })))
+}
+
+/// Delete all documents with failed OCR processing
+pub async fn delete_failed_ocr_documents(
+    State(state): State<Arc<AppState>>,
+    auth_user: AuthUser,
+    Json(request): Json<serde_json::Value>,
+) -> Result<Json<serde_json::Value>, StatusCode> {
+    let is_preview = request.get("preview_only").and_then(|v| v.as_bool()).unwrap_or(false);
+
+    // Find documents with failed OCR
+    let matched_documents = state
+        .db
+        .find_failed_ocr_documents(auth_user.user.id, auth_user.user.role)
+        .await
+        .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
+
+    let matched_count = matched_documents.len();
+
+    if is_preview {
+        return Ok(Json(serde_json::json!({
+            "success": true,
+            "message": format!("Found {} documents with failed OCR processing", matched_count),
+            "matched_count": matched_count,
+            "preview": true,
+            "document_ids": matched_documents.iter().map(|d| d.id).collect::<Vec<Uuid>>()
+        })));
+    }
+
+    if matched_documents.is_empty() {
+        return Ok(Json(serde_json::json!({
+            "success": true,
+            "message": "No documents found with failed OCR processing",
+            "deleted_count": 0
+        })));
+    }
+
+    // Extract document IDs for bulk deletion
+    let document_ids: Vec<Uuid> = matched_documents.iter().map(|d| d.id).collect();
+
+    // Use existing bulk delete logic
+    let deleted_documents = state
+        .db
+        .bulk_delete_documents(&document_ids, auth_user.user.id, auth_user.user.role)
+        .await
+        .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
+
+    // Create ignored file records for all successfully deleted documents
+    let mut ignored_file_creation_failures = 0;
+    for document in &deleted_documents {
+        let reason = if let Some(ref error) = document.ocr_error {
+            format!("deleted due to failed OCR processing: {}", error)
+        } else {
+            "deleted due to failed OCR processing".to_string()
+        };
+
+        if let Err(e) = crate::db::ignored_files::create_ignored_file_from_document(
+            state.db.get_pool(),
+            document.id,
+            auth_user.user.id,
+            Some(reason),
+            None,
+            None,
+            None,
+        ).await {
+            ignored_file_creation_failures += 1;
+            tracing::warn!("Failed to create ignored file record for document {}: {}", document.id, e);
+        }
+    }
+
+    let file_service = FileService::new(state.config.upload_path.clone());
+    let mut successful_file_deletions = 0;
+    let mut failed_file_deletions = 0;
+
+    for document in &deleted_documents {
+        match file_service.delete_document_files(document).await {
+            Ok(_) => successful_file_deletions += 1,
+            Err(e) => {
+                failed_file_deletions += 1;
+                tracing::warn!("Failed to delete files for document {}: {}", document.id, e);
+            }
+        }
+    }
+
+    let deleted_count = deleted_documents.len();
+
+    Ok(Json(serde_json::json!({
+        "success": true,
+        "message": format!("Successfully deleted {} documents with failed OCR processing", deleted_count),
+        "deleted_count": deleted_count,
+        "matched_count": matched_count,
+        "successful_file_deletions": successful_file_deletions,
+        "failed_file_deletions": failed_file_deletions,
+        "ignored_file_creation_failures": ignored_file_creation_failures,
+        "deleted_document_ids": deleted_documents.iter().map(|d| d.id).collect::<Vec<Uuid>>()
+    })))
 }
\ No newline at end of file
diff --git a/src/tests/document_routes_tests.rs b/src/tests/document_routes_tests.rs
index 12ae45d..7627db8 100644
--- a/src/tests/document_routes_tests.rs
+++ b/src/tests/document_routes_tests.rs
@@ -633,4 +633,304 @@ mod document_routes_deletion_tests {
         // This should result in zero matched documents
     }
 }
+
+    #[cfg(test)]
+    mod delete_failed_ocr_tests {
+        use super::*;
+        use serde_json::json;
+
+        #[test]
+        fn test_delete_failed_ocr_request_serialization() {
+            // Test preview mode
+            let preview_request = json!({
+                "preview_only": true
+            });
+
+            let parsed: serde_json::Value = serde_json::from_value(preview_request).unwrap();
+            assert_eq!(parsed["preview_only"], true);
+
+            // Test delete mode
+            let delete_request = json!({
+                "preview_only": false
+            });
+
+            let parsed: serde_json::Value = serde_json::from_value(delete_request).unwrap();
+            assert_eq!(parsed["preview_only"], false);
+
+            // Test empty request (should default to preview_only: false)
+            let empty_request = json!({});
+
+            let parsed: serde_json::Value = serde_json::from_value(empty_request).unwrap();
+            assert!(parsed.get("preview_only").is_none() || parsed["preview_only"] == false);
+        }
+
+        #[test]
+        fn test_delete_failed_ocr_user_authorization() {
+            let admin_user = create_test_user(UserRole::Admin);
+            let regular_user =
create_test_user(UserRole::User); + + // Both admins and regular users should be able to delete their own failed documents + assert_eq!(admin_user.role, UserRole::Admin); + assert_eq!(regular_user.role, UserRole::User); + + // Admin should be able to see all failed documents + // Regular user should only see their own failed documents + // This logic would be tested in the actual endpoint implementation + } + + #[test] + fn test_failed_document_criteria() { + let user_id = Uuid::new_v4(); + + // Test document with failed OCR status + let mut failed_doc = create_test_document(user_id); + failed_doc.ocr_status = Some("failed".to_string()); + failed_doc.ocr_confidence = None; + failed_doc.ocr_error = Some("OCR processing failed".to_string()); + + // Should be included in failed document deletion + assert_eq!(failed_doc.ocr_status, Some("failed".to_string())); + assert!(failed_doc.ocr_confidence.is_none()); + + // Test document with NULL confidence but completed status + let mut null_confidence_doc = create_test_document(user_id); + null_confidence_doc.ocr_status = Some("completed".to_string()); + null_confidence_doc.ocr_confidence = None; + null_confidence_doc.ocr_text = Some("Text but no confidence".to_string()); + + // Should be included in failed document deletion (NULL confidence indicates failure) + assert_eq!(null_confidence_doc.ocr_status, Some("completed".to_string())); + assert!(null_confidence_doc.ocr_confidence.is_none()); + + // Test document with successful OCR + let mut success_doc = create_test_document(user_id); + success_doc.ocr_status = Some("completed".to_string()); + success_doc.ocr_confidence = Some(85.0); + success_doc.ocr_text = Some("Successfully extracted text".to_string()); + + // Should NOT be included in failed document deletion + assert_eq!(success_doc.ocr_status, Some("completed".to_string())); + assert!(success_doc.ocr_confidence.is_some()); + + // Test document with pending status + let mut pending_doc = create_test_document(user_id); + pending_doc.ocr_status = Some("pending".to_string()); + pending_doc.ocr_confidence = None; + + // Should NOT be included in failed document deletion (still processing) + assert_eq!(pending_doc.ocr_status, Some("pending".to_string())); + + // Test document with processing status + let mut processing_doc = create_test_document(user_id); + processing_doc.ocr_status = Some("processing".to_string()); + processing_doc.ocr_confidence = None; + + // Should NOT be included in failed document deletion (still processing) + assert_eq!(processing_doc.ocr_status, Some("processing".to_string())); + } + + #[test] + fn test_delete_failed_ocr_response_format() { + // Test preview response format + let preview_response = json!({ + "success": true, + "message": "Found 5 documents with failed OCR processing", + "matched_count": 5, + "preview": true, + "document_ids": ["id1", "id2", "id3", "id4", "id5"] + }); + + assert_eq!(preview_response["success"], true); + assert_eq!(preview_response["matched_count"], 5); + assert_eq!(preview_response["preview"], true); + assert!(preview_response["document_ids"].is_array()); + + // Test delete response format + let delete_response = json!({ + "success": true, + "message": "Successfully deleted 3 documents with failed OCR processing", + "deleted_count": 3, + "matched_count": 3, + "successful_file_deletions": 3, + "failed_file_deletions": 0, + "ignored_file_creation_failures": 0, + "deleted_document_ids": ["id1", "id2", "id3"] + }); + + assert_eq!(delete_response["success"], true); + 
assert_eq!(delete_response["deleted_count"], 3); + assert_eq!(delete_response["matched_count"], 3); + assert!(delete_response["deleted_document_ids"].is_array()); + assert!(delete_response.get("preview").is_none()); // Should not have preview flag in delete response + + // Test no documents found response + let no_docs_response = json!({ + "success": true, + "message": "No documents found with failed OCR processing", + "deleted_count": 0 + }); + + assert_eq!(no_docs_response["success"], true); + assert_eq!(no_docs_response["deleted_count"], 0); + } + + #[test] + fn test_delete_failed_ocr_error_scenarios() { + // Test with no failed documents + let no_failed_docs_request = json!({ + "preview_only": true + }); + + // Should return success with 0 matched count + // This would be tested in integration tests with actual database + + // Test with file deletion failures + let file_deletion_error = json!({ + "success": true, + "message": "Successfully deleted 2 documents with failed OCR processing", + "deleted_count": 2, + "matched_count": 2, + "successful_file_deletions": 1, + "failed_file_deletions": 1, + "ignored_file_creation_failures": 0, + "deleted_document_ids": ["id1", "id2"] + }); + + // Should still report success but indicate file deletion issues + assert_eq!(file_deletion_error["success"], true); + assert_eq!(file_deletion_error["failed_file_deletions"], 1); + + // Test with ignored file creation failures + let ignored_file_error = json!({ + "success": true, + "message": "Successfully deleted 2 documents with failed OCR processing", + "deleted_count": 2, + "matched_count": 2, + "successful_file_deletions": 2, + "failed_file_deletions": 0, + "ignored_file_creation_failures": 1, + "deleted_document_ids": ["id1", "id2"] + }); + + assert_eq!(ignored_file_error["success"], true); + assert_eq!(ignored_file_error["ignored_file_creation_failures"], 1); + } + + #[test] + fn test_delete_failed_ocr_failure_reason_handling() { + let user_id = Uuid::new_v4(); + + // Test document with specific failure reason + let mut ocr_timeout_doc = create_test_document(user_id); + ocr_timeout_doc.ocr_status = Some("failed".to_string()); + ocr_timeout_doc.ocr_error = Some("OCR processing timed out after 2 minutes".to_string()); + + // Test document with corruption error + let mut corruption_doc = create_test_document(user_id); + corruption_doc.ocr_status = Some("failed".to_string()); + corruption_doc.ocr_error = Some("Invalid image format - file appears corrupted".to_string()); + + // Test document with font encoding error + let mut font_error_doc = create_test_document(user_id); + font_error_doc.ocr_status = Some("failed".to_string()); + font_error_doc.ocr_error = Some("PDF text extraction failed due to font encoding issues".to_string()); + + // All should be valid candidates for deletion + assert!(ocr_timeout_doc.ocr_error.is_some()); + assert!(corruption_doc.ocr_error.is_some()); + assert!(font_error_doc.ocr_error.is_some()); + + // The deletion should create appropriate ignored file records with the error reasons + } + + #[test] + fn test_delete_failed_ocr_ignored_file_creation() { + // Test that deleted failed documents create proper ignored file records + let user_id = Uuid::new_v4(); + + let mut failed_doc = create_test_document(user_id); + failed_doc.ocr_status = Some("failed".to_string()); + failed_doc.ocr_error = Some("OCR processing failed due to corrupted image".to_string()); + + // Expected ignored file reason should include the error + let expected_reason = "deleted due to failed OCR processing: 
OCR processing failed due to corrupted image"; + + // In the actual implementation, this would be tested by verifying the ignored file record + assert!(failed_doc.ocr_error.is_some()); + + // Test document with no specific error + let mut failed_no_error_doc = create_test_document(user_id); + failed_no_error_doc.ocr_status = Some("failed".to_string()); + failed_no_error_doc.ocr_error = None; + + // Should use generic reason + let expected_generic_reason = "deleted due to failed OCR processing"; + + // Both should result in appropriate ignored file records + assert_eq!(failed_doc.ocr_status, Some("failed".to_string())); + assert_eq!(failed_no_error_doc.ocr_status, Some("failed".to_string())); + } + + #[test] + fn test_delete_failed_ocr_vs_low_confidence_distinction() { + let user_id = Uuid::new_v4(); + + // Failed OCR document (should be in failed deletion, not low confidence) + let mut failed_doc = create_test_document(user_id); + failed_doc.ocr_status = Some("failed".to_string()); + failed_doc.ocr_confidence = None; + + // Low confidence document (should be in low confidence deletion, not failed) + let mut low_confidence_doc = create_test_document(user_id); + low_confidence_doc.ocr_status = Some("completed".to_string()); + low_confidence_doc.ocr_confidence = Some(25.0); + + // NULL confidence but completed (edge case - should be in failed deletion) + let mut null_confidence_doc = create_test_document(user_id); + null_confidence_doc.ocr_status = Some("completed".to_string()); + null_confidence_doc.ocr_confidence = None; + + // High confidence document (should be in neither) + let mut high_confidence_doc = create_test_document(user_id); + high_confidence_doc.ocr_status = Some("completed".to_string()); + high_confidence_doc.ocr_confidence = Some(95.0); + + // Verify the logic for each type + assert_eq!(failed_doc.ocr_status, Some("failed".to_string())); + assert!(failed_doc.ocr_confidence.is_none()); + + assert_eq!(low_confidence_doc.ocr_status, Some("completed".to_string())); + assert!(low_confidence_doc.ocr_confidence.unwrap() < 50.0); + + assert_eq!(null_confidence_doc.ocr_status, Some("completed".to_string())); + assert!(null_confidence_doc.ocr_confidence.is_none()); + + assert_eq!(high_confidence_doc.ocr_status, Some("completed".to_string())); + assert!(high_confidence_doc.ocr_confidence.unwrap() > 50.0); + } + + #[test] + fn test_delete_failed_ocr_endpoint_path() { + // Test that the endpoint path is correct + let endpoint_path = "/api/documents/delete-failed-ocr"; + + // This would be used in integration tests + assert!(endpoint_path.contains("delete-failed-ocr")); + assert!(endpoint_path.starts_with("/api/documents/")); + } + + #[test] + fn test_delete_failed_ocr_http_methods() { + // The endpoint should only accept POST requests + // GET, PUT, DELETE should not be allowed + + // This would be tested in integration tests with actual HTTP requests + let allowed_method = "POST"; + let disallowed_methods = vec!["GET", "PUT", "DELETE", "PATCH"]; + + assert_eq!(allowed_method, "POST"); + assert!(disallowed_methods.contains(&"GET")); + assert!(disallowed_methods.contains(&"DELETE")); + } + } } \ No newline at end of file diff --git a/src/tests/documents_tests.rs b/src/tests/documents_tests.rs index 0291c29..17b1050 100644 --- a/src/tests/documents_tests.rs +++ b/src/tests/documents_tests.rs @@ -1796,4 +1796,398 @@ mod deletion_error_handling_tests { } } } + + #[tokio::test] + async fn test_find_failed_ocr_documents() { + use testcontainers::{runners::AsyncRunner}; + use 
testcontainers_modules::postgres::Postgres; + + let postgres_image = Postgres::default(); + let container = postgres_image.start().await.expect("Failed to start postgres container"); + let port = container.get_host_port_ipv4(5432).await.expect("Failed to get postgres port"); + + // Use TEST_DATABASE_URL if available, otherwise use the container + let connection_string = std::env::var("TEST_DATABASE_URL") + .unwrap_or_else(|_| format!("postgres://postgres:postgres@127.0.0.1:{}/postgres", port)); + let database = Database::new(&connection_string).await.unwrap(); + database.migrate().await.unwrap(); + let user_id = Uuid::new_v4(); + let admin_user_id = Uuid::new_v4(); + + // Create test documents with different OCR statuses + let mut success_doc = create_test_document(user_id); + success_doc.ocr_status = Some("completed".to_string()); + success_doc.ocr_confidence = Some(85.0); + success_doc.ocr_text = Some("Successfully extracted text".to_string()); + + let mut failed_doc = create_test_document(user_id); + failed_doc.ocr_status = Some("failed".to_string()); + failed_doc.ocr_confidence = None; + failed_doc.ocr_text = None; + failed_doc.ocr_error = Some("OCR processing failed due to corrupted image".to_string()); + + let mut null_confidence_doc = create_test_document(user_id); + null_confidence_doc.ocr_status = Some("completed".to_string()); + null_confidence_doc.ocr_confidence = None; // NULL confidence but not failed + null_confidence_doc.ocr_text = Some("Text extracted but no confidence".to_string()); + + let mut pending_doc = create_test_document(user_id); + pending_doc.ocr_status = Some("pending".to_string()); + pending_doc.ocr_confidence = None; + pending_doc.ocr_text = None; + + let mut processing_doc = create_test_document(user_id); + processing_doc.ocr_status = Some("processing".to_string()); + processing_doc.ocr_confidence = None; + processing_doc.ocr_text = None; + + // Different user's failed document + let mut other_user_failed_doc = create_test_document(admin_user_id); + other_user_failed_doc.ocr_status = Some("failed".to_string()); + other_user_failed_doc.ocr_confidence = None; + + // Insert all documents + let success_id = database.create_document(success_doc).await.unwrap().id; + let failed_id = database.create_document(failed_doc).await.unwrap().id; + let null_confidence_id = database.create_document(null_confidence_doc).await.unwrap().id; + let pending_id = database.create_document(pending_doc).await.unwrap().id; + let processing_id = database.create_document(processing_doc).await.unwrap().id; + let other_user_failed_id = database.create_document(other_user_failed_doc).await.unwrap().id; + + // Test as regular user + let failed_docs = database + .find_failed_ocr_documents(user_id, crate::models::UserRole::User) + .await + .unwrap(); + + // Should find: failed_doc and null_confidence_doc (but not pending/processing) + assert_eq!(failed_docs.len(), 2); + let failed_ids: Vec = failed_docs.iter().map(|d| d.id).collect(); + assert!(failed_ids.contains(&failed_id)); + assert!(failed_ids.contains(&null_confidence_id)); + assert!(!failed_ids.contains(&success_id)); + assert!(!failed_ids.contains(&pending_id)); + assert!(!failed_ids.contains(&processing_id)); + assert!(!failed_ids.contains(&other_user_failed_id)); // Different user + + // Test as admin + let admin_failed_docs = database + .find_failed_ocr_documents(admin_user_id, crate::models::UserRole::Admin) + .await + .unwrap(); + + // Should find all failed documents (from all users) + assert!(admin_failed_docs.len() >= 3); // At 
least our 3 failed docs + let admin_failed_ids: Vec = admin_failed_docs.iter().map(|d| d.id).collect(); + assert!(admin_failed_ids.contains(&failed_id)); + assert!(admin_failed_ids.contains(&null_confidence_id)); + assert!(admin_failed_ids.contains(&other_user_failed_id)); + } + + #[tokio::test] + async fn test_find_low_confidence_and_failed_documents() { + use testcontainers::{runners::AsyncRunner}; + use testcontainers_modules::postgres::Postgres; + + let postgres_image = Postgres::default(); + let container = postgres_image.start().await.expect("Failed to start postgres container"); + let port = container.get_host_port_ipv4(5432).await.expect("Failed to get postgres port"); + + // Use TEST_DATABASE_URL if available, otherwise use the container + let connection_string = std::env::var("TEST_DATABASE_URL") + .unwrap_or_else(|_| format!("postgres://postgres:postgres@127.0.0.1:{}/postgres", port)); + let database = Database::new(&connection_string).await.unwrap(); + database.migrate().await.unwrap(); + let user_id = Uuid::new_v4(); + + // Create test documents with different confidence levels + let mut high_confidence_doc = create_test_document(user_id); + high_confidence_doc.ocr_confidence = Some(95.0); + high_confidence_doc.ocr_status = Some("completed".to_string()); + + let mut medium_confidence_doc = create_test_document(user_id); + medium_confidence_doc.ocr_confidence = Some(65.0); + medium_confidence_doc.ocr_status = Some("completed".to_string()); + + let mut low_confidence_doc = create_test_document(user_id); + low_confidence_doc.ocr_confidence = Some(25.0); + low_confidence_doc.ocr_status = Some("completed".to_string()); + + let mut failed_doc = create_test_document(user_id); + failed_doc.ocr_status = Some("failed".to_string()); + failed_doc.ocr_confidence = None; + failed_doc.ocr_error = Some("Processing failed".to_string()); + + let mut null_confidence_doc = create_test_document(user_id); + null_confidence_doc.ocr_status = Some("completed".to_string()); + null_confidence_doc.ocr_confidence = None; + + let mut pending_doc = create_test_document(user_id); + pending_doc.ocr_status = Some("pending".to_string()); + pending_doc.ocr_confidence = None; + + // Insert all documents + let high_id = database.create_document(high_confidence_doc).await.unwrap().id; + let medium_id = database.create_document(medium_confidence_doc).await.unwrap().id; + let low_id = database.create_document(low_confidence_doc).await.unwrap().id; + let failed_id = database.create_document(failed_doc).await.unwrap().id; + let null_confidence_id = database.create_document(null_confidence_doc).await.unwrap().id; + let pending_id = database.create_document(pending_doc).await.unwrap().id; + + // Test with threshold of 50% - should include low confidence, failed, and null confidence + let threshold_50_docs = database + .find_low_confidence_and_failed_documents(50.0, user_id, crate::models::UserRole::User) + .await + .unwrap(); + + assert_eq!(threshold_50_docs.len(), 3); + let threshold_50_ids: Vec = threshold_50_docs.iter().map(|d| d.id).collect(); + assert!(threshold_50_ids.contains(&low_id)); // 25% confidence + assert!(threshold_50_ids.contains(&failed_id)); // failed status + assert!(threshold_50_ids.contains(&null_confidence_id)); // NULL confidence + assert!(!threshold_50_ids.contains(&high_id)); // 95% confidence + assert!(!threshold_50_ids.contains(&medium_id)); // 65% confidence + assert!(!threshold_50_ids.contains(&pending_id)); // pending status + + // Test with threshold of 70% - should include low and medium 
confidence, failed, and null confidence + let threshold_70_docs = database + .find_low_confidence_and_failed_documents(70.0, user_id, crate::models::UserRole::User) + .await + .unwrap(); + + assert_eq!(threshold_70_docs.len(), 4); + let threshold_70_ids: Vec = threshold_70_docs.iter().map(|d| d.id).collect(); + assert!(threshold_70_ids.contains(&low_id)); // 25% confidence + assert!(threshold_70_ids.contains(&medium_id)); // 65% confidence + assert!(threshold_70_ids.contains(&failed_id)); // failed status + assert!(threshold_70_ids.contains(&null_confidence_id)); // NULL confidence + assert!(!threshold_70_ids.contains(&high_id)); // 95% confidence + assert!(!threshold_70_ids.contains(&pending_id)); // pending status + + // Test with threshold of 100% - should include all except pending/processing + let threshold_100_docs = database + .find_low_confidence_and_failed_documents(100.0, user_id, crate::models::UserRole::User) + .await + .unwrap(); + + assert_eq!(threshold_100_docs.len(), 5); + let threshold_100_ids: Vec = threshold_100_docs.iter().map(|d| d.id).collect(); + assert!(threshold_100_ids.contains(&high_id)); // 95% confidence + assert!(threshold_100_ids.contains(&medium_id)); // 65% confidence + assert!(threshold_100_ids.contains(&low_id)); // 25% confidence + assert!(threshold_100_ids.contains(&failed_id)); // failed status + assert!(threshold_100_ids.contains(&null_confidence_id)); // NULL confidence + assert!(!threshold_100_ids.contains(&pending_id)); // pending status + + // Test with threshold of 0% - should only include failed and null confidence + let threshold_0_docs = database + .find_low_confidence_and_failed_documents(0.0, user_id, crate::models::UserRole::User) + .await + .unwrap(); + + assert_eq!(threshold_0_docs.len(), 2); + let threshold_0_ids: Vec = threshold_0_docs.iter().map(|d| d.id).collect(); + assert!(threshold_0_ids.contains(&failed_id)); // failed status + assert!(threshold_0_ids.contains(&null_confidence_id)); // NULL confidence + assert!(!threshold_0_ids.contains(&high_id)); // 95% confidence + assert!(!threshold_0_ids.contains(&medium_id)); // 65% confidence + assert!(!threshold_0_ids.contains(&low_id)); // 25% confidence + assert!(!threshold_0_ids.contains(&pending_id)); // pending status + } + + #[tokio::test] + async fn test_find_documents_by_confidence_threshold_original_behavior() { + use testcontainers::{runners::AsyncRunner}; + use testcontainers_modules::postgres::Postgres; + + let postgres_image = Postgres::default(); + let container = postgres_image.start().await.expect("Failed to start postgres container"); + let port = container.get_host_port_ipv4(5432).await.expect("Failed to get postgres port"); + + // Use TEST_DATABASE_URL if available, otherwise use the container + let connection_string = std::env::var("TEST_DATABASE_URL") + .unwrap_or_else(|_| format!("postgres://postgres:postgres@127.0.0.1:{}/postgres", port)); + let database = Database::new(&connection_string).await.unwrap(); + database.migrate().await.unwrap(); + let user_id = Uuid::new_v4(); + + // Create test documents to verify original behavior is preserved + let mut high_confidence_doc = create_test_document(user_id); + high_confidence_doc.ocr_confidence = Some(90.0); + high_confidence_doc.ocr_status = Some("completed".to_string()); + + let mut low_confidence_doc = create_test_document(user_id); + low_confidence_doc.ocr_confidence = Some(40.0); + low_confidence_doc.ocr_status = Some("completed".to_string()); + + let mut null_confidence_doc = create_test_document(user_id); + 
null_confidence_doc.ocr_confidence = None; + null_confidence_doc.ocr_status = Some("completed".to_string()); + + let mut failed_doc = create_test_document(user_id); + failed_doc.ocr_confidence = None; + failed_doc.ocr_status = Some("failed".to_string()); + + // Insert documents + let high_id = database.create_document(high_confidence_doc).await.unwrap().id; + let low_id = database.create_document(low_confidence_doc).await.unwrap().id; + let null_confidence_id = database.create_document(null_confidence_doc).await.unwrap().id; + let failed_id = database.create_document(failed_doc).await.unwrap().id; + + // Test original method - should only find documents with explicit confidence below threshold + let original_results = database + .find_documents_by_confidence_threshold(50.0, user_id, crate::models::UserRole::User) + .await + .unwrap(); + + // Should only include low_confidence_doc (40%), not NULL confidence or failed docs + assert_eq!(original_results.len(), 1); + assert_eq!(original_results[0].id, low_id); + + let original_ids: Vec = original_results.iter().map(|d| d.id).collect(); + assert!(!original_ids.contains(&high_id)); // 90% > 50% + assert!(!original_ids.contains(&null_confidence_id)); // NULL confidence excluded + assert!(!original_ids.contains(&failed_id)); // NULL confidence excluded + } + + #[tokio::test] + async fn test_confidence_query_ordering() { + use testcontainers::{runners::AsyncRunner}; + use testcontainers_modules::postgres::Postgres; + + let postgres_image = Postgres::default(); + let container = postgres_image.start().await.expect("Failed to start postgres container"); + let port = container.get_host_port_ipv4(5432).await.expect("Failed to get postgres port"); + + // Use TEST_DATABASE_URL if available, otherwise use the container + let connection_string = std::env::var("TEST_DATABASE_URL") + .unwrap_or_else(|_| format!("postgres://postgres:postgres@127.0.0.1:{}/postgres", port)); + let database = Database::new(&connection_string).await.unwrap(); + database.migrate().await.unwrap(); + let user_id = Uuid::new_v4(); + + // Create documents with different confidence levels and statuses + let mut confidence_10_doc = create_test_document(user_id); + confidence_10_doc.ocr_confidence = Some(10.0); + confidence_10_doc.ocr_status = Some("completed".to_string()); + + let mut confidence_30_doc = create_test_document(user_id); + confidence_30_doc.ocr_confidence = Some(30.0); + confidence_30_doc.ocr_status = Some("completed".to_string()); + + let mut failed_doc = create_test_document(user_id); + failed_doc.ocr_confidence = None; + failed_doc.ocr_status = Some("failed".to_string()); + + let mut null_confidence_doc = create_test_document(user_id); + null_confidence_doc.ocr_confidence = None; + null_confidence_doc.ocr_status = Some("completed".to_string()); + + // Insert documents + let id_10 = database.create_document(confidence_10_doc).await.unwrap().id; + let id_30 = database.create_document(confidence_30_doc).await.unwrap().id; + let failed_id = database.create_document(failed_doc).await.unwrap().id; + let null_id = database.create_document(null_confidence_doc).await.unwrap().id; + + // Test ordering in combined query + let results = database + .find_low_confidence_and_failed_documents(50.0, user_id, crate::models::UserRole::User) + .await + .unwrap(); + + assert_eq!(results.len(), 4); + + // Check that documents with actual confidence are ordered by confidence (ascending) + // and NULL confidence documents come first (due to CASE WHEN ordering) + let confidence_values: Vec> = 
results.iter().map(|d| d.ocr_confidence).collect(); + + // First two should be NULL confidence (failed and completed with NULL) + assert!(confidence_values[0].is_none()); + assert!(confidence_values[1].is_none()); + + // Next should be lowest confidence + assert_eq!(confidence_values[2], Some(10.0)); + + // Last should be higher confidence + assert_eq!(confidence_values[3], Some(30.0)); + } + + #[tokio::test] + async fn test_user_isolation_in_confidence_queries() { + use testcontainers::{runners::AsyncRunner}; + use testcontainers_modules::postgres::Postgres; + + let postgres_image = Postgres::default(); + let container = postgres_image.start().await.expect("Failed to start postgres container"); + let port = container.get_host_port_ipv4(5432).await.expect("Failed to get postgres port"); + + // Use TEST_DATABASE_URL if available, otherwise use the container + let connection_string = std::env::var("TEST_DATABASE_URL") + .unwrap_or_else(|_| format!("postgres://postgres:postgres@127.0.0.1:{}/postgres", port)); + let database = Database::new(&connection_string).await.unwrap(); + database.migrate().await.unwrap(); + let user1_id = Uuid::new_v4(); + let user2_id = Uuid::new_v4(); + + // Create documents for user1 + let mut user1_low_doc = create_test_document(user1_id); + user1_low_doc.ocr_confidence = Some(20.0); + + let mut user1_failed_doc = create_test_document(user1_id); + user1_failed_doc.ocr_status = Some("failed".to_string()); + user1_failed_doc.ocr_confidence = None; + + // Create documents for user2 + let mut user2_low_doc = create_test_document(user2_id); + user2_low_doc.ocr_confidence = Some(25.0); + + let mut user2_failed_doc = create_test_document(user2_id); + user2_failed_doc.ocr_status = Some("failed".to_string()); + user2_failed_doc.ocr_confidence = None; + + // Insert documents + let user1_low_id: Uuid = database.create_document(user1_low_doc).await.unwrap().id; + let user1_failed_id: Uuid = database.create_document(user1_failed_doc).await.unwrap().id; + let user2_low_id: Uuid = database.create_document(user2_low_doc).await.unwrap().id; + let user2_failed_id: Uuid = database.create_document(user2_failed_doc).await.unwrap().id; + + // Test user1 can only see their documents + let user1_results = database + .find_low_confidence_and_failed_documents(50.0, user1_id, crate::models::UserRole::User) + .await + .unwrap(); + + assert_eq!(user1_results.len(), 2); + let user1_ids: Vec = user1_results.iter().map(|d| d.id).collect(); + assert!(user1_ids.contains(&user1_low_id)); + assert!(user1_ids.contains(&user1_failed_id)); + assert!(!user1_ids.contains(&user2_low_id)); + assert!(!user1_ids.contains(&user2_failed_id)); + + // Test user2 can only see their documents + let user2_results = database + .find_low_confidence_and_failed_documents(50.0, user2_id, crate::models::UserRole::User) + .await + .unwrap(); + + assert_eq!(user2_results.len(), 2); + let user2_ids: Vec = user2_results.iter().map(|d| d.id).collect(); + assert!(user2_ids.contains(&user2_low_id)); + assert!(user2_ids.contains(&user2_failed_id)); + assert!(!user2_ids.contains(&user1_low_id)); + assert!(!user2_ids.contains(&user1_failed_id)); + + // Test admin can see all documents + let admin_results = database + .find_low_confidence_and_failed_documents(50.0, user1_id, crate::models::UserRole::Admin) + .await + .unwrap(); + + assert!(admin_results.len() >= 4); // At least our 4 test documents + let admin_ids: Vec = admin_results.iter().map(|d| d.id).collect(); + assert!(admin_ids.contains(&user1_low_id)); + 
assert!(admin_ids.contains(&user1_failed_id)); + assert!(admin_ids.contains(&user2_low_id)); + assert!(admin_ids.contains(&user2_failed_id)); + } } \ No newline at end of file diff --git a/src/tests/enhanced_ocr_tests.rs b/src/tests/enhanced_ocr_tests.rs new file mode 100644 index 0000000..efb17b3 --- /dev/null +++ b/src/tests/enhanced_ocr_tests.rs @@ -0,0 +1,455 @@ +#[cfg(test)] +mod tests { + use crate::ocr::enhanced::{EnhancedOcrService, OcrResult, ImageQualityStats}; + use crate::models::Settings; + use std::fs; + use tempfile::{NamedTempFile, TempDir}; + + fn create_test_settings() -> Settings { + Settings::default() + } + + fn create_temp_dir() -> TempDir { + TempDir::new().expect("Failed to create temp directory") + } + + #[test] + fn test_enhanced_ocr_service_creation() { + let temp_dir = create_temp_dir(); + let temp_path = temp_dir.path().to_str().unwrap().to_string(); + let service = EnhancedOcrService::new(temp_path); + + // Service should be created successfully + assert!(!service.temp_dir.is_empty()); + } + + #[test] + fn test_image_quality_stats_creation() { + let stats = ImageQualityStats { + average_brightness: 128.0, + contrast_ratio: 0.5, + noise_level: 0.1, + sharpness: 0.8, + }; + + assert_eq!(stats.average_brightness, 128.0); + assert_eq!(stats.contrast_ratio, 0.5); + assert_eq!(stats.noise_level, 0.1); + assert_eq!(stats.sharpness, 0.8); + } + + #[test] + fn test_ocr_result_structure() { + let result = OcrResult { + text: "Test text".to_string(), + confidence: 85.5, + processing_time_ms: 1500, + word_count: 2, + preprocessing_applied: vec!["noise_reduction".to_string()], + processed_image_path: Some("/tmp/processed.png".to_string()), + }; + + assert_eq!(result.text, "Test text"); + assert_eq!(result.confidence, 85.5); + assert_eq!(result.processing_time_ms, 1500); + assert_eq!(result.word_count, 2); + assert_eq!(result.preprocessing_applied.len(), 1); + assert!(result.processed_image_path.is_some()); + } + + #[tokio::test] + async fn test_extract_text_from_plain_text() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings = create_test_settings(); + + let temp_file = NamedTempFile::with_suffix(".txt").unwrap(); + let test_content = "This is a test text file with multiple words."; + fs::write(temp_file.path(), test_content).unwrap(); + + let result = service + .extract_text(temp_file.path().to_str().unwrap(), "text/plain", &settings) + .await; + + assert!(result.is_ok()); + let ocr_result = result.unwrap(); + assert_eq!(ocr_result.text.trim(), test_content); + assert_eq!(ocr_result.confidence, 100.0); // Plain text should be 100% confident + assert_eq!(ocr_result.word_count, 9); // "This is a test text file with multiple words" + assert!(ocr_result.processing_time_ms > 0); + assert!(ocr_result.preprocessing_applied.contains(&"Plain text read".to_string())); + } + + #[tokio::test] + async fn test_extract_text_with_context() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings = create_test_settings(); + + let temp_file = NamedTempFile::with_suffix(".txt").unwrap(); + let test_content = "Context test content"; + fs::write(temp_file.path(), test_content).unwrap(); + + let result = service + .extract_text_with_context( + temp_file.path().to_str().unwrap(), + "text/plain", + "test_file.txt", + 19, // Length of "Context test content" + &settings, + ) + .await; + + assert!(result.is_ok()); + let 
ocr_result = result.unwrap(); + assert_eq!(ocr_result.text.trim(), test_content); + assert_eq!(ocr_result.confidence, 100.0); + } + + #[tokio::test] + async fn test_extract_text_unsupported_mime_type() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings = create_test_settings(); + + let temp_file = NamedTempFile::new().unwrap(); + fs::write(temp_file.path(), "some content").unwrap(); + + let result = service + .extract_text(temp_file.path().to_str().unwrap(), "application/unknown", &settings) + .await; + + assert!(result.is_err()); + let error_msg = result.unwrap_err().to_string(); + assert!(error_msg.contains("Unsupported file type")); + } + + #[tokio::test] + async fn test_extract_text_nonexistent_file() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings = create_test_settings(); + + let result = service + .extract_text("/nonexistent/file.txt", "text/plain", &settings) + .await; + + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_extract_text_large_file_truncation() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings = create_test_settings(); + + let temp_file = NamedTempFile::with_suffix(".txt").unwrap(); + + // Create a file larger than the limit (50MB for text files) + let large_content = "A".repeat(60 * 1024 * 1024); // 60MB + fs::write(temp_file.path(), &large_content).unwrap(); + + let result = service + .extract_text(temp_file.path().to_str().unwrap(), "text/plain", &settings) + .await; + + // Should fail due to size limit + assert!(result.is_err()); + let error_msg = result.unwrap_err().to_string(); + assert!(error_msg.contains("too large")); + } + + #[cfg(feature = "ocr")] + #[test] + fn test_validate_ocr_quality_high_confidence() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let mut settings = create_test_settings(); + settings.ocr_min_confidence = 30.0; + + let result = OcrResult { + text: "This is high quality OCR text with good words.".to_string(), + confidence: 95.0, + processing_time_ms: 1000, + word_count: 9, + preprocessing_applied: vec![], + processed_image_path: None, + }; + + let is_valid = service.validate_ocr_quality(&result, &settings); + assert!(is_valid); + } + + #[cfg(feature = "ocr")] + #[test] + fn test_validate_ocr_quality_low_confidence() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let mut settings = create_test_settings(); + settings.ocr_min_confidence = 50.0; + + let result = OcrResult { + text: "Poor quality text".to_string(), + confidence: 25.0, // Below threshold + processing_time_ms: 1000, + word_count: 3, + preprocessing_applied: vec![], + processed_image_path: None, + }; + + let is_valid = service.validate_ocr_quality(&result, &settings); + assert!(!is_valid); + } + + #[cfg(feature = "ocr")] + #[test] + fn test_validate_ocr_quality_no_words() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings = create_test_settings(); + + let result = OcrResult { + text: "".to_string(), + confidence: 95.0, + processing_time_ms: 1000, + word_count: 0, // No words + preprocessing_applied: vec![], + processed_image_path: None, + }; + + let 
is_valid = service.validate_ocr_quality(&result, &settings); + assert!(!is_valid); + } + + #[cfg(feature = "ocr")] + #[test] + fn test_validate_ocr_quality_poor_character_distribution() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings = create_test_settings(); + + let result = OcrResult { + text: "!!!@@@###$$$%%%^^^&&&***".to_string(), // Mostly symbols, < 30% alphanumeric + confidence: 85.0, + processing_time_ms: 1000, + word_count: 1, + preprocessing_applied: vec![], + processed_image_path: None, + }; + + let is_valid = service.validate_ocr_quality(&result, &settings); + assert!(!is_valid); + } + + #[cfg(feature = "ocr")] + #[test] + fn test_validate_ocr_quality_good_character_distribution() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings = create_test_settings(); + + let result = OcrResult { + text: "The quick brown fox jumps over the lazy dog. 123".to_string(), // Good alphanumeric ratio + confidence: 85.0, + processing_time_ms: 1000, + word_count: 10, + preprocessing_applied: vec![], + processed_image_path: None, + }; + + let is_valid = service.validate_ocr_quality(&result, &settings); + assert!(is_valid); + } + + #[tokio::test] + async fn test_word_count_calculation() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings = create_test_settings(); + + let test_cases = vec![ + ("", 0), + ("word", 1), + ("two words", 2), + (" spaced words ", 2), + ("Multiple\nlines\nof\ntext", 4), + ("punctuation, words! work? correctly.", 4), + ]; + + for (content, expected_count) in test_cases { + let temp_file = NamedTempFile::with_suffix(".txt").unwrap(); + fs::write(temp_file.path(), content).unwrap(); + + let result = service + .extract_text(temp_file.path().to_str().unwrap(), "text/plain", &settings) + .await; + + assert!(result.is_ok()); + let ocr_result = result.unwrap(); + assert_eq!(ocr_result.word_count, expected_count, "Failed for content: '{}'", content); + } + } + + #[tokio::test] + async fn test_pdf_extraction_with_invalid_pdf() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings = create_test_settings(); + + let temp_file = NamedTempFile::with_suffix(".pdf").unwrap(); + fs::write(temp_file.path(), "Not a valid PDF").unwrap(); + + let result = service + .extract_text(temp_file.path().to_str().unwrap(), "application/pdf", &settings) + .await; + + assert!(result.is_err()); + let error_msg = result.unwrap_err().to_string(); + assert!(error_msg.contains("Invalid PDF") || error_msg.contains("Missing") || error_msg.contains("corrupted")); + } + + #[tokio::test] + async fn test_pdf_extraction_with_minimal_valid_pdf() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings = create_test_settings(); + + // Minimal PDF with "Hello" text + let pdf_content = b"%PDF-1.4 +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +2 0 obj +<< /Type /Pages /Kids [3 0 R] /Count 1 >> +endobj +3 0 obj +<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >> +endobj +4 0 obj +<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >> +endobj +5 0 obj +<< /Length 44 >> +stream +BT +/F1 12 Tf +100 700 Td +(Hello) Tj +ET 
+endstream +endobj +xref +0 6 +0000000000 65535 f +0000000009 00000 n +0000000058 00000 n +0000000115 00000 n +0000000262 00000 n +0000000341 00000 n +trailer +<< /Size 6 /Root 1 0 R >> +startxref +435 +%%EOF"; + + let temp_file = NamedTempFile::with_suffix(".pdf").unwrap(); + fs::write(temp_file.path(), pdf_content).unwrap(); + + let result = service + .extract_text(temp_file.path().to_str().unwrap(), "application/pdf", &settings) + .await; + + match result { + Ok(ocr_result) => { + // PDF extraction succeeded + assert_eq!(ocr_result.confidence, 95.0); // PDF text extraction should be high confidence + assert!(ocr_result.processing_time_ms > 0); + assert!(ocr_result.preprocessing_applied.contains(&"PDF text extraction".to_string())); + println!("PDF extracted text: '{}'", ocr_result.text); + } + Err(e) => { + // PDF extraction might fail depending on the pdf-extract library + println!("PDF extraction failed (may be expected): {}", e); + } + } + } + + #[tokio::test] + async fn test_pdf_size_limit() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings = create_test_settings(); + + let temp_file = NamedTempFile::with_suffix(".pdf").unwrap(); + + // Create a file larger than the 100MB PDF limit + let large_pdf_content = format!("%PDF-1.4\n{}", "A".repeat(110 * 1024 * 1024)); + fs::write(temp_file.path(), large_pdf_content).unwrap(); + + let result = service + .extract_text(temp_file.path().to_str().unwrap(), "application/pdf", &settings) + .await; + + assert!(result.is_err()); + let error_msg = result.unwrap_err().to_string(); + assert!(error_msg.contains("too large")); + } + + #[test] + fn test_settings_default_values() { + let settings = Settings::default(); + + // Test that OCR-related settings have reasonable defaults + assert_eq!(settings.ocr_min_confidence, 30.0); + assert_eq!(settings.ocr_dpi, 300); + assert_eq!(settings.ocr_page_segmentation_mode, 3); + assert_eq!(settings.ocr_engine_mode, 3); + assert!(settings.enable_background_ocr); + assert!(settings.ocr_enhance_contrast); + assert!(settings.ocr_remove_noise); + assert!(settings.ocr_detect_orientation); + } + + #[tokio::test] + async fn test_concurrent_ocr_processing() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings = create_test_settings(); + + let mut handles = vec![]; + + // Process multiple files concurrently + for i in 0..5 { + let temp_file = NamedTempFile::with_suffix(".txt").unwrap(); + let content = format!("Concurrent test content {}", i); + fs::write(temp_file.path(), &content).unwrap(); + + let service_clone = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings_clone = settings.clone(); + let file_path = temp_file.path().to_str().unwrap().to_string(); + + let handle = tokio::spawn(async move { + let result = service_clone + .extract_text(&file_path, "text/plain", &settings_clone) + .await; + + // Keep temp_file alive until task completes + drop(temp_file); + result + }); + + handles.push(handle); + } + + // Wait for all tasks to complete + let results = futures::future::join_all(handles).await; + + // All tasks should succeed + for (i, result) in results.into_iter().enumerate() { + assert!(result.is_ok(), "Task {} failed", i); + let ocr_result = result.unwrap().unwrap(); + assert!(ocr_result.text.contains(&format!("Concurrent test content {}", i))); + assert_eq!(ocr_result.confidence, 100.0); + } + } +} \ 
No newline at end of file
diff --git a/src/tests/mod.rs b/src/tests/mod.rs
index 2de0447..f40390e 100644
--- a/src/tests/mod.rs
+++ b/src/tests/mod.rs
@@ -8,6 +8,7 @@ mod file_service_tests;
 mod ignored_files_tests;
 mod labels_tests;
 mod ocr_tests;
+mod enhanced_ocr_tests;
 mod oidc_tests;
 mod enhanced_search_tests;
 mod settings_tests;
diff --git a/tests/integration_document_deletion_integration_tests.rs b/tests/integration_document_deletion_integration_tests.rs
index 077dd8d..b9cb5aa 100644
--- a/tests/integration_document_deletion_integration_tests.rs
+++ b/tests/integration_document_deletion_integration_tests.rs
@@ -233,6 +233,57 @@ impl DocumentDeletionTestClient {
         let result: Value = response.json().await?;
         Ok(result)
     }
+
+    /// Delete failed OCR documents
+    async fn delete_failed_ocr_documents(&self, preview_only: bool) -> Result<Value, Box<dyn std::error::Error>> {
+        let token = self.token.as_ref().ok_or("Not authenticated")?;
+
+        let response = self.client
+            .post(&format!("{}/api/documents/delete-failed-ocr", get_base_url()))
+            .header("Authorization", format!("Bearer {}", token))
+            .json(&json!({
+                "preview_only": preview_only
+            }))
+            .timeout(TIMEOUT)
+            .send()
+            .await?;
+
+        if !response.status().is_success() {
+            return Err(format!("Delete failed OCR documents failed: {}", response.text().await?).into());
+        }
+
+        let result: Value = response.json().await?;
+        Ok(result)
+    }
+
+    /// Delete low confidence documents (updated to use new combined endpoint)
+    async fn delete_low_confidence_documents(&self, threshold: f64, preview_only: bool) -> Result<Value, Box<dyn std::error::Error>> {
+        let token = self.token.as_ref().ok_or("Not authenticated")?;
+
+        let response = self.client
+            .post(&format!("{}/api/documents/delete-low-confidence", get_base_url()))
+            .header("Authorization", format!("Bearer {}", token))
+            .json(&json!({
+                "max_confidence": threshold,
+                "preview_only": preview_only
+            }))
+            .timeout(TIMEOUT)
+            .send()
+            .await?;
+
+        if !response.status().is_success() {
+            return Err(format!("Delete low confidence documents failed: {}", response.text().await?).into());
+        }
+
+        let result: Value = response.json().await?;
+        Ok(result)
+    }
+
+    /// Create and login user (convenience method)
+    async fn create_and_login_user(&mut self, username: &str, password: &str, role: UserRole) -> Result<Value, Box<dyn std::error::Error>> {
+        let email = format!("{}@example.com", username);
+        self.register_and_login(username, &email, password, Some(role)).await
+    }
 }
 
 /// Skip test if server is not running
@@ -613,4 +664,224 @@ async fn test_document_count_updates_after_deletion() {
     assert_eq!(final_count, initial_count, "Document count should be back to initial after bulk deletion");
 
     println!("✅ Document count updates after deletion test passed");
+}
+
+/// Test the new failed OCR document deletion endpoint
+#[tokio::test]
+async fn test_delete_failed_ocr_documents_endpoint() {
+    let mut client = DocumentDeletionTestClient::new();
+
+    if let Err(e) = client.check_server_health().await {
+        println!("⚠️ Server not available: {}. 
Skipping test.", e); + return; + } + + println!("๐Ÿงช Testing failed OCR document deletion endpoint..."); + + // Create and login as regular user + client.create_and_login_user("failed_ocr_user", "failed_ocr_password", UserRole::User) + .await.expect("Failed to create and login user"); + + // Preview failed documents (should return empty initially) + let preview_response = client.delete_failed_ocr_documents(true) + .await.expect("Failed to preview failed OCR documents"); + + assert_eq!(preview_response["success"], true); + assert!(preview_response["matched_count"].as_i64().unwrap() >= 0); + assert_eq!(preview_response["preview"], true); + + println!("๐Ÿ“‹ Preview request successful: {} failed documents found", + preview_response["matched_count"]); + + // If there are failed documents, test deletion + if preview_response["matched_count"].as_i64().unwrap() > 0 { + // Test actual deletion + let delete_response = client.delete_failed_ocr_documents(false) + .await.expect("Failed to delete failed OCR documents"); + + assert_eq!(delete_response["success"], true); + assert!(delete_response["deleted_count"].as_i64().unwrap() >= 0); + assert!(delete_response.get("preview").is_none()); + + println!("๐Ÿ—‘๏ธ Successfully deleted {} failed documents", + delete_response["deleted_count"]); + } else { + println!("โ„น๏ธ No failed documents found to delete"); + } + + println!("โœ… Failed OCR document deletion endpoint test passed"); +} + +/// Test confidence-based vs failed document deletion distinction +#[tokio::test] +async fn test_confidence_vs_failed_document_distinction() { + let mut client = DocumentDeletionTestClient::new(); + + if let Err(e) = client.check_server_health().await { + println!("โš ๏ธ Server not available: {}. Skipping test.", e); + return; + } + + println!("๐Ÿงช Testing distinction between confidence and failed document deletion..."); + + // Create and login as admin to see all documents + client.create_and_login_user("distinction_admin", "distinction_password", UserRole::Admin) + .await.expect("Failed to create and login admin"); + + // Get baseline counts + let initial_low_confidence = client.delete_low_confidence_documents(30.0, true) + .await.expect("Failed to preview low confidence documents"); + let initial_failed = client.delete_failed_ocr_documents(true) + .await.expect("Failed to preview failed documents"); + + let initial_low_count = initial_low_confidence["matched_count"].as_i64().unwrap(); + let initial_failed_count = initial_failed["matched_count"].as_i64().unwrap(); + + println!("๐Ÿ“Š Initial counts - Low confidence: {}, Failed: {}", + initial_low_count, initial_failed_count); + + // Test that the endpoints return different sets of documents + // (This assumes there are some of each type in the system) + + // Verify that failed documents endpoint only includes failed/NULL confidence docs + if initial_failed_count > 0 { + let failed_docs = initial_failed["document_ids"].as_array().unwrap(); + println!("๐Ÿ” Found {} failed document IDs", failed_docs.len()); + } + + // Verify that low confidence endpoint respects threshold + if initial_low_count > 0 { + let low_confidence_docs = initial_low_confidence["document_ids"].as_array().unwrap(); + println!("๐Ÿ” Found {} low confidence document IDs", low_confidence_docs.len()); + } + + println!("โœ… Document type distinction test passed"); +} + +/// Test error handling for delete endpoints +#[tokio::test] +async fn test_delete_endpoints_error_handling() { + let client = DocumentDeletionTestClient::new(); + + if let Err(e) = 
client.check_server_health().await { + println!("โš ๏ธ Server not available: {}. Skipping test.", e); + return; + } + + println!("๐Ÿงช Testing delete endpoints error handling..."); + + // Test unauthenticated request + let failed_response = client.client + .post(&format!("{}/api/documents/delete-failed-ocr", get_base_url())) + .json(&json!({"preview_only": true})) + .timeout(TIMEOUT) + .send() + .await + .expect("Failed to send request"); + + assert_eq!(failed_response.status(), 401, "Should require authentication"); + + // Test invalid JSON + let invalid_json_response = client.client + .post(&format!("{}/api/documents/delete-failed-ocr", get_base_url())) + .header("content-type", "application/json") + .body("invalid json") + .timeout(TIMEOUT) + .send() + .await + .expect("Failed to send request"); + + assert!(invalid_json_response.status().is_client_error(), "Should reject invalid JSON"); + + println!("โœ… Error handling test passed"); +} + +/// Test role-based access for new delete endpoints +#[tokio::test] +async fn test_role_based_access_for_delete_endpoints() { + let mut client = DocumentDeletionTestClient::new(); + + if let Err(e) = client.check_server_health().await { + println!("โš ๏ธ Server not available: {}. Skipping test.", e); + return; + } + + println!("๐Ÿงช Testing role-based access for delete endpoints..."); + + // Test as regular user + client.create_and_login_user("delete_regular_user", "delete_password", UserRole::User) + .await.expect("Failed to create and login user"); + + let user_response = client.delete_failed_ocr_documents(true) + .await.expect("Failed to preview as user"); + + assert_eq!(user_response["success"], true); + let user_count = user_response["matched_count"].as_i64().unwrap(); + + // Test as admin + client.create_and_login_user("delete_admin_user", "delete_admin_password", UserRole::Admin) + .await.expect("Failed to create and login admin"); + + let admin_response = client.delete_failed_ocr_documents(true) + .await.expect("Failed to preview as admin"); + + assert_eq!(admin_response["success"], true); + let admin_count = admin_response["matched_count"].as_i64().unwrap(); + + // Admin should see at least as many documents as regular user + assert!(admin_count >= user_count, + "Admin should see at least as many documents as user"); + + println!("๐Ÿ‘ค User can see {} documents, Admin can see {} documents", + user_count, admin_count); + + println!("โœ… Role-based access test passed"); +} + +/// Test the enhanced low confidence deletion with failed documents +#[tokio::test] +async fn test_enhanced_low_confidence_deletion() { + let mut client = DocumentDeletionTestClient::new(); + + if let Err(e) = client.check_server_health().await { + println!("โš ๏ธ Server not available: {}. 
Skipping test.", e); + return; + } + + println!("๐Ÿงช Testing enhanced low confidence deletion (includes failed docs)..."); + + // Create and login as admin + client.create_and_login_user("enhanced_delete_admin", "enhanced_password", UserRole::Admin) + .await.expect("Failed to create and login admin"); + + // Test with various thresholds + let thresholds = vec![0.0, 30.0, 50.0, 85.0, 100.0]; + + for threshold in thresholds { + let response = client.delete_low_confidence_documents(threshold, true) + .await.expect(&format!("Failed to preview with threshold {}", threshold)); + + assert_eq!(response["success"], true); + let count = response["matched_count"].as_i64().unwrap(); + + println!("๐ŸŽฏ Threshold {}%: {} documents would be deleted", threshold, count); + + // Verify response format + assert!(response.get("document_ids").is_some()); + assert_eq!(response["preview"], true); + } + + // Test that higher thresholds generally include more documents + let low_threshold_response = client.delete_low_confidence_documents(10.0, true) + .await.expect("Failed to preview with low threshold"); + let high_threshold_response = client.delete_low_confidence_documents(90.0, true) + .await.expect("Failed to preview with high threshold"); + + let low_count = low_threshold_response["matched_count"].as_i64().unwrap(); + let high_count = high_threshold_response["matched_count"].as_i64().unwrap(); + + assert!(high_count >= low_count, + "Higher threshold should include at least as many documents as lower threshold"); + + println!("โœ… Enhanced low confidence deletion test passed"); } \ No newline at end of file From e995653d69ce6df4d5da498e61c9a81def6e58f9 Mon Sep 17 00:00:00 2001 From: perf3ct Date: Sat, 28 Jun 2025 14:51:06 +0000 Subject: [PATCH 2/4] fix(migrations): resolve issue in migration for ocr confidence --- migrations/20250628000001_backfill_ocr_confidence.sql | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/migrations/20250628000001_backfill_ocr_confidence.sql b/migrations/20250628000001_backfill_ocr_confidence.sql index 0005371..6a095a6 100644 --- a/migrations/20250628000001_backfill_ocr_confidence.sql +++ b/migrations/20250628000001_backfill_ocr_confidence.sql @@ -18,19 +18,19 @@ UPDATE documents SET ocr_confidence = CASE -- High quality text: good length, reasonable character distribution WHEN length(trim(ocr_text)) > 1000 - AND (length(ocr_text) - length(replace(replace(ocr_text, ' ', ''), char(10), ''))) * 100.0 / length(ocr_text) > 10.0 -- > 10% whitespace - AND length(replace(replace(replace(ocr_text, ' ', ''), char(10), ''), char(13), '')) * 100.0 / length(ocr_text) > 70.0 -- > 70% non-whitespace chars + AND (length(ocr_text) - length(replace(replace(ocr_text, ' ', ''), chr(10), ''))) * 100.0 / length(ocr_text) > 10.0 -- > 10% whitespace + AND length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 70.0 -- > 70% non-whitespace chars THEN 90.0 + (random() * 8.0) -- 90-98% -- Medium quality text: decent length, some structure WHEN length(trim(ocr_text)) > 100 - AND (length(ocr_text) - length(replace(replace(ocr_text, ' ', ''), char(10), ''))) * 100.0 / length(ocr_text) > 5.0 -- > 5% whitespace - AND length(replace(replace(replace(ocr_text, ' ', ''), char(10), ''), char(13), '')) * 100.0 / length(ocr_text) > 50.0 -- > 50% non-whitespace chars + AND (length(ocr_text) - length(replace(replace(ocr_text, ' ', ''), chr(10), ''))) * 100.0 / length(ocr_text) > 5.0 -- > 5% whitespace + AND 
length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 50.0 -- > 50% non-whitespace chars THEN 70.0 + (random() * 15.0) -- 70-85% -- Low quality text: short or poor structure WHEN length(trim(ocr_text)) > 10 - AND length(replace(replace(replace(ocr_text, ' ', ''), char(10), ''), char(13), '')) * 100.0 / length(ocr_text) > 30.0 -- > 30% non-whitespace chars + AND length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 30.0 -- > 30% non-whitespace chars THEN 40.0 + (random() * 25.0) -- 40-65% -- Very poor quality: very short or mostly garbage From 69425b220165ca5ca03695dc72e017b06e28ac9f Mon Sep 17 00:00:00 2001 From: perf3ct Date: Sat, 28 Jun 2025 14:53:45 +0000 Subject: [PATCH 3/4] feat(migration): instead of hardcoded guessing, re-enter those documents into the queue --- ...20250628000001_backfill_ocr_confidence.sql | 64 ++++++------------- 1 file changed, 20 insertions(+), 44 deletions(-) diff --git a/migrations/20250628000001_backfill_ocr_confidence.sql b/migrations/20250628000001_backfill_ocr_confidence.sql index 6a095a6..829d5cd 100644 --- a/migrations/20250628000001_backfill_ocr_confidence.sql +++ b/migrations/20250628000001_backfill_ocr_confidence.sql @@ -1,59 +1,35 @@ --- Backfill OCR confidence scores for existing documents --- Since OCR confidence was previously hardcoded to 85%, we need to recalculate --- actual confidence for documents that currently have this placeholder value +-- Re-queue documents with placeholder OCR confidence for reprocessing +-- Since OCR confidence was previously hardcoded to 85%, we need to reprocess +-- these documents to get accurate confidence scores --- First, let's identify documents that likely have placeholder confidence --- (85% exactly, which was the hardcoded value) -CREATE TEMP TABLE documents_to_update AS -SELECT id, ocr_text, ocr_status -FROM documents -WHERE ocr_confidence = 85.0 - AND ocr_status = 'completed' - AND ocr_text IS NOT NULL - AND length(trim(ocr_text)) > 0; - --- For now, we'll estimate confidence based on text quality metrics --- This is a rough approximation until we can re-run OCR with actual confidence +-- Mark documents with exactly 85% confidence as pending OCR reprocessing UPDATE documents -SET ocr_confidence = CASE - -- High quality text: good length, reasonable character distribution - WHEN length(trim(ocr_text)) > 1000 - AND (length(ocr_text) - length(replace(replace(ocr_text, ' ', ''), chr(10), ''))) * 100.0 / length(ocr_text) > 10.0 -- > 10% whitespace - AND length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 70.0 -- > 70% non-whitespace chars - THEN 90.0 + (random() * 8.0) -- 90-98% - - -- Medium quality text: decent length, some structure - WHEN length(trim(ocr_text)) > 100 - AND (length(ocr_text) - length(replace(replace(ocr_text, ' ', ''), chr(10), ''))) * 100.0 / length(ocr_text) > 5.0 -- > 5% whitespace - AND length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 50.0 -- > 50% non-whitespace chars - THEN 70.0 + (random() * 15.0) -- 70-85% - - -- Low quality text: short or poor structure - WHEN length(trim(ocr_text)) > 10 - AND length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 30.0 -- > 30% non-whitespace chars - THEN 40.0 + (random() * 25.0) -- 40-65% - - -- Very poor quality: very short or mostly garbage - ELSE 20.0 + (random() * 15.0) -- 
20-35%
-END
-WHERE id IN (SELECT id FROM documents_to_update);
+SET ocr_status = 'pending',
+    ocr_confidence = NULL,
+    ocr_error = NULL,
+    updated_at = CURRENT_TIMESTAMP
+WHERE ocr_confidence = 85.0
+  AND ocr_status = 'completed'
+  AND ocr_text IS NOT NULL;
 
 -- Add a comment explaining what we did
-COMMENT ON COLUMN documents.ocr_confidence IS 'OCR confidence percentage (0-100). Values may be estimated for documents processed before real confidence calculation was implemented.';
+COMMENT ON COLUMN documents.ocr_confidence IS 'OCR confidence percentage (0-100) from Tesseract. Documents with NULL confidence and pending status will be reprocessed.';
 
 -- Log the update
 DO $$
 DECLARE
     updated_count INTEGER;
 BEGIN
-    SELECT COUNT(*) INTO updated_count FROM documents_to_update;
-    RAISE NOTICE 'Backfilled OCR confidence for % documents that had placeholder 85%% confidence', updated_count;
+    SELECT COUNT(*) INTO updated_count FROM documents WHERE ocr_status = 'pending' AND ocr_confidence IS NULL AND updated_at = CURRENT_TIMESTAMP; -- GET DIAGNOSTICS ROW_COUNT only sees statements run inside this DO block, so count the rows the standalone UPDATE above stamped instead; CURRENT_TIMESTAMP is constant for the migration transaction, so it identifies exactly those rows
+    RAISE NOTICE 'Marked % documents with placeholder 85%% confidence for OCR reprocessing', updated_count;
 END $$;
 
--- Clean up
-DROP TABLE documents_to_update;
-
 -- Create an index to help with confidence-based queries
 CREATE INDEX IF NOT EXISTS idx_documents_ocr_confidence_range
 ON documents(ocr_confidence)
-WHERE ocr_confidence IS NOT NULL;
\ No newline at end of file
+WHERE ocr_confidence IS NOT NULL;
+
+-- Create an index to help the OCR queue find pending documents efficiently
+CREATE INDEX IF NOT EXISTS idx_documents_ocr_pending
+ON documents(created_at)
+WHERE ocr_status = 'pending';
\ No newline at end of file
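A quick way to sanity-check the re-queue once this migration has been applied (a sketch only, not part of the patch: it assumes a sqlx PgPool on the same database, and the helper name is illustrative):

// Counts the documents the migration left in the re-queue state
// (pending OCR with confidence cleared). On a database that had
// 85%-placeholder rows, this should be non-zero right after the migration.
async fn count_requeued_documents(pool: &sqlx::PgPool) -> Result<i64, sqlx::Error> {
    sqlx::query_scalar("SELECT COUNT(*) FROM documents WHERE ocr_status = 'pending' AND ocr_confidence IS NULL")
        .fetch_one(pool)
        .await
}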
From 9079529eb5be8d6e0ff3b48d08deba4cd8aac553 Mon Sep 17 00:00:00 2001
From: perf3ct
Date: Sat, 28 Jun 2025 16:38:12 +0000
Subject: [PATCH 4/4] feat(tests): create generic migration tests

---
 src/tests/generic_migration_tests.rs | 275 +++++++++++++++++++++++++++
 src/tests/helpers.rs                 |   1 +
 src/tests/mod.rs                     |   3 +-
 3 files changed, 278 insertions(+), 1 deletion(-)
 create mode 100644 src/tests/generic_migration_tests.rs

diff --git a/src/tests/generic_migration_tests.rs b/src/tests/generic_migration_tests.rs
new file mode 100644
index 0000000..94626ee
--- /dev/null
+++ b/src/tests/generic_migration_tests.rs
@@ -0,0 +1,275 @@
+#[cfg(test)]
+mod generic_migration_tests {
+    use sqlx::{PgPool, Row};
+    use testcontainers::{runners::AsyncRunner, ImageExt};
+    use testcontainers_modules::postgres::Postgres;
+    use std::process::Command;
+
+    async fn setup_test_db() -> (PgPool, testcontainers::ContainerAsync<Postgres>) {
+        let postgres_image = Postgres::default()
+            .with_tag("15-alpine")
+            .with_env_var("POSTGRES_USER", "test")
+            .with_env_var("POSTGRES_PASSWORD", "test")
+            .with_env_var("POSTGRES_DB", "test");
+
+        let container = postgres_image.start().await.expect("Failed to start postgres container");
+        let port = container.get_host_port_ipv4(5432).await.expect("Failed to get postgres port");
+
+        let database_url = format!("postgresql://test:test@localhost:{}/test", port);
+        let pool = sqlx::postgres::PgPoolOptions::new()
+            .max_connections(5)
+            .connect(&database_url)
+            .await
+            .expect("Failed to connect to test database");
+
+        (pool, container)
+    }
+
+    fn get_new_migrations() -> Vec<String> {
+        // Get list of migration files that have changed between main and current branch
+        let output = Command::new("git")
+            .args(["diff", "--name-only", "main..HEAD", "--", "migrations/"])
+            .output()
+            .expect("Failed to run git diff");
+
+        if !output.status.success() {
+            println!("Git diff failed, assuming no migration changes");
+            return Vec::new();
+        }
+
+        let files = String::from_utf8_lossy(&output.stdout);
+        files
+            .lines()
+            .filter(|line| line.ends_with(".sql"))
+            .map(|s| s.to_string())
+            .collect()
+    }
+
+    fn get_migration_files_on_main() -> Vec<String> {
+        // Get list of migration files that exist on main branch
+        let output = Command::new("git")
+            .args(["ls-tree", "-r", "--name-only", "origin/main", "migrations/"])
+            .output()
+            .expect("Failed to list migration files on main");
+
+        if !output.status.success() {
+            println!("Failed to get migration files from main branch");
+            return Vec::new();
+        }
+
+        let files = String::from_utf8_lossy(&output.stdout);
+        files
+            .lines()
+            .filter(|line| line.ends_with(".sql"))
+            .map(|s| s.to_string())
+            .collect()
+    }
+
+    #[tokio::test]
+    async fn test_new_migrations_run_successfully() {
+        let new_migrations = get_new_migrations();
+
+        if new_migrations.is_empty() {
+            println!("✅ No new migrations found - test passes");
+            return;
+        }
+
+        println!("🔍 Found {} new migration(s):", new_migrations.len());
+        for migration in &new_migrations {
+            println!("  - {}", migration);
+        }
+
+        let (pool, _container) = setup_test_db().await;
+
+        // Run all migrations (including the new ones)
+        let result = sqlx::migrate!("./migrations").run(&pool).await;
+        assert!(result.is_ok(), "New migrations should run successfully: {:?}", result.err());
+
+        println!("✅ All migrations including new ones ran successfully");
+    }
+
+    #[tokio::test]
+    async fn test_migrations_are_idempotent() {
+        let new_migrations = get_new_migrations();
+
+        if new_migrations.is_empty() {
+            println!("✅ No new migrations found - idempotency test skipped");
+            return;
+        }
+
+        let (pool, _container) = setup_test_db().await;
+
+        // Run migrations twice to test idempotency
+        let result1 = sqlx::migrate!("./migrations").run(&pool).await;
+        assert!(result1.is_ok(), "First migration run should succeed: {:?}", result1.err());
+
+        let result2 = sqlx::migrate!("./migrations").run(&pool).await;
+        assert!(result2.is_ok(), "Second migration run should succeed (idempotent): {:?}", result2.err());
+
+        println!("✅ Migrations are idempotent");
+    }
+
+    #[tokio::test]
+    async fn test_migration_syntax_and_completeness() {
+        let new_migrations = get_new_migrations();
+
+        if new_migrations.is_empty() {
+            println!("✅ No new migrations found - syntax test skipped");
+            return;
+        }
+
+        // Check that new migration files exist and have basic structure
+        for migration_path in &new_migrations {
+            let content = std::fs::read_to_string(migration_path)
+                .expect(&format!("Should be able to read migration file: {}", migration_path));
+
+            assert!(!content.trim().is_empty(), "Migration file should not be empty: {}", migration_path);
+
+            // Basic syntax check - should not contain obvious SQL syntax errors
+            assert!(!content.contains("syntax error"), "Migration should not contain 'syntax error': {}", migration_path);
+
+            println!("✅ Migration file {} has valid syntax", migration_path);
+        }
+    }
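    // A sketch of a stricter idempotency check (an illustration, not part of this
    // patch): the idempotency test above re-runs the migrator, but sqlx skips
    // already-applied migrations, so it passes trivially. Comparing sqlx's
    // bookkeeping table before and after the second run pins the behavior down.
    // The table and column names below (`_sqlx_migrations`, `version`, `checksum`)
    // are assumptions about sqlx's default Postgres migrator.
    #[tokio::test]
    async fn test_applied_migration_checksums_are_stable() {
        let (pool, _container) = setup_test_db().await;

        sqlx::migrate!("./migrations").run(&pool).await.expect("first run should succeed");
        let before: Vec<(i64, Vec<u8>)> =
            sqlx::query_as("SELECT version, checksum FROM _sqlx_migrations ORDER BY version")
                .fetch_all(&pool)
                .await
                .expect("should read the migrations bookkeeping table");

        sqlx::migrate!("./migrations").run(&pool).await.expect("second run should succeed");
        let after: Vec<(i64, Vec<u8>)> =
            sqlx::query_as("SELECT version, checksum FROM _sqlx_migrations ORDER BY version")
                .fetch_all(&pool)
                .await
                .expect("should read the migrations bookkeeping table");

        // A second run must neither re-apply nor alter what was recorded.
        assert_eq!(before, after, "re-running migrations changed the applied set");
    }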
+
+    #[tokio::test]
+    async fn test_migration_rollback_safety() {
+        let new_migrations = get_new_migrations();
+
+        if new_migrations.is_empty() {
+            println!("✅ No new migrations found - rollback safety test skipped");
+            return;
+        }
+
+        let (pool, _container) = setup_test_db().await;
+
+        // Test that we can run migrations and they create expected schema elements
+        let result = sqlx::migrate!("./migrations").run(&pool).await;
+        assert!(result.is_ok(), "Migrations should run successfully: {:?}", result.err());
+
+        // Verify basic schema integrity
+        let tables = sqlx::query("SELECT table_name FROM information_schema.tables WHERE table_schema = 'public'")
+            .fetch_all(&pool)
+            .await
+            .expect("Should be able to query table list");
+
+        assert!(!tables.is_empty(), "Should have created at least one table");
+
+        // Check that essential tables exist
+        let table_names: Vec<String> = tables.iter()
+            .map(|row| row.get::<String, _>("table_name"))
+            .collect();
+
+        assert!(table_names.contains(&"documents".to_string()), "documents table should exist");
+        assert!(table_names.contains(&"users".to_string()), "users table should exist");
+
+        println!("✅ Migration rollback safety verified - schema is intact");
+    }
+
+    #[test]
+    fn test_migration_naming_convention() {
+        let new_migrations = get_new_migrations();
+
+        if new_migrations.is_empty() {
+            println!("✅ No new migrations found - naming convention test skipped");
+            return;
+        }
+
+        for migration_path in &new_migrations {
+            let filename = migration_path
+                .split('/')
+                .last()
+                .expect("Should have filename");
+
+            // Check naming convention: YYYYMMDDHHMMSS_description.sql
+            assert!(filename.len() > 15, "Migration filename should be long enough: {}", filename);
+            assert!(filename.ends_with(".sql"), "Migration should end with .sql: {}", filename);
+
+            let parts: Vec<&str> = filename.split('_').collect();
+            assert!(parts.len() >= 2, "Migration should have timestamp_description format: {}", filename);
+
+            let timestamp = parts[0];
+            assert!(timestamp.len() >= 14, "Timestamp should be at least 14 characters: {}", filename);
+            assert!(timestamp.chars().all(|c| c.is_numeric()), "Timestamp should be numeric: {}", filename);
+
+            println!("✅ Migration {} follows naming convention", filename);
+        }
+    }
+
+    #[tokio::test]
+    async fn test_no_changes_scenario_simulation() {
+        // Simulate what happens when git diff returns no changes (HEAD..HEAD)
+        let output = Command::new("git")
+            .args(["diff", "--name-only", "HEAD..HEAD", "--", "migrations/"])
+            .output()
+            .expect("Failed to run git diff");
+
+        let files = String::from_utf8_lossy(&output.stdout);
+        let no_changes: Vec<String> = files
+            .lines()
+            .filter(|line| line.ends_with(".sql"))
+            .map(|s| s.to_string())
+            .collect();
+
+        // This should be empty (no changes between HEAD and itself)
+        assert!(no_changes.is_empty(), "HEAD..HEAD should show no changes");
+
+        // Verify the test logic handles empty migrations gracefully
+        if no_changes.is_empty() {
+            println!("✅ No new migrations found - test passes");
+            // This is what the real tests do when no changes are found
+            return;
+        }
+
+        println!("✅ No migration changes scenario handled correctly");
+    }
+
+    #[test]
+    fn test_no_conflicting_migration_timestamps() {
+        let new_migrations = get_new_migrations();
+        let main_migrations = get_migration_files_on_main();
+
+        if new_migrations.is_empty() {
+            println!("✅ No new migrations found - timestamp conflict test skipped");
+            return;
+        }
+
+        // Extract timestamps from new migrations
+        let new_timestamps: Vec<String> = new_migrations.iter()
+            .map(|path| {
+                let filename = path.split('/').last().unwrap();
+                let timestamp = filename.split('_').next().unwrap();
+                timestamp.to_string()
+            })
+            .collect();
+
+        // Extract timestamps from existing migrations on main
+        let main_timestamps: Vec<String> = main_migrations.iter()
+            .map(|path| {
+                let filename = path.split('/').last().unwrap();
+                let timestamp = filename.split('_').next().unwrap();
+                timestamp.to_string()
+            })
+            .collect();
+
+        // Check for conflicts
+        for new_ts in &new_timestamps {
+            assert!(
+                !main_timestamps.contains(new_ts),
+                "Migration timestamp {} conflicts with existing migration on main",
+                new_ts
+            );
+        }
+
+        // Check for duplicates within new migrations
+        for (i, ts1) in new_timestamps.iter().enumerate() {
+            for (j, ts2) in new_timestamps.iter().enumerate() {
+                if i != j {
+                    assert_ne!(ts1, ts2, "Duplicate migration timestamp found: {}", ts1);
+                }
+            }
+        }
+
+        println!("✅ No migration timestamp conflicts found");
+    }
+}
\ No newline at end of file
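The nested duplicate scan in test_no_conflicting_migration_timestamps is quadratic; for illustration, an equivalent single-pass formulation using HashSet (a sketch against the same new_timestamps/main_timestamps vectors; the helper name is made up, not part of the patch):

use std::collections::HashSet;

// One pass covers both checks: `insert` returns false when the value was
// already present, which doubles as the duplicate test.
fn assert_no_timestamp_conflicts(new_timestamps: &[String], main_timestamps: &[String]) {
    let main_set: HashSet<&String> = main_timestamps.iter().collect();
    let mut seen: HashSet<&String> = HashSet::new();
    for ts in new_timestamps {
        assert!(!main_set.contains(ts), "Migration timestamp {} conflicts with existing migration on main", ts);
        assert!(seen.insert(ts), "Duplicate migration timestamp found: {}", ts);
    }
}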
diff --git a/src/tests/helpers.rs b/src/tests/helpers.rs
index cf7ad98..816950b 100644
--- a/src/tests/helpers.rs
+++ b/src/tests/helpers.rs
@@ -8,6 +8,7 @@ use tower::util::ServiceExt;
 
 pub async fn create_test_app() -> (Router, ContainerAsync<Postgres>) {
     let postgres_image = Postgres::default()
+        .with_tag("15-alpine")
         .with_env_var("POSTGRES_USER", "test")
        .with_env_var("POSTGRES_PASSWORD", "test")
        .with_env_var("POSTGRES_DB", "test");
diff --git a/src/tests/mod.rs b/src/tests/mod.rs
index f40390e..e26cd3f 100644
--- a/src/tests/mod.rs
+++ b/src/tests/mod.rs
@@ -12,4 +12,5 @@ mod enhanced_ocr_tests;
 mod oidc_tests;
 mod enhanced_search_tests;
 mod settings_tests;
-mod users_tests;
+mod users_tests;
+mod generic_migration_tests;
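A note on running the new suite locally (the commands below are assumptions about the repo layout, not taken from the patch): the generic migration tests shell out to git and start Postgres through testcontainers, so they need a Docker daemon and up-to-date refs. get_new_migrations() diffs against the local main ref while get_migration_files_on_main() reads origin/main, so fetching both first avoids false "no new migrations" results on CI:

git fetch origin main:main   # updating the local main ref may fail if main is checked out; a plain `git fetch origin main` then suffices for origin/main
cargo test generic_migration_tests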