From a642eec3ce590eb5a4b59335103b2082eb648ea6 Mon Sep 17 00:00:00 2001 From: perf3ct Date: Fri, 27 Jun 2025 22:16:38 +0000 Subject: [PATCH] feat(server/client): implement button deleting low confidence documents (e.g. documents that have no text) --- frontend/src/pages/FailedOcrPage.tsx | 200 ++++++++++++- .../pages/__tests__/FailedOcrPage.test.tsx | 231 ++++++++++++++- frontend/src/services/__mocks__/api.ts | 1 + frontend/src/services/__tests__/api.test.ts | 181 ++++++++++++ frontend/src/services/api.ts | 7 + src/db/documents.rs | 75 +++++ src/routes/documents.rs | 119 ++++++++ src/tests/document_routes_tests.rs | 266 +++++++++++++++++ src/tests/documents_tests.rs | 271 ++++++++++++++++++ 9 files changed, 1347 insertions(+), 4 deletions(-) diff --git a/frontend/src/pages/FailedOcrPage.tsx b/frontend/src/pages/FailedOcrPage.tsx index b11fe60..2dad6da 100644 --- a/frontend/src/pages/FailedOcrPage.tsx +++ b/frontend/src/pages/FailedOcrPage.tsx @@ -28,6 +28,7 @@ import { Snackbar, Tabs, Tab, + TextField, useTheme, } from '@mui/material'; import Grid from '@mui/material/GridLegacy'; @@ -147,6 +148,12 @@ const FailedOcrPage: React.FC = () => { message: '', severity: 'success' }); + + // Low confidence documents state + const [confidenceThreshold, setConfidenceThreshold] = useState(30); + const [lowConfidenceLoading, setLowConfidenceLoading] = useState(false); + const [previewData, setPreviewData] = useState(null); + const [confirmDeleteOpen, setConfirmDeleteOpen] = useState(false); const fetchFailedDocuments = async () => { try { @@ -297,8 +304,68 @@ const FailedOcrPage: React.FC = () => { const refreshCurrentTab = () => { if (currentTab === 0) { fetchFailedDocuments(); - } else { + } else if (currentTab === 1) { fetchDuplicates(); + } else if (currentTab === 2) { + handlePreviewLowConfidence(); + } + }; + + // Low confidence document handlers + const handlePreviewLowConfidence = async () => { + try { + setLowConfidenceLoading(true); + const response = await 
documentService.deleteLowConfidence(confidenceThreshold, true); + setPreviewData(response.data); + setSnackbar({ + open: true, + message: response.data.message, + severity: 'info' + }); + } catch (error) { + setSnackbar({ + open: true, + message: 'Failed to preview low confidence documents', + severity: 'error' + }); + } finally { + setLowConfidenceLoading(false); + } + }; + + const handleDeleteLowConfidence = async () => { + if (!previewData || previewData.matched_count === 0) { + setSnackbar({ + open: true, + message: 'No documents to delete', + severity: 'warning' + }); + return; + } + + try { + setLowConfidenceLoading(true); + const response = await documentService.deleteLowConfidence(confidenceThreshold, false); + setSnackbar({ + open: true, + message: response.data.message, + severity: 'success' + }); + setPreviewData(null); + setConfirmDeleteOpen(false); + + // Refresh other tabs if they have data affected + if (currentTab === 0) { + fetchFailedDocuments(); + } + } catch (error) { + setSnackbar({ + open: true, + message: 'Failed to delete low confidence documents', + severity: 'error' + }); + } finally { + setLowConfidenceLoading(false); } }; @@ -314,7 +381,7 @@ const FailedOcrPage: React.FC = () => { - Failed OCR & Duplicates + Document Management + + + + + + + + + {/* Preview Results */} + {previewData && ( + + + + Preview Results + + 0 ? 'warning.main' : 'success.main'}> + {previewData.message} + + {previewData.matched_count > 0 && ( + + + Document IDs that would be deleted: + + + {previewData.document_ids.slice(0, 10).join(', ')} + {previewData.document_ids.length > 10 && ` ... and ${previewData.document_ids.length - 10} more`} + + + )} + + + )} + + {/* Loading State */} + {lowConfidenceLoading && !previewData && ( + + + Processing request... 
+ + )} + + )} + + {/* Confirmation Dialog */} + setConfirmDeleteOpen(false)} + maxWidth="sm" + fullWidth + > + + + Confirm Low Confidence Document Deletion + + + + Are you sure you want to delete {previewData?.matched_count || 0} documents with OCR confidence below {confidenceThreshold}%? + + + This action cannot be undone. The documents and their files will be permanently deleted. + + + + + + + + {/* Document Details Dialog */} ({ retryOcr: () => Promise.resolve({ data: { success: true, message: 'OCR retry queued successfully' } }), + deleteLowConfidence: vi.fn(() => Promise.resolve({ + data: { + success: true, + message: 'Found 0 documents with OCR confidence below 30%', + matched_count: 0, + preview: true, + document_ids: [] + } + })), }, })); @@ -55,7 +64,7 @@ describe('FailedOcrPage', () => { // Wait for the page to load and show the title await waitFor(() => { - expect(screen.getByText('Failed OCR & Duplicates')).toBeInTheDocument(); + expect(screen.getByText('Document Management')).toBeInTheDocument(); }); }); @@ -92,4 +101,224 @@ describe('FailedOcrPage', () => { // test('handles retry OCR functionality', async () => { ... }); // test('handles API errors gracefully', async () => { ... }); // test('refreshes data when refresh button is clicked', async () => { ... 
}); +}); + +describe('FailedOcrPage - Low Confidence Deletion', () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + test('renders low confidence deletion tab', async () => { + render( + + + + ); + + // Wait for tabs to load + await waitFor(() => { + const tabs = screen.getByRole('tablist'); + expect(tabs).toBeInTheDocument(); + }); + + // Check for Low Confidence tab + await waitFor(() => { + const lowConfidenceTab = screen.getByText(/Low Confidence/i); + expect(lowConfidenceTab).toBeInTheDocument(); + }); + }); + + test('displays confidence threshold input when low confidence tab is active', async () => { + render( + + + + ); + + // Wait for component to load + await waitFor(() => { + const tabs = screen.getByRole('tablist'); + expect(tabs).toBeInTheDocument(); + }); + + // Click on Low Confidence tab (third tab, index 2) + const lowConfidenceTab = screen.getByText(/Low Confidence/i); + lowConfidenceTab.click(); + + // Wait for tab content to render + await waitFor(() => { + const thresholdInput = screen.getByLabelText(/Maximum Confidence Threshold/i); + expect(thresholdInput).toBeInTheDocument(); + }); + }); + + test('displays preview and delete buttons in low confidence tab', async () => { + render( + + + + ); + + // Navigate to Low Confidence tab + await waitFor(() => { + const lowConfidenceTab = screen.getByText(/Low Confidence/i); + lowConfidenceTab.click(); + }); + + // Check for action buttons + await waitFor(() => { + const previewButton = screen.getByText(/Preview Documents/i); + const deleteButton = screen.getByText(/Delete Low Confidence Documents/i); + + expect(previewButton).toBeInTheDocument(); + expect(deleteButton).toBeInTheDocument(); + }); + }); + + test('shows informational alert about low confidence deletion', async () => { + render( + + + + ); + + // Navigate to Low Confidence tab + await waitFor(() => { + const lowConfidenceTab = screen.getByText(/Low Confidence/i); + lowConfidenceTab.click(); + }); + + // Check for informational 
content + await waitFor(() => { + const alertTitle = screen.getByText(/Low Confidence Document Deletion/i); + const alertText = screen.getByText(/This tool allows you to delete documents/i); + + expect(alertTitle).toBeInTheDocument(); + expect(alertText).toBeInTheDocument(); + }); + }); + + // DISABLED - Interactive tests that would require complex user event simulation + // These tests would need fireEvent.change, fireEvent.click, and proper async handling + + // test('calls deleteLowConfidence API when preview button is clicked', async () => { + // const mockDeleteLowConfidence = vi.mocked(documentService.deleteLowConfidence); + // + // render(); + // + // // Navigate to tab and click preview + // const lowConfidenceTab = screen.getByText(/Low Confidence/i); + // fireEvent.click(lowConfidenceTab); + // + // const previewButton = screen.getByText(/Preview Documents/i); + // fireEvent.click(previewButton); + // + // await waitFor(() => { + // expect(mockDeleteLowConfidence).toHaveBeenCalledWith(30, true); + // }); + // }); + + // test('validates confidence threshold input values', async () => { + // render(); + // + // const lowConfidenceTab = screen.getByText(/Low Confidence/i); + // fireEvent.click(lowConfidenceTab); + // + // const thresholdInput = screen.getByLabelText(/Maximum Confidence Threshold/i); + // + // // Test invalid values + // fireEvent.change(thresholdInput, { target: { value: '150' } }); + // expect(thresholdInput.value).toBe('100'); // Should be clamped + // + // fireEvent.change(thresholdInput, { target: { value: '-10' } }); + // expect(thresholdInput.value).toBe('0'); // Should be clamped + // }); + + // test('shows confirmation dialog before deletion', async () => { + // const mockDeleteLowConfidence = vi.mocked(documentService.deleteLowConfidence); + // mockDeleteLowConfidence.mockResolvedValueOnce({ + // data: { + // success: true, + // matched_count: 5, + // preview: true, + // document_ids: ['doc1', 'doc2', 'doc3', 'doc4', 'doc5'] + // } 
+ // }); + // + // render(); + // + // // Navigate to tab, preview, then try to delete + // const lowConfidenceTab = screen.getByText(/Low Confidence/i); + // fireEvent.click(lowConfidenceTab); + // + // const previewButton = screen.getByText(/Preview Documents/i); + // fireEvent.click(previewButton); + // + // await waitFor(() => { + // const deleteButton = screen.getByText(/Delete Low Confidence Documents/i); + // fireEvent.click(deleteButton); + // }); + // + // // Should show confirmation dialog + // await waitFor(() => { + // const confirmDialog = screen.getByText(/Confirm Low Confidence Document Deletion/i); + // expect(confirmDialog).toBeInTheDocument(); + // }); + // }); + + // test('disables delete button when no preview data available', async () => { + // render(); + // + // const lowConfidenceTab = screen.getByText(/Low Confidence/i); + // fireEvent.click(lowConfidenceTab); + // + // await waitFor(() => { + // const deleteButton = screen.getByText(/Delete Low Confidence Documents/i); + // expect(deleteButton).toBeDisabled(); + // }); + // }); + + // test('displays preview results after API call', async () => { + // const mockDeleteLowConfidence = vi.mocked(documentService.deleteLowConfidence); + // mockDeleteLowConfidence.mockResolvedValueOnce({ + // data: { + // success: true, + // message: 'Found 3 documents with OCR confidence below 30%', + // matched_count: 3, + // preview: true, + // document_ids: ['doc1', 'doc2', 'doc3'] + // } + // }); + // + // render(); + // + // const lowConfidenceTab = screen.getByText(/Low Confidence/i); + // fireEvent.click(lowConfidenceTab); + // + // const previewButton = screen.getByText(/Preview Documents/i); + // fireEvent.click(previewButton); + // + // await waitFor(() => { + // expect(screen.getByText(/Preview Results/i)).toBeInTheDocument(); + // expect(screen.getByText(/Found 3 documents/i)).toBeInTheDocument(); + // }); + // }); + + // test('handles API errors gracefully', async () => { + // const 
mockDeleteLowConfidence = vi.mocked(documentService.deleteLowConfidence); + // mockDeleteLowConfidence.mockRejectedValueOnce(new Error('Network error')); + // + // render(); + // + // const lowConfidenceTab = screen.getByText(/Low Confidence/i); + // fireEvent.click(lowConfidenceTab); + // + // const previewButton = screen.getByText(/Preview Documents/i); + // fireEvent.click(previewButton); + // + // await waitFor(() => { + // // Should show error message via snackbar or similar + // expect(screen.getByText(/Failed to preview low confidence documents/i)).toBeInTheDocument(); + // }); + // }); }); \ No newline at end of file diff --git a/frontend/src/services/__mocks__/api.ts b/frontend/src/services/__mocks__/api.ts index 46d1f12..8e8a742 100644 --- a/frontend/src/services/__mocks__/api.ts +++ b/frontend/src/services/__mocks__/api.ts @@ -22,6 +22,7 @@ export const documentService = { getFailedOcrDocuments: vi.fn(), getDuplicates: vi.fn(), retryOcr: vi.fn(), + deleteLowConfidence: vi.fn(), } // Re-export types that components might need diff --git a/frontend/src/services/__tests__/api.test.ts b/frontend/src/services/__tests__/api.test.ts index 604c469..719cb51 100644 --- a/frontend/src/services/__tests__/api.test.ts +++ b/frontend/src/services/__tests__/api.test.ts @@ -6,6 +6,7 @@ const mockGetOcrText = vi.fn(); const mockList = vi.fn(); const mockUpload = vi.fn(); const mockDownload = vi.fn(); +const mockDeleteLowConfidence = vi.fn(); // Mock the entire api module vi.mock('../api', async () => { @@ -17,6 +18,7 @@ vi.mock('../api', async () => { list: mockList, upload: mockUpload, download: mockDownload, + deleteLowConfidence: mockDeleteLowConfidence, }, }; }); @@ -309,4 +311,183 @@ describe('OcrResponse interface', () => { expect(ocrResponseMinimal.ocr_text).toBeNull(); expect(ocrResponseMinimal.ocr_confidence).toBeUndefined(); }); +}); + +describe('documentService.deleteLowConfidence', () => { + it('should delete low confidence documents successfully', async () => 
{ + const mockDeleteResponse = { + data: { + success: true, + message: 'Successfully deleted 3 documents with OCR confidence below 30%', + deleted_count: 3, + matched_count: 3, + successful_file_deletions: 3, + failed_file_deletions: 0, + ignored_file_creation_failures: 0, + deleted_document_ids: ['doc-1', 'doc-2', 'doc-3'] + }, + status: 200, + statusText: 'OK', + headers: {}, + config: {}, + }; + + mockDeleteLowConfidence.mockResolvedValue(mockDeleteResponse); + + const result = await documentService.deleteLowConfidence(30.0, false); + + expect(mockDeleteLowConfidence).toHaveBeenCalledWith(30.0, false); + expect(result.data.success).toBe(true); + expect(result.data.deleted_count).toBe(3); + expect(result.data.matched_count).toBe(3); + expect(result.data.deleted_document_ids).toHaveLength(3); + }); + + it('should preview low confidence documents without deleting', async () => { + const mockPreviewResponse = { + data: { + success: true, + message: 'Found 5 documents with OCR confidence below 50%', + matched_count: 5, + preview: true, + document_ids: ['doc-1', 'doc-2', 'doc-3', 'doc-4', 'doc-5'] + }, + status: 200, + statusText: 'OK', + headers: {}, + config: {}, + }; + + mockDeleteLowConfidence.mockResolvedValue(mockPreviewResponse); + + const result = await documentService.deleteLowConfidence(50.0, true); + + expect(mockDeleteLowConfidence).toHaveBeenCalledWith(50.0, true); + expect(result.data.success).toBe(true); + expect(result.data.preview).toBe(true); + expect(result.data.matched_count).toBe(5); + expect(result.data.document_ids).toHaveLength(5); + expect(result.data).not.toHaveProperty('deleted_count'); + }); + + it('should handle no matching documents', async () => { + const mockEmptyResponse = { + data: { + success: true, + message: 'No documents found with OCR confidence below 10%', + deleted_count: 0 + }, + status: 200, + statusText: 'OK', + headers: {}, + config: {}, + }; + + mockDeleteLowConfidence.mockResolvedValue(mockEmptyResponse); + + const result 
= await documentService.deleteLowConfidence(10.0, false); + + expect(mockDeleteLowConfidence).toHaveBeenCalledWith(10.0, false); + expect(result.data.success).toBe(true); + expect(result.data.deleted_count).toBe(0); + }); + + it('should handle validation errors for invalid confidence threshold', async () => { + const mockErrorResponse = { + data: { + success: false, + message: 'max_confidence must be between 0.0 and 100.0', + matched_count: 0 + }, + status: 200, + statusText: 'OK', + headers: {}, + config: {}, + }; + + mockDeleteLowConfidence.mockResolvedValue(mockErrorResponse); + + const result = await documentService.deleteLowConfidence(-10.0, false); + + expect(mockDeleteLowConfidence).toHaveBeenCalledWith(-10.0, false); + expect(result.data.success).toBe(false); + expect(result.data.message).toContain('must be between 0.0 and 100.0'); + }); + + it('should handle API errors gracefully', async () => { + const mockError = new Error('Network error'); + mockDeleteLowConfidence.mockRejectedValue(mockError); + + await expect(documentService.deleteLowConfidence(30.0, false)) + .rejects.toThrow('Network error'); + + expect(mockDeleteLowConfidence).toHaveBeenCalledWith(30.0, false); + }); + + it('should use correct default values', async () => { + const mockResponse = { + data: { success: true, matched_count: 0 }, + status: 200, + statusText: 'OK', + headers: {}, + config: {}, + }; + + mockDeleteLowConfidence.mockResolvedValue(mockResponse); + + // Test with explicit false value (the default) + await documentService.deleteLowConfidence(40.0, false); + + expect(mockDeleteLowConfidence).toHaveBeenCalledWith(40.0, false); + }); + + it('should handle partial deletion failures', async () => { + const mockPartialFailureResponse = { + data: { + success: true, + message: 'Successfully deleted 2 documents with OCR confidence below 25%', + deleted_count: 2, + matched_count: 3, + successful_file_deletions: 1, + failed_file_deletions: 1, + ignored_file_creation_failures: 1, + 
deleted_document_ids: ['doc-1', 'doc-2'] + }, + status: 200, + statusText: 'OK', + headers: {}, + config: {}, + }; + + mockDeleteLowConfidence.mockResolvedValue(mockPartialFailureResponse); + + const result = await documentService.deleteLowConfidence(25.0, false); + + expect(result.data.success).toBe(true); + expect(result.data.deleted_count).toBe(2); + expect(result.data.matched_count).toBe(3); + expect(result.data.failed_file_deletions).toBe(1); + expect(result.data.ignored_file_creation_failures).toBe(1); + }); + + it('should properly encode confidence threshold values', async () => { + const mockResponse = { + data: { success: true, matched_count: 0 }, + status: 200, + statusText: 'OK', + headers: {}, + config: {}, + }; + + mockDeleteLowConfidence.mockResolvedValue(mockResponse); + + // Test various confidence values + const testValues = [0.0, 0.1, 30.5, 50.0, 99.9, 100.0]; + + for (const confidence of testValues) { + mockDeleteLowConfidence.mockClear(); + await documentService.deleteLowConfidence(confidence, true); + expect(mockDeleteLowConfidence).toHaveBeenCalledWith(confidence, true); + } + }); }); \ No newline at end of file diff --git a/frontend/src/services/api.ts b/frontend/src/services/api.ts index db0cf7e..0b6abe4 100644 --- a/frontend/src/services/api.ts +++ b/frontend/src/services/api.ts @@ -241,6 +241,13 @@ export const documentService = { data: { document_ids: documentIds } }) }, + + deleteLowConfidence: (maxConfidence: number, previewOnly: boolean = false) => { + return api.post('/documents/delete-low-confidence', { + max_confidence: maxConfidence, + preview_only: previewOnly + }) + }, } export interface OcrStatusResponse { diff --git a/src/db/documents.rs b/src/db/documents.rs index 3bdec41..af1de96 100644 --- a/src/db/documents.rs +++ b/src/db/documents.rs @@ -1509,4 +1509,79 @@ impl Database { Ok(deleted_documents) } + + pub async fn find_documents_by_confidence_threshold(&self, max_confidence: f32, user_id: uuid::Uuid, user_role: 
crate::models::UserRole) -> Result> { + let documents = if user_role == crate::models::UserRole::Admin { + let rows = sqlx::query( + r#" + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash + FROM documents + WHERE ocr_confidence IS NOT NULL AND ocr_confidence < $1 + ORDER BY ocr_confidence ASC, created_at DESC + "#, + ) + .bind(max_confidence) + .fetch_all(&self.pool) + .await?; + + rows.into_iter().map(|r| Document { + id: r.get("id"), + filename: r.get("filename"), + original_filename: r.get("original_filename"), + file_path: r.get("file_path"), + file_size: r.get("file_size"), + mime_type: r.get("mime_type"), + content: r.get("content"), + ocr_text: r.get("ocr_text"), + ocr_confidence: r.get("ocr_confidence"), + ocr_word_count: r.get("ocr_word_count"), + ocr_processing_time_ms: r.get("ocr_processing_time_ms"), + ocr_status: r.get("ocr_status"), + ocr_error: r.get("ocr_error"), + ocr_completed_at: r.get("ocr_completed_at"), + tags: r.get("tags"), + created_at: r.get("created_at"), + updated_at: r.get("updated_at"), + user_id: r.get("user_id"), + file_hash: r.get("file_hash"), + }).collect() + } else { + let rows = sqlx::query( + r#" + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash + FROM documents + WHERE ocr_confidence IS NOT NULL AND ocr_confidence < $1 AND user_id = $2 + ORDER BY ocr_confidence ASC, created_at DESC + "#, + ) + .bind(max_confidence) + .bind(user_id) + .fetch_all(&self.pool) + .await?; + + rows.into_iter().map(|r| Document { + id: r.get("id"), + filename: r.get("filename"), + original_filename: r.get("original_filename"), + file_path: r.get("file_path"), + file_size: 
r.get("file_size"), + mime_type: r.get("mime_type"), + content: r.get("content"), + ocr_text: r.get("ocr_text"), + ocr_confidence: r.get("ocr_confidence"), + ocr_word_count: r.get("ocr_word_count"), + ocr_processing_time_ms: r.get("ocr_processing_time_ms"), + ocr_status: r.get("ocr_status"), + ocr_error: r.get("ocr_error"), + ocr_completed_at: r.get("ocr_completed_at"), + tags: r.get("tags"), + created_at: r.get("created_at"), + updated_at: r.get("updated_at"), + user_id: r.get("user_id"), + file_hash: r.get("file_hash"), + }).collect() + }; + + Ok(documents) + } } \ No newline at end of file diff --git a/src/routes/documents.rs b/src/routes/documents.rs index 62cd744..010cc1b 100644 --- a/src/routes/documents.rs +++ b/src/routes/documents.rs @@ -31,6 +31,12 @@ pub struct BulkDeleteRequest { pub document_ids: Vec, } +#[derive(Deserialize, Serialize, ToSchema)] +pub struct DeleteLowConfidenceRequest { + pub max_confidence: f32, + pub preview_only: Option, +} + pub fn router() -> Router> { Router::new() .route("/", post(upload_document)) @@ -46,6 +52,7 @@ pub fn router() -> Router> { .route("/{id}/retry-ocr", post(retry_ocr)) .route("/failed-ocr", get(get_failed_ocr_documents)) .route("/duplicates", get(get_user_duplicates)) + .route("/delete-low-confidence", post(delete_low_confidence_documents)) } #[utoipa::path( @@ -1017,4 +1024,116 @@ pub async fn bulk_delete_documents( "ignored_file_creation_failures": ignored_file_creation_failures, "deleted_document_ids": deleted_documents.iter().map(|d| d.id).collect::>() }))) +} + +#[utoipa::path( + post, + path = "/api/documents/delete-low-confidence", + request_body = DeleteLowConfidenceRequest, + responses( + (status = 200, description = "Low confidence documents operation result"), + (status = 401, description = "Unauthorized"), + (status = 500, description = "Internal server error") + ), + security( + ("bearer_auth" = []) + ), + tag = "documents" +)] +pub async fn delete_low_confidence_documents( + State(state): State>, 
+ auth_user: AuthUser, + Json(request): Json, +) -> Result, StatusCode> { + if request.max_confidence < 0.0 || request.max_confidence > 100.0 { + return Ok(Json(serde_json::json!({ + "success": false, + "message": "max_confidence must be between 0.0 and 100.0", + "matched_count": 0 + }))); + } + + let is_preview = request.preview_only.unwrap_or(false); + + // Find documents with confidence below threshold + let matched_documents = state + .db + .find_documents_by_confidence_threshold(request.max_confidence, auth_user.user.id, auth_user.user.role) + .await + .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; + + let matched_count = matched_documents.len(); + + if is_preview { + return Ok(Json(serde_json::json!({ + "success": true, + "message": format!("Found {} documents with OCR confidence below {}%", matched_count, request.max_confidence), + "matched_count": matched_count, + "preview": true, + "document_ids": matched_documents.iter().map(|d| d.id).collect::>() + }))); + } + + if matched_documents.is_empty() { + return Ok(Json(serde_json::json!({ + "success": true, + "message": format!("No documents found with OCR confidence below {}%", request.max_confidence), + "deleted_count": 0 + }))); + } + + // Extract document IDs for bulk deletion + let document_ids: Vec = matched_documents.iter().map(|d| d.id).collect(); + + // Use existing bulk delete logic + let deleted_documents = state + .db + .bulk_delete_documents(&document_ids, auth_user.user.id, auth_user.user.role) + .await + .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; + + // Create ignored file records for all successfully deleted documents + let mut ignored_file_creation_failures = 0; + for document in &deleted_documents { + if let Err(e) = crate::db::ignored_files::create_ignored_file_from_document( + state.db.get_pool(), + document.id, + auth_user.user.id, + Some(format!("deleted due to low OCR confidence ({}%)", + document.ocr_confidence.unwrap_or(0.0))), + None, + None, + None, + ).await { + 
ignored_file_creation_failures += 1; + tracing::warn!("Failed to create ignored file record for document {}: {}", document.id, e); + } + } + + let file_service = FileService::new(state.config.upload_path.clone()); + let mut successful_file_deletions = 0; + let mut failed_file_deletions = 0; + + for document in &deleted_documents { + match file_service.delete_document_files(document).await { + Ok(_) => successful_file_deletions += 1, + Err(e) => { + failed_file_deletions += 1; + tracing::warn!("Failed to delete files for document {}: {}", document.id, e); + } + } + } + + let deleted_count = deleted_documents.len(); + + Ok(Json(serde_json::json!({ + "success": true, + "message": format!("Successfully deleted {} documents with OCR confidence below {}%", deleted_count, request.max_confidence), + "deleted_count": deleted_count, + "matched_count": matched_count, + "successful_file_deletions": successful_file_deletions, + "failed_file_deletions": failed_file_deletions, + "ignored_file_creation_failures": ignored_file_creation_failures, + "deleted_document_ids": deleted_documents.iter().map(|d| d.id).collect::>() + }))) } \ No newline at end of file diff --git a/src/tests/document_routes_tests.rs b/src/tests/document_routes_tests.rs index 15d94dc..12ae45d 100644 --- a/src/tests/document_routes_tests.rs +++ b/src/tests/document_routes_tests.rs @@ -367,4 +367,270 @@ mod document_routes_deletion_tests { assert!(!unauthorized_error.contains("403")); assert!(!validation_error.contains("serde")); } + + // Low confidence deletion tests + mod low_confidence_deletion_tests { + use super::*; + use crate::routes::documents::DeleteLowConfidenceRequest; + + fn create_low_confidence_document(user_id: Uuid, confidence: f32) -> Document { + Document { + id: Uuid::new_v4(), + filename: format!("low_conf_{}.pdf", confidence), + original_filename: format!("low_conf_{}.pdf", confidence), + file_path: format!("/uploads/low_conf_{}.pdf", confidence), + file_size: 1024, + mime_type: 
"application/pdf".to_string(), + content: Some("Test document content".to_string()), + ocr_text: Some("Low quality OCR text".to_string()), + ocr_confidence: Some(confidence), + ocr_word_count: Some(10), + ocr_processing_time_ms: Some(500), + ocr_status: Some("completed".to_string()), + ocr_error: None, + ocr_completed_at: Some(Utc::now()), + tags: vec!["low-confidence".to_string()], + created_at: Utc::now(), + updated_at: Utc::now(), + user_id, + file_hash: Some("abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890".to_string()), + } + } + + #[test] + fn test_delete_low_confidence_request_serialization() { + // Test valid request + let valid_request = json!({ + "max_confidence": 50.0, + "preview_only": true + }); + + let result: Result = serde_json::from_value(valid_request); + assert!(result.is_ok()); + let request = result.unwrap(); + assert_eq!(request.max_confidence, 50.0); + assert_eq!(request.preview_only, Some(true)); + + // Test request with only max_confidence + let minimal_request = json!({ + "max_confidence": 30.0 + }); + + let result: Result = serde_json::from_value(minimal_request); + assert!(result.is_ok()); + let request = result.unwrap(); + assert_eq!(request.max_confidence, 30.0); + assert_eq!(request.preview_only, None); + } + + #[test] + fn test_delete_low_confidence_request_validation() { + // Test invalid confidence values + let invalid_negative = json!({ + "max_confidence": -10.0, + "preview_only": false + }); + + let result: Result = serde_json::from_value(invalid_negative); + assert!(result.is_ok()); // Serialization succeeds, validation happens in handler + + let invalid_too_high = json!({ + "max_confidence": 150.0, + "preview_only": false + }); + + let result: Result = serde_json::from_value(invalid_too_high); + assert!(result.is_ok()); // Serialization succeeds, validation happens in handler + } + + #[test] + fn test_confidence_threshold_logic() { + let user = create_test_user(UserRole::User); + + // Create documents with 
various confidence levels
+            let high_confidence_doc = create_low_confidence_document(user.id, 95.0);
+            let medium_confidence_doc = create_low_confidence_document(user.id, 60.0);
+            let low_confidence_doc = create_low_confidence_document(user.id, 25.0);
+            let very_low_confidence_doc = create_low_confidence_document(user.id, 5.0);
+
+            let documents = vec![
+                &high_confidence_doc,
+                &medium_confidence_doc,
+                &low_confidence_doc,
+                &very_low_confidence_doc
+            ];
+
+            // Test threshold logic for different confidence values
+            let threshold_50 = 50.0;
+            let threshold_30 = 30.0;
+            let threshold_10 = 10.0;
+
+            // Documents below 50% threshold
+            let below_50: Vec<_> = documents.iter()
+                .filter(|doc| doc.ocr_confidence.unwrap_or(0.0) < threshold_50)
+                .collect();
+            assert_eq!(below_50.len(), 2); // 25.0 and 5.0
+
+            // Documents below 30% threshold
+            let below_30: Vec<_> = documents.iter()
+                .filter(|doc| doc.ocr_confidence.unwrap_or(0.0) < threshold_30)
+                .collect();
+            assert_eq!(below_30.len(), 2); // 25.0 and 5.0
+
+            // Documents below 10% threshold
+            let below_10: Vec<_> = documents.iter()
+                .filter(|doc| doc.ocr_confidence.unwrap_or(0.0) < threshold_10)
+                .collect();
+            assert_eq!(below_10.len(), 1); // 5.0
+        }
+
+        #[test]
+        fn test_user_role_authorization_for_low_confidence_deletion() {
+            let user1 = create_test_user(UserRole::User);
+            let user2 = create_test_user(UserRole::User);
+            let admin = create_test_user(UserRole::Admin);
+
+            let user1_doc = create_low_confidence_document(user1.id, 25.0);
+            let user2_doc = create_low_confidence_document(user2.id, 15.0);
+
+            // User1 should only be able to delete their own low confidence documents
+            assert_eq!(user1_doc.user_id, user1.id);
+            assert_ne!(user1_doc.user_id, user2.id);
+
+            // User2 should only be able to delete their own low confidence documents
+            assert_eq!(user2_doc.user_id, user2.id);
+            assert_ne!(user2_doc.user_id, user1.id);
+
+            // Admin should be able to delete any low confidence documents
+            let admin_can_delete_user1 = user1_doc.user_id == admin.id || admin.role == UserRole::Admin;
+            let admin_can_delete_user2 = user2_doc.user_id == admin.id || admin.role == UserRole::Admin;
+            assert!(admin_can_delete_user1);
+            assert!(admin_can_delete_user2);
+        }
+
+        #[test]
+        fn test_edge_cases_for_confidence_values() {
+            let user = create_test_user(UserRole::User);
+
+            // Test document with None confidence (should not be included)
+            let mut no_confidence_doc = create_low_confidence_document(user.id, 0.0);
+            no_confidence_doc.ocr_confidence = None;
+
+            // Test document with exactly threshold confidence (should not be included)
+            let exact_threshold_doc = create_low_confidence_document(user.id, 30.0);
+
+            // Test document just below threshold (should be included)
+            let just_below_doc = create_low_confidence_document(user.id, 29.9);
+
+            let threshold = 30.0;
+
+            // None confidence should be excluded (no OCR confidence available)
+            assert!(no_confidence_doc.ocr_confidence.is_none());
+
+            // Exact threshold should be excluded (not less than threshold)
+            assert_eq!(exact_threshold_doc.ocr_confidence.unwrap(), threshold);
+            assert!(!(exact_threshold_doc.ocr_confidence.unwrap() < threshold));
+
+            // Just below threshold should be included
+            assert!(just_below_doc.ocr_confidence.unwrap() < threshold);
+        }
+
+        #[test]
+        fn test_preview_mode_behavior() {
+            let user = create_test_user(UserRole::User);
+            let _doc1 = create_low_confidence_document(user.id, 20.0);
+            let _doc2 = create_low_confidence_document(user.id, 10.0);
+
+            let preview_request = DeleteLowConfidenceRequest {
+                max_confidence: 30.0,
+                preview_only: Some(true),
+            };
+
+            let delete_request = DeleteLowConfidenceRequest {
+                max_confidence: 30.0,
+                preview_only: Some(false),
+            };
+
+            let no_preview_request = DeleteLowConfidenceRequest {
+                max_confidence: 30.0,
+                preview_only: None,
+            };
+
+            // Preview mode should be true when explicitly set
+            assert!(preview_request.preview_only.unwrap_or(false));
+
+            // Delete mode should be false when explicitly set
+            assert!(!delete_request.preview_only.unwrap_or(false));
+
+            // Default should be false when not specified
+            assert!(!no_preview_request.preview_only.unwrap_or(false));
+        }
+
+        #[test]
+        fn test_response_format_expectations() {
+            // Test expected response structure for preview mode
+            let expected_preview_response = json!({
+                "success": true,
+                "message": "Found 5 documents with OCR confidence below 30%",
+                "matched_count": 5,
+                "preview": true,
+                "document_ids": ["uuid1", "uuid2", "uuid3", "uuid4", "uuid5"]
+            });
+
+            // Test expected response structure for delete mode
+            let expected_delete_response = json!({
+                "success": true,
+                "message": "Successfully deleted 5 documents with OCR confidence below 30%",
+                "deleted_count": 5,
+                "matched_count": 5,
+                "successful_file_deletions": 5,
+                "failed_file_deletions": 0,
+                "ignored_file_creation_failures": 0,
+                "deleted_document_ids": ["uuid1", "uuid2", "uuid3", "uuid4", "uuid5"]
+            });
+
+            // Verify JSON structure is valid
+            assert!(expected_preview_response.is_object());
+            assert!(expected_delete_response.is_object());
+
+            // Verify required fields exist
+            assert!(expected_preview_response["success"].is_boolean());
+            assert!(expected_preview_response["matched_count"].is_number());
+            assert!(expected_preview_response["document_ids"].is_array());
+
+            assert!(expected_delete_response["success"].is_boolean());
+            assert!(expected_delete_response["deleted_count"].is_number());
+            assert!(expected_delete_response["deleted_document_ids"].is_array());
+        }
+
+        #[test]
+        fn test_error_scenarios() {
+            // Test validation error for invalid confidence range
+            let invalid_confidence_cases = vec![
+                (-1.0, "negative confidence"),
+                (101.0, "confidence over 100"),
+                (150.5, "way over 100"),
+            ];
+
+            for (confidence, description) in invalid_confidence_cases {
+                let request = DeleteLowConfidenceRequest {
+                    max_confidence: confidence,
+                    preview_only: Some(false),
+                };
+
+                // Validation logic should catch these in the handler
+                assert!(request.max_confidence < 0.0 || request.max_confidence > 100.0,
+                       "Should be invalid: {}", description);
+            }
+
+            // Test empty result scenario
+            let request = DeleteLowConfidenceRequest {
+                max_confidence: 0.0, // Very low threshold, should match nothing
+                preview_only: Some(true),
+            };
+
+            assert_eq!(request.max_confidence, 0.0);
+            // This should result in zero matched documents
+        }
+    }
 }
\ No newline at end of file
diff --git a/src/tests/documents_tests.rs b/src/tests/documents_tests.rs
index b9af619..0291c29 100644
--- a/src/tests/documents_tests.rs
+++ b/src/tests/documents_tests.rs
@@ -1525,4 +1525,275 @@ mod deletion_error_handling_tests {
         // If transaction were to be rolled back, document would exist again
         // This test verifies the transaction was committed properly
     }
+
+    mod low_confidence_deletion_db_tests {
+        use super::*;
+        use crate::models::UserRole;
+
+        #[cfg(test)]
+        fn create_test_document_with_confidence(user_id: Uuid, confidence: f32) -> Document {
+            Document {
+                id: Uuid::new_v4(),
+                filename: format!("test_conf_{}.pdf", confidence),
+                original_filename: format!("test_conf_{}.pdf", confidence),
+                file_path: format!("/uploads/test_conf_{}.pdf", confidence),
+                file_size: 1024,
+                mime_type: "application/pdf".to_string(),
+                content: Some("Test document content".to_string()),
+                ocr_text: Some("Test OCR text".to_string()),
+                ocr_confidence: Some(confidence),
+                ocr_word_count: Some(50),
+                ocr_processing_time_ms: Some(1000),
+                ocr_status: Some("completed".to_string()),
+                ocr_error: None,
+                ocr_completed_at: Some(Utc::now()),
+                tags: vec!["test".to_string()],
+                created_at: Utc::now(),
+                updated_at: Utc::now(),
+                user_id,
+                file_hash: Some("test_hash_123456789abcdef123456789abcdef123456789abcdef123456789abcdef".to_string()),
+            }
+        }
+
+        #[test]
+        fn test_confidence_filtering_logic() {
+            let user_id = Uuid::new_v4();
+
+            let documents = vec![
+                create_test_document_with_confidence(user_id, 95.0), // Should not be deleted
+                create_test_document_with_confidence(user_id, 75.0), // Should not be deleted
+                create_test_document_with_confidence(user_id, 45.0), // Should not be deleted
+                create_test_document_with_confidence(user_id, 25.0), // Should be deleted (< 30)
+                create_test_document_with_confidence(user_id, 15.0), // Should be deleted (< 30)
+                create_test_document_with_confidence(user_id, 5.0), // Should be deleted (< 30)
+            ];
+
+            let threshold = 30.0;
+            let low_confidence_docs: Vec<_> = documents.iter()
+                .filter(|doc| {
+                    doc.ocr_confidence.is_some() &&
+                    doc.ocr_confidence.unwrap() < threshold
+                })
+                .collect();
+
+            assert_eq!(low_confidence_docs.len(), 3);
+            assert_eq!(low_confidence_docs[0].ocr_confidence.unwrap(), 25.0);
+            assert_eq!(low_confidence_docs[1].ocr_confidence.unwrap(), 15.0);
+            assert_eq!(low_confidence_docs[2].ocr_confidence.unwrap(), 5.0);
+        }
+
+        #[test]
+        fn test_documents_without_ocr_confidence_excluded() {
+            let user_id = Uuid::new_v4();
+
+            let mut doc_no_confidence = create_test_document_with_confidence(user_id, 20.0);
+            doc_no_confidence.ocr_confidence = None;
+
+            let doc_with_confidence = create_test_document_with_confidence(user_id, 20.0);
+
+            let documents = vec![doc_no_confidence, doc_with_confidence];
+            let threshold = 30.0;
+
+            let low_confidence_docs: Vec<_> = documents.iter()
+                .filter(|doc| {
+                    doc.ocr_confidence.is_some() &&
+                    doc.ocr_confidence.unwrap() < threshold
+                })
+                .collect();
+
+            // Only the document with confidence should be included
+            assert_eq!(low_confidence_docs.len(), 1);
+            assert!(low_confidence_docs[0].ocr_confidence.is_some());
+        }
+
+        #[test]
+        fn test_user_role_authorization_in_filtering() {
+            let user1_id = Uuid::new_v4();
+            let user2_id = Uuid::new_v4();
+
+            let user1_doc = create_test_document_with_confidence(user1_id, 20.0);
+            let user2_doc = create_test_document_with_confidence(user2_id, 15.0);
+
+            // Regular user should only see their own documents
+            let user_role = UserRole::User;
+            let admin_role = UserRole::Admin;
+
+            // User1 should only access their own document
+            let user1_can_access_own = user1_doc.user_id == user1_id || user_role == UserRole::Admin;
+            let user1_can_access_other = user2_doc.user_id == user1_id || user_role == UserRole::Admin;
+
+            assert!(user1_can_access_own);
+            assert!(!user1_can_access_other);
+
+            // Admin should access all documents
+            let admin_can_access_user1 = user1_doc.user_id == user1_id || admin_role == UserRole::Admin;
+            let admin_can_access_user2 = user2_doc.user_id == user1_id || admin_role == UserRole::Admin;
+
+            assert!(admin_can_access_user1);
+            assert!(admin_can_access_user2);
+        }
+
+        #[test]
+        fn test_boundary_conditions_for_confidence_thresholds() {
+            let user_id = Uuid::new_v4();
+
+            let test_cases = vec![
+                (0.0, 10.0, true), // 0% < 10% threshold
+                (10.0, 10.0, false), // 10% = 10% threshold (not less than)
+                (10.1, 10.0, false), // 10.1% > 10% threshold
+                (29.9, 30.0, true), // 29.9% < 30% threshold
+                (30.0, 30.0, false), // 30% = 30% threshold (not less than)
+                (30.1, 30.0, false), // 30.1% > 30% threshold
+                (99.9, 100.0, true), // 99.9% < 100% threshold
+                (100.0, 100.0, false), // 100% = 100% threshold (not less than)
+            ];
+
+            for (doc_confidence, threshold, should_be_included) in test_cases {
+                let doc = create_test_document_with_confidence(user_id, doc_confidence);
+                let is_included = doc.ocr_confidence.is_some() &&
+                    doc.ocr_confidence.unwrap() < threshold;
+
+                assert_eq!(is_included, should_be_included,
+                          "Document with {}% confidence vs {}% threshold",
+                          doc_confidence, threshold);
+            }
+        }
+
+        #[test]
+        fn test_performance_considerations_for_large_datasets() {
+            let user_id = Uuid::new_v4();
+
+            // Create a large number of test documents
+            let mut documents = Vec::new();
+            for i in 0..1000 {
+                let confidence = (i as f32) / 10.0; // 0.0 to 99.9
+                documents.push(create_test_document_with_confidence(user_id, confidence));
+            }
+
+            let threshold = 50.0;
+            let start_time = std::time::Instant::now();
+
+            let low_confidence_docs: Vec<_> = documents.iter()
+                .filter(|doc| {
+                    doc.ocr_confidence.is_some() &&
+                    doc.ocr_confidence.unwrap() < threshold
+                })
+                .collect();
+
+            let elapsed = start_time.elapsed();
+
+            // Verify the filtering works correctly for large datasets
+            assert_eq!(low_confidence_docs.len(), 500); // 0.0 to 49.9
+
+            // Loose sanity bound only (1s): a tight wall-clock limit is flaky on loaded CI runners
+            assert!(elapsed.as_millis() < 1000,
+                   "Filtering 1000 documents took too long: {:?}", elapsed);
+        }
+
+        #[test]
+        fn test_sql_query_structure_expectations() {
+            // Test that our expected SQL query structure would work
+            let user_id = Uuid::new_v4();
+            let confidence_threshold = 30.0;
+
+            // Reference only: the WHERE clauses the real query is expected to use (mirrored by the checks below)
+            let _expected_where_conditions = vec![
+                "ocr_confidence IS NOT NULL",
+                "ocr_confidence < $1", // $1 = confidence_threshold
+                "user_id = $2", // $2 = user_id (for non-admin users)
+            ];
+
+            // Verify our test documents would match the expected query logic
+            let test_doc = create_test_document_with_confidence(user_id, 25.0);
+
+            // Simulate the SQL conditions
+            let confidence_not_null = test_doc.ocr_confidence.is_some();
+            let confidence_below_threshold = test_doc.ocr_confidence.unwrap() < confidence_threshold;
+            let user_matches = test_doc.user_id == user_id;
+
+            assert!(confidence_not_null);
+            assert!(confidence_below_threshold);
+            assert!(user_matches);
+
+            // This document should be included in results
+            let would_be_selected = confidence_not_null && confidence_below_threshold && user_matches;
+            assert!(would_be_selected);
+        }
+
+        #[test]
+        fn test_deletion_ordering_expectations() {
+            let user_id = Uuid::new_v4();
+
+            let documents = vec![
+                create_test_document_with_confidence(user_id, 25.0),
+                create_test_document_with_confidence(user_id, 5.0),
+                create_test_document_with_confidence(user_id, 15.0),
+                create_test_document_with_confidence(user_id, 35.0), // Above threshold
+            ];
+
+            let threshold = 30.0;
+            let mut low_confidence_docs: Vec<_> = documents.iter()
+                .filter(|doc| {
+                    doc.ocr_confidence.is_some() &&
+                    doc.ocr_confidence.unwrap() < threshold
+                })
+                .collect();
+
+            // Sort by confidence ascending (lowest first) then by creation date descending (newest first)
+            low_confidence_docs.sort_by(|a, b| {
+                let conf_a = a.ocr_confidence.unwrap();
+                let conf_b = b.ocr_confidence.unwrap();
+                conf_a.partial_cmp(&conf_b).unwrap()
+                    .then_with(|| b.created_at.cmp(&a.created_at))
+            });
+
+            assert_eq!(low_confidence_docs.len(), 3);
+            assert_eq!(low_confidence_docs[0].ocr_confidence.unwrap(), 5.0); // Lowest confidence first
+            assert_eq!(low_confidence_docs[1].ocr_confidence.unwrap(), 15.0);
+            assert_eq!(low_confidence_docs[2].ocr_confidence.unwrap(), 25.0);
+        }
+
+        #[test]
+        fn test_error_handling_scenarios() {
+            let user_id = Uuid::new_v4();
+
+            // Test invalid threshold values (these would be caught by the API handler)
+            let invalid_thresholds = vec![-1.0, 101.0, f32::NAN, f32::INFINITY];
+
+            for threshold in invalid_thresholds {
+                // The database query itself should handle these gracefully
+                // Invalid thresholds should either match no documents or be rejected
+                let test_doc = create_test_document_with_confidence(user_id, 50.0);
+
+                if threshold.is_finite() {
+                    let would_match = test_doc.ocr_confidence.is_some() &&
+                        test_doc.ocr_confidence.unwrap() < threshold;
+
+                    if threshold < 0.0 {
+                        assert!(!would_match, "Negative threshold should match no documents");
+                    }
+                    if threshold > 100.0 {
+                        // Documents with confidence > 100 shouldn't exist, but if they did,
+                        // they should still be considered for deletion if threshold > 100
+                        assert!(would_match, "Threshold > 100 should match normal documents");
+                    }
+                } else {
+                    // NaN and infinity comparisons
+                    let would_match = test_doc.ocr_confidence.is_some() &&
+                        test_doc.ocr_confidence.unwrap() < threshold;
+
+                    if threshold.is_nan() {
+                        // NaN comparisons should always be false
+                        assert!(!would_match, "NaN threshold should match no documents");
+                    } else if threshold == f32::INFINITY {
+                        // Positive infinity should match all finite numbers
+                        assert!(would_match, "Positive infinity threshold should match finite documents");
+                    } else {
+                        // Other invalid values like negative infinity
+                        assert!(!would_match, "Invalid threshold should match no documents");
+                    }
+                }
+            }
+        }
+    }
 }
\ No newline at end of file