Merge pull request #66 from readur/feat/delete-low-confidence-documents

feat(server/client): implement button deleting low confidence documen…
This commit is contained in:
Jon Fuller 2025-06-27 16:09:11 -07:00 committed by GitHub
commit ba79e8b8d3
9 changed files with 1349 additions and 4 deletions

View File

@ -28,6 +28,7 @@ import {
Snackbar,
Tabs,
Tab,
TextField,
useTheme,
} from '@mui/material';
import Grid from '@mui/material/GridLegacy';
@ -147,6 +148,12 @@ const FailedOcrPage: React.FC = () => {
message: '',
severity: 'success'
});
// Low confidence documents state
const [confidenceThreshold, setConfidenceThreshold] = useState<number>(30);
const [lowConfidenceLoading, setLowConfidenceLoading] = useState(false);
const [previewData, setPreviewData] = useState<any>(null);
const [confirmDeleteOpen, setConfirmDeleteOpen] = useState(false);
const fetchFailedDocuments = async () => {
try {
@ -297,8 +304,68 @@ const FailedOcrPage: React.FC = () => {
// Reloads the data behind whichever tab is currently selected:
// tab 0 -> failed OCR documents, tab 1 -> duplicates, tab 2 -> low confidence preview.
const refreshCurrentTab = () => {
if (currentTab === 0) {
fetchFailedDocuments();
// NOTE(review): the bare `} else {` on the next line appears to be the pre-change side of
// this diff hunk and the `else if` line after it its replacement — confirm that only the
// `else if` form exists in the real file.
} else {
} else if (currentTab === 1) {
fetchDuplicates();
} else if (currentTab === 2) {
// Refreshing the low confidence tab re-runs the preview with the current threshold.
handlePreviewLowConfidence();
}
};
// Low confidence document handlers

/**
 * Asks the server which documents fall below the current confidence threshold
 * without deleting anything (preview_only = true) and stores the result in
 * `previewData`, together with the threshold that produced it.
 */
const handlePreviewLowConfidence = async () => {
  // Remember the threshold this preview was computed for, so the delete step
  // can refuse to run against a different value.
  const requestedThreshold = confidenceThreshold;
  try {
    setLowConfidenceLoading(true);
    const response = await documentService.deleteLowConfidence(requestedThreshold, true);

    // The endpoint reports validation problems as HTTP 200 with `success: false`,
    // so a resolved promise alone does not mean the preview is usable.
    if (response.data.success === false) {
      setPreviewData(null);
      setSnackbar({
        open: true,
        message: response.data.message || 'Failed to preview low confidence documents',
        severity: 'error'
      });
      return;
    }

    setPreviewData({ ...response.data, threshold: requestedThreshold });
    setSnackbar({
      open: true,
      message: response.data.message,
      severity: 'info'
    });
  } catch (error) {
    setSnackbar({
      open: true,
      message: 'Failed to preview low confidence documents',
      severity: 'error'
    });
  } finally {
    setLowConfidenceLoading(false);
  }
};

/**
 * Permanently deletes the documents that were shown in the last preview.
 * Does nothing (beyond a warning) when there is no preview, when the preview
 * matched no documents, or when the threshold was edited after previewing.
 */
const handleDeleteLowConfidence = async () => {
  if (!previewData || previewData.matched_count === 0) {
    setSnackbar({
      open: true,
      message: 'No documents to delete',
      severity: 'warning'
    });
    return;
  }

  // The user confirmed the deletion of what the preview listed. If the
  // threshold input changed afterwards, deleting now could remove documents
  // that were never previewed, so require a fresh preview instead.
  if (previewData.threshold !== confidenceThreshold) {
    setConfirmDeleteOpen(false);
    setSnackbar({
      open: true,
      message: 'Confidence threshold changed since the last preview. Preview again before deleting.',
      severity: 'warning'
    });
    return;
  }

  try {
    setLowConfidenceLoading(true);
    const response = await documentService.deleteLowConfidence(previewData.threshold, false);
    // Same HTTP-200-with-`success: false` convention as the preview call.
    const succeeded = response.data.success !== false;
    setSnackbar({
      open: true,
      message: response.data.message,
      severity: succeeded ? 'success' : 'error'
    });
    // The preview is stale after a delete attempt either way.
    setPreviewData(null);
    setConfirmDeleteOpen(false);

    // Refresh other tabs if they have data affected
    if (currentTab === 0) {
      fetchFailedDocuments();
    }
  } catch (error) {
    setSnackbar({
      open: true,
      message: 'Failed to delete low confidence documents',
      severity: 'error'
    });
  } finally {
    setLowConfidenceLoading(false);
  }
};
@ -314,7 +381,7 @@ const FailedOcrPage: React.FC = () => {
<Box sx={{ p: 3 }}>
<Box display="flex" justifyContent="space-between" alignItems="center" mb={3}>
<Typography variant="h4" component="h1">
Failed OCR & Duplicates
Document Management
</Typography>
<Button
variant="outlined"
@ -327,7 +394,7 @@ const FailedOcrPage: React.FC = () => {
</Box>
<Paper sx={{ mb: 3 }}>
<Tabs value={currentTab} onChange={handleTabChange} aria-label="failed ocr and duplicates tabs">
<Tabs value={currentTab} onChange={handleTabChange} aria-label="document management tabs">
<Tab
icon={<ErrorIcon />}
label={`Failed OCR${statistics ? ` (${statistics.total_failed})` : ''}`}
@ -338,6 +405,11 @@ const FailedOcrPage: React.FC = () => {
label={`Duplicates${duplicateStatistics ? ` (${duplicateStatistics.total_duplicate_groups})` : ''}`}
iconPosition="start"
/>
<Tab
icon={<FindInPageIcon />}
label={`Low Confidence${previewData ? ` (${previewData.matched_count})` : ''}`}
iconPosition="start"
/>
</Tabs>
</Paper>
@ -830,6 +902,128 @@ const FailedOcrPage: React.FC = () => {
</>
)}
{/* Low Confidence Documents Tab Content */}
{currentTab === 2 && (
<>
<Alert severity="info" sx={{ mb: 3 }}>
<AlertTitle>Low Confidence Document Deletion</AlertTitle>
<Typography>
This tool allows you to delete documents with OCR confidence below a specified threshold.
Use the preview feature first to see what documents would be affected before deleting.
</Typography>
</Alert>
<Card sx={{ mb: 3 }}>
<CardContent>
<Grid container spacing={3} alignItems="center">
<Grid item xs={12} md={4}>
<TextField
label="Maximum Confidence Threshold (%)"
type="number"
value={confidenceThreshold}
onChange={(e) => setConfidenceThreshold(Math.max(0, Math.min(100, Number(e.target.value))))}
fullWidth
inputProps={{ min: 0, max: 100, step: 1 }}
helperText="Documents with confidence below this value will be deleted"
/>
</Grid>
<Grid item xs={12} md={4}>
<Button
variant="outlined"
onClick={handlePreviewLowConfidence}
disabled={lowConfidenceLoading}
startIcon={lowConfidenceLoading ? <CircularProgress size={20} /> : <FindInPageIcon />}
fullWidth
>
Preview Documents
</Button>
</Grid>
<Grid item xs={12} md={4}>
<Button
variant="contained"
color="warning"
onClick={() => setConfirmDeleteOpen(true)}
disabled={!previewData || previewData.matched_count === 0 || lowConfidenceLoading}
startIcon={<DeleteIcon />}
fullWidth
>
Delete Low Confidence Documents
</Button>
</Grid>
</Grid>
</CardContent>
</Card>
{/* Preview Results */}
{previewData && (
<Card sx={{ mb: 3 }}>
<CardContent>
<Typography variant="h6" gutterBottom>
Preview Results
</Typography>
<Typography color={previewData.matched_count > 0 ? 'warning.main' : 'success.main'}>
{previewData.message}
</Typography>
{previewData.matched_count > 0 && (
<Box sx={{ mt: 2 }}>
<Typography variant="body2" color="text.secondary">
Document IDs that would be deleted:
</Typography>
<Typography variant="body2" sx={{ fontFamily: 'monospace', wordBreak: 'break-all' }}>
{previewData.document_ids.slice(0, 10).join(', ')}
{previewData.document_ids.length > 10 && ` ... and ${previewData.document_ids.length - 10} more`}
</Typography>
</Box>
)}
</CardContent>
</Card>
)}
{/* Loading State */}
{lowConfidenceLoading && !previewData && (
<Box display="flex" justifyContent="center" alignItems="center" minHeight="200px">
<CircularProgress />
<Typography sx={{ ml: 2 }}>Processing request...</Typography>
</Box>
)}
</>
)}
{/* Confirmation Dialog */}
<Dialog
open={confirmDeleteOpen}
onClose={() => setConfirmDeleteOpen(false)}
maxWidth="sm"
fullWidth
>
<DialogTitle color="warning.main">
<DeleteIcon sx={{ mr: 1, verticalAlign: 'middle' }} />
Confirm Low Confidence Document Deletion
</DialogTitle>
<DialogContent>
<Typography>
Are you sure you want to delete {previewData?.matched_count || 0} documents with OCR confidence below {confidenceThreshold}%?
</Typography>
<Alert severity="warning" sx={{ mt: 2 }}>
This action cannot be undone. The documents and their files will be permanently deleted.
</Alert>
</DialogContent>
<DialogActions>
<Button onClick={() => setConfirmDeleteOpen(false)}>
Cancel
</Button>
<Button
onClick={handleDeleteLowConfidence}
color="warning"
variant="contained"
disabled={lowConfidenceLoading}
startIcon={lowConfidenceLoading ? <CircularProgress size={20} /> : <DeleteIcon />}
>
{lowConfidenceLoading ? 'Deleting...' : 'Delete Documents'}
</Button>
</DialogActions>
</Dialog>
{/* Document Details Dialog */}
<Dialog
open={detailsOpen}

View File

@ -23,6 +23,15 @@ vi.mock('../../services/api', () => ({
retryOcr: () => Promise.resolve({
data: { success: true, message: 'OCR retry queued successfully' }
}),
deleteLowConfidence: vi.fn(() => Promise.resolve({
data: {
success: true,
message: 'Found 0 documents with OCR confidence below 30%',
matched_count: 0,
preview: true,
document_ids: []
}
})),
},
}));
@ -55,7 +64,7 @@ describe('FailedOcrPage', () => {
// Wait for the page to load and show the title
await waitFor(() => {
expect(screen.getByText('Failed OCR & Duplicates')).toBeInTheDocument();
expect(screen.getByText('Document Management')).toBeInTheDocument();
});
});
@ -92,4 +101,224 @@ describe('FailedOcrPage', () => {
// test('handles retry OCR functionality', async () => { ... });
// test('handles API errors gracefully', async () => { ... });
// test('refreshes data when refresh button is clicked', async () => { ... });
});
describe('FailedOcrPage - Low Confidence Deletion', () => {
// Clear recorded calls on every mock before each test so call assertions
// in one test cannot be satisfied by calls made in another.
beforeEach(() => {
vi.clearAllMocks();
});
test('renders low confidence deletion tab', async () => {
  render(
    <FailedOcrPageWrapper>
      <FailedOcrPage />
    </FailedOcrPageWrapper>
  );

  // The tab strip has to be mounted before any individual tab can be found.
  await waitFor(() => expect(screen.getByRole('tablist')).toBeInTheDocument());

  // The third tab is labelled "Low Confidence".
  await waitFor(() => expect(screen.getByText(/Low Confidence/i)).toBeInTheDocument());
});
test('displays confidence threshold input when low confidence tab is active', async () => {
  render(
    <FailedOcrPageWrapper>
      <FailedOcrPage />
    </FailedOcrPageWrapper>
  );

  // Let the page finish its initial render.
  await waitFor(() => expect(screen.getByRole('tablist')).toBeInTheDocument());

  // Switch to the Low Confidence tab (third tab, index 2).
  screen.getByText(/Low Confidence/i).click();

  // The threshold field only exists inside that tab's content.
  await waitFor(() =>
    expect(screen.getByLabelText(/Maximum Confidence Threshold/i)).toBeInTheDocument()
  );
});
test('displays preview and delete buttons in low confidence tab', async () => {
  render(
    <FailedOcrPageWrapper>
      <FailedOcrPage />
    </FailedOcrPageWrapper>
  );

  // Navigate to Low Confidence tab. The click is a side effect, so it is kept
  // out of `waitFor` (whose callback may run more than once); `findByText`
  // does the waiting and the click happens exactly once afterwards.
  const lowConfidenceTab = await screen.findByText(/Low Confidence/i);
  lowConfidenceTab.click();

  // Check for action buttons
  await waitFor(() => {
    const previewButton = screen.getByText(/Preview Documents/i);
    const deleteButton = screen.getByText(/Delete Low Confidence Documents/i);
    expect(previewButton).toBeInTheDocument();
    expect(deleteButton).toBeInTheDocument();
  });
});
test('shows informational alert about low confidence deletion', async () => {
  render(
    <FailedOcrPageWrapper>
      <FailedOcrPage />
    </FailedOcrPageWrapper>
  );

  // Navigate to Low Confidence tab. Wait with `findByText`, then click once
  // outside of any retrying callback so the side effect is not repeated.
  const lowConfidenceTab = await screen.findByText(/Low Confidence/i);
  lowConfidenceTab.click();

  // Check for informational content
  await waitFor(() => {
    const alertTitle = screen.getByText(/Low Confidence Document Deletion/i);
    const alertText = screen.getByText(/This tool allows you to delete documents/i);
    expect(alertTitle).toBeInTheDocument();
    expect(alertText).toBeInTheDocument();
  });
});
// DISABLED - Interactive tests that would require complex user event simulation
// These tests would need fireEvent.change, fireEvent.click, and proper async handling
// test('calls deleteLowConfidence API when preview button is clicked', async () => {
// const mockDeleteLowConfidence = vi.mocked(documentService.deleteLowConfidence);
//
// render(<FailedOcrPageWrapper><FailedOcrPage /></FailedOcrPageWrapper>);
//
// // Navigate to tab and click preview
// const lowConfidenceTab = screen.getByText(/Low Confidence/i);
// fireEvent.click(lowConfidenceTab);
//
// const previewButton = screen.getByText(/Preview Documents/i);
// fireEvent.click(previewButton);
//
// await waitFor(() => {
// expect(mockDeleteLowConfidence).toHaveBeenCalledWith(30, true);
// });
// });
// test('validates confidence threshold input values', async () => {
// render(<FailedOcrPageWrapper><FailedOcrPage /></FailedOcrPageWrapper>);
//
// const lowConfidenceTab = screen.getByText(/Low Confidence/i);
// fireEvent.click(lowConfidenceTab);
//
// const thresholdInput = screen.getByLabelText(/Maximum Confidence Threshold/i);
//
// // Test invalid values
// fireEvent.change(thresholdInput, { target: { value: '150' } });
// expect(thresholdInput.value).toBe('100'); // Should be clamped
//
// fireEvent.change(thresholdInput, { target: { value: '-10' } });
// expect(thresholdInput.value).toBe('0'); // Should be clamped
// });
// test('shows confirmation dialog before deletion', async () => {
// const mockDeleteLowConfidence = vi.mocked(documentService.deleteLowConfidence);
// mockDeleteLowConfidence.mockResolvedValueOnce({
// data: {
// success: true,
// matched_count: 5,
// preview: true,
// document_ids: ['doc1', 'doc2', 'doc3', 'doc4', 'doc5']
// }
// });
//
// render(<FailedOcrPageWrapper><FailedOcrPage /></FailedOcrPageWrapper>);
//
// // Navigate to tab, preview, then try to delete
// const lowConfidenceTab = screen.getByText(/Low Confidence/i);
// fireEvent.click(lowConfidenceTab);
//
// const previewButton = screen.getByText(/Preview Documents/i);
// fireEvent.click(previewButton);
//
// await waitFor(() => {
// const deleteButton = screen.getByText(/Delete Low Confidence Documents/i);
// fireEvent.click(deleteButton);
// });
//
// // Should show confirmation dialog
// await waitFor(() => {
// const confirmDialog = screen.getByText(/Confirm Low Confidence Document Deletion/i);
// expect(confirmDialog).toBeInTheDocument();
// });
// });
// test('disables delete button when no preview data available', async () => {
// render(<FailedOcrPageWrapper><FailedOcrPage /></FailedOcrPageWrapper>);
//
// const lowConfidenceTab = screen.getByText(/Low Confidence/i);
// fireEvent.click(lowConfidenceTab);
//
// await waitFor(() => {
// const deleteButton = screen.getByText(/Delete Low Confidence Documents/i);
// expect(deleteButton).toBeDisabled();
// });
// });
// test('displays preview results after API call', async () => {
// const mockDeleteLowConfidence = vi.mocked(documentService.deleteLowConfidence);
// mockDeleteLowConfidence.mockResolvedValueOnce({
// data: {
// success: true,
// message: 'Found 3 documents with OCR confidence below 30%',
// matched_count: 3,
// preview: true,
// document_ids: ['doc1', 'doc2', 'doc3']
// }
// });
//
// render(<FailedOcrPageWrapper><FailedOcrPage /></FailedOcrPageWrapper>);
//
// const lowConfidenceTab = screen.getByText(/Low Confidence/i);
// fireEvent.click(lowConfidenceTab);
//
// const previewButton = screen.getByText(/Preview Documents/i);
// fireEvent.click(previewButton);
//
// await waitFor(() => {
// expect(screen.getByText(/Preview Results/i)).toBeInTheDocument();
// expect(screen.getByText(/Found 3 documents/i)).toBeInTheDocument();
// });
// });
// test('handles API errors gracefully', async () => {
// const mockDeleteLowConfidence = vi.mocked(documentService.deleteLowConfidence);
// mockDeleteLowConfidence.mockRejectedValueOnce(new Error('Network error'));
//
// render(<FailedOcrPageWrapper><FailedOcrPage /></FailedOcrPageWrapper>);
//
// const lowConfidenceTab = screen.getByText(/Low Confidence/i);
// fireEvent.click(lowConfidenceTab);
//
// const previewButton = screen.getByText(/Preview Documents/i);
// fireEvent.click(previewButton);
//
// await waitFor(() => {
// // Should show error message via snackbar or similar
// expect(screen.getByText(/Failed to preview low confidence documents/i)).toBeInTheDocument();
// });
// });
});

View File

@ -22,6 +22,7 @@ export const documentService = {
getFailedOcrDocuments: vi.fn(),
getDuplicates: vi.fn(),
retryOcr: vi.fn(),
deleteLowConfidence: vi.fn(),
}
// Re-export types that components might need

View File

@ -6,6 +6,7 @@ const mockGetOcrText = vi.fn();
const mockList = vi.fn();
const mockUpload = vi.fn();
const mockDownload = vi.fn();
const mockDeleteLowConfidence = vi.fn();
// Mock the entire api module
vi.mock('../api', async () => {
@ -17,6 +18,7 @@ vi.mock('../api', async () => {
list: mockList,
upload: mockUpload,
download: mockDownload,
deleteLowConfidence: mockDeleteLowConfidence,
},
};
});
@ -309,4 +311,183 @@ describe('OcrResponse interface', () => {
expect(ocrResponseMinimal.ocr_text).toBeNull();
expect(ocrResponseMinimal.ocr_confidence).toBeUndefined();
});
});
describe('documentService.deleteLowConfidence', () => {
  // Wraps a payload in the HTTP-200 response envelope every test below resolves with.
  function okResponse<T>(data: T) {
    return {
      data,
      status: 200,
      statusText: 'OK',
      headers: {},
      config: {},
    };
  }

  it('should delete low confidence documents successfully', async () => {
    mockDeleteLowConfidence.mockResolvedValue(okResponse({
      success: true,
      message: 'Successfully deleted 3 documents with OCR confidence below 30%',
      deleted_count: 3,
      matched_count: 3,
      successful_file_deletions: 3,
      failed_file_deletions: 0,
      ignored_file_creation_failures: 0,
      deleted_document_ids: ['doc-1', 'doc-2', 'doc-3']
    }));

    const result = await documentService.deleteLowConfidence(30.0, false);

    expect(mockDeleteLowConfidence).toHaveBeenCalledWith(30.0, false);
    expect(result.data.success).toBe(true);
    expect(result.data.deleted_count).toBe(3);
    expect(result.data.matched_count).toBe(3);
    expect(result.data.deleted_document_ids).toHaveLength(3);
  });

  it('should preview low confidence documents without deleting', async () => {
    mockDeleteLowConfidence.mockResolvedValue(okResponse({
      success: true,
      message: 'Found 5 documents with OCR confidence below 50%',
      matched_count: 5,
      preview: true,
      document_ids: ['doc-1', 'doc-2', 'doc-3', 'doc-4', 'doc-5']
    }));

    const result = await documentService.deleteLowConfidence(50.0, true);

    expect(mockDeleteLowConfidence).toHaveBeenCalledWith(50.0, true);
    expect(result.data.success).toBe(true);
    expect(result.data.preview).toBe(true);
    expect(result.data.matched_count).toBe(5);
    expect(result.data.document_ids).toHaveLength(5);
    // A preview response carries no deletion counters.
    expect(result.data).not.toHaveProperty('deleted_count');
  });

  it('should handle no matching documents', async () => {
    mockDeleteLowConfidence.mockResolvedValue(okResponse({
      success: true,
      message: 'No documents found with OCR confidence below 10%',
      deleted_count: 0
    }));

    const result = await documentService.deleteLowConfidence(10.0, false);

    expect(mockDeleteLowConfidence).toHaveBeenCalledWith(10.0, false);
    expect(result.data.success).toBe(true);
    expect(result.data.deleted_count).toBe(0);
  });

  it('should handle validation errors for invalid confidence threshold', async () => {
    // Validation failures arrive as HTTP 200 with `success: false`.
    mockDeleteLowConfidence.mockResolvedValue(okResponse({
      success: false,
      message: 'max_confidence must be between 0.0 and 100.0',
      matched_count: 0
    }));

    const result = await documentService.deleteLowConfidence(-10.0, false);

    expect(mockDeleteLowConfidence).toHaveBeenCalledWith(-10.0, false);
    expect(result.data.success).toBe(false);
    expect(result.data.message).toContain('must be between 0.0 and 100.0');
  });

  it('should handle API errors gracefully', async () => {
    // Drop calls recorded by earlier tests: (30.0, false) is also used by the
    // first test, so without this the call assertion below could pass vacuously.
    mockDeleteLowConfidence.mockClear();
    mockDeleteLowConfidence.mockRejectedValue(new Error('Network error'));

    await expect(documentService.deleteLowConfidence(30.0, false))
      .rejects.toThrow('Network error');
    expect(mockDeleteLowConfidence).toHaveBeenCalledWith(30.0, false);
  });

  it('should pass previewOnly=false (the default) through explicitly', async () => {
    mockDeleteLowConfidence.mockResolvedValue(okResponse({ success: true, matched_count: 0 }));

    // Test with explicit false value (the default)
    await documentService.deleteLowConfidence(40.0, false);

    expect(mockDeleteLowConfidence).toHaveBeenCalledWith(40.0, false);
  });

  it('should handle partial deletion failures', async () => {
    mockDeleteLowConfidence.mockResolvedValue(okResponse({
      success: true,
      message: 'Successfully deleted 2 documents with OCR confidence below 25%',
      deleted_count: 2,
      matched_count: 3,
      successful_file_deletions: 1,
      failed_file_deletions: 1,
      ignored_file_creation_failures: 1,
      deleted_document_ids: ['doc-1', 'doc-2']
    }));

    const result = await documentService.deleteLowConfidence(25.0, false);

    expect(result.data.success).toBe(true);
    expect(result.data.deleted_count).toBe(2);
    expect(result.data.matched_count).toBe(3);
    expect(result.data.failed_file_deletions).toBe(1);
    expect(result.data.ignored_file_creation_failures).toBe(1);
  });

  it('should properly encode confidence threshold values', async () => {
    mockDeleteLowConfidence.mockResolvedValue(okResponse({ success: true, matched_count: 0 }));

    // Test various confidence values
    const testValues = [0.0, 0.1, 30.5, 50.0, 99.9, 100.0];

    for (const confidence of testValues) {
      // Clear per iteration so each assertion checks only the call just made.
      mockDeleteLowConfidence.mockClear();
      await documentService.deleteLowConfidence(confidence, true);
      expect(mockDeleteLowConfidence).toHaveBeenCalledWith(confidence, true);
    }
  });
});

View File

@ -241,6 +241,13 @@ export const documentService = {
data: { document_ids: documentIds }
})
},
// POSTs the threshold to /documents/delete-low-confidence. With previewOnly = true
// the request is sent as a preview; the default (false) requests the actual deletion.
deleteLowConfidence: (maxConfidence: number, previewOnly: boolean = false) =>
  api.post('/documents/delete-low-confidence', {
    max_confidence: maxConfidence,
    preview_only: previewOnly,
  }),
}
export interface OcrStatusResponse {

View File

@ -1510,6 +1510,82 @@ impl Database {
Ok(deleted_documents)
}
/// Returns every document whose `ocr_confidence` is set and strictly below
/// `max_confidence`, ordered by ascending confidence and then newest first.
///
/// Admins get matching documents of all users; any other role only gets
/// documents owned by `user_id`. Documents without a recorded confidence are
/// never returned.
///
/// # Errors
/// Propagates any error from the underlying query.
pub async fn find_documents_by_confidence_threshold(
    &self,
    max_confidence: f32,
    user_id: uuid::Uuid,
    user_role: crate::models::UserRole,
) -> Result<Vec<Document>> {
    // The two branches differ only in the ownership filter; both select the
    // same columns so the row mapping below can be shared.
    let rows = if user_role == crate::models::UserRole::Admin {
        sqlx::query(
            r#"
            SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
            FROM documents
            WHERE ocr_confidence IS NOT NULL AND ocr_confidence < $1
            ORDER BY ocr_confidence ASC, created_at DESC
            "#,
        )
        .bind(max_confidence)
        .fetch_all(&self.pool)
        .await?
    } else {
        sqlx::query(
            r#"
            SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
            FROM documents
            WHERE ocr_confidence IS NOT NULL AND ocr_confidence < $1 AND user_id = $2
            ORDER BY ocr_confidence ASC, created_at DESC
            "#,
        )
        .bind(max_confidence)
        .bind(user_id)
        .fetch_all(&self.pool)
        .await?
    };

    let documents = rows
        .into_iter()
        .map(|r| Document {
            id: r.get("id"),
            filename: r.get("filename"),
            original_filename: r.get("original_filename"),
            file_path: r.get("file_path"),
            file_size: r.get("file_size"),
            mime_type: r.get("mime_type"),
            content: r.get("content"),
            ocr_text: r.get("ocr_text"),
            ocr_confidence: r.get("ocr_confidence"),
            ocr_word_count: r.get("ocr_word_count"),
            ocr_processing_time_ms: r.get("ocr_processing_time_ms"),
            ocr_status: r.get("ocr_status"),
            ocr_error: r.get("ocr_error"),
            ocr_completed_at: r.get("ocr_completed_at"),
            tags: r.get("tags"),
            created_at: r.get("created_at"),
            updated_at: r.get("updated_at"),
            user_id: r.get("user_id"),
            file_hash: r.get("file_hash"),
        })
        .collect();

    Ok(documents)
}
pub async fn count_documents_for_source(&self, source_id: Uuid) -> Result<(i64, i64)> {
let row = sqlx::query(
r#"
@ -1563,5 +1639,6 @@ impl Database {
.collect();
Ok(results)
}
}

View File

@ -31,6 +31,12 @@ pub struct BulkDeleteRequest {
pub document_ids: Vec<uuid::Uuid>,
}
// Request body of `POST /api/documents/delete-low-confidence`.
// (Plain `//` comments are used so the generated OpenAPI schema is unchanged.)
#[derive(Deserialize, Serialize, ToSchema)]
pub struct DeleteLowConfidenceRequest {
// Confidence threshold in percent. The handler rejects values outside 0.0..=100.0
// and matches documents whose OCR confidence is strictly below this value.
pub max_confidence: f32,
// `Some(true)` asks for a preview only; the handler treats `None` like `Some(false)`,
// i.e. an omitted field means the documents are actually deleted.
pub preview_only: Option<bool>,
}
pub fn router() -> Router<Arc<AppState>> {
Router::new()
.route("/", post(upload_document))
@ -46,6 +52,7 @@ pub fn router() -> Router<Arc<AppState>> {
.route("/{id}/retry-ocr", post(retry_ocr))
.route("/failed-ocr", get(get_failed_ocr_documents))
.route("/duplicates", get(get_user_duplicates))
.route("/delete-low-confidence", post(delete_low_confidence_documents))
}
#[utoipa::path(
@ -1017,4 +1024,116 @@ pub async fn bulk_delete_documents(
"ignored_file_creation_failures": ignored_file_creation_failures,
"deleted_document_ids": deleted_documents.iter().map(|d| d.id).collect::<Vec<_>>()
})))
}
#[utoipa::path(
post,
path = "/api/documents/delete-low-confidence",
request_body = DeleteLowConfidenceRequest,
responses(
(status = 200, description = "Low confidence documents operation result"),
(status = 401, description = "Unauthorized"),
(status = 500, description = "Internal server error")
),
security(
("bearer_auth" = [])
),
tag = "documents"
)]
pub async fn delete_low_confidence_documents(
State(state): State<Arc<AppState>>,
auth_user: AuthUser,
Json(request): Json<DeleteLowConfidenceRequest>,
) -> Result<Json<serde_json::Value>, StatusCode> {
if request.max_confidence < 0.0 || request.max_confidence > 100.0 {
return Ok(Json(serde_json::json!({
"success": false,
"message": "max_confidence must be between 0.0 and 100.0",
"matched_count": 0
})));
}
let is_preview = request.preview_only.unwrap_or(false);
// Find documents with confidence below threshold
let matched_documents = state
.db
.find_documents_by_confidence_threshold(request.max_confidence, auth_user.user.id, auth_user.user.role)
.await
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
let matched_count = matched_documents.len();
if is_preview {
return Ok(Json(serde_json::json!({
"success": true,
"message": format!("Found {} documents with OCR confidence below {}%", matched_count, request.max_confidence),
"matched_count": matched_count,
"preview": true,
"document_ids": matched_documents.iter().map(|d| d.id).collect::<Vec<_>>()
})));
}
if matched_documents.is_empty() {
return Ok(Json(serde_json::json!({
"success": true,
"message": format!("No documents found with OCR confidence below {}%", request.max_confidence),
"deleted_count": 0
})));
}
// Extract document IDs for bulk deletion
let document_ids: Vec<uuid::Uuid> = matched_documents.iter().map(|d| d.id).collect();
// Use existing bulk delete logic
let deleted_documents = state
.db
.bulk_delete_documents(&document_ids, auth_user.user.id, auth_user.user.role)
.await
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
// Create ignored file records for all successfully deleted documents
let mut ignored_file_creation_failures = 0;
for document in &deleted_documents {
if let Err(e) = crate::db::ignored_files::create_ignored_file_from_document(
state.db.get_pool(),
document.id,
auth_user.user.id,
Some(format!("deleted due to low OCR confidence ({}%)",
document.ocr_confidence.unwrap_or(0.0))),
None,
None,
None,
).await {
ignored_file_creation_failures += 1;
tracing::warn!("Failed to create ignored file record for document {}: {}", document.id, e);
}
}
let file_service = FileService::new(state.config.upload_path.clone());
let mut successful_file_deletions = 0;
let mut failed_file_deletions = 0;
for document in &deleted_documents {
match file_service.delete_document_files(document).await {
Ok(_) => successful_file_deletions += 1,
Err(e) => {
failed_file_deletions += 1;
tracing::warn!("Failed to delete files for document {}: {}", document.id, e);
}
}
}
let deleted_count = deleted_documents.len();
Ok(Json(serde_json::json!({
"success": true,
"message": format!("Successfully deleted {} documents with OCR confidence below {}%", deleted_count, request.max_confidence),
"deleted_count": deleted_count,
"matched_count": matched_count,
"successful_file_deletions": successful_file_deletions,
"failed_file_deletions": failed_file_deletions,
"ignored_file_creation_failures": ignored_file_creation_failures,
"deleted_document_ids": deleted_documents.iter().map(|d| d.id).collect::<Vec<_>>()
})))
}

View File

@ -367,4 +367,270 @@ mod document_routes_deletion_tests {
assert!(!unauthorized_error.contains("403"));
assert!(!validation_error.contains("serde"));
}
// Low confidence deletion tests
mod low_confidence_deletion_tests {
use super::*;
use crate::routes::documents::DeleteLowConfidenceRequest;
/// Builds a completed-OCR PDF fixture owned by `user_id` whose names and path
/// embed `confidence` and whose `ocr_confidence` is `Some(confidence)`.
fn create_low_confidence_document(user_id: Uuid, confidence: f32) -> Document {
    let name = format!("low_conf_{}.pdf", confidence);
    let path = format!("/uploads/{}", name);

    Document {
        id: Uuid::new_v4(),
        filename: name.clone(),
        original_filename: name,
        file_path: path,
        file_size: 1024,
        mime_type: String::from("application/pdf"),
        content: Some(String::from("Test document content")),
        ocr_text: Some(String::from("Low quality OCR text")),
        ocr_confidence: Some(confidence),
        ocr_word_count: Some(10),
        ocr_processing_time_ms: Some(500),
        ocr_status: Some(String::from("completed")),
        ocr_error: None,
        ocr_completed_at: Some(Utc::now()),
        tags: vec![String::from("low-confidence")],
        created_at: Utc::now(),
        updated_at: Utc::now(),
        user_id,
        file_hash: Some(String::from("abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890")),
    }
}
#[test]
fn test_delete_low_confidence_request_serialization() {
    // Both fields present.
    let full: DeleteLowConfidenceRequest = serde_json::from_value(json!({
        "max_confidence": 50.0,
        "preview_only": true
    }))
    .expect("request with both fields should deserialize");
    assert_eq!(full.max_confidence, 50.0);
    assert_eq!(full.preview_only, Some(true));

    // `preview_only` omitted: it must come back as `None`.
    let minimal: DeleteLowConfidenceRequest = serde_json::from_value(json!({
        "max_confidence": 30.0
    }))
    .expect("request with only max_confidence should deserialize");
    assert_eq!(minimal.max_confidence, 30.0);
    assert_eq!(minimal.preview_only, None);
}
#[test]
fn test_delete_low_confidence_request_validation() {
    // Out-of-range thresholds still deserialize; the range check happens in the handler.
    let out_of_range_bodies = [
        json!({ "max_confidence": -10.0, "preview_only": false }),
        json!({ "max_confidence": 150.0, "preview_only": false }),
    ];

    for body in out_of_range_bodies {
        let parsed = serde_json::from_value::<DeleteLowConfidenceRequest>(body);
        assert!(parsed.is_ok());
    }
}
#[test]
fn test_confidence_threshold_logic() {
    let user = create_test_user(UserRole::User);

    // Create documents with various confidence levels
    let high_confidence_doc = create_low_confidence_document(user.id, 95.0);
    let medium_confidence_doc = create_low_confidence_document(user.id, 60.0);
    let low_confidence_doc = create_low_confidence_document(user.id, 25.0);
    let very_low_confidence_doc = create_low_confidence_document(user.id, 5.0);

    let documents = vec![
        &high_confidence_doc,
        &medium_confidence_doc,
        &low_confidence_doc,
        &very_low_confidence_doc,
    ];

    // Mirrors the query's `ocr_confidence IS NOT NULL AND ocr_confidence < $1`:
    // a document without a confidence never matches (no `unwrap_or(0.0)`
    // fallback that would count it as 0%).
    let count_below = |threshold: f32| {
        documents
            .iter()
            .filter(|doc| matches!(doc.ocr_confidence, Some(c) if c < threshold))
            .count()
    };

    assert_eq!(count_below(50.0), 2); // 25.0 and 5.0
    assert_eq!(count_below(30.0), 2); // 25.0 and 5.0
    assert_eq!(count_below(10.0), 1); // 5.0
}
#[test]
fn test_user_role_authorization_for_low_confidence_deletion() {
    let first_user = create_test_user(UserRole::User);
    let second_user = create_test_user(UserRole::User);
    let admin = create_test_user(UserRole::Admin);

    let first_doc = create_low_confidence_document(first_user.id, 25.0);
    let second_doc = create_low_confidence_document(second_user.id, 15.0);

    // Each regular user owns exactly their own document.
    assert_eq!(first_doc.user_id, first_user.id);
    assert_ne!(first_doc.user_id, second_user.id);
    assert_eq!(second_doc.user_id, second_user.id);
    assert_ne!(second_doc.user_id, first_user.id);

    // An admin qualifies for any document, either as owner or through the role.
    let is_admin = admin.role == UserRole::Admin;
    assert!(first_doc.user_id == admin.id || is_admin);
    assert!(second_doc.user_id == admin.id || is_admin);
}
/// Exercises the selection predicate ("has a confidence AND it is strictly
/// below the threshold") on the three boundary cases: no confidence at all,
/// confidence exactly at the threshold, and confidence just below it.
#[test]
fn test_edge_cases_for_confidence_values() {
    let user = create_test_user(UserRole::User);
    let threshold = 30.0;

    // Document without any OCR confidence (should not be included).
    let mut no_confidence_doc = create_low_confidence_document(user.id, 0.0);
    no_confidence_doc.ocr_confidence = None;

    // Document with exactly the threshold confidence (should not be included).
    let exact_threshold_doc = create_low_confidence_document(user.id, 30.0);

    // Document just below the threshold (should be included).
    let just_below_doc = create_low_confidence_document(user.id, 29.9);

    // Missing confidence: the predicate itself must reject the document,
    // not merely observe that the field is empty.
    assert!(no_confidence_doc.ocr_confidence.is_none());
    assert!(!matches!(no_confidence_doc.ocr_confidence, Some(c) if c < threshold));

    // Exact threshold: equal is not "less than", so it is excluded.
    assert_eq!(exact_threshold_doc.ocr_confidence, Some(threshold));
    assert!(!matches!(exact_threshold_doc.ocr_confidence, Some(c) if c < threshold));

    // Just below the threshold: included.
    assert!(matches!(just_below_doc.ocr_confidence, Some(c) if c < threshold));
}
/// Checks how the optional `preview_only` flag of
/// `DeleteLowConfidenceRequest` resolves when read with
/// `unwrap_or(false)`: an explicit `true` enables preview, while an
/// explicit `false` or an absent flag both mean "really delete".
///
/// Only request values are inspected, so no user or document fixtures are
/// created here.
#[test]
fn test_preview_mode_behavior() {
    let preview_request = DeleteLowConfidenceRequest {
        max_confidence: 30.0,
        preview_only: Some(true),
    };
    let delete_request = DeleteLowConfidenceRequest {
        max_confidence: 30.0,
        preview_only: Some(false),
    };
    let no_preview_request = DeleteLowConfidenceRequest {
        max_confidence: 30.0,
        preview_only: None,
    };

    // Preview mode should be on when explicitly requested.
    assert!(preview_request.preview_only.unwrap_or(false));

    // Preview mode should be off when explicitly disabled.
    assert!(!delete_request.preview_only.unwrap_or(false));

    // Preview mode should default to off when not specified.
    assert!(!no_preview_request.preview_only.unwrap_or(false));
}
/// Documents the JSON shapes expected for the preview and delete responses
/// and checks the types of their key fields. The payloads are literals
/// built in this test; no handler is invoked.
#[test]
fn test_response_format_expectations() {
    // Shape expected when the request only previews the matching documents.
    let preview_payload = json!({
        "success": true,
        "message": "Found 5 documents with OCR confidence below 30%",
        "matched_count": 5,
        "preview": true,
        "document_ids": ["uuid1", "uuid2", "uuid3", "uuid4", "uuid5"]
    });

    // Shape expected when the documents are actually deleted.
    let delete_payload = json!({
        "success": true,
        "message": "Successfully deleted 5 documents with OCR confidence below 30%",
        "deleted_count": 5,
        "matched_count": 5,
        "successful_file_deletions": 5,
        "failed_file_deletions": 0,
        "ignored_file_creation_failures": 0,
        "deleted_document_ids": ["uuid1", "uuid2", "uuid3", "uuid4", "uuid5"]
    });

    // Both payloads are JSON objects carrying a boolean success flag.
    for payload in vec![&preview_payload, &delete_payload] {
        assert!(payload.is_object());
        assert!(payload["success"].is_boolean());
    }

    // Preview-specific fields.
    assert!(preview_payload["matched_count"].is_number());
    assert!(preview_payload["document_ids"].is_array());

    // Delete-specific fields.
    assert!(delete_payload["deleted_count"].is_number());
    assert!(delete_payload["deleted_document_ids"].is_array());
}
/// Covers request values that sit outside, or at the edge of, the valid
/// 0-100 confidence range.
///
/// The out-of-range check here mirrors the rule the handler is expected to
/// enforce; the handler itself is not called. The assertion reads the value
/// back from the constructed request so the request is actually exercised.
#[test]
fn test_error_scenarios() {
    // Confidence values that fall outside the valid range.
    let invalid_confidence_cases = vec![
        (-1.0, "negative confidence"),
        (101.0, "confidence over 100"),
        (150.5, "way over 100"),
    ];

    for (confidence, description) in invalid_confidence_cases {
        let request = DeleteLowConfidenceRequest {
            max_confidence: confidence,
            preview_only: Some(false),
        };

        // Validation logic should catch these in the handler.
        assert!(
            request.max_confidence < 0.0 || request.max_confidence > 100.0,
            "Should be invalid: {}",
            description
        );
    }

    // Empty result scenario: with a strict "less than" comparison, a 0%
    // threshold cannot match any document whose confidence is non-negative.
    let request = DeleteLowConfidenceRequest {
        max_confidence: 0.0,
        preview_only: Some(true),
    };
    assert_eq!(request.max_confidence, 0.0);
}
}
}

View File

@ -1525,4 +1525,275 @@ mod deletion_error_handling_tests {
// If transaction were to be rolled back, document would exist again
// This test verifies the transaction was committed properly
}
mod low_confidence_deletion_db_tests {
use super::*;
use crate::models::UserRole;
/// Builds an in-memory `Document` fixture owned by `user_id` whose OCR
/// confidence is `Some(confidence)`.
///
/// The file name, original file name and upload path embed the confidence
/// value; every other field is a fixed placeholder. The three timestamps
/// are taken from separate `Utc::now()` calls.
#[cfg(test)]
fn create_test_document_with_confidence(user_id: Uuid, confidence: f32) -> Document {
    // Values computed at call time, bound in the order they are produced.
    let id = Uuid::new_v4();
    let filename = format!("test_conf_{}.pdf", confidence);
    let original_filename = format!("test_conf_{}.pdf", confidence);
    let file_path = format!("/uploads/test_conf_{}.pdf", confidence);
    let ocr_completed_at = Some(Utc::now());
    let created_at = Utc::now();
    let updated_at = Utc::now();

    Document {
        id,
        filename,
        original_filename,
        file_path,
        file_size: 1024,
        mime_type: "application/pdf".to_string(),
        content: Some("Test document content".to_string()),
        ocr_text: Some("Test OCR text".to_string()),
        ocr_confidence: Some(confidence),
        ocr_word_count: Some(50),
        ocr_processing_time_ms: Some(1000),
        ocr_status: Some("completed".to_string()),
        ocr_error: None,
        ocr_completed_at,
        tags: vec!["test".to_string()],
        created_at,
        updated_at,
        user_id,
        file_hash: Some("test_hash_123456789abcdef123456789abcdef123456789abcdef123456789abcdef".to_string()),
    }
}
/// Filters six fixture documents with a 30% threshold and expects exactly
/// the three below it, in their original order (25, 15, 5).
#[test]
fn test_confidence_filtering_logic() {
    let owner = Uuid::new_v4();
    let threshold = 30.0;

    // 95, 75 and 45 stay; 25, 15 and 5 fall below the 30% threshold.
    let corpus: Vec<_> = vec![95.0, 75.0, 45.0, 25.0, 15.0, 5.0]
        .into_iter()
        .map(|confidence| create_test_document_with_confidence(owner, confidence))
        .collect();

    let selected: Vec<_> = corpus
        .iter()
        .filter(|doc| matches!(doc.ocr_confidence, Some(value) if value < threshold))
        .collect();

    assert_eq!(selected.len(), 3);
    for (doc, expected) in selected.iter().zip(vec![25.0, 15.0, 5.0]) {
        assert_eq!(doc.ocr_confidence.unwrap(), expected);
    }
}
/// A document with no OCR confidence must not be selected, even when an
/// otherwise identical document with a low confidence is.
#[test]
fn test_documents_without_ocr_confidence_excluded() {
    let owner = Uuid::new_v4();
    let threshold = 30.0;

    // Same fixture twice; the first one has its confidence cleared.
    let mut unscored = create_test_document_with_confidence(owner, 20.0);
    unscored.ocr_confidence = None;
    let scored = create_test_document_with_confidence(owner, 20.0);

    let corpus = vec![unscored, scored];
    let selected: Vec<_> = corpus
        .iter()
        .filter(|doc| doc.ocr_confidence.map_or(false, |value| value < threshold))
        .collect();

    // Only the document that carries a confidence is selected.
    assert_eq!(selected.len(), 1);
    assert!(selected[0].ocr_confidence.is_some());
}
/// Checks the access rule "requester owns the document OR requester is an
/// admin" for a regular user and for an admin.
///
/// The admin gets an id of its own that owns neither document, so the admin
/// assertions can only pass through the role check and not through
/// accidental ownership.
#[test]
fn test_user_role_authorization_in_filtering() {
    let user1_id = Uuid::new_v4();
    let user2_id = Uuid::new_v4();
    let admin_id = Uuid::new_v4();

    let user1_doc = create_test_document_with_confidence(user1_id, 20.0);
    let user2_doc = create_test_document_with_confidence(user2_id, 15.0);

    // Single definition of the rule, applied to every requester below.
    let can_access = |doc: &Document, requester_id: Uuid, role: &UserRole| {
        doc.user_id == requester_id || *role == UserRole::Admin
    };

    // A regular user reaches only their own document.
    assert!(can_access(&user1_doc, user1_id, &UserRole::User));
    assert!(!can_access(&user2_doc, user1_id, &UserRole::User));

    // The admin owns neither document...
    assert_ne!(user1_doc.user_id, admin_id);
    assert_ne!(user2_doc.user_id, admin_id);

    // ...yet reaches both through the admin role.
    assert!(can_access(&user1_doc, admin_id, &UserRole::Admin));
    assert!(can_access(&user2_doc, admin_id, &UserRole::Admin));
}
/// Table-driven check of the strict "less than" comparison around the 10%,
/// 30% and 100% thresholds. Each row is
/// `(document confidence, threshold, expected to be selected)`.
#[test]
fn test_boundary_conditions_for_confidence_thresholds() {
    let owner = Uuid::new_v4();

    let boundary_table = vec![
        (0.0, 10.0, true),     // below the 10% threshold
        (10.0, 10.0, false),   // equal to the threshold is not "less than"
        (10.1, 10.0, false),   // above the 10% threshold
        (29.9, 30.0, true),    // below the 30% threshold
        (30.0, 30.0, false),   // equal to the threshold is not "less than"
        (30.1, 30.0, false),   // above the 30% threshold
        (99.9, 100.0, true),   // below the 100% threshold
        (100.0, 100.0, false), // equal to the threshold is not "less than"
    ];

    boundary_table
        .into_iter()
        .for_each(|(doc_confidence, threshold, should_be_included)| {
            let doc = create_test_document_with_confidence(owner, doc_confidence);
            let is_included = matches!(doc.ocr_confidence, Some(value) if value < threshold);

            assert_eq!(
                is_included, should_be_included,
                "Document with {}% confidence vs {}% threshold",
                doc_confidence, threshold
            );
        });
}
/// Filters 1000 in-memory fixture documents (confidence 0.0 to 99.9 in steps
/// of 0.1) with a 50% threshold and expects exactly the lower half.
///
/// The elapsed-time check is only a guard against pathological slowdowns.
/// The bound is deliberately generous because wall-clock time in a unit
/// test depends on build profile and machine load, and a tight bound makes
/// the test flaky.
#[test]
fn test_performance_considerations_for_large_datasets() {
    let user_id = Uuid::new_v4();

    // Build the fixtures outside the timed section.
    let documents: Vec<_> = (0..1000)
        .map(|i| create_test_document_with_confidence(user_id, (i as f32) / 10.0))
        .collect();

    let threshold = 50.0;

    let start_time = std::time::Instant::now();
    let low_confidence_docs: Vec<_> = documents
        .iter()
        .filter(|doc| matches!(doc.ocr_confidence, Some(confidence) if confidence < threshold))
        .collect();
    let elapsed = start_time.elapsed();

    // Verify the filtering works correctly for large datasets.
    assert_eq!(low_confidence_docs.len(), 500); // 0.0 to 49.9

    // Sanity bound only: filtering 1000 in-memory documents should be far
    // below one second.
    assert!(
        elapsed < std::time::Duration::from_secs(1),
        "Filtering 1000 documents took too long: {:?}",
        elapsed
    );
}
/// Simulates, in memory, the WHERE conditions the deletion query is expected
/// to use and checks that a matching fixture document satisfies all of them.
///
/// Each SQL fragment is paired with its in-memory equivalent so a failure
/// names the condition that did not hold. No SQL is executed here.
#[test]
fn test_sql_query_structure_expectations() {
    let user_id = Uuid::new_v4();
    let confidence_threshold = 30.0;

    let test_doc = create_test_document_with_confidence(user_id, 25.0);

    // (expected SQL condition, result of simulating it on the fixture)
    let simulated_conditions = vec![
        (
            "ocr_confidence IS NOT NULL",
            test_doc.ocr_confidence.is_some(),
        ),
        (
            "ocr_confidence < $1", // $1 = confidence_threshold
            matches!(test_doc.ocr_confidence, Some(c) if c < confidence_threshold),
        ),
        (
            "user_id = $2", // $2 = user_id (for non-admin users)
            test_doc.user_id == user_id,
        ),
    ];

    for (sql_condition, holds) in &simulated_conditions {
        assert!(
            *holds,
            "Simulated SQL condition `{}` should hold for the test document",
            sql_condition
        );
    }

    // The conditions are combined with AND, so the document is selected.
    let would_be_selected = simulated_conditions.iter().all(|(_, holds)| *holds);
    assert!(would_be_selected);
}
/// Checks the expected processing order of the selected documents: lowest
/// confidence first, with ties broken by newest `created_at` first.
///
/// The fixtures all have distinct confidences, so only the primary ordering
/// is observable in the assertions.
#[test]
fn test_deletion_ordering_expectations() {
    let user_id = Uuid::new_v4();

    let documents = vec![
        create_test_document_with_confidence(user_id, 25.0),
        create_test_document_with_confidence(user_id, 5.0),
        create_test_document_with_confidence(user_id, 15.0),
        create_test_document_with_confidence(user_id, 35.0), // Above threshold
    ];

    let threshold = 30.0;

    let mut low_confidence_docs: Vec<_> = documents
        .iter()
        .filter(|doc| matches!(doc.ocr_confidence, Some(confidence) if confidence < threshold))
        .collect();

    // Sort by confidence ascending (lowest first), then by creation date
    // descending (newest first).
    low_confidence_docs.sort_by(|a, b| {
        a.ocr_confidence
            .partial_cmp(&b.ocr_confidence)
            .expect("fixture confidences are finite and therefore comparable")
            .then_with(|| b.created_at.cmp(&a.created_at))
    });

    assert_eq!(low_confidence_docs.len(), 3);
    assert_eq!(low_confidence_docs[0].ocr_confidence.unwrap(), 5.0); // Lowest confidence first
    assert_eq!(low_confidence_docs[1].ocr_confidence.unwrap(), 15.0);
    assert_eq!(low_confidence_docs[2].ocr_confidence.unwrap(), 25.0);
}
/// Documents how the in-memory "strictly below threshold" predicate behaves
/// for thresholds outside the valid 0-100 range, including NaN and both
/// infinities, against a document with 50% confidence.
///
/// These values would normally be rejected by the API handler; this test
/// only pins the comparison semantics.
#[test]
fn test_error_handling_scenarios() {
    let user_id = Uuid::new_v4();

    // The fixture does not depend on the threshold, so build it once.
    let test_doc = create_test_document_with_confidence(user_id, 50.0);

    // Negative infinity is included so the "other non-finite" case is
    // actually exercised.
    let invalid_thresholds = vec![
        -1.0,
        101.0,
        f32::NAN,
        f32::INFINITY,
        f32::NEG_INFINITY,
    ];

    for threshold in invalid_thresholds {
        let would_match = matches!(test_doc.ocr_confidence, Some(c) if c < threshold);

        if threshold.is_nan() {
            // Any comparison against NaN is false.
            assert!(!would_match, "NaN threshold should match no documents");
        } else if threshold == f32::INFINITY {
            // Every finite confidence is below positive infinity.
            assert!(would_match, "Positive infinity threshold should match finite documents");
        } else if threshold == f32::NEG_INFINITY {
            // Nothing is below negative infinity.
            assert!(!would_match, "Invalid threshold should match no documents");
        } else if threshold < 0.0 {
            assert!(!would_match, "Negative threshold should match no documents");
        } else if threshold > 100.0 {
            // A threshold above 100 is larger than any normal confidence,
            // so ordinary documents fall below it.
            assert!(would_match, "Threshold > 100 should match normal documents");
        }
    }
}
}
}