diff --git a/frontend/src/components/DocumentList.tsx b/frontend/src/components/DocumentList.tsx index 6281fd7..3f546cc 100644 --- a/frontend/src/components/DocumentList.tsx +++ b/frontend/src/components/DocumentList.tsx @@ -96,16 +96,14 @@ function DocumentList({ documents, loading }: DocumentListProps) { } const getOcrMetrics = (document: Document) => { - if (!document.has_ocr_text || !document.ocr_word_count) { + if (!document.has_ocr_text || document.ocr_word_count == null) { return null } const metrics = [] - - if (document.ocr_word_count) { - metrics.push(`${document.ocr_word_count} words`) - } - + + metrics.push(`${document.ocr_word_count} words`) + if (document.ocr_processing_time_ms) { const seconds = (document.ocr_processing_time_ms / 1000).toFixed(1) metrics.push(`${seconds}s`) diff --git a/frontend/src/components/__tests__/DocumentList.test.tsx b/frontend/src/components/__tests__/DocumentList.test.tsx new file mode 100644 index 0000000..0550257 --- /dev/null +++ b/frontend/src/components/__tests__/DocumentList.test.tsx @@ -0,0 +1,269 @@ +import { describe, it, expect, vi } from 'vitest'; +import { render, screen } from '@testing-library/react'; +import DocumentList from '../DocumentList'; +import type { Document } from '../../services/api'; + +// Mock the documentService to prevent actual download attempts +vi.mock('../../services/api', () => ({ + documentService: { + download: vi.fn().mockResolvedValue({ data: new Blob() }) + } +})); + +// Mock window.URL methods for download functionality +global.URL.createObjectURL = vi.fn(() => 'mock-object-url'); +global.URL.revokeObjectURL = vi.fn(); + +describe('DocumentList - OCR Metrics Display', () => { + /** + * Helper function to create a mock document with sensible defaults + * All OCR-related fields can be overridden via the overrides parameter + */ + const createMockDocument = (overrides: Partial = {}): Document => ({ + id: 'test-id-1', + user_id: 'user-123', + filename: 'test-document.pdf', + 
original_filename: 'test-document.pdf', + file_path: '/documents/test-document.pdf', + mime_type: 'application/pdf', + file_size: 1024000, // 1MB + tags: [], + created_at: '2024-01-01T00:00:00Z', + updated_at: '2024-01-01T00:00:00Z', + has_ocr_text: true, + ...overrides, + }); + + /** + * Test Case 1: Document with 0 word count shows "0 words" + * + * This is the primary bug fix test case. Previously, when ocr_word_count was 0, + * the condition `!document.ocr_word_count` evaluated to true (since 0 is falsy), + * causing the function to return null and display nothing instead of "0 words". + * + * After the fix, we now explicitly check `document.ocr_word_count == null`, + * which correctly allows 0 to pass through and be displayed. + */ + it('should display "0 words" when ocr_word_count is 0', () => { + const document = createMockDocument({ + ocr_word_count: 0, + has_ocr_text: true, + }); + + render(); + + // Verify that "0 words" is rendered in the document list + expect(screen.getByText(/0 words/i)).toBeInTheDocument(); + }); + + /** + * Test Case 2: Document with null word count shows no metrics + * + * When ocr_word_count is explicitly null, it indicates that OCR word counting + * has not been performed or is unavailable. In this case, no OCR metrics + * should be displayed. + */ + it('should not display OCR metrics when ocr_word_count is null', () => { + const document = createMockDocument({ + ocr_word_count: null, + has_ocr_text: true, + }); + + render(); + + // Verify that word count is not rendered + expect(screen.queryByText(/words/i)).not.toBeInTheDocument(); + }); + + /** + * Test Case 3: Document with undefined word count shows no metrics + * + * When ocr_word_count is undefined, it indicates the field was not provided. + * This should behave the same as null - no OCR metrics displayed. + * The == null check handles both null and undefined. 
+ */ + it('should not display OCR metrics when ocr_word_count is undefined', () => { + const document = createMockDocument({ + ocr_word_count: undefined, + has_ocr_text: true, + }); + + render(); + + // Verify that word count is not rendered + expect(screen.queryByText(/words/i)).not.toBeInTheDocument(); + }); + + /** + * Test Case 4: Document with valid word count shows correctly + * + * Standard case where OCR has been performed and produced a meaningful + * word count. This verifies normal operation with typical values. + */ + it('should display correct word count when ocr_word_count has a valid number', () => { + const document = createMockDocument({ + ocr_word_count: 290, + has_ocr_text: true, + }); + + render(); + + // Verify that "290 words" is rendered correctly + expect(screen.getByText(/290 words/i)).toBeInTheDocument(); + }); + + /** + * Test Case 5: Document without OCR text shows no metrics + * + * When has_ocr_text is false, it indicates that OCR has not been performed + * on this document at all. No OCR metrics should be displayed regardless + * of what ocr_word_count contains. + */ + it('should not display OCR metrics when has_ocr_text is false', () => { + const document = createMockDocument({ + has_ocr_text: false, + ocr_word_count: 100, // Even with a word count, it shouldn't show + }); + + render(); + + // Verify that word count is not rendered when OCR is not available + expect(screen.queryByText(/words/i)).not.toBeInTheDocument(); + }); + + /** + * Test Case 6: Document with processing time shows both metrics + * + * When both word count and processing time are available, both metrics + * should be displayed with proper formatting (processing time converted + * from milliseconds to seconds with 1 decimal place). 
+ */ + it('should display both word count and processing time when available', () => { + const document = createMockDocument({ + ocr_word_count: 100, + ocr_processing_time_ms: 1500, // 1.5 seconds + has_ocr_text: true, + }); + + render(); + + // Verify that both metrics are rendered + expect(screen.getByText(/100 words/i)).toBeInTheDocument(); + expect(screen.getByText(/1\.5s/i)).toBeInTheDocument(); + }); + + /** + * Additional Test: Edge case with very large word count + * + * Ensures the component handles large numbers correctly without + * formatting issues or overflow. + */ + it('should handle large word counts correctly', () => { + const document = createMockDocument({ + ocr_word_count: 1234567, + has_ocr_text: true, + }); + + render(); + + // Verify that large numbers are displayed without formatting + expect(screen.getByText(/1234567 words/i)).toBeInTheDocument(); + }); + + /** + * Additional Test: Processing time formatting + * + * Verifies that processing times are correctly converted from milliseconds + * to seconds and formatted with one decimal place. + */ + it('should format processing time correctly in seconds', () => { + const document = createMockDocument({ + ocr_word_count: 50, + ocr_processing_time_ms: 234, // Should display as 0.2s + has_ocr_text: true, + }); + + render(); + + // Verify processing time is formatted to 1 decimal place + expect(screen.getByText(/0\.2s/i)).toBeInTheDocument(); + }); + + /** + * Additional Test: Multiple documents with different OCR states + * + * Ensures the component correctly handles a list of documents where + * each document has different OCR metrics states. 
+ */ + it('should handle multiple documents with different OCR metrics', () => { + const documents = [ + createMockDocument({ + id: 'doc-1', + original_filename: 'document1.pdf', + ocr_word_count: 0, + has_ocr_text: true, + }), + createMockDocument({ + id: 'doc-2', + original_filename: 'document2.pdf', + ocr_word_count: 500, + has_ocr_text: true, + }), + createMockDocument({ + id: 'doc-3', + original_filename: 'document3.pdf', + ocr_word_count: null, + has_ocr_text: true, + }), + createMockDocument({ + id: 'doc-4', + original_filename: 'document4.pdf', + has_ocr_text: false, + }), + ]; + + const { container } = render(); + + // Get all text content from the rendered component + const renderedText = container.textContent || ''; + + // Verify that both "0 words" and "500 words" appear in the rendered output + expect(renderedText).toContain('0 words'); // doc-1 shows 0 words + expect(renderedText).toContain('500 words'); // doc-2 shows 500 words + + // Count how many times "words" appears in the rendered text + // Should be exactly 2 (for doc-1 and doc-2) + const wordMatches = renderedText.match(/\d+ words/g); + expect(wordMatches).toHaveLength(2); + + // Verify all document filenames are rendered + expect(screen.getByText('document1.pdf')).toBeInTheDocument(); + expect(screen.getByText('document2.pdf')).toBeInTheDocument(); + expect(screen.getByText('document3.pdf')).toBeInTheDocument(); + expect(screen.getByText('document4.pdf')).toBeInTheDocument(); + }); + + /** + * Additional Test: Loading state + * + * Verifies that the loading state is properly displayed when + * documents are being fetched. + */ + it('should display loading state when loading is true', () => { + render(); + + expect(screen.getByText(/loading documents/i)).toBeInTheDocument(); + }); + + /** + * Additional Test: Empty state + * + * Verifies that the empty state is properly displayed when + * no documents are available. 
+ */ + it('should display empty state when no documents are available', () => { + render(); + + expect(screen.getByText(/no documents found/i)).toBeInTheDocument(); + }); +}); diff --git a/frontend/src/pages/DocumentDetailsPage.tsx b/frontend/src/pages/DocumentDetailsPage.tsx index 0f89134..20e1e21 100644 --- a/frontend/src/pages/DocumentDetailsPage.tsx +++ b/frontend/src/pages/DocumentDetailsPage.tsx @@ -836,7 +836,7 @@ const DocumentDetailsPage: React.FC = () => { )} - {ocrData.ocr_word_count && ( + {ocrData.ocr_word_count != null && ( { size="small" /> )} - {ocrData.ocr_word_count && ( + {ocrData.ocr_word_count != null && ( { size="small" /> )} - {ocrData.ocr_word_count && ( + {ocrData.ocr_word_count != null && ( { + const actual = await vi.importActual('../../services/api'); + return { + ...actual, + documentService: { + getById: vi.fn(), + download: vi.fn(), + getOcrText: vi.fn(), + getThumbnail: vi.fn(), + getProcessedImage: vi.fn(), + bulkRetryOcr: vi.fn(), + delete: vi.fn(), + }, + default: { + get: vi.fn(), + post: vi.fn(), + put: vi.fn(), + delete: vi.fn(), + }, + }; +}); + +// Mock components that are used by DocumentDetailsPage but not part of our test focus +vi.mock('../../components/DocumentViewer', () => ({ + default: () => null, +})); + +vi.mock('../../components/Labels/LabelSelector', () => ({ + default: () => null, +})); + +vi.mock('../../components/MetadataDisplay', () => ({ + default: () => null, +})); + +vi.mock('../../components/FileIntegrityDisplay', () => ({ + default: () => null, +})); + +vi.mock('../../components/ProcessingTimeline', () => ({ + default: () => null, +})); + +vi.mock('../../components/RetryHistoryModal', () => ({ + RetryHistoryModal: () => null, +})); + +// Mock react-i18next +vi.mock('react-i18next', () => ({ + useTranslation: () => ({ + t: (key: string, params?: any) => { + // Provide simple translations for the keys we need + const translations: Record = { + 'documentDetails.errors.notFound': 'Document not found', + 
'documentDetails.actions.backToDocuments': 'Back to Documents', + 'documentDetails.actions.download': 'Download', + 'documentDetails.actions.viewDocument': 'View Document', + 'documentDetails.actions.viewOcrText': 'View OCR Text', + 'documentDetails.actions.deleteDocument': 'Delete Document', + 'documentDetails.actions.editLabels': 'Edit Labels', + 'documentDetails.actions.viewProcessedImage': 'View Processed Image', + 'documentDetails.actions.retryOcr': 'Retry OCR', + 'documentDetails.actions.retryHistory': 'Retry History', + 'documentDetails.subtitle': 'Document Details', + 'documentDetails.metadata.fileSize': 'File Size', + 'documentDetails.metadata.uploadDate': 'Upload Date', + 'documentDetails.metadata.sourceType': 'Source Type', + 'documentDetails.metadata.originalPath': 'Original Path', + 'documentDetails.metadata.originalCreated': 'Original Created', + 'documentDetails.metadata.originalModified': 'Original Modified', + 'documentDetails.metadata.ocrStatus': 'OCR Status', + 'documentDetails.metadata.textExtracted': 'Text Extracted', + 'documentDetails.ocr.title': 'OCR Text Content', + 'documentDetails.ocr.confidence': 'Confidence', + 'documentDetails.ocr.words': 'Words', + 'documentDetails.ocr.processingTime': 'Processing Time', + 'documentDetails.ocr.loading': 'Loading OCR text...', + 'documentDetails.ocr.loadFailed': 'Failed to load OCR text', + 'documentDetails.ocr.noText': 'No OCR text available', + 'documentDetails.ocr.error': 'OCR Error', + 'documentDetails.ocr.expand': 'Expand', + 'documentDetails.ocr.expandTooltip': 'Expand OCR Text', + 'documentDetails.tagsLabels.title': 'Tags & Labels', + 'documentDetails.tagsLabels.tags': 'Tags', + 'documentDetails.tagsLabels.labels': 'Labels', + 'documentDetails.tagsLabels.noLabels': 'No labels assigned', + 'navigation.documents': 'Documents', + 'common.status.error': 'An error occurred', + 'common.actions.close': 'Close', + 'common.actions.download': 'Download', + 'common.actions.cancel': 'Cancel', + }; + + if 
(params) { + let translation = translations[key] || key; + // Simple parameter replacement + Object.keys(params).forEach((param) => { + translation = translation.replace(`{{${param}}}`, params[param]); + }); + return translation; + } + + return translations[key] || key; + }, + i18n: { + changeLanguage: vi.fn(), + }, + }), +})); + +// Import components and types AFTER the mocks are set up +import DocumentDetailsPage from '../DocumentDetailsPage'; +import * as apiModule from '../../services/api'; +import type { Document, OcrResponse } from '../../services/api'; +import { ThemeProvider as CustomThemeProvider } from '../../contexts/ThemeContext'; + +// Get references to the mocked services +const mockDocumentService = vi.mocked(apiModule.documentService, true); +const mockApi = vi.mocked(apiModule.default, true); + +// Create MUI theme for wrapping components +const theme = createTheme(); + +/** + * Helper function to create a base mock document + */ +const createBaseMockDocument = (overrides: Partial = {}): Document => ({ + id: 'test-doc-id', + filename: 'test.pdf', + original_filename: 'test.pdf', + file_path: '/path/to/test.pdf', + file_size: 1024000, + mime_type: 'application/pdf', + tags: [], + created_at: '2024-01-01T00:00:00Z', + updated_at: '2024-01-01T00:00:00Z', + user_id: 'user-123', + username: 'testuser', + has_ocr_text: true, + ...overrides, +}); + +/** + * Helper function to create mock OCR response data + */ +const createMockOcrResponse = (overrides: Partial = {}): OcrResponse => ({ + document_id: 'test-doc-id', + filename: 'test.pdf', + has_ocr_text: true, + ocr_text: 'Sample OCR text content', + ocr_confidence: 95.5, + ocr_word_count: 290, + ocr_processing_time_ms: 1500, + ocr_status: 'completed', + ocr_completed_at: '2024-01-01T00:01:00Z', + ...overrides, +}); + +/** + * Helper to render DocumentDetailsPage with all necessary providers + */ +const renderDocumentDetailsPage = (documentId = 'test-doc-id') => { + return render( + + + + + } /> + + + + + 
); +}; + +describe('DocumentDetailsPage - OCR Word Count Display', () => { + beforeEach(() => { + console.log('mockDocumentService:', mockDocumentService); + console.log('mockDocumentService.getThumbnail:', mockDocumentService.getThumbnail); + vi.clearAllMocks(); + + // Mock window.matchMedia (needed for ThemeContext) + Object.defineProperty(window, 'matchMedia', { + writable: true, + value: vi.fn().mockImplementation((query) => ({ + matches: false, + media: query, + onchange: null, + addListener: vi.fn(), + removeListener: vi.fn(), + addEventListener: vi.fn(), + removeEventListener: vi.fn(), + dispatchEvent: vi.fn(), + })), + }); + + // Setup all default mocks - use type assertion since we know they're vi.fn() mocks + (mockDocumentService.getThumbnail as ReturnType).mockRejectedValue(new Error('No thumbnail')); + (mockDocumentService.bulkRetryOcr as ReturnType).mockResolvedValue({ data: { success: true } } as any); + (mockDocumentService.delete as ReturnType).mockResolvedValue({} as any); + (mockApi.get as ReturnType).mockResolvedValue({ status: 200, data: [] }); + (mockApi.post as ReturnType).mockResolvedValue({ status: 200, data: {} }); + (mockApi.put as ReturnType).mockResolvedValue({ status: 200, data: {} }); + }); + + /** + * Test Case 1: Verify OCR word count of 0 renders correctly + * + * This tests the bug fix at lines 839, 1086, and 1184 where we changed: + * - Before: {ocrData.ocr_word_count && ( + * - After: {ocrData.ocr_word_count != null && ( + * + * With ocr_word_count = 0, the old condition would be falsy and not render, + * but the new condition correctly checks for null/undefined. 
+ */ + test('displays OCR word count of 0 correctly', async () => { + const mockDocument = createBaseMockDocument({ + has_ocr_text: true, + ocr_word_count: 0, + }); + + const mockOcrData = createMockOcrResponse({ + ocr_word_count: 0, + ocr_text: '', // Empty document + }); + + (mockDocumentService.getById as ReturnType).mockResolvedValue({ data: mockDocument }); + (mockDocumentService.getOcrText as ReturnType).mockResolvedValue({ data: mockOcrData }); + + renderDocumentDetailsPage(); + + // Wait for the document to load + await waitFor(() => { + expect(screen.getByText('test.pdf')).toBeInTheDocument(); + }); + + // Wait for OCR data to load + await waitFor(() => { + expect(mockDocumentService.getOcrText).toHaveBeenCalled(); + }); + + // Verify that the word count section renders (it should now with != null check) + await waitFor(() => { + // The word count should be displayed as "0" + const wordCountElements = screen.getAllByText('0'); + expect(wordCountElements.length).toBeGreaterThan(0); + + // Verify "Words" label is present (indicates the stat box rendered) + expect(screen.getByText('Words')).toBeInTheDocument(); + }); + }); + + /** + * Test Case 2: Verify OCR word count of null does not render + * + * When ocr_word_count is null, the != null check should be false, + * and the word count stat should not appear. 
+ */ + test('does not display word count when ocr_word_count is null', async () => { + const mockDocument = createBaseMockDocument({ + has_ocr_text: true, + ocr_word_count: undefined, // Will be null in the API response + }); + + const mockOcrData = createMockOcrResponse({ + ocr_word_count: undefined, + }); + + (mockDocumentService.getById as ReturnType).mockResolvedValue({ data: mockDocument }); + (mockDocumentService.getOcrText as ReturnType).mockResolvedValue({ data: mockOcrData }); + + renderDocumentDetailsPage(); + + // Wait for the document to load + await waitFor(() => { + expect(screen.getByText('test.pdf')).toBeInTheDocument(); + }); + + // Wait for OCR data to load + await waitFor(() => { + expect(mockDocumentService.getOcrText).toHaveBeenCalled(); + }); + + // Verify OCR section still renders (document has OCR text) + await waitFor(() => { + expect(screen.getByText('OCR Text Content')).toBeInTheDocument(); + }); + + // Word count stat box should not render + // We check that "Words" label doesn't appear in the stats section + const wordsLabels = screen.queryAllByText('Words'); + expect(wordsLabels.length).toBe(0); + }); + + /** + * Test Case 3: Verify OCR word count of undefined does not render + * + * Similar to null case - when the field is explicitly undefined, + * the stat should not render. 
+ */ + test('does not display word count when ocr_word_count is undefined', async () => { + const mockDocument = createBaseMockDocument({ + has_ocr_text: true, + }); + + // Explicitly create OCR data without ocr_word_count field + const mockOcrData: OcrResponse = { + document_id: 'test-doc-id', + filename: 'test.pdf', + has_ocr_text: true, + ocr_text: 'Some text', + ocr_confidence: 85.0, + ocr_processing_time_ms: 1200, + ocr_status: 'completed', + // ocr_word_count is intentionally omitted + }; + + (mockDocumentService.getById as ReturnType).mockResolvedValue({ data: mockDocument }); + (mockDocumentService.getOcrText as ReturnType).mockResolvedValue({ data: mockOcrData }); + + renderDocumentDetailsPage(); + + // Wait for the document to load + await waitFor(() => { + expect(screen.getByText('test.pdf')).toBeInTheDocument(); + }); + + // Wait for OCR data to load + await waitFor(() => { + expect(mockDocumentService.getOcrText).toHaveBeenCalled(); + }); + + // Verify OCR section renders + await waitFor(() => { + expect(screen.getByText('OCR Text Content')).toBeInTheDocument(); + }); + + // Confidence should render (it's present in mockOcrData) + await waitFor(() => { + expect(screen.getByText(/85%/)).toBeInTheDocument(); + }); + + // Word count should NOT render + const wordsLabels = screen.queryAllByText('Words'); + expect(wordsLabels.length).toBe(0); + }); + + /** + * Test Case 4: Verify valid OCR word count renders correctly + * + * This is the happy path - a normal document with a valid word count + * should display properly. 
+ */ + test('displays valid OCR word count correctly', async () => { + const mockDocument = createBaseMockDocument({ + has_ocr_text: true, + ocr_word_count: 290, + }); + + const mockOcrData = createMockOcrResponse({ + ocr_word_count: 290, + ocr_text: 'This is a sample document with approximately 290 words...', + }); + + (mockDocumentService.getById as ReturnType).mockResolvedValue({ data: mockDocument }); + (mockDocumentService.getOcrText as ReturnType).mockResolvedValue({ data: mockOcrData }); + + renderDocumentDetailsPage(); + + // Wait for the document to load + await waitFor(() => { + expect(screen.getByText('test.pdf')).toBeInTheDocument(); + }); + + // Wait for OCR data to load + await waitFor(() => { + expect(mockDocumentService.getOcrText).toHaveBeenCalled(); + }); + + // Verify word count displays with proper formatting + await waitFor(() => { + // Should display "290" formatted with toLocaleString() + expect(screen.getByText('290')).toBeInTheDocument(); + expect(screen.getByText('Words')).toBeInTheDocument(); + }); + + // Also verify confidence is displayed + await waitFor(() => { + expect(screen.getByText(/96%/)).toBeInTheDocument(); // 95.5 rounds to 96 + expect(screen.getByText('Confidence')).toBeInTheDocument(); + }); + + // Verify processing time is displayed + await waitFor(() => { + expect(screen.getByText('1500ms')).toBeInTheDocument(); + expect(screen.getByText('Processing Time')).toBeInTheDocument(); + }); + }); +}); diff --git a/src/ocr/enhanced.rs b/src/ocr/enhanced.rs index 3afa7b0..a139859 100644 --- a/src/ocr/enhanced.rs +++ b/src/ocr/enhanced.rs @@ -1663,35 +1663,42 @@ impl EnhancedOcrService { /// Validate OCR result quality #[cfg(feature = "ocr")] pub fn validate_ocr_quality(&self, result: &OcrResult, settings: &Settings) -> Result<(), String> { - // Check minimum confidence threshold - if result.confidence < settings.ocr_min_confidence { + // Hard reject completely unreliable OCR (likely corrupted/garbage) + const 
HARD_MINIMUM_CONFIDENCE: f32 = 5.0; + if result.confidence < HARD_MINIMUM_CONFIDENCE { return Err(format!( - "OCR confidence below threshold: {:.1}% (minimum: {:.1}%)", + "OCR confidence critically low: {:.1}% (absolute minimum: {:.1}%) - likely corrupted input", result.confidence, - settings.ocr_min_confidence + HARD_MINIMUM_CONFIDENCE )); } - // Check if text is reasonable (not just noise) - if result.word_count == 0 { - return Err("No words detected in OCR output".to_string()); + // Log warning for low confidence instead of rejecting + if result.confidence < settings.ocr_min_confidence { + warn!( + "OCR confidence below recommended threshold: {:.1}% (recommended: {:.1}%) - accepting but flagging for review", + result.confidence, + settings.ocr_min_confidence + ); } - // Check for reasonable character distribution + // Check empty text FIRST (before word count check) let total_chars = result.text.len(); if total_chars == 0 { return Err("OCR result contains no characters".to_string()); } - // Count alphanumeric characters and digits separately - let alphanumeric_chars = result.text.chars().filter(|c| c.is_alphanumeric()).count(); + // THEN check word count + if result.word_count == 0 { + return Err("No words detected in OCR output".to_string()); + } + + // Special handling for numeric-heavy documents (bills, receipts, invoices) let digit_chars = result.text.chars().filter(|c| c.is_numeric()).count(); - let alphanumeric_ratio = alphanumeric_chars as f32 / total_chars as f32; let digit_ratio = digit_chars as f32 / total_chars as f32; - // Special handling for numeric-heavy documents (bills, transaction lists, etc.) 
- // If document has >40% digits, it's likely a valid numeric document - if digit_ratio > 0.4 { + // If >30% digits, likely a valid numeric document - be more lenient + if digit_ratio > 0.3 { debug!( "Document has high numeric content: {:.1}% digits - accepting as valid numeric document", digit_ratio * 100.0 @@ -1699,16 +1706,29 @@ impl EnhancedOcrService { return Ok(()); } - // Expect at least 20% alphanumeric characters for valid text (relaxed from 30%) - const MIN_ALPHANUMERIC_RATIO: f32 = 0.20; + // Count alphanumeric characters + let alphanumeric_chars = result.text.chars().filter(|c| c.is_alphanumeric()).count(); + let alphanumeric_ratio = alphanumeric_chars as f32 / total_chars as f32; + + // Relaxed threshold: only reject if >90% symbols (likely garbage) + // This allows bills/receipts with lots of numbers and special characters + const MIN_ALPHANUMERIC_RATIO: f32 = 0.10; if alphanumeric_ratio < MIN_ALPHANUMERIC_RATIO { return Err(format!( - "OCR result has low alphanumeric content: {:.1}% (minimum: {:.1}%)", + "OCR result has too much non-alphanumeric content: {:.1}% alphanumeric (minimum: {:.1}%)", alphanumeric_ratio * 100.0, MIN_ALPHANUMERIC_RATIO * 100.0 )); } + // Log info for documents with reasonable content + debug!( + "OCR validation passed: {:.1}% confidence, {} words, {:.1}% alphanumeric", + result.confidence, + result.word_count, + alphanumeric_ratio * 100.0 + ); + Ok(()) } } diff --git a/tests/integration_enhanced_ocr_tests.rs b/tests/integration_enhanced_ocr_tests.rs index b5f8ba5..8b0f94f 100644 --- a/tests/integration_enhanced_ocr_tests.rs +++ b/tests/integration_enhanced_ocr_tests.rs @@ -319,18 +319,19 @@ mod tests { let service = EnhancedOcrService::new(temp_path, file_service); let mut settings = create_test_settings(); settings.ocr_min_confidence = 50.0; - + let result = OcrResult { text: "Poor quality text".to_string(), - confidence: 25.0, // Below threshold + confidence: 25.0, // Below threshold but still accepted processing_time_ms: 
1000, word_count: 3, preprocessing_applied: vec![], processed_image_path: None, }; - + + // Low confidence is now accepted with a warning, not rejected let result_validation = service.validate_ocr_quality(&result, &settings); - assert!(result_validation.is_err()); + assert!(result_validation.is_ok()); } #[cfg(feature = "ocr")] @@ -571,37 +572,37 @@ startxref let file_service = create_test_file_service(&temp_path).await; let service = EnhancedOcrService::new(temp_path, file_service); let settings = create_test_settings(); - + let mut handles = vec![]; - + // Process multiple files concurrently for i in 0..5 { let temp_file = NamedTempFile::with_suffix(".txt").unwrap(); let content = format!("Concurrent test content {}", i); fs::write(temp_file.path(), &content).unwrap(); - + let temp_path_clone = temp_dir.path().to_str().unwrap().to_string(); let file_service_clone = create_test_file_service(&temp_path_clone).await; let service_clone = EnhancedOcrService::new(temp_path_clone, file_service_clone); let settings_clone = settings.clone(); let file_path = temp_file.path().to_str().unwrap().to_string(); - + let handle = tokio::spawn(async move { let result = service_clone .extract_text(&file_path, "text/plain", &settings_clone) .await; - + // Keep temp_file alive until task completes drop(temp_file); result }); - + handles.push(handle); } - + // Wait for all tasks to complete let results = futures::future::join_all(handles).await; - + // All tasks should succeed for (i, result) in results.into_iter().enumerate() { assert!(result.is_ok(), "Task {} failed", i); @@ -610,4 +611,251 @@ startxref assert_eq!(ocr_result.confidence, 100.0); } } + + // New validation tests for updated OCR validation logic + + #[cfg(feature = "ocr")] + #[tokio::test] + async fn test_validate_ocr_quality_below_hard_minimum() { + let temp_dir = create_temp_dir(); + let temp_path = temp_dir.path().to_str().unwrap().to_string(); + let file_service = create_test_file_service(&temp_path).await; + let 
service = EnhancedOcrService::new(temp_path, file_service); + let settings = create_test_settings(); + + // Test OCR with confidence below the hard minimum (5%) + // This should be rejected as critically low/corrupted + let result = OcrResult { + text: "Some text".to_string(), + confidence: 4.9, // Below hard minimum of 5% + processing_time_ms: 1000, + word_count: 2, + preprocessing_applied: vec![], + processed_image_path: None, + }; + + let validation_result = service.validate_ocr_quality(&result, &settings); + assert!(validation_result.is_err(), "Expected validation to fail for confidence below hard minimum"); + + let error_msg = validation_result.unwrap_err(); + assert!(error_msg.contains("critically low"), + "Expected 'critically low' in error message, got: {}", error_msg); + } + + #[cfg(feature = "ocr")] + #[tokio::test] + async fn test_validate_ocr_quality_at_hard_minimum_boundary() { + let temp_dir = create_temp_dir(); + let temp_path = temp_dir.path().to_str().unwrap().to_string(); + let file_service = create_test_file_service(&temp_path).await; + let service = EnhancedOcrService::new(temp_path, file_service); + let settings = create_test_settings(); + + // Test OCR with exactly 5% confidence (boundary case) + // This should be accepted (at the hard minimum threshold) + let result = OcrResult { + text: "Boundary test text".to_string(), + confidence: 5.0, // Exactly at hard minimum + processing_time_ms: 1000, + word_count: 3, + preprocessing_applied: vec![], + processed_image_path: None, + }; + + let validation_result = service.validate_ocr_quality(&result, &settings); + assert!(validation_result.is_ok(), + "Expected validation to pass at hard minimum boundary (5%)"); + } + + #[cfg(feature = "ocr")] + #[tokio::test] + async fn test_validate_ocr_quality_numeric_document() { + let temp_dir = create_temp_dir(); + let temp_path = temp_dir.path().to_str().unwrap().to_string(); + let file_service = create_test_file_service(&temp_path).await; + let service = 
EnhancedOcrService::new(temp_path, file_service); + let settings = create_test_settings(); + + // Test invoice/receipt with >30% digits + // Should be accepted even with lower alphanumeric ratio due to high digit content + let result = OcrResult { + text: "Invoice #12345\n$1,234.56\n$2,345.67\nTotal: $3,580.23\n!!!".to_string(), + confidence: 60.0, + processing_time_ms: 1000, + word_count: 5, + preprocessing_applied: vec![], + processed_image_path: None, + }; + + // Calculate to verify we have >30% digits + let digit_count = result.text.chars().filter(|c| c.is_numeric()).count(); + let total_chars = result.text.len(); + let digit_ratio = digit_count as f32 / total_chars as f32; + assert!(digit_ratio > 0.3, "Test data should have >30% digits, got {:.1}%", digit_ratio * 100.0); + + let validation_result = service.validate_ocr_quality(&result, &settings); + assert!(validation_result.is_ok(), + "Expected validation to pass for numeric document with {:.1}% digits", digit_ratio * 100.0); + } + + #[cfg(feature = "ocr")] + #[tokio::test] + async fn test_validate_ocr_quality_numeric_document_boundary() { + let temp_dir = create_temp_dir(); + let temp_path = temp_dir.path().to_str().unwrap().to_string(); + let file_service = create_test_file_service(&temp_path).await; + let service = EnhancedOcrService::new(temp_path, file_service); + let settings = create_test_settings(); + + // Test document with exactly 30% digits (boundary case) + // 30 digits + 70 non-digit chars = 100 total chars + let result = OcrResult { + text: "123456789012345678901234567890AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA".to_string(), + confidence: 60.0, + processing_time_ms: 1000, + word_count: 2, + preprocessing_applied: vec![], + processed_image_path: None, + }; + + // Verify exactly 30% digits + let digit_count = result.text.chars().filter(|c| c.is_numeric()).count(); + let total_chars = result.text.len(); + let digit_ratio = digit_count as f32 / total_chars as f32; + 
assert_eq!(digit_count, 30, "Test data should have exactly 30 digits"); + assert_eq!(total_chars, 100, "Test data should have exactly 100 chars"); + assert!((digit_ratio - 0.3).abs() < 0.01, "Should have exactly 30% digits, got {:.1}%", digit_ratio * 100.0); + + let validation_result = service.validate_ocr_quality(&result, &settings); + // At exactly 30%, it should NOT trigger the >30% special handling + // So it will be validated normally (which should pass with 100% alphanumeric) + assert!(validation_result.is_ok(), + "Expected validation to pass at 30% digit boundary"); + } + + #[cfg(feature = "ocr")] + #[tokio::test] + async fn test_validate_ocr_quality_alphanumeric_boundary() { + let temp_dir = create_temp_dir(); + let temp_path = temp_dir.path().to_str().unwrap().to_string(); + let file_service = create_test_file_service(&temp_path).await; + let service = EnhancedOcrService::new(temp_path, file_service); + let settings = create_test_settings(); + + // Test text with exactly 10% alphanumeric characters (boundary case) + // 1 letter + 9 symbols = 10 total chars = 10% alphanumeric + let result = OcrResult { + text: "a!!!!!!!!!".to_string(), // 1 alphanumeric + 9 symbols = 10% + confidence: 60.0, + processing_time_ms: 1000, + word_count: 1, + preprocessing_applied: vec![], + processed_image_path: None, + }; + + // Verify exactly 10% alphanumeric + let alphanumeric_count = result.text.chars().filter(|c| c.is_alphanumeric()).count(); + let total_chars = result.text.len(); + let alphanumeric_ratio = alphanumeric_count as f32 / total_chars as f32; + assert_eq!(alphanumeric_count, 1, "Test data should have exactly 1 alphanumeric char"); + assert_eq!(total_chars, 10, "Test data should have exactly 10 chars"); + assert!((alphanumeric_ratio - 0.1).abs() < 0.01, "Should have exactly 10% alphanumeric, got {:.1}%", alphanumeric_ratio * 100.0); + + let validation_result = service.validate_ocr_quality(&result, &settings); + assert!(validation_result.is_ok(), + "Expected 
validation to pass at 10% alphanumeric boundary"); + } + + #[cfg(feature = "ocr")] + #[tokio::test] + async fn test_validate_ocr_quality_below_alphanumeric_threshold() { + let temp_dir = create_temp_dir(); + let temp_path = temp_dir.path().to_str().unwrap().to_string(); + let file_service = create_test_file_service(&temp_path).await; + let service = EnhancedOcrService::new(temp_path, file_service); + let settings = create_test_settings(); + + // Test text with <10% alphanumeric (pure garbage) + // 1 letter + 14 symbols = 15 total chars = 6.67% alphanumeric + let result = OcrResult { + text: "a!!!!!!!!!!!!!!".to_string(), // 1 alphanumeric + 14 symbols = ~7% + confidence: 60.0, + processing_time_ms: 1000, + word_count: 1, + preprocessing_applied: vec![], + processed_image_path: None, + }; + + // Verify <10% alphanumeric + let alphanumeric_count = result.text.chars().filter(|c| c.is_alphanumeric()).count(); + let total_chars = result.text.len(); + let alphanumeric_ratio = alphanumeric_count as f32 / total_chars as f32; + assert!(alphanumeric_ratio < 0.10, "Test data should have <10% alphanumeric, got {:.1}%", alphanumeric_ratio * 100.0); + + let validation_result = service.validate_ocr_quality(&result, &settings); + assert!(validation_result.is_err(), + "Expected validation to fail for <10% alphanumeric content"); + + let error_msg = validation_result.unwrap_err(); + assert!(error_msg.contains("non-alphanumeric"), + "Expected error about non-alphanumeric content, got: {}", error_msg); + } + + #[cfg(feature = "ocr")] + #[tokio::test] + async fn test_validate_ocr_quality_empty_text() { + let temp_dir = create_temp_dir(); + let temp_path = temp_dir.path().to_str().unwrap().to_string(); + let file_service = create_test_file_service(&temp_path).await; + let service = EnhancedOcrService::new(temp_path, file_service); + let settings = create_test_settings(); + + // Test completely empty text + // Should fail with "no characters" error (not "no words") + let result =
OcrResult { + text: "".to_string(), + confidence: 60.0, + processing_time_ms: 1000, + word_count: 0, + preprocessing_applied: vec![], + processed_image_path: None, + }; + + let validation_result = service.validate_ocr_quality(&result, &settings); + assert!(validation_result.is_err(), + "Expected validation to fail for empty text"); + + let error_msg = validation_result.unwrap_err(); + assert!(error_msg.contains("no characters"), + "Expected error about 'no characters' (not 'no words'), got: {}", error_msg); + } + + #[cfg(feature = "ocr")] + #[tokio::test] + async fn test_validate_ocr_quality_whitespace_only() { + let temp_dir = create_temp_dir(); + let temp_path = temp_dir.path().to_str().unwrap().to_string(); + let file_service = create_test_file_service(&temp_path).await; + let service = EnhancedOcrService::new(temp_path, file_service); + let settings = create_test_settings(); + + // Test text with only whitespace + // Has characters but no words - should fail with "No words" error + let result = OcrResult { + text: " \n\n\t\t".to_string(), + confidence: 60.0, + processing_time_ms: 1000, + word_count: 0, // Whitespace doesn't count as words + preprocessing_applied: vec![], + processed_image_path: None, + }; + + let validation_result = service.validate_ocr_quality(&result, &settings); + assert!(validation_result.is_err(), + "Expected validation to fail for whitespace-only text"); + + let error_msg = validation_result.unwrap_err(); + assert!(error_msg.contains("No words"), + "Expected error about 'No words' (not 'no characters'), got: {}", error_msg); + } } \ No newline at end of file