diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..0504a11 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,76 @@ +# Git +.git +.gitignore +.github + +# Documentation +*.md +docs/ +site/ + +# Development files +.env +.env.dev +.env.test +*.log + +# Test files and data +test_data/ +test_files/ +test-uploads/ +test-results/ +tests/ + +# Build artifacts +target/ +frontend/dist/ +frontend/node_modules/ + +# IDE and editor files +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# CI/CD +.woodpecker/ +.claude/ + +# Docker files (no need to copy into image) +Dockerfile* +docker-compose*.yml +.dockerignore + +# Python +__pycache__/ +*.py[cod] +*$py.class +venv/ +*.egg-info/ +.pytest_cache/ + +# Temporary files +*.tmp +*.bak +.DS_Store + +# Uploads and watch directories (will be mounted) +readur_uploads/ +readur_watch/ +uploads/ + +# Other +charts/ +scripts/ +Makefile +requirements.txt +*.sh +grafana-dashboard.json +nginx.conf +ssl/ +config/ +.cargo/ +renovate.json +mkdocs.yml +.playwright-mcp/ diff --git a/Dockerfile.dev b/Dockerfile.dev new file mode 100644 index 0000000..9286ed7 --- /dev/null +++ b/Dockerfile.dev @@ -0,0 +1,73 @@ +# Development Dockerfile with hot-reloading support for Rust backend +FROM rust:1.90-bookworm + +# Install system dependencies for OCR and PDF processing +RUN apt-get update && apt-get install -y \ + tesseract-ocr \ + tesseract-ocr-eng \ + tesseract-ocr-spa \ + tesseract-ocr-fra \ + tesseract-ocr-deu \ + tesseract-ocr-ita \ + tesseract-ocr-por \ + tesseract-ocr-rus \ + tesseract-ocr-chi-sim \ + tesseract-ocr-chi-tra \ + tesseract-ocr-jpn \ + tesseract-ocr-kor \ + tesseract-ocr-ara \ + tesseract-ocr-hin \ + tesseract-ocr-nld \ + tesseract-ocr-swe \ + tesseract-ocr-nor \ + tesseract-ocr-dan \ + tesseract-ocr-fin \ + tesseract-ocr-pol \ + tesseract-ocr-ces \ + tesseract-ocr-hun \ + tesseract-ocr-tur \ + tesseract-ocr-tha \ + tesseract-ocr-vie \ + libtesseract-dev \ + libleptonica-dev \ + pkg-config \ + libclang-dev \ + clang \ + 
poppler-utils \ + ocrmypdf \ + curl \ + # Legacy DOC file support + antiword \ + catdoc \ + && rm -rf /var/lib/apt/lists/* + +# Install cargo-watch for auto-recompilation +RUN cargo install cargo-watch + +WORKDIR /app + +# Create necessary directories +RUN mkdir -p /app/uploads /app/watch /app/frontend + +# Copy dependency files first for better caching +COPY Cargo.toml Cargo.lock ./ + +# Create dummy source files for all binaries so `cargo build` can resolve and download dependencies +# NOTE: `target` is deleted below, so only the downloaded crates (not compiled artifacts) persist in the image +RUN mkdir -p src/bin && \ + echo "fn main() {}" > src/main.rs && \ + echo "fn main() {}" > src/bin/test_runner.rs && \ + echo "fn main() {}" > src/bin/analyze-webdav-performance.rs && \ + cargo build && \ + rm -rf src target + +# The actual source code will be mounted as a volume +# This allows for hot-reloading without rebuilding the image + +EXPOSE 8000 + +# Use cargo-watch to automatically rebuild and restart on file changes +# --why: shows what triggered the rebuild +# --ignore: ignore certain file patterns to avoid unnecessary rebuilds +# Run the main readur binary +CMD ["cargo", "watch", "-x", "run --bin readur", "--why", "--ignore", "frontend/*", "--ignore", "*.md", "--ignore", "test*"] diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml new file mode 100644 index 0000000..ed921ae --- /dev/null +++ b/docker-compose.dev.yml @@ -0,0 +1,122 @@ +# Docker Compose configuration for development with hot-reloading +services: + postgres: + image: postgres:17-alpine + environment: + POSTGRES_USER: readur + POSTGRES_PASSWORD: readur + POSTGRES_DB: readur + volumes: + - postgres_data_dev:/var/lib/postgresql/data + ports: + - "5432:5432" + healthcheck: + test: ["CMD-SHELL", "pg_isready -U readur"] + interval: 10s + timeout: 5s + retries: 5 + + backend: + build: + context: .
+ dockerfile: Dockerfile.dev + environment: + # Database configuration + DATABASE_URL: postgresql://readur:readur@postgres/readur + + # Server configuration + SERVER_HOST: 0.0.0.0 + SERVER_PORT: 8000 + + # Security + JWT_SECRET: dev-secret-key-change-in-production + + # File paths + UPLOAD_PATH: /app/uploads + WATCH_FOLDER: /app/watch + + # OCR configuration + OCR_LANGUAGE: eng + CONCURRENT_OCR_JOBS: 4 + OCR_TIMEOUT_SECONDS: 300 + MAX_FILE_SIZE_MB: 50 + + # Performance + MEMORY_LIMIT_MB: 512 + CPU_PRIORITY: normal + + # File watching + ALLOWED_FILE_TYPES: pdf,txt,doc,docx,png,jpg,jpeg + WATCH_INTERVAL_SECONDS: 30 + FILE_STABILITY_CHECK_MS: 1000 + MAX_FILE_AGE_HOURS: 24 + + # Development mode + RUST_LOG: debug + RUST_BACKTRACE: 1 + + ports: + - "8000:8000" + + volumes: + # Mount source code for hot-reloading + - ./src:/app/src + - ./Cargo.toml:/app/Cargo.toml + - ./Cargo.lock:/app/Cargo.lock + - ./migrations:/app/migrations + + # Persistent storage + - ./readur_uploads:/app/uploads + - ./readur_watch:/app/watch + + # Cache cargo registry and git dependencies to speed up rebuilds + - cargo_registry:/usr/local/cargo/registry + - cargo_git:/usr/local/cargo/git + + # Cache the whole Cargo target directory (incremental build artifacts and the built binaries) + - target_cache:/app/target + + depends_on: + postgres: + condition: service_healthy + + # No healthcheck in dev mode to avoid noise during restarts + + frontend: + build: + context: ./frontend + dockerfile: Dockerfile.dev + environment: + # Configure Vite to proxy API requests to the backend service + # In Docker network, services can communicate by service name + VITE_API_PROXY_TARGET: http://backend:8000 + + # Use a less common port to avoid conflicts + CLIENT_PORT: 3456 + + ports: + - "3456:3456" + + volumes: + # Mount entire frontend directory for hot-reloading + # This is simpler and avoids file vs directory mount issues + - ./frontend:/app + + # Exclude node_modules - use the container's version + - /app/node_modules + + # Exclude any build artifacts
+ - /app/dist + + depends_on: + - backend + + # Enable stdin and tty for interactive npm commands + stdin_open: true + tty: true + +volumes: + postgres_data_dev: + cargo_registry: + cargo_git: + target_cache: diff --git a/frontend/.dockerignore b/frontend/.dockerignore new file mode 100644 index 0000000..b1c16ad --- /dev/null +++ b/frontend/.dockerignore @@ -0,0 +1,48 @@ +# Node modules (installed in container) +node_modules/ + +# Build output (generated in container) +dist/ + +# Tests +**/*.test.ts +**/*.test.tsx +**/*.spec.ts +**/*.spec.tsx +test/ +coverage/ +.nyc_output/ + +# Development files +.env +.env.local +.env.*.local + +# Git +.git +.gitignore + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# Logs +npm-debug.log* +yarn-debug.log* +yarn-error.log* +pnpm-debug.log* +lerna-debug.log* + +# Documentation +*.md + +# Playwright +playwright-report/ +test-results/ +playwright/.cache/ + +# OS +.DS_Store +Thumbs.db diff --git a/frontend/Dockerfile.dev b/frontend/Dockerfile.dev new file mode 100644 index 0000000..00deed1 --- /dev/null +++ b/frontend/Dockerfile.dev @@ -0,0 +1,20 @@ +# Development Dockerfile for frontend with Vite hot-reloading +FROM node:22-bookworm + +WORKDIR /app + +# Copy package files for dependency installation +COPY package*.json ./ + +# Install dependencies +RUN npm install + +# The source code will be mounted as a volume for hot-reloading +# Vite's dev server provides hot module replacement (HMR) + +EXPOSE 3456 + +# Run Vite dev server +# --host 0.0.0.0 allows connections from outside the container +# --port is set via CLIENT_PORT environment variable in vite.config.ts +CMD ["npm", "run", "dev", "--", "--host", "0.0.0.0"] diff --git a/frontend/src/pages/DocumentDetailsPage.tsx b/frontend/src/pages/DocumentDetailsPage.tsx index 20e1e21..8e88e75 100644 --- a/frontend/src/pages/DocumentDetailsPage.tsx +++ b/frontend/src/pages/DocumentDetailsPage.tsx @@ -13,6 +13,7 @@ import { IconButton, Paper, Alert, + AlertTitle, CircularProgress, Tooltip, Dialog, 
@@ -24,6 +25,7 @@ import { Skeleton, TextField, InputAdornment, + Snackbar, } from '@mui/material'; import Grid from '@mui/material/GridLegacy'; import { @@ -52,6 +54,9 @@ import { OpenInFull as ExpandIcon, Close as CloseIcon, Delete as DeleteIcon, + Schedule as ScheduleIcon, + CheckCircle as CheckCircleIcon, + Error as ErrorIcon, } from '@mui/icons-material'; import { documentService, OcrResponse, type Document } from '../services/api'; import DocumentViewer from '../components/DocumentViewer'; @@ -72,6 +77,67 @@ const DocumentDetailsPage: React.FC = () => { const { mode, modernTokens, glassEffect } = useTheme(); const theme = useMuiTheme(); const [document, setDocument] = useState(null); + + // Helper function to render OCR status badge + const getOcrStatusBadge = (status?: string) => { + if (!status || status === 'pending') { + return ( + } + label="Pending OCR" + sx={{ + backgroundColor: theme.palette.warning.light, + color: theme.palette.warning.dark, + border: `1px solid ${theme.palette.warning.main}`, + fontWeight: 600, + }} + /> + ); + } + + if (status === 'processing') { + return ( + } + label="Processing..." 
+ sx={{ + backgroundColor: theme.palette.info.light, + color: theme.palette.info.dark, + border: `1px solid ${theme.palette.info.main}`, + fontWeight: 600, + }} + /> + ); + } + + if (status === 'completed') { + return ( + } + label="Completed" + color="success" + sx={{ + fontWeight: 600, + }} + /> + ); + } + + if (status === 'failed') { + return ( + } + label="Failed" + color="error" + sx={{ + fontWeight: 600, + }} + /> + ); + } + + return null; + }; const [loading, setLoading] = useState(true); const [error, setError] = useState(null); const [ocrText, setOcrText] = useState(''); @@ -92,15 +158,20 @@ const DocumentDetailsPage: React.FC = () => { // Retry functionality state const [retryingOcr, setRetryingOcr] = useState(false); const [retryHistoryModalOpen, setRetryHistoryModalOpen] = useState(false); - + // Delete functionality state const [deleting, setDeleting] = useState(false); const [deleteConfirmOpen, setDeleteConfirmOpen] = useState(false); + // Snackbar state for retry feedback + const [snackbarOpen, setSnackbarOpen] = useState(false); + const [snackbarMessage, setSnackbarMessage] = useState(''); + const [snackbarSeverity, setSnackbarSeverity] = useState<'success' | 'error'>('success'); + // Retry handlers const handleRetryOcr = async () => { if (!document) return; - + setRetryingOcr(true); try { await documentService.bulkRetryOcr({ @@ -108,13 +179,22 @@ const DocumentDetailsPage: React.FC = () => { document_ids: [document.id], priority_override: 15, }); - - // Show success message and refresh document + + // Show success message + setSnackbarMessage('OCR retry initiated successfully'); + setSnackbarSeverity('success'); + setSnackbarOpen(true); + + // Refresh document after a brief delay setTimeout(() => { fetchDocumentDetails(); }, 1000); - } catch (error) { + } catch (error: any) { console.error('Failed to retry OCR:', error); + // Show error message + setSnackbarMessage(`Failed to retry OCR: ${error.message || 'Unknown error'}`); + 
setSnackbarSeverity('error'); + setSnackbarOpen(true); } finally { setRetryingOcr(false); } @@ -170,6 +250,21 @@ const DocumentDetailsPage: React.FC = () => { fetchAvailableLabels(); }, []); + // Auto-refresh during OCR processing + useEffect(() => { + if (!document) return; + + const isProcessing = document.ocr_status === 'processing' || retryingOcr; + + if (isProcessing) { + const interval = setInterval(() => { + fetchDocumentDetails(); + }, 3000); // Poll every 3 seconds + + return () => clearInterval(interval); + } + }, [document?.ocr_status, retryingOcr]); + const fetchDocumentDetails = async (): Promise => { if (!id) { setError(t('documentDetails.errors.notFound')); @@ -492,8 +587,37 @@ const DocumentDetailsPage: React.FC = () => { + {/* OCR Status Alert - Shows when failed */} + {document?.ocr_status === 'failed' && ( + + : } + sx={{ fontWeight: 600 }} + > + Retry OCR + + } + > + OCR Processing Failed + {document.ocr_failure_reason || document.ocr_error || 'OCR processing encountered an error. 
You can retry the operation.'} + + + )} + {/* Modern Content Layout */} - + {/* Hero Document Preview */} @@ -664,19 +788,43 @@ const DocumentDetailsPage: React.FC = () => { )} - {document.has_ocr_text && ( - + {/* OCR Status with Retry Info */} + + {t('documentDetails.metadata.ocrStatus')} - } - /> + {getOcrStatusBadge(document.ocr_status)} - )} + + {/* Retry Count Indicator */} + {document.ocr_retry_count != null && document.ocr_retry_count > 0 && ( + + + + Retried {document.ocr_retry_count}× - click for history + + + )} + {/* Action Buttons */} @@ -1490,6 +1638,26 @@ const DocumentDetailsPage: React.FC = () => { + + {/* Snackbar for Retry Feedback */} + setSnackbarOpen(false)} + anchorOrigin={{ vertical: 'bottom', horizontal: 'right' }} + > + setSnackbarOpen(false)} + severity={snackbarSeverity} + sx={{ + width: '100%', + borderRadius: 2, + boxShadow: theme.shadows[8], + }} + > + {snackbarMessage} + + ); }; diff --git a/frontend/src/services/api.ts b/frontend/src/services/api.ts index 7f089be..322b3ef 100644 --- a/frontend/src/services/api.ts +++ b/frontend/src/services/api.ts @@ -61,6 +61,10 @@ export interface Document { ocr_word_count?: number ocr_processing_time_ms?: number ocr_status?: string + ocr_error?: string + ocr_failure_reason?: string + ocr_retry_count?: number + ocr_completed_at?: string } export interface SearchRequest { @@ -113,6 +117,10 @@ export interface EnhancedDocument { ocr_word_count?: number ocr_processing_time_ms?: number ocr_status?: string + ocr_error?: string + ocr_failure_reason?: string + ocr_retry_count?: number + ocr_completed_at?: string search_rank?: number snippets: SearchSnippet[] } @@ -250,6 +258,8 @@ export interface OcrResponse { ocr_processing_time_ms?: number ocr_status?: string ocr_error?: string + ocr_failure_reason?: string + ocr_retry_count?: number ocr_completed_at?: string } diff --git a/frontend/vite.config.ts b/frontend/vite.config.ts index 85b6aa5..7b89620 100644 --- a/frontend/vite.config.ts +++ 
b/frontend/vite.config.ts @@ -4,6 +4,8 @@ import react from '@vitejs/plugin-react' // Support environment variables for development const BACKEND_PORT = process.env.BACKEND_PORT || '8000' const CLIENT_PORT = process.env.CLIENT_PORT || '5173' +// Allow overriding the proxy target for Docker development +const PROXY_TARGET = process.env.VITE_API_PROXY_TARGET || `http://localhost:${BACKEND_PORT}` export default defineConfig({ plugins: [react()], @@ -15,7 +17,7 @@ export default defineConfig({ port: parseInt(CLIENT_PORT), proxy: { '/api': { - target: `http://localhost:${BACKEND_PORT}`, + target: PROXY_TARGET, changeOrigin: true, }, }, diff --git a/src/ocr/enhanced.rs b/src/ocr/enhanced.rs index a139859..547791d 100644 --- a/src/ocr/enhanced.rs +++ b/src/ocr/enhanced.rs @@ -1693,40 +1693,29 @@ impl EnhancedOcrService { return Err("No words detected in OCR output".to_string()); } - // Special handling for numeric-heavy documents (bills, receipts, invoices) - let digit_chars = result.text.chars().filter(|c| c.is_numeric()).count(); - let digit_ratio = digit_chars as f32 / total_chars as f32; + // Count valuable content: letters + digits (explicitly treating digits as good content) + let content_chars = result.text.chars().filter(|c| c.is_alphanumeric()).count(); + let content_ratio = content_chars as f32 / total_chars as f32; - // If >30% digits, likely a valid numeric document - be more lenient - if digit_ratio > 0.3 { - debug!( - "Document has high numeric content: {:.1}% digits - accepting as valid numeric document", - digit_ratio * 100.0 - ); - return Ok(()); - } - - // Count alphanumeric characters - let alphanumeric_chars = result.text.chars().filter(|c| c.is_alphanumeric()).count(); - let alphanumeric_ratio = alphanumeric_chars as f32 / total_chars as f32; - - // Relaxed threshold: only reject if >90% symbols (likely garbage) - // This allows bills/receipts with lots of numbers and special characters - const MIN_ALPHANUMERIC_RATIO: f32 = 0.10; - if 
alphanumeric_ratio < MIN_ALPHANUMERIC_RATIO { + // Only reject if >90% symbols (likely OCR garbage) + // This allows bills/receipts/invoices with numbers and formatting characters + const MIN_CONTENT_RATIO: f32 = 0.10; + if content_ratio < MIN_CONTENT_RATIO { + let symbol_ratio = 1.0 - content_ratio; return Err(format!( - "OCR result has too much non-alphanumeric content: {:.1}% alphanumeric (minimum: {:.1}%)", - alphanumeric_ratio * 100.0, - MIN_ALPHANUMERIC_RATIO * 100.0 + "OCR result has too little meaningful content: {:.1}% content (letters+digits), {:.1}% symbols/formatting (minimum content: {:.1}%)", + content_ratio * 100.0, + symbol_ratio * 100.0, + MIN_CONTENT_RATIO * 100.0 )); } // Log info for documents with reasonable content debug!( - "OCR validation passed: {:.1}% confidence, {} words, {:.1}% alphanumeric", + "OCR validation passed: {:.1}% confidence, {} words, {:.1}% content (letters+digits)", result.confidence, result.word_count, - alphanumeric_ratio * 100.0 + content_ratio * 100.0 ); Ok(()) diff --git a/tests/integration_enhanced_ocr_tests.rs b/tests/integration_enhanced_ocr_tests.rs index 8b0f94f..fdc40af 100644 --- a/tests/integration_enhanced_ocr_tests.rs +++ b/tests/integration_enhanced_ocr_tests.rs @@ -366,7 +366,7 @@ mod tests { let settings = create_test_settings(); let result = OcrResult { - text: "!!!@@@###$$$%%%^^^&&&***".to_string(), // Mostly symbols, < 30% alphanumeric + text: "!!!@@@###$$$%%%^^^&&&***".to_string(), // Mostly symbols, < 10% content confidence: 85.0, processing_time_ms: 1000, word_count: 1, @@ -676,8 +676,8 @@ startxref let service = EnhancedOcrService::new(temp_path, file_service); let settings = create_test_settings(); - // Test invoice/receipt with >30% digits - // Should be accepted even with lower alphanumeric ratio due to high digit content + // Test invoice/receipt with lots of digits + // Should be accepted because digits count as valuable content (letters+digits >= 10%) let result = OcrResult { text: "Invoice 
#12345\n$1,234.56\n$2,345.67\nTotal: $3,580.23\n!!!".to_string(), confidence: 60.0, @@ -687,15 +687,15 @@ startxref processed_image_path: None, }; - // Calculate to verify we have >30% digits - let digit_count = result.text.chars().filter(|c| c.is_numeric()).count(); + // Calculate to verify we have good content ratio (letters + digits) + let content_count = result.text.chars().filter(|c| c.is_alphanumeric()).count(); let total_chars = result.text.len(); - let digit_ratio = digit_count as f32 / total_chars as f32; - assert!(digit_ratio > 0.3, "Test data should have >30% digits, got {:.1}%", digit_ratio * 100.0); + let content_ratio = content_count as f32 / total_chars as f32; + assert!(content_ratio >= 0.10, "Test data should have >=10% content, got {:.1}%", content_ratio * 100.0); let validation_result = service.validate_ocr_quality(&result, &settings); assert!(validation_result.is_ok(), - "Expected validation to pass for numeric document with {:.1}% digits", digit_ratio * 100.0); + "Expected validation to pass for numeric document with {:.1}% content", content_ratio * 100.0); } #[cfg(feature = "ocr")] @@ -707,8 +707,8 @@ startxref let service = EnhancedOcrService::new(temp_path, file_service); let settings = create_test_settings(); - // Test document with exactly 30% digits (boundary case) - // 30 digits + 70 non-digit chars = 100 total chars + // Test document with 30% digits and 70% letters (100% content) + // Should easily pass since content ratio = 100% let result = OcrResult { text: "123456789012345678901234567890AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA".to_string(), confidence: 60.0, @@ -718,19 +718,18 @@ startxref processed_image_path: None, }; - // Verify exactly 30% digits - let digit_count = result.text.chars().filter(|c| c.is_numeric()).count(); + // Verify 100% content (30% digits + 70% letters) + let content_count = result.text.chars().filter(|c| c.is_alphanumeric()).count(); let total_chars = result.text.len(); - let 
digit_ratio = digit_count as f32 / total_chars as f32; - assert_eq!(digit_count, 30, "Test data should have exactly 30 digits"); + let content_ratio = content_count as f32 / total_chars as f32; + assert_eq!(content_count, 100, "Test data should have 100% content"); assert_eq!(total_chars, 100, "Test data should have exactly 100 chars"); - assert!((digit_ratio - 0.3).abs() < 0.01, "Should have exactly 30% digits, got {:.1}%", digit_ratio * 100.0); + assert!((content_ratio - 1.0).abs() < 0.01, "Should have 100% content, got {:.1}%", content_ratio * 100.0); let validation_result = service.validate_ocr_quality(&result, &settings); - // At exactly 30%, it should NOT trigger the >30% special handling - // So it will be validated normally (which should pass with 100% alphanumeric) + // Should pass easily with 100% content (letters + digits) assert!(validation_result.is_ok(), - "Expected validation to pass at 30% digit boundary"); + "Expected validation to pass with 100% content"); } #[cfg(feature = "ocr")] @@ -742,10 +741,10 @@ startxref let service = EnhancedOcrService::new(temp_path, file_service); let settings = create_test_settings(); - // Test text with exactly 10% alphanumeric characters (boundary case) - // 1 letter + 9 symbols = 10 total chars = 10% alphanumeric + // Test text with exactly 10% content (letters+digits) - boundary case + // 1 letter + 9 symbols = 10 total chars = 10% content let result = OcrResult { - text: "a!!!!!!!!!".to_string(), // 1 alphanumeric + 9 symbols = 10% + text: "a!!!!!!!!!".to_string(), // 1 letter + 9 symbols = 10% content confidence: 60.0, processing_time_ms: 1000, word_count: 1, @@ -753,17 +752,17 @@ startxref processed_image_path: None, }; - // Verify exactly 10% alphanumeric - let alphanumeric_count = result.text.chars().filter(|c| c.is_alphanumeric()).count(); + // Verify exactly 10% content (letters+digits) + let content_count = result.text.chars().filter(|c| c.is_alphanumeric()).count(); let total_chars = result.text.len(); - 
let alphanumeric_ratio = alphanumeric_count as f32 / total_chars as f32; - assert_eq!(alphanumeric_count, 1, "Test data should have exactly 1 alphanumeric char"); + let content_ratio = content_count as f32 / total_chars as f32; + assert_eq!(content_count, 1, "Test data should have exactly 1 content char"); assert_eq!(total_chars, 10, "Test data should have exactly 10 chars"); - assert!((alphanumeric_ratio - 0.1).abs() < 0.01, "Should have exactly 10% alphanumeric, got {:.1}%", alphanumeric_ratio * 100.0); + assert!((content_ratio - 0.1).abs() < 0.01, "Should have exactly 10% content, got {:.1}%", content_ratio * 100.0); let validation_result = service.validate_ocr_quality(&result, &settings); assert!(validation_result.is_ok(), - "Expected validation to pass at 10% alphanumeric boundary"); + "Expected validation to pass at 10% content boundary"); } #[cfg(feature = "ocr")] @@ -775,10 +774,10 @@ startxref let service = EnhancedOcrService::new(temp_path, file_service); let settings = create_test_settings(); - // Test text with <10% alphanumeric (pure garbage) - // 1 letter + 13 symbols = 14 total chars = 7.14% alphanumeric + // Test text with <10% content (letters+digits) - pure garbage + // 1 letter + 14 symbols = 15 total chars = 6.67% content let result = OcrResult { - text: "a!!!!!!!!!!!!!!".to_string(), // 1 alphanumeric + 14 symbols = ~7% + text: "a!!!!!!!!!!!!!!".to_string(), // 1 letter + 14 symbols = ~7% content confidence: 60.0, processing_time_ms: 1000, word_count: 1, @@ -786,19 +785,19 @@ startxref processed_image_path: None, }; - // Verify <10% alphanumeric - let alphanumeric_count = result.text.chars().filter(|c| c.is_alphanumeric()).count(); + // Verify <10% content (letters+digits) + let content_count = result.text.chars().filter(|c| c.is_alphanumeric()).count(); let total_chars = result.text.len(); - let alphanumeric_ratio = alphanumeric_count as f32 / total_chars as f32; - assert!(alphanumeric_ratio < 0.10, "Test data should have <10% alphanumeric, 
got {:.1}%", alphanumeric_ratio * 100.0); + let content_ratio = content_count as f32 / total_chars as f32; + assert!(content_ratio < 0.10, "Test data should have <10% content, got {:.1}%", content_ratio * 100.0); let validation_result = service.validate_ocr_quality(&result, &settings); assert!(validation_result.is_err(), - "Expected validation to fail for <10% alphanumeric content"); + "Expected validation to fail for <10% content"); let error_msg = validation_result.unwrap_err(); - assert!(error_msg.contains("non-alphanumeric"), - "Expected error about non-alphanumeric content, got: {}", error_msg); + assert!(error_msg.contains("meaningful content"), + "Expected error about meaningful content, got: {}", error_msg); } #[cfg(feature = "ocr")]