feat(ocr): implement new dev stack and allow for more numbers in ocr documents

This commit is contained in:
perf3ct 2025-10-28 14:34:34 -07:00
parent 8337b988b7
commit 65c49ef4f2
No known key found for this signature in database
GPG Key ID: 569C4EEC436F5232
10 changed files with 585 additions and 78 deletions

76
.dockerignore Normal file
View File

@ -0,0 +1,76 @@
# Files excluded from the Docker build context of the repository root.
# NOTE: patterns are resolved relative to the build-context root. A bare
# pattern such as `*.md` or `__pycache__/` only matches entries directly in
# that root; a `**/` prefix would be needed to match at any depth.

# Git
.git
.gitignore
.github
# Documentation
*.md
docs/
site/
# Development files (local environment files and logs)
.env
.env.dev
.env.test
*.log
# Test files and data
test_data/
test_files/
test-uploads/
test-results/
tests/
# Build artifacts
target/
frontend/dist/
frontend/node_modules/
# IDE and editor files
.vscode/
.idea/
*.swp
*.swo
*~
# CI/CD and tooling configuration
.woodpecker/
.claude/
# Docker files (no need to copy into image)
Dockerfile*
docker-compose*.yml
.dockerignore
# Python
__pycache__/
*.py[cod]
*$py.class
venv/
*.egg-info/
.pytest_cache/
# Temporary files
*.tmp
*.bak
.DS_Store
# Uploads and watch directories (will be mounted)
readur_uploads/
readur_watch/
uploads/
# Other project files that are not needed inside the image
charts/
scripts/
Makefile
requirements.txt
*.sh
grafana-dashboard.json
nginx.conf
ssl/
config/
.cargo/
renovate.json
mkdocs.yml
.playwright-mcp/

73
Dockerfile.dev Normal file
View File

@ -0,0 +1,73 @@
# Development Dockerfile with hot-reloading support for Rust backend
FROM rust:1.90-bookworm

# Install system dependencies for OCR and PDF processing
RUN apt-get update && apt-get install -y \
    tesseract-ocr \
    tesseract-ocr-eng \
    tesseract-ocr-spa \
    tesseract-ocr-fra \
    tesseract-ocr-deu \
    tesseract-ocr-ita \
    tesseract-ocr-por \
    tesseract-ocr-rus \
    tesseract-ocr-chi-sim \
    tesseract-ocr-chi-tra \
    tesseract-ocr-jpn \
    tesseract-ocr-kor \
    tesseract-ocr-ara \
    tesseract-ocr-hin \
    tesseract-ocr-nld \
    tesseract-ocr-swe \
    tesseract-ocr-nor \
    tesseract-ocr-dan \
    tesseract-ocr-fin \
    tesseract-ocr-pol \
    tesseract-ocr-ces \
    tesseract-ocr-hun \
    tesseract-ocr-tur \
    tesseract-ocr-tha \
    tesseract-ocr-vie \
    libtesseract-dev \
    libleptonica-dev \
    pkg-config \
    libclang-dev \
    clang \
    poppler-utils \
    ocrmypdf \
    curl \
    # Legacy DOC file support
    antiword \
    catdoc \
    && rm -rf /var/lib/apt/lists/*

# Install cargo-watch for auto-recompilation.
# --locked builds with the Cargo.lock shipped with cargo-watch, so the image
# build does not break when one of its dependencies publishes a new release.
RUN cargo install --locked cargo-watch

WORKDIR /app

# Create necessary directories
RUN mkdir -p /app/uploads /app/watch /app/frontend

# Copy dependency files first for better caching
COPY Cargo.toml Cargo.lock ./

# Create dummy source files for all binaries so `cargo build` can resolve,
# download and compile every dependency at image build time.
# NOTE: `target` is deleted again in the same layer, so the compiled artifacts
# do not end up in the image; what survives is whatever cargo downloaded into
# its own home directory (presumably the registry/git caches - confirm against
# the volumes mounted by the compose file).
RUN mkdir -p src/bin && \
    echo "fn main() {}" > src/main.rs && \
    echo "fn main() {}" > src/bin/test_runner.rs && \
    echo "fn main() {}" > src/bin/analyze-webdav-performance.rs && \
    cargo build && \
    rm -rf src target

# The actual source code will be mounted as a volume
# This allows for hot-reloading without rebuilding the image
EXPOSE 8000

# Use cargo-watch to automatically rebuild and restart on file changes
# --why: shows what triggered the rebuild
# --ignore: ignore certain file patterns to avoid unnecessary rebuilds
# Run the main readur binary
CMD ["cargo", "watch", "-x", "run --bin readur", "--why", "--ignore", "frontend/*", "--ignore", "*.md", "--ignore", "test*"]

122
docker-compose.dev.yml Normal file
View File

@ -0,0 +1,122 @@
# Docker Compose configuration for development with hot-reloading
services:
postgres:
image: postgres:17-alpine
environment:
POSTGRES_USER: readur
POSTGRES_PASSWORD: readur
POSTGRES_DB: readur
volumes:
- postgres_data_dev:/var/lib/postgresql/data
ports:
- "5432:5432"
healthcheck:
test: ["CMD-SHELL", "pg_isready -U readur"]
interval: 10s
timeout: 5s
retries: 5
backend:
build:
context: .
dockerfile: Dockerfile.dev
environment:
# Database configuration
DATABASE_URL: postgresql://readur:readur@postgres/readur
# Server configuration
SERVER_HOST: 0.0.0.0
SERVER_PORT: 8000
# Security
JWT_SECRET: dev-secret-key-change-in-production
# File paths
UPLOAD_PATH: /app/uploads
WATCH_FOLDER: /app/watch
# OCR configuration
OCR_LANGUAGE: eng
CONCURRENT_OCR_JOBS: 4
OCR_TIMEOUT_SECONDS: 300
MAX_FILE_SIZE_MB: 50
# Performance
MEMORY_LIMIT_MB: 512
CPU_PRIORITY: normal
# File watching
ALLOWED_FILE_TYPES: pdf,txt,doc,docx,png,jpg,jpeg
WATCH_INTERVAL_SECONDS: 30
FILE_STABILITY_CHECK_MS: 1000
MAX_FILE_AGE_HOURS: 24
# Development mode
RUST_LOG: debug
RUST_BACKTRACE: 1
ports:
- "8000:8000"
volumes:
# Mount source code for hot-reloading
- ./src:/app/src
- ./Cargo.toml:/app/Cargo.toml
- ./Cargo.lock:/app/Cargo.lock
- ./migrations:/app/migrations
# Persistent storage
- ./readur_uploads:/app/uploads
- ./readur_watch:/app/watch
# Cache cargo registry and git dependencies to speed up rebuilds
- cargo_registry:/usr/local/cargo/registry
- cargo_git:/usr/local/cargo/git
# Cache Cargo's target directory (all build artifacts) in a named volume so rebuilds stay incremental
- target_cache:/app/target
depends_on:
postgres:
condition: service_healthy
# No healthcheck in dev mode to avoid noise during restarts
frontend:
build:
context: ./frontend
dockerfile: Dockerfile.dev
environment:
# Configure Vite to proxy API requests to the backend service
# In Docker network, services can communicate by service name
VITE_API_PROXY_TARGET: http://backend:8000
# Use a less common port to avoid conflicts
CLIENT_PORT: 3456
ports:
- "3456:3456"
volumes:
# Mount entire frontend directory for hot-reloading
# This is simpler and avoids file vs directory mount issues
- ./frontend:/app
# Exclude node_modules - use the container's version
- /app/node_modules
# Exclude any build artifacts
- /app/dist
depends_on:
- backend
# Enable stdin and tty for interactive npm commands
stdin_open: true
tty: true
volumes:
postgres_data_dev:
cargo_registry:
cargo_git:
target_cache:

48
frontend/.dockerignore Normal file
View File

@ -0,0 +1,48 @@
# Node modules (installed in container)
node_modules/
# Build output (generated in container)
dist/
# Tests
**/*.test.ts
**/*.test.tsx
**/*.spec.ts
**/*.spec.tsx
test/
coverage/
.nyc_output/
# Development files
.env
.env.local
.env.*.local
# Git
.git
.gitignore
# IDE
.vscode/
.idea/
*.swp
*.swo
# Logs
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
lerna-debug.log*
# Documentation
*.md
# Playwright
playwright-report/
test-results/
playwright/.cache/
# OS
.DS_Store
Thumbs.db

20
frontend/Dockerfile.dev Normal file
View File

@ -0,0 +1,20 @@
# Development Dockerfile for frontend with Vite hot-reloading
FROM node:22-bookworm

WORKDIR /app

# Port the Vite dev server listens on. vite.config.ts reads CLIENT_PORT, so a
# default is set here to keep the server on the port exposed below even when
# the container is started without the variable. It can still be overridden at
# run time (e.g. from a compose file).
ENV CLIENT_PORT=3456

# Copy package files for dependency installation
COPY package*.json ./

# Install dependencies
RUN npm install

# The source code will be mounted as a volume for hot-reloading
# Vite's dev server provides hot module replacement (HMR)
EXPOSE 3456

# Run Vite dev server
# --host 0.0.0.0 allows connections from outside the container
# --port is set via CLIENT_PORT environment variable in vite.config.ts
CMD ["npm", "run", "dev", "--", "--host", "0.0.0.0"]

View File

@ -13,6 +13,7 @@ import {
IconButton,
Paper,
Alert,
AlertTitle,
CircularProgress,
Tooltip,
Dialog,
@ -24,6 +25,7 @@ import {
Skeleton,
TextField,
InputAdornment,
Snackbar,
} from '@mui/material';
import Grid from '@mui/material/GridLegacy';
import {
@ -52,6 +54,9 @@ import {
OpenInFull as ExpandIcon,
Close as CloseIcon,
Delete as DeleteIcon,
Schedule as ScheduleIcon,
CheckCircle as CheckCircleIcon,
Error as ErrorIcon,
} from '@mui/icons-material';
import { documentService, OcrResponse, type Document } from '../services/api';
import DocumentViewer from '../components/DocumentViewer';
@ -72,6 +77,67 @@ const DocumentDetailsPage: React.FC = () => {
const { mode, modernTokens, glassEffect } = useTheme();
const theme = useMuiTheme();
const [document, setDocument] = useState<Document | null>(null);
// Renders the chip that represents a document's OCR status.
// A missing or empty status is shown the same way as 'pending'; a status this
// helper does not know about renders nothing.
const getOcrStatusBadge = (status?: string) => {
  // Chip styling tinted from a single palette family (light fill, dark text).
  const tintedSx = (tone: { light: string; dark: string; main: string }) => ({
    backgroundColor: tone.light,
    color: tone.dark,
    border: `1px solid ${tone.main}`,
    fontWeight: 600,
  });

  switch (status || 'pending') {
    case 'pending':
      return (
        <Chip
          icon={<ScheduleIcon sx={{ fontSize: 18 }} />}
          label="Pending OCR"
          sx={tintedSx(theme.palette.warning)}
        />
      );
    case 'processing':
      return (
        <Chip
          icon={<CircularProgress size={16} sx={{ ml: 1 }} />}
          label="Processing..."
          sx={tintedSx(theme.palette.info)}
        />
      );
    case 'completed':
      return (
        <Chip
          icon={<CheckCircleIcon sx={{ fontSize: 18 }} />}
          label="Completed"
          color="success"
          sx={{ fontWeight: 600 }}
        />
      );
    case 'failed':
      return (
        <Chip
          icon={<ErrorIcon sx={{ fontSize: 18 }} />}
          label="Failed"
          color="error"
          sx={{ fontWeight: 600 }}
        />
      );
    default:
      return null;
  }
};
const [loading, setLoading] = useState<boolean>(true);
const [error, setError] = useState<string | null>(null);
const [ocrText, setOcrText] = useState<string>('');
@ -92,15 +158,20 @@ const DocumentDetailsPage: React.FC = () => {
// Retry functionality state
const [retryingOcr, setRetryingOcr] = useState<boolean>(false);
const [retryHistoryModalOpen, setRetryHistoryModalOpen] = useState<boolean>(false);
// Delete functionality state
const [deleting, setDeleting] = useState<boolean>(false);
const [deleteConfirmOpen, setDeleteConfirmOpen] = useState<boolean>(false);
// Snackbar state for retry feedback
const [snackbarOpen, setSnackbarOpen] = useState<boolean>(false);
const [snackbarMessage, setSnackbarMessage] = useState<string>('');
const [snackbarSeverity, setSnackbarSeverity] = useState<'success' | 'error'>('success');
// Retry handlers
const handleRetryOcr = async () => {
if (!document) return;
setRetryingOcr(true);
try {
await documentService.bulkRetryOcr({
@ -108,13 +179,22 @@ const DocumentDetailsPage: React.FC = () => {
document_ids: [document.id],
priority_override: 15,
});
// Show success message and refresh document
// Show success message
setSnackbarMessage('OCR retry initiated successfully');
setSnackbarSeverity('success');
setSnackbarOpen(true);
// Refresh document after a brief delay
setTimeout(() => {
fetchDocumentDetails();
}, 1000);
} catch (error) {
} catch (error: any) {
console.error('Failed to retry OCR:', error);
// Show error message
setSnackbarMessage(`Failed to retry OCR: ${error.message || 'Unknown error'}`);
setSnackbarSeverity('error');
setSnackbarOpen(true);
} finally {
setRetryingOcr(false);
}
@ -170,6 +250,21 @@ const DocumentDetailsPage: React.FC = () => {
fetchAvailableLabels();
}, []);
// Poll the document while OCR is in flight: either the loaded document
// reports 'processing' or a retry request is currently being sent.
useEffect(() => {
  const shouldPoll =
    !!document && (document.ocr_status === 'processing' || retryingOcr);
  if (!shouldPoll) return;

  // Re-fetch every 3 seconds; the timer is cleared when the effect re-runs
  // or the component unmounts.
  const pollTimer = setInterval(() => {
    fetchDocumentDetails();
  }, 3000);

  return () => clearInterval(pollTimer);
}, [document?.ocr_status, retryingOcr]);
const fetchDocumentDetails = async (): Promise<void> => {
if (!id) {
setError(t('documentDetails.errors.notFound'));
@ -492,8 +587,37 @@ const DocumentDetailsPage: React.FC = () => {
</Box>
</Fade>
{/* OCR Status Alert - Shows when failed */}
{document?.ocr_status === 'failed' && (
<Fade in timeout={600}>
<Alert
severity="error"
sx={{
mb: 4,
borderRadius: 3,
border: `1px solid ${theme.palette.error.main}`,
}}
action={
<Button
color="inherit"
size="small"
onClick={handleRetryOcr}
disabled={retryingOcr}
startIcon={retryingOcr ? <CircularProgress size={16} /> : <RefreshIcon />}
sx={{ fontWeight: 600 }}
>
Retry OCR
</Button>
}
>
<AlertTitle sx={{ fontWeight: 700 }}>OCR Processing Failed</AlertTitle>
{document.ocr_failure_reason || document.ocr_error || 'OCR processing encountered an error. You can retry the operation.'}
</Alert>
</Fade>
)}
{/* Modern Content Layout */}
<Fade in timeout={800}>
<Fade in timeout={700}>
<Grid container spacing={4}>
{/* Hero Document Preview */}
<Grid item xs={12} lg={5}>
@ -664,19 +788,43 @@ const DocumentDetailsPage: React.FC = () => {
</Box>
)}
{document.has_ocr_text && (
<Box sx={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center' }}>
{/* OCR Status with Retry Info */}
<Box>
<Box sx={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', mb: 1 }}>
<Typography variant="body2" color="text.secondary">
{t('documentDetails.metadata.ocrStatus')}
</Typography>
<Chip
label={t('documentDetails.metadata.textExtracted')}
color="success"
size="small"
icon={<TextIcon sx={{ fontSize: 16 }} />}
/>
{getOcrStatusBadge(document.ocr_status)}
</Box>
)}
{/* Retry Count Indicator */}
{document.ocr_retry_count != null && document.ocr_retry_count > 0 && (
<Box
onClick={handleShowRetryHistory}
sx={{
display: 'flex',
alignItems: 'center',
gap: 0.5,
cursor: 'pointer',
p: 1,
mt: 1,
borderRadius: 1.5,
backgroundColor: theme.palette.action.hover,
border: `1px solid ${theme.palette.divider}`,
transition: 'all 0.2s ease',
'&:hover': {
backgroundColor: theme.palette.action.selected,
borderColor: theme.palette.info.main,
},
}}
>
<HistoryIcon sx={{ fontSize: 16, color: theme.palette.info.main }} />
<Typography variant="caption" sx={{ fontWeight: 600, color: theme.palette.text.secondary }}>
Retried {document.ocr_retry_count}× - click for history
</Typography>
</Box>
)}
</Box>
</Stack>
{/* Action Buttons */}
@ -1490,6 +1638,26 @@ const DocumentDetailsPage: React.FC = () => {
</Button>
</DialogActions>
</Dialog>
{/* Snackbar for Retry Feedback */}
<Snackbar
open={snackbarOpen}
autoHideDuration={6000}
onClose={() => setSnackbarOpen(false)}
anchorOrigin={{ vertical: 'bottom', horizontal: 'right' }}
>
<Alert
onClose={() => setSnackbarOpen(false)}
severity={snackbarSeverity}
sx={{
width: '100%',
borderRadius: 2,
boxShadow: theme.shadows[8],
}}
>
{snackbarMessage}
</Alert>
</Snackbar>
</Box>
);
};

View File

@ -61,6 +61,10 @@ export interface Document {
ocr_word_count?: number
ocr_processing_time_ms?: number
ocr_status?: string
ocr_error?: string
ocr_failure_reason?: string
ocr_retry_count?: number
ocr_completed_at?: string
}
export interface SearchRequest {
@ -113,6 +117,10 @@ export interface EnhancedDocument {
ocr_word_count?: number
ocr_processing_time_ms?: number
ocr_status?: string
ocr_error?: string
ocr_failure_reason?: string
ocr_retry_count?: number
ocr_completed_at?: string
search_rank?: number
snippets: SearchSnippet[]
}
@ -250,6 +258,8 @@ export interface OcrResponse {
ocr_processing_time_ms?: number
ocr_status?: string
ocr_error?: string
ocr_failure_reason?: string
ocr_retry_count?: number
ocr_completed_at?: string
}

View File

@ -4,6 +4,8 @@ import react from '@vitejs/plugin-react'
// Support environment variables for development
const BACKEND_PORT = process.env.BACKEND_PORT || '8000'
const CLIENT_PORT = process.env.CLIENT_PORT || '5173'
// Allow overriding the proxy target for Docker development
const PROXY_TARGET = process.env.VITE_API_PROXY_TARGET || `http://localhost:${BACKEND_PORT}`
export default defineConfig({
plugins: [react()],
@ -15,7 +17,7 @@ export default defineConfig({
port: parseInt(CLIENT_PORT),
proxy: {
'/api': {
target: `http://localhost:${BACKEND_PORT}`,
target: PROXY_TARGET,
changeOrigin: true,
},
},

View File

@ -1693,40 +1693,29 @@ impl EnhancedOcrService {
return Err("No words detected in OCR output".to_string());
}
// Special handling for numeric-heavy documents (bills, receipts, invoices)
let digit_chars = result.text.chars().filter(|c| c.is_numeric()).count();
let digit_ratio = digit_chars as f32 / total_chars as f32;
// Count valuable content: letters + digits (explicitly treating digits as good content)
let content_chars = result.text.chars().filter(|c| c.is_alphanumeric()).count();
let content_ratio = content_chars as f32 / total_chars as f32;
// If >30% digits, likely a valid numeric document - be more lenient
if digit_ratio > 0.3 {
debug!(
"Document has high numeric content: {:.1}% digits - accepting as valid numeric document",
digit_ratio * 100.0
);
return Ok(());
}
// Count alphanumeric characters
let alphanumeric_chars = result.text.chars().filter(|c| c.is_alphanumeric()).count();
let alphanumeric_ratio = alphanumeric_chars as f32 / total_chars as f32;
// Relaxed threshold: only reject if >90% symbols (likely garbage)
// This allows bills/receipts with lots of numbers and special characters
const MIN_ALPHANUMERIC_RATIO: f32 = 0.10;
if alphanumeric_ratio < MIN_ALPHANUMERIC_RATIO {
// Only reject if >90% symbols (likely OCR garbage)
// This allows bills/receipts/invoices with numbers and formatting characters
const MIN_CONTENT_RATIO: f32 = 0.10;
if content_ratio < MIN_CONTENT_RATIO {
let symbol_ratio = 1.0 - content_ratio;
return Err(format!(
"OCR result has too much non-alphanumeric content: {:.1}% alphanumeric (minimum: {:.1}%)",
alphanumeric_ratio * 100.0,
MIN_ALPHANUMERIC_RATIO * 100.0
"OCR result has too little meaningful content: {:.1}% content (letters+digits), {:.1}% symbols/formatting (minimum content: {:.1}%)",
content_ratio * 100.0,
symbol_ratio * 100.0,
MIN_CONTENT_RATIO * 100.0
));
}
// Log info for documents with reasonable content
debug!(
"OCR validation passed: {:.1}% confidence, {} words, {:.1}% alphanumeric",
"OCR validation passed: {:.1}% confidence, {} words, {:.1}% content (letters+digits)",
result.confidence,
result.word_count,
alphanumeric_ratio * 100.0
content_ratio * 100.0
);
Ok(())

View File

@ -366,7 +366,7 @@ mod tests {
let settings = create_test_settings();
let result = OcrResult {
text: "!!!@@@###$$$%%%^^^&&&***".to_string(), // Mostly symbols, < 30% alphanumeric
text: "!!!@@@###$$$%%%^^^&&&***".to_string(), // Mostly symbols, < 10% content
confidence: 85.0,
processing_time_ms: 1000,
word_count: 1,
@ -676,8 +676,8 @@ startxref
let service = EnhancedOcrService::new(temp_path, file_service);
let settings = create_test_settings();
// Test invoice/receipt with >30% digits
// Should be accepted even with lower alphanumeric ratio due to high digit content
// Test invoice/receipt with lots of digits
// Should be accepted because digits count as valuable content (letters+digits >= 10%)
let result = OcrResult {
text: "Invoice #12345\n$1,234.56\n$2,345.67\nTotal: $3,580.23\n!!!".to_string(),
confidence: 60.0,
@ -687,15 +687,15 @@ startxref
processed_image_path: None,
};
// Calculate to verify we have >30% digits
let digit_count = result.text.chars().filter(|c| c.is_numeric()).count();
// Calculate to verify we have good content ratio (letters + digits)
let content_count = result.text.chars().filter(|c| c.is_alphanumeric()).count();
let total_chars = result.text.len();
let digit_ratio = digit_count as f32 / total_chars as f32;
assert!(digit_ratio > 0.3, "Test data should have >30% digits, got {:.1}%", digit_ratio * 100.0);
let content_ratio = content_count as f32 / total_chars as f32;
assert!(content_ratio >= 0.10, "Test data should have >=10% content, got {:.1}%", content_ratio * 100.0);
let validation_result = service.validate_ocr_quality(&result, &settings);
assert!(validation_result.is_ok(),
"Expected validation to pass for numeric document with {:.1}% digits", digit_ratio * 100.0);
"Expected validation to pass for numeric document with {:.1}% content", content_ratio * 100.0);
}
#[cfg(feature = "ocr")]
@ -707,8 +707,8 @@ startxref
let service = EnhancedOcrService::new(temp_path, file_service);
let settings = create_test_settings();
// Test document with exactly 30% digits (boundary case)
// 30 digits + 70 non-digit chars = 100 total chars
// Test document with 30% digits and 70% letters (100% content)
// Should easily pass since content ratio = 100%
let result = OcrResult {
text: "123456789012345678901234567890AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA".to_string(),
confidence: 60.0,
@ -718,19 +718,18 @@ startxref
processed_image_path: None,
};
// Verify exactly 30% digits
let digit_count = result.text.chars().filter(|c| c.is_numeric()).count();
// Verify 100% content (30% digits + 70% letters)
let content_count = result.text.chars().filter(|c| c.is_alphanumeric()).count();
let total_chars = result.text.len();
let digit_ratio = digit_count as f32 / total_chars as f32;
assert_eq!(digit_count, 30, "Test data should have exactly 30 digits");
let content_ratio = content_count as f32 / total_chars as f32;
assert_eq!(content_count, 100, "Test data should have 100% content");
assert_eq!(total_chars, 100, "Test data should have exactly 100 chars");
assert!((digit_ratio - 0.3).abs() < 0.01, "Should have exactly 30% digits, got {:.1}%", digit_ratio * 100.0);
assert!((content_ratio - 1.0).abs() < 0.01, "Should have 100% content, got {:.1}%", content_ratio * 100.0);
let validation_result = service.validate_ocr_quality(&result, &settings);
// At exactly 30%, it should NOT trigger the >30% special handling
// So it will be validated normally (which should pass with 100% alphanumeric)
// Should pass easily with 100% content (letters + digits)
assert!(validation_result.is_ok(),
"Expected validation to pass at 30% digit boundary");
"Expected validation to pass with 100% content");
}
#[cfg(feature = "ocr")]
@ -742,10 +741,10 @@ startxref
let service = EnhancedOcrService::new(temp_path, file_service);
let settings = create_test_settings();
// Test text with exactly 10% alphanumeric characters (boundary case)
// 1 letter + 9 symbols = 10 total chars = 10% alphanumeric
// Test text with exactly 10% content (letters+digits) - boundary case
// 1 letter + 9 symbols = 10 total chars = 10% content
let result = OcrResult {
text: "a!!!!!!!!!".to_string(), // 1 alphanumeric + 9 symbols = 10%
text: "a!!!!!!!!!".to_string(), // 1 letter + 9 symbols = 10% content
confidence: 60.0,
processing_time_ms: 1000,
word_count: 1,
@ -753,17 +752,17 @@ startxref
processed_image_path: None,
};
// Verify exactly 10% alphanumeric
let alphanumeric_count = result.text.chars().filter(|c| c.is_alphanumeric()).count();
// Verify exactly 10% content (letters+digits)
let content_count = result.text.chars().filter(|c| c.is_alphanumeric()).count();
let total_chars = result.text.len();
let alphanumeric_ratio = alphanumeric_count as f32 / total_chars as f32;
assert_eq!(alphanumeric_count, 1, "Test data should have exactly 1 alphanumeric char");
let content_ratio = content_count as f32 / total_chars as f32;
assert_eq!(content_count, 1, "Test data should have exactly 1 content char");
assert_eq!(total_chars, 10, "Test data should have exactly 10 chars");
assert!((alphanumeric_ratio - 0.1).abs() < 0.01, "Should have exactly 10% alphanumeric, got {:.1}%", alphanumeric_ratio * 100.0);
assert!((content_ratio - 0.1).abs() < 0.01, "Should have exactly 10% content, got {:.1}%", content_ratio * 100.0);
let validation_result = service.validate_ocr_quality(&result, &settings);
assert!(validation_result.is_ok(),
"Expected validation to pass at 10% alphanumeric boundary");
"Expected validation to pass at 10% content boundary");
}
#[cfg(feature = "ocr")]
@ -775,10 +774,10 @@ startxref
let service = EnhancedOcrService::new(temp_path, file_service);
let settings = create_test_settings();
// Test text with <10% alphanumeric (pure garbage)
// 1 letter + 13 symbols = 14 total chars = 7.14% alphanumeric
// Test text with <10% content (letters+digits) - pure garbage
// 1 letter + 14 symbols = 15 total chars = 6.67% content
let result = OcrResult {
text: "a!!!!!!!!!!!!!!".to_string(), // 1 alphanumeric + 14 symbols = ~7%
text: "a!!!!!!!!!!!!!!".to_string(), // 1 letter + 14 symbols = ~7% content
confidence: 60.0,
processing_time_ms: 1000,
word_count: 1,
@ -786,19 +785,19 @@ startxref
processed_image_path: None,
};
// Verify <10% alphanumeric
let alphanumeric_count = result.text.chars().filter(|c| c.is_alphanumeric()).count();
// Verify <10% content (letters+digits)
let content_count = result.text.chars().filter(|c| c.is_alphanumeric()).count();
let total_chars = result.text.len();
let alphanumeric_ratio = alphanumeric_count as f32 / total_chars as f32;
assert!(alphanumeric_ratio < 0.10, "Test data should have <10% alphanumeric, got {:.1}%", alphanumeric_ratio * 100.0);
let content_ratio = content_count as f32 / total_chars as f32;
assert!(content_ratio < 0.10, "Test data should have <10% content, got {:.1}%", content_ratio * 100.0);
let validation_result = service.validate_ocr_quality(&result, &settings);
assert!(validation_result.is_err(),
"Expected validation to fail for <10% alphanumeric content");
"Expected validation to fail for <10% content");
let error_msg = validation_result.unwrap_err();
assert!(error_msg.contains("non-alphanumeric"),
"Expected error about non-alphanumeric content, got: {}", error_msg);
assert!(error_msg.contains("meaningful content"),
"Expected error about meaningful content, got: {}", error_msg);
}
#[cfg(feature = "ocr")]