diff --git a/frontend/src/pages/DocumentManagementPage.tsx b/frontend/src/pages/DocumentManagementPage.tsx index 05c6cf9..98ade75 100644 --- a/frontend/src/pages/DocumentManagementPage.tsx +++ b/frontend/src/pages/DocumentManagementPage.tsx @@ -1249,7 +1249,7 @@ const DocumentManagementPage: React.FC = () => { variant="outlined" /> )} - {document.ocr_word_count !== undefined && ( + {document.ocr_word_count !== undefined && document.ocr_word_count !== null && ( } @@ -1689,8 +1689,8 @@ const DocumentManagementPage: React.FC = () => { - - {doc.ocr_confidence ? `${doc.ocr_confidence.toFixed(1)}%` : 'N/A'} + + {doc.ocr_confidence !== undefined && doc.ocr_confidence !== null ? `${doc.ocr_confidence.toFixed(1)}%` : 'N/A'} diff --git a/src/ocr/enhanced.rs b/src/ocr/enhanced.rs index 9af58f5..3afa7b0 100644 --- a/src/ocr/enhanced.rs +++ b/src/ocr/enhanced.rs @@ -1662,41 +1662,54 @@ impl EnhancedOcrService { /// Validate OCR result quality #[cfg(feature = "ocr")] - pub fn validate_ocr_quality(&self, result: &OcrResult, settings: &Settings) -> bool { + pub fn validate_ocr_quality(&self, result: &OcrResult, settings: &Settings) -> Result<(), String> { // Check minimum confidence threshold if result.confidence < settings.ocr_min_confidence { - warn!( - "OCR result below confidence threshold: {:.1}% < {:.1}%", - result.confidence, settings.ocr_min_confidence - ); - return false; + return Err(format!( + "OCR confidence below threshold: {:.1}% (minimum: {:.1}%)", + result.confidence, + settings.ocr_min_confidence + )); } - + // Check if text is reasonable (not just noise) if result.word_count == 0 { - warn!("OCR result contains no words"); - return false; + return Err("No words detected in OCR output".to_string()); } - + // Check for reasonable character distribution let total_chars = result.text.len(); if total_chars == 0 { - return false; + return Err("OCR result contains no characters".to_string()); } - + + // Count alphanumeric characters and digits separately let 
alphanumeric_chars = result.text.chars().filter(|c| c.is_alphanumeric()).count(); + let digit_chars = result.text.chars().filter(|c| c.is_numeric()).count(); let alphanumeric_ratio = alphanumeric_chars as f32 / total_chars as f32; - - // Expect at least 30% alphanumeric characters for valid text - if alphanumeric_ratio < 0.3 { - warn!( - "OCR result has low alphanumeric ratio: {:.1}%", - alphanumeric_ratio * 100.0 + let digit_ratio = digit_chars as f32 / total_chars as f32; + + // Special handling for numeric-heavy documents (bills, transaction lists, etc.) + // If document has >40% digits, it's likely a valid numeric document + if digit_ratio > 0.4 { + debug!( + "Document has high numeric content: {:.1}% digits - accepting as valid numeric document", + digit_ratio * 100.0 ); - return false; + return Ok(()); } - - true + + // Expect at least 20% alphanumeric characters for valid text (relaxed from 30%) + const MIN_ALPHANUMERIC_RATIO: f32 = 0.20; + if alphanumeric_ratio < MIN_ALPHANUMERIC_RATIO { + return Err(format!( + "OCR result has low alphanumeric content: {:.1}% (minimum: {:.1}%)", + alphanumeric_ratio * 100.0, + MIN_ALPHANUMERIC_RATIO * 100.0 + )); + } + + Ok(()) } } @@ -1711,8 +1724,8 @@ impl EnhancedOcrService { } - pub fn validate_ocr_quality(&self, _result: &OcrResult, _settings: &Settings) -> bool { - false + pub fn validate_ocr_quality(&self, _result: &OcrResult, _settings: &Settings) -> Result<(), String> { + Err("OCR feature not enabled".to_string()) } pub fn count_words_safely(&self, text: &str) -> usize { diff --git a/src/ocr/queue.rs b/src/ocr/queue.rs index 5cb91e5..f27b1cf 100644 --- a/src/ocr/queue.rs +++ b/src/ocr/queue.rs @@ -378,9 +378,8 @@ impl OcrQueueService { match ocr_service.extract_text_with_context(&file_path, &mime_type, &filename, file_size, &settings).await { Ok(ocr_result) => { // Validate OCR quality - if !ocr_service.validate_ocr_quality(&ocr_result, &settings) { - let error_msg = format!("OCR quality below threshold: {:.1}% 
confidence, {} words", - ocr_result.confidence, ocr_result.word_count); + if let Err(validation_error) = ocr_service.validate_ocr_quality(&ocr_result, &settings) { + let error_msg = format!("OCR quality validation failed: {}", validation_error); warn!("⚠️ OCR quality issues for '{}' | Job: {} | Document: {} | {:.1}% confidence | {} words", filename, item.id, item.document_id, ocr_result.confidence, ocr_result.word_count); @@ -390,6 +389,9 @@ impl OcrQueueService { "low_ocr_confidence", &error_msg, item.attempts, + Some(ocr_result.text.clone()), + Some(ocr_result.confidence), + Some(ocr_result.word_count as i32), ).await; // Mark as failed for quality issues with proper failure reason @@ -433,13 +435,16 @@ impl OcrQueueService { // Use classification function to determine proper failure reason let (failure_reason, _should_suppress) = Self::classify_ocr_error(error_msg); - + // Create failed document record using helper function let _ = self.create_failed_document_from_ocr_error( item.document_id, failure_reason, error_msg, item.attempts, + None, + None, + None, ).await; self.mark_failed(item.id, error_msg).await?; @@ -451,13 +456,16 @@ impl OcrQueueService { // Use classification function to determine proper failure reason let (failure_reason, _should_suppress) = Self::classify_ocr_error(&error_msg); - + // Create failed document record using helper function let _ = self.create_failed_document_from_ocr_error( item.document_id, failure_reason, &error_msg, item.attempts, + None, + None, + None, ).await; self.mark_failed(item.id, &error_msg).await?; @@ -472,13 +480,16 @@ impl OcrQueueService { // Use classification function to determine proper failure reason let (failure_reason, _should_suppress) = Self::classify_ocr_error(&error_msg); - + // Create failed document record using helper function let _ = self.create_failed_document_from_ocr_error( item.document_id, failure_reason, &error_msg, item.attempts, + None, + None, + None, ).await; // Mark document as failed for 
no extractable text @@ -560,6 +571,9 @@ impl OcrQueueService { failure_reason, &error_msg, item.attempts, + None, + None, + None, ).await; // Always use 'failed' status with specific failure reason @@ -1127,6 +1141,9 @@ impl OcrQueueService { failure_reason: &str, error_message: &str, retry_count: i32, + ocr_text: Option<String>, + ocr_confidence: Option<f32>, + ocr_word_count: Option<i32>, ) -> Result<()> { // Query document directly from database without user restrictions (OCR service context) let document_row = sqlx::query( @@ -1166,9 +1183,9 @@ impl OcrQueueService { mime_type: Some(mime_type), content: None, tags: Vec::new(), - ocr_text: None, - ocr_confidence: None, - ocr_word_count: None, + ocr_text, + ocr_confidence, + ocr_word_count, ocr_processing_time_ms: None, failure_reason: failure_reason.to_string(), failure_stage: "ocr".to_string(), diff --git a/tests/integration_enhanced_ocr_tests.rs b/tests/integration_enhanced_ocr_tests.rs index d7695f8..b5f8ba5 100644 --- a/tests/integration_enhanced_ocr_tests.rs +++ b/tests/integration_enhanced_ocr_tests.rs @@ -306,8 +306,8 @@ mod tests { processed_image_path: None, }; - let is_valid = service.validate_ocr_quality(&result, &settings); - assert!(is_valid); + let result_validation = service.validate_ocr_quality(&result, &settings); + assert!(result_validation.is_ok()); } #[cfg(feature = "ocr")] @@ -329,8 +329,8 @@ mod tests { processed_image_path: None, }; - let is_valid = service.validate_ocr_quality(&result, &settings); - assert!(!is_valid); + let result_validation = service.validate_ocr_quality(&result, &settings); + assert!(result_validation.is_err()); } #[cfg(feature = "ocr")] @@ -351,8 +351,8 @@ mod tests { processed_image_path: None, }; - let is_valid = service.validate_ocr_quality(&result, &settings); - assert!(!is_valid); + let result_validation = service.validate_ocr_quality(&result, &settings); + assert!(result_validation.is_err()); } #[cfg(feature = "ocr")] @@ -373,8 +373,8 @@ mod tests { processed_image_path: None, }; 
- let is_valid = service.validate_ocr_quality(&result, &settings); - assert!(!is_valid); + let result_validation = service.validate_ocr_quality(&result, &settings); + assert!(result_validation.is_err()); } #[cfg(feature = "ocr")] @@ -395,8 +395,8 @@ mod tests { processed_image_path: None, }; - let is_valid = service.validate_ocr_quality(&result, &settings); - assert!(is_valid); + let result_validation = service.validate_ocr_quality(&result, &settings); + assert!(result_validation.is_ok()); } #[tokio::test] diff --git a/tests/integration_pdf_word_count_tests.rs b/tests/integration_pdf_word_count_tests.rs index 2c4b509..1fd3ca2 100644 --- a/tests/integration_pdf_word_count_tests.rs +++ b/tests/integration_pdf_word_count_tests.rs @@ -243,12 +243,12 @@ mod pdf_word_count_integration_tests { match service.extract_text_from_pdf(&pdf_path, &settings).await { Ok(result) => { // Test quality validation - let is_valid = service.validate_ocr_quality(&result, &settings); - + let result_validation = service.validate_ocr_quality(&result, &settings); + if result.word_count > 0 { - assert!(is_valid, "Good quality PDF should pass validation"); + assert!(result_validation.is_ok(), "Good quality PDF should pass validation"); } else { - assert!(!is_valid, "PDF with 0 words should fail validation"); + assert!(result_validation.is_err(), "PDF with 0 words should fail validation"); } // Verify OCR result structure