feat(ui): handle strange responses that the UI could receive

2025-10-05 13:45:10 -07:00 · 2025-10-05 13:45:10 -07:00 · e7574cb0da
parent 1e652ed62e
commit e7574cb0da
5 changed files with 79 additions and 49 deletions
--- a/frontend/src/pages/DocumentManagementPage.tsx
+++ b/frontend/src/pages/DocumentManagementPage.tsx
@ -1249,7 +1249,7 @@ const DocumentManagementPage: React.FC = () => {
                                          variant="outlined"
                                        />
                                      )}
-                                      {document.ocr_word_count !== undefined && (
+                                      {document.ocr_word_count !== undefined && document.ocr_word_count !== null && (
                                        <Chip
                                          size="small"
                                          icon={<FindInPageIcon />}
@ -1689,8 +1689,8 @@ const DocumentManagementPage: React.FC = () => {
                                </Typography>
                              </TableCell>
                              <TableCell>
-                                <Typography variant="body2" color={doc.ocr_confidence ? 'warning.main' : 'error.main'}>
+                                <Typography variant="body2" color={doc.ocr_confidence !== undefined && doc.ocr_confidence !== null ? 'warning.main' : 'error.main'}>
-                                  {doc.ocr_confidence ? `${doc.ocr_confidence.toFixed(1)}%` : 'N/A'}
+                                  {doc.ocr_confidence !== undefined && doc.ocr_confidence !== null ? `${doc.ocr_confidence.toFixed(1)}%` : 'N/A'}
                                </Typography>
                              </TableCell>
                              <TableCell>
--- a/src/ocr/enhanced.rs
+++ b/src/ocr/enhanced.rs
@ -1662,41 +1662,54 @@ impl EnhancedOcrService {
    /// Validate OCR result quality
    #[cfg(feature = "ocr")]
-    pub fn validate_ocr_quality(&self, result: &OcrResult, settings: &Settings) -> bool {
+    pub fn validate_ocr_quality(&self, result: &OcrResult, settings: &Settings) -> Result<(), String> {
        // Check minimum confidence threshold
        if result.confidence < settings.ocr_min_confidence {
-            warn!(
+            return Err(format!(
-                "OCR result below confidence threshold: {:.1}% < {:.1}%", 
+                "OCR confidence below threshold: {:.1}% (minimum: {:.1}%)",
-                result.confidence, settings.ocr_min_confidence
+                result.confidence,
-            );
+                settings.ocr_min_confidence
-            return false;
+            ));
        }
-        
+
        // Check if text is reasonable (not just noise)
        if result.word_count == 0 {
-            warn!("OCR result contains no words");
+            return Err("No words detected in OCR output".to_string());
            return false;
        }
-        
+
        // Check for reasonable character distribution
        let total_chars = result.text.len();
        if total_chars == 0 {
-            return false;
+            return Err("OCR result contains no characters".to_string());
        }
-        
+
        // Count alphanumeric characters and digits separately
        let alphanumeric_chars = result.text.chars().filter(|c| c.is_alphanumeric()).count();
        let digit_chars = result.text.chars().filter(|c| c.is_numeric()).count();
        let alphanumeric_ratio = alphanumeric_chars as f32 / total_chars as f32;
-        
+        let digit_ratio = digit_chars as f32 / total_chars as f32;
-        // Expect at least 30% alphanumeric characters for valid text
+
-        if alphanumeric_ratio < 0.3 {
+        // Special handling for numeric-heavy documents (bills, transaction lists, etc.)
-            warn!(
+        // If document has >40% digits, it's likely a valid numeric document
-                "OCR result has low alphanumeric ratio: {:.1}%", 
+        if digit_ratio > 0.4 {
-                alphanumeric_ratio * 100.0
+            debug!(
                "Document has high numeric content: {:.1}% digits - accepting as valid numeric document",
                digit_ratio * 100.0
            );
-            return false;
+            return Ok(());
        }
-        
+
-        true
+        // Expect at least 20% alphanumeric characters for valid text (relaxed from 30%)
        const MIN_ALPHANUMERIC_RATIO: f32 = 0.20;
        if alphanumeric_ratio < MIN_ALPHANUMERIC_RATIO {
            return Err(format!(
                "OCR result has low alphanumeric content: {:.1}% (minimum: {:.1}%)",
                alphanumeric_ratio * 100.0,
                MIN_ALPHANUMERIC_RATIO * 100.0
            ));
        }
        Ok(())
    }
 }
@ -1711,8 +1724,8 @@ impl EnhancedOcrService {
    }
-    pub fn validate_ocr_quality(&self, _result: &OcrResult, _settings: &Settings) -> bool {
+    pub fn validate_ocr_quality(&self, _result: &OcrResult, _settings: &Settings) -> Result<(), String> {
-        false
+        Err("OCR feature not enabled".to_string())
    }
    pub fn count_words_safely(&self, text: &str) -> usize {
--- a/src/ocr/queue.rs
+++ b/src/ocr/queue.rs
@ -378,9 +378,8 @@ impl OcrQueueService {
                match ocr_service.extract_text_with_context(&file_path, &mime_type, &filename, file_size, &settings).await {
                    Ok(ocr_result) => {
                        // Validate OCR quality
-                        if !ocr_service.validate_ocr_quality(&ocr_result, &settings) {
+                        if let Err(validation_error) = ocr_service.validate_ocr_quality(&ocr_result, &settings) {
-                            let error_msg = format!("OCR quality below threshold: {:.1}% confidence, {} words", 
+                            let error_msg = format!("OCR quality validation failed: {}", validation_error);
                                                   ocr_result.confidence, ocr_result.word_count);
                            warn!("⚠️  OCR quality issues for '{}' | Job: {} | Document: {} | {:.1}% confidence | {} words", 
                                  filename, item.id, item.document_id, ocr_result.confidence, ocr_result.word_count);
@ -390,6 +389,9 @@ impl OcrQueueService {
                                "low_ocr_confidence",
                                &error_msg,
                                item.attempts,
                                Some(ocr_result.text.clone()),
                                Some(ocr_result.confidence),
                                Some(ocr_result.word_count as i32),
                            ).await;
                            // Mark as failed for quality issues with proper failure reason
@ -433,13 +435,16 @@ impl OcrQueueService {
                                    // Use classification function to determine proper failure reason
                                    let (failure_reason, _should_suppress) = Self::classify_ocr_error(error_msg);
-                                    
+
                                    // Create failed document record using helper function
                                    let _ = self.create_failed_document_from_ocr_error(
                                        item.document_id,
                                        failure_reason,
                                        error_msg,
                                        item.attempts,
                                        None,
                                        None,
                                        None,
                                    ).await;
                                    self.mark_failed(item.id, error_msg).await?;
@ -451,13 +456,16 @@ impl OcrQueueService {
                                    // Use classification function to determine proper failure reason
                                    let (failure_reason, _should_suppress) = Self::classify_ocr_error(&error_msg);
-                                    
+
                                    // Create failed document record using helper function
                                    let _ = self.create_failed_document_from_ocr_error(
                                        item.document_id,
                                        failure_reason,
                                        &error_msg,
                                        item.attempts,
                                        None,
                                        None,
                                        None,
                                    ).await;
                                    self.mark_failed(item.id, &error_msg).await?;
@ -472,13 +480,16 @@ impl OcrQueueService {
                            // Use classification function to determine proper failure reason
                            let (failure_reason, _should_suppress) = Self::classify_ocr_error(&error_msg);
-                            
+
                            // Create failed document record using helper function
                            let _ = self.create_failed_document_from_ocr_error(
                                item.document_id,
                                failure_reason,
                                &error_msg,
                                item.attempts,
                                None,
                                None,
                                None,
                            ).await;
                            // Mark document as failed for no extractable text
@ -560,6 +571,9 @@ impl OcrQueueService {
                            failure_reason,
                            &error_msg,
                            item.attempts,
                            None,
                            None,
                            None,
                        ).await;
                        // Always use 'failed' status with specific failure reason
@ -1127,6 +1141,9 @@ impl OcrQueueService {
        failure_reason: &str,
        error_message: &str,
        retry_count: i32,
        ocr_text: Option<String>,
        ocr_confidence: Option<f32>,
        ocr_word_count: Option<i32>,
    ) -> Result<()> {
        // Query document directly from database without user restrictions (OCR service context)
        let document_row = sqlx::query(
@ -1166,9 +1183,9 @@ impl OcrQueueService {
                mime_type: Some(mime_type),
                content: None,
                tags: Vec::new(),
-                ocr_text: None,
+                ocr_text,
-                ocr_confidence: None,
+                ocr_confidence,
-                ocr_word_count: None,
+                ocr_word_count,
                ocr_processing_time_ms: None,
                failure_reason: failure_reason.to_string(),
                failure_stage: "ocr".to_string(),
--- a/tests/integration_enhanced_ocr_tests.rs
+++ b/tests/integration_enhanced_ocr_tests.rs
@ -306,8 +306,8 @@ mod tests {
            processed_image_path: None,
        };
-        let is_valid = service.validate_ocr_quality(&result, &settings);
+        let result_validation = service.validate_ocr_quality(&result, &settings);
-        assert!(is_valid);
+        assert!(result_validation.is_ok());
    }
    #[cfg(feature = "ocr")]
@ -329,8 +329,8 @@ mod tests {
            processed_image_path: None,
        };
-        let is_valid = service.validate_ocr_quality(&result, &settings);
+        let result_validation = service.validate_ocr_quality(&result, &settings);
-        assert!(!is_valid);
+        assert!(result_validation.is_err());
    }
    #[cfg(feature = "ocr")]
@ -351,8 +351,8 @@ mod tests {
            processed_image_path: None,
        };
-        let is_valid = service.validate_ocr_quality(&result, &settings);
+        let result_validation = service.validate_ocr_quality(&result, &settings);
-        assert!(!is_valid);
+        assert!(result_validation.is_err());
    }
    #[cfg(feature = "ocr")]
@ -373,8 +373,8 @@ mod tests {
            processed_image_path: None,
        };
-        let is_valid = service.validate_ocr_quality(&result, &settings);
+        let result_validation = service.validate_ocr_quality(&result, &settings);
-        assert!(!is_valid);
+        assert!(result_validation.is_err());
    }
    #[cfg(feature = "ocr")]
@ -395,8 +395,8 @@ mod tests {
            processed_image_path: None,
        };
-        let is_valid = service.validate_ocr_quality(&result, &settings);
+        let result_validation = service.validate_ocr_quality(&result, &settings);
-        assert!(is_valid);
+        assert!(result_validation.is_ok());
    }
    #[tokio::test]
--- a/tests/integration_pdf_word_count_tests.rs
+++ b/tests/integration_pdf_word_count_tests.rs
@ -243,12 +243,12 @@ mod pdf_word_count_integration_tests {
        match service.extract_text_from_pdf(&pdf_path, &settings).await {
            Ok(result) => {
                // Test quality validation
-                let is_valid = service.validate_ocr_quality(&result, &settings);
+                let result_validation = service.validate_ocr_quality(&result, &settings);
-                
+
                if result.word_count > 0 {
-                    assert!(is_valid, "Good quality PDF should pass validation");
+                    assert!(result_validation.is_ok(), "Good quality PDF should pass validation");
                } else {
-                    assert!(!is_valid, "PDF with 0 words should fail validation");
+                    assert!(result_validation.is_err(), "PDF with 0 words should fail validation");
                }
                // Verify OCR result structure