diff --git a/frontend/src/pages/DocumentManagementPage.tsx b/frontend/src/pages/DocumentManagementPage.tsx
index 05c6cf9..98ade75 100644
--- a/frontend/src/pages/DocumentManagementPage.tsx
+++ b/frontend/src/pages/DocumentManagementPage.tsx
@@ -1249,7 +1249,7 @@ const DocumentManagementPage: React.FC = () => {
variant="outlined"
/>
)}
- {document.ocr_word_count !== undefined && (
+ {document.ocr_word_count !== undefined && document.ocr_word_count !== null && (
}
@@ -1689,8 +1689,8 @@ const DocumentManagementPage: React.FC = () => {
-
- {doc.ocr_confidence ? `${doc.ocr_confidence.toFixed(1)}%` : 'N/A'}
+
+ {doc.ocr_confidence !== undefined && doc.ocr_confidence !== null ? `${doc.ocr_confidence.toFixed(1)}%` : 'N/A'}
diff --git a/src/ocr/enhanced.rs b/src/ocr/enhanced.rs
index 9af58f5..3afa7b0 100644
--- a/src/ocr/enhanced.rs
+++ b/src/ocr/enhanced.rs
@@ -1662,41 +1662,54 @@ impl EnhancedOcrService {
/// Validate OCR result quality
#[cfg(feature = "ocr")]
- pub fn validate_ocr_quality(&self, result: &OcrResult, settings: &Settings) -> bool {
+ pub fn validate_ocr_quality(&self, result: &OcrResult, settings: &Settings) -> Result<(), String> {
// Check minimum confidence threshold
if result.confidence < settings.ocr_min_confidence {
- warn!(
- "OCR result below confidence threshold: {:.1}% < {:.1}%",
- result.confidence, settings.ocr_min_confidence
- );
- return false;
+ return Err(format!(
+ "OCR confidence below threshold: {:.1}% (minimum: {:.1}%)",
+ result.confidence,
+ settings.ocr_min_confidence
+ ));
}
-
+
// Check if text is reasonable (not just noise)
if result.word_count == 0 {
- warn!("OCR result contains no words");
- return false;
+ return Err("No words detected in OCR output".to_string());
}
-
+
// Check for reasonable character distribution
let total_chars = result.text.len();
if total_chars == 0 {
- return false;
+ return Err("OCR result contains no characters".to_string());
}
-
+
+ // Count alphanumeric characters and, separately, numeric characters (which are also alphanumeric)
let alphanumeric_chars = result.text.chars().filter(|c| c.is_alphanumeric()).count();
+ let digit_chars = result.text.chars().filter(|c| c.is_numeric()).count();
let alphanumeric_ratio = alphanumeric_chars as f32 / total_chars as f32;
-
- // Expect at least 30% alphanumeric characters for valid text
- if alphanumeric_ratio < 0.3 {
- warn!(
- "OCR result has low alphanumeric ratio: {:.1}%",
- alphanumeric_ratio * 100.0
+ let digit_ratio = digit_chars as f32 / total_chars as f32;
+
+ // Special handling for numeric-heavy documents (bills, transaction lists, etc.)
+ // If document has >40% digits, it's likely a valid numeric document
+ if digit_ratio > 0.4 {
+ debug!(
+ "Document has high numeric content: {:.1}% digits - accepting as valid numeric document",
+ digit_ratio * 100.0
);
- return false;
+ return Ok(());
}
-
- true
+
+ // Expect at least 20% alphanumeric characters for valid text (relaxed from 30%)
+ const MIN_ALPHANUMERIC_RATIO: f32 = 0.20;
+ if alphanumeric_ratio < MIN_ALPHANUMERIC_RATIO {
+ return Err(format!(
+ "OCR result has low alphanumeric content: {:.1}% (minimum: {:.1}%)",
+ alphanumeric_ratio * 100.0,
+ MIN_ALPHANUMERIC_RATIO * 100.0
+ ));
+ }
+
+ Ok(())
}
}
@@ -1711,8 +1724,8 @@ impl EnhancedOcrService {
}
- pub fn validate_ocr_quality(&self, _result: &OcrResult, _settings: &Settings) -> bool {
- false
+ pub fn validate_ocr_quality(&self, _result: &OcrResult, _settings: &Settings) -> Result<(), String> {
+ Err("OCR feature not enabled".to_string())
}
pub fn count_words_safely(&self, text: &str) -> usize {
diff --git a/src/ocr/queue.rs b/src/ocr/queue.rs
index 5cb91e5..f27b1cf 100644
--- a/src/ocr/queue.rs
+++ b/src/ocr/queue.rs
@@ -378,9 +378,8 @@ impl OcrQueueService {
match ocr_service.extract_text_with_context(&file_path, &mime_type, &filename, file_size, &settings).await {
Ok(ocr_result) => {
// Validate OCR quality
- if !ocr_service.validate_ocr_quality(&ocr_result, &settings) {
- let error_msg = format!("OCR quality below threshold: {:.1}% confidence, {} words",
- ocr_result.confidence, ocr_result.word_count);
+ if let Err(validation_error) = ocr_service.validate_ocr_quality(&ocr_result, &settings) {
+ let error_msg = format!("OCR quality validation failed: {}", validation_error);
warn!("⚠️ OCR quality issues for '{}' | Job: {} | Document: {} | {:.1}% confidence | {} words",
filename, item.id, item.document_id, ocr_result.confidence, ocr_result.word_count);
@@ -390,6 +389,9 @@ impl OcrQueueService {
"low_ocr_confidence",
&error_msg,
item.attempts,
+ Some(ocr_result.text.clone()),
+ Some(ocr_result.confidence),
+ Some(ocr_result.word_count as i32),
).await;
// Mark as failed for quality issues with proper failure reason
@@ -433,13 +435,16 @@ impl OcrQueueService {
// Use classification function to determine proper failure reason
let (failure_reason, _should_suppress) = Self::classify_ocr_error(error_msg);
-
+
// Create failed document record using helper function
let _ = self.create_failed_document_from_ocr_error(
item.document_id,
failure_reason,
error_msg,
item.attempts,
+ None,
+ None,
+ None,
).await;
self.mark_failed(item.id, error_msg).await?;
@@ -451,13 +456,16 @@ impl OcrQueueService {
// Use classification function to determine proper failure reason
let (failure_reason, _should_suppress) = Self::classify_ocr_error(&error_msg);
-
+
// Create failed document record using helper function
let _ = self.create_failed_document_from_ocr_error(
item.document_id,
failure_reason,
&error_msg,
item.attempts,
+ None,
+ None,
+ None,
).await;
self.mark_failed(item.id, &error_msg).await?;
@@ -472,13 +480,16 @@ impl OcrQueueService {
// Use classification function to determine proper failure reason
let (failure_reason, _should_suppress) = Self::classify_ocr_error(&error_msg);
-
+
// Create failed document record using helper function
let _ = self.create_failed_document_from_ocr_error(
item.document_id,
failure_reason,
&error_msg,
item.attempts,
+ None,
+ None,
+ None,
).await;
// Mark document as failed for no extractable text
@@ -560,6 +571,9 @@ impl OcrQueueService {
failure_reason,
&error_msg,
item.attempts,
+ None,
+ None,
+ None,
).await;
// Always use 'failed' status with specific failure reason
@@ -1127,6 +1141,9 @@ impl OcrQueueService {
failure_reason: &str,
error_message: &str,
retry_count: i32,
+ ocr_text: Option<String>,
+ ocr_confidence: Option<f32>,
+ ocr_word_count: Option<i32>,
) -> Result<()> {
// Query document directly from database without user restrictions (OCR service context)
let document_row = sqlx::query(
@@ -1166,9 +1183,9 @@ impl OcrQueueService {
mime_type: Some(mime_type),
content: None,
tags: Vec::new(),
- ocr_text: None,
- ocr_confidence: None,
- ocr_word_count: None,
+ ocr_text,
+ ocr_confidence,
+ ocr_word_count,
ocr_processing_time_ms: None,
failure_reason: failure_reason.to_string(),
failure_stage: "ocr".to_string(),
diff --git a/tests/integration_enhanced_ocr_tests.rs b/tests/integration_enhanced_ocr_tests.rs
index d7695f8..b5f8ba5 100644
--- a/tests/integration_enhanced_ocr_tests.rs
+++ b/tests/integration_enhanced_ocr_tests.rs
@@ -306,8 +306,8 @@ mod tests {
processed_image_path: None,
};
- let is_valid = service.validate_ocr_quality(&result, &settings);
- assert!(is_valid);
+ let result_validation = service.validate_ocr_quality(&result, &settings);
+ assert!(result_validation.is_ok());
}
#[cfg(feature = "ocr")]
@@ -329,8 +329,8 @@ mod tests {
processed_image_path: None,
};
- let is_valid = service.validate_ocr_quality(&result, &settings);
- assert!(!is_valid);
+ let result_validation = service.validate_ocr_quality(&result, &settings);
+ assert!(result_validation.is_err());
}
#[cfg(feature = "ocr")]
@@ -351,8 +351,8 @@ mod tests {
processed_image_path: None,
};
- let is_valid = service.validate_ocr_quality(&result, &settings);
- assert!(!is_valid);
+ let result_validation = service.validate_ocr_quality(&result, &settings);
+ assert!(result_validation.is_err());
}
#[cfg(feature = "ocr")]
@@ -373,8 +373,8 @@ mod tests {
processed_image_path: None,
};
- let is_valid = service.validate_ocr_quality(&result, &settings);
- assert!(!is_valid);
+ let result_validation = service.validate_ocr_quality(&result, &settings);
+ assert!(result_validation.is_err());
}
#[cfg(feature = "ocr")]
@@ -395,8 +395,8 @@ mod tests {
processed_image_path: None,
};
- let is_valid = service.validate_ocr_quality(&result, &settings);
- assert!(is_valid);
+ let result_validation = service.validate_ocr_quality(&result, &settings);
+ assert!(result_validation.is_ok());
}
#[tokio::test]
diff --git a/tests/integration_pdf_word_count_tests.rs b/tests/integration_pdf_word_count_tests.rs
index 2c4b509..1fd3ca2 100644
--- a/tests/integration_pdf_word_count_tests.rs
+++ b/tests/integration_pdf_word_count_tests.rs
@@ -243,12 +243,12 @@ mod pdf_word_count_integration_tests {
match service.extract_text_from_pdf(&pdf_path, &settings).await {
Ok(result) => {
// Test quality validation
- let is_valid = service.validate_ocr_quality(&result, &settings);
-
+ let result_validation = service.validate_ocr_quality(&result, &settings);
+
if result.word_count > 0 {
- assert!(is_valid, "Good quality PDF should pass validation");
+ assert!(result_validation.is_ok(), "Good quality PDF should pass validation");
} else {
- assert!(!is_valid, "PDF with 0 words should fail validation");
+ assert!(result_validation.is_err(), "PDF with 0 words should fail validation");
}
// Verify OCR result structure