feat(ui): handle strange responses that the UI could receive

This commit is contained in:
perf3ct 2025-10-05 13:45:10 -07:00
parent 1e652ed62e
commit e7574cb0da
No known key found for this signature in database
GPG Key ID: 569C4EEC436F5232
5 changed files with 79 additions and 49 deletions

View File

@ -1249,7 +1249,7 @@ const DocumentManagementPage: React.FC = () => {
variant="outlined"
/>
)}
{document.ocr_word_count !== undefined && (
{document.ocr_word_count !== undefined && document.ocr_word_count !== null && (
<Chip
size="small"
icon={<FindInPageIcon />}
@ -1689,8 +1689,8 @@ const DocumentManagementPage: React.FC = () => {
</Typography>
</TableCell>
<TableCell>
<Typography variant="body2" color={doc.ocr_confidence ? 'warning.main' : 'error.main'}>
{doc.ocr_confidence ? `${doc.ocr_confidence.toFixed(1)}%` : 'N/A'}
<Typography variant="body2" color={doc.ocr_confidence !== undefined && doc.ocr_confidence !== null ? 'warning.main' : 'error.main'}>
{doc.ocr_confidence !== undefined && doc.ocr_confidence !== null ? `${doc.ocr_confidence.toFixed(1)}%` : 'N/A'}
</Typography>
</TableCell>
<TableCell>

View File

@ -1662,41 +1662,54 @@ impl EnhancedOcrService {
/// Validate OCR result quality.
///
/// The checks run in this order and the first one that fails decides the result:
///
/// 1. `result.confidence` must not be below `settings.ocr_min_confidence`.
/// 2. `result.word_count` must be non-zero.
/// 3. `result.text` must contain at least one character.
/// 4. Text in which more than 40% of the characters are digits is accepted
///    immediately (bills, transaction lists, and similar numeric documents).
/// 5. Otherwise at least 20% of the characters must be alphanumeric.
///
/// Ratios are computed over Unicode scalar values (`char`s), not UTF-8 bytes,
/// so multi-byte scripts are measured on the same scale as ASCII text.
///
/// # Errors
///
/// Returns `Err` with a human-readable description of the first failed check.
#[cfg(feature = "ocr")]
pub fn validate_ocr_quality(&self, result: &OcrResult, settings: &Settings) -> Result<(), String> {
    // Expect at least 20% alphanumeric characters for valid text (relaxed from 30%)
    const MIN_ALPHANUMERIC_RATIO: f32 = 0.20;
    // Above this share of digits the text is treated as a valid numeric document
    const NUMERIC_DOCUMENT_DIGIT_RATIO: f32 = 0.4;

    // Check minimum confidence threshold
    if result.confidence < settings.ocr_min_confidence {
        return Err(format!(
            "OCR confidence below threshold: {:.1}% (minimum: {:.1}%)",
            result.confidence,
            settings.ocr_min_confidence
        ));
    }

    // Check if text is reasonable (not just noise)
    if result.word_count == 0 {
        return Err("No words detected in OCR output".to_string());
    }

    // Count total, alphanumeric and digit characters in a single pass.
    // The total must come from `chars()` as well: `str::len()` is a byte count
    // and would deflate both ratios for any non-ASCII text.
    let (total_chars, alphanumeric_chars, digit_chars) = result.text.chars().fold(
        (0usize, 0usize, 0usize),
        |(total, alphanumeric, digits), c| {
            (
                total + 1,
                alphanumeric + usize::from(c.is_alphanumeric()),
                digits + usize::from(c.is_numeric()),
            )
        },
    );

    // Check for reasonable character distribution
    if total_chars == 0 {
        return Err("OCR result contains no characters".to_string());
    }

    let alphanumeric_ratio = alphanumeric_chars as f32 / total_chars as f32;
    let digit_ratio = digit_chars as f32 / total_chars as f32;

    // Special handling for numeric-heavy documents (bills, transaction lists, etc.)
    if digit_ratio > NUMERIC_DOCUMENT_DIGIT_RATIO {
        debug!(
            "Document has high numeric content: {:.1}% digits - accepting as valid numeric document",
            digit_ratio * 100.0
        );
        return Ok(());
    }

    if alphanumeric_ratio < MIN_ALPHANUMERIC_RATIO {
        return Err(format!(
            "OCR result has low alphanumeric content: {:.1}% (minimum: {:.1}%)",
            alphanumeric_ratio * 100.0,
            MIN_ALPHANUMERIC_RATIO * 100.0
        ));
    }

    Ok(())
}
}
@ -1711,8 +1724,8 @@ impl EnhancedOcrService {
}
/// Stub `validate_ocr_quality`: ignores both arguments and always reports failure.
/// NOTE(review): presumably compiled only when the `ocr` feature is disabled —
/// the gating `cfg` attribute is outside this hunk; confirm.
// Diff view: the `-> bool` signature with its `false` body appears to be the
// pre-change version; the `Result<(), String>` signature with its `Err(..)`
// body is the replacement.
pub fn validate_ocr_quality(&self, _result: &OcrResult, _settings: &Settings) -> bool {
false
pub fn validate_ocr_quality(&self, _result: &OcrResult, _settings: &Settings) -> Result<(), String> {
Err("OCR feature not enabled".to_string())
}
pub fn count_words_safely(&self, text: &str) -> usize {

View File

@ -378,9 +378,8 @@ impl OcrQueueService {
match ocr_service.extract_text_with_context(&file_path, &mime_type, &filename, file_size, &settings).await {
Ok(ocr_result) => {
// Validate OCR quality
if !ocr_service.validate_ocr_quality(&ocr_result, &settings) {
let error_msg = format!("OCR quality below threshold: {:.1}% confidence, {} words",
ocr_result.confidence, ocr_result.word_count);
if let Err(validation_error) = ocr_service.validate_ocr_quality(&ocr_result, &settings) {
let error_msg = format!("OCR quality validation failed: {}", validation_error);
warn!("⚠️ OCR quality issues for '{}' | Job: {} | Document: {} | {:.1}% confidence | {} words",
filename, item.id, item.document_id, ocr_result.confidence, ocr_result.word_count);
@ -390,6 +389,9 @@ impl OcrQueueService {
"low_ocr_confidence",
&error_msg,
item.attempts,
Some(ocr_result.text.clone()),
Some(ocr_result.confidence),
Some(ocr_result.word_count as i32),
).await;
// Mark as failed for quality issues with proper failure reason
@ -440,6 +442,9 @@ impl OcrQueueService {
failure_reason,
error_msg,
item.attempts,
None,
None,
None,
).await;
self.mark_failed(item.id, error_msg).await?;
@ -458,6 +463,9 @@ impl OcrQueueService {
failure_reason,
&error_msg,
item.attempts,
None,
None,
None,
).await;
self.mark_failed(item.id, &error_msg).await?;
@ -479,6 +487,9 @@ impl OcrQueueService {
failure_reason,
&error_msg,
item.attempts,
None,
None,
None,
).await;
// Mark document as failed for no extractable text
@ -560,6 +571,9 @@ impl OcrQueueService {
failure_reason,
&error_msg,
item.attempts,
None,
None,
None,
).await;
// Always use 'failed' status with specific failure reason
@ -1127,6 +1141,9 @@ impl OcrQueueService {
failure_reason: &str,
error_message: &str,
retry_count: i32,
ocr_text: Option<String>,
ocr_confidence: Option<f32>,
ocr_word_count: Option<i32>,
) -> Result<()> {
// Query document directly from database without user restrictions (OCR service context)
let document_row = sqlx::query(
@ -1166,9 +1183,9 @@ impl OcrQueueService {
mime_type: Some(mime_type),
content: None,
tags: Vec::new(),
ocr_text: None,
ocr_confidence: None,
ocr_word_count: None,
ocr_text,
ocr_confidence,
ocr_word_count,
ocr_processing_time_ms: None,
failure_reason: failure_reason.to_string(),
failure_stage: "ocr".to_string(),

View File

@ -306,8 +306,8 @@ mod tests {
processed_image_path: None,
};
let is_valid = service.validate_ocr_quality(&result, &settings);
assert!(is_valid);
let result_validation = service.validate_ocr_quality(&result, &settings);
assert!(result_validation.is_ok());
}
#[cfg(feature = "ocr")]
@ -329,8 +329,8 @@ mod tests {
processed_image_path: None,
};
let is_valid = service.validate_ocr_quality(&result, &settings);
assert!(!is_valid);
let result_validation = service.validate_ocr_quality(&result, &settings);
assert!(result_validation.is_err());
}
#[cfg(feature = "ocr")]
@ -351,8 +351,8 @@ mod tests {
processed_image_path: None,
};
let is_valid = service.validate_ocr_quality(&result, &settings);
assert!(!is_valid);
let result_validation = service.validate_ocr_quality(&result, &settings);
assert!(result_validation.is_err());
}
#[cfg(feature = "ocr")]
@ -373,8 +373,8 @@ mod tests {
processed_image_path: None,
};
let is_valid = service.validate_ocr_quality(&result, &settings);
assert!(!is_valid);
let result_validation = service.validate_ocr_quality(&result, &settings);
assert!(result_validation.is_err());
}
#[cfg(feature = "ocr")]
@ -395,8 +395,8 @@ mod tests {
processed_image_path: None,
};
let is_valid = service.validate_ocr_quality(&result, &settings);
assert!(is_valid);
let result_validation = service.validate_ocr_quality(&result, &settings);
assert!(result_validation.is_ok());
}
#[tokio::test]

View File

@ -243,12 +243,12 @@ mod pdf_word_count_integration_tests {
match service.extract_text_from_pdf(&pdf_path, &settings).await {
Ok(result) => {
// Test quality validation
let is_valid = service.validate_ocr_quality(&result, &settings);
let result_validation = service.validate_ocr_quality(&result, &settings);
if result.word_count > 0 {
assert!(is_valid, "Good quality PDF should pass validation");
assert!(result_validation.is_ok(), "Good quality PDF should pass validation");
} else {
assert!(!is_valid, "PDF with 0 words should fail validation");
assert!(result_validation.is_err(), "PDF with 0 words should fail validation");
}
// Verify OCR result structure