feat(ui): handle strange responses that the UI could receive
This commit is contained in:
parent
1e652ed62e
commit
e7574cb0da
|
|
@ -1249,7 +1249,7 @@ const DocumentManagementPage: React.FC = () => {
|
|||
variant="outlined"
|
||||
/>
|
||||
)}
|
||||
{document.ocr_word_count !== undefined && (
|
||||
{document.ocr_word_count !== undefined && document.ocr_word_count !== null && (
|
||||
<Chip
|
||||
size="small"
|
||||
icon={<FindInPageIcon />}
|
||||
|
|
@ -1689,8 +1689,8 @@ const DocumentManagementPage: React.FC = () => {
|
|||
</Typography>
|
||||
</TableCell>
|
||||
<TableCell>
|
||||
<Typography variant="body2" color={doc.ocr_confidence ? 'warning.main' : 'error.main'}>
|
||||
{doc.ocr_confidence ? `${doc.ocr_confidence.toFixed(1)}%` : 'N/A'}
|
||||
<Typography variant="body2" color={doc.ocr_confidence !== undefined && doc.ocr_confidence !== null ? 'warning.main' : 'error.main'}>
|
||||
{doc.ocr_confidence !== undefined && doc.ocr_confidence !== null ? `${doc.ocr_confidence.toFixed(1)}%` : 'N/A'}
|
||||
</Typography>
|
||||
</TableCell>
|
||||
<TableCell>
|
||||
|
|
|
|||
|
|
@ -1662,41 +1662,54 @@ impl EnhancedOcrService {
|
|||
|
||||
/// Validate OCR result quality
|
||||
#[cfg(feature = "ocr")]
|
||||
pub fn validate_ocr_quality(&self, result: &OcrResult, settings: &Settings) -> bool {
|
||||
pub fn validate_ocr_quality(&self, result: &OcrResult, settings: &Settings) -> Result<(), String> {
|
||||
// Check minimum confidence threshold
|
||||
if result.confidence < settings.ocr_min_confidence {
|
||||
warn!(
|
||||
"OCR result below confidence threshold: {:.1}% < {:.1}%",
|
||||
result.confidence, settings.ocr_min_confidence
|
||||
);
|
||||
return false;
|
||||
return Err(format!(
|
||||
"OCR confidence below threshold: {:.1}% (minimum: {:.1}%)",
|
||||
result.confidence,
|
||||
settings.ocr_min_confidence
|
||||
));
|
||||
}
|
||||
|
||||
|
||||
// Check if text is reasonable (not just noise)
|
||||
if result.word_count == 0 {
|
||||
warn!("OCR result contains no words");
|
||||
return false;
|
||||
return Err("No words detected in OCR output".to_string());
|
||||
}
|
||||
|
||||
|
||||
// Check for reasonable character distribution
|
||||
let total_chars = result.text.len();
|
||||
if total_chars == 0 {
|
||||
return false;
|
||||
return Err("OCR result contains no characters".to_string());
|
||||
}
|
||||
|
||||
|
||||
// Count alphanumeric characters and digits separately
|
||||
let alphanumeric_chars = result.text.chars().filter(|c| c.is_alphanumeric()).count();
|
||||
let digit_chars = result.text.chars().filter(|c| c.is_numeric()).count();
|
||||
let alphanumeric_ratio = alphanumeric_chars as f32 / total_chars as f32;
|
||||
|
||||
// Expect at least 30% alphanumeric characters for valid text
|
||||
if alphanumeric_ratio < 0.3 {
|
||||
warn!(
|
||||
"OCR result has low alphanumeric ratio: {:.1}%",
|
||||
alphanumeric_ratio * 100.0
|
||||
let digit_ratio = digit_chars as f32 / total_chars as f32;
|
||||
|
||||
// Special handling for numeric-heavy documents (bills, transaction lists, etc.)
|
||||
// If document has >40% digits, it's likely a valid numeric document
|
||||
if digit_ratio > 0.4 {
|
||||
debug!(
|
||||
"Document has high numeric content: {:.1}% digits - accepting as valid numeric document",
|
||||
digit_ratio * 100.0
|
||||
);
|
||||
return false;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
true
|
||||
|
||||
// Expect at least 20% alphanumeric characters for valid text (relaxed from 30%)
|
||||
const MIN_ALPHANUMERIC_RATIO: f32 = 0.20;
|
||||
if alphanumeric_ratio < MIN_ALPHANUMERIC_RATIO {
|
||||
return Err(format!(
|
||||
"OCR result has low alphanumeric content: {:.1}% (minimum: {:.1}%)",
|
||||
alphanumeric_ratio * 100.0,
|
||||
MIN_ALPHANUMERIC_RATIO * 100.0
|
||||
));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1711,8 +1724,8 @@ impl EnhancedOcrService {
|
|||
}
|
||||
|
||||
|
||||
pub fn validate_ocr_quality(&self, _result: &OcrResult, _settings: &Settings) -> bool {
|
||||
false
|
||||
pub fn validate_ocr_quality(&self, _result: &OcrResult, _settings: &Settings) -> Result<(), String> {
|
||||
Err("OCR feature not enabled".to_string())
|
||||
}
|
||||
|
||||
pub fn count_words_safely(&self, text: &str) -> usize {
|
||||
|
|
|
|||
|
|
@ -378,9 +378,8 @@ impl OcrQueueService {
|
|||
match ocr_service.extract_text_with_context(&file_path, &mime_type, &filename, file_size, &settings).await {
|
||||
Ok(ocr_result) => {
|
||||
// Validate OCR quality
|
||||
if !ocr_service.validate_ocr_quality(&ocr_result, &settings) {
|
||||
let error_msg = format!("OCR quality below threshold: {:.1}% confidence, {} words",
|
||||
ocr_result.confidence, ocr_result.word_count);
|
||||
if let Err(validation_error) = ocr_service.validate_ocr_quality(&ocr_result, &settings) {
|
||||
let error_msg = format!("OCR quality validation failed: {}", validation_error);
|
||||
warn!("⚠️ OCR quality issues for '{}' | Job: {} | Document: {} | {:.1}% confidence | {} words",
|
||||
filename, item.id, item.document_id, ocr_result.confidence, ocr_result.word_count);
|
||||
|
||||
|
|
@ -390,6 +389,9 @@ impl OcrQueueService {
|
|||
"low_ocr_confidence",
|
||||
&error_msg,
|
||||
item.attempts,
|
||||
Some(ocr_result.text.clone()),
|
||||
Some(ocr_result.confidence),
|
||||
Some(ocr_result.word_count as i32),
|
||||
).await;
|
||||
|
||||
// Mark as failed for quality issues with proper failure reason
|
||||
|
|
@ -433,13 +435,16 @@ impl OcrQueueService {
|
|||
|
||||
// Use classification function to determine proper failure reason
|
||||
let (failure_reason, _should_suppress) = Self::classify_ocr_error(error_msg);
|
||||
|
||||
|
||||
// Create failed document record using helper function
|
||||
let _ = self.create_failed_document_from_ocr_error(
|
||||
item.document_id,
|
||||
failure_reason,
|
||||
error_msg,
|
||||
item.attempts,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
).await;
|
||||
|
||||
self.mark_failed(item.id, error_msg).await?;
|
||||
|
|
@ -451,13 +456,16 @@ impl OcrQueueService {
|
|||
|
||||
// Use classification function to determine proper failure reason
|
||||
let (failure_reason, _should_suppress) = Self::classify_ocr_error(&error_msg);
|
||||
|
||||
|
||||
// Create failed document record using helper function
|
||||
let _ = self.create_failed_document_from_ocr_error(
|
||||
item.document_id,
|
||||
failure_reason,
|
||||
&error_msg,
|
||||
item.attempts,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
).await;
|
||||
|
||||
self.mark_failed(item.id, &error_msg).await?;
|
||||
|
|
@ -472,13 +480,16 @@ impl OcrQueueService {
|
|||
|
||||
// Use classification function to determine proper failure reason
|
||||
let (failure_reason, _should_suppress) = Self::classify_ocr_error(&error_msg);
|
||||
|
||||
|
||||
// Create failed document record using helper function
|
||||
let _ = self.create_failed_document_from_ocr_error(
|
||||
item.document_id,
|
||||
failure_reason,
|
||||
&error_msg,
|
||||
item.attempts,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
).await;
|
||||
|
||||
// Mark document as failed for no extractable text
|
||||
|
|
@ -560,6 +571,9 @@ impl OcrQueueService {
|
|||
failure_reason,
|
||||
&error_msg,
|
||||
item.attempts,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
).await;
|
||||
|
||||
// Always use 'failed' status with specific failure reason
|
||||
|
|
@ -1127,6 +1141,9 @@ impl OcrQueueService {
|
|||
failure_reason: &str,
|
||||
error_message: &str,
|
||||
retry_count: i32,
|
||||
ocr_text: Option<String>,
|
||||
ocr_confidence: Option<f32>,
|
||||
ocr_word_count: Option<i32>,
|
||||
) -> Result<()> {
|
||||
// Query document directly from database without user restrictions (OCR service context)
|
||||
let document_row = sqlx::query(
|
||||
|
|
@ -1166,9 +1183,9 @@ impl OcrQueueService {
|
|||
mime_type: Some(mime_type),
|
||||
content: None,
|
||||
tags: Vec::new(),
|
||||
ocr_text: None,
|
||||
ocr_confidence: None,
|
||||
ocr_word_count: None,
|
||||
ocr_text,
|
||||
ocr_confidence,
|
||||
ocr_word_count,
|
||||
ocr_processing_time_ms: None,
|
||||
failure_reason: failure_reason.to_string(),
|
||||
failure_stage: "ocr".to_string(),
|
||||
|
|
|
|||
|
|
@ -306,8 +306,8 @@ mod tests {
|
|||
processed_image_path: None,
|
||||
};
|
||||
|
||||
let is_valid = service.validate_ocr_quality(&result, &settings);
|
||||
assert!(is_valid);
|
||||
let result_validation = service.validate_ocr_quality(&result, &settings);
|
||||
assert!(result_validation.is_ok());
|
||||
}
|
||||
|
||||
#[cfg(feature = "ocr")]
|
||||
|
|
@ -329,8 +329,8 @@ mod tests {
|
|||
processed_image_path: None,
|
||||
};
|
||||
|
||||
let is_valid = service.validate_ocr_quality(&result, &settings);
|
||||
assert!(!is_valid);
|
||||
let result_validation = service.validate_ocr_quality(&result, &settings);
|
||||
assert!(result_validation.is_err());
|
||||
}
|
||||
|
||||
#[cfg(feature = "ocr")]
|
||||
|
|
@ -351,8 +351,8 @@ mod tests {
|
|||
processed_image_path: None,
|
||||
};
|
||||
|
||||
let is_valid = service.validate_ocr_quality(&result, &settings);
|
||||
assert!(!is_valid);
|
||||
let result_validation = service.validate_ocr_quality(&result, &settings);
|
||||
assert!(result_validation.is_err());
|
||||
}
|
||||
|
||||
#[cfg(feature = "ocr")]
|
||||
|
|
@ -373,8 +373,8 @@ mod tests {
|
|||
processed_image_path: None,
|
||||
};
|
||||
|
||||
let is_valid = service.validate_ocr_quality(&result, &settings);
|
||||
assert!(!is_valid);
|
||||
let result_validation = service.validate_ocr_quality(&result, &settings);
|
||||
assert!(result_validation.is_err());
|
||||
}
|
||||
|
||||
#[cfg(feature = "ocr")]
|
||||
|
|
@ -395,8 +395,8 @@ mod tests {
|
|||
processed_image_path: None,
|
||||
};
|
||||
|
||||
let is_valid = service.validate_ocr_quality(&result, &settings);
|
||||
assert!(is_valid);
|
||||
let result_validation = service.validate_ocr_quality(&result, &settings);
|
||||
assert!(result_validation.is_ok());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
|
|
|||
|
|
@ -243,12 +243,12 @@ mod pdf_word_count_integration_tests {
|
|||
match service.extract_text_from_pdf(&pdf_path, &settings).await {
|
||||
Ok(result) => {
|
||||
// Test quality validation
|
||||
let is_valid = service.validate_ocr_quality(&result, &settings);
|
||||
|
||||
let result_validation = service.validate_ocr_quality(&result, &settings);
|
||||
|
||||
if result.word_count > 0 {
|
||||
assert!(is_valid, "Good quality PDF should pass validation");
|
||||
assert!(result_validation.is_ok(), "Good quality PDF should pass validation");
|
||||
} else {
|
||||
assert!(!is_valid, "PDF with 0 words should fail validation");
|
||||
assert!(result_validation.is_err(), "PDF with 0 words should fail validation");
|
||||
}
|
||||
|
||||
// Verify OCR result structure
|
||||
|
|
|
|||
Loading…
Reference in New Issue