feat(ui): handle strange responses that the UI could receive
This commit is contained in:
parent
1e652ed62e
commit
e7574cb0da
|
|
@ -1249,7 +1249,7 @@ const DocumentManagementPage: React.FC = () => {
|
||||||
variant="outlined"
|
variant="outlined"
|
||||||
/>
|
/>
|
||||||
)}
|
)}
|
||||||
{document.ocr_word_count !== undefined && (
|
{document.ocr_word_count !== undefined && document.ocr_word_count !== null && (
|
||||||
<Chip
|
<Chip
|
||||||
size="small"
|
size="small"
|
||||||
icon={<FindInPageIcon />}
|
icon={<FindInPageIcon />}
|
||||||
|
|
@ -1689,8 +1689,8 @@ const DocumentManagementPage: React.FC = () => {
|
||||||
</Typography>
|
</Typography>
|
||||||
</TableCell>
|
</TableCell>
|
||||||
<TableCell>
|
<TableCell>
|
||||||
<Typography variant="body2" color={doc.ocr_confidence ? 'warning.main' : 'error.main'}>
|
<Typography variant="body2" color={doc.ocr_confidence !== undefined && doc.ocr_confidence !== null ? 'warning.main' : 'error.main'}>
|
||||||
{doc.ocr_confidence ? `${doc.ocr_confidence.toFixed(1)}%` : 'N/A'}
|
{doc.ocr_confidence !== undefined && doc.ocr_confidence !== null ? `${doc.ocr_confidence.toFixed(1)}%` : 'N/A'}
|
||||||
</Typography>
|
</Typography>
|
||||||
</TableCell>
|
</TableCell>
|
||||||
<TableCell>
|
<TableCell>
|
||||||
|
|
|
||||||
|
|
@ -1662,41 +1662,54 @@ impl EnhancedOcrService {
|
||||||
|
|
||||||
/// Validate OCR result quality
|
/// Validate OCR result quality
|
||||||
#[cfg(feature = "ocr")]
|
#[cfg(feature = "ocr")]
|
||||||
pub fn validate_ocr_quality(&self, result: &OcrResult, settings: &Settings) -> bool {
|
pub fn validate_ocr_quality(&self, result: &OcrResult, settings: &Settings) -> Result<(), String> {
|
||||||
// Check minimum confidence threshold
|
// Check minimum confidence threshold
|
||||||
if result.confidence < settings.ocr_min_confidence {
|
if result.confidence < settings.ocr_min_confidence {
|
||||||
warn!(
|
return Err(format!(
|
||||||
"OCR result below confidence threshold: {:.1}% < {:.1}%",
|
"OCR confidence below threshold: {:.1}% (minimum: {:.1}%)",
|
||||||
result.confidence, settings.ocr_min_confidence
|
result.confidence,
|
||||||
);
|
settings.ocr_min_confidence
|
||||||
return false;
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if text is reasonable (not just noise)
|
// Check if text is reasonable (not just noise)
|
||||||
if result.word_count == 0 {
|
if result.word_count == 0 {
|
||||||
warn!("OCR result contains no words");
|
return Err("No words detected in OCR output".to_string());
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check for reasonable character distribution
|
// Check for reasonable character distribution
|
||||||
let total_chars = result.text.len();
|
let total_chars = result.text.len();
|
||||||
if total_chars == 0 {
|
if total_chars == 0 {
|
||||||
return false;
|
return Err("OCR result contains no characters".to_string());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Count alphanumeric characters and digits separately
|
||||||
let alphanumeric_chars = result.text.chars().filter(|c| c.is_alphanumeric()).count();
|
let alphanumeric_chars = result.text.chars().filter(|c| c.is_alphanumeric()).count();
|
||||||
|
let digit_chars = result.text.chars().filter(|c| c.is_numeric()).count();
|
||||||
let alphanumeric_ratio = alphanumeric_chars as f32 / total_chars as f32;
|
let alphanumeric_ratio = alphanumeric_chars as f32 / total_chars as f32;
|
||||||
|
let digit_ratio = digit_chars as f32 / total_chars as f32;
|
||||||
// Expect at least 30% alphanumeric characters for valid text
|
|
||||||
if alphanumeric_ratio < 0.3 {
|
// Special handling for numeric-heavy documents (bills, transaction lists, etc.)
|
||||||
warn!(
|
// If document has >40% digits, it's likely a valid numeric document
|
||||||
"OCR result has low alphanumeric ratio: {:.1}%",
|
if digit_ratio > 0.4 {
|
||||||
alphanumeric_ratio * 100.0
|
debug!(
|
||||||
|
"Document has high numeric content: {:.1}% digits - accepting as valid numeric document",
|
||||||
|
digit_ratio * 100.0
|
||||||
);
|
);
|
||||||
return false;
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
true
|
// Expect at least 20% alphanumeric characters for valid text (relaxed from 30%)
|
||||||
|
const MIN_ALPHANUMERIC_RATIO: f32 = 0.20;
|
||||||
|
if alphanumeric_ratio < MIN_ALPHANUMERIC_RATIO {
|
||||||
|
return Err(format!(
|
||||||
|
"OCR result has low alphanumeric content: {:.1}% (minimum: {:.1}%)",
|
||||||
|
alphanumeric_ratio * 100.0,
|
||||||
|
MIN_ALPHANUMERIC_RATIO * 100.0
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1711,8 +1724,8 @@ impl EnhancedOcrService {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
pub fn validate_ocr_quality(&self, _result: &OcrResult, _settings: &Settings) -> bool {
|
pub fn validate_ocr_quality(&self, _result: &OcrResult, _settings: &Settings) -> Result<(), String> {
|
||||||
false
|
Err("OCR feature not enabled".to_string())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn count_words_safely(&self, text: &str) -> usize {
|
pub fn count_words_safely(&self, text: &str) -> usize {
|
||||||
|
|
|
||||||
|
|
@ -378,9 +378,8 @@ impl OcrQueueService {
|
||||||
match ocr_service.extract_text_with_context(&file_path, &mime_type, &filename, file_size, &settings).await {
|
match ocr_service.extract_text_with_context(&file_path, &mime_type, &filename, file_size, &settings).await {
|
||||||
Ok(ocr_result) => {
|
Ok(ocr_result) => {
|
||||||
// Validate OCR quality
|
// Validate OCR quality
|
||||||
if !ocr_service.validate_ocr_quality(&ocr_result, &settings) {
|
if let Err(validation_error) = ocr_service.validate_ocr_quality(&ocr_result, &settings) {
|
||||||
let error_msg = format!("OCR quality below threshold: {:.1}% confidence, {} words",
|
let error_msg = format!("OCR quality validation failed: {}", validation_error);
|
||||||
ocr_result.confidence, ocr_result.word_count);
|
|
||||||
warn!("⚠️ OCR quality issues for '{}' | Job: {} | Document: {} | {:.1}% confidence | {} words",
|
warn!("⚠️ OCR quality issues for '{}' | Job: {} | Document: {} | {:.1}% confidence | {} words",
|
||||||
filename, item.id, item.document_id, ocr_result.confidence, ocr_result.word_count);
|
filename, item.id, item.document_id, ocr_result.confidence, ocr_result.word_count);
|
||||||
|
|
||||||
|
|
@ -390,6 +389,9 @@ impl OcrQueueService {
|
||||||
"low_ocr_confidence",
|
"low_ocr_confidence",
|
||||||
&error_msg,
|
&error_msg,
|
||||||
item.attempts,
|
item.attempts,
|
||||||
|
Some(ocr_result.text.clone()),
|
||||||
|
Some(ocr_result.confidence),
|
||||||
|
Some(ocr_result.word_count as i32),
|
||||||
).await;
|
).await;
|
||||||
|
|
||||||
// Mark as failed for quality issues with proper failure reason
|
// Mark as failed for quality issues with proper failure reason
|
||||||
|
|
@ -433,13 +435,16 @@ impl OcrQueueService {
|
||||||
|
|
||||||
// Use classification function to determine proper failure reason
|
// Use classification function to determine proper failure reason
|
||||||
let (failure_reason, _should_suppress) = Self::classify_ocr_error(error_msg);
|
let (failure_reason, _should_suppress) = Self::classify_ocr_error(error_msg);
|
||||||
|
|
||||||
// Create failed document record using helper function
|
// Create failed document record using helper function
|
||||||
let _ = self.create_failed_document_from_ocr_error(
|
let _ = self.create_failed_document_from_ocr_error(
|
||||||
item.document_id,
|
item.document_id,
|
||||||
failure_reason,
|
failure_reason,
|
||||||
error_msg,
|
error_msg,
|
||||||
item.attempts,
|
item.attempts,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
).await;
|
).await;
|
||||||
|
|
||||||
self.mark_failed(item.id, error_msg).await?;
|
self.mark_failed(item.id, error_msg).await?;
|
||||||
|
|
@ -451,13 +456,16 @@ impl OcrQueueService {
|
||||||
|
|
||||||
// Use classification function to determine proper failure reason
|
// Use classification function to determine proper failure reason
|
||||||
let (failure_reason, _should_suppress) = Self::classify_ocr_error(&error_msg);
|
let (failure_reason, _should_suppress) = Self::classify_ocr_error(&error_msg);
|
||||||
|
|
||||||
// Create failed document record using helper function
|
// Create failed document record using helper function
|
||||||
let _ = self.create_failed_document_from_ocr_error(
|
let _ = self.create_failed_document_from_ocr_error(
|
||||||
item.document_id,
|
item.document_id,
|
||||||
failure_reason,
|
failure_reason,
|
||||||
&error_msg,
|
&error_msg,
|
||||||
item.attempts,
|
item.attempts,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
).await;
|
).await;
|
||||||
|
|
||||||
self.mark_failed(item.id, &error_msg).await?;
|
self.mark_failed(item.id, &error_msg).await?;
|
||||||
|
|
@ -472,13 +480,16 @@ impl OcrQueueService {
|
||||||
|
|
||||||
// Use classification function to determine proper failure reason
|
// Use classification function to determine proper failure reason
|
||||||
let (failure_reason, _should_suppress) = Self::classify_ocr_error(&error_msg);
|
let (failure_reason, _should_suppress) = Self::classify_ocr_error(&error_msg);
|
||||||
|
|
||||||
// Create failed document record using helper function
|
// Create failed document record using helper function
|
||||||
let _ = self.create_failed_document_from_ocr_error(
|
let _ = self.create_failed_document_from_ocr_error(
|
||||||
item.document_id,
|
item.document_id,
|
||||||
failure_reason,
|
failure_reason,
|
||||||
&error_msg,
|
&error_msg,
|
||||||
item.attempts,
|
item.attempts,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
).await;
|
).await;
|
||||||
|
|
||||||
// Mark document as failed for no extractable text
|
// Mark document as failed for no extractable text
|
||||||
|
|
@ -560,6 +571,9 @@ impl OcrQueueService {
|
||||||
failure_reason,
|
failure_reason,
|
||||||
&error_msg,
|
&error_msg,
|
||||||
item.attempts,
|
item.attempts,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
).await;
|
).await;
|
||||||
|
|
||||||
// Always use 'failed' status with specific failure reason
|
// Always use 'failed' status with specific failure reason
|
||||||
|
|
@ -1127,6 +1141,9 @@ impl OcrQueueService {
|
||||||
failure_reason: &str,
|
failure_reason: &str,
|
||||||
error_message: &str,
|
error_message: &str,
|
||||||
retry_count: i32,
|
retry_count: i32,
|
||||||
|
ocr_text: Option<String>,
|
||||||
|
ocr_confidence: Option<f32>,
|
||||||
|
ocr_word_count: Option<i32>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
// Query document directly from database without user restrictions (OCR service context)
|
// Query document directly from database without user restrictions (OCR service context)
|
||||||
let document_row = sqlx::query(
|
let document_row = sqlx::query(
|
||||||
|
|
@ -1166,9 +1183,9 @@ impl OcrQueueService {
|
||||||
mime_type: Some(mime_type),
|
mime_type: Some(mime_type),
|
||||||
content: None,
|
content: None,
|
||||||
tags: Vec::new(),
|
tags: Vec::new(),
|
||||||
ocr_text: None,
|
ocr_text,
|
||||||
ocr_confidence: None,
|
ocr_confidence,
|
||||||
ocr_word_count: None,
|
ocr_word_count,
|
||||||
ocr_processing_time_ms: None,
|
ocr_processing_time_ms: None,
|
||||||
failure_reason: failure_reason.to_string(),
|
failure_reason: failure_reason.to_string(),
|
||||||
failure_stage: "ocr".to_string(),
|
failure_stage: "ocr".to_string(),
|
||||||
|
|
|
||||||
|
|
@ -306,8 +306,8 @@ mod tests {
|
||||||
processed_image_path: None,
|
processed_image_path: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let is_valid = service.validate_ocr_quality(&result, &settings);
|
let result_validation = service.validate_ocr_quality(&result, &settings);
|
||||||
assert!(is_valid);
|
assert!(result_validation.is_ok());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(feature = "ocr")]
|
#[cfg(feature = "ocr")]
|
||||||
|
|
@ -329,8 +329,8 @@ mod tests {
|
||||||
processed_image_path: None,
|
processed_image_path: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let is_valid = service.validate_ocr_quality(&result, &settings);
|
let result_validation = service.validate_ocr_quality(&result, &settings);
|
||||||
assert!(!is_valid);
|
assert!(result_validation.is_err());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(feature = "ocr")]
|
#[cfg(feature = "ocr")]
|
||||||
|
|
@ -351,8 +351,8 @@ mod tests {
|
||||||
processed_image_path: None,
|
processed_image_path: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let is_valid = service.validate_ocr_quality(&result, &settings);
|
let result_validation = service.validate_ocr_quality(&result, &settings);
|
||||||
assert!(!is_valid);
|
assert!(result_validation.is_err());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(feature = "ocr")]
|
#[cfg(feature = "ocr")]
|
||||||
|
|
@ -373,8 +373,8 @@ mod tests {
|
||||||
processed_image_path: None,
|
processed_image_path: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let is_valid = service.validate_ocr_quality(&result, &settings);
|
let result_validation = service.validate_ocr_quality(&result, &settings);
|
||||||
assert!(!is_valid);
|
assert!(result_validation.is_err());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(feature = "ocr")]
|
#[cfg(feature = "ocr")]
|
||||||
|
|
@ -395,8 +395,8 @@ mod tests {
|
||||||
processed_image_path: None,
|
processed_image_path: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let is_valid = service.validate_ocr_quality(&result, &settings);
|
let result_validation = service.validate_ocr_quality(&result, &settings);
|
||||||
assert!(is_valid);
|
assert!(result_validation.is_ok());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
|
|
|
||||||
|
|
@ -243,12 +243,12 @@ mod pdf_word_count_integration_tests {
|
||||||
match service.extract_text_from_pdf(&pdf_path, &settings).await {
|
match service.extract_text_from_pdf(&pdf_path, &settings).await {
|
||||||
Ok(result) => {
|
Ok(result) => {
|
||||||
// Test quality validation
|
// Test quality validation
|
||||||
let is_valid = service.validate_ocr_quality(&result, &settings);
|
let result_validation = service.validate_ocr_quality(&result, &settings);
|
||||||
|
|
||||||
if result.word_count > 0 {
|
if result.word_count > 0 {
|
||||||
assert!(is_valid, "Good quality PDF should pass validation");
|
assert!(result_validation.is_ok(), "Good quality PDF should pass validation");
|
||||||
} else {
|
} else {
|
||||||
assert!(!is_valid, "PDF with 0 words should fail validation");
|
assert!(result_validation.is_err(), "PDF with 0 words should fail validation");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Verify OCR result structure
|
// Verify OCR result structure
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue