feat(ui): handle strange responses that the UI could receive

This commit is contained in:
perf3ct 2025-10-05 13:45:10 -07:00
parent 1e652ed62e
commit e7574cb0da
No known key found for this signature in database
GPG Key ID: 569C4EEC436F5232
5 changed files with 79 additions and 49 deletions

View File

@ -1249,7 +1249,7 @@ const DocumentManagementPage: React.FC = () => {
variant="outlined" variant="outlined"
/> />
)} )}
{document.ocr_word_count !== undefined && ( {document.ocr_word_count !== undefined && document.ocr_word_count !== null && (
<Chip <Chip
size="small" size="small"
icon={<FindInPageIcon />} icon={<FindInPageIcon />}
@ -1689,8 +1689,8 @@ const DocumentManagementPage: React.FC = () => {
</Typography> </Typography>
</TableCell> </TableCell>
<TableCell> <TableCell>
<Typography variant="body2" color={doc.ocr_confidence ? 'warning.main' : 'error.main'}> <Typography variant="body2" color={doc.ocr_confidence !== undefined && doc.ocr_confidence !== null ? 'warning.main' : 'error.main'}>
{doc.ocr_confidence ? `${doc.ocr_confidence.toFixed(1)}%` : 'N/A'} {doc.ocr_confidence !== undefined && doc.ocr_confidence !== null ? `${doc.ocr_confidence.toFixed(1)}%` : 'N/A'}
</Typography> </Typography>
</TableCell> </TableCell>
<TableCell> <TableCell>

View File

@ -1662,41 +1662,54 @@ impl EnhancedOcrService {
/// Validate OCR result quality /// Validate OCR result quality
#[cfg(feature = "ocr")] #[cfg(feature = "ocr")]
pub fn validate_ocr_quality(&self, result: &OcrResult, settings: &Settings) -> bool { pub fn validate_ocr_quality(&self, result: &OcrResult, settings: &Settings) -> Result<(), String> {
// Check minimum confidence threshold // Check minimum confidence threshold
if result.confidence < settings.ocr_min_confidence { if result.confidence < settings.ocr_min_confidence {
warn!( return Err(format!(
"OCR result below confidence threshold: {:.1}% < {:.1}%", "OCR confidence below threshold: {:.1}% (minimum: {:.1}%)",
result.confidence, settings.ocr_min_confidence result.confidence,
); settings.ocr_min_confidence
return false; ));
} }
// Check if text is reasonable (not just noise) // Check if text is reasonable (not just noise)
if result.word_count == 0 { if result.word_count == 0 {
warn!("OCR result contains no words"); return Err("No words detected in OCR output".to_string());
return false;
} }
// Check for reasonable character distribution // Check for reasonable character distribution
let total_chars = result.text.len(); let total_chars = result.text.len();
if total_chars == 0 { if total_chars == 0 {
return false; return Err("OCR result contains no characters".to_string());
} }
// Count alphanumeric characters and digits separately
let alphanumeric_chars = result.text.chars().filter(|c| c.is_alphanumeric()).count(); let alphanumeric_chars = result.text.chars().filter(|c| c.is_alphanumeric()).count();
let digit_chars = result.text.chars().filter(|c| c.is_numeric()).count();
let alphanumeric_ratio = alphanumeric_chars as f32 / total_chars as f32; let alphanumeric_ratio = alphanumeric_chars as f32 / total_chars as f32;
let digit_ratio = digit_chars as f32 / total_chars as f32;
// Expect at least 30% alphanumeric characters for valid text
if alphanumeric_ratio < 0.3 { // Special handling for numeric-heavy documents (bills, transaction lists, etc.)
warn!( // If document has >40% digits, it's likely a valid numeric document
"OCR result has low alphanumeric ratio: {:.1}%", if digit_ratio > 0.4 {
alphanumeric_ratio * 100.0 debug!(
"Document has high numeric content: {:.1}% digits - accepting as valid numeric document",
digit_ratio * 100.0
); );
return false; return Ok(());
} }
true // Expect at least 20% alphanumeric characters for valid text (relaxed from 30%)
const MIN_ALPHANUMERIC_RATIO: f32 = 0.20;
if alphanumeric_ratio < MIN_ALPHANUMERIC_RATIO {
return Err(format!(
"OCR result has low alphanumeric content: {:.1}% (minimum: {:.1}%)",
alphanumeric_ratio * 100.0,
MIN_ALPHANUMERIC_RATIO * 100.0
));
}
Ok(())
} }
} }
@ -1711,8 +1724,8 @@ impl EnhancedOcrService {
} }
pub fn validate_ocr_quality(&self, _result: &OcrResult, _settings: &Settings) -> bool { pub fn validate_ocr_quality(&self, _result: &OcrResult, _settings: &Settings) -> Result<(), String> {
false Err("OCR feature not enabled".to_string())
} }
pub fn count_words_safely(&self, text: &str) -> usize { pub fn count_words_safely(&self, text: &str) -> usize {

View File

@ -378,9 +378,8 @@ impl OcrQueueService {
match ocr_service.extract_text_with_context(&file_path, &mime_type, &filename, file_size, &settings).await { match ocr_service.extract_text_with_context(&file_path, &mime_type, &filename, file_size, &settings).await {
Ok(ocr_result) => { Ok(ocr_result) => {
// Validate OCR quality // Validate OCR quality
if !ocr_service.validate_ocr_quality(&ocr_result, &settings) { if let Err(validation_error) = ocr_service.validate_ocr_quality(&ocr_result, &settings) {
let error_msg = format!("OCR quality below threshold: {:.1}% confidence, {} words", let error_msg = format!("OCR quality validation failed: {}", validation_error);
ocr_result.confidence, ocr_result.word_count);
warn!("⚠️ OCR quality issues for '{}' | Job: {} | Document: {} | {:.1}% confidence | {} words", warn!("⚠️ OCR quality issues for '{}' | Job: {} | Document: {} | {:.1}% confidence | {} words",
filename, item.id, item.document_id, ocr_result.confidence, ocr_result.word_count); filename, item.id, item.document_id, ocr_result.confidence, ocr_result.word_count);
@ -390,6 +389,9 @@ impl OcrQueueService {
"low_ocr_confidence", "low_ocr_confidence",
&error_msg, &error_msg,
item.attempts, item.attempts,
Some(ocr_result.text.clone()),
Some(ocr_result.confidence),
Some(ocr_result.word_count as i32),
).await; ).await;
// Mark as failed for quality issues with proper failure reason // Mark as failed for quality issues with proper failure reason
@ -433,13 +435,16 @@ impl OcrQueueService {
// Use classification function to determine proper failure reason // Use classification function to determine proper failure reason
let (failure_reason, _should_suppress) = Self::classify_ocr_error(error_msg); let (failure_reason, _should_suppress) = Self::classify_ocr_error(error_msg);
// Create failed document record using helper function // Create failed document record using helper function
let _ = self.create_failed_document_from_ocr_error( let _ = self.create_failed_document_from_ocr_error(
item.document_id, item.document_id,
failure_reason, failure_reason,
error_msg, error_msg,
item.attempts, item.attempts,
None,
None,
None,
).await; ).await;
self.mark_failed(item.id, error_msg).await?; self.mark_failed(item.id, error_msg).await?;
@ -451,13 +456,16 @@ impl OcrQueueService {
// Use classification function to determine proper failure reason // Use classification function to determine proper failure reason
let (failure_reason, _should_suppress) = Self::classify_ocr_error(&error_msg); let (failure_reason, _should_suppress) = Self::classify_ocr_error(&error_msg);
// Create failed document record using helper function // Create failed document record using helper function
let _ = self.create_failed_document_from_ocr_error( let _ = self.create_failed_document_from_ocr_error(
item.document_id, item.document_id,
failure_reason, failure_reason,
&error_msg, &error_msg,
item.attempts, item.attempts,
None,
None,
None,
).await; ).await;
self.mark_failed(item.id, &error_msg).await?; self.mark_failed(item.id, &error_msg).await?;
@ -472,13 +480,16 @@ impl OcrQueueService {
// Use classification function to determine proper failure reason // Use classification function to determine proper failure reason
let (failure_reason, _should_suppress) = Self::classify_ocr_error(&error_msg); let (failure_reason, _should_suppress) = Self::classify_ocr_error(&error_msg);
// Create failed document record using helper function // Create failed document record using helper function
let _ = self.create_failed_document_from_ocr_error( let _ = self.create_failed_document_from_ocr_error(
item.document_id, item.document_id,
failure_reason, failure_reason,
&error_msg, &error_msg,
item.attempts, item.attempts,
None,
None,
None,
).await; ).await;
// Mark document as failed for no extractable text // Mark document as failed for no extractable text
@ -560,6 +571,9 @@ impl OcrQueueService {
failure_reason, failure_reason,
&error_msg, &error_msg,
item.attempts, item.attempts,
None,
None,
None,
).await; ).await;
// Always use 'failed' status with specific failure reason // Always use 'failed' status with specific failure reason
@ -1127,6 +1141,9 @@ impl OcrQueueService {
failure_reason: &str, failure_reason: &str,
error_message: &str, error_message: &str,
retry_count: i32, retry_count: i32,
ocr_text: Option<String>,
ocr_confidence: Option<f32>,
ocr_word_count: Option<i32>,
) -> Result<()> { ) -> Result<()> {
// Query document directly from database without user restrictions (OCR service context) // Query document directly from database without user restrictions (OCR service context)
let document_row = sqlx::query( let document_row = sqlx::query(
@ -1166,9 +1183,9 @@ impl OcrQueueService {
mime_type: Some(mime_type), mime_type: Some(mime_type),
content: None, content: None,
tags: Vec::new(), tags: Vec::new(),
ocr_text: None, ocr_text,
ocr_confidence: None, ocr_confidence,
ocr_word_count: None, ocr_word_count,
ocr_processing_time_ms: None, ocr_processing_time_ms: None,
failure_reason: failure_reason.to_string(), failure_reason: failure_reason.to_string(),
failure_stage: "ocr".to_string(), failure_stage: "ocr".to_string(),

View File

@ -306,8 +306,8 @@ mod tests {
processed_image_path: None, processed_image_path: None,
}; };
let is_valid = service.validate_ocr_quality(&result, &settings); let result_validation = service.validate_ocr_quality(&result, &settings);
assert!(is_valid); assert!(result_validation.is_ok());
} }
#[cfg(feature = "ocr")] #[cfg(feature = "ocr")]
@ -329,8 +329,8 @@ mod tests {
processed_image_path: None, processed_image_path: None,
}; };
let is_valid = service.validate_ocr_quality(&result, &settings); let result_validation = service.validate_ocr_quality(&result, &settings);
assert!(!is_valid); assert!(result_validation.is_err());
} }
#[cfg(feature = "ocr")] #[cfg(feature = "ocr")]
@ -351,8 +351,8 @@ mod tests {
processed_image_path: None, processed_image_path: None,
}; };
let is_valid = service.validate_ocr_quality(&result, &settings); let result_validation = service.validate_ocr_quality(&result, &settings);
assert!(!is_valid); assert!(result_validation.is_err());
} }
#[cfg(feature = "ocr")] #[cfg(feature = "ocr")]
@ -373,8 +373,8 @@ mod tests {
processed_image_path: None, processed_image_path: None,
}; };
let is_valid = service.validate_ocr_quality(&result, &settings); let result_validation = service.validate_ocr_quality(&result, &settings);
assert!(!is_valid); assert!(result_validation.is_err());
} }
#[cfg(feature = "ocr")] #[cfg(feature = "ocr")]
@ -395,8 +395,8 @@ mod tests {
processed_image_path: None, processed_image_path: None,
}; };
let is_valid = service.validate_ocr_quality(&result, &settings); let result_validation = service.validate_ocr_quality(&result, &settings);
assert!(is_valid); assert!(result_validation.is_ok());
} }
#[tokio::test] #[tokio::test]

View File

@ -243,12 +243,12 @@ mod pdf_word_count_integration_tests {
match service.extract_text_from_pdf(&pdf_path, &settings).await { match service.extract_text_from_pdf(&pdf_path, &settings).await {
Ok(result) => { Ok(result) => {
// Test quality validation // Test quality validation
let is_valid = service.validate_ocr_quality(&result, &settings); let result_validation = service.validate_ocr_quality(&result, &settings);
if result.word_count > 0 { if result.word_count > 0 {
assert!(is_valid, "Good quality PDF should pass validation"); assert!(result_validation.is_ok(), "Good quality PDF should pass validation");
} else { } else {
assert!(!is_valid, "PDF with 0 words should fail validation"); assert!(result_validation.is_err(), "PDF with 0 words should fail validation");
} }
// Verify OCR result structure // Verify OCR result structure