feat(ocr): resolve issue with null bytes coming out of OCR documents

This commit is contained in:
perf3ct 2025-11-02 11:55:27 -08:00
parent 63aa7347a9
commit f4fa33807c
No known key found for this signature in database
GPG Key ID: 569C4EEC436F5232
2 changed files with 25 additions and 12 deletions

View File

@@ -559,22 +559,22 @@ mod tests {
fn test_sanitize_text_for_db() {
// Test removing null bytes
let text_with_nulls = "Hello\0World\0!";
let sanitized = TransactionManager::sanitize_text_for_db(text_with_nulls);
let sanitized = DocumentTransactionManager::sanitize_text_for_db(text_with_nulls);
assert_eq!(sanitized, "HelloWorld!");
// Test preserving normal text
let normal_text = "This is a normal PDF text with special chars: €£¥";
let sanitized = TransactionManager::sanitize_text_for_db(normal_text);
let sanitized = DocumentTransactionManager::sanitize_text_for_db(normal_text);
assert_eq!(sanitized, normal_text);
// Test handling empty string
let empty = "";
let sanitized = TransactionManager::sanitize_text_for_db(empty);
let sanitized = DocumentTransactionManager::sanitize_text_for_db(empty);
assert_eq!(sanitized, "");
// Test handling text with multiple null bytes
let many_nulls = "\0\0Start\0Middle\0\0End\0\0";
let sanitized = TransactionManager::sanitize_text_for_db(many_nulls);
let sanitized = DocumentTransactionManager::sanitize_text_for_db(many_nulls);
assert_eq!(sanitized, "StartMiddleEnd");
}
}

View File

@@ -131,9 +131,12 @@ impl EnhancedOcrService {
Ok((text, confidence))
}).await??;
let (text, confidence) = ocr_result;
// Sanitize null bytes to prevent PostgreSQL errors
let text = Self::remove_null_bytes(&text);
let processing_time = start_time.elapsed().as_millis() as u64;
let word_count = text.split_whitespace().count();
@@ -917,6 +920,10 @@ impl EnhancedOcrService {
// Check if quick extraction got good results
if self.is_text_extraction_quality_sufficient(&text, word_count, file_size) {
info!("PDF text extraction successful for '{}' using quick method", file_path);
// Sanitize null bytes to prevent PostgreSQL errors
let text = Self::remove_null_bytes(&text);
return Ok(OcrResult {
text,
confidence: 95.0,
@@ -946,7 +953,10 @@ impl EnhancedOcrService {
let processing_time = start_time.elapsed().as_millis() as u64;
let word_count = self.count_words_safely(&text);
info!("Direct text extraction succeeded as last resort for: {}", file_path);
// Sanitize null bytes to prevent PostgreSQL errors
let text = Self::remove_null_bytes(&text);
return Ok(OcrResult {
text,
confidence: 50.0, // Lower confidence for direct extraction
@@ -1155,9 +1165,12 @@ impl EnhancedOcrService {
let processing_time = start_time.elapsed().as_millis() as u64;
let word_count = self.count_words_safely(&ocr_text_result);
info!("OCR extraction completed for '{}': {} words in {}ms",
info!("OCR extraction completed for '{}': {} words in {}ms",
file_path, word_count, processing_time);
// Sanitize null bytes to prevent PostgreSQL errors
let ocr_text_result = Self::remove_null_bytes(&ocr_text_result);
Ok(OcrResult {
text: ocr_text_result,
confidence: 85.0, // OCR is generally lower confidence than direct text extraction