feat(ocr): resolve issue with null bytes coming out of OCR documents
This commit is contained in:
parent
63aa7347a9
commit
f4fa33807c
|
|
@ -559,22 +559,22 @@ mod tests {
|
|||
fn test_sanitize_text_for_db() {
|
||||
// Test removing null bytes
|
||||
let text_with_nulls = "Hello\0World\0!";
|
||||
let sanitized = TransactionManager::sanitize_text_for_db(text_with_nulls);
|
||||
let sanitized = DocumentTransactionManager::sanitize_text_for_db(text_with_nulls);
|
||||
assert_eq!(sanitized, "HelloWorld!");
|
||||
|
||||
|
||||
// Test preserving normal text
|
||||
let normal_text = "This is a normal PDF text with special chars: €£¥";
|
||||
let sanitized = TransactionManager::sanitize_text_for_db(normal_text);
|
||||
let sanitized = DocumentTransactionManager::sanitize_text_for_db(normal_text);
|
||||
assert_eq!(sanitized, normal_text);
|
||||
|
||||
|
||||
// Test handling empty string
|
||||
let empty = "";
|
||||
let sanitized = TransactionManager::sanitize_text_for_db(empty);
|
||||
let sanitized = DocumentTransactionManager::sanitize_text_for_db(empty);
|
||||
assert_eq!(sanitized, "");
|
||||
|
||||
|
||||
// Test handling text with multiple null bytes
|
||||
let many_nulls = "\0\0Start\0Middle\0\0End\0\0";
|
||||
let sanitized = TransactionManager::sanitize_text_for_db(many_nulls);
|
||||
let sanitized = DocumentTransactionManager::sanitize_text_for_db(many_nulls);
|
||||
assert_eq!(sanitized, "StartMiddleEnd");
|
||||
}
|
||||
}
|
||||
|
|
@ -131,9 +131,12 @@ impl EnhancedOcrService {
|
|||
|
||||
Ok((text, confidence))
|
||||
}).await??;
|
||||
|
||||
|
||||
let (text, confidence) = ocr_result;
|
||||
|
||||
|
||||
// Strip null bytes from the OCR text to prevent PostgreSQL errors
|
||||
let text = Self::remove_null_bytes(&text);
|
||||
|
||||
let processing_time = start_time.elapsed().as_millis() as u64;
|
||||
let word_count = text.split_whitespace().count();
|
||||
|
||||
|
|
@ -917,6 +920,10 @@ impl EnhancedOcrService {
|
|||
// Check if quick extraction got good results
|
||||
if self.is_text_extraction_quality_sufficient(&text, word_count, file_size) {
|
||||
info!("PDF text extraction successful for '{}' using quick method", file_path);
|
||||
|
||||
// Strip null bytes from the extracted text to prevent PostgreSQL errors
|
||||
let text = Self::remove_null_bytes(&text);
|
||||
|
||||
return Ok(OcrResult {
|
||||
text,
|
||||
confidence: 95.0,
|
||||
|
|
@ -946,7 +953,10 @@ impl EnhancedOcrService {
|
|||
let processing_time = start_time.elapsed().as_millis() as u64;
|
||||
let word_count = self.count_words_safely(&text);
|
||||
info!("Direct text extraction succeeded as last resort for: {}", file_path);
|
||||
|
||||
|
||||
// Strip null bytes from the extracted text to prevent PostgreSQL errors
|
||||
let text = Self::remove_null_bytes(&text);
|
||||
|
||||
return Ok(OcrResult {
|
||||
text,
|
||||
confidence: 50.0, // Lower confidence for direct extraction
|
||||
|
|
@ -1155,9 +1165,12 @@ impl EnhancedOcrService {
|
|||
let processing_time = start_time.elapsed().as_millis() as u64;
|
||||
let word_count = self.count_words_safely(&ocr_text_result);
|
||||
|
||||
info!("OCR extraction completed for '{}': {} words in {}ms",
|
||||
info!("OCR extraction completed for '{}': {} words in {}ms",
|
||||
file_path, word_count, processing_time);
|
||||
|
||||
|
||||
// Strip null bytes from the OCR text to prevent PostgreSQL errors
|
||||
let ocr_text_result = Self::remove_null_bytes(&ocr_text_result);
|
||||
|
||||
Ok(OcrResult {
|
||||
text: ocr_text_result,
|
||||
confidence: 85.0, // OCR is generally lower confidence than direct text extraction
|
||||
|
|
|
|||
Loading…
Reference in New Issue