feat(ocr): strip null bytes from OCR-extracted text to prevent PostgreSQL errors

2025-11-02 11:55:27 -08:00 · 2025-11-02 11:55:27 -08:00 · f4fa33807c
parent 63aa7347a9
commit f4fa33807c
2 changed files with 25 additions and 12 deletions
--- a/src/db_guardrails.rs
+++ b/src/db_guardrails.rs
@ -559,22 +559,22 @@ mod tests {
    fn test_sanitize_text_for_db() {
        // Test removing null bytes
        let text_with_nulls = "Hello\0World\0!";
-        let sanitized = TransactionManager::sanitize_text_for_db(text_with_nulls);
+        let sanitized = DocumentTransactionManager::sanitize_text_for_db(text_with_nulls);
        assert_eq!(sanitized, "HelloWorld!");
-        
+
        // Test preserving normal text
        let normal_text = "This is a normal PDF text with special chars: €£¥";
-        let sanitized = TransactionManager::sanitize_text_for_db(normal_text);
+        let sanitized = DocumentTransactionManager::sanitize_text_for_db(normal_text);
        assert_eq!(sanitized, normal_text);
-        
+
        // Test handling empty string
        let empty = "";
-        let sanitized = TransactionManager::sanitize_text_for_db(empty);
+        let sanitized = DocumentTransactionManager::sanitize_text_for_db(empty);
        assert_eq!(sanitized, "");
-        
+
        // Test handling text with multiple null bytes
        let many_nulls = "\0\0Start\0Middle\0\0End\0\0";
-        let sanitized = TransactionManager::sanitize_text_for_db(many_nulls);
+        let sanitized = DocumentTransactionManager::sanitize_text_for_db(many_nulls);
        assert_eq!(sanitized, "StartMiddleEnd");
    }
 }
--- a/src/ocr/enhanced.rs
+++ b/src/ocr/enhanced.rs
@ -131,9 +131,12 @@ impl EnhancedOcrService {
            
            Ok((text, confidence))
        }).await??;
-        
+
        let (text, confidence) = ocr_result;
-        
+
+        // Sanitize null bytes to prevent PostgreSQL errors
+        let text = Self::remove_null_bytes(&text);
+
        let processing_time = start_time.elapsed().as_millis() as u64;
        let word_count = text.split_whitespace().count();
        
@ -917,6 +920,10 @@ impl EnhancedOcrService {
                // Check if quick extraction got good results
                if self.is_text_extraction_quality_sufficient(&text, word_count, file_size) {
                    info!("PDF text extraction successful for '{}' using quick method", file_path);
+
+                    // Sanitize null bytes to prevent PostgreSQL errors
+                    let text = Self::remove_null_bytes(&text);
+
                    return Ok(OcrResult {
                        text,
                        confidence: 95.0,
@ -946,7 +953,10 @@ impl EnhancedOcrService {
                    let processing_time = start_time.elapsed().as_millis() as u64;
                    let word_count = self.count_words_safely(&text);
                    info!("Direct text extraction succeeded as last resort for: {}", file_path);
-                    
+
+                    // Sanitize null bytes to prevent PostgreSQL errors
+                    let text = Self::remove_null_bytes(&text);
+
                    return Ok(OcrResult {
                        text,
                        confidence: 50.0, // Lower confidence for direct extraction
@ -1155,9 +1165,12 @@ impl EnhancedOcrService {
        let processing_time = start_time.elapsed().as_millis() as u64;
        let word_count = self.count_words_safely(&ocr_text_result);
        
-        info!("OCR extraction completed for '{}': {} words in {}ms", 
+        info!("OCR extraction completed for '{}': {} words in {}ms",
              file_path, word_count, processing_time);
-        
+
+        // Sanitize null bytes to prevent PostgreSQL errors
+        let ocr_text_result = Self::remove_null_bytes(&ocr_text_result);
+
        Ok(OcrResult {
            text: ocr_text_result,
            confidence: 85.0, // OCR is generally lower confidence than direct text extraction