diff --git a/src/db_guardrails.rs b/src/db_guardrails.rs index 2961353..ea270cb 100644 --- a/src/db_guardrails.rs +++ b/src/db_guardrails.rs @@ -559,22 +559,22 @@ mod tests { fn test_sanitize_text_for_db() { // Test removing null bytes let text_with_nulls = "Hello\0World\0!"; - let sanitized = TransactionManager::sanitize_text_for_db(text_with_nulls); + let sanitized = DocumentTransactionManager::sanitize_text_for_db(text_with_nulls); assert_eq!(sanitized, "HelloWorld!"); - + // Test preserving normal text let normal_text = "This is a normal PDF text with special chars: €£¥"; - let sanitized = TransactionManager::sanitize_text_for_db(normal_text); + let sanitized = DocumentTransactionManager::sanitize_text_for_db(normal_text); assert_eq!(sanitized, normal_text); - + // Test handling empty string let empty = ""; - let sanitized = TransactionManager::sanitize_text_for_db(empty); + let sanitized = DocumentTransactionManager::sanitize_text_for_db(empty); assert_eq!(sanitized, ""); - + // Test handling text with multiple null bytes let many_nulls = "\0\0Start\0Middle\0\0End\0\0"; - let sanitized = TransactionManager::sanitize_text_for_db(many_nulls); + let sanitized = DocumentTransactionManager::sanitize_text_for_db(many_nulls); assert_eq!(sanitized, "StartMiddleEnd"); } } \ No newline at end of file diff --git a/src/ocr/enhanced.rs b/src/ocr/enhanced.rs index 547791d..5a9dea6 100644 --- a/src/ocr/enhanced.rs +++ b/src/ocr/enhanced.rs @@ -131,9 +131,12 @@ impl EnhancedOcrService { Ok((text, confidence)) }).await??; - + let (text, confidence) = ocr_result; - + + // Sanitize null bytes to prevent PostgreSQL errors + let text = Self::remove_null_bytes(&text); + let processing_time = start_time.elapsed().as_millis() as u64; let word_count = text.split_whitespace().count(); @@ -917,6 +920,10 @@ impl EnhancedOcrService { // Check if quick extraction got good results if self.is_text_extraction_quality_sufficient(&text, word_count, file_size) { info!("PDF text extraction successful for '{}' using quick method", file_path); + + // Sanitize null bytes to prevent PostgreSQL errors + let text = Self::remove_null_bytes(&text); + return Ok(OcrResult { text, confidence: 95.0, @@ -946,7 +953,10 @@ impl EnhancedOcrService { let processing_time = start_time.elapsed().as_millis() as u64; let word_count = self.count_words_safely(&text); info!("Direct text extraction succeeded as last resort for: {}", file_path); - + + // Sanitize null bytes to prevent PostgreSQL errors + let text = Self::remove_null_bytes(&text); + return Ok(OcrResult { text, confidence: 50.0, // Lower confidence for direct extraction @@ -1155,9 +1165,12 @@ impl EnhancedOcrService { let processing_time = start_time.elapsed().as_millis() as u64; let word_count = self.count_words_safely(&ocr_text_result); - info!("OCR extraction completed for '{}': {} words in {}ms", + info!("OCR extraction completed for '{}': {} words in {}ms", file_path, word_count, processing_time); - + + // Sanitize null bytes to prevent PostgreSQL errors + let ocr_text_result = Self::remove_null_bytes(&ocr_text_result); + Ok(OcrResult { text: ocr_text_result, confidence: 85.0, // OCR is generally lower confidence than direct text extraction