feat(ocr): resolve issue with null bytes coming out of OCR documents
This commit is contained in:
parent
63aa7347a9
commit
f4fa33807c
|
|
@ -559,22 +559,22 @@ mod tests {
|
||||||
fn test_sanitize_text_for_db() {
|
fn test_sanitize_text_for_db() {
|
||||||
// Test removing null bytes
|
// Test removing null bytes
|
||||||
let text_with_nulls = "Hello\0World\0!";
|
let text_with_nulls = "Hello\0World\0!";
|
||||||
let sanitized = TransactionManager::sanitize_text_for_db(text_with_nulls);
|
let sanitized = DocumentTransactionManager::sanitize_text_for_db(text_with_nulls);
|
||||||
assert_eq!(sanitized, "HelloWorld!");
|
assert_eq!(sanitized, "HelloWorld!");
|
||||||
|
|
||||||
// Test preserving normal text
|
// Test preserving normal text
|
||||||
let normal_text = "This is a normal PDF text with special chars: €£¥";
|
let normal_text = "This is a normal PDF text with special chars: €£¥";
|
||||||
let sanitized = TransactionManager::sanitize_text_for_db(normal_text);
|
let sanitized = DocumentTransactionManager::sanitize_text_for_db(normal_text);
|
||||||
assert_eq!(sanitized, normal_text);
|
assert_eq!(sanitized, normal_text);
|
||||||
|
|
||||||
// Test handling empty string
|
// Test handling empty string
|
||||||
let empty = "";
|
let empty = "";
|
||||||
let sanitized = TransactionManager::sanitize_text_for_db(empty);
|
let sanitized = DocumentTransactionManager::sanitize_text_for_db(empty);
|
||||||
assert_eq!(sanitized, "");
|
assert_eq!(sanitized, "");
|
||||||
|
|
||||||
// Test handling text with multiple null bytes
|
// Test handling text with multiple null bytes
|
||||||
let many_nulls = "\0\0Start\0Middle\0\0End\0\0";
|
let many_nulls = "\0\0Start\0Middle\0\0End\0\0";
|
||||||
let sanitized = TransactionManager::sanitize_text_for_db(many_nulls);
|
let sanitized = DocumentTransactionManager::sanitize_text_for_db(many_nulls);
|
||||||
assert_eq!(sanitized, "StartMiddleEnd");
|
assert_eq!(sanitized, "StartMiddleEnd");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -134,6 +134,9 @@ impl EnhancedOcrService {
|
||||||
|
|
||||||
let (text, confidence) = ocr_result;
|
let (text, confidence) = ocr_result;
|
||||||
|
|
||||||
|
// Sanitize null bytes to prevent PostgreSQL errors
|
||||||
|
let text = Self::remove_null_bytes(&text);
|
||||||
|
|
||||||
let processing_time = start_time.elapsed().as_millis() as u64;
|
let processing_time = start_time.elapsed().as_millis() as u64;
|
||||||
let word_count = text.split_whitespace().count();
|
let word_count = text.split_whitespace().count();
|
||||||
|
|
||||||
|
|
@ -917,6 +920,10 @@ impl EnhancedOcrService {
|
||||||
// Check if quick extraction got good results
|
// Check if quick extraction got good results
|
||||||
if self.is_text_extraction_quality_sufficient(&text, word_count, file_size) {
|
if self.is_text_extraction_quality_sufficient(&text, word_count, file_size) {
|
||||||
info!("PDF text extraction successful for '{}' using quick method", file_path);
|
info!("PDF text extraction successful for '{}' using quick method", file_path);
|
||||||
|
|
||||||
|
// Sanitize null bytes to prevent PostgreSQL errors
|
||||||
|
let text = Self::remove_null_bytes(&text);
|
||||||
|
|
||||||
return Ok(OcrResult {
|
return Ok(OcrResult {
|
||||||
text,
|
text,
|
||||||
confidence: 95.0,
|
confidence: 95.0,
|
||||||
|
|
@ -947,6 +954,9 @@ impl EnhancedOcrService {
|
||||||
let word_count = self.count_words_safely(&text);
|
let word_count = self.count_words_safely(&text);
|
||||||
info!("Direct text extraction succeeded as last resort for: {}", file_path);
|
info!("Direct text extraction succeeded as last resort for: {}", file_path);
|
||||||
|
|
||||||
|
// Sanitize null bytes to prevent PostgreSQL errors
|
||||||
|
let text = Self::remove_null_bytes(&text);
|
||||||
|
|
||||||
return Ok(OcrResult {
|
return Ok(OcrResult {
|
||||||
text,
|
text,
|
||||||
confidence: 50.0, // Lower confidence for direct extraction
|
confidence: 50.0, // Lower confidence for direct extraction
|
||||||
|
|
@ -1158,6 +1168,9 @@ impl EnhancedOcrService {
|
||||||
info!("OCR extraction completed for '{}': {} words in {}ms",
|
info!("OCR extraction completed for '{}': {} words in {}ms",
|
||||||
file_path, word_count, processing_time);
|
file_path, word_count, processing_time);
|
||||||
|
|
||||||
|
// Sanitize null bytes to prevent PostgreSQL errors
|
||||||
|
let ocr_text_result = Self::remove_null_bytes(&ocr_text_result);
|
||||||
|
|
||||||
Ok(OcrResult {
|
Ok(OcrResult {
|
||||||
text: ocr_text_result,
|
text: ocr_text_result,
|
||||||
confidence: 85.0, // OCR is generally lower confidence than direct text extraction
|
confidence: 85.0, // OCR is generally lower confidence than direct text extraction
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue