feat(ocr): emit blank lines instead of explicit section/page break markers

This commit is contained in:
perf3ct 2025-09-05 00:06:09 +00:00
parent 07602a0096
commit 7863b9100f
No known key found for this signature in database
GPG Key ID: 569C4EEC436F5232
1 changed file with 5 additions and 9 deletions

View File

@@ -808,12 +808,12 @@ impl XmlOfficeExtractor {
} }
// Remove automatic spacing after w:r - this was causing words to be split // Remove automatic spacing after w:r - this was causing words to be split
// Instead, rely on explicit w:space elements and natural paragraph breaks // Instead, rely on explicit w:space elements and natural paragraph breaks
// Handle section breaks and page breaks // Handle section breaks and page breaks with just whitespace
b"w:sectPr" => { b"w:sectPr" => {
text_content.push("\n\n--- Section Break ---\n\n".to_string()); text_content.push("\n\n".to_string());
} }
b"w:lastRenderedPageBreak" => { b"w:lastRenderedPageBreak" => {
text_content.push("\n\n--- Page Break ---\n\n".to_string()); text_content.push("\n\n".to_string());
} }
_ => {} _ => {}
} }
@@ -835,12 +835,8 @@ impl XmlOfficeExtractor {
let raw_text = text_content.join(""); let raw_text = text_content.join("");
let cleaned_text = Self::clean_extracted_text(&raw_text); let cleaned_text = Self::clean_extracted_text(&raw_text);
// Check if we have actual text content (not just structural markers like section breaks) // Check if we have actual text content
let content_without_markers = cleaned_text if cleaned_text.trim().is_empty() {
.replace("--- Section Break ---", "")
.replace("--- Page Break ---", "");
if content_without_markers.trim().is_empty() {
return Err(OfficeExtractionError::empty_document_error(&file_path_clone, "DOCX")); return Err(OfficeExtractionError::empty_document_error(&file_path_clone, "DOCX"));
} }