From 7863b9100f80a918181175532cac9ceae6156337 Mon Sep 17 00:00:00 2001
From: perf3ct <jonfuller2012@gmail.com>
Date: Fri, 5 Sep 2025 00:06:09 +0000
Subject: [PATCH] feat(ocr): no longer add explicit section / page break

---
 src/ocr/xml_extractor.rs | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/src/ocr/xml_extractor.rs b/src/ocr/xml_extractor.rs
index 4982c50..6081b72 100644
--- a/src/ocr/xml_extractor.rs
+++ b/src/ocr/xml_extractor.rs
@@ -808,12 +808,12 @@ impl XmlOfficeExtractor {
                             }
                             // Remove automatic spacing after w:r - this was causing words to be split
                             // Instead, rely on explicit w:space elements and natural paragraph breaks
-                            // Handle section breaks and page breaks
+                            // Represent section and page breaks as a blank line only (no visible marker text)
                             b"w:sectPr" => {
-                                text_content.push("\n\n--- Section Break ---\n\n".to_string());
+                                text_content.push("\n\n".to_string());
                             }
                             b"w:lastRenderedPageBreak" => {
-                                text_content.push("\n\n--- Page Break ---\n\n".to_string());
+                                text_content.push("\n\n".to_string());
                             }
                             _ => {}
                         }
@@ -835,12 +835,8 @@ impl XmlOfficeExtractor {
             let raw_text = text_content.join("");
             let cleaned_text = Self::clean_extracted_text(&raw_text);
             
-            // Check if we have actual text content (not just structural markers like section breaks)
-            let content_without_markers = cleaned_text
-                .replace("--- Section Break ---", "")
-                .replace("--- Page Break ---", "");
-            
-            if content_without_markers.trim().is_empty() {
+            // Report an empty-document error when the cleaned text is empty or whitespace-only
+            if cleaned_text.trim().is_empty() {
                 return Err(OfficeExtractionError::empty_document_error(&file_path_clone, "DOCX"));
             }