From 7863b9100f80a918181175532cac9ceae6156337 Mon Sep 17 00:00:00 2001 From: perf3ct Date: Fri, 5 Sep 2025 00:06:09 +0000 Subject: [PATCH] feat(ocr): no longer add explicit section / page break --- src/ocr/xml_extractor.rs | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/ocr/xml_extractor.rs b/src/ocr/xml_extractor.rs index 4982c50..6081b72 100644 --- a/src/ocr/xml_extractor.rs +++ b/src/ocr/xml_extractor.rs @@ -808,12 +808,12 @@ impl XmlOfficeExtractor { } // Remove automatic spacing after w:r - this was causing words to be split // Instead, rely on explicit w:space elements and natural paragraph breaks - // Handle section breaks and page breaks + // Handle section breaks and page breaks with just whitespace b"w:sectPr" => { - text_content.push("\n\n--- Section Break ---\n\n".to_string()); + text_content.push("\n\n".to_string()); } b"w:lastRenderedPageBreak" => { - text_content.push("\n\n--- Page Break ---\n\n".to_string()); + text_content.push("\n\n".to_string()); } _ => {} } @@ -835,12 +835,8 @@ impl XmlOfficeExtractor { let raw_text = text_content.join(""); let cleaned_text = Self::clean_extracted_text(&raw_text); - // Check if we have actual text content (not just structural markers like section breaks) - let content_without_markers = cleaned_text - .replace("--- Section Break ---", "") - .replace("--- Page Break ---", ""); - - if content_without_markers.trim().is_empty() { + // Check if we have actual text content + if cleaned_text.trim().is_empty() { return Err(OfficeExtractionError::empty_document_error(&file_path_clone, "DOCX")); }