feat(ocr): no longer add explicit section / page break markers
This commit is contained in:
parent
07602a0096
commit
7863b9100f
|
|
@ -808,12 +808,12 @@ impl XmlOfficeExtractor {
|
||||||
}
|
}
|
||||||
// Remove automatic spacing after w:r - this was causing words to be split
|
// Remove automatic spacing after w:r - this was causing words to be split
|
||||||
// Instead, rely on explicit w:space elements and natural paragraph breaks
|
// Instead, rely on explicit w:space elements and natural paragraph breaks
|
||||||
// Handle section breaks and page breaks
|
// Handle section breaks and page breaks with just whitespace
|
||||||
b"w:sectPr" => {
|
b"w:sectPr" => {
|
||||||
text_content.push("\n\n--- Section Break ---\n\n".to_string());
|
text_content.push("\n\n".to_string());
|
||||||
}
|
}
|
||||||
b"w:lastRenderedPageBreak" => {
|
b"w:lastRenderedPageBreak" => {
|
||||||
text_content.push("\n\n--- Page Break ---\n\n".to_string());
|
text_content.push("\n\n".to_string());
|
||||||
}
|
}
|
||||||
_ => {}
|
_ => {}
|
||||||
}
|
}
|
||||||
|
|
@ -835,12 +835,8 @@ impl XmlOfficeExtractor {
|
||||||
let raw_text = text_content.join("");
|
let raw_text = text_content.join("");
|
||||||
let cleaned_text = Self::clean_extracted_text(&raw_text);
|
let cleaned_text = Self::clean_extracted_text(&raw_text);
|
||||||
|
|
||||||
// Check if we have actual text content (not just structural markers like section breaks)
|
// Check if we have actual text content
|
||||||
let content_without_markers = cleaned_text
|
if cleaned_text.trim().is_empty() {
|
||||||
.replace("--- Section Break ---", "")
|
|
||||||
.replace("--- Page Break ---", "");
|
|
||||||
|
|
||||||
if content_without_markers.trim().is_empty() {
|
|
||||||
return Err(OfficeExtractionError::empty_document_error(&file_path_clone, "DOCX"));
|
return Err(OfficeExtractionError::empty_document_error(&file_path_clone, "DOCX"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue