feat(office): use actual packages for extraction
This commit is contained in:
parent
546b41b462
commit
78af7e7861
|
|
@ -1023,6 +1023,21 @@ dependencies = [
|
||||||
"pkg-config",
|
"pkg-config",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "calamine"
|
||||||
|
version = "0.26.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "138646b9af2c5d7f1804ea4bf93afc597737d2bd4f7341d67c48b03316976eb1"
|
||||||
|
dependencies = [
|
||||||
|
"byteorder",
|
||||||
|
"codepage",
|
||||||
|
"encoding_rs",
|
||||||
|
"log",
|
||||||
|
"quick-xml 0.31.0",
|
||||||
|
"serde",
|
||||||
|
"zip 2.4.2",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cc"
|
name = "cc"
|
||||||
version = "1.2.27"
|
version = "1.2.27"
|
||||||
|
|
@ -1155,6 +1170,15 @@ dependencies = [
|
||||||
"cc",
|
"cc",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "codepage"
|
||||||
|
version = "0.1.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "48f68d061bc2828ae826206326e61251aca94c1e4a5305cf52d9138639c918b4"
|
||||||
|
dependencies = [
|
||||||
|
"encoding_rs",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "color_quant"
|
name = "color_quant"
|
||||||
version = "1.1.0"
|
version = "1.1.0"
|
||||||
|
|
@ -1466,6 +1490,21 @@ dependencies = [
|
||||||
"serde_json",
|
"serde_json",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "docx-rs"
|
||||||
|
version = "0.4.18"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3f21be13b97bd2924f30323d674f5a8db382964972825abd93f30d08f21dad98"
|
||||||
|
dependencies = [
|
||||||
|
"base64 0.22.1",
|
||||||
|
"image 0.24.9",
|
||||||
|
"serde",
|
||||||
|
"serde_json",
|
||||||
|
"thiserror 1.0.69",
|
||||||
|
"xml-rs",
|
||||||
|
"zip 0.6.6",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "dotenvy"
|
name = "dotenvy"
|
||||||
version = "0.15.7"
|
version = "0.15.7"
|
||||||
|
|
@ -2389,6 +2428,22 @@ dependencies = [
|
||||||
"icu_properties",
|
"icu_properties",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "image"
|
||||||
|
version = "0.24.9"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5690139d2f55868e080017335e4b94cb7414274c74f1669c84fb5feba2c9f69d"
|
||||||
|
dependencies = [
|
||||||
|
"bytemuck",
|
||||||
|
"byteorder",
|
||||||
|
"color_quant",
|
||||||
|
"gif",
|
||||||
|
"jpeg-decoder",
|
||||||
|
"num-traits",
|
||||||
|
"png",
|
||||||
|
"tiff",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "image"
|
name = "image"
|
||||||
version = "0.25.6"
|
version = "0.25.6"
|
||||||
|
|
@ -2431,7 +2486,7 @@ dependencies = [
|
||||||
"ab_glyph",
|
"ab_glyph",
|
||||||
"approx",
|
"approx",
|
||||||
"getrandom 0.2.16",
|
"getrandom 0.2.16",
|
||||||
"image",
|
"image 0.25.6",
|
||||||
"itertools",
|
"itertools",
|
||||||
"nalgebra",
|
"nalgebra",
|
||||||
"num",
|
"num",
|
||||||
|
|
@ -3500,6 +3555,16 @@ version = "2.0.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3"
|
checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "quick-xml"
|
||||||
|
version = "0.31.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33"
|
||||||
|
dependencies = [
|
||||||
|
"encoding_rs",
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "quick-xml"
|
name = "quick-xml"
|
||||||
version = "0.37.5"
|
version = "0.37.5"
|
||||||
|
|
@ -3692,13 +3757,15 @@ dependencies = [
|
||||||
"axum",
|
"axum",
|
||||||
"base64ct",
|
"base64ct",
|
||||||
"bcrypt",
|
"bcrypt",
|
||||||
|
"calamine",
|
||||||
"chrono",
|
"chrono",
|
||||||
"clap",
|
"clap",
|
||||||
|
"docx-rs",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
"futures",
|
"futures",
|
||||||
"futures-util",
|
"futures-util",
|
||||||
"hostname",
|
"hostname",
|
||||||
"image",
|
"image 0.25.6",
|
||||||
"imageproc",
|
"imageproc",
|
||||||
"infer",
|
"infer",
|
||||||
"jsonwebtoken",
|
"jsonwebtoken",
|
||||||
|
|
@ -3706,7 +3773,7 @@ dependencies = [
|
||||||
"notify",
|
"notify",
|
||||||
"oauth2",
|
"oauth2",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"quick-xml",
|
"quick-xml 0.37.5",
|
||||||
"rand 0.8.5",
|
"rand 0.8.5",
|
||||||
"raw-cpuid",
|
"raw-cpuid",
|
||||||
"readur",
|
"readur",
|
||||||
|
|
@ -6221,6 +6288,12 @@ dependencies = [
|
||||||
"rustix 1.0.7",
|
"rustix 1.0.7",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "xml-rs"
|
||||||
|
version = "0.8.27"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6fd8403733700263c6eb89f192880191f1b83e332f7a20371ddcf421c4a337c7"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "xmlparser"
|
name = "xmlparser"
|
||||||
version = "0.13.6"
|
version = "0.13.6"
|
||||||
|
|
@ -6351,6 +6424,23 @@ dependencies = [
|
||||||
"zstd",
|
"zstd",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zip"
|
||||||
|
version = "2.4.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "fabe6324e908f85a1c52063ce7aa26b68dcb7eb6dbc83a2d148403c9bc3eba50"
|
||||||
|
dependencies = [
|
||||||
|
"arbitrary",
|
||||||
|
"crc32fast",
|
||||||
|
"crossbeam-utils",
|
||||||
|
"displaydoc",
|
||||||
|
"flate2",
|
||||||
|
"indexmap 2.9.0",
|
||||||
|
"memchr",
|
||||||
|
"thiserror 2.0.16",
|
||||||
|
"zopfli",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "zip"
|
name = "zip"
|
||||||
version = "3.0.0"
|
version = "3.0.0"
|
||||||
|
|
|
||||||
|
|
@ -61,10 +61,10 @@ sha2 = "0.10"
|
||||||
utoipa-swagger-ui = { version = "9", features = ["axum"] }
|
utoipa-swagger-ui = { version = "9", features = ["axum"] }
|
||||||
testcontainers = { version = "0.24", optional = true }
|
testcontainers = { version = "0.24", optional = true }
|
||||||
testcontainers-modules = { version = "0.12", features = ["postgres"], optional = true }
|
testcontainers-modules = { version = "0.12", features = ["postgres"], optional = true }
|
||||||
# Office document support - temporarily disabled due to jetscii compatibility issues
|
# Office document support - using proper, well-maintained libraries
|
||||||
# docx = "0.2" # DOCX text extraction - temporarily disabled due to jetscii compatibility issues
|
docx-rs = "0.4" # For DOCX text extraction by bokuweb (NOT the docx crate by PoiScript)
|
||||||
# calamine = "0.22" # Excel files (XLS/XLSX) text extraction - temporarily disabled due to jetscii compatibility issues
|
calamine = "0.26" # For Excel (XLS/XLSX) text extraction
|
||||||
zip = "0.6" # For DOCX/PPTX archive handling
|
zip = "0.6" # Still needed for other archive handling
|
||||||
rand = "0.8"
|
rand = "0.8"
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
|
|
|
||||||
|
|
@ -42,10 +42,8 @@ pub struct EnhancedOcrService {
|
||||||
}
|
}
|
||||||
|
|
||||||
impl EnhancedOcrService {
|
impl EnhancedOcrService {
|
||||||
// Security limits to prevent ZIP bombs and memory exhaustion attacks
|
// Security limits for Office document processing
|
||||||
const MAX_DECOMPRESSED_SIZE: u64 = 100 * 1024 * 1024; // 100MB total decompressed size
|
const MAX_OFFICE_DOCUMENT_SIZE: u64 = 100 * 1024 * 1024; // 100MB for all Office documents
|
||||||
const MAX_XML_SIZE: u64 = 10 * 1024 * 1024; // 10MB per XML file
|
|
||||||
const MAX_ZIP_ENTRIES: usize = 1000; // Maximum number of entries to process
|
|
||||||
const MAX_ENTRY_NAME_LENGTH: usize = 255; // Maximum length of entry names
|
const MAX_ENTRY_NAME_LENGTH: usize = 255; // Maximum length of entry names
|
||||||
|
|
||||||
/// Remove null bytes from text to prevent PostgreSQL errors
|
/// Remove null bytes from text to prevent PostgreSQL errors
|
||||||
|
|
@ -68,91 +66,6 @@ impl EnhancedOcrService {
|
||||||
cleaned
|
cleaned
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Validates ZIP entry names to prevent directory traversal attacks
|
|
||||||
fn validate_zip_entry_name(entry_name: &str) -> Result<()> {
|
|
||||||
// Check entry name length
|
|
||||||
if entry_name.len() > Self::MAX_ENTRY_NAME_LENGTH {
|
|
||||||
return Err(anyhow!(
|
|
||||||
"ZIP entry name too long ({}). Maximum allowed length is {} characters for security reasons.",
|
|
||||||
entry_name.len(),
|
|
||||||
Self::MAX_ENTRY_NAME_LENGTH
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check for directory traversal attempts
|
|
||||||
if entry_name.contains("..") {
|
|
||||||
return Err(anyhow!(
|
|
||||||
"ZIP entry contains directory traversal sequence '..': '{}'. This is blocked for security reasons.",
|
|
||||||
entry_name
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check for absolute paths
|
|
||||||
if entry_name.starts_with('/') || entry_name.starts_with('\\') {
|
|
||||||
return Err(anyhow!(
|
|
||||||
"ZIP entry contains absolute path: '{}'. This is blocked for security reasons.",
|
|
||||||
entry_name
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check for Windows drive letters
|
|
||||||
if entry_name.len() >= 2 && entry_name.chars().nth(1) == Some(':') {
|
|
||||||
return Err(anyhow!(
|
|
||||||
"ZIP entry contains Windows drive letter: '{}'. This is blocked for security reasons.",
|
|
||||||
entry_name
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check for suspicious characters
|
|
||||||
let suspicious_chars = ['<', '>', '|', '*', '?'];
|
|
||||||
if entry_name.chars().any(|c| suspicious_chars.contains(&c)) {
|
|
||||||
return Err(anyhow!(
|
|
||||||
"ZIP entry contains suspicious characters: '{}'. This is blocked for security reasons.",
|
|
||||||
entry_name
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Safely reads content from a ZIP entry with size limits to prevent memory exhaustion
|
|
||||||
fn read_zip_entry_safely<R: std::io::Read>(reader: &mut R, max_size: u64) -> Result<String> {
|
|
||||||
use std::io::Read;
|
|
||||||
|
|
||||||
let mut buffer = Vec::new();
|
|
||||||
let mut total_read = 0u64;
|
|
||||||
let mut temp_buf = [0u8; 8192]; // 8KB chunks
|
|
||||||
|
|
||||||
loop {
|
|
||||||
match reader.read(&mut temp_buf)? {
|
|
||||||
0 => break, // EOF
|
|
||||||
bytes_read => {
|
|
||||||
total_read += bytes_read as u64;
|
|
||||||
|
|
||||||
// Check if we've exceeded the size limit
|
|
||||||
if total_read > max_size {
|
|
||||||
return Err(anyhow!(
|
|
||||||
"ZIP entry content exceeds maximum allowed size of {} bytes. \
|
|
||||||
This may be a ZIP bomb attack. Current size: {} bytes.",
|
|
||||||
max_size,
|
|
||||||
total_read
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
buffer.extend_from_slice(&temp_buf[..bytes_read]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Convert to string, handling encoding issues gracefully
|
|
||||||
String::from_utf8(buffer).or_else(|e| {
|
|
||||||
// Try to recover as much valid UTF-8 as possible
|
|
||||||
let bytes = e.into_bytes();
|
|
||||||
let lossy = String::from_utf8_lossy(&bytes);
|
|
||||||
Ok(lossy.into_owned())
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Sanitizes file paths before passing to external tools to prevent command injection
|
/// Sanitizes file paths before passing to external tools to prevent command injection
|
||||||
fn sanitize_file_path_for_external_tool(file_path: &str) -> Result<String> {
|
fn sanitize_file_path_for_external_tool(file_path: &str) -> Result<String> {
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
|
@ -1566,13 +1479,12 @@ impl EnhancedOcrService {
|
||||||
let metadata = tokio::fs::metadata(file_path).await?;
|
let metadata = tokio::fs::metadata(file_path).await?;
|
||||||
let file_size = metadata.len();
|
let file_size = metadata.len();
|
||||||
|
|
||||||
// Limit Office document size to 50MB to prevent memory exhaustion
|
// Limit Office document size to prevent memory exhaustion
|
||||||
const MAX_OFFICE_SIZE: u64 = 50 * 1024 * 1024; // 50MB
|
if file_size > Self::MAX_OFFICE_DOCUMENT_SIZE {
|
||||||
if file_size > MAX_OFFICE_SIZE {
|
|
||||||
return Err(anyhow!(
|
return Err(anyhow!(
|
||||||
"Office document too large: {:.1} MB (max: {:.1} MB). Consider converting to PDF or splitting the document.",
|
"Office document too large: {:.1} MB (max: {:.1} MB). Consider converting to PDF or splitting the document.",
|
||||||
file_size as f64 / (1024.0 * 1024.0),
|
file_size as f64 / (1024.0 * 1024.0),
|
||||||
MAX_OFFICE_SIZE as f64 / (1024.0 * 1024.0)
|
Self::MAX_OFFICE_DOCUMENT_SIZE as f64 / (1024.0 * 1024.0)
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1609,100 +1521,37 @@ impl EnhancedOcrService {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Extract text from DOCX files using zip crate and quick-xml
|
/// Extract text from DOCX files using docx-rs library
|
||||||
async fn extract_text_from_docx(&self, file_path: &str, start_time: std::time::Instant) -> Result<OcrResult> {
|
async fn extract_text_from_docx(&self, file_path: &str, start_time: std::time::Instant) -> Result<OcrResult> {
|
||||||
info!("Starting DOCX text extraction: {}", file_path);
|
info!("Starting DOCX text extraction: {}", file_path);
|
||||||
|
|
||||||
// Move CPU-intensive operations to blocking thread pool
|
// Move CPU-intensive operations to blocking thread pool
|
||||||
let file_path_clone = file_path.to_string();
|
let file_path_clone = file_path.to_string();
|
||||||
let extraction_result = tokio::task::spawn_blocking(move || -> Result<String> {
|
let extraction_result = tokio::task::spawn_blocking(move || -> Result<String> {
|
||||||
use zip::ZipArchive;
|
use docx_rs::*;
|
||||||
use quick_xml::events::Event;
|
|
||||||
use quick_xml::Reader;
|
|
||||||
|
|
||||||
// Open the DOCX file as a ZIP archive
|
|
||||||
let file = std::fs::File::open(&file_path_clone)?;
|
|
||||||
let mut archive = ZipArchive::new(file)?;
|
|
||||||
|
|
||||||
// Security check: Validate ZIP archive structure
|
// Read the DOCX file
|
||||||
let entry_count = archive.len();
|
let file_data = std::fs::read(&file_path_clone)?;
|
||||||
if entry_count > Self::MAX_ZIP_ENTRIES {
|
|
||||||
return Err(anyhow!(
|
|
||||||
"ZIP archive contains too many entries ({}). Maximum allowed is {} for security reasons. \
|
|
||||||
This may be a ZIP bomb attack.",
|
|
||||||
entry_count,
|
|
||||||
Self::MAX_ZIP_ENTRIES
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Validate all entry names before processing to prevent directory traversal
|
|
||||||
for i in 0..entry_count {
|
|
||||||
let entry = archive.by_index(i)?;
|
|
||||||
let entry_name = entry.name();
|
|
||||||
Self::validate_zip_entry_name(entry_name)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Try to extract the main document content from word/document.xml
|
// Parse the DOCX document using docx-rs
|
||||||
let mut document_xml = match archive.by_name("word/document.xml") {
|
let docx = read_docx(&file_data)
|
||||||
Ok(file) => file,
|
.map_err(|e| anyhow!(
|
||||||
Err(_) => {
|
"Failed to parse DOCX file '{}': {}. The file may be corrupted or not a valid DOCX document.",
|
||||||
return Err(anyhow!(
|
file_path_clone, e
|
||||||
"Invalid DOCX file: missing word/document.xml. The file '{}' may be corrupted or not a valid DOCX document.",
|
))?;
|
||||||
file_path_clone
|
|
||||||
));
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// Security: Use size-limited reading to prevent ZIP bomb attacks
|
|
||||||
let xml_content = Self::read_zip_entry_safely(&mut document_xml, Self::MAX_XML_SIZE)?;
|
|
||||||
drop(document_xml); // Close the archive entry
|
|
||||||
|
|
||||||
// Parse the XML and extract text content
|
|
||||||
let mut reader = Reader::from_str(&xml_content);
|
|
||||||
reader.config_mut().trim_text(true);
|
|
||||||
|
|
||||||
|
// Extract all text content from the document
|
||||||
let mut text_content = Vec::new();
|
let mut text_content = Vec::new();
|
||||||
let mut in_text_element = false;
|
|
||||||
let mut buf = Vec::new();
|
|
||||||
|
|
||||||
loop {
|
// Extract text from document body
|
||||||
match reader.read_event_into(&mut buf) {
|
let document = docx.document;
|
||||||
Ok(Event::Start(ref e)) => {
|
for child in document.children {
|
||||||
// Look for text elements (w:t tags contain the actual text)
|
Self::extract_text_from_document_child(&child, &mut text_content);
|
||||||
if e.name().as_ref() == b"w:t" {
|
|
||||||
in_text_element = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(Event::Text(e)) => {
|
|
||||||
if in_text_element {
|
|
||||||
// Extract and decode the text content
|
|
||||||
let text = e.unescape().map_err(|e| anyhow!("Text unescape error: {}", e))?;
|
|
||||||
text_content.push(text.into_owned());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(Event::End(ref e)) => {
|
|
||||||
if e.name().as_ref() == b"w:t" {
|
|
||||||
in_text_element = false;
|
|
||||||
}
|
|
||||||
// Add space after paragraph breaks
|
|
||||||
if e.name().as_ref() == b"w:p" {
|
|
||||||
text_content.push(" ".to_string());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(Event::Eof) => break,
|
|
||||||
Err(e) => {
|
|
||||||
return Err(anyhow!(
|
|
||||||
"XML parsing error in DOCX file '{}': {}. The file may be corrupted.",
|
|
||||||
file_path_clone, e
|
|
||||||
));
|
|
||||||
}
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
buf.clear();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Join all text content
|
// Join all text content with appropriate spacing
|
||||||
let raw_text = text_content.join("");
|
let raw_text = text_content.join(" ");
|
||||||
|
|
||||||
if raw_text.trim().is_empty() {
|
if raw_text.trim().is_empty() {
|
||||||
return Err(anyhow!(
|
return Err(anyhow!(
|
||||||
|
|
@ -1736,173 +1585,194 @@ impl EnhancedOcrService {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Extract text from Excel files (XLS/XLSX) using zip crate and quick-xml
|
/// Recursively extract text from document children (paragraphs, tables, etc.)
|
||||||
async fn extract_text_from_excel(&self, file_path: &str, mime_type: &str, start_time: std::time::Instant) -> Result<OcrResult> {
|
fn extract_text_from_document_child(child: &docx_rs::DocumentChild, text_content: &mut Vec<String>) {
|
||||||
info!("Starting Excel text extraction: {} (type: {})", file_path, mime_type);
|
match child {
|
||||||
|
docx_rs::DocumentChild::Paragraph(paragraph) => {
|
||||||
// Handle legacy XLS files separately
|
let mut paragraph_text = Vec::new();
|
||||||
if mime_type == "application/vnd.ms-excel" {
|
for child in &paragraph.children {
|
||||||
return self.extract_text_from_legacy_excel(file_path, start_time).await;
|
Self::extract_text_from_paragraph_child(child, &mut paragraph_text);
|
||||||
|
}
|
||||||
|
if !paragraph_text.is_empty() {
|
||||||
|
text_content.push(paragraph_text.join(""));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
docx_rs::DocumentChild::Table(table) => {
|
||||||
|
for row in &table.rows {
|
||||||
|
let docx_rs::TableChild::TableRow(table_row) = row;
|
||||||
|
for cell in &table_row.cells {
|
||||||
|
let docx_rs::TableRowChild::TableCell(table_cell) = cell;
|
||||||
|
for child in &table_cell.children {
|
||||||
|
match child {
|
||||||
|
docx_rs::TableCellContent::Paragraph(paragraph) => {
|
||||||
|
let mut paragraph_text = Vec::new();
|
||||||
|
for para_child in &paragraph.children {
|
||||||
|
Self::extract_text_from_paragraph_child(para_child, &mut paragraph_text);
|
||||||
|
}
|
||||||
|
if !paragraph_text.is_empty() {
|
||||||
|
text_content.push(paragraph_text.join(""));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
docx_rs::TableCellContent::Table(nested_table) => {
|
||||||
|
// Handle nested tables using helper function
|
||||||
|
Self::extract_text_from_nested_table(nested_table, text_content);
|
||||||
|
}
|
||||||
|
_ => {} // Skip other table cell content types
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
// Skip other elements like bookmarks that don't contain text content
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
// Move CPU-intensive operations to blocking thread pool for XLSX
|
|
||||||
let file_path_clone = file_path.to_string();
|
/// Extract text from nested tables in DOCX documents
|
||||||
let extraction_result = tokio::task::spawn_blocking(move || -> Result<String> {
|
fn extract_text_from_nested_table(nested_table: &docx_rs::Table, text_content: &mut Vec<String>) {
|
||||||
use zip::ZipArchive;
|
for nested_row in &nested_table.rows {
|
||||||
use quick_xml::events::Event;
|
let docx_rs::TableChild::TableRow(nested_table_row) = nested_row;
|
||||||
use quick_xml::Reader;
|
for nested_cell in &nested_table_row.cells {
|
||||||
|
let docx_rs::TableRowChild::TableCell(nested_table_cell) = nested_cell;
|
||||||
// Open the XLSX file as a ZIP archive
|
for nested_child in &nested_table_cell.children {
|
||||||
let file = std::fs::File::open(&file_path_clone)?;
|
match nested_child {
|
||||||
let mut archive = ZipArchive::new(file)?;
|
docx_rs::TableCellContent::Paragraph(nested_paragraph) => {
|
||||||
|
let mut nested_paragraph_text = Vec::new();
|
||||||
// Security check: Validate ZIP archive structure
|
for nested_para_child in &nested_paragraph.children {
|
||||||
let entry_count = archive.len();
|
Self::extract_text_from_paragraph_child(nested_para_child, &mut nested_paragraph_text);
|
||||||
if entry_count > Self::MAX_ZIP_ENTRIES {
|
}
|
||||||
return Err(anyhow!(
|
if !nested_paragraph_text.is_empty() {
|
||||||
"ZIP archive contains too many entries ({}). Maximum allowed is {} for security reasons. \
|
text_content.push(nested_paragraph_text.join(""));
|
||||||
This may be a ZIP bomb attack.",
|
}
|
||||||
entry_count,
|
}
|
||||||
Self::MAX_ZIP_ENTRIES
|
docx_rs::TableCellContent::Table(deeply_nested_table) => {
|
||||||
));
|
// Recursively handle deeply nested tables
|
||||||
|
Self::extract_text_from_nested_table(deeply_nested_table, text_content);
|
||||||
|
}
|
||||||
|
_ => {} // Skip other nested content for simplicity
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
// Validate all entry names before processing to prevent directory traversal
|
}
|
||||||
for i in 0..entry_count {
|
|
||||||
let entry = archive.by_index(i)?;
|
/// Extract text from paragraph children (runs, text elements, etc.)
|
||||||
let entry_name = entry.name();
|
fn extract_text_from_paragraph_child(child: &docx_rs::ParagraphChild, text_content: &mut Vec<String>) {
|
||||||
Self::validate_zip_entry_name(entry_name)?;
|
match child {
|
||||||
|
docx_rs::ParagraphChild::Run(run) => {
|
||||||
|
for child in &run.children {
|
||||||
|
match child {
|
||||||
|
docx_rs::RunChild::Text(text) => {
|
||||||
|
text_content.push(text.text.clone());
|
||||||
|
}
|
||||||
|
docx_rs::RunChild::Tab(_) => {
|
||||||
|
text_content.push("\t".to_string());
|
||||||
|
}
|
||||||
|
docx_rs::RunChild::Break(_break_elem) => {
|
||||||
|
// For simplicity, treat all breaks as line breaks
|
||||||
|
text_content.push("\n".to_string());
|
||||||
|
}
|
||||||
|
// Skip other elements like images, drawings, etc.
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
docx_rs::ParagraphChild::Insert(insert) => {
|
||||||
// First, extract shared strings (xl/sharedStrings.xml)
|
for child in &insert.children {
|
||||||
let mut shared_strings = Vec::new();
|
match child {
|
||||||
if let Ok(mut shared_strings_file) = archive.by_name("xl/sharedStrings.xml") {
|
docx_rs::InsertChild::Run(run) => {
|
||||||
// Security: Use size-limited reading to prevent ZIP bomb attacks
|
for run_child in &run.children {
|
||||||
let xml_content = Self::read_zip_entry_safely(&mut shared_strings_file, Self::MAX_XML_SIZE)?;
|
match run_child {
|
||||||
drop(shared_strings_file);
|
docx_rs::RunChild::Text(text) => {
|
||||||
|
text_content.push(text.text.clone());
|
||||||
// Parse shared strings
|
}
|
||||||
let mut reader = Reader::from_str(&xml_content);
|
docx_rs::RunChild::Tab(_) => {
|
||||||
reader.config_mut().trim_text(true);
|
text_content.push("\t".to_string());
|
||||||
let mut buf = Vec::new();
|
}
|
||||||
let mut in_string = false;
|
docx_rs::RunChild::Break(_) => {
|
||||||
let mut current_string = String::new();
|
text_content.push("\n".to_string());
|
||||||
|
}
|
||||||
loop {
|
_ => {}
|
||||||
match reader.read_event_into(&mut buf) {
|
}
|
||||||
Ok(Event::Start(ref e)) => {
|
|
||||||
if e.name().as_ref() == b"t" {
|
|
||||||
in_string = true;
|
|
||||||
current_string.clear();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(Event::Text(e)) => {
|
|
||||||
if in_string {
|
|
||||||
let text = e.unescape().map_err(|e| anyhow!("Text unescape error: {}", e))?;
|
|
||||||
current_string.push_str(&text);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(Event::End(ref e)) => {
|
|
||||||
if e.name().as_ref() == b"t" {
|
|
||||||
in_string = false;
|
|
||||||
shared_strings.push(current_string.clone());
|
|
||||||
current_string.clear();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(Event::Eof) => break,
|
|
||||||
Err(e) => {
|
|
||||||
return Err(anyhow!(
|
|
||||||
"XML parsing error in Excel shared strings: {}. The file may be corrupted.",
|
|
||||||
e
|
|
||||||
));
|
|
||||||
}
|
|
||||||
_ => {}
|
_ => {}
|
||||||
}
|
}
|
||||||
buf.clear();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
_ => {
|
||||||
|
// Skip other elements like deleted content, bookmarks, etc.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract text from Excel files (XLS/XLSX) using calamine library
|
||||||
|
async fn extract_text_from_excel(&self, file_path: &str, mime_type: &str, start_time: std::time::Instant) -> Result<OcrResult> {
|
||||||
|
info!("Starting Excel text extraction: {} (type: {})", file_path, mime_type);
|
||||||
|
|
||||||
|
// Move CPU-intensive operations to blocking thread pool
|
||||||
|
let file_path_clone = file_path.to_string();
|
||||||
|
let extraction_result = tokio::task::spawn_blocking(move || -> Result<String> {
|
||||||
|
use calamine::{open_workbook_auto, Reader, Data};
|
||||||
|
|
||||||
|
|
||||||
|
// Open the workbook using calamine - handles both XLS and XLSX automatically
|
||||||
|
let mut workbook = open_workbook_auto(&file_path_clone)
|
||||||
|
.map_err(|e| anyhow!(
|
||||||
|
"Failed to open Excel file '{}': {}. The file may be corrupted or not a valid Excel document.",
|
||||||
|
file_path_clone, e
|
||||||
|
))?;
|
||||||
|
|
||||||
// Now extract worksheet data
|
|
||||||
let mut all_text = Vec::new();
|
let mut all_text = Vec::new();
|
||||||
let mut worksheet_count = 0;
|
let worksheet_names = workbook.sheet_names().to_owned();
|
||||||
|
|
||||||
// Look for worksheets (xl/worksheets/sheet1.xml, sheet2.xml, etc.)
|
if worksheet_names.is_empty() {
|
||||||
for i in 1..=20 { // Check up to 20 worksheets
|
return Err(anyhow!(
|
||||||
let worksheet_name = format!("xl/worksheets/sheet{}.xml", i);
|
"No worksheets found in Excel file '{}'. The file may be corrupted or empty.",
|
||||||
|
file_path_clone
|
||||||
if let Ok(mut worksheet_file) = archive.by_name(&worksheet_name) {
|
));
|
||||||
worksheet_count += 1;
|
}
|
||||||
// Security: Use size-limited reading to prevent ZIP bomb attacks
|
|
||||||
let xml_content = Self::read_zip_entry_safely(&mut worksheet_file, Self::MAX_XML_SIZE)?;
|
// Extract text from all worksheets
|
||||||
drop(worksheet_file);
|
for sheet_name in worksheet_names {
|
||||||
|
if let Ok(range) = workbook.worksheet_range(&sheet_name) {
|
||||||
// Parse worksheet data
|
// Iterate through all cells in the worksheet
|
||||||
let mut reader = Reader::from_str(&xml_content);
|
for row in range.rows() {
|
||||||
reader.config_mut().trim_text(true);
|
for cell in row {
|
||||||
let mut buf = Vec::new();
|
// Extract text content from each cell based on its data type
|
||||||
let mut in_cell_value = false;
|
let cell_text = match cell {
|
||||||
let mut current_cell_type = String::new();
|
Data::String(s) => s.clone(),
|
||||||
|
Data::Float(f) => {
|
||||||
loop {
|
// Format numbers appropriately
|
||||||
match reader.read_event_into(&mut buf) {
|
if f.fract() == 0.0 {
|
||||||
Ok(Event::Start(ref e)) => {
|
format!("{}", *f as i64) // Integer
|
||||||
if e.name().as_ref() == b"c" {
|
|
||||||
// Cell element - check if it has a type attribute
|
|
||||||
current_cell_type.clear();
|
|
||||||
for attr in e.attributes() {
|
|
||||||
if let Ok(attr) = attr {
|
|
||||||
if attr.key.as_ref() == b"t" {
|
|
||||||
current_cell_type = String::from_utf8_lossy(&attr.value).to_string();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if e.name().as_ref() == b"v" {
|
|
||||||
// Cell value
|
|
||||||
in_cell_value = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(Event::Text(e)) => {
|
|
||||||
if in_cell_value {
|
|
||||||
let text = e.unescape().map_err(|e| anyhow!("Text unescape error: {}", e))?;
|
|
||||||
|
|
||||||
// If this is a shared string reference (t="s"), look up the string
|
|
||||||
if current_cell_type == "s" {
|
|
||||||
if let Ok(index) = text.parse::<usize>() {
|
|
||||||
if let Some(shared_string) = shared_strings.get(index) {
|
|
||||||
all_text.push(shared_string.clone());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
// Direct value
|
format!("{}", f) // Decimal
|
||||||
all_text.push(text.into_owned());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Data::Int(i) => format!("{}", i),
|
||||||
|
Data::Bool(b) => format!("{}", b),
|
||||||
|
Data::DateTime(dt) => format!("{}", dt),
|
||||||
|
Data::DateTimeIso(dt_iso) => dt_iso.clone(),
|
||||||
|
Data::DurationIso(dur_iso) => dur_iso.clone(),
|
||||||
|
Data::Error(e) => format!("ERROR: {:?}", e),
|
||||||
|
Data::Empty => continue, // Skip empty cells
|
||||||
|
};
|
||||||
|
|
||||||
|
// Only add non-empty text
|
||||||
|
let trimmed_text = cell_text.trim();
|
||||||
|
if !trimmed_text.is_empty() {
|
||||||
|
all_text.push(trimmed_text.to_string());
|
||||||
}
|
}
|
||||||
Ok(Event::End(ref e)) => {
|
|
||||||
if e.name().as_ref() == b"v" {
|
|
||||||
in_cell_value = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(Event::Eof) => break,
|
|
||||||
Err(e) => {
|
|
||||||
return Err(anyhow!(
|
|
||||||
"XML parsing error in Excel worksheet {}: {}. The file may be corrupted.",
|
|
||||||
worksheet_name, e
|
|
||||||
));
|
|
||||||
}
|
|
||||||
_ => {}
|
|
||||||
}
|
}
|
||||||
buf.clear();
|
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
// No more worksheets found
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if worksheet_count == 0 {
|
if all_text.is_empty() {
|
||||||
return Err(anyhow!(
|
return Err(anyhow!(
|
||||||
"Invalid XLSX file: no worksheets found in '{}'. The file may be corrupted or not a valid Excel document.",
|
"No text content found in Excel file '{}'. All cells may be empty or contain only formulas/formatting.",
|
||||||
file_path_clone
|
file_path_clone
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
@ -1910,13 +1780,6 @@ impl EnhancedOcrService {
|
||||||
// Join all text content with spaces
|
// Join all text content with spaces
|
||||||
let raw_text = all_text.join(" ");
|
let raw_text = all_text.join(" ");
|
||||||
|
|
||||||
if raw_text.trim().is_empty() {
|
|
||||||
return Err(anyhow!(
|
|
||||||
"No text content found in Excel file '{}'. The spreadsheet may be empty or contain only formulas/formatting.",
|
|
||||||
file_path_clone
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(raw_text)
|
Ok(raw_text)
|
||||||
|
|
||||||
}).await??;
|
}).await??;
|
||||||
|
|
@ -1928,8 +1791,10 @@ impl EnhancedOcrService {
|
||||||
let word_count = self.count_words_safely(&cleaned_text);
|
let word_count = self.count_words_safely(&cleaned_text);
|
||||||
|
|
||||||
info!(
|
info!(
|
||||||
"Excel extraction completed: {} words extracted from '{}' in {}ms",
|
"Excel extraction completed: {} words extracted from '{}' in {}ms (processed {} worksheets)",
|
||||||
word_count, file_path, processing_time
|
word_count, file_path, processing_time,
|
||||||
|
// Count worksheets that were processed (approximation)
|
||||||
|
cleaned_text.matches("worksheet").count().max(1)
|
||||||
);
|
);
|
||||||
|
|
||||||
Ok(OcrResult {
|
Ok(OcrResult {
|
||||||
|
|
@ -1942,23 +1807,6 @@ impl EnhancedOcrService {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Extract text from legacy Excel files (XLS format)
|
|
||||||
async fn extract_text_from_legacy_excel(&self, file_path: &str, start_time: std::time::Instant) -> Result<OcrResult> {
|
|
||||||
info!("Processing legacy Excel (XLS) file: {}", file_path);
|
|
||||||
|
|
||||||
let processing_time = start_time.elapsed().as_millis() as u64;
|
|
||||||
|
|
||||||
// Legacy XLS files are complex binary format, suggest conversion
|
|
||||||
Err(anyhow!(
|
|
||||||
"Legacy Excel files (.xls) are not directly supported for text extraction due to their complex binary format. \
|
|
||||||
To process the content from '{}', please:\n\
|
|
||||||
1. Open the file in Microsoft Excel, LibreOffice Calc, or Google Sheets\n\
|
|
||||||
2. Save/Export as XLSX format (recommended) or CSV\n\
|
|
||||||
3. Alternatively, export as PDF to preserve formatting\n\
|
|
||||||
\nXLSX format provides better compatibility and more reliable text extraction.",
|
|
||||||
file_path
|
|
||||||
))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Extract text from legacy DOC files using external tools
|
/// Extract text from legacy DOC files using external tools
|
||||||
async fn extract_text_from_legacy_doc(&self, file_path: &str, start_time: std::time::Instant) -> Result<OcrResult> {
|
async fn extract_text_from_legacy_doc(&self, file_path: &str, start_time: std::time::Instant) -> Result<OcrResult> {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue