// Readur/src/ocr/xml_extractor.rs

use anyhow::{anyhow, Result};
use tracing::{info, warn};
use std::time::Instant;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use tokio::time::{timeout, Duration};
use super::enhanced::OcrResult;
/// User-friendly error messages for Office document extraction issues
pub struct OfficeExtractionError;
impl OfficeExtractionError {
/// Create a user-friendly timeout error
pub fn timeout_error(file_path: &str, timeout_seconds: u64) -> anyhow::Error {
anyhow!(
"Document processing timed out after {} seconds.\n\
\n\
The file '{}' is taking too long to process, which may indicate:\n\
• Very large or complex document structure\n\
• Document contains many embedded objects or images\n\
• Corrupted or damaged file\n\
\n\
Suggestions to resolve this issue:\n\
1. Convert the document to PDF format (often processes faster)\n\
2. Split large documents into smaller sections\n\
3. Remove or compress embedded images/objects\n\
4. Try opening and re-saving the document to fix potential corruption\n\
5. Contact support if this is an important document that consistently fails",
timeout_seconds, file_path
)
}
/// Create a user-friendly file size error
pub fn file_too_large_error(file_path: &str, file_size_mb: f64, max_size_mb: f64) -> anyhow::Error {
anyhow!(
"Document is too large to process safely.\n\
\n\
The file '{}' is {:.1} MB, but the maximum allowed size is {:.1} MB.\n\
\n\
This limit helps prevent system overload and ensures reliable processing.\n\
\n\
Suggestions to resolve this issue:\n\
1. Split the document into smaller files (recommended)\n\
2. Reduce image quality or remove unnecessary images\n\
3. Convert to PDF format which often compresses better\n\
4. Remove embedded objects, videos, or audio files\n\
5. Process individual sections separately if splitting isn't practical",
file_path, file_size_mb, max_size_mb
)
}
/// Create a user-friendly corrupted file error
pub fn corrupted_file_error(file_path: &str, file_type: &str, specific_issue: &str) -> anyhow::Error {
anyhow!(
"Unable to process document - file appears corrupted or invalid.\n\
\n\
The {} file '{}' could not be processed due to: {}\n\
\n\
This typically indicates:\n\
• File corruption during transfer or storage\n\
• Incomplete download or truncated file\n\
• File format doesn't match the expected structure\n\
• Document was created with incompatible software\n\
\n\
Suggestions to resolve this issue:\n\
1. Re-download or re-obtain the original file\n\
2. Open the document in its native application and re-save it\n\
3. Try converting the document to PDF format first\n\
4. Use a file repair tool if available\n\
5. Contact the document creator for a fresh copy",
file_type, file_path, specific_issue
)
}
/// Create a user-friendly empty document error
pub fn empty_document_error(file_path: &str, document_type: &str) -> anyhow::Error {
anyhow!(
"No text content found in document.\n\
\n\
The {} file '{}' appears to be empty or contains no extractable text.\n\
\n\
This could mean:\n\
• Document contains only images, charts, or graphics\n\
• All content is in unsupported formats (e.g., embedded objects)\n\
• Document is password-protected or encrypted\n\
• File contains only formatting with no actual text\n\
\n\
Suggestions:\n\
1. Check if the document has visible content when opened normally\n\
2. If it contains images with text, convert to PDF and try again\n\
3. Copy and paste content into a new document if possible\n\
4. Remove password protection if the document is encrypted\n\
5. Contact support if you believe this document should contain text",
document_type, file_path
)
}
/// Create a user-friendly unsupported format error
pub fn unsupported_format_error(file_path: &str, file_format: &str, suggested_formats: &[&str]) -> anyhow::Error {
let format_list = suggested_formats.join(", ");
anyhow!(
"Document format not supported for text extraction.\n\
\n\
The file '{}' is in {} format, which is not currently supported for automatic text extraction.\n\
\n\
Supported formats include: {}\n\
\n\
Suggestions to process this document:\n\
1. Convert to a supported format (PDF recommended)\n\
2. Open in the original application and export/save as supported format\n\
3. Copy text manually and paste into a supported document type\n\
4. Use online conversion tools to change the format\n\
5. Contact support if you frequently work with this format",
file_path, file_format, format_list
)
}
/// Create a user-friendly ZIP bomb protection error
pub fn zip_bomb_protection_error(current_size_mb: f64, max_size_mb: f64) -> anyhow::Error {
anyhow!(
"Document processing stopped for security reasons.\n\
\n\
The document's internal structure expanded to {:.1} MB when processed, \
exceeding the safety limit of {:.1} MB.\n\
\n\
This protection prevents potential 'ZIP bomb' attacks that could overwhelm the system.\n\
\n\
If this is a legitimate document:\n\
1. The document may be extremely large or complex\n\
2. Try splitting it into smaller sections\n\
3. Convert to PDF format which may process more efficiently\n\
4. Remove large embedded objects or images\n\
5. Contact support if you believe this is a valid business document",
current_size_mb, max_size_mb
)
}
}
/// Result structure for Office document text extraction
#[derive(Debug, Clone)]
pub struct OfficeExtractionResult {
pub text: String,
pub confidence: f32,
pub processing_time_ms: u64,
pub word_count: usize,
pub extraction_method: String,
}
impl From<OfficeExtractionResult> for OcrResult {
/// Convert OfficeExtractionResult to OcrResult for compatibility with the main OCR service
fn from(office_result: OfficeExtractionResult) -> Self {
OcrResult {
text: office_result.text,
confidence: office_result.confidence,
processing_time_ms: office_result.processing_time_ms,
word_count: office_result.word_count,
preprocessing_applied: vec![office_result.extraction_method],
processed_image_path: None, // XML extraction doesn't produce processed images
}
}
}
/// Extraction context for tracking progress and supporting cancellation
pub struct ExtractionContext {
/// Flag to indicate if the operation should be cancelled
pub cancelled: Arc<AtomicBool>,
/// Total decompressed size across all ZIP entries (for ZIP bomb protection)
pub total_decompressed_size: Arc<AtomicU64>,
/// Maximum allowed total decompressed size
pub max_total_decompressed_size: u64,
/// Original compressed file size for compression ratio calculations
pub compressed_file_size: u64,
/// Maximum allowed compression ratio (decompressed/compressed)
pub max_compression_ratio: f64,
}
impl ExtractionContext {
pub fn new(max_total_decompressed_size: u64) -> Self {
Self {
cancelled: Arc::new(AtomicBool::new(false)),
total_decompressed_size: Arc::new(AtomicU64::new(0)),
max_total_decompressed_size,
compressed_file_size: 0, // Will be set when file is processed
max_compression_ratio: 1000.0, // Allow up to 1000:1 ratio (should catch most ZIP bombs)
}
}
pub fn new_with_file_info(max_total_decompressed_size: u64, compressed_file_size: u64) -> Self {
Self {
cancelled: Arc::new(AtomicBool::new(false)),
total_decompressed_size: Arc::new(AtomicU64::new(0)),
max_total_decompressed_size,
compressed_file_size,
max_compression_ratio: 1000.0, // Allow up to 1000:1 ratio
}
}
pub fn cancel(&self) {
self.cancelled.store(true, Ordering::SeqCst);
}
pub fn is_cancelled(&self) -> bool {
self.cancelled.load(Ordering::SeqCst)
}
pub fn add_decompressed_bytes(&self, bytes: u64) -> Result<()> {
// fetch_add returns the previous total, so add `bytes` again to get the new total
let new_total = self.total_decompressed_size.fetch_add(bytes, Ordering::SeqCst) + bytes;
// Check absolute size limit
if new_total > self.max_total_decompressed_size {
return Err(OfficeExtractionError::zip_bomb_protection_error(
new_total as f64 / (1024.0 * 1024.0),
self.max_total_decompressed_size as f64 / (1024.0 * 1024.0)
));
}
// Check compression ratio if we have file size info
if self.compressed_file_size > 0 {
let current_ratio = new_total as f64 / self.compressed_file_size as f64;
if current_ratio > self.max_compression_ratio {
return Err(anyhow!(
"Document compression ratio is suspiciously high: {:.1}:1 (limit: {:.1}:1).\n\
\n\
The document expanded from {:.1} MB to {:.1} MB when processed, \
which indicates a potential ZIP bomb attack.\n\
\n\
ZIP bombs are malicious files designed to consume system resources \
by expanding to enormous sizes when decompressed.\n\
\n\
If this is a legitimate document:\n\
1. The file may contain highly repetitive content\n\
2. Try converting to PDF format first\n\
3. Split the document into smaller sections\n\
4. Contact support if this is a valid business document",
current_ratio,
self.max_compression_ratio,
self.compressed_file_size as f64 / (1024.0 * 1024.0),
new_total as f64 / (1024.0 * 1024.0)
));
}
}
Ok(())
}
}
/// XML-based Office document extractor with security features
pub struct XmlOfficeExtractor {
/// Temporary directory for file processing
pub temp_dir: String,
}
impl XmlOfficeExtractor {
// Security limits to prevent ZIP bombs and memory exhaustion attacks
const MAX_DECOMPRESSED_SIZE: u64 = 100 * 1024 * 1024; // 100MB total decompressed size across all entries
const MAX_XML_SIZE: u64 = 10 * 1024 * 1024; // 10MB per XML file
const MAX_ZIP_ENTRIES: usize = 1000; // Maximum number of entries to process
const MAX_ENTRY_NAME_LENGTH: usize = 255; // Maximum length of entry names
const MAX_OFFICE_SIZE: u64 = 50 * 1024 * 1024; // 50MB max Office document size
// Operation timeout constants
const DEFAULT_TIMEOUT_SECONDS: u64 = 120; // 2 minutes default timeout
const MAX_TIMEOUT_SECONDS: u64 = 600; // 10 minutes maximum timeout
// XML processing constants
const XML_READ_BUFFER_SIZE: usize = 8192; // 8KB chunks for reading
const MAX_WORKSHEETS_TO_CHECK: usize = 50; // Maximum worksheets to check in Excel files
const WORD_LENGTH_ESTIMATE: usize = 5; // Average characters per word for estimation
const MAX_WORD_COUNT_DISPLAY: usize = 10_000_000; // Maximum word count to prevent display issues
// XML entity limits to prevent expansion attacks.
// Note: quick-xml does not expand custom DTD entities at all (see
// create_secure_xml_reader), so these limits document intent rather
// than gate an enforced check today.
const MAX_ENTITY_EXPANSIONS: usize = 1000; // Maximum number of entity expansions
const MAX_ENTITY_DEPTH: usize = 10; // Maximum depth of nested entity references
/// Create a new XML Office extractor
pub fn new(temp_dir: String) -> Self {
Self { temp_dir }
}
/// Create a secure XML reader with protection against entity expansion attacks
fn create_secure_xml_reader(xml_content: &str) -> quick_xml::Reader<&[u8]> {
use quick_xml::Reader;
let mut reader = Reader::from_str(xml_content);
let config = reader.config_mut();
// Parser configuration: safe defaults plus performance tweaks
config.trim_text(true);
config.check_end_names = false; // Performance: skip matching of end-tag names
config.expand_empty_elements = false; // Keep self-closing tags as Event::Empty (the parsers below rely on this)
// Note: quick-xml doesn't support external entity expansion by default,
// so XXE-style attacks are not a concern; we're just being explicit here
reader
}
/// Validate file path for security to prevent directory traversal and shell injection
fn validate_file_path_security(&self, file_path: &str) -> Result<()> {
// Check for null bytes
if file_path.contains('\0') {
return Err(anyhow!(
"File path contains null bytes: '{}'. This is blocked for security reasons.",
file_path.replace('\0', "\\0")
));
}
// Check for directory traversal attempts
if file_path.contains("..") {
return Err(anyhow!(
"File path contains directory traversal sequence '..': '{}'. This is blocked for security reasons.",
file_path
));
}
// Check for suspicious shell injection characters
let suspicious_chars = ['|', '&', ';', '$', '`', '(', ')', '{', '}', '[', ']', '<', '>'];
if file_path.chars().any(|c| suspicious_chars.contains(&c)) {
return Err(anyhow!(
"File path contains suspicious characters that could be used for command injection: '{}'. This is blocked for security reasons.",
file_path
));
}
// Check for shell command prefixes
let dangerous_prefixes = ["/bin/", "/usr/bin/", "/sbin/", "/usr/sbin/"];
for prefix in &dangerous_prefixes {
if file_path.starts_with(prefix) {
return Err(anyhow!(
"File path starts with potentially dangerous system directory '{}': '{}'. This is blocked for security reasons.",
prefix, file_path
));
}
}
// Ensure path is reasonably long (avoid empty or very short paths that might be special)
if file_path.trim().len() < 3 {
return Err(anyhow!(
"File path is too short: '{}'. This might indicate a malformed or dangerous path.",
file_path
));
}
// Check that file exists (additional validation)
if !std::path::Path::new(file_path).exists() {
return Err(anyhow!(
"File does not exist: '{}'. This prevents processing of non-existent files.",
file_path
));
}
Ok(())
}
/// Try to execute an external tool with timeout and proper error handling
async fn try_external_tool(&self, tool_name: &str, args: &[&str], file_path: &str) -> Result<String> {
use tokio::process::Command;
// Create the command with proper argument passing (no shell)
let mut cmd = Command::new(tool_name);
cmd.args(args);
// Set timeout (30 seconds should be reasonable for DOC extraction)
let timeout_duration = Duration::from_secs(30);
info!("Executing external tool: {} with args: {:?}", tool_name, args);
// Execute the command with timeout
let output = match timeout(timeout_duration, cmd.output()).await {
Ok(Ok(output)) => output,
Ok(Err(e)) => {
if e.kind() == std::io::ErrorKind::NotFound {
return Err(anyhow!(
"Tool '{}' not found. Please install it: sudo apt-get install {}",
tool_name,
match tool_name {
"antiword" => "antiword",
"catdoc" => "catdoc",
"wvText" => "wv",
_ => tool_name,
}
));
} else {
return Err(anyhow!("Failed to execute '{}': {}", tool_name, e));
}
}
Err(_) => {
return Err(anyhow!(
"Tool '{}' timed out after 30 seconds while processing '{}'",
tool_name, file_path
));
}
};
// Check exit status
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
let stdout = String::from_utf8_lossy(&output.stdout);
return Err(anyhow!(
"Tool '{}' failed with exit code: {:?}\nstderr: {}\nstdout: {}",
tool_name,
output.status.code(),
stderr.trim(),
stdout.trim()
));
}
// Extract text from stdout
let extracted_text = String::from_utf8_lossy(&output.stdout).into_owned();
// Check if we got any meaningful output
if extracted_text.trim().is_empty() {
let stderr = String::from_utf8_lossy(&output.stderr);
return Err(anyhow!(
"Tool '{}' produced no output. stderr: {}",
tool_name,
stderr.trim()
));
}
info!("Successfully extracted {} characters with {}", extracted_text.len(), tool_name);
Ok(extracted_text)
}
/// Parse workbook.xml to get actual worksheet references instead of guessing
fn get_worksheet_names_from_workbook(archive: &mut zip::ZipArchive<std::fs::File>, context: &ExtractionContext) -> Result<Vec<String>> {
use quick_xml::events::Event;
// Try to read workbook.xml
let mut workbook_file = match archive.by_name("xl/workbook.xml") {
Ok(file) => file,
Err(_) => {
// Fall back to the old method if workbook.xml doesn't exist
warn!("workbook.xml not found, falling back to sequential worksheet detection");
return Ok((1..=Self::MAX_WORKSHEETS_TO_CHECK)
.map(|i| format!("sheet{}.xml", i))
.collect());
}
};
let xml_content = Self::read_zip_entry_safely(&mut workbook_file, Self::MAX_XML_SIZE, context)?;
drop(workbook_file);
let mut reader = Self::create_secure_xml_reader(&xml_content);
let mut worksheets = Vec::new();
let mut buf = Vec::new();
// Parse workbook.xml to find sheet references
loop {
if context.is_cancelled() {
return Err(anyhow!("Operation cancelled while parsing workbook.xml"));
}
match reader.read_event_into(&mut buf) {
Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
if e.name().as_ref() == b"sheet" {
// Look for the r:id attribute to get the worksheet relationship
for attr in e.attributes() {
if let Ok(attr) = attr {
if attr.key.as_ref() == b"r:id" {
let sheet_id = String::from_utf8_lossy(&attr.value);
// Convert relationship ID to a worksheet filename.
// Heuristic: rId1 -> sheet1.xml, rId2 -> sheet2.xml, etc. Strict
// resolution would go through xl/_rels/workbook.xml.rels, but this
// pattern holds for files written by Excel and most generators.
if let Some(sheet_num) = sheet_id.strip_prefix("rId") {
worksheets.push(format!("sheet{}.xml", sheet_num));
}
}
}
}
}
}
Ok(Event::Eof) => break,
Err(e) => {
warn!("Error parsing workbook.xml, falling back to sequential detection: {}", e);
// Fall back to old method on parse error
return Ok((1..=Self::MAX_WORKSHEETS_TO_CHECK)
.map(|i| format!("sheet{}.xml", i))
.collect());
}
_ => {}
}
buf.clear();
}
if worksheets.is_empty() {
// Fall back if no worksheets found
warn!("No worksheets found in workbook.xml, falling back to sequential detection");
Ok((1..=Self::MAX_WORKSHEETS_TO_CHECK)
.map(|i| format!("sheet{}.xml", i))
.collect())
} else {
info!("Found {} worksheets in workbook.xml", worksheets.len());
Ok(worksheets)
}
}
/// Remove null bytes from text to prevent PostgreSQL errors
/// This is the ONLY sanitization we do - preserving all other original content
fn remove_null_bytes(text: &str) -> String {
let original_len = text.len();
let cleaned: String = text.chars().filter(|&c| c != '\0').collect();
// Log if we found and removed null bytes (shouldn't happen with valid documents)
let cleaned_len = cleaned.len();
if cleaned_len < original_len {
let null_bytes_removed = text.chars().filter(|&c| c == '\0').count();
warn!(
"Removed {} null bytes from extracted text (original: {} bytes, cleaned: {} bytes). \
This indicates corrupted or malformed document data.",
null_bytes_removed, original_len, cleaned_len
);
}
cleaned
}
/// Validates ZIP entry names to prevent directory traversal attacks
fn validate_zip_entry_name(entry_name: &str) -> Result<()> {
// Check entry name length
if entry_name.len() > Self::MAX_ENTRY_NAME_LENGTH {
return Err(anyhow!(
"ZIP entry name too long ({}). Maximum allowed length is {} characters for security reasons.",
entry_name.len(),
Self::MAX_ENTRY_NAME_LENGTH
));
}
// Check for directory traversal attempts
if entry_name.contains("..") {
return Err(anyhow!(
"ZIP entry contains directory traversal sequence '..': '{}'. This is blocked for security reasons.",
entry_name
));
}
// Check for absolute paths
if entry_name.starts_with('/') || entry_name.starts_with('\\') {
return Err(anyhow!(
"ZIP entry contains absolute path: '{}'. This is blocked for security reasons.",
entry_name
));
}
// Check for Windows drive letters
if entry_name.len() >= 2 && entry_name.chars().nth(1) == Some(':') {
return Err(anyhow!(
"ZIP entry contains Windows drive letter: '{}'. This is blocked for security reasons.",
entry_name
));
}
// Check for suspicious characters
let suspicious_chars = ['<', '>', '|', '*', '?'];
if entry_name.chars().any(|c| suspicious_chars.contains(&c)) {
return Err(anyhow!(
"ZIP entry contains suspicious characters: '{}'. This is blocked for security reasons.",
entry_name
));
}
Ok(())
}
/// Safely reads content from a ZIP entry with size limits and cancellation support
fn read_zip_entry_safely<R: std::io::Read>(
reader: &mut R,
max_size: u64,
context: &ExtractionContext
) -> Result<String> {
use std::io::Read;
let mut buffer = Vec::new();
let mut total_read = 0u64;
let mut temp_buf = [0u8; Self::XML_READ_BUFFER_SIZE];
loop {
// Check for cancellation
if context.is_cancelled() {
return Err(anyhow!("Operation cancelled by user"));
}
match reader.read(&mut temp_buf)? {
0 => break, // EOF
bytes_read => {
total_read += bytes_read as u64;
// Check if we've exceeded the per-file size limit
if total_read > max_size {
return Err(anyhow!(
"ZIP entry content exceeds maximum allowed size of {:.1} MB. \
This may be a ZIP bomb attack. Current size: {:.1} MB.",
max_size as f64 / (1024.0 * 1024.0),
total_read as f64 / (1024.0 * 1024.0)
));
}
// Update total decompressed size across all entries
context.add_decompressed_bytes(bytes_read as u64)?;
buffer.extend_from_slice(&temp_buf[..bytes_read]);
}
}
}
// Convert to string, handling encoding issues gracefully
String::from_utf8(buffer).or_else(|e| {
// Try to recover as much valid UTF-8 as possible
let bytes = e.into_bytes();
let lossy = String::from_utf8_lossy(&bytes);
Ok(lossy.into_owned())
})
}
/// Extract text from Office documents using XML parsing with timeout and cancellation support
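///
/// A minimal usage sketch (the `readur::ocr::xml_extractor` module path and the
/// input file are assumptions for illustration):
///
/// ```no_run
/// # use readur::ocr::xml_extractor::XmlOfficeExtractor;
/// # async fn example() -> anyhow::Result<()> {
/// let extractor = XmlOfficeExtractor::new("/tmp".to_string());
/// let result = extractor
///     .extract_text_from_office(
///         "report.docx", // hypothetical path
///         "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
///     )
///     .await?;
/// println!("{} words via {}", result.word_count, result.extraction_method);
/// # Ok(())
/// # }
/// ```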
pub async fn extract_text_from_office(&self, file_path: &str, mime_type: &str) -> Result<OfficeExtractionResult> {
self.extract_text_from_office_with_timeout(file_path, mime_type, Self::DEFAULT_TIMEOUT_SECONDS).await
}
/// Extract text from Office documents with a custom timeout
/// (clamped to `MAX_TIMEOUT_SECONDS`, i.e. 10 minutes)
pub async fn extract_text_from_office_with_timeout(
&self,
file_path: &str,
mime_type: &str,
timeout_seconds: u64
) -> Result<OfficeExtractionResult> {
let timeout_duration = Duration::from_secs(timeout_seconds.min(Self::MAX_TIMEOUT_SECONDS));
let extraction_future = self.extract_text_from_office_internal(file_path, mime_type);
match timeout(timeout_duration, extraction_future).await {
Ok(result) => result,
Err(_) => Err(OfficeExtractionError::timeout_error(file_path, timeout_seconds))
}
}
/// Internal extraction method with cancellation support
async fn extract_text_from_office_internal(&self, file_path: &str, mime_type: &str) -> Result<OfficeExtractionResult> {
let start_time = Instant::now();
info!("Extracting text from Office document: {} (type: {})", file_path, mime_type);
// Check file size before processing
let metadata = tokio::fs::metadata(file_path).await?;
let file_size = metadata.len();
if file_size > Self::MAX_OFFICE_SIZE {
return Err(OfficeExtractionError::file_too_large_error(
file_path,
file_size as f64 / (1024.0 * 1024.0),
Self::MAX_OFFICE_SIZE as f64 / (1024.0 * 1024.0)
));
}
// Create extraction context for ZIP bomb protection and cancellation support
let context = ExtractionContext::new_with_file_info(Self::MAX_DECOMPRESSED_SIZE, file_size);
match mime_type {
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" => {
self.extract_text_from_docx(file_path, start_time, &context).await
}
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => {
self.extract_text_from_xlsx(file_path, start_time, &context).await
}
"application/msword" => {
self.extract_text_from_legacy_doc(file_path, start_time).await
}
"application/vnd.ms-excel" => {
self.extract_text_from_legacy_excel(file_path, start_time).await
}
"application/vnd.openxmlformats-officedocument.presentationml.presentation" => {
// PPTX extraction is not implemented yet; return conversion guidance instead
Err(OfficeExtractionError::unsupported_format_error(
file_path,
"PowerPoint (PPTX)",
&["PDF", "DOCX", "XLSX", "TXT"]
))
}
_ => {
Err(OfficeExtractionError::unsupported_format_error(
file_path,
mime_type,
&["PDF", "DOCX", "XLSX", "TXT"]
))
}
}
}
/// Extract text from DOCX files using ZIP + XML parsing
async fn extract_text_from_docx(&self, file_path: &str, start_time: Instant, context: &ExtractionContext) -> Result<OfficeExtractionResult> {
info!("Starting DOCX text extraction: {}", file_path);
// Move CPU-intensive operations to blocking thread pool
let file_path_clone = file_path.to_string();
// Share the caller's cancellation flag and decompressed-byte counter so
// cancellation propagates into the blocking task and ZIP bomb accounting
// stays global instead of resetting here
let context_clone = ExtractionContext {
cancelled: Arc::clone(&context.cancelled),
total_decompressed_size: Arc::clone(&context.total_decompressed_size),
max_total_decompressed_size: context.max_total_decompressed_size,
compressed_file_size: context.compressed_file_size,
max_compression_ratio: context.max_compression_ratio,
};
let extraction_result = tokio::task::spawn_blocking(move || -> Result<String> {
use zip::ZipArchive;
use quick_xml::events::Event;
// Open the DOCX file as a ZIP archive
let file = std::fs::File::open(&file_path_clone)?;
let mut archive = ZipArchive::new(file)?;
// Security check: Validate ZIP archive structure
let entry_count = archive.len();
if entry_count > Self::MAX_ZIP_ENTRIES {
return Err(anyhow!(
"ZIP archive contains too many entries ({}). Maximum allowed is {} for security reasons. \
This may be a ZIP bomb attack.",
entry_count,
Self::MAX_ZIP_ENTRIES
));
}
// Validate all entry names before processing to prevent directory traversal
for i in 0..entry_count {
let entry = archive.by_index(i)?;
let entry_name = entry.name();
Self::validate_zip_entry_name(entry_name)?;
}
// Try to extract the main document content from word/document.xml
let mut document_xml = match archive.by_name("word/document.xml") {
Ok(file) => file,
Err(_) => {
return Err(OfficeExtractionError::corrupted_file_error(
&file_path_clone,
"DOCX",
"missing word/document.xml - required component not found"
));
}
};
// Security: Use size-limited reading to prevent ZIP bomb attacks
let xml_content = Self::read_zip_entry_safely(&mut document_xml, Self::MAX_XML_SIZE, &context_clone)?;
drop(document_xml); // Close the archive entry
// Parse the XML and extract text content
let mut reader = Self::create_secure_xml_reader(&xml_content);
let mut text_content = Vec::new();
let mut in_text_element = false;
let mut buf = Vec::new();
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(ref e)) => {
// Look for text elements (w:t tags contain the actual text)
if e.name().as_ref() == b"w:t" {
in_text_element = true;
}
}
Ok(Event::Empty(ref e)) => {
// Handle self-closing elements that represent spacing
match e.name().as_ref() {
b"w:tab" => {
text_content.push("\t".to_string());
}
b"w:br" => {
text_content.push("\n".to_string());
}
b"w:cr" => {
text_content.push("\r".to_string());
}
b"w:space" => {
// Check for xml:space="preserve" attribute
let mut space_count = 1; // Default to one space
for attr in e.attributes() {
if let Ok(attr) = attr {
if attr.key.as_ref() == b"w:count" {
if let Ok(count_str) = std::str::from_utf8(&attr.value) {
space_count = count_str.parse::<usize>().unwrap_or(1);
}
}
}
}
text_content.push(" ".repeat(space_count));
}
_ => {}
}
}
Ok(Event::Text(e)) => {
if in_text_element {
// Extract and decode the text content
let text = e.unescape().map_err(|e| anyhow!("Text unescape error: {}", e))?;
text_content.push(text.into_owned());
}
}
Ok(Event::End(ref e)) => {
if e.name().as_ref() == b"w:t" {
in_text_element = false;
}
// Add proper breaks and spacing to preserve document structure
match e.name().as_ref() {
b"w:p" => {
// End of paragraph - add double newline for better readability
text_content.push("\n\n".to_string());
}
b"w:tr" => {
// End of table row - add single newline
text_content.push("\n".to_string());
}
b"w:tc" => {
// End of table cell - add tab separator
text_content.push("\t".to_string());
}
// Note: no automatic space is added after w:r runs, since that split
// words mid-run; explicit w:space elements and paragraph breaks
// provide the separation instead. Section and page breaks below
// become plain whitespace.
b"w:sectPr" => {
text_content.push("\n\n".to_string());
}
b"w:lastRenderedPageBreak" => {
text_content.push("\n\n".to_string());
}
_ => {}
}
}
Ok(Event::Eof) => break,
Err(e) => {
return Err(OfficeExtractionError::corrupted_file_error(
&file_path_clone,
"DOCX",
&format!("XML parsing error - {}", e)
));
}
_ => {}
}
buf.clear();
}
// Join all text content and clean it up for better readability
let raw_text = text_content.join("");
let cleaned_text = Self::clean_extracted_text(&raw_text);
// Check if we have actual text content
if cleaned_text.trim().is_empty() {
return Err(OfficeExtractionError::empty_document_error(&file_path_clone, "DOCX"));
}
Ok(cleaned_text)
}).await??;
let processing_time = start_time.elapsed().as_millis() as u64;
// Only remove null bytes - preserve all original formatting
let cleaned_text = Self::remove_null_bytes(&extraction_result);
let word_count = self.count_words_safely(&cleaned_text);
info!(
"DOCX extraction completed: {} words extracted from '{}' in {}ms",
word_count, file_path, processing_time
);
Ok(OfficeExtractionResult {
text: cleaned_text,
confidence: 100.0, // Direct text extraction has perfect confidence
processing_time_ms: processing_time,
word_count,
extraction_method: "DOCX XML extraction".to_string(),
})
}
/// Extract text from XLSX files using ZIP + XML parsing
async fn extract_text_from_xlsx(&self, file_path: &str, start_time: Instant, context: &ExtractionContext) -> Result<OfficeExtractionResult> {
info!("Starting XLSX text extraction: {}", file_path);
// Move CPU-intensive operations to blocking thread pool
let file_path_clone = file_path.to_string();
// Share the caller's cancellation flag and decompressed-byte counter so
// cancellation propagates into the blocking task and ZIP bomb accounting
// stays global instead of resetting here
let context_clone = ExtractionContext {
cancelled: Arc::clone(&context.cancelled),
total_decompressed_size: Arc::clone(&context.total_decompressed_size),
max_total_decompressed_size: context.max_total_decompressed_size,
compressed_file_size: context.compressed_file_size,
max_compression_ratio: context.max_compression_ratio,
};
let extraction_result = tokio::task::spawn_blocking(move || -> Result<String> {
use zip::ZipArchive;
use quick_xml::events::Event;
// Open the XLSX file as a ZIP archive
let file = std::fs::File::open(&file_path_clone)?;
let mut archive = ZipArchive::new(file)?;
// Security check: Validate ZIP archive structure
let entry_count = archive.len();
if entry_count > Self::MAX_ZIP_ENTRIES {
return Err(anyhow!(
"ZIP archive contains too many entries ({}). Maximum allowed is {} for security reasons. \
This may be a ZIP bomb attack.",
entry_count,
Self::MAX_ZIP_ENTRIES
));
}
// Validate all entry names before processing to prevent directory traversal
for i in 0..entry_count {
let entry = archive.by_index(i)?;
let entry_name = entry.name();
Self::validate_zip_entry_name(entry_name)?;
}
// First, extract shared strings (xl/sharedStrings.xml)
let mut shared_strings = Vec::new();
if let Ok(mut shared_strings_file) = archive.by_name("xl/sharedStrings.xml") {
// Security: Use size-limited reading to prevent ZIP bomb attacks
let xml_content = Self::read_zip_entry_safely(&mut shared_strings_file, Self::MAX_XML_SIZE, &context_clone)?;
drop(shared_strings_file);
// Parse shared strings
let mut reader = Self::create_secure_xml_reader(&xml_content);
let mut buf = Vec::new();
let mut in_string = false;
let mut current_string = String::new();
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(ref e)) => {
if e.name().as_ref() == b"t" {
in_string = true;
current_string.clear();
}
}
Ok(Event::Text(e)) => {
if in_string {
let text = e.unescape().map_err(|e| anyhow!("Text unescape error: {}", e))?;
current_string.push_str(&text);
}
}
Ok(Event::End(ref e)) => {
if e.name().as_ref() == b"t" {
in_string = false;
shared_strings.push(current_string.clone());
current_string.clear();
}
}
Ok(Event::Eof) => break,
Err(e) => {
return Err(OfficeExtractionError::corrupted_file_error(
&file_path_clone,
"XLSX",
&format!("shared strings XML parsing error - {}", e)
));
}
_ => {}
}
buf.clear();
}
}
// Now extract worksheet data
let mut all_text = Vec::new();
let mut worksheet_count = 0;
// Get actual worksheet names from workbook.xml instead of guessing
let worksheet_names = Self::get_worksheet_names_from_workbook(&mut archive, &context_clone)?;
// Process each worksheet
for worksheet_filename in worksheet_names {
let worksheet_path = format!("xl/worksheets/{}", worksheet_filename);
if let Ok(mut worksheet_file) = archive.by_name(&worksheet_path) {
worksheet_count += 1;
// Security: Use size-limited reading to prevent ZIP bomb attacks
let xml_content = Self::read_zip_entry_safely(&mut worksheet_file, Self::MAX_XML_SIZE, &context_clone)?;
drop(worksheet_file);
// Parse worksheet data
let mut reader = Self::create_secure_xml_reader(&xml_content);
let mut buf = Vec::new();
let mut in_cell_value = false;
let mut current_cell_type = String::new();
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(ref e)) => {
if e.name().as_ref() == b"c" {
// Cell element - check if it has a type attribute
current_cell_type.clear();
for attr in e.attributes() {
if let Ok(attr) = attr {
if attr.key.as_ref() == b"t" {
current_cell_type = String::from_utf8_lossy(&attr.value).to_string();
}
}
}
} else if e.name().as_ref() == b"v" {
// Cell value
in_cell_value = true;
}
}
Ok(Event::Text(e)) => {
if in_cell_value {
let text = e.unescape().map_err(|e| anyhow!("Text unescape error: {}", e))?;
// If this is a shared string reference (t="s"), look up the string
if current_cell_type == "s" {
if let Ok(index) = text.parse::<usize>() {
if let Some(shared_string) = shared_strings.get(index) {
all_text.push(shared_string.clone());
}
}
} else {
// Direct value
all_text.push(text.into_owned());
}
}
}
Ok(Event::End(ref e)) => {
if e.name().as_ref() == b"v" {
in_cell_value = false;
}
}
Ok(Event::Eof) => break,
Err(e) => {
return Err(OfficeExtractionError::corrupted_file_error(
&file_path_clone,
"XLSX",
&format!("worksheet '{}' XML parsing error - {}", worksheet_path, e)
));
}
_ => {}
}
buf.clear();
}
}
}
if worksheet_count == 0 {
return Err(OfficeExtractionError::corrupted_file_error(
&file_path_clone,
"XLSX",
"no worksheets found - file structure is invalid"
));
}
// Join all text content with spaces
let raw_text = all_text.join(" ");
if raw_text.trim().is_empty() {
return Err(OfficeExtractionError::empty_document_error(&file_path_clone, "XLSX"));
}
Ok(raw_text)
}).await??;
let processing_time = start_time.elapsed().as_millis() as u64;
// Only remove null bytes - preserve all original formatting
let cleaned_text = Self::remove_null_bytes(&extraction_result);
let word_count = self.count_words_safely(&cleaned_text);
info!(
"XLSX extraction completed: {} words extracted from '{}' in {}ms",
word_count, file_path, processing_time
);
Ok(OfficeExtractionResult {
text: cleaned_text,
confidence: 100.0, // Direct text extraction has perfect confidence
processing_time_ms: processing_time,
word_count,
extraction_method: "XLSX XML extraction".to_string(),
})
}
/// Extract text from legacy DOC files using external tools (antiword, catdoc, wvText)
async fn extract_text_from_legacy_doc(&self, file_path: &str, start_time: Instant) -> Result<OfficeExtractionResult> {
info!("Processing legacy DOC file: {}", file_path);
// Validate file path for security
self.validate_file_path_security(file_path)?;
// Try external tools in order of preference
let tools = vec![
("antiword", vec![file_path]),
("catdoc", vec![file_path]),
("wvText", vec![file_path]),
];
let mut last_error: Option<String> = None;
let mut tried_tools = Vec::new();
for (tool_name, args) in tools {
tried_tools.push(tool_name);
info!("Attempting DOC extraction with {}", tool_name);
match self.try_external_tool(tool_name, &args, file_path).await {
Ok(extracted_text) => {
let processing_time = start_time.elapsed().as_millis() as u64;
// Clean and validate the extracted text
let cleaned_text = Self::clean_extracted_text(&extracted_text);
let sanitized_text = Self::remove_null_bytes(&cleaned_text);
if sanitized_text.trim().is_empty() {
return Err(OfficeExtractionError::empty_document_error(file_path, "DOC"));
}
let word_count = self.count_words_safely(&sanitized_text);
info!(
"DOC extraction succeeded with {}: {} words extracted from '{}' in {}ms",
tool_name, word_count, file_path, processing_time
);
return Ok(OfficeExtractionResult {
text: sanitized_text,
confidence: 90.0, // External tool extraction has good but not perfect confidence
processing_time_ms: processing_time,
word_count,
extraction_method: format!("DOC external tool ({})", tool_name),
});
}
Err(e) => {
warn!("DOC extraction with {} failed: {}", tool_name, e);
last_error = Some(e.to_string());
}
}
}
// All tools failed
let processing_time = start_time.elapsed().as_millis() as u64;
let error_message = format!(
"None of the DOC extraction tools (antiword, catdoc, wvText) are available or working.\n\
\n\
Tried tools: {}\n\
Processing time: {}ms\n\
\n\
This file is in the legacy Microsoft Word (.doc) binary format which requires \
external tools for text extraction.\n\
\n\
To extract text from DOC files, please install one of these tools:\n\
• antiword: sudo apt-get install antiword (Ubuntu/Debian)\n\
• catdoc: sudo apt-get install catdoc (Ubuntu/Debian)\n\
• wvText: sudo apt-get install wv (Ubuntu/Debian)\n\
\n\
Last error: {}\n\
\n\
Alternatively, you can:\n\
1. Convert the file to DOCX format using Microsoft Word or LibreOffice\n\
2. Save/export as PDF format\n\
3. Copy and paste the text into a new DOCX document\n\
4. Use online conversion tools to convert DOC to DOCX",
tried_tools.join(", "),
processing_time,
last_error.unwrap_or_else(|| "All extraction methods failed".to_string())
);
Err(anyhow::anyhow!(error_message))
}
/// Extract text from legacy Excel files - provide guidance for now
async fn extract_text_from_legacy_excel(&self, file_path: &str, start_time: Instant) -> Result<OfficeExtractionResult> {
info!("Processing legacy Excel (XLS) file: {}", file_path);
let _processing_time = start_time.elapsed().as_millis() as u64;
// Legacy XLS files are complex binary format, suggest conversion
Err(OfficeExtractionError::unsupported_format_error(
file_path,
"Legacy Excel (.xls)",
&["XLSX", "PDF", "CSV", "TXT"]
))
}
/// Clean extracted text to improve readability and structure
fn clean_extracted_text(text: &str) -> String {
use regex::Regex;
// Regex patterns for cleaning. Note: these are recompiled on every call;
// hoist them behind a lazy static if this path becomes hot.
let multiple_spaces = Regex::new(r" {3,}").unwrap(); // 3+ spaces -> single space
let multiple_newlines = Regex::new(r"\n{3,}").unwrap(); // 3+ newlines -> 2 newlines
let space_before_newline = Regex::new(r" +\n").unwrap(); // strip spaces before newlines
let newline_before_space = Regex::new(r"\n +").unwrap(); // strip spaces after newlines
let mixed_whitespace = Regex::new(r"[ \t]+").unwrap(); // runs of tabs/spaces -> single space
// Pattern to fix concatenated words like "ExecutiveSummary" -> "Executive Summary"
// This looks for lowercase-uppercase transitions and adds a space
let word_boundaries = Regex::new(r"([a-z])([A-Z])").unwrap();
let mut cleaned = text.to_string();
// First, fix word boundaries that got concatenated
cleaned = word_boundaries.replace_all(&cleaned, "$1 $2").to_string();
// Clean up excessive whitespace
cleaned = multiple_spaces.replace_all(&cleaned, " ").to_string();
cleaned = multiple_newlines.replace_all(&cleaned, "\n\n").to_string();
cleaned = space_before_newline.replace_all(&cleaned, "\n").to_string();
cleaned = newline_before_space.replace_all(&cleaned, "\n").to_string();
cleaned = mixed_whitespace.replace_all(&cleaned, " ").to_string();
// Remove leading/trailing whitespace but preserve internal structure
cleaned.trim().to_string()
}
/// Safely count words to prevent overflow on very large texts
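///
/// A small illustration (the crate/module path here is an assumption):
///
/// ```no_run
/// # use readur::ocr::xml_extractor::XmlOfficeExtractor;
/// let extractor = XmlOfficeExtractor::new("/tmp".to_string());
/// assert_eq!(extractor.count_words_safely("three word phrase"), 3);
/// // Continuous camelCase text falls back to transition counting
/// assert_eq!(extractor.count_words_safely("QuarterlyRevenueReport"), 3);
/// ```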
pub fn count_words_safely(&self, text: &str) -> usize {
// Early return for empty or tiny texts
if text.trim().is_empty() {
return 0;
}
// For very large texts, use sampling to estimate word count
const LARGE_TEXT_THRESHOLD: usize = 1_000_000; // 1MB
const SAMPLE_SIZE: usize = 100_000; // 100KB samples
if text.len() > LARGE_TEXT_THRESHOLD {
warn!(
"Text is very large ({:.1} MB), using sampling method for word count estimation",
text.len() as f64 / (1024.0 * 1024.0)
);
// Use multiple samples for better accuracy on very large texts
let num_samples = 3;
let sample_size = SAMPLE_SIZE.min(text.len() / num_samples);
let mut total_estimated_words = 0;
// Sample from beginning, middle, and end
for i in 0..num_samples {
let start = (text.len() / num_samples) * i;
let end = (start + sample_size).min(text.len());
// Ensure we sample complete characters (UTF-8 safe)
let sample_start = Self::floor_char_boundary(text, start);
let sample_end = Self::floor_char_boundary(text, end);
if sample_end > sample_start {
let sample = &text[sample_start..sample_end];
let sample_words = self.count_words_in_text_optimized(sample);
// Extrapolate this sample to the full text
let sample_ratio = text.len() as f64 / (sample_end - sample_start) as f64;
let estimated_from_sample = (sample_words as f64 * sample_ratio / num_samples as f64) as usize;
total_estimated_words += estimated_from_sample;
}
}
// Cap at the shared maximum to prevent display issues
total_estimated_words.min(Self::MAX_WORD_COUNT_DISPLAY)
} else if text.len() > 50_000 { // 50KB - use optimized counting for medium texts
self.count_words_in_text_optimized(text)
} else {
// Small texts can use the full algorithm
self.count_words_in_text(text)
}
}
/// Find the nearest character boundary at or below `index`
/// (a stable-Rust replacement for the unstable `str::floor_char_boundary`)
fn floor_char_boundary(text: &str, index: usize) -> usize {
if index >= text.len() {
return text.len();
}
// Find the start of a UTF-8 character by backing up until we find a valid char boundary
let mut boundary = index;
while boundary > 0 && !text.is_char_boundary(boundary) {
boundary -= 1;
}
boundary
}
/// Optimized word counting for medium-large texts
fn count_words_in_text_optimized(&self, text: &str) -> usize {
// For performance, use a simpler approach for medium-large texts
let mut word_count = 0;
let mut in_word = false;
for ch in text.chars() {
if ch.is_whitespace() {
if in_word {
word_count += 1;
in_word = false;
}
} else if ch.is_alphanumeric() {
in_word = true;
}
// Ignore pure punctuation
}
// Count the last word if text doesn't end with whitespace
if in_word {
word_count += 1;
}
word_count
}
/// Full word counting with heuristics for continuous (unspaced) text
fn count_words_in_text(&self, text: &str) -> usize {
let whitespace_words = text.split_whitespace().count();
// If we have exactly 1 "word" but it's very long (likely continuous text), try enhanced detection
// OR if we have no whitespace words but text exists
let is_continuous_text = whitespace_words == 1 && text.len() > 15; // 15+ chars suggests it might be continuous
let is_no_words = whitespace_words == 0 && !text.trim().is_empty();
if is_continuous_text || is_no_words {
// Count total alphanumeric characters first
let alphanumeric_chars = text.chars().filter(|c| c.is_alphanumeric()).count();
// If no alphanumeric content, it's pure punctuation/symbols
if alphanumeric_chars == 0 {
return 0;
}
// For continuous text, look for word boundaries using multiple strategies
let mut word_count = 0;
// Strategy 1: Count transitions from lowercase to uppercase (camelCase detection)
let chars: Vec<char> = text.chars().collect();
let mut camel_transitions = 0;
for i in 1..chars.len() {
let prev_char = chars[i-1];
let curr_char = chars[i];
// Count transitions from lowercase letter to uppercase letter
if prev_char.is_lowercase() && curr_char.is_uppercase() {
camel_transitions += 1;
}
// Count transitions from letter to digit or digit to letter
else if (prev_char.is_alphabetic() && curr_char.is_numeric()) ||
(prev_char.is_numeric() && curr_char.is_alphabetic()) {
camel_transitions += 1;
}
}
// If we found camelCase transitions, estimate words
if camel_transitions > 0 {
word_count = camel_transitions + 1; // +1 for the first word
}
// Strategy 2: If no camelCase detected, estimate based on character count
if word_count == 0 {
// Estimate based on typical word length (WORD_LENGTH_ESTIMATE chars per word)
word_count = (alphanumeric_chars / Self::WORD_LENGTH_ESTIMATE).max(1);
}
word_count
} else {
whitespace_words
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use tempfile::TempDir;
fn create_test_extractor() -> (XmlOfficeExtractor, TempDir) {
let temp_dir = TempDir::new().unwrap();
let extractor = XmlOfficeExtractor::new(temp_dir.path().to_string_lossy().to_string());
(extractor, temp_dir)
}
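// Sketch test for the From conversion defined above; it checks the
// OfficeExtractionResult -> OcrResult field mapping in this file.
#[test]
fn test_office_result_converts_to_ocr_result() {
let office = OfficeExtractionResult {
text: "hello world".to_string(),
confidence: 100.0,
processing_time_ms: 5,
word_count: 2,
extraction_method: "DOCX XML extraction".to_string(),
};
let ocr: OcrResult = office.into();
assert_eq!(ocr.text, "hello world");
assert_eq!(ocr.preprocessing_applied, vec!["DOCX XML extraction".to_string()]);
assert!(ocr.processed_image_path.is_none());
}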
#[test]
fn test_validate_zip_entry_name() {
// Valid names should pass
assert!(XmlOfficeExtractor::validate_zip_entry_name("word/document.xml").is_ok());
assert!(XmlOfficeExtractor::validate_zip_entry_name("xl/worksheets/sheet1.xml").is_ok());
// Invalid names should fail
assert!(XmlOfficeExtractor::validate_zip_entry_name("../../../etc/passwd").is_err());
assert!(XmlOfficeExtractor::validate_zip_entry_name("/etc/passwd").is_err());
assert!(XmlOfficeExtractor::validate_zip_entry_name("C:\\windows\\system32\\cmd.exe").is_err());
assert!(XmlOfficeExtractor::validate_zip_entry_name("file<script>alert(1)</script>.xml").is_err());
// Too long name should fail
let long_name = "a".repeat(300);
assert!(XmlOfficeExtractor::validate_zip_entry_name(&long_name).is_err());
}
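// Sketch tests for ExtractionContext's safety accounting, using the
// limits and error wording defined earlier in this file.
#[test]
fn test_extraction_context_limits() {
// Absolute decompressed-size limit
let context = ExtractionContext::new(1024);
assert!(context.add_decompressed_bytes(512).is_ok());
let err = context.add_decompressed_bytes(1024).unwrap_err();
assert!(err.to_string().contains("security"));
// Compression ratio limit (1000:1 default) with a 1-byte "compressed" file
let context = ExtractionContext::new_with_file_info(u64::MAX, 1);
let err = context.add_decompressed_bytes(2000).unwrap_err();
assert!(err.to_string().contains("compression ratio"));
// Cancellation flag round-trip
let context = ExtractionContext::new(1024);
assert!(!context.is_cancelled());
context.cancel();
assert!(context.is_cancelled());
}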
#[test]
fn test_remove_null_bytes() {
let text_with_nulls = "Hello\0World\0Test";
let cleaned = XmlOfficeExtractor::remove_null_bytes(text_with_nulls);
assert_eq!(cleaned, "HelloWorldTest");
let text_without_nulls = "Hello World Test";
let cleaned = XmlOfficeExtractor::remove_null_bytes(text_without_nulls);
assert_eq!(cleaned, "Hello World Test");
}
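// Sketch test for clean_extracted_text, exercising the regex passes
// documented above (word-boundary fixing and whitespace collapsing).
#[test]
fn test_clean_extracted_text() {
// lowercase->uppercase transitions get a space inserted
assert_eq!(XmlOfficeExtractor::clean_extracted_text("ExecutiveSummary"), "Executive Summary");
// runs of 3+ spaces collapse to a single space
assert_eq!(XmlOfficeExtractor::clean_extracted_text("Hello    World"), "Hello World");
// runs of 3+ newlines collapse to a paragraph break
assert_eq!(XmlOfficeExtractor::clean_extracted_text("a\n\n\n\nb"), "a\n\nb");
// leading/trailing whitespace is trimmed
assert_eq!(XmlOfficeExtractor::clean_extracted_text("  padded  "), "padded");
}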
#[test]
fn test_count_words_safely() {
let (extractor, _temp_dir) = create_test_extractor();
// Normal text
assert_eq!(extractor.count_words_safely("Hello world test"), 3);
// Empty text
assert_eq!(extractor.count_words_safely(""), 0);
assert_eq!(extractor.count_words_safely(" "), 0);
// Continuous text without spaces
assert!(extractor.count_words_safely("HelloWorldTestingCamelCase") > 0);
// Very large text should not panic
let large_text = "word ".repeat(500_000); // 2MB+ of text
let word_count = extractor.count_words_safely(&large_text);
assert!(word_count > 0);
assert!(word_count <= 10_000_000); // Should be capped
}
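// Sketch test for the UTF-8 boundary helper used by the sampling path
// of count_words_safely.
#[test]
fn test_floor_char_boundary() {
let text = "héllo"; // 'é' occupies bytes 1..3
assert_eq!(XmlOfficeExtractor::floor_char_boundary(text, 0), 0);
assert_eq!(XmlOfficeExtractor::floor_char_boundary(text, 2), 1); // mid-character index backs up
assert_eq!(XmlOfficeExtractor::floor_char_boundary(text, 999), text.len()); // past-end clamps
}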
#[test]
fn test_read_zip_entry_safely() {
use std::io::Cursor;
let context = ExtractionContext::new(10 * 1024 * 1024); // 10MB limit
// Test normal sized content
let small_content = b"Hello World";
let mut cursor = Cursor::new(small_content);
let result = XmlOfficeExtractor::read_zip_entry_safely(&mut cursor, 1024, &context);
assert!(result.is_ok());
assert_eq!(result.unwrap(), "Hello World");
// Test oversized content
let large_content = vec![b'A'; 2048];
let mut cursor = Cursor::new(large_content);
let result = XmlOfficeExtractor::read_zip_entry_safely(&mut cursor, 1024, &context);
assert!(result.is_err());
assert!(result.unwrap_err().to_string().contains("exceeds maximum allowed size"));
}
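// Sketch test for the path validation that runs before external DOC
// tools are invoked; the error cases are rejected before the
// file-existence check.
#[test]
fn test_validate_file_path_security() {
let (extractor, temp_dir) = create_test_extractor();
// A real file with a benign path passes
let file_path = temp_dir.path().join("document.doc");
std::fs::write(&file_path, b"test").unwrap();
assert!(extractor.validate_file_path_security(file_path.to_str().unwrap()).is_ok());
// Directory traversal, shell metacharacters, and null bytes are rejected
assert!(extractor.validate_file_path_security("../etc/passwd").is_err());
assert!(extractor.validate_file_path_security("some;file.doc").is_err());
assert!(extractor.validate_file_path_security("some\0file.doc").is_err());
}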
}