// Source listing: Rust, 1429 lines, 62 KiB
use anyhow::{anyhow, Result};
|
|
use tracing::{info, warn};
|
|
use std::time::Instant;
|
|
use std::sync::Arc;
|
|
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
|
|
use tokio::time::{timeout, Duration};
|
|
use super::enhanced::OcrResult;
|
|
|
|
/// User-friendly error messages for Office document extraction issues
|
|
pub struct OfficeExtractionError;
|
|
|
|
impl OfficeExtractionError {
|
|
/// Create a user-friendly timeout error
|
|
pub fn timeout_error(file_path: &str, timeout_seconds: u64) -> anyhow::Error {
|
|
anyhow!(
|
|
"Document processing timed out after {} seconds.\n\
|
|
\n\
|
|
The file '{}' is taking too long to process, which may indicate:\n\
|
|
• Very large or complex document structure\n\
|
|
• Document contains many embedded objects or images\n\
|
|
• Corrupted or damaged file\n\
|
|
\n\
|
|
Suggestions to resolve this issue:\n\
|
|
1. Convert the document to PDF format (often processes faster)\n\
|
|
2. Split large documents into smaller sections\n\
|
|
3. Remove or compress embedded images/objects\n\
|
|
4. Try opening and re-saving the document to fix potential corruption\n\
|
|
5. Contact support if this is an important document that consistently fails",
|
|
timeout_seconds, file_path
|
|
)
|
|
}
|
|
|
|
/// Create a user-friendly file size error
|
|
pub fn file_too_large_error(file_path: &str, file_size_mb: f64, max_size_mb: f64) -> anyhow::Error {
|
|
anyhow!(
|
|
"Document is too large to process safely.\n\
|
|
\n\
|
|
The file '{}' is {:.1} MB, but the maximum allowed size is {:.1} MB.\n\
|
|
\n\
|
|
This limit helps prevent system overload and ensures reliable processing.\n\
|
|
\n\
|
|
Suggestions to resolve this issue:\n\
|
|
1. Split the document into smaller files (recommended)\n\
|
|
2. Reduce image quality or remove unnecessary images\n\
|
|
3. Convert to PDF format which often compresses better\n\
|
|
4. Remove embedded objects, videos, or audio files\n\
|
|
5. Process individual sections separately if splitting isn't practical",
|
|
file_path, file_size_mb, max_size_mb
|
|
)
|
|
}
|
|
|
|
/// Create a user-friendly corrupted file error
|
|
pub fn corrupted_file_error(file_path: &str, file_type: &str, specific_issue: &str) -> anyhow::Error {
|
|
anyhow!(
|
|
"Unable to process document - file appears corrupted or invalid.\n\
|
|
\n\
|
|
The {} file '{}' could not be processed due to: {}\n\
|
|
\n\
|
|
This typically indicates:\n\
|
|
• File corruption during transfer or storage\n\
|
|
• Incomplete download or truncated file\n\
|
|
• File format doesn't match the expected structure\n\
|
|
• Document was created with incompatible software\n\
|
|
\n\
|
|
Suggestions to resolve this issue:\n\
|
|
1. Re-download or re-obtain the original file\n\
|
|
2. Open the document in its native application and re-save it\n\
|
|
3. Try converting the document to PDF format first\n\
|
|
4. Use a file repair tool if available\n\
|
|
5. Contact the document creator for a fresh copy",
|
|
file_type, file_path, specific_issue
|
|
)
|
|
}
|
|
|
|
/// Create a user-friendly empty document error
|
|
pub fn empty_document_error(file_path: &str, document_type: &str) -> anyhow::Error {
|
|
anyhow!(
|
|
"No text content found in document.\n\
|
|
\n\
|
|
The {} file '{}' appears to be empty or contains no extractable text.\n\
|
|
\n\
|
|
This could mean:\n\
|
|
• Document contains only images, charts, or graphics\n\
|
|
• All content is in unsupported formats (e.g., embedded objects)\n\
|
|
• Document is password-protected or encrypted\n\
|
|
• File contains only formatting with no actual text\n\
|
|
\n\
|
|
Suggestions:\n\
|
|
1. Check if the document has visible content when opened normally\n\
|
|
2. If it contains images with text, convert to PDF and try again\n\
|
|
3. Copy and paste content into a new document if possible\n\
|
|
4. Remove password protection if the document is encrypted\n\
|
|
5. Contact support if you believe this document should contain text",
|
|
document_type, file_path
|
|
)
|
|
}
|
|
|
|
/// Create a user-friendly unsupported format error
|
|
pub fn unsupported_format_error(file_path: &str, file_format: &str, suggested_formats: &[&str]) -> anyhow::Error {
|
|
let format_list = suggested_formats.join(", ");
|
|
anyhow!(
|
|
"Document format not supported for text extraction.\n\
|
|
\n\
|
|
The file '{}' is in {} format, which is not currently supported for automatic text extraction.\n\
|
|
\n\
|
|
Supported formats include: {}\n\
|
|
\n\
|
|
Suggestions to process this document:\n\
|
|
1. Convert to a supported format (PDF recommended)\n\
|
|
2. Open in the original application and export/save as supported format\n\
|
|
3. Copy text manually and paste into a supported document type\n\
|
|
4. Use online conversion tools to change the format\n\
|
|
5. Contact support if you frequently work with this format",
|
|
file_path, file_format, format_list
|
|
)
|
|
}
|
|
|
|
/// Create a user-friendly ZIP bomb protection error
|
|
pub fn zip_bomb_protection_error(current_size_mb: f64, max_size_mb: f64) -> anyhow::Error {
|
|
anyhow!(
|
|
"Document processing stopped for security reasons.\n\
|
|
\n\
|
|
The document's internal structure expanded to {:.1} MB when processed, \
|
|
exceeding the safety limit of {:.1} MB.\n\
|
|
\n\
|
|
This protection prevents potential 'ZIP bomb' attacks that could overwhelm the system.\n\
|
|
\n\
|
|
If this is a legitimate document:\n\
|
|
1. The document may be extremely large or complex\n\
|
|
2. Try splitting it into smaller sections\n\
|
|
3. Convert to PDF format which may process more efficiently\n\
|
|
4. Remove large embedded objects or images\n\
|
|
5. Contact support if you believe this is a valid business document",
|
|
current_size_mb, max_size_mb
|
|
)
|
|
}
|
|
}
|
|
|
|
/// Result structure for Office document text extraction
|
|
#[derive(Debug, Clone)]
|
|
pub struct OfficeExtractionResult {
|
|
pub text: String,
|
|
pub confidence: f32,
|
|
pub processing_time_ms: u64,
|
|
pub word_count: usize,
|
|
pub extraction_method: String,
|
|
}
|
|
|
|
impl From<OfficeExtractionResult> for OcrResult {
|
|
/// Convert OfficeExtractionResult to OcrResult for compatibility with the main OCR service
|
|
fn from(office_result: OfficeExtractionResult) -> Self {
|
|
OcrResult {
|
|
text: office_result.text,
|
|
confidence: office_result.confidence,
|
|
processing_time_ms: office_result.processing_time_ms,
|
|
word_count: office_result.word_count,
|
|
preprocessing_applied: vec![office_result.extraction_method],
|
|
processed_image_path: None, // XML extraction doesn't produce processed images
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Extraction context for tracking progress and supporting cancellation
|
|
pub struct ExtractionContext {
|
|
/// Flag to indicate if the operation should be cancelled
|
|
pub cancelled: Arc<AtomicBool>,
|
|
/// Total decompressed size across all ZIP entries (for ZIP bomb protection)
|
|
pub total_decompressed_size: Arc<AtomicU64>,
|
|
/// Maximum allowed total decompressed size
|
|
pub max_total_decompressed_size: u64,
|
|
/// Original compressed file size for compression ratio calculations
|
|
pub compressed_file_size: u64,
|
|
/// Maximum allowed compression ratio (decompressed/compressed)
|
|
pub max_compression_ratio: f64,
|
|
}
|
|
|
|
impl ExtractionContext {
|
|
pub fn new(max_total_decompressed_size: u64) -> Self {
|
|
Self {
|
|
cancelled: Arc::new(AtomicBool::new(false)),
|
|
total_decompressed_size: Arc::new(AtomicU64::new(0)),
|
|
max_total_decompressed_size,
|
|
compressed_file_size: 0, // Will be set when file is processed
|
|
max_compression_ratio: 1000.0, // Allow up to 1000:1 ratio (should catch most ZIP bombs)
|
|
}
|
|
}
|
|
|
|
pub fn new_with_file_info(max_total_decompressed_size: u64, compressed_file_size: u64) -> Self {
|
|
Self {
|
|
cancelled: Arc::new(AtomicBool::new(false)),
|
|
total_decompressed_size: Arc::new(AtomicU64::new(0)),
|
|
max_total_decompressed_size,
|
|
compressed_file_size,
|
|
max_compression_ratio: 1000.0, // Allow up to 1000:1 ratio
|
|
}
|
|
}
|
|
|
|
pub fn cancel(&self) {
|
|
self.cancelled.store(true, Ordering::SeqCst);
|
|
}
|
|
|
|
pub fn is_cancelled(&self) -> bool {
|
|
self.cancelled.load(Ordering::SeqCst)
|
|
}
|
|
|
|
pub fn add_decompressed_bytes(&self, bytes: u64) -> Result<()> {
|
|
let new_total = self.total_decompressed_size.fetch_add(bytes, Ordering::SeqCst) + bytes;
|
|
|
|
// Check absolute size limit
|
|
if new_total > self.max_total_decompressed_size {
|
|
return Err(OfficeExtractionError::zip_bomb_protection_error(
|
|
new_total as f64 / (1024.0 * 1024.0),
|
|
self.max_total_decompressed_size as f64 / (1024.0 * 1024.0)
|
|
));
|
|
}
|
|
|
|
// Check compression ratio if we have file size info
|
|
if self.compressed_file_size > 0 {
|
|
let current_ratio = new_total as f64 / self.compressed_file_size as f64;
|
|
if current_ratio > self.max_compression_ratio {
|
|
return Err(anyhow!(
|
|
"Document compression ratio is suspiciously high: {:.1}:1 (limit: {:.1}:1).\n\
|
|
\n\
|
|
The document expanded from {:.1} MB to {:.1} MB when processed, \
|
|
which indicates a potential ZIP bomb attack.\n\
|
|
\n\
|
|
ZIP bombs are malicious files designed to consume system resources \
|
|
by expanding to enormous sizes when decompressed.\n\
|
|
\n\
|
|
If this is a legitimate document:\n\
|
|
1. The file may contain highly repetitive content\n\
|
|
2. Try converting to PDF format first\n\
|
|
3. Split the document into smaller sections\n\
|
|
4. Contact support if this is a valid business document",
|
|
current_ratio,
|
|
self.max_compression_ratio,
|
|
self.compressed_file_size as f64 / (1024.0 * 1024.0),
|
|
new_total as f64 / (1024.0 * 1024.0)
|
|
));
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
/// XML-based Office document extractor with security features.
///
/// Extraction limits (sizes, entry counts, timeouts) are associated constants on the
/// `impl` block rather than fields of this struct.
pub struct XmlOfficeExtractor {
    /// Temporary directory for file processing.
    // NOTE(review): not read by any method visible in this part of the file — confirm usage.
    pub temp_dir: String,
}
|
|
|
|
impl XmlOfficeExtractor {
|
|
// Security limits to prevent ZIP bombs and memory exhaustion attacks

/// Total decompressed size allowed across all ZIP entries (100 MB).
const MAX_DECOMPRESSED_SIZE: u64 = 100 * 1024 * 1024;
/// Largest single XML entry that will be read (10 MB).
const MAX_XML_SIZE: u64 = 10 * 1024 * 1024;
/// Maximum number of entries a ZIP archive may contain.
const MAX_ZIP_ENTRIES: usize = 1000;
/// Maximum length of a ZIP entry name, measured with `str::len` (bytes).
const MAX_ENTRY_NAME_LENGTH: usize = 255;
/// Largest Office file on disk that will be processed (50 MB).
const MAX_OFFICE_SIZE: u64 = 50 * 1024 * 1024;

// Operation timeout constants

/// Timeout applied when the caller does not specify one (2 minutes).
const DEFAULT_TIMEOUT_SECONDS: u64 = 120;
/// Upper bound that caller-supplied timeouts are clamped to (10 minutes).
const MAX_TIMEOUT_SECONDS: u64 = 600;

// XML processing constants

/// Chunk size used when reading ZIP entries (8 KB).
const XML_READ_BUFFER_SIZE: usize = 8192;
/// Number of sequential `sheetN.xml` names generated when the workbook cannot be parsed.
const MAX_WORKSHEETS_TO_CHECK: usize = 50;
/// Average characters per word for estimation.
// NOTE(review): not referenced in the visible part of the file — confirm usage.
const WORD_LENGTH_ESTIMATE: usize = 5;
/// Maximum word count to prevent display issues.
// NOTE(review): not referenced in the visible part of the file — confirm usage.
const MAX_WORD_COUNT_DISPLAY: usize = 10_000_000;

// XML entity limits to prevent expansion attacks
// NOTE(review): neither limit below is referenced in the visible part of the file — confirm
// they are actually enforced somewhere.

/// Maximum number of entity expansions.
const MAX_ENTITY_EXPANSIONS: usize = 1000;
/// Maximum depth of nested entity references.
const MAX_ENTITY_DEPTH: usize = 10;
|
|
|
|
/// Create a new XML Office extractor
|
|
pub fn new(temp_dir: String) -> Self {
|
|
Self { temp_dir }
|
|
}
|
|
|
|
/// Create a secure XML reader with protection against entity expansion attacks
|
|
fn create_secure_xml_reader(xml_content: &str) -> quick_xml::Reader<&[u8]> {
|
|
use quick_xml::Reader;
|
|
|
|
let mut reader = Reader::from_str(xml_content);
|
|
let config = reader.config_mut();
|
|
|
|
// Security configurations to prevent XML attacks
|
|
config.trim_text(true);
|
|
config.check_end_names = false; // Performance: disable end name checking
|
|
config.expand_empty_elements = false; // Security: don't expand empty elements
|
|
|
|
// Note: quick-xml doesn't support external entity expansion by default,
|
|
// but we're being explicit about security configurations
|
|
|
|
reader
|
|
}
|
|
|
|
/// Validate file path for security to prevent directory traversal and shell injection
|
|
fn validate_file_path_security(&self, file_path: &str) -> Result<()> {
|
|
// Check for null bytes
|
|
if file_path.contains('\0') {
|
|
return Err(anyhow!(
|
|
"File path contains null bytes: '{}'. This is blocked for security reasons.",
|
|
file_path.replace('\0', "\\0")
|
|
));
|
|
}
|
|
|
|
// Check for directory traversal attempts
|
|
if file_path.contains("..") {
|
|
return Err(anyhow!(
|
|
"File path contains directory traversal sequence '..': '{}'. This is blocked for security reasons.",
|
|
file_path
|
|
));
|
|
}
|
|
|
|
// Check for suspicious shell injection characters
|
|
let suspicious_chars = ['|', '&', ';', '$', '`', '(', ')', '{', '}', '[', ']', '<', '>'];
|
|
if file_path.chars().any(|c| suspicious_chars.contains(&c)) {
|
|
return Err(anyhow!(
|
|
"File path contains suspicious characters that could be used for command injection: '{}'. This is blocked for security reasons.",
|
|
file_path
|
|
));
|
|
}
|
|
|
|
// Check for shell command prefixes
|
|
let dangerous_prefixes = ["/bin/", "/usr/bin/", "/sbin/", "/usr/sbin/"];
|
|
for prefix in &dangerous_prefixes {
|
|
if file_path.starts_with(prefix) {
|
|
return Err(anyhow!(
|
|
"File path starts with potentially dangerous system directory '{}': '{}'. This is blocked for security reasons.",
|
|
prefix, file_path
|
|
));
|
|
}
|
|
}
|
|
|
|
// Ensure path is reasonably long (avoid empty or very short paths that might be special)
|
|
if file_path.trim().len() < 3 {
|
|
return Err(anyhow!(
|
|
"File path is too short: '{}'. This might indicate a malformed or dangerous path.",
|
|
file_path
|
|
));
|
|
}
|
|
|
|
// Check that file exists (additional validation)
|
|
if !std::path::Path::new(file_path).exists() {
|
|
return Err(anyhow!(
|
|
"File does not exist: '{}'. This prevents processing of non-existent files.",
|
|
file_path
|
|
));
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Try to execute an external tool with timeout and proper error handling
|
|
async fn try_external_tool(&self, tool_name: &str, args: &[&str], file_path: &str) -> Result<String> {
|
|
use tokio::process::Command;
|
|
|
|
// Create the command with proper argument passing (no shell)
|
|
let mut cmd = Command::new(tool_name);
|
|
cmd.args(args);
|
|
|
|
// Set timeout (30 seconds should be reasonable for DOC extraction)
|
|
let timeout_duration = Duration::from_secs(30);
|
|
|
|
info!("Executing external tool: {} with args: {:?}", tool_name, args);
|
|
|
|
// Execute the command with timeout
|
|
let output = match timeout(timeout_duration, cmd.output()).await {
|
|
Ok(Ok(output)) => output,
|
|
Ok(Err(e)) => {
|
|
if e.kind() == std::io::ErrorKind::NotFound {
|
|
return Err(anyhow!(
|
|
"Tool '{}' not found. Please install it: sudo apt-get install {}",
|
|
tool_name,
|
|
match tool_name {
|
|
"antiword" => "antiword",
|
|
"catdoc" => "catdoc",
|
|
"wvText" => "wv",
|
|
_ => tool_name,
|
|
}
|
|
));
|
|
} else {
|
|
return Err(anyhow!("Failed to execute '{}': {}", tool_name, e));
|
|
}
|
|
}
|
|
Err(_) => {
|
|
return Err(anyhow!(
|
|
"Tool '{}' timed out after 30 seconds while processing '{}'",
|
|
tool_name, file_path
|
|
));
|
|
}
|
|
};
|
|
|
|
// Check exit status
|
|
if !output.status.success() {
|
|
let stderr = String::from_utf8_lossy(&output.stderr);
|
|
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
return Err(anyhow!(
|
|
"Tool '{}' failed with exit code: {:?}\nstderr: {}\nstdout: {}",
|
|
tool_name,
|
|
output.status.code(),
|
|
stderr.trim(),
|
|
stdout.trim()
|
|
));
|
|
}
|
|
|
|
// Extract text from stdout
|
|
let extracted_text = String::from_utf8_lossy(&output.stdout).into_owned();
|
|
|
|
// Check if we got any meaningful output
|
|
if extracted_text.trim().is_empty() {
|
|
let stderr = String::from_utf8_lossy(&output.stderr);
|
|
return Err(anyhow!(
|
|
"Tool '{}' produced no output. stderr: {}",
|
|
tool_name,
|
|
stderr.trim()
|
|
));
|
|
}
|
|
|
|
info!("Successfully extracted {} characters with {}", extracted_text.len(), tool_name);
|
|
Ok(extracted_text)
|
|
}
|
|
|
|
/// Parse workbook.xml to get actual worksheet references instead of guessing
|
|
fn get_worksheet_names_from_workbook(archive: &mut zip::ZipArchive<std::fs::File>, context: &ExtractionContext) -> Result<Vec<String>> {
|
|
use quick_xml::events::Event;
|
|
|
|
// Try to read workbook.xml
|
|
let mut workbook_file = match archive.by_name("xl/workbook.xml") {
|
|
Ok(file) => file,
|
|
Err(_) => {
|
|
// Fall back to the old method if workbook.xml doesn't exist
|
|
warn!("workbook.xml not found, falling back to sequential worksheet detection");
|
|
return Ok((1..=Self::MAX_WORKSHEETS_TO_CHECK)
|
|
.map(|i| format!("sheet{}.xml", i))
|
|
.collect());
|
|
}
|
|
};
|
|
|
|
let xml_content = Self::read_zip_entry_safely(&mut workbook_file, Self::MAX_XML_SIZE, context)?;
|
|
drop(workbook_file);
|
|
|
|
let mut reader = Self::create_secure_xml_reader(&xml_content);
|
|
|
|
let mut worksheets = Vec::new();
|
|
let mut buf = Vec::new();
|
|
|
|
// Parse workbook.xml to find sheet references
|
|
loop {
|
|
if context.is_cancelled() {
|
|
return Err(anyhow!("Operation cancelled while parsing workbook.xml"));
|
|
}
|
|
|
|
match reader.read_event_into(&mut buf) {
|
|
Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
|
|
if e.name().as_ref() == b"sheet" {
|
|
// Look for the r:id attribute to get the worksheet relationship
|
|
for attr in e.attributes() {
|
|
if let Ok(attr) = attr {
|
|
if attr.key.as_ref() == b"r:id" {
|
|
let sheet_id = String::from_utf8_lossy(&attr.value);
|
|
// Convert relationship ID to worksheet filename
|
|
// Typical pattern: rId1 -> sheet1.xml, rId2 -> sheet2.xml, etc.
|
|
if let Some(sheet_num) = sheet_id.strip_prefix("rId") {
|
|
worksheets.push(format!("sheet{}.xml", sheet_num));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
Ok(Event::Eof) => break,
|
|
Err(e) => {
|
|
warn!("Error parsing workbook.xml, falling back to sequential detection: {}", e);
|
|
// Fall back to old method on parse error
|
|
return Ok((1..=Self::MAX_WORKSHEETS_TO_CHECK)
|
|
.map(|i| format!("sheet{}.xml", i))
|
|
.collect());
|
|
}
|
|
_ => {}
|
|
}
|
|
buf.clear();
|
|
}
|
|
|
|
if worksheets.is_empty() {
|
|
// Fall back if no worksheets found
|
|
warn!("No worksheets found in workbook.xml, falling back to sequential detection");
|
|
Ok((1..=Self::MAX_WORKSHEETS_TO_CHECK)
|
|
.map(|i| format!("sheet{}.xml", i))
|
|
.collect())
|
|
} else {
|
|
info!("Found {} worksheets in workbook.xml", worksheets.len());
|
|
Ok(worksheets)
|
|
}
|
|
}
|
|
|
|
/// Remove null bytes from text to prevent PostgreSQL errors
|
|
/// This is the ONLY sanitization we do - preserving all other original content
|
|
fn remove_null_bytes(text: &str) -> String {
|
|
let original_len = text.len();
|
|
let cleaned: String = text.chars().filter(|&c| c != '\0').collect();
|
|
|
|
// Log if we found and removed null bytes (shouldn't happen with valid documents)
|
|
let cleaned_len = cleaned.len();
|
|
if cleaned_len < original_len {
|
|
let null_bytes_removed = text.chars().filter(|&c| c == '\0').count();
|
|
warn!(
|
|
"Removed {} null bytes from extracted text (original: {} chars, cleaned: {} chars). \
|
|
This indicates corrupted or malformed document data.",
|
|
null_bytes_removed, original_len, cleaned_len
|
|
);
|
|
}
|
|
|
|
cleaned
|
|
}
|
|
|
|
/// Validates ZIP entry names to prevent directory traversal attacks
|
|
fn validate_zip_entry_name(entry_name: &str) -> Result<()> {
|
|
// Check entry name length
|
|
if entry_name.len() > Self::MAX_ENTRY_NAME_LENGTH {
|
|
return Err(anyhow!(
|
|
"ZIP entry name too long ({}). Maximum allowed length is {} characters for security reasons.",
|
|
entry_name.len(),
|
|
Self::MAX_ENTRY_NAME_LENGTH
|
|
));
|
|
}
|
|
|
|
// Check for directory traversal attempts
|
|
if entry_name.contains("..") {
|
|
return Err(anyhow!(
|
|
"ZIP entry contains directory traversal sequence '..': '{}'. This is blocked for security reasons.",
|
|
entry_name
|
|
));
|
|
}
|
|
|
|
// Check for absolute paths
|
|
if entry_name.starts_with('/') || entry_name.starts_with('\\') {
|
|
return Err(anyhow!(
|
|
"ZIP entry contains absolute path: '{}'. This is blocked for security reasons.",
|
|
entry_name
|
|
));
|
|
}
|
|
|
|
// Check for Windows drive letters
|
|
if entry_name.len() >= 2 && entry_name.chars().nth(1) == Some(':') {
|
|
return Err(anyhow!(
|
|
"ZIP entry contains Windows drive letter: '{}'. This is blocked for security reasons.",
|
|
entry_name
|
|
));
|
|
}
|
|
|
|
// Check for suspicious characters
|
|
let suspicious_chars = ['<', '>', '|', '*', '?'];
|
|
if entry_name.chars().any(|c| suspicious_chars.contains(&c)) {
|
|
return Err(anyhow!(
|
|
"ZIP entry contains suspicious characters: '{}'. This is blocked for security reasons.",
|
|
entry_name
|
|
));
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Safely reads content from a ZIP entry with size limits and cancellation support
|
|
fn read_zip_entry_safely<R: std::io::Read>(
|
|
reader: &mut R,
|
|
max_size: u64,
|
|
context: &ExtractionContext
|
|
) -> Result<String> {
|
|
use std::io::Read;
|
|
|
|
let mut buffer = Vec::new();
|
|
let mut total_read = 0u64;
|
|
let mut temp_buf = [0u8; Self::XML_READ_BUFFER_SIZE];
|
|
|
|
loop {
|
|
// Check for cancellation
|
|
if context.is_cancelled() {
|
|
return Err(anyhow!("Operation cancelled by user"));
|
|
}
|
|
|
|
match reader.read(&mut temp_buf)? {
|
|
0 => break, // EOF
|
|
bytes_read => {
|
|
total_read += bytes_read as u64;
|
|
|
|
// Check if we've exceeded the per-file size limit
|
|
if total_read > max_size {
|
|
return Err(anyhow!(
|
|
"ZIP entry content exceeds maximum allowed size of {:.1} MB. \
|
|
This may be a ZIP bomb attack. Current size: {:.1} MB.",
|
|
max_size as f64 / (1024.0 * 1024.0),
|
|
total_read as f64 / (1024.0 * 1024.0)
|
|
));
|
|
}
|
|
|
|
// Update total decompressed size across all entries
|
|
context.add_decompressed_bytes(bytes_read as u64)?;
|
|
|
|
buffer.extend_from_slice(&temp_buf[..bytes_read]);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Convert to string, handling encoding issues gracefully
|
|
String::from_utf8(buffer).or_else(|e| {
|
|
// Try to recover as much valid UTF-8 as possible
|
|
let bytes = e.into_bytes();
|
|
let lossy = String::from_utf8_lossy(&bytes);
|
|
Ok(lossy.into_owned())
|
|
})
|
|
}
|
|
|
|
/// Extract text from Office documents using XML parsing with timeout and cancellation support
|
|
pub async fn extract_text_from_office(&self, file_path: &str, mime_type: &str) -> Result<OfficeExtractionResult> {
|
|
self.extract_text_from_office_with_timeout(file_path, mime_type, Self::DEFAULT_TIMEOUT_SECONDS).await
|
|
}
|
|
|
|
/// Extract text from Office documents with custom timeout
|
|
pub async fn extract_text_from_office_with_timeout(
|
|
&self,
|
|
file_path: &str,
|
|
mime_type: &str,
|
|
timeout_seconds: u64
|
|
) -> Result<OfficeExtractionResult> {
|
|
let timeout_duration = Duration::from_secs(timeout_seconds.min(Self::MAX_TIMEOUT_SECONDS));
|
|
|
|
let extraction_future = self.extract_text_from_office_internal(file_path, mime_type);
|
|
|
|
match timeout(timeout_duration, extraction_future).await {
|
|
Ok(result) => result,
|
|
Err(_) => Err(OfficeExtractionError::timeout_error(file_path, timeout_seconds))
|
|
}
|
|
}
|
|
|
|
/// Internal extraction method with cancellation support
|
|
async fn extract_text_from_office_internal(&self, file_path: &str, mime_type: &str) -> Result<OfficeExtractionResult> {
|
|
let start_time = Instant::now();
|
|
info!("Extracting text from Office document: {} (type: {})", file_path, mime_type);
|
|
|
|
// Check file size before processing
|
|
let metadata = tokio::fs::metadata(file_path).await?;
|
|
let file_size = metadata.len();
|
|
|
|
if file_size > Self::MAX_OFFICE_SIZE {
|
|
return Err(OfficeExtractionError::file_too_large_error(
|
|
file_path,
|
|
file_size as f64 / (1024.0 * 1024.0),
|
|
Self::MAX_OFFICE_SIZE as f64 / (1024.0 * 1024.0)
|
|
));
|
|
}
|
|
|
|
// Create extraction context for ZIP bomb protection and cancellation support
|
|
let context = ExtractionContext::new_with_file_info(Self::MAX_DECOMPRESSED_SIZE, file_size);
|
|
|
|
match mime_type {
|
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" => {
|
|
self.extract_text_from_docx(file_path, start_time, &context).await
|
|
}
|
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => {
|
|
self.extract_text_from_xlsx(file_path, start_time, &context).await
|
|
}
|
|
"application/msword" => {
|
|
self.extract_text_from_legacy_doc(file_path, start_time).await
|
|
}
|
|
"application/vnd.ms-excel" => {
|
|
self.extract_text_from_legacy_excel(file_path, start_time).await
|
|
}
|
|
"application/vnd.openxmlformats-officedocument.presentationml.presentation" => {
|
|
// For PPTX, provide guidance for now as it's complex
|
|
Err(OfficeExtractionError::unsupported_format_error(
|
|
file_path,
|
|
"PowerPoint (PPTX)",
|
|
&["PDF", "DOCX", "XLSX", "TXT"]
|
|
))
|
|
}
|
|
_ => {
|
|
Err(OfficeExtractionError::unsupported_format_error(
|
|
file_path,
|
|
mime_type,
|
|
&["PDF", "DOCX", "XLSX", "TXT"]
|
|
))
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Extract text from DOCX files using ZIP + XML parsing
|
|
async fn extract_text_from_docx(&self, file_path: &str, start_time: Instant, context: &ExtractionContext) -> Result<OfficeExtractionResult> {
|
|
info!("Starting DOCX text extraction: {}", file_path);
|
|
|
|
// Move CPU-intensive operations to blocking thread pool
|
|
let file_path_clone = file_path.to_string();
|
|
let context_clone = ExtractionContext::new_with_file_info(
|
|
context.max_total_decompressed_size,
|
|
context.compressed_file_size
|
|
);
|
|
let extraction_result = tokio::task::spawn_blocking(move || -> Result<String> {
|
|
use zip::ZipArchive;
|
|
use quick_xml::events::Event;
|
|
|
|
// Open the DOCX file as a ZIP archive
|
|
let file = std::fs::File::open(&file_path_clone)?;
|
|
let mut archive = ZipArchive::new(file)?;
|
|
|
|
// Security check: Validate ZIP archive structure
|
|
let entry_count = archive.len();
|
|
if entry_count > Self::MAX_ZIP_ENTRIES {
|
|
return Err(anyhow!(
|
|
"ZIP archive contains too many entries ({}). Maximum allowed is {} for security reasons. \
|
|
This may be a ZIP bomb attack.",
|
|
entry_count,
|
|
Self::MAX_ZIP_ENTRIES
|
|
));
|
|
}
|
|
|
|
// Validate all entry names before processing to prevent directory traversal
|
|
for i in 0..entry_count {
|
|
let entry = archive.by_index(i)?;
|
|
let entry_name = entry.name();
|
|
Self::validate_zip_entry_name(entry_name)?;
|
|
}
|
|
|
|
// Try to extract the main document content from word/document.xml
|
|
let mut document_xml = match archive.by_name("word/document.xml") {
|
|
Ok(file) => file,
|
|
Err(_) => {
|
|
return Err(OfficeExtractionError::corrupted_file_error(
|
|
&file_path_clone,
|
|
"DOCX",
|
|
"missing word/document.xml - required component not found"
|
|
));
|
|
}
|
|
};
|
|
|
|
// Security: Use size-limited reading to prevent ZIP bomb attacks
|
|
let xml_content = Self::read_zip_entry_safely(&mut document_xml, Self::MAX_XML_SIZE, &context_clone)?;
|
|
drop(document_xml); // Close the archive entry
|
|
|
|
// Parse the XML and extract text content
|
|
let mut reader = Self::create_secure_xml_reader(&xml_content);
|
|
|
|
let mut text_content = Vec::new();
|
|
let mut in_text_element = false;
|
|
let mut buf = Vec::new();
|
|
|
|
loop {
|
|
match reader.read_event_into(&mut buf) {
|
|
Ok(Event::Start(ref e)) => {
|
|
// Look for text elements (w:t tags contain the actual text)
|
|
if e.name().as_ref() == b"w:t" {
|
|
in_text_element = true;
|
|
}
|
|
}
|
|
Ok(Event::Empty(ref e)) => {
|
|
// Handle self-closing elements that represent spacing
|
|
match e.name().as_ref() {
|
|
b"w:tab" => {
|
|
text_content.push("\t".to_string());
|
|
}
|
|
b"w:br" => {
|
|
text_content.push("\n".to_string());
|
|
}
|
|
b"w:cr" => {
|
|
text_content.push("\r".to_string());
|
|
}
|
|
b"w:space" => {
|
|
// Check for xml:space="preserve" attribute
|
|
let mut space_count = 1; // Default to one space
|
|
for attr in e.attributes() {
|
|
if let Ok(attr) = attr {
|
|
if attr.key.as_ref() == b"w:count" {
|
|
if let Ok(count_str) = std::str::from_utf8(&attr.value) {
|
|
space_count = count_str.parse::<usize>().unwrap_or(1);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
text_content.push(" ".repeat(space_count));
|
|
}
|
|
_ => {}
|
|
}
|
|
}
|
|
Ok(Event::Text(e)) => {
|
|
if in_text_element {
|
|
// Extract and decode the text content
|
|
let text = e.unescape().map_err(|e| anyhow!("Text unescape error: {}", e))?;
|
|
text_content.push(text.into_owned());
|
|
}
|
|
}
|
|
Ok(Event::End(ref e)) => {
|
|
if e.name().as_ref() == b"w:t" {
|
|
in_text_element = false;
|
|
}
|
|
// Add proper breaks and spacing to preserve document structure
|
|
match e.name().as_ref() {
|
|
b"w:p" => {
|
|
// End of paragraph - add double newline for better readability
|
|
text_content.push("\n\n".to_string());
|
|
}
|
|
b"w:tr" => {
|
|
// End of table row - add single newline
|
|
text_content.push("\n".to_string());
|
|
}
|
|
b"w:tc" => {
|
|
// End of table cell - add tab separator
|
|
text_content.push("\t".to_string());
|
|
}
|
|
// Remove automatic spacing after w:r - this was causing words to be split
|
|
// Instead, rely on explicit w:space elements and natural paragraph breaks
|
|
// Handle section breaks and page breaks with just whitespace
|
|
b"w:sectPr" => {
|
|
text_content.push("\n\n".to_string());
|
|
}
|
|
b"w:lastRenderedPageBreak" => {
|
|
text_content.push("\n\n".to_string());
|
|
}
|
|
_ => {}
|
|
}
|
|
}
|
|
Ok(Event::Eof) => break,
|
|
Err(e) => {
|
|
return Err(OfficeExtractionError::corrupted_file_error(
|
|
&file_path_clone,
|
|
"DOCX",
|
|
&format!("XML parsing error - {}", e)
|
|
));
|
|
}
|
|
_ => {}
|
|
}
|
|
buf.clear();
|
|
}
|
|
|
|
// Join all text content and clean it up for better readability
|
|
let raw_text = text_content.join("");
|
|
let cleaned_text = Self::clean_extracted_text(&raw_text);
|
|
|
|
// Check if we have actual text content
|
|
if cleaned_text.trim().is_empty() {
|
|
return Err(OfficeExtractionError::empty_document_error(&file_path_clone, "DOCX"));
|
|
}
|
|
|
|
Ok(cleaned_text)
|
|
|
|
}).await??;
|
|
|
|
let processing_time = start_time.elapsed().as_millis() as u64;
|
|
|
|
// Only remove null bytes - preserve all original formatting
|
|
let cleaned_text = Self::remove_null_bytes(&extraction_result);
|
|
let word_count = self.count_words_safely(&cleaned_text);
|
|
|
|
info!(
|
|
"DOCX extraction completed: {} words extracted from '{}' in {}ms",
|
|
word_count, file_path, processing_time
|
|
);
|
|
|
|
Ok(OfficeExtractionResult {
|
|
text: cleaned_text,
|
|
confidence: 100.0, // Direct text extraction has perfect confidence
|
|
processing_time_ms: processing_time,
|
|
word_count,
|
|
extraction_method: "DOCX XML extraction".to_string(),
|
|
})
|
|
}
|
|
|
|
/// Extract text from XLSX files using ZIP + XML parsing
|
|
async fn extract_text_from_xlsx(&self, file_path: &str, start_time: Instant, context: &ExtractionContext) -> Result<OfficeExtractionResult> {
|
|
info!("Starting XLSX text extraction: {}", file_path);
|
|
|
|
// Move CPU-intensive operations to blocking thread pool
|
|
let file_path_clone = file_path.to_string();
|
|
let context_clone = ExtractionContext::new_with_file_info(
|
|
context.max_total_decompressed_size,
|
|
context.compressed_file_size
|
|
);
|
|
let extraction_result = tokio::task::spawn_blocking(move || -> Result<String> {
|
|
use zip::ZipArchive;
|
|
use quick_xml::events::Event;
|
|
|
|
// Open the XLSX file as a ZIP archive
|
|
let file = std::fs::File::open(&file_path_clone)?;
|
|
let mut archive = ZipArchive::new(file)?;
|
|
|
|
// Security check: Validate ZIP archive structure
|
|
let entry_count = archive.len();
|
|
if entry_count > Self::MAX_ZIP_ENTRIES {
|
|
return Err(anyhow!(
|
|
"ZIP archive contains too many entries ({}). Maximum allowed is {} for security reasons. \
|
|
This may be a ZIP bomb attack.",
|
|
entry_count,
|
|
Self::MAX_ZIP_ENTRIES
|
|
));
|
|
}
|
|
|
|
// Validate all entry names before processing to prevent directory traversal
|
|
for i in 0..entry_count {
|
|
let entry = archive.by_index(i)?;
|
|
let entry_name = entry.name();
|
|
Self::validate_zip_entry_name(entry_name)?;
|
|
}
|
|
|
|
// First, extract shared strings (xl/sharedStrings.xml)
|
|
let mut shared_strings = Vec::new();
|
|
if let Ok(mut shared_strings_file) = archive.by_name("xl/sharedStrings.xml") {
|
|
// Security: Use size-limited reading to prevent ZIP bomb attacks
|
|
let xml_content = Self::read_zip_entry_safely(&mut shared_strings_file, Self::MAX_XML_SIZE, &context_clone)?;
|
|
drop(shared_strings_file);
|
|
|
|
// Parse shared strings
|
|
let mut reader = Self::create_secure_xml_reader(&xml_content);
|
|
let mut buf = Vec::new();
|
|
let mut in_string = false;
|
|
let mut current_string = String::new();
|
|
|
|
loop {
|
|
match reader.read_event_into(&mut buf) {
|
|
Ok(Event::Start(ref e)) => {
|
|
if e.name().as_ref() == b"t" {
|
|
in_string = true;
|
|
current_string.clear();
|
|
}
|
|
}
|
|
Ok(Event::Text(e)) => {
|
|
if in_string {
|
|
let text = e.unescape().map_err(|e| anyhow!("Text unescape error: {}", e))?;
|
|
current_string.push_str(&text);
|
|
}
|
|
}
|
|
Ok(Event::End(ref e)) => {
|
|
if e.name().as_ref() == b"t" {
|
|
in_string = false;
|
|
shared_strings.push(current_string.clone());
|
|
current_string.clear();
|
|
}
|
|
}
|
|
Ok(Event::Eof) => break,
|
|
Err(e) => {
|
|
return Err(OfficeExtractionError::corrupted_file_error(
|
|
&file_path_clone,
|
|
"XLSX",
|
|
&format!("shared strings XML parsing error - {}", e)
|
|
));
|
|
}
|
|
_ => {}
|
|
}
|
|
buf.clear();
|
|
}
|
|
}
|
|
|
|
// Now extract worksheet data
|
|
let mut all_text = Vec::new();
|
|
let mut worksheet_count = 0;
|
|
|
|
// Get actual worksheet names from workbook.xml instead of guessing
|
|
let worksheet_names = Self::get_worksheet_names_from_workbook(&mut archive, &context_clone)?;
|
|
|
|
// Process each worksheet
|
|
for worksheet_filename in worksheet_names {
|
|
let worksheet_path = format!("xl/worksheets/{}", worksheet_filename);
|
|
|
|
if let Ok(mut worksheet_file) = archive.by_name(&worksheet_path) {
|
|
worksheet_count += 1;
|
|
// Security: Use size-limited reading to prevent ZIP bomb attacks
|
|
let xml_content = Self::read_zip_entry_safely(&mut worksheet_file, Self::MAX_XML_SIZE, &context_clone)?;
|
|
drop(worksheet_file);
|
|
|
|
// Parse worksheet data
|
|
let mut reader = Self::create_secure_xml_reader(&xml_content);
|
|
let mut buf = Vec::new();
|
|
let mut in_cell_value = false;
|
|
let mut current_cell_type = String::new();
|
|
|
|
loop {
|
|
match reader.read_event_into(&mut buf) {
|
|
Ok(Event::Start(ref e)) => {
|
|
if e.name().as_ref() == b"c" {
|
|
// Cell element - check if it has a type attribute
|
|
current_cell_type.clear();
|
|
for attr in e.attributes() {
|
|
if let Ok(attr) = attr {
|
|
if attr.key.as_ref() == b"t" {
|
|
current_cell_type = String::from_utf8_lossy(&attr.value).to_string();
|
|
}
|
|
}
|
|
}
|
|
} else if e.name().as_ref() == b"v" {
|
|
// Cell value
|
|
in_cell_value = true;
|
|
}
|
|
}
|
|
Ok(Event::Text(e)) => {
|
|
if in_cell_value {
|
|
let text = e.unescape().map_err(|e| anyhow!("Text unescape error: {}", e))?;
|
|
|
|
// If this is a shared string reference (t="s"), look up the string
|
|
if current_cell_type == "s" {
|
|
if let Ok(index) = text.parse::<usize>() {
|
|
if let Some(shared_string) = shared_strings.get(index) {
|
|
all_text.push(shared_string.clone());
|
|
}
|
|
}
|
|
} else {
|
|
// Direct value
|
|
all_text.push(text.into_owned());
|
|
}
|
|
}
|
|
}
|
|
Ok(Event::End(ref e)) => {
|
|
if e.name().as_ref() == b"v" {
|
|
in_cell_value = false;
|
|
}
|
|
}
|
|
Ok(Event::Eof) => break,
|
|
Err(e) => {
|
|
return Err(OfficeExtractionError::corrupted_file_error(
|
|
&file_path_clone,
|
|
"XLSX",
|
|
&format!("worksheet '{}' XML parsing error - {}", worksheet_path, e)
|
|
));
|
|
}
|
|
_ => {}
|
|
}
|
|
buf.clear();
|
|
}
|
|
}
|
|
}
|
|
|
|
if worksheet_count == 0 {
|
|
return Err(OfficeExtractionError::corrupted_file_error(
|
|
&file_path_clone,
|
|
"XLSX",
|
|
"no worksheets found - file structure is invalid"
|
|
));
|
|
}
|
|
|
|
// Join all text content with spaces
|
|
let raw_text = all_text.join(" ");
|
|
|
|
if raw_text.trim().is_empty() {
|
|
return Err(OfficeExtractionError::empty_document_error(&file_path_clone, "XLSX"));
|
|
}
|
|
|
|
Ok(raw_text)
|
|
|
|
}).await??;
|
|
|
|
let processing_time = start_time.elapsed().as_millis() as u64;
|
|
|
|
// Only remove null bytes - preserve all original formatting
|
|
let cleaned_text = Self::remove_null_bytes(&extraction_result);
|
|
let word_count = self.count_words_safely(&cleaned_text);
|
|
|
|
info!(
|
|
"XLSX extraction completed: {} words extracted from '{}' in {}ms",
|
|
word_count, file_path, processing_time
|
|
);
|
|
|
|
Ok(OfficeExtractionResult {
|
|
text: cleaned_text,
|
|
confidence: 100.0, // Direct text extraction has perfect confidence
|
|
processing_time_ms: processing_time,
|
|
word_count,
|
|
extraction_method: "XLSX XML extraction".to_string(),
|
|
})
|
|
}
|
|
|
|
/// Extract text from legacy DOC files using external tools (antiword, catdoc, wvText)
|
|
async fn extract_text_from_legacy_doc(&self, file_path: &str, start_time: Instant) -> Result<OfficeExtractionResult> {
|
|
info!("Processing legacy DOC file: {}", file_path);
|
|
|
|
// Validate file path for security
|
|
self.validate_file_path_security(file_path)?;
|
|
|
|
// Try external tools in order of preference
|
|
let tools = vec![
|
|
("antiword", vec![file_path]),
|
|
("catdoc", vec![file_path]),
|
|
("wvText", vec![file_path]),
|
|
];
|
|
|
|
let mut last_error: Option<String> = None;
|
|
let mut tried_tools = Vec::new();
|
|
|
|
for (tool_name, args) in tools {
|
|
tried_tools.push(tool_name);
|
|
info!("Attempting DOC extraction with {}", tool_name);
|
|
|
|
match self.try_external_tool(tool_name, &args, file_path).await {
|
|
Ok(extracted_text) => {
|
|
let processing_time = start_time.elapsed().as_millis() as u64;
|
|
|
|
// Clean and validate the extracted text
|
|
let cleaned_text = Self::clean_extracted_text(&extracted_text);
|
|
let sanitized_text = Self::remove_null_bytes(&cleaned_text);
|
|
|
|
if sanitized_text.trim().is_empty() {
|
|
return Err(OfficeExtractionError::empty_document_error(file_path, "DOC"));
|
|
}
|
|
|
|
let word_count = self.count_words_safely(&sanitized_text);
|
|
|
|
info!(
|
|
"DOC extraction succeeded with {}: {} words extracted from '{}' in {}ms",
|
|
tool_name, word_count, file_path, processing_time
|
|
);
|
|
|
|
return Ok(OfficeExtractionResult {
|
|
text: sanitized_text,
|
|
confidence: 90.0, // External tool extraction has good but not perfect confidence
|
|
processing_time_ms: processing_time,
|
|
word_count,
|
|
extraction_method: format!("DOC external tool ({})", tool_name),
|
|
});
|
|
}
|
|
Err(e) => {
|
|
warn!("DOC extraction with {} failed: {}", tool_name, e);
|
|
last_error = Some(e.to_string());
|
|
}
|
|
}
|
|
}
|
|
|
|
// All tools failed
|
|
let processing_time = start_time.elapsed().as_millis() as u64;
|
|
let error_message = format!(
|
|
"None of the DOC extraction tools (antiword, catdoc, wvText) are available or working.\n\
|
|
\n\
|
|
Tried tools: {}\n\
|
|
Processing time: {}ms\n\
|
|
\n\
|
|
This file is in the legacy Microsoft Word (.doc) binary format which requires \
|
|
external tools for text extraction.\n\
|
|
\n\
|
|
To extract text from DOC files, please install one of these tools:\n\
|
|
• antiword: sudo apt-get install antiword (Ubuntu/Debian)\n\
|
|
• catdoc: sudo apt-get install catdoc (Ubuntu/Debian)\n\
|
|
• wvText: sudo apt-get install wv (Ubuntu/Debian)\n\
|
|
\n\
|
|
Last error: {}\n\
|
|
\n\
|
|
Alternatively, you can:\n\
|
|
1. Convert the file to DOCX format using Microsoft Word or LibreOffice\n\
|
|
2. Save/export as PDF format\n\
|
|
3. Copy and paste the text into a new DOCX document\n\
|
|
4. Use online conversion tools to convert DOC to DOCX",
|
|
tried_tools.join(", "),
|
|
processing_time,
|
|
last_error.unwrap_or_else(|| "All extraction methods failed".to_string())
|
|
);
|
|
|
|
Err(anyhow::anyhow!(error_message))
|
|
}
|
|
|
|
/// Extract text from legacy Excel files - provide guidance for now
|
|
async fn extract_text_from_legacy_excel(&self, file_path: &str, start_time: Instant) -> Result<OfficeExtractionResult> {
|
|
info!("Processing legacy Excel (XLS) file: {}", file_path);
|
|
|
|
let _processing_time = start_time.elapsed().as_millis() as u64;
|
|
|
|
// Legacy XLS files are complex binary format, suggest conversion
|
|
Err(OfficeExtractionError::unsupported_format_error(
|
|
file_path,
|
|
"Legacy Excel (.xls)",
|
|
&["XLSX", "PDF", "CSV", "TXT"]
|
|
))
|
|
}
|
|
|
|
/// Clean extracted text to improve readability and structure
|
|
fn clean_extracted_text(text: &str) -> String {
|
|
use regex::Regex;
|
|
|
|
// Create regex patterns for cleaning (compile once for efficiency)
|
|
let multiple_spaces = Regex::new(r" {3,}").unwrap(); // 3+ spaces -> 2 spaces
|
|
let multiple_newlines = Regex::new(r"\n{3,}").unwrap(); // 3+ newlines -> 2 newlines
|
|
let space_before_newline = Regex::new(r" +\n").unwrap(); // spaces before newlines
|
|
let newline_before_space = Regex::new(r"\n +").unwrap(); // newlines followed by spaces
|
|
let mixed_whitespace = Regex::new(r"[ \t]+").unwrap(); // tabs and spaces -> single space
|
|
|
|
// Pattern to fix concatenated words like "ExecutiveSummary" -> "Executive Summary"
|
|
// This looks for lowercase-uppercase transitions and adds a space
|
|
let word_boundaries = Regex::new(r"([a-z])([A-Z])").unwrap();
|
|
|
|
let mut cleaned = text.to_string();
|
|
|
|
// First, fix word boundaries that got concatenated
|
|
cleaned = word_boundaries.replace_all(&cleaned, "$1 $2").to_string();
|
|
|
|
// Clean up excessive whitespace
|
|
cleaned = multiple_spaces.replace_all(&cleaned, " ").to_string();
|
|
cleaned = multiple_newlines.replace_all(&cleaned, "\n\n").to_string();
|
|
cleaned = space_before_newline.replace_all(&cleaned, "\n").to_string();
|
|
cleaned = newline_before_space.replace_all(&cleaned, "\n").to_string();
|
|
cleaned = mixed_whitespace.replace_all(&cleaned, " ").to_string();
|
|
|
|
// Remove leading/trailing whitespace but preserve internal structure
|
|
cleaned.trim().to_string()
|
|
}
|
|
|
|
/// Safely count words to prevent overflow on very large texts
|
|
pub fn count_words_safely(&self, text: &str) -> usize {
|
|
// Early return for empty or tiny texts
|
|
if text.trim().is_empty() {
|
|
return 0;
|
|
}
|
|
|
|
// For very large texts, use sampling to estimate word count
|
|
const LARGE_TEXT_THRESHOLD: usize = 1_000_000; // 1MB
|
|
const SAMPLE_SIZE: usize = 100_000; // 100KB samples
|
|
const MAX_WORD_COUNT: usize = 10_000_000; // 10M words cap
|
|
|
|
if text.len() > LARGE_TEXT_THRESHOLD {
|
|
warn!(
|
|
"Text is very large ({:.1} MB), using sampling method for word count estimation",
|
|
text.len() as f64 / (1024.0 * 1024.0)
|
|
);
|
|
|
|
// Use multiple samples for better accuracy on very large texts
|
|
let num_samples = 3;
|
|
let sample_size = SAMPLE_SIZE.min(text.len() / num_samples);
|
|
let mut total_estimated_words = 0;
|
|
|
|
// Sample from beginning, middle, and end
|
|
for i in 0..num_samples {
|
|
let start = (text.len() / num_samples) * i;
|
|
let end = (start + sample_size).min(text.len());
|
|
|
|
// Ensure we sample complete characters (UTF-8 safe)
|
|
let sample_start = Self::floor_char_boundary(text, start);
|
|
let sample_end = Self::floor_char_boundary(text, end);
|
|
|
|
if sample_end > sample_start {
|
|
let sample = &text[sample_start..sample_end];
|
|
let sample_words = self.count_words_in_text_optimized(sample);
|
|
|
|
// Extrapolate this sample to the full text
|
|
let sample_ratio = text.len() as f64 / (sample_end - sample_start) as f64;
|
|
let estimated_from_sample = (sample_words as f64 * sample_ratio / num_samples as f64) as usize;
|
|
total_estimated_words += estimated_from_sample;
|
|
}
|
|
}
|
|
|
|
// Cap at reasonable maximum
|
|
total_estimated_words.min(MAX_WORD_COUNT)
|
|
} else if text.len() > 50_000 { // 50KB - use optimized counting for medium texts
|
|
self.count_words_in_text_optimized(text)
|
|
} else {
|
|
// Small texts can use the full algorithm
|
|
self.count_words_in_text(text)
|
|
}
|
|
}
|
|
|
|
/// Return the largest character boundary of `text` that is `<= index`
/// (stable stand-in for `str::floor_char_boundary`).
///
/// Indices at or past the end of `text` yield `text.len()`.
fn floor_char_boundary(text: &str, index: usize) -> usize {
    if index >= text.len() {
        return text.len();
    }

    // Walk backwards from `index`; offset 0 is always a boundary, so the
    // search cannot come up empty.
    (0..=index)
        .rev()
        .find(|&candidate| text.is_char_boundary(candidate))
        .unwrap_or(0)
}
|
|
|
|
/// Optimized word counting for medium-large texts
|
|
fn count_words_in_text_optimized(&self, text: &str) -> usize {
|
|
// For performance, use a simpler approach for medium-large texts
|
|
let mut word_count = 0;
|
|
let mut in_word = false;
|
|
|
|
for ch in text.chars() {
|
|
if ch.is_whitespace() {
|
|
if in_word {
|
|
word_count += 1;
|
|
in_word = false;
|
|
}
|
|
} else if ch.is_alphanumeric() {
|
|
in_word = true;
|
|
}
|
|
// Ignore pure punctuation
|
|
}
|
|
|
|
// Count the last word if text doesn't end with whitespace
|
|
if in_word {
|
|
word_count += 1;
|
|
}
|
|
|
|
word_count
|
|
}
|
|
|
|
fn count_words_in_text(&self, text: &str) -> usize {
|
|
let whitespace_words = text.split_whitespace().count();
|
|
|
|
// If we have exactly 1 "word" but it's very long (likely continuous text), try enhanced detection
|
|
// OR if we have no whitespace words but text exists
|
|
let is_continuous_text = whitespace_words == 1 && text.len() > 15; // 15+ chars suggests it might be continuous
|
|
let is_no_words = whitespace_words == 0 && !text.trim().is_empty();
|
|
|
|
if is_continuous_text || is_no_words {
|
|
// Count total alphanumeric characters first
|
|
let alphanumeric_chars = text.chars().filter(|c| c.is_alphanumeric()).count();
|
|
|
|
// If no alphanumeric content, it's pure punctuation/symbols
|
|
if alphanumeric_chars == 0 {
|
|
return 0;
|
|
}
|
|
|
|
// For continuous text, look for word boundaries using multiple strategies
|
|
let mut word_count = 0;
|
|
|
|
// Strategy 1: Count transitions from lowercase to uppercase (camelCase detection)
|
|
let chars: Vec<char> = text.chars().collect();
|
|
let mut camel_transitions = 0;
|
|
|
|
for i in 1..chars.len() {
|
|
let prev_char = chars[i-1];
|
|
let curr_char = chars[i];
|
|
|
|
// Count transitions from lowercase letter to uppercase letter
|
|
if prev_char.is_lowercase() && curr_char.is_uppercase() {
|
|
camel_transitions += 1;
|
|
}
|
|
// Count transitions from letter to digit or digit to letter
|
|
else if (prev_char.is_alphabetic() && curr_char.is_numeric()) ||
|
|
(prev_char.is_numeric() && curr_char.is_alphabetic()) {
|
|
camel_transitions += 1;
|
|
}
|
|
}
|
|
|
|
// If we found camelCase transitions, estimate words
|
|
if camel_transitions > 0 {
|
|
word_count = camel_transitions + 1; // +1 for the first word
|
|
}
|
|
|
|
// Strategy 2: If no camelCase detected, estimate based on character count
|
|
if word_count == 0 {
|
|
// Estimate based on typical word length (4-6 characters per word)
|
|
word_count = (alphanumeric_chars / 5).max(1);
|
|
}
|
|
|
|
word_count
|
|
} else {
|
|
whitespace_words
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Builds an extractor rooted in a fresh temporary directory. The directory
    /// handle is returned as well so it stays alive for the duration of the test.
    fn create_test_extractor() -> (XmlOfficeExtractor, TempDir) {
        let temp_dir = TempDir::new().unwrap();
        let root = temp_dir.path().to_string_lossy().to_string();
        (XmlOfficeExtractor::new(root), temp_dir)
    }

    #[test]
    fn test_validate_zip_entry_name() {
        // Ordinary archive-relative paths are accepted.
        for accepted in ["word/document.xml", "xl/worksheets/sheet1.xml"] {
            assert!(XmlOfficeExtractor::validate_zip_entry_name(accepted).is_ok());
        }

        // Traversal, absolute paths, drive letters and markup are rejected.
        for rejected in [
            "../../../etc/passwd",
            "/etc/passwd",
            "C:\\windows\\system32\\cmd.exe",
            "file<script>alert(1)</script>.xml",
        ] {
            assert!(XmlOfficeExtractor::validate_zip_entry_name(rejected).is_err());
        }

        // Overlong names are rejected.
        let long_name = "a".repeat(300);
        assert!(XmlOfficeExtractor::validate_zip_entry_name(&long_name).is_err());
    }

    #[test]
    fn test_remove_null_bytes() {
        // (input, expected output)
        let cases = [
            ("Hello\0World\0Test", "HelloWorldTest"),
            ("Hello World Test", "Hello World Test"),
        ];

        for (input, expected) in cases {
            let cleaned = XmlOfficeExtractor::remove_null_bytes(input);
            assert_eq!(cleaned, expected);
        }
    }

    #[test]
    fn test_count_words_safely() {
        let (extractor, _temp_dir) = create_test_extractor();

        // Plain sentence.
        assert_eq!(extractor.count_words_safely("Hello world test"), 3);

        // Empty and whitespace-only input.
        for blank in ["", "   "] {
            assert_eq!(extractor.count_words_safely(blank), 0);
        }

        // Text without any spaces still yields a positive count.
        assert!(extractor.count_words_safely("HelloWorldTestingCamelCase") > 0);

        // Multi-megabyte input must not panic and must respect the cap.
        let large_text = "word ".repeat(500_000); // 2MB+ of text
        let estimated = extractor.count_words_safely(&large_text);
        assert!(estimated > 0);
        assert!(estimated <= 10_000_000); // Should be capped
    }

    #[test]
    fn test_read_zip_entry_safely() {
        use std::io::Cursor;

        let context = ExtractionContext::new(10 * 1024 * 1024); // 10MB limit

        // Content below the per-entry limit is returned unchanged.
        let small_content = b"Hello World";
        let mut small_reader = Cursor::new(small_content);
        let small_result = XmlOfficeExtractor::read_zip_entry_safely(&mut small_reader, 1024, &context);
        assert!(small_result.is_ok());
        assert_eq!(small_result.unwrap(), "Hello World");

        // Content above the per-entry limit is refused with a size error.
        let large_content = vec![b'A'; 2048];
        let mut large_reader = Cursor::new(large_content);
        let large_result = XmlOfficeExtractor::read_zip_entry_safely(&mut large_reader, 1024, &context);
        assert!(large_result.is_err());
        assert!(large_result.unwrap_err().to_string().contains("exceeds maximum allowed size"));
    }
}