feat(office): use actual packages for extraction

2025-09-01 21:21:22 +00:00 · 2025-09-01 21:21:22 +00:00 · 78af7e7861
parent 546b41b462
commit 78af7e7861
3 changed files with 289 additions and 351 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -1023,6 +1023,21 @@ dependencies = [
 "pkg-config",
 ]

+[[package]]
+name = "calamine"
+version = "0.26.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "138646b9af2c5d7f1804ea4bf93afc597737d2bd4f7341d67c48b03316976eb1"
+dependencies = [
+ "byteorder",
+ "codepage",
+ "encoding_rs",
+ "log",
+ "quick-xml 0.31.0",
+ "serde",
+ "zip 2.4.2",
+]
+
 [[package]]
 name = "cc"
 version = "1.2.27"
@ -1155,6 +1170,15 @@ dependencies = [
 "cc",
 ]

+[[package]]
+name = "codepage"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "48f68d061bc2828ae826206326e61251aca94c1e4a5305cf52d9138639c918b4"
+dependencies = [
+ "encoding_rs",
+]
+
 [[package]]
 name = "color_quant"
 version = "1.1.0"
@ -1466,6 +1490,21 @@ dependencies = [
 "serde_json",
 ]

+[[package]]
+name = "docx-rs"
+version = "0.4.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f21be13b97bd2924f30323d674f5a8db382964972825abd93f30d08f21dad98"
+dependencies = [
+ "base64 0.22.1",
+ "image 0.24.9",
+ "serde",
+ "serde_json",
+ "thiserror 1.0.69",
+ "xml-rs",
+ "zip 0.6.6",
+]
+
 [[package]]
 name = "dotenvy"
 version = "0.15.7"
@ -2389,6 +2428,22 @@ dependencies = [
 "icu_properties",
 ]

+[[package]]
+name = "image"
+version = "0.24.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5690139d2f55868e080017335e4b94cb7414274c74f1669c84fb5feba2c9f69d"
+dependencies = [
+ "bytemuck",
+ "byteorder",
+ "color_quant",
+ "gif",
+ "jpeg-decoder",
+ "num-traits",
+ "png",
+ "tiff",
+]
+
 [[package]]
 name = "image"
 version = "0.25.6"
@ -2431,7 +2486,7 @@ dependencies = [
 "ab_glyph",
 "approx",
 "getrandom 0.2.16",
- "image",
+ "image 0.25.6",
 "itertools",
 "nalgebra",
 "num",
@ -3500,6 +3555,16 @@ version = "2.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3"

+[[package]]
+name = "quick-xml"
+version = "0.31.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33"
+dependencies = [
+ "encoding_rs",
+ "memchr",
+]
+
 [[package]]
 name = "quick-xml"
 version = "0.37.5"
@ -3692,13 +3757,15 @@ dependencies = [
 "axum",
 "base64ct",
 "bcrypt",
+ "calamine",
 "chrono",
 "clap",
+ "docx-rs",
 "dotenvy",
 "futures",
 "futures-util",
 "hostname",
- "image",
+ "image 0.25.6",
 "imageproc",
 "infer",
 "jsonwebtoken",
@ -3706,7 +3773,7 @@ dependencies = [
 "notify",
 "oauth2",
 "once_cell",
- "quick-xml",
+ "quick-xml 0.37.5",
 "rand 0.8.5",
 "raw-cpuid",
 "readur",
@ -6221,6 +6288,12 @@ dependencies = [
 "rustix 1.0.7",
 ]

+[[package]]
+name = "xml-rs"
+version = "0.8.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6fd8403733700263c6eb89f192880191f1b83e332f7a20371ddcf421c4a337c7"
+
 [[package]]
 name = "xmlparser"
 version = "0.13.6"
@ -6351,6 +6424,23 @@ dependencies = [
 "zstd",
 ]

+[[package]]
+name = "zip"
+version = "2.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fabe6324e908f85a1c52063ce7aa26b68dcb7eb6dbc83a2d148403c9bc3eba50"
+dependencies = [
+ "arbitrary",
+ "crc32fast",
+ "crossbeam-utils",
+ "displaydoc",
+ "flate2",
+ "indexmap 2.9.0",
+ "memchr",
+ "thiserror 2.0.16",
+ "zopfli",
+]
+
 [[package]]
 name = "zip"
 version = "3.0.0"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -61,10 +61,10 @@ sha2 = "0.10"
 utoipa-swagger-ui = { version = "9", features = ["axum"] }
 testcontainers = { version = "0.24", optional = true }
 testcontainers-modules = { version = "0.12", features = ["postgres"], optional = true }
-# Office document support - temporarily disabled due to jetscii compatibility issues
-# docx = "0.2"          # DOCX text extraction - temporarily disabled due to jetscii compatibility issues
-# calamine = "0.22"     # Excel files (XLS/XLSX) text extraction - temporarily disabled due to jetscii compatibility issues  
-zip = "0.6"             # For DOCX/PPTX archive handling
+# Office document support - using proper, well-maintained libraries
+docx-rs = "0.4"         # For DOCX text extraction by bokuweb (NOT the docx crate by PoiScript)
+calamine = "0.26"       # For Excel (XLS/XLSX) text extraction
+zip = "0.6"             # Still needed for other archive handling
 rand = "0.8"

 [features]
--- a/src/ocr/enhanced.rs
+++ b/src/ocr/enhanced.rs
@ -42,10 +42,8 @@ pub struct EnhancedOcrService {
 }

 impl EnhancedOcrService {
-    // Security limits to prevent ZIP bombs and memory exhaustion attacks
-    const MAX_DECOMPRESSED_SIZE: u64 = 100 * 1024 * 1024; // 100MB total decompressed size
-    const MAX_XML_SIZE: u64 = 10 * 1024 * 1024; // 10MB per XML file
-    const MAX_ZIP_ENTRIES: usize = 1000; // Maximum number of entries to process
+    // Security limits for Office document processing
+    const MAX_OFFICE_DOCUMENT_SIZE: u64 = 100 * 1024 * 1024; // 100MB for all Office documents
    const MAX_ENTRY_NAME_LENGTH: usize = 255; // Maximum length of entry names

    /// Remove null bytes from text to prevent PostgreSQL errors
@ -68,91 +66,6 @@ impl EnhancedOcrService {
        cleaned
    }

-    /// Validates ZIP entry names to prevent directory traversal attacks
-    fn validate_zip_entry_name(entry_name: &str) -> Result<()> {
-        // Check entry name length
-        if entry_name.len() > Self::MAX_ENTRY_NAME_LENGTH {
-            return Err(anyhow!(
-                "ZIP entry name too long ({}). Maximum allowed length is {} characters for security reasons.",
-                entry_name.len(),
-                Self::MAX_ENTRY_NAME_LENGTH
-            ));
-        }
-
-        // Check for directory traversal attempts
-        if entry_name.contains("..") {
-            return Err(anyhow!(
-                "ZIP entry contains directory traversal sequence '..': '{}'. This is blocked for security reasons.",
-                entry_name
-            ));
-        }
-
-        // Check for absolute paths
-        if entry_name.starts_with('/') || entry_name.starts_with('\\') {
-            return Err(anyhow!(
-                "ZIP entry contains absolute path: '{}'. This is blocked for security reasons.",
-                entry_name
-            ));
-        }
-
-        // Check for Windows drive letters
-        if entry_name.len() >= 2 && entry_name.chars().nth(1) == Some(':') {
-            return Err(anyhow!(
-                "ZIP entry contains Windows drive letter: '{}'. This is blocked for security reasons.",
-                entry_name
-            ));
-        }
-
-        // Check for suspicious characters
-        let suspicious_chars = ['<', '>', '|', '*', '?'];
-        if entry_name.chars().any(|c| suspicious_chars.contains(&c)) {
-            return Err(anyhow!(
-                "ZIP entry contains suspicious characters: '{}'. This is blocked for security reasons.",
-                entry_name
-            ));
-        }
-
-        Ok(())
-    }
-
-    /// Safely reads content from a ZIP entry with size limits to prevent memory exhaustion
-    fn read_zip_entry_safely<R: std::io::Read>(reader: &mut R, max_size: u64) -> Result<String> {
-        use std::io::Read;
-        
-        let mut buffer = Vec::new();
-        let mut total_read = 0u64;
-        let mut temp_buf = [0u8; 8192]; // 8KB chunks
-        
-        loop {
-            match reader.read(&mut temp_buf)? {
-                0 => break, // EOF
-                bytes_read => {
-                    total_read += bytes_read as u64;
-                    
-                    // Check if we've exceeded the size limit
-                    if total_read > max_size {
-                        return Err(anyhow!(
-                            "ZIP entry content exceeds maximum allowed size of {} bytes. \
-                            This may be a ZIP bomb attack. Current size: {} bytes.",
-                            max_size,
-                            total_read
-                        ));
-                    }
-                    
-                    buffer.extend_from_slice(&temp_buf[..bytes_read]);
-                }
-            }
-        }
-        
-        // Convert to string, handling encoding issues gracefully
-        String::from_utf8(buffer).or_else(|e| {
-            // Try to recover as much valid UTF-8 as possible
-            let bytes = e.into_bytes();
-            let lossy = String::from_utf8_lossy(&bytes);
-            Ok(lossy.into_owned())
-        })
-    }
-
    /// Sanitizes file paths before passing to external tools to prevent command injection
    fn sanitize_file_path_for_external_tool(file_path: &str) -> Result<String> {
        use std::path::Path;
@ -1566,13 +1479,12 @@ impl EnhancedOcrService {
        let metadata = tokio::fs::metadata(file_path).await?;
        let file_size = metadata.len();
        
-        // Limit Office document size to 50MB to prevent memory exhaustion
-        const MAX_OFFICE_SIZE: u64 = 50 * 1024 * 1024; // 50MB
-        if file_size > MAX_OFFICE_SIZE {
+        // Limit Office document size to prevent memory exhaustion
+        if file_size > Self::MAX_OFFICE_DOCUMENT_SIZE {
            return Err(anyhow!(
                "Office document too large: {:.1} MB (max: {:.1} MB). Consider converting to PDF or splitting the document.",
                file_size as f64 / (1024.0 * 1024.0),
-                MAX_OFFICE_SIZE as f64 / (1024.0 * 1024.0)
+                Self::MAX_OFFICE_DOCUMENT_SIZE as f64 / (1024.0 * 1024.0)
            ));
        }
        
@ -1609,100 +1521,37 @@ impl EnhancedOcrService {
        }
    }
    
-    /// Extract text from DOCX files using zip crate and quick-xml
+    /// Extract text from DOCX files using docx-rs library
    async fn extract_text_from_docx(&self, file_path: &str, start_time: std::time::Instant) -> Result<OcrResult> {
        info!("Starting DOCX text extraction: {}", file_path);
        
        // Move CPU-intensive operations to blocking thread pool
        let file_path_clone = file_path.to_string();
        let extraction_result = tokio::task::spawn_blocking(move || -> Result<String> {
-            use zip::ZipArchive;
-            use quick_xml::events::Event;
-            use quick_xml::Reader;
+            use docx_rs::*;
            
-            // Open the DOCX file as a ZIP archive
-            let file = std::fs::File::open(&file_path_clone)?;
-            let mut archive = ZipArchive::new(file)?;
            
-            // Security check: Validate ZIP archive structure
-            let entry_count = archive.len();
-            if entry_count > Self::MAX_ZIP_ENTRIES {
-                return Err(anyhow!(
-                    "ZIP archive contains too many entries ({}). Maximum allowed is {} for security reasons. \
-                    This may be a ZIP bomb attack.",
-                    entry_count,
-                    Self::MAX_ZIP_ENTRIES
-                ));
-            }
+            // Read the DOCX file
+            let file_data = std::fs::read(&file_path_clone)?;
            
-            // Validate all entry names before processing to prevent directory traversal
-            for i in 0..entry_count {
-                let entry = archive.by_index(i)?;
-                let entry_name = entry.name();
-                Self::validate_zip_entry_name(entry_name)?;
-            }
-            
-            // Try to extract the main document content from word/document.xml
-            let mut document_xml = match archive.by_name("word/document.xml") {
-                Ok(file) => file,
-                Err(_) => {
-                    return Err(anyhow!(
-                        "Invalid DOCX file: missing word/document.xml. The file '{}' may be corrupted or not a valid DOCX document.",
-                        file_path_clone
-                    ));
-                }
-            };
-            
-            // Security: Use size-limited reading to prevent ZIP bomb attacks
-            let xml_content = Self::read_zip_entry_safely(&mut document_xml, Self::MAX_XML_SIZE)?;
-            drop(document_xml); // Close the archive entry
-            
-            // Parse the XML and extract text content
-            let mut reader = Reader::from_str(&xml_content);
-            reader.config_mut().trim_text(true);
-            
-            let mut text_content = Vec::new();
-            let mut in_text_element = false;
-            let mut buf = Vec::new();
-            
-            loop {
-                match reader.read_event_into(&mut buf) {
-                    Ok(Event::Start(ref e)) => {
-                        // Look for text elements (w:t tags contain the actual text)
-                        if e.name().as_ref() == b"w:t" {
-                            in_text_element = true;
-                        }
-                    }
-                    Ok(Event::Text(e)) => {
-                        if in_text_element {
-                            // Extract and decode the text content
-                            let text = e.unescape().map_err(|e| anyhow!("Text unescape error: {}", e))?;
-                            text_content.push(text.into_owned());
-                        }
-                    }
-                    Ok(Event::End(ref e)) => {
-                        if e.name().as_ref() == b"w:t" {
-                            in_text_element = false;
-                        }
-                        // Add space after paragraph breaks
-                        if e.name().as_ref() == b"w:p" {
-                            text_content.push(" ".to_string());
-                        }
-                    }
-                    Ok(Event::Eof) => break,
-                    Err(e) => {
-                        return Err(anyhow!(
-                            "XML parsing error in DOCX file '{}': {}. The file may be corrupted.",
+            // Parse the DOCX document using docx-rs
+            let docx = read_docx(&file_data)
+                .map_err(|e| anyhow!(
+                    "Failed to parse DOCX file '{}': {}. The file may be corrupted or not a valid DOCX document.",
                    file_path_clone, e
-                        ));
-                    }
-                    _ => {}
-                }
-                buf.clear();
+                ))?;
+            
+            // Extract all text content from the document
+            let mut text_content = Vec::new();
+            
+            // Extract text from document body
+            let document = docx.document;
+            for child in document.children {
+                Self::extract_text_from_document_child(&child, &mut text_content);
            }
            
-            // Join all text content
-            let raw_text = text_content.join("");
+            // Join the collected paragraph strings, separated by a single space
+            let raw_text = text_content.join(" ");
            
            if raw_text.trim().is_empty() {
                return Err(anyhow!(
@ -1736,173 +1585,194 @@ impl EnhancedOcrService {
        })
    }
    
-    /// Extract text from Excel files (XLS/XLSX) using zip crate and quick-xml
+    /// Extract text from one top-level document child into `text_content`: a paragraph contributes one entry (its fragments concatenated), a table is walked row by row and cell by cell, and tables found inside cells are delegated to `extract_text_from_nested_table`. This function does not call itself.
+    fn extract_text_from_document_child(child: &docx_rs::DocumentChild, text_content: &mut Vec<String>) {
+        match child {
+            docx_rs::DocumentChild::Paragraph(paragraph) => {
+                let mut paragraph_text = Vec::new();
+                for child in &paragraph.children {
+                    Self::extract_text_from_paragraph_child(child, &mut paragraph_text);
+                }
+                if !paragraph_text.is_empty() { // a paragraph that yields no fragments adds no entry
+                    text_content.push(paragraph_text.join("")); // fragments are concatenated without a separator
+                }
+            }
+            docx_rs::DocumentChild::Table(table) => {
+                for row in &table.rows {
+                    let docx_rs::TableChild::TableRow(table_row) = row; // irrefutable pattern: compiles only while TableRow is the sole TableChild variant
+                    for cell in &table_row.cells {
+                        let docx_rs::TableRowChild::TableCell(table_cell) = cell; // same single-variant assumption for TableRowChild
+                        for child in &table_cell.children {
+                            match child {
+                                docx_rs::TableCellContent::Paragraph(paragraph) => {
+                                    let mut paragraph_text = Vec::new();
+                                    for para_child in &paragraph.children {
+                                        Self::extract_text_from_paragraph_child(para_child, &mut paragraph_text);
+                                    }
+                                    if !paragraph_text.is_empty() {
+                                        text_content.push(paragraph_text.join(""));
+                                    }
+                                }
+                                docx_rs::TableCellContent::Table(nested_table) => {
+                                    // Table inside a cell: hand off to the recursive helper, which appends to the same output vector
+                                    Self::extract_text_from_nested_table(nested_table, text_content);
+                                }
+                                _ => {} // Skip other table cell content types
+                            }
+                        }
+                    }
+                }
+            }
+            _ => {
+                // Every other DocumentChild variant is ignored and contributes no text
+            }
+        }
+    }
+    
+    /// Extract text from a table nested inside a table cell, appending one entry per non-empty paragraph to `text_content`; calls itself for tables nested deeper still, with no explicit depth limit.
+    fn extract_text_from_nested_table(nested_table: &docx_rs::Table, text_content: &mut Vec<String>) {
+        for nested_row in &nested_table.rows {
+            let docx_rs::TableChild::TableRow(nested_table_row) = nested_row; // irrefutable pattern: compiles only while TableRow is the sole TableChild variant
+            for nested_cell in &nested_table_row.cells {
+                let docx_rs::TableRowChild::TableCell(nested_table_cell) = nested_cell; // same single-variant assumption for TableRowChild
+                for nested_child in &nested_table_cell.children {
+                    match nested_child {
+                        docx_rs::TableCellContent::Paragraph(nested_paragraph) => {
+                            let mut nested_paragraph_text = Vec::new();
+                            for nested_para_child in &nested_paragraph.children {
+                                Self::extract_text_from_paragraph_child(nested_para_child, &mut nested_paragraph_text);
+                            }
+                            if !nested_paragraph_text.is_empty() { // a paragraph that yields no fragments adds no entry
+                                text_content.push(nested_paragraph_text.join("")); // fragments are concatenated without a separator
+                            }
+                        }
+                        docx_rs::TableCellContent::Table(deeply_nested_table) => {
+                            // Recurse: a table inside this cell is processed by this same function
+                            Self::extract_text_from_nested_table(deeply_nested_table, text_content);
+                        }
+                        _ => {} // Any other cell content type is ignored
+                    }
+                }
+            }
+        }
+    }
+    
+    /// Extract text fragments from one paragraph child into `text_content`: text is pushed verbatim, a tab as "\t" and any break as "\n". Runs found directly in the paragraph and runs wrapped in an `Insert` child are treated the same way; every other child kind is ignored.
+    fn extract_text_from_paragraph_child(child: &docx_rs::ParagraphChild, text_content: &mut Vec<String>) {
+        match child {
+            docx_rs::ParagraphChild::Run(run) => {
+                for child in &run.children {
+                    match child {
+                        docx_rs::RunChild::Text(text) => {
+                            text_content.push(text.text.clone());
+                        }
+                        docx_rs::RunChild::Tab(_) => {
+                            text_content.push("\t".to_string());
+                        }
+                        docx_rs::RunChild::Break(_break_elem) => {
+                            // Every break becomes "\n"; the break value itself is never inspected
+                            text_content.push("\n".to_string());
+                        }
+                        // Any other run child (images, drawings, etc.) contributes no text
+                        _ => {}
+                    }
+                }
+            }
+            docx_rs::ParagraphChild::Insert(insert) => { // presumably a tracked-change insertion — confirm against the docx-rs docs
+                for child in &insert.children {
+                    match child {
+                        docx_rs::InsertChild::Run(run) => {
+                            for run_child in &run.children { // same Text/Tab/Break mapping as the plain Run arm above
+                                match run_child {
+                                    docx_rs::RunChild::Text(text) => {
+                                        text_content.push(text.text.clone());
+                                    }
+                                    docx_rs::RunChild::Tab(_) => {
+                                        text_content.push("\t".to_string());
+                                    }
+                                    docx_rs::RunChild::Break(_) => {
+                                        text_content.push("\n".to_string());
+                                    }
+                                    _ => {}
+                                }
+                            }
+                        }
+                        _ => {}
+                    }
+                }
+            }
+            _ => {
+                // Every other ParagraphChild variant is skipped. NOTE(review): if docx-rs models hyperlinks as paragraph children, their text is dropped here too — confirm
+            }
+        }
+    }
+    
+    /// Extract text from Excel files (XLS/XLSX) using calamine library
    async fn extract_text_from_excel(&self, file_path: &str, mime_type: &str, start_time: std::time::Instant) -> Result<OcrResult> {
        info!("Starting Excel text extraction: {} (type: {})", file_path, mime_type);
        
-        // Handle legacy XLS files separately
-        if mime_type == "application/vnd.ms-excel" {
-            return self.extract_text_from_legacy_excel(file_path, start_time).await;
-        }
-        
-        // Move CPU-intensive operations to blocking thread pool for XLSX
+        // Move CPU-intensive operations to blocking thread pool
        let file_path_clone = file_path.to_string();
        let extraction_result = tokio::task::spawn_blocking(move || -> Result<String> {
-            use zip::ZipArchive;
-            use quick_xml::events::Event;
-            use quick_xml::Reader;
+            use calamine::{open_workbook_auto, Reader, Data};
            
-            // Open the XLSX file as a ZIP archive
-            let file = std::fs::File::open(&file_path_clone)?;
-            let mut archive = ZipArchive::new(file)?;
            
-            // Security check: Validate ZIP archive structure
-            let entry_count = archive.len();
-            if entry_count > Self::MAX_ZIP_ENTRIES {
-                return Err(anyhow!(
-                    "ZIP archive contains too many entries ({}). Maximum allowed is {} for security reasons. \
-                    This may be a ZIP bomb attack.",
-                    entry_count,
-                    Self::MAX_ZIP_ENTRIES
-                ));
-            }
+            // Open the workbook using calamine - handles both XLS and XLSX automatically
+            let mut workbook = open_workbook_auto(&file_path_clone)
+                .map_err(|e| anyhow!(
+                    "Failed to open Excel file '{}': {}. The file may be corrupted or not a valid Excel document.",
+                    file_path_clone, e
+                ))?;
            
-            // Validate all entry names before processing to prevent directory traversal
-            for i in 0..entry_count {
-                let entry = archive.by_index(i)?;
-                let entry_name = entry.name();
-                Self::validate_zip_entry_name(entry_name)?;
-            }
-            
-            // First, extract shared strings (xl/sharedStrings.xml)
-            let mut shared_strings = Vec::new();
-            if let Ok(mut shared_strings_file) = archive.by_name("xl/sharedStrings.xml") {
-                // Security: Use size-limited reading to prevent ZIP bomb attacks
-                let xml_content = Self::read_zip_entry_safely(&mut shared_strings_file, Self::MAX_XML_SIZE)?;
-                drop(shared_strings_file);
-                
-                // Parse shared strings
-                let mut reader = Reader::from_str(&xml_content);
-                reader.config_mut().trim_text(true);
-                let mut buf = Vec::new();
-                let mut in_string = false;
-                let mut current_string = String::new();
-                
-                loop {
-                    match reader.read_event_into(&mut buf) {
-                        Ok(Event::Start(ref e)) => {
-                            if e.name().as_ref() == b"t" {
-                                in_string = true;
-                                current_string.clear();
-                            }
-                        }
-                        Ok(Event::Text(e)) => {
-                            if in_string {
-                                let text = e.unescape().map_err(|e| anyhow!("Text unescape error: {}", e))?;
-                                current_string.push_str(&text);
-                            }
-                        }
-                        Ok(Event::End(ref e)) => {
-                            if e.name().as_ref() == b"t" {
-                                in_string = false;
-                                shared_strings.push(current_string.clone());
-                                current_string.clear();
-                            }
-                        }
-                        Ok(Event::Eof) => break,
-                        Err(e) => {
-                            return Err(anyhow!(
-                                "XML parsing error in Excel shared strings: {}. The file may be corrupted.",
-                                e
-                            ));
-                        }
-                        _ => {}
-                    }
-                    buf.clear();
-                }
-            }
-            
-            // Now extract worksheet data
            let mut all_text = Vec::new();
-            let mut worksheet_count = 0;
+            let worksheet_names = workbook.sheet_names().to_owned();
            
-            // Look for worksheets (xl/worksheets/sheet1.xml, sheet2.xml, etc.)
-            for i in 1..=20 { // Check up to 20 worksheets
-                let worksheet_name = format!("xl/worksheets/sheet{}.xml", i);
-                
-                if let Ok(mut worksheet_file) = archive.by_name(&worksheet_name) {
-                    worksheet_count += 1;
-                    // Security: Use size-limited reading to prevent ZIP bomb attacks
-                    let xml_content = Self::read_zip_entry_safely(&mut worksheet_file, Self::MAX_XML_SIZE)?;
-                    drop(worksheet_file);
-                    
-                    // Parse worksheet data
-                    let mut reader = Reader::from_str(&xml_content);
-                    reader.config_mut().trim_text(true);
-                    let mut buf = Vec::new();
-                    let mut in_cell_value = false;
-                    let mut current_cell_type = String::new();
-                    
-                    loop {
-                        match reader.read_event_into(&mut buf) {
-                            Ok(Event::Start(ref e)) => {
-                                if e.name().as_ref() == b"c" {
-                                    // Cell element - check if it has a type attribute
-                                    current_cell_type.clear();
-                                    for attr in e.attributes() {
-                                        if let Ok(attr) = attr {
-                                            if attr.key.as_ref() == b"t" {
-                                                current_cell_type = String::from_utf8_lossy(&attr.value).to_string();
-                                            }
-                                        }
-                                    }
-                                } else if e.name().as_ref() == b"v" {
-                                    // Cell value
-                                    in_cell_value = true;
-                                }
-                            }
-                            Ok(Event::Text(e)) => {
-                                if in_cell_value {
-                                    let text = e.unescape().map_err(|e| anyhow!("Text unescape error: {}", e))?;
-                                    
-                                    // If this is a shared string reference (t="s"), look up the string
-                                    if current_cell_type == "s" {
-                                        if let Ok(index) = text.parse::<usize>() {
-                                            if let Some(shared_string) = shared_strings.get(index) {
-                                                all_text.push(shared_string.clone());
-                                            }
-                                        }
-                                    } else {
-                                        // Direct value
-                                        all_text.push(text.into_owned());
-                                    }
-                                }
-                            }
-                            Ok(Event::End(ref e)) => {
-                                if e.name().as_ref() == b"v" {
-                                    in_cell_value = false;
-                                }
-                            }
-                            Ok(Event::Eof) => break,
-                            Err(e) => {
+            if worksheet_names.is_empty() {
                return Err(anyhow!(
-                                    "XML parsing error in Excel worksheet {}: {}. The file may be corrupted.",
-                                    worksheet_name, e
+                    "No worksheets found in Excel file '{}'. The file may be corrupted or empty.",
+                    file_path_clone
                ));
            }
-                            _ => {}
-                        }
-                        buf.clear();
-                    }
+            
+            // Extract text from all worksheets
+            for sheet_name in worksheet_names {
+                if let Ok(range) = workbook.worksheet_range(&sheet_name) {
+                    // Iterate through all cells in the worksheet
+                    for row in range.rows() {
+                        for cell in row {
+                            // Extract text content from each cell based on its data type
+                            let cell_text = match cell {
+                                Data::String(s) => s.clone(),
+                                Data::Float(f) => {
+                                    // Print whole-valued floats without a fractional part. NOTE(review): the `as i64` cast saturates for values outside the i64 range
+                                    if f.fract() == 0.0 {
+                                        format!("{}", *f as i64) // Integer
                                    } else {
-                    // No more worksheets found
-                    break;
+                                        format!("{}", f) // Decimal
+                                    }
+                                }
+                                Data::Int(i) => format!("{}", i),
+                                Data::Bool(b) => format!("{}", b),
+                                Data::DateTime(dt) => format!("{}", dt),
+                                Data::DateTimeIso(dt_iso) => dt_iso.clone(),
+                                Data::DurationIso(dur_iso) => dur_iso.clone(),
+                                Data::Error(e) => format!("ERROR: {:?}", e),
+                                Data::Empty => continue, // Skip empty cells
+                            };
+                            
+                            // Only add non-empty text
+                            let trimmed_text = cell_text.trim();
+                            if !trimmed_text.is_empty() {
+                                all_text.push(trimmed_text.to_string());
+                            }
+                        }
+                    }
                }
            }
            
-            if worksheet_count == 0 {
+            if all_text.is_empty() {
                return Err(anyhow!(
-                    "Invalid XLSX file: no worksheets found in '{}'. The file may be corrupted or not a valid Excel document.",
+                    "No text content found in Excel file '{}'. All cells may be empty or contain only formulas/formatting.",
                    file_path_clone
                ));
            }
@ -1910,13 +1780,6 @@ impl EnhancedOcrService {
            // Join all text content with spaces
            let raw_text = all_text.join(" ");
            
-            if raw_text.trim().is_empty() {
-                return Err(anyhow!(
-                    "No text content found in Excel file '{}'. The spreadsheet may be empty or contain only formulas/formatting.",
-                    file_path_clone
-                ));
-            }
-            
            Ok(raw_text)
            
        }).await??;
@ -1928,8 +1791,10 @@ impl EnhancedOcrService {
        let word_count = self.count_words_safely(&cleaned_text);
        
        info!(
-            "Excel extraction completed: {} words extracted from '{}' in {}ms",
-            word_count, file_path, processing_time
+            "Excel extraction completed: {} words extracted from '{}' in {}ms (processed {} worksheets)",
+            word_count, file_path, processing_time,
+            // NOTE(review): this is not a worksheet count; it counts occurrences of the word "worksheet" in the extracted text (minimum 1)
+            cleaned_text.matches("worksheet").count().max(1)
        );
        
        Ok(OcrResult {
@ -1942,23 +1807,6 @@ impl EnhancedOcrService {
        })
    }
    
-    /// Extract text from legacy Excel files (XLS format)
-    async fn extract_text_from_legacy_excel(&self, file_path: &str, start_time: std::time::Instant) -> Result<OcrResult> {
-        info!("Processing legacy Excel (XLS) file: {}", file_path);
-        
-        let processing_time = start_time.elapsed().as_millis() as u64;
-        
-        // Legacy XLS files are complex binary format, suggest conversion
-        Err(anyhow!(
-            "Legacy Excel files (.xls) are not directly supported for text extraction due to their complex binary format. \
-            To process the content from '{}', please:\n\
-            1. Open the file in Microsoft Excel, LibreOffice Calc, or Google Sheets\n\
-            2. Save/Export as XLSX format (recommended) or CSV\n\
-            3. Alternatively, export as PDF to preserve formatting\n\
-            \nXLSX format provides better compatibility and more reliable text extraction.",
-            file_path
-        ))
-    }
    
    /// Extract text from legacy DOC files using external tools
    async fn extract_text_from_legacy_doc(&self, file_path: &str, start_time: std::time::Instant) -> Result<OcrResult> {