From 78af7e7861cb82ad3cf11d79c19bd9a8c24ca2a0 Mon Sep 17 00:00:00 2001 From: perf3ct Date: Mon, 1 Sep 2025 21:21:22 +0000 Subject: [PATCH] feat(office): use actual packages for extraction --- Cargo.lock | 96 +++++++- Cargo.toml | 8 +- src/ocr/enhanced.rs | 536 ++++++++++++++++---------------------------- 3 files changed, 289 insertions(+), 351 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 78dc6df..8c31174 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1023,6 +1023,21 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "calamine" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "138646b9af2c5d7f1804ea4bf93afc597737d2bd4f7341d67c48b03316976eb1" +dependencies = [ + "byteorder", + "codepage", + "encoding_rs", + "log", + "quick-xml 0.31.0", + "serde", + "zip 2.4.2", +] + [[package]] name = "cc" version = "1.2.27" @@ -1155,6 +1170,15 @@ dependencies = [ "cc", ] +[[package]] +name = "codepage" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48f68d061bc2828ae826206326e61251aca94c1e4a5305cf52d9138639c918b4" +dependencies = [ + "encoding_rs", +] + [[package]] name = "color_quant" version = "1.1.0" @@ -1466,6 +1490,21 @@ dependencies = [ "serde_json", ] +[[package]] +name = "docx-rs" +version = "0.4.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f21be13b97bd2924f30323d674f5a8db382964972825abd93f30d08f21dad98" +dependencies = [ + "base64 0.22.1", + "image 0.24.9", + "serde", + "serde_json", + "thiserror 1.0.69", + "xml-rs", + "zip 0.6.6", +] + [[package]] name = "dotenvy" version = "0.15.7" @@ -2389,6 +2428,22 @@ dependencies = [ "icu_properties", ] +[[package]] +name = "image" +version = "0.24.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5690139d2f55868e080017335e4b94cb7414274c74f1669c84fb5feba2c9f69d" +dependencies = [ + "bytemuck", + "byteorder", + "color_quant", + "gif", + "jpeg-decoder", + 
"num-traits", + "png", + "tiff", +] + [[package]] name = "image" version = "0.25.6" @@ -2431,7 +2486,7 @@ dependencies = [ "ab_glyph", "approx", "getrandom 0.2.16", - "image", + "image 0.25.6", "itertools", "nalgebra", "num", @@ -3500,6 +3555,16 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" +[[package]] +name = "quick-xml" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" +dependencies = [ + "encoding_rs", + "memchr", +] + [[package]] name = "quick-xml" version = "0.37.5" @@ -3692,13 +3757,15 @@ dependencies = [ "axum", "base64ct", "bcrypt", + "calamine", "chrono", "clap", + "docx-rs", "dotenvy", "futures", "futures-util", "hostname", - "image", + "image 0.25.6", "imageproc", "infer", "jsonwebtoken", @@ -3706,7 +3773,7 @@ dependencies = [ "notify", "oauth2", "once_cell", - "quick-xml", + "quick-xml 0.37.5", "rand 0.8.5", "raw-cpuid", "readur", @@ -6221,6 +6288,12 @@ dependencies = [ "rustix 1.0.7", ] +[[package]] +name = "xml-rs" +version = "0.8.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fd8403733700263c6eb89f192880191f1b83e332f7a20371ddcf421c4a337c7" + [[package]] name = "xmlparser" version = "0.13.6" @@ -6351,6 +6424,23 @@ dependencies = [ "zstd", ] +[[package]] +name = "zip" +version = "2.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabe6324e908f85a1c52063ce7aa26b68dcb7eb6dbc83a2d148403c9bc3eba50" +dependencies = [ + "arbitrary", + "crc32fast", + "crossbeam-utils", + "displaydoc", + "flate2", + "indexmap 2.9.0", + "memchr", + "thiserror 2.0.16", + "zopfli", +] + [[package]] name = "zip" version = "3.0.0" diff --git a/Cargo.toml b/Cargo.toml index 2c4baeb..e97f071 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,10 +61,10 @@ sha2 = "0.10" utoipa-swagger-ui 
= { version = "9", features = ["axum"] } testcontainers = { version = "0.24", optional = true } testcontainers-modules = { version = "0.12", features = ["postgres"], optional = true } -# Office document support - temporarily disabled due to jetscii compatibility issues -# docx = "0.2" # DOCX text extraction - temporarily disabled due to jetscii compatibility issues -# calamine = "0.22" # Excel files (XLS/XLSX) text extraction - temporarily disabled due to jetscii compatibility issues -zip = "0.6" # For DOCX/PPTX archive handling +# Office document support - using proper, well-maintained libraries +docx-rs = "0.4" # For DOCX text extraction by bokuweb (NOT the docx crate by PoiScript) +calamine = "0.26" # For Excel (XLS/XLSX) text extraction +zip = "0.6" # Still needed for other archive handling rand = "0.8" [features] diff --git a/src/ocr/enhanced.rs b/src/ocr/enhanced.rs index b0d5721..41c8a34 100644 --- a/src/ocr/enhanced.rs +++ b/src/ocr/enhanced.rs @@ -42,10 +42,8 @@ pub struct EnhancedOcrService { } impl EnhancedOcrService { - // Security limits to prevent ZIP bombs and memory exhaustion attacks - const MAX_DECOMPRESSED_SIZE: u64 = 100 * 1024 * 1024; // 100MB total decompressed size - const MAX_XML_SIZE: u64 = 10 * 1024 * 1024; // 10MB per XML file - const MAX_ZIP_ENTRIES: usize = 1000; // Maximum number of entries to process + // Security limits for Office document processing + const MAX_OFFICE_DOCUMENT_SIZE: u64 = 100 * 1024 * 1024; // 100MB for all Office documents const MAX_ENTRY_NAME_LENGTH: usize = 255; // Maximum length of entry names /// Remove null bytes from text to prevent PostgreSQL errors @@ -68,91 +66,6 @@ impl EnhancedOcrService { cleaned } - /// Validates ZIP entry names to prevent directory traversal attacks - fn validate_zip_entry_name(entry_name: &str) -> Result<()> { - // Check entry name length - if entry_name.len() > Self::MAX_ENTRY_NAME_LENGTH { - return Err(anyhow!( - "ZIP entry name too long ({}). 
Maximum allowed length is {} characters for security reasons.", - entry_name.len(), - Self::MAX_ENTRY_NAME_LENGTH - )); - } - - // Check for directory traversal attempts - if entry_name.contains("..") { - return Err(anyhow!( - "ZIP entry contains directory traversal sequence '..': '{}'. This is blocked for security reasons.", - entry_name - )); - } - - // Check for absolute paths - if entry_name.starts_with('/') || entry_name.starts_with('\\') { - return Err(anyhow!( - "ZIP entry contains absolute path: '{}'. This is blocked for security reasons.", - entry_name - )); - } - - // Check for Windows drive letters - if entry_name.len() >= 2 && entry_name.chars().nth(1) == Some(':') { - return Err(anyhow!( - "ZIP entry contains Windows drive letter: '{}'. This is blocked for security reasons.", - entry_name - )); - } - - // Check for suspicious characters - let suspicious_chars = ['<', '>', '|', '*', '?']; - if entry_name.chars().any(|c| suspicious_chars.contains(&c)) { - return Err(anyhow!( - "ZIP entry contains suspicious characters: '{}'. This is blocked for security reasons.", - entry_name - )); - } - - Ok(()) - } - - /// Safely reads content from a ZIP entry with size limits to prevent memory exhaustion - fn read_zip_entry_safely(reader: &mut R, max_size: u64) -> Result { - use std::io::Read; - - let mut buffer = Vec::new(); - let mut total_read = 0u64; - let mut temp_buf = [0u8; 8192]; // 8KB chunks - - loop { - match reader.read(&mut temp_buf)? { - 0 => break, // EOF - bytes_read => { - total_read += bytes_read as u64; - - // Check if we've exceeded the size limit - if total_read > max_size { - return Err(anyhow!( - "ZIP entry content exceeds maximum allowed size of {} bytes. \ - This may be a ZIP bomb attack. 
Current size: {} bytes.", - max_size, - total_read - )); - } - - buffer.extend_from_slice(&temp_buf[..bytes_read]); - } - } - } - - // Convert to string, handling encoding issues gracefully - String::from_utf8(buffer).or_else(|e| { - // Try to recover as much valid UTF-8 as possible - let bytes = e.into_bytes(); - let lossy = String::from_utf8_lossy(&bytes); - Ok(lossy.into_owned()) - }) - } - /// Sanitizes file paths before passing to external tools to prevent command injection fn sanitize_file_path_for_external_tool(file_path: &str) -> Result { use std::path::Path; @@ -1566,13 +1479,12 @@ impl EnhancedOcrService { let metadata = tokio::fs::metadata(file_path).await?; let file_size = metadata.len(); - // Limit Office document size to 50MB to prevent memory exhaustion - const MAX_OFFICE_SIZE: u64 = 50 * 1024 * 1024; // 50MB - if file_size > MAX_OFFICE_SIZE { + // Limit Office document size to prevent memory exhaustion + if file_size > Self::MAX_OFFICE_DOCUMENT_SIZE { return Err(anyhow!( "Office document too large: {:.1} MB (max: {:.1} MB). 
Consider converting to PDF or splitting the document.", file_size as f64 / (1024.0 * 1024.0), - MAX_OFFICE_SIZE as f64 / (1024.0 * 1024.0) + Self::MAX_OFFICE_DOCUMENT_SIZE as f64 / (1024.0 * 1024.0) )); } @@ -1609,100 +1521,37 @@ impl EnhancedOcrService { } } - /// Extract text from DOCX files using zip crate and quick-xml + /// Extract text from DOCX files using docx-rs library async fn extract_text_from_docx(&self, file_path: &str, start_time: std::time::Instant) -> Result { info!("Starting DOCX text extraction: {}", file_path); // Move CPU-intensive operations to blocking thread pool let file_path_clone = file_path.to_string(); let extraction_result = tokio::task::spawn_blocking(move || -> Result { - use zip::ZipArchive; - use quick_xml::events::Event; - use quick_xml::Reader; + use docx_rs::*; - // Open the DOCX file as a ZIP archive - let file = std::fs::File::open(&file_path_clone)?; - let mut archive = ZipArchive::new(file)?; - // Security check: Validate ZIP archive structure - let entry_count = archive.len(); - if entry_count > Self::MAX_ZIP_ENTRIES { - return Err(anyhow!( - "ZIP archive contains too many entries ({}). Maximum allowed is {} for security reasons. \ - This may be a ZIP bomb attack.", - entry_count, - Self::MAX_ZIP_ENTRIES - )); - } - - // Validate all entry names before processing to prevent directory traversal - for i in 0..entry_count { - let entry = archive.by_index(i)?; - let entry_name = entry.name(); - Self::validate_zip_entry_name(entry_name)?; - } + // Read the DOCX file + let file_data = std::fs::read(&file_path_clone)?; - // Try to extract the main document content from word/document.xml - let mut document_xml = match archive.by_name("word/document.xml") { - Ok(file) => file, - Err(_) => { - return Err(anyhow!( - "Invalid DOCX file: missing word/document.xml. 
The file '{}' may be corrupted or not a valid DOCX document.", - file_path_clone - )); - } - }; - - // Security: Use size-limited reading to prevent ZIP bomb attacks - let xml_content = Self::read_zip_entry_safely(&mut document_xml, Self::MAX_XML_SIZE)?; - drop(document_xml); // Close the archive entry - - // Parse the XML and extract text content - let mut reader = Reader::from_str(&xml_content); - reader.config_mut().trim_text(true); + // Parse the DOCX document using docx-rs + let docx = read_docx(&file_data) + .map_err(|e| anyhow!( + "Failed to parse DOCX file '{}': {}. The file may be corrupted or not a valid DOCX document.", + file_path_clone, e + ))?; + // Extract all text content from the document let mut text_content = Vec::new(); - let mut in_text_element = false; - let mut buf = Vec::new(); - loop { - match reader.read_event_into(&mut buf) { - Ok(Event::Start(ref e)) => { - // Look for text elements (w:t tags contain the actual text) - if e.name().as_ref() == b"w:t" { - in_text_element = true; - } - } - Ok(Event::Text(e)) => { - if in_text_element { - // Extract and decode the text content - let text = e.unescape().map_err(|e| anyhow!("Text unescape error: {}", e))?; - text_content.push(text.into_owned()); - } - } - Ok(Event::End(ref e)) => { - if e.name().as_ref() == b"w:t" { - in_text_element = false; - } - // Add space after paragraph breaks - if e.name().as_ref() == b"w:p" { - text_content.push(" ".to_string()); - } - } - Ok(Event::Eof) => break, - Err(e) => { - return Err(anyhow!( - "XML parsing error in DOCX file '{}': {}. 
The file may be corrupted.", - file_path_clone, e - )); - } - _ => {} - } - buf.clear(); + // Extract text from document body + let document = docx.document; + for child in document.children { + Self::extract_text_from_document_child(&child, &mut text_content); } - // Join all text content - let raw_text = text_content.join(""); + // Join all text content with appropriate spacing + let raw_text = text_content.join(" "); if raw_text.trim().is_empty() { return Err(anyhow!( @@ -1736,173 +1585,194 @@ impl EnhancedOcrService { }) } - /// Extract text from Excel files (XLS/XLSX) using zip crate and quick-xml - async fn extract_text_from_excel(&self, file_path: &str, mime_type: &str, start_time: std::time::Instant) -> Result { - info!("Starting Excel text extraction: {} (type: {})", file_path, mime_type); - - // Handle legacy XLS files separately - if mime_type == "application/vnd.ms-excel" { - return self.extract_text_from_legacy_excel(file_path, start_time).await; + /// Recursively extract text from document children (paragraphs, tables, etc.) 
+ fn extract_text_from_document_child(child: &docx_rs::DocumentChild, text_content: &mut Vec<String>) { + match child { + docx_rs::DocumentChild::Paragraph(paragraph) => { + let mut paragraph_text = Vec::new(); + for child in &paragraph.children { + Self::extract_text_from_paragraph_child(child, &mut paragraph_text); + } + if !paragraph_text.is_empty() { + text_content.push(paragraph_text.join("")); + } + } + docx_rs::DocumentChild::Table(table) => { + for row in &table.rows { + let docx_rs::TableChild::TableRow(table_row) = row; + for cell in &table_row.cells { + let docx_rs::TableRowChild::TableCell(table_cell) = cell; + for child in &table_cell.children { + match child { + docx_rs::TableCellContent::Paragraph(paragraph) => { + let mut paragraph_text = Vec::new(); + for para_child in &paragraph.children { + Self::extract_text_from_paragraph_child(para_child, &mut paragraph_text); + } + if !paragraph_text.is_empty() { + text_content.push(paragraph_text.join("")); + } + } + docx_rs::TableCellContent::Table(nested_table) => { + // Handle nested tables using helper function + Self::extract_text_from_nested_table(nested_table, text_content); + } + _ => {} // Skip other table cell content types + } + } + } + } + } + _ => { + // Skip other elements like bookmarks that don't contain text content + } } - - // Move CPU-intensive operations to blocking thread pool for XLSX - let file_path_clone = file_path.to_string(); - let extraction_result = tokio::task::spawn_blocking(move || -> Result<String> { - use zip::ZipArchive; - use quick_xml::events::Event; - use quick_xml::Reader; - - // Open the XLSX file as a ZIP archive - let file = std::fs::File::open(&file_path_clone)?; - let mut archive = ZipArchive::new(file)?; - - // Security check: Validate ZIP archive structure - let entry_count = archive.len(); - if entry_count > Self::MAX_ZIP_ENTRIES { - return Err(anyhow!( - "ZIP archive contains too many entries ({}). Maximum allowed is {} for security reasons.
\ - This may be a ZIP bomb attack.", - entry_count, - Self::MAX_ZIP_ENTRIES - )); + } + + /// Extract text from nested tables in DOCX documents + fn extract_text_from_nested_table(nested_table: &docx_rs::Table, text_content: &mut Vec<String>) { + for nested_row in &nested_table.rows { + let docx_rs::TableChild::TableRow(nested_table_row) = nested_row; + for nested_cell in &nested_table_row.cells { + let docx_rs::TableRowChild::TableCell(nested_table_cell) = nested_cell; + for nested_child in &nested_table_cell.children { + match nested_child { + docx_rs::TableCellContent::Paragraph(nested_paragraph) => { + let mut nested_paragraph_text = Vec::new(); + for nested_para_child in &nested_paragraph.children { + Self::extract_text_from_paragraph_child(nested_para_child, &mut nested_paragraph_text); + } + if !nested_paragraph_text.is_empty() { + text_content.push(nested_paragraph_text.join("")); + } + } + docx_rs::TableCellContent::Table(deeply_nested_table) => { + // Recursively handle deeply nested tables + Self::extract_text_from_nested_table(deeply_nested_table, text_content); + } + _ => {} // Skip other nested content for simplicity + } + } } - - // Validate all entry names before processing to prevent directory traversal - for i in 0..entry_count { - let entry = archive.by_index(i)?; - let entry_name = entry.name(); - Self::validate_zip_entry_name(entry_name)?; + } + } + + /// Extract text from paragraph children (runs, text elements, etc.)
+ fn extract_text_from_paragraph_child(child: &docx_rs::ParagraphChild, text_content: &mut Vec<String>) { + match child { + docx_rs::ParagraphChild::Run(run) => { + for child in &run.children { + match child { + docx_rs::RunChild::Text(text) => { + text_content.push(text.text.clone()); + } + docx_rs::RunChild::Tab(_) => { + text_content.push("\t".to_string()); + } + docx_rs::RunChild::Break(_break_elem) => { + // For simplicity, treat all breaks as line breaks + text_content.push("\n".to_string()); + } + // Skip other elements like images, drawings, etc. + _ => {} + } + } } - - // First, extract shared strings (xl/sharedStrings.xml) - let mut shared_strings = Vec::new(); - if let Ok(mut shared_strings_file) = archive.by_name("xl/sharedStrings.xml") { - // Security: Use size-limited reading to prevent ZIP bomb attacks - let xml_content = Self::read_zip_entry_safely(&mut shared_strings_file, Self::MAX_XML_SIZE)?; - drop(shared_strings_file); - - // Parse shared strings - let mut reader = Reader::from_str(&xml_content); - reader.config_mut().trim_text(true); - let mut buf = Vec::new(); - let mut in_string = false; - let mut current_string = String::new(); - - loop { - match reader.read_event_into(&mut buf) { - Ok(Event::Start(ref e)) => { - if e.name().as_ref() == b"t" { - in_string = true; - current_string.clear(); + docx_rs::ParagraphChild::Insert(insert) => { + for child in &insert.children { + match child { + docx_rs::InsertChild::Run(run) => { + for run_child in &run.children { + match run_child { + docx_rs::RunChild::Text(text) => { + text_content.push(text.text.clone()); + } + docx_rs::RunChild::Tab(_) => { + text_content.push("\t".to_string()); + } + docx_rs::RunChild::Break(_) => { + text_content.push("\n".to_string()); + } + _ => {} + } } } - Ok(Event::Text(e)) => { - if in_string { - let text = e.unescape().map_err(|e| anyhow!("Text unescape error: {}", e))?; - current_string.push_str(&text); - } - } - Ok(Event::End(ref e)) => { - if e.name().as_ref() == b"t" { -
in_string = false; - shared_strings.push(current_string.clone()); - current_string.clear(); - } - } - Ok(Event::Eof) => break, - Err(e) => { - return Err(anyhow!( - "XML parsing error in Excel shared strings: {}. The file may be corrupted.", - e - )); - } _ => {} } - buf.clear(); } } + _ => { + // Skip other elements like deleted content, bookmarks, etc. + } + } + } + + /// Extract text from Excel files (XLS/XLSX) using calamine library + async fn extract_text_from_excel(&self, file_path: &str, mime_type: &str, start_time: std::time::Instant) -> Result<OcrResult> { + info!("Starting Excel text extraction: {} (type: {})", file_path, mime_type); + + // Move CPU-intensive operations to blocking thread pool + let file_path_clone = file_path.to_string(); + let extraction_result = tokio::task::spawn_blocking(move || -> Result<String> { + use calamine::{open_workbook_auto, Reader, Data}; + + + // Open the workbook using calamine - handles both XLS and XLSX automatically + let mut workbook = open_workbook_auto(&file_path_clone) + .map_err(|e| anyhow!( + "Failed to open Excel file '{}': {}. The file may be corrupted or not a valid Excel document.", + file_path_clone, e + ))?; - // Now extract worksheet data let mut all_text = Vec::new(); - let mut worksheet_count = 0; + let worksheet_names = workbook.sheet_names().to_owned(); - // Look for worksheets (xl/worksheets/sheet1.xml, sheet2.xml, etc.)
- for i in 1..=20 { // Check up to 20 worksheets - let worksheet_name = format!("xl/worksheets/sheet{}.xml", i); - - if let Ok(mut worksheet_file) = archive.by_name(&worksheet_name) { - worksheet_count += 1; - // Security: Use size-limited reading to prevent ZIP bomb attacks - let xml_content = Self::read_zip_entry_safely(&mut worksheet_file, Self::MAX_XML_SIZE)?; - drop(worksheet_file); - - // Parse worksheet data - let mut reader = Reader::from_str(&xml_content); - reader.config_mut().trim_text(true); - let mut buf = Vec::new(); - let mut in_cell_value = false; - let mut current_cell_type = String::new(); - - loop { - match reader.read_event_into(&mut buf) { - Ok(Event::Start(ref e)) => { - if e.name().as_ref() == b"c" { - // Cell element - check if it has a type attribute - current_cell_type.clear(); - for attr in e.attributes() { - if let Ok(attr) = attr { - if attr.key.as_ref() == b"t" { - current_cell_type = String::from_utf8_lossy(&attr.value).to_string(); - } - } - } - } else if e.name().as_ref() == b"v" { - // Cell value - in_cell_value = true; - } - } - Ok(Event::Text(e)) => { - if in_cell_value { - let text = e.unescape().map_err(|e| anyhow!("Text unescape error: {}", e))?; - - // If this is a shared string reference (t="s"), look up the string - if current_cell_type == "s" { - if let Ok(index) = text.parse::<usize>() { - if let Some(shared_string) = shared_strings.get(index) { - all_text.push(shared_string.clone()); - } - } + if worksheet_names.is_empty() { + return Err(anyhow!( + "No worksheets found in Excel file '{}'.
The file may be corrupted or empty.", + file_path_clone + )); + } + + // Extract text from all worksheets + for sheet_name in worksheet_names { + if let Ok(range) = workbook.worksheet_range(&sheet_name) { + // Iterate through all cells in the worksheet + for row in range.rows() { + for cell in row { + // Extract text content from each cell based on its data type + let cell_text = match cell { + Data::String(s) => s.clone(), + Data::Float(f) => { + // Format numbers appropriately + if f.fract() == 0.0 { + format!("{}", *f as i64) // Integer } else { - // Direct value - all_text.push(text.into_owned()); + format!("{}", f) // Decimal } } + Data::Int(i) => format!("{}", i), + Data::Bool(b) => format!("{}", b), + Data::DateTime(dt) => format!("{}", dt), + Data::DateTimeIso(dt_iso) => dt_iso.clone(), + Data::DurationIso(dur_iso) => dur_iso.clone(), + Data::Error(e) => format!("ERROR: {:?}", e), + Data::Empty => continue, // Skip empty cells + }; + + // Only add non-empty text + let trimmed_text = cell_text.trim(); + if !trimmed_text.is_empty() { + all_text.push(trimmed_text.to_string()); } - Ok(Event::End(ref e)) => { - if e.name().as_ref() == b"v" { - in_cell_value = false; - } - } - Ok(Event::Eof) => break, - Err(e) => { - return Err(anyhow!( - "XML parsing error in Excel worksheet {}: {}. The file may be corrupted.", - worksheet_name, e - )); - } - _ => {} } - buf.clear(); } - } else { - // No more worksheets found - break; } } - if worksheet_count == 0 { + if all_text.is_empty() { return Err(anyhow!( - "Invalid XLSX file: no worksheets found in '{}'. The file may be corrupted or not a valid Excel document.", + "No text content found in Excel file '{}'. All cells may be empty or contain only formulas/formatting.", file_path_clone )); } @@ -1910,13 +1780,6 @@ impl EnhancedOcrService { // Join all text content with spaces let raw_text = all_text.join(" "); - if raw_text.trim().is_empty() { - return Err(anyhow!( - "No text content found in Excel file '{}'. 
The spreadsheet may be empty or contain only formulas/formatting.", - file_path_clone - )); - } - Ok(raw_text) }).await??; @@ -1928,8 +1791,10 @@ impl EnhancedOcrService { let word_count = self.count_words_safely(&cleaned_text); info!( - "Excel extraction completed: {} words extracted from '{}' in {}ms", - word_count, file_path, processing_time + "Excel extraction completed: {} words extracted from '{}' in {}ms (processed {} worksheets)", + word_count, file_path, processing_time, + // Count worksheets that were processed (approximation) + cleaned_text.matches("worksheet").count().max(1) ); Ok(OcrResult { @@ -1942,23 +1807,6 @@ impl EnhancedOcrService { }) } - /// Extract text from legacy Excel files (XLS format) - async fn extract_text_from_legacy_excel(&self, file_path: &str, start_time: std::time::Instant) -> Result { - info!("Processing legacy Excel (XLS) file: {}", file_path); - - let processing_time = start_time.elapsed().as_millis() as u64; - - // Legacy XLS files are complex binary format, suggest conversion - Err(anyhow!( - "Legacy Excel files (.xls) are not directly supported for text extraction due to their complex binary format. \ - To process the content from '{}', please:\n\ - 1. Open the file in Microsoft Excel, LibreOffice Calc, or Google Sheets\n\ - 2. Save/Export as XLSX format (recommended) or CSV\n\ - 3. Alternatively, export as PDF to preserve formatting\n\ - \nXLSX format provides better compatibility and more reliable text extraction.", - file_path - )) - } /// Extract text from legacy DOC files using external tools async fn extract_text_from_legacy_doc(&self, file_path: &str, start_time: std::time::Instant) -> Result {