From 149c3b9a3fb9e08c6b8d2717b372f9ee58fe944d Mon Sep 17 00:00:00 2001 From: perf3ct Date: Tue, 2 Sep 2025 03:47:20 +0000 Subject: [PATCH] feat(office): yeet unused fallback strategy --- src/ocr/fallback_strategy.rs | 220 ------------------ src/ocr/mod.rs | 102 +++----- tests/integration_office_extraction.rs | 84 +++---- tests/integration_webdav_integration_tests.rs | 6 + 4 files changed, 65 insertions(+), 347 deletions(-) delete mode 100644 src/ocr/fallback_strategy.rs diff --git a/src/ocr/fallback_strategy.rs b/src/ocr/fallback_strategy.rs deleted file mode 100644 index 2b65a9b..0000000 --- a/src/ocr/fallback_strategy.rs +++ /dev/null @@ -1,220 +0,0 @@ -use anyhow::Result; -use serde::{Deserialize, Serialize}; -use tracing::{info, warn}; - -use super::xml_extractor::{OfficeExtractionResult, XmlOfficeExtractor}; - -#[cfg(test)] -use anyhow::anyhow; - -/// Configuration for XML-based Office document extraction -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct FallbackConfig { - /// Enable XML extraction - pub enabled: bool, - /// Maximum number of retry attempts for transient failures - pub max_retries: u32, - /// Initial retry delay in milliseconds - pub initial_retry_delay_ms: u64, - /// Maximum retry delay in milliseconds - pub max_retry_delay_ms: u64, - /// Timeout for XML extraction in seconds - pub xml_timeout_seconds: u64, -} - - -impl Default for FallbackConfig { - fn default() -> Self { - Self { - enabled: true, - max_retries: 3, - initial_retry_delay_ms: 1000, - max_retry_delay_ms: 30000, - xml_timeout_seconds: 180, - } - } -} - - - -/// Statistics for monitoring XML extraction performance -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct FallbackStats { - pub total_extractions: u64, - pub xml_successes: u64, - pub retry_attempts: u64, - pub average_processing_time_ms: f64, - pub success_rate_percentage: f64, -} - -impl Default for FallbackStats { - fn default() -> Self { - Self { - total_extractions: 0, - xml_successes: 0, - retry_attempts: 0, - average_processing_time_ms: 0.0, - success_rate_percentage: 100.0, - } - } -} - -/// XML-based Office document extraction service -pub struct FallbackStrategy { - config: FallbackConfig, - xml_extractor: XmlOfficeExtractor, - stats: std::sync::Arc>, -} - -impl FallbackStrategy { - /// Create a new XML extraction service - pub fn new(config: FallbackConfig, temp_dir: String) -> Self { - Self { - config, - xml_extractor: XmlOfficeExtractor::new(temp_dir), - stats: std::sync::Arc::new(std::sync::RwLock::new(FallbackStats::default())), - } - } - - /// Extract Office document using XML extraction - pub async fn extract_with_fallback( - &self, - file_path: &str, - mime_type: &str, - ) -> Result { - let start_time = std::time::Instant::now(); - let document_type = self.get_document_type(mime_type); - - info!("Starting XML extraction for {} (type: {})", file_path, document_type); - - // Update total extraction count - if let Ok(mut stats) = self.stats.write() { - stats.total_extractions += 1; - } - - // Use XML extraction as the only method - let result = self.execute_xml_extraction(file_path, mime_type).await; - - let processing_time = start_time.elapsed(); - - // Update statistics - self.update_stats(&result, processing_time).await; - - result - } - - /// Execute XML extraction directly - async fn execute_xml_extraction( - &self, - file_path: &str, - mime_type: &str, - ) -> Result { - let result = self.xml_extractor.extract_text_from_office(file_path, mime_type).await?; - - // Update stats - if let Ok(mut stats) = self.stats.write() { - stats.xml_successes += 1; - } - - Ok(result) - } - - - /// Get document type from MIME type - fn get_document_type(&self, mime_type: &str) -> String { - match mime_type { - "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => "docx".to_string(), - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => "xlsx".to_string(), - "application/vnd.openxmlformats-officedocument.presentationml.presentation" => "pptx".to_string(), - "application/msword" => "doc".to_string(), - "application/vnd.ms-excel" => "xls".to_string(), - "application/vnd.ms-powerpoint" => "ppt".to_string(), - "application/pdf" => "pdf".to_string(), - _ => "unknown".to_string(), - } - } - - /// Update statistics after extraction - async fn update_stats(&self, result: &Result, processing_time: std::time::Duration) { - if let Ok(mut stats) = self.stats.write() { - let processing_time_ms = processing_time.as_millis() as f64; - - // Update average processing time using exponential moving average - let alpha = 0.1; // Smoothing factor - stats.average_processing_time_ms = - alpha * processing_time_ms + (1.0 - alpha) * stats.average_processing_time_ms; - - // Update success rate with proper division by zero protection - let total_attempts = stats.total_extractions; - let successful_attempts = stats.xml_successes; - - if total_attempts > 0 { - stats.success_rate_percentage = (successful_attempts as f64 / total_attempts as f64) * 100.0; - } else if result.is_ok() { - stats.success_rate_percentage = 100.0; - } - } - } - - /// Get current statistics - pub async fn get_stats(&self) -> FallbackStats { - self.stats.read() - .map(|stats| stats.clone()) - .unwrap_or_else(|_| { - warn!("Failed to acquire read lock on stats, returning default"); - FallbackStats::default() - }) - } - - /// Reset statistics - pub async fn reset_stats(&self) { - if let Ok(mut stats) = self.stats.write() { - *stats = FallbackStats::default(); - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use tempfile::TempDir; - - fn create_test_strategy() -> (FallbackStrategy, TempDir) { - let temp_dir = TempDir::new().unwrap(); - let config = FallbackConfig::default(); - let strategy = FallbackStrategy::new(config, temp_dir.path().to_string_lossy().to_string()); - (strategy, temp_dir) - } - - #[tokio::test] - async fn test_stats_tracking() { - let (strategy, _temp_dir) = create_test_strategy(); - - let initial_stats = strategy.get_stats().await; - assert_eq!(initial_stats.total_extractions, 0); - - // Simulate some operations by updating stats directly - if let Ok(mut stats) = strategy.stats.write() { - stats.total_extractions = 10; - stats.xml_successes = 9; - // Calculate success rate manually as update_stats would do - stats.success_rate_percentage = (9.0 / 10.0) * 100.0; - } - - let updated_stats = strategy.get_stats().await; - assert_eq!(updated_stats.total_extractions, 10); - assert_eq!(updated_stats.xml_successes, 9); - assert_eq!(updated_stats.success_rate_percentage, 90.0); // 9 successes out of 10 - } - - #[test] - fn test_get_document_type() { - let (strategy, _temp_dir) = create_test_strategy(); - - assert_eq!(strategy.get_document_type("application/vnd.openxmlformats-officedocument.wordprocessingml.document"), "docx"); - assert_eq!(strategy.get_document_type("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), "xlsx"); - assert_eq!(strategy.get_document_type("application/vnd.openxmlformats-officedocument.presentationml.presentation"), "pptx"); - assert_eq!(strategy.get_document_type("application/pdf"), "pdf"); - assert_eq!(strategy.get_document_type("unknown/type"), "unknown"); - } -} \ No newline at end of file diff --git a/src/ocr/mod.rs b/src/ocr/mod.rs index b23f1ab..4f343a3 100644 --- a/src/ocr/mod.rs +++ b/src/ocr/mod.rs @@ -2,7 +2,6 @@ pub mod api; pub mod enhanced; pub mod enhanced_processing; pub mod error; -pub mod fallback_strategy; pub mod health; pub mod queue; pub mod tests; @@ -12,21 +11,18 @@ use anyhow::{anyhow, Result}; use std::path::Path; use crate::ocr::error::OcrError; use crate::ocr::health::OcrHealthChecker; -use crate::ocr::fallback_strategy::{FallbackStrategy, FallbackConfig}; #[cfg(feature = "ocr")] use tesseract::Tesseract; pub struct OcrService { health_checker: OcrHealthChecker, - fallback_strategy: Option, + temp_dir: String, } /// Configuration for the OCR service #[derive(Debug, Clone)] pub struct OcrConfig { - /// Fallback configuration - pub fallback_config: FallbackConfig, /// Temporary directory for processing pub temp_dir: String, } @@ -34,7 +30,6 @@ pub struct OcrConfig { impl Default for OcrConfig { fn default() -> Self { Self { - fallback_config: FallbackConfig::default(), temp_dir: std::env::var("TEMP_DIR").unwrap_or_else(|_| "/tmp".to_string()), } } @@ -44,21 +39,15 @@ impl OcrService { pub fn new() -> Self { Self { health_checker: OcrHealthChecker::new(), - fallback_strategy: None, + temp_dir: std::env::var("TEMP_DIR").unwrap_or_else(|_| "/tmp".to_string()), } } /// Create OCR service with configuration pub fn new_with_config(config: OcrConfig) -> Self { - let fallback_strategy = if config.fallback_config.enabled { - Some(FallbackStrategy::new(config.fallback_config, config.temp_dir)) - } else { - None - }; - Self { health_checker: OcrHealthChecker::new(), - fallback_strategy, + temp_dir: config.temp_dir, } } @@ -201,37 +190,21 @@ impl OcrService { file_path: &str, mime_type: &str, ) -> Result { - match &self.fallback_strategy { - Some(strategy) => { - let result = strategy.extract_with_fallback(file_path, mime_type).await?; - // Convert the result to OcrResult for backward compatibility - Ok(crate::ocr::enhanced::OcrResult { - text: result.text, - confidence: result.confidence, - processing_time_ms: result.processing_time_ms, - word_count: result.word_count, - preprocessing_applied: vec![format!("XML extraction - {}", result.extraction_method)], - processed_image_path: None, - }) - } - None => { - // Use basic XML extraction if no strategy is configured - let xml_extractor = crate::ocr::xml_extractor::XmlOfficeExtractor::new( - std::env::var("TEMP_DIR").unwrap_or_else(|_| "/tmp".to_string()) - ); - - let result = xml_extractor.extract_text_from_office(file_path, mime_type).await?; - // Convert OfficeExtractionResult to OcrResult for backward compatibility - Ok(crate::ocr::enhanced::OcrResult { - text: result.text, - confidence: result.confidence, - processing_time_ms: result.processing_time_ms, - word_count: result.word_count, - preprocessing_applied: vec![format!("XML extraction - {}", result.extraction_method)], - processed_image_path: None, - }) - } - } + // Use XML extraction directly + let xml_extractor = crate::ocr::xml_extractor::XmlOfficeExtractor::new( + self.temp_dir.clone() + ); + + let result = xml_extractor.extract_text_from_office(file_path, mime_type).await?; + // Convert OfficeExtractionResult to OcrResult for backward compatibility + Ok(crate::ocr::enhanced::OcrResult { + text: result.text, + confidence: result.confidence, + processing_time_ms: result.processing_time_ms, + word_count: result.word_count, + preprocessing_applied: vec![format!("XML extraction - {}", result.extraction_method)], + processed_image_path: None, + }) } /// Extract text from Office documents with custom configuration @@ -331,28 +304,10 @@ impl OcrService { } } - /// Get XML extraction statistics - pub async fn get_fallback_stats(&self) -> Option { - match &self.fallback_strategy { - Some(strategy) => Some(strategy.get_stats().await), - None => None, - } - } - - /// Reset XML extraction statistics - pub async fn reset_fallback_stats(&self) -> Result<()> { - match &self.fallback_strategy { - Some(strategy) => { - strategy.reset_stats().await; - Ok(()) - } - None => Err(anyhow!("XML extraction strategy not configured")), - } - } /// Check if Office document extraction is available pub fn supports_office_documents(&self) -> bool { - self.fallback_strategy.is_some() + true // XML extraction is always available } /// Get supported MIME types @@ -367,16 +322,15 @@ impl OcrService { "text/plain", ]; - if self.supports_office_documents() { - types.extend_from_slice(&[ - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - "application/vnd.openxmlformats-officedocument.presentationml.presentation", - "application/msword", - "application/vnd.ms-excel", - "application/vnd.ms-powerpoint", - ]); - } + // Office document types are always supported via XML extraction + types.extend_from_slice(&[ + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + "application/msword", + "application/vnd.ms-excel", + "application/vnd.ms-powerpoint", + ]); types } diff --git a/tests/integration_office_extraction.rs b/tests/integration_office_extraction.rs index b2f1231..0396cf1 100644 --- a/tests/integration_office_extraction.rs +++ b/tests/integration_office_extraction.rs @@ -7,7 +7,6 @@ use tokio::time::timeout; use readur::ocr::{ OcrService, OcrConfig, - fallback_strategy::FallbackConfig, }; /// Test utilities for creating mock Office documents @@ -72,7 +71,7 @@ impl OfficeTestDocuments { let file = fs::File::create(&file_path)?; let mut zip = zip::ZipWriter::new(file); - // Add [Content_Types].xml + // Add [Content_Types].xml with shared strings support zip.start_file("[Content_Types].xml", zip::write::FileOptions::default())?; zip.write_all(br#" @@ -80,6 +79,7 @@ impl OfficeTestDocuments { + "#)?; // Add _rels/.rels @@ -98,26 +98,42 @@ impl OfficeTestDocuments { "#)?; - // Add xl/_rels/workbook.xml.rels + // Add xl/_rels/workbook.xml.rels with shared strings relationship zip.start_file("xl/_rels/workbook.xml.rels", zip::write::FileOptions::default())?; zip.write_all(br#" + "#)?; - // Add xl/worksheets/sheet1.xml with actual content + // Add xl/sharedStrings.xml with the text content + zip.start_file("xl/sharedStrings.xml", zip::write::FileOptions::default())?; + let mut shared_strings_xml = String::from(r#" +"#); + shared_strings_xml = shared_strings_xml.replace("{count}", &content.len().to_string()); + + for cell_content in content { + shared_strings_xml.push_str(&format!(r#" + {}"#, cell_content)); + } + + shared_strings_xml.push_str(r#" +"#); + zip.write_all(shared_strings_xml.as_bytes())?; + + // Add xl/worksheets/sheet1.xml with references to shared strings zip.start_file("xl/worksheets/sheet1.xml", zip::write::FileOptions::default())?; let mut worksheet_xml = String::from(r#" "#); - for (row_idx, cell_content) in content.iter().enumerate() { + for (row_idx, _) in content.iter().enumerate() { worksheet_xml.push_str(&format!(r#" - - {} + + {} - "#, row_idx + 1, row_idx + 1, cell_content)); + "#, row_idx + 1, row_idx + 1, row_idx)); } worksheet_xml.push_str(r#" @@ -146,16 +162,9 @@ impl OfficeTestDocuments { } } -/// Create a test OCR service with fallback strategy +/// Create a test OCR service with XML extraction fn create_test_ocr_service(temp_dir: &str) -> OcrService { let config = OcrConfig { - fallback_config: FallbackConfig { - enabled: true, - max_retries: 2, - initial_retry_delay_ms: 100, - max_retry_delay_ms: 1000, - xml_timeout_seconds: 60, - }, temp_dir: temp_dir.to_string(), }; @@ -224,7 +233,6 @@ async fn test_extraction_modes() -> Result<()> { // Test XML extraction with the simplified approach let ocr_config = OcrConfig { - fallback_config: FallbackConfig::default(), temp_dir: temp_dir.clone(), }; @@ -250,15 +258,8 @@ async fn test_fallback_mechanism() -> Result<()> { let test_docs = OfficeTestDocuments::new()?; let temp_dir = test_docs.temp_dir.path().to_string_lossy().to_string(); - // Create a service with XML-only mode (simplified) + // Create a service with XML extraction let config = OcrConfig { - fallback_config: FallbackConfig { - enabled: true, - max_retries: 1, - initial_retry_delay_ms: 50, - max_retry_delay_ms: 200, - xml_timeout_seconds: 30, - }, temp_dir, }; @@ -387,15 +388,8 @@ async fn test_concurrent_extraction() -> Result<()> { async fn test_circuit_breaker() -> Result<()> { let test_docs = OfficeTestDocuments::new()?; - // Create service with simple retry settings (circuit breaker functionality removed) + // Create service with XML extraction let config = OcrConfig { - fallback_config: FallbackConfig { - enabled: true, - max_retries: 0, // No retries to make failures immediate - initial_retry_delay_ms: 10, - max_retry_delay_ms: 100, - xml_timeout_seconds: 30, - }, temp_dir: test_docs.temp_dir.path().to_string_lossy().to_string(), }; @@ -442,13 +436,7 @@ async fn test_statistics_tracking() -> Result<()> { let test_docs = OfficeTestDocuments::new()?; let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref()); - // Reset stats - ocr_service.reset_fallback_stats().await?; - - let initial_stats = ocr_service.get_fallback_stats().await.unwrap(); - assert_eq!(initial_stats.total_extractions, 0); - - // Perform some extractions + // Perform some extractions to verify functionality let valid_path = test_docs.create_mock_docx("stats_test.docx", "Statistics test document")?; for i in 0..3 { @@ -462,13 +450,10 @@ async fn test_statistics_tracking() -> Result<()> { assert!(!ocr_result.text.is_empty()); assert!(ocr_result.confidence > 0.0); assert!(ocr_result.word_count > 0); + assert!(ocr_result.processing_time_ms > 0); } - // Check updated stats - let final_stats = ocr_service.get_fallback_stats().await.unwrap(); - assert_eq!(final_stats.total_extractions, 3); - assert!(final_stats.success_rate_percentage > 0.0); - assert!(final_stats.average_processing_time_ms > 0.0); + // All extractions succeeded, indicating the XML extraction is working correctly Ok(()) } @@ -495,15 +480,8 @@ async fn test_mime_type_support() -> Result<()> { async fn test_learning_mechanism() -> Result<()> { let test_docs = OfficeTestDocuments::new()?; - // Create service with simple XML extraction (learning functionality removed) + // Create service with XML extraction let config = OcrConfig { - fallback_config: FallbackConfig { - enabled: true, - max_retries: 1, - initial_retry_delay_ms: 10, - max_retry_delay_ms: 100, - xml_timeout_seconds: 30, - }, temp_dir: test_docs.temp_dir.path().to_string_lossy().to_string(), }; diff --git a/tests/integration_webdav_integration_tests.rs b/tests/integration_webdav_integration_tests.rs index afc8149..c3cfa4a 100644 --- a/tests/integration_webdav_integration_tests.rs +++ b/tests/integration_webdav_integration_tests.rs @@ -72,6 +72,9 @@ fn create_empty_update_settings() -> UpdateSettings { webdav_file_extensions: None, webdav_auto_sync: None, webdav_sync_interval_minutes: None, + // Office document extraction configuration + office_extraction_timeout_seconds: None, + office_extraction_enable_detailed_logging: None, } } @@ -215,6 +218,9 @@ async fn setup_webdav_settings(state: &AppState, user_id: Uuid) { ocr_quality_threshold_noise: None, ocr_quality_threshold_sharpness: None, ocr_skip_enhancement: None, + // Office document extraction configuration + office_extraction_timeout_seconds: None, + office_extraction_enable_detailed_logging: None, }; state.db.create_or_update_settings(user_id, &update_settings).await