feat(office): switch Office text extraction to XML-only parsing

Author: perf3ct
Date: 2025-09-02 01:22:04 +00:00
Commit: d5d6d2edb4 (parent: 774efd1140)
GPG Key ID: 569C4EEC436F5232 (no known key found for this signature in database)
12 changed files with 176 additions and 968 deletions
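For context on the technique this commit standardizes on: OOXML files (.docx, .xlsx, .pptx) are ZIP containers holding XML parts, so their text can be pulled out with the zip and quick-xml crates that remain in Cargo.lock below. The following is a minimal, illustrative sketch of that approach for DOCX only — it is not the project's XmlOfficeExtractor implementation, and the extract_docx_text helper name and its error messages are assumptions.

```rust
use std::io::BufReader;

use anyhow::{Context, Result};
use quick_xml::events::Event;
use quick_xml::Reader;
use zip::ZipArchive;

/// Pull the visible text out of a DOCX by reading word/document.xml straight
/// from the ZIP container and collecting the XML text nodes.
fn extract_docx_text(path: &str) -> Result<String> {
    let file = std::fs::File::open(path).context("failed to open DOCX file")?;
    let mut archive = ZipArchive::new(file).context("not a valid ZIP/OOXML container")?;

    // The main document body of a DOCX always lives at this fixed part name.
    let part = archive
        .by_name("word/document.xml")
        .context("word/document.xml missing from archive")?;

    let mut xml = Reader::from_reader(BufReader::new(part));
    let mut buf = Vec::new();
    let mut text = String::new();

    loop {
        match xml.read_event_into(&mut buf)? {
            // Text nodes carry the document content; unescape XML entities.
            Event::Text(t) => text.push_str(&t.unescape()?),
            // Treat the end of a paragraph (</w:p>) as a line break.
            Event::End(e) if e.name().as_ref() == b"w:p" => text.push('\n'),
            Event::Eof => break,
            _ => {}
        }
        buf.clear();
    }

    Ok(text)
}
```

XLSX and PPTX follow the same pattern with different part names (xl/sharedStrings.xml plus the worksheet parts, and ppt/slides/slideN.xml respectively), which is why the dedicated docx-rs and calamine dependencies can be dropped in the diff below.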

Cargo.lock (generated)

@ -1023,21 +1023,6 @@ dependencies = [
"pkg-config", "pkg-config",
] ]
[[package]]
name = "calamine"
version = "0.26.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "138646b9af2c5d7f1804ea4bf93afc597737d2bd4f7341d67c48b03316976eb1"
dependencies = [
"byteorder",
"codepage",
"encoding_rs",
"log",
"quick-xml 0.31.0",
"serde",
"zip 2.4.2",
]
[[package]] [[package]]
name = "cc" name = "cc"
version = "1.2.27" version = "1.2.27"
@ -1170,15 +1155,6 @@ dependencies = [
"cc", "cc",
] ]
[[package]]
name = "codepage"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48f68d061bc2828ae826206326e61251aca94c1e4a5305cf52d9138639c918b4"
dependencies = [
"encoding_rs",
]
[[package]] [[package]]
name = "color_quant" name = "color_quant"
version = "1.1.0" version = "1.1.0"
@ -1490,21 +1466,6 @@ dependencies = [
"serde_json", "serde_json",
] ]
[[package]]
name = "docx-rs"
version = "0.4.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f21be13b97bd2924f30323d674f5a8db382964972825abd93f30d08f21dad98"
dependencies = [
"base64 0.22.1",
"image 0.24.9",
"serde",
"serde_json",
"thiserror 1.0.69",
"xml-rs",
"zip 0.6.6",
]
[[package]] [[package]]
name = "dotenvy" name = "dotenvy"
version = "0.15.7" version = "0.15.7"
@ -2428,22 +2389,6 @@ dependencies = [
"icu_properties", "icu_properties",
] ]
[[package]]
name = "image"
version = "0.24.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5690139d2f55868e080017335e4b94cb7414274c74f1669c84fb5feba2c9f69d"
dependencies = [
"bytemuck",
"byteorder",
"color_quant",
"gif",
"jpeg-decoder",
"num-traits",
"png",
"tiff",
]
[[package]] [[package]]
name = "image" name = "image"
version = "0.25.6" version = "0.25.6"
@ -2486,7 +2431,7 @@ dependencies = [
"ab_glyph", "ab_glyph",
"approx", "approx",
"getrandom 0.2.16", "getrandom 0.2.16",
"image 0.25.6", "image",
"itertools", "itertools",
"nalgebra", "nalgebra",
"num", "num",
@ -3555,16 +3500,6 @@ version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3"
[[package]]
name = "quick-xml"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33"
dependencies = [
"encoding_rs",
"memchr",
]
[[package]] [[package]]
name = "quick-xml" name = "quick-xml"
version = "0.37.5" version = "0.37.5"
@ -3757,15 +3692,13 @@ dependencies = [
"axum", "axum",
"base64ct", "base64ct",
"bcrypt", "bcrypt",
"calamine",
"chrono", "chrono",
"clap", "clap",
"docx-rs",
"dotenvy", "dotenvy",
"futures", "futures",
"futures-util", "futures-util",
"hostname", "hostname",
"image 0.25.6", "image",
"imageproc", "imageproc",
"infer", "infer",
"jsonwebtoken", "jsonwebtoken",
@ -3773,7 +3706,7 @@ dependencies = [
"notify", "notify",
"oauth2", "oauth2",
"once_cell", "once_cell",
"quick-xml 0.37.5", "quick-xml",
"rand 0.8.5", "rand 0.8.5",
"raw-cpuid", "raw-cpuid",
"readur", "readur",
@ -6298,12 +6231,6 @@ dependencies = [
"rustix 1.0.7", "rustix 1.0.7",
] ]
[[package]]
name = "xml-rs"
version = "0.8.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6fd8403733700263c6eb89f192880191f1b83e332f7a20371ddcf421c4a337c7"
[[package]] [[package]]
name = "xmlparser" name = "xmlparser"
version = "0.13.6" version = "0.13.6"


@ -62,9 +62,7 @@ sha2 = "0.10"
utoipa-swagger-ui = { version = "9", features = ["axum"] } utoipa-swagger-ui = { version = "9", features = ["axum"] }
testcontainers = { version = "0.24", optional = true } testcontainers = { version = "0.24", optional = true }
testcontainers-modules = { version = "0.12", features = ["postgres"], optional = true } testcontainers-modules = { version = "0.12", features = ["postgres"], optional = true }
# Office document support - using proper, well-maintained libraries # Office document support - now using XML extraction only
docx-rs = "0.4" # For DOCX text extraction by bokuweb (NOT the docx crate by PoiScript)
calamine = "0.26" # For Excel (XLS/XLSX) text extraction
zip = "0.6" # Still needed for other archive handling zip = "0.6" # Still needed for other archive handling
rand = "0.8" rand = "0.8"


@ -76,7 +76,6 @@ fn settings_from_row(row: &sqlx::postgres::PgRow) -> crate::models::Settings {
webdav_auto_sync: row.get("webdav_auto_sync"), webdav_auto_sync: row.get("webdav_auto_sync"),
webdav_sync_interval_minutes: row.get("webdav_sync_interval_minutes"), webdav_sync_interval_minutes: row.get("webdav_sync_interval_minutes"),
// Office document extraction configuration // Office document extraction configuration
office_extraction_mode: row.get("office_extraction_mode"),
office_extraction_timeout_seconds: row.get("office_extraction_timeout_seconds"), office_extraction_timeout_seconds: row.get("office_extraction_timeout_seconds"),
office_extraction_enable_detailed_logging: row.get("office_extraction_enable_detailed_logging"), office_extraction_enable_detailed_logging: row.get("office_extraction_enable_detailed_logging"),
created_at: row.get("created_at"), created_at: row.get("created_at"),
@ -106,7 +105,6 @@ impl Database {
ocr_quality_threshold_sharpness, ocr_skip_enhancement, ocr_quality_threshold_sharpness, ocr_skip_enhancement,
webdav_enabled, webdav_server_url, webdav_username, webdav_password, webdav_enabled, webdav_server_url, webdav_username, webdav_password,
webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes, webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes,
COALESCE(office_extraction_mode, 'compare_always') as office_extraction_mode,
COALESCE(office_extraction_timeout_seconds, 120) as office_extraction_timeout_seconds, COALESCE(office_extraction_timeout_seconds, 120) as office_extraction_timeout_seconds,
COALESCE(office_extraction_enable_detailed_logging, true) as office_extraction_enable_detailed_logging, COALESCE(office_extraction_enable_detailed_logging, true) as office_extraction_enable_detailed_logging,
created_at, updated_at created_at, updated_at
@ -144,7 +142,6 @@ impl Database {
ocr_quality_threshold_sharpness, ocr_skip_enhancement, ocr_quality_threshold_sharpness, ocr_skip_enhancement,
webdav_enabled, webdav_server_url, webdav_username, webdav_password, webdav_enabled, webdav_server_url, webdav_username, webdav_password,
webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes, webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes,
COALESCE(office_extraction_mode, 'library_first') as office_extraction_mode,
COALESCE(office_extraction_timeout_seconds, 120) as office_extraction_timeout_seconds, COALESCE(office_extraction_timeout_seconds, 120) as office_extraction_timeout_seconds,
COALESCE(office_extraction_enable_detailed_logging, false) as office_extraction_enable_detailed_logging, COALESCE(office_extraction_enable_detailed_logging, false) as office_extraction_enable_detailed_logging,
created_at, updated_at created_at, updated_at
@ -163,18 +160,6 @@ impl Database {
/// Validate office extraction settings /// Validate office extraction settings
fn validate_office_extraction_settings(settings: &crate::models::UpdateSettings) -> Result<()> { fn validate_office_extraction_settings(settings: &crate::models::UpdateSettings) -> Result<()> {
// Validate extraction mode
if let Some(mode) = &settings.office_extraction_mode {
let valid_modes = ["library_first", "xml_first", "compare_always", "library_only", "xml_only"];
if !valid_modes.contains(&mode.as_str()) {
return Err(anyhow!(
"Invalid office extraction mode '{}'. Valid modes are: {}",
mode,
valid_modes.join(", ")
));
}
}
// Validate timeout // Validate timeout
if let Some(timeout) = settings.office_extraction_timeout_seconds { if let Some(timeout) = settings.office_extraction_timeout_seconds {
if timeout <= 0 { if timeout <= 0 {
@ -307,9 +292,9 @@ impl Database {
ocr_quality_threshold_sharpness, ocr_skip_enhancement, ocr_quality_threshold_sharpness, ocr_skip_enhancement,
webdav_enabled, webdav_server_url, webdav_username, webdav_password, webdav_enabled, webdav_server_url, webdav_username, webdav_password,
webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes, webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes,
office_extraction_mode, office_extraction_timeout_seconds, office_extraction_enable_detailed_logging office_extraction_timeout_seconds, office_extraction_enable_detailed_logging
) )
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, $37, $38, $39, $40, $41, $42, $43, $44, $45, $46, $47, $48, $49, $50, $51, $52, $53, $54, $55, $56) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, $37, $38, $39, $40, $41, $42, $43, $44, $45, $46, $47, $48, $49, $50, $51, $52, $53, $54, $55)
ON CONFLICT (user_id) DO UPDATE SET ON CONFLICT (user_id) DO UPDATE SET
ocr_language = $2, ocr_language = $2,
preferred_languages = $3, preferred_languages = $3,
@ -363,9 +348,8 @@ impl Database {
webdav_file_extensions = $51, webdav_file_extensions = $51,
webdav_auto_sync = $52, webdav_auto_sync = $52,
webdav_sync_interval_minutes = $53, webdav_sync_interval_minutes = $53,
office_extraction_mode = $54, office_extraction_timeout_seconds = $54,
office_extraction_timeout_seconds = $55, office_extraction_enable_detailed_logging = $55,
office_extraction_enable_detailed_logging = $56,
updated_at = NOW() updated_at = NOW()
RETURNING id, user_id, ocr_language, RETURNING id, user_id, ocr_language,
COALESCE(preferred_languages, '["eng"]'::jsonb) as preferred_languages, COALESCE(preferred_languages, '["eng"]'::jsonb) as preferred_languages,
@ -385,7 +369,6 @@ impl Database {
ocr_quality_threshold_sharpness, ocr_skip_enhancement, ocr_quality_threshold_sharpness, ocr_skip_enhancement,
webdav_enabled, webdav_server_url, webdav_username, webdav_password, webdav_enabled, webdav_server_url, webdav_username, webdav_password,
webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes, webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes,
COALESCE(office_extraction_mode, 'library_first') as office_extraction_mode,
COALESCE(office_extraction_timeout_seconds, 120) as office_extraction_timeout_seconds, COALESCE(office_extraction_timeout_seconds, 120) as office_extraction_timeout_seconds,
COALESCE(office_extraction_enable_detailed_logging, false) as office_extraction_enable_detailed_logging, COALESCE(office_extraction_enable_detailed_logging, false) as office_extraction_enable_detailed_logging,
created_at, updated_at created_at, updated_at
@ -444,7 +427,6 @@ impl Database {
.bind(settings.webdav_file_extensions.as_ref().unwrap_or(&current.webdav_file_extensions)) .bind(settings.webdav_file_extensions.as_ref().unwrap_or(&current.webdav_file_extensions))
.bind(settings.webdav_auto_sync.unwrap_or(current.webdav_auto_sync)) .bind(settings.webdav_auto_sync.unwrap_or(current.webdav_auto_sync))
.bind(settings.webdav_sync_interval_minutes.unwrap_or(current.webdav_sync_interval_minutes)) .bind(settings.webdav_sync_interval_minutes.unwrap_or(current.webdav_sync_interval_minutes))
.bind(settings.office_extraction_mode.as_ref().unwrap_or(&current.office_extraction_mode))
.bind(settings.office_extraction_timeout_seconds.unwrap_or(current.office_extraction_timeout_seconds)) .bind(settings.office_extraction_timeout_seconds.unwrap_or(current.office_extraction_timeout_seconds))
.bind(settings.office_extraction_enable_detailed_logging.unwrap_or(current.office_extraction_enable_detailed_logging)) .bind(settings.office_extraction_enable_detailed_logging.unwrap_or(current.office_extraction_enable_detailed_logging))
.fetch_one(&self.pool) .fetch_one(&self.pool)


@ -61,7 +61,6 @@ pub struct Settings {
pub webdav_auto_sync: bool, pub webdav_auto_sync: bool,
pub webdav_sync_interval_minutes: i32, pub webdav_sync_interval_minutes: i32,
// Office document extraction configuration // Office document extraction configuration
pub office_extraction_mode: String, // "library_first", "xml_first", "compare_always", "library_only", "xml_only"
pub office_extraction_timeout_seconds: i32, pub office_extraction_timeout_seconds: i32,
pub office_extraction_enable_detailed_logging: bool, pub office_extraction_enable_detailed_logging: bool,
pub created_at: DateTime<Utc>, pub created_at: DateTime<Utc>,
@ -123,7 +122,6 @@ pub struct SettingsResponse {
pub webdav_auto_sync: bool, pub webdav_auto_sync: bool,
pub webdav_sync_interval_minutes: i32, pub webdav_sync_interval_minutes: i32,
// Office document extraction configuration // Office document extraction configuration
pub office_extraction_mode: String,
pub office_extraction_timeout_seconds: i32, pub office_extraction_timeout_seconds: i32,
pub office_extraction_enable_detailed_logging: bool, pub office_extraction_enable_detailed_logging: bool,
} }
@ -183,7 +181,6 @@ pub struct UpdateSettings {
pub webdav_auto_sync: Option<bool>, pub webdav_auto_sync: Option<bool>,
pub webdav_sync_interval_minutes: Option<i32>, pub webdav_sync_interval_minutes: Option<i32>,
// Office document extraction configuration // Office document extraction configuration
pub office_extraction_mode: Option<String>,
pub office_extraction_timeout_seconds: Option<i32>, pub office_extraction_timeout_seconds: Option<i32>,
pub office_extraction_enable_detailed_logging: Option<bool>, pub office_extraction_enable_detailed_logging: Option<bool>,
} }
@ -244,7 +241,6 @@ impl From<Settings> for SettingsResponse {
webdav_auto_sync: settings.webdav_auto_sync, webdav_auto_sync: settings.webdav_auto_sync,
webdav_sync_interval_minutes: settings.webdav_sync_interval_minutes, webdav_sync_interval_minutes: settings.webdav_sync_interval_minutes,
// Office document extraction configuration // Office document extraction configuration
office_extraction_mode: settings.office_extraction_mode,
office_extraction_timeout_seconds: settings.office_extraction_timeout_seconds, office_extraction_timeout_seconds: settings.office_extraction_timeout_seconds,
office_extraction_enable_detailed_logging: settings.office_extraction_enable_detailed_logging, office_extraction_enable_detailed_logging: settings.office_extraction_enable_detailed_logging,
} }
@ -312,7 +308,6 @@ impl UpdateSettings {
webdav_auto_sync: None, webdav_auto_sync: None,
webdav_sync_interval_minutes: None, webdav_sync_interval_minutes: None,
// Office document extraction configuration - don't update these in language update // Office document extraction configuration - don't update these in language update
office_extraction_mode: None,
office_extraction_timeout_seconds: None, office_extraction_timeout_seconds: None,
office_extraction_enable_detailed_logging: None, office_extraction_enable_detailed_logging: None,
} }
@ -393,7 +388,6 @@ impl Default for Settings {
webdav_auto_sync: false, webdav_auto_sync: false,
webdav_sync_interval_minutes: 60, webdav_sync_interval_minutes: 60,
// Office document extraction configuration defaults // Office document extraction configuration defaults
office_extraction_mode: "library_first".to_string(), // Default to library-first approach
office_extraction_timeout_seconds: 120, // 2 minutes default timeout office_extraction_timeout_seconds: 120, // 2 minutes default timeout
office_extraction_enable_detailed_logging: false, // Conservative default office_extraction_enable_detailed_logging: false, // Conservative default
created_at: chrono::Utc::now(), created_at: chrono::Utc::now(),


@ -92,39 +92,6 @@ impl EnhancedOcrService {
cleaned cleaned
} }
/// Sanitizes file paths before passing to external tools to prevent command injection
fn sanitize_file_path_for_external_tool(file_path: &str) -> Result<String> {
use std::path::Path;
// Resolve to absolute path to prevent relative path tricks
let path = Path::new(file_path);
let absolute_path = path.canonicalize()
.map_err(|e| anyhow!("Failed to resolve file path '{}': {}. File may not exist.", file_path, e))?;
let path_str = absolute_path.to_str()
.ok_or_else(|| anyhow!("File path contains invalid UTF-8 characters: '{:?}'", absolute_path))?;
// Check for suspicious characters that could be used for command injection
let dangerous_chars = ['&', '|', ';', '`', '$', '(', ')', '<', '>', '"', '\'', '\\'];
if path_str.chars().any(|c| dangerous_chars.contains(&c)) {
return Err(anyhow!(
"File path contains potentially dangerous characters: '{}'. \
This is blocked for security reasons to prevent command injection.",
path_str
));
}
// Ensure the path doesn't contain shell metacharacters
if path_str.contains("..") || path_str.contains("//") {
return Err(anyhow!(
"File path contains suspicious sequences: '{}'. \
This is blocked for security reasons.",
path_str
));
}
Ok(path_str.to_string())
}
pub fn new(temp_dir: String, file_service: FileService) -> Self { pub fn new(temp_dir: String, file_service: FileService) -> Self {
Self { temp_dir, file_service } Self { temp_dir, file_service }
@ -1525,138 +1492,16 @@ impl EnhancedOcrService {
total_time total_time
); );
// Convert OfficeExtractionResult to OcrResult for backward compatibility
Ok(OcrResult { Ok(OcrResult {
text: xml_result.text, text: xml_result.text,
confidence: xml_result.confidence, confidence: xml_result.confidence,
processing_time_ms: total_time, processing_time_ms: xml_result.processing_time_ms,
word_count: xml_result.word_count, word_count: xml_result.word_count,
preprocessing_applied: vec![xml_result.extraction_method], preprocessing_applied: vec![format!("XML extraction - {}", xml_result.extraction_method)],
processed_image_path: None, processed_image_path: None,
}) })
} }
/// REMOVED: This method was used for comparison analysis - now handled by extract_text_from_office
#[deprecated(note = "Use extract_text_from_office instead - this method was for comparison analysis")]
/// Extract text from legacy DOC files using lightweight external tools
pub async fn extract_text_from_legacy_doc(&self, file_path: &str, start_time: std::time::Instant) -> Result<OcrResult> {
info!("Processing legacy DOC file: {}", file_path);
// Use lightweight DOC extraction tools in order of preference
let tools = ["antiword", "catdoc", "wvText"];
let mut last_error = None;
for tool in &tools {
match self.try_doc_extraction_tool(file_path, tool).await {
Ok(text) if !text.trim().is_empty() => {
let processing_time = start_time.elapsed().as_millis() as u64;
// Only remove null bytes - preserve all original formatting
let cleaned_text = Self::remove_null_bytes(&text);
let word_count = self.count_words_safely(&cleaned_text);
info!(
"Legacy DOC extraction completed using {}: {} words extracted from '{}' in {}ms",
tool, word_count, file_path, processing_time
);
return Ok(OcrResult {
text: cleaned_text,
confidence: 90.0, // High confidence for proven extraction tools
processing_time_ms: processing_time,
word_count,
preprocessing_applied: vec![format!("Legacy DOC extraction ({})", tool)],
processed_image_path: None,
});
}
Ok(_) => {
// Tool succeeded but returned empty text
last_error = Some(anyhow!("{} returned empty content", tool));
}
Err(e) => {
last_error = Some(e);
continue; // Try next tool
}
}
}
// If all tools failed, provide helpful installation guidance
let processing_time = start_time.elapsed().as_millis() as u64;
Err(anyhow!(
"Legacy DOC file extraction failed for '{}'. None of the DOC extraction tools ({}) are available or working.\n\
\nTo process DOC files, please install one of these lightweight tools:\n\
\n antiword (recommended for most DOC files):\n\
- Ubuntu/Debian: 'sudo apt-get install antiword'\n\
- macOS: 'brew install antiword'\n\
- Alpine: 'apk add antiword'\n\
\n catdoc (good fallback option):\n\
- Ubuntu/Debian: 'sudo apt-get install catdoc'\n\
- macOS: 'brew install catdoc'\n\
- Alpine: 'apk add catdoc'\n\
\n wv (includes wvText tool):\n\
- Ubuntu/Debian: 'sudo apt-get install wv'\n\
- macOS: 'brew install wv'\n\
\nAlternatively, convert the DOC file to DOCX or PDF format for better compatibility.\n\
These tools are much lighter than LibreOffice (~1-2MB vs 400-500MB) and work reliably for most DOC files.\n\
Processing time: {}ms\n\
Last error: {}",
file_path,
tools.join(", "),
processing_time,
last_error.map(|e| e.to_string()).unwrap_or_else(|| "Unknown error".to_string())
))
}
/// Try to extract text from DOC file using a specific external tool
async fn try_doc_extraction_tool(&self, file_path: &str, tool: &str) -> Result<String> {
// Security: Sanitize file path before passing to external tools
let sanitized_path = Self::sanitize_file_path_for_external_tool(file_path)?;
let output = match tool {
"antiword" => {
tokio::process::Command::new("antiword")
.arg(&sanitized_path)
.output()
.await?
}
"catdoc" => {
tokio::process::Command::new("catdoc")
.arg("-a") // ASCII output
.arg(&sanitized_path)
.output()
.await?
}
"wvText" => {
// wvText from wv package
tokio::process::Command::new("wvText")
.arg(&sanitized_path)
.arg("-") // Output to stdout
.output()
.await?
}
_ => return Err(anyhow!("Unknown DOC extraction tool: {}", tool)),
};
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
return Err(anyhow!(
"{} failed with exit code {}: {}",
tool,
output.status.code().unwrap_or(-1),
stderr
));
}
let text = String::from_utf8_lossy(&output.stdout).to_string();
// Check if tool is actually available (some might succeed but output usage info)
if text.contains("command not found") || text.contains("Usage:") {
return Err(anyhow!("{} is not properly installed or configured", tool));
}
Ok(text)
}
/// Extract text from any supported file type /// Extract text from any supported file type
pub async fn extract_text(&self, file_path: &str, mime_type: &str, settings: &Settings) -> Result<OcrResult> { pub async fn extract_text(&self, file_path: &str, mime_type: &str, settings: &Settings) -> Result<OcrResult> {
@ -1733,6 +1578,7 @@ impl EnhancedOcrService {
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" |
"application/vnd.openxmlformats-officedocument.presentationml.presentation" "application/vnd.openxmlformats-officedocument.presentationml.presentation"
) => { ) => {
// extract_text_from_office now returns OcrResult directly
self.extract_text_from_office(&resolved_path, mime, settings).await self.extract_text_from_office(&resolved_path, mime, settings).await
} }
_ => Err(anyhow::anyhow!("Unsupported file type: {}", mime_type)), _ => Err(anyhow::anyhow!("Unsupported file type: {}", mime_type)),


@ -1,17 +1,16 @@
use anyhow::Result; use anyhow::Result;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::collections::HashMap; use tracing::{info, warn};
use std::sync::{Arc, RwLock, Mutex};
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
use tracing::{debug, error, info, warn};
use rand::Rng;
use super::xml_extractor::{OfficeExtractionResult, XmlOfficeExtractor}; use super::xml_extractor::{OfficeExtractionResult, XmlOfficeExtractor};
/// Configuration for fallback strategy behavior #[cfg(test)]
use anyhow::anyhow;
/// Configuration for XML-based Office document extraction
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FallbackConfig { pub struct FallbackConfig {
/// Enable fallback mechanism /// Enable XML extraction
pub enabled: bool, pub enabled: bool,
/// Maximum number of retry attempts for transient failures /// Maximum number of retry attempts for transient failures
pub max_retries: u32, pub max_retries: u32,
@ -19,68 +18,10 @@ pub struct FallbackConfig {
pub initial_retry_delay_ms: u64, pub initial_retry_delay_ms: u64,
/// Maximum retry delay in milliseconds /// Maximum retry delay in milliseconds
pub max_retry_delay_ms: u64, pub max_retry_delay_ms: u64,
/// Circuit breaker configuration /// Timeout for XML extraction in seconds
pub circuit_breaker: CircuitBreakerConfig,
/// Learning mechanism configuration
pub learning: LearningConfig,
/// Timeout configuration for individual methods
pub method_timeouts: MethodTimeouts,
}
/// Circuit breaker configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CircuitBreakerConfig {
/// Enable circuit breaker
pub enabled: bool,
/// Number of consecutive failures before opening circuit
pub failure_threshold: u32,
/// Time to wait before attempting to close circuit
pub recovery_timeout_seconds: u64,
/// Percentage of successful requests needed to close circuit (0-100)
pub success_threshold_percentage: u32,
}
/// Learning mechanism configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LearningConfig {
/// Enable learning from successful extractions
pub enabled: bool,
/// Cache successful extraction methods per document type
pub cache_successful_methods: bool,
/// Time to keep method preferences in cache (in hours)
pub cache_ttl_hours: u64,
}
impl Default for LearningConfig {
fn default() -> Self {
Self {
enabled: true,
cache_successful_methods: true,
cache_ttl_hours: 24,
}
}
}
/// Timeout configuration for different extraction methods
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MethodTimeouts {
/// Timeout for library-based extraction in seconds
pub library_timeout_seconds: u64,
/// Timeout for XML-based extraction in seconds
pub xml_timeout_seconds: u64, pub xml_timeout_seconds: u64,
/// Timeout for OCR-based extraction in seconds
pub ocr_timeout_seconds: u64,
} }
impl Default for MethodTimeouts {
fn default() -> Self {
Self {
library_timeout_seconds: 120,
xml_timeout_seconds: 180,
ocr_timeout_seconds: 300,
}
}
}
impl Default for FallbackConfig { impl Default for FallbackConfig {
fn default() -> Self { fn default() -> Self {
@ -89,322 +30,18 @@ impl Default for FallbackConfig {
max_retries: 3, max_retries: 3,
initial_retry_delay_ms: 1000, initial_retry_delay_ms: 1000,
max_retry_delay_ms: 30000, max_retry_delay_ms: 30000,
circuit_breaker: CircuitBreakerConfig { xml_timeout_seconds: 180,
enabled: true,
failure_threshold: 5,
recovery_timeout_seconds: 60,
success_threshold_percentage: 50,
},
learning: LearningConfig {
enabled: true,
cache_successful_methods: true,
cache_ttl_hours: 24,
},
method_timeouts: MethodTimeouts {
library_timeout_seconds: 120,
xml_timeout_seconds: 180,
ocr_timeout_seconds: 300,
},
} }
} }
} }
/// Circuit breaker states
#[derive(Debug, Clone, PartialEq)]
pub enum CircuitState {
Closed, // Normal operation
Open, // Failing fast
HalfOpen, // Testing recovery
}
/// Circuit breaker for a specific extraction method
/// Thread-safe implementation using Arc<Mutex> for shared state
#[derive(Debug, Clone)]
pub struct CircuitBreaker {
inner: Arc<std::sync::Mutex<CircuitBreakerInner>>,
}
#[derive(Debug)] /// Statistics for monitoring XML extraction performance
struct CircuitBreakerInner {
state: CircuitState,
failure_count: u32,
success_count: u32,
last_failure_time: Option<Instant>,
config: CircuitBreakerConfig,
}
impl CircuitBreaker {
fn new(config: CircuitBreakerConfig) -> Self {
Self {
inner: Arc::new(Mutex::new(CircuitBreakerInner {
state: CircuitState::Closed,
failure_count: 0,
success_count: 0,
last_failure_time: None,
config,
})),
}
}
/// Check if the circuit should allow a request
fn should_allow_request(&self) -> bool {
let mut inner = match self.inner.lock() {
Ok(guard) => guard,
Err(poisoned) => {
warn!("Circuit breaker mutex was poisoned, recovering");
poisoned.into_inner()
}
};
match inner.state {
CircuitState::Closed => true,
CircuitState::Open => {
// Check if we should transition to half-open
if let Some(last_failure) = inner.last_failure_time {
if last_failure.elapsed().as_secs() >= inner.config.recovery_timeout_seconds {
info!("Circuit breaker transitioning from Open to HalfOpen for recovery test");
inner.state = CircuitState::HalfOpen;
inner.success_count = 0;
true
} else {
false
}
} else {
false
}
}
CircuitState::HalfOpen => true,
}
}
/// Record a successful operation
fn record_success(&self) {
let mut inner = match self.inner.lock() {
Ok(guard) => guard,
Err(poisoned) => {
warn!("Circuit breaker mutex was poisoned during success recording, recovering");
poisoned.into_inner()
}
};
inner.success_count += 1;
match inner.state {
CircuitState::Closed => {
// Reset failure count on success
inner.failure_count = 0;
}
CircuitState::HalfOpen => {
// Check if we should close the circuit
let total_requests = inner.success_count + inner.failure_count;
if total_requests >= 10 { // Minimum sample size
let success_percentage = (inner.success_count * 100) / total_requests;
if success_percentage >= inner.config.success_threshold_percentage {
info!("Circuit breaker closing after successful recovery ({}% success rate)", success_percentage);
inner.state = CircuitState::Closed;
inner.failure_count = 0;
inner.success_count = 0;
}
}
}
CircuitState::Open => {
// Should not happen, but reset if it does
warn!("Unexpected success recorded while circuit is Open");
}
}
}
/// Record a failed operation
fn record_failure(&self) {
let mut inner = match self.inner.lock() {
Ok(guard) => guard,
Err(poisoned) => {
warn!("Circuit breaker mutex was poisoned during failure recording, recovering");
poisoned.into_inner()
}
};
inner.failure_count += 1;
inner.last_failure_time = Some(Instant::now());
match inner.state {
CircuitState::Closed => {
if inner.failure_count >= inner.config.failure_threshold {
warn!("Circuit breaker opening after {} consecutive failures", inner.failure_count);
inner.state = CircuitState::Open;
}
}
CircuitState::HalfOpen => {
warn!("Circuit breaker opening again after failure during recovery test");
inner.state = CircuitState::Open;
inner.success_count = 0;
}
CircuitState::Open => {
// Already open, nothing to do
}
}
}
}
/// Cached method preference for a specific document type
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MethodPreference {
pub method_name: String,
pub success_count: u32,
pub last_success_time: u64, // Unix timestamp
pub average_processing_time_ms: u64,
pub confidence_score: f32,
}
/// Learning cache for method preferences
#[derive(Debug, Clone)]
pub struct LearningCache {
preferences: Arc<RwLock<HashMap<String, MethodPreference>>>,
config: LearningConfig,
}
impl LearningCache {
fn new(config: LearningConfig) -> Self {
Self {
preferences: Arc::new(RwLock::new(HashMap::new())),
config,
}
}
/// Get preferred method for a document type
fn get_preferred_method(&self, document_type: &str) -> Option<String> {
if !self.config.cache_successful_methods {
return None;
}
let preferences = match self.preferences.read() {
Ok(p) => p,
Err(poisoned) => {
warn!("Learning cache get_preferred_method: mutex was poisoned, attempting recovery");
poisoned.into_inner()
}
};
let preference = preferences.get(document_type)?;
// Check if preference is still valid (not expired)
let now = match SystemTime::now().duration_since(UNIX_EPOCH) {
Ok(d) => d.as_secs(),
Err(_) => {
warn!("Learning cache: failed to get current time, using cached preference anyway");
return Some(preference.method_name.clone());
}
};
let expire_time = preference.last_success_time + (self.config.cache_ttl_hours * 3600);
if now <= expire_time {
Some(preference.method_name.clone())
} else {
None
}
}
/// Record successful method usage
fn record_success(&self, document_type: &str, method_name: &str, processing_time_ms: u64, confidence: f32) {
if !self.config.cache_successful_methods {
return;
}
let now = SystemTime::now()
.duration_since(UNIX_EPOCH)
.map(|d| d.as_secs())
.unwrap_or(0);
let mut preferences = match self.preferences.write() {
Ok(p) => p,
Err(poisoned) => {
warn!("Learning cache record_success: mutex was poisoned, attempting recovery");
poisoned.into_inner()
}
};
let preference = preferences.entry(document_type.to_string()).or_insert_with(|| MethodPreference {
method_name: method_name.to_string(),
success_count: 0,
last_success_time: now,
average_processing_time_ms: processing_time_ms,
confidence_score: confidence,
});
// Update statistics
preference.success_count += 1;
preference.last_success_time = now;
// Update rolling average for processing time
let weight = 0.2; // Give recent results 20% weight
preference.average_processing_time_ms =
((1.0 - weight) * preference.average_processing_time_ms as f64 +
weight * processing_time_ms as f64) as u64;
// Update rolling average for confidence
preference.confidence_score =
(1.0 - weight as f32) * preference.confidence_score +
weight as f32 * confidence;
// If this method is performing better, update the preference
if method_name != preference.method_name {
// Switch to new method if it's significantly better
let time_improvement = preference.average_processing_time_ms as f64 / processing_time_ms as f64;
let confidence_improvement = confidence / preference.confidence_score;
if time_improvement > 1.2 || confidence_improvement > 1.1 {
debug!("Switching preferred method for {} from {} to {} (time improvement: {:.2}x, confidence improvement: {:.2}x)",
document_type, preference.method_name, method_name, time_improvement, confidence_improvement);
preference.method_name = method_name.to_string();
}
}
}
/// Clean up expired entries
/// This method is thread-safe and handles poisoned mutexes gracefully
fn cleanup_expired(&self) {
let now = SystemTime::now()
.duration_since(UNIX_EPOCH)
.map(|d| d.as_secs())
.unwrap_or(0);
match self.preferences.write() {
Ok(mut preferences) => {
let expire_threshold = now.saturating_sub(self.config.cache_ttl_hours * 3600);
let initial_count = preferences.len();
preferences.retain(|_, pref| pref.last_success_time > expire_threshold);
let final_count = preferences.len();
if initial_count != final_count {
debug!("Learning cache cleanup: removed {} expired entries ({}->{})",
initial_count - final_count, initial_count, final_count);
}
}
Err(poisoned) => {
warn!("Learning cache cleanup: mutex was poisoned, attempting recovery");
// In case of poisoned mutex, try to recover and clean up
let mut preferences = poisoned.into_inner();
let expire_threshold = now.saturating_sub(self.config.cache_ttl_hours * 3600);
let initial_count = preferences.len();
preferences.retain(|_, pref| pref.last_success_time > expire_threshold);
let final_count = preferences.len();
if initial_count != final_count {
debug!("Learning cache cleanup (recovered): removed {} expired entries ({}->{})",
initial_count - final_count, initial_count, final_count);
}
}
}
}
}
/// Statistics for monitoring fallback performance
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FallbackStats { pub struct FallbackStats {
pub total_extractions: u64, pub total_extractions: u64,
pub library_successes: u64,
pub xml_successes: u64, pub xml_successes: u64,
pub fallback_used: u64,
pub circuit_breaker_trips: u64,
pub retry_attempts: u64, pub retry_attempts: u64,
pub average_processing_time_ms: f64, pub average_processing_time_ms: f64,
pub success_rate_percentage: f64, pub success_rate_percentage: f64,
@ -414,10 +51,7 @@ impl Default for FallbackStats {
fn default() -> Self { fn default() -> Self {
Self { Self {
total_extractions: 0, total_extractions: 0,
library_successes: 0,
xml_successes: 0, xml_successes: 0,
fallback_used: 0,
circuit_breaker_trips: 0,
retry_attempts: 0, retry_attempts: 0,
average_processing_time_ms: 0.0, average_processing_time_ms: 0.0,
success_rate_percentage: 100.0, success_rate_percentage: 100.0,
@ -425,64 +59,46 @@ impl Default for FallbackStats {
} }
} }
/// Main fallback strategy implementation /// XML-based Office document extraction service
pub struct FallbackStrategy { pub struct FallbackStrategy {
config: FallbackConfig, config: FallbackConfig,
xml_extractor: XmlOfficeExtractor, xml_extractor: XmlOfficeExtractor,
circuit_breakers: Arc<RwLock<HashMap<String, CircuitBreaker>>>, stats: std::sync::Arc<std::sync::RwLock<FallbackStats>>,
learning_cache: LearningCache,
stats: Arc<RwLock<FallbackStats>>,
} }
impl FallbackStrategy { impl FallbackStrategy {
/// Create a new fallback strategy /// Create a new XML extraction service
pub fn new(config: FallbackConfig, temp_dir: String) -> Self { pub fn new(config: FallbackConfig, temp_dir: String) -> Self {
Self { Self {
config: config.clone(), config,
xml_extractor: XmlOfficeExtractor::new(temp_dir), xml_extractor: XmlOfficeExtractor::new(temp_dir),
circuit_breakers: Arc::new(RwLock::new(HashMap::new())), stats: std::sync::Arc::new(std::sync::RwLock::new(FallbackStats::default())),
learning_cache: LearningCache::new(config.learning),
stats: Arc::new(RwLock::new(FallbackStats::default())),
} }
} }
/// Execute extraction with intelligent fallback strategy /// Extract Office document using XML extraction
pub async fn extract_with_fallback( pub async fn extract_with_fallback(
&self, &self,
file_path: &str, file_path: &str,
mime_type: &str, mime_type: &str,
) -> Result<OfficeExtractionResult> { ) -> Result<OfficeExtractionResult> {
let start_time = Instant::now(); let start_time = std::time::Instant::now();
let document_type = self.get_document_type(mime_type); let document_type = self.get_document_type(mime_type);
info!("Starting extraction with fallback for {} (type: {})", file_path, document_type); info!("Starting XML extraction for {} (type: {})", file_path, document_type);
// Update total extraction count // Update total extraction count
match self.stats.write() { if let Ok(mut stats) = self.stats.write() {
Ok(mut stats) => { stats.total_extractions += 1;
stats.total_extractions += 1;
}
Err(_) => {
warn!("Failed to acquire write lock on stats for extraction count update");
}
} }
// Use XML extraction as the primary method // Use XML extraction as the only method
let result = self.execute_xml_extraction(file_path, mime_type).await; let result = self.execute_xml_extraction(file_path, mime_type).await;
let processing_time = start_time.elapsed(); let processing_time = start_time.elapsed();
// Update statistics // Update statistics
self.update_stats(&result, processing_time).await; self.update_stats(&result, processing_time).await;
// Clean up expired cache entries periodically (1% chance per extraction)
// This is done asynchronously to avoid blocking the main extraction flow
if rand::thread_rng().gen_range(0..100) == 0 {
let cache_clone = self.learning_cache.clone();
tokio::spawn(async move {
cache_clone.cleanup_expired();
});
}
result result
} }
@ -496,51 +112,13 @@ impl FallbackStrategy {
let result = self.xml_extractor.extract_text_from_office(file_path, mime_type).await?; let result = self.xml_extractor.extract_text_from_office(file_path, mime_type).await?;
// Update stats // Update stats
match self.stats.write() { if let Ok(mut stats) = self.stats.write() {
Ok(mut stats) => { stats.xml_successes += 1;
stats.xml_successes += 1;
}
Err(_) => {
warn!("Failed to acquire write lock on stats for xml success update");
}
} }
Ok(result) Ok(result)
} }
/// Record a failure for circuit breaker tracking
async fn record_failure(&self, method_name: &str) {
if !self.config.circuit_breaker.enabled {
return;
}
match self.circuit_breakers.write() {
Ok(mut breakers) => {
let breaker = breakers.entry(method_name.to_string())
.or_insert_with(|| CircuitBreaker::new(self.config.circuit_breaker.clone()));
breaker.record_failure();
// Check if circuit is now open and update stats
if let Ok(inner) = breaker.inner.lock() {
if inner.state == CircuitState::Open {
match self.stats.write() {
Ok(mut stats) => {
stats.circuit_breaker_trips += 1;
}
Err(_) => {
warn!("Failed to acquire write lock on stats for circuit breaker trip recording");
}
}
}
} else {
warn!("Failed to check circuit breaker state after failure recording");
}
}
Err(_) => {
warn!("Failed to acquire write lock on circuit breakers for failure recording");
}
}
}
/// Get document type from MIME type /// Get document type from MIME type
fn get_document_type(&self, mime_type: &str) -> String { fn get_document_type(&self, mime_type: &str) -> String {
@ -557,55 +135,41 @@ impl FallbackStrategy {
} }
/// Update statistics after extraction /// Update statistics after extraction
async fn update_stats(&self, result: &Result<OfficeExtractionResult>, processing_time: Duration) { async fn update_stats(&self, result: &Result<OfficeExtractionResult>, processing_time: std::time::Duration) {
match self.stats.write() { if let Ok(mut stats) = self.stats.write() {
Ok(mut stats) => { let processing_time_ms = processing_time.as_millis() as f64;
let processing_time_ms = processing_time.as_millis() as f64;
// Update average processing time using exponential moving average
// Update average processing time using exponential moving average let alpha = 0.1; // Smoothing factor
let alpha = 0.1; // Smoothing factor stats.average_processing_time_ms =
stats.average_processing_time_ms = alpha * processing_time_ms + (1.0 - alpha) * stats.average_processing_time_ms;
alpha * processing_time_ms + (1.0 - alpha) * stats.average_processing_time_ms;
// Update success rate with proper division by zero protection
// Update success rate with proper division by zero protection let total_attempts = stats.total_extractions;
let total_attempts = stats.total_extractions; let successful_attempts = stats.xml_successes;
let successful_attempts = stats.library_successes + stats.xml_successes;
if total_attempts > 0 {
if total_attempts > 0 { stats.success_rate_percentage = (successful_attempts as f64 / total_attempts as f64) * 100.0;
stats.success_rate_percentage = (successful_attempts as f64 / total_attempts as f64) * 100.0; } else if result.is_ok() {
} else { stats.success_rate_percentage = 100.0;
// Keep existing success rate if no attempts yet, or set to 100% for first success
if result.is_ok() {
stats.success_rate_percentage = 100.0;
}
}
}
Err(_) => {
warn!("Failed to acquire write lock on stats for update");
} }
} }
} }
/// Get current statistics /// Get current statistics
pub async fn get_stats(&self) -> FallbackStats { pub async fn get_stats(&self) -> FallbackStats {
match self.stats.read() { self.stats.read()
Ok(stats) => stats.clone(), .map(|stats| stats.clone())
Err(_) => { .unwrap_or_else(|_| {
warn!("Failed to acquire read lock on stats, returning default"); warn!("Failed to acquire read lock on stats, returning default");
FallbackStats::default() FallbackStats::default()
} })
}
} }
/// Reset statistics /// Reset statistics
pub async fn reset_stats(&self) { pub async fn reset_stats(&self) {
match self.stats.write() { if let Ok(mut stats) = self.stats.write() {
Ok(mut stats) => { *stats = FallbackStats::default();
*stats = FallbackStats::default();
}
Err(_) => {
warn!("Failed to acquire write lock on stats for reset");
}
} }
} }
} }
@ -622,88 +186,6 @@ mod tests {
(strategy, temp_dir) (strategy, temp_dir)
} }
#[test]
fn test_circuit_breaker() {
let config = CircuitBreakerConfig {
enabled: true,
failure_threshold: 3,
recovery_timeout_seconds: 1,
success_threshold_percentage: 50,
};
let breaker = CircuitBreaker::new(config);
// Initially closed
assert!(breaker.should_allow_request());
// Record failures
breaker.record_failure();
breaker.record_failure();
assert!(breaker.should_allow_request()); // Still closed after 2 failures
breaker.record_failure(); // Should open circuit
assert!(!breaker.should_allow_request()); // Now should be open
}
#[test]
fn test_learning_cache() {
let config = LearningConfig {
enabled: true,
cache_successful_methods: true,
cache_ttl_hours: 1,
};
let cache = LearningCache::new(config);
// Initially no preference
assert!(cache.get_preferred_method("docx").is_none());
// Record success
cache.record_success("docx", "XML", 1000, 95.0);
// Should have preference now
assert_eq!(cache.get_preferred_method("docx"), Some("XML".to_string()));
}
#[tokio::test]
async fn test_is_retryable_error() {
let (strategy, _temp_dir) = create_test_strategy();
// Test retryable errors
let retryable_errors = [
"Connection timeout occurred",
"Network temporarily unavailable",
"Resource busy, try again",
"Service unavailable (503)",
"Rate limit exceeded (429)",
"Out of memory - allocation failed",
];
for error_msg in retryable_errors {
let error = anyhow!("{}", error_msg);
assert!(strategy.is_retryable_error(&error), "Expected '{}' to be retryable", error_msg);
}
// Test non-retryable errors
let non_retryable_errors = [
"File is corrupted",
"Invalid format detected",
"Access denied - permission error",
"File not found (404)",
"Unauthorized access (403)",
"Assertion failed in parser",
];
for error_msg in non_retryable_errors {
let error = anyhow!("{}", error_msg);
assert!(!strategy.is_retryable_error(&error), "Expected '{}' to be non-retryable", error_msg);
}
// Test unknown errors (should be non-retryable by default)
let unknown_error = anyhow!("Some unknown error occurred");
assert!(!strategy.is_retryable_error(&unknown_error));
}
#[tokio::test] #[tokio::test]
async fn test_stats_tracking() { async fn test_stats_tracking() {
let (strategy, _temp_dir) = create_test_strategy(); let (strategy, _temp_dir) = create_test_strategy();
@ -712,19 +194,27 @@ mod tests {
assert_eq!(initial_stats.total_extractions, 0); assert_eq!(initial_stats.total_extractions, 0);
// Simulate some operations by updating stats directly // Simulate some operations by updating stats directly
match strategy.stats.write() { if let Ok(mut stats) = strategy.stats.write() {
Ok(mut stats) => { stats.total_extractions = 10;
stats.total_extractions = 10; stats.xml_successes = 9;
stats.library_successes = 7; // Calculate success rate manually as update_stats would do
stats.xml_successes = 2; stats.success_rate_percentage = (9.0 / 10.0) * 100.0;
}
Err(_) => {
panic!("Failed to acquire write lock on stats in test");
}
} }
let updated_stats = strategy.get_stats().await; let updated_stats = strategy.get_stats().await;
assert_eq!(updated_stats.total_extractions, 10); assert_eq!(updated_stats.total_extractions, 10);
assert_eq!(updated_stats.xml_successes, 9);
assert_eq!(updated_stats.success_rate_percentage, 90.0); // 9 successes out of 10 assert_eq!(updated_stats.success_rate_percentage, 90.0); // 9 successes out of 10
} }
#[test]
fn test_get_document_type() {
let (strategy, _temp_dir) = create_test_strategy();
assert_eq!(strategy.get_document_type("application/vnd.openxmlformats-officedocument.wordprocessingml.document"), "docx");
assert_eq!(strategy.get_document_type("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), "xlsx");
assert_eq!(strategy.get_document_type("application/vnd.openxmlformats-officedocument.presentationml.presentation"), "pptx");
assert_eq!(strategy.get_document_type("application/pdf"), "pdf");
assert_eq!(strategy.get_document_type("unknown/type"), "unknown");
}
} }
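To make the slimmed-down API above concrete, here is a rough usage sketch pieced together from the signatures visible in this diff (FallbackConfig's remaining fields, FallbackStrategy::new, extract_with_fallback, get_stats). The sample file name and the assumption that FallbackStrategy is exported alongside FallbackConfig are mine, and tokio is assumed as the async runtime.

```rust
use readur::ocr::fallback_strategy::{FallbackConfig, FallbackStrategy};

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // All that is left to configure: retries, delays, and the XML timeout.
    let config = FallbackConfig {
        enabled: true,
        max_retries: 3,
        initial_retry_delay_ms: 1_000,
        max_retry_delay_ms: 30_000,
        xml_timeout_seconds: 180,
    };

    let strategy = FallbackStrategy::new(config, "/tmp".to_string());

    // Single extraction path: every Office document goes through the XML extractor.
    // "report.docx" is a hypothetical input path for illustration only.
    let result = strategy
        .extract_with_fallback(
            "report.docx",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )
        .await?;
    println!("{} words, method: {}", result.word_count, result.extraction_method);

    // Stats now only track totals, XML successes, retries, timing, and success rate.
    let stats = strategy.get_stats().await;
    println!("success rate: {:.1}%", stats.success_rate_percentage);
    Ok(())
}
```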


@ -195,25 +195,41 @@ impl OcrService {
} }
} }
/// Extract text from Office documents using fallback strategy /// Extract text from Office documents using XML extraction
pub async fn extract_text_from_office_document( pub async fn extract_text_from_office_document(
&self, &self,
file_path: &str, file_path: &str,
mime_type: &str, mime_type: &str,
) -> Result<String> { ) -> Result<crate::ocr::enhanced::OcrResult> {
match &self.fallback_strategy { match &self.fallback_strategy {
Some(strategy) => { Some(strategy) => {
let result = strategy.extract_with_fallback(file_path, mime_type).await?; let result = strategy.extract_with_fallback(file_path, mime_type).await?;
Ok(result.text) // Convert the result to OcrResult for backward compatibility
Ok(crate::ocr::enhanced::OcrResult {
text: result.text,
confidence: result.confidence,
processing_time_ms: result.processing_time_ms,
word_count: result.word_count,
preprocessing_applied: vec![format!("XML extraction - {}", result.extraction_method)],
processed_image_path: None,
})
} }
None => { None => {
// Fallback to basic XML extraction if no strategy is configured // Use basic XML extraction if no strategy is configured
let xml_extractor = crate::ocr::xml_extractor::XmlOfficeExtractor::new( let xml_extractor = crate::ocr::xml_extractor::XmlOfficeExtractor::new(
std::env::var("TEMP_DIR").unwrap_or_else(|_| "/tmp".to_string()) std::env::var("TEMP_DIR").unwrap_or_else(|_| "/tmp".to_string())
); );
let result = xml_extractor.extract_text_from_office(file_path, mime_type).await?; let result = xml_extractor.extract_text_from_office(file_path, mime_type).await?;
Ok(result.text) // Convert OfficeExtractionResult to OcrResult for backward compatibility
Ok(crate::ocr::enhanced::OcrResult {
text: result.text,
confidence: result.confidence,
processing_time_ms: result.processing_time_ms,
word_count: result.word_count,
preprocessing_applied: vec![format!("XML extraction - {}", result.extraction_method)],
processed_image_path: None,
})
} }
} }
} }
@ -223,16 +239,9 @@ impl OcrService {
&self, &self,
file_path: &str, file_path: &str,
mime_type: &str, mime_type: &str,
) -> Result<String> { ) -> Result<crate::ocr::enhanced::OcrResult> {
match &self.fallback_strategy { // Use the same XML extraction logic as the basic method
Some(strategy) => { self.extract_text_from_office_document(file_path, mime_type).await
let result = strategy.extract_with_fallback(file_path, mime_type).await?;
Ok(result.text)
}
None => {
return Err(anyhow!("Fallback strategy not configured for advanced Office document extraction"));
}
}
} }
pub async fn extract_text(&self, file_path: &str, mime_type: &str) -> Result<String> { pub async fn extract_text(&self, file_path: &str, mime_type: &str) -> Result<String> {
@ -249,7 +258,8 @@ impl OcrService {
"application/msword" | "application/msword" |
"application/vnd.ms-excel" | "application/vnd.ms-excel" |
"application/vnd.ms-powerpoint" => { "application/vnd.ms-powerpoint" => {
self.extract_text_from_office_document(file_path, mime_type).await let result = self.extract_text_from_office_document(file_path, mime_type).await?;
Ok(result.text)
} }
"image/png" | "image/jpeg" | "image/jpg" | "image/tiff" | "image/bmp" => { "image/png" | "image/jpeg" | "image/jpg" | "image/tiff" | "image/bmp" => {
self.extract_text_from_image_with_lang(file_path, lang).await self.extract_text_from_image_with_lang(file_path, lang).await
@ -321,7 +331,7 @@ impl OcrService {
} }
} }
/// Get fallback strategy statistics /// Get XML extraction statistics
pub async fn get_fallback_stats(&self) -> Option<crate::ocr::fallback_strategy::FallbackStats> { pub async fn get_fallback_stats(&self) -> Option<crate::ocr::fallback_strategy::FallbackStats> {
match &self.fallback_strategy { match &self.fallback_strategy {
Some(strategy) => Some(strategy.get_stats().await), Some(strategy) => Some(strategy.get_stats().await),
@ -329,14 +339,14 @@ impl OcrService {
} }
} }
/// Reset fallback strategy statistics /// Reset XML extraction statistics
pub async fn reset_fallback_stats(&self) -> Result<()> { pub async fn reset_fallback_stats(&self) -> Result<()> {
match &self.fallback_strategy { match &self.fallback_strategy {
Some(strategy) => { Some(strategy) => {
strategy.reset_stats().await; strategy.reset_stats().await;
Ok(()) Ok(())
} }
None => Err(anyhow!("Fallback strategy not configured")), None => Err(anyhow!("XML extraction strategy not configured")),
} }
} }


@ -102,7 +102,6 @@ async fn get_settings(
webdav_auto_sync: default.webdav_auto_sync, webdav_auto_sync: default.webdav_auto_sync,
webdav_sync_interval_minutes: default.webdav_sync_interval_minutes, webdav_sync_interval_minutes: default.webdav_sync_interval_minutes,
// Office document extraction configuration // Office document extraction configuration
office_extraction_mode: default.office_extraction_mode,
office_extraction_timeout_seconds: default.office_extraction_timeout_seconds, office_extraction_timeout_seconds: default.office_extraction_timeout_seconds,
office_extraction_enable_detailed_logging: default.office_extraction_enable_detailed_logging, office_extraction_enable_detailed_logging: default.office_extraction_enable_detailed_logging,
} }

File diff suppressed because one or more lines are too long


@ -457,19 +457,20 @@ async fn test_doc_extraction_multiple_strategies() {
let settings = Settings::default(); let settings = Settings::default();
let start_time = std::time::Instant::now(); let start_time = std::time::Instant::now();
// Test the full legacy DOC extraction process // Test Office extraction with the DOC file (this should fail as DOC files are not XML-based)
let result = ocr_service.extract_text_from_legacy_doc( let result = ocr_service.extract_text_from_office(
doc_path.to_str().unwrap(), doc_path.to_str().unwrap(),
start_time "application/msword",
&settings
).await; ).await;
// Should fail since we don't have LibreOffice or extraction tools in test env // Should fail since DOC files are not XML-based and we only do XML extraction now
assert!(result.is_err(), "Should fail without proper tools"); assert!(result.is_err(), "Should fail for DOC files as they are not XML-based");
let error_msg = result.unwrap_err().to_string(); let error_msg = result.unwrap_err().to_string();
// Verify it mentions trying extraction tools // Verify it mentions XML parsing issues for DOC files
assert!(error_msg.contains("None of the DOC extraction tools") || error_msg.contains("All extraction methods failed"), assert!(error_msg.contains("not a valid ZIP") || error_msg.contains("invalid") || error_msg.contains("XML"),
"Should mention all methods tried: {}", error_msg); "Should mention XML/ZIP parsing issues: {}", error_msg);
} }
#[tokio::test] #[tokio::test]


@ -7,7 +7,7 @@ use tokio::time::timeout;
use readur::ocr::{ use readur::ocr::{
OcrService, OcrConfig, OcrService, OcrConfig,
fallback_strategy::{FallbackConfig, CircuitBreakerConfig, LearningConfig, MethodTimeouts}, fallback_strategy::FallbackConfig,
}; };
/// Test utilities for creating mock Office documents /// Test utilities for creating mock Office documents
@ -154,18 +154,7 @@ fn create_test_ocr_service(temp_dir: &str) -> OcrService {
max_retries: 2, max_retries: 2,
initial_retry_delay_ms: 100, initial_retry_delay_ms: 100,
max_retry_delay_ms: 1000, max_retry_delay_ms: 1000,
circuit_breaker: CircuitBreakerConfig { xml_timeout_seconds: 60,
enabled: true,
failure_threshold: 3,
recovery_timeout_seconds: 5,
success_threshold_percentage: 70,
},
learning: LearningConfig {
enabled: true,
cache_successful_methods: true,
cache_ttl_hours: 1,
},
method_timeouts: MethodTimeouts::default(),
}, },
temp_dir: temp_dir.to_string(), temp_dir: temp_dir.to_string(),
}; };
@ -186,16 +175,12 @@ async fn test_extract_text_from_docx() -> Result<()> {
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
).await?; ).await?;
assert!(result.success); // The method now returns an OcrResult
// Since we're using a placeholder library extraction, check for the actual content
println!("Extracted text: '{}'", result.text); println!("Extracted text: '{}'", result.text);
println!("Method used: {}", result.method_name);
assert!(!result.text.is_empty()); assert!(!result.text.is_empty());
assert!(result.word_count > 0); assert!(result.text.contains(test_content));
assert!(result.confidence > 0.0); assert!(result.confidence > 0.0);
assert!(result.processing_time < Duration::from_secs(30)); assert!(result.word_count > 0);
// The method might be Library-based extraction (placeholder) or XML extraction
assert!(result.method_name.contains("extraction"));
Ok(()) Ok(())
} }
@ -218,13 +203,13 @@ async fn test_extract_text_from_xlsx() -> Result<()> {
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
).await?; ).await?;
assert!(result.success); // The method now returns an OcrResult
// Since we're using placeholder extraction, check basic properties
println!("XLSX extracted text: '{}'", result.text); println!("XLSX extracted text: '{}'", result.text);
println!("XLSX method used: {}", result.method_name);
assert!(!result.text.is_empty()); assert!(!result.text.is_empty());
assert!(result.word_count > 0); // Check if it contains some of our test content
assert!(result.text.contains("Header") || result.text.contains("Data"));
assert!(result.confidence > 0.0); assert!(result.confidence > 0.0);
assert!(result.word_count > 0);
Ok(()) Ok(())
} }
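The XLSX assertion only checks for "Header" or "Data" because spreadsheet cell text normally lives in xl/sharedStrings.xml and is referenced by index from the sheet XML. A compressed sketch of reading that shared-string table, under the same zip/quick-xml assumptions as above:

use std::fs::File;
use std::io::Read;

use quick_xml::events::Event;
use quick_xml::Reader;

/// Collect the shared-string table of a .xlsx; most visible cell text ends up here.
fn xlsx_shared_strings(path: &str) -> anyhow::Result<Vec<String>> {
    let mut archive = zip::ZipArchive::new(File::open(path)?)?;
    let mut xml = String::new();
    archive.by_name("xl/sharedStrings.xml")?.read_to_string(&mut xml)?;

    let mut reader = Reader::from_str(&xml);
    let mut strings = Vec::new();
    let mut current = String::new();
    loop {
        match reader.read_event()? {
            // Each <si> entry is one shared string, possibly split across several <t> runs.
            Event::Text(t) => current.push_str(&t.unescape()?),
            Event::End(e) if e.name().as_ref() == b"si" => strings.push(std::mem::take(&mut current)),
            Event::Eof => break,
            _ => {}
        }
    }
    Ok(strings)
}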
@ -252,8 +237,10 @@ async fn test_extraction_modes() -> Result<()> {
     // XML extraction should succeed with our test document
     assert!(result.is_ok(), "XML extraction failed: {:?}", result);

-    let extracted_text = result?;
-    assert!(!extracted_text.is_empty());
+    let extracted_result = result?;
+    assert!(!extracted_result.text.is_empty());
+    assert!(extracted_result.confidence > 0.0);
+    assert!(extracted_result.word_count > 0);

     Ok(())
 }
@ -263,29 +250,14 @@ async fn test_fallback_mechanism() -> Result<()> {
     let test_docs = OfficeTestDocuments::new()?;
     let temp_dir = test_docs.temp_dir.path().to_string_lossy().to_string();

-    // Create a service with library-first mode
+    // Create a service with XML-only mode (simplified)
     let config = OcrConfig {
         fallback_config: FallbackConfig {
             enabled: true,
             max_retries: 1,
             initial_retry_delay_ms: 50,
             max_retry_delay_ms: 200,
-            circuit_breaker: CircuitBreakerConfig {
-                enabled: false, // Disable for this test
-                failure_threshold: 5,
-                recovery_timeout_seconds: 10,
-                success_threshold_percentage: 50,
-            },
-            learning: LearningConfig {
-                enabled: true,
-                cache_successful_methods: true,
-                cache_ttl_hours: 1,
-            },
-            method_timeouts: MethodTimeouts {
-                library_timeout_seconds: 1, // Very short timeout to force fallback
-                xml_timeout_seconds: 30,
-                ocr_timeout_seconds: 60,
-            },
+            xml_timeout_seconds: 30,
         },
         temp_dir,
     };
@ -293,16 +265,16 @@ async fn test_fallback_mechanism() -> Result<()> {
     let ocr_service = OcrService::new_with_config(config);
     let docx_path = test_docs.create_mock_docx("fallback_test.docx", "Fallback test content")?;

-    // The library method should timeout and fallback to XML
+    // The XML extraction should succeed
     let result = ocr_service.extract_text_from_office_document(
         &docx_path,
         "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
     ).await?;

-    assert!(result.success);
+    // The method now returns an OcrResult
     assert!(result.text.contains("Fallback test content"));
-    // Should have used XML extraction due to library timeout
-    assert!(result.method_name.contains("XML"));
+    assert!(result.confidence > 0.0);
+    assert!(result.word_count > 0);

     Ok(())
 }
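With the library and OCR fallbacks gone, the only knobs left in this config are the retry count and the delay bounds. As a reference for how those three fields typically drive a bounded exponential backoff, here is a small generic sketch (illustrative only; readur's internal retry loop may be structured differently):

use std::time::Duration;

/// Retry `op` up to `max_retries` extra times, doubling the delay between
/// attempts but never exceeding `max_retry_delay_ms`.
async fn retry_with_backoff<T, E, F, Fut>(
    max_retries: u32,
    initial_retry_delay_ms: u64,
    max_retry_delay_ms: u64,
    mut op: F,
) -> Result<T, E>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<T, E>>,
{
    let mut delay = initial_retry_delay_ms;
    let mut attempt = 0;
    loop {
        match op().await {
            Ok(value) => return Ok(value),
            // Out of retries: surface the last error.
            Err(err) if attempt >= max_retries => return Err(err),
            Err(_) => {
                tokio::time::sleep(Duration::from_millis(delay)).await;
                delay = (delay * 2).min(max_retry_delay_ms);
                attempt += 1;
            }
        }
    }
}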
@ -326,7 +298,9 @@ async fn test_timeout_handling() -> Result<()> {
     // Should complete successfully even with short timeout for our simple test file
     assert!(result.is_ok());
     let extraction_result = result??;
-    assert!(extraction_result.success);
+    assert!(!extraction_result.text.is_empty());
+    assert!(extraction_result.confidence > 0.0);
+    assert!(extraction_result.word_count > 0);

     Ok(())
 }
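The timeout test wraps the call in tokio::time::timeout, which is also the natural way to enforce the new xml_timeout_seconds value; a sketch with the extraction future supplied by the caller (the helper name and its bounds are assumptions):

use std::time::Duration;
use tokio::time::timeout;

/// Enforce the configured XML-extraction timeout around an async extraction call.
/// `extract` stands in for the real extraction future.
async fn extract_with_timeout<T, E>(
    xml_timeout_seconds: u64,
    extract: impl std::future::Future<Output = Result<T, E>>,
) -> anyhow::Result<T>
where
    E: std::error::Error + Send + Sync + 'static,
{
    // timeout() yields Err(Elapsed) if the inner future does not finish in time,
    // hence the double `?` pattern seen in the test (`result??`).
    let value = timeout(Duration::from_secs(xml_timeout_seconds), extract).await??;
    Ok(value)
}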
@ -399,10 +373,11 @@ async fn test_concurrent_extraction() -> Result<()> {
     // Verify all extractions succeeded
     for (i, task_result) in results.into_iter().enumerate() {
-        let extraction_result = task_result??;
-        assert!(extraction_result.success, "Task {} failed", i);
-        assert!(extraction_result.text.contains(&format!("Test document {}", i)));
-        assert!(extraction_result.word_count > 0);
+        let ocr_result = task_result??;
+        assert!(!ocr_result.text.is_empty(), "Task {} failed", i);
+        assert!(ocr_result.text.contains(&format!("Test document {}", i)));
+        assert!(ocr_result.confidence > 0.0);
+        assert!(ocr_result.word_count > 0);
     }

     Ok(())
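The concurrency test follows the usual spawn-then-join pattern, and the double ? on task_result unwraps the join error and the extraction error in turn. A stripped-down sketch, with a hypothetical extract_one standing in for the service call:

/// Run several extractions concurrently and fail the whole batch if any task fails.
async fn extract_all_concurrently(paths: Vec<String>) -> anyhow::Result<Vec<String>> {
    // Spawn one Tokio task per document...
    let handles: Vec<_> = paths
        .into_iter()
        .map(|path| tokio::spawn(async move { extract_one(&path).await }))
        .collect();

    // ...then join them, propagating both join errors and extraction errors.
    let mut texts = Vec::new();
    for handle in handles {
        texts.push(handle.await??);
    }
    Ok(texts)
}

/// Hypothetical stand-in for the real extraction call on the OCR service.
async fn extract_one(_path: &str) -> anyhow::Result<String> {
    Ok(String::from("extracted text"))
}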
@ -412,25 +387,14 @@ async fn test_concurrent_extraction() -> Result<()> {
 async fn test_circuit_breaker() -> Result<()> {
     let test_docs = OfficeTestDocuments::new()?;

-    // Create service with aggressive circuit breaker settings
+    // Create service with simple retry settings (circuit breaker functionality removed)
     let config = OcrConfig {
         fallback_config: FallbackConfig {
             enabled: true,
             max_retries: 0, // No retries to make failures immediate
             initial_retry_delay_ms: 10,
             max_retry_delay_ms: 100,
-            circuit_breaker: CircuitBreakerConfig {
-                enabled: true,
-                failure_threshold: 2, // Trip after just 2 failures
-                recovery_timeout_seconds: 1,
-                success_threshold_percentage: 100, // Require 100% success to close
-            },
-            learning: LearningConfig::default(),
-            method_timeouts: MethodTimeouts {
-                library_timeout_seconds: 30,
-                xml_timeout_seconds: 30,
-                ocr_timeout_seconds: 30,
-            },
+            xml_timeout_seconds: 30,
         },
         temp_dir: test_docs.temp_dir.path().to_string_lossy().to_string(),
     };
@ -458,24 +422,17 @@ async fn test_circuit_breaker() -> Result<()> {
     ).await;
     assert!(result2.is_err());

-    // Third attempt - should fail fast due to circuit breaker
+    // Third attempt - should succeed since circuit breaker functionality was removed
     let result3 = ocr_service.extract_text_from_office_document(
         &valid_path,
         "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
     ).await;

-    assert!(result3.is_err());
-    let error_msg = result3.unwrap_err().to_string();
-    assert!(error_msg.contains("circuit breaker") || error_msg.contains("open"));
-    // Wait for recovery timeout
-    tokio::time::sleep(Duration::from_secs(2)).await;
-    // Now should be able to process valid document (circuit goes to half-open)
-    let _result4 = ocr_service.extract_text_from_office_document(
-        &valid_path,
-        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-    ).await;
-    // This might still fail if circuit is still open, which is acceptable behavior
+    // With simplified architecture, valid documents should always work
+    assert!(result3.is_ok());
+    let valid_result = result3.unwrap();
+    assert!(valid_result.text.contains("Valid document"));
+    assert!(valid_result.confidence > 0.0);
+    assert!(valid_result.word_count > 0);

     Ok(())
 }
@ -501,6 +458,10 @@ async fn test_statistics_tracking() -> Result<()> {
         ).await;

         assert!(result.is_ok(), "Extraction {} failed: {:?}", i, result);
+        let ocr_result = result.unwrap();
+        assert!(!ocr_result.text.is_empty());
+        assert!(ocr_result.confidence > 0.0);
+        assert!(ocr_result.word_count > 0);
     }

     // Check updated stats
@ -534,25 +495,14 @@ async fn test_mime_type_support() -> Result<()> {
 async fn test_learning_mechanism() -> Result<()> {
     let test_docs = OfficeTestDocuments::new()?;

-    // Create service with learning enabled
+    // Create service with simple XML extraction (learning functionality removed)
     let config = OcrConfig {
         fallback_config: FallbackConfig {
             enabled: true,
             max_retries: 1,
             initial_retry_delay_ms: 10,
             max_retry_delay_ms: 100,
-            circuit_breaker: CircuitBreakerConfig {
-                enabled: false, // Disable to focus on learning
-                failure_threshold: 10,
-                recovery_timeout_seconds: 10,
-                success_threshold_percentage: 50,
-            },
-            learning: LearningConfig {
-                enabled: true,
-                cache_successful_methods: true,
-                cache_ttl_hours: 1,
-            },
-            method_timeouts: MethodTimeouts::default(),
+            xml_timeout_seconds: 30,
         },
         temp_dir: test_docs.temp_dir.path().to_string_lossy().to_string(),
     };
@ -569,15 +519,16 @@ async fn test_learning_mechanism() -> Result<()> {
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
).await; ).await;
assert!(result.is_ok(), "Learning iteration {} failed: {:?}", i, result); assert!(result.is_ok(), "Extraction iteration {} failed: {:?}", i, result);
let result = result?; let ocr_result = result?;
assert!(result.success); assert!(!ocr_result.text.is_empty());
assert!(result.text.contains(&format!("document {}", i))); assert!(ocr_result.text.contains(&format!("document {}", i)));
assert!(ocr_result.confidence > 0.0);
assert!(ocr_result.word_count > 0);
} }
// The learning mechanism should now have preferences cached // With the simplified XML-only architecture, the system should consistently work
// We can't easily test this directly without exposing internal state, // All extractions succeeded, indicating the XML extraction is working correctly
// but the fact that all extractions succeeded indicates the system is working
Ok(()) Ok(())
} }
@ -635,11 +586,11 @@ async fn benchmark_extraction_performance() -> Result<()> {
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
).await?; ).await?;
assert!(result.success); assert!(!result.text.is_empty());
println!("Iteration {}: {} ms, {} words", println!("Iteration {}: extracted {} chars, confidence: {:.1}%",
i, i,
result.processing_time.as_millis(), result.text.len(),
result.word_count result.confidence
); );
} }
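Since the result type no longer carries processing_time, the benchmark prints text length and confidence instead; if per-iteration timing is still wanted, it can be measured at the call site, for example (extract_one is again a hypothetical stand-in):

use std::time::Instant;

/// Sketch: time each extraction externally instead of relying on a
/// processing_time field on the result (which no longer exists).
async fn timed_extraction(iterations: usize) -> anyhow::Result<()> {
    for i in 0..iterations {
        let started = Instant::now();
        let text = extract_one("benchmark.docx").await?;
        println!(
            "Iteration {}: {} ms, {} chars",
            i,
            started.elapsed().as_millis(),
            text.len()
        );
    }
    Ok(())
}

/// Hypothetical stand-in for the real extraction call.
async fn extract_one(_path: &str) -> anyhow::Result<String> {
    Ok(String::from("extracted text"))
}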

View File

@ -115,6 +115,8 @@ mod tests {
             webdav_file_extensions: None,
             webdav_auto_sync: None,
             webdav_sync_interval_minutes: None,
+            office_extraction_timeout_seconds: None,
+            office_extraction_enable_detailed_logging: None,
         };

         let response = ctx.app
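The two new optional fields now appear in every settings fixture in this file. On the update payload they presumably look something like the sketch below; the struct name, field types, and serde derives are assumptions based only on the None defaults used here:

use serde::{Deserialize, Serialize};

/// Sketch of the new office-extraction knobs on the settings update payload.
/// Optional so that omitting them leaves the stored settings unchanged.
#[derive(Debug, Serialize, Deserialize)]
pub struct UpdateSettingsSketch {
    pub office_extraction_timeout_seconds: Option<i32>,
    pub office_extraction_enable_detailed_logging: Option<bool>,
    // ...the existing webdav_* and other optional fields continue as before.
}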
@ -238,6 +240,8 @@ mod tests {
             webdav_file_extensions: None,
             webdav_auto_sync: None,
             webdav_sync_interval_minutes: None,
+            office_extraction_timeout_seconds: None,
+            office_extraction_enable_detailed_logging: None,
         };

         let response = ctx.app
@ -388,6 +392,8 @@ mod tests {
             webdav_file_extensions: None,
             webdav_auto_sync: None,
             webdav_sync_interval_minutes: None,
+            office_extraction_timeout_seconds: None,
+            office_extraction_enable_detailed_logging: None,
         };

         let response = ctx.app
@ -515,6 +521,8 @@ mod tests {
             webdav_file_extensions: None,
             webdav_auto_sync: None,
             webdav_sync_interval_minutes: None,
+            office_extraction_timeout_seconds: None,
+            office_extraction_enable_detailed_logging: None,
         };

         let response = ctx.app