feat(office): switch Office text extraction to XML-only parsing

Author: perf3ct
Date: 2025-09-02 01:22:04 +00:00
Commit: d5d6d2edb4 (parent: 774efd1140)
GPG Key ID: 569C4EEC436F5232 (no known key found for this signature in database)
12 changed files with 176 additions and 968 deletions
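For context on the technique this commit standardizes on: OOXML files (.docx, .xlsx, .pptx) are ZIP containers holding XML parts, so their text can be pulled out with the zip and quick-xml crates that remain in Cargo.lock below. The following is a minimal, illustrative sketch of that approach for DOCX only — it is not the project's XmlOfficeExtractor implementation, and the extract_docx_text helper name and its error messages are assumptions.

```rust
use std::io::BufReader;

use anyhow::{Context, Result};
use quick_xml::events::Event;
use quick_xml::Reader;
use zip::ZipArchive;

/// Pull the visible text out of a DOCX by reading word/document.xml straight
/// from the ZIP container and collecting the XML text nodes.
fn extract_docx_text(path: &str) -> Result<String> {
    let file = std::fs::File::open(path).context("failed to open DOCX file")?;
    let mut archive = ZipArchive::new(file).context("not a valid ZIP/OOXML container")?;

    // The main document body of a DOCX always lives at this fixed part name.
    let part = archive
        .by_name("word/document.xml")
        .context("word/document.xml missing from archive")?;

    let mut xml = Reader::from_reader(BufReader::new(part));
    let mut buf = Vec::new();
    let mut text = String::new();

    loop {
        match xml.read_event_into(&mut buf)? {
            // Text nodes carry the document content; unescape XML entities.
            Event::Text(t) => text.push_str(&t.unescape()?),
            // Treat the end of a paragraph (</w:p>) as a line break.
            Event::End(e) if e.name().as_ref() == b"w:p" => text.push('\n'),
            Event::Eof => break,
            _ => {}
        }
        buf.clear();
    }

    Ok(text)
}
```

XLSX and PPTX follow the same pattern with different part names (xl/sharedStrings.xml plus the worksheet parts, and ppt/slides/slideN.xml respectively), which is why the dedicated docx-rs and calamine dependencies can be dropped in the diff below.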

Cargo.lock (generated)

@ -1023,21 +1023,6 @@ dependencies = [
"pkg-config", "pkg-config",
] ]
[[package]]
name = "calamine"
version = "0.26.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "138646b9af2c5d7f1804ea4bf93afc597737d2bd4f7341d67c48b03316976eb1"
dependencies = [
"byteorder",
"codepage",
"encoding_rs",
"log",
"quick-xml 0.31.0",
"serde",
"zip 2.4.2",
]
[[package]] [[package]]
name = "cc" name = "cc"
version = "1.2.27" version = "1.2.27"
@ -1170,15 +1155,6 @@ dependencies = [
"cc", "cc",
] ]
[[package]]
name = "codepage"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48f68d061bc2828ae826206326e61251aca94c1e4a5305cf52d9138639c918b4"
dependencies = [
"encoding_rs",
]
[[package]] [[package]]
name = "color_quant" name = "color_quant"
version = "1.1.0" version = "1.1.0"
@ -1490,21 +1466,6 @@ dependencies = [
"serde_json", "serde_json",
] ]
[[package]]
name = "docx-rs"
version = "0.4.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f21be13b97bd2924f30323d674f5a8db382964972825abd93f30d08f21dad98"
dependencies = [
"base64 0.22.1",
"image 0.24.9",
"serde",
"serde_json",
"thiserror 1.0.69",
"xml-rs",
"zip 0.6.6",
]
[[package]] [[package]]
name = "dotenvy" name = "dotenvy"
version = "0.15.7" version = "0.15.7"
@ -2428,22 +2389,6 @@ dependencies = [
"icu_properties", "icu_properties",
] ]
[[package]]
name = "image"
version = "0.24.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5690139d2f55868e080017335e4b94cb7414274c74f1669c84fb5feba2c9f69d"
dependencies = [
"bytemuck",
"byteorder",
"color_quant",
"gif",
"jpeg-decoder",
"num-traits",
"png",
"tiff",
]
[[package]] [[package]]
name = "image" name = "image"
version = "0.25.6" version = "0.25.6"
@ -2486,7 +2431,7 @@ dependencies = [
"ab_glyph", "ab_glyph",
"approx", "approx",
"getrandom 0.2.16", "getrandom 0.2.16",
"image 0.25.6", "image",
"itertools", "itertools",
"nalgebra", "nalgebra",
"num", "num",
@ -3555,16 +3500,6 @@ version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3"
[[package]]
name = "quick-xml"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33"
dependencies = [
"encoding_rs",
"memchr",
]
[[package]] [[package]]
name = "quick-xml" name = "quick-xml"
version = "0.37.5" version = "0.37.5"
@ -3757,15 +3692,13 @@ dependencies = [
"axum", "axum",
"base64ct", "base64ct",
"bcrypt", "bcrypt",
"calamine",
"chrono", "chrono",
"clap", "clap",
"docx-rs",
"dotenvy", "dotenvy",
"futures", "futures",
"futures-util", "futures-util",
"hostname", "hostname",
"image 0.25.6", "image",
"imageproc", "imageproc",
"infer", "infer",
"jsonwebtoken", "jsonwebtoken",
@ -3773,7 +3706,7 @@ dependencies = [
"notify", "notify",
"oauth2", "oauth2",
"once_cell", "once_cell",
"quick-xml 0.37.5", "quick-xml",
"rand 0.8.5", "rand 0.8.5",
"raw-cpuid", "raw-cpuid",
"readur", "readur",
@ -6298,12 +6231,6 @@ dependencies = [
"rustix 1.0.7", "rustix 1.0.7",
] ]
[[package]]
name = "xml-rs"
version = "0.8.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6fd8403733700263c6eb89f192880191f1b83e332f7a20371ddcf421c4a337c7"
[[package]] [[package]]
name = "xmlparser" name = "xmlparser"
version = "0.13.6" version = "0.13.6"


@ -62,9 +62,7 @@ sha2 = "0.10"
utoipa-swagger-ui = { version = "9", features = ["axum"] } utoipa-swagger-ui = { version = "9", features = ["axum"] }
testcontainers = { version = "0.24", optional = true } testcontainers = { version = "0.24", optional = true }
testcontainers-modules = { version = "0.12", features = ["postgres"], optional = true } testcontainers-modules = { version = "0.12", features = ["postgres"], optional = true }
# Office document support - using proper, well-maintained libraries # Office document support - now using XML extraction only
docx-rs = "0.4" # For DOCX text extraction by bokuweb (NOT the docx crate by PoiScript)
calamine = "0.26" # For Excel (XLS/XLSX) text extraction
zip = "0.6" # Still needed for other archive handling zip = "0.6" # Still needed for other archive handling
rand = "0.8" rand = "0.8"


@ -76,7 +76,6 @@ fn settings_from_row(row: &sqlx::postgres::PgRow) -> crate::models::Settings {
webdav_auto_sync: row.get("webdav_auto_sync"), webdav_auto_sync: row.get("webdav_auto_sync"),
webdav_sync_interval_minutes: row.get("webdav_sync_interval_minutes"), webdav_sync_interval_minutes: row.get("webdav_sync_interval_minutes"),
// Office document extraction configuration // Office document extraction configuration
office_extraction_mode: row.get("office_extraction_mode"),
office_extraction_timeout_seconds: row.get("office_extraction_timeout_seconds"), office_extraction_timeout_seconds: row.get("office_extraction_timeout_seconds"),
office_extraction_enable_detailed_logging: row.get("office_extraction_enable_detailed_logging"), office_extraction_enable_detailed_logging: row.get("office_extraction_enable_detailed_logging"),
created_at: row.get("created_at"), created_at: row.get("created_at"),
@ -106,7 +105,6 @@ impl Database {
ocr_quality_threshold_sharpness, ocr_skip_enhancement, ocr_quality_threshold_sharpness, ocr_skip_enhancement,
webdav_enabled, webdav_server_url, webdav_username, webdav_password, webdav_enabled, webdav_server_url, webdav_username, webdav_password,
webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes, webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes,
COALESCE(office_extraction_mode, 'compare_always') as office_extraction_mode,
COALESCE(office_extraction_timeout_seconds, 120) as office_extraction_timeout_seconds, COALESCE(office_extraction_timeout_seconds, 120) as office_extraction_timeout_seconds,
COALESCE(office_extraction_enable_detailed_logging, true) as office_extraction_enable_detailed_logging, COALESCE(office_extraction_enable_detailed_logging, true) as office_extraction_enable_detailed_logging,
created_at, updated_at created_at, updated_at
@ -144,7 +142,6 @@ impl Database {
ocr_quality_threshold_sharpness, ocr_skip_enhancement, ocr_quality_threshold_sharpness, ocr_skip_enhancement,
webdav_enabled, webdav_server_url, webdav_username, webdav_password, webdav_enabled, webdav_server_url, webdav_username, webdav_password,
webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes, webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes,
COALESCE(office_extraction_mode, 'library_first') as office_extraction_mode,
COALESCE(office_extraction_timeout_seconds, 120) as office_extraction_timeout_seconds, COALESCE(office_extraction_timeout_seconds, 120) as office_extraction_timeout_seconds,
COALESCE(office_extraction_enable_detailed_logging, false) as office_extraction_enable_detailed_logging, COALESCE(office_extraction_enable_detailed_logging, false) as office_extraction_enable_detailed_logging,
created_at, updated_at created_at, updated_at
@ -163,18 +160,6 @@ impl Database {
/// Validate office extraction settings /// Validate office extraction settings
fn validate_office_extraction_settings(settings: &crate::models::UpdateSettings) -> Result<()> { fn validate_office_extraction_settings(settings: &crate::models::UpdateSettings) -> Result<()> {
// Validate extraction mode
if let Some(mode) = &settings.office_extraction_mode {
let valid_modes = ["library_first", "xml_first", "compare_always", "library_only", "xml_only"];
if !valid_modes.contains(&mode.as_str()) {
return Err(anyhow!(
"Invalid office extraction mode '{}'. Valid modes are: {}",
mode,
valid_modes.join(", ")
));
}
}
// Validate timeout // Validate timeout
if let Some(timeout) = settings.office_extraction_timeout_seconds { if let Some(timeout) = settings.office_extraction_timeout_seconds {
if timeout <= 0 { if timeout <= 0 {
@ -307,9 +292,9 @@ impl Database {
ocr_quality_threshold_sharpness, ocr_skip_enhancement, ocr_quality_threshold_sharpness, ocr_skip_enhancement,
webdav_enabled, webdav_server_url, webdav_username, webdav_password, webdav_enabled, webdav_server_url, webdav_username, webdav_password,
webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes, webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes,
office_extraction_mode, office_extraction_timeout_seconds, office_extraction_enable_detailed_logging office_extraction_timeout_seconds, office_extraction_enable_detailed_logging
) )
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, $37, $38, $39, $40, $41, $42, $43, $44, $45, $46, $47, $48, $49, $50, $51, $52, $53, $54, $55, $56) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, $37, $38, $39, $40, $41, $42, $43, $44, $45, $46, $47, $48, $49, $50, $51, $52, $53, $54, $55)
ON CONFLICT (user_id) DO UPDATE SET ON CONFLICT (user_id) DO UPDATE SET
ocr_language = $2, ocr_language = $2,
preferred_languages = $3, preferred_languages = $3,
@ -363,9 +348,8 @@ impl Database {
webdav_file_extensions = $51, webdav_file_extensions = $51,
webdav_auto_sync = $52, webdav_auto_sync = $52,
webdav_sync_interval_minutes = $53, webdav_sync_interval_minutes = $53,
office_extraction_mode = $54, office_extraction_timeout_seconds = $54,
office_extraction_timeout_seconds = $55, office_extraction_enable_detailed_logging = $55,
office_extraction_enable_detailed_logging = $56,
updated_at = NOW() updated_at = NOW()
RETURNING id, user_id, ocr_language, RETURNING id, user_id, ocr_language,
COALESCE(preferred_languages, '["eng"]'::jsonb) as preferred_languages, COALESCE(preferred_languages, '["eng"]'::jsonb) as preferred_languages,
@ -385,7 +369,6 @@ impl Database {
ocr_quality_threshold_sharpness, ocr_skip_enhancement, ocr_quality_threshold_sharpness, ocr_skip_enhancement,
webdav_enabled, webdav_server_url, webdav_username, webdav_password, webdav_enabled, webdav_server_url, webdav_username, webdav_password,
webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes, webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes,
COALESCE(office_extraction_mode, 'library_first') as office_extraction_mode,
COALESCE(office_extraction_timeout_seconds, 120) as office_extraction_timeout_seconds, COALESCE(office_extraction_timeout_seconds, 120) as office_extraction_timeout_seconds,
COALESCE(office_extraction_enable_detailed_logging, false) as office_extraction_enable_detailed_logging, COALESCE(office_extraction_enable_detailed_logging, false) as office_extraction_enable_detailed_logging,
created_at, updated_at created_at, updated_at
@ -444,7 +427,6 @@ impl Database {
.bind(settings.webdav_file_extensions.as_ref().unwrap_or(&current.webdav_file_extensions)) .bind(settings.webdav_file_extensions.as_ref().unwrap_or(&current.webdav_file_extensions))
.bind(settings.webdav_auto_sync.unwrap_or(current.webdav_auto_sync)) .bind(settings.webdav_auto_sync.unwrap_or(current.webdav_auto_sync))
.bind(settings.webdav_sync_interval_minutes.unwrap_or(current.webdav_sync_interval_minutes)) .bind(settings.webdav_sync_interval_minutes.unwrap_or(current.webdav_sync_interval_minutes))
.bind(settings.office_extraction_mode.as_ref().unwrap_or(&current.office_extraction_mode))
.bind(settings.office_extraction_timeout_seconds.unwrap_or(current.office_extraction_timeout_seconds)) .bind(settings.office_extraction_timeout_seconds.unwrap_or(current.office_extraction_timeout_seconds))
.bind(settings.office_extraction_enable_detailed_logging.unwrap_or(current.office_extraction_enable_detailed_logging)) .bind(settings.office_extraction_enable_detailed_logging.unwrap_or(current.office_extraction_enable_detailed_logging))
.fetch_one(&self.pool) .fetch_one(&self.pool)


@ -61,7 +61,6 @@ pub struct Settings {
pub webdav_auto_sync: bool, pub webdav_auto_sync: bool,
pub webdav_sync_interval_minutes: i32, pub webdav_sync_interval_minutes: i32,
// Office document extraction configuration // Office document extraction configuration
pub office_extraction_mode: String, // "library_first", "xml_first", "compare_always", "library_only", "xml_only"
pub office_extraction_timeout_seconds: i32, pub office_extraction_timeout_seconds: i32,
pub office_extraction_enable_detailed_logging: bool, pub office_extraction_enable_detailed_logging: bool,
pub created_at: DateTime<Utc>, pub created_at: DateTime<Utc>,
@ -123,7 +122,6 @@ pub struct SettingsResponse {
pub webdav_auto_sync: bool, pub webdav_auto_sync: bool,
pub webdav_sync_interval_minutes: i32, pub webdav_sync_interval_minutes: i32,
// Office document extraction configuration // Office document extraction configuration
pub office_extraction_mode: String,
pub office_extraction_timeout_seconds: i32, pub office_extraction_timeout_seconds: i32,
pub office_extraction_enable_detailed_logging: bool, pub office_extraction_enable_detailed_logging: bool,
} }
@ -183,7 +181,6 @@ pub struct UpdateSettings {
pub webdav_auto_sync: Option<bool>, pub webdav_auto_sync: Option<bool>,
pub webdav_sync_interval_minutes: Option<i32>, pub webdav_sync_interval_minutes: Option<i32>,
// Office document extraction configuration // Office document extraction configuration
pub office_extraction_mode: Option<String>,
pub office_extraction_timeout_seconds: Option<i32>, pub office_extraction_timeout_seconds: Option<i32>,
pub office_extraction_enable_detailed_logging: Option<bool>, pub office_extraction_enable_detailed_logging: Option<bool>,
} }
@ -244,7 +241,6 @@ impl From<Settings> for SettingsResponse {
webdav_auto_sync: settings.webdav_auto_sync, webdav_auto_sync: settings.webdav_auto_sync,
webdav_sync_interval_minutes: settings.webdav_sync_interval_minutes, webdav_sync_interval_minutes: settings.webdav_sync_interval_minutes,
// Office document extraction configuration // Office document extraction configuration
office_extraction_mode: settings.office_extraction_mode,
office_extraction_timeout_seconds: settings.office_extraction_timeout_seconds, office_extraction_timeout_seconds: settings.office_extraction_timeout_seconds,
office_extraction_enable_detailed_logging: settings.office_extraction_enable_detailed_logging, office_extraction_enable_detailed_logging: settings.office_extraction_enable_detailed_logging,
} }
@ -312,7 +308,6 @@ impl UpdateSettings {
webdav_auto_sync: None, webdav_auto_sync: None,
webdav_sync_interval_minutes: None, webdav_sync_interval_minutes: None,
// Office document extraction configuration - don't update these in language update // Office document extraction configuration - don't update these in language update
office_extraction_mode: None,
office_extraction_timeout_seconds: None, office_extraction_timeout_seconds: None,
office_extraction_enable_detailed_logging: None, office_extraction_enable_detailed_logging: None,
} }
@ -393,7 +388,6 @@ impl Default for Settings {
webdav_auto_sync: false, webdav_auto_sync: false,
webdav_sync_interval_minutes: 60, webdav_sync_interval_minutes: 60,
// Office document extraction configuration defaults // Office document extraction configuration defaults
office_extraction_mode: "library_first".to_string(), // Default to library-first approach
office_extraction_timeout_seconds: 120, // 2 minutes default timeout office_extraction_timeout_seconds: 120, // 2 minutes default timeout
office_extraction_enable_detailed_logging: false, // Conservative default office_extraction_enable_detailed_logging: false, // Conservative default
created_at: chrono::Utc::now(), created_at: chrono::Utc::now(),


@ -92,39 +92,6 @@ impl EnhancedOcrService {
cleaned cleaned
} }
/// Sanitizes file paths before passing to external tools to prevent command injection
fn sanitize_file_path_for_external_tool(file_path: &str) -> Result<String> {
use std::path::Path;
// Resolve to absolute path to prevent relative path tricks
let path = Path::new(file_path);
let absolute_path = path.canonicalize()
.map_err(|e| anyhow!("Failed to resolve file path '{}': {}. File may not exist.", file_path, e))?;
let path_str = absolute_path.to_str()
.ok_or_else(|| anyhow!("File path contains invalid UTF-8 characters: '{:?}'", absolute_path))?;
// Check for suspicious characters that could be used for command injection
let dangerous_chars = ['&', '|', ';', '`', '$', '(', ')', '<', '>', '"', '\'', '\\'];
if path_str.chars().any(|c| dangerous_chars.contains(&c)) {
return Err(anyhow!(
"File path contains potentially dangerous characters: '{}'. \
This is blocked for security reasons to prevent command injection.",
path_str
));
}
// Ensure the path doesn't contain shell metacharacters
if path_str.contains("..") || path_str.contains("//") {
return Err(anyhow!(
"File path contains suspicious sequences: '{}'. \
This is blocked for security reasons.",
path_str
));
}
Ok(path_str.to_string())
}
pub fn new(temp_dir: String, file_service: FileService) -> Self { pub fn new(temp_dir: String, file_service: FileService) -> Self {
Self { temp_dir, file_service } Self { temp_dir, file_service }
@ -1525,138 +1492,16 @@ impl EnhancedOcrService {
total_time total_time
); );
// Convert OfficeExtractionResult to OcrResult for backward compatibility
Ok(OcrResult { Ok(OcrResult {
text: xml_result.text, text: xml_result.text,
confidence: xml_result.confidence, confidence: xml_result.confidence,
processing_time_ms: total_time, processing_time_ms: xml_result.processing_time_ms,
word_count: xml_result.word_count, word_count: xml_result.word_count,
preprocessing_applied: vec![xml_result.extraction_method], preprocessing_applied: vec![format!("XML extraction - {}", xml_result.extraction_method)],
processed_image_path: None, processed_image_path: None,
}) })
} }
/// REMOVED: This method was used for comparison analysis - now handled by extract_text_from_office
#[deprecated(note = "Use extract_text_from_office instead - this method was for comparison analysis")]
/// Extract text from legacy DOC files using lightweight external tools
pub async fn extract_text_from_legacy_doc(&self, file_path: &str, start_time: std::time::Instant) -> Result<OcrResult> {
info!("Processing legacy DOC file: {}", file_path);
// Use lightweight DOC extraction tools in order of preference
let tools = ["antiword", "catdoc", "wvText"];
let mut last_error = None;
for tool in &tools {
match self.try_doc_extraction_tool(file_path, tool).await {
Ok(text) if !text.trim().is_empty() => {
let processing_time = start_time.elapsed().as_millis() as u64;
// Only remove null bytes - preserve all original formatting
let cleaned_text = Self::remove_null_bytes(&text);
let word_count = self.count_words_safely(&cleaned_text);
info!(
"Legacy DOC extraction completed using {}: {} words extracted from '{}' in {}ms",
tool, word_count, file_path, processing_time
);
return Ok(OcrResult {
text: cleaned_text,
confidence: 90.0, // High confidence for proven extraction tools
processing_time_ms: processing_time,
word_count,
preprocessing_applied: vec![format!("Legacy DOC extraction ({})", tool)],
processed_image_path: None,
});
}
Ok(_) => {
// Tool succeeded but returned empty text
last_error = Some(anyhow!("{} returned empty content", tool));
}
Err(e) => {
last_error = Some(e);
continue; // Try next tool
}
}
}
// If all tools failed, provide helpful installation guidance
let processing_time = start_time.elapsed().as_millis() as u64;
Err(anyhow!(
"Legacy DOC file extraction failed for '{}'. None of the DOC extraction tools ({}) are available or working.\n\
\nTo process DOC files, please install one of these lightweight tools:\n\
\n antiword (recommended for most DOC files):\n\
- Ubuntu/Debian: 'sudo apt-get install antiword'\n\
- macOS: 'brew install antiword'\n\
- Alpine: 'apk add antiword'\n\
\n catdoc (good fallback option):\n\
- Ubuntu/Debian: 'sudo apt-get install catdoc'\n\
- macOS: 'brew install catdoc'\n\
- Alpine: 'apk add catdoc'\n\
\n wv (includes wvText tool):\n\
- Ubuntu/Debian: 'sudo apt-get install wv'\n\
- macOS: 'brew install wv'\n\
\nAlternatively, convert the DOC file to DOCX or PDF format for better compatibility.\n\
These tools are much lighter than LibreOffice (~1-2MB vs 400-500MB) and work reliably for most DOC files.\n\
Processing time: {}ms\n\
Last error: {}",
file_path,
tools.join(", "),
processing_time,
last_error.map(|e| e.to_string()).unwrap_or_else(|| "Unknown error".to_string())
))
}
/// Try to extract text from DOC file using a specific external tool
async fn try_doc_extraction_tool(&self, file_path: &str, tool: &str) -> Result<String> {
// Security: Sanitize file path before passing to external tools
let sanitized_path = Self::sanitize_file_path_for_external_tool(file_path)?;
let output = match tool {
"antiword" => {
tokio::process::Command::new("antiword")
.arg(&sanitized_path)
.output()
.await?
}
"catdoc" => {
tokio::process::Command::new("catdoc")
.arg("-a") // ASCII output
.arg(&sanitized_path)
.output()
.await?
}
"wvText" => {
// wvText from wv package
tokio::process::Command::new("wvText")
.arg(&sanitized_path)
.arg("-") // Output to stdout
.output()
.await?
}
_ => return Err(anyhow!("Unknown DOC extraction tool: {}", tool)),
};
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
return Err(anyhow!(
"{} failed with exit code {}: {}",
tool,
output.status.code().unwrap_or(-1),
stderr
));
}
let text = String::from_utf8_lossy(&output.stdout).to_string();
// Check if tool is actually available (some might succeed but output usage info)
if text.contains("command not found") || text.contains("Usage:") {
return Err(anyhow!("{} is not properly installed or configured", tool));
}
Ok(text)
}
/// Extract text from any supported file type /// Extract text from any supported file type
pub async fn extract_text(&self, file_path: &str, mime_type: &str, settings: &Settings) -> Result<OcrResult> { pub async fn extract_text(&self, file_path: &str, mime_type: &str, settings: &Settings) -> Result<OcrResult> {
@ -1733,6 +1578,7 @@ impl EnhancedOcrService {
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" |
"application/vnd.openxmlformats-officedocument.presentationml.presentation" "application/vnd.openxmlformats-officedocument.presentationml.presentation"
) => { ) => {
// extract_text_from_office now returns OcrResult directly
self.extract_text_from_office(&resolved_path, mime, settings).await self.extract_text_from_office(&resolved_path, mime, settings).await
} }
_ => Err(anyhow::anyhow!("Unsupported file type: {}", mime_type)), _ => Err(anyhow::anyhow!("Unsupported file type: {}", mime_type)),


@ -1,17 +1,16 @@
use anyhow::Result; use anyhow::Result;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::collections::HashMap; use tracing::{info, warn};
use std::sync::{Arc, RwLock, Mutex};
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
use tracing::{debug, error, info, warn};
use rand::Rng;
use super::xml_extractor::{OfficeExtractionResult, XmlOfficeExtractor}; use super::xml_extractor::{OfficeExtractionResult, XmlOfficeExtractor};
/// Configuration for fallback strategy behavior #[cfg(test)]
use anyhow::anyhow;
/// Configuration for XML-based Office document extraction
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FallbackConfig { pub struct FallbackConfig {
/// Enable fallback mechanism /// Enable XML extraction
pub enabled: bool, pub enabled: bool,
/// Maximum number of retry attempts for transient failures /// Maximum number of retry attempts for transient failures
pub max_retries: u32, pub max_retries: u32,
@ -19,68 +18,10 @@ pub struct FallbackConfig {
pub initial_retry_delay_ms: u64, pub initial_retry_delay_ms: u64,
/// Maximum retry delay in milliseconds /// Maximum retry delay in milliseconds
pub max_retry_delay_ms: u64, pub max_retry_delay_ms: u64,
/// Circuit breaker configuration /// Timeout for XML extraction in seconds
pub circuit_breaker: CircuitBreakerConfig,
/// Learning mechanism configuration
pub learning: LearningConfig,
/// Timeout configuration for individual methods
pub method_timeouts: MethodTimeouts,
}
/// Circuit breaker configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CircuitBreakerConfig {
/// Enable circuit breaker
pub enabled: bool,
/// Number of consecutive failures before opening circuit
pub failure_threshold: u32,
/// Time to wait before attempting to close circuit
pub recovery_timeout_seconds: u64,
/// Percentage of successful requests needed to close circuit (0-100)
pub success_threshold_percentage: u32,
}
/// Learning mechanism configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LearningConfig {
/// Enable learning from successful extractions
pub enabled: bool,
/// Cache successful extraction methods per document type
pub cache_successful_methods: bool,
/// Time to keep method preferences in cache (in hours)
pub cache_ttl_hours: u64,
}
impl Default for LearningConfig {
fn default() -> Self {
Self {
enabled: true,
cache_successful_methods: true,
cache_ttl_hours: 24,
}
}
}
/// Timeout configuration for different extraction methods
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MethodTimeouts {
/// Timeout for library-based extraction in seconds
pub library_timeout_seconds: u64,
/// Timeout for XML-based extraction in seconds
pub xml_timeout_seconds: u64, pub xml_timeout_seconds: u64,
/// Timeout for OCR-based extraction in seconds
pub ocr_timeout_seconds: u64,
} }
impl Default for MethodTimeouts {
fn default() -> Self {
Self {
library_timeout_seconds: 120,
xml_timeout_seconds: 180,
ocr_timeout_seconds: 300,
}
}
}
impl Default for FallbackConfig { impl Default for FallbackConfig {
fn default() -> Self { fn default() -> Self {
@ -89,322 +30,18 @@ impl Default for FallbackConfig {
max_retries: 3, max_retries: 3,
initial_retry_delay_ms: 1000, initial_retry_delay_ms: 1000,
max_retry_delay_ms: 30000, max_retry_delay_ms: 30000,
circuit_breaker: CircuitBreakerConfig { xml_timeout_seconds: 180,
enabled: true,
failure_threshold: 5,
recovery_timeout_seconds: 60,
success_threshold_percentage: 50,
},
learning: LearningConfig {
enabled: true,
cache_successful_methods: true,
cache_ttl_hours: 24,
},
method_timeouts: MethodTimeouts {
library_timeout_seconds: 120,
xml_timeout_seconds: 180,
ocr_timeout_seconds: 300,
},
} }
} }
} }
/// Circuit breaker states
#[derive(Debug, Clone, PartialEq)]
pub enum CircuitState {
Closed, // Normal operation
Open, // Failing fast
HalfOpen, // Testing recovery
}
/// Circuit breaker for a specific extraction method
/// Thread-safe implementation using Arc<Mutex> for shared state
#[derive(Debug, Clone)]
pub struct CircuitBreaker {
inner: Arc<std::sync::Mutex<CircuitBreakerInner>>,
}
#[derive(Debug)] /// Statistics for monitoring XML extraction performance
struct CircuitBreakerInner {
state: CircuitState,
failure_count: u32,
success_count: u32,
last_failure_time: Option<Instant>,
config: CircuitBreakerConfig,
}
impl CircuitBreaker {
fn new(config: CircuitBreakerConfig) -> Self {
Self {
inner: Arc::new(Mutex::new(CircuitBreakerInner {
state: CircuitState::Closed,
failure_count: 0,
success_count: 0,
last_failure_time: None,
config,
})),
}
}
/// Check if the circuit should allow a request
fn should_allow_request(&self) -> bool {
let mut inner = match self.inner.lock() {
Ok(guard) => guard,
Err(poisoned) => {
warn!("Circuit breaker mutex was poisoned, recovering");
poisoned.into_inner()
}
};
match inner.state {
CircuitState::Closed => true,
CircuitState::Open => {
// Check if we should transition to half-open
if let Some(last_failure) = inner.last_failure_time {
if last_failure.elapsed().as_secs() >= inner.config.recovery_timeout_seconds {
info!("Circuit breaker transitioning from Open to HalfOpen for recovery test");
inner.state = CircuitState::HalfOpen;
inner.success_count = 0;
true
} else {
false
}
} else {
false
}
}
CircuitState::HalfOpen => true,
}
}
/// Record a successful operation
fn record_success(&self) {
let mut inner = match self.inner.lock() {
Ok(guard) => guard,
Err(poisoned) => {
warn!("Circuit breaker mutex was poisoned during success recording, recovering");
poisoned.into_inner()
}
};
inner.success_count += 1;
match inner.state {
CircuitState::Closed => {
// Reset failure count on success
inner.failure_count = 0;
}
CircuitState::HalfOpen => {
// Check if we should close the circuit
let total_requests = inner.success_count + inner.failure_count;
if total_requests >= 10 { // Minimum sample size
let success_percentage = (inner.success_count * 100) / total_requests;
if success_percentage >= inner.config.success_threshold_percentage {
info!("Circuit breaker closing after successful recovery ({}% success rate)", success_percentage);
inner.state = CircuitState::Closed;
inner.failure_count = 0;
inner.success_count = 0;
}
}
}
CircuitState::Open => {
// Should not happen, but reset if it does
warn!("Unexpected success recorded while circuit is Open");
}
}
}
/// Record a failed operation
fn record_failure(&self) {
let mut inner = match self.inner.lock() {
Ok(guard) => guard,
Err(poisoned) => {
warn!("Circuit breaker mutex was poisoned during failure recording, recovering");
poisoned.into_inner()
}
};
inner.failure_count += 1;
inner.last_failure_time = Some(Instant::now());
match inner.state {
CircuitState::Closed => {
if inner.failure_count >= inner.config.failure_threshold {
warn!("Circuit breaker opening after {} consecutive failures", inner.failure_count);
inner.state = CircuitState::Open;
}
}
CircuitState::HalfOpen => {
warn!("Circuit breaker opening again after failure during recovery test");
inner.state = CircuitState::Open;
inner.success_count = 0;
}
CircuitState::Open => {
// Already open, nothing to do
}
}
}
}
/// Cached method preference for a specific document type
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MethodPreference {
pub method_name: String,
pub success_count: u32,
pub last_success_time: u64, // Unix timestamp
pub average_processing_time_ms: u64,
pub confidence_score: f32,
}
/// Learning cache for method preferences
#[derive(Debug, Clone)]
pub struct LearningCache {
preferences: Arc<RwLock<HashMap<String, MethodPreference>>>,
config: LearningConfig,
}
impl LearningCache {
fn new(config: LearningConfig) -> Self {
Self {
preferences: Arc::new(RwLock::new(HashMap::new())),
config,
}
}
/// Get preferred method for a document type
fn get_preferred_method(&self, document_type: &str) -> Option<String> {
if !self.config.cache_successful_methods {
return None;
}
let preferences = match self.preferences.read() {
Ok(p) => p,
Err(poisoned) => {
warn!("Learning cache get_preferred_method: mutex was poisoned, attempting recovery");
poisoned.into_inner()
}
};
let preference = preferences.get(document_type)?;
// Check if preference is still valid (not expired)
let now = match SystemTime::now().duration_since(UNIX_EPOCH) {
Ok(d) => d.as_secs(),
Err(_) => {
warn!("Learning cache: failed to get current time, using cached preference anyway");
return Some(preference.method_name.clone());
}
};
let expire_time = preference.last_success_time + (self.config.cache_ttl_hours * 3600);
if now <= expire_time {
Some(preference.method_name.clone())
} else {
None
}
}
/// Record successful method usage
fn record_success(&self, document_type: &str, method_name: &str, processing_time_ms: u64, confidence: f32) {
if !self.config.cache_successful_methods {
return;
}
let now = SystemTime::now()
.duration_since(UNIX_EPOCH)
.map(|d| d.as_secs())
.unwrap_or(0);
let mut preferences = match self.preferences.write() {
Ok(p) => p,
Err(poisoned) => {
warn!("Learning cache record_success: mutex was poisoned, attempting recovery");
poisoned.into_inner()
}
};
let preference = preferences.entry(document_type.to_string()).or_insert_with(|| MethodPreference {
method_name: method_name.to_string(),
success_count: 0,
last_success_time: now,
average_processing_time_ms: processing_time_ms,
confidence_score: confidence,
});
// Update statistics
preference.success_count += 1;
preference.last_success_time = now;
// Update rolling average for processing time
let weight = 0.2; // Give recent results 20% weight
preference.average_processing_time_ms =
((1.0 - weight) * preference.average_processing_time_ms as f64 +
weight * processing_time_ms as f64) as u64;
// Update rolling average for confidence
preference.confidence_score =
(1.0 - weight as f32) * preference.confidence_score +
weight as f32 * confidence;
// If this method is performing better, update the preference
if method_name != preference.method_name {
// Switch to new method if it's significantly better
let time_improvement = preference.average_processing_time_ms as f64 / processing_time_ms as f64;
let confidence_improvement = confidence / preference.confidence_score;
if time_improvement > 1.2 || confidence_improvement > 1.1 {
debug!("Switching preferred method for {} from {} to {} (time improvement: {:.2}x, confidence improvement: {:.2}x)",
document_type, preference.method_name, method_name, time_improvement, confidence_improvement);
preference.method_name = method_name.to_string();
}
}
}
/// Clean up expired entries
/// This method is thread-safe and handles poisoned mutexes gracefully
fn cleanup_expired(&self) {
let now = SystemTime::now()
.duration_since(UNIX_EPOCH)
.map(|d| d.as_secs())
.unwrap_or(0);
match self.preferences.write() {
Ok(mut preferences) => {
let expire_threshold = now.saturating_sub(self.config.cache_ttl_hours * 3600);
let initial_count = preferences.len();
preferences.retain(|_, pref| pref.last_success_time > expire_threshold);
let final_count = preferences.len();
if initial_count != final_count {
debug!("Learning cache cleanup: removed {} expired entries ({}->{})",
initial_count - final_count, initial_count, final_count);
}
}
Err(poisoned) => {
warn!("Learning cache cleanup: mutex was poisoned, attempting recovery");
// In case of poisoned mutex, try to recover and clean up
let mut preferences = poisoned.into_inner();
let expire_threshold = now.saturating_sub(self.config.cache_ttl_hours * 3600);
let initial_count = preferences.len();
preferences.retain(|_, pref| pref.last_success_time > expire_threshold);
let final_count = preferences.len();
if initial_count != final_count {
debug!("Learning cache cleanup (recovered): removed {} expired entries ({}->{})",
initial_count - final_count, initial_count, final_count);
}
}
}
}
}
/// Statistics for monitoring fallback performance
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FallbackStats { pub struct FallbackStats {
pub total_extractions: u64, pub total_extractions: u64,
pub library_successes: u64,
pub xml_successes: u64, pub xml_successes: u64,
pub fallback_used: u64,
pub circuit_breaker_trips: u64,
pub retry_attempts: u64, pub retry_attempts: u64,
pub average_processing_time_ms: f64, pub average_processing_time_ms: f64,
pub success_rate_percentage: f64, pub success_rate_percentage: f64,
@ -414,10 +51,7 @@ impl Default for FallbackStats {
fn default() -> Self { fn default() -> Self {
Self { Self {
total_extractions: 0, total_extractions: 0,
library_successes: 0,
xml_successes: 0, xml_successes: 0,
fallback_used: 0,
circuit_breaker_trips: 0,
retry_attempts: 0, retry_attempts: 0,
average_processing_time_ms: 0.0, average_processing_time_ms: 0.0,
success_rate_percentage: 100.0, success_rate_percentage: 100.0,
@ -425,64 +59,46 @@ impl Default for FallbackStats {
} }
} }
/// Main fallback strategy implementation /// XML-based Office document extraction service
pub struct FallbackStrategy { pub struct FallbackStrategy {
config: FallbackConfig, config: FallbackConfig,
xml_extractor: XmlOfficeExtractor, xml_extractor: XmlOfficeExtractor,
circuit_breakers: Arc<RwLock<HashMap<String, CircuitBreaker>>>, stats: std::sync::Arc<std::sync::RwLock<FallbackStats>>,
learning_cache: LearningCache,
stats: Arc<RwLock<FallbackStats>>,
} }
impl FallbackStrategy { impl FallbackStrategy {
/// Create a new fallback strategy /// Create a new XML extraction service
pub fn new(config: FallbackConfig, temp_dir: String) -> Self { pub fn new(config: FallbackConfig, temp_dir: String) -> Self {
Self { Self {
config: config.clone(), config,
xml_extractor: XmlOfficeExtractor::new(temp_dir), xml_extractor: XmlOfficeExtractor::new(temp_dir),
circuit_breakers: Arc::new(RwLock::new(HashMap::new())), stats: std::sync::Arc::new(std::sync::RwLock::new(FallbackStats::default())),
learning_cache: LearningCache::new(config.learning),
stats: Arc::new(RwLock::new(FallbackStats::default())),
} }
} }
/// Execute extraction with intelligent fallback strategy /// Extract Office document using XML extraction
pub async fn extract_with_fallback( pub async fn extract_with_fallback(
&self, &self,
file_path: &str, file_path: &str,
mime_type: &str, mime_type: &str,
) -> Result<OfficeExtractionResult> { ) -> Result<OfficeExtractionResult> {
let start_time = Instant::now(); let start_time = std::time::Instant::now();
let document_type = self.get_document_type(mime_type); let document_type = self.get_document_type(mime_type);
info!("Starting extraction with fallback for {} (type: {})", file_path, document_type); info!("Starting XML extraction for {} (type: {})", file_path, document_type);
// Update total extraction count // Update total extraction count
match self.stats.write() { if let Ok(mut stats) = self.stats.write() {
Ok(mut stats) => { stats.total_extractions += 1;
stats.total_extractions += 1;
}
Err(_) => {
warn!("Failed to acquire write lock on stats for extraction count update");
}
} }
// Use XML extraction as the primary method // Use XML extraction as the only method
let result = self.execute_xml_extraction(file_path, mime_type).await; let result = self.execute_xml_extraction(file_path, mime_type).await;
let processing_time = start_time.elapsed(); let processing_time = start_time.elapsed();
// Update statistics // Update statistics
self.update_stats(&result, processing_time).await; self.update_stats(&result, processing_time).await;
// Clean up expired cache entries periodically (1% chance per extraction)
// This is done asynchronously to avoid blocking the main extraction flow
if rand::thread_rng().gen_range(0..100) == 0 {
let cache_clone = self.learning_cache.clone();
tokio::spawn(async move {
cache_clone.cleanup_expired();
});
}
result result
} }
@ -496,51 +112,13 @@ impl FallbackStrategy {
let result = self.xml_extractor.extract_text_from_office(file_path, mime_type).await?; let result = self.xml_extractor.extract_text_from_office(file_path, mime_type).await?;
// Update stats // Update stats
match self.stats.write() { if let Ok(mut stats) = self.stats.write() {
Ok(mut stats) => { stats.xml_successes += 1;
stats.xml_successes += 1;
}
Err(_) => {
warn!("Failed to acquire write lock on stats for xml success update");
}
} }
Ok(result) Ok(result)
} }
/// Record a failure for circuit breaker tracking
async fn record_failure(&self, method_name: &str) {
if !self.config.circuit_breaker.enabled {
return;
}
match self.circuit_breakers.write() {
Ok(mut breakers) => {
let breaker = breakers.entry(method_name.to_string())
.or_insert_with(|| CircuitBreaker::new(self.config.circuit_breaker.clone()));
breaker.record_failure();
// Check if circuit is now open and update stats
if let Ok(inner) = breaker.inner.lock() {
if inner.state == CircuitState::Open {
match self.stats.write() {
Ok(mut stats) => {
stats.circuit_breaker_trips += 1;
}
Err(_) => {
warn!("Failed to acquire write lock on stats for circuit breaker trip recording");
}
}
}
} else {
warn!("Failed to check circuit breaker state after failure recording");
}
}
Err(_) => {
warn!("Failed to acquire write lock on circuit breakers for failure recording");
}
}
}
/// Get document type from MIME type /// Get document type from MIME type
fn get_document_type(&self, mime_type: &str) -> String { fn get_document_type(&self, mime_type: &str) -> String {
@ -557,55 +135,41 @@ impl FallbackStrategy {
} }
/// Update statistics after extraction /// Update statistics after extraction
async fn update_stats(&self, result: &Result<OfficeExtractionResult>, processing_time: Duration) { async fn update_stats(&self, result: &Result<OfficeExtractionResult>, processing_time: std::time::Duration) {
match self.stats.write() { if let Ok(mut stats) = self.stats.write() {
Ok(mut stats) => { let processing_time_ms = processing_time.as_millis() as f64;
let processing_time_ms = processing_time.as_millis() as f64;
// Update average processing time using exponential moving average
// Update average processing time using exponential moving average let alpha = 0.1; // Smoothing factor
let alpha = 0.1; // Smoothing factor stats.average_processing_time_ms =
stats.average_processing_time_ms = alpha * processing_time_ms + (1.0 - alpha) * stats.average_processing_time_ms;
alpha * processing_time_ms + (1.0 - alpha) * stats.average_processing_time_ms;
// Update success rate with proper division by zero protection
// Update success rate with proper division by zero protection let total_attempts = stats.total_extractions;
let total_attempts = stats.total_extractions; let successful_attempts = stats.xml_successes;
let successful_attempts = stats.library_successes + stats.xml_successes;
if total_attempts > 0 {
if total_attempts > 0 { stats.success_rate_percentage = (successful_attempts as f64 / total_attempts as f64) * 100.0;
stats.success_rate_percentage = (successful_attempts as f64 / total_attempts as f64) * 100.0; } else if result.is_ok() {
} else { stats.success_rate_percentage = 100.0;
// Keep existing success rate if no attempts yet, or set to 100% for first success
if result.is_ok() {
stats.success_rate_percentage = 100.0;
}
}
}
Err(_) => {
warn!("Failed to acquire write lock on stats for update");
} }
} }
} }
/// Get current statistics /// Get current statistics
pub async fn get_stats(&self) -> FallbackStats { pub async fn get_stats(&self) -> FallbackStats {
match self.stats.read() { self.stats.read()
Ok(stats) => stats.clone(), .map(|stats| stats.clone())
Err(_) => { .unwrap_or_else(|_| {
warn!("Failed to acquire read lock on stats, returning default"); warn!("Failed to acquire read lock on stats, returning default");
FallbackStats::default() FallbackStats::default()
} })
}
} }
/// Reset statistics /// Reset statistics
pub async fn reset_stats(&self) { pub async fn reset_stats(&self) {
match self.stats.write() { if let Ok(mut stats) = self.stats.write() {
Ok(mut stats) => { *stats = FallbackStats::default();
*stats = FallbackStats::default();
}
Err(_) => {
warn!("Failed to acquire write lock on stats for reset");
}
} }
} }
} }
@ -622,88 +186,6 @@ mod tests {
(strategy, temp_dir) (strategy, temp_dir)
} }
#[test]
fn test_circuit_breaker() {
let config = CircuitBreakerConfig {
enabled: true,
failure_threshold: 3,
recovery_timeout_seconds: 1,
success_threshold_percentage: 50,
};
let breaker = CircuitBreaker::new(config);
// Initially closed
assert!(breaker.should_allow_request());
// Record failures
breaker.record_failure();
breaker.record_failure();
assert!(breaker.should_allow_request()); // Still closed after 2 failures
breaker.record_failure(); // Should open circuit
assert!(!breaker.should_allow_request()); // Now should be open
}
#[test]
fn test_learning_cache() {
let config = LearningConfig {
enabled: true,
cache_successful_methods: true,
cache_ttl_hours: 1,
};
let cache = LearningCache::new(config);
// Initially no preference
assert!(cache.get_preferred_method("docx").is_none());
// Record success
cache.record_success("docx", "XML", 1000, 95.0);
// Should have preference now
assert_eq!(cache.get_preferred_method("docx"), Some("XML".to_string()));
}
#[tokio::test]
async fn test_is_retryable_error() {
let (strategy, _temp_dir) = create_test_strategy();
// Test retryable errors
let retryable_errors = [
"Connection timeout occurred",
"Network temporarily unavailable",
"Resource busy, try again",
"Service unavailable (503)",
"Rate limit exceeded (429)",
"Out of memory - allocation failed",
];
for error_msg in retryable_errors {
let error = anyhow!("{}", error_msg);
assert!(strategy.is_retryable_error(&error), "Expected '{}' to be retryable", error_msg);
}
// Test non-retryable errors
let non_retryable_errors = [
"File is corrupted",
"Invalid format detected",
"Access denied - permission error",
"File not found (404)",
"Unauthorized access (403)",
"Assertion failed in parser",
];
for error_msg in non_retryable_errors {
let error = anyhow!("{}", error_msg);
assert!(!strategy.is_retryable_error(&error), "Expected '{}' to be non-retryable", error_msg);
}
// Test unknown errors (should be non-retryable by default)
let unknown_error = anyhow!("Some unknown error occurred");
assert!(!strategy.is_retryable_error(&unknown_error));
}
#[tokio::test] #[tokio::test]
async fn test_stats_tracking() { async fn test_stats_tracking() {
let (strategy, _temp_dir) = create_test_strategy(); let (strategy, _temp_dir) = create_test_strategy();
@ -712,19 +194,27 @@ mod tests {
assert_eq!(initial_stats.total_extractions, 0); assert_eq!(initial_stats.total_extractions, 0);
// Simulate some operations by updating stats directly // Simulate some operations by updating stats directly
match strategy.stats.write() { if let Ok(mut stats) = strategy.stats.write() {
Ok(mut stats) => { stats.total_extractions = 10;
stats.total_extractions = 10; stats.xml_successes = 9;
stats.library_successes = 7; // Calculate success rate manually as update_stats would do
stats.xml_successes = 2; stats.success_rate_percentage = (9.0 / 10.0) * 100.0;
}
Err(_) => {
panic!("Failed to acquire write lock on stats in test");
}
} }
let updated_stats = strategy.get_stats().await; let updated_stats = strategy.get_stats().await;
assert_eq!(updated_stats.total_extractions, 10); assert_eq!(updated_stats.total_extractions, 10);
assert_eq!(updated_stats.xml_successes, 9);
assert_eq!(updated_stats.success_rate_percentage, 90.0); // 9 successes out of 10 assert_eq!(updated_stats.success_rate_percentage, 90.0); // 9 successes out of 10
} }
#[test]
fn test_get_document_type() {
let (strategy, _temp_dir) = create_test_strategy();
assert_eq!(strategy.get_document_type("application/vnd.openxmlformats-officedocument.wordprocessingml.document"), "docx");
assert_eq!(strategy.get_document_type("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), "xlsx");
assert_eq!(strategy.get_document_type("application/vnd.openxmlformats-officedocument.presentationml.presentation"), "pptx");
assert_eq!(strategy.get_document_type("application/pdf"), "pdf");
assert_eq!(strategy.get_document_type("unknown/type"), "unknown");
}
} }
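To make the slimmed-down API above concrete, here is a rough usage sketch pieced together from the signatures visible in this diff (FallbackConfig's remaining fields, FallbackStrategy::new, extract_with_fallback, get_stats). The sample file name and the assumption that FallbackStrategy is exported alongside FallbackConfig are mine, and tokio is assumed as the async runtime.

```rust
use readur::ocr::fallback_strategy::{FallbackConfig, FallbackStrategy};

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // All that is left to configure: retries, delays, and the XML timeout.
    let config = FallbackConfig {
        enabled: true,
        max_retries: 3,
        initial_retry_delay_ms: 1_000,
        max_retry_delay_ms: 30_000,
        xml_timeout_seconds: 180,
    };

    let strategy = FallbackStrategy::new(config, "/tmp".to_string());

    // Single extraction path: every Office document goes through the XML extractor.
    // "report.docx" is a hypothetical input path for illustration only.
    let result = strategy
        .extract_with_fallback(
            "report.docx",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )
        .await?;
    println!("{} words, method: {}", result.word_count, result.extraction_method);

    // Stats now only track totals, XML successes, retries, timing, and success rate.
    let stats = strategy.get_stats().await;
    println!("success rate: {:.1}%", stats.success_rate_percentage);
    Ok(())
}
```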


@ -195,25 +195,41 @@ impl OcrService {
} }
} }
/// Extract text from Office documents using fallback strategy /// Extract text from Office documents using XML extraction
pub async fn extract_text_from_office_document( pub async fn extract_text_from_office_document(
&self, &self,
file_path: &str, file_path: &str,
mime_type: &str, mime_type: &str,
) -> Result<String> { ) -> Result<crate::ocr::enhanced::OcrResult> {
match &self.fallback_strategy { match &self.fallback_strategy {
Some(strategy) => { Some(strategy) => {
let result = strategy.extract_with_fallback(file_path, mime_type).await?; let result = strategy.extract_with_fallback(file_path, mime_type).await?;
Ok(result.text) // Convert the result to OcrResult for backward compatibility
Ok(crate::ocr::enhanced::OcrResult {
text: result.text,
confidence: result.confidence,
processing_time_ms: result.processing_time_ms,
word_count: result.word_count,
preprocessing_applied: vec![format!("XML extraction - {}", result.extraction_method)],
processed_image_path: None,
})
} }
None => { None => {
// Fallback to basic XML extraction if no strategy is configured // Use basic XML extraction if no strategy is configured
let xml_extractor = crate::ocr::xml_extractor::XmlOfficeExtractor::new( let xml_extractor = crate::ocr::xml_extractor::XmlOfficeExtractor::new(
std::env::var("TEMP_DIR").unwrap_or_else(|_| "/tmp".to_string()) std::env::var("TEMP_DIR").unwrap_or_else(|_| "/tmp".to_string())
); );
let result = xml_extractor.extract_text_from_office(file_path, mime_type).await?; let result = xml_extractor.extract_text_from_office(file_path, mime_type).await?;
Ok(result.text) // Convert OfficeExtractionResult to OcrResult for backward compatibility
Ok(crate::ocr::enhanced::OcrResult {
text: result.text,
confidence: result.confidence,
processing_time_ms: result.processing_time_ms,
word_count: result.word_count,
preprocessing_applied: vec![format!("XML extraction - {}", result.extraction_method)],
processed_image_path: None,
})
} }
} }
} }
@ -223,16 +239,9 @@ impl OcrService {
&self, &self,
file_path: &str, file_path: &str,
mime_type: &str, mime_type: &str,
) -> Result<String> { ) -> Result<crate::ocr::enhanced::OcrResult> {
match &self.fallback_strategy { // Use the same XML extraction logic as the basic method
Some(strategy) => { self.extract_text_from_office_document(file_path, mime_type).await
let result = strategy.extract_with_fallback(file_path, mime_type).await?;
Ok(result.text)
}
None => {
return Err(anyhow!("Fallback strategy not configured for advanced Office document extraction"));
}
}
} }
pub async fn extract_text(&self, file_path: &str, mime_type: &str) -> Result<String> { pub async fn extract_text(&self, file_path: &str, mime_type: &str) -> Result<String> {
@ -249,7 +258,8 @@ impl OcrService {
"application/msword" | "application/msword" |
"application/vnd.ms-excel" | "application/vnd.ms-excel" |
"application/vnd.ms-powerpoint" => { "application/vnd.ms-powerpoint" => {
self.extract_text_from_office_document(file_path, mime_type).await let result = self.extract_text_from_office_document(file_path, mime_type).await?;
Ok(result.text)
} }
"image/png" | "image/jpeg" | "image/jpg" | "image/tiff" | "image/bmp" => { "image/png" | "image/jpeg" | "image/jpg" | "image/tiff" | "image/bmp" => {
self.extract_text_from_image_with_lang(file_path, lang).await self.extract_text_from_image_with_lang(file_path, lang).await
@ -321,7 +331,7 @@ impl OcrService {
} }
} }
/// Get fallback strategy statistics /// Get XML extraction statistics
pub async fn get_fallback_stats(&self) -> Option<crate::ocr::fallback_strategy::FallbackStats> { pub async fn get_fallback_stats(&self) -> Option<crate::ocr::fallback_strategy::FallbackStats> {
match &self.fallback_strategy { match &self.fallback_strategy {
Some(strategy) => Some(strategy.get_stats().await), Some(strategy) => Some(strategy.get_stats().await),
@ -329,14 +339,14 @@ impl OcrService {
} }
} }
/// Reset fallback strategy statistics /// Reset XML extraction statistics
pub async fn reset_fallback_stats(&self) -> Result<()> { pub async fn reset_fallback_stats(&self) -> Result<()> {
match &self.fallback_strategy { match &self.fallback_strategy {
Some(strategy) => { Some(strategy) => {
strategy.reset_stats().await; strategy.reset_stats().await;
Ok(()) Ok(())
} }
None => Err(anyhow!("Fallback strategy not configured")), None => Err(anyhow!("XML extraction strategy not configured")),
} }
} }


@ -102,7 +102,6 @@ async fn get_settings(
webdav_auto_sync: default.webdav_auto_sync, webdav_auto_sync: default.webdav_auto_sync,
webdav_sync_interval_minutes: default.webdav_sync_interval_minutes, webdav_sync_interval_minutes: default.webdav_sync_interval_minutes,
// Office document extraction configuration // Office document extraction configuration
office_extraction_mode: default.office_extraction_mode,
office_extraction_timeout_seconds: default.office_extraction_timeout_seconds, office_extraction_timeout_seconds: default.office_extraction_timeout_seconds,
office_extraction_enable_detailed_logging: default.office_extraction_enable_detailed_logging, office_extraction_enable_detailed_logging: default.office_extraction_enable_detailed_logging,
} }

File diff suppressed because one or more lines are too long


@ -457,19 +457,20 @@ async fn test_doc_extraction_multiple_strategies() {
let settings = Settings::default(); let settings = Settings::default();
let start_time = std::time::Instant::now(); let start_time = std::time::Instant::now();
// Test the full legacy DOC extraction process // Test Office extraction with the DOC file (this should fail as DOC files are not XML-based)
let result = ocr_service.extract_text_from_legacy_doc( let result = ocr_service.extract_text_from_office(
doc_path.to_str().unwrap(), doc_path.to_str().unwrap(),
start_time "application/msword",
&settings
).await; ).await;
// Should fail since we don't have LibreOffice or extraction tools in test env // Should fail since DOC files are not XML-based and we only do XML extraction now
assert!(result.is_err(), "Should fail without proper tools"); assert!(result.is_err(), "Should fail for DOC files as they are not XML-based");
let error_msg = result.unwrap_err().to_string(); let error_msg = result.unwrap_err().to_string();
// Verify it mentions trying extraction tools // Verify it mentions XML parsing issues for DOC files
assert!(error_msg.contains("None of the DOC extraction tools") || error_msg.contains("All extraction methods failed"), assert!(error_msg.contains("not a valid ZIP") || error_msg.contains("invalid") || error_msg.contains("XML"),
"Should mention all methods tried: {}", error_msg); "Should mention XML/ZIP parsing issues: {}", error_msg);
} }
#[tokio::test] #[tokio::test]


@ -7,7 +7,7 @@ use tokio::time::timeout;
use readur::ocr::{ use readur::ocr::{
OcrService, OcrConfig, OcrService, OcrConfig,
fallback_strategy::{FallbackConfig, CircuitBreakerConfig, LearningConfig, MethodTimeouts}, fallback_strategy::FallbackConfig,
}; };
/// Test utilities for creating mock Office documents /// Test utilities for creating mock Office documents
@ -154,18 +154,7 @@ fn create_test_ocr_service(temp_dir: &str) -> OcrService {
max_retries: 2, max_retries: 2,
initial_retry_delay_ms: 100, initial_retry_delay_ms: 100,
max_retry_delay_ms: 1000, max_retry_delay_ms: 1000,
circuit_breaker: CircuitBreakerConfig { xml_timeout_seconds: 60,
enabled: true,
failure_threshold: 3,
recovery_timeout_seconds: 5,
success_threshold_percentage: 70,
},
learning: LearningConfig {
enabled: true,
cache_successful_methods: true,
cache_ttl_hours: 1,
},
method_timeouts: MethodTimeouts::default(),
}, },
temp_dir: temp_dir.to_string(), temp_dir: temp_dir.to_string(),
}; };
@ -186,16 +175,12 @@ async fn test_extract_text_from_docx() -> Result<()> {
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
).await?; ).await?;
assert!(result.success); // The method now returns an OcrResult
// Since we're using a placeholder library extraction, check for the actual content
println!("Extracted text: '{}'", result.text); println!("Extracted text: '{}'", result.text);
println!("Method used: {}", result.method_name);
assert!(!result.text.is_empty()); assert!(!result.text.is_empty());
assert!(result.word_count > 0); assert!(result.text.contains(test_content));
assert!(result.confidence > 0.0); assert!(result.confidence > 0.0);
assert!(result.processing_time < Duration::from_secs(30)); assert!(result.word_count > 0);
// The method might be Library-based extraction (placeholder) or XML extraction
assert!(result.method_name.contains("extraction"));
Ok(()) Ok(())
} }
@ -218,13 +203,13 @@ async fn test_extract_text_from_xlsx() -> Result<()> {
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
).await?; ).await?;
assert!(result.success); // The method now returns an OcrResult
// Since we're using placeholder extraction, check basic properties
println!("XLSX extracted text: '{}'", result.text); println!("XLSX extracted text: '{}'", result.text);
println!("XLSX method used: {}", result.method_name);
assert!(!result.text.is_empty()); assert!(!result.text.is_empty());
assert!(result.word_count > 0); // Check if it contains some of our test content
assert!(result.text.contains("Header") || result.text.contains("Data"));
assert!(result.confidence > 0.0); assert!(result.confidence > 0.0);
assert!(result.word_count > 0);
Ok(()) Ok(())
} }
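The XLSX assertion only checks for "Header" or "Data" because spreadsheet cell text normally lives in xl/sharedStrings.xml and is referenced by index from the sheet XML. A compressed sketch of reading that shared-string table, under the same zip/quick-xml assumptions as above:

use std::fs::File;
use std::io::Read;

use quick_xml::events::Event;
use quick_xml::Reader;

/// Collect the shared-string table of a .xlsx; most visible cell text ends up here.
fn xlsx_shared_strings(path: &str) -> anyhow::Result<Vec<String>> {
    let mut archive = zip::ZipArchive::new(File::open(path)?)?;
    let mut xml = String::new();
    archive.by_name("xl/sharedStrings.xml")?.read_to_string(&mut xml)?;

    let mut reader = Reader::from_str(&xml);
    let mut strings = Vec::new();
    let mut current = String::new();
    loop {
        match reader.read_event()? {
            // Each <si> entry is one shared string, possibly split across several <t> runs.
            Event::Text(t) => current.push_str(&t.unescape()?),
            Event::End(e) if e.name().as_ref() == b"si" => strings.push(std::mem::take(&mut current)),
            Event::Eof => break,
            _ => {}
        }
    }
    Ok(strings)
}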
@ -252,8 +237,10 @@ async fn test_extraction_modes() -> Result<()> {
     // XML extraction should succeed with our test document
     assert!(result.is_ok(), "XML extraction failed: {:?}", result);

-    let extracted_text = result?;
-    assert!(!extracted_text.is_empty());
+    let extracted_result = result?;
+    assert!(!extracted_result.text.is_empty());
+    assert!(extracted_result.confidence > 0.0);
+    assert!(extracted_result.word_count > 0);

     Ok(())
 }
@ -263,29 +250,14 @@ async fn test_fallback_mechanism() -> Result<()> {
     let test_docs = OfficeTestDocuments::new()?;
     let temp_dir = test_docs.temp_dir.path().to_string_lossy().to_string();

-    // Create a service with library-first mode
+    // Create a service with XML-only mode (simplified)
     let config = OcrConfig {
         fallback_config: FallbackConfig {
             enabled: true,
             max_retries: 1,
             initial_retry_delay_ms: 50,
             max_retry_delay_ms: 200,
-            circuit_breaker: CircuitBreakerConfig {
-                enabled: false, // Disable for this test
-                failure_threshold: 5,
-                recovery_timeout_seconds: 10,
-                success_threshold_percentage: 50,
-            },
-            learning: LearningConfig {
-                enabled: true,
-                cache_successful_methods: true,
-                cache_ttl_hours: 1,
-            },
-            method_timeouts: MethodTimeouts {
-                library_timeout_seconds: 1, // Very short timeout to force fallback
-                xml_timeout_seconds: 30,
-                ocr_timeout_seconds: 60,
-            },
+            xml_timeout_seconds: 30,
         },
         temp_dir,
     };
@ -293,16 +265,16 @@ async fn test_fallback_mechanism() -> Result<()> {
     let ocr_service = OcrService::new_with_config(config);
     let docx_path = test_docs.create_mock_docx("fallback_test.docx", "Fallback test content")?;

-    // The library method should timeout and fallback to XML
+    // The XML extraction should succeed
     let result = ocr_service.extract_text_from_office_document(
         &docx_path,
         "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
     ).await?;

-    assert!(result.success);
+    // The method now returns an OcrResult
     assert!(result.text.contains("Fallback test content"));
-    // Should have used XML extraction due to library timeout
-    assert!(result.method_name.contains("XML"));
+    assert!(result.confidence > 0.0);
+    assert!(result.word_count > 0);

     Ok(())
 }
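With the library and OCR fallbacks gone, the only knobs left in this config are the retry count and the delay bounds. As a reference for how those three fields typically drive a bounded exponential backoff, here is a small generic sketch (illustrative only; readur's internal retry loop may be structured differently):

use std::time::Duration;

/// Retry `op` up to `max_retries` extra times, doubling the delay between
/// attempts but never exceeding `max_retry_delay_ms`.
async fn retry_with_backoff<T, E, F, Fut>(
    max_retries: u32,
    initial_retry_delay_ms: u64,
    max_retry_delay_ms: u64,
    mut op: F,
) -> Result<T, E>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<T, E>>,
{
    let mut delay = initial_retry_delay_ms;
    let mut attempt = 0;
    loop {
        match op().await {
            Ok(value) => return Ok(value),
            // Out of retries: surface the last error.
            Err(err) if attempt >= max_retries => return Err(err),
            Err(_) => {
                tokio::time::sleep(Duration::from_millis(delay)).await;
                delay = (delay * 2).min(max_retry_delay_ms);
                attempt += 1;
            }
        }
    }
}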
@ -326,7 +298,9 @@ async fn test_timeout_handling() -> Result<()> {
     // Should complete successfully even with short timeout for our simple test file
     assert!(result.is_ok());
     let extraction_result = result??;
-    assert!(extraction_result.success);
+    assert!(!extraction_result.text.is_empty());
+    assert!(extraction_result.confidence > 0.0);
+    assert!(extraction_result.word_count > 0);

     Ok(())
 }
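The timeout test wraps the call in tokio::time::timeout, which is also the natural way to enforce the new xml_timeout_seconds value; a sketch with the extraction future supplied by the caller (the helper name and its bounds are assumptions):

use std::time::Duration;
use tokio::time::timeout;

/// Enforce the configured XML-extraction timeout around an async extraction call.
/// `extract` stands in for the real extraction future.
async fn extract_with_timeout<T, E>(
    xml_timeout_seconds: u64,
    extract: impl std::future::Future<Output = Result<T, E>>,
) -> anyhow::Result<T>
where
    E: std::error::Error + Send + Sync + 'static,
{
    // timeout() yields Err(Elapsed) if the inner future does not finish in time,
    // hence the double `?` pattern seen in the test (`result??`).
    let value = timeout(Duration::from_secs(xml_timeout_seconds), extract).await??;
    Ok(value)
}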
@ -399,10 +373,11 @@ async fn test_concurrent_extraction() -> Result<()> {
     // Verify all extractions succeeded
     for (i, task_result) in results.into_iter().enumerate() {
-        let extraction_result = task_result??;
-        assert!(extraction_result.success, "Task {} failed", i);
-        assert!(extraction_result.text.contains(&format!("Test document {}", i)));
-        assert!(extraction_result.word_count > 0);
+        let ocr_result = task_result??;
+        assert!(!ocr_result.text.is_empty(), "Task {} failed", i);
+        assert!(ocr_result.text.contains(&format!("Test document {}", i)));
+        assert!(ocr_result.confidence > 0.0);
+        assert!(ocr_result.word_count > 0);
     }

     Ok(())
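The concurrency test follows the usual spawn-then-join pattern, and the double ? on task_result unwraps the join error and the extraction error in turn. A stripped-down sketch, with a hypothetical extract_one standing in for the service call:

/// Run several extractions concurrently and fail the whole batch if any task fails.
async fn extract_all_concurrently(paths: Vec<String>) -> anyhow::Result<Vec<String>> {
    // Spawn one Tokio task per document...
    let handles: Vec<_> = paths
        .into_iter()
        .map(|path| tokio::spawn(async move { extract_one(&path).await }))
        .collect();

    // ...then join them, propagating both join errors and extraction errors.
    let mut texts = Vec::new();
    for handle in handles {
        texts.push(handle.await??);
    }
    Ok(texts)
}

/// Hypothetical stand-in for the real extraction call on the OCR service.
async fn extract_one(_path: &str) -> anyhow::Result<String> {
    Ok(String::from("extracted text"))
}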
@ -412,25 +387,14 @@ async fn test_concurrent_extraction() -> Result<()> {
 async fn test_circuit_breaker() -> Result<()> {
     let test_docs = OfficeTestDocuments::new()?;

-    // Create service with aggressive circuit breaker settings
+    // Create service with simple retry settings (circuit breaker functionality removed)
     let config = OcrConfig {
         fallback_config: FallbackConfig {
             enabled: true,
             max_retries: 0, // No retries to make failures immediate
             initial_retry_delay_ms: 10,
             max_retry_delay_ms: 100,
-            circuit_breaker: CircuitBreakerConfig {
-                enabled: true,
-                failure_threshold: 2, // Trip after just 2 failures
-                recovery_timeout_seconds: 1,
-                success_threshold_percentage: 100, // Require 100% success to close
-            },
-            learning: LearningConfig::default(),
-            method_timeouts: MethodTimeouts {
-                library_timeout_seconds: 30,
-                xml_timeout_seconds: 30,
-                ocr_timeout_seconds: 30,
-            },
+            xml_timeout_seconds: 30,
         },
         temp_dir: test_docs.temp_dir.path().to_string_lossy().to_string(),
     };
@ -458,24 +422,17 @@ async fn test_circuit_breaker() -> Result<()> {
     ).await;
     assert!(result2.is_err());

-    // Third attempt - should fail fast due to circuit breaker
+    // Third attempt - should succeed since circuit breaker functionality was removed
     let result3 = ocr_service.extract_text_from_office_document(
         &valid_path,
         "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
     ).await;

-    assert!(result3.is_err());
-    let error_msg = result3.unwrap_err().to_string();
-    assert!(error_msg.contains("circuit breaker") || error_msg.contains("open"));
-    // Wait for recovery timeout
-    tokio::time::sleep(Duration::from_secs(2)).await;
-    // Now should be able to process valid document (circuit goes to half-open)
-    let _result4 = ocr_service.extract_text_from_office_document(
-        &valid_path,
-        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-    ).await;
-    // This might still fail if circuit is still open, which is acceptable behavior
+    // With simplified architecture, valid documents should always work
+    assert!(result3.is_ok());
+    let valid_result = result3.unwrap();
+    assert!(valid_result.text.contains("Valid document"));
+    assert!(valid_result.confidence > 0.0);
+    assert!(valid_result.word_count > 0);

     Ok(())
 }
@ -501,6 +458,10 @@ async fn test_statistics_tracking() -> Result<()> {
         ).await;

         assert!(result.is_ok(), "Extraction {} failed: {:?}", i, result);
+        let ocr_result = result.unwrap();
+        assert!(!ocr_result.text.is_empty());
+        assert!(ocr_result.confidence > 0.0);
+        assert!(ocr_result.word_count > 0);
     }

     // Check updated stats
@ -534,25 +495,14 @@ async fn test_mime_type_support() -> Result<()> {
 async fn test_learning_mechanism() -> Result<()> {
     let test_docs = OfficeTestDocuments::new()?;

-    // Create service with learning enabled
+    // Create service with simple XML extraction (learning functionality removed)
     let config = OcrConfig {
         fallback_config: FallbackConfig {
             enabled: true,
             max_retries: 1,
             initial_retry_delay_ms: 10,
             max_retry_delay_ms: 100,
-            circuit_breaker: CircuitBreakerConfig {
-                enabled: false, // Disable to focus on learning
-                failure_threshold: 10,
-                recovery_timeout_seconds: 10,
-                success_threshold_percentage: 50,
-            },
-            learning: LearningConfig {
-                enabled: true,
-                cache_successful_methods: true,
-                cache_ttl_hours: 1,
-            },
-            method_timeouts: MethodTimeouts::default(),
+            xml_timeout_seconds: 30,
         },
         temp_dir: test_docs.temp_dir.path().to_string_lossy().to_string(),
     };
@ -569,15 +519,16 @@ async fn test_learning_mechanism() -> Result<()> {
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
).await; ).await;
assert!(result.is_ok(), "Learning iteration {} failed: {:?}", i, result); assert!(result.is_ok(), "Extraction iteration {} failed: {:?}", i, result);
let result = result?; let ocr_result = result?;
assert!(result.success); assert!(!ocr_result.text.is_empty());
assert!(result.text.contains(&format!("document {}", i))); assert!(ocr_result.text.contains(&format!("document {}", i)));
assert!(ocr_result.confidence > 0.0);
assert!(ocr_result.word_count > 0);
} }
// The learning mechanism should now have preferences cached // With the simplified XML-only architecture, the system should consistently work
// We can't easily test this directly without exposing internal state, // All extractions succeeded, indicating the XML extraction is working correctly
// but the fact that all extractions succeeded indicates the system is working
Ok(()) Ok(())
} }
@ -635,11 +586,11 @@ async fn benchmark_extraction_performance() -> Result<()> {
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
).await?; ).await?;
assert!(result.success); assert!(!result.text.is_empty());
println!("Iteration {}: {} ms, {} words", println!("Iteration {}: extracted {} chars, confidence: {:.1}%",
i, i,
result.processing_time.as_millis(), result.text.len(),
result.word_count result.confidence
); );
} }
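Since the result type no longer carries processing_time, the benchmark prints text length and confidence instead; if per-iteration timing is still wanted, it can be measured at the call site, for example (extract_one is again a hypothetical stand-in):

use std::time::Instant;

/// Sketch: time each extraction externally instead of relying on a
/// processing_time field on the result (which no longer exists).
async fn timed_extraction(iterations: usize) -> anyhow::Result<()> {
    for i in 0..iterations {
        let started = Instant::now();
        let text = extract_one("benchmark.docx").await?;
        println!(
            "Iteration {}: {} ms, {} chars",
            i,
            started.elapsed().as_millis(),
            text.len()
        );
    }
    Ok(())
}

/// Hypothetical stand-in for the real extraction call.
async fn extract_one(_path: &str) -> anyhow::Result<String> {
    Ok(String::from("extracted text"))
}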

View File

@ -115,6 +115,8 @@ mod tests {
             webdav_file_extensions: None,
             webdav_auto_sync: None,
             webdav_sync_interval_minutes: None,
+            office_extraction_timeout_seconds: None,
+            office_extraction_enable_detailed_logging: None,
         };

         let response = ctx.app
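The two new optional fields now appear in every settings fixture in this file. On the update payload they presumably look something like the sketch below; the struct name, field types, and serde derives are assumptions based only on the None defaults used here:

use serde::{Deserialize, Serialize};

/// Sketch of the new office-extraction knobs on the settings update payload.
/// Optional so that omitting them leaves the stored settings unchanged.
#[derive(Debug, Serialize, Deserialize)]
pub struct UpdateSettingsSketch {
    pub office_extraction_timeout_seconds: Option<i32>,
    pub office_extraction_enable_detailed_logging: Option<bool>,
    // ...the existing webdav_* and other optional fields continue as before.
}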
@ -238,6 +240,8 @@ mod tests {
             webdav_file_extensions: None,
             webdav_auto_sync: None,
             webdav_sync_interval_minutes: None,
+            office_extraction_timeout_seconds: None,
+            office_extraction_enable_detailed_logging: None,
         };

         let response = ctx.app
@ -388,6 +392,8 @@ mod tests {
             webdav_file_extensions: None,
             webdav_auto_sync: None,
             webdav_sync_interval_minutes: None,
+            office_extraction_timeout_seconds: None,
+            office_extraction_enable_detailed_logging: None,
         };

         let response = ctx.app
@ -515,6 +521,8 @@ mod tests {
             webdav_file_extensions: None,
             webdav_auto_sync: None,
             webdav_sync_interval_minutes: None,
+            office_extraction_timeout_seconds: None,
+            office_extraction_enable_detailed_logging: None,
         };

         let response = ctx.app