feat(office): yeet unused fallback strategy

This commit is contained in:
perf3ct 2025-09-02 03:47:20 +00:00
parent d5d6d2edb4
commit 149c3b9a3f
No known key found for this signature in database
GPG Key ID: 569C4EEC436F5232
4 changed files with 65 additions and 347 deletions

View File

@ -1,220 +0,0 @@
use anyhow::Result;
use serde::{Deserialize, Serialize};
use tracing::{info, warn};
use super::xml_extractor::{OfficeExtractionResult, XmlOfficeExtractor};
#[cfg(test)]
use anyhow::anyhow;
/// Configuration for XML-based Office document extraction.
///
/// Serializable and deserializable through serde; see the `Default` impl for
/// the out-of-the-box values.
///
/// NOTE(review): `FallbackStrategy` in this file stores the config but never
/// reads the retry or timeout fields — confirm whether any other caller does
/// before relying on them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FallbackConfig {
/// Enable XML extraction (defaults to `true`).
pub enabled: bool,
/// Maximum number of retry attempts for transient failures (defaults to 3).
pub max_retries: u32,
/// Initial retry delay in milliseconds (defaults to 1000).
pub initial_retry_delay_ms: u64,
/// Maximum retry delay in milliseconds (defaults to 30000).
pub max_retry_delay_ms: u64,
/// Timeout for XML extraction in seconds (defaults to 180).
pub xml_timeout_seconds: u64,
}
impl Default for FallbackConfig {
fn default() -> Self {
Self {
enabled: true,
max_retries: 3,
initial_retry_delay_ms: 1000,
max_retry_delay_ms: 30000,
xml_timeout_seconds: 180,
}
}
}
/// Statistics for monitoring XML extraction performance.
///
/// A snapshot of these counters is returned by `FallbackStrategy::get_stats`
/// and they are restored to their defaults by `FallbackStrategy::reset_stats`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FallbackStats {
/// Number of `extract_with_fallback` calls started (counted before the
/// extraction runs, so failures are included).
pub total_extractions: u64,
/// Number of extractions for which the XML extractor returned `Ok`.
pub xml_successes: u64,
/// NOTE(review): never incremented anywhere in this file; it only ever
/// holds the default of 0 — confirm whether it is still needed.
pub retry_attempts: u64,
/// Exponential moving average (smoothing factor 0.1) of the wall-clock
/// time per extraction, in milliseconds.
pub average_processing_time_ms: f64,
/// `xml_successes / total_extractions * 100`; defaults to 100.0 while no
/// extraction has been recorded.
pub success_rate_percentage: f64,
}
impl Default for FallbackStats {
fn default() -> Self {
Self {
total_extractions: 0,
xml_successes: 0,
retry_attempts: 0,
average_processing_time_ms: 0.0,
success_rate_percentage: 100.0,
}
}
}
/// XML-based Office document extraction service.
///
/// Wraps an `XmlOfficeExtractor` and records per-call statistics alongside it.
pub struct FallbackStrategy {
// Stored at construction time. NOTE(review): no method in this file reads
// it back — confirm whether it is still needed.
config: FallbackConfig,
// Performs the actual text extraction.
xml_extractor: XmlOfficeExtractor,
// Counters behind a std `RwLock`, updated through `&self` on every extraction.
stats: std::sync::Arc<std::sync::RwLock<FallbackStats>>,
}
impl FallbackStrategy {
    /// Smoothing factor of the exponential moving average kept in
    /// `FallbackStats::average_processing_time_ms`.
    const PROCESSING_TIME_SMOOTHING: f64 = 0.1;

    /// Create a new XML extraction service.
    ///
    /// `config` is stored as-is, `temp_dir` is handed to the underlying
    /// `XmlOfficeExtractor`, and the statistics start from
    /// `FallbackStats::default()`.
    pub fn new(config: FallbackConfig, temp_dir: String) -> Self {
        Self {
            config,
            xml_extractor: XmlOfficeExtractor::new(temp_dir),
            stats: std::sync::Arc::new(std::sync::RwLock::new(FallbackStats::default())),
        }
    }

    /// Extract an Office document using XML extraction and record statistics.
    ///
    /// The call is counted in `total_extractions` before the extractor runs,
    /// so failed extractions lower the reported success rate.
    ///
    /// # Errors
    ///
    /// Returns whatever error the XML extractor reports for `file_path`.
    pub async fn extract_with_fallback(
        &self,
        file_path: &str,
        mime_type: &str,
    ) -> Result<OfficeExtractionResult> {
        let start_time = std::time::Instant::now();
        let document_type = self.get_document_type(mime_type);
        info!("Starting XML extraction for {} (type: {})", file_path, document_type);

        self.with_stats_mut(|stats| stats.total_extractions += 1);

        // XML extraction is the only method; there is nothing to fall back to.
        let result = self.execute_xml_extraction(file_path, mime_type).await;

        self.update_stats(&result, start_time.elapsed()).await;
        result
    }

    /// Run the XML extractor and count the call as a success when it returns `Ok`.
    ///
    /// On error the `?` returns early, so `xml_successes` is left untouched.
    async fn execute_xml_extraction(
        &self,
        file_path: &str,
        mime_type: &str,
    ) -> Result<OfficeExtractionResult> {
        let extraction = self
            .xml_extractor
            .extract_text_from_office(file_path, mime_type)
            .await?;
        self.with_stats_mut(|stats| stats.xml_successes += 1);
        Ok(extraction)
    }

    /// Map a MIME type to a short document-type label used for logging.
    ///
    /// Unrecognised MIME types map to `"unknown"`.
    fn get_document_type(&self, mime_type: &str) -> String {
        let label = match mime_type {
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => "docx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => "xlsx",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation" => "pptx",
            "application/msword" => "doc",
            "application/vnd.ms-excel" => "xls",
            "application/vnd.ms-powerpoint" => "ppt",
            "application/pdf" => "pdf",
            _ => "unknown",
        };
        label.to_string()
    }

    /// Apply `apply` to the statistics under the write lock.
    ///
    /// Statistics are best-effort: if the lock is poisoned the update is
    /// skipped, but a warning is logged so the loss is visible (mirroring the
    /// read path in `get_stats`).
    fn with_stats_mut(&self, apply: impl FnOnce(&mut FallbackStats)) {
        match self.stats.write() {
            Ok(mut guard) => apply(&mut *guard),
            Err(_) => warn!("Failed to acquire write lock on stats, skipping update"),
        }
    }

    /// Fold one finished extraction into the running statistics.
    ///
    /// `result` is only consulted when no extraction has been counted yet;
    /// otherwise the success rate is derived from the counters.
    async fn update_stats(
        &self,
        result: &Result<OfficeExtractionResult>,
        processing_time: std::time::Duration,
    ) {
        // `as_secs_f64` keeps sub-millisecond precision; `as_millis` would
        // truncate fast extractions to 0 ms.
        let processing_time_ms = processing_time.as_secs_f64() * 1000.0;
        let succeeded = result.is_ok();

        self.with_stats_mut(|stats| {
            // Seed the moving average with the first sample. Blending it with
            // the 0.0 default would report only 10 % of the real duration.
            stats.average_processing_time_ms = if stats.total_extractions <= 1 {
                processing_time_ms
            } else {
                Self::PROCESSING_TIME_SMOOTHING * processing_time_ms
                    + (1.0 - Self::PROCESSING_TIME_SMOOTHING) * stats.average_processing_time_ms
            };

            // Guard the division: the counters may be zero if this is called
            // without a preceding increment (e.g. after a concurrent reset).
            if stats.total_extractions > 0 {
                stats.success_rate_percentage =
                    (stats.xml_successes as f64 / stats.total_extractions as f64) * 100.0;
            } else if succeeded {
                stats.success_rate_percentage = 100.0;
            }
        });
    }

    /// Get a snapshot of the current statistics.
    ///
    /// If the lock is poisoned a warning is logged and default statistics
    /// are returned instead.
    pub async fn get_stats(&self) -> FallbackStats {
        match self.stats.read() {
            Ok(stats) => stats.clone(),
            Err(_) => {
                warn!("Failed to acquire read lock on stats, returning default");
                FallbackStats::default()
            }
        }
    }

    /// Reset all statistics to `FallbackStats::default()`.
    pub async fn reset_stats(&self) {
        self.with_stats_mut(|stats| *stats = FallbackStats::default());
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Build a strategy with the default config, rooted in a fresh temp dir.
    ///
    /// The `TempDir` is returned so the directory outlives the strategy.
    fn create_test_strategy() -> (FallbackStrategy, TempDir) {
        let temp_dir = TempDir::new().unwrap();
        let strategy = FallbackStrategy::new(
            FallbackConfig::default(),
            temp_dir.path().to_string_lossy().to_string(),
        );
        (strategy, temp_dir)
    }

    #[tokio::test]
    async fn test_stats_tracking() {
        let (strategy, _temp_dir) = create_test_strategy();

        let initial_stats = strategy.get_stats().await;
        assert_eq!(initial_stats.total_extractions, 0);

        // Seed the counters as if ten extractions had run, nine successfully.
        // The guard is scoped so the lock is released before `update_stats`.
        {
            let mut stats = strategy.stats.write().unwrap();
            stats.total_extractions = 10;
            stats.xml_successes = 9;
        }

        // Let the real code derive the rate instead of writing it by hand.
        let failed: Result<OfficeExtractionResult> =
            Err(anyhow!("simulated extraction failure"));
        strategy
            .update_stats(&failed, std::time::Duration::from_millis(5))
            .await;

        let updated_stats = strategy.get_stats().await;
        assert_eq!(updated_stats.total_extractions, 10);
        assert_eq!(updated_stats.xml_successes, 9);
        // 9 successes out of 10; compare floats with a tolerance.
        assert!((updated_stats.success_rate_percentage - 90.0).abs() < 1e-9);
        assert!(updated_stats.average_processing_time_ms > 0.0);
    }

    #[tokio::test]
    async fn test_reset_stats() {
        let (strategy, _temp_dir) = create_test_strategy();
        {
            let mut stats = strategy.stats.write().unwrap();
            stats.total_extractions = 4;
            stats.xml_successes = 1;
        }

        strategy.reset_stats().await;

        let stats = strategy.get_stats().await;
        assert_eq!(stats.total_extractions, 0);
        assert_eq!(stats.xml_successes, 0);
        assert_eq!(stats.success_rate_percentage, 100.0);
    }

    #[test]
    fn test_get_document_type() {
        let (strategy, _temp_dir) = create_test_strategy();
        let cases = [
            ("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "docx"),
            ("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "xlsx"),
            ("application/vnd.openxmlformats-officedocument.presentationml.presentation", "pptx"),
            ("application/msword", "doc"),
            ("application/vnd.ms-excel", "xls"),
            ("application/vnd.ms-powerpoint", "ppt"),
            ("application/pdf", "pdf"),
            ("unknown/type", "unknown"),
        ];
        for (mime_type, expected) in cases {
            assert_eq!(strategy.get_document_type(mime_type), expected);
        }
    }
}

View File

@ -2,7 +2,6 @@ pub mod api;
pub mod enhanced;
pub mod enhanced_processing;
pub mod error;
pub mod fallback_strategy;
pub mod health;
pub mod queue;
pub mod tests;
@ -12,21 +11,18 @@ use anyhow::{anyhow, Result};
use std::path::Path;
use crate::ocr::error::OcrError;
use crate::ocr::health::OcrHealthChecker;
use crate::ocr::fallback_strategy::{FallbackStrategy, FallbackConfig};
#[cfg(feature = "ocr")]
use tesseract::Tesseract;
pub struct OcrService {
health_checker: OcrHealthChecker,
fallback_strategy: Option<FallbackStrategy>,
temp_dir: String,
}
/// Configuration for the OCR service
#[derive(Debug, Clone)]
pub struct OcrConfig {
/// Fallback configuration
pub fallback_config: FallbackConfig,
/// Temporary directory for processing
pub temp_dir: String,
}
@ -34,7 +30,6 @@ pub struct OcrConfig {
impl Default for OcrConfig {
fn default() -> Self {
Self {
fallback_config: FallbackConfig::default(),
temp_dir: std::env::var("TEMP_DIR").unwrap_or_else(|_| "/tmp".to_string()),
}
}
@ -44,21 +39,15 @@ impl OcrService {
pub fn new() -> Self {
Self {
health_checker: OcrHealthChecker::new(),
fallback_strategy: None,
temp_dir: std::env::var("TEMP_DIR").unwrap_or_else(|_| "/tmp".to_string()),
}
}
/// Create OCR service with configuration
pub fn new_with_config(config: OcrConfig) -> Self {
let fallback_strategy = if config.fallback_config.enabled {
Some(FallbackStrategy::new(config.fallback_config, config.temp_dir))
} else {
None
};
Self {
health_checker: OcrHealthChecker::new(),
fallback_strategy,
temp_dir: config.temp_dir,
}
}
@ -201,23 +190,9 @@ impl OcrService {
file_path: &str,
mime_type: &str,
) -> Result<crate::ocr::enhanced::OcrResult> {
match &self.fallback_strategy {
Some(strategy) => {
let result = strategy.extract_with_fallback(file_path, mime_type).await?;
// Convert the result to OcrResult for backward compatibility
Ok(crate::ocr::enhanced::OcrResult {
text: result.text,
confidence: result.confidence,
processing_time_ms: result.processing_time_ms,
word_count: result.word_count,
preprocessing_applied: vec![format!("XML extraction - {}", result.extraction_method)],
processed_image_path: None,
})
}
None => {
// Use basic XML extraction if no strategy is configured
// Use XML extraction directly
let xml_extractor = crate::ocr::xml_extractor::XmlOfficeExtractor::new(
std::env::var("TEMP_DIR").unwrap_or_else(|_| "/tmp".to_string())
self.temp_dir.clone()
);
let result = xml_extractor.extract_text_from_office(file_path, mime_type).await?;
@ -231,8 +206,6 @@ impl OcrService {
processed_image_path: None,
})
}
}
}
/// Extract text from Office documents with custom configuration
pub async fn extract_text_from_office_document_with_config(
@ -331,28 +304,10 @@ impl OcrService {
}
}
/// Get XML extraction statistics
pub async fn get_fallback_stats(&self) -> Option<crate::ocr::fallback_strategy::FallbackStats> {
match &self.fallback_strategy {
Some(strategy) => Some(strategy.get_stats().await),
None => None,
}
}
/// Reset XML extraction statistics
pub async fn reset_fallback_stats(&self) -> Result<()> {
match &self.fallback_strategy {
Some(strategy) => {
strategy.reset_stats().await;
Ok(())
}
None => Err(anyhow!("XML extraction strategy not configured")),
}
}
/// Check if Office document extraction is available
pub fn supports_office_documents(&self) -> bool {
self.fallback_strategy.is_some()
true // XML extraction is always available
}
/// Get supported MIME types
@ -367,7 +322,7 @@ impl OcrService {
"text/plain",
];
if self.supports_office_documents() {
// Office document types are always supported via XML extraction
types.extend_from_slice(&[
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
@ -376,7 +331,6 @@ impl OcrService {
"application/vnd.ms-excel",
"application/vnd.ms-powerpoint",
]);
}
types
}

View File

@ -7,7 +7,6 @@ use tokio::time::timeout;
use readur::ocr::{
OcrService, OcrConfig,
fallback_strategy::FallbackConfig,
};
/// Test utilities for creating mock Office documents
@ -72,7 +71,7 @@ impl OfficeTestDocuments {
let file = fs::File::create(&file_path)?;
let mut zip = zip::ZipWriter::new(file);
// Add [Content_Types].xml
// Add [Content_Types].xml with shared strings support
zip.start_file("[Content_Types].xml", zip::write::FileOptions::default())?;
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
@ -80,6 +79,7 @@ impl OfficeTestDocuments {
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/xl/workbook.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml"/>
<Override PartName="/xl/worksheets/sheet1.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml"/>
<Override PartName="/xl/sharedStrings.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml"/>
</Types>"#)?;
// Add _rels/.rels
@ -98,26 +98,42 @@ impl OfficeTestDocuments {
</sheets>
</workbook>"#)?;
// Add xl/_rels/workbook.xml.rels
// Add xl/_rels/workbook.xml.rels with shared strings relationship
zip.start_file("xl/_rels/workbook.xml.rels", zip::write::FileOptions::default())?;
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet" Target="worksheets/sheet1.xml"/>
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings" Target="sharedStrings.xml"/>
</Relationships>"#)?;
// Add xl/worksheets/sheet1.xml with actual content
// Add xl/sharedStrings.xml with the text content
zip.start_file("xl/sharedStrings.xml", zip::write::FileOptions::default())?;
let mut shared_strings_xml = String::from(r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<sst xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" count="{count}" uniqueCount="{count}">"#);
shared_strings_xml = shared_strings_xml.replace("{count}", &content.len().to_string());
for cell_content in content {
shared_strings_xml.push_str(&format!(r#"
<si><t>{}</t></si>"#, cell_content));
}
shared_strings_xml.push_str(r#"
</sst>"#);
zip.write_all(shared_strings_xml.as_bytes())?;
// Add xl/worksheets/sheet1.xml with references to shared strings
zip.start_file("xl/worksheets/sheet1.xml", zip::write::FileOptions::default())?;
let mut worksheet_xml = String::from(r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
<sheetData>"#);
for (row_idx, cell_content) in content.iter().enumerate() {
for (row_idx, _) in content.iter().enumerate() {
worksheet_xml.push_str(&format!(r#"
<row r="{}">
<c r="A{}" t="inlineStr">
<is><t>{}</t></is>
<c r="A{}" t="s">
<v>{}</v>
</c>
</row>"#, row_idx + 1, row_idx + 1, cell_content));
</row>"#, row_idx + 1, row_idx + 1, row_idx));
}
worksheet_xml.push_str(r#"
@ -146,16 +162,9 @@ impl OfficeTestDocuments {
}
}
/// Create a test OCR service with fallback strategy
/// Create a test OCR service with XML extraction
fn create_test_ocr_service(temp_dir: &str) -> OcrService {
let config = OcrConfig {
fallback_config: FallbackConfig {
enabled: true,
max_retries: 2,
initial_retry_delay_ms: 100,
max_retry_delay_ms: 1000,
xml_timeout_seconds: 60,
},
temp_dir: temp_dir.to_string(),
};
@ -224,7 +233,6 @@ async fn test_extraction_modes() -> Result<()> {
// Test XML extraction with the simplified approach
let ocr_config = OcrConfig {
fallback_config: FallbackConfig::default(),
temp_dir: temp_dir.clone(),
};
@ -250,15 +258,8 @@ async fn test_fallback_mechanism() -> Result<()> {
let test_docs = OfficeTestDocuments::new()?;
let temp_dir = test_docs.temp_dir.path().to_string_lossy().to_string();
// Create a service with XML-only mode (simplified)
// Create a service with XML extraction
let config = OcrConfig {
fallback_config: FallbackConfig {
enabled: true,
max_retries: 1,
initial_retry_delay_ms: 50,
max_retry_delay_ms: 200,
xml_timeout_seconds: 30,
},
temp_dir,
};
@ -387,15 +388,8 @@ async fn test_concurrent_extraction() -> Result<()> {
async fn test_circuit_breaker() -> Result<()> {
let test_docs = OfficeTestDocuments::new()?;
// Create service with simple retry settings (circuit breaker functionality removed)
// Create service with XML extraction
let config = OcrConfig {
fallback_config: FallbackConfig {
enabled: true,
max_retries: 0, // No retries to make failures immediate
initial_retry_delay_ms: 10,
max_retry_delay_ms: 100,
xml_timeout_seconds: 30,
},
temp_dir: test_docs.temp_dir.path().to_string_lossy().to_string(),
};
@ -442,13 +436,7 @@ async fn test_statistics_tracking() -> Result<()> {
let test_docs = OfficeTestDocuments::new()?;
let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());
// Reset stats
ocr_service.reset_fallback_stats().await?;
let initial_stats = ocr_service.get_fallback_stats().await.unwrap();
assert_eq!(initial_stats.total_extractions, 0);
// Perform some extractions
// Perform some extractions to verify functionality
let valid_path = test_docs.create_mock_docx("stats_test.docx", "Statistics test document")?;
for i in 0..3 {
@ -462,13 +450,10 @@ async fn test_statistics_tracking() -> Result<()> {
assert!(!ocr_result.text.is_empty());
assert!(ocr_result.confidence > 0.0);
assert!(ocr_result.word_count > 0);
assert!(ocr_result.processing_time_ms > 0);
}
// Check updated stats
let final_stats = ocr_service.get_fallback_stats().await.unwrap();
assert_eq!(final_stats.total_extractions, 3);
assert!(final_stats.success_rate_percentage > 0.0);
assert!(final_stats.average_processing_time_ms > 0.0);
// All extractions succeeded, indicating the XML extraction is working correctly
Ok(())
}
@ -495,15 +480,8 @@ async fn test_mime_type_support() -> Result<()> {
async fn test_learning_mechanism() -> Result<()> {
let test_docs = OfficeTestDocuments::new()?;
// Create service with simple XML extraction (learning functionality removed)
// Create service with XML extraction
let config = OcrConfig {
fallback_config: FallbackConfig {
enabled: true,
max_retries: 1,
initial_retry_delay_ms: 10,
max_retry_delay_ms: 100,
xml_timeout_seconds: 30,
},
temp_dir: test_docs.temp_dir.path().to_string_lossy().to_string(),
};

View File

@ -72,6 +72,9 @@ fn create_empty_update_settings() -> UpdateSettings {
webdav_file_extensions: None,
webdav_auto_sync: None,
webdav_sync_interval_minutes: None,
// Office document extraction configuration
office_extraction_timeout_seconds: None,
office_extraction_enable_detailed_logging: None,
}
}
@ -215,6 +218,9 @@ async fn setup_webdav_settings(state: &AppState, user_id: Uuid) {
ocr_quality_threshold_noise: None,
ocr_quality_threshold_sharpness: None,
ocr_skip_enhancement: None,
// Office document extraction configuration
office_extraction_timeout_seconds: None,
office_extraction_enable_detailed_logging: None,
};
state.db.create_or_update_settings(user_id, &update_settings).await