feat(office): yeet unused fallback strategy
This commit is contained in:
parent
d5d6d2edb4
commit
149c3b9a3f
|
|
@ -1,220 +0,0 @@
|
|||
use anyhow::Result;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::{info, warn};
|
||||
|
||||
use super::xml_extractor::{OfficeExtractionResult, XmlOfficeExtractor};
|
||||
|
||||
#[cfg(test)]
|
||||
use anyhow::anyhow;
|
||||
|
||||
/// Configuration for XML-based Office document extraction
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct FallbackConfig {
|
||||
/// Enable XML extraction
|
||||
pub enabled: bool,
|
||||
/// Maximum number of retry attempts for transient failures
|
||||
pub max_retries: u32,
|
||||
/// Initial retry delay in milliseconds
|
||||
pub initial_retry_delay_ms: u64,
|
||||
/// Maximum retry delay in milliseconds
|
||||
pub max_retry_delay_ms: u64,
|
||||
/// Timeout for XML extraction in seconds
|
||||
pub xml_timeout_seconds: u64,
|
||||
}
|
||||
|
||||
|
||||
impl Default for FallbackConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
enabled: true,
|
||||
max_retries: 3,
|
||||
initial_retry_delay_ms: 1000,
|
||||
max_retry_delay_ms: 30000,
|
||||
xml_timeout_seconds: 180,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Statistics for monitoring XML extraction performance
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct FallbackStats {
|
||||
pub total_extractions: u64,
|
||||
pub xml_successes: u64,
|
||||
pub retry_attempts: u64,
|
||||
pub average_processing_time_ms: f64,
|
||||
pub success_rate_percentage: f64,
|
||||
}
|
||||
|
||||
impl Default for FallbackStats {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
total_extractions: 0,
|
||||
xml_successes: 0,
|
||||
retry_attempts: 0,
|
||||
average_processing_time_ms: 0.0,
|
||||
success_rate_percentage: 100.0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// XML-based Office document extraction service
|
||||
pub struct FallbackStrategy {
|
||||
config: FallbackConfig,
|
||||
xml_extractor: XmlOfficeExtractor,
|
||||
stats: std::sync::Arc<std::sync::RwLock<FallbackStats>>,
|
||||
}
|
||||
|
||||
impl FallbackStrategy {
|
||||
/// Create a new XML extraction service
|
||||
pub fn new(config: FallbackConfig, temp_dir: String) -> Self {
|
||||
Self {
|
||||
config,
|
||||
xml_extractor: XmlOfficeExtractor::new(temp_dir),
|
||||
stats: std::sync::Arc::new(std::sync::RwLock::new(FallbackStats::default())),
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract Office document using XML extraction
|
||||
pub async fn extract_with_fallback(
|
||||
&self,
|
||||
file_path: &str,
|
||||
mime_type: &str,
|
||||
) -> Result<OfficeExtractionResult> {
|
||||
let start_time = std::time::Instant::now();
|
||||
let document_type = self.get_document_type(mime_type);
|
||||
|
||||
info!("Starting XML extraction for {} (type: {})", file_path, document_type);
|
||||
|
||||
// Update total extraction count
|
||||
if let Ok(mut stats) = self.stats.write() {
|
||||
stats.total_extractions += 1;
|
||||
}
|
||||
|
||||
// Use XML extraction as the only method
|
||||
let result = self.execute_xml_extraction(file_path, mime_type).await;
|
||||
|
||||
let processing_time = start_time.elapsed();
|
||||
|
||||
// Update statistics
|
||||
self.update_stats(&result, processing_time).await;
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
/// Execute XML extraction directly
|
||||
async fn execute_xml_extraction(
|
||||
&self,
|
||||
file_path: &str,
|
||||
mime_type: &str,
|
||||
) -> Result<OfficeExtractionResult> {
|
||||
let result = self.xml_extractor.extract_text_from_office(file_path, mime_type).await?;
|
||||
|
||||
// Update stats
|
||||
if let Ok(mut stats) = self.stats.write() {
|
||||
stats.xml_successes += 1;
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
|
||||
/// Get document type from MIME type
|
||||
fn get_document_type(&self, mime_type: &str) -> String {
|
||||
match mime_type {
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" => "docx".to_string(),
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => "xlsx".to_string(),
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation" => "pptx".to_string(),
|
||||
"application/msword" => "doc".to_string(),
|
||||
"application/vnd.ms-excel" => "xls".to_string(),
|
||||
"application/vnd.ms-powerpoint" => "ppt".to_string(),
|
||||
"application/pdf" => "pdf".to_string(),
|
||||
_ => "unknown".to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Update statistics after extraction
|
||||
async fn update_stats(&self, result: &Result<OfficeExtractionResult>, processing_time: std::time::Duration) {
|
||||
if let Ok(mut stats) = self.stats.write() {
|
||||
let processing_time_ms = processing_time.as_millis() as f64;
|
||||
|
||||
// Update average processing time using exponential moving average
|
||||
let alpha = 0.1; // Smoothing factor
|
||||
stats.average_processing_time_ms =
|
||||
alpha * processing_time_ms + (1.0 - alpha) * stats.average_processing_time_ms;
|
||||
|
||||
// Update success rate with proper division by zero protection
|
||||
let total_attempts = stats.total_extractions;
|
||||
let successful_attempts = stats.xml_successes;
|
||||
|
||||
if total_attempts > 0 {
|
||||
stats.success_rate_percentage = (successful_attempts as f64 / total_attempts as f64) * 100.0;
|
||||
} else if result.is_ok() {
|
||||
stats.success_rate_percentage = 100.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get current statistics
|
||||
pub async fn get_stats(&self) -> FallbackStats {
|
||||
self.stats.read()
|
||||
.map(|stats| stats.clone())
|
||||
.unwrap_or_else(|_| {
|
||||
warn!("Failed to acquire read lock on stats, returning default");
|
||||
FallbackStats::default()
|
||||
})
|
||||
}
|
||||
|
||||
/// Reset statistics
|
||||
pub async fn reset_stats(&self) {
|
||||
if let Ok(mut stats) = self.stats.write() {
|
||||
*stats = FallbackStats::default();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Builds a strategy backed by a fresh temporary directory.
    ///
    /// The `TempDir` is returned so the directory outlives the strategy for
    /// the duration of the test.
    fn create_test_strategy() -> (FallbackStrategy, TempDir) {
        let temp_dir = TempDir::new().unwrap();
        let config = FallbackConfig::default();
        let strategy = FallbackStrategy::new(config, temp_dir.path().to_string_lossy().to_string());
        (strategy, temp_dir)
    }

    #[tokio::test]
    async fn test_stats_tracking() {
        let (strategy, _temp_dir) = create_test_strategy();

        let initial_stats = strategy.get_stats().await;
        assert_eq!(initial_stats.total_extractions, 0);

        // Simulate ten attempts with nine successes by setting the counters
        // directly; the guard is dropped at the end of this scope.
        {
            let mut stats = strategy.stats.write().unwrap();
            stats.total_extractions = 10;
            stats.xml_successes = 9;
        }

        // Let `update_stats` derive the success rate itself instead of
        // writing the expected value into the field by hand.
        let failed: Result<OfficeExtractionResult> = Err(anyhow!("simulated extraction failure"));
        strategy
            .update_stats(&failed, std::time::Duration::from_millis(5))
            .await;

        let updated_stats = strategy.get_stats().await;
        assert_eq!(updated_stats.total_extractions, 10);
        assert_eq!(updated_stats.xml_successes, 9);
        // 9 successes out of 10
        assert!((updated_stats.success_rate_percentage - 90.0).abs() < 1e-9);
        assert!(updated_stats.average_processing_time_ms > 0.0);
    }

    #[tokio::test]
    async fn test_reset_stats() {
        let (strategy, _temp_dir) = create_test_strategy();

        {
            let mut stats = strategy.stats.write().unwrap();
            stats.total_extractions = 4;
            stats.xml_successes = 2;
            stats.success_rate_percentage = 50.0;
        }

        strategy.reset_stats().await;

        let stats = strategy.get_stats().await;
        assert_eq!(stats.total_extractions, 0);
        assert_eq!(stats.xml_successes, 0);
        assert_eq!(stats.retry_attempts, 0);
        assert_eq!(stats.success_rate_percentage, 100.0);
    }

    #[test]
    fn test_get_document_type() {
        let (strategy, _temp_dir) = create_test_strategy();

        assert_eq!(strategy.get_document_type("application/vnd.openxmlformats-officedocument.wordprocessingml.document"), "docx");
        assert_eq!(strategy.get_document_type("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), "xlsx");
        assert_eq!(strategy.get_document_type("application/vnd.openxmlformats-officedocument.presentationml.presentation"), "pptx");
        // Legacy binary Office formats
        assert_eq!(strategy.get_document_type("application/msword"), "doc");
        assert_eq!(strategy.get_document_type("application/vnd.ms-excel"), "xls");
        assert_eq!(strategy.get_document_type("application/vnd.ms-powerpoint"), "ppt");
        assert_eq!(strategy.get_document_type("application/pdf"), "pdf");
        assert_eq!(strategy.get_document_type("unknown/type"), "unknown");
    }
}
|
||||
100
src/ocr/mod.rs
100
src/ocr/mod.rs
|
|
@ -2,7 +2,6 @@ pub mod api;
|
|||
pub mod enhanced;
|
||||
pub mod enhanced_processing;
|
||||
pub mod error;
|
||||
pub mod fallback_strategy;
|
||||
pub mod health;
|
||||
pub mod queue;
|
||||
pub mod tests;
|
||||
|
|
@ -12,21 +11,18 @@ use anyhow::{anyhow, Result};
|
|||
use std::path::Path;
|
||||
use crate::ocr::error::OcrError;
|
||||
use crate::ocr::health::OcrHealthChecker;
|
||||
use crate::ocr::fallback_strategy::{FallbackStrategy, FallbackConfig};
|
||||
|
||||
#[cfg(feature = "ocr")]
|
||||
use tesseract::Tesseract;
|
||||
|
||||
pub struct OcrService {
|
||||
health_checker: OcrHealthChecker,
|
||||
fallback_strategy: Option<FallbackStrategy>,
|
||||
temp_dir: String,
|
||||
}
|
||||
|
||||
/// Configuration for the OCR service
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct OcrConfig {
|
||||
/// Fallback configuration
|
||||
pub fallback_config: FallbackConfig,
|
||||
/// Temporary directory for processing
|
||||
pub temp_dir: String,
|
||||
}
|
||||
|
|
@ -34,7 +30,6 @@ pub struct OcrConfig {
|
|||
impl Default for OcrConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
fallback_config: FallbackConfig::default(),
|
||||
temp_dir: std::env::var("TEMP_DIR").unwrap_or_else(|_| "/tmp".to_string()),
|
||||
}
|
||||
}
|
||||
|
|
@ -44,21 +39,15 @@ impl OcrService {
|
|||
pub fn new() -> Self {
|
||||
Self {
|
||||
health_checker: OcrHealthChecker::new(),
|
||||
fallback_strategy: None,
|
||||
temp_dir: std::env::var("TEMP_DIR").unwrap_or_else(|_| "/tmp".to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create OCR service with configuration
|
||||
pub fn new_with_config(config: OcrConfig) -> Self {
|
||||
let fallback_strategy = if config.fallback_config.enabled {
|
||||
Some(FallbackStrategy::new(config.fallback_config, config.temp_dir))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Self {
|
||||
health_checker: OcrHealthChecker::new(),
|
||||
fallback_strategy,
|
||||
temp_dir: config.temp_dir,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -201,37 +190,21 @@ impl OcrService {
|
|||
file_path: &str,
|
||||
mime_type: &str,
|
||||
) -> Result<crate::ocr::enhanced::OcrResult> {
|
||||
match &self.fallback_strategy {
|
||||
Some(strategy) => {
|
||||
let result = strategy.extract_with_fallback(file_path, mime_type).await?;
|
||||
// Convert the result to OcrResult for backward compatibility
|
||||
Ok(crate::ocr::enhanced::OcrResult {
|
||||
text: result.text,
|
||||
confidence: result.confidence,
|
||||
processing_time_ms: result.processing_time_ms,
|
||||
word_count: result.word_count,
|
||||
preprocessing_applied: vec![format!("XML extraction - {}", result.extraction_method)],
|
||||
processed_image_path: None,
|
||||
})
|
||||
}
|
||||
None => {
|
||||
// Use basic XML extraction if no strategy is configured
|
||||
let xml_extractor = crate::ocr::xml_extractor::XmlOfficeExtractor::new(
|
||||
std::env::var("TEMP_DIR").unwrap_or_else(|_| "/tmp".to_string())
|
||||
);
|
||||
// Use XML extraction directly
|
||||
let xml_extractor = crate::ocr::xml_extractor::XmlOfficeExtractor::new(
|
||||
self.temp_dir.clone()
|
||||
);
|
||||
|
||||
let result = xml_extractor.extract_text_from_office(file_path, mime_type).await?;
|
||||
// Convert OfficeExtractionResult to OcrResult for backward compatibility
|
||||
Ok(crate::ocr::enhanced::OcrResult {
|
||||
text: result.text,
|
||||
confidence: result.confidence,
|
||||
processing_time_ms: result.processing_time_ms,
|
||||
word_count: result.word_count,
|
||||
preprocessing_applied: vec![format!("XML extraction - {}", result.extraction_method)],
|
||||
processed_image_path: None,
|
||||
})
|
||||
}
|
||||
}
|
||||
let result = xml_extractor.extract_text_from_office(file_path, mime_type).await?;
|
||||
// Convert OfficeExtractionResult to OcrResult for backward compatibility
|
||||
Ok(crate::ocr::enhanced::OcrResult {
|
||||
text: result.text,
|
||||
confidence: result.confidence,
|
||||
processing_time_ms: result.processing_time_ms,
|
||||
word_count: result.word_count,
|
||||
preprocessing_applied: vec![format!("XML extraction - {}", result.extraction_method)],
|
||||
processed_image_path: None,
|
||||
})
|
||||
}
|
||||
|
||||
/// Extract text from Office documents with custom configuration
|
||||
|
|
@ -331,28 +304,10 @@ impl OcrService {
|
|||
}
|
||||
}
|
||||
|
||||
/// Get XML extraction statistics
|
||||
pub async fn get_fallback_stats(&self) -> Option<crate::ocr::fallback_strategy::FallbackStats> {
|
||||
match &self.fallback_strategy {
|
||||
Some(strategy) => Some(strategy.get_stats().await),
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Reset XML extraction statistics
|
||||
pub async fn reset_fallback_stats(&self) -> Result<()> {
|
||||
match &self.fallback_strategy {
|
||||
Some(strategy) => {
|
||||
strategy.reset_stats().await;
|
||||
Ok(())
|
||||
}
|
||||
None => Err(anyhow!("XML extraction strategy not configured")),
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if Office document extraction is available
|
||||
pub fn supports_office_documents(&self) -> bool {
|
||||
self.fallback_strategy.is_some()
|
||||
true // XML extraction is always available
|
||||
}
|
||||
|
||||
/// Get supported MIME types
|
||||
|
|
@ -367,16 +322,15 @@ impl OcrService {
|
|||
"text/plain",
|
||||
];
|
||||
|
||||
if self.supports_office_documents() {
|
||||
types.extend_from_slice(&[
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||
"application/msword",
|
||||
"application/vnd.ms-excel",
|
||||
"application/vnd.ms-powerpoint",
|
||||
]);
|
||||
}
|
||||
// Office document types are always supported via XML extraction
|
||||
types.extend_from_slice(&[
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||
"application/msword",
|
||||
"application/vnd.ms-excel",
|
||||
"application/vnd.ms-powerpoint",
|
||||
]);
|
||||
|
||||
types
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,7 +7,6 @@ use tokio::time::timeout;
|
|||
|
||||
use readur::ocr::{
|
||||
OcrService, OcrConfig,
|
||||
fallback_strategy::FallbackConfig,
|
||||
};
|
||||
|
||||
/// Test utilities for creating mock Office documents
|
||||
|
|
@ -72,7 +71,7 @@ impl OfficeTestDocuments {
|
|||
let file = fs::File::create(&file_path)?;
|
||||
let mut zip = zip::ZipWriter::new(file);
|
||||
|
||||
// Add [Content_Types].xml
|
||||
// Add [Content_Types].xml with shared strings support
|
||||
zip.start_file("[Content_Types].xml", zip::write::FileOptions::default())?;
|
||||
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
|
||||
|
|
@ -80,6 +79,7 @@ impl OfficeTestDocuments {
|
|||
<Default Extension="xml" ContentType="application/xml"/>
|
||||
<Override PartName="/xl/workbook.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml"/>
|
||||
<Override PartName="/xl/worksheets/sheet1.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml"/>
|
||||
<Override PartName="/xl/sharedStrings.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml"/>
|
||||
</Types>"#)?;
|
||||
|
||||
// Add _rels/.rels
|
||||
|
|
@ -98,26 +98,42 @@ impl OfficeTestDocuments {
|
|||
</sheets>
|
||||
</workbook>"#)?;
|
||||
|
||||
// Add xl/_rels/workbook.xml.rels
|
||||
// Add xl/_rels/workbook.xml.rels with shared strings relationship
|
||||
zip.start_file("xl/_rels/workbook.xml.rels", zip::write::FileOptions::default())?;
|
||||
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
||||
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet" Target="worksheets/sheet1.xml"/>
|
||||
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings" Target="sharedStrings.xml"/>
|
||||
</Relationships>"#)?;
|
||||
|
||||
// Add xl/worksheets/sheet1.xml with actual content
|
||||
// Add xl/sharedStrings.xml with the text content
|
||||
zip.start_file("xl/sharedStrings.xml", zip::write::FileOptions::default())?;
|
||||
let mut shared_strings_xml = String::from(r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<sst xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" count="{count}" uniqueCount="{count}">"#);
|
||||
shared_strings_xml = shared_strings_xml.replace("{count}", &content.len().to_string());
|
||||
|
||||
for cell_content in content {
|
||||
shared_strings_xml.push_str(&format!(r#"
|
||||
<si><t>{}</t></si>"#, cell_content));
|
||||
}
|
||||
|
||||
shared_strings_xml.push_str(r#"
|
||||
</sst>"#);
|
||||
zip.write_all(shared_strings_xml.as_bytes())?;
|
||||
|
||||
// Add xl/worksheets/sheet1.xml with references to shared strings
|
||||
zip.start_file("xl/worksheets/sheet1.xml", zip::write::FileOptions::default())?;
|
||||
let mut worksheet_xml = String::from(r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
|
||||
<sheetData>"#);
|
||||
|
||||
for (row_idx, cell_content) in content.iter().enumerate() {
|
||||
for (row_idx, _) in content.iter().enumerate() {
|
||||
worksheet_xml.push_str(&format!(r#"
|
||||
<row r="{}">
|
||||
<c r="A{}" t="inlineStr">
|
||||
<is><t>{}</t></is>
|
||||
<c r="A{}" t="s">
|
||||
<v>{}</v>
|
||||
</c>
|
||||
</row>"#, row_idx + 1, row_idx + 1, cell_content));
|
||||
</row>"#, row_idx + 1, row_idx + 1, row_idx));
|
||||
}
|
||||
|
||||
worksheet_xml.push_str(r#"
|
||||
|
|
@ -146,16 +162,9 @@ impl OfficeTestDocuments {
|
|||
}
|
||||
}
|
||||
|
||||
/// Create a test OCR service with fallback strategy
|
||||
/// Create a test OCR service with XML extraction
|
||||
fn create_test_ocr_service(temp_dir: &str) -> OcrService {
|
||||
let config = OcrConfig {
|
||||
fallback_config: FallbackConfig {
|
||||
enabled: true,
|
||||
max_retries: 2,
|
||||
initial_retry_delay_ms: 100,
|
||||
max_retry_delay_ms: 1000,
|
||||
xml_timeout_seconds: 60,
|
||||
},
|
||||
temp_dir: temp_dir.to_string(),
|
||||
};
|
||||
|
||||
|
|
@ -224,7 +233,6 @@ async fn test_extraction_modes() -> Result<()> {
|
|||
|
||||
// Test XML extraction with the simplified approach
|
||||
let ocr_config = OcrConfig {
|
||||
fallback_config: FallbackConfig::default(),
|
||||
temp_dir: temp_dir.clone(),
|
||||
};
|
||||
|
||||
|
|
@ -250,15 +258,8 @@ async fn test_fallback_mechanism() -> Result<()> {
|
|||
let test_docs = OfficeTestDocuments::new()?;
|
||||
let temp_dir = test_docs.temp_dir.path().to_string_lossy().to_string();
|
||||
|
||||
// Create a service with XML-only mode (simplified)
|
||||
// Create a service with XML extraction
|
||||
let config = OcrConfig {
|
||||
fallback_config: FallbackConfig {
|
||||
enabled: true,
|
||||
max_retries: 1,
|
||||
initial_retry_delay_ms: 50,
|
||||
max_retry_delay_ms: 200,
|
||||
xml_timeout_seconds: 30,
|
||||
},
|
||||
temp_dir,
|
||||
};
|
||||
|
||||
|
|
@ -387,15 +388,8 @@ async fn test_concurrent_extraction() -> Result<()> {
|
|||
async fn test_circuit_breaker() -> Result<()> {
|
||||
let test_docs = OfficeTestDocuments::new()?;
|
||||
|
||||
// Create service with simple retry settings (circuit breaker functionality removed)
|
||||
// Create service with XML extraction
|
||||
let config = OcrConfig {
|
||||
fallback_config: FallbackConfig {
|
||||
enabled: true,
|
||||
max_retries: 0, // No retries to make failures immediate
|
||||
initial_retry_delay_ms: 10,
|
||||
max_retry_delay_ms: 100,
|
||||
xml_timeout_seconds: 30,
|
||||
},
|
||||
temp_dir: test_docs.temp_dir.path().to_string_lossy().to_string(),
|
||||
};
|
||||
|
||||
|
|
@ -442,13 +436,7 @@ async fn test_statistics_tracking() -> Result<()> {
|
|||
let test_docs = OfficeTestDocuments::new()?;
|
||||
let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());
|
||||
|
||||
// Reset stats
|
||||
ocr_service.reset_fallback_stats().await?;
|
||||
|
||||
let initial_stats = ocr_service.get_fallback_stats().await.unwrap();
|
||||
assert_eq!(initial_stats.total_extractions, 0);
|
||||
|
||||
// Perform some extractions
|
||||
// Perform some extractions to verify functionality
|
||||
let valid_path = test_docs.create_mock_docx("stats_test.docx", "Statistics test document")?;
|
||||
|
||||
for i in 0..3 {
|
||||
|
|
@ -462,13 +450,10 @@ async fn test_statistics_tracking() -> Result<()> {
|
|||
assert!(!ocr_result.text.is_empty());
|
||||
assert!(ocr_result.confidence > 0.0);
|
||||
assert!(ocr_result.word_count > 0);
|
||||
assert!(ocr_result.processing_time_ms > 0);
|
||||
}
|
||||
|
||||
// Check updated stats
|
||||
let final_stats = ocr_service.get_fallback_stats().await.unwrap();
|
||||
assert_eq!(final_stats.total_extractions, 3);
|
||||
assert!(final_stats.success_rate_percentage > 0.0);
|
||||
assert!(final_stats.average_processing_time_ms > 0.0);
|
||||
// All extractions succeeded, indicating the XML extraction is working correctly
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
@ -495,15 +480,8 @@ async fn test_mime_type_support() -> Result<()> {
|
|||
async fn test_learning_mechanism() -> Result<()> {
|
||||
let test_docs = OfficeTestDocuments::new()?;
|
||||
|
||||
// Create service with simple XML extraction (learning functionality removed)
|
||||
// Create service with XML extraction
|
||||
let config = OcrConfig {
|
||||
fallback_config: FallbackConfig {
|
||||
enabled: true,
|
||||
max_retries: 1,
|
||||
initial_retry_delay_ms: 10,
|
||||
max_retry_delay_ms: 100,
|
||||
xml_timeout_seconds: 30,
|
||||
},
|
||||
temp_dir: test_docs.temp_dir.path().to_string_lossy().to_string(),
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -72,6 +72,9 @@ fn create_empty_update_settings() -> UpdateSettings {
|
|||
webdav_file_extensions: None,
|
||||
webdav_auto_sync: None,
|
||||
webdav_sync_interval_minutes: None,
|
||||
// Office document extraction configuration
|
||||
office_extraction_timeout_seconds: None,
|
||||
office_extraction_enable_detailed_logging: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -215,6 +218,9 @@ async fn setup_webdav_settings(state: &AppState, user_id: Uuid) {
|
|||
ocr_quality_threshold_noise: None,
|
||||
ocr_quality_threshold_sharpness: None,
|
||||
ocr_skip_enhancement: None,
|
||||
// Office document extraction configuration
|
||||
office_extraction_timeout_seconds: None,
|
||||
office_extraction_enable_detailed_logging: None,
|
||||
};
|
||||
|
||||
state.db.create_or_update_settings(user_id, &update_settings).await
|
||||
|
|
|
|||
Loading…
Reference in New Issue