refactor(server): remove XML vs library comparison functionality
Remove all comparison-related code used to evaluate XML vs library-based Office document extraction. The XML approach has proven superior, so the comparison functionality is no longer needed. Changes: - Remove extraction_comparator.rs (entire comparison engine) - Remove test_extraction_comparison.rs binary - Remove comparison mode logic from enhanced.rs - Simplify fallback_strategy.rs to use XML extraction only - Update OCR service to use XML extraction as primary method - Clean up database migration to remove comparison-specific settings - Remove test_extraction binary from Cargo.toml - Update integration tests to work with simplified extraction The Office document extraction now flows directly to XML-based extraction without any comparison checks, maintaining the superior extraction quality while removing unnecessary complexity.
This commit is contained in:
parent
73525eca02
commit
774efd1140
|
|
@ -12,6 +12,7 @@ name = "test_runner"
|
|||
path = "src/bin/test_runner.rs"
|
||||
|
||||
|
||||
|
||||
[dependencies]
|
||||
tokio = { version = "1", features = ["full"] }
|
||||
axum = { version = "0.8", features = ["multipart", "ws"] }
|
||||
|
|
|
|||
|
|
@ -0,0 +1,21 @@
|
|||
-- Add office document extraction settings to the settings table
|
||||
-- This migration adds timeout controls for Office document extraction using XML parsing
|
||||
|
||||
-- Add office extraction timeout column (default: 120 seconds)
|
||||
ALTER TABLE settings
|
||||
ADD COLUMN office_extraction_timeout_seconds INTEGER NOT NULL DEFAULT 120
|
||||
CHECK (office_extraction_timeout_seconds > 0 AND office_extraction_timeout_seconds <= 600);
|
||||
|
||||
-- Add office extraction detailed logging column (default: false for production)
|
||||
ALTER TABLE settings
|
||||
ADD COLUMN office_extraction_enable_detailed_logging BOOLEAN NOT NULL DEFAULT false;
|
||||
|
||||
-- Add comment to document the new columns
|
||||
COMMENT ON COLUMN settings.office_extraction_timeout_seconds IS
|
||||
'Timeout in seconds for office document extraction (1-600 seconds, default: 120)';
|
||||
|
||||
COMMENT ON COLUMN settings.office_extraction_enable_detailed_logging IS
|
||||
'Enable detailed logging for office document extraction operations (default: false)';
|
||||
|
||||
-- The default values are already set in the column definitions above
|
||||
-- No need to insert default settings as they should be created when users are created
|
||||
|
|
@ -17,7 +17,6 @@ use tesseract::{Tesseract, PageSegMode, OcrEngineMode};
|
|||
use crate::models::Settings;
|
||||
use crate::services::file_service::FileService;
|
||||
use super::xml_extractor::XmlOfficeExtractor;
|
||||
use super::extraction_comparator::{ExtractionConfig, ExtractionMode, ExtractionComparator, SingleExtractionResult, ComparisonReport};
|
||||
// Removed text_sanitization import - now using minimal inline sanitization
|
||||
|
||||
/// RAII guard for automatic cleanup of temporary files
|
||||
|
|
@ -1497,68 +1496,10 @@ impl EnhancedOcrService {
|
|||
self.extract_text(file_path, mime_type, settings).await
|
||||
}
|
||||
|
||||
/// Extract text from Office documents (DOCX, DOC, Excel) with library and XML fallback
|
||||
/// Extract text from Office documents (DOCX, DOC, Excel) using XML extraction
|
||||
pub async fn extract_text_from_office(&self, file_path: &str, mime_type: &str, settings: &Settings) -> Result<OcrResult> {
|
||||
// Use the extraction mode from settings to determine behavior
|
||||
let (result, comparison_report) = self.extract_text_from_office_with_mode(file_path, mime_type, settings).await?;
|
||||
|
||||
// Log comparison report if available
|
||||
if let Some(report) = comparison_report {
|
||||
info!("╔════════════════════════════════════════════════════════════╗");
|
||||
info!("║ 📊 OFFICE DOCUMENT EXTRACTION COMPARISON REPORT 📊 ║");
|
||||
info!("╠════════════════════════════════════════════════════════════╣");
|
||||
info!("║ Similarity Score: {:.2}%", report.similarity_score * 100.0);
|
||||
info!("╠════════════════════════════════════════════════════════════╣");
|
||||
info!("║ LIBRARY EXTRACTION (docx-rs/calamine):");
|
||||
if let Some(lib_result) = &report.library_result {
|
||||
info!("║ ✓ Success: {} words in {}ms", lib_result.word_count, lib_result.processing_time_ms);
|
||||
info!("║ Characters: {}", lib_result.text_length);
|
||||
} else {
|
||||
info!("║ ✗ Failed");
|
||||
}
|
||||
info!("╠════════════════════════════════════════════════════════════╣");
|
||||
info!("║ XML EXTRACTION (manual parsing):");
|
||||
if let Some(xml_result) = &report.xml_result {
|
||||
info!("║ ✓ Success: {} words in {}ms", xml_result.word_count, xml_result.processing_time_ms);
|
||||
info!("║ Characters: {}", xml_result.text_length);
|
||||
} else {
|
||||
info!("║ ✗ Failed");
|
||||
}
|
||||
info!("╠════════════════════════════════════════════════════════════╣");
|
||||
info!("║ RECOMMENDATION: {}", report.recommended_method);
|
||||
if report.performance_metrics.speed_improvement_factor > 1.0 {
|
||||
info!("║ Speed Advantage: {:.1}x faster", report.performance_metrics.speed_improvement_factor);
|
||||
}
|
||||
info!("╚════════════════════════════════════════════════════════════╝");
|
||||
} else {
|
||||
warn!("⚠️ No comparison report generated - this shouldn't happen in CompareAlways mode!");
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Extract text from Office documents with configurable extraction mode and comparison
|
||||
pub async fn extract_text_from_office_with_mode(
|
||||
&self,
|
||||
file_path: &str,
|
||||
mime_type: &str,
|
||||
settings: &Settings
|
||||
) -> Result<(OcrResult, Option<ComparisonReport>)> {
|
||||
let start_time = std::time::Instant::now();
|
||||
info!("Extracting text from Office document with mode: {} (type: {})", file_path, mime_type);
|
||||
|
||||
// TEMPORARY: Hardcode comparison mode for evaluation
|
||||
let config = ExtractionConfig {
|
||||
mode: ExtractionMode::CompareAlways, // Always compare both methods
|
||||
timeout_seconds: 180, // Give enough time for both extractions
|
||||
enable_detailed_logging: true, // Always log details
|
||||
};
|
||||
|
||||
info!("📊 FORCED COMPARISON MODE: Running both library and XML extraction for evaluation");
|
||||
|
||||
if config.enable_detailed_logging {
|
||||
info!("Office extraction mode: {:?}, timeout: {}s", config.mode, config.timeout_seconds);
|
||||
}
|
||||
info!("Extracting text from Office document: {} (type: {})", file_path, mime_type);
|
||||
|
||||
// Check file size before processing
|
||||
let metadata = tokio::fs::metadata(file_path).await?;
|
||||
|
|
@ -1572,667 +1513,30 @@ impl EnhancedOcrService {
|
|||
));
|
||||
}
|
||||
|
||||
match config.mode {
|
||||
ExtractionMode::LibraryFirst => {
|
||||
self.extract_with_library_first(file_path, mime_type, start_time, &config).await
|
||||
}
|
||||
ExtractionMode::XmlFirst => {
|
||||
self.extract_with_xml_first(file_path, mime_type, start_time, &config).await
|
||||
}
|
||||
ExtractionMode::CompareAlways => {
|
||||
self.extract_with_comparison(file_path, mime_type, start_time, &config).await
|
||||
}
|
||||
ExtractionMode::LibraryOnly => {
|
||||
self.extract_library_only(file_path, mime_type, start_time, &config).await
|
||||
}
|
||||
ExtractionMode::XmlOnly => {
|
||||
self.extract_xml_only(file_path, mime_type, start_time, &config).await
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract using library-first approach (existing behavior)
|
||||
async fn extract_with_library_first(
|
||||
&self,
|
||||
file_path: &str,
|
||||
mime_type: &str,
|
||||
start_time: std::time::Instant,
|
||||
config: &ExtractionConfig,
|
||||
) -> Result<(OcrResult, Option<ComparisonReport>)> {
|
||||
let library_result = self.try_library_extraction(file_path, mime_type, start_time).await;
|
||||
|
||||
match library_result {
|
||||
Ok(result) => {
|
||||
if config.enable_detailed_logging {
|
||||
info!("Library-based extraction succeeded for '{}' (method: {})", file_path, result.preprocessing_applied.join(", "));
|
||||
}
|
||||
Ok((result, None))
|
||||
}
|
||||
Err(library_error) => {
|
||||
if config.enable_detailed_logging {
|
||||
warn!("Library-based extraction failed for '{}': {}. Attempting XML fallback.", file_path, library_error);
|
||||
}
|
||||
|
||||
let xml_extractor = XmlOfficeExtractor::new(self.temp_dir.clone());
|
||||
match xml_extractor.extract_text_from_office(file_path, mime_type).await {
|
||||
Ok(xml_result) => {
|
||||
if config.enable_detailed_logging {
|
||||
info!("XML-based extraction succeeded as fallback for '{}' (method: {})", file_path, xml_result.extraction_method);
|
||||
}
|
||||
Ok((xml_result.into(), None))
|
||||
}
|
||||
Err(xml_error) => {
|
||||
Err(anyhow!(
|
||||
"Both library and XML-based extraction failed for '{}' (type: {}):\nLibrary error: {}\nXML error: {}",
|
||||
file_path, mime_type, library_error, xml_error
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract using XML-first approach
|
||||
async fn extract_with_xml_first(
|
||||
&self,
|
||||
file_path: &str,
|
||||
mime_type: &str,
|
||||
start_time: std::time::Instant,
|
||||
config: &ExtractionConfig,
|
||||
) -> Result<(OcrResult, Option<ComparisonReport>)> {
|
||||
// Use XML extraction as the primary method
|
||||
let xml_extractor = XmlOfficeExtractor::new(self.temp_dir.clone());
|
||||
let xml_result = xml_extractor.extract_text_from_office(file_path, mime_type).await;
|
||||
let xml_result = xml_extractor.extract_text_from_office(file_path, mime_type).await?;
|
||||
|
||||
match xml_result {
|
||||
Ok(result) => {
|
||||
if config.enable_detailed_logging {
|
||||
info!("XML-based extraction succeeded for '{}' (method: {})", file_path, result.extraction_method);
|
||||
}
|
||||
Ok((result.into(), None))
|
||||
}
|
||||
Err(xml_error) => {
|
||||
if config.enable_detailed_logging {
|
||||
warn!("XML-based extraction failed for '{}': {}. Attempting library fallback.", file_path, xml_error);
|
||||
}
|
||||
|
||||
match self.try_library_extraction(file_path, mime_type, start_time).await {
|
||||
Ok(library_result) => {
|
||||
if config.enable_detailed_logging {
|
||||
info!("Library-based extraction succeeded as fallback for '{}' (method: {})", file_path, library_result.preprocessing_applied.join(", "));
|
||||
}
|
||||
Ok((library_result, None))
|
||||
}
|
||||
Err(library_error) => {
|
||||
Err(anyhow!(
|
||||
"Both XML and library-based extraction failed for '{}' (type: {}):\nXML error: {}\nLibrary error: {}",
|
||||
file_path, mime_type, xml_error, library_error
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract using both methods and compare results
|
||||
async fn extract_with_comparison(
|
||||
&self,
|
||||
file_path: &str,
|
||||
mime_type: &str,
|
||||
start_time: std::time::Instant,
|
||||
config: &ExtractionConfig,
|
||||
) -> Result<(OcrResult, Option<ComparisonReport>)> {
|
||||
info!("Running both extraction methods for comparison analysis: {}", file_path);
|
||||
|
||||
// To prevent concurrent file access issues, we'll copy the file to temporary locations
|
||||
// and have each method work on its own copy. This ensures no file system conflicts.
|
||||
let (library_temp_path, xml_temp_path) = self.create_temp_file_copies(file_path).await?;
|
||||
|
||||
// Clean up temp files when done
|
||||
let _library_cleanup = FileCleanupGuard::new(&library_temp_path);
|
||||
let _xml_cleanup = FileCleanupGuard::new(&xml_temp_path);
|
||||
|
||||
// Run both extractions concurrently on separate file copies
|
||||
let library_future = self.try_library_extraction(&library_temp_path, mime_type, start_time);
|
||||
let xml_future = async {
|
||||
let xml_extractor = XmlOfficeExtractor::new(self.temp_dir.clone());
|
||||
xml_extractor.extract_text_from_office(&xml_temp_path, mime_type).await
|
||||
};
|
||||
|
||||
let (library_result, xml_result) = tokio::join!(library_future, xml_future);
|
||||
|
||||
// Convert results to SingleExtractionResult format for comparison
|
||||
let library_single_result = match &library_result {
|
||||
Ok(result) => Some(SingleExtractionResult {
|
||||
text: result.text.clone(),
|
||||
confidence: result.confidence,
|
||||
processing_time: std::time::Duration::from_millis(result.processing_time_ms),
|
||||
word_count: result.word_count,
|
||||
method_name: result.preprocessing_applied.join(", "),
|
||||
success: true,
|
||||
error_message: None,
|
||||
}),
|
||||
Err(e) => Some(SingleExtractionResult {
|
||||
text: String::new(),
|
||||
confidence: 0.0,
|
||||
processing_time: std::time::Duration::from_millis(0),
|
||||
word_count: 0,
|
||||
method_name: "Library extraction".to_string(),
|
||||
success: false,
|
||||
error_message: Some(e.to_string()),
|
||||
}),
|
||||
};
|
||||
|
||||
let xml_single_result = match &xml_result {
|
||||
Ok(result) => Some(SingleExtractionResult {
|
||||
text: result.text.clone(),
|
||||
confidence: result.confidence,
|
||||
processing_time: std::time::Duration::from_millis(result.processing_time_ms),
|
||||
word_count: result.word_count,
|
||||
method_name: result.extraction_method.clone(),
|
||||
success: true,
|
||||
error_message: None,
|
||||
}),
|
||||
Err(e) => Some(SingleExtractionResult {
|
||||
text: String::new(),
|
||||
confidence: 0.0,
|
||||
processing_time: std::time::Duration::from_millis(0),
|
||||
word_count: 0,
|
||||
method_name: "XML extraction".to_string(),
|
||||
success: false,
|
||||
error_message: Some(e.to_string()),
|
||||
}),
|
||||
};
|
||||
|
||||
// Perform comparison
|
||||
let comparator = ExtractionComparator::new(config.clone());
|
||||
let comparison_report = comparator.compare_extractions(library_single_result, xml_single_result)?;
|
||||
|
||||
// Log comparison results (selective logging to prevent spam)
|
||||
if config.enable_detailed_logging {
|
||||
// Only log interesting cases to prevent log spam
|
||||
let should_log_details =
|
||||
// Log if methods disagree significantly
|
||||
comparison_report.similarity_score < 0.8 ||
|
||||
// Log if there's a big performance difference (> 2x)
|
||||
comparison_report.performance_metrics.speed_improvement_factor > 2.0 ||
|
||||
// Log if one method failed but other succeeded
|
||||
(comparison_report.library_result.as_ref().map_or(false, |r| !r.success) &&
|
||||
comparison_report.xml_result.as_ref().map_or(false, |r| r.success)) ||
|
||||
(comparison_report.library_result.as_ref().map_or(false, |r| r.success) &&
|
||||
comparison_report.xml_result.as_ref().map_or(false, |r| !r.success));
|
||||
|
||||
if should_log_details {
|
||||
info!(
|
||||
"Extraction comparison for '{}': similarity={:.2}, recommended_method='{}', performance_improvement={:.1}x",
|
||||
file_path,
|
||||
comparison_report.similarity_score,
|
||||
comparison_report.recommended_method,
|
||||
comparison_report.performance_metrics.speed_improvement_factor
|
||||
);
|
||||
|
||||
if let (Some(lib), Some(xml)) = (&comparison_report.library_result, &comparison_report.xml_result) {
|
||||
debug!(
|
||||
"Method details: Library({}ms, {} words, success={}), XML({}ms, {} words, success={})",
|
||||
lib.processing_time_ms,
|
||||
lib.word_count,
|
||||
lib.success,
|
||||
xml.processing_time_ms,
|
||||
xml.word_count,
|
||||
xml.success
|
||||
);
|
||||
}
|
||||
} else {
|
||||
// For routine comparisons, just use debug level
|
||||
debug!(
|
||||
"Extraction comparison for '{}': methods agree (similarity={:.2}), using '{}'",
|
||||
file_path,
|
||||
comparison_report.similarity_score,
|
||||
comparison_report.recommended_method
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Determine which result to return based on comparison
|
||||
let chosen_result = match (&library_result, &xml_result) {
|
||||
(Ok(lib_result), Ok(xml_result)) => {
|
||||
// Both succeeded, choose based on recommendation
|
||||
if comparison_report.recommended_method.contains("Library") ||
|
||||
comparison_report.recommended_method.contains("Tie") {
|
||||
Ok(lib_result.clone())
|
||||
} else {
|
||||
Ok(xml_result.clone().into())
|
||||
}
|
||||
}
|
||||
(Ok(lib_result), Err(_)) => Ok(lib_result.clone()),
|
||||
(Err(_), Ok(xml_result)) => Ok(xml_result.clone().into()),
|
||||
(Err(lib_error), Err(xml_error)) => Err(anyhow!(
|
||||
"Both extraction methods failed for '{}': Library: {}, XML: {}",
|
||||
file_path, lib_error, xml_error
|
||||
)),
|
||||
};
|
||||
|
||||
match chosen_result {
|
||||
Ok(result) => Ok((result, Some(comparison_report))),
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract using library method only
|
||||
async fn extract_library_only(
|
||||
&self,
|
||||
file_path: &str,
|
||||
mime_type: &str,
|
||||
start_time: std::time::Instant,
|
||||
config: &ExtractionConfig,
|
||||
) -> Result<(OcrResult, Option<ComparisonReport>)> {
|
||||
let result = self.try_library_extraction(file_path, mime_type, start_time).await?;
|
||||
if config.enable_detailed_logging {
|
||||
info!("Library-only extraction completed for '{}' (method: {})", file_path, result.preprocessing_applied.join(", "));
|
||||
}
|
||||
Ok((result, None))
|
||||
}
|
||||
|
||||
/// Extract using XML method only
|
||||
async fn extract_xml_only(
|
||||
&self,
|
||||
file_path: &str,
|
||||
mime_type: &str,
|
||||
start_time: std::time::Instant,
|
||||
config: &ExtractionConfig,
|
||||
) -> Result<(OcrResult, Option<ComparisonReport>)> {
|
||||
let xml_extractor = XmlOfficeExtractor::new(self.temp_dir.clone());
|
||||
let result = xml_extractor.extract_text_from_office(file_path, mime_type).await?;
|
||||
if config.enable_detailed_logging {
|
||||
info!("XML-only extraction completed for '{}' (method: {})", file_path, result.extraction_method);
|
||||
}
|
||||
Ok((result.into(), None))
|
||||
}
|
||||
|
||||
/// Helper method to try library-based extraction
|
||||
async fn try_library_extraction(
|
||||
&self,
|
||||
file_path: &str,
|
||||
mime_type: &str,
|
||||
start_time: std::time::Instant,
|
||||
) -> Result<OcrResult> {
|
||||
match mime_type {
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" => {
|
||||
self.extract_text_from_docx(file_path, start_time).await
|
||||
}
|
||||
"application/msword" => {
|
||||
self.extract_text_from_legacy_doc(file_path, start_time).await
|
||||
}
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" |
|
||||
"application/vnd.ms-excel" => {
|
||||
self.extract_text_from_excel(file_path, mime_type, start_time).await
|
||||
}
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation" => {
|
||||
Err(anyhow!(
|
||||
"PowerPoint files (PPTX) are not yet supported for text extraction. \
|
||||
To extract content from '{}', please:\n\
|
||||
1. Export/Print the presentation as PDF (recommended)\n\
|
||||
2. Use 'File' > 'Export' > 'Create Handouts' in PowerPoint\n\
|
||||
3. Copy text content from slides into a text document",
|
||||
file_path
|
||||
))
|
||||
}
|
||||
_ => {
|
||||
Err(anyhow!(
|
||||
"Office document type '{}' is not supported for text extraction (file: {}). \
|
||||
Please convert the document to PDF format or plain text for processing.",
|
||||
mime_type, file_path
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Create temporary copies of the file for concurrent processing to prevent file access conflicts
|
||||
async fn create_temp_file_copies(&self, file_path: &str) -> Result<(String, String)> {
|
||||
use tokio::fs;
|
||||
use uuid::Uuid;
|
||||
|
||||
// Generate unique temporary file names
|
||||
let file_extension = std::path::Path::new(file_path)
|
||||
.extension()
|
||||
.and_then(|ext| ext.to_str())
|
||||
.unwrap_or("tmp");
|
||||
|
||||
let library_temp_name = format!("library_{}_{}.{}",
|
||||
Uuid::new_v4().simple(),
|
||||
chrono::Utc::now().timestamp_millis(),
|
||||
file_extension
|
||||
);
|
||||
let xml_temp_name = format!("xml_{}_{}.{}",
|
||||
Uuid::new_v4().simple(),
|
||||
chrono::Utc::now().timestamp_millis(),
|
||||
file_extension
|
||||
);
|
||||
|
||||
let library_temp_path = std::path::Path::new(&self.temp_dir).join(library_temp_name);
|
||||
let xml_temp_path = std::path::Path::new(&self.temp_dir).join(xml_temp_name);
|
||||
|
||||
// Copy original file to both temporary locations
|
||||
match fs::copy(file_path, &library_temp_path).await {
|
||||
Ok(bytes_copied) => {
|
||||
debug!("Created library temp copy: {} ({} bytes)", library_temp_path.display(), bytes_copied);
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(anyhow!(
|
||||
"Failed to create temporary copy for library extraction: {}. \
|
||||
Original file: {}, Target: {}",
|
||||
e, file_path, library_temp_path.display()
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
match fs::copy(file_path, &xml_temp_path).await {
|
||||
Ok(bytes_copied) => {
|
||||
debug!("Created XML temp copy: {} ({} bytes)", xml_temp_path.display(), bytes_copied);
|
||||
}
|
||||
Err(e) => {
|
||||
// Clean up the first copy if second copy fails
|
||||
let _ = fs::remove_file(&library_temp_path).await;
|
||||
return Err(anyhow!(
|
||||
"Failed to create temporary copy for XML extraction: {}. \
|
||||
Original file: {}, Target: {}",
|
||||
e, file_path, xml_temp_path.display()
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
Ok((
|
||||
library_temp_path.to_string_lossy().to_string(),
|
||||
xml_temp_path.to_string_lossy().to_string(),
|
||||
))
|
||||
}
|
||||
|
||||
/// Extract text from DOCX files using docx-rs library
|
||||
async fn extract_text_from_docx(&self, file_path: &str, start_time: std::time::Instant) -> Result<OcrResult> {
|
||||
info!("Starting DOCX text extraction: {}", file_path);
|
||||
|
||||
// Move CPU-intensive operations to blocking thread pool
|
||||
let file_path_clone = file_path.to_string();
|
||||
let extraction_result = tokio::task::spawn_blocking(move || -> Result<String> {
|
||||
use docx_rs::*;
|
||||
|
||||
|
||||
// Read the DOCX file
|
||||
let file_data = std::fs::read(&file_path_clone)?;
|
||||
|
||||
// Parse the DOCX document using docx-rs
|
||||
let docx = read_docx(&file_data)
|
||||
.map_err(|e| anyhow!(
|
||||
"Failed to parse DOCX file '{}': {}. The file may be corrupted or not a valid DOCX document.",
|
||||
file_path_clone, e
|
||||
))?;
|
||||
|
||||
// Extract all text content from the document
|
||||
let mut text_content = Vec::new();
|
||||
|
||||
// Extract text from document body
|
||||
let document = docx.document;
|
||||
for child in document.children {
|
||||
Self::extract_text_from_document_child(&child, &mut text_content);
|
||||
}
|
||||
|
||||
// Join all text content with appropriate spacing
|
||||
let raw_text = text_content.join(" ");
|
||||
|
||||
if raw_text.trim().is_empty() {
|
||||
return Err(anyhow!(
|
||||
"No text content found in DOCX file '{}'. The document may be empty or contain only images/objects.",
|
||||
file_path_clone
|
||||
));
|
||||
}
|
||||
|
||||
Ok(raw_text)
|
||||
|
||||
}).await??;
|
||||
|
||||
let processing_time = start_time.elapsed().as_millis() as u64;
|
||||
|
||||
// Only remove null bytes - preserve all original formatting
|
||||
let cleaned_text = Self::remove_null_bytes(&extraction_result);
|
||||
let word_count = self.count_words_safely(&cleaned_text);
|
||||
let total_time = start_time.elapsed().as_millis() as u64;
|
||||
|
||||
info!(
|
||||
"DOCX extraction completed: {} words extracted from '{}' in {}ms",
|
||||
word_count, file_path, processing_time
|
||||
"Office document extraction completed: {} words in {}ms using XML extraction",
|
||||
xml_result.word_count,
|
||||
total_time
|
||||
);
|
||||
|
||||
Ok(OcrResult {
|
||||
text: cleaned_text,
|
||||
confidence: 100.0, // Direct text extraction has perfect confidence
|
||||
processing_time_ms: processing_time,
|
||||
word_count,
|
||||
preprocessing_applied: vec!["DOCX text extraction".to_string()],
|
||||
text: xml_result.text,
|
||||
confidence: xml_result.confidence,
|
||||
processing_time_ms: total_time,
|
||||
word_count: xml_result.word_count,
|
||||
preprocessing_applied: vec![xml_result.extraction_method],
|
||||
processed_image_path: None,
|
||||
})
|
||||
}
|
||||
|
||||
/// Recursively extract text from document children (paragraphs, tables, etc.)
|
||||
fn extract_text_from_document_child(child: &docx_rs::DocumentChild, text_content: &mut Vec<String>) {
|
||||
match child {
|
||||
docx_rs::DocumentChild::Paragraph(paragraph) => {
|
||||
let mut paragraph_text = Vec::new();
|
||||
for child in ¶graph.children {
|
||||
Self::extract_text_from_paragraph_child(child, &mut paragraph_text);
|
||||
}
|
||||
if !paragraph_text.is_empty() {
|
||||
text_content.push(paragraph_text.join(""));
|
||||
}
|
||||
}
|
||||
docx_rs::DocumentChild::Table(table) => {
|
||||
for row in &table.rows {
|
||||
let docx_rs::TableChild::TableRow(table_row) = row;
|
||||
for cell in &table_row.cells {
|
||||
let docx_rs::TableRowChild::TableCell(table_cell) = cell;
|
||||
for child in &table_cell.children {
|
||||
match child {
|
||||
docx_rs::TableCellContent::Paragraph(paragraph) => {
|
||||
let mut paragraph_text = Vec::new();
|
||||
for para_child in ¶graph.children {
|
||||
Self::extract_text_from_paragraph_child(para_child, &mut paragraph_text);
|
||||
}
|
||||
if !paragraph_text.is_empty() {
|
||||
text_content.push(paragraph_text.join(""));
|
||||
}
|
||||
}
|
||||
docx_rs::TableCellContent::Table(nested_table) => {
|
||||
// Handle nested tables using helper function
|
||||
Self::extract_text_from_nested_table(nested_table, text_content);
|
||||
}
|
||||
_ => {} // Skip other table cell content types
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
// Skip other elements like bookmarks that don't contain text content
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract text from nested tables in DOCX documents
|
||||
fn extract_text_from_nested_table(nested_table: &docx_rs::Table, text_content: &mut Vec<String>) {
|
||||
for nested_row in &nested_table.rows {
|
||||
let docx_rs::TableChild::TableRow(nested_table_row) = nested_row;
|
||||
for nested_cell in &nested_table_row.cells {
|
||||
let docx_rs::TableRowChild::TableCell(nested_table_cell) = nested_cell;
|
||||
for nested_child in &nested_table_cell.children {
|
||||
match nested_child {
|
||||
docx_rs::TableCellContent::Paragraph(nested_paragraph) => {
|
||||
let mut nested_paragraph_text = Vec::new();
|
||||
for nested_para_child in &nested_paragraph.children {
|
||||
Self::extract_text_from_paragraph_child(nested_para_child, &mut nested_paragraph_text);
|
||||
}
|
||||
if !nested_paragraph_text.is_empty() {
|
||||
text_content.push(nested_paragraph_text.join(""));
|
||||
}
|
||||
}
|
||||
docx_rs::TableCellContent::Table(deeply_nested_table) => {
|
||||
// Recursively handle deeply nested tables
|
||||
Self::extract_text_from_nested_table(deeply_nested_table, text_content);
|
||||
}
|
||||
_ => {} // Skip other nested content for simplicity
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract text from paragraph children (runs, text elements, etc.)
|
||||
fn extract_text_from_paragraph_child(child: &docx_rs::ParagraphChild, text_content: &mut Vec<String>) {
|
||||
match child {
|
||||
docx_rs::ParagraphChild::Run(run) => {
|
||||
for child in &run.children {
|
||||
match child {
|
||||
docx_rs::RunChild::Text(text) => {
|
||||
text_content.push(text.text.clone());
|
||||
}
|
||||
docx_rs::RunChild::Tab(_) => {
|
||||
text_content.push("\t".to_string());
|
||||
}
|
||||
docx_rs::RunChild::Break(_break_elem) => {
|
||||
// For simplicity, treat all breaks as line breaks
|
||||
text_content.push("\n".to_string());
|
||||
}
|
||||
// Skip other elements like images, drawings, etc.
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
docx_rs::ParagraphChild::Insert(insert) => {
|
||||
for child in &insert.children {
|
||||
match child {
|
||||
docx_rs::InsertChild::Run(run) => {
|
||||
for run_child in &run.children {
|
||||
match run_child {
|
||||
docx_rs::RunChild::Text(text) => {
|
||||
text_content.push(text.text.clone());
|
||||
}
|
||||
docx_rs::RunChild::Tab(_) => {
|
||||
text_content.push("\t".to_string());
|
||||
}
|
||||
docx_rs::RunChild::Break(_) => {
|
||||
text_content.push("\n".to_string());
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
// Skip other elements like deleted content, bookmarks, etc.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract text from Excel files (XLS/XLSX) using calamine library
|
||||
async fn extract_text_from_excel(&self, file_path: &str, mime_type: &str, start_time: std::time::Instant) -> Result<OcrResult> {
|
||||
info!("Starting Excel text extraction: {} (type: {})", file_path, mime_type);
|
||||
|
||||
// Move CPU-intensive operations to blocking thread pool
|
||||
let file_path_clone = file_path.to_string();
|
||||
let extraction_result = tokio::task::spawn_blocking(move || -> Result<String> {
|
||||
use calamine::{open_workbook_auto, Reader, Data};
|
||||
|
||||
|
||||
// Open the workbook using calamine - handles both XLS and XLSX automatically
|
||||
let mut workbook = open_workbook_auto(&file_path_clone)
|
||||
.map_err(|e| anyhow!(
|
||||
"Failed to open Excel file '{}': {}. The file may be corrupted or not a valid Excel document.",
|
||||
file_path_clone, e
|
||||
))?;
|
||||
|
||||
let mut all_text = Vec::new();
|
||||
let worksheet_names = workbook.sheet_names().to_owned();
|
||||
|
||||
if worksheet_names.is_empty() {
|
||||
return Err(anyhow!(
|
||||
"No worksheets found in Excel file '{}'. The file may be corrupted or empty.",
|
||||
file_path_clone
|
||||
));
|
||||
}
|
||||
|
||||
// Extract text from all worksheets
|
||||
for sheet_name in worksheet_names {
|
||||
if let Ok(range) = workbook.worksheet_range(&sheet_name) {
|
||||
// Iterate through all cells in the worksheet
|
||||
for row in range.rows() {
|
||||
for cell in row {
|
||||
// Extract text content from each cell based on its data type
|
||||
let cell_text = match cell {
|
||||
Data::String(s) => s.clone(),
|
||||
Data::Float(f) => {
|
||||
// Format numbers appropriately
|
||||
if f.fract() == 0.0 {
|
||||
format!("{}", *f as i64) // Integer
|
||||
} else {
|
||||
format!("{}", f) // Decimal
|
||||
}
|
||||
}
|
||||
Data::Int(i) => format!("{}", i),
|
||||
Data::Bool(b) => format!("{}", b),
|
||||
Data::DateTime(dt) => format!("{}", dt),
|
||||
Data::DateTimeIso(dt_iso) => dt_iso.clone(),
|
||||
Data::DurationIso(dur_iso) => dur_iso.clone(),
|
||||
Data::Error(e) => format!("ERROR: {:?}", e),
|
||||
Data::Empty => continue, // Skip empty cells
|
||||
};
|
||||
|
||||
// Only add non-empty text
|
||||
let trimmed_text = cell_text.trim();
|
||||
if !trimmed_text.is_empty() {
|
||||
all_text.push(trimmed_text.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if all_text.is_empty() {
|
||||
return Err(anyhow!(
|
||||
"No text content found in Excel file '{}'. All cells may be empty or contain only formulas/formatting.",
|
||||
file_path_clone
|
||||
));
|
||||
}
|
||||
|
||||
// Join all text content with spaces
|
||||
let raw_text = all_text.join(" ");
|
||||
|
||||
Ok(raw_text)
|
||||
|
||||
}).await??;
|
||||
|
||||
let processing_time = start_time.elapsed().as_millis() as u64;
|
||||
|
||||
// Only remove null bytes - preserve all original formatting
|
||||
let cleaned_text = Self::remove_null_bytes(&extraction_result);
|
||||
let word_count = self.count_words_safely(&cleaned_text);
|
||||
|
||||
info!(
|
||||
"Excel extraction completed: {} words extracted from '{}' in {}ms (processed {} worksheets)",
|
||||
word_count, file_path, processing_time,
|
||||
// Count worksheets that were processed (approximation)
|
||||
cleaned_text.matches("worksheet").count().max(1)
|
||||
);
|
||||
|
||||
Ok(OcrResult {
|
||||
text: cleaned_text,
|
||||
confidence: 100.0, // Direct text extraction has perfect confidence
|
||||
processing_time_ms: processing_time,
|
||||
word_count,
|
||||
preprocessing_applied: vec!["Excel text extraction".to_string()],
|
||||
processed_image_path: None,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
/// REMOVED: This method was used for comparison analysis - now handled by extract_text_from_office
|
||||
#[deprecated(note = "Use extract_text_from_office instead - this method was for comparison analysis")]
|
||||
/// Extract text from legacy DOC files using lightweight external tools
|
||||
pub async fn extract_text_from_legacy_doc(&self, file_path: &str, start_time: std::time::Instant) -> Result<OcrResult> {
|
||||
info!("Processing legacy DOC file: {}", file_path);
|
||||
|
|
|
|||
|
|
@ -1,799 +0,0 @@
|
|||
use anyhow::{anyhow, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::time::{Duration, Instant};
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
/// Configuration for text extraction mode.
///
/// Controls which extraction strategy is used for Office documents, the
/// per-extraction timeout, and whether verbose comparison logging is emitted.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractionConfig {
    /// Strategy used to choose between library- and XML-based extraction.
    pub mode: ExtractionMode,
    /// Maximum wall-clock time allowed for one extraction, in seconds.
    pub timeout_seconds: u64,
    /// When true, per-comparison details are logged at info level.
    pub enable_detailed_logging: bool,
}
|
||||
|
||||
/// Extraction modes available for Office documents.
///
/// The `*First` variants fall back to the other method on failure;
/// `CompareAlways` runs both methods for analysis purposes.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
pub enum ExtractionMode {
    /// Try library-based extraction first, fall back to XML if it fails (default behavior).
    LibraryFirst,
    /// Try XML-based extraction first, fall back to library if it fails.
    XmlFirst,
    /// Always run both extractions and compare results (for analysis).
    CompareAlways,
    /// Use only library-based extraction; no fallback.
    LibraryOnly,
    /// Use only XML-based extraction; no fallback.
    XmlOnly,
}
|
||||
|
||||
impl Default for ExtractionConfig {
    /// Defaults: library-first mode, 120-second timeout, detailed logging off.
    fn default() -> Self {
        Self {
            mode: ExtractionMode::LibraryFirst,
            // 120 s mirrors the settings-table default for
            // office_extraction_timeout_seconds.
            timeout_seconds: 120,
            enable_detailed_logging: false,
        }
    }
}
|
||||
|
||||
/// Result from a single extraction method (one run of one strategy).
#[derive(Debug, Clone)]
pub struct SingleExtractionResult {
    /// Extracted plain text (may be empty on failure).
    pub text: String,
    /// Confidence score reported by the method (0.0–100.0 scale in practice).
    pub confidence: f32,
    /// Wall-clock time the extraction took.
    pub processing_time: Duration,
    /// Whitespace-separated word count of `text`.
    pub word_count: usize,
    /// Human-readable name of the method that produced this result.
    pub method_name: String,
    /// Whether the extraction completed successfully.
    pub success: bool,
    /// Error description when `success` is false.
    pub error_message: Option<String>,
}
|
||||
|
||||
/// Detailed comparison metrics between two text extraction methods.
///
/// Produced by [`ExtractionComparator::compare_extractions`]; similarity and
/// content fields are only populated when both methods succeeded.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComparisonReport {
    /// Overall similarity score between texts (0.0 to 1.0)
    pub similarity_score: f32,
    /// Levenshtein distance between texts
    pub levenshtein_distance: usize,
    /// Text length difference (absolute)
    pub length_difference: usize,
    /// Word count difference (absolute)
    pub word_count_difference: usize,
    /// Performance comparison
    pub performance_metrics: PerformanceComparison,
    /// Text content analysis
    pub content_analysis: ContentAnalysis,
    /// Method-specific results (None when that method was not run)
    pub library_result: Option<MethodResult>,
    pub xml_result: Option<MethodResult>,
    /// Recommended method based on analysis
    pub recommended_method: String,
    /// Analysis timestamp
    pub timestamp: std::time::SystemTime,
}
|
||||
|
||||
/// Performance comparison between the two extraction methods.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceComparison {
    /// Processing time difference in milliseconds (library minus XML;
    /// negative means the library method was faster).
    pub time_difference_ms: i64,
    /// Name of the faster method ("N/A" before comparison runs).
    pub faster_method: String,
    /// Speed improvement factor (how many times faster the winner was).
    pub speed_improvement_factor: f32,
    /// Memory usage comparison, if available (currently never populated).
    pub memory_usage_difference: Option<i64>,
}
|
||||
|
||||
/// Content analysis of the two extracted texts (set-based char/word overlap
/// plus heuristic formatting-difference flags).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContentAnalysis {
    /// Distinct characters appearing only in the library extraction
    pub library_unique_chars: usize,
    /// Distinct characters appearing only in the XML extraction
    pub xml_unique_chars: usize,
    /// Distinct characters common to both extractions
    pub common_chars: usize,
    /// Distinct words appearing only in the library extraction
    pub library_unique_words: usize,
    /// Distinct words appearing only in the XML extraction
    pub xml_unique_words: usize,
    /// Distinct words common to both extractions
    pub common_words: usize,
    /// Human-readable descriptions of detected formatting differences
    pub formatting_differences: Vec<String>,
}
|
||||
|
||||
/// Result summary for a specific extraction method, as embedded in a
/// [`ComparisonReport`] (serializable snapshot of a `SingleExtractionResult`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MethodResult {
    /// Name of the extraction method.
    pub method_name: String,
    /// Whether the extraction completed successfully.
    pub success: bool,
    /// Wall-clock processing time in milliseconds.
    pub processing_time_ms: u64,
    /// Byte length of the extracted text.
    pub text_length: usize,
    /// Whitespace-separated word count.
    pub word_count: usize,
    /// Confidence score reported by the method.
    pub confidence: f32,
    /// Error description when `success` is false.
    pub error_message: Option<String>,
}
|
||||
|
||||
/// Main comparison engine for text extraction methods.
///
/// Stateless apart from its [`ExtractionConfig`]; all analysis methods take
/// extraction results as input and return derived metrics.
pub struct ExtractionComparator {
    // Controls timeout and logging verbosity for comparison runs.
    config: ExtractionConfig,
}
|
||||
|
||||
impl ExtractionComparator {
|
||||
    /// Create a new extraction comparator with the given configuration.
    pub fn new(config: ExtractionConfig) -> Self {
        Self { config }
    }
|
||||
|
||||
    /// Create with default configuration.
    ///
    /// NOTE(review): this inherent `default` shadows the `Default` trait for
    /// callers using `ExtractionComparator::default()`; consider implementing
    /// `impl Default for ExtractionComparator` instead (clippy:
    /// `should_implement_trait`).
    pub fn default() -> Self {
        Self::new(ExtractionConfig::default())
    }
|
||||
|
||||
    /// Compare two extraction results and generate a comprehensive analysis.
    ///
    /// Similarity, distance, performance, and content metrics are only
    /// computed when both results are present and successful; otherwise the
    /// report carries default metrics and a recommendation naming whichever
    /// method succeeded (or a "no success" message).
    ///
    /// # Errors
    /// Returns an error if both `library_result` and `xml_result` are `None`.
    pub fn compare_extractions(
        &self,
        library_result: Option<SingleExtractionResult>,
        xml_result: Option<SingleExtractionResult>,
    ) -> Result<ComparisonReport> {
        let start_time = Instant::now();

        debug!("Starting extraction comparison analysis");

        // Validate inputs: comparing nothing to nothing is a caller error.
        if library_result.is_none() && xml_result.is_none() {
            return Err(anyhow!("At least one extraction result must be provided for comparison"));
        }

        // Start from a neutral report; fields are filled in below as the
        // relevant inputs prove to be available/successful.
        let mut report = ComparisonReport {
            similarity_score: 0.0,
            levenshtein_distance: 0,
            length_difference: 0,
            word_count_difference: 0,
            performance_metrics: PerformanceComparison {
                time_difference_ms: 0,
                faster_method: "N/A".to_string(),
                speed_improvement_factor: 1.0,
                memory_usage_difference: None,
            },
            content_analysis: ContentAnalysis {
                library_unique_chars: 0,
                xml_unique_chars: 0,
                common_chars: 0,
                library_unique_words: 0,
                xml_unique_words: 0,
                common_words: 0,
                formatting_differences: Vec::new(),
            },
            library_result: None,
            xml_result: None,
            recommended_method: "Unknown".to_string(),
            timestamp: std::time::SystemTime::now(),
        };

        // Snapshot each provided extraction into a serializable MethodResult.
        if let Some(ref lib_result) = library_result {
            report.library_result = Some(MethodResult {
                method_name: lib_result.method_name.clone(),
                success: lib_result.success,
                processing_time_ms: lib_result.processing_time.as_millis() as u64,
                text_length: lib_result.text.len(),
                word_count: lib_result.word_count,
                confidence: lib_result.confidence,
                error_message: lib_result.error_message.clone(),
            });
        }

        if let Some(ref xml_result) = xml_result {
            report.xml_result = Some(MethodResult {
                method_name: xml_result.method_name.clone(),
                success: xml_result.success,
                processing_time_ms: xml_result.processing_time.as_millis() as u64,
                text_length: xml_result.text.len(),
                word_count: xml_result.word_count,
                confidence: xml_result.confidence,
                error_message: xml_result.error_message.clone(),
            });
        }

        // Perform the full comparison only if both extractions succeeded.
        if let (Some(lib_result), Some(xml_result)) = (&library_result, &xml_result) {
            if lib_result.success && xml_result.success {
                // Calculate text similarity and exact/estimated edit distance.
                report.similarity_score = self.calculate_similarity(&lib_result.text, &xml_result.text)?;
                report.levenshtein_distance = self.levenshtein_distance(&lib_result.text, &xml_result.text);

                // Absolute differences in size and word count.
                report.length_difference = (lib_result.text.len() as i64 - xml_result.text.len() as i64).abs() as usize;
                report.word_count_difference = (lib_result.word_count as i64 - xml_result.word_count as i64).abs() as usize;

                // Performance comparison (positive time_difference_ms means
                // the XML method was faster).
                let lib_time_ms = lib_result.processing_time.as_millis() as i64;
                let xml_time_ms = xml_result.processing_time.as_millis() as i64;

                report.performance_metrics.time_difference_ms = lib_time_ms - xml_time_ms;

                if lib_time_ms < xml_time_ms {
                    report.performance_metrics.faster_method = lib_result.method_name.clone();
                    // .max(1) guards against division by zero for sub-ms runs.
                    report.performance_metrics.speed_improvement_factor = xml_time_ms as f32 / lib_time_ms.max(1) as f32;
                } else {
                    report.performance_metrics.faster_method = xml_result.method_name.clone();
                    report.performance_metrics.speed_improvement_factor = lib_time_ms as f32 / xml_time_ms.max(1) as f32;
                }

                // Content analysis (char/word overlap, formatting heuristics).
                report.content_analysis = self.analyze_content(&lib_result.text, &xml_result.text)?;

                // Determine recommended method from the collected metrics.
                report.recommended_method = self.determine_recommended_method(&report, lib_result, xml_result);

                if self.config.enable_detailed_logging {
                    info!(
                        "Extraction comparison completed: similarity={:.2}, levenshtein={}, faster_method={}, speed_improvement={:.2}x",
                        report.similarity_score,
                        report.levenshtein_distance,
                        report.performance_metrics.faster_method,
                        report.performance_metrics.speed_improvement_factor
                    );
                }
            } else {
                // One or both extractions failed: recommend the survivor.
                if lib_result.success {
                    report.recommended_method = lib_result.method_name.clone();
                } else if xml_result.success {
                    report.recommended_method = xml_result.method_name.clone();
                } else {
                    report.recommended_method = "Neither method succeeded".to_string();
                }
            }
        } else if let Some(lib_result) = &library_result {
            // Only the library result was provided.
            report.recommended_method = if lib_result.success {
                lib_result.method_name.clone()
            } else {
                "No successful extraction".to_string()
            };
        } else if let Some(xml_result) = &xml_result {
            // Only the XML result was provided.
            report.recommended_method = if xml_result.success {
                xml_result.method_name.clone()
            } else {
                "No successful extraction".to_string()
            };
        }

        let analysis_time = start_time.elapsed();
        debug!("Extraction comparison analysis completed in {:?}", analysis_time);

        Ok(report)
    }
|
||||
|
||||
/// Calculate similarity between two texts using normalized Levenshtein distance
|
||||
pub fn calculate_similarity(&self, text1: &str, text2: &str) -> Result<f32> {
|
||||
if text1.is_empty() && text2.is_empty() {
|
||||
return Ok(1.0);
|
||||
}
|
||||
|
||||
if text1.is_empty() || text2.is_empty() {
|
||||
return Ok(0.0);
|
||||
}
|
||||
|
||||
// For very large texts (>10K chars), use a more efficient similarity metric
|
||||
// The Levenshtein sampling approach gives very inaccurate results
|
||||
if text1.len() > 10_000 || text2.len() > 10_000 {
|
||||
info!("Using efficient similarity calculation for large texts ({} and {} chars)",
|
||||
text1.len(), text2.len());
|
||||
|
||||
// Use multiple metrics for better accuracy
|
||||
|
||||
// 1. Character count similarity
|
||||
let char_similarity = 1.0 - ((text1.len() as f32 - text2.len() as f32).abs()
|
||||
/ text1.len().max(text2.len()) as f32);
|
||||
|
||||
// 2. Word count similarity
|
||||
let words1 = text1.split_whitespace().count();
|
||||
let words2 = text2.split_whitespace().count();
|
||||
let word_similarity = 1.0 - ((words1 as f32 - words2 as f32).abs()
|
||||
/ words1.max(words2) as f32);
|
||||
|
||||
// 3. Sample-based content similarity (compare first and last 5K chars)
|
||||
let sample_size = 5000;
|
||||
let sample1_start = &text1[..text1.len().min(sample_size)];
|
||||
let sample2_start = &text2[..text2.len().min(sample_size)];
|
||||
let start_distance = self.levenshtein_distance(sample1_start, sample2_start);
|
||||
let start_similarity = 1.0 - (start_distance as f32 / sample1_start.len().max(sample2_start.len()) as f32);
|
||||
|
||||
let sample1_end = if text1.len() > sample_size {
|
||||
&text1[text1.len() - sample_size..]
|
||||
} else {
|
||||
text1
|
||||
};
|
||||
let sample2_end = if text2.len() > sample_size {
|
||||
&text2[text2.len() - sample_size..]
|
||||
} else {
|
||||
text2
|
||||
};
|
||||
let end_distance = self.levenshtein_distance(sample1_end, sample2_end);
|
||||
let end_similarity = 1.0 - (end_distance as f32 / sample1_end.len().max(sample2_end.len()) as f32);
|
||||
|
||||
// Weighted average favoring content similarity
|
||||
let similarity = (char_similarity * 0.15 +
|
||||
word_similarity * 0.15 +
|
||||
start_similarity * 0.35 +
|
||||
end_similarity * 0.35).min(1.0).max(0.0);
|
||||
|
||||
info!("Large text similarity components: char={:.2}, word={:.2}, start={:.2}, end={:.2} -> overall={:.2}",
|
||||
char_similarity, word_similarity, start_similarity, end_similarity, similarity);
|
||||
|
||||
return Ok(similarity);
|
||||
}
|
||||
|
||||
// For smaller texts, use full Levenshtein distance
|
||||
let distance = self.levenshtein_distance(text1, text2);
|
||||
let max_len = text1.len().max(text2.len());
|
||||
|
||||
if max_len == 0 {
|
||||
Ok(1.0)
|
||||
} else {
|
||||
Ok(1.0 - (distance as f32 / max_len as f32))
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate Levenshtein distance between two strings with memory safety limits
|
||||
pub fn levenshtein_distance(&self, text1: &str, text2: &str) -> usize {
|
||||
// Memory safety limits to prevent OOM attacks
|
||||
const MAX_TEXT_LENGTH: usize = 10_000; // Max 10K characters per text
|
||||
const MAX_MATRIX_SIZE: usize = 100_000_000; // Max 100M matrix elements
|
||||
|
||||
let len1 = text1.chars().count();
|
||||
let len2 = text2.chars().count();
|
||||
|
||||
// Early returns for empty strings
|
||||
if len1 == 0 {
|
||||
return len2.min(MAX_TEXT_LENGTH);
|
||||
}
|
||||
if len2 == 0 {
|
||||
return len1.min(MAX_TEXT_LENGTH);
|
||||
}
|
||||
|
||||
// Check for potential memory exhaustion
|
||||
if len1 > MAX_TEXT_LENGTH || len2 > MAX_TEXT_LENGTH {
|
||||
warn!(
|
||||
"Text lengths exceed safe limit for Levenshtein calculation: {} and {} chars (max: {}). \
|
||||
Using sampling approach to estimate distance.",
|
||||
len1, len2, MAX_TEXT_LENGTH
|
||||
);
|
||||
|
||||
// Use sampling for very large texts to estimate distance
|
||||
return self.estimate_levenshtein_distance_for_large_texts(text1, text2, MAX_TEXT_LENGTH);
|
||||
}
|
||||
|
||||
// Check if matrix would be too large (prevent OOM)
|
||||
let matrix_size = (len1 + 1) * (len2 + 1);
|
||||
if matrix_size > MAX_MATRIX_SIZE {
|
||||
warn!(
|
||||
"Matrix size too large for safe Levenshtein calculation: {} elements (max: {}). \
|
||||
Using sampling approach to estimate distance.",
|
||||
matrix_size, MAX_MATRIX_SIZE
|
||||
);
|
||||
|
||||
return self.estimate_levenshtein_distance_for_large_texts(text1, text2, MAX_TEXT_LENGTH);
|
||||
}
|
||||
|
||||
// Safe to proceed with full calculation
|
||||
let chars1: Vec<char> = text1.chars().collect();
|
||||
let chars2: Vec<char> = text2.chars().collect();
|
||||
|
||||
// Use space-optimized approach for large but manageable texts
|
||||
if len1 > 1000 || len2 > 1000 {
|
||||
return self.levenshtein_distance_space_optimized(&chars1, &chars2);
|
||||
}
|
||||
|
||||
// Standard algorithm for smaller texts
|
||||
let mut matrix = vec![vec![0; len2 + 1]; len1 + 1];
|
||||
|
||||
// Initialize first row and column
|
||||
for i in 0..=len1 {
|
||||
matrix[i][0] = i;
|
||||
}
|
||||
for j in 0..=len2 {
|
||||
matrix[0][j] = j;
|
||||
}
|
||||
|
||||
// Fill the matrix
|
||||
for i in 1..=len1 {
|
||||
for j in 1..=len2 {
|
||||
let cost = if chars1[i - 1] == chars2[j - 1] { 0 } else { 1 };
|
||||
|
||||
matrix[i][j] = (matrix[i - 1][j] + 1) // deletion
|
||||
.min(matrix[i][j - 1] + 1) // insertion
|
||||
.min(matrix[i - 1][j - 1] + cost); // substitution
|
||||
}
|
||||
}
|
||||
|
||||
matrix[len1][len2]
|
||||
}
|
||||
|
||||
/// Space-optimized Levenshtein distance calculation using only two rows
|
||||
fn levenshtein_distance_space_optimized(&self, chars1: &[char], chars2: &[char]) -> usize {
|
||||
let len1 = chars1.len();
|
||||
let len2 = chars2.len();
|
||||
|
||||
if len1 == 0 {
|
||||
return len2;
|
||||
}
|
||||
if len2 == 0 {
|
||||
return len1;
|
||||
}
|
||||
|
||||
// Use only two rows instead of full matrix to save memory
|
||||
let mut prev_row = vec![0; len2 + 1];
|
||||
let mut curr_row = vec![0; len2 + 1];
|
||||
|
||||
// Initialize first row
|
||||
for j in 0..=len2 {
|
||||
prev_row[j] = j;
|
||||
}
|
||||
|
||||
for i in 1..=len1 {
|
||||
curr_row[0] = i;
|
||||
|
||||
for j in 1..=len2 {
|
||||
let cost = if chars1[i - 1] == chars2[j - 1] { 0 } else { 1 };
|
||||
|
||||
curr_row[j] = (prev_row[j] + 1) // deletion
|
||||
.min(curr_row[j - 1] + 1) // insertion
|
||||
.min(prev_row[j - 1] + cost); // substitution
|
||||
}
|
||||
|
||||
// Swap rows
|
||||
std::mem::swap(&mut prev_row, &mut curr_row);
|
||||
}
|
||||
|
||||
prev_row[len2]
|
||||
}
|
||||
|
||||
/// Estimate Levenshtein distance for very large texts using sampling
|
||||
fn estimate_levenshtein_distance_for_large_texts(&self, text1: &str, text2: &str, sample_size: usize) -> usize {
|
||||
// Sample from beginning, middle, and end of both texts
|
||||
let sample1 = self.create_representative_sample(text1, sample_size);
|
||||
let sample2 = self.create_representative_sample(text2, sample_size);
|
||||
|
||||
// Calculate distance on samples
|
||||
let sample_distance = self.levenshtein_distance_space_optimized(
|
||||
&sample1.chars().collect::<Vec<_>>(),
|
||||
&sample2.chars().collect::<Vec<_>>()
|
||||
);
|
||||
|
||||
// Extrapolate to full text size (rough approximation)
|
||||
let text1_len = text1.chars().count();
|
||||
let text2_len = text2.chars().count();
|
||||
let max_len = text1_len.max(text2_len);
|
||||
let sample_len = sample1.chars().count().max(sample2.chars().count());
|
||||
|
||||
if sample_len == 0 {
|
||||
return max_len;
|
||||
}
|
||||
|
||||
// Scale up the sample distance proportionally
|
||||
let scaling_factor = max_len as f64 / sample_len as f64;
|
||||
let estimated_distance = (sample_distance as f64 * scaling_factor) as usize;
|
||||
|
||||
// Cap at maximum possible distance
|
||||
estimated_distance.min(max_len)
|
||||
}
|
||||
|
||||
/// Create a representative sample from a large text
|
||||
fn create_representative_sample(&self, text: &str, max_sample_size: usize) -> String {
|
||||
let char_count = text.chars().count();
|
||||
|
||||
if char_count <= max_sample_size {
|
||||
return text.to_string();
|
||||
}
|
||||
|
||||
// Take samples from beginning, middle, and end
|
||||
let chunk_size = max_sample_size / 3;
|
||||
let chars: Vec<char> = text.chars().collect();
|
||||
|
||||
let mut sample = String::new();
|
||||
|
||||
// Beginning
|
||||
let begin_end = chunk_size.min(chars.len());
|
||||
sample.extend(chars[0..begin_end].iter());
|
||||
|
||||
// Middle
|
||||
if chars.len() > chunk_size * 2 {
|
||||
let mid_start = (chars.len() - chunk_size) / 2;
|
||||
let mid_end = (mid_start + chunk_size).min(chars.len());
|
||||
sample.extend(chars[mid_start..mid_end].iter());
|
||||
}
|
||||
|
||||
// End
|
||||
if chars.len() > chunk_size {
|
||||
let end_start = chars.len().saturating_sub(chunk_size);
|
||||
sample.extend(chars[end_start..].iter());
|
||||
}
|
||||
|
||||
sample
|
||||
}
|
||||
|
||||
/// Analyze content differences between two texts
|
||||
fn analyze_content(&self, library_text: &str, xml_text: &str) -> Result<ContentAnalysis> {
|
||||
// Character-level analysis
|
||||
let lib_chars: std::collections::HashSet<char> = library_text.chars().collect();
|
||||
let xml_chars: std::collections::HashSet<char> = xml_text.chars().collect();
|
||||
|
||||
let common_chars = lib_chars.intersection(&xml_chars).count();
|
||||
let library_unique_chars = lib_chars.difference(&xml_chars).count();
|
||||
let xml_unique_chars = xml_chars.difference(&lib_chars).count();
|
||||
|
||||
// Word-level analysis
|
||||
let lib_words: std::collections::HashSet<&str> = library_text.split_whitespace().collect();
|
||||
let xml_words: std::collections::HashSet<&str> = xml_text.split_whitespace().collect();
|
||||
|
||||
let common_words = lib_words.intersection(&xml_words).count();
|
||||
let library_unique_words = lib_words.difference(&xml_words).count();
|
||||
let xml_unique_words = xml_words.difference(&lib_words).count();
|
||||
|
||||
// Detect potential formatting differences
|
||||
let mut formatting_differences = Vec::new();
|
||||
|
||||
// Check for whitespace differences
|
||||
let lib_whitespace_count = library_text.chars().filter(|c| c.is_whitespace()).count();
|
||||
let xml_whitespace_count = xml_text.chars().filter(|c| c.is_whitespace()).count();
|
||||
|
||||
if (lib_whitespace_count as i64 - xml_whitespace_count as i64).abs() > 10 {
|
||||
formatting_differences.push("Significant whitespace differences detected".to_string());
|
||||
}
|
||||
|
||||
// Check for punctuation differences
|
||||
let lib_punct_count = library_text.chars().filter(|c| c.is_ascii_punctuation()).count();
|
||||
let xml_punct_count = xml_text.chars().filter(|c| c.is_ascii_punctuation()).count();
|
||||
|
||||
if (lib_punct_count as i64 - xml_punct_count as i64).abs() > 5 {
|
||||
formatting_differences.push("Punctuation differences detected".to_string());
|
||||
}
|
||||
|
||||
// Check for potential encoding issues
|
||||
if library_text.contains('<27>') || xml_text.contains('<27>') {
|
||||
formatting_differences.push("Potential character encoding issues detected".to_string());
|
||||
}
|
||||
|
||||
Ok(ContentAnalysis {
|
||||
library_unique_chars,
|
||||
xml_unique_chars,
|
||||
common_chars,
|
||||
library_unique_words,
|
||||
xml_unique_words,
|
||||
common_words,
|
||||
formatting_differences,
|
||||
})
|
||||
}
|
||||
|
||||
/// Determine the recommended extraction method based on comparison results
|
||||
fn determine_recommended_method(
|
||||
&self,
|
||||
report: &ComparisonReport,
|
||||
library_result: &SingleExtractionResult,
|
||||
xml_result: &SingleExtractionResult,
|
||||
) -> String {
|
||||
// If one method failed, recommend the successful one
|
||||
if !library_result.success && xml_result.success {
|
||||
return xml_result.method_name.clone();
|
||||
}
|
||||
if library_result.success && !xml_result.success {
|
||||
return library_result.method_name.clone();
|
||||
}
|
||||
if !library_result.success && !xml_result.success {
|
||||
return "Neither method succeeded".to_string();
|
||||
}
|
||||
|
||||
// Both methods succeeded, analyze quality
|
||||
let mut library_score = 0.0;
|
||||
let mut xml_score = 0.0;
|
||||
|
||||
// Factor 1: Text length (longer is generally better for document extraction)
|
||||
if library_result.text.len() > xml_result.text.len() {
|
||||
library_score += 1.0;
|
||||
} else if xml_result.text.len() > library_result.text.len() {
|
||||
xml_score += 1.0;
|
||||
}
|
||||
|
||||
// Factor 2: Word count (more words usually means better extraction)
|
||||
if library_result.word_count > xml_result.word_count {
|
||||
library_score += 1.0;
|
||||
} else if xml_result.word_count > library_result.word_count {
|
||||
xml_score += 1.0;
|
||||
}
|
||||
|
||||
// Factor 3: Processing speed (faster is better, but weight it less)
|
||||
if library_result.processing_time < xml_result.processing_time {
|
||||
library_score += 0.5;
|
||||
} else if xml_result.processing_time < library_result.processing_time {
|
||||
xml_score += 0.5;
|
||||
}
|
||||
|
||||
// Factor 4: Confidence score
|
||||
if library_result.confidence > xml_result.confidence {
|
||||
library_score += 0.5;
|
||||
} else if xml_result.confidence > library_result.confidence {
|
||||
xml_score += 0.5;
|
||||
}
|
||||
|
||||
// Factor 5: Content richness (unique content might indicate better extraction)
|
||||
if report.content_analysis.library_unique_chars > report.content_analysis.xml_unique_chars {
|
||||
library_score += 0.3;
|
||||
} else if report.content_analysis.xml_unique_chars > report.content_analysis.library_unique_chars {
|
||||
xml_score += 0.3;
|
||||
}
|
||||
|
||||
// Determine winner
|
||||
if library_score > xml_score {
|
||||
library_result.method_name.clone()
|
||||
} else if xml_score > library_score {
|
||||
xml_result.method_name.clone()
|
||||
} else {
|
||||
// Tie - default to library method as it's typically more mature
|
||||
format!("Tie (defaulting to {})", library_result.method_name)
|
||||
}
|
||||
}
|
||||
|
||||
/// Get a summary of differences between two texts
|
||||
pub fn get_text_differences(&self, text1: &str, text2: &str, max_diff_lines: usize) -> Vec<String> {
|
||||
let lines1: Vec<&str> = text1.lines().collect();
|
||||
let lines2: Vec<&str> = text2.lines().collect();
|
||||
|
||||
let mut differences = Vec::new();
|
||||
let max_lines = lines1.len().max(lines2.len());
|
||||
|
||||
for i in 0..max_lines.min(max_diff_lines) {
|
||||
let line1 = lines1.get(i).unwrap_or(&"");
|
||||
let line2 = lines2.get(i).unwrap_or(&"");
|
||||
|
||||
if line1 != line2 {
|
||||
if line1.is_empty() {
|
||||
differences.push(format!("Line {}: Added in method 2: '{}'", i + 1, line2));
|
||||
} else if line2.is_empty() {
|
||||
differences.push(format!("Line {}: Removed in method 2: '{}'", i + 1, line1));
|
||||
} else {
|
||||
differences.push(format!("Line {}: '{}' -> '{}'", i + 1, line1, line2));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if max_lines > max_diff_lines {
|
||||
differences.push(format!("... ({} more lines not shown)", max_lines - max_diff_lines));
|
||||
}
|
||||
|
||||
differences
|
||||
}
|
||||
}
|
||||
|
||||
impl From<SingleExtractionResult> for super::enhanced::OcrResult {
    /// Convert a `SingleExtractionResult` into the OCR pipeline's `OcrResult`
    /// for compatibility with existing consumers.
    ///
    /// The extraction method name is recorded in `preprocessing_applied`;
    /// text extraction produces no image, so `processed_image_path` is `None`.
    fn from(result: SingleExtractionResult) -> Self {
        super::enhanced::OcrResult {
            text: result.text,
            confidence: result.confidence,
            processing_time_ms: result.processing_time.as_millis() as u64,
            word_count: result.word_count,
            preprocessing_applied: vec![result.method_name],
            processed_image_path: None,
        }
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::time::Duration;

    /// Build a `SingleExtractionResult` fixture; success controls the
    /// confidence (95.0 vs 0.0) and whether an error message is attached.
    fn create_test_result(text: &str, method: &str, time_ms: u64, success: bool) -> SingleExtractionResult {
        SingleExtractionResult {
            text: text.to_string(),
            confidence: if success { 95.0 } else { 0.0 },
            processing_time: Duration::from_millis(time_ms),
            word_count: text.split_whitespace().count(),
            method_name: method.to_string(),
            success,
            error_message: if success { None } else { Some("Test error".to_string()) },
        }
    }

    // Edit distance on identical, near-identical, empty, and disjoint inputs.
    #[test]
    fn test_levenshtein_distance() {
        let comparator = ExtractionComparator::default();

        // Identical strings
        assert_eq!(comparator.levenshtein_distance("hello", "hello"), 0);

        // One character difference
        assert_eq!(comparator.levenshtein_distance("hello", "hallo"), 1);

        // Empty strings
        assert_eq!(comparator.levenshtein_distance("", ""), 0);
        assert_eq!(comparator.levenshtein_distance("hello", ""), 5);
        assert_eq!(comparator.levenshtein_distance("", "world"), 5);

        // Completely different
        assert_eq!(comparator.levenshtein_distance("abc", "xyz"), 3);
    }

    // Normalized similarity: identical -> ~1.0, disjoint -> low, empty edge cases.
    #[test]
    fn test_calculate_similarity() {
        let comparator = ExtractionComparator::default();

        // Identical strings should have similarity 1.0
        let sim = comparator.calculate_similarity("hello world", "hello world").unwrap();
        assert!((sim - 1.0).abs() < 0.01);

        // Completely different strings should have low similarity
        let sim = comparator.calculate_similarity("abc", "xyz").unwrap();
        assert!(sim < 0.5);

        // Empty strings
        let sim = comparator.calculate_similarity("", "").unwrap();
        assert!((sim - 1.0).abs() < 0.01);

        let sim = comparator.calculate_similarity("hello", "").unwrap();
        assert!((sim - 0.0).abs() < 0.01);
    }

    // Two successful identical extractions: perfect similarity, zero
    // distance, and the faster (Library, 100ms vs 150ms) method reported.
    #[test]
    fn test_compare_extractions_both_successful() {
        let comparator = ExtractionComparator::default();

        let lib_result = create_test_result("Hello world test document", "Library", 100, true);
        let xml_result = create_test_result("Hello world test document", "XML", 150, true);

        let report = comparator.compare_extractions(Some(lib_result), Some(xml_result)).unwrap();

        assert!((report.similarity_score - 1.0).abs() < 0.01); // Identical text
        assert_eq!(report.levenshtein_distance, 0);
        assert_eq!(report.performance_metrics.faster_method, "Library");
        assert!(report.performance_metrics.speed_improvement_factor > 1.0);
    }

    // One failed extraction: the surviving method is recommended and both
    // per-method snapshots carry the correct success flags.
    #[test]
    fn test_compare_extractions_one_failed() {
        let comparator = ExtractionComparator::default();

        let lib_result = create_test_result("Hello world", "Library", 100, true);
        let xml_result = create_test_result("", "XML", 0, false);

        let report = comparator.compare_extractions(Some(lib_result), Some(xml_result)).unwrap();

        assert_eq!(report.recommended_method, "Library");
        assert!(report.library_result.is_some());
        assert!(report.xml_result.is_some());
        assert!(report.library_result.as_ref().unwrap().success);
        assert!(!report.xml_result.as_ref().unwrap().success);
    }

    // Line-diff summary reports the modified line.
    #[test]
    fn test_get_text_differences() {
        let comparator = ExtractionComparator::default();

        let text1 = "Line 1\nLine 2\nLine 3";
        let text2 = "Line 1\nModified Line 2\nLine 3\nNew Line 4";

        let differences = comparator.get_text_differences(text1, text2, 10);

        assert!(differences.len() >= 1);
        assert!(differences.iter().any(|d| d.contains("Modified Line 2")));
    }

    // Content analysis finds shared and unique chars/words between variants.
    #[test]
    fn test_content_analysis() {
        let comparator = ExtractionComparator::default();

        let lib_text = "Hello world! This is a test.";
        let xml_text = "Hello world? This was a test!";

        let analysis = comparator.analyze_content(lib_text, xml_text).unwrap();

        assert!(analysis.common_chars > 0);
        assert!(analysis.common_words > 0);
        assert!(analysis.library_unique_chars > 0 || analysis.xml_unique_chars > 0);
    }
}
|
||||
|
|
@ -1,13 +1,11 @@
|
|||
use anyhow::{anyhow, Result};
|
||||
use anyhow::Result;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::{Arc, RwLock, Mutex};
|
||||
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
|
||||
use tokio::time::{sleep, timeout};
|
||||
use tracing::{debug, error, info, warn};
|
||||
use rand::Rng;
|
||||
|
||||
use super::extraction_comparator::{ExtractionConfig, ExtractionMode, SingleExtractionResult};
|
||||
use super::xml_extractor::{OfficeExtractionResult, XmlOfficeExtractor};
|
||||
|
||||
/// Configuration for fallback strategy behavior
|
||||
|
|
@ -453,8 +451,7 @@ impl FallbackStrategy {
|
|||
&self,
|
||||
file_path: &str,
|
||||
mime_type: &str,
|
||||
extraction_config: &ExtractionConfig,
|
||||
) -> Result<SingleExtractionResult> {
|
||||
) -> Result<OfficeExtractionResult> {
|
||||
let start_time = Instant::now();
|
||||
let document_type = self.get_document_type(mime_type);
|
||||
|
||||
|
|
@ -470,27 +467,12 @@ impl FallbackStrategy {
|
|||
}
|
||||
}
|
||||
|
||||
let result = match extraction_config.mode {
|
||||
ExtractionMode::LibraryFirst => {
|
||||
self.execute_library_first_strategy(file_path, mime_type, &document_type, extraction_config).await
|
||||
}
|
||||
ExtractionMode::XmlFirst => {
|
||||
self.execute_xml_first_strategy(file_path, mime_type, &document_type, extraction_config).await
|
||||
}
|
||||
ExtractionMode::CompareAlways => {
|
||||
self.execute_compare_always_strategy(file_path, mime_type, &document_type, extraction_config).await
|
||||
}
|
||||
ExtractionMode::LibraryOnly => {
|
||||
self.execute_library_only_strategy(file_path, mime_type, &document_type).await
|
||||
}
|
||||
ExtractionMode::XmlOnly => {
|
||||
self.execute_xml_only_strategy(file_path, mime_type, &document_type).await
|
||||
}
|
||||
};
|
||||
// Use XML extraction as the primary method
|
||||
let result = self.execute_xml_extraction(file_path, mime_type).await;
|
||||
|
||||
let processing_time = start_time.elapsed();
|
||||
|
||||
// Update statistics
|
||||
// Update statistics
|
||||
self.update_stats(&result, processing_time).await;
|
||||
|
||||
// Clean up expired cache entries periodically (1% chance per extraction)
|
||||
|
|
@ -505,257 +487,15 @@ impl FallbackStrategy {
|
|||
result
|
||||
}
|
||||
|
||||
/// Execute library-first strategy with XML fallback
|
||||
async fn execute_library_first_strategy(
|
||||
/// Execute XML extraction directly
|
||||
async fn execute_xml_extraction(
|
||||
&self,
|
||||
file_path: &str,
|
||||
mime_type: &str,
|
||||
document_type: &str,
|
||||
extraction_config: &ExtractionConfig,
|
||||
) -> Result<SingleExtractionResult> {
|
||||
// Check if we have a learned preference
|
||||
if let Some(preferred_method) = self.learning_cache.get_preferred_method(document_type) {
|
||||
debug!("Using learned preference: {} for document type: {}", preferred_method, document_type);
|
||||
|
||||
if preferred_method.contains("XML") {
|
||||
// Try XML first based on learning
|
||||
match self.try_xml_extraction(file_path, mime_type).await {
|
||||
Ok(result) => {
|
||||
self.learning_cache.record_success(document_type, &result.method_name, result.processing_time.as_millis() as u64, result.confidence);
|
||||
return Ok(result);
|
||||
}
|
||||
Err(e) => {
|
||||
debug!("Learned preference failed, falling back to library: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Try library extraction first
|
||||
match self.try_library_extraction(file_path, mime_type).await {
|
||||
Ok(result) => {
|
||||
match self.stats.write() {
|
||||
Ok(mut stats) => {
|
||||
stats.library_successes += 1;
|
||||
}
|
||||
Err(_) => {
|
||||
warn!("Failed to acquire write lock on stats for library success update");
|
||||
}
|
||||
}
|
||||
self.learning_cache.record_success(document_type, &result.method_name, result.processing_time.as_millis() as u64, result.confidence);
|
||||
Ok(result)
|
||||
}
|
||||
Err(library_error) => {
|
||||
warn!("Library extraction failed, attempting XML fallback: {}", library_error);
|
||||
|
||||
match self.stats.write() {
|
||||
Ok(mut stats) => {
|
||||
stats.fallback_used += 1;
|
||||
}
|
||||
Err(_) => {
|
||||
warn!("Failed to acquire write lock on stats for fallback count update");
|
||||
}
|
||||
}
|
||||
|
||||
match self.try_xml_extraction(file_path, mime_type).await {
|
||||
Ok(result) => {
|
||||
match self.stats.write() {
|
||||
Ok(mut stats) => {
|
||||
stats.xml_successes += 1;
|
||||
}
|
||||
Err(_) => {
|
||||
warn!("Failed to acquire write lock on stats for xml success update");
|
||||
}
|
||||
}
|
||||
self.learning_cache.record_success(document_type, &result.method_name, result.processing_time.as_millis() as u64, result.confidence);
|
||||
Ok(result)
|
||||
}
|
||||
Err(xml_error) => {
|
||||
error!("Both library and XML extraction failed. Library error: {}. XML error: {}", library_error, xml_error);
|
||||
Err(anyhow!(
|
||||
"All extraction methods failed. Library extraction: {}. XML extraction: {}",
|
||||
library_error, xml_error
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Execute XML-first strategy with library fallback
|
||||
async fn execute_xml_first_strategy(
|
||||
&self,
|
||||
file_path: &str,
|
||||
mime_type: &str,
|
||||
document_type: &str,
|
||||
extraction_config: &ExtractionConfig,
|
||||
) -> Result<SingleExtractionResult> {
|
||||
// Check if we have a learned preference
|
||||
if let Some(preferred_method) = self.learning_cache.get_preferred_method(document_type) {
|
||||
debug!("Using learned preference: {} for document type: {}", preferred_method, document_type);
|
||||
|
||||
if preferred_method.contains("Library") {
|
||||
// Try library first based on learning
|
||||
match self.try_library_extraction(file_path, mime_type).await {
|
||||
Ok(result) => {
|
||||
self.learning_cache.record_success(document_type, &result.method_name, result.processing_time.as_millis() as u64, result.confidence);
|
||||
return Ok(result);
|
||||
}
|
||||
Err(e) => {
|
||||
debug!("Learned preference failed, falling back to XML: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Try XML extraction first
|
||||
match self.try_xml_extraction(file_path, mime_type).await {
|
||||
Ok(result) => {
|
||||
match self.stats.write() {
|
||||
Ok(mut stats) => {
|
||||
stats.xml_successes += 1;
|
||||
}
|
||||
Err(_) => {
|
||||
warn!("Failed to acquire write lock on stats for xml success update");
|
||||
}
|
||||
}
|
||||
self.learning_cache.record_success(document_type, &result.method_name, result.processing_time.as_millis() as u64, result.confidence);
|
||||
Ok(result)
|
||||
}
|
||||
Err(xml_error) => {
|
||||
warn!("XML extraction failed, attempting library fallback: {}", xml_error);
|
||||
|
||||
match self.stats.write() {
|
||||
Ok(mut stats) => {
|
||||
stats.fallback_used += 1;
|
||||
}
|
||||
Err(_) => {
|
||||
warn!("Failed to acquire write lock on stats for fallback count update");
|
||||
}
|
||||
}
|
||||
|
||||
match self.try_library_extraction(file_path, mime_type).await {
|
||||
Ok(result) => {
|
||||
match self.stats.write() {
|
||||
Ok(mut stats) => {
|
||||
stats.library_successes += 1;
|
||||
}
|
||||
Err(_) => {
|
||||
warn!("Failed to acquire write lock on stats for library success update");
|
||||
}
|
||||
}
|
||||
self.learning_cache.record_success(document_type, &result.method_name, result.processing_time.as_millis() as u64, result.confidence);
|
||||
Ok(result)
|
||||
}
|
||||
Err(library_error) => {
|
||||
error!("Both XML and library extraction failed. XML error: {}. Library error: {}", xml_error, library_error);
|
||||
Err(anyhow!(
|
||||
"All extraction methods failed. XML extraction: {}. Library extraction: {}",
|
||||
xml_error, library_error
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Execute compare-always strategy (runs both methods)
|
||||
async fn execute_compare_always_strategy(
|
||||
&self,
|
||||
file_path: &str,
|
||||
mime_type: &str,
|
||||
document_type: &str,
|
||||
extraction_config: &ExtractionConfig,
|
||||
) -> Result<SingleExtractionResult> {
|
||||
let library_result = self.try_library_extraction(file_path, mime_type).await;
|
||||
let xml_result = self.try_xml_extraction(file_path, mime_type).await;
|
||||
|
||||
match (library_result, xml_result) {
|
||||
(Ok(lib_result), Ok(xml_result)) => {
|
||||
// Both succeeded, choose the better one
|
||||
match self.stats.write() {
|
||||
Ok(mut stats) => {
|
||||
stats.library_successes += 1;
|
||||
stats.xml_successes += 1;
|
||||
}
|
||||
Err(_) => {
|
||||
warn!("Failed to acquire write lock on stats for dual success update");
|
||||
}
|
||||
}
|
||||
|
||||
let chosen_result = if lib_result.word_count >= xml_result.word_count && lib_result.processing_time <= xml_result.processing_time {
|
||||
lib_result
|
||||
} else {
|
||||
xml_result
|
||||
};
|
||||
|
||||
self.learning_cache.record_success(document_type, &chosen_result.method_name, chosen_result.processing_time.as_millis() as u64, chosen_result.confidence);
|
||||
|
||||
info!("Compare-always mode: both methods succeeded, chosen: {}", chosen_result.method_name);
|
||||
Ok(chosen_result)
|
||||
}
|
||||
(Ok(lib_result), Err(_)) => {
|
||||
match self.stats.write() {
|
||||
Ok(mut stats) => {
|
||||
stats.library_successes += 1;
|
||||
}
|
||||
Err(_) => {
|
||||
warn!("Failed to acquire write lock on stats for library success update");
|
||||
}
|
||||
}
|
||||
self.learning_cache.record_success(document_type, &lib_result.method_name, lib_result.processing_time.as_millis() as u64, lib_result.confidence);
|
||||
Ok(lib_result)
|
||||
}
|
||||
(Err(_), Ok(xml_result)) => {
|
||||
match self.stats.write() {
|
||||
Ok(mut stats) => {
|
||||
stats.xml_successes += 1;
|
||||
}
|
||||
Err(_) => {
|
||||
warn!("Failed to acquire write lock on stats for xml success update");
|
||||
}
|
||||
}
|
||||
self.learning_cache.record_success(document_type, &xml_result.method_name, xml_result.processing_time.as_millis() as u64, xml_result.confidence);
|
||||
Ok(xml_result)
|
||||
}
|
||||
(Err(lib_error), Err(xml_error)) => {
|
||||
error!("Both extraction methods failed in compare-always mode. Library: {}. XML: {}", lib_error, xml_error);
|
||||
Err(anyhow!(
|
||||
"All extraction methods failed. Library: {}. XML: {}",
|
||||
lib_error, xml_error
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Execute library-only strategy
|
||||
async fn execute_library_only_strategy(
|
||||
&self,
|
||||
file_path: &str,
|
||||
mime_type: &str,
|
||||
document_type: &str,
|
||||
) -> Result<SingleExtractionResult> {
|
||||
let result = self.try_library_extraction(file_path, mime_type).await?;
|
||||
match self.stats.write() {
|
||||
Ok(mut stats) => {
|
||||
stats.library_successes += 1;
|
||||
}
|
||||
Err(_) => {
|
||||
warn!("Failed to acquire write lock on stats for library success update");
|
||||
}
|
||||
}
|
||||
self.learning_cache.record_success(document_type, &result.method_name, result.processing_time.as_millis() as u64, result.confidence);
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Execute XML-only strategy
|
||||
async fn execute_xml_only_strategy(
|
||||
&self,
|
||||
file_path: &str,
|
||||
mime_type: &str,
|
||||
document_type: &str,
|
||||
) -> Result<SingleExtractionResult> {
|
||||
let result = self.try_xml_extraction(file_path, mime_type).await?;
|
||||
) -> Result<OfficeExtractionResult> {
|
||||
let result = self.xml_extractor.extract_text_from_office(file_path, mime_type).await?;
|
||||
|
||||
// Update stats
|
||||
match self.stats.write() {
|
||||
Ok(mut stats) => {
|
||||
stats.xml_successes += 1;
|
||||
|
|
@ -764,295 +504,11 @@ impl FallbackStrategy {
|
|||
warn!("Failed to acquire write lock on stats for xml success update");
|
||||
}
|
||||
}
|
||||
self.learning_cache.record_success(document_type, &result.method_name, result.processing_time.as_millis() as u64, result.confidence);
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Try library-based extraction with circuit breaker and retry logic
|
||||
async fn try_library_extraction(
|
||||
&self,
|
||||
file_path: &str,
|
||||
mime_type: &str,
|
||||
) -> Result<SingleExtractionResult> {
|
||||
let method_name = "Library";
|
||||
|
||||
// Check circuit breaker
|
||||
if !self.should_allow_request(method_name).await {
|
||||
return Err(anyhow!("Circuit breaker is open for library extraction"));
|
||||
}
|
||||
|
||||
let result = self.execute_with_retry(
|
||||
|| self.execute_library_extraction(file_path, mime_type),
|
||||
method_name
|
||||
).await;
|
||||
|
||||
// Update circuit breaker
|
||||
match &result {
|
||||
Ok(_) => self.record_success(method_name).await,
|
||||
Err(_) => self.record_failure(method_name).await,
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
/// Try XML-based extraction with circuit breaker and retry logic
|
||||
async fn try_xml_extraction(
|
||||
&self,
|
||||
file_path: &str,
|
||||
mime_type: &str,
|
||||
) -> Result<SingleExtractionResult> {
|
||||
let method_name = "XML";
|
||||
|
||||
// Check circuit breaker
|
||||
if !self.should_allow_request(method_name).await {
|
||||
return Err(anyhow!("Circuit breaker is open for XML extraction"));
|
||||
}
|
||||
|
||||
let result = self.execute_with_retry(
|
||||
|| self.execute_xml_extraction(file_path, mime_type),
|
||||
method_name
|
||||
).await;
|
||||
|
||||
// Update circuit breaker
|
||||
match &result {
|
||||
Ok(_) => self.record_success(method_name).await,
|
||||
Err(_) => self.record_failure(method_name).await,
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
/// Execute library extraction (placeholder - would integrate with actual library)
|
||||
async fn execute_library_extraction(
|
||||
&self,
|
||||
file_path: &str,
|
||||
mime_type: &str,
|
||||
) -> Result<SingleExtractionResult> {
|
||||
let start_time = Instant::now();
|
||||
|
||||
// Timeout wrapper
|
||||
let timeout_duration = Duration::from_secs(self.config.method_timeouts.library_timeout_seconds);
|
||||
|
||||
timeout(timeout_duration, async {
|
||||
// This is a placeholder - in production this would call the actual library extraction
|
||||
// For now, simulate library extraction behavior
|
||||
tokio::time::sleep(Duration::from_millis(50)).await; // Simulate processing time
|
||||
|
||||
// Simulate failure for certain conditions (for testing purposes)
|
||||
if file_path.contains("corrupt") || file_path.contains("unsupported") {
|
||||
return Err(anyhow!("Library extraction failed: unsupported document format"));
|
||||
}
|
||||
|
||||
Ok(SingleExtractionResult {
|
||||
text: format!("Library-extracted text from {}", file_path),
|
||||
confidence: 85.0,
|
||||
processing_time: start_time.elapsed(),
|
||||
word_count: 150, // Simulated word count
|
||||
method_name: "Library-based extraction".to_string(),
|
||||
success: true,
|
||||
error_message: None,
|
||||
})
|
||||
}).await.map_err(|_| anyhow!("Library extraction timed out after {} seconds", self.config.method_timeouts.library_timeout_seconds))?
|
||||
}
|
||||
|
||||
/// Execute XML extraction
|
||||
async fn execute_xml_extraction(
|
||||
&self,
|
||||
file_path: &str,
|
||||
mime_type: &str,
|
||||
) -> Result<SingleExtractionResult> {
|
||||
let start_time = Instant::now();
|
||||
|
||||
// Timeout wrapper
|
||||
let timeout_duration = Duration::from_secs(self.config.method_timeouts.xml_timeout_seconds);
|
||||
|
||||
timeout(timeout_duration, async {
|
||||
let result = self.xml_extractor.extract_text_from_office_with_timeout(
|
||||
file_path,
|
||||
mime_type,
|
||||
self.config.method_timeouts.xml_timeout_seconds
|
||||
).await?;
|
||||
|
||||
Ok(SingleExtractionResult {
|
||||
text: result.text,
|
||||
confidence: result.confidence,
|
||||
processing_time: start_time.elapsed(),
|
||||
word_count: result.word_count,
|
||||
method_name: format!("XML-based extraction ({})", result.extraction_method),
|
||||
success: true,
|
||||
error_message: None,
|
||||
})
|
||||
}).await.map_err(|_| anyhow!("XML extraction timed out after {} seconds", self.config.method_timeouts.xml_timeout_seconds))?
|
||||
}
|
||||
|
||||
/// Execute operation with retry logic and exponential backoff
|
||||
async fn execute_with_retry<F, Fut>(
|
||||
&self,
|
||||
operation: F,
|
||||
method_name: &str,
|
||||
) -> Result<SingleExtractionResult>
|
||||
where
|
||||
F: Fn() -> Fut,
|
||||
Fut: std::future::Future<Output = Result<SingleExtractionResult>>,
|
||||
{
|
||||
let mut delay_ms = self.config.initial_retry_delay_ms;
|
||||
let mut last_error = None;
|
||||
|
||||
for attempt in 0..=self.config.max_retries {
|
||||
match operation().await {
|
||||
Ok(result) => return Ok(result),
|
||||
Err(e) => {
|
||||
last_error = Some(e);
|
||||
|
||||
if attempt < self.config.max_retries && self.is_retryable_error(&last_error.as_ref().unwrap()) {
|
||||
warn!("Attempt {} failed for {}, retrying in {}ms: {}",
|
||||
attempt + 1, method_name, delay_ms, last_error.as_ref().unwrap());
|
||||
|
||||
match self.stats.write() {
|
||||
Ok(mut stats) => {
|
||||
stats.retry_attempts += 1;
|
||||
}
|
||||
Err(_) => {
|
||||
warn!("Failed to acquire write lock on stats for retry attempt update");
|
||||
}
|
||||
}
|
||||
|
||||
sleep(Duration::from_millis(delay_ms)).await;
|
||||
|
||||
// Exponential backoff with jitter
|
||||
delay_ms = (delay_ms * 2).min(self.config.max_retry_delay_ms);
|
||||
let jitter_range = delay_ms / 4;
|
||||
if jitter_range > 0 {
|
||||
delay_ms += rand::thread_rng().gen_range(0..jitter_range); // Add 0-25% jitter
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err(last_error.unwrap())
|
||||
}
|
||||
|
||||
/// Check if an error is retryable with improved classification
|
||||
/// This method categorizes errors into retryable and non-retryable based on their nature
|
||||
fn is_retryable_error(&self, error: &anyhow::Error) -> bool {
|
||||
let error_msg = error.to_string().to_lowercase();
|
||||
let error_chain = format!("{:?}", error).to_lowercase();
|
||||
|
||||
// Definitely retryable errors (transient issues)
|
||||
let retryable_patterns = [
|
||||
// Network and I/O issues
|
||||
"timeout", "timed out", "connection", "network",
|
||||
"temporarily unavailable", "resource busy", "busy",
|
||||
"would block", "try again", "eagain", "ewouldblock",
|
||||
// File system temporary issues
|
||||
"no space left", "disk full", "quota exceeded",
|
||||
"file locked", "sharing violation",
|
||||
// Service temporary issues
|
||||
"service unavailable", "server unavailable", "503",
|
||||
"rate limit", "throttling", "429", "too many requests",
|
||||
// Memory pressure (might be temporary)
|
||||
"out of memory", "memory limit", "allocation failed",
|
||||
];
|
||||
|
||||
// Definitely non-retryable errors (permanent issues)
|
||||
let non_retryable_patterns = [
|
||||
// File format/content issues
|
||||
"corrupted", "invalid format", "unsupported format",
|
||||
"malformed", "parse error", "invalid structure",
|
||||
"not found", "404", "file not found", "no such file",
|
||||
// Permission issues
|
||||
"access denied", "permission denied", "unauthorized", "403",
|
||||
"forbidden", "authentication failed",
|
||||
// Logical errors in code
|
||||
"assertion failed", "panic", "index out of bounds",
|
||||
"null pointer", "segmentation fault",
|
||||
];
|
||||
|
||||
// Check for non-retryable patterns first (they take precedence)
|
||||
for pattern in &non_retryable_patterns {
|
||||
if error_msg.contains(pattern) || error_chain.contains(pattern) {
|
||||
debug!("Error classified as non-retryable due to pattern '{}': {}", pattern, error_msg);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Check for retryable patterns
|
||||
for pattern in &retryable_patterns {
|
||||
if error_msg.contains(pattern) || error_chain.contains(pattern) {
|
||||
debug!("Error classified as retryable due to pattern '{}': {}", pattern, error_msg);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Check error source chain for more context
|
||||
let mut source = error.source();
|
||||
while let Some(err) = source {
|
||||
let source_msg = err.to_string().to_lowercase();
|
||||
|
||||
// Check source errors against patterns
|
||||
for pattern in &non_retryable_patterns {
|
||||
if source_msg.contains(pattern) {
|
||||
debug!("Error classified as non-retryable due to source pattern '{}': {}", pattern, source_msg);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
for pattern in &retryable_patterns {
|
||||
if source_msg.contains(pattern) {
|
||||
debug!("Error classified as retryable due to source pattern '{}': {}", pattern, source_msg);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
source = err.source();
|
||||
}
|
||||
|
||||
// Default: unknown errors are not retryable to avoid infinite loops
|
||||
debug!("Error classified as non-retryable (default): {}", error_msg);
|
||||
false
|
||||
}
|
||||
|
||||
/// Check if circuit breaker should allow request
|
||||
async fn should_allow_request(&self, method_name: &str) -> bool {
|
||||
if !self.config.circuit_breaker.enabled {
|
||||
return true;
|
||||
}
|
||||
|
||||
match self.circuit_breakers.write() {
|
||||
Ok(mut breakers) => {
|
||||
let breaker = breakers.entry(method_name.to_string())
|
||||
.or_insert_with(|| CircuitBreaker::new(self.config.circuit_breaker.clone()));
|
||||
breaker.should_allow_request()
|
||||
}
|
||||
Err(_) => {
|
||||
warn!("Failed to acquire write lock on circuit breakers, allowing request");
|
||||
true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Record successful operation for circuit breaker
|
||||
async fn record_success(&self, method_name: &str) {
|
||||
if !self.config.circuit_breaker.enabled {
|
||||
return;
|
||||
}
|
||||
|
||||
match self.circuit_breakers.write() {
|
||||
Ok(mut breakers) => {
|
||||
let breaker = breakers.entry(method_name.to_string())
|
||||
.or_insert_with(|| CircuitBreaker::new(self.config.circuit_breaker.clone()));
|
||||
breaker.record_success();
|
||||
}
|
||||
Err(_) => {
|
||||
warn!("Failed to acquire write lock on circuit breakers for success recording");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Record failed operation for circuit breaker
|
||||
/// Record a failure for circuit breaker tracking
|
||||
async fn record_failure(&self, method_name: &str) {
|
||||
if !self.config.circuit_breaker.enabled {
|
||||
return;
|
||||
|
|
@ -1101,7 +557,7 @@ impl FallbackStrategy {
|
|||
}
|
||||
|
||||
/// Update statistics after extraction
|
||||
async fn update_stats(&self, result: &Result<SingleExtractionResult>, processing_time: Duration) {
|
||||
async fn update_stats(&self, result: &Result<OfficeExtractionResult>, processing_time: Duration) {
|
||||
match self.stats.write() {
|
||||
Ok(mut stats) => {
|
||||
let processing_time_ms = processing_time.as_millis() as f64;
|
||||
|
|
|
|||
|
|
@ -2,7 +2,6 @@ pub mod api;
|
|||
pub mod enhanced;
|
||||
pub mod enhanced_processing;
|
||||
pub mod error;
|
||||
pub mod extraction_comparator;
|
||||
pub mod fallback_strategy;
|
||||
pub mod health;
|
||||
pub mod queue;
|
||||
|
|
@ -14,7 +13,6 @@ use std::path::Path;
|
|||
use crate::ocr::error::OcrError;
|
||||
use crate::ocr::health::OcrHealthChecker;
|
||||
use crate::ocr::fallback_strategy::{FallbackStrategy, FallbackConfig};
|
||||
use crate::ocr::extraction_comparator::{ExtractionConfig, ExtractionMode, SingleExtractionResult};
|
||||
|
||||
#[cfg(feature = "ocr")]
|
||||
use tesseract::Tesseract;
|
||||
|
|
@ -27,8 +25,6 @@ pub struct OcrService {
|
|||
/// Configuration for the OCR service
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct OcrConfig {
|
||||
/// Extraction configuration
|
||||
pub extraction_config: ExtractionConfig,
|
||||
/// Fallback configuration
|
||||
pub fallback_config: FallbackConfig,
|
||||
/// Temporary directory for processing
|
||||
|
|
@ -38,7 +34,6 @@ pub struct OcrConfig {
|
|||
impl Default for OcrConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
extraction_config: ExtractionConfig::default(),
|
||||
fallback_config: FallbackConfig::default(),
|
||||
temp_dir: std::env::var("TEMP_DIR").unwrap_or_else(|_| "/tmp".to_string()),
|
||||
}
|
||||
|
|
@ -205,11 +200,11 @@ impl OcrService {
|
|||
&self,
|
||||
file_path: &str,
|
||||
mime_type: &str,
|
||||
) -> Result<SingleExtractionResult> {
|
||||
) -> Result<String> {
|
||||
match &self.fallback_strategy {
|
||||
Some(strategy) => {
|
||||
let extraction_config = ExtractionConfig::default();
|
||||
strategy.extract_with_fallback(file_path, mime_type, &extraction_config).await
|
||||
let result = strategy.extract_with_fallback(file_path, mime_type).await?;
|
||||
Ok(result.text)
|
||||
}
|
||||
None => {
|
||||
// Fallback to basic XML extraction if no strategy is configured
|
||||
|
|
@ -218,15 +213,7 @@ impl OcrService {
|
|||
);
|
||||
|
||||
let result = xml_extractor.extract_text_from_office(file_path, mime_type).await?;
|
||||
Ok(SingleExtractionResult {
|
||||
text: result.text,
|
||||
confidence: result.confidence,
|
||||
processing_time: std::time::Duration::from_millis(result.processing_time_ms),
|
||||
word_count: result.word_count,
|
||||
method_name: result.extraction_method,
|
||||
success: true,
|
||||
error_message: None,
|
||||
})
|
||||
Ok(result.text)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -236,11 +223,11 @@ impl OcrService {
|
|||
&self,
|
||||
file_path: &str,
|
||||
mime_type: &str,
|
||||
extraction_config: &ExtractionConfig,
|
||||
) -> Result<SingleExtractionResult> {
|
||||
) -> Result<String> {
|
||||
match &self.fallback_strategy {
|
||||
Some(strategy) => {
|
||||
strategy.extract_with_fallback(file_path, mime_type, extraction_config).await
|
||||
let result = strategy.extract_with_fallback(file_path, mime_type).await?;
|
||||
Ok(result.text)
|
||||
}
|
||||
None => {
|
||||
return Err(anyhow!("Fallback strategy not configured for advanced Office document extraction"));
|
||||
|
|
@ -262,10 +249,7 @@ impl OcrService {
|
|||
"application/msword" |
|
||||
"application/vnd.ms-excel" |
|
||||
"application/vnd.ms-powerpoint" => {
|
||||
match self.extract_text_from_office_document(file_path, mime_type).await {
|
||||
Ok(result) => Ok(result.text),
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
self.extract_text_from_office_document(file_path, mime_type).await
|
||||
}
|
||||
"image/png" | "image/jpeg" | "image/jpg" | "image/tiff" | "image/bmp" => {
|
||||
self.extract_text_from_image_with_lang(file_path, lang).await
|
||||
|
|
|
|||
|
|
@ -8,7 +8,6 @@ use tokio::time::timeout;
|
|||
use readur::ocr::{
|
||||
OcrService, OcrConfig,
|
||||
fallback_strategy::{FallbackConfig, CircuitBreakerConfig, LearningConfig, MethodTimeouts},
|
||||
extraction_comparator::{ExtractionConfig, ExtractionMode},
|
||||
};
|
||||
|
||||
/// Test utilities for creating mock Office documents
|
||||
|
|
@ -150,11 +149,6 @@ impl OfficeTestDocuments {
|
|||
/// Create a test OCR service with fallback strategy
|
||||
fn create_test_ocr_service(temp_dir: &str) -> OcrService {
|
||||
let config = OcrConfig {
|
||||
extraction_config: ExtractionConfig {
|
||||
mode: ExtractionMode::LibraryFirst,
|
||||
timeout_seconds: 30,
|
||||
enable_detailed_logging: true,
|
||||
},
|
||||
fallback_config: FallbackConfig {
|
||||
enabled: true,
|
||||
max_retries: 2,
|
||||
|
|
@ -243,45 +237,23 @@ async fn test_extraction_modes() -> Result<()> {
|
|||
let test_content = "Test document for mode comparison";
|
||||
let docx_path = test_docs.create_mock_docx("test_modes.docx", test_content)?;
|
||||
|
||||
// Test different extraction modes
|
||||
let modes = vec![
|
||||
ExtractionMode::LibraryFirst,
|
||||
ExtractionMode::XmlFirst,
|
||||
ExtractionMode::XmlOnly,
|
||||
ExtractionMode::CompareAlways,
|
||||
];
|
||||
// Test XML extraction with the simplified approach
|
||||
let ocr_config = OcrConfig {
|
||||
fallback_config: FallbackConfig::default(),
|
||||
temp_dir: temp_dir.clone(),
|
||||
};
|
||||
|
||||
for mode in modes {
|
||||
let config = ExtractionConfig {
|
||||
mode,
|
||||
timeout_seconds: 30,
|
||||
enable_detailed_logging: true,
|
||||
};
|
||||
|
||||
let ocr_config = OcrConfig {
|
||||
extraction_config: config,
|
||||
fallback_config: FallbackConfig::default(),
|
||||
temp_dir: temp_dir.clone(),
|
||||
};
|
||||
|
||||
let ocr_service = OcrService::new_with_config(ocr_config);
|
||||
|
||||
let result = ocr_service.extract_text_from_office_document_with_config(
|
||||
&docx_path,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
&ExtractionConfig {
|
||||
mode,
|
||||
timeout_seconds: 30,
|
||||
enable_detailed_logging: true,
|
||||
}
|
||||
).await;
|
||||
|
||||
// All modes should succeed with our test document
|
||||
assert!(result.is_ok(), "Mode {:?} failed: {:?}", mode, result);
|
||||
let result = result?;
|
||||
assert!(result.success);
|
||||
assert!(!result.text.is_empty());
|
||||
}
|
||||
let ocr_service = OcrService::new_with_config(ocr_config);
|
||||
|
||||
let result = ocr_service.extract_text_from_office_document_with_config(
|
||||
&docx_path,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
).await;
|
||||
|
||||
// XML extraction should succeed with our test document
|
||||
assert!(result.is_ok(), "XML extraction failed: {:?}", result);
|
||||
let extracted_text = result?;
|
||||
assert!(!extracted_text.is_empty());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
@ -293,11 +265,6 @@ async fn test_fallback_mechanism() -> Result<()> {
|
|||
|
||||
// Create a service with library-first mode
|
||||
let config = OcrConfig {
|
||||
extraction_config: ExtractionConfig {
|
||||
mode: ExtractionMode::LibraryFirst,
|
||||
timeout_seconds: 30,
|
||||
enable_detailed_logging: true,
|
||||
},
|
||||
fallback_config: FallbackConfig {
|
||||
enabled: true,
|
||||
max_retries: 1,
|
||||
|
|
@ -347,19 +314,12 @@ async fn test_timeout_handling() -> Result<()> {
|
|||
|
||||
let docx_path = test_docs.create_mock_docx("timeout_test.docx", "Test content")?;
|
||||
|
||||
// Test with very short timeout
|
||||
let config = ExtractionConfig {
|
||||
mode: ExtractionMode::XmlOnly,
|
||||
timeout_seconds: 1, // Very short timeout
|
||||
enable_detailed_logging: true,
|
||||
};
|
||||
|
||||
// Test timeout behavior (the timeout logic is now in the XML extractor itself)
|
||||
let result = timeout(
|
||||
Duration::from_millis(2000), // Give overall test 2 seconds
|
||||
ocr_service.extract_text_from_office_document_with_config(
|
||||
&docx_path,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
&config
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
)
|
||||
).await;
|
||||
|
||||
|
|
@ -454,11 +414,6 @@ async fn test_circuit_breaker() -> Result<()> {
|
|||
|
||||
// Create service with aggressive circuit breaker settings
|
||||
let config = OcrConfig {
|
||||
extraction_config: ExtractionConfig {
|
||||
mode: ExtractionMode::LibraryFirst,
|
||||
timeout_seconds: 30,
|
||||
enable_detailed_logging: true,
|
||||
},
|
||||
fallback_config: FallbackConfig {
|
||||
enabled: true,
|
||||
max_retries: 0, // No retries to make failures immediate
|
||||
|
|
@ -581,11 +536,6 @@ async fn test_learning_mechanism() -> Result<()> {
|
|||
|
||||
// Create service with learning enabled
|
||||
let config = OcrConfig {
|
||||
extraction_config: ExtractionConfig {
|
||||
mode: ExtractionMode::CompareAlways, // This will help with learning
|
||||
timeout_seconds: 30,
|
||||
enable_detailed_logging: true,
|
||||
},
|
||||
fallback_config: FallbackConfig {
|
||||
enabled: true,
|
||||
max_retries: 1,
|
||||
|
|
|
|||
Loading…
Reference in New Issue