/// MIME type detection module for improved file type identification /// /// This module provides functions for detecting file MIME types using multiple methods: /// 1. Content-based detection using magic bytes (most reliable) /// 2. Server-provided MIME type (when available and trusted) /// 3. Extension-based fallback (least reliable, but covers edge cases) /// /// The goal is to provide accurate MIME type detection that's particularly important /// for OCR processing where incorrectly classified image files can cause issues. use std::path::Path; use tracing::{debug, warn}; /// Strategy for MIME type detection #[derive(Debug, Clone, PartialEq)] pub enum DetectionStrategy { /// Use content-based detection (magic bytes) - most reliable ContentBased, /// Trust server-provided MIME type if available, fallback to content TrustServer, /// Use extension-based detection - least reliable but fastest ExtensionOnly, /// Comprehensive strategy: server -> content -> extension -> fallback Comprehensive, } /// Result of MIME type detection with metadata about the detection method used #[derive(Debug, Clone)] pub struct MimeDetectionResult { pub mime_type: String, pub confidence: MimeConfidence, pub detection_method: DetectionMethod, pub original_server_type: Option, pub detected_extension: Option, } /// Confidence level of the MIME type detection #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] pub enum MimeConfidence { /// Low confidence - extension-based or fallback detection Low, /// Medium confidence - mime_guess library detection Medium, /// High confidence - magic byte detection or trusted server High, /// Very high confidence - content analysis confirms server type VeryHigh, } /// Method used for MIME type detection #[derive(Debug, Clone, PartialEq)] pub enum DetectionMethod { /// Detected using magic bytes/file signature MagicBytes, /// Provided by the server and trusted ServerProvided, /// Detected using file extension Extension, /// Fallback to default type 
Fallback, /// Hybrid approach using multiple methods Hybrid, } impl MimeDetectionResult { /// Create a result for server-provided MIME type pub fn from_server(mime_type: String) -> Self { Self { mime_type, confidence: MimeConfidence::High, detection_method: DetectionMethod::ServerProvided, original_server_type: None, detected_extension: None, } } /// Create a result for content-based detection pub fn from_content(mime_type: String, server_type: Option) -> Self { Self { mime_type, confidence: MimeConfidence::High, detection_method: DetectionMethod::MagicBytes, original_server_type: server_type, detected_extension: None, } } /// Create a result for extension-based detection pub fn from_extension(mime_type: String, extension: String) -> Self { Self { mime_type, confidence: MimeConfidence::Medium, detection_method: DetectionMethod::Extension, original_server_type: None, detected_extension: Some(extension), } } /// Create a fallback result pub fn fallback() -> Self { Self { mime_type: "application/octet-stream".to_string(), confidence: MimeConfidence::Low, detection_method: DetectionMethod::Fallback, original_server_type: None, detected_extension: None, } } /// Check if the detected MIME type indicates an image file pub fn is_image(&self) -> bool { self.mime_type.starts_with("image/") } /// Check if the detected MIME type indicates a document file pub fn is_document(&self) -> bool { matches!(self.mime_type.as_str(), "application/pdf" | "application/msword" | "application/vnd.openxmlformats-officedocument.wordprocessingml.document" | "application/vnd.ms-excel" | "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | "application/vnd.ms-powerpoint" | "application/vnd.openxmlformats-officedocument.presentationml.presentation" | "text/plain" | "text/rtf" | "application/rtf" ) } /// Check if this MIME type is suitable for OCR processing pub fn is_ocr_suitable(&self) -> bool { self.is_image() || self.mime_type == "application/pdf" } } /// Detect MIME type for 
WebDAV discovery phase (when we only have file metadata) /// /// This function is called during the initial WebDAV XML parsing when we don't /// have access to the actual file content yet. /// /// # Arguments /// * `filename` - The filename/path of the file /// * `server_mime_type` - MIME type provided by the WebDAV server, if any /// * `strategy` - Detection strategy to use /// /// # Returns /// A `MimeDetectionResult` with the best available MIME type determination pub fn detect_mime_for_discovery( filename: &str, server_mime_type: Option<&str>, strategy: DetectionStrategy, ) -> MimeDetectionResult { debug!("Detecting MIME type for discovery: filename={}, server_type={:?}, strategy={:?}", filename, server_mime_type, strategy); match strategy { DetectionStrategy::ContentBased => { // During discovery, we can't analyze content, so fall back to extension detect_from_extension(filename, server_mime_type) } DetectionStrategy::TrustServer => { if let Some(server_type) = server_mime_type { if is_trusted_server_mime_type(server_type) { return MimeDetectionResult::from_server(server_type.to_string()); } } // Fallback to extension-based detection detect_from_extension(filename, server_mime_type) } DetectionStrategy::ExtensionOnly => { detect_from_extension(filename, server_mime_type) } DetectionStrategy::Comprehensive => { // Use server type if trusted, otherwise extension-based if let Some(server_type) = server_mime_type { if is_trusted_server_mime_type(server_type) { return MimeDetectionResult::from_server(server_type.to_string()); } } detect_from_extension(filename, server_mime_type) } } } /// Detect MIME type when file content is available (during file download/processing) /// /// This provides the most accurate detection using magic bytes from the actual file content. 
/// /// # Arguments /// * `content` - The first few bytes of the file content (at least 512 bytes recommended) /// * `filename` - The filename for fallback detection /// * `server_mime_type` - MIME type provided by the server, if any /// /// # Returns /// A `MimeDetectionResult` with high-confidence MIME type detection pub fn detect_mime_from_content( content: &[u8], filename: &str, server_mime_type: Option<&str>, ) -> MimeDetectionResult { debug!("Detecting MIME type from content: filename={}, server_type={:?}, content_len={}", filename, server_mime_type, content.len()); // First, try magic byte detection if let Some(detected_type) = infer::get(content) { let mime_type = detected_type.mime_type().to_string(); debug!("Magic bytes detected MIME type: {}", mime_type); // If server provided a type, check for consistency if let Some(server_type) = server_mime_type { if are_mime_types_compatible(&mime_type, server_type) { // Both agree - very high confidence let mut result = MimeDetectionResult::from_content(mime_type, Some(server_type.to_string())); result.confidence = MimeConfidence::VeryHigh; result.detection_method = DetectionMethod::Hybrid; return result; } else { // Content detection overrides server type - trust the bytes warn!("MIME type mismatch: server={}, content={} for file {}", server_type, mime_type, filename); return MimeDetectionResult::from_content(mime_type, Some(server_type.to_string())); } } else { // Only content detection available return MimeDetectionResult::from_content(mime_type, None); } } // Magic bytes detection failed, fall back to server type if trusted if let Some(server_type) = server_mime_type { if is_trusted_server_mime_type(server_type) { debug!("Using trusted server MIME type: {}", server_type); return MimeDetectionResult::from_server(server_type.to_string()); } } // Fall back to extension-based detection debug!("Content detection failed, falling back to extension detection"); detect_from_extension(filename, server_mime_type) } /// 
Update an existing MIME type with content-based detection if available /// /// This function is useful for re-detecting MIME types when file content becomes /// available after initial discovery. /// /// # Arguments /// * `current_mime_type` - The currently assigned MIME type /// * `content` - File content for analysis /// * `filename` - Filename for context /// /// # Returns /// A new `MimeDetectionResult` if detection improves confidence, or None if no change needed pub fn update_mime_type_with_content( current_mime_type: &str, content: &[u8], filename: &str, ) -> Option { let new_result = detect_mime_from_content(content, filename, Some(current_mime_type)); // Only update if we have higher confidence or detected a different type if new_result.confidence > MimeConfidence::Medium || new_result.mime_type != current_mime_type { Some(new_result) } else { None } } /// Detect MIME type from file extension using mime_guess library fn detect_from_extension(filename: &str, server_mime_type: Option<&str>) -> MimeDetectionResult { let path = Path::new(filename); if let Some(mime_type) = mime_guess::from_path(path).first() { let mime_str = mime_type.to_string(); debug!("Extension-based detection: {} -> {}", filename, mime_str); let mut result = MimeDetectionResult::from_extension( mime_str, path.extension() .and_then(|ext| ext.to_str()) .unwrap_or("") .to_string() ); result.original_server_type = server_mime_type.map(|s| s.to_string()); result } else { debug!("Extension-based detection failed for: {}", filename); let mut result = MimeDetectionResult::fallback(); result.original_server_type = server_mime_type.map(|s| s.to_string()); result } } /// Check if a server-provided MIME type should be trusted /// /// Some servers return generic types like "application/octet-stream" which /// aren't useful, while others provide accurate information. 
fn is_trusted_server_mime_type(mime_type: &str) -> bool {
    // Compare the normalized essence so that case differences, surrounding
    // whitespace and parameters (e.g. "Application/Octet-Stream; charset=binary")
    // cannot make a generic placeholder type look trustworthy.
    !matches!(
        normalize_mime_essence(mime_type).as_str(),
        "application/octet-stream" | "application/binary" | "binary/octet-stream" | "" | "unknown"
    )
}

/// Reduce a MIME type to its lowercase `type/subtype` essence
///
/// Drops everything from the first `;` on (parameters), trims surrounding
/// whitespace and lowercases ASCII letters.
fn normalize_mime_essence(mime_type: &str) -> String {
    mime_type
        .split(';')
        .next()
        .unwrap_or_default()
        .trim()
        .to_ascii_lowercase()
}

/// Return the primary type of a `type/subtype` essence
///
/// Yields `None` unless the essence contains exactly one `/`.
fn primary_mime_type(essence: &str) -> Option<&str> {
    let mut parts = essence.split('/');
    match (parts.next(), parts.next(), parts.next()) {
        (Some(primary), Some(_), None) => Some(primary),
        _ => None,
    }
}

/// Check if two MIME types are compatible/equivalent
///
/// Some servers might return slightly different but equivalent MIME types
/// (e.g., "image/jpg" vs "image/jpeg"). Both inputs are normalized first
/// (parameters dropped, whitespace trimmed, ASCII-lowercased). Two types are
/// compatible when they are equal, when they are a known alias pair, or when
/// they share the same primary type (e.g., both are "image/*").
fn are_mime_types_compatible(type1: &str, type2: &str) -> bool {
    let left = normalize_mime_essence(type1);
    let right = normalize_mime_essence(type2);

    if left == right {
        return true;
    }

    match (left.as_str(), right.as_str()) {
        // Handle common variations
        ("image/jpeg", "image/jpg")
        | ("image/jpg", "image/jpeg")
        | ("image/tiff", "image/tif")
        | ("image/tif", "image/tiff")
        | ("text/plain", "text/txt")
        | ("text/txt", "text/plain") => true,
        // Otherwise both must be well-formed and share the primary type
        (a, b) => match (primary_mime_type(a), primary_mime_type(b)) {
            (Some(primary_a), Some(primary_b)) => primary_a == primary_b,
            _ => false,
        },
    }
}

/// Legacy function for backward compatibility
///
/// This maintains the same interface as the original `get_mime_type_from_extension`
/// function but uses the new detection system.
pub fn get_mime_type_from_extension(extension: &str) -> String {
    // `detect_from_extension` works on file names, so wrap the bare extension
    // in a placeholder name and keep only the resulting MIME type.
    detect_from_extension(&format!("file.{}", extension), None).mime_type
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Extension-only strategy resolves a PDF by its file name.
    #[test]
    fn test_mime_detection_from_extension() {
        let detected =
            detect_mime_for_discovery("test.pdf", None, DetectionStrategy::ExtensionOnly);

        assert_eq!(detected.mime_type, "application/pdf");
        assert_eq!(detected.detection_method, DetectionMethod::Extension);
    }

    /// A specific server type is used as-is; a generic one falls back to the extension.
    #[test]
    fn test_server_type_trust() {
        // Trusted server type
        let trusted = detect_mime_for_discovery(
            "test.pdf",
            Some("application/pdf"),
            DetectionStrategy::TrustServer,
        );
        assert_eq!(trusted.mime_type, "application/pdf");
        assert_eq!(trusted.detection_method, DetectionMethod::ServerProvided);

        // Untrusted server type should fall back
        let untrusted = detect_mime_for_discovery(
            "test.pdf",
            Some("application/octet-stream"),
            DetectionStrategy::TrustServer,
        );
        assert_eq!(untrusted.mime_type, "application/pdf");
        assert_eq!(untrusted.detection_method, DetectionMethod::Extension);
    }

    /// Alias pairs and identical types are compatible; different primary types are not.
    #[test]
    fn test_mime_type_compatibility() {
        assert!(are_mime_types_compatible("image/jpeg", "image/jpg"));
        assert!(are_mime_types_compatible("image/jpg", "image/jpeg"));
        assert!(are_mime_types_compatible("text/plain", "text/plain"));
        assert!(!are_mime_types_compatible("image/jpeg", "text/plain"));
    }

    /// Magic bytes alone identify PDF and JPEG content.
    #[test]
    fn test_content_based_detection() {
        // PDF magic bytes
        let pdf_magic = b"%PDF-1.4";
        let pdf = detect_mime_from_content(pdf_magic, "test.pdf", None);
        assert_eq!(pdf.mime_type, "application/pdf");
        assert_eq!(pdf.detection_method, DetectionMethod::MagicBytes);
        assert_eq!(pdf.confidence, MimeConfidence::High);

        // JPEG magic bytes
        let jpeg_magic = [0xFF_u8, 0xD8, 0xFF];
        let jpeg = detect_mime_from_content(&jpeg_magic, "test.jpg", None);
        assert_eq!(jpeg.mime_type, "image/jpeg");
    }

    /// Content and server agreeing yields the hybrid, very-high-confidence result.
    #[test]
    fn test_hybrid_detection() {
        // Content and server agree
        let pdf_magic = b"%PDF-1.4";
        let agreed = detect_mime_from_content(pdf_magic, "test.pdf", Some("application/pdf"));

        assert_eq!(agreed.mime_type, "application/pdf");
        assert_eq!(agreed.detection_method, DetectionMethod::Hybrid);
        assert_eq!(agreed.confidence, MimeConfidence::VeryHigh);
    }

    /// The legacy extension lookup keeps returning the well-known types.
    #[test]
    fn test_legacy_compatibility() {
        assert_eq!(get_mime_type_from_extension("pdf"), "application/pdf");
        assert_eq!(get_mime_type_from_extension("jpg"), "image/jpeg");
        assert_eq!(get_mime_type_from_extension("png"), "image/png");
    }

    /// PDFs and images are OCR candidates; plain text is not.
    #[test]
    fn test_ocr_suitability() {
        let pdf = MimeDetectionResult::from_content("application/pdf".to_string(), None);
        assert!(pdf.is_ocr_suitable());

        let image = MimeDetectionResult::from_content("image/jpeg".to_string(), None);
        assert!(image.is_ocr_suitable());

        let text = MimeDetectionResult::from_content("text/plain".to_string(), None);
        assert!(!text.is_ocr_suitable());
    }
}