431 lines
16 KiB
Rust
431 lines
16 KiB
Rust
//! MIME type detection module for improved file type identification.
//!
//! This module provides functions for detecting file MIME types using multiple methods:
//! 1. Content-based detection using magic bytes (most reliable)
//! 2. Server-provided MIME type (when available and trusted)
//! 3. Extension-based fallback (least reliable, but covers edge cases)
//!
//! The goal is to provide accurate MIME type detection, which is particularly important
//! for OCR processing, where incorrectly classified image files can cause issues.
|
|
|
|
use std::path::Path;
|
|
use tracing::{debug, warn};
|
|
|
|
/// Strategy for MIME type detection.
///
/// Selects which sources of information a detection routine is allowed to
/// consult. The enum is field-less, so it derives `Eq` and `Hash` in addition
/// to `PartialEq`, allowing it to be used as a map/set key and in exhaustive
/// equality checks.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum DetectionStrategy {
    /// Use content-based detection (magic bytes) - most reliable
    ContentBased,
    /// Trust server-provided MIME type if available, fallback to content
    TrustServer,
    /// Use extension-based detection - least reliable but fastest
    ExtensionOnly,
    /// Comprehensive strategy: server -> content -> extension -> fallback
    Comprehensive,
}
|
|
|
|
/// Result of MIME type detection with metadata about the detection method used
|
|
#[derive(Debug, Clone)]
|
|
pub struct MimeDetectionResult {
|
|
pub mime_type: String,
|
|
pub confidence: MimeConfidence,
|
|
pub detection_method: DetectionMethod,
|
|
pub original_server_type: Option<String>,
|
|
pub detected_extension: Option<String>,
|
|
}
|
|
|
|
/// How strongly a detected MIME type can be relied upon.
///
/// Variants are declared from weakest to strongest; the derived ordering
/// follows declaration order, so `Low < Medium < High < VeryHigh`.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum MimeConfidence {
    /// Weakest level: extension-based or fallback detection.
    Low,
    /// Middle level: detection via the mime_guess library.
    Medium,
    /// Strong level: magic byte detection or a trusted server value.
    High,
    /// Strongest level: content analysis confirms the server type.
    VeryHigh,
}
|
|
|
|
/// Method used for MIME type detection.
///
/// Recorded on every [`MimeDetectionResult`] so callers can tell where the
/// reported type came from. The enum is field-less, so it derives `Eq` and
/// `Hash` in addition to `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum DetectionMethod {
    /// Detected using magic bytes/file signature
    MagicBytes,
    /// Provided by the server and trusted
    ServerProvided,
    /// Detected using file extension
    Extension,
    /// Fallback to default type
    Fallback,
    /// Hybrid approach using multiple methods
    Hybrid,
}
|
|
|
|
impl MimeDetectionResult {
|
|
/// Create a result for server-provided MIME type
|
|
pub fn from_server(mime_type: String) -> Self {
|
|
Self {
|
|
mime_type,
|
|
confidence: MimeConfidence::High,
|
|
detection_method: DetectionMethod::ServerProvided,
|
|
original_server_type: None,
|
|
detected_extension: None,
|
|
}
|
|
}
|
|
|
|
/// Create a result for content-based detection
|
|
pub fn from_content(mime_type: String, server_type: Option<String>) -> Self {
|
|
Self {
|
|
mime_type,
|
|
confidence: MimeConfidence::High,
|
|
detection_method: DetectionMethod::MagicBytes,
|
|
original_server_type: server_type,
|
|
detected_extension: None,
|
|
}
|
|
}
|
|
|
|
/// Create a result for extension-based detection
|
|
pub fn from_extension(mime_type: String, extension: String) -> Self {
|
|
Self {
|
|
mime_type,
|
|
confidence: MimeConfidence::Medium,
|
|
detection_method: DetectionMethod::Extension,
|
|
original_server_type: None,
|
|
detected_extension: Some(extension),
|
|
}
|
|
}
|
|
|
|
/// Create a fallback result
|
|
pub fn fallback() -> Self {
|
|
Self {
|
|
mime_type: "application/octet-stream".to_string(),
|
|
confidence: MimeConfidence::Low,
|
|
detection_method: DetectionMethod::Fallback,
|
|
original_server_type: None,
|
|
detected_extension: None,
|
|
}
|
|
}
|
|
|
|
/// Check if the detected MIME type indicates an image file
|
|
pub fn is_image(&self) -> bool {
|
|
self.mime_type.starts_with("image/")
|
|
}
|
|
|
|
/// Check if the detected MIME type indicates a document file
|
|
pub fn is_document(&self) -> bool {
|
|
matches!(self.mime_type.as_str(),
|
|
"application/pdf" |
|
|
"application/msword" |
|
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" |
|
|
"application/vnd.ms-excel" |
|
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" |
|
|
"application/vnd.ms-powerpoint" |
|
|
"application/vnd.openxmlformats-officedocument.presentationml.presentation" |
|
|
"text/plain" |
|
|
"text/rtf" |
|
|
"application/rtf"
|
|
)
|
|
}
|
|
|
|
/// Check if this MIME type is suitable for OCR processing
|
|
pub fn is_ocr_suitable(&self) -> bool {
|
|
self.is_image() || self.mime_type == "application/pdf"
|
|
}
|
|
}
|
|
|
|
/// Detect MIME type for WebDAV discovery phase (when we only have file metadata)
|
|
///
|
|
/// This function is called during the initial WebDAV XML parsing when we don't
|
|
/// have access to the actual file content yet.
|
|
///
|
|
/// # Arguments
|
|
/// * `filename` - The filename/path of the file
|
|
/// * `server_mime_type` - MIME type provided by the WebDAV server, if any
|
|
/// * `strategy` - Detection strategy to use
|
|
///
|
|
/// # Returns
|
|
/// A `MimeDetectionResult` with the best available MIME type determination
|
|
pub fn detect_mime_for_discovery(
|
|
filename: &str,
|
|
server_mime_type: Option<&str>,
|
|
strategy: DetectionStrategy,
|
|
) -> MimeDetectionResult {
|
|
debug!("Detecting MIME type for discovery: filename={}, server_type={:?}, strategy={:?}",
|
|
filename, server_mime_type, strategy);
|
|
|
|
match strategy {
|
|
DetectionStrategy::ContentBased => {
|
|
// During discovery, we can't analyze content, so fall back to extension
|
|
detect_from_extension(filename, server_mime_type)
|
|
}
|
|
DetectionStrategy::TrustServer => {
|
|
if let Some(server_type) = server_mime_type {
|
|
if is_trusted_server_mime_type(server_type) {
|
|
return MimeDetectionResult::from_server(server_type.to_string());
|
|
}
|
|
}
|
|
// Fallback to extension-based detection
|
|
detect_from_extension(filename, server_mime_type)
|
|
}
|
|
DetectionStrategy::ExtensionOnly => {
|
|
detect_from_extension(filename, server_mime_type)
|
|
}
|
|
DetectionStrategy::Comprehensive => {
|
|
// Use server type if trusted, otherwise extension-based
|
|
if let Some(server_type) = server_mime_type {
|
|
if is_trusted_server_mime_type(server_type) {
|
|
return MimeDetectionResult::from_server(server_type.to_string());
|
|
}
|
|
}
|
|
detect_from_extension(filename, server_mime_type)
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Detect MIME type when file content is available (during file download/processing)
|
|
///
|
|
/// This provides the most accurate detection using magic bytes from the actual file content.
|
|
///
|
|
/// # Arguments
|
|
/// * `content` - The first few bytes of the file content (at least 512 bytes recommended)
|
|
/// * `filename` - The filename for fallback detection
|
|
/// * `server_mime_type` - MIME type provided by the server, if any
|
|
///
|
|
/// # Returns
|
|
/// A `MimeDetectionResult` with high-confidence MIME type detection
|
|
pub fn detect_mime_from_content(
|
|
content: &[u8],
|
|
filename: &str,
|
|
server_mime_type: Option<&str>,
|
|
) -> MimeDetectionResult {
|
|
debug!("Detecting MIME type from content: filename={}, server_type={:?}, content_len={}",
|
|
filename, server_mime_type, content.len());
|
|
|
|
// First, try magic byte detection
|
|
if let Some(detected_type) = infer::get(content) {
|
|
let mime_type = detected_type.mime_type().to_string();
|
|
debug!("Magic bytes detected MIME type: {}", mime_type);
|
|
|
|
// If server provided a type, check for consistency
|
|
if let Some(server_type) = server_mime_type {
|
|
if are_mime_types_compatible(&mime_type, server_type) {
|
|
// Both agree - very high confidence
|
|
let mut result = MimeDetectionResult::from_content(mime_type, Some(server_type.to_string()));
|
|
result.confidence = MimeConfidence::VeryHigh;
|
|
result.detection_method = DetectionMethod::Hybrid;
|
|
return result;
|
|
} else {
|
|
// Content detection overrides server type - trust the bytes
|
|
warn!("MIME type mismatch: server={}, content={} for file {}",
|
|
server_type, mime_type, filename);
|
|
return MimeDetectionResult::from_content(mime_type, Some(server_type.to_string()));
|
|
}
|
|
} else {
|
|
// Only content detection available
|
|
return MimeDetectionResult::from_content(mime_type, None);
|
|
}
|
|
}
|
|
|
|
// Magic bytes detection failed, fall back to server type if trusted
|
|
if let Some(server_type) = server_mime_type {
|
|
if is_trusted_server_mime_type(server_type) {
|
|
debug!("Using trusted server MIME type: {}", server_type);
|
|
return MimeDetectionResult::from_server(server_type.to_string());
|
|
}
|
|
}
|
|
|
|
// Fall back to extension-based detection
|
|
debug!("Content detection failed, falling back to extension detection");
|
|
detect_from_extension(filename, server_mime_type)
|
|
}
|
|
|
|
/// Update an existing MIME type with content-based detection if available
|
|
///
|
|
/// This function is useful for re-detecting MIME types when file content becomes
|
|
/// available after initial discovery.
|
|
///
|
|
/// # Arguments
|
|
/// * `current_mime_type` - The currently assigned MIME type
|
|
/// * `content` - File content for analysis
|
|
/// * `filename` - Filename for context
|
|
///
|
|
/// # Returns
|
|
/// A new `MimeDetectionResult` if detection improves confidence, or None if no change needed
|
|
pub fn update_mime_type_with_content(
|
|
current_mime_type: &str,
|
|
content: &[u8],
|
|
filename: &str,
|
|
) -> Option<MimeDetectionResult> {
|
|
let new_result = detect_mime_from_content(content, filename, Some(current_mime_type));
|
|
|
|
// Only update if we have higher confidence or detected a different type
|
|
if new_result.confidence > MimeConfidence::Medium ||
|
|
new_result.mime_type != current_mime_type {
|
|
Some(new_result)
|
|
} else {
|
|
None
|
|
}
|
|
}
|
|
|
|
/// Detect MIME type from file extension using mime_guess library
|
|
fn detect_from_extension(filename: &str, server_mime_type: Option<&str>) -> MimeDetectionResult {
|
|
let path = Path::new(filename);
|
|
|
|
if let Some(mime_type) = mime_guess::from_path(path).first() {
|
|
let mime_str = mime_type.to_string();
|
|
debug!("Extension-based detection: {} -> {}", filename, mime_str);
|
|
|
|
let mut result = MimeDetectionResult::from_extension(
|
|
mime_str,
|
|
path.extension()
|
|
.and_then(|ext| ext.to_str())
|
|
.unwrap_or("")
|
|
.to_string()
|
|
);
|
|
result.original_server_type = server_mime_type.map(|s| s.to_string());
|
|
result
|
|
} else {
|
|
debug!("Extension-based detection failed for: {}", filename);
|
|
let mut result = MimeDetectionResult::fallback();
|
|
result.original_server_type = server_mime_type.map(|s| s.to_string());
|
|
result
|
|
}
|
|
}
|
|
|
|
/// Check if a server-provided MIME type should be trusted.
///
/// Some servers return generic placeholders like "application/octet-stream"
/// which carry no useful information. Such values are rejected regardless of
/// ASCII case, surrounding whitespace, or trailing parameters
/// (e.g. `Application/Octet-Stream; charset=binary`). An empty or
/// whitespace-only value is also rejected.
fn is_trusted_server_mime_type(mime_type: &str) -> bool {
    /// Values that say nothing about the real file type.
    const GENERIC_TYPES: [&str; 5] = [
        "application/octet-stream",
        "application/binary",
        "binary/octet-stream",
        "",
        "unknown",
    ];

    // Compare only the `type/subtype` part; parameters follow the first `;`.
    let essence = mime_type.split(';').next().unwrap_or("").trim();

    !GENERIC_TYPES
        .iter()
        .any(|generic| essence.eq_ignore_ascii_case(generic))
}
|
|
|
|
/// Check if two MIME types are compatible/equivalent.
///
/// Some servers might return slightly different but equivalent MIME types
/// (e.g., "image/jpg" vs "image/jpeg"). Two values are considered compatible
/// when, after dropping parameters, trimming whitespace and ignoring ASCII
/// case, they are equal, are a known alias pair, or share the same top-level
/// type (e.g., both are "image/*").
fn are_mime_types_compatible(type1: &str, type2: &str) -> bool {
    /// Lower-cased `type/subtype` portion with parameters and padding removed.
    fn essence(raw: &str) -> String {
        raw.split(';')
            .next()
            .unwrap_or("")
            .trim()
            .to_ascii_lowercase()
    }

    /// Top-level type of a well-formed `type/subtype` value (exactly one `/`).
    fn primary_type(essence: &str) -> Option<&str> {
        match essence.split_once('/') {
            Some((primary, subtype)) if !subtype.contains('/') => Some(primary),
            _ => None,
        }
    }

    if type1 == type2 {
        return true;
    }

    let left = essence(type1);
    let right = essence(type2);

    // Same type once case, padding and parameters are ignored.
    if !left.is_empty() && left == right {
        return true;
    }

    // Handle common variations
    match (left.as_str(), right.as_str()) {
        ("image/jpeg", "image/jpg") | ("image/jpg", "image/jpeg") => true,
        ("image/tiff", "image/tif") | ("image/tif", "image/tiff") => true,
        ("text/plain", "text/txt") | ("text/txt", "text/plain") => true,
        // Check if they have the same primary type (e.g., both are "image/*")
        _ => match (primary_type(&left), primary_type(&right)) {
            (Some(first), Some(second)) => first == second,
            _ => false,
        },
    }
}
|
|
|
|
/// Legacy function for backward compatibility
|
|
///
|
|
/// This maintains the same interface as the original `get_mime_type_from_extension`
|
|
/// function but uses the new detection system.
|
|
pub fn get_mime_type_from_extension(extension: &str) -> String {
|
|
let fake_filename = format!("file.{}", extension);
|
|
let result = detect_from_extension(&fake_filename, None);
|
|
result.mime_type
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Extension-only discovery resolves a `.pdf` name via its extension.
    #[test]
    fn test_mime_detection_from_extension() {
        let detected =
            detect_mime_for_discovery("test.pdf", None, DetectionStrategy::ExtensionOnly);

        assert_eq!(detected.mime_type, "application/pdf");
        assert_eq!(detected.detection_method, DetectionMethod::Extension);
    }

    /// `TrustServer` uses a specific server type and ignores a generic one.
    #[test]
    fn test_server_type_trust() {
        // Trusted server type
        let trusted = detect_mime_for_discovery(
            "test.pdf",
            Some("application/pdf"),
            DetectionStrategy::TrustServer,
        );
        assert_eq!(trusted.mime_type, "application/pdf");
        assert_eq!(trusted.detection_method, DetectionMethod::ServerProvided);

        // Untrusted server type should fall back
        let untrusted = detect_mime_for_discovery(
            "test.pdf",
            Some("application/octet-stream"),
            DetectionStrategy::TrustServer,
        );
        assert_eq!(untrusted.mime_type, "application/pdf");
        assert_eq!(untrusted.detection_method, DetectionMethod::Extension);
    }

    /// Alias pairs and identical types are compatible; unrelated types are not.
    #[test]
    fn test_mime_type_compatibility() {
        assert!(are_mime_types_compatible("image/jpeg", "image/jpg"));
        assert!(are_mime_types_compatible("image/jpg", "image/jpeg"));
        assert!(are_mime_types_compatible("text/plain", "text/plain"));
        assert!(!are_mime_types_compatible("image/jpeg", "text/plain"));
    }

    /// Magic bytes alone identify PDF and JPEG content.
    #[test]
    fn test_content_based_detection() {
        // PDF magic bytes
        let pdf_header = b"%PDF-1.4";
        let from_pdf = detect_mime_from_content(pdf_header, "test.pdf", None);
        assert_eq!(from_pdf.mime_type, "application/pdf");
        assert_eq!(from_pdf.detection_method, DetectionMethod::MagicBytes);
        assert_eq!(from_pdf.confidence, MimeConfidence::High);

        // JPEG magic bytes
        let jpeg_header = [0xFF, 0xD8, 0xFF];
        let from_jpeg = detect_mime_from_content(&jpeg_header, "test.jpg", None);
        assert_eq!(from_jpeg.mime_type, "image/jpeg");
    }

    /// Agreement between content and server yields a hybrid, very-high result.
    #[test]
    fn test_hybrid_detection() {
        // Content and server agree
        let pdf_header = b"%PDF-1.4";
        let agreed = detect_mime_from_content(pdf_header, "test.pdf", Some("application/pdf"));

        assert_eq!(agreed.mime_type, "application/pdf");
        assert_eq!(agreed.detection_method, DetectionMethod::Hybrid);
        assert_eq!(agreed.confidence, MimeConfidence::VeryHigh);
    }

    /// The legacy extension lookup still maps common extensions.
    #[test]
    fn test_legacy_compatibility() {
        assert_eq!(get_mime_type_from_extension("pdf"), "application/pdf");
        assert_eq!(get_mime_type_from_extension("jpg"), "image/jpeg");
        assert_eq!(get_mime_type_from_extension("png"), "image/png");
    }

    /// PDFs and images are OCR candidates; plain text is not.
    #[test]
    fn test_ocr_suitability() {
        let pdf_result = MimeDetectionResult::from_content("application/pdf".to_string(), None);
        assert!(pdf_result.is_ocr_suitable());

        let image_result = MimeDetectionResult::from_content("image/jpeg".to_string(), None);
        assert!(image_result.is_ocr_suitable());

        let text_result = MimeDetectionResult::from_content("text/plain".to_string(), None);
        assert!(!text_result.is_ocr_suitable());
    }
}