Readur/src/metadata_extraction/tests.rs

320 lines
13 KiB
Rust

#[cfg(test)]
mod tests {
use super::*;
use std::fs;
use serde_json::Value;
#[tokio::test]
async fn test_image_metadata_extraction_portrait() {
let image_data = fs::read("test_files/portrait_100x200.png").expect("Failed to read portrait test image");
let metadata = extract_content_metadata(&image_data, "image/png", "portrait_100x200.png")
.await
.expect("Failed to extract metadata");
assert!(metadata.is_some());
let metadata = metadata.unwrap();
// Check basic image properties
assert_eq!(metadata["image_width"], Value::Number(100.into()));
assert_eq!(metadata["image_height"], Value::Number(200.into()));
assert_eq!(metadata["orientation"], Value::String("portrait".to_string()));
assert_eq!(metadata["file_extension"], Value::String("png".to_string()));
// Check calculated values
assert_eq!(metadata["aspect_ratio"], Value::String("0.50".to_string()));
assert_eq!(metadata["megapixels"], Value::String("0.0 MP".to_string()));
}
#[tokio::test]
async fn test_image_metadata_extraction_landscape() {
let image_data = fs::read("test_files/landscape_300x200.png").expect("Failed to read landscape test image");
let metadata = extract_content_metadata(&image_data, "image/png", "landscape_300x200.png")
.await
.expect("Failed to extract metadata");
assert!(metadata.is_some());
let metadata = metadata.unwrap();
assert_eq!(metadata["image_width"], Value::Number(300.into()));
assert_eq!(metadata["image_height"], Value::Number(200.into()));
assert_eq!(metadata["orientation"], Value::String("landscape".to_string()));
assert_eq!(metadata["aspect_ratio"], Value::String("1.50".to_string()));
}
#[tokio::test]
async fn test_image_metadata_extraction_square() {
let image_data = fs::read("test_files/square_150x150.png").expect("Failed to read square test image");
let metadata = extract_content_metadata(&image_data, "image/png", "square_150x150.png")
.await
.expect("Failed to extract metadata");
assert!(metadata.is_some());
let metadata = metadata.unwrap();
assert_eq!(metadata["image_width"], Value::Number(150.into()));
assert_eq!(metadata["image_height"], Value::Number(150.into()));
assert_eq!(metadata["orientation"], Value::String("square".to_string()));
assert_eq!(metadata["aspect_ratio"], Value::String("1.00".to_string()));
}
#[tokio::test]
async fn test_image_metadata_extraction_high_resolution() {
let image_data = fs::read("test_files/hires_1920x1080.png").expect("Failed to read high-res test image");
let metadata = extract_content_metadata(&image_data, "image/png", "hires_1920x1080.png")
.await
.expect("Failed to extract metadata");
assert!(metadata.is_some());
let metadata = metadata.unwrap();
assert_eq!(metadata["image_width"], Value::Number(1920.into()));
assert_eq!(metadata["image_height"], Value::Number(1080.into()));
assert_eq!(metadata["orientation"], Value::String("landscape".to_string()));
assert_eq!(metadata["megapixels"], Value::String("2.1 MP".to_string()));
}
#[tokio::test]
async fn test_jpeg_metadata_extraction() {
let image_data = fs::read("test_files/test_image.jpg").expect("Failed to read JPEG test image");
let metadata = extract_content_metadata(&image_data, "image/jpeg", "test_image.jpg")
.await
.expect("Failed to extract metadata");
assert!(metadata.is_some());
let metadata = metadata.unwrap();
assert_eq!(metadata["file_extension"], Value::String("jpg".to_string()));
assert!(metadata.contains_key("image_width"));
assert!(metadata.contains_key("image_height"));
}
#[tokio::test]
async fn test_pdf_metadata_extraction_single_page() {
let pdf_data = fs::read("test_files/single_page_v14.pdf").expect("Failed to read single page PDF");
let metadata = extract_content_metadata(&pdf_data, "application/pdf", "single_page_v14.pdf")
.await
.expect("Failed to extract metadata");
assert!(metadata.is_some());
let metadata = metadata.unwrap();
assert_eq!(metadata["file_extension"], Value::String("pdf".to_string()));
// Note: PDF version detection might vary depending on how reportlab creates the file
assert!(metadata.contains_key("pdf_version") || metadata.contains_key("file_type"));
}
#[tokio::test]
async fn test_pdf_metadata_extraction_multipage() {
let pdf_data = fs::read("test_files/multipage_test.pdf").expect("Failed to read multipage PDF");
let metadata = extract_content_metadata(&pdf_data, "application/pdf", "multipage_test.pdf")
.await
.expect("Failed to extract metadata");
assert!(metadata.is_some());
let metadata = metadata.unwrap();
assert_eq!(metadata["file_extension"], Value::String("pdf".to_string()));
// Should detect multiple pages if our page counting works
if let Some(page_count) = metadata.get("page_count") {
if let Value::Number(count) = page_count {
assert!(count.as_u64().unwrap() > 1);
}
}
}
#[tokio::test]
async fn test_pdf_metadata_with_fonts_and_images() {
let pdf_data = fs::read("test_files/complex_content.pdf").expect("Failed to read complex PDF");
let metadata = extract_content_metadata(&pdf_data, "application/pdf", "complex_content.pdf")
.await
.expect("Failed to extract metadata");
assert!(metadata.is_some());
let metadata = metadata.unwrap();
// Should detect fonts and potentially images/objects
if let Some(Value::Bool(has_fonts)) = metadata.get("contains_fonts") {
// Font detection might work depending on PDF structure
}
}
#[tokio::test]
async fn test_text_metadata_extraction_comprehensive() {
let text_data = fs::read("test_files/comprehensive_text.txt").expect("Failed to read comprehensive text");
let metadata = extract_content_metadata(&text_data, "text/plain", "comprehensive_text.txt")
.await
.expect("Failed to extract metadata");
assert!(metadata.is_some());
let metadata = metadata.unwrap();
assert_eq!(metadata["file_extension"], Value::String("txt".to_string()));
// Check text statistics
if let Value::Number(char_count) = &metadata["character_count"] {
assert!(char_count.as_u64().unwrap() > 500); // Should be substantial
}
if let Value::Number(word_count) = &metadata["word_count"] {
assert!(word_count.as_u64().unwrap() > 80); // Should have many words
}
if let Value::Number(line_count) = &metadata["line_count"] {
assert!(line_count.as_u64().unwrap() > 15); // Should have multiple lines
}
// Should detect Unicode content
assert_eq!(metadata["contains_unicode"], Value::Bool(true));
// Should detect likely English
if let Some(Value::String(lang)) = metadata.get("likely_language") {
assert_eq!(lang, "english");
}
}
#[tokio::test]
async fn test_text_metadata_extraction_ascii_only() {
let text_data = fs::read("test_files/ascii_only.txt").expect("Failed to read ASCII text");
let metadata = extract_content_metadata(&text_data, "text/plain", "ascii_only.txt")
.await
.expect("Failed to extract metadata");
assert!(metadata.is_some());
let metadata = metadata.unwrap();
// Should NOT contain Unicode
assert!(metadata.get("contains_unicode").is_none() || metadata["contains_unicode"] == Value::Bool(false));
}
#[tokio::test]
async fn test_text_metadata_extraction_large_file() {
let text_data = fs::read("test_files/large_text.txt").expect("Failed to read large text");
let metadata = extract_content_metadata(&text_data, "text/plain", "large_text.txt")
.await
.expect("Failed to extract metadata");
assert!(metadata.is_some());
let metadata = metadata.unwrap();
// Should handle large files properly
if let Value::Number(char_count) = &metadata["character_count"] {
assert!(char_count.as_u64().unwrap() > 50000); // Should be large
}
if let Value::Number(word_count) = &metadata["word_count"] {
assert!(word_count.as_u64().unwrap() > 10000); // Should have many words
}
}
#[tokio::test]
async fn test_json_format_detection() {
let text_data = fs::read("test_files/test_format.json").expect("Failed to read JSON text");
let metadata = extract_content_metadata(&text_data, "text/plain", "test_format.json")
.await
.expect("Failed to extract metadata");
assert!(metadata.is_some());
let metadata = metadata.unwrap();
assert_eq!(metadata["file_extension"], Value::String("json".to_string()));
// Should detect JSON format
if let Some(Value::String(format)) = metadata.get("text_format") {
assert_eq!(format, "json");
}
}
#[tokio::test]
async fn test_xml_format_detection() {
let text_data = fs::read("test_files/test_format.xml").expect("Failed to read XML text");
let metadata = extract_content_metadata(&text_data, "text/plain", "test_format.xml")
.await
.expect("Failed to extract metadata");
assert!(metadata.is_some());
let metadata = metadata.unwrap();
assert_eq!(metadata["file_extension"], Value::String("xml".to_string()));
// Should detect XML format
if let Some(Value::String(format)) = metadata.get("text_format") {
assert_eq!(format, "xml");
}
}
#[tokio::test]
async fn test_html_format_detection() {
let text_data = fs::read("test_files/test_format.html").expect("Failed to read HTML text");
let metadata = extract_content_metadata(&text_data, "text/plain", "test_format.html")
.await
.expect("Failed to extract metadata");
assert!(metadata.is_some());
let metadata = metadata.unwrap();
assert_eq!(metadata["file_extension"], Value::String("html".to_string()));
// Should detect HTML format
if let Some(Value::String(format)) = metadata.get("text_format") {
assert_eq!(format, "html");
}
}
#[tokio::test]
async fn test_unknown_file_type() {
let dummy_data = b"This is some random binary data that doesn't match any known format.";
let metadata = extract_content_metadata(dummy_data, "application/octet-stream", "unknown.bin")
.await
.expect("Failed to extract metadata");
assert!(metadata.is_some());
let metadata = metadata.unwrap();
assert_eq!(metadata["file_type"], Value::String("application/octet-stream".to_string()));
assert_eq!(metadata["file_extension"], Value::String("bin".to_string()));
}
#[tokio::test]
async fn test_empty_file() {
let empty_data = b"";
let metadata = extract_content_metadata(empty_data, "text/plain", "empty.txt")
.await
.expect("Failed to extract metadata");
// Should still return some metadata (at least file extension)
assert!(metadata.is_some());
let metadata = metadata.unwrap();
assert_eq!(metadata["file_extension"], Value::String("txt".to_string()));
}
#[tokio::test]
async fn test_file_without_extension() {
let text_data = b"Some text content without file extension";
let metadata = extract_content_metadata(text_data, "text/plain", "no_extension")
.await
.expect("Failed to extract metadata");
assert!(metadata.is_some());
let metadata = metadata.unwrap();
// Should not have file_extension field
assert!(!metadata.contains_key("file_extension"));
}
}