feat(server): show source metadata EVEN better

2025-07-10 21:51:30 +00:00 · 2025-07-10 21:51:30 +00:00 · 59b4eb170c
parent 4c4a593a1e
commit 59b4eb170c
26 changed files with 1017 additions and 8 deletions
--- a/src/ingestion/batch_ingest.rs
+++ b/src/ingestion/batch_ingest.rs
@ -230,8 +230,8 @@ async fn process_single_file(
    user_id: Uuid,
    db: Database,
 ) -> Result<Option<(Uuid, i64)>> {
-    // Extract file info with metadata
-    let file_info = extract_file_info_from_path(&path).await?;
+    // Extract basic file info first
+    let mut file_info = extract_file_info_from_path(&path).await?;
    
    // Skip very large files (> 100MB)
    if file_info.size > 100 * 1024 * 1024 {
@ -242,6 +242,11 @@ async fn process_single_file(
    // Read file data
    let file_data = fs::read(&path).await?;
    
+    // Extract content-based metadata
+    if let Ok(Some(content_metadata)) = crate::metadata_extraction::extract_content_metadata(&file_data, &file_info.mime_type, &file_info.name).await {
+        file_info.metadata = Some(content_metadata);
+    }
+    
    // Use the unified ingestion service with full metadata support
    let ingestion_service = DocumentIngestionService::new(db, file_service);
    
--- a/src/metadata_extraction.rs
+++ b/src/metadata_extraction.rs
@ -1,6 +1,5 @@
 use anyhow::Result;
 use serde_json::{Map, Value};
-use std::collections::HashMap;

 /// Extract metadata from file content based on file type
 pub async fn extract_content_metadata(file_data: &[u8], mime_type: &str, filename: &str) -> Result<Option<Value>> {
@ -176,4 +175,7 @@ async fn extract_text_metadata(file_data: &[u8]) -> Result<Map<String, Value>> {
    }
    
    Ok(metadata)
-}
+}
+
+#[cfg(test)]
+mod tests;
--- a/src/metadata_extraction/tests.rs
+++ b/src/metadata_extraction/tests.rs
@ -0,0 +1,320 @@
+#[cfg(test)]
+mod tests {
+    // Fixture-driven tests for `extract_content_metadata`. Fixture files are
+    // read from `test_files/`, relative to the directory the tests run in.
+
+    // This module is nested inside the `tests.rs` file module, so `super::*`
+    // would only see that (otherwise empty) file module. Import the function
+    // under test by its crate path instead.
+    use crate::metadata_extraction::extract_content_metadata;
+    use serde_json::Value;
+    use std::fs;
+
+    /// Runs the extractor on in-memory bytes and unwraps the metadata value,
+    /// panicking if extraction fails or yields `None`.
+    async fn extract_from_bytes(data: &[u8], mime_type: &str, filename: &str) -> Value {
+        extract_content_metadata(data, mime_type, filename)
+            .await
+            .expect("Failed to extract metadata")
+            .expect("Extractor returned no metadata")
+    }
+
+    /// Loads `test_files/<filename>` and runs the extractor on its contents.
+    async fn extract_from_fixture(filename: &str, mime_type: &str) -> Value {
+        let path = format!("test_files/{}", filename);
+        let data = fs::read(&path)
+            .unwrap_or_else(|err| panic!("Failed to read fixture {}: {}", path, err));
+        extract_from_bytes(&data, mime_type, filename).await
+    }
+
+    /// `serde_json::Value` has no `contains_key`; a key is present when `get` finds it.
+    fn has_key(metadata: &Value, key: &str) -> bool {
+        metadata.get(key).is_some()
+    }
+
+    #[tokio::test]
+    async fn test_image_metadata_extraction_portrait() {
+        let metadata = extract_from_fixture("portrait_100x200.png", "image/png").await;
+
+        // Check basic image properties
+        assert_eq!(metadata["image_width"], 100);
+        assert_eq!(metadata["image_height"], 200);
+        assert_eq!(metadata["orientation"], "portrait");
+        assert_eq!(metadata["file_extension"], "png");
+
+        // Check calculated values
+        assert_eq!(metadata["aspect_ratio"], "0.50");
+        assert_eq!(metadata["megapixels"], "0.0 MP");
+    }
+
+    #[tokio::test]
+    async fn test_image_metadata_extraction_landscape() {
+        let metadata = extract_from_fixture("landscape_300x200.png", "image/png").await;
+
+        assert_eq!(metadata["image_width"], 300);
+        assert_eq!(metadata["image_height"], 200);
+        assert_eq!(metadata["orientation"], "landscape");
+        assert_eq!(metadata["aspect_ratio"], "1.50");
+    }
+
+    #[tokio::test]
+    async fn test_image_metadata_extraction_square() {
+        let metadata = extract_from_fixture("square_150x150.png", "image/png").await;
+
+        assert_eq!(metadata["image_width"], 150);
+        assert_eq!(metadata["image_height"], 150);
+        assert_eq!(metadata["orientation"], "square");
+        assert_eq!(metadata["aspect_ratio"], "1.00");
+    }
+
+    #[tokio::test]
+    async fn test_image_metadata_extraction_high_resolution() {
+        let metadata = extract_from_fixture("hires_1920x1080.png", "image/png").await;
+
+        assert_eq!(metadata["image_width"], 1920);
+        assert_eq!(metadata["image_height"], 1080);
+        assert_eq!(metadata["orientation"], "landscape");
+        assert_eq!(metadata["megapixels"], "2.1 MP");
+    }
+
+    #[tokio::test]
+    async fn test_jpeg_metadata_extraction() {
+        let metadata = extract_from_fixture("test_image.jpg", "image/jpeg").await;
+
+        assert_eq!(metadata["file_extension"], "jpg");
+        assert!(has_key(&metadata, "image_width"));
+        assert!(has_key(&metadata, "image_height"));
+    }
+
+    #[tokio::test]
+    async fn test_pdf_metadata_extraction_single_page() {
+        let metadata = extract_from_fixture("single_page_v14.pdf", "application/pdf").await;
+
+        assert_eq!(metadata["file_extension"], "pdf");
+        // Note: PDF version detection might vary depending on how reportlab creates the file
+        assert!(has_key(&metadata, "pdf_version") || has_key(&metadata, "file_type"));
+    }
+
+    #[tokio::test]
+    async fn test_pdf_metadata_extraction_multipage() {
+        let metadata = extract_from_fixture("multipage_test.pdf", "application/pdf").await;
+
+        assert_eq!(metadata["file_extension"], "pdf");
+        // Page counting is optional; when a count is reported it must be > 1.
+        if let Some(page_count) = metadata.get("page_count").and_then(Value::as_u64) {
+            assert!(page_count > 1);
+        }
+    }
+
+    #[tokio::test]
+    async fn test_pdf_metadata_with_fonts_and_images() {
+        let metadata = extract_from_fixture("complex_content.pdf", "application/pdf").await;
+
+        // Font detection depends on the PDF structure, so the flag is optional;
+        // when it is reported it is expected to be a boolean.
+        if let Some(has_fonts) = metadata.get("contains_fonts") {
+            assert!(has_fonts.is_boolean(), "contains_fonts should be a boolean");
+        }
+    }
+
+    #[tokio::test]
+    async fn test_text_metadata_extraction_comprehensive() {
+        let metadata = extract_from_fixture("comprehensive_text.txt", "text/plain").await;
+
+        assert_eq!(metadata["file_extension"], "txt");
+
+        // Check text statistics (each one only when the extractor reports it)
+        if let Some(char_count) = metadata["character_count"].as_u64() {
+            assert!(char_count > 500); // Should be substantial
+        }
+
+        if let Some(word_count) = metadata["word_count"].as_u64() {
+            assert!(word_count > 80); // Should have many words
+        }
+
+        if let Some(line_count) = metadata["line_count"].as_u64() {
+            // The fixture has exactly 15 lines, so the bound must be inclusive.
+            assert!(line_count >= 15);
+        }
+
+        // Should detect Unicode content
+        assert_eq!(metadata["contains_unicode"], true);
+
+        // Should detect likely English
+        if let Some(lang) = metadata.get("likely_language").and_then(Value::as_str) {
+            assert_eq!(lang, "english");
+        }
+    }
+
+    #[tokio::test]
+    async fn test_text_metadata_extraction_ascii_only() {
+        let metadata = extract_from_fixture("ascii_only.txt", "text/plain").await;
+
+        // Should NOT contain Unicode: the flag is either absent or explicitly false
+        assert!(matches!(
+            metadata.get("contains_unicode"),
+            None | Some(Value::Bool(false))
+        ));
+    }
+
+    #[tokio::test]
+    async fn test_text_metadata_extraction_large_file() {
+        let metadata = extract_from_fixture("large_text.txt", "text/plain").await;
+
+        // Should handle large files properly
+        if let Some(char_count) = metadata["character_count"].as_u64() {
+            assert!(char_count > 50000); // Should be large
+        }
+
+        if let Some(word_count) = metadata["word_count"].as_u64() {
+            // The generator repeats a 9-word sentence 1000 times (about 9,007
+            // words in total), so a 10,000-word bound can never hold.
+            assert!(word_count > 9000);
+        }
+    }
+
+    #[tokio::test]
+    async fn test_json_format_detection() {
+        let metadata = extract_from_fixture("test_format.json", "text/plain").await;
+
+        assert_eq!(metadata["file_extension"], "json");
+
+        // Should detect JSON format
+        if let Some(format) = metadata.get("text_format").and_then(Value::as_str) {
+            assert_eq!(format, "json");
+        }
+    }
+
+    #[tokio::test]
+    async fn test_xml_format_detection() {
+        let metadata = extract_from_fixture("test_format.xml", "text/plain").await;
+
+        assert_eq!(metadata["file_extension"], "xml");
+
+        // Should detect XML format
+        if let Some(format) = metadata.get("text_format").and_then(Value::as_str) {
+            assert_eq!(format, "xml");
+        }
+    }
+
+    #[tokio::test]
+    async fn test_html_format_detection() {
+        let metadata = extract_from_fixture("test_format.html", "text/plain").await;
+
+        assert_eq!(metadata["file_extension"], "html");
+
+        // Should detect HTML format
+        if let Some(format) = metadata.get("text_format").and_then(Value::as_str) {
+            assert_eq!(format, "html");
+        }
+    }
+
+    #[tokio::test]
+    async fn test_unknown_file_type() {
+        let dummy_data = b"This is some random binary data that doesn't match any known format.";
+
+        let metadata = extract_from_bytes(dummy_data, "application/octet-stream", "unknown.bin").await;
+
+        assert_eq!(metadata["file_type"], "application/octet-stream");
+        assert_eq!(metadata["file_extension"], "bin");
+    }
+
+    #[tokio::test]
+    async fn test_empty_file() {
+        let empty_data = b"";
+
+        // Should still return some metadata (at least file extension)
+        let metadata = extract_from_bytes(empty_data, "text/plain", "empty.txt").await;
+
+        assert_eq!(metadata["file_extension"], "txt");
+    }
+
+    #[tokio::test]
+    async fn test_file_without_extension() {
+        let text_data = b"Some text content without file extension";
+
+        let metadata = extract_from_bytes(text_data, "text/plain", "no_extension").await;
+
+        // Should not have file_extension field
+        assert!(!has_key(&metadata, "file_extension"));
+    }
+}
--- a/src/routes/documents/crud.rs
+++ b/src/routes/documents/crud.rs
@ -78,7 +78,7 @@ pub async fn upload_document(
    use crate::models::FileIngestionInfo;
    use chrono::Utc;
    
-    let file_info = FileIngestionInfo {
+    let mut file_info = FileIngestionInfo {
        path: format!("upload/{}", filename), // Virtual path for web uploads
        name: filename.clone(),
        size: data.len() as i64,
@ -90,9 +90,14 @@ pub async fn upload_document(
        permissions: None, // Web uploads don't have filesystem permissions
        owner: Some(auth_user.user.username.clone()), // Uploader as owner
        group: None, // Web uploads don't have filesystem groups
-        metadata: None, // Could extract EXIF/PDF metadata in the future
+        metadata: None, // Will be populated with extracted metadata below
    };
    
+    // Extract content-based metadata from uploaded file
+    if let Ok(Some(content_metadata)) = crate::metadata_extraction::extract_content_metadata(&data, &content_type, &filename).await {
+        file_info.metadata = Some(content_metadata);
+    }
+    
    // Create ingestion service
    let file_service = FileService::new(state.config.upload_path.clone());
    let ingestion_service = DocumentIngestionService::new(
--- a/src/scheduling/watcher.rs
+++ b/src/scheduling/watcher.rs
@ -337,8 +337,13 @@ async fn process_file(
        }
    }
    
-    // Extract file info with metadata
-    let file_info = extract_file_info_from_path(path).await?;
+    // Extract basic file info first
+    let mut file_info = extract_file_info_from_path(path).await?;
+    
+    // Extract content-based metadata
+    if let Ok(Some(content_metadata)) = crate::metadata_extraction::extract_content_metadata(&file_data, &file_info.mime_type, &file_info.name).await {
+        file_info.metadata = Some(content_metadata);
+    }
    
    // Use the unified ingestion service for consistent deduplication
    let ingestion_service = DocumentIngestionService::new(db.clone(), file_service.clone());
--- a/test_files/ascii_only.txt
+++ b/test_files/ascii_only.txt
@ -0,0 +1,5 @@
+Pure ASCII text document without any Unicode characters.
+This file contains only standard ASCII characters from the basic set.
+Numbers: 0123456789
+Punctuation: .,;:!?'"()-[]{}
+All characters should be ASCII-only for testing encoding detection.
--- a/test_files/complex_content.pdf
+++ b/test_files/complex_content.pdf
@ -0,0 +1,80 @@
+%PDF-1.3
+%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
+1 0 obj
+<<
+/F1 2 0 R /F2 3 0 R /F3 4 0 R
+>>
+endobj
+2 0 obj
+<<
+/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
+>>
+endobj
+3 0 obj
+<<
+/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
+>>
+endobj
+4 0 obj
+<<
+/BaseFont /Courier /Encoding /WinAnsiEncoding /Name /F3 /Subtype /Type1 /Type /Font
+>>
+endobj
+5 0 obj
+<<
+/Contents 9 0 R /MediaBox [ 0 0 612 792 ] /Parent 8 0 R /Resources <<
+/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
+>> /Rotate 0 /Trans <<
+
+>> 
+  /Type /Page
+>>
+endobj
+6 0 obj
+<<
+/PageMode /UseNone /Pages 8 0 R /Type /Catalog
+>>
+endobj
+7 0 obj
+<<
+/Author (anonymous) /CreationDate (D:20250710214218+00'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20250710214218+00'00') /Producer (ReportLab PDF Library - www.reportlab.com) 
+  /Subject (unspecified) /Title (Complex PDF with Fonts) /Trapped /False
+>>
+endobj
+8 0 obj
+<<
+/Count 1 /Kids [ 5 0 R ] /Type /Pages
+>>
+endobj
+9 0 obj
+<<
+/Filter [ /ASCII85Decode /FlateDecode ] /Length 246
+>>
+stream
+Garp%aUZ01'Euj3^Z$lr70RKbDN26UQ2En"QspCZ"fK-0V7'.H&;JtnEutX^`S4<o?m.72F)V/*!70(%EeBWNg0B/(gO,TM=3`mZ1PT&X+>!djY>/-1Ll7*9[\kUD]$a]-(W-gT#W[42^Y:qOp/#=7!qndZ#V1iAYW[K/"S#OYOFENi#\m$3$pO%_hg)82^%7pMPXa0S88Np"d23mBn#h"bUhu6<Y6qlocNt;-:Db!E.?Zcu(gLV~>endstream
+endobj
+xref
+0 10
+0000000000 65535 f 
+0000000073 00000 n 
+0000000124 00000 n 
+0000000231 00000 n 
+0000000343 00000 n 
+0000000448 00000 n 
+0000000641 00000 n 
+0000000709 00000 n 
+0000001019 00000 n 
+0000001078 00000 n 
+trailer
+<<
+/ID 
+[<915ab5b109826181e9414e12bf7351a6><915ab5b109826181e9414e12bf7351a6>]
+% ReportLab generated PDF document -- digest (http://www.reportlab.com)
+
+/Info 7 0 R
+/Root 6 0 R
+/Size 10
+>>
+startxref
+1414
+%%EOF
--- a/test_files/comprehensive_text.txt
+++ b/test_files/comprehensive_text.txt
@ -0,0 +1,15 @@
+This is a comprehensive test document for text metadata extraction.
+
+It contains multiple paragraphs, various types of content, and different characteristics.
+Word count: This sentence has exactly seven words counting properly.
+Line counting: Each line should be counted separately for accurate statistics.
+
+Unicode content: café, naïve, résumé, piñata, Zürich, москва, 東京, 🎉✨🔥
+Numbers and mixed content: 123 ABC def456 GHI789 test@example.com
+
+Special formatting:
+- Bulleted lists
+- Multiple items
+- With various content
+
+The document ends here with a final paragraph.
--- a/test_files/create_metadata_test_files.py
+++ b/test_files/create_metadata_test_files.py
@ -0,0 +1,266 @@
+#!/usr/bin/env python3
+"""
+Create test files for metadata extraction testing.
+"""
+
+import os
+import sys
+from pathlib import Path
+
+# Try to import PIL for image creation
+try:
+    from PIL import Image, ImageDraw, ImageFont
+    from PIL.ExifTags import TAGS
+    from PIL.ExifTags import GPSTAGS
+    PIL_AVAILABLE = True
+except ImportError:
+    print("PIL not available, skipping image creation with EXIF")
+    PIL_AVAILABLE = False
+
+# Try to import reportlab for PDF creation
+try:
+    from reportlab.pdfgen import canvas
+    from reportlab.lib.pagesizes import letter, A4
+    from reportlab.pdfbase import pdfmetrics
+    from reportlab.pdfbase.ttfonts import TTFont
+    REPORTLAB_AVAILABLE = True
+except ImportError:
+    print("reportlab not available, creating simple PDF-like files")
+    REPORTLAB_AVAILABLE = False
+
+def create_test_images():
+    """Create test images with various properties."""
+    if not PIL_AVAILABLE:
+        print("Skipping image creation - PIL not available")
+        return
+    
+    print("Creating test images...")
+    
+    # 1. Portrait image (100x200)
+    img = Image.new('RGB', (100, 200), color='lightblue')
+    draw = ImageDraw.Draw(img)
+    draw.text((10, 50), "Portrait\n100x200", fill='black')
+    img.save('test_files/portrait_100x200.png')
+    
+    # 2. Landscape image (300x200)
+    img = Image.new('RGB', (300, 200), color='lightgreen')
+    draw = ImageDraw.Draw(img)
+    draw.text((50, 50), "Landscape 300x200", fill='black')
+    img.save('test_files/landscape_300x200.png')
+    
+    # 3. Square image (150x150)
+    img = Image.new('RGB', (150, 150), color='lightyellow')
+    draw = ImageDraw.Draw(img)
+    draw.text((25, 50), "Square\n150x150", fill='black')
+    img.save('test_files/square_150x150.png')
+    
+    # 4. High resolution image (1920x1080)
+    img = Image.new('RGB', (1920, 1080), color='lightcoral')
+    draw = ImageDraw.Draw(img)
+    try:
+        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 40)
+    except:
+        font = ImageFont.load_default()
+    draw.text((100, 500), "High Resolution\n1920x1080\n2.07 Megapixels", fill='black', font=font)
+    img.save('test_files/hires_1920x1080.png')
+    
+    # 5. Small image (50x50)
+    img = Image.new('RGB', (50, 50), color='lightgray')
+    img.save('test_files/small_50x50.png')
+    
+    # 6. JPEG with different color mode
+    img = Image.new('RGB', (200, 200), color='purple')
+    draw = ImageDraw.Draw(img)
+    draw.text((50, 50), "JPEG\nTest", fill='white')
+    img.save('test_files/test_image.jpg', 'JPEG')
+    
+    print("Created test images")
+
+def create_test_pdfs():
+    """Create test PDFs with various properties."""
+    if not REPORTLAB_AVAILABLE:
+        print("Creating simple PDF-like files...")
+        # Create simple files that look like PDF headers
+        simple_pdfs = [
+            ("%PDF-1.4\n1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n>>\nendobj\n2 0 obj\n<<\n/Type /Pages\n/Kids [3 0 R]\n/Count 1\n>>\nendobj\n3 0 obj\n<<\n/Type /Page\n/Parent 2 0 R\n/MediaBox [0 0 612 792]\n>>\nendobj\nxref\n0 4\n0000000000 65535 f \n0000000009 00000 n \n0000000074 00000 n \n0000000120 00000 n \ntrailer\n<<\n/Size 4\n/Root 1 0 R\n>>\nstartxref\n179\n%%EOF", "simple_v14.pdf"),
+            ("%PDF-1.7\n1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n>>\nendobj\n2 0 obj\n<<\n/Type /Pages\n/Kids [3 0 R 4 0 R]\n/Count 2\n>>\nendobj\n3 0 obj\n<<\n/Type /Page\n/Parent 2 0 R\n>>\nendobj\n4 0 obj\n<<\n/Type /Page\n/Parent 2 0 R\n>>\nendobj\nxref\n0 5\ntrailer\n<<\n/Size 5\n/Root 1 0 R\n>>\n%%EOF", "multipage_v17.pdf"),
+            ("%PDF-1.5\n1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n/Linearized true\n>>\nendobj\n2 0 obj\n<<\n/Type /Pages\n/Kids [3 0 R]\n/Count 1\n>>\nendobj\n3 0 obj\n<<\n/Type /Page\n/Parent 2 0 R\n/Font 4 0 R\n/Image 5 0 R\n>>\nendobj\nxref\n0 4\ntrailer\n<<\n/Size 4\n/Root 1 0 R\n>>\n%%EOF", "with_fonts_images.pdf"),
+        ]
+        
+        for content, filename in simple_pdfs:
+            with open(f'test_files/{filename}', 'wb') as f:
+                f.write(content.encode('latin1'))
+        print("Created simple PDF-like files")
+        return
+    
+    print("Creating test PDFs with reportlab...")
+    
+    # 1. Single page PDF v1.4
+    c = canvas.Canvas('test_files/single_page_v14.pdf', pagesize=letter)
+    c.setTitle("Single Page Test Document")
+    c.setAuthor("Test Author")
+    c.setSubject("Test Subject")
+    c.setCreator("Python reportlab")
+    c.setFont("Helvetica", 12)
+    c.drawString(100, 750, "Single Page PDF Document")
+    c.drawString(100, 700, "This is a test PDF for metadata extraction.")
+    c.drawString(100, 650, "It should be detected as PDF version 1.4")
+    c.save()
+    
+    # 2. Multi-page PDF
+    c = canvas.Canvas('test_files/multipage_test.pdf', pagesize=A4)
+    c.setTitle("Multi-page Test Document")
+    # Page 1
+    c.setFont("Helvetica", 14)
+    c.drawString(100, 800, "Page 1 of Multi-page Document")
+    c.drawString(100, 750, "This document has multiple pages.")
+    c.showPage()
+    # Page 2
+    c.drawString(100, 800, "Page 2 of Multi-page Document")
+    c.drawString(100, 750, "Second page content here.")
+    c.showPage()
+    # Page 3
+    c.drawString(100, 800, "Page 3 - Final Page")
+    c.drawString(100, 750, "Third and final page.")
+    c.save()
+    
+    # 3. PDF with fonts and complex content
+    c = canvas.Canvas('test_files/complex_content.pdf', pagesize=letter)
+    c.setTitle("Complex PDF with Fonts")
+    c.setFont("Helvetica-Bold", 16)
+    c.drawString(100, 750, "Document with Multiple Fonts")
+    c.setFont("Helvetica", 12)
+    c.drawString(100, 700, "This document contains multiple font types.")
+    c.setFont("Courier", 10)
+    c.drawString(100, 650, "Some monospace text for variety.")
+    # Add some graphics/lines
+    c.line(100, 600, 500, 600)
+    c.rect(100, 550, 200, 30)
+    c.save()
+    
+    print("Created test PDFs")
+
+def create_text_files():
+    """Create various text files for testing."""
+    print("Creating test text files...")
+    
+    # 1. Plain text with various content
+    content = """This is a comprehensive test document for text metadata extraction.
+
+It contains multiple paragraphs, various types of content, and different characteristics.
+Word count: This sentence has exactly seven words counting properly.
+Line counting: Each line should be counted separately for accurate statistics.
+
+Unicode content: café, naïve, résumé, piñata, Zürich, москва, 東京, 🎉✨🔥
+Numbers and mixed content: 123 ABC def456 GHI789 test@example.com
+
+Special formatting:
+- Bulleted lists
+- Multiple items
+- With various content
+
+The document ends here with a final paragraph."""
+    
+    with open('test_files/comprehensive_text.txt', 'w', encoding='utf-8') as f:
+        f.write(content)
+    
+    # 2. JSON format text
+    json_content = """{
+  "document": {
+    "title": "Test JSON Document",
+    "type": "metadata_test",
+    "properties": {
+      "word_count": 25,
+      "format": "json",
+      "encoding": "utf-8"
+    },
+    "content": [
+      "This JSON should be detected as JSON format",
+      "It contains structured data in JSON format"
+    ]
+  }
+}"""
+    
+    with open('test_files/test_format.json', 'w') as f:
+        f.write(json_content)
+    
+    # 3. XML format text
+    xml_content = """<?xml version="1.0" encoding="UTF-8"?>
+<document type="test">
+  <metadata>
+    <title>XML Test Document</title>
+    <format>xml</format>
+    <word_count>15</word_count>
+  </metadata>
+  <content>
+    <section>This XML document should be detected as XML format.</section>
+    <section>It contains structured markup for testing.</section>
+  </content>
+</document>"""
+    
+    with open('test_files/test_format.xml', 'w') as f:
+        f.write(xml_content)
+    
+    # 4. HTML format text
+    html_content = """<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>HTML Test Document</title>
+</head>
+<body>
+    <h1>HTML Test Page</h1>
+    <p>This document should be detected as HTML format.</p>
+    <p>It contains HTML markup and structure.</p>
+    <ul>
+        <li>List item one</li>
+        <li>List item two</li>
+    </ul>
+</body>
+</html>"""
+    
+    with open('test_files/test_format.html', 'w') as f:
+        f.write(html_content)
+    
+    # 5. Large text file for performance testing
+    large_content = "This is a large text file for testing performance. " * 1000
+    large_content += "\nEnd of large file with final line."
+    
+    with open('test_files/large_text.txt', 'w') as f:
+        f.write(large_content)
+    
+    # 6. ASCII-only text
+    ascii_content = """Pure ASCII text document without any Unicode characters.
+This file contains only standard ASCII characters from the basic set.
+Numbers: 0123456789
+Punctuation: .,;:!?'"()-[]{}
+All characters should be ASCII-only for testing encoding detection."""
+    
+    with open('test_files/ascii_only.txt', 'w') as f:
+        f.write(ascii_content)
+    
+    print("Created test text files")
+
+def main():
+    """Create all test files."""
+    # Ensure test_files directory exists
+    os.makedirs('test_files', exist_ok=True)
+    
+    print("Creating test files for metadata extraction testing...")
+    
+    create_text_files()
+    create_test_images()
+    create_test_pdfs()
+    
+    print("\nAll test files created successfully!")
+    print("Files created in test_files/ directory:")
+    
+    # List all created files
+    test_files = sorted(Path('test_files').glob('*'))
+    for file_path in test_files:
+        if file_path.is_file() and not file_path.name.endswith('.py'):
+            size = file_path.stat().st_size
+            print(f"  {file_path.name} ({size} bytes)")
+
+if __name__ == "__main__":
+    main()
--- a/test_files/create_test_image.rs
+++ b/test_files/create_test_image.rs
@ -0,0 +1,42 @@
+// This is a helper script to create test images
+use image::{ImageBuffer, Rgb, DynamicImage};
+use std::path::Path;
+
+/// Renders three synthetic RGB test images and saves them as PNG files under
+/// `test_files/`: a 100x200 portrait, a 300x200 landscape and a 150x150 square.
+///
+/// Returns the first error reported while saving an image.
+pub fn create_test_images() -> Result<(), Box<dyn std::error::Error>> {
+    // Portrait (100x200): red ramps with x, green ramps with y, blue is constant.
+    let portrait = ImageBuffer::from_fn(100, 200, |x, y| {
+        let red = (x * 255 / 100) as u8;
+        let green = (y * 255 / 200) as u8;
+        Rgb([red, green, 128])
+    });
+    DynamicImage::ImageRgb8(portrait).save("test_files/sample_portrait.png")?;
+
+    // Landscape (300x200): red falls with x, green ramps with y, blue mixes both.
+    let landscape = ImageBuffer::from_fn(300, 200, |x, y| {
+        let red = 255 - (x * 255 / 300) as u8;
+        let green = (y * 255 / 200) as u8;
+        let blue = (x + y) as u8 % 255;
+        Rgb([red, green, blue])
+    });
+    DynamicImage::ImageRgb8(landscape).save("test_files/sample_landscape.png")?;
+
+    // Square (150x150): intensity falls off with squared distance from (75, 75).
+    let square = ImageBuffer::from_fn(150, 150, |x, y| {
+        let dist_sq = ((x as i32 - 75).pow(2) + (y as i32 - 75).pow(2)) as f32;
+        let level = (255.0 * (1.0 - dist_sq / (75.0 * 75.0))).max(0.0) as u8;
+        Rgb([level, 0, 255 - level])
+    });
+    DynamicImage::ImageRgb8(square).save("test_files/sample_square.png")?;
+
+    Ok(())
+}
--- a/test_files/hires_1920x1080.png
+++ b/test_files/hires_1920x1080.png
--- a/test_files/landscape_300x200.png
+++ b/test_files/landscape_300x200.png
--- a/test_files/large_text.txt
+++ b/test_files/large_text.txt
--- a/test_files/multipage_test.pdf
+++ b/test_files/multipage_test.pdf
@ -0,0 +1,106 @@
+%PDF-1.3
+%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
+1 0 obj
+<<
+/F1 2 0 R
+>>
+endobj
+2 0 obj
+<<
+/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
+>>
+endobj
+3 0 obj
+<<
+/Contents 9 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 8 0 R /Resources <<
+/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
+>> /Rotate 0 /Trans <<
+
+>> 
+  /Type /Page
+>>
+endobj
+4 0 obj
+<<
+/Contents 10 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 8 0 R /Resources <<
+/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
+>> /Rotate 0 /Trans <<
+
+>> 
+  /Type /Page
+>>
+endobj
+5 0 obj
+<<
+/Contents 11 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 8 0 R /Resources <<
+/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
+>> /Rotate 0 /Trans <<
+
+>> 
+  /Type /Page
+>>
+endobj
+6 0 obj
+<<
+/PageMode /UseNone /Pages 8 0 R /Type /Catalog
+>>
+endobj
+7 0 obj
+<<
+/Author (anonymous) /CreationDate (D:20250710214218+00'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20250710214218+00'00') /Producer (ReportLab PDF Library - www.reportlab.com) 
+  /Subject (unspecified) /Title (Multi-page Test Document) /Trapped /False
+>>
+endobj
+8 0 obj
+<<
+/Count 3 /Kids [ 3 0 R 4 0 R 5 0 R ] /Type /Pages
+>>
+endobj
+9 0 obj
+<<
+/Filter [ /ASCII85Decode /FlateDecode ] /Length 169
+>>
+stream
+GapQh0E=F,0U\H3T\pNYT^QKk?tc>IP,;W#U1^23ihPEM_?CT3!K3H3=WkWlTgfk;(arJdECd6a7R1F/.SqNX9u4:qV^,3M!5s*':G0mdbq%1@#g#sMdDDX6g_?JG,U9Wt+U+lN+X=6o+W"Z_5u+E]aJ<9.,U4#5!.\</LB~>endstream
+endobj
+10 0 obj
+<<
+/Filter [ /ASCII85Decode /FlateDecode ] /Length 155
+>>
+stream
+GapQh0E=F,0U\H3T\pNYT^QKk?tc>IP,;W#U1^23ihPEM_?CW4KISi90MntRifICKNI\t><!=M#r%nT7d[ZIoh#\/F,sSps2`7d,;^!DZ+[c]rG&pT<p>FH]r4E^IYV%'!"K4m8,IJq"%nWf'(B@ns.4,~>endstream
+endobj
+11 0 obj
+<<
+/Filter [ /ASCII85Decode /FlateDecode ] /Length 139
+>>
+stream
+GapQh0E=F,0U\H3T\pNYT^QKk?tc>IP,;W#U1^23ihPEM_?CW4KISi90MntRifICKNI\t><!?b8>m7^]]Ykcn&6DdR,YK:K$]??Q;i%<4N6J"1KuCV<Kf$$:!h_//,]*ln!4=[/%f~>endstream
+endobj
+xref
+0 12
+0000000000 65535 f 
+0000000073 00000 n 
+0000000104 00000 n 
+0000000211 00000 n 
+0000000414 00000 n 
+0000000618 00000 n 
+0000000822 00000 n 
+0000000890 00000 n 
+0000001202 00000 n 
+0000001273 00000 n 
+0000001532 00000 n 
+0000001778 00000 n 
+trailer
+<<
+/ID 
+[<3b1faa812dbde8df21a25a9c074387ca><3b1faa812dbde8df21a25a9c074387ca>]
+% ReportLab generated PDF document -- digest (http://www.reportlab.com)
+
+/Info 7 0 R
+/Root 6 0 R
+/Size 12
+>>
+startxref
+2008
+%%EOF
--- a/test_files/portrait_100x200.png
+++ b/test_files/portrait_100x200.png
--- a/test_files/sample.html
+++ b/test_files/sample.html
@ -0,0 +1,13 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Sample HTML Document</title>
+</head>
+<body>
+    <h1>Test HTML File</h1>
+    <p>This is a sample HTML document for testing format detection.</p>
+    <p>It contains multiple paragraphs and should be detected as HTML.</p>
+</body>
+</html>
--- a/test_files/sample.json
+++ b/test_files/sample.json
@ -0,0 +1,12 @@
+{
+  "name": "Test Document",
+  "type": "sample",
+  "metadata": {
+    "created": "2024-01-01",
+    "author": "Test User"
+  },
+  "content": [
+    "This is a JSON file",
+    "Used for testing text format detection"
+  ]
+}
--- a/test_files/sample.txt
+++ b/test_files/sample.txt
@ -0,0 +1,8 @@
+This is a sample text file for testing metadata extraction.
+It contains multiple lines and various words.
+The quick brown fox jumps over the lazy dog.
+This file is used to test character count, word count, and line count extraction.
+
+Some special characters: áéíóú, çñ, and emojis 🎉✨
+
+This should help test Unicode detection as well.
--- a/test_files/sample.xml
+++ b/test_files/sample.xml
@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<document>
+  <title>Sample XML Document</title>
+  <content>
+    <paragraph>This is a sample XML file for testing.</paragraph>
+    <paragraph>It should be detected as XML format.</paragraph>
+  </content>
+  <metadata>
+    <author>Test User</author>
+    <created>2024-01-01</created>
+  </metadata>
+</document>
--- a/test_files/single_page_v14.pdf
+++ b/test_files/single_page_v14.pdf
@ -0,0 +1,68 @@
+%PDF-1.3
+%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
+1 0 obj
+<<
+/F1 2 0 R
+>>
+endobj
+2 0 obj
+<<
+/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
+>>
+endobj
+3 0 obj
+<<
+/Contents 7 0 R /MediaBox [ 0 0 612 792 ] /Parent 6 0 R /Resources <<
+/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
+>> /Rotate 0 /Trans <<
+
+>> 
+  /Type /Page
+>>
+endobj
+4 0 obj
+<<
+/PageMode /UseNone /Pages 6 0 R /Type /Catalog
+>>
+endobj
+5 0 obj
+<<
+/Author (Test Author) /CreationDate (D:20250710214218+00'00') /Creator (Python reportlab) /Keywords () /ModDate (D:20250710214218+00'00') /Producer (ReportLab PDF Library - www.reportlab.com) 
+  /Subject (Test Subject) /Title (Single Page Test Document) /Trapped /False
+>>
+endobj
+6 0 obj
+<<
+/Count 1 /Kids [ 3 0 R ] /Type /Pages
+>>
+endobj
+7 0 obj
+<<
+/Filter [ /ASCII85Decode /FlateDecode ] /Length 206
+>>
+stream
+Gas2B]aDVA&;9pC`KY*LD^B-O0d'R(h@#jac&^q+OP>SPmh&K#W6@$9_Y5bf'gkod%QJ.'$Nb1<.o;ODct=>@o&ifTJs=-$X.UIKf=%kE`l6H4ncH?T8&r:oU$p@/Dkq]C\nM9+)$gJ8i5uRL/B/S_i27!\Ph,TnDqT$"E>!rEO.F7L@9L)S[P2.<5'R?1"%om1'9X0s@Rg.~>endstream
+endobj
+xref
+0 8
+0000000000 65535 f 
+0000000073 00000 n 
+0000000104 00000 n 
+0000000211 00000 n 
+0000000404 00000 n 
+0000000472 00000 n 
+0000000763 00000 n 
+0000000822 00000 n 
+trailer
+<<
+/ID 
+[<eb148edbf508f579851c4559676f2901><eb148edbf508f579851c4559676f2901>]
+% ReportLab generated PDF document -- digest (http://www.reportlab.com)
+
+/Info 5 0 R
+/Root 4 0 R
+/Size 8
+>>
+startxref
+1118
+%%EOF
--- a/test_files/small_50x50.png
+++ b/test_files/small_50x50.png
--- a/test_files/square_150x150.png
+++ b/test_files/square_150x150.png
--- a/test_files/test_format.html
+++ b/test_files/test_format.html
@ -0,0 +1,16 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>HTML Test Document</title>
+</head>
+<body>
+    <h1>HTML Test Page</h1>
+    <p>This document should be detected as HTML format.</p>
+    <p>It contains HTML markup and structure.</p>
+    <ul>
+        <li>List item one</li>
+        <li>List item two</li>
+    </ul>
+</body>
+</html>
--- a/test_files/test_format.json
+++ b/test_files/test_format.json
@ -0,0 +1,15 @@
+{
+  "document": {
+    "title": "Test JSON Document",
+    "type": "metadata_test",
+    "properties": {
+      "word_count": 25,
+      "format": "json",
+      "encoding": "utf-8"
+    },
+    "content": [
+      "This JSON should be detected as JSON format",
+      "It contains structured data in JSON format"
+    ]
+  }
+}
--- a/test_files/test_format.xml
+++ b/test_files/test_format.xml
@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<document type="test">
+  <metadata>
+    <title>XML Test Document</title>
+    <format>xml</format>
+    <word_count>15</word_count>
+  </metadata>
+  <content>
+    <section>This XML document should be detected as XML format.</section>
+    <section>It contains structured markup for testing.</section>
+  </content>
+</document>
--- a/test_files/test_image.jpg
+++ b/test_files/test_image.jpg