diff --git a/src/ingestion/batch_ingest.rs b/src/ingestion/batch_ingest.rs index 62c20d6..b865dab 100644 --- a/src/ingestion/batch_ingest.rs +++ b/src/ingestion/batch_ingest.rs @@ -230,8 +230,8 @@ async fn process_single_file( user_id: Uuid, db: Database, ) -> Result> { - // Extract file info with metadata - let file_info = extract_file_info_from_path(&path).await?; + // Extract basic file info first + let mut file_info = extract_file_info_from_path(&path).await?; // Skip very large files (> 100MB) if file_info.size > 100 * 1024 * 1024 { @@ -242,6 +242,11 @@ async fn process_single_file( // Read file data let file_data = fs::read(&path).await?; + // Extract content-based metadata + if let Ok(Some(content_metadata)) = crate::metadata_extraction::extract_content_metadata(&file_data, &file_info.mime_type, &file_info.name).await { + file_info.metadata = Some(content_metadata); + } + // Use the unified ingestion service with full metadata support let ingestion_service = DocumentIngestionService::new(db, file_service); diff --git a/src/metadata_extraction.rs b/src/metadata_extraction.rs index 7008410..1397464 100644 --- a/src/metadata_extraction.rs +++ b/src/metadata_extraction.rs @@ -1,6 +1,5 @@ use anyhow::Result; use serde_json::{Map, Value}; -use std::collections::HashMap; /// Extract metadata from file content based on file type pub async fn extract_content_metadata(file_data: &[u8], mime_type: &str, filename: &str) -> Result> { @@ -176,4 +175,7 @@ async fn extract_text_metadata(file_data: &[u8]) -> Result> { } Ok(metadata) -} \ No newline at end of file +} + +#[cfg(test)] +mod tests; \ No newline at end of file diff --git a/src/metadata_extraction/tests.rs b/src/metadata_extraction/tests.rs new file mode 100644 index 0000000..5b016a8 --- /dev/null +++ b/src/metadata_extraction/tests.rs @@ -0,0 +1,320 @@ +#[cfg(test)] +mod tests { + use super::*; + use std::fs; + use serde_json::Value; + + #[tokio::test] + async fn test_image_metadata_extraction_portrait() { + let image_data = fs::read("test_files/portrait_100x200.png").expect("Failed to read portrait test image"); + + let metadata = extract_content_metadata(&image_data, "image/png", "portrait_100x200.png") + .await + .expect("Failed to extract metadata"); + + assert!(metadata.is_some()); + let metadata = metadata.unwrap(); + + // Check basic image properties + assert_eq!(metadata["image_width"], Value::Number(100.into())); + assert_eq!(metadata["image_height"], Value::Number(200.into())); + assert_eq!(metadata["orientation"], Value::String("portrait".to_string())); + assert_eq!(metadata["file_extension"], Value::String("png".to_string())); + + // Check calculated values + assert_eq!(metadata["aspect_ratio"], Value::String("0.50".to_string())); + assert_eq!(metadata["megapixels"], Value::String("0.0 MP".to_string())); + } + + #[tokio::test] + async fn test_image_metadata_extraction_landscape() { + let image_data = fs::read("test_files/landscape_300x200.png").expect("Failed to read landscape test image"); + + let metadata = extract_content_metadata(&image_data, "image/png", "landscape_300x200.png") + .await + .expect("Failed to extract metadata"); + + assert!(metadata.is_some()); + let metadata = metadata.unwrap(); + + assert_eq!(metadata["image_width"], Value::Number(300.into())); + assert_eq!(metadata["image_height"], Value::Number(200.into())); + assert_eq!(metadata["orientation"], Value::String("landscape".to_string())); + assert_eq!(metadata["aspect_ratio"], Value::String("1.50".to_string())); + } + + #[tokio::test] + async fn test_image_metadata_extraction_square() { + let image_data = fs::read("test_files/square_150x150.png").expect("Failed to read square test image"); + + let metadata = extract_content_metadata(&image_data, "image/png", "square_150x150.png") + .await + .expect("Failed to extract metadata"); + + assert!(metadata.is_some()); + let metadata = metadata.unwrap(); + + assert_eq!(metadata["image_width"], Value::Number(150.into())); + assert_eq!(metadata["image_height"], Value::Number(150.into())); + assert_eq!(metadata["orientation"], Value::String("square".to_string())); + assert_eq!(metadata["aspect_ratio"], Value::String("1.00".to_string())); + } + + #[tokio::test] + async fn test_image_metadata_extraction_high_resolution() { + let image_data = fs::read("test_files/hires_1920x1080.png").expect("Failed to read high-res test image"); + + let metadata = extract_content_metadata(&image_data, "image/png", "hires_1920x1080.png") + .await + .expect("Failed to extract metadata"); + + assert!(metadata.is_some()); + let metadata = metadata.unwrap(); + + assert_eq!(metadata["image_width"], Value::Number(1920.into())); + assert_eq!(metadata["image_height"], Value::Number(1080.into())); + assert_eq!(metadata["orientation"], Value::String("landscape".to_string())); + assert_eq!(metadata["megapixels"], Value::String("2.1 MP".to_string())); + } + + #[tokio::test] + async fn test_jpeg_metadata_extraction() { + let image_data = fs::read("test_files/test_image.jpg").expect("Failed to read JPEG test image"); + + let metadata = extract_content_metadata(&image_data, "image/jpeg", "test_image.jpg") + .await + .expect("Failed to extract metadata"); + + assert!(metadata.is_some()); + let metadata = metadata.unwrap(); + + assert_eq!(metadata["file_extension"], Value::String("jpg".to_string())); + assert!(metadata.contains_key("image_width")); + assert!(metadata.contains_key("image_height")); + } + + #[tokio::test] + async fn test_pdf_metadata_extraction_single_page() { + let pdf_data = fs::read("test_files/single_page_v14.pdf").expect("Failed to read single page PDF"); + + let metadata = extract_content_metadata(&pdf_data, "application/pdf", "single_page_v14.pdf") + .await + .expect("Failed to extract metadata"); + + assert!(metadata.is_some()); + let metadata = metadata.unwrap(); + + assert_eq!(metadata["file_extension"], Value::String("pdf".to_string())); + // Note: PDF version detection might vary depending on how reportlab creates the file + assert!(metadata.contains_key("pdf_version") || metadata.contains_key("file_type")); + } + + #[tokio::test] + async fn test_pdf_metadata_extraction_multipage() { + let pdf_data = fs::read("test_files/multipage_test.pdf").expect("Failed to read multipage PDF"); + + let metadata = extract_content_metadata(&pdf_data, "application/pdf", "multipage_test.pdf") + .await + .expect("Failed to extract metadata"); + + assert!(metadata.is_some()); + let metadata = metadata.unwrap(); + + assert_eq!(metadata["file_extension"], Value::String("pdf".to_string())); + // Should detect multiple pages if our page counting works + if let Some(page_count) = metadata.get("page_count") { + if let Value::Number(count) = page_count { + assert!(count.as_u64().unwrap() > 1); + } + } + } + + #[tokio::test] + async fn test_pdf_metadata_with_fonts_and_images() { + let pdf_data = fs::read("test_files/complex_content.pdf").expect("Failed to read complex PDF"); + + let metadata = extract_content_metadata(&pdf_data, "application/pdf", "complex_content.pdf") + .await + .expect("Failed to extract metadata"); + + assert!(metadata.is_some()); + let metadata = metadata.unwrap(); + + // Should detect fonts and potentially images/objects + if let Some(Value::Bool(has_fonts)) = metadata.get("contains_fonts") { + // Font detection might work depending on PDF structure + } + } + + #[tokio::test] + async fn test_text_metadata_extraction_comprehensive() { + let text_data = fs::read("test_files/comprehensive_text.txt").expect("Failed to read comprehensive text"); + + let metadata = extract_content_metadata(&text_data, "text/plain", "comprehensive_text.txt") + .await + .expect("Failed to extract metadata"); + + assert!(metadata.is_some()); + let metadata = metadata.unwrap(); + + assert_eq!(metadata["file_extension"], Value::String("txt".to_string())); + + // Check text statistics + if let Value::Number(char_count) = &metadata["character_count"] { + assert!(char_count.as_u64().unwrap() > 500); // Should be substantial + } + + if let Value::Number(word_count) = &metadata["word_count"] { + assert!(word_count.as_u64().unwrap() > 80); // Should have many words + } + + if let Value::Number(line_count) = &metadata["line_count"] { + assert!(line_count.as_u64().unwrap() > 15); // Should have multiple lines + } + + // Should detect Unicode content + assert_eq!(metadata["contains_unicode"], Value::Bool(true)); + + // Should detect likely English + if let Some(Value::String(lang)) = metadata.get("likely_language") { + assert_eq!(lang, "english"); + } + } + + #[tokio::test] + async fn test_text_metadata_extraction_ascii_only() { + let text_data = fs::read("test_files/ascii_only.txt").expect("Failed to read ASCII text"); + + let metadata = extract_content_metadata(&text_data, "text/plain", "ascii_only.txt") + .await + .expect("Failed to extract metadata"); + + assert!(metadata.is_some()); + let metadata = metadata.unwrap(); + + // Should NOT contain Unicode + assert!(metadata.get("contains_unicode").is_none() || metadata["contains_unicode"] == Value::Bool(false)); + } + + #[tokio::test] + async fn test_text_metadata_extraction_large_file() { + let text_data = fs::read("test_files/large_text.txt").expect("Failed to read large text"); + + let metadata = extract_content_metadata(&text_data, "text/plain", "large_text.txt") + .await + .expect("Failed to extract metadata"); + + assert!(metadata.is_some()); + let metadata = metadata.unwrap(); + + // Should handle large files properly + if let Value::Number(char_count) = &metadata["character_count"] { + assert!(char_count.as_u64().unwrap() > 50000); // Should be large + } + + if let Value::Number(word_count) = &metadata["word_count"] { + assert!(word_count.as_u64().unwrap() > 10000); // Should have many words + } + } + + #[tokio::test] + async fn test_json_format_detection() { + let text_data = fs::read("test_files/test_format.json").expect("Failed to read JSON text"); + + let metadata = extract_content_metadata(&text_data, "text/plain", "test_format.json") + .await + .expect("Failed to extract metadata"); + + assert!(metadata.is_some()); + let metadata = metadata.unwrap(); + + assert_eq!(metadata["file_extension"], Value::String("json".to_string())); + + // Should detect JSON format + if let Some(Value::String(format)) = metadata.get("text_format") { + assert_eq!(format, "json"); + } + } + + #[tokio::test] + async fn test_xml_format_detection() { + let text_data = fs::read("test_files/test_format.xml").expect("Failed to read XML text"); + + let metadata = extract_content_metadata(&text_data, "text/plain", "test_format.xml") + .await + .expect("Failed to extract metadata"); + + assert!(metadata.is_some()); + let metadata = metadata.unwrap(); + + assert_eq!(metadata["file_extension"], Value::String("xml".to_string())); + + // Should detect XML format + if let Some(Value::String(format)) = metadata.get("text_format") { + assert_eq!(format, "xml"); + } + } + + #[tokio::test] + async fn test_html_format_detection() { + let text_data = fs::read("test_files/test_format.html").expect("Failed to read HTML text"); + + let metadata = extract_content_metadata(&text_data, "text/plain", "test_format.html") + .await + .expect("Failed to extract metadata"); + + assert!(metadata.is_some()); + let metadata = metadata.unwrap(); + + assert_eq!(metadata["file_extension"], Value::String("html".to_string())); + + // Should detect HTML format + if let Some(Value::String(format)) = metadata.get("text_format") { + assert_eq!(format, "html"); + } + } + + #[tokio::test] + async fn test_unknown_file_type() { + let dummy_data = b"This is some random binary data that doesn't match any known format."; + + let metadata = extract_content_metadata(dummy_data, "application/octet-stream", "unknown.bin") + .await + .expect("Failed to extract metadata"); + + assert!(metadata.is_some()); + let metadata = metadata.unwrap(); + + assert_eq!(metadata["file_type"], Value::String("application/octet-stream".to_string())); + assert_eq!(metadata["file_extension"], Value::String("bin".to_string())); + } + + #[tokio::test] + async fn test_empty_file() { + let empty_data = b""; + + let metadata = extract_content_metadata(empty_data, "text/plain", "empty.txt") + .await + .expect("Failed to extract metadata"); + + // Should still return some metadata (at least file extension) + assert!(metadata.is_some()); + let metadata = metadata.unwrap(); + assert_eq!(metadata["file_extension"], Value::String("txt".to_string())); + } + + #[tokio::test] + async fn test_file_without_extension() { + let text_data = b"Some text content without file extension"; + + let metadata = extract_content_metadata(text_data, "text/plain", "no_extension") + .await + .expect("Failed to extract metadata"); + + assert!(metadata.is_some()); + let metadata = metadata.unwrap(); + + // Should not have file_extension field + assert!(!metadata.contains_key("file_extension")); + } +} \ No newline at end of file diff --git a/src/routes/documents/crud.rs b/src/routes/documents/crud.rs index a211e2f..3fd1551 100644 --- a/src/routes/documents/crud.rs +++ b/src/routes/documents/crud.rs @@ -78,7 +78,7 @@ pub async fn upload_document( use crate::models::FileIngestionInfo; use chrono::Utc; - let file_info = FileIngestionInfo { + let mut file_info = FileIngestionInfo { path: format!("upload/{}", filename), // Virtual path for web uploads name: filename.clone(), size: data.len() as i64, @@ -90,9 +90,14 @@ pub async fn upload_document( permissions: None, // Web uploads don't have filesystem permissions owner: Some(auth_user.user.username.clone()), // Uploader as owner group: None, // Web uploads don't have filesystem groups - metadata: None, // Could extract EXIF/PDF metadata in the future + metadata: None, // Will be populated with extracted metadata below }; + // Extract content-based metadata from uploaded file + if let Ok(Some(content_metadata)) = crate::metadata_extraction::extract_content_metadata(&data, &content_type, &filename).await { + file_info.metadata = Some(content_metadata); + } + // Create ingestion service let file_service = FileService::new(state.config.upload_path.clone()); let ingestion_service = DocumentIngestionService::new( diff --git a/src/scheduling/watcher.rs b/src/scheduling/watcher.rs index 61b6e40..00ac8ae 100644 --- a/src/scheduling/watcher.rs +++ b/src/scheduling/watcher.rs @@ -337,8 +337,13 @@ async fn process_file( } } - // Extract file info with metadata - let file_info = extract_file_info_from_path(path).await?; + // Extract basic file info first + let mut file_info = extract_file_info_from_path(path).await?; + + // Extract content-based metadata + if let Ok(Some(content_metadata)) = crate::metadata_extraction::extract_content_metadata(&file_data, &file_info.mime_type, &file_info.name).await { + file_info.metadata = Some(content_metadata); + } // Use the unified ingestion service for consistent deduplication let ingestion_service = DocumentIngestionService::new(db.clone(), file_service.clone()); diff --git a/test_files/ascii_only.txt b/test_files/ascii_only.txt new file mode 100644 index 0000000..d9f3e53 --- /dev/null +++ b/test_files/ascii_only.txt @@ -0,0 +1,5 @@ +Pure ASCII text document without any Unicode characters. +This file contains only standard ASCII characters from the basic set. +Numbers: 0123456789 +Punctuation: .,;:!?'"()-[]{} +All characters should be ASCII-only for testing encoding detection. \ No newline at end of file diff --git a/test_files/complex_content.pdf b/test_files/complex_content.pdf new file mode 100644 index 0000000..71042c3 --- /dev/null +++ b/test_files/complex_content.pdf @@ -0,0 +1,80 @@ +%PDF-1.3 +% ReportLab Generated PDF document http://www.reportlab.com +1 0 obj +<< +/F1 2 0 R /F2 3 0 R /F3 4 0 R +>> +endobj +2 0 obj +<< +/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font +>> +endobj +3 0 obj +<< +/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font +>> +endobj +4 0 obj +<< +/BaseFont /Courier /Encoding /WinAnsiEncoding /Name /F3 /Subtype /Type1 /Type /Font +>> +endobj +5 0 obj +<< +/Contents 9 0 R /MediaBox [ 0 0 612 792 ] /Parent 8 0 R /Resources << +/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] +>> /Rotate 0 /Trans << + +>> + /Type /Page +>> +endobj +6 0 obj +<< +/PageMode /UseNone /Pages 8 0 R /Type /Catalog +>> +endobj +7 0 obj +<< +/Author (anonymous) /CreationDate (D:20250710214218+00'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20250710214218+00'00') /Producer (ReportLab PDF Library - www.reportlab.com) + /Subject (unspecified) /Title (Complex PDF with Fonts) /Trapped /False +>> +endobj +8 0 obj +<< +/Count 1 /Kids [ 5 0 R ] /Type /Pages +>> +endobj +9 0 obj +<< +/Filter [ /ASCII85Decode /FlateDecode ] /Length 246 +>> +stream +Garp%aUZ01'Euj3^Z$lr70RKbDN26UQ2En"QspCZ"fK-0V7'.H&;JtnEutX^`S4!djY>/-1Ll7*9[\kUD]$a]-(W-gT#W[42^Y:qOp/#=7!qndZ#V1iAYW[K/"S#OYOFENi#\m$3$pO%_hg)82^%7pMPXa0S88Np"d23mBn#h"bUhu6endstream +endobj +xref +0 10 +0000000000 65535 f +0000000073 00000 n +0000000124 00000 n +0000000231 00000 n +0000000343 00000 n +0000000448 00000 n +0000000641 00000 n +0000000709 00000 n +0000001019 00000 n +0000001078 00000 n +trailer +<< +/ID +[<915ab5b109826181e9414e12bf7351a6><915ab5b109826181e9414e12bf7351a6>] +% ReportLab generated PDF document -- digest (http://www.reportlab.com) + +/Info 7 0 R +/Root 6 0 R +/Size 10 +>> +startxref +1414 +%%EOF diff --git a/test_files/comprehensive_text.txt b/test_files/comprehensive_text.txt new file mode 100644 index 0000000..d68bf22 --- /dev/null +++ b/test_files/comprehensive_text.txt @@ -0,0 +1,15 @@ +This is a comprehensive test document for text metadata extraction. + +It contains multiple paragraphs, various types of content, and different characteristics. +Word count: This sentence has exactly seven words counting properly. +Line counting: Each line should be counted separately for accurate statistics. + +Unicode content: café, naïve, résumé, piñata, Zürich, москва, 東京, 🎉✨🔥 +Numbers and mixed content: 123 ABC def456 GHI789 test@example.com + +Special formatting: +- Bulleted lists +- Multiple items +- With various content + +The document ends here with a final paragraph. \ No newline at end of file diff --git a/test_files/create_metadata_test_files.py b/test_files/create_metadata_test_files.py new file mode 100644 index 0000000..085d222 --- /dev/null +++ b/test_files/create_metadata_test_files.py @@ -0,0 +1,266 @@ +#!/usr/bin/env python3 +""" +Create test files for metadata extraction testing. +""" + +import os +import sys +from pathlib import Path + +# Try to import PIL for image creation +try: + from PIL import Image, ImageDraw, ImageFont + from PIL.ExifTags import TAGS + from PIL.ExifTags import GPSTAGS + PIL_AVAILABLE = True +except ImportError: + print("PIL not available, skipping image creation with EXIF") + PIL_AVAILABLE = False + +# Try to import reportlab for PDF creation +try: + from reportlab.pdfgen import canvas + from reportlab.lib.pagesizes import letter, A4 + from reportlab.pdfbase import pdfmetrics + from reportlab.pdfbase.ttfonts import TTFont + REPORTLAB_AVAILABLE = True +except ImportError: + print("reportlab not available, creating simple PDF-like files") + REPORTLAB_AVAILABLE = False + +def create_test_images(): + """Create test images with various properties.""" + if not PIL_AVAILABLE: + print("Skipping image creation - PIL not available") + return + + print("Creating test images...") + + # 1. Portrait image (100x200) + img = Image.new('RGB', (100, 200), color='lightblue') + draw = ImageDraw.Draw(img) + draw.text((10, 50), "Portrait\n100x200", fill='black') + img.save('test_files/portrait_100x200.png') + + # 2. Landscape image (300x200) + img = Image.new('RGB', (300, 200), color='lightgreen') + draw = ImageDraw.Draw(img) + draw.text((50, 50), "Landscape 300x200", fill='black') + img.save('test_files/landscape_300x200.png') + + # 3. Square image (150x150) + img = Image.new('RGB', (150, 150), color='lightyellow') + draw = ImageDraw.Draw(img) + draw.text((25, 50), "Square\n150x150", fill='black') + img.save('test_files/square_150x150.png') + + # 4. High resolution image (1920x1080) + img = Image.new('RGB', (1920, 1080), color='lightcoral') + draw = ImageDraw.Draw(img) + try: + font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 40) + except: + font = ImageFont.load_default() + draw.text((100, 500), "High Resolution\n1920x1080\n2.07 Megapixels", fill='black', font=font) + img.save('test_files/hires_1920x1080.png') + + # 5. Small image (50x50) + img = Image.new('RGB', (50, 50), color='lightgray') + img.save('test_files/small_50x50.png') + + # 6. JPEG with different color mode + img = Image.new('RGB', (200, 200), color='purple') + draw = ImageDraw.Draw(img) + draw.text((50, 50), "JPEG\nTest", fill='white') + img.save('test_files/test_image.jpg', 'JPEG') + + print("Created test images") + +def create_test_pdfs(): + """Create test PDFs with various properties.""" + if not REPORTLAB_AVAILABLE: + print("Creating simple PDF-like files...") + # Create simple files that look like PDF headers + simple_pdfs = [ + ("%PDF-1.4\n1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n>>\nendobj\n2 0 obj\n<<\n/Type /Pages\n/Kids [3 0 R]\n/Count 1\n>>\nendobj\n3 0 obj\n<<\n/Type /Page\n/Parent 2 0 R\n/MediaBox [0 0 612 792]\n>>\nendobj\nxref\n0 4\n0000000000 65535 f \n0000000009 00000 n \n0000000074 00000 n \n0000000120 00000 n \ntrailer\n<<\n/Size 4\n/Root 1 0 R\n>>\nstartxref\n179\n%%EOF", "simple_v14.pdf"), + ("%PDF-1.7\n1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n>>\nendobj\n2 0 obj\n<<\n/Type /Pages\n/Kids [3 0 R 4 0 R]\n/Count 2\n>>\nendobj\n3 0 obj\n<<\n/Type /Page\n/Parent 2 0 R\n>>\nendobj\n4 0 obj\n<<\n/Type /Page\n/Parent 2 0 R\n>>\nendobj\nxref\n0 5\ntrailer\n<<\n/Size 5\n/Root 1 0 R\n>>\n%%EOF", "multipage_v17.pdf"), + ("%PDF-1.5\n1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n/Linearized true\n>>\nendobj\n2 0 obj\n<<\n/Type /Pages\n/Kids [3 0 R]\n/Count 1\n>>\nendobj\n3 0 obj\n<<\n/Type /Page\n/Parent 2 0 R\n/Font 4 0 R\n/Image 5 0 R\n>>\nendobj\nxref\n0 4\ntrailer\n<<\n/Size 4\n/Root 1 0 R\n>>\n%%EOF", "with_fonts_images.pdf"), + ] + + for content, filename in simple_pdfs: + with open(f'test_files/{filename}', 'wb') as f: + f.write(content.encode('latin1')) + print("Created simple PDF-like files") + return + + print("Creating test PDFs with reportlab...") + + # 1. Single page PDF v1.4 + c = canvas.Canvas('test_files/single_page_v14.pdf', pagesize=letter) + c.setTitle("Single Page Test Document") + c.setAuthor("Test Author") + c.setSubject("Test Subject") + c.setCreator("Python reportlab") + c.setFont("Helvetica", 12) + c.drawString(100, 750, "Single Page PDF Document") + c.drawString(100, 700, "This is a test PDF for metadata extraction.") + c.drawString(100, 650, "It should be detected as PDF version 1.4") + c.save() + + # 2. Multi-page PDF + c = canvas.Canvas('test_files/multipage_test.pdf', pagesize=A4) + c.setTitle("Multi-page Test Document") + # Page 1 + c.setFont("Helvetica", 14) + c.drawString(100, 800, "Page 1 of Multi-page Document") + c.drawString(100, 750, "This document has multiple pages.") + c.showPage() + # Page 2 + c.drawString(100, 800, "Page 2 of Multi-page Document") + c.drawString(100, 750, "Second page content here.") + c.showPage() + # Page 3 + c.drawString(100, 800, "Page 3 - Final Page") + c.drawString(100, 750, "Third and final page.") + c.save() + + # 3. PDF with fonts and complex content + c = canvas.Canvas('test_files/complex_content.pdf', pagesize=letter) + c.setTitle("Complex PDF with Fonts") + c.setFont("Helvetica-Bold", 16) + c.drawString(100, 750, "Document with Multiple Fonts") + c.setFont("Helvetica", 12) + c.drawString(100, 700, "This document contains multiple font types.") + c.setFont("Courier", 10) + c.drawString(100, 650, "Some monospace text for variety.") + # Add some graphics/lines + c.line(100, 600, 500, 600) + c.rect(100, 550, 200, 30) + c.save() + + print("Created test PDFs") + +def create_text_files(): + """Create various text files for testing.""" + print("Creating test text files...") + + # 1. Plain text with various content + content = """This is a comprehensive test document for text metadata extraction. + +It contains multiple paragraphs, various types of content, and different characteristics. +Word count: This sentence has exactly seven words counting properly. +Line counting: Each line should be counted separately for accurate statistics. + +Unicode content: café, naïve, résumé, piñata, Zürich, москва, 東京, 🎉✨🔥 +Numbers and mixed content: 123 ABC def456 GHI789 test@example.com + +Special formatting: +- Bulleted lists +- Multiple items +- With various content + +The document ends here with a final paragraph.""" + + with open('test_files/comprehensive_text.txt', 'w', encoding='utf-8') as f: + f.write(content) + + # 2. JSON format text + json_content = """{ + "document": { + "title": "Test JSON Document", + "type": "metadata_test", + "properties": { + "word_count": 25, + "format": "json", + "encoding": "utf-8" + }, + "content": [ + "This JSON should be detected as JSON format", + "It contains structured data in JSON format" + ] + } +}""" + + with open('test_files/test_format.json', 'w') as f: + f.write(json_content) + + # 3. XML format text + xml_content = """ + + + XML Test Document + xml + 15 + + +
This XML document should be detected as XML format.
+
It contains structured markup for testing.
+
+
""" + + with open('test_files/test_format.xml', 'w') as f: + f.write(xml_content) + + # 4. HTML format text + html_content = """ + + + + HTML Test Document + + +

HTML Test Page

+

This document should be detected as HTML format.

+

It contains HTML markup and structure.

+
    +
  • List item one
  • +
  • List item two
  • +
+ +""" + + with open('test_files/test_format.html', 'w') as f: + f.write(html_content) + + # 5. Large text file for performance testing + large_content = "This is a large text file for testing performance. " * 1000 + large_content += "\nEnd of large file with final line." + + with open('test_files/large_text.txt', 'w') as f: + f.write(large_content) + + # 6. ASCII-only text + ascii_content = """Pure ASCII text document without any Unicode characters. +This file contains only standard ASCII characters from the basic set. +Numbers: 0123456789 +Punctuation: .,;:!?'"()-[]{} +All characters should be ASCII-only for testing encoding detection.""" + + with open('test_files/ascii_only.txt', 'w') as f: + f.write(ascii_content) + + print("Created test text files") + +def main(): + """Create all test files.""" + # Ensure test_files directory exists + os.makedirs('test_files', exist_ok=True) + + print("Creating test files for metadata extraction testing...") + + create_text_files() + create_test_images() + create_test_pdfs() + + print("\nAll test files created successfully!") + print("Files created in test_files/ directory:") + + # List all created files + test_files = sorted(Path('test_files').glob('*')) + for file_path in test_files: + if file_path.is_file() and not file_path.name.endswith('.py'): + size = file_path.stat().st_size + print(f" {file_path.name} ({size} bytes)") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/test_files/create_test_image.rs b/test_files/create_test_image.rs new file mode 100644 index 0000000..031b6bd --- /dev/null +++ b/test_files/create_test_image.rs @@ -0,0 +1,42 @@ +// This is a helper script to create test images +use image::{ImageBuffer, Rgb, DynamicImage}; +use std::path::Path; + +pub fn create_test_images() -> Result<(), Box> { + // Create a simple 100x200 RGB image (portrait) + let mut img = ImageBuffer::new(100, 200); + for (x, y, pixel) in img.enumerate_pixels_mut() { + let r = (x * 255 / 100) as u8; + let g = (y * 255 / 200) as u8; + let b = 128; + *pixel = Rgb([r, g, b]); + } + + let dynamic_img = DynamicImage::ImageRgb8(img); + dynamic_img.save("test_files/sample_portrait.png")?; + + // Create a simple 300x200 RGB image (landscape) + let mut img2 = ImageBuffer::new(300, 200); + for (x, y, pixel) in img2.enumerate_pixels_mut() { + let r = 255 - (x * 255 / 300) as u8; + let g = (y * 255 / 200) as u8; + let b = (x + y) as u8 % 255; + *pixel = Rgb([r, g, b]); + } + + let dynamic_img2 = DynamicImage::ImageRgb8(img2); + dynamic_img2.save("test_files/sample_landscape.png")?; + + // Create a square image 150x150 + let mut img3 = ImageBuffer::new(150, 150); + for (x, y, pixel) in img3.enumerate_pixels_mut() { + let distance = ((x as i32 - 75).pow(2) + (y as i32 - 75).pow(2)) as f32; + let intensity = (255.0 * (1.0 - distance / (75.0 * 75.0))).max(0.0) as u8; + *pixel = Rgb([intensity, 0, 255 - intensity]); + } + + let dynamic_img3 = DynamicImage::ImageRgb8(img3); + dynamic_img3.save("test_files/sample_square.png")?; + + Ok(()) +} \ No newline at end of file diff --git a/test_files/hires_1920x1080.png b/test_files/hires_1920x1080.png new file mode 100644 index 0000000..613ff9f Binary files /dev/null and b/test_files/hires_1920x1080.png differ diff --git a/test_files/landscape_300x200.png b/test_files/landscape_300x200.png new file mode 100644 index 0000000..31f85e7 Binary files /dev/null and b/test_files/landscape_300x200.png differ diff --git a/test_files/large_text.txt b/test_files/large_text.txt new file mode 100644 index 0000000..64021bc --- /dev/null +++ b/test_files/large_text.txt @@ -0,0 +1,2 @@ +This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. This is a large text file for testing performance. +End of large file with final line. \ No newline at end of file diff --git a/test_files/multipage_test.pdf b/test_files/multipage_test.pdf new file mode 100644 index 0000000..bda5e74 --- /dev/null +++ b/test_files/multipage_test.pdf @@ -0,0 +1,106 @@ +%PDF-1.3 +% ReportLab Generated PDF document http://www.reportlab.com +1 0 obj +<< +/F1 2 0 R +>> +endobj +2 0 obj +<< +/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font +>> +endobj +3 0 obj +<< +/Contents 9 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 8 0 R /Resources << +/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] +>> /Rotate 0 /Trans << + +>> + /Type /Page +>> +endobj +4 0 obj +<< +/Contents 10 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 8 0 R /Resources << +/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] +>> /Rotate 0 /Trans << + +>> + /Type /Page +>> +endobj +5 0 obj +<< +/Contents 11 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 8 0 R /Resources << +/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] +>> /Rotate 0 /Trans << + +>> + /Type /Page +>> +endobj +6 0 obj +<< +/PageMode /UseNone /Pages 8 0 R /Type /Catalog +>> +endobj +7 0 obj +<< +/Author (anonymous) /CreationDate (D:20250710214218+00'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20250710214218+00'00') /Producer (ReportLab PDF Library - www.reportlab.com) + /Subject (unspecified) /Title (Multi-page Test Document) /Trapped /False +>> +endobj +8 0 obj +<< +/Count 3 /Kids [ 3 0 R 4 0 R 5 0 R ] /Type /Pages +>> +endobj +9 0 obj +<< +/Filter [ /ASCII85Decode /FlateDecode ] /Length 169 +>> +stream +GapQh0E=F,0U\H3T\pNYT^QKk?tc>IP,;W#U1^23ihPEM_?CT3!K3H3=WkWlTgfk;(arJdECd6a7R1F/.SqNX9u4:qV^,3M!5s*':G0mdbq%1@#g#sMdDDX6g_?JG,U9Wt+U+lN+X=6o+W"Z_5u+E]aJ<9.,U4#5!.\endstream +endobj +10 0 obj +<< +/Filter [ /ASCII85Decode /FlateDecode ] /Length 155 +>> +stream +GapQh0E=F,0U\H3T\pNYT^QKk?tc>IP,;W#U1^23ihPEM_?CW4KISi90MntRifICKNI\t>FH]r4E^IYV%'!"K4m8,IJq"%nWf'(B@ns.4,~>endstream +endobj +11 0 obj +<< +/Filter [ /ASCII85Decode /FlateDecode ] /Length 139 +>> +stream +GapQh0E=F,0U\H3T\pNYT^QKk?tc>IP,;W#U1^23ihPEM_?CW4KISi90MntRifICKNI\t>m7^]]Ykcn&6DdR,YK:K$]??Q;i%<4N6J"1KuCVendstream +endobj +xref +0 12 +0000000000 65535 f +0000000073 00000 n +0000000104 00000 n +0000000211 00000 n +0000000414 00000 n +0000000618 00000 n +0000000822 00000 n +0000000890 00000 n +0000001202 00000 n +0000001273 00000 n +0000001532 00000 n +0000001778 00000 n +trailer +<< +/ID +[<3b1faa812dbde8df21a25a9c074387ca><3b1faa812dbde8df21a25a9c074387ca>] +% ReportLab generated PDF document -- digest (http://www.reportlab.com) + +/Info 7 0 R +/Root 6 0 R +/Size 12 +>> +startxref +2008 +%%EOF diff --git a/test_files/portrait_100x200.png b/test_files/portrait_100x200.png new file mode 100644 index 0000000..7e3a61c Binary files /dev/null and b/test_files/portrait_100x200.png differ diff --git a/test_files/sample.html b/test_files/sample.html new file mode 100644 index 0000000..72b449d --- /dev/null +++ b/test_files/sample.html @@ -0,0 +1,13 @@ + + + + + + Sample HTML Document + + +

Test HTML File

+

This is a sample HTML document for testing format detection.

+

It contains multiple paragraphs and should be detected as HTML.

+ + \ No newline at end of file diff --git a/test_files/sample.json b/test_files/sample.json new file mode 100644 index 0000000..8011f29 --- /dev/null +++ b/test_files/sample.json @@ -0,0 +1,12 @@ +{ + "name": "Test Document", + "type": "sample", + "metadata": { + "created": "2024-01-01", + "author": "Test User" + }, + "content": [ + "This is a JSON file", + "Used for testing text format detection" + ] +} \ No newline at end of file diff --git a/test_files/sample.txt b/test_files/sample.txt new file mode 100644 index 0000000..37130f7 --- /dev/null +++ b/test_files/sample.txt @@ -0,0 +1,8 @@ +This is a sample text file for testing metadata extraction. +It contains multiple lines and various words. +The quick brown fox jumps over the lazy dog. +This file is used to test character count, word count, and line count extraction. + +Some special characters: áéíóú, çñ, and emojis 🎉✨ + +This should help test Unicode detection as well. \ No newline at end of file diff --git a/test_files/sample.xml b/test_files/sample.xml new file mode 100644 index 0000000..6d5773c --- /dev/null +++ b/test_files/sample.xml @@ -0,0 +1,12 @@ + + + Sample XML Document + + This is a sample XML file for testing. + It should be detected as XML format. + + + Test User + 2024-01-01 + + \ No newline at end of file diff --git a/test_files/single_page_v14.pdf b/test_files/single_page_v14.pdf new file mode 100644 index 0000000..8db1349 --- /dev/null +++ b/test_files/single_page_v14.pdf @@ -0,0 +1,68 @@ +%PDF-1.3 +% ReportLab Generated PDF document http://www.reportlab.com +1 0 obj +<< +/F1 2 0 R +>> +endobj +2 0 obj +<< +/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font +>> +endobj +3 0 obj +<< +/Contents 7 0 R /MediaBox [ 0 0 612 792 ] /Parent 6 0 R /Resources << +/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] +>> /Rotate 0 /Trans << + +>> + /Type /Page +>> +endobj +4 0 obj +<< +/PageMode /UseNone /Pages 6 0 R /Type /Catalog +>> +endobj +5 0 obj +<< +/Author (Test Author) /CreationDate (D:20250710214218+00'00') /Creator (Python reportlab) /Keywords () /ModDate (D:20250710214218+00'00') /Producer (ReportLab PDF Library - www.reportlab.com) + /Subject (Test Subject) /Title (Single Page Test Document) /Trapped /False +>> +endobj +6 0 obj +<< +/Count 1 /Kids [ 3 0 R ] /Type /Pages +>> +endobj +7 0 obj +<< +/Filter [ /ASCII85Decode /FlateDecode ] /Length 206 +>> +stream +Gas2B]aDVA&;9pC`KY*LD^B-O0d'R(h@#jac&^q+OP>SPmh&K#W6@$9_Y5bf'gkod%QJ.'$Nb1<.o;ODct=>@o&ifTJs=-$X.UIKf=%kE`l6H4ncH?T8&r:oU$p@/Dkq]C\nM9+)$gJ8i5uRL/B/S_i27!\Ph,TnDqT$"E>!rEO.F7L@9L)S[P2.<5'R?1"%om1'9X0s@Rg.~>endstream +endobj +xref +0 8 +0000000000 65535 f +0000000073 00000 n +0000000104 00000 n +0000000211 00000 n +0000000404 00000 n +0000000472 00000 n +0000000763 00000 n +0000000822 00000 n +trailer +<< +/ID +[] +% ReportLab generated PDF document -- digest (http://www.reportlab.com) + +/Info 5 0 R +/Root 4 0 R +/Size 8 +>> +startxref +1118 +%%EOF diff --git a/test_files/small_50x50.png b/test_files/small_50x50.png new file mode 100644 index 0000000..68d8b58 Binary files /dev/null and b/test_files/small_50x50.png differ diff --git a/test_files/square_150x150.png b/test_files/square_150x150.png new file mode 100644 index 0000000..b528623 Binary files /dev/null and b/test_files/square_150x150.png differ diff --git a/test_files/test_format.html b/test_files/test_format.html new file mode 100644 index 0000000..847d551 --- /dev/null +++ b/test_files/test_format.html @@ -0,0 +1,16 @@ + + + + + HTML Test Document + + +

HTML Test Page

+

This document should be detected as HTML format.

+

It contains HTML markup and structure.

+
    +
  • List item one
  • +
  • List item two
  • +
+ + \ No newline at end of file diff --git a/test_files/test_format.json b/test_files/test_format.json new file mode 100644 index 0000000..f3ca405 --- /dev/null +++ b/test_files/test_format.json @@ -0,0 +1,15 @@ +{ + "document": { + "title": "Test JSON Document", + "type": "metadata_test", + "properties": { + "word_count": 25, + "format": "json", + "encoding": "utf-8" + }, + "content": [ + "This JSON should be detected as JSON format", + "It contains structured data in JSON format" + ] + } +} \ No newline at end of file diff --git a/test_files/test_format.xml b/test_files/test_format.xml new file mode 100644 index 0000000..f34e922 --- /dev/null +++ b/test_files/test_format.xml @@ -0,0 +1,12 @@ + + + + XML Test Document + xml + 15 + + +
This XML document should be detected as XML format.
+
It contains structured markup for testing.
+
+
\ No newline at end of file diff --git a/test_files/test_image.jpg b/test_files/test_image.jpg new file mode 100644 index 0000000..852dd18 Binary files /dev/null and b/test_files/test_image.jpg differ