#!/usr/bin/env python3
"""
Create test files for metadata extraction testing.
"""
import os
import sys
from pathlib import Path
# Try to import PIL for image creation
try:
from PIL import Image, ImageDraw, ImageFont
from PIL.ExifTags import TAGS
from PIL.ExifTags import GPSTAGS
PIL_AVAILABLE = True
except ImportError:
print("PIL not available, skipping image creation with EXIF")
PIL_AVAILABLE = False
# Try to import reportlab for PDF creation
try:
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter, A4
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
REPORTLAB_AVAILABLE = True
except ImportError:
print("reportlab not available, creating simple PDF-like files")
REPORTLAB_AVAILABLE = False
def create_test_images():
"""Create test images with various properties."""
if not PIL_AVAILABLE:
print("Skipping image creation - PIL not available")
return
print("Creating test images...")
# 1. Portrait image (100x200)
img = Image.new('RGB', (100, 200), color='lightblue')
draw = ImageDraw.Draw(img)
draw.text((10, 50), "Portrait\n100x200", fill='black')
img.save('test_files/portrait_100x200.png')
# 2. Landscape image (300x200)
img = Image.new('RGB', (300, 200), color='lightgreen')
draw = ImageDraw.Draw(img)
draw.text((50, 50), "Landscape 300x200", fill='black')
img.save('test_files/landscape_300x200.png')
# 3. Square image (150x150)
img = Image.new('RGB', (150, 150), color='lightyellow')
draw = ImageDraw.Draw(img)
draw.text((25, 50), "Square\n150x150", fill='black')
img.save('test_files/square_150x150.png')
# 4. High resolution image (1920x1080)
img = Image.new('RGB', (1920, 1080), color='lightcoral')
draw = ImageDraw.Draw(img)
try:
font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 40)
except:
font = ImageFont.load_default()
draw.text((100, 500), "High Resolution\n1920x1080\n2.07 Megapixels", fill='black', font=font)
img.save('test_files/hires_1920x1080.png')
# 5. Small image (50x50)
img = Image.new('RGB', (50, 50), color='lightgray')
img.save('test_files/small_50x50.png')
# 6. JPEG with different color mode
img = Image.new('RGB', (200, 200), color='purple')
draw = ImageDraw.Draw(img)
draw.text((50, 50), "JPEG\nTest", fill='white')
img.save('test_files/test_image.jpg', 'JPEG')
print("Created test images")
def create_test_pdfs():
"""Create test PDFs with various properties."""
if not REPORTLAB_AVAILABLE:
print("Creating simple PDF-like files...")
# Create simple files that look like PDF headers
simple_pdfs = [
("%PDF-1.4\n1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n>>\nendobj\n2 0 obj\n<<\n/Type /Pages\n/Kids [3 0 R]\n/Count 1\n>>\nendobj\n3 0 obj\n<<\n/Type /Page\n/Parent 2 0 R\n/MediaBox [0 0 612 792]\n>>\nendobj\nxref\n0 4\n0000000000 65535 f \n0000000009 00000 n \n0000000074 00000 n \n0000000120 00000 n \ntrailer\n<<\n/Size 4\n/Root 1 0 R\n>>\nstartxref\n179\n%%EOF", "simple_v14.pdf"),
("%PDF-1.7\n1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n>>\nendobj\n2 0 obj\n<<\n/Type /Pages\n/Kids [3 0 R 4 0 R]\n/Count 2\n>>\nendobj\n3 0 obj\n<<\n/Type /Page\n/Parent 2 0 R\n>>\nendobj\n4 0 obj\n<<\n/Type /Page\n/Parent 2 0 R\n>>\nendobj\nxref\n0 5\ntrailer\n<<\n/Size 5\n/Root 1 0 R\n>>\n%%EOF", "multipage_v17.pdf"),
("%PDF-1.5\n1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n/Linearized true\n>>\nendobj\n2 0 obj\n<<\n/Type /Pages\n/Kids [3 0 R]\n/Count 1\n>>\nendobj\n3 0 obj\n<<\n/Type /Page\n/Parent 2 0 R\n/Font 4 0 R\n/Image 5 0 R\n>>\nendobj\nxref\n0 4\ntrailer\n<<\n/Size 4\n/Root 1 0 R\n>>\n%%EOF", "with_fonts_images.pdf"),
]
for content, filename in simple_pdfs:
with open(f'test_files/{filename}', 'wb') as f:
f.write(content.encode('latin1'))
print("Created simple PDF-like files")
return
print("Creating test PDFs with reportlab...")
# 1. Single page PDF v1.4
c = canvas.Canvas('test_files/single_page_v14.pdf', pagesize=letter)
c.setTitle("Single Page Test Document")
c.setAuthor("Test Author")
c.setSubject("Test Subject")
c.setCreator("Python reportlab")
c.setFont("Helvetica", 12)
c.drawString(100, 750, "Single Page PDF Document")
c.drawString(100, 700, "This is a test PDF for metadata extraction.")
c.drawString(100, 650, "It should be detected as PDF version 1.4")
c.save()
# 2. Multi-page PDF
c = canvas.Canvas('test_files/multipage_test.pdf', pagesize=A4)
c.setTitle("Multi-page Test Document")
# Page 1
c.setFont("Helvetica", 14)
c.drawString(100, 800, "Page 1 of Multi-page Document")
c.drawString(100, 750, "This document has multiple pages.")
c.showPage()
# Page 2
c.drawString(100, 800, "Page 2 of Multi-page Document")
c.drawString(100, 750, "Second page content here.")
c.showPage()
# Page 3
c.drawString(100, 800, "Page 3 - Final Page")
c.drawString(100, 750, "Third and final page.")
c.save()
# 3. PDF with fonts and complex content
c = canvas.Canvas('test_files/complex_content.pdf', pagesize=letter)
c.setTitle("Complex PDF with Fonts")
c.setFont("Helvetica-Bold", 16)
c.drawString(100, 750, "Document with Multiple Fonts")
c.setFont("Helvetica", 12)
c.drawString(100, 700, "This document contains multiple font types.")
c.setFont("Courier", 10)
c.drawString(100, 650, "Some monospace text for variety.")
# Add some graphics/lines
c.line(100, 600, 500, 600)
c.rect(100, 550, 200, 30)
c.save()
print("Created test PDFs")
def create_text_files():
"""Create various text files for testing."""
print("Creating test text files...")
# 1. Plain text with various content
content = """This is a comprehensive test document for text metadata extraction.
It contains multiple paragraphs, various types of content, and different characteristics.
Word count: This sentence has exactly seven words counting properly.
Line counting: Each line should be counted separately for accurate statistics.
Unicode content: café, naïve, résumé, piñata, Zürich, москва, 東京, 🎉✨🔥
Numbers and mixed content: 123 ABC def456 GHI789 test@example.com
Special formatting:
- Bulleted lists
- Multiple items
- With various content
The document ends here with a final paragraph."""
with open('test_files/comprehensive_text.txt', 'w', encoding='utf-8') as f:
f.write(content)
# 2. JSON format text
json_content = """{
"document": {
"title": "Test JSON Document",
"type": "metadata_test",
"properties": {
"word_count": 25,
"format": "json",
"encoding": "utf-8"
},
"content": [
"This JSON should be detected as JSON format",
"It contains structured data in JSON format"
]
}
}"""
with open('test_files/test_format.json', 'w') as f:
f.write(json_content)
# 3. XML format text
xml_content = """
This document should be detected as HTML format.
It contains HTML markup and structure.