feat(server): show source metadata EVEN better

This commit is contained in:
perf3ct 2025-07-10 21:51:30 +00:00
parent 4c4a593a1e
commit 59b4eb170c
No known key found for this signature in database
GPG Key ID: 569C4EEC436F5232
26 changed files with 1017 additions and 8 deletions

View File

@ -230,8 +230,8 @@ async fn process_single_file(
user_id: Uuid,
db: Database,
) -> Result<Option<(Uuid, i64)>> {
// Extract file info with metadata
let file_info = extract_file_info_from_path(&path).await?;
// Extract basic file info first
let mut file_info = extract_file_info_from_path(&path).await?;
// Skip very large files (> 100MB)
if file_info.size > 100 * 1024 * 1024 {
@ -242,6 +242,11 @@ async fn process_single_file(
// Read file data
let file_data = fs::read(&path).await?;
// Extract content-based metadata
if let Ok(Some(content_metadata)) = crate::metadata_extraction::extract_content_metadata(&file_data, &file_info.mime_type, &file_info.name).await {
file_info.metadata = Some(content_metadata);
}
// Use the unified ingestion service with full metadata support
let ingestion_service = DocumentIngestionService::new(db, file_service);

View File

@ -1,6 +1,5 @@
use anyhow::Result;
use serde_json::{Map, Value};
use std::collections::HashMap;
/// Extract metadata from file content based on file type
pub async fn extract_content_metadata(file_data: &[u8], mime_type: &str, filename: &str) -> Result<Option<Value>> {
@ -176,4 +175,7 @@ async fn extract_text_metadata(file_data: &[u8]) -> Result<Map<String, Value>> {
}
Ok(metadata)
}
}
#[cfg(test)]
mod tests;

View File

@ -0,0 +1,320 @@
#[cfg(test)]
mod tests {
    //! Fixture-driven tests for `extract_content_metadata`.
    //!
    //! Fixtures are read from `test_files/` using paths relative to the
    //! working directory of the test process.

    // Import the function under test by its crate path so the import resolves
    // no matter how deeply this `tests` module ends up nested.
    use crate::metadata_extraction::extract_content_metadata;
    use serde_json::Value;
    use std::fs;

    /// Reads the fixture `name` from `test_files/`, panicking with the path
    /// and the I/O error if it cannot be read.
    fn read_fixture(name: &str) -> Vec<u8> {
        let path = format!("test_files/{}", name);
        fs::read(&path)
            .unwrap_or_else(|err| panic!("Failed to read test fixture {}: {}", path, err))
    }

    /// Runs the extractor and unwraps both the `Result` and the `Option`,
    /// so each test can work directly on the metadata value.
    async fn extract(file_data: &[u8], mime_type: &str, filename: &str) -> Value {
        extract_content_metadata(file_data, mime_type, filename)
            .await
            .expect("Failed to extract metadata")
            .expect("Expected the extractor to return Some(metadata)")
    }

    /// Loads the fixture `name` and extracts its metadata, passing the
    /// fixture name as the filename seen by the extractor.
    async fn extract_fixture(name: &str, mime_type: &str) -> Value {
        let file_data = read_fixture(name);
        extract(&file_data, mime_type, name).await
    }

    /// Builds the JSON string value used in equality assertions.
    fn json_str(text: &str) -> Value {
        Value::String(text.to_string())
    }

    /// If `key` holds a JSON number, asserts that it is strictly greater than
    /// `min`. A missing or non-numeric value is tolerated on purpose: the
    /// extractor is allowed to omit optional statistics.
    fn assert_number_above(metadata: &Value, key: &str, min: u64) {
        if let Some(Value::Number(number)) = metadata.get(key) {
            let actual = number
                .as_u64()
                .unwrap_or_else(|| panic!("{} should be a non-negative integer", key));
            assert!(
                actual > min,
                "{} was {}, expected more than {}",
                key,
                actual,
                min
            );
        }
    }

    /// If `key` holds a JSON string, asserts that it equals `expected`.
    /// A missing or non-string value is tolerated (detection is best-effort).
    fn assert_string_if_present(metadata: &Value, key: &str, expected: &str) {
        if let Some(Value::String(actual)) = metadata.get(key) {
            assert_eq!(actual.as_str(), expected, "unexpected value for {}", key);
        }
    }

    #[tokio::test]
    async fn test_image_metadata_extraction_portrait() {
        let metadata = extract_fixture("portrait_100x200.png", "image/png").await;

        // Check basic image properties
        assert_eq!(metadata["image_width"], Value::Number(100.into()));
        assert_eq!(metadata["image_height"], Value::Number(200.into()));
        assert_eq!(metadata["orientation"], json_str("portrait"));
        assert_eq!(metadata["file_extension"], json_str("png"));

        // Check calculated values
        assert_eq!(metadata["aspect_ratio"], json_str("0.50"));
        assert_eq!(metadata["megapixels"], json_str("0.0 MP"));
    }

    #[tokio::test]
    async fn test_image_metadata_extraction_landscape() {
        let metadata = extract_fixture("landscape_300x200.png", "image/png").await;

        assert_eq!(metadata["image_width"], Value::Number(300.into()));
        assert_eq!(metadata["image_height"], Value::Number(200.into()));
        assert_eq!(metadata["orientation"], json_str("landscape"));
        assert_eq!(metadata["aspect_ratio"], json_str("1.50"));
    }

    #[tokio::test]
    async fn test_image_metadata_extraction_square() {
        let metadata = extract_fixture("square_150x150.png", "image/png").await;

        assert_eq!(metadata["image_width"], Value::Number(150.into()));
        assert_eq!(metadata["image_height"], Value::Number(150.into()));
        assert_eq!(metadata["orientation"], json_str("square"));
        assert_eq!(metadata["aspect_ratio"], json_str("1.00"));
    }

    #[tokio::test]
    async fn test_image_metadata_extraction_high_resolution() {
        let metadata = extract_fixture("hires_1920x1080.png", "image/png").await;

        assert_eq!(metadata["image_width"], Value::Number(1920.into()));
        assert_eq!(metadata["image_height"], Value::Number(1080.into()));
        assert_eq!(metadata["orientation"], json_str("landscape"));
        assert_eq!(metadata["megapixels"], json_str("2.1 MP"));
    }

    #[tokio::test]
    async fn test_jpeg_metadata_extraction() {
        let metadata = extract_fixture("test_image.jpg", "image/jpeg").await;

        assert_eq!(metadata["file_extension"], json_str("jpg"));
        // `Value` has no `contains_key`; `get` returns `Some` only when the key exists.
        assert!(metadata.get("image_width").is_some());
        assert!(metadata.get("image_height").is_some());
    }

    #[tokio::test]
    async fn test_pdf_metadata_extraction_single_page() {
        let metadata = extract_fixture("single_page_v14.pdf", "application/pdf").await;

        assert_eq!(metadata["file_extension"], json_str("pdf"));
        // Note: PDF version detection might vary depending on how reportlab creates the file
        assert!(metadata.get("pdf_version").is_some() || metadata.get("file_type").is_some());
    }

    #[tokio::test]
    async fn test_pdf_metadata_extraction_multipage() {
        let metadata = extract_fixture("multipage_test.pdf", "application/pdf").await;

        assert_eq!(metadata["file_extension"], json_str("pdf"));
        // Should detect multiple pages if our page counting works
        assert_number_above(&metadata, "page_count", 1);
    }

    #[tokio::test]
    async fn test_pdf_metadata_with_fonts_and_images() {
        let metadata = extract_fixture("complex_content.pdf", "application/pdf").await;

        // Font detection might or might not fire depending on the PDF structure,
        // so only the type of the flag is checked, and only when it is reported.
        if let Some(contains_fonts) = metadata.get("contains_fonts") {
            assert!(
                contains_fonts.is_boolean(),
                "contains_fonts should be a boolean, got {}",
                contains_fonts
            );
        }
    }

    #[tokio::test]
    async fn test_text_metadata_extraction_comprehensive() {
        let metadata = extract_fixture("comprehensive_text.txt", "text/plain").await;

        assert_eq!(metadata["file_extension"], json_str("txt"));

        // Check text statistics
        assert_number_above(&metadata, "character_count", 500); // Should be substantial
        assert_number_above(&metadata, "word_count", 80); // Should have many words
        assert_number_above(&metadata, "line_count", 15); // Should have multiple lines

        // Should detect Unicode content
        assert_eq!(metadata["contains_unicode"], Value::Bool(true));

        // Should detect likely English
        assert_string_if_present(&metadata, "likely_language", "english");
    }

    #[tokio::test]
    async fn test_text_metadata_extraction_ascii_only() {
        let metadata = extract_fixture("ascii_only.txt", "text/plain").await;

        // Should NOT contain Unicode: the flag is either absent or explicitly false.
        assert!(matches!(
            metadata.get("contains_unicode"),
            None | Some(Value::Bool(false))
        ));
    }

    #[tokio::test]
    async fn test_text_metadata_extraction_large_file() {
        let metadata = extract_fixture("large_text.txt", "text/plain").await;

        // Should handle large files properly
        assert_number_above(&metadata, "character_count", 50000); // Should be large
        assert_number_above(&metadata, "word_count", 10000); // Should have many words
    }

    #[tokio::test]
    async fn test_json_format_detection() {
        let metadata = extract_fixture("test_format.json", "text/plain").await;

        assert_eq!(metadata["file_extension"], json_str("json"));
        // Should detect JSON format
        assert_string_if_present(&metadata, "text_format", "json");
    }

    #[tokio::test]
    async fn test_xml_format_detection() {
        let metadata = extract_fixture("test_format.xml", "text/plain").await;

        assert_eq!(metadata["file_extension"], json_str("xml"));
        // Should detect XML format
        assert_string_if_present(&metadata, "text_format", "xml");
    }

    #[tokio::test]
    async fn test_html_format_detection() {
        let metadata = extract_fixture("test_format.html", "text/plain").await;

        assert_eq!(metadata["file_extension"], json_str("html"));
        // Should detect HTML format
        assert_string_if_present(&metadata, "text_format", "html");
    }

    #[tokio::test]
    async fn test_unknown_file_type() {
        let dummy_data = b"This is some random binary data that doesn't match any known format.";
        let metadata = extract(dummy_data, "application/octet-stream", "unknown.bin").await;

        assert_eq!(metadata["file_type"], json_str("application/octet-stream"));
        assert_eq!(metadata["file_extension"], json_str("bin"));
    }

    #[tokio::test]
    async fn test_empty_file() {
        let empty_data = b"";
        // Should still return some metadata (at least file extension)
        let metadata = extract(empty_data, "text/plain", "empty.txt").await;

        assert_eq!(metadata["file_extension"], json_str("txt"));
    }

    #[tokio::test]
    async fn test_file_without_extension() {
        let text_data = b"Some text content without file extension";
        let metadata = extract(text_data, "text/plain", "no_extension").await;

        // Should not have file_extension field
        assert!(metadata.get("file_extension").is_none());
    }
}

View File

@ -78,7 +78,7 @@ pub async fn upload_document(
use crate::models::FileIngestionInfo;
use chrono::Utc;
let file_info = FileIngestionInfo {
let mut file_info = FileIngestionInfo {
path: format!("upload/{}", filename), // Virtual path for web uploads
name: filename.clone(),
size: data.len() as i64,
@ -90,9 +90,14 @@ pub async fn upload_document(
permissions: None, // Web uploads don't have filesystem permissions
owner: Some(auth_user.user.username.clone()), // Uploader as owner
group: None, // Web uploads don't have filesystem groups
metadata: None, // Could extract EXIF/PDF metadata in the future
metadata: None, // Will be populated with extracted metadata below
};
// Extract content-based metadata from uploaded file
if let Ok(Some(content_metadata)) = crate::metadata_extraction::extract_content_metadata(&data, &content_type, &filename).await {
file_info.metadata = Some(content_metadata);
}
// Create ingestion service
let file_service = FileService::new(state.config.upload_path.clone());
let ingestion_service = DocumentIngestionService::new(

View File

@ -337,8 +337,13 @@ async fn process_file(
}
}
// Extract file info with metadata
let file_info = extract_file_info_from_path(path).await?;
// Extract basic file info first
let mut file_info = extract_file_info_from_path(path).await?;
// Extract content-based metadata
if let Ok(Some(content_metadata)) = crate::metadata_extraction::extract_content_metadata(&file_data, &file_info.mime_type, &file_info.name).await {
file_info.metadata = Some(content_metadata);
}
// Use the unified ingestion service for consistent deduplication
let ingestion_service = DocumentIngestionService::new(db.clone(), file_service.clone());

View File

@ -0,0 +1,5 @@
Pure ASCII text document without any Unicode characters.
This file contains only standard ASCII characters from the basic set.
Numbers: 0123456789
Punctuation: .,;:!?'"()-[]{}
All characters should be ASCII-only for testing encoding detection.

View File

@ -0,0 +1,80 @@
%PDF-1.3
%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
1 0 obj
<<
/F1 2 0 R /F2 3 0 R /F3 4 0 R
>>
endobj
2 0 obj
<<
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
>>
endobj
3 0 obj
<<
/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
>>
endobj
4 0 obj
<<
/BaseFont /Courier /Encoding /WinAnsiEncoding /Name /F3 /Subtype /Type1 /Type /Font
>>
endobj
5 0 obj
<<
/Contents 9 0 R /MediaBox [ 0 0 612 792 ] /Parent 8 0 R /Resources <<
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
>> /Rotate 0 /Trans <<
>>
/Type /Page
>>
endobj
6 0 obj
<<
/PageMode /UseNone /Pages 8 0 R /Type /Catalog
>>
endobj
7 0 obj
<<
/Author (anonymous) /CreationDate (D:20250710214218+00'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20250710214218+00'00') /Producer (ReportLab PDF Library - www.reportlab.com)
/Subject (unspecified) /Title (Complex PDF with Fonts) /Trapped /False
>>
endobj
8 0 obj
<<
/Count 1 /Kids [ 5 0 R ] /Type /Pages
>>
endobj
9 0 obj
<<
/Filter [ /ASCII85Decode /FlateDecode ] /Length 246
>>
stream
Garp%aUZ01'Euj3^Z$lr70RKbDN26UQ2En"QspCZ"fK-0V7'.H&;JtnEutX^`S4<o?m.72F)V/*!70(%EeBWNg0B/(gO,TM=3`mZ1PT&X+>!djY>/-1Ll7*9[\kUD]$a]-(W-gT#W[42^Y:qOp/#=7!qndZ#V1iAYW[K/"S#OYOFENi#\m$3$pO%_hg)82^%7pMPXa0S88Np"d23mBn#h"bUhu6<Y6qlocNt;-:Db!E.?Zcu(gLV~>endstream
endobj
xref
0 10
0000000000 65535 f
0000000073 00000 n
0000000124 00000 n
0000000231 00000 n
0000000343 00000 n
0000000448 00000 n
0000000641 00000 n
0000000709 00000 n
0000001019 00000 n
0000001078 00000 n
trailer
<<
/ID
[<915ab5b109826181e9414e12bf7351a6><915ab5b109826181e9414e12bf7351a6>]
% ReportLab generated PDF document -- digest (http://www.reportlab.com)
/Info 7 0 R
/Root 6 0 R
/Size 10
>>
startxref
1414
%%EOF

View File

@ -0,0 +1,15 @@
This is a comprehensive test document for text metadata extraction.
It contains multiple paragraphs, various types of content, and different characteristics.
Word count: This sentence has exactly seven words counting properly.
Line counting: Each line should be counted separately for accurate statistics.
Unicode content: café, naïve, résumé, piñata, Zürich, москва, 東京, 🎉✨🔥
Numbers and mixed content: 123 ABC def456 GHI789 test@example.com
Special formatting:
- Bulleted lists
- Multiple items
- With various content
The document ends here with a final paragraph.

View File

@ -0,0 +1,266 @@
#!/usr/bin/env python3
"""
Create test files for metadata extraction testing.
"""
import os
import sys
from pathlib import Path
# Try to import PIL for image creation
try:
from PIL import Image, ImageDraw, ImageFont
from PIL.ExifTags import TAGS
from PIL.ExifTags import GPSTAGS
PIL_AVAILABLE = True
except ImportError:
print("PIL not available, skipping image creation with EXIF")
PIL_AVAILABLE = False
# Try to import reportlab for PDF creation
try:
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter, A4
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
REPORTLAB_AVAILABLE = True
except ImportError:
print("reportlab not available, creating simple PDF-like files")
REPORTLAB_AVAILABLE = False
def create_test_images():
    """Create the PNG/JPEG image fixtures used by the metadata tests.

    Prints a notice and returns without writing anything when Pillow could
    not be imported (``PIL_AVAILABLE`` is false). Files are written into the
    ``test_files/`` directory relative to the current working directory; the
    directory is expected to exist already (``main`` creates it).
    """
    if not PIL_AVAILABLE:
        print("Skipping image creation - PIL not available")
        return

    print("Creating test images...")

    # 1. Portrait image (100x200)
    img = Image.new('RGB', (100, 200), color='lightblue')
    draw = ImageDraw.Draw(img)
    draw.text((10, 50), "Portrait\n100x200", fill='black')
    img.save('test_files/portrait_100x200.png')

    # 2. Landscape image (300x200)
    img = Image.new('RGB', (300, 200), color='lightgreen')
    draw = ImageDraw.Draw(img)
    draw.text((50, 50), "Landscape 300x200", fill='black')
    img.save('test_files/landscape_300x200.png')

    # 3. Square image (150x150)
    img = Image.new('RGB', (150, 150), color='lightyellow')
    draw = ImageDraw.Draw(img)
    draw.text((25, 50), "Square\n150x150", fill='black')
    img.save('test_files/square_150x150.png')

    # 4. High resolution image (1920x1080)
    img = Image.new('RGB', (1920, 1080), color='lightcoral')
    draw = ImageDraw.Draw(img)
    try:
        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 40)
    except (OSError, ImportError):
        # The font file is missing/unreadable, or Pillow was built without
        # FreeType support: fall back to the built-in bitmap font. Catching
        # only these keeps Ctrl-C and genuine bugs from being swallowed.
        font = ImageFont.load_default()
    draw.text((100, 500), "High Resolution\n1920x1080\n2.07 Megapixels", fill='black', font=font)
    img.save('test_files/hires_1920x1080.png')

    # 5. Small image (50x50), solid colour with no text
    img = Image.new('RGB', (50, 50), color='lightgray')
    img.save('test_files/small_50x50.png')

    # 6. JPEG-encoded image (same RGB mode as the PNGs, different file format)
    img = Image.new('RGB', (200, 200), color='purple')
    draw = ImageDraw.Draw(img)
    draw.text((50, 50), "JPEG\nTest", fill='white')
    img.save('test_files/test_image.jpg', 'JPEG')

    print("Created test images")
def create_test_pdfs():
    """Create the PDF fixtures used by the metadata tests.

    With reportlab installed, three real PDFs are written into ``test_files/``:
    ``single_page_v14.pdf``, ``multipage_test.pdf`` (three pages) and
    ``complex_content.pdf`` (several fonts plus simple vector graphics).

    Without reportlab, three hand-written PDF-like stubs are written instead
    and the function returns early.
    """
    if not REPORTLAB_AVAILABLE:
        print("Creating simple PDF-like files...")
        # Create simple files that look like PDF headers
        # NOTE(review): these fallback names (simple_v14.pdf, multipage_v17.pdf,
        # with_fonts_images.pdf) differ from the fixtures the Rust tests read
        # (single_page_v14.pdf, multipage_test.pdf, complex_content.pdf), so the
        # fallback alone does not produce the files those tests need - confirm
        # whether that is intended.
        simple_pdfs = [
            ("%PDF-1.4\n1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n>>\nendobj\n2 0 obj\n<<\n/Type /Pages\n/Kids [3 0 R]\n/Count 1\n>>\nendobj\n3 0 obj\n<<\n/Type /Page\n/Parent 2 0 R\n/MediaBox [0 0 612 792]\n>>\nendobj\nxref\n0 4\n0000000000 65535 f \n0000000009 00000 n \n0000000074 00000 n \n0000000120 00000 n \ntrailer\n<<\n/Size 4\n/Root 1 0 R\n>>\nstartxref\n179\n%%EOF", "simple_v14.pdf"),
            ("%PDF-1.7\n1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n>>\nendobj\n2 0 obj\n<<\n/Type /Pages\n/Kids [3 0 R 4 0 R]\n/Count 2\n>>\nendobj\n3 0 obj\n<<\n/Type /Page\n/Parent 2 0 R\n>>\nendobj\n4 0 obj\n<<\n/Type /Page\n/Parent 2 0 R\n>>\nendobj\nxref\n0 5\ntrailer\n<<\n/Size 5\n/Root 1 0 R\n>>\n%%EOF", "multipage_v17.pdf"),
            ("%PDF-1.5\n1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n/Linearized true\n>>\nendobj\n2 0 obj\n<<\n/Type /Pages\n/Kids [3 0 R]\n/Count 1\n>>\nendobj\n3 0 obj\n<<\n/Type /Page\n/Parent 2 0 R\n/Font 4 0 R\n/Image 5 0 R\n>>\nendobj\nxref\n0 4\ntrailer\n<<\n/Size 4\n/Root 1 0 R\n>>\n%%EOF", "with_fonts_images.pdf"),
        ]
        for content, filename in simple_pdfs:
            # latin1 maps each character to one byte, so the stub is written verbatim.
            with open(f'test_files/{filename}', 'wb') as f:
                f.write(content.encode('latin1'))
        print("Created simple PDF-like files")
        return

    print("Creating test PDFs with reportlab...")

    # 1. Single page PDF v1.4
    # reportlab writes a %PDF-1.3 header by default; request 1.4 explicitly so
    # the file matches its name and the text drawn on the page.
    c = canvas.Canvas('test_files/single_page_v14.pdf', pagesize=letter, pdfVersion=(1, 4))
    c.setTitle("Single Page Test Document")
    c.setAuthor("Test Author")
    c.setSubject("Test Subject")
    c.setCreator("Python reportlab")
    c.setFont("Helvetica", 12)
    c.drawString(100, 750, "Single Page PDF Document")
    c.drawString(100, 700, "This is a test PDF for metadata extraction.")
    c.drawString(100, 650, "It should be detected as PDF version 1.4")
    c.save()

    # 2. Multi-page PDF
    c = canvas.Canvas('test_files/multipage_test.pdf', pagesize=A4)
    c.setTitle("Multi-page Test Document")
    # Page 1
    c.setFont("Helvetica", 14)
    c.drawString(100, 800, "Page 1 of Multi-page Document")
    c.drawString(100, 750, "This document has multiple pages.")
    c.showPage()
    # Page 2
    c.drawString(100, 800, "Page 2 of Multi-page Document")
    c.drawString(100, 750, "Second page content here.")
    c.showPage()
    # Page 3 (save() finishes the page that is still open)
    c.drawString(100, 800, "Page 3 - Final Page")
    c.drawString(100, 750, "Third and final page.")
    c.save()

    # 3. PDF with fonts and complex content
    c = canvas.Canvas('test_files/complex_content.pdf', pagesize=letter)
    c.setTitle("Complex PDF with Fonts")
    c.setFont("Helvetica-Bold", 16)
    c.drawString(100, 750, "Document with Multiple Fonts")
    c.setFont("Helvetica", 12)
    c.drawString(100, 700, "This document contains multiple font types.")
    c.setFont("Courier", 10)
    c.drawString(100, 650, "Some monospace text for variety.")
    # Add some graphics/lines
    c.line(100, 600, 500, 600)
    c.rect(100, 550, 200, 30)
    c.save()

    print("Created test PDFs")
def create_text_files():
"""Create the plain-text, JSON, XML and HTML fixtures for the metadata tests.

Writes six files into ``test_files/`` (relative to the current working
directory): comprehensive_text.txt, test_format.json, test_format.xml,
test_format.html, large_text.txt and ascii_only.txt. Only the first file is
opened with an explicit UTF-8 encoding; the others are opened with the
platform default encoding.
"""
print("Creating test text files...")
# 1. Plain text with various content: prose, a line of non-ASCII characters
# (accents, Cyrillic, CJK, emoji), digits and a bulleted list.
content = """This is a comprehensive test document for text metadata extraction.
It contains multiple paragraphs, various types of content, and different characteristics.
Word count: This sentence has exactly seven words counting properly.
Line counting: Each line should be counted separately for accurate statistics.
Unicode content: café, naïve, résumé, piñata, Zürich, москва, 東京, 🎉🔥
Numbers and mixed content: 123 ABC def456 GHI789 test@example.com
Special formatting:
- Bulleted lists
- Multiple items
- With various content
The document ends here with a final paragraph."""
# Explicit UTF-8 is required here because the content is not pure ASCII.
with open('test_files/comprehensive_text.txt', 'w', encoding='utf-8') as f:
f.write(content)
# 2. JSON format text (a single nested object)
json_content = """{
"document": {
"title": "Test JSON Document",
"type": "metadata_test",
"properties": {
"word_count": 25,
"format": "json",
"encoding": "utf-8"
},
"content": [
"This JSON should be detected as JSON format",
"It contains structured data in JSON format"
]
}
}"""
with open('test_files/test_format.json', 'w') as f:
f.write(json_content)
# 3. XML format text (starts with an XML declaration)
xml_content = """<?xml version="1.0" encoding="UTF-8"?>
<document type="test">
<metadata>
<title>XML Test Document</title>
<format>xml</format>
<word_count>15</word_count>
</metadata>
<content>
<section>This XML document should be detected as XML format.</section>
<section>It contains structured markup for testing.</section>
</content>
</document>"""
with open('test_files/test_format.xml', 'w') as f:
f.write(xml_content)
# 4. HTML format text (starts with a DOCTYPE declaration)
html_content = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>HTML Test Document</title>
</head>
<body>
<h1>HTML Test Page</h1>
<p>This document should be detected as HTML format.</p>
<p>It contains HTML markup and structure.</p>
<ul>
<li>List item one</li>
<li>List item two</li>
</ul>
</body>
</html>"""
with open('test_files/test_format.html', 'w') as f:
f.write(html_content)
# 5. Large text file for performance testing: one sentence repeated 1000
# times on a single line, followed by one short closing line.
large_content = "This is a large text file for testing performance. " * 1000
large_content += "\nEnd of large file with final line."
with open('test_files/large_text.txt', 'w') as f:
f.write(large_content)
# 6. ASCII-only text (counterpart to the Unicode line in fixture 1)
ascii_content = """Pure ASCII text document without any Unicode characters.
This file contains only standard ASCII characters from the basic set.
Numbers: 0123456789
Punctuation: .,;:!?'"()-[]{}
All characters should be ASCII-only for testing encoding detection."""
with open('test_files/ascii_only.txt', 'w') as f:
f.write(ascii_content)
print("Created test text files")
def main():
    """Generate every fixture, then print a listing of what ``test_files/`` holds.

    The output directory is created first if needed. The text, image and PDF
    generators then run in that order, and finally each regular file in the
    directory (except ``.py`` scripts) is listed with its size in bytes.
    """
    # Make sure the output directory is there before any generator writes to it
    os.makedirs('test_files', exist_ok=True)
    print("Creating test files for metadata extraction testing...")

    for generate in (create_text_files, create_test_images, create_test_pdfs):
        generate()

    print("\nAll test files created successfully!")
    print("Files created in test_files/ directory:")

    # Walk the directory in sorted order, skipping sub-directories and scripts
    for entry in sorted(Path('test_files').glob('*')):
        if not entry.is_file() or entry.name.endswith('.py'):
            continue
        size = entry.stat().st_size
        print(f" {entry.name} ({size} bytes)")


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,42 @@
// This is a helper script to create test images
use image::{ImageBuffer, Rgb, DynamicImage};
use std::path::Path;
/// Writes three synthetic RGB gradient PNGs into `test_files/`:
/// `sample_portrait.png` (100x200), `sample_landscape.png` (300x200) and
/// `sample_square.png` (150x150).
///
/// # Errors
///
/// Returns an error if the `test_files` directory cannot be created or if
/// any of the images fails to be saved.
pub fn create_test_images() -> Result<(), Box<dyn std::error::Error>> {
    // Make sure the output directory exists before saving anything into it,
    // so the helper also works on a fresh checkout.
    std::fs::create_dir_all("test_files")?;

    // Create a simple 100x200 RGB image (portrait):
    // red ramps left-to-right, green ramps top-to-bottom, blue is constant.
    let mut img = ImageBuffer::new(100, 200);
    for (x, y, pixel) in img.enumerate_pixels_mut() {
        let r = (x * 255 / 100) as u8;
        let g = (y * 255 / 200) as u8;
        let b = 128;
        *pixel = Rgb([r, g, b]);
    }
    let dynamic_img = DynamicImage::ImageRgb8(img);
    dynamic_img.save("test_files/sample_portrait.png")?;

    // Create a simple 300x200 RGB image (landscape)
    let mut img2 = ImageBuffer::new(300, 200);
    for (x, y, pixel) in img2.enumerate_pixels_mut() {
        // `as` binds tighter than `-`: the scaled value (at most 254) is
        // narrowed first, so the subtraction cannot underflow.
        let r = 255 - (x * 255 / 300) as u8;
        let g = (y * 255 / 200) as u8;
        // `as` also binds tighter than `%`: `x + y` (up to 498) is truncated
        // to a byte first and only then reduced modulo 255.
        let b = (x + y) as u8 % 255;
        *pixel = Rgb([r, g, b]);
    }
    let dynamic_img2 = DynamicImage::ImageRgb8(img2);
    dynamic_img2.save("test_files/sample_landscape.png")?;

    // Create a square image 150x150: a radial gradient centred on (75, 75)
    let mut img3 = ImageBuffer::new(150, 150);
    for (x, y, pixel) in img3.enumerate_pixels_mut() {
        // Squared distance from the centre, compared against the squared radius.
        let distance = ((x as i32 - 75).pow(2) + (y as i32 - 75).pow(2)) as f32;
        // Outside the inscribed circle the value would go negative; clamp to 0.
        let intensity = (255.0 * (1.0 - distance / (75.0 * 75.0))).max(0.0) as u8;
        *pixel = Rgb([intensity, 0, 255 - intensity]);
    }
    let dynamic_img3 = DynamicImage::ImageRgb8(img3);
    dynamic_img3.save("test_files/sample_square.png")?;

    Ok(())
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 25 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.0 KiB

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,106 @@
%PDF-1.3
%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
1 0 obj
<<
/F1 2 0 R
>>
endobj
2 0 obj
<<
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
>>
endobj
3 0 obj
<<
/Contents 9 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 8 0 R /Resources <<
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
>> /Rotate 0 /Trans <<
>>
/Type /Page
>>
endobj
4 0 obj
<<
/Contents 10 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 8 0 R /Resources <<
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
>> /Rotate 0 /Trans <<
>>
/Type /Page
>>
endobj
5 0 obj
<<
/Contents 11 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 8 0 R /Resources <<
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
>> /Rotate 0 /Trans <<
>>
/Type /Page
>>
endobj
6 0 obj
<<
/PageMode /UseNone /Pages 8 0 R /Type /Catalog
>>
endobj
7 0 obj
<<
/Author (anonymous) /CreationDate (D:20250710214218+00'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20250710214218+00'00') /Producer (ReportLab PDF Library - www.reportlab.com)
/Subject (unspecified) /Title (Multi-page Test Document) /Trapped /False
>>
endobj
8 0 obj
<<
/Count 3 /Kids [ 3 0 R 4 0 R 5 0 R ] /Type /Pages
>>
endobj
9 0 obj
<<
/Filter [ /ASCII85Decode /FlateDecode ] /Length 169
>>
stream
GapQh0E=F,0U\H3T\pNYT^QKk?tc>IP,;W#U1^23ihPEM_?CT3!K3H3=WkWlTgfk;(arJdECd6a7R1F/.SqNX9u4:qV^,3M!5s*':G0mdbq%1@#g#sMdDDX6g_?JG,U9Wt+U+lN+X=6o+W"Z_5u+E]aJ<9.,U4#5!.\</LB~>endstream
endobj
10 0 obj
<<
/Filter [ /ASCII85Decode /FlateDecode ] /Length 155
>>
stream
GapQh0E=F,0U\H3T\pNYT^QKk?tc>IP,;W#U1^23ihPEM_?CW4KISi90MntRifICKNI\t><!=M#r%nT7d[ZIoh#\/F,sSps2`7d,;^!DZ+[c]rG&pT<p>FH]r4E^IYV%'!"K4m8,IJq"%nWf'(B@ns.4,~>endstream
endobj
11 0 obj
<<
/Filter [ /ASCII85Decode /FlateDecode ] /Length 139
>>
stream
GapQh0E=F,0U\H3T\pNYT^QKk?tc>IP,;W#U1^23ihPEM_?CW4KISi90MntRifICKNI\t><!?b8>m7^]]Ykcn&6DdR,YK:K$]??Q;i%<4N6J"1KuCV<Kf$$:!h_//,]*ln!4=[/%f~>endstream
endobj
xref
0 12
0000000000 65535 f
0000000073 00000 n
0000000104 00000 n
0000000211 00000 n
0000000414 00000 n
0000000618 00000 n
0000000822 00000 n
0000000890 00000 n
0000001202 00000 n
0000001273 00000 n
0000001532 00000 n
0000001778 00000 n
trailer
<<
/ID
[<3b1faa812dbde8df21a25a9c074387ca><3b1faa812dbde8df21a25a9c074387ca>]
% ReportLab generated PDF document -- digest (http://www.reportlab.com)
/Info 7 0 R
/Root 6 0 R
/Size 12
>>
startxref
2008
%%EOF

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.4 KiB

13
test_files/sample.html Normal file
View File

@ -0,0 +1,13 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Sample HTML Document</title>
</head>
<body>
<h1>Test HTML File</h1>
<p>This is a sample HTML document for testing format detection.</p>
<p>It contains multiple paragraphs and should be detected as HTML.</p>
</body>
</html>

12
test_files/sample.json Normal file
View File

@ -0,0 +1,12 @@
{
"name": "Test Document",
"type": "sample",
"metadata": {
"created": "2024-01-01",
"author": "Test User"
},
"content": [
"This is a JSON file",
"Used for testing text format detection"
]
}

8
test_files/sample.txt Normal file
View File

@ -0,0 +1,8 @@
This is a sample text file for testing metadata extraction.
It contains multiple lines and various words.
The quick brown fox jumps over the lazy dog.
This file is used to test character count, word count, and line count extraction.
Some special characters: áéíóú, çñ, and emojis 🎉✨
This should help test Unicode detection as well.

12
test_files/sample.xml Normal file
View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<document>
<title>Sample XML Document</title>
<content>
<paragraph>This is a sample XML file for testing.</paragraph>
<paragraph>It should be detected as XML format.</paragraph>
</content>
<metadata>
<author>Test User</author>
<created>2024-01-01</created>
</metadata>
</document>

View File

@ -0,0 +1,68 @@
%PDF-1.3
%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
1 0 obj
<<
/F1 2 0 R
>>
endobj
2 0 obj
<<
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
>>
endobj
3 0 obj
<<
/Contents 7 0 R /MediaBox [ 0 0 612 792 ] /Parent 6 0 R /Resources <<
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
>> /Rotate 0 /Trans <<
>>
/Type /Page
>>
endobj
4 0 obj
<<
/PageMode /UseNone /Pages 6 0 R /Type /Catalog
>>
endobj
5 0 obj
<<
/Author (Test Author) /CreationDate (D:20250710214218+00'00') /Creator (Python reportlab) /Keywords () /ModDate (D:20250710214218+00'00') /Producer (ReportLab PDF Library - www.reportlab.com)
/Subject (Test Subject) /Title (Single Page Test Document) /Trapped /False
>>
endobj
6 0 obj
<<
/Count 1 /Kids [ 3 0 R ] /Type /Pages
>>
endobj
7 0 obj
<<
/Filter [ /ASCII85Decode /FlateDecode ] /Length 206
>>
stream
Gas2B]aDVA&;9pC`KY*LD^B-O0d'R(h@#jac&^q+OP>SPmh&K#W6@$9_Y5bf'gkod%QJ.'$Nb1<.o;ODct=>@o&ifTJs=-$X.UIKf=%kE`l6H4ncH?T8&r:oU$p@/Dkq]C\nM9+)$gJ8i5uRL/B/S_i27!\Ph,TnDqT$"E>!rEO.F7L@9L)S[P2.<5'R?1"%om1'9X0s@Rg.~>endstream
endobj
xref
0 8
0000000000 65535 f
0000000073 00000 n
0000000104 00000 n
0000000211 00000 n
0000000404 00000 n
0000000472 00000 n
0000000763 00000 n
0000000822 00000 n
trailer
<<
/ID
[<eb148edbf508f579851c4559676f2901><eb148edbf508f579851c4559676f2901>]
% ReportLab generated PDF document -- digest (http://www.reportlab.com)
/Info 5 0 R
/Root 4 0 R
/Size 8
>>
startxref
1118
%%EOF

BIN
test_files/small_50x50.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 135 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.5 KiB

View File

@ -0,0 +1,16 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>HTML Test Document</title>
</head>
<body>
<h1>HTML Test Page</h1>
<p>This document should be detected as HTML format.</p>
<p>It contains HTML markup and structure.</p>
<ul>
<li>List item one</li>
<li>List item two</li>
</ul>
</body>
</html>

View File

@ -0,0 +1,15 @@
{
"document": {
"title": "Test JSON Document",
"type": "metadata_test",
"properties": {
"word_count": 25,
"format": "json",
"encoding": "utf-8"
},
"content": [
"This JSON should be detected as JSON format",
"It contains structured data in JSON format"
]
}
}

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<document type="test">
<metadata>
<title>XML Test Document</title>
<format>xml</format>
<word_count>15</word_count>
</metadata>
<content>
<section>This XML document should be detected as XML format.</section>
<section>It contains structured markup for testing.</section>
</content>
</document>

BIN
test_files/test_image.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.7 KiB