feat(server): show source metadata EVEN better
This commit is contained in:
parent
ea43f79a90
commit
305c6f1fb1
|
|
@ -230,8 +230,8 @@ async fn process_single_file(
|
|||
user_id: Uuid,
|
||||
db: Database,
|
||||
) -> Result<Option<(Uuid, i64)>> {
|
||||
// Extract file info with metadata
|
||||
let file_info = extract_file_info_from_path(&path).await?;
|
||||
// Extract basic file info first
|
||||
let mut file_info = extract_file_info_from_path(&path).await?;
|
||||
|
||||
// Skip very large files (> 100MB)
|
||||
if file_info.size > 100 * 1024 * 1024 {
|
||||
|
|
@ -242,6 +242,11 @@ async fn process_single_file(
|
|||
// Read file data
|
||||
let file_data = fs::read(&path).await?;
|
||||
|
||||
// Extract content-based metadata
|
||||
if let Ok(Some(content_metadata)) = crate::metadata_extraction::extract_content_metadata(&file_data, &file_info.mime_type, &file_info.name).await {
|
||||
file_info.metadata = Some(content_metadata);
|
||||
}
|
||||
|
||||
// Use the unified ingestion service with full metadata support
|
||||
let ingestion_service = DocumentIngestionService::new(db, file_service);
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
use anyhow::Result;
|
||||
use serde_json::{Map, Value};
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Extract metadata from file content based on file type
|
||||
pub async fn extract_content_metadata(file_data: &[u8], mime_type: &str, filename: &str) -> Result<Option<Value>> {
|
||||
|
|
@ -177,3 +176,6 @@ async fn extract_text_metadata(file_data: &[u8]) -> Result<Map<String, Value>> {
|
|||
|
||||
Ok(metadata)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
|
@ -0,0 +1,320 @@
|
|||
#[cfg(test)]
mod tests {
    //! Fixture-driven tests for `extract_content_metadata`.
    //!
    //! Fixture files are read from `test_files/`, resolved relative to the
    //! directory the test binary runs in.

    // The extractor is imported by its full path instead of `use super::*`:
    // if this file is itself loaded as a `tests` module, `super` would name
    // that outer module rather than `metadata_extraction`.
    use crate::metadata_extraction::extract_content_metadata;
    use serde_json::Value;
    use std::fs;

    /// Runs the extractor on in-memory bytes and unwraps both result layers,
    /// panicking if extraction fails or yields no metadata.
    async fn extract(data: &[u8], mime_type: &str, filename: &str) -> Value {
        extract_content_metadata(data, mime_type, filename)
            .await
            .expect("Failed to extract metadata")
            .expect("Extractor returned no metadata")
    }

    /// Reads `test_files/<filename>` and runs the extractor on its contents.
    async fn extract_fixture(filename: &str, mime_type: &str) -> Value {
        let path = format!("test_files/{}", filename);
        let data = fs::read(&path)
            .unwrap_or_else(|err| panic!("Failed to read test fixture {}: {}", path, err));
        extract(&data, mime_type, filename).await
    }

    /// Builds the JSON number the extractor is expected to report.
    fn number(n: u64) -> Value {
        Value::Number(n.into())
    }

    /// Builds the JSON string the extractor is expected to report.
    fn text(s: &str) -> Value {
        Value::String(s.to_string())
    }

    /// `serde_json::Value` has no `contains_key`; `get` is the key lookup.
    fn has_key(metadata: &Value, key: &str) -> bool {
        metadata.get(key).is_some()
    }

    /// Asserts `metadata[key] > floor`, but only when the key holds a number.
    /// A missing or non-numeric entry is tolerated on purpose: these fields
    /// are treated as best-effort by the tests.
    fn assert_above_if_number(metadata: &Value, key: &str, floor: u64) {
        if let Some(Value::Number(n)) = metadata.get(key) {
            let n = n
                .as_u64()
                .unwrap_or_else(|| panic!("{} should be a non-negative integer", key));
            assert!(n > floor, "{} = {} is not above {}", key, n, floor);
        }
    }

    /// Asserts `metadata[key] == expected`, but only when the key holds a string.
    fn assert_string_if_present(metadata: &Value, key: &str, expected: &str) {
        if let Some(Value::String(actual)) = metadata.get(key) {
            assert_eq!(actual.as_str(), expected, "unexpected value for {}", key);
        }
    }

    #[tokio::test]
    async fn test_image_metadata_extraction_portrait() {
        let metadata = extract_fixture("portrait_100x200.png", "image/png").await;

        // Check basic image properties
        assert_eq!(metadata["image_width"], number(100));
        assert_eq!(metadata["image_height"], number(200));
        assert_eq!(metadata["orientation"], text("portrait"));
        assert_eq!(metadata["file_extension"], text("png"));

        // Check calculated values
        assert_eq!(metadata["aspect_ratio"], text("0.50"));
        assert_eq!(metadata["megapixels"], text("0.0 MP"));
    }

    #[tokio::test]
    async fn test_image_metadata_extraction_landscape() {
        let metadata = extract_fixture("landscape_300x200.png", "image/png").await;

        assert_eq!(metadata["image_width"], number(300));
        assert_eq!(metadata["image_height"], number(200));
        assert_eq!(metadata["orientation"], text("landscape"));
        assert_eq!(metadata["aspect_ratio"], text("1.50"));
    }

    #[tokio::test]
    async fn test_image_metadata_extraction_square() {
        let metadata = extract_fixture("square_150x150.png", "image/png").await;

        assert_eq!(metadata["image_width"], number(150));
        assert_eq!(metadata["image_height"], number(150));
        assert_eq!(metadata["orientation"], text("square"));
        assert_eq!(metadata["aspect_ratio"], text("1.00"));
    }

    #[tokio::test]
    async fn test_image_metadata_extraction_high_resolution() {
        let metadata = extract_fixture("hires_1920x1080.png", "image/png").await;

        assert_eq!(metadata["image_width"], number(1920));
        assert_eq!(metadata["image_height"], number(1080));
        assert_eq!(metadata["orientation"], text("landscape"));
        assert_eq!(metadata["megapixels"], text("2.1 MP"));
    }

    #[tokio::test]
    async fn test_jpeg_metadata_extraction() {
        let metadata = extract_fixture("test_image.jpg", "image/jpeg").await;

        assert_eq!(metadata["file_extension"], text("jpg"));
        assert!(has_key(&metadata, "image_width"));
        assert!(has_key(&metadata, "image_height"));
    }

    #[tokio::test]
    async fn test_pdf_metadata_extraction_single_page() {
        let metadata = extract_fixture("single_page_v14.pdf", "application/pdf").await;

        assert_eq!(metadata["file_extension"], text("pdf"));
        // Note: PDF version detection might vary depending on how reportlab creates the file
        assert!(has_key(&metadata, "pdf_version") || has_key(&metadata, "file_type"));
    }

    #[tokio::test]
    async fn test_pdf_metadata_extraction_multipage() {
        let metadata = extract_fixture("multipage_test.pdf", "application/pdf").await;

        assert_eq!(metadata["file_extension"], text("pdf"));
        // Should detect multiple pages if our page counting works
        assert_above_if_number(&metadata, "page_count", 1);
    }

    #[tokio::test]
    async fn test_pdf_metadata_with_fonts_and_images() {
        let metadata = extract_fixture("complex_content.pdf", "application/pdf").await;

        assert_eq!(metadata["file_extension"], text("pdf"));
        // Font detection ("contains_fonts") might work depending on PDF
        // structure, so its value is intentionally not asserted here.
    }

    #[tokio::test]
    async fn test_text_metadata_extraction_comprehensive() {
        let metadata = extract_fixture("comprehensive_text.txt", "text/plain").await;

        assert_eq!(metadata["file_extension"], text("txt"));

        // Check text statistics
        assert_above_if_number(&metadata, "character_count", 500); // Should be substantial
        assert_above_if_number(&metadata, "word_count", 80); // Should have many words
        // The committed fixture is exactly 15 lines long, so require at least
        // 15 lines (strictly more than 15 could never hold).
        assert_above_if_number(&metadata, "line_count", 14);

        // Should detect Unicode content
        assert_eq!(metadata["contains_unicode"], Value::Bool(true));

        // Should detect likely English
        assert_string_if_present(&metadata, "likely_language", "english");
    }

    #[tokio::test]
    async fn test_text_metadata_extraction_ascii_only() {
        let metadata = extract_fixture("ascii_only.txt", "text/plain").await;

        // Should NOT contain Unicode: the flag is either absent or false
        assert!(matches!(
            metadata.get("contains_unicode"),
            None | Some(Value::Bool(false))
        ));
    }

    #[tokio::test]
    async fn test_text_metadata_extraction_large_file() {
        let metadata = extract_fixture("large_text.txt", "text/plain").await;

        // Should handle large files properly
        assert_above_if_number(&metadata, "character_count", 50000); // Should be large
        // The generator script repeats a 9-word sentence 1000 times and adds a
        // 7-word closing line (9007 words), so 10000 is out of reach.
        assert_above_if_number(&metadata, "word_count", 9000);
    }

    #[tokio::test]
    async fn test_json_format_detection() {
        let metadata = extract_fixture("test_format.json", "text/plain").await;

        assert_eq!(metadata["file_extension"], text("json"));
        // Should detect JSON format
        assert_string_if_present(&metadata, "text_format", "json");
    }

    #[tokio::test]
    async fn test_xml_format_detection() {
        let metadata = extract_fixture("test_format.xml", "text/plain").await;

        assert_eq!(metadata["file_extension"], text("xml"));
        // Should detect XML format
        assert_string_if_present(&metadata, "text_format", "xml");
    }

    #[tokio::test]
    async fn test_html_format_detection() {
        let metadata = extract_fixture("test_format.html", "text/plain").await;

        assert_eq!(metadata["file_extension"], text("html"));
        // Should detect HTML format
        assert_string_if_present(&metadata, "text_format", "html");
    }

    #[tokio::test]
    async fn test_unknown_file_type() {
        let dummy_data = b"This is some random binary data that doesn't match any known format.";

        let metadata = extract(dummy_data, "application/octet-stream", "unknown.bin").await;

        assert_eq!(metadata["file_type"], text("application/octet-stream"));
        assert_eq!(metadata["file_extension"], text("bin"));
    }

    #[tokio::test]
    async fn test_empty_file() {
        let empty_data = b"";

        // Should still return some metadata (at least file extension)
        let metadata = extract(empty_data, "text/plain", "empty.txt").await;

        assert_eq!(metadata["file_extension"], text("txt"));
    }

    #[tokio::test]
    async fn test_file_without_extension() {
        let text_data = b"Some text content without file extension";

        let metadata = extract(text_data, "text/plain", "no_extension").await;

        // Should not have file_extension field
        assert!(!has_key(&metadata, "file_extension"));
    }
}
|
||||
|
|
@ -78,7 +78,7 @@ pub async fn upload_document(
|
|||
use crate::models::FileIngestionInfo;
|
||||
use chrono::Utc;
|
||||
|
||||
let file_info = FileIngestionInfo {
|
||||
let mut file_info = FileIngestionInfo {
|
||||
path: format!("upload/{}", filename), // Virtual path for web uploads
|
||||
name: filename.clone(),
|
||||
size: data.len() as i64,
|
||||
|
|
@ -90,9 +90,14 @@ pub async fn upload_document(
|
|||
permissions: None, // Web uploads don't have filesystem permissions
|
||||
owner: Some(auth_user.user.username.clone()), // Uploader as owner
|
||||
group: None, // Web uploads don't have filesystem groups
|
||||
metadata: None, // Could extract EXIF/PDF metadata in the future
|
||||
metadata: None, // Will be populated with extracted metadata below
|
||||
};
|
||||
|
||||
// Extract content-based metadata from uploaded file
|
||||
if let Ok(Some(content_metadata)) = crate::metadata_extraction::extract_content_metadata(&data, &content_type, &filename).await {
|
||||
file_info.metadata = Some(content_metadata);
|
||||
}
|
||||
|
||||
// Create ingestion service
|
||||
let file_service = FileService::new(state.config.upload_path.clone());
|
||||
let ingestion_service = DocumentIngestionService::new(
|
||||
|
|
|
|||
|
|
@ -337,8 +337,13 @@ async fn process_file(
|
|||
}
|
||||
}
|
||||
|
||||
// Extract file info with metadata
|
||||
let file_info = extract_file_info_from_path(path).await?;
|
||||
// Extract basic file info first
|
||||
let mut file_info = extract_file_info_from_path(path).await?;
|
||||
|
||||
// Extract content-based metadata
|
||||
if let Ok(Some(content_metadata)) = crate::metadata_extraction::extract_content_metadata(&file_data, &file_info.mime_type, &file_info.name).await {
|
||||
file_info.metadata = Some(content_metadata);
|
||||
}
|
||||
|
||||
// Use the unified ingestion service for consistent deduplication
|
||||
let ingestion_service = DocumentIngestionService::new(db.clone(), file_service.clone());
|
||||
|
|
|
|||
|
|
@ -0,0 +1,5 @@
|
|||
Pure ASCII text document without any Unicode characters.
|
||||
This file contains only standard ASCII characters from the basic set.
|
||||
Numbers: 0123456789
|
||||
Punctuation: .,;:!?'"()-[]{}
|
||||
All characters should be ASCII-only for testing encoding detection.
|
||||
|
|
@ -0,0 +1,80 @@
|
|||
%PDF-1.3
|
||||
%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
|
||||
1 0 obj
|
||||
<<
|
||||
/F1 2 0 R /F2 3 0 R /F3 4 0 R
|
||||
>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
4 0 obj
|
||||
<<
|
||||
/BaseFont /Courier /Encoding /WinAnsiEncoding /Name /F3 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
5 0 obj
|
||||
<<
|
||||
/Contents 9 0 R /MediaBox [ 0 0 612 792 ] /Parent 8 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
6 0 obj
|
||||
<<
|
||||
/PageMode /UseNone /Pages 8 0 R /Type /Catalog
|
||||
>>
|
||||
endobj
|
||||
7 0 obj
|
||||
<<
|
||||
/Author (anonymous) /CreationDate (D:20250710214218+00'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20250710214218+00'00') /Producer (ReportLab PDF Library - www.reportlab.com)
|
||||
/Subject (unspecified) /Title (Complex PDF with Fonts) /Trapped /False
|
||||
>>
|
||||
endobj
|
||||
8 0 obj
|
||||
<<
|
||||
/Count 1 /Kids [ 5 0 R ] /Type /Pages
|
||||
>>
|
||||
endobj
|
||||
9 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 246
|
||||
>>
|
||||
stream
|
||||
Garp%aUZ01'Euj3^Z$lr70RKbDN26UQ2En"QspCZ"fK-0V7'.H&;JtnEutX^`S4<o?m.72F)V/*!70(%EeBWNg0B/(gO,TM=3`mZ1PT&X+>!djY>/-1Ll7*9[\kUD]$a]-(W-gT#W[42^Y:qOp/#=7!qndZ#V1iAYW[K/"S#OYOFENi#\m$3$pO%_hg)82^%7pMPXa0S88Np"d23mBn#h"bUhu6<Y6qlocNt;-:Db!E.?Zcu(gLV~>endstream
|
||||
endobj
|
||||
xref
|
||||
0 10
|
||||
0000000000 65535 f
|
||||
0000000073 00000 n
|
||||
0000000124 00000 n
|
||||
0000000231 00000 n
|
||||
0000000343 00000 n
|
||||
0000000448 00000 n
|
||||
0000000641 00000 n
|
||||
0000000709 00000 n
|
||||
0000001019 00000 n
|
||||
0000001078 00000 n
|
||||
trailer
|
||||
<<
|
||||
/ID
|
||||
[<915ab5b109826181e9414e12bf7351a6><915ab5b109826181e9414e12bf7351a6>]
|
||||
% ReportLab generated PDF document -- digest (http://www.reportlab.com)
|
||||
|
||||
/Info 7 0 R
|
||||
/Root 6 0 R
|
||||
/Size 10
|
||||
>>
|
||||
startxref
|
||||
1414
|
||||
%%EOF
|
||||
|
|
@ -0,0 +1,15 @@
|
|||
This is a comprehensive test document for text metadata extraction.
|
||||
|
||||
It contains multiple paragraphs, various types of content, and different characteristics.
|
||||
Word count: This sentence has exactly seven words counting properly.
|
||||
Line counting: Each line should be counted separately for accurate statistics.
|
||||
|
||||
Unicode content: café, naïve, résumé, piñata, Zürich, москва, 東京, 🎉✨🔥
|
||||
Numbers and mixed content: 123 ABC def456 GHI789 test@example.com
|
||||
|
||||
Special formatting:
|
||||
- Bulleted lists
|
||||
- Multiple items
|
||||
- With various content
|
||||
|
||||
The document ends here with a final paragraph.
|
||||
|
|
@ -0,0 +1,266 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Create test files for metadata extraction testing.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Try to import PIL for image creation
|
||||
try:
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
from PIL.ExifTags import TAGS
|
||||
from PIL.ExifTags import GPSTAGS
|
||||
PIL_AVAILABLE = True
|
||||
except ImportError:
|
||||
print("PIL not available, skipping image creation with EXIF")
|
||||
PIL_AVAILABLE = False
|
||||
|
||||
# Try to import reportlab for PDF creation
|
||||
try:
|
||||
from reportlab.pdfgen import canvas
|
||||
from reportlab.lib.pagesizes import letter, A4
|
||||
from reportlab.pdfbase import pdfmetrics
|
||||
from reportlab.pdfbase.ttfonts import TTFont
|
||||
REPORTLAB_AVAILABLE = True
|
||||
except ImportError:
|
||||
print("reportlab not available, creating simple PDF-like files")
|
||||
REPORTLAB_AVAILABLE = False
|
||||
|
||||
def create_test_images():
    """Write the PNG/JPEG image fixtures into test_files/.

    Prints a notice and returns without writing anything when Pillow could
    not be imported (PIL_AVAILABLE is False). The function does not create
    the test_files/ directory itself; main() does that before calling it.
    """
    if not PIL_AVAILABLE:
        print("Skipping image creation - PIL not available")
        return

    print("Creating test images...")

    # 1. Portrait image (100x200)
    img = Image.new('RGB', (100, 200), color='lightblue')
    draw = ImageDraw.Draw(img)
    draw.text((10, 50), "Portrait\n100x200", fill='black')
    img.save('test_files/portrait_100x200.png')

    # 2. Landscape image (300x200)
    img = Image.new('RGB', (300, 200), color='lightgreen')
    draw = ImageDraw.Draw(img)
    draw.text((50, 50), "Landscape 300x200", fill='black')
    img.save('test_files/landscape_300x200.png')

    # 3. Square image (150x150)
    img = Image.new('RGB', (150, 150), color='lightyellow')
    draw = ImageDraw.Draw(img)
    draw.text((25, 50), "Square\n150x150", fill='black')
    img.save('test_files/square_150x150.png')

    # 4. High resolution image (1920x1080)
    img = Image.new('RGB', (1920, 1080), color='lightcoral')
    draw = ImageDraw.Draw(img)
    try:
        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 40)
    except (OSError, ImportError):
        # The font file is missing/unreadable or FreeType support is absent:
        # fall back to Pillow's built-in font. A bare `except:` here would
        # also swallow KeyboardInterrupt and SystemExit.
        font = ImageFont.load_default()
    draw.text((100, 500), "High Resolution\n1920x1080\n2.07 Megapixels", fill='black', font=font)
    img.save('test_files/hires_1920x1080.png')

    # 5. Small image (50x50)
    img = Image.new('RGB', (50, 50), color='lightgray')
    img.save('test_files/small_50x50.png')

    # 6. JPEG with different color mode
    img = Image.new('RGB', (200, 200), color='purple')
    draw = ImageDraw.Draw(img)
    draw.text((50, 50), "JPEG\nTest", fill='white')
    img.save('test_files/test_image.jpg', 'JPEG')

    print("Created test images")
|
||||
|
||||
def create_test_pdfs():
    """Write the PDF fixtures into test_files/.

    With reportlab available, three real PDFs are generated (single page,
    three pages, and one mixing several fonts with simple graphics).
    Without it, three hand-written PDF-like files are written instead,
    under different file names. test_files/ must already exist.
    """
    if REPORTLAB_AVAILABLE:
        print("Creating test PDFs with reportlab...")

        # 1. Single page PDF v1.4
        single = canvas.Canvas('test_files/single_page_v14.pdf', pagesize=letter)
        single.setTitle("Single Page Test Document")
        single.setAuthor("Test Author")
        single.setSubject("Test Subject")
        single.setCreator("Python reportlab")
        single.setFont("Helvetica", 12)
        for y, line in (
            (750, "Single Page PDF Document"),
            (700, "This is a test PDF for metadata extraction."),
            (650, "It should be detected as PDF version 1.4"),
        ):
            single.drawString(100, y, line)
        single.save()

        # 2. Multi-page PDF
        multi = canvas.Canvas('test_files/multipage_test.pdf', pagesize=A4)
        multi.setTitle("Multi-page Test Document")
        multi.setFont("Helvetica", 14)
        pages = (
            ("Page 1 of Multi-page Document", "This document has multiple pages."),
            ("Page 2 of Multi-page Document", "Second page content here."),
            ("Page 3 - Final Page", "Third and final page."),
        )
        last_index = len(pages) - 1
        for index, (heading, body) in enumerate(pages):
            multi.drawString(100, 800, heading)
            multi.drawString(100, 750, body)
            # The final page is left open and closed by save() below.
            if index != last_index:
                multi.showPage()
        multi.save()

        # 3. PDF with fonts and complex content
        mixed = canvas.Canvas('test_files/complex_content.pdf', pagesize=letter)
        mixed.setTitle("Complex PDF with Fonts")
        for face, size, y, line in (
            ("Helvetica-Bold", 16, 750, "Document with Multiple Fonts"),
            ("Helvetica", 12, 700, "This document contains multiple font types."),
            ("Courier", 10, 650, "Some monospace text for variety."),
        ):
            mixed.setFont(face, size)
            mixed.drawString(100, y, line)
        # Add some graphics/lines
        mixed.line(100, 600, 500, 600)
        mixed.rect(100, 550, 200, 30)
        mixed.save()

        print("Created test PDFs")
    else:
        print("Creating simple PDF-like files...")
        # Minimal hand-written bodies that merely look like PDFs, keyed by file name
        fallback_bodies = {
            "simple_v14.pdf": "%PDF-1.4\n1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n>>\nendobj\n2 0 obj\n<<\n/Type /Pages\n/Kids [3 0 R]\n/Count 1\n>>\nendobj\n3 0 obj\n<<\n/Type /Page\n/Parent 2 0 R\n/MediaBox [0 0 612 792]\n>>\nendobj\nxref\n0 4\n0000000000 65535 f \n0000000009 00000 n \n0000000074 00000 n \n0000000120 00000 n \ntrailer\n<<\n/Size 4\n/Root 1 0 R\n>>\nstartxref\n179\n%%EOF",
            "multipage_v17.pdf": "%PDF-1.7\n1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n>>\nendobj\n2 0 obj\n<<\n/Type /Pages\n/Kids [3 0 R 4 0 R]\n/Count 2\n>>\nendobj\n3 0 obj\n<<\n/Type /Page\n/Parent 2 0 R\n>>\nendobj\n4 0 obj\n<<\n/Type /Page\n/Parent 2 0 R\n>>\nendobj\nxref\n0 5\ntrailer\n<<\n/Size 5\n/Root 1 0 R\n>>\n%%EOF",
            "with_fonts_images.pdf": "%PDF-1.5\n1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n/Linearized true\n>>\nendobj\n2 0 obj\n<<\n/Type /Pages\n/Kids [3 0 R]\n/Count 1\n>>\nendobj\n3 0 obj\n<<\n/Type /Page\n/Parent 2 0 R\n/Font 4 0 R\n/Image 5 0 R\n>>\nendobj\nxref\n0 4\ntrailer\n<<\n/Size 4\n/Root 1 0 R\n>>\n%%EOF",
        }

        for pdf_name, body in fallback_bodies.items():
            with open(f'test_files/{pdf_name}', 'wb') as handle:
                handle.write(body.encode('latin1'))
        print("Created simple PDF-like files")
|
||||
|
||||
def create_text_files():
    """Write the plain-text, JSON, XML and HTML fixtures into test_files/.

    Every file is written as UTF-8 so the bytes on disk do not depend on
    the platform's default encoding (the XML and HTML fixtures declare
    UTF-8 themselves). test_files/ must already exist; main() creates it.
    """
    print("Creating test text files...")

    def write_fixture(filename, text):
        # Single place that fixes the encoding for all text fixtures.
        with open(f'test_files/{filename}', 'w', encoding='utf-8') as f:
            f.write(text)

    # 1. Plain text with various content
    content = """This is a comprehensive test document for text metadata extraction.

It contains multiple paragraphs, various types of content, and different characteristics.
Word count: This sentence has exactly seven words counting properly.
Line counting: Each line should be counted separately for accurate statistics.

Unicode content: café, naïve, résumé, piñata, Zürich, москва, 東京, 🎉✨🔥
Numbers and mixed content: 123 ABC def456 GHI789 test@example.com

Special formatting:
- Bulleted lists
- Multiple items
- With various content

The document ends here with a final paragraph."""

    write_fixture('comprehensive_text.txt', content)

    # 2. JSON format text
    json_content = """{
"document": {
"title": "Test JSON Document",
"type": "metadata_test",
"properties": {
"word_count": 25,
"format": "json",
"encoding": "utf-8"
},
"content": [
"This JSON should be detected as JSON format",
"It contains structured data in JSON format"
]
}
}"""

    write_fixture('test_format.json', json_content)

    # 3. XML format text
    xml_content = """<?xml version="1.0" encoding="UTF-8"?>
<document type="test">
<metadata>
<title>XML Test Document</title>
<format>xml</format>
<word_count>15</word_count>
</metadata>
<content>
<section>This XML document should be detected as XML format.</section>
<section>It contains structured markup for testing.</section>
</content>
</document>"""

    write_fixture('test_format.xml', xml_content)

    # 4. HTML format text
    html_content = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>HTML Test Document</title>
</head>
<body>
<h1>HTML Test Page</h1>
<p>This document should be detected as HTML format.</p>
<p>It contains HTML markup and structure.</p>
<ul>
<li>List item one</li>
<li>List item two</li>
</ul>
</body>
</html>"""

    write_fixture('test_format.html', html_content)

    # 5. Large text file for performance testing
    # (a 9-word sentence repeated 1000 times plus a 7-word closing line)
    large_content = "This is a large text file for testing performance. " * 1000
    large_content += "\nEnd of large file with final line."

    write_fixture('large_text.txt', large_content)

    # 6. ASCII-only text
    ascii_content = """Pure ASCII text document without any Unicode characters.
This file contains only standard ASCII characters from the basic set.
Numbers: 0123456789
Punctuation: .,;:!?'"()-[]{}
All characters should be ASCII-only for testing encoding detection."""

    write_fixture('ascii_only.txt', ascii_content)

    print("Created test text files")
|
||||
|
||||
def main():
    """Generate every fixture, then list what ended up in test_files/."""
    # The generators write into test_files/ without creating it themselves
    os.makedirs('test_files', exist_ok=True)

    print("Creating test files for metadata extraction testing...")

    for generate in (create_text_files, create_test_images, create_test_pdfs):
        generate()

    print("\nAll test files created successfully!")
    print("Files created in test_files/ directory:")

    # Report each regular file with its size, leaving out Python scripts
    for entry in sorted(Path('test_files').glob('*')):
        if not entry.is_file() or entry.name.endswith('.py'):
            continue
        print(f" {entry.name} ({entry.stat().st_size} bytes)")


if __name__ == "__main__":
    main()
|
||||
|
|
@ -0,0 +1,42 @@
|
|||
// This is a helper script to create test images
|
||||
use image::{ImageBuffer, Rgb, DynamicImage};
|
||||
use std::path::Path;
|
||||
|
||||
pub fn create_test_images() -> Result<(), Box<dyn std::error::Error>> {
|
||||
// Create a simple 100x200 RGB image (portrait)
|
||||
let mut img = ImageBuffer::new(100, 200);
|
||||
for (x, y, pixel) in img.enumerate_pixels_mut() {
|
||||
let r = (x * 255 / 100) as u8;
|
||||
let g = (y * 255 / 200) as u8;
|
||||
let b = 128;
|
||||
*pixel = Rgb([r, g, b]);
|
||||
}
|
||||
|
||||
let dynamic_img = DynamicImage::ImageRgb8(img);
|
||||
dynamic_img.save("test_files/sample_portrait.png")?;
|
||||
|
||||
// Create a simple 300x200 RGB image (landscape)
|
||||
let mut img2 = ImageBuffer::new(300, 200);
|
||||
for (x, y, pixel) in img2.enumerate_pixels_mut() {
|
||||
let r = 255 - (x * 255 / 300) as u8;
|
||||
let g = (y * 255 / 200) as u8;
|
||||
let b = (x + y) as u8 % 255;
|
||||
*pixel = Rgb([r, g, b]);
|
||||
}
|
||||
|
||||
let dynamic_img2 = DynamicImage::ImageRgb8(img2);
|
||||
dynamic_img2.save("test_files/sample_landscape.png")?;
|
||||
|
||||
// Create a square image 150x150
|
||||
let mut img3 = ImageBuffer::new(150, 150);
|
||||
for (x, y, pixel) in img3.enumerate_pixels_mut() {
|
||||
let distance = ((x as i32 - 75).pow(2) + (y as i32 - 75).pow(2)) as f32;
|
||||
let intensity = (255.0 * (1.0 - distance / (75.0 * 75.0))).max(0.0) as u8;
|
||||
*pixel = Rgb([intensity, 0, 255 - intensity]);
|
||||
}
|
||||
|
||||
let dynamic_img3 = DynamicImage::ImageRgb8(img3);
|
||||
dynamic_img3.save("test_files/sample_square.png")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 25 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 2.0 KiB |
File diff suppressed because one or more lines are too long
|
|
@ -0,0 +1,106 @@
|
|||
%PDF-1.3
|
||||
%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
|
||||
1 0 obj
|
||||
<<
|
||||
/F1 2 0 R
|
||||
>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<<
|
||||
/Contents 9 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 8 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
4 0 obj
|
||||
<<
|
||||
/Contents 10 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 8 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
5 0 obj
|
||||
<<
|
||||
/Contents 11 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 8 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
6 0 obj
|
||||
<<
|
||||
/PageMode /UseNone /Pages 8 0 R /Type /Catalog
|
||||
>>
|
||||
endobj
|
||||
7 0 obj
|
||||
<<
|
||||
/Author (anonymous) /CreationDate (D:20250710214218+00'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20250710214218+00'00') /Producer (ReportLab PDF Library - www.reportlab.com)
|
||||
/Subject (unspecified) /Title (Multi-page Test Document) /Trapped /False
|
||||
>>
|
||||
endobj
|
||||
8 0 obj
|
||||
<<
|
||||
/Count 3 /Kids [ 3 0 R 4 0 R 5 0 R ] /Type /Pages
|
||||
>>
|
||||
endobj
|
||||
9 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 169
|
||||
>>
|
||||
stream
|
||||
GapQh0E=F,0U\H3T\pNYT^QKk?tc>IP,;W#U1^23ihPEM_?CT3!K3H3=WkWlTgfk;(arJdECd6a7R1F/.SqNX9u4:qV^,3M!5s*':G0mdbq%1@#g#sMdDDX6g_?JG,U9Wt+U+lN+X=6o+W"Z_5u+E]aJ<9.,U4#5!.\</LB~>endstream
|
||||
endobj
|
||||
10 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 155
|
||||
>>
|
||||
stream
|
||||
GapQh0E=F,0U\H3T\pNYT^QKk?tc>IP,;W#U1^23ihPEM_?CW4KISi90MntRifICKNI\t><!=M#r%nT7d[ZIoh#\/F,sSps2`7d,;^!DZ+[c]rG&pT<p>FH]r4E^IYV%'!"K4m8,IJq"%nWf'(B@ns.4,~>endstream
|
||||
endobj
|
||||
11 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 139
|
||||
>>
|
||||
stream
|
||||
GapQh0E=F,0U\H3T\pNYT^QKk?tc>IP,;W#U1^23ihPEM_?CW4KISi90MntRifICKNI\t><!?b8>m7^]]Ykcn&6DdR,YK:K$]??Q;i%<4N6J"1KuCV<Kf$$:!h_//,]*ln!4=[/%f~>endstream
|
||||
endobj
|
||||
xref
|
||||
0 12
|
||||
0000000000 65535 f
|
||||
0000000073 00000 n
|
||||
0000000104 00000 n
|
||||
0000000211 00000 n
|
||||
0000000414 00000 n
|
||||
0000000618 00000 n
|
||||
0000000822 00000 n
|
||||
0000000890 00000 n
|
||||
0000001202 00000 n
|
||||
0000001273 00000 n
|
||||
0000001532 00000 n
|
||||
0000001778 00000 n
|
||||
trailer
|
||||
<<
|
||||
/ID
|
||||
[<3b1faa812dbde8df21a25a9c074387ca><3b1faa812dbde8df21a25a9c074387ca>]
|
||||
% ReportLab generated PDF document -- digest (http://www.reportlab.com)
|
||||
|
||||
/Info 7 0 R
|
||||
/Root 6 0 R
|
||||
/Size 12
|
||||
>>
|
||||
startxref
|
||||
2008
|
||||
%%EOF
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 1.4 KiB |
|
|
@ -0,0 +1,13 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Sample HTML Document</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Test HTML File</h1>
|
||||
<p>This is a sample HTML document for testing format detection.</p>
|
||||
<p>It contains multiple paragraphs and should be detected as HTML.</p>
|
||||
</body>
|
||||
</html>
|
||||
|
|
@ -0,0 +1,12 @@
|
|||
{
|
||||
"name": "Test Document",
|
||||
"type": "sample",
|
||||
"metadata": {
|
||||
"created": "2024-01-01",
|
||||
"author": "Test User"
|
||||
},
|
||||
"content": [
|
||||
"This is a JSON file",
|
||||
"Used for testing text format detection"
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,8 @@
|
|||
This is a sample text file for testing metadata extraction.
|
||||
It contains multiple lines and various words.
|
||||
The quick brown fox jumps over the lazy dog.
|
||||
This file is used to test character count, word count, and line count extraction.
|
||||
|
||||
Some special characters: áéíóú, çñ, and emojis 🎉✨
|
||||
|
||||
This should help test Unicode detection as well.
|
||||
|
|
@ -0,0 +1,12 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<document>
|
||||
<title>Sample XML Document</title>
|
||||
<content>
|
||||
<paragraph>This is a sample XML file for testing.</paragraph>
|
||||
<paragraph>It should be detected as XML format.</paragraph>
|
||||
</content>
|
||||
<metadata>
|
||||
<author>Test User</author>
|
||||
<created>2024-01-01</created>
|
||||
</metadata>
|
||||
</document>
|
||||
|
|
@ -0,0 +1,68 @@
|
|||
%PDF-1.3
|
||||
%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
|
||||
1 0 obj
|
||||
<<
|
||||
/F1 2 0 R
|
||||
>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<<
|
||||
/Contents 7 0 R /MediaBox [ 0 0 612 792 ] /Parent 6 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
4 0 obj
|
||||
<<
|
||||
/PageMode /UseNone /Pages 6 0 R /Type /Catalog
|
||||
>>
|
||||
endobj
|
||||
5 0 obj
|
||||
<<
|
||||
/Author (Test Author) /CreationDate (D:20250710214218+00'00') /Creator (Python reportlab) /Keywords () /ModDate (D:20250710214218+00'00') /Producer (ReportLab PDF Library - www.reportlab.com)
|
||||
/Subject (Test Subject) /Title (Single Page Test Document) /Trapped /False
|
||||
>>
|
||||
endobj
|
||||
6 0 obj
|
||||
<<
|
||||
/Count 1 /Kids [ 3 0 R ] /Type /Pages
|
||||
>>
|
||||
endobj
|
||||
7 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 206
|
||||
>>
|
||||
stream
|
||||
Gas2B]aDVA&;9pC`KY*LD^B-O0d'R(h@#jac&^q+OP>SPmh&K#W6@$9_Y5bf'gkod%QJ.'$Nb1<.o;ODct=>@o&ifTJs=-$X.UIKf=%kE`l6H4ncH?T8&r:oU$p@/Dkq]C\nM9+)$gJ8i5uRL/B/S_i27!\Ph,TnDqT$"E>!rEO.F7L@9L)S[P2.<5'R?1"%om1'9X0s@Rg.~>endstream
|
||||
endobj
|
||||
xref
|
||||
0 8
|
||||
0000000000 65535 f
|
||||
0000000073 00000 n
|
||||
0000000104 00000 n
|
||||
0000000211 00000 n
|
||||
0000000404 00000 n
|
||||
0000000472 00000 n
|
||||
0000000763 00000 n
|
||||
0000000822 00000 n
|
||||
trailer
|
||||
<<
|
||||
/ID
|
||||
[<eb148edbf508f579851c4559676f2901><eb148edbf508f579851c4559676f2901>]
|
||||
% ReportLab generated PDF document -- digest (http://www.reportlab.com)
|
||||
|
||||
/Info 5 0 R
|
||||
/Root 4 0 R
|
||||
/Size 8
|
||||
>>
|
||||
startxref
|
||||
1118
|
||||
%%EOF
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 135 B |
Binary file not shown.
|
After Width: | Height: | Size: 1.5 KiB |
|
|
@ -0,0 +1,16 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>HTML Test Document</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>HTML Test Page</h1>
|
||||
<p>This document should be detected as HTML format.</p>
|
||||
<p>It contains HTML markup and structure.</p>
|
||||
<ul>
|
||||
<li>List item one</li>
|
||||
<li>List item two</li>
|
||||
</ul>
|
||||
</body>
|
||||
</html>
|
||||
|
|
@ -0,0 +1,15 @@
|
|||
{
|
||||
"document": {
|
||||
"title": "Test JSON Document",
|
||||
"type": "metadata_test",
|
||||
"properties": {
|
||||
"word_count": 25,
|
||||
"format": "json",
|
||||
"encoding": "utf-8"
|
||||
},
|
||||
"content": [
|
||||
"This JSON should be detected as JSON format",
|
||||
"It contains structured data in JSON format"
|
||||
]
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,12 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<document type="test">
|
||||
<metadata>
|
||||
<title>XML Test Document</title>
|
||||
<format>xml</format>
|
||||
<word_count>15</word_count>
|
||||
</metadata>
|
||||
<content>
|
||||
<section>This XML document should be detected as XML format.</section>
|
||||
<section>It contains structured markup for testing.</section>
|
||||
</content>
|
||||
</document>
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 1.7 KiB |
Loading…
Reference in New Issue