endstream
+endobj
+xref
+0 12
+0000000000 65535 f
+0000000073 00000 n
+0000000104 00000 n
+0000000211 00000 n
+0000000414 00000 n
+0000000618 00000 n
+0000000822 00000 n
+0000000890 00000 n
+0000001202 00000 n
+0000001273 00000 n
+0000001532 00000 n
+0000001778 00000 n
+trailer
+<<
+/ID
+[<3b1faa812dbde8df21a25a9c074387ca><3b1faa812dbde8df21a25a9c074387ca>]
+% ReportLab generated PDF document -- digest (http://www.reportlab.com)
+
+/Info 7 0 R
+/Root 6 0 R
+/Size 12
+>>
+startxref
+2008
+%%EOF
diff --git a/test_files/portrait_100x200.png b/test_files/portrait_100x200.png
new file mode 100644
index 0000000..7e3a61c
Binary files /dev/null and b/test_files/portrait_100x200.png differ
diff --git a/test_files/sample.html b/test_files/sample.html
new file mode 100644
index 0000000..72b449d
--- /dev/null
+++ b/test_files/sample.html
@@ -0,0 +1,13 @@
+
+
+
+
+
+ Sample HTML Document
+
+
+ Test HTML File
+ This is a sample HTML document for testing format detection.
+ It contains multiple paragraphs and should be detected as HTML.
+
+
\ No newline at end of file
diff --git a/test_files/sample.json b/test_files/sample.json
new file mode 100644
index 0000000..8011f29
--- /dev/null
+++ b/test_files/sample.json
@@ -0,0 +1,12 @@
+{
+ "name": "Test Document",
+ "type": "sample",
+ "metadata": {
+ "created": "2024-01-01",
+ "author": "Test User"
+ },
+ "content": [
+ "This is a JSON file",
+ "Used for testing text format detection"
+ ]
+}
\ No newline at end of file
diff --git a/test_files/sample.txt b/test_files/sample.txt
new file mode 100644
index 0000000..37130f7
--- /dev/null
+++ b/test_files/sample.txt
@@ -0,0 +1,8 @@
+This is a sample text file for testing metadata extraction.
+It contains multiple lines and various words.
+The quick brown fox jumps over the lazy dog.
+This file is used to test character count, word count, and line count extraction.
+
+Some special characters: áéíóú, çñ, and emojis 🎉✨
+
+This should help test Unicode detection as well.
\ No newline at end of file
diff --git a/test_files/sample.xml b/test_files/sample.xml
new file mode 100644
index 0000000..6d5773c
--- /dev/null
+++ b/test_files/sample.xml
@@ -0,0 +1,12 @@
+
+
+ Sample XML Document
+
+ This is a sample XML file for testing.
+ It should be detected as XML format.
+
+
+ Test User
+ 2024-01-01
+
+
\ No newline at end of file
diff --git a/test_files/single_page_v14.pdf b/test_files/single_page_v14.pdf
new file mode 100644
index 0000000..8db1349
--- /dev/null
+++ b/test_files/single_page_v14.pdf
@@ -0,0 +1,68 @@
+%PDF-1.3
+% ReportLab Generated PDF document http://www.reportlab.com
+1 0 obj
+<<
+/F1 2 0 R
+>>
+endobj
+2 0 obj
+<<
+/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
+>>
+endobj
+3 0 obj
+<<
+/Contents 7 0 R /MediaBox [ 0 0 612 792 ] /Parent 6 0 R /Resources <<
+/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
+>> /Rotate 0 /Trans <<
+
+>>
+ /Type /Page
+>>
+endobj
+4 0 obj
+<<
+/PageMode /UseNone /Pages 6 0 R /Type /Catalog
+>>
+endobj
+5 0 obj
+<<
+/Author (Test Author) /CreationDate (D:20250710214218+00'00') /Creator (Python reportlab) /Keywords () /ModDate (D:20250710214218+00'00') /Producer (ReportLab PDF Library - www.reportlab.com)
+ /Subject (Test Subject) /Title (Single Page Test Document) /Trapped /False
+>>
+endobj
+6 0 obj
+<<
+/Count 1 /Kids [ 3 0 R ] /Type /Pages
+>>
+endobj
+7 0 obj
+<<
+/Filter [ /ASCII85Decode /FlateDecode ] /Length 206
+>>
+stream
+Gas2B]aDVA&;9pC`KY*LD^B-O0d'R(h@#jac&^q+OP>SPmh&K#W6@$9_Y5bf'gkod%QJ.'$Nb1<.o;ODct=>@o&ifTJs=-$X.UIKf=%kE`l6H4ncH?T8&r:oU$p@/Dkq]C\nM9+)$gJ8i5uRL/B/S_i27!\Ph,TnDqT$"E>!rEO.F7L@9L)S[P2.<5'R?1"%om1'9X0s@Rg.~>endstream
+endobj
+xref
+0 8
+0000000000 65535 f
+0000000073 00000 n
+0000000104 00000 n
+0000000211 00000 n
+0000000404 00000 n
+0000000472 00000 n
+0000000763 00000 n
+0000000822 00000 n
+trailer
+<<
+/ID
+[]
+% ReportLab generated PDF document -- digest (http://www.reportlab.com)
+
+/Info 5 0 R
+/Root 4 0 R
+/Size 8
+>>
+startxref
+1118
+%%EOF
diff --git a/test_files/small_50x50.png b/test_files/small_50x50.png
new file mode 100644
index 0000000..68d8b58
Binary files /dev/null and b/test_files/small_50x50.png differ
diff --git a/test_files/square_150x150.png b/test_files/square_150x150.png
new file mode 100644
index 0000000..b528623
Binary files /dev/null and b/test_files/square_150x150.png differ
diff --git a/test_files/test_format.html b/test_files/test_format.html
new file mode 100644
index 0000000..847d551
--- /dev/null
+++ b/test_files/test_format.html
@@ -0,0 +1,16 @@
+
+
+
+
+ HTML Test Document
+
+
+ HTML Test Page
+ This document should be detected as HTML format.
+ It contains HTML markup and structure.
+
+ - List item one
+ - List item two
+
+
+
\ No newline at end of file
diff --git a/test_files/test_format.json b/test_files/test_format.json
new file mode 100644
index 0000000..f3ca405
--- /dev/null
+++ b/test_files/test_format.json
@@ -0,0 +1,15 @@
+{
+ "document": {
+ "title": "Test JSON Document",
+ "type": "metadata_test",
+ "properties": {
+ "word_count": 25,
+ "format": "json",
+ "encoding": "utf-8"
+ },
+ "content": [
+ "This JSON should be detected as JSON format",
+ "It contains structured data in JSON format"
+ ]
+ }
+}
\ No newline at end of file
diff --git a/test_files/test_format.xml b/test_files/test_format.xml
new file mode 100644
index 0000000..f34e922
--- /dev/null
+++ b/test_files/test_format.xml
@@ -0,0 +1,12 @@
+
+
+
+ XML Test Document
+ xml
+ 15
+
+
+ This XML document should be detected as XML format.
+ It contains structured markup for testing.
+
+
\ No newline at end of file
diff --git a/test_files/test_image.jpg b/test_files/test_image.jpg
new file mode 100644
index 0000000..852dd18
Binary files /dev/null and b/test_files/test_image.jpg differ