feat(tests): add more functionality to the "create_test_pdfs.py" script

This commit is contained in:
perfectra1n 2025-07-12 14:05:19 -07:00
parent 9e143649d4
commit 75bb7c5dcf
1 changed file with 128 additions and 7 deletions

View File

@@ -3,14 +3,14 @@
Create proper test PDFs for debugging OCR word counting issues.
"""
import os
try:
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
import os
except ImportError:
print("reportlab not installed. Trying alternative method...")
# Alternative: create simple text files for testing
import os
def create_simple_test_files():
"""Create simple text files as a fallback"""
@@ -20,7 +20,7 @@ except ImportError:
# Test cases that would be similar to PDF extraction results
test_cases = [
("normal_spacing.txt", "This is a normal document with proper word spacing and punctuation."),
("soclogix_sample.txt", "SOCLogix Non-Disclosure Agreement\nThis agreement is entered into between SOCLogix and the recipient for the purpose of protecting confidential information."),
("acme_sample.txt", "ACME Non-Disclosure Agreement\nThis agreement is entered into between ACME and the recipient for the purpose of protecting confidential information."),
("multiline_text.txt", "Line one with several words\nLine two with more content\nLine three continues the pattern\nFinal line ends the document"),
("mixed_content.txt", "Document with numbers 123 and symbols @#$ mixed with normal text."),
("special_chars.txt", "Text with special characters: café naïve résumé — and 'quotes' • bullets"),
@@ -42,8 +42,8 @@ def create_test_pdfs():
test_dir = "tests/test_pdfs"
os.makedirs(test_dir, exist_ok=True)
# Test case 1: Normal spacing (like SOCLogix NDA)
pdf_path = f"{test_dir}/soclogix_nda_realistic.pdf"
# Test case 1: Normal spacing (like ACME NDA)
pdf_path = f"{test_dir}/acme_nda_realistic.pdf"
c = canvas.Canvas(pdf_path, pagesize=letter)
width, height = letter
@@ -52,9 +52,9 @@ def create_test_pdfs():
y_position = height - 100
lines = [
"SOCLogix Non-Disclosure Agreement",
"ACME Non-Disclosure Agreement",
"",
"This agreement is entered into between SOCLogix and the recipient",
"This agreement is entered into between ACME and the recipient",
"for the purpose of protecting confidential information.",
"",
"The recipient agrees to maintain strict confidentiality",
@@ -155,6 +155,127 @@ def create_test_pdfs():
c.save()
print(f"Created: {pdf_path}")
# Test case 4: Small file (< 1MB)
pdf_path = f"{test_dir}/small_file.pdf"
c = canvas.Canvas(pdf_path, pagesize=letter)
c.setFont("Helvetica", 12)
y_position = height - 100
small_lines = [
"Small Test Document",
"",
"This is a small document for testing.",
"It should be under 1MB in size.",
"Perfect for basic upload testing.",
]
for line in small_lines:
if line:
c.drawString(72, y_position, line)
y_position -= 20
c.save()
print(f"Created: {pdf_path}")
# Test case 5: Medium file (2-10MB) - Create with repeated content
pdf_path = f"{test_dir}/medium_file.pdf"
c = canvas.Canvas(pdf_path, pagesize=letter)
c.setFont("Helvetica", 8)
# Create a 5MB file by adding many pages with lots of text
repeated_text = "This is repeated content to make the file larger and test medium file uploads. " * 15
for page_num in range(300): # More pages
y_position = height - 30
c.drawString(72, y_position, f"Page {page_num + 1}: Medium Size Test Document for Upload Testing")
y_position -= 15
# Add much more content per page
for i in range(50): # More lines per page
if y_position < 30:
break
# Use longer text to increase file size
line_text = f"Line {i + 1}: {repeated_text}"[:120]
c.drawString(72, y_position, line_text)
y_position -= 12
if page_num < 299:
c.showPage()
c.save()
print(f"Created: {pdf_path}")
# Test case 6: Large file (10-49MB) - Create with even more content
pdf_path = f"{test_dir}/large_file.pdf"
c = canvas.Canvas(pdf_path, pagesize=letter)
c.setFont("Helvetica", 6) # Very small font to fit more
# Add many pages with very dense content to reach ~25MB
dense_text = "Dense content for large file testing with lots of characters to increase file size significantly. " * 25
for page_num in range(800): # Many more pages
y_position = height - 20
c.drawString(72, y_position, f"Page {page_num + 1}: Large File Test Document for Upload Testing - Should be around 25MB")
y_position -= 12
# Add extremely dense content
for i in range(80): # Maximum lines per page
if y_position < 20:
break
line_text = f"{i + 1}: {dense_text}"[:150] # Long lines
c.drawString(72, y_position, line_text)
y_position -= 8 # Tight line spacing
if page_num < 799:
c.showPage()
c.save()
print(f"Created: {pdf_path}")
# Test case 7: Oversized file (> 50MB) - Should fail upload
pdf_path = f"{test_dir}/oversized_file.pdf"
c = canvas.Canvas(pdf_path, pagesize=letter)
c.setFont("Helvetica", 5) # Very small font
# Create an extremely large file that exceeds the 50MB limit
massive_text = "This file is designed to exceed the 50MB upload limit and should fail gracefully. " * 50
for page_num in range(1500): # Many pages to exceed 50MB
y_position = height - 15
c.drawString(72, y_position, f"Page {page_num + 1}: Oversized Test Document - Should Fail Upload (Target: >50MB)")
y_position -= 10
for i in range(100): # Maximum lines per page
if y_position < 15:
break
line_text = f"{i + 1}: {massive_text}"[:160] # Very long lines
c.drawString(72, y_position, line_text)
y_position -= 7 # Very tight spacing
if page_num < 1499:
c.showPage()
c.save()
print(f"Created: {pdf_path}")
print("\n📊 File Size Summary:")
print("=" * 50)
# Check actual file sizes
test_files = [
"small_file.pdf",
"medium_file.pdf",
"large_file.pdf",
"oversized_file.pdf",
"acme_nda_realistic.pdf",
"multipage_realistic.pdf",
"edge_cases_realistic.pdf"
]
for filename in test_files:
filepath = f"{test_dir}/{filename}"
if os.path.exists(filepath):
size_bytes = os.path.getsize(filepath)
size_mb = size_bytes / (1024 * 1024)
print(f"📄 {filename}: {size_mb:.2f} MB ({size_bytes:,} bytes)")
print("\nAll test PDFs created successfully!")
return True