feat(tests): add file-size test PDFs (small, medium, large, oversized) and a size summary to the "create_test_pdfs.py" script
This commit is contained in:
parent
cd358f63f2
commit
83a55cbc32
|
|
@ -3,14 +3,14 @@
|
|||
Create proper test PDFs for debugging OCR word counting issues.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
try:
|
||||
from reportlab.pdfgen import canvas
|
||||
from reportlab.lib.pagesizes import letter
|
||||
import os
|
||||
except ImportError:
|
||||
print("reportlab not installed. Trying alternative method...")
|
||||
# Alternative: create simple text files for testing
|
||||
import os
|
||||
|
||||
def create_simple_test_files():
|
||||
"""Create simple text files as a fallback"""
|
||||
|
|
@ -20,7 +20,7 @@ except ImportError:
|
|||
# Test cases that would be similar to PDF extraction results
|
||||
test_cases = [
|
||||
("normal_spacing.txt", "This is a normal document with proper word spacing and punctuation."),
|
||||
("soclogix_sample.txt", "SOCLogix Non-Disclosure Agreement\nThis agreement is entered into between SOCLogix and the recipient for the purpose of protecting confidential information."),
|
||||
("acme_sample.txt", "ACME Non-Disclosure Agreement\nThis agreement is entered into between ACME and the recipient for the purpose of protecting confidential information."),
|
||||
("multiline_text.txt", "Line one with several words\nLine two with more content\nLine three continues the pattern\nFinal line ends the document"),
|
||||
("mixed_content.txt", "Document with numbers 123 and symbols @#$ mixed with normal text."),
|
||||
("special_chars.txt", "Text with special characters: café naïve résumé — and 'quotes' • bullets"),
|
||||
|
|
@ -42,8 +42,8 @@ def create_test_pdfs():
|
|||
test_dir = "tests/test_pdfs"
|
||||
os.makedirs(test_dir, exist_ok=True)
|
||||
|
||||
# Test case 1: Normal spacing (like SOCLogix NDA)
|
||||
pdf_path = f"{test_dir}/soclogix_nda_realistic.pdf"
|
||||
# Test case 1: Normal spacing (like ACME NDA)
|
||||
pdf_path = f"{test_dir}/acme_nda_realistic.pdf"
|
||||
c = canvas.Canvas(pdf_path, pagesize=letter)
|
||||
width, height = letter
|
||||
|
||||
|
|
@ -52,9 +52,9 @@ def create_test_pdfs():
|
|||
y_position = height - 100
|
||||
|
||||
lines = [
|
||||
"SOCLogix Non-Disclosure Agreement",
|
||||
"ACME Non-Disclosure Agreement",
|
||||
"",
|
||||
"This agreement is entered into between SOCLogix and the recipient",
|
||||
"This agreement is entered into between ACME and the recipient",
|
||||
"for the purpose of protecting confidential information.",
|
||||
"",
|
||||
"The recipient agrees to maintain strict confidentiality",
|
||||
|
|
@ -155,6 +155,127 @@ def create_test_pdfs():
|
|||
c.save()
|
||||
print(f"Created: {pdf_path}")
|
||||
|
||||
# Test case 4: Small file (< 1MB)
|
||||
pdf_path = f"{test_dir}/small_file.pdf"
|
||||
c = canvas.Canvas(pdf_path, pagesize=letter)
|
||||
c.setFont("Helvetica", 12)
|
||||
y_position = height - 100
|
||||
|
||||
small_lines = [
|
||||
"Small Test Document",
|
||||
"",
|
||||
"This is a small document for testing.",
|
||||
"It should be under 1MB in size.",
|
||||
"Perfect for basic upload testing.",
|
||||
]
|
||||
|
||||
for line in small_lines:
|
||||
if line:
|
||||
c.drawString(72, y_position, line)
|
||||
y_position -= 20
|
||||
|
||||
c.save()
|
||||
print(f"Created: {pdf_path}")
|
||||
|
||||
# Test case 5: Medium file (2-10MB) - Create with repeated content
|
||||
pdf_path = f"{test_dir}/medium_file.pdf"
|
||||
c = canvas.Canvas(pdf_path, pagesize=letter)
|
||||
c.setFont("Helvetica", 8)
|
||||
|
||||
# Create a 5MB file by adding many pages with lots of text
|
||||
repeated_text = "This is repeated content to make the file larger and test medium file uploads. " * 15
|
||||
for page_num in range(300): # More pages
|
||||
y_position = height - 30
|
||||
c.drawString(72, y_position, f"Page {page_num + 1}: Medium Size Test Document for Upload Testing")
|
||||
y_position -= 15
|
||||
|
||||
# Add much more content per page
|
||||
for i in range(50): # More lines per page
|
||||
if y_position < 30:
|
||||
break
|
||||
# Use longer text to increase file size
|
||||
line_text = f"Line {i + 1}: {repeated_text}"[:120]
|
||||
c.drawString(72, y_position, line_text)
|
||||
y_position -= 12
|
||||
|
||||
if page_num < 299:
|
||||
c.showPage()
|
||||
|
||||
c.save()
|
||||
print(f"Created: {pdf_path}")
|
||||
|
||||
# Test case 6: Large file (10-49MB) - Create with even more content
|
||||
pdf_path = f"{test_dir}/large_file.pdf"
|
||||
c = canvas.Canvas(pdf_path, pagesize=letter)
|
||||
c.setFont("Helvetica", 6) # Very small font to fit more
|
||||
|
||||
# Add many pages with very dense content to reach ~25MB
|
||||
dense_text = "Dense content for large file testing with lots of characters to increase file size significantly. " * 25
|
||||
for page_num in range(800): # Many more pages
|
||||
y_position = height - 20
|
||||
c.drawString(72, y_position, f"Page {page_num + 1}: Large File Test Document for Upload Testing - Should be around 25MB")
|
||||
y_position -= 12
|
||||
|
||||
# Add extremely dense content
|
||||
for i in range(80): # Maximum lines per page
|
||||
if y_position < 20:
|
||||
break
|
||||
line_text = f"{i + 1}: {dense_text}"[:150] # Long lines
|
||||
c.drawString(72, y_position, line_text)
|
||||
y_position -= 8 # Tight line spacing
|
||||
|
||||
if page_num < 799:
|
||||
c.showPage()
|
||||
|
||||
c.save()
|
||||
print(f"Created: {pdf_path}")
|
||||
|
||||
# Test case 7: Oversized file (> 50MB) - Should fail upload
|
||||
pdf_path = f"{test_dir}/oversized_file.pdf"
|
||||
c = canvas.Canvas(pdf_path, pagesize=letter)
|
||||
c.setFont("Helvetica", 5) # Very small font
|
||||
|
||||
# Create an extremely large file that exceeds the 50MB limit
|
||||
massive_text = "This file is designed to exceed the 50MB upload limit and should fail gracefully. " * 50
|
||||
for page_num in range(1500): # Many pages to exceed 50MB
|
||||
y_position = height - 15
|
||||
c.drawString(72, y_position, f"Page {page_num + 1}: Oversized Test Document - Should Fail Upload (Target: >50MB)")
|
||||
y_position -= 10
|
||||
|
||||
for i in range(100): # Maximum lines per page
|
||||
if y_position < 15:
|
||||
break
|
||||
line_text = f"{i + 1}: {massive_text}"[:160] # Very long lines
|
||||
c.drawString(72, y_position, line_text)
|
||||
y_position -= 7 # Very tight spacing
|
||||
|
||||
if page_num < 1499:
|
||||
c.showPage()
|
||||
|
||||
c.save()
|
||||
print(f"Created: {pdf_path}")
|
||||
|
||||
print("\n📊 File Size Summary:")
|
||||
print("=" * 50)
|
||||
|
||||
# Check actual file sizes
|
||||
test_files = [
|
||||
"small_file.pdf",
|
||||
"medium_file.pdf",
|
||||
"large_file.pdf",
|
||||
"oversized_file.pdf",
|
||||
"acme_nda_realistic.pdf",
|
||||
"multipage_realistic.pdf",
|
||||
"edge_cases_realistic.pdf"
|
||||
]
|
||||
|
||||
for filename in test_files:
|
||||
filepath = f"{test_dir}/{filename}"
|
||||
if os.path.exists(filepath):
|
||||
size_bytes = os.path.getsize(filepath)
|
||||
size_mb = size_bytes / (1024 * 1024)
|
||||
print(f"📄 {filename}: {size_mb:.2f} MB ({size_bytes:,} bytes)")
|
||||
|
||||
print("\nAll test PDFs created successfully!")
|
||||
return True
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue