From 75bb7c5dcf6c54e223ecd3d64e71ff691e2875d8 Mon Sep 17 00:00:00 2001 From: perfectra1n Date: Sat, 12 Jul 2025 14:05:19 -0700 Subject: [PATCH] feat(tests): add more functionality to the "create_test_pdfs.py" function --- create_test_pdfs.py | 135 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 128 insertions(+), 7 deletions(-) diff --git a/create_test_pdfs.py b/create_test_pdfs.py index d4055d3..5f67f84 100644 --- a/create_test_pdfs.py +++ b/create_test_pdfs.py @@ -3,14 +3,14 @@ Create proper test PDFs for debugging OCR word counting issues. """ +import os + try: from reportlab.pdfgen import canvas from reportlab.lib.pagesizes import letter - import os except ImportError: print("reportlab not installed. Trying alternative method...") # Alternative: create simple text files for testing - import os def create_simple_test_files(): """Create simple text files as a fallback""" @@ -20,7 +20,7 @@ except ImportError: # Test cases that would be similar to PDF extraction results test_cases = [ ("normal_spacing.txt", "This is a normal document with proper word spacing and punctuation."), - ("soclogix_sample.txt", "SOCLogix Non-Disclosure Agreement\nThis agreement is entered into between SOCLogix and the recipient for the purpose of protecting confidential information."), + ("acme_sample.txt", "ACME Non-Disclosure Agreement\nThis agreement is entered into between ACME and the recipient for the purpose of protecting confidential information."), ("multiline_text.txt", "Line one with several words\nLine two with more content\nLine three continues the pattern\nFinal line ends the document"), ("mixed_content.txt", "Document with numbers 123 and symbols @#$ mixed with normal text."), ("special_chars.txt", "Text with special characters: café naïve résumé — and 'quotes' • bullets"), @@ -42,8 +42,8 @@ def create_test_pdfs(): test_dir = "tests/test_pdfs" os.makedirs(test_dir, exist_ok=True) - # Test case 1: Normal spacing (like SOCLogix NDA) - pdf_path = f"{test_dir}/soclogix_nda_realistic.pdf" + # Test case 1: Normal spacing (like ACME NDA) + pdf_path = f"{test_dir}/acme_nda_realistic.pdf" c = canvas.Canvas(pdf_path, pagesize=letter) width, height = letter @@ -52,9 +52,9 @@ def create_test_pdfs(): y_position = height - 100 lines = [ - "SOCLogix Non-Disclosure Agreement", + "ACME Non-Disclosure Agreement", "", - "This agreement is entered into between SOCLogix and the recipient", + "This agreement is entered into between ACME and the recipient", "for the purpose of protecting confidential information.", "", "The recipient agrees to maintain strict confidentiality", @@ -155,6 +155,127 @@ def create_test_pdfs(): c.save() print(f"Created: {pdf_path}") + # Test case 4: Small file (< 1MB) + pdf_path = f"{test_dir}/small_file.pdf" + c = canvas.Canvas(pdf_path, pagesize=letter) + c.setFont("Helvetica", 12) + y_position = height - 100 + + small_lines = [ + "Small Test Document", + "", + "This is a small document for testing.", + "It should be under 1MB in size.", + "Perfect for basic upload testing.", + ] + + for line in small_lines: + if line: + c.drawString(72, y_position, line) + y_position -= 20 + + c.save() + print(f"Created: {pdf_path}") + + # Test case 5: Medium file (2-10MB) - Create with repeated content + pdf_path = f"{test_dir}/medium_file.pdf" + c = canvas.Canvas(pdf_path, pagesize=letter) + c.setFont("Helvetica", 8) + + # Create a 5MB file by adding many pages with lots of text + repeated_text = "This is repeated content to make the file larger and test medium file uploads. " * 15 + for page_num in range(300): # More pages + y_position = height - 30 + c.drawString(72, y_position, f"Page {page_num + 1}: Medium Size Test Document for Upload Testing") + y_position -= 15 + + # Add much more content per page + for i in range(50): # More lines per page + if y_position < 30: + break + # Use longer text to increase file size + line_text = f"Line {i + 1}: {repeated_text}"[:120] + c.drawString(72, y_position, line_text) + y_position -= 12 + + if page_num < 299: + c.showPage() + + c.save() + print(f"Created: {pdf_path}") + + # Test case 6: Large file (10-49MB) - Create with even more content + pdf_path = f"{test_dir}/large_file.pdf" + c = canvas.Canvas(pdf_path, pagesize=letter) + c.setFont("Helvetica", 6) # Very small font to fit more + + # Add many pages with very dense content to reach ~25MB + dense_text = "Dense content for large file testing with lots of characters to increase file size significantly. " * 25 + for page_num in range(800): # Many more pages + y_position = height - 20 + c.drawString(72, y_position, f"Page {page_num + 1}: Large File Test Document for Upload Testing - Should be around 25MB") + y_position -= 12 + + # Add extremely dense content + for i in range(80): # Maximum lines per page + if y_position < 20: + break + line_text = f"{i + 1}: {dense_text}"[:150] # Long lines + c.drawString(72, y_position, line_text) + y_position -= 8 # Tight line spacing + + if page_num < 799: + c.showPage() + + c.save() + print(f"Created: {pdf_path}") + + # Test case 7: Oversized file (> 50MB) - Should fail upload + pdf_path = f"{test_dir}/oversized_file.pdf" + c = canvas.Canvas(pdf_path, pagesize=letter) + c.setFont("Helvetica", 5) # Very small font + + # Create an extremely large file that exceeds the 50MB limit + massive_text = "This file is designed to exceed the 50MB upload limit and should fail gracefully. " * 50 + for page_num in range(1500): # Many pages to exceed 50MB + y_position = height - 15 + c.drawString(72, y_position, f"Page {page_num + 1}: Oversized Test Document - Should Fail Upload (Target: >50MB)") + y_position -= 10 + + for i in range(100): # Maximum lines per page + if y_position < 15: + break + line_text = f"{i + 1}: {massive_text}"[:160] # Very long lines + c.drawString(72, y_position, line_text) + y_position -= 7 # Very tight spacing + + if page_num < 1499: + c.showPage() + + c.save() + print(f"Created: {pdf_path}") + + print("\n📊 File Size Summary:") + print("=" * 50) + + # Check actual file sizes + test_files = [ + "small_file.pdf", + "medium_file.pdf", + "large_file.pdf", + "oversized_file.pdf", + "acme_nda_realistic.pdf", + "multipage_realistic.pdf", + "edge_cases_realistic.pdf" + ] + + for filename in test_files: + filepath = f"{test_dir}/{filename}" + if os.path.exists(filepath): + size_bytes = os.path.getsize(filepath) + size_mb = size_bytes / (1024 * 1024) + print(f"📄 {filename}: {size_mb:.2f} MB ({size_bytes:,} bytes)") + print("\nAll test PDFs created successfully!") return True