Readur/tests/e2e_ocr_test.sh

182 lines
5.4 KiB
Bash
Executable File

#!/bin/bash
set -e
echo "🔍 End-to-End OCR Test"
echo "====================="
BASE_URL="http://localhost:8081"
TEST_USER="testuser"
TEST_EMAIL="test@example.com"
TEST_PASSWORD="password123"
# Function to make authenticated API calls
api_call() {
local method=$1
local endpoint=$2
local data=$3
local content_type=${4:-"application/json"}
if [ -n "$data" ]; then
curl -s -X "$method" \
-H "Content-Type: $content_type" \
-H "Authorization: Bearer $AUTH_TOKEN" \
-d "$data" \
"$BASE_URL$endpoint"
else
curl -s -X "$method" \
-H "Authorization: Bearer $AUTH_TOKEN" \
"$BASE_URL$endpoint"
fi
}
echo "1. Creating test user..."
curl -s -X POST \
-H "Content-Type: application/json" \
-d "{\"username\":\"$TEST_USER\",\"email\":\"$TEST_EMAIL\",\"password\":\"$TEST_PASSWORD\"}" \
"$BASE_URL/api/auth/register" > /dev/null
echo "2. Logging in..."
LOGIN_RESPONSE=$(curl -s -X POST \
-H "Content-Type: application/json" \
-d "{\"username\":\"$TEST_USER\",\"password\":\"$TEST_PASSWORD\"}" \
"$BASE_URL/api/auth/login")
AUTH_TOKEN=$(echo "$LOGIN_RESPONSE" | jq -r '.token')
if [ "$AUTH_TOKEN" = "null" ] || [ -z "$AUTH_TOKEN" ]; then
echo "❌ Failed to get authentication token"
exit 1
fi
echo "✅ Authentication successful"
echo "3. Creating test image with text..."
# Create a simple text image for OCR testing
cat > /tmp/test_text.txt << 'EOF'
This is a test document for OCR processing.
It contains multiple lines of text.
The OCR service should extract this text accurately.
Document ID: TEST-001
Date: 2024-01-01
EOF
echo "4. Uploading test document..."
UPLOAD_RESPONSE=$(curl -s -X POST \
-H "Authorization: Bearer $AUTH_TOKEN" \
-F "file=@/tmp/test_text.txt" \
"$BASE_URL/api/documents")
DOCUMENT_ID=$(echo "$UPLOAD_RESPONSE" | jq -r '.id')
if [ "$DOCUMENT_ID" = "null" ] || [ -z "$DOCUMENT_ID" ]; then
echo "❌ Failed to upload document"
echo "Response: $UPLOAD_RESPONSE"
exit 1
fi
echo "✅ Document uploaded with ID: $DOCUMENT_ID"
echo "5. Waiting for OCR processing..."
# Poll the document to check OCR status
max_attempts=30
attempt=0
while [ $attempt -lt $max_attempts ]; do
DOCUMENTS_RESPONSE=$(api_call "GET" "/api/documents")
OCR_STATUS=$(echo "$DOCUMENTS_RESPONSE" | jq -r ".[] | select(.id==\"$DOCUMENT_ID\") | .ocr_status")
if [ "$OCR_STATUS" = "completed" ]; then
echo "✅ OCR processing completed"
break
elif [ "$OCR_STATUS" = "failed" ]; then
echo "❌ OCR processing failed"
exit 1
fi
echo "⏳ OCR status: $OCR_STATUS (attempt $((attempt + 1))/$max_attempts)"
sleep 2
attempt=$((attempt + 1))
done
if [ $attempt -eq $max_attempts ]; then
echo "❌ OCR processing timed out"
exit 1
fi
echo "6. Retrieving OCR text..."
OCR_RESPONSE=$(api_call "GET" "/api/documents/$DOCUMENT_ID/ocr")
# Verify OCR response structure
HAS_OCR_TEXT=$(echo "$OCR_RESPONSE" | jq -r '.has_ocr_text')
OCR_TEXT=$(echo "$OCR_RESPONSE" | jq -r '.ocr_text')
OCR_CONFIDENCE=$(echo "$OCR_RESPONSE" | jq -r '.ocr_confidence')
OCR_WORD_COUNT=$(echo "$OCR_RESPONSE" | jq -r '.ocr_word_count')
echo "7. Validating OCR results..."
if [ "$HAS_OCR_TEXT" != "true" ]; then
echo "❌ Expected has_ocr_text to be true, got: $HAS_OCR_TEXT"
exit 1
fi
if [ "$OCR_TEXT" = "null" ] || [ -z "$OCR_TEXT" ]; then
echo "❌ OCR text is empty or null"
exit 1
fi
if ! echo "$OCR_TEXT" | grep -q "test document"; then
echo "❌ OCR text does not contain expected content"
echo "OCR Text: $OCR_TEXT"
exit 1
fi
echo "✅ OCR text contains expected content"
if [ "$OCR_CONFIDENCE" != "null" ] && [ -n "$OCR_CONFIDENCE" ]; then
# Check if confidence is a reasonable number (0-100)
if (( $(echo "$OCR_CONFIDENCE >= 0 && $OCR_CONFIDENCE <= 100" | bc -l) )); then
echo "✅ OCR confidence is valid: $OCR_CONFIDENCE%"
else
echo "⚠️ OCR confidence seems unusual: $OCR_CONFIDENCE%"
fi
fi
if [ "$OCR_WORD_COUNT" != "null" ] && [ "$OCR_WORD_COUNT" -gt 0 ]; then
echo "✅ OCR word count is valid: $OCR_WORD_COUNT words"
else
echo "⚠️ OCR word count is missing or zero: $OCR_WORD_COUNT"
fi
echo "8. Testing OCR endpoint error handling..."
# Test with non-existent document
NON_EXISTENT_ID="00000000-0000-0000-0000-000000000000"
ERROR_RESPONSE=$(curl -s -w "%{http_code}" -X GET \
-H "Authorization: Bearer $AUTH_TOKEN" \
"$BASE_URL/api/documents/$NON_EXISTENT_ID/ocr")
HTTP_CODE=$(echo "$ERROR_RESPONSE" | tail -c 4)
if [ "$HTTP_CODE" = "404" ]; then
echo "✅ OCR endpoint correctly returns 404 for non-existent document"
else
echo "⚠️ Expected 404 for non-existent document, got: $HTTP_CODE"
fi
echo ""
echo "🎉 End-to-End OCR Test Completed Successfully!"
echo "==============================================="
echo "✅ User registration and login"
echo "✅ Document upload"
echo "✅ OCR processing completion"
echo "✅ OCR text retrieval via API"
echo "✅ OCR response validation"
echo "✅ Error handling"
echo ""
echo "OCR Results Summary:"
echo "- Document ID: $DOCUMENT_ID"
echo "- Has OCR Text: $HAS_OCR_TEXT"
echo "- OCR Confidence: $OCR_CONFIDENCE%"
echo "- Word Count: $OCR_WORD_COUNT"
echo "- Text Preview: $(echo "$OCR_TEXT" | head -c 100)..."