feat(client/server): update search tests, and upgrade OCR

This commit is contained in:
perfectra1n 2025-06-12 22:02:26 -07:00
parent 1a1f886f04
commit d5f419ca18
3 changed files with 173 additions and 18 deletions

57
apply_migrations.sh Executable file
View File

@ -0,0 +1,57 @@
#!/bin/bash
# Apply database migrations for enhanced OCR.
#
# Usage: ./apply_migrations.sh [database_url]
#
# The database URL is resolved in this order: first CLI argument, then the
# DATABASE_URL environment variable, then postgresql://localhost/readur.
# The migration file path is relative, so run this script from the directory
# that contains migrations/.
#
# Exit status: 0 when the migration is present and verified; non-zero when a
# psql invocation fails or the verification query does not find the new columns.
set -e

# Default database URL from environment or use provided argument
DATABASE_URL=${1:-${DATABASE_URL:-"postgresql://localhost/readur"}}
MIGRATION_FILE="migrations/002_add_enhanced_ocr_fields.sql"

# NOTE(review): this prints the full URL, including any embedded password — confirm
# that is acceptable for wherever this output is logged.
echo "Applying migrations to: $DATABASE_URL"

# Apply migration 002 if it hasn't been applied yet
echo "Checking if migration 002_add_enhanced_ocr_fields.sql needs to be applied..."

# Count how many of the three new documents columns already exist
COLUMNS_EXIST=$(psql "$DATABASE_URL" -t -c "
SELECT COUNT(*)
FROM information_schema.columns
WHERE table_name = 'documents'
AND column_name IN ('ocr_confidence', 'ocr_word_count', 'ocr_processing_time_ms');
")

if [[ "$COLUMNS_EXIST" -eq 3 ]]; then
    echo "Enhanced OCR fields already exist. Migration already applied."
else
    echo "Applying migration 002_add_enhanced_ocr_fields.sql..."
    # ON_ERROR_STOP makes psql abort and exit non-zero on the first failing
    # statement. Without it, psql -f carries on past errors and exits 0, so
    # `set -e` would never fire and success would be reported for a broken run.
    psql "$DATABASE_URL" -v ON_ERROR_STOP=1 -f "$MIGRATION_FILE"
    echo "Migration 002 applied successfully!"
fi

# Verify the migration was successful: one marker column per affected table
echo "Verifying migration..."
VERIFICATION=$(psql "$DATABASE_URL" -t -c "
SELECT
(SELECT COUNT(*) FROM information_schema.columns WHERE table_name = 'documents' AND column_name = 'ocr_confidence') as doc_cols,
(SELECT COUNT(*) FROM information_schema.columns WHERE table_name = 'settings' AND column_name = 'ocr_page_segmentation_mode') as settings_cols;
")
echo "Migration verification: $VERIFICATION"

# psql -t pads the row with spaces (e.g. " 1 | 1"); drop the whitespace and
# split on the column separator so each count can be checked on its own.
IFS='|' read -r DOC_COLS SETTINGS_COLS <<< "${VERIFICATION//[[:space:]]/}"

# Both counts must be positive integers. A loose pattern such as "1.*1" would
# also accept outputs like "11 | 0", where one of the tables has no new column.
if [[ "$DOC_COLS" =~ ^[1-9][0-9]*$ && "$SETTINGS_COLS" =~ ^[1-9][0-9]*$ ]]; then
    echo "✅ Enhanced OCR migration completed successfully!"
    echo ""
    echo "New features available:"
    echo "- OCR confidence scoring and quality validation"
    echo "- Advanced image preprocessing for challenging images"
    echo "- Configurable Tesseract PSM and OEM settings"
    echo "- Intelligent brightness/contrast enhancement"
    echo "- Adaptive noise removal and sharpening"
    echo "- OCR analytics and monitoring"
    echo ""
    echo "You can now restart your Readur server to use the enhanced OCR features."
else
    echo "❌ Migration verification failed. Please check the logs above."
    exit 1
fi

View File

@ -0,0 +1,70 @@
-- Migration 002: enhanced OCR metadata and configuration.
-- Adds OCR result columns to documents, OCR tuning columns to settings,
-- two partial indexes on documents, and the ocr_analytics view.
-- Column and index creation use IF NOT EXISTS and the view uses
-- CREATE OR REPLACE, so those statements do not fail when re-run.
-- Add enhanced OCR metadata fields to documents table (nullable, no default)
ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_confidence REAL;
ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_word_count INT;
ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_processing_time_ms INT;
-- Add enhanced OCR configuration fields to settings table.
-- Every column except the two character lists carries a DEFAULT.
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_page_segmentation_mode INT DEFAULT 3;
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_engine_mode INT DEFAULT 3;
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_min_confidence REAL DEFAULT 30.0;
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_dpi INT DEFAULT 300;
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_enhance_contrast BOOLEAN DEFAULT true;
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_remove_noise BOOLEAN DEFAULT true;
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_detect_orientation BOOLEAN DEFAULT true;
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_whitelist_chars TEXT;
ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_blacklist_chars TEXT;
-- Attach catalog comments to each new column for documentation
COMMENT ON COLUMN documents.ocr_confidence IS 'OCR confidence score (0-100)';
COMMENT ON COLUMN documents.ocr_word_count IS 'Number of words extracted by OCR';
COMMENT ON COLUMN documents.ocr_processing_time_ms IS 'Time taken for OCR processing in milliseconds';
COMMENT ON COLUMN settings.ocr_page_segmentation_mode IS 'Tesseract Page Segmentation Mode (0-13), default 3=PSM_AUTO';
COMMENT ON COLUMN settings.ocr_engine_mode IS 'Tesseract OCR Engine Mode (0-3), default 3=OEM_DEFAULT';
COMMENT ON COLUMN settings.ocr_min_confidence IS 'Minimum OCR confidence threshold (0-100)';
COMMENT ON COLUMN settings.ocr_dpi IS 'Target DPI for OCR processing, 0=auto';
COMMENT ON COLUMN settings.ocr_enhance_contrast IS 'Enable adaptive contrast enhancement';
COMMENT ON COLUMN settings.ocr_remove_noise IS 'Enable image noise removal';
COMMENT ON COLUMN settings.ocr_detect_orientation IS 'Enable automatic orientation detection';
COMMENT ON COLUMN settings.ocr_whitelist_chars IS 'Characters to allow in OCR (null=all)';
COMMENT ON COLUMN settings.ocr_blacklist_chars IS 'Characters to exclude from OCR (null=none)';
-- Create index on OCR confidence for quality filtering.
-- Partial index: rows with a NULL ocr_confidence are not indexed.
CREATE INDEX IF NOT EXISTS idx_documents_ocr_confidence ON documents(ocr_confidence) WHERE ocr_confidence IS NOT NULL;
-- Create index on word count for analytics.
-- Partial index: rows with a NULL ocr_word_count are not indexed.
CREATE INDEX IF NOT EXISTS idx_documents_ocr_word_count ON documents(ocr_word_count) WHERE ocr_word_count IS NOT NULL;
-- Backfill settings rows whose ocr_page_segmentation_mode is still NULL with
-- the same values used as column defaults above. The whitelist/blacklist
-- columns are left untouched.
-- NOTE(review): rows that existed before the ADD COLUMN ... DEFAULT statements
-- presumably already hold these defaults, so this may only affect rows
-- explicitly set to NULL — confirm against the target PostgreSQL version.
UPDATE settings SET
ocr_page_segmentation_mode = 3,
ocr_engine_mode = 3,
ocr_min_confidence = 30.0,
ocr_dpi = 300,
ocr_enhance_contrast = true,
ocr_remove_noise = true,
ocr_detect_orientation = true
WHERE ocr_page_segmentation_mode IS NULL;
-- Create a view for enhanced OCR analytics.
-- One row per calendar day of documents.created_at, limited to documents
-- created within 30 days before CURRENT_DATE, newest day first.
CREATE OR REPLACE VIEW ocr_analytics AS
SELECT
DATE(created_at) as date,
COUNT(*) as total_documents,
-- COUNT(column) counts only rows where that column is non-NULL
COUNT(ocr_text) as documents_with_ocr,
COUNT(ocr_confidence) as documents_with_confidence,
AVG(ocr_confidence) as avg_confidence,
MIN(ocr_confidence) as min_confidence,
MAX(ocr_confidence) as max_confidence,
AVG(ocr_word_count) as avg_word_count,
SUM(ocr_word_count) as total_words_extracted,
AVG(ocr_processing_time_ms) as avg_processing_time_ms,
-- Quality buckets: below 50 is counted as low, 80 and above as high;
-- rows with a NULL confidence fall in neither bucket
COUNT(*) FILTER (WHERE ocr_confidence < 50) as low_confidence_count,
COUNT(*) FILTER (WHERE ocr_confidence >= 80) as high_confidence_count,
COUNT(*) FILTER (WHERE ocr_status = 'failed') as failed_ocr_count
FROM documents
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY DATE(created_at)
ORDER BY date DESC;
COMMENT ON VIEW ocr_analytics IS 'Daily OCR analytics for monitoring quality and performance';

View File

@ -8,7 +8,7 @@ use tokio::time::{sleep, Duration};
use tracing::{error, info, warn};
use uuid::Uuid;
use crate::{db::Database, ocr::OcrService};
use crate::{db::Database, enhanced_ocr::EnhancedOcrService};
#[derive(Debug, Clone, Serialize, Deserialize, FromRow)]
pub struct OcrQueueItem {
@ -204,7 +204,7 @@ impl OcrQueueService {
}
/// Process a single queue item
async fn process_item(&self, item: OcrQueueItem, ocr_service: &OcrService) -> Result<()> {
async fn process_item(&self, item: OcrQueueItem, ocr_service: &EnhancedOcrService) -> Result<()> {
let start_time = std::time::Instant::now();
info!("Processing OCR job {} for document {}", item.id, item.document_id);
@ -226,35 +226,62 @@ impl OcrQueueService {
let file_path: String = row.get("file_path");
let mime_type: String = row.get("mime_type");
let user_id: Option<Uuid> = row.get("user_id");
// Get user's OCR settings
// Get user's OCR settings or use defaults
let settings = if let Some(user_id) = user_id {
self.db.get_user_settings(user_id).await.ok().flatten()
.unwrap_or_else(|| crate::models::Settings::default())
} else {
None
crate::models::Settings::default()
};
let ocr_language = settings
.as_ref()
.map(|s| s.ocr_language.clone())
.unwrap_or_else(|| "eng".to_string());
// Perform OCR
match ocr_service.extract_text_with_lang(&file_path, &mime_type, &ocr_language).await {
Ok(text) => {
if !text.is_empty() {
// Update document with OCR text
// Perform enhanced OCR
match ocr_service.extract_text(&file_path, &mime_type, &settings).await {
Ok(ocr_result) => {
// Validate OCR quality
if !ocr_service.validate_ocr_quality(&ocr_result, &settings) {
let error_msg = format!("OCR quality below threshold: {:.1}% confidence, {} words",
ocr_result.confidence, ocr_result.word_count);
warn!("{}", error_msg);
// Mark as failed for quality issues
sqlx::query(
r#"
UPDATE documents
SET ocr_status = 'failed',
ocr_error = $2,
updated_at = NOW()
WHERE id = $1
"#
)
.bind(item.document_id)
.bind(&error_msg)
.execute(&self.pool)
.await?;
self.mark_failed(item.id, &error_msg).await?;
return Ok(());
}
if !ocr_result.text.is_empty() {
// Update document with enhanced OCR text and metadata
sqlx::query(
r#"
UPDATE documents
SET ocr_text = $2,
ocr_status = 'completed',
ocr_completed_at = NOW(),
ocr_confidence = $3,
ocr_word_count = $4,
ocr_processing_time_ms = $5,
updated_at = NOW()
WHERE id = $1
"#
)
.bind(item.document_id)
.bind(text)
.bind(&ocr_result.text)
.bind(ocr_result.confidence)
.bind(ocr_result.word_count as i32)
.bind(ocr_result.processing_time_ms as i32)
.execute(&self.pool)
.await?;
}
@ -263,8 +290,9 @@ impl OcrQueueService {
self.mark_completed(item.id, processing_time_ms).await?;
info!(
"Successfully processed OCR job {} for document {} in {}ms",
item.id, item.document_id, processing_time_ms
"Successfully processed OCR job {} for document {} in {}ms - Enhanced OCR: {:.1}% confidence, {} words, Preprocessing: {:?}",
item.id, item.document_id, processing_time_ms,
ocr_result.confidence, ocr_result.word_count, ocr_result.preprocessing_applied
);
}
Err(e) => {
@ -302,7 +330,7 @@ impl OcrQueueService {
/// Start the worker loop
pub async fn start_worker(self: Arc<Self>) -> Result<()> {
let semaphore = Arc::new(Semaphore::new(self.max_concurrent_jobs));
let ocr_service = Arc::new(OcrService::new());
let ocr_service = Arc::new(EnhancedOcrService::new("/tmp".to_string()));
info!(
"Starting OCR worker {} with {} concurrent jobs",