diff --git a/apply_migrations.sh b/apply_migrations.sh
new file mode 100755
--- /dev/null
+++ b/apply_migrations.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+
+# Apply database migrations for enhanced OCR
+# Usage: ./apply_migrations.sh [database_url]
+#
+# The migration SQL is idempotent (IF NOT EXISTS / OR REPLACE), so running
+# this script against an already-migrated database is safe.
+
+set -euo pipefail
+
+# Default database URL from environment or use provided argument
+DATABASE_URL=${1:-${DATABASE_URL:-"postgresql://localhost/readur"}}
+
+# Resolve the migration relative to this script so any working directory works
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+MIGRATION_FILE="$SCRIPT_DIR/migrations/002_add_enhanced_ocr_fields.sql"
+
+# Migration 002 adds 3 columns to documents and 9 columns to settings
+EXPECTED_COLUMNS=12
+
+# Run psql without ~/.psqlrc and stop at the first SQL error. Without
+# ON_ERROR_STOP psql exits 0 even when statements fail, so "set -e" alone
+# would report a broken migration as applied.
+run_psql() {
+    psql -X -v ON_ERROR_STOP=1 "$@" "$DATABASE_URL"
+}
+
+# Count how many of the migration's columns exist. to_regclass looks the
+# tables up through the search_path and returns NULL (matching no rows)
+# when a table is missing.
+count_ocr_columns() {
+    run_psql -tA -c "
+SELECT COUNT(*)
+FROM pg_attribute
+WHERE NOT attisdropped
+  AND (
+    (attrelid = to_regclass('documents') AND attname IN (
+        'ocr_confidence', 'ocr_word_count', 'ocr_processing_time_ms'))
+    OR
+    (attrelid = to_regclass('settings') AND attname IN (
+        'ocr_page_segmentation_mode', 'ocr_engine_mode', 'ocr_min_confidence',
+        'ocr_dpi', 'ocr_enhance_contrast', 'ocr_remove_noise',
+        'ocr_detect_orientation', 'ocr_whitelist_chars', 'ocr_blacklist_chars'))
+  );
+"
+}
+
+# Never print credentials embedded in the connection URL
+SAFE_URL=$(printf '%s\n' "$DATABASE_URL" | sed -E 's#://[^/@]*@#://***@#')
+echo "Applying migrations to: $SAFE_URL"
+
+if [[ ! -f "$MIGRATION_FILE" ]]; then
+    echo "❌ Migration file not found: $MIGRATION_FILE" >&2
+    exit 1
+fi
+
+# Apply migration 002 if it hasn't been applied yet
+echo "Checking if migration 002_add_enhanced_ocr_fields.sql needs to be applied..."
+
+# Check whether every new column already exists
+COLUMNS_EXIST=$(count_ocr_columns)
+
+if [[ "$COLUMNS_EXIST" -eq "$EXPECTED_COLUMNS" ]]; then
+    echo "Enhanced OCR fields already exist. Migration already applied."
+else
+    echo "Applying migration 002_add_enhanced_ocr_fields.sql..."
+    # Single transaction: the whole migration lands or none of it does
+    run_psql --single-transaction -f "$MIGRATION_FILE"
+    echo "Migration 002 applied successfully!"
+fi
+
+# Verify the migration was successful
+echo "Verifying migration..."
+VERIFIED_COLUMNS=$(count_ocr_columns)
+
+echo "Migration verification: $VERIFIED_COLUMNS of $EXPECTED_COLUMNS columns present"
+
+if [[ "$VERIFIED_COLUMNS" -eq "$EXPECTED_COLUMNS" ]]; then
+    echo "✅ Enhanced OCR migration completed successfully!"
+    echo ""
+    echo "New features available:"
+    echo "- OCR confidence scoring and quality validation"
+    echo "- Advanced image preprocessing for challenging images"
+    echo "- Configurable Tesseract PSM and OEM settings"
+    echo "- Intelligent brightness/contrast enhancement"
+    echo "- Adaptive noise removal and sharpening"
+    echo "- OCR analytics and monitoring"
+    echo ""
+    echo "You can now restart your Readur server to use the enhanced OCR features."
+else
+    echo "❌ Migration verification failed. Please check the logs above."
+    exit 1
+fi
diff --git a/migrations/002_add_enhanced_ocr_fields.sql b/migrations/002_add_enhanced_ocr_fields.sql
new file mode 100644
index 0000000..4567b54
--- /dev/null
+++ b/migrations/002_add_enhanced_ocr_fields.sql
@@ -0,0 +1,70 @@
+-- Add enhanced OCR metadata fields to documents table
+ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_confidence REAL;
+ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_word_count INT;
+ALTER TABLE documents ADD COLUMN IF NOT EXISTS ocr_processing_time_ms INT;
+
+-- Add enhanced OCR configuration fields to settings table
+ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_page_segmentation_mode INT DEFAULT 3;
+ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_engine_mode INT DEFAULT 3;
+ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_min_confidence REAL DEFAULT 30.0;
+ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_dpi INT DEFAULT 300;
+ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_enhance_contrast BOOLEAN DEFAULT true;
+ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_remove_noise BOOLEAN DEFAULT true;
+ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_detect_orientation BOOLEAN DEFAULT true;
+ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_whitelist_chars TEXT;
+ALTER TABLE settings ADD COLUMN IF NOT EXISTS ocr_blacklist_chars TEXT;
+
+-- Add comments for documentation
+COMMENT ON COLUMN documents.ocr_confidence IS 'OCR confidence score (0-100)';
+COMMENT ON COLUMN documents.ocr_word_count IS 'Number of words extracted by OCR';
+COMMENT ON COLUMN documents.ocr_processing_time_ms IS 'Time taken for OCR processing in milliseconds';
+
+COMMENT ON COLUMN settings.ocr_page_segmentation_mode IS 'Tesseract Page Segmentation Mode (0-13), default 3=PSM_AUTO';
+COMMENT ON COLUMN settings.ocr_engine_mode IS 'Tesseract OCR Engine Mode (0-3), default 3=OEM_DEFAULT';
+COMMENT ON COLUMN settings.ocr_min_confidence IS 'Minimum OCR confidence threshold (0-100)';
+COMMENT ON COLUMN settings.ocr_dpi IS 'Target DPI for OCR processing, 0=auto';
+COMMENT ON COLUMN settings.ocr_enhance_contrast IS 'Enable adaptive contrast enhancement';
+COMMENT ON COLUMN settings.ocr_remove_noise IS 'Enable image noise removal';
+COMMENT ON COLUMN settings.ocr_detect_orientation IS 'Enable automatic orientation detection';
+COMMENT ON COLUMN settings.ocr_whitelist_chars IS 'Characters to allow in OCR (null=all)';
+COMMENT ON COLUMN settings.ocr_blacklist_chars IS 'Characters to exclude from OCR (null=none)';
+
+-- Create index on OCR confidence for quality filtering
+CREATE INDEX IF NOT EXISTS idx_documents_ocr_confidence ON documents(ocr_confidence) WHERE ocr_confidence IS NOT NULL;
+
+-- Create index on word count for analytics
+CREATE INDEX IF NOT EXISTS idx_documents_ocr_word_count ON documents(ocr_word_count) WHERE ocr_word_count IS NOT NULL;
+
+-- Update existing settings to have the new defaults
+UPDATE settings SET
+    ocr_page_segmentation_mode = 3,
+    ocr_engine_mode = 3,
+    ocr_min_confidence = 30.0,
+    ocr_dpi = 300,
+    ocr_enhance_contrast = true,
+    ocr_remove_noise = true,
+    ocr_detect_orientation = true
+WHERE ocr_page_segmentation_mode IS NULL;
+
+-- Create a view for enhanced OCR analytics
+CREATE OR REPLACE VIEW ocr_analytics AS
+SELECT
+    DATE(created_at) as date,
+    COUNT(*) as total_documents,
+    COUNT(ocr_text) as documents_with_ocr,
+    COUNT(ocr_confidence) as documents_with_confidence,
+    AVG(ocr_confidence) as avg_confidence,
+    MIN(ocr_confidence) as min_confidence,
+    MAX(ocr_confidence) as max_confidence,
+    AVG(ocr_word_count) as avg_word_count,
+    SUM(ocr_word_count) as total_words_extracted,
+    AVG(ocr_processing_time_ms) as avg_processing_time_ms,
+    COUNT(*) FILTER (WHERE ocr_confidence < 50) as low_confidence_count,
+    COUNT(*) FILTER (WHERE ocr_confidence >= 80) as high_confidence_count,
+    COUNT(*) FILTER (WHERE ocr_status = 'failed') as failed_ocr_count
+FROM documents
+WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
+GROUP BY DATE(created_at)
+ORDER BY date DESC;
+
+COMMENT ON VIEW ocr_analytics IS 'Daily OCR analytics for monitoring quality and performance';
\ No newline at end of file
diff --git a/src/ocr_queue.rs b/src/ocr_queue.rs
index 4cac703..45b6021 100644
--- a/src/ocr_queue.rs
+++ b/src/ocr_queue.rs
@@ -8,7 +8,7 @@ use tokio::time::{sleep, Duration};
 use tracing::{error, info, warn};
 use uuid::Uuid;
 
-use crate::{db::Database, ocr::OcrService};
+use crate::{db::Database, enhanced_ocr::EnhancedOcrService};
 
 #[derive(Debug, Clone, Serialize, Deserialize, FromRow)]
 pub struct OcrQueueItem {
@@ -204,7 +204,7 @@ impl OcrQueueService {
     }
 
     /// Process a single queue item
-    async fn process_item(&self, item: OcrQueueItem, ocr_service: &OcrService) -> Result<()> {
+    async fn process_item(&self, item: OcrQueueItem, ocr_service: &EnhancedOcrService) -> Result<()> {
         let start_time = std::time::Instant::now();
 
         info!("Processing OCR job {} for document {}", item.id, item.document_id);
@@ -226,35 +226,62 @@ impl OcrQueueService {
         let file_path: String = row.get("file_path");
         let mime_type: String = row.get("mime_type");
         let user_id: Option<Uuid> = row.get("user_id");
-        // Get user's OCR settings
+        // Get user's OCR settings or use defaults
         let settings = if let Some(user_id) = user_id {
             self.db.get_user_settings(user_id).await.ok().flatten()
+                .unwrap_or_else(|| crate::models::Settings::default())
         } else {
-            None
+            crate::models::Settings::default()
         };
 
-        let ocr_language = settings
-            .as_ref()
-            .map(|s| s.ocr_language.clone())
-            .unwrap_or_else(|| "eng".to_string());
-
-        // Perform OCR
-        match ocr_service.extract_text_with_lang(&file_path, &mime_type, &ocr_language).await {
-            Ok(text) => {
-                if !text.is_empty() {
-                    // Update document with OCR text
+        // Perform enhanced OCR
+        match ocr_service.extract_text(&file_path, &mime_type, &settings).await {
+            Ok(ocr_result) => {
+                // Validate OCR quality
+                if !ocr_service.validate_ocr_quality(&ocr_result, &settings) {
+                    let error_msg = format!("OCR quality below threshold: {:.1}% confidence, {} words",
+                        ocr_result.confidence, ocr_result.word_count);
+                    warn!("{}", error_msg);
+
+                    // Mark as failed for quality issues
+                    sqlx::query(
+                        r#"
+                        UPDATE documents
+                        SET ocr_status = 'failed',
+                            ocr_error = $2,
+                            updated_at = NOW()
+                        WHERE id = $1
+                        "#
+                    )
+                    .bind(item.document_id)
+                    .bind(&error_msg)
+                    .execute(&self.pool)
+                    .await?;
+
+                    self.mark_failed(item.id, &error_msg).await?;
+                    return Ok(());
+                }
+
+                if !ocr_result.text.is_empty() {
+                    // Update document with enhanced OCR text and metadata
                     sqlx::query(
                         r#"
                         UPDATE documents
                         SET ocr_text = $2,
                             ocr_status = 'completed',
                             ocr_completed_at = NOW(),
+                            ocr_confidence = $3,
+                            ocr_word_count = $4,
+                            ocr_processing_time_ms = $5,
                             updated_at = NOW()
                         WHERE id = $1
                         "#
                     )
                     .bind(item.document_id)
-                    .bind(text)
+                    .bind(&ocr_result.text)
+                    .bind(ocr_result.confidence)
+                    .bind(ocr_result.word_count as i32)
+                    .bind(ocr_result.processing_time_ms as i32)
                     .execute(&self.pool)
                     .await?;
                 }
@@ -263,8 +290,9 @@ impl OcrQueueService {
                 self.mark_completed(item.id, processing_time_ms).await?;
 
                 info!(
-                    "Successfully processed OCR job {} for document {} in {}ms",
-                    item.id, item.document_id, processing_time_ms
+                    "Successfully processed OCR job {} for document {} in {}ms - Enhanced OCR: {:.1}% confidence, {} words, Preprocessing: {:?}",
+                    item.id, item.document_id, processing_time_ms,
+                    ocr_result.confidence, ocr_result.word_count, ocr_result.preprocessing_applied
                 );
             }
             Err(e) => {
@@ -302,7 +330,7 @@ impl OcrQueueService {
     /// Start the worker loop
     pub async fn start_worker(self: Arc<Self>) -> Result<()> {
         let semaphore = Arc::new(Semaphore::new(self.max_concurrent_jobs));
-        let ocr_service = Arc::new(OcrService::new());
+        let ocr_service = Arc::new(EnhancedOcrService::new("/tmp".to_string()));
 
         info!(
             "Starting OCR worker {} with {} concurrent jobs",