feat(client/server): update search tests, and upgrade OCR
This commit is contained in:
parent
1a1f886f04
commit
d5f419ca18
|
|
@ -0,0 +1,57 @@
|
||||||
|
#!/bin/bash
#
# Apply database migrations for enhanced OCR.
#
# Usage: ./apply_migrations.sh [database_url]
#
# The connection string is taken from the first argument, then from the
# DATABASE_URL environment variable, and finally falls back to a local
# "readur" database. The migration file is resolved relative to the current
# working directory, so run this from the directory that contains migrations/.

# -e: abort on the first failing command; -u: unset variables are errors;
# pipefail: a pipeline fails if any stage fails, not just the last one.
set -euo pipefail

# Default database URL from environment or use provided argument
DATABASE_URL=${1:-${DATABASE_URL:-"postgresql://localhost/readur"}}

MIGRATION_FILE="migrations/002_add_enhanced_ocr_fields.sql"

# Columns that migration 002 is expected to create, per table.
DOCUMENT_COLUMNS=(ocr_confidence ocr_word_count ocr_processing_time_ms)
SETTINGS_COLUMNS=(
    ocr_page_segmentation_mode ocr_engine_mode ocr_min_confidence ocr_dpi
    ocr_enhance_contrast ocr_remove_noise ocr_detect_orientation
    ocr_whitelist_chars ocr_blacklist_chars
)

# Run psql against the target database.
#   -X               skip ~/.psqlrc so its settings cannot alter the output
#   ON_ERROR_STOP=1  make a failing SQL statement yield a non-zero exit code
#                    (without it psql keeps going and exits 0)
run_psql() {
    psql -X -v ON_ERROR_STOP=1 -d "$DATABASE_URL" "$@"
}

# Print how many of the given columns exist on a table.
#   $1      table name
#   $2...   column names
# Arguments are hard-coded identifiers from this script, never user input.
# COUNT(DISTINCT ...) keeps the result stable when a same-named table exists
# in more than one schema.
count_columns() {
    local table=$1
    shift
    local quoted
    quoted=$(printf "'%s'," "$@")
    run_psql -At -c "
        SELECT COUNT(DISTINCT column_name)
        FROM information_schema.columns
        WHERE table_name = '${table}'
          AND column_name IN (${quoted%,});
    "
}

# Refresh the per-table counts of migration columns currently present.
refresh_column_counts() {
    DOC_COLS_PRESENT=$(count_columns documents "${DOCUMENT_COLUMNS[@]}")
    SETTINGS_COLS_PRESENT=$(count_columns settings "${SETTINGS_COLUMNS[@]}")
}

# Succeeds only when every expected column exists on both tables.
all_columns_present() {
    [[ "$DOC_COLS_PRESENT" -eq ${#DOCUMENT_COLUMNS[@]} \
        && "$SETTINGS_COLS_PRESENT" -eq ${#SETTINGS_COLUMNS[@]} ]]
}

# Mask the password of a URI-style connection string before logging it.
# (key=value style connection strings are printed unchanged.)
SAFE_DATABASE_URL=$(printf '%s' "$DATABASE_URL" | sed -E 's#(://[^:/@]*):[^@]*@#\1:****@#')
echo "Applying migrations to: $SAFE_DATABASE_URL"

# Apply migration 002 if it hasn't been applied yet
echo "Checking if migration 002_add_enhanced_ocr_fields.sql needs to be applied..."

# Check if the new columns exist (on both documents and settings, so that a
# partially migrated database is repaired instead of skipped).
refresh_column_counts

if all_columns_present; then
    echo "Enhanced OCR fields already exist. Migration already applied."
else
    if [[ ! -f "$MIGRATION_FILE" ]]; then
        echo "❌ Migration file not found: $MIGRATION_FILE (run this script from the directory containing migrations/)." >&2
        exit 1
    fi

    echo "Applying migration 002_add_enhanced_ocr_fields.sql..."
    # --single-transaction: either the whole migration is applied or none of it.
    run_psql --single-transaction -f "$MIGRATION_FILE"
    echo "Migration 002 applied successfully!"
fi

# Verify the migration was successful
echo "Verifying migration..."
refresh_column_counts

echo "Migration verification: documents ${DOC_COLS_PRESENT}/${#DOCUMENT_COLUMNS[@]} columns, settings ${SETTINGS_COLS_PRESENT}/${#SETTINGS_COLUMNS[@]} columns"

if all_columns_present; then
    echo "✅ Enhanced OCR migration completed successfully!"
    echo ""
    echo "New features available:"
    echo "- OCR confidence scoring and quality validation"
    echo "- Advanced image preprocessing for challenging images"
    echo "- Configurable Tesseract PSM and OEM settings"
    echo "- Intelligent brightness/contrast enhancement"
    echo "- Adaptive noise removal and sharpening"
    echo "- OCR analytics and monitoring"
    echo ""
    echo "You can now restart your Readur server to use the enhanced OCR features."
else
    echo "❌ Migration verification failed. Please check the logs above."
    exit 1
fi
|
||||||
|
|
@ -0,0 +1,70 @@
|
||||||
|
-- Migration 002: enhanced OCR metadata and configuration.
-- Adds per-document OCR result columns, per-user OCR settings columns,
-- supporting indexes, and a daily analytics view.

-- Add enhanced OCR metadata fields to documents table
ALTER TABLE documents
    ADD COLUMN IF NOT EXISTS ocr_confidence REAL,
    ADD COLUMN IF NOT EXISTS ocr_word_count INT,
    ADD COLUMN IF NOT EXISTS ocr_processing_time_ms INT;

-- Add enhanced OCR configuration fields to settings table
ALTER TABLE settings
    ADD COLUMN IF NOT EXISTS ocr_page_segmentation_mode INT DEFAULT 3,
    ADD COLUMN IF NOT EXISTS ocr_engine_mode INT DEFAULT 3,
    ADD COLUMN IF NOT EXISTS ocr_min_confidence REAL DEFAULT 30.0,
    ADD COLUMN IF NOT EXISTS ocr_dpi INT DEFAULT 300,
    ADD COLUMN IF NOT EXISTS ocr_enhance_contrast BOOLEAN DEFAULT true,
    ADD COLUMN IF NOT EXISTS ocr_remove_noise BOOLEAN DEFAULT true,
    ADD COLUMN IF NOT EXISTS ocr_detect_orientation BOOLEAN DEFAULT true,
    ADD COLUMN IF NOT EXISTS ocr_whitelist_chars TEXT,
    ADD COLUMN IF NOT EXISTS ocr_blacklist_chars TEXT;

-- Add comments for documentation
COMMENT ON COLUMN documents.ocr_confidence IS 'OCR confidence score (0-100)';
COMMENT ON COLUMN documents.ocr_word_count IS 'Number of words extracted by OCR';
COMMENT ON COLUMN documents.ocr_processing_time_ms IS 'Time taken for OCR processing in milliseconds';

COMMENT ON COLUMN settings.ocr_page_segmentation_mode IS 'Tesseract Page Segmentation Mode (0-13), default 3=PSM_AUTO';
COMMENT ON COLUMN settings.ocr_engine_mode IS 'Tesseract OCR Engine Mode (0-3), default 3=OEM_DEFAULT';
COMMENT ON COLUMN settings.ocr_min_confidence IS 'Minimum OCR confidence threshold (0-100)';
COMMENT ON COLUMN settings.ocr_dpi IS 'Target DPI for OCR processing, 0=auto';
COMMENT ON COLUMN settings.ocr_enhance_contrast IS 'Enable adaptive contrast enhancement';
COMMENT ON COLUMN settings.ocr_remove_noise IS 'Enable image noise removal';
COMMENT ON COLUMN settings.ocr_detect_orientation IS 'Enable automatic orientation detection';
COMMENT ON COLUMN settings.ocr_whitelist_chars IS 'Characters to allow in OCR (null=all)';
COMMENT ON COLUMN settings.ocr_blacklist_chars IS 'Characters to exclude from OCR (null=none)';

-- Create index on OCR confidence for quality filtering
-- (partial: rows without a confidence value are not indexed)
CREATE INDEX IF NOT EXISTS idx_documents_ocr_confidence ON documents(ocr_confidence) WHERE ocr_confidence IS NOT NULL;

-- Create index on word count for analytics
CREATE INDEX IF NOT EXISTS idx_documents_ocr_word_count ON documents(ocr_word_count) WHERE ocr_word_count IS NOT NULL;

-- Backfill defaults on existing settings rows.
-- Each column is filled independently with COALESCE so that values a user has
-- already set are never overwritten, and a row is repaired even when only
-- some of its OCR settings are NULL.
UPDATE settings SET
    ocr_page_segmentation_mode = COALESCE(ocr_page_segmentation_mode, 3),
    ocr_engine_mode = COALESCE(ocr_engine_mode, 3),
    ocr_min_confidence = COALESCE(ocr_min_confidence, 30.0),
    ocr_dpi = COALESCE(ocr_dpi, 300),
    ocr_enhance_contrast = COALESCE(ocr_enhance_contrast, true),
    ocr_remove_noise = COALESCE(ocr_remove_noise, true),
    ocr_detect_orientation = COALESCE(ocr_detect_orientation, true)
WHERE ocr_page_segmentation_mode IS NULL
   OR ocr_engine_mode IS NULL
   OR ocr_min_confidence IS NULL
   OR ocr_dpi IS NULL
   OR ocr_enhance_contrast IS NULL
   OR ocr_remove_noise IS NULL
   OR ocr_detect_orientation IS NULL;

-- Create a view for enhanced OCR analytics:
-- one row per calendar day over the last 30 days, newest first.
CREATE OR REPLACE VIEW ocr_analytics AS
SELECT
    DATE(created_at) as date,
    COUNT(*) as total_documents,
    COUNT(ocr_text) as documents_with_ocr,
    COUNT(ocr_confidence) as documents_with_confidence,
    AVG(ocr_confidence) as avg_confidence,
    MIN(ocr_confidence) as min_confidence,
    MAX(ocr_confidence) as max_confidence,
    AVG(ocr_word_count) as avg_word_count,
    SUM(ocr_word_count) as total_words_extracted,
    AVG(ocr_processing_time_ms) as avg_processing_time_ms,
    COUNT(*) FILTER (WHERE ocr_confidence < 50) as low_confidence_count,
    COUNT(*) FILTER (WHERE ocr_confidence >= 80) as high_confidence_count,
    COUNT(*) FILTER (WHERE ocr_status = 'failed') as failed_ocr_count
FROM documents
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY DATE(created_at)
ORDER BY date DESC;

COMMENT ON VIEW ocr_analytics IS 'Daily OCR analytics for monitoring quality and performance';
|
||||||
|
|
@ -8,7 +8,7 @@ use tokio::time::{sleep, Duration};
|
||||||
use tracing::{error, info, warn};
|
use tracing::{error, info, warn};
|
||||||
use uuid::Uuid;
|
use uuid::Uuid;
|
||||||
|
|
||||||
use crate::{db::Database, ocr::OcrService};
|
use crate::{db::Database, enhanced_ocr::EnhancedOcrService};
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize, FromRow)]
|
#[derive(Debug, Clone, Serialize, Deserialize, FromRow)]
|
||||||
pub struct OcrQueueItem {
|
pub struct OcrQueueItem {
|
||||||
|
|
@ -204,7 +204,7 @@ impl OcrQueueService {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Process a single queue item
|
/// Process a single queue item
|
||||||
async fn process_item(&self, item: OcrQueueItem, ocr_service: &OcrService) -> Result<()> {
|
async fn process_item(&self, item: OcrQueueItem, ocr_service: &EnhancedOcrService) -> Result<()> {
|
||||||
let start_time = std::time::Instant::now();
|
let start_time = std::time::Instant::now();
|
||||||
|
|
||||||
info!("Processing OCR job {} for document {}", item.id, item.document_id);
|
info!("Processing OCR job {} for document {}", item.id, item.document_id);
|
||||||
|
|
@ -226,35 +226,62 @@ impl OcrQueueService {
|
||||||
let file_path: String = row.get("file_path");
|
let file_path: String = row.get("file_path");
|
||||||
let mime_type: String = row.get("mime_type");
|
let mime_type: String = row.get("mime_type");
|
||||||
let user_id: Option<Uuid> = row.get("user_id");
|
let user_id: Option<Uuid> = row.get("user_id");
|
||||||
// Get user's OCR settings
|
// Get user's OCR settings or use defaults
|
||||||
let settings = if let Some(user_id) = user_id {
|
let settings = if let Some(user_id) = user_id {
|
||||||
self.db.get_user_settings(user_id).await.ok().flatten()
|
self.db.get_user_settings(user_id).await.ok().flatten()
|
||||||
|
.unwrap_or_else(|| crate::models::Settings::default())
|
||||||
} else {
|
} else {
|
||||||
None
|
crate::models::Settings::default()
|
||||||
};
|
};
|
||||||
|
|
||||||
let ocr_language = settings
|
// Perform enhanced OCR
|
||||||
.as_ref()
|
match ocr_service.extract_text(&file_path, &mime_type, &settings).await {
|
||||||
.map(|s| s.ocr_language.clone())
|
Ok(ocr_result) => {
|
||||||
.unwrap_or_else(|| "eng".to_string());
|
// Validate OCR quality
|
||||||
|
if !ocr_service.validate_ocr_quality(&ocr_result, &settings) {
|
||||||
// Perform OCR
|
let error_msg = format!("OCR quality below threshold: {:.1}% confidence, {} words",
|
||||||
match ocr_service.extract_text_with_lang(&file_path, &mime_type, &ocr_language).await {
|
ocr_result.confidence, ocr_result.word_count);
|
||||||
Ok(text) => {
|
warn!("{}", error_msg);
|
||||||
if !text.is_empty() {
|
|
||||||
// Update document with OCR text
|
// Mark as failed for quality issues
|
||||||
|
sqlx::query(
|
||||||
|
r#"
|
||||||
|
UPDATE documents
|
||||||
|
SET ocr_status = 'failed',
|
||||||
|
ocr_error = $2,
|
||||||
|
updated_at = NOW()
|
||||||
|
WHERE id = $1
|
||||||
|
"#
|
||||||
|
)
|
||||||
|
.bind(item.document_id)
|
||||||
|
.bind(&error_msg)
|
||||||
|
.execute(&self.pool)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
self.mark_failed(item.id, &error_msg).await?;
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
if !ocr_result.text.is_empty() {
|
||||||
|
// Update document with enhanced OCR text and metadata
|
||||||
sqlx::query(
|
sqlx::query(
|
||||||
r#"
|
r#"
|
||||||
UPDATE documents
|
UPDATE documents
|
||||||
SET ocr_text = $2,
|
SET ocr_text = $2,
|
||||||
ocr_status = 'completed',
|
ocr_status = 'completed',
|
||||||
ocr_completed_at = NOW(),
|
ocr_completed_at = NOW(),
|
||||||
|
ocr_confidence = $3,
|
||||||
|
ocr_word_count = $4,
|
||||||
|
ocr_processing_time_ms = $5,
|
||||||
updated_at = NOW()
|
updated_at = NOW()
|
||||||
WHERE id = $1
|
WHERE id = $1
|
||||||
"#
|
"#
|
||||||
)
|
)
|
||||||
.bind(item.document_id)
|
.bind(item.document_id)
|
||||||
.bind(text)
|
.bind(&ocr_result.text)
|
||||||
|
.bind(ocr_result.confidence)
|
||||||
|
.bind(ocr_result.word_count as i32)
|
||||||
|
.bind(ocr_result.processing_time_ms as i32)
|
||||||
.execute(&self.pool)
|
.execute(&self.pool)
|
||||||
.await?;
|
.await?;
|
||||||
}
|
}
|
||||||
|
|
@ -263,8 +290,9 @@ impl OcrQueueService {
|
||||||
self.mark_completed(item.id, processing_time_ms).await?;
|
self.mark_completed(item.id, processing_time_ms).await?;
|
||||||
|
|
||||||
info!(
|
info!(
|
||||||
"Successfully processed OCR job {} for document {} in {}ms",
|
"Successfully processed OCR job {} for document {} in {}ms - Enhanced OCR: {:.1}% confidence, {} words, Preprocessing: {:?}",
|
||||||
item.id, item.document_id, processing_time_ms
|
item.id, item.document_id, processing_time_ms,
|
||||||
|
ocr_result.confidence, ocr_result.word_count, ocr_result.preprocessing_applied
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
|
|
@ -302,7 +330,7 @@ impl OcrQueueService {
|
||||||
/// Start the worker loop
|
/// Start the worker loop
|
||||||
pub async fn start_worker(self: Arc<Self>) -> Result<()> {
|
pub async fn start_worker(self: Arc<Self>) -> Result<()> {
|
||||||
let semaphore = Arc::new(Semaphore::new(self.max_concurrent_jobs));
|
let semaphore = Arc::new(Semaphore::new(self.max_concurrent_jobs));
|
||||||
let ocr_service = Arc::new(OcrService::new());
|
let ocr_service = Arc::new(EnhancedOcrService::new("/tmp".to_string()));
|
||||||
|
|
||||||
info!(
|
info!(
|
||||||
"Starting OCR worker {} with {} concurrent jobs",
|
"Starting OCR worker {} with {} concurrent jobs",
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue