feat(client/server): update search tests, and upgrade OCR
This commit is contained in:
parent
1a1f886f04
commit
d5f419ca18
|
|
@ -0,0 +1,57 @@
|
|||
#!/bin/bash

# Apply database migrations for enhanced OCR
# Usage: ./apply_migrations.sh [database_url]
#
# The database URL is taken from the first argument, then from the
# DATABASE_URL environment variable, and finally falls back to a local
# default. The script must be run from the directory that contains the
# `migrations/` folder, because the migration file path is relative.

# -e: abort on the first failing command
# -u: treat unset variables as errors
# -o pipefail: a failure anywhere in a pipeline fails the pipeline
set -euo pipefail

# Default database URL from environment or use provided argument
DATABASE_URL=${1:-${DATABASE_URL:-"postgresql://localhost/readur"}}

MIGRATION_FILE="migrations/002_add_enhanced_ocr_fields.sql"

# Common psql flags:
#   -X                    ignore ~/.psqlrc so user settings cannot alter output
#   -v ON_ERROR_STOP=1    make psql exit non-zero when a SQL statement fails
#                         (without it, `psql -f` reports success even if the
#                         migration script errored half-way through)
PSQL=(psql "$DATABASE_URL" -X -v ON_ERROR_STOP=1)

# Hide the password portion of the URL (user:password@host) before logging it.
SAFE_URL=$(printf '%s' "$DATABASE_URL" | sed -E 's#(://[^:@/]*):[^@/]*@#\1:****@#')
echo "Applying migrations to: $SAFE_URL"

# Apply migration 002 if it hasn't been applied yet
echo "Checking if migration 002_add_enhanced_ocr_fields.sql needs to be applied..."

# Check if the new columns exist on BOTH tables touched by the migration.
# -t -A yields a single unaligned "docs|settings" row that is easy to parse.
COLUMN_COUNTS=$("${PSQL[@]}" -t -A -c "
SELECT
    (SELECT COUNT(*)
       FROM information_schema.columns
      WHERE table_name = 'documents'
        AND column_name IN ('ocr_confidence', 'ocr_word_count', 'ocr_processing_time_ms')),
    (SELECT COUNT(*)
       FROM information_schema.columns
      WHERE table_name = 'settings'
        AND column_name IN ('ocr_page_segmentation_mode', 'ocr_engine_mode', 'ocr_min_confidence',
                            'ocr_dpi', 'ocr_enhance_contrast', 'ocr_remove_noise',
                            'ocr_detect_orientation', 'ocr_whitelist_chars', 'ocr_blacklist_chars'));
")

IFS='|' read -r DOC_COLS SETTINGS_COLS <<< "$COLUMN_COUNTS"

# The migration itself is idempotent (IF NOT EXISTS / OR REPLACE), so when in
# doubt we re-apply it rather than skip it.
if [[ "${DOC_COLS:-0}" -eq 3 && "${SETTINGS_COLS:-0}" -eq 9 ]]; then
    echo "Enhanced OCR fields already exist. Migration already applied."
else
    if [[ ! -f "$MIGRATION_FILE" ]]; then
        echo "Migration file not found: $MIGRATION_FILE (run this script from the project root)" >&2
        exit 1
    fi

    echo "Applying migration 002_add_enhanced_ocr_fields.sql..."
    "${PSQL[@]}" -f "$MIGRATION_FILE"
    echo "Migration 002 applied successfully!"
fi

# Verify the migration was successful
echo "Verifying migration..."
VERIFICATION=$("${PSQL[@]}" -t -A -c "
SELECT
    (SELECT COUNT(*) FROM information_schema.columns WHERE table_name = 'documents' AND column_name = 'ocr_confidence') as doc_cols,
    (SELECT COUNT(*) FROM information_schema.columns WHERE table_name = 'settings' AND column_name = 'ocr_page_segmentation_mode') as settings_cols;
")

echo "Migration verification: $VERIFICATION"

# Compare the two counts numerically instead of pattern-matching the raw text.
IFS='|' read -r DOC_OK SETTINGS_OK <<< "$VERIFICATION"

if [[ "${DOC_OK:-0}" -ge 1 && "${SETTINGS_OK:-0}" -ge 1 ]]; then
    echo "✅ Enhanced OCR migration completed successfully!"
    echo ""
    echo "New features available:"
    echo "- OCR confidence scoring and quality validation"
    echo "- Advanced image preprocessing for challenging images"
    echo "- Configurable Tesseract PSM and OEM settings"
    echo "- Intelligent brightness/contrast enhancement"
    echo "- Adaptive noise removal and sharpening"
    echo "- OCR analytics and monitoring"
    echo ""
    echo "You can now restart your Readur server to use the enhanced OCR features."
else
    echo "❌ Migration verification failed. Please check the logs above."
    exit 1
fi
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
-- Migration 002: enhanced OCR metadata, OCR configuration columns,
-- supporting indexes and a daily analytics view.
-- Statements use IF NOT EXISTS / OR REPLACE so re-running the script is safe.

-- OCR result metadata stored per document
ALTER TABLE documents
    ADD COLUMN IF NOT EXISTS ocr_confidence REAL,
    ADD COLUMN IF NOT EXISTS ocr_word_count INT,
    ADD COLUMN IF NOT EXISTS ocr_processing_time_ms INT;

-- OCR tuning options stored in settings
ALTER TABLE settings
    ADD COLUMN IF NOT EXISTS ocr_page_segmentation_mode INT DEFAULT 3,
    ADD COLUMN IF NOT EXISTS ocr_engine_mode INT DEFAULT 3,
    ADD COLUMN IF NOT EXISTS ocr_min_confidence REAL DEFAULT 30.0,
    ADD COLUMN IF NOT EXISTS ocr_dpi INT DEFAULT 300,
    ADD COLUMN IF NOT EXISTS ocr_enhance_contrast BOOLEAN DEFAULT true,
    ADD COLUMN IF NOT EXISTS ocr_remove_noise BOOLEAN DEFAULT true,
    ADD COLUMN IF NOT EXISTS ocr_detect_orientation BOOLEAN DEFAULT true,
    ADD COLUMN IF NOT EXISTS ocr_whitelist_chars TEXT,
    ADD COLUMN IF NOT EXISTS ocr_blacklist_chars TEXT;

-- Column documentation: documents
COMMENT ON COLUMN documents.ocr_confidence IS 'OCR confidence score (0-100)';
COMMENT ON COLUMN documents.ocr_word_count IS 'Number of words extracted by OCR';
COMMENT ON COLUMN documents.ocr_processing_time_ms IS 'Time taken for OCR processing in milliseconds';

-- Column documentation: settings
COMMENT ON COLUMN settings.ocr_page_segmentation_mode IS 'Tesseract Page Segmentation Mode (0-13), default 3=PSM_AUTO';
COMMENT ON COLUMN settings.ocr_engine_mode IS 'Tesseract OCR Engine Mode (0-3), default 3=OEM_DEFAULT';
COMMENT ON COLUMN settings.ocr_min_confidence IS 'Minimum OCR confidence threshold (0-100)';
COMMENT ON COLUMN settings.ocr_dpi IS 'Target DPI for OCR processing, 0=auto';
COMMENT ON COLUMN settings.ocr_enhance_contrast IS 'Enable adaptive contrast enhancement';
COMMENT ON COLUMN settings.ocr_remove_noise IS 'Enable image noise removal';
COMMENT ON COLUMN settings.ocr_detect_orientation IS 'Enable automatic orientation detection';
COMMENT ON COLUMN settings.ocr_whitelist_chars IS 'Characters to allow in OCR (null=all)';
COMMENT ON COLUMN settings.ocr_blacklist_chars IS 'Characters to exclude from OCR (null=none)';

-- Partial indexes: only rows that actually carry OCR metadata are indexed.
-- Confidence index supports quality filtering; word-count index supports analytics.
CREATE INDEX IF NOT EXISTS idx_documents_ocr_confidence
    ON documents (ocr_confidence)
    WHERE ocr_confidence IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_documents_ocr_word_count
    ON documents (ocr_word_count)
    WHERE ocr_word_count IS NOT NULL;

-- Backfill the defaults on any settings row that still has no segmentation mode
UPDATE settings
   SET ocr_page_segmentation_mode = 3,
       ocr_engine_mode            = 3,
       ocr_min_confidence         = 30.0,
       ocr_dpi                    = 300,
       ocr_enhance_contrast       = true,
       ocr_remove_noise           = true,
       ocr_detect_orientation     = true
 WHERE ocr_page_segmentation_mode IS NULL;

-- Per-day OCR statistics over documents created in the last 30 days
CREATE OR REPLACE VIEW ocr_analytics AS
SELECT
    DATE(created_at)                                      AS date,
    COUNT(*)                                              AS total_documents,
    COUNT(ocr_text)                                       AS documents_with_ocr,
    COUNT(ocr_confidence)                                 AS documents_with_confidence,
    AVG(ocr_confidence)                                   AS avg_confidence,
    MIN(ocr_confidence)                                   AS min_confidence,
    MAX(ocr_confidence)                                   AS max_confidence,
    AVG(ocr_word_count)                                   AS avg_word_count,
    SUM(ocr_word_count)                                   AS total_words_extracted,
    AVG(ocr_processing_time_ms)                           AS avg_processing_time_ms,
    COUNT(*) FILTER (WHERE ocr_confidence < 50)           AS low_confidence_count,
    COUNT(*) FILTER (WHERE ocr_confidence >= 80)          AS high_confidence_count,
    COUNT(*) FILTER (WHERE ocr_status = 'failed')         AS failed_ocr_count
FROM documents
WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
GROUP BY DATE(created_at)
ORDER BY date DESC;

COMMENT ON VIEW ocr_analytics IS 'Daily OCR analytics for monitoring quality and performance';
|
||||
|
|
@ -8,7 +8,7 @@ use tokio::time::{sleep, Duration};
|
|||
use tracing::{error, info, warn};
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::{db::Database, ocr::OcrService};
|
||||
use crate::{db::Database, enhanced_ocr::EnhancedOcrService};
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, FromRow)]
|
||||
pub struct OcrQueueItem {
|
||||
|
|
@ -204,7 +204,7 @@ impl OcrQueueService {
|
|||
}
|
||||
|
||||
/// Process a single queue item
|
||||
async fn process_item(&self, item: OcrQueueItem, ocr_service: &OcrService) -> Result<()> {
|
||||
async fn process_item(&self, item: OcrQueueItem, ocr_service: &EnhancedOcrService) -> Result<()> {
|
||||
let start_time = std::time::Instant::now();
|
||||
|
||||
info!("Processing OCR job {} for document {}", item.id, item.document_id);
|
||||
|
|
@ -226,35 +226,62 @@ impl OcrQueueService {
|
|||
let file_path: String = row.get("file_path");
|
||||
let mime_type: String = row.get("mime_type");
|
||||
let user_id: Option<Uuid> = row.get("user_id");
|
||||
// Get user's OCR settings
|
||||
// Get user's OCR settings or use defaults
|
||||
let settings = if let Some(user_id) = user_id {
|
||||
self.db.get_user_settings(user_id).await.ok().flatten()
|
||||
.unwrap_or_else(|| crate::models::Settings::default())
|
||||
} else {
|
||||
None
|
||||
crate::models::Settings::default()
|
||||
};
|
||||
|
||||
let ocr_language = settings
|
||||
.as_ref()
|
||||
.map(|s| s.ocr_language.clone())
|
||||
.unwrap_or_else(|| "eng".to_string());
|
||||
|
||||
// Perform OCR
|
||||
match ocr_service.extract_text_with_lang(&file_path, &mime_type, &ocr_language).await {
|
||||
Ok(text) => {
|
||||
if !text.is_empty() {
|
||||
// Update document with OCR text
|
||||
// Perform enhanced OCR
|
||||
match ocr_service.extract_text(&file_path, &mime_type, &settings).await {
|
||||
Ok(ocr_result) => {
|
||||
// Validate OCR quality
|
||||
if !ocr_service.validate_ocr_quality(&ocr_result, &settings) {
|
||||
let error_msg = format!("OCR quality below threshold: {:.1}% confidence, {} words",
|
||||
ocr_result.confidence, ocr_result.word_count);
|
||||
warn!("{}", error_msg);
|
||||
|
||||
// Mark as failed for quality issues
|
||||
sqlx::query(
|
||||
r#"
|
||||
UPDATE documents
|
||||
SET ocr_status = 'failed',
|
||||
ocr_error = $2,
|
||||
updated_at = NOW()
|
||||
WHERE id = $1
|
||||
"#
|
||||
)
|
||||
.bind(item.document_id)
|
||||
.bind(&error_msg)
|
||||
.execute(&self.pool)
|
||||
.await?;
|
||||
|
||||
self.mark_failed(item.id, &error_msg).await?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if !ocr_result.text.is_empty() {
|
||||
// Update document with enhanced OCR text and metadata
|
||||
sqlx::query(
|
||||
r#"
|
||||
UPDATE documents
|
||||
SET ocr_text = $2,
|
||||
ocr_status = 'completed',
|
||||
ocr_completed_at = NOW(),
|
||||
ocr_confidence = $3,
|
||||
ocr_word_count = $4,
|
||||
ocr_processing_time_ms = $5,
|
||||
updated_at = NOW()
|
||||
WHERE id = $1
|
||||
"#
|
||||
)
|
||||
.bind(item.document_id)
|
||||
.bind(text)
|
||||
.bind(&ocr_result.text)
|
||||
.bind(ocr_result.confidence)
|
||||
.bind(ocr_result.word_count as i32)
|
||||
.bind(ocr_result.processing_time_ms as i32)
|
||||
.execute(&self.pool)
|
||||
.await?;
|
||||
}
|
||||
|
|
@ -263,8 +290,9 @@ impl OcrQueueService {
|
|||
self.mark_completed(item.id, processing_time_ms).await?;
|
||||
|
||||
info!(
|
||||
"Successfully processed OCR job {} for document {} in {}ms",
|
||||
item.id, item.document_id, processing_time_ms
|
||||
"Successfully processed OCR job {} for document {} in {}ms - Enhanced OCR: {:.1}% confidence, {} words, Preprocessing: {:?}",
|
||||
item.id, item.document_id, processing_time_ms,
|
||||
ocr_result.confidence, ocr_result.word_count, ocr_result.preprocessing_applied
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
|
|
@ -302,7 +330,7 @@ impl OcrQueueService {
|
|||
/// Start the worker loop
|
||||
pub async fn start_worker(self: Arc<Self>) -> Result<()> {
|
||||
let semaphore = Arc::new(Semaphore::new(self.max_concurrent_jobs));
|
||||
let ocr_service = Arc::new(OcrService::new());
|
||||
let ocr_service = Arc::new(EnhancedOcrService::new("/tmp".to_string()));
|
||||
|
||||
info!(
|
||||
"Starting OCR worker {} with {} concurrent jobs",
|
||||
|
|
|
|||
Loading…
Reference in New Issue