From cd35f877b1a83d7d6359359742be3a8388eb4443 Mon Sep 17 00:00:00 2001 From: perf3ct Date: Fri, 13 Jun 2025 15:14:13 +0000 Subject: [PATCH] feat(migrations): try to fix the migrations service --- docker-compose.yml | 1 + migrations/20240101000000_initial_schema.sql | 17 +++++++++++++++ src/db.rs | 22 ++++++++++++++++---- src/file_service.rs | 2 ++ src/main.rs | 22 ++++++++++++++++++++ src/models.rs | 2 ++ 6 files changed, 62 insertions(+), 4 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 1ad0428..863521e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -8,6 +8,7 @@ services: - JWT_SECRET=your-super-secret-jwt-key-change-this-in-production - UPLOAD_PATH=/app/uploads - WATCH_FOLDER=/app/watch + - RUST_BACKTRACE=1 volumes: - uploads:/app/uploads - watch:/app/watch diff --git a/migrations/20240101000000_initial_schema.sql b/migrations/20240101000000_initial_schema.sql index 55d7702..4056df8 100644 --- a/migrations/20240101000000_initial_schema.sql +++ b/migrations/20240101000000_initial_schema.sql @@ -22,6 +22,12 @@ CREATE TABLE IF NOT EXISTS documents ( mime_type VARCHAR(100) NOT NULL, content TEXT, ocr_text TEXT, + ocr_confidence REAL, + ocr_word_count INT, + ocr_processing_time_ms INT, + ocr_status VARCHAR(20) DEFAULT 'pending', + ocr_error TEXT, + ocr_completed_at TIMESTAMPTZ, tags TEXT[] DEFAULT '{}', created_at TIMESTAMPTZ DEFAULT NOW(), updated_at TIMESTAMPTZ DEFAULT NOW(), @@ -36,6 +42,8 @@ CREATE INDEX IF NOT EXISTS idx_documents_tags ON documents USING GIN(tags); CREATE INDEX IF NOT EXISTS idx_documents_content_search ON documents USING GIN(to_tsvector('english', COALESCE(content, '') || ' ' || COALESCE(ocr_text, ''))); CREATE INDEX IF NOT EXISTS idx_documents_filename_trgm ON documents USING GIN(filename gin_trgm_ops); CREATE INDEX IF NOT EXISTS idx_documents_content_trgm ON documents USING GIN((COALESCE(content, '') || ' ' || COALESCE(ocr_text, '')) gin_trgm_ops); +CREATE INDEX IF NOT EXISTS idx_documents_ocr_confidence ON documents(ocr_confidence) WHERE ocr_confidence IS NOT NULL; +CREATE INDEX IF NOT EXISTS idx_documents_ocr_word_count ON documents(ocr_word_count) WHERE ocr_word_count IS NOT NULL; -- Create settings table CREATE TABLE IF NOT EXISTS settings ( @@ -57,6 +65,15 @@ CREATE TABLE IF NOT EXISTS settings ( memory_limit_mb INT DEFAULT 512, cpu_priority VARCHAR(10) DEFAULT 'normal', enable_background_ocr BOOLEAN DEFAULT TRUE, + ocr_page_segmentation_mode INT DEFAULT 3, + ocr_engine_mode INT DEFAULT 3, + ocr_min_confidence REAL DEFAULT 30.0, + ocr_dpi INT DEFAULT 300, + ocr_enhance_contrast BOOLEAN DEFAULT true, + ocr_remove_noise BOOLEAN DEFAULT true, + ocr_detect_orientation BOOLEAN DEFAULT true, + ocr_whitelist_chars TEXT, + ocr_blacklist_chars TEXT, created_at TIMESTAMPTZ DEFAULT NOW(), updated_at TIMESTAMPTZ DEFAULT NOW() ); \ No newline at end of file diff --git a/src/db.rs b/src/db.rs index 0cee394..15a290b 100644 --- a/src/db.rs +++ b/src/db.rs @@ -322,9 +322,9 @@ impl Database { pub async fn create_document(&self, document: Document) -> Result { let row = sqlx::query( r#" - INSERT INTO documents (id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, tags, created_at, updated_at, user_id) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12) - RETURNING id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, tags, created_at, updated_at, user_id + INSERT INTO documents (id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18) + RETURNING id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id "# ) .bind(document.id) @@ -335,6 +335,12 @@ impl Database { .bind(&document.mime_type) .bind(&document.content) .bind(&document.ocr_text) + .bind(document.ocr_confidence) + .bind(document.ocr_word_count) + .bind(document.ocr_processing_time_ms) + .bind(&document.ocr_status) + .bind(&document.ocr_error) + .bind(document.ocr_completed_at) .bind(&document.tags) .bind(document.created_at) .bind(document.updated_at) @@ -355,6 +361,8 @@ impl Database { ocr_word_count: row.get("ocr_word_count"), ocr_processing_time_ms: row.get("ocr_processing_time_ms"), ocr_status: row.get("ocr_status"), + ocr_error: row.get("ocr_error"), + ocr_completed_at: row.get("ocr_completed_at"), tags: row.get("tags"), created_at: row.get("created_at"), updated_at: row.get("updated_at"), @@ -393,6 +401,8 @@ impl Database { ocr_word_count: row.get("ocr_word_count"), ocr_processing_time_ms: row.get("ocr_processing_time_ms"), ocr_status: row.get("ocr_status"), + ocr_error: row.get("ocr_error"), + ocr_completed_at: row.get("ocr_completed_at"), tags: row.get("tags"), created_at: row.get("created_at"), updated_at: row.get("updated_at"), @@ -431,6 +441,8 @@ impl Database { ocr_word_count: row.get("ocr_word_count"), ocr_processing_time_ms: row.get("ocr_processing_time_ms"), ocr_status: row.get("ocr_status"), + ocr_error: row.get("ocr_error"), + ocr_completed_at: row.get("ocr_completed_at"), tags: row.get("tags"), created_at: row.get("created_at"), updated_at: row.get("updated_at"), @@ -499,6 +511,8 @@ impl Database { ocr_word_count: row.get("ocr_word_count"), ocr_processing_time_ms: row.get("ocr_processing_time_ms"), ocr_status: row.get("ocr_status"), + ocr_error: row.get("ocr_error"), + ocr_completed_at: row.get("ocr_completed_at"), tags: row.get("tags"), created_at: row.get("created_at"), updated_at: row.get("updated_at"), @@ -580,7 +594,7 @@ impl Database { CASE WHEN filename ILIKE '%' || "# )); builder.push_bind(&search.query); - builder.push(&format!(r#"' || '%' THEN 0.8 ELSE 0 END, + builder.push(&format!(r#" || '%' THEN 0.8 ELSE 0 END, ts_rank(to_tsvector('english', COALESCE(content, '') || ' ' || COALESCE(ocr_text, '')), {}('english', "#, query_function)); builder.push_bind(&search.query); builder.push(&format!(r#")) diff --git a/src/file_service.rs b/src/file_service.rs index 0614e63..f2a92eb 100644 --- a/src/file_service.rs +++ b/src/file_service.rs @@ -62,6 +62,8 @@ impl FileService { ocr_word_count: None, ocr_processing_time_ms: None, ocr_status: Some("pending".to_string()), + ocr_error: None, + ocr_completed_at: None, tags: Vec::new(), created_at: Utc::now(), updated_at: Utc::now(), diff --git a/src/main.rs b/src/main.rs index 9fcdaa0..dc34396 100644 --- a/src/main.rs +++ b/src/main.rs @@ -82,6 +82,28 @@ async fn main() -> Result<(), Box> { } } + // Debug: Check what columns exist in documents table + let columns_result = sqlx::query( + "SELECT column_name FROM information_schema.columns + WHERE table_name = 'documents' AND table_schema = 'public' + ORDER BY ordinal_position" + ) + .fetch_all(&db.pool) + .await; + + match columns_result { + Ok(rows) => { + info!("Columns in documents table:"); + for row in rows { + let column_name: String = row.get("column_name"); + info!(" - {}", column_name); + } + } + Err(e) => { + error!("Failed to check columns: {}", e); + } + } + // Seed admin user seed::seed_admin_user(&db).await?; diff --git a/src/models.rs b/src/models.rs index 0e333a8..56961d1 100644 --- a/src/models.rs +++ b/src/models.rs @@ -54,6 +54,8 @@ pub struct Document { pub ocr_word_count: Option, pub ocr_processing_time_ms: Option, pub ocr_status: Option, + pub ocr_error: Option, + pub ocr_completed_at: Option>, pub tags: Vec, pub created_at: DateTime, pub updated_at: DateTime,