feat(server): implement ocr enhanced service throughout

This commit is contained in:
perfectra1n 2025-06-12 22:12:50 -07:00
parent d5f419ca18
commit d61b1c3f4b
12 changed files with 561 additions and 6 deletions

View File

@ -24,6 +24,7 @@ RUN apt-get update && apt-get install -y \
# All following build steps run inside /app.
WORKDIR /app
# Copy the Cargo manifest/lockfile and the Rust sources needed for the build.
COPY Cargo.toml Cargo.lock ./
COPY src ./src
# Bring the migrations directory into this stage as /app/migrations; the runtime
# stage copies /app/migrations from the backend-builder stage
# (presumably this stage -- confirm against the FROM line, which is outside this hunk).
COPY migrations ./migrations
RUN cargo build --release
# --- Runtime stage ---
@ -41,6 +42,9 @@ WORKDIR /app
# Copy backend binary
COPY --from=backend-builder /app/target/release/readur /app/readur
# Copy migrations directory
COPY --from=backend-builder /app/migrations /app/migrations
# Create necessary directories
RUN mkdir -p /app/uploads /app/watch /app/frontend

View File

@ -2,6 +2,7 @@ import React, { useState, useEffect } from 'react'
import FileUpload from './FileUpload'
import DocumentList from './DocumentList'
import SearchBar from './SearchBar'
import OcrAnalytics from './OcrAnalytics'
import { Document, documentService } from '../services/api'
function Dashboard() {
@ -55,6 +56,12 @@ function Dashboard() {
<SearchBar onSearch={handleSearch} />
</div>
{!searchResults && (
<div className="mb-6">
<OcrAnalytics documents={documents} />
</div>
)}
{searchResults && (
<div className="mb-4">
<button

View File

@ -42,6 +42,78 @@ function DocumentList({ documents, loading }: DocumentListProps) {
return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i]
}
/**
 * Builds the OCR status badge shown next to a document's metadata.
 *
 * Precedence:
 *  1. `ocr_status === 'failed'`      -> red "OCR Failed" badge
 *  2. `ocr_status === 'processing'`  -> yellow "Processing..." badge
 *  3. no OCR text                    -> no badge (`null`)
 *  4. numeric confidence available   -> "OCR NN%" coloured by confidence band
 *  5. otherwise                      -> plain green "OCR" badge
 *
 * The status checks run before the `has_ocr_text` guard: a document whose OCR
 * failed or is still running normally has no extracted text yet, so gating
 * those badges on `has_ocr_text` would hide them.
 *
 * @param document Document row as returned by the API.
 * @returns A badge element, or `null` when there is nothing to show.
 */
const getOcrStatusBadge = (document: Document) => {
  const badgeBase = 'ml-2 inline-flex items-center px-2.5 py-0.5 rounded-full text-xs font-medium'
  const status = document.ocr_status

  if (status === 'failed') {
    return (
      <span className={`${badgeBase} bg-red-100 text-red-800`}>
        OCR Failed
      </span>
    )
  }

  if (status === 'processing') {
    return (
      <span className={`${badgeBase} bg-yellow-100 text-yellow-800`}>
        Processing...
      </span>
    )
  }

  if (!document.has_ocr_text) {
    return null
  }

  // Optional fields may arrive as `null` in the JSON payload rather than being
  // absent, so a `!== undefined` check is not enough: `null.toFixed()` throws.
  const confidence = document.ocr_confidence
  if (typeof confidence === 'number' && Number.isFinite(confidence)) {
    let badgeClass = 'bg-orange-100 text-orange-800'
    if (confidence >= 80) {
      badgeClass = 'bg-green-100 text-green-800'
    } else if (confidence >= 60) {
      badgeClass = 'bg-yellow-100 text-yellow-800'
    }

    return (
      <span className={`${badgeBase} ${badgeClass}`}>
        {`OCR ${confidence.toFixed(0)}%`}
      </span>
    )
  }

  return (
    <span className={`${badgeBase} bg-green-100 text-green-800`}>
      OCR
    </span>
  )
}
/**
 * Formats the OCR metrics line for a document, e.g. `"120 words • 1.5s"`.
 *
 * @param document Document row as returned by the API.
 * @returns The word count, followed by the processing time in seconds when it
 *   is present and non-zero, joined by a bullet; `null` when the document has
 *   no OCR text or no (non-zero) word count.
 */
const getOcrMetrics = (document: Document) => {
  const { has_ocr_text, ocr_word_count, ocr_processing_time_ms } = document

  // Without OCR text or a word count there is nothing worth showing.
  if (!(has_ocr_text && ocr_word_count)) {
    return null
  }

  const parts = [`${ocr_word_count} words`]
  if (ocr_processing_time_ms) {
    parts.push(`${(ocr_processing_time_ms / 1000).toFixed(1)}s`)
  }

  return parts.join(' • ')
}
if (loading) {
return (
<div className="text-center py-8">
@ -74,11 +146,8 @@ function DocumentList({ documents, loading }: DocumentListProps) {
</div>
<div className="text-sm text-gray-500">
{formatFileSize(document.file_size)} {document.mime_type}
{document.has_ocr_text && (
<span className="ml-2 inline-flex items-center px-2.5 py-0.5 rounded-full text-xs font-medium bg-green-100 text-green-800">
OCR
</span>
)}
{getOcrMetrics(document)}
{getOcrStatusBadge(document)}
</div>
<div className="text-xs text-gray-400">
{new Date(document.created_at).toLocaleDateString()}

View File

@ -0,0 +1,187 @@
import React, { useState, useEffect } from 'react'
import {
ChartBarIcon,
ClockIcon,
DocumentTextIcon,
ExclamationCircleIcon,
} from '@heroicons/react/24/outline'
import { Document } from '../services/api'
/** Props for the OcrAnalytics summary panel. */
interface OcrAnalyticsProps {
/** Documents to aggregate; the panel renders nothing when none of them has OCR text. */
documents: Document[]
}
/** Aggregate OCR figures computed from the documents passed to OcrAnalytics. */
interface OcrStats {
/** Number of documents passed in. */
totalDocuments: number
/** Documents whose `has_ocr_text` flag is set. */
documentsWithOcr: number
/** Mean `ocr_confidence` over documents with status 'completed'; 0 when there are no scores. */
averageConfidence: number
/** Completed documents with confidence >= 80. */
highConfidenceCount: number
/** Completed documents with confidence < 60. */
lowConfidenceCount: number
/** Documents with OCR text whose status is 'failed'. */
failedCount: number
/** Documents with OCR text whose status is 'processing'. */
processingCount: number
/** Sum of `ocr_word_count` over completed documents. */
totalWords: number
/** Mean `ocr_processing_time_ms` (milliseconds) over completed documents; 0 when there are no timings. */
averageProcessingTime: number
}
/**
 * Narrows a value to a finite number.
 *
 * Optional metrics may arrive as `null` in the JSON payload rather than being
 * absent. A `!== undefined` check lets `null` through, which then inflates the
 * averages' denominators and counts as "low confidence" (`null < 60` is true).
 */
const isFiniteNumber = (value: unknown): value is number =>
  typeof value === 'number' && Number.isFinite(value)

/** Arithmetic mean of `values`, or 0 for an empty list. */
const mean = (values: number[]): number =>
  values.length > 0 ? values.reduce((sum, value) => sum + value, 0) / values.length : 0

/**
 * Aggregates OCR statistics for a list of documents.
 *
 * @returns `null` for an empty list, otherwise the computed stats.
 */
function computeOcrStats(documents: Document[]): OcrStats | null {
  if (documents.length === 0) {
    return null
  }

  const ocrDocuments = documents.filter(doc => doc.has_ocr_text)
  const withStatus = (status: string) => ocrDocuments.filter(doc => doc.ocr_status === status)

  const completedOcr = withStatus('completed')
  // NOTE(review): failed/processing counts only consider documents that already
  // have OCR text. If the backend leaves the text empty for failed or in-flight
  // documents these will always be 0 -- confirm the intended population.
  const failedOcr = withStatus('failed')
  const processingOcr = withStatus('processing')

  const confidenceScores = completedOcr.map(doc => doc.ocr_confidence).filter(isFiniteNumber)
  const wordCounts = completedOcr.map(doc => doc.ocr_word_count).filter(isFiniteNumber)
  const processingTimes = completedOcr.map(doc => doc.ocr_processing_time_ms).filter(isFiniteNumber)

  return {
    totalDocuments: documents.length,
    documentsWithOcr: ocrDocuments.length,
    averageConfidence: mean(confidenceScores),
    highConfidenceCount: confidenceScores.filter(conf => conf >= 80).length,
    lowConfidenceCount: confidenceScores.filter(conf => conf < 60).length,
    failedCount: failedOcr.length,
    processingCount: processingOcr.length,
    totalWords: wordCounts.reduce((sum, count) => sum + count, 0),
    averageProcessingTime: mean(processingTimes),
  }
}

/**
 * Summary panel with OCR statistics (success rate, average confidence, word
 * total, average processing time and a quality breakdown) for the given
 * documents. Renders nothing until stats are computed or when no document has
 * OCR text.
 */
function OcrAnalytics({ documents }: OcrAnalyticsProps) {
  const [stats, setStats] = useState<OcrStats | null>(null)

  // Recompute the aggregates whenever the document list changes.
  useEffect(() => {
    setStats(computeOcrStats(documents))
  }, [documents])

  if (!stats || stats.documentsWithOcr === 0) {
    return null
  }

  /** Formats a duration in milliseconds as "123ms" or "1.2s". */
  const formatTime = (ms: number) => {
    if (ms < 1000) return `${Math.round(ms)}ms`
    return `${(ms / 1000).toFixed(1)}s`
  }

  /** Text colour for a confidence percentage: >= 80 green, >= 60 yellow, else orange. */
  const getConfidenceColor = (confidence: number) => {
    if (confidence >= 80) return 'text-green-600'
    if (confidence >= 60) return 'text-yellow-600'
    return 'text-orange-600'
  }

  // documentsWithOcr is non-zero here thanks to the early return above.
  const successRate = ((stats.documentsWithOcr - stats.failedCount) / stats.documentsWithOcr) * 100

  return (
    <div className="bg-white overflow-hidden shadow rounded-lg">
      <div className="p-5">
        <div className="flex items-center">
          <div className="flex-shrink-0">
            <ChartBarIcon className="h-6 w-6 text-gray-400" />
          </div>
          <div className="ml-5 w-0 flex-1">
            <dl>
              <dt className="text-sm font-medium text-gray-500 truncate">
                OCR Analytics
              </dt>
              <dd className="text-lg font-medium text-gray-900">
                {stats.documentsWithOcr} of {stats.totalDocuments} documents processed
              </dd>
            </dl>
          </div>
        </div>
      </div>
      <div className="bg-gray-50 px-5 py-3">
        <div className="grid grid-cols-2 gap-4 sm:grid-cols-4">
          {/* Success Rate */}
          <div className="text-center">
            <div className="text-lg font-semibold text-gray-900">
              {successRate.toFixed(0)}%
            </div>
            <div className="text-xs text-gray-500">Success Rate</div>
          </div>
          {/* Average Confidence */}
          <div className="text-center">
            <div className={`text-lg font-semibold ${getConfidenceColor(stats.averageConfidence)}`}>
              {stats.averageConfidence.toFixed(0)}%
            </div>
            <div className="text-xs text-gray-500">Avg Confidence</div>
          </div>
          {/* Total Words */}
          <div className="text-center">
            <div className="text-lg font-semibold text-gray-900">
              {stats.totalWords.toLocaleString()}
            </div>
            <div className="text-xs text-gray-500">Words Extracted</div>
          </div>
          {/* Average Processing Time */}
          <div className="text-center">
            <div className="text-lg font-semibold text-gray-900">
              {formatTime(stats.averageProcessingTime)}
            </div>
            <div className="text-xs text-gray-500">Avg Time</div>
          </div>
        </div>
        {/* Quality Distribution */}
        <div className="mt-4 pt-4 border-t border-gray-200">
          <div className="flex justify-between items-center text-sm">
            <div className="flex items-center space-x-4">
              <div className="flex items-center">
                <div className="w-2 h-2 bg-green-500 rounded-full mr-1"></div>
                <span className="text-gray-600">High Quality: {stats.highConfidenceCount}</span>
              </div>
              {stats.lowConfidenceCount > 0 && (
                <div className="flex items-center">
                  <div className="w-2 h-2 bg-orange-500 rounded-full mr-1"></div>
                  <span className="text-gray-600">Low Quality: {stats.lowConfidenceCount}</span>
                </div>
              )}
              {stats.failedCount > 0 && (
                <div className="flex items-center">
                  <div className="w-2 h-2 bg-red-500 rounded-full mr-1"></div>
                  <span className="text-gray-600">Failed: {stats.failedCount}</span>
                </div>
              )}
              {stats.processingCount > 0 && (
                <div className="flex items-center">
                  <div className="w-2 h-2 bg-yellow-500 rounded-full mr-1 animate-pulse"></div>
                  <span className="text-gray-600">Processing: {stats.processingCount}</span>
                </div>
              )}
            </div>
          </div>
        </div>
      </div>
    </div>
  )
}
export default OcrAnalytics

View File

@ -19,6 +19,10 @@ export interface Document {
tags: string[]
created_at: string
has_ocr_text: boolean
ocr_confidence?: number
ocr_word_count?: number
ocr_processing_time_ms?: number
ocr_status?: string
}
export interface SearchRequest {
@ -53,6 +57,10 @@ export interface EnhancedDocument {
tags: string[]
created_at: string
has_ocr_text: boolean
ocr_confidence?: number
ocr_word_count?: number
ocr_processing_time_ms?: number
ocr_status?: string
search_rank?: number
snippets: SearchSnippet[]
}

View File

@ -351,6 +351,10 @@ impl Database {
mime_type: row.get("mime_type"),
content: row.get("content"),
ocr_text: row.get("ocr_text"),
ocr_confidence: row.get("ocr_confidence"),
ocr_word_count: row.get("ocr_word_count"),
ocr_processing_time_ms: row.get("ocr_processing_time_ms"),
ocr_status: row.get("ocr_status"),
tags: row.get("tags"),
created_at: row.get("created_at"),
updated_at: row.get("updated_at"),
@ -385,6 +389,10 @@ impl Database {
mime_type: row.get("mime_type"),
content: row.get("content"),
ocr_text: row.get("ocr_text"),
ocr_confidence: row.get("ocr_confidence"),
ocr_word_count: row.get("ocr_word_count"),
ocr_processing_time_ms: row.get("ocr_processing_time_ms"),
ocr_status: row.get("ocr_status"),
tags: row.get("tags"),
created_at: row.get("created_at"),
updated_at: row.get("updated_at"),
@ -419,6 +427,10 @@ impl Database {
mime_type: row.get("mime_type"),
content: row.get("content"),
ocr_text: row.get("ocr_text"),
ocr_confidence: row.get("ocr_confidence"),
ocr_word_count: row.get("ocr_word_count"),
ocr_processing_time_ms: row.get("ocr_processing_time_ms"),
ocr_status: row.get("ocr_status"),
tags: row.get("tags"),
created_at: row.get("created_at"),
updated_at: row.get("updated_at"),
@ -483,6 +495,10 @@ impl Database {
mime_type: row.get("mime_type"),
content: row.get("content"),
ocr_text: row.get("ocr_text"),
ocr_confidence: row.get("ocr_confidence"),
ocr_word_count: row.get("ocr_word_count"),
ocr_processing_time_ms: row.get("ocr_processing_time_ms"),
ocr_status: row.get("ocr_status"),
tags: row.get("tags"),
created_at: row.get("created_at"),
updated_at: row.get("updated_at"),
@ -637,6 +653,10 @@ impl Database {
tags: row.get("tags"),
created_at: row.get("created_at"),
has_ocr_text: ocr_text.is_some(),
ocr_confidence: row.get("ocr_confidence"),
ocr_word_count: row.get("ocr_word_count"),
ocr_processing_time_ms: row.get("ocr_processing_time_ms"),
ocr_status: row.get("ocr_status"),
search_rank: Some(rank),
snippets,
});

View File

@ -58,6 +58,10 @@ impl FileService {
mime_type: mime_type.to_string(),
content: None,
ocr_text: None,
ocr_confidence: None,
ocr_word_count: None,
ocr_processing_time_ms: None,
ocr_status: Some("pending".to_string()),
tags: Vec::new(),
created_at: Utc::now(),
updated_at: Utc::now(),

View File

@ -2,8 +2,9 @@ pub mod auth;
pub mod batch_ingest;
pub mod config;
pub mod db;
pub mod enhanced_ocr; // Temporarily disabled due to compilation errors
pub mod enhanced_ocr;
pub mod file_service;
pub mod migrations;
pub mod models;
pub mod ocr;
pub mod ocr_queue;

View File

@ -43,6 +43,12 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
db.migrate().await?;
// Run automatic migrations
if let Err(e) = readur::migrations::run_startup_migrations(&config.database_url, "migrations").await {
error!("Failed to run migrations: {}", e);
return Err(e.into());
}
// Seed admin user
seed::seed_admin_user(&db).await?;

229
src/migrations.rs Normal file
View File

@ -0,0 +1,229 @@
use anyhow::Result;
use sqlx::PgPool;
use tracing::{info, warn, error};
use std::fs;
use std::path::Path;
/// Loads `.sql` migration files from a directory and applies the ones that are
/// not yet recorded in the `schema_migrations` table.
pub struct MigrationRunner {
/// Connection pool used for the bookkeeping queries and for applying migrations.
pool: PgPool,
/// Path of the directory that is scanned for `.sql` migration files.
migrations_dir: String,
}
/// A single migration loaded from disk.
#[derive(Debug)]
pub struct Migration {
/// Version parsed from the filename prefix before the first `_`
/// (e.g. `001_add_ocr_queue.sql` -> 1).
pub version: i32,
/// Filename with the `.sql` part removed.
pub name: String,
/// Full contents of the migration file.
pub sql: String,
}
impl MigrationRunner {
    /// Creates a runner that applies the `.sql` files found in `migrations_dir`
    /// through `pool`.
    pub fn new(pool: PgPool, migrations_dir: String) -> Self {
        Self {
            pool,
            migrations_dir,
        }
    }

    /// Initialize the migrations table if it doesn't exist
    pub async fn init(&self) -> Result<()> {
        sqlx::query(
            r#"
            CREATE TABLE IF NOT EXISTS schema_migrations (
                version INTEGER PRIMARY KEY,
                name VARCHAR(255) NOT NULL,
                applied_at TIMESTAMPTZ DEFAULT NOW()
            );
            "#,
        )
        .execute(&self.pool)
        .await?;

        info!("Migration system initialized");
        Ok(())
    }

    /// Load all migration files from the migrations directory.
    ///
    /// Only files with a `.sql` extension whose name starts with an integer
    /// version followed by `_` are loaded (e.g. `001_add_ocr_queue.sql`).
    /// Other `.sql` files are skipped with a warning. A missing directory
    /// yields an empty list.
    ///
    /// # Errors
    /// Fails if the directory or one of its entries cannot be read, or if a
    /// migration file cannot be read as UTF-8 text.
    pub fn load_migrations(&self) -> Result<Vec<Migration>> {
        let migrations_path = Path::new(&self.migrations_dir);

        if !migrations_path.exists() {
            warn!("Migrations directory not found: {}", self.migrations_dir);
            return Ok(Vec::new());
        }

        // Surface directory-entry errors instead of silently dropping them: a
        // migration that goes missing from the list would simply never run.
        let mut sql_files = Vec::new();
        for entry in fs::read_dir(migrations_path)? {
            let path = entry?.path();
            if path.extension().and_then(|ext| ext.to_str()) == Some("sql") {
                sql_files.push(path);
            }
        }

        // Sort by filename so that files sharing a version keep a stable order.
        sql_files.sort_by(|a, b| a.file_name().cmp(&b.file_name()));

        let mut migrations = Vec::with_capacity(sql_files.len());
        for path in sql_files {
            let filename = match path.file_name() {
                Some(name) => name.to_string_lossy().into_owned(),
                None => continue,
            };

            // Parse version from filename (e.g., "001_add_ocr_queue.sql" -> version 1)
            let version = match filename
                .split('_')
                .next()
                .and_then(|prefix| prefix.parse::<i32>().ok())
            {
                Some(version) => version,
                None => {
                    warn!(
                        "Skipping migration file without a numeric version prefix: {}",
                        filename
                    );
                    continue;
                }
            };

            let sql = fs::read_to_string(&path)?;
            // Strip only the trailing extension; `replace` would also remove a
            // ".sql" occurring in the middle of the name.
            let name = filename
                .strip_suffix(".sql")
                .unwrap_or(filename.as_str())
                .to_string();

            migrations.push(Migration { version, name, sql });
        }

        // Stable sort: ties keep the filename order established above.
        migrations.sort_by_key(|m| m.version);
        Ok(migrations)
    }

    /// Get the list of applied migration versions, in ascending order
    pub async fn get_applied_migrations(&self) -> Result<Vec<i32>> {
        let rows = sqlx::query_scalar::<_, i32>(
            "SELECT version FROM schema_migrations ORDER BY version",
        )
        .fetch_all(&self.pool)
        .await?;

        Ok(rows)
    }

    /// Check if a specific migration has been applied
    pub async fn is_migration_applied(&self, version: i32) -> Result<bool> {
        let count: i64 = sqlx::query_scalar(
            "SELECT COUNT(*) FROM schema_migrations WHERE version = $1",
        )
        .bind(version)
        .fetch_one(&self.pool)
        .await?;

        Ok(count > 0)
    }

    /// Apply a single migration.
    ///
    /// The migration SQL and its `schema_migrations` record are written in one
    /// transaction, so a failure leaves neither behind.
    pub async fn apply_migration(&self, migration: &Migration) -> Result<()> {
        info!("Applying migration {}: {}", migration.version, migration.name);

        let mut tx = self.pool.begin().await?;

        // Execute the raw SQL string rather than `sqlx::query(..)`: a plain
        // `&str` is sent without being prepared, so the file may contain
        // several `;`-separated statements. `sqlx::query` always prepares, and
        // PostgreSQL rejects multiple commands in a prepared statement.
        sqlx::Executor::execute(&mut *tx, migration.sql.as_str())
            .await
            .map_err(|e| {
                error!("Failed to apply migration {}: {}", migration.version, e);
                e
            })?;

        // Record the migration as applied
        sqlx::query("INSERT INTO schema_migrations (version, name) VALUES ($1, $2)")
            .bind(migration.version)
            .bind(&migration.name)
            .execute(&mut *tx)
            .await?;

        tx.commit().await?;

        info!(
            "Successfully applied migration {}: {}",
            migration.version, migration.name
        );
        Ok(())
    }

    /// Run all pending migrations, in ascending version order
    pub async fn run_migrations(&self) -> Result<()> {
        self.init().await?;

        let migrations = self.load_migrations()?;
        if migrations.is_empty() {
            info!("No migrations found");
            return Ok(());
        }

        let applied = self.get_applied_migrations().await?;
        let pending = Self::pending(&migrations, &applied);

        if pending.is_empty() {
            info!("All migrations are up to date");
            return Ok(());
        }

        info!("Found {} pending migrations", pending.len());

        for migration in pending {
            self.apply_migration(migration).await?;
        }

        info!("All migrations completed successfully");
        Ok(())
    }

    /// Get migration status summary
    pub async fn get_status(&self) -> Result<MigrationStatus> {
        self.init().await?;

        let migrations = self.load_migrations()?;
        let applied = self.get_applied_migrations().await?;
        let pending_count = Self::pending(&migrations, &applied).len();

        Ok(MigrationStatus {
            total_migrations: migrations.len(),
            applied_migrations: applied.len(),
            pending_migrations: pending_count,
            latest_version: migrations.last().map(|m| m.version),
            current_version: applied.last().copied(),
        })
    }

    /// Returns the migrations whose version is not in `applied`, preserving order.
    fn pending<'a>(migrations: &'a [Migration], applied: &[i32]) -> Vec<&'a Migration> {
        migrations
            .iter()
            .filter(|m| !applied.contains(&m.version))
            .collect()
    }
}
/// Snapshot of how the migration files on disk compare with what the database
/// has recorded as applied.
#[derive(Debug)]
pub struct MigrationStatus {
    /// Number of migration files found on disk.
    pub total_migrations: usize,
    /// Number of versions recorded as applied in the database.
    pub applied_migrations: usize,
    /// Number of on-disk migrations that have not been applied yet.
    pub pending_migrations: usize,
    /// Highest version found on disk, if any.
    pub latest_version: Option<i32>,
    /// Highest version recorded as applied, if any.
    pub current_version: Option<i32>,
}

impl MigrationStatus {
    /// True when no migration is waiting to be applied.
    pub fn is_up_to_date(&self) -> bool {
        matches!(self.pending_migrations, 0)
    }

    /// True when at least one migration still has to be applied.
    pub fn needs_migration(&self) -> bool {
        !self.is_up_to_date()
    }
}
/// Convenience function to run migrations at startup.
///
/// Opens a dedicated connection pool for `database_url`, applies every pending
/// migration found in `migrations_dir`, and closes the pool again before
/// returning -- on success and on failure alike.
///
/// # Errors
/// Returns an error if the database connection cannot be established or if
/// loading or applying a migration fails.
pub async fn run_startup_migrations(database_url: &str, migrations_dir: &str) -> Result<()> {
    let pool = sqlx::PgPool::connect(database_url).await?;
    // `PgPool` is a cheap handle; keep one here so the pool can be closed below.
    let runner = MigrationRunner::new(pool.clone(), migrations_dir.to_string());

    info!("Running database migrations...");
    let outcome = runner.run_migrations().await;

    // This pool exists only for the migration run: close it explicitly so its
    // connections are released gracefully instead of lingering until drop.
    pool.close().await;

    outcome
}

View File

@ -50,6 +50,10 @@ pub struct Document {
pub mime_type: String,
pub content: Option<String>,
pub ocr_text: Option<String>,
pub ocr_confidence: Option<f32>,
pub ocr_word_count: Option<i32>,
pub ocr_processing_time_ms: Option<i32>,
pub ocr_status: Option<String>,
pub tags: Vec<String>,
pub created_at: DateTime<Utc>,
pub updated_at: DateTime<Utc>,
@ -66,6 +70,10 @@ pub struct DocumentResponse {
pub tags: Vec<String>,
pub created_at: DateTime<Utc>,
pub has_ocr_text: bool,
pub ocr_confidence: Option<f32>,
pub ocr_word_count: Option<i32>,
pub ocr_processing_time_ms: Option<i32>,
pub ocr_status: Option<String>,
}
#[derive(Debug, Serialize, Deserialize, ToSchema, IntoParams)]
@ -122,6 +130,10 @@ pub struct EnhancedDocumentResponse {
pub tags: Vec<String>,
pub created_at: DateTime<Utc>,
pub has_ocr_text: bool,
pub ocr_confidence: Option<f32>,
pub ocr_word_count: Option<i32>,
pub ocr_processing_time_ms: Option<i32>,
pub ocr_status: Option<String>,
pub search_rank: Option<f32>,
pub snippets: Vec<SearchSnippet>,
}
@ -145,6 +157,10 @@ impl From<Document> for DocumentResponse {
tags: doc.tags,
created_at: doc.created_at,
has_ocr_text: doc.ocr_text.is_some(),
ocr_confidence: doc.ocr_confidence,
ocr_word_count: doc.ocr_word_count,
ocr_processing_time_ms: doc.ocr_processing_time_ms,
ocr_status: doc.ocr_status,
}
}
}

View File

@ -55,6 +55,10 @@ async fn search_documents(
tags: doc.tags,
created_at: doc.created_at,
has_ocr_text: doc.ocr_text.is_some(),
ocr_confidence: doc.ocr_confidence,
ocr_word_count: doc.ocr_word_count,
ocr_processing_time_ms: doc.ocr_processing_time_ms,
ocr_status: doc.ocr_status,
search_rank: None,
snippets: Vec::new(),
}).collect(),