feat(server): implement ocr enhanced service throughout
This commit is contained in:
parent
d5f419ca18
commit
d61b1c3f4b
|
|
@ -24,6 +24,7 @@ RUN apt-get update && apt-get install -y \
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
COPY Cargo.toml Cargo.lock ./
|
COPY Cargo.toml Cargo.lock ./
|
||||||
COPY src ./src
|
COPY src ./src
|
||||||
|
COPY migrations ./migrations
|
||||||
RUN cargo build --release
|
RUN cargo build --release
|
||||||
|
|
||||||
# --- Runtime stage ---
|
# --- Runtime stage ---
|
||||||
|
|
@ -41,6 +42,9 @@ WORKDIR /app
|
||||||
# Copy backend binary
|
# Copy backend binary
|
||||||
COPY --from=backend-builder /app/target/release/readur /app/readur
|
COPY --from=backend-builder /app/target/release/readur /app/readur
|
||||||
|
|
||||||
|
# Copy migrations directory
|
||||||
|
COPY --from=backend-builder /app/migrations /app/migrations
|
||||||
|
|
||||||
# Create necessary directories
|
# Create necessary directories
|
||||||
RUN mkdir -p /app/uploads /app/watch /app/frontend
|
RUN mkdir -p /app/uploads /app/watch /app/frontend
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@ import React, { useState, useEffect } from 'react'
|
||||||
import FileUpload from './FileUpload'
|
import FileUpload from './FileUpload'
|
||||||
import DocumentList from './DocumentList'
|
import DocumentList from './DocumentList'
|
||||||
import SearchBar from './SearchBar'
|
import SearchBar from './SearchBar'
|
||||||
|
import OcrAnalytics from './OcrAnalytics'
|
||||||
import { Document, documentService } from '../services/api'
|
import { Document, documentService } from '../services/api'
|
||||||
|
|
||||||
function Dashboard() {
|
function Dashboard() {
|
||||||
|
|
@ -55,6 +56,12 @@ function Dashboard() {
|
||||||
<SearchBar onSearch={handleSearch} />
|
<SearchBar onSearch={handleSearch} />
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
{!searchResults && (
|
||||||
|
<div className="mb-6">
|
||||||
|
<OcrAnalytics documents={documents} />
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
{searchResults && (
|
{searchResults && (
|
||||||
<div className="mb-4">
|
<div className="mb-4">
|
||||||
<button
|
<button
|
||||||
|
|
|
||||||
|
|
@ -42,6 +42,78 @@ function DocumentList({ documents, loading }: DocumentListProps) {
|
||||||
return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i]
|
return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Builds the OCR status badge rendered next to a document's metadata.
 *
 * Resolution order:
 *  1. `ocr_status === 'failed'`      -> red "OCR Failed" badge.
 *  2. `ocr_status === 'processing'`  -> yellow "Processing..." badge.
 *  3. No OCR text                    -> nothing (`null`).
 *  4. Numeric confidence available   -> "OCR NN%" badge, green for >= 80,
 *                                       yellow for >= 60, orange below that.
 *  5. Otherwise                      -> plain green "OCR" badge.
 *
 * The status checks run before the `has_ocr_text` check so that failed and
 * in-flight documents, which presumably have no text stored yet, still get a
 * badge. NOTE(review): confirm against the backend that text is only written
 * once OCR completes.
 *
 * @param document Document row being rendered.
 * @returns A badge element, or `null` when no badge applies.
 */
const getOcrStatusBadge = (document: Document) => {
  const badgeBase = 'ml-2 inline-flex items-center px-2.5 py-0.5 rounded-full text-xs font-medium'
  const status = document.ocr_status

  if (status === 'failed') {
    return <span className={`${badgeBase} bg-red-100 text-red-800`}>OCR Failed</span>
  }

  if (status === 'processing') {
    return <span className={`${badgeBase} bg-yellow-100 text-yellow-800`}>Processing...</span>
  }

  if (!document.has_ocr_text) {
    return null
  }

  // The optional OCR fields may arrive as `null` as well as `undefined`
  // (presumably how the API encodes "not set"), so test for an actual finite
  // number before calling `toFixed` instead of comparing with `undefined`.
  const confidence: unknown = document.ocr_confidence
  if (typeof confidence === 'number' && Number.isFinite(confidence)) {
    let badgeClass = 'bg-orange-100 text-orange-800'
    if (confidence >= 80) {
      badgeClass = 'bg-green-100 text-green-800'
    } else if (confidence >= 60) {
      badgeClass = 'bg-yellow-100 text-yellow-800'
    }

    return (
      <span className={`${badgeBase} ${badgeClass}`}>
        {`OCR ${confidence.toFixed(0)}%`}
      </span>
    )
  }

  return <span className={`${badgeBase} bg-green-100 text-green-800`}>OCR</span>
}
|
||||||
|
|
||||||
|
/**
 * Formats the secondary OCR metrics (word count and processing time) as a
 * suffix for the "size • mime type" line.
 *
 * Each metric is included independently when it is a positive number, so a
 * document with a processing time but no word count still shows the time.
 *
 * @param document Document row being rendered.
 * @returns A string such as `" • 120 words • 1.5s"`, or `null` when the
 *          document has no OCR text or no metric is available.
 */
const getOcrMetrics = (document: Document) => {
  if (!document.has_ocr_text) {
    return null
  }

  const metrics: string[] = []

  // `typeof` guards cover both `undefined` and `null` field values.
  const wordCount: unknown = document.ocr_word_count
  if (typeof wordCount === 'number' && wordCount > 0) {
    metrics.push(`${wordCount} words`)
  }

  const elapsedMs: unknown = document.ocr_processing_time_ms
  if (typeof elapsedMs === 'number' && elapsedMs > 0) {
    metrics.push(`${(elapsedMs / 1000).toFixed(1)}s`)
  }

  return metrics.length > 0 ? ` • ${metrics.join(' • ')}` : null
}
|
||||||
|
|
||||||
if (loading) {
|
if (loading) {
|
||||||
return (
|
return (
|
||||||
<div className="text-center py-8">
|
<div className="text-center py-8">
|
||||||
|
|
@ -74,11 +146,8 @@ function DocumentList({ documents, loading }: DocumentListProps) {
|
||||||
</div>
|
</div>
|
||||||
<div className="text-sm text-gray-500">
|
<div className="text-sm text-gray-500">
|
||||||
{formatFileSize(document.file_size)} • {document.mime_type}
|
{formatFileSize(document.file_size)} • {document.mime_type}
|
||||||
{document.has_ocr_text && (
|
{getOcrMetrics(document)}
|
||||||
<span className="ml-2 inline-flex items-center px-2.5 py-0.5 rounded-full text-xs font-medium bg-green-100 text-green-800">
|
{getOcrStatusBadge(document)}
|
||||||
OCR
|
|
||||||
</span>
|
|
||||||
)}
|
|
||||||
</div>
|
</div>
|
||||||
<div className="text-xs text-gray-400">
|
<div className="text-xs text-gray-400">
|
||||||
{new Date(document.created_at).toLocaleDateString()}
|
{new Date(document.created_at).toLocaleDateString()}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,187 @@
|
||||||
|
import React, { useState, useEffect } from 'react'
|
||||||
|
import {
|
||||||
|
ChartBarIcon,
|
||||||
|
ClockIcon,
|
||||||
|
DocumentTextIcon,
|
||||||
|
ExclamationCircleIcon,
|
||||||
|
} from '@heroicons/react/24/outline'
|
||||||
|
import { Document } from '../services/api'
|
||||||
|
|
||||||
|
/** Props accepted by the `OcrAnalytics` summary card. */
interface OcrAnalyticsProps {
  /** Documents whose OCR fields are aggregated into the card. */
  documents: Array<Document>
}
|
||||||
|
|
||||||
|
/** Aggregated OCR figures derived from a list of documents. */
interface OcrStats {
  // --- Document counts ---
  /** Number of documents passed to the component. */
  totalDocuments: number
  /** Documents flagged with `has_ocr_text`. */
  documentsWithOcr: number
  /** OCR documents whose status is `failed`. */
  failedCount: number
  /** OCR documents whose status is `processing`. */
  processingCount: number

  // --- Confidence (completed OCR documents only) ---
  /** Mean confidence, or 0 when no score is available. */
  averageConfidence: number
  /** Scores that are >= 80. */
  highConfidenceCount: number
  /** Scores that are < 60. */
  lowConfidenceCount: number

  // --- Volume and timing (completed OCR documents only) ---
  /** Sum of reported word counts. */
  totalWords: number
  /** Mean processing time in milliseconds, or 0 when none is available. */
  averageProcessingTime: number
}
|
||||||
|
|
||||||
|
/**
 * Type guard for usable numeric fields. Rejects `undefined`, `null` and NaN so
 * that unset optional OCR fields never leak into sums, averages or threshold
 * counts (`null < 60` is `true` in JavaScript, and `sum + null` adds 0).
 */
const isUsableNumber = (value: unknown): value is number =>
  typeof value === 'number' && !Number.isNaN(value)

/** Arithmetic mean of `values`, or 0 for an empty list. */
const averageOf = (values: number[]): number =>
  values.length > 0 ? values.reduce((sum, value) => sum + value, 0) / values.length : 0

/**
 * Derives the OCR summary for a list of documents.
 *
 * Counts are taken over documents flagged `has_ocr_text`; confidence, word and
 * timing figures only consider those whose status is `completed`.
 *
 * @returns The summary, or `null` when `documents` is empty.
 */
const computeOcrStats = (documents: Document[]): OcrStats | null => {
  if (documents.length === 0) {
    return null
  }

  const ocrDocuments = documents.filter(doc => doc.has_ocr_text)
  const withStatus = (status: string) => ocrDocuments.filter(doc => doc.ocr_status === status)
  const completedOcr = withStatus('completed')

  const confidenceScores = completedOcr.map(doc => doc.ocr_confidence).filter(isUsableNumber)
  const wordCounts = completedOcr.map(doc => doc.ocr_word_count).filter(isUsableNumber)
  const processingTimes = completedOcr.map(doc => doc.ocr_processing_time_ms).filter(isUsableNumber)

  return {
    totalDocuments: documents.length,
    documentsWithOcr: ocrDocuments.length,
    averageConfidence: averageOf(confidenceScores),
    highConfidenceCount: confidenceScores.filter(conf => conf >= 80).length,
    lowConfidenceCount: confidenceScores.filter(conf => conf < 60).length,
    failedCount: withStatus('failed').length,
    processingCount: withStatus('processing').length,
    totalWords: wordCounts.reduce((sum, count) => sum + count, 0),
    averageProcessingTime: averageOf(processingTimes),
  }
}

/** Renders a duration as whole milliseconds below one second, else seconds with one decimal. */
const formatTime = (ms: number) => {
  if (ms < 1000) return `${Math.round(ms)}ms`
  return `${(ms / 1000).toFixed(1)}s`
}

/** Text colour class for a confidence percentage: green >= 80, yellow >= 60, orange otherwise. */
const getConfidenceColor = (confidence: number) => {
  if (confidence >= 80) return 'text-green-600'
  if (confidence >= 60) return 'text-yellow-600'
  return 'text-orange-600'
}

/**
 * Summary card showing OCR success rate, average confidence, extracted word
 * total, average processing time and a quality breakdown.
 *
 * Renders nothing until statistics are available or when no document has OCR
 * text.
 */
function OcrAnalytics({ documents }: OcrAnalyticsProps) {
  const [stats, setStats] = useState<OcrStats | null>(null)

  // Recompute the summary whenever the document list changes.
  useEffect(() => {
    setStats(computeOcrStats(documents))
  }, [documents])

  if (!stats || stats.documentsWithOcr === 0) {
    return null
  }

  // documentsWithOcr is > 0 here, so the division is safe.
  const successRate = ((stats.documentsWithOcr - stats.failedCount) / stats.documentsWithOcr) * 100

  return (
    <div className="bg-white overflow-hidden shadow rounded-lg">
      <div className="p-5">
        <div className="flex items-center">
          <div className="flex-shrink-0">
            <ChartBarIcon className="h-6 w-6 text-gray-400" />
          </div>
          <div className="ml-5 w-0 flex-1">
            <dl>
              <dt className="text-sm font-medium text-gray-500 truncate">
                OCR Analytics
              </dt>
              <dd className="text-lg font-medium text-gray-900">
                {stats.documentsWithOcr} of {stats.totalDocuments} documents processed
              </dd>
            </dl>
          </div>
        </div>
      </div>

      <div className="bg-gray-50 px-5 py-3">
        <div className="grid grid-cols-2 gap-4 sm:grid-cols-4">
          {/* Success Rate */}
          <div className="text-center">
            <div className="text-lg font-semibold text-gray-900">
              {successRate.toFixed(0)}%
            </div>
            <div className="text-xs text-gray-500">Success Rate</div>
          </div>

          {/* Average Confidence */}
          <div className="text-center">
            <div className={`text-lg font-semibold ${getConfidenceColor(stats.averageConfidence)}`}>
              {stats.averageConfidence.toFixed(0)}%
            </div>
            <div className="text-xs text-gray-500">Avg Confidence</div>
          </div>

          {/* Total Words */}
          <div className="text-center">
            <div className="text-lg font-semibold text-gray-900">
              {stats.totalWords.toLocaleString()}
            </div>
            <div className="text-xs text-gray-500">Words Extracted</div>
          </div>

          {/* Average Processing Time */}
          <div className="text-center">
            <div className="text-lg font-semibold text-gray-900">
              {formatTime(stats.averageProcessingTime)}
            </div>
            <div className="text-xs text-gray-500">Avg Time</div>
          </div>
        </div>

        {/* Quality Distribution */}
        <div className="mt-4 pt-4 border-t border-gray-200">
          <div className="flex justify-between items-center text-sm">
            <div className="flex items-center space-x-4">
              <div className="flex items-center">
                <div className="w-2 h-2 bg-green-500 rounded-full mr-1"></div>
                <span className="text-gray-600">High Quality: {stats.highConfidenceCount}</span>
              </div>

              {stats.lowConfidenceCount > 0 && (
                <div className="flex items-center">
                  <div className="w-2 h-2 bg-orange-500 rounded-full mr-1"></div>
                  <span className="text-gray-600">Low Quality: {stats.lowConfidenceCount}</span>
                </div>
              )}

              {stats.failedCount > 0 && (
                <div className="flex items-center">
                  <div className="w-2 h-2 bg-red-500 rounded-full mr-1"></div>
                  <span className="text-gray-600">Failed: {stats.failedCount}</span>
                </div>
              )}

              {stats.processingCount > 0 && (
                <div className="flex items-center">
                  <div className="w-2 h-2 bg-yellow-500 rounded-full mr-1 animate-pulse"></div>
                  <span className="text-gray-600">Processing: {stats.processingCount}</span>
                </div>
              )}
            </div>
          </div>
        </div>
      </div>
    </div>
  )
}
|
||||||
|
|
||||||
|
export default OcrAnalytics
|
||||||
|
|
@ -19,6 +19,10 @@ export interface Document {
|
||||||
tags: string[]
|
tags: string[]
|
||||||
created_at: string
|
created_at: string
|
||||||
has_ocr_text: boolean
|
has_ocr_text: boolean
|
||||||
|
ocr_confidence?: number
|
||||||
|
ocr_word_count?: number
|
||||||
|
ocr_processing_time_ms?: number
|
||||||
|
ocr_status?: string
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface SearchRequest {
|
export interface SearchRequest {
|
||||||
|
|
@ -53,6 +57,10 @@ export interface EnhancedDocument {
|
||||||
tags: string[]
|
tags: string[]
|
||||||
created_at: string
|
created_at: string
|
||||||
has_ocr_text: boolean
|
has_ocr_text: boolean
|
||||||
|
ocr_confidence?: number
|
||||||
|
ocr_word_count?: number
|
||||||
|
ocr_processing_time_ms?: number
|
||||||
|
ocr_status?: string
|
||||||
search_rank?: number
|
search_rank?: number
|
||||||
snippets: SearchSnippet[]
|
snippets: SearchSnippet[]
|
||||||
}
|
}
|
||||||
|
|
|
||||||
20
src/db.rs
20
src/db.rs
|
|
@ -351,6 +351,10 @@ impl Database {
|
||||||
mime_type: row.get("mime_type"),
|
mime_type: row.get("mime_type"),
|
||||||
content: row.get("content"),
|
content: row.get("content"),
|
||||||
ocr_text: row.get("ocr_text"),
|
ocr_text: row.get("ocr_text"),
|
||||||
|
ocr_confidence: row.get("ocr_confidence"),
|
||||||
|
ocr_word_count: row.get("ocr_word_count"),
|
||||||
|
ocr_processing_time_ms: row.get("ocr_processing_time_ms"),
|
||||||
|
ocr_status: row.get("ocr_status"),
|
||||||
tags: row.get("tags"),
|
tags: row.get("tags"),
|
||||||
created_at: row.get("created_at"),
|
created_at: row.get("created_at"),
|
||||||
updated_at: row.get("updated_at"),
|
updated_at: row.get("updated_at"),
|
||||||
|
|
@ -385,6 +389,10 @@ impl Database {
|
||||||
mime_type: row.get("mime_type"),
|
mime_type: row.get("mime_type"),
|
||||||
content: row.get("content"),
|
content: row.get("content"),
|
||||||
ocr_text: row.get("ocr_text"),
|
ocr_text: row.get("ocr_text"),
|
||||||
|
ocr_confidence: row.get("ocr_confidence"),
|
||||||
|
ocr_word_count: row.get("ocr_word_count"),
|
||||||
|
ocr_processing_time_ms: row.get("ocr_processing_time_ms"),
|
||||||
|
ocr_status: row.get("ocr_status"),
|
||||||
tags: row.get("tags"),
|
tags: row.get("tags"),
|
||||||
created_at: row.get("created_at"),
|
created_at: row.get("created_at"),
|
||||||
updated_at: row.get("updated_at"),
|
updated_at: row.get("updated_at"),
|
||||||
|
|
@ -419,6 +427,10 @@ impl Database {
|
||||||
mime_type: row.get("mime_type"),
|
mime_type: row.get("mime_type"),
|
||||||
content: row.get("content"),
|
content: row.get("content"),
|
||||||
ocr_text: row.get("ocr_text"),
|
ocr_text: row.get("ocr_text"),
|
||||||
|
ocr_confidence: row.get("ocr_confidence"),
|
||||||
|
ocr_word_count: row.get("ocr_word_count"),
|
||||||
|
ocr_processing_time_ms: row.get("ocr_processing_time_ms"),
|
||||||
|
ocr_status: row.get("ocr_status"),
|
||||||
tags: row.get("tags"),
|
tags: row.get("tags"),
|
||||||
created_at: row.get("created_at"),
|
created_at: row.get("created_at"),
|
||||||
updated_at: row.get("updated_at"),
|
updated_at: row.get("updated_at"),
|
||||||
|
|
@ -483,6 +495,10 @@ impl Database {
|
||||||
mime_type: row.get("mime_type"),
|
mime_type: row.get("mime_type"),
|
||||||
content: row.get("content"),
|
content: row.get("content"),
|
||||||
ocr_text: row.get("ocr_text"),
|
ocr_text: row.get("ocr_text"),
|
||||||
|
ocr_confidence: row.get("ocr_confidence"),
|
||||||
|
ocr_word_count: row.get("ocr_word_count"),
|
||||||
|
ocr_processing_time_ms: row.get("ocr_processing_time_ms"),
|
||||||
|
ocr_status: row.get("ocr_status"),
|
||||||
tags: row.get("tags"),
|
tags: row.get("tags"),
|
||||||
created_at: row.get("created_at"),
|
created_at: row.get("created_at"),
|
||||||
updated_at: row.get("updated_at"),
|
updated_at: row.get("updated_at"),
|
||||||
|
|
@ -637,6 +653,10 @@ impl Database {
|
||||||
tags: row.get("tags"),
|
tags: row.get("tags"),
|
||||||
created_at: row.get("created_at"),
|
created_at: row.get("created_at"),
|
||||||
has_ocr_text: ocr_text.is_some(),
|
has_ocr_text: ocr_text.is_some(),
|
||||||
|
ocr_confidence: row.get("ocr_confidence"),
|
||||||
|
ocr_word_count: row.get("ocr_word_count"),
|
||||||
|
ocr_processing_time_ms: row.get("ocr_processing_time_ms"),
|
||||||
|
ocr_status: row.get("ocr_status"),
|
||||||
search_rank: Some(rank),
|
search_rank: Some(rank),
|
||||||
snippets,
|
snippets,
|
||||||
});
|
});
|
||||||
|
|
|
||||||
|
|
@ -58,6 +58,10 @@ impl FileService {
|
||||||
mime_type: mime_type.to_string(),
|
mime_type: mime_type.to_string(),
|
||||||
content: None,
|
content: None,
|
||||||
ocr_text: None,
|
ocr_text: None,
|
||||||
|
ocr_confidence: None,
|
||||||
|
ocr_word_count: None,
|
||||||
|
ocr_processing_time_ms: None,
|
||||||
|
ocr_status: Some("pending".to_string()),
|
||||||
tags: Vec::new(),
|
tags: Vec::new(),
|
||||||
created_at: Utc::now(),
|
created_at: Utc::now(),
|
||||||
updated_at: Utc::now(),
|
updated_at: Utc::now(),
|
||||||
|
|
|
||||||
|
|
@ -2,8 +2,9 @@ pub mod auth;
|
||||||
pub mod batch_ingest;
|
pub mod batch_ingest;
|
||||||
pub mod config;
|
pub mod config;
|
||||||
pub mod db;
|
pub mod db;
|
||||||
pub mod enhanced_ocr; // Temporarily disabled due to compilation errors
|
pub mod enhanced_ocr;
|
||||||
pub mod file_service;
|
pub mod file_service;
|
||||||
|
pub mod migrations;
|
||||||
pub mod models;
|
pub mod models;
|
||||||
pub mod ocr;
|
pub mod ocr;
|
||||||
pub mod ocr_queue;
|
pub mod ocr_queue;
|
||||||
|
|
|
||||||
|
|
@ -43,6 +43,12 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||||
|
|
||||||
db.migrate().await?;
|
db.migrate().await?;
|
||||||
|
|
||||||
|
// Run automatic migrations
|
||||||
|
if let Err(e) = readur::migrations::run_startup_migrations(&config.database_url, "migrations").await {
|
||||||
|
error!("Failed to run migrations: {}", e);
|
||||||
|
return Err(e.into());
|
||||||
|
}
|
||||||
|
|
||||||
// Seed admin user
|
// Seed admin user
|
||||||
seed::seed_admin_user(&db).await?;
|
seed::seed_admin_user(&db).await?;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,229 @@
|
||||||
|
use anyhow::Result;
|
||||||
|
use sqlx::PgPool;
|
||||||
|
use tracing::{info, warn, error};
|
||||||
|
use std::fs;
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
pub struct MigrationRunner {
|
||||||
|
pool: PgPool,
|
||||||
|
migrations_dir: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct Migration {
|
||||||
|
pub version: i32,
|
||||||
|
pub name: String,
|
||||||
|
pub sql: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl MigrationRunner {
|
||||||
|
pub fn new(pool: PgPool, migrations_dir: String) -> Self {
|
||||||
|
Self {
|
||||||
|
pool,
|
||||||
|
migrations_dir,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Initialize the migrations table if it doesn't exist
|
||||||
|
pub async fn init(&self) -> Result<()> {
|
||||||
|
sqlx::query(
|
||||||
|
r#"
|
||||||
|
CREATE TABLE IF NOT EXISTS schema_migrations (
|
||||||
|
version INTEGER PRIMARY KEY,
|
||||||
|
name VARCHAR(255) NOT NULL,
|
||||||
|
applied_at TIMESTAMPTZ DEFAULT NOW()
|
||||||
|
);
|
||||||
|
"#
|
||||||
|
)
|
||||||
|
.execute(&self.pool)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
info!("Migration system initialized");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Load all migration files from the migrations directory
|
||||||
|
pub fn load_migrations(&self) -> Result<Vec<Migration>> {
|
||||||
|
let mut migrations = Vec::new();
|
||||||
|
let migrations_path = Path::new(&self.migrations_dir);
|
||||||
|
|
||||||
|
if !migrations_path.exists() {
|
||||||
|
warn!("Migrations directory not found: {}", self.migrations_dir);
|
||||||
|
return Ok(migrations);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut entries: Vec<_> = fs::read_dir(migrations_path)?
|
||||||
|
.filter_map(|entry| entry.ok())
|
||||||
|
.filter(|entry| {
|
||||||
|
entry.path().extension()
|
||||||
|
.and_then(|s| s.to_str())
|
||||||
|
.map(|s| s == "sql")
|
||||||
|
.unwrap_or(false)
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
// Sort by filename to ensure proper order
|
||||||
|
entries.sort_by(|a, b| a.file_name().cmp(&b.file_name()));
|
||||||
|
|
||||||
|
for entry in entries {
|
||||||
|
let filename = entry.file_name().to_string_lossy().to_string();
|
||||||
|
|
||||||
|
// Parse version from filename (e.g., "001_add_ocr_queue.sql" -> version 1)
|
||||||
|
if let Some(version_str) = filename.split('_').next() {
|
||||||
|
if let Ok(version) = version_str.parse::<i32>() {
|
||||||
|
let sql = fs::read_to_string(entry.path())?;
|
||||||
|
let name = filename.replace(".sql", "");
|
||||||
|
|
||||||
|
migrations.push(Migration {
|
||||||
|
version,
|
||||||
|
name,
|
||||||
|
sql,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
migrations.sort_by_key(|m| m.version);
|
||||||
|
Ok(migrations)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the list of applied migration versions
|
||||||
|
pub async fn get_applied_migrations(&self) -> Result<Vec<i32>> {
|
||||||
|
let rows = sqlx::query_scalar::<_, i32>("SELECT version FROM schema_migrations ORDER BY version")
|
||||||
|
.fetch_all(&self.pool)
|
||||||
|
.await?;
|
||||||
|
Ok(rows)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if a specific migration has been applied
|
||||||
|
pub async fn is_migration_applied(&self, version: i32) -> Result<bool> {
|
||||||
|
let count: i64 = sqlx::query_scalar(
|
||||||
|
"SELECT COUNT(*) FROM schema_migrations WHERE version = $1"
|
||||||
|
)
|
||||||
|
.bind(version)
|
||||||
|
.fetch_one(&self.pool)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
Ok(count > 0)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Apply a single migration
|
||||||
|
pub async fn apply_migration(&self, migration: &Migration) -> Result<()> {
|
||||||
|
info!("Applying migration {}: {}", migration.version, migration.name);
|
||||||
|
|
||||||
|
// Start a transaction
|
||||||
|
let mut tx = self.pool.begin().await?;
|
||||||
|
|
||||||
|
// Execute the migration SQL
|
||||||
|
sqlx::query(&migration.sql)
|
||||||
|
.execute(&mut *tx)
|
||||||
|
.await
|
||||||
|
.map_err(|e| {
|
||||||
|
error!("Failed to apply migration {}: {}", migration.version, e);
|
||||||
|
e
|
||||||
|
})?;
|
||||||
|
|
||||||
|
// Record the migration as applied
|
||||||
|
sqlx::query(
|
||||||
|
"INSERT INTO schema_migrations (version, name) VALUES ($1, $2)"
|
||||||
|
)
|
||||||
|
.bind(migration.version)
|
||||||
|
.bind(&migration.name)
|
||||||
|
.execute(&mut *tx)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
// Commit the transaction
|
||||||
|
tx.commit().await?;
|
||||||
|
|
||||||
|
info!("Successfully applied migration {}: {}", migration.version, migration.name);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Run all pending migrations
|
||||||
|
pub async fn run_migrations(&self) -> Result<()> {
|
||||||
|
// Initialize migration system
|
||||||
|
self.init().await?;
|
||||||
|
|
||||||
|
// Load all migrations
|
||||||
|
let migrations = self.load_migrations()?;
|
||||||
|
if migrations.is_empty() {
|
||||||
|
info!("No migrations found");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get applied migrations
|
||||||
|
let applied = self.get_applied_migrations().await?;
|
||||||
|
|
||||||
|
// Find pending migrations
|
||||||
|
let pending: Vec<&Migration> = migrations
|
||||||
|
.iter()
|
||||||
|
.filter(|m| !applied.contains(&m.version))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
if pending.is_empty() {
|
||||||
|
info!("All migrations are up to date");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
info!("Found {} pending migrations", pending.len());
|
||||||
|
|
||||||
|
// Apply each pending migration
|
||||||
|
for migration in pending {
|
||||||
|
self.apply_migration(migration).await?;
|
||||||
|
}
|
||||||
|
|
||||||
|
info!("All migrations completed successfully");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get migration status summary
|
||||||
|
pub async fn get_status(&self) -> Result<MigrationStatus> {
|
||||||
|
self.init().await?;
|
||||||
|
|
||||||
|
let migrations = self.load_migrations()?;
|
||||||
|
let applied = self.get_applied_migrations().await?;
|
||||||
|
|
||||||
|
let pending_count = migrations
|
||||||
|
.iter()
|
||||||
|
.filter(|m| !applied.contains(&m.version))
|
||||||
|
.count();
|
||||||
|
|
||||||
|
Ok(MigrationStatus {
|
||||||
|
total_migrations: migrations.len(),
|
||||||
|
applied_migrations: applied.len(),
|
||||||
|
pending_migrations: pending_count,
|
||||||
|
latest_version: migrations.last().map(|m| m.version),
|
||||||
|
current_version: applied.last().copied(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Snapshot of how the migrations on disk compare with those recorded in the
/// database.
#[derive(Debug)]
pub struct MigrationStatus {
    /// Number of migrations loaded from disk.
    pub total_migrations: usize,
    /// Number of versions recorded as applied.
    pub applied_migrations: usize,
    /// Number of loaded migrations that are not yet recorded as applied.
    pub pending_migrations: usize,
    /// Version of the last loaded migration, if any.
    pub latest_version: Option<i32>,
    /// Last recorded applied version, if any.
    pub current_version: Option<i32>,
}

impl MigrationStatus {
    /// `true` when nothing is left to apply.
    pub fn is_up_to_date(&self) -> bool {
        !self.needs_migration()
    }

    /// `true` when at least one migration is still pending.
    pub fn needs_migration(&self) -> bool {
        self.pending_migrations != 0
    }
}
|
||||||
|
|
||||||
|
/// Convenience function to run migrations at startup
|
||||||
|
pub async fn run_startup_migrations(database_url: &str, migrations_dir: &str) -> Result<()> {
|
||||||
|
let pool = sqlx::PgPool::connect(database_url).await?;
|
||||||
|
let runner = MigrationRunner::new(pool, migrations_dir.to_string());
|
||||||
|
|
||||||
|
info!("Running database migrations...");
|
||||||
|
runner.run_migrations().await?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
@ -50,6 +50,10 @@ pub struct Document {
|
||||||
pub mime_type: String,
|
pub mime_type: String,
|
||||||
pub content: Option<String>,
|
pub content: Option<String>,
|
||||||
pub ocr_text: Option<String>,
|
pub ocr_text: Option<String>,
|
||||||
|
pub ocr_confidence: Option<f32>,
|
||||||
|
pub ocr_word_count: Option<i32>,
|
||||||
|
pub ocr_processing_time_ms: Option<i32>,
|
||||||
|
pub ocr_status: Option<String>,
|
||||||
pub tags: Vec<String>,
|
pub tags: Vec<String>,
|
||||||
pub created_at: DateTime<Utc>,
|
pub created_at: DateTime<Utc>,
|
||||||
pub updated_at: DateTime<Utc>,
|
pub updated_at: DateTime<Utc>,
|
||||||
|
|
@ -66,6 +70,10 @@ pub struct DocumentResponse {
|
||||||
pub tags: Vec<String>,
|
pub tags: Vec<String>,
|
||||||
pub created_at: DateTime<Utc>,
|
pub created_at: DateTime<Utc>,
|
||||||
pub has_ocr_text: bool,
|
pub has_ocr_text: bool,
|
||||||
|
pub ocr_confidence: Option<f32>,
|
||||||
|
pub ocr_word_count: Option<i32>,
|
||||||
|
pub ocr_processing_time_ms: Option<i32>,
|
||||||
|
pub ocr_status: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize, ToSchema, IntoParams)]
|
#[derive(Debug, Serialize, Deserialize, ToSchema, IntoParams)]
|
||||||
|
|
@ -122,6 +130,10 @@ pub struct EnhancedDocumentResponse {
|
||||||
pub tags: Vec<String>,
|
pub tags: Vec<String>,
|
||||||
pub created_at: DateTime<Utc>,
|
pub created_at: DateTime<Utc>,
|
||||||
pub has_ocr_text: bool,
|
pub has_ocr_text: bool,
|
||||||
|
pub ocr_confidence: Option<f32>,
|
||||||
|
pub ocr_word_count: Option<i32>,
|
||||||
|
pub ocr_processing_time_ms: Option<i32>,
|
||||||
|
pub ocr_status: Option<String>,
|
||||||
pub search_rank: Option<f32>,
|
pub search_rank: Option<f32>,
|
||||||
pub snippets: Vec<SearchSnippet>,
|
pub snippets: Vec<SearchSnippet>,
|
||||||
}
|
}
|
||||||
|
|
@ -145,6 +157,10 @@ impl From<Document> for DocumentResponse {
|
||||||
tags: doc.tags,
|
tags: doc.tags,
|
||||||
created_at: doc.created_at,
|
created_at: doc.created_at,
|
||||||
has_ocr_text: doc.ocr_text.is_some(),
|
has_ocr_text: doc.ocr_text.is_some(),
|
||||||
|
ocr_confidence: doc.ocr_confidence,
|
||||||
|
ocr_word_count: doc.ocr_word_count,
|
||||||
|
ocr_processing_time_ms: doc.ocr_processing_time_ms,
|
||||||
|
ocr_status: doc.ocr_status,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -55,6 +55,10 @@ async fn search_documents(
|
||||||
tags: doc.tags,
|
tags: doc.tags,
|
||||||
created_at: doc.created_at,
|
created_at: doc.created_at,
|
||||||
has_ocr_text: doc.ocr_text.is_some(),
|
has_ocr_text: doc.ocr_text.is_some(),
|
||||||
|
ocr_confidence: doc.ocr_confidence,
|
||||||
|
ocr_word_count: doc.ocr_word_count,
|
||||||
|
ocr_processing_time_ms: doc.ocr_processing_time_ms,
|
||||||
|
ocr_status: doc.ocr_status,
|
||||||
search_rank: None,
|
search_rank: None,
|
||||||
snippets: Vec::new(),
|
snippets: Vec::new(),
|
||||||
}).collect(),
|
}).collect(),
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue