Readur/src/db/documents/operations.rs

274 lines
11 KiB
Rust

use anyhow::Result;
use sqlx::{QueryBuilder, Postgres, Row};
use uuid::Uuid;
use crate::models::{Document, UserRole, FailedDocument};
use super::helpers::{map_row_to_document, apply_role_based_filter, DOCUMENT_FIELDS};
use crate::db::Database;
impl Database {
/// Removes one document, subject to the caller's role.
///
/// Builds `DELETE FROM documents WHERE id = <document_id>` and then hands the
/// builder to `apply_role_based_filter` together with `user_id` and
/// `user_role`. That helper lives in `super::helpers`; presumably it narrows
/// the statement for non-privileged roles (confirm against the helper).
///
/// # Returns
/// `Ok(true)` when the statement affected at least one row, `Ok(false)` when
/// it affected none.
///
/// # Errors
/// Propagates any error returned while executing the statement.
pub async fn delete_document(&self, document_id: Uuid, user_id: Uuid, user_role: UserRole) -> Result<bool> {
    let mut builder = QueryBuilder::<Postgres>::new("DELETE FROM documents WHERE id = ");
    builder.push_bind(document_id);
    apply_role_based_filter(&mut builder, user_id, user_role);

    let outcome = builder.build().execute(&self.pool).await?;
    let removed_any = outcome.rows_affected() > 0;
    Ok(removed_any)
}
/// Bulk deletes multiple documents with role-based access control
pub async fn bulk_delete_documents(&self, document_ids: &[Uuid], user_id: Uuid, user_role: UserRole) -> Result<(Vec<Uuid>, Vec<Uuid>)> {
if document_ids.is_empty() {
return Ok((Vec::new(), Vec::new()));
}
let mut tx = self.pool.begin().await?;
let mut deleted_ids = Vec::new();
let mut failed_ids = Vec::new();
for &doc_id in document_ids {
let mut query = QueryBuilder::<Postgres>::new("DELETE FROM documents WHERE id = ");
query.push_bind(doc_id);
apply_role_based_filter(&mut query, user_id, user_role);
query.push(" RETURNING id");
match query.build().fetch_optional(&mut *tx).await {
Ok(Some(row)) => {
let deleted_id: Uuid = row.get("id");
deleted_ids.push(deleted_id);
}
Ok(None) => {
failed_ids.push(doc_id);
}
Err(_) => {
failed_ids.push(doc_id);
}
}
}
tx.commit().await?;
Ok((deleted_ids, failed_ids))
}
/// Lists documents whose OCR confidence is at or below `max_confidence`.
///
/// Rows with a NULL `ocr_confidence` are excluded. The builder is passed to
/// `apply_role_based_filter` with `user_id` and `user_role` before ordering
/// is added. Results are ordered by `ocr_confidence` ascending, then by
/// `created_at` descending, and paged with `limit` / `offset`.
///
/// # Errors
/// Propagates any error returned while executing the query.
pub async fn find_documents_by_confidence_threshold(&self, user_id: Uuid, user_role: UserRole, max_confidence: f32, limit: i64, offset: i64) -> Result<Vec<Document>> {
    let mut sql = QueryBuilder::<Postgres>::new("SELECT ");
    sql.push(DOCUMENT_FIELDS)
        .push(" FROM documents WHERE ocr_confidence IS NOT NULL AND ocr_confidence <= ")
        .push_bind(max_confidence);
    apply_role_based_filter(&mut sql, user_id, user_role);
    sql.push(" ORDER BY ocr_confidence ASC, created_at DESC")
        .push(" LIMIT ")
        .push_bind(limit)
        .push(" OFFSET ")
        .push_bind(offset);

    let fetched = sql.build().fetch_all(&self.pool).await?;
    let documents: Vec<Document> = fetched.iter().map(map_row_to_document).collect();
    Ok(documents)
}
/// Finds documents with failed OCR processing
pub async fn find_failed_ocr_documents(&self, user_id: Uuid, user_role: UserRole, limit: i64, offset: i64) -> Result<Vec<Document>> {
let mut query = QueryBuilder::<Postgres>::new("SELECT ");
query.push(DOCUMENT_FIELDS);
query.push(" FROM documents WHERE ocr_status = 'failed'");
apply_role_based_filter(&mut query, user_id, user_role);
query.push(" ORDER BY created_at DESC");
query.push(" LIMIT ");
query.push_bind(limit);
query.push(" OFFSET ");
query.push_bind(offset);
let rows = query.build().fetch_all(&self.pool).await?;
Ok(rows.iter().map(map_row_to_document).collect())
}
/// Lists documents that either failed OCR or have a low OCR confidence.
///
/// A row matches when `ocr_status = 'failed'`, or when `ocr_confidence` is
/// non-NULL and at or below `max_confidence`. The builder is passed to
/// `apply_role_based_filter` with `user_id` and `user_role`.
///
/// Ordering: failed documents first (via the `CASE` expression), then
/// `ocr_confidence` ascending, then `created_at` descending. Paged with
/// `limit` / `offset`.
///
/// # Errors
/// Propagates any error returned while executing the query.
pub async fn find_low_confidence_and_failed_documents(&self, user_id: Uuid, user_role: UserRole, max_confidence: f32, limit: i64, offset: i64) -> Result<Vec<Document>> {
    let mut sql = QueryBuilder::<Postgres>::new("SELECT ");
    sql.push(DOCUMENT_FIELDS)
        .push(" FROM documents WHERE (ocr_status = 'failed' OR (ocr_confidence IS NOT NULL AND ocr_confidence <= ")
        .push_bind(max_confidence)
        .push("))");
    apply_role_based_filter(&mut sql, user_id, user_role);
    sql.push(" ORDER BY CASE WHEN ocr_status = 'failed' THEN 0 ELSE 1 END, ocr_confidence ASC, created_at DESC")
        .push(" LIMIT ")
        .push_bind(limit)
        .push(" OFFSET ")
        .push_bind(offset);

    let documents: Vec<Document> = sql
        .build()
        .fetch_all(&self.pool)
        .await?
        .iter()
        .map(map_row_to_document)
        .collect();
    Ok(documents)
}
/// Creates a failed document record.
///
/// Inserts every field of `failed_document` into `failed_documents` and
/// rebuilds a `FailedDocument` from the row returned by `RETURNING *`, so the
/// result reflects what the database actually stored.
///
/// # Errors
/// Propagates any error from executing the insert. Column lookups use
/// `try_get`, so a missing column or a type mismatch in the returned row is
/// reported as an error instead of panicking.
pub async fn create_failed_document(&self, failed_document: FailedDocument) -> Result<FailedDocument> {
    let row = sqlx::query(
        r#"
INSERT INTO failed_documents (
id, user_id, filename, original_filename, original_path, file_path,
file_size, file_hash, mime_type, content, tags, ocr_text, ocr_confidence,
ocr_word_count, ocr_processing_time_ms, failure_reason, failure_stage,
existing_document_id, ingestion_source, error_message, retry_count,
last_retry_at, created_at, updated_at
)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24)
RETURNING *
"#
    )
    // Bind order must match the column list above ($1..$24).
    .bind(failed_document.id)
    .bind(failed_document.user_id)
    .bind(&failed_document.filename)
    .bind(&failed_document.original_filename)
    .bind(&failed_document.original_path)
    .bind(&failed_document.file_path)
    .bind(failed_document.file_size)
    .bind(&failed_document.file_hash)
    .bind(&failed_document.mime_type)
    .bind(&failed_document.content)
    .bind(&failed_document.tags)
    .bind(&failed_document.ocr_text)
    .bind(failed_document.ocr_confidence)
    .bind(failed_document.ocr_word_count)
    .bind(failed_document.ocr_processing_time_ms)
    .bind(&failed_document.failure_reason)
    .bind(&failed_document.failure_stage)
    .bind(failed_document.existing_document_id)
    .bind(&failed_document.ingestion_source)
    .bind(&failed_document.error_message)
    .bind(failed_document.retry_count)
    .bind(failed_document.last_retry_at)
    .bind(failed_document.created_at)
    .bind(failed_document.updated_at)
    .fetch_one(&self.pool)
    .await?;

    Ok(FailedDocument {
        id: row.try_get("id")?,
        user_id: row.try_get("user_id")?,
        filename: row.try_get("filename")?,
        original_filename: row.try_get("original_filename")?,
        original_path: row.try_get("original_path")?,
        file_path: row.try_get("file_path")?,
        file_size: row.try_get("file_size")?,
        file_hash: row.try_get("file_hash")?,
        mime_type: row.try_get("mime_type")?,
        content: row.try_get("content")?,
        tags: row.try_get("tags")?,
        ocr_text: row.try_get("ocr_text")?,
        ocr_confidence: row.try_get("ocr_confidence")?,
        ocr_word_count: row.try_get("ocr_word_count")?,
        ocr_processing_time_ms: row.try_get("ocr_processing_time_ms")?,
        failure_reason: row.try_get("failure_reason")?,
        failure_stage: row.try_get("failure_stage")?,
        existing_document_id: row.try_get("existing_document_id")?,
        ingestion_source: row.try_get("ingestion_source")?,
        error_message: row.try_get("error_message")?,
        retry_count: row.try_get("retry_count")?,
        last_retry_at: row.try_get("last_retry_at")?,
        created_at: row.try_get("created_at")?,
        updated_at: row.try_get("updated_at")?,
    })
}
/// Creates a failed document record from an existing document.
///
/// Copies the file and OCR fields of `document` into a new `FailedDocument`
/// with a freshly generated v4 id, then stores it via
/// [`Self::create_failed_document`].
///
/// Fixed values set here:
/// - `original_path` and `file_path` are both taken from `document.file_path`.
/// - `existing_document_id` points back at `document.id`.
/// - `ingestion_source` is `"document_processing"`.
/// - `retry_count` starts at `Some(0)` and `last_retry_at` at `None`.
/// - `created_at` and `updated_at` share one timestamp, so a new record
///   never looks as if it was modified after creation.
///
/// # Errors
/// Propagates any error from [`Self::create_failed_document`].
pub async fn create_failed_document_from_document(&self, document: &Document, failure_reason: &str, failure_stage: &str, error_message: Option<&str>) -> Result<FailedDocument> {
    // Read the clock once so both timestamps are identical.
    let now = chrono::Utc::now();

    let failed_doc = FailedDocument {
        id: Uuid::new_v4(),
        user_id: document.user_id,
        filename: document.filename.clone(),
        original_filename: Some(document.original_filename.clone()),
        original_path: Some(document.file_path.clone()),
        file_path: Some(document.file_path.clone()),
        file_size: Some(document.file_size),
        file_hash: document.file_hash.clone(),
        mime_type: Some(document.mime_type.clone()),
        content: document.content.clone(),
        tags: document.tags.clone(),
        ocr_text: document.ocr_text.clone(),
        ocr_confidence: document.ocr_confidence,
        ocr_word_count: document.ocr_word_count,
        ocr_processing_time_ms: document.ocr_processing_time_ms,
        failure_reason: failure_reason.to_string(),
        failure_stage: failure_stage.to_string(),
        existing_document_id: Some(document.id),
        ingestion_source: "document_processing".to_string(),
        error_message: error_message.map(|s| s.to_string()),
        retry_count: Some(0),
        last_retry_at: None,
        created_at: now,
        updated_at: now,
    };

    self.create_failed_document(failed_doc).await
}
/// Records OCR retry bookkeeping on a document.
///
/// Sets `ocr_retry_count` to `retry_count`, `ocr_failure_reason` to
/// `failure_reason` (NULL when `None`), and refreshes `updated_at` for the
/// row whose id equals `document_id`.
///
/// Returns `Ok(())` whether or not a row matched; the affected-row count is
/// not inspected.
///
/// # Errors
/// Propagates any error returned while executing the statement.
pub async fn update_document_ocr_retry(&self, document_id: Uuid, retry_count: i32, failure_reason: Option<&str>) -> Result<()> {
    let statement = r#"
UPDATE documents
SET ocr_retry_count = $2, ocr_failure_reason = $3, updated_at = NOW()
WHERE id = $1
"#;

    sqlx::query(statement)
        .bind(document_id)
        .bind(retry_count)
        .bind(failure_reason)
        .execute(&self.pool)
        .await?;

    Ok(())
}
/// Flags the given documents as having finished OCR.
///
/// For every row whose id is in `document_ids`, sets `ocr_status` to
/// `'completed'` and stamps both `ocr_completed_at` and `updated_at` with
/// `NOW()`. No role-based filter is applied in this method.
///
/// # Returns
/// The number of rows updated. An empty `document_ids` returns `Ok(0)`
/// without issuing a query.
///
/// # Errors
/// Propagates any error returned while executing the statement.
pub async fn mark_documents_ocr_completed(&self, document_ids: &[Uuid]) -> Result<u64> {
    if document_ids.is_empty() {
        return Ok(0);
    }

    let statement = r#"
UPDATE documents
SET ocr_status = 'completed', ocr_completed_at = NOW(), updated_at = NOW()
WHERE id = ANY($1)
"#;

    let updated = sqlx::query(statement)
        .bind(document_ids)
        .execute(&self.pool)
        .await?
        .rows_affected();

    Ok(updated)
}
/// Tallies documents by OCR status in a single query.
///
/// The builder is passed to `apply_role_based_filter` with `user_id` and
/// `user_role`; the `WHERE 1=1` gives that helper a clause to extend.
///
/// # Returns
/// `(total, pending, completed, failed)` where:
/// - `total` counts every selected row,
/// - `pending` counts rows whose `ocr_status` is NULL or `'pending'`,
/// - `completed` counts rows with `ocr_status = 'completed'`,
/// - `failed` counts rows with `ocr_status = 'failed'`.
///
/// # Errors
/// Propagates any error returned while executing the query.
pub async fn count_documents_by_ocr_status(&self, user_id: Uuid, user_role: UserRole) -> Result<(i64, i64, i64, i64)> {
    let mut sql = QueryBuilder::<Postgres>::new(
        r#"
SELECT
COUNT(*) as total,
COUNT(CASE WHEN ocr_status IS NULL OR ocr_status = 'pending' THEN 1 END) as pending,
COUNT(CASE WHEN ocr_status = 'completed' THEN 1 END) as completed,
COUNT(CASE WHEN ocr_status = 'failed' THEN 1 END) as failed
FROM documents WHERE 1=1
"#
    );
    apply_role_based_filter(&mut sql, user_id, user_role);

    let counts = sql.build().fetch_one(&self.pool).await?;

    let total: i64 = counts.get("total");
    let pending: i64 = counts.get("pending");
    let completed: i64 = counts.get("completed");
    let failed: i64 = counts.get("failed");

    Ok((total, pending, completed, failed))
}
}