diff --git a/src/models.rs b/src/models.rs deleted file mode 100644 index 06480af..0000000 --- a/src/models.rs +++ /dev/null @@ -1,1396 +0,0 @@ -use chrono::{DateTime, Utc}; -use serde::{Deserialize, Serialize}; -use sqlx::FromRow; -use uuid::Uuid; -use utoipa::{ToSchema, IntoParams}; -use serde_json; - -#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, ToSchema)] -pub enum UserRole { - #[serde(rename = "admin")] - Admin, - #[serde(rename = "user")] - User, -} - -#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, ToSchema)] -pub enum AuthProvider { - #[serde(rename = "local")] - Local, - #[serde(rename = "oidc")] - Oidc, -} - -impl std::fmt::Display for UserRole { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - UserRole::Admin => write!(f, "admin"), - UserRole::User => write!(f, "user"), - } - } -} - -impl TryFrom for UserRole { - type Error = String; - - fn try_from(value: String) -> Result { - match value.as_str() { - "admin" => Ok(UserRole::Admin), - "user" => Ok(UserRole::User), - _ => Err(format!("Invalid user role: {}", value)), - } - } -} - -impl std::fmt::Display for AuthProvider { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - AuthProvider::Local => write!(f, "local"), - AuthProvider::Oidc => write!(f, "oidc"), - } - } -} - -impl TryFrom for AuthProvider { - type Error = String; - - fn try_from(value: String) -> Result { - match value.as_str() { - "local" => Ok(AuthProvider::Local), - "oidc" => Ok(AuthProvider::Oidc), - _ => Err(format!("Invalid auth provider: {}", value)), - } - } -} - -#[derive(Debug, Clone, Serialize, Deserialize, FromRow, ToSchema)] -pub struct User { - pub id: Uuid, - pub username: String, - pub email: String, - pub password_hash: Option, - #[sqlx(try_from = "String")] - pub role: UserRole, - pub created_at: DateTime, - pub updated_at: DateTime, - pub oidc_subject: Option, - pub oidc_issuer: Option, - pub oidc_email: Option, - 
#[sqlx(try_from = "String")] - pub auth_provider: AuthProvider, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct CreateUser { - pub username: String, - pub email: String, - pub password: String, - #[serde(default = "default_user_role")] - pub role: Option, -} - -fn default_user_role() -> Option { - Some(UserRole::User) -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct LoginRequest { - pub username: String, - pub password: String, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct LoginResponse { - pub token: String, - pub user: UserResponse, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct UserResponse { - pub id: Uuid, - pub username: String, - pub email: String, - pub role: UserRole, -} - -#[derive(Debug, Clone, Serialize, Deserialize, FromRow)] -pub struct Document { - pub id: Uuid, - pub filename: String, - pub original_filename: String, - pub file_path: String, - pub file_size: i64, - pub mime_type: String, - pub content: Option, - pub ocr_text: Option, - pub ocr_confidence: Option, - pub ocr_word_count: Option, - pub ocr_processing_time_ms: Option, - pub ocr_status: Option, - pub ocr_error: Option, - pub ocr_completed_at: Option>, - pub ocr_retry_count: Option, - pub ocr_failure_reason: Option, - pub tags: Vec, - pub created_at: DateTime, - pub updated_at: DateTime, - pub user_id: Uuid, - pub file_hash: Option, - /// Original file creation timestamp from source system - pub original_created_at: Option>, - /// Original file modification timestamp from source system - pub original_modified_at: Option>, - /// Additional metadata from source system (permissions, attributes, EXIF data, etc.) 
- pub source_metadata: Option, -} - -#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, ToSchema)] -pub enum FailureReason { - #[serde(rename = "duplicate_content")] - DuplicateContent, - #[serde(rename = "duplicate_filename")] - DuplicateFilename, - #[serde(rename = "unsupported_format")] - UnsupportedFormat, - #[serde(rename = "file_too_large")] - FileTooLarge, - #[serde(rename = "file_corrupted")] - FileCorrupted, - #[serde(rename = "access_denied")] - AccessDenied, - #[serde(rename = "low_ocr_confidence")] - LowOcrConfidence, - #[serde(rename = "ocr_timeout")] - OcrTimeout, - #[serde(rename = "ocr_memory_limit")] - OcrMemoryLimit, - #[serde(rename = "pdf_parsing_error")] - PdfParsingError, - #[serde(rename = "storage_quota_exceeded")] - StorageQuotaExceeded, - #[serde(rename = "network_error")] - NetworkError, - #[serde(rename = "permission_denied")] - PermissionDenied, - #[serde(rename = "virus_detected")] - VirusDetected, - #[serde(rename = "invalid_structure")] - InvalidStructure, - #[serde(rename = "policy_violation")] - PolicyViolation, - #[serde(rename = "other")] - Other, -} - -#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, ToSchema)] -pub enum FailureStage { - #[serde(rename = "ingestion")] - Ingestion, - #[serde(rename = "validation")] - Validation, - #[serde(rename = "ocr")] - Ocr, - #[serde(rename = "storage")] - Storage, - #[serde(rename = "processing")] - Processing, - #[serde(rename = "sync")] - Sync, -} - -impl std::fmt::Display for FailureReason { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - FailureReason::DuplicateContent => write!(f, "duplicate_content"), - FailureReason::DuplicateFilename => write!(f, "duplicate_filename"), - FailureReason::UnsupportedFormat => write!(f, "unsupported_format"), - FailureReason::FileTooLarge => write!(f, "file_too_large"), - FailureReason::FileCorrupted => write!(f, "file_corrupted"), - FailureReason::AccessDenied => write!(f, 
"access_denied"), - FailureReason::LowOcrConfidence => write!(f, "low_ocr_confidence"), - FailureReason::OcrTimeout => write!(f, "ocr_timeout"), - FailureReason::OcrMemoryLimit => write!(f, "ocr_memory_limit"), - FailureReason::PdfParsingError => write!(f, "pdf_parsing_error"), - FailureReason::StorageQuotaExceeded => write!(f, "storage_quota_exceeded"), - FailureReason::NetworkError => write!(f, "network_error"), - FailureReason::PermissionDenied => write!(f, "permission_denied"), - FailureReason::VirusDetected => write!(f, "virus_detected"), - FailureReason::InvalidStructure => write!(f, "invalid_structure"), - FailureReason::PolicyViolation => write!(f, "policy_violation"), - FailureReason::Other => write!(f, "other"), - } - } -} - -impl std::fmt::Display for FailureStage { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - FailureStage::Ingestion => write!(f, "ingestion"), - FailureStage::Validation => write!(f, "validation"), - FailureStage::Ocr => write!(f, "ocr"), - FailureStage::Storage => write!(f, "storage"), - FailureStage::Processing => write!(f, "processing"), - FailureStage::Sync => write!(f, "sync"), - } - } -} - -#[derive(Debug, Clone, Serialize, Deserialize, FromRow, ToSchema)] -pub struct FailedDocument { - /// Unique identifier for the failed document record - pub id: Uuid, - /// User who attempted to ingest the document - pub user_id: Uuid, - /// Filename of the failed document - pub filename: String, - /// Original filename when uploaded - pub original_filename: Option, - /// Original path where the file was located - pub original_path: Option, - /// Stored file path (if file was saved before failure) - pub file_path: Option, - /// Size of the file in bytes - pub file_size: Option, - /// SHA256 hash of the file content - pub file_hash: Option, - /// MIME type of the file - pub mime_type: Option, - /// Raw content if extracted before failure - pub content: Option, - /// Tags that were assigned/detected - pub 
tags: Vec, - /// Partial OCR text if extracted before failure - pub ocr_text: Option, - /// OCR confidence if calculated - pub ocr_confidence: Option, - /// Word count if calculated - pub ocr_word_count: Option, - /// Processing time before failure in milliseconds - pub ocr_processing_time_ms: Option, - /// Reason why the document failed - pub failure_reason: String, - /// Stage at which the document failed - pub failure_stage: String, - /// Reference to existing document if failed due to duplicate - pub existing_document_id: Option, - /// Source of the ingestion attempt - pub ingestion_source: String, - /// Detailed error message - pub error_message: Option, - /// Number of retry attempts - pub retry_count: Option, - /// Last retry timestamp - pub last_retry_at: Option>, - /// When the document failed - pub created_at: DateTime, - /// Last update timestamp - pub updated_at: DateTime, -} - -#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] -pub struct DocumentResponse { - /// Unique identifier for the document - pub id: Uuid, - /// Current filename in the system - pub filename: String, - /// Original filename when uploaded - pub original_filename: String, - /// File size in bytes - pub file_size: i64, - /// MIME type of the file - pub mime_type: String, - /// Tags associated with the document - pub tags: Vec, - /// Labels associated with the document - #[serde(default)] - pub labels: Vec, - /// When the document was created - pub created_at: DateTime, - /// Whether OCR text has been extracted - pub has_ocr_text: bool, - /// OCR confidence score (0-100, higher is better) - pub ocr_confidence: Option, - /// Number of words detected by OCR - pub ocr_word_count: Option, - /// Time taken for OCR processing in milliseconds - pub ocr_processing_time_ms: Option, - /// Current status of OCR processing (pending, processing, completed, failed) - pub ocr_status: Option, - /// Original file creation timestamp from source system - #[serde(skip_serializing_if = 
"Option::is_none", default)] - pub original_created_at: Option>, - /// Original file modification timestamp from source system - #[serde(skip_serializing_if = "Option::is_none", default)] - pub original_modified_at: Option>, - /// Additional metadata from source system (permissions, attributes, etc.) - #[serde(skip_serializing_if = "Option::is_none", default)] - pub source_metadata: Option, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema, IntoParams)] -pub struct SearchRequest { - /// Search query text (searches both document content and OCR-extracted text) - pub query: String, - /// Filter by specific tags - pub tags: Option>, - /// Filter by MIME types (e.g., "application/pdf", "image/png") - pub mime_types: Option>, - /// Maximum number of results to return (default: 25) - pub limit: Option, - /// Number of results to skip for pagination (default: 0) - pub offset: Option, - /// Whether to include text snippets with search matches (default: true) - pub include_snippets: Option, - /// Length of text snippets in characters (default: 200) - pub snippet_length: Option, - /// Search algorithm to use (default: simple) - pub search_mode: Option, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub enum SearchMode { - /// Simple text search with basic word matching - #[serde(rename = "simple")] - Simple, - /// Exact phrase matching - #[serde(rename = "phrase")] - Phrase, - /// Fuzzy search using similarity matching (good for typos and partial matches) - #[serde(rename = "fuzzy")] - Fuzzy, - /// Boolean search with AND, OR, NOT operators - #[serde(rename = "boolean")] - Boolean, -} - -impl Default for SearchMode { - fn default() -> Self { - SearchMode::Simple - } -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct SearchSnippet { - /// The snippet text content - pub text: String, - /// Starting character position in the original document - pub start_offset: i32, - /// Ending character position in the original document - pub end_offset: 
i32, - /// Ranges within the snippet that should be highlighted - pub highlight_ranges: Vec, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct HighlightRange { - /// Start position of highlight within the snippet - pub start: i32, - /// End position of highlight within the snippet - pub end: i32, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct EnhancedDocumentResponse { - /// Unique identifier for the document - pub id: Uuid, - /// Current filename in the system - pub filename: String, - /// Original filename when uploaded - pub original_filename: String, - /// File size in bytes - pub file_size: i64, - /// MIME type of the file - pub mime_type: String, - /// Tags associated with the document - pub tags: Vec, - /// When the document was created - pub created_at: DateTime, - /// Whether OCR text has been extracted - pub has_ocr_text: bool, - /// OCR confidence score (0-100, higher is better) - pub ocr_confidence: Option, - /// Number of words detected by OCR - pub ocr_word_count: Option, - /// Time taken for OCR processing in milliseconds - pub ocr_processing_time_ms: Option, - /// Current status of OCR processing (pending, processing, completed, failed) - pub ocr_status: Option, - /// Search relevance score (0-1, higher is more relevant) - pub search_rank: Option, - /// Text snippets showing search matches with highlights - pub snippets: Vec, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct SearchResponse { - /// List of matching documents with enhanced metadata and snippets - pub documents: Vec, - /// Total number of documents matching the search criteria - pub total: i64, - /// Time taken to execute the search in milliseconds - pub query_time_ms: u64, - /// Search suggestions for query improvement - pub suggestions: Vec, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct FacetItem { - /// The facet value (e.g., mime type or tag) - pub value: String, - /// Number of documents with this 
value - pub count: i64, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct SearchFacetsResponse { - /// MIME type facets with counts - pub mime_types: Vec, - /// Tag facets with counts - pub tags: Vec, -} - -impl From for DocumentResponse { - fn from(doc: Document) -> Self { - Self { - id: doc.id, - filename: doc.filename, - original_filename: doc.original_filename, - file_size: doc.file_size, - mime_type: doc.mime_type, - tags: doc.tags, - labels: Vec::new(), // Labels will be populated separately where needed - created_at: doc.created_at, - has_ocr_text: doc.ocr_text.is_some(), - ocr_confidence: doc.ocr_confidence, - ocr_word_count: doc.ocr_word_count, - ocr_processing_time_ms: doc.ocr_processing_time_ms, - ocr_status: doc.ocr_status, - original_created_at: doc.original_created_at, - original_modified_at: doc.original_modified_at, - source_metadata: doc.source_metadata, - } - } -} - -impl From for UserResponse { - fn from(user: User) -> Self { - Self { - id: user.id, - username: user.username, - email: user.email, - role: user.role, - } - } -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct UpdateUser { - pub username: Option, - pub email: Option, - pub password: Option, -} - -#[derive(Debug, Clone, Serialize, Deserialize, FromRow, ToSchema)] -pub struct Settings { - pub id: Uuid, - pub user_id: Uuid, - pub ocr_language: String, - pub concurrent_ocr_jobs: i32, - pub ocr_timeout_seconds: i32, - pub max_file_size_mb: i32, - pub allowed_file_types: Vec, - pub auto_rotate_images: bool, - pub enable_image_preprocessing: bool, - pub search_results_per_page: i32, - pub search_snippet_length: i32, - pub fuzzy_search_threshold: f32, - pub retention_days: Option, - pub enable_auto_cleanup: bool, - pub enable_compression: bool, - pub memory_limit_mb: i32, - pub cpu_priority: String, - pub enable_background_ocr: bool, - pub ocr_page_segmentation_mode: i32, - pub ocr_engine_mode: i32, - pub ocr_min_confidence: f32, - pub ocr_dpi: i32, - pub 
ocr_enhance_contrast: bool, - pub ocr_remove_noise: bool, - pub ocr_detect_orientation: bool, - pub ocr_whitelist_chars: Option, - pub ocr_blacklist_chars: Option, - pub ocr_brightness_boost: f32, - pub ocr_contrast_multiplier: f32, - pub ocr_noise_reduction_level: i32, - pub ocr_sharpening_strength: f32, - pub ocr_morphological_operations: bool, - pub ocr_adaptive_threshold_window_size: i32, - pub ocr_histogram_equalization: bool, - pub ocr_upscale_factor: f32, - pub ocr_max_image_width: i32, - pub ocr_max_image_height: i32, - pub save_processed_images: bool, - pub ocr_quality_threshold_brightness: f32, - pub ocr_quality_threshold_contrast: f32, - pub ocr_quality_threshold_noise: f32, - pub ocr_quality_threshold_sharpness: f32, - pub ocr_skip_enhancement: bool, - pub webdav_enabled: bool, - pub webdav_server_url: Option, - pub webdav_username: Option, - pub webdav_password: Option, - pub webdav_watch_folders: Vec, - pub webdav_file_extensions: Vec, - pub webdav_auto_sync: bool, - pub webdav_sync_interval_minutes: i32, - pub created_at: DateTime, - pub updated_at: DateTime, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct SettingsResponse { - pub ocr_language: String, - pub concurrent_ocr_jobs: i32, - pub ocr_timeout_seconds: i32, - pub max_file_size_mb: i32, - pub allowed_file_types: Vec, - pub auto_rotate_images: bool, - pub enable_image_preprocessing: bool, - pub search_results_per_page: i32, - pub search_snippet_length: i32, - pub fuzzy_search_threshold: f32, - pub retention_days: Option, - pub enable_auto_cleanup: bool, - pub enable_compression: bool, - pub memory_limit_mb: i32, - pub cpu_priority: String, - pub enable_background_ocr: bool, - pub ocr_page_segmentation_mode: i32, - pub ocr_engine_mode: i32, - pub ocr_min_confidence: f32, - pub ocr_dpi: i32, - pub ocr_enhance_contrast: bool, - pub ocr_remove_noise: bool, - pub ocr_detect_orientation: bool, - pub ocr_whitelist_chars: Option, - pub ocr_blacklist_chars: Option, - pub 
ocr_brightness_boost: f32, - pub ocr_contrast_multiplier: f32, - pub ocr_noise_reduction_level: i32, - pub ocr_sharpening_strength: f32, - pub ocr_morphological_operations: bool, - pub ocr_adaptive_threshold_window_size: i32, - pub ocr_histogram_equalization: bool, - pub ocr_upscale_factor: f32, - pub ocr_max_image_width: i32, - pub ocr_max_image_height: i32, - pub save_processed_images: bool, - pub ocr_quality_threshold_brightness: f32, - pub ocr_quality_threshold_contrast: f32, - pub ocr_quality_threshold_noise: f32, - pub ocr_quality_threshold_sharpness: f32, - pub ocr_skip_enhancement: bool, - pub webdav_enabled: bool, - pub webdav_server_url: Option, - pub webdav_username: Option, - pub webdav_password: Option, - pub webdav_watch_folders: Vec, - pub webdav_file_extensions: Vec, - pub webdav_auto_sync: bool, - pub webdav_sync_interval_minutes: i32, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct UpdateSettings { - pub ocr_language: Option, - pub concurrent_ocr_jobs: Option, - pub ocr_timeout_seconds: Option, - pub max_file_size_mb: Option, - pub allowed_file_types: Option>, - pub auto_rotate_images: Option, - pub enable_image_preprocessing: Option, - pub search_results_per_page: Option, - pub search_snippet_length: Option, - pub fuzzy_search_threshold: Option, - pub retention_days: Option>, - pub enable_auto_cleanup: Option, - pub enable_compression: Option, - pub memory_limit_mb: Option, - pub cpu_priority: Option, - pub enable_background_ocr: Option, - pub ocr_page_segmentation_mode: Option, - pub ocr_engine_mode: Option, - pub ocr_min_confidence: Option, - pub ocr_dpi: Option, - pub ocr_enhance_contrast: Option, - pub ocr_remove_noise: Option, - pub ocr_detect_orientation: Option, - pub ocr_whitelist_chars: Option>, - pub ocr_blacklist_chars: Option>, - pub ocr_brightness_boost: Option, - pub ocr_contrast_multiplier: Option, - pub ocr_noise_reduction_level: Option, - pub ocr_sharpening_strength: Option, - pub 
ocr_morphological_operations: Option, - pub ocr_adaptive_threshold_window_size: Option, - pub ocr_histogram_equalization: Option, - pub ocr_upscale_factor: Option, - pub ocr_max_image_width: Option, - pub ocr_max_image_height: Option, - pub save_processed_images: Option, - pub ocr_quality_threshold_brightness: Option, - pub ocr_quality_threshold_contrast: Option, - pub ocr_quality_threshold_noise: Option, - pub ocr_quality_threshold_sharpness: Option, - pub ocr_skip_enhancement: Option, - pub webdav_enabled: Option, - pub webdav_server_url: Option>, - pub webdav_username: Option>, - pub webdav_password: Option>, - pub webdav_watch_folders: Option>, - pub webdav_file_extensions: Option>, - pub webdav_auto_sync: Option, - pub webdav_sync_interval_minutes: Option, -} - -impl From for SettingsResponse { - fn from(settings: Settings) -> Self { - Self { - ocr_language: settings.ocr_language, - concurrent_ocr_jobs: settings.concurrent_ocr_jobs, - ocr_timeout_seconds: settings.ocr_timeout_seconds, - max_file_size_mb: settings.max_file_size_mb, - allowed_file_types: settings.allowed_file_types, - auto_rotate_images: settings.auto_rotate_images, - enable_image_preprocessing: settings.enable_image_preprocessing, - search_results_per_page: settings.search_results_per_page, - search_snippet_length: settings.search_snippet_length, - fuzzy_search_threshold: settings.fuzzy_search_threshold, - retention_days: settings.retention_days, - enable_auto_cleanup: settings.enable_auto_cleanup, - enable_compression: settings.enable_compression, - memory_limit_mb: settings.memory_limit_mb, - cpu_priority: settings.cpu_priority, - enable_background_ocr: settings.enable_background_ocr, - ocr_page_segmentation_mode: settings.ocr_page_segmentation_mode, - ocr_engine_mode: settings.ocr_engine_mode, - ocr_min_confidence: settings.ocr_min_confidence, - ocr_dpi: settings.ocr_dpi, - ocr_enhance_contrast: settings.ocr_enhance_contrast, - ocr_remove_noise: settings.ocr_remove_noise, - 
ocr_detect_orientation: settings.ocr_detect_orientation, - ocr_whitelist_chars: settings.ocr_whitelist_chars, - ocr_blacklist_chars: settings.ocr_blacklist_chars, - ocr_brightness_boost: settings.ocr_brightness_boost, - ocr_contrast_multiplier: settings.ocr_contrast_multiplier, - ocr_noise_reduction_level: settings.ocr_noise_reduction_level, - ocr_sharpening_strength: settings.ocr_sharpening_strength, - ocr_morphological_operations: settings.ocr_morphological_operations, - ocr_adaptive_threshold_window_size: settings.ocr_adaptive_threshold_window_size, - ocr_histogram_equalization: settings.ocr_histogram_equalization, - ocr_upscale_factor: settings.ocr_upscale_factor, - ocr_max_image_width: settings.ocr_max_image_width, - ocr_max_image_height: settings.ocr_max_image_height, - save_processed_images: settings.save_processed_images, - ocr_quality_threshold_brightness: settings.ocr_quality_threshold_brightness, - ocr_quality_threshold_contrast: settings.ocr_quality_threshold_contrast, - ocr_quality_threshold_noise: settings.ocr_quality_threshold_noise, - ocr_quality_threshold_sharpness: settings.ocr_quality_threshold_sharpness, - ocr_skip_enhancement: settings.ocr_skip_enhancement, - webdav_enabled: settings.webdav_enabled, - webdav_server_url: settings.webdav_server_url, - webdav_username: settings.webdav_username, - webdav_password: settings.webdav_password, - webdav_watch_folders: settings.webdav_watch_folders, - webdav_file_extensions: settings.webdav_file_extensions, - webdav_auto_sync: settings.webdav_auto_sync, - webdav_sync_interval_minutes: settings.webdav_sync_interval_minutes, - } - } -} - -impl Default for Settings { - fn default() -> Self { - Self { - id: Uuid::new_v4(), - user_id: Uuid::nil(), - ocr_language: "eng".to_string(), - concurrent_ocr_jobs: 4, - ocr_timeout_seconds: 300, - max_file_size_mb: 50, - allowed_file_types: vec![ - "pdf".to_string(), - "png".to_string(), - "jpg".to_string(), - "jpeg".to_string(), - "tiff".to_string(), - 
"bmp".to_string(), - "txt".to_string(), - ], - auto_rotate_images: true, - enable_image_preprocessing: false, - search_results_per_page: 25, - search_snippet_length: 200, - fuzzy_search_threshold: 0.8, - retention_days: None, - enable_auto_cleanup: false, - enable_compression: false, - memory_limit_mb: 512, - cpu_priority: "normal".to_string(), - enable_background_ocr: true, - ocr_page_segmentation_mode: 3, // PSM_AUTO_OSD - Fully automatic page segmentation, but no OSD - ocr_engine_mode: 3, // OEM_DEFAULT - Default, based on what is available - ocr_min_confidence: 30.0, // Minimum confidence threshold (0-100) - ocr_dpi: 300, // Optimal DPI for OCR - ocr_enhance_contrast: true, // Enable contrast enhancement - ocr_remove_noise: true, // Enable noise removal - ocr_detect_orientation: true, // Enable orientation detection - ocr_whitelist_chars: None, // No character whitelist by default - ocr_blacklist_chars: None, // No character blacklist by default - ocr_brightness_boost: 1.0, // Conservative brightness boost - ocr_contrast_multiplier: 1.2, // Conservative contrast enhancement - ocr_noise_reduction_level: 1, // Light noise reduction - ocr_sharpening_strength: 0.5, // Light sharpening - ocr_morphological_operations: false, // Conservative - no morphological ops by default - ocr_adaptive_threshold_window_size: 15, // Small window for adaptive threshold - ocr_histogram_equalization: false, // Conservative - no histogram equalization by default - ocr_upscale_factor: 1.0, // No upscaling by default - ocr_max_image_width: 3000, // Reasonable max width - ocr_max_image_height: 3000, // Reasonable max height - save_processed_images: false, // Conservative - don't save by default - ocr_quality_threshold_brightness: 0.3, // Conservative threshold - ocr_quality_threshold_contrast: 0.2, // Conservative threshold - ocr_quality_threshold_noise: 0.7, // Conservative threshold - ocr_quality_threshold_sharpness: 0.3, // Conservative threshold - ocr_skip_enhancement: false, // Allow 
enhancement by default - webdav_enabled: false, - webdav_server_url: None, - webdav_username: None, - webdav_password: None, - webdav_watch_folders: vec!["/Documents".to_string()], - webdav_file_extensions: vec![ - "pdf".to_string(), - "png".to_string(), - "jpg".to_string(), - "jpeg".to_string(), - "tiff".to_string(), - "bmp".to_string(), - "txt".to_string(), - ], - webdav_auto_sync: false, - webdav_sync_interval_minutes: 60, - created_at: Utc::now(), - updated_at: Utc::now(), - } - } -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct WebDAVFolderInfo { - pub path: String, - pub total_files: i64, - pub supported_files: i64, - pub estimated_time_hours: f32, - pub total_size_mb: f64, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct WebDAVCrawlEstimate { - pub folders: Vec, - pub total_files: i64, - pub total_supported_files: i64, - pub total_estimated_time_hours: f32, - pub total_size_mb: f64, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct WebDAVTestConnection { - pub server_url: String, - pub username: String, - pub password: String, - pub server_type: Option, // "nextcloud", "owncloud", "generic" -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct WebDAVConnectionResult { - pub success: bool, - pub message: String, - pub server_version: Option, - pub server_type: Option, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct WebDAVSyncStatus { - pub is_running: bool, - pub last_sync: Option>, - pub files_processed: i64, - pub files_remaining: i64, - pub current_folder: Option, - pub errors: Vec, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct Notification { - pub id: Uuid, - pub user_id: Uuid, - pub notification_type: String, - pub title: String, - pub message: String, - pub read: bool, - pub action_url: Option, - pub metadata: Option, - pub created_at: DateTime, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct CreateNotification { - 
pub notification_type: String, - pub title: String, - pub message: String, - pub action_url: Option, - pub metadata: Option, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct NotificationSummary { - pub unread_count: i64, - pub recent_notifications: Vec, -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct WebDAVSyncState { - pub id: Uuid, - pub user_id: Uuid, - pub last_sync_at: Option>, - pub sync_cursor: Option, - pub is_running: bool, - pub files_processed: i64, - pub files_remaining: i64, - pub current_folder: Option, - pub errors: Vec, - pub created_at: DateTime, - pub updated_at: DateTime, -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct UpdateWebDAVSyncState { - pub last_sync_at: Option>, - pub sync_cursor: Option, - pub is_running: bool, - pub files_processed: i64, - pub files_remaining: i64, - pub current_folder: Option, - pub errors: Vec, -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct WebDAVFile { - pub id: Uuid, - pub user_id: Uuid, - pub webdav_path: String, - pub etag: String, - pub last_modified: Option>, - pub file_size: i64, - pub mime_type: String, - pub document_id: Option, - pub sync_status: String, - pub sync_error: Option, - pub created_at: DateTime, - pub updated_at: DateTime, -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct CreateWebDAVFile { - pub user_id: Uuid, - pub webdav_path: String, - pub etag: String, - pub last_modified: Option>, - pub file_size: i64, - pub mime_type: String, - pub document_id: Option, - pub sync_status: String, - pub sync_error: Option, -} - -#[derive(Debug, Clone)] -pub struct FileInfo { - pub path: String, - pub name: String, - pub size: i64, - pub mime_type: String, - pub last_modified: Option>, - pub etag: String, - pub is_directory: bool, - /// Original file creation time from source system - pub created_at: Option>, - /// File permissions (Unix mode bits or similar) - pub permissions: Option, - /// File owner (username or uid) - pub owner: Option, - /// 
File group (groupname or gid) - pub group: Option, - /// Additional metadata from source (EXIF, PDF metadata, custom attributes, etc.) - pub metadata: Option, -} - -#[derive(Debug, Serialize, Deserialize, FromRow)] -pub struct WebDAVDirectory { - pub id: Uuid, - pub user_id: Uuid, - pub directory_path: String, - pub directory_etag: String, - pub last_scanned_at: DateTime, - pub file_count: i64, - pub total_size_bytes: i64, - pub created_at: DateTime, - pub updated_at: DateTime, -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct CreateWebDAVDirectory { - pub user_id: Uuid, - pub directory_path: String, - pub directory_etag: String, - pub file_count: i64, - pub total_size_bytes: i64, -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct UpdateWebDAVDirectory { - pub directory_etag: String, - pub last_scanned_at: DateTime, - pub file_count: i64, - pub total_size_bytes: i64, -} - -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash, ToSchema)] -pub enum SourceType { - #[serde(rename = "webdav")] - WebDAV, - #[serde(rename = "local_folder")] - LocalFolder, - #[serde(rename = "s3")] - S3, -} - -impl std::fmt::Display for SourceType { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - SourceType::WebDAV => write!(f, "webdav"), - SourceType::LocalFolder => write!(f, "local_folder"), - SourceType::S3 => write!(f, "s3"), - } - } -} - -impl TryFrom for SourceType { - type Error = String; - - fn try_from(value: String) -> Result { - match value.as_str() { - "webdav" => Ok(SourceType::WebDAV), - "local_folder" => Ok(SourceType::LocalFolder), - "s3" => Ok(SourceType::S3), - _ => Err(format!("Invalid source type: {}", value)), - } - } -} - -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, ToSchema)] -pub enum SourceStatus { - #[serde(rename = "idle")] - Idle, - #[serde(rename = "syncing")] - Syncing, - #[serde(rename = "error")] - Error, -} - -impl std::fmt::Display for SourceStatus { - fn fmt(&self, f: 
&mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - SourceStatus::Idle => write!(f, "idle"), - SourceStatus::Syncing => write!(f, "syncing"), - SourceStatus::Error => write!(f, "error"), - } - } -} - -impl TryFrom for SourceStatus { - type Error = String; - - fn try_from(value: String) -> Result>::Error> { - match value.as_str() { - "idle" => Ok(SourceStatus::Idle), - "syncing" => Ok(SourceStatus::Syncing), - "error" => Ok(SourceStatus::Error), - _ => Err(format!("Invalid source status: {}", value)), - } - } -} - -#[derive(Debug, Clone, Serialize, Deserialize, FromRow, ToSchema)] -pub struct Source { - pub id: Uuid, - pub user_id: Uuid, - pub name: String, - #[sqlx(try_from = "String")] - pub source_type: SourceType, - pub enabled: bool, - pub config: serde_json::Value, - #[sqlx(try_from = "String")] - pub status: SourceStatus, - pub last_sync_at: Option>, - pub last_error: Option, - pub last_error_at: Option>, - pub total_files_synced: i64, - pub total_files_pending: i64, - pub total_size_bytes: i64, - pub created_at: DateTime, - pub updated_at: DateTime, - // Validation status tracking - #[sqlx(default)] - pub validation_status: Option, - #[sqlx(default)] - pub last_validation_at: Option>, - #[sqlx(default)] - pub validation_score: Option, // 0-100 health score - #[sqlx(default)] - pub validation_issues: Option, // JSON array of validation issues -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct SourceResponse { - pub id: Uuid, - pub name: String, - pub source_type: SourceType, - pub enabled: bool, - pub config: serde_json::Value, - pub status: SourceStatus, - pub last_sync_at: Option>, - pub last_error: Option, - pub last_error_at: Option>, - pub total_files_synced: i64, - pub total_files_pending: i64, - pub total_size_bytes: i64, - pub created_at: DateTime, - pub updated_at: DateTime, - /// Total number of documents/files currently stored from this source - #[serde(default)] - pub total_documents: i64, - /// Total number of 
documents that have been OCR'd from this source - #[serde(default)] - pub total_documents_ocr: i64, - /// Validation status and health score - #[serde(default)] - pub validation_status: Option, - #[serde(default)] - pub last_validation_at: Option>, - #[serde(default)] - pub validation_score: Option, - #[serde(default)] - pub validation_issues: Option, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct CreateSource { - pub name: String, - pub source_type: SourceType, - pub enabled: Option, - pub config: serde_json::Value, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct UpdateSource { - pub name: Option, - pub enabled: Option, - pub config: Option, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct SourceWithStats { - pub source: SourceResponse, - pub recent_documents: Vec, - pub sync_progress: Option, -} - -impl From for SourceResponse { - fn from(source: Source) -> Self { - Self { - id: source.id, - name: source.name, - source_type: source.source_type, - enabled: source.enabled, - config: source.config, - status: source.status, - last_sync_at: source.last_sync_at, - last_error: source.last_error, - last_error_at: source.last_error_at, - total_files_synced: source.total_files_synced, - total_files_pending: source.total_files_pending, - total_size_bytes: source.total_size_bytes, - created_at: source.created_at, - updated_at: source.updated_at, - // These will be populated separately when needed - total_documents: 0, - total_documents_ocr: 0, - // Validation fields - validation_status: source.validation_status, - last_validation_at: source.last_validation_at, - validation_score: source.validation_score, - validation_issues: source.validation_issues, - } - } -} - -#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] -pub struct WebDAVSourceConfig { - pub server_url: String, - pub username: String, - pub password: String, - pub watch_folders: Vec, - pub file_extensions: Vec, - pub auto_sync: bool, - pub 
sync_interval_minutes: i32, - pub server_type: Option, -} - -#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] -pub struct LocalFolderSourceConfig { - pub watch_folders: Vec, - pub file_extensions: Vec, - pub auto_sync: bool, - pub sync_interval_minutes: i32, - pub recursive: bool, - pub follow_symlinks: bool, -} - -#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] -pub struct S3SourceConfig { - pub bucket_name: String, - pub region: String, - pub access_key_id: String, - pub secret_access_key: String, - pub endpoint_url: Option, // For S3-compatible services - pub prefix: Option, // Optional path prefix - pub watch_folders: Vec, // S3 prefixes to monitor - pub file_extensions: Vec, - pub auto_sync: bool, - pub sync_interval_minutes: i32, -} - -#[derive(Debug, Clone, Serialize, Deserialize, FromRow, ToSchema)] -pub struct ProcessedImage { - pub id: Uuid, - pub document_id: Uuid, - pub user_id: Uuid, - pub original_image_path: String, - pub processed_image_path: String, - pub processing_parameters: serde_json::Value, - pub processing_steps: Vec, - pub image_width: i32, - pub image_height: i32, - pub file_size: i64, - pub created_at: DateTime, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct CreateProcessedImage { - pub document_id: Uuid, - pub user_id: Uuid, - pub original_image_path: String, - pub processed_image_path: String, - pub processing_parameters: serde_json::Value, - pub processing_steps: Vec, - pub image_width: i32, - pub image_height: i32, - pub file_size: i64, -} - -#[derive(Debug, Clone, Serialize, Deserialize, FromRow, ToSchema)] -pub struct IgnoredFile { - pub id: Uuid, - pub file_hash: String, - pub filename: String, - pub original_filename: String, - pub file_path: String, - pub file_size: i64, - pub mime_type: String, - pub source_type: Option, - pub source_path: Option, - pub source_identifier: Option, - pub ignored_at: DateTime, - pub ignored_by: Uuid, - pub reason: Option, - pub created_at: DateTime, -} - 
-#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct IgnoredFileResponse { - pub id: Uuid, - pub file_hash: String, - pub filename: String, - pub original_filename: String, - pub file_path: String, - pub file_size: i64, - pub mime_type: String, - pub source_type: Option, - pub source_path: Option, - pub source_identifier: Option, - pub ignored_at: DateTime, - pub ignored_by: Uuid, - pub ignored_by_username: Option, - pub reason: Option, - pub created_at: DateTime, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct CreateIgnoredFile { - pub file_hash: String, - pub filename: String, - pub original_filename: String, - pub file_path: String, - pub file_size: i64, - pub mime_type: String, - pub source_type: Option, - pub source_path: Option, - pub source_identifier: Option, - pub ignored_by: Uuid, - pub reason: Option, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema, IntoParams)] -pub struct IgnoredFilesQuery { - /// Maximum number of results to return (default: 25) - pub limit: Option, - /// Number of results to skip for pagination (default: 0) - pub offset: Option, - /// Filter by source type - pub source_type: Option, - /// Filter by source identifier (specific source) - pub source_identifier: Option, - /// Filter by user who ignored the files - pub ignored_by: Option, - /// Search by filename - pub filename: Option, -} - -impl From for IgnoredFileResponse { - fn from(ignored_file: IgnoredFile) -> Self { - Self { - id: ignored_file.id, - file_hash: ignored_file.file_hash, - filename: ignored_file.filename, - original_filename: ignored_file.original_filename, - file_path: ignored_file.file_path, - file_size: ignored_file.file_size, - mime_type: ignored_file.mime_type, - source_type: ignored_file.source_type, - source_path: ignored_file.source_path, - source_identifier: ignored_file.source_identifier, - ignored_at: ignored_file.ignored_at, - ignored_by: ignored_file.ignored_by, - ignored_by_username: None, // Will be populated 
separately where needed - reason: ignored_file.reason, - created_at: ignored_file.created_at, - } - } -} - -// Additional response schemas for better API documentation - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct DocumentListResponse { - /// List of documents - pub documents: Vec, - /// Total number of documents (without pagination) - pub total: i64, - /// Number of documents returned in this response - pub count: i64, - /// Pagination offset used - pub offset: i64, - /// Pagination limit used - pub limit: i64, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct DocumentOcrResponse { - /// Document ID - pub document_id: Uuid, - /// Original filename - pub filename: String, - /// Whether the document has OCR text available - pub has_ocr_text: bool, - /// OCR text content (if available) - pub ocr_text: Option, - /// OCR processing confidence score (0-100) - pub ocr_confidence: Option, - /// Current OCR processing status - pub ocr_status: Option, - /// Time taken for OCR processing in milliseconds - pub ocr_processing_time_ms: Option, - /// Language detected in the document - pub detected_language: Option, - /// Number of pages processed (for multi-page documents) - pub pages_processed: Option, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct DocumentOperationResponse { - /// Whether the operation was successful - pub success: bool, - /// Human-readable message describing the result - pub message: String, - /// Document ID(s) affected by the operation - pub document_ids: Vec, - /// Number of documents processed - pub count: i64, - /// Any warnings or additional information - pub warnings: Vec, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct BulkDeleteResponse { - /// Whether the operation was successful - pub success: bool, - /// Number of documents successfully deleted - pub deleted_count: i64, - /// Number of documents that failed to delete - pub failed_count: i64, - /// List of document 
IDs that were successfully deleted - pub deleted_documents: Vec, - /// List of document IDs that failed to delete - pub failed_documents: Vec, - /// Number of files successfully deleted from storage - pub files_deleted: i64, - /// Number of files that failed to delete from storage - pub files_failed: i64, - /// Any warnings or additional information - pub warnings: Vec, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct PaginationInfo { - /// Total number of items available - pub total: i64, - /// Number of items returned in current response - pub count: i64, - /// Current offset - pub offset: i64, - /// Current limit - pub limit: i64, - /// Whether there are more items available - pub has_more: bool, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct DocumentDuplicatesResponse { - /// List of document groups that are duplicates of each other - pub duplicate_groups: Vec>, - /// Total number of duplicate documents found - pub total_duplicates: i64, - /// Number of duplicate groups - pub group_count: i64, - /// Pagination information - pub pagination: PaginationInfo, -} \ No newline at end of file diff --git a/src/models/document.rs b/src/models/document.rs new file mode 100644 index 0000000..0eb019c --- /dev/null +++ b/src/models/document.rs @@ -0,0 +1,262 @@ +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use sqlx::FromRow; +use uuid::Uuid; +use utoipa::ToSchema; +use serde_json; + +#[derive(Debug, Clone, Serialize, Deserialize, FromRow)] +pub struct Document { + pub id: Uuid, + pub filename: String, + pub original_filename: String, + pub file_path: String, + pub file_size: i64, + pub mime_type: String, + pub content: Option, + pub ocr_text: Option, + pub ocr_confidence: Option, + pub ocr_word_count: Option, + pub ocr_processing_time_ms: Option, + pub ocr_status: Option, + pub ocr_error: Option, + pub ocr_completed_at: Option>, + pub ocr_retry_count: Option, + pub ocr_failure_reason: Option, + pub tags: Vec, 
+ pub created_at: DateTime, + pub updated_at: DateTime, + pub user_id: Uuid, + pub file_hash: Option, + /// Original file creation timestamp from source system + pub original_created_at: Option>, + /// Original file modification timestamp from source system + pub original_modified_at: Option>, + /// Additional metadata from source system (permissions, attributes, EXIF data, etc.) + pub source_metadata: Option, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, ToSchema)] +pub enum FailureReason { + #[serde(rename = "duplicate_content")] + DuplicateContent, + #[serde(rename = "duplicate_filename")] + DuplicateFilename, + #[serde(rename = "unsupported_format")] + UnsupportedFormat, + #[serde(rename = "file_too_large")] + FileTooLarge, + #[serde(rename = "file_corrupted")] + FileCorrupted, + #[serde(rename = "access_denied")] + AccessDenied, + #[serde(rename = "low_ocr_confidence")] + LowOcrConfidence, + #[serde(rename = "ocr_timeout")] + OcrTimeout, + #[serde(rename = "ocr_memory_limit")] + OcrMemoryLimit, + #[serde(rename = "pdf_parsing_error")] + PdfParsingError, + #[serde(rename = "storage_quota_exceeded")] + StorageQuotaExceeded, + #[serde(rename = "network_error")] + NetworkError, + #[serde(rename = "permission_denied")] + PermissionDenied, + #[serde(rename = "virus_detected")] + VirusDetected, + #[serde(rename = "invalid_structure")] + InvalidStructure, + #[serde(rename = "policy_violation")] + PolicyViolation, + #[serde(rename = "other")] + Other, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, ToSchema)] +pub enum FailureStage { + #[serde(rename = "ingestion")] + Ingestion, + #[serde(rename = "validation")] + Validation, + #[serde(rename = "ocr")] + Ocr, + #[serde(rename = "storage")] + Storage, + #[serde(rename = "processing")] + Processing, + #[serde(rename = "sync")] + Sync, +} + +impl std::fmt::Display for FailureReason { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + 
FailureReason::DuplicateContent => write!(f, "duplicate_content"), + FailureReason::DuplicateFilename => write!(f, "duplicate_filename"), + FailureReason::UnsupportedFormat => write!(f, "unsupported_format"), + FailureReason::FileTooLarge => write!(f, "file_too_large"), + FailureReason::FileCorrupted => write!(f, "file_corrupted"), + FailureReason::AccessDenied => write!(f, "access_denied"), + FailureReason::LowOcrConfidence => write!(f, "low_ocr_confidence"), + FailureReason::OcrTimeout => write!(f, "ocr_timeout"), + FailureReason::OcrMemoryLimit => write!(f, "ocr_memory_limit"), + FailureReason::PdfParsingError => write!(f, "pdf_parsing_error"), + FailureReason::StorageQuotaExceeded => write!(f, "storage_quota_exceeded"), + FailureReason::NetworkError => write!(f, "network_error"), + FailureReason::PermissionDenied => write!(f, "permission_denied"), + FailureReason::VirusDetected => write!(f, "virus_detected"), + FailureReason::InvalidStructure => write!(f, "invalid_structure"), + FailureReason::PolicyViolation => write!(f, "policy_violation"), + FailureReason::Other => write!(f, "other"), + } + } +} + +impl std::fmt::Display for FailureStage { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + FailureStage::Ingestion => write!(f, "ingestion"), + FailureStage::Validation => write!(f, "validation"), + FailureStage::Ocr => write!(f, "ocr"), + FailureStage::Storage => write!(f, "storage"), + FailureStage::Processing => write!(f, "processing"), + FailureStage::Sync => write!(f, "sync"), + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, FromRow, ToSchema)] +pub struct FailedDocument { + /// Unique identifier for the failed document record + pub id: Uuid, + /// User who attempted to ingest the document + pub user_id: Uuid, + /// Filename of the failed document + pub filename: String, + /// Original filename when uploaded + pub original_filename: Option, + /// Original path where the file was located + pub original_path: 
Option, + /// Stored file path (if file was saved before failure) + pub file_path: Option, + /// Size of the file in bytes + pub file_size: Option, + /// SHA256 hash of the file content + pub file_hash: Option, + /// MIME type of the file + pub mime_type: Option, + /// Raw content if extracted before failure + pub content: Option, + /// Tags that were assigned/detected + pub tags: Vec, + /// Partial OCR text if extracted before failure + pub ocr_text: Option, + /// OCR confidence if calculated + pub ocr_confidence: Option, + /// Word count if calculated + pub ocr_word_count: Option, + /// Processing time before failure in milliseconds + pub ocr_processing_time_ms: Option, + /// Reason why the document failed + pub failure_reason: String, + /// Stage at which the document failed + pub failure_stage: String, + /// Reference to existing document if failed due to duplicate + pub existing_document_id: Option, + /// Source of the ingestion attempt + pub ingestion_source: String, + /// Detailed error message + pub error_message: Option, + /// Number of retry attempts + pub retry_count: Option, + /// Last retry timestamp + pub last_retry_at: Option>, + /// When the document failed + pub created_at: DateTime, + /// Last update timestamp + pub updated_at: DateTime, +} + +#[derive(Debug, Clone, Serialize, Deserialize, FromRow, ToSchema)] +pub struct ProcessedImage { + pub id: Uuid, + pub document_id: Uuid, + pub user_id: Uuid, + pub original_image_path: String, + pub processed_image_path: String, + pub processing_parameters: serde_json::Value, + pub processing_steps: Vec, + pub image_width: i32, + pub image_height: i32, + pub file_size: i64, + pub created_at: DateTime, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct CreateProcessedImage { + pub document_id: Uuid, + pub user_id: Uuid, + pub original_image_path: String, + pub processed_image_path: String, + pub processing_parameters: serde_json::Value, + pub processing_steps: Vec, + pub image_width: i32, + 
pub image_height: i32, + pub file_size: i64, +} + +#[derive(Debug, Clone, Serialize, Deserialize, FromRow, ToSchema)] +pub struct IgnoredFile { + pub id: Uuid, + pub file_hash: String, + pub filename: String, + pub original_filename: String, + pub file_path: String, + pub file_size: i64, + pub mime_type: String, + pub source_type: Option, + pub source_path: Option, + pub source_identifier: Option, + pub ignored_at: DateTime, + pub ignored_by: Uuid, + pub reason: Option, + pub created_at: DateTime, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct CreateIgnoredFile { + pub file_hash: String, + pub filename: String, + pub original_filename: String, + pub file_path: String, + pub file_size: i64, + pub mime_type: String, + pub source_type: Option, + pub source_path: Option, + pub source_identifier: Option, + pub ignored_by: Uuid, + pub reason: Option, +} + +#[derive(Debug, Clone)] +pub struct FileInfo { + pub path: String, + pub name: String, + pub size: i64, + pub mime_type: String, + pub last_modified: Option>, + pub etag: String, + pub is_directory: bool, + /// Original file creation time from source system + pub created_at: Option>, + /// File permissions (Unix mode bits or similar) + pub permissions: Option, + /// File owner (username or uid) + pub owner: Option, + /// File group (groupname or gid) + pub group: Option, + /// Additional metadata from source (EXIF, PDF metadata, custom attributes, etc.) 
+ pub metadata: Option, +} \ No newline at end of file diff --git a/src/models/mod.rs b/src/models/mod.rs new file mode 100644 index 0000000..0e3e6e7 --- /dev/null +++ b/src/models/mod.rs @@ -0,0 +1,16 @@ +// Re-export all model types for backward compatibility and ease of use + +pub mod user; +pub mod document; +pub mod search; +pub mod settings; +pub mod source; +pub mod responses; + +// Re-export commonly used types +pub use user::*; +pub use document::*; +pub use search::*; +pub use settings::*; +pub use source::*; +pub use responses::*; \ No newline at end of file diff --git a/src/models/responses.rs b/src/models/responses.rs new file mode 100644 index 0000000..980e0ed --- /dev/null +++ b/src/models/responses.rs @@ -0,0 +1,275 @@ +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; +use utoipa::{ToSchema, IntoParams}; +use serde_json; + +use super::document::Document; + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct SearchSnippet { + /// The snippet text content + pub text: String, + /// Starting character position in the original document + pub start_offset: i32, + /// Ending character position in the original document + pub end_offset: i32, + /// Ranges within the snippet that should be highlighted + pub highlight_ranges: Vec, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct HighlightRange { + /// Start position of highlight within the snippet + pub start: i32, + /// End position of highlight within the snippet + pub end: i32, +} + +#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] +pub struct DocumentResponse { + /// Unique identifier for the document + pub id: Uuid, + /// Current filename in the system + pub filename: String, + /// Original filename when uploaded + pub original_filename: String, + /// File size in bytes + pub file_size: i64, + /// MIME type of the file + pub mime_type: String, + /// Tags associated with the document + pub tags: Vec, + /// Labels associated with 
the document + #[serde(default)] + pub labels: Vec, + /// When the document was created + pub created_at: DateTime, + /// Whether OCR text has been extracted + pub has_ocr_text: bool, + /// OCR confidence score (0-100, higher is better) + pub ocr_confidence: Option, + /// Number of words detected by OCR + pub ocr_word_count: Option, + /// Time taken for OCR processing in milliseconds + pub ocr_processing_time_ms: Option, + /// Current status of OCR processing (pending, processing, completed, failed) + pub ocr_status: Option, + /// Original file creation timestamp from source system + #[serde(skip_serializing_if = "Option::is_none", default)] + pub original_created_at: Option>, + /// Original file modification timestamp from source system + #[serde(skip_serializing_if = "Option::is_none", default)] + pub original_modified_at: Option>, + /// Additional metadata from source system (permissions, attributes, etc.) + #[serde(skip_serializing_if = "Option::is_none", default)] + pub source_metadata: Option, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct EnhancedDocumentResponse { + /// Unique identifier for the document + pub id: Uuid, + /// Current filename in the system + pub filename: String, + /// Original filename when uploaded + pub original_filename: String, + /// File size in bytes + pub file_size: i64, + /// MIME type of the file + pub mime_type: String, + /// Tags associated with the document + pub tags: Vec, + /// When the document was created + pub created_at: DateTime, + /// Whether OCR text has been extracted + pub has_ocr_text: bool, + /// OCR confidence score (0-100, higher is better) + pub ocr_confidence: Option, + /// Number of words detected by OCR + pub ocr_word_count: Option, + /// Time taken for OCR processing in milliseconds + pub ocr_processing_time_ms: Option, + /// Current status of OCR processing (pending, processing, completed, failed) + pub ocr_status: Option, + /// Search relevance score (0-1, higher is more relevant) + 
pub search_rank: Option, + /// Text snippets showing search matches with highlights + pub snippets: Vec, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct IgnoredFileResponse { + pub id: Uuid, + pub file_hash: String, + pub filename: String, + pub original_filename: String, + pub file_path: String, + pub file_size: i64, + pub mime_type: String, + pub source_type: Option, + pub source_path: Option, + pub source_identifier: Option, + pub ignored_at: DateTime, + pub ignored_by: Uuid, + pub ignored_by_username: Option, + pub reason: Option, + pub created_at: DateTime, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct DocumentListResponse { + /// List of documents + pub documents: Vec, + /// Total number of documents (without pagination) + pub total: i64, + /// Number of documents returned in this response + pub count: i64, + /// Pagination offset used + pub offset: i64, + /// Pagination limit used + pub limit: i64, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct DocumentOcrResponse { + /// Document ID + pub document_id: Uuid, + /// Original filename + pub filename: String, + /// Whether the document has OCR text available + pub has_ocr_text: bool, + /// OCR text content (if available) + pub ocr_text: Option, + /// OCR processing confidence score (0-100) + pub ocr_confidence: Option, + /// Current OCR processing status + pub ocr_status: Option, + /// Time taken for OCR processing in milliseconds + pub ocr_processing_time_ms: Option, + /// Language detected in the document + pub detected_language: Option, + /// Number of pages processed (for multi-page documents) + pub pages_processed: Option, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct DocumentOperationResponse { + /// Whether the operation was successful + pub success: bool, + /// Human-readable message describing the result + pub message: String, + /// Document ID(s) affected by the operation + pub document_ids: Vec, + /// Number of 
documents processed + pub count: i64, + /// Any warnings or additional information + pub warnings: Vec, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct BulkDeleteResponse { + /// Whether the operation was successful + pub success: bool, + /// Number of documents successfully deleted + pub deleted_count: i64, + /// Number of documents that failed to delete + pub failed_count: i64, + /// List of document IDs that were successfully deleted + pub deleted_documents: Vec, + /// List of document IDs that failed to delete + pub failed_documents: Vec, + /// Number of files successfully deleted from storage + pub files_deleted: i64, + /// Number of files that failed to delete from storage + pub files_failed: i64, + /// Any warnings or additional information + pub warnings: Vec, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct PaginationInfo { + /// Total number of items available + pub total: i64, + /// Number of items returned in current response + pub count: i64, + /// Current offset + pub offset: i64, + /// Current limit + pub limit: i64, + /// Whether there are more items available + pub has_more: bool, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct DocumentDuplicatesResponse { + /// List of document groups that are duplicates of each other + pub duplicate_groups: Vec>, + /// Total number of duplicate documents found + pub total_duplicates: i64, + /// Number of duplicate groups + pub group_count: i64, + /// Pagination information + pub pagination: PaginationInfo, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema, IntoParams)] +pub struct IgnoredFilesQuery { + /// Maximum number of results to return (default: 25) + pub limit: Option, + /// Number of results to skip for pagination (default: 0) + pub offset: Option, + /// Filter by source type + pub source_type: Option, + /// Filter by source identifier (specific source) + pub source_identifier: Option, + /// Filter by user who ignored the files + pub 
ignored_by: Option, + /// Search by filename + pub filename: Option, +} + +impl From for DocumentResponse { + fn from(doc: Document) -> Self { + Self { + id: doc.id, + filename: doc.filename, + original_filename: doc.original_filename, + file_size: doc.file_size, + mime_type: doc.mime_type, + tags: doc.tags, + labels: Vec::new(), // Labels will be populated separately where needed + created_at: doc.created_at, + has_ocr_text: doc.ocr_text.is_some(), + ocr_confidence: doc.ocr_confidence, + ocr_word_count: doc.ocr_word_count, + ocr_processing_time_ms: doc.ocr_processing_time_ms, + ocr_status: doc.ocr_status, + original_created_at: doc.original_created_at, + original_modified_at: doc.original_modified_at, + source_metadata: doc.source_metadata, + } + } +} + +impl From for IgnoredFileResponse { + fn from(ignored_file: crate::models::document::IgnoredFile) -> Self { + Self { + id: ignored_file.id, + file_hash: ignored_file.file_hash, + filename: ignored_file.filename, + original_filename: ignored_file.original_filename, + file_path: ignored_file.file_path, + file_size: ignored_file.file_size, + mime_type: ignored_file.mime_type, + source_type: ignored_file.source_type, + source_path: ignored_file.source_path, + source_identifier: ignored_file.source_identifier, + ignored_at: ignored_file.ignored_at, + ignored_by: ignored_file.ignored_by, + ignored_by_username: None, // Will be populated separately where needed + reason: ignored_file.reason, + created_at: ignored_file.created_at, + } + } +} \ No newline at end of file diff --git a/src/models/search.rs b/src/models/search.rs new file mode 100644 index 0000000..d9c8df5 --- /dev/null +++ b/src/models/search.rs @@ -0,0 +1,75 @@ +use serde::{Deserialize, Serialize}; +use utoipa::{ToSchema, IntoParams}; + +use super::responses::{EnhancedDocumentResponse, SearchSnippet, HighlightRange}; + +#[derive(Debug, Serialize, Deserialize, ToSchema, IntoParams)] +pub struct SearchRequest { + /// Search query text (searches both document 
content and OCR-extracted text) + pub query: String, + /// Filter by specific tags + pub tags: Option>, + /// Filter by MIME types (e.g., "application/pdf", "image/png") + pub mime_types: Option>, + /// Maximum number of results to return (default: 25) + pub limit: Option, + /// Number of results to skip for pagination (default: 0) + pub offset: Option, + /// Whether to include text snippets with search matches (default: true) + pub include_snippets: Option, + /// Length of text snippets in characters (default: 200) + pub snippet_length: Option, + /// Search algorithm to use (default: simple) + pub search_mode: Option, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub enum SearchMode { + /// Simple text search with basic word matching + #[serde(rename = "simple")] + Simple, + /// Exact phrase matching + #[serde(rename = "phrase")] + Phrase, + /// Fuzzy search using similarity matching (good for typos and partial matches) + #[serde(rename = "fuzzy")] + Fuzzy, + /// Boolean search with AND, OR, NOT operators + #[serde(rename = "boolean")] + Boolean, +} + +impl Default for SearchMode { + fn default() -> Self { + SearchMode::Simple + } +} + + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct SearchResponse { + /// List of matching documents with enhanced metadata and snippets + pub documents: Vec, + /// Total number of documents matching the search criteria + pub total: i64, + /// Time taken to execute the search in milliseconds + pub query_time_ms: u64, + /// Search suggestions for query improvement + pub suggestions: Vec, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct FacetItem { + /// The facet value (e.g., mime type or tag) + pub value: String, + /// Number of documents with this value + pub count: i64, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct SearchFacetsResponse { + /// MIME type facets with counts + pub mime_types: Vec, + /// Tag facets with counts + pub tags: Vec, +} \ No newline at end 
of file diff --git a/src/models/settings.rs b/src/models/settings.rs new file mode 100644 index 0000000..446c17d --- /dev/null +++ b/src/models/settings.rs @@ -0,0 +1,300 @@ +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use sqlx::FromRow; +use uuid::Uuid; +use utoipa::ToSchema; + +#[derive(Debug, Clone, Serialize, Deserialize, FromRow, ToSchema)] +pub struct Settings { + pub id: Uuid, + pub user_id: Uuid, + pub ocr_language: String, + pub concurrent_ocr_jobs: i32, + pub ocr_timeout_seconds: i32, + pub max_file_size_mb: i32, + pub allowed_file_types: Vec, + pub auto_rotate_images: bool, + pub enable_image_preprocessing: bool, + pub search_results_per_page: i32, + pub search_snippet_length: i32, + pub fuzzy_search_threshold: f32, + pub retention_days: Option, + pub enable_auto_cleanup: bool, + pub enable_compression: bool, + pub memory_limit_mb: i32, + pub cpu_priority: String, + pub enable_background_ocr: bool, + pub ocr_page_segmentation_mode: i32, + pub ocr_engine_mode: i32, + pub ocr_min_confidence: f32, + pub ocr_dpi: i32, + pub ocr_enhance_contrast: bool, + pub ocr_remove_noise: bool, + pub ocr_detect_orientation: bool, + pub ocr_whitelist_chars: Option, + pub ocr_blacklist_chars: Option, + pub ocr_brightness_boost: f32, + pub ocr_contrast_multiplier: f32, + pub ocr_noise_reduction_level: i32, + pub ocr_sharpening_strength: f32, + pub ocr_morphological_operations: bool, + pub ocr_adaptive_threshold_window_size: i32, + pub ocr_histogram_equalization: bool, + pub ocr_upscale_factor: f32, + pub ocr_max_image_width: i32, + pub ocr_max_image_height: i32, + pub save_processed_images: bool, + pub ocr_quality_threshold_brightness: f32, + pub ocr_quality_threshold_contrast: f32, + pub ocr_quality_threshold_noise: f32, + pub ocr_quality_threshold_sharpness: f32, + pub ocr_skip_enhancement: bool, + pub webdav_enabled: bool, + pub webdav_server_url: Option, + pub webdav_username: Option, + pub webdav_password: Option, + pub webdav_watch_folders: Vec, 
+ pub webdav_file_extensions: Vec, + pub webdav_auto_sync: bool, + pub webdav_sync_interval_minutes: i32, + pub created_at: DateTime, + pub updated_at: DateTime, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct SettingsResponse { + pub ocr_language: String, + pub concurrent_ocr_jobs: i32, + pub ocr_timeout_seconds: i32, + pub max_file_size_mb: i32, + pub allowed_file_types: Vec, + pub auto_rotate_images: bool, + pub enable_image_preprocessing: bool, + pub search_results_per_page: i32, + pub search_snippet_length: i32, + pub fuzzy_search_threshold: f32, + pub retention_days: Option, + pub enable_auto_cleanup: bool, + pub enable_compression: bool, + pub memory_limit_mb: i32, + pub cpu_priority: String, + pub enable_background_ocr: bool, + pub ocr_page_segmentation_mode: i32, + pub ocr_engine_mode: i32, + pub ocr_min_confidence: f32, + pub ocr_dpi: i32, + pub ocr_enhance_contrast: bool, + pub ocr_remove_noise: bool, + pub ocr_detect_orientation: bool, + pub ocr_whitelist_chars: Option, + pub ocr_blacklist_chars: Option, + pub ocr_brightness_boost: f32, + pub ocr_contrast_multiplier: f32, + pub ocr_noise_reduction_level: i32, + pub ocr_sharpening_strength: f32, + pub ocr_morphological_operations: bool, + pub ocr_adaptive_threshold_window_size: i32, + pub ocr_histogram_equalization: bool, + pub ocr_upscale_factor: f32, + pub ocr_max_image_width: i32, + pub ocr_max_image_height: i32, + pub save_processed_images: bool, + pub ocr_quality_threshold_brightness: f32, + pub ocr_quality_threshold_contrast: f32, + pub ocr_quality_threshold_noise: f32, + pub ocr_quality_threshold_sharpness: f32, + pub ocr_skip_enhancement: bool, + pub webdav_enabled: bool, + pub webdav_server_url: Option, + pub webdav_username: Option, + pub webdav_password: Option, + pub webdav_watch_folders: Vec, + pub webdav_file_extensions: Vec, + pub webdav_auto_sync: bool, + pub webdav_sync_interval_minutes: i32, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct 
UpdateSettings { + pub ocr_language: Option, + pub concurrent_ocr_jobs: Option, + pub ocr_timeout_seconds: Option, + pub max_file_size_mb: Option, + pub allowed_file_types: Option>, + pub auto_rotate_images: Option, + pub enable_image_preprocessing: Option, + pub search_results_per_page: Option, + pub search_snippet_length: Option, + pub fuzzy_search_threshold: Option, + pub retention_days: Option>, + pub enable_auto_cleanup: Option, + pub enable_compression: Option, + pub memory_limit_mb: Option, + pub cpu_priority: Option, + pub enable_background_ocr: Option, + pub ocr_page_segmentation_mode: Option, + pub ocr_engine_mode: Option, + pub ocr_min_confidence: Option, + pub ocr_dpi: Option, + pub ocr_enhance_contrast: Option, + pub ocr_remove_noise: Option, + pub ocr_detect_orientation: Option, + pub ocr_whitelist_chars: Option>, + pub ocr_blacklist_chars: Option>, + pub ocr_brightness_boost: Option, + pub ocr_contrast_multiplier: Option, + pub ocr_noise_reduction_level: Option, + pub ocr_sharpening_strength: Option, + pub ocr_morphological_operations: Option, + pub ocr_adaptive_threshold_window_size: Option, + pub ocr_histogram_equalization: Option, + pub ocr_upscale_factor: Option, + pub ocr_max_image_width: Option, + pub ocr_max_image_height: Option, + pub save_processed_images: Option, + pub ocr_quality_threshold_brightness: Option, + pub ocr_quality_threshold_contrast: Option, + pub ocr_quality_threshold_noise: Option, + pub ocr_quality_threshold_sharpness: Option, + pub ocr_skip_enhancement: Option, + pub webdav_enabled: Option, + pub webdav_server_url: Option>, + pub webdav_username: Option>, + pub webdav_password: Option>, + pub webdav_watch_folders: Option>, + pub webdav_file_extensions: Option>, + pub webdav_auto_sync: Option, + pub webdav_sync_interval_minutes: Option, +} + +impl From for SettingsResponse { + fn from(settings: Settings) -> Self { + Self { + ocr_language: settings.ocr_language, + concurrent_ocr_jobs: settings.concurrent_ocr_jobs, + 
ocr_timeout_seconds: settings.ocr_timeout_seconds, + max_file_size_mb: settings.max_file_size_mb, + allowed_file_types: settings.allowed_file_types, + auto_rotate_images: settings.auto_rotate_images, + enable_image_preprocessing: settings.enable_image_preprocessing, + search_results_per_page: settings.search_results_per_page, + search_snippet_length: settings.search_snippet_length, + fuzzy_search_threshold: settings.fuzzy_search_threshold, + retention_days: settings.retention_days, + enable_auto_cleanup: settings.enable_auto_cleanup, + enable_compression: settings.enable_compression, + memory_limit_mb: settings.memory_limit_mb, + cpu_priority: settings.cpu_priority, + enable_background_ocr: settings.enable_background_ocr, + ocr_page_segmentation_mode: settings.ocr_page_segmentation_mode, + ocr_engine_mode: settings.ocr_engine_mode, + ocr_min_confidence: settings.ocr_min_confidence, + ocr_dpi: settings.ocr_dpi, + ocr_enhance_contrast: settings.ocr_enhance_contrast, + ocr_remove_noise: settings.ocr_remove_noise, + ocr_detect_orientation: settings.ocr_detect_orientation, + ocr_whitelist_chars: settings.ocr_whitelist_chars, + ocr_blacklist_chars: settings.ocr_blacklist_chars, + ocr_brightness_boost: settings.ocr_brightness_boost, + ocr_contrast_multiplier: settings.ocr_contrast_multiplier, + ocr_noise_reduction_level: settings.ocr_noise_reduction_level, + ocr_sharpening_strength: settings.ocr_sharpening_strength, + ocr_morphological_operations: settings.ocr_morphological_operations, + ocr_adaptive_threshold_window_size: settings.ocr_adaptive_threshold_window_size, + ocr_histogram_equalization: settings.ocr_histogram_equalization, + ocr_upscale_factor: settings.ocr_upscale_factor, + ocr_max_image_width: settings.ocr_max_image_width, + ocr_max_image_height: settings.ocr_max_image_height, + save_processed_images: settings.save_processed_images, + ocr_quality_threshold_brightness: settings.ocr_quality_threshold_brightness, + ocr_quality_threshold_contrast: 
settings.ocr_quality_threshold_contrast, + ocr_quality_threshold_noise: settings.ocr_quality_threshold_noise, + ocr_quality_threshold_sharpness: settings.ocr_quality_threshold_sharpness, + ocr_skip_enhancement: settings.ocr_skip_enhancement, + webdav_enabled: settings.webdav_enabled, + webdav_server_url: settings.webdav_server_url, + webdav_username: settings.webdav_username, + webdav_password: settings.webdav_password, + webdav_watch_folders: settings.webdav_watch_folders, + webdav_file_extensions: settings.webdav_file_extensions, + webdav_auto_sync: settings.webdav_auto_sync, + webdav_sync_interval_minutes: settings.webdav_sync_interval_minutes, + } + } +} + +impl Default for Settings { + fn default() -> Self { + Self { + id: Uuid::new_v4(), + user_id: Uuid::nil(), + ocr_language: "eng".to_string(), + concurrent_ocr_jobs: 4, + ocr_timeout_seconds: 300, + max_file_size_mb: 50, + allowed_file_types: vec![ + "pdf".to_string(), + "png".to_string(), + "jpg".to_string(), + "jpeg".to_string(), + "tiff".to_string(), + "bmp".to_string(), + "txt".to_string(), + ], + auto_rotate_images: true, + enable_image_preprocessing: false, + search_results_per_page: 25, + search_snippet_length: 200, + fuzzy_search_threshold: 0.8, + retention_days: None, + enable_auto_cleanup: false, + enable_compression: false, + memory_limit_mb: 512, + cpu_priority: "normal".to_string(), + enable_background_ocr: true, + ocr_page_segmentation_mode: 3, // PSM_AUTO - Fully automatic page segmentation, but no OSD (Tesseract's PSM_AUTO_OSD is mode 1) + ocr_engine_mode: 3, // OEM_DEFAULT - Default, based on what is available + ocr_min_confidence: 30.0, // Minimum confidence threshold (0-100) + ocr_dpi: 300, // Optimal DPI for OCR + ocr_enhance_contrast: true, // Enable contrast enhancement + ocr_remove_noise: true, // Enable noise removal + ocr_detect_orientation: true, // Enable orientation detection + ocr_whitelist_chars: None, // No character whitelist by default + ocr_blacklist_chars: None, // No character blacklist by default +
ocr_brightness_boost: 1.0, // Conservative brightness boost + ocr_contrast_multiplier: 1.2, // Conservative contrast enhancement + ocr_noise_reduction_level: 1, // Light noise reduction + ocr_sharpening_strength: 0.5, // Light sharpening + ocr_morphological_operations: false, // Conservative - no morphological ops by default + ocr_adaptive_threshold_window_size: 15, // Small window for adaptive threshold + ocr_histogram_equalization: false, // Conservative - no histogram equalization by default + ocr_upscale_factor: 1.0, // No upscaling by default + ocr_max_image_width: 3000, // Reasonable max width + ocr_max_image_height: 3000, // Reasonable max height + save_processed_images: false, // Conservative - don't save by default + ocr_quality_threshold_brightness: 0.3, // Conservative threshold + ocr_quality_threshold_contrast: 0.2, // Conservative threshold + ocr_quality_threshold_noise: 0.7, // Conservative threshold + ocr_quality_threshold_sharpness: 0.3, // Conservative threshold + ocr_skip_enhancement: false, // Allow enhancement by default + webdav_enabled: false, + webdav_server_url: None, + webdav_username: None, + webdav_password: None, + webdav_watch_folders: vec!["/Documents".to_string()], + webdav_file_extensions: vec![ + "pdf".to_string(), + "png".to_string(), + "jpg".to_string(), + "jpeg".to_string(), + "tiff".to_string(), + "bmp".to_string(), + "txt".to_string(), + ], + webdav_auto_sync: false, + webdav_sync_interval_minutes: 60, + created_at: chrono::Utc::now(), + updated_at: chrono::Utc::now(), // second, separate clock read: may be slightly later than created_at + } + } +} \ No newline at end of file diff --git a/src/models/source.rs b/src/models/source.rs new file mode 100644 index 0000000..618e257 --- /dev/null +++ b/src/models/source.rs @@ -0,0 +1,383 @@ +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use sqlx::FromRow; +use uuid::Uuid; +use utoipa::ToSchema; +use serde_json; + +use super::responses::DocumentResponse; + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash, 
ToSchema)] +pub enum SourceType { + #[serde(rename = "webdav")] + WebDAV, + #[serde(rename = "local_folder")] + LocalFolder, + #[serde(rename = "s3")] + S3, +} + +impl std::fmt::Display for SourceType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + SourceType::WebDAV => write!(f, "webdav"), + SourceType::LocalFolder => write!(f, "local_folder"), + SourceType::S3 => write!(f, "s3"), + } + } +} + +impl TryFrom for SourceType { + type Error = String; + + fn try_from(value: String) -> Result { + match value.as_str() { + "webdav" => Ok(SourceType::WebDAV), + "local_folder" => Ok(SourceType::LocalFolder), + "s3" => Ok(SourceType::S3), + _ => Err(format!("Invalid source type: {}", value)), + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, ToSchema)] +pub enum SourceStatus { + #[serde(rename = "idle")] + Idle, + #[serde(rename = "syncing")] + Syncing, + #[serde(rename = "error")] + Error, +} + +impl std::fmt::Display for SourceStatus { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + SourceStatus::Idle => write!(f, "idle"), + SourceStatus::Syncing => write!(f, "syncing"), + SourceStatus::Error => write!(f, "error"), + } + } +} + +impl TryFrom for SourceStatus { + type Error = String; + + fn try_from(value: String) -> Result>::Error> { + match value.as_str() { + "idle" => Ok(SourceStatus::Idle), + "syncing" => Ok(SourceStatus::Syncing), + "error" => Ok(SourceStatus::Error), + _ => Err(format!("Invalid source status: {}", value)), + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, FromRow, ToSchema)] +pub struct Source { + pub id: Uuid, + pub user_id: Uuid, + pub name: String, + #[sqlx(try_from = "String")] + pub source_type: SourceType, + pub enabled: bool, + pub config: serde_json::Value, + #[sqlx(try_from = "String")] + pub status: SourceStatus, + pub last_sync_at: Option>, + pub last_error: Option, + pub last_error_at: Option>, + pub total_files_synced: i64, + 
pub total_files_pending: i64, + pub total_size_bytes: i64, + pub created_at: DateTime, + pub updated_at: DateTime, + // Validation status tracking + #[sqlx(default)] + pub validation_status: Option, + #[sqlx(default)] + pub last_validation_at: Option>, + #[sqlx(default)] + pub validation_score: Option, // 0-100 health score + #[sqlx(default)] + pub validation_issues: Option, // JSON array of validation issues +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct SourceResponse { + pub id: Uuid, + pub name: String, + pub source_type: SourceType, + pub enabled: bool, + pub config: serde_json::Value, + pub status: SourceStatus, + pub last_sync_at: Option>, + pub last_error: Option, + pub last_error_at: Option>, + pub total_files_synced: i64, + pub total_files_pending: i64, + pub total_size_bytes: i64, + pub created_at: DateTime, + pub updated_at: DateTime, + /// Total number of documents/files currently stored from this source + #[serde(default)] + pub total_documents: i64, + /// Total number of documents that have been OCR'd from this source + #[serde(default)] + pub total_documents_ocr: i64, + /// Validation status and health score + #[serde(default)] + pub validation_status: Option, + #[serde(default)] + pub last_validation_at: Option>, + #[serde(default)] + pub validation_score: Option, + #[serde(default)] + pub validation_issues: Option, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct CreateSource { + pub name: String, + pub source_type: SourceType, + pub enabled: Option, + pub config: serde_json::Value, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct UpdateSource { + pub name: Option, + pub enabled: Option, + pub config: Option, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct SourceWithStats { + pub source: SourceResponse, + pub recent_documents: Vec, + pub sync_progress: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] +pub struct WebDAVSourceConfig { + pub 
server_url: String, + pub username: String, + pub password: String, + pub watch_folders: Vec, + pub file_extensions: Vec, + pub auto_sync: bool, + pub sync_interval_minutes: i32, + pub server_type: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] +pub struct LocalFolderSourceConfig { + pub watch_folders: Vec, + pub file_extensions: Vec, + pub auto_sync: bool, + pub sync_interval_minutes: i32, + pub recursive: bool, + pub follow_symlinks: bool, +} + +#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] +pub struct S3SourceConfig { + pub bucket_name: String, + pub region: String, + pub access_key_id: String, + pub secret_access_key: String, + pub endpoint_url: Option, // For S3-compatible services + pub prefix: Option, // Optional path prefix + pub watch_folders: Vec, // S3 prefixes to monitor + pub file_extensions: Vec, + pub auto_sync: bool, + pub sync_interval_minutes: i32, +} + +// WebDAV-related structs +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct WebDAVFolderInfo { + pub path: String, + pub total_files: i64, + pub supported_files: i64, + pub estimated_time_hours: f32, + pub total_size_mb: f64, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct WebDAVCrawlEstimate { + pub folders: Vec, + pub total_files: i64, + pub total_supported_files: i64, + pub total_estimated_time_hours: f32, + pub total_size_mb: f64, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct WebDAVTestConnection { + pub server_url: String, + pub username: String, + pub password: String, + pub server_type: Option, // "nextcloud", "owncloud", "generic" +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct WebDAVConnectionResult { + pub success: bool, + pub message: String, + pub server_version: Option, + pub server_type: Option, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct WebDAVSyncStatus { + pub is_running: bool, + pub last_sync: Option>, + pub files_processed: i64, + pub 
files_remaining: i64, + pub current_folder: Option, + pub errors: Vec, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct WebDAVSyncState { + pub id: Uuid, + pub user_id: Uuid, + pub last_sync_at: Option>, + pub sync_cursor: Option, + pub is_running: bool, + pub files_processed: i64, + pub files_remaining: i64, + pub current_folder: Option, + pub errors: Vec, + pub created_at: DateTime, + pub updated_at: DateTime, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct UpdateWebDAVSyncState { + pub last_sync_at: Option>, + pub sync_cursor: Option, + pub is_running: bool, + pub files_processed: i64, + pub files_remaining: i64, + pub current_folder: Option, + pub errors: Vec, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct WebDAVFile { + pub id: Uuid, + pub user_id: Uuid, + pub webdav_path: String, + pub etag: String, + pub last_modified: Option>, + pub file_size: i64, + pub mime_type: String, + pub document_id: Option, + pub sync_status: String, + pub sync_error: Option, + pub created_at: DateTime, + pub updated_at: DateTime, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct CreateWebDAVFile { + pub user_id: Uuid, + pub webdav_path: String, + pub etag: String, + pub last_modified: Option>, + pub file_size: i64, + pub mime_type: String, + pub document_id: Option, + pub sync_status: String, + pub sync_error: Option, +} + +#[derive(Debug, Serialize, Deserialize, FromRow)] +pub struct WebDAVDirectory { + pub id: Uuid, + pub user_id: Uuid, + pub directory_path: String, + pub directory_etag: String, + pub last_scanned_at: DateTime, + pub file_count: i64, + pub total_size_bytes: i64, + pub created_at: DateTime, + pub updated_at: DateTime, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct CreateWebDAVDirectory { + pub user_id: Uuid, + pub directory_path: String, + pub directory_etag: String, + pub file_count: i64, + pub total_size_bytes: i64, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct UpdateWebDAVDirectory { + pub 
directory_etag: String, + pub last_scanned_at: DateTime, + pub file_count: i64, + pub total_size_bytes: i64, +} + +// Notification-related structs +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct Notification { + pub id: Uuid, + pub user_id: Uuid, + pub notification_type: String, + pub title: String, + pub message: String, + pub read: bool, + pub action_url: Option, + pub metadata: Option, + pub created_at: DateTime, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct CreateNotification { + pub notification_type: String, + pub title: String, + pub message: String, + pub action_url: Option, + pub metadata: Option, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct NotificationSummary { + pub unread_count: i64, + pub recent_notifications: Vec, +} + +impl From for SourceResponse { + fn from(source: Source) -> Self { + Self { + id: source.id, + name: source.name, + source_type: source.source_type, + enabled: source.enabled, + config: source.config, + status: source.status, + last_sync_at: source.last_sync_at, + last_error: source.last_error, + last_error_at: source.last_error_at, + total_files_synced: source.total_files_synced, + total_files_pending: source.total_files_pending, + total_size_bytes: source.total_size_bytes, + created_at: source.created_at, + updated_at: source.updated_at, + // These will be populated separately when needed + total_documents: 0, + total_documents_ocr: 0, + // Validation fields + validation_status: source.validation_status, + last_validation_at: source.last_validation_at, + validation_score: source.validation_score, + validation_issues: source.validation_issues, + } + } +} \ No newline at end of file diff --git a/src/models/user.rs b/src/models/user.rs new file mode 100644 index 0000000..3ef36dd --- /dev/null +++ b/src/models/user.rs @@ -0,0 +1,131 @@ +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use sqlx::FromRow; +use uuid::Uuid; +use utoipa::ToSchema; + 
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, ToSchema)] +pub enum UserRole { + #[serde(rename = "admin")] + Admin, + #[serde(rename = "user")] + User, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, ToSchema)] +pub enum AuthProvider { + #[serde(rename = "local")] + Local, + #[serde(rename = "oidc")] + Oidc, +} + +impl std::fmt::Display for UserRole { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + UserRole::Admin => write!(f, "admin"), + UserRole::User => write!(f, "user"), + } + } +} + +impl TryFrom for UserRole { + type Error = String; + + fn try_from(value: String) -> Result { + match value.as_str() { + "admin" => Ok(UserRole::Admin), + "user" => Ok(UserRole::User), + _ => Err(format!("Invalid user role: {}", value)), + } + } +} + +impl std::fmt::Display for AuthProvider { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + AuthProvider::Local => write!(f, "local"), + AuthProvider::Oidc => write!(f, "oidc"), + } + } +} + +impl TryFrom for AuthProvider { + type Error = String; + + fn try_from(value: String) -> Result { + match value.as_str() { + "local" => Ok(AuthProvider::Local), + "oidc" => Ok(AuthProvider::Oidc), + _ => Err(format!("Invalid auth provider: {}", value)), + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, FromRow, ToSchema)] +pub struct User { + pub id: Uuid, + pub username: String, + pub email: String, + pub password_hash: Option, + #[sqlx(try_from = "String")] + pub role: UserRole, + pub created_at: DateTime, + pub updated_at: DateTime, + pub oidc_subject: Option, + pub oidc_issuer: Option, + pub oidc_email: Option, + #[sqlx(try_from = "String")] + pub auth_provider: AuthProvider, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct CreateUser { + pub username: String, + pub email: String, + pub password: String, + #[serde(default = "default_user_role")] + pub role: Option, +} + +fn default_user_role() 
-> Option { + Some(UserRole::User) +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct LoginRequest { + pub username: String, + pub password: String, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct LoginResponse { + pub token: String, + pub user: UserResponse, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct UserResponse { + pub id: Uuid, + pub username: String, + pub email: String, + pub role: UserRole, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +pub struct UpdateUser { + pub username: Option, + pub email: Option, + pub password: Option, +} + +impl From for UserResponse { + fn from(user: User) -> Self { + Self { + id: user.id, + username: user.username, + email: user.email, + role: user.role, + } + } +} \ No newline at end of file