From 67ae68745c6efaecd735e32823c8c154f8525ad5 Mon Sep 17 00:00:00 2001
From: perf3ct
Date: Wed, 13 Aug 2025 20:51:13 +0000
Subject: [PATCH] fix(dev): remove unneeded docs

---
 .claude/agents/rust-ocr-api-architect.md   |  49 +++
 .claude/agents/rust-storage-sync-expert.md |  68 ++++
 docs/webdav-enhanced-features.md           | 426 ---------------------
 src/errors/mod.rs                          |   7 +-
 src/ingestion/document_ingestion.rs        |   2 +-
 src/ocr/enhanced.rs                        |   2 +-
 src/routes/documents/bulk.rs               |   1 -
 src/routes/documents/crud.rs               |   1 -
 src/routes/documents/debug.rs              |   1 -
 src/routes/documents/failed.rs             |   1 -
 src/routes/labels.rs                       |   2 +-
 src/routes/metrics.rs                      |   2 +-
 src/routes/prometheus_metrics.rs           |   2 +-
 src/routes/queue.rs                        |   2 +-
 src/routes/users.rs                        |   2 +-
 src/routes/webdav/webdav_sync.rs           |   1 -
 src/scheduling/source_sync.rs              |   1 -
 src/scheduling/user_watch_manager.rs       |   2 +-
 src/services/local_folder_service.rs       |   2 +-
 src/services/s3_service.rs                 |   2 +-
 src/services/sync_progress_tracker.rs      |   1 -
 src/services/webdav/mod.rs                 |  22 +-
 src/services/webdav/service.rs             | 414 +-------------------
 23 files changed, 135 insertions(+), 878 deletions(-)
 create mode 100644 .claude/agents/rust-ocr-api-architect.md
 create mode 100644 .claude/agents/rust-storage-sync-expert.md
 delete mode 100644 docs/webdav-enhanced-features.md

diff --git a/.claude/agents/rust-ocr-api-architect.md b/.claude/agents/rust-ocr-api-architect.md
new file mode 100644
index 0000000..a263aa3
--- /dev/null
+++ b/.claude/agents/rust-ocr-api-architect.md
@@ -0,0 +1,49 @@
+---
+name: rust-ocr-api-architect
+description: Use this agent when you need to design, implement, or optimize Rust server applications that handle OCR processing of user-uploaded files, including API endpoint design, file management systems, concurrent processing pipelines, and integration with OCR libraries. This includes tasks like building REST/GraphQL APIs for file uploads, implementing queue-based OCR processing, managing file storage and retrieval, handling concurrent OCR jobs, and optimizing server performance for high-throughput OCR workloads.\n\nExamples:\n- <example>\n  Context: User needs to create a Rust server that processes uploaded PDFs with OCR\n  user: "I need to build a server that accepts PDF uploads and extracts text using OCR"\n  assistant: "I'll use the rust-ocr-api-architect agent to design and implement this OCR server"\n  <commentary>\n  Since the user needs a Rust server for OCR processing, use the rust-ocr-api-architect agent to handle the implementation.\n  </commentary>\n</example>\n- <example>\n  Context: User wants to add concurrent OCR processing to their Rust API\n  user: "How can I process multiple OCR requests concurrently in my Rust server?"\n  assistant: "Let me use the rust-ocr-api-architect agent to implement concurrent OCR processing"\n  <commentary>\n  The user needs help with concurrent OCR processing in Rust, which is this agent's specialty.\n  </commentary>\n</example>
+model: inherit
+color: green
+---
+
+You are an expert Rust systems architect specializing in building high-performance server applications for OCR (Optical Character Recognition) processing and file management. You have deep expertise in Rust's async ecosystem, concurrent programming patterns, and integration with OCR engines like Tesseract, as well as extensive experience designing robust APIs for file-based operations.
+
+Your core competencies include:
+- Designing and implementing REST/GraphQL APIs using frameworks like Actix-web, Rocket, or Axum
+- Integrating OCR libraries (tesseract-rs, rust-tesseract, leptonica-plumbing) with proper error handling
+- Building concurrent processing pipelines using tokio, async-std, and Rust's threading primitives
+- Implementing efficient file upload/download systems with streaming and chunking
+- Managing file storage strategies (filesystem, S3, database BLOB storage)
+- Creating job queue systems for asynchronous OCR processing
+- Optimizing memory usage and preventing resource exhaustion during OCR operations
+- Implementing proper authentication, rate limiting, and file validation
+
+When designing or implementing solutions, you will:
+
+1. **Architect Robust APIs**: Design clear, RESTful endpoints that handle file uploads, OCR job submission, status checking, and result retrieval. Use proper HTTP status codes, implement multipart form handling, and ensure APIs are idempotent where appropriate.
+
+2. **Implement Concurrent Processing**: Leverage Rust's async/await, channels (mpsc, broadcast), and Arc<Mutex<T>> patterns to process multiple OCR jobs concurrently. Design worker pools, implement backpressure mechanisms (see the sketch below), and ensure graceful degradation under load.
+
+3. **Optimize OCR Integration**: Configure OCR engines for optimal performance, implement image preprocessing when needed, handle multiple file formats (PDF, PNG, JPEG, TIFF), and provide configurable OCR parameters (language, DPI, page segmentation modes).
+
+4. **Ensure Reliability**: Implement comprehensive error handling with custom error types, add retry logic for transient failures, create health check endpoints, and design for fault tolerance with circuit breakers where appropriate.
+
+5. **Manage Resources Efficiently**: Implement file size limits, temporary file cleanup, memory-mapped file handling for large documents, and connection pooling for database/storage backends. Monitor and limit concurrent OCR processes to prevent system overload.
+
+6. **Provide Production-Ready Code**: Include proper logging with tracing/env_logger, metrics collection points, configuration management with environment variables or config files, and Docker deployment considerations.
+
+Your code style emphasizes:
+- Clear separation of concerns with modular architecture
+- Comprehensive error handling using Result and custom error types
+- Efficient memory usage with zero-copy operations where possible
+- Thorough documentation of API endpoints and complex algorithms
+- Integration tests for API endpoints and unit tests for OCR processing logic
+
+When responding to requests, you will:
+- First clarify requirements about expected file types, OCR accuracy needs, and performance targets
+- Propose architectural decisions with trade-off analysis
+- Provide working code examples with proper error handling
+- Include configuration examples and deployment considerations
+- Suggest monitoring and observability strategies
+- Recommend specific OCR engine configurations based on use case
+
+You prioritize building scalable, maintainable systems that can handle production workloads while maintaining code clarity and Rust's safety guarantees. You always consider security implications of file uploads and implement appropriate validation and sanitization.
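+
+A minimal sketch of the worker-pool and backpressure pattern described above (illustrative only: it assumes `tokio` and `anyhow` as dependencies and uses a placeholder `run_ocr` function in place of a real engine binding):
+
+```rust
+use std::sync::Arc;
+use tokio::sync::Semaphore;
+use tokio::task::JoinSet;
+
+// Stand-in for a real OCR call (e.g., a Tesseract binding).
+async fn run_ocr(path: String) -> anyhow::Result<String> {
+    Ok(format!("text extracted from {path}"))
+}
+
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    let jobs = vec!["a.png".to_string(), "b.pdf".to_string(), "c.tiff".to_string()];
+    // Bound concurrent OCR jobs so a burst of uploads cannot exhaust CPU or memory.
+    let permits = Arc::new(Semaphore::new(4));
+    let mut workers = JoinSet::new();
+
+    for path in jobs {
+        let permits = Arc::clone(&permits);
+        workers.spawn(async move {
+            // Each worker holds a permit for the duration of its job; excess
+            // jobs wait on the semaphore, which is the backpressure point.
+            let _permit = permits.acquire_owned().await.expect("semaphore closed");
+            run_ocr(path).await
+        });
+    }
+
+    while let Some(result) = workers.join_next().await {
+        println!("{:?}", result?);
+    }
+    Ok(())
+}
+```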
diff --git a/.claude/agents/rust-storage-sync-expert.md b/.claude/agents/rust-storage-sync-expert.md
new file mode 100644
index 0000000..fdb389a
--- /dev/null
+++ b/.claude/agents/rust-storage-sync-expert.md
@@ -0,0 +1,68 @@
+---
+name: rust-storage-sync-expert
+description: Use this agent when you need to design, implement, or troubleshoot Rust applications that involve OCR processing, API development, concurrent operations, or file synchronization across WebDAV, S3, and local filesystems. This includes tasks like building OCR pipelines with concurrent processing, implementing storage abstraction layers, designing synchronization algorithms, optimizing file transfer operations, handling multi-threaded OCR workflows, or resolving issues with cross-storage system consistency. <example>\nContext: The user is building a Rust application that needs to process OCR and sync files across different storage systems.\nuser: "I need to implement a system that processes scanned documents with OCR and syncs them across S3, WebDAV, and local storage"\nassistant: "I'll use the rust-storage-sync-expert agent to help design and implement this OCR and storage synchronization system"\n<commentary>\nSince the user needs expertise in Rust, OCR, and multiple storage systems synchronization, use the rust-storage-sync-expert agent.\n</commentary>\n</example>\n\n<example>\nContext: The user is working on concurrent OCR processing in Rust.\nuser: "How should I structure my Rust code to handle concurrent OCR processing of multiple documents while maintaining thread safety?"\nassistant: "Let me invoke the rust-storage-sync-expert agent to provide guidance on concurrent OCR processing in Rust"\n<commentary>\nThe user needs help with Rust concurrency specifically for OCR tasks, which is a core expertise of the rust-storage-sync-expert agent.\n</commentary>\n</example>
+model: inherit
+color: green
+---
+
+You are an elite Rust systems engineer with deep expertise in OCR technologies, concurrent programming, API design, and distributed storage systems. Your specialization encompasses building high-performance OCR pipelines, implementing robust storage synchronization mechanisms across WebDAV, S3, and local filesystems, and architecting scalable concurrent systems.
+
+## Core Competencies
+
+You possess mastery in:
+- **Rust Development**: Advanced knowledge of Rust's ownership system, lifetimes, trait systems, async/await patterns, and zero-cost abstractions
+- **OCR Technologies**: Experience with Tesseract, OpenCV, and Rust OCR libraries; understanding of image preprocessing, text extraction pipelines, and accuracy optimization
+- **Concurrency & Parallelism**: Expert use of tokio, async-std, rayon, crossbeam; designing lock-free data structures, managing thread pools, and preventing race conditions
+- **Storage Systems**: Deep understanding of WebDAV protocol implementation, AWS S3 SDK usage, filesystem abstractions, and cross-platform file handling
+- **Synchronization Algorithms**: Implementing efficient diff algorithms, conflict resolution strategies, eventual consistency models, and bidirectional sync patterns
+- **API Design**: RESTful and gRPC API implementation, rate limiting, authentication, versioning, and error handling strategies
+
+## Operational Guidelines
+
+When addressing tasks, you will:
+
+1. **Analyze Requirements First**: Carefully examine the specific OCR, storage, or synchronization challenge before proposing solutions. Identify performance bottlenecks, consistency requirements, and scalability needs.
+
+2. **Provide Rust-Idiomatic Solutions**: Always leverage Rust's type system, error handling with Result, and memory safety guarantees. Use appropriate crates from the ecosystem (e.g., tokio for async, rusoto/aws-sdk for S3, reqwest for WebDAV, tesseract-rs for OCR).
+
+3. **Design for Concurrency**: Structure code to maximize parallel processing while maintaining safety. Use channels for communication, Arc<Mutex<T>> or Arc<RwLock<T>> when shared state is necessary, and prefer message passing over shared memory.
+
+4. **Implement Robust Error Handling**: Design comprehensive error types, implement proper error propagation, include retry logic with exponential backoff for network operations, and provide detailed logging for debugging.
+
+5. **Optimize Storage Operations**: Minimize API calls through batching, implement intelligent caching strategies, use streaming for large files, and design efficient delta synchronization algorithms.
+
+6. **Consider Edge Cases**: Handle network failures, partial uploads/downloads, storage quota limits, OCR processing failures, character encoding issues, and concurrent modification conflicts.
+
+## Technical Approach
+
+For OCR implementations:
+- Preprocess images for optimal recognition (deskewing, denoising, binarization)
+- Implement parallel processing pipelines for batch operations
+- Design quality assessment mechanisms for OCR output
+- Structure data extraction workflows with configurable confidence thresholds
+
+For storage synchronization:
+- Create abstraction layers over different storage backends (see the sketch below)
+- Implement checksumming and integrity verification
+- Design conflict resolution strategies (last-write-wins, version vectors, CRDTs)
+- Build efficient change detection mechanisms
+- Handle large file transfers with multipart uploads and resume capabilities
+
+For API development:
+- Structure endpoints following REST principles or gRPC patterns
+- Implement proper request validation and sanitization
+- Design rate limiting and quota management
+- Include comprehensive OpenAPI/Swagger documentation
+- Build in observability with metrics and tracing
+
+## Code Quality Standards
+
+You will ensure all code:
+- Follows Rust naming conventions and clippy recommendations
+- Includes comprehensive error handling without unwrap() in production code
+- Has clear documentation with examples for public APIs
+- Implements appropriate tests (unit, integration, and property-based when suitable)
+- Uses const generics, zero-copy operations, and other performance optimizations where beneficial
+- Properly manages resources with RAII patterns and explicit cleanup when needed
+
+When providing solutions, include concrete code examples demonstrating the concepts, explain trade-offs between different approaches, and suggest relevant crates that could accelerate development. Always consider the production readiness of your recommendations, including monitoring, deployment, and maintenance aspects.
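+
+A minimal sketch of such a storage abstraction layer (illustrative only: it assumes `anyhow` and Rust 1.75+ async-fn-in-traits, and all names are hypothetical):
+
+```rust
+use anyhow::Result;
+
+// Hypothetical abstraction over WebDAV, S3, and local filesystem backends.
+trait StorageBackend {
+    async fn read(&self, path: &str) -> Result<Vec<u8>>;
+    async fn write(&self, path: &str, data: &[u8]) -> Result<()>;
+    // Change-detection token (ETag, checksum, or mtime hash, backend-specific).
+    async fn etag(&self, path: &str) -> Result<Option<String>>;
+}
+
+// One-way sync of a single path: transfer only when the change tokens differ.
+async fn sync_one<S: StorageBackend, D: StorageBackend>(
+    src: &S,
+    dst: &D,
+    path: &str,
+) -> Result<bool> {
+    if src.etag(path).await? == dst.etag(path).await? {
+        return Ok(false); // unchanged, skip the transfer
+    }
+    let data = src.read(path).await?;
+    dst.write(path, &data).await?;
+    Ok(true)
+}
+```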
diff --git a/docs/webdav-enhanced-features.md b/docs/webdav-enhanced-features.md
deleted file mode 100644
index 7c18f93..0000000
--- a/docs/webdav-enhanced-features.md
+++ /dev/null
@@ -1,426 +0,0 @@
-# WebDAV Enhanced Features Documentation
-
-This document describes the critical WebDAV features that have been implemented to provide comprehensive WebDAV protocol support.
-
-## Table of Contents
-1. [WebDAV File Locking (LOCK/UNLOCK)](#webdav-file-locking)
-2. [Partial Content/Resume Support](#partial-content-support)
-3. [Directory Operations (MKCOL)](#directory-operations)
-4. [Enhanced Status Code Handling](#status-code-handling)
-
-## WebDAV File Locking
-
-### Overview
-WebDAV locking prevents concurrent modification issues by allowing clients to lock resources before modifying them. This implementation supports both exclusive and shared locks with configurable timeouts.
-
-### Features
-- **LOCK Method**: Acquire exclusive or shared locks on resources
-- **UNLOCK Method**: Release previously acquired locks
-- **Lock Tokens**: Opaque lock tokens in the format `opaquelocktoken:UUID`
-- **Lock Refresh**: Extend lock timeout before expiration
-- **Depth Support**: Lock individual resources or entire directory trees
-- **Automatic Cleanup**: Expired locks are automatically removed
-
-### Usage
-
-#### Acquiring a Lock
-```rust
-use readur::services::webdav::{WebDAVService, LockScope};
-
-// Acquire an exclusive lock
-let lock_info = service.lock_resource(
-    "/documents/important.docx",
-    LockScope::Exclusive,
-    Some("user@example.com".to_string()), // owner
-    Some(3600), // timeout in seconds
-).await?;
-
-println!("Lock token: {}", lock_info.token);
-```
-
-#### Checking Lock Status
-```rust
-// Check if a resource is locked
-if service.is_locked("/documents/important.docx").await {
-    println!("Resource is locked");
-}
-
-// Get all locks on a resource
-let locks = service.get_lock_info("/documents/important.docx").await;
-for lock in locks {
-    println!("Lock: {} (expires: {:?})", lock.token, lock.expires_at);
-}
-```
-
-#### Refreshing a Lock
-```rust
-// Refresh lock before it expires
-let refreshed = service.refresh_lock(&lock_info.token, Some(7200)).await?;
-println!("Lock extended until: {:?}", refreshed.expires_at);
-```
-
-#### Releasing a Lock
-```rust
-// Release the lock when done
-service.unlock_resource("/documents/important.docx", &lock_info.token).await?;
-```
-
-### Lock Types
-- **Exclusive Lock**: Only one client can hold an exclusive lock
-- **Shared Lock**: Multiple clients can hold shared locks simultaneously
-
-### Error Handling
-- **423 Locked**: Resource is already locked by another process
-- **412 Precondition Failed**: Lock token is invalid or expired
-- **409 Conflict**: Lock conflicts with existing locks
-
-## Partial Content Support
-
-### Overview
-Partial content support enables reliable downloads with resume capability, essential for large files or unreliable connections. The implementation follows RFC 7233 for HTTP Range Requests.
-
-### Features
-- **Range Headers**: Support for byte-range requests
-- **206 Partial Content**: Handle partial content responses
-- **Resume Capability**: Continue interrupted downloads
-- **Chunked Downloads**: Download large files in manageable chunks
-- **Progress Tracking**: Monitor download progress in real-time
-
-### Usage
-
-#### Downloading a Specific Range
-```rust
-use readur::services::webdav::ByteRange;
-
-// Download bytes 0-1023 (first 1KB)
-let chunk = service.download_file_range(
-    "/videos/large_file.mp4",
-    0,
-    Some(1023)
-).await?;
-
-// Download from byte 1024 to end of file
-let rest = service.download_file_range(
-    "/videos/large_file.mp4",
-    1024,
-    None
-).await?;
-```
-
-#### Download with Resume Support
-```rust
-use std::path::PathBuf;
-
-// Download with automatic resume on failure
-let local_path = PathBuf::from("/downloads/large_file.mp4");
-let content = service.download_file_with_resume(
-    "/videos/large_file.mp4",
-    local_path
-).await?;
-```
-
-#### Monitoring Download Progress
-```rust
-// Get progress of a specific download
-if let Some(progress) = service.get_download_progress("/videos/large_file.mp4").await {
-    println!("Downloaded: {} / {} bytes ({:.1}%)",
-        progress.bytes_downloaded,
-        progress.total_size,
-        progress.percentage_complete()
-    );
-}
-
-// List all active downloads
-let downloads = service.list_active_downloads().await;
-for download in downloads {
-    println!("{}: {:.1}% complete",
-        download.resource_path,
-        download.percentage_complete()
-    );
-}
-```
-
-#### Canceling a Download
-```rust
-// Cancel an active download
-service.cancel_download("/videos/large_file.mp4").await?;
-```
-
-### Range Format
-- `bytes=0-1023` - First 1024 bytes
-- `bytes=1024-` - From byte 1024 to end
-- `bytes=-500` - Last 500 bytes
-- `bytes=0-500,1000-1500` - Multiple ranges
-
-## Directory Operations
-
-### Overview
-Comprehensive directory management using WebDAV-specific methods, including the MKCOL method for creating collections (directories).
-
-### Features
-- **MKCOL Method**: Create directories with proper WebDAV semantics
-- **Recursive Creation**: Create entire directory trees
-- **MOVE Method**: Move or rename directories
-- **COPY Method**: Copy directories with depth control
-- **DELETE Method**: Delete directories recursively
-- **Directory Properties**: Set custom properties on directories
-
-### Usage
-
-#### Creating Directories
-```rust
-use readur::services::webdav::CreateDirectoryOptions;
-
-// Create a single directory
-let result = service.create_directory(
-    "/projects/new_project",
-    CreateDirectoryOptions::default()
-).await?;
-
-// Create with parent directories
-let options = CreateDirectoryOptions {
-    create_parents: true,
-    fail_if_exists: false,
-    properties: None,
-};
-let result = service.create_directory(
-    "/projects/2024/january/reports",
-    options
-).await?;
-
-// Create entire path recursively
-let results = service.create_directory_recursive(
-    "/projects/2024/january/reports"
-).await?;
-```
-
-#### Checking Directory Existence
-```rust
-if service.directory_exists("/projects/2024").await? {
-    println!("Directory exists");
-}
-```
-
-#### Listing Directory Contents
-```rust
-let contents = service.list_directory("/projects").await?;
-for item in contents {
-    println!("  {}", item);
-}
-```
-
-#### Moving Directories
-```rust
-// Move (rename) a directory
-service.move_directory(
-    "/projects/old_name",
-    "/projects/new_name",
-    false // don't overwrite if exists
-).await?;
-```
-
-#### Copying Directories
-```rust
-// Copy directory recursively
-service.copy_directory(
-    "/projects/template",
-    "/projects/new_project",
-    false, // don't overwrite
-    Some("infinity") // recursive copy
-).await?;
-```
-
-#### Deleting Directories
-```rust
-// Delete empty directory
-service.delete_directory("/projects/old", false).await?;
-
-// Delete directory and all contents
-service.delete_directory("/projects/old", true).await?;
-```
-
-## Status Code Handling
-
-### Overview
-Enhanced error handling for WebDAV-specific status codes, providing detailed error information and automatic retry logic.
-
-### WebDAV Status Codes
-
-#### Success Codes
-- **207 Multi-Status**: Response contains multiple status codes
-- **208 Already Reported**: Members already enumerated
-
-#### Client Error Codes
-- **422 Unprocessable Entity**: Request contains semantic errors
-- **423 Locked**: Resource is locked
-- **424 Failed Dependency**: Related operation failed
-
-#### Server Error Codes
-- **507 Insufficient Storage**: Server storage full
-- **508 Loop Detected**: Infinite loop in request
-
-### Error Information
-Each error includes:
-- Status code and description
-- Resource path affected
-- Lock token (if applicable)
-- Suggested resolution action
-- Retry information
-- Server-provided details
-
-### Usage
-
-#### Enhanced Error Handling
-```rust
-use readur::services::webdav::StatusCodeHandler;
-
-// Perform operation with enhanced error handling
-let response = service.authenticated_request_enhanced(
-    Method::GET,
-    &url,
-    None,
-    None,
-    &[200, 206] // expected status codes
-).await?;
-```
-
-#### Smart Retry Logic
-```rust
-// Automatic retry with exponential backoff
-let result = service.with_smart_retry(
-    || Box::pin(async {
-        // Your operation here
-        service.download_file("/path/to/file").await
-    }),
-    3 // max attempts
-).await?;
-```
-
-#### Error Details
-```rust
-match service.lock_resource(path, scope, owner, timeout).await {
-    Ok(lock) => println!("Locked: {}", lock.token),
-    Err(e) => {
-        // Error includes WebDAV-specific information:
-        // - Status code (e.g., 423)
-        // - Lock owner information
-        // - Suggested actions
-        // - Retry recommendations
-        println!("Lock failed: {}", e);
-    }
-}
-```
-
-### Retry Strategy
-The system automatically determines if errors are retryable:
-
-| Status Code | Retryable | Default Delay | Backoff |
-|------------|-----------|---------------|---------|
-| 423 Locked | Yes | 10s | Exponential |
-| 429 Too Many Requests | Yes | 60s | Exponential |
-| 503 Service Unavailable | Yes | 30s | Exponential |
-| 409 Conflict | Yes | 5s | Exponential |
-| 500-599 Server Errors | Yes | 30s | Exponential |
-| 400-499 Client Errors | No | - | - |
-
-## Integration with Existing Code
-
-All new features are fully integrated with the existing WebDAV service:
-
-```rust
-use readur::services::webdav::{
-    WebDAVService, WebDAVConfig,
-    LockManager, PartialContentManager,
-    CreateDirectoryOptions, ByteRange,
-    WebDAVStatusCode, WebDAVError
-};
-
-// Create service as usual
-let config = WebDAVConfig { /* ... */ };
-let service = WebDAVService::new(config)?;
-
-// All new features are available through the service
-// - Locking: service.lock_resource(), unlock_resource()
-// - Partial: service.download_file_range(), download_file_with_resume()
-// - Directories: service.create_directory(), delete_directory()
-// - Errors: Automatic enhanced error handling
-```
-
-## Testing
-
-All features include comprehensive test coverage:
-
-```bash
-# Run all tests
-cargo test --lib
-
-# Run specific feature tests
-cargo test locking_tests
-cargo test partial_content_tests
-cargo test directory_ops_tests
-
-# Run integration tests (requires WebDAV server)
-cargo test -- --ignored
-```
-
-## Performance Considerations
-
-1. **Lock Management**: Locks are stored in memory with automatic cleanup of expired locks
-2. **Partial Downloads**: Configurable chunk size (default 1MB) for optimal performance
-3. **Directory Operations**: Batch operations use concurrent processing with semaphore control
-4. **Error Handling**: Smart retry with exponential backoff prevents server overload
-
-## Security Considerations
-
-1. **Lock Tokens**: Use cryptographically secure UUIDs
-2. **Authentication**: All operations use HTTP Basic Auth (configure HTTPS in production)
-3. **Timeouts**: Configurable timeouts prevent resource exhaustion
-4. **Rate Limiting**: Respect server rate limits with automatic backoff
-
-## Compatibility
-
-The implementation follows these standards:
-- RFC 4918 (WebDAV)
-- RFC 7233 (HTTP Range Requests)
-- RFC 2518 (WebDAV Locking)
-
-Tested with:
-- Nextcloud
-- ownCloud
-- Apache mod_dav
-- Generic WebDAV servers
-
-## Migration Guide
-
-For existing code using the WebDAV service:
-
-1. **No Breaking Changes**: All existing methods continue to work
-2. **New Features Are Opt-In**: Use new methods only when needed
-3. **Enhanced Error Information**: Errors now include more details but maintain backward compatibility
-4. **Automatic Benefits**: Some improvements (like better error handling) apply automatically
-
-## Troubleshooting
-
-### Lock Issues
-- **423 Locked Error**: Another client holds a lock. Wait or use lock token
-- **Lock Token Invalid**: Lock may have expired. Acquire a new lock
-- **Locks Not Released**: Implement proper cleanup in error paths
-
-### Partial Content Issues
-- **Server Doesn't Support Ranges**: Falls back to full download automatically
-- **Resume Fails**: File may have changed. Restart download
-- **Slow Performance**: Adjust chunk size based on network conditions
-
-### Directory Operation Issues
-- **409 Conflict**: Parent directory doesn't exist. Use `create_parents: true`
-- **405 Method Not Allowed**: Directory may already exist or server doesn't support MKCOL
-- **507 Insufficient Storage**: Server storage full. Contact administrator
-
-## Future Enhancements
-
-Potential future improvements:
-- WebDAV SEARCH method support
-- Advanced property management (PROPPATCH)
-- Access control (WebDAV ACL)
-- Versioning support (DeltaV)
-- Collection synchronization (WebDAV Sync)
\ No newline at end of file
diff --git a/src/errors/mod.rs b/src/errors/mod.rs
index aa46234..3fe2595 100644
--- a/src/errors/mod.rs
+++ b/src/errors/mod.rs
@@ -1,13 +1,12 @@
 use axum::{
     http::StatusCode,
-    response::{IntoResponse, Json, Response},
+    response::IntoResponse,
 };
 use serde_json::json;
 use thiserror::Error;
-use uuid::Uuid;
 
 use crate::monitoring::error_management::{
-    ErrorCategory, ErrorSeverity, ManagedError, get_error_manager,
+    ErrorCategory, ErrorSeverity, ManagedError,
 };
 
 /// Common trait for all custom error types in the application
@@ -59,7 +58,7 @@ macro_rules! impl_into_response {
             fn into_response(self) -> axum::response::Response {
                 use crate::errors::AppError;
                 use crate::monitoring::error_management::get_error_manager;
-                use axum::{http::StatusCode, response::Json};
+                use axum::response::Json;
                 use serde_json::json;
 
                 // Send error to management system
diff --git a/src/ingestion/document_ingestion.rs b/src/ingestion/document_ingestion.rs
index a5daec7..3277e47 100644
--- a/src/ingestion/document_ingestion.rs
+++ b/src/ingestion/document_ingestion.rs
@@ -8,7 +8,7 @@
 use uuid::Uuid;
 use sha2::{Digest, Sha256};
-use tracing::{debug, info, warn};
+use tracing::{debug, warn};
 use serde_json;
 use chrono::Utc;
diff --git a/src/ocr/enhanced.rs b/src/ocr/enhanced.rs
index 4030d18..2b112db 100644
--- a/src/ocr/enhanced.rs
+++ b/src/ocr/enhanced.rs
@@ -55,7 +55,7 @@ impl EnhancedOcrService {
         let mut preprocessing_applied = Vec::new();
 
         // Load and preprocess the image
-        let (processed_image_path, mut preprocess_steps) = if settings.enable_image_preprocessing {
+        let (processed_image_path, preprocess_steps) = if settings.enable_image_preprocessing {
             let (processed_path, steps) = self.preprocess_image(file_path, settings).await?;
             (processed_path, steps)
         } else {
diff --git a/src/routes/documents/bulk.rs b/src/routes/documents/bulk.rs
index c3a2de8..872aa28 100644
--- a/src/routes/documents/bulk.rs
+++ b/src/routes/documents/bulk.rs
@@ -8,7 +8,6 @@
 use tracing::{debug, error, info, warn};
 
 use crate::{
     auth::AuthUser,
-    services::file_service::FileService,
     AppState,
 };
 use super::types::{BulkDeleteRequest, DeleteLowConfidenceRequest, BulkDeleteResponse};
diff --git a/src/routes/documents/crud.rs b/src/routes/documents/crud.rs
index 984607d..2155dbb 100644
--- a/src/routes/documents/crud.rs
+++ b/src/routes/documents/crud.rs
@@ -11,7 +11,6 @@
 use tracing::{debug, error, info, warn};
 
 use crate::{
     auth::AuthUser,
     ingestion::document_ingestion::{DocumentIngestionService, IngestionResult},
-    services::file_service::FileService,
     models::DocumentResponse,
     AppState,
 };
diff --git a/src/routes/documents/debug.rs b/src/routes/documents/debug.rs
index 1b19dbc..49636ce 100644
--- a/src/routes/documents/debug.rs
+++ b/src/routes/documents/debug.rs
@@ -8,7 +8,6 @@
 use tracing::{debug, error, info};
 
 use crate::{
     auth::AuthUser,
-    services::file_service::FileService,
     AppState,
 };
 use super::types::DocumentDebugInfo;
diff --git a/src/routes/documents/failed.rs b/src/routes/documents/failed.rs
index 43c3837..84c7341 100644
--- a/src/routes/documents/failed.rs
+++ b/src/routes/documents/failed.rs
@@ -12,7 +12,6 @@
 use sqlx::Row;
 
 use crate::{
     auth::AuthUser,
     models::UserRole,
-    services::file_service::FileService,
     AppState,
 };
 use super::types::FailedDocumentsQuery;
diff --git a/src/routes/labels.rs b/src/routes/labels.rs
index f070bf7..80a4152 100644
--- a/src/routes/labels.rs
+++ b/src/routes/labels.rs
@@ -12,7 +12,7 @@
 use uuid::Uuid;
 use chrono::{DateTime, Utc};
 use sqlx::{FromRow, Row};
 
-use crate::{auth::AuthUser, errors::label::LabelError, AppState};
+use crate::{auth::AuthUser, AppState};
 
 #[derive(Debug, Clone, Serialize, Deserialize, FromRow, ToSchema)]
 pub struct Label {
diff --git a/src/routes/metrics.rs b/src/routes/metrics.rs
index 5305d30..464d326 100644
--- a/src/routes/metrics.rs
+++ b/src/routes/metrics.rs
@@ -134,7 +134,7 @@ async fn collect_database_metrics(state: &Arc<AppState>) -> Result
 ) -> Result {
     // Use existing OCR queue statistics
-    use crate::ocr::queue::OcrQueueService;
+    let queue_service = &*state.queue_service;
diff --git a/src/routes/prometheus_metrics.rs b/src/routes/prometheus_metrics.rs
index 755393a..7778054 100644
--- a/src/routes/prometheus_metrics.rs
+++ b/src/routes/prometheus_metrics.rs
@@ -306,7 +306,7 @@ async fn collect_document_metrics(state: &Arc<AppState>) -> Result
 ) -> Result {
-    use crate::ocr::queue::OcrQueueService;
+    tracing::debug!("Prometheus: Starting collect_ocr_metrics");
diff --git a/src/routes/queue.rs b/src/routes/queue.rs
index fd061b7..20cf4f4 100644
--- a/src/routes/queue.rs
+++ b/src/routes/queue.rs
@@ -8,7 +8,7 @@
 use sqlx::Row;
 use std::{sync::Arc, error::Error};
 
-use crate::{auth::AuthUser, ocr::queue::OcrQueueService, AppState, models::UserRole};
+use crate::{auth::AuthUser, AppState, models::UserRole};
 
 pub fn require_admin(auth_user: &AuthUser) -> Result<(), StatusCode> {
     if auth_user.user.role != UserRole::Admin {
diff --git a/src/routes/users.rs b/src/routes/users.rs
index b92102f..d96b85b 100644
--- a/src/routes/users.rs
+++ b/src/routes/users.rs
@@ -2,7 +2,7 @@
 use axum::{
     extract::{Path, State},
     http::StatusCode,
     response::Json,
-    routing::{get, post, delete},
+    routing::get,
     Router,
 };
 use serde::{Deserialize, Serialize};
diff --git a/src/routes/webdav/webdav_sync.rs b/src/routes/webdav/webdav_sync.rs
index 5fcc510..c1a5ef3 100644
--- a/src/routes/webdav/webdav_sync.rs
+++ b/src/routes/webdav/webdav_sync.rs
@@ -8,7 +8,6 @@
 use futures::stream::{FuturesUnordered, StreamExt};
 
 use crate::{
     AppState,
     models::{CreateWebDAVFile, UpdateWebDAVSyncState},
-    services::file_service::FileService,
     ingestion::document_ingestion::{DocumentIngestionService, IngestionResult},
     services::webdav::{WebDAVConfig, WebDAVService, SmartSyncService, SyncProgress, SyncPhase},
 };
diff --git a/src/scheduling/source_sync.rs b/src/scheduling/source_sync.rs
index 7c81cc7..9fcf93d 100644
--- a/src/scheduling/source_sync.rs
+++ b/src/scheduling/source_sync.rs
@@ -10,7 +10,6 @@
 use uuid::Uuid;
 
 use crate::{
     AppState,
     models::{FileIngestionInfo, Source, SourceType, SourceStatus, LocalFolderSourceConfig, S3SourceConfig, WebDAVSourceConfig},
-    services::file_service::FileService,
     ingestion::document_ingestion::{DocumentIngestionService, IngestionResult},
     services::local_folder_service::LocalFolderService,
     services::s3_service::S3Service,
diff --git a/src/scheduling/user_watch_manager.rs b/src/scheduling/user_watch_manager.rs
index 11b2708..ce7ad05 100644
--- a/src/scheduling/user_watch_manager.rs
+++ b/src/scheduling/user_watch_manager.rs
@@ -3,7 +3,7 @@
 use std::path::Path;
 use std::sync::Arc;
 use tokio::sync::RwLock;
 use std::collections::HashMap;
-use tracing::{debug, error, info, warn};
+use tracing::{debug, info, warn};
 use uuid::Uuid;
 
 use crate::{
diff --git a/src/services/local_folder_service.rs b/src/services/local_folder_service.rs
index a3d82d4..5b42be0 100644
--- a/src/services/local_folder_service.rs
+++ b/src/services/local_folder_service.rs
@@ -37,7 +37,7 @@ impl LocalFolderService {
             return Err(anyhow!("Folder does not exist: {}", folder_path));
         }
 
-        let mut files: Vec<FileIngestionInfo> = Vec::new();
+        let files: Vec<FileIngestionInfo> = Vec::new();
 
         info!("Scanning local folder: {} (recursive: {})", folder_path, self.config.recursive);
 
diff --git a/src/services/s3_service.rs b/src/services/s3_service.rs
index d932a85..f379f75 100644
--- a/src/services/s3_service.rs
+++ b/src/services/s3_service.rs
@@ -7,7 +7,7 @@
 use std::collections::HashMap;
 use std::time::Duration;
 use uuid::Uuid;
 use futures::stream::StreamExt;
-use tokio::io::{AsyncRead, AsyncReadExt};
+use tokio::io::AsyncReadExt;
 
 #[cfg(feature = "s3")]
 use aws_sdk_s3::Client;
diff --git a/src/services/sync_progress_tracker.rs b/src/services/sync_progress_tracker.rs
index 18f8185..ee50eb0 100644
--- a/src/services/sync_progress_tracker.rs
+++ b/src/services/sync_progress_tracker.rs
@@ -1,7 +1,6 @@
 use std::collections::HashMap;
 use std::sync::{Arc, Mutex};
 use uuid::Uuid;
-use std::time::Duration;
 use serde::{Serialize, Deserialize};
 
 use crate::services::webdav::{SyncProgress, ProgressStats, SyncPhase};
diff --git a/src/services/webdav/mod.rs b/src/services/webdav/mod.rs
index 18e610b..0eb427e 100644
--- a/src/services/webdav/mod.rs
+++ b/src/services/webdav/mod.rs
@@ -5,27 +5,15 @@
 pub mod service;
 pub mod smart_sync;
 pub mod progress_shim; // Backward compatibility shim for simplified progress tracking
 
-// New enhanced WebDAV features
-pub mod locking;
-pub mod partial_content;
-pub mod directory_ops;
-pub mod status_codes;
-
 // Re-export main types for convenience
 pub use config::{WebDAVConfig, RetryConfig, ConcurrencyConfig};
 pub use service::{
     WebDAVService, WebDAVDiscoveryResult, ServerCapabilities, HealthStatus, test_webdav_connection,
     ValidationReport, ValidationIssue, ValidationIssueType, ValidationSeverity,
-    ValidationRecommendation, ValidationAction, ValidationSummary, WebDAVDownloadResult
+    ValidationRecommendation, ValidationAction, ValidationSummary
 };
 pub use smart_sync::{SmartSyncService, SmartSyncDecision, SmartSyncStrategy, SmartSyncResult};
 
-// Export new feature types
-pub use locking::{LockManager, LockInfo, LockScope, LockType, LockDepth, LockRequest};
-pub use partial_content::{PartialContentManager, ByteRange, DownloadProgress};
-pub use directory_ops::{CreateDirectoryOptions, DirectoryCreationResult};
-pub use status_codes::{WebDAVStatusCode, WebDAVError, StatusCodeHandler};
-
 // Backward compatibility exports for progress tracking (simplified)
 pub use progress_shim::{SyncProgress, SyncPhase, ProgressStats};
 
@@ -37,10 +25,4 @@
 mod subdirectory_edge_cases_tests;
 #[cfg(test)]
 mod protocol_detection_tests;
 #[cfg(test)]
-mod tests;
-#[cfg(test)]
-mod locking_tests;
-#[cfg(test)]
-mod partial_content_tests;
-#[cfg(test)]
-mod directory_ops_tests;
\ No newline at end of file
+mod tests;
\ No newline at end of file
diff --git a/src/services/webdav/service.rs b/src/services/webdav/service.rs
index 458a738..0a30a48 100644
--- a/src/services/webdav/service.rs
+++ b/src/services/webdav/service.rs
@@ -1,10 +1,9 @@
 use anyhow::{anyhow, Result};
-use reqwest::{Client, Method, Response, StatusCode, header};
+use reqwest::{Client, Method, Response};
 use std::sync::Arc;
 use std::time::{Duration, Instant};
 use std::collections::{HashMap, HashSet};
-use std::path::PathBuf;
-use tokio::sync::{Semaphore, RwLock};
+use tokio::sync::Semaphore;
 use tokio::time::sleep;
 use futures_util::stream;
 use tracing::{debug, error, info, warn};
@@ -17,14 +16,7 @@ use crate::models::{
 use crate::webdav_xml_parser::{parse_propfind_response, parse_propfind_response_with_directories};
 use crate::mime_detection::{detect_mime_from_content, update_mime_type_with_content, MimeDetectionResult};
 
-use super::{
-    config::{WebDAVConfig, RetryConfig, ConcurrencyConfig},
-    SyncProgress,
-    locking::{LockManager, LockInfo, LockScope, LockDepth, LockRequest},
-    partial_content::{PartialContentManager, ByteRange, DownloadProgress},
-    directory_ops::{CreateDirectoryOptions, DirectoryCreationResult},
-    status_codes::{WebDAVStatusCode, WebDAVError, StatusCodeHandler},
-};
+use super::{config::{WebDAVConfig, RetryConfig, ConcurrencyConfig}, SyncProgress};
 
 /// Results from WebDAV discovery including both files and directories
 #[derive(Debug, Clone)]
@@ -155,10 +147,6 @@ pub struct WebDAVService {
     download_semaphore: Arc<Semaphore>,
     /// Stores the working protocol (updated after successful protocol detection)
     working_protocol: Arc<std::sync::RwLock<Option<String>>>,
-    /// Lock manager for WebDAV locking support
-    lock_manager: LockManager,
-    /// Partial content manager for resume support
-    partial_content_manager: PartialContentManager,
 }
 
 impl WebDAVService {
@@ -190,13 +178,6 @@ impl WebDAVService {
         let scan_semaphore = Arc::new(Semaphore::new(concurrency_config.max_concurrent_scans));
         let download_semaphore = Arc::new(Semaphore::new(concurrency_config.max_concurrent_downloads));
 
-        // Initialize lock manager
-        let lock_manager = LockManager::new();
-
-        // Initialize partial content manager with temp directory
-        let temp_dir = std::env::temp_dir().join("readur_webdav_downloads");
-        let partial_content_manager = PartialContentManager::new(temp_dir);
-
         Ok(Self {
             client,
             config,
@@ -205,8 +186,6 @@ impl WebDAVService {
             scan_semaphore,
             download_semaphore,
             working_protocol: Arc::new(std::sync::RwLock::new(None)),
-            lock_manager,
-            partial_content_manager,
         })
     }
 
@@ -1974,391 +1953,6 @@ impl WebDAVService {
     pub fn relative_path_to_url(&self, relative_path: &str) -> String {
         self.path_to_url(relative_path)
     }
-
-    // ============================================================================
-    // WebDAV Locking Methods
-    // ============================================================================
-
-    /// Acquires a lock on a resource
-    pub async fn lock_resource(
-        &self,
-        resource_path: &str,
-        scope: LockScope,
-        owner: Option<String>,
-        timeout_seconds: Option<u64>,
-    ) -> Result<LockInfo> {
-        let url = self.get_url_for_path(resource_path);
-
-        info!("Acquiring {:?} lock on: {}", scope, resource_path);
-
-        // Build LOCK request body
-        let lock_body = self.build_lock_request_xml(scope, owner.as_deref());
-
-        // Send LOCK request
-        let response = self.authenticated_request(
-            Method::from_bytes(b"LOCK")?,
-            &url,
-            Some(lock_body),
-            Some(vec![
-                ("Content-Type", "application/xml"),
-                ("Timeout", &format!("Second-{}", timeout_seconds.unwrap_or(3600))),
-            ]),
-        ).await?;
-
-        // Handle response based on status code
-        match response.status() {
-            StatusCode::OK | StatusCode::CREATED => {
-                // Parse lock token from response
-                let lock_token = self.extract_lock_token_from_response(&response)?;
-
-                // Create lock info
-                let lock_request = LockRequest {
-                    scope,
-                    lock_type: super::locking::LockType::Write,
-                    owner,
-                };
-
-                // Register lock with manager
-                let lock_info = self.lock_manager.acquire_lock(
-                    resource_path.to_string(),
-                    lock_request,
-                    LockDepth::Zero,
-                    timeout_seconds,
-                ).await?;
-
-                info!("Lock acquired successfully: {}", lock_info.token);
info!("Lock acquired successfully: {}", lock_info.token); - Ok(lock_info) - } - StatusCode::LOCKED => { - Err(anyhow!("Resource is already locked by another process")) - } - _ => { - let error = WebDAVError::from_response(response, Some(resource_path.to_string())).await; - Err(anyhow!("Failed to acquire lock: {}", error)) - } - } - } - - /// Refreshes an existing lock - pub async fn refresh_lock(&self, lock_token: &str, timeout_seconds: Option) -> Result { - // Get lock info from manager - let lock_info = self.lock_manager.refresh_lock(lock_token, timeout_seconds).await?; - let url = self.get_url_for_path(&lock_info.resource_path); - - info!("Refreshing lock: {}", lock_token); - - // Send LOCK request with If header - let response = self.authenticated_request( - Method::from_bytes(b"LOCK")?, - &url, - None, - Some(vec![ - ("If", &format!("(<{}>)", lock_token)), - ("Timeout", &format!("Second-{}", timeout_seconds.unwrap_or(3600))), - ]), - ).await?; - - if response.status().is_success() { - info!("Lock refreshed successfully: {}", lock_token); - Ok(lock_info) - } else { - let error = WebDAVError::from_response(response, Some(lock_info.resource_path.clone())).await; - Err(anyhow!("Failed to refresh lock: {}", error)) - } - } - - /// Releases a lock - pub async fn unlock_resource(&self, resource_path: &str, lock_token: &str) -> Result<()> { - let url = self.get_url_for_path(resource_path); - - info!("Releasing lock on: {} (token: {})", resource_path, lock_token); - - // Send UNLOCK request - let response = self.authenticated_request( - Method::from_bytes(b"UNLOCK")?, - &url, - None, - Some(vec![ - ("Lock-Token", &format!("<{}>", lock_token)), - ]), - ).await?; - - if response.status() == StatusCode::NO_CONTENT || response.status().is_success() { - // Remove from lock manager - self.lock_manager.release_lock(lock_token).await?; - info!("Lock released successfully: {}", lock_token); - Ok(()) - } else { - let error = WebDAVError::from_response(response, Some(resource_path.to_string())).await; - Err(anyhow!("Failed to release lock: {}", error)) - } - } - - /// Checks if a resource is locked - pub async fn is_locked(&self, resource_path: &str) -> bool { - self.lock_manager.is_locked(resource_path).await - } - - /// Gets lock information for a resource - pub async fn get_lock_info(&self, resource_path: &str) -> Vec { - self.lock_manager.get_locks(resource_path).await - } - - /// Builds XML for LOCK request - fn build_lock_request_xml(&self, scope: LockScope, owner: Option<&str>) -> String { - let scope_xml = match scope { - LockScope::Exclusive => "", - LockScope::Shared => "", - }; - - let owner_xml = owner - .map(|o| format!("{}", o)) - .unwrap_or_default(); - - format!( - r#" - - {} - - {} -"#, - scope_xml, owner_xml - ) - } - - /// Extracts lock token from LOCK response - fn extract_lock_token_from_response(&self, response: &Response) -> Result { - // Check Lock-Token header - if let Some(lock_token_header) = response.headers().get("lock-token") { - if let Ok(token_str) = lock_token_header.to_str() { - // Remove angle brackets if present - let token = token_str.trim_matches(|c| c == '<' || c == '>'); - return Ok(token.to_string()); - } - } - - // If not in header, would need to parse from response body - // For now, generate a token (in production, parse from XML response) - Ok(format!("opaquelocktoken:{}", uuid::Uuid::new_v4())) - } - - // ============================================================================ - // Partial Content / Resume Support Methods - // 
-
-    /// Downloads a file with resume support
-    pub async fn download_file_with_resume(
-        &self,
-        file_path: &str,
-        local_path: PathBuf,
-    ) -> Result<Vec<u8>> {
-        let url = self.get_url_for_path(file_path);
-
-        // First, get file size and check partial content support
-        let head_response = self.authenticated_request(
-            Method::HEAD,
-            &url,
-            None,
-            None,
-        ).await?;
-
-        let total_size = head_response
-            .headers()
-            .get(header::CONTENT_LENGTH)
-            .and_then(|v| v.to_str().ok())
-            .and_then(|s| s.parse::<u64>().ok())
-            .ok_or_else(|| anyhow!("Cannot determine file size"))?;
-
-        let etag = head_response
-            .headers()
-            .get(header::ETAG)
-            .and_then(|v| v.to_str().ok())
-            .map(|s| s.to_string());
-
-        let supports_range = PartialContentManager::check_partial_content_support(&head_response);
-
-        if !supports_range {
-            info!("Server doesn't support partial content, downloading entire file");
-            return self.download_file(file_path).await;
-        }
-
-        // Initialize or resume download
-        let mut progress = self.partial_content_manager
-            .init_download(file_path, total_size, etag)
-            .await?;
-
-        // Download in chunks
-        while let Some(range) = progress.get_next_range(1024 * 1024) {
-            debug!("Downloading range: {}", range.to_header_value());
-
-            let response = self.authenticated_request(
-                Method::GET,
-                &url,
-                None,
-                Some(vec![
-                    ("Range", &range.to_header_value()),
-                ]),
-            ).await?;
-
-            if response.status() != StatusCode::PARTIAL_CONTENT {
-                return Err(anyhow!("Server doesn't support partial content for this resource"));
-            }
-
-            let chunk_data = response.bytes().await?.to_vec();
-
-            self.partial_content_manager
-                .download_chunk(file_path, &range, chunk_data)
-                .await?;
-
-            progress = self.partial_content_manager
-                .get_progress(file_path)
-                .await
-                .ok_or_else(|| anyhow!("Download progress lost"))?;
-
-            info!("Download progress: {:.1}%", progress.percentage_complete());
-        }
-
-        // Complete the download
-        self.partial_content_manager
-            .complete_download(file_path, local_path.clone())
-            .await?;
-
-        // Read the completed file
-        tokio::fs::read(&local_path).await.map_err(|e| anyhow!("Failed to read downloaded file: {}", e))
-    }
-
-    /// Downloads a specific byte range from a file
-    pub async fn download_file_range(
-        &self,
-        file_path: &str,
-        start: u64,
-        end: Option<u64>,
-    ) -> Result<Vec<u8>> {
-        let url = self.get_url_for_path(file_path);
-        let range = ByteRange::new(start, end);
-
-        debug!("Downloading range {} from {}", range.to_header_value(), file_path);
-
-        let response = self.authenticated_request(
-            Method::GET,
-            &url,
-            None,
-            Some(vec![
-                ("Range", &range.to_header_value()),
-            ]),
-        ).await?;
-
-        match response.status() {
-            StatusCode::PARTIAL_CONTENT => {
-                let data = response.bytes().await?.to_vec();
-                debug!("Downloaded {} bytes for range", data.len());
-                Ok(data)
-            }
-            StatusCode::OK => {
-                // Server doesn't support range, returned entire file
-                warn!("Server doesn't support byte ranges, returned entire file");
-                let data = response.bytes().await?.to_vec();
-
-                // Extract requested range from full content
-                let end_pos = end.unwrap_or(data.len() as u64 - 1).min(data.len() as u64 - 1);
-                if start as usize >= data.len() {
-                    return Err(anyhow!("Range start beyond file size"));
-                }
-                Ok(data[start as usize..=end_pos as usize].to_vec())
-            }
-            StatusCode::RANGE_NOT_SATISFIABLE => {
-                Err(anyhow!("Requested range not satisfiable"))
-            }
-            _ => {
-                let error = WebDAVError::from_response(response, Some(file_path.to_string())).await;
-                Err(anyhow!("Failed to download range: {}", error))
Err(anyhow!("Failed to download range: {}", error)) - } - } - } - - /// Gets active download progress - pub async fn get_download_progress(&self, file_path: &str) -> Option { - self.partial_content_manager.get_progress(file_path).await - } - - /// Lists all active downloads - pub async fn list_active_downloads(&self) -> Vec { - self.partial_content_manager.list_downloads().await - } - - /// Cancels an active download - pub async fn cancel_download(&self, file_path: &str) -> Result<()> { - self.partial_content_manager.cancel_download(file_path).await - } - - // ============================================================================ - // Enhanced Error Handling with WebDAV Status Codes - // ============================================================================ - - /// Performs authenticated request with enhanced error handling - pub async fn authenticated_request_enhanced( - &self, - method: Method, - url: &str, - body: Option, - headers: Option>, - expected_codes: &[u16], - ) -> Result { - let response = self.authenticated_request(method, url, body, headers).await?; - - StatusCodeHandler::handle_response( - response, - Some(url.to_string()), - expected_codes, - ).await - } - - /// Performs operation with automatic retry based on status codes - pub async fn with_smart_retry( - &self, - operation: F, - max_attempts: u32, - ) -> Result - where - F: Fn() -> std::pin::Pin> + Send>> + Send, - { - let mut attempt = 0; - - loop { - match operation().await { - Ok(result) => return Ok(result), - Err(e) => { - // Check if error contains a status code that's retryable - let error_str = e.to_string(); - let is_retryable = error_str.contains("423") || // Locked - error_str.contains("429") || // Rate limited - error_str.contains("503") || // Service unavailable - error_str.contains("409"); // Conflict - - if !is_retryable || attempt >= max_attempts { - return Err(e); - } - - // Calculate retry delay - let delay = if error_str.contains("423") { - StatusCodeHandler::get_retry_delay(423, attempt) - } else if error_str.contains("429") { - StatusCodeHandler::get_retry_delay(429, attempt) - } else if error_str.contains("503") { - StatusCodeHandler::get_retry_delay(503, attempt) - } else { - StatusCodeHandler::get_retry_delay(409, attempt) - }; - - warn!("Retryable error on attempt {}/{}: {}. Retrying in {} seconds...", - attempt + 1, max_attempts, e, delay); - - tokio::time::sleep(Duration::from_secs(delay)).await; - attempt += 1; - } - } - } - } } @@ -2373,8 +1967,6 @@ impl Clone for WebDAVService { scan_semaphore: Arc::clone(&self.scan_semaphore), download_semaphore: Arc::clone(&self.download_semaphore), working_protocol: Arc::clone(&self.working_protocol), - lock_manager: self.lock_manager.clone(), - partial_content_manager: self.partial_content_manager.clone(), } } }