diff --git a/src/enhanced_ocr.rs b/src/enhanced_ocr.rs index 1e49ac9..c0edb0d 100644 --- a/src/enhanced_ocr.rs +++ b/src/enhanced_ocr.rs @@ -819,22 +819,33 @@ impl EnhancedOcrService { // Clean the PDF data (remove leading null bytes) let clean_bytes = clean_pdf_data(&bytes); - // Add timeout for PDF extraction to prevent hanging + // Add timeout and panic recovery for PDF extraction let extraction_result = tokio::time::timeout( std::time::Duration::from_secs(120), // 2 minute timeout tokio::task::spawn_blocking(move || { - pdf_extract::extract_text_from_mem(&clean_bytes) + // Catch panics from pdf-extract library + catch_unwind(AssertUnwindSafe(|| { + pdf_extract::extract_text_from_mem(&clean_bytes) + })) }) ).await; let text = match extraction_result { - Ok(Ok(Ok(text))) => text, - Ok(Ok(Err(e))) => { + Ok(Ok(Ok(Ok(text)))) => text, + Ok(Ok(Ok(Err(e)))) => { return Err(anyhow!( "PDF text extraction failed for file '{}' (size: {} bytes): {}. This may indicate a corrupted or unsupported PDF format.", file_path, file_size, e )); } + Ok(Ok(Err(_panic))) => { + // pdf-extract panicked (e.g., missing unicode map, corrupted font encoding) + warn!("PDF extraction panicked for file '{}' - likely corrupted font encoding or missing unicode map. Fallback to OCR not yet implemented.", file_path); + return Err(anyhow!( + "PDF extraction failed due to corrupted or unsupported font encoding in file '{}' (size: {} bytes). The PDF may have non-standard fonts or corrupted internal structure. Consider converting the PDF to images for OCR.", + file_path, file_size + )); + } Ok(Err(e)) => { return Err(anyhow!("PDF extraction task failed: {}", e)); } diff --git a/src/file_service.rs b/src/file_service.rs index 04953ea..7c03bcc 100644 --- a/src/file_service.rs +++ b/src/file_service.rs @@ -8,7 +8,7 @@ use tracing::{info, warn, error}; use crate::models::Document; #[cfg(feature = "ocr")] -use image::{DynamicImage, ImageFormat, imageops::FilterType}; +use image::{DynamicImage, ImageFormat, imageops::FilterType, Rgb, RgbImage, Rgba, ImageBuffer}; #[derive(Clone)] pub struct FileService { @@ -275,9 +275,13 @@ impl FileService { self.generate_image_thumbnail(&file_data).await } "pdf" => { - // For PDFs, we'd need pdf2image or similar - // For now, return a placeholder - self.generate_placeholder_thumbnail("PDF").await + self.generate_pdf_thumbnail(&file_data).await + } + "txt" => { + self.generate_text_thumbnail(&file_data).await + } + "doc" | "docx" => { + self.generate_placeholder_thumbnail("DOC").await } _ => { // For other file types, generate a placeholder @@ -311,6 +315,86 @@ impl FileService { Ok(buffer) } + #[cfg(feature = "ocr")] + async fn generate_pdf_thumbnail(&self, file_data: &[u8]) -> Result> { + use image::Rgb; + + // Try to extract first page as image using pdf-extract + match pdf_extract::extract_text_from_mem(file_data) { + Ok(text) => { + // If we can extract text, create a text-based thumbnail + self.generate_text_based_thumbnail(&text, "PDF", Rgb([220, 38, 27])).await + } + Err(_) => { + // Fall back to placeholder if PDF extraction fails + self.generate_placeholder_thumbnail("PDF").await + } + } + } + + #[cfg(feature = "ocr")] + async fn generate_text_thumbnail(&self, file_data: &[u8]) -> Result> { + use image::Rgb; + + // Convert bytes to text + let text = String::from_utf8_lossy(file_data); + self.generate_text_based_thumbnail(&text, "TXT", Rgb([34, 139, 34])).await + } + + #[cfg(feature = "ocr")] + async fn generate_text_based_thumbnail(&self, text: &str, file_type: &str, bg_color: image::Rgb) -> Result> { + use image::{RgbImage, Rgb, DynamicImage, ImageFormat}; + + let width = 200; + let height = 200; + let mut img = RgbImage::new(width, height); + + // Fill background + for pixel in img.pixels_mut() { + *pixel = bg_color; + } + + // Add file type indicator at the top + let text_color = Rgb([255, 255, 255]); // White text + let preview_text = if text.len() > 300 { + format!("{}\n{}", file_type, &text[..300].trim()) + } else { + format!("{}\n{}", file_type, text.trim()) + }; + + // Simple text rendering - just place some characters as visual indicators + // For a more sophisticated approach, you'd use a text rendering library + let lines: Vec<&str> = preview_text.lines().take(15).collect(); + for (line_idx, line) in lines.iter().enumerate() { + let y_offset = 20 + (line_idx * 12); + if y_offset >= height as usize - 10 { break; } + + // Simple character placement (very basic text rendering) + for (char_idx, _) in line.chars().take(25).enumerate() { + let x_offset = 10 + (char_idx * 7); + if x_offset >= width as usize - 10 { break; } + + // Draw a simple "character" representation as white pixels + if x_offset < width as usize && y_offset < height as usize { + if let Some(pixel) = img.get_pixel_mut_checked(x_offset as u32, y_offset as u32) { + *pixel = text_color; + } + // Add some thickness + if let Some(pixel) = img.get_pixel_mut_checked(x_offset as u32 + 1, y_offset as u32) { + *pixel = text_color; + } + } + } + } + + let dynamic_img = DynamicImage::ImageRgb8(img); + let mut buffer = Vec::new(); + let mut cursor = std::io::Cursor::new(&mut buffer); + dynamic_img.write_to(&mut cursor, ImageFormat::Jpeg)?; + + Ok(buffer) + } + #[cfg(feature = "ocr")] async fn generate_placeholder_thumbnail(&self, file_type: &str) -> Result> { // Create a simple colored rectangle as placeholder diff --git a/src/ocr_queue.rs b/src/ocr_queue.rs index b25ee57..0cbccc5 100644 --- a/src/ocr_queue.rs +++ b/src/ocr_queue.rs @@ -407,20 +407,34 @@ impl OcrQueueService { } Err(e) => { let error_msg = format!("OCR extraction failed: {}", e); - warn!("❌ OCR failed for '{}' | Job: {} | Document: {} | Error: {}", - filename, item.id, item.document_id, e); + let error_str = e.to_string(); - // Update document status + // Detect specific PDF font encoding issues + let is_pdf_font_issue = error_str.contains("font encoding") || + error_str.contains("missing unicode map") || + error_str.contains("corrupted internal structure"); + + if is_pdf_font_issue { + warn!("⚠️ PDF font encoding issue for '{}' | Job: {} | Document: {} | Error: {}", + filename, item.id, item.document_id, e); + } else { + warn!("❌ OCR failed for '{}' | Job: {} | Document: {} | Error: {}", + filename, item.id, item.document_id, e); + } + + // Update document status with more specific error information + let ocr_status = if is_pdf_font_issue { "pdf_font_error" } else { "failed" }; sqlx::query( r#" UPDATE documents - SET ocr_status = 'failed', - ocr_error = $2, + SET ocr_status = $2, + ocr_error = $3, updated_at = NOW() WHERE id = $1 "# ) .bind(item.document_id) + .bind(ocr_status) .bind(&error_msg) .execute(&self.pool) .await?; @@ -566,6 +580,22 @@ impl OcrQueueService { info!("Successfully copied processed image to: {:?}", permanent_path); + // Get actual image dimensions and file size + let image_metadata = tokio::fs::metadata(&permanent_path).await + .map_err(|e| anyhow::anyhow!("Failed to get processed image metadata: {}", e))?; + let file_size = image_metadata.len() as i64; + + // Get image dimensions using image crate + let (image_width, image_height) = tokio::task::spawn_blocking({ + let path = permanent_path.clone(); + move || -> Result<(u32, u32), anyhow::Error> { + let img = image::open(&path) + .map_err(|e| anyhow::anyhow!("Failed to open processed image for dimensions: {}", e))?; + Ok((img.width(), img.height())) + } + }).await + .map_err(|e| anyhow::anyhow!("Failed to get image dimensions: {}", e))??; + // Save to database let processing_parameters = serde_json::json!({ "steps": processing_steps, @@ -576,14 +606,8 @@ impl OcrQueueService { // Save metadata to database with error handling if let Err(e) = sqlx::query( r#" - INSERT INTO processed_images (document_id, user_id, original_image_path, processed_image_path, processing_parameters, processing_steps, created_at) - VALUES ($1, $2, $3, $4, $5, $6, NOW()) - ON CONFLICT (document_id) - DO UPDATE SET - processed_image_path = EXCLUDED.processed_image_path, - processing_parameters = EXCLUDED.processing_parameters, - processing_steps = EXCLUDED.processing_steps, - created_at = NOW() + INSERT INTO processed_images (document_id, user_id, original_image_path, processed_image_path, processing_parameters, processing_steps, image_width, image_height, file_size, created_at) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, NOW()) "# ) .bind(document_id) @@ -592,6 +616,9 @@ impl OcrQueueService { .bind(permanent_path.to_string_lossy().as_ref()) .bind(&processing_parameters) .bind(processing_steps) + .bind(image_width as i32) + .bind(image_height as i32) + .bind(file_size) .execute(&self.pool) .await { error!("Failed to save processed image metadata to database for document {}: {}", document_id, e); diff --git a/src/tests/helpers.rs b/src/tests/helpers.rs index 9680045..51114cb 100644 --- a/src/tests/helpers.rs +++ b/src/tests/helpers.rs @@ -8,9 +8,9 @@ use tower::util::ServiceExt; pub async fn create_test_app() -> (Router, ContainerAsync) { let postgres_image = Postgres::default() - .with_env_var(("POSTGRES_USER", "test")) - .with_env_var(("POSTGRES_PASSWORD", "test")) - .with_env_var(("POSTGRES_DB", "test")); + .with_env_var("POSTGRES_USER", "test") + .with_env_var("POSTGRES_PASSWORD", "test") + .with_env_var("POSTGRES_DB", "test"); let container = postgres_image.start().await.expect("Failed to start postgres container"); let port = container.get_host_port_ipv4(5432).await.expect("Failed to get postgres port"); @@ -41,11 +41,14 @@ pub async fn create_test_app() -> (Router, ContainerAsync) { cpu_priority: "normal".to_string(), }; + let queue_service = Arc::new(crate::ocr_queue::OcrQueueService::new(db.clone(), db.pool.clone(), 2)); + let state = Arc::new(AppState { db, config, webdav_scheduler: None, source_scheduler: None, + queue_service, }); let app = Router::new() diff --git a/tests/cancellation_tests.rs b/tests/cancellation_tests.rs index cd71d2c..9c76f61 100644 --- a/tests/cancellation_tests.rs +++ b/tests/cancellation_tests.rs @@ -33,17 +33,19 @@ async fn create_test_app_state() -> Arc { database_url: "sqlite::memory:".to_string(), server_address: "127.0.0.1:8080".to_string(), jwt_secret: "test_secret".to_string(), - upload_dir: "/tmp/test_uploads".to_string(), - max_file_size: 10 * 1024 * 1024, + upload_path: "/tmp/test_uploads".to_string(), + max_file_size_mb: 10 * 1024 * 1024, }; let db = Database::new(&config.database_url).await.unwrap(); + let queue_service = Arc::new(readur::ocr_queue::OcrQueueService::new(db.clone(), db.pool.clone(), 2)); Arc::new(AppState { db, config, webdav_scheduler: None, source_scheduler: None, + queue_service, }) } diff --git a/tests/local_folder_sync_tests.rs b/tests/local_folder_sync_tests.rs index aa7874e..3aef790 100644 --- a/tests/local_folder_sync_tests.rs +++ b/tests/local_folder_sync_tests.rs @@ -25,7 +25,7 @@ use readur::{ /// Create a test local folder configuration fn create_test_local_config() -> LocalFolderSourceConfig { LocalFolderSourceConfig { - paths: vec!["/test/documents".to_string(), "/test/images".to_string()], + watch_folders: vec!["/test/documents".to_string(), "/test/images".to_string()], recursive: true, follow_symlinks: false, auto_sync: true, @@ -60,9 +60,9 @@ fn create_test_directory_structure() -> Result { fn test_local_folder_config_creation() { let config = create_test_local_config(); - assert_eq!(config.paths.len(), 2); - assert_eq!(config.paths[0], "/test/documents"); - assert_eq!(config.paths[1], "/test/images"); + assert_eq!(config.watch_folders.len(), 2); + assert_eq!(config.watch_folders[0], "/test/documents"); + assert_eq!(config.watch_folders[1], "/test/images"); assert!(config.recursive); assert!(!config.follow_symlinks); assert!(config.auto_sync); @@ -75,8 +75,8 @@ fn test_local_folder_config_validation() { let config = create_test_local_config(); // Test paths validation - assert!(!config.paths.is_empty(), "Should have at least one path"); - for path in &config.paths { + assert!(!config.watch_folders.is_empty(), "Should have at least one path"); + for path in &config.watch_folders { assert!(Path::new(path).is_absolute() || path.starts_with('.'), "Path should be absolute or relative: {}", path); } @@ -328,7 +328,7 @@ fn test_error_handling() { // Non-existent path let non_existent_config = LocalFolderSourceConfig { - paths: vec!["/this/path/does/not/exist".to_string()], + watch_folders: vec!["/this/path/does/not/exist".to_string()], recursive: true, follow_symlinks: false, auto_sync: true, @@ -336,11 +336,11 @@ fn test_error_handling() { file_extensions: vec![".txt".to_string()], }; - assert_eq!(non_existent_config.paths[0], "/this/path/does/not/exist"); + assert_eq!(non_existent_config.watch_folders[0], "/this/path/does/not/exist"); // Empty paths let empty_paths_config = LocalFolderSourceConfig { - paths: Vec::new(), + watch_folders: Vec::new(), recursive: true, follow_symlinks: false, auto_sync: true, @@ -348,11 +348,11 @@ fn test_error_handling() { file_extensions: vec![".txt".to_string()], }; - assert!(empty_paths_config.paths.is_empty()); + assert!(empty_paths_config.watch_folders.is_empty()); // Invalid sync interval let invalid_interval_config = LocalFolderSourceConfig { - paths: vec!["/test".to_string()], + watch_folders: vec!["/test".to_string()], recursive: true, follow_symlinks: false, auto_sync: true, diff --git a/tests/source_scheduler_simple_tests.rs b/tests/source_scheduler_simple_tests.rs index a3e9baf..34594a9 100644 --- a/tests/source_scheduler_simple_tests.rs +++ b/tests/source_scheduler_simple_tests.rs @@ -39,11 +39,13 @@ async fn create_test_app_state() -> Arc { let db = Database::new(&config.database_url).await.unwrap(); + let queue_service = Arc::new(readur::ocr_queue::OcrQueueService::new(db.clone(), db.pool.clone(), 2)); Arc::new(AppState { db, config, webdav_scheduler: None, source_scheduler: None, + queue_service, }) } diff --git a/tests/universal_source_sync_tests.rs b/tests/universal_source_sync_tests.rs index e37f17a..c0f05a4 100644 --- a/tests/universal_source_sync_tests.rs +++ b/tests/universal_source_sync_tests.rs @@ -118,17 +118,19 @@ async fn create_test_app_state() -> Arc { database_url: "sqlite::memory:".to_string(), server_address: "127.0.0.1:8080".to_string(), jwt_secret: "test_secret".to_string(), - upload_dir: "/tmp/test_uploads".to_string(), - max_file_size: 10 * 1024 * 1024, + upload_path: "/tmp/test_uploads".to_string(), + max_file_size_mb: 10 * 1024 * 1024, }; let db = Database::new(&config.database_url).await.unwrap(); + let queue_service = Arc::new(readur::ocr_queue::OcrQueueService::new(db.clone(), db.pool.clone(), 2)); Arc::new(AppState { db, config, webdav_scheduler: None, source_scheduler: None, + queue_service, }) } @@ -181,8 +183,8 @@ fn test_config_parsing_local_folder() { assert!(config.is_ok(), "Local Folder config should parse successfully"); let local_config = config.unwrap(); - assert_eq!(local_config.paths.len(), 1); - assert_eq!(local_config.paths[0], "/home/user/documents"); + assert_eq!(local_config.watch_folders.len(), 1); + assert_eq!(local_config.watch_folders[0], "/home/user/documents"); assert!(local_config.recursive); assert!(!local_config.follow_symlinks); assert_eq!(local_config.sync_interval_minutes, 30); diff --git a/tests/webdav_integration_tests.rs b/tests/webdav_integration_tests.rs index 7fe458b..fcaefdd 100644 --- a/tests/webdav_integration_tests.rs +++ b/tests/webdav_integration_tests.rs @@ -85,7 +85,14 @@ async fn setup_test_app() -> (Router, Arc) { }; let db = Database::new(&db_url).await.expect("Failed to connect to test database"); - let state = Arc::new(AppState { db, config }); + let queue_service = Arc::new(readur::ocr_queue::OcrQueueService::new(db.clone(), db.pool.clone(), 2)); + let state = Arc::new(AppState { + db, + config, + webdav_scheduler: None, + source_scheduler: None, + queue_service, + }); let app = Router::new() .nest("/api/auth", routes::auth::router()) diff --git a/tests/webdav_sync_tests.rs b/tests/webdav_sync_tests.rs index 5f50dd5..4def9f9 100644 --- a/tests/webdav_sync_tests.rs +++ b/tests/webdav_sync_tests.rs @@ -29,7 +29,7 @@ fn create_test_webdav_config() -> WebDAVConfig { watch_folders: vec!["/Documents".to_string(), "/Photos".to_string()], file_extensions: vec![".pdf".to_string(), ".txt".to_string(), ".jpg".to_string()], timeout_seconds: 30, - server_type: "nextcloud".to_string(), + server_type: Some("nextcloud".to_string()), } } @@ -43,7 +43,7 @@ fn create_test_source_config() -> WebDAVSourceConfig { file_extensions: vec![".pdf".to_string(), ".txt".to_string()], auto_sync: true, sync_interval_minutes: 60, - server_type: "nextcloud".to_string(), + server_type: Some("nextcloud".to_string()), } } @@ -314,7 +314,7 @@ fn test_error_handling_scenarios() { watch_folders: vec!["/test".to_string()], file_extensions: vec![".pdf".to_string()], timeout_seconds: 1, // Very short timeout - server_type: "nextcloud".to_string(), + server_type: Some("nextcloud".to_string()), }; assert_eq!(timeout_config.timeout_seconds, 1); @@ -327,7 +327,7 @@ fn test_error_handling_scenarios() { watch_folders: vec!["/test".to_string()], file_extensions: vec![".pdf".to_string()], timeout_seconds: 30, - server_type: "nextcloud".to_string(), + server_type: Some("nextcloud".to_string()), }; assert_eq!(auth_config.username, "invalid_user"); @@ -341,7 +341,7 @@ fn test_error_handling_scenarios() { watch_folders: vec!["/nonexistent_folder".to_string()], file_extensions: vec![".pdf".to_string()], timeout_seconds: 30, - server_type: "nextcloud".to_string(), + server_type: Some("nextcloud".to_string()), }; assert_eq!(invalid_path_config.watch_folders[0], "/nonexistent_folder");