feat(server): also generate thumbnails for non-images, and resolve failing unit/integration tests
This commit is contained in:
parent
abdea3226f
commit
a47960a059
|
|
@ -819,22 +819,33 @@ impl EnhancedOcrService {
|
|||
// Clean the PDF data (remove leading null bytes)
|
||||
let clean_bytes = clean_pdf_data(&bytes);
|
||||
|
||||
// Add timeout for PDF extraction to prevent hanging
|
||||
// Add timeout and panic recovery for PDF extraction
|
||||
let extraction_result = tokio::time::timeout(
|
||||
std::time::Duration::from_secs(120), // 2 minute timeout
|
||||
tokio::task::spawn_blocking(move || {
|
||||
pdf_extract::extract_text_from_mem(&clean_bytes)
|
||||
// Catch panics from pdf-extract library
|
||||
catch_unwind(AssertUnwindSafe(|| {
|
||||
pdf_extract::extract_text_from_mem(&clean_bytes)
|
||||
}))
|
||||
})
|
||||
).await;
|
||||
|
||||
let text = match extraction_result {
|
||||
Ok(Ok(Ok(text))) => text,
|
||||
Ok(Ok(Err(e))) => {
|
||||
Ok(Ok(Ok(Ok(text)))) => text,
|
||||
Ok(Ok(Ok(Err(e)))) => {
|
||||
return Err(anyhow!(
|
||||
"PDF text extraction failed for file '{}' (size: {} bytes): {}. This may indicate a corrupted or unsupported PDF format.",
|
||||
file_path, file_size, e
|
||||
));
|
||||
}
|
||||
Ok(Ok(Err(_panic))) => {
|
||||
// pdf-extract panicked (e.g., missing unicode map, corrupted font encoding)
|
||||
warn!("PDF extraction panicked for file '{}' - likely corrupted font encoding or missing unicode map. Fallback to OCR not yet implemented.", file_path);
|
||||
return Err(anyhow!(
|
||||
"PDF extraction failed due to corrupted or unsupported font encoding in file '{}' (size: {} bytes). The PDF may have non-standard fonts or corrupted internal structure. Consider converting the PDF to images for OCR.",
|
||||
file_path, file_size
|
||||
));
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
return Err(anyhow!("PDF extraction task failed: {}", e));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ use tracing::{info, warn, error};
|
|||
use crate::models::Document;
|
||||
|
||||
#[cfg(feature = "ocr")]
|
||||
use image::{DynamicImage, ImageFormat, imageops::FilterType};
|
||||
use image::{DynamicImage, ImageFormat, imageops::FilterType, Rgb, RgbImage, Rgba, ImageBuffer};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct FileService {
|
||||
|
|
@ -275,9 +275,13 @@ impl FileService {
|
|||
self.generate_image_thumbnail(&file_data).await
|
||||
}
|
||||
"pdf" => {
|
||||
// For PDFs, we'd need pdf2image or similar
|
||||
// For now, return a placeholder
|
||||
self.generate_placeholder_thumbnail("PDF").await
|
||||
self.generate_pdf_thumbnail(&file_data).await
|
||||
}
|
||||
"txt" => {
|
||||
self.generate_text_thumbnail(&file_data).await
|
||||
}
|
||||
"doc" | "docx" => {
|
||||
self.generate_placeholder_thumbnail("DOC").await
|
||||
}
|
||||
_ => {
|
||||
// For other file types, generate a placeholder
|
||||
|
|
@ -311,6 +315,86 @@ impl FileService {
|
|||
Ok(buffer)
|
||||
}
|
||||
|
||||
/// Builds a thumbnail for a PDF from its extractable text.
///
/// The PDF is not rasterised: `pdf_extract` pulls the text out of the
/// document and, when that succeeds, the text is handed to
/// `generate_text_based_thumbnail` with a "PDF" label and a red background.
/// When extraction returns an error *or panics*, the generic "PDF"
/// placeholder thumbnail is returned instead.
///
/// # Errors
/// Propagates whatever error the chosen thumbnail generator returns.
#[cfg(feature = "ocr")]
async fn generate_pdf_thumbnail(&self, file_data: &[u8]) -> Result<Vec<u8>> {
    use image::Rgb;

    // pdf-extract is known to panic on some inputs (e.g. missing unicode map,
    // corrupted font encoding), so the call is isolated with `catch_unwind`
    // and a panic is treated exactly like an extraction error.
    // The result is collapsed to an `Option<String>` up front so that neither
    // the library error nor the panic payload is kept alive across the
    // `.await` points below.
    // NOTE(review): extraction still runs inline on the async task; consider
    // moving it to a blocking thread if large PDFs are expected.
    let extracted: Option<String> = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
        pdf_extract::extract_text_from_mem(file_data)
    }))
    .ok()
    .and_then(|outcome| outcome.ok());

    match extracted {
        // Text is available: render a text-based preview.
        Some(text) => {
            self.generate_text_based_thumbnail(&text, "PDF", Rgb([220, 38, 27])).await
        }
        // Extraction failed or panicked: fall back to the placeholder.
        None => self.generate_placeholder_thumbnail("PDF").await,
    }
}
|
||||
|
||||
/// Builds a thumbnail for a plain-text file.
///
/// The raw bytes are decoded as UTF-8 (invalid sequences are replaced rather
/// than rejected) and rendered through `generate_text_based_thumbnail` with a
/// "TXT" label on a green background.
#[cfg(feature = "ocr")]
async fn generate_text_thumbnail(&self, file_data: &[u8]) -> Result<Vec<u8>> {
    // Green background used for text files.
    let background = image::Rgb([34, 139, 34]);

    // Lossy decoding never fails, so arbitrary byte content is accepted.
    let decoded = String::from_utf8_lossy(file_data);

    self.generate_text_based_thumbnail(decoded.as_ref(), "TXT", background).await
}
|
||||
|
||||
/// Renders a 200x200 JPEG thumbnail that hints at a document's text content.
///
/// The image is filled with `bg_color`. A preview string is built from
/// `file_type` on the first line followed by (at most) the first 300 bytes of
/// `text`, trimmed. No real glyphs are drawn: for up to 15 lines and up to 25
/// characters per line, each character (whitespace included) is represented
/// by a two-pixel-wide white mark on a 7x12 pixel grid.
///
/// # Errors
/// Returns an error if JPEG encoding of the finished image fails.
#[cfg(feature = "ocr")]
async fn generate_text_based_thumbnail(&self, text: &str, file_type: &str, bg_color: image::Rgb<u8>) -> Result<Vec<u8>> {
    use image::{DynamicImage, ImageFormat, Rgb, RgbImage};

    const WIDTH: u32 = 200;
    const HEIGHT: u32 = 200;
    const MARGIN: u32 = 10;
    const MAX_PREVIEW_BYTES: usize = 300;
    const MAX_LINES: usize = 15;
    const MAX_COLUMNS: usize = 25;

    // Start from a canvas that is already filled with the background colour.
    let mut img = RgbImage::from_pixel(WIDTH, HEIGHT, bg_color);
    let text_color = Rgb([255u8, 255, 255]); // White marks

    // Limit the preview to MAX_PREVIEW_BYTES, but never cut through a
    // multi-byte UTF-8 character: slicing a `str` off a char boundary panics.
    // Index 0 is always a boundary, so the loop terminates.
    let excerpt = if text.len() > MAX_PREVIEW_BYTES {
        let mut end = MAX_PREVIEW_BYTES;
        while !text.is_char_boundary(end) {
            end -= 1;
        }
        &text[..end]
    } else {
        text
    };
    let preview_text = format!("{}\n{}", file_type, excerpt.trim());

    // Very basic "text rendering": one small white mark per character.
    for (row, line) in (0u32..).zip(preview_text.lines().take(MAX_LINES)) {
        let y = 20 + row * 12;
        if y >= HEIGHT - MARGIN {
            break;
        }

        for (col, _) in (0u32..).zip(line.chars().take(MAX_COLUMNS)) {
            let x = 10 + col * 7;
            if x >= WIDTH - MARGIN {
                break;
            }

            // Two adjacent pixels give the mark some thickness.
            for dx in 0..2u32 {
                if let Some(pixel) = img.get_pixel_mut_checked(x + dx, y) {
                    *pixel = text_color;
                }
            }
        }
    }

    // Encode the canvas as JPEG into an in-memory buffer.
    let dynamic_img = DynamicImage::ImageRgb8(img);
    let mut buffer = Vec::new();
    let mut cursor = std::io::Cursor::new(&mut buffer);
    dynamic_img.write_to(&mut cursor, ImageFormat::Jpeg)?;

    Ok(buffer)
}
|
||||
|
||||
#[cfg(feature = "ocr")]
|
||||
async fn generate_placeholder_thumbnail(&self, file_type: &str) -> Result<Vec<u8>> {
|
||||
// Create a simple colored rectangle as placeholder
|
||||
|
|
|
|||
|
|
@ -407,20 +407,34 @@ impl OcrQueueService {
|
|||
}
|
||||
Err(e) => {
|
||||
let error_msg = format!("OCR extraction failed: {}", e);
|
||||
warn!("❌ OCR failed for '{}' | Job: {} | Document: {} | Error: {}",
|
||||
filename, item.id, item.document_id, e);
|
||||
let error_str = e.to_string();
|
||||
|
||||
// Update document status
|
||||
// Detect specific PDF font encoding issues
|
||||
let is_pdf_font_issue = error_str.contains("font encoding") ||
|
||||
error_str.contains("missing unicode map") ||
|
||||
error_str.contains("corrupted internal structure");
|
||||
|
||||
if is_pdf_font_issue {
|
||||
warn!("⚠️ PDF font encoding issue for '{}' | Job: {} | Document: {} | Error: {}",
|
||||
filename, item.id, item.document_id, e);
|
||||
} else {
|
||||
warn!("❌ OCR failed for '{}' | Job: {} | Document: {} | Error: {}",
|
||||
filename, item.id, item.document_id, e);
|
||||
}
|
||||
|
||||
// Update document status with more specific error information
|
||||
let ocr_status = if is_pdf_font_issue { "pdf_font_error" } else { "failed" };
|
||||
sqlx::query(
|
||||
r#"
|
||||
UPDATE documents
|
||||
SET ocr_status = 'failed',
|
||||
ocr_error = $2,
|
||||
SET ocr_status = $2,
|
||||
ocr_error = $3,
|
||||
updated_at = NOW()
|
||||
WHERE id = $1
|
||||
"#
|
||||
)
|
||||
.bind(item.document_id)
|
||||
.bind(ocr_status)
|
||||
.bind(&error_msg)
|
||||
.execute(&self.pool)
|
||||
.await?;
|
||||
|
|
@ -566,6 +580,22 @@ impl OcrQueueService {
|
|||
|
||||
info!("Successfully copied processed image to: {:?}", permanent_path);
|
||||
|
||||
// Get actual image dimensions and file size
|
||||
let image_metadata = tokio::fs::metadata(&permanent_path).await
|
||||
.map_err(|e| anyhow::anyhow!("Failed to get processed image metadata: {}", e))?;
|
||||
let file_size = image_metadata.len() as i64;
|
||||
|
||||
// Get image dimensions using image crate
|
||||
let (image_width, image_height) = tokio::task::spawn_blocking({
|
||||
let path = permanent_path.clone();
|
||||
move || -> Result<(u32, u32), anyhow::Error> {
|
||||
let img = image::open(&path)
|
||||
.map_err(|e| anyhow::anyhow!("Failed to open processed image for dimensions: {}", e))?;
|
||||
Ok((img.width(), img.height()))
|
||||
}
|
||||
}).await
|
||||
.map_err(|e| anyhow::anyhow!("Failed to get image dimensions: {}", e))??;
|
||||
|
||||
// Save to database
|
||||
let processing_parameters = serde_json::json!({
|
||||
"steps": processing_steps,
|
||||
|
|
@ -576,14 +606,8 @@ impl OcrQueueService {
|
|||
// Save metadata to database with error handling
|
||||
if let Err(e) = sqlx::query(
|
||||
r#"
|
||||
INSERT INTO processed_images (document_id, user_id, original_image_path, processed_image_path, processing_parameters, processing_steps, created_at)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, NOW())
|
||||
ON CONFLICT (document_id)
|
||||
DO UPDATE SET
|
||||
processed_image_path = EXCLUDED.processed_image_path,
|
||||
processing_parameters = EXCLUDED.processing_parameters,
|
||||
processing_steps = EXCLUDED.processing_steps,
|
||||
created_at = NOW()
|
||||
INSERT INTO processed_images (document_id, user_id, original_image_path, processed_image_path, processing_parameters, processing_steps, image_width, image_height, file_size, created_at)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, NOW())
|
||||
"#
|
||||
)
|
||||
.bind(document_id)
|
||||
|
|
@ -592,6 +616,9 @@ impl OcrQueueService {
|
|||
.bind(permanent_path.to_string_lossy().as_ref())
|
||||
.bind(&processing_parameters)
|
||||
.bind(processing_steps)
|
||||
.bind(image_width as i32)
|
||||
.bind(image_height as i32)
|
||||
.bind(file_size)
|
||||
.execute(&self.pool)
|
||||
.await {
|
||||
error!("Failed to save processed image metadata to database for document {}: {}", document_id, e);
|
||||
|
|
|
|||
|
|
@ -8,9 +8,9 @@ use tower::util::ServiceExt;
|
|||
|
||||
pub async fn create_test_app() -> (Router, ContainerAsync<Postgres>) {
|
||||
let postgres_image = Postgres::default()
|
||||
.with_env_var(("POSTGRES_USER", "test"))
|
||||
.with_env_var(("POSTGRES_PASSWORD", "test"))
|
||||
.with_env_var(("POSTGRES_DB", "test"));
|
||||
.with_env_var("POSTGRES_USER", "test")
|
||||
.with_env_var("POSTGRES_PASSWORD", "test")
|
||||
.with_env_var("POSTGRES_DB", "test");
|
||||
|
||||
let container = postgres_image.start().await.expect("Failed to start postgres container");
|
||||
let port = container.get_host_port_ipv4(5432).await.expect("Failed to get postgres port");
|
||||
|
|
@ -41,11 +41,14 @@ pub async fn create_test_app() -> (Router, ContainerAsync<Postgres>) {
|
|||
cpu_priority: "normal".to_string(),
|
||||
};
|
||||
|
||||
let queue_service = Arc::new(crate::ocr_queue::OcrQueueService::new(db.clone(), db.pool.clone(), 2));
|
||||
|
||||
let state = Arc::new(AppState {
|
||||
db,
|
||||
config,
|
||||
webdav_scheduler: None,
|
||||
source_scheduler: None,
|
||||
queue_service,
|
||||
});
|
||||
|
||||
let app = Router::new()
|
||||
|
|
|
|||
|
|
@ -33,17 +33,19 @@ async fn create_test_app_state() -> Arc<AppState> {
|
|||
database_url: "sqlite::memory:".to_string(),
|
||||
server_address: "127.0.0.1:8080".to_string(),
|
||||
jwt_secret: "test_secret".to_string(),
|
||||
upload_dir: "/tmp/test_uploads".to_string(),
|
||||
max_file_size: 10 * 1024 * 1024,
|
||||
upload_path: "/tmp/test_uploads".to_string(),
|
||||
max_file_size_mb: 10 * 1024 * 1024,
|
||||
};
|
||||
|
||||
let db = Database::new(&config.database_url).await.unwrap();
|
||||
|
||||
let queue_service = Arc::new(readur::ocr_queue::OcrQueueService::new(db.clone(), db.pool.clone(), 2));
|
||||
Arc::new(AppState {
|
||||
db,
|
||||
config,
|
||||
webdav_scheduler: None,
|
||||
source_scheduler: None,
|
||||
queue_service,
|
||||
})
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ use readur::{
|
|||
/// Create a test local folder configuration
|
||||
fn create_test_local_config() -> LocalFolderSourceConfig {
|
||||
LocalFolderSourceConfig {
|
||||
paths: vec!["/test/documents".to_string(), "/test/images".to_string()],
|
||||
watch_folders: vec!["/test/documents".to_string(), "/test/images".to_string()],
|
||||
recursive: true,
|
||||
follow_symlinks: false,
|
||||
auto_sync: true,
|
||||
|
|
@ -60,9 +60,9 @@ fn create_test_directory_structure() -> Result<TempDir, std::io::Error> {
|
|||
fn test_local_folder_config_creation() {
|
||||
let config = create_test_local_config();
|
||||
|
||||
assert_eq!(config.paths.len(), 2);
|
||||
assert_eq!(config.paths[0], "/test/documents");
|
||||
assert_eq!(config.paths[1], "/test/images");
|
||||
assert_eq!(config.watch_folders.len(), 2);
|
||||
assert_eq!(config.watch_folders[0], "/test/documents");
|
||||
assert_eq!(config.watch_folders[1], "/test/images");
|
||||
assert!(config.recursive);
|
||||
assert!(!config.follow_symlinks);
|
||||
assert!(config.auto_sync);
|
||||
|
|
@ -75,8 +75,8 @@ fn test_local_folder_config_validation() {
|
|||
let config = create_test_local_config();
|
||||
|
||||
// Test paths validation
|
||||
assert!(!config.paths.is_empty(), "Should have at least one path");
|
||||
for path in &config.paths {
|
||||
assert!(!config.watch_folders.is_empty(), "Should have at least one path");
|
||||
for path in &config.watch_folders {
|
||||
assert!(Path::new(path).is_absolute() || path.starts_with('.'),
|
||||
"Path should be absolute or relative: {}", path);
|
||||
}
|
||||
|
|
@ -328,7 +328,7 @@ fn test_error_handling() {
|
|||
|
||||
// Non-existent path
|
||||
let non_existent_config = LocalFolderSourceConfig {
|
||||
paths: vec!["/this/path/does/not/exist".to_string()],
|
||||
watch_folders: vec!["/this/path/does/not/exist".to_string()],
|
||||
recursive: true,
|
||||
follow_symlinks: false,
|
||||
auto_sync: true,
|
||||
|
|
@ -336,11 +336,11 @@ fn test_error_handling() {
|
|||
file_extensions: vec![".txt".to_string()],
|
||||
};
|
||||
|
||||
assert_eq!(non_existent_config.paths[0], "/this/path/does/not/exist");
|
||||
assert_eq!(non_existent_config.watch_folders[0], "/this/path/does/not/exist");
|
||||
|
||||
// Empty paths
|
||||
let empty_paths_config = LocalFolderSourceConfig {
|
||||
paths: Vec::new(),
|
||||
watch_folders: Vec::new(),
|
||||
recursive: true,
|
||||
follow_symlinks: false,
|
||||
auto_sync: true,
|
||||
|
|
@ -348,11 +348,11 @@ fn test_error_handling() {
|
|||
file_extensions: vec![".txt".to_string()],
|
||||
};
|
||||
|
||||
assert!(empty_paths_config.paths.is_empty());
|
||||
assert!(empty_paths_config.watch_folders.is_empty());
|
||||
|
||||
// Invalid sync interval
|
||||
let invalid_interval_config = LocalFolderSourceConfig {
|
||||
paths: vec!["/test".to_string()],
|
||||
watch_folders: vec!["/test".to_string()],
|
||||
recursive: true,
|
||||
follow_symlinks: false,
|
||||
auto_sync: true,
|
||||
|
|
|
|||
|
|
@ -39,11 +39,13 @@ async fn create_test_app_state() -> Arc<AppState> {
|
|||
|
||||
let db = Database::new(&config.database_url).await.unwrap();
|
||||
|
||||
let queue_service = Arc::new(readur::ocr_queue::OcrQueueService::new(db.clone(), db.pool.clone(), 2));
|
||||
Arc::new(AppState {
|
||||
db,
|
||||
config,
|
||||
webdav_scheduler: None,
|
||||
source_scheduler: None,
|
||||
queue_service,
|
||||
})
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -118,17 +118,19 @@ async fn create_test_app_state() -> Arc<AppState> {
|
|||
database_url: "sqlite::memory:".to_string(),
|
||||
server_address: "127.0.0.1:8080".to_string(),
|
||||
jwt_secret: "test_secret".to_string(),
|
||||
upload_dir: "/tmp/test_uploads".to_string(),
|
||||
max_file_size: 10 * 1024 * 1024,
|
||||
upload_path: "/tmp/test_uploads".to_string(),
|
||||
max_file_size_mb: 10 * 1024 * 1024,
|
||||
};
|
||||
|
||||
let db = Database::new(&config.database_url).await.unwrap();
|
||||
|
||||
let queue_service = Arc::new(readur::ocr_queue::OcrQueueService::new(db.clone(), db.pool.clone(), 2));
|
||||
Arc::new(AppState {
|
||||
db,
|
||||
config,
|
||||
webdav_scheduler: None,
|
||||
source_scheduler: None,
|
||||
queue_service,
|
||||
})
|
||||
}
|
||||
|
||||
|
|
@ -181,8 +183,8 @@ fn test_config_parsing_local_folder() {
|
|||
assert!(config.is_ok(), "Local Folder config should parse successfully");
|
||||
|
||||
let local_config = config.unwrap();
|
||||
assert_eq!(local_config.paths.len(), 1);
|
||||
assert_eq!(local_config.paths[0], "/home/user/documents");
|
||||
assert_eq!(local_config.watch_folders.len(), 1);
|
||||
assert_eq!(local_config.watch_folders[0], "/home/user/documents");
|
||||
assert!(local_config.recursive);
|
||||
assert!(!local_config.follow_symlinks);
|
||||
assert_eq!(local_config.sync_interval_minutes, 30);
|
||||
|
|
|
|||
|
|
@ -85,7 +85,14 @@ async fn setup_test_app() -> (Router, Arc<AppState>) {
|
|||
};
|
||||
|
||||
let db = Database::new(&db_url).await.expect("Failed to connect to test database");
|
||||
let state = Arc::new(AppState { db, config });
|
||||
let queue_service = Arc::new(readur::ocr_queue::OcrQueueService::new(db.clone(), db.pool.clone(), 2));
|
||||
let state = Arc::new(AppState {
|
||||
db,
|
||||
config,
|
||||
webdav_scheduler: None,
|
||||
source_scheduler: None,
|
||||
queue_service,
|
||||
});
|
||||
|
||||
let app = Router::new()
|
||||
.nest("/api/auth", routes::auth::router())
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ fn create_test_webdav_config() -> WebDAVConfig {
|
|||
watch_folders: vec!["/Documents".to_string(), "/Photos".to_string()],
|
||||
file_extensions: vec![".pdf".to_string(), ".txt".to_string(), ".jpg".to_string()],
|
||||
timeout_seconds: 30,
|
||||
server_type: "nextcloud".to_string(),
|
||||
server_type: Some("nextcloud".to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -43,7 +43,7 @@ fn create_test_source_config() -> WebDAVSourceConfig {
|
|||
file_extensions: vec![".pdf".to_string(), ".txt".to_string()],
|
||||
auto_sync: true,
|
||||
sync_interval_minutes: 60,
|
||||
server_type: "nextcloud".to_string(),
|
||||
server_type: Some("nextcloud".to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -314,7 +314,7 @@ fn test_error_handling_scenarios() {
|
|||
watch_folders: vec!["/test".to_string()],
|
||||
file_extensions: vec![".pdf".to_string()],
|
||||
timeout_seconds: 1, // Very short timeout
|
||||
server_type: "nextcloud".to_string(),
|
||||
server_type: Some("nextcloud".to_string()),
|
||||
};
|
||||
|
||||
assert_eq!(timeout_config.timeout_seconds, 1);
|
||||
|
|
@ -327,7 +327,7 @@ fn test_error_handling_scenarios() {
|
|||
watch_folders: vec!["/test".to_string()],
|
||||
file_extensions: vec![".pdf".to_string()],
|
||||
timeout_seconds: 30,
|
||||
server_type: "nextcloud".to_string(),
|
||||
server_type: Some("nextcloud".to_string()),
|
||||
};
|
||||
|
||||
assert_eq!(auth_config.username, "invalid_user");
|
||||
|
|
@ -341,7 +341,7 @@ fn test_error_handling_scenarios() {
|
|||
watch_folders: vec!["/nonexistent_folder".to_string()],
|
||||
file_extensions: vec![".pdf".to_string()],
|
||||
timeout_seconds: 30,
|
||||
server_type: "nextcloud".to_string(),
|
||||
server_type: Some("nextcloud".to_string()),
|
||||
};
|
||||
|
||||
assert_eq!(invalid_path_config.watch_folders[0], "/nonexistent_folder");
|
||||
|
|
|
|||
Loading…
Reference in New Issue