feat(server): also generate thumbnails for non-images, and resolve failing unit/integration tests

This commit is contained in:
perf3ct 2025-06-16 22:51:29 +00:00
parent c43994e63c
commit 0d3fe26074
10 changed files with 185 additions and 47 deletions

View File

@ -819,22 +819,33 @@ impl EnhancedOcrService {
// Clean the PDF data (remove leading null bytes)
let clean_bytes = clean_pdf_data(&bytes);
// Add timeout for PDF extraction to prevent hanging
// Add timeout and panic recovery for PDF extraction
let extraction_result = tokio::time::timeout(
std::time::Duration::from_secs(120), // 2 minute timeout
tokio::task::spawn_blocking(move || {
// Catch panics from pdf-extract library
catch_unwind(AssertUnwindSafe(|| {
pdf_extract::extract_text_from_mem(&clean_bytes)
}))
})
).await;
let text = match extraction_result {
Ok(Ok(Ok(text))) => text,
Ok(Ok(Err(e))) => {
Ok(Ok(Ok(Ok(text)))) => text,
Ok(Ok(Ok(Err(e)))) => {
return Err(anyhow!(
"PDF text extraction failed for file '{}' (size: {} bytes): {}. This may indicate a corrupted or unsupported PDF format.",
file_path, file_size, e
));
}
Ok(Ok(Err(_panic))) => {
// pdf-extract panicked (e.g., missing unicode map, corrupted font encoding)
warn!("PDF extraction panicked for file '{}' - likely corrupted font encoding or missing unicode map. Fallback to OCR not yet implemented.", file_path);
return Err(anyhow!(
"PDF extraction failed due to corrupted or unsupported font encoding in file '{}' (size: {} bytes). The PDF may have non-standard fonts or corrupted internal structure. Consider converting the PDF to images for OCR.",
file_path, file_size
));
}
Ok(Err(e)) => {
return Err(anyhow!("PDF extraction task failed: {}", e));
}

View File

@ -8,7 +8,7 @@ use tracing::{info, warn, error};
use crate::models::Document;
#[cfg(feature = "ocr")]
use image::{DynamicImage, ImageFormat, imageops::FilterType};
use image::{DynamicImage, ImageFormat, imageops::FilterType, Rgb, RgbImage, Rgba, ImageBuffer};
#[derive(Clone)]
pub struct FileService {
@ -275,9 +275,13 @@ impl FileService {
self.generate_image_thumbnail(&file_data).await
}
"pdf" => {
// For PDFs, we'd need pdf2image or similar
// For now, return a placeholder
self.generate_placeholder_thumbnail("PDF").await
self.generate_pdf_thumbnail(&file_data).await
}
"txt" => {
self.generate_text_thumbnail(&file_data).await
}
"doc" | "docx" => {
self.generate_placeholder_thumbnail("DOC").await
}
_ => {
// For other file types, generate a placeholder
@ -311,6 +315,86 @@ impl FileService {
Ok(buffer)
}
#[cfg(feature = "ocr")]
/// Build a thumbnail for a PDF document.
///
/// Attempts to pull the text out of the PDF with `pdf_extract`; when that
/// succeeds, a text-preview thumbnail is rendered on a red background
/// (220, 38, 27). When extraction fails (encrypted, corrupted, or
/// image-only PDFs), falls back to the generic "PDF" placeholder tile.
///
/// Returns JPEG-encoded thumbnail bytes, or an error from the renderer.
async fn generate_pdf_thumbnail(&self, file_data: &[u8]) -> Result<Vec<u8>> {
    use image::Rgb;

    // NOTE(review): this extracts text from the *entire* PDF just to
    // render a preview — could be slow for large files; confirm acceptable.
    if let Ok(extracted) = pdf_extract::extract_text_from_mem(file_data) {
        self.generate_text_based_thumbnail(&extracted, "PDF", Rgb([220, 38, 27])).await
    } else {
        self.generate_placeholder_thumbnail("PDF").await
    }
}
#[cfg(feature = "ocr")]
/// Build a thumbnail for a plain-text file.
///
/// Decodes `file_data` as UTF-8 (invalid sequences are replaced, never an
/// error) and renders a text-preview thumbnail on a forest-green
/// background (34, 139, 34) so TXT files are visually distinct.
///
/// Returns JPEG-encoded thumbnail bytes, or an error from the renderer.
async fn generate_text_thumbnail(&self, file_data: &[u8]) -> Result<Vec<u8>> {
    use image::Rgb;

    let contents = String::from_utf8_lossy(file_data);
    self.generate_text_based_thumbnail(contents.as_ref(), "TXT", Rgb([34, 139, 34])).await
}
#[cfg(feature = "ocr")]
/// Render a 200x200 JPEG thumbnail: a crude text preview on a colored
/// background.
///
/// `text` is the document's extracted text, `file_type` is a short label
/// (e.g. "PDF", "TXT") prepended to the preview, and `bg_color` fills the
/// tile. Characters are drawn as 2px white marks rather than real glyphs —
/// this is a visual density indicator, not legible text rendering.
///
/// Returns the JPEG-encoded bytes; errors only if JPEG encoding fails.
async fn generate_text_based_thumbnail(&self, text: &str, file_type: &str, bg_color: image::Rgb<u8>) -> Result<Vec<u8>> {
    use image::{DynamicImage, ImageFormat, Rgb, RgbImage};

    const WIDTH: u32 = 200;
    const HEIGHT: u32 = 200;
    const MAX_PREVIEW_BYTES: usize = 300;

    let mut img = RgbImage::new(WIDTH, HEIGHT);

    // Fill the whole tile with the caller-provided background color.
    for pixel in img.pixels_mut() {
        *pixel = bg_color;
    }

    // Truncate on a char boundary: the previous `&text[..300]` panicked
    // whenever byte 300 fell inside a multi-byte UTF-8 sequence (common
    // for non-ASCII extracted text).
    let mut end = MAX_PREVIEW_BYTES.min(text.len());
    while !text.is_char_boundary(end) {
        end -= 1;
    }
    let preview_text = format!("{}\n{}", file_type, text[..end].trim());

    let text_color = Rgb([255, 255, 255]); // White "ink"

    // Very basic rendering: one mark per character cell, 15 lines max,
    // 25 columns max. A real implementation would use a font rasterizer.
    let lines: Vec<&str> = preview_text.lines().take(15).collect();
    for (line_idx, line) in lines.iter().enumerate() {
        let y_offset = 20 + (line_idx * 12); // 12px line pitch, 20px top margin
        if y_offset >= HEIGHT as usize - 10 {
            break;
        }
        for (char_idx, _) in line.chars().take(25).enumerate() {
            let x_offset = 10 + (char_idx * 7); // 7px column pitch, 10px left margin
            if x_offset >= WIDTH as usize - 10 {
                break;
            }
            // Checked accessors make out-of-bounds writes a no-op.
            if let Some(pixel) = img.get_pixel_mut_checked(x_offset as u32, y_offset as u32) {
                *pixel = text_color;
            }
            // Second column gives each mark some thickness.
            if let Some(pixel) = img.get_pixel_mut_checked(x_offset as u32 + 1, y_offset as u32) {
                *pixel = text_color;
            }
        }
    }

    let dynamic_img = DynamicImage::ImageRgb8(img);
    let mut buffer = Vec::new();
    let mut cursor = std::io::Cursor::new(&mut buffer);
    dynamic_img.write_to(&mut cursor, ImageFormat::Jpeg)?;
    Ok(buffer)
}
#[cfg(feature = "ocr")]
async fn generate_placeholder_thumbnail(&self, file_type: &str) -> Result<Vec<u8>> {
// Create a simple colored rectangle as placeholder

View File

@ -407,20 +407,34 @@ impl OcrQueueService {
}
Err(e) => {
let error_msg = format!("OCR extraction failed: {}", e);
let error_str = e.to_string();
// Detect specific PDF font encoding issues
let is_pdf_font_issue = error_str.contains("font encoding") ||
error_str.contains("missing unicode map") ||
error_str.contains("corrupted internal structure");
if is_pdf_font_issue {
warn!("⚠️ PDF font encoding issue for '{}' | Job: {} | Document: {} | Error: {}",
filename, item.id, item.document_id, e);
} else {
warn!("❌ OCR failed for '{}' | Job: {} | Document: {} | Error: {}",
filename, item.id, item.document_id, e);
}
// Update document status
// Update document status with more specific error information
let ocr_status = if is_pdf_font_issue { "pdf_font_error" } else { "failed" };
sqlx::query(
r#"
UPDATE documents
SET ocr_status = 'failed',
ocr_error = $2,
SET ocr_status = $2,
ocr_error = $3,
updated_at = NOW()
WHERE id = $1
"#
)
.bind(item.document_id)
.bind(ocr_status)
.bind(&error_msg)
.execute(&self.pool)
.await?;
@ -566,6 +580,22 @@ impl OcrQueueService {
info!("Successfully copied processed image to: {:?}", permanent_path);
// Get actual image dimensions and file size
let image_metadata = tokio::fs::metadata(&permanent_path).await
.map_err(|e| anyhow::anyhow!("Failed to get processed image metadata: {}", e))?;
let file_size = image_metadata.len() as i64;
// Get image dimensions using image crate
let (image_width, image_height) = tokio::task::spawn_blocking({
let path = permanent_path.clone();
move || -> Result<(u32, u32), anyhow::Error> {
let img = image::open(&path)
.map_err(|e| anyhow::anyhow!("Failed to open processed image for dimensions: {}", e))?;
Ok((img.width(), img.height()))
}
}).await
.map_err(|e| anyhow::anyhow!("Failed to get image dimensions: {}", e))??;
// Save to database
let processing_parameters = serde_json::json!({
"steps": processing_steps,
@ -576,14 +606,8 @@ impl OcrQueueService {
// Save metadata to database with error handling
if let Err(e) = sqlx::query(
r#"
INSERT INTO processed_images (document_id, user_id, original_image_path, processed_image_path, processing_parameters, processing_steps, created_at)
VALUES ($1, $2, $3, $4, $5, $6, NOW())
ON CONFLICT (document_id)
DO UPDATE SET
processed_image_path = EXCLUDED.processed_image_path,
processing_parameters = EXCLUDED.processing_parameters,
processing_steps = EXCLUDED.processing_steps,
created_at = NOW()
INSERT INTO processed_images (document_id, user_id, original_image_path, processed_image_path, processing_parameters, processing_steps, image_width, image_height, file_size, created_at)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, NOW())
"#
)
.bind(document_id)
@ -592,6 +616,9 @@ impl OcrQueueService {
.bind(permanent_path.to_string_lossy().as_ref())
.bind(&processing_parameters)
.bind(processing_steps)
.bind(image_width as i32)
.bind(image_height as i32)
.bind(file_size)
.execute(&self.pool)
.await {
error!("Failed to save processed image metadata to database for document {}: {}", document_id, e);

View File

@ -8,9 +8,9 @@ use tower::util::ServiceExt;
pub async fn create_test_app() -> (Router, ContainerAsync<Postgres>) {
let postgres_image = Postgres::default()
.with_env_var(("POSTGRES_USER", "test"))
.with_env_var(("POSTGRES_PASSWORD", "test"))
.with_env_var(("POSTGRES_DB", "test"));
.with_env_var("POSTGRES_USER", "test")
.with_env_var("POSTGRES_PASSWORD", "test")
.with_env_var("POSTGRES_DB", "test");
let container = postgres_image.start().await.expect("Failed to start postgres container");
let port = container.get_host_port_ipv4(5432).await.expect("Failed to get postgres port");
@ -41,11 +41,14 @@ pub async fn create_test_app() -> (Router, ContainerAsync<Postgres>) {
cpu_priority: "normal".to_string(),
};
let queue_service = Arc::new(crate::ocr_queue::OcrQueueService::new(db.clone(), db.pool.clone(), 2));
let state = Arc::new(AppState {
db,
config,
webdav_scheduler: None,
source_scheduler: None,
queue_service,
});
let app = Router::new()

View File

@ -33,17 +33,19 @@ async fn create_test_app_state() -> Arc<AppState> {
database_url: "sqlite::memory:".to_string(),
server_address: "127.0.0.1:8080".to_string(),
jwt_secret: "test_secret".to_string(),
upload_dir: "/tmp/test_uploads".to_string(),
max_file_size: 10 * 1024 * 1024,
upload_path: "/tmp/test_uploads".to_string(),
max_file_size_mb: 10 * 1024 * 1024,
};
let db = Database::new(&config.database_url).await.unwrap();
let queue_service = Arc::new(readur::ocr_queue::OcrQueueService::new(db.clone(), db.pool.clone(), 2));
Arc::new(AppState {
db,
config,
webdav_scheduler: None,
source_scheduler: None,
queue_service,
})
}

View File

@ -25,7 +25,7 @@ use readur::{
/// Create a test local folder configuration
fn create_test_local_config() -> LocalFolderSourceConfig {
LocalFolderSourceConfig {
paths: vec!["/test/documents".to_string(), "/test/images".to_string()],
watch_folders: vec!["/test/documents".to_string(), "/test/images".to_string()],
recursive: true,
follow_symlinks: false,
auto_sync: true,
@ -60,9 +60,9 @@ fn create_test_directory_structure() -> Result<TempDir, std::io::Error> {
fn test_local_folder_config_creation() {
let config = create_test_local_config();
assert_eq!(config.paths.len(), 2);
assert_eq!(config.paths[0], "/test/documents");
assert_eq!(config.paths[1], "/test/images");
assert_eq!(config.watch_folders.len(), 2);
assert_eq!(config.watch_folders[0], "/test/documents");
assert_eq!(config.watch_folders[1], "/test/images");
assert!(config.recursive);
assert!(!config.follow_symlinks);
assert!(config.auto_sync);
@ -75,8 +75,8 @@ fn test_local_folder_config_validation() {
let config = create_test_local_config();
// Test paths validation
assert!(!config.paths.is_empty(), "Should have at least one path");
for path in &config.paths {
assert!(!config.watch_folders.is_empty(), "Should have at least one path");
for path in &config.watch_folders {
assert!(Path::new(path).is_absolute() || path.starts_with('.'),
"Path should be absolute or relative: {}", path);
}
@ -328,7 +328,7 @@ fn test_error_handling() {
// Non-existent path
let non_existent_config = LocalFolderSourceConfig {
paths: vec!["/this/path/does/not/exist".to_string()],
watch_folders: vec!["/this/path/does/not/exist".to_string()],
recursive: true,
follow_symlinks: false,
auto_sync: true,
@ -336,11 +336,11 @@ fn test_error_handling() {
file_extensions: vec![".txt".to_string()],
};
assert_eq!(non_existent_config.paths[0], "/this/path/does/not/exist");
assert_eq!(non_existent_config.watch_folders[0], "/this/path/does/not/exist");
// Empty paths
let empty_paths_config = LocalFolderSourceConfig {
paths: Vec::new(),
watch_folders: Vec::new(),
recursive: true,
follow_symlinks: false,
auto_sync: true,
@ -348,11 +348,11 @@ fn test_error_handling() {
file_extensions: vec![".txt".to_string()],
};
assert!(empty_paths_config.paths.is_empty());
assert!(empty_paths_config.watch_folders.is_empty());
// Invalid sync interval
let invalid_interval_config = LocalFolderSourceConfig {
paths: vec!["/test".to_string()],
watch_folders: vec!["/test".to_string()],
recursive: true,
follow_symlinks: false,
auto_sync: true,

View File

@ -39,11 +39,13 @@ async fn create_test_app_state() -> Arc<AppState> {
let db = Database::new(&config.database_url).await.unwrap();
let queue_service = Arc::new(readur::ocr_queue::OcrQueueService::new(db.clone(), db.pool.clone(), 2));
Arc::new(AppState {
db,
config,
webdav_scheduler: None,
source_scheduler: None,
queue_service,
})
}

View File

@ -118,17 +118,19 @@ async fn create_test_app_state() -> Arc<AppState> {
database_url: "sqlite::memory:".to_string(),
server_address: "127.0.0.1:8080".to_string(),
jwt_secret: "test_secret".to_string(),
upload_dir: "/tmp/test_uploads".to_string(),
max_file_size: 10 * 1024 * 1024,
upload_path: "/tmp/test_uploads".to_string(),
max_file_size_mb: 10 * 1024 * 1024,
};
let db = Database::new(&config.database_url).await.unwrap();
let queue_service = Arc::new(readur::ocr_queue::OcrQueueService::new(db.clone(), db.pool.clone(), 2));
Arc::new(AppState {
db,
config,
webdav_scheduler: None,
source_scheduler: None,
queue_service,
})
}
@ -181,8 +183,8 @@ fn test_config_parsing_local_folder() {
assert!(config.is_ok(), "Local Folder config should parse successfully");
let local_config = config.unwrap();
assert_eq!(local_config.paths.len(), 1);
assert_eq!(local_config.paths[0], "/home/user/documents");
assert_eq!(local_config.watch_folders.len(), 1);
assert_eq!(local_config.watch_folders[0], "/home/user/documents");
assert!(local_config.recursive);
assert!(!local_config.follow_symlinks);
assert_eq!(local_config.sync_interval_minutes, 30);

View File

@ -85,7 +85,14 @@ async fn setup_test_app() -> (Router, Arc<AppState>) {
};
let db = Database::new(&db_url).await.expect("Failed to connect to test database");
let state = Arc::new(AppState { db, config });
let queue_service = Arc::new(readur::ocr_queue::OcrQueueService::new(db.clone(), db.pool.clone(), 2));
let state = Arc::new(AppState {
db,
config,
webdav_scheduler: None,
source_scheduler: None,
queue_service,
});
let app = Router::new()
.nest("/api/auth", routes::auth::router())

View File

@ -29,7 +29,7 @@ fn create_test_webdav_config() -> WebDAVConfig {
watch_folders: vec!["/Documents".to_string(), "/Photos".to_string()],
file_extensions: vec![".pdf".to_string(), ".txt".to_string(), ".jpg".to_string()],
timeout_seconds: 30,
server_type: "nextcloud".to_string(),
server_type: Some("nextcloud".to_string()),
}
}
@ -43,7 +43,7 @@ fn create_test_source_config() -> WebDAVSourceConfig {
file_extensions: vec![".pdf".to_string(), ".txt".to_string()],
auto_sync: true,
sync_interval_minutes: 60,
server_type: "nextcloud".to_string(),
server_type: Some("nextcloud".to_string()),
}
}
@ -314,7 +314,7 @@ fn test_error_handling_scenarios() {
watch_folders: vec!["/test".to_string()],
file_extensions: vec![".pdf".to_string()],
timeout_seconds: 1, // Very short timeout
server_type: "nextcloud".to_string(),
server_type: Some("nextcloud".to_string()),
};
assert_eq!(timeout_config.timeout_seconds, 1);
@ -327,7 +327,7 @@ fn test_error_handling_scenarios() {
watch_folders: vec!["/test".to_string()],
file_extensions: vec![".pdf".to_string()],
timeout_seconds: 30,
server_type: "nextcloud".to_string(),
server_type: Some("nextcloud".to_string()),
};
assert_eq!(auth_config.username, "invalid_user");
@ -341,7 +341,7 @@ fn test_error_handling_scenarios() {
watch_folders: vec!["/nonexistent_folder".to_string()],
file_extensions: vec![".pdf".to_string()],
timeout_seconds: 30,
server_type: "nextcloud".to_string(),
server_type: Some("nextcloud".to_string()),
};
assert_eq!(invalid_path_config.watch_folders[0], "/nonexistent_folder");