From a94acd7ffef1e776dae34becc40cf4a71214b19f Mon Sep 17 00:00:00 2001 From: perf3ct Date: Thu, 26 Jun 2025 20:25:52 +0000 Subject: [PATCH 1/2] feat(server): actually render PDF thumbnails? --- Cargo.lock | 81 +++++++++++++++++++++++++++++++++++++++++++-- Cargo.toml | 3 +- src/file_service.rs | 59 ++++++++++++++++++++++++++++----- 3 files changed, 132 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d2a65fa..fd83b0a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1143,6 +1143,26 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "console_error_panic_hook" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06aeb73f470f66dcdbf7223caeebb85984942f22f1adb2a088cf9668146bbbc" +dependencies = [ + "cfg-if", + "wasm-bindgen", +] + +[[package]] +name = "console_log" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be8aed40e4edbf4d3b4431ab260b63fdc40f5780a4766824329ea0f1eefe3c0f" +dependencies = [ + "log", + "web-sys", +] + [[package]] name = "const-oid" version = "0.9.6" @@ -2621,7 +2641,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" dependencies = [ "cfg-if", - "windows-targets 0.48.5", + "windows-targets 0.52.6", ] [[package]] @@ -2764,6 +2784,12 @@ dependencies = [ "rawpointer", ] +[[package]] +name = "maybe-owned" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4facc753ae494aeb6e3c22f839b158aebd4f9270f55cd3c79906c45476c47ab4" + [[package]] name = "maybe-rayon" version = "0.1.1" @@ -3270,6 +3296,32 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "pdfium-render" +version = "0.8.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dca4101838fe3cb35e5f9dd8b55569ea483e7a342a6853878a5e56fcfdb04cc5" +dependencies = [ + "bitflags 2.9.1", + "bytemuck", + "bytes", + "chrono", + "console_error_panic_hook", + "console_log", + "image", + "itertools", + "js-sys", + "libloading", + "log", + "maybe-owned", + "once_cell", + "utf16string", + "vecmath", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "peeking_take_while" version = "0.1.2" @@ -3313,6 +3365,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "piston-float" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad78bf43dcf80e8f950c92b84f938a0fc7590b7f6866fbcbeca781609c115590" + [[package]] name = "pkcs1" version = "0.7.5" @@ -3661,6 +3719,7 @@ dependencies = [ "mime_guess", "notify", "pdf-extract", + "pdfium-render", "quick-xml", "raw-cpuid", "regex", @@ -5319,6 +5378,15 @@ version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" +[[package]] +name = "utf16string" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b62a1e85e12d5d712bf47a85f426b73d303e2d00a90de5f3004df3596e9d216" +dependencies = [ + "byteorder", +] + [[package]] name = "utf8_iter" version = "1.0.4" @@ -5409,6 +5477,15 @@ version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" +[[package]] +name = "vecmath" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "956ae1e0d85bca567dee1dcf87fb1ca2e792792f66f87dced8381f99cd91156a" +dependencies = [ + "piston-float", +] + [[package]] name = "version-compare" version = "0.2.0" @@ -5626,7 +5703,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index b9107ce..851fe81 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,6 +35,7 @@ notify = "8" mime_guess = "2" tesseract = { version = "0.15", optional = true } pdf-extract = { version = "0.9", optional = true } +pdfium-render = { version = "0.8", optional = true } image = { version = "0.25", features = ["png", "jpeg", "tiff", "bmp"], optional = true } imageproc = { version = "0.25", optional = true } thiserror = "2.0" @@ -59,7 +60,7 @@ testcontainers-modules = { version = "0.12", features = ["postgres"], optional = [features] default = ["ocr", "s3"] -ocr = ["tesseract", "pdf-extract", "image", "imageproc", "raw-cpuid"] +ocr = ["tesseract", "pdf-extract", "pdfium-render", "image", "imageproc", "raw-cpuid"] s3 = ["aws-config", "aws-sdk-s3", "aws-credential-types", "aws-types"] test-utils = ["testcontainers", "testcontainers-modules"] diff --git a/src/file_service.rs b/src/file_service.rs index 350ee32..409ce26 100644 --- a/src/file_service.rs +++ b/src/file_service.rs @@ -320,18 +320,61 @@ impl FileService { #[cfg(feature = "ocr")] async fn generate_pdf_thumbnail(&self, file_data: &[u8]) -> Result> { - use image::Rgb; + use pdfium_render::prelude::*; - // Try to extract first page as image using pdf-extract with panic protection + // Try to render first page as image using pdfium-render with panic protection match catch_unwind(AssertUnwindSafe(|| { - pdf_extract::extract_text_from_mem(file_data) - })) { - Ok(Ok(text)) => { - // If we can extract text, create a text-based thumbnail - self.generate_text_based_thumbnail(&text, "PDF", Rgb([220, 38, 27])).await + // Initialize pdfium + let pdfium = Pdfium::new( + Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./")) + .or_else(|_| Pdfium::bind_to_system_library()) + .unwrap_or_else(|_| Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name()).unwrap()) + ); + + // Load PDF document from memory + let document = pdfium.load_pdf_from_byte_vec(file_data.to_vec(), None)?; + + // Get first page + let page = document.pages().first()?; + + // Render page to bitmap (200x200 thumbnail size) + let bitmap = page.render_with_config( + &PdfRenderConfig::new() + .set_target_width(200) + .set_maximum_height(200) + )?; + + // Convert to image format + let width = bitmap.width() as u32; + let height = bitmap.height() as u32; + let buffer = bitmap.as_raw_bytes(); + + // Create RGB image from BGRA buffer + let mut rgb_buffer = Vec::with_capacity((width * height * 3) as usize); + for chunk in buffer.chunks(4) { + if chunk.len() >= 4 { + // Convert BGRA to RGB + rgb_buffer.push(chunk[2]); // R + rgb_buffer.push(chunk[1]); // G + rgb_buffer.push(chunk[0]); // B + } } + + let img = image::ImageBuffer::from_raw(width, height, rgb_buffer) + .ok_or_else(|| anyhow::anyhow!("Failed to create image from buffer"))?; + + let dynamic_img = image::DynamicImage::ImageRgb8(img); + + // Convert to JPEG + let mut buffer = Vec::new(); + let mut cursor = std::io::Cursor::new(&mut buffer); + dynamic_img.write_to(&mut cursor, image::ImageFormat::Jpeg)?; + + Ok(buffer) as anyhow::Result> + })) { + Ok(Ok(thumbnail)) => Ok(thumbnail), Ok(Err(_)) | Err(_) => { - // Fall back to placeholder if PDF extraction fails or panics + // Fall back to placeholder if PDF rendering fails or panics self.generate_placeholder_thumbnail("PDF").await } } From 075657899fe5988b07042e60438f0fccb6215c27 Mon Sep 17 00:00:00 2001 From: perf3ct Date: Thu, 26 Jun 2025 20:39:42 +0000 Subject: [PATCH 2/2] feat(server): use poppler for pdf image generation --- Cargo.lock | 77 ----------------------------- Cargo.toml | 3 +- Dockerfile | 4 +- src/file_service.rs | 116 ++++++++++++++++++++++++-------------------- 4 files changed, 68 insertions(+), 132 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fd83b0a..818f5cb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1143,26 +1143,6 @@ dependencies = [ "crossbeam-utils", ] -[[package]] -name = "console_error_panic_hook" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a06aeb73f470f66dcdbf7223caeebb85984942f22f1adb2a088cf9668146bbbc" -dependencies = [ - "cfg-if", - "wasm-bindgen", -] - -[[package]] -name = "console_log" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be8aed40e4edbf4d3b4431ab260b63fdc40f5780a4766824329ea0f1eefe3c0f" -dependencies = [ - "log", - "web-sys", -] - [[package]] name = "const-oid" version = "0.9.6" @@ -2784,12 +2764,6 @@ dependencies = [ "rawpointer", ] -[[package]] -name = "maybe-owned" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4facc753ae494aeb6e3c22f839b158aebd4f9270f55cd3c79906c45476c47ab4" - [[package]] name = "maybe-rayon" version = "0.1.1" @@ -3296,32 +3270,6 @@ dependencies = [ "unicode-normalization", ] -[[package]] -name = "pdfium-render" -version = "0.8.33" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dca4101838fe3cb35e5f9dd8b55569ea483e7a342a6853878a5e56fcfdb04cc5" -dependencies = [ - "bitflags 2.9.1", - "bytemuck", - "bytes", - "chrono", - "console_error_panic_hook", - "console_log", - "image", - "itertools", - "js-sys", - "libloading", - "log", - "maybe-owned", - "once_cell", - "utf16string", - "vecmath", - "wasm-bindgen", - "wasm-bindgen-futures", - "web-sys", -] - [[package]] name = "peeking_take_while" version = "0.1.2" @@ -3365,12 +3313,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" -[[package]] -name = "piston-float" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad78bf43dcf80e8f950c92b84f938a0fc7590b7f6866fbcbeca781609c115590" - [[package]] name = "pkcs1" version = "0.7.5" @@ -3719,7 +3661,6 @@ dependencies = [ "mime_guess", "notify", "pdf-extract", - "pdfium-render", "quick-xml", "raw-cpuid", "regex", @@ -5378,15 +5319,6 @@ version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" -[[package]] -name = "utf16string" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b62a1e85e12d5d712bf47a85f426b73d303e2d00a90de5f3004df3596e9d216" -dependencies = [ - "byteorder", -] - [[package]] name = "utf8_iter" version = "1.0.4" @@ -5477,15 +5409,6 @@ version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" -[[package]] -name = "vecmath" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "956ae1e0d85bca567dee1dcf87fb1ca2e792792f66f87dced8381f99cd91156a" -dependencies = [ - "piston-float", -] - [[package]] name = "version-compare" version = "0.2.0" diff --git a/Cargo.toml b/Cargo.toml index 851fe81..b9107ce 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,7 +35,6 @@ notify = "8" mime_guess = "2" tesseract = { version = "0.15", optional = true } pdf-extract = { version = "0.9", optional = true } -pdfium-render = { version = "0.8", optional = true } image = { version = "0.25", features = ["png", "jpeg", "tiff", "bmp"], optional = true } imageproc = { version = "0.25", optional = true } thiserror = "2.0" @@ -60,7 +59,7 @@ testcontainers-modules = { version = "0.12", features = ["postgres"], optional = [features] default = ["ocr", "s3"] -ocr = ["tesseract", "pdf-extract", "pdfium-render", "image", "imageproc", "raw-cpuid"] +ocr = ["tesseract", "pdf-extract", "image", "imageproc", "raw-cpuid"] s3 = ["aws-config", "aws-sdk-s3", "aws-credential-types", "aws-types"] test-utils = ["testcontainers", "testcontainers-modules"] diff --git a/Dockerfile b/Dockerfile index 7179187..40f166e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,7 +10,7 @@ RUN npm run build # --- Backend build stage --- FROM rust:1.87-bookworm as backend-builder -# Install system dependencies for OCR +# Install system dependencies for OCR and PDF processing RUN apt-get update && apt-get install -y \ tesseract-ocr \ tesseract-ocr-eng \ @@ -19,6 +19,7 @@ RUN apt-get update && apt-get install -y \ pkg-config \ libclang-dev \ clang \ + poppler-utils \ && rm -rf /var/lib/apt/lists/* WORKDIR /app @@ -35,6 +36,7 @@ RUN apt-get update && apt-get install -y \ tesseract-ocr \ tesseract-ocr-eng \ ca-certificates \ + poppler-utils \ && rm -rf /var/lib/apt/lists/* WORKDIR /app diff --git a/src/file_service.rs b/src/file_service.rs index 409ce26..67d59c6 100644 --- a/src/file_service.rs +++ b/src/file_service.rs @@ -320,61 +320,73 @@ impl FileService { #[cfg(feature = "ocr")] async fn generate_pdf_thumbnail(&self, file_data: &[u8]) -> Result> { - use pdfium_render::prelude::*; + use std::process::Command; + use tokio::fs; + use uuid::Uuid; - // Try to render first page as image using pdfium-render with panic protection - match catch_unwind(AssertUnwindSafe(|| { - // Initialize pdfium - let pdfium = Pdfium::new( - Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./")) - .or_else(|_| Pdfium::bind_to_system_library()) - .unwrap_or_else(|_| Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name()).unwrap()) - ); - - // Load PDF document from memory - let document = pdfium.load_pdf_from_byte_vec(file_data.to_vec(), None)?; - - // Get first page - let page = document.pages().first()?; - - // Render page to bitmap (200x200 thumbnail size) - let bitmap = page.render_with_config( - &PdfRenderConfig::new() - .set_target_width(200) - .set_maximum_height(200) - )?; - - // Convert to image format - let width = bitmap.width() as u32; - let height = bitmap.height() as u32; - let buffer = bitmap.as_raw_bytes(); - - // Create RGB image from BGRA buffer - let mut rgb_buffer = Vec::with_capacity((width * height * 3) as usize); - for chunk in buffer.chunks(4) { - if chunk.len() >= 4 { - // Convert BGRA to RGB - rgb_buffer.push(chunk[2]); // R - rgb_buffer.push(chunk[1]); // G - rgb_buffer.push(chunk[0]); // B + // Create a temporary file for the PDF + let temp_id = Uuid::new_v4(); + let temp_pdf_path = format!("/tmp/pdf_thumb_{}.pdf", temp_id); + let temp_png_path = format!("/tmp/pdf_thumb_{}.png", temp_id); + + // Write PDF data to temporary file + if let Err(e) = fs::write(&temp_pdf_path, file_data).await { + error!("Failed to write temporary PDF file: {}", e); + return self.generate_placeholder_thumbnail("PDF").await; + } + + // Use pdftoppm to convert first page to PNG + let output = Command::new("pdftoppm") + .arg("-f").arg("1") // First page only + .arg("-l").arg("1") // Last page (same as first) + .arg("-scale-to").arg("200") // Scale to 200px width + .arg("-png") // Output as PNG + .arg(&temp_pdf_path) + .arg(&format!("/tmp/pdf_thumb_{}", temp_id)) // Output prefix + .output(); + + // Clean up temporary PDF file + let _ = fs::remove_file(&temp_pdf_path).await; + + match output { + Ok(result) if result.status.success() => { + // pdftoppm adds "-1" to the filename for the first page + let actual_png_path = format!("/tmp/pdf_thumb_{}-1.png", temp_id); + + // Read the generated PNG file + match fs::read(&actual_png_path).await { + Ok(png_data) => { + // Clean up temporary PNG file + let _ = fs::remove_file(&actual_png_path).await; + + // Convert PNG to JPEG thumbnail + match image::load_from_memory(&png_data) { + Ok(img) => { + // Resize to 200x200 maintaining aspect ratio + let thumbnail = img.resize(200, 200, image::imageops::FilterType::Lanczos3); + + // Convert to JPEG + let mut buffer = Vec::new(); + let mut cursor = std::io::Cursor::new(&mut buffer); + if thumbnail.write_to(&mut cursor, ImageFormat::Jpeg).is_ok() { + Ok(buffer) + } else { + self.generate_placeholder_thumbnail("PDF").await + } + } + Err(_) => self.generate_placeholder_thumbnail("PDF").await, + } + } + Err(_) => { + let _ = fs::remove_file(&actual_png_path).await; + self.generate_placeholder_thumbnail("PDF").await + } } } - - let img = image::ImageBuffer::from_raw(width, height, rgb_buffer) - .ok_or_else(|| anyhow::anyhow!("Failed to create image from buffer"))?; - - let dynamic_img = image::DynamicImage::ImageRgb8(img); - - // Convert to JPEG - let mut buffer = Vec::new(); - let mut cursor = std::io::Cursor::new(&mut buffer); - dynamic_img.write_to(&mut cursor, image::ImageFormat::Jpeg)?; - - Ok(buffer) as anyhow::Result> - })) { - Ok(Ok(thumbnail)) => Ok(thumbnail), - Ok(Err(_)) | Err(_) => { - // Fall back to placeholder if PDF rendering fails or panics + _ => { + // Clean up any potential PNG files + let _ = fs::remove_file(&temp_png_path).await; + let _ = fs::remove_file(&format!("/tmp/pdf_thumb_{}-1.png", temp_id)).await; self.generate_placeholder_thumbnail("PDF").await } }