feat(server): actually render PDF thumbnails?

This commit is contained in:
perf3ct 2025-06-26 20:25:52 +00:00
parent 7df2f98b5d
commit a94acd7ffe
3 changed files with 132 additions and 11 deletions

81
Cargo.lock generated
View File

@ -1143,6 +1143,26 @@ dependencies = [
"crossbeam-utils",
]
[[package]]
name = "console_error_panic_hook"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a06aeb73f470f66dcdbf7223caeebb85984942f22f1adb2a088cf9668146bbbc"
dependencies = [
"cfg-if",
"wasm-bindgen",
]
[[package]]
name = "console_log"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be8aed40e4edbf4d3b4431ab260b63fdc40f5780a4766824329ea0f1eefe3c0f"
dependencies = [
"log",
"web-sys",
]
[[package]]
name = "const-oid"
version = "0.9.6"
@ -2621,7 +2641,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667"
dependencies = [
"cfg-if",
"windows-targets 0.48.5",
"windows-targets 0.52.6",
]
[[package]]
@ -2764,6 +2784,12 @@ dependencies = [
"rawpointer",
]
[[package]]
name = "maybe-owned"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4facc753ae494aeb6e3c22f839b158aebd4f9270f55cd3c79906c45476c47ab4"
[[package]]
name = "maybe-rayon"
version = "0.1.1"
@ -3270,6 +3296,32 @@ dependencies = [
"unicode-normalization",
]
[[package]]
name = "pdfium-render"
version = "0.8.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dca4101838fe3cb35e5f9dd8b55569ea483e7a342a6853878a5e56fcfdb04cc5"
dependencies = [
"bitflags 2.9.1",
"bytemuck",
"bytes",
"chrono",
"console_error_panic_hook",
"console_log",
"image",
"itertools",
"js-sys",
"libloading",
"log",
"maybe-owned",
"once_cell",
"utf16string",
"vecmath",
"wasm-bindgen",
"wasm-bindgen-futures",
"web-sys",
]
[[package]]
name = "peeking_take_while"
version = "0.1.2"
@ -3313,6 +3365,12 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "piston-float"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad78bf43dcf80e8f950c92b84f938a0fc7590b7f6866fbcbeca781609c115590"
[[package]]
name = "pkcs1"
version = "0.7.5"
@ -3661,6 +3719,7 @@ dependencies = [
"mime_guess",
"notify",
"pdf-extract",
"pdfium-render",
"quick-xml",
"raw-cpuid",
"regex",
@ -5319,6 +5378,15 @@ version = "2.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"
[[package]]
name = "utf16string"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b62a1e85e12d5d712bf47a85f426b73d303e2d00a90de5f3004df3596e9d216"
dependencies = [
"byteorder",
]
[[package]]
name = "utf8_iter"
version = "1.0.4"
@ -5409,6 +5477,15 @@ version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
[[package]]
name = "vecmath"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "956ae1e0d85bca567dee1dcf87fb1ca2e792792f66f87dced8381f99cd91156a"
dependencies = [
"piston-float",
]
[[package]]
name = "version-compare"
version = "0.2.0"
@ -5626,7 +5703,7 @@ version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
dependencies = [
"windows-sys 0.48.0",
"windows-sys 0.59.0",
]
[[package]]

View File

@ -35,6 +35,7 @@ notify = "8"
mime_guess = "2"
tesseract = { version = "0.15", optional = true }
pdf-extract = { version = "0.9", optional = true }
pdfium-render = { version = "0.8", optional = true }
image = { version = "0.25", features = ["png", "jpeg", "tiff", "bmp"], optional = true }
imageproc = { version = "0.25", optional = true }
thiserror = "2.0"
@ -59,7 +60,7 @@ testcontainers-modules = { version = "0.12", features = ["postgres"], optional =
[features]
default = ["ocr", "s3"]
ocr = ["tesseract", "pdf-extract", "image", "imageproc", "raw-cpuid"]
ocr = ["tesseract", "pdf-extract", "pdfium-render", "image", "imageproc", "raw-cpuid"]
s3 = ["aws-config", "aws-sdk-s3", "aws-credential-types", "aws-types"]
test-utils = ["testcontainers", "testcontainers-modules"]

View File

@ -320,18 +320,61 @@ impl FileService {
#[cfg(feature = "ocr")]
async fn generate_pdf_thumbnail(&self, file_data: &[u8]) -> Result<Vec<u8>> {
use image::Rgb;
use pdfium_render::prelude::*;
// Try to extract first page as image using pdf-extract with panic protection
// Try to render first page as image using pdfium-render with panic protection
match catch_unwind(AssertUnwindSafe(|| {
pdf_extract::extract_text_from_mem(file_data)
})) {
Ok(Ok(text)) => {
// If we can extract text, create a text-based thumbnail
self.generate_text_based_thumbnail(&text, "PDF", Rgb([220, 38, 27])).await
// Initialize pdfium
let pdfium = Pdfium::new(
Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))
.or_else(|_| Pdfium::bind_to_system_library())
.unwrap_or_else(|_| Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name()).unwrap())
);
// Load PDF document from memory
let document = pdfium.load_pdf_from_byte_vec(file_data.to_vec(), None)?;
// Get first page
let page = document.pages().first()?;
// Render page to bitmap (target width 200px, height capped at 200px, so the thumbnail fits within 200x200)
let bitmap = page.render_with_config(
&PdfRenderConfig::new()
.set_target_width(200)
.set_maximum_height(200)
)?;
// Convert to image format
let width = bitmap.width() as u32;
let height = bitmap.height() as u32;
let buffer = bitmap.as_raw_bytes();
// Create RGB image from BGRA buffer
let mut rgb_buffer = Vec::with_capacity((width * height * 3) as usize);
for chunk in buffer.chunks(4) {
if chunk.len() >= 4 {
// Convert BGRA to RGB
rgb_buffer.push(chunk[2]); // R
rgb_buffer.push(chunk[1]); // G
rgb_buffer.push(chunk[0]); // B
}
}
let img = image::ImageBuffer::from_raw(width, height, rgb_buffer)
.ok_or_else(|| anyhow::anyhow!("Failed to create image from buffer"))?;
let dynamic_img = image::DynamicImage::ImageRgb8(img);
// Convert to JPEG
let mut buffer = Vec::new();
let mut cursor = std::io::Cursor::new(&mut buffer);
dynamic_img.write_to(&mut cursor, image::ImageFormat::Jpeg)?;
Ok(buffer) as anyhow::Result<Vec<u8>>
})) {
Ok(Ok(thumbnail)) => Ok(thumbnail),
Ok(Err(_)) | Err(_) => {
// Fall back to placeholder if PDF extraction fails or panics
// Fall back to placeholder if PDF rendering fails or panics
self.generate_placeholder_thumbnail("PDF").await
}
}