feat(server): use poppler for pdf image generation
This commit is contained in:
parent
d7aca60733
commit
d278d50f3a
|
|
@ -1143,26 +1143,6 @@ dependencies = [
|
||||||
"crossbeam-utils",
|
"crossbeam-utils",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "console_error_panic_hook"
|
|
||||||
version = "0.1.7"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "a06aeb73f470f66dcdbf7223caeebb85984942f22f1adb2a088cf9668146bbbc"
|
|
||||||
dependencies = [
|
|
||||||
"cfg-if",
|
|
||||||
"wasm-bindgen",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "console_log"
|
|
||||||
version = "1.0.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "be8aed40e4edbf4d3b4431ab260b63fdc40f5780a4766824329ea0f1eefe3c0f"
|
|
||||||
dependencies = [
|
|
||||||
"log",
|
|
||||||
"web-sys",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "const-oid"
|
name = "const-oid"
|
||||||
version = "0.9.6"
|
version = "0.9.6"
|
||||||
|
|
@ -2784,12 +2764,6 @@ dependencies = [
|
||||||
"rawpointer",
|
"rawpointer",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "maybe-owned"
|
|
||||||
version = "0.3.4"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "4facc753ae494aeb6e3c22f839b158aebd4f9270f55cd3c79906c45476c47ab4"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "maybe-rayon"
|
name = "maybe-rayon"
|
||||||
version = "0.1.1"
|
version = "0.1.1"
|
||||||
|
|
@ -3296,32 +3270,6 @@ dependencies = [
|
||||||
"unicode-normalization",
|
"unicode-normalization",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "pdfium-render"
|
|
||||||
version = "0.8.33"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "dca4101838fe3cb35e5f9dd8b55569ea483e7a342a6853878a5e56fcfdb04cc5"
|
|
||||||
dependencies = [
|
|
||||||
"bitflags 2.9.1",
|
|
||||||
"bytemuck",
|
|
||||||
"bytes",
|
|
||||||
"chrono",
|
|
||||||
"console_error_panic_hook",
|
|
||||||
"console_log",
|
|
||||||
"image",
|
|
||||||
"itertools",
|
|
||||||
"js-sys",
|
|
||||||
"libloading",
|
|
||||||
"log",
|
|
||||||
"maybe-owned",
|
|
||||||
"once_cell",
|
|
||||||
"utf16string",
|
|
||||||
"vecmath",
|
|
||||||
"wasm-bindgen",
|
|
||||||
"wasm-bindgen-futures",
|
|
||||||
"web-sys",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "peeking_take_while"
|
name = "peeking_take_while"
|
||||||
version = "0.1.2"
|
version = "0.1.2"
|
||||||
|
|
@ -3365,12 +3313,6 @@ version = "0.1.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
|
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "piston-float"
|
|
||||||
version = "1.0.1"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "ad78bf43dcf80e8f950c92b84f938a0fc7590b7f6866fbcbeca781609c115590"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pkcs1"
|
name = "pkcs1"
|
||||||
version = "0.7.5"
|
version = "0.7.5"
|
||||||
|
|
@ -3719,7 +3661,6 @@ dependencies = [
|
||||||
"mime_guess",
|
"mime_guess",
|
||||||
"notify",
|
"notify",
|
||||||
"pdf-extract",
|
"pdf-extract",
|
||||||
"pdfium-render",
|
|
||||||
"quick-xml",
|
"quick-xml",
|
||||||
"raw-cpuid",
|
"raw-cpuid",
|
||||||
"regex",
|
"regex",
|
||||||
|
|
@ -5378,15 +5319,6 @@ version = "2.1.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"
|
checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "utf16string"
|
|
||||||
version = "0.2.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "0b62a1e85e12d5d712bf47a85f426b73d303e2d00a90de5f3004df3596e9d216"
|
|
||||||
dependencies = [
|
|
||||||
"byteorder",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "utf8_iter"
|
name = "utf8_iter"
|
||||||
version = "1.0.4"
|
version = "1.0.4"
|
||||||
|
|
@ -5477,15 +5409,6 @@ version = "0.2.15"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
|
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "vecmath"
|
|
||||||
version = "1.0.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "956ae1e0d85bca567dee1dcf87fb1ca2e792792f66f87dced8381f99cd91156a"
|
|
||||||
dependencies = [
|
|
||||||
"piston-float",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "version-compare"
|
name = "version-compare"
|
||||||
version = "0.2.0"
|
version = "0.2.0"
|
||||||
|
|
|
||||||
|
|
@ -35,7 +35,6 @@ notify = "8"
|
||||||
mime_guess = "2"
|
mime_guess = "2"
|
||||||
tesseract = { version = "0.15", optional = true }
|
tesseract = { version = "0.15", optional = true }
|
||||||
pdf-extract = { version = "0.9", optional = true }
|
pdf-extract = { version = "0.9", optional = true }
|
||||||
pdfium-render = { version = "0.8", optional = true }
|
|
||||||
image = { version = "0.25", features = ["png", "jpeg", "tiff", "bmp"], optional = true }
|
image = { version = "0.25", features = ["png", "jpeg", "tiff", "bmp"], optional = true }
|
||||||
imageproc = { version = "0.25", optional = true }
|
imageproc = { version = "0.25", optional = true }
|
||||||
thiserror = "2.0"
|
thiserror = "2.0"
|
||||||
|
|
@ -60,7 +59,7 @@ testcontainers-modules = { version = "0.12", features = ["postgres"], optional =
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
default = ["ocr", "s3"]
|
default = ["ocr", "s3"]
|
||||||
ocr = ["tesseract", "pdf-extract", "pdfium-render", "image", "imageproc", "raw-cpuid"]
|
ocr = ["tesseract", "pdf-extract", "image", "imageproc", "raw-cpuid"]
|
||||||
s3 = ["aws-config", "aws-sdk-s3", "aws-credential-types", "aws-types"]
|
s3 = ["aws-config", "aws-sdk-s3", "aws-credential-types", "aws-types"]
|
||||||
test-utils = ["testcontainers", "testcontainers-modules"]
|
test-utils = ["testcontainers", "testcontainers-modules"]
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ RUN npm run build
|
||||||
# --- Backend build stage ---
|
# --- Backend build stage ---
|
||||||
FROM rust:1.87-bookworm as backend-builder
|
FROM rust:1.87-bookworm as backend-builder
|
||||||
|
|
||||||
# Install system dependencies for OCR
|
# Install system dependencies for OCR and PDF processing
|
||||||
RUN apt-get update && apt-get install -y \
|
RUN apt-get update && apt-get install -y \
|
||||||
tesseract-ocr \
|
tesseract-ocr \
|
||||||
tesseract-ocr-eng \
|
tesseract-ocr-eng \
|
||||||
|
|
@ -19,6 +19,7 @@ RUN apt-get update && apt-get install -y \
|
||||||
pkg-config \
|
pkg-config \
|
||||||
libclang-dev \
|
libclang-dev \
|
||||||
clang \
|
clang \
|
||||||
|
poppler-utils \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
@ -35,6 +36,7 @@ RUN apt-get update && apt-get install -y \
|
||||||
tesseract-ocr \
|
tesseract-ocr \
|
||||||
tesseract-ocr-eng \
|
tesseract-ocr-eng \
|
||||||
ca-certificates \
|
ca-certificates \
|
||||||
|
poppler-utils \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
|
||||||
|
|
@ -320,61 +320,73 @@ impl FileService {
|
||||||
|
|
||||||
#[cfg(feature = "ocr")]
|
#[cfg(feature = "ocr")]
|
||||||
async fn generate_pdf_thumbnail(&self, file_data: &[u8]) -> Result<Vec<u8>> {
|
async fn generate_pdf_thumbnail(&self, file_data: &[u8]) -> Result<Vec<u8>> {
|
||||||
use pdfium_render::prelude::*;
|
use std::process::Command;
|
||||||
|
use tokio::fs;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
// Try to render first page as image using pdfium-render with panic protection
|
// Create a temporary file for the PDF
|
||||||
match catch_unwind(AssertUnwindSafe(|| {
|
let temp_id = Uuid::new_v4();
|
||||||
// Initialize pdfium
|
let temp_pdf_path = format!("/tmp/pdf_thumb_{}.pdf", temp_id);
|
||||||
let pdfium = Pdfium::new(
|
let temp_png_path = format!("/tmp/pdf_thumb_{}.png", temp_id);
|
||||||
Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))
|
|
||||||
.or_else(|_| Pdfium::bind_to_system_library())
|
// Write PDF data to temporary file
|
||||||
.unwrap_or_else(|_| Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name()).unwrap())
|
if let Err(e) = fs::write(&temp_pdf_path, file_data).await {
|
||||||
);
|
error!("Failed to write temporary PDF file: {}", e);
|
||||||
|
return self.generate_placeholder_thumbnail("PDF").await;
|
||||||
// Load PDF document from memory
|
}
|
||||||
let document = pdfium.load_pdf_from_byte_vec(file_data.to_vec(), None)?;
|
|
||||||
|
// Use pdftoppm to convert first page to PNG
|
||||||
// Get first page
|
let output = Command::new("pdftoppm")
|
||||||
let page = document.pages().first()?;
|
.arg("-f").arg("1") // First page only
|
||||||
|
.arg("-l").arg("1") // Last page (same as first)
|
||||||
// Render page to bitmap (200x200 thumbnail size)
|
.arg("-scale-to").arg("200") // Scale the longer side of the page to 200px
|
||||||
let bitmap = page.render_with_config(
|
.arg("-png") // Output as PNG
|
||||||
&PdfRenderConfig::new()
|
.arg(&temp_pdf_path)
|
||||||
.set_target_width(200)
|
.arg(&format!("/tmp/pdf_thumb_{}", temp_id)) // Output prefix
|
||||||
.set_maximum_height(200)
|
.output();
|
||||||
)?;
|
|
||||||
|
// Clean up temporary PDF file
|
||||||
// Convert to image format
|
let _ = fs::remove_file(&temp_pdf_path).await;
|
||||||
let width = bitmap.width() as u32;
|
|
||||||
let height = bitmap.height() as u32;
|
match output {
|
||||||
let buffer = bitmap.as_raw_bytes();
|
Ok(result) if result.status.success() => {
|
||||||
|
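// pdftoppm appends the page number to the output prefix: "-1" here, but zero-padded to the digit count of the document's total pages (e.g. "-01" for PDFs with 10+ pages)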
|
||||||
// Create RGB image from BGRA buffer
|
let actual_png_path = format!("/tmp/pdf_thumb_{}-1.png", temp_id);
|
||||||
let mut rgb_buffer = Vec::with_capacity((width * height * 3) as usize);
|
|
||||||
for chunk in buffer.chunks(4) {
|
// Read the generated PNG file
|
||||||
if chunk.len() >= 4 {
|
match fs::read(&actual_png_path).await {
|
||||||
// Convert BGRA to RGB
|
Ok(png_data) => {
|
||||||
rgb_buffer.push(chunk[2]); // R
|
// Clean up temporary PNG file
|
||||||
rgb_buffer.push(chunk[1]); // G
|
let _ = fs::remove_file(&actual_png_path).await;
|
||||||
rgb_buffer.push(chunk[0]); // B
|
|
||||||
|
// Convert PNG to JPEG thumbnail
|
||||||
|
match image::load_from_memory(&png_data) {
|
||||||
|
Ok(img) => {
|
||||||
|
// Resize to 200x200 maintaining aspect ratio
|
||||||
|
let thumbnail = img.resize(200, 200, image::imageops::FilterType::Lanczos3);
|
||||||
|
|
||||||
|
// Convert to JPEG
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
let mut cursor = std::io::Cursor::new(&mut buffer);
|
||||||
|
if thumbnail.write_to(&mut cursor, ImageFormat::Jpeg).is_ok() {
|
||||||
|
Ok(buffer)
|
||||||
|
} else {
|
||||||
|
self.generate_placeholder_thumbnail("PDF").await
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(_) => self.generate_placeholder_thumbnail("PDF").await,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(_) => {
|
||||||
|
let _ = fs::remove_file(&actual_png_path).await;
|
||||||
|
self.generate_placeholder_thumbnail("PDF").await
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
_ => {
|
||||||
let img = image::ImageBuffer::from_raw(width, height, rgb_buffer)
|
// Clean up any potential PNG files
|
||||||
.ok_or_else(|| anyhow::anyhow!("Failed to create image from buffer"))?;
|
let _ = fs::remove_file(&temp_png_path).await;
|
||||||
|
let _ = fs::remove_file(&format!("/tmp/pdf_thumb_{}-1.png", temp_id)).await;
|
||||||
let dynamic_img = image::DynamicImage::ImageRgb8(img);
|
|
||||||
|
|
||||||
// Convert to JPEG
|
|
||||||
let mut buffer = Vec::new();
|
|
||||||
let mut cursor = std::io::Cursor::new(&mut buffer);
|
|
||||||
dynamic_img.write_to(&mut cursor, image::ImageFormat::Jpeg)?;
|
|
||||||
|
|
||||||
Ok(buffer) as anyhow::Result<Vec<u8>>
|
|
||||||
})) {
|
|
||||||
Ok(Ok(thumbnail)) => Ok(thumbnail),
|
|
||||||
Ok(Err(_)) | Err(_) => {
|
|
||||||
// Fall back to placeholder if PDF rendering fails or panics
|
|
||||||
self.generate_placeholder_thumbnail("PDF").await
|
self.generate_placeholder_thumbnail("PDF").await
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue