Merge pull request #56 from readur/fix/pdf-thumbnail-generation
feat(server): actually render PDF thumbnails
This commit is contained in:
commit
269ba4d46a
|
|
@ -2621,7 +2621,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667"
|
checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
"windows-targets 0.48.5",
|
"windows-targets 0.52.6",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
@ -5626,7 +5626,7 @@ version = "0.1.9"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
|
checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"windows-sys 0.48.0",
|
"windows-sys 0.59.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ RUN npm run build
|
||||||
# --- Backend build stage ---
|
# --- Backend build stage ---
|
||||||
FROM rust:1.87-bookworm as backend-builder
|
FROM rust:1.87-bookworm as backend-builder
|
||||||
|
|
||||||
# Install system dependencies for OCR
|
# Install system dependencies for OCR and PDF processing
|
||||||
RUN apt-get update && apt-get install -y \
|
RUN apt-get update && apt-get install -y \
|
||||||
tesseract-ocr \
|
tesseract-ocr \
|
||||||
tesseract-ocr-eng \
|
tesseract-ocr-eng \
|
||||||
|
|
@ -19,6 +19,7 @@ RUN apt-get update && apt-get install -y \
|
||||||
pkg-config \
|
pkg-config \
|
||||||
libclang-dev \
|
libclang-dev \
|
||||||
clang \
|
clang \
|
||||||
|
poppler-utils \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
@ -35,6 +36,7 @@ RUN apt-get update && apt-get install -y \
|
||||||
tesseract-ocr \
|
tesseract-ocr \
|
||||||
tesseract-ocr-eng \
|
tesseract-ocr-eng \
|
||||||
ca-certificates \
|
ca-certificates \
|
||||||
|
poppler-utils \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
|
||||||
|
|
@ -320,18 +320,73 @@ impl FileService {
|
||||||
|
|
||||||
#[cfg(feature = "ocr")]
|
#[cfg(feature = "ocr")]
|
||||||
async fn generate_pdf_thumbnail(&self, file_data: &[u8]) -> Result<Vec<u8>> {
|
async fn generate_pdf_thumbnail(&self, file_data: &[u8]) -> Result<Vec<u8>> {
|
||||||
use image::Rgb;
|
use std::process::Command;
|
||||||
|
use tokio::fs;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
// Try to extract first page as image using pdf-extract with panic protection
|
// Create a temporary file for the PDF
|
||||||
match catch_unwind(AssertUnwindSafe(|| {
|
let temp_id = Uuid::new_v4();
|
||||||
pdf_extract::extract_text_from_mem(file_data)
|
let temp_pdf_path = format!("/tmp/pdf_thumb_{}.pdf", temp_id);
|
||||||
})) {
|
let temp_png_path = format!("/tmp/pdf_thumb_{}.png", temp_id);
|
||||||
Ok(Ok(text)) => {
|
|
||||||
// If we can extract text, create a text-based thumbnail
|
// Write PDF data to temporary file
|
||||||
self.generate_text_based_thumbnail(&text, "PDF", Rgb([220, 38, 27])).await
|
if let Err(e) = fs::write(&temp_pdf_path, file_data).await {
|
||||||
|
error!("Failed to write temporary PDF file: {}", e);
|
||||||
|
return self.generate_placeholder_thumbnail("PDF").await;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use pdftoppm to convert first page to PNG
|
||||||
|
let output = Command::new("pdftoppm")
|
||||||
|
.arg("-f").arg("1") // First page only
|
||||||
|
.arg("-l").arg("1") // Last page (same as first)
|
||||||
|
.arg("-scale-to").arg("200") // Scale the longer side of the page to 200px
|
||||||
|
.arg("-png") // Output as PNG
|
||||||
|
.arg(&temp_pdf_path)
|
||||||
|
.arg(&format!("/tmp/pdf_thumb_{}", temp_id)) // Output prefix
|
||||||
|
.output();
|
||||||
|
|
||||||
|
// Clean up temporary PDF file
|
||||||
|
let _ = fs::remove_file(&temp_pdf_path).await;
|
||||||
|
|
||||||
|
match output {
|
||||||
|
Ok(result) if result.status.success() => {
|
||||||
|
// pdftoppm appends the page number, zero-padded to the width of the document's page count ("-1" only when the PDF has fewer than 10 pages)
|
||||||
|
let actual_png_path = format!("/tmp/pdf_thumb_{}-1.png", temp_id);
|
||||||
|
|
||||||
|
// Read the generated PNG file
|
||||||
|
match fs::read(&actual_png_path).await {
|
||||||
|
Ok(png_data) => {
|
||||||
|
// Clean up temporary PNG file
|
||||||
|
let _ = fs::remove_file(&actual_png_path).await;
|
||||||
|
|
||||||
|
// Convert PNG to JPEG thumbnail
|
||||||
|
match image::load_from_memory(&png_data) {
|
||||||
|
Ok(img) => {
|
||||||
|
// Resize to 200x200 maintaining aspect ratio
|
||||||
|
let thumbnail = img.resize(200, 200, image::imageops::FilterType::Lanczos3);
|
||||||
|
|
||||||
|
// Convert to JPEG
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
let mut cursor = std::io::Cursor::new(&mut buffer);
|
||||||
|
if thumbnail.write_to(&mut cursor, ImageFormat::Jpeg).is_ok() {
|
||||||
|
Ok(buffer)
|
||||||
|
} else {
|
||||||
|
self.generate_placeholder_thumbnail("PDF").await
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(_) => self.generate_placeholder_thumbnail("PDF").await,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(_) => {
|
||||||
|
let _ = fs::remove_file(&actual_png_path).await;
|
||||||
|
self.generate_placeholder_thumbnail("PDF").await
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Ok(Err(_)) | Err(_) => {
|
_ => {
|
||||||
// Fall back to placeholder if PDF extraction fails or panics
|
// Clean up any potential PNG files
|
||||||
|
let _ = fs::remove_file(&temp_png_path).await;
|
||||||
|
let _ = fs::remove_file(&format!("/tmp/pdf_thumb_{}-1.png", temp_id)).await;
|
||||||
self.generate_placeholder_thumbnail("PDF").await
|
self.generate_placeholder_thumbnail("PDF").await
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue