Merge pull request #46 from readur/fix/catch-pdf-extract-errors

fix(server): catch pdf-extract spammy logs
This commit is contained in:
Jon Fuller 2025-06-25 21:35:09 -07:00 committed by GitHub
commit bae748f3df
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 16 additions and 7 deletions

View File

@ -1,6 +1,7 @@
use anyhow::Result;
use chrono::Utc;
use std::path::{Path, PathBuf};
use std::panic::{catch_unwind, AssertUnwindSafe};
use tokio::fs;
use uuid::Uuid;
use tracing::{info, warn, error};
@ -321,14 +322,16 @@ impl FileService {
async fn generate_pdf_thumbnail(&self, file_data: &[u8]) -> Result<Vec<u8>> {
use image::Rgb;
// Try to extract first page as image using pdf-extract
match pdf_extract::extract_text_from_mem(file_data) {
Ok(text) => {
// Try to extract the PDF's text using pdf-extract, with panic protection
match catch_unwind(AssertUnwindSafe(|| {
pdf_extract::extract_text_from_mem(file_data)
})) {
Ok(Ok(text)) => {
// If we can extract text, create a text-based thumbnail
self.generate_text_based_thumbnail(&text, "PDF", Rgb([220, 38, 27])).await
}
Err(_) => {
// Fall back to placeholder if PDF extraction fails
Ok(Err(_)) | Err(_) => {
// Fall back to placeholder if PDF extraction fails or panics
self.generate_placeholder_thumbnail("PDF").await
}
}

View File

@ -1,5 +1,6 @@
use anyhow::{anyhow, Result};
use std::path::Path;
use std::panic::{catch_unwind, AssertUnwindSafe};
use crate::ocr_error::OcrError;
use crate::ocr_health::OcrHealthChecker;
@ -54,8 +55,13 @@ impl OcrService {
#[cfg(feature = "ocr")]
{
let bytes = std::fs::read(file_path)?;
let text = pdf_extract::extract_text_from_mem(&bytes)
.map_err(|e| anyhow!("Failed to extract text from PDF: {}", e))?;
let text = match catch_unwind(AssertUnwindSafe(|| {
pdf_extract::extract_text_from_mem(&bytes)
})) {
Ok(Ok(text)) => text,
Ok(Err(e)) => return Err(anyhow!("Failed to extract text from PDF: {}", e)),
Err(_) => return Err(anyhow!("PDF extraction panicked due to invalid content stream in file: {}", file_path)),
};
Ok(text.trim().to_string())
}