diff --git a/src/file_service.rs b/src/file_service.rs index cb6d80e..350ee32 100644 --- a/src/file_service.rs +++ b/src/file_service.rs @@ -1,6 +1,7 @@ use anyhow::Result; use chrono::Utc; use std::path::{Path, PathBuf}; +use std::panic::{catch_unwind, AssertUnwindSafe}; use tokio::fs; use uuid::Uuid; use tracing::{info, warn, error}; @@ -321,14 +322,16 @@ impl FileService { async fn generate_pdf_thumbnail(&self, file_data: &[u8]) -> Result> { use image::Rgb; - // Try to extract first page as image using pdf-extract - match pdf_extract::extract_text_from_mem(file_data) { - Ok(text) => { + // Try to extract first page as image using pdf-extract with panic protection + match catch_unwind(AssertUnwindSafe(|| { + pdf_extract::extract_text_from_mem(file_data) + })) { + Ok(Ok(text)) => { // If we can extract text, create a text-based thumbnail self.generate_text_based_thumbnail(&text, "PDF", Rgb([220, 38, 27])).await } - Err(_) => { - // Fall back to placeholder if PDF extraction fails + Ok(Err(_)) | Err(_) => { + // Fall back to placeholder if PDF extraction fails or panics self.generate_placeholder_thumbnail("PDF").await } } diff --git a/src/ocr.rs b/src/ocr.rs index 8885ac3..81f0a72 100644 --- a/src/ocr.rs +++ b/src/ocr.rs @@ -1,5 +1,6 @@ use anyhow::{anyhow, Result}; use std::path::Path; +use std::panic::{catch_unwind, AssertUnwindSafe}; use crate::ocr_error::OcrError; use crate::ocr_health::OcrHealthChecker; @@ -54,8 +55,13 @@ impl OcrService { #[cfg(feature = "ocr")] { let bytes = std::fs::read(file_path)?; - let text = pdf_extract::extract_text_from_mem(&bytes) - .map_err(|e| anyhow!("Failed to extract text from PDF: {}", e))?; + let text = match catch_unwind(AssertUnwindSafe(|| { + pdf_extract::extract_text_from_mem(&bytes) + })) { + Ok(Ok(text)) => text, + Ok(Err(e)) => return Err(anyhow!("Failed to extract text from PDF: {}", e)), + Err(_) => return Err(anyhow!("PDF extraction panicked due to invalid content stream in file: {}", file_path)), + }; Ok(text.trim().to_string()) }