Merge pull request #46 from readur/fix/catch-pdf-extract-errors

fix(server): catch pdf-extract spammy logs
2025-06-25 21:35:09 -07:00 · 2025-06-25 21:35:09 -07:00 · bae748f3df
parent 6f4c4dae8b d4d8ea625b
commit bae748f3df
2 changed files with 16 additions and 7 deletions
--- a/src/file_service.rs
+++ b/src/file_service.rs
@ -1,6 +1,7 @@
 use anyhow::Result;
 use chrono::Utc;
 use std::path::{Path, PathBuf};
 use std::panic::{catch_unwind, AssertUnwindSafe};
 use tokio::fs;
 use uuid::Uuid;
 use tracing::{info, warn, error};
@ -321,14 +322,16 @@ impl FileService {
    async fn generate_pdf_thumbnail(&self, file_data: &[u8]) -> Result<Vec<u8>> {
        use image::Rgb;
-        // Try to extract first page as image using pdf-extract
+        // Try to extract the PDF's text using pdf-extract, with panic protection
-        match pdf_extract::extract_text_from_mem(file_data) {
+        match catch_unwind(AssertUnwindSafe(|| {
-            Ok(text) => {
+            pdf_extract::extract_text_from_mem(file_data)
        })) {
            Ok(Ok(text)) => {
                // If we can extract text, create a text-based thumbnail
                self.generate_text_based_thumbnail(&text, "PDF", Rgb([220, 38, 27])).await
            }
-            Err(_) => {
+            Ok(Err(_)) | Err(_) => {
-                // Fall back to placeholder if PDF extraction fails
+                // Fall back to placeholder if PDF extraction fails or panics
                self.generate_placeholder_thumbnail("PDF").await
            }
        }
--- a/src/ocr.rs
+++ b/src/ocr.rs
@ -1,5 +1,6 @@
 use anyhow::{anyhow, Result};
 use std::path::Path;
 use std::panic::{catch_unwind, AssertUnwindSafe};
 use crate::ocr_error::OcrError;
 use crate::ocr_health::OcrHealthChecker;
@ -54,8 +55,13 @@ impl OcrService {
        #[cfg(feature = "ocr")]
        {
            let bytes = std::fs::read(file_path)?;
-            let text = pdf_extract::extract_text_from_mem(&bytes)
+            let text = match catch_unwind(AssertUnwindSafe(|| {
-                .map_err(|e| anyhow!("Failed to extract text from PDF: {}", e))?;
+                pdf_extract::extract_text_from_mem(&bytes)
            })) {
                Ok(Ok(text)) => text,
                Ok(Err(e)) => return Err(anyhow!("Failed to extract text from PDF: {}", e)),
                Err(_) => return Err(anyhow!("PDF extraction panicked due to invalid content stream in file: {}", file_path)),
            };
            Ok(text.trim().to_string())
        }