fix(server): catch pdf-extract panics (source of spammy logs)

2025-06-25 23:26:11 +00:00 · 2025-06-25 23:26:11 +00:00 · 40afb5ade5
parent 182ba7cf18
commit 40afb5ade5
2 changed files with 16 additions and 7 deletions
--- a/src/file_service.rs
+++ b/src/file_service.rs
@ -1,6 +1,7 @@
 use anyhow::Result;
 use chrono::Utc;
 use std::path::{Path, PathBuf};
+use std::panic::{catch_unwind, AssertUnwindSafe};
 use tokio::fs;
 use uuid::Uuid;
 use tracing::{info, warn, error};
@ -321,14 +322,16 @@ impl FileService {
    async fn generate_pdf_thumbnail(&self, file_data: &[u8]) -> Result<Vec<u8>> {
        use image::Rgb;
        
-        // Try to extract first page as image using pdf-extract
-        match pdf_extract::extract_text_from_mem(file_data) {
-            Ok(text) => {
+        // Try to extract the PDF's text using pdf-extract, with panic protection
+        match catch_unwind(AssertUnwindSafe(|| {
+            pdf_extract::extract_text_from_mem(file_data)
+        })) {
+            Ok(Ok(text)) => {
                // If we can extract text, create a text-based thumbnail
                self.generate_text_based_thumbnail(&text, "PDF", Rgb([220, 38, 27])).await
            }
-            Err(_) => {
-                // Fall back to placeholder if PDF extraction fails
+            Ok(Err(_)) | Err(_) => {
+                // Fall back to placeholder if PDF extraction fails or panics
                self.generate_placeholder_thumbnail("PDF").await
            }
        }
--- a/src/ocr.rs
+++ b/src/ocr.rs
@ -1,5 +1,6 @@
 use anyhow::{anyhow, Result};
 use std::path::Path;
+use std::panic::{catch_unwind, AssertUnwindSafe};
 use crate::ocr_error::OcrError;
 use crate::ocr_health::OcrHealthChecker;

@ -54,8 +55,13 @@ impl OcrService {
        #[cfg(feature = "ocr")]
        {
            let bytes = std::fs::read(file_path)?;
-            let text = pdf_extract::extract_text_from_mem(&bytes)
-                .map_err(|e| anyhow!("Failed to extract text from PDF: {}", e))?;
+            let text = match catch_unwind(AssertUnwindSafe(|| {
+                pdf_extract::extract_text_from_mem(&bytes)
+            })) {
+                Ok(Ok(text)) => text,
+                Ok(Err(e)) => return Err(anyhow!("Failed to extract text from PDF: {}", e)),
+                Err(_) => return Err(anyhow!("PDF extraction panicked due to invalid content stream in file: {}", file_path)),
+            };
            
            Ok(text.trim().to_string())
        }