Merge pull request #46 from readur/fix/catch-pdf-extract-errors
fix(server): catch pdf-extract panics that cause spammy logs
This commit is contained in:
commit
bae748f3df
|
|
@ -1,6 +1,7 @@
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use chrono::Utc;
|
use chrono::Utc;
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
|
use std::panic::{catch_unwind, AssertUnwindSafe};
|
||||||
use tokio::fs;
|
use tokio::fs;
|
||||||
use uuid::Uuid;
|
use uuid::Uuid;
|
||||||
use tracing::{info, warn, error};
|
use tracing::{info, warn, error};
|
||||||
|
|
@ -321,14 +322,16 @@ impl FileService {
|
||||||
async fn generate_pdf_thumbnail(&self, file_data: &[u8]) -> Result<Vec<u8>> {
|
async fn generate_pdf_thumbnail(&self, file_data: &[u8]) -> Result<Vec<u8>> {
|
||||||
use image::Rgb;
|
use image::Rgb;
|
||||||
|
|
||||||
// Try to extract first page as image using pdf-extract
|
// Try to extract the PDF's text using pdf-extract, with panic protection
|
||||||
match pdf_extract::extract_text_from_mem(file_data) {
|
match catch_unwind(AssertUnwindSafe(|| {
|
||||||
Ok(text) => {
|
pdf_extract::extract_text_from_mem(file_data)
|
||||||
|
})) {
|
||||||
|
Ok(Ok(text)) => {
|
||||||
// If we can extract text, create a text-based thumbnail
|
// If we can extract text, create a text-based thumbnail
|
||||||
self.generate_text_based_thumbnail(&text, "PDF", Rgb([220, 38, 27])).await
|
self.generate_text_based_thumbnail(&text, "PDF", Rgb([220, 38, 27])).await
|
||||||
}
|
}
|
||||||
Err(_) => {
|
Ok(Err(_)) | Err(_) => {
|
||||||
// Fall back to placeholder if PDF extraction fails
|
// Fall back to placeholder if PDF extraction fails or panics
|
||||||
self.generate_placeholder_thumbnail("PDF").await
|
self.generate_placeholder_thumbnail("PDF").await
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
10
src/ocr.rs
10
src/ocr.rs
|
|
@ -1,5 +1,6 @@
|
||||||
use anyhow::{anyhow, Result};
|
use anyhow::{anyhow, Result};
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
use std::panic::{catch_unwind, AssertUnwindSafe};
|
||||||
use crate::ocr_error::OcrError;
|
use crate::ocr_error::OcrError;
|
||||||
use crate::ocr_health::OcrHealthChecker;
|
use crate::ocr_health::OcrHealthChecker;
|
||||||
|
|
||||||
|
|
@ -54,8 +55,13 @@ impl OcrService {
|
||||||
#[cfg(feature = "ocr")]
|
#[cfg(feature = "ocr")]
|
||||||
{
|
{
|
||||||
let bytes = std::fs::read(file_path)?;
|
let bytes = std::fs::read(file_path)?;
|
||||||
let text = pdf_extract::extract_text_from_mem(&bytes)
|
let text = match catch_unwind(AssertUnwindSafe(|| {
|
||||||
.map_err(|e| anyhow!("Failed to extract text from PDF: {}", e))?;
|
pdf_extract::extract_text_from_mem(&bytes)
|
||||||
|
})) {
|
||||||
|
Ok(Ok(text)) => text,
|
||||||
|
Ok(Err(e)) => return Err(anyhow!("Failed to extract text from PDF: {}", e)),
|
||||||
|
Err(_) => return Err(anyhow!("PDF extraction panicked due to invalid content stream in file: {}", file_path)),
|
||||||
|
};
|
||||||
|
|
||||||
Ok(text.trim().to_string())
|
Ok(text.trim().to_string())
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue