106 lines
3.5 KiB
Rust
106 lines
3.5 KiB
Rust
use anyhow::{anyhow, Result};
|
|
use std::path::Path;
|
|
use crate::ocr_error::OcrError;
|
|
use crate::ocr_health::OcrHealthChecker;
|
|
|
|
#[cfg(feature = "ocr")]
|
|
use tesseract::Tesseract;
|
|
|
|
pub struct OcrService {
|
|
health_checker: OcrHealthChecker,
|
|
}
|
|
|
|
impl OcrService {
|
|
pub fn new() -> Self {
|
|
Self {
|
|
health_checker: OcrHealthChecker::new(),
|
|
}
|
|
}
|
|
|
|
pub async fn extract_text_from_image(&self, file_path: &str) -> Result<String> {
|
|
self.extract_text_from_image_with_lang(file_path, "eng").await
|
|
}
|
|
|
|
pub async fn extract_text_from_image_with_lang(&self, file_path: &str, lang: &str) -> Result<String> {
|
|
#[cfg(feature = "ocr")]
|
|
{
|
|
// Perform health checks first
|
|
self.health_checker.check_tesseract_installation()
|
|
.map_err(|e: OcrError| anyhow!(e))?;
|
|
self.health_checker.check_language_data(lang)
|
|
.map_err(|e: OcrError| anyhow!(e))?;
|
|
|
|
let mut tesseract = Tesseract::new(None, Some(lang))
|
|
.map_err(|e| anyhow!(OcrError::InitializationFailed {
|
|
details: e.to_string()
|
|
}))?
|
|
.set_image(file_path)?;
|
|
|
|
let text = tesseract.get_text()
|
|
.map_err(|e| anyhow!(OcrError::InitializationFailed {
|
|
details: format!("Failed to extract text: {}", e)
|
|
}))?;
|
|
|
|
Ok(text.trim().to_string())
|
|
}
|
|
|
|
#[cfg(not(feature = "ocr"))]
|
|
{
|
|
Err(anyhow!(OcrError::TesseractNotInstalled))
|
|
}
|
|
}
|
|
|
|
pub async fn extract_text_from_pdf(&self, file_path: &str) -> Result<String> {
|
|
#[cfg(feature = "ocr")]
|
|
{
|
|
let bytes = std::fs::read(file_path)?;
|
|
let text = pdf_extract::extract_text_from_mem(&bytes)
|
|
.map_err(|e| anyhow!("Failed to extract text from PDF: {}", e))?;
|
|
|
|
Ok(text.trim().to_string())
|
|
}
|
|
|
|
#[cfg(not(feature = "ocr"))]
|
|
{
|
|
Err(anyhow!(OcrError::TesseractNotInstalled))
|
|
}
|
|
}
|
|
|
|
pub async fn extract_text(&self, file_path: &str, mime_type: &str) -> Result<String> {
|
|
self.extract_text_with_lang(file_path, mime_type, "eng").await
|
|
}
|
|
|
|
pub async fn extract_text_with_lang(&self, file_path: &str, mime_type: &str, lang: &str) -> Result<String> {
|
|
match mime_type {
|
|
"application/pdf" => self.extract_text_from_pdf(file_path).await,
|
|
"image/png" | "image/jpeg" | "image/jpg" | "image/tiff" | "image/bmp" => {
|
|
self.extract_text_from_image_with_lang(file_path, lang).await
|
|
}
|
|
"text/plain" => {
|
|
let text = tokio::fs::read_to_string(file_path).await?;
|
|
Ok(text)
|
|
}
|
|
_ => {
|
|
if self.is_image_file(file_path) {
|
|
self.extract_text_from_image_with_lang(file_path, lang).await
|
|
} else {
|
|
Err(anyhow!(OcrError::InvalidImageFormat {
|
|
details: format!("Unsupported MIME type: {}", mime_type)
|
|
}))
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
pub fn is_image_file(&self, file_path: &str) -> bool {
|
|
if let Some(extension) = Path::new(file_path)
|
|
.extension()
|
|
.and_then(|ext| ext.to_str())
|
|
{
|
|
let ext_lower = extension.to_lowercase();
|
|
matches!(ext_lower.as_str(), "png" | "jpg" | "jpeg" | "tiff" | "bmp" | "gif")
|
|
} else {
|
|
false
|
|
}
|
|
}
|
|
} |