feat(server): fix recursive scanning of the uploads folder and the quick search bar
parent 99521b4ca0
commit 4aa3d77e40
@@ -2929,6 +2929,7 @@ dependencies = [
  "reqwest",
  "serde",
  "serde_json",
+ "sha2",
  "sqlx",
  "sysinfo",
  "tempfile",
@@ -48,6 +48,7 @@ hostname = "0.4"
 walkdir = "2"
 clap = { version = "4", features = ["derive"] }
 utoipa = { version = "5", features = ["axum_extras", "chrono", "uuid"] }
+sha2 = "0.10"
 utoipa-swagger-ui = { version = "9", features = ["axum"] }

 [features]
@@ -33,35 +33,18 @@ import {
   AccessTime as TimeIcon,
 } from '@mui/icons-material';
 import { useNavigate } from 'react-router-dom';
-import { documentService, SearchRequest } from '../../services/api';
+import { documentService, SearchRequest, EnhancedDocument, SearchResponse } from '../../services/api';

 interface GlobalSearchBarProps {
   sx?: SxProps<Theme>;
   [key: string]: any;
 }

-interface Document {
-  id: string;
-  original_filename: string;
-  filename?: string;
-  file_size: number;
-  mime_type: string;
-  has_ocr_text?: boolean;
-  search_rank?: number;
-  snippets?: Array<{ text: string }>;
-}
-
-interface SearchResponse {
-  documents: Document[];
-  total_count: number;
-  search_time_ms: number;
-}
-
 const GlobalSearchBar: React.FC<GlobalSearchBarProps> = ({ sx, ...props }) => {
   const navigate = useNavigate();
   const theme = useTheme();
   const [query, setQuery] = useState<string>('');
-  const [results, setResults] = useState<Document[]>([]);
+  const [results, setResults] = useState<EnhancedDocument[]>([]);
   const [loading, setLoading] = useState<boolean>(false);
   const [showResults, setShowResults] = useState<boolean>(false);
   const [recentSearches, setRecentSearches] = useState<string[]>([]);
@@ -221,7 +204,7 @@ const GlobalSearchBar: React.FC<GlobalSearchBarProps> = ({ sx, ...props }) => {
     setSearchProgress(0);
   };

-  const handleDocumentClick = (doc: Document): void => {
+  const handleDocumentClick = (doc: EnhancedDocument): void => {
     saveRecentSearch(query);
     setShowResults(false);
     navigate(`/documents/${doc.id}`);
@@ -661,7 +644,7 @@ const GlobalSearchBar: React.FC<GlobalSearchBarProps> = ({ sx, ...props }) => {
                     flex: 1,
                   }}
                 >
-                  {highlightText(generateContextSnippet(doc.original_filename, query), query)}
+                  {highlightText(doc.original_filename || doc.filename, query)}
                 </Typography>
               }
               secondary={
@@ -725,7 +708,7 @@ const GlobalSearchBar: React.FC<GlobalSearchBarProps> = ({ sx, ...props }) => {
                     flex: 1,
                   }}
                 >
-                  {highlightText(doc.snippets[0].text.substring(0, 80) + '...', query)}
+                  {highlightText(doc.snippets[0]?.text?.substring(0, 80) + '...' || '', query)}
                 </Typography>
               )}
             </Box>
@@ -80,23 +80,22 @@ const DocumentDetailsPage: React.FC = () => {
   }, [document]);

   const fetchDocumentDetails = async (): Promise<void> => {
     if (!id) {
       setError('No document ID provided');
       setLoading(false);
       return;
     }

     try {
       setLoading(true);
       setError(null);

-      // Since we don't have a direct document details endpoint,
-      // we'll fetch the document from the list and find the matching one
-      const response = await documentService.list(1000, 0);
-      const foundDoc = response.data.find(doc => doc.id === id);
-
-      if (foundDoc) {
-        setDocument(foundDoc);
-      } else {
-        setError('Document not found');
-      }
-    } catch (err) {
-      setError('Failed to load document details');
-      console.error(err);
+      const response = await documentService.getById(id);
+      setDocument(response.data);
+    } catch (err: any) {
+      const errorMessage = err.message || 'Failed to load document details';
+      setError(errorMessage);
+      console.error('Failed to fetch document details:', err);
     } finally {
       setLoading(false);
     }
@@ -111,6 +111,23 @@ export const documentService = {
     })
   },

+  getById: (id: string) => {
+    // Use the document list endpoint with pagination to find the specific document
+    // This is a temporary solution until we have a proper document details endpoint
+    return api.get<Document[]>('/documents', {
+      params: {
+        limit: 1000, // Fetch a reasonable amount to find our document
+        offset: 0
+      }
+    }).then(response => {
+      const document = response.data.find(doc => doc.id === id);
+      if (!document) {
+        throw new Error('Document not found');
+      }
+      return { data: document };
+    })
+  },
+
   download: (id: string) => {
     return api.get(`/documents/${id}/download`, {
       responseType: 'blob',
@@ -28,7 +28,7 @@ impl Config {
     pub fn from_env() -> Result<Self> {
         dotenvy::dotenv().ok();

-        Ok(Config {
+        let config = Config {
            database_url: env::var("DATABASE_URL")
                .unwrap_or_else(|_| "postgresql://readur:readur@localhost/readur".to_string()),
            server_address: {
@@ -85,6 +85,68 @@ impl Config {
                .unwrap_or(512),
            cpu_priority: env::var("CPU_PRIORITY")
                .unwrap_or_else(|_| "normal".to_string()),
-        })
+        };
+
+        // Validate configuration to prevent recursion issues
+        config.validate_paths()?;
+
+        Ok(config)
     }
+
+    fn validate_paths(&self) -> Result<()> {
+        use std::path::Path;
+
+        let upload_path = Path::new(&self.upload_path);
+        let watch_path = Path::new(&self.watch_folder);
+
+        // Normalize paths to handle relative paths and symlinks
+        let upload_canonical = upload_path.canonicalize()
+            .unwrap_or_else(|_| upload_path.to_path_buf());
+        let watch_canonical = watch_path.canonicalize()
+            .unwrap_or_else(|_| watch_path.to_path_buf());
+
+        // Check if paths are the same
+        if upload_canonical == watch_canonical {
+            return Err(anyhow::anyhow!(
+                "Configuration Error: UPLOAD_PATH and WATCH_FOLDER cannot be the same directory.\n\
+                This would cause infinite recursion where WebDAV files are downloaded to the upload \n\
+                directory and then immediately reprocessed by the watcher.\n\
+                Current config:\n\
+                - UPLOAD_PATH: {}\n\
+                - WATCH_FOLDER: {}\n\
+                Please set them to different directories.",
+                self.upload_path, self.watch_folder
+            ));
+        }
+
+        // Check if watch folder is inside upload folder
+        if watch_canonical.starts_with(&upload_canonical) {
+            return Err(anyhow::anyhow!(
+                "Configuration Error: WATCH_FOLDER cannot be inside UPLOAD_PATH.\n\
+                This would cause recursion where WebDAV files downloaded to uploads are \n\
+                detected by the watcher as new files.\n\
+                Current config:\n\
+                - UPLOAD_PATH: {}\n\
+                - WATCH_FOLDER: {}\n\
+                Please move the watch folder outside the upload directory.",
+                self.upload_path, self.watch_folder
+            ));
+        }
+
+        // Check if upload folder is inside watch folder
+        if upload_canonical.starts_with(&watch_canonical) {
+            return Err(anyhow::anyhow!(
+                "Configuration Error: UPLOAD_PATH cannot be inside WATCH_FOLDER.\n\
+                This would cause recursion where files from the watch folder are \n\
+                copied to uploads (inside the watch folder) and reprocessed.\n\
+                Current config:\n\
+                - UPLOAD_PATH: {}\n\
+                - WATCH_FOLDER: {}\n\
+                Please move the upload directory outside the watch folder.",
+                self.upload_path, self.watch_folder
+            ));
+        }
+
+        Ok(())
+    }
 }
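To make the rejected configurations concrete, here is a minimal standalone sketch of the same containment rule; the `paths_conflict` helper name and the example paths are illustrative, not part of the codebase. Note that `Path::starts_with` compares whole path components, so a sibling directory such as `/data/uploads-archive` does not count as being inside `/data/uploads`.

use std::path::Path;

/// Hypothetical helper mirroring the validate_paths() logic above:
/// returns true when the two directories are identical or nested.
fn paths_conflict(upload: &str, watch: &str) -> bool {
    let upload = Path::new(upload);
    let watch = Path::new(watch);
    // Fall back to the raw path when canonicalize() fails (e.g. the
    // directory does not exist yet), matching the unwrap_or_else in the diff.
    let upload = upload.canonicalize().unwrap_or_else(|_| upload.to_path_buf());
    let watch = watch.canonicalize().unwrap_or_else(|_| watch.to_path_buf());
    upload == watch || watch.starts_with(&upload) || upload.starts_with(&watch)
}

fn main() {
    assert!(paths_conflict("/data/uploads", "/data/uploads"));       // identical
    assert!(paths_conflict("/data/uploads", "/data/uploads/watch")); // watch inside uploads
    assert!(paths_conflict("/data/watch/uploads", "/data/watch"));   // uploads inside watch
    assert!(!paths_conflict("/data/uploads", "/data/watch"));        // disjoint: accepted
}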
@@ -8,6 +8,7 @@ use axum::{
 use serde::Deserialize;
 use std::sync::Arc;
 use utoipa::ToSchema;
+use sha2::{Sha256, Digest};

 use crate::{
     auth::AuthUser,
@@ -85,6 +86,27 @@ async fn upload_document(
         return Err(StatusCode::PAYLOAD_TOO_LARGE);
     }

+    // Calculate file hash for deduplication
+    let file_hash = calculate_file_hash(&data);
+
+    // Check if this exact file content already exists in the system
+    // This prevents uploading and processing duplicate files
+    if let Ok(existing_docs) = state.db.get_documents_by_user_with_role(auth_user.user.id, auth_user.user.role, 1000, 0).await {
+        for existing_doc in existing_docs {
+            // Quick size check first (much faster than hash comparison)
+            if existing_doc.file_size == file_size {
+                // Read the existing file and compare hashes
+                if let Ok(existing_file_data) = tokio::fs::read(&existing_doc.file_path).await {
+                    let existing_hash = calculate_file_hash(&existing_file_data);
+                    if file_hash == existing_hash {
+                        // Return the existing document instead of creating a duplicate
+                        return Ok(Json(existing_doc.into()));
+                    }
+                }
+            }
+        }
+    }
+
     let mime_type = mime_guess::from_path(&filename)
         .first_or_octet_stream()
         .to_string();
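The upload-time deduplication above follows a two-stage pattern: a cheap file-size comparison first, and a SHA-256 content comparison only when sizes collide. Below is a hedged, self-contained sketch of that pattern; the `StoredDoc` struct and `is_duplicate` helper are illustrative stand-ins for the real document model and handler code.

use sha2::{Digest, Sha256};

// Illustrative stand-in for the fields the dedup check actually needs;
// the real code works with the full document record from the database.
struct StoredDoc {
    file_size: i64,
    file_path: String,
}

fn calculate_file_hash(data: &[u8]) -> String {
    let mut hasher = Sha256::new();
    hasher.update(data);
    format!("{:x}", hasher.finalize())
}

/// Hypothetical helper: returns true if `data` matches the content of any
/// existing document. The size check runs first because it is a plain
/// integer compare; the hash is only computed when sizes collide.
async fn is_duplicate(data: &[u8], existing: &[StoredDoc]) -> bool {
    let new_hash = calculate_file_hash(data);
    for doc in existing {
        if doc.file_size != data.len() as i64 {
            continue; // cheap filter: a different size can never be a duplicate
        }
        if let Ok(existing_data) = tokio::fs::read(&doc.file_path).await {
            if calculate_file_hash(&existing_data) == new_hash {
                return true;
            }
        }
    }
    false
}

One trade-off worth noting: because the hash is not stored with the document record, every same-sized candidate has to be re-read from disk and re-hashed on each upload.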
@@ -135,6 +157,13 @@ async fn upload_document(
     Err(StatusCode::BAD_REQUEST)
 }

+fn calculate_file_hash(data: &[u8]) -> String {
+    let mut hasher = Sha256::new();
+    hasher.update(data);
+    let result = hasher.finalize();
+    format!("{:x}", result)
+}
+
 #[utoipa::path(
     get,
     path = "/api/documents",
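For reference, `calculate_file_hash` returns a 64-character lowercase hex digest, and later hunks log only its first eight characters via `&file_hash[..8]`. A small usage sketch (standalone, duplicating the helper so it runs on its own):

use sha2::{Digest, Sha256};

fn calculate_file_hash(data: &[u8]) -> String {
    let mut hasher = Sha256::new();
    hasher.update(data);
    format!("{:x}", hasher.finalize())
}

fn main() {
    let a = calculate_file_hash(b"same bytes");
    let b = calculate_file_hash(b"same bytes");
    assert_eq!(a, b);          // identical content -> identical digest
    assert_eq!(a.len(), 64);   // SHA-256 -> 32 bytes -> 64 hex characters
    println!("short hash: {}", &a[..8]); // the prefix used in the log messages
}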
@@ -4,6 +4,7 @@ use tracing::{error, info, warn};
 use chrono::Utc;
 use tokio::sync::Semaphore;
 use futures::stream::{FuturesUnordered, StreamExt};
+use sha2::{Sha256, Digest};

 use crate::{
     AppState,
@@ -241,6 +242,46 @@ async fn process_single_file(

     info!("Downloaded file: {} ({} bytes)", file_info.name, file_data.len());

+    // Calculate file hash for deduplication
+    let file_hash = calculate_file_hash(&file_data);
+
+    // Check if this exact file content already exists in the system
+    // This prevents downloading and processing duplicate files from WebDAV
+    if let Ok(existing_docs) = state.db.get_documents_by_user_with_role(user_id, crate::models::UserRole::User, 1000, 0).await {
+        for existing_doc in existing_docs {
+            // Quick size check first (much faster than hash comparison)
+            if existing_doc.file_size == file_data.len() as i64 {
+                // Read the existing file and compare hashes
+                if let Ok(existing_file_data) = tokio::fs::read(&existing_doc.file_path).await {
+                    let existing_hash = calculate_file_hash(&existing_file_data);
+                    if file_hash == existing_hash {
+                        info!("Skipping duplicate WebDAV file content: {} (hash: {}, already exists as: {})",
+                              file_info.name, &file_hash[..8], existing_doc.original_filename);
+
+                        // Still record this WebDAV file in the tracking table to prevent re-downloading
+                        let webdav_file = CreateWebDAVFile {
+                            user_id,
+                            webdav_path: file_info.path.clone(),
+                            etag: file_info.etag.clone(),
+                            last_modified: file_info.last_modified,
+                            file_size: file_info.size,
+                            mime_type: file_info.mime_type.clone(),
+                            document_id: Some(existing_doc.id), // Link to existing document
+                            sync_status: "duplicate_content".to_string(),
+                            sync_error: None,
+                        };
+
+                        if let Err(e) = state.db.create_or_update_webdav_file(&webdav_file).await {
+                            error!("Failed to record duplicate WebDAV file: {}", e);
+                        }
+
+                        return Ok(false); // Not processed (duplicate)
+                    }
+                }
+            }
+        }
+    }
+
     // Create file service and save file to disk
     let file_service = FileService::new(state.config.upload_path.clone());
@@ -312,4 +353,11 @@ async fn process_single_file(
     }

     Ok(true) // Successfully processed
 }
+
+fn calculate_file_hash(data: &[u8]) -> String {
+    let mut hasher = Sha256::new();
+    hasher.update(data);
+    let result = hasher.finalize();
+    format!("{:x}", result)
+}
@@ -7,6 +7,7 @@ use tokio::sync::mpsc;
 use tokio::time::{interval, sleep};
 use tracing::{debug, error, info, warn};
 use walkdir::WalkDir;
+use sha2::{Sha256, Digest};

 use crate::{config::Config, db::Database, file_service::FileService, ocr_queue::OcrQueueService};

@@ -134,7 +135,9 @@ async fn start_polling_watcher(
     let mut interval = interval(Duration::from_secs(config.watch_interval_seconds.unwrap_or(30)));

     // Initial scan
+    info!("Starting initial scan of watch directory: {}", config.watch_folder);
     scan_directory(&config.watch_folder, &mut known_files, &db, &file_service, &queue_service, &config).await?;
+    info!("Initial scan completed. Found {} files to track", known_files.len());

     loop {
         interval.tick().await;
@@ -242,6 +245,19 @@ async fn process_file(
         return Ok(());
     }

+    // CRITICAL: Skip files that are in the upload directory - these are managed by WebDAV/manual uploads
+    let path_str = path.to_string_lossy();
+    let upload_path_normalized = std::path::Path::new(&config.upload_path)
+        .canonicalize()
+        .unwrap_or_else(|_| std::path::PathBuf::from(&config.upload_path));
+
+    if let Ok(file_canonical) = path.canonicalize() {
+        if file_canonical.starts_with(&upload_path_normalized) {
+            debug!("Skipping file in upload directory (managed by WebDAV/manual upload): {}", filename);
+            return Ok(());
+        }
+    }
+
     // Check file age if configured
     if let Some(max_age_hours) = config.max_file_age_hours {
         if let Ok(metadata) = tokio::fs::metadata(path).await {
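This per-file guard is the runtime counterpart of the config-level validation above: anything that canonicalizes to a location under UPLOAD_PATH is ignored, even if it was reached through a symlink or relative path inside the watch folder. Below is a hedged sketch of the same idea applied during a recursive walkdir scan; the `collect_candidates` function and the example paths are illustrative, not the project's `scan_directory`.

use std::path::{Path, PathBuf};
use walkdir::WalkDir;

/// Illustrative sketch: walk the watch folder recursively and collect
/// candidate files, skipping anything that resolves into the upload directory.
fn collect_candidates(watch_folder: &str, upload_path: &str) -> Vec<PathBuf> {
    let upload_canonical = Path::new(upload_path)
        .canonicalize()
        .unwrap_or_else(|_| PathBuf::from(upload_path));

    WalkDir::new(watch_folder)
        .into_iter()
        .filter_map(Result::ok)              // ignore unreadable entries
        .filter(|e| e.file_type().is_file()) // directories are only traversed
        .map(|e| e.into_path())
        .filter(|p| {
            // canonicalize so symlinks or relative paths cannot sneak an
            // uploaded file back into the processing pipeline
            let canonical = p.canonicalize().unwrap_or_else(|_| p.clone());
            !canonical.starts_with(&upload_canonical)
        })
        .collect()
}

fn main() {
    for path in collect_candidates("./watch", "./uploads") {
        println!("would process: {}", path.display());
    }
}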
@@ -255,7 +271,7 @@ async fn process_file(
         }
     }

-    info!("Processing new file: {:?}", path);
+    info!("Processing new file: {:?} (from watch directory: {})", path, config.watch_folder);

     let file_data = tokio::fs::read(path).await?;
     let file_size = file_data.len() as i64;
@@ -283,16 +299,35 @@ async fn process_file(
         return Ok(());
     }

-    // Check for duplicate files (same filename and size)
-    if let Ok(existing_docs) = db.find_documents_by_filename(&filename).await {
-        for doc in existing_docs {
-            if doc.file_size == file_size {
-                info!("Skipping duplicate file: {} (already exists with same size)", filename);
-                return Ok(());
+    // Fetch admin user ID from database for watch folder documents
+    let admin_user = db.get_user_by_username("admin").await?
+        .ok_or_else(|| anyhow::anyhow!("Admin user not found. Please ensure the admin user is created."))?;
+    let admin_user_id = admin_user.id;
+
+    // Calculate file hash for deduplication
+    let file_hash = calculate_file_hash(&file_data);
+
+    // Check if this exact file content already exists in the system by comparing
+    // against existing files with the same size (performance optimization)
+    if let Ok(existing_docs) = db.get_documents_by_user_with_role(admin_user_id, crate::models::UserRole::Admin, 1000, 0).await {
+        for existing_doc in existing_docs {
+            // Quick size check first (much faster than hash comparison)
+            if existing_doc.file_size == file_size {
+                // Read the existing file and compare hashes
+                if let Ok(existing_file_data) = tokio::fs::read(&existing_doc.file_path).await {
+                    let existing_hash = calculate_file_hash(&existing_file_data);
+                    if file_hash == existing_hash {
+                        info!("Skipping duplicate file content: {} (hash: {}, already exists as: {})",
+                              filename, &file_hash[..8], existing_doc.original_filename);
+                        return Ok(());
+                    }
+                }
             }
         }
     }
+
+    debug!("File content is unique: {} (hash: {})", filename, &file_hash[..8]);

     // Validate PDF files before processing
     if mime_type == "application/pdf" {
         if !is_valid_pdf(&file_data) {
@@ -310,11 +345,6 @@ async fn process_file(

     let saved_file_path = file_service.save_file(&filename, &file_data).await?;

-    // Fetch admin user ID from database for watch folder documents
-    let admin_user = db.get_user_by_username("admin").await?
-        .ok_or_else(|| anyhow::anyhow!("Admin user not found. Please ensure the admin user is created."))?;
-    let admin_user_id = admin_user.id;
-
     let document = file_service.create_document(
         &filename,
         &filename,
@@ -410,4 +440,11 @@ fn clean_pdf_data(data: &[u8]) -> &[u8] {

     // If no PDF header found, return original data
     data
 }
+
+fn calculate_file_hash(data: &[u8]) -> String {
+    let mut hasher = Sha256::new();
+    hasher.update(data);
+    let result = hasher.finalize();
+    format!("{:x}", result)
+}