feat(server): improve source metadata display and add tests
Parent: b7f1522b4a
Commit: ea43f79a90

@@ -31,6 +31,15 @@ interface FileIntegrityDisplayProps {
   updatedAt: string;
   userId?: string;
   username?: string;
+  // Additional metadata fields
+  sourceType?: string;
+  sourcePath?: string;
+  filePermissions?: number;
+  fileOwner?: string;
+  fileGroup?: string;
+  originalCreatedAt?: string;
+  originalModifiedAt?: string;
+  sourceMetadata?: any;
   compact?: boolean;
 }

@@ -43,6 +52,14 @@ const FileIntegrityDisplay: React.FC<FileIntegrityDisplayProps> = ({
   updatedAt,
   userId,
   username,
+  sourceType,
+  sourcePath,
+  filePermissions,
+  fileOwner,
+  fileGroup,
+  originalCreatedAt,
+  originalModifiedAt,
+  sourceMetadata,
   compact = false,
 }) => {
   const [copied, setCopied] = useState(false);

@@ -203,7 +220,7 @@ const FileIntegrityDisplay: React.FC<FileIntegrityDisplayProps> = ({
           }}
         />
         <Typography variant="h6" sx={{ fontWeight: 600 }}>
-          File Integrity & Verification
+          Document Details
         </Typography>
       </Box>

@@ -340,8 +357,146 @@ const FileIntegrityDisplay: React.FC<FileIntegrityDisplayProps> = ({
               }}
             />
           </Box>
+
+          {fileOwner && (
+            <Box sx={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center' }}>
+              <Typography variant="body2" color="text.secondary">
+                Owner
+              </Typography>
+              <Typography variant="body2" sx={{ fontFamily: 'monospace', fontSize: '0.8rem', fontWeight: 500 }}>
+                {fileOwner}
+              </Typography>
+            </Box>
+          )}
+
+          {sourcePath && (
+            <Box sx={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center' }}>
+              <Typography variant="body2" color="text.secondary">
+                Source Path
+              </Typography>
+              <Typography variant="body2" sx={{ fontFamily: 'monospace', fontSize: '0.8rem', maxWidth: '60%', overflow: 'hidden', textOverflow: 'ellipsis', fontWeight: 500 }}>
+                {sourcePath}
+              </Typography>
+            </Box>
+          )}
         </Stack>
       </Box>
+
+      {/* Additional Source Information */}
+      {(sourceType || fileGroup || filePermissions) && (
+        <Box sx={{ pt: 3, borderTop: `1px solid ${theme.palette.divider}` }}>
+          <Typography variant="subtitle1" sx={{ mb: 2, fontWeight: 600, display: 'flex', alignItems: 'center' }}>
+            <InfoIcon sx={{ mr: 1, fontSize: 18, color: theme.palette.info.main }} />
+            Additional Source Details
+          </Typography>
+
+          <Stack spacing={2}>
+            {sourceType && (
+              <Box sx={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center' }}>
+                <Typography variant="body2" color="text.secondary">Source Type:</Typography>
+                <Chip
+                  label={sourceType}
+                  size="small"
+                  sx={{
+                    fontSize: '0.75rem',
+                    backgroundColor: theme.palette.info.light,
+                    color: theme.palette.info.dark,
+                  }}
+                />
+              </Box>
+            )}
+
+            {fileGroup && (
+              <Box sx={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center' }}>
+                <Typography variant="body2" color="text.secondary">File Group:</Typography>
+                <Typography variant="body2" sx={{ fontFamily: 'monospace', fontSize: '0.8rem' }}>
+                  {fileGroup}
+                </Typography>
+              </Box>
+            )}
+
+            {filePermissions && (
+              <Box sx={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center' }}>
+                <Typography variant="body2" color="text.secondary">Permissions:</Typography>
+                <Typography variant="body2" sx={{ fontFamily: 'monospace', fontSize: '0.8rem' }}>
+                  {filePermissions.toString(8)} ({filePermissions})
+                </Typography>
+              </Box>
+            )}
+          </Stack>
+        </Box>
+      )}
+
+      {/* Timestamps */}
+      {(originalCreatedAt || originalModifiedAt) && (
+        <Box sx={{ pt: 3, borderTop: `1px solid ${theme.palette.divider}` }}>
+          <Typography variant="subtitle1" sx={{ mb: 2, fontWeight: 600, display: 'flex', alignItems: 'center' }}>
+            <InfoIcon sx={{ mr: 1, fontSize: 18, color: theme.palette.secondary.main }} />
+            Original Timestamps
+          </Typography>
+
+          <Stack spacing={2}>
+            {originalCreatedAt && (
+              <Box sx={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center' }}>
+                <Typography variant="body2" color="text.secondary">Original Created:</Typography>
+                <Typography variant="body2" sx={{ fontSize: '0.8rem' }}>
+                  {new Date(originalCreatedAt).toLocaleString()}
+                </Typography>
+              </Box>
+            )}
+
+            {originalModifiedAt && (
+              <Box sx={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center' }}>
+                <Typography variant="body2" color="text.secondary">Original Modified:</Typography>
+                <Typography variant="body2" sx={{ fontSize: '0.8rem' }}>
+                  {new Date(originalModifiedAt).toLocaleString()}
+                </Typography>
+              </Box>
+            )}
+          </Stack>
+        </Box>
+      )}
+
+      {/* Source Metadata - displayed as simple key-value pairs */}
+      {sourceMetadata && Object.keys(sourceMetadata).length > 0 && (
+        <Box sx={{ pt: 3, borderTop: `1px solid ${theme.palette.divider}` }}>
+          <Typography variant="subtitle1" sx={{ mb: 2, fontWeight: 600, display: 'flex', alignItems: 'center' }}>
+            <InfoIcon sx={{ mr: 1, fontSize: 18, color: theme.palette.secondary.main }} />
+            Source Metadata
+          </Typography>
+
+          <Stack spacing={2}>
+            {Object.entries(sourceMetadata).map(([key, value]) => {
+              // Skip null/undefined values and complex objects
+              if (value === null || value === undefined || typeof value === 'object') return null;
+
+              // Format the key to be more readable
+              const formattedKey = key
+                .replace(/_/g, ' ')
+                .replace(/([A-Z])/g, ' $1')
+                .replace(/^./, str => str.toUpperCase())
+                .trim();
+
+              // Format the value
+              const formattedValue = typeof value === 'boolean'
+                ? (value ? 'Yes' : 'No')
+                : String(value);
+
+              return (
+                <Box key={key} sx={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center' }}>
+                  <Typography variant="body2" color="text.secondary">
+                    {formattedKey}:
+                  </Typography>
+                  <Typography variant="body2" sx={{ fontSize: '0.8rem', fontWeight: 500, maxWidth: '60%', overflow: 'hidden', textOverflow: 'ellipsis' }}>
+                    {formattedValue}
+                  </Typography>
+                </Box>
+              );
+            }).filter(Boolean)}
+          </Stack>
+        </Box>
+      )}
     </Paper>
   );
 };
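
Side note on the permissions rendering above: the component prints filePermissions in octal via toString(8) with the raw decimal value in parentheses. The matching conversion on the Rust side is a one-liner; the helper below is a hypothetical sketch for illustration, not part of this commit.

/// Render a numeric Unix mode the way the frontend does:
/// octal first, decimal in parentheses, e.g. "644 (420)".
fn format_permissions(mode: u32) -> String {
    format!("{:o} ({})", mode, mode)
}

fn main() {
    assert_eq!(format_permissions(0o644), "644 (420)");
}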
@@ -52,7 +52,6 @@ import DocumentViewer from '../components/DocumentViewer';
 import LabelSelector from '../components/Labels/LabelSelector';
 import { type LabelData } from '../components/Labels/Label';
 import MetadataDisplay from '../components/MetadataDisplay';
-import MetadataParser from '../components/MetadataParser';
 import FileIntegrityDisplay from '../components/FileIntegrityDisplay';
 import ProcessingTimeline from '../components/ProcessingTimeline';
 import { RetryHistoryModal } from '../components/RetryHistoryModal';

@@ -700,6 +699,14 @@ const DocumentDetailsPage: React.FC = () => {
                 updatedAt={document.updated_at}
                 userId={document.user_id}
                 username={document.username}
+                sourceType={document.source_type}
+                sourcePath={document.source_path}
+                filePermissions={document.file_permissions}
+                fileOwner={document.file_owner}
+                fileGroup={document.file_group}
+                originalCreatedAt={document.original_created_at}
+                originalModifiedAt={document.original_modified_at}
+                sourceMetadata={document.source_metadata}
               />
             </Box>
           </Grid>

@@ -891,122 +898,6 @@ const DocumentDetailsPage: React.FC = () => {
                 ocrError={ocrData?.ocr_error}
               />

-              {/* Source Information */}
-              {(document.source_type || document.file_permissions || document.file_owner || document.file_group) && (
-                <Card
-                  sx={{
-                    backgroundColor: theme.palette.background.paper,
-                    backdropFilter: 'blur(10px)',
-                  }}
-                >
-                  <CardContent sx={{ p: 4 }}>
-                    <Typography variant="h5" sx={{ mb: 3, fontWeight: 700, display: 'flex', alignItems: 'center' }}>
-                      <SourceIcon sx={{ mr: 1, color: theme.palette.primary.main }} />
-                      Source Information
-                    </Typography>
-
-                    <Grid container spacing={3}>
-                      {document.source_type && (
-                        <Grid item xs={12} sm={6}>
-                          <Box sx={{ p: 2, borderRadius: 2, backgroundColor: theme.palette.action.hover }}>
-                            <Typography variant="subtitle2" color="text.secondary" sx={{ mb: 1 }}>
-                              Source Type
-                            </Typography>
-                            <Chip
-                              label={document.source_type.replace('_', ' ').toUpperCase()}
-                              color="primary"
-                              variant="outlined"
-                            />
-                          </Box>
-                        </Grid>
-                      )}
-
-                      {document.file_permissions && (
-                        <Grid item xs={12} sm={6}>
-                          <Box sx={{ p: 2, borderRadius: 2, backgroundColor: theme.palette.action.hover }}>
-                            <Typography variant="subtitle2" color="text.secondary" sx={{ mb: 1 }}>
-                              File Permissions
-                            </Typography>
-                            <Typography variant="body1" sx={{ fontFamily: 'monospace', fontWeight: 600 }}>
-                              {document.file_permissions.toString(8)} ({document.file_permissions})
-                            </Typography>
-                          </Box>
-                        </Grid>
-                      )}
-
-                      {document.file_owner && (
-                        <Grid item xs={12} sm={6}>
-                          <Box sx={{ p: 2, borderRadius: 2, backgroundColor: theme.palette.action.hover }}>
-                            <Typography variant="subtitle2" color="text.secondary" sx={{ mb: 1 }}>
-                              File Owner
-                            </Typography>
-                            <Typography variant="body1" sx={{ fontWeight: 600 }}>
-                              {document.file_owner}
-                            </Typography>
-                          </Box>
-                        </Grid>
-                      )}
-
-                      {document.file_group && (
-                        <Grid item xs={12} sm={6}>
-                          <Box sx={{ p: 2, borderRadius: 2, backgroundColor: theme.palette.action.hover }}>
-                            <Typography variant="subtitle2" color="text.secondary" sx={{ mb: 1 }}>
-                              File Group
-                            </Typography>
-                            <Typography variant="body1" sx={{ fontWeight: 600 }}>
-                              {document.file_group}
-                            </Typography>
-                          </Box>
-                        </Grid>
-                      )}
-
-                      {document.source_path && (
-                        <Grid item xs={12}>
-                          <Box sx={{ p: 2, borderRadius: 2, backgroundColor: theme.palette.action.hover }}>
-                            <Typography variant="subtitle2" color="text.secondary" sx={{ mb: 1 }}>
-                              Original Source Path
-                            </Typography>
-                            <Typography
-                              variant="body1"
-                              sx={{
-                                fontFamily: 'monospace',
-                                fontWeight: 600,
-                                wordBreak: 'break-all',
-                                backgroundColor: theme.palette.background.default,
-                                p: 1,
-                                borderRadius: 1,
-                              }}
-                            >
-                              {document.source_path}
-                            </Typography>
-                          </Box>
-                        </Grid>
-                      )}
-                    </Grid>
-                  </CardContent>
-                </Card>
-              )}
-
-              {/* Enhanced Metadata Display */}
-              {document.source_metadata && Object.keys(document.source_metadata).length > 0 && (
-                <Card
-                  sx={{
-                    backgroundColor: theme.palette.background.paper,
-                    backdropFilter: 'blur(10px)',
-                  }}
-                >
-                  <CardContent sx={{ p: 4 }}>
-                    <Typography variant="h5" sx={{ mb: 3, fontWeight: 700 }}>
-                      📊 Rich Metadata Analysis
-                    </Typography>
-                    <MetadataParser
-                      metadata={document.source_metadata}
-                      fileType={document.mime_type}
-                    />
-                  </CardContent>
-                </Card>
-              )}
-
               {/* Tags and Labels */}
               <Card
                 sx={{

@@ -14,7 +14,7 @@ use crate::{
     services::file_service::FileService,
    ingestion::document_ingestion::{DocumentIngestionService, IngestionResult, DeduplicationPolicy},
     ocr::queue::OcrQueueService,
-    models::FileInfo,
+    models::FileIngestionInfo,
 };

 pub struct BatchIngester {

@@ -166,8 +166,8 @@ impl BatchIngester {
        }
    }
 }

-/// Extract FileInfo from filesystem path and metadata
-async fn extract_file_info_from_path(path: &Path) -> Result<FileInfo> {
+/// Extract FileIngestionInfo from filesystem path and metadata
+async fn extract_file_info_from_path(path: &Path) -> Result<FileIngestionInfo> {
     let metadata = fs::metadata(path).await?;
     let filename = path
         .file_name()

@@ -208,7 +208,7 @@ async fn extract_file_info_from_path(path: &Path) -> Result<FileInfo> {
     #[cfg(not(unix))]
     let (permissions, owner, group) = (None, None, None);

-    Ok(FileInfo {
+    Ok(FileIngestionInfo {
         path: path.to_string_lossy().to_string(),
         name: filename,
         size: file_size,
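
The Unix branch of extract_file_info_from_path is elided from this hunk; only the #[cfg(not(unix))] fallback is visible. Judging from that fallback, the Unix side plausibly reads mode/uid/gid via std::os::unix::fs::MetadataExt and resolves names with something like the third-party users crate. A sketch under those assumptions, not the committed code:

#[cfg(unix)]
let (permissions, owner, group) = {
    use std::os::unix::fs::MetadataExt;
    // Keep only the permission bits of st_mode (e.g. 0o644).
    let mode = metadata.mode() & 0o777;
    // uid/gid -> name resolution needs an extra crate; `users` is one option (assumed).
    let owner = users::get_user_by_uid(metadata.uid())
        .map(|u| u.name().to_string_lossy().into_owned());
    let group = users::get_group_by_gid(metadata.gid())
        .map(|g| g.name().to_string_lossy().into_owned());
    // The integer width of the stored mode is a guess.
    (Some(mode as i32), owner, group)
};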
@@ -12,7 +12,7 @@ use tracing::{debug, info, warn};
 use serde_json;
 use chrono::Utc;

-use crate::models::{Document, FileInfo};
+use crate::models::{Document, FileIngestionInfo};
 use crate::db::Database;
 use crate::services::file_service::FileService;

@@ -76,8 +76,8 @@ impl DocumentIngestionService {
         Self { db, file_service }
     }

-    /// Extract metadata from FileInfo for storage in document
-    fn extract_metadata_from_file_info(file_info: &FileInfo) -> (Option<chrono::DateTime<chrono::Utc>>, Option<chrono::DateTime<chrono::Utc>>, Option<serde_json::Value>) {
+    /// Extract metadata from FileIngestionInfo for storage in document
+    fn extract_metadata_from_file_info(file_info: &FileIngestionInfo) -> (Option<chrono::DateTime<chrono::Utc>>, Option<chrono::DateTime<chrono::Utc>>, Option<serde_json::Value>) {
         let original_created_at = file_info.created_at;
         let original_modified_at = file_info.last_modified;

@@ -315,10 +315,10 @@ impl DocumentIngestionService {
         format!("{:x}", result)
     }

-    /// Ingest document from source with FileInfo metadata
+    /// Ingest document from source with FileIngestionInfo metadata
     pub async fn ingest_from_file_info(
         &self,
-        file_info: &FileInfo,
+        file_info: &FileIngestionInfo,
         file_data: Vec<u8>,
         user_id: Uuid,
         deduplication_policy: DeduplicationPolicy,

@@ -3,6 +3,7 @@ pub mod config;
 pub mod db;
 pub mod db_guardrails_simple;
 pub mod ingestion;
+pub mod metadata_extraction;
 pub mod models;
 pub mod monitoring;
 pub mod ocr;

@@ -0,0 +1,179 @@
+use anyhow::Result;
+use serde_json::{Map, Value};
+use std::collections::HashMap;
+
+/// Extract metadata from file content based on file type
+pub async fn extract_content_metadata(file_data: &[u8], mime_type: &str, filename: &str) -> Result<Option<Value>> {
+    let mut metadata = Map::new();
+
+    match mime_type {
+        // Image files - extract basic image info
+        mime if mime.starts_with("image/") => {
+            if let Ok(img_metadata) = extract_image_metadata(file_data).await {
+                metadata.extend(img_metadata);
+            }
+        }
+
+        // PDF files - extract basic PDF info
+        "application/pdf" => {
+            if let Ok(pdf_metadata) = extract_pdf_metadata(file_data).await {
+                metadata.extend(pdf_metadata);
+            }
+        }
+
+        // Text files - extract basic text info
+        "text/plain" => {
+            if let Ok(text_metadata) = extract_text_metadata(file_data).await {
+                metadata.extend(text_metadata);
+            }
+        }
+
+        _ => {
+            // For other file types, add basic file information
+            metadata.insert("file_type".to_string(), Value::String(mime_type.to_string()));
+        }
+    }
+
+    // Add filename-based metadata
+    if let Some(extension) = std::path::Path::new(filename)
+        .extension()
+        .and_then(|ext| ext.to_str())
+    {
+        metadata.insert("file_extension".to_string(), Value::String(extension.to_lowercase()));
+    }
+
+    if metadata.is_empty() {
+        Ok(None)
+    } else {
+        Ok(Some(Value::Object(metadata)))
+    }
+}
+
+/// Extract metadata from image files
+async fn extract_image_metadata(file_data: &[u8]) -> Result<Map<String, Value>> {
+    let mut metadata = Map::new();
+
+    // Try to load image and get basic properties
+    if let Ok(img) = image::load_from_memory(file_data) {
+        metadata.insert("image_width".to_string(), Value::Number(img.width().into()));
+        metadata.insert("image_height".to_string(), Value::Number(img.height().into()));
+        metadata.insert("image_format".to_string(), Value::String(format!("{:?}", img.color())));
+
+        // Calculate aspect ratio
+        let aspect_ratio = img.width() as f64 / img.height() as f64;
+        metadata.insert("aspect_ratio".to_string(), Value::String(format!("{:.2}", aspect_ratio)));
+
+        // Determine orientation
+        let orientation = if img.width() > img.height() {
+            "landscape"
+        } else if img.height() > img.width() {
+            "portrait"
+        } else {
+            "square"
+        };
+        metadata.insert("orientation".to_string(), Value::String(orientation.to_string()));
+
+        // Calculate megapixels
+        let megapixels = (img.width() as f64 * img.height() as f64) / 1_000_000.0;
+        metadata.insert("megapixels".to_string(), Value::String(format!("{:.1} MP", megapixels)));
+    }
+
+    Ok(metadata)
+}
+
+/// Extract metadata from PDF files
+async fn extract_pdf_metadata(file_data: &[u8]) -> Result<Map<String, Value>> {
+    let mut metadata = Map::new();
+
+    // Basic PDF detection and info
+    if file_data.len() >= 5 && &file_data[0..4] == b"%PDF" {
+        // Extract PDF version from header
+        if let Some(version_end) = file_data[0..20].iter().position(|&b| b == b'\n' || b == b'\r') {
+            if let Ok(header) = std::str::from_utf8(&file_data[0..version_end]) {
+                if let Some(version) = header.strip_prefix("%PDF-") {
+                    metadata.insert("pdf_version".to_string(), Value::String(version.to_string()));
+                }
+            }
+        }
+
+        // Try to count pages by counting "Type /Page" entries
+        let content = String::from_utf8_lossy(file_data);
+        let page_count = content.matches("/Type /Page").count();
+        if page_count > 0 {
+            metadata.insert("page_count".to_string(), Value::Number(page_count.into()));
+        }
+
+        // Look for basic PDF info
+        if content.contains("/Linearized") {
+            metadata.insert("linearized".to_string(), Value::Bool(true));
+        }
+
+        // Check for encryption
+        if content.contains("/Encrypt") {
+            metadata.insert("encrypted".to_string(), Value::Bool(true));
+        }
+
+        // Try to find creation/modification dates in metadata
+        if let Some(creation_start) = content.find("/CreationDate") {
+            if let Some(date_start) = content[creation_start..].find('(') {
+                if let Some(date_end) = content[creation_start + date_start..].find(')') {
+                    let date_str = &content[creation_start + date_start + 1..creation_start + date_start + date_end];
+                    metadata.insert("pdf_creation_date".to_string(), Value::String(date_str.to_string()));
+                }
+            }
+        }
+
+        // Basic content analysis
+        if content.contains("/Font") {
+            metadata.insert("contains_fonts".to_string(), Value::Bool(true));
+        }
+
+        if content.contains("/Image") || content.contains("/XObject") {
+            metadata.insert("contains_images".to_string(), Value::Bool(true));
+        }
+    }
+
+    Ok(metadata)
+}
+
+/// Extract metadata from text files
+async fn extract_text_metadata(file_data: &[u8]) -> Result<Map<String, Value>> {
+    let mut metadata = Map::new();
+
+    if let Ok(text) = std::str::from_utf8(file_data) {
+        // Basic text statistics
+        let char_count = text.chars().count();
+        let word_count = text.split_whitespace().count();
+        let line_count = text.lines().count();
+
+        metadata.insert("character_count".to_string(), Value::Number(char_count.into()));
+        metadata.insert("word_count".to_string(), Value::Number(word_count.into()));
+        metadata.insert("line_count".to_string(), Value::Number(line_count.into()));
+
+        // Detect text encoding characteristics
+        if text.chars().any(|c| !c.is_ascii()) {
+            metadata.insert("contains_unicode".to_string(), Value::Bool(true));
+        }
+
+        // Check for common file formats within text
+        if text.trim_start().starts_with("<?xml") {
+            metadata.insert("text_format".to_string(), Value::String("xml".to_string()));
+        } else if text.trim_start().starts_with('{') || text.trim_start().starts_with('[') {
+            metadata.insert("text_format".to_string(), Value::String("json".to_string()));
+        } else if text.contains("<!DOCTYPE html") || text.contains("<html") {
+            metadata.insert("text_format".to_string(), Value::String("html".to_string()));
+        }
+
+        // Basic language detection (very simple)
+        let english_words = ["the", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by"];
+        let english_count = english_words.iter()
+            .map(|&word| text.to_lowercase().matches(word).count())
+            .sum::<usize>();
+
+        if english_count > word_count / 20 { // If more than 5% are common English words
+            metadata.insert("likely_language".to_string(), Value::String("english".to_string()));
+        }
+    }
+
+    Ok(metadata)
+}
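
A quick way to exercise the new module from a test; this assumes tokio's test macro is available in the crate, and the sample bytes and expected counts are purely illustrative:

#[tokio::test]
async fn text_metadata_counts_words() {
    // Two lines, four words, plain ASCII.
    let data = b"hello world\nsecond line";
    let meta = crate::metadata_extraction::extract_content_metadata(data, "text/plain", "notes.txt")
        .await
        .unwrap()
        .expect("text files should yield metadata");

    assert_eq!(meta["word_count"], 4);
    assert_eq!(meta["line_count"], 2);
    assert_eq!(meta["file_extension"], "txt");
}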
@@ -253,7 +253,7 @@ pub struct CreateIgnoredFile {
 }

 #[derive(Debug, Clone)]
-pub struct FileInfo {
+pub struct FileIngestionInfo {
     pub path: String,
     pub name: String,
     pub size: i64,
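
Only the first three fields of the renamed struct are visible in this hunk. From the construction sites elsewhere in the commit (the upload route and the WebDAV tests), the full definition plausibly looks like the sketch below; the field types are inferred from usage and should be read as assumptions:

#[derive(Debug, Clone)]
pub struct FileIngestionInfo {
    pub path: String,
    pub name: String,
    pub size: i64,
    pub mime_type: String,
    pub last_modified: Option<chrono::DateTime<chrono::Utc>>,
    pub etag: String,
    pub is_directory: bool,
    pub created_at: Option<chrono::DateTime<chrono::Utc>>,
    pub permissions: Option<i32>, // numeric Unix mode; exact width is a guess
    pub owner: Option<String>,
    pub group: Option<String>,
    pub metadata: Option<serde_json::Value>,
}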
@@ -74,6 +74,25 @@ pub async fn upload_document(

     info!("Uploading document: {} ({} bytes)", filename, data.len());

+    // Create FileIngestionInfo from uploaded data
+    use crate::models::FileIngestionInfo;
+    use chrono::Utc;
+
+    let file_info = FileIngestionInfo {
+        path: format!("upload/{}", filename), // Virtual path for web uploads
+        name: filename.clone(),
+        size: data.len() as i64,
+        mime_type: content_type.clone(),
+        last_modified: Some(Utc::now()), // Upload time as last modified
+        etag: format!("{}-{}", data.len(), Utc::now().timestamp()),
+        is_directory: false,
+        created_at: Some(Utc::now()), // Upload time as creation time
+        permissions: None, // Web uploads don't have filesystem permissions
+        owner: Some(auth_user.user.username.clone()), // Uploader as owner
+        group: None, // Web uploads don't have filesystem groups
+        metadata: None, // Could extract EXIF/PDF metadata in the future
+    };
+
     // Create ingestion service
     let file_service = FileService::new(state.config.upload_path.clone());
     let ingestion_service = DocumentIngestionService::new(

@@ -81,25 +100,14 @@ pub async fn upload_document(
         file_service,
     );

-    let request = crate::ingestion::document_ingestion::DocumentIngestionRequest {
-        file_data: data,
-        filename: filename.clone(),
-        original_filename: filename,
-        mime_type: content_type,
-        user_id: auth_user.user.id,
-        source_type: Some("web_upload".to_string()),
-        source_id: None,
-        deduplication_policy: crate::ingestion::document_ingestion::DeduplicationPolicy::Skip,
-        original_created_at: None,
-        original_modified_at: None,
-        source_path: None, // Web uploads don't have a source path
-        file_permissions: None, // Web uploads don't preserve permissions
-        file_owner: None, // Web uploads don't preserve owner
-        file_group: None, // Web uploads don't preserve group
-        source_metadata: None,
-    };
-
-    match ingestion_service.ingest_document(request).await {
+    match ingestion_service.ingest_from_file_info(
+        &file_info,
+        data,
+        auth_user.user.id,
+        crate::ingestion::document_ingestion::DeduplicationPolicy::Skip,
+        "web_upload",
+        None
+    ).await {
         Ok(IngestionResult::Created(document)) => {
             info!("Document uploaded successfully: {}", document.id);

@@ -230,7 +230,7 @@ async fn process_single_file(
     state: Arc<AppState>,
     user_id: uuid::Uuid,
     webdav_service: &WebDAVService,
-    file_info: &crate::models::FileInfo,
+    file_info: &crate::models::FileIngestionInfo,
     enable_background_ocr: bool,
     semaphore: Arc<Semaphore>,
     webdav_source_id: Option<uuid::Uuid>,

@@ -384,7 +384,7 @@ pub async fn process_files_for_deep_scan(
     state: Arc<AppState>,
     user_id: uuid::Uuid,
     webdav_service: &WebDAVService,
-    files_to_process: &[crate::models::FileInfo],
+    files_to_process: &[crate::models::FileIngestionInfo],
     enable_background_ocr: bool,
     webdav_source_id: Option<uuid::Uuid>,
 ) -> Result<usize, anyhow::Error> {

@@ -9,7 +9,7 @@ use uuid::Uuid;

 use crate::{
     AppState,
-    models::{FileInfo, Source, SourceType, SourceStatus, LocalFolderSourceConfig, S3SourceConfig, WebDAVSourceConfig},
+    models::{FileIngestionInfo, Source, SourceType, SourceStatus, LocalFolderSourceConfig, S3SourceConfig, WebDAVSourceConfig},
     services::file_service::FileService,
     ingestion::document_ingestion::{DocumentIngestionService, IngestionResult},
     services::local_folder_service::LocalFolderService,

@@ -227,7 +227,7 @@ impl SourceSyncService {
     where
         F: Fn(String) -> Fut1,
         D: Fn(String) -> Fut2 + Clone,
-        Fut1: std::future::Future<Output = Result<Vec<FileInfo>>>,
+        Fut1: std::future::Future<Output = Result<Vec<FileIngestionInfo>>>,
         Fut2: std::future::Future<Output = Result<Vec<u8>>>,
     {
         let mut total_files_processed = 0;

@@ -328,7 +328,7 @@ impl SourceSyncService {
     where
         F: Fn(String) -> Fut1,
         D: Fn(String) -> Fut2 + Clone,
-        Fut1: std::future::Future<Output = Result<Vec<FileInfo>>>,
+        Fut1: std::future::Future<Output = Result<Vec<FileIngestionInfo>>>,
         Fut2: std::future::Future<Output = Result<Vec<u8>>>,
     {
         let mut total_files_processed = 0;

@@ -514,7 +514,7 @@ impl SourceSyncService {
     state: Arc<AppState>,
     user_id: Uuid,
     source_id: Uuid,
-    file_info: &FileInfo,
+    file_info: &FileIngestionInfo,
     enable_background_ocr: bool,
     semaphore: Arc<Semaphore>,
     download_file: D,

@@ -593,7 +593,7 @@ impl SourceSyncService {
     state: Arc<AppState>,
     user_id: Uuid,
     source_id: Uuid,
-    file_info: &FileInfo,
+    file_info: &FileIngestionInfo,
     enable_background_ocr: bool,
     semaphore: Arc<Semaphore>,
     download_file: D,

@@ -15,7 +15,7 @@ use crate::{
     services::file_service::FileService,
     ingestion::document_ingestion::{DocumentIngestionService, IngestionResult, DeduplicationPolicy},
     ocr::queue::OcrQueueService,
-    models::FileInfo,
+    models::FileIngestionInfo,
 };

 pub async fn start_folder_watcher(config: Config, db: Database) -> Result<()> {

@@ -372,8 +372,8 @@ async fn process_file(
     Ok(())
 }

-/// Extract FileInfo from filesystem path and metadata (for watcher)
-async fn extract_file_info_from_path(path: &Path) -> Result<FileInfo> {
+/// Extract FileIngestionInfo from filesystem path and metadata (for watcher)
+async fn extract_file_info_from_path(path: &Path) -> Result<FileIngestionInfo> {
     let metadata = tokio::fs::metadata(path).await?;
     let filename = path
         .file_name()

@@ -411,7 +411,7 @@ async fn extract_file_info_from_path(path: &Path) -> Result<FileInfo> {
     #[cfg(not(unix))]
     let (permissions, owner, group) = (None, None, None);

-    Ok(FileInfo {
+    Ok(FileIngestionInfo {
         path: path.to_string_lossy().to_string(),
         name: filename,
         size: file_size,

@@ -7,7 +7,7 @@ use walkdir::WalkDir;
 use sha2::{Sha256, Digest};
 use serde_json;

-use crate::models::{FileInfo, LocalFolderSourceConfig};
+use crate::models::{FileIngestionInfo, LocalFolderSourceConfig};

 #[derive(Debug, Clone)]
 pub struct LocalFolderService {

@@ -31,13 +31,13 @@ impl LocalFolderService {
     }

     /// Discover files in a specific folder
-    pub async fn discover_files_in_folder(&self, folder_path: &str) -> Result<Vec<FileInfo>> {
+    pub async fn discover_files_in_folder(&self, folder_path: &str) -> Result<Vec<FileIngestionInfo>> {
         let path = Path::new(folder_path);
         if !path.exists() {
             return Err(anyhow!("Folder does not exist: {}", folder_path));
         }

-        let mut files: Vec<FileInfo> = Vec::new();
+        let mut files: Vec<FileIngestionInfo> = Vec::new();

         info!("Scanning local folder: {} (recursive: {})", folder_path, self.config.recursive);

@@ -45,8 +45,8 @@ impl LocalFolderService {
         let folder_path_clone = folder_path.to_string();
         let config = self.config.clone();

-        let discovered_files = tokio::task::spawn_blocking(move || -> Result<Vec<FileInfo>> {
-            let mut files: Vec<FileInfo> = Vec::new();
+        let discovered_files = tokio::task::spawn_blocking(move || -> Result<Vec<FileIngestionInfo>> {
+            let mut files: Vec<FileIngestionInfo> = Vec::new();

             let walker = if config.recursive {
                 WalkDir::new(&folder_path_clone)

@@ -137,7 +137,7 @@ impl LocalFolderService {
                 // Add file attributes
                 additional_metadata.insert("readonly".to_string(), serde_json::Value::Bool(metadata.permissions().readonly()));

-                let file_info = FileInfo {
+                let file_info = FileIngestionInfo {
                     path: path.to_string_lossy().to_string(),
                     name: file_name,
                     size: metadata.len() as i64,

@@ -12,7 +12,7 @@ use aws_credential_types::Credentials;
 #[cfg(feature = "s3")]
 use aws_types::region::Region as AwsRegion;

-use crate::models::{FileInfo, S3SourceConfig};
+use crate::models::{FileIngestionInfo, S3SourceConfig};

 #[derive(Debug, Clone)]
 pub struct S3Service {

@@ -81,7 +81,7 @@ impl S3Service {
     }

     /// Discover files in a specific S3 prefix (folder)
-    pub async fn discover_files_in_folder(&self, folder_path: &str) -> Result<Vec<FileInfo>> {
+    pub async fn discover_files_in_folder(&self, folder_path: &str) -> Result<Vec<FileIngestionInfo>> {
         #[cfg(not(feature = "s3"))]
         {
             return Err(anyhow!("S3 support not compiled in"));

@@ -176,7 +176,7 @@ impl S3Service {
         // If we have region info, add it
         metadata_map.insert("s3_region".to_string(), serde_json::Value::String(self.config.region.clone()));

-        let file_info = FileInfo {
+        let file_info = FileIngestionInfo {
             path: key.clone(),
             name: file_name,
             size,

@@ -2,7 +2,7 @@
 use anyhow::{anyhow, Result};
 use tracing::warn;

-use crate::models::{FileInfo, S3SourceConfig};
+use crate::models::{FileIngestionInfo, S3SourceConfig};

 #[derive(Debug, Clone)]
 pub struct S3Service {

@@ -14,7 +14,7 @@ impl S3Service {
         Err(anyhow!("S3 support not compiled in. Enable the 's3' feature to use S3 sources."))
     }

-    pub async fn discover_files_in_folder(&self, _folder_path: &str) -> Result<Vec<FileInfo>> {
+    pub async fn discover_files_in_folder(&self, _folder_path: &str) -> Result<Vec<FileIngestionInfo>> {
         warn!("S3 support not compiled in");
         Ok(Vec::new())
     }

@@ -5,7 +5,7 @@ use tokio::sync::Semaphore;
 use futures_util::stream::{self, StreamExt};
 use tracing::{debug, info, warn};

-use crate::models::{FileInfo, WebDAVCrawlEstimate, WebDAVFolderInfo};
+use crate::models::{FileIngestionInfo, WebDAVCrawlEstimate, WebDAVFolderInfo};
 use crate::webdav_xml_parser::{parse_propfind_response, parse_propfind_response_with_directories};
 use super::config::{WebDAVConfig, ConcurrencyConfig};
 use super::connection::WebDAVConnection;

@@ -30,7 +30,7 @@ impl WebDAVDiscovery {
     }

     /// Discovers files in a directory with support for pagination and filtering
-    pub async fn discover_files(&self, directory_path: &str, recursive: bool) -> Result<Vec<FileInfo>> {
+    pub async fn discover_files(&self, directory_path: &str, recursive: bool) -> Result<Vec<FileIngestionInfo>> {
         info!("🔍 Discovering files in directory: {}", directory_path);

         if recursive {

@@ -41,7 +41,7 @@ impl WebDAVDiscovery {
     }

     /// Discovers files in a single directory (non-recursive)
-    async fn discover_files_single_directory(&self, directory_path: &str) -> Result<Vec<FileInfo>> {
+    async fn discover_files_single_directory(&self, directory_path: &str) -> Result<Vec<FileIngestionInfo>> {
         let url = self.connection.get_url_for_path(directory_path);

         let propfind_body = r#"<?xml version="1.0" encoding="utf-8"?>

@@ -72,7 +72,7 @@ impl WebDAVDiscovery {
         let files = parse_propfind_response(&body)?;

         // Filter files based on supported extensions
-        let filtered_files: Vec<FileInfo> = files
+        let filtered_files: Vec<FileIngestionInfo> = files
             .into_iter()
             .filter(|file| {
                 !file.is_directory && self.config.is_supported_extension(&file.name)

@@ -84,7 +84,7 @@ impl WebDAVDiscovery {
     }

     /// Discovers files recursively in directory tree
-    async fn discover_files_recursive(&self, root_directory: &str) -> Result<Vec<FileInfo>> {
+    async fn discover_files_recursive(&self, root_directory: &str) -> Result<Vec<FileIngestionInfo>> {
         let mut all_files = Vec::new();
         let mut directories_to_scan = vec![root_directory.to_string()];
         let semaphore = Semaphore::new(self.concurrency_config.max_concurrent_scans);

@@ -126,7 +126,7 @@ impl WebDAVDiscovery {
     }

     /// Scans a directory and returns both files and subdirectories
-    async fn scan_directory_with_subdirs(&self, directory_path: &str) -> Result<(Vec<FileInfo>, Vec<String>)> {
+    async fn scan_directory_with_subdirs(&self, directory_path: &str) -> Result<(Vec<FileIngestionInfo>, Vec<String>)> {
         let url = self.connection.get_url_for_path(directory_path);

         let propfind_body = r#"<?xml version="1.0" encoding="utf-8"?>

@@ -309,7 +309,7 @@ impl WebDAVDiscovery {
     }

     /// Calculates the ratio of supported files in a sample
-    fn calculate_support_ratio(&self, sample_files: &[FileInfo]) -> f64 {
+    fn calculate_support_ratio(&self, sample_files: &[FileIngestionInfo]) -> f64 {
         if sample_files.is_empty() {
             return 1.0; // Assume all files are supported if no sample
         }

@@ -323,7 +323,7 @@ impl WebDAVDiscovery {
     }

     /// Filters files by last modified date (for incremental syncs)
-    pub fn filter_files_by_date(&self, files: Vec<FileInfo>, since: chrono::DateTime<chrono::Utc>) -> Vec<FileInfo> {
+    pub fn filter_files_by_date(&self, files: Vec<FileIngestionInfo>, since: chrono::DateTime<chrono::Utc>) -> Vec<FileIngestionInfo> {
         files
             .into_iter()
             .filter(|file| {

@@ -335,7 +335,7 @@ impl WebDAVDiscovery {
     }

     /// Deduplicates files by ETag or path
-    pub fn deduplicate_files(&self, files: Vec<FileInfo>) -> Vec<FileInfo> {
+    pub fn deduplicate_files(&self, files: Vec<FileIngestionInfo>) -> Vec<FileIngestionInfo> {
         let mut seen_etags = HashSet::new();
         let mut seen_paths = HashSet::new();
         let mut deduplicated = Vec::new();
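
The loop body is truncated here. Given the three collections initialized above, the deduplication plausibly proceeds like this sketch (an assumption, not the committed code):

for file in files {
    // Prefer the server-supplied ETag as identity; fall back to the path.
    let first_time = if !file.etag.is_empty() {
        seen_etags.insert(file.etag.clone())
    } else {
        seen_paths.insert(file.path.clone())
    };
    if first_time {
        deduplicated.push(file);
    }
}
deduplicated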
@@ -4,7 +4,7 @@ use tokio::sync::Semaphore;
 use tracing::{debug, error, info};

 use crate::models::{
-    FileInfo, WebDAVConnectionResult, WebDAVCrawlEstimate, WebDAVTestConnection,
+    FileIngestionInfo, WebDAVConnectionResult, WebDAVCrawlEstimate, WebDAVTestConnection,
 };

 use super::config::{WebDAVConfig, RetryConfig, ConcurrencyConfig};

@@ -107,7 +107,7 @@ impl WebDAVService {
     }

     /// Discovers all files in watch folders
-    pub async fn discover_all_files(&self) -> Result<Vec<FileInfo>> {
+    pub async fn discover_all_files(&self) -> Result<Vec<FileIngestionInfo>> {
         info!("🔍 Discovering all files in watch folders");
         let mut all_files = Vec::new();

@@ -134,7 +134,7 @@ impl WebDAVService {
     }

     /// Discovers files changed since a specific date (for incremental syncs)
-    pub async fn discover_changed_files(&self, since: chrono::DateTime<chrono::Utc>) -> Result<Vec<FileInfo>> {
+    pub async fn discover_changed_files(&self, since: chrono::DateTime<chrono::Utc>) -> Result<Vec<FileIngestionInfo>> {
         info!("🔍 Discovering files changed since: {}", since);

         let all_files = self.discover_all_files().await?;

@@ -145,7 +145,7 @@ impl WebDAVService {
     }

     /// Discovers files in a specific directory
-    pub async fn discover_files_in_directory(&self, directory_path: &str, recursive: bool) -> Result<Vec<FileInfo>> {
+    pub async fn discover_files_in_directory(&self, directory_path: &str, recursive: bool) -> Result<Vec<FileIngestionInfo>> {
         info!("🔍 Discovering files in directory: {} (recursive: {})", directory_path, recursive);
         self.discovery.discover_files(directory_path, recursive).await
     }

@@ -181,8 +181,8 @@ impl WebDAVService {
         Ok(content.to_vec())
     }

-    /// Downloads a file from WebDAV server using FileInfo
-    pub async fn download_file_info(&self, file_info: &FileInfo) -> Result<Vec<u8>> {
+    /// Downloads a file from WebDAV server using FileIngestionInfo
+    pub async fn download_file_info(&self, file_info: &FileIngestionInfo) -> Result<Vec<u8>> {
         let _permit = self.download_semaphore.acquire().await?;

         debug!("⬇️ Downloading file: {}", file_info.path);

@@ -213,7 +213,7 @@ impl WebDAVService {
     }

     /// Downloads multiple files concurrently
-    pub async fn download_files(&self, files: &[FileInfo]) -> Result<Vec<(FileInfo, Result<Vec<u8>>)>> {
+    pub async fn download_files(&self, files: &[FileIngestionInfo]) -> Result<Vec<(FileIngestionInfo, Result<Vec<u8>>)>> {
         info!("⬇️ Downloading {} files concurrently", files.len());

         let tasks = files.iter().map(|file| {

@@ -237,7 +237,7 @@ impl WebDAVService {
     }

     /// Gets file metadata without downloading content
-    pub async fn get_file_metadata(&self, file_path: &str) -> Result<FileInfo> {
+    pub async fn get_file_metadata(&self, file_path: &str) -> Result<FileIngestionInfo> {
         debug!("📋 Getting metadata for file: {}", file_path);

         let url = self.connection.get_url_for_path(file_path);

@ -1,7 +1,7 @@
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::super::{WebDAVService, WebDAVConfig};
|
use super::super::{WebDAVService, WebDAVConfig};
|
||||||
use crate::models::FileInfo;
|
use crate::models::FileIngestionInfo;
|
||||||
use tokio;
|
use tokio;
|
||||||
use chrono::Utc;
|
use chrono::Utc;
|
||||||
use std::collections::BTreeSet;
|
use std::collections::BTreeSet;
|
||||||
|
|
@ -22,10 +22,10 @@ fn create_test_webdav_service() -> WebDAVService {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Test scenario that matches the real-world bug: deep nested structure with various file types
|
// Test scenario that matches the real-world bug: deep nested structure with various file types
|
||||||
fn create_complex_nested_structure() -> Vec<FileInfo> {
|
fn create_complex_nested_structure() -> Vec<FileIngestionInfo> {
|
||||||
vec![
|
vec![
|
||||||
// Root directories at different levels
|
// Root directories at different levels
|
||||||
FileInfo {
|
FileIngestionInfo {
|
||||||
path: "/FullerDocuments".to_string(),
|
path: "/FullerDocuments".to_string(),
|
||||||
name: "FullerDocuments".to_string(),
|
name: "FullerDocuments".to_string(),
|
||||||
size: 0,
|
size: 0,
|
||||||
|
|
@ -39,7 +39,7 @@ fn create_complex_nested_structure() -> Vec<FileInfo> {
|
||||||
group: Some("admin".to_string()),
|
group: Some("admin".to_string()),
|
||||||
metadata: None,
|
metadata: None,
|
||||||
},
|
},
|
||||||
FileInfo {
|
FileIngestionInfo {
|
||||||
path: "/FullerDocuments/JonDocuments".to_string(),
|
path: "/FullerDocuments/JonDocuments".to_string(),
|
||||||
name: "JonDocuments".to_string(),
|
name: "JonDocuments".to_string(),
|
||||||
size: 0,
|
size: 0,
|
||||||
|
|
@ -54,7 +54,7 @@ fn create_complex_nested_structure() -> Vec<FileInfo> {
|
||||||
metadata: None,
|
metadata: None,
|
||||||
},
|
},
|
||||||
// Multiple levels of nesting
|
// Multiple levels of nesting
|
||||||
FileInfo {
|
FileIngestionInfo {
|
||||||
path: "/FullerDocuments/JonDocuments/Work".to_string(),
|
path: "/FullerDocuments/JonDocuments/Work".to_string(),
|
||||||
name: "Work".to_string(),
|
name: "Work".to_string(),
|
||||||
size: 0,
|
size: 0,
|
||||||
|
|
@@ -68,7 +68,7 @@ fn create_complex_nested_structure() -> Vec<FileInfo> {
             group: Some("admin".to_string()),
             metadata: None,
         },
-        FileInfo {
+        FileIngestionInfo {
             path: "/FullerDocuments/JonDocuments/Personal".to_string(),
             name: "Personal".to_string(),
             size: 0,
@@ -82,7 +82,7 @@ fn create_complex_nested_structure() -> Vec<FileInfo> {
             group: Some("admin".to_string()),
             metadata: None,
         },
-        FileInfo {
+        FileIngestionInfo {
             path: "/FullerDocuments/JonDocuments/Work/Projects".to_string(),
             name: "Projects".to_string(),
             size: 0,
@@ -96,7 +96,7 @@ fn create_complex_nested_structure() -> Vec<FileInfo> {
             group: Some("admin".to_string()),
             metadata: None,
         },
-        FileInfo {
+        FileIngestionInfo {
             path: "/FullerDocuments/JonDocuments/Work/Reports".to_string(),
             name: "Reports".to_string(),
             size: 0,
@@ -110,7 +110,7 @@ fn create_complex_nested_structure() -> Vec<FileInfo> {
             group: Some("admin".to_string()),
             metadata: None,
         },
-        FileInfo {
+        FileIngestionInfo {
             path: "/FullerDocuments/JonDocuments/Work/Projects/WebApp".to_string(),
             name: "WebApp".to_string(),
             size: 0,
@@ -125,7 +125,7 @@ fn create_complex_nested_structure() -> Vec<FileInfo> {
             metadata: None,
         },
         // Files at various nesting levels - this is the key part that was failing
-        FileInfo {
+        FileIngestionInfo {
             path: "/FullerDocuments/JonDocuments/index.txt".to_string(),
             name: "index.txt".to_string(),
             size: 1500,
@@ -139,7 +139,7 @@ fn create_complex_nested_structure() -> Vec<FileInfo> {
             group: Some("admin".to_string()),
             metadata: None,
         },
-        FileInfo {
+        FileIngestionInfo {
             path: "/FullerDocuments/JonDocuments/Work/schedule.pdf".to_string(),
             name: "schedule.pdf".to_string(),
             size: 2048000,
@@ -153,7 +153,7 @@ fn create_complex_nested_structure() -> Vec<FileInfo> {
             group: Some("admin".to_string()),
             metadata: None,
         },
-        FileInfo {
+        FileIngestionInfo {
             path: "/FullerDocuments/JonDocuments/Work/Projects/proposal.docx".to_string(),
             name: "proposal.docx".to_string(),
             size: 1024000,
@@ -167,7 +167,7 @@ fn create_complex_nested_structure() -> Vec<FileInfo> {
             group: Some("admin".to_string()),
             metadata: None,
         },
-        FileInfo {
+        FileIngestionInfo {
             path: "/FullerDocuments/JonDocuments/Work/Projects/WebApp/design.pdf".to_string(),
             name: "design.pdf".to_string(),
             size: 3072000,
@@ -181,7 +181,7 @@ fn create_complex_nested_structure() -> Vec<FileInfo> {
             group: Some("admin".to_string()),
             metadata: None,
         },
-        FileInfo {
+        FileIngestionInfo {
             path: "/FullerDocuments/JonDocuments/Work/Reports/monthly.pdf".to_string(),
             name: "monthly.pdf".to_string(),
             size: 4096000,
@@ -195,7 +195,7 @@ fn create_complex_nested_structure() -> Vec<FileInfo> {
             group: Some("admin".to_string()),
             metadata: None,
         },
-        FileInfo {
+        FileIngestionInfo {
             path: "/FullerDocuments/JonDocuments/Personal/diary.txt".to_string(),
             name: "diary.txt".to_string(),
             size: 5120,
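Note: the hunks above only ever show the first few fields of each constructor, so the shape of the renamed type has to be pieced together. A minimal sketch, assuming only what is visible in this diff; the real struct in readur's models module carries additional fields (ETag, timestamps, MIME type and the like) that every hunk elides:

// Minimal sketch reconstructed from the test constructors in this diff.
// The field list is deliberately incomplete; serde_json is a crate dependency.
#[derive(Debug, Clone)]
pub struct FileIngestionInfo {
    pub path: String,                        // full path on the source, e.g. "/FullerDocuments/JonDocuments/index.txt"
    pub name: String,                        // file or directory name
    pub size: i64,                           // bytes; 0 for directories in these fixtures
    pub group: Option<String>,               // owning group as reported by the source
    pub metadata: Option<serde_json::Value>, // source-specific extras; None throughout these tests
}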

@@ -5,7 +5,7 @@ use quick_xml::reader::Reader;
 use std::str;
 use serde_json;
 
-use crate::models::FileInfo;
+use crate::models::FileIngestionInfo;
 
 #[derive(Debug, Default)]
 struct PropFindResponse {
@@ -24,7 +24,7 @@ struct PropFindResponse {
     metadata: Option<serde_json::Value>,
 }
 
-pub fn parse_propfind_response(xml_text: &str) -> Result<Vec<FileInfo>> {
+pub fn parse_propfind_response(xml_text: &str) -> Result<Vec<FileIngestionInfo>> {
     let mut reader = Reader::from_str(xml_text);
     reader.config_mut().trim_text(true);
 
@@ -200,7 +200,7 @@ pub fn parse_propfind_response(xml_text: &str) -> Result<Vec<FileInfo>> {
         // Use the metadata collected during parsing
         let metadata = resp.metadata;
 
-        let file_info = FileInfo {
+        let file_info = FileIngestionInfo {
             path: resp.href.clone(),
             name,
             size: resp.content_length.unwrap_or(0),
@@ -248,7 +248,7 @@ pub fn parse_propfind_response(xml_text: &str) -> Result<Vec<FileInfo>> {
 
 /// Parse PROPFIND response including both files and directories
 /// This is used for shallow directory scans where we need to track directory structure
-pub fn parse_propfind_response_with_directories(xml_text: &str) -> Result<Vec<FileInfo>> {
+pub fn parse_propfind_response_with_directories(xml_text: &str) -> Result<Vec<FileIngestionInfo>> {
     let mut reader = Reader::from_str(xml_text);
     reader.config_mut().trim_text(true);
 
@@ -415,7 +415,7 @@ pub fn parse_propfind_response_with_directories(xml_text: &str) -> Result<Vec<Fi
             }
         });
 
-        let file_info = FileInfo {
+        let file_info = FileIngestionInfo {
             path: resp.href.clone(),
             name,
             size: resp.content_length.unwrap_or(0),
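Both parser entry points now hand back Vec<FileIngestionInfo>. A rough caller sketch; the XML below is a hand-written minimal 207 Multistatus body, not a fixture from this repository, and whether the parser accepts a document this sparse (and returns anyhow::Result) is an assumption:

use readur::webdav_xml_parser::parse_propfind_response;

fn main() -> anyhow::Result<()> {
    // Hand-written minimal PROPFIND response; real servers return richer props.
    let xml = r#"<?xml version="1.0"?>
<d:multistatus xmlns:d="DAV:">
  <d:response>
    <d:href>/Documents/report.pdf</d:href>
    <d:propstat>
      <d:prop><d:getcontentlength>1024</d:getcontentlength></d:prop>
      <d:status>HTTP/1.1 200 OK</d:status>
    </d:propstat>
  </d:response>
</d:multistatus>"#;

    for file in parse_propfind_response(xml)? {
        println!("{} ({} bytes) at {}", file.name, file.size, file.path);
    }
    Ok(())
}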

@@ -8,7 +8,7 @@ use readur::{
     AppState,
     db::Database,
     config::Config,
-    models::{FileInfo, Document, Source, SourceType, SourceStatus},
+    models::{FileIngestionInfo, Document, Source, SourceType, SourceStatus},
 };
 
 // Helper function to calculate file hash
@@ -20,8 +20,8 @@ fn calculate_file_hash(data: &[u8]) -> String {
 }
 
 // Helper function to create test file info
-fn create_test_file_info(name: &str, path: &str, content: &[u8]) -> FileInfo {
-    FileInfo {
+fn create_test_file_info(name: &str, path: &str, content: &[u8]) -> FileIngestionInfo {
+    FileIngestionInfo {
         name: name.to_string(),
         path: path.to_string(),
         size: content.len() as i64,

@@ -2,7 +2,7 @@ use tokio;
 use uuid::Uuid;
 use chrono::Utc;
 use anyhow::Result;
-use readur::models::{FileInfo, CreateWebDAVDirectory, CreateUser, UserRole};
+use readur::models::{FileIngestionInfo, CreateWebDAVDirectory, CreateUser, UserRole};
 use readur::services::webdav::{WebDAVService, WebDAVConfig};
 use readur::db::Database;
 
@@ -22,10 +22,10 @@ fn create_test_webdav_service() -> WebDAVService {
 }
 
 // Mock files structure that represents a real directory with subdirectories
-fn mock_realistic_directory_structure() -> Vec<FileInfo> {
+fn mock_realistic_directory_structure() -> Vec<FileIngestionInfo> {
     vec![
         // Parent root directory
-        FileInfo {
+        FileIngestionInfo {
             path: "/FullerDocuments".to_string(),
             name: "FullerDocuments".to_string(),
             size: 0,
@@ -40,7 +40,7 @@ fn mock_realistic_directory_structure() -> Vec<FileInfo> {
             metadata: None,
         },
         // Root directory
-        FileInfo {
+        FileIngestionInfo {
             path: "/FullerDocuments/JonDocuments".to_string(),
             name: "JonDocuments".to_string(),
             size: 0,
@@ -55,7 +55,7 @@ fn mock_realistic_directory_structure() -> Vec<FileInfo> {
             metadata: None,
         },
         // Subdirectory level 1
-        FileInfo {
+        FileIngestionInfo {
             path: "/FullerDocuments/JonDocuments/Projects".to_string(),
             name: "Projects".to_string(),
             size: 0,
@@ -69,7 +69,7 @@ fn mock_realistic_directory_structure() -> Vec<FileInfo> {
             group: Some("admin".to_string()),
             metadata: None,
         },
-        FileInfo {
+        FileIngestionInfo {
             path: "/FullerDocuments/JonDocuments/Archive".to_string(),
             name: "Archive".to_string(),
             size: 0,
@@ -84,7 +84,7 @@ fn mock_realistic_directory_structure() -> Vec<FileInfo> {
             metadata: None,
         },
         // Subdirectory level 2
-        FileInfo {
+        FileIngestionInfo {
             path: "/FullerDocuments/JonDocuments/Projects/WebDev".to_string(),
             name: "WebDev".to_string(),
             size: 0,
@@ -98,7 +98,7 @@ fn mock_realistic_directory_structure() -> Vec<FileInfo> {
             group: Some("admin".to_string()),
             metadata: None,
         },
-        FileInfo {
+        FileIngestionInfo {
             path: "/FullerDocuments/JonDocuments/Projects/Mobile".to_string(),
             name: "Mobile".to_string(),
             size: 0,
@@ -113,7 +113,7 @@ fn mock_realistic_directory_structure() -> Vec<FileInfo> {
             metadata: None,
         },
         // Files in various directories
-        FileInfo {
+        FileIngestionInfo {
             path: "/FullerDocuments/JonDocuments/readme.txt".to_string(),
             name: "readme.txt".to_string(),
             size: 1024,
@@ -127,7 +127,7 @@ fn mock_realistic_directory_structure() -> Vec<FileInfo> {
             group: Some("admin".to_string()),
             metadata: None,
         },
-        FileInfo {
+        FileIngestionInfo {
             path: "/FullerDocuments/JonDocuments/Projects/project-overview.pdf".to_string(),
             name: "project-overview.pdf".to_string(),
             size: 2048000,
@@ -141,7 +141,7 @@ fn mock_realistic_directory_structure() -> Vec<FileInfo> {
             group: Some("admin".to_string()),
             metadata: None,
         },
-        FileInfo {
+        FileIngestionInfo {
             path: "/FullerDocuments/JonDocuments/Projects/WebDev/website-specs.docx".to_string(),
             name: "website-specs.docx".to_string(),
             size: 512000,
@@ -155,7 +155,7 @@ fn mock_realistic_directory_structure() -> Vec<FileInfo> {
             group: Some("admin".to_string()),
             metadata: None,
         },
-        FileInfo {
+        FileIngestionInfo {
             path: "/FullerDocuments/JonDocuments/Projects/Mobile/app-design.pdf".to_string(),
             name: "app-design.pdf".to_string(),
             size: 1536000,
@@ -169,7 +169,7 @@ fn mock_realistic_directory_structure() -> Vec<FileInfo> {
             group: Some("admin".to_string()),
             metadata: None,
         },
-        FileInfo {
+        FileIngestionInfo {
             path: "/FullerDocuments/JonDocuments/Archive/old-notes.txt".to_string(),
             name: "old-notes.txt".to_string(),
             size: 256,

@@ -8,7 +8,7 @@ use readur::{
     AppState,
     db::Database,
     config::Config,
-    models::{FileInfo, CreateWebDAVFile, Document},
+    models::{FileIngestionInfo, CreateWebDAVFile, Document},
 };
 
 // Helper function to calculate file hash
@@ -20,8 +20,8 @@ fn calculate_file_hash(data: &[u8]) -> String {
 }
 
 // Helper function to create test file info
-fn create_test_file_info(name: &str, path: &str, size: i64) -> FileInfo {
-    FileInfo {
+fn create_test_file_info(name: &str, path: &str, size: i64) -> FileIngestionInfo {
+    FileIngestionInfo {
         name: name.to_string(),
         path: path.to_string(),
         size,
@@ -282,7 +282,7 @@ async fn test_webdav_sync_etag_change_detection() -> Result<()> {
     assert_eq!(existing_file.etag, old_etag);
 
     // Simulate file with new ETag (indicating change)
-    let file_info = FileInfo {
+    let file_info = FileIngestionInfo {
         name: "updated.pdf".to_string(),
         path: webdav_path.to_string(),
         size: 1024,
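The ETag test above encodes the core sync rule: a file is re-fetched only when the server-reported ETag differs from the one recorded at the last sync. A self-contained sketch of that comparison, with illustrative names rather than readur's actual API:

use std::collections::HashMap;

// A file needs a resync if it was never seen, or its ETag moved on the server.
fn needs_resync(known_etags: &HashMap<String, String>, path: &str, server_etag: &str) -> bool {
    match known_etags.get(path) {
        Some(stored) => stored != server_etag, // changed on the server
        None => true,                          // never seen before
    }
}

fn main() {
    let mut known = HashMap::new();
    known.insert("/Documents/updated.pdf".to_string(), "\"etag-v1\"".to_string());

    assert!(!needs_resync(&known, "/Documents/updated.pdf", "\"etag-v1\"")); // unchanged
    assert!(needs_resync(&known, "/Documents/updated.pdf", "\"etag-v2\"")); // changed
    assert!(needs_resync(&known, "/Documents/new.pdf", "\"etag-v1\""));     // new file
}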

@@ -1,5 +1,5 @@
 use readur::services::webdav::{WebDAVService, WebDAVConfig};
-use readur::models::FileInfo;
+use readur::models::FileIngestionInfo;
 use tokio;
 use chrono::Utc;
 
@@ -38,10 +38,10 @@ fn mock_directory_etag_response(etag: &str) -> String {
 }
 
 // Mock complex nested directory structure
-fn mock_nested_directory_files() -> Vec<FileInfo> {
+fn mock_nested_directory_files() -> Vec<FileIngestionInfo> {
     vec![
         // Root directory
-        FileInfo {
+        FileIngestionInfo {
             path: "/Documents".to_string(),
             name: "Documents".to_string(),
             size: 0,
@@ -56,7 +56,7 @@ fn mock_nested_directory_files() -> Vec<FileInfo> {
             metadata: None,
         },
         // Level 1 directories
-        FileInfo {
+        FileIngestionInfo {
             path: "/Documents/2024".to_string(),
             name: "2024".to_string(),
             size: 0,
@@ -70,7 +70,7 @@ fn mock_nested_directory_files() -> Vec<FileInfo> {
             group: Some("admin".to_string()),
             metadata: None,
         },
-        FileInfo {
+        FileIngestionInfo {
             path: "/Documents/Archive".to_string(),
             name: "Archive".to_string(),
             size: 0,
@@ -85,7 +85,7 @@ fn mock_nested_directory_files() -> Vec<FileInfo> {
             metadata: None,
         },
         // Level 2 directories
-        FileInfo {
+        FileIngestionInfo {
             path: "/Documents/2024/Q1".to_string(),
             name: "Q1".to_string(),
             size: 0,
@@ -99,7 +99,7 @@ fn mock_nested_directory_files() -> Vec<FileInfo> {
             group: Some("admin".to_string()),
             metadata: None,
         },
-        FileInfo {
+        FileIngestionInfo {
             path: "/Documents/2024/Q2".to_string(),
             name: "Q2".to_string(),
             size: 0,
@@ -114,7 +114,7 @@ fn mock_nested_directory_files() -> Vec<FileInfo> {
             metadata: None,
         },
         // Level 3 directory
-        FileInfo {
+        FileIngestionInfo {
             path: "/Documents/2024/Q1/Reports".to_string(),
             name: "Reports".to_string(),
             size: 0,
@@ -129,7 +129,7 @@ fn mock_nested_directory_files() -> Vec<FileInfo> {
             metadata: None,
         },
         // Files at various levels
-        FileInfo {
+        FileIngestionInfo {
             path: "/Documents/root-file.pdf".to_string(),
             name: "root-file.pdf".to_string(),
             size: 1024000,
@@ -143,7 +143,7 @@ fn mock_nested_directory_files() -> Vec<FileInfo> {
             group: Some("admin".to_string()),
             metadata: None,
         },
-        FileInfo {
+        FileIngestionInfo {
             path: "/Documents/2024/annual-report.pdf".to_string(),
             name: "annual-report.pdf".to_string(),
             size: 2048000,
@@ -157,7 +157,7 @@ fn mock_nested_directory_files() -> Vec<FileInfo> {
             group: Some("admin".to_string()),
             metadata: None,
         },
-        FileInfo {
+        FileIngestionInfo {
             path: "/Documents/2024/Q1/q1-summary.pdf".to_string(),
             name: "q1-summary.pdf".to_string(),
             size: 512000,
@@ -171,7 +171,7 @@ fn mock_nested_directory_files() -> Vec<FileInfo> {
             group: Some("admin".to_string()),
             metadata: None,
         },
-        FileInfo {
+        FileIngestionInfo {
             path: "/Documents/2024/Q1/Reports/detailed-report.pdf".to_string(),
             name: "detailed-report.pdf".to_string(),
             size: 4096000,
@@ -185,7 +185,7 @@ fn mock_nested_directory_files() -> Vec<FileInfo> {
             group: Some("admin".to_string()),
             metadata: None,
         },
-        FileInfo {
+        FileIngestionInfo {
             path: "/Documents/Archive/old-document.pdf".to_string(),
             name: "old-document.pdf".to_string(),
             size: 256000,

@@ -1,5 +1,5 @@
 use readur::services::webdav::{WebDAVService, WebDAVConfig};
-use readur::models::FileInfo;
+use readur::models::FileIngestionInfo;
 use tokio;
 use chrono::Utc;
 
@@ -23,7 +23,7 @@ async fn test_empty_directory_tracking() {
     let service = create_test_webdav_service();
 
     // Test completely empty directory
-    let empty_files: Vec<FileInfo> = vec![];
+    let empty_files: Vec<FileIngestionInfo> = vec![];
 
     // Test the directory extraction logic that happens in track_subdirectories_recursively
     let mut all_directories = std::collections::BTreeSet::new();
@@ -57,7 +57,7 @@ async fn test_directory_only_structure() {
 
     // Test structure with only directories, no files
    let directory_only_files = vec![
-        FileInfo {
+        FileIngestionInfo {
            path: "/Documents".to_string(),
            name: "Documents".to_string(),
            size: 0,
@@ -71,7 +71,7 @@ async fn test_directory_only_structure() {
            group: Some("admin".to_string()),
            metadata: None,
        },
-        FileInfo {
+        FileIngestionInfo {
            path: "/Documents/Empty1".to_string(),
            name: "Empty1".to_string(),
            size: 0,
@@ -85,7 +85,7 @@ async fn test_directory_only_structure() {
            group: Some("admin".to_string()),
            metadata: None,
        },
-        FileInfo {
+        FileIngestionInfo {
            path: "/Documents/Empty2".to_string(),
            name: "Empty2".to_string(),
            size: 0,
@@ -136,7 +136,7 @@ async fn test_very_deep_nesting() {
 
    let deep_files = vec![
        // All directories in the path
-        FileInfo {
+        FileIngestionInfo {
            path: "/Documents".to_string(),
            name: "Documents".to_string(),
            size: 0,
@@ -151,7 +151,7 @@ async fn test_very_deep_nesting() {
            metadata: None,
        },
        // All intermediate directories from L1 to L10
-        FileInfo {
+        FileIngestionInfo {
            path: "/Documents/L1".to_string(),
            name: "L1".to_string(),
            size: 0,
@@ -165,7 +165,7 @@ async fn test_very_deep_nesting() {
            group: Some("admin".to_string()),
            metadata: None,
        },
-        FileInfo {
+        FileIngestionInfo {
            path: "/Documents/L1/L2".to_string(),
            name: "L2".to_string(),
            size: 0,
@@ -179,7 +179,7 @@ async fn test_very_deep_nesting() {
            group: Some("admin".to_string()),
            metadata: None,
        },
-        FileInfo {
+        FileIngestionInfo {
            path: "/Documents/L1/L2/L3".to_string(),
            name: "L3".to_string(),
            size: 0,
@@ -193,7 +193,7 @@ async fn test_very_deep_nesting() {
            group: Some("admin".to_string()),
            metadata: None,
        },
-        FileInfo {
+        FileIngestionInfo {
            path: deep_path.to_string(),
            name: "L10".to_string(),
            size: 0,
@@ -208,7 +208,7 @@ async fn test_very_deep_nesting() {
            metadata: None,
        },
        // File at the deepest level
-        FileInfo {
+        FileIngestionInfo {
            path: file_path.clone(),
            name: "deep-file.pdf".to_string(),
            size: 1024000,
@@ -266,7 +266,7 @@ async fn test_special_characters_in_paths() {
 
    // Test paths with special characters, spaces, unicode
    let special_files = vec![
-        FileInfo {
+        FileIngestionInfo {
            path: "/Documents/Folder with spaces".to_string(),
            name: "Folder with spaces".to_string(),
            size: 0,
@@ -280,7 +280,7 @@ async fn test_special_characters_in_paths() {
            group: Some("admin".to_string()),
            metadata: None,
        },
-        FileInfo {
+        FileIngestionInfo {
            path: "/Documents/Folder-with-dashes".to_string(),
            name: "Folder-with-dashes".to_string(),
            size: 0,
@@ -294,7 +294,7 @@ async fn test_special_characters_in_paths() {
            group: Some("admin".to_string()),
            metadata: None,
        },
-        FileInfo {
+        FileIngestionInfo {
            path: "/Documents/Документы".to_string(), // Cyrillic
            name: "Документы".to_string(),
            size: 0,
@@ -308,7 +308,7 @@ async fn test_special_characters_in_paths() {
            group: Some("admin".to_string()),
            metadata: None,
        },
-        FileInfo {
+        FileIngestionInfo {
            path: "/Documents/Folder with spaces/file with spaces.pdf".to_string(),
            name: "file with spaces.pdf".to_string(),
            size: 1024000,
@@ -686,7 +686,7 @@ async fn test_large_directory_structures() {
    let mut large_files = Vec::new();
 
    // Add root directory
-    large_files.push(FileInfo {
+    large_files.push(FileIngestionInfo {
        path: "/Documents".to_string(),
        name: "Documents".to_string(),
        size: 0,
@@ -706,7 +706,7 @@ async fn test_large_directory_structures() {
        let level1_path = format!("/Documents/Dir{:03}", i);
 
        // Add level-1 directory
-        large_files.push(FileInfo {
+        large_files.push(FileIngestionInfo {
            path: level1_path.clone(),
            name: format!("Dir{:03}", i),
            size: 0,
@@ -724,7 +724,7 @@ async fn test_large_directory_structures() {
        // Add 10 subdirectories
        for j in 0..10 {
            let level2_path = format!("{}/SubDir{:02}", level1_path, j);
-            large_files.push(FileInfo {
+            large_files.push(FileIngestionInfo {
                path: level2_path.clone(),
                name: format!("SubDir{:02}", j),
                size: 0,
@@ -741,7 +741,7 @@ async fn test_large_directory_structures() {
 
            // Add 5 files in each subdirectory
            for k in 0..5 {
-                large_files.push(FileInfo {
+                large_files.push(FileIngestionInfo {
                    path: format!("{}/file{:02}.pdf", level2_path, k),
                    name: format!("file{:02}.pdf", k),
                    size: 1024 * (k + 1) as i64,
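Several of the tests above exercise the same step: deriving the set of ancestor directories from a flat file listing, so that tracking covers deep and empty intermediate levels exactly once. A self-contained sketch of that extraction, assuming nothing about readur's internals beyond the BTreeSet usage visible in the first hunk; track_subdirectories_recursively itself does more:

use std::collections::BTreeSet;

// Collect every ancestor directory of every file path into an ordered set.
fn collect_ancestor_dirs(paths: &[&str]) -> BTreeSet<String> {
    let mut dirs = BTreeSet::new();
    for path in paths {
        let mut current = std::path::Path::new(path);
        while let Some(parent) = current.parent() {
            // Stop at the filesystem root or an empty component.
            if parent.as_os_str().is_empty() || parent == std::path::Path::new("/") {
                break;
            }
            dirs.insert(parent.to_string_lossy().into_owned());
            current = parent;
        }
    }
    dirs
}

fn main() {
    let dirs = collect_ancestor_dirs(&["/Documents/2024/Q1/q1-summary.pdf"]);
    assert!(dirs.contains("/Documents"));
    assert!(dirs.contains("/Documents/2024"));
    assert!(dirs.contains("/Documents/2024/Q1"));
}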

@@ -1,6 +1,6 @@
 use readur::services::webdav::{WebDAVService, WebDAVConfig, RetryConfig};
 use readur::webdav_xml_parser::parse_propfind_response;
-use readur::models::FileInfo;
+use readur::models::FileIngestionInfo;
 use readur::models::*;
 use chrono::Utc;
 use uuid::Uuid;
@@ -607,7 +607,7 @@ fn test_special_characters_in_paths() {
     ];
 
     for path in test_paths {
-        let file_info = FileInfo {
+        let file_info = FileIngestionInfo {
             path: path.to_string(),
             name: std::path::Path::new(path)
                 .file_name()

@@ -2,7 +2,7 @@ use tokio;
 use uuid::Uuid;
 use chrono::Utc;
 use std::collections::HashMap;
-use readur::models::FileInfo;
+use readur::models::FileIngestionInfo;
 use readur::services::webdav::{WebDAVService, WebDAVConfig};
 
 // Helper function to create test WebDAV service for smart scanning
@@ -35,10 +35,10 @@ fn create_generic_webdav_service() -> WebDAVService {
 }
 
 // Mock directory structure with subdirectories for testing
-fn create_mock_directory_structure() -> Vec<FileInfo> {
+fn create_mock_directory_structure() -> Vec<FileIngestionInfo> {
     vec![
         // Root directory
-        FileInfo {
+        FileIngestionInfo {
             path: "/Documents".to_string(),
             name: "Documents".to_string(),
             size: 0,
@@ -53,7 +53,7 @@ fn create_mock_directory_structure() -> Vec<FileInfo> {
             metadata: None,
         },
         // Subdirectory 1 - Changed
-        FileInfo {
+        FileIngestionInfo {
             path: "/Documents/Projects".to_string(),
             name: "Projects".to_string(),
             size: 0,
@@ -68,7 +68,7 @@ fn create_mock_directory_structure() -> Vec<FileInfo> {
             metadata: None,
         },
         // File in changed subdirectory
-        FileInfo {
+        FileIngestionInfo {
             path: "/Documents/Projects/report.pdf".to_string(),
             name: "report.pdf".to_string(),
             size: 1024000,
@@ -83,7 +83,7 @@ fn create_mock_directory_structure() -> Vec<FileInfo> {
             metadata: None,
         },
         // Subdirectory 2 - Unchanged
-        FileInfo {
+        FileIngestionInfo {
             path: "/Documents/Archive".to_string(),
             name: "Archive".to_string(),
             size: 0,
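The Changed/Unchanged markers in this mock reflect the smart-scan idea: keep the last-known ETag per directory and rescan only the directories whose ETag moved. A self-contained sketch with illustrative names, not readur's actual API:

use std::collections::HashMap;

// A directory needs a rescan if it is new or its ETag differs from the stored one.
fn dirs_to_rescan(known: &HashMap<String, String>, current: &HashMap<String, String>) -> Vec<String> {
    let mut changed = Vec::new();
    for (path, etag) in current {
        if known.get(path) != Some(etag) {
            changed.push(path.clone());
        }
    }
    changed.sort();
    changed
}

fn main() {
    let mut known = HashMap::new();
    known.insert("/Documents/Projects".to_string(), "etag-old".to_string());
    known.insert("/Documents/Archive".to_string(), "etag-same".to_string());

    let mut current = HashMap::new();
    current.insert("/Documents/Projects".to_string(), "etag-new".to_string()); // Changed
    current.insert("/Documents/Archive".to_string(), "etag-same".to_string()); // Unchanged

    assert_eq!(dirs_to_rescan(&known, &current), vec!["/Documents/Projects"]);
}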

@@ -1,5 +1,5 @@
 use readur::services::webdav::{WebDAVService, WebDAVConfig};
-use readur::models::FileInfo;
+use readur::models::FileIngestionInfo;
 use tokio;
 use chrono::Utc;
 
@@ -98,7 +98,7 @@ async fn test_update_single_directory_tracking() {
 
     // Create mock files representing a shallow directory scan
     let files = vec![
-        FileInfo {
+        FileIngestionInfo {
            path: "/Documents".to_string(),
            name: "Documents".to_string(),
            size: 0,
@@ -112,7 +112,7 @@ async fn test_update_single_directory_tracking() {
            group: Some("admin".to_string()),
            metadata: None,
        },
-        FileInfo {
+        FileIngestionInfo {
            path: "/Documents/file1.pdf".to_string(),
            name: "file1.pdf".to_string(),
            size: 1024000,
@@ -126,7 +126,7 @@ async fn test_update_single_directory_tracking() {
            group: Some("admin".to_string()),
            metadata: None,
        },
-        FileInfo {
+        FileIngestionInfo {
            path: "/Documents/file2.pdf".to_string(),
            name: "file2.pdf".to_string(),
            size: 2048000,
@@ -140,7 +140,7 @@ async fn test_update_single_directory_tracking() {
            group: Some("admin".to_string()),
            metadata: None,
        },
-        FileInfo {
+        FileIngestionInfo {
            path: "/Documents/SubFolder".to_string(),
            name: "SubFolder".to_string(),
            size: 0,

@@ -1,5 +1,5 @@
 use readur::services::webdav::{WebDAVService, WebDAVConfig};
-use readur::models::FileInfo;
+use readur::models::FileIngestionInfo;
 use readur::models::*;
 use tokio;
 