feat(server/client): implement updated FailedOcrPage, duplicate management, and file hashing
parent 9dccc6d1de
commit bdb136d615
@@ -27,6 +27,8 @@ import {
  Collapse,
  LinearProgress,
  Snackbar,
  Tabs,
  Tab,
} from '@mui/material';
import {
  Refresh as RefreshIcon,
@@ -37,6 +39,7 @@ import {
  Schedule as ScheduleIcon,
  Visibility as VisibilityIcon,
  Download as DownloadIcon,
  FileCopy as FileCopyIcon,
} from '@mui/icons-material';
import { format } from 'date-fns';
import { api, documentService } from '../services/api';
@@ -87,16 +90,54 @@ interface RetryResponse {
  estimated_wait_minutes?: number;
}

interface DuplicateDocument {
  id: string;
  filename: string;
  original_filename: string;
  file_size: number;
  mime_type: string;
  created_at: string;
  user_id: string;
}

interface DuplicateGroup {
  file_hash: string;
  duplicate_count: number;
  first_uploaded: string;
  last_uploaded: string;
  documents: DuplicateDocument[];
}

interface DuplicatesResponse {
  duplicates: DuplicateGroup[];
  pagination: {
    total: number;
    limit: number;
    offset: number;
    has_more: boolean;
  };
  statistics: {
    total_duplicate_groups: number;
  };
}

const FailedOcrPage: React.FC = () => {
  const [currentTab, setCurrentTab] = useState(0);
  const [documents, setDocuments] = useState<FailedDocument[]>([]);
  const [duplicates, setDuplicates] = useState<DuplicateGroup[]>([]);
  const [loading, setLoading] = useState(true);
  const [duplicatesLoading, setDuplicatesLoading] = useState(false);
  const [retrying, setRetrying] = useState<string | null>(null);
  const [statistics, setStatistics] = useState<FailedOcrResponse['statistics'] | null>(null);
  const [duplicateStatistics, setDuplicateStatistics] = useState<DuplicatesResponse['statistics'] | null>(null);
  const [pagination, setPagination] = useState({ page: 1, limit: 25 });
  const [duplicatesPagination, setDuplicatesPagination] = useState({ page: 1, limit: 25 });
  const [totalPages, setTotalPages] = useState(0);
  const [duplicatesTotalPages, setDuplicatesTotalPages] = useState(0);
  const [selectedDocument, setSelectedDocument] = useState<FailedDocument | null>(null);
  const [detailsOpen, setDetailsOpen] = useState(false);
  const [expandedRows, setExpandedRows] = useState<Set<string>>(new Set());
  const [expandedDuplicateGroups, setExpandedDuplicateGroups] = useState<Set<string>>(new Set());
  const [snackbar, setSnackbar] = useState<{ open: boolean; message: string; severity: 'success' | 'error' }>({
    open: false,
    message: '',
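Because each DuplicateGroup carries its full document list ordered by upload time, a client can work out which copies are redundant without extra requests. A minimal sketch, assuming only the interfaces above; pickRedundantDocuments is a hypothetical helper, not part of this commit:

// Hypothetical helper (not in this commit): keep the oldest upload in each
// duplicate group and return the ids of every other copy.
function pickRedundantDocuments(groups: DuplicateGroup[]): string[] {
  const redundant: string[] = [];
  for (const group of groups) {
    const sorted = [...group.documents].sort(
      (a, b) => new Date(a.created_at).getTime() - new Date(b.created_at).getTime()
    );
    // Everything after the earliest upload is a redundant copy.
    redundant.push(...sorted.slice(1).map((doc) => doc.id));
  }
  return redundant;
}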
@@ -124,10 +165,37 @@ const FailedOcrPage: React.FC = () => {
    }
  };

  const fetchDuplicates = async () => {
    try {
      setDuplicatesLoading(true);
      const offset = (duplicatesPagination.page - 1) * duplicatesPagination.limit;
      const response = await documentService.getDuplicates(duplicatesPagination.limit, offset);

      setDuplicates(response.data.duplicates);
      setDuplicateStatistics(response.data.statistics);
      setDuplicatesTotalPages(Math.ceil(response.data.pagination.total / duplicatesPagination.limit));
    } catch (error) {
      console.error('Failed to fetch duplicates:', error);
      setSnackbar({
        open: true,
        message: 'Failed to load duplicate documents',
        severity: 'error'
      });
    } finally {
      setDuplicatesLoading(false);
    }
  };

  useEffect(() => {
    fetchFailedDocuments();
  }, [pagination.page]);

  useEffect(() => {
    if (currentTab === 1) {
      fetchDuplicates();
    }
  }, [currentTab, duplicatesPagination.page]);

  const handleRetryOcr = async (document: FailedDocument) => {
    try {
      setRetrying(document.id);
@@ -200,6 +268,28 @@ const FailedOcrPage: React.FC = () => {
    setDetailsOpen(true);
  };

  const toggleDuplicateGroupExpansion = (groupHash: string) => {
    const newExpanded = new Set(expandedDuplicateGroups);
    if (newExpanded.has(groupHash)) {
      newExpanded.delete(groupHash);
    } else {
      newExpanded.add(groupHash);
    }
    setExpandedDuplicateGroups(newExpanded);
  };

  const handleTabChange = (event: React.SyntheticEvent, newValue: number) => {
    setCurrentTab(newValue);
  };

  const refreshCurrentTab = () => {
    if (currentTab === 0) {
      fetchFailedDocuments();
    } else {
      fetchDuplicates();
    }
  };

  if (loading && documents.length === 0) {
    return (
      <Box display="flex" justifyContent="center" alignItems="center" minHeight="400px">
@@ -212,20 +302,38 @@ const FailedOcrPage: React.FC = () => {
    <Box sx={{ p: 3 }}>
      <Box display="flex" justifyContent="space-between" alignItems="center" mb={3}>
        <Typography variant="h4" component="h1">
          Failed OCR Documents
          Failed OCR & Duplicates
        </Typography>
        <Button
          variant="outlined"
          startIcon={<RefreshIcon />}
          onClick={fetchFailedDocuments}
          disabled={loading}
          onClick={refreshCurrentTab}
          disabled={loading || duplicatesLoading}
        >
          Refresh
        </Button>
      </Box>

      {/* Statistics Overview */}
      {statistics && (
      <Paper sx={{ mb: 3 }}>
        <Tabs value={currentTab} onChange={handleTabChange} aria-label="failed ocr and duplicates tabs">
          <Tab
            icon={<ErrorIcon />}
            label={`Failed OCR${statistics ? ` (${statistics.total_failed})` : ''}`}
            iconPosition="start"
          />
          <Tab
            icon={<FileCopyIcon />}
            label={`Duplicates${duplicateStatistics ? ` (${duplicateStatistics.total_duplicate_groups})` : ''}`}
            iconPosition="start"
          />
        </Tabs>
      </Paper>

      {/* Failed OCR Tab Content */}
      {currentTab === 0 && (
        <>
          {/* Statistics Overview */}
          {statistics && (
            <Grid container spacing={3} mb={3}>
              <Grid item xs={12} md={4}>
                <Card>
@ -435,6 +543,174 @@ const FailedOcrPage: React.FC = () => {
|
|||
)}
|
||||
</>
|
||||
)}
|
||||
</>
|
||||
)}
|
||||
|
||||
{/* Duplicates Tab Content */}
|
||||
{currentTab === 1 && (
|
||||
<>
|
||||
{/* Duplicate Statistics Overview */}
|
||||
{duplicateStatistics && (
|
||||
<Grid container spacing={3} mb={3}>
|
||||
<Grid item xs={12} md={6}>
|
||||
<Card>
|
||||
<CardContent>
|
||||
<Typography variant="h6" color="warning.main">
|
||||
<FileCopyIcon sx={{ mr: 1, verticalAlign: 'middle' }} />
|
||||
Total Duplicate Groups
|
||||
</Typography>
|
||||
<Typography variant="h3" color="warning.main">
|
||||
{duplicateStatistics.total_duplicate_groups}
|
||||
</Typography>
|
||||
</CardContent>
|
||||
</Card>
|
||||
</Grid>
|
||||
</Grid>
|
||||
)}
|
||||
|
||||
{duplicatesLoading ? (
|
||||
<Box display="flex" justifyContent="center" alignItems="center" minHeight="400px">
|
||||
<CircularProgress />
|
||||
</Box>
|
||||
) : duplicates.length === 0 ? (
|
||||
<Alert severity="success" sx={{ mt: 2 }}>
|
||||
<AlertTitle>No duplicates found!</AlertTitle>
|
||||
You don't have any duplicate documents. All your files have unique content.
|
||||
</Alert>
|
||||
) : (
|
||||
<>
|
||||
<Alert severity="info" sx={{ mb: 2 }}>
|
||||
<AlertTitle>Duplicate Documents</AlertTitle>
|
||||
These documents have identical content but may have different filenames.
|
||||
You can click on each group to see all the documents with the same content.
|
||||
</Alert>
|
||||
|
||||
<TableContainer component={Paper}>
|
||||
<Table>
|
||||
<TableHead>
|
||||
<TableRow>
|
||||
<TableCell />
|
||||
<TableCell>Content Hash</TableCell>
|
||||
<TableCell>Duplicate Count</TableCell>
|
||||
<TableCell>First Uploaded</TableCell>
|
||||
<TableCell>Last Uploaded</TableCell>
|
||||
<TableCell>Actions</TableCell>
|
||||
</TableRow>
|
||||
</TableHead>
|
||||
<TableBody>
|
||||
{duplicates.map((group) => (
|
||||
<React.Fragment key={group.file_hash}>
|
||||
<TableRow>
|
||||
<TableCell>
|
||||
<IconButton
|
||||
size="small"
|
||||
onClick={() => toggleDuplicateGroupExpansion(group.file_hash)}
|
||||
>
|
||||
{expandedDuplicateGroups.has(group.file_hash) ? <ExpandLessIcon /> : <ExpandMoreIcon />}
|
||||
</IconButton>
|
||||
</TableCell>
|
||||
<TableCell>
|
||||
<Typography variant="body2" fontFamily="monospace">
|
||||
{group.file_hash.substring(0, 16)}...
|
||||
</Typography>
|
||||
</TableCell>
|
||||
<TableCell>
|
||||
<Chip
|
||||
label={`${group.duplicate_count} files`}
|
||||
color="warning"
|
||||
size="small"
|
||||
/>
|
||||
</TableCell>
|
||||
<TableCell>
|
||||
<Typography variant="body2">
|
||||
{format(new Date(group.first_uploaded), 'MMM dd, yyyy')}
|
||||
</Typography>
|
||||
</TableCell>
|
||||
<TableCell>
|
||||
<Typography variant="body2">
|
||||
{format(new Date(group.last_uploaded), 'MMM dd, yyyy')}
|
||||
</Typography>
|
||||
</TableCell>
|
||||
<TableCell>
|
||||
<Typography variant="body2" color="text.secondary">
|
||||
View files below
|
||||
</Typography>
|
||||
</TableCell>
|
||||
</TableRow>
|
||||
<TableRow>
|
||||
<TableCell sx={{ paddingBottom: 0, paddingTop: 0 }} colSpan={6}>
|
||||
<Collapse in={expandedDuplicateGroups.has(group.file_hash)} timeout="auto" unmountOnExit>
|
||||
<Box sx={{ margin: 1, p: 2, bgcolor: 'grey.50' }}>
|
||||
<Typography variant="h6" gutterBottom>
|
||||
Duplicate Files ({group.duplicate_count} total)
|
||||
</Typography>
|
||||
<Grid container spacing={2}>
|
||||
{group.documents.map((doc, index) => (
|
||||
<Grid item xs={12} md={6} key={doc.id}>
|
||||
<Card variant="outlined">
|
||||
<CardContent>
|
||||
<Typography variant="body2" fontWeight="bold">
|
||||
{doc.filename}
|
||||
</Typography>
|
||||
{doc.original_filename !== doc.filename && (
|
||||
<Typography variant="caption" color="text.secondary">
|
||||
Original: {doc.original_filename}
|
||||
</Typography>
|
||||
)}
|
||||
<Typography variant="caption" display="block" color="text.secondary">
|
||||
{formatFileSize(doc.file_size)} • {doc.mime_type}
|
||||
</Typography>
|
||||
<Typography variant="caption" display="block" color="text.secondary">
|
||||
Uploaded: {format(new Date(doc.created_at), 'MMM dd, yyyy HH:mm')}
|
||||
</Typography>
|
||||
<Box mt={1}>
|
||||
<Tooltip title="View Document">
|
||||
<IconButton
|
||||
size="small"
|
||||
onClick={() => window.open(`/api/documents/${doc.id}/view`, '_blank')}
|
||||
>
|
||||
<VisibilityIcon />
|
||||
</IconButton>
|
||||
</Tooltip>
|
||||
<Tooltip title="Download Document">
|
||||
<IconButton
|
||||
size="small"
|
||||
onClick={() => window.open(`/api/documents/${doc.id}/download`, '_blank')}
|
||||
>
|
||||
<DownloadIcon />
|
||||
</IconButton>
|
||||
</Tooltip>
|
||||
</Box>
|
||||
</CardContent>
|
||||
</Card>
|
||||
</Grid>
|
||||
))}
|
||||
</Grid>
|
||||
</Box>
|
||||
</Collapse>
|
||||
</TableCell>
|
||||
</TableRow>
|
||||
</React.Fragment>
|
||||
))}
|
||||
</TableBody>
|
||||
</Table>
|
||||
</TableContainer>
|
||||
|
||||
{/* Duplicates Pagination */}
|
||||
{duplicatesTotalPages > 1 && (
|
||||
<Box display="flex" justifyContent="center" mt={3}>
|
||||
<Pagination
|
||||
count={duplicatesTotalPages}
|
||||
page={duplicatesPagination.page}
|
||||
onChange={(_, page) => setDuplicatesPagination(prev => ({ ...prev, page }))}
|
||||
color="primary"
|
||||
/>
|
||||
</Box>
|
||||
)}
|
||||
</>
|
||||
)}
|
||||
</>
|
||||
)}
|
||||
|
||||
{/* Document Details Dialog */}
|
||||
<Dialog
|
||||
|
|
|
|||
|
|
@@ -183,6 +183,12 @@ export const documentService = {
    })
  },

  getDuplicates: (limit = 25, offset = 0) => {
    return api.get(`/documents/duplicates`, {
      params: { limit, offset },
    })
  },

  search: (searchRequest: SearchRequest) => {
    return api.get<SearchResponse>('/search', {
      params: searchRequest,
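Callers that need every duplicate group rather than a single page can drive a loop off the pagination metadata the endpoint returns. A minimal sketch, assuming the getDuplicates method above and the DuplicateGroup/DuplicatesResponse shapes from the FailedOcrPage diff (exported or redeclared for reuse); fetchAllDuplicateGroups is a hypothetical helper, not part of this commit:

// Hypothetical helper (not in this commit): page through /documents/duplicates
// until the server reports has_more = false.
async function fetchAllDuplicateGroups(pageSize = 25): Promise<DuplicateGroup[]> {
  const groups: DuplicateGroup[] = [];
  let offset = 0;
  for (;;) {
    const response = await documentService.getDuplicates(pageSize, offset);
    const data = response.data as DuplicatesResponse;
    groups.push(...data.duplicates);
    if (!data.pagination.has_more) break;
    offset += pageSize;
  }
  return groups;
}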
@@ -991,6 +991,7 @@ impl Database {
                created_at: row.get("created_at"),
                updated_at: row.get("updated_at"),
                user_id: row.get("user_id"),
                file_hash: row.get("file_hash"),
            });
        }

@@ -1125,6 +1126,7 @@ impl Database {
                created_at: row.get("created_at"),
                updated_at: row.get("updated_at"),
                user_id: row.get("user_id"),
                file_hash: row.get("file_hash"),
            })),
            None => Ok(None),
        }
@@ -1170,4 +1172,124 @@ impl Database {
            None => Ok(None),
        }
    }

    /// Get documents grouped by duplicate hashes for a user
    pub async fn get_user_duplicates(&self, user_id: Uuid, user_role: crate::models::UserRole, limit: i64, offset: i64) -> Result<(Vec<serde_json::Value>, i64)> {
        let (docs_query, count_query) = if user_role == crate::models::UserRole::Admin {
            // Admins can see all duplicates
            (
                r#"
                SELECT
                    file_hash,
                    COUNT(*) as duplicate_count,
                    MIN(created_at) as first_uploaded,
                    MAX(created_at) as last_uploaded,
                    json_agg(
                        json_build_object(
                            'id', id,
                            'filename', filename,
                            'original_filename', original_filename,
                            'file_size', file_size,
                            'mime_type', mime_type,
                            'created_at', created_at,
                            'user_id', user_id
                        ) ORDER BY created_at
                    ) as documents
                FROM documents
                WHERE file_hash IS NOT NULL
                GROUP BY file_hash
                HAVING COUNT(*) > 1
                ORDER BY duplicate_count DESC, first_uploaded DESC
                LIMIT $1 OFFSET $2
                "#,
                r#"
                SELECT COUNT(*) as total FROM (
                    SELECT file_hash
                    FROM documents
                    WHERE file_hash IS NOT NULL
                    GROUP BY file_hash
                    HAVING COUNT(*) > 1
                ) as duplicate_groups
                "#
            )
        } else {
            // Regular users see only their own duplicates
            (
                r#"
                SELECT
                    file_hash,
                    COUNT(*) as duplicate_count,
                    MIN(created_at) as first_uploaded,
                    MAX(created_at) as last_uploaded,
                    json_agg(
                        json_build_object(
                            'id', id,
                            'filename', filename,
                            'original_filename', original_filename,
                            'file_size', file_size,
                            'mime_type', mime_type,
                            'created_at', created_at,
                            'user_id', user_id
                        ) ORDER BY created_at
                    ) as documents
                FROM documents
                WHERE user_id = $3 AND file_hash IS NOT NULL
                GROUP BY file_hash
                HAVING COUNT(*) > 1
                ORDER BY duplicate_count DESC, first_uploaded DESC
                LIMIT $1 OFFSET $2
                "#,
                r#"
                SELECT COUNT(*) as total FROM (
                    SELECT file_hash
                    FROM documents
                    WHERE user_id = $1 AND file_hash IS NOT NULL
                    GROUP BY file_hash
                    HAVING COUNT(*) > 1
                ) as duplicate_groups
                "#
            )
        };

        let rows = if user_role == crate::models::UserRole::Admin {
            sqlx::query(docs_query)
                .bind(limit)
                .bind(offset)
                .fetch_all(&self.pool)
                .await?
        } else {
            sqlx::query(docs_query)
                .bind(limit)
                .bind(offset)
                .bind(user_id)
                .fetch_all(&self.pool)
                .await?
        };

        let duplicates: Vec<serde_json::Value> = rows
            .into_iter()
            .map(|row| {
                serde_json::json!({
                    "file_hash": row.get::<String, _>("file_hash"),
                    "duplicate_count": row.get::<i64, _>("duplicate_count"),
                    "first_uploaded": row.get::<chrono::DateTime<chrono::Utc>, _>("first_uploaded"),
                    "last_uploaded": row.get::<chrono::DateTime<chrono::Utc>, _>("last_uploaded"),
                    "documents": row.get::<serde_json::Value, _>("documents")
                })
            })
            .collect();

        let total = if user_role == crate::models::UserRole::Admin {
            sqlx::query_scalar::<_, i64>(count_query)
                .fetch_one(&self.pool)
                .await?
        } else {
            sqlx::query_scalar::<_, i64>(count_query)
                .bind(user_id)
                .fetch_one(&self.pool)
                .await?
        };

        Ok((duplicates, total))
    }
}
@@ -37,6 +37,7 @@ pub fn router() -> Router<Arc<AppState>> {
        .route("/{id}/processed-image", get(get_processed_image))
        .route("/{id}/retry-ocr", post(retry_ocr))
        .route("/failed-ocr", get(get_failed_ocr_documents))
        .route("/duplicates", get(get_user_duplicates))
}

#[utoipa::path(
@@ -226,7 +227,7 @@ fn calculate_file_hash(data: &[u8]) -> String {
        ("ocr_status" = Option<String>, Query, description = "Filter by OCR status (pending, processing, completed, failed)")
    ),
    responses(
        (status = 200, description = "List of user documents", body = Vec<DocumentResponse>),
        (status = 200, description = "Paginated list of user documents with metadata", body = String),
        (status = 401, description = "Unauthorized")
    )
)]
@@ -809,4 +810,50 @@ async fn get_failure_statistics(
        .collect();

    Ok(serde_json::json!(categories))
}

#[utoipa::path(
    get,
    path = "/api/documents/duplicates",
    tag = "documents",
    security(
        ("bearer_auth" = [])
    ),
    params(
        ("limit" = Option<i64>, Query, description = "Number of duplicate groups to return per page"),
        ("offset" = Option<i64>, Query, description = "Number of duplicate groups to skip")
    ),
    responses(
        (status = 200, description = "User's duplicate documents grouped by hash", body = String),
        (status = 401, description = "Unauthorized")
    )
)]
async fn get_user_duplicates(
    State(state): State<Arc<AppState>>,
    auth_user: AuthUser,
    Query(query): Query<PaginationQuery>,
) -> Result<Json<serde_json::Value>, StatusCode> {
    let limit = query.limit.unwrap_or(25);
    let offset = query.offset.unwrap_or(0);

    let (duplicates, total_count) = state
        .db
        .get_user_duplicates(auth_user.user.id, auth_user.user.role, limit, offset)
        .await
        .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;

    let response = serde_json::json!({
        "duplicates": duplicates,
        "pagination": {
            "total": total_count,
            "limit": limit,
            "offset": offset,
            "has_more": offset + limit < total_count
        },
        "statistics": {
            "total_duplicate_groups": total_count
        }
    });

    Ok(Json(response))
}
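For reference, the payload the handler above produces matches the client's DuplicatesResponse interface. An illustrative example written as a TypeScript constant; only the field names come from this commit, the concrete values are invented:

const exampleDuplicatesResponse: DuplicatesResponse = {
  duplicates: [
    {
      file_hash: "9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08",
      duplicate_count: 2,
      first_uploaded: "2024-01-02T10:15:00Z",
      last_uploaded: "2024-01-05T08:30:00Z",
      documents: [
        {
          id: "doc-1", filename: "invoice.pdf", original_filename: "invoice.pdf",
          file_size: 1024, mime_type: "application/pdf",
          created_at: "2024-01-02T10:15:00Z", user_id: "user-1",
        },
        {
          id: "doc-2", filename: "invoice-copy.pdf", original_filename: "invoice-copy.pdf",
          file_size: 1024, mime_type: "application/pdf",
          created_at: "2024-01-05T08:30:00Z", user_id: "user-1",
        },
      ],
    },
  ],
  pagination: { total: 1, limit: 25, offset: 0, has_more: false },
  statistics: { total_duplicate_groups: 1 },
};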
@ -8,7 +8,12 @@ use crate::{
|
|||
models::{
|
||||
CreateUser, LoginRequest, LoginResponse, UserResponse, UpdateUser,
|
||||
DocumentResponse, SearchRequest, SearchResponse, EnhancedDocumentResponse,
|
||||
SettingsResponse, UpdateSettings, SearchMode, SearchSnippet, HighlightRange
|
||||
SettingsResponse, UpdateSettings, SearchMode, SearchSnippet, HighlightRange,
|
||||
FacetItem, SearchFacetsResponse, Notification, NotificationSummary, CreateNotification,
|
||||
Source, SourceResponse, CreateSource, UpdateSource, SourceWithStats,
|
||||
WebDAVSourceConfig, LocalFolderSourceConfig, S3SourceConfig,
|
||||
WebDAVCrawlEstimate, WebDAVTestConnection, WebDAVConnectionResult, WebDAVSyncStatus,
|
||||
ProcessedImage, CreateProcessedImage
|
||||
},
|
||||
routes::metrics::{
|
||||
SystemMetrics, DatabaseMetrics, OcrMetrics, DocumentMetrics, UserMetrics, GeneralSystemMetrics
|
||||
|
|
@ -26,10 +31,19 @@ use crate::{
|
|||
// Document endpoints
|
||||
crate::routes::documents::upload_document,
|
||||
crate::routes::documents::list_documents,
|
||||
crate::routes::documents::get_document_by_id,
|
||||
crate::routes::documents::download_document,
|
||||
crate::routes::documents::view_document,
|
||||
crate::routes::documents::get_document_thumbnail,
|
||||
crate::routes::documents::get_document_ocr,
|
||||
crate::routes::documents::get_processed_image,
|
||||
crate::routes::documents::retry_ocr,
|
||||
crate::routes::documents::get_failed_ocr_documents,
|
||||
crate::routes::documents::get_user_duplicates,
|
||||
// Search endpoints
|
||||
crate::routes::search::search_documents,
|
||||
crate::routes::search::enhanced_search_documents,
|
||||
crate::routes::search::get_search_facets,
|
||||
// Settings endpoints
|
||||
crate::routes::settings::get_settings,
|
||||
crate::routes::settings::update_settings,
|
||||
|
|
@ -42,14 +56,46 @@ use crate::{
|
|||
// Queue endpoints
|
||||
crate::routes::queue::get_queue_stats,
|
||||
crate::routes::queue::requeue_failed,
|
||||
crate::routes::queue::get_ocr_status,
|
||||
crate::routes::queue::pause_ocr_processing,
|
||||
crate::routes::queue::resume_ocr_processing,
|
||||
// Metrics endpoints
|
||||
crate::routes::metrics::get_system_metrics,
|
||||
// Notifications endpoints
|
||||
crate::routes::notifications::get_notifications,
|
||||
crate::routes::notifications::get_notification_summary,
|
||||
crate::routes::notifications::mark_notification_read,
|
||||
crate::routes::notifications::mark_all_notifications_read,
|
||||
crate::routes::notifications::delete_notification,
|
||||
// Sources endpoints
|
||||
crate::routes::sources::list_sources,
|
||||
crate::routes::sources::create_source,
|
||||
crate::routes::sources::get_source,
|
||||
crate::routes::sources::update_source,
|
||||
crate::routes::sources::delete_source,
|
||||
crate::routes::sources::trigger_sync,
|
||||
crate::routes::sources::stop_sync,
|
||||
crate::routes::sources::test_connection,
|
||||
crate::routes::sources::estimate_crawl,
|
||||
crate::routes::sources::estimate_crawl_with_config,
|
||||
crate::routes::sources::test_connection_with_config,
|
||||
// WebDAV endpoints
|
||||
crate::routes::webdav::start_webdav_sync,
|
||||
crate::routes::webdav::cancel_webdav_sync,
|
||||
crate::routes::webdav::get_webdav_sync_status,
|
||||
crate::routes::webdav::test_webdav_connection,
|
||||
crate::routes::webdav::estimate_webdav_crawl,
|
||||
),
|
||||
components(
|
||||
schemas(
|
||||
CreateUser, LoginRequest, LoginResponse, UserResponse, UpdateUser,
|
||||
DocumentResponse, SearchRequest, SearchResponse, EnhancedDocumentResponse,
|
||||
SettingsResponse, UpdateSettings, SearchMode, SearchSnippet, HighlightRange,
|
||||
FacetItem, SearchFacetsResponse, Notification, NotificationSummary, CreateNotification,
|
||||
Source, SourceResponse, CreateSource, UpdateSource, SourceWithStats,
|
||||
WebDAVSourceConfig, LocalFolderSourceConfig, S3SourceConfig,
|
||||
WebDAVCrawlEstimate, WebDAVTestConnection, WebDAVConnectionResult, WebDAVSyncStatus,
|
||||
ProcessedImage, CreateProcessedImage,
|
||||
SystemMetrics, DatabaseMetrics, OcrMetrics, DocumentMetrics, UserMetrics, GeneralSystemMetrics
|
||||
)
|
||||
),
|
||||
|
|
@ -61,6 +107,9 @@ use crate::{
|
|||
(name = "users", description = "User management endpoints"),
|
||||
(name = "queue", description = "OCR queue management endpoints"),
|
||||
(name = "metrics", description = "System metrics and monitoring endpoints"),
|
||||
(name = "notifications", description = "User notification endpoints"),
|
||||
(name = "sources", description = "Document source management endpoints"),
|
||||
(name = "webdav", description = "WebDAV synchronization endpoints"),
|
||||
),
|
||||
modifiers(&SecurityAddon),
|
||||
info(
|
||||
|
|
|
|||
|
|
@ -48,6 +48,7 @@ mod tests {
|
|||
created_at: Utc::now(),
|
||||
updated_at: Utc::now(),
|
||||
user_id,
|
||||
file_hash: Some("abcd1234567890123456789012345678901234567890123456789012345678".to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@ mod tests {
|
|||
created_at: Utc::now(),
|
||||
updated_at: Utc::now(),
|
||||
user_id,
|
||||
file_hash: Some("1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef".to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -48,6 +49,7 @@ mod tests {
|
|||
created_at: Utc::now(),
|
||||
updated_at: Utc::now(),
|
||||
user_id,
|
||||
file_hash: Some("fedcba0987654321fedcba0987654321fedcba0987654321fedcba0987654321".to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -71,6 +73,7 @@ mod tests {
|
|||
created_at: Utc::now(),
|
||||
updated_at: Utc::now(),
|
||||
user_id,
|
||||
file_hash: Some("abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890".to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -938,6 +938,7 @@ mod tests {
|
|||
created_at: Utc::now(),
|
||||
updated_at: Utc::now(),
|
||||
user_id: user.id,
|
||||
file_hash: Some("0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef".to_string()),
|
||||
};
|
||||
|
||||
db.create_document(document).await.unwrap();
|
||||
|
|
|
|||
|
|
@ -0,0 +1,391 @@
|
|||
use anyhow::Result;
|
||||
use chrono::Utc;
|
||||
use std::sync::Arc;
|
||||
use uuid::Uuid;
|
||||
use sha2::{Sha256, Digest};
|
||||
|
||||
use readur::{
|
||||
AppState,
|
||||
db::Database,
|
||||
config::Config,
|
||||
models::{Document, CreateUser, UserRole},
|
||||
};
|
||||
|
||||
// Helper function to calculate file hash
|
||||
fn calculate_file_hash(data: &[u8]) -> String {
|
||||
let mut hasher = Sha256::new();
|
||||
hasher.update(data);
|
||||
let result = hasher.finalize();
|
||||
format!("{:x}", result)
|
||||
}
|
||||
|
||||
// Helper function to create test document
|
||||
fn create_test_document(user_id: Uuid, filename: &str, file_hash: String) -> Document {
|
||||
Document {
|
||||
id: Uuid::new_v4(),
|
||||
filename: filename.to_string(),
|
||||
original_filename: filename.to_string(),
|
||||
file_path: format!("/tmp/{}", filename),
|
||||
file_size: 1024,
|
||||
mime_type: "application/pdf".to_string(),
|
||||
content: None,
|
||||
ocr_text: None,
|
||||
ocr_confidence: None,
|
||||
ocr_word_count: None,
|
||||
ocr_processing_time_ms: None,
|
||||
ocr_status: Some("pending".to_string()),
|
||||
ocr_error: None,
|
||||
ocr_completed_at: None,
|
||||
tags: Vec::new(),
|
||||
created_at: Utc::now(),
|
||||
updated_at: Utc::now(),
|
||||
user_id,
|
||||
file_hash: Some(file_hash),
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to create test user with unique identifier
|
||||
fn create_test_user_with_suffix(suffix: &str) -> CreateUser {
|
||||
CreateUser {
|
||||
username: format!("testuser_{}", suffix),
|
||||
email: format!("test_{}@example.com", suffix),
|
||||
password: "test_password".to_string(),
|
||||
role: Some(UserRole::User),
|
||||
}
|
||||
}
|
||||
|
||||
async fn create_test_app_state() -> Result<Arc<AppState>> {
|
||||
let config = Config::from_env().unwrap_or_else(|_| {
|
||||
// Create a test config if env fails
|
||||
Config {
|
||||
database_url: "postgresql://readur:readur@localhost:5432/readur".to_string(),
|
||||
server_address: "127.0.0.1:8000".to_string(),
|
||||
jwt_secret: "test-secret".to_string(),
|
||||
upload_path: "./test-uploads".to_string(),
|
||||
watch_folder: "./test-watch".to_string(),
|
||||
allowed_file_types: vec!["pdf".to_string(), "txt".to_string()],
|
||||
watch_interval_seconds: Some(30),
|
||||
file_stability_check_ms: Some(500),
|
||||
max_file_age_hours: None,
|
||||
ocr_language: "eng".to_string(),
|
||||
concurrent_ocr_jobs: 2,
|
||||
ocr_timeout_seconds: 60,
|
||||
max_file_size_mb: 10,
|
||||
memory_limit_mb: 256,
|
||||
cpu_priority: "normal".to_string(),
|
||||
}
|
||||
});
|
||||
let db = Database::new("postgresql://readur:readur@localhost:5432/readur").await?;
|
||||
let queue_service = std::sync::Arc::new(
|
||||
readur::ocr_queue::OcrQueueService::new(db.clone(), db.get_pool().clone(), 1)
|
||||
);
|
||||
|
||||
Ok(Arc::new(AppState {
|
||||
db,
|
||||
config,
|
||||
webdav_scheduler: None,
|
||||
source_scheduler: None,
|
||||
queue_service,
|
||||
}))
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_document_upload_duplicate_detection_returns_existing() -> Result<()> {
|
||||
let state = create_test_app_state().await?;
|
||||
let user = create_test_user_with_suffix(&format!("upload_{}", uuid::Uuid::new_v4().simple()));
|
||||
|
||||
// Create user in database
|
||||
let created_user = state.db.create_user(user).await?;
|
||||
let user_id = created_user.id;
|
||||
|
||||
// Test content
|
||||
let test_content = b"This is test PDF content for upload duplicate detection";
|
||||
let file_hash = calculate_file_hash(test_content);
|
||||
|
||||
// Create existing document with same hash
|
||||
let existing_doc = create_test_document(user_id, "existing.pdf", file_hash.clone());
|
||||
let created_doc = state.db.create_document(existing_doc).await?;
|
||||
|
||||
// Test that the hash lookup would find the existing document
|
||||
let duplicate_check = state.db.get_document_by_user_and_hash(user_id, &file_hash).await?;
|
||||
assert!(duplicate_check.is_some(), "Should find existing document with same hash");
|
||||
|
||||
let found_doc = duplicate_check.unwrap();
|
||||
assert_eq!(found_doc.id, created_doc.id);
|
||||
assert_eq!(found_doc.file_hash, Some(file_hash));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_document_upload_unique_content_processed() -> Result<()> {
|
||||
let state = create_test_app_state().await?;
|
||||
let user = create_test_user_with_suffix(&format!("upload_{}", uuid::Uuid::new_v4().simple()));
|
||||
|
||||
// Create user in database
|
||||
let created_user = state.db.create_user(user).await?;
|
||||
let user_id = created_user.id;
|
||||
|
||||
// Test content
|
||||
let test_content = b"This is unique PDF content for upload processing";
|
||||
let file_hash = calculate_file_hash(test_content);
|
||||
|
||||
// Verify no existing document with this hash
|
||||
let duplicate_check = state.db.get_document_by_user_and_hash(user_id, &file_hash).await?;
|
||||
assert!(duplicate_check.is_none(), "Should not find any existing document with this hash");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_document_upload_different_users_same_content() -> Result<()> {
|
||||
let state = create_test_app_state().await?;
|
||||
|
||||
// Create two users
|
||||
let user1 = create_test_user_with_suffix(&format!("different_users_1_{}", Uuid::new_v4().simple()));
|
||||
let created_user1 = state.db.create_user(user1).await?;
|
||||
let user1_id = created_user1.id;
|
||||
|
||||
let user2 = create_test_user_with_suffix(&format!("different_users_2_{}", Uuid::new_v4().simple()));
|
||||
let created_user2 = state.db.create_user(user2).await?;
|
||||
let user2_id = created_user2.id;
|
||||
|
||||
// Test content
|
||||
let test_content = b"Shared content between different users for upload";
|
||||
let file_hash = calculate_file_hash(test_content);
|
||||
|
||||
// Create document for user1 with this hash
|
||||
let user1_doc = create_test_document(user1_id, "user1.pdf", file_hash.clone());
|
||||
state.db.create_document(user1_doc).await?;
|
||||
|
||||
// Check that user2 doesn't see user1's document as duplicate
|
||||
let duplicate_check = state.db.get_document_by_user_and_hash(user2_id, &file_hash).await?;
|
||||
assert!(duplicate_check.is_none(), "User2 should not see user1's document as duplicate");
|
||||
|
||||
// User2 should be able to create their own document with same hash
|
||||
let user2_doc = create_test_document(user2_id, "user2.pdf", file_hash.clone());
|
||||
let result = state.db.create_document(user2_doc).await;
|
||||
assert!(result.is_ok(), "User2 should be able to create document with same hash");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_document_upload_hash_calculation_accuracy() -> Result<()> {
|
||||
// Test various file contents and ensure hash calculation is accurate
|
||||
let test_cases = vec![
|
||||
(b"" as &[u8], "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"), // Empty
|
||||
(b"a", "ca978112ca1bbdcafac231b39a23dc4da786eff8147c4e72b9807785afee48bb"), // Single char
|
||||
(b"Hello, World!", "dffd6021bb2bd5b0af676290809ec3a53191dd81c7f70a4b28688a362182986f"), // Text
|
||||
];
|
||||
|
||||
for (content, expected_hash) in test_cases {
|
||||
let calculated_hash = calculate_file_hash(content);
|
||||
assert_eq!(calculated_hash, expected_hash, "Hash mismatch for content: {:?}", content);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_document_upload_large_file_hash() -> Result<()> {
|
||||
// Test hash calculation for larger files
|
||||
let large_content = vec![b'X'; 1_000_000]; // 1MB of 'X' characters
|
||||
|
||||
let hash1 = calculate_file_hash(&large_content);
|
||||
let hash2 = calculate_file_hash(&large_content);
|
||||
|
||||
// Hash should be consistent
|
||||
assert_eq!(hash1, hash2);
|
||||
assert_eq!(hash1.len(), 64); // SHA256 hex length
|
||||
assert!(hash1.chars().all(|c| c.is_ascii_hexdigit()));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_document_upload_binary_content_hash() -> Result<()> {
|
||||
// Test hash calculation for binary content
|
||||
let mut binary_content = Vec::new();
|
||||
for i in 0..256 {
|
||||
binary_content.push(i as u8);
|
||||
}
|
||||
|
||||
let hash = calculate_file_hash(&binary_content);
|
||||
|
||||
assert_eq!(hash.len(), 64);
|
||||
assert!(hash.chars().all(|c| c.is_ascii_hexdigit()));
|
||||
|
||||
// Same binary content should produce same hash
|
||||
let hash2 = calculate_file_hash(&binary_content);
|
||||
assert_eq!(hash, hash2);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_document_upload_duplicate_prevention_database_constraint() -> Result<()> {
|
||||
let state = create_test_app_state().await?;
|
||||
let user = create_test_user_with_suffix(&format!("upload_{}", uuid::Uuid::new_v4().simple()));
|
||||
// Create user in database and get the created user
|
||||
let created_user = state.db.create_user(user).await?;
|
||||
let user_id = created_user.id;
|
||||
|
||||
let test_hash = "duplicate_upload_test_hash_123456789012345678901234567890123456";
|
||||
|
||||
// Create first document with the hash
|
||||
let doc1 = create_test_document(user_id, "test1.pdf", test_hash.to_string());
|
||||
let result1 = state.db.create_document(doc1).await;
|
||||
assert!(result1.is_ok(), "First document should be created successfully");
|
||||
|
||||
// Try to create second document with same hash for same user
|
||||
let doc2 = create_test_document(user_id, "test2.pdf", test_hash.to_string());
|
||||
let result2 = state.db.create_document(doc2).await;
|
||||
|
||||
// This should fail due to unique constraint
|
||||
assert!(result2.is_err(), "Second document with same hash should fail");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_document_upload_filename_vs_content_duplicate() -> Result<()> {
|
||||
let state = create_test_app_state().await?;
|
||||
let user = create_test_user_with_suffix(&format!("upload_{}", uuid::Uuid::new_v4().simple()));
|
||||
// Create user in database and get the created user
|
||||
let created_user = state.db.create_user(user).await?;
|
||||
let user_id = created_user.id;
|
||||
|
||||
// Same content, different filenames
|
||||
let content = b"Same content, different names";
|
||||
let hash = calculate_file_hash(content);
|
||||
|
||||
// Create first document
|
||||
let doc1 = create_test_document(user_id, "document_v1.pdf", hash.clone());
|
||||
state.db.create_document(doc1).await?;
|
||||
|
||||
// Check that same content is detected as duplicate regardless of filename
|
||||
let duplicate_check = state.db.get_document_by_user_and_hash(user_id, &hash).await?;
|
||||
assert!(duplicate_check.is_some(), "Same content should be detected as duplicate regardless of filename");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_document_upload_unicode_content_hash() -> Result<()> {
|
||||
// Test hash calculation with unicode content
|
||||
let unicode_content = "Hello 世界 🌍 café naïve résumé".as_bytes();
|
||||
|
||||
let hash1 = calculate_file_hash(unicode_content);
|
||||
let hash2 = calculate_file_hash(unicode_content);
|
||||
|
||||
// Hash should be consistent for unicode content
|
||||
assert_eq!(hash1, hash2);
|
||||
assert_eq!(hash1.len(), 64);
|
||||
assert!(hash1.chars().all(|c| c.is_ascii_hexdigit()));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_document_upload_concurrent_same_content() -> Result<()> {
|
||||
let state = create_test_app_state().await?;
|
||||
let user = create_test_user_with_suffix(&format!("upload_{}", uuid::Uuid::new_v4().simple()));
|
||||
// Create user in database and get the created user
|
||||
let created_user = state.db.create_user(user).await?;
|
||||
let user_id = created_user.id;
|
||||
|
||||
let test_content = b"Concurrent upload test content";
|
||||
let file_hash = calculate_file_hash(test_content);
|
||||
|
||||
// Simulate concurrent uploads of same content
|
||||
let mut handles = Vec::new();
|
||||
|
||||
for i in 0..5 {
|
||||
let state_clone = state.clone();
|
||||
let hash_clone = file_hash.clone();
|
||||
|
||||
let handle = tokio::spawn(async move {
|
||||
let doc = create_test_document(user_id, &format!("concurrent{}.pdf", i), hash_clone);
|
||||
state_clone.db.create_document(doc).await
|
||||
});
|
||||
|
||||
handles.push(handle);
|
||||
}
|
||||
|
||||
// Wait for all operations and count results
|
||||
let mut success_count = 0;
|
||||
let mut error_count = 0;
|
||||
|
||||
for handle in handles {
|
||||
match handle.await? {
|
||||
Ok(_) => success_count += 1,
|
||||
Err(_) => error_count += 1,
|
||||
}
|
||||
}
|
||||
|
||||
// Only one should succeed due to unique constraint
|
||||
assert_eq!(success_count, 1, "Only one document should be created successfully");
|
||||
assert_eq!(error_count, 4, "Four operations should fail due to duplicate hash");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_document_upload_mime_type_independence() -> Result<()> {
|
||||
let state = create_test_app_state().await?;
|
||||
let user = create_test_user_with_suffix(&format!("upload_{}", uuid::Uuid::new_v4().simple()));
|
||||
// Create user in database and get the created user
|
||||
let created_user = state.db.create_user(user).await?;
|
||||
let user_id = created_user.id;
|
||||
|
||||
let content = b"Same content, different perceived types";
|
||||
let hash = calculate_file_hash(content);
|
||||
|
||||
// Create document as PDF
|
||||
let mut pdf_doc = create_test_document(user_id, "test.pdf", hash.clone());
|
||||
pdf_doc.mime_type = "application/pdf".to_string();
|
||||
state.db.create_document(pdf_doc).await?;
|
||||
|
||||
// Try to upload same content as text file - should be detected as duplicate
|
||||
let duplicate_check = state.db.get_document_by_user_and_hash(user_id, &hash).await?;
|
||||
assert!(duplicate_check.is_some(), "Same content should be detected as duplicate regardless of MIME type");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_document_upload_performance_hash_lookup() -> Result<()> {
|
||||
let state = create_test_app_state().await?;
|
||||
let user = create_test_user_with_suffix(&format!("upload_{}", uuid::Uuid::new_v4().simple()));
|
||||
// Create user in database and get the created user
|
||||
let created_user = state.db.create_user(user).await?;
|
||||
let user_id = created_user.id;
|
||||
|
||||
// Create multiple documents with different hashes
|
||||
let mut test_hashes = Vec::new();
|
||||
|
||||
for i in 0..50 {
|
||||
let content = format!("Performance test content {}", i);
|
||||
let hash = calculate_file_hash(content.as_bytes());
|
||||
test_hashes.push(hash.clone());
|
||||
|
||||
let doc = create_test_document(user_id, &format!("perf_test_{}.pdf", i), hash);
|
||||
state.db.create_document(doc).await?;
|
||||
}
|
||||
|
||||
// Measure hash lookup performance
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
for hash in &test_hashes {
|
||||
let result = state.db.get_document_by_user_and_hash(user_id, hash).await?;
|
||||
assert!(result.is_some(), "Should find document with hash: {}", hash);
|
||||
}
|
||||
|
||||
let duration = start.elapsed();
|
||||
|
||||
// Hash lookups should be very fast
|
||||
assert!(duration.as_millis() < 2000, "Hash lookups should be fast even with many documents: {:?}", duration);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
@ -0,0 +1,276 @@
|
|||
use anyhow::Result;
|
||||
use chrono::Utc;
|
||||
use uuid::Uuid;
|
||||
use sha2::{Sha256, Digest};
|
||||
use tempfile::TempDir;
|
||||
|
||||
use readur::{
|
||||
db::Database,
|
||||
file_service::FileService,
|
||||
models::{Document, CreateUser, UserRole},
|
||||
};
|
||||
|
||||
const TEST_DB_URL: &str = "postgresql://readur:readur@localhost:5432/readur";
|
||||
|
||||
// Helper function to create a test user with unique identifier
|
||||
async fn create_test_user(db: &Database, username: &str) -> Result<Uuid> {
|
||||
let unique_suffix = Uuid::new_v4().simple();
|
||||
let user = CreateUser {
|
||||
username: format!("{}_{}", username, unique_suffix),
|
||||
email: format!("{}_{}@example.com", username, unique_suffix),
|
||||
password: "password123".to_string(),
|
||||
role: Some(UserRole::User),
|
||||
};
|
||||
let created_user = db.create_user(user).await?;
|
||||
Ok(created_user.id)
|
||||
}
|
||||
|
||||
// Helper function to calculate file hash
|
||||
fn calculate_file_hash(data: &[u8]) -> String {
|
||||
let mut hasher = Sha256::new();
|
||||
hasher.update(data);
|
||||
let result = hasher.finalize();
|
||||
format!("{:x}", result)
|
||||
}
|
||||
|
||||
// Helper function to create a test document
|
||||
fn create_test_document(user_id: Uuid, filename: &str, file_hash: Option<String>) -> Document {
|
||||
Document {
|
||||
id: Uuid::new_v4(),
|
||||
filename: filename.to_string(),
|
||||
original_filename: filename.to_string(),
|
||||
file_path: format!("/tmp/{}", filename),
|
||||
file_size: 1024,
|
||||
mime_type: "application/pdf".to_string(),
|
||||
content: None,
|
||||
ocr_text: None,
|
||||
ocr_confidence: None,
|
||||
ocr_word_count: None,
|
||||
ocr_processing_time_ms: None,
|
||||
ocr_status: Some("pending".to_string()),
|
||||
ocr_error: None,
|
||||
ocr_completed_at: None,
|
||||
tags: Vec::new(),
|
||||
created_at: Utc::now(),
|
||||
updated_at: Utc::now(),
|
||||
user_id,
|
||||
file_hash,
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_get_document_by_user_and_hash_found() -> Result<()> {
|
||||
let db = Database::new(TEST_DB_URL).await?;
|
||||
let user_id = create_test_user(&db, "testuser1").await?;
|
||||
let file_hash = "abcd1234567890";
|
||||
|
||||
// Create a document with the hash
|
||||
let document = create_test_document(user_id, "test.pdf", Some(file_hash.to_string()));
|
||||
let created_doc = db.create_document(document).await?;
|
||||
|
||||
// Test finding the document by hash
|
||||
let found_doc = db.get_document_by_user_and_hash(user_id, file_hash).await?;
|
||||
|
||||
assert!(found_doc.is_some());
|
||||
let found_doc = found_doc.unwrap();
|
||||
assert_eq!(found_doc.id, created_doc.id);
|
||||
assert_eq!(found_doc.file_hash, Some(file_hash.to_string()));
|
||||
assert_eq!(found_doc.user_id, user_id);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_get_document_by_user_and_hash_not_found() -> Result<()> {
|
||||
let db = Database::new(TEST_DB_URL).await?;
|
||||
let user_id = Uuid::new_v4();
|
||||
let non_existent_hash = "nonexistent1234567890";
|
||||
|
||||
// Test finding a non-existent hash
|
||||
let found_doc = db.get_document_by_user_and_hash(user_id, non_existent_hash).await?;
|
||||
|
||||
assert!(found_doc.is_none());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_get_document_by_user_and_hash_different_user() -> Result<()> {
|
||||
let db = Database::new(TEST_DB_URL).await?;
|
||||
let user1_id = create_test_user(&db, "testuser2").await?;
|
||||
let user2_id = create_test_user(&db, "testuser3").await?;
|
||||
let file_hash = "shared_hash_1234567890";
|
||||
|
||||
// Create a document for user1 with the hash
|
||||
let document = create_test_document(user1_id, "test.pdf", Some(file_hash.to_string()));
|
||||
db.create_document(document).await?;
|
||||
|
||||
// Test that user2 cannot find user1's document by hash
|
||||
let found_doc = db.get_document_by_user_and_hash(user2_id, file_hash).await?;
|
||||
|
||||
assert!(found_doc.is_none(), "User should not be able to access another user's documents");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_duplicate_hash_prevention_same_user() -> Result<()> {
|
||||
let db = Database::new(TEST_DB_URL).await?;
|
||||
let user_id = create_test_user(&db, "testuser4").await?;
|
||||
let file_hash = "duplicate_hash_1234567890";
|
||||
|
||||
// Create first document with the hash
|
||||
let document1 = create_test_document(user_id, "test1.pdf", Some(file_hash.to_string()));
|
||||
let result1 = db.create_document(document1).await;
|
||||
assert!(result1.is_ok(), "First document with hash should be created successfully");
|
||||
|
||||
// Try to create second document with same hash for same user
|
||||
let document2 = create_test_document(user_id, "test2.pdf", Some(file_hash.to_string()));
|
||||
let result2 = db.create_document(document2).await;
|
||||
|
||||
// This should fail due to unique constraint
|
||||
assert!(result2.is_err(), "Second document with same hash for same user should fail");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_same_hash_different_users_allowed() -> Result<()> {
|
||||
let db = Database::new(TEST_DB_URL).await?;
|
||||
let user1_id = create_test_user(&db, "testuser5").await?;
|
||||
let user2_id = create_test_user(&db, "testuser6").await?;
|
||||
let file_hash = "shared_content_hash_1234567890";
|
||||
|
||||
// Create document for user1 with the hash
|
||||
let document1 = create_test_document(user1_id, "test1.pdf", Some(file_hash.to_string()));
|
||||
let result1 = db.create_document(document1).await;
|
||||
assert!(result1.is_ok(), "First user's document should be created successfully");
|
||||
|
||||
// Create document for user2 with same hash
|
||||
let document2 = create_test_document(user2_id, "test2.pdf", Some(file_hash.to_string()));
|
||||
let result2 = db.create_document(document2).await;
|
||||
assert!(result2.is_ok(), "Second user's document with same hash should be allowed");
|
||||
|
||||
// Verify both users can find their respective documents
|
||||
let found_doc1 = db.get_document_by_user_and_hash(user1_id, file_hash).await?;
|
||||
let found_doc2 = db.get_document_by_user_and_hash(user2_id, file_hash).await?;
|
||||
|
||||
assert!(found_doc1.is_some());
|
||||
assert!(found_doc2.is_some());
|
||||
assert_ne!(found_doc1.unwrap().id, found_doc2.unwrap().id);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_null_hash_allowed_multiple() -> Result<()> {
|
||||
let db = Database::new(TEST_DB_URL).await?;
|
||||
let user_id = create_test_user(&db, "testuser7").await?;
|
||||
|
||||
// Create multiple documents with null hash (should be allowed)
|
||||
let document1 = create_test_document(user_id, "test1.pdf", None);
|
||||
let result1 = db.create_document(document1).await;
|
||||
assert!(result1.is_ok(), "First document with null hash should be created");
|
||||
|
||||
let document2 = create_test_document(user_id, "test2.pdf", None);
|
||||
let result2 = db.create_document(document2).await;
|
||||
assert!(result2.is_ok(), "Second document with null hash should be created");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_calculate_file_hash_consistency() {
|
||||
let test_data = b"Hello, World! This is test content for hash calculation.";
|
||||
|
||||
// Calculate hash multiple times
|
||||
let hash1 = calculate_file_hash(test_data);
|
||||
let hash2 = calculate_file_hash(test_data);
|
||||
let hash3 = calculate_file_hash(test_data);
|
||||
|
||||
// All hashes should be identical
|
||||
assert_eq!(hash1, hash2);
|
||||
assert_eq!(hash2, hash3);
|
||||
|
||||
// Hash should be 64 characters (SHA256 hex)
|
||||
assert_eq!(hash1.len(), 64);
|
||||
|
||||
// Should be valid hex
|
||||
assert!(hash1.chars().all(|c| c.is_ascii_hexdigit()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_calculate_file_hash_different_content() {
|
||||
let data1 = b"Content 1";
|
||||
let data2 = b"Content 2";
|
||||
let data3 = b"content 1"; // Different case
|
||||
|
||||
let hash1 = calculate_file_hash(data1);
|
||||
let hash2 = calculate_file_hash(data2);
|
||||
let hash3 = calculate_file_hash(data3);
|
||||
|
||||
// All hashes should be different
|
||||
assert_ne!(hash1, hash2);
|
||||
assert_ne!(hash1, hash3);
|
||||
assert_ne!(hash2, hash3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_calculate_file_hash_empty_content() {
|
||||
let empty_data = b"";
|
||||
let hash = calculate_file_hash(empty_data);
|
||||
|
||||
// Should produce a valid hash even for empty content
|
||||
assert_eq!(hash.len(), 64);
|
||||
assert!(hash.chars().all(|c| c.is_ascii_hexdigit()));
|
||||
|
||||
// Known SHA256 hash of empty string
|
||||
assert_eq!(hash, "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_file_service_create_document_with_hash() {
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let upload_path = temp_dir.path().to_string_lossy().to_string();
|
||||
let file_service = FileService::new(upload_path);
|
||||
let user_id = Uuid::new_v4();
|
||||
let test_hash = "test_hash_1234567890";
|
||||
|
||||
let document = file_service.create_document(
|
||||
"test.pdf",
|
||||
"original.pdf",
|
||||
"/path/to/file.pdf",
|
||||
1024,
|
||||
"application/pdf",
|
||||
user_id,
|
||||
Some(test_hash.to_string()),
|
||||
);
|
||||
|
||||
assert_eq!(document.filename, "test.pdf");
|
||||
assert_eq!(document.original_filename, "original.pdf");
|
||||
assert_eq!(document.file_hash, Some(test_hash.to_string()));
|
||||
assert_eq!(document.user_id, user_id);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_file_service_create_document_without_hash() {
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let upload_path = temp_dir.path().to_string_lossy().to_string();
|
||||
let file_service = FileService::new(upload_path);
|
||||
let user_id = Uuid::new_v4();
|
||||
|
||||
let document = file_service.create_document(
|
||||
"test.pdf",
|
||||
"original.pdf",
|
||||
"/path/to/file.pdf",
|
||||
1024,
|
||||
"application/pdf",
|
||||
user_id,
|
||||
None,
|
||||
);
|
||||
|
||||
assert_eq!(document.filename, "test.pdf");
|
||||
assert_eq!(document.original_filename, "original.pdf");
|
||||
assert_eq!(document.file_hash, None);
|
||||
assert_eq!(document.user_id, user_id);
|
||||
}
|
||||
|
|
@ -0,0 +1,440 @@
|
|||
use anyhow::Result;
|
||||
use chrono::Utc;
|
||||
use std::sync::Arc;
|
||||
use uuid::Uuid;
|
||||
use sha2::{Sha256, Digest};
|
||||
|
||||
use readur::{
|
||||
AppState,
|
||||
db::Database,
|
||||
config::Config,
|
||||
models::{FileInfo, Document, Source, SourceType, SourceStatus},
|
||||
};
|
||||
|
||||
// Helper function to calculate file hash
|
||||
fn calculate_file_hash(data: &[u8]) -> String {
|
||||
let mut hasher = Sha256::new();
|
||||
hasher.update(data);
|
||||
let result = hasher.finalize();
|
||||
format!("{:x}", result)
|
||||
}
|
||||
|
||||
// Helper function to create test file info
|
||||
fn create_test_file_info(name: &str, path: &str, content: &[u8]) -> FileInfo {
|
||||
FileInfo {
|
||||
name: name.to_string(),
|
||||
path: path.to_string(),
|
||||
size: content.len() as i64,
|
||||
last_modified: Some(Utc::now()),
|
||||
etag: "test-etag".to_string(),
|
||||
mime_type: "application/pdf".to_string(),
|
||||
is_directory: false,
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to create test document
|
||||
fn create_test_document(user_id: Uuid, filename: &str, file_hash: String) -> Document {
|
||||
Document {
|
||||
id: Uuid::new_v4(),
|
||||
filename: filename.to_string(),
|
||||
original_filename: filename.to_string(),
|
||||
file_path: format!("/tmp/{}", filename),
|
||||
file_size: 1024,
|
||||
mime_type: "application/pdf".to_string(),
|
||||
content: None,
|
||||
ocr_text: None,
|
||||
ocr_confidence: None,
|
||||
ocr_word_count: None,
|
||||
ocr_processing_time_ms: None,
|
||||
ocr_status: Some("pending".to_string()),
|
||||
ocr_error: None,
|
||||
ocr_completed_at: None,
|
||||
tags: Vec::new(),
|
||||
created_at: Utc::now(),
|
||||
updated_at: Utc::now(),
|
||||
user_id,
|
||||
file_hash: Some(file_hash),
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to create test source
|
||||
fn create_test_source(user_id: Uuid, source_type: SourceType) -> Source {
|
||||
Source {
|
||||
id: Uuid::new_v4(),
|
||||
user_id,
|
||||
name: "Test Source".to_string(),
|
||||
source_type,
|
||||
config: serde_json::json!({}),
|
||||
status: SourceStatus::Idle,
|
||||
enabled: true,
|
||||
last_sync_at: None,
|
||||
last_error: None,
|
||||
last_error_at: None,
|
||||
total_files_synced: 0,
|
||||
total_files_pending: 0,
|
||||
total_size_bytes: 0,
|
||||
created_at: Utc::now(),
|
||||
updated_at: Utc::now(),
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to create a test user with unique identifier
|
||||
async fn create_test_user(db: &Database, username: &str) -> Result<Uuid> {
|
||||
use readur::models::{CreateUser, UserRole};
|
||||
let unique_suffix = Uuid::new_v4().simple();
|
||||
let user = CreateUser {
|
||||
username: format!("{}_{}", username, unique_suffix),
|
||||
email: format!("{}_{}@example.com", username, unique_suffix),
|
||||
password: "password123".to_string(),
|
||||
role: Some(UserRole::User),
|
||||
};
|
||||
let created_user = db.create_user(user).await?;
|
||||
Ok(created_user.id)
|
||||
}
|
||||
|
||||
async fn create_test_app_state() -> Result<Arc<AppState>> {
    let config = Config::from_env().unwrap_or_else(|_| {
        Config {
            database_url: "postgresql://readur:readur@localhost:5432/readur".to_string(),
            server_address: "127.0.0.1:8000".to_string(),
            jwt_secret: "test-secret".to_string(),
            upload_path: "./test-uploads".to_string(),
            watch_folder: "./test-watch".to_string(),
            allowed_file_types: vec!["pdf".to_string(), "txt".to_string()],
            watch_interval_seconds: Some(30),
            file_stability_check_ms: Some(500),
            max_file_age_hours: None,
            ocr_language: "eng".to_string(),
            concurrent_ocr_jobs: 2,
            ocr_timeout_seconds: 60,
            max_file_size_mb: 10,
            memory_limit_mb: 256,
            cpu_priority: "normal".to_string(),
        }
    });
    let db = Database::new("postgresql://readur:readur@localhost:5432/readur").await?;
    let queue_service = std::sync::Arc::new(
        readur::ocr_queue::OcrQueueService::new(db.clone(), db.get_pool().clone(), 1)
    );

    Ok(Arc::new(AppState {
        db,
        config,
        webdav_scheduler: None,
        source_scheduler: None,
        queue_service,
    }))
}

#[tokio::test]
async fn test_source_sync_duplicate_detection_skips_duplicate() -> Result<()> {
    let state = create_test_app_state().await?;
    let user_id = create_test_user(&state.db, "source_sync_test").await?;

    // Test content
    let test_content = b"This is test content for source sync duplicate detection";
    let file_hash = calculate_file_hash(test_content);

    // Create existing document with same hash
    let existing_doc = create_test_document(user_id, "existing.pdf", file_hash.clone());
    state.db.create_document(existing_doc).await?;

    // Check if duplicate exists using the efficient method
    let duplicate_check = state.db.get_document_by_user_and_hash(user_id, &file_hash).await?;

    assert!(duplicate_check.is_some(), "Should find existing document with same hash");

    let found_doc = duplicate_check.unwrap();
    assert_eq!(found_doc.file_hash, Some(file_hash));
    assert_eq!(found_doc.user_id, user_id);

    Ok(())
}

#[tokio::test]
async fn test_source_sync_duplicate_detection_processes_unique() -> Result<()> {
    let state = create_test_app_state().await?;
    let user_id = create_test_user(&state.db, "source_sync_test").await?;

    // Test content
    let test_content = b"This is unique content that should be processed by source sync";
    let file_hash = calculate_file_hash(test_content);

    // Verify no existing document with this hash
    let duplicate_check = state.db.get_document_by_user_and_hash(user_id, &file_hash).await?;
    assert!(duplicate_check.is_none(), "Should not find any existing document with this hash");

    // This indicates the file would be processed normally
    Ok(())
}

#[tokio::test]
async fn test_source_sync_duplicate_different_users() -> Result<()> {
    let state = create_test_app_state().await?;
    let user1_id = create_test_user(&state.db, "source_sync_user1").await?;
    let user2_id = create_test_user(&state.db, "source_sync_user2").await?;

    // Test content
    let test_content = b"Shared content between different users in source sync";
    let file_hash = calculate_file_hash(test_content);

    // Create document for user1 with this hash
    let user1_doc = create_test_document(user1_id, "user1.pdf", file_hash.clone());
    state.db.create_document(user1_doc).await?;

    // Check that user2 doesn't see user1's document as duplicate
    let duplicate_check = state.db.get_document_by_user_and_hash(user2_id, &file_hash).await?;
    assert!(duplicate_check.is_none(), "User2 should not see user1's document as duplicate");

    // User2 should be able to create their own document with same hash
    let user2_doc = create_test_document(user2_id, "user2.pdf", file_hash.clone());
    let result = state.db.create_document(user2_doc).await;
    assert!(result.is_ok(), "User2 should be able to create document with same hash");

    Ok(())
}

#[tokio::test]
async fn test_source_sync_hash_calculation_consistency() -> Result<()> {
    let test_content = b"Test content for hash consistency in source sync";

    // Calculate hash multiple times
    let hash1 = calculate_file_hash(test_content);
    let hash2 = calculate_file_hash(test_content);
    let hash3 = calculate_file_hash(test_content);

    // All hashes should be identical
    assert_eq!(hash1, hash2);
    assert_eq!(hash2, hash3);

    // Hash should be 64 characters (SHA256 hex)
    assert_eq!(hash1.len(), 64);

    // Should be valid hex
    assert!(hash1.chars().all(|c| c.is_ascii_hexdigit()));

    Ok(())
}
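
// Note: the duplicate-detection behaviour in this file leans on two properties: SHA-256 is
// deterministic (the same bytes always yield the same 64-character hex digest, as checked
// above), and lookups by user and hash stay cheap (exercised by the performance test below).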
#[tokio::test]
async fn test_source_sync_duplicate_detection_performance() -> Result<()> {
    let state = create_test_app_state().await?;
    let user_id = create_test_user(&state.db, "source_sync_test").await?;

    // Create multiple documents with different hashes
    let mut created_hashes = Vec::new();

    for i in 0..10 {
        let content = format!("Test content number {}", i);
        let hash = calculate_file_hash(content.as_bytes());
        created_hashes.push(hash.clone());

        let doc = create_test_document(user_id, &format!("test{}.pdf", i), hash);
        state.db.create_document(doc).await?;
    }

    // Test lookup performance - should be fast even with multiple documents
    let start = std::time::Instant::now();

    for hash in &created_hashes {
        let result = state.db.get_document_by_user_and_hash(user_id, hash).await?;
        assert!(result.is_some(), "Should find document with hash: {}", hash);
    }

    let duration = start.elapsed();
    assert!(duration.as_millis() < 1000, "Hash lookups should be fast: {:?}", duration);

    Ok(())
}

#[tokio::test]
async fn test_source_sync_file_modification_detection() -> Result<()> {
    let state = create_test_app_state().await?;
    let user_id = create_test_user(&state.db, "source_sync_test").await?;

    // Original content
    let original_content = b"Original file content";
    let original_hash = calculate_file_hash(original_content);

    // Modified content (same file, different content)
    let modified_content = b"Modified file content";
    let modified_hash = calculate_file_hash(modified_content);

    // Create document with original content
    let original_doc = create_test_document(user_id, "test.pdf", original_hash.clone());
    state.db.create_document(original_doc).await?;

    // Check original content is found
    let original_check = state.db.get_document_by_user_and_hash(user_id, &original_hash).await?;
    assert!(original_check.is_some(), "Should find document with original hash");

    // Check modified content is not found (different hash)
    let modified_check = state.db.get_document_by_user_and_hash(user_id, &modified_hash).await?;
    assert!(modified_check.is_none(), "Should not find document with modified hash");

    // Verify hashes are actually different
    assert_ne!(original_hash, modified_hash, "Original and modified content should have different hashes");

    Ok(())
}

#[tokio::test]
async fn test_source_sync_edge_case_empty_files() -> Result<()> {
    let state = create_test_app_state().await?;
    let user_id = create_test_user(&state.db, "source_sync_test").await?;

    // Empty file content
    let empty_content = b"";
    let empty_hash = calculate_file_hash(empty_content);

    // Create document with empty content
    let empty_doc = create_test_document(user_id, "empty.pdf", empty_hash.clone());
    state.db.create_document(empty_doc).await?;

    // Check empty file is found
    let empty_check = state.db.get_document_by_user_and_hash(user_id, &empty_hash).await?;
    assert!(empty_check.is_some(), "Should find document with empty content hash");

    // Verify empty hash is the known SHA256 empty string hash
    assert_eq!(empty_hash, "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855");

    Ok(())
}

#[tokio::test]
async fn test_source_sync_large_file_hash_consistency() -> Result<()> {
    // Simulate large file content
    let large_content = vec![b'A'; 10_000_000]; // 10MB of 'A' characters

    // Calculate hash
    let hash = calculate_file_hash(&large_content);

    // Hash should still be 64 characters
    assert_eq!(hash.len(), 64);
    assert!(hash.chars().all(|c| c.is_ascii_hexdigit()));

    // Calculate same hash again to ensure consistency
    let hash2 = calculate_file_hash(&large_content);
    assert_eq!(hash, hash2);

    Ok(())
}
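
// Because SHA-256 digests can also be computed incrementally, a sync implementation could
// hash a large download chunk by chunk instead of buffering it whole; this test only checks
// that whole-buffer hashing stays deterministic at the 10MB size.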
#[tokio::test]
async fn test_source_sync_binary_file_handling() -> Result<()> {
    let state = create_test_app_state().await?;
    let user_id = create_test_user(&state.db, "source_sync_test").await?;

    // Binary content (PDF header + some binary data)
    let mut binary_content = b"%PDF-1.4\n".to_vec();
    binary_content.extend_from_slice(&[0u8, 1u8, 2u8, 3u8, 255u8, 254u8, 253u8]);

    let binary_hash = calculate_file_hash(&binary_content);

    // Create document with binary content
    let binary_doc = create_test_document(user_id, "binary.pdf", binary_hash.clone());
    state.db.create_document(binary_doc).await?;

    // Check binary file is found
    let binary_check = state.db.get_document_by_user_and_hash(user_id, &binary_hash).await?;
    assert!(binary_check.is_some(), "Should find document with binary content hash");

    Ok(())
}

#[tokio::test]
async fn test_source_sync_unicode_filename_handling() -> Result<()> {
    let state = create_test_app_state().await?;
    let user_id = create_test_user(&state.db, "source_sync_test").await?;

    // Unicode content and filename
    let unicode_content = "Test content with unicode: 测试内容 🚀 café".as_bytes();
    let unicode_hash = calculate_file_hash(unicode_content);

    // Create document with unicode filename
    let unicode_doc = create_test_document(user_id, "测试文档🚀.pdf", unicode_hash.clone());
    state.db.create_document(unicode_doc).await?;

    // Check unicode file is found
    let unicode_check = state.db.get_document_by_user_and_hash(user_id, &unicode_hash).await?;
    assert!(unicode_check.is_some(), "Should find document with unicode content hash");

    let found_doc = unicode_check.unwrap();
    assert_eq!(found_doc.filename, "测试文档🚀.pdf");

    Ok(())
}

#[tokio::test]
async fn test_source_sync_concurrent_hash_operations() -> Result<()> {
    let state = create_test_app_state().await?;
    let user_id = create_test_user(&state.db, "source_sync_test").await?;

    // Create multiple concurrent hash lookup operations
    let mut handles = Vec::new();

    for i in 0..20 {
        let state_clone = state.clone();
        let hash = format!("{}test_hash_concurrent_{}", "a".repeat(40), i);

        let handle = tokio::spawn(async move {
            state_clone.db.get_document_by_user_and_hash(user_id, &hash).await
        });

        handles.push(handle);
    }

    // Wait for all concurrent operations
    let mut results = Vec::new();
    for handle in handles {
        let result = handle.await??;
        results.push(result);
    }

    // All should return None (no documents exist with these hashes)
    for (i, result) in results.iter().enumerate() {
        assert!(result.is_none(), "Concurrent operation {} should return None", i);
    }

    Ok(())
}
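
// Note: `create_test_app_state` returns an `Arc<AppState>`, so the `state.clone()` calls in
// the concurrent tests are cheap reference-count increments rather than deep copies.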
#[tokio::test]
async fn test_source_sync_duplicate_prevention_race_condition() -> Result<()> {
    let state = create_test_app_state().await?;
    let user_id = create_test_user(&state.db, "source_sync_test").await?;

    let test_hash = "race_condition_test_hash_123456789012345678901234567890123456";

    // Try to create multiple documents with same hash concurrently
    let mut handles = Vec::new();

    for i in 0..5 {
        let state_clone = state.clone();
        let hash_clone = test_hash.to_string();

        let handle = tokio::spawn(async move {
            let doc = create_test_document(user_id, &format!("test{}.pdf", i), hash_clone);
            state_clone.db.create_document(doc).await
        });

        handles.push(handle);
    }

    // Wait for all operations and count successes
    let mut success_count = 0;
    let mut error_count = 0;

    for handle in handles {
        match handle.await? {
            Ok(_) => success_count += 1,
            Err(_) => error_count += 1,
        }
    }

    // Only one should succeed due to unique constraint
    assert_eq!(success_count, 1, "Only one document should be created successfully");
    assert_eq!(error_count, 4, "Four operations should fail due to duplicate hash");

    Ok(())
}
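
// The single-success outcome above depends on a database-level uniqueness guarantee for a
// user's file_hash (e.g. a unique index over (user_id, file_hash)); the exact constraint
// lives in the schema migrations rather than in this test file.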
@@ -0,0 +1,389 @@
use anyhow::Result;
use chrono::Utc;
use std::sync::Arc;
use uuid::Uuid;
use sha2::{Sha256, Digest};

use readur::{
    AppState,
    db::Database,
    config::Config,
    models::{FileInfo, CreateWebDAVFile, Document},
};

// Helper function to calculate file hash
fn calculate_file_hash(data: &[u8]) -> String {
    let mut hasher = Sha256::new();
    hasher.update(data);
    let result = hasher.finalize();
    format!("{:x}", result)
}
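
// A minimal sketch (not used by these tests) showing that the same digest can be produced
// incrementally: `Sha256` accepts repeated `update` calls, so large downloads could be
// hashed chunk by chunk instead of being buffered whole. The chunk size is an arbitrary
// illustrative choice.
#[allow(dead_code)]
fn calculate_file_hash_chunked(data: &[u8]) -> String {
    let mut hasher = Sha256::new();
    // Feeding the input in 8 KiB chunks yields the same digest as hashing it all at once.
    for chunk in data.chunks(8192) {
        hasher.update(chunk);
    }
    format!("{:x}", hasher.finalize())
}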
// Helper function to create test file info
fn create_test_file_info(name: &str, path: &str, size: i64) -> FileInfo {
    FileInfo {
        name: name.to_string(),
        path: path.to_string(),
        size,
        last_modified: Some(Utc::now()),
        etag: "test-etag".to_string(),
        mime_type: "application/pdf".to_string(),
        is_directory: false,
    }
}

// Helper function to create test document
fn create_test_document(user_id: Uuid, filename: &str, file_hash: String) -> Document {
    Document {
        id: Uuid::new_v4(),
        filename: filename.to_string(),
        original_filename: filename.to_string(),
        file_path: format!("/tmp/{}", filename),
        file_size: 1024,
        mime_type: "application/pdf".to_string(),
        content: None,
        ocr_text: None,
        ocr_confidence: None,
        ocr_word_count: None,
        ocr_processing_time_ms: None,
        ocr_status: Some("pending".to_string()),
        ocr_error: None,
        ocr_completed_at: None,
        tags: Vec::new(),
        created_at: Utc::now(),
        updated_at: Utc::now(),
        user_id,
        file_hash: Some(file_hash),
    }
}

// Mock WebDAV service for testing
#[derive(Clone)]
struct MockWebDAVService {
    pub test_files: std::collections::HashMap<String, Vec<u8>>,
}

impl MockWebDAVService {
    fn new() -> Self {
        Self {
            test_files: std::collections::HashMap::new(),
        }
    }

    fn add_test_file(&mut self, path: &str, content: Vec<u8>) {
        self.test_files.insert(path.to_string(), content);
    }

    async fn download_file(&self, path: &str) -> Result<Vec<u8>> {
        self.test_files
            .get(path)
            .cloned()
            .ok_or_else(|| anyhow::anyhow!("File not found: {}", path))
    }
}
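
// Typical usage of the mock in a test (illustrative only):
//     let mut webdav = MockWebDAVService::new();
//     webdav.add_test_file("/docs/report.pdf", b"fake pdf bytes".to_vec());
//     let bytes = webdav.download_file("/docs/report.pdf").await?;
//     assert_eq!(calculate_file_hash(&bytes), calculate_file_hash(b"fake pdf bytes"));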
// Helper function to create a test user with unique identifier
async fn create_test_user(db: &Database, username: &str) -> Result<Uuid> {
    use readur::models::{CreateUser, UserRole};
    let unique_suffix = Uuid::new_v4().simple();
    let user = CreateUser {
        username: format!("{}_{}", username, unique_suffix),
        email: format!("{}_{}@example.com", username, unique_suffix),
        password: "password123".to_string(),
        role: Some(UserRole::User),
    };
    let created_user = db.create_user(user).await?;
    Ok(created_user.id)
}

async fn create_test_app_state() -> Result<Arc<AppState>> {
    let config = Config::from_env().unwrap_or_else(|_| {
        Config {
            database_url: "postgresql://readur:readur@localhost:5432/readur".to_string(),
            server_address: "127.0.0.1:8000".to_string(),
            jwt_secret: "test-secret".to_string(),
            upload_path: "./test-uploads".to_string(),
            watch_folder: "./test-watch".to_string(),
            allowed_file_types: vec!["pdf".to_string(), "txt".to_string()],
            watch_interval_seconds: Some(30),
            file_stability_check_ms: Some(500),
            max_file_age_hours: None,
            ocr_language: "eng".to_string(),
            concurrent_ocr_jobs: 2,
            ocr_timeout_seconds: 60,
            max_file_size_mb: 10,
            memory_limit_mb: 256,
            cpu_priority: "normal".to_string(),
        }
    });
    let db = Database::new("postgresql://readur:readur@localhost:5432/readur").await?;
    let queue_service = std::sync::Arc::new(
        readur::ocr_queue::OcrQueueService::new(db.clone(), db.get_pool().clone(), 1)
    );

    Ok(Arc::new(AppState {
        db,
        config,
        webdav_scheduler: None,
        source_scheduler: None,
        queue_service,
    }))
}
#[tokio::test]
async fn test_webdav_sync_duplicate_detection_skips_duplicate() -> Result<()> {
    let state = create_test_app_state().await?;
    let user_id = create_test_user(&state.db, "webdav_test").await?;

    // Test content
    let test_content = b"This is test PDF content for duplicate detection";
    let file_hash = calculate_file_hash(test_content);

    // Create existing document with same hash
    let existing_doc = create_test_document(user_id, "existing.pdf", file_hash.clone());
    state.db.create_document(existing_doc).await?;

    // Setup mock WebDAV service
    let mut webdav_service = MockWebDAVService::new();
    webdav_service.add_test_file("/test/duplicate.pdf", test_content.to_vec());

    // Create file info for the duplicate file
    let file_info = create_test_file_info("duplicate.pdf", "/test/duplicate.pdf", test_content.len() as i64);

    // Create a mock process_single_file function (since the actual one is private)
    // We'll test the duplicate detection logic directly

    // Check if duplicate exists using the new efficient method
    let duplicate_check = state.db.get_document_by_user_and_hash(user_id, &file_hash).await?;

    assert!(duplicate_check.is_some(), "Should find existing document with same hash");

    let found_doc = duplicate_check.unwrap();
    assert_eq!(found_doc.file_hash, Some(file_hash));
    assert_eq!(found_doc.user_id, user_id);

    // Verify that WebDAV tracking would record this as a duplicate
    let webdav_file = CreateWebDAVFile {
        user_id,
        webdav_path: file_info.path.clone(),
        etag: file_info.etag.clone(),
        last_modified: file_info.last_modified,
        file_size: file_info.size,
        mime_type: file_info.mime_type.clone(),
        document_id: Some(found_doc.id),
        sync_status: "duplicate_content".to_string(),
        sync_error: None,
    };

    let created_webdav_file = state.db.create_or_update_webdav_file(&webdav_file).await?;
    assert_eq!(created_webdav_file.sync_status, "duplicate_content");
    assert_eq!(created_webdav_file.document_id, Some(found_doc.id));

    Ok(())
}
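
// Recording the remote path with sync_status "duplicate_content" and the existing document's
// id means the WebDAV tracking table still knows about the file without a second document row
// being created for the same bytes.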
#[tokio::test]
async fn test_webdav_sync_duplicate_detection_processes_unique() -> Result<()> {
    let state = create_test_app_state().await?;
    let user_id = create_test_user(&state.db, "webdav_test").await?;

    // Test content
    let test_content = b"This is unique PDF content that should be processed";
    let file_hash = calculate_file_hash(test_content);

    // Verify no existing document with this hash
    let duplicate_check = state.db.get_document_by_user_and_hash(user_id, &file_hash).await?;
    assert!(duplicate_check.is_none(), "Should not find any existing document with this hash");

    // This indicates the file would be processed normally
    // In the actual sync, this would proceed to save the file and create a new document

    Ok(())
}

#[tokio::test]
async fn test_webdav_sync_duplicate_different_users() -> Result<()> {
    let state = create_test_app_state().await?;
    let user1_id = create_test_user(&state.db, "webdav_user1").await?;
    let user2_id = create_test_user(&state.db, "webdav_user2").await?;

    // Test content
    let test_content = b"Shared content between different users";
    let file_hash = calculate_file_hash(test_content);

    // Create document for user1 with this hash
    let user1_doc = create_test_document(user1_id, "user1.pdf", file_hash.clone());
    state.db.create_document(user1_doc).await?;

    // Check that user2 doesn't see user1's document as duplicate
    let duplicate_check = state.db.get_document_by_user_and_hash(user2_id, &file_hash).await?;
    assert!(duplicate_check.is_none(), "User2 should not see user1's document as duplicate");

    // User2 should be able to create their own document with same hash
    let user2_doc = create_test_document(user2_id, "user2.pdf", file_hash.clone());
    let result = state.db.create_document(user2_doc).await;
    assert!(result.is_ok(), "User2 should be able to create document with same hash");

    Ok(())
}
#[tokio::test]
async fn test_webdav_sync_etag_change_detection() -> Result<()> {
    let state = create_test_app_state().await?;
    let user_id = create_test_user(&state.db, "webdav_test").await?;

    let webdav_path = "/test/updated.pdf";
    let old_etag = "old-etag-123";
    let new_etag = "new-etag-456";

    // Create a document first
    let test_doc = create_test_document(user_id, "updated.pdf", "etag_test_hash_1234567890".to_string());
    let created_doc = state.db.create_document(test_doc).await?;

    // Create initial WebDAV file record
    let initial_webdav_file = CreateWebDAVFile {
        user_id,
        webdav_path: webdav_path.to_string(),
        etag: old_etag.to_string(),
        last_modified: Some(Utc::now()),
        file_size: 1024,
        mime_type: "application/pdf".to_string(),
        document_id: Some(created_doc.id),
        sync_status: "synced".to_string(),
        sync_error: None,
    };

    state.db.create_or_update_webdav_file(&initial_webdav_file).await?;

    // Check existing WebDAV file
    let existing_file = state.db.get_webdav_file_by_path(user_id, webdav_path).await?;
    assert!(existing_file.is_some());

    let existing_file = existing_file.unwrap();
    assert_eq!(existing_file.etag, old_etag);

    // Simulate file with new ETag (indicating change)
    let file_info = FileInfo {
        name: "updated.pdf".to_string(),
        path: webdav_path.to_string(),
        size: 1024,
        last_modified: Some(Utc::now()),
        etag: new_etag.to_string(),
        mime_type: "application/pdf".to_string(),
        is_directory: false,
    };

    // ETag comparison should detect change
    assert_ne!(existing_file.etag, file_info.etag, "ETag change should be detected");

    Ok(())
}

#[tokio::test]
async fn test_webdav_sync_hash_collision_prevention() -> Result<()> {
    let state = create_test_app_state().await?;
    let user_id = create_test_user(&state.db, "webdav_test").await?;

    // Create document with specific hash
    let test_hash = "abcd1234567890123456789012345678901234567890123456789012345678";
    let document = create_test_document(user_id, "original.pdf", test_hash.to_string());
    state.db.create_document(document).await?;

    // Try to create another document with same hash (should fail due to unique constraint)
    let duplicate_document = create_test_document(user_id, "duplicate.pdf", test_hash.to_string());
    let result = state.db.create_document(duplicate_document).await;

    assert!(result.is_err(), "Should not be able to create duplicate hash for same user");

    Ok(())
}
#[tokio::test]
async fn test_webdav_sync_file_content_vs_metadata_change() -> Result<()> {
    let state = create_test_app_state().await?;
    let user_id = create_test_user(&state.db, "webdav_test").await?;

    // Original content and hash
    let original_content = b"Original file content";
    let original_hash = calculate_file_hash(original_content);

    // Create original document
    let original_doc = create_test_document(user_id, "test.pdf", original_hash.clone());
    state.db.create_document(original_doc).await?;

    // Same content but different metadata (name, etc.) - should still be detected as duplicate
    let duplicate_check = state.db.get_document_by_user_and_hash(user_id, &original_hash).await?;
    assert!(duplicate_check.is_some(), "Same content should be detected as duplicate regardless of filename");

    // Different content - should not be duplicate
    let different_content = b"Different file content";
    let different_hash = calculate_file_hash(different_content);

    let unique_check = state.db.get_document_by_user_and_hash(user_id, &different_hash).await?;
    assert!(unique_check.is_none(), "Different content should not be detected as duplicate");

    Ok(())
}

#[tokio::test]
async fn test_webdav_sync_error_handling_invalid_hash() -> Result<()> {
    let state = create_test_app_state().await?;
    let user_id = create_test_user(&state.db, "webdav_test").await?;

    // Test with invalid hash formats
    let invalid_g_hash = "g".repeat(64);
    let invalid_hashes = vec![
        "", // Empty
        "short", // Too short
        "invalid_characters_!@#$", // Invalid characters
        &invalid_g_hash, // Invalid hex (contains 'g')
    ];

    for invalid_hash in invalid_hashes {
        let result = state.db.get_document_by_user_and_hash(user_id, invalid_hash).await;
        // Should handle gracefully - either return None or proper error
        match result {
            Ok(doc) => assert!(doc.is_none(), "Invalid hash should not match any document"),
            Err(_) => {} // Acceptable to return error for invalid input
        }
    }

    Ok(())
}
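
// A minimal sketch of the kind of guard a caller could apply before querying, matching the
// format asserted elsewhere in these tests (64 hex characters). This helper is illustrative
// and not part of the database layer.
#[allow(dead_code)]
fn looks_like_sha256_hex(hash: &str) -> bool {
    hash.len() == 64 && hash.chars().all(|c| c.is_ascii_hexdigit())
}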
#[tokio::test]
async fn test_webdav_sync_concurrent_duplicate_detection() -> Result<()> {
    let state = create_test_app_state().await?;
    let user_id = create_test_user(&state.db, "webdav_test").await?;

    let test_content = b"Concurrent test content";
    let file_hash = calculate_file_hash(test_content);

    // Simulate concurrent duplicate checks
    let mut handles = Vec::new();

    for i in 0..5 {
        let state_clone = state.clone();
        let hash_clone = file_hash.clone();

        let handle = tokio::spawn(async move {
            state_clone.db.get_document_by_user_and_hash(user_id, &hash_clone).await
        });

        handles.push(handle);
    }

    // Wait for all concurrent operations
    let mut all_none = true;
    for handle in handles {
        let result = handle.await??;
        if result.is_some() {
            all_none = false;
        }
    }

    // Since no document exists with this hash, all should return None
    assert!(all_none, "All concurrent checks should return None for non-existent hash");

    Ok(())
}