feat(server/client): update FailedOcrPage, add duplicate management and file hashing

perf3ct 2025-06-17 16:17:23 +00:00
parent 9dccc6d1de
commit bdb136d615
No known key found for this signature in database
GPG Key ID: 569C4EEC436F5232
12 changed files with 2008 additions and 7 deletions

View File

@ -27,6 +27,8 @@ import {
Collapse,
LinearProgress,
Snackbar,
Tabs,
Tab,
} from '@mui/material';
import {
Refresh as RefreshIcon,
@ -37,6 +39,7 @@ import {
Schedule as ScheduleIcon,
Visibility as VisibilityIcon,
Download as DownloadIcon,
FileCopy as FileCopyIcon,
} from '@mui/icons-material';
import { format } from 'date-fns';
import { api, documentService } from '../services/api';
@ -87,16 +90,54 @@ interface RetryResponse {
estimated_wait_minutes?: number;
}
interface DuplicateDocument {
id: string;
filename: string;
original_filename: string;
file_size: number;
mime_type: string;
created_at: string;
user_id: string;
}
interface DuplicateGroup {
file_hash: string;
duplicate_count: number;
first_uploaded: string;
last_uploaded: string;
documents: DuplicateDocument[];
}
interface DuplicatesResponse {
duplicates: DuplicateGroup[];
pagination: {
total: number;
limit: number;
offset: number;
has_more: boolean;
};
statistics: {
total_duplicate_groups: number;
};
}
const FailedOcrPage: React.FC = () => {
const [currentTab, setCurrentTab] = useState(0);
const [documents, setDocuments] = useState<FailedDocument[]>([]);
const [duplicates, setDuplicates] = useState<DuplicateGroup[]>([]);
const [loading, setLoading] = useState(true);
const [duplicatesLoading, setDuplicatesLoading] = useState(false);
const [retrying, setRetrying] = useState<string | null>(null);
const [statistics, setStatistics] = useState<FailedOcrResponse['statistics'] | null>(null);
const [duplicateStatistics, setDuplicateStatistics] = useState<DuplicatesResponse['statistics'] | null>(null);
const [pagination, setPagination] = useState({ page: 1, limit: 25 });
const [duplicatesPagination, setDuplicatesPagination] = useState({ page: 1, limit: 25 });
const [totalPages, setTotalPages] = useState(0);
const [duplicatesTotalPages, setDuplicatesTotalPages] = useState(0);
const [selectedDocument, setSelectedDocument] = useState<FailedDocument | null>(null);
const [detailsOpen, setDetailsOpen] = useState(false);
const [expandedRows, setExpandedRows] = useState<Set<string>>(new Set());
const [expandedDuplicateGroups, setExpandedDuplicateGroups] = useState<Set<string>>(new Set());
const [snackbar, setSnackbar] = useState<{ open: boolean; message: string; severity: 'success' | 'error' }>({
open: false,
message: '',
@ -124,10 +165,37 @@ const FailedOcrPage: React.FC = () => {
}
};
const fetchDuplicates = async () => {
try {
setDuplicatesLoading(true);
const offset = (duplicatesPagination.page - 1) * duplicatesPagination.limit;
const response = await documentService.getDuplicates(duplicatesPagination.limit, offset);
setDuplicates(response.data.duplicates);
setDuplicateStatistics(response.data.statistics);
setDuplicatesTotalPages(Math.ceil(response.data.pagination.total / duplicatesPagination.limit));
} catch (error) {
console.error('Failed to fetch duplicates:', error);
setSnackbar({
open: true,
message: 'Failed to load duplicate documents',
severity: 'error'
});
} finally {
setDuplicatesLoading(false);
}
};
useEffect(() => {
fetchFailedDocuments();
}, [pagination.page]);
useEffect(() => {
if (currentTab === 1) {
fetchDuplicates();
}
}, [currentTab, duplicatesPagination.page]);
const handleRetryOcr = async (document: FailedDocument) => {
try {
setRetrying(document.id);
@ -200,6 +268,28 @@ const FailedOcrPage: React.FC = () => {
setDetailsOpen(true);
};
const toggleDuplicateGroupExpansion = (groupHash: string) => {
const newExpanded = new Set(expandedDuplicateGroups);
if (newExpanded.has(groupHash)) {
newExpanded.delete(groupHash);
} else {
newExpanded.add(groupHash);
}
setExpandedDuplicateGroups(newExpanded);
};
const handleTabChange = (event: React.SyntheticEvent, newValue: number) => {
setCurrentTab(newValue);
};
const refreshCurrentTab = () => {
if (currentTab === 0) {
fetchFailedDocuments();
} else {
fetchDuplicates();
}
};
if (loading && documents.length === 0) {
return (
<Box display="flex" justifyContent="center" alignItems="center" minHeight="400px">
@ -212,20 +302,38 @@ const FailedOcrPage: React.FC = () => {
<Box sx={{ p: 3 }}>
<Box display="flex" justifyContent="space-between" alignItems="center" mb={3}>
<Typography variant="h4" component="h1">
Failed OCR Documents
Failed OCR & Duplicates
</Typography>
<Button
variant="outlined"
startIcon={<RefreshIcon />}
onClick={fetchFailedDocuments}
disabled={loading}
onClick={refreshCurrentTab}
disabled={loading || duplicatesLoading}
>
Refresh
</Button>
</Box>
{/* Statistics Overview */}
{statistics && (
<Paper sx={{ mb: 3 }}>
<Tabs value={currentTab} onChange={handleTabChange} aria-label="failed ocr and duplicates tabs">
<Tab
icon={<ErrorIcon />}
label={`Failed OCR${statistics ? ` (${statistics.total_failed})` : ''}`}
iconPosition="start"
/>
<Tab
icon={<FileCopyIcon />}
label={`Duplicates${duplicateStatistics ? ` (${duplicateStatistics.total_duplicate_groups})` : ''}`}
iconPosition="start"
/>
</Tabs>
</Paper>
{/* Failed OCR Tab Content */}
{currentTab === 0 && (
<>
{/* Statistics Overview */}
{statistics && (
<Grid container spacing={3} mb={3}>
<Grid item xs={12} md={4}>
<Card>
@ -435,6 +543,174 @@ const FailedOcrPage: React.FC = () => {
)}
</>
)}
</>
)}
{/* Duplicates Tab Content */}
{currentTab === 1 && (
<>
{/* Duplicate Statistics Overview */}
{duplicateStatistics && (
<Grid container spacing={3} mb={3}>
<Grid item xs={12} md={6}>
<Card>
<CardContent>
<Typography variant="h6" color="warning.main">
<FileCopyIcon sx={{ mr: 1, verticalAlign: 'middle' }} />
Total Duplicate Groups
</Typography>
<Typography variant="h3" color="warning.main">
{duplicateStatistics.total_duplicate_groups}
</Typography>
</CardContent>
</Card>
</Grid>
</Grid>
)}
{duplicatesLoading ? (
<Box display="flex" justifyContent="center" alignItems="center" minHeight="400px">
<CircularProgress />
</Box>
) : duplicates.length === 0 ? (
<Alert severity="success" sx={{ mt: 2 }}>
<AlertTitle>No duplicates found!</AlertTitle>
You don't have any duplicate documents. All your files have unique content.
</Alert>
) : (
<>
<Alert severity="info" sx={{ mb: 2 }}>
<AlertTitle>Duplicate Documents</AlertTitle>
These documents have identical content but may have different filenames.
You can click on each group to see all the documents with the same content.
</Alert>
<TableContainer component={Paper}>
<Table>
<TableHead>
<TableRow>
<TableCell />
<TableCell>Content Hash</TableCell>
<TableCell>Duplicate Count</TableCell>
<TableCell>First Uploaded</TableCell>
<TableCell>Last Uploaded</TableCell>
<TableCell>Actions</TableCell>
</TableRow>
</TableHead>
<TableBody>
{duplicates.map((group) => (
<React.Fragment key={group.file_hash}>
<TableRow>
<TableCell>
<IconButton
size="small"
onClick={() => toggleDuplicateGroupExpansion(group.file_hash)}
>
{expandedDuplicateGroups.has(group.file_hash) ? <ExpandLessIcon /> : <ExpandMoreIcon />}
</IconButton>
</TableCell>
<TableCell>
<Typography variant="body2" fontFamily="monospace">
{group.file_hash.substring(0, 16)}...
</Typography>
</TableCell>
<TableCell>
<Chip
label={`${group.duplicate_count} files`}
color="warning"
size="small"
/>
</TableCell>
<TableCell>
<Typography variant="body2">
{format(new Date(group.first_uploaded), 'MMM dd, yyyy')}
</Typography>
</TableCell>
<TableCell>
<Typography variant="body2">
{format(new Date(group.last_uploaded), 'MMM dd, yyyy')}
</Typography>
</TableCell>
<TableCell>
<Typography variant="body2" color="text.secondary">
View files below
</Typography>
</TableCell>
</TableRow>
<TableRow>
<TableCell sx={{ paddingBottom: 0, paddingTop: 0 }} colSpan={6}>
<Collapse in={expandedDuplicateGroups.has(group.file_hash)} timeout="auto" unmountOnExit>
<Box sx={{ margin: 1, p: 2, bgcolor: 'grey.50' }}>
<Typography variant="h6" gutterBottom>
Duplicate Files ({group.duplicate_count} total)
</Typography>
<Grid container spacing={2}>
{group.documents.map((doc, index) => (
<Grid item xs={12} md={6} key={doc.id}>
<Card variant="outlined">
<CardContent>
<Typography variant="body2" fontWeight="bold">
{doc.filename}
</Typography>
{doc.original_filename !== doc.filename && (
<Typography variant="caption" color="text.secondary">
Original: {doc.original_filename}
</Typography>
)}
<Typography variant="caption" display="block" color="text.secondary">
{formatFileSize(doc.file_size)} {doc.mime_type}
</Typography>
<Typography variant="caption" display="block" color="text.secondary">
Uploaded: {format(new Date(doc.created_at), 'MMM dd, yyyy HH:mm')}
</Typography>
<Box mt={1}>
<Tooltip title="View Document">
<IconButton
size="small"
onClick={() => window.open(`/api/documents/${doc.id}/view`, '_blank')}
>
<VisibilityIcon />
</IconButton>
</Tooltip>
<Tooltip title="Download Document">
<IconButton
size="small"
onClick={() => window.open(`/api/documents/${doc.id}/download`, '_blank')}
>
<DownloadIcon />
</IconButton>
</Tooltip>
</Box>
</CardContent>
</Card>
</Grid>
))}
</Grid>
</Box>
</Collapse>
</TableCell>
</TableRow>
</React.Fragment>
))}
</TableBody>
</Table>
</TableContainer>
{/* Duplicates Pagination */}
{duplicatesTotalPages > 1 && (
<Box display="flex" justifyContent="center" mt={3}>
<Pagination
count={duplicatesTotalPages}
page={duplicatesPagination.page}
onChange={(_, page) => setDuplicatesPagination(prev => ({ ...prev, page }))}
color="primary"
/>
</Box>
)}
</>
)}
</>
)}
{/* Document Details Dialog */}
<Dialog

View File

@ -183,6 +183,12 @@ export const documentService = {
})
},
getDuplicates: (limit = 25, offset = 0) => {
return api.get(`/documents/duplicates`, {
params: { limit, offset },
})
},
search: (searchRequest: SearchRequest) => {
return api.get<SearchResponse>('/search', {
params: searchRequest,

View File

@ -991,6 +991,7 @@ impl Database {
created_at: row.get("created_at"),
updated_at: row.get("updated_at"),
user_id: row.get("user_id"),
file_hash: row.get("file_hash"),
});
}
@ -1125,6 +1126,7 @@ impl Database {
created_at: row.get("created_at"),
updated_at: row.get("updated_at"),
user_id: row.get("user_id"),
file_hash: row.get("file_hash"),
})),
None => Ok(None),
}
@ -1170,4 +1172,124 @@ impl Database {
None => Ok(None),
}
}
/// Get documents grouped by duplicate hashes for a user
pub async fn get_user_duplicates(&self, user_id: Uuid, user_role: crate::models::UserRole, limit: i64, offset: i64) -> Result<(Vec<serde_json::Value>, i64)> {
let (docs_query, count_query) = if user_role == crate::models::UserRole::Admin {
// Admins can see all duplicates
(
r#"
SELECT
file_hash,
COUNT(*) as duplicate_count,
MIN(created_at) as first_uploaded,
MAX(created_at) as last_uploaded,
json_agg(
json_build_object(
'id', id,
'filename', filename,
'original_filename', original_filename,
'file_size', file_size,
'mime_type', mime_type,
'created_at', created_at,
'user_id', user_id
) ORDER BY created_at
) as documents
FROM documents
WHERE file_hash IS NOT NULL
GROUP BY file_hash
HAVING COUNT(*) > 1
ORDER BY duplicate_count DESC, first_uploaded DESC
LIMIT $1 OFFSET $2
"#,
r#"
SELECT COUNT(*) as total FROM (
SELECT file_hash
FROM documents
WHERE file_hash IS NOT NULL
GROUP BY file_hash
HAVING COUNT(*) > 1
) as duplicate_groups
"#
)
} else {
// Regular users see only their own duplicates
(
r#"
SELECT
file_hash,
COUNT(*) as duplicate_count,
MIN(created_at) as first_uploaded,
MAX(created_at) as last_uploaded,
json_agg(
json_build_object(
'id', id,
'filename', filename,
'original_filename', original_filename,
'file_size', file_size,
'mime_type', mime_type,
'created_at', created_at,
'user_id', user_id
) ORDER BY created_at
) as documents
FROM documents
WHERE user_id = $3 AND file_hash IS NOT NULL
GROUP BY file_hash
HAVING COUNT(*) > 1
ORDER BY duplicate_count DESC, first_uploaded DESC
LIMIT $1 OFFSET $2
"#,
r#"
SELECT COUNT(*) as total FROM (
SELECT file_hash
FROM documents
WHERE user_id = $1 AND file_hash IS NOT NULL
GROUP BY file_hash
HAVING COUNT(*) > 1
) as duplicate_groups
"#
)
};
let rows = if user_role == crate::models::UserRole::Admin {
sqlx::query(docs_query)
.bind(limit)
.bind(offset)
.fetch_all(&self.pool)
.await?
} else {
sqlx::query(docs_query)
.bind(limit)
.bind(offset)
.bind(user_id)
.fetch_all(&self.pool)
.await?
};
let duplicates: Vec<serde_json::Value> = rows
.into_iter()
.map(|row| {
serde_json::json!({
"file_hash": row.get::<String, _>("file_hash"),
"duplicate_count": row.get::<i64, _>("duplicate_count"),
"first_uploaded": row.get::<chrono::DateTime<chrono::Utc>, _>("first_uploaded"),
"last_uploaded": row.get::<chrono::DateTime<chrono::Utc>, _>("last_uploaded"),
"documents": row.get::<serde_json::Value, _>("documents")
})
})
.collect();
let total = if user_role == crate::models::UserRole::Admin {
sqlx::query_scalar::<_, i64>(count_query)
.fetch_one(&self.pool)
.await?
} else {
sqlx::query_scalar::<_, i64>(count_query)
.bind(user_id)
.fetch_one(&self.pool)
.await?
};
Ok((duplicates, total))
}
}

View File

@ -37,6 +37,7 @@ pub fn router() -> Router<Arc<AppState>> {
.route("/{id}/processed-image", get(get_processed_image))
.route("/{id}/retry-ocr", post(retry_ocr))
.route("/failed-ocr", get(get_failed_ocr_documents))
.route("/duplicates", get(get_user_duplicates))
}
#[utoipa::path(
@ -226,7 +227,7 @@ fn calculate_file_hash(data: &[u8]) -> String {
("ocr_status" = Option<String>, Query, description = "Filter by OCR status (pending, processing, completed, failed)")
),
responses(
(status = 200, description = "List of user documents", body = Vec<DocumentResponse>),
(status = 200, description = "Paginated list of user documents with metadata", body = String),
(status = 401, description = "Unauthorized")
)
)]
@ -809,4 +810,50 @@ async fn get_failure_statistics(
.collect();
Ok(serde_json::json!(categories))
}
#[utoipa::path(
get,
path = "/api/documents/duplicates",
tag = "documents",
security(
("bearer_auth" = [])
),
params(
("limit" = Option<i64>, Query, description = "Number of duplicate groups to return per page"),
("offset" = Option<i64>, Query, description = "Number of duplicate groups to skip")
),
responses(
(status = 200, description = "User's duplicate documents grouped by hash", body = String),
(status = 401, description = "Unauthorized")
)
)]
async fn get_user_duplicates(
State(state): State<Arc<AppState>>,
auth_user: AuthUser,
Query(query): Query<PaginationQuery>,
) -> Result<Json<serde_json::Value>, StatusCode> {
let limit = query.limit.unwrap_or(25);
let offset = query.offset.unwrap_or(0);
let (duplicates, total_count) = state
.db
.get_user_duplicates(auth_user.user.id, auth_user.user.role, limit, offset)
.await
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
let response = serde_json::json!({
"duplicates": duplicates,
"pagination": {
"total": total_count,
"limit": limit,
"offset": offset,
"has_more": offset + limit < total_count
},
"statistics": {
"total_duplicate_groups": total_count
}
});
Ok(Json(response))
}

View File

@ -8,7 +8,12 @@ use crate::{
models::{
CreateUser, LoginRequest, LoginResponse, UserResponse, UpdateUser,
DocumentResponse, SearchRequest, SearchResponse, EnhancedDocumentResponse,
SettingsResponse, UpdateSettings, SearchMode, SearchSnippet, HighlightRange
SettingsResponse, UpdateSettings, SearchMode, SearchSnippet, HighlightRange,
FacetItem, SearchFacetsResponse, Notification, NotificationSummary, CreateNotification,
Source, SourceResponse, CreateSource, UpdateSource, SourceWithStats,
WebDAVSourceConfig, LocalFolderSourceConfig, S3SourceConfig,
WebDAVCrawlEstimate, WebDAVTestConnection, WebDAVConnectionResult, WebDAVSyncStatus,
ProcessedImage, CreateProcessedImage
},
routes::metrics::{
SystemMetrics, DatabaseMetrics, OcrMetrics, DocumentMetrics, UserMetrics, GeneralSystemMetrics
@ -26,10 +31,19 @@ use crate::{
// Document endpoints
crate::routes::documents::upload_document,
crate::routes::documents::list_documents,
crate::routes::documents::get_document_by_id,
crate::routes::documents::download_document,
crate::routes::documents::view_document,
crate::routes::documents::get_document_thumbnail,
crate::routes::documents::get_document_ocr,
crate::routes::documents::get_processed_image,
crate::routes::documents::retry_ocr,
crate::routes::documents::get_failed_ocr_documents,
crate::routes::documents::get_user_duplicates,
// Search endpoints
crate::routes::search::search_documents,
crate::routes::search::enhanced_search_documents,
crate::routes::search::get_search_facets,
// Settings endpoints
crate::routes::settings::get_settings,
crate::routes::settings::update_settings,
@ -42,14 +56,46 @@ use crate::{
// Queue endpoints
crate::routes::queue::get_queue_stats,
crate::routes::queue::requeue_failed,
crate::routes::queue::get_ocr_status,
crate::routes::queue::pause_ocr_processing,
crate::routes::queue::resume_ocr_processing,
// Metrics endpoints
crate::routes::metrics::get_system_metrics,
// Notifications endpoints
crate::routes::notifications::get_notifications,
crate::routes::notifications::get_notification_summary,
crate::routes::notifications::mark_notification_read,
crate::routes::notifications::mark_all_notifications_read,
crate::routes::notifications::delete_notification,
// Sources endpoints
crate::routes::sources::list_sources,
crate::routes::sources::create_source,
crate::routes::sources::get_source,
crate::routes::sources::update_source,
crate::routes::sources::delete_source,
crate::routes::sources::trigger_sync,
crate::routes::sources::stop_sync,
crate::routes::sources::test_connection,
crate::routes::sources::estimate_crawl,
crate::routes::sources::estimate_crawl_with_config,
crate::routes::sources::test_connection_with_config,
// WebDAV endpoints
crate::routes::webdav::start_webdav_sync,
crate::routes::webdav::cancel_webdav_sync,
crate::routes::webdav::get_webdav_sync_status,
crate::routes::webdav::test_webdav_connection,
crate::routes::webdav::estimate_webdav_crawl,
),
components(
schemas(
CreateUser, LoginRequest, LoginResponse, UserResponse, UpdateUser,
DocumentResponse, SearchRequest, SearchResponse, EnhancedDocumentResponse,
SettingsResponse, UpdateSettings, SearchMode, SearchSnippet, HighlightRange,
FacetItem, SearchFacetsResponse, Notification, NotificationSummary, CreateNotification,
Source, SourceResponse, CreateSource, UpdateSource, SourceWithStats,
WebDAVSourceConfig, LocalFolderSourceConfig, S3SourceConfig,
WebDAVCrawlEstimate, WebDAVTestConnection, WebDAVConnectionResult, WebDAVSyncStatus,
ProcessedImage, CreateProcessedImage,
SystemMetrics, DatabaseMetrics, OcrMetrics, DocumentMetrics, UserMetrics, GeneralSystemMetrics
)
),
@ -61,6 +107,9 @@ use crate::{
(name = "users", description = "User management endpoints"),
(name = "queue", description = "OCR queue management endpoints"),
(name = "metrics", description = "System metrics and monitoring endpoints"),
(name = "notifications", description = "User notification endpoints"),
(name = "sources", description = "Document source management endpoints"),
(name = "webdav", description = "WebDAV synchronization endpoints"),
),
modifiers(&SecurityAddon),
info(

View File

@ -48,6 +48,7 @@ mod tests {
created_at: Utc::now(),
updated_at: Utc::now(),
user_id,
file_hash: Some("abcd1234567890123456789012345678901234567890123456789012345678".to_string()),
}
}

View File

@ -25,6 +25,7 @@ mod tests {
created_at: Utc::now(),
updated_at: Utc::now(),
user_id,
file_hash: Some("1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef".to_string()),
}
}
@ -48,6 +49,7 @@ mod tests {
created_at: Utc::now(),
updated_at: Utc::now(),
user_id,
file_hash: Some("fedcba0987654321fedcba0987654321fedcba0987654321fedcba0987654321".to_string()),
}
}
@ -71,6 +73,7 @@ mod tests {
created_at: Utc::now(),
updated_at: Utc::now(),
user_id,
file_hash: Some("abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890".to_string()),
}
}

View File

@ -938,6 +938,7 @@ mod tests {
created_at: Utc::now(),
updated_at: Utc::now(),
user_id: user.id,
file_hash: Some("0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef".to_string()),
};
db.create_document(document).await.unwrap();

View File

@ -0,0 +1,391 @@
use anyhow::Result;
use chrono::Utc;
use std::sync::Arc;
use uuid::Uuid;
use sha2::{Sha256, Digest};
use readur::{
AppState,
db::Database,
config::Config,
models::{Document, CreateUser, UserRole},
};
// Helper function to calculate file hash
fn calculate_file_hash(data: &[u8]) -> String {
let mut hasher = Sha256::new();
hasher.update(data);
let result = hasher.finalize();
format!("{:x}", result)
}
// Helper function to create test document
fn create_test_document(user_id: Uuid, filename: &str, file_hash: String) -> Document {
Document {
id: Uuid::new_v4(),
filename: filename.to_string(),
original_filename: filename.to_string(),
file_path: format!("/tmp/{}", filename),
file_size: 1024,
mime_type: "application/pdf".to_string(),
content: None,
ocr_text: None,
ocr_confidence: None,
ocr_word_count: None,
ocr_processing_time_ms: None,
ocr_status: Some("pending".to_string()),
ocr_error: None,
ocr_completed_at: None,
tags: Vec::new(),
created_at: Utc::now(),
updated_at: Utc::now(),
user_id,
file_hash: Some(file_hash),
}
}
// Helper function to create test user with unique identifier
fn create_test_user_with_suffix(suffix: &str) -> CreateUser {
CreateUser {
username: format!("testuser_{}", suffix),
email: format!("test_{}@example.com", suffix),
password: "test_password".to_string(),
role: Some(UserRole::User),
}
}
async fn create_test_app_state() -> Result<Arc<AppState>> {
let config = Config::from_env().unwrap_or_else(|_| {
// Create a test config if env fails
Config {
database_url: "postgresql://readur:readur@localhost:5432/readur".to_string(),
server_address: "127.0.0.1:8000".to_string(),
jwt_secret: "test-secret".to_string(),
upload_path: "./test-uploads".to_string(),
watch_folder: "./test-watch".to_string(),
allowed_file_types: vec!["pdf".to_string(), "txt".to_string()],
watch_interval_seconds: Some(30),
file_stability_check_ms: Some(500),
max_file_age_hours: None,
ocr_language: "eng".to_string(),
concurrent_ocr_jobs: 2,
ocr_timeout_seconds: 60,
max_file_size_mb: 10,
memory_limit_mb: 256,
cpu_priority: "normal".to_string(),
}
});
let db = Database::new("postgresql://readur:readur@localhost:5432/readur").await?;
let queue_service = std::sync::Arc::new(
readur::ocr_queue::OcrQueueService::new(db.clone(), db.get_pool().clone(), 1)
);
Ok(Arc::new(AppState {
db,
config,
webdav_scheduler: None,
source_scheduler: None,
queue_service,
}))
}
#[tokio::test]
async fn test_document_upload_duplicate_detection_returns_existing() -> Result<()> {
let state = create_test_app_state().await?;
let user = create_test_user_with_suffix(&format!("upload_{}", uuid::Uuid::new_v4().simple()));
// Create user in database
let created_user = state.db.create_user(user).await?;
let user_id = created_user.id;
// Test content
let test_content = b"This is test PDF content for upload duplicate detection";
let file_hash = calculate_file_hash(test_content);
// Create existing document with same hash
let existing_doc = create_test_document(user_id, "existing.pdf", file_hash.clone());
let created_doc = state.db.create_document(existing_doc).await?;
// Test that the hash lookup would find the existing document
let duplicate_check = state.db.get_document_by_user_and_hash(user_id, &file_hash).await?;
assert!(duplicate_check.is_some(), "Should find existing document with same hash");
let found_doc = duplicate_check.unwrap();
assert_eq!(found_doc.id, created_doc.id);
assert_eq!(found_doc.file_hash, Some(file_hash));
Ok(())
}
#[tokio::test]
async fn test_document_upload_unique_content_processed() -> Result<()> {
let state = create_test_app_state().await?;
let user = create_test_user_with_suffix(&format!("upload_{}", uuid::Uuid::new_v4().simple()));
// Create user in database
let created_user = state.db.create_user(user).await?;
let user_id = created_user.id;
// Test content
let test_content = b"This is unique PDF content for upload processing";
let file_hash = calculate_file_hash(test_content);
// Verify no existing document with this hash
let duplicate_check = state.db.get_document_by_user_and_hash(user_id, &file_hash).await?;
assert!(duplicate_check.is_none(), "Should not find any existing document with this hash");
Ok(())
}
#[tokio::test]
async fn test_document_upload_different_users_same_content() -> Result<()> {
let state = create_test_app_state().await?;
// Create two users
let user1 = create_test_user_with_suffix(&format!("different_users_1_{}", Uuid::new_v4().simple()));
let created_user1 = state.db.create_user(user1).await?;
let user1_id = created_user1.id;
let user2 = create_test_user_with_suffix(&format!("different_users_2_{}", Uuid::new_v4().simple()));
let created_user2 = state.db.create_user(user2).await?;
let user2_id = created_user2.id;
// Test content
let test_content = b"Shared content between different users for upload";
let file_hash = calculate_file_hash(test_content);
// Create document for user1 with this hash
let user1_doc = create_test_document(user1_id, "user1.pdf", file_hash.clone());
state.db.create_document(user1_doc).await?;
// Check that user2 doesn't see user1's document as duplicate
let duplicate_check = state.db.get_document_by_user_and_hash(user2_id, &file_hash).await?;
assert!(duplicate_check.is_none(), "User2 should not see user1's document as duplicate");
// User2 should be able to create their own document with same hash
let user2_doc = create_test_document(user2_id, "user2.pdf", file_hash.clone());
let result = state.db.create_document(user2_doc).await;
assert!(result.is_ok(), "User2 should be able to create document with same hash");
Ok(())
}
#[tokio::test]
async fn test_document_upload_hash_calculation_accuracy() -> Result<()> {
// Test various file contents and ensure hash calculation is accurate
let test_cases = vec![
(b"" as &[u8], "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"), // Empty
(b"a", "ca978112ca1bbdcafac231b39a23dc4da786eff8147c4e72b9807785afee48bb"), // Single char
(b"Hello, World!", "dffd6021bb2bd5b0af676290809ec3a53191dd81c7f70a4b28688a362182986f"), // Text
];
for (content, expected_hash) in test_cases {
let calculated_hash = calculate_file_hash(content);
assert_eq!(calculated_hash, expected_hash, "Hash mismatch for content: {:?}", content);
}
Ok(())
}
#[tokio::test]
async fn test_document_upload_large_file_hash() -> Result<()> {
// Test hash calculation for larger files
let large_content = vec![b'X'; 1_000_000]; // 1MB of 'X' characters
let hash1 = calculate_file_hash(&large_content);
let hash2 = calculate_file_hash(&large_content);
// Hash should be consistent
assert_eq!(hash1, hash2);
assert_eq!(hash1.len(), 64); // SHA256 hex length
assert!(hash1.chars().all(|c| c.is_ascii_hexdigit()));
Ok(())
}
#[tokio::test]
async fn test_document_upload_binary_content_hash() -> Result<()> {
// Test hash calculation for binary content
let mut binary_content = Vec::new();
for i in 0..256 {
binary_content.push(i as u8);
}
let hash = calculate_file_hash(&binary_content);
assert_eq!(hash.len(), 64);
assert!(hash.chars().all(|c| c.is_ascii_hexdigit()));
// Same binary content should produce same hash
let hash2 = calculate_file_hash(&binary_content);
assert_eq!(hash, hash2);
Ok(())
}
#[tokio::test]
async fn test_document_upload_duplicate_prevention_database_constraint() -> Result<()> {
let state = create_test_app_state().await?;
let user = create_test_user_with_suffix(&format!("upload_{}", uuid::Uuid::new_v4().simple()));
// Create user in database and get the created user
let created_user = state.db.create_user(user).await?;
let user_id = created_user.id;
let test_hash = "duplicate_upload_test_hash_123456789012345678901234567890123456";
// Create first document with the hash
let doc1 = create_test_document(user_id, "test1.pdf", test_hash.to_string());
let result1 = state.db.create_document(doc1).await;
assert!(result1.is_ok(), "First document should be created successfully");
// Try to create second document with same hash for same user
let doc2 = create_test_document(user_id, "test2.pdf", test_hash.to_string());
let result2 = state.db.create_document(doc2).await;
// This should fail due to unique constraint
assert!(result2.is_err(), "Second document with same hash should fail");
Ok(())
}
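The unique constraint that this test (and the race-condition tests later in this commit) relies on is created by a migration that is not part of the hunks shown here. As a rough sketch only, assuming PostgreSQL, such a constraint could be a partial unique index over (user_id, file_hash): it rejects a second document with the same hash for the same user, while still allowing identical content across users and any number of documents with a NULL hash. The index name and helper below are illustrative, not the actual migration from this commit.
// Hypothetical migration sketch -- the real migration is not shown in this diff.
async fn apply_file_hash_constraint(pool: &sqlx::PgPool) -> anyhow::Result<()> {
    sqlx::query(
        r#"
        CREATE UNIQUE INDEX IF NOT EXISTS idx_documents_user_id_file_hash
        ON documents (user_id, file_hash)
        WHERE file_hash IS NOT NULL
        "#,
    )
    .execute(pool)
    .await?;
    Ok(())
}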
#[tokio::test]
async fn test_document_upload_filename_vs_content_duplicate() -> Result<()> {
let state = create_test_app_state().await?;
let user = create_test_user_with_suffix(&format!("upload_{}", uuid::Uuid::new_v4().simple()));
// Create user in database and get the created user
let created_user = state.db.create_user(user).await?;
let user_id = created_user.id;
// Same content, different filenames
let content = b"Same content, different names";
let hash = calculate_file_hash(content);
// Create first document
let doc1 = create_test_document(user_id, "document_v1.pdf", hash.clone());
state.db.create_document(doc1).await?;
// Check that same content is detected as duplicate regardless of filename
let duplicate_check = state.db.get_document_by_user_and_hash(user_id, &hash).await?;
assert!(duplicate_check.is_some(), "Same content should be detected as duplicate regardless of filename");
Ok(())
}
#[tokio::test]
async fn test_document_upload_unicode_content_hash() -> Result<()> {
// Test hash calculation with unicode content
let unicode_content = "Hello 世界 🌍 café naïve résumé".as_bytes();
let hash1 = calculate_file_hash(unicode_content);
let hash2 = calculate_file_hash(unicode_content);
// Hash should be consistent for unicode content
assert_eq!(hash1, hash2);
assert_eq!(hash1.len(), 64);
assert!(hash1.chars().all(|c| c.is_ascii_hexdigit()));
Ok(())
}
#[tokio::test]
async fn test_document_upload_concurrent_same_content() -> Result<()> {
let state = create_test_app_state().await?;
let user = create_test_user_with_suffix(&format!("upload_{}", uuid::Uuid::new_v4().simple()));
// Create user in database and get the created user
let created_user = state.db.create_user(user).await?;
let user_id = created_user.id;
let test_content = b"Concurrent upload test content";
let file_hash = calculate_file_hash(test_content);
// Simulate concurrent uploads of same content
let mut handles = Vec::new();
for i in 0..5 {
let state_clone = state.clone();
let hash_clone = file_hash.clone();
let handle = tokio::spawn(async move {
let doc = create_test_document(user_id, &format!("concurrent{}.pdf", i), hash_clone);
state_clone.db.create_document(doc).await
});
handles.push(handle);
}
// Wait for all operations and count results
let mut success_count = 0;
let mut error_count = 0;
for handle in handles {
match handle.await? {
Ok(_) => success_count += 1,
Err(_) => error_count += 1,
}
}
// Only one should succeed due to unique constraint
assert_eq!(success_count, 1, "Only one document should be created successfully");
assert_eq!(error_count, 4, "Four operations should fail due to duplicate hash");
Ok(())
}
#[tokio::test]
async fn test_document_upload_mime_type_independence() -> Result<()> {
let state = create_test_app_state().await?;
let user = create_test_user_with_suffix(&format!("upload_{}", uuid::Uuid::new_v4().simple()));
// Create user in database and get the created user
let created_user = state.db.create_user(user).await?;
let user_id = created_user.id;
let content = b"Same content, different perceived types";
let hash = calculate_file_hash(content);
// Create document as PDF
let mut pdf_doc = create_test_document(user_id, "test.pdf", hash.clone());
pdf_doc.mime_type = "application/pdf".to_string();
state.db.create_document(pdf_doc).await?;
// Try to upload same content as text file - should be detected as duplicate
let duplicate_check = state.db.get_document_by_user_and_hash(user_id, &hash).await?;
assert!(duplicate_check.is_some(), "Same content should be detected as duplicate regardless of MIME type");
Ok(())
}
#[tokio::test]
async fn test_document_upload_performance_hash_lookup() -> Result<()> {
let state = create_test_app_state().await?;
let user = create_test_user_with_suffix(&format!("upload_{}", uuid::Uuid::new_v4().simple()));
// Create user in database and get the created user
let created_user = state.db.create_user(user).await?;
let user_id = created_user.id;
// Create multiple documents with different hashes
let mut test_hashes = Vec::new();
for i in 0..50 {
let content = format!("Performance test content {}", i);
let hash = calculate_file_hash(content.as_bytes());
test_hashes.push(hash.clone());
let doc = create_test_document(user_id, &format!("perf_test_{}.pdf", i), hash);
state.db.create_document(doc).await?;
}
// Measure hash lookup performance
let start = std::time::Instant::now();
for hash in &test_hashes {
let result = state.db.get_document_by_user_and_hash(user_id, hash).await?;
assert!(result.is_some(), "Should find document with hash: {}", hash);
}
let duration = start.elapsed();
// Hash lookups should be very fast
assert!(duration.as_millis() < 2000, "Hash lookups should be fast even with many documents: {:?}", duration);
Ok(())
}
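Taken together, the tests in this file exercise the upload-side duplicate check: hash the incoming bytes, then look for an existing document with the same hash owned by the same user. A minimal sketch of that check, reusing calculate_file_hash and the imports already present in this file, might look like the following (illustrative only; the actual upload handler lives in server code not shown in this hunk):
// Hypothetical helper mirroring the duplicate check the tests above exercise.
// Returns the already-stored document if this user has uploaded identical content before.
async fn find_existing_by_content(
    db: &Database,
    user_id: Uuid,
    data: &[u8],
) -> anyhow::Result<Option<Document>> {
    // Same SHA256 hex digest produced by calculate_file_hash above
    let file_hash = calculate_file_hash(data);
    // The lookup is scoped to the user, so identical content owned by another user is not returned
    Ok(db.get_document_by_user_and_hash(user_id, &file_hash).await?)
}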

View File

@ -0,0 +1,276 @@
use anyhow::Result;
use chrono::Utc;
use uuid::Uuid;
use sha2::{Sha256, Digest};
use tempfile::TempDir;
use readur::{
db::Database,
file_service::FileService,
models::{Document, CreateUser, UserRole},
};
const TEST_DB_URL: &str = "postgresql://readur:readur@localhost:5432/readur";
// Helper function to create a test user with unique identifier
async fn create_test_user(db: &Database, username: &str) -> Result<Uuid> {
let unique_suffix = Uuid::new_v4().simple();
let user = CreateUser {
username: format!("{}_{}", username, unique_suffix),
email: format!("{}_{}@example.com", username, unique_suffix),
password: "password123".to_string(),
role: Some(UserRole::User),
};
let created_user = db.create_user(user).await?;
Ok(created_user.id)
}
// Helper function to calculate file hash
fn calculate_file_hash(data: &[u8]) -> String {
let mut hasher = Sha256::new();
hasher.update(data);
let result = hasher.finalize();
format!("{:x}", result)
}
// Helper function to create a test document
fn create_test_document(user_id: Uuid, filename: &str, file_hash: Option<String>) -> Document {
Document {
id: Uuid::new_v4(),
filename: filename.to_string(),
original_filename: filename.to_string(),
file_path: format!("/tmp/{}", filename),
file_size: 1024,
mime_type: "application/pdf".to_string(),
content: None,
ocr_text: None,
ocr_confidence: None,
ocr_word_count: None,
ocr_processing_time_ms: None,
ocr_status: Some("pending".to_string()),
ocr_error: None,
ocr_completed_at: None,
tags: Vec::new(),
created_at: Utc::now(),
updated_at: Utc::now(),
user_id,
file_hash,
}
}
#[tokio::test]
async fn test_get_document_by_user_and_hash_found() -> Result<()> {
let db = Database::new(TEST_DB_URL).await?;
let user_id = create_test_user(&db, "testuser1").await?;
let file_hash = "abcd1234567890";
// Create a document with the hash
let document = create_test_document(user_id, "test.pdf", Some(file_hash.to_string()));
let created_doc = db.create_document(document).await?;
// Test finding the document by hash
let found_doc = db.get_document_by_user_and_hash(user_id, file_hash).await?;
assert!(found_doc.is_some());
let found_doc = found_doc.unwrap();
assert_eq!(found_doc.id, created_doc.id);
assert_eq!(found_doc.file_hash, Some(file_hash.to_string()));
assert_eq!(found_doc.user_id, user_id);
Ok(())
}
#[tokio::test]
async fn test_get_document_by_user_and_hash_not_found() -> Result<()> {
let db = Database::new(TEST_DB_URL).await?;
let user_id = Uuid::new_v4();
let non_existent_hash = "nonexistent1234567890";
// Test finding a non-existent hash
let found_doc = db.get_document_by_user_and_hash(user_id, non_existent_hash).await?;
assert!(found_doc.is_none());
Ok(())
}
#[tokio::test]
async fn test_get_document_by_user_and_hash_different_user() -> Result<()> {
let db = Database::new(TEST_DB_URL).await?;
let user1_id = create_test_user(&db, "testuser2").await?;
let user2_id = create_test_user(&db, "testuser3").await?;
let file_hash = "shared_hash_1234567890";
// Create a document for user1 with the hash
let document = create_test_document(user1_id, "test.pdf", Some(file_hash.to_string()));
db.create_document(document).await?;
// Test that user2 cannot find user1's document by hash
let found_doc = db.get_document_by_user_and_hash(user2_id, file_hash).await?;
assert!(found_doc.is_none(), "User should not be able to access another user's documents");
Ok(())
}
#[tokio::test]
async fn test_duplicate_hash_prevention_same_user() -> Result<()> {
let db = Database::new(TEST_DB_URL).await?;
let user_id = create_test_user(&db, "testuser4").await?;
let file_hash = "duplicate_hash_1234567890";
// Create first document with the hash
let document1 = create_test_document(user_id, "test1.pdf", Some(file_hash.to_string()));
let result1 = db.create_document(document1).await;
assert!(result1.is_ok(), "First document with hash should be created successfully");
// Try to create second document with same hash for same user
let document2 = create_test_document(user_id, "test2.pdf", Some(file_hash.to_string()));
let result2 = db.create_document(document2).await;
// This should fail due to unique constraint
assert!(result2.is_err(), "Second document with same hash for same user should fail");
Ok(())
}
#[tokio::test]
async fn test_same_hash_different_users_allowed() -> Result<()> {
let db = Database::new(TEST_DB_URL).await?;
let user1_id = create_test_user(&db, "testuser5").await?;
let user2_id = create_test_user(&db, "testuser6").await?;
let file_hash = "shared_content_hash_1234567890";
// Create document for user1 with the hash
let document1 = create_test_document(user1_id, "test1.pdf", Some(file_hash.to_string()));
let result1 = db.create_document(document1).await;
assert!(result1.is_ok(), "First user's document should be created successfully");
// Create document for user2 with same hash
let document2 = create_test_document(user2_id, "test2.pdf", Some(file_hash.to_string()));
let result2 = db.create_document(document2).await;
assert!(result2.is_ok(), "Second user's document with same hash should be allowed");
// Verify both users can find their respective documents
let found_doc1 = db.get_document_by_user_and_hash(user1_id, file_hash).await?;
let found_doc2 = db.get_document_by_user_and_hash(user2_id, file_hash).await?;
assert!(found_doc1.is_some());
assert!(found_doc2.is_some());
assert_ne!(found_doc1.unwrap().id, found_doc2.unwrap().id);
Ok(())
}
#[tokio::test]
async fn test_null_hash_allowed_multiple() -> Result<()> {
let db = Database::new(TEST_DB_URL).await?;
let user_id = create_test_user(&db, "testuser7").await?;
// Create multiple documents with null hash (should be allowed)
let document1 = create_test_document(user_id, "test1.pdf", None);
let result1 = db.create_document(document1).await;
assert!(result1.is_ok(), "First document with null hash should be created");
let document2 = create_test_document(user_id, "test2.pdf", None);
let result2 = db.create_document(document2).await;
assert!(result2.is_ok(), "Second document with null hash should be created");
Ok(())
}
#[test]
fn test_calculate_file_hash_consistency() {
let test_data = b"Hello, World! This is test content for hash calculation.";
// Calculate hash multiple times
let hash1 = calculate_file_hash(test_data);
let hash2 = calculate_file_hash(test_data);
let hash3 = calculate_file_hash(test_data);
// All hashes should be identical
assert_eq!(hash1, hash2);
assert_eq!(hash2, hash3);
// Hash should be 64 characters (SHA256 hex)
assert_eq!(hash1.len(), 64);
// Should be valid hex
assert!(hash1.chars().all(|c| c.is_ascii_hexdigit()));
}
#[test]
fn test_calculate_file_hash_different_content() {
let data1 = b"Content 1";
let data2 = b"Content 2";
let data3 = b"content 1"; // Different case
let hash1 = calculate_file_hash(data1);
let hash2 = calculate_file_hash(data2);
let hash3 = calculate_file_hash(data3);
// All hashes should be different
assert_ne!(hash1, hash2);
assert_ne!(hash1, hash3);
assert_ne!(hash2, hash3);
}
#[test]
fn test_calculate_file_hash_empty_content() {
let empty_data = b"";
let hash = calculate_file_hash(empty_data);
// Should produce a valid hash even for empty content
assert_eq!(hash.len(), 64);
assert!(hash.chars().all(|c| c.is_ascii_hexdigit()));
// Known SHA256 hash of empty string
assert_eq!(hash, "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855");
}
#[tokio::test]
async fn test_file_service_create_document_with_hash() {
let temp_dir = TempDir::new().unwrap();
let upload_path = temp_dir.path().to_string_lossy().to_string();
let file_service = FileService::new(upload_path);
let user_id = Uuid::new_v4();
let test_hash = "test_hash_1234567890";
let document = file_service.create_document(
"test.pdf",
"original.pdf",
"/path/to/file.pdf",
1024,
"application/pdf",
user_id,
Some(test_hash.to_string()),
);
assert_eq!(document.filename, "test.pdf");
assert_eq!(document.original_filename, "original.pdf");
assert_eq!(document.file_hash, Some(test_hash.to_string()));
assert_eq!(document.user_id, user_id);
}
#[tokio::test]
async fn test_file_service_create_document_without_hash() {
let temp_dir = TempDir::new().unwrap();
let upload_path = temp_dir.path().to_string_lossy().to_string();
let file_service = FileService::new(upload_path);
let user_id = Uuid::new_v4();
let document = file_service.create_document(
"test.pdf",
"original.pdf",
"/path/to/file.pdf",
1024,
"application/pdf",
user_id,
None,
);
assert_eq!(document.filename, "test.pdf");
assert_eq!(document.original_filename, "original.pdf");
assert_eq!(document.file_hash, None);
assert_eq!(document.user_id, user_id);
}

View File

@ -0,0 +1,440 @@
use anyhow::Result;
use chrono::Utc;
use std::sync::Arc;
use uuid::Uuid;
use sha2::{Sha256, Digest};
use readur::{
AppState,
db::Database,
config::Config,
models::{FileInfo, Document, Source, SourceType, SourceStatus},
};
// Helper function to calculate file hash
fn calculate_file_hash(data: &[u8]) -> String {
let mut hasher = Sha256::new();
hasher.update(data);
let result = hasher.finalize();
format!("{:x}", result)
}
// Helper function to create test file info
fn create_test_file_info(name: &str, path: &str, content: &[u8]) -> FileInfo {
FileInfo {
name: name.to_string(),
path: path.to_string(),
size: content.len() as i64,
last_modified: Some(Utc::now()),
etag: "test-etag".to_string(),
mime_type: "application/pdf".to_string(),
is_directory: false,
}
}
// Helper function to create test document
fn create_test_document(user_id: Uuid, filename: &str, file_hash: String) -> Document {
Document {
id: Uuid::new_v4(),
filename: filename.to_string(),
original_filename: filename.to_string(),
file_path: format!("/tmp/{}", filename),
file_size: 1024,
mime_type: "application/pdf".to_string(),
content: None,
ocr_text: None,
ocr_confidence: None,
ocr_word_count: None,
ocr_processing_time_ms: None,
ocr_status: Some("pending".to_string()),
ocr_error: None,
ocr_completed_at: None,
tags: Vec::new(),
created_at: Utc::now(),
updated_at: Utc::now(),
user_id,
file_hash: Some(file_hash),
}
}
// Helper function to create test source
fn create_test_source(user_id: Uuid, source_type: SourceType) -> Source {
Source {
id: Uuid::new_v4(),
user_id,
name: "Test Source".to_string(),
source_type,
config: serde_json::json!({}),
status: SourceStatus::Idle,
enabled: true,
last_sync_at: None,
last_error: None,
last_error_at: None,
total_files_synced: 0,
total_files_pending: 0,
total_size_bytes: 0,
created_at: Utc::now(),
updated_at: Utc::now(),
}
}
// Helper function to create a test user with unique identifier
async fn create_test_user(db: &Database, username: &str) -> Result<Uuid> {
use readur::models::{CreateUser, UserRole};
let unique_suffix = Uuid::new_v4().simple();
let user = CreateUser {
username: format!("{}_{}", username, unique_suffix),
email: format!("{}_{}@example.com", username, unique_suffix),
password: "password123".to_string(),
role: Some(UserRole::User),
};
let created_user = db.create_user(user).await?;
Ok(created_user.id)
}
async fn create_test_app_state() -> Result<Arc<AppState>> {
let config = Config::from_env().unwrap_or_else(|_| {
Config {
database_url: "postgresql://readur:readur@localhost:5432/readur".to_string(),
server_address: "127.0.0.1:8000".to_string(),
jwt_secret: "test-secret".to_string(),
upload_path: "./test-uploads".to_string(),
watch_folder: "./test-watch".to_string(),
allowed_file_types: vec!["pdf".to_string(), "txt".to_string()],
watch_interval_seconds: Some(30),
file_stability_check_ms: Some(500),
max_file_age_hours: None,
ocr_language: "eng".to_string(),
concurrent_ocr_jobs: 2,
ocr_timeout_seconds: 60,
max_file_size_mb: 10,
memory_limit_mb: 256,
cpu_priority: "normal".to_string(),
}
});
let db = Database::new("postgresql://readur:readur@localhost:5432/readur").await?;
let queue_service = std::sync::Arc::new(
readur::ocr_queue::OcrQueueService::new(db.clone(), db.get_pool().clone(), 1)
);
Ok(Arc::new(AppState {
db,
config,
webdav_scheduler: None,
source_scheduler: None,
queue_service,
}))
}
#[tokio::test]
async fn test_source_sync_duplicate_detection_skips_duplicate() -> Result<()> {
let state = create_test_app_state().await?;
let user_id = create_test_user(&state.db, "source_sync_test").await?;
// Test content
let test_content = b"This is test content for source sync duplicate detection";
let file_hash = calculate_file_hash(test_content);
// Create existing document with same hash
let existing_doc = create_test_document(user_id, "existing.pdf", file_hash.clone());
state.db.create_document(existing_doc).await?;
// Check if duplicate exists using the efficient method
let duplicate_check = state.db.get_document_by_user_and_hash(user_id, &file_hash).await?;
assert!(duplicate_check.is_some(), "Should find existing document with same hash");
let found_doc = duplicate_check.unwrap();
assert_eq!(found_doc.file_hash, Some(file_hash));
assert_eq!(found_doc.user_id, user_id);
Ok(())
}
#[tokio::test]
async fn test_source_sync_duplicate_detection_processes_unique() -> Result<()> {
let state = create_test_app_state().await?;
let user_id = create_test_user(&state.db, "source_sync_test").await?;
// Test content
let test_content = b"This is unique content that should be processed by source sync";
let file_hash = calculate_file_hash(test_content);
// Verify no existing document with this hash
let duplicate_check = state.db.get_document_by_user_and_hash(user_id, &file_hash).await?;
assert!(duplicate_check.is_none(), "Should not find any existing document with this hash");
// This indicates the file would be processed normally
Ok(())
}
#[tokio::test]
async fn test_source_sync_duplicate_different_users() -> Result<()> {
let state = create_test_app_state().await?;
let user1_id = create_test_user(&state.db, "source_sync_user1").await?;
let user2_id = create_test_user(&state.db, "source_sync_user2").await?;
// Test content
let test_content = b"Shared content between different users in source sync";
let file_hash = calculate_file_hash(test_content);
// Create document for user1 with this hash
let user1_doc = create_test_document(user1_id, "user1.pdf", file_hash.clone());
state.db.create_document(user1_doc).await?;
// Check that user2 doesn't see user1's document as duplicate
let duplicate_check = state.db.get_document_by_user_and_hash(user2_id, &file_hash).await?;
assert!(duplicate_check.is_none(), "User2 should not see user1's document as duplicate");
// User2 should be able to create their own document with same hash
let user2_doc = create_test_document(user2_id, "user2.pdf", file_hash.clone());
let result = state.db.create_document(user2_doc).await;
assert!(result.is_ok(), "User2 should be able to create document with same hash");
Ok(())
}
#[tokio::test]
async fn test_source_sync_hash_calculation_consistency() -> Result<()> {
let test_content = b"Test content for hash consistency in source sync";
// Calculate hash multiple times
let hash1 = calculate_file_hash(test_content);
let hash2 = calculate_file_hash(test_content);
let hash3 = calculate_file_hash(test_content);
// All hashes should be identical
assert_eq!(hash1, hash2);
assert_eq!(hash2, hash3);
// Hash should be 64 characters (SHA256 hex)
assert_eq!(hash1.len(), 64);
// Should be valid hex
assert!(hash1.chars().all(|c| c.is_ascii_hexdigit()));
Ok(())
}
#[tokio::test]
async fn test_source_sync_duplicate_detection_performance() -> Result<()> {
let state = create_test_app_state().await?;
let user_id = create_test_user(&state.db, "source_sync_test").await?;
// Create multiple documents with different hashes
let mut created_hashes = Vec::new();
for i in 0..10 {
let content = format!("Test content number {}", i);
let hash = calculate_file_hash(content.as_bytes());
created_hashes.push(hash.clone());
let doc = create_test_document(user_id, &format!("test{}.pdf", i), hash);
state.db.create_document(doc).await?;
}
// Test lookup performance - should be fast even with multiple documents
let start = std::time::Instant::now();
for hash in &created_hashes {
let result = state.db.get_document_by_user_and_hash(user_id, hash).await?;
assert!(result.is_some(), "Should find document with hash: {}", hash);
}
let duration = start.elapsed();
assert!(duration.as_millis() < 1000, "Hash lookups should be fast: {:?}", duration);
Ok(())
}
#[tokio::test]
async fn test_source_sync_file_modification_detection() -> Result<()> {
let state = create_test_app_state().await?;
let user_id = create_test_user(&state.db, "source_sync_test").await?;
// Original content
let original_content = b"Original file content";
let original_hash = calculate_file_hash(original_content);
// Modified content (same file, different content)
let modified_content = b"Modified file content";
let modified_hash = calculate_file_hash(modified_content);
// Create document with original content
let original_doc = create_test_document(user_id, "test.pdf", original_hash.clone());
state.db.create_document(original_doc).await?;
// Check original content is found
let original_check = state.db.get_document_by_user_and_hash(user_id, &original_hash).await?;
assert!(original_check.is_some(), "Should find document with original hash");
// Check modified content is not found (different hash)
let modified_check = state.db.get_document_by_user_and_hash(user_id, &modified_hash).await?;
assert!(modified_check.is_none(), "Should not find document with modified hash");
// Verify hashes are actually different
assert_ne!(original_hash, modified_hash, "Original and modified content should have different hashes");
Ok(())
}
#[tokio::test]
async fn test_source_sync_edge_case_empty_files() -> Result<()> {
let state = create_test_app_state().await?;
let user_id = create_test_user(&state.db, "source_sync_test").await?;
// Empty file content
let empty_content = b"";
let empty_hash = calculate_file_hash(empty_content);
// Create document with empty content
let empty_doc = create_test_document(user_id, "empty.pdf", empty_hash.clone());
state.db.create_document(empty_doc).await?;
// Check empty file is found
let empty_check = state.db.get_document_by_user_and_hash(user_id, &empty_hash).await?;
assert!(empty_check.is_some(), "Should find document with empty content hash");
// Verify empty hash is the known SHA256 empty string hash
assert_eq!(empty_hash, "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855");
Ok(())
}
#[tokio::test]
async fn test_source_sync_large_file_hash_consistency() -> Result<()> {
// Simulate large file content
let large_content = vec![b'A'; 10_000_000]; // 10MB of 'A' characters
// Calculate hash
let hash = calculate_file_hash(&large_content);
// Hash should still be 64 characters
assert_eq!(hash.len(), 64);
assert!(hash.chars().all(|c| c.is_ascii_hexdigit()));
// Calculate same hash again to ensure consistency
let hash2 = calculate_file_hash(&large_content);
assert_eq!(hash, hash2);
Ok(())
}
#[tokio::test]
async fn test_source_sync_binary_file_handling() -> Result<()> {
let state = create_test_app_state().await?;
let user_id = create_test_user(&state.db, "source_sync_test").await?;
// Binary content (PDF header + some binary data)
let mut binary_content = b"%PDF-1.4\n".to_vec();
binary_content.extend_from_slice(&[0u8, 1u8, 2u8, 3u8, 255u8, 254u8, 253u8]);
let binary_hash = calculate_file_hash(&binary_content);
// Create document with binary content
let binary_doc = create_test_document(user_id, "binary.pdf", binary_hash.clone());
state.db.create_document(binary_doc).await?;
// Check binary file is found
let binary_check = state.db.get_document_by_user_and_hash(user_id, &binary_hash).await?;
assert!(binary_check.is_some(), "Should find document with binary content hash");
Ok(())
}
#[tokio::test]
async fn test_source_sync_unicode_filename_handling() -> Result<()> {
let state = create_test_app_state().await?;
let user_id = create_test_user(&state.db, "source_sync_test").await?;
// Unicode content and filename
let unicode_content = "Test content with unicode: 测试内容 🚀 café".as_bytes();
let unicode_hash = calculate_file_hash(unicode_content);
// Create document with unicode filename
let unicode_doc = create_test_document(user_id, "测试文档🚀.pdf", unicode_hash.clone());
state.db.create_document(unicode_doc).await?;
// Check unicode file is found
let unicode_check = state.db.get_document_by_user_and_hash(user_id, &unicode_hash).await?;
assert!(unicode_check.is_some(), "Should find document with unicode content hash");
let found_doc = unicode_check.unwrap();
assert_eq!(found_doc.filename, "测试文档🚀.pdf");
Ok(())
}
#[tokio::test]
async fn test_source_sync_concurrent_hash_operations() -> Result<()> {
let state = create_test_app_state().await?;
let user_id = create_test_user(&state.db, "source_sync_test").await?;
// Create multiple concurrent hash lookup operations
let mut handles = Vec::new();
for i in 0..20 {
let state_clone = state.clone();
let hash = format!("{}test_hash_concurrent_{}", "a".repeat(40), i);
let handle = tokio::spawn(async move {
state_clone.db.get_document_by_user_and_hash(user_id, &hash).await
});
handles.push(handle);
}
// Wait for all concurrent operations
let mut results = Vec::new();
for handle in handles {
let result = handle.await??;
results.push(result);
}
// All should return None (no documents exist with these hashes)
for (i, result) in results.iter().enumerate() {
assert!(result.is_none(), "Concurrent operation {} should return None", i);
}
Ok(())
}
#[tokio::test]
async fn test_source_sync_duplicate_prevention_race_condition() -> Result<()> {
let state = create_test_app_state().await?;
let user_id = create_test_user(&state.db, "source_sync_test").await?;
let test_hash = "race_condition_test_hash_123456789012345678901234567890123456";
// Try to create multiple documents with same hash concurrently
let mut handles = Vec::new();
for i in 0..5 {
let state_clone = state.clone();
let hash_clone = test_hash.to_string();
let handle = tokio::spawn(async move {
let doc = create_test_document(user_id, &format!("test{}.pdf", i), hash_clone);
state_clone.db.create_document(doc).await
});
handles.push(handle);
}
// Wait for all operations and count successes
let mut success_count = 0;
let mut error_count = 0;
for handle in handles {
match handle.await? {
Ok(_) => success_count += 1,
Err(_) => error_count += 1,
}
}
// Only one should succeed due to unique constraint
assert_eq!(success_count, 1, "Only one document should be created successfully");
assert_eq!(error_count, 4, "Four operations should fail due to duplicate hash");
Ok(())
}
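The race-condition test above shows that two concurrent inserts of the same content can both pass a pre-check, and that the database constraint is what ultimately prevents the duplicate. A sync implementation therefore has to treat the losing insert as benign. The wrapper below is a hypothetical sketch of that idea, not code from this commit; a stricter version would inspect the error code to confirm it is a unique-constraint violation before ignoring it.
// Hypothetical wrapper -- not part of this diff. If another task stored identical
// content between the duplicate pre-check and the insert, the failed insert is
// treated as "already ingested" rather than as a hard error.
async fn create_or_skip(db: &Database, doc: Document) -> anyhow::Result<bool> {
    match db.create_document(doc).await {
        // Newly ingested: the caller can continue with OCR queueing etc.
        Ok(_) => Ok(true),
        Err(err) => {
            // Assumed to be the (user_id, file_hash) unique-constraint violation
            println!("duplicate insert ignored: {}", err);
            Ok(false)
        }
    }
}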

View File

@ -0,0 +1,389 @@
use anyhow::Result;
use chrono::Utc;
use std::sync::Arc;
use uuid::Uuid;
use sha2::{Sha256, Digest};
use readur::{
AppState,
db::Database,
config::Config,
models::{FileInfo, CreateWebDAVFile, Document},
};
// Helper function to calculate file hash
fn calculate_file_hash(data: &[u8]) -> String {
let mut hasher = Sha256::new();
hasher.update(data);
let result = hasher.finalize();
format!("{:x}", result)
}
// Helper function to create test file info
fn create_test_file_info(name: &str, path: &str, size: i64) -> FileInfo {
FileInfo {
name: name.to_string(),
path: path.to_string(),
size,
last_modified: Some(Utc::now()),
etag: "test-etag".to_string(),
mime_type: "application/pdf".to_string(),
is_directory: false,
}
}
// Helper function to create test document
fn create_test_document(user_id: Uuid, filename: &str, file_hash: String) -> Document {
Document {
id: Uuid::new_v4(),
filename: filename.to_string(),
original_filename: filename.to_string(),
file_path: format!("/tmp/{}", filename),
file_size: 1024,
mime_type: "application/pdf".to_string(),
content: None,
ocr_text: None,
ocr_confidence: None,
ocr_word_count: None,
ocr_processing_time_ms: None,
ocr_status: Some("pending".to_string()),
ocr_error: None,
ocr_completed_at: None,
tags: Vec::new(),
created_at: Utc::now(),
updated_at: Utc::now(),
user_id,
file_hash: Some(file_hash),
}
}
// Mock WebDAV service for testing
#[derive(Clone)]
struct MockWebDAVService {
pub test_files: std::collections::HashMap<String, Vec<u8>>,
}
impl MockWebDAVService {
fn new() -> Self {
Self {
test_files: std::collections::HashMap::new(),
}
}
fn add_test_file(&mut self, path: &str, content: Vec<u8>) {
self.test_files.insert(path.to_string(), content);
}
async fn download_file(&self, path: &str) -> Result<Vec<u8>> {
self.test_files
.get(path)
.cloned()
.ok_or_else(|| anyhow::anyhow!("File not found: {}", path))
}
}
// Helper function to create a test user with a unique identifier
async fn create_test_user(db: &Database, username: &str) -> Result<Uuid> {
use readur::models::{CreateUser, UserRole};
let unique_suffix = Uuid::new_v4().simple();
let user = CreateUser {
username: format!("{}_{}", username, unique_suffix),
email: format!("{}_{}@example.com", username, unique_suffix),
password: "password123".to_string(),
role: Some(UserRole::User),
};
let created_user = db.create_user(user).await?;
Ok(created_user.id)
}
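// Helper function to build a test AppState backed by the local test database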
async fn create_test_app_state() -> Result<Arc<AppState>> {
let config = Config::from_env().unwrap_or_else(|_| {
Config {
database_url: "postgresql://readur:readur@localhost:5432/readur".to_string(),
server_address: "127.0.0.1:8000".to_string(),
jwt_secret: "test-secret".to_string(),
upload_path: "./test-uploads".to_string(),
watch_folder: "./test-watch".to_string(),
allowed_file_types: vec!["pdf".to_string(), "txt".to_string()],
watch_interval_seconds: Some(30),
file_stability_check_ms: Some(500),
max_file_age_hours: None,
ocr_language: "eng".to_string(),
concurrent_ocr_jobs: 2,
ocr_timeout_seconds: 60,
max_file_size_mb: 10,
memory_limit_mb: 256,
cpu_priority: "normal".to_string(),
}
});
let db = Database::new("postgresql://readur:readur@localhost:5432/readur").await?;
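// A single OCR worker is enough for these duplicate-detection tests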
let queue_service = std::sync::Arc::new(
readur::ocr_queue::OcrQueueService::new(db.clone(), db.get_pool().clone(), 1)
);
Ok(Arc::new(AppState {
db,
config,
webdav_scheduler: None,
source_scheduler: None,
queue_service,
}))
}
#[tokio::test]
async fn test_webdav_sync_duplicate_detection_skips_duplicate() -> Result<()> {
let state = create_test_app_state().await?;
let user_id = create_test_user(&state.db, "webdav_test").await?;
// Test content
let test_content = b"This is test PDF content for duplicate detection";
let file_hash = calculate_file_hash(test_content);
// Create existing document with same hash
let existing_doc = create_test_document(user_id, "existing.pdf", file_hash.clone());
state.db.create_document(existing_doc).await?;
// Setup mock WebDAV service
let mut webdav_service = MockWebDAVService::new();
webdav_service.add_test_file("/test/duplicate.pdf", test_content.to_vec());
// Create file info for the duplicate file
let file_info = create_test_file_info("duplicate.pdf", "/test/duplicate.pdf", test_content.len() as i64);
// The sync's process_single_file function is private, so we exercise the
// duplicate detection logic it relies on directly
// Check whether a duplicate exists using the new hash-based lookup
let duplicate_check = state.db.get_document_by_user_and_hash(user_id, &file_hash).await?;
assert!(duplicate_check.is_some(), "Should find existing document with same hash");
let found_doc = duplicate_check.unwrap();
assert_eq!(found_doc.file_hash, Some(file_hash));
assert_eq!(found_doc.user_id, user_id);
// Verify that WebDAV tracking would record this as a duplicate
let webdav_file = CreateWebDAVFile {
user_id,
webdav_path: file_info.path.clone(),
etag: file_info.etag.clone(),
last_modified: file_info.last_modified,
file_size: file_info.size,
mime_type: file_info.mime_type.clone(),
document_id: Some(found_doc.id),
sync_status: "duplicate_content".to_string(),
sync_error: None,
};
let created_webdav_file = state.db.create_or_update_webdav_file(&webdav_file).await?;
assert_eq!(created_webdav_file.sync_status, "duplicate_content");
assert_eq!(created_webdav_file.document_id, Some(found_doc.id));
Ok(())
}
#[tokio::test]
async fn test_webdav_sync_duplicate_detection_processes_unique() -> Result<()> {
let state = create_test_app_state().await?;
let user_id = create_test_user(&state.db, "webdav_test").await?;
// Test content
let test_content = b"This is unique PDF content that should be processed";
let file_hash = calculate_file_hash(test_content);
// Verify no existing document with this hash
let duplicate_check = state.db.get_document_by_user_and_hash(user_id, &file_hash).await?;
assert!(duplicate_check.is_none(), "Should not find any existing document with this hash");
// This indicates the file would be processed normally
// In the actual sync, this would proceed to save the file and create a new document
Ok(())
}
#[tokio::test]
async fn test_webdav_sync_duplicate_different_users() -> Result<()> {
let state = create_test_app_state().await?;
let user1_id = create_test_user(&state.db, "webdav_user1").await?;
let user2_id = create_test_user(&state.db, "webdav_user2").await?;
// Test content
let test_content = b"Shared content between different users";
let file_hash = calculate_file_hash(test_content);
// Create document for user1 with this hash
let user1_doc = create_test_document(user1_id, "user1.pdf", file_hash.clone());
state.db.create_document(user1_doc).await?;
// Check that user2 doesn't see user1's document as duplicate
let duplicate_check = state.db.get_document_by_user_and_hash(user2_id, &file_hash).await?;
assert!(duplicate_check.is_none(), "User2 should not see user1's document as duplicate");
// User2 should be able to create their own document with same hash
let user2_doc = create_test_document(user2_id, "user2.pdf", file_hash.clone());
let result = state.db.create_document(user2_doc).await;
assert!(result.is_ok(), "User2 should be able to create document with same hash");
Ok(())
}
#[tokio::test]
async fn test_webdav_sync_etag_change_detection() -> Result<()> {
let state = create_test_app_state().await?;
let user_id = create_test_user(&state.db, "webdav_test").await?;
let webdav_path = "/test/updated.pdf";
let old_etag = "old-etag-123";
let new_etag = "new-etag-456";
// Create a document first
let test_doc = create_test_document(user_id, "updated.pdf", "etag_test_hash_1234567890".to_string());
let created_doc = state.db.create_document(test_doc).await?;
// Create initial WebDAV file record
let initial_webdav_file = CreateWebDAVFile {
user_id,
webdav_path: webdav_path.to_string(),
etag: old_etag.to_string(),
last_modified: Some(Utc::now()),
file_size: 1024,
mime_type: "application/pdf".to_string(),
document_id: Some(created_doc.id),
sync_status: "synced".to_string(),
sync_error: None,
};
state.db.create_or_update_webdav_file(&initial_webdav_file).await?;
// Check existing WebDAV file
let existing_file = state.db.get_webdav_file_by_path(user_id, webdav_path).await?;
assert!(existing_file.is_some());
let existing_file = existing_file.unwrap();
assert_eq!(existing_file.etag, old_etag);
// Simulate file with new ETag (indicating change)
let file_info = FileInfo {
name: "updated.pdf".to_string(),
path: webdav_path.to_string(),
size: 1024,
last_modified: Some(Utc::now()),
etag: new_etag.to_string(),
mime_type: "application/pdf".to_string(),
is_directory: false,
};
// ETag comparison should detect change
assert_ne!(existing_file.etag, file_info.etag, "ETag change should be detected");
Ok(())
}
#[tokio::test]
async fn test_webdav_sync_hash_collision_prevention() -> Result<()> {
let state = create_test_app_state().await?;
let user_id = create_test_user(&state.db, "webdav_test").await?;
// Create document with specific hash
let test_hash = "abcd1234567890123456789012345678901234567890123456789012345678";
let document = create_test_document(user_id, "original.pdf", test_hash.to_string());
state.db.create_document(document).await?;
// Try to create another document with same hash (should fail due to unique constraint)
let duplicate_document = create_test_document(user_id, "duplicate.pdf", test_hash.to_string());
let result = state.db.create_document(duplicate_document).await;
assert!(result.is_err(), "Should not be able to create duplicate hash for same user");
Ok(())
}
#[tokio::test]
async fn test_webdav_sync_file_content_vs_metadata_change() -> Result<()> {
let state = create_test_app_state().await?;
let user_id = create_test_user(&state.db, "webdav_test").await?;
// Original content and hash
let original_content = b"Original file content";
let original_hash = calculate_file_hash(original_content);
// Create original document
let original_doc = create_test_document(user_id, "test.pdf", original_hash.clone());
state.db.create_document(original_doc).await?;
// Same content but different metadata (name, etc.) - should still be detected as duplicate
let duplicate_check = state.db.get_document_by_user_and_hash(user_id, &original_hash).await?;
assert!(duplicate_check.is_some(), "Same content should be detected as duplicate regardless of filename");
// Different content - should not be duplicate
let different_content = b"Different file content";
let different_hash = calculate_file_hash(different_content);
let unique_check = state.db.get_document_by_user_and_hash(user_id, &different_hash).await?;
assert!(unique_check.is_none(), "Different content should not be detected as duplicate");
Ok(())
}
#[tokio::test]
async fn test_webdav_sync_error_handling_invalid_hash() -> Result<()> {
let state = create_test_app_state().await?;
let user_id = create_test_user(&state.db, "webdav_test").await?;
// Test with invalid hash formats
let invalid_g_hash = "g".repeat(64);
let invalid_hashes = vec![
"", // Empty
"short", // Too short
"invalid_characters_!@#$", // Invalid characters
&invalid_g_hash, // Invalid hex (contains 'g')
];
for invalid_hash in invalid_hashes {
let result = state.db.get_document_by_user_and_hash(user_id, invalid_hash).await;
// Should handle gracefully - either return None or proper error
match result {
Ok(doc) => assert!(doc.is_none(), "Invalid hash should not match any document"),
Err(_) => {} // Acceptable to return error for invalid input
}
}
Ok(())
}
#[tokio::test]
async fn test_webdav_sync_concurrent_duplicate_detection() -> Result<()> {
let state = create_test_app_state().await?;
let user_id = create_test_user(&state.db, "webdav_test").await?;
let test_content = b"Concurrent test content";
let file_hash = calculate_file_hash(test_content);
// Simulate concurrent duplicate checks
let mut handles = Vec::new();
for _ in 0..5 {
let state_clone = state.clone();
let hash_clone = file_hash.clone();
let handle = tokio::spawn(async move {
state_clone.db.get_document_by_user_and_hash(user_id, &hash_clone).await
});
handles.push(handle);
}
// Wait for all concurrent operations
let mut all_none = true;
for handle in handles {
let result = handle.await??;
if result.is_some() {
all_none = false;
}
}
// Since no document exists with this hash, all should return None
assert!(all_none, "All concurrent checks should return None for non-existent hash");
Ok(())
}