feat(client): update failedOcr page for duplicates

This commit is contained in:
perf3ct 2025-06-17 16:52:45 +00:00
parent 80d58b0f28
commit 24e7dff9a5
2 changed files with 132 additions and 39 deletions

View File

@ -29,6 +29,7 @@ import {
Snackbar, Snackbar,
Tabs, Tabs,
Tab, Tab,
useTheme,
} from '@mui/material'; } from '@mui/material';
import { import {
Refresh as RefreshIcon, Refresh as RefreshIcon,
@ -40,6 +41,8 @@ import {
Visibility as VisibilityIcon, Visibility as VisibilityIcon,
Download as DownloadIcon, Download as DownloadIcon,
FileCopy as FileCopyIcon, FileCopy as FileCopyIcon,
Delete as DeleteIcon,
FindInPage as FindInPageIcon,
} from '@mui/icons-material'; } from '@mui/icons-material';
import { format } from 'date-fns'; import { format } from 'date-fns';
import { api, documentService } from '../services/api'; import { api, documentService } from '../services/api';
@ -122,6 +125,7 @@ interface DuplicatesResponse {
} }
const FailedOcrPage: React.FC = () => { const FailedOcrPage: React.FC = () => {
const theme = useTheme();
const [currentTab, setCurrentTab] = useState(0); const [currentTab, setCurrentTab] = useState(0);
const [documents, setDocuments] = useState<FailedDocument[]>([]); const [documents, setDocuments] = useState<FailedDocument[]>([]);
const [duplicates, setDuplicates] = useState<DuplicateGroup[]>([]); const [duplicates, setDuplicates] = useState<DuplicateGroup[]>([]);
@ -580,9 +584,19 @@ const FailedOcrPage: React.FC = () => {
) : ( ) : (
<> <>
<Alert severity="info" sx={{ mb: 2 }}> <Alert severity="info" sx={{ mb: 2 }}>
<AlertTitle>Duplicate Documents</AlertTitle> <AlertTitle>Duplicate Documents Found</AlertTitle>
These documents have identical content but may have different filenames. These documents have identical content but may have different filenames.
You can click on each group to see all the documents with the same content. You can expand each group to see all files with the same content and choose which ones to keep.
</Alert>
<Alert severity="warning" sx={{ mb: 2 }}>
<AlertTitle>What should you do?</AlertTitle>
<Box component="ul" sx={{ mt: 1, mb: 0, pl: 2 }}>
<li><strong>Review each group:</strong> Click to expand and see all duplicate files</li>
<li><strong>Keep the best version:</strong> Choose the file with the most descriptive name</li>
<li><strong>Check content:</strong> Use View/Download to verify files are truly identical</li>
<li><strong>Note for admin:</strong> Consider implementing bulk delete functionality for duplicates</li>
</Box>
</Alert> </Alert>
<TableContainer component={Paper}> <TableContainer component={Paper}>
@ -640,34 +654,92 @@ const FailedOcrPage: React.FC = () => {
<TableRow> <TableRow>
<TableCell sx={{ paddingBottom: 0, paddingTop: 0 }} colSpan={6}> <TableCell sx={{ paddingBottom: 0, paddingTop: 0 }} colSpan={6}>
<Collapse in={expandedDuplicateGroups.has(group.file_hash)} timeout="auto" unmountOnExit> <Collapse in={expandedDuplicateGroups.has(group.file_hash)} timeout="auto" unmountOnExit>
<Box sx={{ margin: 1, p: 2, bgcolor: 'grey.50' }}> <Box
<Typography variant="h6" gutterBottom> sx={{
margin: 1,
p: 3,
background: theme.palette.mode === 'light'
? 'rgba(248, 250, 252, 0.8)'
: 'rgba(30, 30, 30, 0.8)',
backdropFilter: 'blur(10px)',
borderRadius: 2,
border: `1px solid ${theme.palette.divider}`,
}}
>
<Typography variant="h6" gutterBottom sx={{
color: theme.palette.primary.main,
display: 'flex',
alignItems: 'center',
gap: 1
}}>
<FileCopyIcon />
Duplicate Files ({group.duplicate_count} total) Duplicate Files ({group.duplicate_count} total)
</Typography> </Typography>
<Alert severity="info" sx={{ mb: 2, fontSize: '0.875rem' }}>
<strong>Storage Impact:</strong> These {group.duplicate_count} files contain identical content.
Consider keeping only the best-named version to save space.
</Alert>
<Grid container spacing={2}> <Grid container spacing={2}>
{group.documents.map((doc, index) => ( {group.documents.map((doc, index) => (
<Grid item xs={12} md={6} key={doc.id}> <Grid item xs={12} md={6} lg={4} key={doc.id}>
<Card variant="outlined"> <Card
variant="outlined"
sx={{
background: theme.palette.mode === 'light'
? 'rgba(255, 255, 255, 0.9)'
: 'rgba(40, 40, 40, 0.9)',
backdropFilter: 'blur(5px)',
border: `1px solid ${theme.palette.divider}`,
transition: 'all 0.2s ease',
'&:hover': {
transform: 'translateY(-2px)',
boxShadow: theme.shadows[4],
}
}}
>
<CardContent> <CardContent>
<Typography variant="body2" fontWeight="bold"> <Box display="flex" justifyContent="space-between" alignItems="flex-start" mb={1}>
<Typography variant="body2" fontWeight="bold" sx={{
color: theme.palette.text.primary,
wordBreak: 'break-word',
flex: 1,
mr: 1
}}>
{doc.filename} {doc.filename}
</Typography> </Typography>
{index === 0 && (
<Chip
label="First"
size="small"
color="primary"
variant="outlined"
/>
)}
</Box>
{doc.original_filename !== doc.filename && ( {doc.original_filename !== doc.filename && (
<Typography variant="caption" color="text.secondary"> <Typography variant="caption" color="text.secondary" display="block">
Original: {doc.original_filename} Original: {doc.original_filename}
</Typography> </Typography>
)} )}
<Typography variant="caption" display="block" color="text.secondary">
<Typography variant="caption" display="block" color="text.secondary" sx={{ mb: 1 }}>
{formatFileSize(doc.file_size)} {doc.mime_type} {formatFileSize(doc.file_size)} {doc.mime_type}
</Typography> </Typography>
<Typography variant="caption" display="block" color="text.secondary">
<Typography variant="caption" display="block" color="text.secondary" sx={{ mb: 2 }}>
Uploaded: {format(new Date(doc.created_at), 'MMM dd, yyyy HH:mm')} Uploaded: {format(new Date(doc.created_at), 'MMM dd, yyyy HH:mm')}
</Typography> </Typography>
<Box mt={1}>
<Box display="flex" justifyContent="space-between" alignItems="center">
<Box>
<Tooltip title="View Document"> <Tooltip title="View Document">
<IconButton <IconButton
size="small" size="small"
onClick={() => window.open(`/api/documents/${doc.id}/view`, '_blank')} onClick={() => window.open(`/api/documents/${doc.id}/view`, '_blank')}
sx={{ color: theme.palette.primary.main }}
> >
<VisibilityIcon /> <VisibilityIcon />
</IconButton> </IconButton>
@ -676,11 +748,32 @@ const FailedOcrPage: React.FC = () => {
<IconButton <IconButton
size="small" size="small"
onClick={() => window.open(`/api/documents/${doc.id}/download`, '_blank')} onClick={() => window.open(`/api/documents/${doc.id}/download`, '_blank')}
sx={{ color: theme.palette.secondary.main }}
> >
<DownloadIcon /> <DownloadIcon />
</IconButton> </IconButton>
</Tooltip> </Tooltip>
</Box> </Box>
<Tooltip title="Get document details and duplicate information">
<Button
size="small"
variant="outlined"
color="info"
startIcon={<FindInPageIcon />}
sx={{ fontSize: '0.75rem' }}
onClick={() => {
setSnackbar({
open: true,
message: `Document "${doc.filename}" has ${group.duplicate_count - 1} duplicate(s). Content hash: ${group.file_hash.substring(0, 16)}...`,
severity: 'info'
});
}}
>
Info
</Button>
</Tooltip>
</Box>
</CardContent> </CardContent>
</Card> </Card>
</Grid> </Grid>

View File

@ -128,7 +128,7 @@ impl Database {
// Admin with OCR filter // Admin with OCR filter
sqlx::query( sqlx::query(
r#" r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
FROM documents FROM documents
WHERE ocr_status = $3 WHERE ocr_status = $3
ORDER BY created_at DESC ORDER BY created_at DESC
@ -145,7 +145,7 @@ impl Database {
// Admin without OCR filter // Admin without OCR filter
sqlx::query( sqlx::query(
r#" r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
FROM documents FROM documents
ORDER BY created_at DESC ORDER BY created_at DESC
LIMIT $1 OFFSET $2 LIMIT $1 OFFSET $2
@ -160,7 +160,7 @@ impl Database {
// Regular user with OCR filter // Regular user with OCR filter
sqlx::query( sqlx::query(
r#" r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
FROM documents FROM documents
WHERE user_id = $3 AND ocr_status = $4 WHERE user_id = $3 AND ocr_status = $4
ORDER BY created_at DESC ORDER BY created_at DESC
@ -178,7 +178,7 @@ impl Database {
// Regular user without OCR filter // Regular user without OCR filter
sqlx::query( sqlx::query(
r#" r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
FROM documents FROM documents
WHERE user_id = $3 WHERE user_id = $3
ORDER BY created_at DESC ORDER BY created_at DESC
@ -267,7 +267,7 @@ impl Database {
pub async fn get_documents_by_user(&self, user_id: Uuid, limit: i64, offset: i64) -> Result<Vec<Document>> { pub async fn get_documents_by_user(&self, user_id: Uuid, limit: i64, offset: i64) -> Result<Vec<Document>> {
let rows = sqlx::query( let rows = sqlx::query(
r#" r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
FROM documents FROM documents
WHERE user_id = $1 WHERE user_id = $1
ORDER BY created_at DESC ORDER BY created_at DESC
@ -311,7 +311,7 @@ impl Database {
pub async fn find_documents_by_filename(&self, filename: &str) -> Result<Vec<Document>> { pub async fn find_documents_by_filename(&self, filename: &str) -> Result<Vec<Document>> {
let rows = sqlx::query( let rows = sqlx::query(
r#" r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
FROM documents FROM documents
WHERE filename = $1 OR original_filename = $1 WHERE filename = $1 OR original_filename = $1
ORDER BY created_at DESC ORDER BY created_at DESC
@ -352,7 +352,7 @@ impl Database {
pub async fn search_documents(&self, user_id: Uuid, search: SearchRequest) -> Result<(Vec<Document>, i64)> { pub async fn search_documents(&self, user_id: Uuid, search: SearchRequest) -> Result<(Vec<Document>, i64)> {
let mut query_builder = QueryBuilder::new( let mut query_builder = QueryBuilder::new(
r#" r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash,
ts_rank(to_tsvector('english', COALESCE(content, '') || ' ' || COALESCE(ocr_text, '')), plainto_tsquery('english', "# ts_rank(to_tsvector('english', COALESCE(content, '') || ' ' || COALESCE(ocr_text, '')), plainto_tsquery('english', "#
); );
@ -455,7 +455,7 @@ impl Database {
// Use trigram similarity for substring matching // Use trigram similarity for substring matching
let mut builder = QueryBuilder::new( let mut builder = QueryBuilder::new(
r#" r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash,
GREATEST( GREATEST(
similarity(filename, "# similarity(filename, "#
); );
@ -498,7 +498,7 @@ impl Database {
let mut builder = QueryBuilder::new(&format!( let mut builder = QueryBuilder::new(&format!(
r#" r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash,
GREATEST( GREATEST(
CASE WHEN filename ILIKE '%' || "# CASE WHEN filename ILIKE '%' || "#
)); ));
@ -644,7 +644,7 @@ impl Database {
// Use trigram similarity for substring matching // Use trigram similarity for substring matching
let mut builder = QueryBuilder::new( let mut builder = QueryBuilder::new(
r#" r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash,
GREATEST( GREATEST(
similarity(filename, "# similarity(filename, "#
); );
@ -683,7 +683,7 @@ impl Database {
let mut builder = QueryBuilder::new(&format!( let mut builder = QueryBuilder::new(&format!(
r#" r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash,
GREATEST( GREATEST(
CASE WHEN filename ILIKE '%' || "# CASE WHEN filename ILIKE '%' || "#
)); ));
@ -1080,14 +1080,14 @@ impl Database {
let query = if user_role == crate::models::UserRole::Admin { let query = if user_role == crate::models::UserRole::Admin {
// Admins can see any document // Admins can see any document
r#" r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
FROM documents FROM documents
WHERE id = $1 WHERE id = $1
"# "#
} else { } else {
// Regular users can only see their own documents // Regular users can only see their own documents
r#" r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
FROM documents FROM documents
WHERE id = $1 AND user_id = $2 WHERE id = $1 AND user_id = $2
"# "#