feat(client): update failedOcr page for duplicates

2025-06-17 16:52:45 +00:00 · 2025-06-17 16:52:45 +00:00 · 24e7dff9a5
parent 80d58b0f28
commit 24e7dff9a5
2 changed files with 132 additions and 39 deletions
--- a/frontend/src/pages/FailedOcrPage.tsx
+++ b/frontend/src/pages/FailedOcrPage.tsx
@@ -29,6 +29,7 @@ import {
  Snackbar,
  Tabs,
  Tab,
+  useTheme,
 } from '@mui/material';
 import {
  Refresh as RefreshIcon,
@@ -40,6 +41,8 @@ import {
  Visibility as VisibilityIcon,
  Download as DownloadIcon,
  FileCopy as FileCopyIcon,
+  Delete as DeleteIcon,
+  FindInPage as FindInPageIcon,
 } from '@mui/icons-material';
 import { format } from 'date-fns';
 import { api, documentService } from '../services/api';
@@ -122,6 +125,7 @@ interface DuplicatesResponse {
 }

 const FailedOcrPage: React.FC = () => {
+  const theme = useTheme();
  const [currentTab, setCurrentTab] = useState(0);
  const [documents, setDocuments] = useState<FailedDocument[]>([]);
  const [duplicates, setDuplicates] = useState<DuplicateGroup[]>([]);
@@ -580,9 +584,19 @@ const FailedOcrPage: React.FC = () => {
          ) : (
            <>
              <Alert severity="info" sx={{ mb: 2 }}>
-                <AlertTitle>Duplicate Documents</AlertTitle>
+                <AlertTitle>Duplicate Documents Found</AlertTitle>
                These documents have identical content but may have different filenames. 
-                You can click on each group to see all the documents with the same content.
+                You can expand each group to see all files with the same content and choose which ones to keep.
+              </Alert>
+
+              <Alert severity="warning" sx={{ mb: 2 }}>
+                <AlertTitle>What should you do?</AlertTitle>
+                <Box component="ul" sx={{ mt: 1, mb: 0, pl: 2 }}>
+                  <li><strong>Review each group:</strong> Click to expand and see all duplicate files</li>
+                  <li><strong>Keep the best version:</strong> Choose the file with the most descriptive name</li>
+                  <li><strong>Check content:</strong> Use View/Download to verify files are truly identical</li>
+                  <li><strong>Note for admin:</strong> Consider implementing bulk delete functionality for duplicates</li>
+                </Box>
              </Alert>

              <TableContainer component={Paper}>
@@ -640,45 +654,124 @@ const FailedOcrPage: React.FC = () => {
                        <TableRow>
                          <TableCell sx={{ paddingBottom: 0, paddingTop: 0 }} colSpan={6}>
                            <Collapse in={expandedDuplicateGroups.has(group.file_hash)} timeout="auto" unmountOnExit>
-                              <Box sx={{ margin: 1, p: 2, bgcolor: 'grey.50' }}>
-                                <Typography variant="h6" gutterBottom>
+                              <Box 
+                                sx={{ 
+                                  margin: 1, 
+                                  p: 3,
+                                  background: theme.palette.mode === 'light' 
+                                    ? 'rgba(248, 250, 252, 0.8)' 
+                                    : 'rgba(30, 30, 30, 0.8)',
+                                  backdropFilter: 'blur(10px)',
+                                  borderRadius: 2,
+                                  border: `1px solid ${theme.palette.divider}`,
+                                }}
+                              >
+                                <Typography variant="h6" gutterBottom sx={{ 
+                                  color: theme.palette.primary.main,
+                                  display: 'flex',
+                                  alignItems: 'center',
+                                  gap: 1
+                                }}>
+                                  <FileCopyIcon />
                                  Duplicate Files ({group.duplicate_count} total)
                                </Typography>
+                                
+                                <Alert severity="info" sx={{ mb: 2, fontSize: '0.875rem' }}>
+                                  <strong>Storage Impact:</strong> These {group.duplicate_count} files contain identical content. 
+                                  Consider keeping only the best-named version to save space.
+                                </Alert>
+
                                <Grid container spacing={2}>
                                  {group.documents.map((doc, index) => (
-                                    <Grid item xs={12} md={6} key={doc.id}>
-                                      <Card variant="outlined">
+                                    <Grid item xs={12} md={6} lg={4} key={doc.id}>
+                                      <Card 
+                                        variant="outlined"
+                                        sx={{
+                                          background: theme.palette.mode === 'light'
+                                            ? 'rgba(255, 255, 255, 0.9)'
+                                            : 'rgba(40, 40, 40, 0.9)',
+                                          backdropFilter: 'blur(5px)',
+                                          border: `1px solid ${theme.palette.divider}`,
+                                          transition: 'all 0.2s ease',
+                                          '&:hover': {
+                                            transform: 'translateY(-2px)',
+                                            boxShadow: theme.shadows[4],
+                                          }
+                                        }}
+                                      >
                                        <CardContent>
-                                          <Typography variant="body2" fontWeight="bold">
-                                            {doc.filename}
-                                          </Typography>
+                                          <Box display="flex" justifyContent="space-between" alignItems="flex-start" mb={1}>
+                                            <Typography variant="body2" fontWeight="bold" sx={{ 
+                                              color: theme.palette.text.primary,
+                                              wordBreak: 'break-word',
+                                              flex: 1,
+                                              mr: 1
+                                            }}>
+                                              {doc.filename}
+                                            </Typography>
+                                            {index === 0 && (
+                                              <Chip 
+                                                label="First" 
+                                                size="small" 
+                                                color="primary" 
+                                                variant="outlined"
+                                              />
+                                            )}
+                                          </Box>
+                                          
                                          {doc.original_filename !== doc.filename && (
-                                            <Typography variant="caption" color="text.secondary">
+                                            <Typography variant="caption" color="text.secondary" display="block">
                                              Original: {doc.original_filename}
                                            </Typography>
                                          )}
-                                          <Typography variant="caption" display="block" color="text.secondary">
+                                          
+                                          <Typography variant="caption" display="block" color="text.secondary" sx={{ mb: 1 }}>
                                            {formatFileSize(doc.file_size)} • {doc.mime_type}
                                          </Typography>
-                                          <Typography variant="caption" display="block" color="text.secondary">
+                                          
+                                          <Typography variant="caption" display="block" color="text.secondary" sx={{ mb: 2 }}>
                                            Uploaded: {format(new Date(doc.created_at), 'MMM dd, yyyy HH:mm')}
                                          </Typography>
-                                          <Box mt={1}>
-                                            <Tooltip title="View Document">
-                                              <IconButton
+                                          
+                                          <Box display="flex" justifyContent="space-between" alignItems="center">
+                                            <Box>
+                                              <Tooltip title="View Document">
+                                                <IconButton
+                                                  size="small"
+                                                  onClick={() => window.open(`/api/documents/${doc.id}/view`, '_blank')}
+                                                  sx={{ color: theme.palette.primary.main }}
+                                                >
+                                                  <VisibilityIcon />
+                                                </IconButton>
+                                              </Tooltip>
+                                              <Tooltip title="Download Document">
+                                                <IconButton
+                                                  size="small"
+                                                  onClick={() => window.open(`/api/documents/${doc.id}/download`, '_blank')}
+                                                  sx={{ color: theme.palette.secondary.main }}
+                                                >
+                                                  <DownloadIcon />
+                                                </IconButton>
+                                              </Tooltip>
+                                            </Box>
+                                            
+                                            <Tooltip title="Get document details and duplicate information">
+                                              <Button
                                                size="small"
-                                                onClick={() => window.open(`/api/documents/${doc.id}/view`, '_blank')}
+                                                variant="outlined"
+                                                color="info"
+                                                startIcon={<FindInPageIcon />}
+                                                sx={{ fontSize: '0.75rem' }}
+                                                onClick={() => {
+                                                  setSnackbar({
+                                                    open: true,
+                                                    message: `Document "${doc.filename}" has ${group.duplicate_count - 1} duplicate(s). Content hash: ${group.file_hash.substring(0, 16)}...`,
+                                                    severity: 'info'
+                                                  });
+                                                }}
                                              >
-                                                <VisibilityIcon />
-                                              </IconButton>
-                                            </Tooltip>
-                                            <Tooltip title="Download Document">
-                                              <IconButton
-                                                size="small"
-                                                onClick={() => window.open(`/api/documents/${doc.id}/download`, '_blank')}
-                                              >
-                                                <DownloadIcon />
-                                              </IconButton>
+                                                Info
+                                              </Button>
                                            </Tooltip>
                                          </Box>
                                        </CardContent>
--- a/src/db/documents.rs
+++ b/src/db/documents.rs
@@ -128,7 +128,7 @@ impl Database {
                // Admin with OCR filter
                sqlx::query(
                    r#"
-                    SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id
+                    SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
                    FROM documents 
                    WHERE ocr_status = $3
                    ORDER BY created_at DESC 
@@ -145,7 +145,7 @@ impl Database {
                // Admin without OCR filter
                sqlx::query(
                    r#"
-                    SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id
+                    SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
                    FROM documents 
                    ORDER BY created_at DESC 
                    LIMIT $1 OFFSET $2
@@ -160,7 +160,7 @@ impl Database {
                // Regular user with OCR filter
                sqlx::query(
                    r#"
-                    SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id
+                    SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
                    FROM documents 
                    WHERE user_id = $3 AND ocr_status = $4
                    ORDER BY created_at DESC 
@@ -178,7 +178,7 @@ impl Database {
                // Regular user without OCR filter
                sqlx::query(
                    r#"
-                    SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id
+                    SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
                    FROM documents 
                    WHERE user_id = $3 
                    ORDER BY created_at DESC 
@@ -267,7 +267,7 @@ impl Database {
    pub async fn get_documents_by_user(&self, user_id: Uuid, limit: i64, offset: i64) -> Result<Vec<Document>> {
        let rows = sqlx::query(
            r#"
-            SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id
+            SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
            FROM documents 
            WHERE user_id = $1 
            ORDER BY created_at DESC 
@@ -311,7 +311,7 @@ impl Database {
    pub async fn find_documents_by_filename(&self, filename: &str) -> Result<Vec<Document>> {
        let rows = sqlx::query(
            r#"
-            SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id
+            SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
            FROM documents 
            WHERE filename = $1 OR original_filename = $1
            ORDER BY created_at DESC
@@ -352,7 +352,7 @@ impl Database {
    pub async fn search_documents(&self, user_id: Uuid, search: SearchRequest) -> Result<(Vec<Document>, i64)> {
        let mut query_builder = QueryBuilder::new(
            r#"
-            SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id,
+            SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash,
                   ts_rank(to_tsvector('english', COALESCE(content, '') || ' ' || COALESCE(ocr_text, '')), plainto_tsquery('english', "# 
        );
        
@@ -455,7 +455,7 @@ impl Database {
            // Use trigram similarity for substring matching
            let mut builder = QueryBuilder::new(
                r#"
-                SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id,
+                SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash,
                       GREATEST(
                           similarity(filename, "#
            );
@@ -498,7 +498,7 @@ impl Database {

            let mut builder = QueryBuilder::new(&format!(
                r#"
-                SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id,
+                SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash,
                       GREATEST(
                           CASE WHEN filename ILIKE '%' || "#
            ));
@@ -644,7 +644,7 @@ impl Database {
            // Use trigram similarity for substring matching
            let mut builder = QueryBuilder::new(
                r#"
-                SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id,
+                SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash,
                       GREATEST(
                           similarity(filename, "#
            );
@@ -683,7 +683,7 @@ impl Database {

            let mut builder = QueryBuilder::new(&format!(
                r#"
-                SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id,
+                SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash,
                       GREATEST(
                           CASE WHEN filename ILIKE '%' || "#
            ));
@@ -1080,14 +1080,14 @@ impl Database {
        let query = if user_role == crate::models::UserRole::Admin {
            // Admins can see any document
            r#"
-            SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id
+            SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
            FROM documents 
            WHERE id = $1
            "#
        } else {
            // Regular users can only see their own documents
            r#"
-            SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id
+            SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
            FROM documents 
            WHERE id = $1 AND user_id = $2
            "#