feat(server/client): add failed_documents table to track failures at any stage, and move failure-handling logic into it

This commit is contained in:
perf3ct 2025-06-28 20:52:58 +00:00
parent fce56b660b
commit 84577806ef
9 changed files with 936 additions and 20 deletions

View File

@@ -36,6 +36,7 @@ import {
Label as LabelIcon,
Block as BlockIcon,
Api as ApiIcon,
ManageAccounts as ManageIcon,
} from '@mui/icons-material';
import { useNavigate, useLocation } from 'react-router-dom';
import { useAuth } from '../../contexts/AuthContext';
@@ -69,7 +70,7 @@ const navigationItems: NavigationItem[] = [
{ text: 'Labels', icon: LabelIcon, path: '/labels' },
{ text: 'Sources', icon: StorageIcon, path: '/sources' },
{ text: 'Watch Folder', icon: FolderIcon, path: '/watch' },
{ text: 'Failed OCR', icon: ErrorIcon, path: '/failed-ocr' },
{ text: 'Document Management', icon: ManageIcon, path: '/failed-ocr' },
{ text: 'Ignored Files', icon: BlockIcon, path: '/ignored-files' },
];

View File

@@ -47,6 +47,7 @@ import {
Delete as DeleteIcon,
FindInPage as FindInPageIcon,
OpenInNew as OpenInNewIcon,
Warning as WarningIcon,
} from '@mui/icons-material';
import { format } from 'date-fns';
import { api, documentService, queueService } from '../services/api';
@@ -135,16 +136,22 @@ const FailedOcrPage: React.FC = () => {
const [currentTab, setCurrentTab] = useState(0);
const [documents, setDocuments] = useState<FailedDocument[]>([]);
const [duplicates, setDuplicates] = useState<DuplicateGroup[]>([]);
const [failedDocuments, setFailedDocuments] = useState<any[]>([]);
const [loading, setLoading] = useState(true);
const [duplicatesLoading, setDuplicatesLoading] = useState(false);
const [failedDocumentsLoading, setFailedDocumentsLoading] = useState(false);
const [failedDocumentsFilters, setFailedDocumentsFilters] = useState<{ stage?: string; reason?: string }>({});
const [selectedFailedDocument, setSelectedFailedDocument] = useState<any>(null);
const [retrying, setRetrying] = useState<string | null>(null);
const [retryingAll, setRetryingAll] = useState(false);
const [statistics, setStatistics] = useState<FailedOcrResponse['statistics'] | null>(null);
const [duplicateStatistics, setDuplicateStatistics] = useState<DuplicatesResponse['statistics'] | null>(null);
const [pagination, setPagination] = useState({ page: 1, limit: 25 });
const [duplicatesPagination, setDuplicatesPagination] = useState({ page: 1, limit: 25 });
const [failedDocumentsPagination, setFailedDocumentsPagination] = useState({ page: 1, limit: 25 });
const [totalPages, setTotalPages] = useState(0);
const [duplicatesTotalPages, setDuplicatesTotalPages] = useState(0);
const [failedDocumentsTotalPages, setFailedDocumentsTotalPages] = useState(0);
const [selectedDocument, setSelectedDocument] = useState<FailedDocument | null>(null);
const [detailsOpen, setDetailsOpen] = useState(false);
const [expandedRows, setExpandedRows] = useState<Set<string>>(new Set());
@@ -223,8 +230,59 @@ const FailedOcrPage: React.FC = () => {
useEffect(() => {
if (currentTab === 1) {
fetchDuplicates();
} else if (currentTab === 4) {
fetchFailedDocumentsList();
}
}, [currentTab, duplicatesPagination.page]);
}, [currentTab, duplicatesPagination.page, failedDocumentsPagination.page, failedDocumentsFilters]);
const fetchFailedDocumentsList = async () => {
try {
setFailedDocumentsLoading(true);
const offset = (failedDocumentsPagination.page - 1) * failedDocumentsPagination.limit;
const response = await documentService.getFailedDocuments(
failedDocumentsPagination.limit,
offset,
failedDocumentsFilters.stage,
failedDocumentsFilters.reason
);
if (response?.data) {
setFailedDocuments(response.data.documents || []);
if (response.data.pagination) {
setFailedDocumentsTotalPages(Math.ceil(response.data.pagination.total / failedDocumentsPagination.limit));
}
}
} catch (error) {
console.error('Failed to fetch failed documents:', error);
setSnackbar({
open: true,
message: 'Failed to load failed documents',
severity: 'error'
});
} finally {
setFailedDocumentsLoading(false);
}
};
const getFailureReasonColor = (reason: string): "error" | "warning" | "info" | "default" => {
switch (reason) {
case 'low_ocr_confidence':
case 'ocr_timeout':
case 'ocr_memory_limit':
case 'pdf_parsing_error':
return 'error';
case 'duplicate_content':
case 'unsupported_format':
case 'file_too_large':
return 'warning';
case 'file_corrupted':
case 'access_denied':
case 'permission_denied':
return 'error';
default:
return 'default';
}
};
const handleRetryOcr = async (document: FailedDocument) => {
try {
@@ -309,6 +367,8 @@ const FailedOcrPage: React.FC = () => {
case 'Timeout':
case 'Memory Limit':
return 'error';
case 'Low OCR Confidence':
return 'warning';
case 'Unknown Error':
return 'info';
default:
@@ -354,6 +414,8 @@ const FailedOcrPage: React.FC = () => {
handlePreviewLowConfidence();
} else if (currentTab === 3) {
handlePreviewFailedDocuments();
} else if (currentTab === 4) {
fetchFailedDocumentsList();
}
};
@@ -488,22 +550,27 @@ const FailedOcrPage: React.FC = () => {
<Tabs value={currentTab} onChange={handleTabChange} aria-label="document management tabs">
<Tab
icon={<ErrorIcon />}
label={`Failed OCR${statistics ? ` (${statistics.total_failed})` : ''}`}
label={`Failed Documents${statistics ? ` (${statistics.total_failed})` : ''}`}
iconPosition="start"
/>
<Tab
icon={<FileCopyIcon />}
label={`Duplicates${duplicateStatistics ? ` (${duplicateStatistics.total_duplicate_groups})` : ''}`}
label={`Duplicate Files${duplicateStatistics ? ` (${duplicateStatistics.total_duplicate_groups})` : ''}`}
iconPosition="start"
/>
<Tab
icon={<FindInPageIcon />}
label={`Low Confidence${previewData ? ` (${previewData.matched_count})` : ''}`}
label={`Low Quality Manager${previewData ? ` (${previewData.matched_count})` : ''}`}
iconPosition="start"
/>
<Tab
icon={<DeleteIcon />}
label="Delete Failed"
label="Bulk Cleanup"
iconPosition="start"
/>
<Tab
icon={<WarningIcon />}
label="Failed Documents"
iconPosition="start"
/>
</Tabs>
@@ -1073,15 +1140,62 @@ const FailedOcrPage: React.FC = () => {
<Typography color={previewData.matched_count > 0 ? 'warning.main' : 'success.main'}>
{previewData.message}
</Typography>
{previewData.matched_count > 0 && (
{previewData.matched_count > 0 && previewData.documents && (
<Box sx={{ mt: 2 }}>
<Typography variant="body2" color="text.secondary">
Document IDs that would be deleted:
<Typography variant="body2" color="text.secondary" gutterBottom>
Documents that would be deleted:
</Typography>
<Typography variant="body2" sx={{ fontFamily: 'monospace', wordBreak: 'break-all' }}>
{previewData.document_ids.slice(0, 10).join(', ')}
{previewData.document_ids.length > 10 && ` ... and ${previewData.document_ids.length - 10} more`}
<TableContainer component={Paper} variant="outlined" sx={{ mt: 2 }}>
<Table size="small">
<TableHead>
<TableRow>
<TableCell>Filename</TableCell>
<TableCell>Size</TableCell>
<TableCell>OCR Confidence</TableCell>
<TableCell>Status</TableCell>
<TableCell>Date</TableCell>
</TableRow>
</TableHead>
<TableBody>
{previewData.documents.slice(0, 20).map((doc: any) => (
<TableRow key={doc.id}>
<TableCell>
<Typography variant="body2" noWrap>
{doc.original_filename || doc.filename}
</Typography>
</TableCell>
<TableCell>
<Typography variant="body2">
{formatFileSize(doc.file_size)}
</Typography>
</TableCell>
<TableCell>
<Typography variant="body2" color={doc.ocr_confidence ? 'warning.main' : 'error.main'}>
{doc.ocr_confidence ? `${doc.ocr_confidence.toFixed(1)}%` : 'N/A'}
</Typography>
</TableCell>
<TableCell>
<Chip
size="small"
label={doc.ocr_status || 'Unknown'}
color={doc.ocr_status === 'failed' ? 'error' : 'default'}
/>
</TableCell>
<TableCell>
<Typography variant="body2">
{new Date(doc.created_at).toLocaleDateString()}
</Typography>
</TableCell>
</TableRow>
))}
</TableBody>
</Table>
</TableContainer>
{previewData.documents.length > 20 && (
<Typography variant="body2" color="text.secondary" sx={{ mt: 1 }}>
... and {previewData.documents.length - 20} more documents
</Typography>
)}
</Box>
)}
</CardContent>
@@ -1175,6 +1289,187 @@ const FailedOcrPage: React.FC = () => {
</>
)}
{/* Failed Documents Tab Content */}
{currentTab === 4 && (
<>
<Alert severity="info" sx={{ mb: 3 }}>
<AlertTitle>Failed Documents Overview</AlertTitle>
<Typography>
This shows all documents that failed at any stage of processing: ingestion, validation, OCR, storage, etc.
Use the filters to narrow down by failure stage or specific reason.
</Typography>
</Alert>
{/* Filter Controls */}
<Card sx={{ mb: 3 }}>
<CardContent>
<Grid container spacing={3} alignItems="center">
<Grid item xs={12} md={3}>
<TextField
label="Filter by Stage"
select
value={failedDocumentsFilters.stage || ''}
onChange={(e) => setFailedDocumentsFilters(prev => ({ ...prev, stage: e.target.value || undefined }))}
fullWidth
SelectProps={{ native: true }}
>
<option value="">All Stages</option>
<option value="ocr">OCR Processing</option>
<option value="ingestion">Document Ingestion</option>
<option value="validation">Validation</option>
<option value="storage">File Storage</option>
<option value="processing">Processing</option>
<option value="sync">Synchronization</option>
</TextField>
</Grid>
<Grid item xs={12} md={3}>
<TextField
label="Filter by Reason"
select
value={failedDocumentsFilters.reason || ''}
onChange={(e) => setFailedDocumentsFilters(prev => ({ ...prev, reason: e.target.value || undefined }))}
fullWidth
SelectProps={{ native: true }}
>
<option value="">All Reasons</option>
<option value="duplicate_content">Duplicate Content</option>
<option value="low_ocr_confidence">Low OCR Confidence</option>
<option value="unsupported_format">Unsupported Format</option>
<option value="file_too_large">File Too Large</option>
<option value="file_corrupted">File Corrupted</option>
<option value="ocr_timeout">OCR Timeout</option>
<option value="pdf_parsing_error">PDF Parsing Error</option>
<option value="other">Other</option>
</TextField>
</Grid>
<Grid item xs={12} md={2}>
<Button
variant="outlined"
onClick={fetchFailedDocumentsList}
disabled={failedDocumentsLoading}
startIcon={failedDocumentsLoading ? <CircularProgress size={20} /> : <RefreshIcon />}
fullWidth
>
Apply Filters
</Button>
</Grid>
</Grid>
</CardContent>
</Card>
{/* Failed Documents List */}
{failedDocuments.length > 0 && (
<Card sx={{ mb: 3 }}>
<CardContent>
<Typography variant="h6" gutterBottom>
Failed Documents ({failedDocuments.length})
</Typography>
<TableContainer component={Paper} variant="outlined">
<Table>
<TableHead>
<TableRow>
<TableCell>Filename</TableCell>
<TableCell>Stage</TableCell>
<TableCell>Reason</TableCell>
<TableCell>Size</TableCell>
<TableCell>Date</TableCell>
<TableCell>Actions</TableCell>
</TableRow>
</TableHead>
<TableBody>
{/* The list is already paginated server-side via limit/offset, so render it as returned;
    slicing it again by page would index past the end of the fetched page */}
{failedDocuments.map((doc: any) => (
<TableRow key={doc.id}>
<TableCell>
<Typography variant="body2" noWrap>
{doc.original_filename || doc.filename}
</Typography>
{doc.ingestion_source && (
<Chip size="small" label={doc.ingestion_source} variant="outlined" sx={{ mt: 0.5 }} />
)}
</TableCell>
<TableCell>
<Chip
size="small"
label={doc.source || doc.failure_stage}
color={doc.failure_stage === 'ocr' ? 'error' : 'warning'}
/>
</TableCell>
<TableCell>
<Chip
size="small"
label={doc.failure_category || doc.failure_reason}
color={getFailureReasonColor(doc.failure_reason)}
/>
</TableCell>
<TableCell>
<Typography variant="body2">
{doc.file_size ? formatFileSize(doc.file_size) : 'N/A'}
</Typography>
</TableCell>
<TableCell>
<Typography variant="body2">
{new Date(doc.created_at).toLocaleDateString()}
</Typography>
</TableCell>
<TableCell>
<IconButton
size="small"
onClick={() => setSelectedFailedDocument(doc)}
title="View Details"
>
<InfoIcon />
</IconButton>
{doc.existing_document_id && (
<IconButton
size="small"
onClick={() => navigate(`/documents/${doc.existing_document_id}`)}
title="View Existing Document"
>
<OpenInNewIcon />
</IconButton>
)}
</TableCell>
</TableRow>
))}
</TableBody>
</Table>
</TableContainer>
{/* Pagination */}
{failedDocumentsTotalPages > 1 && (
<Box display="flex" justifyContent="center" mt={2}>
<Pagination
count={failedDocumentsTotalPages}
page={failedDocumentsPagination.page}
onChange={(_, page) => setFailedDocumentsPagination(prev => ({ ...prev, page }))}
color="primary"
/>
</Box>
)}
</CardContent>
</Card>
)}
{/* Loading State */}
{failedDocumentsLoading && (
<Box display="flex" justifyContent="center" alignItems="center" minHeight="200px">
<CircularProgress />
<Typography sx={{ ml: 2 }}>Loading failed documents...</Typography>
</Box>
)}
{/* Empty State */}
{!failedDocumentsLoading && failedDocuments.length === 0 && (
<Alert severity="success">
<AlertTitle>No Failed Documents Found</AlertTitle>
<Typography>
No documents have failed processing with the current filters. This is good!
</Typography>
</Alert>
)}
</>
)}
{/* Confirmation Dialog */}
<Dialog
open={confirmDeleteOpen}

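Editor's note on the tab's data flow: the Failed Documents list is paginated server-side, so the client only needs to send limit/offset and derive the page count from the returned total. A minimal sketch of that contract, reusing the service method added later in this commit (the response shape is taken from the new /api/documents/failed handler; the helper name is hypothetical):

// Hypothetical helper mirroring fetchFailedDocumentsList's pagination math.
async function loadFailedPage(page: number, limit = 25, stage?: string, reason?: string) {
  const offset = (page - 1) * limit;
  const response = await documentService.getFailedDocuments(limit, offset, stage, reason);
  const documents = response.data.documents || [];
  const totalPages = response.data.pagination
    ? Math.ceil(response.data.pagination.total / limit)
    : 0;
  return { documents, totalPages };
}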
View File

@@ -151,6 +151,7 @@ const SourcesPage: React.FC = () => {
const [testingConnection, setTestingConnection] = useState(false);
const [syncingSource, setSyncingSource] = useState<string | null>(null);
const [stoppingSync, setStoppingSync] = useState<string | null>(null);
const [autoRefreshing, setAutoRefreshing] = useState(false);
useEffect(() => {
loadSources();
@@ -159,6 +160,25 @@ const SourcesPage: React.FC = () => {
}
}, [user]);
// Auto-refresh sources when any source is syncing
useEffect(() => {
const activeSyncingSources = sources.filter(source => source.status === 'syncing');
if (activeSyncingSources.length > 0) {
setAutoRefreshing(true);
const interval = setInterval(() => {
loadSources();
}, 5000); // Poll every 5 seconds during active sync
return () => {
clearInterval(interval);
setAutoRefreshing(false);
};
} else {
setAutoRefreshing(false);
}
}, [sources]);
// Update default folders when source type changes
useEffect(() => {
if (!editingSource) { // Only for new sources
@@ -979,8 +999,9 @@ const SourcesPage: React.FC = () => {
<Button
variant="outlined"
size="large"
startIcon={<AutoFixHighIcon />}
startIcon={autoRefreshing ? <CircularProgress size={20} /> : <AutoFixHighIcon />}
onClick={loadSources}
disabled={autoRefreshing}
sx={{
borderRadius: 3,
px: 4,
@@ -993,7 +1014,7 @@ const SourcesPage: React.FC = () => {
transition: 'all 0.3s cubic-bezier(0.4, 0, 0.2, 1)',
}}
>
Refresh
{autoRefreshing ? 'Auto-refreshing...' : 'Refresh'}
</Button>
{/* OCR Controls for Admin Users */}

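A note on the auto-refresh effect above: it lists `sources` in its dependency array, so every poll that replaces the sources array tears the interval down and recreates it. That works, but a sketch keyed on a derived boolean would keep one stable interval for the whole sync; this assumes the same loadSources, setAutoRefreshing, and Source.status used in the component:

// Sketch: depend on whether anything is syncing, not on array identity.
const anySyncing = sources.some(source => source.status === 'syncing');

useEffect(() => {
  if (!anySyncing) return;
  setAutoRefreshing(true);
  const interval = setInterval(loadSources, 5000); // poll during active sync
  return () => {
    clearInterval(interval);
    setAutoRefreshing(false);
  };
}, [anySyncing]);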
View File

@@ -200,8 +200,8 @@ export const documentService = {
},
getFailedOcrDocuments: (limit = 50, offset = 0) => {
return api.get(`/documents/failed-ocr`, {
params: { limit, offset },
return api.get(`/documents/failed`, {
params: { stage: 'ocr', limit, offset },
})
},
@@ -253,6 +253,13 @@ export const documentService = {
preview_only: previewOnly
})
},
getFailedDocuments: (limit = 25, offset = 0, stage?: string, reason?: string) => {
const params: any = { limit, offset };
if (stage) params.stage = stage;
if (reason) params.reason = reason;
return api.get('/documents/failed', { params })
},
}
export interface OcrStatusResponse {

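Usage sketch for the new getFailedDocuments method (endpoint and parameters are from this commit; the concrete filter values mirror the UI dropdown options):

// Fetch the second page (items 26-50) of OCR-stage failures.
async function example() {
  const res = await documentService.getFailedDocuments(25, 25, 'ocr', 'low_ocr_confidence');
  console.log(res.data.pagination.total, res.data.documents.length);
}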
View File

@@ -0,0 +1,103 @@
-- Add table to track documents that failed at any stage of processing
-- This provides visibility into documents that failed during: ingestion, validation, OCR, etc.
CREATE TABLE IF NOT EXISTS failed_documents (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
user_id UUID REFERENCES users(id) ON DELETE CASCADE,
filename TEXT NOT NULL,
original_filename TEXT, -- Original name when uploaded (if available)
original_path TEXT, -- Path where file was located
file_path TEXT, -- Stored file path (if file was saved before failure)
file_size BIGINT,
file_hash VARCHAR(64),
mime_type TEXT,
-- Document content (if available before failure)
content TEXT, -- Raw content if extracted
tags TEXT[] NOT NULL DEFAULT '{}', -- Tags that were assigned/detected (empty array when none; the Rust model decodes this as a non-optional Vec<String>)
-- OCR-related fields (for OCR stage failures)
ocr_text TEXT, -- Partial OCR text if extracted before failure
ocr_confidence REAL, -- OCR confidence if calculated
ocr_word_count INTEGER, -- Word count if calculated
ocr_processing_time_ms INTEGER, -- Processing time before failure
-- Failure information
failure_reason TEXT NOT NULL,
failure_stage TEXT NOT NULL, -- 'ingestion', 'validation', 'ocr', 'storage', etc.
existing_document_id UUID REFERENCES documents(id) ON DELETE SET NULL,
ingestion_source TEXT NOT NULL, -- 'batch', 'sync', 'webdav', 'upload', etc.
error_message TEXT, -- Detailed error information
-- Retry information
retry_count INTEGER DEFAULT 0,
last_retry_at TIMESTAMPTZ,
-- Timestamps
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW(),
CONSTRAINT check_failure_reason CHECK (failure_reason IN (
'duplicate_content',
'duplicate_filename',
'unsupported_format',
'file_too_large',
'file_corrupted',
'access_denied',
'low_ocr_confidence',
'ocr_timeout',
'ocr_memory_limit',
'pdf_parsing_error',
'storage_quota_exceeded',
'network_error',
'permission_denied',
'virus_detected',
'invalid_structure',
'policy_violation',
'other'
)),
CONSTRAINT check_failure_stage CHECK (failure_stage IN (
'ingestion',
'validation',
'ocr',
'storage',
'processing',
'sync'
))
);
-- Indexes for efficient querying
CREATE INDEX IF NOT EXISTS idx_failed_documents_user_id ON failed_documents(user_id);
CREATE INDEX IF NOT EXISTS idx_failed_documents_created_at ON failed_documents(created_at DESC);
CREATE INDEX IF NOT EXISTS idx_failed_documents_failure_reason ON failed_documents(failure_reason);
CREATE INDEX IF NOT EXISTS idx_failed_documents_failure_stage ON failed_documents(failure_stage);
CREATE INDEX IF NOT EXISTS idx_failed_documents_ingestion_source ON failed_documents(ingestion_source);
CREATE INDEX IF NOT EXISTS idx_failed_documents_file_hash ON failed_documents(file_hash) WHERE file_hash IS NOT NULL;
-- Add comments for documentation
COMMENT ON TABLE failed_documents IS 'Tracks documents that failed at any stage of processing (ingestion, validation, OCR, etc.)';
COMMENT ON COLUMN failed_documents.failure_reason IS 'Specific reason why the document failed';
COMMENT ON COLUMN failed_documents.failure_stage IS 'Stage at which the document failed (ingestion, validation, ocr, etc.)';
COMMENT ON COLUMN failed_documents.existing_document_id IS 'Reference to existing document if failed due to duplicate content';
COMMENT ON COLUMN failed_documents.ingestion_source IS 'Source of the ingestion attempt (batch, sync, webdav, upload, etc.)';
COMMENT ON COLUMN failed_documents.error_message IS 'Detailed error message for troubleshooting';
-- Create a view for failed documents summary by reason and stage
CREATE OR REPLACE VIEW failed_documents_summary AS
SELECT
failure_reason,
failure_stage,
ingestion_source,
COUNT(*) as document_count,
SUM(file_size) as total_size,
AVG(file_size) as avg_size,
MIN(created_at) as first_occurrence,
MAX(created_at) as last_occurrence
FROM failed_documents
GROUP BY failure_reason, failure_stage, ingestion_source
ORDER BY document_count DESC;
-- Grant appropriate permissions
-- GRANT SELECT, INSERT ON failed_documents TO readur_user;
-- GRANT SELECT ON failed_documents_summary TO readur_user;

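For client code that wants to stay in sync with the constraint vocabularies above, the CHECK lists translate directly into string-literal unions. An illustrative TypeScript mirror (not part of this commit):

type FailureStage =
  | 'ingestion' | 'validation' | 'ocr' | 'storage' | 'processing' | 'sync';

type FailureReason =
  | 'duplicate_content' | 'duplicate_filename' | 'unsupported_format'
  | 'file_too_large' | 'file_corrupted' | 'access_denied'
  | 'low_ocr_confidence' | 'ocr_timeout' | 'ocr_memory_limit'
  | 'pdf_parsing_error' | 'storage_quota_exceeded' | 'network_error'
  | 'permission_denied' | 'virus_detected' | 'invalid_structure'
  | 'policy_violation' | 'other';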
View File

@@ -0,0 +1,123 @@
-- Migration to move existing failed OCR documents from documents table to failed_documents table
-- This consolidates all failure tracking into a single table
-- First, ensure the failed_documents table exists
-- (This migration depends on 20250628000003_add_failed_documents_table.sql)
-- Move failed OCR documents to failed_documents table
INSERT INTO failed_documents (
user_id,
filename,
original_filename,
file_path,
file_size,
file_hash,
mime_type,
content,
tags,
ocr_text,
ocr_confidence,
ocr_word_count,
ocr_processing_time_ms,
failure_reason,
failure_stage,
ingestion_source,
error_message,
retry_count,
created_at,
updated_at
)
SELECT
d.user_id,
d.filename,
d.original_filename,
d.file_path,
d.file_size,
d.file_hash,
d.mime_type,
d.content,
COALESCE(d.tags, ARRAY[]::TEXT[]) as tags, -- documents.tags may be NULL; failed_documents.tags is NOT NULL
d.ocr_text,
d.ocr_confidence,
d.ocr_word_count,
d.ocr_processing_time_ms,
-- Map legacy ocr_failure_reason values onto the check_failure_reason vocabulary;
-- otherwise values such as 'processing_timeout' would violate the table constraint
CASE d.ocr_failure_reason
WHEN 'low_ocr_confidence' THEN 'low_ocr_confidence'
WHEN 'processing_timeout' THEN 'ocr_timeout'
WHEN 'memory_limit' THEN 'ocr_memory_limit'
WHEN 'pdf_parsing_panic' THEN 'pdf_parsing_error'
ELSE 'other'
END as failure_reason,
'ocr' as failure_stage,
'migration' as ingestion_source, -- Mark these as migrated from existing system
d.ocr_error as error_message,
COALESCE(q.retry_count, 0) as retry_count,
d.created_at,
d.updated_at
FROM documents d
LEFT JOIN (
SELECT document_id, COUNT(*) as retry_count
FROM ocr_queue
WHERE status IN ('failed', 'completed')
GROUP BY document_id
) q ON d.id = q.document_id
WHERE d.ocr_status = 'failed';
-- Log the migration for audit purposes
INSERT INTO failed_documents (
user_id,
filename,
original_filename,
failure_reason,
failure_stage,
ingestion_source,
error_message,
created_at,
updated_at
) VALUES (
NULL, -- No synthetic system user: a fabricated UUID would violate the users(id) foreign key
'migration_log',
'Failed OCR Migration Log',
'other', -- must satisfy check_failure_reason; the migration detail lives in error_message
'processing', -- must satisfy check_failure_stage ('migration' is not an allowed stage)
'system',
'Migrated ' || (SELECT COUNT(*) FROM documents WHERE ocr_status = 'failed') || ' failed OCR documents to failed_documents table',
NOW(),
NOW()
);
-- Remove failed OCR documents from documents table
-- Note: This uses CASCADE to also clean up related records in ocr_queue table
DELETE FROM documents WHERE ocr_status = 'failed';
-- Update statistics and constraints
ANALYZE documents;
ANALYZE failed_documents;
-- Add comment documenting the migration
COMMENT ON TABLE failed_documents IS 'Tracks all documents that failed at any stage of processing. Consolidated from documents table (OCR failures) and new ingestion failures as of migration 20250628000004.';
-- Create indexes for efficient querying of migrated data
CREATE INDEX IF NOT EXISTS idx_failed_documents_failure_stage_reason ON failed_documents(failure_stage, failure_reason);
CREATE INDEX IF NOT EXISTS idx_failed_documents_ocr_confidence ON failed_documents(ocr_confidence) WHERE ocr_confidence IS NOT NULL;
-- Optional: Create a view for backward compatibility during transition
CREATE OR REPLACE VIEW legacy_failed_ocr_documents AS
SELECT
id,
user_id,
filename,
original_filename,
file_path,
file_size,
mime_type,
tags,
ocr_text,
ocr_confidence,
ocr_word_count,
ocr_processing_time_ms,
failure_reason as ocr_failure_reason,
error_message as ocr_error,
'failed' as ocr_status,
retry_count,
created_at,
updated_at
FROM failed_documents
WHERE failure_stage = 'ocr';
-- Grant appropriate permissions
-- GRANT SELECT ON legacy_failed_ocr_documents TO readur_user;

View File

@@ -135,6 +135,149 @@ pub struct Document {
pub file_hash: Option<String>,
}
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, ToSchema)]
pub enum FailureReason {
#[serde(rename = "duplicate_content")]
DuplicateContent,
#[serde(rename = "duplicate_filename")]
DuplicateFilename,
#[serde(rename = "unsupported_format")]
UnsupportedFormat,
#[serde(rename = "file_too_large")]
FileTooLarge,
#[serde(rename = "file_corrupted")]
FileCorrupted,
#[serde(rename = "access_denied")]
AccessDenied,
#[serde(rename = "low_ocr_confidence")]
LowOcrConfidence,
#[serde(rename = "ocr_timeout")]
OcrTimeout,
#[serde(rename = "ocr_memory_limit")]
OcrMemoryLimit,
#[serde(rename = "pdf_parsing_error")]
PdfParsingError,
#[serde(rename = "storage_quota_exceeded")]
StorageQuotaExceeded,
#[serde(rename = "network_error")]
NetworkError,
#[serde(rename = "permission_denied")]
PermissionDenied,
#[serde(rename = "virus_detected")]
VirusDetected,
#[serde(rename = "invalid_structure")]
InvalidStructure,
#[serde(rename = "policy_violation")]
PolicyViolation,
#[serde(rename = "other")]
Other,
}
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, ToSchema)]
pub enum FailureStage {
#[serde(rename = "ingestion")]
Ingestion,
#[serde(rename = "validation")]
Validation,
#[serde(rename = "ocr")]
Ocr,
#[serde(rename = "storage")]
Storage,
#[serde(rename = "processing")]
Processing,
#[serde(rename = "sync")]
Sync,
}
impl std::fmt::Display for FailureReason {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
FailureReason::DuplicateContent => write!(f, "duplicate_content"),
FailureReason::DuplicateFilename => write!(f, "duplicate_filename"),
FailureReason::UnsupportedFormat => write!(f, "unsupported_format"),
FailureReason::FileTooLarge => write!(f, "file_too_large"),
FailureReason::FileCorrupted => write!(f, "file_corrupted"),
FailureReason::AccessDenied => write!(f, "access_denied"),
FailureReason::LowOcrConfidence => write!(f, "low_ocr_confidence"),
FailureReason::OcrTimeout => write!(f, "ocr_timeout"),
FailureReason::OcrMemoryLimit => write!(f, "ocr_memory_limit"),
FailureReason::PdfParsingError => write!(f, "pdf_parsing_error"),
FailureReason::StorageQuotaExceeded => write!(f, "storage_quota_exceeded"),
FailureReason::NetworkError => write!(f, "network_error"),
FailureReason::PermissionDenied => write!(f, "permission_denied"),
FailureReason::VirusDetected => write!(f, "virus_detected"),
FailureReason::InvalidStructure => write!(f, "invalid_structure"),
FailureReason::PolicyViolation => write!(f, "policy_violation"),
FailureReason::Other => write!(f, "other"),
}
}
}
impl std::fmt::Display for FailureStage {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
FailureStage::Ingestion => write!(f, "ingestion"),
FailureStage::Validation => write!(f, "validation"),
FailureStage::Ocr => write!(f, "ocr"),
FailureStage::Storage => write!(f, "storage"),
FailureStage::Processing => write!(f, "processing"),
FailureStage::Sync => write!(f, "sync"),
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize, FromRow, ToSchema)]
pub struct FailedDocument {
/// Unique identifier for the failed document record
pub id: Uuid,
/// User who attempted to ingest the document
pub user_id: Uuid,
/// Filename of the failed document
pub filename: String,
/// Original filename when uploaded
pub original_filename: Option<String>,
/// Original path where the file was located
pub original_path: Option<String>,
/// Stored file path (if file was saved before failure)
pub file_path: Option<String>,
/// Size of the file in bytes
pub file_size: Option<i64>,
/// SHA256 hash of the file content
pub file_hash: Option<String>,
/// MIME type of the file
pub mime_type: Option<String>,
/// Raw content if extracted before failure
pub content: Option<String>,
/// Tags that were assigned/detected
pub tags: Vec<String>,
/// Partial OCR text if extracted before failure
pub ocr_text: Option<String>,
/// OCR confidence if calculated
pub ocr_confidence: Option<f32>,
/// Word count if calculated
pub ocr_word_count: Option<i32>,
/// Processing time before failure in milliseconds
pub ocr_processing_time_ms: Option<i32>,
/// Reason why the document failed
pub failure_reason: String,
/// Stage at which the document failed
pub failure_stage: String,
/// Reference to existing document if failed due to duplicate
pub existing_document_id: Option<Uuid>,
/// Source of the ingestion attempt
pub ingestion_source: String,
/// Detailed error message
pub error_message: Option<String>,
/// Number of retry attempts
pub retry_count: Option<i32>,
/// Last retry timestamp
pub last_retry_at: Option<DateTime<Utc>>,
/// When the document failed
pub created_at: DateTime<Utc>,
/// Last update timestamp
pub updated_at: DateTime<Utc>,
}
#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)]
pub struct DocumentResponse {
/// Unique identifier for the document

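The JSON rows the new /api/documents/failed handler returns (see the routes file below) track this struct plus two computed display fields. An illustrative client-side shape, assuming the unions sketched after the first migration (not an official type in this commit):

interface FailedDocumentRow {
  id: string;
  filename: string;
  original_filename: string | null;
  file_size: number | null;
  mime_type: string | null;
  tags: string[];
  ocr_confidence: number | null;
  failure_reason: FailureReason;
  failure_stage: FailureStage;
  error_message: string | null;
  existing_document_id: string | null;
  ingestion_source: string;
  retry_count: number | null;
  created_at: string;
  updated_at: string;
  failure_category: string; // computed by the server for display
  source: string;           // human-readable stage name
}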
View File

@@ -322,11 +322,12 @@ impl OcrQueueService {
warn!("⚠️ OCR quality issues for '{}' | Job: {} | Document: {} | {:.1}% confidence | {} words",
filename, item.id, item.document_id, ocr_result.confidence, ocr_result.word_count);
// Mark as failed for quality issues
// Mark as failed for quality issues with proper failure reason
sqlx::query(
r#"
UPDATE documents
SET ocr_status = 'failed',
ocr_failure_reason = 'low_ocr_confidence',
ocr_error = $2,
updated_at = NOW()
WHERE id = $1

View File

@@ -26,6 +26,14 @@ struct PaginationQuery {
ocr_status: Option<String>,
}
#[derive(Deserialize, ToSchema)]
struct FailedDocumentsQuery {
limit: Option<i64>,
offset: Option<i64>,
stage: Option<String>, // 'ocr', 'ingestion', 'validation', etc.
reason: Option<String>, // 'duplicate_content', 'low_ocr_confidence', etc.
}
#[derive(Deserialize, Serialize, ToSchema)]
pub struct BulkDeleteRequest {
pub document_ids: Vec<uuid::Uuid>,
@@ -50,8 +58,8 @@ pub fn router() -> Router<Arc<AppState>> {
.route("/{id}/ocr", get(get_document_ocr))
.route("/{id}/processed-image", get(get_processed_image))
.route("/{id}/retry-ocr", post(retry_ocr))
.route("/failed-ocr", get(get_failed_ocr_documents))
.route("/duplicates", get(get_user_duplicates))
.route("/failed", get(get_failed_documents))
.route("/delete-low-confidence", post(delete_low_confidence_documents))
.route("/delete-failed-ocr", post(delete_failed_ocr_documents))
}
@@ -757,6 +765,202 @@ async fn get_failed_ocr_documents(
Ok(Json(response))
}
#[utoipa::path(
get,
path = "/api/documents/failed",
tag = "documents",
security(
("bearer_auth" = [])
),
params(
("limit" = Option<i64>, Query, description = "Number of documents to return"),
("offset" = Option<i64>, Query, description = "Number of documents to skip"),
("stage" = Option<String>, Query, description = "Filter by failure stage (ocr, ingestion, validation, etc.)"),
("reason" = Option<String>, Query, description = "Filter by failure reason")
),
responses(
(status = 200, description = "List of failed documents", body = String),
(status = 401, description = "Unauthorized")
)
)]
async fn get_failed_documents(
State(state): State<Arc<AppState>>,
auth_user: AuthUser,
Query(params): Query<FailedDocumentsQuery>,
) -> Result<Json<serde_json::Value>, StatusCode> {
let limit = params.limit.unwrap_or(25);
let offset = params.offset.unwrap_or(0);
// Query the unified failed_documents table
let mut query_builder = sqlx::QueryBuilder::new(
r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type,
content, tags, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms,
failure_reason, failure_stage, error_message, existing_document_id,
ingestion_source, retry_count, last_retry_at, created_at, updated_at
FROM failed_documents
WHERE ($1::uuid IS NULL OR user_id = $1)
"#
);
let mut bind_count = 1;
// Add stage filter if specified
if let Some(stage) = &params.stage {
bind_count += 1;
query_builder.push(&format!(" AND failure_stage = ${}", bind_count));
}
// Add reason filter if specified
if let Some(reason) = &params.reason {
bind_count += 1;
query_builder.push(&format!(" AND failure_reason = ${}", bind_count));
}
query_builder.push(" ORDER BY created_at DESC");
query_builder.push(&format!(" LIMIT ${} OFFSET ${}", bind_count + 1, bind_count + 2));
let mut query = query_builder.build();
// Bind parameters in order
query = query.bind(if auth_user.user.role == crate::models::UserRole::Admin {
None
} else {
Some(auth_user.user.id)
});
if let Some(stage) = &params.stage {
query = query.bind(stage);
}
if let Some(reason) = &params.reason {
query = query.bind(reason);
}
query = query.bind(limit).bind(offset);
let failed_docs = query
.fetch_all(state.db.get_pool())
.await
.map_err(|e| {
tracing::error!("Failed to fetch failed documents: {}", e);
StatusCode::INTERNAL_SERVER_ERROR
})?;
// Count total for pagination
let mut count_query_builder = sqlx::QueryBuilder::new(
"SELECT COUNT(*) FROM failed_documents WHERE ($1::uuid IS NULL OR user_id = $1)"
);
let mut count_bind_count = 1;
if let Some(stage) = &params.stage {
count_bind_count += 1;
count_query_builder.push(&format!(" AND failure_stage = ${}", count_bind_count));
}
if let Some(reason) = &params.reason {
count_bind_count += 1;
count_query_builder.push(&format!(" AND failure_reason = ${}", count_bind_count));
}
let mut count_query = count_query_builder.build_query_scalar::<i64>();
count_query = count_query.bind(if auth_user.user.role == crate::models::UserRole::Admin {
None
} else {
Some(auth_user.user.id)
});
if let Some(stage) = &params.stage {
count_query = count_query.bind(stage);
}
if let Some(reason) = &params.reason {
count_query = count_query.bind(reason);
}
let total_count = count_query
.fetch_one(state.db.get_pool())
.await
.unwrap_or(0);
// Convert to JSON response format
let documents: Vec<serde_json::Value> = failed_docs.iter().map(|row| {
serde_json::json!({
"id": row.get::<uuid::Uuid, _>("id"),
"filename": row.get::<String, _>("filename"),
"original_filename": row.get::<Option<String>, _>("original_filename"),
"file_path": row.get::<Option<String>, _>("file_path"),
"file_size": row.get::<Option<i64>, _>("file_size"),
"mime_type": row.get::<Option<String>, _>("mime_type"),
"content": row.get::<Option<String>, _>("content"),
"tags": row.get::<Vec<String>, _>("tags"),
"ocr_text": row.get::<Option<String>, _>("ocr_text"),
"ocr_confidence": row.get::<Option<f32>, _>("ocr_confidence"),
"ocr_word_count": row.get::<Option<i32>, _>("ocr_word_count"),
"ocr_processing_time_ms": row.get::<Option<i32>, _>("ocr_processing_time_ms"),
"failure_reason": row.get::<String, _>("failure_reason"),
"failure_stage": row.get::<String, _>("failure_stage"),
"error_message": row.get::<Option<String>, _>("error_message"),
"existing_document_id": row.get::<Option<uuid::Uuid>, _>("existing_document_id"),
"ingestion_source": row.get::<String, _>("ingestion_source"),
"retry_count": row.get::<Option<i32>, _>("retry_count"),
"last_retry_at": row.get::<Option<chrono::DateTime<chrono::Utc>>, _>("last_retry_at"),
"created_at": row.get::<chrono::DateTime<chrono::Utc>, _>("created_at"),
"updated_at": row.get::<chrono::DateTime<chrono::Utc>, _>("updated_at"),
// Computed fields for backward compatibility
"failure_category": categorize_failure_reason(
Some(&row.get::<String, _>("failure_reason")),
row.get::<Option<String>, _>("error_message").as_deref()
),
"source": match row.get::<String, _>("failure_stage").as_str() {
"ocr" => "OCR Processing",
"ingestion" => "Document Ingestion",
"validation" => "Document Validation",
"storage" => "File Storage",
"processing" => "Document Processing",
"sync" => "Source Synchronization",
_ => "Unknown"
}
})
}).collect();
// Calculate statistics for the response
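// Note: by_stage/by_reason are tallied over the returned page only;
// total_failed reflects the full filtered count from the COUNT query.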
let mut stage_stats = std::collections::HashMap::new();
let mut reason_stats = std::collections::HashMap::new();
for doc in &documents {
let stage = doc["failure_stage"].as_str().unwrap_or("unknown");
let reason = doc["failure_reason"].as_str().unwrap_or("unknown");
*stage_stats.entry(stage).or_insert(0) += 1;
*reason_stats.entry(reason).or_insert(0) += 1;
}
let response = serde_json::json!({
"documents": documents,
"pagination": {
"limit": limit,
"offset": offset,
"total": total_count,
"total_pages": (total_count as f64 / limit as f64).ceil() as i64
},
"statistics": {
"total_failed": total_count,
"by_stage": stage_stats,
"by_reason": reason_stats
},
"filters": {
"stage": params.stage,
"reason": params.reason
}
});
Ok(Json(response))
}
async fn calculate_estimated_wait_time(priority: i32) -> i64 {
// Simple estimation based on priority - in a real implementation,
// this would check actual queue depth and processing times
@@ -775,6 +979,7 @@ fn categorize_failure_reason(failure_reason: Option<&str>, error_message: Option
Some("processing_timeout") => "Timeout",
Some("memory_limit") => "Memory Limit",
Some("pdf_parsing_panic") => "PDF Parsing Error",
Some("low_ocr_confidence") => "Low OCR Confidence",
Some("unknown") | None => {
// Try to categorize based on error message
if let Some(error) = error_message {
@@ -787,6 +992,8 @@ fn categorize_failure_reason(failure_reason: Option<&str>, error_message: Option
"PDF Font Issues"
} else if error_lower.contains("corrupt") {
"PDF Corruption"
} else if error_lower.contains("quality below threshold") || error_lower.contains("confidence") {
"Low OCR Confidence"
} else {
"Unknown Error"
}
@@ -1066,12 +1273,27 @@ pub async fn delete_low_confidence_documents(
let matched_count = matched_documents.len();
if is_preview {
// Convert documents to response format with key details
let document_details: Vec<serde_json::Value> = matched_documents.iter().map(|d| {
serde_json::json!({
"id": d.id,
"filename": d.filename,
"original_filename": d.original_filename,
"file_size": d.file_size,
"ocr_confidence": d.ocr_confidence,
"ocr_status": d.ocr_status,
"created_at": d.created_at,
"mime_type": d.mime_type
})
}).collect();
return Ok(Json(serde_json::json!({
"success": true,
"message": format!("Found {} documents with OCR confidence below {}%", matched_count, request.max_confidence),
"matched_count": matched_count,
"preview": true,
"document_ids": matched_documents.iter().map(|d| d.id).collect::<Vec<_>>()
"document_ids": matched_documents.iter().map(|d| d.id).collect::<Vec<_>>(),
"documents": document_details
})));
}
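
Finally, a usage sketch for the preview mode shown above (the route and request fields are from this commit; `api` is the axios instance FailedOcrPage already imports, and the function name is hypothetical):

// Preview which documents a purge below 50% confidence would remove, without deleting.
async function previewLowConfidencePurge() {
  const preview = await api.post('/documents/delete-low-confidence', {
    max_confidence: 50,   // threshold in percent
    preview_only: true,   // return matches instead of deleting
  });
  console.log(preview.data.matched_count, preview.data.documents.length);
}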