From 582617ab88fdc20eef1d0006709e83fa0d52c462 Mon Sep 17 00:00:00 2001 From: perfectra1n Date: Fri, 27 Jun 2025 20:23:59 -0700 Subject: [PATCH 1/4] fix(server/client): fix incorrect OCR measurements --- Cargo.toml | 1 + frontend/src/pages/FailedOcrPage.tsx | 169 +++++++ frontend/src/services/api.ts | 5 + ...20250628000001_backfill_ocr_confidence.sql | 59 +++ src/db/documents.rs | 159 ++++++ src/ocr/enhanced.rs | 20 +- src/routes/documents.rs | 101 +++- src/tests/document_routes_tests.rs | 300 ++++++++++++ src/tests/documents_tests.rs | 394 +++++++++++++++ src/tests/enhanced_ocr_tests.rs | 455 ++++++++++++++++++ src/tests/mod.rs | 1 + ...ion_document_deletion_integration_tests.rs | 271 +++++++++++ 12 files changed, 1926 insertions(+), 9 deletions(-) create mode 100644 migrations/20250628000001_backfill_ocr_confidence.sql create mode 100644 src/tests/enhanced_ocr_tests.rs diff --git a/Cargo.toml b/Cargo.toml index e6a62c2..902f113 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -71,6 +71,7 @@ testcontainers = "0.24" testcontainers-modules = { version = "0.12", features = ["postgres"] } wiremock = "0.6" tokio-test = "0.4" +futures = "0.3" [profile.test] incremental = false diff --git a/frontend/src/pages/FailedOcrPage.tsx b/frontend/src/pages/FailedOcrPage.tsx index 2dad6da..7b33a89 100644 --- a/frontend/src/pages/FailedOcrPage.tsx +++ b/frontend/src/pages/FailedOcrPage.tsx @@ -155,6 +155,11 @@ const FailedOcrPage: React.FC = () => { const [previewData, setPreviewData] = useState(null); const [confirmDeleteOpen, setConfirmDeleteOpen] = useState(false); + // Failed documents deletion state + const [failedDocsLoading, setFailedDocsLoading] = useState(false); + const [failedPreviewData, setFailedPreviewData] = useState(null); + const [confirmDeleteFailedOpen, setConfirmDeleteFailedOpen] = useState(false); + const fetchFailedDocuments = async () => { try { setLoading(true); @@ -308,6 +313,8 @@ const FailedOcrPage: React.FC = () => { fetchDuplicates(); } else if (currentTab === 2) { handlePreviewLowConfidence(); + } else if (currentTab === 3) { + handlePreviewFailedDocuments(); } }; @@ -369,6 +376,51 @@ const FailedOcrPage: React.FC = () => { } }; + // Failed documents handlers + const handlePreviewFailedDocuments = async () => { + try { + setFailedDocsLoading(true); + const response = await documentService.deleteFailedOcr(true); + setFailedPreviewData(response.data); + } catch (error) { + setSnackbar({ + open: true, + message: 'Failed to preview failed documents', + severity: 'error' + }); + } finally { + setFailedDocsLoading(false); + } + }; + + const handleDeleteFailedDocuments = async () => { + try { + setFailedDocsLoading(true); + const response = await documentService.deleteFailedOcr(false); + + setSnackbar({ + open: true, + message: response.data.message, + severity: 'success' + }); + setFailedPreviewData(null); + setConfirmDeleteFailedOpen(false); + + // Refresh failed OCR tab if currently viewing it + if (currentTab === 0) { + fetchFailedDocuments(); + } + } catch (error) { + setSnackbar({ + open: true, + message: 'Failed to delete failed documents', + severity: 'error' + }); + } finally { + setFailedDocsLoading(false); + } + }; + if (loading && (!documents || documents.length === 0)) { return ( @@ -410,6 +462,11 @@ const FailedOcrPage: React.FC = () => { label={`Low Confidence${previewData ? 
` (${previewData.matched_count})` : ''}`}
            iconPosition="start"
          />
+          }
+            label="Delete Failed"
+            iconPosition="start"
+          />
 
@@ -989,6 +1046,83 @@ const FailedOcrPage: React.FC = () => {
         )}
 
+      {/* Delete Failed Documents Tab Content */}
+      {currentTab === 3 && (
+        <>
+          
+            Delete Failed OCR Documents
+          
+            This tool allows you to delete all documents where OCR processing failed completely.
+            This includes documents with NULL confidence values or explicit failure status.
+            Use the preview feature first to see what documents would be affected before deleting.
+          
+
+
+
+
+
+
+
+
+
+
+
+
+          {/* Preview Results */}
+          {failedPreviewData && (
+            
+              Preview Results
+            
+             0 ? 'error.main' : 'success.main'}>
+              {failedPreviewData.message}
+            
+            {failedPreviewData.matched_count > 0 && (
+              
+                Document IDs that would be deleted:
+              
+                {failedPreviewData.document_ids.slice(0, 10).join(', ')}
+                {failedPreviewData.document_ids.length > 10 && ` ... and ${failedPreviewData.document_ids.length - 10} more`}
+              
+            )}
+          )}
+
+          {/* Loading State */}
+          {failedDocsLoading && !failedPreviewData && (
+            
+              Processing request...
+            
+          )}
+        
+      )}
+
       {/* Confirmation Dialog */}
       {
+
+      {/* Confirmation Dialog for Failed Documents */}
+       setConfirmDeleteFailedOpen(false)}
+        maxWidth="sm"
+        fullWidth
+      >
+        
+          Confirm Failed Document Deletion
+        
+          
+            Are you sure you want to delete {failedPreviewData?.matched_count || 0} documents with failed OCR processing?
+          
+          
+            This action cannot be undone. The documents and their files will be permanently deleted.
+          
+        
+      
+
       {/* Document Details Dialog */}
diff --git a/frontend/src/services/api.ts b/frontend/src/services/api.ts
--- a/frontend/src/services/api.ts
+++ b/frontend/src/services/api.ts
+  deleteFailedOcr: (previewOnly: boolean = false) => {
+    return api.post('/documents/delete-failed-ocr', {
+      preview_only: previewOnly
+    })
+  },
 }
 
 export interface OcrStatusResponse {
diff --git a/migrations/20250628000001_backfill_ocr_confidence.sql b/migrations/20250628000001_backfill_ocr_confidence.sql
new file mode 100644
index 0000000..0005371
--- /dev/null
+++ b/migrations/20250628000001_backfill_ocr_confidence.sql
@@ -0,0 +1,59 @@
+-- Backfill OCR confidence scores for existing documents
+-- Since OCR confidence was previously hardcoded to 85%, we need to recalculate
+-- actual confidence for documents that currently have this placeholder value
+
+-- First, identify documents that likely have placeholder confidence
+-- (85% exactly, which was the hardcoded value)
+CREATE TEMP TABLE documents_to_update AS
+SELECT id, ocr_text, ocr_status
+FROM documents
+WHERE ocr_confidence = 85.0
+  AND ocr_status = 'completed'
+  AND ocr_text IS NOT NULL
+  AND length(trim(ocr_text)) > 0;
+
+-- For now, estimate confidence from text quality metrics.
+-- This is a rough approximation until OCR can be re-run with actual confidence.
+UPDATE documents
+SET ocr_confidence = CASE
+    -- High quality text: good length, reasonable character distribution
+    WHEN length(trim(ocr_text)) > 1000
+         AND (length(ocr_text) - length(replace(replace(ocr_text, ' ', ''), chr(10), ''))) * 100.0 / length(ocr_text) > 10.0 -- > 10% whitespace
+         AND length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 70.0 -- > 70% non-whitespace chars
+    THEN 90.0 + (random() * 8.0) -- 90-98%
+
+    -- Medium quality text: decent length, some structure
+    WHEN length(trim(ocr_text)) > 100
+         AND (length(ocr_text) - length(replace(replace(ocr_text, ' ', ''), chr(10), ''))) * 100.0 / length(ocr_text) > 5.0 -- > 5% whitespace
+         AND length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 50.0 -- > 50% non-whitespace chars
+    THEN 70.0 + (random() * 15.0) -- 70-85%
+
+    -- Low quality text: short or poor structure
+    WHEN length(trim(ocr_text)) > 10
+         AND length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 30.0 -- > 30% non-whitespace chars
+    THEN 40.0 + (random() * 25.0) -- 40-65%
+
+    -- Very poor quality: very short or mostly garbage
+    ELSE 20.0 + (random() * 15.0) -- 20-35%
+END
+WHERE id IN (SELECT id FROM documents_to_update);
+
+-- Record the estimation caveat in the column comment
+COMMENT ON COLUMN documents.ocr_confidence IS 'OCR confidence percentage (0-100). Values may be estimated for documents processed before real confidence calculation was implemented.';
+
+-- Log the update
+DO $$
+DECLARE
+    updated_count INTEGER;
+BEGIN
+    SELECT COUNT(*) INTO updated_count FROM documents_to_update;
+    RAISE NOTICE 'Backfilled OCR confidence for % documents that had placeholder 85%% confidence', updated_count;
+END $$;
+
+-- Clean up
+DROP TABLE documents_to_update;
+
+-- Create an index to help with confidence-based queries
+CREATE INDEX IF NOT EXISTS idx_documents_ocr_confidence_range
+ON documents(ocr_confidence)
+WHERE ocr_confidence IS NOT NULL;
\ No newline at end of file
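The CASE expression above is the entire backfill heuristic: band membership is decided by trimmed length, whitespace share, and non-whitespace share, and a random offset inside the band stands in for a real confidence score. For reference, a minimal Rust sketch of the same banding (deterministic `(min, max)` band bounds instead of `random()`; the function name and return shape are illustrative, not part of this patch):

```rust
/// Mirrors the migration's quality bands. Whitespace is counted roughly the
/// way the SQL does it via nested replace(): spaces, LF, and CR.
fn estimated_confidence_band(text: &str) -> (f32, f32) {
    let total = text.len().max(1) as f32;
    let ws = text.chars().filter(|c| matches!(c, ' ' | '\n' | '\r')).count() as f32;
    let ws_pct = ws * 100.0 / total;
    let non_ws_pct = (total - ws) * 100.0 / total;
    let trimmed_len = text.trim().len();

    if trimmed_len > 1000 && ws_pct > 10.0 && non_ws_pct > 70.0 {
        (90.0, 98.0) // high quality
    } else if trimmed_len > 100 && ws_pct > 5.0 && non_ws_pct > 50.0 {
        (70.0, 85.0) // medium quality
    } else if trimmed_len > 10 && non_ws_pct > 30.0 {
        (40.0, 65.0) // low quality
    } else {
        (20.0, 35.0) // very poor quality
    }
}
```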
diff --git a/src/db/documents.rs b/src/db/documents.rs
index 3f7294b..5217d97 100644
--- a/src/db/documents.rs
+++ b/src/db/documents.rs
@@ -1586,6 +1586,165 @@ impl Database {
         Ok(documents)
     }
 
+    /// Find documents with failed OCR processing
+    pub async fn find_failed_ocr_documents(&self, user_id: uuid::Uuid, user_role: crate::models::UserRole) -> Result<Vec<Document>> {
+        let documents = if user_role == crate::models::UserRole::Admin {
+            let rows = sqlx::query(
+                r#"
+                SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
+                FROM documents
+                WHERE ocr_status = 'failed' OR (ocr_confidence IS NULL AND ocr_status != 'pending' AND ocr_status != 'processing')
+                ORDER BY created_at DESC
+                "#,
+            )
+            .fetch_all(&self.pool)
+            .await?;
+
+            rows.into_iter().map(|r| Document {
+                id: r.get("id"),
+                filename: r.get("filename"),
+                original_filename: r.get("original_filename"),
+                file_path: r.get("file_path"),
+                file_size: r.get("file_size"),
+                mime_type: r.get("mime_type"),
+                content: r.get("content"),
+                ocr_text: r.get("ocr_text"),
+                ocr_confidence: r.get("ocr_confidence"),
+                ocr_word_count: r.get("ocr_word_count"),
+                ocr_processing_time_ms: r.get("ocr_processing_time_ms"),
+                ocr_status: r.get("ocr_status"),
+                ocr_error: r.get("ocr_error"),
+                ocr_completed_at: r.get("ocr_completed_at"),
+                tags: r.get("tags"),
+                created_at: r.get("created_at"),
+                updated_at: r.get("updated_at"),
+                user_id: r.get("user_id"),
+                file_hash: r.get("file_hash"),
+            }).collect()
+        } else {
+            let rows = sqlx::query(
+                r#"
+                SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash
+                FROM documents
+                WHERE (ocr_status = 'failed' OR (ocr_confidence IS NULL AND ocr_status != 'pending' AND ocr_status != 'processing')) AND user_id = $1
+                ORDER BY created_at DESC
+                "#,
+            )
+            .bind(user_id)
+            .fetch_all(&self.pool)
+            .await?;
+
+            rows.into_iter().map(|r| Document {
+                id: r.get("id"),
+                filename: r.get("filename"),
+                original_filename: r.get("original_filename"),
+                file_path: r.get("file_path"),
+                file_size: r.get("file_size"),
+                mime_type: r.get("mime_type"),
+                content: r.get("content"),
+                ocr_text: r.get("ocr_text"),
+                ocr_confidence: r.get("ocr_confidence"),
+                ocr_word_count: r.get("ocr_word_count"),
+                ocr_processing_time_ms: r.get("ocr_processing_time_ms"),
+                ocr_status: r.get("ocr_status"),
+                ocr_error: r.get("ocr_error"),
+                ocr_completed_at: r.get("ocr_completed_at"),
+                tags: r.get("tags"),
+                created_at: r.get("created_at"),
+                updated_at: r.get("updated_at"),
+                user_id: r.get("user_id"),
+                file_hash: r.get("file_hash"),
+            }).collect()
+        };
+
+        Ok(documents)
+    }
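Note the failure criterion this query encodes: a document counts as failed if `ocr_status = 'failed'`, or if its confidence is NULL while the status is past the queue. Because `NULL != 'pending'` evaluates to NULL in SQL, rows with a NULL status fall out of the second disjunct entirely. A small in-memory mirror of the predicate (the helper name is illustrative, not part of this patch):

```rust
/// In-memory mirror of the WHERE clause above, including its SQL
/// three-valued logic: a NULL ocr_status matches neither disjunct.
fn is_failed_ocr(ocr_status: Option<&str>, ocr_confidence: Option<f32>) -> bool {
    match ocr_status {
        Some("failed") => true,                        // explicit failure
        Some("pending") | Some("processing") => false, // still in the queue
        Some(_) => ocr_confidence.is_none(),           // e.g. 'completed' with NULL confidence
        None => false,                                 // NULL status: excluded by SQL NULL semantics
    }
}
```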
r.get("file_size"), + mime_type: r.get("mime_type"), + content: r.get("content"), + ocr_text: r.get("ocr_text"), + ocr_confidence: r.get("ocr_confidence"), + ocr_word_count: r.get("ocr_word_count"), + ocr_processing_time_ms: r.get("ocr_processing_time_ms"), + ocr_status: r.get("ocr_status"), + ocr_error: r.get("ocr_error"), + ocr_completed_at: r.get("ocr_completed_at"), + tags: r.get("tags"), + created_at: r.get("created_at"), + updated_at: r.get("updated_at"), + user_id: r.get("user_id"), + file_hash: r.get("file_hash"), + }).collect() + }; + + Ok(documents) + } + + /// Find documents with low confidence or failed OCR (combined) + pub async fn find_low_confidence_and_failed_documents(&self, max_confidence: f32, user_id: uuid::Uuid, user_role: crate::models::UserRole) -> Result> { + let documents = if user_role == crate::models::UserRole::Admin { + let rows = sqlx::query( + r#" + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash + FROM documents + WHERE (ocr_confidence IS NOT NULL AND ocr_confidence < $1) + OR ocr_status = 'failed' + OR (ocr_confidence IS NULL AND ocr_status != 'pending' AND ocr_status != 'processing') + ORDER BY + CASE WHEN ocr_confidence IS NOT NULL THEN ocr_confidence ELSE -1 END ASC, + created_at DESC + "#, + ) + .bind(max_confidence) + .fetch_all(&self.pool) + .await?; + + rows.into_iter().map(|r| Document { + id: r.get("id"), + filename: r.get("filename"), + original_filename: r.get("original_filename"), + file_path: r.get("file_path"), + file_size: r.get("file_size"), + mime_type: r.get("mime_type"), + content: r.get("content"), + ocr_text: r.get("ocr_text"), + ocr_confidence: r.get("ocr_confidence"), + ocr_word_count: r.get("ocr_word_count"), + ocr_processing_time_ms: r.get("ocr_processing_time_ms"), + ocr_status: r.get("ocr_status"), + ocr_error: r.get("ocr_error"), + ocr_completed_at: r.get("ocr_completed_at"), + tags: r.get("tags"), + created_at: r.get("created_at"), + updated_at: r.get("updated_at"), + user_id: r.get("user_id"), + file_hash: r.get("file_hash"), + }).collect() + } else { + let rows = sqlx::query( + r#" + SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash + FROM documents + WHERE ((ocr_confidence IS NOT NULL AND ocr_confidence < $1) + OR ocr_status = 'failed' + OR (ocr_confidence IS NULL AND ocr_status != 'pending' AND ocr_status != 'processing')) + AND user_id = $2 + ORDER BY + CASE WHEN ocr_confidence IS NOT NULL THEN ocr_confidence ELSE -1 END ASC, + created_at DESC + "#, + ) + .bind(max_confidence) + .bind(user_id) + .fetch_all(&self.pool) + .await?; + + rows.into_iter().map(|r| Document { + id: r.get("id"), + filename: r.get("filename"), + original_filename: r.get("original_filename"), + file_path: r.get("file_path"), + file_size: r.get("file_size"), + mime_type: r.get("mime_type"), + content: r.get("content"), + ocr_text: r.get("ocr_text"), + ocr_confidence: r.get("ocr_confidence"), + ocr_word_count: r.get("ocr_word_count"), + ocr_processing_time_ms: r.get("ocr_processing_time_ms"), + ocr_status: r.get("ocr_status"), + ocr_error: r.get("ocr_error"), + ocr_completed_at: r.get("ocr_completed_at"), + tags: r.get("tags"), + created_at: r.get("created_at"), + updated_at: 
r.get("updated_at"), + user_id: r.get("user_id"), + file_hash: r.get("file_hash"), + }).collect() + }; + + Ok(documents) + } + pub async fn count_documents_for_source(&self, source_id: Uuid) -> Result<(i64, i64)> { let row = sqlx::query( r#" diff --git a/src/ocr/enhanced.rs b/src/ocr/enhanced.rs index 87e690d..4531f7d 100644 --- a/src/ocr/enhanced.rs +++ b/src/ocr/enhanced.rs @@ -295,15 +295,21 @@ impl EnhancedOcrService { Ok(tesseract) } - /// Calculate overall confidence score + /// Calculate overall confidence score using Tesseract's mean confidence #[cfg(feature = "ocr")] - fn calculate_overall_confidence(&self, _tesseract: &mut Tesseract) -> Result { - // Note: get_word_confidences may not be available in current tesseract crate version - // For now, we'll estimate confidence based on text quality - // This can be enhanced when the API is available or with alternative methods + fn calculate_overall_confidence(&self, tesseract: &mut Tesseract) -> Result { + // Use Tesseract's built-in mean confidence calculation + let confidence = tesseract.mean_text_conf(); - // Return a reasonable default confidence for now - Ok(85.0) + // Convert from i32 to f32 and ensure it's within valid range + let confidence_f32 = confidence as f32; + + // Clamp confidence to valid range (0.0 to 100.0) + let clamped_confidence = confidence_f32.max(0.0).min(100.0); + + debug!("Tesseract confidence: {} -> {:.1}%", confidence, clamped_confidence); + + Ok(clamped_confidence) } /// Detect and correct image orientation diff --git a/src/routes/documents.rs b/src/routes/documents.rs index 010cc1b..048899b 100644 --- a/src/routes/documents.rs +++ b/src/routes/documents.rs @@ -53,6 +53,7 @@ pub fn router() -> Router> { .route("/failed-ocr", get(get_failed_ocr_documents)) .route("/duplicates", get(get_user_duplicates)) .route("/delete-low-confidence", post(delete_low_confidence_documents)) + .route("/delete-failed-ocr", post(delete_failed_ocr_documents)) } #[utoipa::path( @@ -1055,10 +1056,10 @@ pub async fn delete_low_confidence_documents( let is_preview = request.preview_only.unwrap_or(false); - // Find documents with confidence below threshold + // Find documents with confidence below threshold OR failed OCR let matched_documents = state .db - .find_documents_by_confidence_threshold(request.max_confidence, auth_user.user.id, auth_user.user.role) + .find_low_confidence_and_failed_documents(request.max_confidence, auth_user.user.id, auth_user.user.role) .await .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; @@ -1136,4 +1137,100 @@ pub async fn delete_low_confidence_documents( "ignored_file_creation_failures": ignored_file_creation_failures, "deleted_document_ids": deleted_documents.iter().map(|d| d.id).collect::>() }))) +} + +/// Delete all documents with failed OCR processing +pub async fn delete_failed_ocr_documents( + State(state): State>, + auth_user: AuthUser, + Json(request): Json, +) -> Result, StatusCode> { + let is_preview = request.get("preview_only").and_then(|v| v.as_bool()).unwrap_or(false); + + // Find documents with failed OCR + let matched_documents = state + .db + .find_failed_ocr_documents(auth_user.user.id, auth_user.user.role) + .await + .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; + + let matched_count = matched_documents.len(); + + if is_preview { + return Ok(Json(serde_json::json!({ + "success": true, + "message": format!("Found {} documents with failed OCR processing", matched_count), + "matched_count": matched_count, + "preview": true, + "document_ids": matched_documents.iter().map(|d| 
diff --git a/src/routes/documents.rs b/src/routes/documents.rs
index 010cc1b..048899b 100644
--- a/src/routes/documents.rs
+++ b/src/routes/documents.rs
@@ -53,6 +53,7 @@ pub fn router() -> Router<Arc<AppState>> {
         .route("/failed-ocr", get(get_failed_ocr_documents))
         .route("/duplicates", get(get_user_duplicates))
         .route("/delete-low-confidence", post(delete_low_confidence_documents))
+        .route("/delete-failed-ocr", post(delete_failed_ocr_documents))
 }
 
 #[utoipa::path(
@@ -1055,10 +1056,10 @@ pub async fn delete_low_confidence_documents(
 
     let is_preview = request.preview_only.unwrap_or(false);
 
-    // Find documents with confidence below threshold
+    // Find documents with confidence below threshold OR failed OCR
     let matched_documents = state
         .db
-        .find_documents_by_confidence_threshold(request.max_confidence, auth_user.user.id, auth_user.user.role)
+        .find_low_confidence_and_failed_documents(request.max_confidence, auth_user.user.id, auth_user.user.role)
         .await
         .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
 
@@ -1136,4 +1137,100 @@ pub async fn delete_low_confidence_documents(
         "ignored_file_creation_failures": ignored_file_creation_failures,
         "deleted_document_ids": deleted_documents.iter().map(|d| d.id).collect::<Vec<Uuid>>()
     })))
+}
+
+/// Delete all documents with failed OCR processing
+pub async fn delete_failed_ocr_documents(
+    State(state): State<Arc<AppState>>,
+    auth_user: AuthUser,
+    Json(request): Json<serde_json::Value>,
+) -> Result<Json<serde_json::Value>, StatusCode> {
+    let is_preview = request.get("preview_only").and_then(|v| v.as_bool()).unwrap_or(false);
+
+    // Find documents with failed OCR
+    let matched_documents = state
+        .db
+        .find_failed_ocr_documents(auth_user.user.id, auth_user.user.role)
+        .await
+        .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
+
+    let matched_count = matched_documents.len();
+
+    if is_preview {
+        return Ok(Json(serde_json::json!({
+            "success": true,
+            "message": format!("Found {} documents with failed OCR processing", matched_count),
+            "matched_count": matched_count,
+            "preview": true,
+            "document_ids": matched_documents.iter().map(|d| d.id).collect::<Vec<Uuid>>()
+        })));
+    }
+
+    if matched_documents.is_empty() {
+        return Ok(Json(serde_json::json!({
+            "success": true,
+            "message": "No documents found with failed OCR processing",
+            "deleted_count": 0
+        })));
+    }
+
+    // Extract document IDs for bulk deletion
+    let document_ids: Vec<Uuid> = matched_documents.iter().map(|d| d.id).collect();
+
+    // Use existing bulk delete logic
+    let deleted_documents = state
+        .db
+        .bulk_delete_documents(&document_ids, auth_user.user.id, auth_user.user.role)
+        .await
+        .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
+
+    // Create ignored file records for all successfully deleted documents
+    let mut ignored_file_creation_failures = 0;
+    for document in &deleted_documents {
+        let reason = if let Some(ref error) = document.ocr_error {
+            format!("deleted due to failed OCR processing: {}", error)
+        } else {
+            "deleted due to failed OCR processing".to_string()
+        };
+
+        if let Err(e) = crate::db::ignored_files::create_ignored_file_from_document(
+            state.db.get_pool(),
+            document.id,
+            auth_user.user.id,
+            Some(reason),
+            None,
+            None,
+            None,
+        ).await {
+            ignored_file_creation_failures += 1;
+            tracing::warn!("Failed to create ignored file record for document {}: {}", document.id, e);
+        }
+    }
+
+    let file_service = FileService::new(state.config.upload_path.clone());
+    let mut successful_file_deletions = 0;
+    let mut failed_file_deletions = 0;
+
+    for document in &deleted_documents {
+        match file_service.delete_document_files(document).await {
+            Ok(_) => successful_file_deletions += 1,
+            Err(e) => {
+                failed_file_deletions += 1;
+                tracing::warn!("Failed to delete files for document {}: {}", document.id, e);
+            }
+        }
+    }
+
+    let deleted_count = deleted_documents.len();
+
+    Ok(Json(serde_json::json!({
+        "success": true,
+        "message": format!("Successfully deleted {} documents with failed OCR processing", deleted_count),
+        "deleted_count": deleted_count,
+        "matched_count": matched_count,
+        "successful_file_deletions": successful_file_deletions,
+        "failed_file_deletions": failed_file_deletions,
+        "ignored_file_creation_failures": ignored_file_creation_failures,
+        "deleted_document_ids": deleted_documents.iter().map(|d| d.id).collect::<Vec<Uuid>>()
+    })))
 }
\ No newline at end of file
diff --git a/src/tests/document_routes_tests.rs b/src/tests/document_routes_tests.rs
index 12ae45d..7627db8 100644
--- a/src/tests/document_routes_tests.rs
+++ b/src/tests/document_routes_tests.rs
@@ -633,4 +633,304 @@ mod document_routes_deletion_tests {
         // This should result in zero matched documents
     }
 }
+
+    #[cfg(test)]
+    mod delete_failed_ocr_tests {
+        use super::*;
+        use serde_json::json;
+
+        #[test]
+        fn test_delete_failed_ocr_request_serialization() {
+            // Test preview mode
+            let preview_request = json!({
+                "preview_only": true
+            });
+
+            let parsed: serde_json::Value = serde_json::from_value(preview_request).unwrap();
+            assert_eq!(parsed["preview_only"], true);
+
+            // Test delete mode
+            let delete_request = json!({
+                "preview_only": false
+            });
+
+            let parsed: serde_json::Value = serde_json::from_value(delete_request).unwrap();
+            assert_eq!(parsed["preview_only"], false);
+
+            // Test empty request (should default to preview_only: false)
+            let empty_request = json!({});
+
+            let parsed: serde_json::Value = serde_json::from_value(empty_request).unwrap();
+            assert!(parsed.get("preview_only").is_none() || parsed["preview_only"] == false);
+        }
+
+        #[test]
+        fn test_delete_failed_ocr_user_authorization() {
+            let admin_user = create_test_user(UserRole::Admin);
+            let regular_user =
create_test_user(UserRole::User); + + // Both admins and regular users should be able to delete their own failed documents + assert_eq!(admin_user.role, UserRole::Admin); + assert_eq!(regular_user.role, UserRole::User); + + // Admin should be able to see all failed documents + // Regular user should only see their own failed documents + // This logic would be tested in the actual endpoint implementation + } + + #[test] + fn test_failed_document_criteria() { + let user_id = Uuid::new_v4(); + + // Test document with failed OCR status + let mut failed_doc = create_test_document(user_id); + failed_doc.ocr_status = Some("failed".to_string()); + failed_doc.ocr_confidence = None; + failed_doc.ocr_error = Some("OCR processing failed".to_string()); + + // Should be included in failed document deletion + assert_eq!(failed_doc.ocr_status, Some("failed".to_string())); + assert!(failed_doc.ocr_confidence.is_none()); + + // Test document with NULL confidence but completed status + let mut null_confidence_doc = create_test_document(user_id); + null_confidence_doc.ocr_status = Some("completed".to_string()); + null_confidence_doc.ocr_confidence = None; + null_confidence_doc.ocr_text = Some("Text but no confidence".to_string()); + + // Should be included in failed document deletion (NULL confidence indicates failure) + assert_eq!(null_confidence_doc.ocr_status, Some("completed".to_string())); + assert!(null_confidence_doc.ocr_confidence.is_none()); + + // Test document with successful OCR + let mut success_doc = create_test_document(user_id); + success_doc.ocr_status = Some("completed".to_string()); + success_doc.ocr_confidence = Some(85.0); + success_doc.ocr_text = Some("Successfully extracted text".to_string()); + + // Should NOT be included in failed document deletion + assert_eq!(success_doc.ocr_status, Some("completed".to_string())); + assert!(success_doc.ocr_confidence.is_some()); + + // Test document with pending status + let mut pending_doc = create_test_document(user_id); + pending_doc.ocr_status = Some("pending".to_string()); + pending_doc.ocr_confidence = None; + + // Should NOT be included in failed document deletion (still processing) + assert_eq!(pending_doc.ocr_status, Some("pending".to_string())); + + // Test document with processing status + let mut processing_doc = create_test_document(user_id); + processing_doc.ocr_status = Some("processing".to_string()); + processing_doc.ocr_confidence = None; + + // Should NOT be included in failed document deletion (still processing) + assert_eq!(processing_doc.ocr_status, Some("processing".to_string())); + } + + #[test] + fn test_delete_failed_ocr_response_format() { + // Test preview response format + let preview_response = json!({ + "success": true, + "message": "Found 5 documents with failed OCR processing", + "matched_count": 5, + "preview": true, + "document_ids": ["id1", "id2", "id3", "id4", "id5"] + }); + + assert_eq!(preview_response["success"], true); + assert_eq!(preview_response["matched_count"], 5); + assert_eq!(preview_response["preview"], true); + assert!(preview_response["document_ids"].is_array()); + + // Test delete response format + let delete_response = json!({ + "success": true, + "message": "Successfully deleted 3 documents with failed OCR processing", + "deleted_count": 3, + "matched_count": 3, + "successful_file_deletions": 3, + "failed_file_deletions": 0, + "ignored_file_creation_failures": 0, + "deleted_document_ids": ["id1", "id2", "id3"] + }); + + assert_eq!(delete_response["success"], true); + 
assert_eq!(delete_response["deleted_count"], 3); + assert_eq!(delete_response["matched_count"], 3); + assert!(delete_response["deleted_document_ids"].is_array()); + assert!(delete_response.get("preview").is_none()); // Should not have preview flag in delete response + + // Test no documents found response + let no_docs_response = json!({ + "success": true, + "message": "No documents found with failed OCR processing", + "deleted_count": 0 + }); + + assert_eq!(no_docs_response["success"], true); + assert_eq!(no_docs_response["deleted_count"], 0); + } + + #[test] + fn test_delete_failed_ocr_error_scenarios() { + // Test with no failed documents + let no_failed_docs_request = json!({ + "preview_only": true + }); + + // Should return success with 0 matched count + // This would be tested in integration tests with actual database + + // Test with file deletion failures + let file_deletion_error = json!({ + "success": true, + "message": "Successfully deleted 2 documents with failed OCR processing", + "deleted_count": 2, + "matched_count": 2, + "successful_file_deletions": 1, + "failed_file_deletions": 1, + "ignored_file_creation_failures": 0, + "deleted_document_ids": ["id1", "id2"] + }); + + // Should still report success but indicate file deletion issues + assert_eq!(file_deletion_error["success"], true); + assert_eq!(file_deletion_error["failed_file_deletions"], 1); + + // Test with ignored file creation failures + let ignored_file_error = json!({ + "success": true, + "message": "Successfully deleted 2 documents with failed OCR processing", + "deleted_count": 2, + "matched_count": 2, + "successful_file_deletions": 2, + "failed_file_deletions": 0, + "ignored_file_creation_failures": 1, + "deleted_document_ids": ["id1", "id2"] + }); + + assert_eq!(ignored_file_error["success"], true); + assert_eq!(ignored_file_error["ignored_file_creation_failures"], 1); + } + + #[test] + fn test_delete_failed_ocr_failure_reason_handling() { + let user_id = Uuid::new_v4(); + + // Test document with specific failure reason + let mut ocr_timeout_doc = create_test_document(user_id); + ocr_timeout_doc.ocr_status = Some("failed".to_string()); + ocr_timeout_doc.ocr_error = Some("OCR processing timed out after 2 minutes".to_string()); + + // Test document with corruption error + let mut corruption_doc = create_test_document(user_id); + corruption_doc.ocr_status = Some("failed".to_string()); + corruption_doc.ocr_error = Some("Invalid image format - file appears corrupted".to_string()); + + // Test document with font encoding error + let mut font_error_doc = create_test_document(user_id); + font_error_doc.ocr_status = Some("failed".to_string()); + font_error_doc.ocr_error = Some("PDF text extraction failed due to font encoding issues".to_string()); + + // All should be valid candidates for deletion + assert!(ocr_timeout_doc.ocr_error.is_some()); + assert!(corruption_doc.ocr_error.is_some()); + assert!(font_error_doc.ocr_error.is_some()); + + // The deletion should create appropriate ignored file records with the error reasons + } + + #[test] + fn test_delete_failed_ocr_ignored_file_creation() { + // Test that deleted failed documents create proper ignored file records + let user_id = Uuid::new_v4(); + + let mut failed_doc = create_test_document(user_id); + failed_doc.ocr_status = Some("failed".to_string()); + failed_doc.ocr_error = Some("OCR processing failed due to corrupted image".to_string()); + + // Expected ignored file reason should include the error + let expected_reason = "deleted due to failed OCR processing: 
OCR processing failed due to corrupted image"; + + // In the actual implementation, this would be tested by verifying the ignored file record + assert!(failed_doc.ocr_error.is_some()); + + // Test document with no specific error + let mut failed_no_error_doc = create_test_document(user_id); + failed_no_error_doc.ocr_status = Some("failed".to_string()); + failed_no_error_doc.ocr_error = None; + + // Should use generic reason + let expected_generic_reason = "deleted due to failed OCR processing"; + + // Both should result in appropriate ignored file records + assert_eq!(failed_doc.ocr_status, Some("failed".to_string())); + assert_eq!(failed_no_error_doc.ocr_status, Some("failed".to_string())); + } + + #[test] + fn test_delete_failed_ocr_vs_low_confidence_distinction() { + let user_id = Uuid::new_v4(); + + // Failed OCR document (should be in failed deletion, not low confidence) + let mut failed_doc = create_test_document(user_id); + failed_doc.ocr_status = Some("failed".to_string()); + failed_doc.ocr_confidence = None; + + // Low confidence document (should be in low confidence deletion, not failed) + let mut low_confidence_doc = create_test_document(user_id); + low_confidence_doc.ocr_status = Some("completed".to_string()); + low_confidence_doc.ocr_confidence = Some(25.0); + + // NULL confidence but completed (edge case - should be in failed deletion) + let mut null_confidence_doc = create_test_document(user_id); + null_confidence_doc.ocr_status = Some("completed".to_string()); + null_confidence_doc.ocr_confidence = None; + + // High confidence document (should be in neither) + let mut high_confidence_doc = create_test_document(user_id); + high_confidence_doc.ocr_status = Some("completed".to_string()); + high_confidence_doc.ocr_confidence = Some(95.0); + + // Verify the logic for each type + assert_eq!(failed_doc.ocr_status, Some("failed".to_string())); + assert!(failed_doc.ocr_confidence.is_none()); + + assert_eq!(low_confidence_doc.ocr_status, Some("completed".to_string())); + assert!(low_confidence_doc.ocr_confidence.unwrap() < 50.0); + + assert_eq!(null_confidence_doc.ocr_status, Some("completed".to_string())); + assert!(null_confidence_doc.ocr_confidence.is_none()); + + assert_eq!(high_confidence_doc.ocr_status, Some("completed".to_string())); + assert!(high_confidence_doc.ocr_confidence.unwrap() > 50.0); + } + + #[test] + fn test_delete_failed_ocr_endpoint_path() { + // Test that the endpoint path is correct + let endpoint_path = "/api/documents/delete-failed-ocr"; + + // This would be used in integration tests + assert!(endpoint_path.contains("delete-failed-ocr")); + assert!(endpoint_path.starts_with("/api/documents/")); + } + + #[test] + fn test_delete_failed_ocr_http_methods() { + // The endpoint should only accept POST requests + // GET, PUT, DELETE should not be allowed + + // This would be tested in integration tests with actual HTTP requests + let allowed_method = "POST"; + let disallowed_methods = vec!["GET", "PUT", "DELETE", "PATCH"]; + + assert_eq!(allowed_method, "POST"); + assert!(disallowed_methods.contains(&"GET")); + assert!(disallowed_methods.contains(&"DELETE")); + } + } } \ No newline at end of file diff --git a/src/tests/documents_tests.rs b/src/tests/documents_tests.rs index 0291c29..17b1050 100644 --- a/src/tests/documents_tests.rs +++ b/src/tests/documents_tests.rs @@ -1796,4 +1796,398 @@ mod deletion_error_handling_tests { } } } + + #[tokio::test] + async fn test_find_failed_ocr_documents() { + use testcontainers::{runners::AsyncRunner}; + use 
testcontainers_modules::postgres::Postgres; + + let postgres_image = Postgres::default(); + let container = postgres_image.start().await.expect("Failed to start postgres container"); + let port = container.get_host_port_ipv4(5432).await.expect("Failed to get postgres port"); + + // Use TEST_DATABASE_URL if available, otherwise use the container + let connection_string = std::env::var("TEST_DATABASE_URL") + .unwrap_or_else(|_| format!("postgres://postgres:postgres@127.0.0.1:{}/postgres", port)); + let database = Database::new(&connection_string).await.unwrap(); + database.migrate().await.unwrap(); + let user_id = Uuid::new_v4(); + let admin_user_id = Uuid::new_v4(); + + // Create test documents with different OCR statuses + let mut success_doc = create_test_document(user_id); + success_doc.ocr_status = Some("completed".to_string()); + success_doc.ocr_confidence = Some(85.0); + success_doc.ocr_text = Some("Successfully extracted text".to_string()); + + let mut failed_doc = create_test_document(user_id); + failed_doc.ocr_status = Some("failed".to_string()); + failed_doc.ocr_confidence = None; + failed_doc.ocr_text = None; + failed_doc.ocr_error = Some("OCR processing failed due to corrupted image".to_string()); + + let mut null_confidence_doc = create_test_document(user_id); + null_confidence_doc.ocr_status = Some("completed".to_string()); + null_confidence_doc.ocr_confidence = None; // NULL confidence but not failed + null_confidence_doc.ocr_text = Some("Text extracted but no confidence".to_string()); + + let mut pending_doc = create_test_document(user_id); + pending_doc.ocr_status = Some("pending".to_string()); + pending_doc.ocr_confidence = None; + pending_doc.ocr_text = None; + + let mut processing_doc = create_test_document(user_id); + processing_doc.ocr_status = Some("processing".to_string()); + processing_doc.ocr_confidence = None; + processing_doc.ocr_text = None; + + // Different user's failed document + let mut other_user_failed_doc = create_test_document(admin_user_id); + other_user_failed_doc.ocr_status = Some("failed".to_string()); + other_user_failed_doc.ocr_confidence = None; + + // Insert all documents + let success_id = database.create_document(success_doc).await.unwrap().id; + let failed_id = database.create_document(failed_doc).await.unwrap().id; + let null_confidence_id = database.create_document(null_confidence_doc).await.unwrap().id; + let pending_id = database.create_document(pending_doc).await.unwrap().id; + let processing_id = database.create_document(processing_doc).await.unwrap().id; + let other_user_failed_id = database.create_document(other_user_failed_doc).await.unwrap().id; + + // Test as regular user + let failed_docs = database + .find_failed_ocr_documents(user_id, crate::models::UserRole::User) + .await + .unwrap(); + + // Should find: failed_doc and null_confidence_doc (but not pending/processing) + assert_eq!(failed_docs.len(), 2); + let failed_ids: Vec = failed_docs.iter().map(|d| d.id).collect(); + assert!(failed_ids.contains(&failed_id)); + assert!(failed_ids.contains(&null_confidence_id)); + assert!(!failed_ids.contains(&success_id)); + assert!(!failed_ids.contains(&pending_id)); + assert!(!failed_ids.contains(&processing_id)); + assert!(!failed_ids.contains(&other_user_failed_id)); // Different user + + // Test as admin + let admin_failed_docs = database + .find_failed_ocr_documents(admin_user_id, crate::models::UserRole::Admin) + .await + .unwrap(); + + // Should find all failed documents (from all users) + assert!(admin_failed_docs.len() >= 3); // At 
least our 3 failed docs + let admin_failed_ids: Vec = admin_failed_docs.iter().map(|d| d.id).collect(); + assert!(admin_failed_ids.contains(&failed_id)); + assert!(admin_failed_ids.contains(&null_confidence_id)); + assert!(admin_failed_ids.contains(&other_user_failed_id)); + } + + #[tokio::test] + async fn test_find_low_confidence_and_failed_documents() { + use testcontainers::{runners::AsyncRunner}; + use testcontainers_modules::postgres::Postgres; + + let postgres_image = Postgres::default(); + let container = postgres_image.start().await.expect("Failed to start postgres container"); + let port = container.get_host_port_ipv4(5432).await.expect("Failed to get postgres port"); + + // Use TEST_DATABASE_URL if available, otherwise use the container + let connection_string = std::env::var("TEST_DATABASE_URL") + .unwrap_or_else(|_| format!("postgres://postgres:postgres@127.0.0.1:{}/postgres", port)); + let database = Database::new(&connection_string).await.unwrap(); + database.migrate().await.unwrap(); + let user_id = Uuid::new_v4(); + + // Create test documents with different confidence levels + let mut high_confidence_doc = create_test_document(user_id); + high_confidence_doc.ocr_confidence = Some(95.0); + high_confidence_doc.ocr_status = Some("completed".to_string()); + + let mut medium_confidence_doc = create_test_document(user_id); + medium_confidence_doc.ocr_confidence = Some(65.0); + medium_confidence_doc.ocr_status = Some("completed".to_string()); + + let mut low_confidence_doc = create_test_document(user_id); + low_confidence_doc.ocr_confidence = Some(25.0); + low_confidence_doc.ocr_status = Some("completed".to_string()); + + let mut failed_doc = create_test_document(user_id); + failed_doc.ocr_status = Some("failed".to_string()); + failed_doc.ocr_confidence = None; + failed_doc.ocr_error = Some("Processing failed".to_string()); + + let mut null_confidence_doc = create_test_document(user_id); + null_confidence_doc.ocr_status = Some("completed".to_string()); + null_confidence_doc.ocr_confidence = None; + + let mut pending_doc = create_test_document(user_id); + pending_doc.ocr_status = Some("pending".to_string()); + pending_doc.ocr_confidence = None; + + // Insert all documents + let high_id = database.create_document(high_confidence_doc).await.unwrap().id; + let medium_id = database.create_document(medium_confidence_doc).await.unwrap().id; + let low_id = database.create_document(low_confidence_doc).await.unwrap().id; + let failed_id = database.create_document(failed_doc).await.unwrap().id; + let null_confidence_id = database.create_document(null_confidence_doc).await.unwrap().id; + let pending_id = database.create_document(pending_doc).await.unwrap().id; + + // Test with threshold of 50% - should include low confidence, failed, and null confidence + let threshold_50_docs = database + .find_low_confidence_and_failed_documents(50.0, user_id, crate::models::UserRole::User) + .await + .unwrap(); + + assert_eq!(threshold_50_docs.len(), 3); + let threshold_50_ids: Vec = threshold_50_docs.iter().map(|d| d.id).collect(); + assert!(threshold_50_ids.contains(&low_id)); // 25% confidence + assert!(threshold_50_ids.contains(&failed_id)); // failed status + assert!(threshold_50_ids.contains(&null_confidence_id)); // NULL confidence + assert!(!threshold_50_ids.contains(&high_id)); // 95% confidence + assert!(!threshold_50_ids.contains(&medium_id)); // 65% confidence + assert!(!threshold_50_ids.contains(&pending_id)); // pending status + + // Test with threshold of 70% - should include low and medium 
confidence, failed, and null confidence + let threshold_70_docs = database + .find_low_confidence_and_failed_documents(70.0, user_id, crate::models::UserRole::User) + .await + .unwrap(); + + assert_eq!(threshold_70_docs.len(), 4); + let threshold_70_ids: Vec = threshold_70_docs.iter().map(|d| d.id).collect(); + assert!(threshold_70_ids.contains(&low_id)); // 25% confidence + assert!(threshold_70_ids.contains(&medium_id)); // 65% confidence + assert!(threshold_70_ids.contains(&failed_id)); // failed status + assert!(threshold_70_ids.contains(&null_confidence_id)); // NULL confidence + assert!(!threshold_70_ids.contains(&high_id)); // 95% confidence + assert!(!threshold_70_ids.contains(&pending_id)); // pending status + + // Test with threshold of 100% - should include all except pending/processing + let threshold_100_docs = database + .find_low_confidence_and_failed_documents(100.0, user_id, crate::models::UserRole::User) + .await + .unwrap(); + + assert_eq!(threshold_100_docs.len(), 5); + let threshold_100_ids: Vec = threshold_100_docs.iter().map(|d| d.id).collect(); + assert!(threshold_100_ids.contains(&high_id)); // 95% confidence + assert!(threshold_100_ids.contains(&medium_id)); // 65% confidence + assert!(threshold_100_ids.contains(&low_id)); // 25% confidence + assert!(threshold_100_ids.contains(&failed_id)); // failed status + assert!(threshold_100_ids.contains(&null_confidence_id)); // NULL confidence + assert!(!threshold_100_ids.contains(&pending_id)); // pending status + + // Test with threshold of 0% - should only include failed and null confidence + let threshold_0_docs = database + .find_low_confidence_and_failed_documents(0.0, user_id, crate::models::UserRole::User) + .await + .unwrap(); + + assert_eq!(threshold_0_docs.len(), 2); + let threshold_0_ids: Vec = threshold_0_docs.iter().map(|d| d.id).collect(); + assert!(threshold_0_ids.contains(&failed_id)); // failed status + assert!(threshold_0_ids.contains(&null_confidence_id)); // NULL confidence + assert!(!threshold_0_ids.contains(&high_id)); // 95% confidence + assert!(!threshold_0_ids.contains(&medium_id)); // 65% confidence + assert!(!threshold_0_ids.contains(&low_id)); // 25% confidence + assert!(!threshold_0_ids.contains(&pending_id)); // pending status + } + + #[tokio::test] + async fn test_find_documents_by_confidence_threshold_original_behavior() { + use testcontainers::{runners::AsyncRunner}; + use testcontainers_modules::postgres::Postgres; + + let postgres_image = Postgres::default(); + let container = postgres_image.start().await.expect("Failed to start postgres container"); + let port = container.get_host_port_ipv4(5432).await.expect("Failed to get postgres port"); + + // Use TEST_DATABASE_URL if available, otherwise use the container + let connection_string = std::env::var("TEST_DATABASE_URL") + .unwrap_or_else(|_| format!("postgres://postgres:postgres@127.0.0.1:{}/postgres", port)); + let database = Database::new(&connection_string).await.unwrap(); + database.migrate().await.unwrap(); + let user_id = Uuid::new_v4(); + + // Create test documents to verify original behavior is preserved + let mut high_confidence_doc = create_test_document(user_id); + high_confidence_doc.ocr_confidence = Some(90.0); + high_confidence_doc.ocr_status = Some("completed".to_string()); + + let mut low_confidence_doc = create_test_document(user_id); + low_confidence_doc.ocr_confidence = Some(40.0); + low_confidence_doc.ocr_status = Some("completed".to_string()); + + let mut null_confidence_doc = create_test_document(user_id); + 
null_confidence_doc.ocr_confidence = None; + null_confidence_doc.ocr_status = Some("completed".to_string()); + + let mut failed_doc = create_test_document(user_id); + failed_doc.ocr_confidence = None; + failed_doc.ocr_status = Some("failed".to_string()); + + // Insert documents + let high_id = database.create_document(high_confidence_doc).await.unwrap().id; + let low_id = database.create_document(low_confidence_doc).await.unwrap().id; + let null_confidence_id = database.create_document(null_confidence_doc).await.unwrap().id; + let failed_id = database.create_document(failed_doc).await.unwrap().id; + + // Test original method - should only find documents with explicit confidence below threshold + let original_results = database + .find_documents_by_confidence_threshold(50.0, user_id, crate::models::UserRole::User) + .await + .unwrap(); + + // Should only include low_confidence_doc (40%), not NULL confidence or failed docs + assert_eq!(original_results.len(), 1); + assert_eq!(original_results[0].id, low_id); + + let original_ids: Vec = original_results.iter().map(|d| d.id).collect(); + assert!(!original_ids.contains(&high_id)); // 90% > 50% + assert!(!original_ids.contains(&null_confidence_id)); // NULL confidence excluded + assert!(!original_ids.contains(&failed_id)); // NULL confidence excluded + } + + #[tokio::test] + async fn test_confidence_query_ordering() { + use testcontainers::{runners::AsyncRunner}; + use testcontainers_modules::postgres::Postgres; + + let postgres_image = Postgres::default(); + let container = postgres_image.start().await.expect("Failed to start postgres container"); + let port = container.get_host_port_ipv4(5432).await.expect("Failed to get postgres port"); + + // Use TEST_DATABASE_URL if available, otherwise use the container + let connection_string = std::env::var("TEST_DATABASE_URL") + .unwrap_or_else(|_| format!("postgres://postgres:postgres@127.0.0.1:{}/postgres", port)); + let database = Database::new(&connection_string).await.unwrap(); + database.migrate().await.unwrap(); + let user_id = Uuid::new_v4(); + + // Create documents with different confidence levels and statuses + let mut confidence_10_doc = create_test_document(user_id); + confidence_10_doc.ocr_confidence = Some(10.0); + confidence_10_doc.ocr_status = Some("completed".to_string()); + + let mut confidence_30_doc = create_test_document(user_id); + confidence_30_doc.ocr_confidence = Some(30.0); + confidence_30_doc.ocr_status = Some("completed".to_string()); + + let mut failed_doc = create_test_document(user_id); + failed_doc.ocr_confidence = None; + failed_doc.ocr_status = Some("failed".to_string()); + + let mut null_confidence_doc = create_test_document(user_id); + null_confidence_doc.ocr_confidence = None; + null_confidence_doc.ocr_status = Some("completed".to_string()); + + // Insert documents + let id_10 = database.create_document(confidence_10_doc).await.unwrap().id; + let id_30 = database.create_document(confidence_30_doc).await.unwrap().id; + let failed_id = database.create_document(failed_doc).await.unwrap().id; + let null_id = database.create_document(null_confidence_doc).await.unwrap().id; + + // Test ordering in combined query + let results = database + .find_low_confidence_and_failed_documents(50.0, user_id, crate::models::UserRole::User) + .await + .unwrap(); + + assert_eq!(results.len(), 4); + + // Check that documents with actual confidence are ordered by confidence (ascending) + // and NULL confidence documents come first (due to CASE WHEN ordering) + let confidence_values: Vec> = 
results.iter().map(|d| d.ocr_confidence).collect(); + + // First two should be NULL confidence (failed and completed with NULL) + assert!(confidence_values[0].is_none()); + assert!(confidence_values[1].is_none()); + + // Next should be lowest confidence + assert_eq!(confidence_values[2], Some(10.0)); + + // Last should be higher confidence + assert_eq!(confidence_values[3], Some(30.0)); + } + + #[tokio::test] + async fn test_user_isolation_in_confidence_queries() { + use testcontainers::{runners::AsyncRunner}; + use testcontainers_modules::postgres::Postgres; + + let postgres_image = Postgres::default(); + let container = postgres_image.start().await.expect("Failed to start postgres container"); + let port = container.get_host_port_ipv4(5432).await.expect("Failed to get postgres port"); + + // Use TEST_DATABASE_URL if available, otherwise use the container + let connection_string = std::env::var("TEST_DATABASE_URL") + .unwrap_or_else(|_| format!("postgres://postgres:postgres@127.0.0.1:{}/postgres", port)); + let database = Database::new(&connection_string).await.unwrap(); + database.migrate().await.unwrap(); + let user1_id = Uuid::new_v4(); + let user2_id = Uuid::new_v4(); + + // Create documents for user1 + let mut user1_low_doc = create_test_document(user1_id); + user1_low_doc.ocr_confidence = Some(20.0); + + let mut user1_failed_doc = create_test_document(user1_id); + user1_failed_doc.ocr_status = Some("failed".to_string()); + user1_failed_doc.ocr_confidence = None; + + // Create documents for user2 + let mut user2_low_doc = create_test_document(user2_id); + user2_low_doc.ocr_confidence = Some(25.0); + + let mut user2_failed_doc = create_test_document(user2_id); + user2_failed_doc.ocr_status = Some("failed".to_string()); + user2_failed_doc.ocr_confidence = None; + + // Insert documents + let user1_low_id: Uuid = database.create_document(user1_low_doc).await.unwrap().id; + let user1_failed_id: Uuid = database.create_document(user1_failed_doc).await.unwrap().id; + let user2_low_id: Uuid = database.create_document(user2_low_doc).await.unwrap().id; + let user2_failed_id: Uuid = database.create_document(user2_failed_doc).await.unwrap().id; + + // Test user1 can only see their documents + let user1_results = database + .find_low_confidence_and_failed_documents(50.0, user1_id, crate::models::UserRole::User) + .await + .unwrap(); + + assert_eq!(user1_results.len(), 2); + let user1_ids: Vec = user1_results.iter().map(|d| d.id).collect(); + assert!(user1_ids.contains(&user1_low_id)); + assert!(user1_ids.contains(&user1_failed_id)); + assert!(!user1_ids.contains(&user2_low_id)); + assert!(!user1_ids.contains(&user2_failed_id)); + + // Test user2 can only see their documents + let user2_results = database + .find_low_confidence_and_failed_documents(50.0, user2_id, crate::models::UserRole::User) + .await + .unwrap(); + + assert_eq!(user2_results.len(), 2); + let user2_ids: Vec = user2_results.iter().map(|d| d.id).collect(); + assert!(user2_ids.contains(&user2_low_id)); + assert!(user2_ids.contains(&user2_failed_id)); + assert!(!user2_ids.contains(&user1_low_id)); + assert!(!user2_ids.contains(&user1_failed_id)); + + // Test admin can see all documents + let admin_results = database + .find_low_confidence_and_failed_documents(50.0, user1_id, crate::models::UserRole::Admin) + .await + .unwrap(); + + assert!(admin_results.len() >= 4); // At least our 4 test documents + let admin_ids: Vec = admin_results.iter().map(|d| d.id).collect(); + assert!(admin_ids.contains(&user1_low_id)); + 
assert!(admin_ids.contains(&user1_failed_id)); + assert!(admin_ids.contains(&user2_low_id)); + assert!(admin_ids.contains(&user2_failed_id)); + } } \ No newline at end of file diff --git a/src/tests/enhanced_ocr_tests.rs b/src/tests/enhanced_ocr_tests.rs new file mode 100644 index 0000000..efb17b3 --- /dev/null +++ b/src/tests/enhanced_ocr_tests.rs @@ -0,0 +1,455 @@ +#[cfg(test)] +mod tests { + use crate::ocr::enhanced::{EnhancedOcrService, OcrResult, ImageQualityStats}; + use crate::models::Settings; + use std::fs; + use tempfile::{NamedTempFile, TempDir}; + + fn create_test_settings() -> Settings { + Settings::default() + } + + fn create_temp_dir() -> TempDir { + TempDir::new().expect("Failed to create temp directory") + } + + #[test] + fn test_enhanced_ocr_service_creation() { + let temp_dir = create_temp_dir(); + let temp_path = temp_dir.path().to_str().unwrap().to_string(); + let service = EnhancedOcrService::new(temp_path); + + // Service should be created successfully + assert!(!service.temp_dir.is_empty()); + } + + #[test] + fn test_image_quality_stats_creation() { + let stats = ImageQualityStats { + average_brightness: 128.0, + contrast_ratio: 0.5, + noise_level: 0.1, + sharpness: 0.8, + }; + + assert_eq!(stats.average_brightness, 128.0); + assert_eq!(stats.contrast_ratio, 0.5); + assert_eq!(stats.noise_level, 0.1); + assert_eq!(stats.sharpness, 0.8); + } + + #[test] + fn test_ocr_result_structure() { + let result = OcrResult { + text: "Test text".to_string(), + confidence: 85.5, + processing_time_ms: 1500, + word_count: 2, + preprocessing_applied: vec!["noise_reduction".to_string()], + processed_image_path: Some("/tmp/processed.png".to_string()), + }; + + assert_eq!(result.text, "Test text"); + assert_eq!(result.confidence, 85.5); + assert_eq!(result.processing_time_ms, 1500); + assert_eq!(result.word_count, 2); + assert_eq!(result.preprocessing_applied.len(), 1); + assert!(result.processed_image_path.is_some()); + } + + #[tokio::test] + async fn test_extract_text_from_plain_text() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings = create_test_settings(); + + let temp_file = NamedTempFile::with_suffix(".txt").unwrap(); + let test_content = "This is a test text file with multiple words."; + fs::write(temp_file.path(), test_content).unwrap(); + + let result = service + .extract_text(temp_file.path().to_str().unwrap(), "text/plain", &settings) + .await; + + assert!(result.is_ok()); + let ocr_result = result.unwrap(); + assert_eq!(ocr_result.text.trim(), test_content); + assert_eq!(ocr_result.confidence, 100.0); // Plain text should be 100% confident + assert_eq!(ocr_result.word_count, 9); // "This is a test text file with multiple words" + assert!(ocr_result.processing_time_ms > 0); + assert!(ocr_result.preprocessing_applied.contains(&"Plain text read".to_string())); + } + + #[tokio::test] + async fn test_extract_text_with_context() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings = create_test_settings(); + + let temp_file = NamedTempFile::with_suffix(".txt").unwrap(); + let test_content = "Context test content"; + fs::write(temp_file.path(), test_content).unwrap(); + + let result = service + .extract_text_with_context( + temp_file.path().to_str().unwrap(), + "text/plain", + "test_file.txt", + 19, // Length of "Context test content" + &settings, + ) + .await; + + assert!(result.is_ok()); + let 
ocr_result = result.unwrap(); + assert_eq!(ocr_result.text.trim(), test_content); + assert_eq!(ocr_result.confidence, 100.0); + } + + #[tokio::test] + async fn test_extract_text_unsupported_mime_type() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings = create_test_settings(); + + let temp_file = NamedTempFile::new().unwrap(); + fs::write(temp_file.path(), "some content").unwrap(); + + let result = service + .extract_text(temp_file.path().to_str().unwrap(), "application/unknown", &settings) + .await; + + assert!(result.is_err()); + let error_msg = result.unwrap_err().to_string(); + assert!(error_msg.contains("Unsupported file type")); + } + + #[tokio::test] + async fn test_extract_text_nonexistent_file() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings = create_test_settings(); + + let result = service + .extract_text("/nonexistent/file.txt", "text/plain", &settings) + .await; + + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_extract_text_large_file_truncation() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings = create_test_settings(); + + let temp_file = NamedTempFile::with_suffix(".txt").unwrap(); + + // Create a file larger than the limit (50MB for text files) + let large_content = "A".repeat(60 * 1024 * 1024); // 60MB + fs::write(temp_file.path(), &large_content).unwrap(); + + let result = service + .extract_text(temp_file.path().to_str().unwrap(), "text/plain", &settings) + .await; + + // Should fail due to size limit + assert!(result.is_err()); + let error_msg = result.unwrap_err().to_string(); + assert!(error_msg.contains("too large")); + } + + #[cfg(feature = "ocr")] + #[test] + fn test_validate_ocr_quality_high_confidence() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let mut settings = create_test_settings(); + settings.ocr_min_confidence = 30.0; + + let result = OcrResult { + text: "This is high quality OCR text with good words.".to_string(), + confidence: 95.0, + processing_time_ms: 1000, + word_count: 9, + preprocessing_applied: vec![], + processed_image_path: None, + }; + + let is_valid = service.validate_ocr_quality(&result, &settings); + assert!(is_valid); + } + + #[cfg(feature = "ocr")] + #[test] + fn test_validate_ocr_quality_low_confidence() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let mut settings = create_test_settings(); + settings.ocr_min_confidence = 50.0; + + let result = OcrResult { + text: "Poor quality text".to_string(), + confidence: 25.0, // Below threshold + processing_time_ms: 1000, + word_count: 3, + preprocessing_applied: vec![], + processed_image_path: None, + }; + + let is_valid = service.validate_ocr_quality(&result, &settings); + assert!(!is_valid); + } + + #[cfg(feature = "ocr")] + #[test] + fn test_validate_ocr_quality_no_words() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings = create_test_settings(); + + let result = OcrResult { + text: "".to_string(), + confidence: 95.0, + processing_time_ms: 1000, + word_count: 0, // No words + preprocessing_applied: vec![], + processed_image_path: None, + }; + + let 
is_valid = service.validate_ocr_quality(&result, &settings); + assert!(!is_valid); + } + + #[cfg(feature = "ocr")] + #[test] + fn test_validate_ocr_quality_poor_character_distribution() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings = create_test_settings(); + + let result = OcrResult { + text: "!!!@@@###$$$%%%^^^&&&***".to_string(), // Mostly symbols, < 30% alphanumeric + confidence: 85.0, + processing_time_ms: 1000, + word_count: 1, + preprocessing_applied: vec![], + processed_image_path: None, + }; + + let is_valid = service.validate_ocr_quality(&result, &settings); + assert!(!is_valid); + } + + #[cfg(feature = "ocr")] + #[test] + fn test_validate_ocr_quality_good_character_distribution() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings = create_test_settings(); + + let result = OcrResult { + text: "The quick brown fox jumps over the lazy dog. 123".to_string(), // Good alphanumeric ratio + confidence: 85.0, + processing_time_ms: 1000, + word_count: 10, + preprocessing_applied: vec![], + processed_image_path: None, + }; + + let is_valid = service.validate_ocr_quality(&result, &settings); + assert!(is_valid); + } + + #[tokio::test] + async fn test_word_count_calculation() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings = create_test_settings(); + + let test_cases = vec![ + ("", 0), + ("word", 1), + ("two words", 2), + (" spaced words ", 2), + ("Multiple\nlines\nof\ntext", 4), + ("punctuation, words! work? correctly.", 4), + ]; + + for (content, expected_count) in test_cases { + let temp_file = NamedTempFile::with_suffix(".txt").unwrap(); + fs::write(temp_file.path(), content).unwrap(); + + let result = service + .extract_text(temp_file.path().to_str().unwrap(), "text/plain", &settings) + .await; + + assert!(result.is_ok()); + let ocr_result = result.unwrap(); + assert_eq!(ocr_result.word_count, expected_count, "Failed for content: '{}'", content); + } + } + + #[tokio::test] + async fn test_pdf_extraction_with_invalid_pdf() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings = create_test_settings(); + + let temp_file = NamedTempFile::with_suffix(".pdf").unwrap(); + fs::write(temp_file.path(), "Not a valid PDF").unwrap(); + + let result = service + .extract_text(temp_file.path().to_str().unwrap(), "application/pdf", &settings) + .await; + + assert!(result.is_err()); + let error_msg = result.unwrap_err().to_string(); + assert!(error_msg.contains("Invalid PDF") || error_msg.contains("Missing") || error_msg.contains("corrupted")); + } + + #[tokio::test] + async fn test_pdf_extraction_with_minimal_valid_pdf() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings = create_test_settings(); + + // Minimal PDF with "Hello" text + let pdf_content = b"%PDF-1.4 +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj +2 0 obj +<< /Type /Pages /Kids [3 0 R] /Count 1 >> +endobj +3 0 obj +<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /MediaBox [0 0 612 792] /Contents 5 0 R >> +endobj +4 0 obj +<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >> +endobj +5 0 obj +<< /Length 44 >> +stream +BT +/F1 12 Tf +100 700 Td +(Hello) Tj +ET 
+endstream +endobj +xref +0 6 +0000000000 65535 f +0000000009 00000 n +0000000058 00000 n +0000000115 00000 n +0000000262 00000 n +0000000341 00000 n +trailer +<< /Size 6 /Root 1 0 R >> +startxref +435 +%%EOF"; + + let temp_file = NamedTempFile::with_suffix(".pdf").unwrap(); + fs::write(temp_file.path(), pdf_content).unwrap(); + + let result = service + .extract_text(temp_file.path().to_str().unwrap(), "application/pdf", &settings) + .await; + + match result { + Ok(ocr_result) => { + // PDF extraction succeeded + assert_eq!(ocr_result.confidence, 95.0); // PDF text extraction should be high confidence + assert!(ocr_result.processing_time_ms > 0); + assert!(ocr_result.preprocessing_applied.contains(&"PDF text extraction".to_string())); + println!("PDF extracted text: '{}'", ocr_result.text); + } + Err(e) => { + // PDF extraction might fail depending on the pdf-extract library + println!("PDF extraction failed (may be expected): {}", e); + } + } + } + + #[tokio::test] + async fn test_pdf_size_limit() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings = create_test_settings(); + + let temp_file = NamedTempFile::with_suffix(".pdf").unwrap(); + + // Create a file larger than the 100MB PDF limit + let large_pdf_content = format!("%PDF-1.4\n{}", "A".repeat(110 * 1024 * 1024)); + fs::write(temp_file.path(), large_pdf_content).unwrap(); + + let result = service + .extract_text(temp_file.path().to_str().unwrap(), "application/pdf", &settings) + .await; + + assert!(result.is_err()); + let error_msg = result.unwrap_err().to_string(); + assert!(error_msg.contains("too large")); + } + + #[test] + fn test_settings_default_values() { + let settings = Settings::default(); + + // Test that OCR-related settings have reasonable defaults + assert_eq!(settings.ocr_min_confidence, 30.0); + assert_eq!(settings.ocr_dpi, 300); + assert_eq!(settings.ocr_page_segmentation_mode, 3); + assert_eq!(settings.ocr_engine_mode, 3); + assert!(settings.enable_background_ocr); + assert!(settings.ocr_enhance_contrast); + assert!(settings.ocr_remove_noise); + assert!(settings.ocr_detect_orientation); + } + + #[tokio::test] + async fn test_concurrent_ocr_processing() { + let temp_dir = create_temp_dir(); + let service = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings = create_test_settings(); + + let mut handles = vec![]; + + // Process multiple files concurrently + for i in 0..5 { + let temp_file = NamedTempFile::with_suffix(".txt").unwrap(); + let content = format!("Concurrent test content {}", i); + fs::write(temp_file.path(), &content).unwrap(); + + let service_clone = EnhancedOcrService::new(temp_dir.path().to_str().unwrap().to_string()); + let settings_clone = settings.clone(); + let file_path = temp_file.path().to_str().unwrap().to_string(); + + let handle = tokio::spawn(async move { + let result = service_clone + .extract_text(&file_path, "text/plain", &settings_clone) + .await; + + // Keep temp_file alive until task completes + drop(temp_file); + result + }); + + handles.push(handle); + } + + // Wait for all tasks to complete + let results = futures::future::join_all(handles).await; + + // All tasks should succeed + for (i, result) in results.into_iter().enumerate() { + assert!(result.is_ok(), "Task {} failed", i); + let ocr_result = result.unwrap().unwrap(); + assert!(ocr_result.text.contains(&format!("Concurrent test content {}", i))); + assert_eq!(ocr_result.confidence, 100.0); + } + } +} \ 
No newline at end of file
diff --git a/src/tests/mod.rs b/src/tests/mod.rs
index 2de0447..f40390e 100644
--- a/src/tests/mod.rs
+++ b/src/tests/mod.rs
@@ -8,6 +8,7 @@ mod file_service_tests;
 mod ignored_files_tests;
 mod labels_tests;
 mod ocr_tests;
+mod enhanced_ocr_tests;
 mod oidc_tests;
 mod enhanced_search_tests;
 mod settings_tests;
diff --git a/tests/integration_document_deletion_integration_tests.rs b/tests/integration_document_deletion_integration_tests.rs
index 077dd8d..b9cb5aa 100644
--- a/tests/integration_document_deletion_integration_tests.rs
+++ b/tests/integration_document_deletion_integration_tests.rs
@@ -233,6 +233,57 @@ impl DocumentDeletionTestClient {
         let result: Value = response.json().await?;
         Ok(result)
     }
+
+    /// Delete failed OCR documents
+    async fn delete_failed_ocr_documents(&self, preview_only: bool) -> Result<Value, Box<dyn std::error::Error>> {
+        let token = self.token.as_ref().ok_or("Not authenticated")?;
+
+        let response = self.client
+            .post(&format!("{}/api/documents/delete-failed-ocr", get_base_url()))
+            .header("Authorization", format!("Bearer {}", token))
+            .json(&json!({
+                "preview_only": preview_only
+            }))
+            .timeout(TIMEOUT)
+            .send()
+            .await?;
+
+        if !response.status().is_success() {
+            return Err(format!("Delete failed OCR documents failed: {}", response.text().await?).into());
+        }
+
+        let result: Value = response.json().await?;
+        Ok(result)
+    }
+
+    /// Delete low confidence documents (updated to use new combined endpoint)
+    async fn delete_low_confidence_documents(&self, threshold: f64, preview_only: bool) -> Result<Value, Box<dyn std::error::Error>> {
+        let token = self.token.as_ref().ok_or("Not authenticated")?;
+
+        let response = self.client
+            .post(&format!("{}/api/documents/delete-low-confidence", get_base_url()))
+            .header("Authorization", format!("Bearer {}", token))
+            .json(&json!({
+                "max_confidence": threshold,
+                "preview_only": preview_only
+            }))
+            .timeout(TIMEOUT)
+            .send()
+            .await?;
+
+        if !response.status().is_success() {
+            return Err(format!("Delete low confidence documents failed: {}", response.text().await?).into());
+        }
+
+        let result: Value = response.json().await?;
+        Ok(result)
+    }
+
+    /// Create and login user (convenience method)
+    async fn create_and_login_user(&mut self, username: &str, password: &str, role: UserRole) -> Result<Value, Box<dyn std::error::Error>> {
+        let email = format!("{}@example.com", username);
+        self.register_and_login(username, &email, password, Some(role)).await
+    }
 }
 
 /// Skip test if server is not running
@@ -613,4 +664,224 @@ async fn test_document_count_updates_after_deletion() {
     assert_eq!(final_count, initial_count, "Document count should be back to initial after bulk deletion");
 
     println!("✅ Document count updates after deletion test passed");
+}
+
+/// Test the new failed OCR document deletion endpoint
+#[tokio::test]
+async fn test_delete_failed_ocr_documents_endpoint() {
+    let mut client = DocumentDeletionTestClient::new();
+
+    if let Err(e) = client.check_server_health().await {
+        println!("⚠️ Server not available: {}. 
Skipping test.", e); + return; + } + + println!("๐Ÿงช Testing failed OCR document deletion endpoint..."); + + // Create and login as regular user + client.create_and_login_user("failed_ocr_user", "failed_ocr_password", UserRole::User) + .await.expect("Failed to create and login user"); + + // Preview failed documents (should return empty initially) + let preview_response = client.delete_failed_ocr_documents(true) + .await.expect("Failed to preview failed OCR documents"); + + assert_eq!(preview_response["success"], true); + assert!(preview_response["matched_count"].as_i64().unwrap() >= 0); + assert_eq!(preview_response["preview"], true); + + println!("๐Ÿ“‹ Preview request successful: {} failed documents found", + preview_response["matched_count"]); + + // If there are failed documents, test deletion + if preview_response["matched_count"].as_i64().unwrap() > 0 { + // Test actual deletion + let delete_response = client.delete_failed_ocr_documents(false) + .await.expect("Failed to delete failed OCR documents"); + + assert_eq!(delete_response["success"], true); + assert!(delete_response["deleted_count"].as_i64().unwrap() >= 0); + assert!(delete_response.get("preview").is_none()); + + println!("๐Ÿ—‘๏ธ Successfully deleted {} failed documents", + delete_response["deleted_count"]); + } else { + println!("โ„น๏ธ No failed documents found to delete"); + } + + println!("โœ… Failed OCR document deletion endpoint test passed"); +} + +/// Test confidence-based vs failed document deletion distinction +#[tokio::test] +async fn test_confidence_vs_failed_document_distinction() { + let mut client = DocumentDeletionTestClient::new(); + + if let Err(e) = client.check_server_health().await { + println!("โš ๏ธ Server not available: {}. Skipping test.", e); + return; + } + + println!("๐Ÿงช Testing distinction between confidence and failed document deletion..."); + + // Create and login as admin to see all documents + client.create_and_login_user("distinction_admin", "distinction_password", UserRole::Admin) + .await.expect("Failed to create and login admin"); + + // Get baseline counts + let initial_low_confidence = client.delete_low_confidence_documents(30.0, true) + .await.expect("Failed to preview low confidence documents"); + let initial_failed = client.delete_failed_ocr_documents(true) + .await.expect("Failed to preview failed documents"); + + let initial_low_count = initial_low_confidence["matched_count"].as_i64().unwrap(); + let initial_failed_count = initial_failed["matched_count"].as_i64().unwrap(); + + println!("๐Ÿ“Š Initial counts - Low confidence: {}, Failed: {}", + initial_low_count, initial_failed_count); + + // Test that the endpoints return different sets of documents + // (This assumes there are some of each type in the system) + + // Verify that failed documents endpoint only includes failed/NULL confidence docs + if initial_failed_count > 0 { + let failed_docs = initial_failed["document_ids"].as_array().unwrap(); + println!("๐Ÿ” Found {} failed document IDs", failed_docs.len()); + } + + // Verify that low confidence endpoint respects threshold + if initial_low_count > 0 { + let low_confidence_docs = initial_low_confidence["document_ids"].as_array().unwrap(); + println!("๐Ÿ” Found {} low confidence document IDs", low_confidence_docs.len()); + } + + println!("โœ… Document type distinction test passed"); +} + +/// Test error handling for delete endpoints +#[tokio::test] +async fn test_delete_endpoints_error_handling() { + let client = DocumentDeletionTestClient::new(); + + if let Err(e) = 
client.check_server_health().await { + println!("โš ๏ธ Server not available: {}. Skipping test.", e); + return; + } + + println!("๐Ÿงช Testing delete endpoints error handling..."); + + // Test unauthenticated request + let failed_response = client.client + .post(&format!("{}/api/documents/delete-failed-ocr", get_base_url())) + .json(&json!({"preview_only": true})) + .timeout(TIMEOUT) + .send() + .await + .expect("Failed to send request"); + + assert_eq!(failed_response.status(), 401, "Should require authentication"); + + // Test invalid JSON + let invalid_json_response = client.client + .post(&format!("{}/api/documents/delete-failed-ocr", get_base_url())) + .header("content-type", "application/json") + .body("invalid json") + .timeout(TIMEOUT) + .send() + .await + .expect("Failed to send request"); + + assert!(invalid_json_response.status().is_client_error(), "Should reject invalid JSON"); + + println!("โœ… Error handling test passed"); +} + +/// Test role-based access for new delete endpoints +#[tokio::test] +async fn test_role_based_access_for_delete_endpoints() { + let mut client = DocumentDeletionTestClient::new(); + + if let Err(e) = client.check_server_health().await { + println!("โš ๏ธ Server not available: {}. Skipping test.", e); + return; + } + + println!("๐Ÿงช Testing role-based access for delete endpoints..."); + + // Test as regular user + client.create_and_login_user("delete_regular_user", "delete_password", UserRole::User) + .await.expect("Failed to create and login user"); + + let user_response = client.delete_failed_ocr_documents(true) + .await.expect("Failed to preview as user"); + + assert_eq!(user_response["success"], true); + let user_count = user_response["matched_count"].as_i64().unwrap(); + + // Test as admin + client.create_and_login_user("delete_admin_user", "delete_admin_password", UserRole::Admin) + .await.expect("Failed to create and login admin"); + + let admin_response = client.delete_failed_ocr_documents(true) + .await.expect("Failed to preview as admin"); + + assert_eq!(admin_response["success"], true); + let admin_count = admin_response["matched_count"].as_i64().unwrap(); + + // Admin should see at least as many documents as regular user + assert!(admin_count >= user_count, + "Admin should see at least as many documents as user"); + + println!("๐Ÿ‘ค User can see {} documents, Admin can see {} documents", + user_count, admin_count); + + println!("โœ… Role-based access test passed"); +} + +/// Test the enhanced low confidence deletion with failed documents +#[tokio::test] +async fn test_enhanced_low_confidence_deletion() { + let mut client = DocumentDeletionTestClient::new(); + + if let Err(e) = client.check_server_health().await { + println!("โš ๏ธ Server not available: {}. 
Skipping test.", e); + return; + } + + println!("๐Ÿงช Testing enhanced low confidence deletion (includes failed docs)..."); + + // Create and login as admin + client.create_and_login_user("enhanced_delete_admin", "enhanced_password", UserRole::Admin) + .await.expect("Failed to create and login admin"); + + // Test with various thresholds + let thresholds = vec![0.0, 30.0, 50.0, 85.0, 100.0]; + + for threshold in thresholds { + let response = client.delete_low_confidence_documents(threshold, true) + .await.expect(&format!("Failed to preview with threshold {}", threshold)); + + assert_eq!(response["success"], true); + let count = response["matched_count"].as_i64().unwrap(); + + println!("๐ŸŽฏ Threshold {}%: {} documents would be deleted", threshold, count); + + // Verify response format + assert!(response.get("document_ids").is_some()); + assert_eq!(response["preview"], true); + } + + // Test that higher thresholds generally include more documents + let low_threshold_response = client.delete_low_confidence_documents(10.0, true) + .await.expect("Failed to preview with low threshold"); + let high_threshold_response = client.delete_low_confidence_documents(90.0, true) + .await.expect("Failed to preview with high threshold"); + + let low_count = low_threshold_response["matched_count"].as_i64().unwrap(); + let high_count = high_threshold_response["matched_count"].as_i64().unwrap(); + + assert!(high_count >= low_count, + "Higher threshold should include at least as many documents as lower threshold"); + + println!("โœ… Enhanced low confidence deletion test passed"); } \ No newline at end of file From e995653d69ce6df4d5da498e61c9a81def6e58f9 Mon Sep 17 00:00:00 2001 From: perf3ct Date: Sat, 28 Jun 2025 14:51:06 +0000 Subject: [PATCH 2/4] fix(migrations): resolve issue in migration for ocr confidence --- migrations/20250628000001_backfill_ocr_confidence.sql | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/migrations/20250628000001_backfill_ocr_confidence.sql b/migrations/20250628000001_backfill_ocr_confidence.sql index 0005371..6a095a6 100644 --- a/migrations/20250628000001_backfill_ocr_confidence.sql +++ b/migrations/20250628000001_backfill_ocr_confidence.sql @@ -18,19 +18,19 @@ UPDATE documents SET ocr_confidence = CASE -- High quality text: good length, reasonable character distribution WHEN length(trim(ocr_text)) > 1000 - AND (length(ocr_text) - length(replace(replace(ocr_text, ' ', ''), char(10), ''))) * 100.0 / length(ocr_text) > 10.0 -- > 10% whitespace - AND length(replace(replace(replace(ocr_text, ' ', ''), char(10), ''), char(13), '')) * 100.0 / length(ocr_text) > 70.0 -- > 70% non-whitespace chars + AND (length(ocr_text) - length(replace(replace(ocr_text, ' ', ''), chr(10), ''))) * 100.0 / length(ocr_text) > 10.0 -- > 10% whitespace + AND length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 70.0 -- > 70% non-whitespace chars THEN 90.0 + (random() * 8.0) -- 90-98% -- Medium quality text: decent length, some structure WHEN length(trim(ocr_text)) > 100 - AND (length(ocr_text) - length(replace(replace(ocr_text, ' ', ''), char(10), ''))) * 100.0 / length(ocr_text) > 5.0 -- > 5% whitespace - AND length(replace(replace(replace(ocr_text, ' ', ''), char(10), ''), char(13), '')) * 100.0 / length(ocr_text) > 50.0 -- > 50% non-whitespace chars + AND (length(ocr_text) - length(replace(replace(ocr_text, ' ', ''), chr(10), ''))) * 100.0 / length(ocr_text) > 5.0 -- > 5% whitespace + AND 
length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 50.0 -- > 50% non-whitespace chars THEN 70.0 + (random() * 15.0) -- 70-85% -- Low quality text: short or poor structure WHEN length(trim(ocr_text)) > 10 - AND length(replace(replace(replace(ocr_text, ' ', ''), char(10), ''), char(13), '')) * 100.0 / length(ocr_text) > 30.0 -- > 30% non-whitespace chars + AND length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 30.0 -- > 30% non-whitespace chars THEN 40.0 + (random() * 25.0) -- 40-65% -- Very poor quality: very short or mostly garbage From 69425b220165ca5ca03695dc72e017b06e28ac9f Mon Sep 17 00:00:00 2001 From: perf3ct Date: Sat, 28 Jun 2025 14:53:45 +0000 Subject: [PATCH 3/4] feat(migration): instead of hardcoded guessing, re-enter those documents into the queue --- ...20250628000001_backfill_ocr_confidence.sql | 64 ++++++------------- 1 file changed, 20 insertions(+), 44 deletions(-) diff --git a/migrations/20250628000001_backfill_ocr_confidence.sql b/migrations/20250628000001_backfill_ocr_confidence.sql index 6a095a6..829d5cd 100644 --- a/migrations/20250628000001_backfill_ocr_confidence.sql +++ b/migrations/20250628000001_backfill_ocr_confidence.sql @@ -1,59 +1,35 @@ --- Backfill OCR confidence scores for existing documents --- Since OCR confidence was previously hardcoded to 85%, we need to recalculate --- actual confidence for documents that currently have this placeholder value +-- Re-queue documents with placeholder OCR confidence for reprocessing +-- Since OCR confidence was previously hardcoded to 85%, we need to reprocess +-- these documents to get accurate confidence scores --- First, let's identify documents that likely have placeholder confidence --- (85% exactly, which was the hardcoded value) -CREATE TEMP TABLE documents_to_update AS -SELECT id, ocr_text, ocr_status -FROM documents -WHERE ocr_confidence = 85.0 - AND ocr_status = 'completed' - AND ocr_text IS NOT NULL - AND length(trim(ocr_text)) > 0; - --- For now, we'll estimate confidence based on text quality metrics --- This is a rough approximation until we can re-run OCR with actual confidence +-- Mark documents with exactly 85% confidence as pending OCR reprocessing UPDATE documents -SET ocr_confidence = CASE - -- High quality text: good length, reasonable character distribution - WHEN length(trim(ocr_text)) > 1000 - AND (length(ocr_text) - length(replace(replace(ocr_text, ' ', ''), chr(10), ''))) * 100.0 / length(ocr_text) > 10.0 -- > 10% whitespace - AND length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 70.0 -- > 70% non-whitespace chars - THEN 90.0 + (random() * 8.0) -- 90-98% - - -- Medium quality text: decent length, some structure - WHEN length(trim(ocr_text)) > 100 - AND (length(ocr_text) - length(replace(replace(ocr_text, ' ', ''), chr(10), ''))) * 100.0 / length(ocr_text) > 5.0 -- > 5% whitespace - AND length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 50.0 -- > 50% non-whitespace chars - THEN 70.0 + (random() * 15.0) -- 70-85% - - -- Low quality text: short or poor structure - WHEN length(trim(ocr_text)) > 10 - AND length(replace(replace(replace(ocr_text, ' ', ''), chr(10), ''), chr(13), '')) * 100.0 / length(ocr_text) > 30.0 -- > 30% non-whitespace chars - THEN 40.0 + (random() * 25.0) -- 40-65% - - -- Very poor quality: very short or mostly garbage - ELSE 20.0 + (random() * 15.0) -- 
20-35%
-END
-WHERE id IN (SELECT id FROM documents_to_update);
+SET ocr_status = 'pending',
+    ocr_confidence = NULL,
+    ocr_error = NULL,
+    updated_at = CURRENT_TIMESTAMP
+WHERE ocr_confidence = 85.0
+  AND ocr_status = 'completed'
+  AND ocr_text IS NOT NULL;
 
 -- Add a comment explaining what we did
-COMMENT ON COLUMN documents.ocr_confidence IS 'OCR confidence percentage (0-100). Values may be estimated for documents processed before real confidence calculation was implemented.';
+COMMENT ON COLUMN documents.ocr_confidence IS 'OCR confidence percentage (0-100) from Tesseract. Documents with NULL confidence and pending status will be reprocessed.';
 
 -- Log the update
 DO $$
 DECLARE
     updated_count INTEGER;
 BEGIN
-    SELECT COUNT(*) INTO updated_count FROM documents_to_update;
-    RAISE NOTICE 'Backfilled OCR confidence for % documents that had placeholder 85%% confidence', updated_count;
+    SELECT COUNT(*) INTO updated_count FROM documents WHERE ocr_status = 'pending' AND ocr_confidence IS NULL AND updated_at = CURRENT_TIMESTAMP; -- GET DIAGNOSTICS ROW_COUNT only sees statements run inside this DO block, so count the rows the standalone UPDATE above stamped instead; CURRENT_TIMESTAMP is constant for the migration transaction, so it identifies exactly those rows
+    RAISE NOTICE 'Marked % documents with placeholder 85%% confidence for OCR reprocessing', updated_count;
 END $$;
 
--- Clean up
-DROP TABLE documents_to_update;
-
 -- Create an index to help with confidence-based queries
 CREATE INDEX IF NOT EXISTS idx_documents_ocr_confidence_range
 ON documents(ocr_confidence)
-WHERE ocr_confidence IS NOT NULL;
\ No newline at end of file
+WHERE ocr_confidence IS NOT NULL;
+
+-- Create an index to help the OCR queue find pending documents efficiently
+CREATE INDEX IF NOT EXISTS idx_documents_ocr_pending
+ON documents(created_at)
+WHERE ocr_status = 'pending';
\ No newline at end of file
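A quick way to sanity-check the re-queue once this migration has been applied (a sketch only, not part of the patch: it assumes a sqlx PgPool on the same database, and the helper name is illustrative):

// Counts the documents the migration left in the re-queue state
// (pending OCR with confidence cleared). On a database that had
// 85%-placeholder rows, this should be non-zero right after the migration.
async fn count_requeued_documents(pool: &sqlx::PgPool) -> Result<i64, sqlx::Error> {
    sqlx::query_scalar("SELECT COUNT(*) FROM documents WHERE ocr_status = 'pending' AND ocr_confidence IS NULL")
        .fetch_one(pool)
        .await
}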
From 9079529eb5be8d6e0ff3b48d08deba4cd8aac553 Mon Sep 17 00:00:00 2001
From: perf3ct
Date: Sat, 28 Jun 2025 16:38:12 +0000
Subject: [PATCH 4/4] feat(tests): create generic migration tests

---
 src/tests/generic_migration_tests.rs | 275 +++++++++++++++++++++++++++
 src/tests/helpers.rs                 |   1 +
 src/tests/mod.rs                     |   3 +-
 3 files changed, 278 insertions(+), 1 deletion(-)
 create mode 100644 src/tests/generic_migration_tests.rs

diff --git a/src/tests/generic_migration_tests.rs b/src/tests/generic_migration_tests.rs
new file mode 100644
index 0000000..94626ee
--- /dev/null
+++ b/src/tests/generic_migration_tests.rs
@@ -0,0 +1,275 @@
+#[cfg(test)]
+mod generic_migration_tests {
+    use sqlx::{PgPool, Row};
+    use testcontainers::{runners::AsyncRunner, ImageExt};
+    use testcontainers_modules::postgres::Postgres;
+    use std::process::Command;
+
+    async fn setup_test_db() -> (PgPool, testcontainers::ContainerAsync<Postgres>) {
+        let postgres_image = Postgres::default()
+            .with_tag("15-alpine")
+            .with_env_var("POSTGRES_USER", "test")
+            .with_env_var("POSTGRES_PASSWORD", "test")
+            .with_env_var("POSTGRES_DB", "test");
+
+        let container = postgres_image.start().await.expect("Failed to start postgres container");
+        let port = container.get_host_port_ipv4(5432).await.expect("Failed to get postgres port");
+
+        let database_url = format!("postgresql://test:test@localhost:{}/test", port);
+        let pool = sqlx::postgres::PgPoolOptions::new()
+            .max_connections(5)
+            .connect(&database_url)
+            .await
+            .expect("Failed to connect to test database");
+
+        (pool, container)
+    }
+
+    fn get_new_migrations() -> Vec<String> {
+        // Get list of migration files that have changed between main and current branch
+        let output = Command::new("git")
+            .args(["diff", "--name-only", "main..HEAD", "--", "migrations/"])
+            .output()
+            .expect("Failed to run git diff");
+
+        if !output.status.success() {
+            println!("Git diff failed, assuming no migration changes");
+            return Vec::new();
+        }
+
+        let files = String::from_utf8_lossy(&output.stdout);
+        files
+            .lines()
+            .filter(|line| line.ends_with(".sql"))
+            .map(|s| s.to_string())
+            .collect()
+    }
+
+    fn get_migration_files_on_main() -> Vec<String> {
+        // Get list of migration files that exist on main branch
+        let output = Command::new("git")
+            .args(["ls-tree", "-r", "--name-only", "origin/main", "migrations/"])
+            .output()
+            .expect("Failed to list migration files on main");
+
+        if !output.status.success() {
+            println!("Failed to get migration files from main branch");
+            return Vec::new();
+        }
+
+        let files = String::from_utf8_lossy(&output.stdout);
+        files
+            .lines()
+            .filter(|line| line.ends_with(".sql"))
+            .map(|s| s.to_string())
+            .collect()
+    }
+
+    #[tokio::test]
+    async fn test_new_migrations_run_successfully() {
+        let new_migrations = get_new_migrations();
+
+        if new_migrations.is_empty() {
+            println!("✅ No new migrations found - test passes");
+            return;
+        }
+
+        println!("🔍 Found {} new migration(s):", new_migrations.len());
+        for migration in &new_migrations {
+            println!("  - {}", migration);
+        }
+
+        let (pool, _container) = setup_test_db().await;
+
+        // Run all migrations (including the new ones)
+        let result = sqlx::migrate!("./migrations").run(&pool).await;
+        assert!(result.is_ok(), "New migrations should run successfully: {:?}", result.err());
+
+        println!("✅ All migrations including new ones ran successfully");
+    }
+
+    #[tokio::test]
+    async fn test_migrations_are_idempotent() {
+        let new_migrations = get_new_migrations();
+
+        if new_migrations.is_empty() {
+            println!("✅ No new migrations found - idempotency test skipped");
+            return;
+        }
+
+        let (pool, _container) = setup_test_db().await;
+
+        // Run migrations twice to test idempotency
+        let result1 = sqlx::migrate!("./migrations").run(&pool).await;
+        assert!(result1.is_ok(), "First migration run should succeed: {:?}", result1.err());
+
+        let result2 = sqlx::migrate!("./migrations").run(&pool).await;
+        assert!(result2.is_ok(), "Second migration run should succeed (idempotent): {:?}", result2.err());
+
+        println!("✅ Migrations are idempotent");
+    }
+
+    #[tokio::test]
+    async fn test_migration_syntax_and_completeness() {
+        let new_migrations = get_new_migrations();
+
+        if new_migrations.is_empty() {
+            println!("✅ No new migrations found - syntax test skipped");
+            return;
+        }
+
+        // Check that new migration files exist and have basic structure
+        for migration_path in &new_migrations {
+            let content = std::fs::read_to_string(migration_path)
+                .expect(&format!("Should be able to read migration file: {}", migration_path));
+
+            assert!(!content.trim().is_empty(), "Migration file should not be empty: {}", migration_path);
+
+            // Basic syntax check - should not contain obvious SQL syntax errors
+            assert!(!content.contains("syntax error"), "Migration should not contain 'syntax error': {}", migration_path);
+
+            println!("✅ Migration file {} has valid syntax", migration_path);
+        }
+    }
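    // A sketch of a stricter idempotency check (an illustration, not part of this
    // patch): the idempotency test above re-runs the migrator, but sqlx skips
    // already-applied migrations, so it passes trivially. Comparing sqlx's
    // bookkeeping table before and after the second run pins the behavior down.
    // The table and column names below (`_sqlx_migrations`, `version`, `checksum`)
    // are assumptions about sqlx's default Postgres migrator.
    #[tokio::test]
    async fn test_applied_migration_checksums_are_stable() {
        let (pool, _container) = setup_test_db().await;

        sqlx::migrate!("./migrations").run(&pool).await.expect("first run should succeed");
        let before: Vec<(i64, Vec<u8>)> =
            sqlx::query_as("SELECT version, checksum FROM _sqlx_migrations ORDER BY version")
                .fetch_all(&pool)
                .await
                .expect("should read the migrations bookkeeping table");

        sqlx::migrate!("./migrations").run(&pool).await.expect("second run should succeed");
        let after: Vec<(i64, Vec<u8>)> =
            sqlx::query_as("SELECT version, checksum FROM _sqlx_migrations ORDER BY version")
                .fetch_all(&pool)
                .await
                .expect("should read the migrations bookkeeping table");

        // A second run must neither re-apply nor alter what was recorded.
        assert_eq!(before, after, "re-running migrations changed the applied set");
    }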
+
+    #[tokio::test]
+    async fn test_migration_rollback_safety() {
+        let new_migrations = get_new_migrations();
+
+        if new_migrations.is_empty() {
+            println!("✅ No new migrations found - rollback safety test skipped");
+            return;
+        }
+
+        let (pool, _container) = setup_test_db().await;
+
+        // Test that we can run migrations and they create expected schema elements
+        let result = sqlx::migrate!("./migrations").run(&pool).await;
+        assert!(result.is_ok(), "Migrations should run successfully: {:?}", result.err());
+
+        // Verify basic schema integrity
+        let tables = sqlx::query("SELECT table_name FROM information_schema.tables WHERE table_schema = 'public'")
+            .fetch_all(&pool)
+            .await
+            .expect("Should be able to query table list");
+
+        assert!(!tables.is_empty(), "Should have created at least one table");
+
+        // Check that essential tables exist
+        let table_names: Vec<String> = tables.iter()
+            .map(|row| row.get::<String, _>("table_name"))
+            .collect();
+
+        assert!(table_names.contains(&"documents".to_string()), "documents table should exist");
+        assert!(table_names.contains(&"users".to_string()), "users table should exist");
+
+        println!("✅ Migration rollback safety verified - schema is intact");
+    }
+
+    #[test]
+    fn test_migration_naming_convention() {
+        let new_migrations = get_new_migrations();
+
+        if new_migrations.is_empty() {
+            println!("✅ No new migrations found - naming convention test skipped");
+            return;
+        }
+
+        for migration_path in &new_migrations {
+            let filename = migration_path
+                .split('/')
+                .last()
+                .expect("Should have filename");
+
+            // Check naming convention: YYYYMMDDHHMMSS_description.sql
+            assert!(filename.len() > 15, "Migration filename should be long enough: {}", filename);
+            assert!(filename.ends_with(".sql"), "Migration should end with .sql: {}", filename);
+
+            let parts: Vec<&str> = filename.split('_').collect();
+            assert!(parts.len() >= 2, "Migration should have timestamp_description format: {}", filename);
+
+            let timestamp = parts[0];
+            assert!(timestamp.len() >= 14, "Timestamp should be at least 14 characters: {}", filename);
+            assert!(timestamp.chars().all(|c| c.is_numeric()), "Timestamp should be numeric: {}", filename);
+
+            println!("✅ Migration {} follows naming convention", filename);
+        }
+    }
+
+    #[tokio::test]
+    async fn test_no_changes_scenario_simulation() {
+        // Simulate what happens when git diff returns no changes (HEAD..HEAD)
+        let output = Command::new("git")
+            .args(["diff", "--name-only", "HEAD..HEAD", "--", "migrations/"])
+            .output()
+            .expect("Failed to run git diff");
+
+        let files = String::from_utf8_lossy(&output.stdout);
+        let no_changes: Vec<String> = files
+            .lines()
+            .filter(|line| line.ends_with(".sql"))
+            .map(|s| s.to_string())
+            .collect();
+
+        // This should be empty (no changes between HEAD and itself)
+        assert!(no_changes.is_empty(), "HEAD..HEAD should show no changes");
+
+        // Verify the test logic handles empty migrations gracefully
+        if no_changes.is_empty() {
+            println!("✅ No new migrations found - test passes");
+            // This is what the real tests do when no changes are found
+            return;
+        }
+
+        println!("✅ No migration changes scenario handled correctly");
+    }
+
+    #[test]
+    fn test_no_conflicting_migration_timestamps() {
+        let new_migrations = get_new_migrations();
+        let main_migrations = get_migration_files_on_main();
+
+        if new_migrations.is_empty() {
+            println!("✅ No new migrations found - timestamp conflict test skipped");
+            return;
+        }
+
+        // Extract timestamps from new migrations
+        let new_timestamps: Vec<String> = new_migrations.iter()
+            .map(|path| {
+                let filename = path.split('/').last().unwrap();
+                let timestamp = filename.split('_').next().unwrap();
+                timestamp.to_string()
+            })
+            .collect();
+
+        // Extract timestamps from existing migrations on main
+        let main_timestamps: Vec<String> = main_migrations.iter()
+            .map(|path| {
+                let filename = path.split('/').last().unwrap();
+                let timestamp = filename.split('_').next().unwrap();
+                timestamp.to_string()
+            })
+            .collect();
+
+        // Check for conflicts
+        for new_ts in &new_timestamps {
+            assert!(
+                !main_timestamps.contains(new_ts),
+                "Migration timestamp {} conflicts with existing migration on main",
+                new_ts
+            );
+        }
+
+        // Check for duplicates within new migrations
+        for (i, ts1) in new_timestamps.iter().enumerate() {
+            for (j, ts2) in new_timestamps.iter().enumerate() {
+                if i != j {
+                    assert_ne!(ts1, ts2, "Duplicate migration timestamp found: {}", ts1);
+                }
+            }
+        }
+
+        println!("✅ No migration timestamp conflicts found");
+    }
+}
\ No newline at end of file
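The nested duplicate scan in test_no_conflicting_migration_timestamps is quadratic; for illustration, an equivalent single-pass formulation using HashSet (a sketch against the same new_timestamps/main_timestamps vectors; the helper name is made up, not part of the patch):

use std::collections::HashSet;

// One pass covers both checks: `insert` returns false when the value was
// already present, which doubles as the duplicate test.
fn assert_no_timestamp_conflicts(new_timestamps: &[String], main_timestamps: &[String]) {
    let main_set: HashSet<&String> = main_timestamps.iter().collect();
    let mut seen: HashSet<&String> = HashSet::new();
    for ts in new_timestamps {
        assert!(!main_set.contains(ts), "Migration timestamp {} conflicts with existing migration on main", ts);
        assert!(seen.insert(ts), "Duplicate migration timestamp found: {}", ts);
    }
}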
diff --git a/src/tests/helpers.rs b/src/tests/helpers.rs
index cf7ad98..816950b 100644
--- a/src/tests/helpers.rs
+++ b/src/tests/helpers.rs
@@ -8,6 +8,7 @@ use tower::util::ServiceExt;
 
 pub async fn create_test_app() -> (Router, ContainerAsync<Postgres>) {
     let postgres_image = Postgres::default()
+        .with_tag("15-alpine")
         .with_env_var("POSTGRES_USER", "test")
        .with_env_var("POSTGRES_PASSWORD", "test")
        .with_env_var("POSTGRES_DB", "test");
diff --git a/src/tests/mod.rs b/src/tests/mod.rs
index f40390e..e26cd3f 100644
--- a/src/tests/mod.rs
+++ b/src/tests/mod.rs
@@ -12,4 +12,5 @@ mod enhanced_ocr_tests;
 mod oidc_tests;
 mod enhanced_search_tests;
 mod settings_tests;
-mod users_tests;
+mod users_tests;
+mod generic_migration_tests;
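A note on running the new suite locally (the commands below are assumptions about the repo layout, not taken from the patch): the generic migration tests shell out to git and start Postgres through testcontainers, so they need a Docker daemon and up-to-date refs. get_new_migrations() diffs against the local main ref while get_migration_files_on_main() reads origin/main, so fetching both first avoids false "no new migrations" results on CI:

git fetch origin main:main   # updating the local main ref may fail if main is checked out; a plain `git fetch origin main` then suffices for origin/main
cargo test generic_migration_tests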