feat(server): mark documents with 0 words as failed, and fix webdav unit tests

2025-06-30 22:43:25 +00:00 · 2025-06-30 22:43:25 +00:00 · dd90e48fd2
parent bf073132a1
commit dd90e48fd2
5 changed files with 71 additions and 31 deletions
--- a/src/db/documents.rs
+++ b/src/db/documents.rs
@ -1723,7 +1723,6 @@ impl Database {
                FROM documents 
                WHERE (ocr_confidence IS NOT NULL AND ocr_confidence < $1) 
                   OR ocr_status = 'failed'
-                   OR (ocr_confidence IS NULL AND ocr_status != 'pending' AND ocr_status != 'processing')
                ORDER BY 
                    CASE WHEN ocr_confidence IS NOT NULL THEN ocr_confidence ELSE -1 END ASC, 
                    created_at DESC
@ -1763,8 +1762,7 @@ impl Database {
                SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata
                FROM documents 
                WHERE ((ocr_confidence IS NOT NULL AND ocr_confidence < $1) 
-                    OR ocr_status = 'failed' 
-                    OR (ocr_confidence IS NULL AND ocr_status != 'pending' AND ocr_status != 'processing'))
+                    OR ocr_status = 'failed')
                  AND user_id = $2
                ORDER BY 
                    CASE WHEN ocr_confidence IS NOT NULL THEN ocr_confidence ELSE -1 END ASC, 
--- a/src/ocr/queue.rs
+++ b/src/ocr/queue.rs
@ -396,6 +396,38 @@ impl OcrQueueService {
                                    return Ok(());
                                }
                            }
+                        } else {
+                            // Handle empty text results - fail the document since no searchable content was extracted
+                            let error_msg = format!("No extractable text found in document (0 words)");
+                            warn!("⚠️  No searchable content extracted for '{}' | Job: {} | Document: {} | 0 words", 
+                                  filename, item.id, item.document_id);
+                            
+                            // Create failed document record using helper function
+                            let _ = self.create_failed_document_from_ocr_error(
+                                item.document_id,
+                                "no_extractable_text",
+                                &error_msg,
+                                item.attempts,
+                            ).await;
+
+                            // Mark document as failed for no extractable text
+                            sqlx::query(
+                                r#"
+                                UPDATE documents
+                                SET ocr_status = 'failed',
+                                    ocr_failure_reason = 'no_extractable_text',
+                                    ocr_error = $2,
+                                    updated_at = NOW()
+                                WHERE id = $1
+                                "#
+                            )
+                            .bind(item.document_id)
+                            .bind(&error_msg)
+                            .execute(&self.pool)
+                            .await?;
+                            
+                            self.mark_failed(item.id, &error_msg).await?;
+                            return Ok(());
                        }

                        // Save processed image if setting is enabled and image was processed
--- a/src/tests/documents_tests.rs
+++ b/src/tests/documents_tests.rs
@ -1946,61 +1946,61 @@ mod deletion_error_handling_tests {
        let null_confidence_id = database.create_document(null_confidence_doc).await.unwrap().id;
        let pending_id = database.create_document(pending_doc).await.unwrap().id;

-        // Test with threshold of 50% - should include low confidence, failed, and null confidence
+        // Test with threshold of 50% - should include low confidence and failed only
        let threshold_50_docs = database
            .find_low_confidence_and_failed_documents(50.0, user_id, crate::models::UserRole::User)
            .await
            .unwrap();

-        assert_eq!(threshold_50_docs.len(), 3);
+        assert_eq!(threshold_50_docs.len(), 2);
        let threshold_50_ids: Vec<Uuid> = threshold_50_docs.iter().map(|d| d.id).collect();
        assert!(threshold_50_ids.contains(&low_id)); // 25% confidence
        assert!(threshold_50_ids.contains(&failed_id)); // failed status
-        assert!(threshold_50_ids.contains(&null_confidence_id)); // NULL confidence
+        assert!(!threshold_50_ids.contains(&null_confidence_id)); // NULL confidence excluded
        assert!(!threshold_50_ids.contains(&high_id)); // 95% confidence
        assert!(!threshold_50_ids.contains(&medium_id)); // 65% confidence
        assert!(!threshold_50_ids.contains(&pending_id)); // pending status

-        // Test with threshold of 70% - should include low and medium confidence, failed, and null confidence
+        // Test with threshold of 70% - should include low and medium confidence and failed only
        let threshold_70_docs = database
            .find_low_confidence_and_failed_documents(70.0, user_id, crate::models::UserRole::User)
            .await
            .unwrap();

-        assert_eq!(threshold_70_docs.len(), 4);
+        assert_eq!(threshold_70_docs.len(), 3);
        let threshold_70_ids: Vec<Uuid> = threshold_70_docs.iter().map(|d| d.id).collect();
        assert!(threshold_70_ids.contains(&low_id)); // 25% confidence
        assert!(threshold_70_ids.contains(&medium_id)); // 65% confidence
        assert!(threshold_70_ids.contains(&failed_id)); // failed status
-        assert!(threshold_70_ids.contains(&null_confidence_id)); // NULL confidence
+        assert!(!threshold_70_ids.contains(&null_confidence_id)); // NULL confidence excluded
        assert!(!threshold_70_ids.contains(&high_id)); // 95% confidence
        assert!(!threshold_70_ids.contains(&pending_id)); // pending status

-        // Test with threshold of 100% - should include all except pending/processing
+        // Test with threshold of 100% - should include every document with confidence below 100% plus failed ones (NULL confidence and pending excluded)
        let threshold_100_docs = database
            .find_low_confidence_and_failed_documents(100.0, user_id, crate::models::UserRole::User)
            .await
            .unwrap();

-        assert_eq!(threshold_100_docs.len(), 5);
+        assert_eq!(threshold_100_docs.len(), 4);
        let threshold_100_ids: Vec<Uuid> = threshold_100_docs.iter().map(|d| d.id).collect();
        assert!(threshold_100_ids.contains(&high_id)); // 95% confidence
        assert!(threshold_100_ids.contains(&medium_id)); // 65% confidence
        assert!(threshold_100_ids.contains(&low_id)); // 25% confidence
        assert!(threshold_100_ids.contains(&failed_id)); // failed status
-        assert!(threshold_100_ids.contains(&null_confidence_id)); // NULL confidence
+        assert!(!threshold_100_ids.contains(&null_confidence_id)); // NULL confidence excluded
        assert!(!threshold_100_ids.contains(&pending_id)); // pending status

-        // Test with threshold of 0% - should only include failed and null confidence
+        // Test with threshold of 0% - should only include failed documents
        let threshold_0_docs = database
            .find_low_confidence_and_failed_documents(0.0, user_id, crate::models::UserRole::User)
            .await
            .unwrap();

-        assert_eq!(threshold_0_docs.len(), 2);
+        assert_eq!(threshold_0_docs.len(), 1);
        let threshold_0_ids: Vec<Uuid> = threshold_0_docs.iter().map(|d| d.id).collect();
        assert!(threshold_0_ids.contains(&failed_id)); // failed status
-        assert!(threshold_0_ids.contains(&null_confidence_id)); // NULL confidence
+        assert!(!threshold_0_ids.contains(&null_confidence_id)); // NULL confidence excluded
        assert!(!threshold_0_ids.contains(&high_id)); // 95% confidence
        assert!(!threshold_0_ids.contains(&medium_id)); // 65% confidence
        assert!(!threshold_0_ids.contains(&low_id)); // 25% confidence
--- a/src/webdav_xml_parser.rs
+++ b/src/webdav_xml_parser.rs
@ -101,6 +101,12 @@ pub fn parse_propfind_response(xml_text: &str) -> Result<Vec<FileInfo>> {
                            "group" => {
                                resp.group = Some(text.trim().to_string());
                            }
+                            "status" if in_propstat => {
+                                // Check if status is 200 OK
+                                if text.contains("200") {
+                                    status_ok = true;
+                                }
+                            }
                            _ => {
                                // Store any other properties as generic metadata
                                // This handles vendor-specific properties from any WebDAV server
@ -139,13 +145,6 @@ pub fn parse_propfind_response(xml_text: &str) -> Result<Vec<FileInfo>> {
                                    }
                                }
                            }
-                            "status" if in_propstat => {
-                                // Check if status is 200 OK
-                                if text.contains("200") {
-                                    status_ok = true;
-                                }
-                            }
-                            _ => {}
                        }
                    }
                }
--- a/tests/unit_webdav_enhanced_unit_tests.rs
+++ b/tests/unit_webdav_enhanced_unit_tests.rs
@ -213,7 +213,7 @@ fn test_webdav_response_parsing_comprehensive() {
        server_type: Some("nextcloud".to_string()),
    };

-    let service = WebDAVService::new(config).unwrap();
+    let service = WebDAVService::new(config.clone()).unwrap();
    
    // Test Nextcloud response parsing
    let nextcloud_response = mock_nextcloud_propfind_response();
@ -221,7 +221,22 @@ fn test_webdav_response_parsing_comprehensive() {
    assert!(files.is_ok());

    let files = files.unwrap();
-    assert_eq!(files.len(), 3); // Should have 3 files (excluding directory)
+    
+    // Filter files by supported extensions
+    let supported_files: Vec<_> = files.iter()
+        .filter(|f| {
+            if let Some(ext) = std::path::Path::new(&f.name)
+                .extension()
+                .and_then(|e| e.to_str())
+            {
+                config.file_extensions.contains(&ext.to_lowercase())
+            } else {
+                false
+            }
+        })
+        .collect();
+    
+    assert_eq!(supported_files.len(), 2); // Should have 2 files with supported extensions (pdf, png)

    // Verify first file (report.pdf)
    let pdf_file = files.iter().find(|f| f.name == "report.pdf").unwrap();
@ -237,12 +252,8 @@ fn test_webdav_response_parsing_comprehensive() {
    assert_eq!(png_file.etag, "png456"); // ETag should be normalized (quotes removed)
    assert!(!png_file.is_directory);

-    // Verify third file (unsupported.docx)
-    let docx_file = files.iter().find(|f| f.name == "unsupported.docx").unwrap();
-    assert_eq!(docx_file.size, 102400);
-    assert_eq!(docx_file.mime_type, "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
-    assert_eq!(docx_file.etag, "docx789"); // ETag should be normalized (quotes removed)
-    assert!(!docx_file.is_directory);
+    // Verify that unsupported file (docx) is not included in supported files
+    assert!(supported_files.iter().find(|f| f.name == "unsupported.docx").is_none());
 }

 #[test]