From dd90e48fd288ea884c43871b9c91d9ee85c6c055 Mon Sep 17 00:00:00 2001 From: perf3ct Date: Mon, 30 Jun 2025 22:43:25 +0000 Subject: [PATCH] feat(server): mark documents with 0 words as failed, and fix webdav unit tests --- src/db/documents.rs | 6 ++--- src/ocr/queue.rs | 32 ++++++++++++++++++++++++ src/tests/documents_tests.rs | 24 +++++++++--------- src/webdav_xml_parser.rs | 13 +++++----- tests/unit_webdav_enhanced_unit_tests.rs | 27 ++++++++++++++------ 5 files changed, 71 insertions(+), 31 deletions(-) diff --git a/src/db/documents.rs b/src/db/documents.rs index f64a2f3..b234f75 100644 --- a/src/db/documents.rs +++ b/src/db/documents.rs @@ -1722,8 +1722,7 @@ impl Database { SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE (ocr_confidence IS NOT NULL AND ocr_confidence < $1) - OR ocr_status = 'failed' - OR (ocr_confidence IS NULL AND ocr_status != 'pending' AND ocr_status != 'processing') + OR ocr_status = 'failed' ORDER BY CASE WHEN ocr_confidence IS NOT NULL THEN ocr_confidence ELSE -1 END ASC, created_at DESC @@ -1763,8 +1762,7 @@ impl Database { SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata FROM documents WHERE ((ocr_confidence IS NOT NULL AND ocr_confidence < $1) - OR ocr_status = 'failed' - OR (ocr_confidence IS NULL AND ocr_status != 'pending' AND ocr_status != 'processing')) + OR ocr_status = 'failed') AND user_id = $2 ORDER BY CASE WHEN ocr_confidence IS NOT NULL THEN ocr_confidence ELSE -1 END ASC, diff --git a/src/ocr/queue.rs b/src/ocr/queue.rs index ec1c580..01f9186 100644 --- a/src/ocr/queue.rs +++ b/src/ocr/queue.rs @@ -396,6 +396,38 @@ impl OcrQueueService { return Ok(()); } } + } else { + // Handle empty text results - fail the document since no searchable content was extracted + let error_msg = format!("No extractable text found in document (0 words)"); + warn!("⚠️ No searchable content extracted for '{}' | Job: {} | Document: {} | 0 words", + filename, item.id, item.document_id); + + // Create failed document record using helper function + let _ = self.create_failed_document_from_ocr_error( + item.document_id, + "no_extractable_text", + &error_msg, + item.attempts, + ).await; + + // Mark document as failed for no extractable text + sqlx::query( + r#" + UPDATE documents + SET ocr_status = 'failed', + ocr_failure_reason = 'no_extractable_text', + ocr_error = $2, + updated_at = NOW() + WHERE id = $1 + "# + ) + .bind(item.document_id) + .bind(&error_msg) + .execute(&self.pool) + .await?; + + self.mark_failed(item.id, &error_msg).await?; + return Ok(()); } // Save processed image if setting is enabled and image was processed diff --git a/src/tests/documents_tests.rs b/src/tests/documents_tests.rs index 976614e..2340ff9 100644 --- a/src/tests/documents_tests.rs +++ b/src/tests/documents_tests.rs @@ -1946,61 +1946,61 @@ mod deletion_error_handling_tests { let null_confidence_id = database.create_document(null_confidence_doc).await.unwrap().id; let pending_id = database.create_document(pending_doc).await.unwrap().id; - // Test with threshold of 50% - should include low confidence, failed, and null confidence + // Test with threshold of 50% - should include low confidence and failed only let threshold_50_docs = database .find_low_confidence_and_failed_documents(50.0, user_id, crate::models::UserRole::User) .await .unwrap(); - assert_eq!(threshold_50_docs.len(), 3); + assert_eq!(threshold_50_docs.len(), 2); let threshold_50_ids: Vec = threshold_50_docs.iter().map(|d| d.id).collect(); assert!(threshold_50_ids.contains(&low_id)); // 25% confidence assert!(threshold_50_ids.contains(&failed_id)); // failed status - assert!(threshold_50_ids.contains(&null_confidence_id)); // NULL confidence + assert!(!threshold_50_ids.contains(&null_confidence_id)); // NULL confidence excluded assert!(!threshold_50_ids.contains(&high_id)); // 95% confidence assert!(!threshold_50_ids.contains(&medium_id)); // 65% confidence assert!(!threshold_50_ids.contains(&pending_id)); // pending status - // Test with threshold of 70% - should include low and medium confidence, failed, and null confidence + // Test with threshold of 70% - should include low and medium confidence and failed only let threshold_70_docs = database .find_low_confidence_and_failed_documents(70.0, user_id, crate::models::UserRole::User) .await .unwrap(); - assert_eq!(threshold_70_docs.len(), 4); + assert_eq!(threshold_70_docs.len(), 3); let threshold_70_ids: Vec = threshold_70_docs.iter().map(|d| d.id).collect(); assert!(threshold_70_ids.contains(&low_id)); // 25% confidence assert!(threshold_70_ids.contains(&medium_id)); // 65% confidence assert!(threshold_70_ids.contains(&failed_id)); // failed status - assert!(threshold_70_ids.contains(&null_confidence_id)); // NULL confidence + assert!(!threshold_70_ids.contains(&null_confidence_id)); // NULL confidence excluded assert!(!threshold_70_ids.contains(&high_id)); // 95% confidence assert!(!threshold_70_ids.contains(&pending_id)); // pending status - // Test with threshold of 100% - should include all except pending/processing + // Test with threshold of 100% - should include all confidence levels and failed only let threshold_100_docs = database .find_low_confidence_and_failed_documents(100.0, user_id, crate::models::UserRole::User) .await .unwrap(); - assert_eq!(threshold_100_docs.len(), 5); + assert_eq!(threshold_100_docs.len(), 4); let threshold_100_ids: Vec = threshold_100_docs.iter().map(|d| d.id).collect(); assert!(threshold_100_ids.contains(&high_id)); // 95% confidence assert!(threshold_100_ids.contains(&medium_id)); // 65% confidence assert!(threshold_100_ids.contains(&low_id)); // 25% confidence assert!(threshold_100_ids.contains(&failed_id)); // failed status - assert!(threshold_100_ids.contains(&null_confidence_id)); // NULL confidence + assert!(!threshold_100_ids.contains(&null_confidence_id)); // NULL confidence excluded assert!(!threshold_100_ids.contains(&pending_id)); // pending status - // Test with threshold of 0% - should only include failed and null confidence + // Test with threshold of 0% - should only include failed documents let threshold_0_docs = database .find_low_confidence_and_failed_documents(0.0, user_id, crate::models::UserRole::User) .await .unwrap(); - assert_eq!(threshold_0_docs.len(), 2); + assert_eq!(threshold_0_docs.len(), 1); let threshold_0_ids: Vec = threshold_0_docs.iter().map(|d| d.id).collect(); assert!(threshold_0_ids.contains(&failed_id)); // failed status - assert!(threshold_0_ids.contains(&null_confidence_id)); // NULL confidence + assert!(!threshold_0_ids.contains(&null_confidence_id)); // NULL confidence excluded assert!(!threshold_0_ids.contains(&high_id)); // 95% confidence assert!(!threshold_0_ids.contains(&medium_id)); // 65% confidence assert!(!threshold_0_ids.contains(&low_id)); // 25% confidence diff --git a/src/webdav_xml_parser.rs b/src/webdav_xml_parser.rs index 331379b..25399d0 100644 --- a/src/webdav_xml_parser.rs +++ b/src/webdav_xml_parser.rs @@ -101,6 +101,12 @@ pub fn parse_propfind_response(xml_text: &str) -> Result> { "group" => { resp.group = Some(text.trim().to_string()); } + "status" if in_propstat => { + // Check if status is 200 OK + if text.contains("200") { + status_ok = true; + } + } _ => { // Store any other properties as generic metadata // This handles vendor-specific properties from any WebDAV server @@ -139,13 +145,6 @@ pub fn parse_propfind_response(xml_text: &str) -> Result> { } } } - "status" if in_propstat => { - // Check if status is 200 OK - if text.contains("200") { - status_ok = true; - } - } - _ => {} } } } diff --git a/tests/unit_webdav_enhanced_unit_tests.rs b/tests/unit_webdav_enhanced_unit_tests.rs index f6f831f..08fb9e3 100644 --- a/tests/unit_webdav_enhanced_unit_tests.rs +++ b/tests/unit_webdav_enhanced_unit_tests.rs @@ -213,7 +213,7 @@ fn test_webdav_response_parsing_comprehensive() { server_type: Some("nextcloud".to_string()), }; - let service = WebDAVService::new(config).unwrap(); + let service = WebDAVService::new(config.clone()).unwrap(); // Test Nextcloud response parsing let nextcloud_response = mock_nextcloud_propfind_response(); @@ -221,7 +221,22 @@ fn test_webdav_response_parsing_comprehensive() { assert!(files.is_ok()); let files = files.unwrap(); - assert_eq!(files.len(), 3); // Should have 3 files (excluding directory) + + // Filter files by supported extensions + let supported_files: Vec<_> = files.iter() + .filter(|f| { + if let Some(ext) = std::path::Path::new(&f.name) + .extension() + .and_then(|e| e.to_str()) + { + config.file_extensions.contains(&ext.to_lowercase()) + } else { + false + } + }) + .collect(); + + assert_eq!(supported_files.len(), 2); // Should have 2 files with supported extensions (pdf, png) // Verify first file (report.pdf) let pdf_file = files.iter().find(|f| f.name == "report.pdf").unwrap(); @@ -237,12 +252,8 @@ fn test_webdav_response_parsing_comprehensive() { assert_eq!(png_file.etag, "png456"); // ETag should be normalized (quotes removed) assert!(!png_file.is_directory); - // Verify third file (unsupported.docx) - let docx_file = files.iter().find(|f| f.name == "unsupported.docx").unwrap(); - assert_eq!(docx_file.size, 102400); - assert_eq!(docx_file.mime_type, "application/vnd.openxmlformats-officedocument.wordprocessingml.document"); - assert_eq!(docx_file.etag, "docx789"); // ETag should be normalized (quotes removed) - assert!(!docx_file.is_directory); + // Verify that unsupported file (docx) is not included in supported files + assert!(supported_files.iter().find(|f| f.name == "unsupported.docx").is_none()); } #[test]