feat(server): mark documents with 0 words as failed, and fix webdav unit tests
This commit is contained in:
parent
bf073132a1
commit
dd90e48fd2
|
|
@@ -1723,7 +1723,6 @@ impl Database {
|
|||
FROM documents
|
||||
WHERE (ocr_confidence IS NOT NULL AND ocr_confidence < $1)
|
||||
OR ocr_status = 'failed'
|
||||
OR (ocr_confidence IS NULL AND ocr_status != 'pending' AND ocr_status != 'processing')
|
||||
ORDER BY
|
||||
CASE WHEN ocr_confidence IS NOT NULL THEN ocr_confidence ELSE -1 END ASC,
|
||||
created_at DESC
|
||||
|
|
@@ -1763,8 +1762,7 @@ impl Database {
|
|||
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata
|
||||
FROM documents
|
||||
WHERE ((ocr_confidence IS NOT NULL AND ocr_confidence < $1)
|
||||
OR ocr_status = 'failed'
|
||||
OR (ocr_confidence IS NULL AND ocr_status != 'pending' AND ocr_status != 'processing'))
|
||||
OR ocr_status = 'failed')
|
||||
AND user_id = $2
|
||||
ORDER BY
|
||||
CASE WHEN ocr_confidence IS NOT NULL THEN ocr_confidence ELSE -1 END ASC,
|
||||
|
|
|
|||
|
|
@@ -396,6 +396,38 @@ impl OcrQueueService {
|
|||
return Ok(());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Handle empty text results - fail the document since no searchable content was extracted
|
||||
let error_msg = format!("No extractable text found in document (0 words)");
|
||||
warn!("⚠️ No searchable content extracted for '{}' | Job: {} | Document: {} | 0 words",
|
||||
filename, item.id, item.document_id);
|
||||
|
||||
// Create failed document record using helper function
|
||||
let _ = self.create_failed_document_from_ocr_error(
|
||||
item.document_id,
|
||||
"no_extractable_text",
|
||||
&error_msg,
|
||||
item.attempts,
|
||||
).await;
|
||||
|
||||
// Mark document as failed for no extractable text
|
||||
sqlx::query(
|
||||
r#"
|
||||
UPDATE documents
|
||||
SET ocr_status = 'failed',
|
||||
ocr_failure_reason = 'no_extractable_text',
|
||||
ocr_error = $2,
|
||||
updated_at = NOW()
|
||||
WHERE id = $1
|
||||
"#
|
||||
)
|
||||
.bind(item.document_id)
|
||||
.bind(&error_msg)
|
||||
.execute(&self.pool)
|
||||
.await?;
|
||||
|
||||
self.mark_failed(item.id, &error_msg).await?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Save processed image if setting is enabled and image was processed
|
||||
|
|
|
|||
|
|
@@ -1946,61 +1946,61 @@ mod deletion_error_handling_tests {
|
|||
let null_confidence_id = database.create_document(null_confidence_doc).await.unwrap().id;
|
||||
let pending_id = database.create_document(pending_doc).await.unwrap().id;
|
||||
|
||||
// Test with threshold of 50% - should include low confidence, failed, and null confidence
|
||||
// Test with threshold of 50% - should include low confidence and failed only
|
||||
let threshold_50_docs = database
|
||||
.find_low_confidence_and_failed_documents(50.0, user_id, crate::models::UserRole::User)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(threshold_50_docs.len(), 3);
|
||||
assert_eq!(threshold_50_docs.len(), 2);
|
||||
let threshold_50_ids: Vec<Uuid> = threshold_50_docs.iter().map(|d| d.id).collect();
|
||||
assert!(threshold_50_ids.contains(&low_id)); // 25% confidence
|
||||
assert!(threshold_50_ids.contains(&failed_id)); // failed status
|
||||
assert!(threshold_50_ids.contains(&null_confidence_id)); // NULL confidence
|
||||
assert!(!threshold_50_ids.contains(&null_confidence_id)); // NULL confidence excluded
|
||||
assert!(!threshold_50_ids.contains(&high_id)); // 95% confidence
|
||||
assert!(!threshold_50_ids.contains(&medium_id)); // 65% confidence
|
||||
assert!(!threshold_50_ids.contains(&pending_id)); // pending status
|
||||
|
||||
// Test with threshold of 70% - should include low and medium confidence, failed, and null confidence
|
||||
// Test with threshold of 70% - should include low and medium confidence and failed only
|
||||
let threshold_70_docs = database
|
||||
.find_low_confidence_and_failed_documents(70.0, user_id, crate::models::UserRole::User)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(threshold_70_docs.len(), 4);
|
||||
assert_eq!(threshold_70_docs.len(), 3);
|
||||
let threshold_70_ids: Vec<Uuid> = threshold_70_docs.iter().map(|d| d.id).collect();
|
||||
assert!(threshold_70_ids.contains(&low_id)); // 25% confidence
|
||||
assert!(threshold_70_ids.contains(&medium_id)); // 65% confidence
|
||||
assert!(threshold_70_ids.contains(&failed_id)); // failed status
|
||||
assert!(threshold_70_ids.contains(&null_confidence_id)); // NULL confidence
|
||||
assert!(!threshold_70_ids.contains(&null_confidence_id)); // NULL confidence excluded
|
||||
assert!(!threshold_70_ids.contains(&high_id)); // 95% confidence
|
||||
assert!(!threshold_70_ids.contains(&pending_id)); // pending status
|
||||
|
||||
// Test with threshold of 100% - should include all except pending/processing
|
||||
// Test with threshold of 100% - should include all confidence levels and failed only
|
||||
let threshold_100_docs = database
|
||||
.find_low_confidence_and_failed_documents(100.0, user_id, crate::models::UserRole::User)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(threshold_100_docs.len(), 5);
|
||||
assert_eq!(threshold_100_docs.len(), 4);
|
||||
let threshold_100_ids: Vec<Uuid> = threshold_100_docs.iter().map(|d| d.id).collect();
|
||||
assert!(threshold_100_ids.contains(&high_id)); // 95% confidence
|
||||
assert!(threshold_100_ids.contains(&medium_id)); // 65% confidence
|
||||
assert!(threshold_100_ids.contains(&low_id)); // 25% confidence
|
||||
assert!(threshold_100_ids.contains(&failed_id)); // failed status
|
||||
assert!(threshold_100_ids.contains(&null_confidence_id)); // NULL confidence
|
||||
assert!(!threshold_100_ids.contains(&null_confidence_id)); // NULL confidence excluded
|
||||
assert!(!threshold_100_ids.contains(&pending_id)); // pending status
|
||||
|
||||
// Test with threshold of 0% - should only include failed and null confidence
|
||||
// Test with threshold of 0% - should only include failed documents
|
||||
let threshold_0_docs = database
|
||||
.find_low_confidence_and_failed_documents(0.0, user_id, crate::models::UserRole::User)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(threshold_0_docs.len(), 2);
|
||||
assert_eq!(threshold_0_docs.len(), 1);
|
||||
let threshold_0_ids: Vec<Uuid> = threshold_0_docs.iter().map(|d| d.id).collect();
|
||||
assert!(threshold_0_ids.contains(&failed_id)); // failed status
|
||||
assert!(threshold_0_ids.contains(&null_confidence_id)); // NULL confidence
|
||||
assert!(!threshold_0_ids.contains(&null_confidence_id)); // NULL confidence excluded
|
||||
assert!(!threshold_0_ids.contains(&high_id)); // 95% confidence
|
||||
assert!(!threshold_0_ids.contains(&medium_id)); // 65% confidence
|
||||
assert!(!threshold_0_ids.contains(&low_id)); // 25% confidence
|
||||
|
|
|
|||
|
|
@@ -101,6 +101,12 @@ pub fn parse_propfind_response(xml_text: &str) -> Result<Vec<FileInfo>> {
|
|||
"group" => {
|
||||
resp.group = Some(text.trim().to_string());
|
||||
}
|
||||
"status" if in_propstat => {
|
||||
// Check if status is 200 OK
|
||||
if text.contains("200") {
|
||||
status_ok = true;
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
// Store any other properties as generic metadata
|
||||
// This handles vendor-specific properties from any WebDAV server
|
||||
|
|
@@ -139,13 +145,6 @@ pub fn parse_propfind_response(xml_text: &str) -> Result<Vec<FileInfo>> {
|
|||
}
|
||||
}
|
||||
}
|
||||
"status" if in_propstat => {
|
||||
// Check if status is 200 OK
|
||||
if text.contains("200") {
|
||||
status_ok = true;
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -213,7 +213,7 @@ fn test_webdav_response_parsing_comprehensive() {
|
|||
server_type: Some("nextcloud".to_string()),
|
||||
};
|
||||
|
||||
let service = WebDAVService::new(config).unwrap();
|
||||
let service = WebDAVService::new(config.clone()).unwrap();
|
||||
|
||||
// Test Nextcloud response parsing
|
||||
let nextcloud_response = mock_nextcloud_propfind_response();
|
||||
|
|
@@ -221,7 +221,22 @@ fn test_webdav_response_parsing_comprehensive() {
|
|||
assert!(files.is_ok());
|
||||
|
||||
let files = files.unwrap();
|
||||
assert_eq!(files.len(), 3); // Should have 3 files (excluding directory)
|
||||
|
||||
// Filter files by supported extensions
|
||||
let supported_files: Vec<_> = files.iter()
|
||||
.filter(|f| {
|
||||
if let Some(ext) = std::path::Path::new(&f.name)
|
||||
.extension()
|
||||
.and_then(|e| e.to_str())
|
||||
{
|
||||
config.file_extensions.contains(&ext.to_lowercase())
|
||||
} else {
|
||||
false
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
assert_eq!(supported_files.len(), 2); // Should have 2 files with supported extensions (pdf, png)
|
||||
|
||||
// Verify first file (report.pdf)
|
||||
let pdf_file = files.iter().find(|f| f.name == "report.pdf").unwrap();
|
||||
|
|
@@ -237,12 +252,8 @@ fn test_webdav_response_parsing_comprehensive() {
|
|||
assert_eq!(png_file.etag, "png456"); // ETag should be normalized (quotes removed)
|
||||
assert!(!png_file.is_directory);
|
||||
|
||||
// Verify third file (unsupported.docx)
|
||||
let docx_file = files.iter().find(|f| f.name == "unsupported.docx").unwrap();
|
||||
assert_eq!(docx_file.size, 102400);
|
||||
assert_eq!(docx_file.mime_type, "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
|
||||
assert_eq!(docx_file.etag, "docx789"); // ETag should be normalized (quotes removed)
|
||||
assert!(!docx_file.is_directory);
|
||||
// Verify that unsupported file (docx) is not included in supported files
|
||||
assert!(supported_files.iter().find(|f| f.name == "unsupported.docx").is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
|
|||
Loading…
Reference in New Issue