feat(server): mark documents with 0 words as failed, and fix webdav unit tests
This commit is contained in:
parent
bf073132a1
commit
dd90e48fd2
|
|
@ -1722,8 +1722,7 @@ impl Database {
|
||||||
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata
|
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata
|
||||||
FROM documents
|
FROM documents
|
||||||
WHERE (ocr_confidence IS NOT NULL AND ocr_confidence < $1)
|
WHERE (ocr_confidence IS NOT NULL AND ocr_confidence < $1)
|
||||||
OR ocr_status = 'failed'
|
OR ocr_status = 'failed'
|
||||||
OR (ocr_confidence IS NULL AND ocr_status != 'pending' AND ocr_status != 'processing')
|
|
||||||
ORDER BY
|
ORDER BY
|
||||||
CASE WHEN ocr_confidence IS NOT NULL THEN ocr_confidence ELSE -1 END ASC,
|
CASE WHEN ocr_confidence IS NOT NULL THEN ocr_confidence ELSE -1 END ASC,
|
||||||
created_at DESC
|
created_at DESC
|
||||||
|
|
@ -1763,8 +1762,7 @@ impl Database {
|
||||||
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata
|
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, ocr_confidence, ocr_word_count, ocr_processing_time_ms, ocr_status, ocr_error, ocr_completed_at, tags, created_at, updated_at, user_id, file_hash, original_created_at, original_modified_at, source_metadata
|
||||||
FROM documents
|
FROM documents
|
||||||
WHERE ((ocr_confidence IS NOT NULL AND ocr_confidence < $1)
|
WHERE ((ocr_confidence IS NOT NULL AND ocr_confidence < $1)
|
||||||
OR ocr_status = 'failed'
|
OR ocr_status = 'failed')
|
||||||
OR (ocr_confidence IS NULL AND ocr_status != 'pending' AND ocr_status != 'processing'))
|
|
||||||
AND user_id = $2
|
AND user_id = $2
|
||||||
ORDER BY
|
ORDER BY
|
||||||
CASE WHEN ocr_confidence IS NOT NULL THEN ocr_confidence ELSE -1 END ASC,
|
CASE WHEN ocr_confidence IS NOT NULL THEN ocr_confidence ELSE -1 END ASC,
|
||||||
|
|
|
||||||
|
|
@ -396,6 +396,38 @@ impl OcrQueueService {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
// Handle empty text results - fail the document since no searchable content was extracted
|
||||||
|
let error_msg = format!("No extractable text found in document (0 words)");
|
||||||
|
warn!("⚠️ No searchable content extracted for '{}' | Job: {} | Document: {} | 0 words",
|
||||||
|
filename, item.id, item.document_id);
|
||||||
|
|
||||||
|
// Create failed document record using helper function
|
||||||
|
let _ = self.create_failed_document_from_ocr_error(
|
||||||
|
item.document_id,
|
||||||
|
"no_extractable_text",
|
||||||
|
&error_msg,
|
||||||
|
item.attempts,
|
||||||
|
).await;
|
||||||
|
|
||||||
|
// Mark document as failed for no extractable text
|
||||||
|
sqlx::query(
|
||||||
|
r#"
|
||||||
|
UPDATE documents
|
||||||
|
SET ocr_status = 'failed',
|
||||||
|
ocr_failure_reason = 'no_extractable_text',
|
||||||
|
ocr_error = $2,
|
||||||
|
updated_at = NOW()
|
||||||
|
WHERE id = $1
|
||||||
|
"#
|
||||||
|
)
|
||||||
|
.bind(item.document_id)
|
||||||
|
.bind(&error_msg)
|
||||||
|
.execute(&self.pool)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
self.mark_failed(item.id, &error_msg).await?;
|
||||||
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
// Save processed image if setting is enabled and image was processed
|
// Save processed image if setting is enabled and image was processed
|
||||||
|
|
|
||||||
|
|
@ -1946,61 +1946,61 @@ mod deletion_error_handling_tests {
|
||||||
let null_confidence_id = database.create_document(null_confidence_doc).await.unwrap().id;
|
let null_confidence_id = database.create_document(null_confidence_doc).await.unwrap().id;
|
||||||
let pending_id = database.create_document(pending_doc).await.unwrap().id;
|
let pending_id = database.create_document(pending_doc).await.unwrap().id;
|
||||||
|
|
||||||
// Test with threshold of 50% - should include low confidence, failed, and null confidence
|
// Test with threshold of 50% - should include low confidence and failed only
|
||||||
let threshold_50_docs = database
|
let threshold_50_docs = database
|
||||||
.find_low_confidence_and_failed_documents(50.0, user_id, crate::models::UserRole::User)
|
.find_low_confidence_and_failed_documents(50.0, user_id, crate::models::UserRole::User)
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
assert_eq!(threshold_50_docs.len(), 3);
|
assert_eq!(threshold_50_docs.len(), 2);
|
||||||
let threshold_50_ids: Vec<Uuid> = threshold_50_docs.iter().map(|d| d.id).collect();
|
let threshold_50_ids: Vec<Uuid> = threshold_50_docs.iter().map(|d| d.id).collect();
|
||||||
assert!(threshold_50_ids.contains(&low_id)); // 25% confidence
|
assert!(threshold_50_ids.contains(&low_id)); // 25% confidence
|
||||||
assert!(threshold_50_ids.contains(&failed_id)); // failed status
|
assert!(threshold_50_ids.contains(&failed_id)); // failed status
|
||||||
assert!(threshold_50_ids.contains(&null_confidence_id)); // NULL confidence
|
assert!(!threshold_50_ids.contains(&null_confidence_id)); // NULL confidence excluded
|
||||||
assert!(!threshold_50_ids.contains(&high_id)); // 95% confidence
|
assert!(!threshold_50_ids.contains(&high_id)); // 95% confidence
|
||||||
assert!(!threshold_50_ids.contains(&medium_id)); // 65% confidence
|
assert!(!threshold_50_ids.contains(&medium_id)); // 65% confidence
|
||||||
assert!(!threshold_50_ids.contains(&pending_id)); // pending status
|
assert!(!threshold_50_ids.contains(&pending_id)); // pending status
|
||||||
|
|
||||||
// Test with threshold of 70% - should include low and medium confidence, failed, and null confidence
|
// Test with threshold of 70% - should include low and medium confidence and failed only
|
||||||
let threshold_70_docs = database
|
let threshold_70_docs = database
|
||||||
.find_low_confidence_and_failed_documents(70.0, user_id, crate::models::UserRole::User)
|
.find_low_confidence_and_failed_documents(70.0, user_id, crate::models::UserRole::User)
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
assert_eq!(threshold_70_docs.len(), 4);
|
assert_eq!(threshold_70_docs.len(), 3);
|
||||||
let threshold_70_ids: Vec<Uuid> = threshold_70_docs.iter().map(|d| d.id).collect();
|
let threshold_70_ids: Vec<Uuid> = threshold_70_docs.iter().map(|d| d.id).collect();
|
||||||
assert!(threshold_70_ids.contains(&low_id)); // 25% confidence
|
assert!(threshold_70_ids.contains(&low_id)); // 25% confidence
|
||||||
assert!(threshold_70_ids.contains(&medium_id)); // 65% confidence
|
assert!(threshold_70_ids.contains(&medium_id)); // 65% confidence
|
||||||
assert!(threshold_70_ids.contains(&failed_id)); // failed status
|
assert!(threshold_70_ids.contains(&failed_id)); // failed status
|
||||||
assert!(threshold_70_ids.contains(&null_confidence_id)); // NULL confidence
|
assert!(!threshold_70_ids.contains(&null_confidence_id)); // NULL confidence excluded
|
||||||
assert!(!threshold_70_ids.contains(&high_id)); // 95% confidence
|
assert!(!threshold_70_ids.contains(&high_id)); // 95% confidence
|
||||||
assert!(!threshold_70_ids.contains(&pending_id)); // pending status
|
assert!(!threshold_70_ids.contains(&pending_id)); // pending status
|
||||||
|
|
||||||
// Test with threshold of 100% - should include all except pending/processing
|
// Test with threshold of 100% - should include all confidence levels and failed only
|
||||||
let threshold_100_docs = database
|
let threshold_100_docs = database
|
||||||
.find_low_confidence_and_failed_documents(100.0, user_id, crate::models::UserRole::User)
|
.find_low_confidence_and_failed_documents(100.0, user_id, crate::models::UserRole::User)
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
assert_eq!(threshold_100_docs.len(), 5);
|
assert_eq!(threshold_100_docs.len(), 4);
|
||||||
let threshold_100_ids: Vec<Uuid> = threshold_100_docs.iter().map(|d| d.id).collect();
|
let threshold_100_ids: Vec<Uuid> = threshold_100_docs.iter().map(|d| d.id).collect();
|
||||||
assert!(threshold_100_ids.contains(&high_id)); // 95% confidence
|
assert!(threshold_100_ids.contains(&high_id)); // 95% confidence
|
||||||
assert!(threshold_100_ids.contains(&medium_id)); // 65% confidence
|
assert!(threshold_100_ids.contains(&medium_id)); // 65% confidence
|
||||||
assert!(threshold_100_ids.contains(&low_id)); // 25% confidence
|
assert!(threshold_100_ids.contains(&low_id)); // 25% confidence
|
||||||
assert!(threshold_100_ids.contains(&failed_id)); // failed status
|
assert!(threshold_100_ids.contains(&failed_id)); // failed status
|
||||||
assert!(threshold_100_ids.contains(&null_confidence_id)); // NULL confidence
|
assert!(!threshold_100_ids.contains(&null_confidence_id)); // NULL confidence excluded
|
||||||
assert!(!threshold_100_ids.contains(&pending_id)); // pending status
|
assert!(!threshold_100_ids.contains(&pending_id)); // pending status
|
||||||
|
|
||||||
// Test with threshold of 0% - should only include failed and null confidence
|
// Test with threshold of 0% - should only include failed documents
|
||||||
let threshold_0_docs = database
|
let threshold_0_docs = database
|
||||||
.find_low_confidence_and_failed_documents(0.0, user_id, crate::models::UserRole::User)
|
.find_low_confidence_and_failed_documents(0.0, user_id, crate::models::UserRole::User)
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
assert_eq!(threshold_0_docs.len(), 2);
|
assert_eq!(threshold_0_docs.len(), 1);
|
||||||
let threshold_0_ids: Vec<Uuid> = threshold_0_docs.iter().map(|d| d.id).collect();
|
let threshold_0_ids: Vec<Uuid> = threshold_0_docs.iter().map(|d| d.id).collect();
|
||||||
assert!(threshold_0_ids.contains(&failed_id)); // failed status
|
assert!(threshold_0_ids.contains(&failed_id)); // failed status
|
||||||
assert!(threshold_0_ids.contains(&null_confidence_id)); // NULL confidence
|
assert!(!threshold_0_ids.contains(&null_confidence_id)); // NULL confidence excluded
|
||||||
assert!(!threshold_0_ids.contains(&high_id)); // 95% confidence
|
assert!(!threshold_0_ids.contains(&high_id)); // 95% confidence
|
||||||
assert!(!threshold_0_ids.contains(&medium_id)); // 65% confidence
|
assert!(!threshold_0_ids.contains(&medium_id)); // 65% confidence
|
||||||
assert!(!threshold_0_ids.contains(&low_id)); // 25% confidence
|
assert!(!threshold_0_ids.contains(&low_id)); // 25% confidence
|
||||||
|
|
|
||||||
|
|
@ -101,6 +101,12 @@ pub fn parse_propfind_response(xml_text: &str) -> Result<Vec<FileInfo>> {
|
||||||
"group" => {
|
"group" => {
|
||||||
resp.group = Some(text.trim().to_string());
|
resp.group = Some(text.trim().to_string());
|
||||||
}
|
}
|
||||||
|
"status" if in_propstat => {
|
||||||
|
// Check if status is 200 OK
|
||||||
|
if text.contains("200") {
|
||||||
|
status_ok = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
_ => {
|
_ => {
|
||||||
// Store any other properties as generic metadata
|
// Store any other properties as generic metadata
|
||||||
// This handles vendor-specific properties from any WebDAV server
|
// This handles vendor-specific properties from any WebDAV server
|
||||||
|
|
@ -139,13 +145,6 @@ pub fn parse_propfind_response(xml_text: &str) -> Result<Vec<FileInfo>> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"status" if in_propstat => {
|
|
||||||
// Check if status is 200 OK
|
|
||||||
if text.contains("200") {
|
|
||||||
status_ok = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => {}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -213,7 +213,7 @@ fn test_webdav_response_parsing_comprehensive() {
|
||||||
server_type: Some("nextcloud".to_string()),
|
server_type: Some("nextcloud".to_string()),
|
||||||
};
|
};
|
||||||
|
|
||||||
let service = WebDAVService::new(config).unwrap();
|
let service = WebDAVService::new(config.clone()).unwrap();
|
||||||
|
|
||||||
// Test Nextcloud response parsing
|
// Test Nextcloud response parsing
|
||||||
let nextcloud_response = mock_nextcloud_propfind_response();
|
let nextcloud_response = mock_nextcloud_propfind_response();
|
||||||
|
|
@ -221,7 +221,22 @@ fn test_webdav_response_parsing_comprehensive() {
|
||||||
assert!(files.is_ok());
|
assert!(files.is_ok());
|
||||||
|
|
||||||
let files = files.unwrap();
|
let files = files.unwrap();
|
||||||
assert_eq!(files.len(), 3); // Should have 3 files (excluding directory)
|
|
||||||
|
// Filter files by supported extensions
|
||||||
|
let supported_files: Vec<_> = files.iter()
|
||||||
|
.filter(|f| {
|
||||||
|
if let Some(ext) = std::path::Path::new(&f.name)
|
||||||
|
.extension()
|
||||||
|
.and_then(|e| e.to_str())
|
||||||
|
{
|
||||||
|
config.file_extensions.contains(&ext.to_lowercase())
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
assert_eq!(supported_files.len(), 2); // Should have 2 files with supported extensions (pdf, png)
|
||||||
|
|
||||||
// Verify first file (report.pdf)
|
// Verify first file (report.pdf)
|
||||||
let pdf_file = files.iter().find(|f| f.name == "report.pdf").unwrap();
|
let pdf_file = files.iter().find(|f| f.name == "report.pdf").unwrap();
|
||||||
|
|
@ -237,12 +252,8 @@ fn test_webdav_response_parsing_comprehensive() {
|
||||||
assert_eq!(png_file.etag, "png456"); // ETag should be normalized (quotes removed)
|
assert_eq!(png_file.etag, "png456"); // ETag should be normalized (quotes removed)
|
||||||
assert!(!png_file.is_directory);
|
assert!(!png_file.is_directory);
|
||||||
|
|
||||||
// Verify third file (unsupported.docx)
|
// Verify that unsupported file (docx) is not included in supported files
|
||||||
let docx_file = files.iter().find(|f| f.name == "unsupported.docx").unwrap();
|
assert!(supported_files.iter().find(|f| f.name == "unsupported.docx").is_none());
|
||||||
assert_eq!(docx_file.size, 102400);
|
|
||||||
assert_eq!(docx_file.mime_type, "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
|
|
||||||
assert_eq!(docx_file.etag, "docx789"); // ETag should be normalized (quotes removed)
|
|
||||||
assert!(!docx_file.is_directory);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue