feat(debug): debug page actually works and does something

This commit is contained in:
perf3ct 2025-07-01 00:15:48 +00:00
parent b307cfd509
commit 231f88f038
4 changed files with 1616 additions and 0 deletions

View File

@ -18,6 +18,7 @@ import WatchFolderPage from './pages/WatchFolderPage';
import DocumentManagementPage from './pages/DocumentManagementPage';
import LabelsPage from './pages/LabelsPage';
import IgnoredFilesPage from './pages/IgnoredFilesPage';
import DebugPage from './pages/DebugPage';
function App(): React.ReactElement {
const { user, loading } = useAuth();
@ -77,6 +78,7 @@ function App(): React.ReactElement {
<Route path="/settings" element={<SettingsPage />} />
<Route path="/documents/management" element={<DocumentManagementPage />} />
<Route path="/ignored-files" element={<IgnoredFilesPage />} />
<Route path="/debug" element={<DebugPage />} />
<Route path="/profile" element={<div>Profile Page - Coming Soon</div>} />
</Routes>
</AppLayout>

View File

@ -37,6 +37,7 @@ import {
Block as BlockIcon,
Api as ApiIcon,
ManageAccounts as ManageIcon,
BugReport as BugReportIcon,
} from '@mui/icons-material';
import { useNavigate, useLocation } from 'react-router-dom';
import { useAuth } from '../../contexts/AuthContext';
@ -72,6 +73,7 @@ const navigationItems: NavigationItem[] = [
{ text: 'Watch Folder', icon: FolderIcon, path: '/watch' },
{ text: 'Document Management', icon: ManageIcon, path: '/documents/management' },
{ text: 'Ignored Files', icon: BlockIcon, path: '/ignored-files' },
{ text: 'Debug', icon: BugReportIcon, path: '/debug' },
];
const AppLayout: React.FC<AppLayoutProps> = ({ children }) => {

File diff suppressed because it is too large Load Diff

View File

@ -58,6 +58,7 @@ pub fn router() -> Router<Arc<AppState>> {
.route("/{id}/ocr", get(get_document_ocr))
.route("/{id}/processed-image", get(get_processed_image))
.route("/{id}/retry-ocr", post(retry_ocr))
.route("/{id}/debug", get(get_document_debug_info))
.route("/duplicates", get(get_user_duplicates))
.route("/failed", get(get_failed_documents))
.route("/failed/{id}/view", get(view_failed_document))
@ -645,6 +646,560 @@ async fn retry_ocr(
}
}
#[utoipa::path(
get,
path = "/api/documents/{id}/debug",
tag = "documents",
security(
("bearer_auth" = [])
),
params(
("id" = uuid::Uuid, Path, description = "Document ID")
),
responses(
(status = 200, description = "Debug information for document processing pipeline", body = String),
(status = 404, description = "Document not found"),
(status = 401, description = "Unauthorized")
)
)]
async fn get_document_debug_info(
State(state): State<Arc<AppState>>,
auth_user: AuthUser,
Path(document_id): Path<uuid::Uuid>,
) -> Result<Json<serde_json::Value>, StatusCode> {
tracing::info!("Starting debug analysis for document {} by user {}", document_id, auth_user.user.id);
// Get the document
let document = match state
.db
.get_document_by_id(document_id, auth_user.user.id, auth_user.user.role)
.await
{
Ok(Some(doc)) => {
tracing::info!("Found document: {} ({})", doc.filename, doc.mime_type);
doc
}
Ok(None) => {
tracing::warn!("Document {} not found for user {}", document_id, auth_user.user.id);
return Err(StatusCode::NOT_FOUND);
}
Err(e) => {
tracing::error!("Database error fetching document {}: {}", document_id, e);
return Err(StatusCode::INTERNAL_SERVER_ERROR);
}
};
// Get user settings
tracing::info!("Fetching user settings for user {}", auth_user.user.id);
let settings = match state
.db
.get_user_settings(auth_user.user.id)
.await
{
Ok(Some(s)) => {
tracing::info!("Found user settings: OCR enabled={}, min_confidence={}", s.enable_background_ocr, s.ocr_min_confidence);
s
}
Ok(None) => {
tracing::info!("No user settings found, using defaults");
crate::models::Settings::default()
}
Err(e) => {
tracing::error!("Error fetching user settings: {}", e);
return Err(StatusCode::INTERNAL_SERVER_ERROR);
}
};
// Get OCR queue history for this document
tracing::info!("Fetching OCR queue history for document {}", document_id);
let queue_history = match sqlx::query(
r#"
SELECT id, status, priority, created_at, started_at, completed_at,
error_message, attempts, worker_id
FROM ocr_queue
WHERE document_id = $1
ORDER BY created_at DESC
LIMIT 10
"#
)
.bind(document_id)
.fetch_all(state.db.get_pool())
.await {
Ok(history) => {
tracing::info!("Queue history query successful, found {} entries", history.len());
history
},
Err(e) => {
tracing::error!("Queue history query error: {}", e);
return Err(StatusCode::INTERNAL_SERVER_ERROR);
}
};
// Get processed image info if it exists
tracing::info!("Fetching processed image for document {}", document_id);
let processed_image = match state
.db
.get_processed_image_by_document_id(document_id, auth_user.user.id)
.await {
Ok(Some(img)) => {
tracing::info!("Found processed image for document {}", document_id);
Some(img)
},
Ok(None) => {
tracing::info!("No processed image found for document {}", document_id);
None
},
Err(e) => {
tracing::warn!("Error fetching processed image for document {}: {}", document_id, e);
None
}
};
// Get failed document record if it exists
tracing::info!("Fetching failed document record for document {}", document_id);
let failed_document = match sqlx::query(
r#"
SELECT failure_reason, failure_stage, error_message, retry_count,
last_retry_at, created_at, content, ocr_text, ocr_confidence,
ocr_word_count, ocr_processing_time_ms
FROM failed_documents
WHERE id = $1 OR existing_document_id = $1
ORDER BY created_at DESC
LIMIT 1
"#
)
.bind(document_id)
.fetch_optional(state.db.get_pool())
.await {
Ok(result) => {
tracing::info!("Failed document query successful, found: {}", result.is_some());
result
},
Err(e) => {
tracing::error!("Failed document query error: {}", e);
return Err(StatusCode::INTERNAL_SERVER_ERROR);
}
};
// Get detailed OCR processing logs and attempts
tracing::info!("Fetching detailed OCR processing logs for document {}", document_id);
let ocr_processing_logs = match sqlx::query(
r#"
SELECT id, status, priority, created_at, started_at, completed_at,
error_message, attempts, worker_id, processing_time_ms, file_size
FROM ocr_queue
WHERE document_id = $1
ORDER BY created_at ASC
"#
)
.bind(document_id)
.fetch_all(state.db.get_pool())
.await {
Ok(logs) => {
tracing::info!("OCR processing logs query successful, found {} entries", logs.len());
logs
},
Err(e) => {
tracing::error!("OCR processing logs query error: {}", e);
return Err(StatusCode::INTERNAL_SERVER_ERROR);
}
};
// File service for file info
let file_service = FileService::new(state.config.upload_path.clone());
// Check if file exists
let file_exists = tokio::fs::metadata(&document.file_path).await.is_ok();
let file_metadata = if file_exists {
tokio::fs::metadata(&document.file_path).await.ok()
} else {
None
};
// Try to analyze file content for additional diagnostic info
tracing::info!("Analyzing file content for document {} (exists: {})", document_id, file_exists);
let file_analysis = if file_exists {
match analyze_file_content(&document.file_path, &document.mime_type).await {
Ok(analysis) => {
tracing::info!("File analysis successful for document {}", document_id);
analysis
},
Err(e) => {
tracing::warn!("Failed to analyze file content for {}: {}", document_id, e);
FileAnalysis {
error_details: Some(format!("File analysis failed: {}", e)),
..Default::default()
}
}
}
} else {
tracing::warn!("File does not exist for document {}, skipping analysis", document_id);
FileAnalysis::default()
};
// Pipeline steps analysis
let mut pipeline_steps = Vec::new();
// Step 1: File Upload & Ingestion
pipeline_steps.push(serde_json::json!({
"step": 1,
"name": "File Upload & Ingestion",
"status": "completed", // Document exists if we got this far
"details": {
"filename": document.filename,
"original_filename": document.original_filename,
"file_size": document.file_size,
"mime_type": document.mime_type,
"file_exists": file_exists,
"file_path": document.file_path,
"created_at": document.created_at,
"file_metadata": file_metadata.as_ref().map(|m| serde_json::json!({
"size": m.len(),
"modified": m.modified().ok(),
"is_file": m.is_file(),
"is_dir": m.is_dir()
})),
"file_analysis": file_analysis
},
"success": true,
"error": None::<String>
}));
// Step 2: OCR Queue Enrollment
let queue_enrollment_status = if queue_history.is_empty() {
if settings.enable_background_ocr {
"not_queued"
} else {
"ocr_disabled"
}
} else {
"queued"
};
pipeline_steps.push(serde_json::json!({
"step": 2,
"name": "OCR Queue Enrollment",
"status": queue_enrollment_status,
"details": {
"user_ocr_enabled": settings.enable_background_ocr,
"queue_entries_count": queue_history.len(),
"queue_history": queue_history.iter().map(|row| serde_json::json!({
"id": row.get::<uuid::Uuid, _>("id"),
"status": row.get::<String, _>("status"),
"priority": row.get::<i32, _>("priority"),
"created_at": row.get::<chrono::DateTime<chrono::Utc>, _>("created_at"),
"started_at": row.get::<Option<chrono::DateTime<chrono::Utc>>, _>("started_at"),
"completed_at": row.get::<Option<chrono::DateTime<chrono::Utc>>, _>("completed_at"),
"error_message": row.get::<Option<String>, _>("error_message"),
"attempts": row.get::<i32, _>("attempts"),
"worker_id": row.get::<Option<String>, _>("worker_id")
})).collect::<Vec<_>>()
},
"success": !queue_history.is_empty() || !settings.enable_background_ocr,
"error": if !settings.enable_background_ocr && queue_history.is_empty() {
Some("OCR processing is disabled in user settings")
} else { None }
}));
// Step 3: OCR Processing
let ocr_status = document.ocr_status.as_deref().unwrap_or("not_started");
let ocr_success = matches!(ocr_status, "completed");
pipeline_steps.push(serde_json::json!({
"step": 3,
"name": "OCR Text Extraction",
"status": ocr_status,
"details": {
"ocr_text_length": document.ocr_text.as_ref().map(|t| t.len()).unwrap_or(0),
"ocr_confidence": document.ocr_confidence,
"ocr_word_count": document.ocr_word_count,
"ocr_processing_time_ms": document.ocr_processing_time_ms,
"ocr_completed_at": document.ocr_completed_at,
"ocr_error": document.ocr_error,
"has_processed_image": processed_image.is_some(),
"processed_image_info": processed_image.as_ref().map(|pi| serde_json::json!({
"image_path": pi.processed_image_path,
"image_width": pi.image_width,
"image_height": pi.image_height,
"file_size": pi.file_size,
"processing_parameters": pi.processing_parameters,
"processing_steps": pi.processing_steps,
"created_at": pi.created_at
}))
},
"success": ocr_success,
"error": document.ocr_error.clone()
}));
// Step 4: Quality Validation
let quality_passed = if let Some(confidence) = document.ocr_confidence {
confidence >= settings.ocr_min_confidence && document.ocr_word_count.unwrap_or(0) > 0
} else {
false
};
pipeline_steps.push(serde_json::json!({
"step": 4,
"name": "OCR Quality Validation",
"status": if ocr_success {
if quality_passed { "passed" } else { "failed" }
} else {
"not_reached"
},
"details": {
"quality_thresholds": {
"min_confidence": settings.ocr_min_confidence,
"brightness_threshold": settings.ocr_quality_threshold_brightness,
"contrast_threshold": settings.ocr_quality_threshold_contrast,
"noise_threshold": settings.ocr_quality_threshold_noise,
"sharpness_threshold": settings.ocr_quality_threshold_sharpness
},
"actual_values": {
"confidence": document.ocr_confidence,
"word_count": document.ocr_word_count,
"processed_image_available": processed_image.is_some(),
"processing_parameters": processed_image.as_ref().map(|pi| &pi.processing_parameters)
},
"quality_checks": {
"confidence_check": document.ocr_confidence.map(|c| c >= settings.ocr_min_confidence),
"word_count_check": document.ocr_word_count.map(|w| w > 0),
"processed_image_available": processed_image.is_some()
}
},
"success": quality_passed,
"error": if !quality_passed && ocr_success {
Some(format!("Quality validation failed: confidence {:.1}% (required: {:.1}%), words: {}",
document.ocr_confidence.unwrap_or(0.0),
settings.ocr_min_confidence,
document.ocr_word_count.unwrap_or(0)
))
} else { None }
}));
// Overall summary
let overall_status = if quality_passed {
"success"
} else if matches!(ocr_status, "failed") {
"failed"
} else if matches!(ocr_status, "processing") {
"processing"
} else if matches!(ocr_status, "pending") {
"pending"
} else {
"not_started"
};
Ok(Json(serde_json::json!({
"document_id": document_id,
"filename": document.filename,
"overall_status": overall_status,
"pipeline_steps": pipeline_steps,
"failed_document_info": failed_document.as_ref().map(|row| serde_json::json!({
"failure_reason": row.get::<String, _>("failure_reason"),
"failure_stage": row.get::<String, _>("failure_stage"),
"error_message": row.get::<Option<String>, _>("error_message"),
"retry_count": row.get::<Option<i32>, _>("retry_count"),
"last_retry_at": row.get::<Option<chrono::DateTime<chrono::Utc>>, _>("last_retry_at"),
"created_at": row.get::<chrono::DateTime<chrono::Utc>, _>("created_at"),
"content_preview": row.get::<Option<String>, _>("content").map(|c|
c.chars().take(200).collect::<String>()
),
"failed_ocr_text": row.get::<Option<String>, _>("ocr_text"),
"failed_ocr_confidence": row.get::<Option<f32>, _>("ocr_confidence"),
"failed_ocr_word_count": row.get::<Option<i32>, _>("ocr_word_count"),
"failed_ocr_processing_time_ms": row.get::<Option<i32>, _>("ocr_processing_time_ms")
})),
"user_settings": {
"enable_background_ocr": settings.enable_background_ocr,
"ocr_min_confidence": settings.ocr_min_confidence,
"max_file_size_mb": settings.max_file_size_mb,
"quality_thresholds": {
"brightness": settings.ocr_quality_threshold_brightness,
"contrast": settings.ocr_quality_threshold_contrast,
"noise": settings.ocr_quality_threshold_noise,
"sharpness": settings.ocr_quality_threshold_sharpness
}
},
"debug_timestamp": chrono::Utc::now(),
"file_analysis": file_analysis,
"detailed_processing_logs": ocr_processing_logs.iter().map(|row| serde_json::json!({
"id": row.get::<uuid::Uuid, _>("id"),
"status": row.get::<String, _>("status"),
"priority": row.get::<i32, _>("priority"),
"created_at": row.get::<chrono::DateTime<chrono::Utc>, _>("created_at"),
"started_at": row.get::<Option<chrono::DateTime<chrono::Utc>>, _>("started_at"),
"completed_at": row.get::<Option<chrono::DateTime<chrono::Utc>>, _>("completed_at"),
"error_message": row.get::<Option<String>, _>("error_message"),
"attempts": row.get::<i32, _>("attempts"),
"worker_id": row.get::<Option<String>, _>("worker_id"),
"processing_time_ms": row.get::<Option<i32>, _>("processing_time_ms"),
"file_size": row.get::<Option<i64>, _>("file_size"),
// Calculate processing duration if both timestamps are available
"processing_duration_ms": if let (Some(started), Some(completed)) = (
row.get::<Option<chrono::DateTime<chrono::Utc>>, _>("started_at"),
row.get::<Option<chrono::DateTime<chrono::Utc>>, _>("completed_at")
) {
Some((completed.timestamp_millis() - started.timestamp_millis()) as i32)
} else {
row.get::<Option<i32>, _>("processing_time_ms")
},
// Calculate queue wait time
"queue_wait_time_ms": if let Some(started) = row.get::<Option<chrono::DateTime<chrono::Utc>>, _>("started_at") {
let created = row.get::<chrono::DateTime<chrono::Utc>, _>("created_at");
Some((started.timestamp_millis() - created.timestamp_millis()) as i32)
} else {
None::<i32>
}
})).collect::<Vec<_>>()
})))
}
#[derive(Debug, Default, serde::Serialize)]
struct FileAnalysis {
file_type: String,
file_size_bytes: u64,
is_readable: bool,
pdf_info: Option<PdfAnalysis>,
text_preview: Option<String>,
error_details: Option<String>,
}
#[derive(Debug, serde::Serialize)]
struct PdfAnalysis {
is_valid_pdf: bool,
page_count: Option<i32>,
has_text_content: bool,
has_images: bool,
is_encrypted: bool,
pdf_version: Option<String>,
font_count: usize,
text_extraction_error: Option<String>,
estimated_text_length: usize,
}
async fn analyze_file_content(file_path: &str, mime_type: &str) -> Result<FileAnalysis, Box<dyn std::error::Error + Send + Sync>> {
let mut analysis = FileAnalysis {
file_type: mime_type.to_string(),
..Default::default()
};
// Try to read file size
if let Ok(metadata) = tokio::fs::metadata(file_path).await {
analysis.file_size_bytes = metadata.len();
}
// Try to read the file
let file_content = match tokio::fs::read(file_path).await {
Ok(content) => {
analysis.is_readable = true;
content
}
Err(e) => {
analysis.error_details = Some(format!("Failed to read file: {}", e));
return Ok(analysis);
}
};
// Analyze based on file type
if mime_type.contains("pdf") {
analysis.pdf_info = Some(analyze_pdf_content(&file_content).await);
} else if mime_type.starts_with("text/") {
// For text files, show a preview
match String::from_utf8(file_content.clone()) {
Ok(text) => {
analysis.text_preview = Some(text.chars().take(500).collect());
}
Err(e) => {
analysis.error_details = Some(format!("Failed to decode text file: {}", e));
}
}
}
Ok(analysis)
}
async fn analyze_pdf_content(content: &[u8]) -> PdfAnalysis {
use std::panic;
let mut analysis = PdfAnalysis {
is_valid_pdf: false,
page_count: None,
has_text_content: false,
has_images: false,
is_encrypted: false,
pdf_version: None,
font_count: 0,
text_extraction_error: None,
estimated_text_length: 0,
};
// Check PDF header
if content.len() < 8 {
analysis.text_extraction_error = Some("File too small to be a valid PDF".to_string());
return analysis;
}
if !content.starts_with(b"%PDF-") {
analysis.text_extraction_error = Some("File does not start with PDF header".to_string());
return analysis;
}
analysis.is_valid_pdf = true;
// Extract PDF version from header
if content.len() >= 8 {
if let Ok(header) = std::str::from_utf8(&content[0..8]) {
if let Some(version) = header.strip_prefix("%PDF-") {
analysis.pdf_version = Some(version.to_string());
}
}
}
// Try to extract text using pdf_extract (same as the main OCR pipeline)
let text_result = panic::catch_unwind(|| {
pdf_extract::extract_text_from_mem(content)
});
match text_result {
Ok(Ok(text)) => {
analysis.has_text_content = !text.trim().is_empty();
analysis.estimated_text_length = text.len();
// Count words for comparison with OCR results
let word_count = text.split_whitespace().count();
if word_count == 0 && text.len() > 0 {
analysis.text_extraction_error = Some("PDF contains characters but no extractable words".to_string());
}
}
Ok(Err(e)) => {
analysis.text_extraction_error = Some(format!("PDF text extraction failed: {}", e));
}
Err(_) => {
analysis.text_extraction_error = Some("PDF text extraction panicked (likely corrupted PDF)".to_string());
}
}
// Basic PDF structure analysis
let content_str = String::from_utf8_lossy(content);
// Check for encryption
analysis.is_encrypted = content_str.contains("/Encrypt");
// Check for images
analysis.has_images = content_str.contains("/Image") || content_str.contains("/XObject");
// Estimate page count (rough)
let page_matches = content_str.matches("/Type /Page").count();
if page_matches > 0 {
analysis.page_count = Some(page_matches as i32);
}
// Count fonts (rough)
analysis.font_count = content_str.matches("/Type /Font").count();
analysis
}
#[utoipa::path(
get,
path = "/api/documents/failed-ocr",