// Readur/tests/integration_office_extracti...

use anyhow::Result;
use std::fs;
use std::io::Write;
use std::time::Duration;
use tempfile::TempDir;
use tokio::time::timeout;

use readur::ocr::{
    OcrService, OcrConfig,
};

/// Test utilities for creating mock Office documents.
///
/// Owns the temporary directory into which the helper methods write their
/// fixture files (DOCX/XLSX archives, corrupted files, empty files).
// NOTE(review): presumably the directory and everything in it is removed when
// the `TempDir` is dropped (tempfile crate behavior) — confirm.
struct OfficeTestDocuments {
    /// Directory that receives every generated fixture file; tests also pass
    /// its path to the OCR service configuration.
    temp_dir: TempDir,
}

impl OfficeTestDocuments {
    /// Creates a fixture factory backed by a fresh temporary directory.
    ///
    /// # Errors
    /// Returns an error if the temporary directory cannot be created.
    fn new() -> Result<Self> {
        Ok(Self {
            temp_dir: TempDir::new()?,
        })
    }

    /// Escapes the characters that may not appear verbatim in XML element
    /// content (`&`, `<`, `>`), so arbitrary caller text can be embedded in
    /// the generated parts without producing a malformed document.
    fn escape_xml(text: &str) -> String {
        let mut escaped = String::with_capacity(text.len());
        for ch in text.chars() {
            match ch {
                '&' => escaped.push_str("&amp;"),
                '<' => escaped.push_str("&lt;"),
                '>' => escaped.push_str("&gt;"),
                other => escaped.push(other),
            }
        }
        escaped
    }

    /// Starts a new archive entry called `name` (default file options) and
    /// writes `data` as its complete content.
    fn write_zip_entry(
        zip: &mut zip::ZipWriter<fs::File>,
        name: &str,
        data: &[u8],
    ) -> Result<()> {
        zip.start_file(name, zip::write::FileOptions::default())?;
        zip.write_all(data)?;
        Ok(())
    }

    /// Create a mock DOCX file (simplified ZIP structure with XML content).
    ///
    /// The archive contains `[Content_Types].xml`, `_rels/.rels` and a
    /// `word/document.xml` holding `content` in a single paragraph/run.
    /// `content` is XML-escaped before being embedded.
    ///
    /// # Returns
    /// The path of the created file (lossily converted to a `String`).
    ///
    /// # Errors
    /// Returns an error if the file cannot be created or the archive cannot
    /// be written.
    fn create_mock_docx(&self, filename: &str, content: &str) -> Result<String> {
        let file_path = self.temp_dir.path().join(filename);

        // Create a proper ZIP structure for DOCX
        let file = fs::File::create(&file_path)?;
        let mut zip = zip::ZipWriter::new(file);

        Self::write_zip_entry(
            &mut zip,
            "[Content_Types].xml",
            br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
    <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
    <Default Extension="xml" ContentType="application/xml"/>
    <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>"#,
        )?;

        Self::write_zip_entry(
            &mut zip,
            "_rels/.rels",
            br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
    <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>"#,
        )?;

        // word/document.xml carries the actual (escaped) text content.
        let document_xml = format!(r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
    <w:body>
        <w:p>
            <w:r>
                <w:t>{}</w:t>
            </w:r>
        </w:p>
    </w:body>
</w:document>"#, Self::escape_xml(content));
        Self::write_zip_entry(&mut zip, "word/document.xml", document_xml.as_bytes())?;

        zip.finish()?;

        Ok(file_path.to_string_lossy().into_owned())
    }

    /// Create a mock XLSX file with spreadsheet content.
    ///
    /// Each element of `content` becomes one shared string and one row
    /// (`A1`, `A2`, …) in `Sheet1` referencing that shared string by index.
    /// Cell text is XML-escaped before being embedded.
    ///
    /// # Returns
    /// The path of the created file (lossily converted to a `String`).
    ///
    /// # Errors
    /// Returns an error if the file cannot be created or the archive cannot
    /// be written.
    fn create_mock_xlsx(&self, filename: &str, content: &[&str]) -> Result<String> {
        let file_path = self.temp_dir.path().join(filename);

        let file = fs::File::create(&file_path)?;
        let mut zip = zip::ZipWriter::new(file);

        // [Content_Types].xml with shared strings support
        Self::write_zip_entry(
            &mut zip,
            "[Content_Types].xml",
            br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
    <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
    <Default Extension="xml" ContentType="application/xml"/>
    <Override PartName="/xl/workbook.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml"/>
    <Override PartName="/xl/worksheets/sheet1.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml"/>
    <Override PartName="/xl/sharedStrings.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml"/>
</Types>"#,
        )?;

        Self::write_zip_entry(
            &mut zip,
            "_rels/.rels",
            br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
    <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="xl/workbook.xml"/>
</Relationships>"#,
        )?;

        Self::write_zip_entry(
            &mut zip,
            "xl/workbook.xml",
            br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
    <sheets>
        <sheet name="Sheet1" sheetId="1" r:id="rId1"/>
    </sheets>
</workbook>"#,
        )?;

        // xl/_rels/workbook.xml.rels with shared strings relationship
        Self::write_zip_entry(
            &mut zip,
            "xl/_rels/workbook.xml.rels",
            br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
    <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet" Target="worksheets/sheet1.xml"/>
    <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings" Target="sharedStrings.xml"/>
</Relationships>"#,
        )?;

        // xl/sharedStrings.xml holds the text content; both counters equal
        // the number of cells because every string is emitted exactly once.
        let mut shared_strings_xml = format!(r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<sst xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" count="{count}" uniqueCount="{count}">"#, count = content.len());

        for cell_content in content {
            shared_strings_xml.push_str(&format!(r#"
    <si><t>{}</t></si>"#, Self::escape_xml(cell_content)));
        }

        shared_strings_xml.push_str(r#"
</sst>"#);
        Self::write_zip_entry(&mut zip, "xl/sharedStrings.xml", shared_strings_xml.as_bytes())?;

        // xl/worksheets/sheet1.xml references the shared strings by index:
        // row numbers are 1-based, shared-string indices are 0-based.
        let mut worksheet_xml = String::from(r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
    <sheetData>"#);

        for string_idx in 0..content.len() {
            let row_number = string_idx + 1;
            worksheet_xml.push_str(&format!(r#"
        <row r="{}">
            <c r="A{}" t="s">
                <v>{}</v>
            </c>
        </row>"#, row_number, row_number, string_idx));
        }

        worksheet_xml.push_str(r#"
    </sheetData>
</worksheet>"#);

        Self::write_zip_entry(&mut zip, "xl/worksheets/sheet1.xml", worksheet_xml.as_bytes())?;
        zip.finish()?;

        Ok(file_path.to_string_lossy().into_owned())
    }

    /// Create a corrupted file for testing error handling.
    ///
    /// The file contains plain text rather than a ZIP archive, whatever
    /// extension `filename` carries.
    ///
    /// # Errors
    /// Returns an error if the file cannot be written.
    fn create_corrupted_file(&self, filename: &str) -> Result<String> {
        let file_path = self.temp_dir.path().join(filename);
        fs::write(
            &file_path,
            b"This is not a valid Office document but pretends to be one",
        )?;
        Ok(file_path.to_string_lossy().into_owned())
    }

    /// Create an empty (zero-byte) file.
    ///
    /// # Errors
    /// Returns an error if the file cannot be created.
    fn create_empty_file(&self, filename: &str) -> Result<String> {
        let file_path = self.temp_dir.path().join(filename);
        fs::File::create(&file_path)?;
        Ok(file_path.to_string_lossy().into_owned())
    }
}

/// Builds an `OcrService` whose configuration points at `temp_dir`.
fn create_test_ocr_service(temp_dir: &str) -> OcrService {
    OcrService::new_with_config(OcrConfig {
        temp_dir: temp_dir.to_owned(),
    })
}

/// A generated DOCX must yield its body text, a positive confidence and a
/// positive word count.
#[tokio::test]
async fn test_extract_text_from_docx() -> Result<()> {
    const DOCX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    let fixtures = OfficeTestDocuments::new()?;
    let temp_root = fixtures.temp_dir.path().to_string_lossy();
    let service = create_test_ocr_service(&temp_root);

    let expected_text = "This is a test DOCX document with sample content for extraction testing.";
    let docx_path = fixtures.create_mock_docx("test.docx", expected_text)?;

    // The extraction call returns a structured result; inspect its fields.
    let extracted = service
        .extract_text_from_office_document(&docx_path, DOCX_MIME)
        .await?;

    println!("Extracted text: '{}'", extracted.text);
    assert!(!extracted.text.is_empty());
    assert!(extracted.text.contains(expected_text));
    assert!(extracted.confidence > 0.0);
    assert!(extracted.word_count > 0);

    Ok(())
}

/// A generated XLSX must yield non-empty text containing at least part of
/// the cell content.
#[tokio::test]
async fn test_extract_text_from_xlsx() -> Result<()> {
    const XLSX_MIME: &str = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";

    let fixtures = OfficeTestDocuments::new()?;
    let temp_root = fixtures.temp_dir.path().to_string_lossy();
    let service = create_test_ocr_service(&temp_root);

    let cells = ["Header 1", "Data Row 1", "Data Row 2", "Summary Data"];
    let xlsx_path = fixtures.create_mock_xlsx("test.xlsx", &cells)?;

    let extracted = service
        .extract_text_from_office_document(&xlsx_path, XLSX_MIME)
        .await?;

    println!("XLSX extracted text: '{}'", extracted.text);
    assert!(!extracted.text.is_empty());

    // Only a loose content check: some of the cell text has to show up.
    let mentions_header = extracted.text.contains("Header");
    let mentions_data = extracted.text.contains("Data");
    assert!(mentions_header || mentions_data);

    assert!(extracted.confidence > 0.0);
    assert!(extracted.word_count > 0);

    Ok(())
}

/// Exercises `extract_text_from_office_document_with_config` on a simple
/// DOCX and expects a successful, non-empty result.
#[tokio::test]
async fn test_extraction_modes() -> Result<()> {
    const DOCX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    let fixtures = OfficeTestDocuments::new()?;
    let temp_dir = fixtures.temp_dir.path().to_string_lossy().to_string();
    let docx_path =
        fixtures.create_mock_docx("test_modes.docx", "Test document for mode comparison")?;

    // Build the service directly from a config instead of going through the helper.
    let service = OcrService::new_with_config(OcrConfig { temp_dir });

    let outcome = service
        .extract_text_from_office_document_with_config(&docx_path, DOCX_MIME)
        .await;

    // Extraction of the well-formed fixture is expected to succeed.
    assert!(outcome.is_ok(), "XML extraction failed: {:?}", outcome);
    let extracted = outcome?;
    assert!(!extracted.text.is_empty());
    assert!(extracted.confidence > 0.0);
    assert!(extracted.word_count > 0);

    Ok(())
}

/// A valid DOCX must be extracted successfully and contain the text that
/// was written into it.
#[tokio::test]
async fn test_fallback_mechanism() -> Result<()> {
    const DOCX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
    const BODY: &str = "Fallback test content";

    let fixtures = OfficeTestDocuments::new()?;
    let service = OcrService::new_with_config(OcrConfig {
        temp_dir: fixtures.temp_dir.path().to_string_lossy().to_string(),
    });
    let docx_path = fixtures.create_mock_docx("fallback_test.docx", BODY)?;

    // Extraction of the well-formed fixture is expected to succeed.
    let extracted = service
        .extract_text_from_office_document(&docx_path, DOCX_MIME)
        .await?;

    assert!(extracted.text.contains(BODY));
    assert!(extracted.confidence > 0.0);
    assert!(extracted.word_count > 0);

    Ok(())
}

/// Extraction of a tiny document must finish well inside an outer
/// two-second timeout.
#[tokio::test]
async fn test_timeout_handling() -> Result<()> {
    const DOCX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    let fixtures = OfficeTestDocuments::new()?;
    let temp_root = fixtures.temp_dir.path().to_string_lossy();
    let service = create_test_ocr_service(&temp_root);

    let docx_path = fixtures.create_mock_docx("timeout_test.docx", "Test content")?;

    // The outer limit only guards the test as a whole.
    let overall_limit = Duration::from_millis(2000);
    let extraction = service.extract_text_from_office_document_with_config(&docx_path, DOCX_MIME);
    let outcome = timeout(overall_limit, extraction).await;

    // First `?` unwraps the timeout, second one the extraction itself.
    assert!(outcome.is_ok());
    let extracted = outcome??;
    assert!(!extracted.text.is_empty());
    assert!(extracted.confidence > 0.0);
    assert!(extracted.word_count > 0);

    Ok(())
}

/// Corrupted, empty and missing input files must all be reported as errors.
#[tokio::test]
async fn test_error_handling() -> Result<()> {
    const DOCX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    let fixtures = OfficeTestDocuments::new()?;
    let temp_root = fixtures.temp_dir.path().to_string_lossy();
    let service = create_test_ocr_service(&temp_root);

    // Case 1: file with a .docx name whose content is not an archive.
    let corrupted_path = fixtures.create_corrupted_file("corrupted.docx")?;
    let corrupted_outcome = service
        .extract_text_from_office_document(&corrupted_path, DOCX_MIME)
        .await;
    assert!(corrupted_outcome.is_err());

    let error_msg = corrupted_outcome.unwrap_err().to_string();
    let mentions_known_cause = ["corrupted", "invalid", "parsing"]
        .iter()
        .any(|keyword| error_msg.contains(keyword));
    assert!(mentions_known_cause);

    // Case 2: zero-byte file.
    let empty_path = fixtures.create_empty_file("empty.docx")?;
    let empty_outcome = service
        .extract_text_from_office_document(&empty_path, DOCX_MIME)
        .await;
    assert!(empty_outcome.is_err());

    // Case 3: path that does not exist at all.
    let missing_outcome = service
        .extract_text_from_office_document("/path/that/does/not/exist.docx", DOCX_MIME)
        .await;
    assert!(missing_outcome.is_err());

    Ok(())
}

/// Extracts five distinct DOCX files on concurrently spawned tasks and
/// checks that each task returns the text of its own document.
///
/// Every task gets its own `OcrService` instance (all pointing at the same
/// temporary directory), so nothing is shared between the spawned futures.
#[tokio::test]
async fn test_concurrent_extraction() -> Result<()> {
    const DOCX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
    const TASK_COUNT: usize = 5;

    let test_docs = OfficeTestDocuments::new()?;
    let temp_root = test_docs.temp_dir.path().to_string_lossy().to_string();

    // Create all fixtures up front; the first creation failure aborts the test.
    let file_paths = (0..TASK_COUNT)
        .map(|i| {
            let content = format!("Test document {} with unique content", i);
            test_docs.create_mock_docx(&format!("concurrent_test_{}.docx", i), &content)
        })
        .collect::<Result<Vec<_>>>()?;

    // Launch one extraction task per document.
    let tasks: Vec<_> = file_paths
        .into_iter()
        .map(|file_path| {
            let task_service = create_test_ocr_service(&temp_root);
            tokio::spawn(async move {
                task_service
                    .extract_text_from_office_document(&file_path, DOCX_MIME)
                    .await
            })
        })
        .collect();

    // `join_all` keeps the spawn order, so index `i` matches document `i`.
    let results = futures::future::join_all(tasks).await;

    for (i, task_result) in results.into_iter().enumerate() {
        // First `?` surfaces a panicked/cancelled task, second one the extraction error.
        let ocr_result = task_result??;
        assert!(!ocr_result.text.is_empty(), "Task {} failed", i);
        assert!(
            ocr_result.text.contains(&format!("Test document {}", i)),
            "Task {} returned text of a different document: '{}'",
            i,
            ocr_result.text
        );
        assert!(ocr_result.confidence > 0.0, "Task {} reported zero confidence", i);
        assert!(ocr_result.word_count > 0, "Task {} reported zero words", i);
    }

    Ok(())
}

/// Two failing extractions in a row must not prevent a subsequent valid
/// document from being processed by the same service.
#[tokio::test]
async fn test_circuit_breaker() -> Result<()> {
    const DOCX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    let fixtures = OfficeTestDocuments::new()?;
    let service = OcrService::new_with_config(OcrConfig {
        temp_dir: fixtures.temp_dir.path().to_string_lossy().to_string(),
    });

    // Prepare every fixture before running any extraction.
    let valid_path = fixtures.create_mock_docx("circuit_test.docx", "Valid document")?;
    let corrupted_paths = [
        fixtures.create_corrupted_file("corrupted1.docx")?,
        fixtures.create_corrupted_file("corrupted2.docx")?,
    ];

    // Two consecutive failures on the same service instance.
    for corrupted_path in &corrupted_paths {
        let failure = service
            .extract_text_from_office_document(corrupted_path, DOCX_MIME)
            .await;
        assert!(failure.is_err());
    }

    // The earlier failures must have no lasting effect: per the original notes
    // the circuit breaker was removed, so a valid document still succeeds.
    let recovery = service
        .extract_text_from_office_document(&valid_path, DOCX_MIME)
        .await;
    assert!(recovery.is_ok());
    let valid_result = recovery.unwrap();
    assert!(valid_result.text.contains("Valid document"));
    assert!(valid_result.confidence > 0.0);
    assert!(valid_result.word_count > 0);

    Ok(())
}

/// Extracts the same document three times and checks that every run reports
/// text, confidence, word count and a processing time.
#[tokio::test]
async fn test_statistics_tracking() -> Result<()> {
    const DOCX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    let fixtures = OfficeTestDocuments::new()?;
    let temp_root = fixtures.temp_dir.path().to_string_lossy();
    let service = create_test_ocr_service(&temp_root);

    let valid_path = fixtures.create_mock_docx("stats_test.docx", "Statistics test document")?;

    for i in 0..3 {
        let outcome = service
            .extract_text_from_office_document(&valid_path, DOCX_MIME)
            .await;
        assert!(outcome.is_ok(), "Extraction {} failed: {:?}", i, outcome);

        let stats = outcome.unwrap();
        assert!(!stats.text.is_empty());
        assert!(stats.confidence > 0.0);
        assert!(stats.word_count > 0);
        // NOTE(review): this may be flaky if an extraction can finish in under
        // one millisecond — confirm how `processing_time_ms` is measured.
        assert!(stats.processing_time_ms > 0);
    }

    // Reaching this point means every repeated extraction succeeded.
    Ok(())
}

/// The service must advertise the DOCX, XLSX, PDF and PNG MIME types and
/// report Office document support.
#[tokio::test]
async fn test_mime_type_support() -> Result<()> {
    let fixtures = OfficeTestDocuments::new()?;
    let temp_root = fixtures.temp_dir.path().to_string_lossy();
    let service = create_test_ocr_service(&temp_root);

    let supported_types = service.get_supported_mime_types();
    let required_types = [
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/pdf",
        "image/png",
    ];
    for required in required_types {
        assert!(supported_types.contains(&required));
    }

    // Office handling is reported through a dedicated capability flag.
    assert!(service.supports_office_documents());

    Ok(())
}

/// Processes three different DOCX files with one service instance and
/// expects each extraction to return the text of its own document.
#[tokio::test]
async fn test_learning_mechanism() -> Result<()> {
    const DOCX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    let fixtures = OfficeTestDocuments::new()?;
    let service = OcrService::new_with_config(OcrConfig {
        temp_dir: fixtures.temp_dir.path().to_string_lossy().to_string(),
    });

    // Several documents of the same type, handled one after another.
    for i in 0..3 {
        let file_name = format!("learning_{}.docx", i);
        let body = format!("Learning test document {} content", i);
        let docx_path = fixtures.create_mock_docx(&file_name, &body)?;

        let outcome = service
            .extract_text_from_office_document(&docx_path, DOCX_MIME)
            .await;
        assert!(outcome.is_ok(), "Extraction iteration {} failed: {:?}", i, outcome);

        let extracted = outcome?;
        let expected_fragment = format!("document {}", i);
        assert!(!extracted.text.is_empty());
        assert!(extracted.text.contains(&expected_fragment));
        assert!(extracted.confidence > 0.0);
        assert!(extracted.word_count > 0);
    }

    // Reaching this point means the service behaved consistently across all runs.
    Ok(())
}

/// The general-purpose `extract_text` entry point must handle both DOCX and
/// XLSX fixtures and return their text.
#[tokio::test]
async fn test_integration_with_main_extract_text() -> Result<()> {
    const DOCX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
    const XLSX_MIME: &str = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";

    let fixtures = OfficeTestDocuments::new()?;
    let temp_root = fixtures.temp_dir.path().to_string_lossy();
    let service = create_test_ocr_service(&temp_root);

    // DOCX through the main entry point.
    // NOTE(review): the original comment says a fallback strategy is used
    // internally — not visible from this file, confirm.
    let docx_path = fixtures.create_mock_docx(
        "integration.docx",
        "Integration test for main extract_text method",
    )?;
    let docx_text = service.extract_text(&docx_path, DOCX_MIME).await?;

    assert!(!docx_text.is_empty());
    assert!(docx_text.contains("Integration test"));

    // XLSX through the same entry point.
    let cells = ["Cell 1", "Cell 2", "Cell 3"];
    let xlsx_path = fixtures.create_mock_xlsx("integration.xlsx", &cells)?;
    let xlsx_text = service.extract_text(&xlsx_path, XLSX_MIME).await?;

    assert!(!xlsx_text.is_empty());
    assert!(xlsx_text.contains("Cell 1"));

    Ok(())
}

/// Performance benchmark (skipped unless ignored tests are requested).
///
/// Extracts a DOCX containing a sentence repeated 1000 times ten times over
/// and requires the average extraction time to stay below five seconds.
#[tokio::test]
#[ignore]
async fn benchmark_extraction_performance() -> Result<()> {
    const DOCX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
    const ITERATIONS: u32 = 10;

    let fixtures = OfficeTestDocuments::new()?;
    let temp_root = fixtures.temp_dir.path().to_string_lossy();
    let service = create_test_ocr_service(&temp_root);

    // Larger-than-usual fixture so the timing is not dominated by setup noise.
    let large_content = "This is a large test document. ".repeat(1000);
    let docx_path = fixtures.create_mock_docx("benchmark.docx", &large_content)?;

    let stopwatch = std::time::Instant::now();

    for i in 0..ITERATIONS {
        let extracted = service
            .extract_text_from_office_document(&docx_path, DOCX_MIME)
            .await?;

        assert!(!extracted.text.is_empty());
        println!("Iteration {}: extracted {} chars, confidence: {:.1}%",
            i,
            extracted.text.len(),
            extracted.confidence
        );
    }

    let total_time = stopwatch.elapsed();
    let avg_time = total_time / ITERATIONS;

    println!("Average extraction time: {:?}", avg_time);
    println!("Total time for {} iterations: {:?}", ITERATIONS, total_time);

    // Threshold is deliberately generous; tighten it to match real requirements.
    assert!(avg_time < Duration::from_secs(5), "Average extraction time too slow: {:?}", avg_time);

    Ok(())
}