Merge pull request #197 from readur/fix/doc-and-docx-utf-issues
feat(office): try to resolve docx/doc not working
This commit is contained in:
commit
1b7fbed90d
|
|
@ -21,6 +21,9 @@ jobs:
|
|||
services:
|
||||
postgres:
|
||||
image: postgres:17
|
||||
credentials:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
env:
|
||||
POSTGRES_USER: readur
|
||||
POSTGRES_PASSWORD: readur
|
||||
|
|
@ -34,6 +37,12 @@ jobs:
|
|||
--health-retries 5
|
||||
|
||||
steps:
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v5
|
||||
|
||||
|
|
|
|||
|
|
@ -22,6 +22,9 @@ jobs:
|
|||
services:
|
||||
postgres:
|
||||
image: postgres:17
|
||||
credentials:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
env:
|
||||
POSTGRES_USER: readur
|
||||
POSTGRES_PASSWORD: readur
|
||||
|
|
@ -35,9 +38,25 @@ jobs:
|
|||
--health-retries 5
|
||||
|
||||
steps:
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v5
|
||||
|
||||
- name: Pre-pull Docker images for testcontainers
|
||||
run: |
|
||||
echo "Pre-pulling Docker images that testcontainers will use..."
|
||||
docker pull postgres:latest
|
||||
docker pull postgres:15
|
||||
docker pull postgres:15-alpine
|
||||
docker pull postgres:17
|
||||
echo "Images pulled successfully. These are now in local Docker cache."
|
||||
echo "Testcontainers will use the local cached images."
|
||||
|
||||
- name: Remove local env files to prevent conflicts
|
||||
run: |
|
||||
# Remove or rename env files so they don't override CI environment variables
|
||||
|
|
@ -61,7 +80,9 @@ jobs:
|
|||
pkg-config \
|
||||
libclang-dev \
|
||||
ocrmypdf \
|
||||
clang
|
||||
clang \
|
||||
antiword \
|
||||
catdoc
|
||||
|
||||
- name: Setup Rust
|
||||
uses: dtolnay/rust-toolchain@stable
|
||||
|
|
@ -155,6 +176,8 @@ jobs:
|
|||
RUST_LOG: debug
|
||||
RUST_BACKTRACE: 1
|
||||
DEBUG: 1
|
||||
TESTCONTAINERS_RYUK_DISABLED: true
|
||||
DOCKER_HOST: unix:///var/run/docker.sock
|
||||
|
||||
- name: Print server logs on failure
|
||||
if: failure()
|
||||
|
|
|
|||
|
|
@ -38,7 +38,9 @@ jobs:
|
|||
pkg-config \
|
||||
libclang-dev \
|
||||
ocrmypdf \
|
||||
clang
|
||||
clang \
|
||||
antiword \
|
||||
catdoc
|
||||
|
||||
- name: Setup Rust
|
||||
uses: dtolnay/rust-toolchain@stable
|
||||
|
|
|
|||
|
|
@ -33,6 +33,17 @@ version = "2.0.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
|
||||
|
||||
[[package]]
|
||||
name = "aes"
|
||||
version = "0.8.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"cipher",
|
||||
"cpufeatures",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.3"
|
||||
|
|
@ -993,6 +1004,26 @@ dependencies = [
|
|||
"either",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bzip2"
|
||||
version = "0.4.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8"
|
||||
dependencies = [
|
||||
"bzip2-sys",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bzip2-sys"
|
||||
version = "0.1.13+1.0.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"pkg-config",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.2.27"
|
||||
|
|
@ -1152,6 +1183,12 @@ version = "0.9.6"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8"
|
||||
|
||||
[[package]]
|
||||
name = "constant_time_eq"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc"
|
||||
|
||||
[[package]]
|
||||
name = "core-foundation"
|
||||
version = "0.9.4"
|
||||
|
|
@ -2656,7 +2693,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"windows-targets 0.48.5",
|
||||
"windows-targets 0.53.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -3265,12 +3302,35 @@ dependencies = [
|
|||
"syn 2.0.103",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "password-hash"
|
||||
version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7676374caaee8a325c9e7a2ae557f216c5563a171d6997b0ef8a65af35147700"
|
||||
dependencies = [
|
||||
"base64ct",
|
||||
"rand_core 0.6.4",
|
||||
"subtle",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "paste"
|
||||
version = "1.0.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
|
||||
|
||||
[[package]]
|
||||
name = "pbkdf2"
|
||||
version = "0.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "83a0692ec44e4cf1ef28ca317f14f8f07da2d95ec3fa01f86e4467b725e60917"
|
||||
dependencies = [
|
||||
"digest",
|
||||
"hmac",
|
||||
"password-hash",
|
||||
"sha2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "peeking_take_while"
|
||||
version = "0.1.2"
|
||||
|
|
@ -3653,6 +3713,7 @@ dependencies = [
|
|||
"readur",
|
||||
"regex",
|
||||
"reqwest 0.12.23",
|
||||
"rust_xlsxwriter",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sha2",
|
||||
|
|
@ -3677,6 +3738,7 @@ dependencies = [
|
|||
"uuid",
|
||||
"walkdir",
|
||||
"wiremock",
|
||||
"zip 0.6.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -3935,6 +3997,15 @@ dependencies = [
|
|||
"walkdir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rust_xlsxwriter"
|
||||
version = "0.80.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "442eafa04d985ae671e027481e07a5b70fdb1b2cb5e46d9e074b67ca98e01a0a"
|
||||
dependencies = [
|
||||
"zip 2.4.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustc-demangle"
|
||||
version = "0.1.25"
|
||||
|
|
@ -5481,7 +5552,7 @@ dependencies = [
|
|||
"serde_json",
|
||||
"url",
|
||||
"utoipa",
|
||||
"zip",
|
||||
"zip 3.0.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -5742,7 +5813,7 @@ version = "0.1.9"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
|
||||
dependencies = [
|
||||
"windows-sys 0.48.0",
|
||||
"windows-sys 0.59.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -6271,6 +6342,43 @@ dependencies = [
|
|||
"syn 2.0.103",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zip"
|
||||
version = "0.6.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261"
|
||||
dependencies = [
|
||||
"aes",
|
||||
"byteorder",
|
||||
"bzip2",
|
||||
"constant_time_eq",
|
||||
"crc32fast",
|
||||
"crossbeam-utils",
|
||||
"flate2",
|
||||
"hmac",
|
||||
"pbkdf2",
|
||||
"sha1",
|
||||
"time",
|
||||
"zstd",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zip"
|
||||
version = "2.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fabe6324e908f85a1c52063ce7aa26b68dcb7eb6dbc83a2d148403c9bc3eba50"
|
||||
dependencies = [
|
||||
"arbitrary",
|
||||
"crc32fast",
|
||||
"crossbeam-utils",
|
||||
"displaydoc",
|
||||
"flate2",
|
||||
"indexmap 2.9.0",
|
||||
"memchr",
|
||||
"thiserror 2.0.16",
|
||||
"zopfli",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zip"
|
||||
version = "3.0.0"
|
||||
|
|
@ -6303,6 +6411,35 @@ dependencies = [
|
|||
"simd-adler32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd"
|
||||
version = "0.11.2+zstd.1.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4"
|
||||
dependencies = [
|
||||
"zstd-safe",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd-safe"
|
||||
version = "5.0.2+zstd.1.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"zstd-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd-sys"
|
||||
version = "2.0.15+zstd.1.5.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eb81183ddd97d0c74cedf1d50d85c8d08c1b8b68ee863bdee9e706eedba1a237"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"pkg-config",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zune-core"
|
||||
version = "0.4.12"
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ name = "test_runner"
|
|||
path = "src/bin/test_runner.rs"
|
||||
|
||||
|
||||
|
||||
[dependencies]
|
||||
tokio = { version = "1", features = ["full"] }
|
||||
axum = { version = "0.8", features = ["multipart", "ws"] }
|
||||
|
|
@ -61,6 +62,8 @@ sha2 = "0.10"
|
|||
utoipa-swagger-ui = { version = "9", features = ["axum"] }
|
||||
testcontainers = { version = "0.24", optional = true }
|
||||
testcontainers-modules = { version = "0.12", features = ["postgres"], optional = true }
|
||||
# Office document support - now using XML extraction only
|
||||
zip = "0.6" # Still needed for other archive handling
|
||||
rand = "0.8"
|
||||
|
||||
[features]
|
||||
|
|
@ -78,6 +81,8 @@ rand = "0.8"
|
|||
# Database testing dependencies
|
||||
testcontainers = "0.24"
|
||||
testcontainers-modules = { version = "0.12", features = ["postgres"] }
|
||||
# Dependencies for creating proper test Office documents
|
||||
rust_xlsxwriter = "0.80" # For creating proper XLSX test files
|
||||
|
||||
# Enable test-utils feature for all tests
|
||||
readur = { path = ".", features = ["test-utils"] }
|
||||
|
|
|
|||
|
|
@ -86,6 +86,9 @@ RUN apt-get update && apt-get install -y \
|
|||
poppler-utils \
|
||||
ocrmypdf \
|
||||
curl \
|
||||
# Legacy DOC file support (lightweight tools)
|
||||
antiword \
|
||||
catdoc \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
|
|
|
|||
11
README.md
11
README.md
|
|
@ -13,8 +13,8 @@ You can check our our docs at [docs.readur.app](https://docs.readur.app).
|
|||
|---------|-------------|---------------|
|
||||
| 🔐 **Secure Authentication** | JWT-based user authentication with bcrypt password hashing + OIDC/SSO support | [User Management](https://docs.readur.app/user-management-guide/), [OIDC Setup](https://docs.readur.app/oidc-setup/) |
|
||||
| 👥 **User Management** | Role-based access control with Admin and User roles | [User Management Guide](https://docs.readur.app/user-management-guide/) |
|
||||
| 📤 **Smart File Upload** | Drag-and-drop support for PDF, images, text files, and Office documents | [File Upload Guide](https://docs.readur.app/file-upload-guide/) |
|
||||
| 🔍 **Advanced OCR** | Automatic text extraction using Tesseract for searchable document content | [OCR Optimization](https://docs.readur.app/dev/OCR_OPTIMIZATION_GUIDE/) |
|
||||
| 📤 **Smart File Upload** | Drag-and-drop support for PDF, images, text files, and Office documents (DOCX, XLSX, DOC*) | [File Upload Guide](https://docs.readur.app/file-upload-guide/) |
|
||||
| 🔍 **Advanced OCR** | Automatic text extraction using Tesseract and Office document parsing | [OCR Optimization](https://docs.readur.app/dev/OCR_OPTIMIZATION_GUIDE/) |
|
||||
| 🌍 **Multi-Language OCR** | Process documents in multiple languages simultaneously with automatic language detection | [Multi-Language OCR Guide](https://docs.readur.app/multi-language-ocr-guide/) |
|
||||
| 🔎 **Powerful Search** | PostgreSQL full-text search with multiple modes (simple, phrase, fuzzy, boolean) | [Advanced Search Guide](https://docs.readur.app/advanced-search/) |
|
||||
| 🔗 **Multi-Source Sync** | WebDAV, Local Folders, and S3-compatible storage integration | [Sources Guide](https://docs.readur.app/sources-guide/), [S3 Storage Guide](https://docs.readur.app/s3-storage-guide/) |
|
||||
|
|
@ -106,6 +106,13 @@ open http://localhost:8000
|
|||
- 4+ CPU cores, 4GB+ RAM, 50GB+ SSD
|
||||
- See [deployment guide](https://docs.readur.app/deployment/) for details
|
||||
|
||||
### Optional Dependencies
|
||||
For legacy Microsoft Word (.doc) file support, install one of:
|
||||
- `antiword` - Lightweight DOC text extractor
|
||||
- `catdoc` - Alternative DOC text extraction tool
|
||||
|
||||
*Note: Modern Office formats (DOCX, XLSX) are fully supported without additional dependencies.*
|
||||
|
||||
## 🤝 Contributing
|
||||
|
||||
We welcome contributions! Please see our [Contributing Guide](CONTRIBUTING.md) and [Development Setup](https://docs.readur.app/dev/development/) for details.
|
||||
|
|
|
|||
|
|
@ -33,6 +33,9 @@ This guide covers contributing to Readur, setting up a development environment,
|
|||
- PostgreSQL 14+
|
||||
- Tesseract OCR 4.0+
|
||||
- Git
|
||||
- **Optional but recommended** for legacy DOC file support:
|
||||
- antiword (`apt-get install antiword` or `brew install antiword`)
|
||||
- catdoc (`apt-get install catdoc` or `brew install catdoc`)
|
||||
|
||||
### Local Development
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,239 @@
|
|||
# Office Document Support
|
||||
|
||||
Readur provides comprehensive support for extracting text from Microsoft Office documents, enabling full-text search and content analysis across your document library.
|
||||
|
||||
## Supported Formats
|
||||
|
||||
### Modern Office Formats (Native Support)
|
||||
These formats are fully supported without any additional dependencies:
|
||||
|
||||
- **DOCX** - Word documents (Office 2007+)
|
||||
- Full text extraction from document body
|
||||
- Section and paragraph structure preservation
|
||||
- Header and footer content extraction
|
||||
|
||||
- **XLSX** - Excel spreadsheets (Office 2007+)
|
||||
- Text extraction from all worksheets
|
||||
- Cell content with proper formatting
|
||||
- Sheet names and structure preservation
|
||||
|
||||
### Legacy Office Formats (External Tools Required)
|
||||
These older formats require external tools for text extraction:
|
||||
|
||||
- **DOC** - Legacy Word documents (Office 97-2003)
|
||||
- Requires `antiword`, `catdoc`, or `wvText`
|
||||
- Binary format parsing via external tools
|
||||
|
||||
- **XLS** - Legacy Excel spreadsheets (Office 97-2003)
|
||||
- Currently returns an error suggesting conversion to XLSX
|
||||
|
||||
## Installation
|
||||
|
||||
### Docker Installation
|
||||
The official Docker image includes all necessary dependencies:
|
||||
|
||||
```bash
|
||||
docker pull readur/readur:latest
|
||||
```
|
||||
|
||||
The Docker image includes `antiword` and `catdoc` pre-installed for legacy DOC support.
|
||||
|
||||
### Manual Installation
|
||||
|
||||
#### For Modern Formats (DOCX, XLSX)
|
||||
No additional dependencies required - these formats are parsed using built-in XML processing.
|
||||
|
||||
#### For Legacy DOC Files
|
||||
Install one of the following tools:
|
||||
|
||||
**Ubuntu/Debian:**
|
||||
```bash
|
||||
# Option 1: antiword (recommended, lightweight)
|
||||
sudo apt-get install antiword
|
||||
|
||||
# Option 2: catdoc (good alternative)
|
||||
sudo apt-get install catdoc
|
||||
|
||||
# Option 3: wv (includes wvText)
|
||||
sudo apt-get install wv
|
||||
```
|
||||
|
||||
**macOS:**
|
||||
```bash
|
||||
# Option 1: antiword
|
||||
brew install antiword
|
||||
|
||||
# Option 2: catdoc
|
||||
brew install catdoc
|
||||
|
||||
# Option 3: wv
|
||||
brew install wv
|
||||
```
|
||||
|
||||
**Alpine Linux:**
|
||||
```bash
|
||||
# Option 1: antiword
|
||||
apk add antiword
|
||||
|
||||
# Option 2: catdoc
|
||||
apk add catdoc
|
||||
```
|
||||
|
||||
## How It Works
|
||||
|
||||
### Modern Office Format Processing (DOCX/XLSX)
|
||||
|
||||
1. **ZIP Extraction**: Modern Office files are ZIP archives containing XML files
|
||||
2. **XML Parsing**: Secure XML parser extracts text content
|
||||
3. **Content Assembly**: Text from different document parts is assembled
|
||||
4. **Cleaning**: Excessive whitespace and formatting artifacts are removed
|
||||
|
||||
### Legacy DOC Processing
|
||||
|
||||
1. **Tool Detection**: System checks for available tools (antiword, catdoc, wvText)
|
||||
2. **External Processing**: Selected tool converts DOC to plain text
|
||||
3. **Security Validation**: File paths are validated to prevent injection attacks
|
||||
4. **Timeout Protection**: 30-second timeout prevents hanging processes
|
||||
5. **Text Cleaning**: Output is sanitized and normalized
|
||||
|
||||
## Configuration
|
||||
|
||||
### Timeout Settings
|
||||
Office document extraction timeout can be configured in user settings:
|
||||
|
||||
- **Default**: 120 seconds
|
||||
- **Range**: 1-600 seconds
|
||||
- **Applies to**: DOCX and XLSX processing
|
||||
|
||||
### Error Handling
|
||||
|
||||
When processing fails, Readur provides helpful error messages:
|
||||
|
||||
- **Missing Tools**: Instructions for installing required tools
|
||||
- **File Too Large**: Suggestions for file size reduction
|
||||
- **Corrupted Files**: Guidance on file repair options
|
||||
- **Unsupported Formats**: Conversion recommendations
|
||||
|
||||
## Security Features
|
||||
|
||||
### Built-in Protections
|
||||
|
||||
1. **ZIP Bomb Protection**: Limits decompressed size to prevent resource exhaustion
|
||||
2. **Path Validation**: Prevents directory traversal and injection attacks
|
||||
3. **XML Security**: Entity expansion and external entity attacks prevented
|
||||
4. **Process Isolation**: External tools run with limited permissions
|
||||
5. **Timeout Enforcement**: Prevents infinite processing loops
|
||||
|
||||
### File Size Limits
|
||||
|
||||
- **Maximum Office Document Size**: 50MB
|
||||
- **Maximum Decompressed Size**: 500MB (ZIP bomb protection)
|
||||
- **Compression Ratio Limit**: 100:1
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
### Processing Speed
|
||||
|
||||
Typical extraction times:
|
||||
- **DOCX (1-10 pages)**: 50-200ms
|
||||
- **DOCX (100+ pages)**: 500-2000ms
|
||||
- **XLSX (small)**: 100-300ms
|
||||
- **XLSX (large)**: 1000-5000ms
|
||||
- **DOC (via antiword)**: 100-500ms
|
||||
|
||||
### Resource Usage
|
||||
|
||||
- **Memory**: ~10-50MB per document during processing
|
||||
- **CPU**: Single-threaded extraction, minimal impact
|
||||
- **Disk**: Temporary files cleaned automatically
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
#### "No DOC extraction tools available"
|
||||
**Solution**: Install antiword or catdoc as described above.
|
||||
|
||||
#### "Document processing timed out"
|
||||
**Possible causes**:
|
||||
- Very large or complex document
|
||||
- Corrupted file structure
|
||||
- System resource constraints
|
||||
|
||||
**Solutions**:
|
||||
1. Increase timeout in settings
|
||||
2. Convert to PDF format
|
||||
3. Split large documents
|
||||
|
||||
#### "Document format not supported"
|
||||
**Affected formats**: PPT, PPTX, and other Office formats
|
||||
|
||||
**Solution**: Convert to supported format (PDF, DOCX, TXT)
|
||||
|
||||
### Verification
|
||||
|
||||
To verify Office document support:
|
||||
|
||||
```bash
|
||||
# Check for DOC support
|
||||
which antiword || which catdoc || echo "No DOC tools installed"
|
||||
|
||||
# Test extraction (Docker)
|
||||
docker exec readur-container antiword -v
|
||||
|
||||
# Test extraction (Manual)
|
||||
antiword test.doc
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Prefer Modern Formats**: Use DOCX over DOC when possible
|
||||
2. **Convert Legacy Files**: Batch convert DOC to DOCX for better performance
|
||||
3. **Monitor File Sizes**: Large Office files may need splitting
|
||||
4. **Regular Updates**: Keep external tools updated for security
|
||||
5. **Test Extraction**: Verify text extraction quality after setup
|
||||
|
||||
## Migration from DOC to DOCX
|
||||
|
||||
For better performance and reliability, consider converting legacy DOC files:
|
||||
|
||||
### Using LibreOffice (Batch Conversion)
|
||||
```bash
|
||||
libreoffice --headless --convert-to docx *.doc
|
||||
```
|
||||
|
||||
### Using Microsoft Word (Windows)
|
||||
PowerShell script for batch conversion available in `/scripts/convert-doc-to-docx.ps1`
|
||||
|
||||
## API Usage
|
||||
|
||||
### Upload Office Document
|
||||
```bash
|
||||
curl -X POST http://localhost:8000/api/documents/upload \
|
||||
-H "Authorization: Bearer YOUR_TOKEN" \
|
||||
-F "file=@document.docx"
|
||||
```
|
||||
|
||||
### Check Processing Status
|
||||
```bash
|
||||
curl http://localhost:8000/api/documents/{id}/status \
|
||||
-H "Authorization: Bearer YOUR_TOKEN"
|
||||
```
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
Planned improvements for Office document support:
|
||||
|
||||
- [ ] Native DOC parsing (without external tools)
|
||||
- [ ] PowerPoint (PPTX/PPT) support
|
||||
- [ ] Table structure preservation
|
||||
- [ ] Embedded image extraction
|
||||
- [ ] Style and formatting metadata
|
||||
- [ ] Track changes and comments extraction
|
||||
|
||||
## Related Documentation
|
||||
|
||||
- [File Upload Guide](./file-upload-guide.md)
|
||||
- [OCR Optimization Guide](./dev/OCR_OPTIMIZATION_GUIDE.md)
|
||||
- [Advanced Search](./advanced-search.md)
|
||||
- [Configuration Reference](./configuration-reference.md)
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
-- Add office document extraction settings to the settings table
|
||||
-- This migration adds timeout controls for Office document extraction using XML parsing
|
||||
|
||||
-- Add office extraction timeout column (default: 120 seconds)
|
||||
ALTER TABLE settings
|
||||
ADD COLUMN IF NOT EXISTS office_extraction_timeout_seconds INTEGER NOT NULL DEFAULT 120
|
||||
CHECK (office_extraction_timeout_seconds > 0 AND office_extraction_timeout_seconds <= 600);
|
||||
|
||||
-- Add office extraction detailed logging column (default: false for production)
|
||||
ALTER TABLE settings
|
||||
ADD COLUMN IF NOT EXISTS office_extraction_enable_detailed_logging BOOLEAN NOT NULL DEFAULT false;
|
||||
|
||||
-- Add comment to document the new columns
|
||||
COMMENT ON COLUMN settings.office_extraction_timeout_seconds IS
|
||||
'Timeout in seconds for office document extraction (1-600 seconds, default: 120)';
|
||||
|
||||
COMMENT ON COLUMN settings.office_extraction_enable_detailed_logging IS
|
||||
'Enable detailed logging for office document extraction operations (default: false)';
|
||||
|
||||
-- The default values are already set in the column definitions above
|
||||
-- No need to insert default settings as they should be created when users are created
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
use anyhow::Result;
|
||||
use anyhow::{anyhow, Result};
|
||||
use sqlx::Row;
|
||||
use uuid::Uuid;
|
||||
use serde_json::Value;
|
||||
|
|
@ -75,6 +75,9 @@ fn settings_from_row(row: &sqlx::postgres::PgRow) -> crate::models::Settings {
|
|||
webdav_file_extensions: row.get("webdav_file_extensions"),
|
||||
webdav_auto_sync: row.get("webdav_auto_sync"),
|
||||
webdav_sync_interval_minutes: row.get("webdav_sync_interval_minutes"),
|
||||
// Office document extraction configuration
|
||||
office_extraction_timeout_seconds: row.get("office_extraction_timeout_seconds"),
|
||||
office_extraction_enable_detailed_logging: row.get("office_extraction_enable_detailed_logging"),
|
||||
created_at: row.get("created_at"),
|
||||
updated_at: row.get("updated_at"),
|
||||
}
|
||||
|
|
@ -102,6 +105,8 @@ impl Database {
|
|||
ocr_quality_threshold_sharpness, ocr_skip_enhancement,
|
||||
webdav_enabled, webdav_server_url, webdav_username, webdav_password,
|
||||
webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes,
|
||||
COALESCE(office_extraction_timeout_seconds, 120) as office_extraction_timeout_seconds,
|
||||
COALESCE(office_extraction_enable_detailed_logging, true) as office_extraction_enable_detailed_logging,
|
||||
created_at, updated_at
|
||||
FROM settings WHERE user_id = $1"#
|
||||
)
|
||||
|
|
@ -137,6 +142,8 @@ impl Database {
|
|||
ocr_quality_threshold_sharpness, ocr_skip_enhancement,
|
||||
webdav_enabled, webdav_server_url, webdav_username, webdav_password,
|
||||
webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes,
|
||||
COALESCE(office_extraction_timeout_seconds, 120) as office_extraction_timeout_seconds,
|
||||
COALESCE(office_extraction_enable_detailed_logging, false) as office_extraction_enable_detailed_logging,
|
||||
created_at, updated_at
|
||||
FROM settings
|
||||
WHERE webdav_enabled = true AND webdav_auto_sync = true"#
|
||||
|
|
@ -151,7 +158,112 @@ impl Database {
|
|||
Ok(settings_list)
|
||||
}
|
||||
|
||||
/// Validate office extraction settings
|
||||
fn validate_office_extraction_settings(settings: &crate::models::UpdateSettings) -> Result<()> {
|
||||
// Validate timeout
|
||||
if let Some(timeout) = settings.office_extraction_timeout_seconds {
|
||||
if timeout <= 0 {
|
||||
return Err(anyhow!(
|
||||
"Office extraction timeout must be greater than 0 seconds, got: {}",
|
||||
timeout
|
||||
));
|
||||
}
|
||||
if timeout > 600 {
|
||||
return Err(anyhow!(
|
||||
"Office extraction timeout cannot exceed 600 seconds (10 minutes) for system stability, got: {}",
|
||||
timeout
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
// Logging setting doesn't need validation as it's boolean
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Validate general settings constraints
|
||||
fn validate_settings_constraints(settings: &crate::models::UpdateSettings) -> Result<()> {
|
||||
// Validate OCR settings
|
||||
if let Some(concurrent_jobs) = settings.concurrent_ocr_jobs {
|
||||
if concurrent_jobs < 1 || concurrent_jobs > 20 {
|
||||
return Err(anyhow!(
|
||||
"Concurrent OCR jobs must be between 1 and 20, got: {}",
|
||||
concurrent_jobs
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(timeout) = settings.ocr_timeout_seconds {
|
||||
if timeout < 10 || timeout > 1800 {
|
||||
return Err(anyhow!(
|
||||
"OCR timeout must be between 10 and 1800 seconds, got: {}",
|
||||
timeout
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(max_size) = settings.max_file_size_mb {
|
||||
if max_size < 1 || max_size > 500 {
|
||||
return Err(anyhow!(
|
||||
"Maximum file size must be between 1 and 500 MB, got: {}",
|
||||
max_size
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(memory_limit) = settings.memory_limit_mb {
|
||||
if memory_limit < 64 || memory_limit > 8192 {
|
||||
return Err(anyhow!(
|
||||
"Memory limit must be between 64 and 8192 MB, got: {}",
|
||||
memory_limit
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(results_per_page) = settings.search_results_per_page {
|
||||
if results_per_page < 1 || results_per_page > 1000 {
|
||||
return Err(anyhow!(
|
||||
"Search results per page must be between 1 and 1000, got: {}",
|
||||
results_per_page
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(snippet_length) = settings.search_snippet_length {
|
||||
if snippet_length < 10 || snippet_length > 2000 {
|
||||
return Err(anyhow!(
|
||||
"Search snippet length must be between 10 and 2000 characters, got: {}",
|
||||
snippet_length
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(threshold) = settings.fuzzy_search_threshold {
|
||||
if threshold < 0.0 || threshold > 1.0 {
|
||||
return Err(anyhow!(
|
||||
"Fuzzy search threshold must be between 0.0 and 1.0, got: {}",
|
||||
threshold
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
// Validate WebDAV settings
|
||||
if let Some(sync_interval) = settings.webdav_sync_interval_minutes {
|
||||
if sync_interval < 1 || sync_interval > 10080 { // max 1 week
|
||||
return Err(anyhow!(
|
||||
"WebDAV sync interval must be between 1 and 10080 minutes (1 week), got: {}",
|
||||
sync_interval
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn create_or_update_settings(&self, user_id: Uuid, settings: &crate::models::UpdateSettings) -> Result<crate::models::Settings> {
|
||||
// Validate settings before saving
|
||||
Self::validate_office_extraction_settings(settings)?;
|
||||
Self::validate_settings_constraints(settings)?;
|
||||
// Get existing settings to merge with updates
|
||||
let existing = self.get_user_settings(user_id).await?;
|
||||
let defaults = crate::models::Settings::default();
|
||||
|
|
@ -179,9 +291,10 @@ impl Database {
|
|||
ocr_quality_threshold_brightness, ocr_quality_threshold_contrast, ocr_quality_threshold_noise,
|
||||
ocr_quality_threshold_sharpness, ocr_skip_enhancement,
|
||||
webdav_enabled, webdav_server_url, webdav_username, webdav_password,
|
||||
webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes
|
||||
webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes,
|
||||
office_extraction_timeout_seconds, office_extraction_enable_detailed_logging
|
||||
)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, $37, $38, $39, $40, $41, $42, $43, $44, $45, $46, $47, $48, $49, $50, $51, $52, $53)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, $37, $38, $39, $40, $41, $42, $43, $44, $45, $46, $47, $48, $49, $50, $51, $52, $53, $54, $55)
|
||||
ON CONFLICT (user_id) DO UPDATE SET
|
||||
ocr_language = $2,
|
||||
preferred_languages = $3,
|
||||
|
|
@ -235,6 +348,8 @@ impl Database {
|
|||
webdav_file_extensions = $51,
|
||||
webdav_auto_sync = $52,
|
||||
webdav_sync_interval_minutes = $53,
|
||||
office_extraction_timeout_seconds = $54,
|
||||
office_extraction_enable_detailed_logging = $55,
|
||||
updated_at = NOW()
|
||||
RETURNING id, user_id, ocr_language,
|
||||
COALESCE(preferred_languages, '["eng"]'::jsonb) as preferred_languages,
|
||||
|
|
@ -254,6 +369,8 @@ impl Database {
|
|||
ocr_quality_threshold_sharpness, ocr_skip_enhancement,
|
||||
webdav_enabled, webdav_server_url, webdav_username, webdav_password,
|
||||
webdav_watch_folders, webdav_file_extensions, webdav_auto_sync, webdav_sync_interval_minutes,
|
||||
COALESCE(office_extraction_timeout_seconds, 120) as office_extraction_timeout_seconds,
|
||||
COALESCE(office_extraction_enable_detailed_logging, false) as office_extraction_enable_detailed_logging,
|
||||
created_at, updated_at
|
||||
"#
|
||||
)
|
||||
|
|
@ -310,6 +427,8 @@ impl Database {
|
|||
.bind(settings.webdav_file_extensions.as_ref().unwrap_or(¤t.webdav_file_extensions))
|
||||
.bind(settings.webdav_auto_sync.unwrap_or(current.webdav_auto_sync))
|
||||
.bind(settings.webdav_sync_interval_minutes.unwrap_or(current.webdav_sync_interval_minutes))
|
||||
.bind(settings.office_extraction_timeout_seconds.unwrap_or(current.office_extraction_timeout_seconds))
|
||||
.bind(settings.office_extraction_enable_detailed_logging.unwrap_or(current.office_extraction_enable_detailed_logging))
|
||||
.fetch_one(&self.pool)
|
||||
.await?;
|
||||
|
||||
|
|
|
|||
|
|
@ -22,6 +22,20 @@ impl DocumentTransactionManager {
|
|||
}
|
||||
|
||||
/// Update OCR results with full transaction safety and validation
|
||||
/// Sanitize text for PostgreSQL storage
|
||||
/// Removes null bytes and ensures valid UTF-8 encoding
|
||||
fn sanitize_text_for_db(text: &str) -> String {
|
||||
// Remove null bytes which PostgreSQL cannot store in TEXT fields
|
||||
let cleaned: String = text
|
||||
.chars()
|
||||
.filter(|&c| c != '\0')
|
||||
.collect();
|
||||
|
||||
// Additional safety: ensure the string is valid UTF-8
|
||||
// (should already be, but this is defensive)
|
||||
String::from_utf8_lossy(cleaned.as_bytes()).to_string()
|
||||
}
|
||||
|
||||
pub async fn update_ocr_with_validation(
|
||||
&self,
|
||||
document_id: Uuid,
|
||||
|
|
@ -81,7 +95,18 @@ impl DocumentTransactionManager {
|
|||
return Ok(false);
|
||||
}
|
||||
|
||||
// 5. Perform the update with additional safety checks
|
||||
// 5. Sanitize text before database insertion
|
||||
let sanitized_text = Self::sanitize_text_for_db(ocr_text);
|
||||
|
||||
// Log if sanitization was needed
|
||||
if sanitized_text.len() != ocr_text.len() {
|
||||
warn!(
|
||||
"Text sanitization was required for document {}: original {} chars, sanitized {} chars",
|
||||
document_id, ocr_text.len(), sanitized_text.len()
|
||||
);
|
||||
}
|
||||
|
||||
// 6. Perform the update with additional safety checks
|
||||
let updated_rows = sqlx::query!(
|
||||
r#"
|
||||
UPDATE documents
|
||||
|
|
@ -96,7 +121,7 @@ impl DocumentTransactionManager {
|
|||
AND ocr_status != 'completed' -- Extra safety check
|
||||
"#,
|
||||
document_id,
|
||||
ocr_text,
|
||||
sanitized_text.as_str(),
|
||||
confidence,
|
||||
word_count,
|
||||
processing_time_ms
|
||||
|
|
@ -110,7 +135,7 @@ impl DocumentTransactionManager {
|
|||
return Ok(false);
|
||||
}
|
||||
|
||||
// 6. Remove from OCR queue atomically
|
||||
// 7. Remove from OCR queue atomically
|
||||
let queue_removed = sqlx::query!(
|
||||
r#"
|
||||
DELETE FROM ocr_queue
|
||||
|
|
@ -126,12 +151,12 @@ impl DocumentTransactionManager {
|
|||
warn!("Document {} not found in OCR queue during completion", document_id);
|
||||
}
|
||||
|
||||
// 7. Commit transaction
|
||||
// 8. Commit transaction
|
||||
tx.commit().await?;
|
||||
|
||||
info!(
|
||||
"Document {} OCR updated successfully: {} chars, {:.1}% confidence, {} words",
|
||||
document_id, ocr_text.len(), confidence, word_count
|
||||
document_id, sanitized_text.len(), confidence, word_count
|
||||
);
|
||||
|
||||
Ok(true)
|
||||
|
|
@ -530,6 +555,26 @@ impl DistributedLock {
|
|||
mod tests {
|
||||
use super::*;
|
||||
|
||||
// Mock tests for the transaction manager
|
||||
// These would need a test database to run properly
|
||||
#[test]
|
||||
fn test_sanitize_text_for_db() {
|
||||
// Test removing null bytes
|
||||
let text_with_nulls = "Hello\0World\0!";
|
||||
let sanitized = TransactionManager::sanitize_text_for_db(text_with_nulls);
|
||||
assert_eq!(sanitized, "HelloWorld!");
|
||||
|
||||
// Test preserving normal text
|
||||
let normal_text = "This is a normal PDF text with special chars: €£¥";
|
||||
let sanitized = TransactionManager::sanitize_text_for_db(normal_text);
|
||||
assert_eq!(sanitized, normal_text);
|
||||
|
||||
// Test handling empty string
|
||||
let empty = "";
|
||||
let sanitized = TransactionManager::sanitize_text_for_db(empty);
|
||||
assert_eq!(sanitized, "");
|
||||
|
||||
// Test handling text with multiple null bytes
|
||||
let many_nulls = "\0\0Start\0Middle\0\0End\0\0";
|
||||
let sanitized = TransactionManager::sanitize_text_for_db(many_nulls);
|
||||
assert_eq!(sanitized, "StartMiddleEnd");
|
||||
}
|
||||
}
|
||||
|
|
@ -60,6 +60,9 @@ pub struct Settings {
|
|||
pub webdav_file_extensions: Vec<String>,
|
||||
pub webdav_auto_sync: bool,
|
||||
pub webdav_sync_interval_minutes: i32,
|
||||
// Office document extraction configuration
|
||||
pub office_extraction_timeout_seconds: i32,
|
||||
pub office_extraction_enable_detailed_logging: bool,
|
||||
pub created_at: DateTime<Utc>,
|
||||
pub updated_at: DateTime<Utc>,
|
||||
}
|
||||
|
|
@ -118,6 +121,9 @@ pub struct SettingsResponse {
|
|||
pub webdav_file_extensions: Vec<String>,
|
||||
pub webdav_auto_sync: bool,
|
||||
pub webdav_sync_interval_minutes: i32,
|
||||
// Office document extraction configuration
|
||||
pub office_extraction_timeout_seconds: i32,
|
||||
pub office_extraction_enable_detailed_logging: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, ToSchema)]
|
||||
|
|
@ -174,6 +180,9 @@ pub struct UpdateSettings {
|
|||
pub webdav_file_extensions: Option<Vec<String>>,
|
||||
pub webdav_auto_sync: Option<bool>,
|
||||
pub webdav_sync_interval_minutes: Option<i32>,
|
||||
// Office document extraction configuration
|
||||
pub office_extraction_timeout_seconds: Option<i32>,
|
||||
pub office_extraction_enable_detailed_logging: Option<bool>,
|
||||
}
|
||||
|
||||
impl From<Settings> for SettingsResponse {
|
||||
|
|
@ -231,6 +240,9 @@ impl From<Settings> for SettingsResponse {
|
|||
webdav_file_extensions: settings.webdav_file_extensions,
|
||||
webdav_auto_sync: settings.webdav_auto_sync,
|
||||
webdav_sync_interval_minutes: settings.webdav_sync_interval_minutes,
|
||||
// Office document extraction configuration
|
||||
office_extraction_timeout_seconds: settings.office_extraction_timeout_seconds,
|
||||
office_extraction_enable_detailed_logging: settings.office_extraction_enable_detailed_logging,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -295,6 +307,9 @@ impl UpdateSettings {
|
|||
webdav_file_extensions: None,
|
||||
webdav_auto_sync: None,
|
||||
webdav_sync_interval_minutes: None,
|
||||
// Office document extraction configuration - don't update these in language update
|
||||
office_extraction_timeout_seconds: None,
|
||||
office_extraction_enable_detailed_logging: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -372,6 +387,9 @@ impl Default for Settings {
|
|||
],
|
||||
webdav_auto_sync: false,
|
||||
webdav_sync_interval_minutes: 60,
|
||||
// Office document extraction configuration defaults
|
||||
office_extraction_timeout_seconds: 120, // 2 minutes default timeout
|
||||
office_extraction_enable_detailed_logging: false, // Conservative default
|
||||
created_at: chrono::Utc::now(),
|
||||
updated_at: chrono::Utc::now(),
|
||||
}
|
||||
|
|
|
|||
|
|
@ -16,6 +16,33 @@ use tesseract::{Tesseract, PageSegMode, OcrEngineMode};
|
|||
|
||||
use crate::models::Settings;
|
||||
use crate::services::file_service::FileService;
|
||||
use super::xml_extractor::XmlOfficeExtractor;
|
||||
// Removed text_sanitization import - now using minimal inline sanitization
|
||||
|
||||
/// RAII guard for automatic cleanup of temporary files
|
||||
struct FileCleanupGuard {
|
||||
file_path: String,
|
||||
}
|
||||
|
||||
impl FileCleanupGuard {
|
||||
fn new(file_path: &str) -> Self {
|
||||
Self {
|
||||
file_path: file_path.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for FileCleanupGuard {
|
||||
fn drop(&mut self) {
|
||||
if std::path::Path::new(&self.file_path).exists() {
|
||||
if let Err(e) = std::fs::remove_file(&self.file_path) {
|
||||
warn!("Failed to clean up temporary file '{}': {}", self.file_path, e);
|
||||
} else {
|
||||
debug!("Cleaned up temporary file: {}", self.file_path);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ImageQualityStats {
|
||||
|
|
@ -41,6 +68,31 @@ pub struct EnhancedOcrService {
|
|||
}
|
||||
|
||||
impl EnhancedOcrService {
|
||||
// Security limits for Office document processing
|
||||
const MAX_OFFICE_DOCUMENT_SIZE: u64 = 100 * 1024 * 1024; // 100MB for all Office documents
|
||||
const MAX_ENTRY_NAME_LENGTH: usize = 255; // Maximum length of entry names
|
||||
|
||||
/// Remove null bytes from text to prevent PostgreSQL errors
|
||||
/// This is the ONLY sanitization we do - preserving all other original content
|
||||
fn remove_null_bytes(text: &str) -> String {
|
||||
let original_len = text.len();
|
||||
let cleaned: String = text.chars().filter(|&c| c != '\0').collect();
|
||||
|
||||
// Log if we found and removed null bytes (shouldn't happen with valid documents)
|
||||
let cleaned_len = cleaned.len();
|
||||
if cleaned_len < original_len {
|
||||
let null_bytes_removed = text.chars().filter(|&c| c == '\0').count();
|
||||
warn!(
|
||||
"Removed {} null bytes from extracted text (original: {} chars, cleaned: {} chars). \
|
||||
This indicates corrupted or malformed document data.",
|
||||
null_bytes_removed, original_len, cleaned_len
|
||||
);
|
||||
}
|
||||
|
||||
cleaned
|
||||
}
|
||||
|
||||
|
||||
pub fn new(temp_dir: String, file_service: FileService) -> Self {
|
||||
Self { temp_dir, file_service }
|
||||
}
|
||||
|
|
@ -1069,7 +1121,7 @@ impl EnhancedOcrService {
|
|||
let ocr_text_result = tokio::task::spawn_blocking({
|
||||
let temp_ocr_path = temp_ocr_path.clone();
|
||||
move || -> Result<String> {
|
||||
let bytes = std::fs::read(&temp_ocr_path)?;
|
||||
let _bytes = std::fs::read(&temp_ocr_path)?;
|
||||
// Catch panics from pdf-extract library (same pattern as used elsewhere)
|
||||
// Extract text from the OCR'd PDF using ocrmypdf's sidecar option
|
||||
let temp_text_path = format!("{}.txt", temp_ocr_path);
|
||||
|
|
@ -1276,7 +1328,7 @@ impl EnhancedOcrService {
|
|||
// Look for text objects (BT...ET blocks)
|
||||
if !in_text_object && char == 'B' {
|
||||
// Check if this might be the start of "BT" (Begin Text)
|
||||
if let Some(window) = bytes.windows(2).find(|w| w == b"BT") {
|
||||
if let Some(_window) = bytes.windows(2).find(|w| w == b"BT") {
|
||||
in_text_object = true;
|
||||
continue;
|
||||
}
|
||||
|
|
@ -1284,7 +1336,7 @@ impl EnhancedOcrService {
|
|||
|
||||
if in_text_object && char == 'E' {
|
||||
// Check if this might be the start of "ET" (End Text)
|
||||
if let Some(window) = bytes.windows(2).find(|w| w == b"ET") {
|
||||
if let Some(_window) = bytes.windows(2).find(|w| w == b"ET") {
|
||||
in_text_object = false;
|
||||
if !current_text.trim().is_empty() {
|
||||
extracted_text.push_str(¤t_text);
|
||||
|
|
@ -1411,6 +1463,46 @@ impl EnhancedOcrService {
|
|||
self.extract_text(file_path, mime_type, settings).await
|
||||
}
|
||||
|
||||
/// Extract text from Office documents (DOCX, DOC, Excel) using XML extraction
|
||||
pub async fn extract_text_from_office(&self, file_path: &str, mime_type: &str, settings: &Settings) -> Result<OcrResult> {
|
||||
let start_time = std::time::Instant::now();
|
||||
info!("Extracting text from Office document: {} (type: {})", file_path, mime_type);
|
||||
|
||||
// Check file size before processing
|
||||
let metadata = tokio::fs::metadata(file_path).await?;
|
||||
let file_size = metadata.len();
|
||||
|
||||
if file_size > Self::MAX_OFFICE_DOCUMENT_SIZE {
|
||||
return Err(anyhow!(
|
||||
"Office document too large: {:.1} MB (max: {:.1} MB). Consider converting to PDF or splitting the document.",
|
||||
file_size as f64 / (1024.0 * 1024.0),
|
||||
Self::MAX_OFFICE_DOCUMENT_SIZE as f64 / (1024.0 * 1024.0)
|
||||
));
|
||||
}
|
||||
|
||||
// Use XML extraction as the primary method
|
||||
let xml_extractor = XmlOfficeExtractor::new(self.temp_dir.clone());
|
||||
let xml_result = xml_extractor.extract_text_from_office(file_path, mime_type).await?;
|
||||
|
||||
let total_time = start_time.elapsed().as_millis() as u64;
|
||||
|
||||
info!(
|
||||
"Office document extraction completed: {} words in {}ms using XML extraction",
|
||||
xml_result.word_count,
|
||||
total_time
|
||||
);
|
||||
|
||||
// Convert OfficeExtractionResult to OcrResult for backward compatibility
|
||||
Ok(OcrResult {
|
||||
text: xml_result.text,
|
||||
confidence: xml_result.confidence,
|
||||
processing_time_ms: xml_result.processing_time_ms,
|
||||
word_count: xml_result.word_count,
|
||||
preprocessing_applied: vec![format!("XML extraction - {}", xml_result.extraction_method)],
|
||||
processed_image_path: None,
|
||||
})
|
||||
}
|
||||
|
||||
/// Extract text from any supported file type
|
||||
pub async fn extract_text(&self, file_path: &str, mime_type: &str, settings: &Settings) -> Result<OcrResult> {
|
||||
// Resolve the actual file path
|
||||
|
|
@ -1455,13 +1547,16 @@ impl EnhancedOcrService {
|
|||
|
||||
let text = tokio::fs::read_to_string(&resolved_path).await?;
|
||||
|
||||
// Only remove null bytes - preserve all original formatting
|
||||
let cleaned_text = Self::remove_null_bytes(&text);
|
||||
|
||||
// Limit text content size in memory
|
||||
const MAX_TEXT_CONTENT_SIZE: usize = 10 * 1024 * 1024; // 10MB of text content
|
||||
let trimmed_text = if text.len() > MAX_TEXT_CONTENT_SIZE {
|
||||
warn!("Text file content too large ({} chars), truncating to {} chars", text.len(), MAX_TEXT_CONTENT_SIZE);
|
||||
format!("{}... [TEXT TRUNCATED DUE TO SIZE]", &text[..MAX_TEXT_CONTENT_SIZE])
|
||||
let trimmed_text = if cleaned_text.len() > MAX_TEXT_CONTENT_SIZE {
|
||||
warn!("Text file content too large ({} chars), truncating to {} chars", cleaned_text.len(), MAX_TEXT_CONTENT_SIZE);
|
||||
format!("{}... [TEXT TRUNCATED DUE TO SIZE]", &cleaned_text[..MAX_TEXT_CONTENT_SIZE])
|
||||
} else {
|
||||
text.trim().to_string()
|
||||
cleaned_text.trim().to_string()
|
||||
};
|
||||
|
||||
let processing_time = start_time.elapsed().as_millis() as u64;
|
||||
|
|
@ -1476,6 +1571,16 @@ impl EnhancedOcrService {
|
|||
processed_image_path: None, // No image processing for plain text
|
||||
})
|
||||
}
|
||||
// Handle Office document formats
|
||||
mime if matches!(mime,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" |
|
||||
"application/msword" |
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" |
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
||||
) => {
|
||||
// extract_text_from_office now returns OcrResult directly
|
||||
self.extract_text_from_office(&resolved_path, mime, settings).await
|
||||
}
|
||||
_ => Err(anyhow::anyhow!("Unsupported file type: {}", mime_type)),
|
||||
}
|
||||
}
|
||||
|
|
@ -1609,6 +1714,11 @@ impl EnhancedOcrService {
|
|||
pub fn validate_ocr_quality(&self, _result: &OcrResult, _settings: &Settings) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
pub fn count_words_safely(&self, text: &str) -> usize {
|
||||
// Simple word count for non-OCR builds
|
||||
text.split_whitespace().count()
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if the given bytes represent a valid PDF file
|
||||
|
|
|
|||
100
src/ocr/mod.rs
100
src/ocr/mod.rs
|
|
@ -5,6 +5,7 @@ pub mod error;
|
|||
pub mod health;
|
||||
pub mod queue;
|
||||
pub mod tests;
|
||||
pub mod xml_extractor;
|
||||
|
||||
use anyhow::{anyhow, Result};
|
||||
use std::path::Path;
|
||||
|
|
@ -16,12 +17,37 @@ use tesseract::Tesseract;
|
|||
|
||||
pub struct OcrService {
|
||||
health_checker: OcrHealthChecker,
|
||||
temp_dir: String,
|
||||
}
|
||||
|
||||
/// Configuration for the OCR service
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct OcrConfig {
|
||||
/// Temporary directory for processing
|
||||
pub temp_dir: String,
|
||||
}
|
||||
|
||||
impl Default for OcrConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
temp_dir: std::env::var("TEMP_DIR").unwrap_or_else(|_| "/tmp".to_string()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl OcrService {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
health_checker: OcrHealthChecker::new(),
|
||||
temp_dir: std::env::var("TEMP_DIR").unwrap_or_else(|_| "/tmp".to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create OCR service with configuration
|
||||
pub fn new_with_config(config: OcrConfig) -> Self {
|
||||
Self {
|
||||
health_checker: OcrHealthChecker::new(),
|
||||
temp_dir: config.temp_dir,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -158,6 +184,39 @@ impl OcrService {
|
|||
}
|
||||
}
|
||||
|
||||
/// Extract text from Office documents using XML extraction
|
||||
pub async fn extract_text_from_office_document(
|
||||
&self,
|
||||
file_path: &str,
|
||||
mime_type: &str,
|
||||
) -> Result<crate::ocr::enhanced::OcrResult> {
|
||||
// Use XML extraction directly
|
||||
let xml_extractor = crate::ocr::xml_extractor::XmlOfficeExtractor::new(
|
||||
self.temp_dir.clone()
|
||||
);
|
||||
|
||||
let result = xml_extractor.extract_text_from_office(file_path, mime_type).await?;
|
||||
// Convert OfficeExtractionResult to OcrResult for backward compatibility
|
||||
Ok(crate::ocr::enhanced::OcrResult {
|
||||
text: result.text,
|
||||
confidence: result.confidence,
|
||||
processing_time_ms: result.processing_time_ms,
|
||||
word_count: result.word_count,
|
||||
preprocessing_applied: vec![format!("XML extraction - {}", result.extraction_method)],
|
||||
processed_image_path: None,
|
||||
})
|
||||
}
|
||||
|
||||
/// Extract text from Office documents with custom configuration
|
||||
pub async fn extract_text_from_office_document_with_config(
|
||||
&self,
|
||||
file_path: &str,
|
||||
mime_type: &str,
|
||||
) -> Result<crate::ocr::enhanced::OcrResult> {
|
||||
// Use the same XML extraction logic as the basic method
|
||||
self.extract_text_from_office_document(file_path, mime_type).await
|
||||
}
|
||||
|
||||
pub async fn extract_text(&self, file_path: &str, mime_type: &str) -> Result<String> {
|
||||
self.extract_text_with_lang(file_path, mime_type, "eng").await
|
||||
}
|
||||
|
|
@ -165,6 +224,16 @@ impl OcrService {
|
|||
pub async fn extract_text_with_lang(&self, file_path: &str, mime_type: &str, lang: &str) -> Result<String> {
|
||||
match mime_type {
|
||||
"application/pdf" => self.extract_text_from_pdf(file_path).await,
|
||||
// Office document types - use fallback strategy if available
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" |
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" |
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation" |
|
||||
"application/msword" |
|
||||
"application/vnd.ms-excel" |
|
||||
"application/vnd.ms-powerpoint" => {
|
||||
let result = self.extract_text_from_office_document(file_path, mime_type).await?;
|
||||
Ok(result.text)
|
||||
}
|
||||
"image/png" | "image/jpeg" | "image/jpg" | "image/tiff" | "image/bmp" => {
|
||||
self.extract_text_from_image_with_lang(file_path, lang).await
|
||||
}
|
||||
|
|
@ -234,4 +303,35 @@ impl OcrService {
|
|||
false
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Check if Office document extraction is available
|
||||
pub fn supports_office_documents(&self) -> bool {
|
||||
true // XML extraction is always available
|
||||
}
|
||||
|
||||
/// Get supported MIME types
|
||||
pub fn get_supported_mime_types(&self) -> Vec<&'static str> {
|
||||
let mut types = vec![
|
||||
"application/pdf",
|
||||
"image/png",
|
||||
"image/jpeg",
|
||||
"image/jpg",
|
||||
"image/tiff",
|
||||
"image/bmp",
|
||||
"text/plain",
|
||||
];
|
||||
|
||||
// Office document types are always supported via XML extraction
|
||||
types.extend_from_slice(&[
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||
"application/msword",
|
||||
"application/vnd.ms-excel",
|
||||
"application/vnd.ms-powerpoint",
|
||||
]);
|
||||
|
||||
types
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -101,6 +101,9 @@ async fn get_settings(
|
|||
webdav_file_extensions: default.webdav_file_extensions,
|
||||
webdav_auto_sync: default.webdav_auto_sync,
|
||||
webdav_sync_interval_minutes: default.webdav_sync_interval_minutes,
|
||||
// Office document extraction configuration
|
||||
office_extraction_timeout_seconds: default.office_extraction_timeout_seconds,
|
||||
office_extraction_enable_detailed_logging: default.office_extraction_enable_detailed_logging,
|
||||
}
|
||||
},
|
||||
};
|
||||
|
|
|
|||
|
|
@ -387,9 +387,9 @@ async fn process_file(
|
|||
.first_or_octet_stream()
|
||||
.to_string();
|
||||
|
||||
// Check if file is OCR-able
|
||||
if !is_ocr_able_file(&mime_type) {
|
||||
debug!("Skipping non-OCR-able file: {} ({})", filename, mime_type);
|
||||
// Check if file can have text extracted (OCR or Office document text extraction)
|
||||
if !is_text_extractable_file(&mime_type) {
|
||||
debug!("Skipping non-text-extractable file: {} ({})", filename, mime_type);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
|
|
@ -540,11 +540,29 @@ async fn extract_file_info_from_path(path: &Path) -> Result<FileIngestionInfo> {
|
|||
}
|
||||
|
||||
fn is_ocr_able_file(mime_type: &str) -> bool {
|
||||
// Check mime types that are suitable for OCR processing (images and PDFs)
|
||||
matches!(mime_type,
|
||||
"application/pdf" |
|
||||
"application/pdf" |
|
||||
"image/png" | "image/jpeg" | "image/jpg" |
|
||||
"image/tiff" | "image/bmp" | "image/gif"
|
||||
)
|
||||
}
|
||||
|
||||
fn is_text_extractable_file(mime_type: &str) -> bool {
|
||||
// Check mime types that support text extraction (OCR + Office documents + plain text)
|
||||
matches!(mime_type,
|
||||
// OCR-able files
|
||||
"application/pdf" |
|
||||
"image/png" | "image/jpeg" | "image/jpg" |
|
||||
"image/tiff" | "image/bmp" | "image/gif" |
|
||||
// Plain text
|
||||
"text/plain" |
|
||||
"image/png" | "image/jpeg" | "image/jpg" | "image/tiff" | "image/bmp" | "image/gif" |
|
||||
"application/msword" | "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
// Office document formats
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" | // DOCX
|
||||
"application/msword" | // DOC
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | // XLSX
|
||||
"application/vnd.ms-excel" | // XLS
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation" // PPTX (for future)
|
||||
)
|
||||
}
|
||||
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
|
|
@ -0,0 +1,511 @@
|
|||
use readur::ocr::enhanced::EnhancedOcrService;
|
||||
use readur::models::Settings;
|
||||
use readur::services::file_service::FileService;
|
||||
use std::fs;
|
||||
use std::io::Write;
|
||||
use tempfile::TempDir;
|
||||
use zip::write::FileOptions;
|
||||
use zip::{ZipWriter, CompressionMethod};
|
||||
|
||||
/// Helper function to create a proper DOCX file for testing
|
||||
/// Creates a comprehensive DOCX structure that docx-rs can parse
|
||||
fn create_test_docx(content: &str) -> Vec<u8> {
|
||||
let mut buffer = Vec::new();
|
||||
{
|
||||
let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer));
|
||||
let options = FileOptions::default().compression_method(CompressionMethod::Deflated);
|
||||
|
||||
// Add [Content_Types].xml - More comprehensive structure
|
||||
zip.start_file("[Content_Types].xml", options).unwrap();
|
||||
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
|
||||
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
|
||||
<Default Extension="xml" ContentType="application/xml"/>
|
||||
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
|
||||
<Override PartName="/word/styles.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml"/>
|
||||
<Override PartName="/word/settings.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml"/>
|
||||
<Override PartName="/word/fontTable.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml"/>
|
||||
</Types>"#).unwrap();
|
||||
|
||||
// Add _rels/.rels
|
||||
zip.add_directory("_rels/", options).unwrap();
|
||||
zip.start_file("_rels/.rels", options).unwrap();
|
||||
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
||||
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
|
||||
</Relationships>"#).unwrap();
|
||||
|
||||
// Add word directory and its _rels subdirectory
|
||||
zip.add_directory("word/", options).unwrap();
|
||||
zip.add_directory("word/_rels/", options).unwrap();
|
||||
|
||||
// Add word/_rels/document.xml.rels
|
||||
zip.start_file("word/_rels/document.xml.rels", options).unwrap();
|
||||
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
||||
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles" Target="styles.xml"/>
|
||||
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings" Target="settings.xml"/>
|
||||
<Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable" Target="fontTable.xml"/>
|
||||
</Relationships>"#).unwrap();
|
||||
|
||||
// Add word/document.xml with proper structure
|
||||
zip.start_file("word/document.xml", options).unwrap();
|
||||
// Escape XML entities and remove null bytes to create valid XML
|
||||
let escaped_content = content.replace('&', "&")
|
||||
.replace('<', "<")
|
||||
.replace('>', ">")
|
||||
.replace('\0', ""); // Remove null bytes as they're invalid in XML
|
||||
let document_xml = format!(r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
|
||||
<w:body>
|
||||
<w:p>
|
||||
<w:r>
|
||||
<w:t>{}</w:t>
|
||||
</w:r>
|
||||
</w:p>
|
||||
<w:sectPr>
|
||||
<w:pgSz w:w="12240" w:h="15840"/>
|
||||
<w:pgMar w:top="1440" w:right="1440" w:bottom="1440" w:left="1440" w:header="720" w:footer="720" w:gutter="0"/>
|
||||
</w:sectPr>
|
||||
</w:body>
|
||||
</w:document>"#, escaped_content);
|
||||
zip.write_all(document_xml.as_bytes()).unwrap();
|
||||
|
||||
// Add word/styles.xml (minimal styles)
|
||||
zip.start_file("word/styles.xml", options).unwrap();
|
||||
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:styles xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
|
||||
<w:docDefaults>
|
||||
<w:rPrDefault>
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Calibri" w:eastAsia="Calibri" w:hAnsi="Calibri" w:cs="Calibri"/>
|
||||
<w:sz w:val="22"/>
|
||||
<w:szCs w:val="22"/>
|
||||
<w:lang w:val="en-US" w:eastAsia="en-US" w:bidi="ar-SA"/>
|
||||
</w:rPr>
|
||||
</w:rPrDefault>
|
||||
</w:docDefaults>
|
||||
</w:styles>"#).unwrap();
|
||||
|
||||
// Add word/settings.xml (minimal settings)
|
||||
zip.start_file("word/settings.xml", options).unwrap();
|
||||
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:settings xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
|
||||
<w:defaultTabStop w:val="708"/>
|
||||
</w:settings>"#).unwrap();
|
||||
|
||||
// Add word/fontTable.xml (minimal font table)
|
||||
zip.start_file("word/fontTable.xml", options).unwrap();
|
||||
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:fonts xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
|
||||
<w:font w:name="Calibri">
|
||||
<w:panose1 w:val="020F0502020204030204"/>
|
||||
<w:charset w:val="00"/>
|
||||
<w:family w:val="swiss"/>
|
||||
<w:pitch w:val="variable"/>
|
||||
</w:font>
|
||||
</w:fonts>"#).unwrap();
|
||||
|
||||
zip.finish().unwrap();
|
||||
}
|
||||
buffer
|
||||
}
|
||||
|
||||
/// Helper function to create a proper XLSX file for testing
|
||||
/// Uses rust_xlsxwriter to create a real XLSX file that calamine can properly read
|
||||
fn create_test_xlsx(content: &str) -> Vec<u8> {
|
||||
use rust_xlsxwriter::*;
|
||||
|
||||
let mut workbook = Workbook::new();
|
||||
let worksheet = workbook.add_worksheet();
|
||||
|
||||
// Add the test content to cell A1
|
||||
worksheet.write_string(0, 0, content).expect("Failed to write to worksheet");
|
||||
|
||||
// Save to buffer and return bytes
|
||||
workbook.save_to_buffer().expect("Failed to save XLSX to buffer")
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_docx_text_extraction() {
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let docx_path = temp_dir.path().join("test.docx");
|
||||
|
||||
// Create a test DOCX file
|
||||
let test_content = "This is a test DOCX document with some content.";
|
||||
let docx_data = create_test_docx(test_content);
|
||||
fs::write(&docx_path, docx_data).unwrap();
|
||||
|
||||
// Create OCR service
|
||||
let ocr_service = EnhancedOcrService {
|
||||
temp_dir: temp_dir.path().to_str().unwrap().to_string(),
|
||||
file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
|
||||
};
|
||||
|
||||
let settings = Settings::default();
|
||||
|
||||
// Extract text from DOCX
|
||||
let result = ocr_service.extract_text_from_office(
|
||||
docx_path.to_str().unwrap(),
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
&settings
|
||||
).await;
|
||||
|
||||
assert!(result.is_ok(), "DOCX extraction should succeed");
|
||||
let ocr_result = result.unwrap();
|
||||
// The extracted text may include section breaks and other document structure
|
||||
assert!(ocr_result.text.contains(test_content), "Should contain the test content: {}", ocr_result.text);
|
||||
assert_eq!(ocr_result.confidence, 100.0);
|
||||
assert!(ocr_result.word_count > 0);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_xlsx_text_extraction() {
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let xlsx_path = temp_dir.path().join("test.xlsx");
|
||||
|
||||
// Create a test XLSX file
|
||||
let test_content = "Excel spreadsheet test data";
|
||||
let xlsx_data = create_test_xlsx(test_content);
|
||||
fs::write(&xlsx_path, xlsx_data).unwrap();
|
||||
|
||||
// Create OCR service
|
||||
let ocr_service = EnhancedOcrService {
|
||||
temp_dir: temp_dir.path().to_str().unwrap().to_string(),
|
||||
file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
|
||||
};
|
||||
|
||||
let settings = Settings::default();
|
||||
|
||||
// Extract text from XLSX
|
||||
let result = ocr_service.extract_text_from_office(
|
||||
xlsx_path.to_str().unwrap(),
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
&settings
|
||||
).await;
|
||||
|
||||
assert!(result.is_ok(), "XLSX extraction should succeed");
|
||||
let ocr_result = result.unwrap();
|
||||
assert_eq!(ocr_result.text.trim(), test_content);
|
||||
assert_eq!(ocr_result.confidence, 100.0);
|
||||
assert!(ocr_result.word_count > 0);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_null_byte_removal() {
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let docx_path = temp_dir.path().join("test_nulls.docx");
|
||||
|
||||
// Create a test DOCX file with null bytes embedded (shouldn't happen in real files)
|
||||
let test_content = "Test\0with\0null\0bytes";
|
||||
let docx_data = create_test_docx(test_content);
|
||||
fs::write(&docx_path, docx_data).unwrap();
|
||||
|
||||
// Create OCR service
|
||||
let ocr_service = EnhancedOcrService {
|
||||
temp_dir: temp_dir.path().to_str().unwrap().to_string(),
|
||||
file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
|
||||
};
|
||||
|
||||
let settings = Settings::default();
|
||||
|
||||
// Extract text from DOCX
|
||||
let result = ocr_service.extract_text_from_office(
|
||||
docx_path.to_str().unwrap(),
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
&settings
|
||||
).await;
|
||||
|
||||
assert!(result.is_ok(), "DOCX extraction should succeed even with null bytes");
|
||||
let ocr_result = result.unwrap();
|
||||
|
||||
// Verify null bytes were removed (they were stripped during DOCX creation since they're invalid in XML)
|
||||
assert!(!ocr_result.text.contains('\0'), "Extracted text should not contain null bytes");
|
||||
// The XML extraction may add section breaks, so check if the main text is present
|
||||
assert!(ocr_result.text.contains("Testwithnullbytes"), "Extracted text should contain the expected content");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_preserve_formatting() {
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let docx_path = temp_dir.path().join("test_formatting.docx");
|
||||
|
||||
// Create a test DOCX file with special formatting
|
||||
let test_content = "Line 1\n\nLine 2\t\tTabbed\n Indented ";
|
||||
let docx_data = create_test_docx(test_content);
|
||||
fs::write(&docx_path, docx_data).unwrap();
|
||||
|
||||
// Create OCR service
|
||||
let ocr_service = EnhancedOcrService {
|
||||
temp_dir: temp_dir.path().to_str().unwrap().to_string(),
|
||||
file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
|
||||
};
|
||||
|
||||
let settings = Settings::default();
|
||||
|
||||
// Extract text from DOCX
|
||||
let result = ocr_service.extract_text_from_office(
|
||||
docx_path.to_str().unwrap(),
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
&settings
|
||||
).await;
|
||||
|
||||
assert!(result.is_ok(), "DOCX extraction should succeed");
|
||||
let ocr_result = result.unwrap();
|
||||
|
||||
// Verify formatting is preserved (no aggressive sanitization)
|
||||
// Note: The DOCX might not preserve exact formatting, but we shouldn't be removing it
|
||||
assert!(ocr_result.text.contains("Line 1"));
|
||||
assert!(ocr_result.text.contains("Line 2"));
|
||||
assert!(ocr_result.text.contains("Tabbed"));
|
||||
assert!(ocr_result.text.contains("Indented"));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_empty_docx() {
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let docx_path = temp_dir.path().join("empty.docx");
|
||||
|
||||
// Create an empty DOCX file
|
||||
let docx_data = create_test_docx("");
|
||||
fs::write(&docx_path, docx_data).unwrap();
|
||||
|
||||
// Create OCR service
|
||||
let ocr_service = EnhancedOcrService {
|
||||
temp_dir: temp_dir.path().to_str().unwrap().to_string(),
|
||||
file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
|
||||
};
|
||||
|
||||
let settings = Settings::default();
|
||||
|
||||
// Extract text from empty DOCX
|
||||
let result = ocr_service.extract_text_from_office(
|
||||
docx_path.to_str().unwrap(),
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
&settings
|
||||
).await;
|
||||
|
||||
// Should fail with appropriate error message
|
||||
assert!(result.is_err(), "Empty DOCX should return an error");
|
||||
let error_msg = result.unwrap_err().to_string();
|
||||
assert!(error_msg.contains("No text content found") || error_msg.contains("empty"));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_corrupted_docx() {
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let docx_path = temp_dir.path().join("corrupted.docx");
|
||||
|
||||
// Create a corrupted DOCX file (not a valid ZIP)
|
||||
fs::write(&docx_path, b"This is not a valid DOCX file").unwrap();
|
||||
|
||||
// Create OCR service
|
||||
let ocr_service = EnhancedOcrService {
|
||||
temp_dir: temp_dir.path().to_str().unwrap().to_string(),
|
||||
file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
|
||||
};
|
||||
|
||||
let settings = Settings::default();
|
||||
|
||||
// Try to extract text from corrupted DOCX
|
||||
let result = ocr_service.extract_text_from_office(
|
||||
docx_path.to_str().unwrap(),
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
&settings
|
||||
).await;
|
||||
|
||||
// Should fail with appropriate error message
|
||||
assert!(result.is_err(), "Corrupted DOCX should return an error");
|
||||
let error_msg = result.unwrap_err().to_string();
|
||||
// Check for various error messages that indicate a corrupted file
|
||||
assert!(
|
||||
error_msg.contains("invalid Zip archive") || // Actual error from zip crate
|
||||
error_msg.contains("Invalid ZIP") ||
|
||||
error_msg.contains("corrupted") ||
|
||||
error_msg.contains("Could not find central directory"),
|
||||
"Expected error about invalid/corrupted file, got: {}", error_msg
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_legacy_doc_error() {
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let doc_path = temp_dir.path().join("legacy.doc");
|
||||
|
||||
// Create a fake DOC file
|
||||
fs::write(&doc_path, b"Legacy DOC format").unwrap();
|
||||
|
||||
// Create OCR service
|
||||
let ocr_service = EnhancedOcrService {
|
||||
temp_dir: temp_dir.path().to_str().unwrap().to_string(),
|
||||
file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
|
||||
};
|
||||
|
||||
let settings = Settings::default();
|
||||
|
||||
// Try to extract text from legacy DOC
|
||||
let result = ocr_service.extract_text_from_office(
|
||||
doc_path.to_str().unwrap(),
|
||||
"application/msword",
|
||||
&settings
|
||||
).await;
|
||||
|
||||
// Should fail with helpful error about external tools not available
|
||||
assert!(result.is_err(), "Legacy DOC should return an error");
|
||||
let error_msg = result.unwrap_err().to_string();
|
||||
// The error message now comes from external tool extraction failure
|
||||
assert!(error_msg.contains("DOC extraction tools") || error_msg.contains("antiword") || error_msg.contains("catdoc"),
|
||||
"Expected error about DOC extraction tools, got: {}", error_msg);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_file_size_limit() {
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let docx_path = temp_dir.path().join("large.docx");
|
||||
|
||||
// Create a DOCX that would exceed size limit (simulated by very long content)
|
||||
let large_content = "x".repeat(100_000); // Large but not actually 50MB in ZIP
|
||||
let docx_data = create_test_docx(&large_content);
|
||||
fs::write(&docx_path, docx_data).unwrap();
|
||||
|
||||
// Create OCR service
|
||||
let ocr_service = EnhancedOcrService {
|
||||
temp_dir: temp_dir.path().to_str().unwrap().to_string(),
|
||||
file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
|
||||
};
|
||||
|
||||
let settings = Settings::default();
|
||||
|
||||
// Extract text from large DOCX
|
||||
let result = ocr_service.extract_text_from_office(
|
||||
docx_path.to_str().unwrap(),
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
&settings
|
||||
).await;
|
||||
|
||||
// Should succeed for content within limits
|
||||
assert!(result.is_ok(), "DOCX within size limits should succeed");
|
||||
}
|
||||
|
||||
/// Helper function to create a minimal DOC file for testing.
///
/// Note: this creates a *fake* DOC file — the real DOC format is a complex
/// binary container — so it carries the correct magic bytes but will fail in
/// any actual conversion/extraction step.
///
/// Returns a 1024-byte buffer: the 8-byte OLE2 compound-document signature,
/// a short ASCII marker, then zero padding.
fn create_fake_doc_file() -> Vec<u8> {
    // Preallocate the final padded size so the buffer never reallocates.
    let mut doc_data = Vec::with_capacity(1024);

    // Legacy DOC files start with the OLE2 compound-document signature.
    doc_data.extend_from_slice(&[0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1]);

    // Add some recognizable content so the file looks non-trivial.
    doc_data.extend_from_slice(b"This is fake DOC content for testing purposes");

    // Pad with zeros to a reasonable file size.
    doc_data.resize(1024, 0);

    doc_data
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_legacy_doc_enhanced_error_message() {
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let doc_path = temp_dir.path().join("test.doc");
|
||||
|
||||
// Create a fake DOC file
|
||||
let doc_data = create_fake_doc_file();
|
||||
fs::write(&doc_path, doc_data).unwrap();
|
||||
|
||||
// Create OCR service
|
||||
let ocr_service = EnhancedOcrService {
|
||||
temp_dir: temp_dir.path().to_str().unwrap().to_string(),
|
||||
file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
|
||||
};
|
||||
|
||||
let settings = Settings::default();
|
||||
|
||||
// Try to extract text from legacy DOC
|
||||
let result = ocr_service.extract_text_from_office(
|
||||
doc_path.to_str().unwrap(),
|
||||
"application/msword",
|
||||
&settings
|
||||
).await;
|
||||
|
||||
// Should fail with enhanced error message
|
||||
assert!(result.is_err(), "Legacy DOC should return an error without tools");
|
||||
let error_msg = result.unwrap_err().to_string();
|
||||
|
||||
// Verify enhanced error message mentions extraction tools
|
||||
assert!(error_msg.contains("None of the DOC extraction tools") || error_msg.contains("All extraction methods failed"), "Should mention extraction tools failed");
|
||||
assert!(error_msg.contains("antiword"), "Should mention antiword tool");
|
||||
assert!(error_msg.contains("catdoc"), "Should mention catdoc tool");
|
||||
}
|
||||
|
||||
// Note: DOC to DOCX conversion tests removed since we no longer use LibreOffice
|
||||
// Legacy DOC files are now handled by lightweight tools (antiword/catdoc) only
|
||||
|
||||
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_doc_extraction_multiple_strategies() {
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let doc_path = temp_dir.path().join("multitest.doc");
|
||||
|
||||
// Create a fake DOC file
|
||||
let doc_data = create_fake_doc_file();
|
||||
fs::write(&doc_path, doc_data).unwrap();
|
||||
|
||||
// Create OCR service
|
||||
let ocr_service = EnhancedOcrService {
|
||||
temp_dir: temp_dir.path().to_str().unwrap().to_string(),
|
||||
file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
|
||||
};
|
||||
|
||||
let settings = Settings::default();
|
||||
let start_time = std::time::Instant::now();
|
||||
|
||||
// Test Office extraction with the DOC file (this should fail as DOC files are not XML-based)
|
||||
let result = ocr_service.extract_text_from_office(
|
||||
doc_path.to_str().unwrap(),
|
||||
"application/msword",
|
||||
&settings
|
||||
).await;
|
||||
|
||||
// Should fail since external DOC tools are not available in test environment
|
||||
assert!(result.is_err(), "Should fail for DOC files as external tools are not available");
|
||||
let error_msg = result.unwrap_err().to_string();
|
||||
|
||||
// Verify it mentions external tool issues for DOC files
|
||||
assert!(error_msg.contains("DOC extraction tools") || error_msg.contains("antiword") || error_msg.contains("catdoc"),
|
||||
"Should mention external tool issues: {}", error_msg);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_doc_error_message_includes_processing_time() {
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let doc_path = temp_dir.path().join("timed.doc");
|
||||
|
||||
// Create a fake DOC file
|
||||
let doc_data = create_fake_doc_file();
|
||||
fs::write(&doc_path, doc_data).unwrap();
|
||||
|
||||
// Create OCR service
|
||||
let ocr_service = EnhancedOcrService {
|
||||
temp_dir: temp_dir.path().to_str().unwrap().to_string(),
|
||||
file_service: FileService::new(temp_dir.path().to_str().unwrap().to_string()),
|
||||
};
|
||||
|
||||
let settings = Settings::default();
|
||||
|
||||
// Try to extract text from legacy DOC
|
||||
let result = ocr_service.extract_text_from_office(
|
||||
doc_path.to_str().unwrap(),
|
||||
"application/msword",
|
||||
&settings
|
||||
).await;
|
||||
|
||||
// Should fail and include processing time in error message
|
||||
assert!(result.is_err(), "Should fail without tools");
|
||||
let error_msg = result.unwrap_err().to_string();
|
||||
assert!(error_msg.contains("Processing time:") && error_msg.contains("ms"),
|
||||
"Should include processing time: {}", error_msg);
|
||||
}
|
||||
|
||||
// Note: UUID uniqueness test removed since we no longer use temporary conversion directories
|
||||
|
|
@ -0,0 +1,585 @@
|
|||
use anyhow::Result;
|
||||
use std::fs;
|
||||
use std::io::Write;
|
||||
use std::time::Duration;
|
||||
use tempfile::TempDir;
|
||||
use tokio::time::timeout;
|
||||
|
||||
use readur::ocr::{
|
||||
OcrService, OcrConfig,
|
||||
};
|
||||
|
||||
/// Test utilities for creating mock Office documents.
///
/// Each helper writes its generated file into `temp_dir`, so all artifacts
/// are scoped to one test and removed together.
struct OfficeTestDocuments {
    // Scratch directory owning every generated file; tempfile::TempDir
    // removes it (best-effort) when this struct is dropped.
    temp_dir: TempDir,
}
|
||||
|
||||
impl OfficeTestDocuments {
    /// Allocate a fresh temp directory to hold generated documents.
    fn new() -> Result<Self> {
        Ok(Self {
            temp_dir: TempDir::new()?,
        })
    }

    /// Create a mock DOCX file (simplified ZIP structure with XML content).
    ///
    /// Produces the minimal OOXML package a reader needs:
    /// `[Content_Types].xml`, `_rels/.rels`, and `word/document.xml` holding a
    /// single paragraph with `content`. Returns the absolute path as a String.
    ///
    /// NOTE(review): `content` is interpolated into raw XML without escaping —
    /// callers must not pass text containing `<`, `>` or `&`.
    fn create_mock_docx(&self, filename: &str, content: &str) -> Result<String> {
        let file_path = self.temp_dir.path().join(filename);

        // Create a proper ZIP structure for DOCX
        let file = fs::File::create(&file_path)?;
        let mut zip = zip::ZipWriter::new(file);

        // Add [Content_Types].xml — declares the part types inside the package.
        zip.start_file("[Content_Types].xml", zip::write::FileOptions::default())?;
        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>"#)?;

        // Add _rels/.rels — points the package root at the main document part.
        zip.start_file("_rels/.rels", zip::write::FileOptions::default())?;
        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>"#)?;

        // Add word/document.xml with the actual content in one run of one paragraph.
        zip.start_file("word/document.xml", zip::write::FileOptions::default())?;
        let document_xml = format!(r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:r>
<w:t>{}</w:t>
</w:r>
</w:p>
</w:body>
</w:document>"#, content);
        zip.write_all(document_xml.as_bytes())?;

        zip.finish()?;

        Ok(file_path.to_string_lossy().to_string())
    }

    /// Create a mock XLSX file with spreadsheet content.
    ///
    /// Each element of `content` becomes one shared string referenced from one
    /// cell (A1, A2, ...) on Sheet1. Returns the absolute path as a String.
    ///
    /// NOTE(review): as with `create_mock_docx`, the cell text is written into
    /// XML unescaped — keep inputs free of XML-special characters.
    fn create_mock_xlsx(&self, filename: &str, content: &[&str]) -> Result<String> {
        let file_path = self.temp_dir.path().join(filename);

        let file = fs::File::create(&file_path)?;
        let mut zip = zip::ZipWriter::new(file);

        // Add [Content_Types].xml with shared strings support
        zip.start_file("[Content_Types].xml", zip::write::FileOptions::default())?;
        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/xl/workbook.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml"/>
<Override PartName="/xl/worksheets/sheet1.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml"/>
<Override PartName="/xl/sharedStrings.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml"/>
</Types>"#)?;

        // Add _rels/.rels
        zip.start_file("_rels/.rels", zip::write::FileOptions::default())?;
        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="xl/workbook.xml"/>
</Relationships>"#)?;

        // Add xl/workbook.xml — a single sheet named "Sheet1".
        zip.start_file("xl/workbook.xml", zip::write::FileOptions::default())?;
        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
<sheets>
<sheet name="Sheet1" sheetId="1" r:id="rId1"/>
</sheets>
</workbook>"#)?;

        // Add xl/_rels/workbook.xml.rels with shared strings relationship
        zip.start_file("xl/_rels/workbook.xml.rels", zip::write::FileOptions::default())?;
        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet" Target="worksheets/sheet1.xml"/>
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings" Target="sharedStrings.xml"/>
</Relationships>"#)?;

        // Add xl/sharedStrings.xml with the text content.
        // `{count}` is a placeholder in the template, substituted below.
        zip.start_file("xl/sharedStrings.xml", zip::write::FileOptions::default())?;
        let mut shared_strings_xml = String::from(r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<sst xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" count="{count}" uniqueCount="{count}">"#);
        shared_strings_xml = shared_strings_xml.replace("{count}", &content.len().to_string());

        for cell_content in content {
            shared_strings_xml.push_str(&format!(r#"
<si><t>{}</t></si>"#, cell_content));
        }

        shared_strings_xml.push_str(r#"
</sst>"#);
        zip.write_all(shared_strings_xml.as_bytes())?;

        // Add xl/worksheets/sheet1.xml with references to shared strings
        // (cell type "s" = shared-string index; one cell in column A per row).
        zip.start_file("xl/worksheets/sheet1.xml", zip::write::FileOptions::default())?;
        let mut worksheet_xml = String::from(r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
<sheetData>"#);

        for (row_idx, _) in content.iter().enumerate() {
            worksheet_xml.push_str(&format!(r#"
<row r="{}">
<c r="A{}" t="s">
<v>{}</v>
</c>
</row>"#, row_idx + 1, row_idx + 1, row_idx));
        }

        worksheet_xml.push_str(r#"
</sheetData>
</worksheet>"#);

        zip.write_all(worksheet_xml.as_bytes())?;
        zip.finish()?;

        Ok(file_path.to_string_lossy().to_string())
    }

    /// Create a corrupted file for testing error handling:
    /// plain bytes with an Office extension but no ZIP structure.
    fn create_corrupted_file(&self, filename: &str) -> Result<String> {
        let file_path = self.temp_dir.path().join(filename);
        let mut file = fs::File::create(&file_path)?;
        file.write_all(b"This is not a valid Office document but pretends to be one")?;
        Ok(file_path.to_string_lossy().to_string())
    }

    /// Create a zero-byte file (for empty-input error paths).
    fn create_empty_file(&self, filename: &str) -> Result<String> {
        let file_path = self.temp_dir.path().join(filename);
        fs::File::create(&file_path)?;
        Ok(file_path.to_string_lossy().to_string())
    }
}
|
||||
|
||||
/// Create a test OCR service with XML extraction
|
||||
fn create_test_ocr_service(temp_dir: &str) -> OcrService {
|
||||
let config = OcrConfig {
|
||||
temp_dir: temp_dir.to_string(),
|
||||
};
|
||||
|
||||
OcrService::new_with_config(config)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_extract_text_from_docx() -> Result<()> {
|
||||
let test_docs = OfficeTestDocuments::new()?;
|
||||
let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());
|
||||
|
||||
let test_content = "This is a test DOCX document with sample content for extraction testing.";
|
||||
let docx_path = test_docs.create_mock_docx("test.docx", test_content)?;
|
||||
|
||||
let result = ocr_service.extract_text_from_office_document(
|
||||
&docx_path,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
).await?;
|
||||
|
||||
// The method now returns an OcrResult
|
||||
println!("Extracted text: '{}'", result.text);
|
||||
assert!(!result.text.is_empty());
|
||||
assert!(result.text.contains(test_content));
|
||||
assert!(result.confidence > 0.0);
|
||||
assert!(result.word_count > 0);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_extract_text_from_xlsx() -> Result<()> {
|
||||
let test_docs = OfficeTestDocuments::new()?;
|
||||
let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());
|
||||
|
||||
let test_content = vec![
|
||||
"Header 1",
|
||||
"Data Row 1",
|
||||
"Data Row 2",
|
||||
"Summary Data",
|
||||
];
|
||||
let xlsx_path = test_docs.create_mock_xlsx("test.xlsx", &test_content)?;
|
||||
|
||||
let result = ocr_service.extract_text_from_office_document(
|
||||
&xlsx_path,
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
).await?;
|
||||
|
||||
// The method now returns an OcrResult
|
||||
println!("XLSX extracted text: '{}'", result.text);
|
||||
assert!(!result.text.is_empty());
|
||||
// Check if it contains some of our test content
|
||||
assert!(result.text.contains("Header") || result.text.contains("Data"));
|
||||
assert!(result.confidence > 0.0);
|
||||
assert!(result.word_count > 0);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_extraction_modes() -> Result<()> {
|
||||
let test_docs = OfficeTestDocuments::new()?;
|
||||
let temp_dir = test_docs.temp_dir.path().to_string_lossy().to_string();
|
||||
|
||||
let test_content = "Test document for mode comparison";
|
||||
let docx_path = test_docs.create_mock_docx("test_modes.docx", test_content)?;
|
||||
|
||||
// Test XML extraction with the simplified approach
|
||||
let ocr_config = OcrConfig {
|
||||
temp_dir: temp_dir.clone(),
|
||||
};
|
||||
|
||||
let ocr_service = OcrService::new_with_config(ocr_config);
|
||||
|
||||
let result = ocr_service.extract_text_from_office_document_with_config(
|
||||
&docx_path,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
).await;
|
||||
|
||||
// XML extraction should succeed with our test document
|
||||
assert!(result.is_ok(), "XML extraction failed: {:?}", result);
|
||||
let extracted_result = result?;
|
||||
assert!(!extracted_result.text.is_empty());
|
||||
assert!(extracted_result.confidence > 0.0);
|
||||
assert!(extracted_result.word_count > 0);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_fallback_mechanism() -> Result<()> {
|
||||
let test_docs = OfficeTestDocuments::new()?;
|
||||
let temp_dir = test_docs.temp_dir.path().to_string_lossy().to_string();
|
||||
|
||||
// Create a service with XML extraction
|
||||
let config = OcrConfig {
|
||||
temp_dir,
|
||||
};
|
||||
|
||||
let ocr_service = OcrService::new_with_config(config);
|
||||
let docx_path = test_docs.create_mock_docx("fallback_test.docx", "Fallback test content")?;
|
||||
|
||||
// The XML extraction should succeed
|
||||
let result = ocr_service.extract_text_from_office_document(
|
||||
&docx_path,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
).await?;
|
||||
|
||||
// The method now returns an OcrResult
|
||||
assert!(result.text.contains("Fallback test content"));
|
||||
assert!(result.confidence > 0.0);
|
||||
assert!(result.word_count > 0);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_timeout_handling() -> Result<()> {
|
||||
let test_docs = OfficeTestDocuments::new()?;
|
||||
let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());
|
||||
|
||||
let docx_path = test_docs.create_mock_docx("timeout_test.docx", "Test content")?;
|
||||
|
||||
// Test timeout behavior (the timeout logic is now in the XML extractor itself)
|
||||
let result = timeout(
|
||||
Duration::from_millis(2000), // Give overall test 2 seconds
|
||||
ocr_service.extract_text_from_office_document_with_config(
|
||||
&docx_path,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
)
|
||||
).await;
|
||||
|
||||
// Should complete successfully even with short timeout for our simple test file
|
||||
assert!(result.is_ok());
|
||||
let extraction_result = result??;
|
||||
assert!(!extraction_result.text.is_empty());
|
||||
assert!(extraction_result.confidence > 0.0);
|
||||
assert!(extraction_result.word_count > 0);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_error_handling() -> Result<()> {
|
||||
let test_docs = OfficeTestDocuments::new()?;
|
||||
let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());
|
||||
|
||||
// Test with corrupted file
|
||||
let corrupted_path = test_docs.create_corrupted_file("corrupted.docx")?;
|
||||
let result = ocr_service.extract_text_from_office_document(
|
||||
&corrupted_path,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
).await;
|
||||
|
||||
assert!(result.is_err());
|
||||
let error_msg = result.unwrap_err().to_string();
|
||||
assert!(error_msg.contains("corrupted") || error_msg.contains("invalid") || error_msg.contains("parsing"));
|
||||
|
||||
// Test with empty file
|
||||
let empty_path = test_docs.create_empty_file("empty.docx")?;
|
||||
let result = ocr_service.extract_text_from_office_document(
|
||||
&empty_path,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
).await;
|
||||
|
||||
assert!(result.is_err());
|
||||
|
||||
// Test with non-existent file
|
||||
let result = ocr_service.extract_text_from_office_document(
|
||||
"/path/that/does/not/exist.docx",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
).await;
|
||||
|
||||
assert!(result.is_err());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_concurrent_extraction() -> Result<()> {
|
||||
let test_docs = OfficeTestDocuments::new()?;
|
||||
let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());
|
||||
|
||||
// Create multiple test documents
|
||||
let mut tasks = Vec::new();
|
||||
let mut file_paths = Vec::new();
|
||||
|
||||
for i in 0..5 {
|
||||
let content = format!("Test document {} with unique content", i);
|
||||
let file_path = test_docs.create_mock_docx(&format!("concurrent_test_{}.docx", i), &content)?;
|
||||
file_paths.push(file_path);
|
||||
}
|
||||
|
||||
// Launch concurrent extraction tasks
|
||||
for file_path in file_paths {
|
||||
let ocr_service_clone = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());
|
||||
let task = tokio::spawn(async move {
|
||||
ocr_service_clone.extract_text_from_office_document(
|
||||
&file_path,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
).await
|
||||
});
|
||||
tasks.push(task);
|
||||
}
|
||||
|
||||
// Wait for all tasks to complete
|
||||
let results = futures::future::join_all(tasks).await;
|
||||
|
||||
// Verify all extractions succeeded
|
||||
for (i, task_result) in results.into_iter().enumerate() {
|
||||
let ocr_result = task_result??;
|
||||
assert!(!ocr_result.text.is_empty(), "Task {} failed", i);
|
||||
assert!(ocr_result.text.contains(&format!("Test document {}", i)));
|
||||
assert!(ocr_result.confidence > 0.0);
|
||||
assert!(ocr_result.word_count > 0);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_circuit_breaker() -> Result<()> {
|
||||
let test_docs = OfficeTestDocuments::new()?;
|
||||
|
||||
// Create service with XML extraction
|
||||
let config = OcrConfig {
|
||||
temp_dir: test_docs.temp_dir.path().to_string_lossy().to_string(),
|
||||
};
|
||||
|
||||
let ocr_service = OcrService::new_with_config(config);
|
||||
|
||||
// Create a valid document for later success testing
|
||||
let valid_path = test_docs.create_mock_docx("circuit_test.docx", "Valid document")?;
|
||||
|
||||
// Create corrupted files to cause failures
|
||||
let corrupted1 = test_docs.create_corrupted_file("corrupted1.docx")?;
|
||||
let corrupted2 = test_docs.create_corrupted_file("corrupted2.docx")?;
|
||||
|
||||
// First failure
|
||||
let result1 = ocr_service.extract_text_from_office_document(
|
||||
&corrupted1,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
).await;
|
||||
assert!(result1.is_err());
|
||||
|
||||
// Second failure - should trip circuit breaker
|
||||
let result2 = ocr_service.extract_text_from_office_document(
|
||||
&corrupted2,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
).await;
|
||||
assert!(result2.is_err());
|
||||
|
||||
// Third attempt - should succeed since circuit breaker functionality was removed
|
||||
let result3 = ocr_service.extract_text_from_office_document(
|
||||
&valid_path,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
).await;
|
||||
// With simplified architecture, valid documents should always work
|
||||
assert!(result3.is_ok());
|
||||
let valid_result = result3.unwrap();
|
||||
assert!(valid_result.text.contains("Valid document"));
|
||||
assert!(valid_result.confidence > 0.0);
|
||||
assert!(valid_result.word_count > 0);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_statistics_tracking() -> Result<()> {
|
||||
let test_docs = OfficeTestDocuments::new()?;
|
||||
let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());
|
||||
|
||||
// Perform some extractions to verify functionality
|
||||
let valid_path = test_docs.create_mock_docx("stats_test.docx", "Statistics test document")?;
|
||||
|
||||
for i in 0..3 {
|
||||
let result = ocr_service.extract_text_from_office_document(
|
||||
&valid_path,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
).await;
|
||||
|
||||
assert!(result.is_ok(), "Extraction {} failed: {:?}", i, result);
|
||||
let ocr_result = result.unwrap();
|
||||
assert!(!ocr_result.text.is_empty());
|
||||
assert!(ocr_result.confidence > 0.0);
|
||||
assert!(ocr_result.word_count > 0);
|
||||
assert!(ocr_result.processing_time_ms > 0);
|
||||
}
|
||||
|
||||
// All extractions succeeded, indicating the XML extraction is working correctly
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_mime_type_support() -> Result<()> {
|
||||
let test_docs = OfficeTestDocuments::new()?;
|
||||
let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());
|
||||
|
||||
// Test supported MIME types
|
||||
let supported_types = ocr_service.get_supported_mime_types();
|
||||
assert!(supported_types.contains(&"application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
|
||||
assert!(supported_types.contains(&"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
|
||||
assert!(supported_types.contains(&"application/pdf"));
|
||||
assert!(supported_types.contains(&"image/png"));
|
||||
|
||||
// Test Office document support
|
||||
assert!(ocr_service.supports_office_documents());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_learning_mechanism() -> Result<()> {
|
||||
let test_docs = OfficeTestDocuments::new()?;
|
||||
|
||||
// Create service with XML extraction
|
||||
let config = OcrConfig {
|
||||
temp_dir: test_docs.temp_dir.path().to_string_lossy().to_string(),
|
||||
};
|
||||
|
||||
let ocr_service = OcrService::new_with_config(config);
|
||||
|
||||
// Process several documents of the same type to build learning data
|
||||
for i in 0..3 {
|
||||
let content = format!("Learning test document {} content", i);
|
||||
let docx_path = test_docs.create_mock_docx(&format!("learning_{}.docx", i), &content)?;
|
||||
|
||||
let result = ocr_service.extract_text_from_office_document(
|
||||
&docx_path,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
).await;
|
||||
|
||||
assert!(result.is_ok(), "Extraction iteration {} failed: {:?}", i, result);
|
||||
let ocr_result = result?;
|
||||
assert!(!ocr_result.text.is_empty());
|
||||
assert!(ocr_result.text.contains(&format!("document {}", i)));
|
||||
assert!(ocr_result.confidence > 0.0);
|
||||
assert!(ocr_result.word_count > 0);
|
||||
}
|
||||
|
||||
// With the simplified XML-only architecture, the system should consistently work
|
||||
// All extractions succeeded, indicating the XML extraction is working correctly
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_integration_with_main_extract_text() -> Result<()> {
|
||||
let test_docs = OfficeTestDocuments::new()?;
|
||||
let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());
|
||||
|
||||
// Test that the main extract_text method properly handles Office documents
|
||||
let test_content = "Integration test for main extract_text method";
|
||||
let docx_path = test_docs.create_mock_docx("integration.docx", test_content)?;
|
||||
|
||||
// This should use the fallback strategy internally
|
||||
let result = ocr_service.extract_text(
|
||||
&docx_path,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
).await?;
|
||||
|
||||
assert!(!result.is_empty());
|
||||
assert!(result.contains("Integration test"));
|
||||
|
||||
// Test with XLSX as well
|
||||
let xlsx_content = vec!["Cell 1", "Cell 2", "Cell 3"];
|
||||
let xlsx_path = test_docs.create_mock_xlsx("integration.xlsx", &xlsx_content)?;
|
||||
|
||||
let result = ocr_service.extract_text(
|
||||
&xlsx_path,
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
).await?;
|
||||
|
||||
assert!(!result.is_empty());
|
||||
assert!(result.contains("Cell 1"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Performance benchmark test (not run by default due to #[ignore])
|
||||
#[tokio::test]
|
||||
#[ignore]
|
||||
async fn benchmark_extraction_performance() -> Result<()> {
|
||||
let test_docs = OfficeTestDocuments::new()?;
|
||||
let ocr_service = create_test_ocr_service(test_docs.temp_dir.path().to_string_lossy().as_ref());
|
||||
|
||||
// Create a larger test document
|
||||
let large_content = "This is a large test document. ".repeat(1000);
|
||||
let docx_path = test_docs.create_mock_docx("benchmark.docx", &large_content)?;
|
||||
|
||||
let start_time = std::time::Instant::now();
|
||||
let num_iterations = 10;
|
||||
|
||||
for i in 0..num_iterations {
|
||||
let result = ocr_service.extract_text_from_office_document(
|
||||
&docx_path,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
).await?;
|
||||
|
||||
assert!(!result.text.is_empty());
|
||||
println!("Iteration {}: extracted {} chars, confidence: {:.1}%",
|
||||
i,
|
||||
result.text.len(),
|
||||
result.confidence
|
||||
);
|
||||
}
|
||||
|
||||
let total_time = start_time.elapsed();
|
||||
let avg_time = total_time / num_iterations;
|
||||
|
||||
println!("Average extraction time: {:?}", avg_time);
|
||||
println!("Total time for {} iterations: {:?}", num_iterations, total_time);
|
||||
|
||||
// Performance assertions (adjust based on your requirements)
|
||||
assert!(avg_time < Duration::from_secs(5), "Average extraction time too slow: {:?}", avg_time);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
@ -115,6 +115,8 @@ mod tests {
|
|||
webdav_file_extensions: None,
|
||||
webdav_auto_sync: None,
|
||||
webdav_sync_interval_minutes: None,
|
||||
office_extraction_timeout_seconds: None,
|
||||
office_extraction_enable_detailed_logging: None,
|
||||
};
|
||||
|
||||
let response = ctx.app
|
||||
|
|
@ -238,6 +240,8 @@ mod tests {
|
|||
webdav_file_extensions: None,
|
||||
webdav_auto_sync: None,
|
||||
webdav_sync_interval_minutes: None,
|
||||
office_extraction_timeout_seconds: None,
|
||||
office_extraction_enable_detailed_logging: None,
|
||||
};
|
||||
|
||||
let response = ctx.app
|
||||
|
|
@ -388,6 +392,8 @@ mod tests {
|
|||
webdav_file_extensions: None,
|
||||
webdav_auto_sync: None,
|
||||
webdav_sync_interval_minutes: None,
|
||||
office_extraction_timeout_seconds: None,
|
||||
office_extraction_enable_detailed_logging: None,
|
||||
};
|
||||
|
||||
let response = ctx.app
|
||||
|
|
@ -515,6 +521,8 @@ mod tests {
|
|||
webdav_file_extensions: None,
|
||||
webdav_auto_sync: None,
|
||||
webdav_sync_interval_minutes: None,
|
||||
office_extraction_timeout_seconds: None,
|
||||
office_extraction_enable_detailed_logging: None,
|
||||
};
|
||||
|
||||
let response = ctx.app
|
||||
|
|
|
|||
|
|
@ -72,6 +72,9 @@ fn create_empty_update_settings() -> UpdateSettings {
|
|||
webdav_file_extensions: None,
|
||||
webdav_auto_sync: None,
|
||||
webdav_sync_interval_minutes: None,
|
||||
// Office document extraction configuration
|
||||
office_extraction_timeout_seconds: None,
|
||||
office_extraction_enable_detailed_logging: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -215,6 +218,9 @@ async fn setup_webdav_settings(state: &AppState, user_id: Uuid) {
|
|||
ocr_quality_threshold_noise: None,
|
||||
ocr_quality_threshold_sharpness: None,
|
||||
ocr_skip_enhancement: None,
|
||||
// Office document extraction configuration
|
||||
office_extraction_timeout_seconds: None,
|
||||
office_extraction_enable_detailed_logging: None,
|
||||
};
|
||||
|
||||
state.db.create_or_update_settings(user_id, &update_settings).await
|
||||
|
|
|
|||
Loading…
Reference in New Issue