diff --git a/Cargo.lock b/Cargo.lock
index a2e6f43..553a468 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,6 +2,12 @@
# It is not intended for manual editing.
version = 4
+[[package]]
+name = "ab_glyph_rasterizer"
+version = "0.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c71b1793ee61086797f5c80b6efa2b8ffa6d5dd703f118545808a7f2e27f7046"
+
[[package]]
name = "addr2line"
version = "0.24.2"
@@ -125,6 +131,15 @@ version = "1.0.98"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487"
+[[package]]
+name = "approx"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cab112f0a86d568ea0e627cc1d6be74a1e9cd55214684db5561995f6dad897c6"
+dependencies = [
+ "num-traits",
+]
+
[[package]]
name = "async-trait"
version = "0.1.88"
@@ -275,6 +290,12 @@ dependencies = [
"which",
]
+[[package]]
+name = "bit_field"
+version = "0.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc827186963e592360843fb5ba4b973e145841266c1357f7180c43526f2e5b61"
+
[[package]]
name = "bitflags"
version = "1.3.2"
@@ -325,6 +346,12 @@ version = "3.18.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "793db76d6187cd04dff33004d8e6c9cc4e05cd330500379d2394209271b4aeee"
+[[package]]
+name = "bytemuck"
+version = "1.23.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c76a5792e44e4abe34d3abf15636779261d45a7450612059293d1d2cfc63422"
+
[[package]]
name = "byteorder"
version = "1.5.0"
@@ -437,6 +464,12 @@ version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675"
+[[package]]
+name = "color_quant"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b"
+
[[package]]
name = "colorchoice"
version = "1.0.4"
@@ -449,6 +482,15 @@ version = "0.9.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8"
+[[package]]
+name = "conv"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78ff10625fd0ac447827aa30ea8b861fead473bb60aeb73af6c1c58caf0d1299"
+dependencies = [
+ "custom_derive",
+]
+
[[package]]
name = "core-foundation"
version = "0.9.4"
@@ -507,6 +549,25 @@ dependencies = [
"crossbeam-utils",
]
+[[package]]
+name = "crossbeam-deque"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
+dependencies = [
+ "crossbeam-epoch",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-epoch"
+version = "0.9.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
+dependencies = [
+ "crossbeam-utils",
+]
+
[[package]]
name = "crossbeam-queue"
version = "0.3.12"
@@ -522,6 +583,12 @@ version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
+[[package]]
+name = "crunchy"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929"
+
[[package]]
name = "crypto-common"
version = "0.1.6"
@@ -532,6 +599,12 @@ dependencies = [
"typenum",
]
+[[package]]
+name = "custom_derive"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef8ae57c4978a2acd8b869ce6b9ca1dfe817bff704c220209fdef2c0b75a01b9"
+
[[package]]
name = "darling"
version = "0.13.4"
@@ -676,12 +749,36 @@ version = "2.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0"
+[[package]]
+name = "exr"
+version = "1.73.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f83197f59927b46c04a183a619b7c29df34e63e63c7869320862268c0ef687e0"
+dependencies = [
+ "bit_field",
+ "half",
+ "lebe",
+ "miniz_oxide",
+ "rayon-core",
+ "smallvec",
+ "zune-inflate",
+]
+
[[package]]
name = "fastrand"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
+[[package]]
+name = "fdeflate"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e6853b52649d4ac5c0bd02320cddc5ba956bdb407c4b75a2c6b75bf51500f8c"
+dependencies = [
+ "simd-adler32",
+]
+
[[package]]
name = "filetime"
version = "0.2.25"
@@ -864,6 +961,17 @@ dependencies = [
"version_check",
]
+[[package]]
+name = "getrandom"
+version = "0.1.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "wasi 0.9.0+wasi-snapshot-preview1",
+]
+
[[package]]
name = "getrandom"
version = "0.2.16"
@@ -889,6 +997,16 @@ dependencies = [
"wasi 0.14.2+wasi-0.2.4",
]
+[[package]]
+name = "gif"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3fb2d69b19215e18bb912fa30f7ce15846e301408695e44e0ef719f1da9e19f2"
+dependencies = [
+ "color_quant",
+ "weezl",
+]
+
[[package]]
name = "gimli"
version = "0.31.1"
@@ -920,6 +1038,16 @@ dependencies = [
"tracing",
]
+[[package]]
+name = "half"
+version = "2.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9"
+dependencies = [
+ "cfg-if",
+ "crunchy",
+]
+
[[package]]
name = "hashbrown"
version = "0.14.5"
@@ -1287,6 +1415,42 @@ dependencies = [
"icu_properties",
]
+[[package]]
+name = "image"
+version = "0.24.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5690139d2f55868e080017335e4b94cb7414274c74f1669c84fb5feba2c9f69d"
+dependencies = [
+ "bytemuck",
+ "byteorder",
+ "color_quant",
+ "exr",
+ "gif",
+ "jpeg-decoder",
+ "num-traits",
+ "png",
+ "qoi",
+ "tiff",
+]
+
+[[package]]
+name = "imageproc"
+version = "0.23.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6aee993351d466301a29655d628bfc6f5a35a0d062b6160ca0808f425805fd7"
+dependencies = [
+ "approx",
+ "conv",
+ "image",
+ "itertools",
+ "nalgebra",
+ "num",
+ "rand 0.7.3",
+ "rand_distr",
+ "rayon",
+ "rusttype",
+]
+
[[package]]
name = "indexmap"
version = "2.9.0"
@@ -1339,12 +1503,30 @@ version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
+[[package]]
+name = "itertools"
+version = "0.10.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473"
+dependencies = [
+ "either",
+]
+
[[package]]
name = "itoa"
version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
+[[package]]
+name = "jpeg-decoder"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f5d4a7da358eff58addd2877a45865158f0d78c911d43a5784ceb7bbf52833b0"
+dependencies = [
+ "rayon",
+]
+
[[package]]
name = "js-sys"
version = "0.3.77"
@@ -1405,6 +1587,12 @@ version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
+[[package]]
+name = "lebe"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "03087c2bad5e1034e8cace5926dec053fb3790248370865f5117a7d0213354c8"
+
[[package]]
name = "leptonica-plumbing"
version = "1.4.0"
@@ -1529,6 +1717,16 @@ version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94"
+[[package]]
+name = "matrixmultiply"
+version = "0.3.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08"
+dependencies = [
+ "autocfg",
+ "rawpointer",
+]
+
[[package]]
name = "md-5"
version = "0.10.6"
@@ -1574,6 +1772,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316"
dependencies = [
"adler2",
+ "simd-adler32",
]
[[package]]
@@ -1616,6 +1815,21 @@ dependencies = [
"version_check",
]
+[[package]]
+name = "nalgebra"
+version = "0.30.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4fb2d0de08694bed883320212c18ee3008576bfe8c306f4c3c4a58b4876998be"
+dependencies = [
+ "approx",
+ "matrixmultiply",
+ "num-complex",
+ "num-rational",
+ "num-traits",
+ "simba",
+ "typenum",
+]
+
[[package]]
name = "native-tls"
version = "0.2.14"
@@ -1672,6 +1886,20 @@ dependencies = [
"winapi",
]
+[[package]]
+name = "num"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23"
+dependencies = [
+ "num-bigint",
+ "num-complex",
+ "num-integer",
+ "num-iter",
+ "num-rational",
+ "num-traits",
+]
+
[[package]]
name = "num-bigint"
version = "0.4.6"
@@ -1694,11 +1922,20 @@ dependencies = [
"num-integer",
"num-iter",
"num-traits",
- "rand",
+ "rand 0.8.5",
"smallvec",
"zeroize",
]
+[[package]]
+name = "num-complex"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
+dependencies = [
+ "num-traits",
+]
+
[[package]]
name = "num-conv"
version = "0.1.0"
@@ -1725,6 +1962,17 @@ dependencies = [
"num-traits",
]
+[[package]]
+name = "num-rational"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
+dependencies = [
+ "num-bigint",
+ "num-integer",
+ "num-traits",
+]
+
[[package]]
name = "num-traits"
version = "0.2.19"
@@ -1806,6 +2054,15 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
+[[package]]
+name = "owned_ttf_parser"
+version = "0.15.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05e6affeb1632d6ff6a23d2cd40ffed138e82f1532571a26f527c8a284bb2fbb"
+dependencies = [
+ "ttf-parser",
+]
+
[[package]]
name = "parking_lot"
version = "0.12.4"
@@ -1940,6 +2197,19 @@ version = "0.3.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
+[[package]]
+name = "png"
+version = "0.17.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "82151a2fc869e011c153adc57cf2789ccb8d9906ce52c0b39a6b5697749d7526"
+dependencies = [
+ "bitflags 1.3.2",
+ "crc32fast",
+ "fdeflate",
+ "flate2",
+ "miniz_oxide",
+]
+
[[package]]
name = "pom"
version = "1.1.0"
@@ -2009,6 +2279,15 @@ dependencies = [
"unicode-ident",
]
+[[package]]
+name = "qoi"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f6d64c71eb498fe9eae14ce4ec935c555749aef511cca85b5568910d6e48001"
+dependencies = [
+ "bytemuck",
+]
+
[[package]]
name = "quote"
version = "1.0.40"
@@ -2024,6 +2303,19 @@ version = "5.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5"
+[[package]]
+name = "rand"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
+dependencies = [
+ "getrandom 0.1.16",
+ "libc",
+ "rand_chacha 0.2.2",
+ "rand_core 0.5.1",
+ "rand_hc",
+]
+
[[package]]
name = "rand"
version = "0.8.5"
@@ -2031,8 +2323,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"libc",
- "rand_chacha",
- "rand_core",
+ "rand_chacha 0.3.1",
+ "rand_core 0.6.4",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402"
+dependencies = [
+ "ppv-lite86",
+ "rand_core 0.5.1",
]
[[package]]
@@ -2042,7 +2344,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
- "rand_core",
+ "rand_core 0.6.4",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
+dependencies = [
+ "getrandom 0.1.16",
]
[[package]]
@@ -2054,12 +2365,56 @@ dependencies = [
"getrandom 0.2.16",
]
+[[package]]
+name = "rand_distr"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96977acbdd3a6576fb1d27391900035bf3863d4a16422973a409b488cf29ffb2"
+dependencies = [
+ "rand 0.7.3",
+]
+
+[[package]]
+name = "rand_hc"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c"
+dependencies = [
+ "rand_core 0.5.1",
+]
+
[[package]]
name = "rangemap"
version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f60fcc7d6849342eff22c4350c8b9a989ee8ceabc4b481253e8946b9fe83d684"
+[[package]]
+name = "rawpointer"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3"
+
+[[package]]
+name = "rayon"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
+dependencies = [
+ "either",
+ "rayon-core",
+]
+
+[[package]]
+name = "rayon-core"
+version = "1.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
+dependencies = [
+ "crossbeam-deque",
+ "crossbeam-utils",
+]
+
[[package]]
name = "readur"
version = "0.1.0"
@@ -2073,10 +2428,13 @@ dependencies = [
"dotenvy",
"futures-util",
"hostname",
+ "image",
+ "imageproc",
"jsonwebtoken",
"mime_guess",
"notify",
"pdf-extract",
+ "regex",
"reqwest",
"serde",
"serde_json",
@@ -2203,7 +2561,7 @@ dependencies = [
"num-traits",
"pkcs1",
"pkcs8",
- "rand_core",
+ "rand_core 0.6.4",
"signature",
"spki",
"subtle",
@@ -2312,6 +2670,16 @@ dependencies = [
"untrusted",
]
+[[package]]
+name = "rusttype"
+version = "0.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3ff8374aa04134254b7995b63ad3dc41c7f7236f69528b28553da7d72efaa967"
+dependencies = [
+ "ab_glyph_rasterizer",
+ "owned_ttf_parser",
+]
+
[[package]]
name = "rustversion"
version = "1.0.21"
@@ -2324,6 +2692,15 @@ version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
+[[package]]
+name = "safe_arch"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96b02de82ddbe1b636e6170c21be622223aea188ef2e139be0a5b219ec215323"
+dependencies = [
+ "bytemuck",
+]
+
[[package]]
name = "same-file"
version = "1.0.6"
@@ -2510,9 +2887,28 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de"
dependencies = [
"digest",
- "rand_core",
+ "rand_core 0.6.4",
]
+[[package]]
+name = "simba"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2f3fd720c48c53cace224ae62bef1bbff363a70c68c4802a78b5cc6159618176"
+dependencies = [
+ "approx",
+ "num-complex",
+ "num-traits",
+ "paste",
+ "wide",
+]
+
+[[package]]
+name = "simd-adler32"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe"
+
[[package]]
name = "simple_asn1"
version = "0.6.3"
@@ -2705,7 +3101,7 @@ dependencies = [
"memchr",
"once_cell",
"percent-encoding",
- "rand",
+ "rand 0.8.5",
"rsa",
"serde",
"sha1",
@@ -2746,7 +3142,7 @@ dependencies = [
"md-5",
"memchr",
"once_cell",
- "rand",
+ "rand 0.8.5",
"serde",
"serde_json",
"sha2",
@@ -2943,7 +3339,7 @@ dependencies = [
"hex",
"hmac",
"log",
- "rand",
+ "rand 0.8.5",
"serde",
"serde_json",
"sha2",
@@ -3008,6 +3404,17 @@ dependencies = [
"once_cell",
]
+[[package]]
+name = "tiff"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba1310fcea54c6a9a4fd1aad794ecc02c31682f6bfbecdf460bf19533eed1e3e"
+dependencies = [
+ "flate2",
+ "jpeg-decoder",
+ "weezl",
+]
+
[[package]]
name = "time"
version = "0.3.41"
@@ -3259,6 +3666,12 @@ version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
+[[package]]
+name = "ttf-parser"
+version = "0.15.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7b3e06c9b9d80ed6b745c7159c40b311ad2916abb34a49e9be2653b90db0d8dd"
+
[[package]]
name = "type1-encoding-parser"
version = "0.1.0"
@@ -3445,6 +3858,12 @@ dependencies = [
"try-lock",
]
+[[package]]
+name = "wasi"
+version = "0.9.0+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519"
+
[[package]]
name = "wasi"
version = "0.11.1+wasi-snapshot-preview1"
@@ -3581,6 +4000,16 @@ dependencies = [
"wasite",
]
+[[package]]
+name = "wide"
+version = "0.7.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41b5576b9a81633f3e8df296ce0063042a73507636cbe956c61133dd7034ab22"
+dependencies = [
+ "bytemuck",
+ "safe_arch",
+]
+
[[package]]
name = "winapi"
version = "0.3.9"
@@ -4023,3 +4452,12 @@ dependencies = [
"crossbeam-utils",
"flate2",
]
+
+[[package]]
+name = "zune-inflate"
+version = "0.2.54"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "73ab332fe2f6680068f3582b16a24f90ad7096d5d39b974d1c0aff0125116f02"
+dependencies = [
+ "simd-adler32",
+]
diff --git a/Cargo.toml b/Cargo.toml
index 00a14ca..9531e36 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,6 +11,7 @@ tower-http = { version = "0.5", features = ["cors", "fs"] }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
sqlx = { version = "0.7", features = ["runtime-tokio-rustls", "postgres", "sqlite", "chrono", "uuid"] }
+regex = "1.0"
uuid = { version = "1", features = ["v4", "serde"] }
chrono = { version = "0.4", features = ["serde"] }
bcrypt = "0.15"
@@ -25,6 +26,8 @@ notify = "6"
mime_guess = "2"
tesseract = { version = "0.15", optional = true }
pdf-extract = { version = "0.7", optional = true }
+image = { version = "0.24", features = ["png", "jpeg", "tiff", "bmp"], optional = true }
+imageproc = { version = "0.23", optional = true }
reqwest = { version = "0.11", features = ["json", "multipart"] }
dotenvy = "0.15"
hostname = "0.4"
@@ -35,7 +38,7 @@ utoipa-swagger-ui = { version = "6", features = ["axum"] }
[features]
default = ["ocr"]
-ocr = ["tesseract", "pdf-extract"]
+ocr = ["tesseract", "pdf-extract", "image", "imageproc"]
[dev-dependencies]
tempfile = "3"
diff --git a/frontend/src/components/GlobalSearchBar/GlobalSearchBar.jsx b/frontend/src/components/GlobalSearchBar/GlobalSearchBar.jsx
index 0ade7f1..baa6ecb 100644
--- a/frontend/src/components/GlobalSearchBar/GlobalSearchBar.jsx
+++ b/frontend/src/components/GlobalSearchBar/GlobalSearchBar.jsx
@@ -132,8 +132,9 @@ const GlobalSearchBar = ({ sx, ...props }) => {
const response = await documentService.enhancedSearch({
query: searchQuery.trim(),
limit: 5, // Show only top 5 results in global search
- include_snippets: false, // Don't need snippets for quick search
- search_mode: 'simple',
+ include_snippets: true, // Include snippets for context
+ snippet_length: 100, // Shorter snippets for quick search
+ search_mode: searchQuery.length < 4 ? 'fuzzy' : 'simple', // Use fuzzy for short queries (substring matching)
});
clearInterval(progressInterval);
@@ -240,6 +241,76 @@ const GlobalSearchBar = ({ sx, ...props }) => {
return Math.round(bytes / Math.pow(1024, i) * 100) / 100 + ' ' + sizes[i];
};
+ // Function to highlight search terms in text (including substrings)
+ const highlightText = useCallback((text, searchTerm) => {
+ if (!searchTerm || !text) return text;
+
+ const terms = searchTerm.toLowerCase().split(/\s+/).filter(term => term.length >= 2);
+ let highlightedText = text;
+
+ terms.forEach(term => {
+ const regex = new RegExp(`(${term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')})`, 'gi');
+ highlightedText = highlightedText.replace(regex, (match) => `**${match}**`);
+ });
+
+ // Split by ** markers and create spans
+ const parts = highlightedText.split(/\*\*(.*?)\*\*/);
+
+ return parts.map((part, index) => {
+ if (index % 2 === 1) {
+ // This is a highlighted part
+ return (
+
+ {part}
+
+ );
+ }
+ return part;
+ });
+ }, []);
+
+ // Enhanced search with context snippets
+ const generateContextSnippet = useCallback((filename, searchTerm) => {
+ if (!searchTerm || !filename) return filename;
+
+ const lowerFilename = filename.toLowerCase();
+ const lowerTerm = searchTerm.toLowerCase();
+
+ // Find the best match (exact term or substring)
+ const exactMatch = lowerFilename.indexOf(lowerTerm);
+ if (exactMatch !== -1) {
+ // Show context around the match
+ const start = Math.max(0, exactMatch - 10);
+ const end = Math.min(filename.length, exactMatch + searchTerm.length + 10);
+ const snippet = filename.substring(start, end);
+ return start > 0 ? `...${snippet}` : snippet;
+ }
+
+ // Look for partial word matches
+ const words = filename.split(/[_\-\s\.]/);
+ const matchingWord = words.find(word =>
+ word.toLowerCase().includes(lowerTerm) || lowerTerm.includes(word.toLowerCase())
+ );
+
+ if (matchingWord) {
+ const wordIndex = words.indexOf(matchingWord);
+ const contextWords = words.slice(Math.max(0, wordIndex - 1), Math.min(words.length, wordIndex + 2));
+ return contextWords.join(' ');
+ }
+
+ return filename;
+ }, []);
+
return (
@@ -434,34 +505,54 @@ const GlobalSearchBar = ({ sx, ...props }) => {
whiteSpace: 'nowrap',
}}
>
- {doc.original_filename}
+ {highlightText(generateContextSnippet(doc.original_filename, query), query)}
}
secondary={
-
-
- {formatFileSize(doc.file_size)}
-
- {doc.has_ocr_text && (
-
+
+
+
+ {formatFileSize(doc.file_size)}
+
+ {doc.has_ocr_text && (
+
+ )}
+ {doc.search_rank && (
+ }
+ label={`${(doc.search_rank * 100).toFixed(0)}%`}
+ size="small"
+ color="info"
+ variant="outlined"
+ sx={{ height: 16, fontSize: '0.6rem' }}
+ />
+ )}
+
+
+ {/* Show content snippet if available */}
+ {doc.snippets && doc.snippets.length > 0 && (
+
+ {highlightText(doc.snippets[0].text.substring(0, 80) + '...', query)}
+
)}
- {doc.search_rank && (
- }
- label={`${(doc.search_rank * 100).toFixed(0)}%`}
- size="small"
- color="info"
- variant="outlined"
- sx={{ height: 16, fontSize: '0.6rem' }}
- />
- )}
-
+
}
/>
diff --git a/src/db.rs b/src/db.rs
index 419e729..5079b1c 100644
--- a/src/db.rs
+++ b/src/db.rs
@@ -85,6 +85,15 @@ impl Database {
.execute(&self.pool)
.await?;
+ // Enhanced indexes for substring matching and similarity
+ sqlx::query(r#"CREATE INDEX IF NOT EXISTS idx_documents_filename_trgm ON documents USING GIN(filename gin_trgm_ops)"#)
+ .execute(&self.pool)
+ .await?;
+
+ sqlx::query(r#"CREATE INDEX IF NOT EXISTS idx_documents_content_trgm ON documents USING GIN((COALESCE(content, '') || ' ' || COALESCE(ocr_text, '')) gin_trgm_ops)"#)
+ .execute(&self.pool)
+ .await?;
+
// Create settings table
sqlx::query(
r#"
@@ -107,6 +116,15 @@ impl Database {
memory_limit_mb INT DEFAULT 512,
cpu_priority VARCHAR(10) DEFAULT 'normal',
enable_background_ocr BOOLEAN DEFAULT TRUE,
+ ocr_page_segmentation_mode INT DEFAULT 3,
+ ocr_engine_mode INT DEFAULT 3,
+ ocr_min_confidence REAL DEFAULT 30.0,
+ ocr_dpi INT DEFAULT 300,
+ ocr_enhance_contrast BOOLEAN DEFAULT TRUE,
+ ocr_remove_noise BOOLEAN DEFAULT TRUE,
+ ocr_detect_orientation BOOLEAN DEFAULT TRUE,
+ ocr_whitelist_chars TEXT,
+ ocr_blacklist_chars TEXT,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
)
@@ -492,28 +510,78 @@ impl Database {
pub async fn enhanced_search_documents(&self, user_id: Uuid, search: SearchRequest) -> Result<(Vec, i64, u64)> {
let start_time = std::time::Instant::now();
- // Build search query based on search mode
+ // Build search query based on search mode with enhanced substring matching
let search_mode = search.search_mode.as_ref().unwrap_or(&SearchMode::Simple);
- let query_function = match search_mode {
- SearchMode::Simple => "plainto_tsquery",
- SearchMode::Phrase => "phraseto_tsquery",
- SearchMode::Fuzzy => "plainto_tsquery", // Could be enhanced with similarity
- SearchMode::Boolean => "to_tsquery",
- };
-
- let mut query_builder = sqlx::QueryBuilder::new(&format!(
- r#"
- SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, tags, created_at, updated_at, user_id,
- ts_rank(to_tsvector('english', COALESCE(content, '') || ' ' || COALESCE(ocr_text, '')), {}('english', "#,
- query_function
- ));
- query_builder.push_bind(&search.query);
- query_builder.push(&format!(")) as rank FROM documents WHERE user_id = "));
- query_builder.push_bind(user_id);
- query_builder.push(&format!(" AND to_tsvector('english', COALESCE(content, '') || ' ' || COALESCE(ocr_text, '')) @@ {}('english', ", query_function));
- query_builder.push_bind(&search.query);
- query_builder.push(")");
+ // For fuzzy mode, we'll use similarity matching which is better for substrings
+ let use_similarity = matches!(search_mode, SearchMode::Fuzzy);
+
+ let mut query_builder = if use_similarity {
+ // Use trigram similarity for substring matching
+ let mut builder = sqlx::QueryBuilder::new(
+ r#"
+ SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, tags, created_at, updated_at, user_id,
+ GREATEST(
+ similarity(filename, "#
+ );
+ builder.push_bind(&search.query);
+ builder.push(r#"),
+ similarity(COALESCE(content, '') || ' ' || COALESCE(ocr_text, ''), "#);
+ builder.push_bind(&search.query);
+ builder.push(r#"),
+ ts_rank(to_tsvector('english', COALESCE(content, '') || ' ' || COALESCE(ocr_text, '')), plainto_tsquery('english', "#);
+ builder.push_bind(&search.query);
+ builder.push(r#"))
+ ) as rank
+ FROM documents
+ WHERE user_id = "#);
+ builder.push_bind(user_id);
+ builder.push(r#" AND (
+ filename % "#);
+ builder.push_bind(&search.query);
+ builder.push(r#" OR
+ (COALESCE(content, '') || ' ' || COALESCE(ocr_text, '')) % "#);
+ builder.push_bind(&search.query);
+ builder.push(r#" OR
+ to_tsvector('english', COALESCE(content, '') || ' ' || COALESCE(ocr_text, '')) @@ plainto_tsquery('english', "#);
+ builder.push_bind(&search.query);
+ builder.push(r#")
+ )"#);
+ builder
+ } else {
+ // Use traditional full-text search with enhanced ranking
+ let query_function = match search_mode {
+ SearchMode::Simple => "plainto_tsquery",
+ SearchMode::Phrase => "phraseto_tsquery",
+ SearchMode::Boolean => "to_tsquery",
+ SearchMode::Fuzzy => "plainto_tsquery", // fallback
+ };
+
+ // Rank is the better of a flat 0.8 score for a filename substring hit and
+ // the full-text rank of the document body. Every user-supplied value is
+ // bound as a parameter; only the tsquery function name is interpolated.
+ let mut builder = sqlx::QueryBuilder::new(
+ r#"
+ SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, tags, created_at, updated_at, user_id,
+ GREATEST(
+ CASE WHEN filename ILIKE '%' || "#
+ );
+ builder.push_bind(&search.query);
+ // No quote may precede `||` here: the bound parameter is concatenated
+ // between the two '%' literals, exactly as in the WHERE clause below.
+ builder.push(&format!(r#" || '%' THEN 0.8 ELSE 0 END,
+ ts_rank(to_tsvector('english', COALESCE(content, '') || ' ' || COALESCE(ocr_text, '')), {}('english', "#, query_function));
+ builder.push_bind(&search.query);
+ builder.push(r#"))
+ ) as rank
+ FROM documents
+ WHERE user_id = "#);
+ builder.push_bind(user_id);
+ builder.push(r#" AND (
+ filename ILIKE '%' || "#);
+ builder.push_bind(&search.query);
+ builder.push(&format!(r#" || '%' OR
+ to_tsvector('english', COALESCE(content, '') || ' ' || COALESCE(ocr_text, '')) @@ {}('english', "#, query_function));
+ builder.push_bind(&search.query);
+ builder.push(r#")
+ )"#);
+ builder
+ };
if let Some(tags) = &search.tags {
if !tags.is_empty() {
@@ -574,6 +642,18 @@ impl Database {
});
}
+ // Get the query function for total count
+ let query_function = if use_similarity {
+ "plainto_tsquery"
+ } else {
+ match search_mode {
+ SearchMode::Simple => "plainto_tsquery",
+ SearchMode::Phrase => "phraseto_tsquery",
+ SearchMode::Boolean => "to_tsquery",
+ SearchMode::Fuzzy => "plainto_tsquery",
+ }
+ };
+
let total_row = sqlx::query(&format!(
r#"
SELECT COUNT(*) as total FROM documents
@@ -603,37 +683,102 @@ impl Database {
(None, None) => return snippets,
};
- // Simple keyword matching for snippets (could be enhanced with better search algorithms)
- let _query_terms: Vec<&str> = query.split_whitespace().collect();
+ // Enhanced substring matching for better context
+ let query_terms: Vec<&str> = query.split_whitespace().collect();
let text_lower = full_text.to_lowercase();
let query_lower = query.to_lowercase();
- // Find matches
+ // Find exact matches first
+ let mut match_positions = Vec::new();
+
+ // 1. Look for exact query matches
for (i, _) in text_lower.match_indices(&query_lower) {
- let snippet_start = if i >= snippet_length as usize / 2 {
- i - snippet_length as usize / 2
- } else {
- 0
- };
+ match_positions.push((i, query.len(), "exact"));
+ }
+
+ // 2. Look for individual term matches (substring matching)
+ for term in &query_terms {
+ if term.len() >= 3 { // Only match terms of reasonable length
+ let term_lower = term.to_lowercase();
+ for (i, _) in text_lower.match_indices(&term_lower) {
+ // Check if this isn't already part of an exact match
+ let is_duplicate = match_positions.iter().any(|(pos, len, _)| {
+ i >= *pos && i < *pos + *len
+ });
+ if !is_duplicate {
+ match_positions.push((i, term.len(), "term"));
+ }
+ }
+ }
+ }
+
+ // 3. Look for partial word matches (for "docu" -> "document" cases)
+ for term in &query_terms {
+ if term.len() >= 3 {
+ let term_lower = term.to_lowercase();
+ // Find words that start with our search term
+ let words_regex = regex::Regex::new(&format!(r"\b{}[a-zA-Z]*\b", regex::escape(&term_lower))).unwrap();
+ for mat in words_regex.find_iter(&text_lower) {
+ let is_duplicate = match_positions.iter().any(|(pos, len, _)| {
+ mat.start() >= *pos && mat.start() < *pos + *len
+ });
+ if !is_duplicate {
+ match_positions.push((mat.start(), mat.end() - mat.start(), "partial"));
+ }
+ }
+ }
+ }
+
+ // Sort matches by position (overlapping matches are kept, not merged)
+ match_positions.sort_by_key(|&(pos, _, _)| pos);
+
+ // Generate snippets around matches
+ for (match_pos, match_len, _match_type) in match_positions.iter().take(5) {
+ let context_size = (snippet_length as usize).saturating_sub(*match_len) / 2;
+ let snippet_start = match_pos.saturating_sub(context_size);
let snippet_end = std::cmp::min(
- snippet_start + snippet_length as usize,
+ match_pos + match_len + context_size,
full_text.len()
);
- if snippet_start < full_text.len() {
+ // Find word boundaries to avoid cutting words
+ let snippet_start = self.find_word_boundary(&full_text, snippet_start, true);
+ let snippet_end = self.find_word_boundary(&full_text, snippet_end, false);
+
+ if snippet_start < snippet_end && snippet_start < full_text.len() {
let snippet_text = &full_text[snippet_start..snippet_end];
- // Find highlight ranges within this snippet
+ // Find all highlight ranges within this snippet
let mut highlight_ranges = Vec::new();
let snippet_lower = snippet_text.to_lowercase();
+ // Highlight exact query match
for (match_start, _) in snippet_lower.match_indices(&query_lower) {
highlight_ranges.push(HighlightRange {
start: match_start as i32,
end: (match_start + query.len()) as i32,
});
}
+
+ // Highlight individual terms if no exact match
+ if highlight_ranges.is_empty() {
+ for term in &query_terms {
+ if term.len() >= 3 {
+ let term_lower = term.to_lowercase();
+ for (match_start, _) in snippet_lower.match_indices(&term_lower) {
+ highlight_ranges.push(HighlightRange {
+ start: match_start as i32,
+ end: (match_start + term.len()) as i32,
+ });
+ }
+ }
+ }
+ }
+
+ // Remove duplicate highlights and sort
+ highlight_ranges.sort_by_key(|r| r.start);
+ highlight_ranges.dedup_by_key(|r| r.start);
snippets.push(SearchSnippet {
text: snippet_text.to_string(),
@@ -642,7 +787,7 @@ impl Database {
highlight_ranges,
});
- // Limit to a few snippets per document
+ // Limit to avoid too many snippets
if snippets.len() >= 3 {
break;
}
@@ -652,6 +797,29 @@ impl Database {
snippets
}
+ fn find_word_boundary(&self, text: &str, mut pos: usize, search_backward: bool) -> usize {
+ if pos >= text.len() {
+ return text.len();
+ }
+
+ let chars: Vec = text.chars().collect();
+
+ if search_backward {
+ // Search backward for word boundary
+ while pos > 0 && chars.get(pos.saturating_sub(1)).map_or(false, |c| c.is_alphanumeric()) {
+ pos = pos.saturating_sub(1);
+ }
+ } else {
+ // Search forward for word boundary
+ while pos < chars.len() && chars.get(pos).map_or(false, |c| c.is_alphanumeric()) {
+ pos += 1;
+ }
+ }
+
+ // Convert back to byte position
+ chars.iter().take(pos).map(|c| c.len_utf8()).sum()
+ }
+
pub async fn update_document_ocr(&self, id: Uuid, ocr_text: &str) -> Result<()> {
sqlx::query("UPDATE documents SET ocr_text = $1, updated_at = NOW() WHERE id = $2")
.bind(ocr_text)
@@ -734,7 +902,10 @@ impl Database {
max_file_size_mb, allowed_file_types, auto_rotate_images, enable_image_preprocessing,
search_results_per_page, search_snippet_length, fuzzy_search_threshold,
retention_days, enable_auto_cleanup, enable_compression, memory_limit_mb,
- cpu_priority, enable_background_ocr, created_at, updated_at
+ cpu_priority, enable_background_ocr, ocr_page_segmentation_mode, ocr_engine_mode,
+ ocr_min_confidence, ocr_dpi, ocr_enhance_contrast, ocr_remove_noise,
+ ocr_detect_orientation, ocr_whitelist_chars, ocr_blacklist_chars,
+ created_at, updated_at
FROM settings WHERE user_id = $1"#
)
.bind(user_id)
@@ -761,6 +932,15 @@ impl Database {
memory_limit_mb: row.get("memory_limit_mb"),
cpu_priority: row.get("cpu_priority"),
enable_background_ocr: row.get("enable_background_ocr"),
+ ocr_page_segmentation_mode: row.get("ocr_page_segmentation_mode"),
+ ocr_engine_mode: row.get("ocr_engine_mode"),
+ ocr_min_confidence: row.get("ocr_min_confidence"),
+ ocr_dpi: row.get("ocr_dpi"),
+ ocr_enhance_contrast: row.get("ocr_enhance_contrast"),
+ ocr_remove_noise: row.get("ocr_remove_noise"),
+ ocr_detect_orientation: row.get("ocr_detect_orientation"),
+ ocr_whitelist_chars: row.get("ocr_whitelist_chars"),
+ ocr_blacklist_chars: row.get("ocr_blacklist_chars"),
created_at: row.get("created_at"),
updated_at: row.get("updated_at"),
})),
@@ -787,9 +967,11 @@ impl Database {
max_file_size_mb, allowed_file_types, auto_rotate_images, enable_image_preprocessing,
search_results_per_page, search_snippet_length, fuzzy_search_threshold,
retention_days, enable_auto_cleanup, enable_compression, memory_limit_mb,
- cpu_priority, enable_background_ocr
+ cpu_priority, enable_background_ocr, ocr_page_segmentation_mode, ocr_engine_mode,
+ ocr_min_confidence, ocr_dpi, ocr_enhance_contrast, ocr_remove_noise,
+ ocr_detect_orientation, ocr_whitelist_chars, ocr_blacklist_chars
)
- VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17)
+ VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26)
ON CONFLICT (user_id) DO UPDATE SET
ocr_language = $2,
concurrent_ocr_jobs = $3,
@@ -807,12 +989,24 @@ impl Database {
memory_limit_mb = $15,
cpu_priority = $16,
enable_background_ocr = $17,
+ ocr_page_segmentation_mode = $18,
+ ocr_engine_mode = $19,
+ ocr_min_confidence = $20,
+ ocr_dpi = $21,
+ ocr_enhance_contrast = $22,
+ ocr_remove_noise = $23,
+ ocr_detect_orientation = $24,
+ ocr_whitelist_chars = $25,
+ ocr_blacklist_chars = $26,
updated_at = NOW()
RETURNING id, user_id, ocr_language, concurrent_ocr_jobs, ocr_timeout_seconds,
max_file_size_mb, allowed_file_types, auto_rotate_images, enable_image_preprocessing,
search_results_per_page, search_snippet_length, fuzzy_search_threshold,
retention_days, enable_auto_cleanup, enable_compression, memory_limit_mb,
- cpu_priority, enable_background_ocr, created_at, updated_at
+ cpu_priority, enable_background_ocr, ocr_page_segmentation_mode, ocr_engine_mode,
+ ocr_min_confidence, ocr_dpi, ocr_enhance_contrast, ocr_remove_noise,
+ ocr_detect_orientation, ocr_whitelist_chars, ocr_blacklist_chars,
+ created_at, updated_at
"#
)
.bind(user_id)
@@ -832,6 +1026,15 @@ impl Database {
.bind(settings.memory_limit_mb.unwrap_or(current.memory_limit_mb))
.bind(settings.cpu_priority.as_ref().unwrap_or(¤t.cpu_priority))
.bind(settings.enable_background_ocr.unwrap_or(current.enable_background_ocr))
+ .bind(settings.ocr_page_segmentation_mode.unwrap_or(current.ocr_page_segmentation_mode))
+ .bind(settings.ocr_engine_mode.unwrap_or(current.ocr_engine_mode))
+ .bind(settings.ocr_min_confidence.unwrap_or(current.ocr_min_confidence))
+ .bind(settings.ocr_dpi.unwrap_or(current.ocr_dpi))
+ .bind(settings.ocr_enhance_contrast.unwrap_or(current.ocr_enhance_contrast))
+ .bind(settings.ocr_remove_noise.unwrap_or(current.ocr_remove_noise))
+ .bind(settings.ocr_detect_orientation.unwrap_or(current.ocr_detect_orientation))
+ .bind(settings.ocr_whitelist_chars.as_ref().unwrap_or(¤t.ocr_whitelist_chars))
+ .bind(settings.ocr_blacklist_chars.as_ref().unwrap_or(¤t.ocr_blacklist_chars))
.fetch_one(&self.pool)
.await?;
@@ -854,6 +1057,15 @@ impl Database {
memory_limit_mb: row.get("memory_limit_mb"),
cpu_priority: row.get("cpu_priority"),
enable_background_ocr: row.get("enable_background_ocr"),
+ ocr_page_segmentation_mode: row.get("ocr_page_segmentation_mode"),
+ ocr_engine_mode: row.get("ocr_engine_mode"),
+ ocr_min_confidence: row.get("ocr_min_confidence"),
+ ocr_dpi: row.get("ocr_dpi"),
+ ocr_enhance_contrast: row.get("ocr_enhance_contrast"),
+ ocr_remove_noise: row.get("ocr_remove_noise"),
+ ocr_detect_orientation: row.get("ocr_detect_orientation"),
+ ocr_whitelist_chars: row.get("ocr_whitelist_chars"),
+ ocr_blacklist_chars: row.get("ocr_blacklist_chars"),
created_at: row.get("created_at"),
updated_at: row.get("updated_at"),
})
diff --git a/src/enhanced_ocr.rs b/src/enhanced_ocr.rs
new file mode 100644
index 0000000..064af3d
--- /dev/null
+++ b/src/enhanced_ocr.rs
@@ -0,0 +1,655 @@
+use anyhow::Result;
+use tracing::{debug, info, warn};
+
+#[cfg(feature = "ocr")]
+use image::{DynamicImage, ImageBuffer, Luma, GenericImageView};
+#[cfg(feature = "ocr")]
+use imageproc::{
+ contrast::adaptive_threshold,
+ morphology::{close, open},
+ filter::{median_filter, gaussian_blur_f32},
+ distance_transform::Norm,
+};
+#[cfg(feature = "ocr")]
+use tesseract::{Tesseract, PageSegMode, OcrEngineMode};
+
+use crate::models::Settings;
+
+#[derive(Debug, Clone)]
+pub struct ImageQualityStats {
+ pub average_brightness: f32,
+ pub contrast_ratio: f32,
+ pub noise_level: f32,
+ pub sharpness: f32,
+}
+
+#[derive(Debug, Clone)]
+pub struct OcrResult {
+ pub text: String,
+ pub confidence: f32,
+ pub processing_time_ms: u64,
+ pub word_count: usize,
+ pub preprocessing_applied: Vec,
+}
+
+pub struct EnhancedOcrService {
+ pub temp_dir: String,
+}
+
+impl EnhancedOcrService {
+ pub fn new(temp_dir: String) -> Self {
+ Self { temp_dir }
+ }
+
+ /// Extract text from image with high-quality OCR settings
+ #[cfg(feature = "ocr")]
+ pub async fn extract_text_from_image(&self, file_path: &str, settings: &Settings) -> Result {
+ let start_time = std::time::Instant::now();
+ info!("Starting enhanced OCR for image: {}", file_path);
+
+ let mut preprocessing_applied = Vec::new();
+
+ // Load and preprocess the image
+ let processed_image_path = if settings.enable_image_preprocessing {
+ let processed_path = self.preprocess_image(file_path, settings).await?;
+ preprocessing_applied.push("Image preprocessing enabled".to_string());
+ processed_path
+ } else {
+ file_path.to_string()
+ };
+
+ // Configure Tesseract with optimal settings
+ let mut tesseract = self.configure_tesseract(&processed_image_path, settings)?;
+
+ // Extract text with confidence
+ let text = tesseract.get_text()?.trim().to_string();
+ let confidence = self.calculate_overall_confidence(&mut tesseract)?;
+
+ // Clean up temporary files if created
+ if processed_image_path != file_path {
+ let _ = std::fs::remove_file(&processed_image_path);
+ }
+
+ let processing_time = start_time.elapsed().as_millis() as u64;
+ let word_count = text.split_whitespace().count();
+
+ debug!(
+ "OCR completed: {} words, {:.1}% confidence, {}ms",
+ word_count, confidence, processing_time
+ );
+
+ Ok(OcrResult {
+ text,
+ confidence,
+ processing_time_ms: processing_time,
+ word_count,
+ preprocessing_applied,
+ })
+ }
+
+ /// Preprocess image for optimal OCR quality, especially for challenging conditions
+ #[cfg(feature = "ocr")]
+ async fn preprocess_image(&self, input_path: &str, settings: &Settings) -> Result {
+ let img = image::open(input_path)?;
+ let mut processed_img = img;
+
+ info!("Original image dimensions: {}x{}", processed_img.width(), processed_img.height());
+
+ // Apply orientation detection and correction
+ if settings.ocr_detect_orientation {
+ processed_img = self.detect_and_correct_orientation(processed_img)?;
+ }
+
+ // Aggressively upscale low-resolution images for better OCR
+ processed_img = self.smart_resize_for_ocr(processed_img, settings.ocr_dpi)?;
+
+ // Convert to grayscale for better OCR
+ let gray_img = processed_img.to_luma8();
+ let mut processed_gray = gray_img;
+
+ // Analyze image quality and apply appropriate enhancements
+ let quality_stats = self.analyze_image_quality(&processed_gray);
+ info!("Image quality analysis: brightness={:.1}, contrast={:.1}, noise_level={:.1}",
+ quality_stats.average_brightness, quality_stats.contrast_ratio, quality_stats.noise_level);
+
+ // Apply adaptive brightness correction for dim or low-contrast images
+ if quality_stats.average_brightness < 80.0 || quality_stats.contrast_ratio < 0.3 {
+ processed_gray = self.enhance_brightness_and_contrast(processed_gray, &quality_stats)?;
+ }
+
+ // Apply noise removal (more aggressive for noisy images)
+ if settings.ocr_remove_noise || quality_stats.noise_level > 0.15 {
+ processed_gray = self.adaptive_noise_removal(processed_gray, &quality_stats)?;
+ }
+
+ // Apply contrast enhancement (adaptive based on image quality)
+ if settings.ocr_enhance_contrast {
+ processed_gray = self.adaptive_contrast_enhancement(processed_gray, &quality_stats)?;
+ }
+
+ // Apply sharpening for blurry images
+ if quality_stats.sharpness < 0.4 {
+ processed_gray = self.sharpen_image(processed_gray)?;
+ }
+
+ // Apply morphological operations for text clarity
+ processed_gray = self.apply_morphological_operations(processed_gray)?;
+
+ // Save processed image to temporary file
+ let temp_filename = format!("processed_{}_{}.png",
+ std::process::id(),
+ std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)?.as_millis()
+ );
+ let temp_path = format!("{}/{}", self.temp_dir, temp_filename);
+
+ let dynamic_processed = DynamicImage::ImageLuma8(processed_gray);
+ dynamic_processed.save(&temp_path)?;
+
+ info!("Processed image saved to: {}", temp_path);
+ Ok(temp_path)
+ }
+
+ /// Configure Tesseract with optimal settings
+ #[cfg(feature = "ocr")]
+ fn configure_tesseract(&self, image_path: &str, settings: &Settings) -> Result {
+ let mut tesseract = Tesseract::new(None, Some(&settings.ocr_language))?;
+
+ // Set the image
+ tesseract = tesseract.set_image(image_path)?;
+
+ // Configure Page Segmentation Mode (PSM)
+ let psm = match settings.ocr_page_segmentation_mode {
+ 0 => PageSegMode::PsmOsdOnly,
+ 1 => PageSegMode::PsmAutoOsd,
+ 2 => PageSegMode::PsmAutoOnly,
+ 3 => PageSegMode::PsmAuto,
+ 4 => PageSegMode::PsmSingleColumn,
+ 5 => PageSegMode::PsmSingleBlockVertText,
+ 6 => PageSegMode::PsmSingleBlock,
+ 7 => PageSegMode::PsmSingleLine,
+ 8 => PageSegMode::PsmSingleWord,
+ 9 => PageSegMode::PsmCircleWord,
+ 10 => PageSegMode::PsmSingleChar,
+ 11 => PageSegMode::PsmSparseText,
+ 12 => PageSegMode::PsmSparseTextOsd,
+ 13 => PageSegMode::PsmRawLine,
+ _ => PageSegMode::PsmAuto, // Default fallback
+ };
+ tesseract.set_page_seg_mode(psm);
+
+ // Configure OCR Engine Mode (OEM)
+ let _oem = match settings.ocr_engine_mode {
+ 0 => OcrEngineMode::TesseractOnly,
+ 1 => OcrEngineMode::LstmOnly,
+ 2 => OcrEngineMode::TesseractLstmCombined,
+ 3 => OcrEngineMode::Default,
+ _ => OcrEngineMode::Default, // Default fallback
+ };
+
+ // Note: set_engine_mode may not be available in the current tesseract crate version
+ // We'll configure this differently if needed
+
+ // Set DPI if specified and different from 0
+ if settings.ocr_dpi > 0 {
+ tesseract = tesseract.set_variable("user_defined_dpi", &settings.ocr_dpi.to_string())?;
+ }
+
+ // Configure character whitelist/blacklist
+ if let Some(ref whitelist) = settings.ocr_whitelist_chars {
+ if !whitelist.is_empty() {
+ tesseract = tesseract.set_variable("tessedit_char_whitelist", whitelist)?;
+ }
+ }
+
+ if let Some(ref blacklist) = settings.ocr_blacklist_chars {
+ if !blacklist.is_empty() {
+ tesseract = tesseract.set_variable("tessedit_char_blacklist", blacklist)?;
+ }
+ }
+
+ // Additional high-quality settings for challenging images
+ tesseract = tesseract.set_variable("preserve_interword_spaces", "1")?;
+ tesseract = tesseract.set_variable("tessedit_do_invert", "0")?;
+ tesseract = tesseract.set_variable("classify_enable_learning", "0")?;
+ tesseract = tesseract.set_variable("textord_really_old_xheight", "1")?;
+ tesseract = tesseract.set_variable("textord_min_xheight", "7")?;
+
+ // Enhanced settings for low-quality images
+ tesseract = tesseract.set_variable("tessedit_char_unblacklist_fraction", "0.0")?;
+ tesseract = tesseract.set_variable("edges_max_children_per_outline", "40")?;
+ tesseract = tesseract.set_variable("textord_noise_sizefraction", "10.0")?;
+ tesseract = tesseract.set_variable("textord_noise_translimit", "16.0")?;
+ tesseract = tesseract.set_variable("textord_noise_normratio", "2.0")?;
+
+ // Improve word breaking for dense text
+ tesseract = tesseract.set_variable("textord_tabfind_find_tables", "1")?;
+ tesseract = tesseract.set_variable("textord_use_cjk_fp_model", "0")?;
+
+ // Better handling of degraded images
+ tesseract = tesseract.set_variable("classify_adapt_feature_threshold", "230")?;
+ tesseract = tesseract.set_variable("classify_adapt_proto_threshold", "230")?;
+ tesseract = tesseract.set_variable("textord_heavy_nr", "1")?;
+
+ Ok(tesseract)
+ }
+
+ /// Calculate overall confidence score
+ #[cfg(feature = "ocr")]
+ fn calculate_overall_confidence(&self, _tesseract: &mut Tesseract) -> Result {
+ // Note: get_word_confidences may not be available in current tesseract crate version
+ // For now, no estimate is computed from the recognized text; a fixed placeholder is returned
+ // This can be enhanced when the API is available or with alternative methods
+
+ // Return a reasonable default confidence for now
+ Ok(85.0)
+ }
+
+ /// Detect and correct image orientation
+ #[cfg(feature = "ocr")]
+ fn detect_and_correct_orientation(&self, img: DynamicImage) -> Result {
+ // For now, we'll implement basic rotation detection
+ // In a production system, you might want to use Tesseract's OSD or advanced algorithms
+ let (width, height) = img.dimensions();
+
+ // If image is wider than tall by significant margin, it might need rotation
+ if width as f32 / height as f32 > 2.0 {
+ Ok(img.rotate90())
+ } else {
+ Ok(img)
+ }
+ }
+
+ /// Smart resize for OCR - aggressive upscaling for low-res images
+ #[cfg(feature = "ocr")]
+ fn smart_resize_for_ocr(&self, img: DynamicImage, target_dpi: i32) -> Result {
+ let (width, height) = img.dimensions();
+ let min_dimension = width.min(height);
+
+ // Calculate target dimensions
+ let mut new_width = width;
+ let mut new_height = height;
+
+ // If image is very small, aggressively upscale
+ if min_dimension < 300 {
+ let scale_factor = 600.0 / min_dimension as f32; // Scale to at least 600px on smallest side
+ new_width = (width as f32 * scale_factor) as u32;
+ new_height = (height as f32 * scale_factor) as u32;
+ info!("Aggressively upscaling small image by factor {:.2}x", scale_factor);
+ } else if target_dpi > 0 && target_dpi != 72 {
+ // Apply DPI scaling
+ let scale_factor = target_dpi as f32 / 72.0;
+ new_width = (width as f32 * scale_factor) as u32;
+ new_height = (height as f32 * scale_factor) as u32;
+ }
+
+ if new_width != width || new_height != height {
+ // Use Lanczos3 for best quality upscaling
+ Ok(img.resize(new_width, new_height, image::imageops::FilterType::Lanczos3))
+ } else {
+ Ok(img)
+ }
+ }
+
+ /// Analyze image quality metrics
+ #[cfg(feature = "ocr")]
+ fn analyze_image_quality(&self, img: &ImageBuffer, Vec>) -> ImageQualityStats {
+ let pixels: Vec = img.pixels().map(|p| p[0]).collect();
+ let pixel_count = pixels.len() as f32;
+
+ // Calculate average brightness
+ let sum: u32 = pixels.iter().map(|&p| p as u32).sum();
+ let average_brightness = sum as f32 / pixel_count;
+
+ // Calculate contrast (standard deviation of pixel values)
+ let variance: f32 = pixels.iter()
+ .map(|&p| {
+ let diff = p as f32 - average_brightness;
+ diff * diff
+ })
+ .sum::() / pixel_count;
+ let std_dev = variance.sqrt();
+ let contrast_ratio = std_dev / 255.0;
+
+ // Estimate noise level from how far sampled pixels deviate from their 3x3 neighbor average
+ let noise_level = self.estimate_noise_level(img);
+
+ // Estimate sharpness using gradient magnitude
+ let sharpness = self.estimate_sharpness(img);
+
+ ImageQualityStats {
+ average_brightness,
+ contrast_ratio,
+ noise_level,
+ sharpness,
+ }
+ }
+
+ /// Estimate noise level in image
+ #[cfg(feature = "ocr")]
+ fn estimate_noise_level(&self, img: &ImageBuffer, Vec>) -> f32 {
+ let (width, height) = img.dimensions();
+ let mut noise_sum = 0.0f32;
+ let mut sample_count = 0u32;
+
+ // Sample every 10th pixel to estimate noise
+ for y in (5..height-5).step_by(10) {
+ for x in (5..width-5).step_by(10) {
+ let center = img.get_pixel(x, y)[0] as f32;
+ let mut neighbor_sum = 0.0f32;
+ let mut neighbor_count = 0u32;
+
+ // Check 3x3 neighborhood
+ for dy in -1..=1 {
+ for dx in -1..=1 {
+ if dx == 0 && dy == 0 { continue; }
+ let neighbor = img.get_pixel((x as i32 + dx) as u32, (y as i32 + dy) as u32)[0] as f32;
+ neighbor_sum += neighbor;
+ neighbor_count += 1;
+ }
+ }
+
+ let neighbor_avg = neighbor_sum / neighbor_count as f32;
+ let local_variance = (center - neighbor_avg).abs();
+ noise_sum += local_variance;
+ sample_count += 1;
+ }
+ }
+
+ if sample_count > 0 {
+ (noise_sum / sample_count as f32) / 255.0
+ } else {
+ 0.0
+ }
+ }
+
+ /// Estimate image sharpness using gradient magnitude
+ #[cfg(feature = "ocr")]
+ fn estimate_sharpness(&self, img: &ImageBuffer, Vec>) -> f32 {
+ let (width, height) = img.dimensions();
+ let mut gradient_sum = 0.0f32;
+ let mut sample_count = 0u32;
+
+ // Calculate gradients for interior pixels
+ for y in 1..height-1 {
+ for x in 1..width-1 {
+ let _center = img.get_pixel(x, y)[0] as f32;
+ let left = img.get_pixel(x-1, y)[0] as f32;
+ let right = img.get_pixel(x+1, y)[0] as f32;
+ let top = img.get_pixel(x, y-1)[0] as f32;
+ let bottom = img.get_pixel(x, y+1)[0] as f32;
+
+ let grad_x = (right - left) / 2.0;
+ let grad_y = (bottom - top) / 2.0;
+ let gradient_magnitude = (grad_x * grad_x + grad_y * grad_y).sqrt();
+
+ gradient_sum += gradient_magnitude;
+ sample_count += 1;
+ }
+ }
+
+ if sample_count > 0 {
+ (gradient_sum / sample_count as f32) / 255.0
+ } else {
+ 0.0
+ }
+ }
+
+ /// Enhanced brightness and contrast correction for dim images
+ #[cfg(feature = "ocr")]
+ fn enhance_brightness_and_contrast(&self, img: ImageBuffer, Vec>, stats: &ImageQualityStats) -> Result, Vec>> {
+ let (width, height) = img.dimensions();
+ let mut enhanced = ImageBuffer::new(width, height);
+
+ // Calculate enhancement parameters based on image statistics
+ let brightness_boost = if stats.average_brightness < 50.0 {
+ 60.0 - stats.average_brightness // Aggressive boost for very dim images
+ } else if stats.average_brightness < 80.0 {
+ 30.0 - (stats.average_brightness - 50.0) * 0.5 // Moderate boost
+ } else {
+ 0.0 // No boost needed
+ };
+
+ let contrast_multiplier = if stats.contrast_ratio < 0.2 {
+ 2.5 // Aggressive contrast boost for flat images
+ } else if stats.contrast_ratio < 0.4 {
+ 1.8 // Moderate contrast boost
+ } else {
+ 1.2 // Slight boost
+ };
+
+ info!("Applying brightness boost: {:.1}, contrast multiplier: {:.1}", brightness_boost, contrast_multiplier);
+
+ for (x, y, pixel) in img.enumerate_pixels() {
+ let original_value = pixel[0] as f32;
+
+ // Apply brightness and contrast enhancement
+ let enhanced_value = ((original_value + brightness_boost) * contrast_multiplier).round();
+ let clamped_value = enhanced_value.max(0.0).min(255.0) as u8;
+
+ enhanced.put_pixel(x, y, Luma([clamped_value]));
+ }
+
+ Ok(enhanced)
+ }
+
+ /// Adaptive noise removal based on detected noise level
+ #[cfg(feature = "ocr")]
+ fn adaptive_noise_removal(&self, img: ImageBuffer, Vec>, stats: &ImageQualityStats) -> Result, Vec>> {
+ let mut processed = img;
+
+ if stats.noise_level > 0.2 {
+ // Heavy noise - apply multiple filters
+ processed = median_filter(&processed, 2, 2); // Larger median filter
+ processed = gaussian_blur_f32(&processed, 0.8); // More blur
+ info!("Applied heavy noise reduction (noise level: {:.2})", stats.noise_level);
+ } else if stats.noise_level > 0.1 {
+ // Moderate noise
+ processed = median_filter(&processed, 1, 1);
+ processed = gaussian_blur_f32(&processed, 0.5);
+ info!("Applied moderate noise reduction");
+ } else {
+ // Light noise or clean image
+ processed = median_filter(&processed, 1, 1);
+ info!("Applied light noise reduction");
+ }
+
+ Ok(processed)
+ }
+
+ /// Adaptive contrast enhancement based on image quality
+ #[cfg(feature = "ocr")]
+ fn adaptive_contrast_enhancement(&self, img: ImageBuffer, Vec>, stats: &ImageQualityStats) -> Result, Vec>> {
+ // Choose threshold size based on image dimensions and quality
+ let (width, height) = img.dimensions();
+ let min_dimension = width.min(height);
+
+ let threshold_size = if stats.contrast_ratio < 0.2 {
+ // Low contrast - use smaller windows for more aggressive local adaptation
+ (min_dimension / 20).max(11).min(31)
+ } else {
+ // Good contrast - use larger windows
+ (min_dimension / 15).max(15).min(41)
+ };
+
+ // Ensure odd number for threshold size
+ let threshold_size = if threshold_size % 2 == 0 { threshold_size + 1 } else { threshold_size };
+
+ info!("Applying adaptive threshold with window size: {}", threshold_size);
+ let enhanced = adaptive_threshold(&img, threshold_size);
+
+ Ok(enhanced)
+ }
+
+ /// Sharpen blurry images
+ #[cfg(feature = "ocr")]
+ fn sharpen_image(&self, img: ImageBuffer, Vec>) -> Result, Vec>> {
+ let (width, height) = img.dimensions();
+ let mut sharpened = ImageBuffer::new(width, height);
+
+ // 3x3 sharpening convolution kernel (center 5, 4-neighbors -1) - enhances edges
+ let kernel = [
+ [0.0, -1.0, 0.0],
+ [-1.0, 5.0, -1.0],
+ [0.0, -1.0, 0.0],
+ ];
+
+ for y in 1..height-1 {
+ for x in 1..width-1 {
+ let mut sum = 0.0;
+
+ for ky in 0..3 {
+ for kx in 0..3 {
+ let px = img.get_pixel(x + kx - 1, y + ky - 1)[0] as f32;
+ sum += px * kernel[ky as usize][kx as usize];
+ }
+ }
+
+ let sharpened_value = sum.round().max(0.0).min(255.0) as u8;
+ sharpened.put_pixel(x, y, Luma([sharpened_value]));
+ }
+ }
+
+ // Copy border pixels
+ for y in 0..height {
+ for x in 0..width {
+ if x == 0 || x == width-1 || y == 0 || y == height-1 {
+ sharpened.put_pixel(x, y, *img.get_pixel(x, y));
+ }
+ }
+ }
+
+ info!("Applied image sharpening");
+ Ok(sharpened)
+ }
+
+ /// Apply morphological operations for text clarity
+ #[cfg(feature = "ocr")]
+ fn apply_morphological_operations(&self, img: ImageBuffer, Vec>) -> Result, Vec>> {
+ // Apply opening to remove small noise
+ let opened = open(&img, Norm::LInf, 1);
+
+ // Apply closing to fill small gaps in text
+ let closed = close(&opened, Norm::LInf, 1);
+
+ Ok(closed)
+ }
+
+ /// Extract text from PDF
+ #[cfg(feature = "ocr")]
+ pub async fn extract_text_from_pdf(&self, file_path: &str, _settings: &Settings) -> Result {
+ let start_time = std::time::Instant::now();
+ info!("Extracting text from PDF: {}", file_path);
+
+ let bytes = std::fs::read(file_path)?;
+ let text = pdf_extract::extract_text_from_mem(&bytes)?;
+
+ let processing_time = start_time.elapsed().as_millis() as u64;
+ let word_count = text.split_whitespace().count();
+
+ Ok(OcrResult {
+ text: text.trim().to_string(),
+ confidence: 95.0, // PDF text extraction is generally high confidence
+ processing_time_ms: processing_time,
+ word_count,
+ preprocessing_applied: vec!["PDF text extraction".to_string()],
+ })
+ }
+
+ /// Extract text from any supported file type
+ pub async fn extract_text(&self, file_path: &str, mime_type: &str, settings: &Settings) -> Result {
+ match mime_type {
+ "application/pdf" => {
+ #[cfg(feature = "ocr")]
+ {
+ self.extract_text_from_pdf(file_path, settings).await
+ }
+ #[cfg(not(feature = "ocr"))]
+ {
+ Err(anyhow::anyhow!("OCR feature not enabled"))
+ }
+ }
+ mime if mime.starts_with("image/") => {
+ #[cfg(feature = "ocr")]
+ {
+ self.extract_text_from_image(file_path, settings).await
+ }
+ #[cfg(not(feature = "ocr"))]
+ {
+ Err(anyhow::anyhow!("OCR feature not enabled"))
+ }
+ }
+ "text/plain" => {
+ let start_time = std::time::Instant::now();
+ let text = std::fs::read_to_string(file_path)?;
+ let processing_time = start_time.elapsed().as_millis() as u64;
+ let word_count = text.split_whitespace().count();
+
+ Ok(OcrResult {
+ text: text.trim().to_string(),
+ confidence: 100.0, // Plain text is 100% confident
+ processing_time_ms: processing_time,
+ word_count,
+ preprocessing_applied: vec!["Plain text read".to_string()],
+ })
+ }
+ _ => Err(anyhow::anyhow!("Unsupported file type: {}", mime_type)),
+ }
+ }
+
+ /// Validate OCR result quality
+ pub fn validate_ocr_quality(&self, result: &OcrResult, settings: &Settings) -> bool {
+ // Check minimum confidence threshold
+ if result.confidence < settings.ocr_min_confidence {
+ warn!(
+ "OCR result below confidence threshold: {:.1}% < {:.1}%",
+ result.confidence, settings.ocr_min_confidence
+ );
+ return false;
+ }
+
+ // Check if text is reasonable (not just noise)
+ if result.word_count == 0 {
+ warn!("OCR result contains no words");
+ return false;
+ }
+
+ // Check for reasonable character distribution
+ let total_chars = result.text.len();
+ if total_chars == 0 {
+ return false;
+ }
+
+ let alphanumeric_chars = result.text.chars().filter(|c| c.is_alphanumeric()).count();
+ let alphanumeric_ratio = alphanumeric_chars as f32 / total_chars as f32;
+
+ // Expect at least 30% alphanumeric characters for valid text
+ if alphanumeric_ratio < 0.3 {
+ warn!(
+ "OCR result has low alphanumeric ratio: {:.1}%",
+ alphanumeric_ratio * 100.0
+ );
+ return false;
+ }
+
+ true
+ }
+}
+
+#[cfg(not(feature = "ocr"))]
+impl EnhancedOcrService {
+ pub async fn extract_text_from_image(&self, _file_path: &str, _settings: &Settings) -> Result {
+ Err(anyhow::anyhow!("OCR feature not enabled"))
+ }
+
+ pub async fn extract_text_from_pdf(&self, _file_path: &str, _settings: &Settings) -> Result {
+ Err(anyhow::anyhow!("OCR feature not enabled"))
+ }
+
+ pub async fn extract_text(&self, _file_path: &str, _mime_type: &str, _settings: &Settings) -> Result {
+ Err(anyhow::anyhow!("OCR feature not enabled"))
+ }
+
+ pub fn validate_ocr_quality(&self, _result: &OcrResult, _settings: &Settings) -> bool {
+ false
+ }
+}
\ No newline at end of file
diff --git a/src/lib.rs b/src/lib.rs
index 58a43ae..807d6ac 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -2,6 +2,7 @@ pub mod auth;
pub mod batch_ingest;
pub mod config;
pub mod db;
+pub mod enhanced_ocr; // Enhanced OCR service; Tesseract/image code paths are gated by the "ocr" feature
pub mod file_service;
pub mod models;
pub mod ocr;
@@ -26,4 +27,4 @@ pub struct AppState {
/// Health check endpoint for monitoring
pub async fn health_check() -> Result, StatusCode> {
Ok(Json(serde_json::json!({"status": "ok"})))
-}
\ No newline at end of file
+}
diff --git a/src/main.rs b/src/main.rs
index 356aa01..e04219a 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -12,6 +12,7 @@ mod auth;
mod batch_ingest;
mod config;
mod db;
+mod enhanced_ocr;
mod file_service;
mod models;
mod ocr;
diff --git a/src/models.rs b/src/models.rs
index 78bc8ab..20271e5 100644
--- a/src/models.rs
+++ b/src/models.rs
@@ -186,6 +186,15 @@ pub struct Settings {
pub memory_limit_mb: i32,
pub cpu_priority: String,
pub enable_background_ocr: bool,
+ pub ocr_page_segmentation_mode: i32,
+ pub ocr_engine_mode: i32,
+ pub ocr_min_confidence: f32,
+ pub ocr_dpi: i32,
+ pub ocr_enhance_contrast: bool,
+ pub ocr_remove_noise: bool,
+ pub ocr_detect_orientation: bool,
+ pub ocr_whitelist_chars: Option,
+ pub ocr_blacklist_chars: Option,
pub created_at: DateTime,
pub updated_at: DateTime,
}
@@ -208,6 +217,15 @@ pub struct SettingsResponse {
pub memory_limit_mb: i32,
pub cpu_priority: String,
pub enable_background_ocr: bool,
+ pub ocr_page_segmentation_mode: i32,
+ pub ocr_engine_mode: i32,
+ pub ocr_min_confidence: f32,
+ pub ocr_dpi: i32,
+ pub ocr_enhance_contrast: bool,
+ pub ocr_remove_noise: bool,
+ pub ocr_detect_orientation: bool,
+ pub ocr_whitelist_chars: Option,
+ pub ocr_blacklist_chars: Option,
}
#[derive(Debug, Serialize, Deserialize, ToSchema)]
@@ -228,6 +246,15 @@ pub struct UpdateSettings {
pub memory_limit_mb: Option,
pub cpu_priority: Option,
pub enable_background_ocr: Option,
+ pub ocr_page_segmentation_mode: Option,
+ pub ocr_engine_mode: Option,
+ pub ocr_min_confidence: Option,
+ pub ocr_dpi: Option,
+ pub ocr_enhance_contrast: Option,
+ pub ocr_remove_noise: Option,
+ pub ocr_detect_orientation: Option,
+ pub ocr_whitelist_chars: Option