feat(client/server): update search tests, and upgrade OCR

This commit is contained in:
perfectra1n 2025-06-12 22:00:14 -07:00
parent 1f50004d66
commit 1a1f886f04
11 changed files with 1715 additions and 73 deletions

456
Cargo.lock generated
View File

@ -2,6 +2,12 @@
# It is not intended for manual editing. # It is not intended for manual editing.
version = 4 version = 4
[[package]]
name = "ab_glyph_rasterizer"
version = "0.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c71b1793ee61086797f5c80b6efa2b8ffa6d5dd703f118545808a7f2e27f7046"
[[package]] [[package]]
name = "addr2line" name = "addr2line"
version = "0.24.2" version = "0.24.2"
@ -125,6 +131,15 @@ version = "1.0.98"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487"
[[package]]
name = "approx"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cab112f0a86d568ea0e627cc1d6be74a1e9cd55214684db5561995f6dad897c6"
dependencies = [
"num-traits",
]
[[package]] [[package]]
name = "async-trait" name = "async-trait"
version = "0.1.88" version = "0.1.88"
@ -275,6 +290,12 @@ dependencies = [
"which", "which",
] ]
[[package]]
name = "bit_field"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc827186963e592360843fb5ba4b973e145841266c1357f7180c43526f2e5b61"
[[package]] [[package]]
name = "bitflags" name = "bitflags"
version = "1.3.2" version = "1.3.2"
@ -325,6 +346,12 @@ version = "3.18.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "793db76d6187cd04dff33004d8e6c9cc4e05cd330500379d2394209271b4aeee" checksum = "793db76d6187cd04dff33004d8e6c9cc4e05cd330500379d2394209271b4aeee"
[[package]]
name = "bytemuck"
version = "1.23.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c76a5792e44e4abe34d3abf15636779261d45a7450612059293d1d2cfc63422"
[[package]] [[package]]
name = "byteorder" name = "byteorder"
version = "1.5.0" version = "1.5.0"
@ -437,6 +464,12 @@ version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675"
[[package]]
name = "color_quant"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b"
[[package]] [[package]]
name = "colorchoice" name = "colorchoice"
version = "1.0.4" version = "1.0.4"
@ -449,6 +482,15 @@ version = "0.9.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8"
[[package]]
name = "conv"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ff10625fd0ac447827aa30ea8b861fead473bb60aeb73af6c1c58caf0d1299"
dependencies = [
"custom_derive",
]
[[package]] [[package]]
name = "core-foundation" name = "core-foundation"
version = "0.9.4" version = "0.9.4"
@ -507,6 +549,25 @@ dependencies = [
"crossbeam-utils", "crossbeam-utils",
] ]
[[package]]
name = "crossbeam-deque"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
dependencies = [
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
dependencies = [
"crossbeam-utils",
]
[[package]] [[package]]
name = "crossbeam-queue" name = "crossbeam-queue"
version = "0.3.12" version = "0.3.12"
@ -522,6 +583,12 @@ version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]]
name = "crunchy"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929"
[[package]] [[package]]
name = "crypto-common" name = "crypto-common"
version = "0.1.6" version = "0.1.6"
@ -532,6 +599,12 @@ dependencies = [
"typenum", "typenum",
] ]
[[package]]
name = "custom_derive"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef8ae57c4978a2acd8b869ce6b9ca1dfe817bff704c220209fdef2c0b75a01b9"
[[package]] [[package]]
name = "darling" name = "darling"
version = "0.13.4" version = "0.13.4"
@ -676,12 +749,36 @@ version = "2.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0"
[[package]]
name = "exr"
version = "1.73.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f83197f59927b46c04a183a619b7c29df34e63e63c7869320862268c0ef687e0"
dependencies = [
"bit_field",
"half",
"lebe",
"miniz_oxide",
"rayon-core",
"smallvec",
"zune-inflate",
]
[[package]] [[package]]
name = "fastrand" name = "fastrand"
version = "2.3.0" version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
[[package]]
name = "fdeflate"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e6853b52649d4ac5c0bd02320cddc5ba956bdb407c4b75a2c6b75bf51500f8c"
dependencies = [
"simd-adler32",
]
[[package]] [[package]]
name = "filetime" name = "filetime"
version = "0.2.25" version = "0.2.25"
@ -864,6 +961,17 @@ dependencies = [
"version_check", "version_check",
] ]
[[package]]
name = "getrandom"
version = "0.1.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce"
dependencies = [
"cfg-if",
"libc",
"wasi 0.9.0+wasi-snapshot-preview1",
]
[[package]] [[package]]
name = "getrandom" name = "getrandom"
version = "0.2.16" version = "0.2.16"
@ -889,6 +997,16 @@ dependencies = [
"wasi 0.14.2+wasi-0.2.4", "wasi 0.14.2+wasi-0.2.4",
] ]
[[package]]
name = "gif"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fb2d69b19215e18bb912fa30f7ce15846e301408695e44e0ef719f1da9e19f2"
dependencies = [
"color_quant",
"weezl",
]
[[package]] [[package]]
name = "gimli" name = "gimli"
version = "0.31.1" version = "0.31.1"
@ -920,6 +1038,16 @@ dependencies = [
"tracing", "tracing",
] ]
[[package]]
name = "half"
version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9"
dependencies = [
"cfg-if",
"crunchy",
]
[[package]] [[package]]
name = "hashbrown" name = "hashbrown"
version = "0.14.5" version = "0.14.5"
@ -1287,6 +1415,42 @@ dependencies = [
"icu_properties", "icu_properties",
] ]
[[package]]
name = "image"
version = "0.24.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5690139d2f55868e080017335e4b94cb7414274c74f1669c84fb5feba2c9f69d"
dependencies = [
"bytemuck",
"byteorder",
"color_quant",
"exr",
"gif",
"jpeg-decoder",
"num-traits",
"png",
"qoi",
"tiff",
]
[[package]]
name = "imageproc"
version = "0.23.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6aee993351d466301a29655d628bfc6f5a35a0d062b6160ca0808f425805fd7"
dependencies = [
"approx",
"conv",
"image",
"itertools",
"nalgebra",
"num",
"rand 0.7.3",
"rand_distr",
"rayon",
"rusttype",
]
[[package]] [[package]]
name = "indexmap" name = "indexmap"
version = "2.9.0" version = "2.9.0"
@ -1339,12 +1503,30 @@ version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "itertools"
version = "0.10.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473"
dependencies = [
"either",
]
[[package]] [[package]]
name = "itoa" name = "itoa"
version = "1.0.15" version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
[[package]]
name = "jpeg-decoder"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f5d4a7da358eff58addd2877a45865158f0d78c911d43a5784ceb7bbf52833b0"
dependencies = [
"rayon",
]
[[package]] [[package]]
name = "js-sys" name = "js-sys"
version = "0.3.77" version = "0.3.77"
@ -1405,6 +1587,12 @@ version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
[[package]]
name = "lebe"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "03087c2bad5e1034e8cace5926dec053fb3790248370865f5117a7d0213354c8"
[[package]] [[package]]
name = "leptonica-plumbing" name = "leptonica-plumbing"
version = "1.4.0" version = "1.4.0"
@ -1529,6 +1717,16 @@ version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94"
[[package]]
name = "matrixmultiply"
version = "0.3.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08"
dependencies = [
"autocfg",
"rawpointer",
]
[[package]] [[package]]
name = "md-5" name = "md-5"
version = "0.10.6" version = "0.10.6"
@ -1574,6 +1772,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316"
dependencies = [ dependencies = [
"adler2", "adler2",
"simd-adler32",
] ]
[[package]] [[package]]
@ -1616,6 +1815,21 @@ dependencies = [
"version_check", "version_check",
] ]
[[package]]
name = "nalgebra"
version = "0.30.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fb2d0de08694bed883320212c18ee3008576bfe8c306f4c3c4a58b4876998be"
dependencies = [
"approx",
"matrixmultiply",
"num-complex",
"num-rational",
"num-traits",
"simba",
"typenum",
]
[[package]] [[package]]
name = "native-tls" name = "native-tls"
version = "0.2.14" version = "0.2.14"
@ -1672,6 +1886,20 @@ dependencies = [
"winapi", "winapi",
] ]
[[package]]
name = "num"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23"
dependencies = [
"num-bigint",
"num-complex",
"num-integer",
"num-iter",
"num-rational",
"num-traits",
]
[[package]] [[package]]
name = "num-bigint" name = "num-bigint"
version = "0.4.6" version = "0.4.6"
@ -1694,11 +1922,20 @@ dependencies = [
"num-integer", "num-integer",
"num-iter", "num-iter",
"num-traits", "num-traits",
"rand", "rand 0.8.5",
"smallvec", "smallvec",
"zeroize", "zeroize",
] ]
[[package]]
name = "num-complex"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
dependencies = [
"num-traits",
]
[[package]] [[package]]
name = "num-conv" name = "num-conv"
version = "0.1.0" version = "0.1.0"
@ -1725,6 +1962,17 @@ dependencies = [
"num-traits", "num-traits",
] ]
[[package]]
name = "num-rational"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
dependencies = [
"num-bigint",
"num-integer",
"num-traits",
]
[[package]] [[package]]
name = "num-traits" name = "num-traits"
version = "0.2.19" version = "0.2.19"
@ -1806,6 +2054,15 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]]
name = "owned_ttf_parser"
version = "0.15.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05e6affeb1632d6ff6a23d2cd40ffed138e82f1532571a26f527c8a284bb2fbb"
dependencies = [
"ttf-parser",
]
[[package]] [[package]]
name = "parking_lot" name = "parking_lot"
version = "0.12.4" version = "0.12.4"
@ -1940,6 +2197,19 @@ version = "0.3.32"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
[[package]]
name = "png"
version = "0.17.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "82151a2fc869e011c153adc57cf2789ccb8d9906ce52c0b39a6b5697749d7526"
dependencies = [
"bitflags 1.3.2",
"crc32fast",
"fdeflate",
"flate2",
"miniz_oxide",
]
[[package]] [[package]]
name = "pom" name = "pom"
version = "1.1.0" version = "1.1.0"
@ -2009,6 +2279,15 @@ dependencies = [
"unicode-ident", "unicode-ident",
] ]
[[package]]
name = "qoi"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f6d64c71eb498fe9eae14ce4ec935c555749aef511cca85b5568910d6e48001"
dependencies = [
"bytemuck",
]
[[package]] [[package]]
name = "quote" name = "quote"
version = "1.0.40" version = "1.0.40"
@ -2024,6 +2303,19 @@ version = "5.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5"
[[package]]
name = "rand"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
dependencies = [
"getrandom 0.1.16",
"libc",
"rand_chacha 0.2.2",
"rand_core 0.5.1",
"rand_hc",
]
[[package]] [[package]]
name = "rand" name = "rand"
version = "0.8.5" version = "0.8.5"
@ -2031,8 +2323,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [ dependencies = [
"libc", "libc",
"rand_chacha", "rand_chacha 0.3.1",
"rand_core", "rand_core 0.6.4",
]
[[package]]
name = "rand_chacha"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402"
dependencies = [
"ppv-lite86",
"rand_core 0.5.1",
] ]
[[package]] [[package]]
@ -2042,7 +2344,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [ dependencies = [
"ppv-lite86", "ppv-lite86",
"rand_core", "rand_core 0.6.4",
]
[[package]]
name = "rand_core"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
dependencies = [
"getrandom 0.1.16",
] ]
[[package]] [[package]]
@ -2054,12 +2365,56 @@ dependencies = [
"getrandom 0.2.16", "getrandom 0.2.16",
] ]
[[package]]
name = "rand_distr"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96977acbdd3a6576fb1d27391900035bf3863d4a16422973a409b488cf29ffb2"
dependencies = [
"rand 0.7.3",
]
[[package]]
name = "rand_hc"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c"
dependencies = [
"rand_core 0.5.1",
]
[[package]] [[package]]
name = "rangemap" name = "rangemap"
version = "1.5.1" version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f60fcc7d6849342eff22c4350c8b9a989ee8ceabc4b481253e8946b9fe83d684" checksum = "f60fcc7d6849342eff22c4350c8b9a989ee8ceabc4b481253e8946b9fe83d684"
[[package]]
name = "rawpointer"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3"
[[package]]
name = "rayon"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
dependencies = [
"either",
"rayon-core",
]
[[package]]
name = "rayon-core"
version = "1.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
dependencies = [
"crossbeam-deque",
"crossbeam-utils",
]
[[package]] [[package]]
name = "readur" name = "readur"
version = "0.1.0" version = "0.1.0"
@ -2073,10 +2428,13 @@ dependencies = [
"dotenvy", "dotenvy",
"futures-util", "futures-util",
"hostname", "hostname",
"image",
"imageproc",
"jsonwebtoken", "jsonwebtoken",
"mime_guess", "mime_guess",
"notify", "notify",
"pdf-extract", "pdf-extract",
"regex",
"reqwest", "reqwest",
"serde", "serde",
"serde_json", "serde_json",
@ -2203,7 +2561,7 @@ dependencies = [
"num-traits", "num-traits",
"pkcs1", "pkcs1",
"pkcs8", "pkcs8",
"rand_core", "rand_core 0.6.4",
"signature", "signature",
"spki", "spki",
"subtle", "subtle",
@ -2312,6 +2670,16 @@ dependencies = [
"untrusted", "untrusted",
] ]
[[package]]
name = "rusttype"
version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3ff8374aa04134254b7995b63ad3dc41c7f7236f69528b28553da7d72efaa967"
dependencies = [
"ab_glyph_rasterizer",
"owned_ttf_parser",
]
[[package]] [[package]]
name = "rustversion" name = "rustversion"
version = "1.0.21" version = "1.0.21"
@ -2324,6 +2692,15 @@ version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
[[package]]
name = "safe_arch"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96b02de82ddbe1b636e6170c21be622223aea188ef2e139be0a5b219ec215323"
dependencies = [
"bytemuck",
]
[[package]] [[package]]
name = "same-file" name = "same-file"
version = "1.0.6" version = "1.0.6"
@ -2510,9 +2887,28 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de"
dependencies = [ dependencies = [
"digest", "digest",
"rand_core", "rand_core 0.6.4",
] ]
[[package]]
name = "simba"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f3fd720c48c53cace224ae62bef1bbff363a70c68c4802a78b5cc6159618176"
dependencies = [
"approx",
"num-complex",
"num-traits",
"paste",
"wide",
]
[[package]]
name = "simd-adler32"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe"
[[package]] [[package]]
name = "simple_asn1" name = "simple_asn1"
version = "0.6.3" version = "0.6.3"
@ -2705,7 +3101,7 @@ dependencies = [
"memchr", "memchr",
"once_cell", "once_cell",
"percent-encoding", "percent-encoding",
"rand", "rand 0.8.5",
"rsa", "rsa",
"serde", "serde",
"sha1", "sha1",
@ -2746,7 +3142,7 @@ dependencies = [
"md-5", "md-5",
"memchr", "memchr",
"once_cell", "once_cell",
"rand", "rand 0.8.5",
"serde", "serde",
"serde_json", "serde_json",
"sha2", "sha2",
@ -2943,7 +3339,7 @@ dependencies = [
"hex", "hex",
"hmac", "hmac",
"log", "log",
"rand", "rand 0.8.5",
"serde", "serde",
"serde_json", "serde_json",
"sha2", "sha2",
@ -3008,6 +3404,17 @@ dependencies = [
"once_cell", "once_cell",
] ]
[[package]]
name = "tiff"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba1310fcea54c6a9a4fd1aad794ecc02c31682f6bfbecdf460bf19533eed1e3e"
dependencies = [
"flate2",
"jpeg-decoder",
"weezl",
]
[[package]] [[package]]
name = "time" name = "time"
version = "0.3.41" version = "0.3.41"
@ -3259,6 +3666,12 @@ version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
[[package]]
name = "ttf-parser"
version = "0.15.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b3e06c9b9d80ed6b745c7159c40b311ad2916abb34a49e9be2653b90db0d8dd"
[[package]] [[package]]
name = "type1-encoding-parser" name = "type1-encoding-parser"
version = "0.1.0" version = "0.1.0"
@ -3445,6 +3858,12 @@ dependencies = [
"try-lock", "try-lock",
] ]
[[package]]
name = "wasi"
version = "0.9.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519"
[[package]] [[package]]
name = "wasi" name = "wasi"
version = "0.11.1+wasi-snapshot-preview1" version = "0.11.1+wasi-snapshot-preview1"
@ -3581,6 +4000,16 @@ dependencies = [
"wasite", "wasite",
] ]
[[package]]
name = "wide"
version = "0.7.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41b5576b9a81633f3e8df296ce0063042a73507636cbe956c61133dd7034ab22"
dependencies = [
"bytemuck",
"safe_arch",
]
[[package]] [[package]]
name = "winapi" name = "winapi"
version = "0.3.9" version = "0.3.9"
@ -4023,3 +4452,12 @@ dependencies = [
"crossbeam-utils", "crossbeam-utils",
"flate2", "flate2",
] ]
[[package]]
name = "zune-inflate"
version = "0.2.54"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73ab332fe2f6680068f3582b16a24f90ad7096d5d39b974d1c0aff0125116f02"
dependencies = [
"simd-adler32",
]

View File

@ -11,6 +11,7 @@ tower-http = { version = "0.5", features = ["cors", "fs"] }
serde = { version = "1", features = ["derive"] } serde = { version = "1", features = ["derive"] }
serde_json = "1" serde_json = "1"
sqlx = { version = "0.7", features = ["runtime-tokio-rustls", "postgres", "sqlite", "chrono", "uuid"] } sqlx = { version = "0.7", features = ["runtime-tokio-rustls", "postgres", "sqlite", "chrono", "uuid"] }
regex = "1.0"
uuid = { version = "1", features = ["v4", "serde"] } uuid = { version = "1", features = ["v4", "serde"] }
chrono = { version = "0.4", features = ["serde"] } chrono = { version = "0.4", features = ["serde"] }
bcrypt = "0.15" bcrypt = "0.15"
@ -25,6 +26,8 @@ notify = "6"
mime_guess = "2" mime_guess = "2"
tesseract = { version = "0.15", optional = true } tesseract = { version = "0.15", optional = true }
pdf-extract = { version = "0.7", optional = true } pdf-extract = { version = "0.7", optional = true }
image = { version = "0.24", features = ["png", "jpeg", "tiff", "bmp"], optional = true }
imageproc = { version = "0.23", optional = true }
reqwest = { version = "0.11", features = ["json", "multipart"] } reqwest = { version = "0.11", features = ["json", "multipart"] }
dotenvy = "0.15" dotenvy = "0.15"
hostname = "0.4" hostname = "0.4"
@ -35,7 +38,7 @@ utoipa-swagger-ui = { version = "6", features = ["axum"] }
[features] [features]
default = ["ocr"] default = ["ocr"]
ocr = ["tesseract", "pdf-extract"] ocr = ["tesseract", "pdf-extract", "image", "imageproc"]
[dev-dependencies] [dev-dependencies]
tempfile = "3" tempfile = "3"

View File

@ -132,8 +132,9 @@ const GlobalSearchBar = ({ sx, ...props }) => {
const response = await documentService.enhancedSearch({ const response = await documentService.enhancedSearch({
query: searchQuery.trim(), query: searchQuery.trim(),
limit: 5, // Show only top 5 results in global search limit: 5, // Show only top 5 results in global search
include_snippets: false, // Don't need snippets for quick search include_snippets: true, // Include snippets for context
search_mode: 'simple', snippet_length: 100, // Shorter snippets for quick search
search_mode: searchQuery.length < 4 ? 'fuzzy' : 'simple', // Use fuzzy for short queries (substring matching)
}); });
clearInterval(progressInterval); clearInterval(progressInterval);
@ -240,6 +241,76 @@ const GlobalSearchBar = ({ sx, ...props }) => {
return Math.round(bytes / Math.pow(1024, i) * 100) / 100 + ' ' + sizes[i]; return Math.round(bytes / Math.pow(1024, i) * 100) / 100 + ' ' + sizes[i];
}; };
// Function to highlight search terms in text (including substrings)
const highlightText = useCallback((text, searchTerm) => {
if (!searchTerm || !text) return text;
const terms = searchTerm.toLowerCase().split(/\s+/).filter(term => term.length >= 2);
let highlightedText = text;
terms.forEach(term => {
const regex = new RegExp(`(${term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')})`, 'gi');
highlightedText = highlightedText.replace(regex, (match) => `**${match}**`);
});
// Split by ** markers and create spans
const parts = highlightedText.split(/\*\*(.*?)\*\*/);
return parts.map((part, index) => {
if (index % 2 === 1) {
// This is a highlighted part
return (
<Box
key={index}
component="mark"
sx={{
backgroundColor: 'primary.light',
color: 'primary.contrastText',
padding: '0 2px',
borderRadius: '2px',
fontWeight: 600,
}}
>
{part}
</Box>
);
}
return part;
});
}, []);
// Build a short snippet of `filename` centered on the search term so the user
// can see why a result matched. Prefers an exact substring match with ~10
// characters of context on each side; falls back to a window of neighboring
// words, and finally to the full filename.
const generateContextSnippet = useCallback((filename, searchTerm) => {
if (!searchTerm || !filename) return filename;

const lowerFilename = filename.toLowerCase();
const lowerTerm = searchTerm.toLowerCase();

// Exact (case-insensitive) substring match: show context around the match.
const exactMatch = lowerFilename.indexOf(lowerTerm);
if (exactMatch !== -1) {
const start = Math.max(0, exactMatch - 10);
const end = Math.min(filename.length, exactMatch + searchTerm.length + 10);
const snippet = filename.substring(start, end);
// Mark truncation on BOTH sides (the original only marked the leading
// side, so snippets cut off at the end looked complete).
const prefix = start > 0 ? '...' : '';
const suffix = end < filename.length ? '...' : '';
return `${prefix}${snippet}${suffix}`;
}

// Fall back to a word that overlaps the term. Drop empty fragments produced
// by consecutive separators: `lowerTerm.includes('')` is always true, so an
// empty fragment previously counted as a spurious "match".
const words = filename.split(/[_\-\s.]/).filter(word => word.length > 0);
const matchingWord = words.find(word =>
word.toLowerCase().includes(lowerTerm) || lowerTerm.includes(word.toLowerCase())
);

if (matchingWord) {
// Show the matching word plus one word of context on each side.
const wordIndex = words.indexOf(matchingWord);
const contextWords = words.slice(
Math.max(0, wordIndex - 1),
Math.min(words.length, wordIndex + 2)
);
return contextWords.join(' ');
}

return filename;
}, []);
return ( return (
<ClickAwayListener onClickAway={handleClickAway}> <ClickAwayListener onClickAway={handleClickAway}>
<Box sx={{ position: 'relative', ...sx }} {...props}> <Box sx={{ position: 'relative', ...sx }} {...props}>
@ -434,34 +505,54 @@ const GlobalSearchBar = ({ sx, ...props }) => {
whiteSpace: 'nowrap', whiteSpace: 'nowrap',
}} }}
> >
{doc.original_filename} {highlightText(generateContextSnippet(doc.original_filename, query), query)}
</Typography> </Typography>
} }
secondary={ secondary={
<Stack direction="row" spacing={1} alignItems="center"> <Box>
<Typography variant="caption" color="text.secondary"> <Stack direction="row" spacing={1} alignItems="center" sx={{ mb: 0.5 }}>
{formatFileSize(doc.file_size)} <Typography variant="caption" color="text.secondary">
</Typography> {formatFileSize(doc.file_size)}
{doc.has_ocr_text && ( </Typography>
<Chip {doc.has_ocr_text && (
label="OCR" <Chip
size="small" label="OCR"
color="success" size="small"
variant="outlined" color="success"
sx={{ height: 16, fontSize: '0.6rem' }} variant="outlined"
/> sx={{ height: 16, fontSize: '0.6rem' }}
/>
)}
{doc.search_rank && (
<Chip
icon={<TrendingIcon sx={{ fontSize: 10 }} />}
label={`${(doc.search_rank * 100).toFixed(0)}%`}
size="small"
color="info"
variant="outlined"
sx={{ height: 16, fontSize: '0.6rem' }}
/>
)}
</Stack>
{/* Show content snippet if available */}
{doc.snippets && doc.snippets.length > 0 && (
<Typography
variant="caption"
color="text.secondary"
sx={{
display: 'block',
overflow: 'hidden',
textOverflow: 'ellipsis',
whiteSpace: 'nowrap',
fontSize: '0.7rem',
fontStyle: 'italic',
}}
>
{highlightText(doc.snippets[0].text.substring(0, 80) + '...', query)}
</Typography>
)} )}
{doc.search_rank && ( </Box>
<Chip
icon={<TrendingIcon sx={{ fontSize: 10 }} />}
label={`${(doc.search_rank * 100).toFixed(0)}%`}
size="small"
color="info"
variant="outlined"
sx={{ height: 16, fontSize: '0.6rem' }}
/>
)}
</Stack>
} }
/> />
</ListItem> </ListItem>

284
src/db.rs
View File

@ -85,6 +85,15 @@ impl Database {
.execute(&self.pool) .execute(&self.pool)
.await?; .await?;
// Enhanced indexes for substring matching and similarity
sqlx::query(r#"CREATE INDEX IF NOT EXISTS idx_documents_filename_trgm ON documents USING GIN(filename gin_trgm_ops)"#)
.execute(&self.pool)
.await?;
sqlx::query(r#"CREATE INDEX IF NOT EXISTS idx_documents_content_trgm ON documents USING GIN((COALESCE(content, '') || ' ' || COALESCE(ocr_text, '')) gin_trgm_ops)"#)
.execute(&self.pool)
.await?;
// Create settings table // Create settings table
sqlx::query( sqlx::query(
r#" r#"
@ -107,6 +116,15 @@ impl Database {
memory_limit_mb INT DEFAULT 512, memory_limit_mb INT DEFAULT 512,
cpu_priority VARCHAR(10) DEFAULT 'normal', cpu_priority VARCHAR(10) DEFAULT 'normal',
enable_background_ocr BOOLEAN DEFAULT TRUE, enable_background_ocr BOOLEAN DEFAULT TRUE,
ocr_page_segmentation_mode INT DEFAULT 3,
ocr_engine_mode INT DEFAULT 3,
ocr_min_confidence REAL DEFAULT 30.0,
ocr_dpi INT DEFAULT 300,
ocr_enhance_contrast BOOLEAN DEFAULT TRUE,
ocr_remove_noise BOOLEAN DEFAULT TRUE,
ocr_detect_orientation BOOLEAN DEFAULT TRUE,
ocr_whitelist_chars TEXT,
ocr_blacklist_chars TEXT,
created_at TIMESTAMPTZ DEFAULT NOW(), created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW() updated_at TIMESTAMPTZ DEFAULT NOW()
) )
@ -492,28 +510,78 @@ impl Database {
pub async fn enhanced_search_documents(&self, user_id: Uuid, search: SearchRequest) -> Result<(Vec<EnhancedDocumentResponse>, i64, u64)> { pub async fn enhanced_search_documents(&self, user_id: Uuid, search: SearchRequest) -> Result<(Vec<EnhancedDocumentResponse>, i64, u64)> {
let start_time = std::time::Instant::now(); let start_time = std::time::Instant::now();
// Build search query based on search mode // Build search query based on search mode with enhanced substring matching
let search_mode = search.search_mode.as_ref().unwrap_or(&SearchMode::Simple); let search_mode = search.search_mode.as_ref().unwrap_or(&SearchMode::Simple);
let query_function = match search_mode {
SearchMode::Simple => "plainto_tsquery",
SearchMode::Phrase => "phraseto_tsquery",
SearchMode::Fuzzy => "plainto_tsquery", // Could be enhanced with similarity
SearchMode::Boolean => "to_tsquery",
};
let mut query_builder = sqlx::QueryBuilder::new(&format!(
r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, tags, created_at, updated_at, user_id,
ts_rank(to_tsvector('english', COALESCE(content, '') || ' ' || COALESCE(ocr_text, '')), {}('english', "#,
query_function
));
query_builder.push_bind(&search.query); // For fuzzy mode, we'll use similarity matching which is better for substrings
query_builder.push(&format!(")) as rank FROM documents WHERE user_id = ")); let use_similarity = matches!(search_mode, SearchMode::Fuzzy);
query_builder.push_bind(user_id);
query_builder.push(&format!(" AND to_tsvector('english', COALESCE(content, '') || ' ' || COALESCE(ocr_text, '')) @@ {}('english', ", query_function)); let mut query_builder = if use_similarity {
query_builder.push_bind(&search.query); // Use trigram similarity for substring matching
query_builder.push(")"); let mut builder = sqlx::QueryBuilder::new(
r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, tags, created_at, updated_at, user_id,
GREATEST(
similarity(filename, "#
);
builder.push_bind(&search.query);
builder.push(r#"),
similarity(COALESCE(content, '') || ' ' || COALESCE(ocr_text, ''), "#);
builder.push_bind(&search.query);
builder.push(r#"),
ts_rank(to_tsvector('english', COALESCE(content, '') || ' ' || COALESCE(ocr_text, '')), plainto_tsquery('english', "#);
builder.push_bind(&search.query);
builder.push(r#"))
) as rank
FROM documents
WHERE user_id = "#);
builder.push_bind(user_id);
builder.push(r#" AND (
filename % "#);
builder.push_bind(&search.query);
builder.push(r#" OR
(COALESCE(content, '') || ' ' || COALESCE(ocr_text, '')) % "#);
builder.push_bind(&search.query);
builder.push(r#" OR
to_tsvector('english', COALESCE(content, '') || ' ' || COALESCE(ocr_text, '')) @@ plainto_tsquery('english', "#);
builder.push_bind(&search.query);
builder.push(r#")
)"#);
builder
} else {
// Use traditional full-text search with enhanced ranking
let query_function = match search_mode {
SearchMode::Simple => "plainto_tsquery",
SearchMode::Phrase => "phraseto_tsquery",
SearchMode::Boolean => "to_tsquery",
SearchMode::Fuzzy => "plainto_tsquery", // fallback
};
let mut builder = sqlx::QueryBuilder::new(&format!(
r#"
SELECT id, filename, original_filename, file_path, file_size, mime_type, content, ocr_text, tags, created_at, updated_at, user_id,
GREATEST(
CASE WHEN filename ILIKE '%' || "#
));
builder.push_bind(&search.query);
builder.push(&format!(r#"' || '%' THEN 0.8 ELSE 0 END,
ts_rank(to_tsvector('english', COALESCE(content, '') || ' ' || COALESCE(ocr_text, '')), {}('english', "#, query_function));
builder.push_bind(&search.query);
builder.push(&format!(r#"))
) as rank
FROM documents
WHERE user_id = "#));
builder.push_bind(user_id);
builder.push(&format!(r#" AND (
filename ILIKE '%' || "#));
builder.push_bind(&search.query);
builder.push(&format!(r#" || '%' OR
to_tsvector('english', COALESCE(content, '') || ' ' || COALESCE(ocr_text, '')) @@ {}('english', "#, query_function));
builder.push_bind(&search.query);
builder.push(r#")
)"#);
builder
};
if let Some(tags) = &search.tags { if let Some(tags) = &search.tags {
if !tags.is_empty() { if !tags.is_empty() {
@ -574,6 +642,18 @@ impl Database {
}); });
} }
// Get the query function for total count
let query_function = if use_similarity {
"plainto_tsquery"
} else {
match search_mode {
SearchMode::Simple => "plainto_tsquery",
SearchMode::Phrase => "phraseto_tsquery",
SearchMode::Boolean => "to_tsquery",
SearchMode::Fuzzy => "plainto_tsquery",
}
};
let total_row = sqlx::query(&format!( let total_row = sqlx::query(&format!(
r#" r#"
SELECT COUNT(*) as total FROM documents SELECT COUNT(*) as total FROM documents
@ -603,37 +683,102 @@ impl Database {
(None, None) => return snippets, (None, None) => return snippets,
}; };
// Simple keyword matching for snippets (could be enhanced with better search algorithms) // Enhanced substring matching for better context
let _query_terms: Vec<&str> = query.split_whitespace().collect(); let query_terms: Vec<&str> = query.split_whitespace().collect();
let text_lower = full_text.to_lowercase(); let text_lower = full_text.to_lowercase();
let query_lower = query.to_lowercase(); let query_lower = query.to_lowercase();
// Find matches // Find exact matches first
let mut match_positions = Vec::new();
// 1. Look for exact query matches
for (i, _) in text_lower.match_indices(&query_lower) { for (i, _) in text_lower.match_indices(&query_lower) {
let snippet_start = if i >= snippet_length as usize / 2 { match_positions.push((i, query.len(), "exact"));
i - snippet_length as usize / 2 }
} else {
0 // 2. Look for individual term matches (substring matching)
}; for term in &query_terms {
if term.len() >= 3 { // Only match terms of reasonable length
let term_lower = term.to_lowercase();
for (i, _) in text_lower.match_indices(&term_lower) {
// Check if this isn't already part of an exact match
let is_duplicate = match_positions.iter().any(|(pos, len, _)| {
i >= *pos && i < *pos + *len
});
if !is_duplicate {
match_positions.push((i, term.len(), "term"));
}
}
}
}
// 3. Look for partial word matches (for "docu" -> "document" cases)
for term in &query_terms {
if term.len() >= 3 {
let term_lower = term.to_lowercase();
// Find words that start with our search term
let words_regex = regex::Regex::new(&format!(r"\b{}[a-zA-Z]*\b", regex::escape(&term_lower))).unwrap();
for mat in words_regex.find_iter(&text_lower) {
let is_duplicate = match_positions.iter().any(|(pos, len, _)| {
mat.start() >= *pos && mat.start() < *pos + *len
});
if !is_duplicate {
match_positions.push((mat.start(), mat.end() - mat.start(), "partial"));
}
}
}
}
// Sort matches by position and remove overlaps
match_positions.sort_by_key(|&(pos, _, _)| pos);
// Generate snippets around matches
for (match_pos, match_len, _match_type) in match_positions.iter().take(5) {
let context_size = (snippet_length as usize).saturating_sub(*match_len) / 2;
let snippet_start = match_pos.saturating_sub(context_size);
let snippet_end = std::cmp::min( let snippet_end = std::cmp::min(
snippet_start + snippet_length as usize, match_pos + match_len + context_size,
full_text.len() full_text.len()
); );
if snippet_start < full_text.len() { // Find word boundaries to avoid cutting words
let snippet_start = self.find_word_boundary(&full_text, snippet_start, true);
let snippet_end = self.find_word_boundary(&full_text, snippet_end, false);
if snippet_start < snippet_end && snippet_start < full_text.len() {
let snippet_text = &full_text[snippet_start..snippet_end]; let snippet_text = &full_text[snippet_start..snippet_end];
// Find highlight ranges within this snippet // Find all highlight ranges within this snippet
let mut highlight_ranges = Vec::new(); let mut highlight_ranges = Vec::new();
let snippet_lower = snippet_text.to_lowercase(); let snippet_lower = snippet_text.to_lowercase();
// Highlight exact query match
for (match_start, _) in snippet_lower.match_indices(&query_lower) { for (match_start, _) in snippet_lower.match_indices(&query_lower) {
highlight_ranges.push(HighlightRange { highlight_ranges.push(HighlightRange {
start: match_start as i32, start: match_start as i32,
end: (match_start + query.len()) as i32, end: (match_start + query.len()) as i32,
}); });
} }
// Highlight individual terms if no exact match
if highlight_ranges.is_empty() {
for term in &query_terms {
if term.len() >= 3 {
let term_lower = term.to_lowercase();
for (match_start, _) in snippet_lower.match_indices(&term_lower) {
highlight_ranges.push(HighlightRange {
start: match_start as i32,
end: (match_start + term.len()) as i32,
});
}
}
}
}
// Remove duplicate highlights and sort
highlight_ranges.sort_by_key(|r| r.start);
highlight_ranges.dedup_by_key(|r| r.start);
snippets.push(SearchSnippet { snippets.push(SearchSnippet {
text: snippet_text.to_string(), text: snippet_text.to_string(),
@ -642,7 +787,7 @@ impl Database {
highlight_ranges, highlight_ranges,
}); });
// Limit to a few snippets per document // Limit to avoid too many snippets
if snippets.len() >= 3 { if snippets.len() >= 3 {
break; break;
} }
@ -652,6 +797,29 @@ impl Database {
snippets snippets
} }
/// Snap `pos` to the nearest word boundary in `text`, searching backward
/// (toward the start) or forward (toward the end).
///
/// `pos` is a BYTE offset, matching the offsets produced by `match_indices`
/// in the snippet generator, and the returned value is a byte offset safe to
/// slice with. The previous implementation indexed a `Vec<char>` with the
/// byte offset and then converted a char count back to bytes, which drifted
/// on any multi-byte UTF-8 content.
fn find_word_boundary(&self, text: &str, mut pos: usize, search_backward: bool) -> usize {
    if pos >= text.len() {
        return text.len();
    }
    // Snap to a valid char boundary so the slicing below cannot panic.
    while pos > 0 && !text.is_char_boundary(pos) {
        pos -= 1;
    }
    if search_backward {
        // Walk back over alphanumeric chars until a separator (or the start).
        while pos > 0 {
            match text[..pos].chars().next_back() {
                Some(c) if c.is_alphanumeric() => pos -= c.len_utf8(),
                _ => break,
            }
        }
    } else {
        // Walk forward over alphanumeric chars until a separator (or the end).
        while pos < text.len() {
            match text[pos..].chars().next() {
                Some(c) if c.is_alphanumeric() => pos += c.len_utf8(),
                _ => break,
            }
        }
    }
    pos
}
pub async fn update_document_ocr(&self, id: Uuid, ocr_text: &str) -> Result<()> { pub async fn update_document_ocr(&self, id: Uuid, ocr_text: &str) -> Result<()> {
sqlx::query("UPDATE documents SET ocr_text = $1, updated_at = NOW() WHERE id = $2") sqlx::query("UPDATE documents SET ocr_text = $1, updated_at = NOW() WHERE id = $2")
.bind(ocr_text) .bind(ocr_text)
@ -734,7 +902,10 @@ impl Database {
max_file_size_mb, allowed_file_types, auto_rotate_images, enable_image_preprocessing, max_file_size_mb, allowed_file_types, auto_rotate_images, enable_image_preprocessing,
search_results_per_page, search_snippet_length, fuzzy_search_threshold, search_results_per_page, search_snippet_length, fuzzy_search_threshold,
retention_days, enable_auto_cleanup, enable_compression, memory_limit_mb, retention_days, enable_auto_cleanup, enable_compression, memory_limit_mb,
cpu_priority, enable_background_ocr, created_at, updated_at cpu_priority, enable_background_ocr, ocr_page_segmentation_mode, ocr_engine_mode,
ocr_min_confidence, ocr_dpi, ocr_enhance_contrast, ocr_remove_noise,
ocr_detect_orientation, ocr_whitelist_chars, ocr_blacklist_chars,
created_at, updated_at
FROM settings WHERE user_id = $1"# FROM settings WHERE user_id = $1"#
) )
.bind(user_id) .bind(user_id)
@ -761,6 +932,15 @@ impl Database {
memory_limit_mb: row.get("memory_limit_mb"), memory_limit_mb: row.get("memory_limit_mb"),
cpu_priority: row.get("cpu_priority"), cpu_priority: row.get("cpu_priority"),
enable_background_ocr: row.get("enable_background_ocr"), enable_background_ocr: row.get("enable_background_ocr"),
ocr_page_segmentation_mode: row.get("ocr_page_segmentation_mode"),
ocr_engine_mode: row.get("ocr_engine_mode"),
ocr_min_confidence: row.get("ocr_min_confidence"),
ocr_dpi: row.get("ocr_dpi"),
ocr_enhance_contrast: row.get("ocr_enhance_contrast"),
ocr_remove_noise: row.get("ocr_remove_noise"),
ocr_detect_orientation: row.get("ocr_detect_orientation"),
ocr_whitelist_chars: row.get("ocr_whitelist_chars"),
ocr_blacklist_chars: row.get("ocr_blacklist_chars"),
created_at: row.get("created_at"), created_at: row.get("created_at"),
updated_at: row.get("updated_at"), updated_at: row.get("updated_at"),
})), })),
@ -787,9 +967,11 @@ impl Database {
max_file_size_mb, allowed_file_types, auto_rotate_images, enable_image_preprocessing, max_file_size_mb, allowed_file_types, auto_rotate_images, enable_image_preprocessing,
search_results_per_page, search_snippet_length, fuzzy_search_threshold, search_results_per_page, search_snippet_length, fuzzy_search_threshold,
retention_days, enable_auto_cleanup, enable_compression, memory_limit_mb, retention_days, enable_auto_cleanup, enable_compression, memory_limit_mb,
cpu_priority, enable_background_ocr cpu_priority, enable_background_ocr, ocr_page_segmentation_mode, ocr_engine_mode,
ocr_min_confidence, ocr_dpi, ocr_enhance_contrast, ocr_remove_noise,
ocr_detect_orientation, ocr_whitelist_chars, ocr_blacklist_chars
) )
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26)
ON CONFLICT (user_id) DO UPDATE SET ON CONFLICT (user_id) DO UPDATE SET
ocr_language = $2, ocr_language = $2,
concurrent_ocr_jobs = $3, concurrent_ocr_jobs = $3,
@ -807,12 +989,24 @@ impl Database {
memory_limit_mb = $15, memory_limit_mb = $15,
cpu_priority = $16, cpu_priority = $16,
enable_background_ocr = $17, enable_background_ocr = $17,
ocr_page_segmentation_mode = $18,
ocr_engine_mode = $19,
ocr_min_confidence = $20,
ocr_dpi = $21,
ocr_enhance_contrast = $22,
ocr_remove_noise = $23,
ocr_detect_orientation = $24,
ocr_whitelist_chars = $25,
ocr_blacklist_chars = $26,
updated_at = NOW() updated_at = NOW()
RETURNING id, user_id, ocr_language, concurrent_ocr_jobs, ocr_timeout_seconds, RETURNING id, user_id, ocr_language, concurrent_ocr_jobs, ocr_timeout_seconds,
max_file_size_mb, allowed_file_types, auto_rotate_images, enable_image_preprocessing, max_file_size_mb, allowed_file_types, auto_rotate_images, enable_image_preprocessing,
search_results_per_page, search_snippet_length, fuzzy_search_threshold, search_results_per_page, search_snippet_length, fuzzy_search_threshold,
retention_days, enable_auto_cleanup, enable_compression, memory_limit_mb, retention_days, enable_auto_cleanup, enable_compression, memory_limit_mb,
cpu_priority, enable_background_ocr, created_at, updated_at cpu_priority, enable_background_ocr, ocr_page_segmentation_mode, ocr_engine_mode,
ocr_min_confidence, ocr_dpi, ocr_enhance_contrast, ocr_remove_noise,
ocr_detect_orientation, ocr_whitelist_chars, ocr_blacklist_chars,
created_at, updated_at
"# "#
) )
.bind(user_id) .bind(user_id)
@ -832,6 +1026,15 @@ impl Database {
.bind(settings.memory_limit_mb.unwrap_or(current.memory_limit_mb)) .bind(settings.memory_limit_mb.unwrap_or(current.memory_limit_mb))
.bind(settings.cpu_priority.as_ref().unwrap_or(&current.cpu_priority)) .bind(settings.cpu_priority.as_ref().unwrap_or(&current.cpu_priority))
.bind(settings.enable_background_ocr.unwrap_or(current.enable_background_ocr)) .bind(settings.enable_background_ocr.unwrap_or(current.enable_background_ocr))
.bind(settings.ocr_page_segmentation_mode.unwrap_or(current.ocr_page_segmentation_mode))
.bind(settings.ocr_engine_mode.unwrap_or(current.ocr_engine_mode))
.bind(settings.ocr_min_confidence.unwrap_or(current.ocr_min_confidence))
.bind(settings.ocr_dpi.unwrap_or(current.ocr_dpi))
.bind(settings.ocr_enhance_contrast.unwrap_or(current.ocr_enhance_contrast))
.bind(settings.ocr_remove_noise.unwrap_or(current.ocr_remove_noise))
.bind(settings.ocr_detect_orientation.unwrap_or(current.ocr_detect_orientation))
.bind(settings.ocr_whitelist_chars.as_ref().unwrap_or(&current.ocr_whitelist_chars))
.bind(settings.ocr_blacklist_chars.as_ref().unwrap_or(&current.ocr_blacklist_chars))
.fetch_one(&self.pool) .fetch_one(&self.pool)
.await?; .await?;
@ -854,6 +1057,15 @@ impl Database {
memory_limit_mb: row.get("memory_limit_mb"), memory_limit_mb: row.get("memory_limit_mb"),
cpu_priority: row.get("cpu_priority"), cpu_priority: row.get("cpu_priority"),
enable_background_ocr: row.get("enable_background_ocr"), enable_background_ocr: row.get("enable_background_ocr"),
ocr_page_segmentation_mode: row.get("ocr_page_segmentation_mode"),
ocr_engine_mode: row.get("ocr_engine_mode"),
ocr_min_confidence: row.get("ocr_min_confidence"),
ocr_dpi: row.get("ocr_dpi"),
ocr_enhance_contrast: row.get("ocr_enhance_contrast"),
ocr_remove_noise: row.get("ocr_remove_noise"),
ocr_detect_orientation: row.get("ocr_detect_orientation"),
ocr_whitelist_chars: row.get("ocr_whitelist_chars"),
ocr_blacklist_chars: row.get("ocr_blacklist_chars"),
created_at: row.get("created_at"), created_at: row.get("created_at"),
updated_at: row.get("updated_at"), updated_at: row.get("updated_at"),
}) })

655
src/enhanced_ocr.rs Normal file
View File

@ -0,0 +1,655 @@
use anyhow::Result;
use tracing::{debug, info, warn};
#[cfg(feature = "ocr")]
use image::{DynamicImage, ImageBuffer, Luma, GenericImageView};
#[cfg(feature = "ocr")]
use imageproc::{
contrast::adaptive_threshold,
morphology::{close, open},
filter::{median_filter, gaussian_blur_f32},
distance_transform::Norm,
};
#[cfg(feature = "ocr")]
use tesseract::{Tesseract, PageSegMode, OcrEngineMode};
use crate::models::Settings;
/// Quality metrics computed from a grayscale image; the preprocessing
/// pipeline uses these to decide how aggressively to enhance the image.
#[derive(Debug, Clone)]
pub struct ImageQualityStats {
    /// Mean pixel intensity on the 0-255 scale; low values indicate a dim image.
    pub average_brightness: f32,
    /// Standard deviation of pixel values normalized to 0-1; low means "flat".
    pub contrast_ratio: f32,
    /// Mean absolute deviation of sampled pixels from their 3x3 neighborhood
    /// average, normalized to 0-1; higher means noisier.
    pub noise_level: f32,
    /// Mean gradient magnitude normalized to 0-1; low values suggest blur.
    pub sharpness: f32,
}
/// Outcome of a single OCR / text-extraction run.
#[derive(Debug, Clone)]
pub struct OcrResult {
    /// Extracted text, trimmed of surrounding whitespace.
    pub text: String,
    /// Estimated confidence in percent (0-100). For image OCR this is
    /// currently a fixed estimate; PDF text extraction reports 95.0.
    pub confidence: f32,
    /// Wall-clock time spent on extraction, in milliseconds.
    pub processing_time_ms: u64,
    /// Number of whitespace-separated words in `text`.
    pub word_count: usize,
    /// Human-readable labels for preprocessing steps that were applied.
    pub preprocessing_applied: Vec<String>,
}
/// OCR front-end that preprocesses images (orientation, upscaling, denoise,
/// contrast, sharpening) before handing them to Tesseract.
pub struct EnhancedOcrService {
    /// Directory for intermediate processed images; the temporary file is
    /// deleted after recognition completes.
    pub temp_dir: String,
}
impl EnhancedOcrService {
pub fn new(temp_dir: String) -> Self {
Self { temp_dir }
}
/// Extract text from image with high-quality OCR settings.
///
/// When `settings.enable_image_preprocessing` is on, the image is first run
/// through the preprocessing pipeline and the temporary result is deleted
/// after recognition. Returns the recognized text together with timing,
/// word-count and confidence metadata.
#[cfg(feature = "ocr")]
pub async fn extract_text_from_image(&self, file_path: &str, settings: &Settings) -> Result<OcrResult> {
    let start_time = std::time::Instant::now();
    info!("Starting enhanced OCR for image: {}", file_path);
    let mut preprocessing_applied = Vec::new();
    // Load and preprocess the image (optional, controlled by user settings).
    let processed_image_path = if settings.enable_image_preprocessing {
        let processed_path = self.preprocess_image(file_path, settings).await?;
        preprocessing_applied.push("Image preprocessing enabled".to_string());
        processed_path
    } else {
        file_path.to_string()
    };
    // Configure Tesseract with optimal settings and attach the image.
    let mut tesseract = self.configure_tesseract(&processed_image_path, settings)?;
    // Extract text with confidence
    let text = tesseract.get_text()?.trim().to_string();
    let confidence = self.calculate_overall_confidence(&mut tesseract)?;
    // Clean up the temporary preprocessed file, if one was created.
    // Best-effort: a failed delete must not fail the OCR call.
    if processed_image_path != file_path {
        let _ = std::fs::remove_file(&processed_image_path);
    }
    let processing_time = start_time.elapsed().as_millis() as u64;
    let word_count = text.split_whitespace().count();
    debug!(
        "OCR completed: {} words, {:.1}% confidence, {}ms",
        word_count, confidence, processing_time
    );
    Ok(OcrResult {
        text,
        confidence,
        processing_time_ms: processing_time,
        word_count,
        preprocessing_applied,
    })
}
/// Preprocess image for optimal OCR quality, especially for challenging conditions.
///
/// Pipeline order matters: orientation correction -> upscaling -> grayscale
/// -> quality analysis -> brightness/contrast boost -> denoise -> adaptive
/// threshold -> sharpening -> morphology. Writes the result to a uniquely
/// named PNG in `temp_dir` and returns its path; the caller is responsible
/// for deleting the file.
#[cfg(feature = "ocr")]
async fn preprocess_image(&self, input_path: &str, settings: &Settings) -> Result<String> {
    let img = image::open(input_path)?;
    let mut processed_img = img;
    info!("Original image dimensions: {}x{}", processed_img.width(), processed_img.height());
    // Apply orientation detection and correction
    if settings.ocr_detect_orientation {
        processed_img = self.detect_and_correct_orientation(processed_img)?;
    }
    // Aggressively upscale low-resolution images for better OCR
    processed_img = self.smart_resize_for_ocr(processed_img, settings.ocr_dpi)?;
    // Convert to grayscale for better OCR
    let gray_img = processed_img.to_luma8();
    let mut processed_gray = gray_img;
    // Analyze image quality once; all later steps key off these stats.
    let quality_stats = self.analyze_image_quality(&processed_gray);
    info!("Image quality analysis: brightness={:.1}, contrast={:.1}, noise_level={:.1}",
        quality_stats.average_brightness, quality_stats.contrast_ratio, quality_stats.noise_level);
    // Apply adaptive brightness correction for dim or flat images.
    if quality_stats.average_brightness < 80.0 || quality_stats.contrast_ratio < 0.3 {
        processed_gray = self.enhance_brightness_and_contrast(processed_gray, &quality_stats)?;
    }
    // Noise removal runs even when disabled in settings if the image is
    // measurably noisy.
    if settings.ocr_remove_noise || quality_stats.noise_level > 0.15 {
        processed_gray = self.adaptive_noise_removal(processed_gray, &quality_stats)?;
    }
    // Apply contrast enhancement (adaptive based on image quality)
    if settings.ocr_enhance_contrast {
        processed_gray = self.adaptive_contrast_enhancement(processed_gray, &quality_stats)?;
    }
    // Apply sharpening for blurry images
    if quality_stats.sharpness < 0.4 {
        processed_gray = self.sharpen_image(processed_gray)?;
    }
    // Apply morphological operations for text clarity
    processed_gray = self.apply_morphological_operations(processed_gray)?;
    // Save to a temp file; PID + timestamp keeps concurrent jobs from colliding.
    let temp_filename = format!("processed_{}_{}.png",
        std::process::id(),
        std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)?.as_millis()
    );
    let temp_path = format!("{}/{}", self.temp_dir, temp_filename);
    let dynamic_processed = DynamicImage::ImageLuma8(processed_gray);
    dynamic_processed.save(&temp_path)?;
    info!("Processed image saved to: {}", temp_path);
    Ok(temp_path)
}
/// Configure Tesseract with optimal settings.
///
/// Applies the user's language, page segmentation mode, DPI and character
/// whitelist/blacklist from `settings`, plus a fixed set of tuned variables
/// aimed at degraded scans. Returns a handle with the image already attached.
#[cfg(feature = "ocr")]
fn configure_tesseract(&self, image_path: &str, settings: &Settings) -> Result<Tesseract> {
    let mut tesseract = Tesseract::new(None, Some(&settings.ocr_language))?;
    // Set the image
    tesseract = tesseract.set_image(image_path)?;
    // Configure Page Segmentation Mode (PSM); values map 1:1 onto Tesseract's
    // numeric PSM codes, anything out of range falls back to full auto.
    let psm = match settings.ocr_page_segmentation_mode {
        0 => PageSegMode::PsmOsdOnly,
        1 => PageSegMode::PsmAutoOsd,
        2 => PageSegMode::PsmAutoOnly,
        3 => PageSegMode::PsmAuto,
        4 => PageSegMode::PsmSingleColumn,
        5 => PageSegMode::PsmSingleBlockVertText,
        6 => PageSegMode::PsmSingleBlock,
        7 => PageSegMode::PsmSingleLine,
        8 => PageSegMode::PsmSingleWord,
        9 => PageSegMode::PsmCircleWord,
        10 => PageSegMode::PsmSingleChar,
        11 => PageSegMode::PsmSparseText,
        12 => PageSegMode::PsmSparseTextOsd,
        13 => PageSegMode::PsmRawLine,
        _ => PageSegMode::PsmAuto, // Default fallback
    };
    tesseract.set_page_seg_mode(psm);
    // Configure OCR Engine Mode (OEM)
    let _oem = match settings.ocr_engine_mode {
        0 => OcrEngineMode::TesseractOnly,
        1 => OcrEngineMode::LstmOnly,
        2 => OcrEngineMode::TesseractLstmCombined,
        3 => OcrEngineMode::Default,
        _ => OcrEngineMode::Default, // Default fallback
    };
    // NOTE(review): `_oem` is computed but never applied — the tesseract
    // crate in use exposes no set_engine_mode; wire this up when available.
    // Set DPI if specified and different from 0
    if settings.ocr_dpi > 0 {
        tesseract = tesseract.set_variable("user_defined_dpi", &settings.ocr_dpi.to_string())?;
    }
    // Restrict / forbid characters when the user configured lists.
    if let Some(ref whitelist) = settings.ocr_whitelist_chars {
        if !whitelist.is_empty() {
            tesseract = tesseract.set_variable("tessedit_char_whitelist", whitelist)?;
        }
    }
    if let Some(ref blacklist) = settings.ocr_blacklist_chars {
        if !blacklist.is_empty() {
            tesseract = tesseract.set_variable("tessedit_char_blacklist", blacklist)?;
        }
    }
    // Additional high-quality settings for challenging images
    tesseract = tesseract.set_variable("preserve_interword_spaces", "1")?;
    tesseract = tesseract.set_variable("tessedit_do_invert", "0")?;
    tesseract = tesseract.set_variable("classify_enable_learning", "0")?;
    tesseract = tesseract.set_variable("textord_really_old_xheight", "1")?;
    tesseract = tesseract.set_variable("textord_min_xheight", "7")?;
    // Enhanced settings for low-quality images
    tesseract = tesseract.set_variable("tessedit_char_unblacklist_fraction", "0.0")?;
    tesseract = tesseract.set_variable("edges_max_children_per_outline", "40")?;
    tesseract = tesseract.set_variable("textord_noise_sizefraction", "10.0")?;
    tesseract = tesseract.set_variable("textord_noise_translimit", "16.0")?;
    tesseract = tesseract.set_variable("textord_noise_normratio", "2.0")?;
    // Improve word breaking for dense text
    tesseract = tesseract.set_variable("textord_tabfind_find_tables", "1")?;
    tesseract = tesseract.set_variable("textord_use_cjk_fp_model", "0")?;
    // Better handling of degraded images
    tesseract = tesseract.set_variable("classify_adapt_feature_threshold", "230")?;
    tesseract = tesseract.set_variable("classify_adapt_proto_threshold", "230")?;
    tesseract = tesseract.set_variable("textord_heavy_nr", "1")?;
    Ok(tesseract)
}
/// Calculate overall confidence score.
///
/// NOTE(review): the tesseract crate in use does not expose per-word
/// confidences, so this returns a fixed 85.0 placeholder rather than a
/// measured value — revisit when `get_word_confidences` (or equivalent)
/// becomes available.
#[cfg(feature = "ocr")]
fn calculate_overall_confidence(&self, _tesseract: &mut Tesseract) -> Result<f32> {
    // Return a reasonable default confidence for now
    Ok(85.0)
}
/// Detect and correct image orientation.
///
/// Heuristic only: an image more than twice as wide as it is tall is assumed
/// to be rotated and is turned 90 degrees. A production implementation could
/// use Tesseract's OSD for real orientation detection instead.
#[cfg(feature = "ocr")]
fn detect_and_correct_orientation(&self, img: DynamicImage) -> Result<DynamicImage> {
    let (width, height) = img.dimensions();
    let aspect_ratio = width as f32 / height as f32;
    // Rotate only extreme landscape images; everything else passes through.
    let corrected = if aspect_ratio > 2.0 { img.rotate90() } else { img };
    Ok(corrected)
}
/// Smart resize for OCR: aggressively upscale low-resolution inputs, or
/// apply DPI-based scaling for everything else.
#[cfg(feature = "ocr")]
fn smart_resize_for_ocr(&self, img: DynamicImage, target_dpi: i32) -> Result<DynamicImage> {
    let (width, height) = img.dimensions();
    let min_dimension = width.min(height);
    // Pick a scale factor: tiny images get boosted to at least 600px on the
    // short side; otherwise honour the configured DPI against a 72 DPI base.
    let scale_factor = if min_dimension < 300 {
        let factor = 600.0 / min_dimension as f32;
        info!("Aggressively upscaling small image by factor {:.2}x", factor);
        Some(factor)
    } else if target_dpi > 0 && target_dpi != 72 {
        Some(target_dpi as f32 / 72.0)
    } else {
        None
    };
    match scale_factor {
        Some(factor) => {
            let new_width = (width as f32 * factor) as u32;
            let new_height = (height as f32 * factor) as u32;
            if new_width == width && new_height == height {
                Ok(img)
            } else {
                // Lanczos3 gives the best quality for upscaling.
                Ok(img.resize(new_width, new_height, image::imageops::FilterType::Lanczos3))
            }
        }
        None => Ok(img),
    }
}
/// Analyze image quality metrics (brightness, contrast, noise, sharpness)
/// for a grayscale image; the preprocessing steps use these to decide how
/// aggressively to enhance the image.
#[cfg(feature = "ocr")]
fn analyze_image_quality(&self, img: &ImageBuffer<Luma<u8>, Vec<u8>>) -> ImageQualityStats {
    let pixels: Vec<u8> = img.pixels().map(|p| p[0]).collect();
    // Guard degenerate (empty) images: the mean/variance below would divide
    // by zero and poison every metric with NaN.
    if pixels.is_empty() {
        return ImageQualityStats {
            average_brightness: 0.0,
            contrast_ratio: 0.0,
            noise_level: 0.0,
            sharpness: 0.0,
        };
    }
    let pixel_count = pixels.len() as f32;
    // Mean intensity (0-255). Accumulate in u64: a u32 sum overflows for
    // bright images above roughly 16.8 megapixels.
    let sum: u64 = pixels.iter().map(|&p| p as u64).sum();
    let average_brightness = sum as f32 / pixel_count;
    // Contrast as the normalized standard deviation of pixel values.
    let variance: f32 = pixels.iter()
        .map(|&p| {
            let diff = p as f32 - average_brightness;
            diff * diff
        })
        .sum::<f32>() / pixel_count;
    let std_dev = variance.sqrt();
    let contrast_ratio = std_dev / 255.0;
    // Estimate noise level using local variance
    let noise_level = self.estimate_noise_level(img);
    // Estimate sharpness using gradient magnitude
    let sharpness = self.estimate_sharpness(img);
    ImageQualityStats {
        average_brightness,
        contrast_ratio,
        noise_level,
        sharpness,
    }
}
/// Estimate noise level in image (0.0 = clean, approaching 1.0 = very noisy).
///
/// Samples every 10th pixel (with a 5px margin) and measures its absolute
/// deviation from the mean of its 3x3 neighborhood; the average deviation
/// is normalized by 255.
#[cfg(feature = "ocr")]
fn estimate_noise_level(&self, img: &ImageBuffer<Luma<u8>, Vec<u8>>) -> f32 {
    let (width, height) = img.dimensions();
    // Guard tiny images: the sampling ranges below need a 5px margin and
    // `width - 5` / `height - 5` underflow (u32) and panic otherwise.
    if width <= 10 || height <= 10 {
        return 0.0;
    }
    let mut noise_sum = 0.0f32;
    let mut sample_count = 0u32;
    // Sample every 10th pixel to estimate noise
    for y in (5..height-5).step_by(10) {
        for x in (5..width-5).step_by(10) {
            let center = img.get_pixel(x, y)[0] as f32;
            let mut neighbor_sum = 0.0f32;
            let mut neighbor_count = 0u32;
            // Average the 3x3 neighborhood (excluding the center pixel).
            for dy in -1..=1 {
                for dx in -1..=1 {
                    if dx == 0 && dy == 0 { continue; }
                    let neighbor = img.get_pixel((x as i32 + dx) as u32, (y as i32 + dy) as u32)[0] as f32;
                    neighbor_sum += neighbor;
                    neighbor_count += 1;
                }
            }
            let neighbor_avg = neighbor_sum / neighbor_count as f32;
            let local_variance = (center - neighbor_avg).abs();
            noise_sum += local_variance;
            sample_count += 1;
        }
    }
    if sample_count > 0 {
        (noise_sum / sample_count as f32) / 255.0
    } else {
        0.0
    }
}
/// Estimate image sharpness using gradient magnitude (0.0 = flat/blurry).
///
/// Averages the central-difference gradient magnitude over all interior
/// pixels, normalized by 255.
#[cfg(feature = "ocr")]
fn estimate_sharpness(&self, img: &ImageBuffer<Luma<u8>, Vec<u8>>) -> f32 {
    let (width, height) = img.dimensions();
    // Guard degenerate images: `width - 1` / `height - 1` underflow (u32)
    // for zero-dimension inputs; images under 3px have no interior pixels
    // anyway, so the answer is 0.0 either way.
    if width < 3 || height < 3 {
        return 0.0;
    }
    let mut gradient_sum = 0.0f32;
    let mut sample_count = 0u32;
    // Calculate gradients for interior pixels
    for y in 1..height-1 {
        for x in 1..width-1 {
            let left = img.get_pixel(x-1, y)[0] as f32;
            let right = img.get_pixel(x+1, y)[0] as f32;
            let top = img.get_pixel(x, y-1)[0] as f32;
            let bottom = img.get_pixel(x, y+1)[0] as f32;
            // Central differences in both axes.
            let grad_x = (right - left) / 2.0;
            let grad_y = (bottom - top) / 2.0;
            let gradient_magnitude = (grad_x * grad_x + grad_y * grad_y).sqrt();
            gradient_sum += gradient_magnitude;
            sample_count += 1;
        }
    }
    if sample_count > 0 {
        (gradient_sum / sample_count as f32) / 255.0
    } else {
        0.0
    }
}
/// Brightness and contrast correction for dim or washed-out images.
///
/// Boost amounts are derived from the measured statistics: very dark images
/// get a large brightness lift, flat histograms get a large contrast gain.
#[cfg(feature = "ocr")]
fn enhance_brightness_and_contrast(&self, img: ImageBuffer<Luma<u8>, Vec<u8>>, stats: &ImageQualityStats) -> Result<ImageBuffer<Luma<u8>, Vec<u8>>> {
    // Brightness boost: aggressive below 50, tapered up to 80, none above.
    let brightness_boost = if stats.average_brightness < 50.0 {
        60.0 - stats.average_brightness
    } else if stats.average_brightness < 80.0 {
        30.0 - (stats.average_brightness - 50.0) * 0.5
    } else {
        0.0
    };
    // Contrast gain: the flatter the histogram, the stronger the multiplier.
    let contrast_multiplier = if stats.contrast_ratio < 0.2 {
        2.5
    } else if stats.contrast_ratio < 0.4 {
        1.8
    } else {
        1.2
    };
    info!("Applying brightness boost: {:.1}, contrast multiplier: {:.1}", brightness_boost, contrast_multiplier);
    // Apply the linear transform per pixel, clamping back into u8 range.
    let enhanced = ImageBuffer::from_fn(img.width(), img.height(), |x, y| {
        let value = img.get_pixel(x, y)[0] as f32;
        let adjusted = ((value + brightness_boost) * contrast_multiplier).round();
        Luma([adjusted.max(0.0).min(255.0) as u8])
    });
    Ok(enhanced)
}
/// Noise removal whose strength scales with the measured noise level.
#[cfg(feature = "ocr")]
fn adaptive_noise_removal(&self, img: ImageBuffer<Luma<u8>, Vec<u8>>, stats: &ImageQualityStats) -> Result<ImageBuffer<Luma<u8>, Vec<u8>>> {
    let denoised = match stats.noise_level {
        level if level > 0.2 => {
            // Heavy noise: larger median window followed by a stronger blur.
            let filtered = gaussian_blur_f32(&median_filter(&img, 2, 2), 0.8);
            info!("Applied heavy noise reduction (noise level: {:.2})", stats.noise_level);
            filtered
        }
        level if level > 0.1 => {
            // Moderate noise: small median window and a light blur.
            let filtered = gaussian_blur_f32(&median_filter(&img, 1, 1), 0.5);
            info!("Applied moderate noise reduction");
            filtered
        }
        _ => {
            // Clean (or lightly noisy) image: one small median pass suffices.
            let filtered = median_filter(&img, 1, 1);
            info!("Applied light noise reduction");
            filtered
        }
    };
    Ok(denoised)
}
/// Local contrast enhancement via adaptive thresholding.
///
/// The threshold window is sized from the image dimensions: low-contrast
/// images get smaller windows for more aggressive local adaptation.
#[cfg(feature = "ocr")]
fn adaptive_contrast_enhancement(&self, img: ImageBuffer<Luma<u8>, Vec<u8>>, stats: &ImageQualityStats) -> Result<ImageBuffer<Luma<u8>, Vec<u8>>> {
    let (width, height) = img.dimensions();
    let min_dimension = width.min(height);
    let window = if stats.contrast_ratio < 0.2 {
        (min_dimension / 20).clamp(11, 31)
    } else {
        (min_dimension / 15).clamp(15, 41)
    };
    // adaptive_threshold requires an odd window size; `| 1` bumps even
    // values up by one and leaves odd values unchanged.
    let threshold_size = window | 1;
    info!("Applying adaptive threshold with window size: {}", threshold_size);
    Ok(adaptive_threshold(&img, threshold_size))
}
/// Sharpen blurry images with a 3x3 unsharp-mask kernel.
///
/// Border pixels are copied through unchanged since the kernel needs a full
/// 3x3 neighborhood. Images smaller than 3x3 are returned untouched; the
/// previous version underflowed `height - 1` / `width - 1` (u32) on
/// zero-dimension inputs and panicked.
#[cfg(feature = "ocr")]
fn sharpen_image(&self, img: ImageBuffer<Luma<u8>, Vec<u8>>) -> Result<ImageBuffer<Luma<u8>, Vec<u8>>> {
    let (width, height) = img.dimensions();
    // No interior pixels to convolve: return the image as-is.
    if width < 3 || height < 3 {
        return Ok(img);
    }
    let mut sharpened = ImageBuffer::new(width, height);
    // Unsharp mask kernel - enhances edges while preserving mean intensity.
    let kernel = [
        [0.0, -1.0, 0.0],
        [-1.0, 5.0, -1.0],
        [0.0, -1.0, 0.0],
    ];
    for y in 1..height-1 {
        for x in 1..width-1 {
            let mut sum = 0.0;
            for ky in 0..3 {
                for kx in 0..3 {
                    let px = img.get_pixel(x + kx - 1, y + ky - 1)[0] as f32;
                    sum += px * kernel[ky as usize][kx as usize];
                }
            }
            // Clamp the convolution result back into the valid u8 range.
            let sharpened_value = sum.round().max(0.0).min(255.0) as u8;
            sharpened.put_pixel(x, y, Luma([sharpened_value]));
        }
    }
    // Copy border pixels through unchanged.
    for y in 0..height {
        for x in 0..width {
            if x == 0 || x == width-1 || y == 0 || y == height-1 {
                sharpened.put_pixel(x, y, *img.get_pixel(x, y));
            }
        }
    }
    info!("Applied image sharpening");
    Ok(sharpened)
}
/// Apply morphological operations for text clarity
#[cfg(feature = "ocr")]
fn apply_morphological_operations(&self, img: ImageBuffer<Luma<u8>, Vec<u8>>) -> Result<ImageBuffer<Luma<u8>, Vec<u8>>> {
// Apply opening to remove small noise
let opened = open(&img, Norm::LInf, 1);
// Apply closing to fill small gaps in text
let closed = close(&opened, Norm::LInf, 1);
Ok(closed)
}
/// Extract text from PDF
#[cfg(feature = "ocr")]
pub async fn extract_text_from_pdf(&self, file_path: &str, _settings: &Settings) -> Result<OcrResult> {
let start_time = std::time::Instant::now();
info!("Extracting text from PDF: {}", file_path);
let bytes = std::fs::read(file_path)?;
let text = pdf_extract::extract_text_from_mem(&bytes)?;
let processing_time = start_time.elapsed().as_millis() as u64;
let word_count = text.split_whitespace().count();
Ok(OcrResult {
text: text.trim().to_string(),
confidence: 95.0, // PDF text extraction is generally high confidence
processing_time_ms: processing_time,
word_count,
preprocessing_applied: vec!["PDF text extraction".to_string()],
})
}
/// Extract text from any supported file type
pub async fn extract_text(&self, file_path: &str, mime_type: &str, settings: &Settings) -> Result<OcrResult> {
match mime_type {
"application/pdf" => {
#[cfg(feature = "ocr")]
{
self.extract_text_from_pdf(file_path, settings).await
}
#[cfg(not(feature = "ocr"))]
{
Err(anyhow::anyhow!("OCR feature not enabled"))
}
}
mime if mime.starts_with("image/") => {
#[cfg(feature = "ocr")]
{
self.extract_text_from_image(file_path, settings).await
}
#[cfg(not(feature = "ocr"))]
{
Err(anyhow::anyhow!("OCR feature not enabled"))
}
}
"text/plain" => {
let start_time = std::time::Instant::now();
let text = std::fs::read_to_string(file_path)?;
let processing_time = start_time.elapsed().as_millis() as u64;
let word_count = text.split_whitespace().count();
Ok(OcrResult {
text: text.trim().to_string(),
confidence: 100.0, // Plain text is 100% confident
processing_time_ms: processing_time,
word_count,
preprocessing_applied: vec!["Plain text read".to_string()],
})
}
_ => Err(anyhow::anyhow!("Unsupported file type: {}", mime_type)),
}
}
/// Validate OCR result quality
pub fn validate_ocr_quality(&self, result: &OcrResult, settings: &Settings) -> bool {
// Check minimum confidence threshold
if result.confidence < settings.ocr_min_confidence {
warn!(
"OCR result below confidence threshold: {:.1}% < {:.1}%",
result.confidence, settings.ocr_min_confidence
);
return false;
}
// Check if text is reasonable (not just noise)
if result.word_count == 0 {
warn!("OCR result contains no words");
return false;
}
// Check for reasonable character distribution
let total_chars = result.text.len();
if total_chars == 0 {
return false;
}
let alphanumeric_chars = result.text.chars().filter(|c| c.is_alphanumeric()).count();
let alphanumeric_ratio = alphanumeric_chars as f32 / total_chars as f32;
// Expect at least 30% alphanumeric characters for valid text
if alphanumeric_ratio < 0.3 {
warn!(
"OCR result has low alphanumeric ratio: {:.1}%",
alphanumeric_ratio * 100.0
);
return false;
}
true
}
}
#[cfg(not(feature = "ocr"))]
impl EnhancedOcrService {
    // Shared error text for every disabled-feature stub below.
    const DISABLED_MSG: &'static str = "OCR feature not enabled";

    /// Stub: the `ocr` feature is disabled, so image extraction always fails.
    pub async fn extract_text_from_image(&self, _file_path: &str, _settings: &Settings) -> Result<OcrResult> {
        Err(anyhow::anyhow!(Self::DISABLED_MSG))
    }

    /// Stub: the `ocr` feature is disabled, so PDF extraction always fails.
    pub async fn extract_text_from_pdf(&self, _file_path: &str, _settings: &Settings) -> Result<OcrResult> {
        Err(anyhow::anyhow!(Self::DISABLED_MSG))
    }

    /// Stub: the `ocr` feature is disabled, so extraction always fails.
    pub async fn extract_text(&self, _file_path: &str, _mime_type: &str, _settings: &Settings) -> Result<OcrResult> {
        Err(anyhow::anyhow!(Self::DISABLED_MSG))
    }

    /// Stub: without OCR support no result is ever considered valid.
    pub fn validate_ocr_quality(&self, _result: &OcrResult, _settings: &Settings) -> bool {
        false
    }
}

View File

@ -2,6 +2,7 @@ pub mod auth;
pub mod batch_ingest; pub mod batch_ingest;
pub mod config; pub mod config;
pub mod db; pub mod db;
pub mod enhanced_ocr;
pub mod file_service; pub mod file_service;
pub mod models; pub mod models;
pub mod ocr; pub mod ocr;
@ -26,4 +27,4 @@ pub struct AppState {
/// Health check endpoint for monitoring /// Health check endpoint for monitoring
pub async fn health_check() -> Result<Json<serde_json::Value>, StatusCode> { pub async fn health_check() -> Result<Json<serde_json::Value>, StatusCode> {
Ok(Json(serde_json::json!({"status": "ok"}))) Ok(Json(serde_json::json!({"status": "ok"})))
} }

View File

@ -12,6 +12,7 @@ mod auth;
mod batch_ingest; mod batch_ingest;
mod config; mod config;
mod db; mod db;
mod enhanced_ocr;
mod file_service; mod file_service;
mod models; mod models;
mod ocr; mod ocr;

View File

@ -186,6 +186,15 @@ pub struct Settings {
pub memory_limit_mb: i32, pub memory_limit_mb: i32,
pub cpu_priority: String, pub cpu_priority: String,
pub enable_background_ocr: bool, pub enable_background_ocr: bool,
pub ocr_page_segmentation_mode: i32,
pub ocr_engine_mode: i32,
pub ocr_min_confidence: f32,
pub ocr_dpi: i32,
pub ocr_enhance_contrast: bool,
pub ocr_remove_noise: bool,
pub ocr_detect_orientation: bool,
pub ocr_whitelist_chars: Option<String>,
pub ocr_blacklist_chars: Option<String>,
pub created_at: DateTime<Utc>, pub created_at: DateTime<Utc>,
pub updated_at: DateTime<Utc>, pub updated_at: DateTime<Utc>,
} }
@ -208,6 +217,15 @@ pub struct SettingsResponse {
pub memory_limit_mb: i32, pub memory_limit_mb: i32,
pub cpu_priority: String, pub cpu_priority: String,
pub enable_background_ocr: bool, pub enable_background_ocr: bool,
pub ocr_page_segmentation_mode: i32,
pub ocr_engine_mode: i32,
pub ocr_min_confidence: f32,
pub ocr_dpi: i32,
pub ocr_enhance_contrast: bool,
pub ocr_remove_noise: bool,
pub ocr_detect_orientation: bool,
pub ocr_whitelist_chars: Option<String>,
pub ocr_blacklist_chars: Option<String>,
} }
#[derive(Debug, Serialize, Deserialize, ToSchema)] #[derive(Debug, Serialize, Deserialize, ToSchema)]
@ -228,6 +246,15 @@ pub struct UpdateSettings {
pub memory_limit_mb: Option<i32>, pub memory_limit_mb: Option<i32>,
pub cpu_priority: Option<String>, pub cpu_priority: Option<String>,
pub enable_background_ocr: Option<bool>, pub enable_background_ocr: Option<bool>,
pub ocr_page_segmentation_mode: Option<i32>,
pub ocr_engine_mode: Option<i32>,
pub ocr_min_confidence: Option<f32>,
pub ocr_dpi: Option<i32>,
pub ocr_enhance_contrast: Option<bool>,
pub ocr_remove_noise: Option<bool>,
pub ocr_detect_orientation: Option<bool>,
pub ocr_whitelist_chars: Option<Option<String>>,
pub ocr_blacklist_chars: Option<Option<String>>,
} }
impl From<Settings> for SettingsResponse { impl From<Settings> for SettingsResponse {
@ -249,6 +276,15 @@ impl From<Settings> for SettingsResponse {
memory_limit_mb: settings.memory_limit_mb, memory_limit_mb: settings.memory_limit_mb,
cpu_priority: settings.cpu_priority, cpu_priority: settings.cpu_priority,
enable_background_ocr: settings.enable_background_ocr, enable_background_ocr: settings.enable_background_ocr,
ocr_page_segmentation_mode: settings.ocr_page_segmentation_mode,
ocr_engine_mode: settings.ocr_engine_mode,
ocr_min_confidence: settings.ocr_min_confidence,
ocr_dpi: settings.ocr_dpi,
ocr_enhance_contrast: settings.ocr_enhance_contrast,
ocr_remove_noise: settings.ocr_remove_noise,
ocr_detect_orientation: settings.ocr_detect_orientation,
ocr_whitelist_chars: settings.ocr_whitelist_chars,
ocr_blacklist_chars: settings.ocr_blacklist_chars,
} }
} }
} }
@ -282,6 +318,15 @@ impl Default for Settings {
memory_limit_mb: 512, memory_limit_mb: 512,
cpu_priority: "normal".to_string(), cpu_priority: "normal".to_string(),
enable_background_ocr: true, enable_background_ocr: true,
ocr_page_segmentation_mode: 3, // PSM_AUTO_OSD - Fully automatic page segmentation, but no OSD
ocr_engine_mode: 3, // OEM_DEFAULT - Default, based on what is available
ocr_min_confidence: 30.0, // Minimum confidence threshold (0-100)
ocr_dpi: 300, // Optimal DPI for OCR
ocr_enhance_contrast: true, // Enable contrast enhancement
ocr_remove_noise: true, // Enable noise removal
ocr_detect_orientation: true, // Enable orientation detection
ocr_whitelist_chars: None, // No character whitelist by default
ocr_blacklist_chars: None, // No character blacklist by default
created_at: Utc::now(), created_at: Utc::now(),
updated_at: Utc::now(), updated_at: Utc::now(),
} }

View File

@ -61,6 +61,15 @@ async fn get_settings(
memory_limit_mb: default.memory_limit_mb, memory_limit_mb: default.memory_limit_mb,
cpu_priority: default.cpu_priority, cpu_priority: default.cpu_priority,
enable_background_ocr: default.enable_background_ocr, enable_background_ocr: default.enable_background_ocr,
ocr_page_segmentation_mode: default.ocr_page_segmentation_mode,
ocr_engine_mode: default.ocr_engine_mode,
ocr_min_confidence: default.ocr_min_confidence,
ocr_dpi: default.ocr_dpi,
ocr_enhance_contrast: default.ocr_enhance_contrast,
ocr_remove_noise: default.ocr_remove_noise,
ocr_detect_orientation: default.ocr_detect_orientation,
ocr_whitelist_chars: default.ocr_whitelist_chars,
ocr_blacklist_chars: default.ocr_blacklist_chars,
} }
}, },
}; };

View File

@ -720,6 +720,175 @@ mod tests {
assert!(snippets[0].highlight_ranges.len() >= 3); // Should find multiple "test" instances assert!(snippets[0].highlight_ranges.len() >= 3); // Should find multiple "test" instances
} }
#[test]
fn test_substring_matching_basic() {
let mock_db = MockDatabase::new();
// Test "docu" matching "document"
let content = "This is a document about important documents and documentation.";
let snippets = mock_db.generate_snippets("docu", Some(content), None, 100);
assert!(!snippets.is_empty());
let snippet = &snippets[0];
assert!(snippet.text.to_lowercase().contains("document"));
assert!(!snippet.highlight_ranges.is_empty());
}
#[test]
fn test_substring_matching_partial_words() {
let mock_db = MockDatabase::new();
// Test partial word matching
let content = "The application processes various applications and applicants.";
let snippets = mock_db.generate_snippets("app", Some(content), None, 100);
assert!(!snippets.is_empty());
// Should find matches in "application", "applications", "applicants"
let total_highlights: usize = snippets.iter()
.map(|s| s.highlight_ranges.len())
.sum();
assert!(total_highlights >= 1); // At least one match
}
#[test]
fn test_substring_matching_filename_context() {
let mock_db = MockDatabase::new();
// Test filename matching with context
let content = "Contract agreement between parties for legal documentation.";
let snippets = mock_db.generate_snippets("contr", Some(content), None, 80);
assert!(!snippets.is_empty());
let snippet = &snippets[0];
assert!(snippet.text.to_lowercase().contains("contract"));
// Should provide context around the match
assert!(snippet.text.len() <= 80);
assert!(snippet.text.contains("Contract"));
}
#[test]
fn test_enhanced_snippet_generation_word_boundaries() {
let mock_db = MockDatabase::new();
// Test that snippets respect word boundaries
let content = "The document processing system handles document management and documentation workflows efficiently.";
let snippets = mock_db.generate_snippets("doc", Some(content), None, 50);
assert!(!snippets.is_empty());
let snippet = &snippets[0];
// Should find "document", "documentation" etc.
assert!(snippet.text.to_lowercase().contains("doc"));
// Snippet should not cut words in the middle
let words: Vec<&str> = snippet.text.split_whitespace().collect();
assert!(words.len() > 0);
// First and last words should be complete (not cut off)
if snippet.start_offset > 0 {
assert!(!snippet.text.starts_with(" "));
}
}
#[test]
fn test_fuzzy_search_mode_simulation() {
// Since we can't easily test the DB query here, test the logic
// that would be used in fuzzy mode
let query = "docu";
let filename1 = "important_document.pdf";
let filename2 = "user_documentation.txt";
let filename3 = "unrelated_file.jpg";
// Simulate fuzzy matching logic
let matches_file1 = filename1.to_lowercase().contains(&query.to_lowercase());
let matches_file2 = filename2.to_lowercase().contains(&query.to_lowercase());
let matches_file3 = filename3.to_lowercase().contains(&query.to_lowercase());
assert!(matches_file1); // "docu" should match "document"
assert!(matches_file2); // "docu" should match "documentation"
assert!(!matches_file3); // "docu" should not match "unrelated_file"
}
#[test]
fn test_context_snippet_generation() {
let mock_db = MockDatabase::new();
// Test that snippets provide good context
let long_content = "In the beginning of this long document, there are many important details about document processing. Later in the document, we discuss document management systems and their implementation. Finally, the document concludes with documentation best practices.";
let snippets = mock_db.generate_snippets("document management", Some(long_content), None, 80);
assert!(!snippets.is_empty());
let snippet = &snippets[0];
// Should contain the exact phrase and surrounding context
assert!(snippet.text.to_lowercase().contains("document management"));
assert!(snippet.text.len() <= 80);
// Should have proper highlight ranges for multi-word queries
assert!(!snippet.highlight_ranges.is_empty());
}
#[test]
fn test_multiple_term_substring_matching() {
let mock_db = MockDatabase::new();
// Test matching multiple partial terms
let content = "The application documentation covers app development and application deployment procedures.";
let snippets = mock_db.generate_snippets("app dev", Some(content), None, 100);
assert!(!snippets.is_empty());
let snippet = &snippets[0];
// Should find both "app" (in various forms) and "dev"
assert!(snippet.text.to_lowercase().contains("app") || snippet.text.to_lowercase().contains("application"));
assert!(snippet.text.to_lowercase().contains("dev"));
}
#[test]
fn test_similarity_scoring_logic() {
// Test the logic that would be used for similarity scoring
let query = "docu";
let test_cases = vec![
("document.pdf", true), // Should match
("documentation.txt", true), // Should match
("my_docs.pdf", false), // Might not match depending on threshold
("picture.jpg", false), // Should not match
];
for (filename, should_match) in test_cases {
let contains_query = filename.to_lowercase().contains(&query.to_lowercase());
// In a real implementation, this would use PostgreSQL's similarity() function
// with a threshold like 0.3
let similarity_match = contains_query; // Simplified for testing
if should_match {
assert!(similarity_match, "Expected '{}' to match '{}'", filename, query);
}
}
}
#[test]
fn test_enhanced_ranking_with_substring_matches() {
// Test that substring matches get appropriate ranking
let mock_db = MockDatabase::new();
// Exact match should rank higher than substring match
let exact_content = "Document processing and document management";
let substring_content = "Documentation and documents are important";
let exact_snippets = mock_db.generate_snippets("document", Some(exact_content), None, 100);
let substring_snippets = mock_db.generate_snippets("document", Some(substring_content), None, 100);
assert!(!exact_snippets.is_empty());
assert!(!substring_snippets.is_empty());
// Both should find matches
assert!(exact_snippets[0].highlight_ranges.len() >= 1);
assert!(substring_snippets[0].highlight_ranges.len() >= 1);
}
// Integration tests that would work with actual database // Integration tests that would work with actual database
#[tokio::test] #[tokio::test]
#[ignore = "Requires PostgreSQL database for integration testing"] #[ignore = "Requires PostgreSQL database for integration testing"]

View File

@ -57,6 +57,15 @@ mod tests {
memory_limit_mb: None, memory_limit_mb: None,
cpu_priority: None, cpu_priority: None,
enable_background_ocr: None, enable_background_ocr: None,
ocr_page_segmentation_mode: None,
ocr_engine_mode: None,
ocr_min_confidence: None,
ocr_dpi: None,
ocr_enhance_contrast: None,
ocr_remove_noise: None,
ocr_detect_orientation: None,
ocr_whitelist_chars: None,
ocr_blacklist_chars: None,
}; };
let response = app let response = app
@ -144,6 +153,15 @@ mod tests {
memory_limit_mb: None, memory_limit_mb: None,
cpu_priority: None, cpu_priority: None,
enable_background_ocr: None, enable_background_ocr: None,
ocr_page_segmentation_mode: None,
ocr_engine_mode: None,
ocr_min_confidence: None,
ocr_dpi: None,
ocr_enhance_contrast: None,
ocr_remove_noise: None,
ocr_detect_orientation: None,
ocr_whitelist_chars: None,
ocr_blacklist_chars: None,
}; };
let response = app let response = app