aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHiroki Tagato <tagattie@FreeBSD.org>2024-02-12 08:28:03 +0000
committerHiroki Tagato <tagattie@FreeBSD.org>2024-02-12 08:34:14 +0000
commite3dfc2fad4565873d807ba820a24094b97890f98 (patch)
treedf12fa7af316dad1d9d15216dc726dd4b5237824
parent54cee7bd5a75afc210b1fccec19e9274e81fce44 (diff)
downloadports-e3dfc2fad4565873d807ba820a24094b97890f98.tar.gz
ports-e3dfc2fad4565873d807ba820a24094b97890f98.zip
textproc/py-tokenizers: add port: Fast state-of-the-art tokenizers optimized for research and production
Provides an implementation of today's most used tokenizers, with a focus on performance and versatility. Main features: - Train new vocabularies and tokenize, using today's most used tokenizers. - Extremely fast (both training and tokenization), thanks to the Rust implementation. Takes less than 20 seconds to tokenize a GB of text on a server's CPU. - Easy to use, but also extremely versatile. - Designed for research and production. - Normalization comes with alignments tracking. It's always possible to get the part of the original sentence that corresponds to a given token. - Does all the pre-processing: Truncate, Pad, add the special tokens your model needs. WWW: https://github.com/huggingface/tokenizers
-rw-r--r--textproc/Makefile1
-rw-r--r--textproc/py-tokenizers/Makefile29
-rw-r--r--textproc/py-tokenizers/Makefile.crates149
-rw-r--r--textproc/py-tokenizers/distinfo301
-rw-r--r--textproc/py-tokenizers/pkg-descr16
5 files changed, 496 insertions, 0 deletions
diff --git a/textproc/Makefile b/textproc/Makefile
index c51bc706f8c6..8cabafff97f3 100644
--- a/textproc/Makefile
+++ b/textproc/Makefile
@@ -1618,6 +1618,7 @@
SUBDIR += py-tiktoken
SUBDIR += py-tinycss
SUBDIR += py-tinycss2
+ SUBDIR += py-tokenizers
SUBDIR += py-toml
SUBDIR += py-tomli
SUBDIR += py-tomli-w
diff --git a/textproc/py-tokenizers/Makefile b/textproc/py-tokenizers/Makefile
new file mode 100644
index 000000000000..4447eab63c38
--- /dev/null
+++ b/textproc/py-tokenizers/Makefile
@@ -0,0 +1,36 @@
+PORTNAME= tokenizers
+DISTVERSION= 0.15.1
+CATEGORIES= textproc python
+MASTER_SITES= PYPI
+PKGNAMEPREFIX= ${PYTHON_PKGNAMEPREFIX}
+# The crate archives are fetched as additional distfiles, so the PyPI sdist
+# is listed explicitly.  Name it via DISTNAME (derived from DISTVERSION, the
+# version knob set above) instead of the derived PORTVERSION.
+DISTFILES= ${DISTNAME}${EXTRACT_SUFX}
+
+MAINTAINER= tagattie@FreeBSD.org
+COMMENT= Fast state-of-the-art tokenizers optimized for research and production
+WWW= https://github.com/huggingface/tokenizers
+
+LICENSE= APACHE20
+
+BUILD_DEPENDS= ${PYTHON_PKGNAMEPREFIX}maturin>=1.0<2.0:devel/py-maturin@${PY_FLAVOR}
+RUN_DEPENDS= ${PYTHON_PKGNAMEPREFIX}huggingface-hub>=0.16.4<1.0:misc/py-huggingface-hub@${PY_FLAVOR}
+
+USES= cargo python
+USE_PYTHON= autoplist pep517
+
+# Point the cargo framework at the manifest and lock file under bindings/python.
+CARGO_CARGOTOML=${WRKSRC}/bindings/python/Cargo.toml
+CARGO_CARGOLOCK=${WRKSRC}/bindings/python/Cargo.lock
+# Turn off the cargo framework's own build/install/test steps; presumably the
+# PEP 517 build (maturin is a build dependency) compiles the Rust code instead.
+CARGO_BUILD= no
+CARGO_INSTALL= no
+CARGO_TEST= no
+
+# Strip every shared object staged under the Python site-packages directory.
+post-install:
+	@${FIND} ${STAGEDIR}${PYTHON_SITELIBDIR} -type f -name '*.so' -exec ${STRIP_CMD} {} ';'
+
+.include <bsd.port.mk>
diff --git a/textproc/py-tokenizers/Makefile.crates b/textproc/py-tokenizers/Makefile.crates
new file mode 100644
index 000000000000..80ac75f5ceca
--- /dev/null
+++ b/textproc/py-tokenizers/Makefile.crates
@@ -0,0 +1,149 @@
+CARGO_CRATES= aho-corasick-1.1.2 \
+ anstream-0.6.5 \
+ anstyle-1.0.4 \
+ anstyle-parse-0.2.3 \
+ anstyle-query-1.0.2 \
+ anstyle-wincon-3.0.2 \
+ autocfg-1.1.0 \
+ base64-0.13.1 \
+ bitflags-1.3.2 \
+ bitflags-2.4.1 \
+ cc-1.0.83 \
+ cfg-if-1.0.0 \
+ clap-4.4.11 \
+ clap_builder-4.4.11 \
+ clap_derive-4.4.7 \
+ clap_lex-0.6.0 \
+ colorchoice-1.0.0 \
+ console-0.15.7 \
+ crossbeam-deque-0.8.4 \
+ crossbeam-epoch-0.9.16 \
+ crossbeam-utils-0.8.17 \
+ darling-0.14.4 \
+ darling_core-0.14.4 \
+ darling_macro-0.14.4 \
+ derive_builder-0.12.0 \
+ derive_builder_core-0.12.0 \
+ derive_builder_macro-0.12.0 \
+ either-1.9.0 \
+ encode_unicode-0.3.6 \
+ env_logger-0.10.1 \
+ errno-0.3.8 \
+ esaxx-rs-0.1.10 \
+ fastrand-2.0.1 \
+ fnv-1.0.7 \
+ getrandom-0.2.11 \
+ heck-0.4.1 \
+ hermit-abi-0.3.3 \
+ humantime-2.1.0 \
+ ident_case-1.0.1 \
+ indicatif-0.17.7 \
+ indoc-2.0.4 \
+ instant-0.1.12 \
+ is-terminal-0.4.9 \
+ itertools-0.11.0 \
+ itoa-1.0.10 \
+ lazy_static-1.4.0 \
+ libc-0.2.151 \
+ linux-raw-sys-0.4.12 \
+ lock_api-0.4.11 \
+ log-0.4.20 \
+ macro_rules_attribute-0.2.0 \
+ macro_rules_attribute-proc_macro-0.2.0 \
+ matrixmultiply-0.3.8 \
+ memchr-2.6.4 \
+ memoffset-0.9.0 \
+ minimal-lexical-0.2.1 \
+ monostate-0.1.10 \
+ monostate-impl-0.1.10 \
+ ndarray-0.15.6 \
+ nom-7.1.3 \
+ num-complex-0.4.4 \
+ num-integer-0.1.45 \
+ num-traits-0.2.17 \
+ number_prefix-0.4.0 \
+ numpy-0.20.0 \
+ once_cell-1.19.0 \
+ onig-6.4.0 \
+ onig_sys-69.8.1 \
+ parking_lot-0.12.1 \
+ parking_lot_core-0.9.9 \
+ paste-1.0.14 \
+ pkg-config-0.3.27 \
+ portable-atomic-1.6.0 \
+ ppv-lite86-0.2.17 \
+ proc-macro2-1.0.70 \
+ pyo3-0.20.2 \
+ pyo3-build-config-0.20.2 \
+ pyo3-ffi-0.20.2 \
+ pyo3-macros-0.20.2 \
+ pyo3-macros-backend-0.20.2 \
+ quote-1.0.33 \
+ rand-0.8.5 \
+ rand_chacha-0.3.1 \
+ rand_core-0.6.4 \
+ rawpointer-0.2.1 \
+ rayon-1.8.0 \
+ rayon-cond-0.3.0 \
+ rayon-core-1.12.0 \
+ redox_syscall-0.4.1 \
+ regex-1.10.2 \
+ regex-automata-0.4.3 \
+ regex-syntax-0.7.5 \
+ regex-syntax-0.8.2 \
+ rustc-hash-1.1.0 \
+ rustix-0.38.28 \
+ ryu-1.0.16 \
+ scopeguard-1.2.0 \
+ serde-1.0.193 \
+ serde_derive-1.0.193 \
+ serde_json-1.0.108 \
+ smallvec-1.11.2 \
+ spm_precompiled-0.1.4 \
+ strsim-0.10.0 \
+ syn-1.0.109 \
+ syn-2.0.41 \
+ target-lexicon-0.12.12 \
+ tempfile-3.8.1 \
+ termcolor-1.4.0 \
+ thiserror-1.0.51 \
+ thiserror-impl-1.0.51 \
+ unicode-ident-1.0.12 \
+ unicode-normalization-alignments-0.1.12 \
+ unicode-segmentation-1.10.1 \
+ unicode-width-0.1.11 \
+ unicode_categories-0.1.1 \
+ unindent-0.2.3 \
+ utf8parse-0.2.1 \
+ wasi-0.11.0+wasi-snapshot-preview1 \
+ winapi-0.3.9 \
+ winapi-i686-pc-windows-gnu-0.4.0 \
+ winapi-util-0.1.6 \
+ winapi-x86_64-pc-windows-gnu-0.4.0 \
+ windows-sys-0.45.0 \
+ windows-sys-0.48.0 \
+ windows-sys-0.52.0 \
+ windows-targets-0.42.2 \
+ windows-targets-0.48.5 \
+ windows-targets-0.52.0 \
+ windows_aarch64_gnullvm-0.42.2 \
+ windows_aarch64_gnullvm-0.48.5 \
+ windows_aarch64_gnullvm-0.52.0 \
+ windows_aarch64_msvc-0.42.2 \
+ windows_aarch64_msvc-0.48.5 \
+ windows_aarch64_msvc-0.52.0 \
+ windows_i686_gnu-0.42.2 \
+ windows_i686_gnu-0.48.5 \
+ windows_i686_gnu-0.52.0 \
+ windows_i686_msvc-0.42.2 \
+ windows_i686_msvc-0.48.5 \
+ windows_i686_msvc-0.52.0 \
+ windows_x86_64_gnu-0.42.2 \
+ windows_x86_64_gnu-0.48.5 \
+ windows_x86_64_gnu-0.52.0 \
+ windows_x86_64_gnullvm-0.42.2 \
+ windows_x86_64_gnullvm-0.48.5 \
+ windows_x86_64_gnullvm-0.52.0 \
+ windows_x86_64_msvc-0.42.2 \
+ windows_x86_64_msvc-0.48.5 \
+ windows_x86_64_msvc-0.52.0
diff --git a/textproc/py-tokenizers/distinfo b/textproc/py-tokenizers/distinfo
new file mode 100644
index 000000000000..6b821159c95f
--- /dev/null
+++ b/textproc/py-tokenizers/distinfo
@@ -0,0 +1,301 @@
+TIMESTAMP = 1707702588
+SHA256 (tokenizers-0.15.1.tar.gz) = c0a331d6d5a3d6e97b7f99f562cee8d56797180797bc55f12070e495e717c980
+SIZE (tokenizers-0.15.1.tar.gz) = 320398
+SHA256 (rust/crates/aho-corasick-1.1.2.crate) = b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0
+SIZE (rust/crates/aho-corasick-1.1.2.crate) = 183136
+SHA256 (rust/crates/anstream-0.6.5.crate) = d664a92ecae85fd0a7392615844904654d1d5f5514837f471ddef4a057aba1b6
+SIZE (rust/crates/anstream-0.6.5.crate) = 30004
+SHA256 (rust/crates/anstyle-1.0.4.crate) = 7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87
+SIZE (rust/crates/anstyle-1.0.4.crate) = 13998
+SHA256 (rust/crates/anstyle-parse-0.2.3.crate) = c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c
+SIZE (rust/crates/anstyle-parse-0.2.3.crate) = 24699
+SHA256 (rust/crates/anstyle-query-1.0.2.crate) = e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648
+SIZE (rust/crates/anstyle-query-1.0.2.crate) = 8739
+SHA256 (rust/crates/anstyle-wincon-3.0.2.crate) = 1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7
+SIZE (rust/crates/anstyle-wincon-3.0.2.crate) = 11272
+SHA256 (rust/crates/autocfg-1.1.0.crate) = d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa
+SIZE (rust/crates/autocfg-1.1.0.crate) = 13272
+SHA256 (rust/crates/base64-0.13.1.crate) = 9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8
+SIZE (rust/crates/base64-0.13.1.crate) = 61002
+SHA256 (rust/crates/bitflags-1.3.2.crate) = bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a
+SIZE (rust/crates/bitflags-1.3.2.crate) = 23021
+SHA256 (rust/crates/bitflags-2.4.1.crate) = 327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07
+SIZE (rust/crates/bitflags-2.4.1.crate) = 37043
+SHA256 (rust/crates/cc-1.0.83.crate) = f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0
+SIZE (rust/crates/cc-1.0.83.crate) = 68343
+SHA256 (rust/crates/cfg-if-1.0.0.crate) = baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd
+SIZE (rust/crates/cfg-if-1.0.0.crate) = 7934
+SHA256 (rust/crates/clap-4.4.11.crate) = bfaff671f6b22ca62406885ece523383b9b64022e341e53e009a62ebc47a45f2
+SIZE (rust/crates/clap-4.4.11.crate) = 54782
+SHA256 (rust/crates/clap_builder-4.4.11.crate) = a216b506622bb1d316cd51328dce24e07bdff4a6128a47c7e7fad11878d5adbb
+SIZE (rust/crates/clap_builder-4.4.11.crate) = 163317
+SHA256 (rust/crates/clap_derive-4.4.7.crate) = cf9804afaaf59a91e75b022a30fb7229a7901f60c755489cc61c9b423b836442
+SIZE (rust/crates/clap_derive-4.4.7.crate) = 29046
+SHA256 (rust/crates/clap_lex-0.6.0.crate) = 702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1
+SIZE (rust/crates/clap_lex-0.6.0.crate) = 12272
+SHA256 (rust/crates/colorchoice-1.0.0.crate) = acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7
+SIZE (rust/crates/colorchoice-1.0.0.crate) = 6857
+SHA256 (rust/crates/console-0.15.7.crate) = c926e00cc70edefdc64d3a5ff31cc65bb97a3460097762bd23afb4d8145fccf8
+SIZE (rust/crates/console-0.15.7.crate) = 35409
+SHA256 (rust/crates/crossbeam-deque-0.8.4.crate) = fca89a0e215bab21874660c67903c5f143333cab1da83d041c7ded6053774751
+SIZE (rust/crates/crossbeam-deque-0.8.4.crate) = 21752
+SHA256 (rust/crates/crossbeam-epoch-0.9.16.crate) = 2d2fe95351b870527a5d09bf563ed3c97c0cffb87cf1c78a591bf48bb218d9aa
+SIZE (rust/crates/crossbeam-epoch-0.9.16.crate) = 47037
+SHA256 (rust/crates/crossbeam-utils-0.8.17.crate) = c06d96137f14f244c37f989d9fff8f95e6c18b918e71f36638f8c49112e4c78f
+SIZE (rust/crates/crossbeam-utils-0.8.17.crate) = 42324
+SHA256 (rust/crates/darling-0.14.4.crate) = 7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850
+SIZE (rust/crates/darling-0.14.4.crate) = 25168
+SHA256 (rust/crates/darling_core-0.14.4.crate) = 109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0
+SIZE (rust/crates/darling_core-0.14.4.crate) = 57485
+SHA256 (rust/crates/darling_macro-0.14.4.crate) = a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e
+SIZE (rust/crates/darling_macro-0.14.4.crate) = 1896
+SHA256 (rust/crates/derive_builder-0.12.0.crate) = 8d67778784b508018359cbc8696edb3db78160bab2c2a28ba7f56ef6932997f8
+SIZE (rust/crates/derive_builder-0.12.0.crate) = 35456
+SHA256 (rust/crates/derive_builder_core-0.12.0.crate) = c11bdc11a0c47bc7d37d582b5285da6849c96681023680b906673c5707af7b0f
+SIZE (rust/crates/derive_builder_core-0.12.0.crate) = 31438
+SHA256 (rust/crates/derive_builder_macro-0.12.0.crate) = ebcda35c7a396850a55ffeac740804b40ffec779b98fffbb1738f4033f0ee79e
+SIZE (rust/crates/derive_builder_macro-0.12.0.crate) = 6288
+SHA256 (rust/crates/either-1.9.0.crate) = a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07
+SIZE (rust/crates/either-1.9.0.crate) = 16660
+SHA256 (rust/crates/encode_unicode-0.3.6.crate) = a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f
+SIZE (rust/crates/encode_unicode-0.3.6.crate) = 45741
+SHA256 (rust/crates/env_logger-0.10.1.crate) = 95b3f3e67048839cb0d0781f445682a35113da7121f7c949db0e2be96a4fbece
+SIZE (rust/crates/env_logger-0.10.1.crate) = 36524
+SHA256 (rust/crates/errno-0.3.8.crate) = a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245
+SIZE (rust/crates/errno-0.3.8.crate) = 10645
+SHA256 (rust/crates/esaxx-rs-0.1.10.crate) = d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6
+SIZE (rust/crates/esaxx-rs-0.1.10.crate) = 175210
+SHA256 (rust/crates/fastrand-2.0.1.crate) = 25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5
+SIZE (rust/crates/fastrand-2.0.1.crate) = 14664
+SHA256 (rust/crates/fnv-1.0.7.crate) = 3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1
+SIZE (rust/crates/fnv-1.0.7.crate) = 11266
+SHA256 (rust/crates/getrandom-0.2.11.crate) = fe9006bed769170c11f845cf00c7c1e9092aeb3f268e007c3e760ac68008070f
+SIZE (rust/crates/getrandom-0.2.11.crate) = 35391
+SHA256 (rust/crates/heck-0.4.1.crate) = 95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8
+SIZE (rust/crates/heck-0.4.1.crate) = 11567
+SHA256 (rust/crates/hermit-abi-0.3.3.crate) = d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7
+SIZE (rust/crates/hermit-abi-0.3.3.crate) = 14253
+SHA256 (rust/crates/humantime-2.1.0.crate) = 9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4
+SIZE (rust/crates/humantime-2.1.0.crate) = 16749
+SHA256 (rust/crates/ident_case-1.0.1.crate) = b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39
+SIZE (rust/crates/ident_case-1.0.1.crate) = 3492
+SHA256 (rust/crates/indicatif-0.17.7.crate) = fb28741c9db9a713d93deb3bb9515c20788cef5815265bee4980e87bde7e0f25
+SIZE (rust/crates/indicatif-0.17.7.crate) = 63108
+SHA256 (rust/crates/indoc-2.0.4.crate) = 1e186cfbae8084e513daff4240b4797e342f988cecda4fb6c939150f96315fd8
+SIZE (rust/crates/indoc-2.0.4.crate) = 14311
+SHA256 (rust/crates/instant-0.1.12.crate) = 7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c
+SIZE (rust/crates/instant-0.1.12.crate) = 6128
+SHA256 (rust/crates/is-terminal-0.4.9.crate) = cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b
+SIZE (rust/crates/is-terminal-0.4.9.crate) = 8109
+SHA256 (rust/crates/itertools-0.11.0.crate) = b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57
+SIZE (rust/crates/itertools-0.11.0.crate) = 125074
+SHA256 (rust/crates/itoa-1.0.10.crate) = b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c
+SIZE (rust/crates/itoa-1.0.10.crate) = 10534
+SHA256 (rust/crates/lazy_static-1.4.0.crate) = e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646
+SIZE (rust/crates/lazy_static-1.4.0.crate) = 10443
+SHA256 (rust/crates/libc-0.2.151.crate) = 302d7ab3130588088d277783b1e2d2e10c9e9e4a16dd9050e6ec93fb3e7048f4
+SIZE (rust/crates/libc-0.2.151.crate) = 736640
+SHA256 (rust/crates/linux-raw-sys-0.4.12.crate) = c4cd1a83af159aa67994778be9070f0ae1bd732942279cabb14f86f986a21456
+SIZE (rust/crates/linux-raw-sys-0.4.12.crate) = 1465800
+SHA256 (rust/crates/lock_api-0.4.11.crate) = 3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45
+SIZE (rust/crates/lock_api-0.4.11.crate) = 27487
+SHA256 (rust/crates/log-0.4.20.crate) = b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f
+SIZE (rust/crates/log-0.4.20.crate) = 38307
+SHA256 (rust/crates/macro_rules_attribute-0.2.0.crate) = 8a82271f7bc033d84bbca59a3ce3e4159938cb08a9c3aebbe54d215131518a13
+SIZE (rust/crates/macro_rules_attribute-0.2.0.crate) = 15408
+SHA256 (rust/crates/macro_rules_attribute-proc_macro-0.2.0.crate) = b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568
+SIZE (rust/crates/macro_rules_attribute-proc_macro-0.2.0.crate) = 8264
+SHA256 (rust/crates/matrixmultiply-0.3.8.crate) = 7574c1cf36da4798ab73da5b215bbf444f50718207754cb522201d78d1cd0ff2
+SIZE (rust/crates/matrixmultiply-0.3.8.crate) = 57530
+SHA256 (rust/crates/memchr-2.6.4.crate) = f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167
+SIZE (rust/crates/memchr-2.6.4.crate) = 94439
+SHA256 (rust/crates/memoffset-0.9.0.crate) = 5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c
+SIZE (rust/crates/memoffset-0.9.0.crate) = 9033
+SHA256 (rust/crates/minimal-lexical-0.2.1.crate) = 68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a
+SIZE (rust/crates/minimal-lexical-0.2.1.crate) = 94841
+SHA256 (rust/crates/monostate-0.1.10.crate) = e404e13820ea0df0eda93aa294e0c80de76a0daa6bec590d376fbec6d7810394
+SIZE (rust/crates/monostate-0.1.10.crate) = 13986
+SHA256 (rust/crates/monostate-impl-0.1.10.crate) = 531c82a934da419bed3da09bd87d6e98c72f8d4aa755427b3b009c2b8b8c433c
+SIZE (rust/crates/monostate-impl-0.1.10.crate) = 7187
+SHA256 (rust/crates/ndarray-0.15.6.crate) = adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32
+SIZE (rust/crates/ndarray-0.15.6.crate) = 275225
+SHA256 (rust/crates/nom-7.1.3.crate) = d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a
+SIZE (rust/crates/nom-7.1.3.crate) = 117570
+SHA256 (rust/crates/num-complex-0.4.4.crate) = 1ba157ca0885411de85d6ca030ba7e2a83a28636056c7c699b07c8b6f7383214
+SIZE (rust/crates/num-complex-0.4.4.crate) = 29564
+SHA256 (rust/crates/num-integer-0.1.45.crate) = 225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9
+SIZE (rust/crates/num-integer-0.1.45.crate) = 22529
+SHA256 (rust/crates/num-traits-0.2.17.crate) = 39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c
+SIZE (rust/crates/num-traits-0.2.17.crate) = 50190
+SHA256 (rust/crates/number_prefix-0.4.0.crate) = 830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3
+SIZE (rust/crates/number_prefix-0.4.0.crate) = 6922
+SHA256 (rust/crates/numpy-0.20.0.crate) = bef41cbb417ea83b30525259e30ccef6af39b31c240bda578889494c5392d331
+SIZE (rust/crates/numpy-0.20.0.crate) = 71258
+SHA256 (rust/crates/once_cell-1.19.0.crate) = 3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92
+SIZE (rust/crates/once_cell-1.19.0.crate) = 33046
+SHA256 (rust/crates/onig-6.4.0.crate) = 8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f
+SIZE (rust/crates/onig-6.4.0.crate) = 32616
+SHA256 (rust/crates/onig_sys-69.8.1.crate) = 7b829e3d7e9cc74c7e315ee8edb185bf4190da5acde74afd7fc59c35b1f086e7
+SIZE (rust/crates/onig_sys-69.8.1.crate) = 638216
+SHA256 (rust/crates/parking_lot-0.12.1.crate) = 3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f
+SIZE (rust/crates/parking_lot-0.12.1.crate) = 40967
+SHA256 (rust/crates/parking_lot_core-0.9.9.crate) = 4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e
+SIZE (rust/crates/parking_lot_core-0.9.9.crate) = 32445
+SHA256 (rust/crates/paste-1.0.14.crate) = de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c
+SIZE (rust/crates/paste-1.0.14.crate) = 18157
+SHA256 (rust/crates/pkg-config-0.3.27.crate) = 26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964
+SIZE (rust/crates/pkg-config-0.3.27.crate) = 18838
+SHA256 (rust/crates/portable-atomic-1.6.0.crate) = 7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0
+SIZE (rust/crates/portable-atomic-1.6.0.crate) = 140689
+SHA256 (rust/crates/ppv-lite86-0.2.17.crate) = 5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de
+SIZE (rust/crates/ppv-lite86-0.2.17.crate) = 22242
+SHA256 (rust/crates/proc-macro2-1.0.70.crate) = 39278fbbf5fb4f646ce651690877f89d1c5811a3d4acb27700c1cb3cdb78fd3b
+SIZE (rust/crates/proc-macro2-1.0.70.crate) = 44343
+SHA256 (rust/crates/pyo3-0.20.2.crate) = 9a89dc7a5850d0e983be1ec2a463a171d20990487c3cfcd68b5363f1ee3d6fe0
+SIZE (rust/crates/pyo3-0.20.2.crate) = 434326
+SHA256 (rust/crates/pyo3-build-config-0.20.2.crate) = 07426f0d8fe5a601f26293f300afd1a7b1ed5e78b2a705870c5f30893c5163be
+SIZE (rust/crates/pyo3-build-config-0.20.2.crate) = 30029
+SHA256 (rust/crates/pyo3-ffi-0.20.2.crate) = dbb7dec17e17766b46bca4f1a4215a85006b4c2ecde122076c562dd058da6cf1
+SIZE (rust/crates/pyo3-ffi-0.20.2.crate) = 64601
+SHA256 (rust/crates/pyo3-macros-0.20.2.crate) = 05f738b4e40d50b5711957f142878cfa0f28e054aa0ebdfc3fd137a843f74ed3
+SIZE (rust/crates/pyo3-macros-0.20.2.crate) = 7925
+SHA256 (rust/crates/pyo3-macros-backend-0.20.2.crate) = 0fc910d4851847827daf9d6cdd4a823fbdaab5b8818325c5e97a86da79e8881f
+SIZE (rust/crates/pyo3-macros-backend-0.20.2.crate) = 49710
+SHA256 (rust/crates/quote-1.0.33.crate) = 5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae
+SIZE (rust/crates/quote-1.0.33.crate) = 28090
+SHA256 (rust/crates/rand-0.8.5.crate) = 34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404
+SIZE (rust/crates/rand-0.8.5.crate) = 87113
+SHA256 (rust/crates/rand_chacha-0.3.1.crate) = e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88
+SIZE (rust/crates/rand_chacha-0.3.1.crate) = 15251
+SHA256 (rust/crates/rand_core-0.6.4.crate) = ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c
+SIZE (rust/crates/rand_core-0.6.4.crate) = 22666
+SHA256 (rust/crates/rawpointer-0.2.1.crate) = 60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3
+SIZE (rust/crates/rawpointer-0.2.1.crate) = 7490
+SHA256 (rust/crates/rayon-1.8.0.crate) = 9c27db03db7734835b3f53954b534c91069375ce6ccaa2e065441e07d9b6cdb1
+SIZE (rust/crates/rayon-1.8.0.crate) = 170172
+SHA256 (rust/crates/rayon-cond-0.3.0.crate) = 059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9
+SIZE (rust/crates/rayon-cond-0.3.0.crate) = 9913
+SHA256 (rust/crates/rayon-core-1.12.0.crate) = 5ce3fb6ad83f861aac485e76e1985cd109d9a3713802152be56c3b1f0e0658ed
+SIZE (rust/crates/rayon-core-1.12.0.crate) = 70081
+SHA256 (rust/crates/redox_syscall-0.4.1.crate) = 4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa
+SIZE (rust/crates/redox_syscall-0.4.1.crate) = 24858
+SHA256 (rust/crates/regex-1.10.2.crate) = 380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343
+SIZE (rust/crates/regex-1.10.2.crate) = 252839
+SHA256 (rust/crates/regex-automata-0.4.3.crate) = 5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f
+SIZE (rust/crates/regex-automata-0.4.3.crate) = 617011
+SHA256 (rust/crates/regex-syntax-0.7.5.crate) = dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da
+SIZE (rust/crates/regex-syntax-0.7.5.crate) = 343366
+SHA256 (rust/crates/regex-syntax-0.8.2.crate) = c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f
+SIZE (rust/crates/regex-syntax-0.8.2.crate) = 347228
+SHA256 (rust/crates/rustc-hash-1.1.0.crate) = 08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2
+SIZE (rust/crates/rustc-hash-1.1.0.crate) = 9331
+SHA256 (rust/crates/rustix-0.38.28.crate) = 72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316
+SIZE (rust/crates/rustix-0.38.28.crate) = 365398
+SHA256 (rust/crates/ryu-1.0.16.crate) = f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c
+SIZE (rust/crates/ryu-1.0.16.crate) = 47351
+SHA256 (rust/crates/scopeguard-1.2.0.crate) = 94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49
+SIZE (rust/crates/scopeguard-1.2.0.crate) = 11619
+SHA256 (rust/crates/serde-1.0.193.crate) = 25dd9975e68d0cb5aa1120c288333fc98731bd1dd12f561e468ea4728c042b89
+SIZE (rust/crates/serde-1.0.193.crate) = 76863
+SHA256 (rust/crates/serde_derive-1.0.193.crate) = 43576ca501357b9b071ac53cdc7da8ef0cbd9493d8df094cd821777ea6e894d3
+SIZE (rust/crates/serde_derive-1.0.193.crate) = 55692
+SHA256 (rust/crates/serde_json-1.0.108.crate) = 3d1c7e3eac408d115102c4c24ad393e0821bb3a5df4d506a80f85f7a742a526b
+SIZE (rust/crates/serde_json-1.0.108.crate) = 146476
+SHA256 (rust/crates/smallvec-1.11.2.crate) = 4dccd0940a2dcdf68d092b8cbab7dc0ad8fa938bf95787e1b916b0e3d0e8e970
+SIZE (rust/crates/smallvec-1.11.2.crate) = 34801
+SHA256 (rust/crates/spm_precompiled-0.1.4.crate) = 5851699c4033c63636f7ea4cf7b7c1f1bf06d0cc03cfb42e711de5a5c46cf326
+SIZE (rust/crates/spm_precompiled-0.1.4.crate) = 557527
+SHA256 (rust/crates/strsim-0.10.0.crate) = 73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623
+SIZE (rust/crates/strsim-0.10.0.crate) = 11355
+SHA256 (rust/crates/syn-1.0.109.crate) = 72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237
+SIZE (rust/crates/syn-1.0.109.crate) = 237611
+SHA256 (rust/crates/syn-2.0.41.crate) = 44c8b28c477cc3bf0e7966561e3460130e1255f7a1cf71931075f1c5e7a7e269
+SIZE (rust/crates/syn-2.0.41.crate) = 246016
+SHA256 (rust/crates/target-lexicon-0.12.12.crate) = 14c39fd04924ca3a864207c66fc2cd7d22d7c016007f9ce846cbb9326331930a
+SIZE (rust/crates/target-lexicon-0.12.12.crate) = 25156
+SHA256 (rust/crates/tempfile-3.8.1.crate) = 7ef1adac450ad7f4b3c28589471ade84f25f731a7a0fe30d71dfa9f60fd808e5
+SIZE (rust/crates/tempfile-3.8.1.crate) = 32164
+SHA256 (rust/crates/termcolor-1.4.0.crate) = ff1bc3d3f05aff0403e8ac0d92ced918ec05b666a43f83297ccef5bea8a3d449
+SIZE (rust/crates/termcolor-1.4.0.crate) = 18765
+SHA256 (rust/crates/thiserror-1.0.51.crate) = f11c217e1416d6f036b870f14e0413d480dbf28edbee1f877abaf0206af43bb7
+SIZE (rust/crates/thiserror-1.0.51.crate) = 20045
+SHA256 (rust/crates/thiserror-impl-1.0.51.crate) = 01742297787513b79cf8e29d1056ede1313e2420b7b3b15d0a768b4921f549df
+SIZE (rust/crates/thiserror-impl-1.0.51.crate) = 15372
+SHA256 (rust/crates/unicode-ident-1.0.12.crate) = 3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b
+SIZE (rust/crates/unicode-ident-1.0.12.crate) = 42168
+SHA256 (rust/crates/unicode-normalization-alignments-0.1.12.crate) = 43f613e4fa046e69818dd287fdc4bc78175ff20331479dab6e1b0f98d57062de
+SIZE (rust/crates/unicode-normalization-alignments-0.1.12.crate) = 91546
+SHA256 (rust/crates/unicode-segmentation-1.10.1.crate) = 1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36
+SIZE (rust/crates/unicode-segmentation-1.10.1.crate) = 98416
+SHA256 (rust/crates/unicode-width-0.1.11.crate) = e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85
+SIZE (rust/crates/unicode-width-0.1.11.crate) = 19187
+SHA256 (rust/crates/unicode_categories-0.1.1.crate) = 39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e
+SIZE (rust/crates/unicode_categories-0.1.1.crate) = 87298
+SHA256 (rust/crates/unindent-0.2.3.crate) = c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce
+SIZE (rust/crates/unindent-0.2.3.crate) = 7306
+SHA256 (rust/crates/utf8parse-0.2.1.crate) = 711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a
+SIZE (rust/crates/utf8parse-0.2.1.crate) = 13435
+SHA256 (rust/crates/wasi-0.11.0+wasi-snapshot-preview1.crate) = 9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423
+SIZE (rust/crates/wasi-0.11.0+wasi-snapshot-preview1.crate) = 28131
+SHA256 (rust/crates/winapi-0.3.9.crate) = 5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419
+SIZE (rust/crates/winapi-0.3.9.crate) = 1200382
+SHA256 (rust/crates/winapi-i686-pc-windows-gnu-0.4.0.crate) = ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6
+SIZE (rust/crates/winapi-i686-pc-windows-gnu-0.4.0.crate) = 2918815
+SHA256 (rust/crates/winapi-util-0.1.6.crate) = f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596
+SIZE (rust/crates/winapi-util-0.1.6.crate) = 12234
+SHA256 (rust/crates/winapi-x86_64-pc-windows-gnu-0.4.0.crate) = 712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f
+SIZE (rust/crates/winapi-x86_64-pc-windows-gnu-0.4.0.crate) = 2947998
+SHA256 (rust/crates/windows-sys-0.45.0.crate) = 75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0
+SIZE (rust/crates/windows-sys-0.45.0.crate) = 2568659
+SHA256 (rust/crates/windows-sys-0.48.0.crate) = 677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9
+SIZE (rust/crates/windows-sys-0.48.0.crate) = 2628884
+SHA256 (rust/crates/windows-sys-0.52.0.crate) = 282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d
+SIZE (rust/crates/windows-sys-0.52.0.crate) = 2576877
+SHA256 (rust/crates/windows-targets-0.42.2.crate) = 8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071
+SIZE (rust/crates/windows-targets-0.42.2.crate) = 5492
+SHA256 (rust/crates/windows-targets-0.48.5.crate) = 9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c
+SIZE (rust/crates/windows-targets-0.48.5.crate) = 6904
+SHA256 (rust/crates/windows-targets-0.52.0.crate) = 8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd
+SIZE (rust/crates/windows-targets-0.52.0.crate) = 6229
+SHA256 (rust/crates/windows_aarch64_gnullvm-0.42.2.crate) = 597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8
+SIZE (rust/crates/windows_aarch64_gnullvm-0.42.2.crate) = 364071
+SHA256 (rust/crates/windows_aarch64_gnullvm-0.48.5.crate) = 2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8
+SIZE (rust/crates/windows_aarch64_gnullvm-0.48.5.crate) = 418492
+SHA256 (rust/crates/windows_aarch64_gnullvm-0.52.0.crate) = cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea
+SIZE (rust/crates/windows_aarch64_gnullvm-0.52.0.crate) = 430182
+SHA256 (rust/crates/windows_aarch64_msvc-0.42.2.crate) = e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43
+SIZE (rust/crates/windows_aarch64_msvc-0.42.2.crate) = 666981
+SHA256 (rust/crates/windows_aarch64_msvc-0.48.5.crate) = dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc
+SIZE (rust/crates/windows_aarch64_msvc-0.48.5.crate) = 798483
+SHA256 (rust/crates/windows_aarch64_msvc-0.52.0.crate) = bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef
+SIZE (rust/crates/windows_aarch64_msvc-0.52.0.crate) = 821663
+SHA256 (rust/crates/windows_i686_gnu-0.42.2.crate) = c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f
+SIZE (rust/crates/windows_i686_gnu-0.42.2.crate) = 736236
+SHA256 (rust/crates/windows_i686_gnu-0.48.5.crate) = a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e
+SIZE (rust/crates/windows_i686_gnu-0.48.5.crate) = 844891
+SHA256 (rust/crates/windows_i686_gnu-0.52.0.crate) = a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313
+SIZE (rust/crates/windows_i686_gnu-0.52.0.crate) = 870285
+SHA256 (rust/crates/windows_i686_msvc-0.42.2.crate) = 44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060
+SIZE (rust/crates/windows_i686_msvc-0.42.2.crate) = 724951
+SHA256 (rust/crates/windows_i686_msvc-0.48.5.crate) = 8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406
+SIZE (rust/crates/windows_i686_msvc-0.48.5.crate) = 864300
+SHA256 (rust/crates/windows_i686_msvc-0.52.0.crate) = ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a
+SIZE (rust/crates/windows_i686_msvc-0.52.0.crate) = 888693
+SHA256 (rust/crates/windows_x86_64_gnu-0.42.2.crate) = 8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36
+SIZE (rust/crates/windows_x86_64_gnu-0.42.2.crate) = 699373
+SHA256 (rust/crates/windows_x86_64_gnu-0.48.5.crate) = 53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e
+SIZE (rust/crates/windows_x86_64_gnu-0.48.5.crate) = 801619
+SHA256 (rust/crates/windows_x86_64_gnu-0.52.0.crate) = 3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd
+SIZE (rust/crates/windows_x86_64_gnu-0.52.0.crate) = 826213
+SHA256 (rust/crates/windows_x86_64_gnullvm-0.42.2.crate) = 26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3
+SIZE (rust/crates/windows_x86_64_gnullvm-0.42.2.crate) = 364068
+SHA256 (rust/crates/windows_x86_64_gnullvm-0.48.5.crate) = 0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc
+SIZE (rust/crates/windows_x86_64_gnullvm-0.48.5.crate) = 418486
+SHA256 (rust/crates/windows_x86_64_gnullvm-0.52.0.crate) = 1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e
+SIZE (rust/crates/windows_x86_64_gnullvm-0.52.0.crate) = 430165
+SHA256 (rust/crates/windows_x86_64_msvc-0.42.2.crate) = 9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0
+SIZE (rust/crates/windows_x86_64_msvc-0.42.2.crate) = 666936
+SHA256 (rust/crates/windows_x86_64_msvc-0.48.5.crate) = ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538
+SIZE (rust/crates/windows_x86_64_msvc-0.48.5.crate) = 798412
+SHA256 (rust/crates/windows_x86_64_msvc-0.52.0.crate) = dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04
+SIZE (rust/crates/windows_x86_64_msvc-0.52.0.crate) = 821600
diff --git a/textproc/py-tokenizers/pkg-descr b/textproc/py-tokenizers/pkg-descr
new file mode 100644
index 000000000000..37ba482d53d2
--- /dev/null
+++ b/textproc/py-tokenizers/pkg-descr
@@ -0,0 +1,16 @@
+Provides an implementation of today's most used tokenizers, with a
+focus on performance and versatility.
+
+Main features:
+- Train new vocabularies and tokenize, using today's most used
+ tokenizers.
+- Extremely fast (both training and tokenization), thanks to the Rust
+ implementation. Takes less than 20 seconds to tokenize a GB of text
+ on a server's CPU.
+- Easy to use, but also extremely versatile.
+- Designed for research and production.
+- Normalization comes with alignments tracking. It's always possible
+ to get the part of the original sentence that corresponds to a given
+ token.
+- Does all the pre-processing: Truncate, Pad, add the special tokens
+ your model needs.