diff options
author | Yuri Victorovich <yuri@FreeBSD.org> | 2023-01-25 07:45:57 +0000 |
---|---|---|
committer | Yuri Victorovich <yuri@FreeBSD.org> | 2023-01-25 07:45:57 +0000 |
commit | 6d884b207aab2373494bbd713278a80474a58601 (patch) | |
tree | c8502a6195bedc16be2b4936bfbe25018b466a98 | |
parent | d8ced9147b8a35d1c0b23daf31dbe5da1b3eab3a (diff) | |
download | ports-6d884b207aab2373494bbd713278a80474a58601.tar.gz ports-6d884b207aab2373494bbd713278a80474a58601.zip |
textproc/py-sentencepiece: New port: Unsupervised text tokenizer for Neural Network-based text generation
-rw-r--r-- | textproc/Makefile | 1 | ||||
-rw-r--r-- | textproc/py-sentencepiece/Makefile | 26 | ||||
-rw-r--r-- | textproc/py-sentencepiece/distinfo | 3 | ||||
-rw-r--r-- | textproc/py-sentencepiece/pkg-descr | 7 |
4 files changed, 37 insertions, 0 deletions
diff --git a/textproc/Makefile b/textproc/Makefile
index e2d0e0ea9521..3d52828e2e12 100644
--- a/textproc/Makefile
+++ b/textproc/Makefile
@@ -1496,6 +1496,7 @@
 SUBDIR += py-rst2html5
 SUBDIR += py-rstfmt
 SUBDIR += py-scour
+ SUBDIR += py-sentencepiece
 SUBDIR += py-simplebayes
 SUBDIR += py-smartypants
 SUBDIR += py-snowballstemmer
diff --git a/textproc/py-sentencepiece/Makefile b/textproc/py-sentencepiece/Makefile
new file mode 100644
index 000000000000..fe1b9cfd4ba7
--- /dev/null
+++ b/textproc/py-sentencepiece/Makefile
@@ -0,0 +1,26 @@
+PORTNAME= sentencepiece
+DISTVERSIONPREFIX= v
+DISTVERSION= 0.1.97
+CATEGORIES= textproc # machine-learning
+PKGNAMEPREFIX= ${PYTHON_PKGNAMEPREFIX}
+
+MAINTAINER= yuri@FreeBSD.org
+COMMENT= Unsupervised text tokenizer for Neural Network-based text generation
+WWW= https://github.com/google/sentencepiece
+
+LICENSE= APACHE20
+LICENSE_FILE= ${WRKSRC}/../LICENSE
+
+LIB_DEPENDS= libsentencepiece.so:textproc/sentencepiece
+
+USES= compiler:c++17-lang pkgconfig python
+USE_PYTHON= distutils autoplist pytest
+
+USE_GITHUB= yes
+GH_ACCOUNT= google
+
+WRKSRC_SUBDIR= python
+
+TEST_ENV= ${MAKE_ENV} PYTHONPATH=${STAGEDIR}${PYTHONPREFIX_SITELIBDIR}
+
+.include <bsd.port.mk>
diff --git a/textproc/py-sentencepiece/distinfo b/textproc/py-sentencepiece/distinfo
new file mode 100644
index 000000000000..c29dc9430710
--- /dev/null
+++ b/textproc/py-sentencepiece/distinfo
@@ -0,0 +1,3 @@
+TIMESTAMP = 1673860778
+SHA256 (google-sentencepiece-v0.1.97_GH0.tar.gz) = 41c3a07f315e3ac87605460c8bb8d739955bc8e7f478caec4017ef9b7d78669b
+SIZE (google-sentencepiece-v0.1.97_GH0.tar.gz) = 11945436
diff --git a/textproc/py-sentencepiece/pkg-descr b/textproc/py-sentencepiece/pkg-descr
new file mode 100644
index 000000000000..62b7de5f4ece
--- /dev/null
+++ b/textproc/py-sentencepiece/pkg-descr
@@ -0,0 +1,7 @@
+SentencePiece is an unsupervised text tokenizer and detokenizer mainly for
+Neural Network-based text generation systems where the vocabulary size is
+predetermined prior to the neural model training. SentencePiece implements
+subword units (e.g., byte-pair-encoding (BPE)) and unigram language model
+with the extension of direct training from raw sentences. SentencePiece
+allows us to make a purely end-to-end system that does not depend on
+language-specific pre/postprocessing.