aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Yuri Victorovich <yuri@FreeBSD.org> 2023-01-25 07:45:57 +0000
committer Yuri Victorovich <yuri@FreeBSD.org> 2023-01-25 07:45:57 +0000
commit 6d884b207aab2373494bbd713278a80474a58601 (patch)
tree c8502a6195bedc16be2b4936bfbe25018b466a98
parent d8ced9147b8a35d1c0b23daf31dbe5da1b3eab3a (diff)
download ports-6d884b207aab2373494bbd713278a80474a58601.tar.gz
ports-6d884b207aab2373494bbd713278a80474a58601.zip
textproc/py-sentencepiece: New port: Unsupervised text tokenizer for Neural Network-based text generation
-rw-r--r-- textproc/Makefile 1
-rw-r--r-- textproc/py-sentencepiece/Makefile 26
-rw-r--r-- textproc/py-sentencepiece/distinfo 3
-rw-r--r-- textproc/py-sentencepiece/pkg-descr 7
4 files changed, 37 insertions, 0 deletions
diff --git a/textproc/Makefile b/textproc/Makefile
index e2d0e0ea9521..3d52828e2e12 100644
--- a/textproc/Makefile
+++ b/textproc/Makefile
@@ -1496,6 +1496,7 @@
SUBDIR += py-rst2html5
SUBDIR += py-rstfmt
SUBDIR += py-scour
+ SUBDIR += py-sentencepiece
SUBDIR += py-simplebayes
SUBDIR += py-smartypants
SUBDIR += py-snowballstemmer
diff --git a/textproc/py-sentencepiece/Makefile b/textproc/py-sentencepiece/Makefile
new file mode 100644
index 000000000000..fe1b9cfd4ba7
--- /dev/null
+++ b/textproc/py-sentencepiece/Makefile
@@ -0,0 +1,26 @@
+# FreeBSD port Makefile for textproc/py-sentencepiece.
+# Builds from the python/ subdirectory of the google/sentencepiece GitHub
+# source (see WRKSRC_SUBDIR and GH_ACCOUNT below) and links against the
+# library provided by the separate textproc/sentencepiece port.
+PORTNAME= sentencepiece
+# The "v" prefix is part of the fetched ref: distinfo records the distfile as
+# google-sentencepiece-v0.1.97_GH0.tar.gz.
+DISTVERSIONPREFIX= v
+DISTVERSION= 0.1.97
+CATEGORIES= textproc # machine-learning
+PKGNAMEPREFIX= ${PYTHON_PKGNAMEPREFIX}
+
+MAINTAINER= yuri@FreeBSD.org
+COMMENT= Unsupervised text tokenizer for Neural Network-based text generation
+WWW= https://github.com/google/sentencepiece
+
+LICENSE= APACHE20
+# The path steps one level up from WRKSRC; with WRKSRC_SUBDIR= python this
+# presumably resolves to the LICENSE file at the top of the extracted source
+# tree -- NOTE(review): confirm against the distfile layout.
+LICENSE_FILE= ${WRKSRC}/../LICENSE
+
+# Shared library libsentencepiece.so comes from the textproc/sentencepiece port.
+LIB_DEPENDS= libsentencepiece.so:textproc/sentencepiece
+
+USES= compiler:c++17-lang pkgconfig python
+USE_PYTHON= distutils autoplist pytest
+
+# Only GH_ACCOUNT is overridden here; no GH_PROJECT or GH_TAGNAME is set in
+# this file, so those are left to the framework defaults.
+USE_GITHUB= yes
+GH_ACCOUNT= google
+
+# Work inside the python/ subdirectory of the extracted source.
+WRKSRC_SUBDIR= python
+
+# Test environment = the regular MAKE_ENV plus a PYTHONPATH pointing at the
+# site-lib directory under STAGEDIR, presumably so the test run imports the
+# staged module -- NOTE(review): confirm.
+TEST_ENV= ${MAKE_ENV} PYTHONPATH=${STAGEDIR}${PYTHONPREFIX_SITELIBDIR}
+
+.include <bsd.port.mk>
diff --git a/textproc/py-sentencepiece/distinfo b/textproc/py-sentencepiece/distinfo
new file mode 100644
index 000000000000..c29dc9430710
--- /dev/null
+++ b/textproc/py-sentencepiece/distinfo
@@ -0,0 +1,3 @@
+TIMESTAMP = 1673860778
+SHA256 (google-sentencepiece-v0.1.97_GH0.tar.gz) = 41c3a07f315e3ac87605460c8bb8d739955bc8e7f478caec4017ef9b7d78669b
+SIZE (google-sentencepiece-v0.1.97_GH0.tar.gz) = 11945436
diff --git a/textproc/py-sentencepiece/pkg-descr b/textproc/py-sentencepiece/pkg-descr
new file mode 100644
index 000000000000..62b7de5f4ece
--- /dev/null
+++ b/textproc/py-sentencepiece/pkg-descr
@@ -0,0 +1,7 @@
+SentencePiece is an unsupervised text tokenizer and detokenizer mainly for
+Neural Network-based text generation systems where the vocabulary size is
+predetermined prior to the neural model training. SentencePiece implements
+subword units (e.g., byte-pair-encoding (BPE)) and unigram language model
+with the extension of direct training from raw sentences. SentencePiece
+allows us to make a purely end-to-end system that does not depend on
+language-specific pre/postprocessing.