src - FreeBSD source tree

diff options


context:
space:
mode:

author	Tim J. Robbins <tjr@FreeBSD.org>	2004-07-27 06:29:48 +0000
committer	Tim J. Robbins <tjr@FreeBSD.org>	2004-07-27 06:29:48 +0000
commit	ea9a9a377b9293ec1e6b30cc139722af963695f5 (patch)
tree	ce076ebbfc63439f98dfceb8a3d9fd1dd055a829 /lib/libc/locale/utf8.c
parent	47df0a78eb6228de22b552a1e0345d78a572b72a (diff)
download	src-ea9a9a377b9293ec1e6b30cc139722af963695f5.tar.gz src-ea9a9a377b9293ec1e6b30cc139722af963695f5.zip

Add UTF-8-specific implementations of mbsnrtowcs() and wcsnrtombs().

These convert plain ASCII characters in-line, making them only slightly slower than the single-byte ("NONE" encoding) version when processing ASCII strings.

Notes

Notes: svn path=/head/; revision=132687

Diffstat (limited to 'lib/libc/locale/utf8.c')

-rw-r--r--

lib/libc/locale/utf8.c

163

1 files changed, 163 insertions, 0 deletions

diff --git a/lib/libc/locale/utf8.c b/lib/libc/locale/utf8.c
index bab04f05ebb7..8ef7b9af2782 100644
--- a/lib/libc/locale/utf8.c
+++ b/lib/libc/locale/utf8.c

@@ -28,6 +28,7 @@

__FBSDID("$FreeBSD$");

#include <errno.h>

+#include <limits.h>

#include <runetype.h>

#include <stdlib.h>

#include <string.h>

@@ -37,7 +38,11 @@ __FBSDID("$FreeBSD$");

size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t,

mbstate_t * __restrict);

int _UTF8_mbsinit(const mbstate_t *);

+size_t _UTF8_mbsnrtowcs(wchar_t * __restrict, const char ** __restrict,

+ size_t, size_t, mbstate_t * __restrict);

size_t _UTF8_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict);

+size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict,

+ size_t, size_t, mbstate_t * __restrict);

typedef struct {

wchar_t ch;

@@ -52,6 +57,8 @@ _UTF8_init(_RuneLocale *rl)

__mbrtowc = _UTF8_mbrtowc;

__wcrtomb = _UTF8_wcrtomb;

__mbsinit = _UTF8_mbsinit;

+ __mbsnrtowcs = _UTF8_mbsnrtowcs;

+ __wcsnrtombs = _UTF8_wcsnrtombs;

_CurrentRuneLocale = rl;

__mb_cur_max = 6;

@@ -188,6 +195,88 @@ _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,

}

size_t

+_UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src,

+ size_t nms, size_t len, mbstate_t * __restrict ps)

+ _UTF8State *us;

+ const char *s;

+ size_t nchr;

+ wchar_t wc;

+ size_t nb;

+ us = (_UTF8State *)ps;

+ s = *src;

+ nchr = 0;

+ if (dst == NULL) {

+ /*

+ * The fast path in the loop below is not safe if an ASCII

+ * character appears as anything but the first byte of a

+ * multibyte sequence. Check now to avoid doing it in the loop.

+ */

+ if (nms > 0 && us->want > 0 && (signed char)*s > 0) {

+ errno = EILSEQ;

+ return ((size_t)-1);

+ }

+ for (;;) {

+ if (nms > 0 && (signed char)*s > 0)

+ /*

+ * Fast path for plain ASCII characters

+ * excluding NUL.

+ */

+ nb = 1;

+ else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==

+ (size_t)-1)

+ /* Invalid sequence - mbrtowc() sets errno. */

+ return ((size_t)-1);

+ else if (nb == 0 || nb == (size_t)-2)

+ return (nchr);

+ s += nb;

+ nms -= nb;

+ nchr++;

+ }

+ /*NOTREACHED*/

+ }

+ /*

+ * The fast path in the loop below is not safe if an ASCII

+ * character appears as anything but the first byte of a

+ * multibyte sequence. Check now to avoid doing it in the loop.

+ */

+ if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {

+ errno = EILSEQ;

+ return ((size_t)-1);

+ }

+ while (len-- > 0) {

+ if (nms > 0 && (signed char)*s > 0) {

+ /*

+ * Fast path for plain ASCII characters

+ * excluding NUL.

+ */

+ *dst = (wchar_t)*s;

+ nb = 1;

+ } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==

+ (size_t)-1) {

+ *src = s;

+ return ((size_t)-1);

+ } else if (nb == (size_t)-2) {

+ *src = s + nms;

+ return (nchr);

+ } else if (nb == 0) {

+ *src = NULL;

+ return (nchr);

+ }

+ s += nb;

+ nms -= nb;

+ nchr++;

+ dst++;

+ }

+ *src = s;

+ return (nchr);

+size_t

_UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)

{

_UTF8State *us;

@@ -254,3 +343,77 @@ _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)

return (len);

}

+size_t

+_UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,

+ size_t nwc, size_t len, mbstate_t * __restrict ps)

+ _UTF8State *us;

+ char buf[MB_LEN_MAX];

+ const wchar_t *s;

+ size_t nbytes;

+ size_t nb;

+ us = (_UTF8State *)ps;

+ if (us->want != 0) {

+ errno = EINVAL;

+ return ((size_t)-1);

+ }

+ s = *src;

+ nbytes = 0;

+ if (dst == NULL) {

+ while (nwc-- > 0) {

+ if (0 <= *s && *s < 0x80)

+ /* Fast path for plain ASCII characters. */

+ nb = 1;

+ else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==

+ (size_t)-1)

+ /* Invalid character - wcrtomb() sets errno. */

+ return ((size_t)-1);

+ if (*s == L'\0')

+ return (nbytes + nb - 1);

+ s++;

+ nbytes += nb;

+ }

+ return (nbytes);

+ }

+ while (len > 0 && nwc-- > 0) {

+ if (0 <= *s && *s < 0x80) {

+ /* Fast path for plain ASCII characters. */

+ nb = 1;

+ *dst = *s;

+ } else if (len > (size_t)MB_CUR_MAX) {

+ /* Enough space to translate in-place. */

+ if ((nb = (int)_UTF8_wcrtomb(dst, *s, ps)) < 0) {

+ *src = s;

+ return ((size_t)-1);

+ }

+ } else {

+ /*

+ * May not be enough space; use temp. buffer.

+ */

+ if ((nb = (int)_UTF8_wcrtomb(buf, *s, ps)) < 0) {

+ *src = s;

+ return ((size_t)-1);

+ }

+ if (nb > (int)len)

+ /* MB sequence for character won't fit. */

+ break;

+ memcpy(dst, buf, nb);

+ }

+ if (*s == L'\0') {

+ *src = NULL;

+ return (nbytes + nb - 1);

+ }

+ s++;

+ dst += nb;

+ len -= nb;

+ nbytes += nb;

+ }

+ *src = s;

+ return (nbytes);