diff options
Diffstat (limited to 'contrib/less/charset.c')
-rw-r--r-- | contrib/less/charset.c | 250 |
1 files changed, 151 insertions, 99 deletions
diff --git a/contrib/less/charset.c b/contrib/less/charset.c index b37c8a29cbd9..881ebafd02cf 100644 --- a/contrib/less/charset.c +++ b/contrib/less/charset.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 1984-2021 Mark Nudelman + * Copyright (C) 1984-2023 Mark Nudelman * * You may distribute under the terms of either the GNU General Public * License or the Less License, as specified in the README file. @@ -21,6 +21,7 @@ #endif #include "charset.h" +#include "xbuf.h" #if MSDOS_COMPILER==WIN32C #define WIN32_LEAN_AND_MEAN @@ -123,6 +124,108 @@ static char *binfmt = NULL; static char *utfbinfmt = NULL; public int binattr = AT_STANDOUT|AT_COLOR_BIN; +static struct xbuffer user_wide_array; +static struct xbuffer user_ubin_array; +static struct xbuffer user_compose_array; +static struct xbuffer user_prt_array; +static struct wchar_range_table user_wide_table; +static struct wchar_range_table user_ubin_table; +static struct wchar_range_table user_compose_table; +static struct wchar_range_table user_prt_table; + +/* + * Set a wchar_range_table to the table in an xbuffer. + */ +static void wchar_range_table_set(struct wchar_range_table *tbl, struct xbuffer *arr) +{ + tbl->table = (struct wchar_range *) arr->data; + tbl->count = arr->end / sizeof(struct wchar_range); +} + +/* + * Skip over a "U" or "U+" prefix before a hex codepoint. + */ +static char * skip_uprefix(char *s) +{ + if (*s == 'U' || *s == 'u') + if (*++s == '+') ++s; + return s; +} + +/* + * Parse a dash-separated range of hex values. + */ +static void wchar_range_get(char **ss, struct wchar_range *range) +{ + char *s = skip_uprefix(*ss); + range->first = lstrtoul(s, &s, 16); + if (s[0] == '-') + { + s = skip_uprefix(&s[1]); + range->last = lstrtoul(s, &s, 16); + } else + { + range->last = range->first; + } + *ss = s; +} + +/* + * Parse the LESSUTFCHARDEF variable. + */ +static void ichardef_utf(char *s) +{ + xbuf_init(&user_wide_array); + xbuf_init(&user_ubin_array); + xbuf_init(&user_compose_array); + xbuf_init(&user_prt_array); + + if (s != NULL) + { + while (s[0] != '\0') + { + struct wchar_range range; + wchar_range_get(&s, &range); + if (range.last == 0) + { + error("invalid hex number(s) in LESSUTFCHARDEF", NULL_PARG); + quit(QUIT_ERROR); + } + if (*s++ != ':') + { + error("missing colon in LESSUTFCHARDEF", NULL_PARG); + quit(QUIT_ERROR); + } + switch (*s++) + { + case 'b': + xbuf_add_data(&user_ubin_array, (unsigned char *) &range, sizeof(range)); + break; + case 'c': + xbuf_add_data(&user_compose_array, (unsigned char *) &range, sizeof(range)); + break; + case 'w': + xbuf_add_data(&user_wide_array, (unsigned char *) &range, sizeof(range)); + xbuf_add_data(&user_prt_array, (unsigned char *) &range, sizeof(range)); + break; + case 'p': case '.': + xbuf_add_data(&user_prt_array, (unsigned char *) &range, sizeof(range)); + break; + case '\0': + s--; + break; + default: + /* Ignore unknown character attribute. */ + break; + } + if (s[0] == ',') ++s; + } + } + wchar_range_table_set(&user_wide_table, &user_wide_array); + wchar_range_table_set(&user_ubin_table, &user_ubin_array); + wchar_range_table_set(&user_compose_table, &user_compose_array); + wchar_range_table_set(&user_prt_table, &user_prt_array); +} /* * Define a charset, given a description string. @@ -138,9 +241,7 @@ public int binattr = AT_STANDOUT|AT_COLOR_BIN; * b binary character * c control character */ - static void -ichardef(s) - char *s; +static void ichardef(char *s) { char *cp; int n; @@ -165,10 +266,12 @@ ichardef(s) case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': - n = (10 * n) + (s[-1] - '0'); + if (ckd_mul(&n, n, 10) || ckd_add(&n, n, s[-1] - '0')) + goto invalid_chardef; continue; default: + invalid_chardef: error("invalid chardef", NULL_PARG); quit(QUIT_ERROR); /*NOTREACHED*/ @@ -195,10 +298,7 @@ ichardef(s) * Define a charset, given a charset name. * The valid charset names are listed in the "charsets" array. */ - static int -icharset(name, no_error) - char *name; - int no_error; +static int icharset(char *name, int no_error) { struct charset *p; struct cs_alias *a; @@ -244,8 +344,7 @@ icharset(name, no_error) /* * Define a charset, given a locale name. */ - static void -ilocale(VOID_PARAM) +static void ilocale(void) { int c; @@ -264,12 +363,7 @@ ilocale(VOID_PARAM) /* * Define the printing format for control (or binary utf) chars. */ - public void -setfmt(s, fmtvarptr, attrptr, default_fmt) - char *s; - char **fmtvarptr; - int *attrptr; - char *default_fmt; +public void setfmt(char *s, char **fmtvarptr, int *attrptr, char *default_fmt, int for_printf) { if (s && utf_mode) { @@ -286,10 +380,12 @@ setfmt(s, fmtvarptr, attrptr, default_fmt) } } - /* %n is evil */ - if (s == NULL || *s == '\0' || - (*s == '*' && (s[1] == '\0' || s[2] == '\0' || strchr(s + 2, 'n'))) || - (*s != '*' && strchr(s, 'n'))) + if (s == NULL || *s == '\0') + s = default_fmt; + else if (for_printf && + ((*s == '*' && (s[1] == '\0' || s[2] == '\0' || strchr(s + 2, 'n'))) || + (*s != '*' && strchr(s, 'n')))) + /* %n is evil */ s = default_fmt; /* @@ -314,8 +410,7 @@ setfmt(s, fmtvarptr, attrptr, default_fmt) /* * */ - static void -set_charset(VOID_PARAM) +static void set_charset(void) { char *s; @@ -327,6 +422,9 @@ set_charset(VOID_PARAM) if (icharset("utf-8", 1)) return; #endif + + ichardef_utf(lgetenv("LESSUTFCHARDEF")); + /* * See if environment variable LESSCHARSET is defined. */ @@ -394,8 +492,7 @@ set_charset(VOID_PARAM) /* * Initialize charset data structures. */ - public void -init_charset(VOID_PARAM) +public void init_charset(void) { char *s; @@ -406,18 +503,16 @@ init_charset(VOID_PARAM) set_charset(); s = lgetenv("LESSBINFMT"); - setfmt(s, &binfmt, &binattr, "*s<%02X>"); + setfmt(s, &binfmt, &binattr, "*s<%02X>", TRUE); s = lgetenv("LESSUTFBINFMT"); - setfmt(s, &utfbinfmt, &binattr, "<U+%04lX>"); + setfmt(s, &utfbinfmt, &binattr, "<U+%04lX>", TRUE); } /* * Is a given character a "binary" character? */ - public int -binary_char(c) - LWCHAR c; +public int binary_char(LWCHAR c) { if (utf_mode) return (is_ubin_char(c)); @@ -428,9 +523,7 @@ binary_char(c) /* * Is a given character a "control" character? */ - public int -control_char(c) - LWCHAR c; +public int control_char(LWCHAR c) { c &= 0377; return (chardef[c] & IS_CONTROL_CHAR); @@ -440,12 +533,10 @@ control_char(c) * Return the printable form of a character. * For example, in the "ascii" charset '\3' is printed as "^C". */ - public char * -prchar(c) - LWCHAR c; +public char * prchar(LWCHAR c) { /* {{ This buffer can be overrun if LESSBINFMT is a long string. }} */ - static char buf[32]; + static char buf[MAX_PRCHAR_LEN+1]; c &= 0377; if ((c < 128 || !utf_mode) && !control_char(c)) @@ -476,11 +567,9 @@ prchar(c) /* * Return the printable form of a UTF-8 character. */ - public char * -prutfchar(ch) - LWCHAR ch; +public char * prutfchar(LWCHAR ch) { - static char buf[32]; + static char buf[MAX_PRCHAR_LEN+1]; if (ch == ESC) strcpy(buf, "ESC"); @@ -507,9 +596,7 @@ prutfchar(ch) /* * Get the length of a UTF-8 character in bytes. */ - public int -utf_len(ch) - int ch; +public int utf_len(int ch) { if ((ch & 0x80) == 0) return 1; @@ -530,10 +617,7 @@ utf_len(ch) /* * Does the parameter point to the lead byte of a well-formed UTF-8 character? */ - public int -is_utf8_well_formed(ss, slen) - char *ss; - int slen; +public int is_utf8_well_formed(char *ss, int slen) { int i; int len; @@ -568,10 +652,7 @@ is_utf8_well_formed(ss, slen) /* * Skip bytes until a UTF-8 lead byte (11xxxxxx) or ASCII byte (0xxxxxxx) is found. */ - public void -utf_skip_to_lead(pp, limit) - char **pp; - char *limit; +public void utf_skip_to_lead(char **pp, char *limit) { do { ++(*pp); @@ -582,9 +663,7 @@ utf_skip_to_lead(pp, limit) /* * Get the value of a UTF-8 character. */ - public LWCHAR -get_wchar(p) - constant char *p; +public LWCHAR get_wchar(constant char *p) { switch (utf_len(p[0])) { @@ -634,10 +713,7 @@ get_wchar(p) /* * Store a character into a UTF-8 string. */ - public void -put_wchar(pp, ch) - char **pp; - LWCHAR ch; +public void put_wchar(char **pp, LWCHAR ch) { if (!utf_mode || ch < 0x80) { @@ -684,11 +760,7 @@ put_wchar(pp, ch) /* * Step forward or backward one character in a string. */ - public LWCHAR -step_char(pp, dir, limit) - char **pp; - signed int dir; - constant char *limit; +public LWCHAR step_char(char **pp, signed int dir, constant char *limit) { LWCHAR ch; int len; @@ -758,16 +830,13 @@ static struct wchar_range comb_table[] = { }; - static int -is_in_table(ch, table) - LWCHAR ch; - struct wchar_range_table *table; +static int is_in_table(LWCHAR ch, struct wchar_range_table *table) { int hi; int lo; /* Binary search in the table. */ - if (ch < table->table[0].first) + if (table->table == NULL || table->count == 0 || ch < table->table[0].first) return 0; lo = 0; hi = table->count - 1; @@ -788,46 +857,32 @@ is_in_table(ch, table) * Is a character a UTF-8 composing character? * If a composing character follows any char, the two combine into one glyph. */ - public int -is_composing_char(ch) - LWCHAR ch; +public int is_composing_char(LWCHAR ch) { - return is_in_table(ch, &compose_table) || + if (is_in_table(ch, &user_prt_table)) return 0; + return is_in_table(ch, &user_compose_table) || + is_in_table(ch, &compose_table) || (bs_mode != BS_CONTROL && is_in_table(ch, &fmt_table)); } /* * Should this UTF-8 character be treated as binary? */ - public int -is_ubin_char(ch) - LWCHAR ch; +public int is_ubin_char(LWCHAR ch) { - int ubin = is_in_table(ch, &ubin_table) || - (bs_mode == BS_CONTROL && is_in_table(ch, &fmt_table)); -#if MSDOS_COMPILER==WIN32C - if (!ubin && utf_mode == 2 && ch < 0x10000) - { - /* - * Consider it binary if it can't be converted. - */ - BOOL used_default = TRUE; - WideCharToMultiByte(GetConsoleOutputCP(), WC_NO_BEST_FIT_CHARS, (LPCWSTR) &ch, 1, NULL, 0, NULL, &used_default); - if (used_default) - ubin = 1; - } -#endif - return ubin; + if (is_in_table(ch, &user_prt_table)) return 0; + return is_in_table(ch, &user_ubin_table) || + is_in_table(ch, &ubin_table) || + (bs_mode == BS_CONTROL && is_in_table(ch, &fmt_table)); } /* * Is this a double width UTF-8 character? */ - public int -is_wide_char(ch) - LWCHAR ch; +public int is_wide_char(LWCHAR ch) { - return is_in_table(ch, &wide_table); + return is_in_table(ch, &user_wide_table) || + is_in_table(ch, &wide_table); } /* @@ -835,10 +890,7 @@ is_wide_char(ch) * A combining char acts like an ordinary char, but if it follows * a specific char (not any char), the two combine into one glyph. */ - public int -is_combining_char(ch1, ch2) - LWCHAR ch1; - LWCHAR ch2; +public int is_combining_char(LWCHAR ch1, LWCHAR ch2) { /* The table is small; use linear search. */ int i; |