aboutsummaryrefslogtreecommitdiff
path: root/contrib/less/charset.c
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/less/charset.c')
-rw-r--r--contrib/less/charset.c250
1 files changed, 151 insertions, 99 deletions
diff --git a/contrib/less/charset.c b/contrib/less/charset.c
index b37c8a29cbd9..881ebafd02cf 100644
--- a/contrib/less/charset.c
+++ b/contrib/less/charset.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 1984-2021 Mark Nudelman
+ * Copyright (C) 1984-2023 Mark Nudelman
*
* You may distribute under the terms of either the GNU General Public
* License or the Less License, as specified in the README file.
@@ -21,6 +21,7 @@
#endif
#include "charset.h"
+#include "xbuf.h"
#if MSDOS_COMPILER==WIN32C
#define WIN32_LEAN_AND_MEAN
@@ -123,6 +124,108 @@ static char *binfmt = NULL;
static char *utfbinfmt = NULL;
public int binattr = AT_STANDOUT|AT_COLOR_BIN;
+static struct xbuffer user_wide_array;
+static struct xbuffer user_ubin_array;
+static struct xbuffer user_compose_array;
+static struct xbuffer user_prt_array;
+static struct wchar_range_table user_wide_table;
+static struct wchar_range_table user_ubin_table;
+static struct wchar_range_table user_compose_table;
+static struct wchar_range_table user_prt_table;
+
+/*
+ * Set a wchar_range_table to the table in an xbuffer.
+ */
+static void wchar_range_table_set(struct wchar_range_table *tbl, struct xbuffer *arr)
+{
+ tbl->table = (struct wchar_range *) arr->data;
+ tbl->count = arr->end / sizeof(struct wchar_range);
+}
+
+/*
+ * Skip over a "U" or "U+" prefix before a hex codepoint.
+ */
+static char * skip_uprefix(char *s)
+{
+ if (*s == 'U' || *s == 'u')
+ if (*++s == '+') ++s;
+ return s;
+}
+
+/*
+ * Parse a dash-separated range of hex values.
+ */
+static void wchar_range_get(char **ss, struct wchar_range *range)
+{
+ char *s = skip_uprefix(*ss);
+ range->first = lstrtoul(s, &s, 16);
+ if (s[0] == '-')
+ {
+ s = skip_uprefix(&s[1]);
+ range->last = lstrtoul(s, &s, 16);
+ } else
+ {
+ range->last = range->first;
+ }
+ *ss = s;
+}
+
+/*
+ * Parse the LESSUTFCHARDEF variable.
+ */
+static void ichardef_utf(char *s)
+{
+ xbuf_init(&user_wide_array);
+ xbuf_init(&user_ubin_array);
+ xbuf_init(&user_compose_array);
+ xbuf_init(&user_prt_array);
+
+ if (s != NULL)
+ {
+ while (s[0] != '\0')
+ {
+ struct wchar_range range;
+ wchar_range_get(&s, &range);
+ if (range.last == 0)
+ {
+ error("invalid hex number(s) in LESSUTFCHARDEF", NULL_PARG);
+ quit(QUIT_ERROR);
+ }
+ if (*s++ != ':')
+ {
+ error("missing colon in LESSUTFCHARDEF", NULL_PARG);
+ quit(QUIT_ERROR);
+ }
+ switch (*s++)
+ {
+ case 'b':
+ xbuf_add_data(&user_ubin_array, (unsigned char *) &range, sizeof(range));
+ break;
+ case 'c':
+ xbuf_add_data(&user_compose_array, (unsigned char *) &range, sizeof(range));
+ break;
+ case 'w':
+ xbuf_add_data(&user_wide_array, (unsigned char *) &range, sizeof(range));
+ xbuf_add_data(&user_prt_array, (unsigned char *) &range, sizeof(range));
+ break;
+ case 'p': case '.':
+ xbuf_add_data(&user_prt_array, (unsigned char *) &range, sizeof(range));
+ break;
+ case '\0':
+ s--;
+ break;
+ default:
+ /* Ignore unknown character attribute. */
+ break;
+ }
+ if (s[0] == ',') ++s;
+ }
+ }
+ wchar_range_table_set(&user_wide_table, &user_wide_array);
+ wchar_range_table_set(&user_ubin_table, &user_ubin_array);
+ wchar_range_table_set(&user_compose_table, &user_compose_array);
+ wchar_range_table_set(&user_prt_table, &user_prt_array);
+}
/*
* Define a charset, given a description string.
@@ -138,9 +241,7 @@ public int binattr = AT_STANDOUT|AT_COLOR_BIN;
* b binary character
* c control character
*/
- static void
-ichardef(s)
- char *s;
+static void ichardef(char *s)
{
char *cp;
int n;
@@ -165,10 +266,12 @@ ichardef(s)
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
- n = (10 * n) + (s[-1] - '0');
+ if (ckd_mul(&n, n, 10) || ckd_add(&n, n, s[-1] - '0'))
+ goto invalid_chardef;
continue;
default:
+ invalid_chardef:
error("invalid chardef", NULL_PARG);
quit(QUIT_ERROR);
/*NOTREACHED*/
@@ -195,10 +298,7 @@ ichardef(s)
* Define a charset, given a charset name.
* The valid charset names are listed in the "charsets" array.
*/
- static int
-icharset(name, no_error)
- char *name;
- int no_error;
+static int icharset(char *name, int no_error)
{
struct charset *p;
struct cs_alias *a;
@@ -244,8 +344,7 @@ icharset(name, no_error)
/*
* Define a charset, given a locale name.
*/
- static void
-ilocale(VOID_PARAM)
+static void ilocale(void)
{
int c;
@@ -264,12 +363,7 @@ ilocale(VOID_PARAM)
/*
* Define the printing format for control (or binary utf) chars.
*/
- public void
-setfmt(s, fmtvarptr, attrptr, default_fmt)
- char *s;
- char **fmtvarptr;
- int *attrptr;
- char *default_fmt;
+public void setfmt(char *s, char **fmtvarptr, int *attrptr, char *default_fmt, int for_printf)
{
if (s && utf_mode)
{
@@ -286,10 +380,12 @@ setfmt(s, fmtvarptr, attrptr, default_fmt)
}
}
- /* %n is evil */
- if (s == NULL || *s == '\0' ||
- (*s == '*' && (s[1] == '\0' || s[2] == '\0' || strchr(s + 2, 'n'))) ||
- (*s != '*' && strchr(s, 'n')))
+ if (s == NULL || *s == '\0')
+ s = default_fmt;
+ else if (for_printf &&
+ ((*s == '*' && (s[1] == '\0' || s[2] == '\0' || strchr(s + 2, 'n'))) ||
+ (*s != '*' && strchr(s, 'n'))))
+ /* %n is evil */
s = default_fmt;
/*
@@ -314,8 +410,7 @@ setfmt(s, fmtvarptr, attrptr, default_fmt)
/*
*
*/
- static void
-set_charset(VOID_PARAM)
+static void set_charset(void)
{
char *s;
@@ -327,6 +422,9 @@ set_charset(VOID_PARAM)
if (icharset("utf-8", 1))
return;
#endif
+
+ ichardef_utf(lgetenv("LESSUTFCHARDEF"));
+
/*
* See if environment variable LESSCHARSET is defined.
*/
@@ -394,8 +492,7 @@ set_charset(VOID_PARAM)
/*
* Initialize charset data structures.
*/
- public void
-init_charset(VOID_PARAM)
+public void init_charset(void)
{
char *s;
@@ -406,18 +503,16 @@ init_charset(VOID_PARAM)
set_charset();
s = lgetenv("LESSBINFMT");
- setfmt(s, &binfmt, &binattr, "*s<%02X>");
+ setfmt(s, &binfmt, &binattr, "*s<%02X>", TRUE);
s = lgetenv("LESSUTFBINFMT");
- setfmt(s, &utfbinfmt, &binattr, "<U+%04lX>");
+ setfmt(s, &utfbinfmt, &binattr, "<U+%04lX>", TRUE);
}
/*
* Is a given character a "binary" character?
*/
- public int
-binary_char(c)
- LWCHAR c;
+public int binary_char(LWCHAR c)
{
if (utf_mode)
return (is_ubin_char(c));
@@ -428,9 +523,7 @@ binary_char(c)
/*
* Is a given character a "control" character?
*/
- public int
-control_char(c)
- LWCHAR c;
+public int control_char(LWCHAR c)
{
c &= 0377;
return (chardef[c] & IS_CONTROL_CHAR);
@@ -440,12 +533,10 @@ control_char(c)
* Return the printable form of a character.
* For example, in the "ascii" charset '\3' is printed as "^C".
*/
- public char *
-prchar(c)
- LWCHAR c;
+public char * prchar(LWCHAR c)
{
/* {{ This buffer can be overrun if LESSBINFMT is a long string. }} */
- static char buf[32];
+ static char buf[MAX_PRCHAR_LEN+1];
c &= 0377;
if ((c < 128 || !utf_mode) && !control_char(c))
@@ -476,11 +567,9 @@ prchar(c)
/*
* Return the printable form of a UTF-8 character.
*/
- public char *
-prutfchar(ch)
- LWCHAR ch;
+public char * prutfchar(LWCHAR ch)
{
- static char buf[32];
+ static char buf[MAX_PRCHAR_LEN+1];
if (ch == ESC)
strcpy(buf, "ESC");
@@ -507,9 +596,7 @@ prutfchar(ch)
/*
* Get the length of a UTF-8 character in bytes.
*/
- public int
-utf_len(ch)
- int ch;
+public int utf_len(int ch)
{
if ((ch & 0x80) == 0)
return 1;
@@ -530,10 +617,7 @@ utf_len(ch)
/*
* Does the parameter point to the lead byte of a well-formed UTF-8 character?
*/
- public int
-is_utf8_well_formed(ss, slen)
- char *ss;
- int slen;
+public int is_utf8_well_formed(char *ss, int slen)
{
int i;
int len;
@@ -568,10 +652,7 @@ is_utf8_well_formed(ss, slen)
/*
* Skip bytes until a UTF-8 lead byte (11xxxxxx) or ASCII byte (0xxxxxxx) is found.
*/
- public void
-utf_skip_to_lead(pp, limit)
- char **pp;
- char *limit;
+public void utf_skip_to_lead(char **pp, char *limit)
{
do {
++(*pp);
@@ -582,9 +663,7 @@ utf_skip_to_lead(pp, limit)
/*
* Get the value of a UTF-8 character.
*/
- public LWCHAR
-get_wchar(p)
- constant char *p;
+public LWCHAR get_wchar(constant char *p)
{
switch (utf_len(p[0]))
{
@@ -634,10 +713,7 @@ get_wchar(p)
/*
* Store a character into a UTF-8 string.
*/
- public void
-put_wchar(pp, ch)
- char **pp;
- LWCHAR ch;
+public void put_wchar(char **pp, LWCHAR ch)
{
if (!utf_mode || ch < 0x80)
{
@@ -684,11 +760,7 @@ put_wchar(pp, ch)
/*
* Step forward or backward one character in a string.
*/
- public LWCHAR
-step_char(pp, dir, limit)
- char **pp;
- signed int dir;
- constant char *limit;
+public LWCHAR step_char(char **pp, signed int dir, constant char *limit)
{
LWCHAR ch;
int len;
@@ -758,16 +830,13 @@ static struct wchar_range comb_table[] = {
};
- static int
-is_in_table(ch, table)
- LWCHAR ch;
- struct wchar_range_table *table;
+static int is_in_table(LWCHAR ch, struct wchar_range_table *table)
{
int hi;
int lo;
/* Binary search in the table. */
- if (ch < table->table[0].first)
+ if (table->table == NULL || table->count == 0 || ch < table->table[0].first)
return 0;
lo = 0;
hi = table->count - 1;
@@ -788,46 +857,32 @@ is_in_table(ch, table)
* Is a character a UTF-8 composing character?
* If a composing character follows any char, the two combine into one glyph.
*/
- public int
-is_composing_char(ch)
- LWCHAR ch;
+public int is_composing_char(LWCHAR ch)
{
- return is_in_table(ch, &compose_table) ||
+ if (is_in_table(ch, &user_prt_table)) return 0;
+ return is_in_table(ch, &user_compose_table) ||
+ is_in_table(ch, &compose_table) ||
(bs_mode != BS_CONTROL && is_in_table(ch, &fmt_table));
}
/*
* Should this UTF-8 character be treated as binary?
*/
- public int
-is_ubin_char(ch)
- LWCHAR ch;
+public int is_ubin_char(LWCHAR ch)
{
- int ubin = is_in_table(ch, &ubin_table) ||
- (bs_mode == BS_CONTROL && is_in_table(ch, &fmt_table));
-#if MSDOS_COMPILER==WIN32C
- if (!ubin && utf_mode == 2 && ch < 0x10000)
- {
- /*
- * Consider it binary if it can't be converted.
- */
- BOOL used_default = TRUE;
- WideCharToMultiByte(GetConsoleOutputCP(), WC_NO_BEST_FIT_CHARS, (LPCWSTR) &ch, 1, NULL, 0, NULL, &used_default);
- if (used_default)
- ubin = 1;
- }
-#endif
- return ubin;
+ if (is_in_table(ch, &user_prt_table)) return 0;
+ return is_in_table(ch, &user_ubin_table) ||
+ is_in_table(ch, &ubin_table) ||
+ (bs_mode == BS_CONTROL && is_in_table(ch, &fmt_table));
}
/*
* Is this a double width UTF-8 character?
*/
- public int
-is_wide_char(ch)
- LWCHAR ch;
+public int is_wide_char(LWCHAR ch)
{
- return is_in_table(ch, &wide_table);
+ return is_in_table(ch, &user_wide_table) ||
+ is_in_table(ch, &wide_table);
}
/*
@@ -835,10 +890,7 @@ is_wide_char(ch)
* A combining char acts like an ordinary char, but if it follows
* a specific char (not any char), the two combine into one glyph.
*/
- public int
-is_combining_char(ch1, ch2)
- LWCHAR ch1;
- LWCHAR ch2;
+public int is_combining_char(LWCHAR ch1, LWCHAR ch2)
{
/* The table is small; use linear search. */
int i;