aboutsummaryrefslogtreecommitdiff
path: root/contrib/sendmail/libsm/utf8_valid.c
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/sendmail/libsm/utf8_valid.c')
-rw-r--r--contrib/sendmail/libsm/utf8_valid.c104
1 files changed, 104 insertions, 0 deletions
diff --git a/contrib/sendmail/libsm/utf8_valid.c b/contrib/sendmail/libsm/utf8_valid.c
new file mode 100644
index 000000000000..3181eca907b9
--- /dev/null
+++ b/contrib/sendmail/libsm/utf8_valid.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2020 Proofpoint, Inc. and its suppliers.
+ * All rights reserved.
+ *
+ * By using this file, you agree to the terms and conditions set
+ * forth in the LICENSE file which can be found at the top level of
+ * the sendmail distribution.
+ *
+ */
+
+#include <sm/gen.h>
+#include <sm/sendmail.h>
+#include <sm/ixlen.h>
+
+#if USE_EAI
+
+/*
+** legal utf-8 byte sequence
+** http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
+**
+** Code Points 1st 2s 3s 4s
+** U+0000..U+007F 00..7F
+** U+0080..U+07FF C2..DF 80..BF
+** U+0800..U+0FFF E0 A0..BF 80..BF
+** U+1000..U+CFFF E1..EC 80..BF 80..BF
+** U+D000..U+D7FF ED 80..9F 80..BF
+** U+E000..U+FFFF EE..EF 80..BF 80..BF
+** U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
+** U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
+** U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
+*/
+
+/*
+** based on
+** https://github.com/lemire/fastvalidate-utf-8.git
+** which is distributed under an MIT license (besides others).
+*/
+
+bool
+utf8_valid(b, length)
+ const char *b;
+ size_t length;
+{
+ const unsigned char *bytes;
+ size_t index;
+
+ bytes = (const unsigned char *)b;
+ index = 0;
+ while (true)
+ {
+ unsigned char byte1;
+
+ do { /* fast ASCII Path */
+ if (index >= length)
+ return true;
+ byte1 = bytes[index++];
+ } while (byte1 < 0x80);
+ if (byte1 < 0xE0)
+ {
+ /* Two-byte form. */
+ if (index == length)
+ return false;
+ if (byte1 < 0xC2 || bytes[index++] > 0xBF)
+ return false;
+ }
+ else if (byte1 < 0xF0)
+ {
+ /* Three-byte form. */
+ if (index + 1 >= length)
+ return false;
+ unsigned char byte2 = bytes[index++];
+ if (byte2 > 0xBF
+ /* Overlong? 5 most significant bits must not all be zero. */
+ || (byte1 == 0xE0 && byte2 < 0xA0)
+ /* Check for illegal surrogate codepoints. */
+ || (byte1 == 0xED && 0xA0 <= byte2)
+ /* Third byte trailing-byte test. */
+ || bytes[index++] > 0xBF)
+ return false;
+ }
+ else
+ {
+
+ /* Four-byte form. */
+ if (index + 2 >= length)
+ return false;
+ int byte2 = bytes[index++];
+ if (byte2 > 0xBF
+ /* Check that 1 <= plane <= 16. Tricky optimized form of: */
+ /* if (byte1 > (byte) 0xF4 */
+ /* || byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 */
+ /* || byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) */
+ || (((byte1 << 28) + (byte2 - 0x90)) >> 30) != 0
+ /* Third byte trailing-byte test */
+ || bytes[index++] > 0xBF
+ /* Fourth byte trailing-byte test */
+ || bytes[index++] > 0xBF)
+ return false;
+ }
+ }
+ /* NOTREACHED */
+ return false;
+}
+#endif /* USE_EAI */