blob: 07c5b02e438b96d199f2eec51c43e4e896937510 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
|
--- libxklavier/utf8.c 1970-01-01 03:00:00.000000000 +0300
+++ libxklavier/utf8.c 2015-11-18 01:50:50.406110000 +0300
@@ -0,0 +1,61 @@
+/*
+ * The utf8_check() function scans the '\0'-terminated string starting
+ * at s. It returns a pointer to the first byte of the first malformed
+ * or overlong UTF-8 sequence found, or NULL if the string contains
+ * only correct UTF-8. It also spots UTF-8 sequences that could cause
+ * trouble if converted to UTF-16, namely surrogate characters
+ * (U+D800..U+DFFF) and non-Unicode positions (U+FFFE..U+FFFF). This
+ * routine is very likely to find a malformed sequence if the input
+ * uses any other encoding than UTF-8. It therefore can be used as a
+ * very effective heuristic for distinguishing between UTF-8 and other
+ * encodings.
+ *
+ * I wrote this code mainly as a specification of functionality; there
+ * are no doubt performance optimizations possible for certain CPUs.
+ *
+ * Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> -- 2005-03-30
+ * License: http://www.cl.cam.ac.uk/~mgk25/short-license.html
+ */
+
+
+guchar *
+utf8_check(guchar *s)
+{
+ while (*s) {
+ if (*s < 0x80)
+ /* 0xxxxxxx */
+ s++;
+ else if ((s[0] & 0xe0) == 0xc0) {
+ /* 110XXXXx 10xxxxxx */
+ if ((s[1] & 0xc0) != 0x80 ||
+ (s[0] & 0xfe) == 0xc0) /* overlong? */
+ return s;
+ else
+ s += 2;
+ } else if ((s[0] & 0xf0) == 0xe0) {
+ /* 1110XXXX 10Xxxxxx 10xxxxxx */
+ if ((s[1] & 0xc0) != 0x80 ||
+ (s[2] & 0xc0) != 0x80 ||
+ (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) || /* overlong? */
+ (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) || /* surrogate? */
+ (s[0] == 0xef && s[1] == 0xbf &&
+ (s[2] & 0xfe) == 0xbe)) /* U+FFFE or U+FFFF? */
+ return s;
+ else
+ s += 3;
+ } else if ((s[0] & 0xf8) == 0xf0) {
+ /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
+ if ((s[1] & 0xc0) != 0x80 ||
+ (s[2] & 0xc0) != 0x80 ||
+ (s[3] & 0xc0) != 0x80 ||
+ (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) || /* overlong? */
+ (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) /* > U+10FFFF? */
+ return s;
+ else
+ s += 4;
+ } else
+ return s;
+ }
+
+ return NULL;
+}
|