x11/libxklavier/files/patch-utf8.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64

--- libxklavier/utf8.c	1970-01-01 03:00:00.000000000 +0300
+++ libxklavier/utf8.c	2015-11-18 01:50:50.406110000 +0300
@@ -0,0 +1,61 @@
+/*
+ * The utf8_check() function scans the '\0'-terminated string starting
+ * at s. It returns a pointer to the first byte of the first malformed
+ * or overlong UTF-8 sequence found, or NULL if the string contains
+ * only correct UTF-8. It also spots UTF-8 sequences that could cause
+ * trouble if converted to UTF-16, namely surrogate characters
+ * (U+D800..U+DFFF) and non-Unicode positions (U+FFFE..U+FFFF). This
+ * routine is very likely to find a malformed sequence if the input
+ * uses any other encoding than UTF-8. It therefore can be used as a
+ * very effective heuristic for distinguishing between UTF-8 and other
+ * encodings.
+ *
+ * I wrote this code mainly as a specification of functionality; there
+ * are no doubt performance optimizations possible for certain CPUs.
+ *
+ * Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> -- 2005-03-30
+ * License: http://www.cl.cam.ac.uk/~mgk25/short-license.html
+ */
+
+
+guchar *	/* first byte of the first bad sequence, or NULL if none */
+utf8_check(guchar *s)	/* s: '\0'-terminated byte string to validate */
+{
+	while (*s) {	/* each pass validates one sequence starting at s */
+		if (*s < 0x80)
+			/* 0xxxxxxx: single-byte sequence, nothing more to check */
+			s++;
+		else if ((s[0] & 0xe0) == 0xc0) {
+			/* 110XXXXx 10xxxxxx: two bytes; an X bit must be set */
+			if ((s[1] & 0xc0) != 0x80 ||	/* not a continuation byte (true for '\0' too) */
+					(s[0] & 0xfe) == 0xc0)          /* overlong? (lead byte 0xc0 or 0xc1) */
+				return s;
+			else
+				s += 2;
+		} else if ((s[0] & 0xf0) == 0xe0) {
+			/* 1110XXXX 10Xxxxxx 10xxxxxx: three bytes; an X bit must be set */
+			if ((s[1] & 0xc0) != 0x80 ||	/* continuation tests go first: a '\0' fails */
+					(s[2] & 0xc0) != 0x80 ||	/* them, so || never reads past the terminator */
+					(s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) ||  /* overlong? (below U+0800) */
+					(s[0] == 0xed && (s[1] & 0xe0) == 0xa0) ||  /* surrogate? (U+D800..U+DFFF) */
+					(s[0] == 0xef && s[1] == 0xbf &&
+					 (s[2] & 0xfe) == 0xbe))            /* U+FFFE or U+FFFF? */
+				return s;
+			else
+				s += 3;
+		} else if ((s[0] & 0xf8) == 0xf0) {
+			/* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx: four bytes; an X bit must be set */
+			if ((s[1] & 0xc0) != 0x80 ||
+					(s[2] & 0xc0) != 0x80 ||
+					(s[3] & 0xc0) != 0x80 ||
+					(s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) ||    /* overlong? (below U+10000) */
+					(s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) /* > U+10FFFF? */
+				return s;
+			else
+				s += 4;
+		} else
+			return s;	/* stray continuation byte (10xxxxxx) or lead byte 0xf8..0xff */
+	}
+
+	return NULL;	/* hit the terminating '\0' with no bad sequence found */
+}