1 files changed, 362 insertions, 0 deletions
diff --git a/src/stringprep/uni_parse.tcl b/src/stringprep/uni_parse.tcl
new file mode 100644
index 00000000..7d6458f2
--- /dev/null
+++ b/src/stringprep/uni_parse.tcl
@@ -0,0 +1,362 @@
+# uni_parse.tcl --
+#
+#	This program parses the UnicodeData file and generates the
+#	corresponding uni_data.c file with compressed character
+#	data tables.  The input to this program should be rfc3454.txt
+#
+# Copyright (c) 1998-1999 by Scriptics Corporation.
+# All rights reserved.
+#
+# Modified for ejabberd by Alexey Shchepin
+# 
+# RCS: @(#) $Id$
+
+
+namespace eval uni {
+    set shift 5;		# number of bits of data within a page
+				# This value can be adjusted to find the
+				# best split to minimize table size
+
+    variable pMap;		# map from page to page index, each entry is
+				# an index into the pages table, indexed by
+				# page number
+    variable pages;		# map from page index to page info, each
+				# entry is a list of indices into the groups
+				# table, the list is indexed by the offset
+    variable groups;		# list of character info values, indexed by
+				# group number, initialized with the
+				# unassigned character group
+}
+
+proc uni::getValue {tables delta} {
+    set ac 0
+    set c11 0
+    set c21 0
+    set b1 0
+    set d1 0
+    set d2 0
+    set xnp 0
+
+    foreach tab $tables {
+	switch -glob -- $tab {
+	    C.1.1 {set c11 1}
+	    C.2.1 {set c21 1}
+	    C.*   {set ac 1}
+	    A.1   {set ac 1}
+	    B.1   {set b1 1}
+	    D.1   {set d1 1}
+	    D.2   {set d2 1}
+	    XNP   {set xnp 1}
+	}
+    }
+
+    set val [expr {($ac  << 0) |
+		   ($c11 << 1) |
+		   ($c21 << 2) |
+		   ($b1  << 3) |
+		   ($d1  << 4) |
+		   ($d2  << 5) |
+		   ($xnp << 6) |
+		   ($delta << 16)}]
+
+    return $val
+}
+
+proc uni::getGroup {value} {
+    variable groups
+
+    set gIndex [lsearch -exact $groups $value]
+    if {$gIndex == -1} {
+	set gIndex [llength $groups]
+	lappend groups $value
+    }
+    return $gIndex
+}
+
+proc uni::addPage {info} {
+    variable pMap
+    variable pages
+    
+    set pIndex [lsearch -exact $pages $info]
+    if {$pIndex == -1} {
+	set pIndex [llength $pages]
+	lappend pages $info
+    }
+    lappend pMap $pIndex
+    return
+}
+
+proc uni::load_tables {data} {
+    variable casemap
+    variable tablemap
+
+    for {set i 0} {$i <= 0xffff} {incr i} {
+	set casemap($i) 0
+	set tablemap($i) {}
+    }
+
+    set table ""
+
+    foreach line [split $data \n] {
+	if {$table == ""} {
+	    if {[regexp {   ----- Start Table (.*) -----} $line temp table]} {
+		#puts "Start table '$table'"
+	    }
+	} else {
+	    if {[regexp {   ----- End Table (.*) -----} $line temp table1]} {
+		set table ""
+	    } else {
+		if {$table == "B.1"} {
+		    if {[regexp {^   ([[:xdigit:]]+); ;} $line \
+			     temp val]} {
+			scan $val %x val
+			if {$val <= 0xffff} {
+			    lappend tablemap($val) $table
+			}
+		    }
+		} elseif {$table == "B.3"} {
+		    if {[regexp {^   ([[:xdigit:]]+); ([[:xdigit:]]+);} $line \
+			     temp from to]} {
+			scan $from %x from
+			scan $to %x to
+			if {$from <= 0xffff && $to <= 0xffff} {
+			    set casemap($from) [expr {$to - $from}]
+			}
+		    } else {
+			# TODO
+		    }
+		    
+		} elseif {$table != "B.2"} {
+		    if {[regexp {^   ([[:xdigit:]]+)-([[:xdigit:]]+)} $line \
+			     temp from to]} {
+			scan $from %x from
+			scan $to %x to
+			for {set i $from} {$i <= $to && $i <= 0xffff} {incr i} {
+			    lappend tablemap($i) $table
+			}
+		    } elseif {[regexp {^   ([[:xdigit:]]+)} $line \
+			     temp val]} {
+			scan $val %x val
+			if {$val <= 0xffff} {
+			    lappend tablemap($val) $table
+			}
+		    }
+		}
+	    }
+	}
+    }
+
+    # XMPP nodeprep prohibited
+    foreach val {22 26 27 2f 3a 3c 3e 40} {
+	scan $val %x val
+	lappend tablemap($val) XNP
+    }
+}
+
+proc uni::buildTables {} {
+    variable shift
+
+    variable casemap
+    variable tablemap
+
+    variable pMap {}
+    variable pages {}
+    variable groups {}
+    set info {}			;# temporary page info
+    
+    set mask [expr {(1 << $shift) - 1}]
+
+    set next 0
+
+    for {set i 0} {$i <= 0xffff} {incr i} {
+	set gIndex [getGroup [getValue $tablemap($i) $casemap($i)]]
+
+	# Split character index into offset and page number
+	set offset [expr {$i & $mask}]
+	set page [expr {($i >> $shift)}]
+
+	# Add the group index to the info for the current page
+	lappend info $gIndex
+
+	# If this is the last entry in the page, add the page
+	if {$offset == $mask} {
+	    addPage $info
+	    set info {}
+	}
+    }
+    return
+}
+
+proc uni::main {} {
+    global argc argv0 argv
+    variable pMap
+    variable pages
+    variable groups
+    variable shift
+
+    if {$argc != 2} {
+	puts stderr "\nusage: $argv0 <datafile> <outdir>\n"
+	exit 1
+    }
+    set f [open [lindex $argv 0] r]
+    set data [read $f]
+    close $f
+
+    load_tables $data
+    buildTables
+    puts "X = [llength $pMap]  Y= [llength $pages]  A= [llength $groups]"
+    set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}]
+    puts "shift = 6, space = $size"
+
+    set f [open [file join [lindex $argv 1] uni_data.c] w]
+    fconfigure $f -translation lf
+    puts $f "/*
+ * uni_data.c --
+ *
+ *	Declarations of Unicode character information tables.  This file is
+ *	automatically generated by the uni_parse.tcl script.  Do not
+ *	modify this file by hand.
+ *
+ * Copyright (c) 1998 by Scriptics Corporation.
+ * All rights reserved.
+ *
+ * Modified for ejabberd by Alexey Shchepin
+ *
+ * RCS: @(#) \$Id\$
+ */
+
+/*
+ * A 16-bit Unicode character is split into two parts in order to index
+ * into the following tables.  The lower OFFSET_BITS comprise an offset
+ * into a page of characters.  The upper bits comprise the page number.
+ */
+
+#define OFFSET_BITS $shift
+
+/*
+ * The pageMap is indexed by page number and returns an alternate page number
+ * that identifies a unique page of characters.  Many Unicode characters map
+ * to the same alternate page number.
+ */
+
+static unsigned char pageMap\[\] = {"
+    set line "    "
+    set last [expr {[llength $pMap] - 1}]
+    for {set i 0} {$i <= $last} {incr i} {
+	append line [lindex $pMap $i]
+	if {$i != $last} {
+	    append line ", "
+	}
+	if {[string length $line] > 70} {
+	    puts $f $line
+	    set line "    "
+	}
+    }
+    puts $f $line
+    puts $f "};
+
+/*
+ * The groupMap is indexed by combining the alternate page number with
+ * the page offset and returns a group number that identifies a unique
+ * set of character attributes.
+ */
+
+static unsigned char groupMap\[\] = {"
+    set line "    "
+    set lasti [expr {[llength $pages] - 1}]
+    for {set i 0} {$i <= $lasti} {incr i} {
+	set page [lindex $pages $i]
+	set lastj [expr {[llength $page] - 1}]
+	for {set j 0} {$j <= $lastj} {incr j} {
+	    append line [lindex $page $j]
+	    if {$j != $lastj || $i != $lasti} {
+		append line ", "
+	    }
+	    if {[string length $line] > 70} {
+		puts $f $line
+		set line "    "
+	    }
+	}
+    }
+    puts $f $line
+    puts $f "};
+
+/*
+ * Each group represents a unique set of character attributes.  The attributes
+ * are encoded into a 32-bit value as follows:
+ *
+ * Bit  0	A.1 | C.1.2 | C.2.2 | C.3 -- C.9
+ *
+ * Bit  1	C.1.1
+ *
+ * Bit  2	C.2.1
+ *
+ * Bit  3	B.1
+ *
+ * Bit  4	B.1
+ *
+ * Bit  5	D.1
+ *
+ * Bit  6	D.2
+ *
+ * Bits 7-15	Reserved for future use.
+ *
+ * Bits 16-31	Case delta: delta for case conversions.  This should be the
+ *			    highest field so we can easily sign extend.
+ */
+
+static int groups\[\] = {"
+    set line "    "
+    set last [expr {[llength $groups] - 1}]
+    for {set i 0} {$i <= $last} {incr i} {
+	set val [lindex $groups $i]
+
+	append line [format "%d" $val]
+	if {$i != $last} {
+	    append line ", "
+	}
+	if {[string length $line] > 65} {
+	    puts $f $line
+	    set line "    "
+	}
+    }
+    puts $f $line
+    puts $f "};
+
+/*
+ * The following constants are used to determine the category of a
+ * Unicode character.
+ */
+
+#define ACMask  (1 << 0)
+#define C11Mask (1 << 1)
+#define C21Mask (1 << 2)
+#define B1Mask  (1 << 3)
+#define D1Mask  (1 << 4)
+#define D2Mask  (1 << 5)
+#define XNPMask (1 << 6)
+
+/*
+ * The following macros extract the fields of the character info.  The
+ * GetDelta() macro is complicated because we can't rely on the C compiler
+ * to do sign extension on right shifts.
+ */
+
+#define GetCaseType(info) (((info) & 0xE0) >> 5)
+#define GetCategory(info) ((info) & 0x1F)
+#define GetDelta(info) (((info) > 0) ? ((info) >> 16) : (~(~((info)) >> 16)))
+
+/*
+ * This macro extracts the information about a character from the
+ * Unicode character tables.
+ */
+
+#define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> OFFSET_BITS\] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))\]\])
+"
+
+    close $f
+}
+
+uni::main
+
+return