summaryrefslogtreecommitdiff
path: root/src/stringprep/uni_parse2.tcl
diff options
context:
space:
mode:
Diffstat (limited to 'src/stringprep/uni_parse2.tcl')
-rw-r--r--src/stringprep/uni_parse2.tcl737
1 files changed, 737 insertions, 0 deletions
diff --git a/src/stringprep/uni_parse2.tcl b/src/stringprep/uni_parse2.tcl
new file mode 100644
index 00000000..456b9d13
--- /dev/null
+++ b/src/stringprep/uni_parse2.tcl
@@ -0,0 +1,737 @@
+# uni_parse2.tcl --
+#
+# This program parses the UnicodeData file and generates the
+# corresponding uni_norm.c file with compressed character
+# data tables. The input to this program should be the latest
+# UnicodeData.txt and CompositionExclusions.txt files from:
+# ftp://ftp.unicode.org/Public/UNIDATA/
+#
+# Copyright (c) 1998-1999 by Scriptics Corporation.
+# All rights reserved.
+#
+# Modified for ejabberd by Alexey Shchepin
+#
+# RCS: @(#) $Id$
+
+
+namespace eval uni {
+ set cclass_shift 6
+ set decomp_shift 5
+ set comp_shift 5
+ set shift 5; # number of bits of data within a page
+ # This value can be adjusted to find the
+ # best split to minimize table size
+
+ variable pMap; # map from page to page index, each entry is
+ # an index into the pages table, indexed by
+ # page number
+ variable pages; # map from page index to page info, each
+ # entry is a list of indices into the groups
+ # table, the list is indexed by the offset
+ variable groups; # list of character info values, indexed by
+ # group number, initialized with the
+ # unassigned character group
+
+ variable categories {
+ Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp
+ Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
+ }; # Ordered list of character categories, must
+ # match the enumeration in the header file.
+
+ variable titleCount 0; # Count of the number of title case
+ # characters. This value is used in the
+ # regular expression code to allocate enough
+ # space for the title case variants.
+}
+
+proc uni::getValue {items index} {
+ variable categories
+ variable titleCount
+
+ # Extract character info
+
+ set category [lindex $items 2]
+ if {[scan [lindex $items 12] %4x toupper] == 1} {
+ set toupper [expr {$index - $toupper}]
+ } else {
+ set toupper {}
+ }
+ if {[scan [lindex $items 13] %4x tolower] == 1} {
+ set tolower [expr {$tolower - $index}]
+ } else {
+ set tolower {}
+ }
+ if {[scan [lindex $items 14] %4x totitle] == 1} {
+ set totitle [expr {$index - $totitle}]
+ } else {
+ set totitle {}
+ }
+
+ set categoryIndex [lsearch -exact $categories $category]
+ if {$categoryIndex < 0} {
+ puts "Unexpected character category: $index($category)"
+ set categoryIndex 0
+ } elseif {$category == "Lt"} {
+ incr titleCount
+ }
+
+ return "$categoryIndex,$toupper,$tolower,$totitle"
+}
+
+proc uni::getGroup {value} {
+ variable groups
+
+ set gIndex [lsearch -exact $groups $value]
+ if {$gIndex == -1} {
+ set gIndex [llength $groups]
+ lappend groups $value
+ }
+ return $gIndex
+}
+
+proc uni::addPage {info} {
+ variable pMap
+ variable pages
+
+ set pIndex [lsearch -exact $pages $info]
+ if {$pIndex == -1} {
+ set pIndex [llength $pages]
+ lappend pages $info
+ }
+ lappend pMap $pIndex
+ return
+}
+
+proc uni::addPage {map_var pages_var info} {
+ variable $map_var
+ variable $pages_var
+
+ set pIndex [lsearch -exact [set $pages_var] $info]
+ if {$pIndex == -1} {
+ set pIndex [llength [set $pages_var]]
+ lappend $pages_var $info
+ }
+ lappend $map_var $pIndex
+ return
+}
+
+proc uni::load_exclusions {data} {
+ variable exclusions
+
+ foreach line [split $data \n] {
+ if {$line == ""} continue
+
+ set items [split $line " "]
+
+ if {[lindex $items 0] == "#"} continue
+
+ scan [lindex $items 0] %x index
+
+ set exclusions($index) ""
+ }
+}
+
+proc uni::load_tables {data} {
+ variable cclass_map
+ variable decomp_map
+ variable comp_map
+ variable comp_first
+ variable comp_second
+ variable exclusions
+
+ foreach line [split $data \n] {
+ if {$line == ""} continue
+
+ set items [split $line \;]
+
+ scan [lindex $items 0] %x index
+ set cclass [lindex $items 3]
+ set decomp [lindex $items 5]
+
+ set cclass_map($index) $cclass
+ #set decomp_map($index) $cclass
+
+ if {$decomp != ""} {
+ if {[string index [lindex $decomp 0] 0] == "<"} {
+ set decomp1 [lreplace $decomp 0 0]
+ set decomp {}
+ foreach ch $decomp1 {
+ scan $ch %x ch
+ lappend decomp $ch
+ }
+ set decomp_map($index) $decomp
+ } else {
+ switch -- [llength $decomp] {
+ 1 {
+ scan $decomp %x ch
+ set decomp_map($index) $ch
+ }
+ 2 {
+ scan $decomp "%x %x" ch1 ch2
+ set decomp [list $ch1 $ch2]
+ set decomp_map($index) $decomp
+ # hackish
+ if {(![info exists cclass_map($ch1)] || \
+ $cclass_map($ch1) == 0) && \
+ ![info exists exclusions($index)]} {
+ if {[info exists comp_first($ch1)]} {
+ incr comp_first($ch1)
+ } else {
+ set comp_first($ch1) 1
+ }
+ if {[info exists comp_second($ch2)]} {
+ incr comp_second($ch2)
+ } else {
+ set comp_second($ch2) 1
+ }
+ set comp_map($decomp) $index
+ } else {
+ puts "Excluded $index"
+ }
+ }
+ default {
+ puts "Bad canonical decomposition: $line"
+ }
+ }
+ }
+
+ #puts "[format 0x%0.4x $index]\t$cclass\t$decomp_map($index)"
+ }
+ }
+ #puts [array get comp_first]
+ #puts [array get comp_second]
+}
+
+proc uni::buildTables {} {
+ variable cclass_shift
+ variable decomp_shift
+ variable comp_shift
+
+ variable cclass_map
+ variable cclass_pmap {}
+ variable cclass_pages {}
+ variable decomp_map
+ variable decomp_pmap {}
+ variable decomp_pages {}
+ variable decomp_list {}
+ variable comp_map
+ variable comp_pmap {}
+ variable comp_pages {}
+ variable comp_first
+ variable comp_second
+ variable comp_first_list {}
+ variable comp_second_list {}
+ variable comp_x_list {}
+ variable comp_y_list {}
+ variable comp_both_map {}
+
+ set cclass_info {}
+ set decomp_info {}
+ set comp_info {}
+
+ set cclass_mask [expr {(1 << $cclass_shift) - 1}]
+ set decomp_mask [expr {(1 << $decomp_shift) - 1}]
+ set comp_mask [expr {(1 << $comp_shift) - 1}]
+
+ foreach comp [array names comp_map] {
+ set ch1 [lindex $comp 0]
+ set ch2 [lindex $comp 1]
+
+ if {$comp_first($ch1) == 1} {
+ set i [llength $comp_first_list]
+ lappend comp_first_list [list $ch2 $comp_map($comp)]
+ set comp_info_map($ch1) [expr {$i | (1 << 16)}]
+ } elseif {$comp_second($ch2) == 1} {
+ set i [llength $comp_second_list]
+ lappend comp_second_list [list $ch1 $comp_map($comp)]
+ set comp_info_map($ch2) [expr {$i | (1 << 16)}]
+ } else {
+ if {[lsearch -exact $comp_x_list $ch1] < 0} {
+ set i [llength $comp_x_list]
+ lappend comp_x_list $ch1
+ set comp_info_map($ch1) $i
+ }
+ if {[lsearch -exact $comp_y_list $ch2] < 0} {
+ set i [llength $comp_y_list]
+ lappend comp_y_list $ch2
+ set comp_info_map($ch2) $i
+ }
+ }
+ }
+
+ set next 0
+
+ for {set i 0} {$i <= 0xffff} {incr i} {
+ #set gIndex [getGroup [getValue $i]]
+
+ set cclass_offset [expr {$i & $cclass_mask}]
+
+ if {[info exists cclass_map($i)]} {
+ set cclass $cclass_map($i)
+ } else {
+ set cclass 0
+ }
+ lappend cclass_info $cclass
+
+ if {$cclass_offset == $cclass_mask} {
+ addPage cclass_pmap cclass_pages $cclass_info
+ set cclass_info {}
+ }
+
+
+ set decomp_offset [expr {$i & $decomp_mask}]
+
+ if {[info exists decomp_map($i)]} {
+ set decomp $decomp_map($i)
+ if {[info exists decomp_used($decomp)]} {
+ lappend decomp_info $decomp_used($decomp)
+ } else {
+ set val [expr {([llength $decomp] << 16) + \
+ [llength $decomp_list]}]
+ #set val [expr {[llength $decomp_list]}]
+ lappend decomp_info $val
+ set decomp_used($decomp) $val
+ #puts "$val $decomp"
+ foreach d $decomp {
+ lappend decomp_list $d
+ }
+ }
+ } else {
+ lappend decomp_info -1
+ }
+
+ if {$decomp_offset == $decomp_mask} {
+ addPage decomp_pmap decomp_pages $decomp_info
+ set decomp_info {}
+ }
+
+
+ set comp_offset [expr {$i & $comp_mask}]
+
+ if {[info exists comp_info_map($i)]} {
+ set comp $comp_info_map($i)
+ } else {
+ set comp -1
+ }
+ lappend comp_info $comp
+
+ if {$comp_offset == $comp_mask} {
+ addPage comp_pmap comp_pages $comp_info
+ set comp_info {}
+ }
+ }
+
+ #puts [array get decomp_map]
+ #puts $decomp_list
+
+ return
+}
+
+proc uni::main {} {
+ global argc argv0 argv
+ variable cclass_shift
+ variable cclass_pmap
+ variable cclass_pages
+ variable decomp_shift
+ variable decomp_pmap
+ variable decomp_pages
+ variable decomp_list
+ variable comp_shift
+ variable comp_map
+ variable comp_pmap
+ variable comp_pages
+ variable comp_first_list
+ variable comp_second_list
+ variable comp_x_list
+ variable comp_y_list
+ variable pages
+ variable groups {}
+ variable titleCount
+
+ if {$argc != 3} {
+ puts stderr "\nusage: $argv0 <datafile> <exclusionsfile> <outdir>\n"
+ exit 1
+ }
+ set f [open [lindex $argv 1] r]
+ set data [read $f]
+ close $f
+
+ load_exclusions $data
+
+ set f [open [lindex $argv 0] r]
+ set data [read $f]
+ close $f
+
+ load_tables $data
+ buildTables
+ #puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]"
+ #set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}]
+ #puts "shift = 6, space = $size"
+ #puts "title case count = $titleCount"
+
+ set f [open [file join [lindex $argv 2] uni_norm.c] w]
+ fconfigure $f -translation lf
+ puts $f "/*
+ * uni_norm.c --
+ *
+ * Declarations of Unicode character information tables. This file is
+ * automatically generated by the uni_parse2.tcl script. Do not
+ * modify this file by hand.
+ *
+ * Copyright (c) 1998 by Scriptics Corporation.
+ * All rights reserved.
+ *
+ * Modified for ejabberd by Alexey Shchepin
+ *
+ * RCS: @(#) \$Id\$
+ */
+
+/*
+ * A 16-bit Unicode character is split into two parts in order to index
+ * into the following tables. The lower CCLASS_OFFSET_BITS comprise an offset
+ * into a page of characters. The upper bits comprise the page number.
+ */
+
+#define CCLASS_OFFSET_BITS $cclass_shift
+
+/*
+ * The pageMap is indexed by page number and returns an alternate page number
+ * that identifies a unique page of characters. Many Unicode characters map
+ * to the same alternate page number.
+ */
+
+static unsigned char cclassPageMap\[\] = {"
+ set line " "
+ set last [expr {[llength $cclass_pmap] - 1}]
+ for {set i 0} {$i <= $last} {incr i} {
+ append line [lindex $cclass_pmap $i]
+ if {$i != $last} {
+ append line ", "
+ }
+ if {[string length $line] > 70} {
+ puts $f $line
+ set line " "
+ }
+ }
+ puts $f $line
+ puts $f "};
+
+/*
+ * The groupMap is indexed by combining the alternate page number with
+ * the page offset and returns a group number that identifies a unique
+ * set of character attributes.
+ */
+
+static unsigned char cclassGroupMap\[\] = {"
+ set line " "
+ set lasti [expr {[llength $cclass_pages] - 1}]
+ for {set i 0} {$i <= $lasti} {incr i} {
+ set page [lindex $cclass_pages $i]
+ set lastj [expr {[llength $page] - 1}]
+ for {set j 0} {$j <= $lastj} {incr j} {
+ append line [lindex $page $j]
+ if {$j != $lastj || $i != $lasti} {
+ append line ", "
+ }
+ if {[string length $line] > 70} {
+ puts $f $line
+ set line " "
+ }
+ }
+ }
+ puts $f $line
+ puts $f "};
+
+/*
+ * Each group represents a unique set of character attributes. The attributes
+ * are encoded into a 32-bit value as follows:
+ *
+ * Bits 0-4 Character category: see the constants listed below.
+ *
+ * Bits 5-7 Case delta type: 000 = identity
+ * 010 = add delta for lower
+ * 011 = add delta for lower, add 1 for title
+ * 100 = sutract delta for title/upper
+ * 101 = sub delta for upper, sub 1 for title
+ * 110 = sub delta for upper, add delta for lower
+ *
+ * Bits 8-21 Reserved for future use.
+ *
+ * Bits 22-31 Case delta: delta for case conversions. This should be the
+ * highest field so we can easily sign extend.
+ */
+
+static int cclass_groups\[\] = {"
+ set line " "
+ set last [expr {[llength $groups] - 1}]
+ for {set i 0} {$i <= $last} {incr i} {
+ foreach {type toupper tolower totitle} [split [lindex $groups $i] ,] {}
+
+ # Compute the case conversion type and delta
+
+ if {$totitle != ""} {
+ if {$totitle == $toupper} {
+ # subtract delta for title or upper
+ set case 4
+ set delta $toupper
+ } elseif {$toupper != ""} {
+ # subtract delta for upper, subtract 1 for title
+ set case 5
+ set delta $toupper
+ } else {
+ # add delta for lower, add 1 for title
+ set case 3
+ set delta $tolower
+ }
+ } elseif {$toupper != ""} {
+ # subtract delta for upper, add delta for lower
+ set case 6
+ set delta $toupper
+ } elseif {$tolower != ""} {
+ # add delta for lower
+ set case 2
+ set delta $tolower
+ } else {
+ # noop
+ set case 0
+ set delta 0
+ }
+
+ set val [expr {($delta << 22) | ($case << 5) | $type}]
+
+ append line [format "%d" $val]
+ if {$i != $last} {
+ append line ", "
+ }
+ if {[string length $line] > 65} {
+ puts $f $line
+ set line " "
+ }
+ }
+ puts $f $line
+ puts $f "};
+
+#define GetUniCharCClass(ch) (cclassGroupMap\[(cclassPageMap\[(((int)(ch)) & 0xffff) >> CCLASS_OFFSET_BITS\] << CCLASS_OFFSET_BITS) | ((ch) & ((1 << CCLASS_OFFSET_BITS)-1))\])
+
+
+#define DECOMP_OFFSET_BITS $decomp_shift
+
+/*
+ * The pageMap is indexed by page number and returns an alternate page number
+ * that identifies a unique page of characters. Many Unicode characters map
+ * to the same alternate page number.
+ */
+
+static unsigned char decompPageMap\[\] = {"
+ set line " "
+ set last [expr {[llength $decomp_pmap] - 1}]
+ for {set i 0} {$i <= $last} {incr i} {
+ append line [lindex $decomp_pmap $i]
+ if {$i != $last} {
+ append line ", "
+ }
+ if {[string length $line] > 70} {
+ puts $f $line
+ set line " "
+ }
+ }
+ puts $f $line
+ puts $f "};
+
+/*
+ * The groupMap is indexed by combining the alternate page number with
+ * the page offset and returns a group number that identifies a unique
+ * set of character attributes.
+ */
+
+static int decompGroupMap\[\] = {"
+ set line " "
+ set lasti [expr {[llength $decomp_pages] - 1}]
+ for {set i 0} {$i <= $lasti} {incr i} {
+ set page [lindex $decomp_pages $i]
+ set lastj [expr {[llength $page] - 1}]
+ for {set j 0} {$j <= $lastj} {incr j} {
+ append line [lindex $page $j]
+ if {$j != $lastj || $i != $lasti} {
+ append line ", "
+ }
+ if {[string length $line] > 70} {
+ puts $f $line
+ set line " "
+ }
+ }
+ }
+ puts $f $line
+ puts $f "};
+
+/*
+ * Each group represents a unique set of character attributes. The attributes...
+ */
+
+static int decompList\[\] = {"
+ set line " "
+ set last [expr {[llength $decomp_list] - 1}]
+ for {set i 0} {$i <= $last} {incr i} {
+ set val [lindex $decomp_list $i]
+
+ append line [format "%d" $val]
+ if {$i != $last} {
+ append line ", "
+ }
+ if {[string length $line] > 70} {
+ puts $f $line
+ set line " "
+ }
+ }
+ puts $f $line
+ puts $f "};
+
+
+/*
+ * This macro extracts the information about a character from the
+ * Unicode character tables.
+ */
+
+//#define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> CCLASS_OFFSET_BITS\] << CCLASS_OFFSET_BITS) | ((ch) & ((1 << CCLASS_OFFSET_BITS)-1))\]\])
+
+#define GetUniCharDecompInfo(ch) (decompGroupMap\[(decompPageMap\[(((int)(ch)) & 0xffff) >> DECOMP_OFFSET_BITS\] << DECOMP_OFFSET_BITS) | ((ch) & ((1 << DECOMP_OFFSET_BITS)-1))\])
+
+#define GetDecompShift(info) ((info) & 0xffff)
+#define GetDecompLen(info) ((info) >> 16)
+
+
+#define COMP_OFFSET_BITS $comp_shift
+
+/*
+ * The pageMap is indexed by page number and returns an alternate page number
+ * that identifies a unique page of characters. Many Unicode characters map
+ * to the same alternate page number.
+ */
+
+static unsigned char compPageMap\[\] = {"
+ set line " "
+ set last [expr {[llength $comp_pmap] - 1}]
+ for {set i 0} {$i <= $last} {incr i} {
+ append line [lindex $comp_pmap $i]
+ if {$i != $last} {
+ append line ", "
+ }
+ if {[string length $line] > 70} {
+ puts $f $line
+ set line " "
+ }
+ }
+ puts $f $line
+ puts $f "};
+
+/*
+ * The groupMap is indexed by combining the alternate page number with
+ * the page offset and returns a group number that identifies a unique
+ * set of character attributes.
+ */
+
+static int compGroupMap\[\] = {"
+ set line " "
+ set lasti [expr {[llength $comp_pages] - 1}]
+ for {set i 0} {$i <= $lasti} {incr i} {
+ set page [lindex $comp_pages $i]
+ set lastj [expr {[llength $page] - 1}]
+ for {set j 0} {$j <= $lastj} {incr j} {
+ append line [lindex $page $j]
+ if {$j != $lastj || $i != $lasti} {
+ append line ", "
+ }
+ if {[string length $line] > 70} {
+ puts $f $line
+ set line " "
+ }
+ }
+ }
+ puts $f $line
+ puts $f "};
+
+/*
+ * ...
+ */
+
+static int compFirstList\[\]\[2\] = {"
+ set line " "
+ set last [expr {[llength $comp_first_list] - 1}]
+ for {set i 0} {$i <= $last} {incr i} {
+ set val [lindex $comp_first_list $i]
+
+ append line [format "{%d, %d}" [lindex $val 0] [lindex $val 1]]
+ if {$i != $last} {
+ append line ", "
+ }
+ if {[string length $line] > 60} {
+ puts $f $line
+ set line " "
+ }
+ }
+ puts $f $line
+ puts $f "};
+
+static int compSecondList\[\]\[2\] = {"
+ set line " "
+ set last [expr {[llength $comp_second_list] - 1}]
+ for {set i 0} {$i <= $last} {incr i} {
+ set val [lindex $comp_second_list $i]
+
+ append line [format "{%d, %d}" [lindex $val 0] [lindex $val 1]]
+ if {$i != $last} {
+ append line ", "
+ }
+ if {[string length $line] > 60} {
+ puts $f $line
+ set line " "
+ }
+ }
+ puts $f $line
+ puts $f "};
+
+static int compBothList\[[llength $comp_x_list]\]\[[llength $comp_y_list]\] = {"
+ set lastx [expr {[llength $comp_x_list] - 1}]
+ set lasty [expr {[llength $comp_y_list] - 1}]
+ for {set i 0} {$i <= $lastx} {incr i} {
+ puts $f " \{"
+ set line " "
+ for {set j 0} {$j <= $lasty} {incr j} {
+ set comp [list [lindex $comp_x_list $i] [lindex $comp_y_list $j]]
+ if {[info exists comp_map($comp)]} {
+ set val $comp_map($comp)
+ } else {
+ set val 0
+ }
+
+ append line [format "%d" $val]
+ if {$j != $lasty} {
+ append line ", "
+ }
+ if {[string length $line] > 70} {
+ puts $f $line
+ set line " "
+ }
+ }
+ puts $f $line
+ if {$j != $lasty} {
+ puts $f " \},"
+ } else {
+ puts $f " \}"
+ }
+ }
+ puts $f "};
+
+
+#define GetUniCharCompInfo(ch) (compGroupMap\[(compPageMap\[(((int)(ch)) & 0xffff) >> COMP_OFFSET_BITS\] << COMP_OFFSET_BITS) | ((ch) & ((1 << COMP_OFFSET_BITS)-1))\])
+
+#define CompSingleMask (1 << 16)
+#define CompMask ((1 << 16) - 1)
+"
+
+ close $f
+}
+
+uni::main
+
+return