diff options
Diffstat (limited to 'src/stringprep/uni_parse2.tcl')
-rw-r--r-- | src/stringprep/uni_parse2.tcl | 737 |
1 files changed, 737 insertions, 0 deletions
diff --git a/src/stringprep/uni_parse2.tcl b/src/stringprep/uni_parse2.tcl new file mode 100644 index 00000000..456b9d13 --- /dev/null +++ b/src/stringprep/uni_parse2.tcl @@ -0,0 +1,737 @@ +# uni_parse2.tcl -- +# +# This program parses the UnicodeData file and generates the +# corresponding uni_norm.c file with compressed character +# data tables. The input to this program should be the latest +# UnicodeData.txt and CompositionExclusions.txt files from: +# ftp://ftp.unicode.org/Public/UNIDATA/ +# +# Copyright (c) 1998-1999 by Scriptics Corporation. +# All rights reserved. +# +# Modified for ejabberd by Alexey Shchepin +# +# RCS: @(#) $Id$ + + +namespace eval uni { + set cclass_shift 6 + set decomp_shift 5 + set comp_shift 5 + set shift 5; # number of bits of data within a page + # This value can be adjusted to find the + # best split to minimize table size + + variable pMap; # map from page to page index, each entry is + # an index into the pages table, indexed by + # page number + variable pages; # map from page index to page info, each + # entry is a list of indices into the groups + # table, the list is indexed by the offset + variable groups; # list of character info values, indexed by + # group number, initialized with the + # unassigned character group + + variable categories { + Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp + Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So + }; # Ordered list of character categories, must + # match the enumeration in the header file. + + variable titleCount 0; # Count of the number of title case + # characters. This value is used in the + # regular expression code to allocate enough + # space for the title case variants. +} + +proc uni::getValue {items index} { + variable categories + variable titleCount + + # Extract character info + + set category [lindex $items 2] + if {[scan [lindex $items 12] %4x toupper] == 1} { + set toupper [expr {$index - $toupper}] + } else { + set toupper {} + } + if {[scan [lindex $items 13] %4x tolower] == 1} { + set tolower [expr {$tolower - $index}] + } else { + set tolower {} + } + if {[scan [lindex $items 14] %4x totitle] == 1} { + set totitle [expr {$index - $totitle}] + } else { + set totitle {} + } + + set categoryIndex [lsearch -exact $categories $category] + if {$categoryIndex < 0} { + puts "Unexpected character category: $index($category)" + set categoryIndex 0 + } elseif {$category == "Lt"} { + incr titleCount + } + + return "$categoryIndex,$toupper,$tolower,$totitle" +} + +proc uni::getGroup {value} { + variable groups + + set gIndex [lsearch -exact $groups $value] + if {$gIndex == -1} { + set gIndex [llength $groups] + lappend groups $value + } + return $gIndex +} + +proc uni::addPage {info} { + variable pMap + variable pages + + set pIndex [lsearch -exact $pages $info] + if {$pIndex == -1} { + set pIndex [llength $pages] + lappend pages $info + } + lappend pMap $pIndex + return +} + +proc uni::addPage {map_var pages_var info} { + variable $map_var + variable $pages_var + + set pIndex [lsearch -exact [set $pages_var] $info] + if {$pIndex == -1} { + set pIndex [llength [set $pages_var]] + lappend $pages_var $info + } + lappend $map_var $pIndex + return +} + +proc uni::load_exclusions {data} { + variable exclusions + + foreach line [split $data \n] { + if {$line == ""} continue + + set items [split $line " "] + + if {[lindex $items 0] == "#"} continue + + scan [lindex $items 0] %x index + + set exclusions($index) "" + } +} + +proc uni::load_tables {data} { + variable cclass_map + variable decomp_map + variable comp_map + variable comp_first + variable comp_second + variable exclusions + + foreach line [split $data \n] { + if {$line == ""} continue + + set items [split $line \;] + + scan [lindex $items 0] %x index + set cclass [lindex $items 3] + set decomp [lindex $items 5] + + set cclass_map($index) $cclass + #set decomp_map($index) $cclass + + if {$decomp != ""} { + if {[string index [lindex $decomp 0] 0] == "<"} { + set decomp1 [lreplace $decomp 0 0] + set decomp {} + foreach ch $decomp1 { + scan $ch %x ch + lappend decomp $ch + } + set decomp_map($index) $decomp + } else { + switch -- [llength $decomp] { + 1 { + scan $decomp %x ch + set decomp_map($index) $ch + } + 2 { + scan $decomp "%x %x" ch1 ch2 + set decomp [list $ch1 $ch2] + set decomp_map($index) $decomp + # hackish + if {(![info exists cclass_map($ch1)] || \ + $cclass_map($ch1) == 0) && \ + ![info exists exclusions($index)]} { + if {[info exists comp_first($ch1)]} { + incr comp_first($ch1) + } else { + set comp_first($ch1) 1 + } + if {[info exists comp_second($ch2)]} { + incr comp_second($ch2) + } else { + set comp_second($ch2) 1 + } + set comp_map($decomp) $index + } else { + puts "Excluded $index" + } + } + default { + puts "Bad canonical decomposition: $line" + } + } + } + + #puts "[format 0x%0.4x $index]\t$cclass\t$decomp_map($index)" + } + } + #puts [array get comp_first] + #puts [array get comp_second] +} + +proc uni::buildTables {} { + variable cclass_shift + variable decomp_shift + variable comp_shift + + variable cclass_map + variable cclass_pmap {} + variable cclass_pages {} + variable decomp_map + variable decomp_pmap {} + variable decomp_pages {} + variable decomp_list {} + variable comp_map + variable comp_pmap {} + variable comp_pages {} + variable comp_first + variable comp_second + variable comp_first_list {} + variable comp_second_list {} + variable comp_x_list {} + variable comp_y_list {} + variable comp_both_map {} + + set cclass_info {} + set decomp_info {} + set comp_info {} + + set cclass_mask [expr {(1 << $cclass_shift) - 1}] + set decomp_mask [expr {(1 << $decomp_shift) - 1}] + set comp_mask [expr {(1 << $comp_shift) - 1}] + + foreach comp [array names comp_map] { + set ch1 [lindex $comp 0] + set ch2 [lindex $comp 1] + + if {$comp_first($ch1) == 1} { + set i [llength $comp_first_list] + lappend comp_first_list [list $ch2 $comp_map($comp)] + set comp_info_map($ch1) [expr {$i | (1 << 16)}] + } elseif {$comp_second($ch2) == 1} { + set i [llength $comp_second_list] + lappend comp_second_list [list $ch1 $comp_map($comp)] + set comp_info_map($ch2) [expr {$i | (1 << 16)}] + } else { + if {[lsearch -exact $comp_x_list $ch1] < 0} { + set i [llength $comp_x_list] + lappend comp_x_list $ch1 + set comp_info_map($ch1) $i + } + if {[lsearch -exact $comp_y_list $ch2] < 0} { + set i [llength $comp_y_list] + lappend comp_y_list $ch2 + set comp_info_map($ch2) $i + } + } + } + + set next 0 + + for {set i 0} {$i <= 0xffff} {incr i} { + #set gIndex [getGroup [getValue $i]] + + set cclass_offset [expr {$i & $cclass_mask}] + + if {[info exists cclass_map($i)]} { + set cclass $cclass_map($i) + } else { + set cclass 0 + } + lappend cclass_info $cclass + + if {$cclass_offset == $cclass_mask} { + addPage cclass_pmap cclass_pages $cclass_info + set cclass_info {} + } + + + set decomp_offset [expr {$i & $decomp_mask}] + + if {[info exists decomp_map($i)]} { + set decomp $decomp_map($i) + if {[info exists decomp_used($decomp)]} { + lappend decomp_info $decomp_used($decomp) + } else { + set val [expr {([llength $decomp] << 16) + \ + [llength $decomp_list]}] + #set val [expr {[llength $decomp_list]}] + lappend decomp_info $val + set decomp_used($decomp) $val + #puts "$val $decomp" + foreach d $decomp { + lappend decomp_list $d + } + } + } else { + lappend decomp_info -1 + } + + if {$decomp_offset == $decomp_mask} { + addPage decomp_pmap decomp_pages $decomp_info + set decomp_info {} + } + + + set comp_offset [expr {$i & $comp_mask}] + + if {[info exists comp_info_map($i)]} { + set comp $comp_info_map($i) + } else { + set comp -1 + } + lappend comp_info $comp + + if {$comp_offset == $comp_mask} { + addPage comp_pmap comp_pages $comp_info + set comp_info {} + } + } + + #puts [array get decomp_map] + #puts $decomp_list + + return +} + +proc uni::main {} { + global argc argv0 argv + variable cclass_shift + variable cclass_pmap + variable cclass_pages + variable decomp_shift + variable decomp_pmap + variable decomp_pages + variable decomp_list + variable comp_shift + variable comp_map + variable comp_pmap + variable comp_pages + variable comp_first_list + variable comp_second_list + variable comp_x_list + variable comp_y_list + variable pages + variable groups {} + variable titleCount + + if {$argc != 3} { + puts stderr "\nusage: $argv0 <datafile> <exclusionsfile> <outdir>\n" + exit 1 + } + set f [open [lindex $argv 1] r] + set data [read $f] + close $f + + load_exclusions $data + + set f [open [lindex $argv 0] r] + set data [read $f] + close $f + + load_tables $data + buildTables + #puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]" + #set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}] + #puts "shift = 6, space = $size" + #puts "title case count = $titleCount" + + set f [open [file join [lindex $argv 2] uni_norm.c] w] + fconfigure $f -translation lf + puts $f "/* + * uni_norm.c -- + * + * Declarations of Unicode character information tables. This file is + * automatically generated by the uni_parse2.tcl script. Do not + * modify this file by hand. + * + * Copyright (c) 1998 by Scriptics Corporation. + * All rights reserved. + * + * Modified for ejabberd by Alexey Shchepin + * + * RCS: @(#) \$Id\$ + */ + +/* + * A 16-bit Unicode character is split into two parts in order to index + * into the following tables. The lower CCLASS_OFFSET_BITS comprise an offset + * into a page of characters. The upper bits comprise the page number. + */ + +#define CCLASS_OFFSET_BITS $cclass_shift + +/* + * The pageMap is indexed by page number and returns an alternate page number + * that identifies a unique page of characters. Many Unicode characters map + * to the same alternate page number. + */ + +static unsigned char cclassPageMap\[\] = {" + set line " " + set last [expr {[llength $cclass_pmap] - 1}] + for {set i 0} {$i <= $last} {incr i} { + append line [lindex $cclass_pmap $i] + if {$i != $last} { + append line ", " + } + if {[string length $line] > 70} { + puts $f $line + set line " " + } + } + puts $f $line + puts $f "}; + +/* + * The groupMap is indexed by combining the alternate page number with + * the page offset and returns a group number that identifies a unique + * set of character attributes. + */ + +static unsigned char cclassGroupMap\[\] = {" + set line " " + set lasti [expr {[llength $cclass_pages] - 1}] + for {set i 0} {$i <= $lasti} {incr i} { + set page [lindex $cclass_pages $i] + set lastj [expr {[llength $page] - 1}] + for {set j 0} {$j <= $lastj} {incr j} { + append line [lindex $page $j] + if {$j != $lastj || $i != $lasti} { + append line ", " + } + if {[string length $line] > 70} { + puts $f $line + set line " " + } + } + } + puts $f $line + puts $f "}; + +/* + * Each group represents a unique set of character attributes. The attributes + * are encoded into a 32-bit value as follows: + * + * Bits 0-4 Character category: see the constants listed below. + * + * Bits 5-7 Case delta type: 000 = identity + * 010 = add delta for lower + * 011 = add delta for lower, add 1 for title + * 100 = sutract delta for title/upper + * 101 = sub delta for upper, sub 1 for title + * 110 = sub delta for upper, add delta for lower + * + * Bits 8-21 Reserved for future use. + * + * Bits 22-31 Case delta: delta for case conversions. This should be the + * highest field so we can easily sign extend. + */ + +static int cclass_groups\[\] = {" + set line " " + set last [expr {[llength $groups] - 1}] + for {set i 0} {$i <= $last} {incr i} { + foreach {type toupper tolower totitle} [split [lindex $groups $i] ,] {} + + # Compute the case conversion type and delta + + if {$totitle != ""} { + if {$totitle == $toupper} { + # subtract delta for title or upper + set case 4 + set delta $toupper + } elseif {$toupper != ""} { + # subtract delta for upper, subtract 1 for title + set case 5 + set delta $toupper + } else { + # add delta for lower, add 1 for title + set case 3 + set delta $tolower + } + } elseif {$toupper != ""} { + # subtract delta for upper, add delta for lower + set case 6 + set delta $toupper + } elseif {$tolower != ""} { + # add delta for lower + set case 2 + set delta $tolower + } else { + # noop + set case 0 + set delta 0 + } + + set val [expr {($delta << 22) | ($case << 5) | $type}] + + append line [format "%d" $val] + if {$i != $last} { + append line ", " + } + if {[string length $line] > 65} { + puts $f $line + set line " " + } + } + puts $f $line + puts $f "}; + +#define GetUniCharCClass(ch) (cclassGroupMap\[(cclassPageMap\[(((int)(ch)) & 0xffff) >> CCLASS_OFFSET_BITS\] << CCLASS_OFFSET_BITS) | ((ch) & ((1 << CCLASS_OFFSET_BITS)-1))\]) + + +#define DECOMP_OFFSET_BITS $decomp_shift + +/* + * The pageMap is indexed by page number and returns an alternate page number + * that identifies a unique page of characters. Many Unicode characters map + * to the same alternate page number. + */ + +static unsigned char decompPageMap\[\] = {" + set line " " + set last [expr {[llength $decomp_pmap] - 1}] + for {set i 0} {$i <= $last} {incr i} { + append line [lindex $decomp_pmap $i] + if {$i != $last} { + append line ", " + } + if {[string length $line] > 70} { + puts $f $line + set line " " + } + } + puts $f $line + puts $f "}; + +/* + * The groupMap is indexed by combining the alternate page number with + * the page offset and returns a group number that identifies a unique + * set of character attributes. + */ + +static int decompGroupMap\[\] = {" + set line " " + set lasti [expr {[llength $decomp_pages] - 1}] + for {set i 0} {$i <= $lasti} {incr i} { + set page [lindex $decomp_pages $i] + set lastj [expr {[llength $page] - 1}] + for {set j 0} {$j <= $lastj} {incr j} { + append line [lindex $page $j] + if {$j != $lastj || $i != $lasti} { + append line ", " + } + if {[string length $line] > 70} { + puts $f $line + set line " " + } + } + } + puts $f $line + puts $f "}; + +/* + * Each group represents a unique set of character attributes. The attributes... + */ + +static int decompList\[\] = {" + set line " " + set last [expr {[llength $decomp_list] - 1}] + for {set i 0} {$i <= $last} {incr i} { + set val [lindex $decomp_list $i] + + append line [format "%d" $val] + if {$i != $last} { + append line ", " + } + if {[string length $line] > 70} { + puts $f $line + set line " " + } + } + puts $f $line + puts $f "}; + + +/* + * This macro extracts the information about a character from the + * Unicode character tables. + */ + +//#define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> CCLASS_OFFSET_BITS\] << CCLASS_OFFSET_BITS) | ((ch) & ((1 << CCLASS_OFFSET_BITS)-1))\]\]) + +#define GetUniCharDecompInfo(ch) (decompGroupMap\[(decompPageMap\[(((int)(ch)) & 0xffff) >> DECOMP_OFFSET_BITS\] << DECOMP_OFFSET_BITS) | ((ch) & ((1 << DECOMP_OFFSET_BITS)-1))\]) + +#define GetDecompShift(info) ((info) & 0xffff) +#define GetDecompLen(info) ((info) >> 16) + + +#define COMP_OFFSET_BITS $comp_shift + +/* + * The pageMap is indexed by page number and returns an alternate page number + * that identifies a unique page of characters. Many Unicode characters map + * to the same alternate page number. + */ + +static unsigned char compPageMap\[\] = {" + set line " " + set last [expr {[llength $comp_pmap] - 1}] + for {set i 0} {$i <= $last} {incr i} { + append line [lindex $comp_pmap $i] + if {$i != $last} { + append line ", " + } + if {[string length $line] > 70} { + puts $f $line + set line " " + } + } + puts $f $line + puts $f "}; + +/* + * The groupMap is indexed by combining the alternate page number with + * the page offset and returns a group number that identifies a unique + * set of character attributes. + */ + +static int compGroupMap\[\] = {" + set line " " + set lasti [expr {[llength $comp_pages] - 1}] + for {set i 0} {$i <= $lasti} {incr i} { + set page [lindex $comp_pages $i] + set lastj [expr {[llength $page] - 1}] + for {set j 0} {$j <= $lastj} {incr j} { + append line [lindex $page $j] + if {$j != $lastj || $i != $lasti} { + append line ", " + } + if {[string length $line] > 70} { + puts $f $line + set line " " + } + } + } + puts $f $line + puts $f "}; + +/* + * ... + */ + +static int compFirstList\[\]\[2\] = {" + set line " " + set last [expr {[llength $comp_first_list] - 1}] + for {set i 0} {$i <= $last} {incr i} { + set val [lindex $comp_first_list $i] + + append line [format "{%d, %d}" [lindex $val 0] [lindex $val 1]] + if {$i != $last} { + append line ", " + } + if {[string length $line] > 60} { + puts $f $line + set line " " + } + } + puts $f $line + puts $f "}; + +static int compSecondList\[\]\[2\] = {" + set line " " + set last [expr {[llength $comp_second_list] - 1}] + for {set i 0} {$i <= $last} {incr i} { + set val [lindex $comp_second_list $i] + + append line [format "{%d, %d}" [lindex $val 0] [lindex $val 1]] + if {$i != $last} { + append line ", " + } + if {[string length $line] > 60} { + puts $f $line + set line " " + } + } + puts $f $line + puts $f "}; + +static int compBothList\[[llength $comp_x_list]\]\[[llength $comp_y_list]\] = {" + set lastx [expr {[llength $comp_x_list] - 1}] + set lasty [expr {[llength $comp_y_list] - 1}] + for {set i 0} {$i <= $lastx} {incr i} { + puts $f " \{" + set line " " + for {set j 0} {$j <= $lasty} {incr j} { + set comp [list [lindex $comp_x_list $i] [lindex $comp_y_list $j]] + if {[info exists comp_map($comp)]} { + set val $comp_map($comp) + } else { + set val 0 + } + + append line [format "%d" $val] + if {$j != $lasty} { + append line ", " + } + if {[string length $line] > 70} { + puts $f $line + set line " " + } + } + puts $f $line + if {$j != $lasty} { + puts $f " \}," + } else { + puts $f " \}" + } + } + puts $f "}; + + +#define GetUniCharCompInfo(ch) (compGroupMap\[(compPageMap\[(((int)(ch)) & 0xffff) >> COMP_OFFSET_BITS\] << COMP_OFFSET_BITS) | ((ch) & ((1 << COMP_OFFSET_BITS)-1))\]) + +#define CompSingleMask (1 << 16) +#define CompMask ((1 << 16) - 1) +" + + close $f +} + +uni::main + +return |