diff options
Diffstat (limited to 'src/stringprep/uni_parse.tcl')
-rw-r--r-- | src/stringprep/uni_parse.tcl | 362 |
1 files changed, 362 insertions, 0 deletions
diff --git a/src/stringprep/uni_parse.tcl b/src/stringprep/uni_parse.tcl new file mode 100644 index 00000000..7d6458f2 --- /dev/null +++ b/src/stringprep/uni_parse.tcl @@ -0,0 +1,362 @@ +# uni_parse.tcl -- +# +# This program parses the UnicodeData file and generates the +# corresponding uni_data.c file with compressed character +# data tables. The input to this program should be rfc3454.txt +# +# Copyright (c) 1998-1999 by Scriptics Corporation. +# All rights reserved. +# +# Modified for ejabberd by Alexey Shchepin +# +# RCS: @(#) $Id$ + + +namespace eval uni { + set shift 5; # number of bits of data within a page + # This value can be adjusted to find the + # best split to minimize table size + + variable pMap; # map from page to page index, each entry is + # an index into the pages table, indexed by + # page number + variable pages; # map from page index to page info, each + # entry is a list of indices into the groups + # table, the list is indexed by the offset + variable groups; # list of character info values, indexed by + # group number, initialized with the + # unassigned character group +} + +proc uni::getValue {tables delta} { + set ac 0 + set c11 0 + set c21 0 + set b1 0 + set d1 0 + set d2 0 + set xnp 0 + + foreach tab $tables { + switch -glob -- $tab { + C.1.1 {set c11 1} + C.2.1 {set c21 1} + C.* {set ac 1} + A.1 {set ac 1} + B.1 {set b1 1} + D.1 {set d1 1} + D.2 {set d2 1} + XNP {set xnp 1} + } + } + + set val [expr {($ac << 0) | + ($c11 << 1) | + ($c21 << 2) | + ($b1 << 3) | + ($d1 << 4) | + ($d2 << 5) | + ($xnp << 6) | + ($delta << 16)}] + + return $val +} + +proc uni::getGroup {value} { + variable groups + + set gIndex [lsearch -exact $groups $value] + if {$gIndex == -1} { + set gIndex [llength $groups] + lappend groups $value + } + return $gIndex +} + +proc uni::addPage {info} { + variable pMap + variable pages + + set pIndex [lsearch -exact $pages $info] + if {$pIndex == -1} { + set pIndex [llength $pages] + lappend pages $info + } + lappend pMap $pIndex + return +} + +proc uni::load_tables {data} { + variable casemap + variable tablemap + + for {set i 0} {$i <= 0xffff} {incr i} { + set casemap($i) 0 + set tablemap($i) {} + } + + set table "" + + foreach line [split $data \n] { + if {$table == ""} { + if {[regexp { ----- Start Table (.*) -----} $line temp table]} { + #puts "Start table '$table'" + } + } else { + if {[regexp { ----- End Table (.*) -----} $line temp table1]} { + set table "" + } else { + if {$table == "B.1"} { + if {[regexp {^ ([[:xdigit:]]+); ;} $line \ + temp val]} { + scan $val %x val + if {$val <= 0xffff} { + lappend tablemap($val) $table + } + } + } elseif {$table == "B.3"} { + if {[regexp {^ ([[:xdigit:]]+); ([[:xdigit:]]+);} $line \ + temp from to]} { + scan $from %x from + scan $to %x to + if {$from <= 0xffff && $to <= 0xffff} { + set casemap($from) [expr {$to - $from}] + } + } else { + # TODO + } + + } elseif {$table != "B.2"} { + if {[regexp {^ ([[:xdigit:]]+)-([[:xdigit:]]+)} $line \ + temp from to]} { + scan $from %x from + scan $to %x to + for {set i $from} {$i <= $to && $i <= 0xffff} {incr i} { + lappend tablemap($i) $table + } + } elseif {[regexp {^ ([[:xdigit:]]+)} $line \ + temp val]} { + scan $val %x val + if {$val <= 0xffff} { + lappend tablemap($val) $table + } + } + } + } + } + } + + # XMPP nodeprep prohibited + foreach val {22 26 27 2f 3a 3c 3e 40} { + scan $val %x val + lappend tablemap($val) XNP + } +} + +proc uni::buildTables {} { + variable shift + + variable casemap + variable tablemap + + variable pMap {} + variable pages {} + variable groups {} + set info {} ;# temporary page info + + set mask [expr {(1 << $shift) - 1}] + + set next 0 + + for {set i 0} {$i <= 0xffff} {incr i} { + set gIndex [getGroup [getValue $tablemap($i) $casemap($i)]] + + # Split character index into offset and page number + set offset [expr {$i & $mask}] + set page [expr {($i >> $shift)}] + + # Add the group index to the info for the current page + lappend info $gIndex + + # If this is the last entry in the page, add the page + if {$offset == $mask} { + addPage $info + set info {} + } + } + return +} + +proc uni::main {} { + global argc argv0 argv + variable pMap + variable pages + variable groups + variable shift + + if {$argc != 2} { + puts stderr "\nusage: $argv0 <datafile> <outdir>\n" + exit 1 + } + set f [open [lindex $argv 0] r] + set data [read $f] + close $f + + load_tables $data + buildTables + puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]" + set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}] + puts "shift = 6, space = $size" + + set f [open [file join [lindex $argv 1] uni_data.c] w] + fconfigure $f -translation lf + puts $f "/* + * uni_data.c -- + * + * Declarations of Unicode character information tables. This file is + * automatically generated by the uni_parse.tcl script. Do not + * modify this file by hand. + * + * Copyright (c) 1998 by Scriptics Corporation. + * All rights reserved. + * + * Modified for ejabberd by Alexey Shchepin + * + * RCS: @(#) \$Id\$ + */ + +/* + * A 16-bit Unicode character is split into two parts in order to index + * into the following tables. The lower OFFSET_BITS comprise an offset + * into a page of characters. The upper bits comprise the page number. + */ + +#define OFFSET_BITS $shift + +/* + * The pageMap is indexed by page number and returns an alternate page number + * that identifies a unique page of characters. Many Unicode characters map + * to the same alternate page number. + */ + +static unsigned char pageMap\[\] = {" + set line " " + set last [expr {[llength $pMap] - 1}] + for {set i 0} {$i <= $last} {incr i} { + append line [lindex $pMap $i] + if {$i != $last} { + append line ", " + } + if {[string length $line] > 70} { + puts $f $line + set line " " + } + } + puts $f $line + puts $f "}; + +/* + * The groupMap is indexed by combining the alternate page number with + * the page offset and returns a group number that identifies a unique + * set of character attributes. + */ + +static unsigned char groupMap\[\] = {" + set line " " + set lasti [expr {[llength $pages] - 1}] + for {set i 0} {$i <= $lasti} {incr i} { + set page [lindex $pages $i] + set lastj [expr {[llength $page] - 1}] + for {set j 0} {$j <= $lastj} {incr j} { + append line [lindex $page $j] + if {$j != $lastj || $i != $lasti} { + append line ", " + } + if {[string length $line] > 70} { + puts $f $line + set line " " + } + } + } + puts $f $line + puts $f "}; + +/* + * Each group represents a unique set of character attributes. The attributes + * are encoded into a 32-bit value as follows: + * + * Bit 0 A.1 | C.1.2 | C.2.2 | C.3 -- C.9 + * + * Bit 1 C.1.1 + * + * Bit 2 C.2.1 + * + * Bit 3 B.1 + * + * Bit 4 B.1 + * + * Bit 5 D.1 + * + * Bit 6 D.2 + * + * Bits 7-15 Reserved for future use. + * + * Bits 16-31 Case delta: delta for case conversions. This should be the + * highest field so we can easily sign extend. + */ + +static int groups\[\] = {" + set line " " + set last [expr {[llength $groups] - 1}] + for {set i 0} {$i <= $last} {incr i} { + set val [lindex $groups $i] + + append line [format "%d" $val] + if {$i != $last} { + append line ", " + } + if {[string length $line] > 65} { + puts $f $line + set line " " + } + } + puts $f $line + puts $f "}; + +/* + * The following constants are used to determine the category of a + * Unicode character. + */ + +#define ACMask (1 << 0) +#define C11Mask (1 << 1) +#define C21Mask (1 << 2) +#define B1Mask (1 << 3) +#define D1Mask (1 << 4) +#define D2Mask (1 << 5) +#define XNPMask (1 << 6) + +/* + * The following macros extract the fields of the character info. The + * GetDelta() macro is complicated because we can't rely on the C compiler + * to do sign extension on right shifts. + */ + +#define GetCaseType(info) (((info) & 0xE0) >> 5) +#define GetCategory(info) ((info) & 0x1F) +#define GetDelta(info) (((info) > 0) ? ((info) >> 16) : (~(~((info)) >> 16))) + +/* + * This macro extracts the information about a character from the + * Unicode character tables. + */ + +#define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> OFFSET_BITS\] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))\]\]) +" + + close $f +} + +uni::main + +return |