diff options
Diffstat (limited to 'src/stringprep/uni_parse2.tcl')
-rw-r--r-- | src/stringprep/uni_parse2.tcl | 702 |
1 files changed, 0 insertions, 702 deletions
diff --git a/src/stringprep/uni_parse2.tcl b/src/stringprep/uni_parse2.tcl deleted file mode 100644 index 950090a09..000000000 --- a/src/stringprep/uni_parse2.tcl +++ /dev/null @@ -1,702 +0,0 @@ -# uni_parse2.tcl -- -# -# This program parses the UnicodeData file and generates the -# corresponding uni_norm.c file with compressed character -# data tables. The input to this program should be -# UnicodeData-3.2.0.txt and CompositionExclusions-3.2.0.txt files from: -# ftp://ftp.unicode.org/Public/UNIDATA/ -# -# Copyright (c) 1998-1999 by Scriptics Corporation. -# All rights reserved. -# -# Modified for ejabberd by Alexey Shchepin -# -# RCS: @(#) $Id$ - - -namespace eval uni { - set cclass_shift 8 - set decomp_shift 8 - set comp_shift 8 - set shift 5; # number of bits of data within a page - # This value can be adjusted to find the - # best split to minimize table size - - variable pMap; # map from page to page index, each entry is - # an index into the pages table, indexed by - # page number - variable pages; # map from page index to page info, each - # entry is a list of indices into the groups - # table, the list is indexed by the offset - variable groups; # list of character info values, indexed by - # group number, initialized with the - # unassigned character group - - variable categories { - Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp - Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So - }; # Ordered list of character categories, must - # match the enumeration in the header file. - - variable titleCount 0; # Count of the number of title case - # characters. This value is used in the - # regular expression code to allocate enough - # space for the title case variants. -} - -proc uni::getValue {items index} { - variable categories - variable titleCount - - # Extract character info - - set category [lindex $items 2] - if {[scan [lindex $items 12] %4x toupper] == 1} { - set toupper [expr {$index - $toupper}] - } else { - set toupper {} - } - if {[scan [lindex $items 13] %4x tolower] == 1} { - set tolower [expr {$tolower - $index}] - } else { - set tolower {} - } - if {[scan [lindex $items 14] %4x totitle] == 1} { - set totitle [expr {$index - $totitle}] - } else { - set totitle {} - } - - set categoryIndex [lsearch -exact $categories $category] - if {$categoryIndex < 0} { - puts "Unexpected character category: $index($category)" - set categoryIndex 0 - } elseif {$category == "Lt"} { - incr titleCount - } - - return "$categoryIndex,$toupper,$tolower,$totitle" -} - -proc uni::getGroup {value} { - variable groups - - set gIndex [lsearch -exact $groups $value] - if {$gIndex == -1} { - set gIndex [llength $groups] - lappend groups $value - } - return $gIndex -} - -proc uni::addPage {info} { - variable pMap - variable pages - - set pIndex [lsearch -exact $pages $info] - if {$pIndex == -1} { - set pIndex [llength $pages] - lappend pages $info - } - lappend pMap $pIndex - return -} - -proc uni::addPage {map_var pages_var info} { - variable $map_var - variable $pages_var - - set pIndex [lsearch -exact [set $pages_var] $info] - if {$pIndex == -1} { - set pIndex [llength [set $pages_var]] - lappend $pages_var $info - } - lappend $map_var $pIndex - return -} - -proc uni::load_exclusions {data} { - variable exclusions - - foreach line [split $data \n] { - if {$line == ""} continue - - set items [split $line " "] - - if {[lindex $items 0] == "#"} continue - - scan [lindex $items 0] %x index - - set exclusions($index) "" - } -} - -proc uni::load_tables {data} { - variable cclass_map - variable decomp_map - variable comp_map - variable comp_first - variable comp_second - variable exclusions - - foreach line [split $data \n] { - if {$line == ""} continue - - set items [split $line \;] - - scan [lindex $items 0] %x index - set cclass [lindex $items 3] - set decomp [lindex $items 5] - - set cclass_map($index) $cclass - #set decomp_map($index) $cclass - - if {$decomp != ""} { - if {[string index [lindex $decomp 0] 0] == "<"} { - set decomp1 [lreplace $decomp 0 0] - set decomp {} - foreach ch $decomp1 { - scan $ch %x ch - lappend decomp $ch - } - set decomp_map($index) $decomp - } else { - switch -- [llength $decomp] { - 1 { - scan $decomp %x ch - set decomp_map($index) $ch - } - 2 { - scan $decomp "%x %x" ch1 ch2 - set decomp [list $ch1 $ch2] - set decomp_map($index) $decomp - # hackish - if {(![info exists cclass_map($ch1)] || \ - $cclass_map($ch1) == 0) && \ - ![info exists exclusions($index)]} { - if {[info exists comp_first($ch1)]} { - incr comp_first($ch1) - } else { - set comp_first($ch1) 1 - } - if {[info exists comp_second($ch2)]} { - incr comp_second($ch2) - } else { - set comp_second($ch2) 1 - } - set comp_map($decomp) $index - } else { - puts "Excluded $index" - } - } - default { - puts "Bad canonical decomposition: $line" - } - } - } - - #puts "[format 0x%0.4x $index]\t$cclass\t$decomp_map($index)" - } - } - #puts [array get comp_first] - #puts [array get comp_second] -} - -proc uni::buildTables {} { - variable cclass_shift - variable decomp_shift - variable comp_shift - - variable cclass_map - variable cclass_pmap {} - variable cclass_pages {} - variable decomp_map - variable decomp_pmap {} - variable decomp_pages {} - variable decomp_list {} - variable comp_map - variable comp_pmap {} - variable comp_pages {} - variable comp_first - variable comp_second - variable comp_first_list {} - variable comp_second_list {} - variable comp_x_list {} - variable comp_y_list {} - variable comp_both_map {} - - set cclass_info {} - set decomp_info {} - set comp_info {} - - set cclass_mask [expr {(1 << $cclass_shift) - 1}] - set decomp_mask [expr {(1 << $decomp_shift) - 1}] - set comp_mask [expr {(1 << $comp_shift) - 1}] - - foreach comp [array names comp_map] { - set ch1 [lindex $comp 0] - if {[info exists comp_first($ch1)] && $comp_first($ch1) > 0 && \ - [info exists comp_second($ch1)] && $comp_second($ch1) > 0} { - if {[lsearch -exact $comp_x_list $ch1] < 0} { - set i [llength $comp_x_list] - lappend comp_x_list $ch1 - set comp_info_map($ch1) $i - lappend comp_y_list $ch1 - set comp_info_map($ch1) $i - puts "There should be no symbols which appears on" - puts "both first and second place in composition" - exit - } - } - } - - foreach comp [array names comp_map] { - set ch1 [lindex $comp 0] - set ch2 [lindex $comp 1] - - if {$comp_first($ch1) == 1 && ![info exists comp_second($ch1)]} { - set i [llength $comp_first_list] - lappend comp_first_list [list $ch2 $comp_map($comp)] - set comp_info_map($ch1) [expr {$i | (1 << 16)}] - } elseif {$comp_second($ch2) == 1 && ![info exists comp_first($ch2)]} { - set i [llength $comp_second_list] - lappend comp_second_list [list $ch1 $comp_map($comp)] - set comp_info_map($ch2) [expr {$i | (1 << 16) | (1 << 17)}] - } else { - if {[lsearch -exact $comp_x_list $ch1] < 0} { - set i [llength $comp_x_list] - lappend comp_x_list $ch1 - set comp_info_map($ch1) $i - } - if {[lsearch -exact $comp_y_list $ch2] < 0} { - set i [llength $comp_y_list] - lappend comp_y_list $ch2 - set comp_info_map($ch2) [expr {$i | (1 << 17)}] - } - } - } - - set next 0 - - for {set i 0} {$i <= 0x10ffff} {incr i} { - #set gIndex [getGroup [getValue $i]] - - set cclass_offset [expr {$i & $cclass_mask}] - - if {[info exists cclass_map($i)]} { - set cclass $cclass_map($i) - } else { - set cclass 0 - } - lappend cclass_info $cclass - - if {$cclass_offset == $cclass_mask} { - addPage cclass_pmap cclass_pages $cclass_info - set cclass_info {} - } - - - set decomp_offset [expr {$i & $decomp_mask}] - - if {[info exists decomp_map($i)]} { - set decomp $decomp_map($i) - set b 1 - while {$b} { - set b 0 - for {set j 0} {$j < [llength $decomp]} {incr j} { - if {[info exists \ - decomp_map([set ch1 [lindex $decomp $j]])]} { - #puts -$decomp - set decomp [eval [list lreplace $decomp $j $j] \ - $decomp_map($ch1)] - #puts +$decomp - set b 1 - } - } - } - - if {[info exists decomp_used($decomp)]} { - lappend decomp_info $decomp_used($decomp) - } else { - set val [expr {([llength $decomp] << 16) + \ - [llength $decomp_list]}] - #set val [expr {[llength $decomp_list]}] - lappend decomp_info $val - set decomp_used($decomp) $val - #puts "$val $decomp" - foreach d $decomp { - lappend decomp_list $d - } - } - } else { - lappend decomp_info -1 - } - - if {$decomp_offset == $decomp_mask} { - addPage decomp_pmap decomp_pages $decomp_info - set decomp_info {} - } - - - set comp_offset [expr {$i & $comp_mask}] - - if {[info exists comp_info_map($i)]} { - set comp $comp_info_map($i) - } else { - set comp -1 - } - lappend comp_info $comp - - if {$comp_offset == $comp_mask} { - addPage comp_pmap comp_pages $comp_info - set comp_info {} - } - } - - #puts [array get decomp_map] - #puts $decomp_list - - return -} - -proc uni::main {} { - global argc argv0 argv - variable cclass_shift - variable cclass_pmap - variable cclass_pages - variable decomp_shift - variable decomp_pmap - variable decomp_pages - variable decomp_list - variable comp_shift - variable comp_map - variable comp_pmap - variable comp_pages - variable comp_first_list - variable comp_second_list - variable comp_x_list - variable comp_y_list - variable pages - variable groups {} - variable titleCount - - if {$argc != 3} { - puts stderr "\nusage: $argv0 <datafile> <exclusionsfile> <outdir>\n" - exit 1 - } - set f [open [lindex $argv 1] r] - set data [read $f] - close $f - - load_exclusions $data - - set f [open [lindex $argv 0] r] - set data [read $f] - close $f - - load_tables $data - buildTables - #puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]" - #set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}] - #puts "shift = 6, space = $size" - #puts "title case count = $titleCount" - - set f [open [file join [lindex $argv 2] uni_norm.c] w] - fconfigure $f -translation lf - puts $f "/* - * uni_norm.c -- - * - * Declarations of Unicode character information tables. This file is - * automatically generated by the uni_parse2.tcl script. Do not - * modify this file by hand. - * - * Copyright (c) 1998 by Scriptics Corporation. - * All rights reserved. - * - * Modified for ejabberd by Alexey Shchepin - * - * RCS: @(#) \$Id\$ - */ - -/* - * A 16-bit Unicode character is split into two parts in order to index - * into the following tables. The lower CCLASS_OFFSET_BITS comprise an offset - * into a page of characters. The upper bits comprise the page number. - */ - -#define CCLASS_OFFSET_BITS $cclass_shift - -/* - * The pageMap is indexed by page number and returns an alternate page number - * that identifies a unique page of characters. Many Unicode characters map - * to the same alternate page number. - */ - -static unsigned char cclassPageMap\[\] = {" - set line " " - set last [expr {[llength $cclass_pmap] - 1}] - for {set i 0} {$i <= $last} {incr i} { - append line [lindex $cclass_pmap $i] - if {$i != $last} { - append line ", " - } - if {[string length $line] > 70} { - puts $f $line - set line " " - } - } - puts $f $line - puts $f "}; - -/* - * The cclassGroupMap is indexed by combining the alternate page number with - * the page offset and returns a combining class number. - */ - -static unsigned char cclassGroupMap\[\] = {" - set line " " - set lasti [expr {[llength $cclass_pages] - 1}] - for {set i 0} {$i <= $lasti} {incr i} { - set page [lindex $cclass_pages $i] - set lastj [expr {[llength $page] - 1}] - for {set j 0} {$j <= $lastj} {incr j} { - append line [lindex $page $j] - if {$j != $lastj || $i != $lasti} { - append line ", " - } - if {[string length $line] > 70} { - puts $f $line - set line " " - } - } - } - puts $f $line - puts $f "}; - -#define GetUniCharCClass(ch) (cclassGroupMap\[(cclassPageMap\[(((int)(ch)) & 0x1fffff) >> CCLASS_OFFSET_BITS\] << CCLASS_OFFSET_BITS) | ((ch) & ((1 << CCLASS_OFFSET_BITS)-1))\]) - - -#define DECOMP_OFFSET_BITS $decomp_shift - -/* - * The pageMap is indexed by page number and returns an alternate page number - * that identifies a unique page of characters. Many Unicode characters map - * to the same alternate page number. - */ - -static unsigned char decompPageMap\[\] = {" - set line " " - set last [expr {[llength $decomp_pmap] - 1}] - for {set i 0} {$i <= $last} {incr i} { - append line [lindex $decomp_pmap $i] - if {$i != $last} { - append line ", " - } - if {[string length $line] > 70} { - puts $f $line - set line " " - } - } - puts $f $line - puts $f "}; - -/* - * The decompGroupMap is indexed by combining the alternate page number with - * the page offset and returns a group number that identifies a length and - * shift of decomposition sequence in decompList - */ - -static int decompGroupMap\[\] = {" - set line " " - set lasti [expr {[llength $decomp_pages] - 1}] - for {set i 0} {$i <= $lasti} {incr i} { - set page [lindex $decomp_pages $i] - set lastj [expr {[llength $page] - 1}] - for {set j 0} {$j <= $lastj} {incr j} { - append line [lindex $page $j] - if {$j != $lastj || $i != $lasti} { - append line ", " - } - if {[string length $line] > 70} { - puts $f $line - set line " " - } - } - } - puts $f $line - puts $f "}; - -/* - * List of decomposition sequences - */ - -static int decompList\[\] = {" - set line " " - set last [expr {[llength $decomp_list] - 1}] - for {set i 0} {$i <= $last} {incr i} { - set val [lindex $decomp_list $i] - - append line [format "%d" $val] - if {$i != $last} { - append line ", " - } - if {[string length $line] > 70} { - puts $f $line - set line " " - } - } - puts $f $line - puts $f "}; - - -/* - * This macro extracts the information about a character from the - * Unicode character tables. - */ - -#define GetUniCharDecompInfo(ch) (decompGroupMap\[(decompPageMap\[(((int)(ch)) & 0x1fffff) >> DECOMP_OFFSET_BITS\] << DECOMP_OFFSET_BITS) | ((ch) & ((1 << DECOMP_OFFSET_BITS)-1))\]) - -#define GetDecompShift(info) ((info) & 0xffff) -#define GetDecompLen(info) ((info) >> 16) - - -#define COMP_OFFSET_BITS $comp_shift - -/* - * The pageMap is indexed by page number and returns an alternate page number - * that identifies a unique page of characters. Many Unicode characters map - * to the same alternate page number. - */ - -static unsigned char compPageMap\[\] = {" - set line " " - set last [expr {[llength $comp_pmap] - 1}] - for {set i 0} {$i <= $last} {incr i} { - append line [lindex $comp_pmap $i] - if {$i != $last} { - append line ", " - } - if {[string length $line] > 70} { - puts $f $line - set line " " - } - } - puts $f $line - puts $f "}; - -/* - * The groupMap is indexed by combining the alternate page number with - * the page offset and returns a group number that identifies a unique - * set of character attributes. - */ - -static int compGroupMap\[\] = {" - set line " " - set lasti [expr {[llength $comp_pages] - 1}] - for {set i 0} {$i <= $lasti} {incr i} { - set page [lindex $comp_pages $i] - set lastj [expr {[llength $page] - 1}] - for {set j 0} {$j <= $lastj} {incr j} { - append line [lindex $page $j] - if {$j != $lastj || $i != $lasti} { - append line ", " - } - if {[string length $line] > 70} { - puts $f $line - set line " " - } - } - } - puts $f $line - puts $f "}; - -/* - * Lists of compositions for characters that appears only in one composition - */ - -static int compFirstList\[\]\[2\] = {" - set line " " - set last [expr {[llength $comp_first_list] - 1}] - for {set i 0} {$i <= $last} {incr i} { - set val [lindex $comp_first_list $i] - - append line [format "{%d, %d}" [lindex $val 0] [lindex $val 1]] - if {$i != $last} { - append line ", " - } - if {[string length $line] > 60} { - puts $f $line - set line " " - } - } - puts $f $line - puts $f "}; - -static int compSecondList\[\]\[2\] = {" - set line " " - set last [expr {[llength $comp_second_list] - 1}] - for {set i 0} {$i <= $last} {incr i} { - set val [lindex $comp_second_list $i] - - append line [format "{%d, %d}" [lindex $val 0] [lindex $val 1]] - if {$i != $last} { - append line ", " - } - if {[string length $line] > 60} { - puts $f $line - set line " " - } - } - puts $f $line - puts $f "}; - -/* - * Compositions matrix - */ - -static int compBothList\[[llength $comp_x_list]\]\[[llength $comp_y_list]\] = {" - set lastx [expr {[llength $comp_x_list] - 1}] - set lasty [expr {[llength $comp_y_list] - 1}] - for {set i 0} {$i <= $lastx} {incr i} { - puts $f " \{" - set line " " - for {set j 0} {$j <= $lasty} {incr j} { - set comp [list [lindex $comp_x_list $i] [lindex $comp_y_list $j]] - if {[info exists comp_map($comp)]} { - set val $comp_map($comp) - } else { - set val 0 - } - - append line [format "%d" $val] - if {$j != $lasty} { - append line ", " - } - if {[string length $line] > 70} { - puts $f $line - set line " " - } - } - puts $f $line - if {$j != $lasty} { - puts $f " \}," - } else { - puts $f " \}" - } - } - puts $f "}; - - -#define GetUniCharCompInfo(ch) (compGroupMap\[(compPageMap\[(((int)(ch)) & 0x1fffff) >> COMP_OFFSET_BITS\] << COMP_OFFSET_BITS) | ((ch) & ((1 << COMP_OFFSET_BITS)-1))\]) - -#define CompSingleMask (1 << 16) -#define CompMask ((1 << 16) - 1) -#define CompSecondMask (1 << 17) -" - - close $f -} - -uni::main - -return |