aboutsummaryrefslogtreecommitdiff
path: root/src/stringprep/uni_parse2.tcl
diff options
context:
space:
mode:
Diffstat (limited to 'src/stringprep/uni_parse2.tcl')
-rw-r--r--src/stringprep/uni_parse2.tcl702
1 files changed, 0 insertions, 702 deletions
diff --git a/src/stringprep/uni_parse2.tcl b/src/stringprep/uni_parse2.tcl
deleted file mode 100644
index 950090a09..000000000
--- a/src/stringprep/uni_parse2.tcl
+++ /dev/null
@@ -1,702 +0,0 @@
-# uni_parse2.tcl --
-#
-# This program parses the UnicodeData file and generates the
-# corresponding uni_norm.c file with compressed character
-# data tables. The input to this program should be
-# UnicodeData-3.2.0.txt and CompositionExclusions-3.2.0.txt files from:
-# ftp://ftp.unicode.org/Public/UNIDATA/
-#
-# Copyright (c) 1998-1999 by Scriptics Corporation.
-# All rights reserved.
-#
-# Modified for ejabberd by Alexey Shchepin
-#
-# RCS: @(#) $Id$
-
-
-namespace eval uni {
- set cclass_shift 8
- set decomp_shift 8
- set comp_shift 8
- set shift 5; # number of bits of data within a page
- # This value can be adjusted to find the
- # best split to minimize table size
-
- variable pMap; # map from page to page index, each entry is
- # an index into the pages table, indexed by
- # page number
- variable pages; # map from page index to page info, each
- # entry is a list of indices into the groups
- # table, the list is indexed by the offset
- variable groups; # list of character info values, indexed by
- # group number, initialized with the
- # unassigned character group
-
- variable categories {
- Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp
- Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
- }; # Ordered list of character categories, must
- # match the enumeration in the header file.
-
- variable titleCount 0; # Count of the number of title case
- # characters. This value is used in the
- # regular expression code to allocate enough
- # space for the title case variants.
-}
-
-proc uni::getValue {items index} {
- variable categories
- variable titleCount
-
- # Extract character info
-
- set category [lindex $items 2]
- if {[scan [lindex $items 12] %4x toupper] == 1} {
- set toupper [expr {$index - $toupper}]
- } else {
- set toupper {}
- }
- if {[scan [lindex $items 13] %4x tolower] == 1} {
- set tolower [expr {$tolower - $index}]
- } else {
- set tolower {}
- }
- if {[scan [lindex $items 14] %4x totitle] == 1} {
- set totitle [expr {$index - $totitle}]
- } else {
- set totitle {}
- }
-
- set categoryIndex [lsearch -exact $categories $category]
- if {$categoryIndex < 0} {
- puts "Unexpected character category: $index($category)"
- set categoryIndex 0
- } elseif {$category == "Lt"} {
- incr titleCount
- }
-
- return "$categoryIndex,$toupper,$tolower,$totitle"
-}
-
-proc uni::getGroup {value} {
- variable groups
-
- set gIndex [lsearch -exact $groups $value]
- if {$gIndex == -1} {
- set gIndex [llength $groups]
- lappend groups $value
- }
- return $gIndex
-}
-
-proc uni::addPage {info} {
- variable pMap
- variable pages
-
- set pIndex [lsearch -exact $pages $info]
- if {$pIndex == -1} {
- set pIndex [llength $pages]
- lappend pages $info
- }
- lappend pMap $pIndex
- return
-}
-
-proc uni::addPage {map_var pages_var info} {
- variable $map_var
- variable $pages_var
-
- set pIndex [lsearch -exact [set $pages_var] $info]
- if {$pIndex == -1} {
- set pIndex [llength [set $pages_var]]
- lappend $pages_var $info
- }
- lappend $map_var $pIndex
- return
-}
-
-proc uni::load_exclusions {data} {
- variable exclusions
-
- foreach line [split $data \n] {
- if {$line == ""} continue
-
- set items [split $line " "]
-
- if {[lindex $items 0] == "#"} continue
-
- scan [lindex $items 0] %x index
-
- set exclusions($index) ""
- }
-}
-
-proc uni::load_tables {data} {
- variable cclass_map
- variable decomp_map
- variable comp_map
- variable comp_first
- variable comp_second
- variable exclusions
-
- foreach line [split $data \n] {
- if {$line == ""} continue
-
- set items [split $line \;]
-
- scan [lindex $items 0] %x index
- set cclass [lindex $items 3]
- set decomp [lindex $items 5]
-
- set cclass_map($index) $cclass
- #set decomp_map($index) $cclass
-
- if {$decomp != ""} {
- if {[string index [lindex $decomp 0] 0] == "<"} {
- set decomp1 [lreplace $decomp 0 0]
- set decomp {}
- foreach ch $decomp1 {
- scan $ch %x ch
- lappend decomp $ch
- }
- set decomp_map($index) $decomp
- } else {
- switch -- [llength $decomp] {
- 1 {
- scan $decomp %x ch
- set decomp_map($index) $ch
- }
- 2 {
- scan $decomp "%x %x" ch1 ch2
- set decomp [list $ch1 $ch2]
- set decomp_map($index) $decomp
- # hackish
- if {(![info exists cclass_map($ch1)] || \
- $cclass_map($ch1) == 0) && \
- ![info exists exclusions($index)]} {
- if {[info exists comp_first($ch1)]} {
- incr comp_first($ch1)
- } else {
- set comp_first($ch1) 1
- }
- if {[info exists comp_second($ch2)]} {
- incr comp_second($ch2)
- } else {
- set comp_second($ch2) 1
- }
- set comp_map($decomp) $index
- } else {
- puts "Excluded $index"
- }
- }
- default {
- puts "Bad canonical decomposition: $line"
- }
- }
- }
-
- #puts "[format 0x%0.4x $index]\t$cclass\t$decomp_map($index)"
- }
- }
- #puts [array get comp_first]
- #puts [array get comp_second]
-}
-
-proc uni::buildTables {} {
- variable cclass_shift
- variable decomp_shift
- variable comp_shift
-
- variable cclass_map
- variable cclass_pmap {}
- variable cclass_pages {}
- variable decomp_map
- variable decomp_pmap {}
- variable decomp_pages {}
- variable decomp_list {}
- variable comp_map
- variable comp_pmap {}
- variable comp_pages {}
- variable comp_first
- variable comp_second
- variable comp_first_list {}
- variable comp_second_list {}
- variable comp_x_list {}
- variable comp_y_list {}
- variable comp_both_map {}
-
- set cclass_info {}
- set decomp_info {}
- set comp_info {}
-
- set cclass_mask [expr {(1 << $cclass_shift) - 1}]
- set decomp_mask [expr {(1 << $decomp_shift) - 1}]
- set comp_mask [expr {(1 << $comp_shift) - 1}]
-
- foreach comp [array names comp_map] {
- set ch1 [lindex $comp 0]
- if {[info exists comp_first($ch1)] && $comp_first($ch1) > 0 && \
- [info exists comp_second($ch1)] && $comp_second($ch1) > 0} {
- if {[lsearch -exact $comp_x_list $ch1] < 0} {
- set i [llength $comp_x_list]
- lappend comp_x_list $ch1
- set comp_info_map($ch1) $i
- lappend comp_y_list $ch1
- set comp_info_map($ch1) $i
- puts "There should be no symbols which appears on"
- puts "both first and second place in composition"
- exit
- }
- }
- }
-
- foreach comp [array names comp_map] {
- set ch1 [lindex $comp 0]
- set ch2 [lindex $comp 1]
-
- if {$comp_first($ch1) == 1 && ![info exists comp_second($ch1)]} {
- set i [llength $comp_first_list]
- lappend comp_first_list [list $ch2 $comp_map($comp)]
- set comp_info_map($ch1) [expr {$i | (1 << 16)}]
- } elseif {$comp_second($ch2) == 1 && ![info exists comp_first($ch2)]} {
- set i [llength $comp_second_list]
- lappend comp_second_list [list $ch1 $comp_map($comp)]
- set comp_info_map($ch2) [expr {$i | (1 << 16) | (1 << 17)}]
- } else {
- if {[lsearch -exact $comp_x_list $ch1] < 0} {
- set i [llength $comp_x_list]
- lappend comp_x_list $ch1
- set comp_info_map($ch1) $i
- }
- if {[lsearch -exact $comp_y_list $ch2] < 0} {
- set i [llength $comp_y_list]
- lappend comp_y_list $ch2
- set comp_info_map($ch2) [expr {$i | (1 << 17)}]
- }
- }
- }
-
- set next 0
-
- for {set i 0} {$i <= 0x10ffff} {incr i} {
- #set gIndex [getGroup [getValue $i]]
-
- set cclass_offset [expr {$i & $cclass_mask}]
-
- if {[info exists cclass_map($i)]} {
- set cclass $cclass_map($i)
- } else {
- set cclass 0
- }
- lappend cclass_info $cclass
-
- if {$cclass_offset == $cclass_mask} {
- addPage cclass_pmap cclass_pages $cclass_info
- set cclass_info {}
- }
-
-
- set decomp_offset [expr {$i & $decomp_mask}]
-
- if {[info exists decomp_map($i)]} {
- set decomp $decomp_map($i)
- set b 1
- while {$b} {
- set b 0
- for {set j 0} {$j < [llength $decomp]} {incr j} {
- if {[info exists \
- decomp_map([set ch1 [lindex $decomp $j]])]} {
- #puts -$decomp
- set decomp [eval [list lreplace $decomp $j $j] \
- $decomp_map($ch1)]
- #puts +$decomp
- set b 1
- }
- }
- }
-
- if {[info exists decomp_used($decomp)]} {
- lappend decomp_info $decomp_used($decomp)
- } else {
- set val [expr {([llength $decomp] << 16) + \
- [llength $decomp_list]}]
- #set val [expr {[llength $decomp_list]}]
- lappend decomp_info $val
- set decomp_used($decomp) $val
- #puts "$val $decomp"
- foreach d $decomp {
- lappend decomp_list $d
- }
- }
- } else {
- lappend decomp_info -1
- }
-
- if {$decomp_offset == $decomp_mask} {
- addPage decomp_pmap decomp_pages $decomp_info
- set decomp_info {}
- }
-
-
- set comp_offset [expr {$i & $comp_mask}]
-
- if {[info exists comp_info_map($i)]} {
- set comp $comp_info_map($i)
- } else {
- set comp -1
- }
- lappend comp_info $comp
-
- if {$comp_offset == $comp_mask} {
- addPage comp_pmap comp_pages $comp_info
- set comp_info {}
- }
- }
-
- #puts [array get decomp_map]
- #puts $decomp_list
-
- return
-}
-
-proc uni::main {} {
- global argc argv0 argv
- variable cclass_shift
- variable cclass_pmap
- variable cclass_pages
- variable decomp_shift
- variable decomp_pmap
- variable decomp_pages
- variable decomp_list
- variable comp_shift
- variable comp_map
- variable comp_pmap
- variable comp_pages
- variable comp_first_list
- variable comp_second_list
- variable comp_x_list
- variable comp_y_list
- variable pages
- variable groups {}
- variable titleCount
-
- if {$argc != 3} {
- puts stderr "\nusage: $argv0 <datafile> <exclusionsfile> <outdir>\n"
- exit 1
- }
- set f [open [lindex $argv 1] r]
- set data [read $f]
- close $f
-
- load_exclusions $data
-
- set f [open [lindex $argv 0] r]
- set data [read $f]
- close $f
-
- load_tables $data
- buildTables
- #puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]"
- #set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}]
- #puts "shift = 6, space = $size"
- #puts "title case count = $titleCount"
-
- set f [open [file join [lindex $argv 2] uni_norm.c] w]
- fconfigure $f -translation lf
- puts $f "/*
- * uni_norm.c --
- *
- * Declarations of Unicode character information tables. This file is
- * automatically generated by the uni_parse2.tcl script. Do not
- * modify this file by hand.
- *
- * Copyright (c) 1998 by Scriptics Corporation.
- * All rights reserved.
- *
- * Modified for ejabberd by Alexey Shchepin
- *
- * RCS: @(#) \$Id\$
- */
-
-/*
- * A 16-bit Unicode character is split into two parts in order to index
- * into the following tables. The lower CCLASS_OFFSET_BITS comprise an offset
- * into a page of characters. The upper bits comprise the page number.
- */
-
-#define CCLASS_OFFSET_BITS $cclass_shift
-
-/*
- * The pageMap is indexed by page number and returns an alternate page number
- * that identifies a unique page of characters. Many Unicode characters map
- * to the same alternate page number.
- */
-
-static unsigned char cclassPageMap\[\] = {"
- set line " "
- set last [expr {[llength $cclass_pmap] - 1}]
- for {set i 0} {$i <= $last} {incr i} {
- append line [lindex $cclass_pmap $i]
- if {$i != $last} {
- append line ", "
- }
- if {[string length $line] > 70} {
- puts $f $line
- set line " "
- }
- }
- puts $f $line
- puts $f "};
-
-/*
- * The cclassGroupMap is indexed by combining the alternate page number with
- * the page offset and returns a combining class number.
- */
-
-static unsigned char cclassGroupMap\[\] = {"
- set line " "
- set lasti [expr {[llength $cclass_pages] - 1}]
- for {set i 0} {$i <= $lasti} {incr i} {
- set page [lindex $cclass_pages $i]
- set lastj [expr {[llength $page] - 1}]
- for {set j 0} {$j <= $lastj} {incr j} {
- append line [lindex $page $j]
- if {$j != $lastj || $i != $lasti} {
- append line ", "
- }
- if {[string length $line] > 70} {
- puts $f $line
- set line " "
- }
- }
- }
- puts $f $line
- puts $f "};
-
-#define GetUniCharCClass(ch) (cclassGroupMap\[(cclassPageMap\[(((int)(ch)) & 0x1fffff) >> CCLASS_OFFSET_BITS\] << CCLASS_OFFSET_BITS) | ((ch) & ((1 << CCLASS_OFFSET_BITS)-1))\])
-
-
-#define DECOMP_OFFSET_BITS $decomp_shift
-
-/*
- * The pageMap is indexed by page number and returns an alternate page number
- * that identifies a unique page of characters. Many Unicode characters map
- * to the same alternate page number.
- */
-
-static unsigned char decompPageMap\[\] = {"
- set line " "
- set last [expr {[llength $decomp_pmap] - 1}]
- for {set i 0} {$i <= $last} {incr i} {
- append line [lindex $decomp_pmap $i]
- if {$i != $last} {
- append line ", "
- }
- if {[string length $line] > 70} {
- puts $f $line
- set line " "
- }
- }
- puts $f $line
- puts $f "};
-
-/*
- * The decompGroupMap is indexed by combining the alternate page number with
- * the page offset and returns a group number that identifies a length and
- * shift of decomposition sequence in decompList
- */
-
-static int decompGroupMap\[\] = {"
- set line " "
- set lasti [expr {[llength $decomp_pages] - 1}]
- for {set i 0} {$i <= $lasti} {incr i} {
- set page [lindex $decomp_pages $i]
- set lastj [expr {[llength $page] - 1}]
- for {set j 0} {$j <= $lastj} {incr j} {
- append line [lindex $page $j]
- if {$j != $lastj || $i != $lasti} {
- append line ", "
- }
- if {[string length $line] > 70} {
- puts $f $line
- set line " "
- }
- }
- }
- puts $f $line
- puts $f "};
-
-/*
- * List of decomposition sequences
- */
-
-static int decompList\[\] = {"
- set line " "
- set last [expr {[llength $decomp_list] - 1}]
- for {set i 0} {$i <= $last} {incr i} {
- set val [lindex $decomp_list $i]
-
- append line [format "%d" $val]
- if {$i != $last} {
- append line ", "
- }
- if {[string length $line] > 70} {
- puts $f $line
- set line " "
- }
- }
- puts $f $line
- puts $f "};
-
-
-/*
- * This macro extracts the information about a character from the
- * Unicode character tables.
- */
-
-#define GetUniCharDecompInfo(ch) (decompGroupMap\[(decompPageMap\[(((int)(ch)) & 0x1fffff) >> DECOMP_OFFSET_BITS\] << DECOMP_OFFSET_BITS) | ((ch) & ((1 << DECOMP_OFFSET_BITS)-1))\])
-
-#define GetDecompShift(info) ((info) & 0xffff)
-#define GetDecompLen(info) ((info) >> 16)
-
-
-#define COMP_OFFSET_BITS $comp_shift
-
-/*
- * The pageMap is indexed by page number and returns an alternate page number
- * that identifies a unique page of characters. Many Unicode characters map
- * to the same alternate page number.
- */
-
-static unsigned char compPageMap\[\] = {"
- set line " "
- set last [expr {[llength $comp_pmap] - 1}]
- for {set i 0} {$i <= $last} {incr i} {
- append line [lindex $comp_pmap $i]
- if {$i != $last} {
- append line ", "
- }
- if {[string length $line] > 70} {
- puts $f $line
- set line " "
- }
- }
- puts $f $line
- puts $f "};
-
-/*
- * The groupMap is indexed by combining the alternate page number with
- * the page offset and returns a group number that identifies a unique
- * set of character attributes.
- */
-
-static int compGroupMap\[\] = {"
- set line " "
- set lasti [expr {[llength $comp_pages] - 1}]
- for {set i 0} {$i <= $lasti} {incr i} {
- set page [lindex $comp_pages $i]
- set lastj [expr {[llength $page] - 1}]
- for {set j 0} {$j <= $lastj} {incr j} {
- append line [lindex $page $j]
- if {$j != $lastj || $i != $lasti} {
- append line ", "
- }
- if {[string length $line] > 70} {
- puts $f $line
- set line " "
- }
- }
- }
- puts $f $line
- puts $f "};
-
-/*
- * Lists of compositions for characters that appears only in one composition
- */
-
-static int compFirstList\[\]\[2\] = {"
- set line " "
- set last [expr {[llength $comp_first_list] - 1}]
- for {set i 0} {$i <= $last} {incr i} {
- set val [lindex $comp_first_list $i]
-
- append line [format "{%d, %d}" [lindex $val 0] [lindex $val 1]]
- if {$i != $last} {
- append line ", "
- }
- if {[string length $line] > 60} {
- puts $f $line
- set line " "
- }
- }
- puts $f $line
- puts $f "};
-
-static int compSecondList\[\]\[2\] = {"
- set line " "
- set last [expr {[llength $comp_second_list] - 1}]
- for {set i 0} {$i <= $last} {incr i} {
- set val [lindex $comp_second_list $i]
-
- append line [format "{%d, %d}" [lindex $val 0] [lindex $val 1]]
- if {$i != $last} {
- append line ", "
- }
- if {[string length $line] > 60} {
- puts $f $line
- set line " "
- }
- }
- puts $f $line
- puts $f "};
-
-/*
- * Compositions matrix
- */
-
-static int compBothList\[[llength $comp_x_list]\]\[[llength $comp_y_list]\] = {"
- set lastx [expr {[llength $comp_x_list] - 1}]
- set lasty [expr {[llength $comp_y_list] - 1}]
- for {set i 0} {$i <= $lastx} {incr i} {
- puts $f " \{"
- set line " "
- for {set j 0} {$j <= $lasty} {incr j} {
- set comp [list [lindex $comp_x_list $i] [lindex $comp_y_list $j]]
- if {[info exists comp_map($comp)]} {
- set val $comp_map($comp)
- } else {
- set val 0
- }
-
- append line [format "%d" $val]
- if {$j != $lasty} {
- append line ", "
- }
- if {[string length $line] > 70} {
- puts $f $line
- set line " "
- }
- }
- puts $f $line
- if {$j != $lasty} {
- puts $f " \},"
- } else {
- puts $f " \}"
- }
- }
- puts $f "};
-
-
-#define GetUniCharCompInfo(ch) (compGroupMap\[(compPageMap\[(((int)(ch)) & 0x1fffff) >> COMP_OFFSET_BITS\] << COMP_OFFSET_BITS) | ((ch) & ((1 << COMP_OFFSET_BITS)-1))\])
-
-#define CompSingleMask (1 << 16)
-#define CompMask ((1 << 16) - 1)
-#define CompSecondMask (1 << 17)
-"
-
- close $f
-}
-
-uni::main
-
-return