diff options
author | H. Peter Anvin <hpa@smyrno.hos.anvin.org> | 2005-11-26 12:46:32 -0800 |
---|---|---|
committer | H. Peter Anvin <hpa@smyrno.hos.anvin.org> | 2005-11-26 12:46:32 -0800 |
commit | 8a4b9405622aff89b58fd157eff1c3280f72af8b (patch) | |
tree | 0533606651116523c60e0ef4b4fa23833b1c90f6 | |
parent | 48e36b984fcb63f4b8c0e2d68b18311069051efc (diff) | |
download | libucd-8a4b9405622aff89b58fd157eff1c3280f72af8b.tar.gz |
Compression algorithm for the names array
-rwxr-xr-x | simplecomp.pl | 141 |
1 files changed, 141 insertions, 0 deletions
#!/usr/bin/perl
#
# Simple-minded compression algorithm for the Unicode names database
#
# (Recovered from the diff "a/simplecomp.pl b/simplecomp.pl", new file
# mode 100755, index 0000000..305a108.)
#
# We create a fixed dictionary of 255 symbols (with 0 = end of string).
# The 38 characters space, dash, 0-9 and A-Z, which are the characters
# used in Unicode names, are identity-mapped to their own character
# codes; the remaining 217 codes are handed out to common phrases.
#
# Input:  one name per line on STDIN.
# Output: a dictionary report on STDOUT, the compressed NUL-terminated
#         names in gen/namelist.compr, and the symbol table as a C array
#         in gen/namelist_dict.c.  The gen/ directory is not created here.
#

use strict;
use warnings;

# Split a string into words, keeping the run of separators (dashes and/or
# whitespace) that follows each word attached to that word.
# Returns the list of words.
sub split_by_word($) {
    my ($str) = @_;
    my @words = ();

    # The capturing group makes split() return the separators too, so the
    # list alternates word, separator, word, separator, ...
    my @pieces = split(/([-\s]+)/, $str);

    while ( scalar(@pieces) ) {
        my $word = shift(@pieces);
        my $sep  = shift(@pieces);
        # The last word has no separator when $str does not end in one.
        $word .= $sep if defined($sep);
        push(@words, $word);
    }

    return @words;
}

# Treat these combinations as single tokens
# This list should really be generated automatically
my @unitokens = ('LATIN SMALL LETTER', 'LATIN CAPITAL LETTER',
                 'CAPITAL LETTER', 'SMALL LETTER', 'BRAILLE PATTERN',
                 'BYZANTINE MUSICAL SYMBOL', 'CANADIAN SYLLABICS',
                 'CHEROKEE LETTER', 'VARIATION SELECTOR',
                 'APL FUNCTIONAL SYMBOL', 'BOX DRAWINGS',
                 'CJK COMPATIBILITY IDEOGRAPH', 'KANGXI RADICAL',
                 'LINEAR B', 'MUSICAL SYMBOL', 'ROMAN NUMERAL',
                 'SANS-SERIF', 'LESS-THAN', 'GREATER-THAN', 'SYLOTI NAGRI',
                 'TAI LE LETTER', 'TETRAGRAM FOR', 'THAI CHARACTER',
                 'TIBETAN SUBJOINED LETTER', 'VULGAR FRACTION',
                 'YI SYLLABLE', 'CJK RADICAL', 'YI RADICAL',
                 'ETHIOPIC SYLLABLE', 'IDEOGRAPHIC TELEGRAPH SYMBOL FOR',
                 'DOUBLE-STRUCK', 'NEW TAI LUE', 'PRESENTATION FORM FOR',
                 'UGARITIC LETTER', 'CYPRIOT SYLLABLE'
                );

# Build each combination's pattern and its single-token spelling once,
# instead of once per input line.  The single-token spelling replaces
# space with '_' and dash with '+', neither of which split_by_word()
# treats as a separator.
my @unitoken_rules = ();
foreach my $ut ( @unitokens ) {
    (my $utx = $ut) =~ tr/ -/_+/;
    push(@unitoken_rules, [ qr/\b\Q$ut\E\b/, $utx ]);
}

my @names = ();

while ( defined(my $line = <STDIN>) ) {
    chomp $line;

    # Add a redundant space to each name; we remove this one
    # automatically during decoding
    $line .= ' ';

    # Rules are applied in @unitokens order, so longer combinations
    # listed first win over the shorter ones they contain.
    foreach my $rule ( @unitoken_rules ) {
        my ($pattern, $utx) = @$rule;
        $line =~ s/$pattern/$utx/g;
    }
    push(@names, $line);
}

#
# Split sets into words and count
#
# The weight of a word is the number of bytes saved by replacing every
# occurrence with a one-byte symbol: each repeat saves length-1 bytes.
#
my %word_weight = ();

foreach my $name ( @names ) {
    foreach my $word ( split_by_word($name) ) {
        if ( defined($word_weight{$word}) ) {
            $word_weight{$word} += length($word)-1;
        } else {
            $word_weight{$word} = -1; # First encounter saves nothing
        }
    }
}

# Heaviest words first.  Ties are broken by the word itself so that the
# generated dictionary does not depend on hash iteration order.
my @commons = sort { $word_weight{$b} <=> $word_weight{$a} or $a cmp $b }
              keys(%word_weight);

my @dictionary = split(//, " -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ");

my $dict_len = 255;

my %symbol_index = ();                  # dictionary entry -> symbol code
my @symbols = (undef) x ($dict_len+1);  # symbol code -> dictionary entry
$symbols[0] = '';

# Identity-map single characters
foreach my $scs ( @dictionary ) {
    $symbols[ord($scs)] = $scs;
    $symbol_index{$scs} = ord($scs);
}

# Fill the rest of the dictionary with the heaviest words.  Stop early if
# the input has too few distinct words; padding with undef would later
# turn into an empty search pattern.
while ( scalar(@dictionary) < $dict_len && scalar(@commons) ) {
    push(@dictionary, shift(@commons));
}

# Report the dictionary and give each multi-character entry the lowest
# symbol code not already taken by an identity-mapped character.
my $saved = 0;
my $next_index = 1;
for my $i ( 0 .. $#dictionary ) {
    my $w = $dictionary[$i];
    # Single characters normally have no weight entry; report them as 0.
    my $weight = defined($word_weight{$w}) ? $word_weight{$w} : 0;
    printf("%3d %8d \"%s\"\n", $i, $weight, $w);
    next unless length($w) > 1;

    $saved += $weight;
    $next_index++ while defined($symbols[$next_index]);
    $symbols[$next_index] = $w;
    $symbol_index{$w} = $next_index;
    $next_index++;
}

print "Bytes saved: $saved\n";

# Sort dictionary in order by decreasing length
@dictionary = sort { length($b) <=> length($a) } @dictionary;

# Build the substitution table once.  The symbol code is looked up with
# the entry's original ('_' / '+') spelling and @dictionary itself is
# left untouched; rewriting the entries in place would make the lookup
# fail for every name after the first and emit NUL bytes instead.
# Single characters map to themselves, so they need no substitution.
my @encoders = ();
foreach my $entry ( @dictionary ) {
    next unless length($entry) > 1;
    (my $plain = $entry) =~ tr/_+/ -/;
    push(@encoders, [ qr/\Q$plain\E/, chr($symbol_index{$entry}) ]);
}

open(my $nlc, '>', 'gen/namelist.compr')
    or die "$0: cannot open gen/namelist.compr: $!\n";
binmode($nlc);                  # output holds NULs and bytes above 127
foreach my $name ( sort(@names) ) {
    (my $encoded = $name) =~ tr/_+/ -/;
    foreach my $enc ( @encoders ) {
        my ($pattern, $code) = @$enc;
        $encoded =~ s/$pattern/$code/g;
    }
    print {$nlc} $encoded, "\0";
}
close($nlc) or die "$0: error writing gen/namelist.compr: $!\n";

open(my $nld, '>', 'gen/namelist_dict.c')
    or die "$0: cannot open gen/namelist_dict.c: $!\n";
printf {$nld} "const char * const _libucd_namelist_dict[%d] = {\n", $dict_len+1;
for my $i ( 0 .. $dict_len ) {
    # Unassigned symbol codes are emitted as empty strings.
    my $sym = defined($symbols[$i]) ? $symbols[$i] : '';
    $sym =~ tr/_+/ -/;
    printf {$nld} "\t\"%s\",\n", $sym;
}
print {$nld} "};\n";
close($nld) or die "$0: error writing gen/namelist_dict.c: $!\n";