aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author H. Peter Anvin <hpa@smyrno.hos.anvin.org> 2005-11-26 12:46:32 -0800
committer H. Peter Anvin <hpa@smyrno.hos.anvin.org> 2005-11-26 12:46:32 -0800
commit 8a4b9405622aff89b58fd157eff1c3280f72af8b (patch)
tree 0533606651116523c60e0ef4b4fa23833b1c90f6
parent 48e36b984fcb63f4b8c0e2d68b18311069051efc (diff)
download libucd-8a4b9405622aff89b58fd157eff1c3280f72af8b.tar.gz
Compression algorithm for the names array
-rwxr-xr-x simplecomp.pl 141
1 files changed, 141 insertions, 0 deletions
diff --git a/simplecomp.pl b/simplecomp.pl
new file mode 100755
index 0000000..305a108
--- /dev/null
+++ b/simplecomp.pl
@@ -0,0 +1,141 @@
+#!/usr/bin/perl
+#
+# Simple-minded compression algorithm for the Unicode names database
+#
+# We create a fixed dictionary of 255 symbols (with 0 = end of string);
+# 38 of them are the characters space, dash, 0-9, A-Z, which are the
+# characters used in Unicode names, each stored at its own character
+# code, and the remaining 217 slots hold common phrases.
+#
+
+#
+# split_by_word(STRING)
+#
+# Break STRING into tokens, where each token is a word followed by the
+# run of separators (dashes and/or whitespace) that comes after it.
+# Joining the returned tokens reproduces STRING exactly.
+#
+# Returns the list of tokens; an empty STRING gives an empty list.
+#
+sub split_by_word($) {
+    my ($str) = @_;
+    my @words = ();
+
+    # The capturing group makes split() hand back the separators as
+    # well, so the list alternates word, separator, word, separator, ...
+    my @pieces = split(/([-\s]+)/, $str);
+
+    # Glue each separator run onto the word that precedes it
+    while ( scalar(@pieces) ) {
+        my $word = shift(@pieces);
+        my $sep  = shift(@pieces);
+
+        # A final word with nothing after it has no separator to append
+        $word .= $sep if ( defined($sep) );
+        push(@words, $word);
+    }
+
+    return @words;
+}
+
+# Collects every name read from the input
+@names = ();
+
+# Multi-word phrases that are to be handled as one indivisible token.
+# Ideally this table would be derived from the data rather than being
+# maintained by hand.
+@unitokens = (
+    'LATIN SMALL LETTER',
+    'LATIN CAPITAL LETTER',
+    'CAPITAL LETTER',
+    'SMALL LETTER',
+    'BRAILLE PATTERN',
+    'BYZANTINE MUSICAL SYMBOL',
+    'CANADIAN SYLLABICS',
+    'CHEROKEE LETTER',
+    'VARIATION SELECTOR',
+    'APL FUNCTIONAL SYMBOL',
+    'BOX DRAWINGS',
+    'CJK COMPATIBILITY IDEOGRAPH',
+    'KANGXI RADICAL',
+    'LINEAR B',
+    'MUSICAL SYMBOL',
+    'ROMAN NUMERAL',
+    'SANS-SERIF',
+    'LESS-THAN',
+    'GREATER-THAN',
+    'SYLOTI NAGRI',
+    'TAI LE LETTER',
+    'TETRAGRAM FOR',
+    'THAI CHARACTER',
+    'TIBETAN SUBJOINED LETTER',
+    'VULGAR FRACTION',
+    'YI SYLLABLE',
+    'CJK RADICAL',
+    'YI RADICAL',
+    'ETHIOPIC SYLLABLE',
+    'IDEOGRAPHIC TELEGRAPH SYMBOL FOR',
+    'DOUBLE-STRUCK',
+    'NEW TAI LUE',
+    'PRESENTATION FORM FOR',
+    'UGARITIC LETTER',
+    'CYPRIOT SYLLABLE',
+);
+
+# Work out, once, the search pattern and the fused spelling of every
+# entry in @unitokens.  In the fused spelling spaces become '_' and
+# dashes become '+', neither of which is a word separator, so the
+# whole phrase later counts as a single word.  The list order is kept,
+# which means earlier (longer) phrases are applied first.
+my @unitoken_rules = ();
+foreach my $ut ( @unitokens ) {
+    (my $utx = $ut) =~ tr/ -/_+/;
+    push(@unitoken_rules, [ qr/\b\Q$ut\E\b/, $utx ]);
+}
+
+# Read one name per line from standard input
+while ( defined(my $line = <STDIN>) ) {
+    chomp $line;
+
+    # Add a redundant space to each name so that every word, the last
+    # one included, ends in a separator; we remove this one
+    # automatically during decoding
+    $line .= ' ';
+
+    foreach my $rule ( @unitoken_rules ) {
+        my ($pattern, $fused) = @$rule;
+        $line =~ s/$pattern/$fused/g;
+    }
+    push(@names, $line);
+}
+
+#
+# Split the names into words and score each distinct word
+#
+# A word's score starts at -1 on its first occurrence and grows by
+# length-1 on every later occurrence, i.e. by the number of bytes saved
+# when that occurrence is replaced with a one-byte symbol.
+#
+%word_weight = ();
+
+foreach my $n ( @names ) {
+    foreach my $w ( split_by_word($n) ) {
+        if ( defined($word_weight{$w}) ) {
+            $word_weight{$w} += length($w)-1;
+        } else {
+            $word_weight{$w} = -1;    # First encounter saves nothing
+        }
+    }
+}
+
+# Best score first.  Ties are broken by the word itself so that the
+# ordering, and with it the generated dictionary, does not depend on
+# the order in which keys() happens to return the hash keys.
+@commons = sort { $word_weight{$b} <=> $word_weight{$a} or $a cmp $b }
+    keys(%word_weight);
+
+# Start the dictionary with the 38 single characters: space, dash,
+# 0-9 and A-Z
+@dictionary = split(//, " -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ");
+
+$base_dict = scalar(@dictionary);
+$dict_len = 255;
+
+# %symbol_index maps a dictionary entry to its symbol number and
+# @symbols maps a symbol number back to its entry
+%symbol_index = ();
+@symbols = (undef) x ($dict_len+1);
+$symbols[0] = '';    # Symbol 0 is the end-of-string marker
+
+# Identity-map single characters: each one is its own symbol
+foreach my $scs ( @dictionary ) {
+    $symbols[ord($scs)] = $scs;
+    $symbol_index{$scs} = ord($scs);
+}
+$next_index = 1;
+
+# Fill the rest of the dictionary with the best-scoring words.  Stop
+# early when a small input supplies fewer candidates than there are
+# free slots, rather than padding the dictionary with undef entries.
+while ( scalar(@dictionary) < $dict_len && scalar(@commons) ) {
+    push(@dictionary, shift(@commons));
+}
+
+
+# Report every entry, and give each multi-character entry the lowest
+# symbol number that is not already taken
+$s = 0;
+for ( my $i = 0 ; $i < scalar(@dictionary) ; $i++ ) {
+    my $w = $dictionary[$i];
+
+    # The single characters were never scored; report them as 0
+    my $weight = defined($word_weight{$w}) ? $word_weight{$w} : 0;
+
+    printf("%3d %8d \"%s\"\n", $i, $weight, $w);
+    if ( length($w) > 1 ) {
+        $s += $weight;
+        while ( defined($symbols[$next_index]) ) {
+            $next_index++;
+        }
+        $symbols[$next_index] = $w;
+        $symbol_index{$w} = $next_index;
+        $next_index++;
+    }
+}
+
+
+print "Bytes saved: $s\n";
+
+# Sort dictionary in order by decreasing length, so that a long entry
+# is substituted before any shorter entry can consume part of it
+@dictionary = sort { length($b) <=> length($a) } @dictionary;
+
+# Work out each entry's search pattern and replacement byte once, up
+# front.  The symbol number has to be looked up under the entry's
+# internal spelling (with '_' and '+'), because that is the only
+# spelling %symbol_index is keyed by, whereas the pattern needs the
+# real spelling (with ' ' and '-').  Translating a copy, instead of the
+# @dictionary element itself, keeps that lookup working for every name
+# and not just the first one.
+my @encodings = ();
+foreach my $di ( @dictionary ) {
+    # Entries that were never given a symbol cannot be encoded
+    next unless ( defined($di) && defined($symbol_index{$di}) );
+
+    (my $text = $di) =~ tr/_+/ -/;
+    my $c = chr($symbol_index{$di});
+
+    # Identity-mapped characters would only be replaced by themselves
+    next if ( $c eq $text );
+
+    push(@encodings, [ qr/\Q$text\E/, $c ]);
+}
+
+# Write every name, in sorted order, as its symbol string followed by
+# a NUL terminator
+open(my $nlc, '>', 'gen/namelist.compr')
+    or die "$0: cannot open gen/namelist.compr: $!\n";
+binmode($nlc);    # Symbol bytes include 10 and 13; keep them untranslated
+foreach my $name ( sort(@names) ) {
+    (my $n = $name) =~ tr/_+/ -/;
+    foreach my $enc ( @encodings ) {
+        my ($pattern, $c) = @$enc;
+        $n =~ s/$pattern/$c/g;
+    }
+    print $nlc $n, "\0";
+}
+close($nlc)
+    or die "$0: error writing gen/namelist.compr: $!\n";
+
+# Emit the symbol table as a C array of strings indexed by symbol number
+open(my $nld, '>', 'gen/namelist_dict.c')
+    or die "$0: cannot open gen/namelist_dict.c: $!\n";
+printf $nld "const char * const _libucd_namelist_dict[%d] = {\n", $dict_len+1;
+for ( my $i = 0 ; $i <= $dict_len ; $i++ ) {
+    # A slot that never received a symbol is written as an empty string
+    my $sym = defined($symbols[$i]) ? $symbols[$i] : '';
+
+    # Back from the internal spelling to real spaces and dashes
+    $sym =~ tr/_+/ -/;
+
+    # Keep the C string literal well-formed whatever the symbol contains
+    $sym =~ s/(["\\])/\\$1/g;
+
+    printf $nld "\t\"%s\",\n", $sym;
+}
+print $nld "};\n";
+close($nld)
+    or die "$0: error writing gen/namelist_dict.c: $!\n";
+