Compression algorithm for the names array

author: H. Peter Anvin <hpa@smyrno.hos.anvin.org> 2005-11-26 12:46:32 -0800
committer: H. Peter Anvin <hpa@smyrno.hos.anvin.org> 2005-11-26 12:46:32 -0800
commit: 8a4b9405622aff89b58fd157eff1c3280f72af8b (patch)
tree: 0533606651116523c60e0ef4b4fa23833b1c90f6
parent: 48e36b984fcb63f4b8c0e2d68b18311069051efc (diff)
download: libucd-8a4b9405622aff89b58fd157eff1c3280f72af8b.tar.gz
1 files changed, 141 insertions, 0 deletions
diff --git a/simplecomp.pl b/simplecomp.pl
new file mode 100755
index 0000000..305a108
--- /dev/null
+++ b/simplecomp.pl
@@ -0,0 +1,141 @@
+#!/usr/bin/perl
+#
+# Simple-minded compression algorithm for the Unicode names database
+#
+# We create a fixed dictionary of 255 symbols (with 0 = end of string);
+# 38 of them are the characters space, dash, 0-9 and A-Z, which are the
+# characters used in Unicode names (each keeps its own ASCII code), and
+# the remaining 217 are common phrases.
+#
+
+sub split_by_word($) {
+    my ($str) = @_;
+    # The capturing group makes split() return the separator runs as well,
+    # so the fields alternate: word, separator, word, separator, ...
+    my @fields = split(/([-\s]+)/, $str);
+    my @words = ();
+
+    # Glue each word to the dash/whitespace run that follows it
+    while ( @fields ) {
+	my ($word, $sep) = splice(@fields, 0, 2);
+	push(@words, $word . $sep);
+    }
+    return @words;
+}
+
+@names = ();	# One entry per input line, with unitokens already fused
+
+# Treat these combinations as single tokens; they are applied in list
+# order.  This list should really be generated automatically
+@unitokens = ('LATIN SMALL LETTER', 'LATIN CAPITAL LETTER',
+	      'CAPITAL LETTER', 'SMALL LETTER', 'BRAILLE PATTERN',
+	      'BYZANTINE MUSICAL SYMBOL', 'CANADIAN SYLLABICS',
+	      'CHEROKEE LETTER', 'VARIATION SELECTOR',
+	      'APL FUNCTIONAL SYMBOL', 'BOX DRAWINGS',
+	      'CJK COMPATIBILITY IDEOGRAPH', 'KANGXI RADICAL',
+	      'LINEAR B', 'MUSICAL SYMBOL', 'ROMAN NUMERAL',
+	      'SANS-SERIF', 'LESS-THAN', 'GREATER-THAN', 'SYLOTI NAGRI',
+	      'TAI LE LETTER', 'TETRAGRAM FOR', 'THAI CHARACTER',
+	      'TIBETAN SUBJOINED LETTER', 'VULGAR FRACTION',
+	      'YI SYLLABLE', 'CJK RADICAL', 'YI RADICAL',
+	      'ETHIOPIC SYLLABLE', 'IDEOGRAPHIC TELEGRAPH SYMBOL FOR',
+	      'DOUBLE-STRUCK', 'NEW TAI LUE', 'PRESENTATION FORM FOR',
+	      'UGARITIC LETTER', 'CYPRIOT SYLLABLE'
+	      );
+
+while ( defined($line = <STDIN>) ) {
+    chomp $line;
+
+    # Add a redundant space to each name; we remove this one
+    # automatically during decoding
+    $line .= ' ';
+
+    # Fuse each unitoken (' ' -> '_', '-' -> '+') so split_by_word keeps it whole
+    foreach my $ut ( @unitokens ) {
+	(my $utx = $ut) =~ tr/ -/_+/;
+	$line =~ s/\b\Q$ut\E\b/$utx/g;	# \Q..\E: match the token literally
+    }
+    push(@names, $line);
+}
+
+#
+# Split the names into words and weigh every distinct word
+#
+%word_weight = ();
+
+foreach my $name ( @names ) {
+    foreach my $word ( split_by_word($name) ) {
+	# First encounter saves nothing and is scored -1; each later
+	# encounter adds the word's length minus one to its weight
+	$word_weight{$word} = defined($word_weight{$word})
+	    ? $word_weight{$word} + length($word) - 1
+	    : -1;
+    }
+}
+
+@commons = sort { $word_weight{$b} <=> $word_weight{$a} } keys(%word_weight);
+
+@dictionary = split(//, " -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ");
+
+$base_dict = scalar(@dictionary);	# Number of single-character entries
+$dict_len = 255;			# Symbol codes run 1..255; 0 ends a string
+
+%symbol_index = ();			# Dictionary entry -> symbol code
+@symbols = (undef) x ($dict_len+1);	# Symbol code -> dictionary entry
+$symbols[0] = '';
+
+# Identity-map single characters
+foreach $scs ( @dictionary ) {
+    $symbols[ord($scs)] = $scs;
+    $symbol_index{$scs} = ord($scs);
+}
+$next_index = 1;
+
+# Fill up with the heaviest words; stop early when the input has too few
+# of them instead of padding the dictionary with undef entries
+while ( scalar(@dictionary) < $dict_len && scalar(@commons) ) {
+    push(@dictionary, shift(@commons));
+}
+
+# Give every word the lowest symbol code not already taken
+$s = 0;
+for ( $i = 0 ; $i < scalar(@dictionary) ; $i++ ) {
+    $w = $dictionary[$i];
+    printf("%3d %8d \"%s\"\n", $i, $word_weight{$w}, $w);
+    if ( length($w) > 1 ) {
+	$s += $word_weight{$w};
+	while ( defined($symbols[$next_index]) ) {
+	    $next_index++;
+	}
+	$symbols[$next_index] = $w;
+	$symbol_index{$w} = $next_index;
+	$next_index++;
+    }
+}
+print "Bytes saved: $s\n";
+
+# Sort dictionary in order by decreasing length, so longer entries go first
+@dictionary = sort { length($b) <=> length($a) } @dictionary;
+# Write each name, NUL-terminated, with dictionary entries turned into symbols
+open(NLC, '>', 'gen/namelist.compr') or die "gen/namelist.compr: $!\n";
+foreach $n ( sort(@names) ) {
+    $n =~ tr/_+/ -/;
+    foreach $di ( @dictionary ) {
+	$c = chr($symbol_index{$di});	# keyed by the stored _/+ spelling
+	(my $text = $di) =~ tr/_+/ -/;	# copy: $di aliases the array element
+	$n =~ s/\Q$text\E/$c/g;
+    }
+    print NLC $n, "\0";
+}
+close(NLC) or die "gen/namelist.compr: $!\n";
+
+open(NLD, '>', 'gen/namelist_dict.c') or die;
+# Emit the symbol table as a C array of strings, one entry per symbol code
+print NLD "const char * const _libucd_namelist_dict[", $dict_len+1, "] = {\n";
+foreach my $code ( 0 .. $dict_len ) {
+    (my $text = $symbols[$code]) =~ tr/_+/ -/;	# back to real spaces/dashes
+    print NLD "\t\"", $text, "\",\n";
+}
+print NLD "};\n";
+close(NLD);
+
author	H. Peter Anvin <hpa@smyrno.hos.anvin.org>	2005-11-26 12:46:32 -0800
committer	H. Peter Anvin <hpa@smyrno.hos.anvin.org>	2005-11-26 12:46:32 -0800
commit	8a4b9405622aff89b58fd157eff1c3280f72af8b (patch)
tree	0533606651116523c60e0ef4b4fa23833b1c90f6
parent	48e36b984fcb63f4b8c0e2d68b18311069051efc (diff)
download	libucd-8a4b9405622aff89b58fd157eff1c3280f72af8b.tar.gz