simplecomp.pl


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173

#!/usr/bin/perl
#
# Simple-minded compression algorithm for the Unicode names database
#
# We create a fixed dictionary of 255 symbols (with 0 = end of string);
# the first 38 are the symbols space, dash, 0-9, A-Z which are the
# characters used in Unicode names, and the remaining 217 are common
# phrases.
#

use bytes;

sub split_by_word($) {
    my ($str) = @_;
    my @l = ();
    my @s = split(/([-\s]+)/, $str);

    # Append separated whitespace to each string
    while ( scalar(@s) ) {
	my $x = shift(@s);
	$x .= shift(@s);
	push(@l, $x);
    }

    return @l;
}

@names = ();

# Treat these combinations as single tokens
# This list should really be generated automatically
@unitokens = ('LATIN SMALL LETTER', 'LATIN CAPITAL LETTER',
	      'CAPITAL LETTER', 'SMALL LETTER', 'BRAILLE PATTERN',
	      'BYZANTINE MUSICAL SYMBOL', 'CANADIAN SYLLABICS',
	      'CHEROKEE LETTER', 'VARIATION SELECTOR',
	      'APL FUNCTIONAL SYMBOL', 'BOX DRAWINGS',
	      'CJK COMPATIBILITY IDEOGRAPH', 'KANGXI RADICAL',
	      'LINEAR B', 'MUSICAL SYMBOL', 'ROMAN NUMERAL',
	      'SANS-SERIF', 'LESS-THAN', 'GREATER-THAN', 'SYLOTI NAGRI',
	      'TAI LE LETTER', 'TETRAGRAM FOR', 'THAI CHARACTER',
	      'TIBETAN SUBJOINED LETTER', 'VULGAR FRACTION',
	      'YI SYLLABLE', 'CJK RADICAL', 'YI RADICAL',
	      'ETHIOPIC SYLLABLE', 'IDEOGRAPHIC TELEGRAPH SYMBOL FOR',
	      'DOUBLE-STRUCK', 'NEW TAI LUE', 'PRESENTATION FORM FOR',
	      'UGARITIC LETTER', 'CYPRIOT SYLLABLE'
	      );

while ( defined($line = <STDIN>) ) {
    chomp $line;

    $ix = hex substr($line,0,5);
    $name = substr($line,6);

    # Add a redundant space to each name; we remove this one
    # automatically during decoding
    # XXX: Try with and without this
    $name .= ' ';

    my $ut, $utx;
    foreach $ut ( @unitokens ) {
	($utx = $ut) =~ tr/ -/_+/;
	$name =~ s/\b$ut\b/$utx/g;
    }
    push(@names, $name);
    $name_to_ucs{$name} = $ix;
}

#
# Split sets into words and count
#
%word_weight = ();

foreach $n ( @names ) {
    foreach $w ( split_by_word($n) ) {
	if ( defined($word_weight{$w}) ) {
	    $word_weight{$w} += length($w)-1;
	} else {
	    $word_weight{$w} = -1; # First encounter saves nothing
	}
    }
}

@commons = sort { $word_weight{$b} <=> $word_weight{$a} } keys(%word_weight);

@dictionary = split(//, " -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ");

$base_dict = scalar(@dictionary);
$dict_len = 256;

%symbol_index = ();
@symbols = (undef) x ($dict_len);

# Identity-map single characters
foreach $scs ( @dictionary ) {
    $symbols[ord($scs)] = $scs;
    $symbol_index{$scs} = ord($scs);
}
$next_index = 0;

while ( scalar(@dictionary) < $dict_len ) {
    push(@dictionary, shift(@commons));
}


$s = 0;
for ( $i = 0 ; $i < $dict_len ; $i++ ) {
    $w = $dictionary[$i];
    printf("%3d %8d \"%s\"\n", $i, $word_weight{$w}, $w);
    if ( length($w) > 1 ) {
	$s += $word_weight{$w};
	while ( defined($symbols[$next_index]) ) {
	    $next_index++;
	}
	$symbols[$next_index] = $w;
	$symbol_index{$w} = $next_index;
	$next_index++;
    }
}

# Sort dictionary in order by decreasing length
@dictionary = sort { length($b) <=> length($a) } @dictionary;

sub compress_string($) {
    my ($na) = @_;
    my $di, $c;

    foreach $di ( @dictionary ) {
	die "No index for symbol: $di\n" unless (defined($symbol_index{$di}));
	$c = chr($symbol_index{$di});
	($rd = $di) =~ tr/_+/ -/;
	$na =~ s/$rd/$c/g;
    }

    return $na;
}

$offset = 0;
$uc_bytes = 0;

open(NLC, '>', 'gen/nameslist.compr') or die;
open(NLO, '>', 'gen/nameslist.offset') or die;
foreach $n ( @names ) {
    ($na1 = $n) =~ tr/_+/ -/;
    ($na2 = $na1) =~ s/ $//;
    $true_name = $na2;		# Actually desired output
    
    $na1 = compress_string($na1);
    $na2 = compress_string($na2);
    
    $na = length($na1) < length($na2) ? $na1 : $na2;

    # Prefix byte for *uncompressed* length, then compressed data
    print  NLC chr(length($true_name)), $na;
    printf NLO "%05x %d\n", $name_to_ucs{$n}, $offset;
    $offset += length($na)+1;
    $uc_bytes += length($true_name)+1;
}
close(NLC);
close(NLO);

print "uncompressed $uc_bytes bytes, compressed $offset bytes\n";
printf "savings %d (%.1f%%)\n", $uc_bytes-$offset, 100*(1-$offset/$uc_bytes);

open(NLD, '>', 'gen/nameslist_dict.c') or die;
printf NLD "const char * const _libucd_nameslist_dict[%d] = {\n", $dict_len;
for ( $i = 0 ; $i < $dict_len ; $i++ ) {
    $sym = $symbols[$i];
    $sym =~ tr/_+/ -/;
    printf NLD "\t\"%s\",\n", $sym;
}
print NLD "};\n";
close(NLD);