aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author H. Peter Anvin <hpa@smyrno.hos.anvin.org> 2005-11-29 01:56:53 -0800
committer H. Peter Anvin <hpa@smyrno.hos.anvin.org> 2005-11-29 01:56:53 -0800
commit a3564739b9504e1a58faa89274faaa65e3663a8f (patch)
tree fb1eb44818a4574a6df2471fbf1cdc35f0ac9bda
parent a6d38fc8943ac026adb0e84fdbe9ac34830386bb (diff)
download libucd-a3564739b9504e1a58faa89274faaa65e3663a8f.tar.gz
Flesh out the properties array
-rwxr-xr-x convert_ucd.pl 95
-rw-r--r-- libucd_int.h 11
2 files changed, 101 insertions, 5 deletions
diff --git a/convert_ucd.pl b/convert_ucd.pl
index e19a79a..2da914f 100755
--- a/convert_ucd.pl
+++ b/convert_ucd.pl
@@ -381,6 +381,14 @@ sub dump_prop_list()
#
%prop_array_position = ();
+sub emit_int24($) {	# Render an integer as a brace-enclosed list of three hex byte literals
+ my($v) = @_;	# $v: the integer to encode; only its low 24 bits end up in the output
+ return sprintf("{0x%02x, 0x%02x, 0x%02x}",	# returns a string such as "{0x56, 0x34, 0x12}" for 0x123456
+ $v & 0xff,	# bits 0-7: least-significant byte is emitted first
+ ($v >> 8) & 0xff,	# bits 8-15
+ ($v >> 16) & 0xff);	# bits 16-23: most-significant of the three bytes is emitted last
+}
+
sub make_properties_array()
{
my $fh, $c, $prev, $mine, $cnt, $cp;
@@ -408,14 +416,99 @@ sub make_properties_array()
# Careful with the formatting: we rely on the fact that
# the first 14 characters contain the UCS value and the rest
# the properties.
+
+ # Code point UCS value
$mine = sprintf("\t{\n\t\t0x%05x,\n", $c);
+
+ # General category
+ my $gc = $$cp{'General_Category'} || 'Cn';
+ $mine .= "\t\tUC_GC_$gc,\n";
+
+ # Script
+ my $scr = $$cp{'Script'} || 'Common';
+ $mine .= "\t\tUC_SCR_$scr,\n";
+
+ # Numeric value
+ my $nv = $$cp{'Numeric_Value'};
+ if ( $nv > 255 ) {
+ my $exp = int(log($nv)/log(10))-1;
+ my $num = int($nv/(10**$exp));
+ $mine .= "\t\t$num, 128+$exp,\n";
+ } else {
+ my $num = $nv + 0;
+ my $den = 1;
+
+ if ( $nv != 0 ) {
+ while ( ($nv-($num/$den))/$nv > 1e-7 ) {
+ $den++;
+ $num = int($nv*$den+0.5);
+ }
+ }
+ $mine .= "\t\t$num, $den,\n";
+ }
+
+ # Boolean properties and block index
my $bp;
foreach $bp ( @boolean_props ) {
if ( $$cp{$bp} ) {
$mine .= "\t\tUC_FL_\U$bp\E |\n";
}
}
- $mine .= "\t\t0,\n"; # Easy way to terminate a bit sequence
+ my $block = $$cp{'Block'};
+ $block =~ tr/ .-/___/;
+ $mine .= "\t\t(UC_BLK_$block << 48),\n";
+
+ # Simple case mappings
+ my $sum = ($$cp{'Simple_Uppercase_Mapping'} || $c) - $c;
+ $mine .= "\t\t".emit_int24($sum).",\n";
+ my $slm = ($$cp{'Simple_Lowercase_Mapping'} || $c) - $c;
+ $mine .= "\t\t".emit_int24($slm).",\n";
+ my $stm = ($$cp{'Simple_Titlecase_Mapping'} || $c) - $c;
+ $mine .= "\t\t".emit_int24($stm).",\n";
+
+ # Age (assume 31.7 as maximum; Unicode has traditionally not had
+ # many minor versions per major version.)
+ my $age = $$cp{'Age'} || '0.0';
+ my (@sage) = split(/\./, $age);
+ $mine .= sprintf("\t\t(%d << 5) + %d, /* $age */\n", $sage[0], $sage[1]);
+
+ # Padding
+ $mine .= "\t\t{ 0, 0, },\n";
+
+ # Arabic Joining Type
+ my $ajt = $$cp{'Arabic_Joining_Type'} ||
+ ($gc eq 'Mn' || $gc eq 'Me' || $gc eq 'Cf') ? 'T' : 'U';
+ $mine .= "\t\tUC_AJT_$ajt,\n";
+
+ # Arabic Joining Group
+ my $ajg = $$cp{'Arabic_Joining_Group'} || 'No_Joining_Group';
+ $ajg =~ tr/ /_/;
+ $ajg =~ s/([A-Z])([A-Z]+)/$1\L$2\E/g;
+ $mine .= "\t\tUC_AJG_$ajg,\n";
+
+ # East Asian Width
+ my $eaw = $$cp{'East_Asian_Width'} || 'N';
+ $mine .= "\t\tUC_EAW_$eaw,\n";
+
+ # Hangul Syllable Type
+ my $hst = $$cp{'Hangul_Syllable_Type'} || 'NA';
+ $mine .= "\t\tUC_HST_$hst,\n";
+
+ # Line Break
+ my $lb = $$cp{'Line_Break'} || 'XX';
+ $mine .= "\t\tUC_LB_$lb,\n";
+
+ # Numeric Type
+ my $nt = $$cp{'Numeric_Type'} || 'None';
+ $mine .= "\t\tUC_NT_$nt,\n";
+
+ # Canonical Combining Class
+ my $ccc = $$cp{'Canonical_Combining_Class'} || 'NR';
+ $mine .= "\t\tUC_CCC_$ccc,\n";
+
+ # Bidi Class
+ my $bc = $$cp{'Bidi_Class'} || 'L';
+ $mine .= "\t\tUC_BC_$bc,\n";
# Additional properties...
$mine .= "\t},\n";
diff --git a/libucd_int.h b/libucd_int.h
index d3260e9..8ab2f06 100644
--- a/libucd_int.h
+++ b/libucd_int.h
@@ -16,14 +16,17 @@ extern const char _libucd_hangul_jamo_v[][4];
extern const char _libucd_hangul_jamo_t[][4];
struct _libucd_property_array {
+ int32_t ucd; /* Wasteful but fast (used in search) */
+ uint8_t general_category;
+ uint8_t script;
+ uint8_t numeric_value_num;
+ uint8_t numeric_value_den_exp; /* bit 7 = 1 if exponent */
uint64_t flags_block; /* Block index is high byte */
int24 simple_uppercase;
int24 simple_lowercase;
int24 simple_titlecase;
- uint8_t numeric_value_num;
- uint8_t numeric_value_den_exp; /* bit 7 = 1 if exponent */
- uint8_t general_category;
- uint8_t script;
+ uint8_t age; /* (major << 5) + minor */
+ uint8_t pad[2]; /* Do something useful here... */
unsigned arabic_joining_type :3;
unsigned arabic_joining_group :6;
unsigned east_asian_width :3;