Generate enumerations automagically; canonicalize names

author: H. Peter Anvin <hpa@smyrno.hos.anvin.org> 2005-12-05 21:25:37 -0800
committer: H. Peter Anvin <hpa@smyrno.hos.anvin.org> 2005-12-05 21:25:37 -0800
commit: bd8acab4d3bd0b010396d7f6ffd9f5b9cb5db619 (patch)
tree: dbec74f33de14a14e5f9acebaeafc21b4d0abe42
parent: 8986e43af73d8f8f8e93e169878ecd29d5592a3a (diff)
download: libucd-bd8acab4d3bd0b010396d7f6ffd9f5b9cb5db619.tar.gz
7 files changed, 265 insertions, 596 deletions
diff --git a/Makefile b/Makefile
index b4aec4f..dd9655e 100644
--- a/Makefile
+++ b/Makefile
@@ -48,12 +48,17 @@ CVT_FILES = gen/jamo.c gen/nameslist.tab gen/nametoucs.keys gen/nametoucs.tab \
 .c.hi:
 	$(HOST_CC) $(HOST_CFLAGS) -E -o $@ $<
 
+enums/%.o: enums/%.c $(HDRS)
+	$(CC) $(CFLAGS) -c -o $@ $<
+enums/%.lo: enums/%.c $(HDRS)
+	$(CC) $(CFLAGS) $(PICFLAGS) -c -o $@ $<
+
 # -----------------------------------------------------------------------
 
 LIBSRCS = proparray.c gen/nametoucs_hash.c gen/ucstoname_hash.c \
 	  gen/jamo.c gen/nameslist.c gen/nameslist_dict.c \
 	  gen/ucstoname_tab.c gen/nametoucs_tab.c nametoucs.c \
-	  ucslookup.c cache.c
+	  ucslookup.c cache.c $(wildcard enums/*.c)
 
 LIBOBJS = $(patsubst %.c,%.o,$(LIBSRCS))
 SO_OBJS = $(patsubst %.c,%.lo,$(LIBSRCS))
@@ -63,7 +68,7 @@ SO_OBJS = $(patsubst %.c,%.lo,$(LIBSRCS))
 all : $(LIB_FILE) $(SO_FILE) $(SO_NAME)
 
 clean:
-	rm -rf gen
+	rm -rf gen enums
 	rm -f *.o *.i *.*.a *.so *.so.*
 	$(MAKE) -C perfect clean
 
@@ -129,6 +134,9 @@ $(SO_NAME): $(SO_FILE)
 	ln -f $(SO_FILE) $(SO_NAME)
 endif
 
+ucd.h: ucd.h.in enum.list makeenums.pl
+	$(PERL) makeenums.pl
+
 # -----------------------------------------------------------------------
 
 proparray.o: proparray.c ucd.h $(HDRS) gen/proparray.c
diff --git a/convert_ucd.pl b/convert_ucd.pl
index 5cc3e97..c927b9c 100755
--- a/convert_ucd.pl
+++ b/convert_ucd.pl
@@ -397,8 +397,8 @@ sub make_properties_array()
 	$mine .= "\t\tUC_GC_$gc,\n";
 
 	# Script
-	my $scr = $$cp{'Script'} || 'Common';
-	$mine .= "\t\tUC_SCR_$scr,\n";
+	my $sc = $$cp{'Script'} || 'Common';
+	$mine .= "\t\tUC_SC_$sc,\n";
 
 	# Numeric value
 	my $nv = $$cp{'Numeric_Value'};
@@ -476,8 +476,8 @@ sub make_properties_array()
 	$mine .= "\t\tUC_JG_$ajg,\n";
 
 	# East Asian Width
-	my $eaw = $$cp{'East_Asian_Width'} || 'N';
-	$mine .= "\t\tUC_EAW_$eaw,\n";
+	my $ea = $$cp{'East_Asian_Width'} || 'N';
+	$mine .= "\t\tUC_EA_$ea,\n";
 
 	# Hangul Syllable Type
 	my $hst = $$cp{'Hangul_Syllable_Type'} || 'NA';
@@ -493,7 +493,7 @@ sub make_properties_array()
 
 	# Bidi Class
 	my $bc = $$cp{'Bidi_Class'} || 'L';
-	$mine .= "\t\tUC_BIDI_$bc,\n";
+	$mine .= "\t\tUC_BC_$bc,\n";
 
 	# Additional properties...
 	$mine .= "\t},\n";
diff --git a/libucd_int.h b/libucd_int.h
index 5514ac0..ff3d74a 100644
--- a/libucd_int.h
+++ b/libucd_int.h
@@ -31,7 +31,7 @@ struct _libucd_property_array {
   int24    simple_lowercase;
   int24    simple_titlecase;
   uint8_t  age;			/* (major << 3) + minor */
-  uint8_t  combining_class;
+  uint8_t  canonical_combining_class;
   unsigned sentence_break	:4;
   unsigned grapheme_cluster_break :4;
   unsigned word_break		:3;
@@ -77,4 +77,9 @@ struct libucd_private {
 struct unicode_character_data *
 unicode_character_data_raw(int32_t ucs);
 
+struct libucd_enum_names {
+  const char *long_name;
+  const char *short_name;
+};
+
 #endif
diff --git a/makeenums.pl b/makeenums.pl
new file mode 100755
index 0000000..2178da1
--- /dev/null
+++ b/makeenums.pl
@@ -0,0 +1,118 @@
+#!/usr/bin/perl
+#
+# Process enum.list and output ucd.h as well as
+# C code indicies for each enum.
+#
+
+sub close_frag($) {
+    my($longname) = @_;
+
+    print UCD_H "};\n\n";
+    print FRAG "};\n";
+    print FRAG <<EOF;
+
+int
+unicode_property_\L${longname}\E_names(enum unicode_\L${longname}\E v,
+				      const char **longname,
+				      const char **shortname)
+{
+    const char *ln, *sn;
+    int rv;
+
+    if ( (unsigned)v >= sizeof enum_names/sizeof(struct libucd_enum_names) ) {
+	ln = NULL;
+	sn = NULL;
+	rv = 1;
+    } else {
+	ln = enum_names[v].long_name;
+	sn = enum_names[v].short_name;
+	rv = 0;
+    }
+    if ( longname )  *longname  = ln;
+    if ( shortname ) *shortname = sn;
+    return rv;
+}
+EOF
+    close(FRAG);
+}    
+
+open(LIST,   '<', "enum.list") or die;
+open(UCD_IN, '<', "ucd.h.in")  or die;
+open(UCD_H,  '>', "ucd.h")     or die;
+mkdir("enums", 0777);
+
+while ( defined($line = <UCD_IN>) ) {
+    last if ( $line =~ /ENUMS\;/ );
+    print UCD_H $line;
+}
+
+undef $shortname;
+
+while( defined($line = <LIST>) ) {
+    chomp $line;
+    next if ( $line =~ /^\s*(\#\#.*|)$/ );
+
+    if ( $line =~ /^\#\s*(\S+)\s+\((\S+)\)\s+(long|short)\s*$/ ) {
+	$prev = $longname;
+
+	$longname  = $1;
+	$shortname = $2;
+	$whichname = ($3 eq 'long') ? 1 : 0;
+
+	if ( defined($prev) ) {
+	    close_frag($prev);
+	}
+	print UCD_H "enum unicode_\L${longname}\E {\n";
+	open(FRAG, '>', "enums/${longname}.c") or die;
+	print FRAG "#include \"libucd_int.h\"\n";
+	print FRAG "static const struct libucd_enum_names enum_names[] = {\n";
+    } elsif ( $line =~ /\;/ ) {
+	$line =~ s/\s*\#.*$//;	# Remove comments
+	@list = split(/\s*;\s*/, $line);
+
+	if ( shift(@list) ne $shortname ) {
+	    die "$0: Unexpected line: $line\n";
+	}
+
+	undef $epos;
+	if ( $list[0] =~ /^[0-9]+$/ ) {
+	    $epos = shift(@list);
+	}
+
+	# Write ucd.h
+
+	($na = $list[$whichname]) =~ tr/-/_/;
+	$nx = $list[1-$whichname];
+	$nx = ($nx eq 'n/a') ? '' : "/* $nx */";
+
+	if ( defined($epos) ) {
+	    printf UCD_H "  %-30s = %3d,    %s\n", 
+	    "UC_\U${shortname}\E_${na}", $epos, $nx;
+	} else {
+	    printf UCD_H "  %-40s %s\n", 
+	    "UC_\U${shortname}\E_${na},", $nx;
+	}
+	
+	# Write generator fragment
+
+	($ln = $list[1]) =~ tr/_/ /;
+	$sn = $list[0];
+	$sn = ($sn eq 'n/a') ? '0' : "\"$sn\"";
+	if ( defined($epos) ) {
+	    printf FRAG "\t[%3d] = { \"%s\", %s },\n", $epos, $ln, $sn;
+	} else {
+	    printf FRAG "\t{ \"%s\", %s },\n", $ln, $sn;
+	}
+    } else {
+	die "$0: Cannot parse: $line\n";
+    }
+}
+
+close_frag($longname);
+
+while ( defined($line = <UCD_IN>) ) {
+    print UCD_H $line;
+}
+
+close(UCD_IN);
+close(UCD_H);
diff --git a/ucd.h b/ucd.h
deleted file mode 100644
index c15ad09..0000000
--- a/ucd.h
+++ /dev/null
@@ -1,587 +0,0 @@
-/* ----------------------------------------------------------------------- *
- *   
- *   Copyright 2005 H. Peter Anvin - All Rights Reserved
- *
- *   Permission is hereby granted, free of charge, to any person
- *   obtaining a copy of this software and associated documentation
- *   files (the "Software"), to deal in the Software without
- *   restriction, including without limitation the rights to use,
- *   copy, modify, merge, publish, distribute, sublicense, and/or
- *   sell copies of the Software, and to permit persons to whom
- *   the Software is furnished to do so, subject to the following
- *   conditions:
- *   
- *   The above copyright notice and this permission notice shall
- *   be included in all copies or substantial portions of the Software.
- *   
- *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- *   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- *   OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- *   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- *   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- *   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- *   FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- *   OTHER DEALINGS IN THE SOFTWARE.
- *
- * ----------------------------------------------------------------------- */
-
-/*
- * ucd.h
- *
- * Simple interface to the Unicode Character Database
- */
-
-#ifndef UCD_H
-#define UCD_H
-
-#include <stdint.h>
-
-#define LIBUCD_THREAD_SUPPORT
-
-#ifdef LIBUCD_THREAD_SUPPORT
-# include <pthread.h>
-#endif
-
-enum unicode_bidi_class {
-  UC_BIDI_AL,			/* Arabic_Letter */
-  UC_BIDI_AN,			/* Arabic_Number */
-  UC_BIDI_B,			/* Paragraph_Separator */
-  UC_BIDI_BN,			/* Boundary_Neutral */
-  UC_BIDI_CS,			/* Common_Separator */
-  UC_BIDI_EN,			/* European_Number */
-  UC_BIDI_ES,			/* European_Separator */
-  UC_BIDI_ET,			/* European_Terminator */
-  UC_BIDI_L,			/* Left_To_Right */
-  UC_BIDI_LRE,			/* Left_To_Right_Embedding */
-  UC_BIDI_LRO,			/* Left_To_Right_Override */
-  UC_BIDI_NSM,			/* Nonspacing_Mark */
-  UC_BIDI_ON,			/* Other_Neutral */
-  UC_BIDI_PDF,			/* Pop_Directional_Format */
-  UC_BIDI_R,			/* Right_To_Left */
-  UC_BIDI_RLE,			/* Right_To_Left_Embedding */
-  UC_BIDI_RLO,			/* Right_To_Left_Override */
-  UC_BIDI_S,			/* Segment_Separator */
-  UC_BIDI_WS,			/* White_Space */
-};
-
-enum unicode_combining_class {
-  UC_CCC_NR = 0,		/* Not_Reordered */
-  UC_CCC_OV = 1,		/* Overlay */
-  UC_CCC_NK = 7,		/* Nukta */
-  UC_CCC_KV = 8,		/* Kana_Voicing */
-  UC_CCC_VR = 9,		/* Virama */
-  UC_CCC_ATBL = 200,		/* Attached_Below_Left */
-  UC_CCC_ATB = 202,		/* Attached_Below */
-  UC_CCC_ATAR = 216,		/* Attached_Above_Right */
-  UC_CCC_BL = 218,		/* Below_Left */
-  UC_CCC_B = 220,		/* Below */
-  UC_CCC_BR = 222,		/* Below_Right */
-  UC_CCC_L = 224,		/* Left */
-  UC_CCC_R = 226,		/* Right */
-  UC_CCC_AL = 228,		/* Above_Left */
-  UC_CCC_A = 230,		/* Above */
-  UC_CCC_AR = 232,		/* Above_Right */
-  UC_CCC_DB = 233,		/* Double_Below */
-  UC_CCC_DA = 234,		/* Double_Above */
-  UC_CCC_IS = 240,		/* Iota_Subscript */
-};
-
-enum unicode_east_asian_width {
-  UC_EAW_A,			/* Ambiguous */
-  UC_EAW_F,			/* Fullwidth */
-  UC_EAW_H,			/* Halfwidth */
-  UC_EAW_N,			/* Neutral */
-  UC_EAW_Na,			/* Narrow */
-  UC_EAW_W,			/* Wide */
-};
-
-enum unicode_grapheme_cluster_break {
-  UC_GCB_Other = 0,		/* Other */
-  UC_GCB_Control,
-  UC_GCB_CR,
-  UC_GCB_Extend,
-  UC_GCB_L,
-  UC_GCB_LF,
-  UC_GCB_LV,
-  UC_GCB_LVT,
-  UC_GCB_T,
-  UC_GCB_V,
-};
-
-enum unicode_hangul_syllable_type {
-  UC_HST_NA = 0,		/* Not applicable */
-  UC_HST_L,			/* Leading_Jamo */
-  UC_HST_LV,			/* LV syllable */
-  UC_HST_LVT,			/* LVT syllable */
-  UC_HST_T,			/* Trailing_Jamo */
-  UC_HST_V,			/* Vowel_Jamo */
-};
-enum unicode_joining_group {
-  UC_JG_No_Joining_Group = 0,
-  UC_JG_Ain,
-  UC_JG_Alaph,
-  UC_JG_Alef,
-  UC_JG_Beh,
-  UC_JG_Beth,
-  UC_JG_Dal,
-  UC_JG_Dalath_Rish,
-  UC_JG_E,
-  UC_JG_Fe,
-  UC_JG_Feh,
-  UC_JG_Final_Semkath,
-  UC_JG_Gaf,
-  UC_JG_Gamal,
-  UC_JG_Hah,
-  UC_JG_Hamza_On_Heh_Goal,
-  UC_JG_He,
-  UC_JG_Heh,
-  UC_JG_Heh_Goal,
-  UC_JG_Heth,
-  UC_JG_Kaf,
-  UC_JG_Kaph,
-  UC_JG_Khaph,
-  UC_JG_Knotted_Heh,
-  UC_JG_Lam,
-  UC_JG_Lamadh,
-  UC_JG_Meem,
-  UC_JG_Mim,
-  UC_JG_Noon,
-  UC_JG_Nun,
-  UC_JG_Pe,
-  UC_JG_Qaf,
-  UC_JG_Qaph,
-  UC_JG_Reh,
-  UC_JG_Reversed_Pe,
-  UC_JG_Sad,
-  UC_JG_Sadhe,
-  UC_JG_Seen,
-  UC_JG_Semkath,
-  UC_JG_Shin,
-  UC_JG_Swash_Kaf,
-  UC_JG_Syriac_Waw,
-  UC_JG_Tah,
-  UC_JG_Taw,
-  UC_JG_Teh_Marbuta,
-  UC_JG_Teth,
-  UC_JG_Waw,
-  UC_JG_Yeh,
-  UC_JG_Yeh_Barree,
-  UC_JG_Yeh_With_Tail,
-  UC_JG_Yudh,
-  UC_JG_Yudh_He,
-  UC_JG_Zain,
-  UC_JG_Zhain,
-};
-
-enum unicode_joining_type {
-  UC_JT_U = 0,
-  UC_JT_R,
-  UC_JT_L,
-  UC_JT_D,
-  UC_JT_C,
-  UC_JT_T,
-};
-
-enum unicode_ternary {
-  UC_FALSE = 0,
-  UC_TRUE = 1,
-  UC_MAYBE = 2
-};
-
-enum unicode_numeric_type {
-  UC_NT_None = 0,
-  UC_NT_Numeric,
-  UC_NT_Digit,
-  UC_NT_Decimal,
-};
-
-enum unicode_sentence_break {
-  UC_SB_Other = 0,
-  UC_SB_Sep,
-  UC_SB_Format,
-  UC_SB_Sp,
-  UC_SB_Lower,
-  UC_SB_Upper,
-  UC_SB_OLetter,
-  UC_SB_Numeric,
-  UC_SB_ATerm,
-  UC_SB_STerm,
-  UC_SB_Close,
-};
-
-enum unicode_word_break {
-  UC_WB_Other = 0,
-  UC_WB_Format,
-  UC_WB_Katakana,
-  UC_WB_ALetter,
-  UC_WB_MidLetter,
-  UC_WB_MidNum,
-  UC_WB_Numeric,
-  UC_WB_ExtendNumLet,
-};
-
-enum unicode_line_break {
-  UC_LB_XX = 0,
-  UC_LB_BK,
-  UC_LB_CR,
-  UC_LB_LF,
-  UC_LB_CM,
-  UC_LB_SG,
-  UC_LB_GL,
-  UC_LB_CB,
-  UC_LB_SP,
-  UC_LB_ZW,
-  UC_LB_NL,
-  UC_LB_WJ,
-  UC_LB_JL,
-  UC_LB_JV,
-  UC_LB_JT,
-  UC_LB_H2,
-  UC_LB_H3,
-  UC_LB_OP,
-  UC_LB_CL,
-  UC_LB_QU,
-  UC_LB_NS,
-  UC_LB_EX,
-  UC_LB_SY,
-  UC_LB_IS,
-  UC_LB_PR,
-  UC_LB_PO,
-  UC_LB_NU,
-  UC_LB_AL,
-  UC_LB_ID,
-  UC_LB_IN,
-  UC_LB_HY,
-  UC_LB_BB,
-  UC_LB_BA,
-  UC_LB_SA,
-  UC_LB_AI,
-  UC_LB_B2,
-};
-
-enum unicode_general_category {
-  UC_GC_Cn = 0,
-  UC_GC_Cc,
-  UC_GC_Cf,
-  UC_GC_Co,
-  UC_GC_Cs,
-  UC_GC_Ll,
-  UC_GC_Lm,
-  UC_GC_Lo,
-  UC_GC_Lt,
-  UC_GC_Lu,
-  UC_GC_Mc,
-  UC_GC_Me,
-  UC_GC_Mn,
-  UC_GC_Nd,
-  UC_GC_Nl,
-  UC_GC_No,
-  UC_GC_Pc,
-  UC_GC_Pd,
-  UC_GC_Pe,
-  UC_GC_Pf,
-  UC_GC_Pi,
-  UC_GC_Po,
-  UC_GC_Ps,
-  UC_GC_Sc,
-  UC_GC_Sk,
-  UC_GC_Sm,
-  UC_GC_So,
-  UC_GC_Sp,
-  UC_GC_Zl,
-  UC_GC_Zp,
-  UC_GC_Zs,
-};
-
-enum unicode_script {
-  UC_SCR_Common = 0,
-  UC_SCR_Latin,
-  UC_SCR_Greek,
-  UC_SCR_Cyrillic,
-  UC_SCR_Armenian,
-  UC_SCR_Hebrew,
-  UC_SCR_Arabic,
-  UC_SCR_Syriac,
-  UC_SCR_Thaana,
-  UC_SCR_Devanagari,
-  UC_SCR_Bengali,
-  UC_SCR_Gurmukhi,
-  UC_SCR_Gujarati,
-  UC_SCR_Oriya,
-  UC_SCR_Tamil,
-  UC_SCR_Telugu,
-  UC_SCR_Kannada,
-  UC_SCR_Malayalam,
-  UC_SCR_Sinhala,
-  UC_SCR_Thai,
-  UC_SCR_Lao,
-  UC_SCR_Tibetan,
-  UC_SCR_Myanmar,
-  UC_SCR_Georgian,
-  UC_SCR_Hangul,
-  UC_SCR_Ethiopic,
-  UC_SCR_Cherokee,
-  UC_SCR_Canadian_Aboriginal,
-  UC_SCR_Ogham,
-  UC_SCR_Runic,
-  UC_SCR_Khmer,
-  UC_SCR_Mongolian,
-  UC_SCR_Hiragana,
-  UC_SCR_Katakana,
-  UC_SCR_Bopomofo,
-  UC_SCR_Han,
-  UC_SCR_Yi,
-  UC_SCR_Old_Italic,
-  UC_SCR_Gothic,
-  UC_SCR_Deseret,
-  UC_SCR_Inherited,
-  UC_SCR_Tagalog,
-  UC_SCR_Hanunoo,
-  UC_SCR_Buhid,
-  UC_SCR_Tagbanwa,
-  UC_SCR_Limbu,
-  UC_SCR_Tai_Le,
-  UC_SCR_Linear_B,
-  UC_SCR_Ugaritic,
-  UC_SCR_Shavian,
-  UC_SCR_Osmanya,
-  UC_SCR_Cypriot,
-  UC_SCR_Braille,
-  UC_SCR_Buginese,
-  UC_SCR_Coptic,
-  UC_SCR_New_Tai_Lue,
-  UC_SCR_Glagolitic,
-  UC_SCR_Tifinagh,
-  UC_SCR_Syloti_Nagri,
-  UC_SCR_Old_Persian,
-  UC_SCR_Kharoshthi,
-};
-
-enum unicode_block {
-  UC_BLK_No_Block = 0,
-  UC_BLK_Basic_Latin,
-  UC_BLK_Latin_1_Supplement,
-  UC_BLK_Latin_Extended_A,
-  UC_BLK_Latin_Extended_B,
-  UC_BLK_IPA_Extensions,
-  UC_BLK_Spacing_Modifier_Letters,
-  UC_BLK_Combining_Diacritical_Marks,
-  UC_BLK_Greek_and_Coptic,
-  UC_BLK_Cyrillic,
-  UC_BLK_Cyrillic_Supplement,
-  UC_BLK_Armenian,
-  UC_BLK_Hebrew,
-  UC_BLK_Arabic,
-  UC_BLK_Syriac,
-  UC_BLK_Arabic_Supplement,
-  UC_BLK_Thaana,
-  UC_BLK_Devanagari,
-  UC_BLK_Bengali,
-  UC_BLK_Gurmukhi,
-  UC_BLK_Gujarati,
-  UC_BLK_Oriya,
-  UC_BLK_Tamil,
-  UC_BLK_Telugu,
-  UC_BLK_Kannada,
-  UC_BLK_Malayalam,
-  UC_BLK_Sinhala,
-  UC_BLK_Thai,
-  UC_BLK_Lao,
-  UC_BLK_Tibetan,
-  UC_BLK_Myanmar,
-  UC_BLK_Georgian,
-  UC_BLK_Hangul_Jamo,
-  UC_BLK_Ethiopic,
-  UC_BLK_Ethiopic_Supplement,
-  UC_BLK_Cherokee,
-  UC_BLK_Unified_Canadian_Aboriginal_Syllabics,
-  UC_BLK_Ogham,
-  UC_BLK_Runic,
-  UC_BLK_Tagalog,
-  UC_BLK_Hanunoo,
-  UC_BLK_Buhid,
-  UC_BLK_Tagbanwa,
-  UC_BLK_Khmer,
-  UC_BLK_Mongolian,
-  UC_BLK_Limbu,
-  UC_BLK_Tai_Le,
-  UC_BLK_New_Tai_Lue,
-  UC_BLK_Khmer_Symbols,
-  UC_BLK_Buginese,
-  UC_BLK_Phonetic_Extensions,
-  UC_BLK_Phonetic_Extensions_Supplement,
-  UC_BLK_Combining_Diacritical_Marks_Supplement,
-  UC_BLK_Latin_Extended_Additional,
-  UC_BLK_Greek_Extended,
-  UC_BLK_General_Punctuation,
-  UC_BLK_Superscripts_and_Subscripts,
-  UC_BLK_Currency_Symbols,
-  UC_BLK_Combining_Diacritical_Marks_for_Symbols,
-  UC_BLK_Letterlike_Symbols,
-  UC_BLK_Number_Forms,
-  UC_BLK_Arrows,
-  UC_BLK_Mathematical_Operators,
-  UC_BLK_Miscellaneous_Technical,
-  UC_BLK_Control_Pictures,
-  UC_BLK_Optical_Character_Recognition,
-  UC_BLK_Enclosed_Alphanumerics,
-  UC_BLK_Box_Drawing,
-  UC_BLK_Block_Elements,
-  UC_BLK_Geometric_Shapes,
-  UC_BLK_Miscellaneous_Symbols,
-  UC_BLK_Dingbats,
-  UC_BLK_Miscellaneous_Mathematical_Symbols_A,
-  UC_BLK_Supplemental_Arrows_A,
-  UC_BLK_Braille_Patterns,
-  UC_BLK_Supplemental_Arrows_B,
-  UC_BLK_Miscellaneous_Mathematical_Symbols_B,
-  UC_BLK_Supplemental_Mathematical_Operators,
-  UC_BLK_Miscellaneous_Symbols_and_Arrows,
-  UC_BLK_Glagolitic,
-  UC_BLK_Coptic,
-  UC_BLK_Georgian_Supplement,
-  UC_BLK_Tifinagh,
-  UC_BLK_Ethiopic_Extended,
-  UC_BLK_Supplemental_Punctuation,
-  UC_BLK_CJK_Radicals_Supplement,
-  UC_BLK_Kangxi_Radicals,
-  UC_BLK_Ideographic_Description_Characters,
-  UC_BLK_CJK_Symbols_and_Punctuation,
-  UC_BLK_Hiragana,
-  UC_BLK_Katakana,
-  UC_BLK_Bopomofo,
-  UC_BLK_Hangul_Compatibility_Jamo,
-  UC_BLK_Kanbun,
-  UC_BLK_Bopomofo_Extended,
-  UC_BLK_CJK_Strokes,
-  UC_BLK_Katakana_Phonetic_Extensions,
-  UC_BLK_Enclosed_CJK_Letters_and_Months,
-  UC_BLK_CJK_Compatibility,
-  UC_BLK_CJK_Unified_Ideographs_Extension_A,
-  UC_BLK_Yijing_Hexagram_Symbols,
-  UC_BLK_CJK_Unified_Ideographs,
-  UC_BLK_Yi_Syllables,
-  UC_BLK_Yi_Radicals,
-  UC_BLK_Modifier_Tone_Letters,
-  UC_BLK_Syloti_Nagri,
-  UC_BLK_Hangul_Syllables,
-  UC_BLK_High_Surrogates,
-  UC_BLK_High_Private_Use_Surrogates,
-  UC_BLK_Low_Surrogates,
-  UC_BLK_Private_Use_Area,
-  UC_BLK_CJK_Compatibility_Ideographs,
-  UC_BLK_Alphabetic_Presentation_Forms,
-  UC_BLK_Arabic_Presentation_Forms_A,
-  UC_BLK_Variation_Selectors,
-  UC_BLK_Vertical_Forms,
-  UC_BLK_Combining_Half_Marks,
-  UC_BLK_CJK_Compatibility_Forms,
-  UC_BLK_Small_Form_Variants,
-  UC_BLK_Arabic_Presentation_Forms_B,
-  UC_BLK_Halfwidth_and_Fullwidth_Forms,
-  UC_BLK_Specials,
-  UC_BLK_Linear_B_Syllabary,
-  UC_BLK_Linear_B_Ideograms,
-  UC_BLK_Aegean_Numbers,
-  UC_BLK_Ancient_Greek_Numbers,
-  UC_BLK_Old_Italic,
-  UC_BLK_Gothic,
-  UC_BLK_Ugaritic,
-  UC_BLK_Old_Persian,
-  UC_BLK_Deseret,
-  UC_BLK_Shavian,
-  UC_BLK_Osmanya,
-  UC_BLK_Cypriot_Syllabary,
-  UC_BLK_Kharoshthi,
-  UC_BLK_Byzantine_Musical_Symbols,
-  UC_BLK_Musical_Symbols,
-  UC_BLK_Ancient_Greek_Musical_Notation,
-  UC_BLK_Tai_Xuan_Jing_Symbols,
-  UC_BLK_Mathematical_Alphanumeric_Symbols,
-  UC_BLK_CJK_Unified_Ideographs_Extension_B,
-  UC_BLK_CJK_Compatibility_Ideographs_Supplement,
-  UC_BLK_Tags,
-  UC_BLK_Variation_Selectors_Supplement,
-  UC_BLK_Supplementary_Private_Use_Area_A,
-  UC_BLK_Supplementary_Private_Use_Area_B,
-};
-
-#define UC_FLAG(x) (UINT64_C(1) << (x))
-
-#define UC_FL_COMPOSITION_EXCLUSION     UC_FLAG(0)
-#define UC_FL_ALPHABETIC                UC_FLAG(1)
-#define UC_FL_DEFAULT_IGNORABLE_CODE_POINT  UC_FLAG(2)
-#define UC_FL_LOWERCASE                 UC_FLAG(3)
-#define UC_FL_GRAPHEME_BASE             UC_FLAG(4)
-#define UC_FL_GRAPHEME_EXTEND           UC_FLAG(5)
-#define UC_FL_ID_START                  UC_FLAG(6)
-#define UC_FL_ID_CONTINUE               UC_FLAG(7)
-#define UC_FL_MATH                      UC_FLAG(8)
-#define UC_FL_UPPERCASE                 UC_FLAG(9)
-#define UC_FL_XID_START                 UC_FLAG(10)
-#define UC_FL_XID_CONTINUE              UC_FLAG(11)
-#define UC_FL_HEX_DIGIT                 UC_FLAG(12)
-#define UC_FL_BIDI_CONTROL              UC_FLAG(13)
-#define UC_FL_DASH                      UC_FLAG(14)
-#define UC_FL_DEPRECATED                UC_FLAG(15)
-#define UC_FL_DIACRITIC                 UC_FLAG(16)
-#define UC_FL_EXTENDER                  UC_FLAG(17)
-#define UC_FL_GRAPHEME_LINK             UC_FLAG(18)
-#define UC_FL_IDEOGRAPHIC               UC_FLAG(19)
-#define UC_FL_IDS_BINARY_OPERATOR       UC_FLAG(20)
-#define UC_FL_IDS_TRINARY_OPERATOR      UC_FLAG(21)
-#define UC_FL_JOIN_CONTROL              UC_FLAG(22)
-#define UC_FL_LOGICAL_ORDER_EXCEPTION   UC_FLAG(23)
-#define UC_FL_NONCHARACTER_CODE_POINT   UC_FLAG(24)
-#define UC_FL_PATTERN_SYNTAX            UC_FLAG(25)
-#define UC_FL_PATTERN_WHITE_SPACE       UC_FLAG(26)
-#define UC_FL_QUOTATION_MARK            UC_FLAG(27)
-#define UC_FL_RADICAL                   UC_FLAG(28)
-#define UC_FL_SOFT_DOTTED               UC_FLAG(29)
-#define UC_FL_STERM                     UC_FLAG(30)
-#define UC_FL_TERMINAL_PUNCTUATION	UC_FLAG(31)
-#define UC_FL_UNIFIED_IDEOGRAPH		UC_FLAG(32)
-#define UC_FL_VARIATION_SELECTOR	UC_FLAG(33)
-#define UC_FL_WHITE_SPACE		UC_FLAG(34)
-#define UC_FL_BIDI_MIRRORED		UC_FLAG(35)
-
-struct unicode_character_data {
-  int32_t ucs;			/* Actual codepoint */
-  uint16_t size;		/* Size of this structure */
-  uint16_t alloc_size;		/* Allocation size */
-  uint64_t fl;			/* Flags */
-  const char *name;
-  const char *bidi_mirroring_glyph;
-  const char *uppercase_mapping;
-  const char *lowercase_mapping;
-  const char *titlecase_mapping;
-  int32_t simple_uppercase;
-  int32_t simple_lowercase;
-  int32_t simple_titlecase;
-  /* Numeric value = num/den * 10^exp */
-  uint8_t numeric_value_num;
-  uint8_t numeric_value_den;
-  uint8_t numeric_value_exp;
-  uint8_t age_ma, age_mi;
-  enum unicode_general_category         general_category;
-  enum unicode_block			block;
-  enum unicode_script                   script;
-  enum unicode_joining_type      	joining_type;
-  enum unicode_joining_group     	joining_group;
-  enum unicode_east_asian_width         east_asian_width;
-  enum unicode_hangul_syllable_type     hangul_syllable_type;
-  enum unicode_numeric_type             numeric_type;
-  enum unicode_combining_class	        combining_class;
-  enum unicode_bidi_class	    	bidi_class;
-  enum unicode_grapheme_cluster_break	grapheme_cluster_break;
-  enum unicode_sentence_break		sentence_break;
-  enum unicode_word_break		word_break;
-  enum unicode_line_break		line_break;
-};
-
-struct unicode_character_data *unicode_character_data(int32_t);
-struct unicode_character_data *unicode_character_get(struct unicode_character_data *);
-void unicode_character_put(struct unicode_character_data *);
-struct unicode_character_data *unicode_character_lookup(const char *);
-
-#endif /* UCD_H */
diff --git a/ucd.h.in b/ucd.h.in
new file mode 100644
index 0000000..74bab3c
--- /dev/null
+++ b/ucd.h.in
@@ -0,0 +1,125 @@
+/* -*- c -*- ------------------------------------------------------------- *
+ *   
+ *   Copyright 2005 H. Peter Anvin - All Rights Reserved
+ *
+ *   Permission is hereby granted, free of charge, to any person
+ *   obtaining a copy of this software and associated documentation
+ *   files (the "Software"), to deal in the Software without
+ *   restriction, including without limitation the rights to use,
+ *   copy, modify, merge, publish, distribute, sublicense, and/or
+ *   sell copies of the Software, and to permit persons to whom
+ *   the Software is furnished to do so, subject to the following
+ *   conditions:
+ *   
+ *   The above copyright notice and this permission notice shall
+ *   be included in all copies or substantial portions of the Software.
+ *   
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ *   OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ *   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ *   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *   FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ *   OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * ucd.h
+ *
+ * Simple interface to the Unicode Character Database
+ */
+
+#ifndef UCD_H
+#define UCD_H
+
+#include <stdint.h>
+
+#define LIBUCD_THREAD_SUPPORT
+
+#ifdef LIBUCD_THREAD_SUPPORT
+# include <pthread.h>
+#endif
+
+ENUMS;
+
+#define UC_FLAG(x) (UINT64_C(1) << (x))
+
+#define UC_FL_COMPOSITION_EXCLUSION     UC_FLAG(0)
+#define UC_FL_ALPHABETIC                UC_FLAG(1)
+#define UC_FL_DEFAULT_IGNORABLE_CODE_POINT  UC_FLAG(2)
+#define UC_FL_LOWERCASE                 UC_FLAG(3)
+#define UC_FL_GRAPHEME_BASE             UC_FLAG(4)
+#define UC_FL_GRAPHEME_EXTEND           UC_FLAG(5)
+#define UC_FL_ID_START                  UC_FLAG(6)
+#define UC_FL_ID_CONTINUE               UC_FLAG(7)
+#define UC_FL_MATH                      UC_FLAG(8)
+#define UC_FL_UPPERCASE                 UC_FLAG(9)
+#define UC_FL_XID_START                 UC_FLAG(10)
+#define UC_FL_XID_CONTINUE              UC_FLAG(11)
+#define UC_FL_HEX_DIGIT                 UC_FLAG(12)
+#define UC_FL_BIDI_CONTROL              UC_FLAG(13)
+#define UC_FL_DASH                      UC_FLAG(14)
+#define UC_FL_DEPRECATED                UC_FLAG(15)
+#define UC_FL_DIACRITIC                 UC_FLAG(16)
+#define UC_FL_EXTENDER                  UC_FLAG(17)
+#define UC_FL_GRAPHEME_LINK             UC_FLAG(18)
+#define UC_FL_IDEOGRAPHIC               UC_FLAG(19)
+#define UC_FL_IDS_BINARY_OPERATOR       UC_FLAG(20)
+#define UC_FL_IDS_TRINARY_OPERATOR      UC_FLAG(21)
+#define UC_FL_JOIN_CONTROL              UC_FLAG(22)
+#define UC_FL_LOGICAL_ORDER_EXCEPTION   UC_FLAG(23)
+#define UC_FL_NONCHARACTER_CODE_POINT   UC_FLAG(24)
+#define UC_FL_PATTERN_SYNTAX            UC_FLAG(25)
+#define UC_FL_PATTERN_WHITE_SPACE       UC_FLAG(26)
+#define UC_FL_QUOTATION_MARK            UC_FLAG(27)
+#define UC_FL_RADICAL                   UC_FLAG(28)
+#define UC_FL_SOFT_DOTTED               UC_FLAG(29)
+#define UC_FL_STERM                     UC_FLAG(30)
+#define UC_FL_TERMINAL_PUNCTUATION	UC_FLAG(31)
+#define UC_FL_UNIFIED_IDEOGRAPH		UC_FLAG(32)
+#define UC_FL_VARIATION_SELECTOR	UC_FLAG(33)
+#define UC_FL_WHITE_SPACE		UC_FLAG(34)
+#define UC_FL_BIDI_MIRRORED		UC_FLAG(35)
+
+struct unicode_character_data {
+  int32_t ucs;			/* Actual codepoint */
+  uint16_t size;		/* Size of this structure */
+  uint16_t alloc_size;		/* Allocation size */
+  uint64_t fl;			/* Flags */
+  const char *name;
+  const char *bidi_mirroring_glyph;
+  const char *uppercase_mapping;
+  const char *lowercase_mapping;
+  const char *titlecase_mapping;
+  int32_t simple_uppercase;
+  int32_t simple_lowercase;
+  int32_t simple_titlecase;
+  /* Numeric value = num/den * 10^exp */
+  uint8_t numeric_value_num;
+  uint8_t numeric_value_den;
+  uint8_t numeric_value_exp;
+  uint8_t age_ma, age_mi;
+  enum unicode_general_category         general_category;
+  enum unicode_block			block;
+  enum unicode_script                   script;
+  enum unicode_joining_type      	joining_type;
+  enum unicode_joining_group     	joining_group;
+  enum unicode_east_asian_width         east_asian_width;
+  enum unicode_hangul_syllable_type     hangul_syllable_type;
+  enum unicode_numeric_type             numeric_type;
+  enum unicode_canonical_combining_class canonical_combining_class;
+  enum unicode_bidi_class	    	bidi_class;
+  enum unicode_grapheme_cluster_break	grapheme_cluster_break;
+  enum unicode_sentence_break		sentence_break;
+  enum unicode_word_break		word_break;
+  enum unicode_line_break		line_break;
+};
+
+struct unicode_character_data *unicode_character_data(int32_t);
+struct unicode_character_data *unicode_character_get(struct unicode_character_data *);
+void unicode_character_put(struct unicode_character_data *);
+struct unicode_character_data *unicode_character_lookup(const char *);
+
+#endif /* UCD_H */
diff --git a/ucslookup.c b/ucslookup.c
index 3283b0b..3c6aa46 100644
--- a/ucslookup.c
+++ b/ucslookup.c
@@ -141,7 +141,7 @@ alloc_copy_properties(const struct _libucd_property_array *prop,
   ucd->east_asian_width = prop->east_asian_width;
   ucd->hangul_syllable_type = prop->hangul_syllable_type;
   ucd->numeric_type = prop->numeric_type;
-  ucd->combining_class = prop->combining_class;
+  ucd->canonical_combining_class = prop->canonical_combining_class;
   ucd->bidi_class = prop->bidi_class;
   ucd->grapheme_cluster_break = prop->grapheme_cluster_break;
   ucd->sentence_break = prop->sentence_break;
author	H. Peter Anvin <hpa@smyrno.hos.anvin.org>	2005-12-05 21:25:37 -0800
committer	H. Peter Anvin <hpa@smyrno.hos.anvin.org>	2005-12-05 21:25:37 -0800
commit	bd8acab4d3bd0b010396d7f6ffd9f5b9cb5db619 (patch)
tree	dbec74f33de14a14e5f9acebaeafc21b4d0abe42
parent	8986e43af73d8f8f8e93e169878ecd29d5592a3a (diff)
download	libucd-bd8acab4d3bd0b010396d7f6ffd9f5b9cb5db619.tar.gz