diff options
author | H. Peter Anvin <hpa@smyrno.hos.anvin.org> | 2005-12-05 21:25:37 -0800 |
---|---|---|
committer | H. Peter Anvin <hpa@smyrno.hos.anvin.org> | 2005-12-05 21:25:37 -0800 |
commit | bd8acab4d3bd0b010396d7f6ffd9f5b9cb5db619 (patch) | |
tree | dbec74f33de14a14e5f9acebaeafc21b4d0abe42 | |
parent | 8986e43af73d8f8f8e93e169878ecd29d5592a3a (diff) | |
download | libucd-bd8acab4d3bd0b010396d7f6ffd9f5b9cb5db619.tar.gz |
Generate enumerations automagically; canonicalize names
-rw-r--r-- | Makefile | 12 | ||||
-rwxr-xr-x | convert_ucd.pl | 10 | ||||
-rw-r--r-- | libucd_int.h | 7 | ||||
-rwxr-xr-x | makeenums.pl | 118 | ||||
-rw-r--r-- | ucd.h | 587 | ||||
-rw-r--r-- | ucd.h.in | 125 | ||||
-rw-r--r-- | ucslookup.c | 2 |
7 files changed, 265 insertions, 596 deletions
@@ -48,12 +48,17 @@ CVT_FILES = gen/jamo.c gen/nameslist.tab gen/nametoucs.keys gen/nametoucs.tab \ .c.hi: $(HOST_CC) $(HOST_CFLAGS) -E -o $@ $< +enums/%.o: enums/%.c $(HDRS) + $(CC) $(CFLAGS) -c -o $@ $< +enums/%.lo: enums/%.c $(HDRS) + $(CC) $(CFLAGS) $(PICFLAGS) -c -o $@ $< + # ----------------------------------------------------------------------- LIBSRCS = proparray.c gen/nametoucs_hash.c gen/ucstoname_hash.c \ gen/jamo.c gen/nameslist.c gen/nameslist_dict.c \ gen/ucstoname_tab.c gen/nametoucs_tab.c nametoucs.c \ - ucslookup.c cache.c + ucslookup.c cache.c $(wildcard enums/*.c) LIBOBJS = $(patsubst %.c,%.o,$(LIBSRCS)) SO_OBJS = $(patsubst %.c,%.lo,$(LIBSRCS)) @@ -63,7 +68,7 @@ SO_OBJS = $(patsubst %.c,%.lo,$(LIBSRCS)) all : $(LIB_FILE) $(SO_FILE) $(SO_NAME) clean: - rm -rf gen + rm -rf gen enums rm -f *.o *.i *.*.a *.so *.so.* $(MAKE) -C perfect clean @@ -129,6 +134,9 @@ $(SO_NAME): $(SO_FILE) ln -f $(SO_FILE) $(SO_NAME) endif +ucd.h: ucd.h.in enum.list makeenums.pl + $(PERL) makeenums.pl + # ----------------------------------------------------------------------- proparray.o: proparray.c ucd.h $(HDRS) gen/proparray.c diff --git a/convert_ucd.pl b/convert_ucd.pl index 5cc3e97..c927b9c 100755 --- a/convert_ucd.pl +++ b/convert_ucd.pl @@ -397,8 +397,8 @@ sub make_properties_array() $mine .= "\t\tUC_GC_$gc,\n"; # Script - my $scr = $$cp{'Script'} || 'Common'; - $mine .= "\t\tUC_SCR_$scr,\n"; + my $sc = $$cp{'Script'} || 'Common'; + $mine .= "\t\tUC_SC_$sc,\n"; # Numeric value my $nv = $$cp{'Numeric_Value'}; @@ -476,8 +476,8 @@ sub make_properties_array() $mine .= "\t\tUC_JG_$ajg,\n"; # East Asian Width - my $eaw = $$cp{'East_Asian_Width'} || 'N'; - $mine .= "\t\tUC_EAW_$eaw,\n"; + my $ea = $$cp{'East_Asian_Width'} || 'N'; + $mine .= "\t\tUC_EA_$ea,\n"; # Hangul Syllable Type my $hst = $$cp{'Hangul_Syllable_Type'} || 'NA'; @@ -493,7 +493,7 @@ sub make_properties_array() # Bidi Class my $bc = $$cp{'Bidi_Class'} || 'L'; - $mine .= "\t\tUC_BIDI_$bc,\n"; + $mine .= "\t\tUC_BC_$bc,\n"; # Additional properties... $mine .= "\t},\n"; diff --git a/libucd_int.h b/libucd_int.h index 5514ac0..ff3d74a 100644 --- a/libucd_int.h +++ b/libucd_int.h @@ -31,7 +31,7 @@ struct _libucd_property_array { int24 simple_lowercase; int24 simple_titlecase; uint8_t age; /* (major << 3) + minor */ - uint8_t combining_class; + uint8_t canonical_combining_class; unsigned sentence_break :4; unsigned grapheme_cluster_break :4; unsigned word_break :3; @@ -77,4 +77,9 @@ struct libucd_private { struct unicode_character_data * unicode_character_data_raw(int32_t ucs); +struct libucd_enum_names { + const char *long_name; + const char *short_name; +}; + #endif diff --git a/makeenums.pl b/makeenums.pl new file mode 100755 index 0000000..2178da1 --- /dev/null +++ b/makeenums.pl @@ -0,0 +1,118 @@ +#!/usr/bin/perl +# +# Process enum.list and output ucd.h as well as +# C code indicies for each enum. +# + +sub close_frag($) { + my($longname) = @_; + + print UCD_H "};\n\n"; + print FRAG "};\n"; + print FRAG <<EOF; + +int +unicode_property_\L${longname}\E_names(enum unicode_\L${longname}\E v, + const char **longname, + const char **shortname) +{ + const char *ln, *sn; + int rv; + + if ( (unsigned)v >= sizeof enum_names/sizeof(struct libucd_enum_names) ) { + ln = NULL; + sn = NULL; + rv = 1; + } else { + ln = enum_names[v].long_name; + sn = enum_names[v].short_name; + rv = 0; + } + if ( longname ) *longname = ln; + if ( shortname ) *shortname = sn; + return rv; +} +EOF + close(FRAG); +} + +open(LIST, '<', "enum.list") or die; +open(UCD_IN, '<', "ucd.h.in") or die; +open(UCD_H, '>', "ucd.h") or die; +mkdir("enums", 0777); + +while ( defined($line = <UCD_IN>) ) { + last if ( $line =~ /ENUMS\;/ ); + print UCD_H $line; +} + +undef $shortname; + +while( defined($line = <LIST>) ) { + chomp $line; + next if ( $line =~ /^\s*(\#\#.*|)$/ ); + + if ( $line =~ /^\#\s*(\S+)\s+\((\S+)\)\s+(long|short)\s*$/ ) { + $prev = $longname; + + $longname = $1; + $shortname = $2; + $whichname = ($3 eq 'long') ? 1 : 0; + + if ( defined($prev) ) { + close_frag($prev); + } + print UCD_H "enum unicode_\L${longname}\E {\n"; + open(FRAG, '>', "enums/${longname}.c") or die; + print FRAG "#include \"libucd_int.h\"\n"; + print FRAG "static const struct libucd_enum_names enum_names[] = {\n"; + } elsif ( $line =~ /\;/ ) { + $line =~ s/\s*\#.*$//; # Remove comments + @list = split(/\s*;\s*/, $line); + + if ( shift(@list) ne $shortname ) { + die "$0: Unexpected line: $line\n"; + } + + undef $epos; + if ( $list[0] =~ /^[0-9]+$/ ) { + $epos = shift(@list); + } + + # Write ucd.h + + ($na = $list[$whichname]) =~ tr/-/_/; + $nx = $list[1-$whichname]; + $nx = ($nx eq 'n/a') ? '' : "/* $nx */"; + + if ( defined($epos) ) { + printf UCD_H " %-30s = %3d, %s\n", + "UC_\U${shortname}\E_${na}", $epos, $nx; + } else { + printf UCD_H " %-40s %s\n", + "UC_\U${shortname}\E_${na},", $nx; + } + + # Write generator fragment + + ($ln = $list[1]) =~ tr/_/ /; + $sn = $list[0]; + $sn = ($sn eq 'n/a') ? '0' : "\"$sn\""; + if ( defined($epos) ) { + printf FRAG "\t[%3d] = { \"%s\", %s },\n", $epos, $ln, $sn; + } else { + printf FRAG "\t{ \"%s\", %s },\n", $ln, $sn; + } + } else { + die "$0: Cannot parse: $line\n"; + } +} + +close_frag($longname); + +while ( defined($line = <UCD_IN>) ) { + print UCD_H $line; +} + +close(UCD_IN); +close(UCD_H); @@ -1,587 +0,0 @@ -/* ----------------------------------------------------------------------- * - * - * Copyright 2005 H. Peter Anvin - All Rights Reserved - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom - * the Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall - * be included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - * ----------------------------------------------------------------------- */ - -/* - * ucd.h - * - * Simple interface to the Unicode Character Database - */ - -#ifndef UCD_H -#define UCD_H - -#include <stdint.h> - -#define LIBUCD_THREAD_SUPPORT - -#ifdef LIBUCD_THREAD_SUPPORT -# include <pthread.h> -#endif - -enum unicode_bidi_class { - UC_BIDI_AL, /* Arabic_Letter */ - UC_BIDI_AN, /* Arabic_Number */ - UC_BIDI_B, /* Paragraph_Separator */ - UC_BIDI_BN, /* Boundary_Neutral */ - UC_BIDI_CS, /* Common_Separator */ - UC_BIDI_EN, /* European_Number */ - UC_BIDI_ES, /* European_Separator */ - UC_BIDI_ET, /* European_Terminator */ - UC_BIDI_L, /* Left_To_Right */ - UC_BIDI_LRE, /* Left_To_Right_Embedding */ - UC_BIDI_LRO, /* Left_To_Right_Override */ - UC_BIDI_NSM, /* Nonspacing_Mark */ - UC_BIDI_ON, /* Other_Neutral */ - UC_BIDI_PDF, /* Pop_Directional_Format */ - UC_BIDI_R, /* Right_To_Left */ - UC_BIDI_RLE, /* Right_To_Left_Embedding */ - UC_BIDI_RLO, /* Right_To_Left_Override */ - UC_BIDI_S, /* Segment_Separator */ - UC_BIDI_WS, /* White_Space */ -}; - -enum unicode_combining_class { - UC_CCC_NR = 0, /* Not_Reordered */ - UC_CCC_OV = 1, /* Overlay */ - UC_CCC_NK = 7, /* Nukta */ - UC_CCC_KV = 8, /* Kana_Voicing */ - UC_CCC_VR = 9, /* Virama */ - UC_CCC_ATBL = 200, /* Attached_Below_Left */ - UC_CCC_ATB = 202, /* Attached_Below */ - UC_CCC_ATAR = 216, /* Attached_Above_Right */ - UC_CCC_BL = 218, /* Below_Left */ - UC_CCC_B = 220, /* Below */ - UC_CCC_BR = 222, /* Below_Right */ - UC_CCC_L = 224, /* Left */ - UC_CCC_R = 226, /* Right */ - UC_CCC_AL = 228, /* Above_Left */ - UC_CCC_A = 230, /* Above */ - UC_CCC_AR = 232, /* Above_Right */ - UC_CCC_DB = 233, /* Double_Below */ - UC_CCC_DA = 234, /* Double_Above */ - UC_CCC_IS = 240, /* Iota_Subscript */ -}; - -enum unicode_east_asian_width { - UC_EAW_A, /* Ambiguous */ - UC_EAW_F, /* Fullwidth */ - UC_EAW_H, /* Halfwidth */ - UC_EAW_N, /* Neutral */ - UC_EAW_Na, /* Narrow */ - UC_EAW_W, /* Wide */ -}; - -enum unicode_grapheme_cluster_break { - UC_GCB_Other = 0, /* Other */ - UC_GCB_Control, - UC_GCB_CR, - UC_GCB_Extend, - UC_GCB_L, - UC_GCB_LF, - UC_GCB_LV, - UC_GCB_LVT, - UC_GCB_T, - UC_GCB_V, -}; - -enum unicode_hangul_syllable_type { - UC_HST_NA = 0, /* Not applicable */ - UC_HST_L, /* Leading_Jamo */ - UC_HST_LV, /* LV syllable */ - UC_HST_LVT, /* LVT syllable */ - UC_HST_T, /* Trailing_Jamo */ - UC_HST_V, /* Vowel_Jamo */ -}; -enum unicode_joining_group { - UC_JG_No_Joining_Group = 0, - UC_JG_Ain, - UC_JG_Alaph, - UC_JG_Alef, - UC_JG_Beh, - UC_JG_Beth, - UC_JG_Dal, - UC_JG_Dalath_Rish, - UC_JG_E, - UC_JG_Fe, - UC_JG_Feh, - UC_JG_Final_Semkath, - UC_JG_Gaf, - UC_JG_Gamal, - UC_JG_Hah, - UC_JG_Hamza_On_Heh_Goal, - UC_JG_He, - UC_JG_Heh, - UC_JG_Heh_Goal, - UC_JG_Heth, - UC_JG_Kaf, - UC_JG_Kaph, - UC_JG_Khaph, - UC_JG_Knotted_Heh, - UC_JG_Lam, - UC_JG_Lamadh, - UC_JG_Meem, - UC_JG_Mim, - UC_JG_Noon, - UC_JG_Nun, - UC_JG_Pe, - UC_JG_Qaf, - UC_JG_Qaph, - UC_JG_Reh, - UC_JG_Reversed_Pe, - UC_JG_Sad, - UC_JG_Sadhe, - UC_JG_Seen, - UC_JG_Semkath, - UC_JG_Shin, - UC_JG_Swash_Kaf, - UC_JG_Syriac_Waw, - UC_JG_Tah, - UC_JG_Taw, - UC_JG_Teh_Marbuta, - UC_JG_Teth, - UC_JG_Waw, - UC_JG_Yeh, - UC_JG_Yeh_Barree, - UC_JG_Yeh_With_Tail, - UC_JG_Yudh, - UC_JG_Yudh_He, - UC_JG_Zain, - UC_JG_Zhain, -}; - -enum unicode_joining_type { - UC_JT_U = 0, - UC_JT_R, - UC_JT_L, - UC_JT_D, - UC_JT_C, - UC_JT_T, -}; - -enum unicode_ternary { - UC_FALSE = 0, - UC_TRUE = 1, - UC_MAYBE = 2 -}; - -enum unicode_numeric_type { - UC_NT_None = 0, - UC_NT_Numeric, - UC_NT_Digit, - UC_NT_Decimal, -}; - -enum unicode_sentence_break { - UC_SB_Other = 0, - UC_SB_Sep, - UC_SB_Format, - UC_SB_Sp, - UC_SB_Lower, - UC_SB_Upper, - UC_SB_OLetter, - UC_SB_Numeric, - UC_SB_ATerm, - UC_SB_STerm, - UC_SB_Close, -}; - -enum unicode_word_break { - UC_WB_Other = 0, - UC_WB_Format, - UC_WB_Katakana, - UC_WB_ALetter, - UC_WB_MidLetter, - UC_WB_MidNum, - UC_WB_Numeric, - UC_WB_ExtendNumLet, -}; - -enum unicode_line_break { - UC_LB_XX = 0, - UC_LB_BK, - UC_LB_CR, - UC_LB_LF, - UC_LB_CM, - UC_LB_SG, - UC_LB_GL, - UC_LB_CB, - UC_LB_SP, - UC_LB_ZW, - UC_LB_NL, - UC_LB_WJ, - UC_LB_JL, - UC_LB_JV, - UC_LB_JT, - UC_LB_H2, - UC_LB_H3, - UC_LB_OP, - UC_LB_CL, - UC_LB_QU, - UC_LB_NS, - UC_LB_EX, - UC_LB_SY, - UC_LB_IS, - UC_LB_PR, - UC_LB_PO, - UC_LB_NU, - UC_LB_AL, - UC_LB_ID, - UC_LB_IN, - UC_LB_HY, - UC_LB_BB, - UC_LB_BA, - UC_LB_SA, - UC_LB_AI, - UC_LB_B2, -}; - -enum unicode_general_category { - UC_GC_Cn = 0, - UC_GC_Cc, - UC_GC_Cf, - UC_GC_Co, - UC_GC_Cs, - UC_GC_Ll, - UC_GC_Lm, - UC_GC_Lo, - UC_GC_Lt, - UC_GC_Lu, - UC_GC_Mc, - UC_GC_Me, - UC_GC_Mn, - UC_GC_Nd, - UC_GC_Nl, - UC_GC_No, - UC_GC_Pc, - UC_GC_Pd, - UC_GC_Pe, - UC_GC_Pf, - UC_GC_Pi, - UC_GC_Po, - UC_GC_Ps, - UC_GC_Sc, - UC_GC_Sk, - UC_GC_Sm, - UC_GC_So, - UC_GC_Sp, - UC_GC_Zl, - UC_GC_Zp, - UC_GC_Zs, -}; - -enum unicode_script { - UC_SCR_Common = 0, - UC_SCR_Latin, - UC_SCR_Greek, - UC_SCR_Cyrillic, - UC_SCR_Armenian, - UC_SCR_Hebrew, - UC_SCR_Arabic, - UC_SCR_Syriac, - UC_SCR_Thaana, - UC_SCR_Devanagari, - UC_SCR_Bengali, - UC_SCR_Gurmukhi, - UC_SCR_Gujarati, - UC_SCR_Oriya, - UC_SCR_Tamil, - UC_SCR_Telugu, - UC_SCR_Kannada, - UC_SCR_Malayalam, - UC_SCR_Sinhala, - UC_SCR_Thai, - UC_SCR_Lao, - UC_SCR_Tibetan, - UC_SCR_Myanmar, - UC_SCR_Georgian, - UC_SCR_Hangul, - UC_SCR_Ethiopic, - UC_SCR_Cherokee, - UC_SCR_Canadian_Aboriginal, - UC_SCR_Ogham, - UC_SCR_Runic, - UC_SCR_Khmer, - UC_SCR_Mongolian, - UC_SCR_Hiragana, - UC_SCR_Katakana, - UC_SCR_Bopomofo, - UC_SCR_Han, - UC_SCR_Yi, - UC_SCR_Old_Italic, - UC_SCR_Gothic, - UC_SCR_Deseret, - UC_SCR_Inherited, - UC_SCR_Tagalog, - UC_SCR_Hanunoo, - UC_SCR_Buhid, - UC_SCR_Tagbanwa, - UC_SCR_Limbu, - UC_SCR_Tai_Le, - UC_SCR_Linear_B, - UC_SCR_Ugaritic, - UC_SCR_Shavian, - UC_SCR_Osmanya, - UC_SCR_Cypriot, - UC_SCR_Braille, - UC_SCR_Buginese, - UC_SCR_Coptic, - UC_SCR_New_Tai_Lue, - UC_SCR_Glagolitic, - UC_SCR_Tifinagh, - UC_SCR_Syloti_Nagri, - UC_SCR_Old_Persian, - UC_SCR_Kharoshthi, -}; - -enum unicode_block { - UC_BLK_No_Block = 0, - UC_BLK_Basic_Latin, - UC_BLK_Latin_1_Supplement, - UC_BLK_Latin_Extended_A, - UC_BLK_Latin_Extended_B, - UC_BLK_IPA_Extensions, - UC_BLK_Spacing_Modifier_Letters, - UC_BLK_Combining_Diacritical_Marks, - UC_BLK_Greek_and_Coptic, - UC_BLK_Cyrillic, - UC_BLK_Cyrillic_Supplement, - UC_BLK_Armenian, - UC_BLK_Hebrew, - UC_BLK_Arabic, - UC_BLK_Syriac, - UC_BLK_Arabic_Supplement, - UC_BLK_Thaana, - UC_BLK_Devanagari, - UC_BLK_Bengali, - UC_BLK_Gurmukhi, - UC_BLK_Gujarati, - UC_BLK_Oriya, - UC_BLK_Tamil, - UC_BLK_Telugu, - UC_BLK_Kannada, - UC_BLK_Malayalam, - UC_BLK_Sinhala, - UC_BLK_Thai, - UC_BLK_Lao, - UC_BLK_Tibetan, - UC_BLK_Myanmar, - UC_BLK_Georgian, - UC_BLK_Hangul_Jamo, - UC_BLK_Ethiopic, - UC_BLK_Ethiopic_Supplement, - UC_BLK_Cherokee, - UC_BLK_Unified_Canadian_Aboriginal_Syllabics, - UC_BLK_Ogham, - UC_BLK_Runic, - UC_BLK_Tagalog, - UC_BLK_Hanunoo, - UC_BLK_Buhid, - UC_BLK_Tagbanwa, - UC_BLK_Khmer, - UC_BLK_Mongolian, - UC_BLK_Limbu, - UC_BLK_Tai_Le, - UC_BLK_New_Tai_Lue, - UC_BLK_Khmer_Symbols, - UC_BLK_Buginese, - UC_BLK_Phonetic_Extensions, - UC_BLK_Phonetic_Extensions_Supplement, - UC_BLK_Combining_Diacritical_Marks_Supplement, - UC_BLK_Latin_Extended_Additional, - UC_BLK_Greek_Extended, - UC_BLK_General_Punctuation, - UC_BLK_Superscripts_and_Subscripts, - UC_BLK_Currency_Symbols, - UC_BLK_Combining_Diacritical_Marks_for_Symbols, - UC_BLK_Letterlike_Symbols, - UC_BLK_Number_Forms, - UC_BLK_Arrows, - UC_BLK_Mathematical_Operators, - UC_BLK_Miscellaneous_Technical, - UC_BLK_Control_Pictures, - UC_BLK_Optical_Character_Recognition, - UC_BLK_Enclosed_Alphanumerics, - UC_BLK_Box_Drawing, - UC_BLK_Block_Elements, - UC_BLK_Geometric_Shapes, - UC_BLK_Miscellaneous_Symbols, - UC_BLK_Dingbats, - UC_BLK_Miscellaneous_Mathematical_Symbols_A, - UC_BLK_Supplemental_Arrows_A, - UC_BLK_Braille_Patterns, - UC_BLK_Supplemental_Arrows_B, - UC_BLK_Miscellaneous_Mathematical_Symbols_B, - UC_BLK_Supplemental_Mathematical_Operators, - UC_BLK_Miscellaneous_Symbols_and_Arrows, - UC_BLK_Glagolitic, - UC_BLK_Coptic, - UC_BLK_Georgian_Supplement, - UC_BLK_Tifinagh, - UC_BLK_Ethiopic_Extended, - UC_BLK_Supplemental_Punctuation, - UC_BLK_CJK_Radicals_Supplement, - UC_BLK_Kangxi_Radicals, - UC_BLK_Ideographic_Description_Characters, - UC_BLK_CJK_Symbols_and_Punctuation, - UC_BLK_Hiragana, - UC_BLK_Katakana, - UC_BLK_Bopomofo, - UC_BLK_Hangul_Compatibility_Jamo, - UC_BLK_Kanbun, - UC_BLK_Bopomofo_Extended, - UC_BLK_CJK_Strokes, - UC_BLK_Katakana_Phonetic_Extensions, - UC_BLK_Enclosed_CJK_Letters_and_Months, - UC_BLK_CJK_Compatibility, - UC_BLK_CJK_Unified_Ideographs_Extension_A, - UC_BLK_Yijing_Hexagram_Symbols, - UC_BLK_CJK_Unified_Ideographs, - UC_BLK_Yi_Syllables, - UC_BLK_Yi_Radicals, - UC_BLK_Modifier_Tone_Letters, - UC_BLK_Syloti_Nagri, - UC_BLK_Hangul_Syllables, - UC_BLK_High_Surrogates, - UC_BLK_High_Private_Use_Surrogates, - UC_BLK_Low_Surrogates, - UC_BLK_Private_Use_Area, - UC_BLK_CJK_Compatibility_Ideographs, - UC_BLK_Alphabetic_Presentation_Forms, - UC_BLK_Arabic_Presentation_Forms_A, - UC_BLK_Variation_Selectors, - UC_BLK_Vertical_Forms, - UC_BLK_Combining_Half_Marks, - UC_BLK_CJK_Compatibility_Forms, - UC_BLK_Small_Form_Variants, - UC_BLK_Arabic_Presentation_Forms_B, - UC_BLK_Halfwidth_and_Fullwidth_Forms, - UC_BLK_Specials, - UC_BLK_Linear_B_Syllabary, - UC_BLK_Linear_B_Ideograms, - UC_BLK_Aegean_Numbers, - UC_BLK_Ancient_Greek_Numbers, - UC_BLK_Old_Italic, - UC_BLK_Gothic, - UC_BLK_Ugaritic, - UC_BLK_Old_Persian, - UC_BLK_Deseret, - UC_BLK_Shavian, - UC_BLK_Osmanya, - UC_BLK_Cypriot_Syllabary, - UC_BLK_Kharoshthi, - UC_BLK_Byzantine_Musical_Symbols, - UC_BLK_Musical_Symbols, - UC_BLK_Ancient_Greek_Musical_Notation, - UC_BLK_Tai_Xuan_Jing_Symbols, - UC_BLK_Mathematical_Alphanumeric_Symbols, - UC_BLK_CJK_Unified_Ideographs_Extension_B, - UC_BLK_CJK_Compatibility_Ideographs_Supplement, - UC_BLK_Tags, - UC_BLK_Variation_Selectors_Supplement, - UC_BLK_Supplementary_Private_Use_Area_A, - UC_BLK_Supplementary_Private_Use_Area_B, -}; - -#define UC_FLAG(x) (UINT64_C(1) << (x)) - -#define UC_FL_COMPOSITION_EXCLUSION UC_FLAG(0) -#define UC_FL_ALPHABETIC UC_FLAG(1) -#define UC_FL_DEFAULT_IGNORABLE_CODE_POINT UC_FLAG(2) -#define UC_FL_LOWERCASE UC_FLAG(3) -#define UC_FL_GRAPHEME_BASE UC_FLAG(4) -#define UC_FL_GRAPHEME_EXTEND UC_FLAG(5) -#define UC_FL_ID_START UC_FLAG(6) -#define UC_FL_ID_CONTINUE UC_FLAG(7) -#define UC_FL_MATH UC_FLAG(8) -#define UC_FL_UPPERCASE UC_FLAG(9) -#define UC_FL_XID_START UC_FLAG(10) -#define UC_FL_XID_CONTINUE UC_FLAG(11) -#define UC_FL_HEX_DIGIT UC_FLAG(12) -#define UC_FL_BIDI_CONTROL UC_FLAG(13) -#define UC_FL_DASH UC_FLAG(14) -#define UC_FL_DEPRECATED UC_FLAG(15) -#define UC_FL_DIACRITIC UC_FLAG(16) -#define UC_FL_EXTENDER UC_FLAG(17) -#define UC_FL_GRAPHEME_LINK UC_FLAG(18) -#define UC_FL_IDEOGRAPHIC UC_FLAG(19) -#define UC_FL_IDS_BINARY_OPERATOR UC_FLAG(20) -#define UC_FL_IDS_TRINARY_OPERATOR UC_FLAG(21) -#define UC_FL_JOIN_CONTROL UC_FLAG(22) -#define UC_FL_LOGICAL_ORDER_EXCEPTION UC_FLAG(23) -#define UC_FL_NONCHARACTER_CODE_POINT UC_FLAG(24) -#define UC_FL_PATTERN_SYNTAX UC_FLAG(25) -#define UC_FL_PATTERN_WHITE_SPACE UC_FLAG(26) -#define UC_FL_QUOTATION_MARK UC_FLAG(27) -#define UC_FL_RADICAL UC_FLAG(28) -#define UC_FL_SOFT_DOTTED UC_FLAG(29) -#define UC_FL_STERM UC_FLAG(30) -#define UC_FL_TERMINAL_PUNCTUATION UC_FLAG(31) -#define UC_FL_UNIFIED_IDEOGRAPH UC_FLAG(32) -#define UC_FL_VARIATION_SELECTOR UC_FLAG(33) -#define UC_FL_WHITE_SPACE UC_FLAG(34) -#define UC_FL_BIDI_MIRRORED UC_FLAG(35) - -struct unicode_character_data { - int32_t ucs; /* Actual codepoint */ - uint16_t size; /* Size of this structure */ - uint16_t alloc_size; /* Allocation size */ - uint64_t fl; /* Flags */ - const char *name; - const char *bidi_mirroring_glyph; - const char *uppercase_mapping; - const char *lowercase_mapping; - const char *titlecase_mapping; - int32_t simple_uppercase; - int32_t simple_lowercase; - int32_t simple_titlecase; - /* Numeric value = num/den * 10^exp */ - uint8_t numeric_value_num; - uint8_t numeric_value_den; - uint8_t numeric_value_exp; - uint8_t age_ma, age_mi; - enum unicode_general_category general_category; - enum unicode_block block; - enum unicode_script script; - enum unicode_joining_type joining_type; - enum unicode_joining_group joining_group; - enum unicode_east_asian_width east_asian_width; - enum unicode_hangul_syllable_type hangul_syllable_type; - enum unicode_numeric_type numeric_type; - enum unicode_combining_class combining_class; - enum unicode_bidi_class bidi_class; - enum unicode_grapheme_cluster_break grapheme_cluster_break; - enum unicode_sentence_break sentence_break; - enum unicode_word_break word_break; - enum unicode_line_break line_break; -}; - -struct unicode_character_data *unicode_character_data(int32_t); -struct unicode_character_data *unicode_character_get(struct unicode_character_data *); -void unicode_character_put(struct unicode_character_data *); -struct unicode_character_data *unicode_character_lookup(const char *); - -#endif /* UCD_H */ diff --git a/ucd.h.in b/ucd.h.in new file mode 100644 index 0000000..74bab3c --- /dev/null +++ b/ucd.h.in @@ -0,0 +1,125 @@ +/* -*- c -*- ------------------------------------------------------------- * + * + * Copyright 2005 H. Peter Anvin - All Rights Reserved + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall + * be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * ----------------------------------------------------------------------- */ + +/* + * ucd.h + * + * Simple interface to the Unicode Character Database + */ + +#ifndef UCD_H +#define UCD_H + +#include <stdint.h> + +#define LIBUCD_THREAD_SUPPORT + +#ifdef LIBUCD_THREAD_SUPPORT +# include <pthread.h> +#endif + +ENUMS; + +#define UC_FLAG(x) (UINT64_C(1) << (x)) + +#define UC_FL_COMPOSITION_EXCLUSION UC_FLAG(0) +#define UC_FL_ALPHABETIC UC_FLAG(1) +#define UC_FL_DEFAULT_IGNORABLE_CODE_POINT UC_FLAG(2) +#define UC_FL_LOWERCASE UC_FLAG(3) +#define UC_FL_GRAPHEME_BASE UC_FLAG(4) +#define UC_FL_GRAPHEME_EXTEND UC_FLAG(5) +#define UC_FL_ID_START UC_FLAG(6) +#define UC_FL_ID_CONTINUE UC_FLAG(7) +#define UC_FL_MATH UC_FLAG(8) +#define UC_FL_UPPERCASE UC_FLAG(9) +#define UC_FL_XID_START UC_FLAG(10) +#define UC_FL_XID_CONTINUE UC_FLAG(11) +#define UC_FL_HEX_DIGIT UC_FLAG(12) +#define UC_FL_BIDI_CONTROL UC_FLAG(13) +#define UC_FL_DASH UC_FLAG(14) +#define UC_FL_DEPRECATED UC_FLAG(15) +#define UC_FL_DIACRITIC UC_FLAG(16) +#define UC_FL_EXTENDER UC_FLAG(17) +#define UC_FL_GRAPHEME_LINK UC_FLAG(18) +#define UC_FL_IDEOGRAPHIC UC_FLAG(19) +#define UC_FL_IDS_BINARY_OPERATOR UC_FLAG(20) +#define UC_FL_IDS_TRINARY_OPERATOR UC_FLAG(21) +#define UC_FL_JOIN_CONTROL UC_FLAG(22) +#define UC_FL_LOGICAL_ORDER_EXCEPTION UC_FLAG(23) +#define UC_FL_NONCHARACTER_CODE_POINT UC_FLAG(24) +#define UC_FL_PATTERN_SYNTAX UC_FLAG(25) +#define UC_FL_PATTERN_WHITE_SPACE UC_FLAG(26) +#define UC_FL_QUOTATION_MARK UC_FLAG(27) +#define UC_FL_RADICAL UC_FLAG(28) +#define UC_FL_SOFT_DOTTED UC_FLAG(29) +#define UC_FL_STERM UC_FLAG(30) +#define UC_FL_TERMINAL_PUNCTUATION UC_FLAG(31) +#define UC_FL_UNIFIED_IDEOGRAPH UC_FLAG(32) +#define UC_FL_VARIATION_SELECTOR UC_FLAG(33) +#define UC_FL_WHITE_SPACE UC_FLAG(34) +#define UC_FL_BIDI_MIRRORED UC_FLAG(35) + +struct unicode_character_data { + int32_t ucs; /* Actual codepoint */ + uint16_t size; /* Size of this structure */ + uint16_t alloc_size; /* Allocation size */ + uint64_t fl; /* Flags */ + const char *name; + const char *bidi_mirroring_glyph; + const char *uppercase_mapping; + const char *lowercase_mapping; + const char *titlecase_mapping; + int32_t simple_uppercase; + int32_t simple_lowercase; + int32_t simple_titlecase; + /* Numeric value = num/den * 10^exp */ + uint8_t numeric_value_num; + uint8_t numeric_value_den; + uint8_t numeric_value_exp; + uint8_t age_ma, age_mi; + enum unicode_general_category general_category; + enum unicode_block block; + enum unicode_script script; + enum unicode_joining_type joining_type; + enum unicode_joining_group joining_group; + enum unicode_east_asian_width east_asian_width; + enum unicode_hangul_syllable_type hangul_syllable_type; + enum unicode_numeric_type numeric_type; + enum unicode_canonical_combining_class canonical_combining_class; + enum unicode_bidi_class bidi_class; + enum unicode_grapheme_cluster_break grapheme_cluster_break; + enum unicode_sentence_break sentence_break; + enum unicode_word_break word_break; + enum unicode_line_break line_break; +}; + +struct unicode_character_data *unicode_character_data(int32_t); +struct unicode_character_data *unicode_character_get(struct unicode_character_data *); +void unicode_character_put(struct unicode_character_data *); +struct unicode_character_data *unicode_character_lookup(const char *); + +#endif /* UCD_H */ diff --git a/ucslookup.c b/ucslookup.c index 3283b0b..3c6aa46 100644 --- a/ucslookup.c +++ b/ucslookup.c @@ -141,7 +141,7 @@ alloc_copy_properties(const struct _libucd_property_array *prop, ucd->east_asian_width = prop->east_asian_width; ucd->hangul_syllable_type = prop->hangul_syllable_type; ucd->numeric_type = prop->numeric_type; - ucd->combining_class = prop->combining_class; + ucd->canonical_combining_class = prop->canonical_combining_class; ucd->bidi_class = prop->bidi_class; ucd->grapheme_cluster_break = prop->grapheme_cluster_break; ucd->sentence_break = prop->sentence_break; |