aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author H. Peter Anvin <hpa@smyrno.hos.anvin.org> 2005-12-04 18:36:27 -0800
committer H. Peter Anvin <hpa@smyrno.hos.anvin.org> 2005-12-04 18:36:27 -0800
commit 2d1f849dd782091cdea13864be00a3a8fef0f6bc (patch)
tree 4d5d7d41c7eb768a76c343bfb2936a1e980fac33
parent 6d4ddcfd62b997bb8454e521c635c47a6a848816 (diff)
download libucd-2d1f849dd782091cdea13864be00a3a8fef0f6bc.tar.gz
Actual ucs-to-properties function
-rw-r--r-- ucslookup.c 210
1 files changed, 210 insertions, 0 deletions
diff --git a/ucslookup.c b/ucslookup.c
new file mode 100644
index 0000000..c38e162
--- /dev/null
+++ b/ucslookup.c
@@ -0,0 +1,210 @@
+#include <errno.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "libucd_int.h"
+#include "gen/ucstoname_hash.h"
+
+/*
+ * Expand the compressed name stored at nameslist_ptr into buf as a
+ * NUL-terminated string.
+ *
+ * The first byte at nameslist_ptr is the length of the expanded name.
+ * Each following byte is an index into _libucd_nameslist_dict[]; the
+ * selected dictionary strings are concatenated until that many
+ * characters have been written (the last string is cut short if it
+ * would overshoot).
+ *
+ * Nothing is returned and no buffer size is taken, so the caller must
+ * supply at least (length byte + 1) bytes at buf.
+ * This is used with names that *are* found in the hash table only.
+ */
+static void
+libucd_mkname(char *buf, const unsigned char *nameslist_ptr)
+{
+  const unsigned char *p = nameslist_ptr;
+  char *q = buf;
+  const char *pp;               /* Walks the current dictionary string */
+  char cc;
+  int n = *p++;                 /* Characters remaining */
+
+  while ( n ) {
+    pp = _libucd_nameslist_dict[*p++];
+    while ( (cc = *pp++) ) {
+      *q++ = cc;
+      if ( --n == 0 )
+        break;                  /* Name complete; drop the rest of this word */
+    }
+  }
+
+  *q = '\0';
+}
+
+/*
+ * Build the formal name of a precomposed Hangul syllable.
+ *
+ * Formats "HANGUL SYLLABLE <L><V><T>" into buf, writing at most n bytes
+ * as snprintf() does, using the three jamo name tables.  The value
+ * returned is whatever snprintf() returned, or 0 (with buf untouched)
+ * when codepoint lies outside the Hangul syllable range.
+ */
+static size_t
+hangul_name(char *buf, size_t n, int32_t codepoint)
+{
+  /* Constants from the Unicode Standard, version 4.1, section 3.12 */
+  enum {
+    HANGUL_BASE     = 0xAC00,
+    LEAD_COUNT      = 19,
+    VOWEL_COUNT     = 21,
+    TRAIL_COUNT     = 28,
+    PER_LEAD        = VOWEL_COUNT * TRAIL_COUNT,  /* 588 */
+    SYLLABLE_COUNT  = PER_LEAD * LEAD_COUNT       /* 11172 */
+  };
+
+  int32_t offset = codepoint - HANGUL_BASE;
+  int32_t lead, vowel, trail, within_lead;
+
+  if ( offset < 0 || offset >= SYLLABLE_COUNT )
+    return 0;
+
+  /* Split the syllable index into leading consonant, vowel, trailing */
+  lead        = offset / PER_LEAD;
+  within_lead = offset % PER_LEAD;
+  vowel       = within_lead / TRAIL_COUNT;
+  trail       = within_lead % TRAIL_COUNT;
+
+  return snprintf(buf, n, "HANGUL SYLLABLE %s%s%s",
+                  libucd_hangul_jamo_l[lead],
+                  libucd_hangul_jamo_v[vowel],
+                  libucd_hangul_jamo_t[trail]);
+}
+
+/*
+ * Binary search of the properties array (for non-hashed characters).
+ *
+ * Each entry of _libucd_property_array[] is treated as covering the
+ * code points from its own .ucs up to, but not including, the .ucs of
+ * the entry that follows it.  The entry whose range contains ucs is
+ * returned.
+ *
+ * NOTE(review): there is no not-found exit, and pa[1] is read one past
+ * the candidate entry.  This presumably relies on the table starting at
+ * code point 0 and ending with a sentinel entry -- confirm against the
+ * table generator.
+ */
+static const struct _libucd_property_array *
+search_prop_array(int32_t ucs)
+{
+  int l, h, m;
+  const struct _libucd_property_array *pa;
+
+  l = 0;
+  h = _libucd_property_array_count-1;
+
+  for (;;) {
+    /*
+     * True midpoint of [l, h].  Halving the width (rather than
+     * quartering the sum) keeps m inside the interval, which the
+     * search needs in order to converge.
+     */
+    m = l + ((h - l) >> 1);
+    pa = &_libucd_property_array[m];
+    if ( ucs >= pa[0].ucs ) {
+      if ( ucs < pa[1].ucs )
+        return pa;              /* pa[0].ucs <= ucs < pa[1].ucs */
+      else
+        l = m+1;
+    } else {
+      h = m-1;
+    }
+  }
+}
+
+/*
+ * Allocate a unicode_character_data record and populate it from one
+ * property-array entry.
+ *
+ * The record and a name buffer of namelen+1 bytes come from a single
+ * malloc(); ucd->name is pointed at the bytes directly after the
+ * structure and is left for the caller to fill in.  Returns NULL when
+ * malloc() fails.
+ */
+static struct unicode_character_data *
+alloc_copy_properties(const struct _libucd_property_array *prop,
+                      int32_t ucs, size_t namelen)
+{
+  struct unicode_character_data *ucd = malloc(sizeof *ucd + namelen + 1);
+
+  if ( ucd == NULL )
+    return NULL;
+
+  /* Identity; the name storage trails the structure itself */
+  ucd->ucs  = ucs;
+  ucd->name = (char *)(ucd+1);
+
+  /* flags_block packs the flags in bits 0-47 and the block above them */
+  ucd->fl    = prop->flags_block & UINT64_C(0xffffffffffff);
+  ucd->block = (prop->flags_block >> 48) & 0xff;
+
+  /* Pointer-valued mappings: NYS, left NULL */
+  ucd->bidi_mirroring_glyph = NULL;
+  ucd->uppercase_mapping    = NULL;
+  ucd->lowercase_mapping    = NULL;
+  ucd->titlecase_mapping    = NULL;
+
+  /* Simple case mappings are stored as 24-bit offsets from ucs */
+  ucd->simple_uppercase = ucs + getint24(prop->simple_uppercase);
+  ucd->simple_lowercase = ucs + getint24(prop->simple_lowercase);
+  ucd->simple_titlecase = ucs + getint24(prop->simple_titlecase);
+
+  /*
+   * numeric_value_den_exp holds either an exponent (top bit set, low
+   * seven bits are the value) or a denominator (top bit clear).
+   * NOTE(review): the exponent defaults to 1 when none is encoded --
+   * confirm that this, not 0, is the intended "no exponent" value.
+   */
+  ucd->numeric_value_num = prop->numeric_value_num;
+  ucd->numeric_value_exp = (prop->numeric_value_den_exp & 0x80)
+    ? (prop->numeric_value_den_exp & 0x7f) : 1;
+  ucd->numeric_value_den = (prop->numeric_value_den_exp & 0x80)
+    ? 1 : prop->numeric_value_den_exp;
+
+  /* age packs the major version above a 3-bit minor version */
+  ucd->age_ma = prop->age >> 3;
+  ucd->age_mi = prop->age & 7;
+
+  /* Everything else is copied across unchanged */
+  ucd->general_category       = prop->general_category;
+  ucd->script                 = prop->script;
+  ucd->joining_type           = prop->joining_type;
+  ucd->joining_group          = prop->joining_group;
+  ucd->east_asian_width       = prop->east_asian_width;
+  ucd->hangul_syllable_type   = prop->hangul_syllable_type;
+  ucd->numeric_type           = prop->numeric_type;
+  ucd->combining_class        = prop->combining_class;
+  ucd->bidi_class             = prop->bidi_class;
+  ucd->grapheme_cluster_break = prop->grapheme_cluster_break;
+  ucd->sentence_break         = prop->sentence_break;
+  ucd->word_break             = prop->word_break;
+  ucd->line_break             = prop->line_break;
+
+  return ucd;
+}
+
+/*
+ * Standard entry point for the user.
+ *
+ * Looks up the code point ucs and returns its properties in a record
+ * obtained from alloc_copy_properties() (one malloc() covering the
+ * record and its name).
+ *
+ * The name is taken from the names list when the hash table contains
+ * ucs.  Otherwise the properties come from a binary search of the
+ * property array, and the name is synthesized for Hangul syllables and
+ * unified ideographs or left NULL for any other character.
+ *
+ * Returns NULL with errno set to EINVAL if ucs is negative or greater
+ * than UCS_MAX, and NULL if the allocation fails.
+ */
+struct unicode_character_data *
+unicode_character_data(int32_t ucs)
+{
+  uint32_t hash;
+  const struct _libucd_ucstoname_tab *unt;
+  const struct _libucd_property_array *prop;
+  size_t namelen;
+  struct unicode_character_data *ucd;
+
+  if ( ucs < 0 || ucs > UCS_MAX ) {
+    errno = EINVAL;
+    return NULL;                /* Invalid UCS value */
+  }
+
+  hash = _libucd_ucstoname_hash(ucs);
+
+  if ( hash >= PHASHNKEYS ) {
+    unt = NULL;
+  } else {
+    /* The slot only counts as a hit if it really holds this code point */
+    unt = &_libucd_ucstoname_tab[hash];
+    if ( getint24(unt->ucs) != ucs )
+      unt = NULL;
+  }
+
+  if ( unt ) {
+    /* Named character: the first byte of the names-list entry is the length */
+    const unsigned char *nameptr =
+      &_libucd_names_list[getuint24(unt->names_offset)];
+    prop = &_libucd_property_array[unt->proparray_offset];
+    namelen = *nameptr;
+
+    ucd = alloc_copy_properties(prop, ucs, namelen);
+    if ( !ucd )
+      return NULL;
+    libucd_mkname((char *)ucd->name, nameptr);
+  } else {
+    prop = search_prop_array(ucs);
+
+    if ( ucs >= 0xAC00 && ucs < 0xAC00+19*21*28 ) {
+      /* Hangul syllable: measure the name first, then format it in place */
+      namelen = hangul_name(NULL, 0, ucs);
+      ucd = alloc_copy_properties(prop, ucs, namelen);
+      if ( !ucd )
+        return NULL;
+      hangul_name((char *)ucd->name, namelen+1, ucs);
+    } else if ( prop->flags_block & UC_FL_UNIFIED_IDEOGRAPH ) {
+      /* "CJK UNIFIED IDEOGRAPH-XXXX[X]": 22 characters plus 4 or 5 hex digits */
+      namelen = (ucs > 0xffff) ? 27 : 26;
+      ucd = alloc_copy_properties(prop, ucs, namelen);
+      if ( !ucd )
+        return NULL;
+      /* Format the code point itself; ucs is known to be non-negative here */
+      snprintf((char *)ucd->name, namelen+1, "CJK UNIFIED IDEOGRAPH-%04X",
+               (unsigned int)ucs);
+    } else {
+      /* Unnamed character: no name text is needed, so request none */
+      ucd = alloc_copy_properties(prop, ucs, 0);
+      if ( !ucd )
+        return NULL;
+      ucd->name = NULL;
+    }
+  }
+
+  return ucd;
+}
+
+