aboutsummaryrefslogtreecommitdiffstats
path: root/libucd.3
blob: 7743ea8e5a85e298d85987224ae8176e4b5fb11b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
.\" -----------------------------------------------------------------------
.\"   
.\"   Copyright 2005 H. Peter Anvin - All Rights Reserved
.\"
.\"   Permission is hereby granted, free of charge, to any person
.\"   obtaining a copy of this software and associated documentation
.\"   files (the "Software"), to deal in the Software without
.\"   restriction, including without limitation the rights to use,
.\"   copy, modify, merge, publish, distribute, sublicense, and/or
.\"   sell copies of the Software, and to permit persons to whom
.\"   the Software is furnished to do so, subject to the following
.\"   conditions:
.\"   
.\"   The above copyright notice and this permission notice shall
.\"   be included in all copies or substantial portions of the Software.
.\"   
.\"   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
.\"   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
.\"   OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
.\"   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
.\"   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
.\"   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
.\"   FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
.\"   OTHER DEALINGS IN THE SOFTWARE.
.\"
.\" -----------------------------------------------------------------------
.TH LIBUCD 3 2005-12-29 libucd ""
.SH NAME
unicode_character_data, unicode_character_lookup, unicode_character_get,
unicode_character_put, unicode_database_version \-
query the Unicode Character Database
.SH SYNOPSIS
.nf
.B #include <ucd.h>
.sp
.B const struct unicode_character_data *unicode_character_data(int32_t);
.sp
.B const struct unicode_character_data *unicode_character_lookup(const char *);
.sp
.B const struct unicode_character_data *unicode_character_get(const struct unicode_character_data *);
.sp
.B void unicode_character_put(const struct unicode_character_data *);
.sp
.B uint32_t unicode_database_version();
.fi
.SH DESCRIPTION
These functions query the Unicode Character Database for attribute
information about a certain character.  Attribute information is
represented by a pointer to a reference-counted structure.  Any
internal pointers in the structure should be assumed to point to
memory protected by the same reference count.
.PP
\fBunicode_character_data()\fP returns the attributes for a character
specified by Unicode index.  If the argument is outside the Unicode
range (0 to 0x10ffff), it returns NULL with \fIerrno\fP set to EINVAL.
.PP
\fBunicode_character_lookup()\fP searches for a character by Unicode
name (e.g. "LATIN CAPITAL LETTER A").  If no character by that name is
known, it returns NULL with \fIerrno\fP set to EINVAL.  Only exact
matches are returned; if case-insensitive matching is desired, the
string should be converted to upper case first.
.PP
\fBunicode_character_get()\fP creates a new reference to an attribute
structure, and returns the new reference.  This pointer may or may not
be a pointer to the original structure.
.PP
\fBunicode_character_put()\fP unreferences an attribute structure and
frees it if appropriate.
.PP
\fBunicode_database_version()\fP returns the version of the Unicode
database from which the library was generated, in the format
(major << 16)+(minor << 8)+(subminor).
.PP
The \fIunicode_character_data\fP structure is defined in <ucd.h> and
contains at least the following fields:
.sp
.RS
.nf
.ne 4
.ta 0n 4n 44n
struct unicode_character_data {
	int32_t					ucs;
	uint16_t				size;
	uint64_t				fl;
	const char				*name;
	int32_t					simple_uppercase;
	int32_t					simple_lowercase;
	int32_t					simple_titlecase;
	uint8_t					numeric_value_num;
	uint8_t					numeric_value_den;
	uint8_t					numeric_value_exp;
	uint8_t					age_ma, age_mi;
	enum unicode_general_category		general_category;
	enum unicode_block			block;
	enum unicode_script			script;
	enum unicode_joining_type		joining_type;
	enum unicode_joining_group		joining_group;
	enum unicode_east_asian_width		east_asian_width;
	enum unicode_hangul_syllable_type	hangul_syllable_type;
	enum unicode_numeric_type		numeric_type;
	enum unicode_canonical_combining_class	canonical_combining_class;
	enum unicode_bidi_class			bidi_class;
	enum unicode_grapheme_cluster_break	grapheme_cluster_break;
	enum unicode_sentence_break		sentence_break;
	enum unicode_word_break			word_break;
	enum unicode_line_break			line_break;
};
.ta
.fi
.RE
.PP
The members of the \fIunicode_character_data\fP structure are:
.TP
.B ucs
The Unicode index of the character.
.TP
.B size
The size of the structure, in bytes.  This can be used to determine
the availability of a specific field if one is added in future
versions.
.TP
.B fl
A bitwise combination of flags (the UC_FL_* constants), defined in <ucd.h>.
.TP
.B name
The Unicode name of the character.
.TP
.B bidi_mirroring_glyph
The Unicode string which corresponds to the mirror image of this
character.  \fINot yet implemented.\fP
.TP
.B simple_uppercase
The simple (single codepoint) uppercase mapping for this character.
.TP
.B simple_lowercase
The simple (single codepoint) lowercase mapping for this character.
.TP
.B simple_titlecase
The simple (single codepoint) titlecase mapping for this character.
.TP
.B numeric_value_num
.TP
.B numeric_value_den
.TP
.B numeric_value_exp
For a number, the numeric value is given as num/den * 10^exp.
.TP
.B age_ma
.TP
.B age_mi
Major and minor Unicode version when this character was introduced.
If this is a vacant codepoint, this has the value 0.0.
.PP
All enumerations are properties defined in the Unicode standard.  Most
Unicode properties have both a long and a short form.  The
corresponding strings can be obtained by calling the function
.sp
.B int unicode_property_names_\fIproperty\fP(enum
unicode_\fIproperty\fP \fIvalue\fP, const char **\fIlongname\fP, const char **\fIshortname\fP);
.sp
where the first argument is the enumeration value, and the \fIlongname\fP
and \fIshortname\fP arguments return pointers to the respective strings.
.SH "RETURN VALUE"
\fBunicode_character_data()\fP, \fBunicode_character_lookup()\fP, and
\fBunicode_character_get()\fP return an attribute structure pointer on
success, or NULL on failure.  In the case of failure, \fIerrno\fP is
set to the appropriate error value (in the current implementation,
either EINVAL or ENOMEM).
.PP
\fBunicode_database_version()\fP returns the version of the underlying
Unicode database, in the format (major << 16)+(minor << 8)+(subminor).
.PP
The \fBunicode_property_names\fP functions return zero on success, or
nonzero if the enumeration value was out of range.
.SH "BUGS"
The fields related to bidirectional mirroring and non-simple case
mappings are not yet populated.
.PP
There is no interface to the Unihan database.  This perhaps should be
a separate library.
.SH "SEE ALSO"
The Unicode Standard,
.IR http://www.unicode.org/ .