ChangeSet 1.1290.15.18, 2004/03/03 10:04:33-08:00, david-b@pacbell.net

[PATCH] USB Gadget: make usb gadget strings talk utf-8

Teach gadget/usbstring to expect UTF-8 strings, not ISO-8859/1 ones.

This just gets rid of an API issue:  no hacks needed for non-Western
languages, and multi-language support will be lots easier.

Current drivers won't notice the API change, they use US-ASCII
(which is a strict subset of both encodings).

Future drivers may want to teach utf8_to_utf16le() about the
four-byte encodings, so they can emit surrogate pairs for those
Unicode characters.


 drivers/usb/gadget/usbstring.c |   92 ++++++++++++++++++++++++++++++++++-------
 include/linux/usb_gadget.h     |    2 
 2 files changed, 79 insertions(+), 15 deletions(-)


diff -Nru a/drivers/usb/gadget/usbstring.c b/drivers/usb/gadget/usbstring.c
--- a/drivers/usb/gadget/usbstring.c	Wed Mar 17 15:48:29 2004
+++ b/drivers/usb/gadget/usbstring.c	Wed Mar 17 15:48:29 2004
@@ -16,24 +16,89 @@
 #include <linux/usb_ch9.h>
 #include <linux/usb_gadget.h>
 
+#include <asm/unaligned.h>
+
+
+static int utf8_to_utf16le(const char *s, u16 *cp, unsigned len)
+{
+	int	count = 0;
+	u8	c;
+	u16	uchar;
+
+	/* this insists on correct encodings, though not minimal ones.
+	 * BUT it currently rejects legit 4-byte UTF-8 code points,
+	 * which need surrogate pairs.  (Unicode 3.1 can use them.)
+	 */
+	while (len != 0 && (c = (u8) *s++) != 0) {
+		if (unlikely(c & 0x80)) {
+			// 2-byte sequence:
+			// 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
+			if ((c & 0xe0) == 0xc0) {
+				uchar = (c & 0x1f) << 6;
+
+				c = (u8) *s++;
+				if ((c & 0xc0) != 0x80)	/* need 10xxxxxx */
+					goto fail;
+				c &= 0x3f;
+				uchar |= c;
+
+			// 3-byte sequence (most CJKV characters):
+			// zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
+			} else if ((c & 0xf0) == 0xe0) {
+				uchar = (c & 0x0f) << 12;
+
+				c = (u8) *s++;
+				if ((c & 0xc0) != 0x80)
+					goto fail;
+				c &= 0x3f;
+				uchar |= c << 6;
+
+				c = (u8) *s++;
+				if ((c & 0xc0) != 0x80)
+					goto fail;
+				c &= 0x3f;
+				uchar |= c;
+
+				/* no bogus surrogates */
+				if (0xd800 <= uchar && uchar <= 0xdfff)
+					goto fail;
+
+			// 4-byte sequence (surrogate pairs, currently rare):
+			// 11101110wwwwzzzzyy + 110111yyyyxxxxxx
+			//     = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
+			// (uuuuu = wwww + 1)
+			// FIXME accept the surrogate code points (only)
+
+			} else
+				goto fail;
+		} else
+			uchar = c;
+		put_unaligned (cpu_to_le16 (uchar), cp++);
+		count++;
+		len--;
+	}
+	return count;
+fail:
+	return -1;
+}
+
 
 /**
  * usb_gadget_get_string - fill out a string descriptor
- * @table: of c strings using iso latin/1 characters
+ * @table: of c strings encoded using UTF-8
  * @id: string id, from low byte of wValue in get string descriptor
  * @buf: at least 256 bytes
  *
- * Finds the iso latin/1 string matching the ID, and converts it into a
+ * Finds the UTF-8 string matching the ID, and converts it into a
  * string descriptor in utf16-le.
  * Returns length of descriptor (always even) or negative errno
  *
- * If your driver needs stings in multiple languages, you'll need to
- * to use some alternate solution for languages where the ISO 8859/1
- * (latin/1) character set can't be used.  For example, they can't be
- * used with Chinese (Big5, GB2312, etc), Japanese, Korean, or many other
- * languages.  You'd likely "switch (wIndex) { ... }" in your ep0
- * string descriptor logic, using this routine in cases where "western
- * european" characters suffice for the strings being returned.
+ * If your driver needs strings in multiple languages, you'll probably
+ * "switch (wIndex) { ... }" in your ep0 string descriptor logic,
+ * using this routine after choosing which set of UTF-8 strings to use.
+ * Note that US-ASCII is a strict subset of UTF-8; any string bytes with
+ * the eighth bit set will be multibyte UTF-8 characters, not ISO-8859/1
+ * characters (which are also widely used in C strings).
  */
 int
 usb_gadget_get_string (struct usb_gadget_strings *table, int id, u8 *buf)
@@ -59,13 +124,12 @@
 
 	/* string descriptors have length, tag, then UTF16-LE text */
 	len = min ((size_t) 126, strlen (s->s));
+	memset (buf + 2, 0, 2 * len);	/* zero all the bytes */
+	len = utf8_to_utf16le(s->s, (u16 *)&buf[2], len);
+	if (len < 0)
+		return -EINVAL;
 	buf [0] = (len + 1) * 2;
 	buf [1] = USB_DT_STRING;
-	memset (buf + 2, 0, 2 * len);	/* zero all the high bytes */
-	while (len) {
-		buf [2 * len] = s->s [len - 1];
-		len--;
-	}
 	return buf [0];
 }
 
diff -Nru a/include/linux/usb_gadget.h b/include/linux/usb_gadget.h
--- a/include/linux/usb_gadget.h	Wed Mar 17 15:48:29 2004
+++ b/include/linux/usb_gadget.h	Wed Mar 17 15:48:29 2004
@@ -707,7 +707,7 @@
 /**
  * struct usb_string - wraps a C string and its USB id
  * @id:the (nonzero) ID for this string
- * @s:the string, in ISO-8859/1 characters
+ * @s:the string, in UTF-8 encoding
  *
  * If you're using usb_gadget_get_string(), use this to wrap a string
  * together with its ID.