diff options
author | Linus Torvalds <torvalds@g5.osdl.org> | 2005-10-02 17:47:28 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2005-10-02 17:47:28 -0700 |
commit | f313bcf64a798d1b461f10c51227c94dbcff1225 (patch) | |
tree | 3ef3a3d9cef1932e69a9a06e86a2f48d5b1cc084 | |
parent | 0fc43a842945b46a3831620a8d42b54fffafb6a8 (diff) | |
download | uemacs-f313bcf64a798d1b461f10c51227c94dbcff1225.tar.gz |
Add support for a "utf-8" mode
NOTE! MicroEmacs is very much a byte-based editor, and the new utf-8
support is purely an issue of terminal input and output. The file
contents themselves are in the 8-bit space. In that space, Unicode is
the same as Latin1.
The new mode is called "utf-8", and is enabled automatically by the
new emacs.rc when $LANG contains the substring "UTF-8".
I'm sure people would like to some day also edit real UTF-8 contents,
rather than just edit old 8-bit Latin1 contents in a UTF-8 terminal.
However, that's an independent (and much bigger and thornier) issue.
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- | emacs.rc | 4 |
-rw-r--r-- | estruct.h | 3 |
-rw-r--r-- | globals.c | 6 |
-rw-r--r-- | posix.c | 95 |
4 files changed, 103 insertions, 5 deletions
diff --git a/emacs.rc b/emacs.rc
--- a/emacs.rc
+++ b/emacs.rc
@@ -282,4 +282,8 @@ bind-to-key newline ^J
 !endif
 !endif
 
+!if &gre &sin $LANG "UTF-8" 0
+	add-global-mode "utf-8"
+!endif
+
 set $discmd "TRUE"
diff --git a/estruct.h b/estruct.h
--- a/estruct.h
+++ b/estruct.h
@@ -493,7 +493,7 @@ typedef struct BUFFER {
 #define BFTRUNC 0x04 /* buffer was truncated when read */
 
 /* mode flags */
-#define NUMMODES 9 /* # of defined modes */
+#define NUMMODES 10 /* # of defined modes */
 
 #define MDWRAP 0x0001 /* word wrap */
 #define MDCMOD 0x0002 /* C indentation and fence match */
@@ -504,6 +504,7 @@ typedef struct BUFFER {
 #define MDMAGIC 0x0040 /* regular expresions in search */
 #define MDCRYPT 0x0080 /* encrytion mode active */
 #define MDASAVE 0x0100 /* auto-save mode */
+#define MDUTF8 0x0200 /* UTF-8 input/output mode */
 
 /*
  * The starting position of a region, and the size of the region in
diff --git a/globals.c b/globals.c
--- a/globals.c
+++ b/globals.c
@@ -13,13 +13,13 @@ int revexist = FALSE; /* does reverse video exist? */
 int flickcode = FALSE; /* do flicker supression? */
 char *modename[] = { /* name of modes */
 	"WRAP", "CMODE", "SPELL", "EXACT", "VIEW", "OVER",
-	"MAGIC", "CRYPT", "ASAVE"
+	"MAGIC", "CRYPT", "ASAVE", "UTF-8"
 };
 char *mode2name[] = { /* name of modes */
 	"Wrap", "Cmode", "Spell", "Exact", "View", "Over",
-	"Magic", "Crypt", "Asave"
+	"Magic", "Crypt", "Asave", "utf-8"
 };
-char modecode[] = "WCSEVOMYA"; /* letters to represent modes */
+char modecode[] = "WCSEVOMYAU"; /* letters to represent modes */
 int gmode = 0; /* global editor mode */
 int gflags = GFREAD; /* global control flag */
 #if PKCODE & IBMPC
diff --git a/posix.c b/posix.c
--- a/posix.c
+++ b/posix.c
@@ -23,6 +23,20 @@
 #include <fcntl.h>
 #include <errno.h>
 
+/*
+ * NOTE NOTE NOTE!
+ *
+ * Uemacs is currently very much byte-oriented, and not at all UTF8-aware
+ * interally. However, this allows it to understand a _terminal_ that is
+ * in utf-8 mode, and will turn input into the 8-bit subset, and will turn
+ * things back into UTF8 on output.
+ *
+ * Do _not_ confuse this with the notion of actually being able to edit
+ * UTF-8 file _contents_. That's a totally different thing.
+ */
+#define utf8_mode() \
+	(curwp && curwp->w_bufp && (curwp->w_bufp->b_mode & MDUTF8))
+
 static int kbdflgs; /* saved keyboard fd flags */
 static int kbdpoll; /* in O_NDELAY mode */
 
@@ -99,6 +113,22 @@ void ttclose(void)
  */
 int ttputc(int c)
 {
+	/*
+	 * We always represent things in 1 byte, but if we output
+	 * in UTF-8, we may need to expand that into 2 bytes..
+	 *
+	 * Some day we might even be able to handle UTF-8 _content_.
+	 *
+	 * That day is not today.
+	 */
+	if (utf8_mode()) {
+		c &= 0xff;
+		if (c >= 0x80) {
+			unsigned char first = (c >> 6) | 0xc0;
+			fputc(first, stdout);
+			c = (c & 0x3f) | 0x80;
+		}
+	}
 	fputc(c, stdout);
 	return (TRUE);
 }
@@ -138,7 +168,70 @@ void ttflush(void)
  */
 int ttgetc(void)
 {
-	return (255 & fgetc(stdin)); /* 8BIT P.K. */
+	static unsigned char pending;
+	unsigned char c, second;
+	int n;
+
+	if (pending) {
+		c = pending;
+		pending = 0;
+		return c;
+	}
+
+	n = read(0, &c, 1);
+	if (n != 1)
+		return 0;
+
+	if (!utf8_mode())
+		return c;
+
+	/* Normal 7-bit? */
+	if (!(c & 0x80))
+		return c;
+
+	/*
+	 * Unexpected UTF-8 continuation character? Maybe
+	 * we're in non-UTF mode, or maybe it's a control
+	 * character.. Regardless, just pass it on.
+	 */
+	if (!(c & 0x40))
+		return c;
+
+	/*
+	 * Multi-byte sequences.. Right now we only
+	 * want to get characters that can be represented
+	 * in a single byte, so we're not interested in
+	 * anything else..
+	 */
+	if (c & 0x3c)
+		return c;
+
+	/*
+	 * Two-byte sequence representing 0x80-0xff.. We want
+	 * to do this read with a timeout.
+	 */
+	ntermios.c_cc[VMIN] = 1;
+	ntermios.c_cc[VTIME] = 10; /* 1 second */
+	tcsetattr(0, TCSANOW, &ntermios);
+
+	n = read(0, &second, 1);
+
+	/* Undo timeout */
+	ntermios.c_cc[VTIME] = 0;
+	tcsetattr(0, TCSANOW, &ntermios);
+
+	if (n != 1)
+		return c;
+
+	if ((second & 0xc0) != 0x80) {
+		pending = second;
+		return c;
+	}
+
+	c = (c << 6) | (second & 0x3f);
+
+	/* Ok, real UTF-8 character */
+	return c;
 }
 
 /* typahead: Check to see if any characters are already in the