aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@g5.osdl.org> 2005-10-02 17:47:28 -0700
committer Linus Torvalds <torvalds@g5.osdl.org> 2005-10-02 17:47:28 -0700
commit f313bcf64a798d1b461f10c51227c94dbcff1225 (patch)
tree 3ef3a3d9cef1932e69a9a06e86a2f48d5b1cc084
parent 0fc43a842945b46a3831620a8d42b54fffafb6a8 (diff)
download uemacs-f313bcf64a798d1b461f10c51227c94dbcff1225.tar.gz
Add support for a "utf-8" mode
NOTE! MicroEmacs is very much a byte-based editor, and the new utf-8 support is purely an issue of terminal input and output. The file contents themselves are in the 8-bit space. In that space, Unicode is the same as Latin1.

The new mode is called "utf-8", and is enabled automatically by the new emacs.rc when $LANG contains the substring "UTF-8".

I'm sure people would like to some day also edit real UTF-8 contents, rather than just edit old 8-bit Latin1 contents in a UTF-8 terminal. However, that's an independent (and much bigger and thornier) issue.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- emacs.rc 4
-rw-r--r-- estruct.h 3
-rw-r--r-- globals.c 6
-rw-r--r-- posix.c 95
4 files changed, 103 insertions, 5 deletions
diff --git a/emacs.rc b/emacs.rc
index 407d908..06c0f12 100644
--- a/emacs.rc
+++ b/emacs.rc
@@ -282,4 +282,8 @@ bind-to-key newline ^J
!endif
!endif
+!if &gre &sin $LANG "UTF-8" 0
+ add-global-mode "utf-8"
+!endif
+
set $discmd "TRUE"
diff --git a/estruct.h b/estruct.h
index ca9a800..9f8ff60 100644
--- a/estruct.h
+++ b/estruct.h
@@ -493,7 +493,7 @@ typedef struct BUFFER {
#define BFTRUNC 0x04 /* buffer was truncated when read */
/* mode flags */
-#define NUMMODES 9 /* # of defined modes */
+#define NUMMODES 10 /* # of defined modes */
#define MDWRAP 0x0001 /* word wrap */
#define MDCMOD 0x0002 /* C indentation and fence match */
@@ -504,6 +504,7 @@ typedef struct BUFFER {
#define MDMAGIC 0x0040 /* regular expresions in search */
#define MDCRYPT 0x0080 /* encrytion mode active */
#define MDASAVE 0x0100 /* auto-save mode */
+#define MDUTF8 0x0200 /* UTF-8 input/output mode */
/*
* The starting position of a region, and the size of the region in
diff --git a/globals.c b/globals.c
index 9c154ec..fb40822 100644
--- a/globals.c
+++ b/globals.c
@@ -13,13 +13,13 @@ int revexist = FALSE; /* does reverse video exist? */
int flickcode = FALSE; /* do flicker supression? */
char *modename[] = { /* name of modes */
"WRAP", "CMODE", "SPELL", "EXACT", "VIEW", "OVER",
- "MAGIC", "CRYPT", "ASAVE"
+ "MAGIC", "CRYPT", "ASAVE", "UTF-8"
};
char *mode2name[] = { /* name of modes */
"Wrap", "Cmode", "Spell", "Exact", "View", "Over",
- "Magic", "Crypt", "Asave"
+ "Magic", "Crypt", "Asave", "utf-8"
};
-char modecode[] = "WCSEVOMYA"; /* letters to represent modes */
+char modecode[] = "WCSEVOMYAU"; /* letters to represent modes */
int gmode = 0; /* global editor mode */
int gflags = GFREAD; /* global control flag */
#if PKCODE & IBMPC
diff --git a/posix.c b/posix.c
index cc09d03..c07e0ee 100644
--- a/posix.c
+++ b/posix.c
@@ -23,6 +23,20 @@
#include <fcntl.h>
#include <errno.h>
+/*
+ * NOTE NOTE NOTE!
+ *
+ * Uemacs is currently very much byte-oriented, and not at all UTF8-aware
+ * internally. However, this allows it to understand a _terminal_ that is
+ * in utf-8 mode, and will turn input into the 8-bit subset, and will turn
+ * things back into UTF8 on output.
+ *
+ * Do _not_ confuse this with the notion of actually being able to edit
+ * UTF-8 file _contents_. That's a totally different thing.
+ */
+#define utf8_mode() \
+ (curwp && curwp->w_bufp && (curwp->w_bufp->b_mode & MDUTF8))
+
static int kbdflgs; /* saved keyboard fd flags */
static int kbdpoll; /* in O_NDELAY mode */
@@ -99,6 +113,22 @@ void ttclose(void)
*/
int ttputc(int c)
{
+ /*
+ * We always represent things in 1 byte, but if we output
+ * in UTF-8, we may need to expand that into 2 bytes..
+ *
+ * Some day we might even be able to handle UTF-8 _content_.
+ *
+ * That day is not today.
+ */
+ if (utf8_mode()) {
+ c &= 0xff;
+ if (c >= 0x80) {
+ unsigned char first = (c >> 6) | 0xc0;
+ fputc(first, stdout);
+ c = (c & 0x3f) | 0x80;
+ }
+ }
fputc(c, stdout);
return (TRUE);
}
@@ -138,7 +168,70 @@ void ttflush(void)
*/
int ttgetc(void)
{
- return (255 & fgetc(stdin)); /* 8BIT P.K. */
+ static unsigned char pending;
+ unsigned char c, second;
+ int n;
+
+ if (pending) {
+ c = pending;
+ pending = 0;
+ return c;
+ }
+
+ n = read(0, &c, 1);
+ if (n != 1)
+ return 0;
+
+ if (!utf8_mode())
+ return c;
+
+ /* Normal 7-bit? */
+ if (!(c & 0x80))
+ return c;
+
+ /*
+ * Unexpected UTF-8 continuation character? Maybe
+ * we're in non-UTF mode, or maybe it's a control
+ * character.. Regardless, just pass it on.
+ */
+ if (!(c & 0x40))
+ return c;
+
+ /*
+ * Multi-byte sequences.. Right now we only
+ * want to get characters that can be represented
+ * in a single byte, so we're not interested in
+ * anything else..
+ */
+ if (c & 0x3c)
+ return c;
+
+ /*
+ * Two-byte sequence representing 0x80-0xff.. We want
+ * to do this read with a timeout.
+ */
+ ntermios.c_cc[VMIN] = 1;
+ ntermios.c_cc[VTIME] = 10; /* 1 second */
+ tcsetattr(0, TCSANOW, &ntermios);
+
+ n = read(0, &second, 1);
+
+ /* Undo timeout */
+ ntermios.c_cc[VTIME] = 0;
+ tcsetattr(0, TCSANOW, &ntermios);
+
+ if (n != 1)
+ return c;
+
+ if ((second & 0xc0) != 0x80) {
+ pending = second;
+ return c;
+ }
+
+ c = (c << 6) | (second & 0x3f);
+
+ /* Ok, real UTF-8 character */
+ return c;
}
/* typahead: Check to see if any characters are already in the