Make cursor movement (largely) understand UTF-8 character boundaries

Ok, so it may do odd things if the text is not truly UTF-8, and when moving up and down across lines that contain UTF-8 the cursor moves oddly (because the byte offset within the line stays constant, rather than the character offset). But with this you can actually open the UTF-8 example file and move around in it, and at least some of the movement makes sense.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-10 16:40:36 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-10 16:40:36 -0700
commit: 6b793211c2aec69115dd2769892be0524801f7d8 (patch)
tree: 4a9cab1b1a97c7c66ab90353ad6aab56abad30e0
parent: e62cdf04cff63381121364cd6ef077f00d72307a (diff)
download: uemacs-6b793211c2aec69115dd2769892be0524801f7d8.tar.gz
3 files changed, 31 insertions, 11 deletions
diff --git a/basic.c b/basic.c
index 3bf0227..3a7d6f7 100644
--- a/basic.c
+++ b/basic.c
@@ -15,6 +15,7 @@
 #include "edef.h"
 #include "efunc.h"
 #include "line.h"
+#include "utf8.h"
 
 /*
  * This routine, given a pointer to a struct line, and the current cursor goal
@@ -74,8 +75,15 @@ int backchar(int f, int n)
 			curwp->w_dotp = lp;
 			curwp->w_doto = llength(lp);
 			curwp->w_flag |= WFMOVE;
-		} else
-			curwp->w_doto--;
+		} else {
+			do {
+				unsigned char c;
+				curwp->w_doto--;
+				c = lgetc(curwp->w_dotp, curwp->w_doto);
+				if (is_beginning_utf8(c))
+					break;
+			} while (curwp->w_doto);
+		}
 	}
 	return TRUE;
 }
@@ -100,14 +108,22 @@ int forwchar(int f, int n)
 	if (n < 0)
 		return backchar(f, -n);
 	while (n--) {
-		if (curwp->w_doto == llength(curwp->w_dotp)) {
+		int len = llength(curwp->w_dotp);
+		if (curwp->w_doto == len) {
 			if (curwp->w_dotp == curbp->b_linep)
 				return FALSE;
 			curwp->w_dotp = lforw(curwp->w_dotp);
 			curwp->w_doto = 0;
 			curwp->w_flag |= WFMOVE;
-		} else
-			curwp->w_doto++;
+		} else {
+			do {
+				unsigned char c;
+				curwp->w_doto++;
+				c = lgetc(curwp->w_dotp, curwp->w_doto);
+				if (is_beginning_utf8(c))
+					break;
+			} while (curwp->w_doto < len);
+		}
 	}
 	return TRUE;
 }
diff --git a/display.c b/display.c
index 82b4f84..676514d 100644
--- a/display.c
+++ b/display.c
@@ -528,7 +528,6 @@ static void updall(struct window *wp)
 void updpos(void)
 {
 	struct line *lp;
-	int c;
 	int i;
 
 	/* find the current row */
@@ -543,13 +542,13 @@ void updpos(void)
 	curcol = 0;
 	i = 0;
 	while (i < curwp->w_doto) {
-		c = lgetc(lp, i++);
+		unicode_t c;
+		int bytes;
+
+		bytes = utf8_to_unicode(lp->l_text, i, curwp->w_doto, &c);
+		i += bytes;
 		if (c == '\t')
 			curcol |= tabmask;
-		else if (c < 0x20 || c == 0x7f)
-			++curcol;
-		else if (c >= 0x80 && c <= 0xa0)
-			curcol+=2;
 
 		++curcol;
 	}
diff --git a/utf8.h b/utf8.h
index b60ccd2..c317a6a 100644
--- a/utf8.h
+++ b/utf8.h
@@ -6,4 +6,9 @@ typedef unsigned int unicode_t;
 unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res);
 unsigned unicode_to_utf8(unsigned int c, char *utf8);
 
+static inline int is_beginning_utf8(unsigned char c)
+{
+	return (c & 0xc0) != 0x80;
+}
+
 #endif
author	Linus Torvalds <torvalds@linux-foundation.org>	2012-07-10 16:40:36 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-07-10 16:40:36 -0700
commit	6b793211c2aec69115dd2769892be0524801f7d8 (patch)
tree	4a9cab1b1a97c7c66ab90353ad6aab56abad30e0
parent	e62cdf04cff63381121364cd6ef077f00d72307a (diff)
download	uemacs-6b793211c2aec69115dd2769892be0524801f7d8.tar.gz