From 10e290ee83f985e0e44932889bee655e6dea1088 Mon Sep 17 00:00:00 2001
From: Jonas Fonseca <fonseca@diku.dk>
Date: Sat, 20 May 2006 03:30:41 +0200
Subject: [PATCH] Add preliminary support for UTF-8 handling in the main view

It is very simple but fixes the issue with unaligned drawing of the line
between the author and log title columns.
---
 tig.c | 176 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 173 insertions(+), 3 deletions(-)

diff --git a/tig.c b/tig.c
index ca57f75..95da3db 100644
--- a/tig.c
+++ b/tig.c
@@ -55,6 +55,7 @@
 static void die(const char *err, ...);
 static void report(const char *msg, ...);
 static void set_nonblocking_input(bool loading);
+static size_t utf8_length(const char *string, size_t max_width, int *coloffset, int *trimmed);
 
 #define ABS(x)		((x) >= 0  ? (x) : -(x))
 #define MIN(x, y)	((x) < (y) ? (x) :  (y))
@@ -74,6 +75,8 @@ static void set_nonblocking_input(bool loading);
 #define DATE_FORMAT	"%Y-%m-%d %H:%M"
 #define DATE_COLS	STRING_SIZE("2006-04-29 14:21 ")
 
+#define AUTHOR_COLS	20
+
 /* The default interval between line numbers. */
 #define NUMBER_INTERVAL	1
 
@@ -1566,6 +1569,8 @@ main_draw(struct view *view, unsigned int lineno)
 	enum line_type type;
 	int col = 0;
 	size_t timelen;
+	size_t authorlen;
+	int trimmed;
 
 	if (view->offset + lineno >= view->lines)
 		return FALSE;
@@ -1597,8 +1602,11 @@ main_draw(struct view *view, unsigned int lineno)
 	if (type != LINE_CURSOR)
 		wattrset(view->win, get_line_attr(LINE_MAIN_AUTHOR));
 
-	if (strlen(commit->author) > 19) {
-		waddnstr(view->win, commit->author, 18);
+	/* FIXME: Make this optional, and add i18n.commitEncoding support. */
+	authorlen = utf8_length(commit->author, AUTHOR_COLS - 2, &col, &trimmed);
+
+	if (trimmed) {
+		waddnstr(view->win, commit->author, authorlen);
 		if (type != LINE_CURSOR)
 			wattrset(view->win, get_line_attr(LINE_MAIN_DELIM));
 		waddch(view->win, '~');
@@ -1606,7 +1614,7 @@ main_draw(struct view *view, unsigned int lineno)
 		waddstr(view->win, commit->author);
 	}
 
-	col += 20;
+	col += AUTHOR_COLS;
 	if (type != LINE_CURSOR)
 		wattrset(view->win, A_NORMAL);
 
@@ -1897,6 +1905,166 @@ get_request(int key)
 }
 
 
+/*
+ * Unicode / UTF-8 handling
+ *
+ * NOTE: Much of the following code for dealing with unicode is derived from
+ * ELinks' UTF-8 code developed by Scrool <scroolik@gmail.com>. Origin file is
+ * src/intl/charset.c from the utf8 branch commit elinks-0.11.0-g31f2c28.
+ */
+
+/* I've (over)annotated a lot of code snippets because I am not entirely
+ * confident that the approach taken by this small UTF-8 interface is correct.
+ * --jonas */
+
+static inline int
+unicode_width(unsigned long c)
+{
+	if (c >= 0x1100 &&
+	   (c <= 0x115f				/* Hangul Jamo */
+	    || c == 0x2329
+	    || c == 0x232a
+	    || (c >= 0x2e80  && c <= 0xa4cf && c != 0x303f)
+	    					/* CJK ... Yi */
+	    || (c >= 0xac00  && c <= 0xd7a3)	/* Hangul Syllables */
+	    || (c >= 0xf900  && c <= 0xfaff)	/* CJK Compatibility Ideographs */
+	    || (c >= 0xfe30  && c <= 0xfe6f)	/* CJK Compatibility Forms */
+	    || (c >= 0xff00  && c <= 0xff60)	/* Fullwidth Forms */
+	    || (c >= 0xffe0  && c <= 0xffe6)
+	    || (c >= 0x20000 && c <= 0x2fffd)
+	    || (c >= 0x30000 && c <= 0x3fffd)))
+		return 2;
+
+	return 1;
+}
+
+/* Number of bytes used for encoding a UTF-8 character, indexed by first byte.
+ * Illegal bytes are set to one. */
+static const unsigned char utf8_bytes[256] = {
+	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
+	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
+	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
+	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
+	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
+	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
+	2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
+	3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
+};
+
+/* Decode UTF-8 multi-byte representation into a unicode character. */
+static inline unsigned long
+utf8_to_unicode(const char *string, size_t length)
+{
+	unsigned long unicode;
+
+	switch (length) {
+	case 1:
+		unicode  =   string[0];
+		break;
+	case 2:
+		unicode  =  (string[0] & 0x1f) << 6;
+		unicode +=  (string[1] & 0x3f);
+		break;
+	case 3:
+		unicode  =  (string[0] & 0x0f) << 12;
+		unicode += ((string[1] & 0x3f) << 6);
+		unicode +=  (string[2] & 0x3f);
+		break;
+	case 4:
+		unicode  =  (string[0] & 0x0f) << 18;
+		unicode += ((string[1] & 0x3f) << 12);
+		unicode += ((string[2] & 0x3f) << 6);
+		unicode +=  (string[3] & 0x3f);
+		break;
+	case 5:
+		unicode  =  (string[0] & 0x0f) << 24;
+		unicode += ((string[1] & 0x3f) << 18);
+		unicode += ((string[2] & 0x3f) << 12);
+		unicode += ((string[3] & 0x3f) << 6);
+		unicode +=  (string[4] & 0x3f);
+		break;
+	case 6:	
+		unicode  =  (string[0] & 0x01) << 30;
+		unicode += ((string[1] & 0x3f) << 24);
+		unicode += ((string[2] & 0x3f) << 18);
+		unicode += ((string[3] & 0x3f) << 12);
+		unicode += ((string[4] & 0x3f) << 6);
+		unicode +=  (string[5] & 0x3f);
+		break;
+	default:
+		die("Invalid unicode length");
+	}
+
+	/* Invalid characters could return the special 0xfffd value but NUL
+	 * should be just as good. */
+	return unicode > 0xffff ? 0 : unicode;
+}
+
+/* Calculates how much of string can be shown within the given maximum width
+ * and sets trimmed parameter to non-zero value if all of string could not be
+ * shown.
+ *
+ * Additionally, adds to coloffset how many columns to move to align with
+ * the expected position. Takes into account how multi-byte and double-width
+ * characters will affect the cursor position.
+ *
+ * Returns the number of bytes to output from string to satisfy max_width. */
+static size_t
+utf8_length(const char *string, size_t max_width, int *coloffset, int *trimmed)
+{
+	const char *start = string;
+	const char *end = strchr(string, '\0');
+	size_t mbwidth = 0;
+	size_t width = 0;
+
+	*trimmed = 0;
+
+	while (string < end) {
+		int c = *(unsigned char *) string;
+		unsigned char bytes = utf8_bytes[c];
+		size_t ucwidth;
+		unsigned long unicode;
+
+		if (string + bytes > end)
+			break;
+
+		/* Change representation to figure out whether
+		 * it is a single- or double-width character. */
+
+		unicode = utf8_to_unicode(string, bytes);
+		/* FIXME: Graceful handling of invalid unicode character. */
+		if (!unicode)
+			break;
+
+		ucwidth = unicode_width(unicode);
+		width  += ucwidth;
+		if (width > max_width) {
+			*trimmed = 1;
+			break;
+		}
+
+		/* The column offset collects the differences between the
+		 * number of bytes encoding a character and the number of
+		 * columns that will be used for rendering said character.
+		 *
+		 * So if some character A is encoded in 2 bytes, but will be
+		 * represented on the screen using only 1 column this will end up
+		 * adding 1 to the multi-byte column offset.
+		 *
+		 * Assumes that no double-width character can be encoded in
+		 * less than two bytes. */
+		if (bytes > ucwidth)
+			mbwidth += bytes - ucwidth;
+
+		string  += bytes;
+	}
+
+	*coloffset += mbwidth;
+
+	return string - start;
+}
+
+
 /*
  * Status management
  */
@@ -2316,6 +2484,8 @@ main(int argc, char *argv[])
  * ----
  * Known bugs and problems:
  *
+ * - In its current state tig is pretty much UTF-8 only.
+ *
  * - If the screen width is very small the main view can draw
  *   outside the current view causing bad wrapping. Same goes
  *   for title and status windows.
-- 
2.32.0.93.g670b81a890