1 files changed, 66 insertions, 44 deletions
diff --git a/uxd.c b/uxd.c
index b30b22b..7724669 100644
--- a/uxd.c
+++ b/uxd.c
@@ -560,10 +560,6 @@ int is_overlong(int cont_count, unsigned char *b) {
 	if(!cont_count)
 		return 0;
 
-	/* java mode (MUTF-8) allows exactly one overlong: */
-	if(java_mode && cont_count == 1 && b[0] == 0xc0 && b[1] == 0x80)
-		return 0;
-
 	/* 2 byte seqs, if the first byte is 0xc0 or 0xc1, it's overlong. */
 	if(cont_count == 1 && b[0] <= 0xc1)
 		return 1;
@@ -617,13 +613,67 @@ void push_back_byte(int c) {
 	}
 }
 
+/* Pick the right-column text and the highlight type for one sequence.
+
+   hl:         out parameter, receives one of the HL_* values.
+   bytes:      lead byte followed by cont_count continuation bytes.
+   cont_count: number of continuation bytes; 0 means a single byte.
+
+   Returns get_special(), PRINT_BOM or PRINT_BAD for sequences that are
+   not shown literally, otherwise bytes itself, cast to char *. */
+char *classify_char(int *hl, unsigned char *bytes, int cont_count) {
+	/* Read through the unsigned pointer: where plain char is signed, a
+	   byte >= 0x80 would go negative, pass the "<= ' '" test below and
+	   hand get_special() a negative value. */
+	int c = bytes[0];
+
+	if(cont_count == 0) {
+		if(c <= ' ' || c == 0x7f) {
+			*hl = HL_SPECIAL;
+			return get_special(c);
+		}
+		*hl = HL_NORMAL;
+		return (char *)bytes;
+	}
+
+	if(cont_count == 2 && is_bom(bytes)) {
+		*hl = HL_SPECIAL;
+		return PRINT_BOM;
+	}
+
+	if(is_overlong(cont_count, bytes)) {
+		/* java mode (MUTF-8) allows exactly one overlong: */
+		if(java_mode && cont_count == 1 && bytes[0] == 0xc0 && bytes[1] == 0x80) {
+			*hl = HL_SPECIAL; /* or should it be normal? */
+			return get_special(0);
+		}
+		/* permissive: still printed as PRINT_BAD, but not highlighted bad */
+		*hl = permissive ? HL_NORMAL : HL_BAD;
+		return PRINT_BAD;
+	}
+
+	if(is_surrogate(cont_count, bytes)) {
+		/* wtf8_mode gets the same highlight as permissive here */
+		*hl = (wtf8_mode || permissive) ? HL_NORMAL : HL_BAD;
+		return PRINT_BAD;
+	}
+
+	if(is_out_of_range(cont_count, bytes)) {
+		*hl = permissive ? HL_NORMAL : HL_BAD;
+		return PRINT_BAD;
+	}
+
+	/* none of the checks above matched: print the bytes as-is */
+	*hl = HL_NORMAL;
+	return (char *)bytes;
+}
+
 /* This is the 'workhorse', called for each character in the file.
    Return value: false = EOF, true = more data to read */
 int dump_utf8_char(void) {
 	unsigned char bytes[] = { 0, 0, 0, 0, 0 };
-	unsigned char *cont_bytes = bytes + 1;
 	char *printable;
-	int bad = 0, special = 0, overlong = 0, hl_type;
+	int bad = 0, hl_type;
 	int c, cont_count, i;
 	static int byte0;
 
@@ -645,8 +695,6 @@ int dump_utf8_char(void) {
 	if(c <= 0x7f) {
 		ascii_count++;
 		cont_count = 0;
-		if(c <= ' ' || c == 0x7f)
-			special = 1;
 	} else if((c & 0xe0) == 0xc0) {   /* 110xxxxx */
 		cont_count = 1;
 	} else if((c & 0xf0) == 0xe0) {   /* 1110xxxx */
@@ -675,7 +723,7 @@ int dump_utf8_char(void) {
 
 		byte_count++;
 
-		cb = cont_bytes[i] = (unsigned char)c;
+		cb = bytes[i + 1] = (unsigned char)c;
 		if((cb & 0xc0) != 0x80) {
 			/* Expected 10xxxxxx, got something else */
 			cont_count = i;
@@ -686,46 +734,22 @@ int dump_utf8_char(void) {
 		}
 	}
 
-	if(!permissive) {
-		/* don't check bad sequences for out-of-range or surrogate */
-		if(!bad) {
-			if(is_out_of_range(cont_count, bytes))
-				bad = 1;
-			else if((!wtf8_mode) && is_surrogate(cont_count, bytes))
-				bad = 1;
-		}
-
-		if(is_overlong(cont_count, bytes))
-			overlong = 1;
+	if(bad) {
+		hl_type = HL_BAD;
+		printable = PRINT_BAD;
+	} else {
+		printable = classify_char(&hl_type, bytes, cont_count);
 	}
 
-	if(bad || overlong) {
+	if(hl_type == HL_BAD) {
 		bad_count++;
 	} else {
 		char_count++;
-		if(cont_count)
-			multi_count++;
+		if(cont_count) multi_count++;
 	}
 
-	/* decide how to highlight the current character */
-	if(bad) {
-		hl_type = HL_BAD;
-		printable = PRINT_BAD;
-	} else if(special) {
-		hl_type = HL_SPECIAL;
-		printable = get_special(bytes[0]);
-	} else if(cont_count == 2 && is_bom(bytes)) {
-		hl_type = HL_SPECIAL;
-		printable = PRINT_BOM;
-	} else {
-		if(overlong) {
-			hl_type = HL_OVERLONG;
-			printable = PRINT_BAD;
-		} else {
-			hl_type = HL_NORMAL;
-			printable = (char *)bytes;
-		}
-	}
+	if(hl_type == HL_NORMAL && hilite_multi && cont_count)
+		hl_type = HL_NORM_INV;
 
 	/* human-readable (right) column: */
 	append_hilite(right_buf, hl_type);
@@ -733,8 +757,6 @@ int dump_utf8_char(void) {
 	append_hilite_off(right_buf);
 
 	/* hex columns: */
-	if(hilite_multi && cont_count)
-		hl_type = HL_NORM_INV;
 	for(i = 0; i <= cont_count; i++) {
 		append_left(bytes[i], (i != cont_count), hl_type);
 	}