aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- uxd.c 110
1 files changed, 66 insertions, 44 deletions
diff --git a/uxd.c b/uxd.c
index b30b22b..7724669 100644
--- a/uxd.c
+++ b/uxd.c
@@ -560,10 +560,6 @@ int is_overlong(int cont_count, unsigned char *b) {
if(!cont_count)
return 0;
- /* java mode (MUTF-8) allows exactly one overlong: */
- if(java_mode && cont_count == 1 && b[0] == 0xc0 && b[1] == 0x80)
- return 0;
-
/* 2 byte seqs, if the first byte is 0xc0 or 0xc1, it's overlong. */
if(cont_count == 1 && b[0] <= 0xc1)
return 1;
@@ -617,13 +613,67 @@ void push_back_byte(int c) {
}
}
+/* Decide how one character is shown in the human-readable column.
+ *
+ * hl:         out parameter; receives HL_SPECIAL, HL_NORMAL or HL_BAD.
+ * bytes:      the lead byte followed by cont_count continuation bytes.
+ * cont_count: number of continuation bytes (0 for a single-byte char).
+ *
+ * Returns the string to print: the result of get_special(), PRINT_BOM,
+ * PRINT_BAD, or the bytes themselves cast to char *. In the last case
+ * the string is only as terminated as the caller's buffer is.
+ *
+ * Checks run in this order: single byte, BOM, overlong, surrogate,
+ * out of range. The first one that matches decides the result.
+ */
+char *classify_char(int *hl, unsigned char *bytes, int cont_count) {
+	char *b = (char *)bytes;
+	int c;
+
+	/* Read the lead byte as unsigned. Plain char may be signed, which
+	   would turn a byte >= 0x80 into a negative value that passes the
+	   "c <= ' '" test below and gets handed to get_special(). */
+	c = bytes[0];
+	if(cont_count == 0) {
+		if(c <= ' ' || c == 0x7f) {
+			*hl = HL_SPECIAL;
+			return get_special(c);
+		} else {
+			*hl = HL_NORMAL;
+			return b;
+		}
+	}
+
+	if(cont_count == 2 && is_bom(bytes)) {
+		*hl = HL_SPECIAL;
+		return PRINT_BOM;
+	}
+
+	if(is_overlong(cont_count, bytes)) {
+		/* java mode (MUTF-8) allows exactly one overlong: */
+		if(java_mode && cont_count == 1 && bytes[0] == 0xc0 && bytes[1] == 0x80) {
+			*hl = HL_SPECIAL; /* or should it be normal? */
+			return get_special(0);
+		} else if(permissive) {
+			/* not highlighted as bad, but still printed as PRINT_BAD */
+			*hl = HL_NORMAL;
+		} else {
+			*hl = HL_BAD;
+		}
+		return PRINT_BAD;
+	}
+
+	if(is_surrogate(cont_count, bytes)) {
+		/* either mode flag suppresses the bad highlight for surrogates */
+		if(wtf8_mode || permissive) {
+			*hl = HL_NORMAL;
+		} else {
+			*hl = HL_BAD;
+		}
+		return PRINT_BAD;
+	}
+
+	if(is_out_of_range(cont_count, bytes)) {
+		if(permissive) {
+			*hl = HL_NORMAL;
+		} else {
+			*hl = HL_BAD;
+		}
+		return PRINT_BAD;
+	}
+
+	/* none of the checks matched: print the sequence as-is */
+	*hl = HL_NORMAL;
+	return b;
+}
+
+
/* This is the 'workhorse', called for each character in the file.
Return value: false = EOF, true = more data to read */
int dump_utf8_char(void) {
unsigned char bytes[] = { 0, 0, 0, 0, 0 };
- unsigned char *cont_bytes = bytes + 1;
char *printable;
- int bad = 0, special = 0, overlong = 0, hl_type;
+ int bad = 0, hl_type;
int c, cont_count, i;
static int byte0;
@@ -645,8 +695,6 @@ int dump_utf8_char(void) {
if(c <= 0x7f) {
ascii_count++;
cont_count = 0;
- if(c <= ' ' || c == 0x7f)
- special = 1;
} else if((c & 0xe0) == 0xc0) { /* 110xxxxx */
cont_count = 1;
} else if((c & 0xf0) == 0xe0) { /* 1110xxxx */
@@ -675,7 +723,7 @@ int dump_utf8_char(void) {
byte_count++;
- cb = cont_bytes[i] = (unsigned char)c;
+ cb = bytes[i + 1] = (unsigned char)c;
if((cb & 0xc0) != 0x80) {
/* Expected 10xxxxxx, got something else */
cont_count = i;
@@ -686,46 +734,22 @@ int dump_utf8_char(void) {
}
}
- if(!permissive) {
- /* don't check bad sequences for out-of-range or surrogate */
- if(!bad) {
- if(is_out_of_range(cont_count, bytes))
- bad = 1;
- else if((!wtf8_mode) && is_surrogate(cont_count, bytes))
- bad = 1;
- }
-
- if(is_overlong(cont_count, bytes))
- overlong = 1;
+ if(bad) {
+ hl_type = HL_BAD;
+ printable = PRINT_BAD;
+ } else {
+ printable = classify_char(&hl_type, bytes, cont_count);
}
- if(bad || overlong) {
+ if(hl_type == HL_BAD) {
bad_count++;
} else {
char_count++;
- if(cont_count)
- multi_count++;
+ if(cont_count) multi_count++;
}
- /* decide how to highlight the current character */
- if(bad) {
- hl_type = HL_BAD;
- printable = PRINT_BAD;
- } else if(special) {
- hl_type = HL_SPECIAL;
- printable = get_special(bytes[0]);
- } else if(cont_count == 2 && is_bom(bytes)) {
- hl_type = HL_SPECIAL;
- printable = PRINT_BOM;
- } else {
- if(overlong) {
- hl_type = HL_OVERLONG;
- printable = PRINT_BAD;
- } else {
- hl_type = HL_NORMAL;
- printable = (char *)bytes;
- }
- }
+ if(hl_type == HL_NORMAL && hilite_multi && cont_count)
+ hl_type = HL_NORM_INV;
/* human-readable (right) column: */
append_hilite(right_buf, hl_type);
@@ -733,8 +757,6 @@ int dump_utf8_char(void) {
append_hilite_off(right_buf);
/* hex columns: */
- if(hilite_multi && cont_count)
- hl_type = HL_NORM_INV;
for(i = 0; i <= cont_count; i++) {
append_left(bytes[i], (i != cont_count), hl_type);
}