diff options
-rw-r--r-- | uxd.c | 110 |
1 files changed, 66 insertions, 44 deletions
@@ -560,10 +560,6 @@ int is_overlong(int cont_count, unsigned char *b) { if(!cont_count) return 0; - /* java mode (MUTF-8) allows exactly one overlong: */ - if(java_mode && cont_count == 1 && b[0] == 0xc0 && b[1] == 0x80) - return 0; - /* 2 byte seqs, if the first byte is 0xc0 or 0xc1, it's overlong. */ if(cont_count == 1 && b[0] <= 0xc1) return 1; @@ -617,13 +613,67 @@ void push_back_byte(int c) { } } +char *classify_char(int *hl, unsigned char *bytes, int cont_count) { + char *b = (char *)bytes; + int c; + + c = b[0]; + if(cont_count == 0) { + if(c <= ' ' || c == 0x7f) { + *hl = HL_SPECIAL; + return get_special(c); + } else { + *hl = HL_NORMAL; + return b; + } + } + + if(cont_count == 2 && is_bom(bytes)) { + *hl = HL_SPECIAL; + return PRINT_BOM; + } + + if(is_overlong(cont_count, bytes)) { + /* java mode (MUTF-8) allows exactly one overlong: */ + if(java_mode && cont_count == 1 && bytes[0] == 0xc0 && bytes[1] == 0x80) { + *hl = HL_SPECIAL; /* or should it be normal? */ + return get_special(0); + } else if(permissive) { + *hl = HL_NORMAL; + } else { + *hl = HL_BAD; + } + return PRINT_BAD; + } + + if(is_surrogate(cont_count, bytes)) { + if(wtf8_mode || permissive) { + *hl = HL_NORMAL; + } else { + *hl = HL_BAD; + } + return PRINT_BAD; + } + + if(is_out_of_range(cont_count, bytes)) { + if(permissive) { + *hl = HL_NORMAL; + } else { + *hl = HL_BAD; + } + return PRINT_BAD; + } + + *hl = HL_NORMAL; + return b; +} + /* This is the 'workhorse', called for each character in the file. Return value: false = EOF, true = more data to read */ int dump_utf8_char(void) { unsigned char bytes[] = { 0, 0, 0, 0, 0 }; - unsigned char *cont_bytes = bytes + 1; char *printable; - int bad = 0, special = 0, overlong = 0, hl_type; + int bad = 0, hl_type; int c, cont_count, i; static int byte0; @@ -645,8 +695,6 @@ int dump_utf8_char(void) { if(c <= 0x7f) { ascii_count++; cont_count = 0; - if(c <= ' ' || c == 0x7f) - special = 1; } else if((c & 0xe0) == 0xc0) { /* 110xxxxx */ cont_count = 1; } else if((c & 0xf0) == 0xe0) { /* 1110xxxx */ @@ -675,7 +723,7 @@ int dump_utf8_char(void) { byte_count++; - cb = cont_bytes[i] = (unsigned char)c; + cb = bytes[i + 1] = (unsigned char)c; if((cb & 0xc0) != 0x80) { /* Expected 10xxxxxx, got something else */ cont_count = i; @@ -686,46 +734,22 @@ int dump_utf8_char(void) { } } - if(!permissive) { - /* don't check bad sequences for out-of-range or surrogate */ - if(!bad) { - if(is_out_of_range(cont_count, bytes)) - bad = 1; - else if((!wtf8_mode) && is_surrogate(cont_count, bytes)) - bad = 1; - } - - if(is_overlong(cont_count, bytes)) - overlong = 1; + if(bad) { + hl_type = HL_BAD; + printable = PRINT_BAD; + } else { + printable = classify_char(&hl_type, bytes, cont_count); } - if(bad || overlong) { + if(hl_type == HL_BAD) { bad_count++; } else { char_count++; - if(cont_count) - multi_count++; + if(cont_count) multi_count++; } - /* decide how to highlight the current character */ - if(bad) { - hl_type = HL_BAD; - printable = PRINT_BAD; - } else if(special) { - hl_type = HL_SPECIAL; - printable = get_special(bytes[0]); - } else if(cont_count == 2 && is_bom(bytes)) { - hl_type = HL_SPECIAL; - printable = PRINT_BOM; - } else { - if(overlong) { - hl_type = HL_OVERLONG; - printable = PRINT_BAD; - } else { - hl_type = HL_NORMAL; - printable = (char *)bytes; - } - } + if(hl_type == HL_NORMAL && hilite_multi && cont_count) + hl_type = HL_NORM_INV; /* human-readable (right) column: */ append_hilite(right_buf, hl_type); @@ -733,8 +757,6 @@ int dump_utf8_char(void) { append_hilite_off(right_buf); /* hex columns: */ - if(hilite_multi && cont_count) - hl_type = HL_NORM_INV; for(i = 0; i <= cont_count; i++) { append_left(bytes[i], (i != cont_count), hl_type); } |