diff options
author | B. Watson <urchlay@slackware.uk> | 2024-12-12 15:40:38 -0500 |
---|---|---|
committer | B. Watson <urchlay@slackware.uk> | 2024-12-12 15:40:38 -0500 |
commit | 5414bc808430e80139a6debe56ccea15afedb3b6 (patch) | |
tree | 67c87f84b978adc24e5f8edb22e8b6bb6698874d | |
parent | de1c3571112e85507fea5e13c046eaf7b5514be1 (diff) | |
download | uxd-5414bc808430e80139a6debe56ccea15afedb3b6.tar.gz |
UTF-16 warnings, better visible null & escape, handle BOM.
-rw-r--r-- | uxd.1 | 7 | ||||
-rw-r--r-- | uxd.c | 75 | ||||
-rw-r--r-- | uxd.rst | 7 |
3 files changed, 67 insertions, 22 deletions
@@ -104,8 +104,9 @@ Printable characters (except the space, U+0020) alternate between green and yell .B \fBpurple\fP Spaces and unprintable characters ("control" characters, newlines, tabs, etc). These are printed as "visible" characters, e.g. ␣ for the space, ↵ for a newline. -This is an improvement over the usual practice of printing these as periods, like -standard hex dumpers do. +Hopefully this is an improvement over the usual practice of printing these as periods, like +standard hex dumpers do. The Unicode BOM (byte order marker, U+FEFF) is printed +as a purple letter B. .TP .B \fBred\fP Invalid UTF\-8 sequences. These are rendered with a red foreground, to make them @@ -132,7 +133,7 @@ ANSI\-style escape sequences, Unicode, and UTF\-8 rendering. The author\(aqs testing is done primarily with \fBurxvt\fP(1). Other terminals aren\(aqt tested as often. .sp -Known to work: urxvt, xterm, st, xfce4\-terminal, gnome\-terminal, the Linux console (but +Known to work: urxvt, xterm, st, xfce4\-terminal, gnome\-terminal, kitty, the Linux console (but see \fBFONTS\fP, below). .sp Known \fBnot\fP to work: rxvt (doesn\(aqt support Unicode at all). @@ -29,10 +29,10 @@ error. If we get a sequence-starter, but the sequence doesn't have the correct number of continuation bytes (e.g. 110xxxxx followed by anything that isn't 10xxxxxx), that's an error too. -BOM: if the file contains ef bb bf (aka U+FEFF), it should be colorized +BOM: if the file contains ef bb bf (aka U+FEFF), it will be colorized as a special (non-printable). If the file begins with ff fe, it's UTF-16 (little endian). If it's -fe ff, it's UTF-16 big-endian. Probably we should detect these and +fe ff, it's UTF-16 big-endian. We detect these and print a warning on stderr. */ @@ -54,7 +54,6 @@ print a warning on stderr. 
#define BAD_FG BLACK #define BAD_BG RED -// const int normal_colors[] = { GREEN, PURPLE, CYAN }; const int normal_colors[] = { GREEN, YELLOW }; int cur_normal_color = 0; int dump_color; @@ -100,9 +99,15 @@ void open_input(const int argc, const char *argv1) { } } +/* Unicode control character printable equivalents. For 0, use + the "empty set" symbol. It's a lot more readable than the "nul" + symbol, ␀. Escape, tab, newline, space are what urxvt uses in + its "keycap picture" mode. The rest of these are hard to read at + normal font sizes, but it's still better than using a dot for + everything like xxd does. */ char * const special_symbols[] = { - "␀", "␁", "␂", "␃", "␄", "␅", "␆", "␇", "␈", "⇥", "↵", "␋", "␌", "␍", "␎", "␏", - "␐", "␑", "␒", "␓", "␔", "␕", "␖", "␗", "␘", "␙", "␚", "␛", "␜", "␝", "␞", "␟", + "∅", "␁", "␂", "␃", "␄", "␅", "␆", "␇", "␈", "⇥", "↵", "␋", "␌", "␍", "␎", "␏", + "␐", "␑", "␒", "␓", "␔", "␕", "␖", "␗", "␘", "␙", "␚", "⎋", "␜", "␝", "␞", "␟", "␣", }; @@ -112,6 +117,8 @@ char *get_special(unsigned char c) { return "?"; /* should never happen */ } +/* set name to use for error messages. this must be called before + open_input(). 
*/ void set_self(const char *argv0) { self = strrchr(argv0, '/'); @@ -121,6 +128,21 @@ void set_self(const char *argv0) { self = argv0; } +void print_line(void) { + int spacing = MAX_DUMP_COLS - dump_column; + + printf("%s", left_buf); + + /* line up the rightmost field (human-readable) */ + while(spacing--) printf(" "); + + printf(" %s\n", right_buf); + + /* clear the buffers, start a new line */ + left_buf[0] = right_buf[0] = '\0'; + dump_column = 0; +} + void next_normal_color() { cur_normal_color++; cur_normal_color %= (sizeof(normal_colors) / sizeof(int)); @@ -139,14 +161,6 @@ void append_color(char *buf, int fgcolor, int bgcolor) { strcat(buf, tmpbuf); } -void print_line(void) { - int spacing = MAX_DUMP_COLS - dump_column; - printf("%s", left_buf); - while(spacing--) printf(" "); - printf(" %s\n", right_buf); - left_buf[0] = right_buf[0] = '\0'; -} - void append_color_off(char *buf) { strcat(buf, "\x1b[0m"); } @@ -169,20 +183,38 @@ void append_left(unsigned char byte, int fgcolor, int bgcolor) { if(dump_column == 7) strcat(left_buf, " "); dump_column++; - if(dump_column == MAX_DUMP_COLS) { + if(dump_column == MAX_DUMP_COLS) print_line(); - dump_column = 0; - } filepos++; } +void check_utf16(int byte0, int byte1) { + char *endian; + + if(byte0 == 0xff && byte1 == 0xfe) { + endian = "little"; + } else if(byte0 == 0xfe && byte1 == 0xff) { + endian = "big"; + } else { + return; + } + + fprintf(stderr, "%s: input looks like UTF-16, %s-endian\n", self, endian); +} + +int is_bom(unsigned char *b) { + return (b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf); +} + +/* return value: false = EOF, true = more data to read */ int dump_utf8_char(void) { unsigned char bytes[] = { 0, 0, 0, 0, 0 }; unsigned char *cont_bytes = bytes + 1; char *printable; int bad = 0, special = 0; int c, cont_count, i, fg, bg; + static int byte0; c = fgetc(input); if(c == EOF) @@ -190,6 +222,12 @@ int dump_utf8_char(void) { bytes[0] = (unsigned char)c; + if(filepos == 0) { + byte0 = c; + } else 
if(filepos == 1) { + check_utf16(byte0, c); + } + if(c < 0x7f) { cont_count = 0; if(c <= ' ' || c == 0x7f) @@ -236,6 +274,10 @@ int dump_utf8_char(void) { fg = SPECIAL; bg = 0; printable = get_special(bytes[0]); + } else if(cont_count == 2 && is_bom(bytes)) { + fg = SPECIAL; + bg = 0; + printable = "B"; } else { fg = normal_colors[cur_normal_color]; bg = 0; @@ -258,6 +300,7 @@ void dump_file(void) { while(dump_utf8_char()) ; + /* handle the last line, if the file size is not divisible by 16. */ if(dump_column) print_line(); } @@ -88,8 +88,9 @@ COLORS **purple** Spaces and unprintable characters ("control" characters, newlines, tabs, etc). These are printed as "visible" characters, e.g. ␣ for the space, ↵ for a newline. - This is an improvement over the usual practice of printing these as periods, like - standard hex dumpers do. + Hopefully this is an improvement over the usual practice of printing these as periods, like + standard hex dumpers do. The Unicode BOM (byte order marker, U+FEFF) is printed + as a purple letter B. **red** Invalid UTF-8 sequences. These are rendered with a red foreground, to make them @@ -111,7 +112,7 @@ ANSI-style escape sequences, Unicode, and UTF-8 rendering. The author's testing is done primarily with **urxvt**\(1). Other terminals aren't tested as often. -Known to work: urxvt, xterm, st, xfce4-terminal, gnome-terminal, the Linux console (but +Known to work: urxvt, xterm, st, xfce4-terminal, gnome-terminal, kitty, the Linux console (but see **FONTS**, below). Known **not** to work: rxvt (doesn't support Unicode at all). |