#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
    UTF-8 spec summary, taken from Wikipedia and elsewhere, kept
    here for locality of reference.

    Codepoints 0-0x7f encode as themselves, one byte each, bit 7 always 0.

    0x80 and up are encoded as multiple bytes. The first byte's bit 7
    is always 1. The top bits determine the byte length of the sequence:

        110   - 2 bytes
        1110  - 3 bytes
        11110 - 4 bytes

    Continuation (2nd and further bytes) have 10 as the top 2 bits.

    If we get a continuation that's not after a sequence-starter, that's
    an error. If we get a sequence-starter, but the sequence doesn't have
    the correct number of continuation bytes (e.g. 110xxxxx followed by
    anything that isn't 10xxxxxx), that's an error too.

    Note that we don't actually do a full decode of the codepoint bits.
    It's enough to look at the top bits to keep track of multibyte
    characters.

    BOM: if the file contains ef bb bf (aka U+FEFF), it will be colorized
    as a special (non-printable).

    If the file begins with ff fe, it's UTF-16 (little endian). If it's
    fe ff, it's UTF-16 big-endian. We detect these and print a warning
    on stderr.
*/

/* from getopt.c */
extern int my_getopt(int, char **, char *);
extern char *optarg;
extern int optind;

#ifndef VERSION
#define VERSION "(unknown version)"
#endif

#ifndef BUFSIZ
#define BUFSIZ 4096
#endif

/* ANSI colors */
#define BLACK  0 /* don't use (could be the background color) */
#define RED    1
#define GREEN  2
#define YELLOW 3
#define BLUE   4 /* don't use (hard to read on many terminals) */
#define PURPLE 5
#define CYAN   6
#define WHITE  7 /* don't use (could be the background color) */

#define SPECIAL PURPLE
#define BAD_FG  BLACK
#define BAD_BG  RED

/* Foreground colors that valid, non-special characters alternate between. */
const int normal_colors[] = { GREEN, YELLOW };
int cur_normal_color = 0;

int dump_color;

/* Program name for messages; set by set_self(). */
const char *self;

/* Stream being dumped; set by open_input(). */
FILE *input;

/* these buffers are bigger than they need to be really. */
char left_buf[4096];  /* offset + hex bytes of the current output line */
char right_buf[4096]; /* human-readable rendering of the current line */

#define MAX_DUMP_COLS 16

int dump_column = 0; /* number of bytes already placed on the current line */

/* Offset of the next byte to be dumped. Kept as a long so it agrees with
   ftell()/fseek() and with the %lx offset format below. */
long filepos = 0;

/* options */
int bold = 0;             /* -b */
int hilite_multi = 0;     /* -r */
/* NOTE(review): -m is parsed, but nothing in this file reads 'mono'. */
int mono = 0;             /* -m */
long display_offset = 0;  /* -o */
long seekpos = 0;         /* -s, -S */
int seek_offset_zero = 0; /* -S */
long limit;               /* -l */

/* The offset is printed from an unsigned long, so the conversion must be
   %lx/%lX. Both formats are switched to upper case by -u. */
const char *hex_byte_fmt = "%02x";    /* -u */
const char *hex_word_fmt = "%04lx: "; /* " */

/* Print the usage summary on stdout and exit with status 0. */
void usage(void) {
    printf("uxd (Utf-8 heX Dump) v" VERSION " by B. Watson. WTFPL.\n");
    printf("Usage: %s [<file>]\n", self);
    printf(" With no <file>, or with -, read standard input.\n");
    exit(0);
}

/* Print the version string on stdout and exit with status 0. */
void version(void) {
    printf("%s\n", VERSION);
    exit(0);
}

/* Select the input stream: stdin when arg is NULL or "-", otherwise the
   named file opened in binary mode. Exits with status 1 (after printing
   the reason on stderr) if the file cannot be opened. */
void open_input(const char *arg) {
    if(!arg || (strcmp(arg, "-") == 0)) {
        input = stdin;
        return;
    }

    input = fopen(arg, "rb");
    if(!input) {
        fprintf(stderr, "%s: ", self);
        perror(arg);
        exit(1);
    }
}

/* Convert a numeric option argument. Base is auto-detected by strtol()
   (decimal, 0x hex, leading-0 octal). Anything that is not entirely a
   number, or that does not fit in a long, is reported on stderr and
   terminates the program with status 1. */
long parse_number(const char *s) {
    char *end = NULL;
    long value;

    if(!s) {
        fprintf(stderr, "%s: missing numeric argument\n", self);
        exit(1);
    }

    errno = 0;
    value = strtol(s, &end, 0);
    if(end == s || *end != '\0' || errno == ERANGE) {
        fprintf(stderr, "%s: invalid number '%s'\n", self, s);
        exit(1);
    }

    return value;
}

/* Process the command line: set the option globals and open the input.
   --help and --version are only recognized as the first argument. */
void parse_options(int argc, char **argv) {
    int opt;

    if(argc > 1) {
        if(strcmp(argv[1], "--help") == 0)
            usage();
        if(strcmp(argv[1], "--version") == 0)
            version();
    }

    while((opt = my_getopt(argc, argv, "bl:rmo:S:s:uhv")) != -1) {
        switch(opt) {
            case 'b':
                bold = 1;
                break;
            case 'l':
                limit = parse_number(optarg);
                break;
            case 'r':
                hilite_multi = 1;
                break;
            case 'm':
                mono = 1;
                break;
            case 'o':
                display_offset = parse_number(optarg);
                break;
            case 'S':
                seek_offset_zero = 1;
                /* fall thru */
            case 's':
                seekpos = parse_number(optarg);
                break;
            case 'u':
                hex_byte_fmt = "%02X";
                hex_word_fmt = "%04lX: ";
                break;
            case 'h':
                usage();
                break;
            case 'v':
                version();
                break;
            default:
                exit(1);
        }
    }

    /* filename (if present) must come after all -options, and
       there can only be one filename. */
    if(optind < (argc - 1))
        usage();

    /* argv[argc] is NULL, so "no filename" selects stdin. */
    open_input(argv[optind]);
}

/* Unicode control character printable equivalents. For 0, use
   the "empty set" symbol. It's a lot more readable than the "nul"
   symbol. Escape, tab, newline, space are what urxvt uses in its
   "keycap picture" mode. The rest of these are hard to read at normal
   font sizes, but it's still better than using a dot for everything
   like xxd does. Indexed by byte value, 0x00 through 0x20 (space). */
char * const special_symbols[] = {
    "∅", "␁", "␂", "␃", "␄", "␅", "␆", "␇",
    "␈", "⇥", "↵", "␋", "␌", "␍", "␎", "␏",
    "␐", "␑", "␒", "␓", "␔", "␕", "␖", "␗",
    "␘", "␙", "␚", "⎋", "␜", "␝", "␞", "␟",
    "␣",
};

/* Return the printable stand-in for a control character, space or DEL. */
char *get_special(unsigned char c) {
    if(c == 0x7f)
        return "⌦"; /* DEL */
    if(c <= ' ')
        return special_symbols[c];
    return "?"; /* should never happen */
}

/* Set name to use for error messages (the basename of argv[0]).
   This must be called before open_input(). */
void set_self(const char *argv0) {
    const char *slash = strrchr(argv0, '/');

    self = slash ? slash + 1 : argv0;
}

/* Emit the current line (hex field, padding, readable field) on stdout
   and reset the line buffers. */
void print_line(void) {
    int col;

    printf("%s", left_buf);

    /* line up the rightmost field (human-readable), for the partial
       line at the end of the output (if there is one). Every missing
       column is 3 characters wide: 2 hex digits plus a separator. */
    for(col = dump_column; col < MAX_DUMP_COLS; col++)
        fputs("   ", stdout);

    /* a short line never got the extra separator at the midpoint */
    if(dump_column < (MAX_DUMP_COLS / 2))
        putchar(' ');

    printf(" %s\n", right_buf);

    /* clear the buffers, start a new line */
    left_buf[0] = right_buf[0] = '\0';
    dump_column = 0;
}

/* Advance to the next entry of normal_colors[], wrapping around. */
void next_normal_color(void) {
    cur_normal_color++;
    cur_normal_color %= (sizeof(normal_colors) / sizeof(normal_colors[0]));
}

/* Append an ANSI SGR color sequence to buf. A bgcolor of 0 means "leave
   the background alone" (so a black background cannot be requested). */
void append_color(char *buf, int fgcolor, int bgcolor) {
    char tmpbuf[100];

    snprintf(tmpbuf, sizeof tmpbuf, "\x1b[%d;3%d", bold, fgcolor);
    strcat(buf, tmpbuf);

    if(bgcolor) {
        snprintf(tmpbuf, sizeof tmpbuf, ";4%d", bgcolor);
        strcat(buf, tmpbuf);
    }

    strcat(buf, "m");
}

/* Append the ANSI "reset all attributes" sequence to buf. */
void append_color_off(char *buf) {
    strcat(buf, "\x1b[0m");
}

/* Append text to the human-readable (right-hand) field. */
void append_right(const char *str) {
    strcat(right_buf, str);
}

/* Append one byte to the hex (left-hand) field, in the given colors.
   'dash' is true for every byte of a multibyte sequence except the last,
   and joins the byte to its successor with '-' instead of a space.
   Starts the line with the offset when it is the first column, flushes
   the line when it is the last, and advances filepos. */
void append_left(unsigned char byte, int dash, int fgcolor, int bgcolor) {
    char tmpbuf[100];
    int at_midpoint;

    if(!dump_column) {
        /* the cast keeps the argument in step with the %lx conversion */
        snprintf(left_buf, sizeof left_buf, hex_word_fmt,
                 (unsigned long)(filepos + display_offset));
    }

    append_color(left_buf, fgcolor, bgcolor);
    snprintf(tmpbuf, sizeof tmpbuf, hex_byte_fmt, byte);
    strcat(left_buf, tmpbuf);

    dump_column++;
    at_midpoint = (dump_column == (MAX_DUMP_COLS / 2));

    if(dash) {
        /* the joiner stays colored so the sequence reads as one unit */
        strcat(left_buf, "-");
        if(at_midpoint)
            strcat(left_buf, "-");
        append_color_off(left_buf);
    } else {
        append_color_off(left_buf);
        strcat(left_buf, " ");
        if(at_midpoint)
            strcat(left_buf, " ");
    }

    if(dump_column == MAX_DUMP_COLS)
        print_line();

    filepos++;
}

/* Warn on stderr if the two given bytes form a UTF-16 byte order mark. */
void check_utf16(int byte0, int byte1) {
    const char *endian;

    if(byte0 == 0xff && byte1 == 0xfe)
        endian = "little";
    else if(byte0 == 0xfe && byte1 == 0xff)
        endian = "big";
    else
        return;

    fprintf(stderr, "%s: input looks like UTF-16, %s-endian\n", self, endian);
}

/* Since we're not fully decoding the code points, we have to
   check for the actual UTF-8 representation of our one special
   multibyte char. b must point to at least 3 bytes. */
int is_bom(unsigned char *b) {
    return (b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
}

/* U+10FFFF is the last valid codepoint. It encodes to f4 8f bf bf.
   'count' is the count of continuation bytes only (so, 3 for a 4-byte
   sequence). Returns true if the 4-byte sequence at b encodes something
   above U+10FFFF: either the lead byte is f5..f7 (always too big), or it
   is f4 and the first continuation byte is 90 or higher. */
int is_out_of_range(int count, unsigned char *b) {
    if(count < 3)
        return 0;
    if(b[0] > 0xf4)
        return 1;
    return (b[0] == 0xf4 && b[1] >= 0x90);
}

/* This is the 'workhorse', called for each character in the file.
   Reads one character (1 to 4 bytes) from input, classifies it as
   normal, special (control/space/DEL/BOM) or bad, and appends it to
   both output fields.
   Return value: false = EOF, true = more data to read */
int dump_utf8_char(void) {
    /* one spare element keeps the array NUL-terminated, so a valid
       sequence can be used directly as a C string */
    unsigned char bytes[] = { 0, 0, 0, 0, 0 };
    unsigned char *cont_bytes = bytes + 1;
    const char *printable;
    int bad = 0, special = 0;
    int c, cont_count, i, fg, bg;
    static int byte0; /* first byte of the input, for check_utf16() */

    c = fgetc(input);
    if(c == EOF)
        return 0;

    bytes[0] = (unsigned char)c;

    if(filepos == 0) {
        byte0 = c;
    } else if(filepos == 1) {
        check_utf16(byte0, c);
    }

    if(c <= 0x7f) {
        /* plain ASCII; 0x7f (DEL) belongs here, not with the bad bytes */
        cont_count = 0;
        if(c <= ' ' || c == 0x7f)
            special = 1;
    } else if((c & 0xe0) == 0xc0) { /* 110xxxxx */
        cont_count = 1;
    } else if((c & 0xf0) == 0xe0) { /* 1110xxxx */
        cont_count = 2;
    } else if((c & 0xf8) == 0xf0) { /* 11110xxx */
        cont_count = 3;
    } else {
        /* stray continuation byte, or f8..ff */
        cont_count = 0;
        bad = 1;
    }

    for(i = 0; i < cont_count; i++) {
        int cb;

        c = fgetc(input);
        if(c == EOF) {
            /* EOF in mid-sequence */
            cont_count = i;
            bad = 1;
            break;
        }

        cb = cont_bytes[i] = (unsigned char)c;
        if((cb & 0xc0) != 0x80) {
            /* Expected 10xxxxxx, got something else. Push it back so
               it gets dumped as the start of the next character. */
            cont_count = i;
            bad = 1;
            ungetc(cb, input);
            break;
        }
    }

    if(is_out_of_range(cont_count, bytes))
        bad = 1;

    if(bad) {
        fg = BAD_FG;
        bg = BAD_BG;
        /* replacement character is U+FFFD */
        printable = "�";
    } else if(special) {
        fg = SPECIAL;
        bg = 0;
        printable = get_special(bytes[0]);
    } else if(cont_count == 2 && is_bom(bytes)) {
        fg = SPECIAL;
        bg = 0;
        printable = "B";
    } else {
        fg = normal_colors[cur_normal_color];
        bg = 0;
        printable = (const char *)bytes;
        next_normal_color();
    }

    append_color(right_buf, fg, bg);
    append_right(printable);
    append_color_off(right_buf);

    /* -r: show the hex bytes of multibyte sequences in swapped colors */
    if(hilite_multi && cont_count) {
        int swap = bg;
        bg = fg;
        fg = swap;
    }

    for(i = 0; i <= cont_count; i++)
        append_left(bytes[i], (i != cont_count), fg, bg);

    return 1;
}

/* this only gets called when reading stdin. Reads and discards 'bytes'
   bytes; the caller must not ask for more than BUFSIZ at a time. Hitting
   EOF early is not an error, any other short read is fatal. */
void skip_input(unsigned int bytes) {
    char tmp[BUFSIZ];

    if(fread(tmp, 1, bytes, input) >= bytes)
        return;

    if(feof(input))
        return;

    /* this probably never happens when reading from stdin: */
    fprintf(stderr, "%s: ", self);
    perror("fread()");
    exit(1);
}

/* this only gets called when reading stdin. Emulates a forward seek to
   seekpos by reading and discarding, BUFSIZ bytes at a time. */
void fake_seek(void) {
    long remaining = seekpos;

    while(remaining >= BUFSIZ) {
        skip_input(BUFSIZ);
        if(feof(input))
            return;
        remaining -= BUFSIZ;
    }

    skip_input((unsigned int)remaining);
}

/* used by -s option. A negative seekpos is relative to the end of the
   input. If the stream can't seek, a forward seek is faked by reading;
   a failed seek from the end is fatal. */
void seek_input(void) {
    int whence = (seekpos < 0) ? SEEK_END : SEEK_SET;

    if(fseek(input, seekpos, whence) == 0) {
        filepos = ftell(input);
        return;
    }

    /* fseek() failed, likely we're reading stdin. fake it, if we can. */
    if(whence != SEEK_SET) {
        perror(self);
        exit(1);
    }

    clearerr(input);
    fake_seek();
    filepos = seekpos;
}

/* Dump the input from the requested start position until EOF, or until
   at least 'limit' bytes have been dumped (-l). A character that
   straddles the limit is dumped whole. */
void dump_file(void) {
    long start;

    if(seekpos)
        seek_input();

    if(seek_offset_zero)
        filepos = 0;

    /* -l counts bytes from where dumping starts, not from the start of
       the file, so it has to be measured against this position */
    start = filepos;

    while(dump_utf8_char()) {
        if(limit && ((filepos - start) >= limit))
            break;
    }

    /* handle the last line, if the file size not divisible by 16. */
    if(dump_column)
        print_line();
}

int main(int argc, char **argv) {
    set_self(argv[0]);
    parse_options(argc, argv);
    dump_file();
    fclose(input);
    return 0;
}