#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

/* UTF-8 spec summary, taken from Wikipedia and elsewhere, kept here
   for locality of reference.

   Codepoints 0-0x7f encode as themselves, one byte each, bit 7
   always 0.

   0x80 and up are encoded as multiple bytes. The first byte's bit 7
   is always 1. The top bits determine the byte length of the sequence:

     110   - 2 bytes
     1110  - 3 bytes
     11110 - 4 bytes

   Continuation (2nd and further bytes) have 10 as the top 2 bits.

   If we get a continuation that's not after a sequence-starter, that's
   an error. If we get a sequence-starter, but the sequence doesn't have
   the correct number of continuation bytes (e.g. 110xxxxx followed by
   anything that isn't 10xxxxxx), that's an error too.

   Note that we don't actually do a full decode of the codepoint bits.
   It's enough to look at the top bits to keep track of multibyte
   characters.

   BOM: if the file contains ef bb bf (aka U+FEFF), it will be colorized
   as a special (non-printable). If the file begins with ff fe, it's
   UTF-16 (little endian). If it's fe ff, it's UTF-16 big-endian. We
   detect these and print a warning on stderr.
*/

/* from getopt.c */
extern int my_getopt(int, char **, char *);
extern char *optarg;
extern int optind;

#ifndef VERSION
#define VERSION "(unknown version)"
#endif

#ifndef BUFSIZ
#define BUFSIZ 4096
#endif

#define NO_COLOR "NO_COLOR"
#define ENV_OPTS "UXD_OPTS"
#define MAX_ARGS 64

/* ANSI colors */
#define BLACK  0 /* don't use (could be the background color) */
#define RED    1
#define GREEN  2
#define YELLOW 3
#define BLUE   4 /* don't use (hard to read on many terminals) */
#define PURPLE 5
#define CYAN   6
#define WHITE  7 /* don't use (could be the background color) */

#define SPECIAL PURPLE
#define BAD_FG  BLACK
#define BAD_BG  bad_color

/* highlight types, passed to the append_*() functions */
#define HL_NORMAL   0
#define HL_NORM_INV 1
#define HL_SPECIAL  2
#define HL_BAD      3

int normal_colors[] = { GREEN, YELLOW };
int cur_normal_hilite = 1;
int bad_color = RED;
int special_color = SPECIAL;

const char *self;
FILE *input;

/* these buffers are bigger than they need to be really. */
char left_buf[4096];
char right_buf[4096];

#define MAX_DUMP_COLS 16

int dump_column = 0;

/* long, not int: it is assigned from ftell() and compared with the
   long -l limit. */
long filepos = 0;

/* Unicode control character printable equivalents. For 0, use the
   "empty set" symbol. It's a lot more readable than the "nul" symbol, ␀.
   Escape, tab, newline, space are what urxvt uses in its "keycap
   picture" mode. The rest of these are hard to read at normal font
   sizes, but it's still better than using a dot for everything like
   xxd does.
   Indexed by byte value, 0x00 through 0x20 (33 entries). */
char * const special_symbols[] = {
	"∅", "␁", "␂", "␃", "␄", "␅", "␆", "␇",
	"␈", "⇥", "↵", "␋", "␌", "␍", "␎", "␏",
	"␐", "␑", "␒", "␓", "␔", "␕", "␖", "␗",
	"␘", "␙", "␚", "⎋", "␜", "␝", "␞", "␟",
	"␣",
};

/* options */
int print_info = 0;                    /* -i */
int bold = 0;                          /* -b */
int hilite_multi = 0;                  /* -r */
int mono = 0;                          /* -m */
long display_offset = 0;               /* -o */
long seekpos = 0;                      /* -s, -S */
int seek_offset_zero = 0;              /* -S */
long limit = 0;                        /* -l */
const char *hex_byte_fmt = "%02x";     /* -u */
/* The offset is a long, so the conversion needs the 'l' modifier. */
const char *hex_word_fmt = "%04lx: ";  /* -u */

/* stats for -i option */
long byte_count = 0;
long ascii_count = 0;
long multi_count = 0;
long bad_count = 0;
long char_count = 0;

/* Print the help text on stdout and exit with status 0. */
void usage(void) {
	printf("uxd (Utf-8 heX Dump) v" VERSION " by B. Watson. WTFPL.\n");
	printf("Usage: %s [<file>]\n", self);
	printf(" With no <file>, or with -, read standard input.\n");
	exit(0);
}

/* Print the version string on stdout and exit with status 0. */
void version(void) {
	printf("%s\n", VERSION);
	exit(0);
}

/* Set the global 'input' stream. A NULL arg or "-" selects stdin,
   anything else is opened as a binary file. Exits with status 1 if
   the file can't be opened. */
void open_input(const char *arg) {
	if(!arg || (strcmp(arg, "-") == 0)) {
		input = stdin;
		return;
	}

	input = fopen(arg, "rb");
	if(!input) {
		fprintf(stderr, "%s: ", self);
		perror(arg);
		exit(1);
	}
}

/* Complain about a bad -c argument and exit with status 1. */
void color_error(void) {
	fprintf(stderr, "%s: invalid -c colors (-h for help).\n", self);
	exit(1);
}

/* Exit via color_error() unless c is an ANSI color digit, '0'-'7'. */
void check_color(char c) {
	if(c < '0' || c > '7')
		color_error();
}

/* Parse the -c argument: 2 to 4 color digits. The first two set the
   alternating normal colors, the optional 3rd sets the 'special' color,
   the optional 4th sets the 'bad' color. Anything past the 4th
   character is ignored. */
void parse_colors(char *arg) {
	if(!arg[0])
		return; /* should never happen anyway */

	/* first 2 are required. if arg is only 1 character long, arg[1]
	   is the terminator, which check_color() rejects. */
	check_color(arg[0]);
	check_color(arg[1]);
	normal_colors[0] = arg[0] - '0';
	normal_colors[1] = arg[1] - '0';

	/* optional 3rd color */
	if(!arg[2])
		return;
	check_color(arg[2]);
	special_color = arg[2] - '0';

	/* optional 4th color */
	if(!arg[3])
		return;
	check_color(arg[3]);
	bad_color = arg[3] - '0';
}

/* Convert an option argument to a long. Base is auto-detected by
   strtol() (decimal, 0x hex, leading-0 octal). Exits with status 1 if
   the string is empty, has trailing junk, or is out of range for a
   long. */
long parse_number(const char *s) {
	char *end;
	long value;

	errno = 0;
	value = strtol(s, &end, 0);
	if(end == s || *end != '\0' || errno == ERANGE) {
		fprintf(stderr, "%s: invalid number '%s'\n", self, s);
		exit(1);
	}

	return value;
}

/* Process an argument vector: set the option globals, then open the
   input named by the one optional non-option argument. */
void parse_args(int argc, char **argv) {
	int opt;

	if(argc > 1) {
		if(strcmp(argv[1], "--help") == 0)
			usage();
		if(strcmp(argv[1], "--version") == 0)
			version();
	}

	while((opt = my_getopt(argc, argv, "ic:nbl:rmo:S:s:uhv")) != -1) {
		switch(opt) {
			case 'i':
				print_info = 1;
				break;
			case 'c':
				mono = 0;
				parse_colors(optarg);
				break;
			case 'n':
				break; /* already handled in parse_options() */
			case 'b':
				bold = 1;
				break;
			case 'l':
				limit = parse_number(optarg);
				break;
			case 'r':
				hilite_multi = 1;
				break;
			case 'm':
				mono = 1;
				break;
			case 'o':
				display_offset = parse_number(optarg);
				break;
			case 'S':
				seek_offset_zero = 1;
				/* fall thru */
			case 's':
				seekpos = parse_number(optarg);
				break;
			case 'u':
				hex_byte_fmt = "%02X";
				hex_word_fmt = "%04lX: ";
				break;
			case 'h':
				usage();
				break;
			case 'v':
				version();
				break;
			default:
				exit(1);
		}
	}

	/* filename (if present) must come after all -options, and there
	   can only be one filename. */
	if(optind < (argc - 1))
		usage();

	/* if there's no filename, argv[optind] is the NULL terminator,
	   which open_input() treats as stdin. */
	open_input(argv[optind]);
}

/* read options from the environment and the command line, create a new
   argv/argc that has all the options from both, with the environment
   ones first. A command-line argument starting with -n disables the
   environment options. At most MAX_ARGS arguments are kept; any extras
   are silently dropped. */
void parse_options(int argc, char **argv) {
	int nargc, i;
	char *nargv[MAX_ARGS + 1];
	char *env, *env_copy, *p;
	size_t env_size;

	if(getenv(NO_COLOR))
		mono = 1;

	env = getenv(ENV_OPTS);
	if(!env) {
		/* nothing in the env, use regular args as-is */
		parse_args(argc, argv);
		return;
	}

	/* have to check for the -n option here, before the environment
	   options get merged in. */
	for(i = 1; i < argc; i++) {
		if(argv[i][0] == '-' && argv[i][1] == 'n') {
			parse_args(argc, argv);
			return;
		}
	}

	/* the string getenv() returns must not be modified, so split a
	   private copy into words. */
	env_size = strlen(env) + 1;
	env_copy = malloc(env_size);
	if(!env_copy) {
		fprintf(stderr, "%s: ", self);
		perror("malloc()");
		exit(1);
	}
	memcpy(env_copy, env, env_size);

	nargv[0] = (char *)self;
	nargc = 1;

	p = env_copy;
	while(*p && nargc < MAX_ARGS) {
		/* skip runs of blanks, so leading, trailing, or doubled
		   separators don't turn into empty arguments. */
		while(*p == ' ' || *p == '\t')
			p++;
		if(!*p)
			break;

		nargv[nargc++] = p;

		while(*p && *p != ' ' && *p != '\t')
			p++;
		if(*p)
			*p++ = '\0';
	}

	/* command-line arguments go after the environment ones.
	   start at 1 to skip the exe name. */
	for(i = 1; i < argc && nargc < MAX_ARGS; i++)
		nargv[nargc++] = argv[i];

	nargv[nargc] = NULL;
	parse_args(nargc, nargv);

	/* parse_args() has finished with the argument strings by now. */
	free(env_copy);
}

/* Return the printable symbol for an ASCII control character, space,
   or DEL. */
char *get_special(unsigned char c) {
	if(c == 0x7f)
		return "⌦"; /* delete */
	if(c <= ' ')
		return special_symbols[c];
	return "?"; /* should never happen */
}

/* Set name to use for error messages. This must be called before
   open_input(). Uses the part of argv0 after the last slash, or all
   of it if there's no slash. */
void set_self(const char *argv0) {
	self = strrchr(argv0, '/');
	if(self)
		self++;
	else
		self = argv0;
}

/* Print the current line (hex on the left, human-readable on the
   right), then reset the line buffers and column counter. */
void print_line(void) {
	int spacing = MAX_DUMP_COLS - dump_column;

	printf("%s", left_buf);

	/* line up the rightmost field (human-readable), for the partial
	   line at the end of the output (if there is one). each missing
	   column is 3 characters wide: 2 hex digits and a separator. */
	while(spacing--)
		printf("   ");

	/* a short line never got the extra mid-line separator. */
	if(dump_column < (MAX_DUMP_COLS / 2))
		putchar(' ');

	printf(" %s\n", right_buf);

	/* clear the buffers, start a new line */
	left_buf[0] = right_buf[0] = '\0';
	dump_column = 0;
}

/* Switch to the other of the two alternating 'normal' colors. */
void next_normal_hilite(void) {
	cur_normal_hilite = !cur_normal_hilite;
}

/* Append the ANSI color escape sequence for hl_type to buf. */
void append_color(char *buf, int hl_type) {
	char tmpbuf[100];
	int fgcolor, bgcolor;

	switch(hl_type) {
		case HL_NORMAL:
			fgcolor = normal_colors[cur_normal_hilite];
			bgcolor = 0;
			break;
		case HL_NORM_INV:
			fgcolor = 0;
			bgcolor = normal_colors[cur_normal_hilite];
			break;
		case HL_SPECIAL:
			fgcolor = special_color;
			bgcolor = 0;
			break;
		default:
		case HL_BAD:
			fgcolor = BAD_FG;
			bgcolor = BAD_BG;
			break;
	}

	sprintf(tmpbuf, "\x1b[%d;3%d", bold, fgcolor);
	strcat(buf, tmpbuf);

	/* a background of 0 means "leave the background alone". */
	if(bgcolor) {
		sprintf(tmpbuf, ";4%d", bgcolor);
		strcat(buf, tmpbuf);
	}

	strcat(buf, "m");
}

/* Append the monochrome (attribute-only) escape sequence for hl_type
   to buf. */
void append_mono(char *buf, int hl_type) {
	char tmpbuf[100];
	int code;

	switch(hl_type) {
		case HL_NORMAL:
		case HL_NORM_INV:
			code = cur_normal_hilite ? 4 : 0; /* underline : normal */
			break;
		case HL_SPECIAL:
			code = 1; /* bold */
			break;
		default:
		case HL_BAD:
			code = 7; /* reverse video */
			break;
	}

	sprintf(tmpbuf, "\x1b[%dm", code);
	strcat(buf, tmpbuf);
}

/* Append the highlight-on sequence for hl_type to buf, in color or
   mono as selected by the options. */
void append_hilite(char *buf, int hl_type) {
	if(mono)
		append_mono(buf, hl_type);
	else
		append_color(buf, hl_type);
}

/* Append the "all attributes off" escape sequence to buf. */
void append_hilite_off(char *buf) {
	strcat(buf, "\x1b[0m");
}

/* Append text to the human-readable (right) column. */
void append_right(char *str) {
	strcat(right_buf, str);
}

/* Append one byte, in hex, to the left column. 'dash' is true if
   another byte of the same multibyte sequence follows, in which case
   the separator is a highlighted dash instead of a space. Prints the
   line when it becomes full, and advances filepos. */
void append_left(unsigned char byte, int dash, int hl_type) {
	char tmpbuf[100];

	/* first byte of a line: start with the offset. */
	if(!dump_column)
		sprintf(left_buf, hex_word_fmt,
				(unsigned long)(filepos + display_offset));

	append_hilite(left_buf, hl_type);
	sprintf(tmpbuf, hex_byte_fmt, byte);
	strcat(left_buf, tmpbuf);

	dump_column++;

	/* the separator is doubled in the middle of the line. */
	if(dash) {
		strcat(left_buf, "-");
		if(dump_column == (MAX_DUMP_COLS / 2))
			strcat(left_buf, "-");
		append_hilite_off(left_buf);
	} else {
		append_hilite_off(left_buf);
		strcat(left_buf, " ");
		if(dump_column == (MAX_DUMP_COLS / 2))
			strcat(left_buf, " ");
	}

	if(dump_column == MAX_DUMP_COLS)
		print_line();

	filepos++;
}

/* Warn on stderr if the 2 bytes given are a UTF-16 byte order mark. */
void check_utf16(int byte0, int byte1) {
	char *endian;

	if(byte0 == 0xff && byte1 == 0xfe) {
		endian = "little";
	} else if(byte0 == 0xfe && byte1 == 0xff) {
		endian = "big";
	} else {
		return;
	}

	fprintf(stderr, "%s: input looks like UTF-16, %s-endian\n", self, endian);
}

/* Since we're not fully decoding the code points, we have to check for
   the actual UTF-8 representation of our one special multibyte char. */
int is_bom(unsigned char *b) {
	return (b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
}

/* U+10FFFF is the last valid codepoint. It encodes to f4 8f bf bf.
   'count' is the count of continuation bytes only (so, 3 for a 4-byte
   sequence). Returns true if the sequence encodes something past
   U+10FFFF. */
int is_out_of_range(int count, unsigned char *b) {
	if(count < 3)
		return 0;

	/* f5 and up are always too big, whatever follows. */
	if(b[0] > 0xf4)
		return 1;

	return (b[0] == 0xf4 && b[1] >= 0x90);
}

/* This is the 'workhorse', called for each character in the file.
   Reads one character (or one bad byte/partial sequence), updates the
   stats, and appends it to both columns of the current line.
   Return value: false = EOF, true = more data to read */
int dump_utf8_char(void) {
	unsigned char bytes[] = { 0, 0, 0, 0, 0 };
	unsigned char *cont_bytes = bytes + 1;
	char *printable;
	int bad = 0, special = 0, hl_type;
	int c, cont_count, i;
	static int byte0;

	c = fgetc(input);
	if(c == EOF)
		return 0;

	byte_count++;
	bytes[0] = (unsigned char)c;

	/* remember the first byte, so the first 2 can be checked for a
	   UTF-16 BOM. */
	if(filepos == 0) {
		byte0 = c;
	} else if(filepos == 1) {
		check_utf16(byte0, c);
	}

	/* look at 1st byte to find out how long the sequence is */
	if(c < 0x80) {
		/* 0xxxxxxx. this includes DEL (0x7f). */
		ascii_count++;
		cont_count = 0;
		if(c <= ' ' || c == 0x7f)
			special = 1;
	} else if((c & 0xe0) == 0xc0) { /* 110xxxxx */
		cont_count = 1;
	} else if((c & 0xf0) == 0xe0) { /* 1110xxxx */
		cont_count = 2;
	} else if((c & 0xf8) == 0xf0) { /* 11110xxx */
		cont_count = 3;
	} else {
		/* high bit set, but not a valid sequence-starter */
		cont_count = 0;
		bad = 1;
	}

	/* read and validate the continuation bytes, if any */
	for(i = 0; i < cont_count; i++) {
		c = fgetc(input);

		if(c == EOF) {
			/* EOF in mid-sequence. Don't return 0 here, since we still
			   have to dump the partial sequence. The next call will
			   give us EOF again. */
			cont_count = i;
			bad = 1;
			break;
		}

		if((c & 0xc0) != 0x80) {
			/* Expected 10xxxxxx, got something else. Push it back so
			   the next call sees it. It isn't counted here: it will be
			   counted when it's read again. */
			cont_count = i;
			bad = 1;
			ungetc(c, input);
			break;
		}

		byte_count++;
		cont_bytes[i] = (unsigned char)c;
	}

	if(is_out_of_range(cont_count, bytes))
		bad = 1;

	if(bad) {
		bad_count++;
	} else {
		char_count++;
		if(cont_count)
			multi_count++;
	}

	/* decide how to highlight the current character */
	if(bad) {
		hl_type = HL_BAD;
		/* replacement character � is U+FFFD */
		printable = "�";
	} else if(special) {
		hl_type = HL_SPECIAL;
		printable = get_special(bytes[0]);
	} else if(cont_count == 2 && is_bom(bytes)) {
		hl_type = HL_SPECIAL;
		printable = "B";
	} else {
		hl_type = HL_NORMAL;
		/* bytes[] has room for 4 bytes plus a NUL terminator, so it
		   can be printed as a string. */
		printable = (char *)bytes;
		next_normal_hilite();
	}

	/* human-readable (right) column: */
	append_hilite(right_buf, hl_type);
	append_right(printable);
	append_hilite_off(right_buf);

	/* hex columns: */
	if(hilite_multi && cont_count)
		hl_type = HL_NORM_INV;

	for(i = 0; i <= cont_count; i++) {
		append_left(bytes[i], (i != cont_count), hl_type);
	}

	return 1;
}

/* this only gets called when reading stdin.
   Read and discard 'bytes' bytes, which must not be more than BUFSIZ.
   Hitting EOF is not an error. */
void skip_input(unsigned int bytes) {
	char tmp[BUFSIZ];

	if(fread(tmp, 1, bytes, input) < bytes) {
		if(feof(input))
			return;

		/* this probably never happens when reading from stdin: */
		fprintf(stderr, "%s: ", self);
		perror("fread()");
		exit(1);
	}
}

/* this only gets called when reading stdin.
   Emulate a forward seek to seekpos by reading and discarding,
   BUFSIZ bytes at a time. */
void fake_seek(void) {
	long i = seekpos;

	while(i >= BUFSIZ) {
		skip_input(BUFSIZ);
		if(feof(input))
			return;
		i -= BUFSIZ;
	}

	skip_input(i);
}

/* used by -s / -S options
   A negative seekpos seeks backwards from the end of the input. */
void seek_input(void) {
	int whence = SEEK_SET;

	if(seekpos < 0) {
		whence = SEEK_END;
	}

	if(fseek(input, seekpos, whence) == 0) {
		filepos = ftell(input);
		return;
	}

	/* fseek() failed, likely we're reading stdin. fake it, if we can.
	   seeking from the end can't be faked. */
	if(whence == SEEK_SET) {
		clearerr(input);
		fake_seek();
		filepos = seekpos;
	} else {
		perror(self);
		exit(1);
	}
}

/* Dump the input, honoring the seek and limit options, then print the
   stats if -i was given. */
void dump_file(void) {
	if(seekpos)
		seek_input();

	if(seek_offset_zero)
		filepos = 0;

	while(dump_utf8_char()) {
		if(limit && (filepos >= limit))
			break;
	}

	/* handle the last line, if the file size not divisible by 16. */
	if(dump_column)
		print_line();

	if(print_info) {
		printf("Bytes: %ld\n", byte_count);
		printf("Characters: %ld\n", char_count);
		printf(" ASCII: %ld\n", ascii_count);
		printf(" Multibyte: %ld\n", multi_count);
		printf("Bad sequences: %ld\n", bad_count);
	}
}

int main(int argc, char **argv) {
	set_self(argv[0]);
	parse_options(argc, argv);
	dump_file();
	fclose(input);
	return 0;
}