/* Standard headers. stdio.h, stdlib.h and string.h are required by the
   functions this file calls (printf/fopen/fgetc..., exit/getenv/strtol,
   strcmp/strlen/strcat/strrchr). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* NOTE(review): the name of the fourth header was lost; nothing visible in
   this file needs more than the three above. errno.h is a harmless
   stand-in -- confirm against the original source. */
#include <errno.h>

/* UTF-8 spec summary, taken from Wikipedia and elsewhere, kept here
   for locality of reference.

   Codepoints 0-0x7f encode as themselves, one byte each, bit 7
   always 0.

   0x80 and up are encoded as multiple bytes. The first byte's bit 7
   is always 1. The top bits determine the byte length of the sequence:

      110   - 2 bytes
      1110  - 3 bytes
      11110 - 4 bytes

   Continuation bytes (the 2nd and further bytes) have 10 as the top
   2 bits.

   If we get a continuation that's not after a sequence-starter, that's
   an error. If we get a sequence-starter, but the sequence doesn't have
   the correct number of continuation bytes (e.g. 110xxxxx followed by
   anything that isn't 10xxxxxx), that's an error too.

   Note that we don't actually do a full decode of the codepoint bits.
   It's enough to look at the top bits to keep track of multibyte
   characters.

   BOM: if the file contains ef bb bf (aka U+FEFF), it will be colorized
   as a special (non-printable).

   If the file begins with ff fe, it's UTF-16 (little endian). If it's
   fe ff, it's UTF-16 big-endian. We detect these and print a warning
   on stderr. */

/* from getopt.c */
extern int my_getopt(int, char **, char *);
extern char *my_optarg;
extern int my_optind;

/* VERSION may be supplied by the build (e.g. -DVERSION=...); this is
   the fallback when it isn't. */
#ifndef VERSION
#define VERSION "(unknown version)"
#endif

/* stdio.h normally defines BUFSIZ; fallback in case it doesn't. */
#ifndef BUFSIZ
#define BUFSIZ 4096
#endif

/* environment variables. */
#define NO_COLOR "NO_COLOR"
#define ENV_OPTS "UXD_OPTS"

/* maximum number of arguments, including environment and argv. */
#define MAX_ARGS 64

/* ANSI colors */
#define BLACK  0 /* don't use (could be the background color) */
#define RED    1
#define GREEN  2
#define YELLOW 3
#define BLUE   4 /* don't use (hard to read on many terminals) */
#define PURPLE 5
#define CYAN   6
#define WHITE  7 /* don't use (could be the background color) */

/* highlight types. */
#define HL_NORMAL   0
#define HL_NORM_INV 1
#define HL_SPECIAL  2
#define HL_SPEC_INV 3
#define HL_BAD      4

/* terminal codes for mono highlighting.
*/ #define MONO_NORMAL 0 #define MONO_UNDERLINE 4 #define MONO_BOLD 1 #define MONO_REVERSE 7 /* terminal codes to enable/disable UTF-8 mode */ #define ESC_UTF8_ON "\x1b%G" #define ESC_UTF8_OFF "\x1b%@" /* replacement character � is U+FFFD */ #define PRINT_BAD "�" #define PRINT_BOM "B" #define PRINT_OLONG "O" #define PRINT_OORANGE ">" #define PRINT_SURR "S" /* sprintf() formats for hex data */ #define LC_BYTE_FMT "%02x" #define LC_ADDR_FMT "%04x: " #define UC_BYTE_FMT "%02X" #define UC_ADDR_FMT "%04X: " /* name (read from argv[0]), for error/warning messages. */ const char *self; /* the input file, either stdin or a file we open for reading. */ FILE *input; /* default colors */ int normal_colors[] = { GREEN, YELLOW }; int special_colors[] = { PURPLE, CYAN }; int bad_color = RED; /* toggles between 0 and 1 for each normal/special character */ int cur_normal_hilite = 0; int cur_special_hilite = 0; /* these buffers are bigger than they need to be really. */ /* offset and hex bytes: */ char left_buf[4096]; /* printable form: */ char right_buf[4096]; /* dump_column ranges 0..(MAX_DUMP_COLS-1) */ #define MAX_DUMP_COLS 16 int dump_column = 0; /* where we're at in the input. */ int filepos = 0; /* Unicode control character printable equivalents. For 0, use the "empty set" symbol. It's a lot more readable than the "nul" symbol, ␀. Escape, tab, newline, space are what urxvt uses in its "keycap picture" mode. The rest of these are hard to read at normal font sizes, but it's still better than using a dot for everything like xxd does. 
*/ char * const special_symbols[] = { /* 0-0x0f: */ "∅", "␁", "␂", "␃", "␄", "␅", "␆", "␇", "␈", "⇥", "↵", "␋", "␌", "␍", "␎", "␏", /* 0x10-0x1f: */ "␐", "␑", "␒", "␓", "␔", "␕", "␖", "␗", "␘", "␙", "␚", "⎋", "␜", "␝", "␞", "␟", /* 0x20 (space): */ "␣", }; /* options */ int alternate_colors = 1; /* -1 */ int print_info_opt = 0; /* -i */ int bold = 0; /* -b */ int hilite_multi = 1; /* -r */ int mono = 0; /* -m */ long display_offset = 0; /* -o */ long seekpos = 0; /* -s, -S */ int seek_offset_zero = 0; /* -S */ long limit; /* -l */ const char *hex_byte_fmt = LC_BYTE_FMT; /* -u */ const char *hex_addr_fmt = LC_ADDR_FMT; /* " */ char *dump_data_arg = NULL; /* -d */ long dump_data_idx = 0; /* -d */ int term_utf8 = 0; /* -t, -T */ int restore_term = 0; /* -T only */ int java_mode = 0; /* -j */ int wtf8_mode = 0; /* -w */ int permissive = 0; /* -l */ /* stats for -i option */ long byte_count = 0; long ascii_count = 0; long multi_count = 0; long bad_count = 0; long char_count = 0; void usage(void) { extern char *usage_opts[]; char **opt; puts("uxd (Utf-8 heX Dump) v" VERSION " by B. Watson. 
WTFPL."); printf("Usage: %s -[options] []\n", self); puts(" With no , or with -, read standard input."); puts("Options:"); for(opt = usage_opts; *opt; opt++) { puts(*opt); } exit(0); } void version(void) { printf("%s\n", VERSION); exit(0); } void open_input(const char *arg) { if(!arg || (strcmp(arg, "-") == 0)) { input = stdin; freopen(NULL, "rb", stdin); } else { input = fopen(arg, "rb"); if(!input) { fprintf(stderr, "%s: ", self); perror(arg); exit(1); } } } void color_error(void) { fprintf(stderr, "%s: invalid -c colors (-h for help).\n", self); exit(1); } int num_to_color(char c) { if(c < '0' || c > '7') color_error(); return c - '0'; } void parse_colors(char *arg) { static int *colors[] = { &normal_colors[0], &normal_colors[1], &special_colors[0], &special_colors[1], &bad_color }; int i, c; i = strlen(arg); if(i < 1 || i > 5) color_error(); for(i = 0; i < 5; i++) { c = arg[i]; if(!c) break; *colors[i] = num_to_color(c); } } void number_err(int opt) { fprintf(stderr, "%s: invalid number for -%c option.\n", self, opt); exit(1); } long parse_number(int opt, const char *s) { char *e; long result; result = strtol(s, &e, 0); /* require at least one digit (otherwise -sk would be allowed) */ if(e == s) number_err(opt); switch(e[0]) { case 0: break; case 'b': case 'B': if(e[1]) number_err(opt); break; /* allow & ignore b/B for "bytes" */ case 'k': result *= 1024L; break; case 'm': result *= 1048576L; break; case 'g': result *= 1073741824L; break; case 't': result *= 1099511627776L; break; case 'K': result *= 1000L; break; case 'M': result *= 1000000L; break; case 'G': result *= 1000000000L; break; case 'T': result *= 1000000000000L; break; default: number_err(opt); } /* allow e.g. "kb" for kilobytes (but reject e.g. 
"kx") */ if(e[0] && e[1] && e[1] != 'b' && e[1] != 'B') number_err(opt); return result; } void parse_args(int argc, char **argv) { int opt; if(argc > 1) { if(strcmp(argv[1], "--help") == 0) usage(); if(strcmp(argv[1], "--version") == 0) version(); } while((opt = my_getopt(argc, argv, "jwptTd:1ic:nbl:rmo:S:s:uhv")) != -1) { switch(opt) { case 'j': java_mode = 1; break; case 'w': wtf8_mode = 1; break; case 'p': permissive = 1; break; case 't': term_utf8 = restore_term = 1; break; case 'T': term_utf8 = 1; restore_term = 0; break; case 'd': if(dump_data_arg) { fprintf(stderr, "%s: multiple -d options not supported.\n", self); exit(1); } dump_data_arg = my_optarg; break; case '1': alternate_colors = 0; break; case 'i': print_info_opt = 1; break; case 'c': mono = 0; parse_colors(my_optarg); break; case 'n': break; /* already handled in parse_options() */ case 'b': bold = 1; break; case 'l': limit = parse_number(opt, my_optarg); if(limit < 0) { fprintf(stderr, "%s: negative limit for -l not allowed.\n", self); exit(1); } break; case 'r': hilite_multi = 0; break; case 'm': mono = 1; break; case 'o': display_offset = parse_number(opt, my_optarg); break; case 'S': seek_offset_zero = 1; /* fall thru */ case 's': seekpos = parse_number(opt, my_optarg); break; case 'u': hex_byte_fmt = UC_BYTE_FMT; hex_addr_fmt = UC_ADDR_FMT; break; case 'h': usage(); break; case 'v': version(); break; default: exit(1); } } if(dump_data_arg) { if(my_optind != argc) { fprintf(stderr, "%s: cannot give a filename when -d is used.\n", self); exit(1); } } else { /* filename (if present) must come after all -options, and there can only be one filename. */ if(my_optind < (argc - 1)) usage(); open_input(argv[my_optind]); } } /* read options from the environment and the command line, create a new argv/argc that has all the options from both, with the environment ones first. 
*/ void parse_options(int argc, char **argv) { int nargc; char **real_argv = argv; char *nargv[MAX_ARGS + 1]; char *env, *p; if(getenv(NO_COLOR)) mono = 1; env = getenv(ENV_OPTS); if(!env) { /* nothing in the env, use regular args as-is */ parse_args(argc, argv); return; } nargv[0] = (char *)self; nargv[1] = env; nargc = 2; for(p = env; *p; p++) { if(*p == ' ' || *p == '\t') { *p = '\0'; if(nargc == MAX_ARGS) break; nargv[nargc++] = p + 1; } } argv++; /* skip exe name */ while(*argv) { /* have to check for the -n option here */ if(argv[0][0] == '-' && argv[0][1] == 'n') { parse_args(argc, real_argv); return; } if(nargc == MAX_ARGS) break; nargv[nargc++] = *argv; argv++; } nargv[nargc] = NULL; parse_args(nargc, nargv); } char *get_special(unsigned char c) { if(c == 0x7f) return "⌦"; /* tab */ if(c <= ' ') return special_symbols[c]; return "?"; /* should never happen */ } /* Set name to use for error messages. This must be called before open_input(). */ void set_self(const char *argv0) { self = strrchr(argv0, '/'); if(self) self++; else self = argv0; } void print_line(void) { int spacing = MAX_DUMP_COLS - dump_column; printf("%s", left_buf); /* line up the rightmost field (human-readable), for the partial line at the end of the output (if there is one). 
*/ while(spacing--) printf(" "); if(dump_column < (MAX_DUMP_COLS / 2)) putchar(' '); printf(" %s\n", right_buf); /* clear the buffers, start a new line */ left_buf[0] = right_buf[0] = '\0'; dump_column = 0; } void next_normal_hilite(void) { if(alternate_colors) cur_normal_hilite = !cur_normal_hilite; } void next_special_hilite(void) { if(alternate_colors) cur_special_hilite = !cur_special_hilite; } void append_color(char *buf, int hl_type) { char tmpbuf[100]; int fgcolor, bgcolor; switch(hl_type) { case HL_NORMAL: fgcolor = normal_colors[cur_normal_hilite]; bgcolor = 0; break; case HL_NORM_INV: fgcolor = 0; bgcolor = normal_colors[cur_normal_hilite]; break; case HL_SPECIAL: fgcolor = special_colors[cur_special_hilite]; bgcolor = 0; break; case HL_SPEC_INV: fgcolor = 0; bgcolor = special_colors[cur_special_hilite]; break; case HL_BAD: default: fgcolor = 0; bgcolor = bad_color; break; } sprintf(tmpbuf, "\x1b[%d;3%d", bold, fgcolor); strcat(buf, tmpbuf); if(bgcolor) { sprintf(tmpbuf, ";4%d", bgcolor); strcat(buf, tmpbuf); } sprintf(tmpbuf, "m"); strcat(buf, tmpbuf); } void append_mono(char *buf, int hl_type) { char tmpbuf[100]; int code; switch(hl_type) { case HL_NORMAL: case HL_NORM_INV: code = cur_normal_hilite ? 
MONO_UNDERLINE : MONO_NORMAL; break; case HL_SPECIAL: case HL_SPEC_INV: code = MONO_BOLD; break; default: case HL_BAD: code = MONO_REVERSE; break; } sprintf(tmpbuf, "\x1b[%dm", code); strcat(buf, tmpbuf); } void append_hilite(char *buf, int hl_type) { if(mono) append_mono(buf, hl_type); else append_color(buf, hl_type); } void append_hilite_off(char *buf) { strcat(buf, "\x1b[0m"); } void append_right(char *str) { strcat(right_buf, str); } void append_left(unsigned char byte, int dash, int hl_type) { char tmpbuf[100]; if(!dump_column) sprintf(left_buf, hex_addr_fmt, filepos + display_offset); append_hilite(left_buf, hl_type); sprintf(tmpbuf, hex_byte_fmt, byte); strcat(left_buf, tmpbuf); dump_column++; if(dash) { strcat(left_buf, "-"); if(dump_column == (MAX_DUMP_COLS / 2)) strcat(left_buf, "-"); append_hilite_off(left_buf); } else { append_hilite_off(left_buf); strcat(left_buf, " "); if(dump_column == (MAX_DUMP_COLS / 2)) strcat(left_buf, " "); } if(dump_column == MAX_DUMP_COLS) print_line(); filepos++; } void check_utf16(int byte0, int byte1) { char *endian; if(byte0 == 0xff && byte1 == 0xfe) { endian = "little"; } else if(byte0 == 0xfe && byte1 == 0xff) { endian = "big"; } else { return; } fprintf(stderr, "%s: input looks like UTF-16, %s-endian\n", self, endian); } /* Since we're not fully decoding the code points, we have to check for the actual UTF-8 representation of our one special multibyte char. */ int is_bom(unsigned char *b) { return (b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf); } /* Detect overlong encodings, without doing a full decode. */ int is_overlong(int cont_count, unsigned char *b) { /* 1 byte seqs are never overlong. */ if(!cont_count) return 0; /* 2 byte seqs, if the first byte is 0xc0 or 0xc1, it's overlong. */ if(cont_count == 1 && b[0] <= 0xc1) return 1; /* for 3 and 4 byte seqs, the 2nd byte matters too. 
*/ if(cont_count == 2 && b[0] == 0xe0 && b[1] <= 0x9f) return 1; if(cont_count == 3 && b[0] == 0xf0 && b[1] <= 0x8f) return 1; return 0; } /* U+10FFFF is the last valid codepoint. It encodes to f4 8f bf bf. 'count' is the count of continuation bytes only (so, 3 for a 4-byte sqeuence). */ int is_out_of_range(int cont_count, unsigned char *b) { if(cont_count < 3) return 0; if(b[0] < 0xf4) return 0; if(b[1] < 0x90) return 0; return 1; } /* surrogates for UTF-16 are not valid Unicode (therefore not UTF-8) */ int is_surrogate(int cont_count, unsigned char *b) { if(cont_count != 2) return 0; return b[0] == 0xed && b[1] > 0x9f; } int get_next_byte(void) { int c; if(dump_data_arg) { /* have to cast this to unsigned char and back to int, to emulate fgetc() */ c = (unsigned char)dump_data_arg[dump_data_idx++]; if(!c) c = EOF; } else { c = fgetc(input); } return c; } void push_back_byte(int c) { if(dump_data_arg) { if(dump_data_idx) dump_data_idx--; } else { ungetc(c, input); } } char *classify_char(int *hl, unsigned char *bytes, int cont_count) { char *b = (char *)bytes; int c; c = b[0]; if(cont_count == 0) { if(c <= ' ' || c == 0x7f) { *hl = HL_SPECIAL; return get_special(c); } else { *hl = HL_NORMAL; return b; } } if(cont_count == 2 && is_bom(bytes)) { *hl = HL_SPEC_INV; return PRINT_BOM; } if(is_overlong(cont_count, bytes)) { /* java mode (MUTF-8) allows exactly one overlong: */ if(java_mode && cont_count == 1 && bytes[0] == 0xc0 && bytes[1] == 0x80) { *hl = HL_SPEC_INV; return get_special(0); } else if(permissive) { *hl = HL_NORMAL; } else { *hl = HL_BAD; } return PRINT_OLONG; } if(is_surrogate(cont_count, bytes)) { if(wtf8_mode || permissive) { *hl = HL_SPEC_INV; } else { *hl = HL_BAD; } return PRINT_SURR; } if(is_out_of_range(cont_count, bytes)) { if(permissive) { *hl = HL_SPEC_INV; } else { *hl = HL_BAD; } return PRINT_OORANGE; } *hl = HL_NORMAL; return b; } /* This is the 'workhorse', called for each character in the file. 
Return value: false = EOF, true = more data to read */
int dump_utf8_char(void) {
	/* first byte of the input, kept until the second one arrives */
	static int byte0;
	/* one spare element, so the sequence is always NUL-terminated */
	unsigned char bytes[5] = { 0, 0, 0, 0, 0 };
	char *printable;
	int hl_type;
	int bad = 0;
	int c, cont_count, got, i;

	c = get_next_byte();
	if(c == EOF)
		return 0;

	byte_count++;
	bytes[0] = (unsigned char)c;

	/* the UTF-16 check needs the bytes at positions 0 and 1 */
	if(filepos == 0)
		byte0 = c;
	else if(filepos == 1)
		check_utf16(byte0, c);

	/* the lead byte tells us how many continuation bytes to expect */
	if(c < 0x80) {
		/* 0xxxxxxx */
		ascii_count++;
		cont_count = 0;
	} else if(c >= 0xc0 && c <= 0xdf) {
		/* 110xxxxx */
		cont_count = 1;
	} else if(c >= 0xe0 && c <= 0xef) {
		/* 1110xxxx */
		cont_count = 2;
	} else if(c >= 0xf0 && c <= 0xf7) {
		/* 11110xxx */
		cont_count = 3;
	} else {
		/* high bit set, but not a valid sequence-starter */
		cont_count = 0;
		bad = 1;
	}

	/* collect the continuation bytes, stopping early at the first
	   thing that isn't one. */
	got = 0;
	while(got < cont_count) {
		c = get_next_byte();

		if(c == EOF) {
			/* EOF in mid-sequence. We can't return 0 yet, the partial
			   sequence still has to be dumped; the next call will see
			   EOF again. */
			bad = 1;
			break;
		}

		byte_count++;
		bytes[got + 1] = (unsigned char)c;

		if((c & 0xc0) != 0x80) {
			/* not 10xxxxxx: this byte belongs to whatever comes next,
			   so give it back and un-count it. */
			push_back_byte(c);
			byte_count--;
			bad = 1;
			break;
		}

		got++;
	}

	/* only the bytes that really belong to this sequence get dumped */
	cont_count = got;

	if(bad) {
		hl_type = HL_BAD;
		printable = PRINT_BAD;
	} else {
		printable = classify_char(&hl_type, bytes, cont_count);
	}

	/* statistics for -i */
	if(hl_type == HL_BAD) {
		bad_count++;
	} else {
		char_count++;
		if(cont_count)
			multi_count++;
	}

	/* normal multibyte characters are shown inverted, unless that has
	   been turned off */
	if(cont_count && hilite_multi && hl_type == HL_NORMAL)
		hl_type = HL_NORM_INV;

	/* right column: the character (or its stand-in) */
	append_hilite(right_buf, hl_type);
	append_right(printable);
	append_hilite_off(right_buf);

	/* left column: each byte in hex, dash-joined within the sequence */
	for(i = 0; i <= cont_count; i++)
		append_left(bytes[i], (i != cont_count), hl_type);

	/* move on to the alternate color for the next character of the
	   same kind */
	switch(hl_type) {
		case HL_NORMAL:
		case HL_NORM_INV:
			next_normal_hilite();
			break;
		case HL_SPECIAL:
		case HL_SPEC_INV:
			next_special_hilite();
			break;
		default:
			break;
	}

	return 1;
}

/* this only gets called when reading stdin.
*/ void skip_input(unsigned int bytes) { char tmp[BUFSIZ]; if(fread(tmp, 1, bytes, input) < bytes) { if(feof(input)) return; /* this probably never happens when reading from stdin: */ fprintf(stderr, "%s: ", self); perror("fread()"); exit(1); } } /* this only gets called when reading stdin. */ void fake_seek(void) { long i = seekpos; while(i >= BUFSIZ) { skip_input(BUFSIZ); if(feof(input)) return; i -= BUFSIZ; } skip_input(i); } /* used by -s / -S options */ void seek_input(void) { int whence = SEEK_SET; if(seekpos < 0) { whence = SEEK_END; } if(fseek(input, seekpos, whence) == 0) { filepos = ftell(input); return; } /* fseek() failed, likely we're reading stdin. fake it, if we can. */ if(whence == SEEK_SET) { clearerr(input); fake_seek(); filepos = seekpos; } else { fprintf(stderr, "%s: are you trying to seek backwards in stdin?\n", self); perror(self); exit(1); } } void print_info(void) { printf("\nBytes: %ld\n", byte_count); printf("Valid characters: %ld\n", char_count); printf(" ASCII: %ld\n", ascii_count); printf(" Multibyte: %ld\n", multi_count); printf("Bad sequences: %ld\n", bad_count); } void dump_loop(void) { while(dump_utf8_char()) if(limit && (byte_count >= limit)) break; /* handle the last line, if the file size not divisible by 16. */ if(dump_column) print_line(); } void dump_file(void) { if(seekpos) seek_input(); if(seek_offset_zero) filepos = 0; dump_loop(); fclose(input); } void dump_data(void) { int datalen; datalen = strlen(dump_data_arg); if(seekpos >= datalen) return; if(seekpos < 0) dump_data_idx = datalen + seekpos; else if(seekpos) dump_data_idx = seekpos; if(seek_offset_zero) filepos = 0; else filepos = dump_data_idx; dump_loop(); } int main(int argc, char **argv) { set_self(argv[0]); parse_options(argc, argv); if(term_utf8) /* -t, -T */ fputs(ESC_UTF8_ON, stdout); if(dump_data_arg) dump_data(); /* -d */ else dump_file(); if(print_info_opt) /* -i */ print_info(); if(restore_term) /* -T */ fputs(ESC_UTF8_OFF, stdout); return 0; }