/*
 * uxd.c - hex dump that decodes its input as UTF-8 and colorizes each
 * character, so the hex bytes on the left can be matched with the
 * glyph on the right.
 *
 * Provenance: initial commit 4df7bb4d762ff945fb7a823cb4c153cab7e3c273
 * by "B. Watson", Thu, 12 Dec 2024 06:21:05 -0500.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* NOTE(review): the original listed a fourth #include whose name was lost;
   nothing in this file needs anything beyond the three headers above. */

/* output looks like:

       0  1  2  3  4  5  6  7   8  9  A  B  C  D  E  F
0000: 00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  abcdefghijklmnop

...first column will extend to more digits if needed.
(The column header row is not printed by the code below.)
*/

/* UTF-8 spec summary, taken from Wikipedia and elsewhere, kept here
   for locality of reference.

Codepoints 0-0x7f encode as themselves, one byte each, bit 7 always 0.

0x80 and up are encoded as multiple bytes. The first byte's bit 7 is
always 1. The top bits determine the byte length of the sequence:

110   - 2 bytes
1110  - 3 bytes
11110 - 4 bytes

Continuation (2nd and further bytes) have 10 as the top 2 bits. If
we get a continuation that's not after a sequence-starter, that's an
error. If we get a sequence-starter, but the sequence doesn't have
the correct number of continuation bytes (e.g. 110xxxxx followed by
anything that isn't 10xxxxxx), that's an error too.

Lead bytes c0, c1 and f5-f7 match the bit patterns above but can never
appear in valid UTF-8 (c0/c1 could only start overlong encodings, f5-f7
would encode values above U+10FFFF), so they are errors as well.

BOM: if the file contains ef bb bf (aka U+FEFF), it should be colorized
as a special (non-printable).
If the file begins with ff fe, it's UTF-16 (little endian). If it's
fe ff, it's UTF-16 big-endian. Probably we should detect these and
print a warning on stderr.
*/

/* max UTF-8 sequence length, in bytes */
#define MAXUTF8 4

/* ANSI color */
#define BLACK  0 /* don't use */
#define RED    1
#define GREEN  2
#define YELLOW 3
#define BLUE   4 /* don't use */
#define PURPLE 5
#define CYAN   6
#define WHITE  7 /* don't use */

#define SPECIAL PURPLE

#define BAD_FG BLACK
#define BAD_BG RED

/* Colors cycled through for consecutive valid, printable characters.
   Alternative palette that was tried: { GREEN, PURPLE, CYAN } */
const int normal_colors[] = { GREEN, YELLOW };
int cur_normal_color = 0;
int dump_color;

const char *self;
FILE *input;

/* these buffers are bigger than they need to be really. */
char left_buf[4096];
char right_buf[4096];

#define MAX_DUMP_COLS 16
/* an extra space is inserted after this many bytes on a line */
#define MID_GAP_COLS (MAX_DUMP_COLS / 2)
/* width of one hex column: two digits plus a space */
#define HEX_COL_WIDTH 3

int dump_column = 0; /* number of bytes already placed on the current line */
int filepos = 0;     /* offset of the next byte to be dumped */

/* Print the usage message on stdout and exit with status 0. */
void usage(void) {
    printf("Usage: %s <file>\n", self);
    printf("  With no <file>, or with -, read standard input.\n");
    exit(0);
}

/* Select the input stream from the command line.
 *
 * argc  - main()'s argc.
 * argv1 - main()'s argv[1]; only examined when argc > 1.
 *
 * No argument or "-" selects stdin. Any other argument starting with
 * '-', or more than one argument, prints the usage message and exits.
 * Otherwise the argument is opened as a file in binary mode; on failure
 * an error is printed and the program exits with status 1.
 */
void open_input(const int argc, const char *argv1) {
    if(argc <= 1 || !argv1) {
        input = stdin;
        return;
    }

    /* Extra arguments used to leave input unset (NULL), crashing later. */
    if(argc > 2)
        usage();

    if(argv1[0] == '-' && argv1[1] != '\0')
        usage();

    if(strcmp(argv1, "-") == 0) {
        input = stdin;
        return;
    }

    input = fopen(argv1, "rb");
    if(!input) {
        fprintf(stderr, "%s: ", self);
        perror(argv1);
        exit(1);
    }
}

/* Printable stand-ins for the control characters 0x00-0x1f and space,
   indexed by byte value. */
char * const special_symbols[] = {
    "␀", "␁", "␂", "␃", "␄", "␅", "␆", "␇", "␈", "⇥", "↵", "␋", "␌", "␍", "␎", "␏",
    "␐", "␑", "␒", "␓", "␔", "␕", "␖", "␗", "␘", "␙", "␚", "␛", "␜", "␝", "␞", "␟",
    "␣",
};

/* Return the printable symbol for a "special" byte: 0x00 through space,
   or DEL (0x7f). Any other value yields "?". */
char *get_special(unsigned char c) {
    if(c == 0x7f) return "⌦";
    if(c <= ' ') return special_symbols[c];
    return "?"; /* should never happen */
}

/* Set the global program name to the basename of argv0. Falls back to
   a fixed name if argv0 is NULL (possible when argc is 0). */
void set_self(const char *argv0) {
    const char *slash;

    if(!argv0) {
        self = "uxd";
        return;
    }

    slash = strrchr(argv0, '/');
    self = slash ? slash + 1 : argv0;
}

/* Advance to the next entry of normal_colors, wrapping around. */
void next_normal_color(void) {
    const int count = (int)(sizeof normal_colors / sizeof normal_colors[0]);

    cur_normal_color = (cur_normal_color + 1) % count;
}

/* Append an ANSI SGR sequence selecting fgcolor (and bgcolor, if it is
   non-zero) to the NUL-terminated string in buf. A bgcolor of 0 means
   "no background", so a BLACK background cannot be requested. */
void append_color(char *buf, int fgcolor, int bgcolor) {
    char tmpbuf[32];

    if(bgcolor)
        snprintf(tmpbuf, sizeof tmpbuf, "\x1b[0;3%d;4%dm", fgcolor, bgcolor);
    else
        snprintf(tmpbuf, sizeof tmpbuf, "\x1b[0;3%dm", fgcolor);

    strcat(buf, tmpbuf);
}

/* Print the current line (hex column, padding, text column) and empty
   both line buffers. The caller resets dump_column. */
void print_line(void) {
    /* Pad a short line so the text column stays aligned: every missing
       byte would have taken HEX_COL_WIDTH characters, plus the mid-line
       gap if the line ended before reaching it. */
    int spacing = (MAX_DUMP_COLS - dump_column) * HEX_COL_WIDTH;

    if(dump_column < MID_GAP_COLS)
        spacing++;

    printf("%s%*s", left_buf, spacing, "");
    printf(" %s\n", right_buf);
    left_buf[0] = right_buf[0] = '\0';
}

/* Append the ANSI "reset attributes" sequence to buf. */
void append_color_off(char *buf) {
    strcat(buf, "\x1b[0m");
}

/* Append str to the text (right-hand) column of the current line. */
void append_right(char *str) {
    strcat(right_buf, str);
}

/* Append one byte, as two colored hex digits, to the hex (left-hand)
   column. Starts the line with the file offset when it is the first
   byte on the line, prints the line once it is full, and advances
   filepos. */
void append_left(unsigned char byte, int fgcolor, int bgcolor) {
    char tmpbuf[8];

    if(!dump_column)
        snprintf(left_buf, sizeof left_buf, "%04x: ", filepos);

    append_color(left_buf, fgcolor, bgcolor);
    snprintf(tmpbuf, sizeof tmpbuf, "%02x", byte);
    strcat(left_buf, tmpbuf);
    append_color_off(left_buf);
    strcat(left_buf, " ");

    dump_column++;
    if(dump_column == MID_GAP_COLS)
        strcat(left_buf, " ");

    if(dump_column == MAX_DUMP_COLS) {
        print_line();
        dump_column = 0;
    }

    filepos++;
}

/* Read one UTF-8 character (or one invalid/truncated sequence) from
 * input and add it to the current dump line.
 *
 * Valid printable characters are shown as themselves in the next
 * "normal" color; control characters, space and DEL are shown as a
 * symbol in the SPECIAL color; invalid sequences are shown as U+FFFD
 * in the BAD colors. Only the lead byte and the 10xxxxxx shape of the
 * continuation bytes are validated; overlong 3- and 4-byte forms and
 * surrogates are not detected.
 *
 * Returns 0 at end of input (nothing dumped), 1 otherwise.
 */
int dump_utf8_char(void) {
    /* one extra zero byte keeps the sequence NUL-terminated */
    unsigned char bytes[MAXUTF8 + 1] = { 0 };
    unsigned char *cont_bytes = bytes + 1;
    char *printable;
    int bad = 0, special = 0;
    int c, cont_count, i, fg, bg;

    c = fgetc(input);
    if(c == EOF)
        return 0;

    bytes[0] = (unsigned char)c;

    if(c <= 0x7f) {
        /* 0x7f (DEL) is ASCII too; "<" here used to flag it as bad. */
        cont_count = 0;
        if(c <= ' ' || c == 0x7f)
            special = 1;
    } else if(c == 0xc0 || c == 0xc1 || c > 0xf4) {
        /* never valid as a lead byte, don't pass them to the terminal */
        cont_count = 0;
        bad = 1;
    } else if((c & 0xe0) == 0xc0) { /* 110xxxxx */
        cont_count = 1;
    } else if((c & 0xf0) == 0xe0) { /* 1110xxxx */
        cont_count = 2;
    } else if((c & 0xf8) == 0xf0) { /* 11110xxx */
        cont_count = 3;
    } else {
        /* stray continuation byte (10xxxxxx) */
        cont_count = 0;
        bad = 1;
    }

    for(i = 0; i < cont_count; i++) {
        c = fgetc(input);

        if(c == EOF) {
            /* EOF in mid-sequence */
            cont_count = i;
            bad = 1;
            break;
        }

        if((c & 0xc0) != 0x80) {
            /* Expected 10xxxxxx, got something else. Push it back so
               it is dumped as the start of the next character. */
            cont_count = i;
            bad = 1;
            ungetc(c, input);
            break;
        }

        cont_bytes[i] = (unsigned char)c;
    }

    /* TODO: handle BOM? what about combining diacritics? */
    if(bad) {
        fg = BAD_FG;
        bg = BAD_BG;
        /* replacement character � is U+FFFD */
        printable = "�";
    } else if(special) {
        fg = SPECIAL;
        bg = 0;
        printable = get_special(bytes[0]);
    } else {
        fg = normal_colors[cur_normal_color];
        bg = 0;
        printable = (char *)bytes;
        next_normal_color();
    }

    /* The glyph goes into the text column before the hex bytes are
       added, so it appears on the line where its first byte lands. */
    append_color(right_buf, fg, bg);
    append_right(printable);
    append_color_off(right_buf);

    for(i = 0; i <= cont_count; i++)
        append_left(bytes[i], fg, bg);

    return 1;
}

/* Dump the whole input, then flush a final partial line if any. */
void dump_file(void) {
    while(dump_utf8_char())
        ;

    if(dump_column)
        print_line();
}

int main(int argc, char **argv) {
    int status = 0;

    set_self(argc > 0 ? argv[0] : NULL);
    open_input(argc, argc > 1 ? argv[1] : NULL);
    dump_file();

    /* fgetc() reports read errors as EOF; tell them apart here. */
    if(ferror(input)) {
        fprintf(stderr, "%s: read error\n", self);
        status = 1;
    }

    fclose(input);
    return status;
}