aboutsummaryrefslogtreecommitdiff
path: root/uxd.c
diff options
context:
space:
mode:
Diffstat (limited to 'uxd.c')
-rw-r--r--uxd.c271
1 files changed, 271 insertions, 0 deletions
diff --git a/uxd.c b/uxd.c
new file mode 100644
index 0000000..00a2686
--- /dev/null
+++ b/uxd.c
@@ -0,0 +1,271 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/* output looks like:
+
+ 0 1 2 3 4 5 6 7 8 9 A B C D E F
+0000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 abcdefghijklmnop
+
+...first column will extend to more digits if needed.
+*/
+
+/* UTF-8 spec summary, taken from Wikipedia and elsewhere, kept here
+ for locality of reference.
+
+Codepoints 0-0x7f encode as themselves, one byte each, bit 7 always 0.
+
+0x80 and up are encoded as multiple bytes. The first byte's bit 7 is
+always 1. The top bits determine the byte length of the sequence:
+
+110 - 2 bytes
+1110 - 3 bytes
+11110 - 4 bytes
+
+Continuation bytes (the 2nd and later bytes of a sequence) have 10 as
+their top 2 bits. If we get a continuation byte that doesn't follow a
+sequence-starter, that's an error. If we get a sequence-starter, but it
+isn't followed by the correct number of continuation bytes (e.g.
+110xxxxx followed by anything that isn't 10xxxxxx), that's an error too.
+
+BOM: if the file contains ef bb bf (aka U+FEFF), it should be colorized
+as a special (non-printable).
+If the file begins with ff fe, it's UTF-16 (little endian). If it's
+fe ff, it's UTF-16 big-endian. Probably we should detect these and
+print a warning on stderr.
+*/
+
+/* max UTF-8 sequence length, in bytes */
+/* NOTE(review): not referenced by any code visible in this file. */
+#define MAXUTF8 4
+
+/* ANSI color */
+/* These values are formatted by append_color() as the digit following
+   '3' (foreground) or '4' (background) in the escape sequence.
+   append_color() treats a background of 0 as "no background", so BLACK
+   cannot be selected as a background color. */
+#define BLACK 0 /* don't use */
+#define RED 1
+#define GREEN 2
+#define YELLOW 3
+#define BLUE 4 /* don't use */
+#define PURPLE 5
+#define CYAN 6
+#define WHITE 7 /* don't use */
+
+/* foreground color for bytes that dump_utf8_char() flags as special
+   (c <= ' ' or c == 0x7f) */
+#define SPECIAL PURPLE
+
+/* foreground/background pair for bytes that are not valid UTF-8 */
+#define BAD_FG BLACK
+#define BAD_BG RED
+
+/* Foreground colors for ordinary characters; next_normal_color() steps
+   cur_normal_color through this array, wrapping at the end. */
+// const int normal_colors[] = { GREEN, PURPLE, CYAN };
+const int normal_colors[] = { GREEN, YELLOW };
+int cur_normal_color = 0;
+/* NOTE(review): dump_color is not referenced by any code visible in
+   this file. */
+int dump_color;
+
+/* program name used in messages; assigned by set_self() */
+const char *self;
+/* stream being dumped; assigned by open_input() */
+FILE *input;
+
+/* these buffers are bigger than they need to be really. */
+/* left_buf accumulates the offset and colorized hex bytes of the current
+   line, right_buf the colorized text column; print_line() prints and
+   empties both. */
+char left_buf[4096];
+char right_buf[4096];
+
+/* number of bytes shown per output line */
+#define MAX_DUMP_COLS 16
+/* number of bytes already placed on the current line */
+int dump_column = 0;
+/* count of bytes dumped so far; printed as the offset at the start of
+   each line */
+int filepos = 0;
+
+/* Print the usage text on standard output and terminate the process
+   with exit status 0. Never returns. */
+void usage(void) {
+    printf("Usage: %s <file>\n"
+           " With no <file>, or with -, read standard input.\n",
+           self);
+    exit(0);
+}
+
+/* Select the stream to dump and store it in the global 'input'.
+
+   argc  - argument count as passed to main().
+   argv1 - argv[1] as passed to main(); may be NULL when argc < 2.
+
+   With no file argument, or with "-", standard input is used.
+   Any other argument starting with '-' is treated as an unknown option
+   and prints the usage text. More than one argument also prints the
+   usage text: previously that case fell through without assigning
+   'input', leaving it NULL for the later fgetc() calls.
+   If the file cannot be opened, an error is printed on stderr and the
+   process exits with status 1. */
+void open_input(const int argc, const char *argv1) {
+    if(argc < 2 || argv1 == NULL) {
+        input = stdin;
+        return;
+    }
+
+    /* only a single file argument is supported */
+    if(argc > 2) {
+        usage();
+    }
+
+    /* "-x" style options are not supported; a lone "-" means stdin */
+    if(argv1[0] == '-' && argv1[1] != '\0') {
+        usage();
+    }
+
+    if(strcmp(argv1, "-") == 0) {
+        input = stdin;
+        return;
+    }
+
+    input = fopen(argv1, "rb");
+    if(!input) {
+        fprintf(stderr, "%s: ", self);
+        perror(argv1);
+        exit(1);
+    }
+}
+
+/* Printable stand-ins for the byte values 0x00 through 0x20 (space),
+   indexed by the byte value itself. */
+char * const special_symbols[] = {
+    "␀", "␁", "␂", "␃", "␄", "␅", "␆", "␇",
+    "␈", "⇥", "↵", "␋", "␌", "␍", "␎", "␏",
+    "␐", "␑", "␒", "␓", "␔", "␕", "␖", "␗",
+    "␘", "␙", "␚", "␛", "␜", "␝", "␞", "␟",
+    "␣",
+};
+
+/* Return the printable stand-in for a control character, space or DEL.
+   For any other byte value "?" is returned. The returned string must
+   not be modified or freed. */
+char *get_special(unsigned char c) {
+    if(c <= ' ')
+        return special_symbols[c];
+
+    /* DEL has its own glyph; anything else should never be asked for */
+    return (c == 0x7f) ? "⌦" : "?";
+}
+
+/* Point the global 'self' at the part of argv0 after its last '/',
+   or at argv0 itself when it contains no '/'. */
+void set_self(const char *argv0) {
+    const char *slash = strrchr(argv0, '/');
+
+    self = slash ? slash + 1 : argv0;
+}
+
+/* Advance cur_normal_color to the next index of normal_colors[],
+   wrapping back to 0 after the last entry. */
+void next_normal_color() {
+    cur_normal_color = (cur_normal_color + 1)
+        % (sizeof(normal_colors) / sizeof(int));
+}
+
+/* Append an escape sequence of the form ESC[0;3<fg>m, or
+   ESC[0;3<fg>;4<bg>m when bgcolor is nonzero, to the NUL-terminated
+   string in buf. The caller must provide enough room in buf; no bounds
+   checking is done here. */
+void append_color(char *buf, int fgcolor, int bgcolor) {
+    char seq[100];
+    int len;
+
+    /* assemble the whole sequence first, then append it in one go */
+    len = sprintf(seq, "\x1b[0;3%d", fgcolor);
+    if(bgcolor != 0)
+        len += sprintf(seq + len, ";4%d", bgcolor);
+    strcpy(seq + len, "m");
+
+    strcat(buf, seq);
+}
+
+/* Print the current line (left_buf, padding, right_buf) on standard
+   output and empty both buffers.
+
+   A short line is padded so the text column starts where it would on a
+   full line: each missing byte occupies 3 columns (two hex digits plus
+   a space), and the extra gap that append_left() inserts after the 8th
+   byte has to be made up when fewer than 8 bytes are on the line. */
+void print_line(void) {
+    int missing = MAX_DUMP_COLS - dump_column;
+
+    printf("%s", left_buf);
+
+    /* append_left() adds the mid-line gap while dump_column is 7 */
+    if(dump_column <= 7)
+        putchar(' ');
+
+    /* "> 0" so an unexpected negative count cannot loop */
+    while(missing-- > 0)
+        fputs("   ", stdout);
+
+    printf(" %s\n", right_buf);
+    left_buf[0] = right_buf[0] = '\0';
+}
+
+/* Append the attribute-reset sequence ESC[0m to the NUL-terminated
+   string in buf. The caller must provide enough room in buf. */
+void append_color_off(char *buf) {
+    char *end = buf + strlen(buf);
+
+    strcpy(end, "\x1b[0m");
+}
+
+/* Append str to the text column buffer (right_buf). No bounds checking
+   is done. */
+void append_right(char *str) {
+    strcpy(right_buf + strlen(right_buf), str);
+}
+
+/* Add one byte, as two colorized hex digits, to the hex column buffer
+   (left_buf).
+
+   byte             - the value to show.
+   fgcolor, bgcolor - colors handed to append_color().
+
+   The first byte of a line also writes the line's offset (filepos) at
+   the start of left_buf. An extra space is added after the 8th byte.
+   When the line reaches MAX_DUMP_COLS bytes it is printed with
+   print_line() and the column counter restarts at 0. filepos is
+   incremented for every byte. */
+void append_left(unsigned char byte, int fgcolor, int bgcolor) {
+    char hex[100];
+
+    /* a new line starts with the file offset */
+    if(dump_column == 0)
+        sprintf(left_buf, "%04x: ", filepos);
+
+    sprintf(hex, "%02x", byte);
+
+    append_color(left_buf, fgcolor, bgcolor);
+    strcat(left_buf, hex);
+    append_color_off(left_buf);
+    strcat(left_buf, " ");
+
+    /* visual gap between the two halves of the line */
+    if(dump_column == 7)
+        strcat(left_buf, " ");
+
+    if(++dump_column == MAX_DUMP_COLS) {
+        print_line();
+        dump_column = 0;
+    }
+
+    filepos++;
+}
+
+/* Read one UTF-8 encoded character (or one invalid byte sequence) from
+   'input' and add it to the current output line: its bytes go to the
+   hex column via append_left(), its printable form to the text column
+   via append_right().
+
+   Returns 0 if end of input was reached before any byte was read,
+   1 otherwise.
+
+   Classification:
+   - 0x00-0x20 and 0x7f are "special" and are shown with the glyph from
+     get_special() in the SPECIAL color.
+   - Other 7-bit bytes and complete multi-byte sequences are shown as
+     themselves, alternating through normal_colors[].
+   - Everything else is "bad" and shown as U+FFFD in BAD_FG on BAD_BG:
+     a stray continuation byte, a lead byte that can never occur in
+     UTF-8 (0xc0, 0xc1, 0xf5 and up), or a sequence cut short by end of
+     input or by a byte that is not a continuation byte. In the last
+     case the offending byte is pushed back so it starts the next
+     character.
+
+   Only the lead byte and the 10xxxxxx shape of continuation bytes are
+   checked; overlong 3/4-byte forms and surrogates are not detected. */
+int dump_utf8_char(void) {
+    unsigned char bytes[] = { 0, 0, 0, 0, 0 }; /* lead + 3 cont. + NUL */
+    unsigned char *cont_bytes = bytes + 1;
+    char *printable;
+    int bad = 0, special = 0;
+    int cont_count = 0;
+    int c, i, fg, bg;
+
+    c = fgetc(input);
+    if(c == EOF)
+        return 0;
+
+    bytes[0] = (unsigned char)c;
+
+    if(c <= 0x7f) {
+        /* 7-bit: 0x7f (DEL) belongs here too, it is not a bad byte */
+        if(c <= ' ' || c == 0x7f)
+            special = 1;
+    } else if(c == 0xc0 || c == 0xc1 || c > 0xf4) {
+        /* these values never appear in well-formed UTF-8 */
+        bad = 1;
+    } else if((c & 0xe0) == 0xc0) { /* 110xxxxx */
+        cont_count = 1;
+    } else if((c & 0xf0) == 0xe0) { /* 1110xxxx */
+        cont_count = 2;
+    } else if((c & 0xf8) == 0xf0) { /* 11110xxx */
+        cont_count = 3;
+    } else {
+        /* 10xxxxxx: continuation byte without a sequence-starter */
+        bad = 1;
+    }
+
+    for(i = 0; i < cont_count; i++) {
+        int cb;
+        c = fgetc(input);
+
+        if(c == EOF) {
+            /* EOF in mid-sequence */
+            cont_count = i;
+            bad = 1;
+            break;
+        }
+
+        cb = cont_bytes[i] = (unsigned char)c;
+        if((cb & 0xc0) != 0x80) {
+            /* Expected 10xxxxxx, got something else. Only the bytes
+               before this one are reported as bad; this one is put
+               back so it is read again as the start of a character. */
+            cont_count = i;
+            bad = 1;
+            ungetc(cb, input);
+            break;
+        }
+    }
+
+    /* TODO: handle BOM? what about combining diacritics? */
+    if(bad) {
+        fg = BAD_FG;
+        bg = BAD_BG;
+        /* replacement character � is U+FFFD */
+        printable = "�";
+    } else if(special) {
+        fg = SPECIAL;
+        bg = 0;
+        printable = get_special(bytes[0]);
+    } else {
+        fg = normal_colors[cur_normal_color];
+        bg = 0;
+        /* bytes[] is zero-initialized, so it is NUL-terminated */
+        printable = (char *)bytes;
+        next_normal_color();
+    }
+
+    append_color(right_buf, fg, bg);
+    append_right(printable);
+    append_color_off(right_buf);
+
+    /* lead byte plus however many continuation bytes were accepted */
+    for(i = 0; i <= cont_count; i++) {
+        append_left(bytes[i], fg, bg);
+    }
+
+    return 1;
+}
+
+/* Dump all of 'input': call dump_utf8_char() until it reports end of
+   input, then print the last line if it is only partly filled. */
+void dump_file(void) {
+    for(;;) {
+        if(!dump_utf8_char())
+            break;
+    }
+
+    /* full lines are printed as they fill up; flush what is left */
+    if(dump_column != 0)
+        print_line();
+}
+
+/* Program entry point: record the program name, select the input
+   stream, dump it, close it. Always returns 0 from here; open_input()
+   and usage() exit on their own for bad arguments or an unopenable
+   file. */
+int main(int argc, char **argv) {
+ set_self(argv[0]);
+ /* open_input() checks argc before it looks at argv[1] */
+ open_input(argc, argv[1]);
+ dump_file();
+ fclose(input);
+ return 0;
+}