#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* UTF-8 spec summary, taken from Wikipedia and elsewhere, kept here
   for locality of reference.

Codepoints 0-0x7f encode as themselves, one byte each, bit 7 always 0.

0x80 and up are encoded as multiple bytes. The first byte's bit 7 is
always 1. The top bits determine the byte length of the sequence:

110 - 2 bytes
1110 - 3 bytes
11110 - 4 bytes

Continuation bytes (the 2nd and later bytes of a sequence) have 10 as
the top 2 bits. If we get a continuation byte that doesn't follow a
sequence-starter, that's an error. If we get a sequence-starter, but
the sequence doesn't have the correct number of continuation bytes
(e.g. 110xxxxx followed by anything that isn't 10xxxxxx), that's an
error too.

Note that we don't actually do a full decode of the codepoint bits.
It's enough to look at the top bits to keep track of multibyte
characters.

BOM: if the file contains ef bb bf (aka U+FEFF), it will be colorized
as a special (non-printable).

If the file begins with ff fe, it's UTF-16 (little endian). If it's
fe ff, it's UTF-16 big-endian. We detect these and print a warning
on stderr.
*/

/* Option parser and its state, from getopt.c (a separate source file).
   parse_args() calls my_getopt() repeatedly until it returns -1, reads
   the current option's argument from optarg, and afterwards treats
   argv[optind] as the input file name. */
extern int my_getopt(int, char **, char *);
extern char *optarg;
extern int optind;

/* Fallback version string, used when the build doesn't define VERSION. */
#ifndef VERSION
#define VERSION "(unknown version)"
#endif

/* Fallback in case <stdio.h> didn't define BUFSIZ. It is the chunk size
   used by fake_seek() / skip_input() when input has to be read and
   discarded instead of seeked. */
#ifndef BUFSIZ
#define BUFSIZ 4096
#endif

/* Name of the environment variable that, when set, turns on mono. */
#define NO_COLOR "NO_COLOR"
/* Name of the environment variable that holds extra options. */
#define ENV_OPTS "UXD_OPTS"
/* Upper bound on the number of entries in the combined argument list
   built by parse_options(). */
#define MAX_ARGS 64

/* ANSI colors. append_color() emits these as the last digit of the
   3x (foreground) and 4x (background) codes. A background of 0 means
   "don't emit a background code". */
#define BLACK  0 /* don't use (could be the background color) */
#define RED    1
#define GREEN  2
#define YELLOW 3
#define BLUE   4 /* don't use (hard to read on many terminals) */
#define PURPLE 5
#define CYAN   6
#define WHITE  7 /* don't use (could be the background color) */

/* Default color for special (non-printable) characters. */
#define SPECIAL PURPLE

/* Colors for invalid bytes. BAD_BG expands to the bad_color variable
   rather than a constant, so -c can change it at run time. */
#define BAD_FG BLACK
#define BAD_BG bad_color

/* Colors for well-formed, printable characters. Each such character
   advances to the next entry (see next_normal_color()), so neighbouring
   characters can be told apart. -c replaces both entries. */
int normal_colors[] = { GREEN, YELLOW };
int cur_normal_color = 0; /* index into normal_colors[] for the next character */
int dump_color; /* NOTE(review): not referenced anywhere else in this file */
int bad_color = RED; /* background for invalid bytes (BAD_BG); 4th digit of -c */
int special_color = SPECIAL; /* for specials and the BOM; 3rd digit of -c */

const char *self; /* program name used in messages; set by set_self() */
FILE *input; /* stream being dumped; set by open_input() */

/* Text of the output line currently being built: left_buf holds the
   offset and the hex bytes, right_buf holds the human-readable column.
   Both contain color escape sequences as well as visible text.
   These buffers are bigger than they need to be really. */
char left_buf[4096];
char right_buf[4096];

/* Number of bytes shown on each output line. */
#define MAX_DUMP_COLS 16
int dump_column = 0; /* bytes already placed on the current line */
int filepos = 0; /* offset of the next byte to be dumped */

/* Unicode control character printable equivalents, indexed by byte
   value 0x00 through 0x20 (33 entries; the last one is for space).
   For 0, use the "empty set" symbol. It's a lot more readable than
   the "nul" symbol, ␀. Escape, tab, newline, space are what urxvt uses
   in its "keycap picture" mode. The rest of these are hard to read at
   normal font sizes, but it's still better than using a dot for
   everything like xxd does. */
char * const special_symbols[] = {
	"∅", "␁", "␂", "␃", "␄", "␅", "␆", "␇", "␈", "⇥", "↵", "␋", "␌", "␍", "␎", "␏",
	"␐", "␑", "␒", "␓", "␔", "␕", "␖", "␗", "␘", "␙", "␚", "⎋", "␜", "␝", "␞", "␟",
	"␣",
};

/* options (set by parse_options() / parse_args()) */
int bold = 0; /* -b: emitted as the first parameter of every color sequence */
int hilite_multi = 0; /* -r: swap fg/bg in the hex column for multibyte sequences */
/* -m, or NO_COLOR set in the environment; -c resets it to 0.
   NOTE(review): nothing in this file reads mono after it is set --
   confirm whether monochrome output is implemented. */
int mono = 0; /* -m */
long display_offset = 0; /* -o: added to filepos when printing the offset field */
long seekpos = 0; /* -s, -S: where to start; negative means relative to the end */
int seek_offset_zero = 0; /* -S: after seeking, restart filepos at 0 */
long limit; /* -l: dump_file() stops once filepos >= limit; 0 means no limit */
const char *hex_byte_fmt = "%02x";   /* -u switches both formats to uppercase */
const char *hex_word_fmt = "%04x: "; /* "  */

/* Print the banner and a short usage summary on standard output, then
   exit with status 0. Does not return. */
void usage(void) {
	printf(
		"uxd (Utf-8 heX Dump) v" VERSION " by B. Watson. WTFPL.\n"
		"Usage: %s [<file>]\n"
		"  With no <file>, or with -, read standard input.\n",
		self);
	exit(0);
}

/* Print the version string on a line by itself and exit with status 0.
   Does not return. */
void version(void) {
	puts(VERSION); /* puts() supplies the trailing newline */
	exit(0);
}

/* Select the stream to dump and store it in 'input'.
   arg: file name; NULL or "-" selects standard input.
   If the file can't be opened, prints "<self>: <arg>: <reason>" on
   stderr and exits with status 1. */
void open_input(const char *arg) {
	if(arg == NULL || strcmp(arg, "-") == 0) {
		input = stdin;
		return;
	}

	/* binary mode: we want the raw bytes */
	input = fopen(arg, "rb");
	if(input != NULL)
		return;

	fprintf(stderr, "%s: ", self);
	perror(arg);
	exit(1);
}

/* Complain about a malformed -c argument on stderr and exit with
   status 1. Does not return. */
void color_error(void) {
	fputs(self, stderr);
	fputs(": invalid -c colors (-h for help).\n", stderr);
	exit(1);
}

/* Validate one character of a -c argument: it has to be a digit from
   '0' to '7'. Anything else (including the terminating NUL) ends the
   program via color_error(). */
void check_color(char c) {
	if(c >= '0' && c <= '7')
		return;

	color_error();
}

/* Parse the argument of -c: 2 to 4 digits, each '0' through '7'.
   Digits 1 and 2 set the two normal colors, digit 3 (optional) the
   special color, digit 4 (optional) the bad color. Characters after
   the 4th are ignored. An invalid digit ends the program via
   check_color(). */
void parse_colors(char *arg) {
	const char *spec = arg;

	if(spec[0] == '\0')
		return; /* should never happen anyway */

	/* the first two digits are mandatory; validate both before use */
	check_color(spec[0]);
	check_color(spec[1]);
	normal_colors[0] = spec[0] - '0';
	normal_colors[1] = spec[1] - '0';

	if(spec[2] != '\0') {
		/* optional 3rd digit */
		check_color(spec[2]);
		special_color = spec[2] - '0';

		if(spec[3] != '\0') {
			/* optional 4th digit */
			check_color(spec[3]);
			bad_color = spec[3] - '0';
		}
	}
}

/* Convert the argument of a numeric option to a long. Base 0 lets
   strtol() accept decimal, octal (leading 0) and hex (leading 0x).
   TODO: error checking -- input that isn't a number yields 0, and
   trailing junk is silently ignored. */
long parse_number(const char *s) {
	long value;

	value = strtol(s, NULL, 0);
	return value;
}

/* Process an argument vector: set the option globals, then open the
   input named by the first non-option argument (or stdin if there is
   none). Exits via usage() / version() / exit(1) where appropriate. */
void parse_args(int argc, char **argv) {
	int ch;

	/* the long options are only recognized as the very first argument */
	if(argc > 1) {
		if(!strcmp(argv[1], "--help"))
			usage();
		if(!strcmp(argv[1], "--version"))
			version();
	}

	for(;;) {
		ch = my_getopt(argc, argv, "c:nbl:rmo:S:s:uhv");
		if(ch == -1)
			break;

		switch(ch) {
			case 'b':
				bold = 1;
				break;
			case 'c':
				/* explicit colors override monochrome */
				mono = 0;
				parse_colors(optarg);
				break;
			case 'h':
				usage();
				break;
			case 'l':
				limit = parse_number(optarg);
				break;
			case 'm':
				mono = 1;
				break;
			case 'n':
				/* already handled in parse_options() */
				break;
			case 'o':
				display_offset = parse_number(optarg);
				break;
			case 'r':
				hilite_multi = 1;
				break;
			case 'S':
				/* same as -s, but offsets are shown starting from 0 */
				seek_offset_zero = 1;
				seekpos = parse_number(optarg);
				break;
			case 's':
				seekpos = parse_number(optarg);
				break;
			case 'u':
				hex_byte_fmt = "%02X";
				hex_word_fmt = "%04X: ";
				break;
			case 'v':
				version();
				break;
			default:
				exit(1);
		}
	}

	/* filename (if present) must come after all -options, and
	   there can only be one filename. */
	if(optind < argc - 1)
		usage();

	/* argv[optind] is NULL when no filename was given */
	open_input(argv[optind]);
}

/* Read options from the environment and the command line: build a new
   argv/argc that has the options from both, with the environment ones
   first, and hand it to parse_args().

   - If NO_COLOR is set in the environment, mono is turned on.
   - If ENV_OPTS is not set, or any command-line argument starts with
     "-n", the environment options are ignored and the command line is
     parsed as-is.
   - ENV_OPTS is split on spaces and tabs. Leading, trailing and
     repeated whitespace produce no arguments (an empty argument would
     stop option parsing and be taken for a file name).
   - At most MAX_ARGS entries are kept; anything beyond that is dropped.

   argc/argv: the arguments main() received. */
void parse_options(int argc, char **argv) {
	char *nargv[MAX_ARGS + 1];
	int nargc = 0;
	const char *env;
	char *copy, *p;
	size_t len;
	int i;

	if(getenv(NO_COLOR))
		mono = 1;

	env = getenv(ENV_OPTS);
	if(!env) {
		/* nothing in the env, use regular args as-is */
		parse_args(argc, argv);
		return;
	}

	/* have to check for the -n option here, before the environment
	   options get mixed in. Only arguments that begin with "-n" are
	   recognized. */
	for(i = 1; i < argc && argv[i]; i++) {
		if(argv[i][0] == '-' && argv[i][1] == 'n') {
			parse_args(argc, argv);
			return;
		}
	}

	/* The string returned by getenv() must not be modified, so split
	   a private copy. The copy is deliberately never freed: entries of
	   nargv (and optarg) point into it. */
	len = strlen(env) + 1;
	copy = malloc(len);
	if(!copy) {
		fprintf(stderr, "%s: ", self);
		perror("malloc()");
		exit(1);
	}
	memcpy(copy, env, len);

	nargv[nargc++] = (char *)self;

	p = copy;
	while(nargc < MAX_ARGS) {
		/* skip the whitespace before the next word */
		while(*p == ' ' || *p == '\t')
			p++;
		if(!*p)
			break;

		nargv[nargc++] = p;

		/* find the end of the word and terminate it */
		while(*p && *p != ' ' && *p != '\t')
			p++;
		if(*p)
			*p++ = '\0';
	}

	/* command-line arguments go after the environment ones */
	for(i = 1; i < argc && argv[i] && nargc < MAX_ARGS; i++)
		nargv[nargc++] = argv[i];

	nargv[nargc] = NULL;
	parse_args(nargc, nargv);
}

/* Return the printable stand-in for a special character: the matching
   special_symbols[] entry for 0x00 through 0x20 (space), or the
   "erase" symbol for 0x7f (DEL). Other values get "?". */
char *get_special(unsigned char c) {
	if(c <= ' ')
		return special_symbols[c];

	/* 0x7f is DEL; anything else should never happen */
	return (c == 0x7f) ? "⌦" : "?";
}

/* Set the name used in error messages: argv0 with any leading
   directories removed. This must be called before open_input(). */
void set_self(const char *argv0) {
	const char *slash = strrchr(argv0, '/');

	/* no slash at all: the whole string is the name */
	self = slash ? slash + 1 : argv0;
}

/* Write the current line (left_buf, padding, right_buf) to standard
   output, then empty both buffers and reset dump_column so the next
   byte starts a new line. */
void print_line(void) {
	int pad;

	fputs(left_buf, stdout);

	/* line up the rightmost field (human-readable), for the partial
	   line at the end of the output (if there is one): 3 columns for
	   every missing byte... */
	for(pad = MAX_DUMP_COLS - dump_column; pad != 0; pad--)
		fputs("   ", stdout);

	/* ...plus the extra separator in the middle of the line, if we
	   never got that far. */
	if(dump_column < (MAX_DUMP_COLS / 2))
		putchar(' ');

	printf(" %s\n", right_buf);

	/* clear the buffers, start a new line */
	right_buf[0] = '\0';
	left_buf[0] = '\0';
	dump_column = 0;
}

/* Advance cur_normal_color to the next entry of normal_colors[],
   wrapping back to the first one after the last. */
void next_normal_color(void) {
	/* element count derived from the array itself, so it stays right
	   if the element type or the number of colors changes */
	int count = (int)(sizeof(normal_colors) / sizeof(normal_colors[0]));

	cur_normal_color = (cur_normal_color + 1) % count;
}

/* Append a "set color" escape sequence to the string in buf.
   buf:     NUL-terminated string with room for the sequence.
   fgcolor: foreground color, 0-7.
   bgcolor: background color, 1-7, or 0 for no background code.
   The sequence is ESC [ <bold> ; 3<fg> [ ; 4<bg> ] m. */
void append_color(char *buf, int fgcolor, int bgcolor) {
	char *tail = buf + strlen(buf);

	tail += sprintf(tail, "\x1b[%d;3%d", bold, fgcolor);
	if(bgcolor != 0)
		tail += sprintf(tail, ";4%d", bgcolor);
	strcpy(tail, "m");
}

/* Append the "reset all attributes" escape sequence (ESC [ 0 m) to the
   NUL-terminated string in buf. */
void append_color_off(char *buf) {
	strcpy(buf + strlen(buf), "\x1b[0m");
}

/* Append str to the human-readable (right) column of the current line. */
void append_right(char *str) {
	strcpy(right_buf + strlen(right_buf), str);
}

/* Append one byte to the hex (left) column of the current line, and
   advance dump_column and filepos. Prints the line when it's full.
   byte:    the value to show.
   dash:    nonzero if another byte of the same character follows; the
            separator is then a colored "-" instead of a plain space.
   fgcolor, bgcolor: colors, passed on to append_color(). */
void append_left(unsigned char byte, int dash, int fgcolor, int bgcolor) {
	char tmpbuf[100];

	/* First byte on the line: (re)start the buffer with the offset.
	   The sum is a long, but hex_word_fmt uses an unsigned int
	   conversion (%x or %X), so convert explicitly: passing a long
	   there is undefined behavior. */
	if(!dump_column)
		sprintf(left_buf, hex_word_fmt, (unsigned int)(filepos + display_offset));

	append_color(left_buf, fgcolor, bgcolor);
	sprintf(tmpbuf, hex_byte_fmt, byte);
	strcat(left_buf, tmpbuf);

	dump_column++;

	/* The separator is doubled after the 8th byte, to split the line
	   into two visual halves. */
	if(dash) {
		/* the dash is part of the character, so it stays colored */
		strcat(left_buf, "-");
		if(dump_column == (MAX_DUMP_COLS / 2))
			strcat(left_buf, "-");
		append_color_off(left_buf);
	} else {
		append_color_off(left_buf);
		strcat(left_buf, " ");
		if(dump_column == (MAX_DUMP_COLS / 2))
			strcat(left_buf, " ");
	}

	if(dump_column == MAX_DUMP_COLS)
		print_line();

	filepos++;
}

/* Warn on stderr if the two given bytes are a UTF-16 byte order mark
   (ff fe = little-endian, fe ff = big-endian). Otherwise do nothing. */
void check_utf16(int byte0, int byte1) {
	const char *order = NULL;

	if(byte0 == 0xff && byte1 == 0xfe)
		order = "little";
	else if(byte0 == 0xfe && byte1 == 0xff)
		order = "big";

	if(order != NULL)
		fprintf(stderr, "%s: input looks like UTF-16, %s-endian\n", self, order);
}

/* Code points aren't fully decoded, so the one multibyte character
   that gets special treatment (the BOM, U+FEFF) is recognized by its
   UTF-8 encoding: ef bb bf.
   b: at least 3 readable bytes. Returns 1 for a BOM, 0 otherwise. */
int is_bom(unsigned char *b) {
	if(b[0] != 0xef)
		return 0;
	if(b[1] != 0xbb)
		return 0;
	return b[2] == 0xbf;
}

/* U+10FFFF is the last valid codepoint. It encodes to f4 8f bf bf.
   Return 1 if the sequence in b encodes something beyond that, 0 if
   not.
   count: the count of continuation bytes only (so, 3 for a 4-byte
          sequence). Shorter sequences can't reach that high.
   b:     the sequence, lead byte first. */
int is_out_of_range(int count, unsigned char *b) {
	if(count < 3) return 0;

	/* Lead bytes f5 and up are past the limit whatever follows them. */
	if(b[0] > 0xf4) return 1;
	if(b[0] < 0xf4) return 0;

	/* Lead byte f4: f4 8f bf bf is the maximum, so the 2nd byte
	   decides. */
	return b[1] >= 0x90;
}

/* This is the 'workhorse', called for each character in the file.
   Reads one character (a lead byte plus up to 3 continuation bytes)
   from input, classifies it, and appends it to the current line: the
   printable form to the right column, each of its bytes to the hex
   column.

   Classification:
   - bad: a byte that can't start a sequence, a sequence cut short by
     EOF or by a non-continuation byte, or a value past U+10FFFF. Shown
     as the replacement character, in the bad colors.
   - special: 0x00 through 0x20 and 0x7f (DEL). Shown as a symbol from
     get_special(), in the special color.
   - the BOM (ef bb bf). Shown as "B", in the special color.
   - anything else is shown as itself, in the next normal color.

   Return value: false = EOF, true = more data to read */
int dump_utf8_char(void) {
	/* one spare element, so bytes[] is always a NUL-terminated string */
	unsigned char bytes[] = { 0, 0, 0, 0, 0 };
	unsigned char *cont_bytes = bytes + 1;
	char *printable;
	int bad = 0, special = 0;
	int c, cont_count, i, fg, bg;
	static int byte0; /* first byte of the input, kept for check_utf16() */

	c = fgetc(input);
	if(c == EOF)
		return 0;

	bytes[0] = (unsigned char)c;

	if(filepos == 0) {
		byte0 = c;
	} else if(filepos == 1) {
		check_utf16(byte0, c);
	}

	/* The whole single-byte range is 0x00-0x7f. DEL (0x7f) has to be
	   included here, or it would fall through to the 'bad' case
	   below and never be treated as a special. */
	if(c <= 0x7f) {
		cont_count = 0;
		if(c <= ' ' || c == 0x7f)
			special = 1;
	} else if((c & 0xe0) == 0xc0) /* 110xxxxx */
		cont_count = 1;
	else if((c & 0xf0) == 0xe0)   /* 1110xxxx */
		cont_count = 2;
	else if((c & 0xf8) == 0xf0)   /* 11110xxx */
		cont_count = 3;
	else {
		/* a stray continuation byte, or 11111xxx */
		cont_count = 0;
		bad = 1;
	}

	for(i = 0; i < cont_count; i++) {
		int cb;
		c = fgetc(input);

		if(c == EOF) {
			/* EOF in mid-sequence */
			cont_count = i;
			bad = 1;
			break;
		}

		cb = cont_bytes[i] = (unsigned char)c;
		if((cb & 0xc0) != 0x80) {
			/* Expected 10xxxxxx, got something else. Push it back:
			   it gets handled as the start of the next character.
			   Only the bytes read before it belong to this one. */
			cont_count = i;
			bad = 1;
			ungetc(cb, input);
			break;
		}
	}

	if(is_out_of_range(cont_count, bytes))
		bad = 1;

	if(bad) {
		fg = BAD_FG;
		bg = BAD_BG;
		/* replacement character � is U+FFFD */
		printable = "�";
	} else if(special) {
		fg = special_color;
		bg = 0;
		printable = get_special(bytes[0]);
	} else if(cont_count == 2 && is_bom(bytes)) {
		fg = special_color;
		bg = 0;
		printable = "B";
	} else {
		fg = normal_colors[cur_normal_color];
		bg = 0;
		printable = (char *)bytes;
		next_normal_color();
	}

	append_color(right_buf, fg, bg);
	append_right(printable);
	append_color_off(right_buf);

	/* -r: reverse video for multibyte sequences, hex column only */
	if(hilite_multi && cont_count) {
		c = bg; bg = fg; fg = c;
	}

	/* every byte but the last is followed by a dash */
	for(i = 0; i <= cont_count; i++) {
		append_left(bytes[i], (i != cont_count), fg, bg);
	}

	return 1;
}

/* Read and discard 'bytes' bytes from input; bytes must not be larger
   than BUFSIZ. Reaching EOF early is not an error; any other short
   read ends the program with a message and status 1.
   This only gets called when reading stdin. */
void skip_input(unsigned int bytes) {
	char tmp[BUFSIZ];
	size_t got = fread(tmp, 1, bytes, input);

	if(got >= bytes || feof(input))
		return;

	/* this probably never happens when reading from stdin: */
	fprintf(stderr, "%s: ", self);
	perror("fread()");
	exit(1);
}

/* Stand-in for fseek() on input that can't seek: read and discard
   seekpos bytes, BUFSIZ at a time, stopping early at EOF.
   This only gets called when reading stdin. */
void fake_seek(void) {
	long remaining;

	for(remaining = seekpos; remaining >= BUFSIZ; remaining -= BUFSIZ) {
		skip_input(BUFSIZ);
		if(feof(input))
			return;
	}

	/* whatever is left is smaller than one buffer */
	skip_input(remaining);
}

/* Move to the starting position asked for with -s / -S and set filepos
   to match. A negative seekpos counts back from the end of the input.
   If fseek() fails, a forward seek is emulated by reading; a seek
   from the end can't be, and ends the program with status 1. */
void seek_input(void) {
	int whence = (seekpos < 0) ? SEEK_END : SEEK_SET;

	if(fseek(input, seekpos, whence) == 0) {
		filepos = ftell(input);
		return;
	}

	/* fseek() failed, likely we're reading stdin. fake it, if we can. */
	if(whence != SEEK_SET) {
		perror(self);
		exit(1);
	}

	clearerr(input);
	fake_seek();
	filepos = seekpos;
}

/* Dump the input: seek first if -s / -S was given, then process one
   character at a time until EOF, or until filepos reaches the -l
   limit (if one was set). */
void dump_file(void) {
	if(seekpos)
		seek_input();
	if(seek_offset_zero)
		filepos = 0;

	for(;;) {
		if(!dump_utf8_char())
			break;
		if(limit && (filepos >= limit))
			break;
	}

	/* handle the last line, if the file size not divisible by 16. */
	if(dump_column)
		print_line();
}

/* Program entry point: work out the program name, process the options
   (which also opens the input), dump the input, and clean up. */
int main(int argc, char **argv) {
	/* has to come first: messages printed by everything below use self */
	set_self(argv[0]);
	/* exits on its own for --help, --version and on errors */
	parse_options(argc, argv);
	dump_file();
	fclose(input);
	return 0;
}