#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* UTF-8 spec summary, taken from Wikipedia and elsewhere, kept here
   for locality of reference.

Codepoints 0-0x7f encode as themselves, one byte each, bit 7 always 0.

0x80 and up are encoded as multiple bytes. The first byte's bit 7 is
always 1. The top bits determine the byte length of the sequence:

110 - 2 bytes
1110 - 3 bytes
11110 - 4 bytes

Continuation (2nd and further bytes) have 10 as the top 2 bits. If
we get a continuation that's not after a sequence-starter, that's an
error. If we get a sequence-starter, but the sequence doesn't have
the correct number of continuation bytes (e.g. 110xxxxx followed by
anything that isn't 10xxxxxx), that's an error too.

Note that we don't actually do a full decode of the codepoint bits.
It's enough to look at the top bits to keep track of multibyte
characters.

BOM: if the file contains ef bb bf (aka U+FEFF), it will be colorized
as a special (non-printable).

If the file begins with ff fe, it's UTF-16 (little endian). If it's
fe ff, it's UTF-16 big-endian. We detect these and print a warning
on stderr.
*/

/* from getopt.c: the option parser and the two globals it reports
   through. parse_args() uses them the way the standard
   getopt()/optarg/optind trio is used. */
extern int my_getopt(int, char **, char *);
extern char *my_optarg;
extern int my_optind;

/* fallback version string, for builds that don't define VERSION. */
#ifndef VERSION
#define VERSION "(unknown version)"
#endif

/* <stdio.h> is expected to define BUFSIZ; this is only a fallback. */
#ifndef BUFSIZ
#define BUFSIZ 4096
#endif

/* environment variables. If NO_COLOR is set (to anything), mono
   highlighting becomes the default. ENV_OPTS holds extra options,
   separated by spaces or tabs, that are parsed before the command
   line ones. */
#define NO_COLOR "NO_COLOR"
#define ENV_OPTS "UXD_OPTS"

/* maximum number of arguments, including environment and argv. */
#define MAX_ARGS 64

/* ANSI colors. These are the digits accepted by the -c option, and
   they get appended to "3" (foreground) or "4" (background) in the
   escape sequences. */
#define BLACK  0 /* don't use (could be the background color) */
#define RED    1
#define GREEN  2
#define YELLOW 3
#define BLUE   4 /* don't use (hard to read on many terminals) */
#define PURPLE 5
#define CYAN   6
#define WHITE  7 /* don't use (could be the background color) */

/* highlight types. The _INV variants put the color in the background
   instead of the foreground; HL_BAD always uses the background. */
#define HL_NORMAL 0
#define HL_NORM_INV 1
#define HL_SPECIAL 2
#define HL_SPEC_INV 3
#define HL_BAD 4

/* terminal codes for mono highlighting (the number in ESC [ <n> m). */
#define MONO_NORMAL 0
#define MONO_UNDERLINE 4
#define MONO_BOLD 1
#define MONO_REVERSE 7

/* terminal codes to enable/disable UTF-8 mode */
#define ESC_UTF8_ON  "\x1b%G"
#define ESC_UTF8_OFF "\x1b%@"

/* what to print in the right-hand column for sequences that can't or
   shouldn't be shown as themselves: malformed, byte order mark,
   overlong, out of range (above U+10FFFF), surrogate.
   replacement character � is U+FFFD */
#define PRINT_BAD "�"
#define PRINT_BOM "B"
#define PRINT_OLONG "O"
#define PRINT_OORANGE ">"
#define PRINT_SURR "S"

/* sprintf() formats for hex data. LC_ is the lowercase default, UC_
   is what the -u option selects. */
#define LC_BYTE_FMT "%02x"
#define LC_ADDR_FMT "%04x: "
#define UC_BYTE_FMT "%02X"
#define UC_ADDR_FMT "%04X: "

/* name (read from argv[0]), for error/warning messages. Set by
   set_self(). */
const char *self;

/* the input file, either stdin or a file we open for reading. Not
   used when dumping a -d argument. */
FILE *input;

/* default colors. The -c option can replace all five. The normal and
   special pairs are indexed by cur_normal_hilite/cur_special_hilite. */
int normal_colors[] = { GREEN, YELLOW };
int special_colors[] = { PURPLE, CYAN };
int bad_color = RED;

/* toggles between 0 and 1 for each normal/special character (unless
   alternation is turned off with -1). */
int cur_normal_hilite = 0;
int cur_special_hilite = 0;

/* One line of output is built up in these two buffers, then written
   and cleared by print_line(). */
/* these buffers are bigger than they need to be really. */
/* offset and hex bytes: */
char left_buf[4096];

/* printable form: */
char right_buf[4096];

/* dump_column is the number of bytes already placed on the current
   line. It ranges 0..(MAX_DUMP_COLS-1) between calls; it reaches
   MAX_DUMP_COLS only momentarily, just before print_line() resets it. */
#define MAX_DUMP_COLS 16
int dump_column = 0;

/* where we're at in the input. Used (plus display_offset) for the
   address printed at the start of each line. */
int filepos = 0;

/* Unicode control character printable equivalents, indexed by byte
   value: entries 0-0x1f are the control characters, entry 0x20 is
   space. get_special() does the lookup (and handles 0x7f itself).
   For 0, use the "empty set" symbol. It's a lot more readable than
   the "nul" symbol, ␀. Escape, tab, newline, space are what urxvt
   uses in its "keycap picture" mode. The rest of these are hard to
   read at normal font sizes, but it's still better than using a dot
   for everything like xxd does. */
char * const special_symbols[] = {
	/* 0-0x0f: */
	"∅", "␁", "␂", "␃", "␄", "␅", "␆", "␇", "␈", "⇥", "↵", "␋", "␌", "␍", "␎", "␏",
	/* 0x10-0x1f: */
	"␐", "␑", "␒", "␓", "␔", "␕", "␖", "␗", "␘", "␙", "␚", "⎋", "␜", "␝", "␞", "␟",
	/* 0x20 (space): */
	"␣",
};

/* options. The comment on each line names the command-line option(s)
   that change the variable; see parse_args(). */
int alternate_colors = 1; /* -1 (turns alternation off) */
int print_info_opt = 0;   /* -i */
int bold = 0;         /* -b */
int hilite_multi = 1; /* -r (turns it off) */
int mono = 0;         /* -m, also the NO_COLOR env var; -c turns it back off */
long display_offset = 0; /* -o */
long seekpos = 0;     /* -s, -S */
int seek_offset_zero = 0; /* -S */
long limit;           /* -l; 0 (the default) means no limit */
const char *hex_byte_fmt = LC_BYTE_FMT;   /* -u */
const char *hex_addr_fmt = LC_ADDR_FMT; /* -u */
char *dump_data_arg = NULL; /* -d: the string to dump instead of a file */
long dump_data_idx  = 0;    /* -d: read position within dump_data_arg */
int term_utf8 = 0;    /* -t, -T */
int restore_term = 0; /* -t only (-T clears it) */
int java_mode = 0;    /* -j */
int wtf8_mode = 0;    /* -w */
int permissive = 0;   /* -p */

/* stats for -i option, updated by dump_utf8_char() and printed by
   print_info(). */
long byte_count = 0;
long ascii_count = 0;
long multi_count = 0;
long bad_count = 0;
long char_count = 0;

/* Print the banner, the synopsis and every entry of the usage_opts[]
   table (declared extern here; it's walked until a NULL entry), then
   exit with status 0. Does not return. */
void usage(void) {
	extern char *usage_opts[];
	int i = 0;

	puts("uxd (Utf-8 heX Dump) v" VERSION " by B. Watson. WTFPL.");
	printf("Usage: %s -[options] [<file>]\n", self);
	puts("  With no <file>, or with -, read standard input.");
	puts("Options:");

	while(usage_opts[i] != NULL) {
		puts(usage_opts[i]);
		i++;
	}

	exit(0);
}

/* Print the bare version string on a line by itself and exit with
   status 0. Does not return. */
void version(void) {
	puts(VERSION);
	exit(0);
}

/* Point the global 'input' at the data to dump.
   arg: a filename, or NULL or "-" to read standard input.
   On any failure, prints "<self>: <name>: <reason>" on stderr and
   exits with status 1. set_self() must have been called first. */
void open_input(const char *arg) {
	if(!arg || (strcmp(arg, "-") == 0)) {
		/* Reopen stdin in binary mode. If freopen() fails, the stream
		   has already been closed, so carrying on would just read
		   nothing without any explanation. Report it instead. */
		if(!freopen(NULL, "rb", stdin)) {
			fprintf(stderr, "%s: ", self);
			perror("stdin");
			exit(1);
		}
		input = stdin;
		return;
	}

	input = fopen(arg, "rb");
	if(!input) {
		fprintf(stderr, "%s: ", self);
		perror(arg);
		exit(1);
	}
}

/* Complain on stderr about a malformed -c argument, then exit with
   status 1. Does not return. */
void color_error(void)
{
	fprintf(stderr, "%s: invalid -c colors (-h for help).\n", self);

	exit(1);
}

/* Accept only the digits '0' through '7' (the 8 ANSI colors); anything
   else, including the string terminator, ends the program via
   color_error(). */
void check_color(char c) {
	int is_color_digit = (c >= '0' && c <= '7');

	if(!is_color_digit)
		color_error();
}

/* Parse the argument of -c: 2 to 5 color digits (each 0-7), assigned
   in order to the two normal colors, the two special colors and the
   bad color. The first two are required, the rest optional. Any
   invalid or surplus character ends the program via color_error().
   An empty string is silently ignored. */
void parse_colors(char *arg) {
	int *slot[5];
	int i;

	if(arg[0] == '\0') return; /* should never happen anyway */

	slot[0] = &normal_colors[0];
	slot[1] = &normal_colors[1];
	slot[2] = &special_colors[0];
	slot[3] = &special_colors[1];
	slot[4] = &bad_color;

	for(i = 0; i < 5; i++) {
		/* from the 3rd digit on, running out of string is fine */
		if(i >= 2 && arg[i] == '\0') return;

		check_color(arg[i]);
		*slot[i] = arg[i] - '0';
	}

	/* nothing may follow the 5th digit */
	if(arg[5] != '\0') color_error();
}

/* Report an unparseable numeric argument for option letter 'opt' on
   stderr and exit with status 1. Does not return. */
void number_err(int opt)
{
	fprintf(stderr, "%s: invalid number for -%c option.\n", self, opt);

	exit(1);
}

/* Parse the numeric argument of option letter 'opt'.

   s is a number in any base strtol() accepts with base 0 (decimal,
   0x hex, leading-0 octal), optionally negative, optionally followed
   by one suffix:
     b, B      bytes (no effect)
     k, m, g   multiply by 1024, 1024^2, 1024^3
     K, M, G   multiply by 1000, 1000^2, 1000^3
   A multiplier suffix may itself be followed by a single b or B
   ("kb", "MB"). Nothing else may follow.

   Returns the resulting value. On any error (no digits, unknown
   suffix, trailing garbage, or a value that doesn't fit in a long)
   this calls number_err(), which exits. */
long parse_number(int opt, const char *s) {
	char *e;
	long result;
	long mult = 1;

	errno = 0;
	result = strtol(s, &e, 0);

	/* require at least one digit (otherwise -sk would be allowed),
	   and don't accept a value strtol() had to clamp. */
	if(e == s || errno == ERANGE)
		number_err(opt);

	switch(e[0]) {
		case 0:
			return result;
		case 'b':
		case 'B':
			/* allow & ignore b/B for "bytes" */
			if(e[1]) number_err(opt);
			return result;
		case 'k': mult = 1024L; break;
		case 'm': mult = 1048576L; break;
		case 'g': mult = 1073741824L; break;
		case 'K': mult = 1000L; break;
		case 'M': mult = 1000000L; break;
		case 'G': mult = 1000000000L; break;
		default:
			number_err(opt);
	}

	/* allow e.g. "kb" for kilobytes, but reject "kx" and also "kbx":
	   the string has to end right after the suffix. */
	e++;
	if(*e == 'b' || *e == 'B')
		e++;
	if(*e)
		number_err(opt);

	/* signed overflow is undefined behavior, so check before
	   multiplying rather than after. */
	if(result > LONG_MAX / mult || result < LONG_MIN / mult)
		number_err(opt);

	return result * mult;
}

/* Process an argument vector: set the option globals, then either
   check that no filename was given (-d mode) or open the input.

   --help and --version are only recognized as the very first
   argument. Everything else goes through my_getopt(). Invalid
   options or arguments end the program with status 1. More than one
   filename prints the usage text. */
void parse_args(int argc, char **argv) {
	int ch;

	if(argc > 1) {
		if(!strcmp(argv[1], "--help"))
			usage();
		if(!strcmp(argv[1], "--version"))
			version();
	}

	for(;;) {
		ch = my_getopt(argc, argv, "jwptTd:1ic:nbl:rmo:S:s:uhv");
		if(ch == -1)
			break;

		switch(ch) {
			case 'j':
				java_mode = 1;
				break;
			case 'w':
				wtf8_mode = 1;
				break;
			case 'p':
				permissive = 1;
				break;
			case 't':
				term_utf8 = 1;
				restore_term = 1;
				break;
			case 'T':
				term_utf8 = 1;
				restore_term = 0;
				break;
			case 'd':
				if(dump_data_arg != NULL) {
					fprintf(stderr, "%s: multiple -d options not supported.\n", self);
					exit(1);
				}
				dump_data_arg = my_optarg;
				break;
			case '1':
				alternate_colors = 0;
				break;
			case 'i':
				print_info_opt = 1;
				break;
			case 'c':
				/* asking for colors overrides an earlier -m/NO_COLOR */
				mono = 0;
				parse_colors(my_optarg);
				break;
			case 'n':
				/* already handled in parse_options() */
				break;
			case 'b':
				bold = 1;
				break;
			case 'l':
				limit = parse_number(ch, my_optarg);
				if(limit < 0) {
					fprintf(stderr, "%s: negative limit for -l not allowed.\n", self);
					exit(1);
				}
				break;
			case 'r':
				hilite_multi = 0;
				break;
			case 'm':
				mono = 1;
				break;
			case 'o':
				display_offset = parse_number(ch, my_optarg);
				break;
			case 'S':
				/* same as -s, plus addresses start at zero */
				seek_offset_zero = 1;
				seekpos = parse_number(ch, my_optarg);
				break;
			case 's':
				seekpos = parse_number(ch, my_optarg);
				break;
			case 'u':
				hex_byte_fmt = UC_BYTE_FMT;
				hex_addr_fmt = UC_ADDR_FMT;
				break;
			case 'h':
				usage();
				break;
			case 'v':
				version();
				break;
			default:
				exit(1);
		}
	}

	if(dump_data_arg) {
		/* the data came from the command line; a filename as well
		   would be ambiguous. */
		if(my_optind != argc) {
			fprintf(stderr, "%s: cannot give a filename when -d is used.\n", self);
			exit(1);
		}
		return;
	}

	/* filename (if present) must come after all -options, and
	   there can only be one filename. */
	if(my_optind < (argc - 1))
		usage();

	open_input(argv[my_optind]);
}

/* Read options from the environment and the command line, create a
   new argv/argc that has all the options from both, with the
   environment ones first, and hand it to parse_args().

   - If NO_COLOR is set, mono output becomes the default.
   - If UXD_OPTS is not set, or any command-line argument starts with
     "-n", the command line is parsed as-is and the environment
     options are ignored.
   - UXD_OPTS is split on spaces and tabs. Runs of whitespace, and
     leading or trailing whitespace, do not produce empty arguments,
     so an empty or all-blank UXD_OPTS is the same as none.
   - At most MAX_ARGS arguments are passed on; the rest are dropped.

   The environment string is copied before being split, since the
   string returned by getenv() must not be modified. The copy is never
   freed on purpose: parse_args() keeps pointers into it (e.g. the -d
   argument) for the rest of the run. */
void parse_options(int argc, char **argv) {
	int nargc, i;
	char *nargv[MAX_ARGS + 1];
	const char *env;
	char *copy, *p;

	if(getenv(NO_COLOR))
		mono = 1;

	env = getenv(ENV_OPTS);
	if(!env) {
		/* nothing in the env, use regular args as-is */
		parse_args(argc, argv);
		return;
	}

	/* have to check for the -n option here, before the environment
	   options get merged in. */
	for(i = 1; i < argc; i++) {
		if(argv[i][0] == '-' && argv[i][1] == 'n') {
			parse_args(argc, argv);
			return;
		}
	}

	copy = malloc(strlen(env) + 1);
	if(!copy) {
		fprintf(stderr, "%s: ", self);
		perror("malloc()");
		exit(1);
	}
	strcpy(copy, env);

	nargv[0] = (char *)self;
	nargc = 1;

	/* split the copy in place: each word becomes one argument. */
	p = copy;
	while(nargc < MAX_ARGS) {
		while(*p == ' ' || *p == '\t')
			p++;
		if(!*p)
			break;

		nargv[nargc++] = p;

		while(*p && *p != ' ' && *p != '\t')
			p++;
		if(*p)
			*p++ = '\0';
	}

	/* then the real arguments, minus the exe name */
	for(i = 1; i < argc && nargc < MAX_ARGS; i++)
		nargv[nargc++] = argv[i];

	nargv[nargc] = NULL;
	parse_args(nargc, nargv);
}

/* Return the printable stand-in for a control character, space
   (looked up in special_symbols[]) or DEL (0x7f). Other values are
   not expected and yield "?". */
char *get_special(unsigned char c) {
	if(c <= ' ')
		return special_symbols[c];

	if(c == 0x7f)
		return "⌦"; /* delete */

	return "?"; /* should never happen */
}

/* Set the name used as a prefix for error messages: argv0 with any
   leading directory part (everything up to the last '/') removed.
   This must be called before open_input(), which uses the name when
   it reports errors. */
void set_self(const char *argv0) {
	const char *last_slash = strrchr(argv0, '/');

	self = last_slash ? last_slash + 1 : argv0;
}

/* Write the current line (hex columns from left_buf, then the
   human-readable column from right_buf) to stdout, and reset both
   buffers and dump_column for the next line. */
void print_line(void) {
	int col;

	printf("%s", left_buf);

	/* line up the rightmost field (human-readable), for the partial
	   line at the end of the output (if there is one): 3 spaces for
	   every missing byte... */
	for(col = dump_column; col < MAX_DUMP_COLS; col++)
		printf("   ");

	/* ...plus one for the extra gap in the middle, if the line ended
	   before reaching it. */
	if(dump_column < (MAX_DUMP_COLS / 2))
		putchar(' ');

	printf(" %s\n", right_buf);

	/* clear the buffers, start a new line */
	left_buf[0] = '\0';
	right_buf[0] = '\0';
	dump_column = 0;
}

/* Flip to the other "normal" color for the next character, unless
   color alternation is turned off. */
void next_normal_hilite(void) {
	if(!alternate_colors)
		return;

	cur_normal_hilite = cur_normal_hilite ? 0 : 1;
}

/* Flip to the other "special" color for the next character, unless
   color alternation is turned off. */
void next_special_hilite(void) {
	if(!alternate_colors)
		return;

	cur_special_hilite = cur_special_hilite ? 0 : 1;
}

void append_color(char *buf, int hl_type) {
	char tmpbuf[100];
	int fgcolor, bgcolor;

	switch(hl_type) {
		case HL_NORMAL:
			fgcolor = normal_colors[cur_normal_hilite];
			bgcolor = 0;
			break;
		case HL_NORM_INV:
			fgcolor = 0;
			bgcolor = normal_colors[cur_normal_hilite];
			break;
		case HL_SPECIAL:
			fgcolor = special_colors[cur_special_hilite];
			bgcolor = 0;
			break;
		case HL_SPEC_INV:
			fgcolor = 0;
			bgcolor = special_colors[cur_special_hilite];
			break;
		case HL_BAD:
		default:
			fgcolor = 0;
			bgcolor = bad_color;
			break;
	}

	sprintf(tmpbuf, "\x1b[%d;3%d", bold, fgcolor);
	strcat(buf, tmpbuf);
	if(bgcolor) {
		sprintf(tmpbuf, ";4%d", bgcolor);
		strcat(buf, tmpbuf);
	}
	sprintf(tmpbuf, "m");
	strcat(buf, tmpbuf);
}

/* Append to buf the escape sequence for highlight type hl_type when
   colors are not in use: normal characters alternate between plain
   and underlined, special ones are bold, and bad ones (or any
   unknown type) are in reverse video. */
void append_mono(char *buf, int hl_type) {
	int attr;

	if(hl_type == HL_NORMAL || hl_type == HL_NORM_INV)
		attr = cur_normal_hilite ? MONO_UNDERLINE : MONO_NORMAL;
	else if(hl_type == HL_SPECIAL || hl_type == HL_SPEC_INV)
		attr = MONO_BOLD;
	else
		attr = MONO_REVERSE;

	/* format straight onto the end of the existing string */
	sprintf(buf + strlen(buf), "\x1b[%dm", attr);
}

/* Append the "highlight on" sequence for hl_type to buf, in whichever
   flavor (color or mono) is currently selected. */
void append_hilite(char *buf, int hl_type) {
	if(!mono) {
		append_color(buf, hl_type);
		return;
	}

	append_mono(buf, hl_type);
}

/* Append the "all attributes off" sequence (ESC [ 0 m) to buf. */
void append_hilite_off(char *buf) {
	strcpy(buf + strlen(buf), "\x1b[0m");
}

/* Append str to the human-readable (right-hand) column of the
   current line. */
void append_right(char *str) {
	strcpy(right_buf + strlen(right_buf), str);
}

/* Add one byte to the hex (left-hand) part of the current line.

   byte:    the value to show in hex.
   dash:    true if more bytes of the same character follow; the byte
            is then joined to the next one with a highlighted "-"
            instead of being followed by a space.
   hl_type: highlight type to show the byte in.

   The first byte of a line is preceded by the address (filepos plus
   display_offset). An extra separator is added after the 8th byte.
   When the line is full it is printed. filepos is advanced by one. */
void append_left(unsigned char byte, int dash, int hl_type) {
	char tmpbuf[100];

	/* The address formats use %x/%X, which take an unsigned int, but
	   filepos + display_offset is a long. Convert explicitly so the
	   argument matches the format. */
	if(!dump_column)
		sprintf(left_buf, hex_addr_fmt, (unsigned int)(filepos + display_offset));

	append_hilite(left_buf, hl_type);
	sprintf(tmpbuf, hex_byte_fmt, (unsigned int)byte);
	strcat(left_buf, tmpbuf);

	dump_column++;

	if(dash) {
		/* the dash is part of the character, so it goes inside the
		   highlight. */
		strcat(left_buf, "-");
		if(dump_column == (MAX_DUMP_COLS / 2))
			strcat(left_buf, "-");
		append_hilite_off(left_buf);
	} else {
		/* the space is not part of it, so it goes outside. */
		append_hilite_off(left_buf);
		strcat(left_buf, " ");
		if(dump_column == (MAX_DUMP_COLS / 2))
			strcat(left_buf, " ");
	}

	if(dump_column == MAX_DUMP_COLS)
		print_line();

	filepos++;
}

/* Given the first two bytes of the input, print a warning on stderr
   if they are a UTF-16 byte order mark (ff fe = little-endian,
   fe ff = big-endian). Otherwise do nothing. */
void check_utf16(int byte0, int byte1) {
	char *endian = NULL;

	if(byte0 == 0xff && byte1 == 0xfe)
		endian = "little";
	else if(byte0 == 0xfe && byte1 == 0xff)
		endian = "big";

	if(endian == NULL)
		return;

	fprintf(stderr, "%s: input looks like UTF-16, %s-endian\n", self, endian);
}

/* True if b starts with ef bb bf, the UTF-8 encoding of the byte
   order mark. Code points aren't decoded anywhere in this program, so
   the encoded bytes are compared directly. Bytes after the first
   mismatch are not looked at. */
int is_bom(unsigned char *b) {
	if(b[0] != 0xef) return 0;
	if(b[1] != 0xbb) return 0;

	return b[2] == 0xbf;
}

/* Detect overlong encodings (a code point encoded in more bytes than
   it needs), without doing a full decode.
   cont_count is the number of continuation bytes, b the sequence.
   Returns 1 if overlong, 0 if not. */
int is_overlong(int cont_count, unsigned char *b) {
	switch(cont_count) {
		case 1:
			/* 2 byte seqs: c0 and c1 can only encode what fits in 1 byte. */
			return b[0] <= 0xc1;
		case 2:
			/* 3 byte seqs: e0 followed by 80..9f fits in 2 bytes. */
			return b[0] == 0xe0 && b[1] <= 0x9f;
		case 3:
			/* 4 byte seqs: f0 followed by 80..8f fits in 3 bytes. */
			return b[0] == 0xf0 && b[1] <= 0x8f;
		default:
			/* 1 byte seqs are never overlong. */
			return 0;
	}
}

/* Detect code points above U+10FFFF, without doing a full decode.
   U+10FFFF is the last valid codepoint. It encodes to f4 8f bf bf.
   'cont_count' is the count of continuation bytes only (so, 3 for a
   4-byte sequence); b is the sequence.
   Returns 1 if out of range, 0 if not. */
int is_out_of_range(int cont_count, unsigned char *b) {
	/* only 4-byte sequences can reach that high. */
	if(cont_count < 3) return 0;

	/* lead bytes f5, f6, f7 start at U+140000: always too high,
	   whatever follows. */
	if(b[0] > 0xf4) return 1;

	/* f4 is fine up to f4 8f xx xx; f4 90 is U+110000. */
	if(b[0] == 0xf4 && b[1] >= 0x90) return 1;

	return 0;
}

/* Detect encoded UTF-16 surrogates (U+D800..U+DFFF, which encode as
   ed a0 80 .. ed bf bf). They are not valid Unicode, therefore not
   valid UTF-8 either. cont_count is the number of continuation
   bytes, b the sequence. Returns 1 for a surrogate, 0 otherwise. */
int is_surrogate(int cont_count, unsigned char *b) {
	if(cont_count == 2 && b[0] == 0xed)
		return b[1] > 0x9f;

	return 0;
}

/* Return the next input byte (0..255), or EOF when there is no more.
   The source is the -d string if one was given, otherwise the input
   file.

   In -d mode the string's terminating NUL is the end of the data.
   The read position is deliberately not moved past it, so calling
   this again after EOF keeps returning EOF instead of reading
   whatever lies beyond the end of the string. */
int get_next_byte(void) {
	int c;

	if(!dump_data_arg)
		return fgetc(input);

	/* have to cast this to unsigned char and back to int,
	   to emulate fgetc() */
	c = (unsigned char)dump_data_arg[dump_data_idx];
	if(!c)
		return EOF;

	dump_data_idx++;
	return c;
}

/* Undo the last get_next_byte(), so the same byte is returned again
   by the next call. c is the byte that was read; it's only needed
   when reading from a file. In -d mode, the read position just steps
   back by one (unless it is 0). */
void push_back_byte(int c) {
	if(!dump_data_arg) {
		ungetc(c, input);
		return;
	}

	if(dump_data_idx != 0)
		dump_data_idx--;
}

/* Decide how a structurally well-formed sequence gets displayed.

   hl:         out: receives the highlight type (HL_*).
   bytes:      the sequence, NUL-terminated.
   cont_count: number of continuation bytes in it (0 to 3).

   Returns the string to show in the right-hand column: the sequence
   itself for ordinary characters, or a stand-in for control
   characters, the BOM, overlong encodings, surrogates and code
   points that are out of range. The last three are highlighted as
   bad unless an option (-j, -w, -p) says to tolerate them. */
char *classify_char(int *hl, unsigned char *bytes, int cont_count) {
	char *text = (char *)bytes;
	int first = text[0];

	/* single byte: either a control/space/DEL, or plain ASCII */
	if(cont_count == 0) {
		if(first > ' ' && first != 0x7f) {
			*hl = HL_NORMAL;
			return text;
		}
		*hl = HL_SPECIAL;
		return get_special(first);
	}

	if(cont_count == 2 && is_bom(bytes)) {
		*hl = HL_SPEC_INV;
		return PRINT_BOM;
	}

	if(is_overlong(cont_count, bytes)) {
		/* java mode (MUTF-8) allows exactly one overlong: c0 80,
		   which stands for NUL. */
		if(java_mode && cont_count == 1 && bytes[0] == 0xc0 && bytes[1] == 0x80) {
			*hl = HL_SPEC_INV;
			return get_special(0);
		}
		*hl = permissive ? HL_NORMAL : HL_BAD;
		return PRINT_OLONG;
	}

	if(is_surrogate(cont_count, bytes)) {
		*hl = (wtf8_mode || permissive) ? HL_SPEC_INV : HL_BAD;
		return PRINT_SURR;
	}

	if(is_out_of_range(cont_count, bytes)) {
		*hl = permissive ? HL_SPEC_INV : HL_BAD;
		return PRINT_OORANGE;
	}

	/* nothing unusual about it: an ordinary multibyte character */
	*hl = HL_NORMAL;
	return text;
}

/* This is the 'workhorse', called for each character in the file.
   It reads one complete UTF-8 sequence (or as much of one as is
   valid), updates the statistics counters, and appends the result
   to both columns of the current output line.
   Return value: false = EOF, true = more data to read */
int dump_utf8_char(void) {
	/* Room for the longest sequence (4 bytes) plus a NUL that is never
	   overwritten, so the array can be used as a C string for the
	   right-hand column. */
	unsigned char bytes[] = { 0, 0, 0, 0, 0 };
	char *printable;
	int bad = 0, hl_type;
	int c, cont_count, i;
	/* first byte of the input, kept across calls for the UTF-16 check */
	static int byte0;

	c = get_next_byte();
	if(c == EOF)
		return 0;

	byte_count++;

	bytes[0] = (unsigned char)c;

	/* filepos only advances when bytes are appended to the hex column
	   (further down), so at this point it is still the position of
	   the byte that was just read. */
	if(filepos == 0) {
		byte0 = c;
	} else if(filepos == 1) {
		check_utf16(byte0, c);
	}

	/* look at 1st byte to find out how long the sequence is */
	if(c <= 0x7f) {
		ascii_count++;
		cont_count = 0;
	} else if((c & 0xe0) == 0xc0) {   /* 110xxxxx */
		cont_count = 1;
	} else if((c & 0xf0) == 0xe0) {   /* 1110xxxx */
		cont_count = 2;
	} else if((c & 0xf8) == 0xf0) {   /* 11110xxx */
		cont_count = 3;
	} else {
		/* high bit set, but not a valid sequence-starter */
		cont_count = 0;
		bad = 1;
	}

	/* read and validate the continuation bytes, if any */
	for(i = 0; i < cont_count; i++) {
		int cb;
		c = get_next_byte();

		if(c == EOF) {
			/* EOF in mid-sequence. Don't return 0 here, since we still
			   have to dump the partial sequence. The next call will
			   give us EOF again. */
			cont_count = i;
			bad = 1;
			break;
		}

		byte_count++;

		cb = bytes[i + 1] = (unsigned char)c;
		if((cb & 0xc0) != 0x80) {
			/* Expected 10xxxxxx, got something else. The sequence is
			   cut short to the bytes that were valid, and the
			   offending byte is pushed back (and un-counted) so the
			   next call starts with it. */
			cont_count = i;
			bad = 1;
			push_back_byte(cb);
			byte_count--;
			break;
		}
	}

	if(bad) {
		hl_type = HL_BAD;
		printable = PRINT_BAD;
	} else {
		printable = classify_char(&hl_type, bytes, cont_count);
	}

	/* HL_BAD here covers both the malformed sequences found above and
	   well-formed ones that classify_char() rejected. */
	if(hl_type == HL_BAD) {
		bad_count++;
	} else {
		char_count++;
		if(cont_count) multi_count++;
	}

	/* multibyte characters get the inverse version of the normal
	   highlight, unless that has been turned off. */
	if(hl_type == HL_NORMAL && hilite_multi && cont_count)
		hl_type = HL_NORM_INV;

	/* human-readable (right) column: */
	append_hilite(right_buf, hl_type);
	append_right(printable);
	append_hilite_off(right_buf);

	/* hex columns: every byte but the last one of the sequence is
	   followed by a dash. */
	for(i = 0; i <= cont_count; i++) {
		append_left(bytes[i], (i != cont_count), hl_type);
	}

	/* switch to the other color of whichever pair was just used */
	if(hl_type == HL_NORMAL || hl_type == HL_NORM_INV)
		next_normal_hilite();

	if(hl_type == HL_SPECIAL || hl_type == HL_SPEC_INV)
		next_special_hilite();

	return 1;
}

/* Read and throw away 'bytes' bytes of input. The caller must not
   ask for more than BUFSIZ at a time. Hitting end of file early is
   not an error; a read error ends the program with status 1.
   This only gets called when reading stdin. */
void skip_input(unsigned int bytes) {
	char scratch[BUFSIZ];
	size_t got;

	got = fread(scratch, 1, bytes, input);
	if(got >= bytes)
		return;

	if(feof(input))
		return;

	/* this probably never happens when reading from stdin: */
	fprintf(stderr, "%s: ", self);
	perror("fread()");
	exit(1);
}

/* Emulate a forward seek to seekpos on input that can't really seek,
   by reading and discarding that many bytes, BUFSIZ at a time. Stops
   early if the input ends first.
   This only gets called when reading stdin. */
void fake_seek(void) {
	long remaining;

	for(remaining = seekpos; remaining >= BUFSIZ; remaining -= BUFSIZ) {
		skip_input(BUFSIZ);
		if(feof(input))
			return;
	}

	/* whatever is left is less than one buffer's worth */
	skip_input(remaining);
}

/* Move to the position given by seekpos before dumping starts, and
   set filepos to match. Used by the -s / -S options. A negative
   seekpos counts back from the end of the input. If the input can't
   seek, a forward seek is emulated by reading; a backward one ends
   the program with status 1. */
void seek_input(void) {
	int whence = (seekpos < 0) ? SEEK_END : SEEK_SET;

	if(fseek(input, seekpos, whence) == 0) {
		filepos = ftell(input);
		return;
	}

	/* fseek() failed, likely we're reading stdin. fake it, if we can. */
	if(whence != SEEK_SET) {
		fprintf(stderr, "%s: are you trying to seek backwards in stdin?\n", self);
		perror(self);
		exit(1);
	}

	clearerr(input);
	fake_seek();
	filepos = seekpos;
}

/* Print the statistics gathered while dumping (the -i option): a
   blank line, then the byte, character and bad sequence counts. */
void print_info(void) {
	printf("\nBytes: %ld\n"
	       "Valid characters: %ld\n"
	       "  ASCII: %ld\n"
	       "  Multibyte: %ld\n"
	       "Bad sequences: %ld\n",
	       byte_count,
	       char_count,
	       ascii_count,
	       multi_count,
	       bad_count);
}

/* Dump characters until the input runs out, or until at least 'limit'
   bytes have been read (if a limit is set). The limit is only checked
   between characters, so a multibyte character is never split. */
void dump_loop(void) {
	for(;;) {
		if(!dump_utf8_char())
			break;

		if(limit != 0 && byte_count >= limit)
			break;
	}

	/* handle the last line, if the file size not divisible by 16. */
	if(dump_column != 0)
		print_line();
}

/* Dump the input file: seek first if -s/-S was given, make the
   displayed addresses start at zero if it was -S, dump, and close
   the file. */
void dump_file(void) {
	if(seekpos != 0)
		seek_input();

	if(seek_offset_zero)
		filepos = 0;

	dump_loop();

	fclose(input);
}

void dump_data(void) {
	int datalen;

	datalen = strlen(dump_data_arg);

	if(seekpos >= datalen)
		return;

	if(seekpos < 0)
		dump_data_idx = datalen + seekpos;
	else if(seekpos)
		dump_data_idx = seekpos;

	if(seek_offset_zero)
		filepos = 0;
	else
		filepos = dump_data_idx;

	dump_loop();
}

int main(int argc, char **argv) {
	set_self(argv[0]);

	parse_options(argc, argv);

	if(term_utf8)       /* -t, -T */
		fputs(ESC_UTF8_ON, stdout);

	if(dump_data_arg)
		dump_data();     /* -d */
	else
		dump_file();

	if(print_info_opt)  /* -i */
		print_info();

	if(restore_term)    /* -T */
		fputs(ESC_UTF8_OFF, stdout);

	return 0;
}