diff options
| -rw-r--r-- | uxd.c | 42 | 
1 files changed, 25 insertions, 17 deletions
| @@ -29,25 +29,27 @@ error. If we get a sequence-starter, but the sequence doesn't have  the correct number of continuation bytes (e.g. 110xxxxx followed by  anything that isn't 10xxxxxx), that's an error too. +Note that we don't actually do a full decode of the codepoint bits. +It's enough to look at the top bits to keep track of multibyte +characters. +  BOM: if the file contains ef bb bf (aka U+FEFF), it will be colorized  as a special (non-printable). +  If the file begins with ff fe, it's UTF-16 (little endian). If it's -fe ff, it's UTF-16 big-endian. We detect these and -print a warning on stderr. +fe ff, it's UTF-16 big-endian. We detect these and print a warning +on stderr.  */ -/* max UTF-8 sequence length, in bytes */ -#define MAXUTF8 4 - -/* ANSI color */ -#define BLACK 0 /* don't use */ -#define RED 1 -#define GREEN 2 +/* ANSI colors */ +#define BLACK  0 /* don't use (could be the background color) */ +#define RED    1 +#define GREEN  2  #define YELLOW 3 -#define BLUE 4 /* don't use */ +#define BLUE   4 /* don't use (hard to read on many terminals) */  #define PURPLE 5 -#define CYAN 6 -#define WHITE 7 /* don't use */ +#define CYAN   6 +#define WHITE  7 /* don't use (could be the background color) */  #define SPECIAL PURPLE @@ -112,12 +114,12 @@ char * const special_symbols[] = {  };  char *get_special(unsigned char c) { -	if(c == 0x7f) return "⌦"; +	if(c == 0x7f) return "⌦"; /* DEL */  	if(c <= ' ') return special_symbols[c];  	return "?"; /* should never happen */  } -/* set name to use for error messages. this must be called before +/* Set name to use for error messages. This must be called before     open_input(). */  void set_self(const char *argv0) {  	self = strrchr(argv0, '/'); @@ -133,7 +135,8 @@ void print_line(void) {  	printf("%s", left_buf); -	/* line up the rightmost field (human-readable) */ +	/* line up the rightmost field (human-readable), for the partial +	   line at the end of the output (if there is one). 
*/  	while(spacing--) printf("   ");  	if(dump_column < (MAX_DUMP_COLS / 2)) putchar(' '); @@ -214,11 +217,15 @@ void check_utf16(int byte0, int byte1) {  	fprintf(stderr, "%s: input looks like UTF-16, %s-endian\n", self, endian);  } +/* Since we're not fully decoding the code points, we have to check +   for the actual UTF-8 representation of our one special multibyte char. */  int is_bom(unsigned char *b) {  	return (b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);  } -/* U+10FFFF is the last valid codepoint. It encodes to f4 8f bf bf. */ +/* U+10FFFF is the last valid codepoint. It encodes to f4 8f bf bf. +   'count' is the count of continuation bytes only (so, 3 for a 4-byte +   sequence). */  int is_out_of_range(int count, unsigned char *b) {  	if(count < 3) return 0;  	if(b[0] < 0xf4) return 0; @@ -226,7 +233,8 @@ int is_out_of_range(int count, unsigned char *b) {  	return 1;  } -/* return value: false = EOF, true = more data to read */ +/* This is the 'workhorse', called for each character in the file. +   Return value: false = EOF, true = more data to read */  int dump_utf8_char(void) {  	unsigned char bytes[] = { 0, 0, 0, 0, 0 };  	unsigned char *cont_bytes = bytes + 1; | 
