From ae5af3deb1137d06214ef95e96998a5c1ebb6746 Mon Sep 17 00:00:00 2001 From: "B. Watson" Date: Fri, 13 Dec 2024 06:21:38 -0500 Subject: commentary (no code changes) --- uxd.c | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) (limited to 'uxd.c') diff --git a/uxd.c b/uxd.c index d71d6d5..41a390b 100644 --- a/uxd.c +++ b/uxd.c @@ -29,25 +29,27 @@ error. If we get a sequence-starter, but the sequence doesn't have the correct number of continuation bytes (e.g. 110xxxxx followed by anything that isn't 10xxxxxx), that's an error too. +Note that we don't actually do a full decode of the codepoint bits. +It's enough to look at the top bits to keep track of multibyte +characters. + BOM: if the file contains ef bb bf (aka U+FEFF), it will be colorized as a special (non-printable). + If the file begins with ff fe, it's UTF-16 (little endian). If it's -fe ff, it's UTF-16 big-endian. We detect these and -print a warning on stderr. +fe ff, it's UTF-16 big-endian. We detect these and print a warning +on stderr. */ -/* max UTF-8 sequence length, in bytes */ -#define MAXUTF8 4 - -/* ANSI color */ -#define BLACK 0 /* don't use */ -#define RED 1 -#define GREEN 2 +/* ANSI colors */ +#define BLACK 0 /* don't use (could be the background color) */ +#define RED 1 +#define GREEN 2 #define YELLOW 3 -#define BLUE 4 /* don't use */ +#define BLUE 4 /* don't use (hard to read on many terminals) */ #define PURPLE 5 -#define CYAN 6 -#define WHITE 7 /* don't use */ +#define CYAN 6 +#define WHITE 7 /* don't use (could be the background color) */ #define SPECIAL PURPLE @@ -112,12 +114,12 @@ char * const special_symbols[] = { }; char *get_special(unsigned char c) { - if(c == 0x7f) return "⌦"; + if(c == 0x7f) return "⌦"; /* DEL */ if(c <= ' ') return special_symbols[c]; return "?"; /* should never happen */ } -/* set name to use for error messages. this must be called before +/* Set name to use for error messages. 
This must be called before open_input(). */ void set_self(const char *argv0) { self = strrchr(argv0, '/'); @@ -133,7 +135,8 @@ void print_line(void) { printf("%s", left_buf); - /* line up the rightmost field (human-readable) */ + /* line up the rightmost field (human-readable), for the partial + line at the end of the output (if there is one). */ while(spacing--) printf(" "); if(dump_column < (MAX_DUMP_COLS / 2)) putchar(' '); @@ -214,11 +217,15 @@ void check_utf16(int byte0, int byte1) { fprintf(stderr, "%s: input looks like UTF-16, %s-endian\n", self, endian); } +/* Since we're not fully decoding the code points, we have to check + for the actual UTF-8 representation of our one special multibyte char. */ int is_bom(unsigned char *b) { return (b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf); } -/* U+10FFFF is the last valid codepoint. It encodes to f4 8f bf bf. */ +/* U+10FFFF is the last valid codepoint. It encodes to f4 8f bf bf. + 'count' is the count of continuation bytes only (so, 3 for a 4-byte + sequence). */ int is_out_of_range(int count, unsigned char *b) { if(count < 3) return 0; if(b[0] < 0xf4) return 0; @@ -226,7 +233,8 @@ int is_out_of_range(int count, unsigned char *b) { return 1; } -/* return value: false = EOF, true = more data to read */ +/* This is the 'workhorse', called for each character in the file. + Return value: false = EOF, true = more data to read */ int dump_utf8_char(void) { unsigned char bytes[] = { 0, 0, 0, 0, 0 }; unsigned char *cont_bytes = bytes + 1; -- cgit v1.2.3