commentary (no code changes)

author: B. Watson <urchlay@slackware.uk> 2024-12-13 06:21:38 -0500
committer: B. Watson <urchlay@slackware.uk> 2024-12-13 06:21:38 -0500
commit: ae5af3deb1137d06214ef95e96998a5c1ebb6746 (patch)
tree: e499f043696ad9b405b9e8daa0c435f7a7ee7ad1 /uxd.c
parent: ec1150407869211a0d4607419986a5f185cd8d30 (diff)
download: uxd-ae5af3deb1137d06214ef95e96998a5c1ebb6746.tar.gz
1 files changed, 25 insertions, 17 deletions
diff --git a/uxd.c b/uxd.c
index d71d6d5..41a390b 100644
--- a/uxd.c
+++ b/uxd.c
@@ -29,25 +29,27 @@ error. If we get a sequence-starter, but the sequence doesn't have
 the correct number of continuation bytes (e.g. 110xxxxx followed by
 anything that isn't 10xxxxxx), that's an error too.
 
+Note that we don't actually do a full decode of the codepoint bits.
+It's enough to look at the top bits to keep track of multibyte
+characters.
+
 BOM: if the file contains ef bb bf (aka U+FEFF), it will be colorized
 as a special (non-printable).
+
 If the file begins with ff fe, it's UTF-16 (little endian). If it's
-fe ff, it's UTF-16 big-endian. We detect these and
-print a warning on stderr.
+fe ff, it's UTF-16 big-endian. We detect these and print a warning
+on stderr.
 */
 
-/* max UTF-8 sequence length, in bytes */
-#define MAXUTF8 4
-
-/* ANSI color */
-#define BLACK 0 /* don't use */
-#define RED 1
-#define GREEN 2
+/* ANSI colors */
+#define BLACK  0 /* don't use (could be the background color) */
+#define RED    1
+#define GREEN  2
 #define YELLOW 3
-#define BLUE 4 /* don't use */
+#define BLUE   4 /* don't use (hard to read on many terminals) */
 #define PURPLE 5
-#define CYAN 6
-#define WHITE 7 /* don't use */
+#define CYAN   6
+#define WHITE  7 /* don't use (could be the background color) */
 
 #define SPECIAL PURPLE
 
@@ -112,12 +114,12 @@ char * const special_symbols[] = {
 };
 
 char *get_special(unsigned char c) {
-	if(c == 0x7f) return "⌦";
+	if(c == 0x7f) return "⌦"; /* DEL (delete) */
 	if(c <= ' ') return special_symbols[c];
 	return "?"; /* should never happen */
 }
 
-/* set name to use for error messages. this must be called before
+/* Set name to use for error messages. This must be called before
    open_input(). */
 void set_self(const char *argv0) {
 	self = strrchr(argv0, '/');
@@ -133,7 +135,8 @@ void print_line(void) {
 
 	printf("%s", left_buf);
 
-	/* line up the rightmost field (human-readable) */
+	/* line up the rightmost field (human-readable), for the partial
+	   line at the end of the output (if there is one). */
 	while(spacing--) printf("   ");
 	if(dump_column < (MAX_DUMP_COLS / 2)) putchar(' ');
 
@@ -214,11 +217,15 @@ void check_utf16(int byte0, int byte1) {
 	fprintf(stderr, "%s: input looks like UTF-16, %s-endian\n", self, endian);
 }
 
+/* Since we're not fully decoding the code points, we have to check
+   for the actual UTF-8 representation of our one special multibyte char. */
 int is_bom(unsigned char *b) {
 	return (b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
 }
 
-/* U+10FFFF is the last valid codepoint. It encodes to f4 8f bf bf. */
+/* U+10FFFF is the last valid codepoint. It encodes to f4 8f bf bf.
+   'count' is the count of continuation bytes only (so, 3 for a 4-byte
+   sequence). */
 int is_out_of_range(int count, unsigned char *b) {
 	if(count < 3) return 0;
 	if(b[0] < 0xf4) return 0;
@@ -226,7 +233,8 @@ int is_out_of_range(int count, unsigned char *b) {
 	return 1;
 }
 
-/* return value: false = EOF, true = more data to read */
+/* This is the 'workhorse', called for each character in the file.
+   Return value: false = EOF, true = more data to read */
 int dump_utf8_char(void) {
 	unsigned char bytes[] = { 0, 0, 0, 0, 0 };
 	unsigned char *cont_bytes = bytes + 1;
author	B. Watson <urchlay@slackware.uk>	2024-12-13 06:21:38 -0500
committer	B. Watson <urchlay@slackware.uk>	2024-12-13 06:21:38 -0500
commit	ae5af3deb1137d06214ef95e96998a5c1ebb6746 (patch)
tree	e499f043696ad9b405b9e8daa0c435f7a7ee7ad1 /uxd.c
parent	ec1150407869211a0d4607419986a5f185cd8d30 (diff)
download	uxd-ae5af3deb1137d06214ef95e96998a5c1ebb6746.tar.gz