From 343fd43b95960f59a3bf901f59503757b81a5592 Mon Sep 17 00:00:00 2001
From: "B. Watson" <urchlay@slackware.uk>
Date: Thu, 12 Dec 2024 16:46:35 -0500
Subject: fix Makefile, dashes in hex dump, red for codepoints > U+10FFFF, fix
 spacing.

---
 Makefile |  6 +++++-
 uxd.1    |  7 ++-----
 uxd.c    | 29 +++++++++++++++++++++++++----
 uxd.rst  |  7 ++-----
 4 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/Makefile b/Makefile
index a9ee0e3..0c10d3a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,11 +1,15 @@
 CFLAGS=-O2 -fPIC -Wall
 
+.PHONY: all test man clean
+
 all: uxd man
 
 test: uxd
 	./uxd
 
-man: uxd.rst
+man: uxd.1
+
+uxd.1: uxd.rst
 	rst2man uxd.rst > uxd.1
 
 clean:
diff --git a/uxd.1 b/uxd.1
index cb69b28..fe1bb34 100644
--- a/uxd.1
+++ b/uxd.1
@@ -121,6 +121,8 @@ bytes (with their high 2 bits set to \fB10\fP).
 Continuation bytes that aren\(aqt preceded by a valid prefix byte.
 .IP \(bu 2
 Truncated UTF\-8 sequence at EOF.
+.IP \(bu 2
+Codepoints above U+10FFFF, which are disallowed by RFC 3629.
 .UNINDENT
 .UNINDENT
 .UNINDENT
@@ -172,11 +174,6 @@ that could be a 1\-byte sequence, but is encoded as 2 or more).
 Sequences like this really should be colorized in red. Technically,
 this means \fBuxd\fP supports WTF\-8, not UTF\-8.
 .sp
-RFC 3629 doesn\(aqt allow UTF\-8 to use codepoints above U+10FFFF. 4\-byte
-sequences can support codepoints U+110000 to U+1FFFFF, which are not
-valid Unicode. If these occur in the input, \fBuxd\fP should colorize
-them in red, but it doesn\(aqt (yet).
-.sp
 There should be options and/or a config file to change the colors,
 rather than baking them into the binary.
 .sp
diff --git a/uxd.c b/uxd.c
index cff81cf..d71d6d5 100644
--- a/uxd.c
+++ b/uxd.c
@@ -135,6 +135,7 @@ void print_line(void) {
 
 	/* line up the rightmost field (human-readable) */
 	while(spacing--) printf("   ");
+	if(dump_column < (MAX_DUMP_COLS / 2)) putchar(' ');
 
 	printf(" %s\n", right_buf);
 
@@ -178,12 +179,21 @@ void append_left(unsigned char byte, int dash, int fgcolor, int bgcolor) {
 	append_color(left_buf, fgcolor, bgcolor);
 	sprintf(tmpbuf, "%02x", byte);
 	strcat(left_buf, tmpbuf);
-	if(dash) strcat(left_buf, "-");
-	append_color_off(left_buf);
-	if(!dash) strcat(left_buf, " ");
 
-	if(dump_column == 7) strcat(left_buf, " ");
 	dump_column++;
+
+	if(dash) {
+		strcat(left_buf, "-");
+		if(dump_column == (MAX_DUMP_COLS / 2))
+			strcat(left_buf, "-");
+		append_color_off(left_buf);
+	} else {
+		append_color_off(left_buf);
+		strcat(left_buf, " ");
+		if(dump_column == (MAX_DUMP_COLS / 2))
+			strcat(left_buf, " ");
+	}
+
 	if(dump_column == MAX_DUMP_COLS)
 		print_line();
 
@@ -208,6 +218,14 @@ int is_bom(unsigned char *b) {
 	return (b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
 }
 
+/* U+10FFFF (f4 8f bf bf) is the last valid codepoint; return 1 if b[] exceeds it. */
+int is_out_of_range(int count, unsigned char *b) {
+	if(count < 3) return 0; /* caller passes cont_count; only 4-byte forms can exceed */
+	if(b[0] < 0xf4) return 0;
+	if(b[0] > 0xf4) return 1; /* f5 and up start at U+140000, whatever b[1] is */
+	return b[1] >= 0x90; /* f4 90 80 80 is U+110000, the first invalid codepoint */
+}
+
 /* return value: false = EOF, true = more data to read */
 int dump_utf8_char(void) {
 	unsigned char bytes[] = { 0, 0, 0, 0, 0 };
@@ -265,6 +283,9 @@ int dump_utf8_char(void) {
 		}
 	}
 
+	if(is_out_of_range(cont_count, bytes))
+		bad = 1;
+
 	if(bad) {
 		fg = BAD_FG;
 		bg = BAD_BG;
diff --git a/uxd.rst b/uxd.rst
index 11784fe..c8bdfed 100644
--- a/uxd.rst
+++ b/uxd.rst
@@ -103,6 +103,8 @@ COLORS
 
     - Truncated UTF-8 sequence at EOF.
 
+    - Codepoints above U+10FFFF, which are disallowed by RFC 3629.
+
 TERMINAL SUPPORT
 ================
 
@@ -161,11 +163,6 @@ that could be a 1-byte sequence, but is encoded as 2 or more).
 Sequences like this really should be colorized in red. Technically,
 this means **uxd** supports WTF-8, not UTF-8.
 
-RFC 3629 doesn't allow UTF-8 to use codepoints above U+10FFFF. 4-byte
-sequences can support codepoints U+110000 to U+1FFFFF, which are not
-valid Unicode. If these occur in the input, **uxd** should colorize
-them in red, but it doesn't (yet).
-
 There should be options and/or a config file to change the colors,
 rather than baking them into the binary.
 
-- 
cgit v1.2.3