From f0e0a74cbf43d771075ad2d801197b8072d5b15c Mon Sep 17 00:00:00 2001
From: "B. Watson" <urchlay@slackware.uk>
Date: Tue, 17 Dec 2024 22:47:36 -0500
Subject: uxd.c: add overlong sequence detection; ver.rst: regenerate

---
 uxd.1   | 25 +++++++++++++------------
 uxd.c   | 44 +++++++++++++++++++++++++++++++++++++++-----
 uxd.rst | 23 ++++++++++++-----------
 ver.rst |  2 +-
 4 files changed, 65 insertions(+), 29 deletions(-)

diff --git a/uxd.1 b/uxd.1
index 35bfc4f..ca39178 100644
--- a/uxd.1
+++ b/uxd.1
@@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.TH "UXD" 1 "2024-12-17" "0.1.0" "Urchlay's Utilities"
+.TH "UXD" 1 "2024-12-17" "0.2.1" "Urchlay's Utilities"
 .SH NAME
 uxd \- UTF-8 hex dumper
 .SH SYNOPSIS
@@ -270,15 +270,21 @@ changed with the \fB\-c\fP option (see above).
 Printable characters (except the space, U+0020) alternate between green and yellow.
 .TP
 .B \fBpurple\fP
-Spaces and unprintable characters ("control" characters, newlines, tabs, etc).
-These are printed as "visible" characters, e.g. ␣ for the space, ↵ for a newline.
-Hopefully this is an improvement over the usual practice of printing these as periods, like
-standard hex dumpers do. The Unicode BOM (byte order marker, U+FEFF) is printed
+Spaces and unprintable characters ("control" characters, newlines,
+tabs, etc).  These are printed as "visible" characters, e.g. ␣ for
+the space, ↵ for a newline.  Hopefully this is an improvement over
+the usual practice of printing these as periods, like standard hex
+dumpers do. The Unicode BOM (byte order marker, U+FEFF) is printed
 as a purple letter B.
+.sp
+Note: Overlong encodings (e.g. codepoints U+0000 to U+007F encoded
+as 2 or more bytes) are rendered as � (U+FFFD) in reverse video
+purple.
 .TP
 .B \fBred\fP
-Invalid UTF\-8 sequences. These are rendered with a red background, to make them
-stand out. Examples of invalid sequences:
+Invalid UTF\-8 sequences. These are rendered as � (U+FFFD) with
+a red background, to make them stand out. Examples of invalid
+sequences:
 .INDENT 7.0
 .INDENT 3.5
 .INDENT 0.0
@@ -359,11 +365,6 @@ input file. Invalid input (non\-UTF\-8) doesn\(aqt count as an error;
 it\(aqll just have lots of red in the output.
 .SH BUGS
 .sp
-\fBuxd\fP doesn\(aqt check for overlong UTF\-8 encodings (e.g. a character
-that could be a 1\-byte sequence, but is encoded as 2 or more).
-Sequences like this really should be colorized in red. Technically,
-this means \fBuxd\fP supports WTF\-8, not UTF\-8.
-.sp
 There should be options and/or a config file to change the colors,
 rather than baking them into the binary.
 .sp
diff --git a/uxd.c b/uxd.c
index dbea5f0..38c5862 100644
--- a/uxd.c
+++ b/uxd.c
@@ -77,6 +77,7 @@ int cur_normal_hilite = 0;
 #define HL_NORM_INV 1
 #define HL_SPECIAL 2
 #define HL_BAD 3
+#define HL_OVERLONG 4
 
 /* terminal codes for mono highlighting. */
 #define MONO_NORMAL 0
@@ -417,8 +418,12 @@ void append_color(char *buf, int hl_type) {
 			fgcolor = special_color;
 			bgcolor = 0;
 			break;
-		default:
+		case HL_OVERLONG:
+			fgcolor = 0;
+			bgcolor = special_color;
+			break;
 		case HL_BAD:
+		default:
 			fgcolor = 0;
 			bgcolor = bad_color;
 			break;
@@ -447,6 +452,7 @@ void append_mono(char *buf, int hl_type) {
 			code = MONO_BOLD;
 			break;
 		default:
+		case HL_OVERLONG: /* maybe change this later */
 		case HL_BAD:
 			code = MONO_REVERSE;
 			break;
@@ -521,6 +527,26 @@ int is_bom(unsigned char *b) {
 	return (b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
 }
 
+/* Detect overlong encodings, without doing a full decode.
+   'cont_count' is the count of continuation bytes only; 'b' points
+   to the lead byte. Returns 1 if overlong, 0 if not (1 byte seqs,
+   cont_count 0, are never overlong). */
+int is_overlong(int cont_count, unsigned char *b) {
+	/* 2 byte seqs: lead bytes c0 and c1 only reach U+0000..U+007F. */
+	if(cont_count == 1 && b[0] <= 0xc1)
+		return 1;
+
+	/* 3 byte seqs: only e0 followed by 80..9f (below U+0800). */
+	if(cont_count == 2 && b[0] == 0xe0 && b[1] <= 0x9f)
+		return 1;
+
+	/* 4 byte seqs: only f0 followed by 80..8f (below U+10000). */
+	if(cont_count == 3 && b[0] == 0xf0 && b[1] <= 0x8f)
+		return 1;
+
+	return 0;
+}
+
 /* U+10FFFF is the last valid codepoint. It encodes to f4 8f bf bf.
    'count' is the count of continuation bytes only (so, 3 for a 4-byte
    sqeuence). */
@@ -560,7 +586,7 @@ int dump_utf8_char(void) {
 	unsigned char bytes[] = { 0, 0, 0, 0, 0 };
 	unsigned char *cont_bytes = bytes + 1;
 	char *printable;
-	int bad = 0, special = 0, hl_type;
+	int bad = 0, special = 0, overlong = 0, hl_type;
 	int c, cont_count, i;
 	static int byte0;
 
@@ -625,7 +651,10 @@ int dump_utf8_char(void) {
 	if(is_out_of_range(cont_count, bytes))
 		bad = 1;
 
-	if(bad) {
+	if(is_overlong(cont_count, bytes))
+		overlong = 1;
+
+	if(bad || overlong) {
 		bad_count++;
 	} else {
 		char_count++;
@@ -644,8 +673,13 @@ int dump_utf8_char(void) {
 		hl_type = HL_SPECIAL;
 		printable = PRINT_BOM;
 	} else {
-		hl_type = HL_NORMAL;
-		printable = (char *)bytes;
+		if(overlong) {
+			hl_type = HL_OVERLONG;
+			printable = PRINT_BAD;
+		} else {
+			hl_type = HL_NORMAL;
+			printable = (char *)bytes;
+		}
 	}
 
 	/* human-readable (right) column: */
diff --git a/uxd.rst b/uxd.rst
index 597084b..535177d 100644
--- a/uxd.rst
+++ b/uxd.rst
@@ -227,15 +227,21 @@ changed with the **-c** option (see above).
   Printable characters (except the space, U+0020) alternate between green and yellow.
 
 **purple**
-  Spaces and unprintable characters ("control" characters, newlines, tabs, etc).
-  These are printed as "visible" characters, e.g. ␣ for the space, ↵ for a newline.
-  Hopefully this is an improvement over the usual practice of printing these as periods, like
-  standard hex dumpers do. The Unicode BOM (byte order marker, U+FEFF) is printed
+  Spaces and unprintable characters ("control" characters, newlines,
+  tabs, etc).  These are printed as "visible" characters, e.g. ␣ for
+  the space, ↵ for a newline.  Hopefully this is an improvement over
+  the usual practice of printing these as periods, like standard hex
+  dumpers do. The Unicode BOM (byte order marker, U+FEFF) is printed
   as a purple letter B.
 
+  Note: Overlong encodings (e.g. codepoints U+0000 to U+007F encoded
+  as 2 or more bytes) are rendered as � (U+FFFD) in reverse video
+  purple.
+
 **red**
-  Invalid UTF-8 sequences. These are rendered with a red background, to make them
-  stand out. Examples of invalid sequences:
+  Invalid UTF-8 sequences. These are rendered as � (U+FFFD) with
+  a red background, to make them stand out. Examples of invalid
+  sequences:
 
     - Prefix bytes (>= 0x80) which are not followed by the correct number of continuation
       bytes (with their high 2 bits set to **10**).
@@ -319,11 +325,6 @@ it'll just have lots of red in the output.
 BUGS
 ====
 
-**uxd** doesn't check for overlong UTF-8 encodings (e.g. a character
-that could be a 1-byte sequence, but is encoded as 2 or more).
-Sequences like this really should be colorized in red. Technically,
-this means **uxd** supports WTF-8, not UTF-8.
-
 There should be options and/or a config file to change the colors,
 rather than baking them into the binary.
 
diff --git a/ver.rst b/ver.rst
index 6b13b18..6f54bd4 100644
--- a/ver.rst
+++ b/ver.rst
@@ -1 +1 @@
-.. |version| replace:: 0.1.0
+.. |version| replace:: 0.2.1
-- 
cgit v1.2.3