detect UTF-16 surrogates as bad, use red for overlong

author: B. Watson <urchlay@slackware.uk> 2024-12-18 05:47:07 -0500
committer: B. Watson <urchlay@slackware.uk> 2024-12-18 05:47:07 -0500
commit: c205a7ea2a7171b61dae4ac51a3a251cceb1dde1 (patch)
tree: 58447b4934f93eb8cb48909fc1efc3b15c72c5ed
parent: f467fec27bc25d51020ce482750361c102417efb (diff)
download: uxd-c205a7ea2a7171b61dae4ac51a3a251cceb1dde1.tar.gz
3 files changed, 35 insertions, 16 deletions
diff --git a/uxd.1 b/uxd.1
index ca39178..90b23a3 100644
--- a/uxd.1
+++ b/uxd.1
@@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.TH "UXD" 1 "2024-12-17" "0.2.1" "Urchlay's Utilities"
+.TH "UXD" 1 "2024-12-18" "0.2.1" "Urchlay's Utilities"
 .SH NAME
 uxd \- UTF-8 hex dumper
 .SH SYNOPSIS
@@ -276,15 +276,11 @@ the space, ↵ for a newline.  Hopefully this is an improvement over
 the usual practice of printing these as periods, like standard hex
 dumpers do. The Unicode BOM (byte order marker, U+FEFF) is printed
 as a purple letter B.
-.sp
-Note: Overlong encodings (e.g. codepoints U+0000 to U+007F encoded
-as 2 or more bytes) are rendered as � (U+0FFD) in reverse video
-purple.
 .TP
 .B \fBred\fP
 Invalid UTF\-8 sequences. These are rendered as � (U+FFFD) with
-a red background, to make them stand out. Examples of invalid
-sequences:
+a red background, to make them stand out. Invalid
+sequences are:
 .INDENT 7.0
 .INDENT 3.5
 .INDENT 0.0
@@ -296,8 +292,16 @@ Continuation bytes that aren\(aqt preceded by a valid prefix byte.
 .IP \(bu 2
 Truncated UTF\-8 sequence at EOF.
 .IP \(bu 2
+UTF\-16 surrogates (codepoints U+D800 to U+DFFF).
+.IP \(bu 2
 Codepoints above U+10FFFF, which are disallowed by RFC 3629.
+.IP \(bu 2
+Overlong encodings (e.g. codepoints U+0000 to U+007F encoded
+as 2 or more bytes).
 .UNINDENT
+.sp
+Each occurrence of any of the above will increment the "Bad
+Sequences" count, if the \fB\-i\fP option is used.
 .UNINDENT
 .UNINDENT
 .UNINDENT
diff --git a/uxd.c b/uxd.c
index e32356b..d141c2e 100644
--- a/uxd.c
+++ b/uxd.c
@@ -425,9 +425,11 @@ void append_color(char *buf, int hl_type) {
 			bgcolor = 0;
 			break;
 		case HL_OVERLONG:
+			/* don't use a separate color for this any more
 			fgcolor = 0;
 			bgcolor = special_color;
 			break;
+			*/
 		case HL_BAD:
 		default:
 			fgcolor = 0;
@@ -563,6 +565,12 @@ int is_out_of_range(int cont_count, unsigned char *b) {
 	return 1;
 }
 
+/* surrogates for UTF-16 (U+D800 to U+DFFF) are not valid Unicode (therefore not UTF-8); returns 1 if b[] encodes one, 0 otherwise */
+int is_surrogate(int cont_count, unsigned char *b) {
+	if(cont_count != 2) return 0; /* only checked when cont_count is 2 (presumably a lead byte plus 2 continuation bytes) */
+	return b[0] == 0xed && b[1] > 0x9f; /* 0xED followed by 0xA0 or higher; no upper bound on b[1] is tested here, the caller only asks about sequences not already marked bad */
+}
+
 int get_next_byte(void) {
 	int c;
 
@@ -655,9 +663,12 @@ int dump_utf8_char(void) {
 		}
 	}
 
-	/* don't check bad sequences for out-of-range */
-	if(!bad && is_out_of_range(cont_count, bytes))
-		bad = 1;
+	/* don't check bad sequences for out-of-range or surrogate */
+	if(!bad) {
+		if(is_out_of_range(cont_count, bytes) || is_surrogate(cont_count, bytes))
+			bad = 1;
+	}
+
 
 	if(is_overlong(cont_count, bytes))
 		overlong = 1;
diff --git a/uxd.rst b/uxd.rst
index 535177d..1789efe 100644
--- a/uxd.rst
+++ b/uxd.rst
@@ -234,14 +234,10 @@ changed with the **-c** option (see above).
   dumpers do. The Unicode BOM (byte order marker, U+FEFF) is printed
   as a purple letter B.
 
-  Note: Overlong encodings (e.g. codepoints U+0000 to U+007F encoded
-  as 2 or more bytes) are rendered as � (U+0FFD) in reverse video
-  purple.
-
 **red**
   Invalid UTF-8 sequences. These are rendered as � (U+FFFD) with
-  a red background, to make them stand out. Examples of invalid
-  sequences:
+  a red background, to make them stand out. Invalid
+  sequences are:
 
     - Prefix bytes (>= 0x80) which are not followed by the correct number of continuation
       bytes (with their high 2 bits set to **10**).
@@ -250,8 +246,16 @@ changed with the **-c** option (see above).
 
     - Truncated UTF-8 sequence at EOF.
 
+    - UTF-16 surrogates (codepoints U+D800 to U+DFFF).
+
     - Codepoints above U+10FFFF, which are disallowed by RFC 3629.
 
+    - Overlong encodings (e.g. codepoints U+0000 to U+007F encoded
+      as 2 or more bytes).
+
+    Each occurrence of any of the above will increment the "Bad
+    Sequences" count, if the **-i** option is used.
+
 TERMINAL SUPPORT
 ================
author	B. Watson <urchlay@slackware.uk>	2024-12-18 05:47:07 -0500
committer	B. Watson <urchlay@slackware.uk>	2024-12-18 05:47:07 -0500
commit	c205a7ea2a7171b61dae4ac51a3a251cceb1dde1 (patch)
tree	58447b4934f93eb8cb48909fc1efc3b15c72c5ed
parent	f467fec27bc25d51020ce482750361c102417efb (diff)
download	uxd-c205a7ea2a7171b61dae4ac51a3a251cceb1dde1.tar.gz