aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorB. Watson <urchlay@slackware.uk>2024-12-18 05:47:07 -0500
committerB. Watson <urchlay@slackware.uk>2024-12-18 05:47:07 -0500
commitc205a7ea2a7171b61dae4ac51a3a251cceb1dde1 (patch)
tree58447b4934f93eb8cb48909fc1efc3b15c72c5ed
parentf467fec27bc25d51020ce482750361c102417efb (diff)
downloaduxd-c205a7ea2a7171b61dae4ac51a3a251cceb1dde1.tar.gz
detect UTF-16 surrogates as bad, use red for overlong
-rw-r--r--uxd.118
-rw-r--r--uxd.c17
-rw-r--r--uxd.rst16
3 files changed, 35 insertions, 16 deletions
diff --git a/uxd.1 b/uxd.1
index ca39178..90b23a3 100644
--- a/uxd.1
+++ b/uxd.1
@@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
..
-.TH "UXD" 1 "2024-12-17" "0.2.1" "Urchlay's Utilities"
+.TH "UXD" 1 "2024-12-18" "0.2.1" "Urchlay's Utilities"
.SH NAME
uxd \- UTF-8 hex dumper
.SH SYNOPSIS
@@ -276,15 +276,11 @@ the space, ↵ for a newline. Hopefully this is an improvement over
the usual practice of printing these as periods, like standard hex
dumpers do. The Unicode BOM (byte order marker, U+FEFF) is printed
as a purple letter B.
-.sp
-Note: Overlong encodings (e.g. codepoints U+0000 to U+007F encoded
-as 2 or more bytes) are rendered as � (U+0FFD) in reverse video
-purple.
.TP
.B \fBred\fP
Invalid UTF\-8 sequences. These are rendered as � (U+FFFD) with
-a red background, to make them stand out. Examples of invalid
-sequences:
+a red background, to make them stand out. Invalid
+sequences are:
.INDENT 7.0
.INDENT 3.5
.INDENT 0.0
@@ -296,8 +292,16 @@ Continuation bytes that aren\(aqt preceded by a valid prefix byte.
.IP \(bu 2
Truncated UTF\-8 sequence at EOF.
.IP \(bu 2
+UTF\-16 surrogates (codepoints U+D800 to U+DFFF).
+.IP \(bu 2
Codepoints above U+10FFFF, which are disallowed by RFC 3629.
+.IP \(bu 2
+Overlong encodings (e.g. codepoints U+0000 to U+007F encoded
+as 2 or more bytes).
.UNINDENT
+.sp
+Each occurrence of any of the above will increment the "Bad
+Sequences" count, if the \fB\-i\fP option is used.
.UNINDENT
.UNINDENT
.UNINDENT
diff --git a/uxd.c b/uxd.c
index e32356b..d141c2e 100644
--- a/uxd.c
+++ b/uxd.c
@@ -425,9 +425,11 @@ void append_color(char *buf, int hl_type) {
bgcolor = 0;
break;
case HL_OVERLONG:
+ /* don't use a separate color for this any more
fgcolor = 0;
bgcolor = special_color;
break;
+ */
case HL_BAD:
default:
fgcolor = 0;
@@ -563,6 +565,12 @@ int is_out_of_range(int cont_count, unsigned char *b) {
return 1;
}
+/* surrogates for UTF-16 (codepoints U+D800 to U+DFFF) are not valid Unicode (therefore not UTF-8). returns 1 if b looks like an encoded surrogate, 0 otherwise. */
+int is_surrogate(int cont_count, unsigned char *b) {
+ if(cont_count != 2) return 0; /* only checks sequences with 2 continuation bytes (presumably cont_count excludes the prefix byte) */
+ return b[0] == 0xed && b[1] > 0x9f; /* ED A0 80 decodes to U+D800; assumes b[1] was already validated as a continuation byte (<= 0xbf) -- TODO confirm against caller */
+}
+
int get_next_byte(void) {
int c;
@@ -655,9 +663,12 @@ int dump_utf8_char(void) {
}
}
- /* don't check bad sequences for out-of-range */
- if(!bad && is_out_of_range(cont_count, bytes))
- bad = 1;
+ /* don't check bad sequences for out-of-range or surrogate */
+ if(!bad) {
+ if(is_out_of_range(cont_count, bytes) || is_surrogate(cont_count, bytes))
+ bad = 1;
+ }
+
if(is_overlong(cont_count, bytes))
overlong = 1;
diff --git a/uxd.rst b/uxd.rst
index 535177d..1789efe 100644
--- a/uxd.rst
+++ b/uxd.rst
@@ -234,14 +234,10 @@ changed with the **-c** option (see above).
dumpers do. The Unicode BOM (byte order marker, U+FEFF) is printed
as a purple letter B.
- Note: Overlong encodings (e.g. codepoints U+0000 to U+007F encoded
- as 2 or more bytes) are rendered as � (U+0FFD) in reverse video
- purple.
-
**red**
Invalid UTF-8 sequences. These are rendered as � (U+FFFD) with
- a red background, to make them stand out. Examples of invalid
- sequences:
+ a red background, to make them stand out. Invalid
+ sequences are:
- Prefix bytes (>= 0x80) which are not followed by the correct number of continuation
bytes (with their high 2 bits set to **10**).
@@ -250,8 +246,16 @@ changed with the **-c** option (see above).
- Truncated UTF-8 sequence at EOF.
+ - UTF-16 surrogates (codepoints U+D800 to U+DFFF).
+
- Codepoints above U+10FFFF, which are disallowed by RFC 3629.
+ - Overlong encodings (e.g. codepoints U+0000 to U+007F encoded
+ as 2 or more bytes).
+
+ Each occurrence of any of the above will increment the "Bad
+ Sequences" count, if the **-i** option is used.
+
TERMINAL SUPPORT
================