From f0e0a74cbf43d771075ad2d801197b8072d5b15c Mon Sep 17 00:00:00 2001 From: "B. Watson" Date: Tue, 17 Dec 2024 22:47:36 -0500 Subject: uxd.c: add overlong sequence detection; ver.rst: regenerate --- uxd.1 | 25 +++++++++++++------------ uxd.c | 44 +++++++++++++++++++++++++++++++++++++++----- uxd.rst | 23 ++++++++++++----------- ver.rst | 2 +- 4 files changed, 65 insertions(+), 29 deletions(-) diff --git a/uxd.1 b/uxd.1 index 35bfc4f..ca39178 100644 --- a/uxd.1 +++ b/uxd.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "UXD" 1 "2024-12-17" "0.1.0" "Urchlay's Utilities" +.TH "UXD" 1 "2024-12-17" "0.2.1" "Urchlay's Utilities" .SH NAME uxd \- UTF-8 hex dumper .SH SYNOPSIS @@ -270,15 +270,21 @@ changed with the \fB\-c\fP option (see above). Printable characters (except the space, U+0020) alternate between green and yellow. .TP .B \fBpurple\fP -Spaces and unprintable characters ("control" characters, newlines, tabs, etc). -These are printed as "visible" characters, e.g. ␣ for the space, ↵ for a newline. -Hopefully this is an improvement over the usual practice of printing these as periods, like -standard hex dumpers do. The Unicode BOM (byte order marker, U+FEFF) is printed +Spaces and unprintable characters ("control" characters, newlines, +tabs, etc). These are printed as "visible" characters, e.g. ␣ for +the space, ↵ for a newline. Hopefully this is an improvement over +the usual practice of printing these as periods, like standard hex +dumpers do. The Unicode BOM (byte order marker, U+FEFF) is printed as a purple letter B. +.sp +Note: Overlong encodings (e.g. codepoints U+0000 to U+007F encoded +as 2 or more bytes) are rendered as � (U+FFFD) in reverse video +purple. .TP .B \fBred\fP -Invalid UTF\-8 sequences. These are rendered with a red background, to make them -stand out. 
Examples of invalid sequences: +Invalid UTF\-8 sequences. These are rendered as � (U+FFFD) with +a red background, to make them stand out. Examples of invalid +sequences: .INDENT 7.0 .INDENT 3.5 .INDENT 0.0 @@ -359,11 +365,6 @@ input file. Invalid input (non\-UTF\-8) doesn\(aqt count as an error; it\(aqll just have lots of red in the output. .SH BUGS .sp -\fBuxd\fP doesn\(aqt check for overlong UTF\-8 encodings (e.g. a character -that could be a 1\-byte sequence, but is encoded as 2 or more). -Sequences like this really should be colorized in red. Technically, -this means \fBuxd\fP supports WTF\-8, not UTF\-8. -.sp There should be options and/or a config file to change the colors, rather than baking them into the binary. .sp diff --git a/uxd.c b/uxd.c index dbea5f0..38c5862 100644 --- a/uxd.c +++ b/uxd.c @@ -77,6 +77,7 @@ int cur_normal_hilite = 0; #define HL_NORM_INV 1 #define HL_SPECIAL 2 #define HL_BAD 3 +#define HL_OVERLONG 4 /* terminal codes for mono highlighting. */ #define MONO_NORMAL 0 @@ -417,8 +418,12 @@ void append_color(char *buf, int hl_type) { fgcolor = special_color; bgcolor = 0; break; - default: + case HL_OVERLONG: + fgcolor = 0; + bgcolor = special_color; + break; case HL_BAD: + default: fgcolor = 0; bgcolor = bad_color; break; @@ -447,6 +452,7 @@ void append_mono(char *buf, int hl_type) { code = MONO_BOLD; break; default: + case HL_OVERLONG: /* maybe change this later */ case HL_BAD: code = MONO_REVERSE; break; @@ -521,6 +527,26 @@ int is_bom(unsigned char *b) { return (b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf); } +/* Detect overlong encodings, without doing a full decode. 'cont_count' is the number of continuation bytes, 'b' points at the lead byte. Returns 1 if overlong, 0 if not. */ +int is_overlong(int cont_count, unsigned char *b) { + /* 1 byte seqs are never overlong. */ + if(!cont_count) + return 0; + + /* 2 byte seqs, if the first byte is 0xc0 or 0xc1, it's overlong. */ + if(cont_count == 1 && b[0] <= 0xc1) + return 1; + + /* 3 and 4 byte seqs are overlong only with the lowest lead byte (0xe0 or 0xf0) and a 2nd byte below 0xa0 or 0x90; other lead bytes (e.g. e2 82 ac, the euro sign) are fine. 
*/ + if(cont_count == 2 && b[0] == 0xe0 && b[1] <= 0x9f) + return 1; + + if(cont_count == 3 && b[0] == 0xf0 && b[1] <= 0x8f) + return 1; + + return 0; +} + /* U+10FFFF is the last valid codepoint. It encodes to f4 8f bf bf. 'count' is the count of continuation bytes only (so, 3 for a 4-byte sqeuence). */ @@ -560,7 +586,7 @@ int dump_utf8_char(void) { unsigned char bytes[] = { 0, 0, 0, 0, 0 }; unsigned char *cont_bytes = bytes + 1; char *printable; - int bad = 0, special = 0, hl_type; + int bad = 0, special = 0, overlong = 0, hl_type; int c, cont_count, i; static int byte0; @@ -625,7 +651,10 @@ int dump_utf8_char(void) { if(is_out_of_range(cont_count, bytes)) bad = 1; - if(bad) { + if(is_overlong(cont_count, bytes)) + overlong = 1; + + if(bad || overlong) { bad_count++; } else { char_count++; @@ -644,8 +673,13 @@ int dump_utf8_char(void) { hl_type = HL_SPECIAL; printable = PRINT_BOM; } else { - hl_type = HL_NORMAL; - printable = (char *)bytes; + if(overlong) { + hl_type = HL_OVERLONG; + printable = PRINT_BAD; + } else { + hl_type = HL_NORMAL; + printable = (char *)bytes; + } } /* human-readable (right) column: */ diff --git a/uxd.rst b/uxd.rst index 597084b..535177d 100644 --- a/uxd.rst +++ b/uxd.rst @@ -227,15 +227,21 @@ changed with the **-c** option (see above). Printable characters (except the space, U+0020) alternate between green and yellow. **purple** - Spaces and unprintable characters ("control" characters, newlines, tabs, etc). - These are printed as "visible" characters, e.g. ␣ for the space, ↵ for a newline. - Hopefully this is an improvement over the usual practice of printing these as periods, like - standard hex dumpers do. The Unicode BOM (byte order marker, U+FEFF) is printed + Spaces and unprintable characters ("control" characters, newlines, + tabs, etc). These are printed as "visible" characters, e.g. ␣ for + the space, ↵ for a newline. Hopefully this is an improvement over + the usual practice of printing these as periods, like standard hex + dumpers do. 
The Unicode BOM (byte order marker, U+FEFF) is printed as a purple letter B. + Note: Overlong encodings (e.g. codepoints U+0000 to U+007F encoded + as 2 or more bytes) are rendered as � (U+FFFD) in reverse video + purple. + **red** - Invalid UTF-8 sequences. These are rendered with a red background, to make them - stand out. Examples of invalid sequences: + Invalid UTF-8 sequences. These are rendered as � (U+FFFD) with + a red background, to make them stand out. Examples of invalid + sequences: - Prefix bytes (>= 0x80) which are not followed by the correct number of continuation bytes (with their high 2 bits set to **10**). @@ -319,11 +325,6 @@ it'll just have lots of red in the output. BUGS ==== -**uxd** doesn't check for overlong UTF-8 encodings (e.g. a character -that could be a 1-byte sequence, but is encoded as 2 or more). -Sequences like this really should be colorized in red. Technically, -this means **uxd** supports WTF-8, not UTF-8. - There should be options and/or a config file to change the colors, rather than baking them into the binary. diff --git a/ver.rst b/ver.rst index 6b13b18..6f54bd4 100644 --- a/ver.rst +++ b/ver.rst @@ -1 +1 @@ -.. |version| replace:: 0.1.0 +.. |version| replace:: 0.2.1 -- cgit v1.2.3