diff options
author | B. Watson <urchlay@slackware.uk> | 2024-12-18 05:47:07 -0500 |
---|---|---|
committer | B. Watson <urchlay@slackware.uk> | 2024-12-18 05:47:07 -0500 |
commit | c205a7ea2a7171b61dae4ac51a3a251cceb1dde1 (patch) | |
tree | 58447b4934f93eb8cb48909fc1efc3b15c72c5ed | |
parent | f467fec27bc25d51020ce482750361c102417efb (diff) | |
download | uxd-c205a7ea2a7171b61dae4ac51a3a251cceb1dde1.tar.gz |
detect UTF-16 surrogates as bad, use red for overlong
-rw-r--r-- | uxd.1 | 18 | ||||
-rw-r--r-- | uxd.c | 17 | ||||
-rw-r--r-- | uxd.rst | 16 |
3 files changed, 35 insertions, 16 deletions
@@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "UXD" 1 "2024-12-17" "0.2.1" "Urchlay's Utilities" +.TH "UXD" 1 "2024-12-18" "0.2.1" "Urchlay's Utilities" .SH NAME uxd \- UTF-8 hex dumper .SH SYNOPSIS @@ -276,15 +276,11 @@ the space, ↵ for a newline. Hopefully this is an improvement over the usual practice of printing these as periods, like standard hex dumpers do. The Unicode BOM (byte order marker, U+FEFF) is printed as a purple letter B. -.sp -Note: Overlong encodings (e.g. codepoints U+0000 to U+007F encoded -as 2 or more bytes) are rendered as � (U+0FFD) in reverse video -purple. .TP .B \fBred\fP Invalid UTF\-8 sequences. These are rendered as � (U+0FFD) with -a red background, to make them stand out. Examples of invalid -sequences: +a red background, to make them stand out. Invalid +sequences are: .INDENT 7.0 .INDENT 3.5 .INDENT 0.0 @@ -296,8 +292,16 @@ Continuation bytes that aren\(aqt preceded by a valid prefix byte. .IP \(bu 2 Truncated UTF\-8 sequence at EOF. .IP \(bu 2 +UTF\-16 surrogates (codepoints U+D800 to U+DFFF). +.IP \(bu 2 Codepoints above U+10FFFF, which are disallowed by RFC 3629. +.IP \(bu 2 +Overlong encodings (e.g. codepoints U+0000 to U+007F encoded +as 2 or more bytes). .UNINDENT +.sp +Each occurrence of any of the above will increment the "Bad +Sequences" count, if the \fB\-i\fP option is used. .UNINDENT .UNINDENT .UNINDENT @@ -425,9 +425,11 @@ void append_color(char *buf, int hl_type) { bgcolor = 0; break; case HL_OVERLONG: + /* don't use a separate color for this any more fgcolor = 0; bgcolor = special_color; break; + */ case HL_BAD: default: fgcolor = 0; @@ -563,6 +565,12 @@ int is_out_of_range(int cont_count, unsigned char *b) { return 1; } +/* surrogates for UTF-16 are not valid Unicode (therefore not UTF-8) */ +int is_surrogate(int cont_count, unsigned char *b) { + if(cont_count != 2) return 0; + return b[0] == 0xed && b[1] > 0x9f; +} + int get_next_byte(void) { int c; @@ -655,9 +663,12 @@ int dump_utf8_char(void) { } } - /* don't check bad sequences for out-of-range */ - if(!bad && is_out_of_range(cont_count, bytes)) - bad = 1; + /* don't check bad sequences for out-of-range or surrogate */ + if(!bad) { + if(is_out_of_range(cont_count, bytes) || is_surrogate(cont_count, bytes)) + bad = 1; + } + if(is_overlong(cont_count, bytes)) overlong = 1; @@ -234,14 +234,10 @@ changed with the **-c** option (see above). dumpers do. The Unicode BOM (byte order marker, U+FEFF) is printed as a purple letter B. - Note: Overlong encodings (e.g. codepoints U+0000 to U+007F encoded - as 2 or more bytes) are rendered as � (U+0FFD) in reverse video - purple. - **red** Invalid UTF-8 sequences. These are rendered as � (U+0FFD) with - a red background, to make them stand out. Examples of invalid - sequences: + a red background, to make them stand out. Invalid + sequences are: - Prefix bytes (>= 0x80) which are not followed by the correct number of continuation bytes (with their high 2 bits set to **10**). @@ -250,8 +246,16 @@ changed with the **-c** option (see above). - Truncated UTF-8 sequence at EOF. + - UTF-16 surrogates (codepoints U+D800 to U+DFFF). + - Codepoints above U+10FFFF, which are disallowed by RFC 3629. + - Overlong encodings (e.g. codepoints U+0000 to U+007F encoded + as 2 or more bytes). + + Each occurrence of any of the above will increment the "Bad + Sequences" count, if the **-i** option is used. + TERMINAL SUPPORT ================ |