aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorB. Watson <urchlay@slackware.uk>2024-12-17 22:47:36 -0500
committerB. Watson <urchlay@slackware.uk>2024-12-17 22:47:57 -0500
commitf0e0a74cbf43d771075ad2d801197b8072d5b15c (patch)
tree71d2f41619aa4cc39487c850a59e97f90895669b
parent548e7d04b4b2fa60b71615ed590be54016dac52d (diff)
downloaduxd-f0e0a74cbf43d771075ad2d801197b8072d5b15c.tar.gz
uxd.c: add overlong sequence detection; ver.rst: regenerate
-rw-r--r--uxd.125
-rw-r--r--uxd.c44
-rw-r--r--uxd.rst23
-rw-r--r--ver.rst2
4 files changed, 65 insertions, 29 deletions
diff --git a/uxd.1 b/uxd.1
index 35bfc4f..ca39178 100644
--- a/uxd.1
+++ b/uxd.1
@@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
..
-.TH "UXD" 1 "2024-12-17" "0.1.0" "Urchlay's Utilities"
+.TH "UXD" 1 "2024-12-17" "0.2.1" "Urchlay's Utilities"
.SH NAME
uxd \- UTF-8 hex dumper
.SH SYNOPSIS
@@ -270,15 +270,21 @@ changed with the \fB\-c\fP option (see above).
Printable characters (except the space, U+0020) alternate between green and yellow.
.TP
.B \fBpurple\fP
-Spaces and unprintable characters ("control" characters, newlines, tabs, etc).
-These are printed as "visible" characters, e.g. ␣ for the space, ↵ for a newline.
-Hopefully this is an improvement over the usual practice of printing these as periods, like
-standard hex dumpers do. The Unicode BOM (byte order marker, U+FEFF) is printed
+Spaces and unprintable characters ("control" characters, newlines,
+tabs, etc). These are printed as "visible" characters, e.g. ␣ for
+the space, ↵ for a newline. Hopefully this is an improvement over
+the usual practice of printing these as periods, like standard hex
+dumpers do. The Unicode BOM (byte order marker, U+FEFF) is printed
as a purple letter B.
+.sp
+Note: Overlong encodings (e.g. codepoints U+0000 to U+007F encoded
+as 2 or more bytes) are rendered as � (U+FFFD) in reverse video
+purple.
.TP
.B \fBred\fP
-Invalid UTF\-8 sequences. These are rendered with a red background, to make them
-stand out. Examples of invalid sequences:
+Invalid UTF\-8 sequences. These are rendered as � (U+FFFD) with
+a red background, to make them stand out. Examples of invalid
+sequences:
.INDENT 7.0
.INDENT 3.5
.INDENT 0.0
@@ -359,11 +365,6 @@ input file. Invalid input (non\-UTF\-8) doesn\(aqt count as an error;
it\(aqll just have lots of red in the output.
.SH BUGS
.sp
-\fBuxd\fP doesn\(aqt check for overlong UTF\-8 encodings (e.g. a character
-that could be a 1\-byte sequence, but is encoded as 2 or more).
-Sequences like this really should be colorized in red. Technically,
-this means \fBuxd\fP supports WTF\-8, not UTF\-8.
-.sp
There should be options and/or a config file to change the colors,
rather than baking them into the binary.
.sp
diff --git a/uxd.c b/uxd.c
index dbea5f0..38c5862 100644
--- a/uxd.c
+++ b/uxd.c
@@ -77,6 +77,7 @@ int cur_normal_hilite = 0;
#define HL_NORM_INV 1
#define HL_SPECIAL 2
#define HL_BAD 3
+#define HL_OVERLONG 4
/* terminal codes for mono highlighting. */
#define MONO_NORMAL 0
@@ -417,8 +418,12 @@ void append_color(char *buf, int hl_type) {
fgcolor = special_color;
bgcolor = 0;
break;
- default:
+ case HL_OVERLONG:
+ fgcolor = 0;
+ bgcolor = special_color;
+ break;
case HL_BAD:
+ default:
fgcolor = 0;
bgcolor = bad_color;
break;
@@ -447,6 +452,7 @@ void append_mono(char *buf, int hl_type) {
code = MONO_BOLD;
break;
default:
+ case HL_OVERLONG: /* maybe change this later */
case HL_BAD:
code = MONO_REVERSE;
break;
@@ -521,6 +527,26 @@ int is_bom(unsigned char *b) {
return (b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
}
+/* Detect overlong encodings from the leading bytes, without a full decode. */
+int is_overlong(int cont_count, unsigned char *b) {
+	/* 1 byte seqs are never overlong. */
+	if(!cont_count)
+		return 0;
+
+	/* 2 byte seqs, if the first byte is 0xc0 or 0xc1, it's overlong. */
+	if(cont_count == 1 && b[0] <= 0xc1)
+		return 1;
+
+	/* 3 byte seqs: overlong only if e0 is followed by 80..9f (< U+0800). */
+	if(cont_count == 2 && b[0] == 0xe0 && b[1] <= 0x9f)
+		return 1;
+	/* 4 byte seqs: overlong only if f0 is followed by 80..8f (< U+10000). */
+	if(cont_count == 3 && b[0] == 0xf0 && b[1] <= 0x8f)
+		return 1;
+
+	return 0;
+}
+
/* U+10FFFF is the last valid codepoint. It encodes to f4 8f bf bf.
'count' is the count of continuation bytes only (so, 3 for a 4-byte
sequence). */
@@ -560,7 +586,7 @@ int dump_utf8_char(void) {
unsigned char bytes[] = { 0, 0, 0, 0, 0 };
unsigned char *cont_bytes = bytes + 1;
char *printable;
- int bad = 0, special = 0, hl_type;
+ int bad = 0, special = 0, overlong = 0, hl_type;
int c, cont_count, i;
static int byte0;
@@ -625,7 +651,10 @@ int dump_utf8_char(void) {
if(is_out_of_range(cont_count, bytes))
bad = 1;
- if(bad) {
+ if(is_overlong(cont_count, bytes))
+ overlong = 1;
+
+ if(bad || overlong) {
bad_count++;
} else {
char_count++;
@@ -644,8 +673,13 @@ int dump_utf8_char(void) {
hl_type = HL_SPECIAL;
printable = PRINT_BOM;
} else {
- hl_type = HL_NORMAL;
- printable = (char *)bytes;
+ if(overlong) {
+ hl_type = HL_OVERLONG;
+ printable = PRINT_BAD;
+ } else {
+ hl_type = HL_NORMAL;
+ printable = (char *)bytes;
+ }
}
/* human-readable (right) column: */
diff --git a/uxd.rst b/uxd.rst
index 597084b..535177d 100644
--- a/uxd.rst
+++ b/uxd.rst
@@ -227,15 +227,21 @@ changed with the **-c** option (see above).
Printable characters (except the space, U+0020) alternate between green and yellow.
**purple**
- Spaces and unprintable characters ("control" characters, newlines, tabs, etc).
- These are printed as "visible" characters, e.g. ␣ for the space, ↵ for a newline.
- Hopefully this is an improvement over the usual practice of printing these as periods, like
- standard hex dumpers do. The Unicode BOM (byte order marker, U+FEFF) is printed
+ Spaces and unprintable characters ("control" characters, newlines,
+ tabs, etc). These are printed as "visible" characters, e.g. ␣ for
+ the space, ↵ for a newline. Hopefully this is an improvement over
+ the usual practice of printing these as periods, like standard hex
+ dumpers do. The Unicode BOM (byte order marker, U+FEFF) is printed
as a purple letter B.
+ Note: Overlong encodings (e.g. codepoints U+0000 to U+007F encoded
+ as 2 or more bytes) are rendered as � (U+FFFD) in reverse video
+ purple.
+
**red**
- Invalid UTF-8 sequences. These are rendered with a red background, to make them
- stand out. Examples of invalid sequences:
+ Invalid UTF-8 sequences. These are rendered as � (U+FFFD) with
+ a red background, to make them stand out. Examples of invalid
+ sequences:
- Prefix bytes (>= 0x80) which are not followed by the correct number of continuation
bytes (with their high 2 bits set to **10**).
@@ -319,11 +325,6 @@ it'll just have lots of red in the output.
BUGS
====
-**uxd** doesn't check for overlong UTF-8 encodings (e.g. a character
-that could be a 1-byte sequence, but is encoded as 2 or more).
-Sequences like this really should be colorized in red. Technically,
-this means **uxd** supports WTF-8, not UTF-8.
-
There should be options and/or a config file to change the colors,
rather than baking them into the binary.
diff --git a/ver.rst b/ver.rst
index 6b13b18..6f54bd4 100644
--- a/ver.rst
+++ b/ver.rst
@@ -1 +1 @@
-.. |version| replace:: 0.1.0
+.. |version| replace:: 0.2.1