aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile6
-rw-r--r--uxd.17
-rw-r--r--uxd.c29
-rw-r--r--uxd.rst7
4 files changed, 34 insertions, 15 deletions
diff --git a/Makefile b/Makefile
index a9ee0e3..0c10d3a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,11 +1,15 @@
CFLAGS=-O2 -fPIC -Wall
+.PHONY: all test man clean
+
all: uxd man
test: uxd
./uxd
-man: uxd.rst
+man: uxd.1
+
+uxd.1: uxd.rst
rst2man uxd.rst > uxd.1
clean:
diff --git a/uxd.1 b/uxd.1
index cb69b28..fe1bb34 100644
--- a/uxd.1
+++ b/uxd.1
@@ -121,6 +121,8 @@ bytes (with their high 2 bits set to \fB10\fP).
Continuation bytes that aren\(aqt preceded by a valid prefix byte.
.IP \(bu 2
Truncated UTF\-8 sequence at EOF.
+.IP \(bu 2
+Codepoints above U+10FFFF, which are disallowed by RFC 3629.
.UNINDENT
.UNINDENT
.UNINDENT
@@ -172,11 +174,6 @@ that could be a 1\-byte sequence, but is encoded as 2 or more).
Sequences like this really should be colorized in red. Technically,
this means \fBuxd\fP supports WTF\-8, not UTF\-8.
.sp
-RFC 3629 doesn\(aqt allow UTF\-8 to use codepoints above U+10FFFF. 4\-byte
-sequences can support codepoints U+110000 to U+1FFFFF, which are not
-valid Unicode. If these occur in the input, \fBuxd\fP should colorize
-them in red, but it doesn\(aqt (yet).
-.sp
There should be options and/or a config file to change the colors,
rather than baking them into the binary.
.sp
diff --git a/uxd.c b/uxd.c
index cff81cf..d71d6d5 100644
--- a/uxd.c
+++ b/uxd.c
@@ -135,6 +135,7 @@ void print_line(void) {
/* line up the rightmost field (human-readable) */
while(spacing--) printf(" ");
+ if(dump_column < (MAX_DUMP_COLS / 2)) putchar(' ');
printf(" %s\n", right_buf);
@@ -178,12 +179,21 @@ void append_left(unsigned char byte, int dash, int fgcolor, int bgcolor) {
append_color(left_buf, fgcolor, bgcolor);
sprintf(tmpbuf, "%02x", byte);
strcat(left_buf, tmpbuf);
- if(dash) strcat(left_buf, "-");
- append_color_off(left_buf);
- if(!dash) strcat(left_buf, " ");
- if(dump_column == 7) strcat(left_buf, " ");
dump_column++;
+
+ if(dash) {
+ strcat(left_buf, "-");
+ if(dump_column == (MAX_DUMP_COLS / 2))
+ strcat(left_buf, "-");
+ append_color_off(left_buf);
+ } else {
+ append_color_off(left_buf);
+ strcat(left_buf, " ");
+ if(dump_column == (MAX_DUMP_COLS / 2))
+ strcat(left_buf, " ");
+ }
+
if(dump_column == MAX_DUMP_COLS)
print_line();
@@ -208,6 +218,14 @@ int is_bom(unsigned char *b) {
return (b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
}
/*
 * U+10FFFF is the last valid codepoint. It encodes to f4 8f bf bf, so a
 * 4-byte sequence is out of range when its lead byte is above f4, or when
 * the lead byte is exactly f4 and the second byte is 90 or above.
 *
 * count: the caller passes its cont_count here -- presumably the number of
 *        continuation bytes after the lead byte (TODO confirm). Values
 *        below 3 are never reported as out of range.
 * b:     bytes of the sequence, lead byte first; b[0] and b[1] are read.
 *
 * Returns 1 if the sequence encodes a value above U+10FFFF, 0 otherwise.
 */
int is_out_of_range(int count, unsigned char *b) {
	if(count < 3) return 0;

	/* f5 and up start at U+140000, whatever the second byte is */
	if(b[0] > 0xf4) return 1;
	if(b[0] < 0xf4) return 0;

	/* lead byte is exactly f4: f4 90 80 80 is U+110000, the first bad one */
	return b[1] >= 0x90;
}
+
/* return value: false = EOF, true = more data to read */
int dump_utf8_char(void) {
unsigned char bytes[] = { 0, 0, 0, 0, 0 };
@@ -265,6 +283,9 @@ int dump_utf8_char(void) {
}
}
+ if(is_out_of_range(cont_count, bytes))
+ bad = 1;
+
if(bad) {
fg = BAD_FG;
bg = BAD_BG;
diff --git a/uxd.rst b/uxd.rst
index 11784fe..c8bdfed 100644
--- a/uxd.rst
+++ b/uxd.rst
@@ -103,6 +103,8 @@ COLORS
- Truncated UTF-8 sequence at EOF.
+ - Codepoints above U+10FFFF, which are disallowed by RFC 3629.
+
TERMINAL SUPPORT
================
@@ -161,11 +163,6 @@ that could be a 1-byte sequence, but is encoded as 2 or more).
Sequences like this really should be colorized in red. Technically,
this means **uxd** supports WTF-8, not UTF-8.
-RFC 3629 doesn't allow UTF-8 to use codepoints above U+10FFFF. 4-byte
-sequences can support codepoints U+110000 to U+1FFFFF, which are not
-valid Unicode. If these occur in the input, **uxd** should colorize
-them in red, but it doesn't (yet).
-
There should be options and/or a config file to change the colors,
rather than baking them into the binary.