From 343fd43b95960f59a3bf901f59503757b81a5592 Mon Sep 17 00:00:00 2001 From: "B. Watson" Date: Thu, 12 Dec 2024 16:46:35 -0500 Subject: fix Makefile, dashes in hex dump, red for codepoints > U+10FFFF, fix spacing. --- Makefile | 6 +++++- uxd.1 | 7 ++----- uxd.c | 29 +++++++++++++++++++++++++---- uxd.rst | 7 ++----- 4 files changed, 34 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index a9ee0e3..0c10d3a 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,15 @@ CFLAGS=-O2 -fPIC -Wall +.PHONY: all test man clean + all: uxd man test: uxd ./uxd -man: uxd.rst +man: uxd.1 + +uxd.1: uxd.rst rst2man uxd.rst > uxd.1 clean: diff --git a/uxd.1 b/uxd.1 index cb69b28..fe1bb34 100644 --- a/uxd.1 +++ b/uxd.1 @@ -121,6 +121,8 @@ bytes (with their high 2 bits set to \fB10\fP). Continuation bytes that aren\(aqt preceded by a valid prefix byte. .IP \(bu 2 Truncated UTF\-8 sequence at EOF. +.IP \(bu 2 +Codepoints above U+10FFFF, which are disallowed by RFC 3629. .UNINDENT .UNINDENT .UNINDENT @@ -172,11 +174,6 @@ that could be a 1\-byte sequence, but is encoded as 2 or more). Sequences like this really should be colorized in red. Technically, this means \fBuxd\fP supports WTF\-8, not UTF\-8. .sp -RFC 3629 doesn\(aqt allow UTF\-8 to use codepoints above U+10FFFF. 4\-byte -sequences can support codepoints U+110000 to U+1FFFFF, which are not -valid Unicode. If these occur in the input, \fBuxd\fP should colorize -them in red, but it doesn\(aqt (yet). -.sp There should be options and/or a config file to change the colors, rather than baking them into the binary. .sp diff --git a/uxd.c b/uxd.c index cff81cf..d71d6d5 100644 --- a/uxd.c +++ b/uxd.c @@ -135,6 +135,7 @@ void print_line(void) { /* line up the rightmost field (human-readable) */ while(spacing--) printf(" "); + if(dump_column < (MAX_DUMP_COLS / 2)) putchar(' '); printf(" %s\n", right_buf); @@ -178,12 +179,21 @@ void append_left(unsigned char byte, int dash, int fgcolor, int bgcolor) { append_color(left_buf, fgcolor, bgcolor); sprintf(tmpbuf, "%02x", byte); strcat(left_buf, tmpbuf); - if(dash) strcat(left_buf, "-"); - append_color_off(left_buf); - if(!dash) strcat(left_buf, " "); - if(dump_column == 7) strcat(left_buf, " "); dump_column++; + + if(dash) { + strcat(left_buf, "-"); + if(dump_column == (MAX_DUMP_COLS / 2)) + strcat(left_buf, "-"); + append_color_off(left_buf); + } else { + append_color_off(left_buf); + strcat(left_buf, " "); + if(dump_column == (MAX_DUMP_COLS / 2)) + strcat(left_buf, " "); + } + if(dump_column == MAX_DUMP_COLS) print_line(); @@ -208,6 +218,14 @@ int is_bom(unsigned char *b) { return (b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf); } +/* U+10FFFF is the last valid codepoint. It encodes to f4 8f bf bf. */ +int is_out_of_range(int count, unsigned char *b) { + if(count < 3) return 0; + if(b[0] < 0xf4) return 0; + if(b[1] < 0x90) return 0; + return 1; +} + /* return value: false = EOF, true = more data to read */ int dump_utf8_char(void) { unsigned char bytes[] = { 0, 0, 0, 0, 0 }; @@ -265,6 +283,9 @@ int dump_utf8_char(void) { } } + if(is_out_of_range(cont_count, bytes)) + bad = 1; + if(bad) { fg = BAD_FG; bg = BAD_BG; diff --git a/uxd.rst b/uxd.rst index 11784fe..c8bdfed 100644 --- a/uxd.rst +++ b/uxd.rst @@ -103,6 +103,8 @@ COLORS - Truncated UTF-8 sequence at EOF. + - Codepoints above U+10FFFF, which are disallowed by RFC 3629. + TERMINAL SUPPORT ================ @@ -161,11 +163,6 @@ that could be a 1-byte sequence, but is encoded as 2 or more). Sequences like this really should be colorized in red. Technically, this means **uxd** supports WTF-8, not UTF-8. -RFC 3629 doesn't allow UTF-8 to use codepoints above U+10FFFF. 4-byte -sequences can support codepoints U+110000 to U+1FFFFF, which are not -valid Unicode. If these occur in the input, **uxd** should colorize -them in red, but it doesn't (yet). - There should be options and/or a config file to change the colors, rather than baking them into the binary. -- cgit v1.2.3