From d0b8532b703ef515b89eb8f34c0402262f3d3f7e Mon Sep 17 00:00:00 2001 From: "B. Watson" Date: Wed, 18 Dec 2024 07:05:01 -0500 Subject: add -j/-p/-w options. --- usage.c | 2 ++ uxd.1 | 42 +++++++++++++++++++++++++++++++++++++++--- uxd.c | 34 +++++++++++++++++++++++++--------- uxd.rst | 40 +++++++++++++++++++++++++++++++++++++--- 4 files changed, 103 insertions(+), 15 deletions(-) diff --git a/usage.c b/usage.c index ae9a754..a957552 100644 --- a/usage.c +++ b/usage.c @@ -5,6 +5,7 @@ char *usage_opts[] = { " -d data: dump this data instead of a file.", " -h, --help: print this help message.", " -i: print number of bytes/chars/ascii/multibyte/bad sequences.", + " -j: java (MUTF-8) mode: allow 0xc0 0x80 for U+0000.", " -l length: stop dumping after bytes (not characters).", " -m: monochrome mode.", " -n: ignore UXD_OPTS environment variable.", @@ -14,5 +15,6 @@ char *usage_opts[] = { " -S pos: like -s, but also sets -o so addresses start at 0.", " -u: uppercase hex digits.", " -v, --version: print version of uxd.", + " -w: WTF-8 mode (allow surrogates).", (char *)0 }; diff --git a/uxd.1 b/uxd.1 index 6258f9c..8ada567 100644 --- a/uxd.1 +++ b/uxd.1 @@ -121,6 +121,17 @@ bad sequences. . .INDENT 0.0 .TP +.B \-j +Java mode (aka MUTF\-8). Identical to UTF\-8 except it allows the +overlong \fB0xc0 0x80\fP encoding for codepoint U+0000 (aka NUL), +which normally would be considered an error. +This may be useful for looking at serialized data created by Java +programs. +.UNINDENT +.\" java (MUTF-8) mode: allow 0xc0 0x80 for U+0000. +. +.INDENT 0.0 +.TP .BI \-l \ length Stop dumping after \fIlength\fP bytes (not characters). If the limit is reached in the middle of a multibyte character, the entire character @@ -157,6 +168,11 @@ if you can think of a reason to want it to be. . .INDENT 0.0 .TP +.B \-p +Permissive mode. Turns off error highlighting for overlongs, codepoints +above \fBU+10FFFF\fP, and surrogates. Only malformed sequences will be +highlighted in red. 
+.TP .B \-r Highlight multi\-byte sequences in reverse video, in the hex output. Ignored if \fB\-m\fP given. @@ -218,6 +234,13 @@ Print version number and exit. .UNINDENT .\" print version of uxd. . +.INDENT 0.0 +.TP +.B \-w +WTF\-8 mode. Surrogates \fBU+D800\fP to \fBU+DFFF\fP will not be considered errors. +.UNINDENT +.\" WTF-8 mode (allow surrogates). +. .SH OUTPUT FORMAT .sp The output is designed to fit in an 80\-column terminal. @@ -389,10 +412,23 @@ Zero for success, non\-zero for failure. Failure status will only be returned if \fBuxd\fP failed to open the input file. Invalid input (non\-UTF\-8) doesn\(aqt count as an error; it\(aqll just have lots of red in the output. -.SH BUGS +.SH LIMITATIONS +.sp +These are not bugs, because they\(aqre part of the design. +.sp +Only UTF\-8 and a couple of variants (WTF\-8, MUTF\-8) are supported. +There is no support for UTF\-16, UTF\-32, UTF\-EBCDIC, or any other +non\-UTF\-8 encoding. .sp -There should be options and/or a config file to change the colors, -rather than baking them into the binary. +There\(aqs no support for any number base except hex. +.sp +The input is read one byte at a time, so a search or regex match +option would be difficult or impossible to implement. +.sp +Seeking backwards from the end of the file is impossible when reading +from standard input. The only way to fake this would be to read the +whole file into memory at startup, which \fBuxd\fP doesn\(aqt do. +.SH BUGS .sp Combining characters are not handled well. Or at all, really: the 2 characters being combined will have an ANSI color code in between. 
+diff --git a/uxd.c b/uxd.c index e839330..b30b22b 100644 --- a/uxd.c +++ b/uxd.c @@ -150,6 +150,9 @@ char *dump_data_arg = NULL; /* -d */ long dump_data_idx = 0; /* -d */ int term_utf8 = 0; /* -t, -T */ int restore_term = 0; /* -T only */ +int java_mode = 0; /* -j */ +int wtf8_mode = 0; /* -w */ +int permissive = 0; /* -p */ /* stats for -i option */ long byte_count = 0; @@ -265,8 +268,14 @@ void parse_args(int argc, char **argv) { version(); } - while((opt = my_getopt(argc, argv, "tTd:1ic:nbl:rmo:S:s:uhv")) != -1) { + while((opt = my_getopt(argc, argv, "jwptTd:1ic:nbl:rmo:S:s:uhv")) != -1) { switch(opt) { + case 'j': + java_mode = 1; break; + case 'w': + wtf8_mode = 1; break; + case 'p': + permissive = 1; break; case 't': term_utf8 = restore_term = 1; break; case 'T': @@ -551,6 +560,10 @@ int is_overlong(int cont_count, unsigned char *b) { if(!cont_count) return 0; + /* java mode (MUTF-8) allows exactly one overlong: */ + if(java_mode && cont_count == 1 && b[0] == 0xc0 && b[1] == 0x80) + return 0; + /* 2 byte seqs, if the first byte is 0xc0 or 0xc1, it's overlong. */ if(cont_count == 1 && b[0] <= 0xc1) return 1; @@ -673,15 +686,18 @@ int dump_utf8_char(void) { } } - /* don't check bad sequences for out-of-range or surrogate */ - if(!bad) { - if(is_out_of_range(cont_count, bytes) || is_surrogate(cont_count, bytes)) - bad = 1; - } - + if(!permissive) { + /* don't check bad sequences for out-of-range or surrogate */ + if(!bad) { + if(is_out_of_range(cont_count, bytes)) + bad = 1; + else if((!wtf8_mode) && is_surrogate(cont_count, bytes)) + bad = 1; + } - if(is_overlong(cont_count, bytes)) - overlong = 1; + if(is_overlong(cont_count, bytes)) + overlong = 1; + } if(bad || overlong) { bad_count++; diff --git a/uxd.rst b/uxd.rst index 459de77..2220174 100644 --- a/uxd.rst +++ b/uxd.rst @@ -98,6 +98,15 @@ as *K*, *M*, and *G* for power-of-10 based (e.g. *1K* is 1000 bytes). .. print number of bytes/chars/ascii/multibyte/bad sequences. +-j + Java mode (aka MUTF-8). 
+ Identical to UTF-8 except it allows the + overlong **0xc0 0x80** encoding for codepoint U+0000 (aka NUL), + which normally would be considered an error. + This may be useful for looking at serialized data created by Java + programs. + +.. java (MUTF-8) mode: allow 0xc0 0x80 for U+0000. + -l length Stop dumping after *length* bytes (not characters). If the limit is reached in the middle of a multibyte character, the entire character @@ -126,6 +135,11 @@ as *K*, *M*, and *G* for power-of-10 based (e.g. *1K* is 1000 bytes). .. added to hex offsets (decimal, 0x hex, 0 octal). +-p + Permissive mode. Turns off error highlighting for overlongs, codepoints + above **U+10FFFF**, and surrogates. Only malformed sequences will be + highlighted in red. + -r Highlight multi-byte sequences in reverse video, in the hex output. Ignored if **-m** given. @@ -171,6 +185,11 @@ as *K*, *M*, and *G* for power-of-10 based (e.g. *1K* is 1000 bytes). .. print version of uxd. +-w + WTF-8 mode. Surrogates **U+D800** to **U+DFFF** will not be considered errors. + +.. WTF-8 mode (allow surrogates). + OUTPUT FORMAT ============= @@ -340,12 +359,27 @@ Failure status will only be returned if **uxd** failed to open the input file. Invalid input (non-UTF-8) doesn't count as an error; it'll just have lots of red in the output. +LIMITATIONS +=========== + +These are not bugs, because they're part of the design. + +Only UTF-8 and a couple of variants (WTF-8, MUTF-8) are supported. +There is no support for UTF-16, UTF-32, UTF-EBCDIC, or any other +non-UTF-8 encoding. + +There's no support for any number base except hex. + +The input is read one byte at a time, so a search or regex match +option would be difficult or impossible to implement. + +Seeking backwards from the end of the file is impossible when reading +from standard input. The only way to fake this would be to read the +whole file into memory at startup, which **uxd** doesn't do. 
+ BUGS ==== -There should be options and/or a config file to change the colors, -rather than baking them into the binary. - Combining characters are not handled well. Or at all, really: the 2 characters being combined will have an ANSI color code in between. urxvt at least ignores the color code, so the composite character -- cgit v1.2.3