aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--usage.c2
-rw-r--r--uxd.142
-rw-r--r--uxd.c34
-rw-r--r--uxd.rst40
4 files changed, 103 insertions, 15 deletions
diff --git a/usage.c b/usage.c
index ae9a754..a957552 100644
--- a/usage.c
+++ b/usage.c
@@ -5,6 +5,7 @@ char *usage_opts[] = {
" -d data: dump this data instead of a file.",
" -h, --help: print this help message.",
" -i: print number of bytes/chars/ascii/multibyte/bad sequences.",
+ " -j: java (MUTF-8) mode: allow 0xc0 0x80 for U+0000.",
" -l length: stop dumping after <length> bytes (not characters).",
" -m: monochrome mode.",
" -n: ignore UXD_OPTS environment variable.",
@@ -14,5 +15,6 @@ char *usage_opts[] = {
" -S pos: like -s, but also sets -o so addresses start at 0.",
" -u: uppercase hex digits.",
" -v, --version: print version of uxd.",
+ " -w: WTF-8 mode (allow surrogates).",
(char *)0
};
diff --git a/uxd.1 b/uxd.1
index 6258f9c..8ada567 100644
--- a/uxd.1
+++ b/uxd.1
@@ -121,6 +121,17 @@ bad sequences.
.
.INDENT 0.0
.TP
+.B \-j
+Java mode (aka MUTF\-8). Identical to UTF\-8 except it allows the
+overlong \fB0xc0 0x80\fP encoding for codepoint U+0000 (aka NUL),
+which normally would be considered an error.
+This may be useful for looking at serialized data created by Java
+programs.
+.UNINDENT
+.\" java (MUTF-8) mode: allow 0xc0 0x80 for U+0000.
+.
+.INDENT 0.0
+.TP
.BI \-l \ length
Stop dumping after \fIlength\fP bytes (not characters). If the limit is
reached in the middle of a multibyte character, the entire character
@@ -157,6 +168,11 @@ if you can think of a reason to want it to be.
.
.INDENT 0.0
.TP
+.B \-p
+Permissive mode. Turns off error highlighting for overlongs, codepoints
+above \fBU+10FFFF\fP, and surrogates. Only malformed sequences will be
+highlighted in red.
+.TP
.B \-r
Highlight multi\-byte sequences in reverse video, in the hex
output. Ignored if \fB\-m\fP given.
@@ -218,6 +234,13 @@ Print version number and exit.
.UNINDENT
.\" print version of uxd.
.
+.INDENT 0.0
+.TP
+.B \-w
+WTF\-8 mode. Surrogates \fBU+D800\fP to \fBU+DFFF\fP will not be considered errors.
+.UNINDENT
+.\" WTF-8 mode (allow surrogates).
+.
.SH OUTPUT FORMAT
.sp
The output is designed to fit in an 80\-column terminal.
@@ -389,10 +412,23 @@ Zero for success, non\-zero for failure.
Failure status will only be returned if \fBuxd\fP failed to open the
input file. Invalid input (non\-UTF\-8) doesn\(aqt count as an error;
it\(aqll just have lots of red in the output.
-.SH BUGS
+.SH LIMITATIONS
+.sp
+These are not bugs, because they\(aqre part of the design.
+.sp
+Only UTF\-8 and a couple of variants (WTF\-8, MUTF\-8) are supported.
+There is no support for UTF\-16, UTF\-32, UTF\-EBCDIC, or any other
+non\-UTF\-8 encoding.
.sp
-There should be options and/or a config file to change the colors,
-rather than baking them into the binary.
+There\(aqs no support for any number base except hex.
+.sp
+The input is read one byte at a time, so a search or regex match
+option would be difficult or impossible to implement.
+.sp
+Seeking backwards from the end of the file is impossible when reading
+from standard input. The only way to fake this would be to read the
+whole file into memory at startup, which \fBuxd\fP doesn\(aqt do.
+.SH BUGS
.sp
Combining characters are not handled well. Or at all, really: the 2
characters being combined will have an ANSI color code in between.
diff --git a/uxd.c b/uxd.c
index e839330..b30b22b 100644
--- a/uxd.c
+++ b/uxd.c
@@ -150,6 +150,9 @@ char *dump_data_arg = NULL; /* -d */
long dump_data_idx = 0; /* -d */
int term_utf8 = 0; /* -t, -T */
int restore_term = 0; /* -T only */
+int java_mode = 0; /* -j */
+int wtf8_mode = 0; /* -w */
+int permissive = 0; /* -p */
/* stats for -i option */
long byte_count = 0;
@@ -265,8 +268,14 @@ void parse_args(int argc, char **argv) {
version();
}
- while((opt = my_getopt(argc, argv, "tTd:1ic:nbl:rmo:S:s:uhv")) != -1) {
+ while((opt = my_getopt(argc, argv, "jwptTd:1ic:nbl:rmo:S:s:uhv")) != -1) {
switch(opt) {
+ case 'j':
+ java_mode = 1; break;
+ case 'w':
+ wtf8_mode = 1; break;
+ case 'p':
+ permissive = 1; break;
case 't':
term_utf8 = restore_term = 1; break;
case 'T':
@@ -551,6 +560,10 @@ int is_overlong(int cont_count, unsigned char *b) {
if(!cont_count)
return 0;
+ /* java mode (MUTF-8) allows exactly one overlong: */
+ if(java_mode && cont_count == 1 && b[0] == 0xc0 && b[1] == 0x80)
+ return 0;
+
/* 2 byte seqs, if the first byte is 0xc0 or 0xc1, it's overlong. */
if(cont_count == 1 && b[0] <= 0xc1)
return 1;
@@ -673,15 +686,18 @@ int dump_utf8_char(void) {
}
}
- /* don't check bad sequences for out-of-range or surrogate */
- if(!bad) {
- if(is_out_of_range(cont_count, bytes) || is_surrogate(cont_count, bytes))
- bad = 1;
- }
-
+ if(!permissive) {
+ /* don't check bad sequences for out-of-range or surrogate */
+ if(!bad) {
+ if(is_out_of_range(cont_count, bytes))
+ bad = 1;
+ else if((!wtf8_mode) && is_surrogate(cont_count, bytes))
+ bad = 1;
+ }
- if(is_overlong(cont_count, bytes))
- overlong = 1;
+ if(is_overlong(cont_count, bytes))
+ overlong = 1;
+ }
if(bad || overlong) {
bad_count++;
diff --git a/uxd.rst b/uxd.rst
index 459de77..2220174 100644
--- a/uxd.rst
+++ b/uxd.rst
@@ -98,6 +98,15 @@ as *K*, *M*, and *G* for power-of-10 based (e.g. *1K* is 1000 bytes).
.. print number of bytes/chars/ascii/multibyte/bad sequences.
+-j
+ Java mode (aka MUTF-8). Identical to UTF-8 except it allows the
+ overlong **0xc0 0x80** encoding for codepoint U+0000 (aka NUL),
+ which normally would be considered an error.
+ This may be useful for looking at serialized data created by Java
+ programs.
+
+.. java (MUTF-8) mode: allow 0xc0 0x80 for U+0000.
+
-l length
Stop dumping after *length* bytes (not characters). If the limit is
reached in the middle of a multibyte character, the entire character
@@ -126,6 +135,11 @@ as *K*, *M*, and *G* for power-of-10 based (e.g. *1K* is 1000 bytes).
.. added to hex offsets (decimal, 0x hex, 0 octal).
+-p
+ Permissive mode. Turns off error highlighting for overlongs, codepoints
+ above **U+10FFFF**, and surrogates. Only malformed sequences will be
+ highlighted in red.
+
-r
Highlight multi-byte sequences in reverse video, in the hex
output. Ignored if **-m** given.
@@ -171,6 +185,11 @@ as *K*, *M*, and *G* for power-of-10 based (e.g. *1K* is 1000 bytes).
.. print version of uxd.
+-w
+ WTF-8 mode. Surrogates **U+D800** to **U+DFFF** will not be considered errors.
+
+.. WTF-8 mode (allow surrogates).
+
OUTPUT FORMAT
=============
@@ -340,12 +359,27 @@ Failure status will only be returned if **uxd** failed to open the
input file. Invalid input (non-UTF-8) doesn't count as an error;
it'll just have lots of red in the output.
+LIMITATIONS
+===========
+
+These are not bugs, because they're part of the design.
+
+Only UTF-8 and a couple of variants (WTF-8, MUTF-8) are supported.
+There is no support for UTF-16, UTF-32, UTF-EBCDIC, or any other
+non-UTF-8 encoding.
+
+There's no support for any number base except hex.
+
+The input is read one byte at a time, so a search or regex match
+option would be difficult or impossible to implement.
+
+Seeking backwards from the end of the file is impossible when reading
+from standard input. The only way to fake this would be to read the
+whole file into memory at startup, which **uxd** doesn't do.
+
BUGS
====
-There should be options and/or a config file to change the colors,
-rather than baking them into the binary.
-
Combining characters are not handled well. Or at all, really: the 2
characters being combined will have an ANSI color code in between.
urxvt at least ignores the color code, so the composite character