4 files changed, 103 insertions, 15 deletions
diff --git a/usage.c b/usage.c
index ae9a754..a957552 100644
--- a/usage.c
+++ b/usage.c
@@ -5,6 +5,7 @@ char *usage_opts[] = {
 	"  -d data: dump this data instead of a file.",
 	"  -h, --help: print this help message.",
 	"  -i: print number of bytes/chars/ascii/multibyte/bad sequences.",
+	"  -j: java (MUTF-8) mode: allow 0xc0 0x80 for U+0000.",
 	"  -l length: stop dumping after <length> bytes (not characters).",
 	"  -m: monochrome mode.",
 	"  -n: ignore UXD_OPTS environment variable.",
@@ -14,5 +15,6 @@ char *usage_opts[] = {
 	"  -S pos: like -s, but also sets -o so addresses start at 0.",
 	"  -u: uppercase hex digits.",
 	"  -v, --version: print version of uxd.",
+	"  -w: WTF-8 mode (allow surrogates).",
 	(char *)0
 };
diff --git a/uxd.1 b/uxd.1
index 6258f9c..8ada567 100644
--- a/uxd.1
+++ b/uxd.1
@@ -121,6 +121,17 @@ bad sequences.
 .
 .INDENT 0.0
 .TP
+.B  \-j
+Java mode (aka MUTF\-8). Identical to UTF\-8 except it allows the
+overlong \fB0xc0 0x80\fP encoding for codepoint U+0000 (aka NUL),
+which normally would be considered an error.
+This may be useful for looking at serialized data created by Java
+programs.
+.UNINDENT
+.\" java (MUTF-8) mode: allow 0xc0 0x80 for U+0000.
+.
+.INDENT 0.0
+.TP
 .BI \-l \ length
 Stop dumping after \fIlength\fP bytes (not characters). If the limit is
 reached in the middle of a multibyte character, the entire character
@@ -157,6 +168,11 @@ if you can think of a reason to want it to be.
 .
 .INDENT 0.0
 .TP
+.B  \-p
+Permissive mode. Turns off error highlighting for overlongs, codepoints
+above \fBU+10FFFF\fP, and surrogates. Only malformed sequences will be
+highlighted in red.
+.TP
 .B  \-r
 Highlight multi\-byte sequences in reverse video, in the hex
 output. Ignored if \fB\-m\fP given.
@@ -218,6 +234,13 @@ Print version number and exit.
 .UNINDENT
 .\" print version of uxd.
 .
+.INDENT 0.0
+.TP
+.B  \-w
+WTF\-8 mode. Surrogates \fBU+D800\fP to \fBU+DFFF\fP will not be considered errors.
+.UNINDENT
+.\" WTF-8 mode (allow surrogates).
+.
 .SH OUTPUT FORMAT
 .sp
 The output is designed to fit in an 80\-column terminal.
@@ -389,10 +412,23 @@ Zero for success, non\-zero for failure.
 Failure status will only be returned if \fBuxd\fP failed to open the
 input file. Invalid input (non\-UTF\-8) doesn\(aqt count as an error;
 it\(aqll just have lots of red in the output.
-.SH BUGS
+.SH LIMITATIONS
+.sp
+These are not bugs, because they\(aqre part of the design.
+.sp
+Only UTF\-8 and a couple of variants (WTF\-8, MUTF\-8) are supported.
+There is no support for UTF\-16, UTF\-32, UTF\-EBCDIC, or any other
+non\-UTF\-8 encoding.
 .sp
-There should be options and/or a config file to change the colors,
-rather than baking them into the binary.
+There\(aqs no support for any number base except hex.
+.sp
+The input is read one byte at a time, so a search or regex match
+option would be difficult or impossible to implement.
+.sp
+Seeking backwards from the end of the file is impossible when reading
+from standard input. The only way to fake this would be to read the
+whole file into memory at startup, which \fBuxd\fP doesn\(aqt do.
+.SH BUGS
 .sp
 Combining characters are not handled well. Or at all, really: the 2
 characters being combined will have an ANSI color code in between.
diff --git a/uxd.c b/uxd.c
index e839330..b30b22b 100644
--- a/uxd.c
+++ b/uxd.c
@@ -150,6 +150,9 @@ char *dump_data_arg = NULL; /* -d */
 long dump_data_idx  = 0;    /* -d */
 int term_utf8 = 0;    /* -t, -T */
 int restore_term = 0; /* -T only */
+int java_mode = 0;    /* -j */
+int wtf8_mode = 0;    /* -w */
+int permissive = 0;   /* -p */
 
 /* stats for -i option */
 long byte_count = 0;
@@ -265,8 +268,14 @@ void parse_args(int argc, char **argv) {
 			version();
 	}
 
-	while((opt = my_getopt(argc, argv, "tTd:1ic:nbl:rmo:S:s:uhv")) != -1) {
+	while((opt = my_getopt(argc, argv, "jwptTd:1ic:nbl:rmo:S:s:uhv")) != -1) {
 		switch(opt) {
+			case 'j':
+				java_mode = 1; break;
+			case 'w':
+				wtf8_mode = 1; break;
+			case 'p':
+				permissive = 1; break;
 			case 't':
 				term_utf8 = restore_term = 1; break;
 			case 'T':
@@ -551,6 +560,10 @@ int is_overlong(int cont_count, unsigned char *b) {
 	if(!cont_count)
 		return 0;
 
+	/* java mode (MUTF-8) allows exactly one overlong: */
+	if(java_mode && cont_count == 1 && b[0] == 0xc0 && b[1] == 0x80)
+		return 0;
+
 	/* 2 byte seqs, if the first byte is 0xc0 or 0xc1, it's overlong. */
 	if(cont_count == 1 && b[0] <= 0xc1)
 		return 1;
@@ -673,15 +686,18 @@ int dump_utf8_char(void) {
 		}
 	}
 
-	/* don't check bad sequences for out-of-range or surrogate */
-	if(!bad) {
-		if(is_out_of_range(cont_count, bytes) || is_surrogate(cont_count, bytes))
-			bad = 1;
-	}
-
+	if(!permissive) {
+		/* don't check bad sequences for out-of-range or surrogate */
+		if(!bad) {
+			if(is_out_of_range(cont_count, bytes))
+				bad = 1;
+			else if((!wtf8_mode) && is_surrogate(cont_count, bytes))
+				bad = 1;
+		}
 
-	if(is_overlong(cont_count, bytes))
-		overlong = 1;
+		if(is_overlong(cont_count, bytes))
+			overlong = 1;
+	}
 
 	if(bad || overlong) {
 		bad_count++;
diff --git a/uxd.rst b/uxd.rst
index 459de77..2220174 100644
--- a/uxd.rst
+++ b/uxd.rst
@@ -98,6 +98,15 @@ as *K*, *M*, and *G* for power-of-10 based (e.g. *1K* is 1000 bytes).
 
 .. print number of bytes/chars/ascii/multibyte/bad sequences.
 
+-j
+  Java mode (aka MUTF-8). Identical to UTF-8 except it allows the
+  overlong **0xc0 0x80** encoding for codepoint U+0000 (aka NUL),
+  which normally would be considered an error.
+  This may be useful for looking at serialized data created by Java
+  programs.
+
+.. java (MUTF-8) mode: allow 0xc0 0x80 for U+0000.
+
 -l length
   Stop dumping after *length* bytes (not characters). If the limit is
   reached in the middle of a multibyte character, the entire character
@@ -126,6 +135,11 @@ as *K*, *M*, and *G* for power-of-10 based (e.g. *1K* is 1000 bytes).
 
 .. added to hex offsets (decimal, 0x hex, 0 octal).
 
+-p
+  Permissive mode. Turns off error highlighting for overlongs, codepoints
+  above **U+10FFFF**, and surrogates. Only malformed sequences will be
+  highlighted in red.
+
 -r
   Highlight multi-byte sequences in reverse video, in the hex
   output. Ignored if **-m** given.
@@ -171,6 +185,11 @@ as *K*, *M*, and *G* for power-of-10 based (e.g. *1K* is 1000 bytes).
 
 .. print version of uxd.
 
+-w
+  WTF-8 mode. Surrogates **U+D800** to **U+DFFF** will not be considered errors.
+
+.. WTF-8 mode (allow surrogates).
+
 OUTPUT FORMAT
 =============
 
@@ -340,12 +359,27 @@ Failure status will only be returned if **uxd** failed to open the
 input file. Invalid input (non-UTF-8) doesn't count as an error;
 it'll just have lots of red in the output.
 
+LIMITATIONS
+===========
+
+These are not bugs, because they're part of the design.
+
+Only UTF-8 and a couple of variants (WTF-8, MUTF-8) are supported.
+There is no support for UTF-16, UTF-32, UTF-EBCDIC, or any other
+non-UTF-8 encoding.
+
+There's no support for any number base except hex.
+
+The input is read one byte at a time, so a search or regex match
+option would be difficult or impossible to implement.
+
+Seeking backwards from the end of the file is impossible when reading
+from standard input. The only way to fake this would be to read the
+whole file into memory at startup, which **uxd** doesn't do.
+
 BUGS
 ====
 
-There should be options and/or a config file to change the colors,
-rather than baking them into the binary.
-
 Combining characters are not handled well. Or at all, really: the 2
 characters being combined will have an ANSI color code in between.
 urxvt at least ignores the color code, so the composite character