aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- uxd.1 7
-rw-r--r-- uxd.c 75
-rw-r--r-- uxd.rst 7
3 files changed, 67 insertions, 22 deletions
diff --git a/uxd.1 b/uxd.1
index 68c4554..d032c25 100644
--- a/uxd.1
+++ b/uxd.1
@@ -104,8 +104,9 @@ Printable characters (except the space, U+0020) alternate between green and yell
.B \fBpurple\fP
Spaces and unprintable characters ("control" characters, newlines, tabs, etc).
These are printed as "visible" characters, e.g. ␣ for the space, ↵ for a newline.
-This is an improvement over the usual practice of printing these as periods, like
-standard hex dumpers do.
+Hopefully this is an improvement over the usual practice of printing these as periods, like
+standard hex dumpers do. The Unicode BOM (byte order marker, U+FEFF) is printed
+as a purple letter B.
.TP
.B \fBred\fP
Invalid UTF\-8 sequences. These are rendered with a red foreground, to make them
@@ -132,7 +133,7 @@ ANSI\-style escape sequences, Unicode, and UTF\-8 rendering.
The author\(aqs testing is done primarily with \fBurxvt\fP(1). Other
terminals aren\(aqt tested as often.
.sp
-Known to work: urxvt, xterm, st, xfce4\-terminal, gnome\-terminal, the Linux console (but
+Known to work: urxvt, xterm, st, xfce4\-terminal, gnome\-terminal, kitty, the Linux console (but
see \fBFONTS\fP, below).
.sp
Known \fBnot\fP to work: rxvt (doesn\(aqt support Unicode at all).
diff --git a/uxd.c b/uxd.c
index 00a2686..edf071d 100644
--- a/uxd.c
+++ b/uxd.c
@@ -29,10 +29,10 @@ error. If we get a sequence-starter, but the sequence doesn't have
the correct number of continuation bytes (e.g. 110xxxxx followed by
anything that isn't 10xxxxxx), that's an error too.
-BOM: if the file contains ef bb bf (aka U+FEFF), it should be colorized
+BOM: if the file contains ef bb bf (aka U+FEFF), it will be colorized
as a special (non-printable).
If the file begins with ff fe, it's UTF-16 (little endian). If it's
-fe ff, it's UTF-16 big-endian. Probably we should detect these and
+fe ff, it's UTF-16 big-endian. We detect these and
print a warning on stderr.
*/
@@ -54,7 +54,6 @@ print a warning on stderr.
#define BAD_FG BLACK
#define BAD_BG RED
-// const int normal_colors[] = { GREEN, PURPLE, CYAN };
const int normal_colors[] = { GREEN, YELLOW };
int cur_normal_color = 0;
int dump_color;
@@ -100,9 +99,15 @@ void open_input(const int argc, const char *argv1) {
}
}
+/* Unicode control character printable equivalents. For 0, use
+ the "empty set" symbol. It's a lot more readable than the "nul"
+ symbol, ␀. Escape, tab, newline, space are what urxvt uses in
+ its "keycap picture" mode. The rest of these are hard to read at
+ normal font sizes, but it's still better than using a dot for
+ everything like xxd does. */
char * const special_symbols[] = {
- "␀", "␁", "␂", "␃", "␄", "␅", "␆", "␇", "␈", "⇥", "↵", "␋", "␌", "␍", "␎", "␏",
- "␐", "␑", "␒", "␓", "␔", "␕", "␖", "␗", "␘", "␙", "␚", "␛", "␜", "␝", "␞", "␟",
+ "∅", "␁", "␂", "␃", "␄", "␅", "␆", "␇", "␈", "⇥", "↵", "␋", "␌", "␍", "␎", "␏",
+ "␐", "␑", "␒", "␓", "␔", "␕", "␖", "␗", "␘", "␙", "␚", "⎋", "␜", "␝", "␞", "␟",
"␣",
};
@@ -112,6 +117,8 @@ char *get_special(unsigned char c) {
return "?"; /* should never happen */
}
+/* set name to use for error messages. this must be called before
+ open_input(). */
void set_self(const char *argv0) {
self = strrchr(argv0, '/');
@@ -121,6 +128,21 @@ void set_self(const char *argv0) {
self = argv0;
}
+void print_line(void) {
+ int spacing = MAX_DUMP_COLS - dump_column;
+
+ printf("%s", left_buf);
+
+ /* line up the rightmost field (human-readable) */
+ while(spacing--) printf(" ");
+
+ printf(" %s\n", right_buf);
+
+ /* clear the buffers, start a new line */
+ left_buf[0] = right_buf[0] = '\0';
+ dump_column = 0;
+}
+
void next_normal_color() {
cur_normal_color++;
cur_normal_color %= (sizeof(normal_colors) / sizeof(int));
@@ -139,14 +161,6 @@ void append_color(char *buf, int fgcolor, int bgcolor) {
strcat(buf, tmpbuf);
}
-void print_line(void) {
- int spacing = MAX_DUMP_COLS - dump_column;
- printf("%s", left_buf);
- while(spacing--) printf(" ");
- printf(" %s\n", right_buf);
- left_buf[0] = right_buf[0] = '\0';
-}
-
void append_color_off(char *buf) {
strcat(buf, "\x1b[0m");
}
@@ -169,20 +183,38 @@ void append_left(unsigned char byte, int fgcolor, int bgcolor) {
if(dump_column == 7) strcat(left_buf, " ");
dump_column++;
- if(dump_column == MAX_DUMP_COLS) {
+ if(dump_column == MAX_DUMP_COLS)
print_line();
- dump_column = 0;
- }
filepos++;
}
+void check_utf16(int byte0, int byte1) {
+ char *endian;
+
+ if(byte0 == 0xff && byte1 == 0xfe) {
+ endian = "little";
+ } else if(byte0 == 0xfe && byte1 == 0xff) {
+ endian = "big";
+ } else {
+ return;
+ }
+
+ fprintf(stderr, "%s: input looks like UTF-16, %s-endian\n", self, endian);
+}
+
+int is_bom(unsigned char *b) {
+ return (b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
+}
+
+/* return value: false = EOF, true = more data to read */
int dump_utf8_char(void) {
unsigned char bytes[] = { 0, 0, 0, 0, 0 };
unsigned char *cont_bytes = bytes + 1;
char *printable;
int bad = 0, special = 0;
int c, cont_count, i, fg, bg;
+ static int byte0;
c = fgetc(input);
if(c == EOF)
@@ -190,6 +222,12 @@ int dump_utf8_char(void) {
bytes[0] = (unsigned char)c;
+ if(filepos == 0) {
+ byte0 = c;
+ } else if(filepos == 1) {
+ check_utf16(byte0, c);
+ }
+
if(c < 0x7f) {
cont_count = 0;
if(c <= ' ' || c == 0x7f)
@@ -236,6 +274,10 @@ int dump_utf8_char(void) {
fg = SPECIAL;
bg = 0;
printable = get_special(bytes[0]);
+ } else if(cont_count == 2 && is_bom(bytes)) {
+ fg = SPECIAL;
+ bg = 0;
+ printable = "B";
} else {
fg = normal_colors[cur_normal_color];
bg = 0;
@@ -258,6 +300,7 @@ void dump_file(void) {
while(dump_utf8_char())
;
+ /* handle the last line, if the file size is not divisible by 16. */
if(dump_column)
print_line();
}
diff --git a/uxd.rst b/uxd.rst
index f6f3bd3..8356f14 100644
--- a/uxd.rst
+++ b/uxd.rst
@@ -88,8 +88,9 @@ COLORS
**purple**
Spaces and unprintable characters ("control" characters, newlines, tabs, etc).
These are printed as "visible" characters, e.g. ␣ for the space, ↵ for a newline.
- This is an improvement over the usual practice of printing these as periods, like
- standard hex dumpers do.
+ Hopefully this is an improvement over the usual practice of printing these as periods, like
+ standard hex dumpers do. The Unicode BOM (byte order marker, U+FEFF) is printed
+ as a purple letter B.
**red**
Invalid UTF-8 sequences. These are rendered with a red foreground, to make them
@@ -111,7 +112,7 @@ ANSI-style escape sequences, Unicode, and UTF-8 rendering.
The author's testing is done primarily with **urxvt**\(1). Other
terminals aren't tested as often.
-Known to work: urxvt, xterm, st, xfce4-terminal, gnome-terminal, the Linux console (but
+Known to work: urxvt, xterm, st, xfce4-terminal, gnome-terminal, kitty, the Linux console (but
see **FONTS**, below).
Known **not** to work: rxvt (doesn't support Unicode at all).