aboutsummaryrefslogtreecommitdiff
path: root/uxd.c
diff options
context:
space:
mode:
authorB. Watson <urchlay@slackware.uk>2024-12-12 15:40:38 -0500
committerB. Watson <urchlay@slackware.uk>2024-12-12 15:40:38 -0500
commit5414bc808430e80139a6debe56ccea15afedb3b6 (patch)
tree67c87f84b978adc24e5f8edb22e8b6bb6698874d /uxd.c
parentde1c3571112e85507fea5e13c046eaf7b5514be1 (diff)
downloaduxd-5414bc808430e80139a6debe56ccea15afedb3b6.tar.gz
UTF-16 warnings, better visible null & escape, handle BOM.
Diffstat (limited to 'uxd.c')
-rw-r--r--uxd.c75
1 files changed, 59 insertions, 16 deletions
diff --git a/uxd.c b/uxd.c
index 00a2686..edf071d 100644
--- a/uxd.c
+++ b/uxd.c
@@ -29,10 +29,10 @@ error. If we get a sequence-starter, but the sequence doesn't have
the correct number of continuation bytes (e.g. 110xxxxx followed by
anything that isn't 10xxxxxx), that's an error too.
-BOM: if the file contains ef bb bf (aka U+FEFF), it should be colorized
+BOM: if the file contains ef bb bf (aka U+FEFF), it will be colorized
as a special (non-printable).
If the file begins with ff fe, it's UTF-16 (little endian). If it's
-fe ff, it's UTF-16 big-endian. Probably we should detect these and
+fe ff, it's UTF-16 big-endian. We detect these and
print a warning on stderr.
*/
@@ -54,7 +54,6 @@ print a warning on stderr.
#define BAD_FG BLACK
#define BAD_BG RED
-// const int normal_colors[] = { GREEN, PURPLE, CYAN };
const int normal_colors[] = { GREEN, YELLOW };
int cur_normal_color = 0;
int dump_color;
@@ -100,9 +99,15 @@ void open_input(const int argc, const char *argv1) {
}
}
+/* Unicode control character printable equivalents. For 0, use
+ the "empty set" symbol. It's a lot more readable than the "nul"
+ symbol, ␀. Escape, tab, newline, space are what urxvt uses in
+ its "keycap picture" mode. The rest of these are hard to read at
+ normal font sizes, but it's still better than using a dot for
+ everything like xxd does. */
char * const special_symbols[] = {
- "␀", "␁", "␂", "␃", "␄", "␅", "␆", "␇", "␈", "⇥", "↵", "␋", "␌", "␍", "␎", "␏",
- "␐", "␑", "␒", "␓", "␔", "␕", "␖", "␗", "␘", "␙", "␚", "␛", "␜", "␝", "␞", "␟",
+ "∅", "␁", "␂", "␃", "␄", "␅", "␆", "␇", "␈", "⇥", "↵", "␋", "␌", "␍", "␎", "␏",
+ "␐", "␑", "␒", "␓", "␔", "␕", "␖", "␗", "␘", "␙", "␚", "⎋", "␜", "␝", "␞", "␟",
"␣",
};
@@ -112,6 +117,8 @@ char *get_special(unsigned char c) {
return "?"; /* should never happen */
}
+/* set name to use for error messages. this must be called before
+ open_input(). */
void set_self(const char *argv0) {
self = strrchr(argv0, '/');
@@ -121,6 +128,21 @@ void set_self(const char *argv0) {
self = argv0;
}
+void print_line(void) {
+ int spacing = MAX_DUMP_COLS - dump_column;
+
+ printf("%s", left_buf);
+
+ /* line up the rightmost field (human-readable) */
+ while(spacing--) printf(" ");
+
+ printf(" %s\n", right_buf);
+
+ /* clear the buffers, start a new line */
+ left_buf[0] = right_buf[0] = '\0';
+ dump_column = 0;
+}
+
void next_normal_color() {
cur_normal_color++;
cur_normal_color %= (sizeof(normal_colors) / sizeof(int));
@@ -139,14 +161,6 @@ void append_color(char *buf, int fgcolor, int bgcolor) {
strcat(buf, tmpbuf);
}
-void print_line(void) {
- int spacing = MAX_DUMP_COLS - dump_column;
- printf("%s", left_buf);
- while(spacing--) printf(" ");
- printf(" %s\n", right_buf);
- left_buf[0] = right_buf[0] = '\0';
-}
-
void append_color_off(char *buf) {
strcat(buf, "\x1b[0m");
}
@@ -169,20 +183,38 @@ void append_left(unsigned char byte, int fgcolor, int bgcolor) {
if(dump_column == 7) strcat(left_buf, " ");
dump_column++;
- if(dump_column == MAX_DUMP_COLS) {
+ if(dump_column == MAX_DUMP_COLS)
print_line();
- dump_column = 0;
- }
filepos++;
}
+void check_utf16(int byte0, int byte1) {
+ char *endian;
+
+ if(byte0 == 0xff && byte1 == 0xfe) {
+ endian = "little";
+ } else if(byte0 == 0xfe && byte1 == 0xff) {
+ endian = "big";
+ } else {
+ return;
+ }
+
+ fprintf(stderr, "%s: input looks like UTF-16, %s-endian\n", self, endian);
+}
+
+int is_bom(unsigned char *b) {
+ return (b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
+}
+
+/* return value: false = EOF, true = more data to read */
int dump_utf8_char(void) {
unsigned char bytes[] = { 0, 0, 0, 0, 0 };
unsigned char *cont_bytes = bytes + 1;
char *printable;
int bad = 0, special = 0;
int c, cont_count, i, fg, bg;
+ static int byte0;
c = fgetc(input);
if(c == EOF)
@@ -190,6 +222,12 @@ int dump_utf8_char(void) {
bytes[0] = (unsigned char)c;
+ if(filepos == 0) {
+ byte0 = c;
+ } else if(filepos == 1) {
+ check_utf16(byte0, c);
+ }
+
if(c < 0x7f) {
cont_count = 0;
if(c <= ' ' || c == 0x7f)
@@ -236,6 +274,10 @@ int dump_utf8_char(void) {
fg = SPECIAL;
bg = 0;
printable = get_special(bytes[0]);
+ } else if(cont_count == 2 && is_bom(bytes)) {
+ fg = SPECIAL;
+ bg = 0;
+ printable = "B";
} else {
fg = normal_colors[cur_normal_color];
bg = 0;
@@ -258,6 +300,7 @@ void dump_file(void) {
while(dump_utf8_char())
;
+ /* handle the last line, if the file size is not divisible by 16. */
if(dump_column)
print_line();
}