aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author B. Watson <urchlay@slackware.uk> 2024-12-13 06:21:38 -0500
committer B. Watson <urchlay@slackware.uk> 2024-12-13 06:21:38 -0500
commit ae5af3deb1137d06214ef95e96998a5c1ebb6746 (patch)
tree e499f043696ad9b405b9e8daa0c435f7a7ee7ad1
parent ec1150407869211a0d4607419986a5f185cd8d30 (diff)
download uxd-ae5af3deb1137d06214ef95e96998a5c1ebb6746.tar.gz
commentary (no code changes)
-rw-r--r-- uxd.c 42
1 files changed, 25 insertions, 17 deletions
diff --git a/uxd.c b/uxd.c
index d71d6d5..41a390b 100644
--- a/uxd.c
+++ b/uxd.c
@@ -29,25 +29,27 @@ error. If we get a sequence-starter, but the sequence doesn't have
the correct number of continuation bytes (e.g. 110xxxxx followed by
anything that isn't 10xxxxxx), that's an error too.
+Note that we don't actually do a full decode of the codepoint bits.
+It's enough to look at the top bits to keep track of multibyte
+characters.
+
BOM: if the file contains ef bb bf (aka U+FEFF), it will be colorized
as a special (non-printable).
+
If the file begins with ff fe, it's UTF-16 (little endian). If it's
-fe ff, it's UTF-16 big-endian. We detect these and
-print a warning on stderr.
+fe ff, it's UTF-16 big-endian. We detect these and print a warning
+on stderr.
*/
-/* max UTF-8 sequence length, in bytes */
-#define MAXUTF8 4
-
-/* ANSI color */
-#define BLACK 0 /* don't use */
-#define RED 1
-#define GREEN 2
+/* ANSI colors */
+#define BLACK 0 /* don't use (could be the background color) */
+#define RED 1
+#define GREEN 2
#define YELLOW 3
-#define BLUE 4 /* don't use */
+#define BLUE 4 /* don't use (hard to read on many terminals) */
#define PURPLE 5
-#define CYAN 6
-#define WHITE 7 /* don't use */
+#define CYAN 6
+#define WHITE 7 /* don't use (could be the background color) */
#define SPECIAL PURPLE
@@ -112,12 +114,12 @@ char * const special_symbols[] = {
};
char *get_special(unsigned char c) {
- if(c == 0x7f) return "⌦";
+ if(c == 0x7f) return "⌦"; /* DEL */
if(c <= ' ') return special_symbols[c];
return "?"; /* should never happen */
}
-/* set name to use for error messages. this must be called before
+/* Set name to use for error messages. This must be called before
open_input(). */
void set_self(const char *argv0) {
self = strrchr(argv0, '/');
@@ -133,7 +135,8 @@ void print_line(void) {
printf("%s", left_buf);
- /* line up the rightmost field (human-readable) */
+ /* line up the rightmost field (human-readable), for the partial
+ line at the end of the output (if there is one). */
while(spacing--) printf(" ");
if(dump_column < (MAX_DUMP_COLS / 2)) putchar(' ');
@@ -214,11 +217,15 @@ void check_utf16(int byte0, int byte1) {
fprintf(stderr, "%s: input looks like UTF-16, %s-endian\n", self, endian);
}
+/* Since we're not fully decoding the code points, we have to check
+ for the actual UTF-8 representation of our one special multibyte char. */
int is_bom(unsigned char *b) {
return (b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf);
}
-/* U+10FFFF is the last valid codepoint. It encodes to f4 8f bf bf. */
+/* U+10FFFF is the last valid codepoint. It encodes to f4 8f bf bf.
+ 'count' is the count of continuation bytes only (so, 3 for a 4-byte
+ sequence). */
int is_out_of_range(int count, unsigned char *b) {
if(count < 3) return 0;
if(b[0] < 0xf4) return 0;
@@ -226,7 +233,8 @@ int is_out_of_range(int count, unsigned char *b) {
return 1;
}
-/* return value: false = EOF, true = more data to read */
+/* This is the 'workhorse', called for each character in the file.
+ Return value: false = EOF, true = more data to read */
int dump_utf8_char(void) {
unsigned char bytes[] = { 0, 0, 0, 0, 0 };
unsigned char *cont_bytes = bytes + 1;