diff options
-rw-r--r-- | Makefile | 13 | ||||
-rw-r--r-- | a8cat.1 | 124 | ||||
-rw-r--r-- | a8cat.c | 187 | ||||
-rw-r--r-- | a8cat.rst | 58 | ||||
-rw-r--r-- | atables.c | 265 | ||||
-rw-r--r-- | atables.h | 2 | ||||
-rw-r--r-- | mkatables.pl | 116 | ||||
-rw-r--r-- | wtable.c | 140 | ||||
-rw-r--r-- | wtable.h | 2 |
9 files changed, 905 insertions, 2 deletions
@@ -16,9 +16,9 @@ CC=gcc CFLAGS=-Wall $(COPT) -ansi -D_GNU_SOURCE -DVERSION=\"$(VERSION)\" # BINS and SCRIPTS go in $BINDIR, DOCS go in $DOCDIR -BINS=a8eol atr2xfd atrsize axe blob2c blob2xex cart2xex cxrefbas dumpbas fenders protbas renumbas rom2cart unmac65 unprotbas vxrefbas xex1to2 xexamine xexcat xexsplit xfd2atr listbas +BINS=a8eol atr2xfd atrsize axe blob2c blob2xex cart2xex cxrefbas dumpbas fenders protbas renumbas rom2cart unmac65 unprotbas vxrefbas xex1to2 xexamine xexcat xexsplit xfd2atr listbas a8cat SCRIPTS=dasm2atasm a8utf8 -MANS=a8eol.1 xfd2atr.1 atr2xfd.1 blob2c.1 cart2xex.1 fenders.1 xexsplit.1 xexcat.1 atrsize.1 rom2cart.1 unmac65.1 axe.1 dasm2atasm.1 a8utf8.1 blob2xex.1 xexamine.1 xex1to2.1 unprotbas.1 protbas.1 renumbas.1 dumpbas.1 vxrefbas.1 cxrefbas.1 listbas.1 +MANS=a8eol.1 xfd2atr.1 atr2xfd.1 blob2c.1 cart2xex.1 fenders.1 xexsplit.1 xexcat.1 atrsize.1 rom2cart.1 unmac65.1 axe.1 dasm2atasm.1 a8utf8.1 blob2xex.1 xexamine.1 xex1to2.1 unprotbas.1 protbas.1 renumbas.1 dumpbas.1 vxrefbas.1 cxrefbas.1 listbas.1 a8cat.1 MAN5S=xex.5 MAN7S=atascii.7 DOCS=README.txt equates.inc *.dasm LICENSE ksiders/atr.txt @@ -66,6 +66,12 @@ listbas: listbas.c bas.o bcdfp.o tokens.o bas.o: bas.c bas.h +wtable.o: wtable.c wtable.h + +atables.o: atables.c atables.h + +a8cat: a8cat.c atables.o wtable.o + subdirs: for dir in $(SUBDIRS); do make -C $$dir COPT="$(COPT)"; done @@ -75,6 +81,9 @@ xfd2atr: xfd2atr.c atr2xfd: atr2xfd.c +atables.c: mkatables.pl + perl mkatables.pl > atables.c + # note to cross-compiler users: If you're building the *.bin targets, # blob2c needs to be executable on the build host. It'd also be nice # to build a blob2c for the target platform... Probably you can do @@ -0,0 +1,124 @@ +.\" Man page generated from reStructuredText. +. +. +.nr rst2man-indent-level 0 +. 
+.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.TH "A8CAT" 1 "2024-06-29" "0.2.1" "Urchlay's Atari 8-bit Tools" +.SH NAME +a8cat \- Convert Atari 8-bit text to UTF-8 encoded Unicode. +.SH SYNOPSIS +.sp +\fIa8cat\fP [\fB\-r\fP] [\fB\-i\fP] [\fB\-u\fP] [\fB\-t\fP] [\fIinfile\fP] [\fIinfile ...\fP] +.SH DESCRIPTION +.sp +Convert Atari 8\-bit ATASCII or XL ICS (International Character +Set) text to UTF\-8 encoded Unicode. Control graphics characters are +replaced with their nearest Unicode equivalents (mostly from the Box +Drawing block, or from the Basic Latin block with \fB\-i\fP option). +.sp +If no \fIinfile\fPs are given, input is read from standard input. Output always +goes to standard output; to write to a file, use a command like: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +a8cat atari.txt > converted.txt +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +The output is plain UTF\-8 Unicode, without BOM. +.sp +Inverse video (characters codes above \fB$80\fP) are translated using +the ANSI/VT\-100 reverse video escape sequences. Exception: \fB$9B\fP +(Atari EOL) is translated to \fB\en\fP (newline). +.SH OPTIONS +.INDENT 0.0 +.TP +.B \-i +Input uses Atari XL/XE International Character Set encoding, rather than +ATASCII graphics. +.TP +.B \-u +Use "underlining" for inverse video. Each inverse character is followed by +a backspace, then a \fI_\fP character. 
When viewed in a pager such as \fBless\fP(1), +this causes the characters to appear underlined. Output created with this +option cannot be converted back to ATASCII with the \fB\-r\fP option. +.TP +.B \-t +Text mode. Normally, everything but EOL (\fB$9B\fP) is converted to a +Unicode graphics character. In text mode, ATASCII tabs, backspace, +and bells are translated to the ASCII versions. +.TP +.B \-r +Reverse conversion: Input is UTF\-8, output is ATASCII (or XL ICS, with \fB\-i\fP). +Beware that printing ATASCII to a terminal may look funny, and may even confuse +the terminal. Redirecting to a file is safe. +.UNINDENT +.SH COPYRIGHT +.sp +WTFPL. See \fI\%http://www.wtfpl.net/txt/copying/\fP for details. +.SH AUTHOR +.INDENT 0.0 +.IP B. 3 +Watson <\fI\%urchlay@slackware.uk\fP>; Urchlay on irc.libera.chat \fI##atari\fP\&. +.UNINDENT +.SH SEE ALSO +.sp +\fBa8eol\fP(1), +\fBa8utf8\fP(1), +\fBatr2xfd\fP(1), +\fBatrsize\fP(1), +\fBaxe\fP(1), +\fBblob2c\fP(1), +\fBblob2xex\fP(1), +\fBcart2xex\fP(1), +\fBcxrefbas\fP(1), +\fBdasm2atasm\fP(1), +\fBdumpbas\fP(1), +\fBf2toxex\fP(1), +\fBfenders\fP(1), +\fBlistbas\fP(1), +\fBprotbas\fP(1), +\fBrenumbas\fP(1), +\fBrom2cart\fP(1), +\fBunmac65\fP(1), +\fBunprotbas\fP(1), +\fBvxrefbas\fP(1), +\fBxexamine\fP(1), +\fBxexcat\fP(1), +\fBxexsplit\fP(1), +\fBxfd2atr\fP(1), +\fBxex\fP(5), +\fBatascii\fP(7). +.sp +Any good Atari 8\-bit book: \fIDe Re Atari\fP, \fIThe Atari BASIC Reference +Manual\fP, the \fIOS Users\(aq Guide\fP, \fIMapping the Atari\fP, etc. +.\" Generated by docutils manpage writer. +. 
@@ -0,0 +1,231 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <locale.h>
+#include <wchar.h>
+#include <errno.h>
+
+#include "atables.h"
+#include "wtable.h"
+
+/* conversion table in use: ata2utf by default, ics2utf with -i. */
+const char **table = ata2utf;
+
+const char *inverse_on = "\x1b[7m";
+const char *inverse_off = "\x1b[0m";
+
+int underline = 0, reverse = 0, textmode = 0, ics = 0;
+
+/* print a usage summary listing the options main() accepts. */
+void print_help(void) {
+	printf("Usage: a8cat [-r] [-i] [-u] [-t] [file ...]\n");
+}
+
+/* open file for reading in binary mode; "-" means standard input.
+   prints an error and returns NULL on failure. */
+FILE *open_input(const char *file) {
+	FILE *input;
+
+	if(file[0] == '-' && file[1] == 0) {
+		if(freopen(NULL, "rb", stdin)) {
+			input = stdin;
+		} else {
+			perror("(standard input)");
+			return NULL;
+		}
+	} else if(!(input = fopen(file, "rb"))) {
+		perror(file);
+		return NULL;
+	}
+
+	return input;
+}
+
+/* close a stream returned by open_input(). standard input is left
+   open, so "-" can appear more than once on the command line. */
+static void close_input(FILE *input) {
+	if(input != stdin)
+		fclose(input);
+}
+
+/* called after ESC (0x1b) has been read from input. reads up to 3
+   more characters and compares the result with inverse_on and
+   inverse_off. returns the new inverse mask (0x80 or 0). anything
+   else is written to stdout as-is, and inv is returned unchanged. */
+int handle_escape_seq(int inv, FILE *input) {
+	int count;
+	wint_t c;
+	char buf[5] = { 0x1b, 0, 0, 0, 0 };
+
+	for(count = 1; count < 4; count++) {
+		c = fgetwc(input);
+		if(c == WEOF) break;
+		if(c >= 0x80) {
+			/* can't be part of a sequence we recognize, and won't fit
+			   in a char: push it back so a8revcat() converts it. */
+			ungetwc(c, input);
+			break;
+		}
+		buf[count] = (char)c;
+	}
+
+	if(strcmp(inverse_on, buf) == 0) {
+		return 0x80;
+	} else if(strcmp(inverse_off, buf) == 0) {
+		return 0;
+	} else {
+		fputs(buf, stdout);
+		return inv;
+	}
+}
+
+/* reverse conversion (-r): read UTF-8 text from file, write ATASCII
+   (or XL ICS, if ics is set) to stdout. returns 0 on success, 1 if
+   the file can't be opened or reading it fails. */
+int a8revcat(const char *file) {
+	FILE *input;
+	wint_t c;
+	int d, inv = 0, result = 0;
+
+	if( !(input = open_input(file)) )
+		return 1;
+
+	/* fgetwc() decodes its input according to LC_CTYPE. */
+	if( !setlocale(LC_CTYPE, "en_US.UTF-8") && !setlocale(LC_CTYPE, "C.UTF-8") )
+		fprintf(stderr, "warning: can't set a UTF-8 locale, non-ASCII input may fail\n");
+
+	while( (c = fgetwc(input)) != WEOF ) {
+		if(c == 0x1b) {
+			inv = handle_escape_seq(inv, input);
+		} else if(c == '\n') {
+			putchar(0x9b);
+		} else if(c < 0x80) {
+			putchar((int)c | inv);
+		} else {
+			d = wchar2atascii(c, ics);
+			if(d == -1) {
+				fprintf(stderr, "warning: unrecognized Unicode character %04x\n", (unsigned int)c);
+			} else {
+				putchar(d | inv);
+			}
+		}
+	}
+
+	/* WEOF is returned for read and decoding errors, not just EOF. */
+	if(ferror(input)) {
+		perror(file);
+		result = 1;
+	}
+
+	close_input(input);
+	return result;
+}
+
+/* XXX: hard-coded ANSI/vt100 escape sequences. would be
+   better but more complex to use terminfo to support any ol'
+   terminal... */
+void inverse(int onoff) {
+	fputs((onoff ? inverse_on : inverse_off ), stdout);
+}
+
+/* read ATASCII (or XL ICS) bytes from file and write them to stdout
+   as UTF-8, using the current table. returns 0 on success, 1 if the
+   file can't be opened. */
+int a8cat(const char *file) {
+	FILE *input;
+	int c, inv = 0;
+
+	if( !(input = open_input(file)) )
+		return 1;
+
+	while( (c = fgetc(input)) != EOF ) {
+		if(c == 0x9b) {
+			putchar('\n');
+			continue;
+		}
+
+		if(textmode) {
+			switch(c) {
+				case 0x7f: /* ATASCII tab is $7f, not ASCII's $09 */
+					putchar('\t');
+					continue;
+				case 0xfd: /* bell */
+					putchar('\a');
+					continue;
+				case 0x7e: /* backspace */
+					putchar('\b');
+					continue;
+				default: break;
+			}
+		}
+
+		if(!underline) {
+			/* strings of inverse chars only get one "inverse on" ANSI
+			   sequence, and one "inverse off" afterwards. */
+			if(c & 0x80) {
+				if(!inv) {
+					inv = 1;
+					inverse(1);
+				}
+			} else {
+				if(inv) {
+					inv = 0;
+					inverse(0);
+				}
+			}
+		}
+
+		fputs(table[c & 0x7f], stdout);
+
+		if(underline && (c & 0x80)) {
+			putchar('\b');
+			putchar('_');
+		}
+	}
+
+	/* gotta turn off inverse, so if there's another file after this one,
+	   it doesn't start out being printed in inverse. */
+	if(inv && !underline) inverse(0);
+
+	close_input(input);
+	return 0;
+}
+
+/* parse options, then convert each named file (or standard input if
+   none are given). exit status is 0 if every file was converted,
+   otherwise 1. */
+int main(int argc, char **argv) {
+	int opt, result = 0;
+
+	while( (opt = getopt(argc, argv, "ihurt")) != -1) {
+		switch(opt) {
+			case 'i': table = ics2utf; ics = 1; break;
+			case 'h': print_help(); exit(0); break;
+			case 'u': underline = 1; break;
+			case 'r': reverse = 1; break;
+			case 't': textmode = 1; break;
+			default: print_help(); exit(1); break;
+		}
+	}
+
+	if(reverse) {
+		if(underline || textmode) {
+			fprintf(stderr, "-t and -u options don't make sense with -r.\n");
+			exit(1);
+		}
+	}
+
+	if(optind >= argc) {
+		result = (reverse ? a8revcat("-") : a8cat("-"));
+	} else {
+		while(optind < argc) {
+			/* |= rather than +=: a count of failures could wrap to 0
+			   when truncated to an 8-bit exit status. */
+			result |= (reverse ? a8revcat(argv[optind]) : a8cat(argv[optind]));
+			optind++;
+		}
+	}
+
+	exit(result);
+}
diff --git a/a8cat.rst b/a8cat.rst
new file mode 100644
index 0000000..7557c01
--- /dev/null
+++ b/a8cat.rst
@@ -0,0 +1,58 @@
+=====
+a8cat
+=====
+
+--------------------------------------------------
+Convert Atari 8-bit text to UTF-8 encoded Unicode.
+--------------------------------------------------
+
+.. include:: manhdr.rst
+
+SYNOPSIS
+========
+
+*a8cat* [**-r**] [**-i**] [**-u**] [**-t**] [*infile*] [*infile ...*]
+
+DESCRIPTION
+===========
+
+Convert Atari 8-bit ATASCII or XL ICS (International Character
+Set) text to UTF-8 encoded Unicode. Control graphics characters are
+replaced with their nearest Unicode equivalents (mostly from the Box
+Drawing block, or from the Basic Latin block with **-i** option).
+
+If no *infile*\s are given, input is read from standard input. Output always
+goes to standard output; to write to a file, use a command like::
+
+  a8cat atari.txt > converted.txt
+
+The output is plain UTF-8 Unicode, without BOM.
+
+Inverse video (characters codes above **$80**) are translated using
+the ANSI/VT-100 reverse video escape sequences. Exception: **$9B**
+(Atari EOL) is translated to **\\n** (newline).
+
+OPTIONS
+=======
+
+-i
+  Input uses Atari XL/XE International Character Set encoding, rather than
+  ATASCII graphics.
+
+-u
+  Use "underlining" for inverse video. Each inverse character is followed by
+  a backspace, then a *_* character. When viewed in a pager such as **less**\(1),
+  this causes the characters to appear underlined. Output created with this
+  option cannot be converted back to ATASCII with the **-r** option.
+
+-t
+  Text mode. Normally, everything but EOL (**$9B**) is converted to a
+  Unicode graphics character. In text mode, ATASCII tabs, backspace,
+  and bells are translated to the ASCII versions.
+
+-r
+  Reverse conversion: Input is UTF-8, output is ATASCII (or XL ICS, with **-i**).
+  Beware that printing ATASCII to a terminal may look funny, and may even confuse
+  the terminal. Redirecting to a file is safe.
+
+.. include:: manftr.rst
diff --git a/atables.c b/atables.c
new file mode 100644
index 0000000..ea6eedc
--- /dev/null
+++ b/atables.c
@@ -0,0 +1,265 @@
+/* ATASCII to UTF-8 tables. Generated by mkatables.pl.
+   Do not edit this file; edit mkatables.pl instead. */
+
+const char *ata2utf[] = {
+	"♥", /*   0 $00    ^@ */
+	"┣", /*   1 $01    ^A */
+	"┃", /*   2 $02    ^B */
+	"┛", /*   3 $03    ^C */
+	"┫", /*   4 $04    ^D */
+	"┓", /*   5 $05    ^E */
+	"╱", /*   6 $06    ^F */
+	"╲", /*   7 $07    ^G */
+	"◢", /*   8 $08    ^H */
+	"▗", /*   9 $09    ^I */
+	"◣", /*  10 $0a    ^J */
+	"▝", /*  11 $0b    ^K */
+	"▘", /*  12 $0c    ^L */
+	"▔", /*  13 $0d    ^M */
+	"▁", /*  14 $0e    ^N */
+	"▖", /*  15 $0f    ^O */
+	"♣", /*  16 $10    ^P */
+	"┏", /*  17 $11    ^Q */
+	"━", /*  18 $12    ^R */
+	"╋", /*  19 $13    ^S */
+	"●", /*  20 $14    ^T */
+	"▄", /*  21 $15    ^U */
+	"▎", /*  22 $16    ^V */
+	"┳", /*  23 $17    ^W */
+	"┻", /*  24 $18    ^X */
+	"▌", /*  25 $19    ^Y */
+	"┗", /*  26 $1a    ^Z */
+	"␛", /*  27 $1b    ^[ */
+	"↑", /*  28 $1c    ^\ */
+	"↓", /*  29 $1d    ^] */
+	"←", /*  30 $1e    ^^ */
+	"→", /*  31 $1f    ^_ */
+	" ", /*  32 $20       */
+	"!", /*  33 $21     ! */
+	"\"", /*  34 $22     " */
+	"#", /*  35 $23     # */
+	"$", /*  36 $24     $ */
+	"%", /*  37 $25     % */
+	"&", /*  38 $26     & */
+	"'", /*  39 $27     ' */
+	"(", /*  40 $28     ( */
+	")", /*  41 $29     ) */
+	"*", /*  42 $2a     * */
+	"+", /*  43 $2b     + */
+	",", /*  44 $2c     , */
+	"-", /*  45 $2d     - */
+	".", /*  46 $2e     . */
+	"/", /*  47 $2f     / */
+	"0", /*  48 $30     0 */
+	"1", /*  49 $31     1 */
+	"2", /*  50 $32     2 */
+	"3", /*  51 $33     3 */
+	"4", /*  52 $34     4 */
+	"5", /*  53 $35     5 */
+	"6", /*  54 $36     6 */
+	"7", /*  55 $37     7 */
+	"8", /*  56 $38     8 */
+	"9", /*  57 $39     9 */
+	":", /*  58 $3a     : */
+	";", /*  59 $3b     ; */
+	"<", /*  60 $3c     < */
+	"=", /*  61 $3d     = */
+	">", /*  62 $3e     > */
+	"?", /*  63 $3f     ? */
+	"@", /*  64 $40     @ */
+	"A", /*  65 $41     A */
+	"B", /*  66 $42     B */
+	"C", /*  67 $43     C */
+	"D", /*  68 $44     D */
+	"E", /*  69 $45     E */
+	"F", /*  70 $46     F */
+	"G", /*  71 $47     G */
+	"H", /*  72 $48     H */
+	"I", /*  73 $49     I */
+	"J", /*  74 $4a     J */
+	"K", /*  75 $4b     K */
+	"L", /*  76 $4c     L */
+	"M", /*  77 $4d     M */
+	"N", /*  78 $4e     N */
+	"O", /*  79 $4f     O */
+	"P", /*  80 $50     P */
+	"Q", /*  81 $51     Q */
+	"R", /*  82 $52     R */
+	"S", /*  83 $53     S */
+	"T", /*  84 $54     T */
+	"U", /*  85 $55     U */
+	"V", /*  86 $56     V */
+	"W", /*  87 $57     W */
+	"X", /*  88 $58     X */
+	"Y", /*  89 $59     Y */
+	"Z", /*  90 $5a     Z */
+	"[", /*  91 $5b     [ */
+	"\\", /*  92 $5c     \ */
+	"]", /*  93 $5d     ] */
+	"^", /*  94 $5e     ^ */
+	"_", /*  95 $5f     _ */
+	"◆", /*  96 $60     ` */
+	"a", /*  97 $61     a */
+	"b", /*  98 $62     b */
+	"c", /*  99 $63     c */
+	"d", /* 100 $64     d */
+	"e", /* 101 $65     e */
+	"f", /* 102 $66     f */
+	"g", /* 103 $67     g */
+	"h", /* 104 $68     h */
+	"i", /* 105 $69     i */
+	"j", /* 106 $6a     j */
+	"k", /* 107 $6b     k */
+	"l", /* 108 $6c     l */
+	"m", /* 109 $6d     m */
+	"n", /* 110 $6e     n */
+	"o", /* 111 $6f     o */
+	"p", /* 112 $70     p */
+	"q", /* 113 $71     q */
+	"r", /* 114 $72     r */
+	"s", /* 115 $73     s */
+	"t", /* 116 $74     t */
+	"u", /* 117 $75     u */
+	"v", /* 118 $76     v */
+	"w", /* 119 $77     w */
+	"x", /* 120 $78     x */
+	"y", /* 121 $79     y */
+	"z", /* 122 $7a     z */
+	"♠", /* 123 $7b     { */
+	"|", /* 124 $7c     | */
+	"↰", /* 125 $7d     } */
+	"◀", /* 126 $7e     ~ */
+	"▶", /* 127 $7f [del] */
+};
+
+const char *ics2utf[] = {
+	"á", /*   0 $00    ^@ */
+	"ù", /*   1 $01    ^A */
+	"Ñ", /*   2 $02    ^B */
+	"É", /*   3 $03    ^C */
+	"ç", /*   4 $04    ^D */
+	"ô", /*   5 $05    ^E */
+	"ò", /*   6 $06    ^F */
+	"ì", /*   7 $07    ^G */
+	"£", /*   8 $08    ^H */
+	"ï", /*   9 $09    ^I */
+	"ü", /*  10 $0a    ^J */
+	"ä", /*  11 $0b    ^K */
+	"Ö", /*  12 $0c    ^L */
+	"ú", /*  13 $0d    ^M */
+	"ó", /*  14 $0e    ^N */
+	"ö", /*  15 $0f    ^O */
+	"Ü", /*  16 $10    ^P */
+	"â", /*  17 $11    ^Q */
+	"û", /*  18 $12    ^R */
+	"î", /*  19 $13    ^S */
+	"é", /*  20 $14    ^T */
+	"è", /*  21 $15    ^U */
+	"ñ", /*  22 $16    ^V */
+	"ê", /*  23 $17    ^W */
+	"å", /*  24 $18    ^X */
+	"à", /*  25 $19    ^Y */
+	"Å", /*  26 $1a    ^Z */
+	"␛", /*  27 $1b    ^[ */
+	"↑", /*  28 $1c    ^\ */
+	"↓", /*  29 $1d    ^] */
+	"←", /*  30 $1e    ^^ */
+	"→", /*  31 $1f    ^_ */
+	" ", /*  32 $20       */
+	"!", /*  33 $21     ! */
+	"\"", /*  34 $22     " */
+	"#", /*  35 $23     # */
+	"$", /*  36 $24     $ */
+	"%", /*  37 $25     % */
+	"&", /*  38 $26     & */
+	"'", /*  39 $27     ' */
+	"(", /*  40 $28     ( */
+	")", /*  41 $29     ) */
+	"*", /*  42 $2a     * */
+	"+", /*  43 $2b     + */
+	",", /*  44 $2c     , */
+	"-", /*  45 $2d     - */
+	".", /*  46 $2e     . */
+	"/", /*  47 $2f     / */
+	"0", /*  48 $30     0 */
+	"1", /*  49 $31     1 */
+	"2", /*  50 $32     2 */
+	"3", /*  51 $33     3 */
+	"4", /*  52 $34     4 */
+	"5", /*  53 $35     5 */
+	"6", /*  54 $36     6 */
+	"7", /*  55 $37     7 */
+	"8", /*  56 $38     8 */
+	"9", /*  57 $39     9 */
+	":", /*  58 $3a     : */
+	";", /*  59 $3b     ; */
+	"<", /*  60 $3c     < */
+	"=", /*  61 $3d     = */
+	">", /*  62 $3e     > */
+	"?", /*  63 $3f     ? */
+	"@", /*  64 $40     @ */
+	"A", /*  65 $41     A */
+	"B", /*  66 $42     B */
+	"C", /*  67 $43     C */
+	"D", /*  68 $44     D */
+	"E", /*  69 $45     E */
+	"F", /*  70 $46     F */
+	"G", /*  71 $47     G */
+	"H", /*  72 $48     H */
+	"I", /*  73 $49     I */
+	"J", /*  74 $4a     J */
+	"K", /*  75 $4b     K */
+	"L", /*  76 $4c     L */
+	"M", /*  77 $4d     M */
+	"N", /*  78 $4e     N */
+	"O", /*  79 $4f     O */
+	"P", /*  80 $50     P */
+	"Q", /*  81 $51     Q */
+	"R", /*  82 $52     R */
+	"S", /*  83 $53     S */
+	"T", /*  84 $54     T */
+	"U", /*  85 $55     U */
+	"V", /*  86 $56     V */
+	"W", /*  87 $57     W */
+	"X", /*  88 $58     X */
+	"Y", /*  89 $59     Y */
+	"Z", /*  90 $5a     Z */
+	"[", /*  91 $5b     [ */
+	"\\", /*  92 $5c     \ */
+	"]", /*  93 $5d     ] */
+	"^", /*  94 $5e     ^ */
+	"_", /*  95 $5f     _ */
+	"¡", /*  96 $60     ` */
+	"a", /*  97 $61     a */
+	"b", /*  98 $62     b */
+	"c", /*  99 $63     c */
+	"d", /* 100 $64     d */
+	"e", /* 101 $65     e */
+	"f", /* 102 $66     f */
+	"g", /* 103 $67     g */
+	"h", /* 104 $68     h */
+	"i", /* 105 $69     i */
+	"j", /* 106 $6a     j */
+	"k", /* 107 $6b     k */
+	"l", /* 108 $6c     l */
+	"m", /* 109 $6d     m */
+	"n", /* 110 $6e     n */
+	"o", /* 111 $6f     o */
+	"p", /* 112 $70     p */
+	"q", /* 113 $71     q */
+	"r", /* 114 $72     r */
+	"s", /* 115 $73     s */
+	"t", /* 116 $74     t */
+	"u", /* 117 $75     u */
+	"v", /* 118 $76     v */
+	"w", /* 119 $77     w */
+	"x", /* 120 $78     x */
+	"y", /* 121 $79     y */
+	"z", /* 122 $7a     z */
+	"Ä", /* 123 $7b     { */
+	"|", /* 124 $7c     | */
+	"↰", /* 125 $7d     } */
+	"◀", /* 126 $7e     ~ */
+	"▶", /* 127 $7f [del] */
+};
+
diff --git a/atables.h b/atables.h
new file mode 100644
index 0000000..56e6c34
--- /dev/null
+++ b/atables.h
@@ -0,0 +1,2 @@
+extern const char *ata2utf[];
+extern const char *ics2utf[];
diff --git a/mkatables.pl b/mkatables.pl
new file mode 100644
index 0000000..1eb3a08
--- /dev/null
+++ b/mkatables.pl
@@ -0,0 +1,121 @@
+#!/usr/bin/perl -w
+
+# ATASCII codes whose UTF-8 output isn't the same-numbered ASCII character.
+%atascii = (
+	0 => "♥",
+	1 => "┣",
+	2 => "┃",
+	3 => "┛",
+	4 => "┫",
+	5 => "┓",
+	6 => "╱",
+	7 => "╲",
+	8 => "◢",
+	9 => "▗",
+	10 => "◣",
+	11 => "▝",
+	12 => "▘",
+	13 => "▔",
+	14 => "▁",
+	15 => "▖",
+	16 => "♣",
+	17 => "┏",
+	18 => "━",
+	19 => "╋",
+	20 => "●",
+	21 => "▄",
+	22 => "▎",
+	23 => "┳",
+	24 => "┻",
+	25 => "▌",
+	26 => "┗",
+	27 => "␛",
+	28 => "↑",
+	29 => "↓",
+	30 => "←",
+	31 => "→",
+	34 => "\\\"",
+	92 => "\\\\",
+	96 => "◆",
+	123 => "♠",
+	125 => "↰",
+	126 => "◀",
+	127 => "▶",
+);
+
+# same thing, for the XL international character set (a8cat -i).
+%xl = (
+	0 => "á",
+	1 => "ù",
+	2 => "Ñ",
+	3 => "É",
+	4 => "ç",
+	5 => "ô",
+	6 => "ò",
+	7 => "ì",
+	8 => "£",
+	9 => "ï",
+	10 => "ü",
+	11 => "ä",
+	12 => "Ö",
+	13 => "ú",
+	14 => "ó",
+	15 => "ö",
+	16 => "Ü",
+	17 => "â",
+	18 => "û",
+	19 => "î",
+	20 => "é",
+	21 => "è",
+	22 => "ñ",
+	23 => "ê",
+	24 => "å",
+	25 => "à",
+	26 => "Å",
+	27 => "␛",
+	28 => "↑",
+	29 => "↓",
+	30 => "←",
+	31 => "→",
+	34 => "\\\"",
+	92 => "\\\\",
+	96 => "¡",
+	123 => "Ä",
+	125 => "↰",
+	126 => "◀",
+	127 => "▶",
+);
+
+# printable name of character code $c, used in the generated comments.
+sub getcharname {
+	my $c = shift;
+	if($c == 127) {
+		return "[del]";
+	} elsif($c < 32) {
+		return "^" . chr($c + 64);
+	} else {
+		return chr($c);
+	}
+}
+
+# print a C array called $name with 128 strings: $hash->{code} if it
+# exists, otherwise the ASCII character with that code.
+sub mktable {
+	my ($name, $hash) = @_;
+
+	print "const char *$name\[\] = {\n";
+	for (0..127) {
+		my $cmt = sprintf("/* %3d \$%02x %5s */", $_, $_, getcharname($_));
+		print "\t\"" . ($hash->{$_} || chr($_)), "\", $cmt\n";
+	}
+	print "};\n\n";
+}
+
+print <<EOF;
+/* ATASCII to UTF-8 tables. Generated by mkatables.pl.
+   Do not edit this file; edit mkatables.pl instead. */
+
+EOF
+
+mktable("ata2utf", \%atascii);
+mktable("ics2utf", \%xl);
diff --git a/wtable.c b/wtable.c
new file mode 100644
index 0000000..3c008b3
--- /dev/null
+++ b/wtable.c
@@ -0,0 +1,146 @@
+/* ref:
+https://stackoverflow.com/questions/21737906/how-to-read-write-utf8-text-files-in-c
+*/
+
+#include <stdio.h>
+#include <wchar.h>
+#include "wtable.h"
+
+/*
+#define WSEARCH_DEBUG
+*/
+
+/* number of rows in a lookup table. only works on real arrays, not
+   on pointers or function parameters. */
+#define TABLE_ROWS(t) ((int)(sizeof(t) / sizeof((t)[0])))
+
+/* both tables must stay sorted by Unicode codepoint, for wsearch(). */
+wint_t wchar2ata[][2] = {
+	/* Unicode, ATASCII */
+	{ 0x2190, 0x1e },
+	{ 0x2191, 0x1c },
+	{ 0x2192, 0x1f },
+	{ 0x2193, 0x1d },
+	{ 0x21b0, 0x7d },
+	{ 0x241b, 0x1b },
+	{ 0x2501, 0x12 },
+	{ 0x2503, 0x02 },
+	{ 0x250f, 0x11 },
+	{ 0x2513, 0x05 },
+	{ 0x2517, 0x1a },
+	{ 0x251b, 0x03 },
+	{ 0x2523, 0x01 },
+	{ 0x252b, 0x04 },
+	{ 0x2533, 0x17 },
+	{ 0x253b, 0x18 },
+	{ 0x254b, 0x13 },
+	{ 0x2571, 0x06 },
+	{ 0x2572, 0x07 },
+	{ 0x2581, 0x0e },
+	{ 0x2584, 0x15 },
+	{ 0x258c, 0x19 },
+	{ 0x258e, 0x16 },
+	{ 0x2594, 0x0d },
+	{ 0x2596, 0x0f },
+	{ 0x2597, 0x09 },
+	{ 0x2598, 0x0c },
+	{ 0x259d, 0x0b },
+	{ 0x25b6, 0x7f },
+	{ 0x25c0, 0x7e },
+	{ 0x25c6, 0x60 },
+	{ 0x25cf, 0x14 },
+	{ 0x25e2, 0x08 },
+	{ 0x25e3, 0x0a },
+	{ 0x2660, 0x7b },
+	{ 0x2663, 0x10 },
+	{ 0x2665, 0x00 },
+};
+
+wint_t wchar2ics[][2] = {
+	/* Unicode, ATASCII */
+	{ 0x00a1, 0x60 },
+	{ 0x00a3, 0x08 },
+	{ 0x00c4, 0x7b },
+	{ 0x00c5, 0x1a },
+	{ 0x00c9, 0x03 },
+	{ 0x00d1, 0x02 },
+	{ 0x00d6, 0x0c },
+	{ 0x00dc, 0x10 },
+	{ 0x00e0, 0x19 },
+	{ 0x00e1, 0x00 },
+	{ 0x00e2, 0x11 },
+	{ 0x00e4, 0x0b },
+	{ 0x00e5, 0x18 },
+	{ 0x00e7, 0x04 },
+	{ 0x00e8, 0x15 },
+	{ 0x00e9, 0x14 },
+	{ 0x00ea, 0x17 },
+	{ 0x00ec, 0x07 },
+	{ 0x00ee, 0x13 },
+	{ 0x00ef, 0x09 },
+	{ 0x00f1, 0x16 },
+	{ 0x00f2, 0x06 },
+	{ 0x00f3, 0x0e },
+	{ 0x00f4, 0x05 },
+	{ 0x00f6, 0x0f },
+	{ 0x00f9, 0x01 },
+	{ 0x00fa, 0x0d },
+	{ 0x00fb, 0x12 },
+	{ 0x00fc, 0x0a },
+	{ 0x2190, 0x1e },
+	{ 0x2191, 0x1c },
+	{ 0x2192, 0x1f },
+	{ 0x2193, 0x1d },
+	{ 0x21b0, 0x7d },
+	{ 0x241b, 0x1b },
+	{ 0x25b6, 0x7f },
+	{ 0x25c0, 0x7e },
+};
+
+/* binary search: look for target in column 0 of table, which has
+   rows entries sorted in ascending order. returns the matching
+   value from column 1, or -1 if target isn't in the table. */
+static int wsearch(wint_t table[][2], int rows, wint_t target) {
+	int lo = 0, hi = rows - 1, mid;
+
+	while(lo <= hi) {
+		mid = lo + (hi - lo) / 2;
+
+#ifdef WSEARCH_DEBUG
+		fprintf(stderr, "wsearch(0x%04x): %d-%d, elem = 0x%04x, 0x%02x\n",
+				(unsigned int)target, lo, hi,
+				(unsigned int)table[mid][0], (unsigned int)table[mid][1]);
+#endif
+
+		if(table[mid][0] == target)
+			return (int)table[mid][1];
+		else if(table[mid][0] < target)
+			lo = mid + 1;
+		else
+			hi = mid - 1;
+	}
+
+	return -1;
+}
+
+/* convert Unicode codepoint wc to ATASCII, or to the XL international
+   character set if ics is non-zero. returns -1 if there's no
+   equivalent. each table is searched using its own size, so they
+   don't have to be the same length. */
+int wchar2atascii(wint_t wc, int ics) {
+	if(ics)
+		return wsearch(wchar2ics, TABLE_ROWS(wchar2ics), wc);
+	else
+		return wsearch(wchar2ata, TABLE_ROWS(wchar2ata), wc);
+}
+
+#ifdef WSEARCH_DEBUG
+int main(int argc, char **argv) {
+	printf("%02x\n", wchar2atascii(0x2190, 0));
+	printf("%02x\n", wchar2atascii(0x2571, 0));
+	printf("%02x\n", wchar2atascii(0x25c6, 0));
+	printf("%02x\n", wchar2atascii(0x2665, 0));
+	printf("%02x\n", wchar2atascii(0x2510, 0));
+	return 0;
+}
+#endif
diff --git a/wtable.h b/wtable.h
new file mode 100644
index 0000000..11c5fa2
--- /dev/null
+++ b/wtable.h
@@ -0,0 +1,2 @@
+extern wint_t wchar2ata[][2];
+extern int wchar2atascii(wint_t wc, int ics);
|