diff options
| -rw-r--r-- | Makefile | 13 | ||||
| -rw-r--r-- | a8cat.1 | 124 | ||||
| -rw-r--r-- | a8cat.c | 187 | ||||
| -rw-r--r-- | a8cat.rst | 58 | ||||
| -rw-r--r-- | atables.c | 265 | ||||
| -rw-r--r-- | atables.h | 2 | ||||
| -rw-r--r-- | mkatables.pl | 116 | ||||
| -rw-r--r-- | wtable.c | 140 | ||||
| -rw-r--r-- | wtable.h | 2 | 
9 files changed, 905 insertions, 2 deletions
| @@ -16,9 +16,9 @@ CC=gcc  CFLAGS=-Wall $(COPT) -ansi -D_GNU_SOURCE -DVERSION=\"$(VERSION)\"  # BINS and SCRIPTS go in $BINDIR, DOCS go in $DOCDIR -BINS=a8eol atr2xfd atrsize axe blob2c blob2xex cart2xex cxrefbas dumpbas fenders protbas renumbas rom2cart unmac65 unprotbas vxrefbas xex1to2 xexamine xexcat xexsplit xfd2atr listbas +BINS=a8eol atr2xfd atrsize axe blob2c blob2xex cart2xex cxrefbas dumpbas fenders protbas renumbas rom2cart unmac65 unprotbas vxrefbas xex1to2 xexamine xexcat xexsplit xfd2atr listbas a8cat  SCRIPTS=dasm2atasm a8utf8 -MANS=a8eol.1 xfd2atr.1 atr2xfd.1 blob2c.1 cart2xex.1 fenders.1 xexsplit.1 xexcat.1 atrsize.1 rom2cart.1 unmac65.1 axe.1 dasm2atasm.1 a8utf8.1 blob2xex.1 xexamine.1 xex1to2.1 unprotbas.1 protbas.1 renumbas.1 dumpbas.1 vxrefbas.1 cxrefbas.1 listbas.1 +MANS=a8eol.1 xfd2atr.1 atr2xfd.1 blob2c.1 cart2xex.1 fenders.1 xexsplit.1 xexcat.1 atrsize.1 rom2cart.1 unmac65.1 axe.1 dasm2atasm.1 a8utf8.1 blob2xex.1 xexamine.1 xex1to2.1 unprotbas.1 protbas.1 renumbas.1 dumpbas.1 vxrefbas.1 cxrefbas.1 listbas.1 a8cat.1  MAN5S=xex.5  MAN7S=atascii.7  DOCS=README.txt equates.inc *.dasm LICENSE ksiders/atr.txt @@ -66,6 +66,12 @@ listbas: listbas.c bas.o bcdfp.o tokens.o  bas.o: bas.c bas.h +wtable.o: wtable.c wtable.h + +atables.o: atables.c atables.h + +a8cat: a8cat.c atables.o wtable.o +  subdirs:  	for dir in $(SUBDIRS); do make -C $$dir COPT="$(COPT)"; done @@ -75,6 +81,9 @@ xfd2atr: xfd2atr.c  atr2xfd: atr2xfd.c +atables.c: mkatables.pl +	perl mkatables.pl > atables.c +  # note to cross-compiler users: If you're building the *.bin targets,  # blob2c needs to be executable on the build host. It'd also be nice  # to build a blob2c for the target platform... Probably you can do @@ -0,0 +1,124 @@ +.\" Man page generated from reStructuredText. +. +. +.nr rst2man-indent-level 0 +. 
+.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.TH "A8CAT" 1 "2024-06-29" "0.2.1" "Urchlay's Atari 8-bit Tools" +.SH NAME +a8cat \- Convert Atari 8-bit text to UTF-8 encoded Unicode. +.SH SYNOPSIS +.sp +\fIa8cat\fP [\fB\-r\fP] [\fB\-i\fP] [\fB\-u\fP] [\fB\-t\fP] [\fIinfile\fP] [\fIinfile ...\fP] +.SH DESCRIPTION +.sp +Convert Atari 8\-bit ATASCII or XL ICS (International Character +Set) text to UTF\-8 encoded Unicode. Control graphics characters are +replaced with their nearest Unicode equivalents (mostly from the Box +Drawing block, or from the Basic Latin block with \fB\-i\fP option). +.sp +If no \fIinfile\fPs are given, input is read from standard input. Output always +goes to standard output; to write to a file, use a command like: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +a8cat atari.txt > converted.txt +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +The output is plain UTF\-8 Unicode, without BOM. +.sp +Inverse video (characters codes above \fB$80\fP) are translated using +the ANSI/VT\-100 reverse video escape sequences. Exception: \fB$9B\fP +(Atari EOL) is translated to \fB\en\fP (newline). +.SH OPTIONS +.INDENT 0.0 +.TP +.B  \-i +Input uses Atari XL/XE International Character Set encoding, rather than +ATASCII graphics. +.TP +.B  \-u +Use "underlining" for inverse video. Each inverse character is followed by +a backspace, then a \fI_\fP character. 
When viewed in a pager such as \fBless\fP(1), +this causes the characters to appear underlined. Output created with this +option cannot be converted back to ATASCII with the \fB\-r\fP option. +.TP +.B  \-t +Text mode. Normally, everything but EOL (\fB$9B\fP) is converted to a +Unicode graphics character. In text mode, ATASCII tabs, backspace, +and bells are translated to the ASCII versions. +.TP +.B  \-r +Reverse conversion: Input is UTF\-8, output is ATASCII (or XL ICS, with \fB\-i\fP). +Beware that printing ATASCII to a terminal may look funny, and may even confuse +the terminal. Redirecting to a file is safe. +.UNINDENT +.SH COPYRIGHT +.sp +WTFPL. See \fI\%http://www.wtfpl.net/txt/copying/\fP for details. +.SH AUTHOR +.INDENT 0.0 +.IP B. 3 +Watson <\fI\%urchlay@slackware.uk\fP>; Urchlay on irc.libera.chat \fI##atari\fP\&. +.UNINDENT +.SH SEE ALSO +.sp +\fBa8eol\fP(1), +\fBa8utf8\fP(1), +\fBatr2xfd\fP(1), +\fBatrsize\fP(1), +\fBaxe\fP(1), +\fBblob2c\fP(1), +\fBblob2xex\fP(1), +\fBcart2xex\fP(1), +\fBcxrefbas\fP(1), +\fBdasm2atasm\fP(1), +\fBdumpbas\fP(1), +\fBf2toxex\fP(1), +\fBfenders\fP(1), +\fBlistbas\fP(1), +\fBprotbas\fP(1), +\fBrenumbas\fP(1), +\fBrom2cart\fP(1), +\fBunmac65\fP(1), +\fBunprotbas\fP(1), +\fBvxrefbas\fP(1), +\fBxexamine\fP(1), +\fBxexcat\fP(1), +\fBxexsplit\fP(1), +\fBxfd2atr\fP(1), +\fBxex\fP(5), +\fBatascii\fP(7). +.sp +Any good Atari 8\-bit book: \fIDe Re Atari\fP, \fIThe Atari BASIC  Reference +Manual\fP,  the  \fIOS Users\(aq Guide\fP, \fIMapping the Atari\fP, etc. +.\" Generated by docutils manpage writer. +. 
@@ -0,0 +1,230 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <locale.h>
+#include <wchar.h>
+#include <errno.h>
+
+#include "atables.h"
+#include "wtable.h"
+
+/* conversion table used by a8cat(): ata2utf by default, ics2utf with -i. */
+const char **table = ata2utf;
+
+const char *inverse_on  = "\x1b[7m";
+const char *inverse_off = "\x1b[0m";
+
+/* set from the -u, -r, -t and -i options, respectively. */
+int underline = 0, reverse = 0, textmode = 0, ics = 0;
+
+/* print the usage message on standard output. */
+void print_help(void) {
+	printf("Usage: a8cat [-r] [-i] [-u] [-t] [file ...]\n");
+}
+
+/* open a file for reading, in binary mode. "-" means standard input.
+   returns NULL (after printing an error message) if it can't be done. */
+FILE *open_input(const char *file) {
+	FILE *input;
+
+	if(file[0] == '-' && file[1] == 0) {
+		if(freopen(NULL, "rb", stdin)) {
+			input = stdin;
+		} else {
+			perror("(standard input)");
+			return NULL;
+		}
+	} else if(!(input = fopen(file, "rb"))) {
+		perror(file);
+		return NULL;
+	}
+
+	return input;
+}
+
+/* for -r: print the ATASCII (or ICS) equivalent of a non-ASCII
+   character, with the inverse video bit 'inv' added. if there is no
+   equivalent, print a warning on standard error instead. */
+static void put_unicode(wint_t wc, int inv) {
+	int d = wchar2atascii(wc, ics);
+
+	if(d == -1) {
+		fprintf(stderr, "warning: unrecognized Unicode character %04x\n", (unsigned int)wc);
+	} else {
+		putchar(d | inv);
+	}
+}
+
+/* for -r: called when an ESC is read from the input. reads up to 3
+   more characters, and returns the new inverse video bit: 0x80 for
+   the "inverse on" sequence, 0 for "inverse off". any other sequence
+   is copied to standard output, and 'inv' is returned unchanged. */
+int handle_escape_seq(int inv, FILE *input) {
+	int count;
+	wint_t c, wide = 0;
+	char buf[5] = { 0x1b, 0, 0, 0, 0 };
+
+	for(count = 1; count < 4; count++) {
+		c = fgetwc(input);
+		if(c == WEOF) break;
+		if(c > 0x7f) {
+			/* won't fit in buf, and can't be part of either sequence.
+			   stop reading; it gets converted below. */
+			wide = c;
+			break;
+		}
+		buf[count] = (char)c;
+	}
+
+	if(strcmp(inverse_on, buf) == 0) {
+		return 0x80;
+	} else if(strcmp(inverse_off, buf) == 0) {
+		return 0;
+	} else {
+		fputs(buf, stdout);
+		if(wide) put_unicode(wide, inv);
+		return inv;
+	}
+}
+
+/* the -r option: convert one UTF-8 file to ATASCII (or ICS) on standard
+   output. returns 0 for success, 1 if the file couldn't be opened or
+   reading it failed (e.g. fgetwc() couldn't decode the input). */
+int a8revcat(const char *file) {
+	FILE *input;
+	wint_t c;
+	int inv = 0, result = 0;
+
+	if( !(input = open_input(file)) )
+		return 1;
+
+	setlocale(LC_CTYPE, "en_US.UTF-8");
+	while( (c = fgetwc(input)) != WEOF ) {
+		if(c == 0x1b) {
+			inv = handle_escape_seq(inv, input);
+		} else if(c == '\n') {
+			putchar(0x9b);
+		} else if(c < 0x80) {
+			putchar((int)c | inv);
+		} else {
+			put_unicode(c, inv);
+		}
+	}
+
+	/* WEOF is returned for errors as well as end of file. */
+	if(ferror(input)) {
+		perror(file);
+		result = 1;
+	}
+
+	fclose(input);
+	return result;
+}
+
+/* XXX: hard-coded ANSI/vt100 escape sequences. would be
+   better but more complex to use terminfo to support any ol'
+   terminal... */
+void inverse(int onoff) {
+	fputs((onoff ? inverse_on : inverse_off ), stdout);
+}
+
+/* convert one ATASCII (or ICS) file to UTF-8 on standard output.
+   returns 0 for success, 1 if the file couldn't be opened. */
+int a8cat(const char *file) {
+	FILE *input;
+	int c, inv = 0;
+
+	if( !(input = open_input(file)) )
+		return 1;
+
+	while( (c = fgetc(input)) != EOF ) {
+		if(c == 0x9b) {
+			putchar('\n');
+			continue;
+		}
+
+		if(textmode) {
+			switch(c) {
+				case 0x09: /* Atari TAB is same as ASCII */
+					putchar('\t');
+					continue;
+				case 0xfd: /* bell */
+					putchar('\a');
+					continue;
+				case 0x7e: /* backspace */
+					putchar('\b');
+					continue;
+				default: break;
+			}
+		}
+
+		if(!underline) {
+			/* strings of inverse chars only get one "inverse on" ANSI
+			   sequence, and one "inverse off" afterwards. */
+			if(c & 0x80) {
+				if(!inv) {
+					inv = 1;
+					inverse(1);
+				}
+			} else {
+				if(inv) {
+					inv = 0;
+					inverse(0);
+				}
+			}
+		}
+
+		fputs(table[c & 0x7f], stdout);
+
+		if(underline && (c & 0x80)) {
+			putchar('\b');
+			putchar('_');
+		}
+	}
+
+	/* gotta turn off inverse, so if there's another file after this one,
+	   it doesn't start out being printed in inverse. */
+	if(inv && !underline) inverse(0);
+
+	fclose(input);
+	return 0;
+}
+
+/* parse the options, then convert each file named on the command line
+   (or standard input, if there are none). the exit status is the
+   number of files that failed, capped at 255. */
+int main(int argc, char **argv) {
+	int opt, result = 0;
+
+	while( (opt = getopt(argc, argv, "ihurt")) != -1) {
+		switch(opt) {
+			case 'i': table = ics2utf; ics = 1; break;
+			case 'h': print_help(); exit(0); break;
+			case 'u': underline = 1; break;
+			case 'r': reverse = 1; break;
+			case 't': textmode = 1; break;
+			default: print_help(); exit(1); break;
+		}
+	}
+
+	if(reverse) {
+		if(underline || textmode) {
+			fprintf(stderr, "-t and -u options don't make sense with -r.\n");
+			exit(1);
+		}
+	}
+
+	if(optind >= argc) {
+		result = (reverse ? a8revcat("-") : a8cat("-"));
+	} else {
+		while(optind < argc) {
+			result += (reverse ? a8revcat(argv[optind]) : a8cat(argv[optind]));
+			optind++;
+		}
+	}
+
+	/* only the low 8 bits of the status get back to the parent, so
+	   don't let 256 failures turn into "success". */
+	exit(result > 255 ? 255 : result);
+}
diff --git a/a8cat.rst b/a8cat.rst
new file mode 100644
index 0000000..7557c01
--- /dev/null
+++ b/a8cat.rst
@@ -0,0 +1,58 @@
+=====
+a8cat
+=====
+
+--------------------------------------------------
+Convert Atari 8-bit text to UTF-8 encoded Unicode.
+--------------------------------------------------
+
+.. include:: manhdr.rst
+
+SYNOPSIS
+========
+
+*a8cat* [**-r**] [**-i**] [**-u**] [**-t**] [*infile*] [*infile ...*]
+
+DESCRIPTION
+===========
+
+Convert Atari 8-bit ATASCII or XL ICS (International Character
+Set) text to UTF-8 encoded Unicode. Control graphics characters are
+replaced with their nearest Unicode equivalents (mostly from the Box
+Drawing block, or from the Basic Latin block with **-i** option).
+
+If no *infile*\s are given, input is read from standard input. Output always
+goes to standard output; to write to a file, use a command like::
+
+  a8cat atari.txt > converted.txt
+
+The output is plain UTF-8 Unicode, without BOM.
+
+Inverse video characters (character codes **$80** and above) are translated using
+the ANSI/VT-100 reverse video escape sequences. Exception: **$9B**
+(Atari EOL) is translated to **\\n** (newline).
+
+OPTIONS
+=======
+
+-i
+  Input uses Atari XL/XE International Character Set encoding, rather than
+  ATASCII graphics.
+
+-u
+  Use "underlining" for inverse video. Each inverse character is followed by
+  a backspace, then a *_* character. When viewed in a pager such as **less**\(1),
+  this causes the characters to appear underlined. Output created with this
+  option cannot be converted back to ATASCII with the **-r** option.
+
+-t
+  Text mode. Normally, everything but EOL (**$9B**) is converted to a
+  Unicode graphics character. In text mode, ATASCII tabs, backspace,
+  and bells are translated to the ASCII versions.
+
+-r
+  Reverse conversion: Input is UTF-8, output is ATASCII (or XL ICS, with **-i**).
+  Beware that printing ATASCII to a terminal may look funny, and may even confuse +  the terminal. Redirecting to a file is safe. + +.. include:: manftr.rst diff --git a/atables.c b/atables.c new file mode 100644 index 0000000..ea6eedc --- /dev/null +++ b/atables.c @@ -0,0 +1,265 @@ +/* ATASCII to UTF-8 tables. Generated by mkatables.pl. +   Do not edit this file; edit mkatables.pl instead. */ + +const char *ata2utf[] = { +	"♥",  /*   0 $00    ^@ */ +	"┣",  /*   1 $01    ^A */ +	"┃",  /*   2 $02    ^B */ +	"┛",  /*   3 $03    ^C */ +	"┫",  /*   4 $04    ^D */ +	"┓",  /*   5 $05    ^E */ +	"╱",  /*   6 $06    ^F */ +	"╲",  /*   7 $07    ^G */ +	"◢",  /*   8 $08    ^H */ +	"▗",  /*   9 $09    ^I */ +	"◣",  /*  10 $0a    ^J */ +	"▝",  /*  11 $0b    ^K */ +	"▘",  /*  12 $0c    ^L */ +	"▔",  /*  13 $0d    ^M */ +	"▁",  /*  14 $0e    ^N */ +	"▖",  /*  15 $0f    ^O */ +	"♣",  /*  16 $10    ^P */ +	"┏",  /*  17 $11    ^Q */ +	"━",  /*  18 $12    ^R */ +	"╋",  /*  19 $13    ^S */ +	"●",  /*  20 $14    ^T */ +	"▄",  /*  21 $15    ^U */ +	"▎",  /*  22 $16    ^V */ +	"┳",  /*  23 $17    ^W */ +	"┻",  /*  24 $18    ^X */ +	"▌",  /*  25 $19    ^Y */ +	"┗",  /*  26 $1a    ^Z */ +	"␛",  /*  27 $1b    ^[ */ +	"↑",  /*  28 $1c    ^\ */ +	"↓",  /*  29 $1d    ^] */ +	"←",  /*  30 $1e    ^^ */ +	"→",  /*  31 $1f    ^_ */ +	" ",  /*  32 $20       */ +	"!",  /*  33 $21     ! */ +	"\"",  /*  34 $22     " */ +	"#",  /*  35 $23     # */ +	"$",  /*  36 $24     $ */ +	"%",  /*  37 $25     % */ +	"&",  /*  38 $26     & */ +	"'",  /*  39 $27     ' */ +	"(",  /*  40 $28     ( */ +	")",  /*  41 $29     ) */ +	"*",  /*  42 $2a     * */ +	"+",  /*  43 $2b     + */ +	",",  /*  44 $2c     , */ +	"-",  /*  45 $2d     - */ +	".",  /*  46 $2e     . 
*/ +	"/",  /*  47 $2f     / */ +	"0",  /*  48 $30     0 */ +	"1",  /*  49 $31     1 */ +	"2",  /*  50 $32     2 */ +	"3",  /*  51 $33     3 */ +	"4",  /*  52 $34     4 */ +	"5",  /*  53 $35     5 */ +	"6",  /*  54 $36     6 */ +	"7",  /*  55 $37     7 */ +	"8",  /*  56 $38     8 */ +	"9",  /*  57 $39     9 */ +	":",  /*  58 $3a     : */ +	";",  /*  59 $3b     ; */ +	"<",  /*  60 $3c     < */ +	"=",  /*  61 $3d     = */ +	">",  /*  62 $3e     > */ +	"?",  /*  63 $3f     ? */ +	"@",  /*  64 $40     @ */ +	"A",  /*  65 $41     A */ +	"B",  /*  66 $42     B */ +	"C",  /*  67 $43     C */ +	"D",  /*  68 $44     D */ +	"E",  /*  69 $45     E */ +	"F",  /*  70 $46     F */ +	"G",  /*  71 $47     G */ +	"H",  /*  72 $48     H */ +	"I",  /*  73 $49     I */ +	"J",  /*  74 $4a     J */ +	"K",  /*  75 $4b     K */ +	"L",  /*  76 $4c     L */ +	"M",  /*  77 $4d     M */ +	"N",  /*  78 $4e     N */ +	"O",  /*  79 $4f     O */ +	"P",  /*  80 $50     P */ +	"Q",  /*  81 $51     Q */ +	"R",  /*  82 $52     R */ +	"S",  /*  83 $53     S */ +	"T",  /*  84 $54     T */ +	"U",  /*  85 $55     U */ +	"V",  /*  86 $56     V */ +	"W",  /*  87 $57     W */ +	"X",  /*  88 $58     X */ +	"Y",  /*  89 $59     Y */ +	"Z",  /*  90 $5a     Z */ +	"[",  /*  91 $5b     [ */ +	"\\",  /*  92 $5c     \ */ +	"]",  /*  93 $5d     ] */ +	"^",  /*  94 $5e     ^ */ +	"_",  /*  95 $5f     _ */ +	"◆",  /*  96 $60     ` */ +	"a",  /*  97 $61     a */ +	"b",  /*  98 $62     b */ +	"c",  /*  99 $63     c */ +	"d",  /* 100 $64     d */ +	"e",  /* 101 $65     e */ +	"f",  /* 102 $66     f */ +	"g",  /* 103 $67     g */ +	"h",  /* 104 $68     h */ +	"i",  /* 105 $69     i */ +	"j",  /* 106 $6a     j */ +	"k",  /* 107 $6b     k */ +	"l",  /* 108 $6c     l */ +	"m",  /* 109 $6d     m */ +	"n",  /* 110 $6e     n */ +	"o",  /* 111 $6f     o */ +	"p",  /* 112 $70     p */ +	"q",  /* 113 $71     q */ +	"r",  /* 114 $72     r */ +	"s",  /* 115 $73     s */ +	"t",  /* 116 $74     t */ +	"u",  /* 117 $75     u */ +	"v",  
/* 118 $76     v */ +	"w",  /* 119 $77     w */ +	"x",  /* 120 $78     x */ +	"y",  /* 121 $79     y */ +	"z",  /* 122 $7a     z */ +	"♠",  /* 123 $7b     { */ +	"|",  /* 124 $7c     | */ +	"↰",  /* 125 $7d     } */ +	"◀",  /* 126 $7e     ~ */ +	"▶",  /* 127 $7f [del] */ +}; + +const char *ics2utf[] = { +	"á",  /*   0 $00    ^@ */ +	"ù",  /*   1 $01    ^A */ +	"Ñ",  /*   2 $02    ^B */ +	"É",  /*   3 $03    ^C */ +	"ç",  /*   4 $04    ^D */ +	"ô",  /*   5 $05    ^E */ +	"ò",  /*   6 $06    ^F */ +	"ì",  /*   7 $07    ^G */ +	"£",  /*   8 $08    ^H */ +	"ï",  /*   9 $09    ^I */ +	"ü",  /*  10 $0a    ^J */ +	"ä",  /*  11 $0b    ^K */ +	"Ö",  /*  12 $0c    ^L */ +	"ú",  /*  13 $0d    ^M */ +	"ó",  /*  14 $0e    ^N */ +	"ö",  /*  15 $0f    ^O */ +	"Ü",  /*  16 $10    ^P */ +	"â",  /*  17 $11    ^Q */ +	"û",  /*  18 $12    ^R */ +	"î",  /*  19 $13    ^S */ +	"é",  /*  20 $14    ^T */ +	"è",  /*  21 $15    ^U */ +	"ñ",  /*  22 $16    ^V */ +	"ê",  /*  23 $17    ^W */ +	"ȧ",  /*  24 $18    ^X */ +	"à",  /*  25 $19    ^Y */ +	"Ȧ",  /*  26 $1a    ^Z */ +	"␛",  /*  27 $1b    ^[ */ +	"↑",  /*  28 $1c    ^\ */ +	"↓",  /*  29 $1d    ^] */ +	"←",  /*  30 $1e    ^^ */ +	"→",  /*  31 $1f    ^_ */ +	" ",  /*  32 $20       */ +	"!",  /*  33 $21     ! */ +	"\"",  /*  34 $22     " */ +	"#",  /*  35 $23     # */ +	"$",  /*  36 $24     $ */ +	"%",  /*  37 $25     % */ +	"&",  /*  38 $26     & */ +	"'",  /*  39 $27     ' */ +	"(",  /*  40 $28     ( */ +	")",  /*  41 $29     ) */ +	"*",  /*  42 $2a     * */ +	"+",  /*  43 $2b     + */ +	",",  /*  44 $2c     , */ +	"-",  /*  45 $2d     - */ +	".",  /*  46 $2e     . 
*/ +	"/",  /*  47 $2f     / */ +	"0",  /*  48 $30     0 */ +	"1",  /*  49 $31     1 */ +	"2",  /*  50 $32     2 */ +	"3",  /*  51 $33     3 */ +	"4",  /*  52 $34     4 */ +	"5",  /*  53 $35     5 */ +	"6",  /*  54 $36     6 */ +	"7",  /*  55 $37     7 */ +	"8",  /*  56 $38     8 */ +	"9",  /*  57 $39     9 */ +	":",  /*  58 $3a     : */ +	";",  /*  59 $3b     ; */ +	"<",  /*  60 $3c     < */ +	"=",  /*  61 $3d     = */ +	">",  /*  62 $3e     > */ +	"?",  /*  63 $3f     ? */ +	"@",  /*  64 $40     @ */ +	"A",  /*  65 $41     A */ +	"B",  /*  66 $42     B */ +	"C",  /*  67 $43     C */ +	"D",  /*  68 $44     D */ +	"E",  /*  69 $45     E */ +	"F",  /*  70 $46     F */ +	"G",  /*  71 $47     G */ +	"H",  /*  72 $48     H */ +	"I",  /*  73 $49     I */ +	"J",  /*  74 $4a     J */ +	"K",  /*  75 $4b     K */ +	"L",  /*  76 $4c     L */ +	"M",  /*  77 $4d     M */ +	"N",  /*  78 $4e     N */ +	"O",  /*  79 $4f     O */ +	"P",  /*  80 $50     P */ +	"Q",  /*  81 $51     Q */ +	"R",  /*  82 $52     R */ +	"S",  /*  83 $53     S */ +	"T",  /*  84 $54     T */ +	"U",  /*  85 $55     U */ +	"V",  /*  86 $56     V */ +	"W",  /*  87 $57     W */ +	"X",  /*  88 $58     X */ +	"Y",  /*  89 $59     Y */ +	"Z",  /*  90 $5a     Z */ +	"[",  /*  91 $5b     [ */ +	"\\",  /*  92 $5c     \ */ +	"]",  /*  93 $5d     ] */ +	"^",  /*  94 $5e     ^ */ +	"_",  /*  95 $5f     _ */ +	"¡",  /*  96 $60     ` */ +	"a",  /*  97 $61     a */ +	"b",  /*  98 $62     b */ +	"c",  /*  99 $63     c */ +	"d",  /* 100 $64     d */ +	"e",  /* 101 $65     e */ +	"f",  /* 102 $66     f */ +	"g",  /* 103 $67     g */ +	"h",  /* 104 $68     h */ +	"i",  /* 105 $69     i */ +	"j",  /* 106 $6a     j */ +	"k",  /* 107 $6b     k */ +	"l",  /* 108 $6c     l */ +	"m",  /* 109 $6d     m */ +	"n",  /* 110 $6e     n */ +	"o",  /* 111 $6f     o */ +	"p",  /* 112 $70     p */ +	"q",  /* 113 $71     q */ +	"r",  /* 114 $72     r */ +	"s",  /* 115 $73     s */ +	"t",  /* 116 $74     t */ +	"u",  /* 117 $75     u */ +	"v",  
/* 118 $76     v */ +	"w",  /* 119 $77     w */ +	"x",  /* 120 $78     x */ +	"y",  /* 121 $79     y */ +	"z",  /* 122 $7a     z */ +	"Ä",  /* 123 $7b     { */ +	"|",  /* 124 $7c     | */ +	"↰",  /* 125 $7d     } */ +	"◀",  /* 126 $7e     ~ */ +	"▶",  /* 127 $7f [del] */ +}; + diff --git a/atables.h b/atables.h new file mode 100644 index 0000000..56e6c34 --- /dev/null +++ b/atables.h @@ -0,0 +1,2 @@ +extern const char *ata2utf[]; +extern const char *ics2utf[]; diff --git a/mkatables.pl b/mkatables.pl new file mode 100644 index 0000000..1eb3a08 --- /dev/null +++ b/mkatables.pl @@ -0,0 +1,116 @@ +#!/usr/bin/perl -w + +%atascii = ( +	0 => "♥", +	1 => "┣", +	2 => "┃", +	3 => "┛", +	4 => "┫", +	5 => "┓", +	6 => "╱", +	7 => "╲", +	8 => "◢", +	9 => "▗", +	10 => "◣", +	11 => "▝", +	12 => "▘", +	13 => "▔", +	14 => "▁", +	15 => "▖", +	16 => "♣", +	17 => "┏", +	18 => "━", +	19 => "╋", +	20 => "●", +	21 => "▄", +	22 => "▎", +	23 => "┳", +	24 => "┻", +	25 => "▌", +	26 => "┗", +	27 => "␛", +	28 => "↑", +	29 => "↓", +	30 => "←", +	31 => "→", +	34 => "\\\"", +	92 => "\\\\", +	96 => "◆", +	123 => "♠", +	125 => "↰", +	126 => "◀", +	127 => "▶", +); + +%xl = ( +	0 => "á", +	1 => "ù", +	2 => "Ñ", +	3 => "É", +	4 => "ç", +	5 => "ô", +	6 => "ò", +	7 => "ì", +	8 => "£", +	9 => "ï", +	10 => "ü", +	11 => "ä", +	12 => "Ö", +	13 => "ú", +	14 => "ó", +	15 => "ö", +	16 => "Ü", +	17 => "â", +	18 => "û", +	19 => "î", +	20 => "é", +	21 => "è", +	22 => "ñ", +	23 => "ê", +	24 => "ȧ", +	25 => "à", +	26 => "Ȧ", +	27 => "␛", +	28 => "↑", +	29 => "↓", +	30 => "←", +	31 => "→", +	34 => "\\\"", +	92 => "\\\\", +	96 => "¡", +	123 => "Ä", +	125 => "↰", +	126 => "◀", +	127 => "▶", +); + +sub getcharname { +	my $c = shift; +	if($c == 127) { +		return "[del]"; +	} elsif($c < 32) { +		return "^" . 
chr($c + 64);
+	} else {
+		return chr($c);
+	}
+}
+
+sub mktable {
+	my ($name, $hash) = @_;
+
+	print "const char *$name\[\] = {\n";
+	for (0..127) {
+		my $cmt = sprintf("/* %3d \$%02x %5s */", $_, $_, getcharname($_));
+		print "\t\"" . ($hash->{$_} || chr($_)), "\",  $cmt\n";
+	}
+	print "};\n\n";
+}
+
+print <<EOF;
+/* ATASCII to UTF-8 tables. Generated by mkatables.pl.
+   Do not edit this file; edit mkatables.pl instead. */
+
+EOF
+
+mktable("ata2utf", \%atascii);
+mktable("ics2utf", \%xl);
diff --git a/wtable.c b/wtable.c
new file mode 100644
index 0000000..3c008b3
--- /dev/null
+++ b/wtable.c
@@ -0,0 +1,146 @@
+/* ref:
+https://stackoverflow.com/questions/21737906/how-to-read-write-utf8-text-files-in-c
+*/
+
+#include <stdio.h>
+#include <wchar.h>
+#include "wtable.h"
+
+/*
+#define WSEARCH_DEBUG
+*/
+
+/* number of rows in a table. only works on the array itself, not on
+   a pointer to it. */
+#define TBLSIZE(t) ((int)(sizeof(t) / sizeof((t)[0])))
+
+/* Unicode to ATASCII. wsearch() does a binary search on these tables,
+   so the rows must stay sorted by the Unicode column. */
+wint_t wchar2ata[][2] = {
+	/* Unicode, ATASCII */
+	{ 0x2190, 0x1e },
+	{ 0x2191, 0x1c },
+	{ 0x2192, 0x1f },
+	{ 0x2193, 0x1d },
+	{ 0x21b0, 0x7d },
+	{ 0x241b, 0x1b },
+	{ 0x2501, 0x12 },
+	{ 0x2503, 0x02 },
+	{ 0x250f, 0x11 },
+	{ 0x2513, 0x05 },
+	{ 0x2517, 0x1a },
+	{ 0x251b, 0x03 },
+	{ 0x2523, 0x01 },
+	{ 0x252b, 0x04 },
+	{ 0x2533, 0x17 },
+	{ 0x253b, 0x18 },
+	{ 0x254b, 0x13 },
+	{ 0x2571, 0x06 },
+	{ 0x2572, 0x07 },
+	{ 0x2581, 0x0e },
+	{ 0x2584, 0x15 },
+	{ 0x258c, 0x19 },
+	{ 0x258e, 0x16 },
+	{ 0x2594, 0x0d },
+	{ 0x2596, 0x0f },
+	{ 0x2597, 0x09 },
+	{ 0x2598, 0x0c },
+	{ 0x259d, 0x0b },
+	{ 0x25b6, 0x7f },
+	{ 0x25c0, 0x7e },
+	{ 0x25c6, 0x60 },
+	{ 0x25cf, 0x14 },
+	{ 0x25e2, 0x08 },
+	{ 0x25e3, 0x0a },
+	{ 0x2660, 0x7b },
+	{ 0x2663, 0x10 },
+	{ 0x2665, 0x00 },
+};
+
+/* Unicode to XL International Character Set. same rules as above. */
+wint_t wchar2ics[][2] = {
+	/* Unicode, ICS */
+	{ 0x00a1, 0x60 },
+	{ 0x00a3, 0x08 },
+	{ 0x00c4, 0x7b },
+	{ 0x00c9, 0x03 },
+	{ 0x00d1, 0x02 },
+	{ 0x00d6, 0x0c },
+	{ 0x00dc, 0x10 },
+	{ 0x00e0, 0x19 },
+	{ 0x00e1, 0x00 },
+	{ 0x00e2, 0x11 },
+	{ 0x00e4, 0x0b },
+	{ 0x00e7, 0x04 },
+	{ 0x00e8, 0x15 },
+	{ 0x00e9, 0x14 },
+	{ 0x00ea, 0x17 },
+	{ 0x00ec, 0x07 },
+	{ 0x00ee, 0x13 },
+	{ 0x00ef, 0x09 },
+	{ 0x00f1, 0x16 },
+	{ 0x00f2, 0x06 },
+	{ 0x00f3, 0x0e },
+	{ 0x00f4, 0x05 },
+	{ 0x00f6, 0x0f },
+	{ 0x00f9, 0x01 },
+	{ 0x00fa, 0x0d },
+	{ 0x00fb, 0x12 },
+	{ 0x00fc, 0x0a },
+	{ 0x0226, 0x1a },
+	{ 0x0227, 0x18 },
+	{ 0x2190, 0x1e },
+	{ 0x2191, 0x1c },
+	{ 0x2192, 0x1f },
+	{ 0x2193, 0x1d },
+	{ 0x21b0, 0x7d },
+	{ 0x241b, 0x1b },
+	{ 0x25b6, 0x7f },
+	{ 0x25c0, 0x7e },
+};
+
+/* binary search: look for 'target' in the first column of 'table',
+   which has 'count' rows. returns the second column of the row that
+   matches, or -1 if none does. */
+static int wsearch(wint_t table[][2], int count, wint_t target) {
+	int start = 0, end = count - 1, center;
+
+	while(start <= end) {
+		center = start + (end - start) / 2;
+
+#ifdef WSEARCH_DEBUG
+		fprintf(stderr, "wsearch(0x%04x, %d, %d): elem = 0x%04x, 0x%02x\n",
+			(unsigned int)target, start, end,
+			(unsigned int)table[center][0], (unsigned int)table[center][1]);
+#endif
+
+		if(table[center][0] == target)
+			return (int)table[center][1];
+		else if(table[center][0] > target)
+			end = center - 1;
+		else
+			start = center + 1;
+	}
+
+	return -1;
+}
+
+/* convert a Unicode character to ATASCII (or to ICS, if 'ics' is
+   nonzero). returns -1 if there is no equivalent in the table. */
+int wchar2atascii(wint_t wc, int ics) {
+	if(ics)
+		return wsearch(wchar2ics, TBLSIZE(wchar2ics), wc);
+
+	return wsearch(wchar2ata, TBLSIZE(wchar2ata), wc);
+}
+
+#ifdef WSEARCH_DEBUG
+int main(int argc, char **argv) {
+	printf("%02x\n", wchar2atascii(0x2190, 0));
+	printf("%02x\n", wchar2atascii(0x2571, 0));
+	printf("%02x\n", wchar2atascii(0x25c6, 0));
+	printf("%02x\n", wchar2atascii(0x2665, 0));
+	printf("%02x\n", wchar2atascii(0x2510, 0));
+	return 0;
+}
+#endif
diff --git a/wtable.h b/wtable.h
new file mode 100644
index 0000000..11c5fa2
--- /dev/null
+++ b/wtable.h
@@ -0,0 +1,10 @@
+#ifndef WTABLE_H
+#define WTABLE_H
+
+#include <wchar.h> /* for wint_t */
+
+extern wint_t wchar2ata[][2];
+extern wint_t wchar2ics[][2];
+extern int wchar2atascii(wint_t wc, int ics);
+
+#endif
|
