textcomp.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127

/* textcomp.c - compress strings of text to 6 bits per byte.
	loosely based on the z-machine's ZSCII compression.

	Example: "Taipan" (7 bytes, including null terminator) encodes as
	0xb8 0x12 0x50 0x04 0xe0 0x00 (6 bytes).

	Longer strings approach 75% compression ratio.

	No encoded string can be over 256 bytes long, as the decompressor
	can't currently handle it.

	The alphabet contains only upper/lowercase letters, space, newline,
	and some punctuation. In particular, numbers are not supported.

	alphabet:
	0 = end
	1-26 = a-z
	27-52 = A-Z
	53 = space
	54 = !
	55 = %
	56 = ,
	57 = .
	58 = ?
	59 = :
	60 = '
	61 = (
	62 = )
	63 = newline

	All the strings used by taipan.c are listed in the __END__ section
	of messages.pl. The perl script calls this program (textcomp) once
	per string, and outputs C source consisting of the encoded versions.
	Each string in the __END__ section is preceded by a name, and the
	generated C source uses these names with M_ prefixed.

	taipan.c calls the function print_msg(const char *) to decode and
	print an encoded message. The decoding step slows down printing a bit,
	but it's not really noticeable. cputc() is used for printing, so it
	respects the reverse video setting (set by rvs_on() and rvs_off()).

	When a newline is printed, the decoder always prints a carriage
	return first. Any \r sequences listed in messages.pl are discarded
	before encoding is done.

	Actually, no prompts ever use capital X or Z. These should be used for
	dictionary lookups. Maybe X is followed by a 3-bit dict ID, for the 8
	most commonly repeated phrases (one of which will of course be "Taipan"),
	and Z is a 5- or 6-bit ID for 32 or 64 less common phrases. So far this
	isn't implemented because the decompressor isn't reentrant (yet).
*/

#include <stdio.h>
#include <stdlib.h>

unsigned char out[1024];
int bitcount = 0;

int getcode(int c) {
	if(c >= 'a' && c <= 'z')
		return c - 'a' + 1;
	if(c >= 'A' && c <= 'Z')
		return c - 'A' + 27;

	switch(c) {
		case ' ': return 53;
		case '!': return 54;
		case '%': return 55;
		case ',': return 56;
		case '.': return 57;
		case '?': return 58;
		case ':': return 59;
		case '\'': return 60;
		case '(': return 61;
		case ')': return 62;
		case '\n': return 63;
		case '\r': break;
		default:
			fprintf(stderr, "unhandled ASCII code %d\n", c);
			exit(1);
	}

	return 0; /* never executes, shut gcc -Wall up */
}

void appendbit(unsigned char b) {
	int pos = bitcount / 8;
	int bitpos = 7 - (bitcount % 8);
	unsigned char val = b << bitpos;
	out[pos] |= val;
	fprintf(stderr, "%d: appending bit %d at pos %d, bitpos %d, value $%02x\n", bitcount, b, pos, bitpos, val);
	bitcount++;
}

void appendcode(int code) {
	int bit;
	for(bit = 0x20; bit > 0; bit >>= 1) {
		appendbit((code & bit) != 0);
	}
}

int main(int argc, char **argv) {
	int c, code, count = 1; /* 1 to account for null terminator */

	while((c = getchar()) != EOF) {
		code = getcode(c);
		fprintf(stderr, "c == %d, code == %d\n", c, code);
		appendcode(code);
		count++;
	}
	appendcode(0);

	code = 0;
	for(c = 0; c < ((bitcount + 7) / 8); c++) {
		printf("0x%02x ", out[c]);
		code++;
	}

	if(code > 256) {
		fprintf(stderr, "input too long\n");
		exit(1);
	}

	fprintf(stderr, "%d bytes in (added null), %d bytes out, ratio %.2f\n",
			count, code, (float)(code)/(float)count);
	return 0;
}