text compression. 7017 bytes free.

author: B. Watson <yalhcru@gmail.com> 2016-02-21 18:53:49 -0500
committer: B. Watson <yalhcru@gmail.com> 2016-02-21 18:53:49 -0500
commit: a4a884600888e5e2e6b5231c2840c01b44eae644 (patch)
tree: bd7b2ce9c4208c7e7674312968d0a7f61456f1dd /textcomp.c
parent: 6f5bc5adbc21865093ff24af6dbf0c24ea4cfe9c (diff)
download: taipan-a4a884600888e5e2e6b5231c2840c01b44eae644.tar.gz
1 files changed, 126 insertions, 0 deletions
diff --git a/textcomp.c b/textcomp.c
new file mode 100644
index 0000000..f82a91c
--- /dev/null
+++ b/textcomp.c
@@ -0,0 +1,126 @@
+/* textcomp.c - compress strings of text to 6 bits per character.
+	loosely based on the z-machine's ZSCII compression.
+
+	Example: "Taipan" (7 bytes, including null terminator) encodes as
+	0xb8 0x12 0x50 0x04 0xe0 0x00 (6 bytes).
+
+	Longer strings approach 75% compression ratio.
+
+	No encoded string can be over 256 bytes long, as the decompressor
+	can't currently handle it.
+
+	The alphabet contains only upper/lowercase letters, space, newline,
+	and some punctuation. In particular, numbers are not supported.
+
+	alphabet:
+	0 = end
+	1-26 = a-z
+	27-52 = A-Z
+	53 = space
+	54 = !
+	55 = %
+	56 = ,
+	57 = .
+	58 = ?
+	59 = :
+	60 = '
+	61 = (
+	62 = )
+	63 = newline
+
+	All the strings used by taipan.c are listed in the __END__ section
+	of messages.pl. The perl script calls this program (textcomp) once
+	per string, and outputs C source consisting of the encoded versions.
+	Each string in the __END__ section is preceded by a name, and the
+	generated C source uses these names with M_ prefixed.
+
+	taipan.c calls the function print_msg(const char *) to decode and
+	print an encoded message. The decoding step slows down printing a bit,
+	but it's not really noticeable. cputc() is used for printing, so it
+	respects the reverse video setting (set by rvs_on() and rvs_off()).
+
+	When a newline is printed, the decoder always prints a carriage
+	return first. Any \r sequences listed in messages.pl are discarded
+	before encoding is done.
+
+	Actually, no prompts ever use capital X or Z. These should be used for
+	dictionary lookups. Maybe X is followed by a 3-bit dict ID, for the 8
+	most commonly repeated phrases (one of which will of course be "Taipan"),
+	and Z is a 5- or 6-bit ID for 32 or 64 less common phrases. So far this
+	isn't implemented because the decompressor isn't reentrant (yet).
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+
+unsigned char out[1024]; /* packed output bytes, filled one bit at a time by appendbit() */
+int bitcount = 0; /* number of bits appendbit() has written to out[] so far */
+
+/* Map one input character to its 6-bit alphabet code.
+ *
+ * Returns 1-26 for a-z, 27-52 for A-Z, and 53-63 for space, the
+ * supported punctuation and newline (see the alphabet table in the
+ * header comment).
+ *
+ * Returns -1 for '\r'. Carriage return has no code of its own, so the
+ * result is negative to let the caller tell it apart from a real code.
+ * Note that feeding -1 to a 6-bit encoder emits 63 (newline), so the
+ * caller should filter it out first.
+ *
+ * Any other character is reported on stderr and the program exits
+ * with status 1.
+ */
+int getcode(int c) {
+	if(c >= 'a' && c <= 'z')
+		return c - 'a' + 1;
+	if(c >= 'A' && c <= 'Z')
+		return c - 'A' + 27;
+
+	switch(c) {
+		case ' ': return 53;
+		case '!': return 54;
+		case '%': return 55;
+		case ',': return 56;
+		case '.': return 57;
+		case '?': return 58;
+		case ':': return 59;
+		case '\'': return 60;
+		case '(': return 61;
+		case ')': return 62;
+		case '\n': return 63;
+		case '\r':
+			/* used to "break" and fall off the end of the function,
+			   handing the caller an indeterminate value */
+			return -1;
+		default:
+			break;
+	}
+
+	fprintf(stderr, "unhandled ASCII code %d\n", c);
+	exit(1);
+}
+
+/* Append a single bit to out[], filling each byte from its most
+ * significant bit downwards.
+ *
+ * b is the bit value; appendcode() only ever passes 0 or 1. The bit is
+ * OR'ed into place, so this relies on out[] starting out zeroed.
+ * A debug line describing the write goes to stderr.
+ *
+ * If out[] is already full, prints an error and exits with status 1
+ * rather than writing past the end of the buffer.
+ */
+void appendbit(unsigned char b) {
+	int pos = bitcount / 8;
+	int bitpos = 7 - (bitcount % 8);
+	unsigned char val;
+
+	if(pos >= (int)sizeof(out)) {
+		fprintf(stderr, "output buffer full (%d bytes)\n", (int)sizeof(out));
+		exit(1);
+	}
+
+	val = b << bitpos;
+	out[pos] |= val;
+	fprintf(stderr, "%d: appending bit %d at pos %d, bitpos %d, value $%02x\n", bitcount, b, pos, bitpos, val);
+	bitcount++;
+}
+
+/* Append the low six bits of code to the output, highest bit first,
+ * by handing them to appendbit() one at a time.
+ */
+void appendcode(int code) {
+	int mask = 0x20;
+
+	while(mask > 0) {
+		appendbit((code & mask) ? 1 : 0);
+		mask >>= 1;
+	}
+}
+
+/* Encode stdin as a stream of 6-bit codes and print the packed bytes.
+ *
+ * Each character read is converted with getcode() and appended to the
+ * bit stream, followed by a terminating 0 code. The packed bytes are
+ * written to stdout as space-separated "0xNN" values; per-character
+ * debug lines and a size summary go to stderr.
+ *
+ * '\r' is dropped before encoding, since the alphabet has no code for it.
+ *
+ * Input whose encoding (terminator included) would exceed 256 bytes is
+ * rejected with "input too long" and exit status 1. The check happens
+ * before each code is appended, so nothing is written to stdout and
+ * out[] is never overrun in that case.
+ *
+ * Returns 0 on success. argc and argv are unused.
+ */
+int main(int argc, char **argv) {
+	enum { CODE_BITS = 6, MAX_OUT_BYTES = 256 };
+	int c, code, i, outbytes;
+	int count = 1; /* 1 to account for null terminator */
+
+	(void)argc;
+	(void)argv;
+
+	while((c = getchar()) != EOF) {
+		/* carriage returns are discarded, not encoded */
+		if(c == '\r')
+			continue;
+
+		/* there must be room for this code plus the terminating 0 code */
+		if(bitcount + 2 * CODE_BITS > MAX_OUT_BYTES * 8) {
+			fprintf(stderr, "input too long\n");
+			exit(1);
+		}
+
+		code = getcode(c);
+		fprintf(stderr, "c == %d, code == %d\n", c, code);
+		appendcode(code);
+		count++;
+	}
+	appendcode(0);
+
+	/* round the bit count up to whole bytes */
+	outbytes = (bitcount + 7) / 8;
+	for(i = 0; i < outbytes; i++)
+		printf("0x%02x ", out[i]);
+
+	fprintf(stderr, "%d bytes in (added null), %d bytes out, ratio %.2f\n",
+			count, outbytes, (float)outbytes / (float)count);
+	return 0;
+}
author	B. Watson <yalhcru@gmail.com>	2016-02-21 18:53:49 -0500
committer	B. Watson <yalhcru@gmail.com>	2016-02-21 18:53:49 -0500
commit	a4a884600888e5e2e6b5231c2840c01b44eae644 (patch)
tree	bd7b2ce9c4208c7e7674312968d0a7f61456f1dd /textcomp.c
parent	6f5bc5adbc21865093ff24af6dbf0c24ea4cfe9c (diff)
download	taipan-a4a884600888e5e2e6b5231c2840c01b44eae644.tar.gz