about summary refs log tree commit diff
path: root/textcomp.c
diff options
context:
space:
mode:
author B. Watson <yalhcru@gmail.com> 2016-02-21 18:53:49 -0500
committer B. Watson <yalhcru@gmail.com> 2016-02-21 18:53:49 -0500
commit a4a884600888e5e2e6b5231c2840c01b44eae644 (patch)
tree bd7b2ce9c4208c7e7674312968d0a7f61456f1dd /textcomp.c
parent 6f5bc5adbc21865093ff24af6dbf0c24ea4cfe9c (diff)
download taipan-a4a884600888e5e2e6b5231c2840c01b44eae644.tar.gz
text compression. 7017 bytes free.
Diffstat (limited to 'textcomp.c')
-rw-r--r-- textcomp.c 126
1 files changed, 126 insertions, 0 deletions
diff --git a/textcomp.c b/textcomp.c
new file mode 100644
index 0000000..f82a91c
--- /dev/null
+++ b/textcomp.c
@@ -0,0 +1,126 @@
+/* textcomp.c - compress strings of text to 6 bits per byte.
+ loosely based on the z-machine's ZSCII compression.
+
+ Example: "Taipan" (7 bytes, including null terminator) encodes as
+ 0xb8 0x12 0x50 0x04 0xe0 0x00 (6 bytes).
+
+ Longer strings approach 75% compression ratio.
+
+ No encoded string can be over 256 bytes long, as the decompressor
+ can't currently handle it.
+
+ The alphabet contains only upper/lowercase letters, space, newline,
+ and some punctuation. In particular, numbers are not supported.
+
+ alphabet:
+ 0 = end
+ 1-26 = a-z
+ 27-52 = A-Z
+ 53 = space
+ 54 = !
+ 55 = %
+ 56 = ,
+ 57 = .
+ 58 = ?
+ 59 = :
+ 60 = '
+ 61 = (
+ 62 = )
+ 63 = newline
+
+ All the strings used by taipan.c are listed in the __END__ section
+ of messages.pl. The perl script calls this program (textcomp) once
+ per string, and outputs C source consisting of the encoded versions.
+ Each string in the __END__ section is preceded by a name, and the
+ generated C source uses these names with M_ prefixed.
+
+ taipan.c calls the function print_msg(const char *) to decode and
+ print an encoded message. The decoding step slows down printing a bit,
+ but it's not really noticeable. cputc() is used for printing, so it
+ respects the reverse video setting (set by rvs_on() and rvs_off()).
+
+ When a newline is printed, the decoder always prints a carriage
+ return first. Any \r sequences listed in messages.pl are discarded
+ before encoding is done.
+
+ Actually, no prompts ever use capital X or Z. These could instead be used
+ as escape codes for dictionary lookups. Maybe X is followed by a 3-bit
+ dict ID, for the 8 most commonly repeated phrases (one of which will of
+ course be "Taipan"), and Z is followed by a 5- or 6-bit ID for 32 or 64
+ less common phrases. So far this isn't implemented because the
+ decompressor isn't reentrant (yet).
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+
+unsigned char out[1024]; /* encoded output, built one bit at a time by
+                            appendbit(); bits are OR'ed in, so this relies
+                            on the zero fill file-scope objects start with */
+int bitcount = 0; /* number of bits stored in out[] so far; appendbit()
+                     increments it, main() derives the byte count from it */
+
+/* getcode - translate one input character into its 6-bit alphabet code.
+
+   Returns 1-26 for a-z, 27-52 for A-Z, and 53-63 for space, the
+   supported punctuation and newline (see the alphabet table at the top
+   of the file).
+
+   Returns -1 for '\r': carriage returns have no code and are meant to
+   be dropped, so a caller should skip a negative result instead of
+   encoding it. This path used to fall off the end of the function
+   without returning anything, which is undefined behavior as soon as
+   the caller uses the return value.
+
+   Any other character is a fatal error: a diagnostic is written to
+   stderr and the program exits with status 1. */
+int getcode(int c) {
+    if(c >= 'a' && c <= 'z')
+        return c - 'a' + 1;
+    if(c >= 'A' && c <= 'Z')
+        return c - 'A' + 27;
+
+    switch(c) {
+        case ' ':  return 53;
+        case '!':  return 54;
+        case '%':  return 55;
+        case ',':  return 56;
+        case '.':  return 57;
+        case '?':  return 58;
+        case ':':  return 59;
+        case '\'': return 60;
+        case '(':  return 61;
+        case ')':  return 62;
+        case '\n': return 63;
+        case '\r': return -1; /* no code: to be discarded by the caller */
+        default:
+            break;
+    }
+
+    /* only characters outside the alphabet get this far */
+    fprintf(stderr, "unhandled ASCII code %d\n", c);
+    exit(1);
+}
+
+/* appendbit - store one bit at the next free position in out[].
+
+   b is the bit value and is expected to be 0 or 1. Each byte is filled
+   from its most significant bit (bit 7) downwards, and bitcount grows
+   by one per call. The bit is OR'ed in, so out[] must start out zeroed.
+
+   If out[] is already full, "input too long" is written to stderr and
+   the program exits with status 1. Without this check a long enough
+   input would write past the end of the buffer.
+
+   A trace line describing the write goes to stderr on every call. */
+void appendbit(unsigned char b) {
+    int pos = bitcount / 8;          /* byte that receives the bit */
+    int bitpos = 7 - (bitcount % 8); /* bit number inside that byte */
+    unsigned char val;
+
+    if(pos >= (int)sizeof(out)) {
+        fprintf(stderr, "input too long\n");
+        exit(1);
+    }
+
+    val = b << bitpos;
+    out[pos] |= val;
+    fprintf(stderr, "%d: appending bit %d at pos %d, bitpos %d, value $%02x\n", bitcount, b, pos, bitpos, val);
+    bitcount++;
+}
+
+/* appendcode - emit the low 6 bits of code, most significant of the six
+   first, by passing them to appendbit() one at a time. */
+void appendcode(int code) {
+    unsigned int bits = (unsigned int)code;
+    int shift = 6;
+
+    /* shift takes the values 5, 4, 3, 2, 1, 0 inside the loop body */
+    while(shift-- > 0)
+        appendbit((unsigned char)((bits >> shift) & 1u));
+}
+
+/* main - read a message from stdin, pack it at 6 bits per character and
+   print the packed bytes to stdout as a run of "0xNN " tokens.
+
+   Carriage returns in the input are dropped and never encoded (the
+   header comment explains that the decoder prints one before every
+   newline). A terminating code 0 is appended after the last character.
+
+   The encoded form may not exceed 256 bytes. This is checked before
+   each character is encoded, so an over-long input is rejected with
+   "input too long" and exit status 1 before anything is written to
+   stdout and before out[] can be overrun.
+
+   Trace output and the final size statistics go to stderr. Returns 0
+   on success. argc and argv are unused. */
+int main(int argc, char **argv) {
+    enum {
+        CODE_BITS = 6,              /* width of one encoded character */
+        MAX_ENCODED_BITS = 256 * 8  /* the 256 byte limit, in bits */
+    };
+    int c, code, i, outbytes;
+    int count = 1; /* 1 to account for null terminator */
+
+    (void)argc;
+    (void)argv;
+
+    while((c = getchar()) != EOF) {
+        count++;
+
+        /* discard carriage returns instead of handing them to getcode() */
+        if(c == '\r')
+            continue;
+
+        /* there must be room for this code and for the terminator */
+        if(bitcount + 2 * CODE_BITS > MAX_ENCODED_BITS) {
+            fprintf(stderr, "input too long\n");
+            exit(1);
+        }
+
+        code = getcode(c);
+        fprintf(stderr, "c == %d, code == %d\n", c, code);
+        appendcode(code);
+    }
+    appendcode(0);
+
+    /* round up: a partly filled last byte is still emitted */
+    outbytes = (bitcount + 7) / 8;
+    for(i = 0; i < outbytes; i++)
+        printf("0x%02x ", out[i]);
+
+    fprintf(stderr, "%d bytes in (added null), %d bytes out, ratio %.2f\n",
+            count, outbytes, (float)outbytes / (float)count);
+    return 0;
+}