diff options
author | B. Watson <yalhcru@gmail.com> | 2016-02-21 18:53:49 -0500 |
---|---|---|
committer | B. Watson <yalhcru@gmail.com> | 2016-02-21 18:53:49 -0500 |
commit | a4a884600888e5e2e6b5231c2840c01b44eae644 (patch) | |
tree | bd7b2ce9c4208c7e7674312968d0a7f61456f1dd /textcomp.c | |
parent | 6f5bc5adbc21865093ff24af6dbf0c24ea4cfe9c (diff) | |
download | taipan-a4a884600888e5e2e6b5231c2840c01b44eae644.tar.gz |
text compression. 7017 bytes free.
Diffstat (limited to 'textcomp.c')
-rw-r--r-- | textcomp.c | 126 |
1 files changed, 126 insertions, 0 deletions
diff --git a/textcomp.c b/textcomp.c new file mode 100644 index 0000000..f82a91c --- /dev/null +++ b/textcomp.c @@ -0,0 +1,126 @@ +/* textcomp.c - compress strings of text to 6 bits per byte. + loosely based on the z-machine's ZSCII compression. + + Example: "Taipan" (7 bytes, including null terminator) encodes as + 0xb8 0x12 0x50 0x04 0xe0 0x00 (6 bytes). + + Longer strings approach 75% compression ratio. + + No encoded string can be over 256 bytes long, as the decompressor + can't currently handle it. + + The alphabet contains only upper/lowercase letters, space, newline, + and some punctuation. In particular, numbers are not supported. + + alphabet: + 0 = end + 1-26 = a-z + 27-52 = A-Z + 53 = space + 54 = ! + 55 = % + 56 = , + 57 = . + 58 = ? + 59 = : + 60 = ' + 61 = ( + 62 = ) + 63 = newline + + All the strings used by taipan.c are listed in the __END__ section + of messages.pl. The perl script calls this program (textcomp) once + per string, and outputs C source consisting of the encoded versions. + Each string in the __END__ section is preceded by a name, and the + generated C source uses these names with M_ prefixed. + + taipan.c calls the function print_msg(const char *) to decode and + print an encoded message. The decoding step slows down printing a bit, + but it's not really noticeable. cputc() is used for printing, so it + respects the reverse video setting (set by rvs_on() and rvs_off()). + + When a newline is printed, the decoder always prints a carriage + return first. Any \r sequences listed in messages.pl are discarded + before encoding is done. + + Actually, no prompts ever use capital X or Z. These should be used for + dictionary lookups. Maybe X is followed by a 3-bit dict ID, for the 8 + most commonly repeated phrases (one of which will of course be "Taipan"), + and Z is a 5- or 6-bit ID for 32 or 64 less common phrases. So far this + isn't implemented because the decompressor isn't reentrant (yet). 
/* Encoder implementation: reads one message on stdin and writes its packed
   6-bit encoding to stdout as a list of C hex literals. */

#include <stdio.h>
#include <stdlib.h>

/* Longest encoded message, in bytes, that is accepted (the decompressor
   cannot handle more). */
#define MAX_ENCODED_BYTES 256

/* Packed output. Bits are stored most-significant bit first within each
   byte. The array has static storage, so it starts out zeroed, which
   appendbit() relies on when it ORs bits in. */
unsigned char out[1024];

/* Number of bits appended to out[] so far. */
int bitcount = 0;

/* Map an input character to its 6-bit code.

   Returns 1-26 for a-z, 27-52 for A-Z, 53-63 for the supported
   punctuation, space and newline.

   Returns -1 for '\r': it has no code and the caller must skip it.
   (Previously this path fell off the end of the function without
   returning a value, which is undefined behaviour once the caller
   uses the result.)

   Any other character is reported on stderr and terminates the program
   with exit status 1. */
int getcode(int c) {
	if(c >= 'a' && c <= 'z') {
		return c - 'a' + 1;
	}
	if(c >= 'A' && c <= 'Z') {
		return c - 'A' + 27;
	}

	switch(c) {
		case ' ':  return 53;
		case '!':  return 54;
		case '%':  return 55;
		case ',':  return 56;
		case '.':  return 57;
		case '?':  return 58;
		case ':':  return 59;
		case '\'': return 60;
		case '(':  return 61;
		case ')':  return 62;
		case '\n': return 63;
		case '\r': return -1; /* discarded, never encoded */
		default:
			break;
	}

	fprintf(stderr, "unhandled ASCII code %d\n", c);
	exit(1);
}

/* Append a single bit (b is expected to be 0 or 1) to out[], advancing
   bitcount. Logs the operation on stderr.

   Exits with status 1 if the bit would land past the end of out[],
   instead of writing out of bounds. */
void appendbit(unsigned char b) {
	int pos = bitcount / 8;
	int bitpos = 7 - (bitcount % 8); /* bit 7 of each byte is filled first */
	unsigned char val = (unsigned char)(b << bitpos);

	if((size_t)pos >= sizeof out) {
		fprintf(stderr, "input too long\n");
		exit(1);
	}

	out[pos] |= val;
	fprintf(stderr, "%d: appending bit %d at pos %d, bitpos %d, value $%02x\n", bitcount, b, pos, bitpos, val);
	bitcount++;
}

/* Append the low 6 bits of code to out[], most-significant bit first. */
void appendcode(int code) {
	int bit;
	for(bit = 0x20; bit > 0; bit >>= 1) {
		appendbit((code & bit) != 0);
	}
}

/* Encode stdin and print the result on stdout as "0xNN " tokens.
   Diagnostics and the compression statistics go to stderr.

   Returns 0 on success. Exits with status 1 (before printing any encoded
   bytes) if the input contains an unsupported character or the encoded
   form is longer than MAX_ENCODED_BYTES. */
int main(int argc, char **argv) {
	int c, code, count = 1; /* 1 to account for null terminator */
	int nbytes;

	(void)argc;
	(void)argv;

	while((c = getchar()) != EOF) {
		code = getcode(c);
		if(code < 0) {
			continue; /* '\r' is dropped and not counted as input */
		}
		fprintf(stderr, "c == %d, code == %d\n", c, code);
		appendcode(code);
		count++;
	}
	appendcode(0); /* code 0 marks the end of the message */

	/* Round the bit count up to whole bytes. */
	nbytes = (bitcount + 7) / 8;

	/* Reject oversized messages before emitting anything, so a failed run
	   never leaves a partial encoding on stdout. */
	if(nbytes > MAX_ENCODED_BYTES) {
		fprintf(stderr, "input too long\n");
		exit(1);
	}

	for(c = 0; c < nbytes; c++) {
		printf("0x%02x ", out[c]);
	}

	fprintf(stderr, "%d bytes in (added null), %d bytes out, ratio %.2f\n",
			count, nbytes, (float)nbytes / (float)count);
	return 0;
}