/* textcomp.c - compress strings of text to 6 bits per byte.
   Loosely based on the z-machine's ZSCII compression.

   Example: "Taipan" (7 bytes, including null terminator) encodes
   as 0xb8 0x12 0x50 0x04 0xe0 0x00 (6 bytes).

   Longer strings approach 75% compression ratio. Sadly, the result
   has to be padded to an 8-bit byte boundary, or else we'd get 75%
   for every string.

   Input length | Encoded length | Ratio
   (incl. null) |                |
              2 |              2 | 100%, don't bother
              3 |              3 | 100%, don't bother
              4 |              3 | 75%
              5 |              4 | 80%
              6 |              5 | 83%
              7 |              6 | 86%
              8 |              6 | 75%
              9 |              7 | 78%
             10 |              8 | 80%
             11 |              9 | 82%
             12 |              9 | 75%
             13 |             10 | 77%
             14 |             11 | 79%
             15 |             12 | 80%
             16 |             12 | 75%
   ...etc etc

   No encoded string can be over 256 bytes long, as the decompressor
   can't currently handle it.

   The alphabet contains only upper/lowercase letters, space, newline,
   and some punctuation. In particular, numbers are not supported.

   alphabet:
   0       = end
   1-26    = a-z
   27-52   = A-Z
   53      = space
   54      = !
   55      = %
   56      = ,
   57      = .
   58      = ?
   59      = :
   60      = '
   61      = (
   62      = )
   63      = newline

   All the strings used by taipan.c are listed in the __END__ section
   of messages.pl. The perl script calls this program (textcomp) once
   per string, and outputs C source consisting of the encoded versions.
   Each string in the __END__ section is preceded by a name, and the
   generated C source uses these names with M_ prefixed.

   taipan.c calls the function print_msg(const char *) to decode and
   print an encoded message. The decoding step slows down printing a
   bit, but it's not really noticeable. cputc() is used for printing,
   so it respects the reverse video setting (set by rvs_on() and
   rvs_off()).

   The task of replacing cputs("some string") with
   print_msg(M_some_string) is done manually.

   When a newline is printed, the decoder always prints a carriage
   return first. Any \r sequences listed in messages.pl are discarded
   before encoding is done.

   Actually, no prompts ever use capital X or Z. These should be used
   for dictionary lookups. Maybe X is followed by a 3-bit dict ID, for
   the 8 most commonly repeated phrases (one of which will of course
   be "Taipan"), and Z is a 5- or 6-bit ID for 32 or 64 less common
   phrases. So far this isn't implemented because the decompressor
   isn't reentrant (yet).
*/

#include <stdio.h>
#include <stdlib.h>

/* Longest encoded string (in bytes) this program will emit; see the
   256-byte limit described in the header comment. Must not exceed
   sizeof(out). */
enum { MAX_ENCODED_BYTES = 256 };

/* Encoded output, filled one bit at a time (most significant bit
   first) by appendbit(). Bits are OR'ed in, so this relies on the
   zero-initialization that file-scope objects get. */
unsigned char out[1024];

/* Number of bits written to out[] so far. */
int bitcount = 0;

/* Map an input character to its 6-bit code (1..63, see the alphabet
   table in the header comment).

   Returns 0 for '\r', which callers must treat as "discard this
   character": 0 is the end-of-string code and must never be emitted
   in the middle of a string.

   Any other character outside the alphabet is fatal: a message is
   printed on stderr and the program exits with status 1. */
int getcode(int c) {
	if(c >= 'a' && c <= 'z')
		return c - 'a' + 1;

	if(c >= 'A' && c <= 'Z')
		return c - 'A' + 27;

	switch(c) {
		case ' ':  return 53;
		case '!':  return 54;
		case '%':  return 55;
		case ',':  return 56;
		case '.':  return 57;
		case '?':  return 58;
		case ':':  return 59;
		case '\'': return 60;
		case '(':  return 61;
		case ')':  return 62;
		case '\n': return 63;
		case '\r': break; /* discarded, see return below */
		default:
			fprintf(stderr, "unhandled ASCII code %d\n", c);
			exit(1);
	}

	return 0; /* only reached for '\r' */
}

/* Append a single bit (b must be 0 or 1) to out[], most significant
   bit of each byte first, and log the operation on stderr.

   Exits with status 1 ("input too long") rather than writing past
   MAX_ENCODED_BYTES, so out[] can never be overrun and nothing is
   printed on stdout for an over-long string. */
void appendbit(unsigned char b) {
	int pos = bitcount / 8;
	int bitpos = 7 - (bitcount % 8);
	unsigned char val;

	if(pos >= MAX_ENCODED_BYTES) {
		fprintf(stderr, "input too long\n");
		exit(1);
	}

	val = (unsigned char)(b << bitpos);
	out[pos] |= val;

	fprintf(stderr, "%d: appending bit %d at pos %d, bitpos %d, value $%02x\n",
			bitcount, b, pos, bitpos, val);

	bitcount++;
}

/* Append the low 6 bits of code to out[], highest bit (0x20) first. */
void appendcode(int code) {
	int bit;

	for(bit = 0x20; bit > 0; bit >>= 1) {
		appendbit((code & bit) != 0);
	}
}

/* Read text from stdin until EOF, encode it, and print the encoded
   bytes on stdout as "0xNN " tokens (each followed by a space, no
   trailing newline). Debug tracing and the final statistics line go
   to stderr.

   Exit status is 0 on success, 1 if the input contains a character
   outside the alphabet or the encoded form exceeds the size limit. */
int main(int argc, char **argv) {
	int c, code, i, outlen;
	int count = 1; /* 1 to account for null terminator */

	(void)argc;
	(void)argv;

	while((c = getchar()) != EOF) {
		code = getcode(c);
		fprintf(stderr, "c == %d, code == %d\n", c, code);

		/* code 0 means "discard" ('\r'); emitting it would plant an
		   end-of-string marker in the middle of the message. */
		if(code == 0) {
			continue;
		}

		appendcode(code);
		count++;
	}

	appendcode(0); /* end-of-string marker */

	/* round up: the final byte is zero-padded to a byte boundary */
	outlen = (bitcount + 7) / 8;

	for(i = 0; i < outlen; i++) {
		printf("0x%02x ", out[i]);
	}

	fprintf(stderr, "%d bytes in (added null), %d bytes out, ratio %.2f\n",
			count, outlen, (float)outlen / (float)count);

	return 0;
}