/* textcomp.c - compress strings of text to 6 bits per byte.

   Loosely based on the z-machine's ZSCII compression.

   Example: "Taipan" (7 bytes, including null terminator) encodes
   as 0xb8 0x12 0x50 0x04 0xe0 0x00 (6 bytes).

   Longer strings approach 75% compression ratio. Sadly, the result
   has to be padded to an 8-bit byte boundary, or else we'd get 75%
   for every string.

   Input length | Encoded length | Ratio
   (incl. null) |                |
   -------------+----------------+--------------------
        2       |       2        | 100%, don't bother
        3       |       3        | 100%, don't bother
        4       |       3        | 75%
        5       |       4        | 80%
        6       |       5        | 83%
        7       |       6        | 86%
        8       |       6        | 75%
        9       |       7        | 78%
       10       |       8        | 80%
       11       |       9        | 82%
       12       |       9        | 75%
       13       |      10        | 77%
       14       |      11        | 79%
       15       |      12        | 80%
       16       |      12        | 75%
   ...etc etc

   No encoded string can be over 256 bytes long, as the decompressor
   can't currently handle it.

   The alphabet contains only upper/lowercase letters, space, newline,
   and some punctuation. In particular, numbers are not supported.

   alphabet:
      0     = end
      1-26  = a-z
      27-52 = A-Z
      53    = space
      54    = !
      55    = %
      56    = ,
      57    = .
      58    = ?
      59    = :
      60    = '
      61    = (
      62    = )
      63    = newline

   All the strings used by taipan.c are listed in the __END__ section
   of messages.pl. The perl script calls this program (textcomp) once
   per string, and outputs C source consisting of the encoded versions.
   Each string in the __END__ section is preceded by a name, and the
   generated C source uses these names with M_ prefixed.

   taipan.c calls the function print_msg(const char *) to decode and
   print an encoded message. The decoding step slows down printing a
   bit, but it's not really noticeable. cputc() is used for printing,
   so it respects the reverse video setting (set by rvs_on() and
   rvs_off()).

   The task of replacing cputs("some string") with
   print_msg(M_some_string) is done manually.

   When a newline is printed, our modified conio moves the cursor to
   the start of the next line, so no \r's are needed. Any \r sequences
   listed in messages.pl are discarded before encoding is done. A
   carriage return byte that still reaches this program on stdin is
   skipped too, rather than being encoded.

   Actually, no prompts ever use capital X or Z. These should be used
   for dictionary lookups. Maybe X is followed by a 3-bit dict ID, for
   the 8 most commonly repeated phrases (one of which will of course be
   "Taipan"), and Z is a 5- or 6-bit ID for 32 or 64 less common
   phrases. So far this isn't implemented because the decompressor
   isn't reentrant (yet).
*/

#include <stdio.h>
#include <stdlib.h>

enum {
    OUT_SIZE    = 1024, /* capacity of the bit buffer, in bytes */
    MAX_ENCODED = 256,  /* longest encoded string the decompressor handles */
    CODE_BITS   = 6     /* width of one encoded character */
};

/* Encoded output, filled most-significant-bit first by appendbit().
   Static storage, so it starts out all zero, which lets appendbit()
   simply OR bits in. */
unsigned char out[OUT_SIZE];

/* Number of bits written to out[] so far. */
int bitcount = 0;

/* Map an input character to its 6-bit code (see the alphabet table in
   the file header).

   Returns 1..63 for a character in the alphabet. Returns 0 for '\r',
   which has no code of its own: 0 is the end-of-string marker, so
   callers must skip '\r' instead of appending the result. Any other
   character is a fatal error: a message is printed to stderr and the
   program exits with status 1. */
int getcode(int c) {
    if(c >= 'a' && c <= 'z') {
        return c - 'a' + 1;
    }

    if(c >= 'A' && c <= 'Z') {
        return c - 'A' + 27;
    }

    switch(c) {
        case ' ':  return 53;
        case '!':  return 54;
        case '%':  return 55;
        case ',':  return 56;
        case '.':  return 57;
        case '?':  return 58;
        case ':':  return 59;
        case '\'': return 60;
        case '(':  return 61;
        case ')':  return 62;
        case '\n': return 63;
        case '\r': break; /* not encodable, reported as 0 below */
        default:
            fprintf(stderr, "unhandled ASCII code %d\n", c);
            exit(1);
    }

    return 0; /* reached only for '\r' */
}

/* Append one bit (b is 0 or 1) to out[], most significant bit of each
   byte first, and log it to stderr.

   Exits with status 1 if the buffer is already full, instead of
   writing past the end of out[]. */
void appendbit(unsigned char b) {
    int pos = bitcount / 8;
    int bitpos = 7 - (bitcount % 8);
    unsigned char val;

    if(pos >= OUT_SIZE) {
        fprintf(stderr, "input too long\n");
        exit(1);
    }

    val = (unsigned char)(b << bitpos);
    out[pos] |= val;
    fprintf(stderr, "%d: appending bit %d at pos %d, bitpos %d, value $%02x\n",
            bitcount, b, pos, bitpos, val);
    bitcount++;
}

/* Append the low 6 bits of code to out[], highest bit first. */
void appendcode(int code) {
    int bit;

    for(bit = 1 << (CODE_BITS - 1); bit > 0; bit >>= 1) {
        appendbit((code & bit) != 0);
    }
}

/* Read a string from stdin, encode it, and print the encoded bytes to
   stdout as space-separated "0x%02x" values. Debugging output and the
   final size summary go to stderr.

   Exits with status 1, before printing any encoded bytes, if the
   result would be longer than the decompressor can handle. */
int main(int argc, char **argv) {
    int c, code, i, outlen;
    int count = 1; /* 1 to account for null terminator */

    (void)argc;
    (void)argv;

    while((c = getchar()) != EOF) {
        /* getcode() gives 0 for '\r'; appending that would plant an
           end-of-string marker in the middle of the message. */
        if(c == '\r') {
            continue;
        }

        code = getcode(c);
        fprintf(stderr, "c == %d, code == %d\n", c, code);
        appendcode(code);
        count++;
    }

    appendcode(0); /* end-of-string marker */

    /* Round up to whole bytes; the unused low bits stay zero. */
    outlen = (bitcount + 7) / 8;

    /* Check the limit before emitting anything, so an over-long string
       never produces partial output on stdout. */
    if(outlen > MAX_ENCODED) {
        fprintf(stderr, "input too long\n");
        exit(1);
    }

    for(i = 0; i < outlen; i++) {
        printf("0x%02x ", out[i]);
    }

    fprintf(stderr, "%d bytes in (added null), %d bytes out, ratio %.2f\n",
            count, outlen, (float)(outlen) / (float)count);
    return 0;
}