/*
 * See the bottom of this file for an explanation of the data
 * structure, complete with high-tech ASCII art.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "u816.h"
#include "crunch.h"
#include "self.h"
#include "bytorder.h"

#define INITIAL_BITS 9
#define MAX_BITS 12
#define MAX_TOKENS (1 << MAX_BITS)

#define TOK_RESET 256
#define TOK_END 257
#define INIT_TOKEN 258

extern int opt_verbose; /* alf.c, -v = 1, -vv = 2 */

u8 input_buf[MAX_INPUT_SIZE];
u8 output_buf[MAX_INPUT_SIZE + 3];
unsigned int input_len, output_len, out_bitpos;

/* sizeof(short) on my machine is 2, so this is a 2MB table.
   Definitely not how you'd do it on the Atari, but 2MB is nothing
   on a modern system. */
short tokens[MAX_TOKENS][256];

int token_bits;
int shiftamt; /* precalculated MAX_BITS - token_bits */
int max_token;
int curr_token = INIT_TOKEN;
int in_pos;

/*********************************************************************/

/* -vv */
int maxkidcount = 0, maxlevel = 0, totalkidcount = 0, nodeswithkids = 0;

/* -vv */
char *fmt_chr(u8 c) {
	static char buf[10];

	if(c > 33 && c < 127)
		sprintf(buf, "'%c'", c);
	else
		sprintf(buf, "$%02x", c);

	return buf;
}

/* -vv */
void indent(int level) {
	int i;
	for(i = 0; i < level; i++)
		fputs(" ", stdout);
}

/* -vv */
void print_tok(short tok, u8 chr) {
	printf("#%d/%s\n", tok, fmt_chr(chr));
}

/* -vv */
void dump_kids(short tok, int level) {
	int i, j;
	int kidcount = 0;

	if(level > maxlevel) maxlevel = level;

	for(i = 0; i < 256; i++) {
		if( (j = tokens[tok][i]) ) {
			kidcount++;
			totalkidcount++;
			indent(level);
			print_tok(j, i);
			dump_kids(j, level + 1);
		}
	}

	if(kidcount) {
		indent(level - 1);
		printf("#%d has %d kids\n", tok, kidcount);
		nodeswithkids++;
		if(kidcount > maxkidcount) maxkidcount = kidcount;
	} else {
		indent(level);
		fputs("(no kids)\n", stdout);
	}
}

/* -vv */
void dump_tokens(void) {
	int i, j, prune;

	maxkidcount = maxlevel = totalkidcount = nodeswithkids = 0;

	for(i = 0; i < 256; i++) {
		prune = 1;
		for(j = 0; j < 256; j++) {
			if(tokens[i][j]) {
				prune = 0;
				break;
			}
		}
		if(!prune) {
			print_tok(i, i);
			dump_kids(i, 1);
		}
	}

	printf("\nmaxkidcount %d, maxlevel = %d, totalkidcount = %d\n",
			maxkidcount, maxlevel, totalkidcount);
	printf("nodeswithkids %d, avgkidcount: %.2f\n--\n\n",
			nodeswithkids,
			((float)totalkidcount) / (float)(nodeswithkids));
}

/*********************************************************************/

void init_table(void) {
	memset(tokens, 0, sizeof(tokens));
	token_bits = INITIAL_BITS;
	max_token = 1 << INITIAL_BITS;
	curr_token = INIT_TOKEN;

	/* precalculated for store_token() */
	shiftamt = MAX_BITS - token_bits;
}

void check_output_len(void) {
	if(output_len >= MAX_INPUT_SIZE) {
		fprintf(stderr, "%s: fatal: compressed file would be >16MB.\n", self);
		exit(1);
	}
}

void inc_output_len(void) {
	output_len++;
	check_output_len();
	output_buf[output_len] = 0;
}

#if !defined(APPEND_BIT) && defined ALF_ENDIAN_OK

/* This is 25% faster, but it requires knowing the endianness of the
   platform at compile time. See bytorder.h for gory details. */

union {
	unsigned int ui;
	u8 bytes[4];
} ui2bytes;

void store_token(int tok) {
	if(opt_verbose > 1) {
		printf("<%d >%d:%d #%d", in_pos, output_len, out_bitpos, tok);
		if(tok == TOK_RESET)
			fputs(" RESET", stdout);
		else if(tok == TOK_END)
			fputs(" END", stdout);
		else if(tok < 256)
			printf(" %s", fmt_chr(tok));
		putchar('\n');
	}

	/* align token so, no matter what its size (9 thru 12 bits), its
	   top bit is at bit 23 of a 32-bit int (less out_bitpos). */
	ui2bytes.ui = tok << ((12 - out_bitpos) + shiftamt);

	/* always append 12 bits' worth (3 bytes): it's quicker to do that
	   than have conditionals to decide whether the last byte is needed */
	output_buf[output_len] |= ui2bytes.bytes[HIBYTE];
	output_buf[output_len + 1] = ui2bytes.bytes[MIDBYTE];
	output_buf[output_len + 2] = ui2bytes.bytes[LOBYTE];

	/* update based on actual token size */
	out_bitpos += token_bits;
	output_len += out_bitpos / 8;
	out_bitpos %= 8;
	check_output_len();
}
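/* A worked example of the alignment math above. This block is
   compiled out, and the concrete values in it are assumed for
   illustration; it extracts the same three bytes store_token() gets
   from ui2bytes, using portable shifts instead of the endianness
   trick, so it can be built and run standalone. */
#if 0
#include <stdio.h>

int main(void) {
	int token_bits = 9;                 /* so shiftamt == 12 - 9 == 3 */
	int shiftamt = 12 - token_bits;
	int out_bitpos = 5;                 /* 5 bits already used in this byte */
	int tok = 0x1ab;                    /* some 9-bit token */
	unsigned int ui = tok << ((12 - out_bitpos) + shiftamt);

	/* the token's top bit now sits at bit 23 - out_bitpos == 18, so
	   the high byte's top 5 bits are 0 and the OR in store_token()
	   leaves the already-written bits of the output byte alone */
	printf("hi $%02x mid $%02x lo $%02x\n",
			(ui >> 16) & 0xff, (ui >> 8) & 0xff, ui & 0xff);
	/* prints: hi $06 mid $ac lo $00 */
	return 0;
}
#endif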
#else

/* Slower but portable. */

void append_bit(int bit) {
	output_buf[output_len] |= (bit << (7 - out_bitpos));
	out_bitpos++;
	if(out_bitpos == 8) {
		out_bitpos = 0;
		inc_output_len();
	}
}

void store_token(int tok) {
	int mask;

	if(opt_verbose > 1) {
		printf("<%d >%d:%d #%d", in_pos, output_len, out_bitpos, tok);
		if(tok == TOK_RESET)
			fputs(" RESET", stdout);
		else if(tok == TOK_END)
			fputs(" END", stdout);
		else if(tok < 256)
			printf(" %s", fmt_chr(tok));
		putchar('\n');
	}

	for(mask = 1 << (token_bits - 1); mask; mask >>= 1) {
		append_bit(tok & mask ? 1 : 0);
	}
}
#endif

short match_token(void) {
	short t, nt;

	t = input_buf[in_pos];
	in_pos++;
	if(in_pos == input_len)
		return t;

	while((nt = tokens[t][input_buf[in_pos]])) {
		t = nt;
		in_pos++;
		if(in_pos == input_len)
			break;
	}

	return t;
}

void make_token(short tok, u8 chr) {
	if(curr_token == max_token) {
		if(opt_verbose > 1) {
			printf("\ntoken %d won't fit in %d bits, ", max_token, token_bits);
		}

		if(token_bits == MAX_BITS) {
			if(opt_verbose > 1) {
				printf("resetting token_bits to %d\n", INITIAL_BITS);
				printf("token table is full, clearing, old contents:\n");
				dump_tokens();
			}
			store_token(TOK_RESET); /* stored at the *old* token size! */
			init_table();
			return; /* since we're starting over, *don't* make a token */
		} else {
			token_bits++;
			/* precalculated for store_token() */
			shiftamt--;
			if(opt_verbose > 1) {
				printf("token_bits increased to %d\n", token_bits);
			}
		}

		max_token = 1 << token_bits;
	}

	tokens[tok][chr] = curr_token;
	curr_token++;
}

void crunch(void) {
	short tok;

	if(opt_verbose > 1)
		putchar('\n');

	init_table();
	out_bitpos = 0;
	in_pos = 0;
	output_buf[output_len] = 0; /* just in case */

	/* 0-byte input files don't get a TOK_RESET */
	if(!input_len) {
		store_token(TOK_END);
		inc_output_len();
		return;
	}

	store_token(TOK_RESET);

	while(in_pos < input_len) {
		tok = match_token();
		store_token(tok);
		if(in_pos < input_len)
			make_token(tok, input_buf[in_pos]);
	}

	store_token(TOK_END);
	if(out_bitpos)
		inc_output_len();

	if(opt_verbose > 1) {
		printf("\nfinal token table contents:\n");
		dump_tokens();
	}

	init_table();
}

/* ********************************************************************

   The tokens are stored in a tree with 256 roots (tokens[0] to
   tokens[255]). There's one root per character code (0-255). This
   serves as the hard-coded initial dictionary (each token number at
   this level maps to its character code).

   Each element of tokens[] has an array of 256 shorts, which are the
   token numbers for each possible character that follows it. These
   token numbers are just more indices into the tokens[] array.

   When creating a token, you pass make_token() the index of an
   existing token. For example, if your new token is for the string
   "ab", you pass make_token() the ASCII value of 'a', and it creates
   the token by storing the new token's index at tokens['a']['b'].

   You can think of tokens[] as 256 trees, each of which has 256
   branches, which can be 0 (no node here) or the base of another
   256-branch tree.
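   In code terms, extending a match by one character and grafting on
   a new token boil down to this (a sketch for illustration, not a
   quote of match_token()/make_token(), which add bookkeeping around
   it; next_chr stands for the next input byte):

       nt = tokens[t][next_chr];       <- 0 means "no token for this string"
       if(nt)
           t = nt;                     <- the match grew by one character
       else
           tokens[t][next_chr] = curr_token++;   <- make_token()'s core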
   Using a static array wastes memory, but not much by modern
   standards, and it avoids a time-consuming linear search of each
   level of the tree to see if the current character matches an
   existing token. Array lookups are O(1).

   An example:

      $ echo -n abababcabc > ABC.TXT
      $ ./alf -vv 1 ABC.TXT
      Crunching ABC.TXT as ABC.TXT
      <0 >29:0 #256 RESET
      <1 >30:1 #97 'a'
      <2 >31:2 #98 'b'
      <4 >32:3 #258
      <6 >33:4 #258
      <7 >34:5 #99 'c'
      <10 >35:6 #261
      <10 >36:7 #257 END

      final token table contents:
      #97/'a'
       #258/'b'
        #260/'a'
         (no kids)
        #261/'c'
         (no kids)
       #258 has 2 kids
      #97 has 1 kids
      #98/'b'
       #259/'a'
        (no kids)
      #98 has 1 kids
      #99/'c'
       #262/'a'
        (no kids)
      #99 has 1 kids

      maxkidcount 2, maxlevel = 3, totalkidcount = 5
      nodeswithkids 4, avgkidcount: 1.25
      --

      10/38 (-279%)
      Elapsed time: 0.001s (0.01MB/s).
      Backed up old '1' to '1~'.

   The first part of the output (all the lines that start with <) is
   the actual compressed data, in human-readable form. Single-byte
   tokens (ones that map to a single character) are displayed as
   their ASCII value, if printable. The tokens numbered 258 and above
   represent multiple bytes.

   Where does the number 258 come from? Token numbers are sequential,
   in the order the tokens are created (based on the input data).
   Since #0-#255 are the predefined 1-byte tokens, and #256 and #257
   are reserved for the RESET and END control codes, the first 2-byte
   token created will be numbered #258.

   And what does token #258 mean? Take a look at the next bit of
   output (final token table contents). #258 shows up as a child of
   #97, so #258 represents the string "ab". #258 has 2 children
   itself: #260, which represents "aba" (not used in the compressed
   data), and #261 ("abc", which does get used).

   The way the token structure gets stored in memory... take token
   #261, which represents "abc". "a" through "c" are ASCII 97 to 99.
   So, tokens[97][98] has the value 258 (the token for "ab"). Then,
   tokens[258][99] has the value 261, which is the token for "abc"
   (the previous token, with a "c" appended). If there were tokens of
   4 or more bytes beginning with "abc" (e.g. "abcd"), tokens[261]'s
   array of 256 would have non-zero items... but in this case there
   aren't, so tokens[261]'s array is all 0 (shown as "(no kids)",
   above).

   Notice that the first time "abc" appeared in the input, it was
   compressed as 2 tokens: #258 (for the "ab") and #99 (the "c").
   When the encoder saw the string "abc" for the first time, it
   couldn't use the "abc" token because it didn't exist yet. Instead,
   it created the "abc" token. The next time "abc" appears in the
   input, *then* the encoder can use the "abc" token to represent it.

   If it makes it easier to visualize, here is the above token table
   represented as a tree, with crude ASCII art:

   Roots (1-byte tokens):    a (97)      b (98)      c (99)
                             /           /           /
   2-byte tokens:        b (258)     a (259)     a (262)
                         /     \
   3-byte tokens:  (260) a      c (261)

   ********************************************************************* */
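/* Below: a compiled-out, self-contained sketch of the example above.
   The toy[] table and its contents mirror the state after crunching
   "abababcabc"; an illustration, not part of alf's build. */
#if 0
#include <assert.h>

static short toy[4096][256];	/* same shape as tokens[] */

int main(void) {
	short t;

	/* the five multi-byte tokens the example run created: */
	toy['a']['b'] = 258;	/* "ab"  */
	toy[258]['a'] = 260;	/* "aba" */
	toy[258]['c'] = 261;	/* "abc" */
	toy['b']['a'] = 259;	/* "ba"  */
	toy['c']['a'] = 262;	/* "ca"  */

	/* matching "abc" is two O(1) array lookups, no searching: */
	t = 'a';
	t = toy[t]['b'];	/* 258, "ab"  */
	t = toy[t]['c'];	/* 261, "abc" */
	assert(t == 261);

	return 0;
}
#endif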