/* * See the bottom of this file for an explanation of the data * structure, complete with high-tech ASCII art. */ #include #include #include #include "u816.h" #include "crunch.h" #include "self.h" #define INITIAL_BITS 9 #define MAX_BITS 12 #define MAX_TOKENS (1 << MAX_BITS) #define TOK_RESET 256 #define TOK_END 257 #define INIT_TOKEN 258 extern int opt_verbose; /* alf.c, -v = 1, -vv = 2 */ u8 input_buf[MAX_INPUT_SIZE]; u8 output_buf[MAX_INPUT_SIZE]; unsigned int input_len, output_len, out_bitpos; /* sizeof(short) on my machine is 2, so this is a 2MB table. definitely not how you'd do it on the Atari, but 2MB is nothing on a modern system. */ short tokens[MAX_TOKENS][256]; int token_bits; int max_token; int curr_token = INIT_TOKEN; int in_pos; /*********************************************************************/ /* -vv */ int maxkidcount = 0, maxlevel = 0, totalkidcount = 0, nodeswithkids = 0; /* -vv */ char *fmt_chr(u8 c) { static char buf[10]; if(c > 33 && c < 127) sprintf(buf, "'%c'", c); else sprintf(buf, "$%02x", c); return buf; } /* -vv */ void indent(int level) { int i; for(i = 0; i < level; i++) fputs(" ", stdout); } /* -vv */ void print_tok(short tok, u8 chr) { printf("#%d/%s\n", tok, fmt_chr(chr)); } /* -vv */ void dump_kids(short tok, int level) { int i, j; int kidcount = 0; if(level > maxlevel) maxlevel = level; for(i = 0; i < 256; i++) { if( (j = tokens[tok][i]) ) { kidcount++; totalkidcount++; indent(level); print_tok(j, i); dump_kids(j, level + 1); } } if(kidcount) { indent(level - 1); printf("#%d has %d kids\n", tok, kidcount); nodeswithkids++; if(kidcount > maxkidcount) maxkidcount = kidcount; } else { indent(level); fputs("(no kids)\n", stdout); } } /* -vv */ void dump_tokens(void) { int i, j, prune; maxkidcount = maxlevel = totalkidcount = nodeswithkids = 0; for(i = 0; i < 256; i++) { prune = 1; for(j = 0; j < 256; j++) { if(tokens[i][j]) { prune = 0; break; } } if(!prune) { print_tok(i, i); dump_kids(i, 1); } } printf("maxkidcount %d, maxlevel = %d, totalkidcount = %d\n", maxkidcount, maxlevel, totalkidcount); printf("avgkidcount: %.2f\n", ((float)totalkidcount) / (float)(nodeswithkids)); } /*********************************************************************/ void init_table(void) { memset(tokens, 0, sizeof(tokens)); token_bits = INITIAL_BITS; max_token = 1 << INITIAL_BITS; curr_token = INIT_TOKEN; } void inc_output_len(void) { if(++output_len == MAX_INPUT_SIZE) { fprintf(stderr, "%s: fatal: compressed file would be >16MB.\n", self); exit(1); } output_buf[output_len] = 0; } void append_bit(int bit) { output_buf[output_len] |= (bit << (7 - out_bitpos)); out_bitpos++; if(out_bitpos == 8) { out_bitpos = 0; inc_output_len(); } } void store_token(int tok) { int mask; if(opt_verbose > 1) { printf("<%d >%d:%d #%d\n", in_pos, output_len, out_bitpos, tok); } for(mask = 1 << (token_bits - 1); mask; mask >>= 1) { append_bit(tok & mask ? 1 : 0); } } short match_token(void) { short t, nt; t = input_buf[in_pos]; in_pos++; if(in_pos == input_len) return t; while((nt = tokens[t][input_buf[in_pos]])) { t = nt; in_pos++; if(in_pos == input_len) break; } return t; } void make_token(short tok, u8 chr) { if(curr_token == max_token) { if(opt_verbose > 1) { printf("\ntoken %d won't fit in %d bits, ", max_token, token_bits); } if(token_bits == MAX_BITS) { if(opt_verbose > 1) { printf("resetting token_bits to %d\n", INITIAL_BITS); printf("token table is full, clearing, old contents:\n"); dump_tokens(); } store_token(TOK_RESET); /* stored at the *old* token size! */ token_bits = INITIAL_BITS; init_table(); return; /* since we're starting over, *don't* make a token */ } else { token_bits++; if(opt_verbose > 1) { printf("token_bits increased to %d\n", token_bits); } } max_token = 1 << token_bits; } tokens[tok][chr] = curr_token; curr_token++; } void crunch(void) { short tok; if(opt_verbose > 1) putchar('\n'); init_table(); out_bitpos = 0; in_pos = 0; /* 0-byte input files don't get a TOK_RESET */ if(!input_len) { store_token(TOK_END); inc_output_len(); return; } store_token(TOK_RESET); while(in_pos < input_len) { tok = match_token(); store_token(tok); if(in_pos < input_len) make_token(tok, input_buf[in_pos]); } store_token(TOK_END); if(out_bitpos) inc_output_len(); if(opt_verbose > 1) { printf("\nfinal token table contents:\n"); dump_tokens(); } init_table(); } /* ******************************************************************** The tokens are stored in a tree with 256 roots (tokens[0 to 256]). There's one root per character code (0-255). This serves as the hard-coded initial dictionary (each token number at this level maps to its character code). Each element of tokens[] has an array of 256 shorts, which are the token numbers for each possible character that follows it. When creating a token, you pass make_token() the index of an existing token. For example if your new token is for the string "ab", you pass make_token() the ASCII value of 'a', and it creates the token by storing the new token's index at tokens['a']['b']. You can think of tokens[] as 256 trees, each of which has 256 branches, which can be 0 (no node here) or the base of another 256-branch tree. For "abc" as the first 3 bytes of the input file, we get: tokens['a']['b'] = 258 (aka INIT_TOKEN) tokens['b']['c'] = 259 Using a static array wastes memory, but not much by modern standards, and it avoids a time-consuming linear search of each level of the tree to see if the current character matches an existing token. Array lookups are O(1). ********************************************************************* */