| field | value | date |
|---|---|---|
| author | B. Watson <urchlay@slackware.uk> | 2025-12-10 01:07:22 -0500 |
| committer | B. Watson <urchlay@slackware.uk> | 2025-12-10 01:07:22 -0500 |
| commit | 56b4fa84a48a1ad07eef1b5bb85e12acccc6e8b4 (patch) | |
| tree | 5b8c08bbb426441b01ec1f5801cddff34367197d | |
| parent | fed727745fa49a91303101ef99b4d0e768aee102 (diff) | |
| download | alftools-56b4fa84a48a1ad07eef1b5bb85e12acccc6e8b4.tar.gz | |
crunch.c: restore -vv (needs work still), update explanation at bottom of the file.
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | src/crunch.c | 226 |

1 file changed, 57 insertions, 169 deletions
```diff
diff --git a/src/crunch.c b/src/crunch.c
index 709e522..655e441 100644
--- a/src/crunch.c
+++ b/src/crunch.c
@@ -17,12 +17,15 @@
 #define TOK_END 257
 #define INIT_TOKEN 258
 
-extern int opt_verbose; /* alf.c, -v option */
+extern int opt_verbose; /* alf.c, -v = 1, -vv = 2 */
 
 u8 input_buf[MAX_INPUT_SIZE];
 u8 output_buf[MAX_INPUT_SIZE];
 unsigned int input_len, output_len, out_bitpos;
 
+/* sizeof(short) on my machine is 2, so this is a 2MB table.
+   definitely not how you'd do it on the Atari, but 2MB is nothing
+   on a modern system. */
 short tokens[MAX_TOKENS][256];
 
 int token_bits;
@@ -30,16 +33,11 @@ int max_token;
 int curr_token = INIT_TOKEN;
 int in_pos;
 
-void init_table(void) {
-	int i;
-
-	memset(tokens, 0, sizeof(tokens));
-
-	token_bits = INITIAL_BITS;
-	max_token = 1 << INITIAL_BITS;
-	curr_token = INIT_TOKEN;
-}
+/*********************************************************************/
+/* -vv */
+int maxkidcount = 0, maxlevel = 0, totalkidcount = 0, nodeswithkids = 0;
 
+/* -vv */
 char *fmt_chr(u8 c) {
 	static char buf[10];
 	if(c > 33 && c < 127)
@@ -49,6 +47,7 @@ char *fmt_chr(u8 c) {
 	return buf;
 }
 
+/* -vv */
 void indent(int level) {
 	int i;
 
@@ -56,51 +55,57 @@ void indent(int level) {
 		fputs(" ", stdout);
 }
 
-#if 0
-int maxkidcount = 0, maxlevel = 0, totalkidcount = 0, nodeswithkids = 0;
-void dump_kids(token_t *t, int level) {
-	token_t *kid;
-	int kidcount =0;
+/* -vv */
+void print_tok(short tok, u8 chr) {
+	printf("#%d/%s\n", tok, fmt_chr(chr));
+}
+
+/* -vv */
+void dump_kids(short tok, int level) {
+	int i, j;
+	int kidcount = 0;
 
 	if(level > maxlevel) maxlevel = level;
-	if(t->kids) {
-		kid = t->kids;
-		while(kid) {
+	for(i = 0; i < 256; i++) {
+		if( (j = tokens[tok][i]) ) {
 			kidcount++;
 			totalkidcount++;
 			indent(level);
-			printf("#%d/%s\n", kid->number, fmt_chr(kid->chr));
-			dump_kids(kid, level + 1);
-			kid = kid->sibling;
+			print_tok(j, i);
+			dump_kids(j, level + 1);
 		}
-		indent(level - 1);
-		printf("#%d has %d kids\n", t->number, kidcount);
-		if(kidcount > maxkidcount) maxkidcount = kidcount;
-		nodeswithkids++;
-	} else {
+	}
+
+	if(!kidcount) {
 		indent(level);
 		fputs("(no kids)\n", stdout);
 	}
 }
-#endif
 
+/* -vv */
 void dump_tokens(void) {
-#if 0
 	int i;
 
 	maxkidcount = maxlevel = totalkidcount = nodeswithkids = 0;
 
 	for(i = 0; i < 256; i++) {
-		if(tokens[i].kids) {
-			printf("tokens[%s], #%d\n", fmt_chr(tokens[i].chr), tokens[i].number);
-			dump_kids(&tokens[i], 1);
-		}
+		print_tok(i, i);
+		dump_kids(i, 0);
 	}
 
 	printf("maxkidcount %d, maxlevel = %d, totalkidcount = %d\n",
 		maxkidcount, maxlevel, totalkidcount);
 	printf("avgkidcount: %.2f\n",
 		((float)totalkidcount) / (float)(nodeswithkids));
-#endif
+}
+
+/*********************************************************************/
+
+void init_table(void) {
+	memset(tokens, 0, sizeof(tokens));
+
+	token_bits = INITIAL_BITS;
+	max_token = 1 << INITIAL_BITS;
+	curr_token = INIT_TOKEN;
 }
 void inc_output_len(void) {
@@ -132,96 +137,6 @@ void store_token(int tok) {
 	}
 }
 
-#if 0
-token_t *get_kid(token_t *t, u8 chr) {
-	token_t *kid;
-
-	if(!t->kids) return 0;
-
-	kid = t->kids;
-	while(kid) {
-		if(kid->chr == chr)
-			return kid;
-		kid = kid->sibling;
-	}
-	return 0;
-}
-
-token_t *match_token(void) {
-	token_t *t, *p;
-
-	t = &tokens[input_buf[in_pos]];
-	in_pos++;
-	if(in_pos == input_len)
-		return t;
-
-	p = t;
-	while((p = get_kid(t, input_buf[in_pos]))) {
-		t = p;
-		in_pos++;
-		if(in_pos == input_len) break;
-	}
-	return t;
-}
-
-token_t *new_token(u8 chr) {
-	token_t *newtok;
-
-	newtok = &tokens[curr_token];
-	newtok->chr = chr;
-	newtok->kids = newtok->sibling = 0;
-	newtok->number = curr_token;
-
-	return newtok;
-}
-
-/* since we're not sorting the kids, adding the new kid as the
-   head of the list is faster than walking the list to add it
-   to the tail. gives about a 10% speedup. */
-void add_kid(token_t *oldtok, token_t *newtok) {
-	if(!oldtok->kids) {
-		oldtok->kids = newtok;
-		return;
-	}
-
-	newtok->sibling = oldtok->kids;
-	oldtok->kids = newtok;
-}
-
-void make_token(token_t *oldtok, u8 newchr) {
-	token_t *newtok;
-
-	/* if the token table is full, reset it. basically start over like
-	   we would with a new file. */
-	if(curr_token == max_token) {
-		if(opt_verbose > 1) {
-			printf("\ntoken %d won't fit in %d bits, ", max_token, token_bits);
-		}
-		if(token_bits == MAX_BITS) {
-			if(opt_verbose > 1) {
-				printf("resetting token_bits to %d\n", INITIAL_BITS);
-				printf("token table is full, clearing, old contents:\n");
-				dump_tokens();
-			}
-			store_token(TOK_RESET); /* stored at the *old* token size! */
-			token_bits = INITIAL_BITS;
-			init_table();
-			return; /* since we're starting over, *don't* make a token */
-		} else {
-			token_bits++;
-			if(opt_verbose > 1) {
-				printf("token_bits increased to %d\n", token_bits);
-			}
-		}
-		max_token = 1 << token_bits;
-	}
-
-	newtok = new_token(newchr);
-	add_kid(oldtok, newtok);
-	curr_token++;
-}
-#endif
-
 short match_token(void) {
 	short t, nt;
 
@@ -266,6 +181,7 @@ void make_token(short tok, u8 chr) {
 	tokens[tok][chr] = curr_token;
 	curr_token++;
 }
+
 void crunch(void) {
 	short tok;
 
@@ -306,53 +222,25 @@ There's one root per character code (0-255). This serves as the
 hard-coded initial dictionary (each token number at this level maps
 to its character code).
 
-Each node's siblings pointer points to the next node at that level.
-This makes each level of a tree a linked list.
+Each element of tokens[] has an array of 256 shorts, which are the
+token numbers for each possible character that follows it.
 
-Each node that has children, its kids pointer points to the head of
-the siblings list (the first of its child nodes).
-
-When creating a token, you pass make_token() a pointer to an existing
+When creating a token, you pass make_token() the index of an existing
 token. For example if your new token is for the string "ab", you pass
-make_token() the address of tokens['a']. The new token will be
-added to the 'a' token's kids as the sibling of the last kid in the
-list (if there are already kids) or the first kid if there are none.
-
-The number in the token_t increases sequentially as tokens are
-created, and is the compressed token that actually get written to the
-ALF file. The chr is the last character of the token (the previous
-ones are at higher levels of the tree).
-
-What match_token() does is walk the tree, starting from the current
-input position and from the root node for the byte at the current
-position, and search each level for a token whose chr matches the
-next byte in the input. It keeps searching until there's no match, and
-returns the last token it matched. It can return the root, which means
-no other tokens start with that character.
-
-An example: the input begins with "abcdaZ".
-
-
-Partial token structure:
-
-token[['a'] = token #97, no siblings
-  \- kids: token #130, b; token #<whatever>, Z
-       \- kids: token #131, c
-
-Later on in the file, if we run into the string "abcX", match_token()
-will start with the root token for 'a', walk through its kids looking
-for 'b' (which it finds), then walk through the 'b' token's kids
-looking for a 'c' (which it finds). Suppose the letter 'X' has never
-occured yet in the input. match_token() will return token #131 (follow
-the tree, that's the token for "abc").
-
-A new token is created for every character in the file that doesn't
-match one of the existing tokens.
-
-Walking the tree is fast, but it might be made even faster by sorting
-the kids at each level, and doing a binary search instead of linear...
-or not: most nodes that have kids only have 2 or 3 (avgkidcount around
-2.5 for a text file). The overhead of sorting might make performance
-worse.
+make_token() the ASCII value of 'a', and it creates the token by storing
+the new token's index at tokens['a']['b'].
+
+You can think of tokens[] as 256 trees, each of which has 256 branches,
+which can be 0 (no node here) or the base of another 256-branch tree.
+
+For "abc" as the first 3 bytes of the input file, we get:
+
+tokens['a']['b'] = 258 (aka INIT_TOKEN)
+tokens['b']['c'] = 259
+
+Using a static array wastes memory, but not much by modern standards,
+and it avoids a time-consuming linear search of each level of the
+tree to see if the current character matches an existing token. Array
+lookups are O(1).
 
 ********************************************************************* */
```
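The worked "abc" example at the bottom of the diff is easy to turn into a compilable sketch. The "2MB table" comment also pins down the geometry: with sizeof(short) == 2, a 2MB tokens[][] means MAX_TOKENS * 256 * 2 = 2,097,152 bytes, so MAX_TOKENS is presumably 4096 (12-bit tokens). The sketch below assumes those values; the constants and the longest-match loop are inferred from this diff, not copied from crunch.c, and bit-packing and table resets are left out:

```c
/* Minimal sketch of the tokens[][] trie described in the explanation.
   MAX_TOKENS (4096) and INIT_TOKEN (258) are assumptions based on the
   "2MB table" comment and the example above. */
#include <stdio.h>
#include <string.h>

#define MAX_TOKENS 4096
#define INIT_TOKEN 258

short tokens[MAX_TOKENS][256];
int curr_token = INIT_TOKEN;

int main(void) {
	const unsigned char *in = (const unsigned char *)"abcab";
	int len = 5, pos = 0;
	short tok;

	memset(tokens, 0, sizeof(tokens));

	while(pos < len) {
		/* longest match: start at the root for in[pos]... */
		tok = in[pos++];
		/* ...and follow child links while the next byte matches */
		while(pos < len && tokens[tok][in[pos]])
			tok = tokens[tok][in[pos++]];

		printf("emit token #%d\n", tok);

		/* new token = matched string + the byte that broke the match */
		if(pos < len && curr_token < MAX_TOKENS) {
			tokens[tok][in[pos]] = curr_token;
			printf("  tokens[%d]['%c'] = %d\n", tok, in[pos], curr_token);
			curr_token++;
		}
	}
	return 0;
}
```

On "abcab" this emits tokens #97, #98, #99, then #258 for the final "ab", and creates tokens['a']['b'] = 258, tokens['b']['c'] = 259, and tokens['c']['a'] = 260, matching the table layout the explanation describes.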
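The removed make_token() also documents the other half of the machinery: tokens are emitted at the current token_bits width, the width grows when curr_token reaches max_token, and TOK_RESET is stored at the *old* width before the table is cleared. store_token()'s body isn't visible in this diff, so the following is only a hedged sketch of variable-width output; LSB-first bit order is an assumption here, not necessarily what ALF files actually use:

```c
/* Hedged sketch of variable-width token output. The real store_token()
   in crunch.c is not shown in this diff; LSB-first packing is assumed. */
#include <stdio.h>

unsigned char out[1024];   /* zero-initialized output buffer */
unsigned int out_bits = 0; /* total bits written so far */

/* append the low 'bits' bits of tok, least significant bit first */
void put_bits(int tok, int bits) {
	int i;
	for(i = 0; i < bits; i++) {
		if(tok & (1 << i))
			out[out_bits >> 3] |= 1 << (out_bits & 7);
		out_bits++;
	}
}

int main(void) {
	put_bits(258, 9);  /* a token while token_bits == 9 */
	put_bits(258, 10); /* the same value after a width bump */
	printf("%u bits used, first bytes: %02x %02x %02x\n",
		out_bits, out[0], out[1], out[2]);
	return 0;
}
```

The decoder has to grow its own token_bits in lockstep, which is why the removed code stresses that TOK_RESET gets stored at the old token size: it must be read back at the width the decoder currently expects.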
