aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorB. Watson <urchlay@slackware.uk>2025-12-10 01:07:22 -0500
committerB. Watson <urchlay@slackware.uk>2025-12-10 01:07:22 -0500
commit56b4fa84a48a1ad07eef1b5bb85e12acccc6e8b4 (patch)
tree5b8c08bbb426441b01ec1f5801cddff34367197d
parentfed727745fa49a91303101ef99b4d0e768aee102 (diff)
downloadalftools-56b4fa84a48a1ad07eef1b5bb85e12acccc6e8b4.tar.gz
crunch.c: restore -vv (needs work still), update explanation at bottom of the file.
-rw-r--r--src/crunch.c226
1 files changed, 57 insertions, 169 deletions
diff --git a/src/crunch.c b/src/crunch.c
index 709e522..655e441 100644
--- a/src/crunch.c
+++ b/src/crunch.c
@@ -17,12 +17,15 @@
#define TOK_END 257
#define INIT_TOKEN 258
-extern int opt_verbose; /* alf.c, -v option */
+extern int opt_verbose; /* alf.c, -v = 1, -vv = 2 */
u8 input_buf[MAX_INPUT_SIZE];
u8 output_buf[MAX_INPUT_SIZE];
unsigned int input_len, output_len, out_bitpos;
+/* sizeof(short) on my machine is 2, so this is a 2MB table.
+ definitely not how you'd do it on the Atari, but 2MB is nothing
+ on a modern system. */
short tokens[MAX_TOKENS][256];
int token_bits;
@@ -30,16 +33,11 @@ int max_token;
int curr_token = INIT_TOKEN;
int in_pos;
-void init_table(void) {
- int i;
-
- memset(tokens, 0, sizeof(tokens));
-
- token_bits = INITIAL_BITS;
- max_token = 1 << INITIAL_BITS;
- curr_token = INIT_TOKEN;
-}
+/*********************************************************************/
+/* -vv */
+int maxkidcount = 0, maxlevel = 0, totalkidcount = 0, nodeswithkids = 0;
+/* -vv */
char *fmt_chr(u8 c) {
static char buf[10];
if(c > 33 && c < 127)
@@ -49,6 +47,7 @@ char *fmt_chr(u8 c) {
return buf;
}
+/* -vv */
void indent(int level) {
int i;
@@ -56,51 +55,57 @@ void indent(int level) {
fputs(" ", stdout);
}
-#if 0
-int maxkidcount = 0, maxlevel = 0, totalkidcount = 0, nodeswithkids = 0;
-void dump_kids(token_t *t, int level) {
- token_t *kid;
- int kidcount =0;
+/* -vv */
+void print_tok(short tok, u8 chr) {
+ printf("#%d/%s\n", tok, fmt_chr(chr));
+}
+
+/* -vv */
+void dump_kids(short tok, int level) {
+ int i, j;
+ int kidcount = 0;
if(level > maxlevel) maxlevel = level;
- if(t->kids) {
- kid = t->kids;
- while(kid) {
+ for(i = 0; i < 256; i++) {
+ if( (j = tokens[tok][i]) ) {
kidcount++;
totalkidcount++;
indent(level);
- printf("#%d/%s\n", kid->number, fmt_chr(kid->chr));
- dump_kids(kid, level + 1);
- kid = kid->sibling;
+ print_tok(j, i);
+ dump_kids(j, level + 1);
}
- indent(level - 1);
- printf("#%d has %d kids\n", t->number, kidcount);
- if(kidcount > maxkidcount) maxkidcount = kidcount;
- nodeswithkids++;
- } else {
+ }
+
+ if(!kidcount) {
indent(level);
fputs("(no kids)\n", stdout);
}
}
-#endif
+/* -vv */
void dump_tokens(void) {
-#if 0
int i;
maxkidcount = maxlevel = totalkidcount = nodeswithkids = 0;
for(i = 0; i < 256; i++) {
- if(tokens[i].kids) {
- printf("tokens[%s], #%d\n", fmt_chr(tokens[i].chr), tokens[i].number);
- dump_kids(&tokens[i], 1);
- }
+ print_tok(i, i);
+ dump_kids(i, 0);
}
printf("maxkidcount %d, maxlevel = %d, totalkidcount = %d\n", maxkidcount, maxlevel, totalkidcount);
printf("avgkidcount: %.2f\n", ((float)totalkidcount) / (float)(nodeswithkids));
-#endif
+}
+
+/*********************************************************************/
+
+void init_table(void) {
+ memset(tokens, 0, sizeof(tokens));
+
+ token_bits = INITIAL_BITS;
+ max_token = 1 << INITIAL_BITS;
+ curr_token = INIT_TOKEN;
}
void inc_output_len(void) {
@@ -132,96 +137,6 @@ void store_token(int tok) {
}
}
-#if 0
-token_t *get_kid(token_t *t, u8 chr) {
- token_t *kid;
-
- if(!t->kids) return 0;
-
- kid = t->kids;
- while(kid) {
- if(kid->chr == chr)
- return kid;
- kid = kid->sibling;
- }
- return 0;
-}
-
-token_t *match_token(void) {
- token_t *t, *p;
-
- t = &tokens[input_buf[in_pos]];
- in_pos++;
- if(in_pos == input_len)
- return t;
-
- p = t;
- while((p = get_kid(t, input_buf[in_pos]))) {
- t = p;
- in_pos++;
- if(in_pos == input_len) break;
- }
- return t;
-}
-
-token_t *new_token(u8 chr) {
- token_t *newtok;
-
- newtok = &tokens[curr_token];
- newtok->chr = chr;
- newtok->kids = newtok->sibling = 0;
- newtok->number = curr_token;
-
- return newtok;
-}
-
-/* since we're not sorting the kids, adding the new kid as the
- head of the list is faster than walking the list to add it
- to the tail. gives about a 10% speedup. */
-void add_kid(token_t *oldtok, token_t *newtok) {
- if(!oldtok->kids) {
- oldtok->kids = newtok;
- return;
- }
-
- newtok->sibling = oldtok->kids;
- oldtok->kids = newtok;
-}
-
-void make_token(token_t *oldtok, u8 newchr) {
- token_t *newtok;
-
- /* if the token table is full, reset it. basically start over like
- we would with a new file. */
- if(curr_token == max_token) {
- if(opt_verbose > 1) {
- printf("\ntoken %d won't fit in %d bits, ", max_token, token_bits);
- }
- if(token_bits == MAX_BITS) {
- if(opt_verbose > 1) {
- printf("resetting token_bits to %d\n", INITIAL_BITS);
- printf("token table is full, clearing, old contents:\n");
- dump_tokens();
- }
- store_token(TOK_RESET); /* stored at the *old* token size! */
- token_bits = INITIAL_BITS;
- init_table();
- return; /* since we're starting over, *don't* make a token */
- } else {
- token_bits++;
- if(opt_verbose > 1) {
- printf("token_bits increased to %d\n", token_bits);
- }
- }
- max_token = 1 << token_bits;
- }
-
- newtok = new_token(newchr);
- add_kid(oldtok, newtok);
- curr_token++;
-}
-#endif
-
short match_token(void) {
short t, nt;
@@ -266,6 +181,7 @@ void make_token(short tok, u8 chr) {
tokens[tok][chr] = curr_token;
curr_token++;
}
+
void crunch(void) {
short tok;
@@ -306,53 +222,25 @@ There's one root per character code (0-255). This serves as the
hard-coded initial dictionary (each token number at this level maps to
its character code).
-Each node's siblings pointer points to the next node at that level.
-This makes each level of a tree a linked list.
+Each element of tokens[] has an array of 256 shorts, which are the
+token numbers for each possible character that follows it.
-Each node that has children, its kids pointer points to the head of
-the siblings list (the first of its child nodes).
-
-When creating a token, you pass make_token() a pointer to an existing
+When creating a token, you pass make_token() the index of an existing
token. For example if your new token is for the string "ab", you pass
-make_token() the address of tokens['a']. The new token will be
-added to the 'a' token's kids as the sibling of the last kid in the
-list (if there are already kids) or the first kid if there are none.
-
-The number in the token_t increases sequentially as tokens are
-created, and is the compressed token that actually get written to the
-ALF file. The chr is the last character of the token (the previous
-ones are at higher levels of the tree).
-
-What match_token() does is walk the tree, starting from the current
-input position and from the root node for the byte at the current
-position, and search each level for a token whose chr matches the
-next byte in the input. It keeps searching until there's no match, and
-returns the last token it matched. It can return the root, which means
-no other tokens start with that character.
-
-An example: the input begins with "abcdaZ".
-
-
-Partial token structure:
-
-token[['a'] = token #97, no siblings
- \- kids: token #130, b; token #<whatever>, Z
- \- kids: token #131, c
-
-Later on in the file, if we run into the string "abcX", match_token()
-will start with the root token for 'a', walk through its kids looking
-for 'b' (which it finds), then walk through the 'b' token's kids
-looking for a 'c' (which it finds). Suppose the letter 'X' has never
-occured yet in the input. match_token() will return token #131 (follow
-the tree, that's the token for "abc").
-
-A new token is created for every character in the file that doesn't
-match one of the existing tokens.
-
-Walking the tree is fast, but it might be made even faster by sorting
-the kids at each level, and doing a binary search instead of linear...
-or not: most nodes that have kids only have 2 or 3 (avgkidcount around
-2.5 for a text file). The overhead of sorting might make performance
-worse.
+make_token() the ASCII value of 'a', and it creates the token by storing
+the new token's index at tokens['a']['b'].
+
+You can think of tokens[] as 256 trees, each of which has 256 branches,
+which can be 0 (no node here) or the base of another 256-branch tree.
+
+For "abc" as the first 3 bytes of the input file, we get:
+
+tokens['a']['b'] = 258 (aka INIT_TOKEN)
+tokens['b']['c'] = 259
+
+Using a static array wastes memory, but not much by modern standards,
+and it avoids a time-consuming linear search of each level of the
+tree to see if the current character matches an existing token. Array
+lookups are O(1).
********************************************************************* */