| field | value | date |
|---|---|---|
| author | B. Watson <urchlay@slackware.uk> | 2025-12-10 01:07:22 -0500 |
| committer | B. Watson <urchlay@slackware.uk> | 2025-12-10 01:07:22 -0500 |
| commit | 56b4fa84a48a1ad07eef1b5bb85e12acccc6e8b4 (patch) | |
| tree | 5b8c08bbb426441b01ec1f5801cddff34367197d | |
| parent | fed727745fa49a91303101ef99b4d0e768aee102 (diff) | |
| download | alftools-56b4fa84a48a1ad07eef1b5bb85e12acccc6e8b4.tar.gz | |
crunch.c: restore -vv (needs work still), update explanation at bottom of the file.
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | src/crunch.c | 226 |

1 file changed, 57 insertions, 169 deletions
```diff
diff --git a/src/crunch.c b/src/crunch.c
index 709e522..655e441 100644
--- a/src/crunch.c
+++ b/src/crunch.c
@@ -17,12 +17,15 @@
 #define TOK_END 257
 #define INIT_TOKEN 258
 
-extern int opt_verbose; /* alf.c, -v option */
+extern int opt_verbose; /* alf.c, -v = 1, -vv = 2 */
 
 u8 input_buf[MAX_INPUT_SIZE];
 u8 output_buf[MAX_INPUT_SIZE];
 unsigned int input_len, output_len, out_bitpos;
 
+/* sizeof(short) on my machine is 2, so this is a 2MB table.
+   definitely not how you'd do it on the Atari, but 2MB is nothing
+   on a modern system. */
 short tokens[MAX_TOKENS][256];
 
 int token_bits;
@@ -30,16 +33,11 @@ int max_token;
 int curr_token = INIT_TOKEN;
 int in_pos;
 
-void init_table(void) {
-	int i;
-
-	memset(tokens, 0, sizeof(tokens));
-
-	token_bits = INITIAL_BITS;
-	max_token = 1 << INITIAL_BITS;
-	curr_token = INIT_TOKEN;
-}
+/*********************************************************************/
+/* -vv */
+int maxkidcount = 0, maxlevel = 0, totalkidcount = 0, nodeswithkids = 0;
 
+/* -vv */
 char *fmt_chr(u8 c) {
 	static char buf[10];
 	if(c > 33 && c < 127)
@@ -49,6 +47,7 @@ char *fmt_chr(u8 c) {
 	return buf;
 }
 
+/* -vv */
 void indent(int level) {
 	int i;
 
@@ -56,51 +55,57 @@ void indent(int level) {
 		fputs(" ", stdout);
 }
 
-#if 0
-int maxkidcount = 0, maxlevel = 0, totalkidcount = 0, nodeswithkids = 0;
-void dump_kids(token_t *t, int level) {
-	token_t *kid;
-	int kidcount =0;
+/* -vv */
+void print_tok(short tok, u8 chr) {
+	printf("#%d/%s\n", tok, fmt_chr(chr));
+}
+
+/* -vv */
+void dump_kids(short tok, int level) {
+	int i, j;
+	int kidcount = 0;
 
 	if(level > maxlevel) maxlevel = level;
-	if(t->kids) {
-		kid = t->kids;
-		while(kid) {
+	for(i = 0; i < 256; i++) {
+		if( (j = tokens[tok][i]) ) {
 			kidcount++;
 			totalkidcount++;
 			indent(level);
-			printf("#%d/%s\n", kid->number, fmt_chr(kid->chr));
-			dump_kids(kid, level + 1);
-			kid = kid->sibling;
+			print_tok(j, i);
+			dump_kids(j, level + 1);
 		}
-		indent(level - 1);
-		printf("#%d has %d kids\n", t->number, kidcount);
-		if(kidcount > maxkidcount) maxkidcount = kidcount;
-		nodeswithkids++;
-	} else {
+	}
+
+	if(!kidcount) {
 		indent(level);
 		fputs("(no kids)\n", stdout);
 	}
 }
-#endif
 
+/* -vv */
 void dump_tokens(void) {
-#if 0
 	int i;
 
 	maxkidcount = maxlevel = totalkidcount = nodeswithkids = 0;
 
 	for(i = 0; i < 256; i++) {
-		if(tokens[i].kids) {
-			printf("tokens[%s], #%d\n", fmt_chr(tokens[i].chr), tokens[i].number);
-			dump_kids(&tokens[i], 1);
-		}
+		print_tok(i, i);
+		dump_kids(i, 0);
 	}
 
 	printf("maxkidcount %d, maxlevel = %d, totalkidcount = %d\n",
 		maxkidcount, maxlevel, totalkidcount);
 	printf("avgkidcount: %.2f\n",
 		((float)totalkidcount) / (float)(nodeswithkids));
-#endif
+}
+
+/*********************************************************************/
+
+void init_table(void) {
+	memset(tokens, 0, sizeof(tokens));
+
+	token_bits = INITIAL_BITS;
+	max_token = 1 << INITIAL_BITS;
+	curr_token = INIT_TOKEN;
 }
 void inc_output_len(void) {
@@ -132,96 +137,6 @@ void store_token(int tok) {
 	}
 }
 
-#if 0
-token_t *get_kid(token_t *t, u8 chr) {
-	token_t *kid;
-
-	if(!t->kids) return 0;
-
-	kid = t->kids;
-	while(kid) {
-		if(kid->chr == chr)
-			return kid;
-		kid = kid->sibling;
-	}
-	return 0;
-}
-
-token_t *match_token(void) {
-	token_t *t, *p;
-
-	t = &tokens[input_buf[in_pos]];
-	in_pos++;
-	if(in_pos == input_len)
-		return t;
-
-	p = t;
-	while((p = get_kid(t, input_buf[in_pos]))) {
-		t = p;
-		in_pos++;
-		if(in_pos == input_len) break;
-	}
-	return t;
-}
-
-token_t *new_token(u8 chr) {
-	token_t *newtok;
-
-	newtok = &tokens[curr_token];
-	newtok->chr = chr;
-	newtok->kids = newtok->sibling = 0;
-	newtok->number = curr_token;
-
-	return newtok;
-}
-
-/* since we're not sorting the kids, adding the new kid as the
-   head of the list is faster than walking the list to add it
-   to the tail. gives about a 10% speedup. */
-void add_kid(token_t *oldtok, token_t *newtok) {
-	if(!oldtok->kids) {
-		oldtok->kids = newtok;
-		return;
-	}
-
-	newtok->sibling = oldtok->kids;
-	oldtok->kids = newtok;
-}
-
-void make_token(token_t *oldtok, u8 newchr) {
-	token_t *newtok;
-
-	/* if the token table is full, reset it. basically start over like
-	   we would with a new file. */
-	if(curr_token == max_token) {
-		if(opt_verbose > 1) {
-			printf("\ntoken %d won't fit in %d bits, ", max_token, token_bits);
-		}
-		if(token_bits == MAX_BITS) {
-			if(opt_verbose > 1) {
-				printf("resetting token_bits to %d\n", INITIAL_BITS);
-				printf("token table is full, clearing, old contents:\n");
-				dump_tokens();
-			}
-			store_token(TOK_RESET); /* stored at the *old* token size! */
-			token_bits = INITIAL_BITS;
-			init_table();
-			return; /* since we're starting over, *don't* make a token */
-		} else {
-			token_bits++;
-			if(opt_verbose > 1) {
-				printf("token_bits increased to %d\n", token_bits);
-			}
-		}
-		max_token = 1 << token_bits;
-	}
-
-	newtok = new_token(newchr);
-	add_kid(oldtok, newtok);
-	curr_token++;
-}
-#endif
-
 short match_token(void) {
 	short t, nt;
 
@@ -266,6 +181,7 @@ void make_token(short tok, u8 chr) {
 	tokens[tok][chr] = curr_token;
 	curr_token++;
 }
+
 void crunch(void) {
 	short tok;
 
@@ -306,53 +222,25 @@ There's one root per character code (0-255). This serves as the
 hard-coded initial dictionary (each token number at this level maps
 to its character code).
 
-Each node's siblings pointer points to the next node at that level.
-This makes each level of a tree a linked list.
+Each element of tokens[] has an array of 256 shorts, which are the
+token numbers for each possible character that follows it.
 
-Each node that has children, its kids pointer points to the head of
-the siblings list (the first of its child nodes).
-
-When creating a token, you pass make_token() a pointer to an existing
+When creating a token, you pass make_token() the index of an existing
 token. For example if your new token is for the string "ab", you pass
-make_token() the address of tokens['a']. The new token will be
-added to the 'a' token's kids as the sibling of the last kid in the
-list (if there are already kids) or the first kid if there are none.
-
-The number in the token_t increases sequentially as tokens are
-created, and is the compressed token that actually get written to the
-ALF file. The chr is the last character of the token (the previous
-ones are at higher levels of the tree).
-
-What match_token() does is walk the tree, starting from the current
-input position and from the root node for the byte at the current
-position, and search each level for a token whose chr matches the
-next byte in the input. It keeps searching until there's no match, and
-returns the last token it matched. It can return the root, which means
-no other tokens start with that character.
-
-An example: the input begins with "abcdaZ".
-
-
-Partial token structure:
-
-token[['a'] = token #97, no siblings
-  \- kids: token #130, b; token #<whatever>, Z
-       \- kids: token #131, c
-
-Later on in the file, if we run into the string "abcX", match_token()
-will start with the root token for 'a', walk through its kids looking
-for 'b' (which it finds), then walk through the 'b' token's kids
-looking for a 'c' (which it finds). Suppose the letter 'X' has never
-occured yet in the input. match_token() will return token #131 (follow
-the tree, that's the token for "abc").
-
-A new token is created for every character in the file that doesn't
-match one of the existing tokens.
-
-Walking the tree is fast, but it might be made even faster by sorting
-the kids at each level, and doing a binary search instead of linear...
-or not: most nodes that have kids only have 2 or 3 (avgkidcount around
-2.5 for a text file). The overhead of sorting might make performance
-worse.
+make_token() the ASCII value of 'a', and it creates the token by storing
+the new token's index at tokens['a']['b'].
+
+You can think of tokens[] as 256 trees, each of which has 256 branches,
+which can be 0 (no node here) or the base of another 256-branch tree.
+
+For "abc" as the first 3 bytes of the input file, we get:
+
+tokens['a']['b'] = 258 (aka INIT_TOKEN)
+tokens['b']['c'] = 259
+
+Using a static array wastes memory, but not much by modern standards,
+and it avoids a time-consuming linear search of each level of the
+tree to see if the current character matches an existing token. Array
+lookups are O(1).
 
 ********************************************************************* */
```
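The worked "abc" example at the bottom of the diff is easy to turn into a compilable sketch. The "2MB table" comment also pins down the geometry: with sizeof(short) == 2, a 2MB tokens[][] means MAX_TOKENS * 256 * 2 = 2,097,152 bytes, so MAX_TOKENS is presumably 4096 (12-bit tokens). The sketch below assumes those values; the constants and the longest-match loop are inferred from this diff, not copied from crunch.c, and bit-packing and table resets are left out:

```c
/* Minimal sketch of the tokens[][] trie described in the explanation.
   MAX_TOKENS (4096) and INIT_TOKEN (258) are assumptions based on the
   "2MB table" comment and the example above. */
#include <stdio.h>
#include <string.h>

#define MAX_TOKENS 4096
#define INIT_TOKEN 258

short tokens[MAX_TOKENS][256];
int curr_token = INIT_TOKEN;

int main(void) {
	const unsigned char *in = (const unsigned char *)"abcab";
	int len = 5, pos = 0;
	short tok;

	memset(tokens, 0, sizeof(tokens));

	while(pos < len) {
		/* longest match: start at the root for in[pos]... */
		tok = in[pos++];
		/* ...and follow child links while the next byte matches */
		while(pos < len && tokens[tok][in[pos]])
			tok = tokens[tok][in[pos++]];

		printf("emit token #%d\n", tok);

		/* new token = matched string + the byte that broke the match */
		if(pos < len && curr_token < MAX_TOKENS) {
			tokens[tok][in[pos]] = curr_token;
			printf("  tokens[%d]['%c'] = %d\n", tok, in[pos], curr_token);
			curr_token++;
		}
	}
	return 0;
}
```

On "abcab" this emits tokens #97, #98, #99, then #258 for the final "ab", and creates tokens['a']['b'] = 258, tokens['b']['c'] = 259, and tokens['c']['a'] = 260, matching the table layout the explanation describes.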
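The removed make_token() also documents the other half of the machinery: tokens are emitted at the current token_bits width, the width grows when curr_token reaches max_token, and TOK_RESET is stored at the *old* width before the table is cleared. store_token()'s body isn't visible in this diff, so the following is only a hedged sketch of variable-width output; LSB-first bit order is an assumption here, not necessarily what ALF files actually use:

```c
/* Hedged sketch of variable-width token output. The real store_token()
   in crunch.c is not shown in this diff; LSB-first packing is assumed. */
#include <stdio.h>

unsigned char out[1024];   /* zero-initialized output buffer */
unsigned int out_bits = 0; /* total bits written so far */

/* append the low 'bits' bits of tok, least significant bit first */
void put_bits(int tok, int bits) {
	int i;
	for(i = 0; i < bits; i++) {
		if(tok & (1 << i))
			out[out_bits >> 3] |= 1 << (out_bits & 7);
		out_bits++;
	}
}

int main(void) {
	put_bits(258, 9);  /* a token while token_bits == 9 */
	put_bits(258, 10); /* the same value after a width bump */
	printf("%u bits used, first bytes: %02x %02x %02x\n",
		out_bits, out[0], out[1], out[2]);
	return 0;
}
```

The decoder has to grow its own token_bits in lockstep, which is why the removed code stresses that TOK_RESET gets stored at the old token size: it must be read back at the width the decoder currently expects.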
