alf: isolate crunch algo in its own file.

author: B. Watson <urchlay@slackware.uk> 2025-11-30 17:17:46 -0500
committer: B. Watson <urchlay@slackware.uk> 2025-11-30 17:17:46 -0500
commit: 2b21c11d156e5c76c550b61a3b3a33a89d8578f9 (patch)
tree: df06d5bf78486717b34949971131b6d16044e4d3
parent: be8cd823dbd9b27889421bb1dc70217d39752663 (diff)
download: alftools-2b21c11d156e5c76c550b61a3b3a33a89d8578f9.tar.gz
4 files changed, 179 insertions, 173 deletions
diff --git a/src/Makefile b/src/Makefile
index 823e57c..7933168 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -45,7 +45,7 @@ MANS=alf.1 alfsum.1 unalf.1
 
 UNALF_OBJS=unalf.o io.o listalf.o extract.o f65.o glob.o opts.o usage.o self.o asmcode.o sanity.o
 ALFSUM_OBJS=alfsum.o self.o
-ALF_OBJS=alf.o self.o alfusage.o sanity.o
+ALF_OBJS=alf.o self.o alfusage.o sanity.o crunch.o
 
 .PHONY: all clean install crosswin windows windows-upload realclean
 
@@ -65,7 +65,7 @@ alf: $(ALF_OBJS)
 
 usage.o: usage.c
 
-alfusage.o: usage.c
+alfusage.o: alfusage.c
 
 f65.o: ../f65/f65.c ../f65/f65.h
 	$(CC) $(CFLAGS) -c -o f65.o ../f65/f65.c
@@ -90,7 +90,9 @@ extract.o: extract.c addrs.h unalf.h ../f65/f65.h
 
 sanity.o: sanity.c sanity.h u816.h
 
-alf.o: alf.c sanity.h self.h u816.h
+crunch.o: crunch.c crunch.h
+
+alf.o: alf.c sanity.h self.h u816.h crunch.h
 
 ver.rst:
 	echo '.. |version| replace:: $(VERSION)' > ver.rst
diff --git a/src/alf.c b/src/alf.c
index f3c4e0c..c748cee 100644
--- a/src/alf.c
+++ b/src/alf.c
@@ -10,20 +10,7 @@
 #include "u816.h"
 #include "sanity.h"
 #include "self.h"
-
-#define INITIAL_BITS 9
-#define MAX_BITS 12
-#define MAX_TOKENS (1 << MAX_BITS)
-#define TOK_RESET 256
-#define TOK_END 257
-#define INIT_TOKEN 258 /* 256 = reset dict, 257 = token_bits++ */
-
-#define MAX_INPUT_SIZE (1 << 24)
-
-u8 input_buf[MAX_INPUT_SIZE];
-u8 output_buf[MAX_INPUT_SIZE];
-u8 byte_tokens[256];
-unsigned int input_len, output_len, out_bitpos;
+#include "crunch.h"
 
 int opt_append = 0;
 int opt_overwrite = 0;
@@ -34,37 +21,10 @@ int opt_txtconv = 0;
 int opt_verbose = 0;
 
 struct stat in_file_stat;
-
-typedef struct s_token {
-	u8 *start;
-	u16 length;
-} token_t;
-
-token_t tokentab[MAX_TOKENS];
+long hdr_compsize_pos;
 
 FILE *out_file, *in_file;
 const char *out_filename, *in_filename;
-int token_bits;
-int max_token;
-int curr_token;
-long hdr_compsize_pos;
-int in_pos;
-
-void init_table(void) {
-	int i;
-
-	memset(tokentab, 0, sizeof(tokentab));
-
-	token_bits = INITIAL_BITS;
-	max_token = 1 << INITIAL_BITS;
-	curr_token = INIT_TOKEN;
-
-	for(i = 0; i < 256; i++) {
-		byte_tokens[i] = (u8)i;
-		tokentab[i].start = &byte_tokens[i];
-		tokentab[i].length = 1;
-	}
-}
 
 void store_quad(int pos, unsigned long data) {
 	int i;
@@ -185,132 +145,6 @@ void open_input(const char *filename) {
 	}
 }
 
-void inc_output_len(void) {
-	if(++output_len == MAX_INPUT_SIZE) {
-		fprintf(stderr, "%s: fatal: compressed file would be >16MB.\n", self);
-		exit(1);
-	}
-}
-
-void append_bit(int bit) {
-	output_buf[output_len] |= (bit << (7 - out_bitpos));
-	out_bitpos++;
-	if(out_bitpos == 8) {
-		out_bitpos = 0;
-		inc_output_len();
-	}
-}
-
-void store_token(int tok) {
-	int mask;
-
-	for(mask = 1 << (token_bits - 1); mask; mask >>= 1) {
-		append_bit(tok & mask ? 1 : 0);
-	}
-}
-
-/* match_token() is a brute-force search, which is why alf is so slow.
-   I'll do something smarter at some point.
-   search backwards, the tokens are stored with longer ones later
-   in the list. */
-int match_token(int pos) {
-	int i, len, maxlen;
-	token_t *t;
-	u8 *p, *q;
-
-	maxlen = input_len - pos;
-
-	for(i = curr_token - 1; i >= INIT_TOKEN; i--) {
-		t = &tokentab[i];
-
-		/* don't search past the end of the input */
-		if(t->length > maxlen) continue;
-
-		/* if the first char doesn't match, don't bother with memcmp.
-		   this is a 5x speedup (!) */
-		if(input_buf[pos] != *(t->start)) continue;
-
-		/* this is where alf spends most of its time.
-			using memcmp is noticeably slower than the code below. */
-		/*
-		if(memcmp(&input_buf[pos], t->start, t->length) == 0)
-			return i;
-		*/
-
-		/* inline memcmp replacement of sorts. I don't think it's really
-		   faster than memcmp(), it only seems that way because there's
-		   no function call overhead. ~20% speedup.
-		   making it search backwards gives a further ~25% speedup.
-		 */
-		len = t->length;
-		p = &input_buf[pos] + len - 1;
-		q = t->start + len - 1;
-		while(len) {
-			if(*p != *q) break;
-			p--; q--;
-			len--;
-		}
-		if(!len) return i;
-	}
-
-	/* hard-coded single character tokens map to their values, no need
-	   to search. */
-	return input_buf[pos];
-}
-
-void make_token(int start, int end) {
-	int i;
-	/* if the token table is full, reset it. basically start over like
-	   we would with a new file. */
-	if(curr_token == max_token) {
-		if(token_bits == MAX_BITS) {
-			store_token(TOK_RESET); /* stored at the *old* token size! */
-			token_bits = INITIAL_BITS;
-			init_table();
-			return; /* since we're starting over, *don't* make a token */
-		} else {
-			token_bits++;
-		}
-		max_token = 1 << token_bits;
-	}
-	tokentab[curr_token].start = &input_buf[start];
-	tokentab[curr_token].length = end - start + 1;
-	curr_token++;
-
-	if(opt_verbose) {
-		printf("%d -> %d (%d) = ", start, end, end - start + 1);
-		for(i = start; i < end; i++)
-			putchar(input_buf[i]);
-		putchar('\n');
-	}
-}
-
-void crunch(void) {
-	int new_pos;
-	in_pos = 0;
-	int token;
-
-	out_bitpos = 0;
-
-	/* 0-byte input files don't get a TOK_RESET */
-	if(input_len)
-		store_token(TOK_RESET);
-
-	while(in_pos < input_len) {
-		if(opt_verbose) printf("in_pos==%d\n", in_pos);
-		token = match_token(in_pos);
-		store_token(token);
-		new_pos = in_pos + tokentab[token].length;
-		if(new_pos < input_len)
-			make_token(in_pos, new_pos);
-		in_pos = new_pos;
-	}
-
-	store_token(TOK_END);
-	if(out_bitpos) inc_output_len();
-	update_header();
-}
-
 void make_backup(void) {
 	char bak[PATH_MAX + 2];
 	strncpy(bak, out_filename, PATH_MAX);
@@ -330,8 +164,6 @@ void convert_eols(void) {
 }
 
 void crunch_file(const char *filename) {
-	init_table();
-
 	open_input(filename);
 
 	/* read in entire input, couldn't do it this way on the Atari */
@@ -354,6 +186,7 @@ void crunch_file(const char *filename) {
 
 	/* crunches the entire input to memory! */
 	crunch();
+	update_header();
 
 	/* don't open the output file until crunch() has succeeded once.
 	   this avoids leaving 0-byte turds */
diff --git a/src/crunch.c b/src/crunch.c
new file mode 100644
index 0000000..0cc8092
--- /dev/null
+++ b/src/crunch.c
@@ -0,0 +1,164 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "u816.h"
+#include "crunch.h"
+#include "self.h"
+
+#define INITIAL_BITS 9
+#define MAX_BITS 12
+#define MAX_TOKENS (1 << MAX_BITS)
+#define TOK_RESET 256
+#define TOK_END 257
+#define INIT_TOKEN 258
+
+u8 input_buf[MAX_INPUT_SIZE];
+u8 output_buf[MAX_INPUT_SIZE];
+unsigned int input_len, output_len, out_bitpos;
+
+u8 byte_tokens[256];
+typedef struct s_token {
+	u8 *start;
+	u16 length;
+} token_t;
+
+token_t tokentab[MAX_TOKENS];
+
+int token_bits;
+int max_token;
+int curr_token;
+int in_pos;
+
+void init_table(void) {
+	int i;
+
+	memset(tokentab, 0, sizeof(tokentab));
+
+	token_bits = INITIAL_BITS;
+	max_token = 1 << INITIAL_BITS;
+	curr_token = INIT_TOKEN;
+
+	for(i = 0; i < 256; i++) {
+		byte_tokens[i] = (u8)i;
+		tokentab[i].start = &byte_tokens[i];
+		tokentab[i].length = 1;
+	}
+}
+
+void inc_output_len(void) {
+	if(++output_len == MAX_INPUT_SIZE) {
+		fprintf(stderr, "%s: fatal: compressed file would be >16MB.\n", self);
+		exit(1);
+	}
+}
+
+void append_bit(int bit) {
+	output_buf[output_len] |= (bit << (7 - out_bitpos));
+	out_bitpos++;
+	if(out_bitpos == 8) {
+		out_bitpos = 0;
+		inc_output_len();
+	}
+}
+
+void store_token(int tok) {
+	int mask;
+
+	for(mask = 1 << (token_bits - 1); mask; mask >>= 1) {
+		append_bit(tok & mask ? 1 : 0);
+	}
+}
+
+/* match_token() is a brute-force search, which is why alf is so slow.
+   I'll do something smarter at some point.
+   search backwards, the tokens are stored with longer ones later
+   in the list. */
+int match_token(int pos) {
+	int i, len, maxlen;
+	token_t *t;
+	u8 *p, *q;
+
+	maxlen = input_len - pos;
+
+	for(i = curr_token - 1; i >= INIT_TOKEN; i--) {
+		t = &tokentab[i];
+
+		/* don't search past the end of the input */
+		if(t->length > maxlen) continue;
+
+		/* if the first char doesn't match, don't bother with memcmp.
+		   this is a 5x speedup (!) */
+		if(input_buf[pos] != *(t->start)) continue;
+
+		/* this is where alf spends most of its time.
+			using memcmp is noticeably slower than the code below. */
+		/*
+		if(memcmp(&input_buf[pos], t->start, t->length) == 0)
+			return i;
+		*/
+
+		/* inline memcmp replacement of sorts. I don't think it's really
+		   faster than memcmp(), it only seems that way because there's
+		   no function call overhead. ~20% speedup.
+		   making it search backwards gives a further ~25% speedup.
+		 */
+		len = t->length;
+		p = &input_buf[pos] + len - 1;
+		q = t->start + len - 1;
+		while(len) {
+			if(*p != *q) break;
+			p--; q--;
+			len--;
+		}
+		if(!len) return i;
+	}
+
+	/* hard-coded single character tokens map to their values, no need
+	   to search. */
+	return input_buf[pos];
+}
+
+void make_token(int start, int end) {
+	/* if the token table is full, reset it. basically start over like
+	   we would with a new file. */
+	if(curr_token == max_token) {
+		if(token_bits == MAX_BITS) {
+			store_token(TOK_RESET); /* stored at the *old* token size! */
+			token_bits = INITIAL_BITS;
+			init_table();
+			return; /* since we're starting over, *don't* make a token */
+		} else {
+			token_bits++;
+		}
+		max_token = 1 << token_bits;
+	}
+	tokentab[curr_token].start = &input_buf[start];
+	tokentab[curr_token].length = end - start + 1;
+	curr_token++;
+}
+
+void crunch(void) {
+	int new_pos;
+	int token;
+
+	init_table();
+	out_bitpos = 0;
+	in_pos = 0;
+
+	/* 0-byte input files don't get a TOK_RESET */
+	if(input_len)
+		store_token(TOK_RESET);
+
+	while(in_pos < input_len) {
+		token = match_token(in_pos);
+		store_token(token);
+		new_pos = in_pos + tokentab[token].length;
+		if(new_pos < input_len)
+			make_token(in_pos, new_pos);
+		in_pos = new_pos;
+	}
+
+	store_token(TOK_END);
+	if(out_bitpos) inc_output_len();
+}
+
diff --git a/src/crunch.h b/src/crunch.h
new file mode 100644
index 0000000..4db2f1a
--- /dev/null
+++ b/src/crunch.h
@@ -0,0 +1,7 @@
+#define MAX_INPUT_SIZE (1 << 24)
+
+extern u8 input_buf[MAX_INPUT_SIZE];
+extern u8 output_buf[MAX_INPUT_SIZE];
+extern unsigned int input_len, output_len, out_bitpos;
+
+void crunch(void);
author	B. Watson <urchlay@slackware.uk>	2025-11-30 17:17:46 -0500
committer	B. Watson <urchlay@slackware.uk>	2025-11-30 17:17:46 -0500
commit	2b21c11d156e5c76c550b61a3b3a33a89d8578f9 (patch)
tree	df06d5bf78486717b34949971131b6d16044e4d3
parent	be8cd823dbd9b27889421bb1dc70217d39752663 (diff)
download	alftools-2b21c11d156e5c76c550b61a3b3a33a89d8578f9.tar.gz