aboutsummaryrefslogtreecommitdiff
path: root/src/crunch.c
diff options
context:
space:
mode:
authorB. Watson <urchlay@slackware.uk>2025-11-30 17:17:46 -0500
committerB. Watson <urchlay@slackware.uk>2025-11-30 17:17:46 -0500
commit2b21c11d156e5c76c550b61a3b3a33a89d8578f9 (patch)
treedf06d5bf78486717b34949971131b6d16044e4d3 /src/crunch.c
parentbe8cd823dbd9b27889421bb1dc70217d39752663 (diff)
downloadalftools-2b21c11d156e5c76c550b61a3b3a33a89d8578f9.tar.gz
alf: isolate crunch algo in its own file.
Diffstat (limited to 'src/crunch.c')
-rw-r--r--src/crunch.c164
1 files changed, 164 insertions, 0 deletions
diff --git a/src/crunch.c b/src/crunch.c
new file mode 100644
index 0000000..0cc8092
--- /dev/null
+++ b/src/crunch.c
@@ -0,0 +1,164 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "u816.h"
+#include "crunch.h"
+#include "self.h"
+
+#define INITIAL_BITS 9
+#define MAX_BITS 12
+#define MAX_TOKENS (1 << MAX_BITS)
+#define TOK_RESET 256
+#define TOK_END 257
+#define INIT_TOKEN 258
+
+u8 input_buf[MAX_INPUT_SIZE];
+u8 output_buf[MAX_INPUT_SIZE];
+unsigned int input_len, output_len, out_bitpos;
+
+u8 byte_tokens[256];
+typedef struct s_token {
+ u8 *start;
+ u16 length;
+} token_t;
+
+token_t tokentab[MAX_TOKENS];
+
+int token_bits;
+int max_token;
+int curr_token;
+int in_pos;
+
+void init_table(void) {
+ int i;
+
+ memset(tokentab, 0, sizeof(tokentab));
+
+ token_bits = INITIAL_BITS;
+ max_token = 1 << INITIAL_BITS;
+ curr_token = INIT_TOKEN;
+
+ for(i = 0; i < 256; i++) {
+ byte_tokens[i] = (u8)i;
+ tokentab[i].start = &byte_tokens[i];
+ tokentab[i].length = 1;
+ }
+}
+
+void inc_output_len(void) {
+ if(++output_len == MAX_INPUT_SIZE) {
+ fprintf(stderr, "%s: fatal: compressed file would be >16MB.\n", self);
+ exit(1);
+ }
+}
+
+void append_bit(int bit) {
+ output_buf[output_len] |= (bit << (7 - out_bitpos));
+ out_bitpos++;
+ if(out_bitpos == 8) {
+ out_bitpos = 0;
+ inc_output_len();
+ }
+}
+
+void store_token(int tok) {
+ int mask;
+
+ for(mask = 1 << (token_bits - 1); mask; mask >>= 1) {
+ append_bit(tok & mask ? 1 : 0);
+ }
+}
+
+/* match_token() is a brute-force search, which is why alf is so slow.
+ I'll do something smarter at some point.
+ search backwards, the tokens are stored with longer ones later
+ in the list. */
+int match_token(int pos) {
+ int i, len, maxlen;
+ token_t *t;
+ u8 *p, *q;
+
+ maxlen = input_len - pos;
+
+ for(i = curr_token - 1; i >= INIT_TOKEN; i--) {
+ t = &tokentab[i];
+
+ /* don't search past the end of the input */
+ if(t->length > maxlen) continue;
+
+ /* if the first char doesn't match, don't bother with memcmp.
+ this is a 5x speedup (!) */
+ if(input_buf[pos] != *(t->start)) continue;
+
+ /* this is where alf spends most of its time.
+ using memcmp is noticeably slower than the code below. */
+ /*
+ if(memcmp(&input_buf[pos], t->start, t->length) == 0)
+ return i;
+ */
+
+ /* inline memcmp replacement of sorts. I don't think it's really
+ faster than memcmp(), it only seems that way because there's
+ no function call overhead. ~20% speedup.
+ making it search backwards gives a further ~25% speedup.
+ */
+ len = t->length;
+ p = &input_buf[pos] + len - 1;
+ q = t->start + len - 1;
+ while(len) {
+ if(*p != *q) break;
+ p--; q--;
+ len--;
+ }
+ if(!len) return i;
+ }
+
+ /* hard-coded single character tokens map to their values, no need
+ to search. */
+ return input_buf[pos];
+}
+
+void make_token(int start, int end) {
+ /* if the token table is full, reset it. basically start over like
+ we would with a new file. */
+ if(curr_token == max_token) {
+ if(token_bits == MAX_BITS) {
+ store_token(TOK_RESET); /* stored at the *old* token size! */
+ token_bits = INITIAL_BITS;
+ init_table();
+ return; /* since we're starting over, *don't* make a token */
+ } else {
+ token_bits++;
+ }
+ max_token = 1 << token_bits;
+ }
+ tokentab[curr_token].start = &input_buf[start];
+ tokentab[curr_token].length = end - start + 1;
+ curr_token++;
+}
+
+void crunch(void) {
+ int new_pos;
+ int token;
+
+ init_table();
+ out_bitpos = 0;
+ in_pos = 0;
+
+ /* 0-byte input files don't get a TOK_RESET */
+ if(input_len)
+ store_token(TOK_RESET);
+
+ while(in_pos < input_len) {
+ token = match_token(in_pos);
+ store_token(token);
+ new_pos = in_pos + tokentab[token].length;
+ if(new_pos < input_len)
+ make_token(in_pos, new_pos);
+ in_pos = new_pos;
+ }
+
+ store_token(TOK_END);
+ if(out_bitpos) inc_output_len();
+}
+