aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorB. Watson <urchlay@slackware.uk>2025-12-01 04:03:11 -0500
committerB. Watson <urchlay@slackware.uk>2025-12-01 04:03:11 -0500
commit4842123935ecdf55e322d8661366d55ad77ab2a8 (patch)
tree823b1d5c4fe2977bf7c0decdd729ff640fd2aca2 /src
parent2b21c11d156e5c76c550b61a3b3a33a89d8578f9 (diff)
downloadalftools-4842123935ecdf55e322d8661366d55ad77ab2a8.tar.gz
Replace brute-force match_token() with something smarter (and *much* faster, too).
Diffstat (limited to 'src')
-rw-r--r--src/crunch.c163
1 file changed, 93 insertions, 70 deletions
diff --git a/src/crunch.c b/src/crunch.c
index 0cc8092..0d56723 100644
--- a/src/crunch.c
+++ b/src/crunch.c
@@ -16,33 +16,38 @@ u8 input_buf[MAX_INPUT_SIZE];
u8 output_buf[MAX_INPUT_SIZE];
unsigned int input_len, output_len, out_bitpos;
-u8 byte_tokens[256];
typedef struct s_token {
- u8 *start;
- u16 length;
+ u8 chr;
+ u16 number;
+ struct s_token *sibling;
+ struct s_token *kids;
} token_t;
-token_t tokentab[MAX_TOKENS];
+token_t root_tokens[256];
+
+/* not used for lookups, just a fast way to free() everything */
+token_t *tokentab[MAX_TOKENS];
int token_bits;
int max_token;
-int curr_token;
+int curr_token = INIT_TOKEN;
int in_pos;
+int new_pos;
void init_table(void) {
int i;
- memset(tokentab, 0, sizeof(tokentab));
+ for(i = INIT_TOKEN; i < curr_token; i++)
+ free(tokentab[i]);
+
+ for(i = 0; i < 256; i++) {
+ root_tokens[i].chr = root_tokens[i].number = i;
+ root_tokens[i].kids = root_tokens[i].sibling = 0;
+ }
token_bits = INITIAL_BITS;
max_token = 1 << INITIAL_BITS;
curr_token = INIT_TOKEN;
-
- for(i = 0; i < 256; i++) {
- byte_tokens[i] = (u8)i;
- tokentab[i].start = &byte_tokens[i];
- tokentab[i].length = 1;
- }
}
void inc_output_len(void) {
@@ -69,56 +74,70 @@ void store_token(int tok) {
}
}
-/* match_token() is a brute-force search, which is why alf is so slow.
- I'll do something smarter at some point.
- search backwards, the tokens are stored with longer ones later
- in the list. */
-int match_token(int pos) {
- int i, len, maxlen;
- token_t *t;
- u8 *p, *q;
-
- maxlen = input_len - pos;
-
- for(i = curr_token - 1; i >= INIT_TOKEN; i--) {
- t = &tokentab[i];
-
- /* don't search past the end of the input */
- if(t->length > maxlen) continue;
-
- /* if the first char doesn't match, don't bother with memcmp.
- this is a 5x speedup (!) */
- if(input_buf[pos] != *(t->start)) continue;
-
- /* this is where alf spends most of its time.
- using memcmp is noticeably slower than the code below. */
- /*
- if(memcmp(&input_buf[pos], t->start, t->length) == 0)
- return i;
- */
-
- /* inline memcmp replacement of sorts. I don't think it's really
- faster than memcmp(), it only seems that way because there's
- no function call overhead. ~20% speedup.
- making it search backwards gives a further ~25% speedup.
- */
- len = t->length;
- p = &input_buf[pos] + len - 1;
- q = t->start + len - 1;
- while(len) {
- if(*p != *q) break;
- p--; q--;
- len--;
- }
- if(!len) return i;
+token_t *get_kid(token_t *t, u8 chr) {
+ token_t *kid;
+
+ if(!t->kids) return 0;
+
+ kid = t->kids;
+ while(kid) {
+ if(kid->chr == chr)
+ return kid;
+ kid = kid->sibling;
+ }
+ return 0;
+}
+
+token_t *match_token(int pos) {
+ token_t *t, *p;
+
+ t = &root_tokens[input_buf[pos]];
+ new_pos = pos + 1;
+ p = t;
+ while((p = get_kid(t, input_buf[new_pos]))) {
+ t = p;
+ new_pos++;
+ if(new_pos == input_len) break;
+ }
+ return t;
+}
+
+token_t *new_token(u8 chr) {
+ token_t *newtok;
+
+ newtok = malloc(sizeof(token_t));
+ if(!newtok) {
+ fprintf(stderr, "%s: fatal: out of memory!\n", self);
+ exit(1);
}
- /* hard-coded single character tokens map to their values, no need
- to search. */
- return input_buf[pos];
+ newtok->chr = chr;
+ newtok->kids = newtok->sibling = 0;
+ newtok->number = curr_token;
+
+ tokentab[curr_token] = newtok;
+
+ return newtok;
}
-void make_token(int start, int end) {
+void add_kid(token_t *oldtok, token_t *newtok) {
+ token_t *kid;
+
+ if(!oldtok->kids) {
+ oldtok->kids = newtok;
+ return;
+ }
+
+ kid = oldtok->kids;
+ while(kid->sibling)
+ kid = kid->sibling;
+
+ kid->sibling = newtok;
+}
+
+void make_token(token_t *oldtok, u8 newchr) {
+ token_t *newtok;
+
/* if the token table is full, reset it. basically start over like
we would with a new file. */
if(curr_token == max_token) {
@@ -132,33 +151,37 @@ void make_token(int start, int end) {
}
max_token = 1 << token_bits;
}
- tokentab[curr_token].start = &input_buf[start];
- tokentab[curr_token].length = end - start + 1;
+
+ newtok = new_token(newchr);
+ add_kid(oldtok, newtok);
curr_token++;
}
void crunch(void) {
- int new_pos;
- int token;
+ token_t *tok;
init_table();
out_bitpos = 0;
- in_pos = 0;
+ in_pos = new_pos = 0;
/* 0-byte input files don't get a TOK_RESET */
- if(input_len)
- store_token(TOK_RESET);
+ if(!input_len) {
+ store_token(TOK_END);
+ return;
+ }
+
+ store_token(TOK_RESET);
while(in_pos < input_len) {
- token = match_token(in_pos);
- store_token(token);
- new_pos = in_pos + tokentab[token].length;
+ tok = match_token(in_pos);
+ store_token(tok->number);
if(new_pos < input_len)
- make_token(in_pos, new_pos);
+ make_token(tok, input_buf[new_pos]);
in_pos = new_pos;
}
store_token(TOK_END);
if(out_bitpos) inc_output_len();
-}
+ init_table();
+}