#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "u816.h"
#include "crunch.h"
#include "self.h"

#define INITIAL_BITS 9
#define MAX_BITS 12
#define MAX_TOKENS (1 << MAX_BITS)

#define TOK_RESET 256
#define TOK_END 257
#define INIT_TOKEN 258

u8 input_buf[MAX_INPUT_SIZE];
u8 output_buf[MAX_INPUT_SIZE];
unsigned int input_len, output_len, out_bitpos;

u8 byte_tokens[256];

typedef struct s_token {
	u8 *start;
	u16 length;
} token_t;

token_t tokentab[MAX_TOKENS];
int token_bits;
int max_token;
int curr_token;
int in_pos;

/* (re)initialize the token table: 9-bit codes, the 256 hard-coded
   single-byte tokens, and INIT_TOKEN as the next token to create. */
void init_table(void)
{
	int i;

	memset(tokentab, 0, sizeof(tokentab));
	token_bits = INITIAL_BITS;
	max_token = 1 << INITIAL_BITS;
	curr_token = INIT_TOKEN;

	for(i = 0; i < 256; i++) {
		byte_tokens[i] = (u8)i;
		tokentab[i].start = &byte_tokens[i];
		tokentab[i].length = 1;
	}
}

/* count one more output byte; bail out if the result would overflow
   the output buffer. */
void inc_output_len(void)
{
	if(++output_len == MAX_INPUT_SIZE) {
		fprintf(stderr, "%s: fatal: compressed file would be >16MB.\n", self);
		exit(1);
	}
}

/* append one bit to the output stream, MSB first. */
void append_bit(int bit)
{
	output_buf[output_len] |= (bit << (7 - out_bitpos));
	out_bitpos++;
	if(out_bitpos == 8) {
		out_bitpos = 0;
		inc_output_len();
	}
}

/* emit a token at the current code width, high bit first. */
void store_token(int tok)
{
	int mask;

	for(mask = 1 << (token_bits - 1); mask; mask >>= 1) {
		append_bit(tok & mask ? 1 : 0);
	}
}

/* match_token() is a brute-force search, which is why alf is so slow.
   I'll do something smarter at some point.

   search backwards; the tokens are stored with longer ones later in
   the list. */
int match_token(int pos)
{
	int i, len, maxlen;
	token_t *t;
	u8 *p, *q;

	maxlen = input_len - pos;
	for(i = curr_token - 1; i >= INIT_TOKEN; i--) {
		t = &tokentab[i];

		/* don't search past the end of the input */
		if(t->length > maxlen) continue;

		/* if the first char doesn't match, don't bother with
		   memcmp. this is a 5x speedup (!) */
		if(input_buf[pos] != *(t->start)) continue;

		/* this is where alf spends most of its time. using memcmp
		   is noticeably slower than the code below. */
		/*
		if(memcmp(&input_buf[pos], t->start, t->length) == 0)
			return i;
		*/

		/* inline memcmp replacement of sorts. I don't think it's
		   really faster than memcmp(); it only seems that way
		   because there's no function-call overhead. ~20% speedup.
		   making it search backwards gives a further ~25% speedup. */
		len = t->length;
		p = &input_buf[pos] + len - 1;
		q = t->start + len - 1;
		while(len) {
			if(*p != *q) break;
			p--;
			q--;
			len--;
		}
		if(!len) return i;
	}

	/* the hard-coded single-character tokens map to their own
	   values, no need to search. */
	return input_buf[pos];
}

void make_token(int start, int end)
{
	/* if the token table is full, reset it. basically start over
	   like we would with a new file. */
	if(curr_token == max_token) {
		if(token_bits == MAX_BITS) {
			store_token(TOK_RESET); /* stored at the *old* token size! */
			token_bits = INITIAL_BITS;
			init_table();
			return; /* since we're starting over, *don't* make a token */
		} else {
			token_bits++;
		}
		max_token = 1 << token_bits;
	}

	tokentab[curr_token].start = &input_buf[start];
	tokentab[curr_token].length = end - start + 1;
	curr_token++;
}

/* compress input_buf[0..input_len) into output_buf / output_len. */
void crunch(void)
{
	int new_pos;
	int token;

	init_table();
	out_bitpos = 0;
	in_pos = 0;

	/* 0-byte input files don't get a TOK_RESET */
	if(input_len) store_token(TOK_RESET);

	while(in_pos < input_len) {
		token = match_token(in_pos);
		store_token(token);
		new_pos = in_pos + tokentab[token].length;
		if(new_pos < input_len) make_token(in_pos, new_pos);
		in_pos = new_pos;
	}

	store_token(TOK_END);
	if(out_bitpos) inc_output_len();
}
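
/* for reference, a minimal sketch of the matching bit reader: codes
   are packed MSB-first at the current code width, exactly as
   append_bit() and store_token() above write them. nothing in this
   block is part of alf's real decruncher; the names are made up for
   illustration and it only compiles when the guard macro is defined. */
#ifdef CRUNCH_BITREADER_SKETCH
static unsigned int rd_pos;	/* byte position in the stream */
static unsigned int rd_bitpos;	/* bit within that byte, 0 = MSB */

int read_token(const u8 *buf, int bits)
{
	int tok = 0;

	/* pull 'bits' bits, high bit first, mirroring store_token() */
	while(bits--) {
		tok = (tok << 1) | ((buf[rd_pos] >> (7 - rd_bitpos)) & 1);
		if(++rd_bitpos == 8) {
			rd_bitpos = 0;
			rd_pos++;
		}
	}
	return tok;
}
#endif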
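
/* a minimal sketch of driving this module, assuming the real entry
   point lives elsewhere in alf (self.h / crunch.h are the actual
   interface). everything here is hypothetical and only compiled when
   CRUNCH_EXAMPLE_MAIN is defined: read a file into input_buf, crunch
   it, and write output_buf to stdout. */
#ifdef CRUNCH_EXAMPLE_MAIN
char *self = "crunch";	/* normally defined by alf's real main(); type assumed */

int main(int argc, char **argv)
{
	FILE *f;
	size_t n;

	if(argc != 2) {
		fprintf(stderr, "usage: %s file\n", self);
		return 1;
	}
	if(!(f = fopen(argv[1], "rb"))) {
		perror(argv[1]);
		return 1;
	}
	/* note: input past MAX_INPUT_SIZE is silently truncated here */
	n = fread(input_buf, 1, MAX_INPUT_SIZE, f);
	fclose(f);
	input_len = (unsigned int)n;

	crunch();
	if(fwrite(output_buf, 1, output_len, stdout) != output_len) {
		perror("stdout");
		return 1;
	}
	return 0;
}
#endif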