dictionary text compression, 7666 bytes free

author: B. Watson <yalhcru@gmail.com> 2016-02-25 16:51:09 -0500
committer: B. Watson <yalhcru@gmail.com> 2016-02-25 16:51:09 -0500
commit: 9dc7267a351b79ac5b855e812520362f28922341 (patch)
tree: b40e8e2144358752b990f1c94091fda0880e3eb7 /textdecomp.s
parent: c9c027bb8d620eb3f5066440067327b7d932a7e1 (diff)
download: taipan-9dc7267a351b79ac5b855e812520362f28922341.tar.gz
1 files changed, 132 insertions, 1 deletions
diff --git a/textdecomp.s b/textdecomp.s
index a19d460..da0206a 100644
--- a/textdecomp.s
+++ b/textdecomp.s
@@ -12,6 +12,82 @@
  bitcount = FR0+1 ; counts 8..1, current bit in inbyte
  inbyte = FR0+2
  ysave = FR0+3
+ dict_escape = FR0+4
+
+ .rodata
+; one or two letter words are not worth listing here. 3 is only good
+; if it's used pretty often.
+; entry 0 is a dummy! The encoder gets confused by "Z\0". This may get fixed.
+; dictionary size cannot exceed 255 bytes.
+; the quoted stuff in comments is read by messages.pl, it needs to be exact.
+dict00: ; dummy entry 0, shares dict01's address. every entry below must end with a complete 6-bit zero code
+dict01: .byte $98, $9d, $73, $54, $53, $80 ; "Li Yuen", 4 occurrences
+dict02: .byte $7c, $c1, $05, $4b, $57, $12, $3d, $42, $05, $48, $00 ; "Elder Brother", 3
+dict03: .byte $64, $f5, $40 ; "you", 30
+dict04: .byte $d7, $c1, $4d, $00 ; " 'em", 8
+dict05: .byte $cc, $f5, $40 ; "You", 16
+dict06: .byte $d4, $80, $56, $14, $00 ; " have", 11
+dict07: .byte $d5, $32, $01, $30, $c0, $00 ; " shall", 6
+dict08: .byte $fb, $5c, $49, $50, $8d, $40 ; ") With ", 2
+dict09: .byte $05, $21, $cf, $00 ; "argo", 6
+dict10: .byte $4c, $82, $50, $00 ; "ship", 10
+dict11: .byte $d5, $70, $52, $14, $83, $d5, $4c, $50, $00 ; " warehouse", 4
+dict12: .byte $d5, $42, $05, $00 ; " the", 17
+dict13: .byte $d4, $f1, $80 ; " of", 14
+dict14: .byte $5c, $93, $0c, $00 ; "will", 8
+dict15: .byte $d4, $21, $45, $3b, $50, $00 ; " been ", 6
+dict16: .byte $d5, $43, $f5, $00 ; " to ", 12
+dict17: .byte $20, $14, $f5, $00 ; "has ", 7
+dict18: .byte $18, $f4, $b5, $00 ; "for ", 7
+dict19: .byte $25, $3d, $40 ; "is ", 9
+dict20: .byte $04, $e1, $00 ; "and", 10
+dict21: .byte $d4, $30, $53, $20, $00 ; " cash", 8
+dict22: .byte $04, $41, $09, $50, $93, $ce, $04, $cd, $40 ; "additional ", 3
+dict23: .byte $b8, $12, $50, $04, $e0, $00 ; "Taipan", 3 (but really many more!)
+dict24: .byte $d4, $f3, $8c, $67, $50, $00 ; " only ", 3 (7 codes = 42 bits, so a 6th byte is needed to finish the terminator)
+dict25: .byte $d4, $25, $47, $1c, $54, $93, $00 ; " buggers", 3
+dict26: .byte $5c, $95, $08, $d4, $00 ; "with ", 4
+;dict27: .byte $78, $fd, $40 ; "Do ", 4
+
+dict_offsets: ; one byte per entry: offset of dictNN from dict00, indexed by dictionary ID (dict_lookup adds it to dict00)
+ .byte dict00 - dict00 ; dummy entry 0
+ .byte dict01 - dict00
+ .byte dict02 - dict00
+ .byte dict03 - dict00
+ .byte dict04 - dict00
+ .byte dict05 - dict00
+ .byte dict06 - dict00
+ .byte dict07 - dict00
+ .byte dict08 - dict00
+ .byte dict09 - dict00
+ .byte dict10 - dict00
+ .byte dict11 - dict00
+ .byte dict12 - dict00
+ .byte dict13 - dict00
+ .byte dict14 - dict00
+ .byte dict15 - dict00
+ .byte dict16 - dict00
+ .byte dict17 - dict00
+ .byte dict18 - dict00
+ .byte dict19 - dict00
+ .byte dict20 - dict00
+ .byte dict21 - dict00
+ .byte dict22 - dict00
+ .byte dict23 - dict00
+ .byte dict24 - dict00
+ .byte dict25 - dict00
+ .byte dict26 - dict00
+ ;.byte dict27 - dict00 (disabled together with the commented-out dict27 entry)
+
+; rough estimate of how many bytes are saved by the dictionary
+; stuff: the dictionary + extra decoder stuff costs 221 bytes (vs.
+; the original textdecomp.s without dictionary).
+; each dictionary entry saves (length - 2) * (occurrences - 1) bytes.
+; with only dict00 - dict23, we'll save around 173 bytes.
+; actually it works out to 179 bytes, but the estimate was close.
+
+ dictsize = * - dict00
+ .out .sprintf("dictionary plus dict_offsets is %d bytes", dictsize)
 
  .rodata
 table: ; outbyte values 53..63
@@ -24,11 +100,12 @@ table: ; outbyte values 53..63
   .code
  .endif
 
-; extern void __fastcall__ print_msg(char *msg);
+; extern void __fastcall__ print_msg(const char *msg);
 _print_msg:
  sta srcptr
  stx srcptr+1
  lda #0
+ sta dict_escape
  sta outbyte
  ldy #$ff ; since we increment it first thing...
 
@@ -54,6 +131,13 @@ _print_msg:
  rts ; 0 = end of message
 
 @notend:
+ ldx dict_escape ; was last character a Z?
+ beq @normalchar
+
+ jsr dict_lookup
+ jmp @noprint
+
+@normalchar:
  cmp #27
  bcs @notlower
  adc #'a'-1 ; 1-26 are a-z
@@ -61,6 +145,11 @@ _print_msg:
 
 @notlower:
  cmp #52
+ bne @notdict
+ inc dict_escape ; Z means next 6 bits are dictionary ID
+ bne @noprint
+
+@notdict:
  bcs @notupper
  adc #38 ; 27-52 are A-Z
  bne @printit
@@ -74,6 +163,7 @@ _print_msg:
  sty ysave ; _cputc trashes Y
  jsr _cputc
  ldy ysave
+@noprint:
  lda #0
  sta outbyte
  ldx #6
@@ -81,6 +171,47 @@ _print_msg:
  beq @nextbyte
  bne @bitloop
 
+dict_lookup:
+ ; expand dictionary entry number outbyte by calling _print_msg on it. first save the outer message's decoder state on the stack
+ tya ; Y is pushed first, so it is pulled last below
+ pha
+ lda inbyte
+ pha
+ lda srcptr
+ pha
+ lda srcptr+1
+ pha
+ lda bitcount
+ pha
+
+ ; recursive call: A/X = dict00 + dict_offsets[outbyte] (A = low byte, X = high byte)
+ ldx outbyte ; the 6-bit code that followed the Z escape = dictionary ID
+ lda dict_offsets,x
+ clc
+ adc #<dict00
+ sta dict_escape ; temp usage (holds the low byte of the entry address)
+ lda #>dict00
+ adc #0 ; propagate the carry from the low-byte addition
+ tax
+ lda dict_escape
+ jsr _print_msg ; stores A/X into srcptr/srcptr+1 and zeroes dict_escape and outbyte on entry
+
+ ; restore old state (pulls mirror the pushes above, in reverse order)
+ lda #0
+ sta dict_escape ; escape is consumed: @notend will treat the next code as a normal character
+ pla
+ sta bitcount
+ pla
+ sta srcptr+1
+ pla
+ sta srcptr
+ pla
+ sta inbyte
+ pla
+ tay
+ rts
+
  decodersize = * - _print_msg
 
  .out .sprintf("print_msg() is %d bytes", decodersize + tablesize)
+ .out .sprintf("total textdecomp is %d bytes", decodersize + tablesize + dictsize)
author	B. Watson <yalhcru@gmail.com>	2016-02-25 16:51:09 -0500
committer	B. Watson <yalhcru@gmail.com>	2016-02-25 16:51:09 -0500
commit	9dc7267a351b79ac5b855e812520362f28922341 (patch)
tree	b40e8e2144358752b990f1c94091fda0880e3eb7 /textdecomp.s
parent	c9c027bb8d620eb3f5066440067327b7d932a7e1 (diff)
download	taipan-9dc7267a351b79ac5b855e812520362f28922341.tar.gz