From 542c4c433a2ca17856d3d229111dc36813f594d3 Mon Sep 17 00:00:00 2001
From: "B. Watson" <urchlay@slackware.uk>
Date: Fri, 4 Nov 2022 02:02:09 -0400
Subject: More micro-opts.

---
 dla.s | 52 ++++++++++++++++++++++++++--------------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

(limited to 'dla.s')

diff --git a/dla.s b/dla.s
index 229a9ef..68a13ba 100644
--- a/dla.s
+++ b/dla.s
@@ -507,60 +507,59 @@ oob:
 ;;; This is the innermost loop, so it should be as optimized as
 ;;; possible (we're not there yet).
 drunkwalk:
- ; using bit/bmi/bvc saves 4.25 cycles on average, compared to
+ ; X carries the X coord for the whole walk; part_x is read only here.
+ ldx part_x ;3
+ ; using bit/bmi/bvc saves cycles on average (figure needs re-measuring), compared to
  ; immediate cmp and bne.
- ; 4 code paths: up=16, down=17, left=17, right=15, avg=16.25.
+ ; 4 code paths (+3 for the ldy): up=13, down=14, left=14, right=12, avg=13.25.
  ; note that part_x and part_y are *never* zero; all the bne's here
  ; are "branch always".
+dwloop:
+ ldy part_y ;3 ; reload every pass: Y gets reused as an index further down
  bit RANDOM ;4 ; use top 2 bits (probably more random, definitely faster)
  bmi lr     ;2/3
  bvc down   ;2/3
- dec part_y ;5 ; N=1 V=1 up
+ dey        ;2 ; N=0 V=1 up
  bne checkbounds ;3
 down:
- inc part_y ;5 ; N=1 V=0 down
+ iny        ;2 ; N=0 V=0 down
  bne checkbounds ;3
 lr:
- bvc right ;2/3
- dec part_x ;5 ; N=0 V=1 left
+ bvc right  ;2/3
+ dex        ;2 ; N=1 V=1 left
  bne checkbounds ;3
 right:
- inc part_x ;5
+ inx        ;2 ; N=1 V=0 right
 
 checkbounds:
  ; all the "cmp #0" here get their operands modified by set_limits.
- lda part_x
 selfmod_xmin = * + 1
- cmp #0
- beq oob
+ cpx #0               ; 2 ; X coord is already in X, no load needed
+ beq oob              ; 2/3 (2 when in bounds, branch not taken)
 selfmod_xmax = * + 1
- cmp #0
- beq oob
- ;sta cursor_x
- lda part_y
+ cpx #0               ; 2
+ beq oob              ; 2/3
 selfmod_ymin = * + 1
- cmp #0
- beq oob
+ cpy #0               ; 2 ; Y coord is already in Y, no load needed
+ beq oob              ; 2/3
 selfmod_ymax = * + 1
- cmp #0
- beq oob
- ;sta cursor_y
- ; checkbounds is 30 cycles when the pixel is in bounds.
- ; I think we can beat this.
+ cpy #0               ; 2
+ beq oob              ; 2/3
+ ; checkbounds is 16 cycles when the pixel is in bounds (4 compares + 4 untaken beq).
 
 checkneigh:
  ; check neighbors. used to be a subroutine, inlined it.
  ; also inlined plotsetup here.
- ldx part_y
- lda lineaddrs_l,x
+ sty part_y           ; 3 ; save the in-bounds Y now: ldy xoffsets,x below overwrites Y
+ lda lineaddrs_l,y    ; pixptr = lineaddrs[Y], low byte
  sta pixptr
- lda lineaddrs_h,x
+ lda lineaddrs_h,y    ; pixptr = lineaddrs[Y], high byte
  sta pixptr+1

  ; 3/4 of the time, we can use a faster code path, check
  ; (-1,0) and (1,0) at the same time. this happens only when
  ; both pixels lie within the same byte.
- ldx part_x
+ ;ldx part_x ; not needed: X has held the X coord since drunkwalk entry
  lda fastmasks,x
  beq slow_x
  ldy xoffsets,x
@@ -603,9 +602,10 @@ pp1ok:
  lda (pixptr),y
  and pixmask
  bne stick
- jmp drunkwalk ; too far for a branch
+ jmp dwloop ; too far for a branch. skips the ldx: X holds the live X coord, part_x is stale

 stick: ; we always get here with Z flag clear
+ stx part_x ; write X back only at exit. stx changes no flags, so Z stays clear
  rts
 
 ;;; Subroutine: drawseed
-- 
cgit v1.2.3