aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- dla.s 52
1 files changed, 26 insertions, 26 deletions
diff --git a/dla.s b/dla.s
index 229a9ef..68a13ba 100644
--- a/dla.s
+++ b/dla.s
@@ -507,60 +507,59 @@ oob:
;;; This is the innermost loop, so it should be as optimized as
;;; possible (we're not there yet).
drunkwalk:
- ; using bit/bmi/bvc saves 4.25 cycles on average, compared to
+ ; X holds the X coord the whole time, only needs to be loaded on entry.
+ ldx part_x ;3
+ ; using bit/bmi/bvc saves about 5.25 cycles on average (author's figure -- TODO recheck against the path counts below), compared to
; immediate cmp and bne.
- ; 4 code paths: up=16, down=17, left=17, right=15, avg=16.25.
+ ; 4 code paths, counted from bit to arrival at checkbounds with same-page branches: up=13, down=14, left=14, right=12, avg=13.25 (add 3 to each for the ldy part_y at dwloop).
; note that part_x and part_y are *never* zero; all the bne's here
; are "branch always".
+dwloop:
+ ldy part_y ;3 ; Y = working Y coord, reloaded on every pass through dwloop
bit RANDOM ;4 ; use top 2 bits (probably more random, definitely faster)
bmi lr ;2/3
bvc down ;2/3
- dec part_y ;5 ; N=1 V=1 up
+ dey ;2 ; N=0 V=1 up (bmi above was not taken, so N=0)
bne checkbounds ;3
down:
- inc part_y ;5 ; N=1 V=0 down
+ iny ;2 ; N=0 V=0 down
bne checkbounds ;3
lr:
- bvc right ;2/3
- dec part_x ;5 ; N=0 V=1 left
+ bvc right ;2/3
+ dex ;2 ; N=1 V=1 left (bmi lr was taken, so N=1)
bne checkbounds ;3
right:
- inc part_x ;5
+ inx ;2 ; N=1 V=0 right
checkbounds:
; all the "cmp #0" here get their operands modified by set_limits.
- lda part_x
selfmod_xmin = * + 1
- cmp #0
- beq oob
+ cpx #0 ; 2 ; immediate operand byte is addressed by selfmod_xmin
+ beq oob ; 2 when not taken (in bounds), 3 when taken; equality test only, X changes by 1 per pass
selfmod_xmax = * + 1
- cmp #0
- beq oob
- ;sta cursor_x
- lda part_y
+ cpx #0 ; 2 ; immediate operand byte is addressed by selfmod_xmax
+ beq oob ; 2 when not taken, 3 when taken
selfmod_ymin = * + 1
- cmp #0
- beq oob
+ cpy #0 ; 2 ; immediate operand byte is addressed by selfmod_ymin
+ beq oob ; 2 when not taken, 3 when taken
selfmod_ymax = * + 1
- cmp #0
- beq oob
- ;sta cursor_y
- ; checkbounds is 30 cycles when the pixel is in bounds.
- ; I think we can beat this.
+ cpy #0 ; 2 ; immediate operand byte is addressed by selfmod_ymax
+ beq oob ; 2 when not taken, 3 when taken
+ ; checkbounds is 16 cycles when the pixel is in bounds: 4 compares at 2 cycles plus 4 untaken beq at 2 cycles.
checkneigh:
; check neighbors. used to be a subroutine, inlined it.
; also inlined plotsetup here.
- ldx part_y
- lda lineaddrs_l,x
+ sty part_y ; 3 ; commit Y now: the ldy xoffsets,x below overwrites Y, and dwloop reloads it from part_y
+ lda lineaddrs_l,y ; Y indexes the tables now (X stays reserved for the X coord); low byte goes to pixptr
sta pixptr
- lda lineaddrs_h,x
+ lda lineaddrs_h,y ; matching high byte, stored to pixptr+1
sta pixptr+1
; 3/4 of the time, we can use a faster code path, check
; (-1,0) and (1,0) at the same time. this happens only when
; both pixels lie within the same byte.
- ldx part_x
+ ;ldx part_x ; X already has this from before
lda fastmasks,x
beq slow_x
ldy xoffsets,x
@@ -603,9 +602,10 @@ pp1ok:
lda (pixptr),y
and pixmask
bne stick
- jmp drunkwalk ; too far for a branch
+ jmp dwloop ; too far for branch. X *still* holds (maybe modified) part_x
stick: ; we always get here with Z flag clear
+ stx part_x ; only update part_x at exit.
rts
;;; Subroutine: drawseed