From 542c4c433a2ca17856d3d229111dc36813f594d3 Mon Sep 17 00:00:00 2001 From: "B. Watson" Date: Fri, 4 Nov 2022 02:02:09 -0400 Subject: More micro-opts. --- dla.s | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) (limited to 'dla.s') diff --git a/dla.s b/dla.s index 229a9ef..68a13ba 100644 --- a/dla.s +++ b/dla.s @@ -507,60 +507,59 @@ oob: ;;; This is the innermost loop, so it should be as optimized as ;;; possible (we're not there yet). drunkwalk: - ; using bit/bmi/bvc saves 4.25 cycles on average, compared to + ; X holds the X coord the whole time, only needs to be loaded on entry. + ldx part_x ;3 + ; using bit/bmi/bvc saves 4.25 cycles on average, compared to ; immediate cmp and bne. - ; 4 code paths: up=16, down=17, left=17, right=15, avg=16.25. + ; 4 code paths (counting the 3-cycle ldy part_y at dwloop): up=16, down=17, left=17, right=15, avg=16.25. ; note that part_x and part_y are *never* zero; all the bne's here ; are "branch always". +dwloop: + ldy part_y ;3 bit RANDOM ;4 ; use top 2 bits (probably more random, definitely faster) bmi lr ;2/3 bvc down ;2/3 - dec part_y ;5 ; N=1 V=1 up + dey ;2 ; N=0 V=1 up bne checkbounds ;3 down: - inc part_y ;5 ; N=1 V=0 down + iny ;2 ; N=0 V=0 down bne checkbounds ;3 lr: - bvc right ;2/3 - dec part_x ;5 ; N=0 V=1 left + bvc right ;2/3 + dex ;2 ; N=1 V=1 left bne checkbounds ;3 right: - inc part_x ;5 + inx ;2 ; N=1 V=0 right checkbounds: ; all the "cmp #0" here get their operands modified by set_limits. - lda part_x selfmod_xmin = * + 1 - cmp #0 - beq oob + cpx #0 ; 2 + beq oob ; 2 (3 if taken) selfmod_xmax = * + 1 - cmp #0 - beq oob - ;sta cursor_x - lda part_y + cpx #0 ; 2 + beq oob ; 2 (3 if taken) selfmod_ymin = * + 1 - cmp #0 - beq oob + cpy #0 ; 2 + beq oob ; 2 (3 if taken) selfmod_ymax = * + 1 - cmp #0 - beq oob - ;sta cursor_y - ; checkbounds is 30 cycles when the pixel is in bounds. - ; I think we can beat this. + cpy #0 ; 2 + beq oob ; 2 (3 if taken) + ; checkbounds is 16 cycles when the pixel is in bounds. checkneigh: ; check neighbors. 
used to be a subroutine, inlined it. ; also inlined plotsetup here. - ldx part_y - lda lineaddrs_l,x + sty part_y ; 3 + lda lineaddrs_l,y sta pixptr - lda lineaddrs_h,x + lda lineaddrs_h,y sta pixptr+1 ; 3/4 of the time, we can use a faster code path, check ; (-1,0) and (1,0) at the same time. this happens only when ; both pixels lie within the same byte. - ldx part_x + ;ldx part_x ; X already has this from before lda fastmasks,x beq slow_x ldy xoffsets,x @@ -603,9 +602,10 @@ pp1ok: lda (pixptr),y and pixmask bne stick - jmp drunkwalk ; too far for a branch + jmp dwloop ; too far for branch. X *still* holds (maybe modified) part_x stick: ; we always get here with Z flag clear + stx part_x ; only update part_x at exit. rts ;;; Subroutine: drawseed -- cgit v1.2.3