From 63ca0adbebe136dbcedfd203399fe959a99b4f90 Mon Sep 17 00:00:00 2001 From: "B. Watson" Date: Sat, 5 Nov 2022 18:44:44 -0400 Subject: Unroll drunkwalk some, v0.1.2, save ~25s runtime. --- dla.s | 234 ++++++++++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 165 insertions(+), 69 deletions(-) diff --git a/dla.s b/dla.s index 48a6031..785f3e9 100644 --- a/dla.s +++ b/dla.s @@ -505,55 +505,53 @@ oob: drunkwalk: ; X holds the X coord the whole time, only needs to be loaded on entry. ; preload pixptr, too. - ldx part_x - ldy part_y ;3 + ldx part_x ; 3 + ldy part_y ; 3 lda lineaddrs_l,y ; 5 sta pixptr ; 3 lda lineaddrs_h,y ; 5 sta pixptr+1 ; 3 - ; using bit/bmi/bvc saves 5.25 cycles on average, compared to - ; immediate cmp and bne. - ; 4 code paths: up=15, down=18, left=19, right=17, avg=17.25. + ; 4 code paths: TODO: count ; note that part_x and part_y are *never* zero; all the bne's here ; are "branch always". ; all the "cmp #0" here get their operands modified by set_limits. dwloop: - ldy part_y ; 3 - bit RANDOM ;4 ; use top 2 bits (probably more random, definitely faster) - bmi lr ;2/3 - bvc down ;2/3 - dey ;2 ; N=1 V=1 up + ldy part_y ; 3 + bit RANDOM ; 4 ; use top 2 bits (probably more random, definitely faster) + bmi lr ; 2/3 + bvc down ; 2/3 + dey ; 2 ; N=0 V=1 up selfmod_ymin = * + 1 - cpy #0 ; 2 - beq oob ; 2 - bne checkneigh ;3 + cpy #0 ; 2 + beq oob ; 2 + jmp check_lru down: - iny ;2 ; N=1 V=0 down + iny ; 2 ; N=0 V=0 down selfmod_ymax = * + 1 - cpy #0 ; 2 - beq oob ; 2 - bne checkneigh ;3 + cpy #0 ; 2 + beq oob ; 2 + jmp check_lrd lr: - bvc right ;2/3 - dex ;3 ; N=0 V=1 left + bvc right ; 2/3 + dex ; 2 ; N=1 V=1 left selfmod_xmin = * + 1 - cpx #0 ; 2 - beq oob ; 2 - ldy xoffsets-1,x ; moved left, check left X neighbor only. - lda xmasks-1,x ; right X neighbor definitely empty, because - and (pixptr),y ; we just moved out of that cell. - bne stick - beq check_y ; 3 ; still have to check Y (up/down) neighbors. 
+ cpx #0 ; 2 + beq oob ; 2 + ldy xoffsets-1,x ; 4 ; moved left, check left X neighbor only. + lda xmasks-1,x ; 4 ; right X neighbor definitely empty, because + and (pixptr),y ; 5 ; we just moved out of that cell. + bne stick ; 2/3 + beq check_ud ; 3 ; still have to check Y (up/down) neighbors. right: - inx ;3 ; N=0 V=0 right + inx ; 2 ; N=1 V=0 right selfmod_xmax = * + 1 - cpx #0 ; 2 - beq oob ; 2 - ldy xoffsets+1,x ; as above, moved right, check right neighbor only. - lda xmasks+1,x - and (pixptr),y - bne stick - beq check_y ; 3 + cpx #0 ; 2 + beq oob ; 2 + ldy xoffsets+1,x ; 4 ; as above, moved right, check right X neighbor only. + lda xmasks+1,x ; 4 + and (pixptr),y ; 5 + bne stick ; 2/3 + beq check_ud ; 3 ; still check Y neighbors. checkneigh: ; check neighbors. used to be a subroutine, inlined it. @@ -569,55 +567,153 @@ checkneigh: ; (-1,0) and (1,0) at the same time. this happens only when ; both pixels lie within the same byte. ;ldx part_x ; X already has this from before - lda fastmasks,x - beq slow_x - ldy xoffsets,x - and (pixptr),y - bne stick - beq check_y + lda fastmasks,x ; 4 + beq slow_x ; 2/3 + ldy xoffsets,x ; 4 + and (pixptr),y ; 5 + bne stick ; 2/3 + beq check_ud ; 2/3 slow_x: ; (-1,0) - ldy xoffsets-1,x - lda xmasks-1,x - and (pixptr),y - bne stick + ldy xoffsets-1,x ; 4 + lda xmasks-1,x ; 4 + and (pixptr),y ; 5 + bne stick ; 2/3 ; (1,0) - ldy xoffsets+1,x - lda xmasks+1,x - and (pixptr),y - bne stick + ldy xoffsets+1,x ; 4 + lda xmasks+1,x ; 4 + and (pixptr),y ; 5 + bne stick ; 2/3 -check_y: +check_ud: ; this happens no matter what direction the pixel moved. ; (0,-1) ; subtract 32 (one line) from the pointer. one cycle faster ; than reloading from lineaddrs_l/h table. 
- lda pixptr ; 3 - sec ; 2 - sbc #$20 ; 2 - sta pixptr2 ; 3 - lda pixptr+1 ; 3 - sbc #0 ; 2 - sta pixptr2+1 ; 3 + lda pixptr ; 3 + sec ; 2 + sbc #$20 ; 2 + sta pixptr2 ; 3 + lda pixptr+1 ; 3 + sbc #0 ; 2 + sta pixptr2+1 ; 3 ;ldx part_x ; X already has this from before - ldy xoffsets,x - lda xmasks,x - sta pixmask - and (pixptr2),y - bne stick + ldy xoffsets,x ; 4 + lda xmasks,x ; 4 + sta pixmask ; 3 + and (pixptr2),y ; 5 + bne stick ; 2/3 ; (0,1) - tya - ora #$40 ; add 64, AKA 2 screen lines - tay - lda (pixptr2),y - and pixmask - bne stick - jmp dwloop ; too far for a branch + tya ; 2 + ora #$40 ; 2 ; add 64, AKA 2 screen lines + tay ; 2 + lda (pixptr2),y ; 5 + and pixmask ; 3 + bne stick ; 2/3 + jmp dwloop ; 3 ; too far for a branch stick: ; we always get here with Z flag clear stx part_x ; only update part_x at exit. rts +check_lru: + sty part_y ; 3 + lda lineaddrs_l,y ; 5 + sta pixptr ; 3 + lda lineaddrs_h,y ; 5 + sta pixptr+1 ; 3 + + ; 3/4 of the time, we can use a faster code path, check + ; (-1,0) and (1,0) at the same time. this happens only when + ; both pixels lie within the same byte. + ;ldx part_x ; X already has this from before + lda fastmasks,x ; 4 + beq slow_x_lru ; 2/3 + ldy xoffsets,x ; 4 + and (pixptr),y ; 5 + bne stick ; 2/3 + beq check_u ; 2/3 +slow_x_lru: + ; (-1,0) + ldy xoffsets-1,x ; 4 + lda xmasks-1,x ; 4 + and (pixptr),y ; 5 + bne stick ; 2/3 + ; (1,0) + ldy xoffsets+1,x ; 4 + lda xmasks+1,x ; 4 + and (pixptr),y ; 5 + bne stick ; 2/3 + +check_u: + ; (0,-1) + ; subtract 32 (one line) from the pointer. one cycle faster + ; than reloading from lineaddrs_l/h table. 
+ lda pixptr ; 3 + sec ; 2 + sbc #$20 ; 2 + sta pixptr2 ; 3 + lda pixptr+1 ; 3 + sbc #0 ; 2 + sta pixptr2+1 ; 3 + ;ldx part_x ; X already has this from before + ldy xoffsets,x ; 4 + lda xmasks,x ; 4 + ;sta pixmask ; 3 + and (pixptr2),y ; 5 + bne stick ; 2/3 + jmp dwloop ; 3 ; too far for a branch + + +check_lrd: + sty part_y ; 3 + lda lineaddrs_l,y ; 5 + sta pixptr ; 3 + lda lineaddrs_h,y ; 5 + sta pixptr+1 ; 3 + + ; 3/4 of the time, we can use a faster code path, check + ; (-1,0) and (1,0) at the same time. this happens only when + ; both pixels lie within the same byte. + ;ldx part_x ; X already has this from before + lda fastmasks,x ; 4 + beq slow_x_lrd ; 2/3 + ldy xoffsets,x ; 4 + and (pixptr),y ; 5 + bne stick ; 2/3 + beq check_d ; 2/3 +slow_x_lrd: + ; (-1,0) + ldy xoffsets-1,x ; 4 + lda xmasks-1,x ; 4 + and (pixptr),y ; 5 + bne stick ; 2/3 + ; (1,0) + ldy xoffsets+1,x ; 4 + lda xmasks+1,x ; 4 + and (pixptr),y ; 5 + bne stick ; 2/3 + +check_d: + ; (0,1) + lda pixptr ; 3 + clc ; 2 + adc #$20 ; 2 + sta pixptr2 ; 3 + lda pixptr+1 ; 3 + adc #0 ; 2 + sta pixptr2+1 ; 3 + ;ldx part_x ; X already has this from before + ldy xoffsets,x ; 4 + lda xmasks,x ; 4 + ;sta pixmask ; 3 + and (pixptr2),y ; 5 + bne stick2 ; 2/3 + jmp dwloop ; 3 ; too far for a branch +stick2: + stx part_x ; only update part_x at exit. + rts + ;;; Subroutine: drawseed ;;; dispatch to appropriate seed subroutine drawseed: @@ -772,7 +868,7 @@ ci_done: ; banner and saveprompt must start with a clear-screen code. banner: .byte $7d, "Diffusion Limited Aggregate",$9b - .byte "Urchlay's ASM version 0.1.1",$9b,$9b + .byte "Urchlay's ASM version 0.1.2",$9b,$9b .byte "Particle count range: 1 to 65535",$9b .byte "How many particles [",$0 -- cgit v1.2.3