aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorB. Watson <urchlay@slackware.uk>2022-11-05 18:44:44 -0400
committerB. Watson <urchlay@slackware.uk>2022-11-05 18:44:44 -0400
commit63ca0adbebe136dbcedfd203399fe959a99b4f90 (patch)
tree62d4fbc440139e0af94915c56000ba7b8759655b
parent34ffde01e321b9a433cc6ecc32a6b8a7adebce7c (diff)
downloaddla-asm-63ca0adbebe136dbcedfd203399fe959a99b4f90.tar.gz
Unroll drunkwalk some, v0.1.2, save ~25s runtime.
-rw-r--r--dla.s234
1 files changed, 165 insertions, 69 deletions
diff --git a/dla.s b/dla.s
index 48a6031..785f3e9 100644
--- a/dla.s
+++ b/dla.s
@@ -505,55 +505,53 @@ oob:
drunkwalk:
; X holds the X coord the whole time, only needs to be loaded on entry.
; preload pixptr, too.
- ldx part_x
- ldy part_y ;3
+ ldx part_x ; 3
+ ldy part_y ; 3
lda lineaddrs_l,y ; 5
sta pixptr ; 3
lda lineaddrs_h,y ; 5
sta pixptr+1 ; 3
- ; using bit/bmi/bvc saves 5.25 cycles on average, compared to
- ; immediate cmp and bne.
- ; 4 code paths: up=15, down=18, left=19, right=17, avg=17.25.
+ ; 4 code paths: TODO: count
; note that part_x and part_y are *never* zero; all the bne's here
; are "branch always".
; all the "cmp #0" here get their operands modified by set_limits.
dwloop:
- ldy part_y ; 3
- bit RANDOM ;4 ; use top 2 bits (probably more random, definitely faster)
- bmi lr ;2/3
- bvc down ;2/3
- dey ;2 ; N=1 V=1 up
+ ldy part_y ; 3
+ bit RANDOM ; 4 ; use top 2 bits (probably more random, definitely faster)
+ bmi lr ; 2/3
+ bvc down ; 2/3
+ dey ; 2 ; N=0 V=1 up
selfmod_ymin = * + 1
- cpy #0 ; 2
- beq oob ; 2
- bne checkneigh ;3
+ cpy #0 ; 2
+ beq oob ; 2
+ jmp check_lru ; 3 ; moved up: check left/right/up neighbors only
down:
- iny ;2 ; N=1 V=0 down
+ iny ; 2 ; N=0 V=0 down
selfmod_ymax = * + 1
- cpy #0 ; 2
- beq oob ; 2
- bne checkneigh ;3
+ cpy #0 ; 2
+ beq oob ; 2
+ jmp check_lrd ; 3 ; moved down: check left/right/down neighbors only
lr:
- bvc right ;2/3
- dex ;3 ; N=0 V=1 left
+ bvc right ; 2/3
+ dex ; 2 ; N=1 V=1 left
selfmod_xmin = * + 1
- cpx #0 ; 2
- beq oob ; 2
- ldy xoffsets-1,x ; moved left, check left X neighbor only.
- lda xmasks-1,x ; right X neighbor definitely empty, because
- and (pixptr),y ; we just moved out of that cell.
- bne stick
- beq check_y ; 3 ; still have to check Y (up/down) neighbors.
+ cpx #0 ; 2
+ beq oob ; 2
+ ldy xoffsets-1,x ; 4 ; moved left, check left X neighbor only.
+ lda xmasks-1,x ; 4 ; right X neighbor definitely empty, because
+ and (pixptr),y ; 5 ; we just moved out of that cell.
+ bne stick ; 2/3
+ beq check_ud ; 3 ; still have to check Y (up/down) neighbors.
right:
- inx ;3 ; N=0 V=0 right
+ inx ; 2 ; N=1 V=0 right
selfmod_xmax = * + 1
- cpx #0 ; 2
- beq oob ; 2
- ldy xoffsets+1,x ; as above, moved right, check right neighbor only.
- lda xmasks+1,x
- and (pixptr),y
- bne stick
- beq check_y ; 3
+ cpx #0 ; 2
+ beq oob ; 2
+ ldy xoffsets+1,x ; 4 ; as above, moved right, check right X neighbor only.
+ lda xmasks+1,x ; 4
+ and (pixptr),y ; 5
+ bne stick ; 2/3
+ beq check_ud ; 3 ; still check Y neighbors.
checkneigh:
; check neighbors. used to be a subroutine, inlined it.
@@ -569,55 +567,153 @@ checkneigh:
; (-1,0) and (1,0) at the same time. this happens only when
; both pixels lie within the same byte.
;ldx part_x ; X already has this from before
- lda fastmasks,x
- beq slow_x
- ldy xoffsets,x
- and (pixptr),y
- bne stick
- beq check_y
+ lda fastmasks,x ; 4
+ beq slow_x ; 2/3
+ ldy xoffsets,x ; 4
+ and (pixptr),y ; 5
+ bne stick ; 2/3
+ beq check_ud ; 3 ; always taken (Z set: bne above fell through)
slow_x:
; (-1,0)
- ldy xoffsets-1,x
- lda xmasks-1,x
- and (pixptr),y
- bne stick
+ ldy xoffsets-1,x ; 4
+ lda xmasks-1,x ; 4
+ and (pixptr),y ; 5
+ bne stick ; 2/3
; (1,0)
- ldy xoffsets+1,x
- lda xmasks+1,x
- and (pixptr),y
- bne stick
+ ldy xoffsets+1,x ; 4
+ lda xmasks+1,x ; 4
+ and (pixptr),y ; 5
+ bne stick ; 2/3
-check_y:
+check_ud:
; this happens no matter what direction the pixel moved.
; (0,-1)
; subtract 32 (one line) from the pointer. one cycle faster
; than reloading from lineaddrs_l/h table.
- lda pixptr ; 3
- sec ; 2
- sbc #$20 ; 2
- sta pixptr2 ; 3
- lda pixptr+1 ; 3
- sbc #0 ; 2
- sta pixptr2+1 ; 3
+ lda pixptr ; 3
+ sec ; 2
+ sbc #$20 ; 2
+ sta pixptr2 ; 3
+ lda pixptr+1 ; 3
+ sbc #0 ; 2
+ sta pixptr2+1 ; 3
;ldx part_x ; X already has this from before
- ldy xoffsets,x
- lda xmasks,x
- sta pixmask
- and (pixptr2),y
- bne stick
+ ldy xoffsets,x ; 4
+ lda xmasks,x ; 4
+ sta pixmask ; 3
+ and (pixptr2),y ; 5
+ bne stick ; 2/3
; (0,1)
- tya
- ora #$40 ; add 64, AKA 2 screen lines
- tay
- lda (pixptr2),y
- and pixmask
- bne stick
- jmp dwloop ; too far for a branch
+ tya ; 2
+ ora #$40 ; 2 ; add 64, AKA 2 screen lines
+ tay ; 2
+ lda (pixptr2),y ; 5
+ and pixmask ; 3
+ bne stick ; 2/3
+ jmp dwloop ; 3 ; too far for a branch
stick: ; we always get here with Z flag clear
stx part_x ; only update part_x at exit.
rts
+check_lru:
+ sty part_y ; 3
+ lda lineaddrs_l,y ; 5
+ sta pixptr ; 3
+ lda lineaddrs_h,y ; 5
+ sta pixptr+1 ; 3
+
+ ; 3/4 of the time, we can use a faster code path, check
+ ; (-1,0) and (1,0) at the same time. this happens only when
+ ; both pixels lie within the same byte.
+ ;ldx part_x ; X already has this from before
+ lda fastmasks,x ; 4
+ beq slow_x_lru ; 2/3
+ ldy xoffsets,x ; 4
+ and (pixptr),y ; 5
+ bne stick ; 2/3
+ beq check_u ; 3 ; always taken (Z set: bne above fell through)
+slow_x_lru:
+ ; (-1,0)
+ ldy xoffsets-1,x ; 4
+ lda xmasks-1,x ; 4
+ and (pixptr),y ; 5
+ bne stick ; 2/3
+ ; (1,0)
+ ldy xoffsets+1,x ; 4
+ lda xmasks+1,x ; 4
+ and (pixptr),y ; 5
+ bne stick ; 2/3
+
+check_u:
+ ; (0,-1)
+ ; subtract 32 (one line) from the pointer. one cycle faster
+ ; than reloading from lineaddrs_l/h table.
+ lda pixptr ; 3
+ sec ; 2
+ sbc #$20 ; 2
+ sta pixptr2 ; 3
+ lda pixptr+1 ; 3
+ sbc #0 ; 2
+ sta pixptr2+1 ; 3
+ ;ldx part_x ; X already has this from before
+ ldy xoffsets,x ; 4
+ lda xmasks,x ; 4
+ ;sta pixmask ; 3
+ and (pixptr2),y ; 5
+ bne stick ; 2/3
+ jmp dwloop ; 3 ; too far for a branch
+
+
+check_lrd:
+ sty part_y ; 3
+ lda lineaddrs_l,y ; 5
+ sta pixptr ; 3
+ lda lineaddrs_h,y ; 5
+ sta pixptr+1 ; 3
+
+ ; 3/4 of the time, we can use a faster code path, check
+ ; (-1,0) and (1,0) at the same time. this happens only when
+ ; both pixels lie within the same byte.
+ ;ldx part_x ; X already has this from before
+ lda fastmasks,x ; 4
+ beq slow_x_lrd ; 2/3
+ ldy xoffsets,x ; 4
+ and (pixptr),y ; 5
+ bne stick ; 2/3
+ beq check_d ; 3 ; always taken (Z set: bne above fell through)
+slow_x_lrd:
+ ; (-1,0)
+ ldy xoffsets-1,x ; 4
+ lda xmasks-1,x ; 4
+ and (pixptr),y ; 5
+ bne stick ; 2/3
+ ; (1,0)
+ ldy xoffsets+1,x ; 4
+ lda xmasks+1,x ; 4
+ and (pixptr),y ; 5
+ bne stick ; 2/3
+
+check_d:
+ ; (0,1): add 32 (one line) to the pointer to reach the row below.
+ lda pixptr ; 3
+ clc ; 2
+ adc #$20 ; 2
+ sta pixptr2 ; 3
+ lda pixptr+1 ; 3
+ adc #0 ; 2
+ sta pixptr2+1 ; 3
+ ;ldx part_x ; X already has this from before
+ ldy xoffsets,x ; 4
+ lda xmasks,x ; 4
+ ;sta pixmask ; 3
+ and (pixptr2),y ; 5
+ bne stick2 ; 2/3
+ jmp dwloop ; 3 ; too far for a branch
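+stick2: ; same exit as stick; presumably duplicated because stick is out of branch range from here (verify)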
+ stx part_x ; only update part_x at exit.
+ rts
+
;;; Subroutine: drawseed
;;; dispatch to appropriate seed subroutine
drawseed:
@@ -772,7 +868,7 @@ ci_done:
; banner and saveprompt must start with a clear-screen code.
banner:
.byte $7d, "Diffusion Limited Aggregate",$9b
- .byte "Urchlay's ASM version 0.1.1",$9b,$9b
+ .byte "Urchlay's ASM version 0.1.2",$9b,$9b
.byte "Particle count range: 1 to 65535",$9b
.byte "How many particles [",$0