; 48-bit integer bignum implementation for taipan. ; the Atari 800/XL/XE build doesn't use this, it's only for ; the 5200 (though the A8 build can be tested with it). ; .include "atari.inc" ; don't include atari.inc on the 5200, ; I'll get confused even if ca65 doesn't. FR0 = $d4 ; this is the only label we need from atari.inc. on ; the 5200, $d4 and up are free zero page. .export _big_copy, _ulong_to_big, _big_cmp, _big_to_ulong, _big_add, _big_sub, _big_negate .export _big_div, _big_mul .import popax, pushax .importzp ptr1, ptr2, ptr3, tmp1 ; used by _big_div numerator = FR0 denominator = FR0+6 result = FR0+12 quotient = FR0+18 halfnum = FR0+24 newdenom = FR0+30 ;sign = FR0+36 ; used by _big_mul multiplicand = FR0 multiplier = FR0+6 ;result = FR0+12 ; same as above start = * ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; extern char __cdecl__ big_mul(bignump dest, bignump multiplicand, bignump multiplier); ; basically, init result to 0, shift the multiplicand right, add the multiplier ; to the result if a 1 shifted out. Then shift the multiplier left, repeat. ; 1010 x 0101 (10 times 5): ; result = 0; ; 0101 >> 1, shifted out a 1, so result += 1010, then shift the 1010 left to get 10100. ; 010 >> 1, shifted out a 0, so result stays same, then shift the 10100 left to get 101000 ; 01 >> 1, shifted out a 1, so result += 101000, then shift the 101000 left to get 1010000 ; 0 >> 1, shifted out a 0, result stays same. ; result ends up 1010 + 101000 or 50. ; the only PITA here is extending out to 48 bits. No overflow detection, ; hey, C doesn't detect int overflows either so why should I worry? _big_mul: jsr popax sta ptr1 stx ptr1+1 jsr popax sta ptr2 stx ptr2+1 ldy #5 @p1lp: lda (ptr1),y sta multiplier,y lda (ptr2),y sta multiplicand,y lda #0 sta result,y dey bpl @p1lp ldy #48 ; bit count @shiftlp: ; for each bit, shift the multiplicand right... lsr multiplicand+5 ror multiplicand+4 ror multiplicand+3 ror multiplicand+2 ror multiplicand+1 ror multiplicand bcc @dont_add ; if we shifted out a 0, don't add multiplier ; if we shifted out a 1, add the multiplier. clc lda multiplier adc result sta result lda multiplier+1 adc result+1 sta result+1 lda multiplier+2 adc result+2 sta result+2 lda multiplier+3 adc result+3 sta result+3 lda multiplier+4 adc result+4 sta result+4 lda multiplier+5 adc result+5 sta result+5 ; unconditionally shift the multiplier left @dont_add: asl multiplier rol multiplier+1 rol multiplier+2 rol multiplier+3 rol multiplier+4 rol multiplier+5 ; are we done? dey bne @shiftlp ; no, do next bit jmp mul_div_done ; yes, store result. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; extern char __cdecl__ big_div(bignump dest, bignump dividend, bignump divisor); ; dividend and divisor AKA numerator and denominator. ; this eats a ton of zero page, but we can afford it. ; wrote this in C and translated to asm by hand. The C is very asm-like, ; and would make your professor cry (or curse). The resulting asm is ; a bloated mess, but still smaller than if I'd compiled the C code ; with cc65. There's lots of room for improvement. ;;int divide(int num, int denom) { ;; int result, quotient, newdenom, halfnum; _big_div: jsr popax ; get dividend (denominator) sta ptr1 stx ptr1+1 jsr popax ; get divisor (numerator) sta ptr2 stx ptr2+1 ; leave dest argument on the stack. ;; result = 0; ldy #5 ; copy values to ZP @bdcp: lda (ptr1),y sta denominator,y lda (ptr2),y sta numerator,y lda #0 sta result,y dey bpl @bdcp ; at this point we are done with ptr1 and ptr2, so we don't ; have to save them before calling our other functions. ; don't need this. ;; ; deal with signs. ;; ;lda #0 ; A already 0 from loop above ;; sta sign ;; ldx #denominator ;; jsr fixsign ;; ldx #numerator ;; jsr fixsign ;;outerloop: ;; newdenom = denom; @outerloop: ldx #5 @copy_and_zero: lda denominator,x sta newdenom,x lda #0 sta quotient,x dex bpl @copy_and_zero ;; quotient = 1; inc quotient ; other bytes were zeroed in loop above. ;; if(newdenom < num) ;; goto checkequal; lda #newdenom ldx #0 jsr pushax lda #numerator ldx #0 jsr _big_cmp ; Z flag for equal, N=1 less, N=0 greater ;; if(newdenom < num) ;; goto innerprep; bmi @innerprep ;; if(newdenom == num) ;; goto innerprep; beq @innerprep ;; quotient = 0; ;; num = 0; ;; goto addquot; lda #0 sta quotient sta numerator sta numerator+1 sta numerator+2 sta numerator+3 sta numerator+4 sta numerator+5 beq @addquot ;;innerprep: ;; halfnum = num >> 1; @innerprep: lda numerator+5 lsr sta halfnum+5 lda numerator+4 ror sta halfnum+4 lda numerator+3 ror sta halfnum+3 lda numerator+2 ror sta halfnum+2 lda numerator+1 ror sta halfnum+1 lda numerator ror sta halfnum ;;innerloop: ;; if(newdenom > halfnum) ;; goto innerdone; @innerloop: lda #newdenom ldx #0 jsr pushax lda #halfnum ldx #0 jsr _big_cmp beq @notgt bmi @notgt bpl @innerdone @notgt: ;; newdenom <<= 1; asl newdenom rol newdenom+1 rol newdenom+2 rol newdenom+3 rol newdenom+4 rol newdenom+5 ;; quotient <<= 1; asl quotient rol quotient+1 rol quotient+2 rol quotient+3 rol quotient+4 rol quotient+5 ;; goto innerloop; jmp @innerloop ;;innerdone: ;; num -= newdenom; @innerdone: lda #numerator ldx #0 jsr pushax lda #numerator ldx #0 jsr pushax lda #newdenom ldx #0 jsr pushax jsr _big_sub ;;addquot: ;; result += quotient; @addquot: lda #result ldx #0 jsr pushax lda #result ldx #0 jsr pushax lda #quotient ldx #0 jsr pushax jsr _big_add ;; if(num) goto outerloop; lda numerator ora numerator+1 ora numerator+2 ora numerator+3 ora numerator+4 ora numerator+5 beq mul_div_done jmp @outerloop ; too far for relative branch ;; return result; mul_div_done: ; destination is still on the stack. jsr popax sta ptr1 stx ptr1+1 ldy #5 @doneloop: lda result,y sta (ptr1),y dey bpl @doneloop rts ;;} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; extern void __fastcall__ big_copy(bignump dest, bignump src); _big_copy: sta ptr1 stx ptr1+1 jsr popax sta ptr2 stx ptr2+1 ldy #5 @bclp: lda (ptr1),y sta (ptr2),y dey bpl @bclp rts ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; extern void __fastcall__ ulong_to_big(const unsigned long l, bignump b); _ulong_to_big: ; ptr1 = b sta ptr1 stx ptr1+1 ; zero out top 16 bits of b ldy #4 lda #0 sta (ptr1),y iny sta (ptr1),y ; copy bottom 16 bits of l to bottom 16 bits of b jsr popax ldy #0 ; popax eats the Y reg sta (ptr1),y iny txa ; sadly there's no stx (zp),y on the 6502 sta (ptr1),y ; copy top 16 bits of l to middle 16 bits of b jsr popax ldy #2 sta (ptr1),y iny txa sta (ptr1),y rts ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; TODO: consolidate with big_copy(), they're almost identical. ; extern char __fastcall__ big_to_ulong(bignump b, unsigned long *l); _big_to_ulong: sta ptr2 stx ptr2+1 jsr popax sta ptr1 stx ptr1+1 ldy #3 @bclp: lda (ptr1),y sta (ptr2),y dey bpl @bclp rts ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; extern signed char __fastcall__ big_cmp(bignump a, bignump b); _big_cmp: sta ptr2 ; ptr2 = b; stx ptr2+1 jsr popax sta ptr1 ; ptr1 = a; stx ptr1+1 ; since this is a signed compare, we'll check the sign bits first. ; if a is negative and b isn't, or vice versa, we shortcut the rest ; of the compare. ldy #5 ; point at the MSB lda (ptr1),y ; get MSB of a bpl @a_pos lda (ptr2),y ; a is negative, is b? bmi @cmplp ; yep, do a normal compare bpl @return_neg ; no, return negative result @a_pos: lda (ptr2),y ; a is positive, is b? bmi @return_pos ; no, return positive result ; yes, fall thru and do a normal compare ; Y is still 5 no matter how we got here. @cmplp: sec lda (ptr1),y sbc (ptr2),y ; A = a[Y] - b[Y]; bne @unequal ; if non-zero subtraction result, go check carry dey ; else got zero, keep comparing bpl @cmplp ; more bytes to check? bmi @return_a ; no. note: A still zero here @unequal: bcs @return_pos @return_neg: lda #$ff ; negative result, return -1 .byte $2c ; skip next 2 bytes @return_pos: lda #$01 ; positive result, return 1 (actually $0101) @return_a: tax ; sign-extend, the optimizer gets wonky about char vs int returns rts ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; extern void __fastcall__ big_negate(bignump b); ; 2's complement negation: invert all the bits, then add 1 ; TODO: this should be smaller. _big_negate: sta ptr1 stx ptr1+1 ldy #0 @invloop: lda (ptr1),y eor #$ff sta (ptr1),y iny cpy #6 bne @invloop ldy #0 sec php @addloop: plp lda (ptr1),y adc #0 php sta (ptr1),y iny cpy #6 bne @addloop plp rts ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; extern char __cdecl__ big_add(bignump dest, bignump addend1, bignump addend2); ; extern char __cdecl__ big_sub(bignump dest, bignump minuend, bignump subtrahend); ; addition and subtraction are almost identical, so the code here does both. ; TODO: this is not as compact as it could be. _big_sub: lda #$80 .byte $2c _big_add: lda #0 sta tmp1 ; tmp1 is negative if subtracting, positive if adding jsr popax ; addend2 or subtrahend sta ptr2 stx ptr2+1 jsr popax ; addend1 or minuend sta ptr1 stx ptr1+1 jsr popax ; dest sta ptr3 stx ptr3+1 ldy #0 bit tmp1 bmi @setc ; set carry if subtracting... clc ; otherwise clear it .byte $24 ; bit ZP, skip only 1 byte @setc: sec php ; we have to keep the C flag on the stack since the cpy below trashes it @addlp: plp lda (ptr1),y bit tmp1 bmi @sub adc (ptr2),y .byte $2c @sub: sbc (ptr2),y @store: sta (ptr3),y php iny cpy #6 bne @addlp plp rts .out .sprintf("bigint48 code is %d bytes", *-start) ; was going to be a helper routine for big_mul and big_div, but ; taipan doesn't need signed multiply/divide. ;; fixsign: ;; lda 0+5,x ;; bpl @pos ;; lda #$80 ;; eor sign ;; sta sign ;; txa ;; ldx #0 ;; jsr _big_negate ;; @pos: ;; rts