1 files changed, 516 insertions, 0 deletions
diff --git a/bigint48.s b/bigint48.s
new file mode 100644
index 0000000..1971af2
--- /dev/null
+++ b/bigint48.s
@@ -0,0 +1,516 @@
+
+; 48-bit integer bignum implementation for taipan.
+; the Atari 800/XL/XE build doesn't use this, it's only for
+; the 5200 (though the A8 build can be tested with it).
+
+; .include "atari.inc" ; don't include atari.inc on the 5200, I'll get confused.
+ FR0 = $d4 ; this is the only label we need from atari.inc
+
+ .export _big_copy, _ulong_to_big, _big_cmp, _big_to_ulong, _big_add, _big_sub, _big_negate
+ .export _big_div, _big_mul
+ .import popax, pushax
+ .importzp ptr1, ptr2, ptr3, tmp1
+
+ ; used by _big_div
+ numerator = FR0
+ denominator = FR0+6
+ result = FR0+12
+ quotient = FR0+18
+ halfnum = FR0+24
+ newdenom = FR0+30
+ ;sign = FR0+36
+
+ ; used by _big_mul
+ multiplicand = FR0
+ multiplier = FR0+6
+ ;result = FR0+12 ; same as above
+
+ start = *
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; extern char __cdecl__ big_mul(bignump dest, bignump multiplicand, bignump multiplier);
+; basically, init result to 0, shift the multiplicand right, add the multiplier
+; to the result if a 1 shifted out. Then shift the multiplier left, repeat.
+; 1010 x 0101 (10 times 5):
+; result = 0;
+; 0101 >> 1, shifted out a 1, so result += 1010, then shift the 1010 left to get 10100.
+; 010 >> 1, shifted out a 0, so result stays same, then shift the 10100 left to get 101000
+; 01 >> 1, shifted out a 1, so result += 101000, then shift the 101000 left to get 1010000
+; 0 >> 1, shifted out a 0, result stays same.
+; result ends up 1010 + 101000 or 50.
+; the only PITA here is extending out to 48 bits. No overflow detection,
+; hey, C doesn't detect int overflows either so why should I worry?
+_big_mul:
+ jsr popax
+ sta ptr1
+ stx ptr1+1
+ jsr popax
+ sta ptr2
+ stx ptr2+1
+
+ ldy #5
+@p1lp:
+ lda (ptr1),y
+ sta multiplier,y
+ lda (ptr2),y
+ sta multiplicand,y
+ lda #0
+ sta result,y
+ dey
+ bpl @p1lp
+
+ ldy #48 ; bit count
+@shiftlp:
+ ; for each bit, shift the multiplicand right...
+ lsr multiplicand+5
+ ror multiplicand+4
+ ror multiplicand+3
+ ror multiplicand+2
+ ror multiplicand+1
+ ror multiplicand
+ bcc @dont_add ; if we shifted out a 0, don't add multiplier
+
+ ; if we shifted out a 1, add the multiplier.
+ clc
+ lda multiplier
+ adc result
+ sta result
+ lda multiplier+1
+ adc result+1
+ sta result+1
+ lda multiplier+2
+ adc result+2
+ sta result+2
+ lda multiplier+3
+ adc result+3
+ sta result+3
+ lda multiplier+4
+ adc result+4
+ sta result+4
+ lda multiplier+5
+ adc result+5
+ sta result+5
+
+ ; unconditionally shift the multiplier left
+@dont_add:
+ asl multiplier
+ rol multiplier+1
+ rol multiplier+2
+ rol multiplier+3
+ rol multiplier+4
+ rol multiplier+5
+
+ ; are we done?
+ dey
+ bne @shiftlp ; no, do next bit
+
+ jmp mul_div_done ; yes, store result.
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; extern char __cdecl__ big_div(bignump dest, bignump dividend, bignump divisor);
+; dividend and divisor AKA numerator and denominator.
+; this eats a ton of zero page, but we can afford it.
+
+; wrote this in C and translated to asm by hand. The C is very asm-like,
+; and would make your professor cry (or curse). The resulting asm is
+; a bloated mess, but still smaller than if I'd compiled the C code
+; with cc65. There's lots of room for improvement.
+
+;;int divide(int num, int denom) {
+;;   int result, quotient, newdenom, halfnum;
+
+_big_div:
+ jsr popax ; get dividend (denominator)
+ sta ptr1
+ stx ptr1+1
+ jsr popax ; get divisor (numerator)
+ sta ptr2
+ stx ptr2+1
+ ; leave dest argument on the stack.
+
+;;   result = 0;
+ ldy #5 ; copy values to ZP
+@bdcp:
+ lda (ptr1),y
+ sta denominator,y
+ lda (ptr2),y
+ sta numerator,y
+ lda #0
+ sta result,y
+ dey
+ bpl @bdcp
+ ; at this point we are done with ptr1 and ptr2, so we don't
+ ; have to save them before calling our other functions.
+
+ ; don't need this.
+;;  ; deal with signs.
+;;  ;lda #0 ; A already 0 from loop above
+;;  sta sign
+;;  ldx #denominator
+;;  jsr fixsign
+;;  ldx #numerator
+;;  jsr fixsign
+
+;;outerloop:
+;;   newdenom = denom;
+@outerloop:
+ ldx #5
+@copy_and_zero:
+ lda denominator,x
+ sta newdenom,x
+ lda #0
+ sta quotient,x
+ dex
+ bpl @copy_and_zero
+
+;;   quotient = 1;
+ inc quotient ; other bytes were zeroed in loop above.
+
+;;   if(newdenom < num)
+;;      goto checkequal;
+ lda #newdenom
+ ldx #0
+ jsr pushax
+ lda #numerator
+ ldx #0
+ jsr _big_cmp ; Z flag for equal, N=1 less, N=0 greater
+;;   if(newdenom < num)
+;;      goto innerprep;
+ bmi @innerprep
+;;   if(newdenom == num)
+;;      goto innerprep;
+ beq @innerprep
+
+;;   quotient = 0;
+;;   num = 0;
+;;   goto addquot;
+ lda #0
+ sta quotient
+ sta numerator
+ sta numerator+1
+ sta numerator+2
+ sta numerator+3
+ sta numerator+4
+ sta numerator+5
+ beq @addquot
+
+;;innerprep:
+;;   halfnum = num >> 1;
+@innerprep:
+ lda numerator+5
+ lsr
+ sta halfnum+5
+ lda numerator+4
+ ror
+ sta halfnum+4
+ lda numerator+3
+ ror
+ sta halfnum+3
+ lda numerator+2
+ ror
+ sta halfnum+2
+ lda numerator+1
+ ror
+ sta halfnum+1
+ lda numerator
+ ror
+ sta halfnum
+
+;;innerloop:
+;;   if(newdenom > halfnum)
+;;      goto innerdone;
+@innerloop:
+ lda #newdenom
+ ldx #0
+ jsr pushax
+ lda #halfnum
+ ldx #0
+ jsr _big_cmp
+ beq @notgt
+ bmi @notgt
+ bpl @innerdone
+@notgt:
+
+;;   newdenom <<= 1;
+ asl newdenom
+ rol newdenom+1
+ rol newdenom+2
+ rol newdenom+3
+ rol newdenom+4
+ rol newdenom+5
+
+;;   quotient <<= 1;
+ asl quotient
+ rol quotient+1
+ rol quotient+2
+ rol quotient+3
+ rol quotient+4
+ rol quotient+5
+
+;;   goto innerloop;
+ jmp @innerloop
+
+;;innerdone:
+;;   num -= newdenom;
+@innerdone:
+ lda #numerator
+ ldx #0
+ jsr pushax
+ lda #numerator
+ ldx #0
+ jsr pushax
+ lda #newdenom
+ ldx #0
+ jsr pushax
+ jsr _big_sub
+
+;;addquot:
+;;   result += quotient;
+@addquot:
+ lda #result
+ ldx #0
+ jsr pushax
+ lda #result
+ ldx #0
+ jsr pushax
+ lda #quotient
+ ldx #0
+ jsr pushax
+ jsr _big_add
+
+;;   if(num) goto outerloop;
+ lda numerator
+ ora numerator+1
+ ora numerator+2
+ ora numerator+3
+ ora numerator+4
+ ora numerator+5
+ beq mul_div_done
+ jmp @outerloop ; too far for relative branch
+
+;;   return result;
+mul_div_done:
+; destination is still on the stack.
+ jsr popax
+ sta ptr1
+ stx ptr1+1
+ ldy #5
+@doneloop:
+ lda result,y
+ sta (ptr1),y
+ dey
+ bpl @doneloop
+
+ rts
+
+;;}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; extern void __fastcall__ big_copy(bignump dest, bignump src);
+_big_copy:
+ sta ptr1
+ stx ptr1+1
+ jsr popax
+ sta ptr2
+ stx ptr2+1
+ ldy #5
+@bclp:
+ lda (ptr1),y
+ sta (ptr2),y
+ dey
+ bpl @bclp
+ rts
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; extern void __fastcall__ ulong_to_big(const unsigned long l, bignump b);
+_ulong_to_big:
+ ; ptr1 = b
+ sta ptr1
+ stx ptr1+1
+
+ ; zero out top 16 bits of b
+ ldy #4
+ lda #0
+ sta (ptr1),y 
+ iny
+ sta (ptr1),y
+
+ ; copy bottom 16 bits of l to bottom 16 bits of b
+ jsr popax
+ ldy #0    ; popax eats the Y reg
+ sta (ptr1),y
+ iny
+ txa       ; sadly there's no stx (zp),y on the 6502
+ sta (ptr1),y
+
+ ; copy top 16 bits of l to middle 16 bits of b
+ jsr popax
+ ldy #2
+ sta (ptr1),y
+ iny
+ txa
+ sta (ptr1),y
+ rts
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; TODO: consolidate with big_copy(), they're almost identical.
+; extern char __fastcall__ big_to_ulong(bignump b, unsigned long *l);
+_big_to_ulong:
+ sta ptr2
+ stx ptr2+1
+ jsr popax
+ sta ptr1
+ stx ptr1+1
+ ldy #3
+@bclp:
+ lda (ptr1),y
+ sta (ptr2),y
+ dey
+ bpl @bclp
+ rts
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; extern signed char __fastcall__ big_cmp(bignump a, bignump b);
+_big_cmp:
+ sta ptr2     ; ptr2 = b;
+ stx ptr2+1
+ jsr popax
+ sta ptr1     ; ptr1 = a;
+ stx ptr1+1
+
+ ; since this is a signed compare, we'll check the sign bits first.
+ ; if a is negative and b isn't, or vice versa, we shortcut the rest
+ ; of the compare.
+ ldy #5       ; point at the MSB
+ lda (ptr1),y ; get MSB of a
+ bpl @a_pos
+ lda (ptr2),y ; a is negative, is b?
+ bmi @cmplp   ; yep, do a normal compare
+ bpl @return_neg ; no, return negative result
+
+@a_pos:
+ lda (ptr2),y ; a is positive, is b?
+ bmi @return_pos ; no, return positive result
+              ; yes, fall thru and do a normal compare
+
+ ; Y is still 5 no matter how we got here.
+@cmplp:
+ sec
+ lda (ptr1),y
+ sbc (ptr2),y ; A = a[Y] - b[Y];
+ bne @unequal ; if non-zero subtraction result, go check carry
+ dey          ; else got zero, keep comparing
+ bpl @cmplp   ; more bytes to check?
+ bmi @return_a ; no. note: A still zero here
+@unequal:
+ bcs @return_pos
+@return_neg:
+ lda #$ff     ; negative result, return -1
+ .byte $2c    ; skip next 2 bytes
+@return_pos:
+ lda #$01     ; positive result, return 1 (actually $0101)
+@return_a:
+ tax ; sign-extend, the optimizer gets wonky about char vs int returns
+ rts
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; extern void __fastcall__ big_negate(bignump b);
+; 2's complement negation: invert all the bits, then add 1
+; TODO: this should be smaller.
+_big_negate:
+ sta ptr1
+ stx ptr1+1
+
+ ldy #0
+@invloop:
+ lda (ptr1),y
+ eor #$ff
+ sta (ptr1),y
+ iny
+ cpy #6
+ bne @invloop
+
+ ldy #0
+ sec
+ php
+@addloop:
+ plp
+ lda (ptr1),y 
+ adc #0
+ php
+ sta (ptr1),y
+ iny
+ cpy #6
+ bne @addloop
+
+ plp
+ rts
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; extern char __cdecl__ big_add(bignump dest, bignump addend1, bignump addend2);
+; extern char __cdecl__ big_sub(bignump dest, bignump minuend, bignump subtrahend);
+; addition and subtraction are almost identical, so the code here does both.
+; TODO: this is not as compact as it could be.
+_big_sub:
+ lda #$80
+ .byte $2c
+_big_add:
+ lda #0
+ sta tmp1 ; tmp1 is negative if subtracting, positive if adding
+
+ jsr popax ; addend2 or subtrahend
+ sta ptr2
+ stx ptr2+1
+ jsr popax ; addend1 or minuend
+ sta ptr1
+ stx ptr1+1
+ jsr popax ; dest
+ sta ptr3
+ stx ptr3+1
+
+ ldy #0
+
+ bit tmp1
+ bmi @setc ; set carry if subtracting...
+ clc       ; otherwise clear it
+ .byte $24 ; bit ZP, skip only 1 byte
+@setc:
+ sec
+
+ php ; we have to keep the C flag on the stack since the cpy below trashes it
+@addlp:
+ plp
+ lda (ptr1),y
+ bit tmp1
+ bmi @sub
+ adc (ptr2),y
+ .byte $2c
+
+@sub:
+ sbc (ptr2),y
+
+@store:
+ sta (ptr3),y
+ php
+ iny
+ cpy #6
+ bne @addlp
+ plp
+ rts
+
+ .out .sprintf("bigint48 code is %d bytes", *-start)
+
+; was going to be a helper routine for big_mul and big_div, but
+; taipan doesn't need signed multiply/divide.
+;; fixsign:
+;;  lda 0+5,x
+;;  bpl @pos
+;;  lda #$80
+;;  eor sign
+;;  sta sign
+;;  txa
+;;  ldx #0
+;;  jsr _big_negate
+;; @pos:
+;;  rts
+