;Usage example: divide the float at FP1 by the float at FP2.
;NOTE(review): FloatDiv is not defined in the code shown here; it is presumably
;an earlier name for FloatDiv_80 -- confirm before assembling.
ld de,FP2               ;DE -> second operand
ld hl,FP1               ;HL -> first operand
call FloatDiv
<<code>>
;Example operands: two leading bytes followed by eight mantissa bytes.
;NOTE(review): the leading bytes 0,2 are presumably the sign/exponent word of an
;older layout (the later example uses .dw 16384+1) -- confirm.
FP1:
;pi, not rounded up
.db 0,2
.db $c9,$0f,$da,$a2,$21,$68,$c2,$34
FP2:
;e, not rounded up
.db 0,2
.db $ad,$f8,$54,$58,$a2,$bb,$4a,$9A
The 80-bit division and multiplication routines do not handle overflow yet. On the to-do list:
Given what SirCmpwn did to AssemblyBandit when he tried to contribute to KnightOS CSE, I would be careful about contributing to it.
What happened to AssemblyBandit? O.O
First digit = 0 ('digits' are 8-bit ints, so on [0,255])
Now 8AD1/AC = CE, so 8AD176.00 - AC0980*0.CE = 8AD176-8A6FA5 = 61D1
Now 61D1/AC = 91, so 61D1.0000 - AC0980*.0091 = 61D1.0-6171.6180 = 5F.9E80
Now 5F9E/AC = 8E, so 5F.9E80 - AC0980*.00008E = 5F.9E8000-5F.6D4500 = .313B
In this case, there were no overestimates. We would have known, because the subtraction step would have yielded a negative result. To adjust for an overestimate, decrement the new digit by 1 and add AC0980 back to the remainder. So the example gives 8AD176/AC0980 = 0.CE918E, or in base 10, 9097590/11274624=.806908488274
Div_Sub:
;Unsigned division DE/C by shift-and-subtract, unrolled for 8 quotient bits.
;Inputs:
;  DE = dividend, C = divisor
;  Requires DE < C*256 (so the quotient fits in 8 bits) and C > 127
;Outputs:
;  E = quotient
;Destroys:
;  A, flags (A is the running remainder; the final step does not subtract,
;  so A is not a valid remainder on return)
;Each step shifts the next dividend bit into A. If the shift carried out of A,
;or A >= C, then C is subtracted and the quotient bit in E is set.
ld a,d
;bit 7
sla e
rla
jr c,Div_Sub_take1
cp c
jr c,Div_Sub_next1
Div_Sub_take1:
sub c
inc e
Div_Sub_next1:
;bit 6
sla e
rla
jr c,Div_Sub_take2
cp c
jr c,Div_Sub_next2
Div_Sub_take2:
sub c
inc e
Div_Sub_next2:
;bit 5
sla e
rla
jr c,Div_Sub_take3
cp c
jr c,Div_Sub_next3
Div_Sub_take3:
sub c
inc e
Div_Sub_next3:
;bit 4
sla e
rla
jr c,Div_Sub_take4
cp c
jr c,Div_Sub_next4
Div_Sub_take4:
sub c
inc e
Div_Sub_next4:
;bit 3
sla e
rla
jr c,Div_Sub_take5
cp c
jr c,Div_Sub_next5
Div_Sub_take5:
sub c
inc e
Div_Sub_next5:
;bit 2
sla e
rla
jr c,Div_Sub_take6
cp c
jr c,Div_Sub_next6
Div_Sub_take6:
sub c
inc e
Div_Sub_next6:
;bit 1
sla e
rla
jr c,Div_Sub_take7
cp c
jr c,Div_Sub_next7
Div_Sub_take7:
sub c
inc e
Div_Sub_next7:
;bit 0: only the quotient bit is needed, so the remainder is not updated.
;adc a,a is used instead of rla because it also sets the sign flag:
;if bit 7 of A is clear then A < 128 <= C and the bit is 0.
sla e
adc a,a
jr c,Div_Sub_last_set
ret p
cp c
ret c
Div_Sub_last_set:
inc e
ret
FloatDiv_80:
;Divides two 80-bit floats: (HL) / (DE) -> fpOP3
;Operand format (10 bytes):
; 1 bit sign + 15 bits signed exponent (16384 is exp = 0) (little endian)
; 64 bits mantissa, (big endian)
;Inputs:
; HL points to dividend
; DE points to divisor
;Outputs:
; fpOP3 holds the sign/exponent word of the quotient
; fpOP3+2 holds the 64-bit mantissa of the quotient (truncated, no rounding step)
;Notes:
; No checks are made for exponent overflow/underflow, zero, or other special values.
; Operands are assumed to be normalized (top mantissa bit set).
ex de,hl
call LoadFPOPs          ;fpOP1 = divisor, fpOP2 = dividend
;Quotient exponent = dividend exponent - divisor exponent + bias.
;The dividend is in fpOP2, so it must be the minuend.
ld hl,(fpOP2)           ;dividend sign/exponent
ld de,(fpOP1)           ;divisor sign/exponent
ld a,h
xor d                   ;bit 7 = sign of the quotient; xor also clears carry for the sbc below
push af
res 7,d
res 7,h
sbc hl,de
ld bc,16384
add hl,bc               ;put the bias back
pop af
and $80
or h
ld h,a
ld (fpOP3),hl
;Now perform the division of fpOP2/fpOP1
;The algo works like this:
; Compare the mantissa of fpOP2 against that of fpOP1.
; If it is at least as big, since fpOP1 should have bit 7 set (normalized numbers),
; it divides at most once. So the first digit is 1, subtract fpOP2-fpOP1->fpOP2
; After this, we repeatedly divide the upper two bytes of the remainder (fpOP2) by the
; first byte of fpOP1. This is to estimate how many times fpOP1 divides into the remainder.
; This is just a guesstimate: a digit can come out slightly too high. That shows up as a
; negative result in the subtraction step, which is corrected there.
;
; Example with smaller numbers. Take 8AD176/AC0980
; First digit = 0 ('digits' are 8-bit ints, so on [0,255])
; Now 8AD1/AC = CE, so 8AD176.00 - AC0980*0.CE = 8AD176-8A6FA5 = 61D1
; Now 61D1/AC = 91, so 61D1.0000 - AC0980*.0091 = 61D1.0-6171.6180 = 5F.9E80
; Now 5F9E/AC = 8E, so 5F.9E80 - AC0980*.00008E = 5F.9E8000-5F.6D4500 = .313B
; In this case, there were no overestimates. We would have known if the subtraction step
; yielded a negative output. To adjust this, decrement the new digit by 1 and add AC0980 back.
; So the example gives 8AD176/AC0980 = 0.CE918E, or in base 10, 9097590/11274624=.806908488274
;fpOP1+2 has denom
;fpOP2+2 has num
;Move the numerator mantissa (plus its 4 bytes of zero padding) down by 4 bytes so that
;there is room below it for the leading digit and above it for the sliding remainder.
ld de,fpOP2-2
ld hl,fpOP2+2
ldi \ ldi \ ldi
ldi \ ldi \ ldi
ldi \ ldi \ ldi
ldi \ ldi \ ldi
denom = fpOP1+2
numer = fpOP2-2
outp = numer-1
;Leading digit: 1 if numer >= denom (then subtract denom once), else 0.
ld hl,denom
ld de,numer
call cp_64b             ;carry set if numer < denom
ld hl,outp
ld (hl),0               ;ld does not disturb the carry from cp_64b
jr c,noadjust
inc (hl)
ex de,hl
inc de                  ;DE = numer
ld hl,denom
call sub_64b            ;numer = numer - denom; returns DE = numer
ex de,hl \ dec hl
noadjust:
inc hl                  ;HL = numer: top byte of the remainder window
ld de,numer+8           ;DE = bottom byte of the 9-byte remainder window
;Eight base-256 digits; each call stores its digit over the top byte of the window
;and slides the window down by one byte.
call div_sub_1
call div_sub_1
call div_sub_1
call div_sub_1
call div_sub_1
call div_sub_1
call div_sub_1
call div_sub_1
;Copy the digits to the result mantissa.
;These were the literals 801Eh and 800Bh, which equal fpOP3+2 and outp when
;fpOP1 = 8000h and fpOP3 = fpOP2+14.
;NOTE(review): confirm fpOP3 is defined as fpOP2+14 in the equates.
ld de,fpOP3+2
ld hl,outp
ld a,(hl)
rra                     ;carry = leading digit
jr nc,directcopy
;Leading digit was 1: shift the 65-bit quotient right by one so the 1 becomes
;the top mantissa bit. The exponent stays as computed.
inc hl \ ld a,(hl) \ rra \ ld (de),a \ inc de
inc hl \ ld a,(hl) \ rra \ ld (de),a \ inc de
inc hl \ ld a,(hl) \ rra \ ld (de),a \ inc de
inc hl \ ld a,(hl) \ rra \ ld (de),a \ inc de
inc hl \ ld a,(hl) \ rra \ ld (de),a \ inc de
inc hl \ ld a,(hl) \ rra \ ld (de),a \ inc de
inc hl \ ld a,(hl) \ rra \ ld (de),a \ inc de
inc hl \ ld a,(hl) \ rra \ ld (de),a \ ret
directcopy:
;Leading digit was 0: the eight digit bytes are the mantissa as-is, and the
;exponent is one less.
inc hl
ldi
ldi
ldi
ldi
ldi
ldi
ldi
ldi
ld hl,(fpOP3) \ dec hl \ ld (fpOP3),hl \ ret
div_sub_1:
;Computes one base-256 quotient digit and updates the running remainder.
;Inputs:
;  HL points to the top byte of the 9-byte remainder window
;  DE points to the bottom byte of that window
;  denom is the 8-byte big-endian denominator (top bit set)
;Outputs:
;  The digit replaces the top byte of the window; the 8 bytes after it hold the new remainder
;  HL and DE are each advanced by one byte, ready for the next digit
;Destroys:
;  A, BC
ld bc,(denom)           ;C = top byte of the denominator
ld a,(hl)
inc hl
push hl
ld l,(hl)
ld h,a
ex de,hl                ;DE = top two bytes of the remainder, HL = window bottom pointer
;Div_Sub requires DE < C*256. The remainder is smaller than the denominator, but
;their top bytes can still be equal, in which case DE/C does not fit in 8 bits and
;Div_Sub can return a digit that is far too small (e.g. FF80/FF gives 80h).
;The largest digit is the right estimate then; fused_mul_sub corrects overestimates.
cp c                    ;A still holds the top byte of the remainder
jr c,div_sub_1_estimate
ld e,255
jr div_sub_1_digit
div_sub_1_estimate:
call Div_Sub
div_sub_1_digit:
ld c,e                  ;C = trial digit
ex de,hl                ;DE = window bottom pointer again
call fused_mul_sub      ;returns DE = address of the stored digit
ld hl,9
add hl,de
ex de,hl                ;DE = bottom byte of the next window
pop hl                  ;HL = top byte of the next window
ret
fused_mul_sub:
;multiply denominator*C and subtract from the 9-byte remainder window
;Inputs:
;  C = trial quotient digit
;  DE points to the bottom (least significant) byte of the window
;  denom is the 8-byte big-endian denominator
;Outputs:
;  The 8 low bytes of the window hold the new remainder
;  The top byte of the window holds the final (corrected) digit
;  DE points to that digit
;Destroys:
;  A, B, HL
;The trial digit comes from dividing by only the top byte of the denominator, so
;it can be too high by more than one (Knuth, TAOCP 4.3.1, bounds it at two when
;the top bit of the divisor is set). The add-back below is therefore repeated
;until the remainder is no longer negative, rather than being done just once.
;
;Each 8x8 multiply leaves its product in HL; A carries the high byte plus borrow
;from one byte position into the next. B stays 0 until the multiplies are done.
xor a
;denom byte 7 (least significant)
ld hl,(denom+6) \ ld b,a \ ld l,b
sla h \ jr nc,$+3 \ ld l,c
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
ld a,(de) \ sub l \ ld (de),a \ dec de
ld a,h \ adc a,b
;denom byte 6
ld hl,(denom+5) \ ld l,b
sla h \ jr nc,$+3 \ ld l,c
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add a,l \ jr nc,$+3 \ inc h \ ld l,a
ld a,(de) \ sub l \ ld (de),a \ ld a,h \ adc a,b \ dec de
;denom byte 5
ld hl,(denom+4) \ ld l,b
sla h \ jr nc,$+3 \ ld l,c
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add a,l \ jr nc,$+3 \ inc h \ ld l,a
ld a,(de) \ sub l \ ld (de),a \ ld a,h \ adc a,b \ dec de
;denom byte 4
ld hl,(denom+3) \ ld l,b
sla h \ jr nc,$+3 \ ld l,c
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add a,l \ jr nc,$+3 \ inc h \ ld l,a
ld a,(de) \ sub l \ ld (de),a \ ld a,h \ adc a,b \ dec de
;denom byte 3
ld hl,(denom+2) \ ld l,b
sla h \ jr nc,$+3 \ ld l,c
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add a,l \ jr nc,$+3 \ inc h \ ld l,a
ld a,(de) \ sub l \ ld (de),a \ ld a,h \ adc a,b \ dec de
;denom byte 2
ld hl,(denom+1) \ ld l,b
sla h \ jr nc,$+3 \ ld l,c
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add a,l \ jr nc,$+3 \ inc h \ ld l,a
ld a,(de) \ sub l \ ld (de),a \ ld a,h \ adc a,b \ dec de
;denom byte 1
ld hl,(denom) \ ld l,b
sla h \ jr nc,$+3 \ ld l,c
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add a,l \ jr nc,$+3 \ inc h \ ld l,a
ld a,(de) \ sub l \ ld (de),a \ ld a,h \ adc a,b \ dec de
;denom byte 0 (most significant); the byte read into L from denom-1 is discarded
ld hl,(denom-1) \ ld l,b
sla h \ jr nc,$+3 \ ld l,c
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add hl,hl \ jr nc,$+3 \ add hl,bc
add a,l \ jr nc,$+3 \ inc h \ ld l,a
ld a,(de) \ sub l \ ld (de),a \ ld a,h \ dec de
;Top byte of the window minus the last high byte and borrow.
ld l,a
ld a,(de)
sbc a,l
;if c flag is set, overestimate
ld b,a                  ;keep the top byte of the remainder; B is free now (ld keeps flags)
ld a,c \ ld (de),a      ;the digit takes the place of the top byte
ret nc
fused_mul_sub_addback:
;Remainder is negative: add the denominator back and lower the digit by one.
ld hl,8
add hl,de
ex de,hl
ld hl,denom+7
ld a,(de) \ add a,(hl) \ ld (de),a \ dec hl \ dec de
ld a,(de) \ adc a,(hl) \ ld (de),a \ dec hl \ dec de
ld a,(de) \ adc a,(hl) \ ld (de),a \ dec hl \ dec de
ld a,(de) \ adc a,(hl) \ ld (de),a \ dec hl \ dec de
ld a,(de) \ adc a,(hl) \ ld (de),a \ dec hl \ dec de
ld a,(de) \ adc a,(hl) \ ld (de),a \ dec hl \ dec de
ld a,(de) \ adc a,(hl) \ ld (de),a \ dec hl \ dec de
ld a,(de) \ adc a,(hl) \ ld (de),a \ dec de
ex de,hl \ dec (hl) \ ex de,hl      ;8-bit dec leaves the carry from the last adc intact
;Fold that carry into the saved top byte. Zero means the remainder is now
;non-negative; anything else means the digit was still too high.
ld a,b \ adc a,0 \ ld b,a
jr nz,fused_mul_sub_addback
ret
;num+7 - hl
sub_64b:
;64-bit big-endian subtraction in memory: (DE) = (DE) - (HL)
;Inputs:
;  DE points to the first (most significant) byte of the minuend
;  HL points to the first (most significant) byte of the subtrahend
;Outputs:
;  The difference overwrites the 8 bytes at DE
;  Carry flag = final borrow
;  DE and HL are back at the first byte of their operands, BC = 7
;Destroys:
;  A
;Step both pointers to the least significant byte.
ld bc,7
add hl,bc
ex de,hl
add hl,bc
ex de,hl
;byte 7
ld a,(de)
sub (hl)
ld (de),a
dec de
dec hl
;byte 6
ld a,(de)
sbc a,(hl)
ld (de),a
dec de
dec hl
;byte 5
ld a,(de)
sbc a,(hl)
ld (de),a
dec de
dec hl
;byte 4
ld a,(de)
sbc a,(hl)
ld (de),a
dec de
dec hl
;byte 3
ld a,(de)
sbc a,(hl)
ld (de),a
dec de
dec hl
;byte 2
ld a,(de)
sbc a,(hl)
ld (de),a
dec de
dec hl
;byte 1
ld a,(de)
sbc a,(hl)
ld (de),a
dec de
dec hl
;byte 0
ld a,(de)
sbc a,(hl)
ld (de),a
ret
cp_64b:
;Compares two 64-bit big-endian integers in memory: (DE) against (HL)
;Inputs:
;  DE points to the first operand, HL to the second
;Outputs:
;  Flags are those of the first byte pair that differs (or of the last pair):
;    z if the values are equal, c if (DE) < (HL), nc if (DE) >= (HL)
;  DE and HL point at the byte pair that decided the result
;Destroys:
;  A
;byte 0
ld a,(de)
cp (hl)
ret nz
inc de
inc hl
;byte 1
ld a,(de)
cp (hl)
ret nz
inc de
inc hl
;byte 2
ld a,(de)
cp (hl)
ret nz
inc de
inc hl
;byte 3
ld a,(de)
cp (hl)
ret nz
inc de
inc hl
;byte 4
ld a,(de)
cp (hl)
ret nz
inc de
inc hl
;byte 5
ld a,(de)
cp (hl)
ret nz
inc de
inc hl
;byte 6
ld a,(de)
cp (hl)
ret nz
inc de
inc hl
;byte 7
ld a,(de)
cp (hl)
ret
LoadFPOPs:
;Copies two 10-byte floats into consecutive 14-byte slots starting at fpOP1.
;Each slot is the 10 bytes of the float followed by 4 zero bytes.
;Inputs:
;  HL points to the first float (goes to fpOP1)
;  DE points to the second float (goes to the slot right after the first)
;Outputs:
;  A = 0, DE points just past the second slot
;Destroys:
;  HL, BC (each ldi decrements BC)
push de
ld de,fpOP1
xor a
;first float: 10 bytes, then 4 bytes of zero padding
ldi \ ldi \ ldi \ ldi \ ldi
ldi \ ldi \ ldi \ ldi \ ldi
ld (de),a
inc de
ld (de),a
inc de
ld (de),a
inc de
ld (de),a
inc de
;second float: same layout, source pointer taken from the stack
pop hl
ldi \ ldi \ ldi \ ldi \ ldi
ldi \ ldi \ ldi \ ldi \ ldi
ld (de),a
inc de
ld (de),a
inc de
ld (de),a
inc de
ld (de),a
inc de
ret
.echo "Size:",$-Div_Sub
It is so large because I unrolled much of it. As an example, using e and pi:
ld hl,float_e
;DE -> divisor (pi); the jp lets FloatDiv_80 return straight to our caller.
ld de,float_pi
jp FloatDiv_80
;e/pi =0.dd816a76547ca9910802972996d4e3
;Constants: sign/exponent word (bias 16384), then the 64-bit mantissa, big endian.
float_pi:
;pi, not rounded up
.dw 16384+1
.db $c9,$0f,$da,$a2,$21,$68,$c2,$34
float_e:
;e, not rounded up
.dw 16384+1
.db $ad,$f8,$54,$58,$a2,$bb,$4a,$9A
clock cycles ops per sec, 6MHz
Add/Sub 1200 cc 5000
Multiplication 13000 cc 461
Division 19000 cc 315
Sqrt 108000 cc 55
Args used:
1.570796326794897
57.29577951308232
For example, 57.29577951308232/1.570796326794897
TI-OS Float80 diff ratio analysis
add/subtract 2758 3166 +408 1.1479 Add/sub is a bit slower, possibly noticeably
multiply 35587 10851 -24736 0.3049 Multiplication is significantly faster. Noticeable.
divide 40521 18538 -21983 0.4575 Division is significantly faster. Noticeable.
square root 86825 46831 -39994 0.5394 Square roots are significantly faster. Noticeable.
notes: TI-Floats are approximately 47 bits of precision. Float80 uses 64 bits of precision (that is 14 digits versus 19)
Well, here are timings I got from WabbitEmu for the OS (86825 ccs) and mine (46831 ccs). So it isn't quite twice as fast, but it is almost. I am also working on a routine to cut out another 16000 or so, so then it will be almost 3 times faster. For the timings I have:
Args used:
1.570796326794897
57.29577951308232
For example, 57.29577951308232/1.570796326794897
TI-OS Float80 diff ratio analysis
add/subtract 2758 3166 +408 1.1479 Add/sub is a bit slower, possibly noticeably
multiply 35587 10851 -24736 0.3049 Multiplication is significantly faster. Noticeable.
divide 40521 18538 -21983 0.4575 Division is significantly faster. Noticeable.
square root 86825 46831 -39994 0.5394 Square roots are significantly faster. Noticeable.
notes: TI-Floats are approximately 47 bits of precision. Float80 uses 64 bits of precision (that is 14 digits versus 19)
Hmm, isn't 16-bit math sufficient, then? Also, in good news, I actually shaved off close to 18000 more clock cycles from the square root routine, putting it at a little over 3 times faster than TI's. I am back to working on the exponential and logarithm routine, but they are table based (using a single 64-element LUT with 9 bytes to each element). From this I will build the Float->Str routine.
or a \ sbc hl,de \ jr c,$+7 \ set 7,b \ jp $+4 \ add hl,de \ srl d \ rr e
I was just wondering if instead of all those "jp $+4"s, if you tried using "set 7,b \ .db $38 ;jr c,... \ add hl,de \ ; ..." you might be able to save two bytes and 3 t-states (x 15 repetitions). Since the carry will never be set there, it'll just skip the add hl,de which it will read as part of the jr. When the condition is false, jr is actually faster (and, of course, smaller) than a jp.
Xeda, I was just looking through the 24-bit division routine and saw this line:
or a \ sbc hl,de \ jr c,$+7 \ set 7,b \ jp $+4 \ add hl,de \ srl d \ rr e
I was just wondering if instead of all those "jp $+4"s, if you tried using "set 7,b \ .db $38 ;jr c,... \ add hl,de \ ; ..." you might be able to save two bytes and 3 t-states (x 15 repetitions), since the 3-byte jp becomes a single opcode byte. Since the carry will never be set there, it'll just skip the add hl,de which it will read as part of the jr. When the condition is false, jr is actually faster (and, of course, smaller) than a jp.
;Example: compute lg(pi^-32) with the single-precision routines and print it.
;Per the original notes, HL and DE point to the two arguments and BC to the
;output. The repeated calls rely on HL, DE and BC still pointing at scrap
;afterwards.
;NOTE(review): the routines themselves are not shown here -- confirm that they
;preserve HL, DE and BC as this sequence requires.
;pi is both the first and the second argument
ld hl,const_pi
ld d,h
ld e,l
ld bc,scrap
;scrap = pi*pi = pi^2
call mulSingle
;Point both arguments at the previous result so each multiply squares it.
ld h,b
ld l,c
ld d,b
ld e,c
;scrap = pi^4
call mulSingle
;scrap = pi^8
call mulSingle
;scrap = pi^16
call mulSingle
;scrap = pi^32
call mulSingle
;scrap = 1/pi^32 = pi^-32
call invSingle
;scrap = lg(pi^-32)
call lgSingle
call single2string
bcall(_PutS)
absSingle
func: |x| -> z
mem: None
addSingle
func: x+y -> z
mem: 6 bytes
Note: special cases not done
subSingle
func: x-y -> z
mem: 10 bytes
Note: special cases not done
rsubSingle
func: -x+y -> z
mem: 10 bytes
Note: special cases not done
invSingle
func: 1/x -> z
mem: 5 bytes
divSingle
func: x/y -> z
mem: 5 bytes
cmpSingle
func: compare x to y, no output
return z flag if x=y (error is up to the last 2 bits)
return c flag if x<y
return nc if x>=y
mem: None
single2string
func: string(x) -> z
mem: 44 bytes
mulSingle
func: x*y -> z
mem: 6 bytes
negSingle
func: -x -> z
mem: None
TI-OS z80float dif %
ln = 131547.46cc ~165000 +33452.54 125.43% much slower :(
atan = 173317.82cc ~174000 +682.18 100.39% slightly slower
atanh = 175320.91cc ~174000 -1320.91 99.25% slightly faster
sqrt = 77699.51cc 6540.79 -71158.72 8.42% Way faster!
mul = 30229.53cc 9928.23 -20301.30 32.84% over 3 times faster
add = 1737.99cc 2094.31 +356.32 120.50% slower :(
Also, have a recent screenshot:
Basic arithmetic:
absSingle |x| -> z Computes the absolute value
addSingle x+y -> z
ameanSingle (x+y)/2 -> z. Arithmetic mean of two numbers.
cmpSingle cmp(x,y) Compare two numbers. Output is in the flags register!
rsubSingle y-x -> z
subSingle x-y -> z
divSingle x/y -> z
invSingle 1/x -> z
mulSingle x*y -> z
negSingle -x -> z
sqrtSingle sqrt(x) -> z
geomeanSingle sqrt(x*y) -> z
Logs, Exponentials, Powers
expSingle e^x -> z
pow2Single 2^x -> z
pow10Single 10^x-> z
powSingle y^x -> z
lgSingle log2(x) -> z
lnSingle ln(x) -> z
log10Single log10(x) -> z
logSingle log_y(x) -> z
Trig, Hyperbolic, and their Inverses
acoshSingle acosh(x) -> z
acosSingle acos(x) -> z
asinhSingle asinh(x) -> z
asinSingle asin(x) -> z
atanhSingle atanh(x) -> z
atanSingle atan(x) -> z
coshSingle cosh(x) -> z
cosSingle cos(x) -> z
sinhSingle sinh(x) -> z
sinSingle sin(x) -> z
tanhSingle tanh(x) -> z
tanSingle tan(x) -> z
Special-Purpose Used by various internal functions, or optimized for special cases
bg2iSingle 1/BG(x,y) -> z Fewer iterations, but enough to be suitable for ln(x). Kind of a special-purpose routine
bgiSingle 1/BG(x,y) -> z More iterations, general-purpose, needed for the inverse trig and hyperbolics
div255Single x/255 -> z
div85Single x/85 -> z
div51Single x/51 -> z
div17Single x/17 -> z
div15Single x/15 -> z
div5Single x/5 -> z
div3Single x/3 -> z
mul10Single x*10 -> z
mulSingle_p375 x*0.375 -> z Used in bg2iSingle. x*(3/8)
mulSingle_p34375 x*0.34375-> z Used in bgiSingle. x*(11/32)
mulSingle_p041015625 x*0.041015625-> z Used in bgiSingle. x*(21/512)
Miscellaneous and Utility
randSingle rand -> z
single2str str(x) -> z Convert a single to a null-terminated string, with formatting
single2TI tifloat(x) -> z Converts a single to a TI-float. Useful for interacting with the TI-OS
ti2single single(tifloat x)->z Converts a TI-float to a single. Useful for interacting with the TI-OS
single2char Honestly, I forgot what it does, but I use it in some string routines. probably converts to a uint8
pushpop pushes the main registers to the stack and sets up a routine so that when your code exits, it restores registers. Replaces manually surrounding code with push...pop
xcmp for comparing two numbers
xneg -x -> z
xabs |x|-> z
xinv 1/x -> z Observed a bug in 1/pi !
xpow x^y -> z
xpow2 2^x
xpow10 10^x
xlog log_y(x) It's failing miserably
xlg log2(x)
xlog10 log10(x) Observed a bug in log10(pi)
I made the str->single routine better (it had been quickly thrown together and failed on many/most cases). Now it appears that digits get swapped in some cases! :( I have to look into this.