; hldebc = hla * cde, a = 0
ld b,0
push hl
ld l,b
ld h,a
ld ix,0
ld a,24
Loop:
add ix,ix
adc hl,hl
ex (sp),hl
adc hl,hl
ex (sp),hl
jr nc,Next
add ix,de
adc hl,bc
jr nc,Next
ex (sp),hl
inc hl
ex (sp),hl
Next:
dec a
jr nz,Loop
pop de
ex de,hl
push ix ; ld c,ixl
pop bc ; ld b,ixh
...cool, I was right! ; hldebc = hlc * bde
ld (iy+asm_Flag1),b
xor a
ld ix,0
ld b,24
Loop:
add ix,ix
rla
rl c
adc hl,hl
jr nc,Next
add ix,de
adc a,(iy+asm_Flag1)
rl c
jr nc,Next
inc hl
Next:
djnz Loop
ld e,a
ld d,c
push ix ; ld c,ixl
pop bc ; ld b,ixh
Ok, this is a bit of a stretch, but how about square-rooting a 24-bit number
Well I am almost positive that using my algorithm would require the use of RAM, even if you used all the shadow registers, too.Can you help me, conceptually with doing math higher than 2 bytes? Maybe I'll write it myself.
0→A
0→D ;This is where the result will go
0→C ;This is a carry
For(B,1,12) ;12 is half the number of input bits
D+D→D ;We want to shift D left
D+D+1→C ;The difference of consecutive squares?
A<<E ;rotate E left, then rotate the carry into A
A<<E ;In z80, "rlc e \ rla"
If A>=C ;If A is greater than or equal to C
D+1→D
A-C→A
End
;You will need an input (24 bits in the above example)
;You will need an output (12 bits in the above example)
;You will need a counter (4 bits in the above example)
;You will need an accumulator (24 bits in the above example)
;You will need a carry (13 bits in the above example)
;
;You will need at least (77 bits in the above example)
So for a simple 8-bit implementation in Z80 code:;===============================================================
sqrtE:
;===============================================================
;Input:
; E is the value to find the square root of
;Outputs:
; A is E-D^2
; B is 0
; D is the result
; E is not changed
; HL is not changed
;Destroys:
; C=2D+1 if D is even, 2D-1 if D is odd
xor a ;1 4 4
ld d,a ;1 4 4
ld c,a ;1 4 4
ld b,4 ;2 7 7
sqrtELoop:
rlc d ;2 8 32
ld c,d ;1 4 16
scf ;1 4 16
rl c ;2 8 32
rlc e ;2 8 32
rla ;1 4 16
rlc e ;2 8 32
rla ;1 4 16
cp c ;1 4 16
jr c,$+4 ;4 12|15 48+3x
inc d ;-- -- --
sub c ;-- -- --
djnz sqrtELoop ;2 13|8 47
ret ;1 10 10
If you want to refine the accuracy to round to the nearest integer, you can add this code to the end of the Z80 code: cp d ;1 4 4
jr c,$+3 ;3 12|11 12|11
inc d ;-- -- --
It takes advantage of the linear differences of a quadratic (meaning (a(x+1)2+b(x+1)+c)-(ax2+bx+c) is a lnear equation)
multBCDbyEHL:
ld a, b
ld b, c
ld c, d
ld d, a
multDBCbyEHL:
push de
push hl
ld ix, $8000
ld (ix), l
xor a
ld h, a
ld l, a
call do8Bits
ld (ix+1), a
pop af
ld (ix), a
push de
ld a, (ix+1)
call do8Bits
ld (ix+1), a
pop af ;least sig number
ex de, hl
ex (sp), hl ;D and 2nd least sig in for new number
ld (ix), l
pop hl ;D and 2nd least sig
push af ;least sig number
push hl ;2nd least sig number
ex de, hl
ld a, (ix+1)
call do8Bits
ld b, a
ld c, h
ld d, l
pop hl
ld a, l
pop hl
ld h, a
ret
;####
;input: DBC = 1 number
; AHL = running number
; (ix) = to multiply by
;output: AHLE = output
; E is done
; DBC = 1 number
do8Bits:
ld (ix+1), 8
loop:
srl (ix)
jr nc, skip
add hl, bc
adc a, d
skip:
rra
rr h
rr l
rr e
dec (ix+1)
jr nz, loop
ret
; ahl = sqrt(hldebc)
push bc ; ld c,ixl
pop ix ; ld b,ixh
push de
ld c,l
ld a,h
ld hl,0
ld b,h
ld e,l
ld d,h
ld (iy+asm_Flag1),d
ld (iy+asm_Flag2),24
Loop:
cp $40
push af
sbc hl,de
ld a,b
sbc a,(iy+asm_Flag1)
jr c,Restore
ld b,a
pop af
sub $40
scf
jr Skip
Restore:
pop af
adc hl,de
or a
Skip:
rl e
rl d
rl (iy+asm_Flag1)
add ix,ix
ex (sp),hl
adc hl,hl
ex (sp),hl
rl c
rla
adc hl,hl
rl b
add ix,ix
ex (sp),hl
adc hl,hl
ex (sp),hl
rl c
rla
adc hl,hl
rl b
dec (iy+asm_Flag2)
jr nz,Loop
pop hl
ld a,(iy+asm_Flag1)
ret
; dea = sqrt(hldebc)
di
push bc ; ld c,ixl
pop ix ; ld b,ixh
ld bc,$40
ld a,l
ld l,h
ld h,b
exx
ld de,0
ld l,e
ld h,d
ld b,24
or a
Loop:
exx
sbc hl,bc
exx
sbc hl,de
jr nc,Skip
exx
add hl,bc
exx
adc hl,de
Skip:
exx
ccf
rl b
exx
rl e
rl d
exx
add ix,ix
rl e
rl d
rla
adc hl,hl
exx
adc hl,hl
exx
add ix,ix
rl e
rl d
rla
adc hl,hl
exx
adc hl,hl
djnz Loop
exx
ld a,b
exx
ei
ret
B/2→A ;start with anything, really
For(C,1,12
(A+B/A)/2→A
End
432
x27
-----
Well that is actually the best method and fastest. What you do is multiply 2*432. THen you shift the digits left and throw a zero at the end. Then you multiply 432*7 and add the two values together. Voila, multiplication :) Now in binary, math is almost always easier. Because you are using 0 and 1, multiplication is easy:11000101
11010111
so you take the last digit of the second number and you get 1. Multiply 1 times the first and you get 11000101. Now shift that left and check the next bit of the bottom number. If that is 1, add it to the accumulator (the accumulator in this case is the running total, not necessarily the a register).;D*E
xor a ;This is the accumulator
ld b,8 ;this is the counter
MultLoop:
add a,a ;this is to rotate the accumulator left. Use rlca, too
rlc e ;This puts the next bit in E in the carry
jr nc,$+3 ;If the bit in E was not 1, you add 0 (so don't add!)
add a,d ;1*D=D, so add D. Binary makes math easy.
djnz MultLoop
ret
Hey, jacobly, I tested both of your multiplication routines and it doesn't seem like they're doing the same thing... I've tried both of them in my program but they have different results.They both have different inputs. :)
// Multiply a times b
temp = 0
repeat for each bit in a
temp <<= 1
if (high bit of a set)
temp += b
a <<= 1
return temp
// Divide a by b
temp = 0
repeat for each bit in a
temp <<= 1
temp += high bit of a
a <<= 1
if (temp >= b)
temp -= b
set low bit of a
return a
// Sqrt a
temp = 0
b = 0
repeat for every 2 bits in a
temp += high 2 bits of a
a <<= 2
test = b << 2 + 1
b <<= 1
if (temp >= test)
temp -= test
set low bit of b
return b
// Sqrt a, sometimes better with multiple-of-a-byte registers
temp = high byte of a
a <<= 8
b = 0
repeat for every 2 bits in a
test = b << 8 + 0x40
b <<= 1
if (temp >= test)
temp -= test
set low bit of b
temp += high 2 bits of a
a <<= 2
return b
The tricky part is figuring out how many bits are in each variable and allocating the z80 registers accordingly.
333
x471
----
1) 4*333
+Acc = 1332
Acc*10= 13320
2) 7*333
+Acc = 15651
Acc*10= 156510
3) 1*333
+Acc = 156843
Here is the algorithm in binary: 00100110
x11011001
---------------
1) 1x00100110
+Acc = 00100110
Acc*2= 01001100
2) 1x00100110
+Acc = 01001100+00100110=01110010
Acc*2= 11100100
3) 0x00100110
+Acc = 11100100
Acc*2= 111001000
4) 1x00100110
+Acc = 111101110
Acc*2= 1111011100
5) 1x00100110
+Acc = 10000000010
Acc*2= 100000000100
6) 0x00100110
+Acc = 100000000100
Acc*2= 1000000001000
7) 0x00100110
+Acc = 1000000001000
Acc*2= 10000000010000
8) 1x00100110
+Acc = 10000000110110
// Multiply a times b
temp = 0
repeat for each bit in a
temp <<= 1
if (high bit of a set)
temp += b
a <<= 1
return temp
if a and b are 2 bytes, temp is 4 bytes, and you loop 16 times.// Sqrt a
temp = high byte of a
a <<= 8
b = 0
repeat for every 2 bits in a
test = b << 8 + 0x40
b <<= 1
if (temp >= test)
temp -= test
set low bit of b
temp += high 2 bits of a
a <<= 2
return b
If a is 4 bytes, then b and temp are 2 bytes, and you loop 16 times.Code: [Select]// Multiply a times b
if a and b are 2 bytes, temp is 4 bytes, and you loop 16 times.
temp = 0
repeat for each bit in a
temp <<= 1
if (high bit of a set)
temp += b
a <<= 1
return tempSpoiler For for code:Code: [Select]// Sqrt a
If a is 4 bytes, then b and temp are 2 bytes, and you loop 16 times.
temp = high byte of a
a <<= 8
b = 0
repeat for every 2 bits in a
test = b << 8 + 0x40
b <<= 1
if (temp >= test)
temp -= test
set low bit of b
temp += high 2 bits of a
a <<= 2
return bSpoiler For code:
ld hl,0
ld a,16
MultLoop:
add hl,hl ;shifts hl left
rl e \ rl d ;shifts de left and if hl overflowed, it overflows into de
jr nc,$+6 ;if the bit in DE is o, skip this chunk
add hl,bc ;add bc to hl (think of this as the first number)
jr nc,$+3 ;overflow into de
inc de
dec a
jr nz,MultLoop
ret
That will multiply DE times BC and return the result in DEHL. I will see if I can port a square root routine for 32-bit...Code: [Select]// Multiply a times b
if a and b are 2 bytes, temp is 4 bytes, and you loop 16 times.
temp = 0
repeat for each bit in a
temp <<= 1
if (high bit of a set)
temp += b
a <<= 1
return tempSpoiler For for code:Code: [Select]// Sqrt a
If a is 4 bytes, then b and temp are 2 bytes, and you loop 16 times.
temp = high byte of a
a <<= 8
b = 0
repeat for every 2 bits in a
test = b << 8 + 0x40
b <<= 1
if (temp >= test)
temp -= test
set low bit of b
temp += high 2 bits of a
a <<= 2
return bSpoiler For code:
Kool. Thanks. But, shouldn't the first one have two inputs?
My first multiplication routine takes 2746 - 4570 cycles, the second takes 1680 - 2880 cycles.Oh boy, optimization time :D
SUBFIRST .macro src1, src2, hdest, ldest
exx
ld a, src1
sub src2
jr nc, $ + 4
neg
exx
ld l, a
ld a, ldest
sub (hl)
ld ldest, a
inc h
ld a, hdest
sbc a, (hl)
ld hdest, a
.endm
SUBNEXT .macro src1, src2, hdest, ldest
dec h
ex af, af'
exx
ld a, src1
sub src2
jr nc, $ + 4
neg
exx
ld l, a
ex af, af'
ld a, ldest
sbc a, (hl)
ld ldest, a
inc h
ld a, hdest
sbc a, (hl)
ld hdest, a
.endm
BDE_times_CHL_sqrdiff_v3:
ld a, d
exx
ld h, high(sqrtab)
ld l, a
ld e, (hl)
inc h
ld d, (hl) ; DE = d²
exx
ld a, b
exx
ld l, a
ld b, (hl)
dec h
ld c, (hl) ; BC = b²
exx
ld a, e
exx
ld l, a
ld a, (hl)
inc h
ld h, (hl)
ld l, a ; HL = e²
call BC_DE_HL_times_10101
push bc
push hl
push de
exx
ld a, h
exx
ld h, high(sqrtab)
ld l, a
ld e, (hl)
inc h
ld d, (hl) ; DE = h²
exx
ld a, c
exx
ld l, a
ld b, (hl)
dec h
ld c, (hl) ; BC = c²
exx
ld a, l
exx
ld l, a
ld a, (hl)
inc h
ld h, (hl)
ld l, a ; HL = l²
call BC_DE_HL_times_10101
pop ix
add ix, de
pop de
adc hl, de
ex de, hl
pop hl
adc hl, bc
ld b, h
ld c, l ; BCDEIX = total
push af
ld h, high(sqrtab)
SUBFIRST e, l, ixh, ixl
SUBNEXT d, h, d, e
SUBNEXT b, c, b, c
jp nc, BDE_times_CHL_sqrdiff_v3_nc1
pop af
ccf
push af
BDE_times_CHL_sqrdiff_v3_nc1:
inc b
dec h
SUBFIRST e, h, e, ixh
SUBNEXT d, c, c, d
jr nc, BDE_times_CHL_sqrdiff_v3_nc2
dec b
jp nz, BDE_times_CHL_sqrdiff_v3_nc2
pop af
ccf
push af
BDE_times_CHL_sqrdiff_v3_nc2:
dec h
SUBFIRST d, l, e, ixh
SUBNEXT b, h, c, d
jr nc, BDE_times_CHL_sqrdiff_v3_nc3
dec b
jp nz, BDE_times_CHL_sqrdiff_v3_nc3
pop af
ccf
push af
BDE_times_CHL_sqrdiff_v3_nc3:
inc c
dec h
SUBFIRST b, l, d, e
jr nc, BDE_times_CHL_sqrdiff_v3_nc4
dec c
jp nz, BDE_times_CHL_sqrdiff_v3_nc4
dec b
jp nz, BDE_times_CHL_sqrdiff_v3_nc4
pop af
ccf
push af
BDE_times_CHL_sqrdiff_v3_nc4:
dec h
SUBFIRST e, c, d, e
pop hl
jr nc, BDE_times_CHL_sqrdiff_v3_nc5
dec c
jp nz, BDE_times_CHL_sqrdiff_v3_nc5
dec b
jp nz, BDE_times_CHL_sqrdiff_v3_nc5
inc l
BDE_times_CHL_sqrdiff_v3_nc5:
dec b
dec c
rr l
rr b
rr c
rr d
rr e
ld a, ixl
ld l, a
ld a, ixh
rra
rr l
ret
BC_DE_HL_times_10101:
push bc
ld a, h
ex af, af'
sub a
ld c, a
ld b, l
add hl, bc
adc a, a
ld b, e
add hl, bc
adc a, c ; AHL = [ L+H+E L ]
pop bc
push hl
push bc
ld c, a
ld b, 0
ex af, af'
ld h, a
add hl, bc ; no way this can carry (initial HL is a square)
ld c, a
ld b, e
sub a
add hl, bc
adc a, a ; AHL(SP+2) = [ H+E L+H L+H+E L ]
add hl, de
adc a, 0 ; AHL(SP+2) = [ H+E+D L+H+E L+H+E L ]
pop bc
add hl, bc
adc a, 0 ; AHL(SP) = [ H+E+D+B L+H+E+C L+H+E L ]
ld e, d
ld d, c
add hl, de
adc a, b
jr nc, BC_DE_HL_times_10101_nc1
inc b ; BAHL(SP) = [ B B H+E+D+C+B L+H+E+D+C L+H+E L ]
BC_DE_HL_times_10101_nc1:
add a, e
jr nc, BC_DE_HL_times_10101_nc2
inc b ; BAHL(SP) = [ B D+B H+E+D+C+B L+H+E+D+C L+H+E L ]
BC_DE_HL_times_10101_nc2:
pop de
add a, c
ld c, a
ret nc
inc b ; BCHLDE = [ B D+C+B H+E+D+C+B L+H+E+D+C L+H+E L ]
ret
I do have a 24-bit floating-point multiplication routine ;D
saved 2 bytes, 1149 cycles saved by using iy too (and in a way compatible with TIOS, imagine that)Code: [Select]; hldebc = hlc * bde
ld (iy+asm_Flag1),b
xor a
ld ix,0
ld b,24
Loop:
add ix,ix
rla
rl c
adc hl,hl
jr nc,Next
add ix,de
adc a,(iy+asm_Flag1)
rl c
jr nc,Next
inc hl
Next:
djnz Loop
ld e,a
ld d,c
push ix ; ld c,ixl
pop bc ; ld b,ixh
or a ;to make sure the c flag is reset. Not always necessary if you know the c flag will be reset
sbc hl,bc ;you can do sbc hl,de also.
32-bit addition (you mean two 32-bit inputs?);Inputs:
; HLBC is one of the 32-bit inputs
; DE points to the other 32-bit input in RAM
;Outputs:
; HLBC is the 32-bit result
; DE is incremented 3 times
; A=H
; c flag is set if there is an overflow
ld a,(de) \ inc de
add a,c \ ld c,a
ld a,(de) \ inc de
adc a,b \ ld b,a
ld a,(de) \ inc de
adc a,l \ ld l,a
ld a,(de)
adc a,h \ ld h,a
ret
Squaring and square rooting... I will think on it D:I do have a 24-bit floating-point multiplication routine ;D
saved 2 bytes, 1149 cycles saved by using iy too (and in a way compatible with TIOS, imagine that)Code: [Select]; hldebc = hlc * bde
ld (iy+asm_Flag1),b
xor a
ld ix,0
ld b,24
Loop:
add ix,ix
rla
rl c
adc hl,hl
jr nc,Next
add ix,de
adc a,(iy+asm_Flag1)
rl c
jr nc,Next
inc hl
Next:
djnz Loop
ld e,a
ld d,c
push ix ; ld c,ixl
pop bc ; ld b,ixh
Jacobly, are you sure this works because I've been going through the code and it seems to me like the second 'rl c' should instead be 'add carry flag to c'. 2 times 'rl c' per loop seems wrong to me. Could you explain please? Because I've tried it as well in wabbitemu, taking the different input in account, and it is still not doing the right thing.
; hldebc = hlc * bde
ld (iy+asm_Flag1),b
xor a
ld ix,0
ld b,24
Loop:
add ix,ix
rla
rl c
adc hl,hl
jr nc,Next
add ix,de
adc a,(iy+asm_Flag1)
jr nc,Next
inc c
jr nz,Next
inc hl
Next:
djnz Loop
ld e,a
ld d,c
push ix ; ld c,ixl
pop bc ; ld b,ixh
That's strange. My test program must not have been working right, because when I went back and changed it a bit, it suddenly started telling me that the second routine doesn't work. :/
Anyway, my new test program seems to agree with this change. :)Code: [Select]; hldebc = hlc * bde
ld (iy+asm_Flag1),b
xor a
ld ix,0
ld b,24
Loop:
add ix,ix
rla
rl c
adc hl,hl
jr nc,Next
add ix,de
adc a,(iy+asm_Flag1)
jr nc,Next
inc c
jr nz,Next
inc hl
Next:
djnz Loop
ld e,a
ld d,c
push ix ; ld c,ixl
pop bc ; ld b,ixh