;calcmaniac84
cpHLDE:
or a
sbc hl,de
add hl,de
ret
;Important note: because the code is 3 bytes and a call is 3 bytes, just macro in:
;SPASM, TASM and BRASS compatible, I guess
#define cp_HLDE or a \ sbc hl,de \ add hl,de
;- Reverse a
;input: Byte in A
;output: Reversed byte in A
;destroys B
;Clock cycles: 66
;Bytes: 18
;author: calcmaniac84
reversea:
ld b,a
rrca
rrca
xor b
and %10101010
xor b
ld b,a
rrca
rrca
rrca
rrca
xor b
and %01100110
xor b
rrca
ret
;reverse hl
;curiosity: a easy port of a common reverse A register is more efficient than tricky stuff
;calcmaniac84
;28 bytes and 104 cycles
ld a,l
rla
rr h
rla
rr h
rla
rr h
rla
rr h
rla
rr h
rla
rr h
rla
rr h
rla
rr h
rla
rrca
ld l,a
ret
;calc84maniac
;in: a = ABCDEFGH
;out: hl= AABBCCDDEEFFGGHH
rrca
rra
rra
ld l,a
rra
sra l
rla
rr l
sra l
rra
rr l
sra l
rrca
rra
rra
ld h,a
rra
sra h
rla
rr h
sra h
rra
rr h
sra h
ret
;Galandros optimized routines
;try to beat me... maybe is possible...
;Displays A register content on screen in decimal ASCII number, using no addition memory
DispA:
ld c,-100
call Na1
ld c,-10
call Na1
ld c,-1
Na1: ld b,'0'-1
Na2: inc b
add a,c
jr c,Na2
sub c ;works as add 100/10/1
push af ;safer than ld c,a
ld a,b ;char is in b
CALL PUTCHAR ;plot a char. Replace with bcall(_PutC) or similar.
pop af ;safer than ld a,c
ret
;Note the following one is optimized for RPGs menus and the such, it is quite flexible. I am going to use in Lost Legends I ^^
;I started with one which used addition RAM for temporary storage (made by me, too), and optimized for size, speed and no extra memory use! ^.^
;the inc's and dec's were trick to debug -.-", the registers b and c are like counters and flags
;DispHL for games
;input: hl=num, d=row,e=col, c=number of algarisms to skip
;number of numbers' characters to display: 5 ; example: 65000
;output: hl displayed, with algarisms skiped and spaces for initial zeros
DispHL_games:
inc c
ld b,1 ;skip 0 flag
ld (CurRow),de
;Number in hl to decimal ASCII
;Thanks to z80 Bits
;inputs: hl = number to ASCII
;example: hl=300 outputs ' 300'
;destroys: af, hl, de used
ld de,-10000
call Num1
ld de,-1000
call Num1
ld de,-100
call Num1
ld e,-10
call Num1
ld e,-1
Num1:
ld a,'0'-1
Num2: inc a
add hl,de
jr c,Num2
sbc hl,de
dec c ;c is skipping
jr nz,skipnum
inc c
djnz notcharnumzero
cp '0'
jr nz,notcharnumzero
leadingzero:
inc b
skipnum:
ld a,' '
notcharnumzero:
push bc
call PUTCHAR ;bcall(_PutC) works, not sure if it preserves bc
pop bc
ret
PUTCHAR:
bcall(_PutC)
ret
;Example usage of DispHL_games to understand what I mean
Test2:
ld hl,60003
ld de,$0101
ld c,0
call DispHL_games
ld hl,60003
ld de,$0102
ld c,1
call DispHL_games
ret
ld a,xx
out (1),a
ld a,(de)
in a,(1)
and yy
9 Bytes, 43 T-States.ld a,xx
out (1),a
ld b,yy
in a,(1)
and b
9 Bytes, 40 T-States.
di
ld a,$99
ld bc,$0100
ld h,a
ld d,a
ld l,c
ld e,b
ld i,a
inc a
ld (hl),a
ldir
ld l,a
ld (hl),$c3
inc l
ld (hl),intvec & $ff
inc l
ld (hl),intvec >> 8
im 2
ei
;author: calcmaniac84, I think
;Copy zero terminated string at HL to DE.
StrCopy:
xor a
docopystr:
cp (hl)
ldi
jr nz,docopystr
ret
scroll_up:
#ifdef DEBUG
cp 64+1
call c,ErrorOverFlow
#endif
add a,a
add a,a
ld l,a
ld e,a
ld h,0
ld d,h
add hl,hl
add hl,de ; hl=a*12
push hl
ld de,768
ex de,hl
; carry is never set here if input is correct
; or a
sbc hl,de
ld b,h
ld c,l ; bc=768-12*a
ex de,hl
ld de,plotsscreen
add hl,de
ldir
;blank remaining area
ld h,d
ld l,e
inc de
ld (hl),$00
pop bc
dec bc ; bc=12*a-1
ldir
ret
;PSEUDO CODE
; ld hl,plotsscreen+12*a
; ld de,plotsscreen
; ld bc,768-12*a
; ldir
; ld h,d
; ld l,e
; ld (hl),$00
; inc de
; ld bc,12*a
; dec bc
; ldir
; ret
scroll_down:
#ifdef DEBUG
cp 64+1
call c,ErrorOverFlow
#endif
; a can be from 1 to 63
; a can be multiplied by 4
add a,a
add a,a ; a*4
ld l,a ; hl = a*4
ld e,a
xor a
ld h,a
ld d,a
add hl,hl ; hl = a*8
add hl,de ; hl = a*12
ld e,a ; de = 0
push hl ; a*12 will needed later
push hl ; 2 times
ex de,hl
;carry is never set here
; or a
sbc hl,de ; hl= -a*12, de=a*12
ld de,plotsscreen+767
add hl,de ; hl=plotsscreen+767-12*a
pop bc
push hl
ld hl,768+1
;carry always set
; or a
sbc hl,bc
ld b,h
ld c,l
pop hl
lddr
;blank remaining area
ld h,d
ld l,e
ld (hl),$00
dec de
pop bc
dec bc
lddr
ret
; ld hl,plotsscreen+767-12*a
; ld de,plotsscreen+767
; ld bc,768-12*a
; lddr
; or
; ld (hl),$00 ;; ld hl,plotsscreen
; ld h,d ;; ld (hl),$00
; ld l,e ;; ld de,hl+1
; dec de ;; ld bc,12*a-1
; ld bc,12*a-1 ;; ldir
; lddr ;; ret
; ret
Very nice! I'll add these to my utils.z80 file that is included in all my app builds.Actually I am working on something like that. I am hand writing C functions in z80 assembly just for fun. :P I will share them when I finish.
Anyone wanting to compile a stdlib.c and revive the tisdcc project? j/k
Very nice! I'll add these to my utils.z80 file that is included in all my app builds.I think I remember this, it was Halifax from the old Omnimaga forums who worked on it, right? There was a thread about it somewhere
Anyone wanting to compile a stdlib.c and revive the tisdcc project? j/k
Seems pretty impossible to me.O.O
B_CALL(_SetXXXXOP2)
B_CALL(_Op2ToOP1)
ld hl,$yyxx
ld (PenCol),hl
ld a,5
B_CALL(_DispOP1A)
push hl
ld hl,$yyxx
ld (PenCol),hl
pop hl
B_CALL(_DispHL)
Just be aware it's right-justified in 5 spaces. (Since $ffff is 5 decimal digits, 65535)Quigibo's Challenge!Challenge accepted.
Can any of the following be done in 6 or fewer bytes? The input and output must be HL.
- Multiply by 128?
- Signed division by any nontrivial constant, other than 2, including negative numbers?
- Modulus with any constant that is not a power of 2?
Constants need to be 16-bit?
- Signed division by any nontrivial constant, other than 2, including negative numbers?
- Modulus with any constant that is not a power of 2?
Those multiplications by 128 won't work. Multiplying 256*128 would give 0.True. I tested small numbers. Works up to 255.
xor a ; resets carry and register
rr h
rr l ; divide hl by 2
rra
ld h,l
ld l,a ; multiply hl by 256 (moving low byte to high byte trick)
;8 bytes, 32 clocks
;conventional way is 7 bytes and 77 clocks
;it does in less than half the time with only 1 byte cost, substantial speed increase with only 1 byte cost
Axe Parser could improve with a option to make it optimize for speed instead of the size default.Yeah, I'm planning to do that eventually.
Signed division by 2 is just sra h \ rr l. And I have only found daa useful (other than its intended use) when converting between hex digits and ASCII, I think.Axe Parser could improve with a option to make it optimize for speed instead of the size default.Yeah, I'm planning to do that eventually.
Actually, I found out my optimized signed division by 2 doesn't work, so that's up in the air too now. Also, would multiplying by negative 2 be do-able in 6 bytes? It can certainly be done in 7 so it seems possible maybe using some trick?
EDIT: also, is the daa instruction ever useful for optimizations? I can't think of how I would use it other than floating point math.
And I have only found daa useful (other than its intended use) when converting between hex digits and ASCII, I think.Yes, it is true. Strange how it works...
Signed division by 2 is just sra h \ rr l.
It rounds down, just like the int() command that everyone is used to. int(-1/2)=-1. That's how it works.Signed division by 2 is just sra h \ rr l.
But what about -1/2? Shouldn't that give 0? Because if you do that operation to %11111111 11111111 it remains -1. Or does this routine always round up in magnitude instead of down? It might throw some people off since it would be inconsistent with my regular unoptimized signed division routine.
;Divides HL by DE and stores to HL
SDiv:
ld a,h
add a,a
jr nc,$+9
xor a
sub l
ld l,a
sbc a,a
sub h
ld h,a
scf
rra
xor d
push af
bit 7,d
jr z,$+8
xor a
sub e
ld e,a
sbc a,a
sub d
ld d,a
call Divide
pop af
add a,a
ret nc
xor a
sub l
ld l,a
sbc a,a
sub h
ld h,a
ret
SignedDivision:
ld a,h
xor d
push af
bit 7,h
jr z,$+8
xor a
sub l
ld l,a
sbc a,a
sub h
ld h,a
bit 7,d
jr z,$+8
xor a
sub e
ld e,a
sbc a,a
sub d
ld d,a
call RegularDivision
pop af
add a,a
ret nc
xor a
sub l
ld l,a
sbc a,a
sub h
ld h,a
ret
;input: a=x, e=y, b=height, c=width in bytes
;complement box area at x,y
cplbox:
ld h,0
ld d,h
; you can optimize here for speed with sla l and avoid some adds
ld l,e
add hl,de
add hl,de
add hl,hl
add hl,hl
ld e,a
srl e
srl e
srl e
add hl,de
ld de,plotsscreen
add hl,de ;get first position in buffer
and 7
ld d,$FF
ld e,0 ;de is a mask
jr z,cplaligned
maskrotateloop:
srl d
rr e
dec a
jr nz,maskrotateloop
cplaligned:
cplboxheightloop:
push bc
push hl
ld b,c
cplboxwifthtloop:
ld a,(hl) ;\
xor d ;|
ld (hl),a ;|
inc hl ;| complement buffer
ld a,(hl) ;|
xor e ;|
ld (hl),a ;/
djnz cplboxwifthtloop ; loop for width
pop hl
ld c,12
add hl,bc ; advance for next line
pop bc
djnz cplboxheightloop ; loop height
ret
sub l
ld l,a
sbc a,a
sub h
ld h,a
C_Div_D:
;Inputs:
; C is the numerator
; D is the denominator
;Outputs:
; A is the remainder
; B is 0
; C is the result of C/D
; D,E,H,L are not changed
;
ld b,8
xor a
sla c
rla
cp d
jr c,$+4
inc c
sub d
djnz $-8
ret
DE_Times_A:
;Inputs:
; DE and A are factors
;Outputs:
; A is not changed
; B is 0
; C is not changed
; DE is not changed
; HL is the product
;
ld b,8
ld hl,0
add hl,hl
rlca
jr nc,$+3
add hl,de
djnz $-5
ret
DE_Times_BC:
;Inputs:
; DE and BC are factors
;Outputs:
; A is 0
; BC is not changed
; DE is 0
; HL is the product
;
ld hl,0
ld a,16
Mul_Loop_1:
add hl,hl
ex de,hl
add hl,hl
ex de,hl
jr nc,$+3
add hl,bc
dec a
jr nz,Mul_Loop_1
ret
DEHL_Div_C:
;Inputs:
; DEHL is a 32 bit value where DE is the upper 16 bits
; C is the value to divide DEHL by
;Outputs:
; A is the remainder
; B is 0
; C is not changed
; DEHL is the result of the division
;
ld b,32
xor a
add hl,hl
ex de,hl
adc hl,hl
ex de,hl
rla
cp c
jr c,$+4
inc l
sub c
djnz $-10
ret
;===============================================================
DEHL_Times_A:
;===============================================================
;Inputs:
; DEHL is a 32 bit factor
; A is an 8 bit factor
;Outputs:
; interrupts disabled
; BC is not changed
; AHLDE is the 40-bit result
; D'E' is the lower 16 bits of the input
; H'L' is the lower 16 bits of the output
; B' is 0
; C' is not changed
; A' is not changed
;===============================================================
di
push hl
or a
sbc hl,hl
exx
pop de
sbc hl,hl
ld b,8
mul32Loop:
add hl,hl
exx
adc hl,hl
exx
add a,a
jr nc,$+8
add hl,de
exx
adc hl,de
inc a
exx
djnz mul32Loop
push hl
exx
pop de
ret
GCDHL_BC:
;Inputs:
; HL is a number
; BC is a number
;Outputs:
; A is 0
; BC is the GCD
; DE is 0
;Destroys:
; HL
;Size: 25 bytes
;Speed: 30 to 49708 cycles
; -As slow as about 126 times per second at 6MHz
; -As fast as about 209715 times per second at 6MHz
;Speed break down:
; If HL=BC, 30 cycles
; 24+1552x
; If BC>HL, add 20 cycles
; *x is from 1 to at most 32 (because we use 2 16-bit numbers)
;
or a \ sbc hl,bc ;B7ED42 19
ret z ;C8 5|11
add hl,bc ;09 11
jr nc,$+8 ;3006 11|31
ld a,h ;7C --
ld h,b ;60 --
ld b,a ;47 --
ld a,l ;7D --
ld l,c ;69 --
ld c,a ;4F --
Loop:
call HL_Div_BC ;CD**** 1511
ld a,d \ or e ;7AB2 8
ret z ;C8 5|11
ld h,b \ ld l,c ;6069 8
ld b,d \ ld c,e ;424B 8
jr $-10 ;18F8 12
;===============================================================
DE_Div_BC_round:
;===============================================================
;Performs DE/BC, rounded
;Speed: 1172+6b cycles, 1268cc worst case
;Size: 25 bytes
;Inputs:
; DE is the numerator
; BC is the denominator
;Outputs:
; DE is the quotient
; BC is divided by 2 (truncated)
; A reflects the low bits of the quotient
;Destroys: HL
;===============================================================
ld a,d
ld hl,0
ld d,16
rl e
rla
adc hl,hl
sbc hl,bc
jr c,$+3
add hl,bc
dec d
jr nz,$-11
cpl
ld d,a
ld a,e
cpl
ld e,a
ret
HL_Div_C:
;Inputs:
; HL is the numerator
; C is the denominator
;Outputs:
; A is the remainder
; B is 0
; C is not changed
; DE is not changed
; HL is the quotient
;
ld b,16
xor a
add hl,hl
rla
cp c
jr c,$+4
inc l
sub c
djnz $-7
ret
HLDE_Div_C:
;Inputs:
; HLDE is a 32 bit value where HL is the upper 16 bits
; C is the value to divide HLDE by
;Outputs:
; A is the remainder
; B is 0
; C is not changed
; HLDE is the result of the division
;
ld b,32
xor a
ex de,hl
add hl,hl
ex de,hl
adc hl,hl
rla
cp c
jr c,$+4
inc e
sub c
djnz $-10
ret
;===============================================================
nCrHL_DE:
;===============================================================
;Inputs:
; hl is "n"
; de is "r"
;Outputs:
; interrupts off
; a is 0
; bc is an intermediate result
; de is "n"
; hl is the result
; a' is not changed
; bc' is "r"+1
; de' is the same as bc
; hl' is "r" or the compliment, whichever is smaller
;===============================================================
or a ;reset carry flag
sbc hl,de
ret c ;r should not be bigger than n
sbc hl,de \ add hl,de
jr nc,$+3
ex de,hl
;hl is R
push de
ld bc,1 ;A
exx
pop de ;N
ld bc,1 ;C
ld h,b \ ld l,c ;D
nCrLoop:
push de
push hl
call DE_Times_BC
push hl \ exx \ pop de
push hl
call DE_Div_BC
pop de
push hl \ ex de,hl \ exx \ pop hl
ld b,h \ ld c,l
pop de \ add hl,de
pop de \ inc de
exx
inc bc
or a \ sbc hl,bc \ add hl,bc
exx
jr nc,nCrLoop
ret
RoundHL_Div_C:
;Inputs:
; HL is the numerator
; C is the denominator
;Outputs:
; A is twice the remainder of the unrounded value
; B is 0
; C is not changed
; DE is not changed
; HL is the rounded quotient
; c flag set means no rounding was performed
; reset means the value was rounded
;
ld b,16
xor a
add hl,hl
rla
cp c
jr c,$+4
inc l
sub c
djnz $-7
add a,a
cp c
jr c,$+3
inc hl
ret
;===============================================================
sqrtE:
;===============================================================
;Input:
; E is the value to find the square root of
;Outputs:
; A is E-D^2
; B is 0
; D is the rounded result
; E is not changed
; HL is not changed
;Destroys:
; C
;
xor a ;1 4 4
ld d,a ;1 4 4
ld c,a ;1 4 4
ld b,4 ;2 7 7
sqrtELoop:
rlc d ;2 8 32
ld c,d ;1 4 16
scf ;1 4 16
rl c ;2 8 32
rlc e ;2 8 32
rla ;1 4 16
rlc e ;2 8 32
rla ;1 4 16
cp c ;1 4 16
jr c,$+4 ;4 12|15 48+3x
inc d ;-- -- --
sub c ;-- -- --
djnz sqrtELoop ;2 13|8 47
cp d ;1 4 4
jr c,$+3 ;3 12|11 12|11
inc d ;-- -- --
ret ;1 10 10
;===============================================================
;Size : 29 bytes
;Speed : 347+3x cycles plus 1 if rounded down
; x is the number of set bits in the result.
;===============================================================
;===============================================================
sqrtE:
;===============================================================
;Input:
; E is the value to find the square root of
;Outputs:
; A is E-D^2
; B is 0
; D is the result
; E is not changed
; HL is not changed
;Destroys:
; C=2D+1 if D is even, 2D-1 if D is odd
xor a ;1 4 4
ld d,a ;1 4 4
ld c,a ;1 4 4
ld b,4 ;2 7 7
sqrtELoop:
rlc d ;2 8 32
ld c,d ;1 4 16
scf ;1 4 16
rl c ;2 8 32
rlc e ;2 8 32
rla ;1 4 16
rlc e ;2 8 32
rla ;1 4 16
cp c ;1 4 16
jr c,$+4 ;4 12|15 48+3x
inc d ;-- -- --
sub c ;-- -- --
djnz sqrtELoop ;2 13|8 47
ret ;1 10 10
;===============================================================
;Size : 25 bytes
;Speed : 332+3x cycles
; x is the number of set bits in the result. This will not
; exceed 4, so the range for cycles is 332 to 344. To put this
; into perspective, under the slowest conditions (4 set bits
; in the result at 6MHz), this can execute over 18000 times
; in a second.
;===============================================================
ld a,c
ld c,b
ld hl,0
ld b,16
Mul_Loop_1:
add hl,hl
add a,a
rl c
jr nc,$+3
add hl,de
djnz Mul_Loop_1
ret
ld hl,0
ld b,16
Mul_Loop_1:
add hl,hl
add a,a
rl c
jr nc,$+3
add hl,de
djnz Mul_Loop_1
ret
For your DE_Times_BC, this is one byte more in overhead, but much faster:I like this method for speeding things up! This is exactly the kind of thing I was hoping for. I want to understand how to program better, so the more ideas I can learn, the better off I should be on my quest :DCode: [Select]ld a,c
ld c,b
ld hl,0
ld b,16
Mul_Loop_1:
add hl,hl
add a,a
rl c
jr nc,$+3
add hl,de
djnz Mul_Loop_1
ret
;===============================================================
HL_SDiv_BC:
;===============================================================
;Performs HL/BC
;Speed: 1494 cycles
; **same cycles as the regular HL_Div_BC
; add 25 if HL is negative
; add 25 if BC is negative
; add another 46 if only one is negative
;Size: 54 bytes
; **31 bytes larger than the regular HL_Div_BC
;Inputs:
; HL is the numerator
; BC is the denominator
;Outputs:
; HL is the quotient
; DE is the remainder
; BC is not changed
; A is 0
; z flag is set
; c flag is reset
;===============================================================
ld a,h
xor b
and 128
push af
absHL:
bit 7,h
jr z,absBC
ld a,l \ cpl \ ld l,a
ld a,h \ cpl \ ld h,a
inc hl
absBC:
bit 7,b
jr z,$+9
ld a,c \ cpl \ ld c,a
ld a,b \ cpl \ ld b,a
inc bc
add hl,hl
ld a,15
ld de,0
Div_Loop_1:
add hl,hl
ex de,hl
adc hl,hl
or a
sbc hl,bc
jr c,$+5
inc e
jr $+3
add hl,bc
ex de,hl
dec a
jr nz,Div_Loop_1
pop af \ ret z
ld a,l \ cpl \ ld l,a
ld a,h \ cpl \ ld h,a
inc hl
ret
;===============================================================
HL_SDiv_BC:
;===============================================================
;Performs HL/BC
;Speed: 1168 to 1318 cycles depending on how many set bits in the result
; add 19 if HL is negative
; add 19 if BC is positive
; add another 28 if only one is negative
;Size: 54 bytes
; **31 bytes larger than the regular HL_Div_BC
;Inputs:
; HL is the numerator
; BC is the denominator
;Outputs:
; HL is the quotient
; DE is the remainder
; BC = -abs(BC)
;===============================================================
ld a,h
xor b
push af
absHL:
add hl,hl
jr nc,negabsBC
xor a \ sub l \ ld l,a
sbc a,a \ sub h \ ld h,a
negabsBC:
bit 7,b
jr nz,$+8
xor a \ sub c \ ld c,a
sbc a,a \ sub b \ ld b,a
ex de,hl
xor a
ld h,a
ld l,a
ld a,15
Div_Loop_1:
rl e \ rl d
adc hl,hl
add hl,bc
jr c,$+4
sbc hl,bc
dec a
jr nz,Div_Loop_1
ex de,hl
adc hl,hl
pop af \ ret p
xor a \ sub l \ ld l,a
sbc a,a \ sub h \ ld h,a
ret
Signed16To8:
ld a,l
add a,a
sbc a,a
sub h
ld a,l
ret z
ld a,h
add a,a
sbc a,a
xor %01111111
ret
ShiftScreenRight4:
;Shifts the graph screen right 4 pixels
ld hl,plotSScreen
ld c,64
xor a
ld b,12
rrd
inc hl
djnz $-3
dec c
jr nz,$-9
ret
ShiftScreenLeft4:
;Shifts the graph screen left 4 pixels
ld hl,plotSScreen+767
ld c,64
xor a
ld b,12
rld
dec hl
djnz $-3
dec c
jr nz,$-9
ret
It is the same size as shifting 1 pixel, though 7680 cycles slower. That is still about 1 bazillion times faster than shifting left or right 1 pixel, 4 times. I've been using these for years in my graphics related programs x.x I hope they prove useful!;===============================================================
ConvRStr:
;===============================================================
;Input:
; DE points to the base 10 number string in RAM.
;Outputs:
; HL is the 16-bit value of the number
; DE points to the byte after the number
; BC is HL/10
; c flag reset (nc)
; z flag reset (nz)
;Destroys:
; A (actually, add 30h and you get the ending token)
;Size: 23 bytes
;Speed: 104n+42+11c
; n is the number of digits
; c is at most n-2
; at most 595 cycles for any 16-bit decimal value
;===============================================================
ld hl,0 ; 10 : 210000
ConvLoop: ;
ld a,(de) ; 7 : 1A
sub 30h ; 7 : D630
cp 10 ; 7 : FE0A
ret nc ;5|11 : D0
inc de ; 6 : 13
;
ld b,h ; 4 : 44
ld c,l ; 4 : 4D
add hl,hl ; 11 : 29
add hl,hl ; 11 : 29
add hl,bc ; 11 : 09
add hl,hl ; 11 : 29
;
add a,l ; 4 : 85
ld l,a ; 4 : 6F
jr nc,ConvLoop ;12|23: 30EE
inc h ; --- : 24
jr ConvLoop ; --- : 18EB
The ones with t-states as '---' are computed along with the previous instruction to make calculations easier. Anyways, to give an idea, at the slowest, this can execute 9803 times per second (assuming you are using call which takes another 17 t-states). This stops reading when a character that is not a decimal number is run into (for example, a comma or newline).Here's a very optimized way to convert a 16-bit signed number into an 8-bit signed number in a with overflow handling (if hl<-128, a=-128; if hl>127, a=127). Two added bonus to being super small and super fast are that it destroys nothing and that you could easily modify it to make the input a 16-bit register other than hl.Code: [Select]Signed16To8:
ld a,l
add a,a
sbc a,a
sub h
ld a,l
ret z
ld a,h
add a,a
sbc a,a
xor %01111111
ret
Signed16To8:
ld a,l
add a,a
ld a,h
adc a,l
cp l
ret z
ld a,$7F
ret p
inc a
ret
PseudoRandWord:
;Outputs:
; BC was the previous pseudorandom number
; HL is the pseudorandom number
;f(n+1)=(241f(n)+257) mod 65536 ;65536
;181 cycles, add 17 if called
ld hl,(randSeed)
ld c,l
ld b,h
add hl,hl
add hl,bc
add hl,hl
add hl,bc
add hl,hl
add hl,bc
add hl,hl
add hl,hl
add hl,hl
add hl,hl
add hl,bc
inc h
inc hl
ld (randSeed),hl
ret
There are a few other nice features, too. For example, every 16-bit value is hit if you run this 65536 times. Or, if you only read 1 byte (for example, H from the output), it will hit every 8-bit number once if you run this 256 times. Plus, it can be seeded, which has its own uses. This can be modified to be smaller, too, if you know what you are doing, but I just like the numbers 241 and 257. Anyways, it produces some nice results :)FindNumPages:
;Inputs:
; The app base page is loaded in MemBank1
;Outputs:
; c flag set if the field was found
; nc means the app header subfield was not found
; A is the number of app pages
; B is 0
; (HL) is the number of app pages
ld hl,4000h
ld bc,128
ld a,c
or a
FNPLoop:
cpir
ret po
ret nz
inc a
cp (hl)
jr z,$+5
dec a
jr FNPLoop
inc l
ld a,(hl)
scf
ret
I made that to be a faster alternative to using a bcall :)
FindNumPages:
;Inputs:
; The app base page is loaded in MemBank1
;Outputs:
; A, (HL) is the number of app pages
ld hl,4000h
ld a,81h
ld c,a
FNPLoop:
dec a
cpir
inc a
cp (hl)
jr nz,FNPLoop
inc l
ld a,(hl)
ret
rectangle
;inputs: l=Y, a=X, b=height, c=length
;save coords & stuff
h, a
push hl
push bc
call rectangle.calc ;below
pop bc
pop hl
rectangle.display
; inputs: h=X, l=Y, b=height
ld a, h
ld e, l
ld h, $00
ld d, h
add hl, de
add hl, de
add hl, hl
add hl, hl ;l*12
ld e, a
srl e
srl e
srl e ;x/8
add hl, de
ld de, gbuf
add hl, de
rectangle.display.loop:
push bc
push hl
ld a, (hl)
ld c, a
ld a, (rectangle.scanline1) ; somewhere in ram...
xor c
ld (hl), a
inc hl
ld a, (rectangle.scanline2)
or a
jr z, rectangle.display.noloop2
ld b, a
rectangle.display.loop2:
ld a, (hl)
xor $FF
ld (hl), a
inc hl
djnz rectangle.display.loop2
rectangle.display.noloop2:
ld a, (hl)
ld c, a
ld a, (rectangle.scanline3)
xor c
ld (hl), a
pop hl
pop bc
ld de, 12
add hl, de
djnz rectangle.display.loop
ret
rectangle.calc
;inputs:
; a = x
; b = height
; c = length
ld d, a
ld a, $FF
ld (rectangle.scanline1), a
xor a
ld (rectangle.scanline2), a
ld (rectangle.scanline3), a
ld a, d
and 7
ld d, a
or a
jr z, rectangle.skipShift1
ld e, $FF
rectangle.shift1
srl e
dec a
or a
jr nz, rectangle.shift1
ld a, e
ld (rectangle.scanline1), a
rectangle.skipShift1
ld a, d ; a = shift right
ld h, a ; save
add a, c ; a + c
ld b, a ; save b = a + c
and 7 ; /8 Rest?
ld d, a ; Rest
ld a, 8
sub d ; 8 - Rest
ld d, a ; = d
ld e, $FF
rectangle.shift2
sla e
dec a
or a
jr nz, rectangle.shift2
ld a, e
ld (rectangle.scanline3), a
ld a, 16
ld e, h ; a
sub e ; 16 - a
sub d ; -d
srl a
srl a
srl a
ld d, a
;
ld a, c
srl a
srl a
srl a ; /8
sub d
ld d, a
ld a, (rectangle.scanline2)
add a, d
ld (rectangle.scanline2), a
;
ld a, b
and %11111000
or a ; if (shift_right + length)<8, do (rectangle.scanline1 & rectangle.scanline3)
ret nz
ld a, (rectangle.scanline1)
ld d, a
ld a, (rectangle.scanline3)
and d
ld (rectangle.scanline1), a
xor a
ld (rectangle.scanline2), a
ld (rectangle.scanline3), a
ret
How bad is it? :D
;=======================================;
; Rectangle Filling Routine Version 1.0 ;
; By Jason Kovacs & The TCPA - 10/11/99 ;
;=======================================;
; Input: D = Top Left X Coordinate, E = Top Left Y Coordinate
; H = Bottom Right X Coord, L = Bottom Right Y Coord
; C = Color of Lines (0-White, 1-Black, 2-XORed)
;
; Output: A Rectangle is drawn to the Graph Buffer with its border
; and everything within it Filled in according to the value in
; reg C which specifies the Color.
;
; Registers Affected: AF Destroyed; B=0 ; C, DE, HL Preserved.
; The Index Registers and the Shadow Registers Aren't Used.
Rectangle_Filled:
ld a,l
sub e
inc a
ld b,a
ld a,h
sub d
inc a
push de
Rect_Fill_Loop:
push af
call V_Line
pop af
inc d
dec a
jr nz, Rect_Fill_Loop
pop de
ret
;=======================================;
; Horizontal and Verticle Line Routines ;
; By Jason Kovacs & The TCPA - 10/11/99 ;
;=======================================;
; For H_Line and V_Line:
;
; Input: B = Length of Line (Number of Pixels)
; C = Color of Line (0-White, 1-Black, 2-XORed)
; D = X Coordinate Start of the Line
; E = Y Coordinate Start of the Line
;
; Output: Lines are Drawn to the Graph Buffer, and the Starting
; Byte and Pixel Mask are Automatically determined according
; to the Input of the Coordinates in DE.
;
; Registers Affected: All Registers are Preserved Except AF.
V_Line:
push de
push hl
push bc
ld a,d
call Getpix
pop bc
push bc
ld d,c
ld c,a
ld a,d
ld de,12
or a
call z,V_White_Line
dec a
call z,V_Black_Line
dec a
call z,V_XORed_Line
pop bc
pop hl
pop de
ret
V_White_Line:
ld a,c
cpl
ld c,a
V_White_Line_2:
ld a,(hl)
and c
ld (hl),a
add hl,de
djnz V_White_Line_2
xor a
ret
V_Black_Line:
ld a,(hl)
or c
ld (hl),a
add hl,de
djnz V_Black_Line
xor a
ret
V_XORed_Line:
ld a,(hl)
xor c
ld (hl),a
add hl,de
djnz V_XORed_Line
ret
Getpix:
ld d,0
ld h,d
ld l,e
add hl,de
add hl,de
add hl,hl
add hl,hl
ld de,plotsscreen
add hl,de
Getbit:
ld b,0
ld c,a
and %00000111
srl c
srl c
srl c
add hl,bc
ld b,a
inc b
ld a,%00000001
GBLoop:
rrca
djnz GBLoop
ret
DrawRectToGraph:
ld hl,9340h
;===============================================================
DrawRectToBuffer:
DrawRect:
;===============================================================
;Inputs:
; A is the type of rectangle to draw
; 0 =White
; 1 =Black
; 2 =XOR
; 3 =Black border
; 4 =White border
; 5 =XOR border
; 6 =Black border, white inside
; 7 =Black border, XOR inside
; 8 =White border, black inside
; 9 =White border, XOR inside
; 10=Shift Up
; 11=Shift Down
;
;
; 14=pxlTestRect (returns the number of on pixels in the rectangular region)
; 15=pxlTestBorder (returns the number of on pixels on the border, good for collision detection)
; B is the height
; C is the Y pixel coordinate
; D is the width in pixels
; E is is the X pixel coordinate
;===============================================================
di
push hl
pop ix
ex af,af'
;Check if coords are negative
ld a,c
or a
jp p,$+9
add a,b
ret nc
ret z
ld b,a
ld c,0
ld a,e
or a
jp p,$+9
add a,d
ret nc
ret z
ld d,a
ld e,0
;Check dimensions
ld a,b
or a
ret z
jp p,$+6
neg
ld b,a
add a,c
sub 64
jr c,$+6
neg
add a,b
ld b,a
ld a,d
or a
ret z
jp p,$+6
neg
ld d,a
add a,e
sub 96
jr c,$+6
neg
add a,d
ld d,a
ld a,c
cp 64
ret nc
ld a,e
cp 96
ret nc
MakePattern:
push bc
ld hl,RectData
ld b,24
xor a
ld (hl),a
inc l
djnz $-2
ld hl,RectData
ld c,RectData+12
ld a,e
sub 8
jr c,$+6
inc l
inc c
jr $-6
add a,8
ld e,a
ld b,a
inc b
ld a,d
add a,e
ld e,a
ld a,1
rrca
djnz $-1
ld b,l
push af
ld l,c
or (hl)
ld (hl),a
ld l,b
pop af
dec a
scf
adc a,a
ld (hl),a
ld a,e
sub 8
jr c,$+10
jr z,$+10
inc l
ld (hl),-1
inc c
jr $-10
add a,8
ld b,a
or a
ld a,1
jr z,$+5
rrca
djnz $-1
ld b,l
push af
ld l,c
or (hl)
ld (hl),a
ld l,b
pop af
dec a
cpl
and (hl)
ld (hl),a
pop bc
ld a,b
ld b,0
ld h,b
ld l,c
add hl,hl
add hl,bc
add hl,hl
add hl,hl
push ix
pop bc
add hl,bc
ld b,a
ex af,af'
.db $CB,$67,$28,$10,$D6,$10,$F5,$0E,$18,$11
.dw RectData
.db $1A,$2F,$12,$13,$0D,$20,$F9,$F1
or a
jr nz,$+13h
.db $0E,$0C,$11
.dw RectData
.db $1A,$2F,$A6,$77,$13,$23,$0D,$20,$F7,$10,$F0,$C9
dec a
jr nz,$+12h
.db $0E,$0C,$11
.dw RectData
.db $1A,$B6,$77,$13,$23,$0D,$20,$F8,$10,$F1,$C9
dec a
jr nz,$+12h
.db $0E,$0C,$11
.dw RectData
.db $1A,$AE,$77,$13,$23
.db $0D,$20,$F8,$10,$F1,$C9
dec a
jr nz,$+26h
.db $0E,$0C,$11
.dw RectData
.db $1A,$B6,$77,$13,$23,$0D,$20,$F8,$05,$C8
.db $05,$28,$0F,$0E,$0C,$11
.dw RectData+12
.db $1A,$B6,$77,$13,$23,$0D,$20,$F8,$10,$F1,$04,$18,$DC
dec a
jr nz,$+28h
.db $0E,$0C,$11
.dw RectData
.db $1A,$A6,$AE,$77,$13,$23,$0D,$20,$F7,$05,$C8,$05,$28,$10,$0E,$0C,$11
.dw RectData+12
.db $1A,$A6,$AE,$77,$13,$23,$0D,$20,$F7,$10,$F0,$04,$18,$DA
dec a
jr nz,$+26h
.db $0E,$0C,$11
.dw RectData
.db $1A,$AE
.db $77,$13,$23,$0D,$20,$F8,$05,$C8,$05,$28,$0F,$0E,$0C,$11
.dw RectData+12
.db $1A,$AE,$77,$13,$23,$0D,$20,$F8
.db $10,$F1,$04,$18,$DC
dec a
jr nz,$+36h
.db $0E,$0C,$11
.dw RectData
.db $1A,$B6,$77,$13,$23,$0D,$20,$F8,$05,$C8,$05
.db $28,$1F,$E5,$0E,$0C,$11
.dw RectData
.db $1A,$A6,$AE,$77,$13,$23,$0D,$20,$F7,$E1,$0E,$0C,$11
.dw RectData+12
.db $1A
.db $B6,$77,$13,$23,$0D,$20,$F8,$10,$E1,$04,$18,$CC
.db $3D,$20,$33,$0E,$0C,$11
.dw RectData
.db $1A,$B6,$77,$13
.db $23,$0D,$20,$F8,$05,$C8,$05,$28,$1E,$E5,$0E,$0C,$11
.dw RectData
.db $1A,$AE,$77,$13,$23,$0D,$20,$F8,$E1
.db $0E,$0C,$11
.dw RectData+12
.db $1A,$B6,$77,$13,$23,$0D,$20,$F8,$10,$E2,$04,$18,$CD,$3D,$20,$34,$0E,$0C,$11
.dw RectData
.db $1A,$A6,$AE,$77,$13,$23,$0D,$20,$F7,$05,$C8,$05,$28,$1E,$E5,$0E,$0C,$11
.dw RectData
.db $1A,$B6
.db $77,$13,$23,$0D,$20,$F8,$E1,$0E,$0C,$11
.dw RectData+12
.db $1A,$AE,$77,$13,$23,$0D,$20,$F8,$10,$E2,$04,$18
.db $CC,$3D,$20,$35,$0E,$0C,$11
.dw RectData
.db $1A,$A6,$AE,$77,$13,$23,$0D,$20,$F7,$05,$C8,$05,$28,$1F,$E5
.db $0E,$0C,$11
.dw RectData
.db $1A,$AE,$77,$13,$23,$0D,$20,$F8,$E1,$0E,$0C,$11
.dw RectData+12
.db $1A,$A6,$AE,$77,$13
.db $23,$0D,$20,$F7,$10,$E1,$04,$18,$CB,$3D,$20,$37,$05,$C8,$F3,$E5,$D9,$01,$0C,$00,$E1,$09,$D9,$0E
.db $0C,$11
.dw RectData
.db $D5,$D9,$D1,$D9,$1A,$2F,$A6,$D9,$47,$1A,$A6,$B0,$13,$23,$D9,$77,$13,$23,$0D,$20
.db $EF,$10,$E4,$0E,$0C,$11
.dw RectData
.db $1A,$2F,$A6,$77,$13,$23,$0D,$20,$F7,$FB,$C9
.db $3D,$20,$40,$F3,$C5
.db $11,$0C,$00,$19,$10,$FD,$2B,$E5,$D9,$11,$F4,$FF,$E1,$19,$D9,$C1,$05,$C8,$0E,$0C,$11
.dw RectData+11
.db $D5
.db $D9,$D1,$D9,$1A,$2F,$A6,$D9,$47,$1A,$A6,$B0,$1B,$2B,$D9,$77,$1B,$2B,$0D,$20,$EF,$10,$E4,$0E,$0C
.db $11
.dw RectData+11
.db $1A,$2F,$A6,$77,$1B,$2B,$0D,$20,$F7,$FB,$C9
dec a
ret z
dec a
ret z
exx
ld de,0
ld c,8
exx
PxlTestRect:
dec a
jr nz,PxlTestBorder
ld c,12
ld de,RectData
PxlTstRectLoop:
call PxlTestWithMask
djnz PxlTstRectLoop-5
exx
;DE contains the number of pixels
ret
PxlTestBorder:
dec a
ret nz
ld c,12
ld de,RectData
call PxlTestWithMask
dec b
jr z,PxlTestBorder-4
dec b
jr z,PxlTestBrdrEnd
ld c,12
ld de,RectData+12
PxlTstBrdrLoop:
call PxlTestWithMask
djnz PxlTstBrdrLoop-5
PxlTestBrdrEnd:
ld de,RectData
ld c,12
call PxlTestWithMask
exx
;DE contains the number of on pixels
ret
PxlTestWithMask:
ld a,(de)
and (hl)
exx
ld b,c
add a,a
jr nc,$+3
inc de
jr z,$+4
djnz $-6
exx
inc de
inc hl
dec c
jr nz,PxlTestWithMask
ret
.addinstr add ahl,de 00CE19 3 NOP 1 ;add hl,de \ adc a,0
.addinstr add ahl,cde 8919 2 NOP 1 ;add hl,de \ adc a,c
Now whenever i want to do 24-bit addition (well, kinda) i can just use the new add ahl,de and add ahl,cde instructions ;).addinstr <instruction> <asm code for the instruction, written in hex> <amounth of bytes of asm code> NOP 1
EXAMPLE EXAMPLE
INSTRUCTION DEFINITION SOURCE OBJECT
-------------------------------------------------------------------
XYZ * FF 3 NOTOUCH 1 xyz 1234h FF 34 12
XYZ * FF 2 NOTOUCH 1 xyz 1234h FF 34
ZYX * FE 3 SWAP 1 zyx 1234h FE 12 34
ZYX * FE 3 R2 1 zyx $+4 FE 01 00
ABC *,* FD 3 COMBINE 1 abc 45h,67h FD 45 67
ABC *,* FD 3 CSWAP 1 abc 45h,67h FD 67 45
ADD A,#* FC 2 NOTOUCH 1 add A,#'B' FC 42
RET "" FB 1 NOTOUCH 1 ret FB
LD IX,* 21DD 4 NOTOUCH 1 ld IX,1234h DD 21 34 12
LD IX,* 21DD 4 NOTOUCH 1 1 0 ld IX,1234h DD 21 68 24
LD IX,* 21DD 4 NOTOUCH 1 0 1 ld IX,1234h DD 21 35 12
LD IX,* 21DD 4 NOTOUCH 1 1 1 ld IX,1234h DD 21 69 24
LD IX,* 21DD 4 NOTOUCH 1 8 12 ld IX,34h DD 21 12 34
#ifdef TI83P
GBUF_LSB = $40
GBUF_MSB = $93
#else
GBUF_LSB = $29
GBUF_MSB = $8E
#endif
;b = # columns
;c = # rows
;d = starting x
;e = starting y
rectangle_filled_xor:
ld a,$AE ;xor (hl)
jr rectangle_filled2
rectangle_filled_solid:
ld a,$B6 ;or (hl)
rectangle_filled2:
push de
push bc
ld (or_xor),a ;use smc for xor/solid fill
ld a,d ;starting x
and $7 ;what bit do we start on?
ex af,af'
ld a,d ;starting x
ld l,e ;ld hl,e
ld h,0 ; ..
ld d,h ;set d = 0
add hl,de ;starting y * 12
add hl,de ;x3
add hl,hl ;x6
add hl,hl ;x12
rra ;a = x coord / 8
rra ;
rra ;
and %00011111 ;starting x/8 (starting byte in gbuf)
add a,GBUF_LSB
ld e,a ;
ld d,GBUF_MSB ;
add hl,de ;hl = offset in gbuf
ex af,af' ;carry should be reset and z affected from and $7
ld e,a
ld a,%10000000
jr z,$+6
rra
dec e
jr nz,$-2
ld d,a ;starting bit to draw
rectangle_loop_y:
push bc
push hl
rectangle_loop_x:
ld e,a ;save a (overwritten with or (hl))
or_xor = $
or (hl) ;smc will modify this to or/xor
ld (hl),a
ld a,e ;recall a
rrca ;rotate a to draw the next bit
jr nc,$+3
inc hl
djnz rectangle_loop_x
pop hl ;hl = first column in gbuf row
ld c,12 ;b = 0, bc = 12
add hl,bc ;move down to next row
pop bc ;restore b (# columns)
ld a,d ;restore a (starting bit to draw)
dec c
jr nz,rectangle_loop_y
rectangle_end:
pop bc
pop de
ret
start:
ld de,$041B
ld bc,$1103
call draw_box2
call ionFastCopy
bcall _getkey
ret
draw_box2:
call rectangle_filled_solid
inc d
inc e
dec b
dec b
dec c
dec c
call rectangle_filled_xor
ret
ld hl,or_xor
ld (hl),a ;use smc for xor/solid fill
To save a byte and a clock cycle, this can simply be: ld (or_xor),a
rra ;a = x coord / 8
rra ;
rra ;
and %00011111 ;starting x/8 (starting byte in gbuf)
ld e,a ;
add hl,de ;add x
ld de,gbuf ;
add hl,de ;hl = offset in gbuf
To save 7 clock cycles and 0 bytes: rra ;a = x coord / 8
rra ;
rra ;
and %00011111 ;starting x/8 (starting byte in gbuf)
or 40h ;gbuf=9340h, 40h = %01000000, so this won't cause problems
ld e,a ;
ld d,93h
add hl,de ;add x
pop hl
pop bc ;restore b (# columns)
pop af
ld de,12
add hl,de ;move down to next row
Since B is already 0 at the start of this code, save a byte and 3 t-states: pop hl
ld c,12
add hl,bc
pop bc ;restore b (# columns)
pop af
And now you can actually modify that whole routine to be a little faster: ld d,a
rectangle_loop_y:
push bc
push hl
rectangle_loop_x:
ld e,a
or_xor = $
or (hl) ;smc will modify this to or/xor
ld (hl),a
ld a,e
rrca
jr nc,$+3
inc hl
djnz rectangle_loop_x
pop hl
ld c,12
add hl,bc
pop bc
ld a,d
dec c
jr nz,rectangle_loop_y
You get rid of a push af / pop af in the loop which takes 21 t-states and replace it with a ld a,d which is 4 t-states.;D is the x coordinate, here
;HL points to the pattern buffer (12 bytes)
ld a,d
and 7
ld a,80h
ld b,a
jr z,$+5
rrca
djnz $-1
;A is the mask if it were for a pixel
;B is 0
add a,a
dec a
ld (hl),a
So for example, if D and 7 = 3, you would get %00010000. 'add a,a' turns that to %00100000 and then dec a → %00011111. ;b = # columns
;c = # rows
;d = starting x
;e = starting y
rectangle_filled2:
ld a,d ;a = starting x coord
ld l,e ;ld hl,e
ld h,0 ; ..
ld d,h ;set d = 0
add hl,de ;starting y * 12
add hl,de ;x3
add hl,hl ;x6
add hl,hl ;x12
rra ;a = x coord / 8
rra ;
rra ;
and %00011111 ;starting x/8
ld e,a ;
add hl,de ;add x
ld de,gbuf
add hl,de ;offset in gbuf
ld a,b ;b = no columns
rra
rra
rra
and %00011111 ;no. columns / 8
ld b,a
ld a,12
sub b
ld e,a
ld d,0
rectangle_loop_y:
push bc
rectangle_loop_x:
ld (hl),$FF
inc hl
djnz rectangle_loop_x
pop bc ;restore c (# columns)
add hl,de ;move down to next row
dec c
jr nz,rectangle_loop_y
rectangle_end:
ret
Much smaller and faster, but certain cases (for example, a rectangle less than 8 pixels wide, 2 byte rectangles, etc.) seemed like they were going to bump up the size quite a bit and what I wanted was to write it more for size than speed since it's not being used in any time-critical areas of the game (mostly in menus and the battle engine). How do you handle cases where, say, a rectangle doesn't fill an entire byte?#ifdef TI83P
GBUF_LSB = $40
GBUF_MSB = $93
.org progstart-2
.db $bb,$6d
#else
GBUF_LSB = $29
GBUF_MSB = $8E
.org progstart
#endif
;...
rra ;a = x coord / 8
rra ;
rra ;
and %00011111 ;starting x/8 (starting byte in gbuf)
add a,GBUF_LSB
ld e,a ;
ld d,GBUF_MSB ;
add hl,de ;hl = offset in gbuf
...works just fine for the 83/+ :)#ifdef TI83P
GBUF_LSB = $40
GBUF_MSB = $93
#else
GBUF_LSB = $29
GBUF_MSB = $8E
#endif
;b = # rows
;c = # columns
;d = starting x
;e = starting y
rectangle_filled_xor:
ld a,$AE ;xor (hl)
jr rectangle_filled2
rectangle_filled_solid:
ld a,$B6 ;or (hl)
rectangle_filled2:
push de
push bc
ld (or_xor),a ;use smc for xor/solid fill
ld a,d ;starting x
and $7 ;what bit do we start on?
ex af,af'
ld a,d ;starting x
ld l,e ;ld hl,e
ld h,0 ; ..
ld d,h ;set d = 0
add hl,de ;starting y * 12
add hl,de ;x3
add hl,hl ;x6
add hl,hl ;x12
rra ;a = x coord / 8
rra ;
rra ;
and %00011111 ;starting x/8 (starting byte in gbuf)
add a,GBUF_LSB
ld e,a ;
ld d,GBUF_MSB ;
add hl,de ;hl = offset in gbuf
ex af,af' ;carry should be reset and z affected from and $7
ld d,a
ld a,%10000000
jr z,$+6
rra
dec d
jr nz,$-2
ld e,12
rectangle_loop_x:
push af
push bc
push hl
ld c,a
rectangle_loop_y:
or_xor = $
or (hl) ;smc will modify this to or/xor
ld (hl),a
ld a,c
add hl,de
djnz rectangle_loop_y
pop hl
pop bc
pop af
rrca
jr nc,$+3
inc hl
dec c
jr nz,rectangle_loop_x
rectangle_end:
pop bc
pop de
ret
start:
ld de,$0204
ld bc,$2121
call draw_box2
call ionFastCopy
bcall _getkey
ret
draw_box2:
call rectangle_filled_solid
inc d
inc e
dec b
dec b
dec c
dec c
call rectangle_filled_xor
ret
;b = # rows
;c = # columns
;d = starting x
;e = starting y
rectangle:
push de
push bc
ld a,d ;starting x
and $7 ;what bit do we start on?
ex af,af'
ld a,d ;starting x
ld l,e ;ld hl,e
ld h,0 ; ..
ld d,h ;set d = 0
add hl,de ;starting y * 12
add hl,de ;x3
add hl,hl ;x6
add hl,hl ;x12
rra ;a = x coord / 8
rra ;
rra ;
and %00011111 ;starting x/8 (starting byte in gbuf)
add a,GBUF_LSB
ld e,a ;
ld d,GBUF_MSB ;
add hl,de ;hl = offset in gbuf
ex af,af' ;carry should be reset and z affected from and $7
ld e,a
ld a,%10000000
jr z,$+6
rra
dec e
jr nz,$-2
dec b ;you could adjust your input to take care of this, ie b = width-2, c = height-1 and save 3 bytes here
dec b ;we draw the ends separately
dec c ;we'll draw the last line at the end
ld d,a ;starting bit to draw
;d = starting bit
rectangle_loop_y:
push bc
push hl
call rectangle_loop_x
pop hl ;hl = first column in gbuf row
ld c,12 ;b = 0, bc = 12
add hl,b ;move down to next row
pop bc ;restore b (# columns)
xor a
; cp c ; # UNCOMMENT TO ALLOW LINES WITH A HEIGHT OF 1 PIXEL
; jr z,rectangle_end ; #
ld (ld_hl),a ;change ld (hl),a to nop
ld a,d ;restore a (starting bit to draw)
dec c
jr nz,rectangle_loop_y
ld a,$77 ;ld (hl),a
ld (ld_hl),a ;return nop to ld (hl),a
ld a,d
call rectangle_loop_x
rectangle_end:
pop bc
pop de
ret
rectangle_loop_x:
or (hl) ;first bit
ld (hl),a
; inc b ; # UNCOMMENT TO ALLOW LINES WITH A WIDTH OF 1 PIXEL
; ret z ; #
; dec b ; #
; jr z,rectangle_loop_x_end ; #
ld a,d
rectangle_loop_x_inner:
rrca ;rotate a to draw the next bit
jr nc,$+3
inc hl
ld e,a ;save a (overwritten with or (hl))
or (hl) ;smc will modify this to or/xor
ld_hl = $
ld (hl),a
ld a,e ;recall a
djnz rectangle_loop_x_inner
rectangle_loop_x_end:
rrca ;rotate a to draw the next bit
jr nc,$+3
inc hl
or (hl) ;last bit
ld (hl),a
ret
If you don't care about the coordinates, you can just remove the push/pops and get another 4 bytes. And i've got another idea which should be faster (essentially working with bytes instead of pixels), though it might take up a bit more space. If you're interested let me know and i'll try to work on it some.Rectangle_or:
ld a,$B6
jr Rectangle
Rectangle_xor:
ld a,$AE
Rectangle:
; DE = (x,y)
; BC = (height,width)
ld (smc_logic1),a
ld (smc_logic0),a
push de
push bc
push bc
ld a,d
call ComputeByte
ld (smc_FirstByte),a
ex (sp),hl
ld a,d
neg
and 7
ld b,a
ld a,l
sub b
ex (sp),hl
ld c,a
call ComputeByte
cpl
ld (smc_LastByte),a
ld b,a
sra c \ sra c \ sra c
inc c
; ld a,c
; and %11111000
; rra \ rra \ rra
; inc a
; ld c,a
ld a,d
ld d,0
ld h,d
ld l,e
add hl,hl
add hl,de
add hl,hl
add hl,hl
and %11111000
rra \ rra \ rra
add a,GBUF_LSB
ld e,a
ld d,GBUF_MSB
add hl,de
;HL points to the first byte
pop de
;D is the height
;E is the number of bytes wide
inc c
dec c
jr nz,RectOverLoop-1
ld a,(smc_FirstByte)
and b
ld c,a ;value
ld b,d ;height
ld de,12
ld a,c
smc_logic1:
or (hl)
ld (hl),a
add hl,de
djnz $-4
pop bc
pop de
ret
ld e,c
RectOverLoop:
ld b,e
ld c,12
.db 3Eh ;start of ld a,*
smc_FirstByte:
.db 0
RectLoop:
smc_logic0:
or (hl)
ld (hl),a
inc hl
dec c
jr z,ExitLoop
ld a,-1
djnz RectLoop
; jp p,$+4
; dec b
.db 3Eh ;start of ld a,*
smc_LastByte:
.db 0
or (hl)
ld (hl),a
add hl,bc
ExitLoop:
dec d
jr nz,RectOverLoop
pop bc
pop de
ret
ComputeByte:
and 7
ld b,a
ld a,80h
jr z,$+5
rrca
djnz $-1
add a,a
dec a
ret
;;
;;rectXOR
;;rectOR
;;rectErase
;; (B,C) = (x,y) signed
;; (D,E) = (w,h) unsigned
;; HL points to buf
rectXOR:
push hl
call rectSub
pop ix
ret nc
ex de,hl
add ix,de
ex de,hl
push ix
pop hl
dec b
jp m,xorrect0
inc b
xor_rect_loop:
push bc
push hl
ld a,(hl) \ xor d \ ld (hl),a \ inc hl
dec b
jr z,$+8
ld a,(hl) \ cpl \ ld (hl),a \ inc hl \ djnz $-4
ld a,(hl) \ xor e \ ld (hl),a
ld bc,12
pop hl
add hl,bc
pop bc
dec c
jr nz,xor_rect_loop
ret
xorrect0:
ld a,d
and e
ld b,c
ld c,a
ld de,12
ld a,c
xor (hl)
ld (hl),a
add hl,de
djnz $-4
ret
rectErase:
push hl
call rectSub
pop ix
ret nc
ex de,hl
add ix,de
ex de,hl
push ix
pop hl
ld a,d
cpl
ld d,a
ld a,e
cpl
ld e,a
dec b
jp m,eraserect0
inc b
erase_rect_loop:
push bc
push hl
ld a,(hl) \ and d \ ld (hl),a \ inc hl
dec b
jr z,$+7
xor a
ld (hl),a \ inc hl \ djnz $-2
ld a,(hl) \ and e \ ld (hl),a
ld bc,12
pop hl
add hl,bc
pop bc
dec c
jr nz,erase_rect_loop
ret
eraserect0:
ld a,d
xor e
ld b,c
ld c,a
ld de,12
ld a,c
and (hl)
ld (hl),a
add hl,de
djnz $-4
ret
rectOR:
push hl
call rectSub
pop ix
ret nc
ex de,hl
add ix,de
ex de,hl
push ix
pop hl
dec b
jp m,orrect0
inc b
or_rect_loop:
push bc
push hl
ld a,(hl) \ or d \ ld (hl),a \ inc hl
dec b
jr z,$+8
ld c,-1
ld (hl),c \ inc hl \ djnz $-2
ld a,(hl) \ or e \ ld (hl),a
ld bc,12
pop hl
add hl,bc
pop bc
dec c
jr nz,or_rect_loop
ret
orrect0:
ld a,d
and e
ld b,c
ld c,a
ld de,12
ld a,c
or (hl)
ld (hl),a
add hl,de
djnz $-4
ret
rectsub:
;(B,C) = (x,y) signed
;(D,E) = (w,h) unsigned
;Output:
; Start Mask D
; End Mask E
; Byte width B
; Height C
; buf offset HL
bit 7,b
jr z,+_
;Here, b is negative, so we have to add width to x.
;If the result is still negative, the entire box is out of bounds, so return
;otherwise, set width=newvalue,b=0
ld a,d
add a,b
ret nc
ld d,a
ld b,0
_:
bit 7,c
jr z,+_
ld a,e
add a,c
ret nc
ld e,a
ld c,0
_:
;We have clipped all negative areas.
;Now we need to verify that (x,y) are on the screen.
;If they aren't, then the whole rectangle is off-screen so no need to draw.
ld a,b
cp 96
ret nc
ld a,c
cp 64
ret nc
;Let's also verfiy that height and width are non-zero:
ld a,d
or a
ret z
ld a,e
or a
ret z
;Now we need to clip the width and height to be in-bounds
add a,c
cp 65
jr c,+_
;Here we need to set e=64-c
ld a,64
sub c
ld e,a
_:
ld a,d
add a,b
cp 97
jr c,+_
;Here we need to set d=96-b
ld a,96
sub b
ld d,a
_:
;B is starting X
;C is starting Y
;D is width
;E is height
push bc
ld a,b
and 7
ld b,a
ld a,-1
jr z,+_
rra \ djnz $-1
_:
inc a
cpl
ld h,a ;start mask
ld a,b
add a,d
and 7
ld b,a
ld a,-1
jr z,+_
rra \ djnz $-1
_:
inc a
ld l,a ;end mask
ex (sp),hl
;stack now holds DE
;HL is now the coordinates
;B=0, C=height
;A,BC are free to destroy
ld a,h
ld h,b
add hl,hl
add hl,bc
add hl,hl
add hl,hl
ld b,a
rrca
rrca
rrca
and 31
add a,l
ld l,a
jr nc,$+3
inc h
;B is the starting x, D is width
;Only A,B,D,E are available
ld a,b
add a,d
and $F8
ld d,a
ld a,b
and $F8
ld b,a
ld a,d
sub b
rrca
rrca
rrca
ld b,a
ld c,e
pop de
scf
ret
ComputeByte:
and 7
ld b,a
ld a,$FF
ret z
srl a ;or or a \ rra
djnz $-2
ret
It's interesting to see how our syntax/style differs even on bits of code that do exactly the same thing :)I noticed that, too, it is kind of neat. I noticed that I mask A and then shift, whereas you shift, then mask. I've done both, but I typically do the former.
And if you don't mind sacrificing just a few clocks (altogether maybe between 8-64), maybe you could try this and save 2 bytes:Nice, I like that! When B is not 0, that is actually anywhere from 6 cycles faster to 18 cycles slower, so I think that is a great trade-off.
ComputeByte:
neg ; or 'cpl \ inc a
and 7
ld b,a
ld a,FFh
ret z
add a,a
djnz $-1
ret
This happens to be faster than my original and smaller by one byte. Still, it is larger than yours :/
GCDDE_HL:
;Inputs:
; HL,DE are the two values
;Outputs:
; B is 0
; DE is 0
; HL is the GCD
; C is not changed
;Destroys:
; A
xor a ;AF 4
ld b,a ;47 4
CheckMax: ;
sbc hl,de ;ED52 15n
jr z,AdjustGCD ;28** 12n-5
jr nc,ParityCheck ;30** 12n-5
xor a ;AF 4(n-a)
sub l ;95 4(n-a)
ld l,a ;6F 4(n-a)
sbc a,a ;9F 4(n-a)
sub h ;94 4(n-a)
ld h,a ;67 4(n-a)
ex de,hl
jp CheckMax ;C3**** 10(n-a)
ParityCheck: ;
bit 0,e ;CB** 8a
jr nz,DE_Odd ;20** 12a-5b
bit 0,l ;CB** 8b
jr z,BothEven ;28** 12b-5c
rr d ;CB** 8(n-a-b-c)
rr e ;CB** 8(n-a-b-c)
jp CheckMax ;C3**** 10(n-a-b-c)
BothEven: ;
inc b ;04 4c
rr d \ rr e ; 16c
rr h \ rr l ; 16c
jp CheckMax ; 10c
DE_Odd: ;
bit 0,l ; 8b
jr nz,BothOdd ; 12b-5d
rr h \ rr l ; 16(n-a-b-d)
jp CheckMax ; 10(n-a-b-d)
BothOdd: ;
sbc hl,de ; 15d
rr h \ rr l ; 16d
jp CheckMax ; 10d
AdjustGCD: ;
ex de,hl ; 4
inc b ; 4
dec b ; 4
ret z ; 11+4(k>0)
add hl,hl ; 11k
djnz $-1 ; 13k-5
ret ; --
It is a lot faster than my other version which used division to compute the mod of two 16-bit numbers x.x The JP instructions can be changed to JR for better portability and to save a byte each time.GCD_A_C:
;Outputs:
; A is the GCD
; C should be the smallest odd number that divides both inputs
; B is 0
;Destroys:
; D
ld b,1
CheckMax:
sub c
jr z,AdjustGCD
jr nc,ParityCheck
neg
ld d,a
ld a,c
ld c,a
jr CheckMax
ParityCheck:
rrc c
jr c,c_Odd
inc b
rrca
jr nc,CheckMax
rlca
djnz CheckMax
c_Odd:
rlc c
rrca
jr nc,CheckMax
rlca
jr CheckMax
AdjustGCD:
ld a,c
dec b
ret z
add a,a
djnz $-1
ret
EDIT2: I think I computed a massive overestimate of the slowest speed of the first routine to be a little over 4000 cycles. My old routine used about 1500 cycles at the fastest for a non-trivial result. 1500 is likely close to the slowest that the new routine will run at :)GCDDE_HL:
;Inputs:
; HL,DE are the two values
;Outputs:
; B is 0
; DE is 0
; HL is the GCD
; C is not changed
; A is not changed
ld b,1
or a
CheckMax: ;
sbc hl,de ;ED52 15n
jr z,AdjustGCD ;28** 12n-5
jr nc,ParityCheck ;30** 12n-5
add hl,de
or a
ex de,hl
ParityCheck: ;
bit 0,e ;CB** 8a
jr nz,DE_Odd ;20** 12a-5b
bit 0,l ;CB** 8b
jr z,BothEven ;28** 12b-5c
rr d ;CB** 8(n-a-b-c)
rr e ;CB** 8(n-a-b-c)
jp CheckMax ;C3**** 10(n-a-b-c)
BothEven: ;
inc b ;04 4c
rr d \ rr e ; 16c
HL_Even:
rr h \ rr l ; 16c
jp CheckMax ; 10c
DE_Odd: ;
bit 0,l ; 8b
jr z,HL_Even ; 12b-5d
sbc hl,de ; 15d
rr h \ rr l ; 16d
jp nz,CheckMax ; 10d
AdjustGCD: ;
ex de,hl ; 4
dec b ; 4
ret z ; 11+4(k>0)
add hl,hl ; 11k
djnz $-1 ; 13k-5
ret ; --
For the sign testing, I came up with this:Code: [Select]SignedDivision:
ld a,h
xor d
push af
bit 7,h
jr z,$+8
xor a
sub l
ld l,a
sbc a,a
sub h
ld h,a
bit 7,d
jr z,$+8
xor a
sub e
ld e,a
sbc a,a
sub d
ld d,a
call RegularDivision
pop af
add a,a
ret nc
xor a
sub l
ld l,a
sbc a,a
sub h
ld h,a
ret
SignedDivision:
ld a,h
xor d
push af
xor d
jp p,$+9
xor a
sub l
ld l,a
sbc a,a
sub h
ld h,a
bit 7,d
jr z,$+8
xor a
sub e
ld e,a
sbc a,a
sub d
ld d,a
call RegularDivision
pop af
ret p
xor a
sub l
ld l,a
sbc a,a
sub h
ld h,a
ret
In all, it saves 1 bytes and at least 5 t-states (it will be either 5 or 10).
SqrtL:
;Inputs:
; L is the value to find the square root of
;Outputs:
; C is the result
; B,L are 0
; DE is not changed
; H is how far away it is from the next smallest perfect square
; L is 0
; z flag set if it was a perfect square
;Destroyed:
; A
ld bc,400h ; 10 10
ld h,c ; 4 4
sqrt8Loop: ;
add hl,hl ;11 44
add hl,hl ;11 44
rl c ; 8 32
ld a,c ; 4 16
rla ; 4 16
sub a,h ; 4 16
jr nc,$+5 ;12|19 48+7x
inc c
cpl
ld h,a
djnz sqrt8Loop ;13|8 47
ret ;10 10
;287+7x, x is the number of bits in the result
;min: 287
;max: 315
;19 bytes
Also, in case anybody needed a small GCD (Greatest Common Divisor) routine, I have this:GCDHL_DE:
;Outputs:
; DE is the GCD
GCDLoop:
or a
sbc hl,de
ret z
jr nc,$-3
add hl,de
ex de,hl
jp GCDLoop
(a faster one is a few posts up)HL_mod_3:
;Outputs:
; Preserves HL
; A is the remainder
; destroys DE,BC
; z flag if divisible by 3, else nz
ld bc,030Fh
ld a,h
add a,l
sbc a,0 ;conditional decrement
;Now we need to add the upper and lower nibble in a
ld d,a
and c
ld e,a
ld a,d
rlca
rlca
rlca
rlca
and c
add a,e
sub c
jr nc,$+3
add a,c
;add the lower half nibbles
ld d,a
sra d
sra d
and b
add a,d
sub b
ret nc
add a,b
ret
;at most 132 cycles, at least 123
lognat:
;Input: H.L needs to be on [1,2]
;Output: H.L if z flag is set, else if nz, no result
;avg speed: 677 t-states
dec h
dec h
jr nz,$+8
inc l \ dec l
ret nz
ld l,177
ret
inc h
ret nz
ld b,h
ld c,l
ld e,l
ld d,8
add hl,hl
add hl,hl
add hl,de
ex de,hl
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ adc a,a
ld h,a \ ld l,b
sla h \ jr c,$+3 \ ld l,c
add hl,hl \ jr c,$+3 \ add hl,bc
add hl,hl \ jr c,$+3 \ add hl,bc
add hl,hl \ jr c,$+3 \ add hl,bc
add hl,hl \ jr c,$+3 \ add hl,bc
add hl,hl \ jr c,$+3 \ add hl,bc
add hl,hl \ jr c,$+3 \ add hl,bc
add hl,hl \ jr c,$+3 \ add hl,bc
rl l
ld a,h
adc a,b
ld l,a
ld h,b
cp a
ret
I had fun twisting some of the logic, but there is a division and multiplication in there. It might seem backwards, but it was optimal :)
lognat:
;Input: H.L needs to be on (0,128.0)
;Output: H.L if c flag set
; returns nc if input is negative (HL not modified)
;Error:
; The error on the outputs is as follows:
; 20592 inputs are exact
; 12075 inputs are off by 1/256
; 100 inputs are off by 2/256
; So all 32767 inputs are within 2/256, with average error being <1/683 which is smaller than 1/256.
;Size: 177 bytes
;Speed: average speed is less than 1250 t-states
ld a,h \ or l \ jr nz,$+5
ld h,80h \ ret
dec h
dec h
jr nz,$+9
inc l \ dec l
jr nz,normalizeln-1
ld l,177
ret
inc h
jr nz,normalizeln
ld b,h
ld c,l
ld e,l
ld d,8
add hl,hl
add hl,hl
add hl,de
ex de,hl
call HL_Div_DE
ld h,a \ ld l,b
sla h \ jr c,$+3 \ ld l,c
add hl,hl \ jr c,$+3 \ add hl,bc
add hl,hl \ jr c,$+3 \ add hl,bc
add hl,hl \ jr c,$+3 \ add hl,bc
add hl,hl \ jr c,$+3 \ add hl,bc
add hl,hl \ jr c,$+3 \ add hl,bc
add hl,hl \ jr c,$+3 \ add hl,bc
add hl,hl \ jr c,$+3 \ add hl,bc
rl l
ld a,h
adc a,b
ld h,b
ld l,a
scf
ret
HL_Div_DE:
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ adc a,a \ ret
inc h
normalizeln:
xor a
inc h \ ret m
ld d,a \ ld e,a
ld a,l
jr z,toosmall
inc e \ srl h \ rra \ jr nz,$-4
rla \ rl h
dec e
stepin:
ld l,a
push de
call lognat
pop de
;now multiply DE by 355, then divide by 2 (rounding)
ld b,d \ ld c,e \ ld a,d
ex de,hl
add hl,hl
add hl,hl ;4
add hl,bc ;5
add hl,hl ;10
add hl,bc ;11
add hl,hl ;22
add hl,hl
add hl,hl
add hl,hl
add hl,bc
add hl,hl
add hl,bc
sra h \ rr l
adc hl,de
scf
ret
toosmall:
dec d
dec e \ add a,a \ jr nc,$-2
inc h
jp stepin
The worst error is 2/256, which is much better than I thought it would be :) On [1,2] it is at worst 1/256 and the average case is <1/683 which is smaller than 1/256 (the smallest positive 8.8 number).
ln_88:
;Input: HL is a fixed point number
;Output: ln(H.L)->H.L
;Speed: Avg: 340+(325 worst case)
call lg_88
;now signed multiply HL by 355, then divide by 2 (rounding)
ld de,0
bit 7,h
jr z,$+9
dec e \ xor a \ sub l \ ld l,a
sbc a,a \ sub h \ ld h,a
ld b,h
ld c,l
xor a
add hl,hl
add hl,hl \ rla
add hl,bc \ adc a,d
add hl,hl \ rla
add hl,bc \ adc a,d
add hl,hl \ rla
add hl,hl \ rla
add hl,hl \ rla
add hl,hl \ rla
add hl,bc \ adc a,d
add hl,hl \ rla
add hl,bc \ adc a,d
sra a \ rr h
ld l,h
ld h,a
inc e
ret nz
xor a \ sub l \ ld l,a
sbc a,a \ sub h \ ld h,a
ret
lg_88:
;Input: HL is a fixed point number
;Output: lg(H.L)->H.L
;Speed: Avg: 340
ld de,lgLUT
ld b,0
ld a,h
or a
ret m
ld a,l
jr z,$+8
inc b \ srl h \ rra \ jr nz,$-4
or a \ jr nz,$+6
ld hl,8000h \ ret
rra \ inc b \ jr nc,$-2
;A is the element to look up in the LUT
ld l,a
ld c,h
dec b
add hl,hl
add hl,de
ld e,(hl)
inc hl
ld d,(hl)
ex de,hl
add hl,bc
ret
lglut:
.dw $F800
.dw $F996
.dw $FA52
.dw $FACF
.dw $FB2C
.dw $FB76
.dw $FBB3
.dw $FBE8
.dw $FC16
.dw $FC3F
.dw $FC64
.dw $FC86
.dw $FCA5
.dw $FCC1
.dw $FCDC
.dw $FCF4
.dw $FD0B
.dw $FD21
.dw $FD36
.dw $FD49
.dw $FD5C
.dw $FD6D
.dw $FD7E
.dw $FD8E
.dw $FD9D
.dw $FDAC
.dw $FDBA
.dw $FDC8
.dw $FDD5
.dw $FDE2
.dw $FDEE
.dw $FDFA
.dw $FE06
.dw $FE11
.dw $FE1C
.dw $FE26
.dw $FE31
.dw $FE3B
.dw $FE44
.dw $FE4E
.dw $FE57
.dw $FE60
.dw $FE69
.dw $FE71
.dw $FE7A
.dw $FE82
.dw $FE8A
.dw $FE92
.dw $FE9A
.dw $FEA1
.dw $FEA9
.dw $FEB0
.dw $FEB7
.dw $FEBE
.dw $FEC5
.dw $FECB
.dw $FED2
.dw $FED8
.dw $FEDF
.dw $FEE5
.dw $FEEB
.dw $FEF1
.dw $FEF7
.dw $FEFD
.dw $FF03
.dw $FF09
.dw $FF0E
.dw $FF14
.dw $FF19
.dw $FF1E
.dw $FF24
.dw $FF29
.dw $FF2E
.dw $FF33
.dw $FF38
.dw $FF3D
.dw $FF42
.dw $FF47
.dw $FF4B
.dw $FF50
.dw $FF55
.dw $FF59
.dw $FF5E
.dw $FF62
.dw $FF67
.dw $FF6B
.dw $FF6F
.dw $FF74
.dw $FF78
.dw $FF7C
.dw $FF80
.dw $FF84
.dw $FF88
.dw $FF8C
.dw $FF90
.dw $FF94
.dw $FF98
.dw $FF9B
.dw $FF9F
.dw $FFA3
.dw $FFA7
.dw $FFAA
.dw $FFAE
.dw $FFB2
.dw $FFB5
.dw $FFB9
.dw $FFBC
.dw $FFC0
.dw $FFC3
.dw $FFC6
.dw $FFCA
.dw $FFCD
.dw $FFD0
.dw $FFD4
.dw $FFD7
.dw $FFDA
.dw $FFDD
.dw $FFE0
.dw $FFE4
.dw $FFE7
.dw $FFEA
.dw $FFED
.dw $FFF0
.dw $FFF3
.dw $FFF6
.dw $FFF9
.dw $FFFC
.dw $FFFF
arctan_88:
;Input:
; D.E
;Output: atan(D.E)->D.E
push de
ld a,d
or a
jp p,$+5
neg
ld d,a
dec a
jr nz,checkneedinv
inc e \ dec e \ jr nz,checkneedinv
pop af \ rla \ ld de,201 \ ret nc \ ld de,-201 \ ret
checkneedinv:
inc a
call nz,DEgt1_Inv
;0.E is the value to atan
ld hl,adjustatan
push hl
ld a,e
cp 46 \ ret c
dec a \ cp 42h \ ret c
dec a \ cp 4Eh \ ret c
dec a \ cp 57h \ ret c
dec a \ cp 5Eh \ ret c
dec a \ cp 64h \ ret c
dec a \ cp 6Ah \ ret c
dec a \ cp 6Fh \ ret c
sub 6Fh \ ld e,a
ld hl,atanlut
add hl,de
ld a,(hl)
ret
adjustatan:
ld e,a
pop bc
ld a,b
or a
jp p,$+5
neg
jr z,$+9
ld hl,402
or a
sbc hl,de
ex de,hl
rl b
ret nc
xor a
sub e
ld e,a
sbc a,a
sub d
ld d,a
ret
DEgt1_Inv:
;Works if DE>1
ld hl,256
ld b,8
InvLoop:
add hl,hl
sbc hl,de
jr nc,$+3
add hl,de
adc a,a
djnz InvLoop
cpl
ld e,a
ld d,b
ret
atanlut:
.db $6F
.db $6F
.db $70
.db $71
.db $72
.db $73
.db $73
.db $74
.db $75
.db $76
.db $77
.db $77
.db $78
.db $79
.db $7A
.db $7B
.db $7B
.db $7C
.db $7D
.db $7E
.db $7F
.db $7F
.db $80
.db $81
.db $82
.db $82
.db $83
.db $84
.db $85
.db $85
.db $86
.db $87
.db $88
.db $88
.db $89
.db $8A
.db $8B
.db $8B
.db $8C
.db $8D
.db $8E
.db $8E
.db $8F
.db $90
.db $90
.db $91
.db $92
.db $93
.db $93
.db $94
.db $95
.db $95
.db $96
.db $97
.db $97
.db $98
.db $99
.db $9A
.db $9A
.db $9B
.db $9C
.db $9C
.db $9D
.db $9E
.db $9E
.db $9F
.db $A0
.db $A0
.db $A1
.db $A2
.db $A2
.db $A3
.db $A3
.db $A4
.db $A5
.db $A5
.db $A6
.db $A7
.db $A7
.db $A8
.db $A9
.db $A9
.db $AA
.db $AA
.db $AB
.db $AC
.db $AC
.db $AD
.db $AD
.db $AE
.db $AF
.db $AF
.db $B0
.db $B0
.db $B1
.db $B2
.db $B2
.db $B3
.db $B3
.db $B4
.db $B5
.db $B5
.db $B6
.db $B6
.db $B7
.db $B7
.db $B8
.db $B9
.db $B9
.db $BA
.db $BA
.db $BB
.db $BB
.db $BC
.db $BC
.db $BD
.db $BE
.db $BE
.db $BF
.db $BF
.db $C0
.db $C0
.db $C1
.db $C1
.db $C2
.db $C2
.db $C3
.db $C3
.db $C4
.db $C4
.db $C5
.db $C6
.db $C6
.db $C7
.db $C7
.db $C8
.db $C8
.db $C9
Rand24:
;Inputs: seed1,seed2
;Outputs:
; HLA is the pseudo-random number
; seed1,seed2 incremented accordingly
;Destroys: BC,DE
;Notes:
; seed1*243+83 mod 65519 -> seed1
; seed2*251+43 mod 65521 -> seed2
; Output seed1*seed2
ld hl,(seed1)
xor a
ld b,h \ ld c,l
ld de,83
add hl,hl \ rla ;2
add hl,bc \ adc a,d ;3
add hl,hl \ rla ;6
add hl,bc \ adc a,d ;7
add hl,hl \ rla ;14
add hl,bc \ adc a,d ;15
add hl,hl \ rla ;30
add hl,hl \ rla ;60
add hl,hl \ rla ;120
add hl,bc \ adc a,d ;121
add hl,hl \ rla ;242
add hl,bc \ adc a,d ;243
add hl,de \ adc a,d ;243x+83
;now AHL mod 65519
; Essentially, we can at least subtract A*65519=A(65536-17), so add A*17 to HL
ex de,hl
ld l,a
ld b,h
ld c,l
add hl,hl
add hl,hl
add hl,hl
add hl,hl
add hl,bc
add hl,de
ld de,65519
jr nc,$+5
or a \ sbc hl,de
or a \ sbc hl,de
jr nc,$+3
add hl,de
ld (seed1),hl
;seed1 done, now we need to do seed2
ld hl,(seed2)
; seed1*243+83 mod 65519 -> seed1
; seed2*251+43 mod 65521 -> seed2
; Output seed1*seed2
xor a
ld b,h \ ld c,l
ld de,43
add hl,hl \ rla ;2
add hl,bc \ adc a,d ;3
add hl,hl \ rla ;6
add hl,bc \ adc a,d ;7
add hl,hl \ rla ;14
add hl,bc \ adc a,d ;15
add hl,hl \ rla ;30
add hl,bc \ adc a,d ;31
add hl,hl \ rla ;62
add hl,hl \ rla ;124
add hl,bc \ adc a,d ;125
add hl,hl \ rla ;250
add hl,bc \ adc a,d ;251
add hl,de \ adc a,d ;251x+83
;now AHL mod 65521
; Essentially, we can at least subtract A*65521=A(65536-15), so add A*15 to HL
ex de,hl
ld l,a
ld b,h
ld c,l
add hl,hl
add hl,hl
add hl,hl
add hl,hl
sbc hl,bc
add hl,de
ld de,65521
jr nc,$+5
or a \ sbc hl,de
or a \ sbc hl,de
jr nc,$+3
add hl,de
ld (seed2),hl
;now seed1 and seed 2 are computed
ld bc,(seed1)
ex de,hl
call BC_Times_DE
ex de,hl
ld l,b
ld h,0
ld b,h
ld c,l
add hl,hl
add hl,hl
add hl,bc
add hl,hl
add hl,hl
add hl,bc
add hl,hl
add hl,bc
ld c,d
ld d,e
ld e,a
ld a,c
sbc hl,bc
sbc a,b
ret nc
ld c,43
add hl,bc
ret
;now do BC_times_DE
BC_Times_DE:
;BHLA is the result
ld a,b
or a
ld hl,0
ld b,h
;1
add a,a
jr nc,$+4
ld h,d
ld l,e
;2
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,b
;227+10b-7p
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,b
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,b
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,b
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,b
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,b
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,b
;===
;AHL is the result of B*DE*256
push hl
ld h,b
ld l,b
ld b,a
ld a,c
ld c,h
;1
add a,a
jr nc,$+4
ld h,d
ld l,e
;2
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,c
;227+10b-7p
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,c
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,c
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,c
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,c
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,c
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,c
pop de
;Now BDE*256+AHL
ld c,a
ld a,l
ld l,h
ld h,c
add hl,de
ret nc
inc b
;BHLA is the 32-bit result
ret
seed1:
.dw 0
seed2:
.dw 0
To test its 'randomness' I only used HL and it did not exhibit a perfectly uniform random distribution (standard deviation was slightly off from the expected value). However, it seems to be on par with the routine used by the OS and I believe I am using a modified version of that.rand:
;Tested and passes all CAcert tests
;Uses a very simple 32-bit LCG and 32-bit LFSR
;it has a period of 18,446,744,069,414,584,320
;roughly 18.4 quintillion.
;291cc
seed1_0=$+1
ld hl,12345
seed1_1=$+1
ld de,6789
ld b,h
ld c,l
add hl,hl \ rl e \ rl d
add hl,hl \ rl e \ rl d
inc l
add hl,bc
ld (seed1_0),hl
ld hl,(seed1_1)
adc hl,de
ld (seed1_1),hl
ex de,hl
seed2_0=$+1
ld hl,9876
seed2_1=$+1
ld bc,54321
add hl,hl \ rl c \ rl b
ld (seed2_1),bc
sbc a,a
and %11000101
xor l
ld l,a
ld (seed2_0),hl
ex de,hl
add hl,bc
ret
EDIT (28 August 2019): Hey, so here is a compact version from a few years ago:;#define smc ;uncomment if you are using SMC
rand16:
;collaboration by Zeda with Runer112
;160cc or 148cc if using SMC
;26 bytes
;cycle: 4,294,901,760 (almost 4.3 billion)
#ifdef smc
seed1=$+1
ld hl,9999
#else
ld hl,(seed1)
#endif
ld b,h
ld c,l
add hl,hl
add hl,hl
inc l
add hl,bc
ld (seed1),hl
#ifdef smc
seed2=$+1
ld hl,9999
#else
ld hl,(seed2)
#endif
add hl,hl
sbc a,a
and %00101101
xor l
ld l,a
ld (seed2),hl
add hl,bc
ret
And here is a different routine that is faster and might be better (it is on the tests I am using):xsp32:
;Inputs: (seed1), (seed2), and (seed3) are 16-bit seeds. (seed1) and (seed2) can't both be 0.
;Outputs: HL is the pseudorandom number
;Destroys: A,DE,BC
;cycle: 281,474,976,645,120
;It would take about 185 years at 15MHz to repeat
;min: 258cc (236cc if using SMC)
;max: 288cc (266cc if using SMC)
;avg: 273cc (251cc if using SMC)
;63 bytes (62 bytes if using SMC)
#ifdef SMC
seed1 = $+1
ld hl,12345
seed2 = $+1
ld de,6789
#else
ld hl,(seed1)
ld de,(seed2)
#endif
;first, XOR it with itself, shifted left 23 bits
;low bit of d needs to be shifted in
ld a,h
rra
ld a,l
rra
jr nc,+_
rl e
ccf
rr e
_:
xor d
ld d,a
;XOR it with itself, shifted right 15 bits
ld a,h
rla
ld a,e
rla
xor l
ld l,a
ld a,e
rla
ld a,d
rla
jr nc,+_
rr e
ccf
rl e
_:
xor h
ld h,a
;XOR it with itself, shifted left 17 bits
;HL<<1
ld (seed1),hl
add hl,hl
ld a,h
xor d
ld h,a
ld a,l
xor e
ld l,a
ld (seed2),hl
ex de,hl
#ifdef SMC
seed3 = $+1
ld hl,33333
#else
ld hl,(seed3)
#endif
inc hl
inc h
ld (seed3),hl
add hl,de
ret
It has a smaller period, but still far larger than a calc needs. It uses smaller state, and combines a 32-bit xorshift with a basic 16-bit counter (increments by 257). xs32:
;32-bit xorshift
;seed^=seed<<23
;seed^=seed>>15
;seed^=seed<<17
;min: 209cc (193cc if using SMC)
;max: 239cc (223cc if using SMC)
;avg: 224cc (208cc if using SMC)
;53 bytes (52 bytes if using SMC)
#ifdef SMC
seed1 = $+1
ld hl,12345
seed2 = $+1
ld de,6789
#else
ld hl,(seed1)
ld de,(seed2)
#endif
;first, XOR it with itself, shifted left 23 bits
;low bit of d needs to be shifted in
ld a,h
rra
ld a,l
rra
jr nc,+_
rl e
ccf
rr e
_:
xor d
ld d,a
;XOR it with itself, shifted right 15 bits
ld a,h
rla
ld a,e
rla
xor l
ld l,a
ld a,e
rla
ld a,d
rla
jr nc,+_
rr e
ccf
rl e
_:
xor h
ld h,a
;XOR it with itself, shifted left 17 bits
;HL<<1
ld (seed1),hl
add hl,hl
ld a,h
xor d
ld h,a
ld a,l
xor e
ld l,a
ld (seed2),hl
ret
SqrtHLDE:
;input: HLDE
;Output: BC
;310 bytes
;Average is about 1443 t-states
push de
xor a
ld b,a
ld e,l
ld l,h
ld h,a
add hl,hl
add hl,hl
cp h
jr nc,$+5
dec h
ld a,4
add hl,hl
add hl,hl
ld c,a
sub h
jr nc,$+6
cpl
ld h,a
inc c
inc c
ld a,c
add hl,hl
add hl,hl
add a,a
ld c,a
sub h
jr nc,$+6
cpl
ld h,a
inc c
inc c
ld a,c
add hl,hl
add hl,hl
add a,a
ld c,a
sub h
jr nc,$+6
cpl
ld h,a
inc c
inc c
ld a,c
ld l,e
add hl,hl
add hl,hl
add a,a
ld c,a
sub h
jr nc,$+6
cpl
ld h,a
inc c
inc c
ld a,c
add hl,hl
add hl,hl
add a,a
ld c,a
sub h
jr nc,$+6
cpl
ld h,a
inc c
inc c
ld a,c
add a,a \ ld c,a
add hl,hl
add hl,hl
jr nc,$+6
sub h \ jp $+6
sub h
jr nc,$+6
inc c \ inc c
cpl
ld h,a
ld a,l
ld l,h
add a,a
ld h,a
adc hl,hl
adc hl,hl
sll c \ rl b
sbc hl,bc
jr nc,$+3
add hl,bc
sbc a,a \ add a,a \ inc a \ add a,c \ ld c,a
;iteration 9
;now I need to rotate in more bits
pop de
sla d \ adc hl,hl \ sla d \ adc hl,hl
sll c \ rl b
sbc hl,bc
jr nc,$+3
add hl,bc
sbc a,a \ add a,a \ inc a \ add a,c \ ld c,a
sla d \ adc hl,hl \ sla d \ adc hl,hl
sll c \ rl b
sbc hl,bc
jr nc,$+3
add hl,bc
sbc a,a \ add a,a \ inc a \ add a,c \ ld c,a
sla d \ adc hl,hl \ sla d \ adc hl,hl
sll c \ rl b
sbc hl,bc
jr nc,$+3
add hl,bc
sbc a,a \ add a,a \ inc a \ add a,c \ ld c,a
sla d \ adc hl,hl \ sla d \ adc hl,hl
sll c \ rl b
sbc hl,bc
jr nc,$+3
add hl,bc
sbc a,a \ add a,a \ inc a \ add a,c \ ld c,a
sla e \ adc hl,hl \ sla e \ adc hl,hl
sll c \ rl b
sbc hl,bc
jr nc,$+3
add hl,bc
sbc a,a \ add a,a \ inc a \ add a,c \ ld c,a
sla e \ adc hl,hl \ sla e \ adc hl,hl
sll c \ rl b
sbc hl,bc
jr nc,$+3
add hl,bc
sbc a,a \ add a,a \ inc a \ add a,c \ ld c,a
sll c \ rl b
sla e \ adc hl,hl \ sla e \ adc hl,hl
jr nc,$+9
or a
sbc hl,bc
inc c
jp $+13
sbc hl,bc
jr nc,$+3
add hl,bc
sbc a,a \ add a,a \ inc a \ add a,c \ ld c,a
sla e \ adc hl,hl \ jr nc,$+4
inc c \ ret
sla e \ adc hl,hl \ jr nc,$+8
sbc hl,bc \ or a \ jp $+7
or a \ sbc hl,bc \ ret c
scf \ sbc hl,bc \ ret c
inc c \ ret
sDiv2416:
xor d
push af
xor d
ld b, a
jp p, divdPos
xor a
sub l
ld l, a
ld a, 0
sbc a, h
ld h, a
ld a, 0
sbc a, b
ld b, a
divdPos:
bit 7, d
jr z, divrPos
xor a
sub e
ld e, a
ld a, 0
sbc a, d
ld d, a
ld a, b
divrPos:
push hl
pop ix
ld hl, 0
ld b, 24
divLoop:
add ix, ix
rla
adc hl, hl
sbc hl, de
jr nc, $ + 4
add hl, de
; jp nc, xxxx
; it works because the carry can not be set and "inc ix" is 2 bytes
.db $D2
inc ix
djnz divLoop
push ix
pop hl
ld b, a
pop af
ld a, b
ret p
xor a
sub l
ld l, a
ld a, 0
sbc a, h
ld h, a
ld a, 0
sbc a, b
ret
fastLDIR:
;copy BC bytes from HL to DE
;Cost:
; 27cc for having to call
; 110cc for setting up the loop, worst case
; 10cc * ceiling(BC/n) ;n=2^k for some k, see the line below "ldirloop:"
; 16cc * BC
;costs roughly 152-BC*(5-10/n) more than a simple LDIR (worst case)
;for n=4, BC>=61 saves
;for n=8, BC>=41 saves
;for n=16, BC>=35 saves * default, see the "ldirloop" to change
;for n=32, BC>=33 saves
;for n=64, BC>=32 saves
push hl
push af
xor a
sub c
and 15 ;change to n-1
add a,a
ld hl,ldirloop
add a,l
ld l,a
jr nc,$+3 ;these aren't needed if the ldirloop doesn't cross a 256 byte boundary. Can save 12cc on the above timings and 3 bytes.
inc h ;
pop af
ex (sp),hl
ret
ldirloop:
;n=16, (number of LDI instructions, use qty of 4,8,16,32,64)
ldi
ldi
ldi
ldi
ldi
ldi
ldi
ldi
ldi
ldi
ldi
ldi
ldi
ldi
ldi
_ldirloop_end:
ldi
jp pe,ldirloop
ret
This might be useful for things like copying code to RAM (from an App) in speed critical applications, or other data handling tasks. xor a
sub c
and c
So if c=%10110100, this would return a=%00000100 (it also always returns nc, and z only when c=0).
smc = 1 ;use 1 if the code is in RAM, since it is faster. If it is in an app, use 0 and define a 2-byte RAM location for the seed.
lehmer:
;;Input:
;; (seed) has the seed value of the RNG
;;Output:
;; (seed) is updated, HL is the result
;;Destroys:
;; A,DE,BC
;;Timing:
;; if seed>0 231cc or 232cc, condition dependent
;; if seed=0 91cc
;; if smc=1 subtract 6cc
;;Size: 44 bytes
;;Notes:
;; Uses the Lehmer RNG used by the Sinclair ZX81
;; 75x mod 65537 -> x
#IF smc == 0
ld hl,(seed)
#ELSE
seed = $+1
ld hl,0
#ENDIF
;multiply by 75
ld c,l
ld b,h
xor a
adc hl,hl \ jr z,special \ ld d,a \ rla
add hl,hl \ rla
add hl,hl \ rla \ add hl,bc \ adc a,d
add hl,hl \ rla
add hl,hl \ rla \ add hl,bc \ adc a,d
add hl,hl \ rla \ add hl,bc
;modulo 65537, see note below on how this works
ld e,a
sbc hl,de ;No need to reset the c flag since it is already
jr nc,$+3
inc hl
ld (seed),hl
ret
special:
;In the case that HL=0, this should be interpreted as 65536 = -1 mod 65537, so return -75 mod 65537 = -74 mod 65536 in HL
ld hl,-74
ld (seed),hl
ret
;mod by 2^16 + 1 (a prime)
;current form is A*2^16+HL
;need:
; (A*2^16+HL) mod (2^16+1)
;add 0 as +1-1
; (A*(2^16+1-1)+HL) mod (2^16+1)
;distribute
; (A*(2^16+1)-A+HL) mod (2^16+1)
;A*(2^16+1) mod 2^16+1 = 0, so remove
; (-A+HL) mod (2^16+1)
;Oh hey, that's easy! :P
;I use this trick everywhere, you should, too.
L_To_Dec:
;;Unrolled
;;Converts the 8-bit register L to binary coded decimal
;;Digits stored in LA (A has the lower 2 digits, L the upper).
;;Inputs: L is the 8-bit unsigned int to convert
;;Output: A has the lower 2 digits (in BCD form), L has the upper
;;Destroys: H,F
;;141cc
;;27 bytes
ld h,0
add hl,hl
add hl,hl
add hl,hl
add hl,hl
ld a,h \ daa \ rl l
adc a,a \ daa \ rl l
adc a,a \ daa \ rl l
adc a,a \ daa \ rl l
adc a,a \ daa \ rl l
ret
setXXOP2:
ld hl,OP2
jr setXX
setXXOP1:
ld hl,OP1
setXX:
;;Inputs: A is the unsigned int
;; HL is where to write the TI float
;;Destroys:All
;;291cc+38b (or 144cc if A=0)
;;average: b=29/255
;;295.3215686cc
;;59 bytes
ld c,0
ld (hl),c \ inc hl
ld (hl),81h
inc hl \ ld (hl),c
ld d,h \ ld e,l
inc hl \ ld (hl),c
inc hl \ ld (hl),c
inc hl \ ld (hl),c
inc hl \ ld (hl),c
inc hl \ ld (hl),c
inc hl \ ld (hl),c
or a \ ret z ;If A is zero, exit early. +138cc
ld l,a ;\
ld h,c ; |
add hl,hl ; |Start converting A to BCD
add hl,hl ; |
add hl,hl ; |
add hl,hl ; |
ld a,h \ daa \ rl l ; |Finish converting A to BCD
adc a,a \ daa \ rl l ; |Number is in LA
adc a,a \ daa \ rl l ; |
adc a,a \ daa \ rl l ; |
adc a,a \ daa ;/ +124cc
ex de,hl
ld (hl),a
and $F0
ret nz ;+29cc
rld ;\ Rotate up 1 digit
dec hl ; |
ld (hl),80h ; |
ret ; /
And if you want to get all 3 digits:setXXXOP1:
ld hl,OP1
setXXX:
;;Inputs: A is the unsigned int
;; HL is where to write the TI float
;;Destroys:All
;;423cc+13a+63b (or 233cc if A=0)
;;average: a=99/255, b=29/255
;;435.2117647cc average
;;64 bytes
ld bc,$0700
ld (hl),c
inc hl
ld (hl),83h
ld d,h
ld e,l
inc hl \ ld (hl),c \ djnz $-2
or a \ ret z ;If A is zero, exit early. +227cc
ld l,a ;\
ld h,c ; |
add hl,hl ; |Start converting A to BCD
add hl,hl ; |
add hl,hl ; |
add hl,hl ; |
ld a,h \ daa \ rl l ; |Finish converting A to BCD
adc a,a \ daa \ rl l ; |Number is in LA
adc a,a \ daa \ rl l ; |
adc a,a \ daa \ rl l ; |
adc a,a \ daa \ rl l ;/ +132cc
ex de,hl
jr nz,$+6 \ ld e,a \ xor a \ ld (hl),81h ;+(21+4/85)cc
inc hl
ld (hl),e
inc hl
ld (hl),a
ld a,e
and $F0
ret nz ;+48cc
rld ;\ Rotate up 1 digit
dec hl ; |
rld ; |
dec hl ; |
dec (hl) ; |Decrement exponent
ret ; /+63(29/255)cc
And if you want to do that, but maybe a little faster:setXXX:
;;Inputs: A is the unsigned int
;; HL is where to write the TI float
;;Destroys:All
;;334cc+13a+63b (or 144cc if A=0)
;;average: a=99/255, b=29/255
;;346.2117647cc average
;;75 bytes
ld c,0
ld (hl),c
inc hl
ld (hl),83h
ld d,h
ld e,l
inc hl \ ld (hl),c
inc hl \ ld (hl),c
inc hl \ ld (hl),c
inc hl \ ld (hl),c
inc hl \ ld (hl),c
inc hl \ ld (hl),c
inc hl \ ld (hl),c
or a \ ret z ;If A is zero, exit early. +227cc
ld l,a ;\
ld h,c ; |
add hl,hl ; |Start converting A to BCD
add hl,hl ; |
add hl,hl ; |
add hl,hl ; |
ld a,h \ daa \ rl l ; |Finish converting A to BCD
adc a,a \ daa \ rl l ; |Number is in LA
adc a,a \ daa \ rl l ; |
adc a,a \ daa \ rl l ; |
adc a,a \ daa \ rl l ;/ +132cc
ex de,hl
jr nz,$+6 \ ld e,a \ xor a \ ld (hl),81h ;+(21+4/85)cc
inc hl
ld (hl),e
inc hl
ld (hl),a
ld a,e
and $F0
ret nz ;+48cc
rld ;\ Rotate up 1 digit
dec hl ; |
rld ; |
dec hl ; |
dec (hl) ; |Decrement exponent
ret ; /+63(29/255)cc
And if you want to convert signed 8-bit ints:setXXX_signed:
;;Inputs: A is the signed int
;; HL is where to write the TI float
ld c,0
ld (hl),c
add a,a
jr c,$+6
neg
ld (hl),80h
inc hl
ld (hl),81h
ld d,h
ld e,l
inc hl \ ld (hl),c \ djnz $-2
or a \ ret z
ld l,a ;\
ld h,c ; |
add hl,hl ; |Start converting A to BCD
add hl,hl ; |
add hl,hl ; |
add hl,hl ; |
ld a,h \ daa \ rl l ; |Finish converting A to BCD
adc a,a \ daa \ rl l ; |Number is in cA
adc a,a \ daa \ rl l ; |(c is carry)
adc a,a \ daa ;/
ex de,hl
jr nc,$+15
ld (hl),82h
inc hl
inc hl
ld (hl),a
xor a
rld
or $10
dec hl
ld (hl),a
ret
inc hl
ld (hl),a
and $F0
ret nz
rld ;\ Rotate up 1 digit
dec hl ; |
ld (hl),80h ; |
ret ; /
ConvOP1:
;;Output: HL is the 16-bit result.
ld de,OP1
ConvFloat:
;;Input: DE points to the float.
;;Output: HL is the 16-bit result.
;;Errors: DataType if the float is negative or complex
;; Domain if the integer exceeds 16 bits.
;;Timings: Assume no errors were called.
;; Input is on:
;; (0,1) => 59cc Average=59
;; 0 or [1,10) => 120cc or 129cc =124.5
;; [10,100) => 176cc or 177cc =176.5
;; [100,1000) => 309cc, 310cc, 318cc, or 319cc. =314
;; [1000,10000) => 376cc to 378cc =377
;; [10000,65536) => 514cc to 516cc, or 523cc to 525cc =519.5
;;Average case: 496.577178955078125cc
;;vs 959.656982421875cc
;;87 bytes
ld a,(de)
or a
jr nz,ErrDataType
inc de
ld hl,0
ld a,(de)
inc de
sub 80h
ret c
jr z,final
cp 5
jp c,enterloop
ErrDomain:
;Throws a domain error.
bcall(_ErrDomain)
ErrDataType:
;Throws a data type error.
bcall(_ErrDataType)
loop:
ld a,b
ld b,h
ld c,l
add hl,hl
add hl,bc
add hl,hl
add hl,hl
add hl,hl
add hl,bc
add hl,hl
add hl,hl
enterloop:
ld b,a
ex de,hl
ld a,(hl) \ and $F0 \ rra \ ld c,a \ rra \ rra \ sub c \ add a,(hl)
inc hl
ex de,hl
add a,l
ld l,a
jr nc,$+3
inc h
dec b
ret z
djnz loop
ld b,h
ld c,l
xor a
;check overflow in this mul by 10!
add hl,hl \ adc a,a
add hl,hl \ adc a,a
add hl,bc \ adc a,0
add hl,hl \ adc a,a
jr nz,ErrDomain
final:
ld a,(de)
rrca
rrca
rrca
rrca
and 15
add a,l
ld l,a
ret nc
inc h
ret nz
jr ErrDomain
If you do not want to throw errors and are comfortable returning garbage results:ConvOP1:
;;Output: HL is the 16-bit result.
ld de,OP1
ConvFloat:
;;Input: DE points to the float.
;;Output: HL is the 16-bit result.
;; Input is on:
;; (0,1) => 57cc Average=57
;; 0 or [1,10) => 117cc or 126cc =121.5
;; [10,100) => 174cc or 175cc =174.5
;; [100,1000) => 276cc, 277cc, 285cc, or 286cc. =281
;; [1000,10000) => 374cc to 376cc =375
;; [10000,65536) => 481cc to483cc, or 490cc to 492cc =486.5
;;Average case: 467.88153076171875cc
ld hl,0
ld a,(de)
or a
ret z
inc de
ld a,(de)
inc de
sub 80h
ret c
jr z,final
cp 5
jp c,enterloop
ret
loop:
ld a,b
ld b,h
ld c,l
add hl,hl
add hl,bc
add hl,hl
add hl,hl
add hl,hl
add hl,bc
add hl,hl
add hl,hl
enterloop:
ld b,a
ex de,hl
ld a,(hl) \ and $F0 \ rra \ ld c,a \ rra \ rra \ sub c \ add a,(hl)
inc hl
ex de,hl
add a,l
ld l,a
jr nc,$+3
inc h
dec b
ret z
djnz loop
ld b,h
ld c,l
add hl,hl
add hl,hl
add hl,bc
add hl,hl
final:
ld a,(de)
rrca
rrca
rrca
rrca
and 15
add a,l
ld l,a
ret nc
inc h
ret
ConvOP1:
ld hl,OP1
convFloat:
;;Inputs: HL points to the TI Float.
;;Outputs: The float is converted to a 16-bit integer held in DE, or LSB in A.
;;avg: 436.723938
;; 0-digit : 30cc (0,1)
;; 1-digit : 92cc or 100cc 0 or [1,10)
;; 2-digit : 134cc [10,100)
;; 3-digit : 257cc or 265cc [100,1000)
;; 4-digit : 329cc or 330cc [1000,10000)
;; 5-digit : 453cc or 454cc or 461cc or 462cc [10000,65536)
;;95 bytes
ld a,(hl)
or a
jr nz,err_datatype
ld de,0
inc hl
ld a,(hl)
sub 80h
ld b,a ;ugh, gotta keep A as the LSB, but we need to get B to be the exponent
ld a,e ;
ret c
inc hl
ld a,(hl)
jr z,lastdigit2
; ld a,(hl) ;\ I feel particularly proud of this code, so feel free to use it ^~^
and $F0 ; |It converts a byte of BCD to an 8-bit int.
rra ; |The catch is, I only have A and C to use, and (hl) is the BCD
ld c,a ; |number.
rra ; |If you come up with faster, please let me know and post it in
rra ; |the optimized routines thread on popular TI forums.
sub a,c ; |Algo: Multiply upper digit by -6, add the original byte.
add a,(hl) ;/ Result is upper_digit*(-6+16)+lower_digit
ld e,a
dec b
ret z
dec b
jr z,lastdigit
inc hl
ex de,hl
ld a,b
sla l
add hl,hl
ld b,h
ld c,l
add hl,hl
add hl,bc
add hl,hl
add hl,hl
add hl,hl
add hl,bc
ex de,hl
ld b,a
ld a,(hl) ;\ I feel particularly proud of this code, so feel free to use it ^~^
and $F0 ; |It converts a byte of BCD to an 8-bit int.
rra ; |The catch is, I only have A and C to use, and (hl) is the BCD
ld c,a ; |number.
rra ; |If you come up with faster, please let me know and post it in
rra ; |the optimized routines thread on popular TI forums.
sub a,c ; |Algo: Multiply upper digit by -6, add the original byte.
add a,(hl) ;/ Result is upper_digit*(-6+16)+lower_digit. Ha, repost.
; inc hl ;
add a,e
ld e,a
jr nc,$+3
inc d
dec b
ret z
dec b
jr nz,err_domain
lastdigit:
inc hl
ld a,(hl)
ld h,d
ld l,e
add hl,hl
add hl,hl
add hl,de
add hl,hl
jr c,err_domain
ex de,hl
lastdigit2:
rrca
rrca
rrca
rrca
and 15
add a,e
ld e,a
ret nc
inc d
ret nz
err_domain:
bcall(_ErrDomain)
err_datatype:
bcall(_ErrDataType)
mul16 596.34375cc, 92 bytes (incl. DE_Times_A) |
mul16:
;Inputs:
; BC,DE are unsigned integers
;Output:
; HL:DE is the 32-bit product
;Destroys:
; A,B,C
;min: 359cc
;max: 717cc
;avg: 596.34375cc
;92 bytes
ld a,c
call DE_Times_A
push hl
push af
ld a,b
call DE_Times_A+2
pop bc
pop de
;AHL
; BDE
ld c,d
add hl,bc
adc a,0
;AHLE
ld d,l
ld l,h
ld h,a
;HLDE
ret
DE_Times_A:
;Input: DE,A
;Output: A:HL is the product, C=0, B,DE unaffected, z flag set if result is zero, c flag set if A is input as 1, else nc.
;A:128~255 219+6{0,10}+{0,19} avg=258.5 *1/2
;A:64~127 203+5{0,10}+{0,19} avg=237.5 *1/4
;A:32~63 187+4{0,10}+{0,19} avg=216.5 *1/8
;A:16~31 171+3{0,10}+{0,19} avg=195.5 *1/16
;A:8~15 155+2{0,10}+{0,19} avg=174.5 *1/32
;A:4~7 139+{0,10}+{0,19} avg=153.5 *1/64
;A:2~3 123+{0,19} avg=132.5 *1/128
;A:1 107cc avg=107 *1/256
;A:0 119cc avg=119 *1/256
;overall avg: 237.671875cc
ld c,0
ld h,d
ld l,e
rla \ jr c,mul_07
rla \ jr c,mul_06
rla \ jr c,mul_05
rla \ jr c,mul_04
rla \ jr c,mul_03
rla \ jr c,mul_02
rla \ jr c,mul_01
rla
ret c
ld h,a
ld l,a
ret
mul_07:
add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
mul_06:
add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
mul_05:
add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
mul_04:
add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
mul_03:
add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
mul_02:
add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
mul_01:
add hl,hl \ rla \ ret nc \ add hl,de \ adc a,c
ret
DE_Times_A:
;Input: DE,A
;Output: A:HL is the product, C=0, B,DE unaffected, z flag set if result is zero, c flag set if A is input as 1, else nc.
;A:128~255 219+6{0,10}+{0,19} avg=258.5 *1/2
;A:64~127 203+5{0,10}+{0,19} avg=237.5 *1/4
;A:32~63 187+4{0,10}+{0,19} avg=216.5 *1/8
;A:16~31 171+3{0,10}+{0,19} avg=195.5 *1/16
;A:8~15 155+2{0,10}+{0,19} avg=174.5 *1/32
;A:4~7 139+{0,10}+{0,19} avg=153.5 *1/64
;A:2~3 123+{0,19} avg=132.5 *1/128
;A:1 107cc avg=107 *1/256
;A:0 119cc avg=119 *1/256
;overall avg: 237.671875cc
ld c,0
ld h,d
ld l,e
rla \ jr c,mul_07
rla \ jr c,mul_06
rla \ jr c,mul_05
rla \ jr c,mul_04
rla \ jr c,mul_03
rla \ jr c,mul_02
rla \ jr c,mul_01
rla
ret c
ld h,a
ld l,a
ret
mul_07:
add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
mul_06:
add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
mul_05:
add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
mul_04:
add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
mul_03:
add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
mul_02:
add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
mul_01:
add hl,hl \ rla \ ret nc \ add hl,de \ adc a,c
ret
; divide HL by C (HL is signed, C is not)
; output: HL = quotient, A = remainder
divHLbyCs:
bit 7,h
push af
jr z,divHLbyCsStart
ld a,h \ cpl \ ld h,a
ld a,l \ cpl \ ld l,a
inc hl
divHLbyCsStart:
xor a
ld b,16
divHLbyCsLoop:
add hl,hl
rla
cp c
jp c,divHLbyCsNext
sub c
inc l
divHLbyCsNext:
djnz divHLbyCLoop
ld b,a
pop af
ld a,b
ret z
ld a,h \ cpl \ ld h,a
ld a,l \ cpl \ ld l,a
inc hl
ret
; divide HL by C (HL is signed, C is not)
; output: HL = quotient, A = remainder
divHLbyCs:
bit 7,h
push af
jr z,divHLbyCsStart
xor a \ sub l \ ld l,a
sbc a,a \ sub h \ ld h,a
divHLbyCsStart:
xor a
ld b,16
divHLbyCsLoop:
add hl,hl
rla
cp c
jp c,divHLbyCsNext
sub c
inc l
divHLbyCsNext:
djnz divHLbyCLoop
ld b,a
pop af
ld a,b
ret z
xor a \ sub l \ ld l,a
sbc a,a \ sub h \ ld h,a
ld a,c
sub b
ret
I changed the output remainder so that it returns the remainder modulo c. So -7/3 returns 2 instead of 1. This means that no bytes nor clock cycles were saved after all :P
pushpop:
;26 bytes, adds 229cc to the calling routine
ex (sp),hl
push de
push bc
push af
push hl
ld hl,pushpopret
ex (sp),hl
push hl
push af
ld hl,12
add hl,sp
ld a,(hl)
inc hl
ld h,(hl)
ld l,a
pop af
ret
pushpopret:
pop af
pop bc
pop de
pop hl
ret
diRestore:
ex (sp),hl
push hl
push af
ld hl,restoreei
ld a,r
jp pe,+_
dec hl
dec hl
_:
di
inc sp
inc sp
inc sp
inc sp
ex (sp),hl
dec sp
dec sp
dec sp
dec sp
pop af
ret
restoredi:
di
ret
restoreei:
ei
ret
Edit: calc84maniac brought up that those 'inc sp' instructions might cause some issues if interrupts fire in between them. The code is modified to disable interrupts first, then mess with the stack.
endianswap32:
push hl
ld h,e
ld e,l
inc sp
push hl
inc sp
pop hl
inc sp
ret
atan8:
;computes 256*atan(A/256)->A
;56 bytes including the LUT
;min: 246cc
;max: 271cc
;avg: 258.5cc
rlca
rlca
rlca
ld d,a
and 7
ld hl,atan8LUT
add a,l
ld l,a
#if (atan8LUT&255)>248 ;this section not included in size/speed totals
jr nc,$+3 ;can add three bytes, 12cc to max, 11cc to min, and 11.5cc to avg
inc h
#endif
ld c,(hl)
inc hl
ld a,(hl)
sub c
ld e,0
ex de,hl
ld d,l
ld e,a
sla h \ jr nc,$+3 \ ld l,e
add hl,hl \ jr nc,$+3 \ add hl,de
add hl,hl \ jr nc,$+3 \ add hl,de
add hl,hl \ jr nc,$+3 \ add hl,de
add hl,hl \ jr nc,$+3 \ add hl,de
add hl,hl
add hl,hl
add hl,hl
; add hl,hl ;used in rounding...
ld a,h
; rra ;but doesn't seem to improve the error
adc a,c
ret
atan8LUT:
.db 0,32,63,92,119,143,165,184,201
sqrtHLIX:
;Input: HLIX
;Output: DE is the sqrt, AHL is the remainder
;speed: 751+6{0,6}+{0,3+{0,18}}+{0,38}+sqrtHL
;min: 1103
;max: 1237
;avg: 1165.5
;166 bytes
call sqrtHL ;expects returns A as sqrt, HL as remainder, D = 0
add a,a
ld e,a
rl d
ld a,ixh
sll e \ rl d
add a,a \ adc hl,hl
add a,a \ adc hl,hl
sbc hl,de
jr nc,+_
add hl,de
dec e
.db $FE ;start of `cp *`
_:
inc e
sll e \ rl d
add a,a \ adc hl,hl
add a,a \ adc hl,hl
sbc hl,de
jr nc,+_
add hl,de
dec e
.db $FE ;start of `cp *`
_:
inc e
sll e \ rl d
add a,a \ adc hl,hl
add a,a \ adc hl,hl
sbc hl,de
jr nc,+_
add hl,de
dec e
.db $FE ;start of `cp *`
_:
inc e
sll e \ rl d
add a,a \ adc hl,hl
add a,a \ adc hl,hl
sbc hl,de
jr nc,+_
add hl,de
dec e
.db $FE ;start of `cp *`
_:
inc e
;Now we have four more iterations
;The first two are no problem
ld a,ixl
sll e \ rl d
add a,a \ adc hl,hl
add a,a \ adc hl,hl
sbc hl,de
jr nc,+_
add hl,de
dec e
.db $FE ;start of `cp *`
_:
inc e
sll e \ rl d
add a,a \ adc hl,hl
add a,a \ adc hl,hl
sbc hl,de
jr nc,+_
add hl,de
dec e
.db $FE ;start of `cp *`
_:
inc e
sqrt32_iter15:
;On the next iteration, HL might temporarily overflow by 1 bit
sll e \ rl d ;sla e \ rl d \ inc e
add a,a
adc hl,hl
add a,a
adc hl,hl ;This might overflow!
jr c,sqrt32_iter15_br0
;
sbc hl,de
jr nc,+_
add hl,de
dec e
jr sqrt32_iter16
sqrt32_iter15_br0:
or a
sbc hl,de
_:
inc e
;On the next iteration, HL is allowed to overflow, DE could overflow with our current routine, but it needs to be shifted right at the end, anyways
sqrt32_iter16:
add a,a
ld b,a ;either 0x00 or 0x80
adc hl,hl
rla
adc hl,hl
rla
;AHL - (DE+DE+1)
sbc hl,de \ sbc a,b
inc e
or a
sbc hl,de \ sbc a,b
ret p
add hl,de
adc a,b
dec e
add hl,de
adc a,b
ret
This uses this sqrtHL routine:;written by Zeda
sqrtHL:
;returns A as the sqrt, HL as the remainder, D = 0
;min: 352cc
;max: 391cc
;avg: 371.5cc
ld de,05040h ; 10
ld a,h ; 4
sub e ; 4
jr nc,sq7 ;\
add a,e ; | branch 1: 12cc
ld d,16 ; | branch 2: 18cc
sq7: ;/
; ----------
cp d ; 4
jr c,sq6 ;\
sub d ; | branch 1: 12cc
set 5,d ; | branch 2: 19cc
sq6: ;/
; ----------
res 4,d ; 8
srl d ; 8
set 2,d ; 8
cp d ; 4
jr c,sq5 ;\
sub d ; | branch 1: 12cc
set 3,d ; | branch 2: 19cc
sq5: ;/
srl d ; 8
; ----------
inc a ; 4
sub d ; 4
jr nc,sq4 ;\
dec d ; | branch 1: 12cc
add a,d ; | branch 2: 19cc
dec d ; | <-- this resets the low bit of D, so `srl d` resets carry.
sq4: ;/
srl d ; 8
ld h,a ; 4
; ----------
ld a,e ; 4
sbc hl,de ; 15
jr nc,sq3 ;\
add hl,de ; | 12cc or 18cc
sq3: ;/
ccf ; 4
rra ; 4
srl d ; 8
rra ; 4
; ----------
ld e,a ; 4
sbc hl,de ; 15
jr c,sq2 ;\
or 20h ; | branch 1: 23cc
db 254 ; | <-- start of `cp *` which is 7cc to skip the next byte.
sq2: ; | branch 2: 21cc
add hl,de ;/
xor 18h ; 7
srl d ; 8
rra ; 4
; ----------
ld e,a ; 4
sbc hl,de ; 15
jr c,sq1 ;\
or 8 ; | branch 1: 23cc
db 254 ; | <-- start of `cp *` which is 7cc to skip the next byte.
sq1: ; | branch 2: 21cc
add hl,de ;/
xor 6 ; 7
srl d ; 8
rra ; 4
; ----------
ld e,a ; 4
sbc hl,de ; 15
jr nc,+_ ; \
add hl,de ; 15 |
srl d ; 8 |
rra ; 4 | branch 1: 38cc
ret ; 10 | branch 2: 40cc
_: ; |
inc a ; 4 |
srl d ; 8 |
rra ; 4 |
ret ; 10 /
;Converts an 8-bit signed integer to a string
itoa_8:
;Input:
; A is a signed integer
; HL points to where the null-terminated ASCII string is stored (needs at most 5 bytes)
;Output:
; The number is converted to a null-terminated string at HL
;Destroys:
; Up to five bytes at HL
; All registers preserved.
;on 0 to 9: 252 D=0
;on 10 to 99: 258+20D D=0 to 9
;on 100 to 127: 277+20D D=0 to 2
;on -1 to -9: 276 D=0
;on -10 to -99: 282+20D D=0 to 9
;on -100 to -128: 301+20D D=0 to 2
;min: 252cc (+23cc over original)
;max: 462cc (-49cc over original)
;avg: 343.74609375cc = 87999/256
;54 bytes
push hl
push de
push bc
push af
or a
jp p,itoa_pos
neg
ld (hl),$1A ;start if neg char on TI-OS
inc hl
itoa_pos:
;A is on [0,128]
;calculate 100s place, plus 1 for a future calculation
ld b,'0'
cp 100 \ jr c,$+5 \ sub 100 \ inc b
;calculate 10s place digit, +1 for future calculation
ld de,$0A2F
inc e \ sub d \ jr nc,$-2
ld c,a
;Digits are now in D, C, A
; strip leading zeros!
ld a,'0'
cp b \ jr z,$+5 \ ld (hl),b \ inc hl \ .db $FE ; start of `cp *` to skip the next byte, turns into `cp $BB` which will always return nz and nc
cp e \ jr z,$+4 \ ld (hl),e \ inc hl
add a,c
add a,d
ld (hl),a
inc hl
ld (hl),0
pop af
pop bc
pop de
pop hl
ret
;This converts a fixed-point number to a string.
;It displays up to 3 digits after the decimal.
fixed88_to_str:
;Inputs:
; D.E is the fixed-point number
; HL points to where the string gets output.
; Needs at most 9 bytes.
;Outputs:
; HL is preserved
;Destroys:
; AF,DE,BC
;First check if the input is negative.
;If so, write a negative sign and negate
push hl
ld a,d
or a
jp p,+_
ld (hl),$1A ;negative sign on TI-OS
inc hl
xor a
sub e
ld e,a
sbc a,a
sub d
_:
;Our adjusted number is in A.E
;Now we can print the integer part
call itoa_8
;Check if we need to print the fractional part
xor a
cp e
jr z,fixed88_to_str_end
;We need to write the fractional part, so seek the end of the string
;Search for the null byte. A is already 0
cpir
;Write a decimal
dec hl
ld (hl),'.'
ld b,3
_:
;Multiply E by 10, converting overflow to an ASCII digit
call fixed88_to_str_e_times_10
inc hl
ld (hl),a
djnz -_
;Strip the ending zeros
ld a,'0'
_:
cp (hl)
dec hl
jr z,-_
;write a null byte
inc hl
inc hl
ld (hl),0
fixed88_to_str_end:
;restore HL
pop hl
ret
fixed88_to_str_e_times_10:
ld a,e
ld d,0
add a,a \ rl d
add a,a \ rl d
add a,e \ jr nc,$+3 \ inc d
add a,a
ld e,a
ld a,d
rla
add a,'0'
ret
sqrtA:
;Input: A
;Output: D is the square root, A is the remainder (input-D^2)
;Destroys: BC
;speed: 161+{0,6}+{0,1}+{0,1}+{0,3}
;min: 161cc
;max: 172cc
;avg: 166.5cc
;45 bytes
ld d,$40
sub d
jr nc,+_
add a,d
ld d,0
_:
set 4,d
sub d
jr nc,+_
add a,d
.db $01 ;start of ld bc,** which is 10cc to skip the next two bytes.
_:
set 5,d
res 4,d
srl d
set 2,d
sub d
jr nc,+_
add a,d
.db $01 ;start of ld bc,** which is 10cc to skip the next two bytes.
_:
set 3,d
res 2,d
srl d
inc d
sub d
jr nc,+_
add a,d
dec d
_:
inc d
srl d
ret
sqrtfixed_88:
;Input: A.E ==> D.E
;Output: DE is the sqrt, AHL is the remainder
;Speed: 690+6{0,13}+{0,3+{0,18}}+{0,38}+sqrtA
;min: 855cc
;max: 1003cc
;avg: 924.5cc
;152 bytes
call sqrtA
ld l,a
ld a,e
ld h,0
ld e,d
ld d,h
sla e
rl d
sll e \ rl d
add a,a \ adc hl,hl
add a,a \ adc hl,hl
sbc hl,de
jr nc,+_
add hl,de
dec e
.db $FE ;start of `cp *`
_:
inc e
sll e \ rl d
add a,a \ adc hl,hl
add a,a \ adc hl,hl
sbc hl,de
jr nc,+_
add hl,de
dec e
.db $FE ;start of `cp *`
_:
inc e
sll e \ rl d
add a,a \ adc hl,hl
add a,a \ adc hl,hl
sbc hl,de
jr nc,+_
add hl,de
dec e
.db $FE ;start of `cp *`
_:
inc e
sll e \ rl d
add a,a \ adc hl,hl
add a,a \ adc hl,hl
sbc hl,de
jr nc,+_
add hl,de
dec e
.db $FE ;start of `cp *`
_:
inc e
;Now we have four more iterations
;The first two are no problem
sll e \ rl d
add hl,hl
add hl,hl
sbc hl,de
jr nc,+_
add hl,de
dec e
.db $FE ;start of `cp *`
_:
inc e
sll e \ rl d
add hl,hl
add hl,hl
sbc hl,de
jr nc,+_
add hl,de
dec e
.db $FE ;start of `cp *`
_:
inc e
sqrtfixed_88_iter11:
;On the next iteration, HL might temporarily overflow by 1 bit
sll e \ rl d ;sla e \ rl d \ inc e
add hl,hl
add hl,hl
jr c,sqrtfixed_88_iter11_br0
;
sbc hl,de
jr nc,+_
add hl,de
dec e
jr sqrtfixed_88_iter12
sqrtfixed_88_iter11_br0:
or a
sbc hl,de
_:
inc e
;On the next iteration, HL is allowed to overflow, DE could overflow with our current routine, but it needs to be shifted right at the end, anyways
sqrtfixed_88_iter12:
ld b,a ;A is 0, so B is 0
add hl,hl
add hl,hl
rla
;AHL - (DE+DE+1)
sbc hl,de \ sbc a,b
inc e
or a
sbc hl,de \ sbc a,b
ret p
add hl,de
adc a,b
dec e
add hl,de
adc a,b
ret
; Requires
; mul16 ;BC*DE ==> DEHL
; DEHL_Div_BC ;DEHL/BC ==> DEHL
ncr_HL_DE:
;"n choose r", defined as n!/(r!(n-r)!)
;Computes "HL choose DE"
;Inputs: HL,DE
;Outputs:
; HL is the result
; "HL choose DE"
; carry flag reset means overflow
;Destroys:
; A,BC,DE,IX
;Notes:
; Overflow is returned as 0
; Overflow happens if HL choose DE exceeds 65535
; This algorithm is constructed in such a way that intermediate
; operations won't erroneously trigger overflow.
;66 bytes
ld bc,1
or a
sbc hl,de
jr c,ncr_oob
jr z,ncr_exit
sbc hl,de
add hl,de
jr c,$+3
ex de,hl
ld a,h
or l
push hl
pop ix
ncr_exit:
ld h,b
ld l,c
scf
ret z
ncr_loop:
push bc \ push de
push hl \ push bc
ld b,h
ld c,l
call mul16 ;BC*DE ==> DEHL
pop bc
call DEHL_Div_BC ;result in DEHL
ld a,d
or e
pop bc
pop de
jr nz,ncr_overflow
add hl,bc
jr c,ncr_overflow
pop bc
inc bc
ld a,b
cp ixh
jr c,ncr_loop
ld a,ixl
cp c
jr nc,ncr_loop
ret
ncr_overflow:
pop bc
xor a
ld b,a
ncr_oob:
ld h,b
ld l,b
ret
;Converts an 8-bit unsigned integer to a string
uitoa_8:
;Input:
; A is a signed integer
; HL points to where the null-terminated ASCII string is stored (needs at most 5 bytes)
;Output:
; The number is converted to a null-terminated string at HL
;Destroys:
; Up to four bytes at HL
; All registers preserved.
;on 0 to 9: 238 D=0
;on 10 to 99: 244+20D D=0 to 9
;on 100 to 255: 257+2{0,6}+20D D=0 to 5
;min: 238cc
;max: 424cc
;avg: 317.453125cc = 81268/256 = (238*10 + 334*90+313*156)/256
;52 bytes
push hl
push de
push bc
push af
;A is on [0,255]
;calculate 100s place, plus 1 for a future calculation
ld b,'0'
cp 100 \ jr c,$+5 \ sub 100 \ inc b
cp 100 \ jr c,$+5 \ sub 100 \ inc b
;calculate 10s place digit, +1 for future calculation
ld de,$0A2F
inc e \ sub d \ jr nc,$-2
ld c,a
;Digits are now in D, C, A
; strip leading zeros!
ld a,'0'
cp b \ jr z,$+5 \ ld (hl),b \ inc hl \ .db $FE ; start of `cp *` to skip the next byte, turns into `cp $BB` which will always return nz and nc
cp e \ jr z,$+4 \ ld (hl),e \ inc hl
add a,c
add a,d
ld (hl),a
inc hl
ld (hl),0
pop af
pop bc
pop de
pop hl
ret
;Converts a 16-bit signed integer to an ASCII string.
itoa_16:
;Input:
; DE is the number to convert
; HL points to where to write the ASCII string (up to 7 bytes needed).
;Output:
; HL points to the null-terminated ASCII string
; NOTE: This isn't necessarily the same as the input HL.
push de
push bc
push af
push hl
bit 7,d
jr z,+_
xor a
sub e
ld e,a
sbc a,a
sub d
ld d,a
ld (hl),$1A ;negative char on TI-OS
inc hl
_:
ex de,hl
ld bc,-10000
ld a,'0'-1
inc a \ add hl,bc \ jr c,$-2
ld (de),a
inc de
ld bc,1000
ld a,'9'+1
dec a \ add hl,bc \ jr nc,$-2
ld (de),a
inc de
ld bc,-100
ld a,'0'-1
inc a \ add hl,bc \ jr c,$-2
ld (de),a
inc de
ld a,l
ld h,'9'+1
dec h \ add a,10 \ jr nc,$-3
add a,'0'
ex de,hl
ld (hl),d
inc hl
ld (hl),a
inc hl
ld (hl),0
;No strip the leading zeros
pop hl
;If the first char is a negative sign, skip it
ld a,(hl)
cp $1A
push af
ld a,'0'
jr nz,$+3
inc hl
cp (hl)
jr z,$-2
;Check if we need to re-write the negative sign
pop af
jr nz,+_
dec hl
ld (hl),a
_:
pop af
pop bc
pop de
ret
;Converts a 16-bit unsigned integer to an ASCII string.
uitoa_16:
;Input:
; DE is the number to convert
; HL points to where to write the ASCII string (up to 6 bytes needed).
;Output:
; HL points to the null-terminated ASCII string
; NOTE: This isn't necessarily the same as the input HL.
push de
push bc
push af
ex de,hl
ld bc,-10000
ld a,'0'-1
inc a \ add hl,bc \ jr c,$-2
ld (de),a
inc de
ld bc,1000
ld a,'9'+1
dec a \ add hl,bc \ jr nc,$-2
ld (de),a
inc de
ld bc,-100
ld a,'0'-1
inc a \ add hl,bc \ jr c,$-2
ld (de),a
inc de
ld a,l
ld h,'9'+1
dec h \ add a,10 \ jr nc,$-3
add a,'0'
ex de,hl
ld (hl),d
inc hl
ld (hl),a
inc hl
ld (hl),0
;No strip the leading zeros
ld c,-6
add hl,bc
ld a,'0'
inc hl \ cp (hl) \ jr z,$-2
pop af
pop bc
pop de
ret
;Masked Sprite routine
putsprite_masked:
;Inputs:
; (A,L) = (x,y)
; B is height
; IX points to the sprite data
; first byte is the data
; second byte is mask
; continues, alternating like this.
;
;Outputs:
; Mask is ANDed to the buffer, then data is ORed on top of that.
;
;Destroys:
; AF, BC, DE, HL, IX
;
;Notes:
; To set a pixel...
; black: mask is any, data is 1
; white: mask is 0, data is 0
; clear: mask is 1, data is 0 (keeps the data from the buffer)
;
;This routine is free to use :)
;65 bytes (or 66 bytes if gbuf is not located at 0x**40
ld e,l
ld h,0
ld d,h
add hl,hl
add hl,de
add hl,hl
add hl,hl
ld e,a
and 7
ld c,a
xor e ;essentially gets E with the bottom 3 bits reset
#if (plotSScreen&255) = 64
inc a
rra
rra
rra
ld e,a
ld d,plotSScreen>>8
#else
rra
rra
rra
ld e,a
add hl,de
ld de,plotSScreen
#endif
add hl,de
putsprite_masked_loop:
push bc
xor a
ld d,(ix)
ld e,a
sub c
ld b,c
ld c,$FF
inc ix
ld a,(ix)
jr z,putsprite_masked_rotdone
putsprite_masked_rot:
scf
rra
rr c
srl d
rr e
djnz putsprite_masked_rot
putsprite_masked_rotdone:
and (hl)
or d
ld (hl),a
inc hl
ld a,(hl)
and c
or e
ld (hl),a
ld c,11
add hl,bc
inc ix
pop bc
djnz putsprite_masked_loop
ret
But if you want even faster and smaller, use a non-traditional mask technique by ORing the mask onto the buffer, then XORing the data on top of it. The format is less intuitive, but it allows for white/black/clear/invert instead of just white/black/clear:;Masked Sprite routine
putsprite_masked:
;Inputs:
; (A,L) = (x,y)
; B is height
; IX points to the sprite data
; first byte is the data
; second byte is mask
; continues, alternating like this.
;
;Outputs:
; Mask is ORed to the buffer, then data is XORed on top of that.
;
;Destroys:
; AF, BC, DE, HL, IX
;
;Notes:
; To set a pixel...
; black: mask is 1, data is 0
; white: mask is 1, data is 1
; clear: mask is 0, data is 0 (keeps the data from the buffer)
; invert: mask is 0, data is 1 (inverts the data from the buffer)
;
;This routine is free to use :)
;63 bytes (or 64 bytes if gbuf is not located at 0x**40
ld e,l
ld h,0
ld d,h
add hl,hl
add hl,de
add hl,hl
add hl,hl
ld e,a
and 7
ld c,a
xor e ;essentially gets E with the bottom 3 bits reset
#if (plotSScreen&255) = 64
inc a
rra
rra
rra
ld e,a
ld d,plotSScreen>>8
#else
rra
rra
rra
ld e,a
add hl,de
ld de,plotSScreen
#endif
add hl,de
putsprite_masked_loop:
push bc
xor a
ld d,(ix)
ld e,a
or c
ld b,c
ld c,e
inc ix
ld a,(ix)
jr z,putsprite_masked_rotdone
putsprite_masked_rot:
rra
rr c
srl d
rr e
djnz putsprite_masked_rot
putsprite_masked_rotdone:
or (hl)
xor d
ld (hl),a
inc hl
ld a,(hl)
or c
xor e
ld (hl),a
ld c,11
add hl,bc
inc ix
pop bc
djnz putsprite_masked_loop
ret
;133 bytes total
;This is made by Zeda, feel free to use it for whatever.
;Takes inputs for a big sprite and sets up masks and clipping
;requires 4 bytes of temporary RAM, but doesn't use SMC
spritetmp = 8000h ;relocate this as needed! Just need 4 bytes.
sprite_width = spritetmp+0
sprite_x = spritetmp+1
sprite_mask0 = spritetmp+2
sprite_mask1 = spritetmp+3
bigsprite_subroutine:
;Inputs:
; B is the X-coordinate
; C is the Y-Coordinate
; DE points to the sprite
; H is the height
; L is the width in bytes
;Outputs:
; carry flag is set if okay to draw, nc if out-of-bounds.
; B is height.
; C is width.
; HL points to the byte to start drawing at.
; DE points to where to start sourcing the sprite data
; (sprite_width) is the width of the sprite in bytes
; (sprite_x) is the intitial x coordinate to begin drawing at
; (sprite_mask0) is the left mask
; (sprite_mask1) is the right mask
;92 bytes
;First check if the sprite is on-screen in the horizontal direction
ld a,c
cp 64
jr c,+_
add a,h
ret nc
ld h,a
push hl
xor a
ld h,a
sub c
ex de,hl
add hl,de
dec a
jr nz,$-2
ex de,hl
pop hl
xor a
ld c,a
_:
;Next check h+c<=64
ld a,64
sub c
cp h
jr nc,+_
ld h,a
_:
;Make sure the height is not now 0
ld a,h
or a
ret z
;Save the width and height of the sprite
push hl ;height,width
ld h,b
ld (sprite_width),hl ;x,width
push de ;sprite pointer
;Set up a pointer to the routine for shifting the routine for shifting the sprite data
ld ixh,rshiftHA_7>>8
ld a,h
cpl
and 7
ld l,a
add a,a
add a,l
add a,rshiftHA_7&255
ld ixl,a
#if (rshiftHA_7&255)>234
jr nc,$+4
inc ixh
#endif
ld a,b
and 7
ld de,spritemask
add a,e
ld e,a
#if spritemask&255>248
jr nc,$+3
inc d
#endif
ld a,(de)
ld (sprite_mask0),a
cpl
ld (sprite_mask1),a
;
;
ld a,c
add a,a
sbc a,a
ld h,a
ld a,b
ld b,h
ld l,c
add hl,hl
add hl,bc
add hl,hl
add hl,hl
ld c,a
add a,a
sbc a,a
ld b,a
ld a,c
sra c
sra c
sra c
add hl,bc
ld bc,plotSScreen
add hl,bc
pop de
pop bc
;B is height
;C is width
ex de,hl
scf
ret
rshiftHA_7:
rr h \ rra
rr h \ rra
rr h \ rra
rr h \ rra
rr h \ rra
rr h \ rra
rr h \ rra
ex de,hl
ld e,a
ret
spritemask:
.db $00,$80,$C0,$E0,$F0,$F8,$FC,$FE
call_ix:
jp (ix)
Then you can draw a big sprite with OR logic:bigsprite_OR:
;Inputs:
; B is the X-coordinate
; C is the Y-Coordinate
; DE points to the sprite
; H is the height
; L is the width in bytes
;68 bytes
;Set up the clipping
call bigsprite_subroutine
ret nc
bigsprite_OR_loop:
push bc ;height,width
push de ;gbuf ptr
push hl ;sprite data pointer
ld a,(sprite_x)
ld c,a
add a,8
ld (sprite_x),a
spriteloop_OR:
push bc
push hl
ld h,(hl)
xor a
call call_ix
ld a,c
cp 96
jr nc,+_
ld a,(hl)
or d
ld (hl),a
ld a,c
_:
inc hl
add a,8
cp 96
jr nc,+_
ld a,(sprite_mask1)
ld a,(hl)
or e
ld (hl),a
_:
ld bc,11
add hl,bc
ex de,hl
pop hl
ld a,(sprite_width)
ld c,a
add hl,bc
pop bc
djnz spriteloop_OR
pop hl
inc hl
pop de
inc de
pop bc
dec c
jr nz,bigsprite_OR_loop
ret
Or draw with XOR logic:bigsprite_XOR:
;Inputs:
; B is the X-coordinate
; C is the Y-Coordinate
; DE points to the sprite
; H is the height
; L is the width in bytes
;68 bytes
;Set up the clipping
call bigsprite_subroutine
ret nc
bigsprite_XOR_loop:
push bc ;height,width
push de ;gbuf ptr
push hl ;sprite data pointer
ld a,(sprite_x)
ld c,a
add a,8
ld (sprite_x),a
spriteloop_XOR:
push bc
push hl
ld h,(hl)
xor a
call call_ix
ld a,c
cp 96
jr nc,+_
ld a,(hl)
xor d
ld (hl),a
ld a,c
_:
inc hl
add a,8
cp 96
jr nc,+_
ld a,(sprite_mask1)
ld a,(hl)
xor e
ld (hl),a
_:
ld bc,11
add hl,bc
ex de,hl
pop hl
ld a,(sprite_width)
ld c,a
add hl,bc
pop bc
djnz spriteloop_XOR
pop hl
inc hl
pop de
inc de
pop bc
dec c
jr nz,bigsprite_XOR_loop
ret
Or draw with AND logic:bigsprite_AND:
;Inputs:
; B is the X-coordinate
; C is the Y-Coordinate
; DE points to the sprite
; H is the height
; L is the width in bytes
;69 bytes
;Set up the clipping
call bigsprite_subroutine
ret nc
bigsprite_AND_loop:
push bc ;height,width
push de ;gbuf ptr
push hl ;sprite data pointer
ld a,(sprite_x)
ld c,a
add a,8
ld (sprite_x),a
spriteloop_AND:
push bc
push hl
ld h,(hl)
scf \ sbc a,a
call call_ix
ld a,c
cp 96
jr nc,+_
ld a,(hl)
and d
ld (hl),a
ld a,c
_:
inc hl
add a,8
cp 96
jr nc,+_
ld a,(sprite_mask1)
ld a,(hl)
and e
ld (hl),a
_:
ld bc,11
add hl,bc
ex de,hl
pop hl
ld a,(sprite_width)
ld c,a
add hl,bc
pop bc
djnz spriteloop_AND
pop hl
inc hl
pop de
inc de
pop bc
dec c
jr nz,bigsprite_AND_loop
ret
Or draw with Erase logic:bigsprite_Erase:
;Inputs:
; B is the X-coordinate
; C is the Y-Coordinate
; DE points to the sprite
; H is the height
; L is the width in bytes
;67 bytes
;Set up the clipping
call bigsprite_subroutine
ret nc
bigsprite_Erase_loop:
push bc ;height,width
push de ;gbuf ptr
push hl ;sprite data pointer
ld a,(sprite_x)
ld c,a
add a,8
ld (sprite_x),a
spriteloop_Erase:
push bc
push hl
ld h,(hl)
xor a
call call_ix
ld a,c
cp 96
jr nc,+_
ld a,d
cpl
and (hl)
ld (hl),a
ld a,c
_:
inc hl
add a,8
cp 96
jr nc,+_
ld a,e
cpl
and (hl)
ld (hl),a
_:
ld bc,11
add hl,bc
ex de,hl
pop hl
ld a,(sprite_width)
ld c,a
add hl,bc
pop bc
djnz spriteloop_Erase
pop hl
inc hl
pop de
inc de
pop bc
dec c
jr nz,bigsprite_Erase_loop
ret
Or draw with Overwrite logic:bigsprite_Overwrite:
;Inputs:
; B is the X-coordinate
; C is the Y-Coordinate
; DE points to the sprite
; H is the height
; L is the width in bytes
;71 bytes
;Set up the clipping
call bigsprite_subroutine
ret nc
bigsprite_Overwrite_loop:
push bc ;height,width
push de ;gbuf ptr
push hl ;sprite data pointer
ld a,(sprite_x)
ld c,a
add a,8
ld (sprite_x),a
spriteloop_Overwrite:
push bc
push hl
ld h,(hl)
xor a
call call_ix
ld a,c
cp 96
jr nc,+_
ld a,(sprite_mask0)
and (hl)
or d
ld (hl),a
ld a,c
_:
inc hl
add a,8
cp 96
jr nc,+_
ld a,(sprite_mask1)
and (hl)
or e
ld (hl),a
_:
ld bc,11
add hl,bc
ex de,hl
pop hl
ld a,(sprite_width)
ld c,a
add hl,bc
pop bc
djnz spriteloop_Overwrite
pop hl
inc hl
pop de
inc de
pop bc
dec c
jr nz,bigsprite_Overwrite_loop
ret
@Zeda: Nice performance aware exercise! (Except for the many push and pop which look a bit dated to me).Thanks :) I wrote these with apps in mind, so I tried to reduce the need for external RAM. I should definitely make versions that take full advantage of SMC, though!
I wonder how many are really interested in speeding up the calculations these days.
It seems all they care about is python, java and what-have-you-funky-high-level-language :/.
;Written by Zeda Thomas, free to use.
;This draws a circle centered at 8-bit coordinates and with radius up to 127.
;IX points to a `plot` routine that takes (B,C)=(x,y) as input and does something
;with it, like plot the pixel a certain color, or plot a "big" pixel, or whatever.
; plot
; Takes coordinates, (B,C) = (x,y) and plots the point.
;
;For example, on the TI-83+/84+/SE the plot routine might look like:
; plot:
; call getpixelloc
; ret nc ;Exit if the coordinates are out-of-bounds
; or (hl)
; ld (hl),a
; ret
;
;
; Required subroutines:
; call_ix:
; jp (ix)
circle:
;Input:
; (B,C) is the center (x,y)
; D is the radius, unsigned, less than 128 (0 or greater than 128 just quits).
; IX points to a `plot` routine that takes (B,C)=(x,y) as input.
ld a,d
add a,a
ret c
ret z
ld l,d
dec a
ld e,a
dec a ;if the pixel is only 1 wide, just plot the point
jp z,call_ix ;Jump to the plot routine
xor a
ld h,-1
ld d,1
scf ;skip the first plot
circleloop:
call nc,plot4
inc h
sub d
inc d
inc d
jr nc,circleloop
_:
dec l
call plot4
add a,e
dec e
ret z
dec e
jr nc,-_
jp circleloop
plot4:
;BC is center
;HL is x,y
push de
push af
push hl
push bc
;If H is 0, or L is 0, we need to draw only half
push hl
ld a,b
sub h
ld b,a
add a,h
add a,h
ld h,a
ld a,c
sub l
ld c,a
add a,l
add a,l
ld l,a
;B is x0-x
;C is y0-y
;H is x0+x
;L is y0+y
;plot(x0-x,y0-y)
;plot(x0+x,y0+y)
push bc
push hl
call call_ix ;call the plot routine
pop bc
push bc
call call_ix ;call the plot routine
;now swap the y coords
pop hl
pop bc
ld a,l
ld l,c
ld c,a
pop de
xor a
cp d
jr z,+_
cp e
jr z,+_
;plot(x0-x,y0+y)
;plot(x0+x,y0-y)
push hl
call call_ix ;call the plot routine
pop bc
call call_ix ;call the plot routine
_:
pop bc
pop hl
pop af
pop de
ret
The really cool feature about this is that you can define a custom plot routine pointed to by IX, so it isn't TI-specific, and you can do all sorts of wonky things like:;calling with `ld ix,pixelOn_2x2`
pixelOn_2x2:
sla b
ret c
sla c
ret c
push bc
call pixelOn
pop bc
inc b
push bc
call pixelOn
pop bc
inc c
push bc
call pixelOn
pop bc
dec b
jp pixelOn
;calling with `ld ix,pixelOn_circle`
pixelOn_circle:
ld a,b
cp 32
ret nc
add a,a
add a,a
ld b,a
ld a,c
cp 32
ret nc
add a,a
add a,a
ld c,a
ld d,4
push ix ;need to save IX!
ld ix,pixelOn
call circle
pop ix
ret
EDIT: I inlined some subroutines because there was no reason to have them called. It was a waste of clock cycles and space!;Written by Zeda Thomas, free to use.
;This draws the fill of a circle centered at 8-bit coordinates and with radius
;up to 127.
;IX points to a `horizontal line` routine that takes E=x, A=y, D=width as input
;and does something with it, like plot a horizontal line.
;
; For example, on the ti-83+/84+/SE calculators, you might have:
; horizontal_line:
; ld b,e
; ld c,a
; ld e,1
; ld hl,gbuf
; jp rectOR
; Required subroutines:
; call_ix:
; jp (ix)
filledcircle:
;Input:
; (B,C) is the center (x,y)
; D is the radius, unsigned, less than 128 (0 or greater than 128 just quits).
; IX points to a `plot` routine that takes (B,C)=(x,y) as input.
ld a,d
add a,a
ret c
ret z
ld l,d
dec a
ld e,a
xor a
ld h,-1
ld d,1
filledcircleloop:
; call c,fillcircle_plot
inc h
sub d
inc d
inc d
jr nc,filledcircleloop
_:
dec l
call fillcircle_plot
add a,e
dec e
ret z
dec e
jr nc,-_
jp filledcircleloop
fillcircle_plot:
inc h
dec h
ret z
push hl
push de
push bc
push af
dec h
ld a,b
sub h
ld e,a
ld d,h
sll d ;aka `slia`, undocumented
ld a,l
or a
ld h,c
jr z,+_
add a,h
push de
push hl
call nz,call_ix
pop hl
pop de
_:
ld a,h
sub l
call call_ix
pop af
pop bc
pop de
pop hl
ret
;Requires `call_ix` routine that looks like:
; call_ix:
; jp (ix)
;
;IX points to a comparison routine that takes as input:
; HL points to the input element to find
; DE is the element to compare to
;Outputs are:
; carry set if the HL element is less than the DE element
; carry reset if the HL element is greater than or equal to the DE element
; z set if the HL element is equal to the DE element
; nz set if the HL element is not equal to the DE element
;
;This is useful if you have a table of pointers to strings, and want to find a
;string in the array. See the end of this file for an example.
;
binarysearch:
;This is a general-purpose binary search routine.
;
;Inputs:
; BC is the element to find
; HL is the number of elements
; IX points to a callback routine that compares the input element
;Outputs:
; DE is the matching element index
; z means match found
; nz means no match
; In this case, DE is interpreted as where the match should have been
;Destroys:
; AF, DE
;RAM needed:
; 10 bytes of stack space (4 pushes and a call)
;
ld de,-1
binarysearch_loop_inc_lower:
inc de
binarysearch_loop:
push hl
push de
or a
sbc hl,de
jr z,binarysearch_nomatch
rr h
rr l
add hl,de
ld d,h
ld e,l
push hl ;test index
push bc ;input
ld h,b
ld l,c
call call_ix
pop bc ;input
jr nc,binarysearch_input_bigger_or_equal
pop hl ;test index is the new upper index
pop de ;restore the lower index
pop af ;dummy pop
jr binarysearch_loop
binarysearch_input_bigger_or_equal:
pop de ;test index +1 is the new lower index
pop hl ;dummy pop
pop hl ;restore upper index
jr nz,binarysearch_loop_inc_lower
;a match was found
;DE is left as the index
ret
binarysearch_nomatch:
pop de ;lower index
pop hl ;upper index (also lower index and test index)
or 1
ret
; This routine requires the following subroutine:
; call_ix:
; jp (ix)
;
; To use SMC, define SMC.
; #define SMC
;
; If you are not using SMC, you'll need to define `arraylen` to point
;to 2 bytes of scrap RAM
heapsort:
;Inputs:
; BC is the size of the array.
; IX points to the routine that compares or swpas two entries
; HL is the index of the first element
; DE is the index of the second element
; c means swap
; nc means compare HL to DE
; Should return:
; z if they are equal
; nc if HL>=DE
; c if HL<DE
;Outputs:
; The data is sorted.
;Destroys:
; All
;Notes:
; You can make the comparison routine work any way that you want :)
; For example, you can invert the carry flag output to sort descending.
;If the array is size 0 or 1, then it is sorted
inc b
dec b
jr nz,heapsort_heapify
dec c
ret z
inc c
ret z
heapsort_heapify:
call heapify
ld hl,(arraylen)
heapsort_loop:
dec hl
ld (arraylen),hl
push hl
;HL is the last element
ld de,0
call heapsort_swap
ld bc,1
call propogate
pop hl
ld a,h
or l
jr nz,heapsort_loop
ret
heapify:
;Inputs:
; HL points to the array data
; BC is the size of the array. NOT GREATER THAN 32767
; IX points to the routine that compares the values
ld (arraylen),bc
srl b
rr c
_:
push bc
call propogate
pop bc
dec bc
ld a,b
or c
jr nz,-_
ret
propogate:
;BC-1 is the parent index
;2BC-1 is the child0 index
;2BC is the child1 index
sla c
rl b
ld d,b
ld e,c
#ifdef SMC
arraylen=$+1
ld hl,0
#else
ld hl,(arraylen)
#endif
sbc hl,de
add hl,de
ret c ;no children
;z means one child
;compare the two children
ld h,b
ld l,c
dec hl
;HL is the child0 index
;DE is the child1 index
call nz,heapsort_cmp
jr nc,+_
ex de,hl
_:
;parent is (HL+1)>>1
;child is HL
ld d,h
ld e,l
inc hl
srl h
rr l
dec hl
call heapsort_cmp
ret nc
call heapsort_swap
;values heapsort_swapped, now set parent=child+1
ld b,d
ld c,e
inc bc
jr propogate
heapsort_swap:
;HL and DE are the indices of the elements to heapsort_swap
push hl
push de
scf
call call_ix
pop de
pop hl
ret
heapsort_cmp:
push hl
push de
or a
call call_ix
pop de
pop hl
ret
;HL must be non-zero
add hl,hl
sbc a,a
and %00101101
xor l
ld l,a
ld a,r
add a,a ;Because R is a 7-bit register
add a,h
;HL is the new seed
;A is the output, or AL for 16 bits.
;HL must be non-zero
add hl,hl
sbc a,a
and %10000111
xor l
ld l,a
ld a,r
add a,a ;Because R is a 7-bit register
add a,h
;HL is the new seed
;A is the output, or AL for 16 bits.
;E must be non-zero
;10 bytes, 39cc
;min period: 255
;max period: 32640
;avg period: 21760
sla e
sbc a,a
and %01011111
xor e
ld e,a
ld a,r
add a,e
;E is the new seed, non-zero so long as the input was non-zero
;A is the output
mul_hl_1337: ; size optimized
;17 bytes, 173cc
ld b,h
ld c,l
add hl,hl
add hl,hl
add hl,bc
add hl,hl
add hl,hl
add hl,hl
add hl,bc
add hl,hl
add hl,bc
add hl,hl
add hl,bc
add hl,hl
add hl,hl
add hl,hl
add hl,bc
mul_hl_1337: ; speed optimized
;25 bytes, 148cc
ld b,h
ld c,l
add hl,hl
add hl,bc
add hl,hl
add hl,bc
ld b,h
ld c,l
add hl,hl
add hl,bc
ld a,h
ld h,l
rra
rr h
rra
rr h
rra
and 11000000b
ld l,a
or a
sbc hl,bc
Wow, that's really neat. I don't have a use for that at the moment but I bet I can find one in some old, slow code.It looks through all the numbers 1-65535 to find some good code.
How did you go about making the routine generator? I assume there are some simple rules it follows.