Calculator Community > The Axe Parser Project

Assembly Programmers - Help Axe Optimize!

<< < (60/60)

jo-thijs:
I found this in the Commands.inc file of axe1.2.2a:
p_IntNe:
; 16-bit "not equal" test: leaves HL = 0 when HL == DE, otherwise HL = 1.
   .db 8           ; size in bytes of the four instructions below (1+2+2+3)
   xor   a         ; clears carry so the SBC below is a plain subtract (also zeroes A)
   sbc   hl,de     ; HL = HL - DE; Z is set when the two values were equal
   jr   z,$+5      ; equal: jump over the 3-byte LD, keeping HL = 0
   ld   hl,1       ; not equal: result is 1

I can't find the purpose of xor a.

Runer112:
Reset the carry flag for sbc hl,de it seems. :P

Xeda112358:
I think I finally have a major optimization after having worked on link routines for the past couple of weeks. I didn't modify the timeout or syncing code, just the core get/send stuff. I've tested it and it is reliable.

For reference, in the event that p_SendByte doesn't have to wait, the new routine is 931cc vs 1647cc. Here are my proposed routines:

p_GetByte: +0 bytes, presumably as much faster as p_SendByte

--- Code: ---p_GetByte:
.db __GetByteEnd-$-1 ;Number of bytes between this byte and __GetByteEnd
;; Receive one byte through port 0.
;; Out: hl=-1 if the low two port bits did not read %01 (sender not ready);
;;      otherwise h=0 and l=the 8 bits shifted in, first bit received ends up in bit 0.
;; Interrupts are disabled and left disabled. a, b and c are overwritten.
;; The receive loop has no timeout of its own.
di
ld bc,$0803 ;b=8 bits to receive, c=%11 mask for the two low port bits
ld hl,-1 ;Value returned if we give up below
xor a
out (0),a ;Make sure we are reset
in a,(0)
and c ;Check to see if sender is ready
dec a
ret nz ;Low bits were not %01: go back with hl=-1
inc a ;a=1 again
out (0),a ;Relay a confirmation
ex (sp),hl ;Wait until confirmation is read (59 T-states minimum)
ex (sp),hl ;The ex pair restores hl and the stack; it only spends time
ld a,(de) ;Loaded value is discarded by the xor below; presumably timing padding
xor a
ld hl,$AA ;h=0; l=%10101010, whose bit 0 is compared against port bit 0 in the loop
out (0),a ;Reset the ports to receive data

__GetByteLoop:
    in a,(0)
    xor l           ;bit 0 = port bit 0 xor bit 0 of l
    rra
    jr c,__GetByteLoop ;Spin while those two bits differ
    in a,(0)
    rra
    rra             ;port bit 1 is now in carry
    rr l            ;carry goes into bit 7 of l; the next bit of the $AA pattern moves into bit 0
    djnz __GetByteLoop
;NOTE(review): an earlier comment here said the received bits still need inverting
;and masking with 0xAA "at the end"; no such step is present in this routine - confirm.
    ret
__GetByteEnd:

--- End code ---
   
p_SendByte: -4 bytes, -723cc

--- Code: ---p_SendByte:
    .db __SendByteEnd-$-1
;; Send the byte in e through port 0, least significant bit first.
;; hl is decremented once per poll while waiting for the receiver; if it reaches 0
;; the routine writes 0 to the port and returns without sending.
;; Interrupts are disabled and left disabled. a, b, c and hl are overwritten;
;; e is rotated 8 times on the send path and so ends up unchanged.
di
ld bc,$5503 ;b=$55: shifted left once per bit, empty after 8 shifts (counter); c=%11 mask for the two low port bits
ld a,%00000010
out (0),a ;Indicate we are ready to send
__SendByteTimeout:
dec hl
ld a,h
or l
jr z,__SendByteDone ;Counter ran out: a=0 here, so the out below writes 0 to the port
in a,(0) ;Loop is 59 T-states maximum
and c
jr nz,__SendByteTimeout ;Keep looping till we get it
out (0),a ;a=0 here
__SendLoop:
    rrc e           ;next data bit into carry
    ccf
    rla             ;shift the inverted data bit into a
    sla b           ;carry=next bit of the $55 pattern; Z is set on the 8th pass when b becomes 0
    ccf
    rla             ;shift the inverted pattern bit into a
;NOTE(review): a is never masked, so earlier bit pairs remain in its upper bits
;when written to the port - confirm the port ignores them.
    out (0),a
    ex (sp),hl      ;ex pair and nop: delay only, registers and stack are restored
    ex (sp),hl
    nop
    jr nz,__SendLoop ;Z still comes from sla b; nothing in between alters it
;need 37cc
    xor a
    ex (sp),hl
    ex (sp),hl
__SendByteDone:
    out (0),a       ;a=0 on both paths
    ret
__SendByteEnd:

--- End code ---
EDIT: I looked at the timeout code for p_SendByte, and realized that my code didn't need B to be a counter but instead I was using D as a kind of counter. By using B instead of D, I could cut out the ld d,$55, saving 2 bytes and 7cc.

Xeda112358:
Here is an optimized p_LineShr routine. NOTE: It flips the meaning of the carry flag on output, so the line routines that use this will need to ret c instead of ret nc.
Original routine

--- Code: ---p_LineShr:
.db __LineShrEnd-$-1
;; Shared set-up for line drawing: validates the end points and prepares the
;; buffer pointer, pixel mask, step and counters.
;; l=y2, ix=buff, (sp)=ret, (sp+2)=ret_2, (sp+4)=x2, (sp+6)=y1, (sp+8)=x1
;; Returns to ret with ret_2 left on top of the stack.
;; If any coordinate is out of range (y>=64 or x>=96): returns early with carry clear.
;; Otherwise carry is set and:
;;   ix=address of the byte holding the start pixel, c=mask of that pixel,
;;   de=+12 or -12 (row step), h=DX, l=DY (both non-negative),
;;   b=max(DX,DY)+1, a=max(DX,DY)/2, z set when DY>=DX.
ld a,l
pop bc ; bc=ret
pop hl ; hl=ret_2
pop de ; e=x2
ex (sp),hl ; hl=y1, ret_2 parked on the stack
ld d,l
pop hl ; hl=ret_2 again
ex (sp),hl ; hl=x1, ret_2 left on the stack
push bc ; put ret back on top

;; a=y2, d=y1, e=x2, l=x1, (sp)=ret, (sp+2)=ret_2
cp 64
ret nc
ld h,a
ld a,d
cp 64
ret nc

ld a,l
cp 96
ret nc
ld a,e
cp 96
ret nc

sub l ; a=x2-x1
jr nc,__LineShrSkipRev
ex de,hl ; x2<x1: swap the end points so the start is the leftmost one
neg

;; a=dx, d=y1, e=x2, h=y2, l=x1
__LineShrSkipRev:
push af ; Saving DX (it will be popped into HL below)
ld a,l ; IX+=L/8+D*12 (actually D*4+D*4+D*4)
rra
rra
rra
and %00011111
ld c,a
ld b,0
add ix,bc
ld a,d
add a,a
add a,a
ld c,a
add ix,bc
add ix,bc
add ix,bc
ld a,l ; Calculating the starting pixel mask
and %00000111
inc a
ld b,a
ld a,%00000001
__LineShrMaskLoop:
rrca
djnz __LineShrMaskLoop
ld c,a
ld a,h ; Calculating delta Y and negating the Y increment if necessary
sub d ; This is the last instruction for which we need the original data
ld de,12
jr nc,__LineShrSkipNeg
ld de,-12
neg
__LineShrSkipNeg:
pop hl ; Recalling DX
ld l,a ; H=DX, L=DY
cp h
jr nc,__LineVert ; Line is rather vertical than horizontal
ld a,h
__LineVert:
ld b,a ; Pixel counter
inc b
cp l ; z if the larger delta is DY
scf ; Setting up gradient counter
ccf ; scf+ccf = carry cleared so rra halves a
rra
scf
ret ; c=1, z=vertical major
__LineShrEnd:

--- End code ---
   
Optimized routine: -4 bytes, -13cc

--- Code: ---p_LineShr:
.db __LineShrEnd-$-1
;; Shared set-up for line drawing: validates the end points and prepares the
;; buffer pointer, pixel mask, step and counters.
;; l=y2, ix=buff, (sp)=ret, (sp+2)=ret_2, (sp+4)=x2, (sp+6)=y1, (sp+8)=x1
;; Returns to ret with ret_2 left on top of the stack.
;; If any coordinate is out of range (y>63 or x>95): returns early with carry SET.
;; Otherwise carry is clear and:
;;   ix=address of the byte holding the start pixel, c=mask of that pixel,
;;   de=+12 or -12 (row step), h=DX, l=DY (both non-negative),
;;   b=max(DX,DY)+1, a=max(DX,DY)/2, z set when DY>=DX.
ld a,l
pop bc ; bc=ret
pop hl ; hl=ret_2
pop de ; e=x2
ex (sp),hl ; hl=y1, ret_2 parked on the stack
ld d,l
pop hl ; hl=ret_2 again
ex (sp),hl ; hl=x1, ret_2 left on the stack
push bc ; put ret back on top

;; a=y2, d=y1, e=x2, l=x1, (sp)=ret, (sp+2)=ret_2
ld h,a
ld a,63 ; one constant serves both y checks: carry when 63<y
cp h
ret c
cp d
ret c

ld a,95 ; likewise for both x checks
cp l
ret c
cp e
ret c
ld a,e

sub l ; a=x2-x1
jr nc,__LineShrSkipRev
ex de,hl ; x2<x1: swap the end points so the start is the leftmost one
neg

;; a=dx, d=y1, e=x2, h=y2, l=x1
__LineShrSkipRev:
push af ; Saving DX (it will be popped into HL below)
ld a,d ; IX+=D*12 (actually D*4+D*4+D*4)
add a,a
add a,a
ld c,a
ld b,0
add ix,bc
add ix,bc
add ix,bc
ld a,l
and 7
ld e,a ; e=x1&7, pixel index inside the byte (x2 is no longer needed)
xor l ; a=x1 with its low 3 bits cleared; carry is clear
rra
rra
rra ; a=x1/8
ld c,a
add ix,bc ; IX+=L/8 (b is still 0)
ld b,e ; mask loop runs (x1&7)+1 times, so the count must come from e, not from the byte offset in a
inc b
ld a,%00000001
__LineShrMaskLoop:
rrca
djnz __LineShrMaskLoop
ld c,a
ld a,h ; Calculating delta Y and negating the Y increment if necessary
sub d ; This is the last instruction for which we need the original data
ld de,12
jr nc,__LineShrSkipNeg
ld de,-12
neg
__LineShrSkipNeg:
pop hl ; Recalling DX
ld l,a ; H=DX, L=DY
cp h
jr nc,__LineVert ; Line is rather vertical than horizontal
ld a,h
__LineVert:
ld b,a ; Pixel counter
inc b
cp l ; z if the larger delta is DY
res 0,a ; Setting up gradient counter
rrca ; bit 0 is 0, so this halves a and leaves carry clear; z from cp l is untouched
ret ; c=0, z=vertical major
__LineShrEnd:

--- End code ---



Or this version: it only saves 3 bytes, but saves 10 more clock cycles:

--- Code: ---p_LineShr:
.db __LineShrEnd-$-1
;; Shared set-up for line drawing: validates the end points and prepares the
;; buffer pointer, pixel mask, step and counters.
;; l=y2, ix=buff, (sp)=ret, (sp+2)=ret_2, (sp+4)=x2, (sp+6)=y1, (sp+8)=x1
;; Returns to ret with ret_2 left on top of the stack.
;; If any coordinate is out of range (y>63 or x>95): returns early with carry SET.
;; Otherwise carry is clear and:
;;   ix=address of the byte holding the start pixel, c=mask of that pixel,
;;   de=+12 or -12 (row step), h=DX, l=DY (both non-negative),
;;   b=max(DX,DY)+1, a=max(DX,DY)/2, z set when DY>=DX.
ld a,l
pop bc ; bc=ret
pop hl ; hl=ret_2
pop de ; e=x2
ex (sp),hl ; hl=y1, ret_2 parked on the stack
ld d,l
pop hl ; hl=ret_2 again
ex (sp),hl ; hl=x1, ret_2 left on the stack
push bc ; put ret back on top

;; a=y2, d=y1, e=x2, l=x1, (sp)=ret, (sp+2)=ret_2
ld h,a
ld a,63 ; one constant serves both y checks: carry when 63<y
cp h
ret c
cp d
ret c

ld a,95 ; likewise for both x checks
cp l
ret c
cp e
ret c
ld a,e

sub l ; a=x2-x1
jr nc,__LineShrSkipRev
ex de,hl ; x2<x1: swap the end points so the start is the leftmost one
neg

;; a=dx, d=y1, e=x2, h=y2, l=x1
__LineShrSkipRev:
ld e,a ; Saving DX (x2 is no longer needed)
ld a,l ; IX+=L/8+D*12 (actually D*4+D*4+D*4)
rra
rra
rra
and %00011111
ld c,a
ld b,0
add ix,bc
ld a,d
add a,a
add a,a
ld c,a
add ix,bc
add ix,bc
add ix,bc
ld a,l ; Calculating the starting pixel mask
and %00000111
inc a
ld b,a
ld a,%00000001
__LineShrMaskLoop:
rrca
djnz __LineShrMaskLoop
ld c,a
ld a,h ; Calculating delta Y and negating the Y increment if necessary
sub d ; This is the last instruction for which we need the original data

ld h,e ; DX (must happen before de is reloaded; ld leaves the carry from sub d intact)

ld de,12
jr nc,__LineShrSkipNeg
ld de,-12
neg
__LineShrSkipNeg:
ld l,a ; DY, stored only after the neg so that L is never negative
cp h
jr nc,__LineVert ; Line is rather vertical than horizontal
ld a,h
__LineVert:
ld b,a ; Pixel counter
inc b
cp l ; z if the larger delta is DY
res 0,a ; Setting up gradient counter
rrca ; bit 0 is 0, so this halves a and leaves carry clear; z from cp l is untouched
ret ; c=0, z=vertical major
__LineShrEnd:

--- End code ---

Xeda112358:
p_EQ0

The current routine is 7 bytes and 36cc:

--- Code: ---;7 bytes, 36cc
; HL = 1 if HL was 0, otherwise HL = 0.
ld a,l
or h        ; a = 0 only when both halves of HL are 0
add a,255   ; carry set exactly when a was non-zero
sbc hl,hl   ; HL = 0 - carry: $0000 or $FFFF
inc hl      ; -> 1 when HL was zero, 0 otherwise

--- End code ---

But we can save 8cc without sacrificing bytes:

--- Code: ---;7 bytes, 28cc
; HL = 1 if HL was 0, otherwise HL = 0 (built from 8-bit operations only).
xor a       ; a = 0, carry clear
cp h        ; 0 - H: carry set when H is non-zero
ld h,a      ; result high byte is always 0
sbc a,l     ; 0 - L - carry: carry set when L is non-zero or H was non-zero
sbc a,a     ; a = $FF if HL was non-zero, $00 if it was zero
ld l,a
inc l       ; L = 0 for non-zero input, 1 for zero input

--- End code ---

Navigation

[0] Message Index

[*] Previous page

Go to full version