A83: Optimized Movax's sprite routine


A83: Optimized Movax's sprite routine

It confounds me why the ever-so-popular sprite routine is not optimized, even by simple methods (and a, or equivalently or a, instead of cp 0, etc).  Here is the optimized version, 6 bytes smaller:
; Modified SPRXOR: XOR an 8x8 sprite into the graph buffer.
;   In:       a = x, e = y, bc = sprite address (8 bytes, one per row)
;   Destroys: af, bc, de, hl (and ix when x is not a multiple of 8)
;   No clipping is done: the unaligned path always writes two bytes per row.
; Savings over the original routine: 21 clocks if the sprite is aligned,
; 31 clocks if unaligned, 8 bytes in total (6 as first posted, plus 2 from
; folding the mask and the zero test into a single and).
SPRXOR:
    push    bc              ; park the sprite address until hl is computed
;====   Calculate the address in graphbuf   ====
    ld      hl,0
    ld      d,l             ; was ld d,0 (saves 3 clocks and 1 byte); de = y
    add     hl,de
    add     hl,de
    add     hl,de           ; hl = y*3
    add     hl,hl
    add     hl,hl           ; hl = y*12 (each buffer row is 12 bytes)
    ;    ld      d,0     ; already at zero! (saves 7 clocks and 2 bytes)
    ld      e,a
    srl     e
    srl     e
    srl     e               ; de = x/8 = byte column within the row
    add     hl,de
    ld      de,8e29h        ; base address of graphbuf
    add     hl,de           ; hl -> first buffer byte touched by the sprite
    and     00000111b       ; a = x mod 8, Z set if aligned
                            ; was ld b,7 / and b / cp 0 (saves 11 clocks and 3 bytes)
    jp      z,ALIGN
;====   Non aligned sprite blit starts here   ====
    pop     ix              ; ix -> sprite data
    ld      d,a             ; d = number of bits to shift each row (1..7)
    ld      e,8             ; e = rows left
LILOP:
    ld      b,(ix+0)        ; b = current sprite row
    ld      c,0             ; c collects the bits shifted out of b
    push    de              ; the shift loop below counts d down to zero
SHLOP:
    srl     b
    rr      c
    dec     d
    jr      nz,SHLOP
    pop     de              ; restore shift count and row counter
    ld      a,b
    xor     (hl)
    ld      (hl),a          ; left byte of the row
    inc     hl
    ld      a,c
    xor     (hl)
    ld      (hl),a          ; right byte of the row
    ld      bc,11           ; 12 bytes per row, minus the inc hl above
    add     hl,bc
    inc     ix
    dec     e
    jr      nz,LILOP
    ret    ; was jp done1 (saves 10 clocks and (3-1)=2 bytes)
;====   Aligned sprite blit starts here   ====
ALIGN:
    pop     de              ; de -> sprite data
    ld      b,8             ; b = rows left
ALOP1:
    ld      a,(de)
    xor     (hl)
    ld      (hl),a
    inc     de
    push    bc              ; bc is borrowed for the row stride
    ld      bc,12
    add     hl,bc           ; next buffer row
    pop     bc
    djnz    ALOP1
    ret                     ; without this the aligned path runs off the end
; Modified SPRXOR
I do admire the beauty of the routine, but I think such widely used code should be optimized as much as possible.
