Assembly code snippets

Details

Title	Matrix Transform of Vector (MMX)
Author	bitRAKE
Submitted by:	bitRAKE
Date added:	2002-02-23 17:52:52
Date modified:	2002-02-23 17:52:52

Comments

This algo is the fastest I've seen: 12 cycles on PMMX - P3MMX, and 10 cycles on Athlon MMX!

Snippet

; [ A B C D ]
; [ E F G H ]  X  [ W X Y Z ]  =  [ AW+BX+CY+DZ EW+FX+GY+HZ IW+JX+KY+LZ  ? ]
; [ I J K L ]

; Fixed-point format: each 16-bit number is a signed fraction made of
; 1 sign bit followed by 15 fraction bits:
;   s.111 1111 1111 1111   (s = sign bit)
NUMBER_SCALE EQU 15 ; number of fraction bits, i.e. one unit = 1 / 2^15

; Stack-relative arguments (text equates, so they are only valid while esp
; is unchanged).
; NOTE(review): the first stack argument normally sits at [esp + 4] on entry;
; these start at +8, so presumably one dword has been pushed (or another
; argument precedes these) before the code runs - confirm against the
; enclosing procedure.
    pMatrix EQU [esp +  8] ; pointer to the 3x4 transform matrix (3 rows of 4 words = 24 bytes)
    pVector EQU [esp + 12] ; pointer to the source vectors (8 bytes each)
    iNumVec EQU [esp + 16] ; number of vectors to transform
    pResult EQU [esp + 20] ; pointer to the destination for the transformed vectors (8 bytes each)

    ; Multiply every source vector (four signed 16-bit words) by the three
    ; matrix rows and store [row0.v row1.v row2.v ?] as four words per vector.
    ;
    ; Clobbers: eax, ecx, edx, EFLAGS, mm0-mm6. Ends with emms.
    ; Register roles:
    ;   mm0-mm2 = matrix rows 0-2 (loop invariant)
    ;   edx     = one past the end of the source buffer
    ;   eax     = one past the end of the destination buffer
    ;   ecx     = negative vector index, counts up from -iNumVec to 0
    mov ecx,iNumVec
    test ecx,ecx
    jle XformDone   ; count <= 0: nothing to do. Without this guard a zero
                    ; count would run the loop until ecx wrapped (~2^32
                    ; iterations) and a negative count would overrun too.
                    ; A count with the top bit set cannot describe a real
                    ; buffer anyway, because count*8 overflows 32 bits.
    mov eax,pMatrix
    lea edx,[ecx*8] ; size of source/dest vector buffer in bytes
    neg ecx

; load entire 3x4 matrix (three rows of four words)
    movq mm0,[eax +  0]
    movq mm1,[eax +  8]
    movq mm2,[eax + 16]

    mov eax,edx
    add edx,pVector ; edx = end of source buffer
    add eax,pResult ; eax = end of destination buffer
NextVect:
    ; Load vector (4 16-bit elements) into reg
    movq    mm3,[edx + ecx*8]
    inc ecx         ; sets ZF for the jnz at the bottom of the loop; nothing
                    ; in between may touch EFLAGS (MMX instructions and the
                    ; movq store do not)

    ; Lane notation below is "high dword, low dword".
    ; pmaddwd yields two partial sums per row: X1 = high pair, X2 = low pair.
    movq    mm4,mm3     ;copy to other regs for use by 3 pmadds
    pmaddwd mm3,mm0     ;multiply row0 X vector -> A1 A2

    movq    mm5,mm4
    pmaddwd mm4,mm1     ;multiply row1 X vector -> B1 B2

    movq    mm6,mm3     ; A1 A2
    pmaddwd mm5,mm2     ;multiply row2 X vector -> C1 C2

    punpckldq mm3,mm4   ; B2 A2
    punpckhdq mm6,mm4   ; B1 A1

    movq    mm4,mm5     ;add row2 high and low order 32-bit results
    punpckhdq mm5,mm5   ; C1 C1  (alternative: psrlq mm5,32 -> 0 C1)

    paddd   mm3,mm6     ; B1+B2 A1+A2
    paddd   mm5,mm4     ; C1+C1 C1+C2  (only the low dword is meaningful)

    ; NOTE(review): the shift is NUMBER_SCALE-2, not NUMBER_SCALE, so results
    ; come out 4x larger than a plain 1.15 x 1.15 renormalisation would give.
    ; Presumably the matrix is stored pre-scaled to leave headroom - confirm.
    psrad   mm3,NUMBER_SCALE-2
    psrad   mm5,NUMBER_SCALE-2

    packssdw mm3,mm5    ; pack dwords into words with signed saturation:
                        ; word0 = row0, word1 = row1, word2 = row2,
                        ; word3 = junk derived from C1+C1
; The 4th word of each stored vector is not a valid result; mask it off here
; if the consumer needs a defined value there.
    movq    [eax + ecx*8 - 8],mm3 ; store resulting vector (-8 undoes the
                                  ; early inc of ecx)

    jnz NextVect    ;then loop back to do the next one (ZF from inc ecx).

XformDone:
    emms            ;end MMX state (remove fp reg valids)