Assembly code snippets
Details
Title | Matrix Transform of Vector (MMX) |
---|---|
Author | bitRAKE |
Submitted by: | bitRAKE |
Date added: | 2002-02-23 17:52:52 |
Date modified: | 2002-02-23 17:52:52 |
Comments
This algorithm is the fastest I've seen: 12 cycles on Pentium MMX through Pentium III (PMMX - P3MMX), and 10 cycles on Athlon MMX!
Snippet
; Matrix transform of an array of 4-element, 16-bit fixed-point vectors (MMX).
; Syntax: MASM (Intel operand order), 32-bit x86.
;
;   [ A B C D ]
;   [ E F G H ] X [ W X Y Z ] = [ AW+BX+CY+DZ EW+FX+GY+HZ IW+JX+KY+LZ ? ]
;   [ I J K L ]
;
; Each 8-byte source vector (4 signed words) is multiplied by the three
; 4-word matrix rows; the three dot products are shifted right, saturated to
; signed words and stored as words 0..2 of the 8-byte result.  Word 3 of the
; result (the "?") is NOT meaningful: it is a by-product of the row-2
; horizontal add (see the loop) and should be ignored or masked by the caller.
;
; 16 bit numbers are scaled to a fixed point size of:
;   1.111 1111 1111 1111 ; first bit is sign bit
;
; In:    stack slots named by the EQUs below
; Out:   iNumVec 8-byte results written at pResult
; Clobb: eax, ecx, edx, mm0-mm6, flags; x87/MMX tag state is reset by emms
;        (only when at least one vector was processed)
;
; NOTE(review): the arguments start at [esp + 8], so something other than the
; return address occupies [esp + 4] (a pushed register or a leading argument).
; That is outside this snippet - confirm against the enclosing procedure.

NUMBER_SCALE    EQU     15              ; 1 / 2^15
pMatrix         EQU     [esp + 8]       ; pointer to 3x4 matrix: 3 rows of 4 words (24 bytes)
pVector         EQU     [esp + 12]      ; source vectors pointer
iNumVec         EQU     [esp + 16]      ; number of vectors to transform
pResult         EQU     [esp + 20]      ; destination for transformed vectors

        mov     ecx, iNumVec
        mov     eax, pMatrix
        lea     edx, [ecx*8]            ; size of source/dest vector buffer (lea leaves flags alone)
        neg     ecx                     ; ecx = -count: index runs up to 0; ZF set when count == 0
        jz      NoVectors               ; count == 0: without this the loop would wrap through
                                        ; ~2^32 iterations; no MMX touched yet, so skip emms too

        ; load entire 3x4 matrix
        movq    mm0, [eax + 0]          ; row 0: A B C D
        movq    mm1, [eax + 8]          ; row 1: E F G H
        movq    mm2, [eax + 16]         ; row 2: I J K L

        mov     eax, edx
        add     edx, pVector            ; edx = one past the last source vector
        add     eax, pResult            ; eax = one past the last result slot

; Loop invariant: ecx in [-count, -1] on entry; [edx + ecx*8] is the current
; source vector.  The instruction order below is hand-scheduled - keep it.
NextVect:
        ; Load vector (4 16-bit elements) into reg
        movq    mm3, [edx + ecx*8]
        inc     ecx                     ; sets ZF for the jnz at the bottom; nothing in between
                                        ; (MMX ops, movq) modifies EFLAGS
        movq    mm4, mm3                ; copy to other regs for use by 3 pmadds
        pmaddwd mm3, mm0                ; row0 X vector -> two partial sums (A1:A2 = high:low dword)
        movq    mm5, mm4
        pmaddwd mm4, mm1                ; row1 X vector -> B1:B2
        movq    mm6, mm3                ; A1 A2
        pmaddwd mm5, mm2                ; row2 X vector -> two partial sums
        punpckldq mm3, mm4              ; B2 A2
        punpckhdq mm6, mm4              ; B1 A1
        movq    mm4, mm5                ; add row2 high and low order 32-bit results
        punpckhdq mm5, mm5              ; high dword copied to both halves (alt: psrlq mm5,32)
        paddd   mm3, mm6                ; B1+B2 A1+A2
        paddd   mm5, mm4                ; low dword = row2 sum; high dword = 2*high partial (junk)
        ; NOTE(review): the shift is NUMBER_SCALE-2 (13), not NUMBER_SCALE, so
        ; results carry 2 more fraction bits than a plain 1.15 x 1.15 product
        ; would - presumably one operand is stored with 2 extra integer bits;
        ; confirm against the data format used by the caller.
        psrad   mm3, NUMBER_SCALE-2
        psrad   mm5, NUMBER_SCALE-2
        packssdw mm3, mm5               ; pack dwords into words (signed saturation):
                                        ; words = row0, row1, row2, junk
        ; might need to mask off high word of MMX reg?  (word 3 is the junk value)
        movq    [eax + ecx*8 - 8], mm3  ; store resulting vector (-8: ecx already incremented)
        jnz     NextVect                ; then loop back to do the next one.

        emms                            ; end MMX state (remove fp reg valids)
NoVectors:
; Matrix transform of an array of 4-element, 16-bit fixed-point vectors (MMX).
; Syntax: MASM (Intel operand order), 32-bit x86.
;
;   [ A B C D ]
;   [ E F G H ] X [ W X Y Z ] = [ AW+BX+CY+DZ EW+FX+GY+HZ IW+JX+KY+LZ ? ]
;   [ I J K L ]
;
; Each 8-byte source vector (4 signed words) is multiplied by the three
; 4-word matrix rows; the three dot products are shifted right, saturated to
; signed words and stored as words 0..2 of the 8-byte result.  Word 3 of the
; result (the "?") is NOT meaningful: it is a by-product of the row-2
; horizontal add (see the loop) and should be ignored or masked by the caller.
;
; 16 bit numbers are scaled to a fixed point size of:
;   1.111 1111 1111 1111 ; first bit is sign bit
;
; In:    stack slots named by the EQUs below
; Out:   iNumVec 8-byte results written at pResult
; Clobb: eax, ecx, edx, mm0-mm6, flags; x87/MMX tag state is reset by emms
;        (only when at least one vector was processed)
;
; NOTE(review): the arguments start at [esp + 8], so something other than the
; return address occupies [esp + 4] (a pushed register or a leading argument).
; That is outside this snippet - confirm against the enclosing procedure.

NUMBER_SCALE    EQU     15              ; 1 / 2^15
pMatrix         EQU     [esp + 8]       ; pointer to 3x4 matrix: 3 rows of 4 words (24 bytes)
pVector         EQU     [esp + 12]      ; source vectors pointer
iNumVec         EQU     [esp + 16]      ; number of vectors to transform
pResult         EQU     [esp + 20]      ; destination for transformed vectors

        mov     ecx, iNumVec
        mov     eax, pMatrix
        lea     edx, [ecx*8]            ; size of source/dest vector buffer (lea leaves flags alone)
        neg     ecx                     ; ecx = -count: index runs up to 0; ZF set when count == 0
        jz      NoVectors               ; count == 0: without this the loop would wrap through
                                        ; ~2^32 iterations; no MMX touched yet, so skip emms too

        ; load entire 3x4 matrix
        movq    mm0, [eax + 0]          ; row 0: A B C D
        movq    mm1, [eax + 8]          ; row 1: E F G H
        movq    mm2, [eax + 16]         ; row 2: I J K L

        mov     eax, edx
        add     edx, pVector            ; edx = one past the last source vector
        add     eax, pResult            ; eax = one past the last result slot

; Loop invariant: ecx in [-count, -1] on entry; [edx + ecx*8] is the current
; source vector.  The instruction order below is hand-scheduled - keep it.
NextVect:
        ; Load vector (4 16-bit elements) into reg
        movq    mm3, [edx + ecx*8]
        inc     ecx                     ; sets ZF for the jnz at the bottom; nothing in between
                                        ; (MMX ops, movq) modifies EFLAGS
        movq    mm4, mm3                ; copy to other regs for use by 3 pmadds
        pmaddwd mm3, mm0                ; row0 X vector -> two partial sums (A1:A2 = high:low dword)
        movq    mm5, mm4
        pmaddwd mm4, mm1                ; row1 X vector -> B1:B2
        movq    mm6, mm3                ; A1 A2
        pmaddwd mm5, mm2                ; row2 X vector -> two partial sums
        punpckldq mm3, mm4              ; B2 A2
        punpckhdq mm6, mm4              ; B1 A1
        movq    mm4, mm5                ; add row2 high and low order 32-bit results
        punpckhdq mm5, mm5              ; high dword copied to both halves (alt: psrlq mm5,32)
        paddd   mm3, mm6                ; B1+B2 A1+A2
        paddd   mm5, mm4                ; low dword = row2 sum; high dword = 2*high partial (junk)
        ; NOTE(review): the shift is NUMBER_SCALE-2 (13), not NUMBER_SCALE, so
        ; results carry 2 more fraction bits than a plain 1.15 x 1.15 product
        ; would - presumably one operand is stored with 2 extra integer bits;
        ; confirm against the data format used by the caller.
        psrad   mm3, NUMBER_SCALE-2
        psrad   mm5, NUMBER_SCALE-2
        packssdw mm3, mm5               ; pack dwords into words (signed saturation):
                                        ; words = row0, row1, row2, junk
        ; might need to mask off high word of MMX reg?  (word 3 is the junk value)
        movq    [eax + ecx*8 - 8], mm3  ; store resulting vector (-8: ecx already incremented)
        jnz     NextVect                ; then loop back to do the next one.

        emms                            ; end MMX state (remove fp reg valids)
NoVectors: