Assembly code snippets
Details
Title | Fast Alpha Blend (MMX) |
---|---|
Author | bitRAKE |
Submitted by: | bitRAKE |
Date added: | 2002-02-17 21:46:06 |
Date modified: | 2002-02-17 21:46:06 |
Comments
Under 7 cycles per pixel - too bad memory is so slow. For better speed: process your graphics in small bands, or integrate your other pixel processing with this code. That way the data will still be in the cache.
Snippet
;-----------------------------------------------------------------------
; AlphaBlend - blend Source into Destination in place, one 8-byte quad
;              (two 4-byte pixels) per loop pass, using MMX.
;
; Syntax: MASM-style, 32-bit. Source, Destination and Count are names
;         defined outside this routine.
; In:     Source      -> source pixels (read only)
;         Destination -> destination pixels (overwritten with result)
;         Count       =  number of quads, i.e. number of pixels / 2
; Out:    nothing in registers; Destination[0 .. Count*8) is rewritten.
;         A Count of 0 returns without touching either buffer.
; Clobb:  eax, ecx, edx, flags, mm0-mm5, mm7
; Stack:  ret 12 releases 12 bytes of caller-pushed arguments.
;
; Per byte:  d' = sat8( d + (((s - d) * 2 * w) >> 16) )
;   w = the upper 16-bit word of the source pixel, shifted right by 1,
;       so its top byte dominates the weight. Presumably that byte is
;       the alpha channel of a 32-bit ARGB pixel - confirm with callers.
;   The same weight is applied to all four bytes of the pixel.
;
; Quads are processed from the last one down to index 0, and only bytes
; inside [0 .. Count*8) of either buffer are ever read or written.
;
; Timing: the original author measured 13056 ticks for 1000 quads on an
; Athlon with the next quad's loads scheduled ahead of the loop branch.
; Not re-measured with the loads at the top of the loop.
;-----------------------------------------------------------------------
AlphaBlend:
        mov     eax,Source
        mov     edx,Destination
        mov     ecx,Count               ; number of pixels/2
        pxor    mm7,mm7                 ; mm7 = 0, zero-extends bytes in the unpacks
        dec     ecx                     ; ecx = index of the last quad
        js      AlphaBlend_Done         ; Count of 0: index is negative, nothing to do

AlphaBlend_Quad:
        movq    mm0,[eax+ecx*8]         ; mm0 = two source pixels
        movq    mm1,[edx+ecx*8]         ; mm1 = two destination pixels

        movq    mm4,mm0
        movq    mm2,mm0
        psrlw   mm4,1                   ; halve each word so the weight is non-negative for pmulhw
        movq    mm3,mm1
        movq    mm5,mm4

        punpcklbw mm0,mm7               ; mm0 = low source pixel as 4 words
        punpcklbw mm1,mm7               ; mm1 = low destination pixel as 4 words
        punpckhbw mm2,mm7               ; mm2 = high source pixel as 4 words
        punpckhbw mm3,mm7               ; mm3 = high destination pixel as 4 words

        psubsw  mm0,mm1                 ; s - d, low pixel
        psubsw  mm2,mm3                 ; s - d, high pixel

        punpcklwd mm4,mm4               ; replicate the top word of the low pixel ...
        punpckhwd mm5,mm5               ; ... and of the high pixel ...
        punpckhdq mm4,mm4               ; ... into all four lanes: mm4 = w (low pixel)
        punpckhdq mm5,mm5               ; mm5 = w (high pixel)

        psllw   mm0,1                   ; double the difference to offset the halved weight
        psllw   mm2,1
        pmulhw  mm0,mm4                 ; high 16 bits of (s - d) * 2 * w
        pmulhw  mm2,mm5
        paddsw  mm0,mm1                 ; + d
        paddsw  mm2,mm3

        packuswb mm0,mm2                ; back to 8 bytes, saturated to 0..255
        movq    [edx+ecx*8],mm0

        dec     ecx                     ; step to the previous quad; SF set once below 0
        jns     AlphaBlend_Quad

AlphaBlend_Done:
        emms                            ; release MMX state so x87 code works after return
        ret     12
;-----------------------------------------------------------------------
; Unlabelled repeat of the AlphaBlend instruction sequence.
;
; NOTE(review): this sequence directly follows the `ret 12` that ends
; AlphaBlend and has no entry label of its own in the visible text, so
; nothing shown here can reach it. It looks like a duplicated paste of
; the routine; confirm and delete it if so.
;
; Syntax: MASM-style, 32-bit. Source, Destination and Count are names
;         defined outside this sequence.
; In:     Source      -> source pixels (read only)
;         Destination -> destination pixels (overwritten with result)
;         Count       =  number of 8-byte quads, i.e. number of pixels / 2
; Out:    nothing in registers; Destination[0 .. Count*8) is rewritten.
;         A Count of 0 returns without touching either buffer.
; Clobb:  eax, ecx, edx, flags, mm0-mm5, mm7
; Stack:  ret 12 releases 12 bytes of caller-pushed arguments.
;
; Per byte:  d' = sat8( d + (((s - d) * 2 * w) >> 16) )
;   w = the upper 16-bit word of the source pixel, shifted right by 1.
;       Presumably its top byte is the alpha of a 32-bit ARGB pixel -
;       confirm with callers.
;
; Quads are processed from the last one down to index 0, and only bytes
; inside [0 .. Count*8) of either buffer are ever read or written.
;-----------------------------------------------------------------------
        mov     eax,Source
        mov     edx,Destination
        mov     ecx,Count               ; number of pixels/2
        pxor    mm7,mm7                 ; mm7 = 0, zero-extends bytes in the unpacks
        dec     ecx                     ; ecx = index of the last quad
        js      AlphaBlend2_Done        ; Count of 0: index is negative, nothing to do

AlphaBlend2_Quad:
        movq    mm0,[eax+ecx*8]         ; mm0 = two source pixels
        movq    mm1,[edx+ecx*8]         ; mm1 = two destination pixels

        movq    mm4,mm0
        movq    mm2,mm0
        psrlw   mm4,1                   ; halve each word so the weight is non-negative for pmulhw
        movq    mm3,mm1
        movq    mm5,mm4

        punpcklbw mm0,mm7               ; mm0 = low source pixel as 4 words
        punpcklbw mm1,mm7               ; mm1 = low destination pixel as 4 words
        punpckhbw mm2,mm7               ; mm2 = high source pixel as 4 words
        punpckhbw mm3,mm7               ; mm3 = high destination pixel as 4 words

        psubsw  mm0,mm1                 ; s - d, low pixel
        psubsw  mm2,mm3                 ; s - d, high pixel

        punpcklwd mm4,mm4               ; replicate the top word of the low pixel ...
        punpckhwd mm5,mm5               ; ... and of the high pixel ...
        punpckhdq mm4,mm4               ; ... into all four lanes: mm4 = w (low pixel)
        punpckhdq mm5,mm5               ; mm5 = w (high pixel)

        psllw   mm0,1                   ; double the difference to offset the halved weight
        psllw   mm2,1
        pmulhw  mm0,mm4                 ; high 16 bits of (s - d) * 2 * w
        pmulhw  mm2,mm5
        paddsw  mm0,mm1                 ; + d
        paddsw  mm2,mm3

        packuswb mm0,mm2                ; back to 8 bytes, saturated to 0..255
        movq    [edx+ecx*8],mm0

        dec     ecx                     ; step to the previous quad; SF set once below 0
        jns     AlphaBlend2_Quad

AlphaBlend2_Done:
        emms                            ; release MMX state so x87 code works after return
        ret     12