MMX Technology Code Optimization
by Max I. Fomitchev

Listing One

; Pairing demo: the same integer add is placed after an MMX load from
; memory and after an MMX register-to-register move.
movq    mm0,[esi]   ; MMX load from memory + integer add: these two won't execute in the same cycle
add     eax,ebx
movq    mm0,mm1     ; MMX register move + integer add: these two would
add     eax,ebx

Listing Two
; A packed shift followed by an unpack.
; NOTE(review): presumably both need the same (single) MMX shifter unit,
; which is why they cannot issue together - confirm against the article text.
psraw       mm0,8    ; these two instructions won't execute in same cycle
punpckhbw   mm1,mm2

Listing Three
; Two MMX multiplies back to back.
; pmullw keeps the low 16 bits of each 16x16 product, pmulhw the high 16 bits.
pmullw  mm0,mm1      ; these two instructions won't execute in same cycle
pmulhw  mm2,mm1      ; (mnemonic is pmulhw; "pmullh" does not exist)

Listing Four
; First pair: the multiply reads mm0, which the add immediately before it writes.
paddw   mm0,mm1     ; these two instructions won't execute in same cycle
pmulhw  mm2,mm0     ; (mnemonic is pmulhw; "pmullh" does not exist)
; Second pair: a load into mm0 followed by a copy of mm0 into mm1.
movq    mm0,[esi]   ; these two would
movq    mm1,mm0

Listing Five
; mm2 = threshold, all memory references are L1 cache hits
; In-place threshold: every 16-bit word that is not greater (signed compare)
; than the threshold is replaced by zero, 8 bytes per iteration.
; NOTE(review): assumes ebx enters as a negative byte offset that reaches 0
; right after the last chunk (esi pointing past the end of the data) - confirm.
M:  movq    mm0,[esi + ebx]     ; 1  load four words
    movq    mm1,mm0             ;    keep a copy of the data
    pcmpgtw mm0,mm2             ; 2  mm0 = FFFFh where word > threshold, else 0
    pand    mm1,mm0             ; 3  zero the words that failed the compare
    movq    [esi + ebx],mm1     ; 4  store the result over the source
    add     ebx,8               ; 5  next chunk; ZF is set when the offset hits 0
    jnz     M                   ; keep looping until the offset reaches 0
                                ; total of 5*DataSize / 8 cycles

Listing Six
; Threshold loop unrolled twice: two independent 8-byte chunks per iteration
; (mm0/mm1 for the first, mm3/mm4 for the second); mm2 holds the threshold.
; NOTE(review): assumes ebx enters as a negative byte offset that reaches 0
; right after the last 16-byte chunk - confirm with the caller.
M:  movq    mm0,[esi + ebx]     ; 1  load first chunk
    movq    mm1,mm0             ;    copy of first chunk
    movq    mm3,[esi + ebx + 8] ; 2  load second chunk
    movq    mm4,mm3             ;    copy of second chunk
    pcmpgtw mm0,mm2             ; 3  mask: FFFFh where word > threshold
    pcmpgtw mm3,mm2
    pand    mm1,mm0             ; 4  zero the words that failed the compare
    pand    mm4,mm3
    movq    [esi + ebx],mm1     ; 5  store first chunk
    movq    [esi + ebx + 8],mm4 ; 6  store second chunk
    add     ebx,16              ; 7  next pair of chunks; ZF set when offset hits 0
    jnz     M                   ; keep looping until the offset reaches 0
                                ; total of 7*DataSize / 16 cycles

Listing Seven
; Unrolled threshold loop with the index update hoisted above the stores.
; Because ebx is already advanced by 16 when the stores run, they address
; [esi + ebx - 16] and [esi + ebx - 8]. No instruction after the add modifies
; the integer flags, so the final branch still tests the result of the add.
; NOTE(review): assumes ebx enters as a negative byte offset that reaches 0
; right after the last 16-byte chunk - confirm with the caller.
M:  movq    mm0,[esi + ebx]     ; 1  load first chunk
    movq    mm1,mm0             ;    copy of first chunk
    movq    mm3,[esi + ebx + 8] ; 2  load second chunk
    movq    mm4,mm3             ;    copy of second chunk
    pcmpgtw mm0,mm2             ; 3  mask: FFFFh where word > threshold (mm2)
    pcmpgtw mm3,mm2
    pand    mm1,mm0             ; 4  zero the words that failed the compare
    add     ebx,16              ;    advance the offset early; sets ZF for the branch
    pand    mm4,mm3             ; 5
    movq    [esi + ebx - 16],mm1 ;   store first chunk (offset already advanced)
    movq    [esi + ebx - 8],mm4 ; 6  store second chunk
    jnz     M                   ; keep looping until the offset reaches 0
                                ; total of 6*DataSize / 16 = 3*DataSize / 8 cycles

Listing Eight
short *p;     // original pointer, possibly not 8-byte aligned
short *pnew;  // p rounded up to the next 8-byte (64-bit) boundary
// Add 7, then clear the low three bits. The pointer goes through an int,
// so this only works where a pointer fits in an int (32-bit build).
pnew = (short*)(((int)p + 7) & ~7);

Listing Nine
    ...
    nop     ; padding inserted to align the branch target M below;
            ; the nop sits before the loop label, so it is not part of
            ; the loop and is executed only once
M:      ; loop entry (branch target)
   ...
   jz       M

Listing Ten
; Adds the next image row into the current one, 8 bytes (pixels) at a time,
; walking down a column before moving sideways.
movq    mm0,[esi]           ; load 8 pixels from current row
movq    mm1,[esi + image_width] ; load 8 pixels from the next row
paddb   mm0,mm1         ; add byte-wise (paddb wraps around on overflow)
movq    [esi],mm0           ; store the sums over the current row
 ...                        ; repeat for each row, then move to the next column

Listing Eleven
; poor scheduling: the add consumes mm0 immediately after the multiply writes it
pmullw  mm0,mm1     ; 3 cycle latency on Pentium II
paddw   mm2,mm0     ; this instruction will stall for 2 cycles
; optimal scheduling: two independent instructions fill the multiply latency
pmullw  mm0,mm1     ; 3 cycle latency on Pentium II
MMX inst 1          ; do something (2nd cycle)
MMX inst 2          ; do something (3rd cycle)
paddw   mm2,mm0     ; this instruction will execute without delay


Listing Twelve
; VxD fragment: two VMM service calls, each taking its value in EAX and the
; VM handle in EBX. Time, VMHandle and PriorityBoost are defined elsewhere.
include vmm.inc
 ...
mov eax,Time        ; time in ms
mov ebx,VMHandle    ; VM to adjust
VMMCall Adjust_Execution_Time ; EAX = time, EBX = VM handle
 ...
mov eax,PriorityBoost   ; use Time_Critical_Boost for best performance
mov ebx,VMHandle        ; VM to adjust
VMMCall Adjust_Exec_Priority ; EAX = boost, EBX = VM handle

Listing Thirteen 
CurrentThread = Get_Cur_Thread 
VMCPD_GET_THREAD (CurrentThread, MyVxD_Buff) 
 ... 
MMX instructions
 ... 
VMCPD_SET_THREAD (CurrentThread, MyVxD_Buff) 

Listing Fourteen
// Timing harness: raise process and thread priority, call testfunc() M times
// between two clock() readings, and print the average time per call.
clock_t c1, c2;   // clock() readings taken before and after the loop
 ...
SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS);
SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL);
c1 = clock();
for ( i = 0; i < M; i++ )   // M = number of repetitions (defined elsewhere)
    testfunc();
c2 = clock();
// elapsed ticks / repetitions / ticks-per-second = seconds per call
printf("%g seconds\n", float(c2 - c1)/M/CLOCKS_PER_SEC);

Listing Fifteen
; Averages each pair of adjacent bytes in a 16-byte block: the low and high
; byte of every 16-bit word are added and halved, and the 8 results are
; written back to [esi]. Bytes are zero-extended, i.e. treated as unsigned.
; mask = 00FF00FF00FF00FF (64-bit memory operand defined elsewhere)
movq    mm0,[esi]       ; Load first 8-byte chunk
movq    mm1,mm0
movq    mm2,[esi + 8]   ; Load second 8-byte chunk
pand    mm0,mask        ; Keep the low byte of each word of the first chunk
psrlw   mm1,8           ; Move the high byte of each word down (zero fill)
movq    mm3,mm2
pand    mm2,mask        ; Keep the low byte of each word of the second chunk
psrlw   mm3,8           ; Move the high byte of each word down (zero fill)
paddsw  mm0,mm1         ; Add high bytes and low bytes together (0..510)
paddsw  mm2,mm3
psraw   mm0,1           ; Divide the results by 2 (0..255)
psraw   mm2,1
packuswb    mm0,mm2     ; Pack two averaged chunks into one with unsigned
                        ; saturation; packsswb would clamp 128..255 to 127
movq    [esi],mm0       ; Write back

Listing Sixteen
; mm0 = mm2 = 76543210
; Unpacking a register with itself doubles each byte of the selected half.
punpckhbw   mm0,mm0     ; duplicate high bytes: 77665544
punpcklbw   mm2,mm2     ; duplicate low bytes: 33221100


Listing Seventeen
; Replicates every source byte four times, spread over mm0..mm3.
; NOTE(review): the result comments assume mm0 = mm2 = 76543210 on entry - confirm.
punpckhbw   mm0,mm0     ; mm0 = 77665544

movq   mm1,mm0          ; mm1 = copy of 77665544
punpcklbw   mm2,mm2     ; mm2 = 33221100
movq    mm3,mm2         ; mm3 = copy of 33221100
punpckhwd   mm0,mm0     ; mm0 = 77776666
punpcklwd   mm1,mm1     ; mm1 = 55554444
punpckhwd   mm2,mm2     ; mm2 = 33332222
punpcklwd   mm3,mm3     ; mm3 = 11110000

Listing Eighteen
; Expands the 8 bytes in mm0 to 8 words by interleaving them with zeros.
; The zeroed destination supplies the low byte of each word, so every source
; byte ends up in the high byte of its word (value << 8).
; NOTE(review): the second unpack is taken to be the high half; two identical
; low unpacks into separately zeroed registers would be redundant - confirm.
pxor    mm1,mm1         ; mm1 = 0
pxor    mm2,mm2         ; mm2 = 0
punpcklbw   mm1,mm0     ; mm0 = source data; mm1 = low 4 bytes as words
punpckhbw   mm2,mm0     ; mm2 = high 4 bytes as words

Listing Nineteen
; Byte-wise color-key select: where a source byte equals colorKey the
; destination byte is kept, everywhere else the source byte is taken.
; On entry mm4 = [src] and mm2 = [dest]; the result is left in mm2.
movq    mm0,mm4     ; mm0 = [src], mm2 = [dest]
pcmpeqb mm0,colorKey    ; mm0 = bitmask (FFh where src byte == colorKey)
pand    mm2,mm0     ; mm2 = [dest] AND bitmask
pandn   mm0,mm4     ; mm0 = NOT bitmask AND [src]
por     mm2,mm0     ; mm2 = mm2 OR mm0



4