MMX Technology Code Optimization
by Max I. Fomitchev

; Syntax: Intel/MASM operand order (op dst,src), 32-bit x86 with MMX.
; The listings are independent fragments illustrating pairing and scheduling
; rules; they are not meant to be assembled as a single unit.


Listing One
        movq    mm0,[esi]               ; these two instructions won't execute in same cycle
        add     eax,ebx

        movq    mm0,mm1                 ; these two would
        add     eax,ebx


Listing Two
        psraw       mm0,8               ; these two instructions won't execute in same cycle
        punpckhbw   mm1,mm2


Listing Three
        pmullw  mm0,mm1                 ; these two instructions won't execute in same cycle
        pmulhw  mm2,mm1


Listing Four
        paddw   mm0,mm1                 ; these two instructions won't execute in same cycle
        pmulhw  mm2,mm0                 ; (second instruction reads mm0 written by the first)

        movq    mm0,[esi]               ; these two would
        movq    mm1,mm0


Listing Five
; mm2 = threshold, all memory references are L1 cache hits
; Words greater than the threshold are kept, all others are zeroed.
; The numbers in the comments are the cycle in which each pair issues.
; NOTE(review): the loop assumes ebx is a negative byte offset that counts up
; to zero (esi pointing just past the data) -- confirm against the caller.
M:
        movq    mm0,[esi + ebx]         ; 1
        movq    mm1,mm0
        pcmpgtw mm0,mm2                 ; 2  mm0 = all-ones mask where word > threshold
        pand    mm1,mm0                 ; 3  keep only the words above the threshold
        movq    [esi + ebx],mm1         ; 4
        add     ebx,8                   ; 5
        jnz     M                       ; loop until the offset reaches zero
                                        ; total of 5*DataSize / 8 cycles


Listing Six
; Same loop as Listing Five, unrolled to process 16 bytes per iteration.
M:
        movq    mm0,[esi + ebx]         ; 1
        movq    mm1,mm0
        movq    mm3,[esi + ebx + 8]     ; 2
        movq    mm4,mm3
        pcmpgtw mm0,mm2                 ; 3
        pcmpgtw mm3,mm2
        pand    mm1,mm0                 ; 4
        pand    mm4,mm3
        movq    [esi + ebx],mm1         ; 5
        movq    [esi + ebx + 8],mm4     ; 6
        add     ebx,16                  ; 7
        jnz     M                       ; loop until the offset reaches zero
                                        ; total of 7*DataSize / 16 cycles


Listing Seven
; Listing Six with the index update moved up so it pairs with an MMX
; instruction. The stores compensate for the early add with -16 / -8
; displacements; the MMX instructions after the add do not modify EFLAGS,
; so the branch still tests the result of the add.
M:
        movq    mm0,[esi + ebx]         ; 1
        movq    mm1,mm0
        movq    mm3,[esi + ebx + 8]     ; 2
        movq    mm4,mm3
        pcmpgtw mm0,mm2                 ; 3
        pcmpgtw mm3,mm2
        pand    mm1,mm0                 ; 4
        add     ebx,16
        pand    mm4,mm3                 ; 5
        movq    [esi + ebx - 16],mm1
        movq    [esi + ebx - 8],mm4     ; 6
        jnz     M                       ; loop until the offset reaches zero
                                        ; total of 6*DataSize / 16 = 3*DataSize / 8 cycles


Listing Eight
short *p, *pnew;
pnew = (short*)(((int)p + 7) & -8); // ensure 64-bit alignment


Listing Nine
        ...
        nop                             ; inserted to align the branch target; notice that
                                        ; nop is not a part of the loop and is executed once
M:
        ...
        jnz     M


Listing Ten
        movq    mm0,[esi]               ; load 8 pixels from current row
        movq    mm1,[esi + image_width] ; load 8 pixels from the next row
        paddb   mm0,mm1                 ; sum
        movq    [esi],mm0               ; store
        ...
        ; repeat for each row, then move to the next column


Listing Eleven
; poor scheduling
        pmullw  mm0,mm1                 ; 3 cycle latency on Pentium II
        paddw   mm2,mm0                 ; this instruction will stall for 2 cycles

; optimal scheduling
        pmullw  mm0,mm1                 ; 3 cycle latency on Pentium II
        MMX inst 1                      ; do something (2nd cycle)
        MMX inst 2                      ; do something (3rd cycle)
        paddw   mm2,mm0                 ; this instruction will execute without delay


Listing Twelve
include vmm.inc
        ...
        mov     eax,Time                ; time in ms
        mov     ebx,VMHandle
        VMMCall Adjust_Execution_Time
        ...
        mov     eax,PriorityBoost       ; use Time_Critical_Boost for best performance
        mov     ebx,VMHandle
        VMMCall Adjust_Exec_Priority


Listing Thirteen
CurrentThread = Get_Cur_Thread
VMCPD_GET_THREAD (CurrentThread, MyVxD_Buff)
... MMX instructions ...
VMCPD_SET_THREAD (CurrentThread, MyVxD_Buff)


Listing Fourteen
clock_t c1, c2;
...
SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS);
SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL);
c1 = clock();
for ( i = 0; i < M; i++ )
    testfunc();
c2 = clock();
printf("%g seconds\n", float(c2 - c1)/M/CLOCKS_PER_SEC);


Listing Fifteen
; Averages the low and high byte of every 16-bit word in two 8-byte chunks
; and packs the eight results into a single 8-byte chunk.
; mask = 00FF00FF00FF00FF
        movq    mm0,[esi]               ; Load first 8-byte chunk
        movq    mm1,mm0
        movq    mm2,[esi + 8]           ; Load second 8-byte chunk
        pand    mm0,mask                ; Clear high bytes of each 16-bit value in first chunk
        psrlw   mm1,8                   ; Move high bytes of the first chunk into the low
                                        ; byte positions (logical shift, zero-filled)
        movq    mm3,mm2
        pand    mm2,mask                ; Clear high bytes of each 16-bit value in the second chunk
        psrlw   mm3,8                   ; Move high bytes of the second chunk into the low
                                        ; byte positions (logical shift, zero-filled)
        paddsw  mm0,mm1                 ; Add high bytes and low bytes together
        paddsw  mm2,mm3
        psraw   mm0,1                   ; Divide the results by 2
        psraw   mm2,1
        packuswb mm0,mm2                ; Pack two averaged 8-byte chunks into one; unsigned
                                        ; saturation keeps averages in 128..255 intact
        movq    [esi],mm0               ; Write back


Listing Sixteen
; mm0 = mm2 = 76543210
        punpckhbw   mm0,mm0             ; duplicate high bytes: 77665544
        punpcklbw   mm2,mm2             ; duplicate low bytes: 33221100


Listing Seventeen
; Continues the byte duplication of Listing Sixteen to four copies of each
; source byte (mm0 = mm2 = 76543210 on entry).
        punpckhbw   mm0,mm0             ; mm0 = 77665544
        movq        mm1,mm0
        punpcklbw   mm2,mm2             ; mm2 = 33221100
        movq        mm3,mm2
        punpckhwd   mm0,mm0             ; mm0 = 77776666
        punpcklwd   mm1,mm1             ; mm1 = 55554444
        punpckhwd   mm2,mm2             ; mm2 = 33332222
        punpcklwd   mm3,mm3             ; mm3 = 11110000


Listing Eighteen
; mm0 = source data
; NOTE(review): the original listing used the non-existent mnemonic
; "punpcklwb" twice; low/high byte unpacks are presumably what was meant.
        pxor        mm1,mm1
        pxor        mm2,mm2
        punpcklbw   mm1,mm0             ; low 4 source bytes -> high byte of each word in mm1
        punpckhbw   mm2,mm0             ; high 4 source bytes -> high byte of each word in mm2


Listing Nineteen
; Color-key blend: destination bytes are kept where the source byte equals
; colorKey, source bytes are taken everywhere else.
        movq    mm0,mm4                 ; mm0 = [src], mm2 = [dest]
        pcmpeqb mm0,colorKey            ; mm0 = bitmask
        pand    mm2,mm0                 ; mm2 = [dest] AND bitmask
        pandn   mm0,mm4                 ; mm0 = NOT bitmask AND [src]
        por     mm2,mm0                 ; mm2 = mm2 OR mm0