Optimizing 3DNow! Real-time Graphics
by Max I. Fomitchev

Listing One
; compute y/w and x/w
; Register comments show the two packed values held in the 64-bit register.
MOVD      MM0,mem       ; 0   | w
PFRCP     MM1,MM0       ; 1/w | 1/w (14-bit approximation)
PUNPCKLDQ MM0,MM0       ; w   | w
PFRCPIT1  MM0,MM1       ; 1/w | 1/w (intermediate)
MOVQ      MM2,mem       ; y   | x
PFRCPIT2  MM0,MM1       ; 1/w | 1/w (24-bit precision)
PFMUL     MM2,MM0       ; y/w | x/w

Listing Two
; compute sqrt(w) as w * (1/sqrt(w))
MOVD      MM0,mem       ; 0 | w
PFRSQRT   MM1,MM0       ; 1/sqrt(w) | 1/sqrt(w) (15-bit approximation)
MOVQ      MM2,MM1       ; keep the first approximation for the final refinement step
PUNPCKLDQ MM0,MM0       ; w | w
PFMUL     MM1,MM1       ; 1/w | 1/w (square of the 15-bit approximation)
PFRSQIT1  MM1,MM0       ; 1/sqrt(w) | 1/sqrt(w) (intermediate)
PFRCPIT2  MM1,MM2       ; 1/sqrt(w) | 1/sqrt(w) (24-bit precision)
PFMUL     MM0,MM1       ; sqrt(w) | sqrt(w)

Listing Three
; instruction pairing: independent registers versus a register dependency
PFSUB     MM1,MM3       ; this instruction pair will execute
PFMUL     MM2,mem       ; in the same clock cycle
PFADD     MM1,MM2       ; this instruction pair won't execute
PFMUL     MM1,MM3       ; simultaneously due to register dependency

Listing Four
; storing MM1 too soon after the instruction that produces it
PFMIN     MM1,MM2
PFSUB     MM3,a
MOVQ      mem,MM1       ; 1-cycle stall occurs here because the result
PFMIN     MM2,MM3       ; in MM1 will be ready in the next cycle

Listing Five
; same work as Listing Four with the store of MM1 moved further down
PFMIN     MM1,MM2
PFSUB     MM3,a
PFMIN     MM2,MM3       ; 1-cycle stall occurs here
PFSUB     MM4,b
MOVQ      mem,MM1       ; no stall, the result in MM1 is ready
PFSUB     MM3,a

Listing Six
(a)
for ( j = 0; j < n; j++ )
    beta += a[j]*r[n - j];

(b)
; eax = j, ebx = n-j-1
; a is read in ascending order while r must be read in descending order, so a
; plain packed multiply pairs the wrong elements (compare with the loop in (a)).
MOVQ      MM0,a[EAX*4]  ; a[j] | a[j+1]
MOVQ      MM1,r[EBX*4]  ; r[n-j-1] | r[n-j]
PFMUL     MM0,MM1       ; a[j]*r[n-j-1] | a[j+1]*r[n-j]

Listing Seven
(a)
// Calculate r in reverse order (if possible) or swap the array elements
...
for ( j = 0; j < n; j++ ) // beta calculation loop beta += a[j]*r[c + j]; // c is some constant (b) for ( j = 0; j < n; j++ ) { float c = r[n - j]; // swap(r[j], r[n-j]; r[n - j] = r[j]; r[j] = c; } (c) MOVQ MM0,a[EAX*4] ; a[j] | a[j+1] PSWAPD MM1,r[EBX*4] ; r[n-j] | r[n-j-1] PFMUL MM0,MM1 ; a[j]*r[n-j] | a[j+1]*r[n-j-1] (d) PFACC MM0,MM0 ; a[j]*r[n-j] + a[j+1]*r[n-j-1] MOVD beta,MM0 Listing Eight (a) ; n-th iteration MOVQ MM0,a[EAX*4] ; a[n] | a[n+1] MOVQ MM1,r[EBX*4] ; r[c+n] | r[c+n+1] PFMUL MM0,MM1 ; a[n]*r[c+n] | a[n+1]*r[c+n+1] ... ; after the loop PFACC MM0,MM0 ; a[n]*r[c+n] + a[n+1]*r[c+n+1] MOVD beta,MM0 ; Oops! Extra a[n+1]*r[c+n+1] (b) float a[m], r[m]; // m is some constant // Initialize arrays a and r ... for ( n = 1; n < m; n++ ) // main loop { beta = 0; for ( j = 0; j < n; j++ ) beta += a[j]*r[c + j]; // c is some constant // Do something else ... } (c) ; Compute beta MOV ECX,n DEC ECX MOV EAX,0 ; j = 0 PXOR MM2,MM2 ; beta = 0 M: MOVQ MM0,a[EAX] ; a[j] | a[j+1] MOVQ MM1,r[EAX] ; r[c+j] | r[c+j+1] PFMUL MM0,MM1 ; a[n]*r[c+j] | a[j+1]*r[c+j+1] PFADD MM2,MM0 ; beta0 | beta1 ADD EAX,8 SUB ECX,2 JG M ; end of even part JNE SKIPODD ; odd part processing MOVD MM0,a[EAX] ; a[n-1] | 0 MOVD MM1,r[EAX] ; r[c+n-1] | 0 PFMUL MM0,MM1 ; a[n-1]*r[c+n-1] | 0 PFADD MM2,MM0 ; beta0 | beta1 SKIPODD: PFACC MM2,MM2 ; beta = beta0 + beta1 MOVD beta,MM2 (d) // Unroll for n = 1 beta = a[0]*r[c]; // Do something else for n = 1 ... // Process for n = 2, 3,...m for ( n = 2; n < m; n++ ) // main loop { // 3DNow! optimized loop for beta calculation __asm { } // Do something else ... } Listing Nine (a) __int64 StartTicks, EndTicks; _asm { CPUID RDTSC MOV DWORD PTR StartTicks,EAX MOV DWORD PTR StartTicks[4],EDX // Code which performance is measured ... 
CPUID RDTSC MOV DWORD PTR EndTicks,EAX MOV DWORD PTR EndTicks[4],EDX // EndTicks - StartTicks = Running Time in CPU cycles } (b) __int64 StartTicks, EndTicks, Ticks = 0x7FFFFFFFFFFFFFFF; for ( int i = 0; i < 10; i++ ) { // Flush the data cache if necessary // memset(data, 0, sizeof_data_cache); _asm { CPUID RDTSC MOV DWORD PTR StartTicks,EAX MOV DWORD PTR StartTicks[4],EDX // Code which performance is measured ... CPUID RDTSC MOV DWORD PTR EndTicks,EAX MOV DWORD PTR EndTicks[4],EDX } if ( EndTicks - StartTicks < Ticks ) Ticks = EndTicks - StartTicks; } 4