_OPTIMIZING MATRIX MATH ON THE PENTIUM_ by Harlan W. Stockman Listing One .386P .model small .data ALIGN 4 dstorage DQ 0.0 .code ;******************* ddot() ********************* ; double ddot(int n, double *xptr, double *yptr) ; ..forms the dot product of two row vectors.. ; RETURNS product in edx:eax public _ddot _ddot proc push ebx ;---STACK:--- ;+-------+------------+-------+--------+--------+ ;| ebx | ret addr | n | xptr | yptr | ;^esp ^esp+4 ^esp+8 ^esp+12 ^esp+16 ;-----------------------------------------------+ mov ecx, dword ptr [esp+8] test ecx, ecx ;<= 0 iterations ? jle badboy mov eax, dword ptr [esp+12] ;eax = xptr mov ebx, dword ptr [esp+16] ;ebx = yptr fldz ;initialize accumulator.. ;---determine length of clean-up, main loop iterations--- mov edx, ecx and edx, 3 ;edx is length of cleanup... shr ecx, 2 ;loops unrolled by 4, so adjust counter... jz cleanup1 ;=======loop1======= loop1: fld qword ptr [eax] fmul qword ptr [ebx] fadd fld qword ptr [eax+8] fmul qword ptr [ebx+8] fadd fld qword ptr [eax+16] fmul qword ptr [ebx+16] fadd fld qword ptr [eax+24] fmul qword ptr [ebx+24] fadd add eax,32 add ebx,32 dec ecx ;faster than "loop" on pentium... jnz loop1 ;=====END loop1===== cleanup1: or edx,edx jz store1 fld qword ptr [eax] fmul qword ptr [ebx] fadd dec edx jz store1 fld qword ptr [eax+8] fmul qword ptr [ebx+8] fadd dec edx jz store1 fld qword ptr [eax+16] fmul qword ptr [ebx+16] fadd ;---store result------- store1: fstp dstorage ;Zortech expects to see result in edx:eax... fwait ;Needed for 387... mov eax, dword ptr dstorage mov edx, dword ptr dstorage+4 ;------- pop ebx ret badboy: xor eax,eax xor edx,edx pop ebx ret _ddot endp Listing Two ;******************* daxpy() ******************** ; void daxpy(int n, double *aptr, double *xptr, double *yptr) ; ..forms the sum of a*x[i] + y[i], and stores in y[].. ; RETURNS nothing. public _daxpy _daxpy proc push ebx ;---STACK:---;+--------+------------+--------+--------+--------+--------+ ;| ebx | ret addr | n | aptr | xptr | yptr | ;^esp ^esp+4 ^esp+8 ^esp+12 ^esp+16 ^esp+20 ;----------------------------------------------------------+ mov ecx, dword ptr [esp+8] test ecx, ecx ;<=0 iterations ? jle badboy5 ;---load *aptr onto fp stack mov eax, dword ptr [esp+12] ;address of multiplier (aptr).. ;---test if *aptr is positive or negative 0.0--- mov edx, dword ptr [eax+4] ;upper dword of *aptr.. and edx, 01111111111111111111111111111111B ;mask off sign bit or edx, dword ptr [eax] jz badboy5 ;---load *aptr onto stack if not 0.0--- fld qword ptr [eax] ;multiplier now in ST(0).. mov eax, dword ptr [esp+16] mov ebx, dword ptr [esp+20] ;---determine length of clean-up, main loop iterations--- mov edx, ecx and edx, 3 ;edx is length of cleanup... shr ecx, 2 ;loops unrolled by 4, so adjust counter... jz cleanup5 ;=======loop5======= loop5: fld qword ptr [eax] fmul st,st(1) fadd qword ptr [ebx] ;---next element--- fld qword ptr [eax+8] fmul st,st(2) fadd qword ptr [ebx+8] ;---next element--- fld qword ptr [eax+16] fmul st,st(3) fadd qword ptr [ebx+16] ;---next element--- fld qword ptr [eax+24] fmul st,st(4) fadd qword ptr [ebx+24] ;---store new y[]'s, clean stack-- fxch st(2) ; !!! Avoid data collision !!! fstp qword ptr [ebx+8] fstp qword ptr [ebx+16] fstp qword ptr [ebx+24] fstp qword ptr [ebx] add eax,32 add ebx,32 dec ecx ;faster than "loop" on pentium, jnz loop5 ;due to instruction pairing... ;=====END loop5===== cleanup5: or edx,edx jz stckcln5 ;---1st cleanup element--- fld qword ptr [eax] fmul st,st(1) fadd qword ptr [ebx] fstp qword ptr [ebx] dec edx jz stckcln5 ;---next element--- fld qword ptr [eax+8] fmul st,st(1) fadd qword ptr [ebx+8] fstp qword ptr [ebx+8] dec edx jz stckcln5 ;---next element--- fld qword ptr [eax+16] fmul st,st(1) fadd qword ptr [ebx+16] fstp qword ptr [ebx+16] ;-----------------stckcln5: ;must clean *aptr off stack fstp st(0) badboy5: pop ebx ret _daxpy endp Example 1: Matrix multiplication. (a) void normal(){ int i,j,k; for (i=0;i