_OPTIMIZING MATRIX MATH ON THE PENTIUM_
by Harlan W. Stockman


Listing One

          .386P                         ;accept 80386 instructions (P = privileged ones too)
          .model    small
.data
ALIGN 4                                 ;dword-align the scratch qword below
dstorage DQ 0.0                         ;8-byte scratch: ddot() stores its FPU result here,
                                        ;then reloads it as two dwords into edx:eax
.code
;******************* ddot() *********************
; double ddot(int n, double *xptr, double *yptr)
;   Dot product: accumulates xptr[i]*yptr[i] for i = 0 .. n-1.
; IN:      arguments on the stack (see diagram below).
; RETURNS: the 64-bit image of the double in edx:eax
;          (eax = low dword, edx = high dword);
;          all-zero bits (+0.0) when n <= 0.
; USES:    eax, ecx, edx, flags, FPU stack; ebx is saved and restored.
;          The result passes through the static qword "dstorage".

          public    _ddot
_ddot     proc
          push ebx                      ;ebx will hold yptr

          ;---STACK after the push:---
;+-------+------------+-------+--------+--------+
;|  ebx  |  ret addr  |  n    |  xptr  |  yptr  |
;^esp    ^esp+4       ^esp+8  ^esp+12  ^esp+16
;-----------------------------------------------+
          mov ecx, dword ptr [esp+8]    ;ecx = n
          test ecx, ecx
          jle dotzero                   ;n <= 0: nothing to sum
          mov eax, dword ptr [esp+12]   ;eax walks x[]
          mov ebx, dword ptr [esp+16]   ;ebx walks y[]
          fldz                          ;ST(0) = running sum = 0.0

          ;---split n into (n / 4) unrolled passes + (n mod 4) leftovers---
          mov edx, ecx
          and edx, 3                    ;edx = leftover element count
          shr ecx, 2                    ;ecx = number of 4-element passes
          jz dottail                    ;fewer than 4 elements in total

          ;=======main loop: four products per pass=======
 dotloop: fld qword ptr [eax]
          fmul qword ptr [ebx]
          faddp st(1), st               ;sum += x[0]*y[0]
          fld qword ptr [eax+8]
          fmul qword ptr [ebx+8]
          faddp st(1), st               ;sum += x[1]*y[1]
          fld qword ptr [eax+16]
          fmul qword ptr [ebx+16]
          faddp st(1), st               ;sum += x[2]*y[2]
          fld qword ptr [eax+24]
          fmul qword ptr [ebx+24]
          faddp st(1), st               ;sum += x[3]*y[3]
          add eax, 32                   ;advance both pointers by 4 doubles
          add ebx, 32
          dec ecx                       ;dec/jnz rather than "loop" (faster on pentium)
          jnz dotloop
          ;=====END main loop=====

          ;---leftovers: up to three more products---
 dottail: test edx, edx
          jz dotstore
          fld qword ptr [eax]
          fmul qword ptr [ebx]
          faddp st(1), st
          dec edx
          jz dotstore
          fld qword ptr [eax+8]
          fmul qword ptr [ebx+8]
          faddp st(1), st
          dec edx
          jz dotstore
          fld qword ptr [eax+16]
          fmul qword ptr [ebx+16]
          faddp st(1), st

          ;---hand the sum back in edx:eax (what Zortech expects)---
dotstore: fstp qword ptr dstorage       ;pop sum into the scratch qword
          fwait                         ;needed for 387: let the store complete
          mov eax, dword ptr [dstorage]
          mov edx, dword ptr [dstorage+4]
          pop ebx
          ret

          ;---n <= 0: return +0.0---
 dotzero: xor eax, eax
          xor edx, edx
          pop ebx
          ret
_ddot     endp
         



Listing Two

;******************* daxpy() ********************
; void daxpy(int n, double *aptr, double *xptr, double *yptr)
;   ..forms the sum of a*x[i] + y[i], and stores in y[]..
;   (a is the double at *aptr; i runs over 0 .. n-1)
; IN:      arguments on the stack (see diagram below).
; RETURNS nothing.  y[] is left untouched when n <= 0, or when *aptr
;          is +0.0 or -0.0 (the whole update is skipped in that case).
; USES:    eax, ecx, edx, flags; ebx is saved and restored.
;          Pushes at most five values on the FPU stack (a plus four
;          pending results) and pops them all before returning.
          public    _daxpy
_daxpy    proc
          push ebx
          ;---STACK:---
;+--------+------------+--------+--------+--------+--------+
;|  ebx   |  ret addr  |  n     |  aptr  |  xptr  |  yptr  |
;^esp     ^esp+4       ^esp+8   ^esp+12  ^esp+16  ^esp+20
;----------------------------------------------------------+
          mov ecx, dword ptr [esp+8]
          test ecx, ecx                 ;<=0 iterations ?
          jle badboy5
          ;---fetch address of multiplier---
          mov eax, dword ptr [esp+12]   ;address of multiplier (aptr)..
          ;---test if *aptr is positive or negative 0.0---
          mov edx, dword ptr [eax+4]    ;upper dword of *aptr..
          and edx, 7FFFFFFFh            ;mask off sign bit
          or edx, dword ptr [eax]       ;fold in lower dword: ZF only if all other bits are 0
          jz badboy5                    ;a == +/-0.0, so y[] stays as it is
          ;---load *aptr onto stack if not 0.0---
          fld qword ptr [eax]           ;multiplier now in ST(0)..
          mov eax, dword ptr [esp+16]   ;eax = xptr
          mov ebx, dword ptr [esp+20]   ;ebx = yptr
          ;---determine length of clean-up, main loop iterations---
          mov edx, ecx
          and edx, 3     ;edx is length of cleanup...
          shr ecx, 2     ;loops unrolled by 4, so adjust counter...
          jz cleanup5
          ;=======loop5=======
          ; Each fld pushes the multiplier one slot deeper, hence the
          ; st(1), st(2), st(3), st(4) operands on successive fmuls.
   loop5: fld qword ptr [eax]
          fmul st,st(1)
          fadd qword ptr [ebx]
          ;---next element---
          fld qword ptr [eax+8]
          fmul st,st(2)
          fadd qword ptr [ebx+8]
          ;---next element---
          fld qword ptr [eax+16]
          fmul st,st(3)
          fadd qword ptr [ebx+16]
          ;---next element---
          fld qword ptr [eax+24]
          fmul st,st(4)
          fadd qword ptr [ebx+24]
          ;---store new y[]'s, clean stack--
          ; Stack here: st0=y3' st1=y2' st2=y1' st3=y0' st4=a.
          ; The swap brings y1' to the top so the just-finished y3'
          ; is not the first value stored.
          fxch st(2)     ; !!! Avoid data collision !!!
          fstp qword ptr [ebx+8]
          fstp qword ptr [ebx+16]
          fstp qword ptr [ebx+24]
          fstp qword ptr [ebx]
          add eax,32
          add ebx,32
          dec ecx        ;faster than "loop" on pentium,
          jnz loop5      ;due to instruction pairing...
          ;=====END loop5=====
          ; Only the multiplier is on the FPU stack from here on, so
          ; every cleanup fmul uses st(1).
cleanup5: or edx,edx
          jz stckcln5
          ;---1st cleanup element---
          fld qword ptr [eax]
          fmul st,st(1)
          fadd qword ptr [ebx]
          fstp qword ptr [ebx]
          dec edx
          jz stckcln5
          ;---next element---
          fld qword ptr [eax+8]
          fmul st,st(1)
          fadd qword ptr [ebx+8]
          fstp qword ptr [ebx+8]
          dec edx
          jz stckcln5
          ;---next element---
          fld qword ptr [eax+16]
          fmul st,st(1)
          fadd qword ptr [ebx+16]
          fstp qword ptr [ebx+16]
          ;-----------------
stckcln5: ;must clean *aptr off stack
          fstp st(0)
 badboy5: pop ebx
          ret
_daxpy    endp




Example 1: Matrix multiplication.

(a)
/* Textbook triple loop for c = a * b: each c[row][col] is cleared and then
   accumulated in place while inner runs along row "row" of a and down
   column "col" of b. */
void normal(){
   int row, col, inner;

   for (row = 0; row < N; ++row) {
      for (col = 0; col < N; ++col) {
         c[row][col] = 0.0;
         for (inner = 0; inner < N; ++inner) {
            c[row][col] += a[row][inner] * b[inner][col];
         }
      }
   }
}

(b)

/* c = a * b by way of an explicit transpose: bt[j][i] = b[i][j] is filled in
   first, so the inner product reads a[row][idx] and bt[col][idx] with the
   same running index.  With ASMLOOP defined the inner product is delegated
   to ddot(). */
void transpose(){
   int row, col, idx;
   double acc;

   /* pass 1: build the transpose of b */
   for (row = 0; row < N; ++row) {
      for (col = 0; col < N; ++col) {
         bt[col][row] = b[row][col];
      }
   }

   /* pass 2: one inner product per element of c */
   for (row = 0; row < N; ++row) {
       for (col = 0; col < N; ++col) {
#ifndef ASMLOOP
          /* seed with the first product, then add the remaining N-1 */
          acc = a[row][0] * bt[col][0];
          for (idx = 1; idx < N; ++idx) {
             acc += a[row][idx] * bt[col][idx];
          }
          c[row][col] = acc;
#else
          c[row][col] = ddot(N, a[row], bt[col]);
#endif
      }
   }
}

(c)

/* c = a * b with the loops ordered row / inner / col: c is zeroed up front,
   then every a[row][inner] scales row "inner" of b and the scaled row is
   added into row "row" of c.  With ASMLOOP defined that row update is
   delegated to daxpy(). */
void reg_loops(){
   int row, inner, col;
   double scale;

   /* clear the whole result first; the second pass only accumulates */
   for (row = 0; row < N; ++row) {
      for (col = 0; col < N; ++col) {
         c[row][col] = 0.0;
      }
   }

   for (row = 0; row < N; ++row) {
      for (inner = 0; inner < N; ++inner) {
#ifndef ASMLOOP
         scale = a[row][inner];
         for (col = 0; col < N; ++col) {
            c[row][col] += scale * b[inner][col];
         }
#else
         daxpy(N, a[row]+inner, b[inner], c[row]);
#endif
      }
   }
}



Example 2:

aptr = a[i];
cptr = c[i] + j;
bptr = b[0] + j;
for(k=0; k<500; bptr+=500;){
    *cptr += *(aptr + k) * *bptr;
}


Example 3: C code and assembly language produced by compiler.

/* Main loop of daxpy() in C: */
/* Unrolled by four: every pass updates y[i] .. y[i+3] with y + a*x.      */
/* NOTE(review): m is presumably the number of leftover elements handled  */
/* before this loop, so that (n - m) is a multiple of 4 -- confirm against */
/* the full source, which is not shown here.                              */

for (i=m; i<n; i=i+4) {
    y[i]   = y[i]   + a*x[i];
    y[i+1] = y[i+1] + a*x[i+1];
    y[i+2] = y[i+2] + a*x[i+2];
    y[i+3] = y[i+3] + a*x[i+3];
}

--------------------------------
; Assembly language produced by
; compiler:

; One pass of the compiler-generated loop: four fld / fmul / fadd / fstp
; groups, one per unrolled statement of the C loop.
; NOTE(review): 0Ch[EBP] is presumably the argument a, and the other
; [EBP] slots the x/y base pointers -- the frame layout is not shown here.
L124: mov   ECX, EBX
      sub   ECX, ESI                  ; ECX = EBX - ESI: index used by the [base][ECX] operands
      mov   EAX, 014h[EBP]            ; base pointer fetched from the stack frame
      fld   qword ptr [EAX][ECX]
      fmul  qword ptr 0Ch[EBP]        ; multiplier is read from memory here...
      mov   EAX, 01Ch[EBP]            ; another base pointer fetched from the frame
      fadd  qword ptr [EAX][ECX]
      fstp  qword ptr [EAX][ECX]      ; result stored straight back to the address just added from
      fld   qword ptr [EDX][ECX]
      fmul  qword ptr 0Ch[EBP]        ; ...and again for the second element
      mov   EAX, -010h[EBP]           ; base pointer taken from a local slot

      fadd  qword ptr [EAX][ECX]
      fstp  qword ptr [EAX][ECX]
      wait                            ; explicit FPU synchronization emitted by the compiler
      mov   EAX, -0Ch[EBP]
      fld   qword ptr [EAX][ECX]
      fmul  qword ptr 0Ch[EBP]        ; third read of the same multiplier operand
      mov   EAX, -8[EBP]
      fadd  qword ptr [EAX][ECX]
      fstp  qword ptr [EAX][ECX]
      fld   qword ptr [EBX]
      fmul  qword ptr 0Ch[EBP]        ; fourth read of the same multiplier operand
      mov   EAX, -4[EBP]
      fadd  qword ptr [EAX][ECX]
      fstp  qword ptr [EAX][ECX]
      wait                            ; second explicit synchronization per pass
      add   EBX, 020h                 ; advance EBX by 32 bytes (four qwords)
      add   EDI, 4                    ; EDI steps by 4 per pass
      cmp   EDI, 8[EBP]               ; compare against the dword at 8[EBP]
      jl    L124                      ; signed: repeat while EDI is below it





Example 4: 

(a)                               (b)                           (c) 

loop1: fld qword ptr [eax]       loop2: fld qword ptr [eax]     loop3: fld qword ptr [eax]     
       fmul st,st(1)                    fmul st,st(1)                  fmul st,st(1)           
       fadd qword ptr [ebx]             fadd qword ptr [ebx]           fadd qword ptr [ebx]    
       fstp qword ptr [ebx]             ;---next element---            ;---next element---     
       ;---next element---              fld qword ptr [eax+8]          fld qword ptr [eax+8]   
       fld qword ptr [eax+8]            fmul st,st(2)                  fmul st,st(2)           
       fmul st,st(1)                    fadd qword ptr [ebx+8]         fadd qword ptr [ebx+8]  
       fadd qword ptr [ebx+8]           ;---next element---            ;---next element---     
       fstp qword ptr [ebx+8]           fld qword ptr [eax+16]         fld qword ptr [eax+16]  
       ;---next element---              fmul st,st(3)                  fmul st,st(3)           
       fld qword ptr [eax+16]           fadd qword ptr [ebx+16]        fadd qword ptr [ebx+16] 
       fmul st,st(1)                    ;---next element---            ;---next element---     
       fadd qword ptr [ebx+16]          fld qword ptr [eax+24]         fld qword ptr [eax+24]  
       fstp qword ptr [ebx+16]          fmul st,st(4)                  fmul st,st(4)           
       ;---next element---              fadd qword ptr [ebx+24]        fadd qword ptr [ebx+24] 
       fld qword ptr [eax+24]           ;---store new y[]'s---         ;---store new y[]'s---  
       fmul st,st(1)                    fstp qword ptr [ebx+24]        fxch st(2) ;!!! fxch !!!
       fadd qword ptr [ebx+24]          fstp qword ptr [ebx+16]        fstp qword ptr [ebx+8]  
       fstp qword ptr [ebx+24]          fstp qword ptr [ebx+8]         fstp qword ptr [ebx+16] 
       add eax,32                       fstp qword ptr [ebx]           fstp qword ptr [ebx+24] 
       add ebx,32                       add eax,32                     fstp qword ptr [ebx]    
       dec ecx                          add ebx,32                     add eax,32
       jnz loop1                        dec ecx                        add ebx,32
                                        jnz loop2                      dec ecx
                                                                       jnz loop3





Example 5:

(a)

   fld qwptr [eax]    fmul st, st(1)    fadd qwptr [ebx]   fstp qwptr [ebx]


st0      x[i]        a*x[i]              a*x[i] + y[i]      a
st1        a           a                   a
st2 
st3 


(b)

    fld qwptr [eax]    fmul st, st(1)    fadd qwptr [ebx]   fld qwptr [eax+8]

st0       x[i]             a*x[i]         a*x[i] + y[i]        x[i+1]
st1         a                a                 a            a*x[i] + y[i]
st2                                                              a
st3 

   fadd qwptr [ebx+24]    fxch st(2)     fstp qwptr [ebx+8]  fstp qwptr [ebx+16]

st0  a*x[i+3]+y[i+3]     a*x[i+1]+y[i+1]   a*x[i+2]+y[i+2] a*x[i+3]+y[i+3]
st1  a*x[i+2]+y[i+2]     a*x[i+2]+y[i+2]   a*x[i+3]+y[i+3] a*x[i] + y[i] 
st2  a*x[i+1]+y[i+1]     a*x[i+3]+y[i+3]   a*x[i] + y[i]     a       
st3  a*x[i] + y[i]       a*x[i] + y[i]         a       
st4    a                     a       

