by Algorithm Alley Tim Kientzle Figure 1: (a) C(i) = a(i)/2 SUM from x=0 to (N-1) s(x) cos(PI i (2x+1)/ 2N) (b) C(j,i) = a(j)/2 a(i)/2 SUM from y=0 to (N-1) SUM from x=0 to (N-1) s(y,x) cos(PI i (2x+1)/2N) cos(PI j (2y+1)/2N) (c) C(j,i) = a(j)/2 SUM from y=0 to (N-1) cos(PI j (2y+1)/2N) [ a(i)/2 SUM from x=0 to (N-1) s(y,x) cos(PI i (2x+1)/2N) ] Figure 2: (a) s(x) = SUM from i=0 to (N-1) a(i)/2 C(i) cos(PI i (2x+1)/2N) (b) s(y,x) = SUM from j=0 to (N-1) SUM from i=0 to (N-1) a(j)/2 a(i)/2 C(j,i) cos(PI i (2x+1)/2N) cos(PI j (2y+1)/2N) (c) s(y,x) = SUM from j=0 to (N-1) a(j)/2 cos(PI j (2y+1)/2N) [ SUM from i=0 to (N-1) a(i)/2 C(j,i) cos(PI i (2x+1)/2N) ] Figure 6: (a) A --+-----------+-- C C = A * k * cos(3pi/8) + B * k * sin(3pi/8) | kR3 | B --+-----------+-- D D = -A * k * sin(3pi/8) + B * k * cos(3pi/8) (b) temp = (A+B) * k*cos(3pi/8) C = temp + B * (k*sin(3pi/8) - k*cos(3pi/8)) D = temp + A * (-k*sin(3pi/8) - k*cos(3pi/8)) (c) static const int r2c3=554; /* sqrt(2)*cos(3pi/8)*1024 */ static const int r2s3=1337; /* sqrt(2)*cos(3pi/8)*1024 */ x3=r2c3*(x1+x0); x0=(-r2c3+r2s3)*x0+x3; x1=(-r2c3-r2s3)*x1+x3; Example 1: dct1d4PtTest(int *dctBlock) { static const int r2c3=554, r2s3=1337; int x0=dctBlock[0], x1=dctBlock[1], x2=dctBlock[2], x3=dctBlock[3]; int x4=x0+x3; x0-=x3; x3=x1+x2; x1-=x2; /* Stage 1 */ x2=x4+x3; x4-=x3; /* Stage 2 */ x3=r2c3*(x1+x0)+512; x0=(-r2c3+r2s3)*x0+x3; x1=(-r2c3-r2s3)*x1+x3; dctBlock[0] = x2; dctBlock[2] = x4; /* Round and output */ dctBlock[1] = x0>>10; dctBlock[3] = x1>>10; } Example 2: C = A + B * ( sin(3pi/8) / cos(3pi/8) ) D = -A * (sin(3pi/8)/cos(3pi/8)) + B C *= cos(3pi/8) D *= cos(3pi/8) Table 1: Implementation Timing 4-Element DCT 12.5 microseconds Example 1 0.18 microseconds 8-Element DCT 46.9 microseconds Listing One 0.625 microseconds 8(8-Element DCT 4375.0 microseconds (Direct 2-D calculation) 8(8-Element DCT 640.0 microseconds (Using 16 1-D calculations) Listing Two 7.6 microseconds Listing One static void dct1dTest(int *dctBlock) { static const int c1=1004 /*cos(pi/16)<<10*/, s1=200 /*sin(pi/16)<<10*/; static const int c3=851 /*cos(3pi/16)<<10*/, s3=569 /*sin(3pi/16)<<10*/; static const int r2c6=554 /*sqrt(2)*cos(6pi/16)<<10*/, r2s6=1337; static const int r2=181; /* sqrt(2)<<7 */ int x0=dctBlock[0], x1=dctBlock[1], x2=dctBlock[2], x3=dctBlock[3], x4=dctBlock[4], x5=dctBlock[5], x6=dctBlock[6], x7=dctBlock[7]; int x8; /* Stage 1 */ x8=x7+x0; x0-=x7; x7=x1+x6; x1-=x6; x6=x2+x5; x2-=x5; x5=x3+x4; x3-=x4; /* Stage 2 */ x4=x8+x5; x8-=x5; x5=x7+x6; x7-=x6; x6=c1*(x1+x2); x2=(-s1-c1)*x2+x6; x1=(s1-c1)*x1+x6; x6=c3*(x0+x3); x3=(-s3-c3)*x3+x6; x0=(s3-c3)*x0+x6; /* Stage 3 */ x6=x4+x5; x4-=x5; x5=r2c6*(x7+x8); x7=(-r2s6-r2c6)*x7+x5; x8=(r2s6-r2c6)*x8+x5; x5=x0+x2;x0-=x2; x2=x3+x1; x3-=x1; /* Stage 4, round, and output */ dctBlock[0]=x6; dctBlock[4]=x4; dctBlock[2]=(x8+512)>>10; dctBlock[6] = (x7+512)>>10; dctBlock[7]=(x2-x5+512)>>10; dctBlock[1]=(x2+x5+512)>>10; dctBlock[3]=(x3*r2+65536)>>17; dctBlock[5]=(x0*r2+65536)>>17; } static void dct2dTest(int (*dctBlock)[8]) { static const int c1=1004 /*cos(pi/16)<<10*/, s1=200 /*sin(pi/16)<<10*/; static const int c3=851 /*cos(3pi/16)<<10*/, s3=569 /*sin(3pi/16)<<10*/; static const int r2c6=554 /*sqrt(2)*cos(6pi/16)<<10*/, r2s6=1337; static const int r2=181; /* sqrt(2)<<7 */ int row,col; for(row=0;row<8;row++) { int x0=dctBlock[row][0], x1=dctBlock[row][1], x2=dctBlock[row][2], x3=dctBlock[row][3], x4=dctBlock[row][4], x5=dctBlock[row][5], x6=dctBlock[row][6], x7=dctBlock[row][7], x8; /* Stage 1 */ x8=x7+x0; x0-=x7; x7=x1+x6; x1-=x6; x6=x2+x5; x2-=x5; x5=x3+x4; x3-=x4; /* Stage 2 */ x4=x8+x5; x8-=x5; x5=x7+x6; x7-=x6; x6=c1*(x1+x2); x2=(-s1-c1)*x2+x6; x1=(s1-c1)*x1+x6; x6=c3*(x0+x3); x3=(-s3-c3)*x3+x6; x0=(s3-c3)*x0+x6; /* Stage 3 */ x6=x4+x5; x4-=x5; x5=x0+x2;x0-=x2; x2=x3+x1; x3-=x1; x1=r2c6*(x7+x8); x7=(-r2s6-r2c6)*x7+x1; x8=(r2s6-r2c6)*x8+x1; /* Stage 4 and output */ dctBlock[row][0]=x6; dctBlock[row][4]=x4; dctBlock[row][2]=x8>>10; dctBlock[row][6] = x7>>10; dctBlock[row][7]=(x2-x5)>>10; dctBlock[row][1]=(x2+x5)>>10; dctBlock[row][3]=(x3*r2)>>17; dctBlock[row][5]=(x0*r2)>>17; } Listing Two for(col=0;col<8;col++) { int x0=dctBlock[0][col], x1=dctBlock[1][col], x2=dctBlock[2][col], x3=dctBlock[3][col], x4=dctBlock[4][col], x5=dctBlock[5][col], x6=dctBlock[6][col], x7=dctBlock[7][col], x8; /* Stage 1 */ x8=x7+x0; x0-=x7; x7=x1+x6; x1-=x6; x6=x2+x5; x2-=x5; x5=x3+x4; x3-=x4; /* Stage 2 */ x4=x8+x5; x8-=x5; x5=x7+x6; x7-=x6; x6=c1*(x1+x2); x2=(-s1-c1)*x2+x6; x1=(s1-c1)*x1+x6; x6=c3*(x0+x3); x3=(-s3-c3)*x3+x6; x0=(s3-c3)*x0+x6; /* Stage 3 */ x6=x4+x5; x4-=x5; x5=x0+x2;x0-=x2; x2=x3+x1; x3-=x1; x1=r2c6*(x7+x8); x7=(-r2s6-r2c6)*x7+x1; x8=(r2s6-r2c6)*x8+x1; /* Stage 4 and output */ dctBlock[0][col]=(x6+16)>>5; dctBlock[4][col]=(x4+16)>>5; dctBlock[2][col]=(x8+16384)>>15; dctBlock[6][col] = (x7+16384)>>15; dctBlock[7][col]=(x2-x5+16384)>>15; dctBlock[1][col]=(x2+x5+16384)>>15; dctBlock[3][col]=((x3>>8)*r2+8192)>>14; dctBlock[5][col]=((x0>>8)*r2+8192)>>14; } } 1