_HIGH-PERFORMANCE PROGRAMMING FOR THE POWERPC_ by Kip McClanahan, Mike Phillip, and Mark VandenBrink Listing One /*---------------------------------------------------------------------------+ | Windows NT for PowerPC Alignment Demonstration Program | | | | Mark VandenBrink, markv@risc.sps.mot.com | | Kip McClanahan, kip_mcclanahan@risc.sps.mot.com -or- kip@io.com | | Mike Phillip, phillip@risc.sps.mot.com | | | | | +---------------------------------------------------------------------------*/ #include #include #include #include #include #include #include #include // force compiler fix-ups for data accesses within the // BMP structure by using #pragma pack(1). #pragma pack(1) // Standard Windows3.x BMP file header format // typedef struct BMPHeader { USHORT FileType; // offset 0 ULONG FileSize; // offset 2 USHORT reserved1; // offset 6 USHORT reserved2; // offset 8 ULONG BMPDataOffset; // offset 10 }; struct BMPHeader bmpBuffer; // declare structure // // Print an error message to to the screen and exit. // static VOID Die(char *format, ...) { va_list va; va_start(va, format); fprintf(stderr, "\n\nALIGN: "); vfprintf(stderr, format, va); ExitProcess(2); } // // Return a timestamp from the high frequency performance counters (if // one exists). Return the stamp in units of number of milliseconds // static UINT GetTimeStamp(VOID) { static DWORD FreqInMs = 0; LARGE_INTEGER Time; if (!FreqInMs) { if (QueryPerformanceFrequency(&Time) == TRUE) { if (Time.HighPart) { Die("Timer has too high a resolution\n"); } // // 100-nanosecond units // FreqInMs = Time.LowPart / 1000; } else { Die("Could not get frequency of perfomance counter\n"); } } if (QueryPerformanceCounter(&Time) == FALSE) { Die("System does not support high-resolution timer\n"); } return Time.LowPart / FreqInMs; } // // Essentially useless function that returns a value to place at // IntPointer. Function used to prevent compiler from optimizing // away references to *IntPointer inside a loop. 
// DWORD GetNextValue(VOID) { static DWORD NextValue = 0; return NextValue++; } main(int argc, char **argv) { CHAR Buffer[1024]; UINT EndTime; UINT Max = 0; UINT StartTime; UINT i; struct BMPHeader *headerPtr; int *IntPointer2; __unaligned int *IntPointer; switch (argc) { case 3: // // Note: setting thread's priority to THREAD_PRIORITY_TIME_CRITICAL // can effectively bring a machine to its knees, depending on the // process priority class. // SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL ); Max = strtoul(argv[1], 0, 0); // // The naturally aligned case // if (argv[2][0] == '-' && argv[2][1] == '0') { printf("ONLY aligned references\n"); IntPointer2 = (int *)(&Buffer[4]); printf("Buffer at %x, IntPointer = %x\n", Buffer, IntPointer2); StartTime = GetTimeStamp(); for (i = 0; i < Max; i++) { *IntPointer2 = GetNextValue(); } EndTime = GetTimeStamp(); break; } // // The no fix-ups, alignment exception causing case // if (argv[2][0] == '-' && argv[2][1] == '1') { printf("NO support for misaligned references\n"); IntPointer2 = (int *)(&Buffer[3]); printf("Buffer at %x, IntPointer = %x\n", Buffer, IntPointer2); StartTime = GetTimeStamp(); for (i = 0; i < Max; i++) { *IntPointer2 = GetNextValue(); } EndTime = GetTimeStamp(); break; } // // OS-based fix-ups // if (argv[2][0] == '-' && argv[2][1] == '2') { printf("OS support of misaligned references.\n"); SetErrorMode(SEM_NOALIGNMENTFAULTEXCEPT); IntPointer2 = (int *)(&Buffer[3]); printf("Buffer at %x, IntPointer = %x\n", Buffer, IntPointer2); StartTime = GetTimeStamp(); for (i = 0; i < Max; i++) { *IntPointer2 = GetNextValue(); } EndTime = GetTimeStamp(); break; } if (argv[2][0] == '-' && argv[2][1] == '3') { printf("Using __UNALIGNED qualifier.\n"); IntPointer = (int *)(&Buffer[3]); printf("Buffer at %x, IntPointer = %x\n", Buffer, IntPointer); StartTime = GetTimeStamp(); for (i = 0; i < Max; i++) { *IntPointer = GetNextValue(); } EndTime = GetTimeStamp(); break; } if (argv[2][0] == '-' && argv[2][1] 
== '4') { headerPtr = (struct BMPHeader *)Buffer; printf("Using #pragma pack(1) directive\n"); printf("Access offset @%x\n", (ULONG)&(headerPtr->BMPDataOffset)); StartTime = GetTimeStamp(); for (i = 0; i < Max; i++) { headerPtr->BMPDataOffset = GetNextValue(); } EndTime = GetTimeStamp(); break; } // // fall through // default: fprintf(stderr, "Usage: ALIGN number-of-iterations [-option]\n"); fprintf(stderr, "\nwhere option is one of the following:\n"); fprintf(stderr, "\t-0 Use ONLY aligned accesses.\n"); fprintf(stderr, "\t-1 NO alignment fix ups (causes an exception).\n"); fprintf(stderr, "\t-2 Use OS-based fix ups for misaligned accesses.\n"); fprintf(stderr, "\t-3 Use __UNALIGNED type qualifier.\n"); fprintf(stderr, "\t-4 Use #PRAGMA PACK(1) directive.\n"); ExitProcess(0); } printf("%d milliseconds\n", EndTime - StartTime); ExitProcess(0); } Listing Two ; ; Typical 32-bit store instruction ; ; Assumes: ; r3 contains word to store at address contained in r4 ; stw r3, 0(r4) ; store word contained in r3 ; to address contained in r4 + 0 Listing Three ; ; The equivalent 32-bit store resulting from use of the ; __unaligned type qualifier in the pointer declaration ; for IntPointer. ; Assumes: ; r3 contains word to store at address contained in r4 ; For the purposes of this example, assume that ; r3 = 0x12345678. ; stb r3, 0(r4) ; store the lower byte (0x78) ; of r3 into address contained ; in r4 + 0. rlwinm r5, r3,24,8,31 ; extract bits 16-23 into the ; low-order position of r5 ; How the rlwinm instruction works: ; Step 1: rotate contents of r3 left by 24 bits ; Result: 0x78123456 ; Step 2: generate a mask with 1-bits from ; bit 8 to 31 Result: 0x00ffffff ; Step 3: AND the contents of r3 with mask and ; place the result into r5. ; Result: r5 = 0x00123456 ; NOTE: the next stb instruction will store ; 0x56 into the address (r4 + 1). ; See Figure 1. 
/*
        stb     r5, 1(r4)       ; store next byte at r4 + 1
        rlwinm  r5, r3,16,16,31 ; extract bits 8-15 into r5
        stb     r5, 2(r4)       ; store next byte at r4 + 2
        rlwinm  r3, r3,8,24,31  ; extract bits 0-7 into r3
        stb     r3, 3(r4)       ; store final byte at r4 + 3

Figure 2:
*/

/*
 * rULONG(x): read the four bytes starting at the address of x, one byte at a
 * time, and assemble them into a ULONG with the byte at the lowest address as
 * the least-significant byte.  No access wider than one byte is made.
 *
 * x must be an lvalue; it is evaluated more than once, so it must not have
 * side effects.
 *
 * Each byte is widened to ULONG before shifting: a UCHAR would otherwise be
 * promoted to signed int, and shifting a value >= 0x80 left by 24 overflows.
 */
#define rULONG(x) ((ULONG)(                             \
      (ULONG)*((UCHAR *)&(x))                           \
    | ((ULONG)*((UCHAR *)&(x) + 1) <<  8)               \
    | ((ULONG)*((UCHAR *)&(x) + 2) << 16)               \
    | ((ULONG)*((UCHAR *)&(x) + 3) << 24)))

/*
 * rUSHORT(x): same as rULONG, but for the two bytes starting at the address
 * of x, producing a USHORT.
 */
#define rUSHORT(x) ((USHORT)(                           \
      *((UCHAR *)&(x))                                  \
    | (*((UCHAR *)&(x) + 1) << 8)))