_MIGRATING C CODE TO UNICODE_ by Timothy D. Nestved Listing One /** Name: UNICODE.h Desc: Contains both explicit/generic data types, macros, and function prototypes. Stage 1 modifications. ANSI compilation: default UNICODE compilation: use /DUNICODE. Hungarian notation is not used **/ #ifndef _unicode_h_ #define _unicode_h_ /* data type definitions */ /** CHAR and BYTE may already be #define'd or typedef'd in the compiler's standard include files, so a conditional check may need to be added to avoid redefinitions, errors and warnings. **/ // explicit types typedef char CHAR ; // standard char typedef CHAR * P_CHAR ; typedef unsigned short UNICHAR ; // UNICODE explicit data types typedef UNICHAR * P_UNICHAR ; typedef unsigned char BYTE ; // data buffer data types typedef BYTE * P_BYTE ; // text character generic types #if defined( UNICODE ) typedef UNICHAR TCHAR ; // generic data types (really Unicode) typedef TCHAR * P_TCHAR ; #else typedef CHAR TCHAR ; // generic data types (really ANSI) typedef TCHAR * P_TCHAR ; #endif /* macros */ #if defined( UNICODE ) # define TEXT(literal) L##literal // wide literal constant L'c' L"str" #else # define TEXT(literal) literal // literal constant 'c' "str" #endif #define CALC_CHAR2BYTES(exp) ( (exp) * sizeof(TCHAR) ) // n chars -> n bytes #define CALC_BYTE2CHARS(exp) ( (exp) / sizeof(TCHAR) ) // n bytes -> n chars /* function prototypes */ #if defined( UNICODE ) void mapAnsi2Unic( P_CHAR pAnsiStr, P_UNICHAR pUnicStr ) ; P_UNICHAR ucschr( P_UNICHAR pUCS, UNICHAR token ) ; P_UNICHAR ucscpy( P_UNICHAR pDst, P_UNICHAR pSrc ) ; int ucslen( P_UNICHAR pUCS ) ; void mapAnsi2Unic( P_CHAR pAnsiStr, P_UNICHAR pUnicStr ) ; UNICHAR mapExtCh2Unic( CHAR ansiChar ) ; #endif /** The function prototypes should be placed in the appropriately named header file. Always define the ANSI and Unicode explicit set first, then the generic set using the explicit names previously defined. **/ #define strcmpA strcmp // ANSI explicit string compare #define strcpyA strcpy // ANSI explicit string copy #define strlenA strlen // ANSI explicit string length #define strchrA strchr // ANSI explicit #define strcmpU ucscmp // UNICODE explicit string compare #define strcpyU ucscpy // UNICODE explicit string copy #define strlenU ucslen // UNICODE explicit string length #define strchrU ucschr // UNICODE explicit #if defined( UNICODE ) # define txtcmp strcmpU // generic string compare (really UNICODE) # define txtcpy strcpyU // generic string copy (really UNICODE) # define txtlen strlenU // generic string length (really UNICODE) # define txtchr strchrU // generic string (really UNICODE) #else # define txtcmp strcmpA // generic string compare (really ANSI) # define txtcpy strcpyA // generic string copy (really ANSI) # define txtlen strlenA // generic string length (really ANSI) # define txtchr strchrA // generic string (really ANSI) #endif /** Prototype parameter examples. It is important not to declare functions, parameters and return values incorrectly. Follow these two basic rules: 1) Never use generic data types for an explicit prototype 2) Avoid creating a generic prototype using explicit data types **/ #if defined( INCORRECT ) int mapUnic2Ansi( P_TCHAR pUniStr, P_TCHAR pAnsiStr, TCHAR rplCh ) ; P_CHAR encryptString( P_CHAR pStr ) ; int uniStrLen( P_TCHAR pUniStr ) ; #else int mapUnic2Ansi( P_UNICHAR pUniStr, P_CHAR pAnsiStr, CHAR rplCh ) ; P_TCHAR encryptString( P_TCHAR pStr ) ; int uniStrLen( P_UNICHAR pUniStr ) ; #endif #endif // end include file Listing Two /** Name: UCS.c Desc: Unicode character string functions. Hungarian notation is not used. **/ #if defined( UNICODE ) #include #include "unicode.h" #define ANSIEXTCHBASE 0x80 // start of ANSI extended characters static UNICHAR mapExtCh2Unic( CHAR ansiChar ) ; P_UNICHAR ucschr( P_UNICHAR pUCS, UNICHAR token ) { P_UNICHAR pStr = pUCS ; for ( ; *pStr != token; pStr++ ) ; return( pStr ) ; } P_UNICHAR ucscpy( P_UNICHAR pDst, P_UNICHAR pSrc ) { P_UNICHAR pStr = pDst ; while ( *pDst++ = *pSrc++ ) ; return( pStr ) ; } int ucslen( P_UNICHAR pUCS ) { P_UNICHAR pStr = pUCS ; for ( ; *pStr; pStr++ ) ; return( (int)( pStr - pUCS ) ) ; } void mapAnsi2Unic( P_CHAR pAnsiStr, P_UNICHAR pUnicStr ) { P_CHAR pSrc = pAnsiStr ; P_UNICHAR pDst = pUnicStr ; for ( ; *pSrc; pSrc++ ) { *pDst++ = (UNICHAR)( ( *pSrc > ANSIEXTCHBASE ) ? mapExtCh2Unic( *pSrc ) : *pSrc ) ; } *pDst = L'\0' ; // NULL terminate the Unicode string } UNICHAR mapExtCh2Unic( CHAR ansiChar ) { static UNICHAR extChArray[] = { /* 80 */ 0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7, 0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5, /* 90 */ 0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9, 0x00FF, 0x00D6, 0x00DC, 0x00A2, 0x00A3, 0x00A5, 0x20A7, 0x0192, /* A0 */ 0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA, 0x00BF, 0x2310, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB, /* B0 */ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510, /* C0 */ 0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F, 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567, /* D0 */ 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B, 0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580, /* E0 */ 0x03B1, 0x00DF, 0x0393, 0x03C0, 0x03A3, 0x03C3, 0x00B5, 0x03C4, 0x03A6, 0x0398, 0x03A9, 0x03B4, 0x221E, 0x03C6, 0x03B5, 0x2229, /* F0 */ 0x2261, 0x00B1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00F7, 0x2248, 0x00B0, 0x2219, 0x00B7, 0x221A, 0x207F, 0x00B2, 0x25A0, 0x00A0 } ; return( extChArray[ (int)( ansiChar - ANSIEXTCHBASE ) ] ) ; } Listing Three /** Name: BEFORE.c (convert) Desc: Shows a functions before migration to allow you to apply the steps discussed in the article and compare your results. The functions are strictly examples. Hungarian notation is not used. **/ #include #include #include #include "unicode.h" P_CHAR GenericText( P_CHAR text, int *bytes ) ; P_CHAR ExplicitText( P_CHAR pStr ) ; P_CHAR DataStream( P_CHAR pDst, P_CHAR pSrc, int count ) ; P_CHAR GenericText( P_CHAR text, int *bytes ) { P_CHAR pStr1 ; P_CHAR pStr2 ; pStr1 = strchr( text, 'A' ) ; // locate the token char pStr2 = strchr( pStr1, '@' ) ; // locate the delimiting char *pStr2 = '\0' ; // terminate string at delimiter *bytes = pStr2 - pStr1 ; // calc number of bytes // number chars == number of bytes return( pStr1 ) ; // return start of token found } P_CHAR ExplicitText( P_CHAR pStr ) { P_CHAR pAnsiStr ; // search a string that is explicitly an ANSI string pAnsiStr = strchr( pStr, 'W' ) ; *pAnsiStr = '\0' ; return( pStr ) ; } P_CHAR DataStream( P_CHAR pDst, P_CHAR pSrc, int count ) { strncpy( pDst, pSrc, count ) ; // strncpy used purely as an example *( pDst + 13 ) = 0 ; // just to randomly truncate the stream return( pDst ) ; } int main( void ) { CHAR text[ 78 ] ; // text string - conversion P_CHAR pToken ; P_CHAR pStr1 ; CHAR specialThanksTo[] = "Dawn Woods for editing my article" ; P_CHAR pStr2 ; CHAR data[ 78 ] = "Imagine this is byte data and not text!" ; CHAR temp[ 78 ] ; // temp data stream buffer P_CHAR pStr3 ; int nBytes ; strcpy( text, "Text with A Token@ in the stream." ) ; /* text string that should be converted to a generic string */ pStr1 = GenericText( text, &nBytes ) ; // get a token from the string pToken = (P_CHAR)malloc( nBytes + 1 ) ; // alloc space for token only strcpy( pToken, pStr1 ) ; // cpy str to alloc'd space printf( "Token '%s' (A Token)\n, pToken ) ; printf( "Bytes: %d\nCharacters: %d\n\n",strlen( pToken ),strlen( pToken)); free( pToken ) ; /* explicit text string that should not be generic */ pStr2 = ExplicitText( specialThanksTo ) ; printf( "Thanks '%s'\nBytes: %d\nCharacters: %d\n\n", pStr2, strlen( pStr2 ), strlen( pStr2 ) ) ; /* processing data in a stream */ pStr3 = DataStream( temp, data, 78 ) ; printf( "Stream '%s'\nCopy '%s'\n", data, pStr3 ) ; return( 0 ) ; } Listing Four /** Name: AFTER.c (convert) Desc: Shows a functions after migration to allow you to apply the steps discussed in the article and compare your results. The functions are strictly examples. Hungarian notation is not used **/ #include #include #include #include "unicode.h" P_TCHAR GenericText( P_TCHAR text, int *bytes ) ; P_CHAR ExplicitText( P_CHAR pStr ) ; P_BYTE DataStream( P_BYTE pDst, P_BYTE pSrc, int count ) ; P_TCHAR GenericText( P_TCHAR text, int *bytes ) { P_TCHAR pStr1 ; P_TCHAR pStr2 ; pStr1 = txtchr( text, TEXT('A') ) ; // locate the token char pStr2 = txtchr( pStr1, TEXT('@') ) ; // locate the delimiting char *pStr2 = TEXT('\0') ; // terminate string at delimiter *bytes = pStr2 - pStr1 ; // calc number of bytes // number chars == number of bytes return( pStr1 ) ; // return start of token found } P_CHAR ExplicitText( P_CHAR pStr ) { P_CHAR pAnsiStr ; // search a string that is explicitly an ANSI string pAnsiStr = strchr( pStr, 'W' ) ; *pAnsiStr = '\0' ; return( pStr ) ; } P_UNICHAR ExplicitUnicodeText( P_UNICHAR pStr ) { P_UNICHAR pUnicStr ; // purely an example, code not called // search a string that is explicitly an ANSI string pUnicStr = strchrU( pStr, L'W' ) ; *pUnicStr = L'\0' ; return( pStr ) ; } P_BYTE DataStream( P_BYTE pDst, P_BYTE pSrc, int count ) { memmove( pDst, pSrc, count ) ; *( pDst + 13 ) = 0 ; // just to randomly truncate the stream return( pDst ) ; } int main( void ) { TCHAR text[ 78 ] ; // text string - conversion P_TCHAR pToken ; P_TCHAR pStr1 ; CHAR specialThanksTo[] = "Dawn Woods for editing my article" ; P_CHAR pStr2 ; BYTE data[ 78 ] = "Imagine this is byte data and not text!" ; BYTE temp[ 78 ] ; // temp data stream buffer P_BYTE pStr3 ; int nBytes ; txtcpy( text, TEXT("Text with A Token@ in the stream.") ) ; /* text string that should be converted to a generic string */ pStr1 = GenericText( text, &nBytes ) ; // get a token from the string pToken = (P_CHAR)malloc( nBytes + 1 ) ; // alloc space for token only txtcpy( pToken, pStr1 ) ; // cpy str to alloc'd space printf( "Token '%s' (A Token)\n, pToken ) ; // need mapUnic2Ansi() printf( "Bytes: %d\nCharacters: %d\n\n", CALC_CHAR2BYTES(txtlen( pToken )), txtlen( pToken ) ) ; free( pToken ) ; /* explicit text string that should not be generic */ pStr2 = ExplicitText( specialThanksTo ) ; printf( "Thanks '%s'\nBytes: %d\nCharacters: %d\n\n", pStr2, strlenA( pStr2 ), strlenA( pStr2 ) ) ; /* processing data in a stream */ pStr3 = DataStream( temp, data, 78 ) ; printf( "Stream '%s'\nCopy '%s'\n", data, pStr3 ) ; return( 0 ) ; } Listing Six Token '' (A Token) Bytes: 14 Characters: 7 Thanks 'Dawn ' Bytes: 5 Characters: 5 Stream 'Imagine this is byte data and not text!' Copy 'Imagine this ' Listing Seven /** Name: TRANSPAR.c Desc: Example of how to use macros (such as the TEXT() macro) to eliminate unnecessary conditional directives, potential semantic errors and improve readability. **/ // Migration is NOT transparent. Requires a conditional directive in source. void notTransparent( void ) { // ... #if defined( UNICODE ) ucscpy( text, L"Text with A Token@ in the stream." ) ; // ... #else strcpy( text, "Text with A Token@ in the stream." ) ; // ... #endif // ... } // Migration is transparent. No conditional directive required void transparent( void ) { // ... txtcpy( text, TEXT( "Text with A Token@ in the stream." ) ) ; // ... }