_MIGRATING C CODE TO UNICODE_
by Timothy D. Nestved

Listing One

/** Name:   UNICODE.h
    Desc:   Contains both explicit/generic data types, macros, and function
            prototypes. Stage 1 modifications. ANSI compilation:    default
            UNICODE compilation: use  /DUNICODE. Hungarian notation is not used
**/

#ifndef _unicode_h_
#define _unicode_h_

/*  data type definitions */
/** CHAR and BYTE may already be #define'd or typedef'd in the compiler's 
standard include files, so a conditional check may need to be added to avoid 
redefinitions, errors and warnings. **/

// explicit types: their meaning never depends on the build mode
typedef char            CHAR ;      // standard char 
typedef CHAR *          P_CHAR ;

typedef unsigned short  UNICHAR ;   // UNICODE explicit data types
typedef UNICHAR *       P_UNICHAR ;

typedef unsigned char   BYTE ;      // data buffer data types
typedef BYTE *          P_BYTE ;

// text character generic types: TCHAR follows the build mode
#if defined( UNICODE )
   typedef UNICHAR      TCHAR ;     // generic data types (really Unicode)
#else
   typedef CHAR         TCHAR ;     // generic data types (really ANSI)
#endif
typedef TCHAR *         P_TCHAR ;   // identical declaration in both modes

/* macros */
/** TEXT() wraps a character or string literal so that it has the generic
    text type in either build mode.
    NOTE(review): an L-prefixed literal has type wchar_t, while UNICHAR is
    unsigned short; this assumes the two have the same size on the target
    compiler -- confirm before porting. **/
#if defined( UNICODE )
#   define TEXT(literal)   L##literal   // wide literal constant L'c' L"str"
#else
#   define TEXT(literal)   literal      // literal constant 'c' "str"
#endif

/** Convert between a count of TCHAR characters and a count of bytes. The
    argument is parenthesized, so any integer expression may be passed. **/
#define CALC_CHAR2BYTES(exp) ( (exp) * sizeof(TCHAR) ) // n chars -> n bytes 
#define CALC_BYTE2CHARS(exp) ( (exp) / sizeof(TCHAR) ) // n bytes -> n chars

/* function prototypes */
/** Unicode explicit functions; declared only for UNICODE compilations.
    Each function is declared exactly once. **/
#if defined( UNICODE )
void        mapAnsi2Unic( P_CHAR pAnsiStr, P_UNICHAR pUnicStr ) ;
P_UNICHAR   ucschr( P_UNICHAR pUCS, UNICHAR token ) ;
P_UNICHAR   ucscpy( P_UNICHAR pDst, P_UNICHAR pSrc ) ;
int         ucslen( P_UNICHAR pUCS ) ;
UNICHAR     mapExtCh2Unic( CHAR ansiChar ) ;
#endif

/** The function prototypes should be placed in the appropriately named header
file. Always define the ANSI and Unicode explicit set first, then the generic
set using the explicit names previously defined. **/
#define strcmpA      strcmp   // ANSI explicit string compare
#define strcpyA      strcpy   // ANSI explicit string copy
#define strlenA      strlen   // ANSI explicit string length
#define strchrA      strchr   // ANSI explicit string search

/** NOTE(review): strcmpU maps to ucscmp, but no ucscmp prototype appears in
    this header. Unless it is supplied elsewhere, txtcmp/strcmpU cannot be
    used in a UNICODE compilation. **/
#define strcmpU      ucscmp   // UNICODE explicit string compare
#define strcpyU      ucscpy   // UNICODE explicit string copy
#define strlenU      ucslen   // UNICODE explicit string length
#define strchrU      ucschr   // UNICODE explicit string search

#if defined( UNICODE )
#   define txtcmp    strcmpU   // generic string compare (really UNICODE)
#   define txtcpy    strcpyU   // generic string copy    (really UNICODE)
#   define txtlen    strlenU   // generic string length  (really UNICODE)
#   define txtchr    strchrU   // generic string search  (really UNICODE)
#else
#   define txtcmp    strcmpA   // generic string compare (really ANSI)
#   define txtcpy    strcpyA   // generic string copy    (really ANSI)
#   define txtlen    strlenA   // generic string length  (really ANSI)
#   define txtchr    strchrA   // generic string search  (really ANSI)
#endif

/** Prototype parameter examples. It is important not to declare functions, 
parameters and return values incorrectly.  Follow these two basic rules:
1) Never use generic data types for an explicit prototype
2) Avoid creating a generic prototype using explicit data types **/
#if defined( INCORRECT )
int     mapUnic2Ansi( P_TCHAR pUniStr, P_TCHAR pAnsiStr, TCHAR rplCh ) ;
P_CHAR  encryptString( P_CHAR pStr ) ;
int     uniStrLen( P_TCHAR pUniStr ) ;
#else
int     mapUnic2Ansi( P_UNICHAR pUniStr, P_CHAR pAnsiStr, CHAR rplCh ) ;
P_TCHAR encryptString( P_TCHAR pStr ) ;
int     uniStrLen( P_UNICHAR pUniStr ) ;
#endif

#endif   // end include file



Listing Two

/** Name:   UCS.c
    Desc:   Unicode character string functions. Hungarian notation is not used.
**/

#if defined( UNICODE )

#include <stdio.h>
#include "unicode.h"

#define ANSIEXTCHBASE   0x80      // start of ANSI extended characters

/** Forward declaration: mapAnsi2Unic() calls this function before its
    definition. It is not declared static, so that it agrees with the
    external prototype in unicode.h. **/
UNICHAR mapExtCh2Unic( CHAR ansiChar ) ;

/** Find the first occurrence of token in the NUL-terminated Unicode string
    pUCS.
    Returns a pointer to the matching character, or NULL when the terminator
    is reached first. Searching for 0 returns a pointer to the terminator. **/
P_UNICHAR
ucschr( P_UNICHAR pUCS, UNICHAR token )
{
P_UNICHAR pStr = pUCS ;
   for ( ; *pStr != token; pStr++ )
   {
      if ( *pStr == 0 )       // end of string reached without a match
      {
         return( NULL ) ;
      }
   }
   return( pStr ) ;
}

/** Copy the NUL-terminated Unicode string pSrc, terminator included, to
    pDst. The caller must provide a large enough destination.
    Returns pDst. **/
P_UNICHAR
ucscpy( P_UNICHAR pDst, P_UNICHAR pSrc )
{
P_UNICHAR pStr = pDst ;
   while ( ( *pDst++ = *pSrc++ ) != 0 )
   {
      ;                       // the copy happens in the loop condition
   }
   return( pStr ) ;
}

/** Return the number of characters (not bytes) that precede the terminator
    of the Unicode string pUCS. **/
int
ucslen( P_UNICHAR pUCS )
{
P_UNICHAR pStr = pUCS ;
   for ( ; *pStr; pStr++ ) ;
   return( (int)( pStr - pUCS ) ) ;
}

/** Convert the NUL-terminated single-byte string pAnsiStr to Unicode in
    pUnicStr, which must have room for the same number of characters plus a
    terminator. Characters at or above ANSIEXTCHBASE are translated through
    mapExtCh2Unic(); all others are widened unchanged. **/
void
mapAnsi2Unic( P_CHAR pAnsiStr, P_UNICHAR pUnicStr )
{
P_CHAR    pSrc = pAnsiStr ;
P_UNICHAR pDst = pUnicStr ;
   for ( ; *pSrc; pSrc++ )
   {
      // Work on the unsigned value: where plain char is signed, extended
      // characters are negative and would never compare above the base.
      unsigned char ch = (unsigned char)*pSrc ;

      *pDst++ = ( ch >= ANSIEXTCHBASE ) ? mapExtCh2Unic( *pSrc )
                                        : (UNICHAR)ch ;
   }
   *pDst = (UNICHAR)0 ;       // NUL-terminate the Unicode string
}

/** Map one extended single-byte character (0x80..0xFF) to its Unicode code
    point through a 128-entry table. A character below ANSIEXTCHBASE is
    returned widened but otherwise unchanged.
    NOTE(review): the table values look like the IBM PC code page 437
    assignments rather than a Windows "ANSI" code page -- confirm that this
    is the intended source character set. **/
UNICHAR
mapExtCh2Unic( CHAR ansiChar )
{
static const UNICHAR extChArray[] = {
     /* 80 */  0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
               0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
     /* 90 */  0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
               0x00FF, 0x00D6, 0x00DC, 0x00A2, 0x00A3, 0x00A5, 0x20A7, 0x0192,
     /* A0 */  0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
               0x00BF, 0x2310, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
     /* B0 */  0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,
               0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510,
     /* C0 */  0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F,
               0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567,
     /* D0 */  0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B,
               0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580,
     /* E0 */  0x03B1, 0x00DF, 0x0393, 0x03C0, 0x03A3, 0x03C3, 0x00B5, 0x03C4,
               0x03A6, 0x0398, 0x03A9, 0x03B4, 0x221E, 0x03C6, 0x03B5, 0x2229,
     /* F0 */  0x2261, 0x00B1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00F7, 0x2248,
               0x00B0, 0x2219, 0x00B7, 0x221A, 0x207F, 0x00B2, 0x25A0, 0x00A0 
} ;
   // Index with the unsigned value so that a signed CHAR cannot produce a
   // negative table offset.
   unsigned char ch = (unsigned char)ansiChar ;

   if ( ch < ANSIEXTCHBASE )
   {
      return( (UNICHAR)ch ) ;  // not an extended character: nothing to map
   }
   return( extChArray[ ch - ANSIEXTCHBASE ] ) ;
}

#endif   // defined( UNICODE )



Listing Three

/** Name:   BEFORE.c   (convert)
    Desc:   Shows functions before migration to allow you to apply the steps
            discussed in the article and compare your results. The functions 
            are strictly examples. Hungarian notation is not used.
**/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "unicode.h"

/* Example functions, all still written in terms of plain CHAR. */
P_CHAR GenericText( P_CHAR text, int *bytes ) ;               /* token extraction */
P_CHAR ExplicitText( P_CHAR pStr ) ;                          /* ANSI-only text   */
P_CHAR DataStream( P_CHAR pDst, P_CHAR pSrc, int count ) ;    /* raw data copy    */

/** Extract the token that begins at the first 'A' in text and ends just
    before the next '@'. The '@' is overwritten with a terminator, so text is
    modified in place.
    text  : NUL-terminated string to search (modified)
    bytes : receives the token length; set to 0 when no token is found
    Returns a pointer to the start of the token inside text, or NULL when
    either the 'A' or the following '@' is missing. **/
P_CHAR
GenericText( P_CHAR text, int *bytes )
{
P_CHAR   pStr1 ;
P_CHAR   pStr2 ;

   *bytes = 0 ;

   pStr1 = strchr( text, 'A' ) ;       // locate the token char
   if ( pStr1 == NULL )
   {
      return( NULL ) ;
   }

   pStr2 = strchr( pStr1, '@' ) ;      // locate the delimiting char
   if ( pStr2 == NULL )
   {
      return( NULL ) ;
   }

   *pStr2 = '\0' ;                     // terminate string at delimiter
   *bytes = (int)( pStr2 - pStr1 ) ;   // calc number of bytes
                                       // number chars == number of bytes
   return( pStr1 ) ;                   // return start of token found
}
/** Truncate the ANSI string pStr at its first 'W' by overwriting that
    character with a terminator. A string without a 'W' is left unchanged.
    Returns pStr. **/
P_CHAR
ExplicitText( P_CHAR pStr )
{
P_CHAR   pAnsiStr ;
   // search a string that is explicitly an ANSI string
   pAnsiStr = strchr( pStr, 'W' ) ;
   if ( pAnsiStr != NULL )          // strchr returns NULL when 'W' is absent
   {
      *pAnsiStr = '\0' ;
   }
   return( pStr ) ;
}
/** Copy up to count bytes from pSrc to pDst, then cut the copy short by
    storing a zero at a fixed offset. The zero is written only when that
    offset lies inside the count bytes the caller asked for, so the function
    never touches memory beyond pDst[ count - 1 ]. A count of zero or less
    copies nothing.
    Returns pDst. **/
P_CHAR
DataStream( P_CHAR pDst, P_CHAR pSrc, int count )
{
   enum { TRUNCATE_AT = 13 } ;      // arbitrary cut-off used by the example

   if ( count <= 0 )
   {
      return( pDst ) ;
   }
   strncpy( pDst, pSrc, (size_t)count ) ;   // strncpy used purely as an example
   if ( count > TRUNCATE_AT )
   {
      pDst[ TRUNCATE_AT ] = 0 ;     // just to randomly truncate the stream
   }
   return( pDst ) ;
}
/** Driver for the pre-migration examples: extracts a token from a text
    string, truncates an explicitly ANSI string, and copies a data stream,
    printing the result of each step.
    Returns 0 on success, EXIT_FAILURE when the token cannot be found or
    memory cannot be allocated. **/
int main( void )
{
CHAR     text[ 78 ] ;      // text string - conversion
P_CHAR   pToken ;
P_CHAR   pStr1 ;
CHAR     specialThanksTo[] = "Dawn Woods for editing my article" ;
P_CHAR   pStr2 ;
CHAR     data[ 78 ] = "Imagine this is byte data and not text!" ;
CHAR     temp[ 78 ] ;      // temp data stream buffer
P_CHAR   pStr3 ;
int      nBytes ;

   strcpy( text, "Text with A Token@ in the stream." ) ;

   /* text string that should be converted to a generic string */
   pStr1 = GenericText( text, &nBytes ) ;    // get a token from the string
   if ( pStr1 == NULL )
   {
      return( EXIT_FAILURE ) ;
   }
   pToken = malloc( nBytes + 1 ) ;           // alloc space for token only
   if ( pToken == NULL )
   {
      return( EXIT_FAILURE ) ;
   }
   strcpy( pToken, pStr1 ) ;                 // cpy str to alloc'd space
   printf( "Token '%s' (A Token)\n", pToken ) ;
   // strlen returns size_t; convert explicitly to match the %d conversions
   printf( "Bytes: %d\nCharacters: %d\n\n",
            (int)strlen( pToken ), (int)strlen( pToken ) ) ;
   free( pToken ) ;

   /* explicit text string that should not be generic */
   pStr2 = ExplicitText( specialThanksTo ) ;
   printf( "Thanks '%s'\nBytes: %d\nCharacters: %d\n\n",
            pStr2, (int)strlen( pStr2 ), (int)strlen( pStr2 ) ) ;

   /* processing data in a stream */
   pStr3 = DataStream( temp, data, 78 ) ;
   printf( "Stream '%s'\nCopy   '%s'\n", data, pStr3 ) ;
   return( 0 ) ;
}




Listing Four

/** Name:   AFTER.c   (convert)
    Desc:   Shows functions after migration to allow you to apply the steps
            discussed in the article and compare your results. The functions
            are strictly examples. Hungarian notation is not used.
**/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "unicode.h"

/* Example functions after migration: one generic, one explicit, one data. */
P_TCHAR GenericText( P_TCHAR text, int *bytes ) ;             /* generic text     */
P_CHAR  ExplicitText( P_CHAR pStr ) ;                         /* ANSI-only text   */
P_BYTE  DataStream( P_BYTE pDst, P_BYTE pSrc, int count ) ;   /* raw data copy    */

/** Extract the token that begins at the first 'A' in text and ends just
    before the next '@'. The '@' is overwritten with a terminator, so text is
    modified in place.
    text  : terminated generic (TCHAR) string to search (modified)
    bytes : receives the token size in BYTES, terminator excluded; set to 0
            when no token is found
    Returns a pointer to the start of the token inside text, or NULL when
    either the 'A' or the following '@' is missing. **/
P_TCHAR
GenericText( P_TCHAR text, int *bytes )
{
P_TCHAR  pStr1 ;
P_TCHAR  pStr2 ;

   *bytes = 0 ;

   pStr1 = txtchr( text, TEXT('A') ) ;       // locate the token char
   if ( pStr1 == NULL )
   {
      return( NULL ) ;
   }

   pStr2 = txtchr( pStr1, TEXT('@') ) ;      // locate the delimiting char
   if ( pStr2 == NULL )
   {
      return( NULL ) ;
   }

   *pStr2 = TEXT('\0') ;                     // terminate string at delimiter

   // Pointer subtraction yields a count of characters. Scale it by the
   // character size, because number of chars != number of bytes once TCHAR
   // is wider than one byte.
   *bytes = (int)( ( pStr2 - pStr1 ) * sizeof( *pStr1 ) ) ;
   return( pStr1 ) ;                         // return start of token found
}
/** Cut the explicitly-ANSI string pStr off at its first 'W' by storing a
    terminator there. When pStr contains no 'W' it is returned untouched.
    Returns pStr. **/
P_CHAR
ExplicitText( P_CHAR pStr )
{
P_CHAR   pAnsiStr ;
   // search a string that is explicitly an ANSI string
   pAnsiStr = strchr( pStr, 'W' ) ;
   if ( pAnsiStr != NULL )          // nothing to truncate when 'W' is absent
   {
      *pAnsiStr = '\0' ;
   }
   return( pStr ) ;
}
/** Cut the explicitly-Unicode string pStr off at its first L'W' by storing
    a terminator there. When no L'W' is found the string is left untouched.
    Returns pStr.
    Compiled only under UNICODE: strchrU maps to ucschr, which unicode.h
    prototypes, and UCS.c defines, only for UNICODE compilations. **/
#if defined( UNICODE )
P_UNICHAR
ExplicitUnicodeText( P_UNICHAR pStr )
{
P_UNICHAR   pUnicStr ;  // purely an example, code not called
   // search a string that is explicitly a Unicode string
   pUnicStr = strchrU( pStr, L'W' ) ;
   if ( pUnicStr != NULL )
   {
      *pUnicStr = L'\0' ;
   }
   return( pStr ) ;
}
#endif
/** Copy count bytes of raw data from pSrc to pDst (the regions may
    overlap), then cut the copy short by storing a zero at a fixed offset.
    The zero is written only when that offset lies inside the count bytes the
    caller asked for, so the function never touches memory beyond
    pDst[ count - 1 ]. A count of zero or less copies nothing.
    Returns pDst. **/
P_BYTE
DataStream( P_BYTE pDst, P_BYTE pSrc, int count )
{
   enum { TRUNCATE_AT = 13 } ;      // arbitrary cut-off used by the example

   if ( count <= 0 )
   {
      return( pDst ) ;
   }
   memmove( pDst, pSrc, (size_t)count ) ;
   if ( count > TRUNCATE_AT )
   {
      pDst[ TRUNCATE_AT ] = 0 ;     // just to randomly truncate the stream
   }
   return( pDst ) ;
}
int main( void )
{
TCHAR    text[ 78 ] ;      // text string - conversion
P_TCHAR  pToken ;
P_TCHAR  pStr1 ;
CHAR     specialThanksTo[] = "Dawn Woods for editing my 
article" ;
P_CHAR   pStr2 ;
BYTE     data[ 78 ] = "Imagine this is byte data and not text!" 
;
BYTE     temp[ 78 ] ;      // temp data stream buffer
P_BYTE   pStr3 ;
int      nBytes ;
   txtcpy( text, TEXT("Text with A Token@ in the stream.") ) ;
   /* text string that should be converted to a generic string 
*/
   pStr1  = GenericText( text, &nBytes ) ;   // get a token 
from the string
   pToken = (P_CHAR)malloc( nBytes + 1 ) ;   // alloc space for 
token only
   txtcpy( pToken, pStr1 ) ;                 // cpy str to 
alloc'd space
   printf( "Token '%s' (A Token)\n, pToken ) ;  // need 
mapUnic2Ansi()
   printf( "Bytes: %d\nCharacters: %d\n\n",
            CALC_CHAR2BYTES(txtlen( pToken )), txtlen( pToken ) 
) ;
   free( pToken ) ;
   /* explicit text string that should not be generic */
   pStr2 = ExplicitText( specialThanksTo ) ;
   printf( "Thanks '%s'\nBytes: %d\nCharacters: %d\n\n",
            pStr2, strlenA( pStr2 ), strlenA( pStr2 ) ) ;
   /* processing data in a stream */
   pStr3 = DataStream( temp, data, 78 ) ;
   printf( "Stream '%s'\nCopy   '%s'\n", data, pStr3 ) ;
   return( 0 ) ;
}



Listing Six

Token '' (A Token)
Bytes: 14
Characters: 7

Thanks 'Dawn '
Bytes: 5
Characters: 5

Stream 'Imagine this is byte data and not text!'
Copy   'Imagine this '



Listing Seven

/** Name:   TRANSPAR.c
    Desc:   Example of how to use macros (such as the TEXT() macro) to
            eliminate unnecessary conditional directives and potential
            semantic errors, and to improve readability.
**/
//    Migration is NOT transparent. Requires a conditional directive in
//    source.
/** Illustrative fragment: the copy is written twice, once per build mode,
    and the preprocessor selects one. Surrounding code is elided. **/
void notTransparent( void )
{
      // ...
#ifndef UNICODE
      /* ANSI build: narrow literal, narrow copy */
      strcpy( text, "Text with A Token@ in the stream." ) ;
      // ...
#else
      /* UNICODE build: wide literal, Unicode copy */
      ucscpy( text, L"Text with A Token@ in the stream." ) ;
      // ...
#endif
      // ...
}
//    Migration is transparent. No conditional directive required.
/** Illustrative fragment: one source line serves both build modes, because
    txtcpy and TEXT() resolve to the matching copy routine and literal form.
    Surrounding code is elided. **/
void transparent( void )
{
      // ...
      txtcpy( text, TEXT( "Text with A Token@ in the stream." ) ) ;
      // ...
}




