_A GENERIC PARSING ENGINE IN C++_ by Todd D. Esposito and Andrew K. Johnson Listing One //------------------------------------------------------------------ // gpString.h - Declaration of the gpString class. // // Copyright 1994 Prodis Incorporated. // // Architect: TDE // Developer: AKJ // //------------------------------------------------------------------ #ifndef GPSTRING_H #define GPSTRING_H #include #include #include #define NPOS 32000 typedef unsigned size_t; // StripType is used with ::Strip() enum StripType { Trailing, Leading, Both, All }; // Direction and Inclusive are used // with ::FindString() and ::FindChar() enum Direction { Forward, Backward }; enum Inclusive { Of, NotOf }; class gpString { protected : char *cText; size_t nSize; int CharIsOfString(char cChar, char *cBuffer); public : // Constructors: gpString (const char *cString); gpString (char cChar); gpString (size_t nNewSize, char cFill = ' '); gpString ( ); gpString (gpString &sString); // Destructor: ~gpString ( ); // Size-related functions: size_t Length ( ) {return strlen(cText);} void Resize (size_t nNew); size_t Size ( ) {return nSize;} // Case Conversion - returns a copy: gpString ToUpper ( ); gpString ToLower ( ); // Assignment Operators: // Return the character in the string at a given offset. char &operator[] (int nPos) {return *(cText+nPos);} // Type conversion to char *. operator char *( ); // Assign one string to another. gpString &operator= (gpString &oString); // Append another string to this one. gpString &operator+= (gpString &oString); // Concatenate two strings. gpString operator+ (gpString &oString); // Relational operators: // Compare two strings for equality. virtual int operator== (gpString &oString); virtual int operator== (char *cString) {return operator==(gpString(cString));} // Compare two strings for inequality. int operator!= (gpString &oString); int operator!= (char *cString) {return operator!=(gpString(cString));} // Compare two strings for exclusive alphanumeric precedence. int operator< (gpString &oString); int operator< (char *cString) {return operator<(gpString(cString));} // Compare two strings for exclusive alphanumeric antecedence. int operator> (gpString &oString); int operator> (char *cString) {return operator>(gpString(cString));} // Compare two strings for inclusive alphanumeric precedence. int operator<= (gpString &oString); int operator<= (char *cString) {return operator<=(gpString(cString));} // Compare two strings for inclusive alphanumeric antecedence. int operator>= (gpString &oString); int operator>= (char *cString) {return operator>=(gpString(cString));} // Search Functions: // Find the target substring within this string. size_t FindSubstring (gpString &sTarget , Direction dDirect = Forward, size_t nStart = 0); // Find any character of the target string. size_t FindChar (Inclusive iIsOf, gpString &sTarget , Direction dDirect = Forward, size_t nStart = 0); // Edit Routines: // Insert the new string at the given position. gpString &Insert (gpString &sNew, size_t nPos); // Remove nSize characters starting with position nPos. gpString &Remove (size_t nPos, size_t nSize = NPOS); // Replace nSize characters with the new string. gpString &Replace (size_t nPos, size_t nSize, gpString &sNew); // Remove the given characters at the given positions. // We default to stripping leading and trailing whitespace. gpString &Strip (StripType s = Both , gpString &sTarget = gpString (" \t")); }; #endif Listing Two //------------------------------------------------------------------ // gpRegExp.cpp - Definition of the gpRegExp class. // // Copyright 1994 Prodis Incorporated. // // Purpose: The General Purpose Regular Expression class handles // pattern matching duties. // // Architect: TDE // Developer: AKJ // // Modification History: // 09/08/94 TDE: Original Code. // 09/09/94 AKJ: Fixed TDE's stuff, and fleshed out functions. // 01/17/95 AKJ: Added support for +. // 02/22/95 AKJ: Minor fix to Opional to allow fallback. // //------------------------------------------------------------------ #include #include #include #define LOWER_BOUND 1 #define UPPER_BOUND 3 // Create a gpRegExp object from a character string. gpRegExp::gpRegExp (const char *cNewText) : gpString (cNewText) { nDoICount = 0; fTopLevel = top; NextAtom = 0; ParseAtoms (); } gpRegExp::gpRegExp (char cChar, int top) : gpString (cChar) { nDoICount = 0; NextAtom = 0; fTopLevel = top; ParseAtoms (); } gpRegExp::gpRegExp (int top) : gpString ( ) { nDoICount = 0; NextAtom = 0; fTopLevel = top; ParseAtoms (); } gpRegExp::gpRegExp (const gpString &s, int top) : gpString (s) { nDoICount = 0; NextAtom = 0; fTopLevel = top; ParseAtoms (); } gpRegExp::~gpRegExp ( ) { if (NextAtom) delete NextAtom; } gpRegExp &gpRegExp::operator= (gpString &oString) { if (NextAtom) delete NextAtom; NextAtom = 0; fTopLevel = 1; (*this) = oString; ParseAtoms (); return *this; } gpRegExp &gpRegExp::operator= (char *cString) { if (NextAtom) delete NextAtom; (*this) = (cString); NextAtom = 0; fTopLevel = 1; ParseAtoms (); return *this; } //------------------------------------------------------------- void gpRegExp::ParseAtoms ( ) { int nPos = 0; int GotToken = 0; gpString *copy; ExpType = Literal; size_t nOffset; // Release the previous atoms and reset parameters. // This is ususally only for when an assign is done. firstOnly = 1; lastOnly = 0; if (fTopLevel) { // First, optimize by removing extraneous '.*'s if (FindSubstring ("^.*") == 0) { firstOnly = 0; Remove (0, 3); } else if (FindSubstring (".*") == 0) { firstOnly = 0; Remove (0, 2); } // Next, check for "Beginning-of-Line" if (*this[0] == '^') Remove (0, 1); else firstOnly = 0; } // Strip out the first atom in the string. copy = new gpString (cText); while (nPos < Length() && ! GotToken) { switch ((*this)[nPos]) { case '\\': // We have to Quote the next Remove (nPos, 1); // character, so remove the copy->Remove (nPos, 1); // slash and get the char. nPos++; break; case '.': // if we get a '.' if ((nPos) == 0) // and it's the first char if ((*this)[1] == '*') // and it's followed by a '*' { Remove (2); // Then we have a '.*' token. copy->Remove (0, 2); GotToken = 1; ExpType = MultiWild0; } else if ((*this)[1] == '+') // if followed by '+' { Remove (2); // Then we have a '.+' token. copy->Remove (0, 2); GotToken = 1; ExpType = MultiWild1; } else { Remove (1); // we have a plain copy->Remove (0, 1); // old '.' token. GotToken = 1; ExpType = Wild; } else { Remove (nPos); // we have a literal token. copy->Remove (0, nPos); GotToken = 1; ExpType = Literal; } break; case '*': // if we get a '*' if (nPos == 1) // and it's the second character { Remove (2); // The we have a * token. copy->Remove (0, 2); GotToken = 1; ExpType = MultiChar0; } else { Remove (nPos - 1); // Or, we have a literal token. copy->Remove (0, nPos - 1); GotToken = 1; ExpType = Literal; } break; case '+': // if we get a '+' if (nPos == 1) // and it's the second character { Remove (2); // The we have a + token. copy->Remove (0, 2); GotToken = 1; ExpType = MultiChar1; } else { Remove (nPos - 1); // Or, we have a literal token. copy->Remove (0, nPos - 1); GotToken = 1; ExpType = Literal; } break; case '$': Remove (nPos); // the buck stops here. copy->Remove (0); // And we won't have any kids. lastOnly = 1; GotToken = 1; ExpType = Literal; break; case '[': // if we get a '[' if ((nPos) > 0) // and it's NOT the first char { Remove (nPos); // we have a literal copy->Remove (0, nPos); ExpType = Literal; GotToken = 1; } else // or we are beginning a range. nPos++; break; case ']': // when we get ']' if ((*this)[nPos + 1] == '*') // we may have [...]* { Remove (nPos + 2); copy->Remove (0, nPos + 2); GotToken = 1; ExpType = MultiRange0; } else if ((*this)[nPos + 1] == '+') // we may have [...]+ { Remove (nPos + 2); copy->Remove (0, nPos + 2); GotToken = 1; ExpType = MultiRange1; } else { Remove (nPos + 1); // or just plain old [...] copy->Remove (0, nPos + 1); GotToken = 1; ExpType = Range; } break; case '{': // we have a brace if ((nPos) > 0) // and it's NOT the first char { Remove (nPos); // we have a literal copy->Remove (0, nPos); ExpType = Literal; GotToken = 1; } else // or we are beginning an // Optional expression { nOffset = FindChar (Of, "}"); copy->Remove (0, nOffset+1); Remove (nOffset); Remove(0, 1); while ( (nOffset = FindChar (Of, "|")) != NPOS ) { (*this)[nOffset] = '\0'; lrChildren.AddItem ( new gpRegExp (*this, 0) ); Remove (0, nOffset+1); } lrChildren.AddItem ( new gpRegExp (*this, 0) ); GotToken = 1; ExpType = Optional; } break; case '&' : // if we get ampersand if (nPos > 0) // and we've already got an atom { GotToken = 1; // then we stop where we are ExpType = Literal; Remove (nPos); copy->Remove (0, nPos); } else { // otherwise, we are starting nDoICount = 1; // a meaningful atom. Remove (0,1); copy->Remove (0,1); } break; default: // Just copy the character. nPos++; } } // Pass the rest along to the next atom. if (GotToken && (*copy != "")) NextAtom = new gpRegExp (*copy, 0); // Flag this guy as NOT top level. if (copy) delete copy; } //------------------------------------------------------------------ // // Routine : operator== (gpString) // // Function : sees if the given regular expression matches // this gpString. // // Notes : Normally, the gpString may be longer than the regular // expression; the comparison ends with the last character // in that expression. However, if the last character of // the expression is '$', then an exact match is called // for, and the gpString may not have extra characters. // //------------------------------------------------------------------ int gpRegExp::operator==(gpString &sExpress) { int nMin = 0; int lMatch = 0; // Assume that the match will fail. int nPos; gpString sBuffer; char cBuffer; gpString sStringSaver; if (fTopLevel) sStringSaver = sExpress; sLastMatch = ""; switch (ExpType) { case Literal : nPos = sExpress.FindSubstring (cText); if ((firstOnly && !nPos)||(!firstOnly && (nPos != NPOS))) { sLastMatch = cText; sExpress.Remove (0, nPos); sExpress.Remove (0, Length () ); lMatch = match_remainder (sExpress); } break; case MultiChar1 : nMin = 1; // set our min chars to 1 and fall thru case MultiChar0 : nPos = sExpress.FindChar(NotOf,(*this)[0]); if (nPos == NPOS) nPos = sExpress.Length (); lMatch = DecrementingMatch(nMin, nPos, sExpress); break; case Wild : if (sExpress.Length () ) { sLastMatch = sExpress[0]; lMatch = match_remainder(sExpress.Remove (0, 1) ); } break; case MultiWild1 : nMin = 1; // set our min chars to 1 and fall thru case MultiWild0 : nPos = sExpress.Length (); lMatch = DecrementingMatch(nMin, nPos, sExpress); break; case Range : cBuffer = sExpress[0]; cBuffer = toupper (cBuffer); if ( (cBuffer >= toupper(cText[LOWER_BOUND])) && (cBuffer <= toupper(cText[UPPER_BOUND]))) { lMatch = match_remainder(sExpress.Remove (0,1)); sLastMatch = cBuffer; } break; case MultiRange1 : nMin = 1; // set our min chars to 1 and fall thru case MultiRange0 : for (nPos = 0; (toupper(sExpress[nPos]) >= toupper(cText[LOWER_BOUND])) && (toupper(sExpress[nPos]) <= toupper(cText[UPPER_BOUND])); nPos++ ); lMatch = DecrementingMatch(nMin, nPos, sExpress); break; case Optional : { gpString sBuffer(sExpress); if (lrChildren.Seek (sExpress)) sLastMatch = lrChildren.Peek()->LastMatch(); if ((lMatch = match_remainder (sExpress)) == 0) { sLastMatch = ""; lMatch = match_remainder (sBuffer); } } } if (fTopLevel) sExpress = sStringSaver; return (lMatch); } // These helpers will keep trying to match a Multi-type atom. // First, try to match as much as possible, then try to match // the next atom. If that atom succeeds, good. // If not, we need to decrement our match string by one character // and retry. We do this until we have reached our minimum chars. int gpRegExp::DecrementingMatch(int nMin, int nPos , gpString &sExpress) { int lMatch = 0; gpString sBuffer; for (; !lMatch && (nPos >= nMin); nPos--) { sBuffer = sExpress; sBuffer.Remove(0, nPos); lMatch = match_remainder(sBuffer); } if (lMatch) { sLastMatch = sExpress; sLastMatch.Remove(++nPos); } return lMatch; } int gpRegExp::match_remainder (gpString &sExpress) { int lMatch = 1; if (lastOnly) { if (sExpress.Length () ) lMatch = 0; } else { if (NextAtom) lMatch = ((*NextAtom) == sExpress); } return lMatch; } //------------------------------------------------------------------ // // Routine : DumpParameters // // Function : travers the atom tree, creating a list of the // meaningful parameters. // //------------------------------------------------------------------ void gpRegExp::DumpParameters (StringList &lsParms) { if (nDoICount) lsParms.AddItem (new gpString (LastMatch () )); if (NextAtom) NextAtom->DumpParameters (lsParms); } //------------------------------------------------------------------ // The following code implements a List of gpRegExp's // We keep it here because the Optional atoms use it. //------------------------------------------------------------------ RegExpList::RegExpList ( ): List () { pFirst = pLast = pCurrent = 0; } RegExpList::~RegExpList ( ) { for (gpRegExp *eRegExp = Reset (); eRegExp; eRegExp = GetNext () ) delete eRegExp; } gpRegExp *RegExpList::Seek (gpString &sName) { int lFound = 0; Reset (); while (!lFound && pCurrent) { if (*(Peek ()) == sName ) lFound = 1; else GetNext (); } return pCurrent; } gpRegExp *RegExpList::Seek (char *cName) { gpString sName (cName); return (Seek (sName) ); } gpRegExp *RegExpList::Reset ( ) { return (pFirst = pCurrent = (gpRegExp *)List::Reset () ); } gpRegExp *RegExpList::GetNext ( ) { return (pCurrent = (gpRegExp *)List::GetNext () ); } gpRegExp *RegExpList::AddItem (gpRegExp *eNew) { pLast = pCurrent = (gpRegExp *)List::AddItem (eNew); if (!pFirst) pFirst = pLast; return (pCurrent); } gpRegExp *RegExpList::Peek ( ) { return pCurrent; } gpRegExp *RegExpList::Seek (int nSequence) { if (nListSize < nSequence) return 0; Reset (); for (int i = 1; i < nSequence; GetNext (), i++); return pCurrent; } gpRegExp &RegExpList::operator[] (int nSequence) { return (*(Seek (nSequence))); } void RegExpList::Clear ( ) { for (gpRegExp *eRegExp = Reset (); eRegExp; eRegExp = GetNext () ) delete eRegExp; List::Clear (); pFirst = pLast = pCurrent = 0; } Listing Three //------------------------------------------------------------------ // Syntax.cpp - Definition of the Syntax and SyntaxList classes. // // Copyright 1994 Prodis Incorporated. // // Purpose: The Syntax class pairs syntactic patterns (REs) with // tokens (ints) // // Architect: AKJ // Developer: AKJ // // Modification History: //------------------------------------------------------------------ #include #include SyntaxList::SyntaxList ( ): List () { pFirst = pLast = pCurrent = 0; } SyntaxList::~SyntaxList ( ) { for (Syntax *eSyntax = Reset (); eSyntax; eSyntax = GetNext () ) delete eSyntax; } Syntax *SyntaxList::Seek (gpString &sName) { int lFound = 0; Reset (); while (!lFound && pCurrent) { if ( (*(pCurrent->reSyntax)) == sName) lFound = 1; else GetNext (); } return pCurrent; } Syntax *SyntaxList::Seek (char *cName) { gpString sName (cName); return (Seek (sName)); } Syntax *SyntaxList::Reset ( ) { return (pFirst = pCurrent = (Syntax *)List::Reset () ); } Syntax *SyntaxList::GetNext ( ) { return (pCurrent = (Syntax *)List::GetNext () ); } Syntax *SyntaxList::AddItem (Syntax *eNew) { pLast = pCurrent = (Syntax *)List::AddItem (eNew); if (!pFirst) pFirst = pLast; return (pCurrent); } Syntax *SyntaxList::Peek ( ) { return pCurrent; } //------------------------------------------------------------------ // Syntax object starts here: Syntax::Syntax () { reSyntax = 0; nToken = 0; } Syntax::Syntax (gpRegExp *reExpress, int nNewToken) { reSyntax = reExpress; nToken = nNewToken; } Syntax::Syntax (char *cExpress, int nNewToken) { reSyntax = new gpRegExp(cExpress); nToken = nNewToken; } Listing Four // BasicMac.h - define tokens for the BASIC-Macro language. // We need only include the FLOW.H file; it includes everything // else we need. #include // Function prototypes: int EvaluateExpression (gpString &sExpress); gpFlowControl *BuildMacroEngine(); gpParser *BuildExpEngine( ); // These functions are prototyped, but not actually coded. // These are just for demonstration purposes, and would need // to be written in a productions system. int GetMacroLine (gpString &sLine); int RewindMacroSource (int nLine); void ErrorHandler(); void SetVariableToValue (gpString &p1, gpString &p2); void DoMenuItem (gpString &p1, gpString &p2, gpString &p3); void MessageBox (gpString &p1, gpString &p2, gpString &p3); void CallFunctionWithParms (gpString &p1, gpString &p2); int CheckEquals (gpString &p1, gpString &p2); int CheckGreater (gpString &p1, gpString &p2); int CheckLessThan (gpString &p1, gpString &p2); // Define our Tokens: // We base each token on TK_USERDEF to avoid conflicts with // predefined tokens. #define TK_DOMENUITEM (TK_USERDEF + 1) #define TK_ASSIGN (TK_USERDEF + 2) #define TK_MBOX (TK_USERDEF + 3) #define TK_CALL (TK_USERDEF + 4) Listing Five // BasicMac.cpp - demonstration of using the Prodis Parsing Engine. // // Notes: // We include BasicMac.h for our tokens, and it includes // all other neccessary files. #include "basicmac.h" // Declare global variable, just for ease of demonstration // flow control object for parsing input lines gpFlowControl *fcMacEngine; // parser object for parsing expressions gpParser *pExpEngine; int main ( ) { // Build the macro-language flow control object. fcMacEngine = BuildMacroEngine(); // Build the expression parser object. pExpEngine = BuildExpEngine(); gpString sLine; // buffer for input lines int nLineNo, nToken; // line number, token StringList slParms; // string list for holding parameters nLineNo = GetMacroLine(sLine); // Get the first line. // nLineNo will be zero when there are no more input lines. while(nLineNo) { // Parse the input line. The engine will return a token. nToken = fcMacroEngine->Parse(sLine, slParms); switch(nToken) { // no match was found for the line case TK_UNRECOGNIZED : ErrorHandler(); //Send an error message break; case TK_ASSIGN : // first parm was the variable name // second parm was the value to set it to SetVariableToValue(slParms[1], slParms[2]); break; case TK_IF : case TK_WHILE : { // slParms[1] has the expression following the 'if' // or while. It must be parsed and evaluated. int IsTrue = EvalateExpression(slParms[1]); PostExpressionValue(IsTrue); break; } case TK_DOMENUITEM : // The following function - not listed - // must implement the macro-language line : // DoCmd DoMenuItem [Parm1], [Parms2], [Parm3] DoMenuItem (lsParms[1], lsParms[2], lsParms[3]); break; case TK_MBOX : // The following function - not listed - // causes a Message Box to be displayed MessageBox(lsParms[1], lsParms[2], lsParms[3]); break; case TK_CALL : // The following function - not listed - // Matches the first parameter with a // function call and gets its arguments // from the second parameter. CallFunctionWithParms(lsParms[1], lsParms[2]); break; case TK_ENDWHILE : // When an ENDWHILE is returned, the first parameter // is the number of the line continaing the matching // WHILE clause, which will be re-evaluated. RewindMacroSource (atoi (lsParms[1])); break; case TK_ELSE : case TK_ENDIF : // We don't need to do anything here; gpFlowControl // will handle it all. However, if we were // interested, we could trap these lines. break; } nLineNo = GetMacroLine(sLine); // get the next line } return (1); } gpFlowControl *BuildMacroEngine( ) { gpFlowControl fcNew = new FlowControl; fcNew.AddSyntax ("^ *If +&.+ +Then *$", TK_IF); fcNew.AddSyntax ("^ *Else *$", TK_ELSE); fcNew.AddSyntax ("^ *Endif *$", TK_ENDIF); fcNew.AddSyntax ("^ *Let &.+ *= *&.+ *$", TK_ASSIGN); fcNew.AddSyntax ("^ *While *&.* *$", TK_WHILE); fcNew.AddSyntax ("^ *WEnd *$", TK_ENDWHILE); fcNew.AddSyntax ("^ *Call *MessageBox *(&.+,&.+,&.+) *$" , TK_MBOX); fcNew.AddSyntax ("^ *Call *&.+ *(&.+) *$", TK_CALL); fcNew.AddSyntax ("^ *DoCmd +DoMenuItem +&.+, +&.+, +&.+ *$" , TK_DOMENUITEM); return fcNew; } gpParser *BuildExpEngine( ) { gpParser pNew = new gpParser; pNew.AddSyntax ("&.+ *= *&.+", TK_EQUALS); pNew.AddSyntax ("&.+ *< *&.+", TK_LESS_THAN); pNew.AddSyntax ("&.+ *> *&.+", TK_GREATER_THAN); return pNew; } int EvalExp (gpString &sExpress) { int nReturn = 0; //return value int nToken; //token StringList lsParms; //String list for parameters // parse the given line and get the token nToken = pExpEngine->Parse(sExpress, lsParms); switch (nToken) { //the line is : expression = expression //if the two expressions are equal, return TRUE case TK_EQUALS : if CheckEquals(lsParms[1], lsParms[2]) nReturn = 1; break; //the line is : expression > expression //if the first expression is greater, return TRUE case TK_GREATER_THAN : if CheckGreater(lsParms[1], lsParms[2]) nReturn = 1; break; //the line is : expression < expression //if the second expression is greater, return TRUE case TK_LESS_THAN : if CheckLessThan (lsParms[1], lsParms[2]) nReturn = 1; } return nReturn; } Listing Six //------------------------------------------------------------------ // Syntoken.h - "System" Syntax tokens. // // Copyright 1994 Prodis Incorporated. // // Architect: AKJ // Developer: AKJ // // Modification History: //------------------------------------------------------------------ // Define "non-language" tokens: #define TK_UNRECOGNIZED 0 #define TK_NOOP 1 #define TK_REWIND 2 #define TK_COMMENT 3 // Define flow-control tokens: #define TK_IF 10 #define TK_ELSE 11 #define TK_MISMATCHED_ELSE 12 #define TK_ENDIF 13 #define TK_MISMATCHED_ENDIF 14 #define TK_LABEL 15 #define TK_GOTO 16 #define TK_WHILE 17 #define TK_ENDWHILE 18 #define TK_MISMATCHED_ENDWHILE 19 // Define Expression realational tokens: // (These are just here because we ALWAYS seem to need them.) #define TK_EQUALS 901 #define TK_NOT_EQUAL 902 #define TK_GREATER_THAN 903 #define TK_LESS_THAN 904 #define TK_GREATER_OR_EQUAL 905 #define TK_LESS_OR_EQUAL 906 #define TK_AND 907 #define TK_OR 908 #define TK_NOT 909 // Define the base for "user-defined" (app-specific) tokens. // Additional definitions should be of the form: // #define TK_SOMETOKEN (TK_USERDEF + n) #define TK_USERDEF 1000