stdin
by default, or another file may be
specified as a command line argument), then prints it out. At the end,
it prints out the symbol table which has been constructed.
<pp1.c>= #include <stdio.h> #include <stdlib.h> #include "global.h" #include "symtab.h" #include "token.h" #include "lexer.h" char *progname, /* name under which lexer is being run */ *filename; /* name of PSub input file */ main(int argc, char *argv[]) { Token currTok; FILE *fileptr; <process arguments> do { currTok = getNextToken(); printf("%-20s %-20s %-20s\n", showTokenKind(getTokenKind(currTok)), getTokenLexeme(currTok), showTokenValue(getTokenValue(currTok))); } while (getTokenKind(currTok) != EofTOK); printf("\n"); printSymbolTable(); exit(EXIT_SUCCESS); }
Definesargc
,argv
,currTok
,filename
,fileptr
,progname
(links are to index).
If there are any arguments, the first is taken as the name of a PSub program to be lexed. Otherwise, the standard input is lexed.
<process arguments>= (<-U) progname = argv[0]; if (argc > 1) { filename = argv[1]; if (fileptr = fopen(filename, "r")) { initLexer(fileptr); } else { perror(progname); exit(EXIT_FAILURE); } } else { filename = "stdin"; initLexer(stdin); }
<global.h>= #ifndef GLOBAL_H #define GLOBAL_H extern char *progname; extern char *filename; extern int lineno; #endif GLOBAL_H
SymTabEntry
and a function lookupSymbol()
which
takes a string argument and returns the corresponding table entry for
that symbol, creating a new entry if necessary. There is also a
function printSymbolTable()
which prints the contents of the table
for debugging purposes.
<symtab.h>= #ifndef SYMTAB_H #define SYMTAB_H <SymTabEntry ADT> SymTabEntry lookupSymbol(const char *symbol); void printSymbolTable(void); #endif SYMTAB_H
For this project, each symbol table entry only needs to contain the name
of the identifier; future projects will require additional fields such
as type and scope information. Identifier names are assumed to be
less than MAXSYMLEN
characters, although attempts are made to check
that this is not exceeded.
<SymTabEntry ADT>= (<-U) #define MAXSYMLEN 100 typedef struct { char name[MAXSYMLEN]; } *SymTabEntry; /* Constructor */ SymTabEntry createEntry(const char *name); /* Destructor */ void destroyEntry(SymTabEntry entry); /* Selector */ char *getSymbolName(const SymTabEntry entry);
DefinesMAXSYMLEN
,SymTabEntry
(links are to index).
tsearch()
. From the man page:
The comparison function we provide uses#include <search.h>
void *tsearch(const void *key, void **rootp,
int (*compar)(const void *, const void *));
tsearch()
is a routine for manipulating binary search trees. It is generalized from Knuth (6.2.2) Algorithm T. All comparisons are done with a user-supplied routine. This routine is called with two arguments, the pointers to the elements being compared. It returns an integer less than, equal to, or greater than 0, according to whether the first argument is to be considered less than, equal to, or greater than the second argument. The comparison function need not compare every byte, so arbitrary data may be contained in the elements in addition to the values being compared.
tsearch()
is used to build and access the tree.key
is a pointer to a datum to be accessed or stored. If there is a datum in the tree equal to*key
(the value pointed to bykey
), a pointer to this found datum is returned. Otherwise,*key
is inserted, and a pointer to it returned. Only pointers are copied, so the calling routine must store the data.rootp
points to a variable that points to the root of a tree. ANULL
value for the variable pointed to byrootp
denotes an empty tree; in this case, the variable will be set to point to the datum which will be at the root of the new tree.
strncasecmp()
to perform a
case-insensitive test, since case is not significant in PSub.
<symtab.c>= [D->] #include <search.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include "global.h" #include "symtab.h" <SymTabEntry ADT implementation> static void *root = NULL; /* the root of the symbol table */ static int compareEntries(const void *entry1, const void *entry2) { return strncasecmp(((const SymTabEntry)entry1)->name, ((const SymTabEntry)entry2)->name, MAXSYMLEN); } SymTabEntry lookupSymbol(const char *symbol) { SymTabEntry newEntry, *foundEntryPtr; newEntry = createEntry(symbol); foundEntryPtr = (SymTabEntry *)tsearch((void *)newEntry, &root, compareEntries); if (foundEntryPtr == NULL) { perror(progname); } if (*foundEntryPtr != newEntry) { destroyEntry(newEntry); } return *foundEntryPtr; }
DefinescompareEntries
,lookupSymbol
,root
(links are to index).
To print the symbol table, we use the C library function twalk()
.
From the man page:
Our auxiliary function#include <search.h>
void twalk(void *root, void (*action)(void *, VISIT, int));
twalk()
traverses a binary search tree.root
is the root of the tree to be traversed. (Any node in a tree may be used as the root for a walk below that node.)action
is the name of a routine to be invoked at each node. This routine is, in turn, called with three arguments. The first argument is the address of the node being visited. The second argument is a value from an enumeration data typetypedef enum { preorder, postorder, endorder, leaf } VISIT;
(defined in the<search.h>
header), depending on whether this is the first, second or third time that the node has been visited (during a depth-first, left-to-right traversal of the tree), or whether the node is a leaf. The third argument is the level of the node in the tree, with the root being level zero.
printEntry()
prints the symbol table entry
when the VISIT
parameter is either postorder
or leaf
, which
means that each interior node is printed between its left child and its
right child; this gives what is commonly called an inorder
traversal (despite the misleading use of postorder
). The result is
that the entries are printed in sorted order.
<symtab.c>+= [<-D] static void printEntry(const void *entry, VISIT order, int level) { if (order == postorder || order == leaf) { printf("%s\n", getSymbolName(*(SymTabEntry *)entry)); } } void printSymbolTable(void) { printf("Symbol Table\n"); printf("------------\n"); twalk(root, printEntry); printf("------------\n"); }
DefinesprintEntry
,printSymbolTable
(links are to index).
<SymTabEntry ADT implementation>= (<-U) SymTabEntry createEntry(const char *name) { SymTabEntry entry; if (entry = (SymTabEntry)malloc(sizeof entry)) { entry->name[0] = 0; /* set to null string */ strncat(entry->name, name, MAXSYMLEN); return entry; } else { perror(progname); } } void destroyEntry(SymTabEntry entry) { free((void *)entry); } char *getSymbolName(const SymTabEntry entry) { return entry->name; }
DefinescreateEntry
,destroyEntry
,getSymbolName
(links are to index).
getNextToken()
, which returns the next token found on the input,
and initLexer()
, which takes as an argument the file pointer which
the lexer should use for input. It depends on the abstract type
Token
, defined in token.h, as well as SymTabEntry
,
defined in symtab.h.
<lexer.h>= #ifndef LEXER_H #define LEXER_H Token getNextToken(void); void initLexer(FILE *fileptr); #endif LEXER_H
<lexer.l>= %{ #include <stdlib.h> #include "global.h" #include "symtab.h" #include "token.h" #include "lexer.h" <lexical C declarations> %} <lexical definitions> %% <lexical rules> %% <lexical C functions>
The actions of the lexer include keeping track of the current line
number, lineno
, for reporting errors:
<lexical C declarations>= (<-U) [D->] int lineno = 1;
Defineslineno
(links are to index).
flex uses the macro YY_DECL
to declare the lexing function;
we want it to be called getNextToken()
and have return type Token
:
<lexical C declarations>+= (<-U) [<-D->] #define YY_DECL Token getNextToken(void)
DefinesgetNextToken
(links are to index).
The scanned token will be collected in currToken
. For convenience,
we define the macro setCT()
to initialize the appropriate fields
using setToken()
:
<lexical C declarations>+= (<-U) [<-D] Token currToken; #define setCT(kind,val) setToken(currToken, kind, yytext, val)
DefinescurrToken
,setCT
(links are to index).
The definitions start with a case-insensitive version of the alphabet:
<lexical definitions>= (<-U) [D->] a [Aa] b [Bb] c [Cc] d [Dd] e [Ee] f [Ff] g [Gg] h [Hh] i [Ii] j [Jj] k [Kk] l [Ll] m [Mm] n [Nn] o [Oo] p [Pp] q [Qq] r [Rr] s [Ss] t [Tt] u [Uu] v [Vv] w [Ww] x [Xx] y [Yy] z [Zz]
An integer is one or more digits, and a real number is an integer followed by either a fractional part or an exponent, or both:
<lexical definitions>+= (<-U) [<-D->] integer [0-9]+ frac "."{integer} expt {e}[-+]?{integer} real {integer}({frac}{expt}?|{expt})
Definesexpt
,frac
,integer
,real
(links are to index).
An identifier is a letter followed by one or more alphanumerics; the case is preserved but is not significant in searching the symbol table.
<lexical definitions>+= (<-U) [<-D->] ident [A-Za-z][A-Za-z0-9]*
Definesident
(links are to index).
Whitespace is spaces, tabs, carriage returns, and newlines. Newlines are handled separately to maintain the line number count
<lexical definitions>+= (<-U) [<-D->] white [ \t\r]+ newline \n
Definesnewline
,white
(links are to index).
The exclusive states COMMENT1 and COMMENT2 handle the body of comments (respectively surrounded by { } and (* *), so that we can continue to process newlines inside them.
<lexical definitions>+= (<-U) [<-D] %x COMMENT1 COMMENT2
DefinesCOMMENT1
,COMMENT2
(links are to index).
Here are the rules for dealing with whitespace and comments:
<lexical rules>= (<-U) [D->] {white} /* eat up whitespace */ {newline} lineno++; "{" BEGIN(COMMENT1); "(*" BEGIN(COMMENT2); <COMMENT1,COMMENT2>{newline} lineno++; <COMMENT1>[^}\n]+ /* ignore the comment body except for newlines */ <COMMENT2>[^*\n]+ /* ignore the comment body except for newlines */ <COMMENT2>"*" /* ignore lone stars in the comment body */ <COMMENT1>"}" BEGIN(INITIAL); <COMMENT2>"*)" BEGIN(INITIAL); <COMMENT1,COMMENT2><<EOF>> { fprintf(stderr, "Error: unclosed comment at EOF in %s, line %d\n", filename, lineno); return setCT(EofTOK, setNoValue()); } <<EOF>> return setCT(EofTOK, setNoValue());
Here are the rules for dealing with keywords:
<lexical rules>+= (<-U) [<-D->] {p}{r}{o}{g}{r}{a}{m} return setCT(ProgramTOK, setNoValue()); {p}{r}{o}{c}{e}{d}{u}{r}{e} return setCT(ProcedureTOK, setNoValue()); {f}{u}{n}{c}{t}{i}{o}{n} return setCT(FunctionTOK, setNoValue()); {b}{e}{g}{i}{n} return setCT(BeginTOK, setNoValue()); {e}{n}{d} return setCT(EndTOK, setNoValue()); {i}{f} return setCT(IfTOK, setNoValue()); {t}{h}{e}{n} return setCT(ThenTOK, setNoValue()); {e}{l}{s}{e} return setCT(ElseTOK, setNoValue()); {w}{h}{i}{l}{e} return setCT(WhileTOK, setNoValue()); {d}{o} return setCT(DoTOK, setNoValue()); {r}{e}{p}{e}{a}{t} return setCT(RepeatTOK, setNoValue()); {u}{n}{t}{i}{l} return setCT(UntilTOK, setNoValue()); {v}{a}{r} return setCT(VarTOK, setNoValue()); {i}{n}{t}{e}{g}{e}{r} return setCT(IntegerTOK, setNoValue()); {r}{e}{a}{l} return setCT(RealTOK, setNoValue()); {b}{o}{o}{l}{e}{a}{n} return setCT(BooleanTOK, setNoValue()); {n}{o}{t} return setCT(NotTOK, setNoValue());
Here are the rules for dealing with operators:
<lexical rules>+= (<-U) [<-D->] ":=" return setCT(AssignTOK, setNoValue()); "=" return setCT(RelOpTOK, setOpValue(EqOP)); "<>" return setCT(RelOpTOK, setOpValue(NotEqOP)); "<" return setCT(RelOpTOK, setOpValue(LessOP)); "<=" return setCT(RelOpTOK, setOpValue(LessEqOP)); ">" return setCT(RelOpTOK, setOpValue(GrtrOP)); ">=" return setCT(RelOpTOK, setOpValue(GrtrEqOP)); {o}{r} return setCT(AddOpTOK, setOpValue(OrOP)); "+" return setCT(AddOpTOK, setOpValue(PlusOP)); "-" return setCT(AddOpTOK, setOpValue(MinusOP)); {a}{n}{d} return setCT(MulOpTOK, setOpValue(AndOP)); {m}{o}{d} return setCT(MulOpTOK, setOpValue(ModOP)); {d}{i}{v} return setCT(MulOpTOK, setOpValue(DivOP)); "*" return setCT(MulOpTOK, setOpValue(TimesOP)); "/" return setCT(MulOpTOK, setOpValue(DivideOP));
Here are the rules for dealing with identifiers and numbers (note that the identifier rule has to come after all of the keyword tokens):
<lexical rules>+= (<-U) [<-D->] {ident} { return setCT(IdentifierTOK, setSymbolValue(lookupSymbol(yytext))); } {integer} return setCT(NumberTOK, setIntegerValue(atoi(yytext))); {real} return setCT(NumberTOK, setRealValue(atof(yytext)));
And here are the rules for dealing with punctuation and everything else:
<lexical rules>+= (<-U) [<-D] "(" return setCT(LParenTOK, setNoValue()); ")" return setCT(RParenTOK, setNoValue()); ":" return setCT(ColonTOK, setNoValue()); ";" return setCT(SemicolonTOK, setNoValue()); "," return setCT(CommaTOK, setNoValue()); "." return setCT(PeriodTOK, setNoValue()); . { fprintf(stderr, "Error: illegal character %s in %s, line %d\n", yytext, filename, lineno); }
All we need to do to initialize the lexer is set yyin
to the desired
input file pointer.
<lexical C functions>= (<-U) void initLexer(FILE *fileptr) { yyin = fileptr; }
DefinesinitLexer
(links are to index).
Token
contains the kind of the token, the actual lexeme found, and
the corresponding interpretation of that lexeme as a value (in the case
of a token kind where the lexeme provides additional information). For
example, the token 3.14
has kind NumberTOK
, lexeme "3.14"
,
and value 3.14
(as a double
).
<token.h>= #ifndef TOKEN_H #define TOKEN_H <TokenKind ADT> <TokenValue ADT> typedef struct { TokenKind kind; char *lexeme; TokenValue val; } Token; /* Initializer */ Token setToken(Token token, const TokenKind kind, char *lexeme, const TokenValue val); /* Selectors */ TokenKind getTokenKind(const Token token); char *getTokenLexeme(const Token token); TokenValue getTokenValue(const Token token); #endif TOKEN_H
DefinesToken
(links are to index).
Here are all of the token kinds of PSub, including the special kind
EofTOK
which is only returned on end-of-file.
<TokenKind ADT>= (<-U) typedef enum { IdentifierTOK, NumberTOK, ProgramTOK, ProcedureTOK, FunctionTOK, BeginTOK, EndTOK, IfTOK, ThenTOK, ElseTOK, WhileTOK, DoTOK, RepeatTOK, UntilTOK, VarTOK, IntegerTOK, RealTOK, BooleanTOK, NotTOK, AssignTOK, RelOpTOK, AddOpTOK, MulOpTOK, LParenTOK, RParenTOK, ColonTOK, SemicolonTOK, CommaTOK, PeriodTOK, EofTOK } TokenKind; /* Converter */ char *showTokenKind(const TokenKind kind);
DefinesTokenKind
(links are to index).
The value component of a Token
may be one of several types:
int
, double
, SymTabEntry
, or OpKind
. The type
field
identifies what sort of datum is in the value
field; if it is
NoVAL
, then there is no associated value for this token (for
example, it might just be a keyword or punctuation).
<TokenValue ADT>= (<-U) typedef enum { IntegerVAL, RealVAL, SymbolVAL, OpVAL, NoVAL } TokenValueType; <OpKind ADT> typedef struct { TokenValueType type; union { int intval; double realval; SymTabEntry symbolval; OpKind opval; } value; } TokenValue; /* Initializers */ TokenValue setIntegerValue(const int intval); TokenValue setRealValue(const double realval); TokenValue setSymbolValue(const SymTabEntry symbolval); TokenValue setOpValue(const int opval); TokenValue setNoValue(void); /* Converter */ char *showTokenValue(const TokenValue value); /* Selectors will be needed in future projects, but not yet */
DefinesTokenValue
,TokenValueType
(links are to index).
The type OpKind
distinguishes which particular operator was seen,
since the token kind only discriminates between relational, additive,
and multiplicative operators.
<OpKind ADT>= (<-U) typedef enum { EqOP, NotEqOP, LessOP, LessEqOP, GrtrOP, GrtrEqOP, /* RelOps */ OrOP, PlusOP, MinusOP, /* AddOps */ AndOP, DivOP, ModOP, TimesOP, DivideOP /* MulOps */ } OpKind; /* Converter */ char *showOpKind(const OpKind kind);
DefinesOpKind
(links are to index).
<token.c>= #include "global.h" #include "symtab.h" #include "token.h" <TokenKind ADT implementation> <TokenValue ADT implementation> Token setToken(Token token, const TokenKind kind, char *lexeme, const TokenValue val) { token.kind = kind; token.lexeme = lexeme; token.val = val; return token; } TokenKind getTokenKind(const Token token) { return token.kind; } char *getTokenLexeme(const Token token) { return token.lexeme; } TokenValue getTokenValue(const Token token) { return token.val; }
DefinesgetTokenKind
,getTokenLexeme
,getTokenValue
,setToken
(links are to index).
<TokenKind ADT implementation>= (<-U) char *showTokenKind(const TokenKind kind) { switch (kind) { case IdentifierTOK: return "IdentifierTOK"; case NumberTOK: return "NumberTOK"; case ProgramTOK: return "ProgramTOK"; case ProcedureTOK: return "ProcedureTOK"; case FunctionTOK: return "FunctionTOK"; case BeginTOK: return "BeginTOK"; case EndTOK: return "EndTOK"; case IfTOK: return "IfTOK"; case ThenTOK: return "ThenTOK"; case ElseTOK: return "ElseTOK"; case WhileTOK: return "WhileTOK"; case DoTOK: return "DoTOK"; case RepeatTOK: return "RepeatTOK"; case UntilTOK: return "UntilTOK"; case VarTOK: return "VarTOK"; case IntegerTOK: return "IntegerTOK"; case RealTOK: return "RealTOK"; case BooleanTOK: return "BooleanTOK"; case NotTOK: return "NotTOK"; case AssignTOK: return "AssignTOK"; case RelOpTOK: return "RelOpTOK"; case AddOpTOK: return "AddOpTOK"; case MulOpTOK: return "MulOpTOK"; case LParenTOK: return "LParenTOK"; case RParenTOK: return "RParenTOK"; case ColonTOK: return "ColonTOK"; case SemicolonTOK: return "SemicolonTOK"; case CommaTOK: return "CommaTOK"; case PeriodTOK: return "PeriodTOK"; case EofTOK: return "EofTOK"; } }
DefinesshowTokenKind
(links are to index).
<TokenValue ADT implementation>= (<-U) <OpKind ADT implementation> TokenValue currTokenValue; TokenValue setIntegerValue(const int intval) { currTokenValue.type = IntegerVAL; currTokenValue.value.intval = intval; return currTokenValue; } TokenValue setRealValue(const double realval) { currTokenValue.type = RealVAL; currTokenValue.value.realval = realval; return currTokenValue; } TokenValue setSymbolValue(const SymTabEntry symbolval) { currTokenValue.type = SymbolVAL; currTokenValue.value.symbolval = symbolval; return currTokenValue; } TokenValue setOpValue(const int opval) { currTokenValue.type = OpVAL; currTokenValue.value.opval = opval; return currTokenValue; } TokenValue setNoValue(void) { currTokenValue.type = NoVAL; return currTokenValue; } /* a buffer in which to print out numbers */ static char valueString[100]; char *showTokenValue(const TokenValue val) { switch (val.type) { case IntegerVAL: sprintf(valueString, "%d", val.value.intval); return valueString; case RealVAL: sprintf(valueString, "%G", val.value.realval); return valueString; case SymbolVAL: return getSymbolName(val.value.symbolval); case OpVAL: return showOpKind(val.value.opval); case NoVAL: return ""; } }
DefinescurrTokenValue
,setIntegerValue
,setNoValue
,setOpValue
,setRealValue
,setSymbolValue
,showTokenValue
,valueString
(links are to index).
<OpKind ADT implementation>= (<-U) char *showOpKind(const OpKind kind) { switch (kind) { case EqOP: return "EqOP"; case NotEqOP: return "NotEqOP"; case LessOP: return "LessOP"; case LessEqOP: return "LessEqOP"; case GrtrOP: return "GrtrOP"; case GrtrEqOP: return "GrtrEqOP"; case OrOP: return "OrOP"; case PlusOP: return "PlusOP"; case MinusOP: return "MinusOP"; case AndOP: return "AndOP"; case DivOP: return "DivOP"; case ModOP: return "ModOP"; case TimesOP: return "TimesOP"; case DivideOP: return "DivideOP"; } }
DefinesshowOpKind
(links are to index).