home *** CD-ROM | disk | FTP | other *** search
- /*
- * tlex.c -- the lexical analyzer.
- */
-
- #include "::h:gsupport.h"
- #include "tproto.h"
- #include "trans.h"
- #include "tlex.h"
- #include "tree.h"
- #include "token.h"
- #include <ctype.h>
-
- #if MACINTOSH
- #if MPW
- #include <CursorCtl.h>
- #define CURSORINTERVAL 100
- #endif /* MPW */
- #endif /* MACINTOSH */
-
- /*
- * Prototypes.
- */
-
- hidden int bufcmp Params((char *s));
- hidden int ctlesc Params((noargs));
- hidden struct toktab *findres Params((noargs));
- hidden struct toktab *getident Params((int ac,int *cc));
- hidden struct toktab *getnum Params((int ac,int *cc));
- hidden struct toktab *getopr Params((int ac,int *cc));
- hidden struct toktab *getstring Params((int ac,int *cc));
- hidden int hexesc Params((noargs));
- hidden int nextchar Params((noargs));
- hidden int octesc Params((int ac));
- hidden int setfilenm Params((int c));
- hidden int setlineno Params((noargs));
-
- #define isletter(s) (isupper(c) | islower(c))
-
- #if EBCDIC
- extern char ToEBCDIC[256], FromEBCDIC[256];
- #endif /* EBCDIC */
-
- #if !EBCDIC
- #define tonum(c) (isdigit(c) ? (c - '0') : ((c & 037) + 9))
-
- /*
- * esctab - translates single-character escapes in string literals.
- */
-
- static unsigned char esctab[] = {
- 000, 001, 002, 003, 004, 005, 006, 007, /* NUL-BEL */
- 010, 011, 012, 013, 014, 015, 016, 017, /* BS -SI */
- 020, 021, 022, 023, 024, 025, 026, 027, /* DLE-ETB */
- 030, 031, 032, 033, 034, 035, 036, 037, /* CAN-US */
- ' ', '!', '"', '#', '$', '%', '&', '\'', /* !"#$%&' */
- '(', ')', '*', '+', ',', '-', '.', '/', /* ()*+,-./ */
- 000, 001, 002, 003, 004, 005, 006, 007, /* 01234567 */
- 010, 011, ':', ';', '<', '=', '>', '?', /* 89:;<=>? */
- '@', 'A', '\b', 'C', 0177, 033, 014, 'G', /* @ABCDEFG */
- 'H', 'I', 'J', 'K', '\n', 'M', '\n', 'O', /* HIJKLMNO */
- 'P', 'Q', '\r', 'S', '\t', 'U', 013, 'W', /* PQRSTUVW */
- 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', /* XYZ[\]^_ */
- '`', 'a', '\b', 'c', 0177, 033, 014, 'g', /* `abcdefg */
- 'h', 'i', 'j', 'k', '\n', 'm', '\n', 'o', /* hijklmno */
- 'p', 'q', '\r', 's', '\t', 'u', 013, 'w', /* pqrstuvw */
- 'x', 'y', 'z', '{', '|', '}', '~', 0177, /* xyz{|}~ */
- 0200, 0201, 0202, 0203, 0204, 0205, 0206, 0207,
- 0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217,
- 0220, 0221, 0222, 0223, 0224, 0225, 0226, 0227,
- 0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237,
- 0240, 0241, 0242, 0243, 0244, 0245, 0246, 0247,
- 0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257,
- 0260, 0261, 0262, 0263, 0264, 0265, 0266, 0267,
- 0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277,
- 0300, 0301, 0302, 0303, 0304, 0305, 0306, 0307,
- 0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317,
- 0320, 0321, 0322, 0323, 0324, 0325, 0326, 0327,
- 0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337,
- 0340, 0341, 0342, 0343, 0344, 0345, 0346, 0347,
- 0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357,
- 0360, 0361, 0362, 0363, 0364, 0365, 0366, 0367,
- 0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377,
- };
- #else /* !EBCDIC */
- /*
- * This is the EBCDIC table for handling escapes.
- */
- static unsigned char esctab[] = {
- 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
- 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
- 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
- 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
- 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
- 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
- 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
- 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
- ' ', 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
- 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
- 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
- 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
- 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
- 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
- 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
- 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
- 0x80, 'a', 0x16, 'c', 0x07, 0x27, 0x0c, 'g',
- 'h', 'i', 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
-
- #if EBCDIC == 2
- 0x90, 'j', 'k', 0x15, 'm', 0x15, 'o', 'p',
- #else /* EBCDIC == 2 */
- 0x90, 'j', 'k', 0x25, 'm', 0x15, 'o', 'p',
- #endif /* EBCDIC == 2 */
-
- 'q', 0x0d, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
- 0xa0, 0xa1, 's', 0x05, 'u', 0x0b, 'w', 'x',
- 'y', 'z', 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
- 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
- 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
- 0xc0, 'A', 0x16, 'C', 0x07, 0x27, 0x0c, 'G',
- 'H', 'I', 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
-
- #if EBCDIC == 2
- 0xd0, 'J', 'K', 0x15, 'M', 0x15, 'O', 'P',
- #else /* EBCDIC == 2 */
- 0xd0, 'J', 'K', 0x25, 'M', 0x15, 'O', 'P',
- #endif /* EBCDIC == 2 */
-
- 'Q', 0x0d, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
- 0xe0, 0xe1, 'S', 0x05, 'U', 0x0b, 'W', 'X',
- 'Y', 'Z', 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
- 0, 1, 2, 3, 4, 5, 6, 7,
- 8, 9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
- };
- #endif /* !EBCDIC */
-
- struct node tok_loc =
- {0, NULL, 0, 0}; /* "model" node containing location of current token */
-
- struct toktab dottok =
- {".", DOT, Beginner}; /* token struct for DOT token */
-
- struct str_buf lex_sbuf;
-
- /*
- * yylex - find the next token in the input stream, and return its token
- * type and value to the parser.
- *
- * Variables of interest:
- *
- * cc - character following last token.
- * nlflag - set if a newline was between the last token and the current token
- * lastend - set if the last token was an Ender.
- * lastval - when a semicolon is inserted and returned, lastval gets the
- * token value that would have been returned if the semicolon hadn't
- * been inserted.
- */
-
- static struct toktab *lasttok = NULL;
- static int lastend = 0;
- static int eofflag = 0;
- static int cc = '\n';
-
- int yylex()
- {
- register struct toktab *t;
- register int c;
- int nlflag;
- static nodeptr lastval;
- static struct node semi_loc;
-
- if (lasttok != NULL) {
- /*
- * A semicolon was inserted and returned on the last call to yylex,
- * instead of going to the input, return lasttok and set the
- * appropriate variables.
- */
-
- yylval = lastval;
- tok_loc = *lastval;
- t = lasttok;
- goto ret;
- }
- nlflag = 0;
- loop:
- c = cc;
- /*
- * Remember where a semicolon will go if we insert one.
- */
- semi_loc.n_file = tok_loc.n_file;
- semi_loc.n_line = in_line;
- if (cc == '\n')
- --semi_loc.n_line;
- semi_loc.n_col = incol;
- /*
- * Skip whitespace and comments and process #line directives.
- */
- while (c == Comment || isspace(c)) {
- if (c == '\n') {
- nlflag++;
- c = NextChar;
- if (c == Comment) {
- /*
- * Check for #line directive at start of line.
- */
- if (('l' == (c = NextChar)) &&
- ('i' == (c = NextChar)) &&
- ('n' == (c = NextChar)) &&
- ('e' == (c = NextChar))) {
- c = setlineno();
- while ((c == ' ') || (c == '\t'))
- c = NextChar;
- if (c != EOF && c != '\n')
- c = setfilenm(c);
- }
- while (c != EOF && c != '\n')
- c = NextChar;
- }
- }
- else {
- if (c == Comment) {
- while (c != EOF && c != '\n')
- c = NextChar;
- }
- else {
- c = NextChar;
- }
- }
- }
- /*
- * A token is the next thing in the input. Set token location to
- * the current line and column.
- */
- tok_loc.n_line = in_line;
- tok_loc.n_col = incol;
-
- if (c == EOF) {
- /*
- * End of file has been reached. Set eofflag, return T_Eof, and
- * set cc to EOF so that any subsequent scans also return T_Eof.
- */
- if (eofflag++) {
- eofflag = 0;
- cc = '\n';
- yylval = NULL;
- return 0;
- }
- cc = EOF;
- t = T_Eof;
- yylval = NULL;
- goto ret;
- }
-
- /*
- * Look at current input character to determine what class of token
- * is next and take the appropriate action. Note that the various
- * token gathering routines write a value into cc.
- */
- if (isalpha(c) || (c == '_')) { /* gather ident or reserved word */
- if ((t = getident(c, &cc)) == NULL)
- goto loop;
- }
- else if (isdigit(c) || (c == '.')) { /* gather numeric literal or "." */
- if ((t = getnum(c, &cc)) == NULL)
- goto loop;
- }
- else if (c == '"' || c == '\'') { /* gather string or cset literal */
- if ((t = getstring(c, &cc)) == NULL)
- goto loop;
- }
- else { /* gather longest legal operator */
- if ((t = getopr(c, &cc)) == NULL)
- goto loop;
- yylval = OpNode(t->t_type);
- }
- if (nlflag && lastend && (t->t_flags & Beginner)) {
- /*
- * A newline was encountered between the current token and the last,
- * the last token was an Ender, and the current token is a Beginner.
- * Return a semicolon and save the current token in lastval.
- */
- lastval = yylval;
- lasttok = t;
- tok_loc = semi_loc;
- yylval = OpNode(SEMICOL);
- return SEMICOL;
- }
- ret:
- /*
- * Clear lasttok, set lastend if the token being returned is an
- * Ender, and return the token.
- */
- lasttok = 0;
- lastend = t->t_flags & Ender;
- return (t->t_type);
- }
-
- #ifdef MultipleRuns
- /*
- * yylexinit - initialize variables for multiple runs
- */
- novalue yylexinit()
- {
- lasttok = NULL;
- lastend = 0;
- eofflag = 0;
- cc = '\n';
- }
-
- #endif /* MultipleRuns */
- /*
- * getident - gather an identifier beginning with ac. The character
- * following identifier goes in cc.
- */
-
- static struct toktab *getident(ac, cc)
- int ac;
- int *cc;
- {
- register int c;
- register struct toktab *t;
-
- c = ac;
- /*
- * Copy characters into string space until a non-alphanumeric character
- * is found.
- */
- do {
- AppChar(lex_sbuf, c);
- c = NextChar;
- } while (isalnum(c) || (c == '_'));
- *cc = c;
- /*
- * If the identifier is a reserved word, make a ResNode for it and return
- * the token value. Otherwise, install it with putid, make an
- * IdNode for it, and return.
- */
- if ((t = findres()) != NULL) {
- lex_sbuf.endimage = lex_sbuf.strtimage;
- yylval = ResNode(t->t_type);
- return t;
- }
- else {
- yylval = IdNode(str_install(&lex_sbuf));
- return (struct toktab *)T_Ident;
- }
- }
-
- /*
- * findres - if the string just copied into the string space by getident
- * is a reserved word, return a pointer to its entry in the token table.
- * Return NULL if the string isn't a reserved word.
- */
-
- static struct toktab *findres()
- {
- register struct toktab *t;
- register char c;
-
- c = *lex_sbuf.strtimage;
- if (!islower(c))
- return NULL;
- /*
- * Point t at first reserved word that starts with c (if any).
- */
- if ((t = restab[c - 'a']) == NULL)
- return NULL;
- /*
- * Search through reserved words, stopping when a match is found
- * or when the current reserved word doesn't start with c.
- */
- while (t->t_word[0] == c) {
- if (bufcmp(t->t_word))
- return t;
- t++;
- }
- return NULL;
- }
-
- /*
- * bufcmp - compare a null terminated string to what is in the string buffer.
- */
- static int bufcmp(s)
- char *s;
- {
- register char *s1;
- s1 = lex_sbuf.strtimage;
- while (s != '\0' && s1 < lex_sbuf.endimage && *s == *s1) {
- ++s;
- ++s1;
- }
- if (*s == '\0' && s1 == lex_sbuf.endimage)
- return 1;
- else
- return 0;
- }
-
- /*
- * getnum - gather a numeric literal starting with ac and put the
- * character following the literal into *cc.
- *
- * getnum also handles the "." operator, which is distinguished from
- * a numeric literal by what follows it.
- */
-
- static struct toktab *getnum(ac, cc)
- int ac;
- int *cc;
- {
- register int c, r, state;
- int realflag;
-
- c = ac;
- if (c == '.') {
- r = 0;
- state = 7;
- realflag = 1;
- }
- else {
- r = tonum(c);
- state = 0;
- realflag = 0;
- }
- for (;;) {
- AppChar(lex_sbuf, c);
- c = NextChar;
- switch (state) {
- case 0: /* integer part */
- if (isdigit(c)) { r = r * 10 + tonum(c); continue; }
- if (c == '.') { state = 1; realflag++; continue; }
- if (c == 'e' || c == 'E') { state = 2; realflag++; continue; }
- if (c == 'r' || c == 'R') {
- state = 5;
- if (r < 2 || r > 36)
- tfatal("invalid radix for integer literal", (char *)NULL);
- continue;
- }
- break;
- case 1: /* fractional part */
- if (isdigit(c)) continue;
- if (c == 'e' || c == 'E') { state = 2; continue; }
- break;
- case 2: /* optional exponent sign */
- if (c == '+' || c == '-') { state = 3; continue; }
- case 3: /* first digit after e, e+, or e- */
- if (isdigit(c)) { state = 4; continue; }
- tfatal("invalid real literal", (char *)NULL);
- break;
- case 4: /* remaining digits after e */
- if (isdigit(c)) continue;
- break;
- case 5: /* first digit after r */
- if ((isdigit(c) || isletter(c)) && tonum(c) < r)
- { state = 6; continue; }
- tfatal("invalid integer literal", (char *)NULL);
- break;
- case 6: /* remaining digits after r */
- if (isdigit(c) || isletter(c)) {
- if (tonum(c) >= r) { /* illegal digit for radix r */
- tfatal("invalid digit in integer literal", (char *)NULL);
- r = tonum('z'); /* prevent more messages */
- }
- continue;
- }
- break;
- case 7: /* token began with "." */
- if (isdigit(c)) {
- state = 1; /* followed by digit is a real const */
- continue;
- }
- *cc = c; /* anything else is just a dot */
- lex_sbuf.endimage--; /* remove dot (undo AppChar) */
- yylval = OpNode(DOT);
- return &dottok;
- }
- break;
- }
- *cc = c;
- if (realflag) {
- yylval = RealNode(str_install(&lex_sbuf));
- return T_Real;
- }
- yylval = IntNode(str_install(&lex_sbuf));
- return T_Int;
- }
-
- /*
- * getstring - gather a string literal starting with ac and place the
- * character following the literal in *cc.
- */
-
- static struct toktab *getstring(ac, cc)
- int ac;
- int *cc;
- {
- register int c, sc;
- int sav_indx;
- int len;
-
- sc = ac;
- sav_indx = -1;
- c = NextChar;
- while (c != sc && c != '\n' && c != EOF) {
- /*
- * If a '_' is the last non-white space before a new-line,
- * we must remember where it is.
- */
- if (c == '_')
- sav_indx = lex_sbuf.endimage - lex_sbuf.strtimage;
- else if (!isspace(c))
- sav_indx = -1;
-
- if (c == Escape) {
- c = NextChar;
- if (c == EOF)
- break;
-
- #ifdef VarTran
- AppChar(lex_sbuf, Escape);
- if (c == '^') {
- c = NextChar;
- if (c == EOF)
- break;
- AppChar(lex_sbuf, '^');
- }
- #else /* VarTran */
- if (isoctal(c))
- c = octesc(c);
- else if (c == 'x')
- c = hexesc();
- else if (c == '^')
- c = ctlesc();
- else
- c = esctab[c];
- #endif /* VarTran */
-
- }
- AppChar(lex_sbuf, c);
- c = NextChar;
-
- /*
- * If a '_' is the last non-white space before a new-line, the
- * string continues at the first non-white space on the next line
- * and everything from the '_' to the end of this line is ignored.
- */
- if (c == '\n' && sav_indx >= 0) {
- lex_sbuf.endimage = lex_sbuf.strtimage + sav_indx;
- while ((c = NextChar) != EOF && isspace(c))
- ;
- }
- }
- if (c == sc)
- *cc = ' ';
- else {
- tfatal("unclosed quote", (char *)NULL);
- *cc = c;
- }
- len = lex_sbuf.endimage - lex_sbuf.strtimage + 1; /* includes '\0' */
- if (ac == '"') { /* a string literal */
- yylval = StrNode(str_install(&lex_sbuf), len);
- return T_String;
- }
- else { /* a cset literal */
- yylval = CsetNode(str_install(&lex_sbuf), len);
- return T_Cset;
- }
- }
-
- #ifndef VarTran
-
- /*
- * ctlesc - translate a control escape -- backslash followed by
- * caret and one character.
- */
-
- static int ctlesc()
- {
- register int c;
-
- c = NextChar;
- if (c == EOF)
- return EOF;
-
- #if !EBCDIC
- return (c & 037);
- #else /* !EBCDIC */
- return ToEBCDIC[FromEBCDIC[c] & 037];
- /* ctrl-x in EBCDIC is the EBCDIC equivalent */
- /* to ASCII ctrl-x */
- #endif /* !EBCDIC */
-
- }
-
- /*
- * octesc - translate an octal escape -- backslash followed by
- * one, two, or three octal digits.
- */
-
- static int octesc(ac)
- int ac;
- {
- register int c, nc, i;
-
- c = 0;
- nc = ac;
- i = 1;
- do {
- c = (c << 3) | (nc - '0');
- nc = NextChar;
- if (nc == EOF)
- return EOF;
- } while (isoctal(nc) && i++ < 3);
- PushChar(nc);
-
- #if EBCDIC != 2
- return (c & 0377);
- #else /* EBCDIC != 2 */
- return ToEBCDIC[c & 0377];
- #endif /* EBCDIC != 2 */
- }
-
- /*
- * hexesc - translate a hexadecimal escape -- backslash-x
- * followed by one or two hexadecimal digits.
- */
-
- static int hexesc()
- {
- register int c, nc, i;
-
- c = 0;
- i = 0;
- while (i++ < 2) {
- nc = NextChar;
- if (nc == EOF)
- return EOF;
- if (nc >= 'a' && nc <= 'f')
- nc -= 'a' - 10;
- else if (nc >= 'A' && nc <= 'F')
- nc -= 'A' - 10;
- else if (isdigit(nc))
- nc -= '0';
- else {
- PushChar(nc);
- break;
- }
- c = (c << 4) | nc;
- }
-
- #if EBCDIC != 2
- return c;
- #else /* EBCDIC != 2 */
- return ToEBCDIC[c];
- #endif /* EBCDIC != 2 */
-
- }
-
- #endif /* VarTran */
-
- /*
- * getopr - find the longest legal operator and return a pointer
- * to its entry in the token table.
- */
-
- static struct toktab *getopr(ac, cc)
- int ac;
- int *cc;
- {
- register struct optab *state;
- register char c, i;
-
- state = state0;
- c = ac;
- for (;;) {
- while ((i = state->o_input) && c != i)
- state++;
- switch (state->o_action) {
- case A_Goto:
- state = (struct optab *) state->o_val;
- c = NextChar;
- continue;
- case A_Error:
- tfatal("invalid character", (char *)NULL);
- *cc = ' ';
- return NULL;
- case A_Return:
- *cc = c;
- return (struct toktab *)(state->o_val);
- case A_Immret:
- *cc = ' ';
- return (struct toktab *)(state->o_val);
- }
- }
- }
-
- /*
- * setlineno - set line number from #line comment, return following char.
- */
-
- static int setlineno()
- {
- register int c;
-
- while ((c = NextChar) == ' ' || c == '\t')
- ;
- if (c < '0' || c > '9') {
- tfatal("no line number in #line directive", "");
- while (c != EOF && c != '\n')
- c = NextChar;
- return c;
- }
- in_line = 0;
- while (c >= '0' && c <= '9') {
- in_line = in_line * 10 + (c - '0');
- c = NextChar;
- }
- return c;
- }
-
- /*
- * setfilenm - set file name from #line comment, return following char.
- */
-
- static int setfilenm(c)
- register int c;
- {
- while (c == ' ' || c == '\t')
- c = NextChar;
- if (c != '"') {
- tfatal("'\"' missing from file name in #line directive", "");
- while (c != EOF && c != '\n')
- c = NextChar;
- return c;
- }
- while ((c = NextChar) != '"' && c != EOF && c != '\n')
- AppChar(lex_sbuf, c);
- if (c == '"') {
- tok_loc.n_file = str_install(&lex_sbuf);
- return NextChar;
- }
- else {
- tfatal("'\"' missing from file name in #line directive", "");
- return c;
- }
- }
-
- /*
- * nextchar - return the next character in the input.
- */
-
- static int nextchar()
- {
- register int c;
-
- #if MACINTOSH
- #if MPW
- {
- static short cursorcount = CURSORINTERVAL;
- if (--cursorcount == 0) {
- RotateCursor(0);
- cursorcount = CURSORINTERVAL;
- }
- }
- #endif /* MPW */
- #endif /* MACINTOSH */
-
- if (c = peekc) {
- peekc = 0;
- return c;
- }
- c = getc(srcfile);
- switch (c) {
- case EOF:
- if (incol) {
- c = '\n';
- in_line++;
- incol = 0;
- peekc = EOF;
- break;
- }
- else {
- in_line = 0;
- incol = 0;
- break;
- }
- case '\n':
- in_line++;
- incol = 0;
- break;
- case '\t':
- incol = (incol | 7) + 1;
- break;
- case '\b':
- if (incol)
- incol--;
- break;
- default:
- incol++;
- }
- return c;
- }
-