Celestin Apprentice 2

home *** CD-ROM | disk | FTP | other *** search

/ Celestin Apprentice 2 / Apprentice-Release2.iso / Tools / Languages / Icon 8.1 / msm-1 / icont.sit / tlex.c < prev next >

Wrap

C/C++ Source or Header | 1992-09-19 | 20.4 KB | 801 lines | [TEXT/MPS ]

/*
 * tlex.c -- the lexical analyzer.
 *
 * yylex() is the entry point; it returns one token per call and
 *  delegates to the static helpers declared below to gather identifiers,
 *  numeric literals, string/cset literals, and operators.
 */

#include "::h:gsupport.h"
#include "tproto.h"
#include "trans.h"
#include "tlex.h"
#include "tree.h"
#include "token.h"
#include <ctype.h>

#if MACINTOSH
#if MPW
#include <CursorCtl.h>
#define CURSORINTERVAL 100	/* nextchar() calls between cursor rotations */
#endif					/* MPW */
#endif					/* MACINTOSH */

/*
 * Prototypes.
 */

hidden int bufcmp Params((char *s));
hidden int ctlesc Params((noargs));
hidden struct toktab *findres Params((noargs));
hidden struct toktab *getident Params((int ac,int *cc));
hidden struct toktab *getnum Params((int ac,int *cc));
hidden struct toktab *getopr Params((int ac,int *cc));
hidden struct toktab *getstring Params((int ac,int *cc));
hidden int hexesc Params((noargs));
hidden int nextchar Params((noargs));
hidden int octesc Params((int ac));
hidden int setfilenm Params((int c));
hidden int setlineno Params((noargs));

/*
 * isletter - nonzero if s is an upper- or lower-case letter.
 *  The macro tests its own argument rather than whatever variable
 *  named c happens to be in scope at the point of use.
 */
#define isletter(s) (isupper(s) || islower(s))

#if EBCDIC
extern char ToEBCDIC[256], FromEBCDIC[256];
#endif					/* EBCDIC */

#if !EBCDIC
/*
 * tonum - numeric value of a digit character in a radix literal:
 *  '0'-'9' give 0-9; for a letter the low five bits plus 9 are used,
 *  so 'a'/'A' give 10 and 'z'/'Z' give 35 (ASCII only).
 */
#define tonum(c) (isdigit(c) ? ((c) - '0') : (((c) & 037) + 9))

/*
 * esctab - translates single-character escapes in string literals.
 *  Indexed by the character that follows the backslash.  Most entries
 *  map a character to itself; the digit positions hold 0-9 and the
 *  positions for b, d, e, f, l, n, r, t, and v (either case) hold the
 *  corresponding control codes.
 */

static unsigned char esctab[] = {
   000,   001,   002,   003,   004,   005,   006,   007,   /* NUL-BEL */
   010,   011,   012,   013,   014,   015,   016,   017,   /* BS -SI */
   020,   021,   022,   023,   024,   025,   026,   027,   /* DLE-ETB */
   030,   031,   032,   033,   034,   035,   036,   037,   /* CAN-US */
   ' ',   '!',   '"',   '#',   '$',   '%',   '&',   '\'',  /*  !"#$%&' */
   '(',   ')',   '*',   '+',   ',',   '-',   '.',   '/',   /* ()*+,-./ */
   000,   001,   002,   003,   004,   005,   006,   007,   /* 01234567 */
   010,   011,   ':',   ';',   '<',   '=',   '>',   '?',   /* 89:;<=>? */
   '@',   'A',   '\b',  'C',   0177,  033,   014,   'G',   /* @ABCDEFG */
   'H',   'I',   'J',   'K',   '\n',  'M',   '\n',  'O',   /* HIJKLMNO */
   'P',   'Q',   '\r',  'S',   '\t',  'U',   013,   'W',   /* PQRSTUVW */
   'X',   'Y',   'Z',   '[',   '\\',  ']',   '^',   '_',   /* XYZ[.]^_ */
   '`',   'a',   '\b',  'c',   0177,  033,   014,   'g',   /* `abcdefg */
   'h',   'i',   'j',   'k',   '\n',  'm',   '\n',  'o',   /* hijklmno */
   'p',   'q',   '\r',  's',   '\t',  'u',   013,   'w',   /* pqrstuvw */
   'x',   'y',   'z',   '{',   '|',   '}',   '~',   0177,  /* xyz{|}~ */
   0200,  0201,  0202,  0203,  0204,  0205,  0206,  0207,
   0210,  0211,  0212,  0213,  0214,  0215,  0216,  0217,
   0220,  0221,  0222,  0223,  0224,  0225,  0226,  0227,
   0230,  0231,  0232,  0233,  0234,  0235,  0236,  0237,
   0240,  0241,  0242,  0243,  0244,  0245,  0246,  0247,
   0250,  0251,  0252,  0253,  0254,  0255,  0256,  0257,
   0260,  0261,  0262,  0263,  0264,  0265,  0266,  0267,
   0270,  0271,  0272,  0273,  0274,  0275,  0276,  0277,
   0300,  0301,  0302,  0303,  0304,  0305,  0306,  0307,
   0310,  0311,  0312,  0313,  0314,  0315,  0316,  0317,
   0320,  0321,  0322,  0323,  0324,  0325,  0326,  0327,
   0330,  0331,  0332,  0333,  0334,  0335,  0336,  0337,
   0340,  0341,  0342,  0343,  0344,  0345,  0346,  0347,
   0350,  0351,  0352,  0353,  0354,  0355,  0356,  0357,
   0360,  0361,  0362,  0363,  0364,  0365,  0366,  0367,
   0370,  0371,  0372,  0373,  0374,  0375,  0376,  0377,
   };

#else					/* !EBCDIC */

/*
 * This is the EBCDIC table for handling escapes.
*/
/*
 * Like the ASCII table in the other branch of this conditional, the
 *  table is indexed by the character code that follows a backslash.
 *  Most entries map to themselves; the two rows that differ between
 *  EBCDIC variants are selected with "#if EBCDIC == 2".
 */
static unsigned char esctab[] = {
   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
   0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
   0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
   0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
   0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
   0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
   0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
   0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
   ' ',  0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
   0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
   0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
   0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
   0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
   0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
   0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
   0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
   /* 0x80 row: lower-case letter positions, some replaced by control codes */
   0x80, 'a',  0x16, 'c',  0x07, 0x27, 0x0c, 'g',
   'h',  'i',  0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
#if EBCDIC == 2
   0x90, 'j',  'k',  0x15, 'm',  0x15, 'o',  'p',
#else					/* EBCDIC == 2 */
   0x90, 'j',  'k',  0x25, 'm',  0x15, 'o',  'p',
#endif					/* EBCDIC == 2 */
   'q',  0x0d, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
   0xa0, 0xa1, 's',  0x05, 'u',  0x0b, 'w',  'x',
   'y',  'z',  0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
   0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
   0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
   /* 0xc0 row: upper-case letter positions, same substitutions as above */
   0xc0, 'A',  0x16, 'C',  0x07, 0x27, 0x0c, 'G',
   'H',  'I',  0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
#if EBCDIC == 2
   0xd0, 'J',  'K',  0x15, 'M',  0x15, 'O',  'P',
#else					/* EBCDIC == 2 */
   0xd0, 'J',  'K',  0x25, 'M',  0x15, 'O',  'P',
#endif					/* EBCDIC == 2 */
   'Q',  0x0d, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
   0xe0, 0xe1, 'S',  0x05, 'U',  0x0b, 'W',  'X',
   'Y',  'Z',  0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
   /* digit positions hold the digit values 0-9 */
   0, 1, 2, 3, 4, 5, 6, 7,
   8, 9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
   };
#endif					/* !EBCDIC */

struct node tok_loc =
   {0, NULL, 0, 0};	/* "model" node containing location of current token */
struct toktab dottok =
   {".", DOT, Beginner};	/* token struct for DOT token */
struct str_buf lex_sbuf;	/* string buffer in which token images are gathered */

/*
 * yylex - find the next token in the input stream,
and return its token * type and value to the parser. * * Variables of interest: * * cc - character following last token. * nlflag - set if a newline was between the last token and the current token * lastend - set if the last token was an Ender. * lastval - when a semicolon is inserted and returned, lastval gets the * token value that would have been returned if the semicolon hadn't * been inserted. */ static struct toktab *lasttok = NULL; static int lastend = 0; static int eofflag = 0; static int cc = '\n'; int yylex() { register struct toktab *t; register int c; int nlflag; static nodeptr lastval; static struct node semi_loc; if (lasttok != NULL) { /* * A semicolon was inserted and returned on the last call to yylex, * instead of going to the input, return lasttok and set the * appropriate variables. */ yylval = lastval; tok_loc = *lastval; t = lasttok; goto ret; } nlflag = 0; loop: c = cc; /* * Remember where a semicolon will go if we insert one. */ semi_loc.n_file = tok_loc.n_file; semi_loc.n_line = in_line; if (cc == '\n') --semi_loc.n_line; semi_loc.n_col = incol; /* * Skip whitespace and comments and process #line directives. */ while (c == Comment || isspace(c)) { if (c == '\n') { nlflag++; c = NextChar; if (c == Comment) { /* * Check for #line directive at start of line. */ if (('l' == (c = NextChar)) && ('i' == (c = NextChar)) && ('n' == (c = NextChar)) && ('e' == (c = NextChar))) { c = setlineno(); while ((c == ' ') || (c == '\t')) c = NextChar; if (c != EOF && c != '\n') c = setfilenm(c); } while (c != EOF && c != '\n') c = NextChar; } } else { if (c == Comment) { while (c != EOF && c != '\n') c = NextChar; } else { c = NextChar; } } } /* * A token is the next thing in the input. Set token location to * the current line and column. */ tok_loc.n_line = in_line; tok_loc.n_col = incol; if (c == EOF) { /* * End of file has been reached. Set eofflag, return T_Eof, and * set cc to EOF so that any subsequent scans also return T_Eof. 
*/ if (eofflag++) { eofflag = 0; cc = '\n'; yylval = NULL; return 0; } cc = EOF; t = T_Eof; yylval = NULL; goto ret; } /* * Look at current input character to determine what class of token * is next and take the appropriate action. Note that the various * token gathering routines write a value into cc. */ if (isalpha(c) || (c == '_')) { /* gather ident or reserved word */ if ((t = getident(c, &cc)) == NULL) goto loop; } else if (isdigit(c) || (c == '.')) { /* gather numeric literal or "." */ if ((t = getnum(c, &cc)) == NULL) goto loop; } else if (c == '"' || c == '\'') { /* gather string or cset literal */ if ((t = getstring(c, &cc)) == NULL) goto loop; } else { /* gather longest legal operator */ if ((t = getopr(c, &cc)) == NULL) goto loop; yylval = OpNode(t->t_type); } if (nlflag && lastend && (t->t_flags & Beginner)) { /* * A newline was encountered between the current token and the last, * the last token was an Ender, and the current token is a Beginner. * Return a semicolon and save the current token in lastval. */ lastval = yylval; lasttok = t; tok_loc = semi_loc; yylval = OpNode(SEMICOL); return SEMICOL; } ret: /* * Clear lasttok, set lastend if the token being returned is an * Ender, and return the token. */ lasttok = 0; lastend = t->t_flags & Ender; return (t->t_type); } #ifdef MultipleRuns /* * yylexinit - initialize variables for multiple runs */ novalue yylexinit() { lasttok = NULL; lastend = 0; eofflag = 0; cc = '\n'; } #endif /* MultipleRuns */ /* * getident - gather an identifier beginning with ac. The character * following identifier goes in cc. */ static struct toktab *getident(ac, cc) int ac; int *cc; { register int c; register struct toktab *t; c = ac; /* * Copy characters into string space until a non-alphanumeric character * is found. */ do { AppChar(lex_sbuf, c); c = NextChar; } while (isalnum(c) || (c == '_')); *cc = c; /* * If the identifier is a reserved word, make a ResNode for it and return * the token value. 
Otherwise, install it with putid, make an * IdNode for it, and return. */ if ((t = findres()) != NULL) { lex_sbuf.endimage = lex_sbuf.strtimage; yylval = ResNode(t->t_type); return t; } else { yylval = IdNode(str_install(&lex_sbuf)); return (struct toktab *)T_Ident; } } /* * findres - if the string just copied into the string space by getident * is a reserved word, return a pointer to its entry in the token table. * Return NULL if the string isn't a reserved word. */ static struct toktab *findres() { register struct toktab *t; register char c; c = *lex_sbuf.strtimage; if (!islower(c)) return NULL; /* * Point t at first reserved word that starts with c (if any). */ if ((t = restab[c - 'a']) == NULL) return NULL; /* * Search through reserved words, stopping when a match is found * or when the current reserved word doesn't start with c. */ while (t->t_word[0] == c) { if (bufcmp(t->t_word)) return t; t++; } return NULL; } /* * bufcmp - compare a null terminated string to what is in the string buffer. */ static int bufcmp(s) char *s; { register char *s1; s1 = lex_sbuf.strtimage; while (s != '\0' && s1 < lex_sbuf.endimage && *s == *s1) { ++s; ++s1; } if (*s == '\0' && s1 == lex_sbuf.endimage) return 1; else return 0; } /* * getnum - gather a numeric literal starting with ac and put the * character following the literal into *cc. * * getnum also handles the "." operator, which is distinguished from * a numeric literal by what follows it. 
*/
/*
 * The literal is recognized by a small state machine:
 *
 *  0 - integer part              4 - remaining exponent digits
 *  1 - fractional part           5 - first digit after 'r'
 *  2 - optional exponent sign    6 - remaining digits after 'r'
 *  3 - first exponent digit      7 - token began with "."
 *
 * Inside the switch, "continue" consumes the next character and
 *  "break" ends the literal.  Returns T_Real or T_Int with yylval set
 *  to the corresponding node, or &dottok for a lone ".".
 */
static struct toktab *getnum(ac, cc)
int ac;
int *cc;
{
    register int c, r, state;
    int realflag;

    c = ac;
    if (c == '.') {
        r = 0;
        state = 7;
        realflag = 1;
    }
    else {
        r = tonum(c);
        state = 0;
        realflag = 0;
    }

    for (;;) {
        AppChar(lex_sbuf, c);
        c = NextChar;
        switch (state) {
        case 0:		/* integer part */
            if (isdigit(c)) {
                /*
                 * r only matters as a radix, which must lie in 2..36.
                 *  Once it exceeds 36 it can never become valid again,
                 *  so stop accumulating; this keeps a long digit string
                 *  from overflowing a signed int.
                 */
                if (r <= 36)
                    r = r * 10 + tonum(c);
                continue;
            }
            if (c == '.') {
                state = 1;
                realflag++;
                continue;
            }
            if (c == 'e' || c == 'E') {
                state = 2;
                realflag++;
                continue;
            }
            if (c == 'r' || c == 'R') {
                state = 5;
                if (r < 2 || r > 36)
                    tfatal("invalid radix for integer literal", (char *)NULL);
                continue;
            }
            break;
        case 1:		/* fractional part */
            if (isdigit(c))
                continue;
            if (c == 'e' || c == 'E') {
                state = 2;
                continue;
            }
            break;
        case 2:		/* optional exponent sign */
            if (c == '+' || c == '-') {
                state = 3;
                continue;
            }
            /* FALLTHROUGH -- no sign, so c must be the first exponent digit */
        case 3:		/* first digit after e, e+, or e- */
            if (isdigit(c)) {
                state = 4;
                continue;
            }
            tfatal("invalid real literal", (char *)NULL);
            break;
        case 4:		/* remaining digits after e */
            if (isdigit(c))
                continue;
            break;
        case 5:		/* first digit after r */
            if ((isdigit(c) || isletter(c)) && tonum(c) < r) {
                state = 6;
                continue;
            }
            tfatal("invalid integer literal", (char *)NULL);
            break;
        case 6:		/* remaining digits after r */
            if (isdigit(c) || isletter(c)) {
                if (tonum(c) >= r) {	/* illegal digit for radix r */
                    tfatal("invalid digit in integer literal", (char *)NULL);
                    r = tonum('z');	/* prevent more messages */
                }
                continue;
            }
            break;
        case 7:		/* token began with "." */
            if (isdigit(c)) {
                state = 1;	/* followed by digit is a real const */
                continue;
            }
            *cc = c;		/* anything else is just a dot */
            lex_sbuf.endimage--;	/* remove dot (undo AppChar) */
            yylval = OpNode(DOT);
            return &dottok;
        }
        break;
    }

    *cc = c;
    if (realflag) {
        yylval = RealNode(str_install(&lex_sbuf));
        return T_Real;
    }
    yylval = IntNode(str_install(&lex_sbuf));
    return T_Int;
}

/*
 * getstring - gather a string literal starting with ac and place the
 *  character following the literal in *cc.
*/ static struct toktab *getstring(ac, cc) int ac; int *cc; { register int c, sc; int sav_indx; int len; sc = ac; sav_indx = -1; c = NextChar; while (c != sc && c != '\n' && c != EOF) { /* * If a '_' is the last non-white space before a new-line, * we must remember where it is. */ if (c == '_') sav_indx = lex_sbuf.endimage - lex_sbuf.strtimage; else if (!isspace(c)) sav_indx = -1; if (c == Escape) { c = NextChar; if (c == EOF) break; #ifdef VarTran AppChar(lex_sbuf, Escape); if (c == '^') { c = NextChar; if (c == EOF) break; AppChar(lex_sbuf, '^'); } #else /* VarTran */ if (isoctal(c)) c = octesc(c); else if (c == 'x') c = hexesc(); else if (c == '^') c = ctlesc(); else c = esctab[c]; #endif /* VarTran */ } AppChar(lex_sbuf, c); c = NextChar; /* * If a '_' is the last non-white space before a new-line, the * string continues at the first non-white space on the next line * and everything from the '_' to the end of this line is ignored. */ if (c == '\n' && sav_indx >= 0) { lex_sbuf.endimage = lex_sbuf.strtimage + sav_indx; while ((c = NextChar) != EOF && isspace(c)) ; } } if (c == sc) *cc = ' '; else { tfatal("unclosed quote", (char *)NULL); *cc = c; } len = lex_sbuf.endimage - lex_sbuf.strtimage + 1; /* includes '\0' */ if (ac == '"') { /* a string literal */ yylval = StrNode(str_install(&lex_sbuf), len); return T_String; } else { /* a cset literal */ yylval = CsetNode(str_install(&lex_sbuf), len); return T_Cset; } } #ifndef VarTran /* * ctlesc - translate a control escape -- backslash followed by * caret and one character. */ static int ctlesc() { register int c; c = NextChar; if (c == EOF) return EOF; #if !EBCDIC return (c & 037); #else /* !EBCDIC */ return ToEBCDIC[FromEBCDIC[c] & 037]; /* ctrl-x in EBCDIC is the EBCDIC equivalent */ /* to ASCII ctrl-x */ #endif /* !EBCDIC */ } /* * octesc - translate an octal escape -- backslash followed by * one, two, or three octal digits. 
*/ static int octesc(ac) int ac; { register int c, nc, i; c = 0; nc = ac; i = 1; do { c = (c << 3) | (nc - '0'); nc = NextChar; if (nc == EOF) return EOF; } while (isoctal(nc) && i++ < 3); PushChar(nc); #if EBCDIC != 2 return (c & 0377); #else /* EBCDIC != 2 */ return ToEBCDIC[c & 0377]; #endif /* EBCDIC != 2 */ } /* * hexesc - translate a hexadecimal escape -- backslash-x * followed by one or two hexadecimal digits. */ static int hexesc() { register int c, nc, i; c = 0; i = 0; while (i++ < 2) { nc = NextChar; if (nc == EOF) return EOF; if (nc >= 'a' && nc <= 'f') nc -= 'a' - 10; else if (nc >= 'A' && nc <= 'F') nc -= 'A' - 10; else if (isdigit(nc)) nc -= '0'; else { PushChar(nc); break; } c = (c << 4) | nc; } #if EBCDIC != 2 return c; #else /* EBCDIC != 2 */ return ToEBCDIC[c]; #endif /* EBCDIC != 2 */ } #endif /* VarTran */ /* * getopr - find the longest legal operator and return a pointer * to its entry in the token table. */ static struct toktab *getopr(ac, cc) int ac; int *cc; { register struct optab *state; register char c, i; state = state0; c = ac; for (;;) { while ((i = state->o_input) && c != i) state++; switch (state->o_action) { case A_Goto: state = (struct optab *) state->o_val; c = NextChar; continue; case A_Error: tfatal("invalid character", (char *)NULL); *cc = ' '; return NULL; case A_Return: *cc = c; return (struct toktab *)(state->o_val); case A_Immret: *cc = ' '; return (struct toktab *)(state->o_val); } } } /* * setlineno - set line number from #line comment, return following char. */ static int setlineno() { register int c; while ((c = NextChar) == ' ' || c == '\t') ; if (c < '0' || c > '9') { tfatal("no line number in #line directive", ""); while (c != EOF && c != '\n') c = NextChar; return c; } in_line = 0; while (c >= '0' && c <= '9') { in_line = in_line * 10 + (c - '0'); c = NextChar; } return c; } /* * setfilenm - set file name from #line comment, return following char. 
*/ static int setfilenm(c) register int c; { while (c == ' ' || c == '\t') c = NextChar; if (c != '"') { tfatal("'\"' missing from file name in #line directive", ""); while (c != EOF && c != '\n') c = NextChar; return c; } while ((c = NextChar) != '"' && c != EOF && c != '\n') AppChar(lex_sbuf, c); if (c == '"') { tok_loc.n_file = str_install(&lex_sbuf); return NextChar; } else { tfatal("'\"' missing from file name in #line directive", ""); return c; } } /* * nextchar - return the next character in the input. */ static int nextchar() { register int c; #if MACINTOSH #if MPW { static short cursorcount = CURSORINTERVAL; if (--cursorcount == 0) { RotateCursor(0); cursorcount = CURSORINTERVAL; } } #endif /* MPW */ #endif /* MACINTOSH */ if (c = peekc) { peekc = 0; return c; } c = getc(srcfile); switch (c) { case EOF: if (incol) { c = '\n'; in_line++; incol = 0; peekc = EOF; break; } else { in_line = 0; incol = 0; break; } case '\n': in_line++; incol = 0; break; case '\t': incol = (incol | 7) + 1; break; case '\b': if (incol) incol--; break; default: incol++; } return c; }