home *** CD-ROM | disk | FTP | other *** search
- /*
- EPSHeader
-
- File: asmtag.c
- Author: J. Kercheval
- Created: Sun, 07/14/1991 17:25:26
- */
- /*
- EPSRevision History
-
- J. Kercheval Sun, 07/14/1991 20:25:59 creation
- J. Kercheval Mon, 07/15/1991 22:47:30 finish finite state machine parser
- J. Kercheval Wed, 07/17/1991 21:35:43 add IsMember() and get_token()
- J. Kercheval Thu, 07/18/1991 19:57:34 add flags checking
- J. Kercheval Sun, 07/21/1991 15:58:56 add comment block support
- J. Kercheval Sat, 07/27/1991 21:16:53 remove public post process support
- J. Kercheval Sat, 07/27/1991 22:50:49 performance considerations (+10%)
- J. Kercheval Sat, 08/10/1991 18:14:46 Speed up IsMember()
- J. Kercheval Fri, 09/13/1991 01:17:05 add when_loading() to remap def_srch_case_map[]
- J. Kercheval Thu, 10/03/1991 12:27:37 fix logic outputting local labels
- J. Kercheval Sat, 10/05/1991 14:06:33 add ASMTagWant defines
- */
-
- /*
- * This file implements tagging for .ASM and .INC files which contain 80x86
- * assembler in the MASM/TASM syntax. This file defines no new commands and
- * is intended to work with the tags package included with V5.0 of Epsilon.
- * There is no problem using modified tags packages providing calls are made
- * to tags_suffix_???() routines in the same way Epsilon does this and that
- * an output routine add_tag() is used. All that should be required is to
- * compile and load this file and this module will be used transparently to
- * you.
- *
- * This module implements tagging for UNION, STRUC, MACRO, PROC, LABEL
- * keywords as well as for implicit labels (label:) and for data definitions
- * (i.e. equ, =, dq, dw, db, etc.). The performance cost on a per tag
- * basis is negligible, but since more tagging is done, you should expect a
- * practical 10%-20% performance hit on a per file basis. This tagger is not
- * intended to do all of your work for you but is designed to be used in
- * conjunction with the tags generator I have developed and is now available.
- * This file implements the same semantic parser as is found in that
- * executable. Use the executable in your make file for very fast and
- * updated tags. If you have problems finding it, contact me and I can point
- * the way...
- *
- * There is defined at the end of this module a when_loading() function which
- * alters the default search case map to allow *correct* (or at least
- * consistent) sorting with sort routines external to Epsilon. In particular,
- * to produce the same sort order as any UNIX, VMS or HP style sort, or as
- * the tags generator this module is supposed to coexist with, this mapping
- * must be done. You should see no difference in the location of sorted
- * buffers except for lines starting with ^, [, \, ] and _.
- *
- * This code is dedicated to the public domain with the caveat that Lugaru is
- * welcome to use this within their distribution source code which is
- * supplied with Epsilon.
- *
- * Good Tagging,
- *
- * jbk@wrq.com
- *
- * John Kercheval
- * 127 NW Bowdoin Pl #105
- * Seattle, WA 98107-4960
- * August 10, 1991
- */
-
- #include <eel.h>
-
/* Supply a boolean vocabulary only when BOOLEAN is not already defined. */
#ifndef BOOLEAN
#define BOOLEAN int
#define TRUE    1
#define FALSE   0
#endif
-
/*
 * Tag selection switches, one per kind of symbol the tagger can emit.
 * Leave a switch TRUE to have that kind tagged; make it FALSE to have the
 * tagger pass over it.
 */
#define ASMTagWantProc      TRUE
#define ASMTagWantMacro     TRUE
#define ASMTagWantLabel     TRUE
#define ASMTagWantStruc     TRUE
#define ASMTagWantUnion     TRUE
#define ASMTagWantDefine    TRUE
-
- /*
- * The finite state machine allows the following interesting paths
- *
- * 1 - Discard, Parse1, Symbol1
- * 2 - Discard, Parse1, Parse2, Symbol2
- * 3 - Discard, Parse1, Parse2, Define
- *
- * all the important cases follow one of these paths according to MASM/TASM
- * syntax. The exit state is for finish up routine calls and some paths not
- * covered here are simple error paths and probably result from syntax errors
- *
- * enum state { Discard, Parse1, Parse2, Symbol1, Symbol2, Define, Exit };
- */
/*
 * Parser states, spelled as integer macros and carried in the int typedef
 * State so that they can stand in for an enumerated type.
 */
#define Discard     0       /* fetch the next input line */
#define Parse1      1       /* examine the first token of the line */
#define Parse2      2       /* examine the second token of the line */
#define Symbol1     3       /* keyword came first; the name follows it */
#define Symbol2     4       /* name came first; a keyword followed it */
#define Define      5       /* name came first; a defining token followed */
#define Exit        6       /* restore the editor state and return */

typedef int State;

/* a token starting with this character ends the useful part of a line */
#define COMMENT_CHAR ';'

/* width, terminator included, of every row in the symbol tables */
#define SYMBOL_SIZE 15
-
-
- /*----------------------------------------------------------------------------
- *
- * The symbol lists represent all the symbols we are interested in either
- * obtaining or ignoring. The first element of each of these symbol lists is
- * a string containing all the first characters within the symbol list. This
- * allows faster rejection for IsMember() which is called often.
- *
- ---------------------------------------------------------------------------*/
-
- /* symbols which are not significant for this parser */
- char ASM_NOP_Sym[][SYMBOL_SIZE] =
- {
- "cpbfnwo", /* list of starting characters of symbols
- * below */
- "c", /* C language declaration */
- "pascal", /* PASCAL language declaration */
- "basic", /* BASIC language declaration */
- "fortran", /* FORTRAN language declaration */
- "prolog", /* PROLOG language declaration */
- "nolanguage", /* generic language declaration */
- "windows", /* WINDOWS exit and entry modifier */
- "oddnear", /* overlay modifier */
- "oddfar", /* overlay modifier */
- "normal", /* normal procedure entry/exit code */
- "\0"
- };
-
- /* symbols which begin a comment block */
- char ASM_comment_block[][SYMBOL_SIZE] =
- {
- "c", /* list of starting characters of symbols
- * below */
- "comment", /* begin comment block, next character is
- * delimiter */
- "\0"
- };
-
-
- /* create the function for determining if a character is a delimiter */
- #define IsDelim(c) ( _ASM_delim_boolean_table[c] )
-
- /* the indexed table for white space character lookup */
- BOOLEAN _ASM_delim_boolean_table[256];
-
- /* valid delimiters for this syntax */
- char ASM_delim[] = " \t\n;:=.,\"()<>[]*-+/";
-
-
- /* create the function for determining if a character is a whitespace */
- #define IsWhite(c) ( _ASM_white_boolean_table[c] )
-
- /* the indexed table for white space character lookup */
- BOOLEAN _ASM_white_boolean_table[256];
-
- /* whitespace characters */
- char ASM_white[] = " \t\v\f";
-
-
/*
 * Characters that are delimiters and also tokens in their own right.  They
 * count as tokens only when found at the beginning of a run of one or more
 * delimiters; ASM_get_token() then returns the single character as the
 * token.
 */
char ASM_delim_Sym[] = "=:";
-
- /* symbols which fit into the Define state and represent a tagged symbol */
- /* state Define depends on the token ":" being at index 1 in this list */
- char ASM_def[][SYMBOL_SIZE] =
- {
- ":e=cd", /* list of starting characters of symbols
- * below */
- ":", /* local labels */
- "equ", /* equivalence */
- "=", /* equivalence */
- "catstr", /* concatenated and named strings */
- "db", /* named byte data definition */
- "dw", /* named word data definition */
- "dd", /* named double word data definition */
- "dp", /* named 6 byte far pointer data area
- * definition */
- "df", /* named 6 byte far pointer definition */
- "dq", /* named quad word data definition */
- "dt", /* named 10 byte data area */
- "\0"
- };
-
- /* symbols which fit into the Symbol state and represent a tagged symbol */
- char ASM_sym[][SYMBOL_SIZE] =
- {
- "pmlsu", /* list of starting character of symbols
- * below */
- "proc", /* procedures */
- "macro", /* macros */
- "label", /* local labels */
- "struc", /* structures */
- "union", /* unions */
- "\0"
- };
-
-
- /*----------------------------------------------------------------------------
- *
- * ASMParserInit() initializes the tables required by the parser The tables
- * used are a simple boolean index which are true if the character
- * corresponding to the index is a member of the associated table.
- *
- ---------------------------------------------------------------------------*/
-
- ASMParserInit()
- {
- char *s;
- int i;
-
- /* init the entire block to FALSE */
- for (i = 0; i < 256; i++) {
- _ASM_delim_boolean_table[i] = FALSE;
- _ASM_white_boolean_table[i] = FALSE;
- }
-
- /* set the characters in the delim set to TRUE */
- for (s = ASM_delim; *s; s++) {
- _ASM_delim_boolean_table[*s] = TRUE;
- }
-
- /* NULL is also a delimiter */
- _ASM_delim_boolean_table['\0'] = TRUE;
-
- /* set the characters in the white set to TRUE */
- for (s = ASM_white; *s; s++) {
- _ASM_white_boolean_table[*s] = TRUE;
- }
- }
-
-
/*----------------------------------------------------------------------------
 *
 * strchr() returns a pointer to the first occurrence of the character c in
 * the string s, or NULL when c does not occur.  The terminating NUL counts
 * as part of the string, so searching for '\0' returns a pointer to the
 * terminator.
 *
 ---------------------------------------------------------------------------*/

char *strchr(s, c)
char *s;
char c;
{
    char *p;

    for (p = s; ; p++) {

        /* tested first so that a search for NUL finds the terminator */
        if (*p == c)
            return p;

        /* end of string reached without a match */
        if (!*p)
            return NULL;
    }
}
-
-
- /*----------------------------------------------------------------------------
- *
- * ASMSymbolWanted() returns true if the index into the sym token list is one
- * of the wanted symbols according to the ASMTagWant defines. The indexes
- * belong with the following ASMTagWant defines:
- *
- * Flag Symbol Index
- * --------------- ------- -----
- * ASMTagWantProc "proc" 1
- * ASMTagWantMacro "macro" 2
- * ASMTagWantLabel "label" 3
- * ASMTagWantStruc "struc" 4
- * ASMTagWantUnion "union" 5
- *
- ---------------------------------------------------------------------------*/
-
- BOOLEAN ASMSymbolWanted(index)
- int index;
- {
- /* return true if the associated flag is true */
- switch (index) {
- case 1:
- return ASMTagWantProc;
- break;
- case 2:
- return ASMTagWantMacro;
- break;
- case 3:
- return ASMTagWantLabel;
- break;
- case 4:
- return ASMTagWantStruc;
- break;
- case 5:
- return ASMTagWantUnion;
- break;
- default:
- return FALSE;
- break;
- }
- }
-
-
- /*----------------------------------------------------------------------------
- *
- * ASMIsMember() takes the token passed and check for membership in the null
- * terminated array, tokenlist, and return TRUE if a member and FALSE
- * otherwise, index is the index into the token list of the symbol if return
- * value is TRUE
- *
- ---------------------------------------------------------------------------*/
-
- BOOLEAN ASMIsMember(token_list, token, index)
- char token_list[][SYMBOL_SIZE];
- char *token;
- int *index;
- {
- int old_case_fold = case_fold;
-
- /* use non case sensitive string compare */
- case_fold = 1;
-
- /* look for dirty rejection */
- if (!strchr(token_list[0], tolower(token[0])))
- return FALSE;
-
- /* march through array until membership is determined */
- for (*index = 1; *token_list[*index]; (*index)++) {
-
- /* return true if token found */
- if (!strfcmp(token, token_list[*index])) {
- case_fold = old_case_fold;
- return TRUE;
- }
- }
-
- /* did not find it */
- case_fold = old_case_fold;
- return FALSE;
- }
-
-
- /*----------------------------------------------------------------------------
- *
- * ASM_get_token() will obtain the next token in the line pointed to by lptr
- * and in addition will return FALSE if EOL is reached or a comment character
- * is the first non whitespace character found
- *
- ---------------------------------------------------------------------------*/
-
- BOOLEAN ASM_get_token(lptr, token)
- char **lptr;
- char *token;
- {
- char *s; /* start location in string */
- int token_length; /* the length of the current token */
- int dummy; /* a temporary variable */
-
- /* loop until we have a valid token or end of string */
- do {
- /* move past whitespace */
- while (IsWhite(**lptr)) {
- (*lptr)++;
- }
-
- /* return false if end of line */
- if (!**lptr)
- return FALSE;
-
- /* check if comment */
- if (**lptr == COMMENT_CHAR) {
- return FALSE;
- }
-
- /* check of delimiter token */
- if (strchr(ASM_delim_Sym, **lptr)) {
- token[0] = **lptr;
- token[1] = '\0';
- (*lptr)++;
- }
- else {
-
- /* save the beginning location */
- s = *lptr;
-
- /* move to the next delimiter in the line */
- while (!IsDelim(**lptr)) {
- (*lptr)++;
- }
-
- /* get the token */
- token_length = *lptr - s;
- strncpy(token, s, token_length);
- token[token_length] = '\0';
- }
-
- } while (ASMIsMember(ASM_NOP_Sym, token, &dummy));
-
- return TRUE;
- }
-
-
- /*----------------------------------------------------------------------------
- *
- * getline() obtain the next line in the buffer
- *
- ---------------------------------------------------------------------------*/
-
- BOOLEAN getline(inbuf, line)
- char *inbuf;
- char *line;
- {
- char *oldbuf = bufname;
- int cur_point = point;
-
- bufname = inbuf;
-
- nl_forward();
- if (cur_point != point) {
- grab(cur_point, point, line);
- }
- else {
- return FALSE;
- }
-
- bufname = oldbuf;
- return TRUE;
- }
-
-
/*----------------------------------------------------------------------------
 *
 * output_tag() records one tag by calling add_tag() with the symbol, the
 * input file name and the character offset.
 *
 * outbuf, line and line_number are accepted but not used.  This routine is
 * the one place to extend should another output format or more information
 * per tag be wanted.
 *
 ---------------------------------------------------------------------------*/

output_tag(outbuf, line, symbol, infname, line_number, char_number)
char *outbuf;
char *line;
char *symbol;
char *infname;
int line_number;
int char_number;
{
    /* add_tag() is defined in tags.e */
    add_tag(symbol, infname, char_number);
}
-
-
- /*----------------------------------------------------------------------------
- *
- * ASMtags() tags an input stream assuming input format of ASM 80x86 format
- * in MASM/TASM syntax
- *
- ---------------------------------------------------------------------------*/
-
- ASMTags(inbuf, infname, outbuf)
- char *inbuf;
- char *infname;
- char *outbuf;
- {
- State state; /* the current state of the parser */
-
- char line[256]; /* the current input line */
- char cur_token[256]; /* the current token */
- char prev_token[256]; /* the previous token */
-
- char *lptr; /* pointer into line for token parser */
- char *prev_lptr; /* pointer into line for previous token */
-
- int line_number; /* the current line in the file */
- int line_length; /* the length of the current line */
- int char_number; /* the current character in the file */
-
- int symbol_index; /* the index into the token list of the
- * symbol */
-
- char *oldbuf = bufname;
- int *oldpoint;
- int *oldmark;
-
- /* save current buffer state */
- oldpoint = alloc_spot();
- exchange_point_and_mark();
- oldmark = alloc_spot();
-
- /* init the engine */
- ASMParserInit();
- cur_token[0] = '\0';
- prev_token[0] = '\0';
- state = Discard;
- line_number = 0;
- line_length = 0;
- char_number = 0;
- lptr = prev_lptr = (char *) NULL;
-
- for (;;) {
-
- switch (state) {
-
- case Discard: /* current line is not valid */
-
- /* if EOF then return */
- if (getline(inbuf, line)) {
- lptr = line;
-
- /* increment counters */
- line_number++;
-
- /* char_number increments by length of previous line */
- char_number += line_length;
-
- /* line length */
- line_length = strlen(line);
- state = Parse1;
- }
- else {
- state = Exit;
- }
- break;
-
- case Parse1: /* parsing for first *special* token */
-
- /* get the next valid token */
- if (!ASM_get_token(&lptr, cur_token)) {
-
- /* if no token left or a comment as first non white space
- * char in remainder of line */
- state = Discard;
- }
- else {
-
- /* move the cur_token to prev_token */
- strcpy(prev_token, cur_token);
-
- /* check for membership in the tagging symbol club */
- if (ASMIsMember(ASM_sym, cur_token, &symbol_index)) {
- state = Symbol1;
- }
- else {
-
- /* check if comment block */
- if (ASMIsMember(ASM_comment_block,
- cur_token, &symbol_index)) {
-
- /* get the next non white character, this makes
- * the assumption that the delimiter character is
- * on the same line as the comment symbol. If the
- * delimiter character is not on the current line
- * then parsing continues normally on the next
- * line. */
- while (IsWhite(*lptr)) {
- lptr++;
- }
-
- if (*lptr) {
-
- /* this is the delimiter character, store it
- * and move lptr past it */
- *cur_token = *lptr;
- lptr++;
-
- /* move over comment block, remembering to
- * update line info as we go */
- while (*lptr != *cur_token) {
-
- /* get a new line if end of line */
- if (!*lptr) {
- if (!getline(inbuf, line)) {
- *cur_token = *lptr;
- }
- else {
- lptr = line;
-
- /* increment counters */
- line_number++;
-
- /* char_number increments by
- * length of previous line */
- char_number += line_length;
-
- /* line length */
- line_length = strlen(line);
- }
- }
- else {
- lptr++;
- }
- }
- }
-
- state = Discard;
- }
- else {
-
- /* nothing special, parse the next symbol */
- state = Parse2;
- }
- }
- }
- break;
-
- case Parse2: /* parsing for second *special* token */
-
- /* save the previous position */
- prev_lptr = lptr;
-
- /* get the next token */
- if (!ASM_get_token(&lptr, cur_token)) {
-
- /* no token left, reset machine */
- state = Discard;
- }
- else {
-
- if (ASMIsMember(ASM_sym, cur_token, &symbol_index)) {
-
- /* found a major symbol */
- state = Symbol2;
- }
- else {
-
- if (ASMIsMember(ASM_def, cur_token, &symbol_index)) {
-
- /* found a defining token */
- state = Define;
- }
- else {
- state = Discard;
- }
- }
- }
-
- break;
-
- case Symbol1: /* next token, ignore if no token found */
-
- /* get the next symbol and output it */
- if (ASM_get_token(&lptr, cur_token)) {
-
- if (ASMSymbolWanted(symbol_index)) {
- output_tag(outbuf, line, cur_token, infname,
- line_number, char_number +
- lptr - line -
- strlen(cur_token));
- }
- }
-
- /* reset machine */
- state = Discard;
-
- break;
-
- case Symbol2: /* previous token was the wanted symbol */
-
- /* the previous token is the symbol of interest */
- if (ASMSymbolWanted(symbol_index)) {
- output_tag(outbuf, line, prev_token, infname,
- line_number, char_number +
- prev_lptr - line -
- strlen(prev_token));
- }
-
- /* reset machine */
- state = Discard;
-
- break;
-
- case Define: /* previous token was the wanted symbol */
-
- /* the previous token is the symbol of interest */
- if ((ASMTagWantDefine && symbol_index != 1) ||
- (ASMTagWantLabel && symbol_index == 1)) {
- output_tag(outbuf, line, prev_token, infname,
- line_number, char_number +
- prev_lptr - line -
- strlen(prev_token));
- }
-
- /* reset machine */
- state = Discard;
-
- break;
-
- case Exit: /* clean it up */
-
- /* restore original location */
- bufname = oldbuf;
- point = *oldpoint;
- mark = *oldmark;
- free_spot(oldpoint);
- free_spot(oldmark);
- return;
- break;
-
- default: /* not reached */
- break;
- }
- }
- }
-
- /*----------------------------------------------------------------------------
- *
- * tag_suffix_asm() and tag_suffix_inc() are recognized procedure names
- * to the tags package in Epsilon and will be called automatically when
- * tagging needs to happen for these extensions. tag_suffix_asm() is a
- * replacement for the routine of the same name defined in tags.e and
- * tag_suffix_inc() is new.
- *
- ---------------------------------------------------------------------------*/
-
-
- tag_suffix_asm()
- {
- /* the third parameter, the output buffer name is not actually used by
- * anyone but is left here for a time when this information may be
- * needed. The current algorithm is to let the funtion add_tag() decide
- * the buffer name to send the output to. As a little more than
- * coincedence, the name used here is the same used in add_tag() defined
- * in tags.e */
- ASMTags(bufname, filename, "-tags");
- }
-
/* .INC files are tagged exactly like .ASM files. */
tag_suffix_inc()
{
    tag_suffix_asm();
}
-
- /* rebuild the default character maps */
- when_loading()
- {
- #define UCLC(up, low) def_char_class[low] = C_LOWER, \
- def_char_class[up] = C_UPPER, \
- def_srch_case_map[up] = low, \
- def_case_map[low] = up, \
- def_case_map[up] = low
-
- int i, j;
-
- for (i = 0; i < 256; i++)
- def_case_map[i] = def_srch_case_map[i] = i;
- for (i = 'A', j = 'a'; i <= 'Z'; i++, j++)
- UCLC(i, j);
- for (i = 131; i < 154; i++)
- def_char_class[i] = C_LOWER;
- for (i = 160; i < 164; i++)
- def_char_class[i] = C_LOWER;
- UCLC('Ç', 'ç');
- UCLC('Ä', 'ä');
- UCLC('Å', 'å');
- UCLC('É', 'é');
- UCLC('Æ', 'æ');
- UCLC('Ö', 'ö');
- UCLC('Ü', 'ü');
- UCLC('Ñ', 'ñ');
- }
-