home *** CD-ROM | disk | FTP | other *** search
- Path: sparky!uunet!spool.mu.edu!olivea!pagesat!spssig.spss.com!uchinews!ellis!goer
- From: goer@ellis.uchicago.edu (Richard L. Goerwitz)
- Newsgroups: comp.lang.icon
- Subject: parser generator, part 3
- Message-ID: <1993Jan3.211920.28528@midway.uchicago.edu>
- Date: 3 Jan 93 21:19:20 GMT
- References: <1993Jan3.211757.28395@midway.uchicago.edu>
- Sender: news@uchinews.uchicago.edu (News System)
- Reply-To: goer@midway.uchicago.edu
- Organization: University of Chicago
- Lines: 1017
-
- ---- Cut Here and feed the following to sh ----
- #!/bin/sh
- # this is ibpag.03 (part 3 of a multipart archive)
- # do not concatenate these parts, unpack them in order with /bin/sh
- # file maketbls.icn continued
- #
- if test ! -r _shar_seq_.tmp; then
- echo 'Please unpack part 1 first!'
- exit 1
- fi
- (read Scheck
- if test "$Scheck" != 3; then
- echo Please unpack part "$Scheck" next!
- exit 1
- else
- exit 0
- fi
- ) < _shar_seq_.tmp || exit 1
- if test ! -f _shar_wnt_.tmp; then
- echo 'x - still skipping maketbls.icn'
- else
- echo 'x - continuing file maketbls.icn'
- sed 's/^X//' << 'SHAR_EOF' >> 'maketbls.icn' &&
- X
- X *arglst[1] <= 1 | *arglst = 1 & { return arglst[1] }
- X sortfield := arglst[2] | { return sortf(arglst[1]) }
- X arglst[1] := sortf(arglst[1], sortfield)
- X
- X old_i := 1
- X every i := old_i+1 to *arglst[1] do {
- X if not (arglst[1][old_i][sortfield] === arglst[1][i][sortfield])
- X then {
- X return sortff!(push(arglst[3:0], arglst[1][old_i : i])) |||
- X sortff!(push(arglst[2:0], arglst[1][i : 0]))
- X }
- X }
- X return sortff!(push(arglst[3:0], arglst[1]))
- X
- Xend
- SHAR_EOF
- echo 'File maketbls.icn is complete' &&
- true || echo 'restore of maketbls.icn failed'
- rm -f _shar_wnt_.tmp
- fi
- # ============= preproc.icn ==============
- if test -f 'preproc.icn' -a X"$1" != X"-c"; then
- echo 'x - skipping preproc.icn (File already exists)'
- rm -f _shar_wnt_.tmp
- else
- > _shar_wnt_.tmp
- echo 'x - extracting preproc.icn (Text)'
- sed 's/^X//' << 'SHAR_EOF' > 'preproc.icn' &&
- X############################################################################
- X#
- X# Name: preproc.icn
- X#
- X# Title: file preprocessing utilities for IBPAG
- X#
- X# Author: Richard L. Goerwitz
- X#
- X# Version: 1.18
- X#
- X############################################################################
- X#
- X# This file contains the preprocessing subsystem for IBPAG.
- X# Essentially, the routines contained here read the file, find the
- X# defines, do macro substitutions, find the start_symbol declaration
- X# (if there is one), and find and store rule definitions, outputting
- X# Icon procedures in their place. The stored rule definitions later
- X# get used by CONST_STATES, and turned into a parser.
- X#
- X############################################################################
- X#
- X# Links: none
- X#
- X# See also: ibpag.icn, maketbls.icn, debugme.icn, and esp. itokens.icn
- X#
- X############################################################################
- X
- X# ximage is only for debugging
- X# link ximage
- X
- X# declared in maketbls.icn
- X# global ptbl, start_symbol, line_number
- X
- X# A grammar symbol: str holds its name/text; terminal is set (to 1)
- X# for terminals (string literals in a rule's RHS) and left null for
- X# nonterminals (identifiers).
- Xrecord symbol(str, terminal)
- X# One production: LHS (string), RHS (list of symbol records or, before
- X# expansion, nested alternate-lists), a numeric priority, an
- X# associativity ("left", "right", or "none"), and the name of the Icon
- X# procedure generated for the rule's action (e.g. "_00001").
- Xrecord rule(LHS, RHS, priority, associativity, procname)
- X
- X#
- X# makeptbl: file -> table
- X# (f) -> ptbl
- X#
- X# Where f is a file containing IBPAG source code, and ptbl is a
- X# table of the productions contained in f; writes a preprocessed
- X# version of f to f2 (stdout by default) as a side-effect (an
- X# important one, nonetheless). If a "start_symbol X" declaration
- X# is encountered, it also sets the start-symbol to "X" (default is
- X# "S").
- X#
- X# The structure of ptbl: key = LHS (string), value = rule list
- X# (i.e. list of rule records). Keys are always nonterminals, as
- X# there is no need to record terminals (they appear only in the
- X# RHS of rules).
- X#
- X# Ptbl is global, so there really isn't any need to return it. It
- X# is used by almost every routine in maketbls.icn.
- X#
- Xprocedure makeptbl(f, f2)
- X
- X    local rulenum, state, separator, T, r, astr, new_r, RHS
- X    # global ptbl, start_symbol
- X    initial {
- X        start_symbol := "S"
- X        ptbl := table()
- X    }
- X
- X    rulenum := 0
- X    state := 0
- X    separator := ""
- X    #
- X    # States of the little automaton below:
- X    #   0 = copying input through; watching for RULE/STARTSYM
- X    #   1 = in a rule; expecting priority, associativity, or LHS
- X    #   2 = have priority; expecting associativity or LHS
- X    #   3 = have associativity; expecting LHS
- X    #   4 = have LHS; expecting "(" opening the RHS
- X    #   5 = in RHS; expecting a symbol (IDENT or STRINGLIT)
- X    #   6 = after an RHS symbol; expecting ",", "|", or ")"
- X    #   7 = like 5, but for alternates introduced by the BAR
- X    #   8 = rule read; copying input through until "end"
- X    #   9 = expecting the start symbol's name
- X    #
- X    # Iparse_tokens is in itokens.icn and suspends TOK records
- X    # having a sym and str field. The sym field contains symbol
- X    # names; the str field contains their actual string values in the
- X    # source file.
- X    #
- X    every T := \iparse_tokens(f) do {
- X
- X        #
- X        # Check for null sym field (iparse_tokens uses TOK(&null,
- X        # "\n") to signal the presence of a syntactically meaningless
- X        # newline; we want to print it later on so as to maintain the
- X        # original line structure), but we don't want to actually
- X        # parse it here.
- X        #
- X        if \T.sym then {
- X
- X            # Note that this little automaton passes its input through
- X            # only under certain conditions in states 0 and 8.
- X            # Otherwise it is either reading a rule or a start_symbol
- X            # definition.
- X            #
- X            case state of {
- X
- X                0 : {
- X                    # Typical case: We are looking for the start of
- X                    # the next start_symbol or rule declaration. If
- X                    # neither is found, do nuttin' except to pass the
- X                    # T record on to the printing routine below.
- X                    #
- X                    case T.sym of {
- X                        "STARTSYM" : {
- X                            state := 9
- X                            next
- X                        }
- X                        "RULE" : {
- X                            r := rule(,,,,"_" || right(rulenum +:= 1, 5, "0"))
- X                            state := 1
- X                            next
- X                        }
- X                        default : &null
- X                    }
- X                }
- X
- X                1 : {
- X                    # We are in a rule def. Look for the priority
- X                    # next. If we don't get an INTLIT or REALLIT,
- X                    # then assign a default priority, and see if we
- X                    # have an IDENT. If so, then see if it's the
- X                    # associativity; if so, set the associativity; if
- X                    # not, assign a default associativity and then see
- X                    # if we have a rule LHS. If not, then we have an
- X                    # error.
- X                    #
- X                    if T.sym == ("INTLIT"|"REALLIT") then {
- X                        r.priority := real(T.str)
- X                        state := 2
- X                        next
- X                    } else if T.sym == "IDENT" then {
- X                        # if you change the default priority here, do
- X                        # it in maketbls.icn, too (e.g. rule(,,1))
- X                        r.priority := 1
- X                        if T.str == ("left"|"right"|"none")
- X                        then {
- X                            r.associativity := T.str
- X                            state := 3
- X                            next
- X                        } else {
- X                            r.associativity := "none" # default
- X                            r.LHS := T.str
- X                            state := 4
- X                            next
- X                        }
- X                    } else oh_no("line "|| line_number, 11)
- X                }
- X
- X                2 : {
- X                    # We have our priority; now get the associativity
- X                    # (looks to the tokenizer like an identifier). If
- X                    # the identifier doesn't have a string value of
- X                    # "right," "left," or "none," then assign a
- X                    # default associativity, and assume that the
- X                    # identifier is the LHS of a rule.
- X                    #
- X                    T.sym == "IDENT" | oh_no("line "|| line_number, 12)
- X                    if T.str == ("left"|"right"|"none") then {
- X                        r.associativity := T.str
- X                        state := 3
- X                        next
- X                    } else {
- X                        r.associativity := "none" # default
- X                        r.LHS := T.str
- X                        state := 4
- X                        next
- X                    }
- X                }
- X
- X                3 : {
- X                    # Now read the LHS of the rule.
- X                    #
- X                    if T.sym == "IDENT" then {
- X                        r.LHS := T.str
- X                        state := 4
- X                        next
- X                    } else oh_no("line "|| line_number, 1)
- X                }
- X
- X                4 : {
- X                    # Now go for the RHS of the rule (which looks like
- X                    # the argument list to an Icon procedure).
- X                    #
- X                    if T.sym == "LPAREN" then {
- X                        r.RHS := []
- X                        state := 5
- X                        next
- X                    } else oh_no("line "|| line_number, 3)
- X                }
- X
- X                5 : {
- X                    # We have the left parenthesis; now read the
- X                    # arguments. Note that the smallest argument list
- X                    # possible is the empty string, i.e. (""). The
- X                    # alternation operator, |, is permitted, but has
- X                    # different semantics than it does for Icon code.
- X                    #
- X                    case T.sym of {
- X                        "IDENT" : put(r.RHS, symbol(T.str)) &
- X                            state := 6
- X                        "STRINGLIT" : put(r.RHS, symbol(no_quotes(T.str), 1)) &
- X                            state := 6
- X                        default : oh_no("line "|| line_number, 2)
- X                    }
- X                    next
- X                }
- X
- X                6 : {
- X                    # We have just read an element for the RHS of a
- X                    # rule; now we either close the current position
- X                    # with a comma, close the entire RHS with a
- X                    # parend, or insert an(other) alternative, via the
- X                    # BAR, for the last element.
- X                    #
- X                    case T.sym of {
- X                        "BAR" : r.RHS[-1] := [r.RHS[-1]] &
- X                            state := 7
- X                        "COMMA" : state := 5
- X                        "RPAREN" : {
- X                            astr := ""
- X                            every astr ||:= "arg" || (1 to *r.RHS) || ","
- X                            astr := trim(astr, ',')
- X                            write(f2, "procedure ", r.procname, "(", astr, ")")
- X                            separator := ""
- X                            every RHS := expand_rhs(r.RHS) do {
- X                                new_r := copy(r)
- X                                new_r.RHS := RHS
- X                                /ptbl[new_r.LHS] := []
- X                                put(ptbl[new_r.LHS], new_r)
- X                            }
- X                            state := 8
- X                        }
- X                        default : oh_no("line "|| line_number, 10)
- X                    }
- X                    next
- X                }
- X
- X                7 : {
- X                    # Like state 5, only for elements encountered
- X                    # after a BAR. These get stuffed into a list, and
- X                    # later turned into RHSs that are identical except
- X                    # in cases where a BAR specified alternates for a
- X                    # given position. See expand_rhs().
- X                    #
- X                    case T.sym of {
- X                        "IDENT" : put(r.RHS[-1], symbol(T.str)) &
- X                            state := 6
- X                        "STRINGLIT" : put(r.RHS[-1],
- X                            symbol(no_quotes(T.str), 1)) &
- X                            state := 6
- X                        default : oh_no("line "|| line_number, 6)
- X                    }
- X                    next
- X                }
- X
- X                8 : {
- X                    # We're done with the rule definition. We are again
- X                    # passing tokens through to the printing routine
- X                    # below. We're still looking for an "end"
- X                    # keyword, though. When we get it, go back to
- X                    # state 0.
- X                    #
- X                    case T.sym of {
- X                        "RULE" : oh_no("line "||line_number, 7)
- X                        "PROCEDURE" : oh_no("line "||line_number, 8)
- X                        "END" : state := 0
- X                        default : &null
- X                    }
- X                    # Don't go for another token yet. NO "next"!
- X                }
- X
- X                9 : {
- X                    # This state is selected by a preceding
- X                    # "START_SYMBOL" symbol. We don't pass input
- X                    # through (note the "next" below). Input gets
- X                    # passed through again when we hit state 0.
- X                    #
- X                    if T.sym == "IDENT" then {
- X                        start_symbol := T.str
- X                        state := 0
- X                        next
- X                    } else oh_no("line "||line_number, 4)
- X                }
- X            }
- X        }
- X
- X        # This is the "printing" routine mentioned above...
- X        #
- X        # NB: Newlines that don't need to be present are signalled by
- X        # a null sym field. See the procedure do_newline(). If this
- X        # modelled the real Icon tokenizer, such newlines would be
- X        # ignored.
- X        #
- X        if any(&digits ++ &letters ++'_.', \T.str, 1, 2) & \T.sym ~=="DOT"
- X        then writes(f2, separator)
- X
- X        writes(f2, T.str)
- X
- X        if any(&digits ++ &letters ++'_.', \T.str,-1, 0) & \T.sym ~=="DOT"
- X        then separator := " " else separator := ""
- X    }
- X
- X# write(ximage(ptbl))
- X    # Ptbl is global, so this really isn't necessary.
- X    return ptbl
- X
- Xend
- X
- X
- X#
- X# no_quotes: string -> string
- X# s -> s2
- X#
- X# Where s is the literal value of some STRINGLIT, and s2 is that
- X#	same literal value, with the enclosing quotation marks
- X# removed. E.g. "\"ab\"" -> "ab".
- X#
- Xprocedure no_quotes(s)
- X    # Match an opening quote, take everything up to the final
- X    # character, then match the closing quote; fails if s is not a
- X    # double-quoted string.
- X    return s ? 2(="\"", tab(-1), ="\"")
- Xend
- X
- X
- X#
- X# expand_rhs: list -> list
- X#
- X# Expand_rhs takes a list in which the elements are either
- X# symbols or lists of symbols, and produces lists with only
- X# symbols. E.g.
- X#
- X# [[[symbol1], symbol2], symbol3] -> [symbol1, symbol3]
- X# [symbol2, symbol3]
- X#
- Xprocedure expand_rhs(RHS)
- X    # Generator: suspends each fully-expanded version of RHS in turn,
- X    # by crossing the alternatives of the first element (via
- X    # expand_elem) with the expansions of the rest. An empty RHS
- X    # expands only to itself.
- X    *RHS = 0 & { return RHS }
- X    suspend [expand_elem(RHS[1])] ||| expand_rhs(RHS[2:0])
- Xend
- X#
- X#
- Xprocedure expand_elem(elem)
- X    # Generator: a plain symbol record is its own only expansion;
- X    # otherwise elem is a two-element list built by makeptbl's BAR
- X    # handling (states 6-7), where elem[1] holds the earlier
- X    # alternate(s) -- possibly itself such a pair -- and elem[2] the
- X    # latest one. Each alternative is suspended in turn.
- X    if type(elem) == "symbol"
- X    then return elem
- X    else {
- X        suspend expand_elem(elem[1])
- X        suspend elem[2]
- X    }
- Xend
- X
- SHAR_EOF
- true || echo 'restore of preproc.icn failed'
- rm -f _shar_wnt_.tmp
- fi
- # ============= itokens.icn ==============
- if test -f 'itokens.icn' -a X"$1" != X"-c"; then
- echo 'x - skipping itokens.icn (File already exists)'
- rm -f _shar_wnt_.tmp
- else
- > _shar_wnt_.tmp
- echo 'x - extracting itokens.icn (Text)'
- sed 's/^X//' << 'SHAR_EOF' > 'itokens.icn' &&
- X############################################################################
- X#
- X# Name: itokens.icn
- X#
- X# Title: itokens (Icon source-file tokenizer)
- X#
- X# Author: Richard L. Goerwitz
- X#
- X# Version: 1.7
- X#
- X############################################################################
- X#
- X# This file contains a tokenizer to be used for Icon source files.
- X# Normally it would be incorporated into a package utilizing IBPAG,
- X# but it has a stub main procedure that makes it potentially a
- X# standalone package as well.
- X#
- X############################################################################
- X#
- X# Links: slashupto
- X#
- X# Requires: coexpressions
- X#
- X############################################################################
- X
- X#link ximage
- X#link slashupto #make sure you have version 1.2 or above
- X
- X# next_c: the one-character lookahead shared by all the do_* routines.
- X# line_number: current input line, maintained by the getchar
- X# coexpression created in iparse_tokens().
- Xglobal next_c, line_number
- X# A token: sym is the preterminal name (e.g. "IDENT", "INTLIT"); str
- X# is the token's literal text. A null sym marks a syntactically
- X# meaningless newline, passed through only to preserve line structure.
- Xrecord TOK(sym, str)
- X
- X#
- X# stub main for testing
- X#
- X#procedure main()
- X#
- X# local separator
- X# separator := ""
- X# every T := \iparse_tokens(&input) do {
- X# #
- X# # Newlines that don't need to be present are signalled by a
- X# # null sym field. See the procedure do_newline(). If this
- X# # modelled the real Icon tokenizer, such newlines would be
- X# # ignored, and no token (not even a dummy token) would be
- X# # suspended.
- X# #
- X# if any(&digits ++ &letters ++ '_.', \T.str, 1, 2) & \T.sym ~== "DOT"
- X# then writes(separator)
- X# writes(T.str)
- X# if any(&digits ++ &letters ++ '_.', \T.str, -1, 0) & \T.sym ~== "DOT"
- X# then separator := " " else separator := ""
- X# }
- X#
- X#end
- X
- X#
- X# iparse_tokens: file -> TOK records (a generator)
- X# (stream) -> tokens
- X#
- X# Where file is an open input stream, and tokens are TOK records
- X# holding both the token type and actual token text.
- X#
- X# TOK records contain two parts, a preterminal symbol (the first
- X# "sym" field), and the actual text of the token ("str"). The
- X# parser only pays attention to the sym field, although the
- X# strings themselves get pushed onto the value stack.
- X#
- X# Note the following kludge: Unlike real Icon tokenizers, this
- X#	procedure returns syntactically meaningless newlines as TOK
- X# records with a null sym field. Normally they would be ignored.
- X# I wanted to return them so they could be printed on the output
- X# stream, thus preserving the line structure of the original
- X# file, and making later diagnostic messages more usable.
- X#
- Xprocedure iparse_tokens(stream, getchar)
- X
- X    local elem, whitespace, last_token, token, primitives, reserveds
- X    static be_tbl, reserved_tbl, operators
- X    initial {
- X
- X        # Primitive Tokens
- X        #
- X        primitives := [
- X            ["identifier", "IDENT", "be"],
- X            ["integer-literal", "INTLIT", "be"],
- X            ["real-literal", "REALLIT", "be"],
- X            ["string-literal", "STRINGLIT", "be"],
- X            ["cset-literal", "CSETLIT", "be"],
- X            ["end-of-file", "EOFX", "" ]]
- X
- X        # Reserved Words
- X        #
- X        reserveds := [
- X            ["break", "BREAK", "be"],
- X            ["by", "BY", "" ],
- X            ["case", "CASE", "b" ],
- X            ["create", "CREATE", "b" ],
- X            ["default", "DEFAULT", "b" ],
- X            ["do", "DO", "" ],
- X            ["else", "ELSE", "" ],
- X            ["end", "END", "b" ],
- X            ["every", "EVERY", "b" ],
- X            ["fail", "FAIL", "be"],
- X            ["global", "GLOBAL", "" ],
- X            ["if", "IF", "b" ],
- X            ["initial", "INITIAL", "b" ],
- X            ["invocable", "INVOCABLE", "" ],
- X            ["link", "LINK", "" ],
- X            ["local", "LOCAL", "b" ],
- X            ["next", "NEXT", "be"],
- X            ["not", "NOT", "b" ],
- X            ["of", "OF", "" ],
- X            ["procedure", "PROCEDURE", "" ],
- X            ["record", "RECORD", "" ],
- X            ["repeat", "REPEAT", "b" ],
- X            ["return", "RETURN", "be"],
- X            # Keyword beginning a rule definition. Like "procedure."
- X            ["rule", "RULE", "" ], #<-NB
- X            # Keyword beginning a start-symbol declaration.
- X            ["start_symbol", "STARTSYM", "" ],
- X            ["static", "STATIC", "b" ],
- X            ["suspend", "SUSPEND", "be"],
- X            ["then", "THEN", "" ],
- X            ["to", "TO", "" ],
- X            ["until", "UNTIL", "b" ],
- X            ["while", "WHILE", "b" ]]
- X
- X        # Operators
- X        #
- X        operators := [
- X            [":=", "ASSIGN", "" ],
- X            ["@", "AT", "b" ],
- X            ["@:=", "AUGACT", "" ],
- X            ["&:=", "AUGAND", "" ],
- X            ["=:=", "AUGEQ", "" ],
- X            ["===:=", "AUGEQV", "" ],
- X            [">=:=", "AUGGE", "" ],
- X            [">:=", "AUGGT", "" ],
- X            ["<=:=", "AUGLE", "" ],
- X            ["<:=", "AUGLT", "" ],
- X            ["~=:=", "AUGNE", "" ],
- X            ["~===:=", "AUGNEQV", "" ],
- X            ["==:=", "AUGSEQ", "" ],
- X            [">>=:=", "AUGSGE", "" ],
- X            [">>:=", "AUGSGT", "" ],
- X            ["<<=:=", "AUGSLE", "" ],
- X            ["<<:=", "AUGSLT", "" ],
- X            ["~==:=", "AUGSNE", "" ],
- X            ["\\", "BACKSLASH", "b" ],
- X            ["!", "BANG", "b" ],
- X            ["|", "BAR", "b" ],
- X            ["^", "CARET", "b" ],
- X            ["^:=", "CARETASGN", "b" ],
- X            [":", "COLON", "" ],
- X            [",", "COMMA", "" ],
- X            ["||", "CONCAT", "b" ],
- X            ["||:=", "CONCATASGN","" ],
- X            ["&", "CONJUNC", "b" ],
- X            [".", "DOT", "b" ],
- X            ["--", "DIFF", "b" ],
- X            ["--:=", "DIFFASGN", "" ],
- X            ["===", "EQUIV", "b" ],
- X            ["**", "INTER", "b" ],
- X            ["**:=", "INTERASGN", "" ],
- X            ["{", "LBRACE", "b" ],
- X            ["[", "LBRACK", "b" ],
- X            ["|||", "LCONCAT", "b" ],
- X            ["|||:=", "LCONCATASGN","" ],
- X            ["==", "LEXEQ", "b" ],
- X            [">>=", "LEXGE", "" ],
- X            [">>", "LEXGT", "" ],
- X            ["<<=", "LEXLE", "" ],
- X            ["<<", "LEXLT", "" ],
- X            ["~==", "LEXNE", "b" ],
- X            ["(", "LPAREN", "b" ],
- X            ["-:", "MCOLON", "" ],
- X            ["-", "MINUS", "b" ],
- X            ["-:=", "MINUSASGN", "" ],
- X            ["%", "MOD", "" ],
- X            ["%:=", "MODASGN", "" ],
- X            ["~===", "NOTEQUIV", "b" ],
- X            ["=", "NUMEQ", "b" ],
- X            [">=", "NUMGE", "" ],
- X            [">", "NUMGT", "" ],
- X            ["<=", "NUMLE", "" ],
- X            ["<", "NUMLT", "" ],
- X            ["~=", "NUMNE", "b" ],
- X            ["+:", "PCOLON", "" ],
- X            ["+", "PLUS", "b" ],
- X            ["+:=", "PLUSASGN", "" ],
- X            ["?", "QMARK", "b" ],
- X            ["<-", "REVASSIGN", "" ],
- X            ["<->", "REVSWAP", "" ],
- X            ["}", "RBRACE", "e" ],
- X            ["]", "RBRACK", "e" ],
- X            [")", "RPAREN", "e" ],
- X            [";", "SEMICOL", "" ],
- X            ["?:=", "SCANASGN", "" ],
- X            ["/", "SLASH", "b" ],
- X            ["/:=", "SLASHASGN", "" ],
- X            ["*", "STAR", "b" ],
- X            ["*:=", "STARASGN", "" ],
- X            [":=:", "SWAP", "" ],
- X            ["~", "TILDE", "b" ],
- X            ["++", "UNION", "b" ],
- X            ["++:=", "UNIONASGN", "" ],
- X            # archaic digraph equivalents of {, }, [, and ]
- X            ["$(", "LBRACE", "b" ],
- X            ["$)", "RBRACE", "e" ],
- X            ["$<", "LBRACK", "b" ],
- X            ["$>", "RBRACK", "e" ]]
- X
- X        # static be_tbl, reserved_tbl
- X        reserved_tbl := table()
- X        every elem := !reserveds do
- X            insert(reserved_tbl, elem[1], elem[2])
- X        be_tbl := table()
- X        every elem := !primitives | !reserveds | !operators do {
- X            insert(be_tbl, elem[2], elem[3])
- X        }
- X    }
- X
- X    # Default getchar: a coexpression producing the input one
- X    # character at a time, appending "\n" to every line (so the
- X    # stream always ends in a newline) and counting lines in the
- X    # global line_number. A non-null getchar argument (used by the
- X    # recursive call in do_newline) bypasses this.
- X    /getchar := create {
- X        line_number := 0
- X        ! ( 1(!stream, line_number +:=1) || "\n" )
- X    }
- X    whitespace := ' \t'
- X    /next_c := @getchar
- X
- X    repeat {
- X        case next_c of {
- X
- X            "." : {
- X                # Could be a real literal *or* a dot operator. Check
- X                # following character to see if it's a digit. If so,
- X                # it's a real literal. We can only get away with
- X                # doing the dot here because it is not a substring of
- X                # any longer identifier. If this gets changed, we'll
- X                # have to move this code into do_operator().
- X                #
- X                last_token := do_dot(getchar)
- X                suspend last_token
- X# write(&errout, "next_c == ", image(next_c))
- X                next
- X            }
- X
- X            "\n" : {
- X                # If do_newline fails, it means we're at the end of
- X                # the input stream, and we should break out of the
- X                # repeat loop.
- X                #
- X                every last_token := do_newline(getchar, last_token, be_tbl)
- X                do suspend last_token
- X                if next_c === &null then break
- X                next
- X            }
- X
- X            "\#" : {
- X                # Just a comment. Strip it by reading every character
- X                # up to the next newline. The global var next_c
- X                # should *always* == "\n" when this is done.
- X                #
- X                do_number_sign(getchar)
- X# write(&errout, "next_c == ", image(next_c))
- X                next
- X            }
- X
- X            "\"" : {
- X                # Suspend as STRINGLIT everything from here up to the
- X                # next non-backslashed quotation mark, inclusive
- X                # (accounting for the _ line-continuation convention).
- X                #
- X                last_token := do_quotation_mark(getchar)
- X                suspend last_token
- X# write(&errout, "next_c == ", image(next_c))
- X                next
- X            }
- X
- X            "'" : {
- X                # Suspend as CSETLIT everything from here up to the
- X                # next non-backslashed apostrophe, inclusive.
- X                #
- X                last_token := do_apostrophe(getchar)
- X                suspend last_token
- X# write(&errout, "next_c == ", image(next_c))
- X                next
- X            }
- X
- X            &null : oh_no(&null, 5) # unexpected EOF message
- X
- X            default : {
- X                # If we get to here, we have either whitespace, an
- X                # integer or real literal, an identifier or reserved
- X                # word (both get handled by do_identifier), or an
- X                # operator. The question of which we have can be
- X                # determined by checking the first character.
- X                #
- X                if any(whitespace, next_c) then {
- X                    # Like all of the TOK forming procedures,
- X                    # do_whitespace resets next_c.
- X                    do_whitespace(getchar, whitespace)
- X                    # don't suspend any tokens
- X                    next
- X                }
- X                if any(&digits, next_c) then {
- X                    last_token := do_digits(getchar)
- X                    suspend last_token
- X                    next
- X                }
- X                if any(&letters ++ '_', next_c) then {
- X                    last_token := do_identifier(getchar, reserved_tbl)
- X                    suspend last_token
- X                    next
- X                }
- X# write(&errout, "it's an operator")
- X                last_token := do_operator(getchar, operators)
- X                suspend last_token
- X                next
- X            }
- X        }
- X    }
- X
- X    # If stream argument is nonnull, then we are in the top-level
- X    # iparse_tokens(). If not, then we are in a recursive call, and
- X    # we should not emit all this end-of-file crap.
- X    #
- X    if \stream then {
- X        suspend TOK("EOFX")
- X        return TOK("$")
- X    }
- X    else fail
- X
- Xend
- X
- X
- X#
- X# do_dot: coexpression -> TOK record
- X# getchar -> t
- X#
- X# Where getchar is the coexpression that produces the next
- X# character from the input stream and t is a token record whose
- X# sym field contains either "REALLIT" or "DOT". Essentially,
- X# do_dot checks the next char on the input stream to see if it's
- X# an integer. Since the preceding char was a dot, an integer
- X# tips us off that we have a real literal. Otherwise, it's just
- X# a dot operator. Note that do_dot resets next_c for the next
- X# cycle through the main case loop in the calling procedure.
- X#
- Xprocedure do_dot(getchar)
- X
- X    local token
- X    # global next_c
- X
- X# write(&errout, "it's a dot")
- X
- X    # If dot's followed by a digit, then we have a real literal.
- X    #
- X    if any(&digits, next_c := @getchar) then {
- X# write(&errout, "dot -> it's a real literal")
- X        token := "." || next_c
- X        while any(&digits, next_c := @getchar) do
- X            token ||:= next_c
- X        if token ||:= (next_c == ("e"|"E")) then {
- X            # Skip leading zeros in the exponent (the value is
- X            # unchanged), then collect the remaining digits.
- X            # NOTE(review): signed exponents (".5e+3") are not
- X            # consumed here; do_digits() shares that limitation.
- X            while (next_c := @getchar) == "0"
- X            while any(&digits, next_c) do {
- X                token ||:= next_c
- X                # Fixed: this line read "next_c = @getchar" (numeric
- X                # comparison), which never advanced next_c and left
- X                # the loop spinning forever on any multi-digit
- X                # exponent. Assignment (:=) was clearly intended --
- X                # compare the parallel loops in do_digits().
- X                next_c := @getchar
- X            }
- X        }
- X        return TOK("REALLIT", token)
- X    }
- X
- X    # Dot not followed by an integer; so we just have a dot operator,
- X    # and not a real literal.
- X    #
- X# write(&errout, "dot -> just a plain dot")
- X    return TOK("DOT", ".")
- X
- Xend
- X
- X
- X#
- X# do_newline: coexpression x TOK record x table -> TOK records
- X# (getchar, last_token, be_tbl) -> Ts (a generator)
- X#
- X# Where getchar is the coexpression that returns the next
- X# character from the input stream, last_token is the last TOK
- X# record suspended by the calling procedure, be_tbl is a table of
- X# tokens and their "beginner/ender" status, and Ts are TOK
- X# records. Note that do_newline resets next_c. Do_newline is a
- X# mess. What it does is check the last token suspended by the
- X# calling procedure to see if it was a beginner or ender. It
- X# then gets the next token by calling iparse_tokens again. If
- X# the next token is a beginner and the last token is an ender,
- X# then we have to suspend a SEMICOL token. In either event, both
- X# the last and next token are suspended.
- X#
- Xprocedure do_newline(getchar, last_token, be_tbl)
- X
- X    local next_token
- X    # global next_c
- X
- X# write(&errout, "it's a newline")
- X
- X    # Go past any additional newlines.
- X    #
- X    while next_c == "\n" do {
- X        # NL can be the last char in the getchar stream; if it *is*,
- X        # then signal that it's time to break out of the repeat loop
- X        # in the calling procedure.
- X        #
- X        next_c := @getchar | {
- X            next_c := &null
- X            fail
- X        }
- X        # Each *extra* newline (beyond the first) is passed along as
- X        # a null-sym TOK; the first newline is represented by
- X        # whichever record is returned at the bottom of this
- X        # procedure (SEMICOL or a null-sym TOK).
- X        suspend TOK(&null, next_c == "\n")
- X    }
- X
- X    # If there was a last token (i.e. if a newline wasn't the first
- X    # character of significance in the input stream), then check to
- X    # see if it was an ender. If so, then check to see if the next
- X    # token is a beginner. If so, then suspend a TOK("SEMICOL")
- X    # record before suspending the next token.
- X    #
- X    if find("e", be_tbl[(\last_token).sym]) then {
- X# write(&errout, "calling iparse_tokens via do_newline")
- X# &trace := -1
- X        # First arg to iparse_tokens can be null here.
- X        if next_token := iparse_tokens(&null, getchar)
- X        then {
- X# write(&errout, "call of iparse_tokens via do_newline yields ",
- X# ximage(next_token))
- X            if find("b", be_tbl[next_token.sym])
- X            then suspend TOK("SEMICOL", "\n")
- X            #
- X            # See below. If this were like the real Icon parser,
- X            # the following line would be commented out.
- X            #
- X            else suspend TOK(&null, "\n")
- X            return next_token
- X        }
- X        else {
- X            #
- X            # If this were a *real* Icon tokenizer, it would not emit
- X            # any record here, but would simply fail. Instead, we'll
- X            # emit a dummy record with a null sym field.
- X            #
- X            return TOK(&null, "\n")
- X# &trace := 0
- X# fail
- X        }
- X    }
- X
- X    # See above. Again, if this were like Icon's own tokenizer, we
- X    # would just fail here, and not return any TOK record.
- X    #
- X# &trace := 0
- X    return TOK(&null, "\n")
- X# fail
- X
- Xend
- X
- X
- X#
- X# do_number_sign: coexpression -> &null
- X# getchar ->
- X#
- X# Where getchar is the coexpression that pops characters off the
- X# main input stream. Sets the global variable next_c. This
- X# procedure simply reads characters until it gets a newline, then
- X# returns with next_c == "\n". Since the starting character was
- X# a number sign, this has the effect of stripping comments.
- X#
- Xprocedure do_number_sign(getchar)
- X
- X    # global next_c
- X
- X# write(&errout, "it's a number sign")
- X    # Eat characters up to the next newline. Termination is assured
- X    # because the getchar coexpression built in iparse_tokens()
- X    # appends "\n" to every input line it reads.
- X    while next_c ~== "\n" do {
- X        next_c := @getchar
- X    }
- X
- X    # Return to calling procedure to cycle around again with the new
- X    # next_c already set. Next_c should always be "\n" at this point.
- X    return
- X
- Xend
- X
- X
- X#
- X# do_quotation_mark: coexpression -> TOK record
- X# getchar -> t
- X#
- X# Where getchar is the coexpression that yields another character
- X# from the input stream, and t is a TOK record with "STRINGLIT"
- X#	as its sym field.  Puts everything up to and including the next
- X# non-backslashed quotation mark into the str field. Handles the
- X# underscore continuation convention.
- X#
- Xprocedure do_quotation_mark(getchar)
- X
- X    local token
- X    # global next_c
- X
- X    # write(&errout, "it's a string literal")
- X    token := "\""
- X    while next_c := @getchar do {
- X        # An underscore just before a newline continues the literal
- X        # on the next line; drop both the underscore and the newline.
- X        if next_c == "\n" & token[-1] == "_" then {
- X            token := token[1:-1]
- X            next
- X        } else {
- X            # slashupto succeeds only on a non-backslashed quote, so
- X            # escaped quotes (\") remain inside the literal.
- X            if slashupto("\"", token ||:= next_c, 2)
- X            then {
- X                next_c := @getchar
- X                # resume outermost (repeat) loop in calling procedure,
- X                # with the new (here explicitly set) next_c
- X                return TOK("STRINGLIT", token)
- X            }
- X        }
- X    }
- X
- X    # NOTE(review): if the input ends before a closing quote is seen,
- X    # the loop exhausts getchar and the procedure simply fails.
- Xend
- X
- X
- X#
- X# do_apostrophe: coexpression -> TOK record
- X# getchar -> t
- X#
- X# Where getchar is the coexpression that yields another character
- X# from the input stream, and t is a TOK record with "CSETLIT"
- X#	as its sym field.  Puts everything up to and including the next
- X#	non-backslashed apostrophe into the str field.
- X#
- Xprocedure do_apostrophe(getchar)
- X
- X    local token
- X    # global next_c
- X
- X# write(&errout, "it's a cset literal")
- X    token := "'"
- X    while next_c := @getchar do {
- X        # slashupto succeeds only on a non-backslashed apostrophe, so
- X        # escaped apostrophes (\') remain inside the literal.
- X        if slashupto("'", token ||:= next_c, 2)
- X        then {
- X            next_c := @getchar
- X            # Return & resume outermost containing loop in calling
- X            # procedure w/ new next_c.
- X            return TOK("CSETLIT", token)
- X        }
- X    }
- X
- X    # NOTE(review): an unterminated cset literal exhausts getchar and
- X    # makes this procedure fail silently.
- Xend
- X
- X
- X#
- X# do_digits: coexpression -> TOK record
- X# getchar -> t
- X#
- X# Where getchar is the coexpression that produces the next char
- X# on the input stream, and where t is a TOK record containing
- X# either "REALLIT" or "INTLIT" in its sym field, and the text of
- X# the numeric literal in its str field.
- X#
- Xprocedure do_digits(getchar)
- X
- X    local token, tok_record
- X    # global next_c
- X
- X    # Assume integer literal until proven otherwise....
- X    tok_record := TOK("INTLIT")
- X
- X# write(&errout, "it's an integer or real literal")
- X    # A single leading "0" is dropped here; if nothing follows it,
- X    # the last line of this procedure restores it as the integer 0.
- X    token := ("0" ~== next_c) | ""
- X    while any(&digits, next_c := @getchar) do
- X        token ||:= next_c
- X    # "R"/"r" appears to be Icon's radix-literal marker (e.g. 8r17);
- X    # note that only decimal digits are consumed after it, so a
- X    # literal like 16rFF would not be fully read -- TODO confirm.
- X    if token ||:= (next_c == ("R"|"r")) then {
- X        while any(&digits, next_c := @getchar) do
- X            token ||:= next_c
- X    } else {
- X        if token ||:= (next_c == ".") then {
- X            while any(&digits, next_c := @getchar) do
- X                token ||:= next_c
- X            tok_record := TOK("REALLIT")
- X        }
- X        # NOTE(review): signed exponents ("1e+3") are not consumed
- X        # here; do_dot() shares the same limitation.
- X        if token ||:= (next_c == ("e"|"E")) then {
- X            while any(&digits, next_c := @getchar) do
- X                token ||:= next_c
- X            tok_record := TOK("REALLIT")
- X        }
- X    }
- X    tok_record.str := ("" ~== token) | 0
- X    return tok_record
- X
- Xend
- X
- X
- X#
- X# do_whitespace: coexpression x cset -> &null
- X# getchar x whitespace -> &null
- X#
- X# Where getchar is the coexpression producing the next char on
- X# the input stream. Do_whitespace just repeats until it finds a
- SHAR_EOF
- true || echo 'restore of itokens.icn failed'
- fi
- echo 'End of part 3'
- echo 'File itokens.icn is continued in part 4'
- echo 4 > _shar_seq_.tmp
- exit 0
- --
-
- -Richard L. Goerwitz goer%midway@uchicago.bitnet
- goer@midway.uchicago.edu rutgers!oddjob!ellis!goer
-