home *** CD-ROM | disk | FTP | other *** search
Text File | 1994-09-27 | 16.8 KB | 656 lines | [TEXT/CCL2] |
- ;;; File: parser/lexer Author: John
-
- ;;; token data structure: a list with the token type in the
- ;;; car and other information in the rest of the list. Symbols
- ;;; designate the token type.
-
- ;;; Reserved tokens use the name as the type and have no args.
- ;;; Reserved tokens:
- ;;; case class data default deriving else hiding if import in infix
- ;;; infixl infixr instance interface let module of renaming then to
- ;;; type where .. :: => = @ \ | ~ <- -> `
- ;;; Other tokens:
- ;;; (file string)
- ;;; (newline line indent-column)
- ;;; (conid string)
- ;;; (varid string)
- ;;; (consym string)
- ;;; (varsym string)
- ;;; (comment string) ;;; not used at the moment
- ;;; (integer integer)
- ;;; (float integer fraction exponent)
- ;;; (string string)
- ;;; (eof)
-
-
- ;;; *** All of the stuff for lexing character and string literals is
- ;;; *** broken because it assumes that the host Lisp uses the ASCII
- ;;; *** encoding for characters and supports at least 255 characters.
- ;;; *** I have marked the specific places in the code where these
- ;;; *** assumptions are made, but fixing the problem will probably
- ;;; *** require more drastic changes anyway -- such as using integers
- ;;; *** instead of characters and vectors of integers instead of characters
- ;;; *** throughout the compiler.
-
- (define *max-char* 255) ; highest char-code allowed.
-
- ;;; This defines the long names of the control chars. Note that some of
- ;;; this duplicates the table above & the reader.
-
- (define *control-char-names* '(
- ("NUL" . 0) ("SOH" . 1) ("STX" . 2) ("ETX" . 3)
- ("EOT" . 4) ("ENQ" . 5) ("ACK" . 6) ("BEL" . 7)
- ("BS" . 8) ("HT" . 9) ("LF" . 10) ("VT" . 11)
- ("FF" . 12) ("CR" . 13) ("SO" . 14) ("SI" . 15)
- ("DLE" . 16) ("DC1" . 17) ("DC2" . 18) ("DC3" . 19)
- ("DC4" . 20) ("NAK" . 21) ("SYN" . 22) ("ETB" . 23)
- ("CAN" . 24) ("EM" . 25) ("SUB" . 26) ("ESC" . 27)
- ("FS" . 28) ("GS" . 29) ("RS" . 30) ("US" . 31)
- ("SP" . 32) ("DEL" . 127)))
-
- ;;; This defines the short names for a few control chars. This
- ;;; is keyed off the previous table
-
- (define *short-control-char-names* '(
- (#\a . "BEL") (#\b . "BS") (#\f . "FF") (#\n . "LF")
- (#\r . "CR") (#\t . "HT") (#\v . "VT")))
-
- ;;; This is used in the ^X construct. Assume that ^X = code for ^A + X-A
- ;;; *** This is an invalid assumption.
-
- (define *control-A* 1)
-
- ;;; This function is the interface between the lexer and the rest
- ;;; of the system. Note that the `file' reported in error messages
- ;;; must be bound in an outer context.
-
-
- ;;; *** I think this function should be binding these variables and not
- ;;; *** just assigning them.
-
- (define (lex-port port literate?)
- (lex-port-1 port literate? '#t))
-
- (define (lex-port/nolines port)
- (lex-port-1 port '#f '#f))
-
- (define (lex-port-1 port literate? has-lines?)
- (setf *lex-literate?* literate?)
- (setf *current-line* (if has-lines? 1 -1000))
- (setf *current-col* 0)
- (setf *on-new-line?* '#t)
- (setf *save-col?* '#f)
- (setf *port* port)
- (setf *tokens* '())
- (setf *char* (read-char *port*))
- (setf *peek-char* (read-char *port*))
- (when (eof-object? *char*)
- (setf *char* '#\space))
- (when (eof-object? *peek-char*)
- (setf *peek-char* '#\space))
- (setf *at-eof/p?* '#f)
- (setf *at-eof?* '#f)
- (when *lex-literate?*
- (process-literate-comments '#t))
- (parse-till-eof)
- (nreverse *tokens*))
-
- (define (parse-till-eof)
- (cond (*at-eof?*
- (emit-token 'eof)
- '())
- (else
- (lex-one-token)
- (parse-till-eof))))
-
- ;;; There is an assumption that the scanner never peeks beyond a newline.
- ;;; In literate mode, this may reveal the wrong thing.
-
- (define (advance-char)
- (if (and *lex-literate?* (eqv? *char* #\newline))
- (process-literate-comments '#f)
- (advance-char-1)))
-
- (define (advance-char-1)
- (cond ((eqv? *char* #\newline)
- (setf *on-new-line?* '#t)
- (incf (the fixnum *current-line*))
- (setf *current-col* 0))
- ((eqv? *char* #\tab)
- (incf (the fixnum *current-col*) (- 8 (modulo *current-col* 8))))
- (else
- (incf (the fixnum *current-col*))))
- (setf *char* *peek-char*)
- (setf *at-eof?* *at-eof/p?*)
- (setf *peek-char* (read-char *port*))
- (when (eof-object? *peek-char*)
- (setf *at-eof/p?* '#t)
- (setf *peek-char* '#\space))
- *char*)
-
- (define (peek-char-2)
- (let ((ch (peek-char *port*)))
- (if (eof-object? ch)
- '#\space
- ch)))
-
- (define (lex-one-token)
- (setf *start-line* *current-line*) ; capture the loc at the start of the token
- (setf *start-col* *current-col*)
- (unless *at-eof?*
- (char-case *char*
- (whitechar
- (advance-char)
- (lex-one-token))
- (#\- (char-case *peek-char*
- (#\- (lex-comment))
- (#\} (signal-missing-begin-comment)
- (advance-char)
- (advance-char)
- (lex-one-token))
- (else
- (lex-varsym))))
- (#\{ (cond ((char=? *peek-char* '#\-)
- (advance-char)
- (advance-char)
- (cond ((char=? *char* '#\#)
- (advance-char)
- (emit-token 'begin-annotation))
- (else
- (lex-ncomment)
- (lex-one-token))))
- (else
- (advance-char)
- (emit-token '\{ ))))
- (small (lex-varid))
- (large (lex-conid))
- (#\( (advance-char)
- (emit-token '\())
- (#\: (lex-consym))
- (#\` (advance-char)
- (emit-token '\`))
- ((symbol presymbol) (lex-varsym))
- (digit (lex-numeric))
- (#\' (lex-char))
- (#\" (lex-string))
- (#\) (advance-char)
- (emit-token '\)))
- (#\, (advance-char)
- (emit-token '\,))
- (#\; (advance-char)
- (emit-token '\;))
- (#\[ (advance-char)
- (emit-token '\[))
- (#\] (advance-char)
- (emit-token '\]))
- (#\_ (advance-char)
- (emit-token '\_))
- (#\} (advance-char)
- (emit-token '\}))
- (else
- (signal-invalid-character *char*)
- (advance-char)
- (lex-one-token)))))
-
- (define (signal-missing-begin-comment)
- (lexer-error 'missing-begin-comment
- "`-}' appears outside of a nested comment."))
-
- (define (signal-invalid-character ch)
- (lexer-error 'invalid-character
- "Invalid character `~a' appears in source program." ch))
-
- (define (advance-past-white)
- (unless *at-eof?*
- (char-case *char*
- (whitechar
- (advance-char)
- (advance-past-white))
- (else
- '()))))
-
- (define (process-literate-comments at-start?)
- (unless at-start? (advance-char-1))
- (let ((l (classify-line)))
- (cond ((or *at-eof?* (eq? l 'program))
- '())
- ((eq? l 'blank)
- (skip-literate-comment '#t))
- (else
- (when (not at-start?)
- (lexer-error 'blank-line-needed
- "Literate comments must be preceeded by a blank line"))
- (skip-literate-comment '#f)))))
-
- (define (skip-literate-comment prev-blank)
- (skip-past-line)
- (let ((l (classify-line)))
- (cond (*at-eof?*
- '())
- ((eq? l 'comment)
- (skip-literate-comment '#f))
- ((eq? l 'blank)
- (skip-literate-comment '#t))
- (else
- (when (not prev-blank)
- (lexer-error 'blank-line-needed
- "Literate comments must be followed by a blank line"))))))
-
- (define (classify-line)
- (if *at-eof?*
- 'blank
- (char-case *char*
- (#\>
- (advance-char-1)
- 'program)
- (#\newline 'blank)
- (whitechar
- (classify-line-1))
- (else 'comment))))
-
- (define (classify-line-1)
- (advance-char-1)
- (char-case *char*
- (#\newline 'blank)
- (whitechar (classify-line-1))
- (else 'comment)))
-
- (define (skip-past-line)
- (when (not *at-eof?*)
- (char-case *char*
- (#\newline
- (advance-char-1))
- (else
- (advance-char-1)
- (skip-past-line)))))
-
- (define (lex-comment) ;; a -- style comment
- (advance-char)
- (cond (*at-eof?* (lexer-eof-in-comment *current-line*))
- ((char=? *char* #\newline)
- (lex-one-token))
- (else
- (lex-comment))))
-
- (define (lexer-eof-in-comment start-line)
- (signal-eof-in-comment start-line)
- (lex-one-token)) ; will return the eof token
-
- (define (signal-eof-in-comment start-line)
- (lexer-error 'eof-in-comment
- "End of file in comment starting at line ~A." start-line))
-
- ;;; Here *char* and *peek-char* are the first two chars on a line.
-
- (define (scan-symbol)
- (scan-list-of (symbol #\:)))
-
- (define (scan-var-con)
- (scan-list-of (large small digit #\' #\_)))
-
- (define (lex-ncomment)
- (lex-ncomment-1 *current-line*))
-
- (define (lex-ncomment-1 start-line)
- (if *at-eof?*
- (lexer-eof-in-comment start-line)
- (char-case *char*
- (#\- (cond ((char=? *peek-char* #\})
- (advance-char)
- (advance-char))
- (else
- (advance-char)
- (lex-ncomment-1 start-line))))
- (#\{ (cond ((char=? *peek-char* #\-)
- (advance-char)
- (advance-char)
- (lex-ncomment)
- (lex-ncomment-1 start-line))
- (else
- (advance-char)
- (lex-ncomment-1 start-line))))
- (else
- (advance-char)
- (lex-ncomment-1 start-line)))))
-
- (define (lex-varid)
- (let ((sym (scan-var-con)))
- (parse-reserved sym varid
- "case" "class"
- "data" "default" "deriving"
- "else"
- "hiding"
- "if" "import" "in" "infix" "infixl" "infixr" "instance" "interface"
- "let"
- "module"
- "of"
- "renaming"
- "then" "to" "type"
- "where")))
-
- (define (lex-conid)
- (let ((sym (scan-var-con)))
- (emit-token/string 'conid sym)))
-
- (define (lex-consym)
- (let ((sym (scan-symbol)))
- (cond ((string=/list? (cdr sym) ":")
- (emit-token '\:\:))
- (else
- (emit-token/string 'consym sym)))))
-
- (define (lex-varsym)
- (let ((sym (scan-symbol)))
- (cond ((and (string=/list? sym "<") (char=? *char* #\-))
- (advance-char)
- (emit-token '\<\-))
- ((and (string=/list? sym "#")
- (char=? *char* #\-)
- (char=? *peek-char* #\}))
- (advance-char)
- (advance-char)
- (emit-token 'end-annotation))
- (else
- (parse-reserved sym varsym
- ".."
- "=>" "="
- "->"
- "@"
- "\\"
- "|"
- "~")))))
-
- (define (lex-integer radix)
- (lex-integer-1 radix 0))
-
- (define (lex-integer-1 radix psum)
- (declare (type fixnum radix)
- (type integer psum))
- (let ((d (char->digit *char* radix)))
- (if d
- (begin
- (advance-char)
- (lex-integer-1 radix (+ (* psum radix) (the fixnum d))))
- psum)))
-
- (define (lex-fraction int-part denominator)
- (declare (type integer int-part denominator))
- (let ((d (char->digit *char* 10)))
- (if d
- (begin
- (advance-char)
- (lex-fraction
- (+ (* int-part 10) (the fixnum d)) (* denominator 10)))
- (values int-part denominator))))
-
- (define (lex-numeric)
- (let ((int-part (lex-integer 10)))
- (if (and (char=? *char* #\.)
- (char->digit *peek-char* 10))
- (lex-float int-part)
- (emit-token 'integer int-part))))
-
- (define (lex-float int-part)
- (advance-char)
- (multiple-value-bind (numerator denominator) (lex-fraction int-part 1)
- (let ((no-exponent
- (lambda () (emit-token 'float numerator denominator 0))))
- (char-case *char*
- (exponent
- (char-case *peek-char*
- (digit
- (advance-char)
- (lex-float/exp numerator denominator 1))
- ((#\+ #\-)
- (cond ((char->digit (peek-char-2) 10)
- (let ((sign (if (char=? *peek-char* '#\+) 1 -1)))
- (advance-char)
- (advance-char)
- (lex-float/exp numerator denominator sign)))
- (else
- (funcall no-exponent))))
- (else
- (funcall no-exponent))))
- (else
- (emit-token 'float numerator denominator 0))))))
-
- (define (lex-float/exp numerator denominator sign)
- (let ((exponent (lex-integer 10)))
- (emit-token 'float numerator denominator (* sign exponent))))
-
- (define (lex-char)
- (advance-char)
- (let ((c
- (char-case *char*
- (#\' (signal-null-character)
- '#\?)
- (#\\ (lex-escaped-char '#f))
- ((#\space graphic)
- (let ((ch *char*))
- (advance-char)
- ch))
- (else
- (signal-bad-character-constant *char*)
- (advance-char)
- `#\?))))
- (cond ((char=? *char* '#\')
- (advance-char)
- (emit-token 'char c))
- (else
- (signal-missing-char-quote)
- (skip-to-quote-or-eol)))))
-
- (define (signal-null-character)
- (lexer-error 'null-character
- "Null character '' is illegal - use '\\'' for a quote."))
-
- (define (signal-bad-character-constant ch)
- (lexer-error 'bad-character-constant
- "The character `~a' may not appear in a character literal." ch))
-
- (define (signal-missing-char-quote)
- (lexer-error 'missing-char-quote
- "Character constant has more than one character."))
-
-
- (define (skip-to-quote-or-eol)
- (if *at-eof?*
- (lex-one-token)
- (char-case *char*
- (#\' (advance-char)
- (lex-one-token))
- (#\newline (advance-char)
- (lex-one-token))
- (else
- (advance-char)
- (skip-to-quote-or-eol)))))
-
- (define (lex-string)
- (advance-char)
- (emit-token 'string (list->string (gather-string-chars))))
-
- (define (gather-string-chars)
- (char-case *char*
- (#\\
- (let ((ch (lex-escaped-char '#t)))
- (if (eq? ch 'null)
- (gather-string-chars)
- (cons ch (gather-string-chars)))))
- (#\"
- (advance-char)
- '())
- ((graphic #\space)
- (let ((ch *char*))
- (advance-char)
- (cons ch (gather-string-chars))))
- (#\newline
- (signal-missing-string-quote)
- '())
- (else
- (signal-bad-string-constant *char*)
- (advance-char)
- (gather-string-chars))))
-
- (define (signal-missing-string-quote)
- (lexer-error 'missing-string-quote
- "String continued over end of line."))
-
- (define (signal-bad-string-constant ch)
- (lexer-error 'bad-string-constant
- "The character `~a' may not appear in a string literal." ch))
-
-
- (define (convert-stupid-control-character-names)
- (let ((c1 *char*)
- (c2 *peek-char*))
- (advance-char)
- (advance-char)
- (let ((s2 (string c1 c2))
- (s3 (string c1 c2 *char*)))
- (let ((srch3 (assoc s3 *control-char-names*)))
- (cond (srch3
- (advance-char)
- (integer->char (cdr srch3)))
- (else
- (let ((srch2 (assoc s2 *control-char-names*)))
- (cond (srch2
- (integer->char (cdr srch2)))
- (else
- (signal-bad-control-char s3)
- `#\?)))))))))
-
- (define (signal-bad-control-char name)
- (lexer-error 'invalid-control-char
- "`~a' is not a recognized control character name." name))
-
-
- (define (lex-escaped-char in-string?)
- (advance-char)
- (char-case *char*
- ((#\a #\b #\f #\n #\r #\t #\v)
- (let* ((ccode (cdr (assoc *char* *short-control-char-names*)))
- (ccode1 (cdr (assoc ccode *control-char-names*))))
- (advance-char)
- (integer->char ccode1)))
- ((#\\ #\' #\")
- (let ((ch *char*))
- (advance-char)
- ch))
- (#\&
- (advance-char)
- (cond (in-string? 'null)
- (else
- (signal-bad-&-escape)
- '#\?)))
- (#\^
- ;; *** This code is problematic because it assumes
- ;; *** (1) that you can do the arithmetic on the character codes
- ;; *** (2) that the resulting integer can actually be coerced to
- ;; *** the right character object in the host Lisp.
- (advance-char)
- (char-case *char*
- ((large #\@ #\[ #\\ #\] #\^ #\_)
- (let ((code (+ (- (char->integer *char*)
- (char->integer '#\A))
- *control-A*)))
- (advance-char)
- (integer->char code)))
- (else
- (signal-bad-^-escape *char*)
- '#\?)))
- (large
- (convert-stupid-control-character-names))
- (digit
- (convert-num-to-char (lex-integer 10)))
- (#\o
- (advance-char)
- (cond ((char->digit *char* 8)
- (convert-num-to-char (lex-integer 8)))
- (else
- (signal-missing-octal-digits)
- '#\?)))
- (#\x
- (advance-char)
- (cond ((char->digit *char* 16)
- (convert-num-to-char (lex-integer 16)))
- (else
- (signal-missing-hex-digits)
- `#\?)))
- (whitechar
- (cond (in-string?
- (lex-gap))
- (else
- (signal-bad-gap)
- `#\?)))
- (else
- (signal-bad-escape *char*)
- `#\?)))
-
- (define (signal-bad-&-escape)
- (lexer-error 'bad-&-escape
- "The escape `\\&' is not allowed inside a character literal."))
-
- (define (signal-bad-^-escape ch)
- (lexer-error 'bad-^-escape
- "The escape `\\^~a' is not recognized." ch))
-
- (define (signal-missing-octal-digits)
- (lexer-error 'missing-octal-digits
- "No digits provided for `\\o' escape."))
-
- (define (signal-missing-hex-digits)
- (lexer-error 'missing-hex-digits
- "No digits provided for `\\x' escape."))
-
- (define (signal-bad-gap)
- (lexer-error 'invalid-gap
- "Gaps are not allowed inside character literals."))
-
- (define (signal-bad-escape ch)
- (lexer-error 'bad-escape
- "The escape `\\~a' is not recognized." ch))
-
-
-
- ;;; *** This code is problematic because it assumes that integers
- ;;; *** between 0 and 255 map on to characters with the corresponding
- ;;; *** ASCII encoding in the host Lisp, and that the host Lisp actually
- ;;; *** supports 255 characters.
-
- (define (convert-num-to-char num)
- (cond ((and (>= num 0) (>= *max-char* num))
- (integer->char num))
- (else
- (signal-char-out-of-range num)
- '#\?)))
-
- (define (signal-char-out-of-range num)
- (lexer-error 'char-out-of-range
- "There is no character corresponding to code ~s." num))
-
-
- (define (lex-gap)
- (cond (*at-eof?*
- (signal-eof-in-gap)
- 'null)
- (else
- (char-case *char*
- (whitechar
- (advance-char)
- (lex-gap))
- (#\\
- (advance-char)
- 'null)
- (else
- (signal-missing-gap)
- 'null)))))
-
-
- (define (signal-eof-in-gap)
- (lexer-error 'eof-in-gap
- "End of file encountered inside gap."))
-
- (define (signal-missing-gap)
- (lexer-error 'missing-gap
- "Missing gap delimiter, or junk inside gap."))
-