home *** CD-ROM | disk | FTP | other *** search
- ############################################################################
- #
- # File: sentence.icn
- #
- # Subject: Procedure to generate sentences in file
- #
- # Author: Richard L. Goerwitz
- #
- # Date: June 3, 1991
- #
- ###########################################################################
- #
- # Version: 1.2
- #
- ###########################################################################
- #
- # sentence(f) - suspends sentences from file f
- #
- # A lot of grammatical and stylistic analysis programs are predicated
- # on the notion of a sentence. For instance, some programs count the
- # number of words in each sentence. Others count the number and length
- # of clauses. Still others pedantically check for sentence-final par-
- # ticles and prepositions.
- #
- # This procedure, sentence(), is supposed to be used as a filter for
- # ASCII text files, suspending everything that looks remotely like a
- # sentence in them.
- #
- # BUGS: Cannot correctly parse sentences with constructs like "R. L.
- # Goerwitz" in them. The algorithm can be much improved simply by
- # checking to see if the word after the period is in /usr/dict/words
- # or whatever your system dictionary file is. If it isn't, then it's
- # likely not to be the beginning of a sentence (this also is not in-
- # fallible, naturally).
- #
- ############################################################################
- #
- # Requires: co-expressions
- #
- ############################################################################
-
-
#
# sentence(intext) - generate everything in intext that looks like a sentence
#
# intext is handed unchanged to read_line(), which supplies the text one
# normalized line at a time and produces an empty string wherever the
# layout (indentation) suggests a paragraph-like break.
#
# A sentence ends at a run of closing punctuation that contains at least
# one of . ! ? and is followed by a blank and then either a second blank
# or (after optional opening quotes/parentheses) an upper-case letter or
# a digit.  A paragraph break, or the end of the input, also ends the
# current sentence.
#
# Every result is non-empty and carries no trailing blanks.  The
# procedure fails when the input is exhausted; the last piece of text, if
# any, is delivered with return.
#
procedure sentence(intext)

    local pending, get_line, line, more, verdict, head, tail, rest, whole
    static inits, punct
    initial {
        inits := &ucase ++ &digits      # characters that may open a sentence
        punct := cset(".\"'!?)]")       # characters that may close one
    }

    pending := ""                       # text of the sentence collected so far
    get_line := create read_line(intext)

    while line := @get_line do {

        # An empty string from read_line() signals a change in the
        # indentation level.  Whether it is a quote, a section header
        # or a new paragraph, treat it as a sentence break.  Nothing is
        # produced if no text has been collected yet (e.g. when the very
        # first line of the input is indented).
        if line == "" then {
            suspend ("" ~== trim(pending))
            pending := ""
            next
        }

        # Keep splitting sentences off the front of line until no
        # further sentence ending can be found in it.
        repeat {

            # The scan only classifies the line; acting on the result
            # (suspending, reading ahead) happens after the scanning
            # expression has been left in the normal way.
            verdict := &null
            line ? {

                # Try each run of closing punctuation in turn.  The
                # first run that contains . ! or ? decides:
                #   - if it is followed by nothing but the final
                #     character of line, there is not enough context
                #     yet ("more");
                #   - if it is followed by a blank and then another
                #     blank or a word beginning with a capital letter
                #     or digit, it is a sentence ending ("break").
                #
                # IF YOU WANT TO ADD A DICTIONARY CHECK, read in a
                # dictionary like /usr/dict/words and change
                # any(inits) below to something like (any(inits),
                # longstr(list_of_usrdictwords,map(&subject),&pos),
                # =" ") where longstr() matches each string in
                # list_of_usrdictwords.
                if (head := tab(upto(punct))) &
                   upto('!?.', tail := tab(many(punct))) &
                   ((pos(-1) & verdict := "more") |
                    (=" " & (=" " | (tab(many('\'"('))|&null,any(inits))) &
                     verdict := "break"))
                then {
                    # Only meaningful for "break": whatever follows the
                    # sentence, minus the blanks separating the two.
                    tab(many(' '))
                    rest := tab(0)
                }
            }

            case verdict of {

                "break": {
                    # Don't bother with little two-letter hunks.
                    whole := pending || head || tail
                    if *whole > 3 | find(" ", whole)
                    then suspend whole
                    line := rest
                    pending := ""
                }

                "more": {
                    # At end of input the punctuation can never be
                    # confirmed as a sentence ending; the text is
                    # returned as the final result below.
                    if not (more := @get_line) then break

                    # A paragraph break right after end-of-line
                    # punctuation ends the sentence just as it would
                    # anywhere else.
                    if more == "" then {
                        suspend ("" ~== trim(pending || line))
                        pending := line := ""
                        break
                    }

                    # Otherwise scan again with the extra context.
                    line ||:= more
                }

                # No sentence ending in what is left of line.
                default: break
            }
        }

        # Tack the unfinished remainder onto the sentence & try again.
        pending ||:= line
    }

    # Whatever is left over is the last sentence; fail if there is none.
    return ("" ~== trim(pending))

end
-
-
-
-
#
# read_line(intext) - generate the lines of intext, normalized for sentence()
#
# intext is anything the ! operator can generate lines from: normally an
# open file, but a list of strings works as well.
#
# For every non-blank line the procedure produces the line with leading
# and trailing blanks/tabs removed and a single blank appended, unless
# the line ends in a dash, in which case it is produced as is so that
# the next line can be joined directly to it.
#
# Immediately before such a line an empty string is produced as a break
# signal when the indentation level (measured after tab expansion)
#   - is greater than that of the previous non-blank line, or
#   - is smaller and at least one blank line lies in between.
# Blank lines themselves are never produced.
#
# All state is local, so several read_line() generators (for instance
# one per co-expression) can be active at once without disturbing each
# other.
#
procedure read_line(intext)

    local line, new_line, ilevel, last_ilevel, blank_flag

    last_ilevel := 0            # indentation of the previous non-blank line
    blank_flag := &null         # non-null while blank lines have just been seen

    every line := trim(!intext, '\t ') do {

        # Check to see if line is blank; if so, just remember the fact.
        if line == "" then {
            blank_flag := 1
            next
        }

        # Determine the current indentation level in columns.
        detab(line) ? {
            ilevel := *tab(many(' ')) | 0
        }

        # Signal the calling procedure if there is a change in the
        # indentation level by suspending an empty string.
        if (ilevel > last_ilevel) | (ilevel < last_ilevel, \blank_flag)
        then suspend ""
        last_ilevel := ilevel
        blank_flag := &null

        line ? {
            # Drop the indentation itself.
            tab(many('\t '))

            # Put a space on the end of line, unless it ends in a dash.
            # line is non-empty and ends in a non-blank character here,
            # so there is always a last character to inspect.
            new_line := tab(-1) || (="-" | (move(1) || " "))
        }

        # Suspend the newly reformatted, trimmed, space-terminated line.
        suspend new_line
    }

end
-