home *** CD-ROM | disk | FTP | other *** search
- ############################################################################
- #
- # File: ngrams.icn
- #
- # Subject: Procedure to generate n-grams
- #
- # Author: Ralph E. Griswold
- #
- # Date: June 10, 1988
- #
- ###########################################################################
- #
- # The procedure ngrams(file,n,c,t) generates a tabulation of the n-grams
- # in the specified file. If c is non-null, it is used as the set of
- # characters from which n-grams are taken (other characters break n-grams).
- # The default for c is the upper- and lowercase letters. If t is null,
- # the tabulation is given in order of decreasing frequency; otherwise in
- # alphabetical order of n-grams.
- #
- # Note:
- #
- # The n-grams are kept in a table within the procedure and all n-grams
- # are processed before the tabulation is generated. Consequently, this
- # procedure is unsuitable if there are very many different n-grams.
- #
- ############################################################################
-
#
#  ngrams(f,i,c,t) -- generate a tabulation of the i-grams read from f.
#
#  f   open file (or window) to read; it is read until reads() fails
#  i   n-gram length; must be convertible to a positive integer
#  c   characters from which n-grams are taken; any other character
#      breaks an n-gram.  Defaults to the lower- and uppercase letters.
#  t   if null, results are generated in order of decreasing count;
#      if non-null, in sorted order of the n-grams themselves.
#
#  Each result generated is a string: the n-gram followed by its count
#  right-justified in a field of 8 characters.
#
#  Terminates the program via stop() if i, f, or c is invalid.
#
procedure ngrams(f,i,c,t)
   local text, grams, a, count

   if not (integer(i) > 0) then stop("invalid ngrams specification")

   # The test must be "not (type == (a | b))".  The form
   # "type(f) ~== (a | b)" succeeds for every f, because the alternation
   # is resumed until one of the two comparisons succeeds.
   if not (type(f) == ("file" | "window")) then
      stop("invalid file specification")

   /c := &lcase || &ucase
   if not (c := cset(c)) then stop("invalid cset specification")

   grams := table(0)

   # Read all of the input before scanning.  Scanning the accumulated
   # text inside the read loop would recount the n-grams of every
   # earlier chunk each time a new chunk is appended.
   text := ""
   while text ||:= reads(f,1000)

   # For each maximal run of characters in c, count every substring of
   # length i.  The limitation (\ 1) keeps the failing inner scan from
   # backtracking into tab(), which would restore the scanning position.
   text ? while tab(upto(c)) do
      (tab(many(c)) \ 1) ? while grams[move(i)] +:= 1 do
         move(-i + 1)                    # step back so n-grams overlap

   if /t then {
      # sort(grams,4) yields [key, count, ...] ordered by count; pulling
      # from the end produces the most frequent n-grams first.
      a := sort(grams,4)
      while count := pull(a) do
         suspend pull(a) || right(count,8)
      }
   else {
      # sort(grams,3) yields [key, count, ...] ordered by key.
      a := sort(grams,3)
      suspend |(get(a) || right(get(a),8))
      }
end
-