home *** CD-ROM | disk | FTP | other *** search
- Newsgroups: alt.sources
- From: goer@ellis.uchicago.edu (Richard L. Goerwitz)
- Subject: kjv browser, part 9 of 11
- Message-ID: <1991Jul3.065253.28409@midway.uchicago.edu>
- Date: Wed, 3 Jul 1991 06:52:53 GMT
-
- ---- Cut Here and feed the following to sh ----
- #!/bin/sh
- # this is bibleref.09 (part 9 of a multipart archive)
- # do not concatenate these parts, unpack them in order with /bin/sh
- # file makeind.icn continued
- #
- if test ! -r _shar_seq_.tmp; then
- echo 'Please unpack part 1 first!'
- exit 1
- fi
- (read Scheck
- if test "$Scheck" != 9; then
- echo Please unpack part "$Scheck" next!
- exit 1
- else
- exit 0
- fi
- ) < _shar_seq_.tmp || exit 1
- if test ! -f _shar_wnt_.tmp; then
- echo 'x - still skipping makeind.icn'
- else
- echo 'x - continuing file makeind.icn'
- sed 's/^X//' << 'SHAR_EOF' >> 'makeind.icn' &&
- X # (keys are option letters).
- X #
- X usage:= "usage: makeind -f filename -m int -n int [-l int] [-s]"
- X opt_table := initialize_IS(a)
- X fname := \opt_table["f"] | stop(usage)
- X rollover_field := opt_table["l"] # (optional)
- X
- X #
- X # Begin the process of tokenizing, recording token locations, and
- X # of storing this information in two separate files.
- X #
- X # Read input file, making a table of words and their locations.
- X index_table := create_index(fname)
- X
- X #
- X # Write keys to one file, with pointers into another file
- X # containing the bitmaps for each key.
- X #
- X index_fname := dir_name(fname)||create_fname(fname, "IND")
- X bitmap_fname := dir_name(fname)||create_fname(fname, "BMP")
- X write_tokens_and_offsets(index_fname, bitmap_fname, index_table)
- X
- X #
- X # Re-open fname and store the locations for each chunk of text
- X # marked by a ::location marker. This could certainly be
- X # incorporated into the indexing routines, but only at the great
- X # expense of clarity.
- X #
- X upto_field := 1 < (IS.no * 2) / 3 | 1
- X bofname := dir_name(fname)||create_fname(fname, "OFS")
- X bitmap_offset_table :=
- X store_bitmaps_and_offsets(fname, upto_field)
- X # store in .OFS file
- X write_bitmaps_and_offsets(bofname, bitmap_offset_table, upto_field)
- X
- X #
- X # Re-open fname again, and store the pre-rollover bitmaps in the
- X # .LIM file. Obviously this procedure could be stuffed into
- X # another one above (e.g. store_bitmaps_and_offsets()).
- X #
- X if \rollover_field then {
- X #
- X # Let's say we are using the Bible as our text, and we want to
- X # create all the bitmaps for Genesis 1:9-2:10. We need to know
- X # what verse chapter 1 goes up to. By supplying makeind
- X # with a "-l 3" argument, you are telling it to store this in-
- X # formation for later use by expandrf().
- X #
- X limits_fname := dir_name(fname)||create_fname(fname, "LIM")
- X write_limits(limits_fname, fname, rollover_field)
- X IS.r_field := rollover_field
- X }
- X
- X #
- X # Write IS record to the .IS file.
- X #
- X out_IS := open(dir_name(fname)||create_fname(fname, "IS"), "w") |
- X abort("makeind","can't open .IS file",2)
- X writes(out_IS, encode(IS))
- X close(out_IS)
- X
- X # All is well. Exit with zero status.
- X exit(0)
- X
- Xend
- X
- X
- X#
- X#  initialize_IS
- X#
- X#  Creates the global IS record and fills in the parameters taken
- X#  from the command line: the number of fields in a location marker
- X#  (-n), the field separator used for tokenizing (-S, with a built-in
- X#  default), the case-sensitivity flag (-s), and the string and bit
- X#  lengths of a single field, both derived from the largest field
- X#  value (-m).  Stops with a usage message if the option count is out
- X#  of range or if -n or -m is missing.  Returns the option table so
- X#  that the caller can pick up -f and -l.
- X#
- Xprocedure initialize_IS(a)
- X
- X    local usage, opt_table
- X    # global IS
- X
- X    usage:="usage: makeind -f filename -m int -n int [-l int] [-s]"
- X
- X    IS := is()                      # set up some IS fields
- X    opt_table := options(a, "f:m:n+sS:l+")
- X    3 <= *opt_table <= 6 | stop(usage)
- X    IS.no := \opt_table["n"] | stop(usage)
- X    # -m is mandatory as well; without this check the two length
- X    # assignments below could fail silently and leave IS half-filled.
- X    integer(\opt_table["m"]) | stop(usage)
- X    IS.FS := \opt_table["S"] | "['.]?[^-0-9A-Za-z']+'?"
- X    IS.is_case_sensitive := opt_table["s"]     # normally is &null
- X
- X    #
- X    # Calculate string representation length for fields, as well as
- X    # the number of bits required for their integer representation.
- X    # I.e. if the opt_table["m"] value is 99, this will take two chars to
- X    # represent as a string ("99"), but 7 binary "digits" to represent
- X    # internally as a base-two integer.
- X    #
- X    IS.s_len := *string(opt_table["m"])
- X    IS.len   := *exbase10(opt_table["m"], 2)
- X
- X    return opt_table
- X
- Xend
- X
- X
- X#
- X#  create_index
- X#
- X#  Reads fname line by line and returns a table keyed by token; each
- X#  value is the set of location bitmaps under which that token was
- X#  seen.  A line starting with "::" is a location marker and sets the
- X#  current bitmap (via digits_2_bitmap); every other line is split
- X#  into tokens by gettokens().  Aborts if the file cannot be opened,
- X#  if text turns up before the first location marker, or if the file
- X#  is empty.
- X#
- Xprocedure create_index(fname)
- X
- X    local infile, index, text, location, word
- X
- X    infile := open(fname) |
- X        abort("create_index","can't open index file, "||fname, 9)
- X    index := table()
- X
- X    while text := read(infile) do
- X        text ? {
- X            if ="::" then
- X                # rest of the marker line -> bitmap (see indexutl.icn)
- X                location := digits_2_bitmap(tab(0))
- X            else every word := gettokens(IS.is_case_sensitive) do {
- X                # gettokens() lives in gettokens.icn
- X                /index[word] := set()
- X                insert(index[word], \location) |
- X                    abort("create_index","text before location-marker",8)
- X            }
- X        }
- X
- X    # text is still null only if not a single line was read
- X    \text | abort("create_index", "empty input file, "||fname, 8)
- X    close(infile)
- X    return index
- X
- Xend
- X
- X
- X#
- X#  write_tokens_and_offsets
- X#
- X#  Writes every token in table t to the file index_fname, one per
- X#  line, followed by a tab and the position (as reported by where())
- X#  in bitmap_fname at which that token's block of bitmaps begins:
- X#
- X#      token tab offset
- X#
- X#  A seek to "offset" in the bitmap file will put you at the start of
- X#  a block: a 16-bit count followed by that many bitmaps.  Tokens are
- X#  written in sorted order.  Returns the number of tokens written.
- X#
- Xprocedure write_tokens_and_offsets(index_fname, bitmap_fname, t)
- X
- X    local outtokens, outbitmaps, index_lst, i, bitmap_length
- X
- X    outtokens := open(index_fname, "w") |
- X        abort("write_tokens_and_offsets","can't open "||index_fname,6)
- X    outbitmaps := open(bitmap_fname, "w") |
- X        abort("write_tokens_and_offsets","can't open "||bitmap_fname,5)
- X    # Calculate the length of bitmaps (must be the smallest multiple of
- X    # 8 >= (IS.len * IS.no)).
- X    bitmap_length := ((IS.len * IS.no) <= seq(0,8))
- X    # sort(t, 3) gives a flat list: token, set, token, set, ...
- X    index_lst := sort(t, 3)
- X
- X    every i := 1 to *index_lst-1 by 2 do {
- X
- X        # Write token to index file with the offset of that token's
- X        # bitmaps in the bitmap file.
- X        write(outtokens, index_lst[i], "\t", where(outbitmaps))
- X
- X        # Now write the bitmaps for the above token to the bitmap file.
- X        # First write out the number of bitmaps in this block.  Two bytes
- X        # are allotted to hold this count (16 bits).
- X        if *index_lst[i+1] > 65535 then {      # just in case
- X            abort("write_tokens_and_offsets",
- X                  "too many bitmaps for "||index_lst[i], 16)
- X        }
- X        write_int(outbitmaps, *index_lst[i+1], 16)
- X        # Having written the bitmap count, now write the bitmaps proper
- X        # to the bitmap file.
- X        every write_int(outbitmaps, !index_lst[i+1], bitmap_length)
- X    }
- X
- X    # Close files.  Return number of keys processed (any better ideas??)
- X    every close(outtokens | outbitmaps)
- X    return *index_lst / 2       # return number of keys in index file
- X
- Xend
- X
- X
- X
- X#
- X#  store_bitmaps_and_offsets
- X#
- X#  Runs through the file called fname, finding all the location
- X#  markers ("::" lines), and returns a table that maps each truncated
- X#  bitmap to the file position (as given by where()) at which its
- X#  marker line begins.  Only the first upto_field fields of a bitmap
- X#  are kept, and only the first marker of each run of equal truncated
- X#  bitmaps is recorded.  Nothing is written here; the caller hands
- X#  the table to write_bitmaps_and_offsets().
- X#
- Xprocedure store_bitmaps_and_offsets(fname, upto_field)
- X
- X    local intext, line, current_location, last_major_division,
- X        major_division, bitmap_offset_table
- X
- X    intext := open(fname) |
- X        abort("store_bitmaps_and_offsets","can't open "||fname,5)
- X    bitmap_offset_table := table()
- X
- X    # Note the position before each read, so that it refers to the
- X    # start of the line about to be examined.
- X    while (current_location := where(intext), line := read(intext)) do {
- X        line ? {
- X            if ="::" then {
- X                # Shift the low-order fields out, keeping upto_field fields.
- X                major_division :=
- X                    ishift(digits_2_bitmap(tab(0)),    # in indexutl.icn
- X                           -((IS.no - upto_field) * IS.len))
- X                if \last_major_division = major_division then
- X                    next
- X                else {
- X                    insert(
- X                        bitmap_offset_table, major_division, current_location)
- X                    last_major_division := major_division
- X                }
- X            }
- X        }
- X    }
- X
- X    close(intext)
- X    return bitmap_offset_table
- X
- Xend
- X
- X
- X#
- X#  write_bitmaps_and_offsets
- X#
- X#  Does the actual writing of bitmaps and offsets to the file bofname.
- X#  Receives a table t of bitmaps cut down to upto_field fields, each
- X#  mapped to an offset.  Shrinking the bitmaps lessens the size of the
- X#  resulting file, but requires a bit more I/O when it comes time to
- X#  look something up.  Each entry is written as a one-byte block
- X#  size, the bitmap, and then the offset, in sorted bitmap order.
- X#
- Xprocedure write_bitmaps_and_offsets(bofname, t, upto_field)
- X
- X    local outtext, tmp_list, i, offset_length,
- X        block_size, stored_bitmap_length
- X
- X    outtext := open(bofname, "w") |
- X        abort("write_bitmaps_and_offsets","can't open "||bofname,5)
- X    # Smallest multiple of 8 >= the bits taken up by upto_field fields.
- X    stored_bitmap_length := ((IS.len * upto_field) <= seq(0,8))
- X    # sort(t, 3) gives a flat list: bitmap, offset, bitmap, offset, ...
- X    tmp_list := sort(t, 3)
- X
- X    every i := 1 to *tmp_list-1 by 2 do {
- X
- X        # Number of bits needed to hold offset.
- X        offset_length := (*exbase10(tmp_list[i+1], 2) <= seq(0,8))
- X        # Number of bytes needed to hold bitmap and offset (both).
- X        block_size := (stored_bitmap_length + offset_length) / 8
- X
- X        # We could just code the length of the offset, since the bitmap's
- X        # length is fixed (and known).  Seems better to code the block's
- X        # total length just in case something gets screwed up.  An 8-bit
- X        # limit means the bitmap+offset length cannot exceed 2^8-1 (255)
- X        # characters.
- X        if block_size > 255 then
- X            abort("write_bitmaps_and_offsets","bitmap+offset too big",15)
- X        write_int(outtext, block_size, 8)
- X        write_int(outtext, tmp_list[i], stored_bitmap_length)
- X        write_int(outtext, tmp_list[i+1], offset_length)
- X
- X    }
- X
- X    # Close the output file, as the other writer procedures do.
- X    close(outtext)
- X    return
- X
- Xend
- X
- X#
- X#  write_limits
- X#
- X#  Writes out the bitmaps that will be needed in order for expandrf()
- X#  to be able to know when the rollover field rolls over.  Scans
- X#  in_fname for "::" location markers and, each time the fields ahead
- X#  of field r_field change, writes the full bitmap of the marker seen
- X#  just before the change to out_fname.  The bitmap of the final
- X#  marker is written as well.  Returns the number of markers read.
- X#
- Xprocedure write_limits(out_fname, in_fname, r_field)
- X
- X    local in, out, shift_bits_out, bitmap_length, bitmaps_read,
- X        line, bitmap, short_bitmap, old_bitmap
- X
- X    # Check the argument before opening anything, so that a bad -l
- X    # value cannot truncate an existing output file.
- X    r_field <= IS.no |
- X        abort("write_limits","-l value should not exceed that of -n",50)
- X    in := open(in_fname) |
- X        abort("write_limits","can't open "||in_fname,5)
- X    out := open(out_fname, "w") |
- X        abort("write_limits","can't open "||out_fname,5)
- X    # Negative shift: drops field r_field and everything after it.
- X    shift_bits_out := -(((IS.no-r_field)+ 1) * IS.len)
- X    bitmap_length := ((IS.len * IS.no) <= seq(0,8))
- X    bitmaps_read := 0
- X
- X    while line := read(in) do {
- X        line ? {
- X            if ="::" then {
- X                bitmaps_read +:= 1
- X                bitmap := digits_2_bitmap(tab(0))   # in indexutl.icn
- X                short_bitmap := ishift(bitmap, shift_bits_out)
- X                # The leading fields changed, so the previous marker was
- X                # the last one before the rollover: record it.
- X                if ishift(\old_bitmap, shift_bits_out) ~== short_bitmap
- X                then write_int(out, old_bitmap, bitmap_length)
- X                old_bitmap := bitmap
- X            }
- X        }
- X    }
- X
- X    # The very last marker is a limit too (skipped if none were seen).
- X    write_int(out, \old_bitmap, bitmap_length)
- X    every close(in | out)
- X    return bitmaps_read
- X
- Xend
- SHAR_EOF
- echo 'File makeind.icn is complete' &&
- true || echo 'restore of makeind.icn failed'
- rm -f _shar_wnt_.tmp
- fi
- # ============= gettokens.icn ==============
- if test -f 'gettokens.icn' -a X"$1" != X"-c"; then
- echo 'x - skipping gettokens.icn (File already exists)'
- rm -f _shar_wnt_.tmp
- else
- > _shar_wnt_.tmp
- echo 'x - extracting gettokens.icn (Text)'
- sed 's/^X//' << 'SHAR_EOF' > 'gettokens.icn' &&
- X############################################################################
- X#
- X# Name: gettokens.icn
- X#
- X# Title: get tokens from text-base file
- X#
- X# Author: Richard L. Goerwitz
- X#
- X# Version: 1.2
- X#
- X############################################################################
- X#
- X# Tokenizing routine used by makeind.icn to create index.
- X#
- X############################################################################
- X#
- X# See also: ./makeind.icn
- X#
- X#############################################################################
- X
- X# declared in ./indexutl.icn (q.v.)
- X# global IS
- X#
- X# One idea for gettokens, good for small indices. Uses field separator
- X# (IS.FS). Also uses (slow) findre. Farther below is a less flexible
- X# version of gettokens which runs faster.
- X#
- X#procedure gettokens(is_case_sensitive)
- X#
- X# # Used within a scanning expression. Returns tokens in
- X# # &subject[&pos:0] (&pos normally = 1). Tokens are stretches of
- X# # text separated by the IS.FS field separator. This
- X# # field separator is a nawk style FS regular expression. If null,
- X# # it gets defined as ~(&digits++&letters).
- X#
- X# local token
- X# static non_alphanums
- X# initial non_alphanums := ~(&digits ++ &letters ++ '-')
- X#
- X# /IS.FS := non_alphanums
- X#
- X# while token := tab(findre(IS.FS)) do {
- X# tab(__endpoint)
- X# tab(many('\'')) # unfortunate by-product of findre's weakness
- X# if \is_case_sensitive
- X# then suspend "" ~== trim(token,'\t ')
- X# else suspend map("" ~== trim(token,'\t '))
- X# }
- X#
- X# # Return the rest of &subject. Even though we're not tabbing
- X# # upto FS, this is normally what the user intends.
- X# if \is_case_sensitive
- X# then return "" ~== trim(tab(0),'\t ')
- X# else return map("" ~== trim(tab(0),'\t '))
- X#
- X#end
- X
- Xprocedure gettokens(is_case_sensitive)
- X
- X    # Generator; meant to be called inside a scanning expression.
- X    # Suspends, one at a time, the tokens found in &subject[&pos:0]
- X    # (&pos normally = 1).  A token starts at a digit, letter or dash
- X    # and runs on through digits, letters, dashes and apostrophes;
- X    # trailing tabs, blanks, apostrophes and dashes are then trimmed
- X    # off, and tokens left empty by the trimming are dropped.  Unless
- X    # is_case_sensitive is nonnull, tokens are passed through map().
- X
- X    local word
- X    static starters, members, trimmable
- X    initial {
- X        starters  := &digits ++ &letters ++ '-'
- X        members   := starters ++ '\''
- X        trimmable := '\t \'-'
- X    }
- X
- X    tab(upto(starters))
- X    while word := tab(many(members)) do {
- X        word := trim(word, trimmable)
- X        if *word > 0 then {
- X            if \is_case_sensitive
- X            then suspend word
- X            else suspend map(word)
- X        }
- X        # skip ahead to where the next token can begin
- X        tab(upto(starters))
- X    }
- X
- Xend
- SHAR_EOF
- true || echo 'restore of gettokens.icn failed'
- rm -f _shar_wnt_.tmp
- fi
- # ============= Makefile.dist ==============
- if test -f 'Makefile.dist' -a X"$1" != X"-c"; then
- echo 'x - skipping Makefile.dist (File already exists)'
- rm -f _shar_wnt_.tmp
- else
- > _shar_wnt_.tmp
- echo 'x - extracting Makefile.dist (Text)'
- sed 's/^X//' << 'SHAR_EOF' > 'Makefile.dist' &&
- X##########################################################################
- X#
- X# Makefile.dist for bibleref.
- X#
- X##########################################################################
- X#
- X# User-modifiable section. Read carefully! You will almost
- X# certainly have to change some settings here.
- X#
- X
- X#
- X# Destination directory for binaries; library directory for auxiliary
- X# files. Owner and group for public executables. Leave the trailing
- X# slash off of directory names.
- X#
- XDESTDIR = /usr/local/bin
- X# DESTDIR = $(HOME)/bin
- XLIBDIR = /usr/local/lib/$(PROGNAME)
- X# LIBDIR = $(HOME)/$(PROGNAME)
- X# LIBDIR = /usr/local/share/lib/$(PROGNAME)
- XOWNER = root #bin
- XGROUP = root #bin
- X
- X#
- X# Name of your icon compiler and compiler flags.
- X#
- XICONC = /usr/icon/v8/bin/icont
- XIFLAGS = -Sc 200 -Si 1000 -Sn 2000 -SF 30
- X
- X#
- X# Names of KJV files as packaged in the PC-SIG disk set (19 discs).
- X# Mine were snarfed from helens.stanford.edu (36.0.2.99) as kjv.tar.Z.
- X# You will need to link these to the current directory. Please don't
- X# copy them all over, or if you do, be sure to delete them afterwards.
- X# They aren't needed after you are done indexing.
- X#
- XRAWFILES = gen.txt exo.txt lev.txt num.txt deu.txt jos.txt jdg.txt \
- X rth.txt sa1.txt sa2.txt ki1.txt ki2.txt ch1.txt ch2.txt \
- X ezr.txt neh.txt est.txt job.txt psa.txt pro.txt ecc.txt \
- X son.txt isa.txt jer.txt lam.txt eze.txt dan.txt hos.txt \
- X joe.txt amo.txt oba.txt jon.txt mic.txt nah.txt hab.txt \
- X zep.txt hag.txt zec.txt mal.txt mat.txt mar.txt luk.txt \
- X joh.txt act.txt rom.txt co1.txt co2.txt gal.txt eph.txt \
- X phi.txt col.txt th1.txt th2.txt ti1.txt ti2.txt tit.txt \
- X phm.txt heb.txt jam.txt pe1.txt pe2.txt jo1.txt jo2.txt \
- X jo3.txt jud.txt rev.txt
- X#
- X# If you have your KJV in a single file, that's fine. Just be sure
- X# the books are in their correct order (as above), and are in the PC-SIG
- X# disk-set format.
- X# RAWFILES = ./kjv.Z
- X
- X#
- X# If you've compressed your KJV file(s), use zcat; otherwise use cat.
- X#
- XCAT = cat
- X# CAT = zcat
- X
- X#
- X# Change these only if you're pretty sure of what you're doing.
- X#
- XSHELL = /bin/sh
- XMAKE = make
- X
- X
- X###########################################################################
- X#
- X# Don't change anything below this line.
- X#
- X
- XRTVFILE = kjv.rtv
- X
- XCONVERTER = kjv2rtv
- XCONVERTSRC = $(CONVERTER).icn convertr.icn name2num.icn complete.icn
- X
- XINDEXER = makeind
- XINDEXSRC = $(INDEXER).icn gettokens.icn indexutl.icn
- X
- XDUMMY_FILE = index.done
- XPROGNAME = bibleref
- X
- XSEARCHSRC = $(PROGNAME).icn ref2bmap.icn name2num.icn convertb.icn \
- X listutil.icn passutil.icn srchutil.icn complete.icn \
- X ipause.icn rewrap.icn binsrch.icn bmp2text.icn initfile.icn \
- X retrieve.icn indexutl.icn retrops.icn whatnext.icn iolib.icn \
- X iscreen.icn findre.icn
- X
- X# Default target: index the text, then build the browser.
- Xall: $(DUMMY_FILE) $(PROGNAME)
- X
- X# Converts the raw text into $(RTVFILE), indexes it, and leaves
- X# $(DUMMY_FILE) behind as a stamp.  The rule has no prerequisites, so
- X# once the stamp exists the conversion and indexing are not repeated.
- X# The freshly built tools are run with an explicit ./ so that the rule
- X# does not depend on the current directory being in PATH.
- X$(DUMMY_FILE):
- X	@echo ""
- X	@echo "This may take a while (about 1 minute/MB on a Sun4)."
- X	@echo ""
- X	@sleep 2
- X	$(ICONC) $(IFLAGS) -o $(CONVERTER) $(CONVERTSRC)
- X	$(CAT) $(RAWFILES) | ./$(CONVERTER) > $(RTVFILE)
- X	@echo ""
- X	@echo "This may take a long time (c. 20 min./MB on a Sun4)."
- X	@echo "Kids, don't even *think* of trying this at home."
- X	@echo ""
- X	@sleep 2
- X	$(ICONC) $(IFLAGS) -o $(INDEXER) $(INDEXSRC)
- X	./$(INDEXER) -f $(RTVFILE) -m 200 -n 3 -l 3
- X	touch $(DUMMY_FILE)
- X
- X# The browser itself, built from all of the search sources.
- X$(PROGNAME): $(SEARCHSRC)
- X	$(ICONC) $(IFLAGS) -o $@ $(SEARCHSRC)
- X
- X# Main source file: the .src template with the default text-file path
- X# replaced by $(LIBDIR)/$(RTVFILE).
- X$(PROGNAME).icn: $(PROGNAME).src
- X	sed "s|/usr/local/lib/bibleref/kjv.rtv|$(LIBDIR)/$(RTVFILE)|" $(PROGNAME).src > $@
- X
- X# Stand-alone builds of the two helper programs.
- X$(CONVERTER): $(CONVERTSRC)
- X	$(ICONC) $(IFLAGS) -o $@ $(CONVERTSRC)
- X
- X$(INDEXER): $(INDEXSRC)
- X	$(ICONC) $(IFLAGS) -o $@ $(INDEXSRC)
- X
- X
- X##########################################################################
- X#
- X# Pseudo-target names (install, clean, clobber)
- X#
- X
- X# Pessimistic assumptions regarding the environment (in particular,
- X# I don't assume you have the BSD "install" shell script).  A missing
- X# directory is created with mode 755; the mode of a directory that
- X# already exists is left alone (the subshell keeps chmod tied to a
- X# successful mkdir).
- Xinstall: all
- X	-test -d $(DESTDIR) || (mkdir $(DESTDIR) && chmod 755 $(DESTDIR))
- X	cp $(PROGNAME) $(DESTDIR)/$(PROGNAME)
- X	chgrp $(GROUP) $(DESTDIR)/$(PROGNAME)
- X	chown $(OWNER) $(DESTDIR)/$(PROGNAME)
- X	-test -d $(LIBDIR) || (mkdir $(LIBDIR) && chmod 755 $(LIBDIR))
- X	mv xxx* $(RTVFILE) $(LIBDIR)/
- X	chgrp $(GROUP) $(LIBDIR)
- X	chown $(OWNER) $(LIBDIR)
- X	chgrp $(GROUP) $(LIBDIR)/xxx* $(LIBDIR)/$(RTVFILE)
- X	chown $(OWNER) $(LIBDIR)/xxx* $(LIBDIR)/$(RTVFILE)
- X	@echo ""
- X	@echo "Done."
- X	@echo ""
- X
- X# For storing the pre-indexed files.  The idea is to unpack the archive
- X# on another machine and make $(PROGNAME) there.
- X# NOTE(review): AUXILSRC is not assigned anywhere in this Makefile, so
- X# it adds no files to the archive as written -- confirm the intended list.
- Xtar: all
- X tar -cf ./$(PROGNAME).tar $(PROGNAME).src $(DUMMY_FILE) $(AUXILSRC) \
- X Makefile.dist README
- X
- X#
- X# Cleanup: "clean" removes the three executables built here.
- X#
- Xclean:
- X	rm -f $(CONVERTER)
- X	rm -f $(INDEXER)
- X	rm -f $(PROGNAME)
- X
- X# Be careful; use this target, and you'll be back to square one: the
- X# raw text files, the index files, the converted text, the stamp file
- X# and the generated main source are all removed as well.
- Xclobber: clean
- X	@echo "Okay, you asked for it."
- X	rm -f $(RAWFILES) xxx*.??? $(RTVFILE) $(DUMMY_FILE) $(PROGNAME).icn
- SHAR_EOF
- true || echo 'restore of Makefile.dist failed'
- rm -f _shar_wnt_.tmp
- fi
- # ============= README ==============
- # Start README unless it already exists (pass -c to overwrite).
- if test -f 'README' && test X"$1" != X"-c"; then
-   echo 'x - skipping README (File already exists)'
-   rm -f _shar_wnt_.tmp
- else
-   : > _shar_wnt_.tmp
-   echo 'x - extracting README (Text)'
-   sed 's/^X//' << 'SHAR_EOF' > 'README' ||
- X--------
- SHAR_EOF
-   echo 'restore of README failed'
- fi
- echo 'End of part 9'
- echo 'File README is continued in part 10'
- echo 10 > _shar_seq_.tmp
- exit 0
- --
-
- -Richard L. Goerwitz goer%sophist@uchicago.bitnet
- goer@sophist.uchicago.edu rutgers!oddjob!gide!sophist!goer
-