home *** CD-ROM | disk | FTP | other *** search
- ############################################################################
- #
- # File: huffstuf.icn
- #
- # Subject: Procedures for huffman coding
- #
- # Author: Richard L. Goerwitz
- #
- # Date: July 20, 1992
- #
- ###########################################################################
- #
- # Version: 1.2
- #
- ###########################################################################
- #
- # An odd assortment of tools that lets me compress text using an
- # Iconish version of a generic Huffman algorithm.
- #
- ############################################################################
- #
- # Links: codeobj outbits inbits
- #
- # See also: hufftab.icn, press.icn
- #
- ############################################################################
-
- # From the IPL.
- link codeobj
-
- # Necessary records.
- record nodE(l,r,n)
- record _ND(l,r)
- record leaF(c,n)
- record huffcode(c,i,len)
-
- # For debugging purposes.
- # link ximage
-
- # Count of chars in input file.
- global count_of_all_chars
-
-
- procedure main(a)
-
- local direction, usage, size, char_tbl, heap, tree, h_tbl
- usage := "huffcode -i|o filename1"
-
- direction := pop(a) | stop(usage)
- direction ?:= { ="-"; tab(any('oi')) } | stop(usage)
- *a = 1 | stop(usage)
-
- intext := open(a[1]) | quitprog("huffcode", "can't open "||a[1], 1)
- size := 80
-
- if direction == "o" then {
-
- char_tbl := table()
- while count_chars_in_s(reads(intext), char_tbl)
- heap := initialize_heap(char_tbl)
- tree := heap_2_tree(heap)
- h_tbl := hash_codes(tree)
-
- put_tree(&output, tree)
- seek(intext, 1)
- every writes(&output, encode_string(|reads(intext, size), h_tbl))
-
- }
- else {
- tree := get_tree(intext)
- every writes(&output, decode_rest_of_file(intext, size, tree))
- }
-
- end
-
-
- procedure count_chars_in_s(s, char_tbl)
-
- #
- # Count chars in s, placing stats in char_tbl (keys = chars in
- # s, values = leaF records, with the counts for each chr in s
- # contained in char_tbl[chr].n).
- #
- local chr
- initial {
- /char_tbl &
- quitprog("count_chars_in_s", "need 2 args - 1 string, 2 table", 9)
- *char_tbl ~= 0 &
- quitprog("count_chars_in_s","start me with an empty table",8)
- count_of_all_chars := 0
- }
-
- # Reset character count on no-arg invocation.
- /s & /char_tbl & {
- count_of_all_chars := 0
- return
- }
-
- # Insert counts for characters into char_tbl. Note that we don't
- # just put them into the table as-is. Rather, we put them into
- # a record which contains the character associated with the count.
- # These records are later used by the Huffman encoding algorithm.
- s ? {
- while chr := move(1) do {
- count_of_all_chars +:= 1
- /char_tbl[chr] := leaF(chr,0)
- char_tbl[chr].n +:= 1
- }
- }
- return *char_tbl # for lack of anything better
-
- end
-
-
- procedure initialize_heap(char_tbl)
-
- #
- # Create heap data structure out of the table filled out by
- # successive calls to count_chars_in_s(s,t). The heap is just a
- # list. Naturally, it's size can be obtained via *heap.
- #
- local heap
-
- heap := list()
- every push(heap, !char_tbl) do
- reshuffle_heap(heap, 1)
- return heap
-
- end
-
-
- procedure reshuffle_heap(h, k)
-
- #
- # Based loosely on Sedgewick (2nd. ed., 1988), p. 160. Take k-th
- # node on the heap, and walk down the heap, switching this node
- # along the way with the child whose value is the least AND whose
- # value is less than this node's. Stop when you find no children
- # whose value is less than that of the original node. Elements on
- # heap are records of type leaF, with the values contained in the
- # "n" field.
- #
- local j
-
- # While we haven't spilled off the end of the heap (the size of the
- # heap is *h; *h / 2 is the biggest k we need to look at)...
- while k <= (*h / 2) do {
-
- # ...double k, assign the result to j.
- j := k+k
-
- # If we aren't at the end of the heap...
- if j < *h then {
- # ...check to see which of h[k]'s children is the smallest,
- # and make j point to it.
- if h[j].n > h[j+1].n then
- # h[j] :=: h[j+1]
- j +:= 1
- }
-
- # If the current parent (h[k]) has a value less than those of its
- # children, then break; we're done.
- if h[k].n <= h[j].n then break
-
- # Otherwise, switch the parent for the child, and loop around
- # again, with k (the pointer to the parent) now pointing to the
- # new offset of the element we have been working on.
- h[k] :=: h[j]
- k := j
-
- }
-
- return k
-
- end
-
-
- procedure heap_2_tree(h)
-
- #
- # Construct the Huffman tree out of heap h. Find the smallest
- # element, pop it off the heap, then reshuffle the heap. After
- # reshuffling, replace the top record on the stack with a nodE()
- # record whose n field equal to the sum of the n fields for the
- # element popped off the stack originally, and the one that is
- # now about to be replaced. Link the new nodE record to the 2
- # elements on the heap it is now replacing. Reshuffle the heap
- # again, then repeat. You're done when the size of the heap is
- # 1. That one element remaining (h[1]) is your Huffman tree.
- #
- # Based loosely on Sedgewick (2nd ed., 1988), p. 328-9.
- #
- local frst, scnd, count
-
- until *h = 1 do {
-
- h[1] :=: h[*h] # Reverse first and last elements.
- frst := pull(h) # Pop last elem off & save it.
- reshuffle_heap(h, 1) # Resettle the heap.
- scnd := !h # Save (but don't clobber) top element.
-
- count := frst.n + scnd.n
- frst := { if *frst = 2 then frst.c else _ND(frst.l, frst.r) }
- scnd := { if *scnd = 2 then scnd.c else _ND(scnd.l, scnd.r) }
-
- h[1] := nodE(frst, scnd, count) # Create new nodE().
- reshuffle_heap(h, 1) # Resettle once again.
- }
-
- # H is no longer a stack. It's single element - the root of a
- # Huffman tree made up of nodE()s and leaF()s. Put the l and r
- # fields of that element into an _ND record, and return the new
- # record.
- return _ND(h[1].l, h[1].r)
-
- end
-
-
- procedure hash_codes(tr)
-
- #
- # Hash Huffman codes. Tr (arg 1) is a Huffman tree created by
- # heap_2_tree(heap). Output is a table, with the keys
- # representing characters, and the values being records of type
- # huffcode(i,len), where i is the Huffcode (an integer) and len is
- # the number of bits it occupies.
- #
- local code
-
- huff_tbl := table()
- every code := collect_bits(tr) do
- insert(huff_tbl, code.c, code)
- return huff_tbl
-
- end
-
-
- procedure collect_bits(tr, i, len)
-
- #
- # Decompose Huffman tree tr into huffcode() records which contain
- # 3 fields: c (the character encoded), i (its integer code),
- # and len (the number of bytes the integer code occupies). Sus-
- # pend one such record for each character encoded in tree tr.
- #
-
- if type(tr) == "string" then
- return huffcode(tr, i, len)
- else {
- (/len := 1) | (len +:= 1)
- (/i := 0) | (i *:= 2)
- suspend collect_bits(tr.l, i, len)
- i +:= 1
- suspend collect_bits(tr.r, i, len)
- }
-
- end
-
-
- procedure put_tree(f, tr)
-
- #
- # Writes Huffman tree tr to file f. Uses first two bits to store
- # the size of the tree.
- #
- local stringized_tr
- # global count_of_all_chars
-
- /f | /tr & quitprog("put_tree","I need two nonnull arguments",7)
-
- stringized_tr := encode(tr)
- every writes(f, outbits(*stringized_tr, 16)) # use two bytes
- outbits() # just in case
- writes(f, stringized_tr)
- # How many characters are there in the input file?
- every writes(f, outbits(count_of_all_chars, 32))
- outbits()
-
- end
-
-
- procedure get_tree(f)
-
- #
- # Reads in Huffman tree from file f, sets pointer to the first
- # encoded bit (as opposed to the bits which form the tree des-
- # cription) in file f.
- #
- local stringized_tr_size, tr
- # global count_of_all_chars
-
- stringized_tr_size := inbits(f, 16)
- tr := decode(reads(f, stringized_tr_size)) |
- quitprog("get_tree", "can't decode tree", 6)
- count_of_all_chars := inbits(f, 32) |
- quitprog("get_tree", "garbled input file", 10)
- return tr
-
- end
-
-
- procedure encode_string(s, huffman_table)
-
- #
- # Encode string s using the codes in huffman_table (created by
- # hash_codes, which in turns uses the Huffman tree created by
- # heap_2_tree).
- #
- # Make sure you are using reads() and not read, unless you don't
- # want to preserve newlines.
- #
- local s2, chr, hcode # hcode stores huffcode records
- static chars_written
- initial chars_written := 0
-
- s2 := ""
- s ? {
- while chr := move(1) do {
- chars_written +:= 1
- hcode := \huffman_table[chr] |
- quitprog("encode_string", "unexpected char, "||image(chr), 11)
- every s2 ||:= outbits(hcode.i, hcode.len)
- }
- # If at end of output stream, then flush outbits buffer.
- if chars_written = count_of_all_chars then {
- chars_written := 0
- s2 ||:= outbits()
- } else {
- if chars_written > count_of_all_chars then {
- chars_written := 0
- quitprog("encode_string", "you're trying to write _
- more chars than you originally tabulated", 12)
- }
- }
- }
- return s2
-
- end
-
-
- procedure decode_rest_of_file(f, size, huffman_tree)
-
- local s2, line, E, chr, bit
- static chars_decoded
- initial chars_decoded := 0
-
- E := huffman_tree
- while line := reads(f, size) do {
- line ? {
- s2 := ""
- while chr := move(1) do {
- every bit := iand(1, ishift(ord(chr), -7 to 0)) do {
- E := { if bit = 0 then E.l else E.r }
- if s2 ||:= string(E) then {
- chars_decoded +:= 1
- if chars_decoded = count_of_all_chars then {
- chars_decoded := 0
- break { break break }
- }
- else E := huffman_tree
- }
- }
- }
- suspend s2
- }
- }
- suspend s2
-
- end
-
-
- procedure quitprog(p, m, c)
-
- /m := "program error"
- write(&errout, p, ": ", m)
- exit(\c | 1)
-
- end
-