home *** CD-ROM | disk | FTP | other *** search
- {.IN+}
- {.PW132}
- (*$V-,R+,B- *)
- PROGRAM induce ;
-
- (* Copyright 1986 - MicroExpert Systems
- Box 430 R.D. 2
- Nassau, NY 12123 *)
-
- (* Induce implements the ID3 algorithm for the generation of rules from a data
- set as described in the BYTE article "Finding Knowledge in Data".
-
- This program has been tested using Turbo ver 3.01A on an IBM PC. It has
- been run under both DOS 2.1 and Concurrent 4.1 .
-
- The source for this program is contained in two files, INDUCE.PAS and
- INDUCE.INC. The program produces one overlay file INDUCE.000 .
-
- INDUCE produces a knowledge base which can be used with MicroExpert.
- MicroExpert is an expert system shell written in Turbo Pascal for the
- IBM PC and Apple II. It is available for $49.95 and comes with complete
- source code. It can be order by writing to :
- McGraw-Hill Book Company
- P.O. Box 400
- Hightstown, NJ 08520
-
- Or calling 1-800-628-004 or in New York state 212/512-2999.
-
- We would be pleased to hear your comments, good or bad, or any applications
- and modifications of the program. Contact us at the above address or
- on BIX. Our id is bbt and we may be contacted via BIXmail or by leaving
- comments in the MicroExpert conference.
-
- Bill and Bev Thompson *)
-
-
- CONST
- ln2 = 0.69314718 ;
- limit = 1.0E-20 ;
- debug = false ;
- back_space = ^H ;
- tab = ^I ;
- eof_mark = ^Z ;
- esc = #27 ;
- quote_char = #39 ;
- left_arrow = #75 ;
- end_key = #79 ;
- del_line = ^X ;
- return = ^M ;
- bell = ^G ;
-
- TYPE
- counter = 0 .. maxint ;
- string80 = string[80] ;
- string132 = string[132] ;
- string255 = string[255] ;
- text_file = text ;
- char_set = SET OF char ;
- node_type = (cons_node,symbol,number,free_node) ;
- node_ptr = ^node ;
- node = RECORD
- in_use : boolean ;
- CASE tag : node_type OF
- cons_node : (tail_ptr : node_ptr ;
- head_ptr : node_ptr) ;
- symbol : (string_data : string80) ;
- number : (num_data : real) ;
- free_node : (next_free : node_ptr ;
- block_cnt : counter) ;
- END ;
-
- (* node is the basic allocation unit for lists. The fields are used as
- follows:
-
- in_use - in_use = false tells the garbage collector that this node
- is available for re-use.
- tag - which kind of node this is.
- cons_node - cons_nodes consist of two pointers. one to the head (first item)
- the other to the rest of the list. They are the "glue" which
- holds the list together. The list (A B C) would be stored as
- ------- -------- --------
- | .| . |-----> | .| . |------> | .| . |---> NIL
- --|----- --|------ --|-----
- | | |
- V V V
- A B C
-
- The boxes are the cons nodes, the first part of the box
- holds the head pointer, then second contains the tail.
- symbol - holds string values, we don't actually use the entire 80
- characters in most cases.
- number - contains a real number.
- free_node - the garbage collector gathers all unused nodes and puts
- them on a free list. It also compacts the free space into
- contiguous blocks. next_free points to the next free block.
- block_cnt contains a count of the number of contiguous 8 byte free
- blocks which follow this one. *)
-
-
- VAR
- example_file : text_file ;
- line : string255 ;
- c_list,examples,attrib_list,saved_list,initial_heap,free : node_ptr ;
- total_free : real ;
- no_of_cols : counter ;
-
- (* The important globals are:
- example_file - text file containing the original example set. See the
- documentation file for its format.
- line - line buffer for reading in the text file
- c_list - the classification tree
- examples - the list of examples
- attrib_list - list of attribute names and their values
- saved_list - list of all items that absolutely must be saved if garbage
- collection occurs. Usually has at least the examples and
- attrib_list attcahed to it.
- initial_heap - the value of the heap pointer at the start of the program.
- used by the garbage collector
- free - the list of free nodes.
- total_free - total number of free blocks on the free list.
- no_of_cols - the total number of attributes + the class attribute in
- the example set. *)
-
-
- (*$I induce.inc *)
-
-
- PROCEDURE read_from_file(VAR f : text_file) ;
- (* Read a line form file f and store it in the global variable, line.
- It ignores blank lines and comments. When the end of file is reached
- eof_mark is returned. *)
- CONST
- in_comment : boolean = false ; (* static *)
-
- PROCEDURE read_a_line ;
- BEGIN
- (*$I- *)
- readln(f,line) ;
- (*$I+ *)
- IF ioresult <> 0
- THEN line := eof_mark
- ELSE IF pos('(*',line) > 0
- THEN
- IF pos('*)',line) > 0
- THEN delete(line,pos('(*',line),pos('*)',line) - pos('(*',line) + 2)
- ELSE
- BEGIN
- in_comment := true ;
- line := '' ;
- END ;
- END ; (* read_a_line *)
-
- BEGIN
- line := '' ;
- IF eof(f)
- THEN line := eof_mark
- ELSE
- BEGIN
- read_a_line ;
- IF in_comment
- THEN
- IF pos('*)',line) > 0
- THEN
- BEGIN
- delete(line,1,pos('*)',line) + 1) ;
- in_comment := false ;
- END
- ELSE read_from_file(f) ;
- END ;
- strip_leading_blanks(line) ;
- strip_trailing_blanks(line) ;
- IF line = ''
- THEN read_from_file(f) ;
- END ; (* read_from_file *)
-
-
- OVERLAY PROCEDURE expand(example_list : node_ptr ;
- VAR new_example_list : node_ptr) ;
- (* Expand "don't care" values into values from attrib_list.
- example_list - unexpanded example set
- new_example_list - expanded set *)
-
- PROCEDURE dup_and_copy(list : node_ptr) ;
- (* This routine creates a new version of the current row, pointed to by
- list. If it finds a regular attribute value, it just appends the value
- to the row it is constructing. If it finds a '*', indicating a
- "don't care" value, it call copy_to_new_list to expand the value and
- attach the new rows to new_example_list.
- Notice that we attach anything we don't want to be trashed by the
- garbage collector to the head of saved_list and remove it at the end
- of the routine. copy_to_new_list saves new_list because it calls
- dup_and_copy and that routine might initiate garbage collection. *)
- VAR
- new_list,attr_ptr : node_ptr ;
- copied : boolean ;
-
- PROCEDURE copy_to_new_list ;
- (* This routine does the actual expansion. It attaches a value for the
- attribute, pointed to by p, to the row that has been constructed
- so far and attaches the rest of the list to the end of the row.
- It calls dup_and_copy to expand any more *'s in the row and finally
- attach the row to the new_example_list *)
- VAR
- p,new_row : node_ptr ;
- BEGIN
- saved_list := cons(new_list,saved_list) ;
- copied := true ;
- p := tail(head(attr_ptr)) ;
- WHILE p <> NIL DO
- BEGIN
- new_row := append_list(new_list,cons(head(p),tail(list))) ;
- dup_and_copy(new_row) ;
- p := tail(p) ;
- END ;
- saved_list := tail(saved_list) ;
- END ; (* copy_to_new_list *)
-
- BEGIN
- saved_list := cons(list,saved_list) ;
- test_memory ;
- new_list := NIL ;
- attr_ptr := attrib_list ;
- copied := false ;
- WHILE (list <> NIL) AND (NOT copied) DO
- IF string_val(head(list)) = '*'
- THEN copy_to_new_list
- ELSE
- BEGIN
- new_list := append_list(new_list,cons(head(list),NIL)) ;
- list := tail(list) ;
- attr_ptr := tail(attr_ptr) ;
- END ;
- IF NOT copied
- THEN new_example_list := append_list(new_example_list,cons(new_list,NIL)) ;
- saved_list := cons(new_example_list,tail(saved_list)) ;
- END ; (* dup_and_copy *)
-
- BEGIN
- new_example_list := NIL ;
- WHILE example_list <> NIL DO
- BEGIN
- dup_and_copy(head(example_list)) ;
- example_list := tail(example_list) ;
- END ;
- END ; (* expand *)
-
-
- OVERLAY FUNCTION conflicts(example_list : node_ptr) : boolean ;
- (* Search for conflicts by using match_list to compare each row against
- the rows which follow it in the example list. conflicts returns true
- if a match is found. *)
- VAR
- p : node_ptr ;
- found_match : boolean ;
-
- PROCEDURE conflict_message ;
- BEGIN
- writeln ;
- writeln('A conflict exists between rows:') ;
- writeln ;
- print_list(head(example_list)) ;
- writeln ;
- print_list(head(p)) ;
- writeln ;
- writeln('Processing cannot continue.') ;
- END ; (* conflict_message *)
-
- BEGIN
- found_match := false ;
- WHILE (example_list <> NIL) AND (NOT found_match) DO
- BEGIN
- p := tail(example_list) ;
- WHILE (p <> NIL) AND (NOT found_match) DO
- IF match_lists(tail(head(example_list)),tail(head(p)))
- THEN found_match := true
- ELSE p := tail(p) ;
- IF NOT found_match
- THEN example_list := tail(example_list) ;
- END ;
- IF found_match
- THEN conflict_message ;
- conflicts := found_match ;
- END ; (* conflicts *)
-
-
- OVERLAY PROCEDURE build_table ;
- (* Read the example file and build the attrib_list and examples. The
- format of these two lists is described in the BYTE article mentioned
- at the beginning of the program. This routine doesn't do much error
- checking, so be careful with your example files. *)
- VAR
- new_row : node_ptr ;
- token : string80 ;
-
- PROCEDURE scan ;
- (* Get a single token from the input line. This procedure strips leading
- and trailing blanks and tabs, but interior spaces are sigificant.
- A token is any string between the first non-space character and a
- comma or end of line. Case is significant in tokens, 'Cat' and 'cat'
- will be treated as different values by the program. *)
- VAR
- comma_pos : byte ;
- BEGIN
- strip_leading_blanks(line) ;
- IF line = ''
- THEN token := ''
- ELSE
- BEGIN
- comma_pos := pos(',',line) ;
- IF comma_pos > 0
- THEN
- BEGIN
- token := copy(line,1,comma_pos - 1) ;
- delete(line,1,comma_pos) ;
- END
- ELSE
- BEGIN
- token := line ;
- line := '' ;
- END ;
- IF token = ''
- THEN token := '*' ;
- strip_trailing_blanks(token) ;
- END ;
- END ; (* scan *)
-
- PROCEDURE build_a_row ;
- (* Builds an example row. Symbolic and numerical attributes are handled
- differently. Input lines are read one token at a time and storage is
- allocated for the new token. The attrib_list is examined to see if
- the new value appears on the list of values for that attribute. If it
- does not, the value is added to the list. Symbolic values are added
- to the end of the list of values for the attribute, numerical values are
- stored in order. Once the new row is constructed it is appended
- to the example set. *)
- VAR
- at_list,row_list,token_ptr : node_ptr ;
-
- PROCEDURE length_error ;
- (* Signal an error, probably a missing value. The row in question
- will not be included in the example set, but the attribute list
- may be damaged, so don't trust results after and error. *)
- BEGIN
- writeln ;
- writeln('Missing attribute in row:') ;
- print_list(row_list) ;
- writeln ;
- writeln ;
- wait ;
- END ; (* length_error *)
-
- PROCEDURE add_value ;
- (* Add a new value to the attribue list. The variable attrib_list keeps
- track of the current column as the row is scanned. If token was
- found to already be on the attrib_list, head(attrib_list) is appended
- to at_list. If the token is a new value, it is added to the list at the
- head of attrib_list, and then head(attrib_list) is appended to at_list.
- After reading the entire row from the file, attrib_list is set to
- point to at_list. This way attrib_list is reconstructed for each
- row. *)
-
- PROCEDURE insert_number ;
- (* Insert a number into the attribute list. The list of values
- for numerical attributes is maintained in order. This is done by
- comparing the value of the token against the other items on the
- list. As the comparison is done, the values are copied to new_list.
- When a value is found that is greater than the token value or the
- end of the list is reached, the token is appened to new_list and
- then the reaming values on the old list are appended to new_list.
- Finally new_list is appended to at_list. All of this appending
- produces lots of garbage. *)
- VAR
- new_list,p : node_ptr ;
- r : real ;
- inserted : boolean ;
-
- PROCEDURE build_new_list ;
- (* This routine does the actual insetion described above. *)
- BEGIN
- WHILE (p <> NIL) AND (NOT inserted) DO
- BEGIN
- IF abs(r - num_val(head(p))) < limit
- THEN
- BEGIN
- inserted := true ;
- new_list := append_list(new_list,p) ;
- END
- ELSE IF r > num_val(head(p))
- THEN
- BEGIN
- new_list := append_list(new_list,cons(head(p),NIL)) ;
- p := tail(p) ;
- END
- ELSE
- BEGIN
- new_list := append_list(new_list,append_list(
- cons(token_ptr,NIL),p)) ;
- inserted := true ;
- END ;
- END ;
- END ; (* build_new_list *)
-
- BEGIN
- r := num_val(token_ptr) ;
- inserted := false ;
- new_list := cons(head(head(attrib_list)),NIL) ;
- p := tail(head(attrib_list)) ;
- build_new_list ;
- IF (p = NIL) AND (NOT inserted)
- THEN new_list := append_list(new_list,cons(token_ptr,NIL)) ;
- at_list := append_list(at_list,cons(new_list,NIL)) ;
- END ; (* insert_number *)
-
- BEGIN
- IF tag_value(token_ptr) = number
- THEN insert_number
- ELSE at_list := append_list(at_list,
- cons(append_list(head(attrib_list),
- cons(token_ptr,NIL)),
- NIL)) ;
- END ; (* add_value *)
-
- BEGIN
- saved_list := cons(examples,attrib_list) ;
- test_memory ;
- at_list := NIL ;
- row_list := NIL ;
- scan ;
- WHILE token <> '' DO
- BEGIN
- IF pos(':NUMBER',toupper(string_val(head(head(attrib_list))))) > 0
- THEN token_ptr := alloc_num(toreal(token))
- ELSE token_ptr := alloc_str(token) ;
- IF (NOT on_list(token,head(attrib_list))) AND (token <> '*')
- THEN add_value
- ELSE at_list := append_list(at_list,cons(head(attrib_list),NIL)) ;
- row_list := append_list(row_list,cons(token_ptr,NIL)) ;
- attrib_list := tail(attrib_list) ;
- scan ;
- END ;
- attrib_list := at_list ;
- IF list_length(row_list) = no_of_cols
- THEN examples := append_list(examples,cons(row_list,NIL))
- ELSE length_error ;
- END ; (* build_a_row *)
-
- PROCEDURE build_attrib_list ;
- (* constructs the initial attrib_list from the first row in the file.
- Initially the attrib_list is simply a list of the attribute names,
- build_a_row adds the values to it. This routine also counts the
- number of columns (attributes) in the table. *)
- BEGIN
- attrib_list := NIL ;
- no_of_cols := 0 ;
- scan ;
- WHILE token <> '' DO
- BEGIN
- attrib_list := append_list(attrib_list,cons(cons(alloc_str(token),NIL),
- NIL)) ;
- no_of_cols := no_of_cols + 1 ;
- scan ;
- END ;
- END ; (* build_attrib_list *)
-
- BEGIN
- examples := NIL ;
- line := '' ;
- read_from_file(example_file) ;
- IF line <> eof_mark
- THEN build_attrib_list ;
- read_from_file(example_file) ;
- WHILE line <> eof_mark DO
- BEGIN
- build_a_row ;
- read_from_file(example_file) ;
- END ;
- END ; (* build_table *)
-
- OVERLAY FUNCTION classify_it : node_ptr ;
- (* is an overlay function which calls classify. We do it this way to avoid
- swapping due to recursion. *)
-
- FUNCTION classify(example_list,chosen_list : node_ptr) : node_ptr ;
- (* This is the main processing routine of the program. It is passed two
- lists, a list of rows, example_list and a list of attributes already
- chosen. The second list is simply for convenience. That way we don't
- have to calculate the entropy for attribute which can no longer
- contribute to splitting example_set. classify returns a pointer to the
- classification tree built from the example_set.
- If the example_list passed to it contains only a single class value,
- classify returns the class_name (attribute name of the first column) and
- the class value.
- Variables:
- split_elem - the column (attribute number) to split on
- classify_list - a temporary list to hold the tree
- split_value - for numerical attributes. It contains the value which
- produces the best numerical split.
- classify prints a dot on the screen each time it is entered to show you
- that the program really hasn't died. *)
- VAR
- split_elem : counter ;
- classify_list : node_ptr ;
- split_value : real ;
-
- PROCEDURE find_split(VAR split_elem : counter ; VAR min_split_value : real) ;
- (* finds the best attribute to split on. It returns the column number on
- which to split and for numerical attribute, the value which produces the
- best split. For each active attribute it constructs a class_list. the
- class_list has the following format:
- ( (attribute value #1 (class1 count) (class2 count) ....)
- (attribute value #2 (class1 count) (class2 count) ....) .....)
- The counts are the number of times each class appears in a row with
- a particular value of the attribute. This list is used to calculate
- the entropy of the attribute. *)
- VAR
- i : counter ;
- attrib : node_ptr ;
- ent,min_entropy,split_value : real ;
-
- FUNCTION entropy(list : node_ptr ; cases : counter) : real ;
- (* list is a class list. cases is the number of examples under
- consideration. This routine calculates the entropy H(C|A) from the class
- list. *)
- VAR
- sum,sum1,sum2,r : real ;
- p : node_ptr ;
-
- FUNCTION log2(x : real) : real ;
- BEGIN
- IF abs(x) < limit
- THEN log2 := 0.0
- ELSE log2 := ln(x) / ln2 ;
- END ; (* log2 *)
-
- BEGIN
- sum := 0.0 ;
- WHILE list <> NIL DO
- BEGIN
- sum1 := 0.0 ;
- sum2 := 0.0 ;
- p := tail(head(list)) ;
- WHILE p <> NIL DO
- BEGIN
- r := num_val(head(tail(head(p)))) ;
- sum1 := sum1 + r * log2(r) ;
- sum2 := sum2 + r ;
- p := tail(p) ;
- END ;
- sum := sum + (sum2 * log2(sum2)) - sum1 ;
- list := tail(list) ;
- END ;
- entropy := sum / cases ;
- END ; (* entropy *)
-
- PROCEDURE numeric_entropy(elem_no : counter ;
- VAR num_entropy,num_split_value : real) ;
- (* Find the best split for a numeric attribute. elem_no is the column we
- are working on. num_entropy is the best entropy for this attribute
- and num_split_value is the split which gives that value. In
- addition to the class list, this routine produces an ordered list
- of the values for this attribute, called num_list. This list is
- used in making the splits. Each split is half way between successive
- values on the num_list. The entropy is calculated for each split. *)
- VAR
- class_list,sp,num_list : node_ptr ;
- sp_val,num_ent : real ;
- total_cases : counter ;
-
- PROCEDURE make_num_list ;
- (* constructs num_list. This is essentially the same routine as
- insert_number in build_table *)
- VAR
- new_list,p,q : node_ptr ;
- r : real ;
- inserted : boolean ;
-
- PROCEDURE add_to_new_list ;
- BEGIN
- WHILE (p <> NIL) AND (NOT inserted) DO
- BEGIN
- IF abs(r - num_val(head(p))) < limit
- THEN
- BEGIN
- inserted := true ;
- new_list := append_list(new_list,p) ;
- END
- ELSE IF r > num_val(head(p))
- THEN
- BEGIN
- new_list := append_list(new_list,cons(head(p),NIL)) ;
- p := tail(p) ;
- END
- ELSE
- BEGIN
- new_list := append_list(new_list,append_list(
- cons(alloc_num(r),NIL),p)) ;
- inserted := true ;
- END ;
- END ;
- END ; (* add_to_new_list *)
-
- BEGIN
- test_memory ;
- num_list := NIL ;
- q := example_list ;
- WHILE q <> NIL DO
- BEGIN
- r := num_val(head(element(head(q),elem_no))) ;
- new_list := NIL ;
- p := num_list ;
- inserted := false ;
- add_to_new_list ;
- IF (p = NIL) AND (NOT inserted)
- THEN new_list := append_list(new_list,cons(alloc_num(r),NIL)) ;
- num_list := new_list ;
- q := tail(q) ;
- END ;
- END ; (* make_num_list *)
-
- PROCEDURE make_numeric_class_list(v : real) ;
- (* builds the class list. v is the value to split on. The class_list
- contains lists for two ranges < v and >= v. The list has the format:
- ( (< v (class1 count) (class2 count) .....)
- (>= v (class1 count) (class2 count) .....)) *)
- VAR
- temp_list,p : node_ptr ;
- v_str : string80 ;
- BEGIN
- str(v,v_str) ;
- temp_list := NIL ;
- p := tail(head(attrib_list)) ;
- WHILE p <> NIL DO
- BEGIN
- temp_list := append_list(temp_list,cons(cons(head(p),
- cons(alloc_num(0.0),NIL)),NIL)) ;
- p := tail(p) ;
- END ;
- class_list := cons(cons(alloc_str(concat('< ',v_str)),temp_list),
- cons(cons(alloc_str(concat('>= ',v_str)),
- copy_list(temp_list)),NIL)) ;
- END ; (* make_numeric_class_list *)
-
- PROCEDURE count_numeric_classes(v : real ; elem_no : counter) ;
- (* count the classes for each range. It reads the example list, extracts
- the value for the attribute, searches the class list and increments
- the appropriate class value in the list. v is the split value. *)
- VAR
- px,py : node_ptr ;
-
- PROCEDURE numeric_increment(list : node_ptr ; attr_v,atv : string80) ;
- (* search list (class_list) and compare attr_v to v, the split_value.
- atv is the class_value. Once we find the sub-list with the proper
- range we search its tail for atv to increment the class count. *)
-
- PROCEDURE do_increment(v_list : node_ptr) ;
- VAR
- p,q : node_ptr ;
- BEGIN
- q := tail(v_list) ;
- WHILE q <> NIL DO
- IF string_val(head(head(q))) = atv
- THEN
- BEGIN
- p := head(tail(head(q))) ;
- IF tag_value(p) = number
- THEN p^.num_data := p^.num_data + 1.0 ;
- total_cases := total_cases + 1 ;
- q := NIL ;
- END
- ELSE q := tail(q) ;
- END ; (* do_increment *)
-
- BEGIN
- IF toreal(attr_v) < v
- THEN do_increment(head(list))
- ELSE do_increment(head(tail(list))) ;
- END ; (* numeric_increment *)
-
- BEGIN
- total_cases := 0 ;
- px := example_list ;
- WHILE px <> NIL DO
- BEGIN
- py := head(px) ;
- numeric_increment(class_list,string_val(head(element(py,elem_no))),
- string_val(head(py))) ;
- px := tail(px) ;
- END ;
- END ; (* count_numeric_classes *)
-
- BEGIN
- num_entropy := 1.0E+37 ;
- make_num_list ;
- sp := tail(num_list) ;
- saved_list := cons(num_list,saved_list) ;
- WHILE sp <> NIL DO
- BEGIN
- test_memory ;
- sp_val := num_val(head(num_list))
- + ((num_val(head(sp)) - num_val(head(num_list))) / 2.0) ;
- make_numeric_class_list(sp_val) ;
- count_numeric_classes(sp_val,elem_no) ;
- num_ent := entropy(class_list,total_cases) ;
- IF num_ent < num_entropy
- THEN
- BEGIN
- num_entropy := num_ent ;
- num_split_value := sp_val ;
- END ;
- num_list := sp ;
- sp := tail(sp) ;
- END ;
- saved_list := tail(saved_list) ;
- END ; (* numeric_entropy *)
-
- PROCEDURE symbol_entropy(val_list : node_ptr ; elem_no : counter ;
- VAR sym_ent,sym_split_val : real) ;
- (* Find the entropy for a symbolic attribute. val_list is the list
- of possible values for this attribute from the attrib_list.
- elem_no is the column number, sym_ent is the entropy for this attribute.
- sym_split_value is always 0. This routine constructs a class list
- as described above and counts the classes for each value of the
- attribute as in the numeric case, only there is no range splitting.
- Symbolic attributes can result in mult-way partitions of the
- example_list, numerical attributes always produce binary splits. *)
- VAR
- class_list : node_ptr ;
- total_cases : counter ;
-
- PROCEDURE make_class_list(a_list : node_ptr) ;
- (* builds the initial class list. See above comments for format. *)
- VAR
- temp_list,p : node_ptr ;
- BEGIN
- WHILE a_list <> NIL DO
- BEGIN
- temp_list := cons(head(a_list),NIL) ;
- p := tail(head(attrib_list)) ;
- WHILE p <> NIL DO
- BEGIN
- temp_list := append_list(temp_list,cons(cons(head(p),
- cons(alloc_num(0.0),NIL)),NIL)) ;
- p := tail(p) ;
- END ;
- class_list := append_list(class_list,cons(temp_list,NIL)) ;
- a_list := tail(a_list) ;
- END ;
- END ; (* make_class_list *)
-
- PROCEDURE count_classes(elem_no : counter) ;
- (* traverses the example_list and counts class values. *)
- VAR
- px,py : node_ptr ;
-
- PROCEDURE increment(list : node_ptr ; attr,v : string80) ;
- (* search list (class_list) and compare attr to the head of each sub-list.
- v is the class_value. Once we find the sub-list with the proper
- range we search its tail for v to increment the class count. *)
- VAR
- p,q : node_ptr ;
- BEGIN
- WHILE list <> NIL DO
- IF string_val(head(head(list))) = attr
- THEN
- BEGIN
- q := tail(head(list)) ;
- WHILE q <> NIL DO
- IF string_val(head(head(q))) = v
- THEN
- BEGIN
- p := head(tail(head(q))) ;
- IF tag_value(p) = number
- THEN p^.num_data := p^.num_data + 1.0 ;
- total_cases := total_cases + 1 ;
- list := NIL ;
- q := NIL ;
- END
- ELSE q := tail(q) ;
- END
- ELSE list := tail(list) ;
- END ; (* increment *)
-
- BEGIN
- total_cases := 0 ;
- px := example_list ;
- WHILE px <> NIL DO
- BEGIN
- py := head(px) ;
- increment(class_list,string_val(head(element(py,elem_no))),
- string_val(head(py))) ;
- px := tail(px) ;
- END ;
- END ; (* count_classes *)
-
- BEGIN
- class_list := NIL ;
- make_class_list(val_list) ;
- count_classes(elem_no) ;
- sym_ent := entropy(class_list,total_cases) ;
- sym_split_val := 0.0
- END ; (* symbol_entropy *)
-
- BEGIN
- min_entropy := 1.0E+37 ;
- FOR i := 2 TO no_of_cols DO
- BEGIN
- test_memory ;
- attrib := head(element(attrib_list,i)) ;
- IF NOT on_list(string_val(head(attrib)),chosen_list)
- THEN
- BEGIN
- IF pos(':NUMBER',toupper(string_val(head(attrib)))) > 0
- THEN numeric_entropy(i,ent,split_value)
- ELSE symbol_entropy(tail(attrib),i,ent,split_value) ;
- IF ent < min_entropy
- THEN
- BEGIN
- min_entropy := ent ;
- split_elem := i ;
- min_split_value := split_value ;
- END ;
- END ;
- END ;
- END ; (* find_split *)
-
- FUNCTION split(elem_no : counter ; split_val : real) : node_ptr ;
- (* This routine splits the example_list into sets which contain a single
- value of the split attribute. elem_no is the column on which to split.
- split_val is the split value for numerical attributes.
- split_item points to the attribute's entry in the attribute list.
- split_list is the tree which is returned by split. Its format is
- (attribute_name (value1 classify(partition with attribute = value1)
- (value2 classify(partition with attribute = value2)
- ....... ) *)
- VAR
- split_list,split_item,new_chosen : node_ptr ;
-
- PROCEDURE numeric_split ;
- (* Splitting on a numerical attribute splits the examples into
- two groups, those with values < split_value and those with
- values >= split_value. new_list1 and new_list2 are the example
- sets for the two categories. It returns a split_list as follows:
- (attribute_name ('< split_val'
- classify(all examples with attribute value < split_val)
- ('>= split_val'
- classify(all examples with attribute value >= split_val))
- Notice all the lists placed on the saved_list. These are the items
- that we must retain should any of the calls to classify invoke garbage
- collection *)
- VAR
- new_list1,new_list2,q,valu : node_ptr ;
- split_str : string80 ;
- BEGIN
- str(split_val,split_str) ;
- valu := cons(alloc_str(concat('< ',split_str)),
- cons(alloc_str(concat('>= ',split_str)),NIL)) ;
- q := example_list ;
- new_list1 := NIL ;
- new_list2 := NIL ;
- WHILE q <> NIL DO
- BEGIN
- IF num_val(head(element(head(q),elem_no))) < split_val
- THEN new_list1 := append_list(new_list1,cons(head(q),NIL))
- ELSE new_list2 := append_list(new_list2,cons(head(q),NIL)) ;
- q := tail(q) ;
- END ;
- saved_list := cons(split_list,cons(valu,cons(new_list2,saved_list))) ;
- split_list := append_list(split_list,
- cons(cons(head(valu),
- cons(classify(new_list1,chosen_list),NIL)),NIL)) ;
- saved_list := cons(split_list,tail(saved_list)) ;
- split_list := append_list(split_list,
- cons(cons(head(tail(valu)),
- cons(classify(new_list2,chosen_list),NIL)),NIL)) ;
- saved_list := tail(tail(tail(saved_list))) ;
- END ; (* numeric_split *)
-
- PROCEDURE symbol_split ;
- (* performs the split for symbolic attributes. For each value of the
- attribute, it searches the example list for matches and attaches
- examples with a match to new_example_list. If it finds any matches
- it appends the value and the result of classifying the new_example_list
- to split_list. This is a very inefficient way of doing this. It
- would be better to sort the example_list using column elem_no as
- a key. *)
- VAR
- valu,q,new_example_list : node_ptr ;
- BEGIN
- valu := tail(split_item) ;
- WHILE valu <> NIL DO
- BEGIN
- q := example_list ;
- new_example_list := NIL ;
- WHILE q <> NIL DO
- BEGIN
- IF string_val(head(valu)) = string_val(head(element(head(q),elem_no)))
- THEN new_example_list := append_list(new_example_list,cons(head(q),NIL)) ;
- q := tail(q) ;
- END ;
- IF new_example_list <> NIL
- THEN
- BEGIN
- saved_list := cons(split_list,saved_list) ;
- split_list := append_list(split_list,
- cons(cons(head(valu),
- cons(classify(new_example_list,new_chosen),NIL)),
- NIL)) ;
- saved_list := tail(saved_list) ;
- END ;
- valu := tail(valu) ;
- END ;
- END ; (* symbol_split *)
-
- BEGIN
- split_item := head(element(attrib_list,elem_no)) ;
- new_chosen := cons(head(split_item),chosen_list) ;
- split_list := cons(head(split_item),NIL) ;
- IF pos(':NUMBER',toupper(string_val(head(split_item)))) > 0
- THEN numeric_split
- ELSE symbol_split ;
- split := split_list ;
- END ; (* split *)
-
- FUNCTION single_class : boolean ;
- (* returns true if the example_list contains only a single class value. *)
- VAR
- first_val : string80 ;
- p : node_ptr ;
- more_than_one : boolean ;
- BEGIN
- first_val := string_val(head(head(example_list))) ;
- more_than_one := false ;
- p := tail(example_list) ;
- WHILE (p <> NIL) AND (NOT more_than_one) DO
- IF string_val(head(head(p))) <> first_val
- THEN more_than_one := true
- ELSE p := tail(p) ;
- single_class := NOT more_than_one ;
- END ; (* single_class *)
-
- BEGIN
- write('.') ;
- split_elem := 0 ;
- saved_list := cons(chosen_list,cons(example_list,saved_list)) ;
- IF NOT single_class
- THEN find_split(split_elem,split_value) ;
- IF split_elem = 0
- THEN classify_list := cons(head(head(attrib_list)),
- cons(cons(head(head(example_list)),NIL),NIL))
- ELSE classify_list := split(split_elem,split_value) ;
- saved_list := append_list(tail(tail(saved_list)),cons(classify_list,NIL)) ;
- classify := classify_list ;
- END ; (* classify *)
-
- BEGIN
- classify_it := classify(examples,NIL) ;
- END ; (* classify_it *)
-
-
- OVERLAY PROCEDURE print_rule_list(list : node_ptr) ;
- (* This routine transforms the tree into a set of IF/THEN statements and
- writes them to a file. It produces a knowledge base for MicroExpert [c],
- if you want to produces rules for another shell, this routine will
- have to be modified. *)
- VAR
- rule_count : counter ;
- rule_file : text_file ;
- file_name : string80 ;
- used_attribs : node_ptr ;
-
- PROCEDURE print_rule(tree,rule_list : node_ptr) ;
- (* Do a depth first traversal of tree. On entry rule_list contains a
- list of attribute value pairs. When tree is finally NIL, i.e. a
- terminal node of the tree has been encountered, the rule_list is
- printed. If entered with a non-NIL tree, the routine creates a new
- attribute value pair, attaches it to rule_list and explores further
- down the tree. It also attaches the attribute names to used_attribs
- so that they can be used to generate prompts. *)
- VAR
- p : node_ptr ;
-
- PROCEDURE print_the_rule(list : node_ptr) ;
- (* Prints the rule_list, with rules formatted for MicroExpert. *)
- VAR
- s : string80 ;
-
- PROCEDURE write_compare ;
- VAR
- comp_str : string[2] ;
-
- FUNCTION quote(w : string80) : string80 ;
- BEGIN
- quote := '''' + w + '''' ;
- END ; (* quote *)
-
- BEGIN
- comp_str := '' ;
- WHILE s[1] <> ' ' DO
- BEGIN
- comp_str := comp_str + s[1] ;
- delete(s,1,1) ;
- END ;
- strip_leading_blanks(s) ;
- strip_trailing_blanks(s) ;
- writeln(rule_file,'function compare(',attrib_value(head(head(list))),
- ',',quote(comp_str),',',quote(s),')') ;
- END ; (* write_compare *)
-
- BEGIN
- writeln(rule_file,rule_count) ;
- rule_count := rule_count + 1 ;
- write(rule_file,'If ') ;
- WHILE list <> NIL DO
- BEGIN
- s := string_val(head(tail(head(list)))) ;
- IF s[1] IN ['<','>']
- THEN write_compare
- ELSE writeln(rule_file,attrib_value(head(head(list))),' is ',s) ;
- list := tail(list) ;
- IF list <> NIL
- THEN
- IF tail(list) = NIL
- THEN write(rule_file,'then ')
- ELSE write(rule_file,'and ') ;
- END ;
- writeln(rule_file,'.') ;
- writeln(rule_file) ;
- END ; (* print_the_rule *)
-
- BEGIN
- IF tree = NIL
- THEN print_the_rule(rule_list)
- ELSE
- BEGIN
- IF head(tree) <> head(head(attrib_list))
- THEN
- IF NOT on_list(string_val(head(tree)),used_attribs)
- THEN used_attribs := cons(head(tree),used_attribs) ;
- p := tail(tree) ;
- WHILE p <> NIL DO
- BEGIN
- print_rule(head(tail(head(p))),
- append_list(rule_list,cons(cons(head(tree),
- cons(head(head(p)),NIL)),
- NIL))) ;
- p := tail(p) ;
- END ;
- END ;
- END ; (* print_rule *)
-
- PROCEDURE print_prompts ;
- (* This routine traverses the attribute list and writes a prompt for
- each attribute on the list. MicroExpert does not automatically
- generate prompts, so this is necessary. The format
- of the questions may seem dumb. For a working knowledge base, you
- will want to edit the prompts and add translations. *)
- VAR
- q : node_ptr ;
- BEGIN
- q := used_attribs ;
- WHILE q <> NIL DO
- BEGIN
- writeln(rule_file) ;
- IF pos(':NUMBER',toupper(string_val(head(q)))) > 0
- THEN writeln(rule_file,'Numeric prompt ',attrib_value(head(q)))
- ELSE writeln(rule_file,'Prompt ',attrib_value(head(q))) ;
- writeln(rule_file,'What is the value of ',attrib_value(head(q)),' ?') ;
- writeln(rule_file,'.') ;
- q := tail(q) ;
- END ;
- END ; (* print_prompts *)
-
- BEGIN
- writeln ;
- write('Output the rules to what file (Press <ENTER> for screen.) ? ') ;
- readln(file_name) ;
- strip_leading_blanks(file_name) ;
- IF file_name = ''
- THEN file_name := 'con:' ;
- assign(rule_file,file_name) ;
- rewrite(rule_file) ;
- writeln(rule_file) ;
- rule_count := 1 ;
- used_attribs := NIL ;
- print_rule(list,NIL) ;
- print_prompts ;
- IF is_console(rule_file)
- THEN wait ;
- close(rule_file) ;
- END ; (* print_rule_list *)
-
-
- OVERLAY PROCEDURE print_tree(list : node_ptr) ;
- (* Print the tree. This is really just a pretty print routine, which
- indents each sub_list. *)
- VAR
- indent_level : counter ;
- tree_file : text_file ;
- file_name : string80 ;
-
- PROCEDURE print_the_tree(tree : node_ptr ; VAR indent : counter) ;
- VAR
- p : node_ptr ;
- BEGIN
- IF tree <> NIL
- THEN
- CASE tree^.tag OF
- number,
- symbol : BEGIN
- write(tree_file,attrib_value(tree),' ') ;
- indent := indent + length(attrib_value(tree)) + 1 ;
- END ;
- cons_node : BEGIN
- write(tree_file,'(') ;
- indent := indent + 1 ;
- print_the_tree(head(tree),indent) ;
- p := tail(tree) ;
- WHILE p <> NIL DO
- BEGIN
- print_the_tree(head(p),indent) ;
- IF list_length(p) > 1
- THEN
- BEGIN
- writeln(tree_file) ;
- write(tree_file,' ' : indent) ;
- END ;
- p := tail(p) ;
- END ;
- indent := indent - length(attrib_value(head(tree))) - 2 ;
- write(tree_file,') ') ;
- END ;
- END ;
- END ; (* print_the_tree *)
-
- BEGIN
- writeln ;
- write('Output the tree to what file (Press <ENTER> for screen.) ? ') ;
- readln(file_name) ;
- strip_leading_blanks(file_name) ;
- IF file_name = ''
- THEN file_name := 'con:' ;
- assign(tree_file,file_name) ;
- rewrite(tree_file) ;
- writeln(tree_file) ;
- indent_level := 0 ;
- print_the_tree(list,indent_level) ;
- writeln(tree_file) ;
- writeln(tree_file) ;
- IF is_console(tree_file)
- THEN wait ;
- close(tree_file) ;
- END ; (* print_tree *)
-
-
- OVERLAY FUNCTION got_file : boolean ;
- (* asks for an example file name and tries to open it. If it can't
- open the file, it complains and asks for a new file *)
- VAR
- example_name : string80 ;
- BEGIN
- writeln ;
- write('Example File (Press <ENTER> to quit.) : ') ;
- readln(example_name) ;
- IF example_name = ''
- THEN got_file := false
- ELSE
- BEGIN
- IF pos('.',example_name) = 0
- THEN example_name := concat(example_name,'.EX') ;
- IF open(example_file,example_name)
- THEN got_file := true
- ELSE
- BEGIN
- writeln ;
- writeln(toupper(example_name),' could not be found.') ;
- writeln ;
- got_file := got_file ;
- END ;
- END ;
- END ; (* got_file *)
-
-
- BEGIN
- free := NIL ;
- initial_heap := HeapPtr ;
- total_free := 0.0 ;
- clrscr ;
- WHILE got_file DO
- BEGIN
- build_table ;
- close(example_file) ;
- IF NOT conflicts(examples)
- THEN
- BEGIN
- saved_list := cons(attrib_list,examples) ;
- expand(examples,examples) ;
- writeln ;
- saved_list := cons(attrib_list,examples) ;
- c_list := classify_it ;
- saved_list := cons(c_list,attrib_list) ;
- writeln ;
- test_memory ;
- print_tree(c_list) ;
- writeln ;
- test_memory ;
- writeln ;
- print_rule_list(c_list) ;
- clrscr ;
- END ;
- END ;
- END.
-
-
-