Programmer 7500

home *** CD-ROM | disk | FTP | other *** search

/ Programmer 7500 / MAX_PROGRAMMERS.iso / PASCAL / INDUCE.ZIP / INDUCE.PAS < prev next >

Wrap

Pascal/Delphi Source File | 1986-06-04 | 42.3 KB | 1,214 lines

{.IN+} {.PW132} (*$V-,R+,B- *) PROGRAM induce ; (* Copyright 1986 - MicroExpert Systems Box 430 R.D. 2 Nassau, NY 12123 *) (* Induce implements the ID3 algorithm for the generation of rules from a data set as described in the BYTE article "Finding Knowledge in Data". This program has been tested using Turbo ver 3.01A on an IBM PC. It has been run under both DOS 2.1 and Concurrent 4.1 . The source for this program is contained in two files, INDUCE.PAS and INDUCE.INC. The program produces one overlay file INDUCE.000 . INDUCE produces a knowledge base which can be used with MicroExpert. MicroExpert is an expert system shell written in Turbo Pascal for the IBM PC and Apple II. It is available for $49.95 and comes with complete source code. It can be order by writing to : McGraw-Hill Book Company P.O. Box 400 Hightstown, NJ 08520 Or calling 1-800-628-004 or in New York state 212/512-2999. We would be pleased to hear your comments, good or bad, or any applications and modifications of the program. Contact us at the above address or on BIX. Our id is bbt and we may be contacted via BIXmail or by leaving comments in the MicroExpert conference. Bill and Bev Thompson *) CONST ln2 = 0.69314718 ; limit = 1.0E-20 ; debug = false ; back_space = ^H ; tab = ^I ; eof_mark = ^Z ; esc = #27 ; quote_char = #39 ; left_arrow = #75 ; end_key = #79 ; del_line = ^X ; return = ^M ; bell = ^G ; TYPE counter = 0 .. maxint ; string80 = string[80] ; string132 = string[132] ; string255 = string[255] ; text_file = text ; char_set = SET OF char ; node_type = (cons_node,symbol,number,free_node) ; node_ptr = ^node ; node = RECORD in_use : boolean ; CASE tag : node_type OF cons_node : (tail_ptr : node_ptr ; head_ptr : node_ptr) ; symbol : (string_data : string80) ; number : (num_data : real) ; free_node : (next_free : node_ptr ; block_cnt : counter) ; END ; (* node is the basic allocation unit for lists. The fields are used as follows: in_use - in_use = false tells the garbage collector that this node is available for re-use. tag - which kind of node this is. cons_node - cons_nodes consist of two pointers. one to the head (first item) the other to the rest of the list. They are the "glue" which holds the list together. The list (A B C) would be stored as ------- -------- -------- | .| . |-----> | .| . |------> | .| . |---> NIL --|----- --|------ --|----- | | | V V V A B C The boxes are the cons nodes, the first part of the box holds the head pointer, then second contains the tail. symbol - holds string values, we don't actually use the entire 80 characters in most cases. number - contains a real number. free_node - the garbage collector gathers all unused nodes and puts them on a free list. It also compacts the free space into contiguous blocks. next_free points to the next free block. block_cnt contains a count of the number of contiguous 8 byte free blocks which follow this one. *) VAR example_file : text_file ; line : string255 ; c_list,examples,attrib_list,saved_list,initial_heap,free : node_ptr ; total_free : real ; no_of_cols : counter ; (* The important globals are: example_file - text file containing the original example set. See the documentation file for its format. line - line buffer for reading in the text file c_list - the classification tree examples - the list of examples attrib_list - list of attribute names and their values saved_list - list of all items that absolutely must be saved if garbage collection occurs. Usually has at least the examples and attrib_list attcahed to it. initial_heap - the value of the heap pointer at the start of the program. used by the garbage collector free - the list of free nodes. total_free - total number of free blocks on the free list. no_of_cols - the total number of attributes + the class attribute in the example set. *) (*$I induce.inc *) PROCEDURE read_from_file(VAR f : text_file) ; (* Read a line form file f and store it in the global variable, line. It ignores blank lines and comments. When the end of file is reached eof_mark is returned. *) CONST in_comment : boolean = false ; (* static *) PROCEDURE read_a_line ; BEGIN (*$I- *) readln(f,line) ; (*$I+ *) IF ioresult <> 0 THEN line := eof_mark ELSE IF pos('(*',line) > 0 THEN IF pos('*)',line) > 0 THEN delete(line,pos('(*',line),pos('*)',line) - pos('(*',line) + 2) ELSE BEGIN in_comment := true ; line := '' ; END ; END ; (* read_a_line *) BEGIN line := '' ; IF eof(f) THEN line := eof_mark ELSE BEGIN read_a_line ; IF in_comment THEN IF pos('*)',line) > 0 THEN BEGIN delete(line,1,pos('*)',line) + 1) ; in_comment := false ; END ELSE read_from_file(f) ; END ; strip_leading_blanks(line) ; strip_trailing_blanks(line) ; IF line = '' THEN read_from_file(f) ; END ; (* read_from_file *) OVERLAY PROCEDURE expand(example_list : node_ptr ; VAR new_example_list : node_ptr) ; (* Expand "don't care" values into values from attrib_list. example_list - unexpanded example set new_example_list - expanded set *) PROCEDURE dup_and_copy(list : node_ptr) ; (* This routine creates a new version of the current row, pointed to by list. If it finds a regular attribute value, it just appends the value to the row it is constructing. If it finds a '*', indicating a "don't care" value, it call copy_to_new_list to expand the value and attach the new rows to new_example_list. Notice that we attach anything we don't want to be trashed by the garbage collector to the head of saved_list and remove it at the end of the routine. copy_to_new_list saves new_list because it calls dup_and_copy and that routine might initiate garbage collection. *) VAR new_list,attr_ptr : node_ptr ; copied : boolean ; PROCEDURE copy_to_new_list ; (* This routine does the actual expansion. It attaches a value for the attribute, pointed to by p, to the row that has been constructed so far and attaches the rest of the list to the end of the row. It calls dup_and_copy to expand any more *'s in the row and finally attach the row to the new_example_list *) VAR p,new_row : node_ptr ; BEGIN saved_list := cons(new_list,saved_list) ; copied := true ; p := tail(head(attr_ptr)) ; WHILE p <> NIL DO BEGIN new_row := append_list(new_list,cons(head(p),tail(list))) ; dup_and_copy(new_row) ; p := tail(p) ; END ; saved_list := tail(saved_list) ; END ; (* copy_to_new_list *) BEGIN saved_list := cons(list,saved_list) ; test_memory ; new_list := NIL ; attr_ptr := attrib_list ; copied := false ; WHILE (list <> NIL) AND (NOT copied) DO IF string_val(head(list)) = '*' THEN copy_to_new_list ELSE BEGIN new_list := append_list(new_list,cons(head(list),NIL)) ; list := tail(list) ; attr_ptr := tail(attr_ptr) ; END ; IF NOT copied THEN new_example_list := append_list(new_example_list,cons(new_list,NIL)) ; saved_list := cons(new_example_list,tail(saved_list)) ; END ; (* dup_and_copy *) BEGIN new_example_list := NIL ; WHILE example_list <> NIL DO BEGIN dup_and_copy(head(example_list)) ; example_list := tail(example_list) ; END ; END ; (* expand *) OVERLAY FUNCTION conflicts(example_list : node_ptr) : boolean ; (* Search for conflicts by using match_list to compare each row against the rows which follow it in the example list. conflicts returns true if a match is found. *) VAR p : node_ptr ; found_match : boolean ; PROCEDURE conflict_message ; BEGIN writeln ; writeln('A conflict exists between rows:') ; writeln ; print_list(head(example_list)) ; writeln ; print_list(head(p)) ; writeln ; writeln('Processing cannot continue.') ; END ; (* conflict_message *) BEGIN found_match := false ; WHILE (example_list <> NIL) AND (NOT found_match) DO BEGIN p := tail(example_list) ; WHILE (p <> NIL) AND (NOT found_match) DO IF match_lists(tail(head(example_list)),tail(head(p))) THEN found_match := true ELSE p := tail(p) ; IF NOT found_match THEN example_list := tail(example_list) ; END ; IF found_match THEN conflict_message ; conflicts := found_match ; END ; (* conflicts *) OVERLAY PROCEDURE build_table ; (* Read the example file and build the attrib_list and examples. The format of these two lists is described in the BYTE article mentioned at the beginning of the program. This routine doesn't do much error checking, so be careful with your example files. *) VAR new_row : node_ptr ; token : string80 ; PROCEDURE scan ; (* Get a single token from the input line. This procedure strips leading and trailing blanks and tabs, but interior spaces are sigificant. A token is any string between the first non-space character and a comma or end of line. Case is significant in tokens, 'Cat' and 'cat' will be treated as different values by the program. *) VAR comma_pos : byte ; BEGIN strip_leading_blanks(line) ; IF line = '' THEN token := '' ELSE BEGIN comma_pos := pos(',',line) ; IF comma_pos > 0 THEN BEGIN token := copy(line,1,comma_pos - 1) ; delete(line,1,comma_pos) ; END ELSE BEGIN token := line ; line := '' ; END ; IF token = '' THEN token := '*' ; strip_trailing_blanks(token) ; END ; END ; (* scan *) PROCEDURE build_a_row ; (* Builds an example row. Symbolic and numerical attributes are handled differently. Input lines are read one token at a time and storage is allocated for the new token. The attrib_list is examined to see if the new value appears on the list of values for that attribute. If it does not, the value is added to the list. Symbolic values are added to the end of the list of values for the attribute, numerical values are stored in order. Once the new row is constructed it is appended to the example set. *) VAR at_list,row_list,token_ptr : node_ptr ; PROCEDURE length_error ; (* Signal an error, probably a missing value. The row in question will not be included in the example set, but the attribute list may be damaged, so don't trust results after and error. *) BEGIN writeln ; writeln('Missing attribute in row:') ; print_list(row_list) ; writeln ; writeln ; wait ; END ; (* length_error *) PROCEDURE add_value ; (* Add a new value to the attribue list. The variable attrib_list keeps track of the current column as the row is scanned. If token was found to already be on the attrib_list, head(attrib_list) is appended to at_list. If the token is a new value, it is added to the list at the head of attrib_list, and then head(attrib_list) is appended to at_list. After reading the entire row from the file, attrib_list is set to point to at_list. This way attrib_list is reconstructed for each row. *) PROCEDURE insert_number ; (* Insert a number into the attribute list. The list of values for numerical attributes is maintained in order. This is done by comparing the value of the token against the other items on the list. As the comparison is done, the values are copied to new_list. When a value is found that is greater than the token value or the end of the list is reached, the token is appened to new_list and then the reaming values on the old list are appended to new_list. Finally new_list is appended to at_list. All of this appending produces lots of garbage. *) VAR new_list,p : node_ptr ; r : real ; inserted : boolean ; PROCEDURE build_new_list ; (* This routine does the actual insetion described above. *) BEGIN WHILE (p <> NIL) AND (NOT inserted) DO BEGIN IF abs(r - num_val(head(p))) < limit THEN BEGIN inserted := true ; new_list := append_list(new_list,p) ; END ELSE IF r > num_val(head(p)) THEN BEGIN new_list := append_list(new_list,cons(head(p),NIL)) ; p := tail(p) ; END ELSE BEGIN new_list := append_list(new_list,append_list( cons(token_ptr,NIL),p)) ; inserted := true ; END ; END ; END ; (* build_new_list *) BEGIN r := num_val(token_ptr) ; inserted := false ; new_list := cons(head(head(attrib_list)),NIL) ; p := tail(head(attrib_list)) ; build_new_list ; IF (p = NIL) AND (NOT inserted) THEN new_list := append_list(new_list,cons(token_ptr,NIL)) ; at_list := append_list(at_list,cons(new_list,NIL)) ; END ; (* insert_number *) BEGIN IF tag_value(token_ptr) = number THEN insert_number ELSE at_list := append_list(at_list, cons(append_list(head(attrib_list), cons(token_ptr,NIL)), NIL)) ; END ; (* add_value *) BEGIN saved_list := cons(examples,attrib_list) ; test_memory ; at_list := NIL ; row_list := NIL ; scan ; WHILE token <> '' DO BEGIN IF pos(':NUMBER',toupper(string_val(head(head(attrib_list))))) > 0 THEN token_ptr := alloc_num(toreal(token)) ELSE token_ptr := alloc_str(token) ; IF (NOT on_list(token,head(attrib_list))) AND (token <> '*') THEN add_value ELSE at_list := append_list(at_list,cons(head(attrib_list),NIL)) ; row_list := append_list(row_list,cons(token_ptr,NIL)) ; attrib_list := tail(attrib_list) ; scan ; END ; attrib_list := at_list ; IF list_length(row_list) = no_of_cols THEN examples := append_list(examples,cons(row_list,NIL)) ELSE length_error ; END ; (* build_a_row *) PROCEDURE build_attrib_list ; (* constructs the initial attrib_list from the first row in the file. Initially the attrib_list is simply a list of the attribute names, build_a_row adds the values to it. This routine also counts the number of columns (attributes) in the table. *) BEGIN attrib_list := NIL ; no_of_cols := 0 ; scan ; WHILE token <> '' DO BEGIN attrib_list := append_list(attrib_list,cons(cons(alloc_str(token),NIL), NIL)) ; no_of_cols := no_of_cols + 1 ; scan ; END ; END ; (* build_attrib_list *) BEGIN examples := NIL ; line := '' ; read_from_file(example_file) ; IF line <> eof_mark THEN build_attrib_list ; read_from_file(example_file) ; WHILE line <> eof_mark DO BEGIN build_a_row ; read_from_file(example_file) ; END ; END ; (* build_table *) OVERLAY FUNCTION classify_it : node_ptr ; (* is an overlay function which calls classify. We do it this way to avoid swapping due to recursion. *) FUNCTION classify(example_list,chosen_list : node_ptr) : node_ptr ; (* This is the main processing routine of the program. It is passed two lists, a list of rows, example_list and a list of attributes already chosen. The second list is simply for convenience. That way we don't have to calculate the entropy for attribute which can no longer contribute to splitting example_set. classify returns a pointer to the classification tree built from the example_set. If the example_list passed to it contains only a single class value, classify returns the class_name (attribute name of the first column) and the class value. Variables: split_elem - the column (attribute number) to split on classify_list - a temporary list to hold the tree split_value - for numerical attributes. It contains the value which produces the best numerical split. classify prints a dot on the screen each time it is entered to show you that the program really hasn't died. *) VAR split_elem : counter ; classify_list : node_ptr ; split_value : real ; PROCEDURE find_split(VAR split_elem : counter ; VAR min_split_value : real) ; (* finds the best attribute to split on. It returns the column number on which to split and for numerical attribute, the value which produces the best split. For each active attribute it constructs a class_list. the class_list has the following format: ( (attribute value #1 (class1 count) (class2 count) ....) (attribute value #2 (class1 count) (class2 count) ....) .....) The counts are the number of times each class appears in a row with a particular value of the attribute. This list is used to calculate the entropy of the attribute. *) VAR i : counter ; attrib : node_ptr ; ent,min_entropy,split_value : real ; FUNCTION entropy(list : node_ptr ; cases : counter) : real ; (* list is a class list. cases is the number of examples under consideration. This routine calculates the entropy H(C|A) from the class list. *) VAR sum,sum1,sum2,r : real ; p : node_ptr ; FUNCTION log2(x : real) : real ; BEGIN IF abs(x) < limit THEN log2 := 0.0 ELSE log2 := ln(x) / ln2 ; END ; (* log2 *) BEGIN sum := 0.0 ; WHILE list <> NIL DO BEGIN sum1 := 0.0 ; sum2 := 0.0 ; p := tail(head(list)) ; WHILE p <> NIL DO BEGIN r := num_val(head(tail(head(p)))) ; sum1 := sum1 + r * log2(r) ; sum2 := sum2 + r ; p := tail(p) ; END ; sum := sum + (sum2 * log2(sum2)) - sum1 ; list := tail(list) ; END ; entropy := sum / cases ; END ; (* entropy *) PROCEDURE numeric_entropy(elem_no : counter ; VAR num_entropy,num_split_value : real) ; (* Find the best split for a numeric attribute. elem_no is the column we are working on. num_entropy is the best entropy for this attribute and num_split_value is the split which gives that value. In addition to the class list, this routine produces an ordered list of the values for this attribute, called num_list. This list is used in making the splits. Each split is half way between successive values on the num_list. The entropy is calculated for each split. *) VAR class_list,sp,num_list : node_ptr ; sp_val,num_ent : real ; total_cases : counter ; PROCEDURE make_num_list ; (* constructs num_list. This is essentially the same routine as insert_number in build_table *) VAR new_list,p,q : node_ptr ; r : real ; inserted : boolean ; PROCEDURE add_to_new_list ; BEGIN WHILE (p <> NIL) AND (NOT inserted) DO BEGIN IF abs(r - num_val(head(p))) < limit THEN BEGIN inserted := true ; new_list := append_list(new_list,p) ; END ELSE IF r > num_val(head(p)) THEN BEGIN new_list := append_list(new_list,cons(head(p),NIL)) ; p := tail(p) ; END ELSE BEGIN new_list := append_list(new_list,append_list( cons(alloc_num(r),NIL),p)) ; inserted := true ; END ; END ; END ; (* add_to_new_list *) BEGIN test_memory ; num_list := NIL ; q := example_list ; WHILE q <> NIL DO BEGIN r := num_val(head(element(head(q),elem_no))) ; new_list := NIL ; p := num_list ; inserted := false ; add_to_new_list ; IF (p = NIL) AND (NOT inserted) THEN new_list := append_list(new_list,cons(alloc_num(r),NIL)) ; num_list := new_list ; q := tail(q) ; END ; END ; (* make_num_list *) PROCEDURE make_numeric_class_list(v : real) ; (* builds the class list. v is the value to split on. The class_list contains lists for two ranges < v and >= v. The list has the format: ( (< v (class1 count) (class2 count) .....) (>= v (class1 count) (class2 count) .....)) *) VAR temp_list,p : node_ptr ; v_str : string80 ; BEGIN str(v,v_str) ; temp_list := NIL ; p := tail(head(attrib_list)) ; WHILE p <> NIL DO BEGIN temp_list := append_list(temp_list,cons(cons(head(p), cons(alloc_num(0.0),NIL)),NIL)) ; p := tail(p) ; END ; class_list := cons(cons(alloc_str(concat('< ',v_str)),temp_list), cons(cons(alloc_str(concat('>= ',v_str)), copy_list(temp_list)),NIL)) ; END ; (* make_numeric_class_list *) PROCEDURE count_numeric_classes(v : real ; elem_no : counter) ; (* count the classes for each range. It reads the example list, extracts the value for the attribute, searches the class list and increments the appropriate class value in the list. v is the split value. *) VAR px,py : node_ptr ; PROCEDURE numeric_increment(list : node_ptr ; attr_v,atv : string80) ; (* search list (class_list) and compare attr_v to v, the split_value. atv is the class_value. Once we find the sub-list with the proper range we search its tail for atv to increment the class count. *) PROCEDURE do_increment(v_list : node_ptr) ; VAR p,q : node_ptr ; BEGIN q := tail(v_list) ; WHILE q <> NIL DO IF string_val(head(head(q))) = atv THEN BEGIN p := head(tail(head(q))) ; IF tag_value(p) = number THEN p^.num_data := p^.num_data + 1.0 ; total_cases := total_cases + 1 ; q := NIL ; END ELSE q := tail(q) ; END ; (* do_increment *) BEGIN IF toreal(attr_v) < v THEN do_increment(head(list)) ELSE do_increment(head(tail(list))) ; END ; (* numeric_increment *) BEGIN total_cases := 0 ; px := example_list ; WHILE px <> NIL DO BEGIN py := head(px) ; numeric_increment(class_list,string_val(head(element(py,elem_no))), string_val(head(py))) ; px := tail(px) ; END ; END ; (* count_numeric_classes *) BEGIN num_entropy := 1.0E+37 ; make_num_list ; sp := tail(num_list) ; saved_list := cons(num_list,saved_list) ; WHILE sp <> NIL DO BEGIN test_memory ; sp_val := num_val(head(num_list)) + ((num_val(head(sp)) - num_val(head(num_list))) / 2.0) ; make_numeric_class_list(sp_val) ; count_numeric_classes(sp_val,elem_no) ; num_ent := entropy(class_list,total_cases) ; IF num_ent < num_entropy THEN BEGIN num_entropy := num_ent ; num_split_value := sp_val ; END ; num_list := sp ; sp := tail(sp) ; END ; saved_list := tail(saved_list) ; END ; (* numeric_entropy *) PROCEDURE symbol_entropy(val_list : node_ptr ; elem_no : counter ; VAR sym_ent,sym_split_val : real) ; (* Find the entropy for a symbolic attribute. val_list is the list of possible values for this attribute from the attrib_list. elem_no is the column number, sym_ent is the entropy for this attribute. sym_split_value is always 0. This routine constructs a class list as described above and counts the classes for each value of the attribute as in the numeric case, only there is no range splitting. Symbolic attributes can result in mult-way partitions of the example_list, numerical attributes always produce binary splits. *) VAR class_list : node_ptr ; total_cases : counter ; PROCEDURE make_class_list(a_list : node_ptr) ; (* builds the initial class list. See above comments for format. *) VAR temp_list,p : node_ptr ; BEGIN WHILE a_list <> NIL DO BEGIN temp_list := cons(head(a_list),NIL) ; p := tail(head(attrib_list)) ; WHILE p <> NIL DO BEGIN temp_list := append_list(temp_list,cons(cons(head(p), cons(alloc_num(0.0),NIL)),NIL)) ; p := tail(p) ; END ; class_list := append_list(class_list,cons(temp_list,NIL)) ; a_list := tail(a_list) ; END ; END ; (* make_class_list *) PROCEDURE count_classes(elem_no : counter) ; (* traverses the example_list and counts class values. *) VAR px,py : node_ptr ; PROCEDURE increment(list : node_ptr ; attr,v : string80) ; (* search list (class_list) and compare attr to the head of each sub-list. v is the class_value. Once we find the sub-list with the proper range we search its tail for v to increment the class count. *) VAR p,q : node_ptr ; BEGIN WHILE list <> NIL DO IF string_val(head(head(list))) = attr THEN BEGIN q := tail(head(list)) ; WHILE q <> NIL DO IF string_val(head(head(q))) = v THEN BEGIN p := head(tail(head(q))) ; IF tag_value(p) = number THEN p^.num_data := p^.num_data + 1.0 ; total_cases := total_cases + 1 ; list := NIL ; q := NIL ; END ELSE q := tail(q) ; END ELSE list := tail(list) ; END ; (* increment *) BEGIN total_cases := 0 ; px := example_list ; WHILE px <> NIL DO BEGIN py := head(px) ; increment(class_list,string_val(head(element(py,elem_no))), string_val(head(py))) ; px := tail(px) ; END ; END ; (* count_classes *) BEGIN class_list := NIL ; make_class_list(val_list) ; count_classes(elem_no) ; sym_ent := entropy(class_list,total_cases) ; sym_split_val := 0.0 END ; (* symbol_entropy *) BEGIN min_entropy := 1.0E+37 ; FOR i := 2 TO no_of_cols DO BEGIN test_memory ; attrib := head(element(attrib_list,i)) ; IF NOT on_list(string_val(head(attrib)),chosen_list) THEN BEGIN IF pos(':NUMBER',toupper(string_val(head(attrib)))) > 0 THEN numeric_entropy(i,ent,split_value) ELSE symbol_entropy(tail(attrib),i,ent,split_value) ; IF ent < min_entropy THEN BEGIN min_entropy := ent ; split_elem := i ; min_split_value := split_value ; END ; END ; END ; END ; (* find_split *) FUNCTION split(elem_no : counter ; split_val : real) : node_ptr ; (* This routine splits the example_list into sets which contain a single value of the split attribute. elem_no is the column on which to split. split_val is the split value for numerical attributes. split_item points to the attribute's entry in the attribute list. split_list is the tree which is returned by split. Its format is (attribute_name (value1 classify(partition with attribute = value1) (value2 classify(partition with attribute = value2) ....... ) *) VAR split_list,split_item,new_chosen : node_ptr ; PROCEDURE numeric_split ; (* Splitting on a numerical attribute splits the examples into two groups, those with values < split_value and those with values >= split_value. new_list1 and new_list2 are the example sets for the two categories. It returns a split_list as follows: (attribute_name ('< split_val' classify(all examples with attribute value < split_val) ('>= split_val' classify(all examples with attribute value >= split_val)) Notice all the lists placed on the saved_list. These are the items that we must retain should any of the calls to classify invoke garbage collection *) VAR new_list1,new_list2,q,valu : node_ptr ; split_str : string80 ; BEGIN str(split_val,split_str) ; valu := cons(alloc_str(concat('< ',split_str)), cons(alloc_str(concat('>= ',split_str)),NIL)) ; q := example_list ; new_list1 := NIL ; new_list2 := NIL ; WHILE q <> NIL DO BEGIN IF num_val(head(element(head(q),elem_no))) < split_val THEN new_list1 := append_list(new_list1,cons(head(q),NIL)) ELSE new_list2 := append_list(new_list2,cons(head(q),NIL)) ; q := tail(q) ; END ; saved_list := cons(split_list,cons(valu,cons(new_list2,saved_list))) ; split_list := append_list(split_list, cons(cons(head(valu), cons(classify(new_list1,chosen_list),NIL)),NIL)) ; saved_list := cons(split_list,tail(saved_list)) ; split_list := append_list(split_list, cons(cons(head(tail(valu)), cons(classify(new_list2,chosen_list),NIL)),NIL)) ; saved_list := tail(tail(tail(saved_list))) ; END ; (* numeric_split *) PROCEDURE symbol_split ; (* performs the split for symbolic attributes. For each value of the attribute, it searches the example list for matches and attaches examples with a match to new_example_list. If it finds any matches it appends the value and the result of classifying the new_example_list to split_list. This is a very inefficient way of doing this. It would be better to sort the example_list using column elem_no as a key. *) VAR valu,q,new_example_list : node_ptr ; BEGIN valu := tail(split_item) ; WHILE valu <> NIL DO BEGIN q := example_list ; new_example_list := NIL ; WHILE q <> NIL DO BEGIN IF string_val(head(valu)) = string_val(head(element(head(q),elem_no))) THEN new_example_list := append_list(new_example_list,cons(head(q),NIL)) ; q := tail(q) ; END ; IF new_example_list <> NIL THEN BEGIN saved_list := cons(split_list,saved_list) ; split_list := append_list(split_list, cons(cons(head(valu), cons(classify(new_example_list,new_chosen),NIL)), NIL)) ; saved_list := tail(saved_list) ; END ; valu := tail(valu) ; END ; END ; (* symbol_split *) BEGIN split_item := head(element(attrib_list,elem_no)) ; new_chosen := cons(head(split_item),chosen_list) ; split_list := cons(head(split_item),NIL) ; IF pos(':NUMBER',toupper(string_val(head(split_item)))) > 0 THEN numeric_split ELSE symbol_split ; split := split_list ; END ; (* split *) FUNCTION single_class : boolean ; (* returns true if the example_list contains only a single class value. *) VAR first_val : string80 ; p : node_ptr ; more_than_one : boolean ; BEGIN first_val := string_val(head(head(example_list))) ; more_than_one := false ; p := tail(example_list) ; WHILE (p <> NIL) AND (NOT more_than_one) DO IF string_val(head(head(p))) <> first_val THEN more_than_one := true ELSE p := tail(p) ; single_class := NOT more_than_one ; END ; (* single_class *) BEGIN write('.') ; split_elem := 0 ; saved_list := cons(chosen_list,cons(example_list,saved_list)) ; IF NOT single_class THEN find_split(split_elem,split_value) ; IF split_elem = 0 THEN classify_list := cons(head(head(attrib_list)), cons(cons(head(head(example_list)),NIL),NIL)) ELSE classify_list := split(split_elem,split_value) ; saved_list := append_list(tail(tail(saved_list)),cons(classify_list,NIL)) ; classify := classify_list ; END ; (* classify *) BEGIN classify_it := classify(examples,NIL) ; END ; (* classify_it *) OVERLAY PROCEDURE print_rule_list(list : node_ptr) ; (* This routine transforms the tree into a set of IF/THEN statements and writes them to a file. It produces a knowledge base for MicroExpert [c], if you want to produces rules for another shell, this routine will have to be modified. *) VAR rule_count : counter ; rule_file : text_file ; file_name : string80 ; used_attribs : node_ptr ; PROCEDURE print_rule(tree,rule_list : node_ptr) ; (* Do a depth first traversal of tree. On entry rule_list contains a list of attribute value pairs. When tree is finally NIL, i.e. a terminal node of the tree has been encountered, the rule_list is printed. If entered with a non-NIL tree, the routine creates a new attribute value pair, attaches it to rule_list and explores further down the tree. It also attaches the attribute names to used_attribs so that they can be used to generate prompts. *) VAR p : node_ptr ; PROCEDURE print_the_rule(list : node_ptr) ; (* Prints the rule_list, with rules formatted for MicroExpert. *) VAR s : string80 ; PROCEDURE write_compare ; VAR comp_str : string[2] ; FUNCTION quote(w : string80) : string80 ; BEGIN quote := '''' + w + '''' ; END ; (* quote *) BEGIN comp_str := '' ; WHILE s[1] <> ' ' DO BEGIN comp_str := comp_str + s[1] ; delete(s,1,1) ; END ; strip_leading_blanks(s) ; strip_trailing_blanks(s) ; writeln(rule_file,'function compare(',attrib_value(head(head(list))), ',',quote(comp_str),',',quote(s),')') ; END ; (* write_compare *) BEGIN writeln(rule_file,rule_count) ; rule_count := rule_count + 1 ; write(rule_file,'If ') ; WHILE list <> NIL DO BEGIN s := string_val(head(tail(head(list)))) ; IF s[1] IN ['<','>'] THEN write_compare ELSE writeln(rule_file,attrib_value(head(head(list))),' is ',s) ; list := tail(list) ; IF list <> NIL THEN IF tail(list) = NIL THEN write(rule_file,'then ') ELSE write(rule_file,'and ') ; END ; writeln(rule_file,'.') ; writeln(rule_file) ; END ; (* print_the_rule *) BEGIN IF tree = NIL THEN print_the_rule(rule_list) ELSE BEGIN IF head(tree) <> head(head(attrib_list)) THEN IF NOT on_list(string_val(head(tree)),used_attribs) THEN used_attribs := cons(head(tree),used_attribs) ; p := tail(tree) ; WHILE p <> NIL DO BEGIN print_rule(head(tail(head(p))), append_list(rule_list,cons(cons(head(tree), cons(head(head(p)),NIL)), NIL))) ; p := tail(p) ; END ; END ; END ; (* print_rule *) PROCEDURE print_prompts ; (* This routine traverses the attribute list and writes a prompt for each attribute on the list. MicroExpert does not automatically generate prompts, so this is necessary. The format of the questions may seem dumb. For a working knowledge base, you will want to edit the prompts and add translations. *) VAR q : node_ptr ; BEGIN q := used_attribs ; WHILE q <> NIL DO BEGIN writeln(rule_file) ; IF pos(':NUMBER',toupper(string_val(head(q)))) > 0 THEN writeln(rule_file,'Numeric prompt ',attrib_value(head(q))) ELSE writeln(rule_file,'Prompt ',attrib_value(head(q))) ; writeln(rule_file,'What is the value of ',attrib_value(head(q)),' ?') ; writeln(rule_file,'.') ; q := tail(q) ; END ; END ; (* print_prompts *) BEGIN writeln ; write('Output the rules to what file (Press <ENTER> for screen.) ? ') ; readln(file_name) ; strip_leading_blanks(file_name) ; IF file_name = '' THEN file_name := 'con:' ; assign(rule_file,file_name) ; rewrite(rule_file) ; writeln(rule_file) ; rule_count := 1 ; used_attribs := NIL ; print_rule(list,NIL) ; print_prompts ; IF is_console(rule_file) THEN wait ; close(rule_file) ; END ; (* print_rule_list *) OVERLAY PROCEDURE print_tree(list : node_ptr) ; (* Print the tree. This is really just a pretty print routine, which indents each sub_list. *) VAR indent_level : counter ; tree_file : text_file ; file_name : string80 ; PROCEDURE print_the_tree(tree : node_ptr ; VAR indent : counter) ; VAR p : node_ptr ; BEGIN IF tree <> NIL THEN CASE tree^.tag OF number, symbol : BEGIN write(tree_file,attrib_value(tree),' ') ; indent := indent + length(attrib_value(tree)) + 1 ; END ; cons_node : BEGIN write(tree_file,'(') ; indent := indent + 1 ; print_the_tree(head(tree),indent) ; p := tail(tree) ; WHILE p <> NIL DO BEGIN print_the_tree(head(p),indent) ; IF list_length(p) > 1 THEN BEGIN writeln(tree_file) ; write(tree_file,' ' : indent) ; END ; p := tail(p) ; END ; indent := indent - length(attrib_value(head(tree))) - 2 ; write(tree_file,') ') ; END ; END ; END ; (* print_the_tree *) BEGIN writeln ; write('Output the tree to what file (Press <ENTER> for screen.) ? ') ; readln(file_name) ; strip_leading_blanks(file_name) ; IF file_name = '' THEN file_name := 'con:' ; assign(tree_file,file_name) ; rewrite(tree_file) ; writeln(tree_file) ; indent_level := 0 ; print_the_tree(list,indent_level) ; writeln(tree_file) ; writeln(tree_file) ; IF is_console(tree_file) THEN wait ; close(tree_file) ; END ; (* print_tree *) OVERLAY FUNCTION got_file : boolean ; (* asks for an example file name and tries to open it. If it can't open the file, it complains and asks for a new file *) VAR example_name : string80 ; BEGIN writeln ; write('Example File (Press <ENTER> to quit.) : ') ; readln(example_name) ; IF example_name = '' THEN got_file := false ELSE BEGIN IF pos('.',example_name) = 0 THEN example_name := concat(example_name,'.EX') ; IF open(example_file,example_name) THEN got_file := true ELSE BEGIN writeln ; writeln(toupper(example_name),' could not be found.') ; writeln ; got_file := got_file ; END ; END ; END ; (* got_file *) BEGIN free := NIL ; initial_heap := HeapPtr ; total_free := 0.0 ; clrscr ; WHILE got_file DO BEGIN build_table ; close(example_file) ; IF NOT conflicts(examples) THEN BEGIN saved_list := cons(attrib_list,examples) ; expand(examples,examples) ; writeln ; saved_list := cons(attrib_list,examples) ; c_list := classify_it ; saved_list := cons(c_list,attrib_list) ; writeln ; test_memory ; print_tree(c_list) ; writeln ; test_memory ; writeln ; print_rule_list(c_list) ; clrscr ; END ; END ; END.