_ALGORITHM ALLEY_ by Andrew Colin Listing One typedef struct node { UINT idx; /* ID code for attribute */ REAL threshold; /* Numerical threshold for attribute test */ struct node *on; /* Address of 'on' node */ struct node *off; /* Address of 'off' node */ struct node *parent; /* Addess of parent node */ } NODE; Listing Two NEGENTROPY negentropy ( REAL **data, UINT n_samples, NODE *local, UINT target) { /* Calculates the entropy of classification of an attribute, given a table * of attributes already used, the attribute on which splitting is to be * taken, and the target attribute. Entropy is calculated in bits, so logs * are taken to base 2 by dividing by LN_2. * The returned value always lies in the (closed) range [0, 1]. */ NEGENTROPY ret_val; NODE *_node, *_parent; UINT on_ctr, off_ctr, p1, p2, i, _match; REAL p_on, p_off, negentropy_on, negentropy_off; on_ctr = off_ctr = p1 = p2 = 0; /* Scan through all supplied data samples */ for (i=0; i= threshold' matches at each step, * where idx and threshold are taken from each node in turn. */ _match = 1; _node = local; while (_node->parent != NULL) { /* If at root node, all entries match */ _parent = _node->parent; if (_node == _parent->on) { /* if parent node is ON */ if (data[i][_parent->idx] < _parent->threshold) _match = 0; } else if (_node == _parent->off) { /* if parent node is OFF */ if (data[i][_parent->idx] >= _parent->threshold) _match = 0; } _node = _parent; } if (_match) { if (data[i][local->idx] >= local->threshold) { on_ctr++; if (data[i][target] >= 0.5) p1++; } else { off_ctr++; if (data[i][target] >= 0.5) p2++; } } } /* for (i=0; i 0) { p_on = (REAL) p1 / (REAL) on_ctr; p_off = 1 - p_on; negentropy_on = -entropy (p_on) - entropy (p_off); } else negentropy_on = 0.0; /* 2: Entropy of subtable with activation OFF */ if (off_ctr > 0) { p_on = (REAL) p2 / (REAL) off_ctr; p_off = 1 - p_on; negentropy_off = -entropy (p_on) - entropy (p_off); } else negentropy_off = 0.0; ret_val.ne = (negentropy_on * on_ctr + negentropy_off * off_ctr); ret_val.ne /= (on_ctr + off_ctr); /* If all values examined were the same, set 'ret_val.status' to * the target value since this will be an end-of-branch node */ if ((p1 == on_ctr) && (p2 == off_ctr)) ret_val.status = ON; else if ((p1 == 0) && (p2 == 0)) ret_val.status = OFF; else ret_val.status = INACTIVE; return ret_val; } Listing Three NODE* ID3 ( MATRIX *matrix, NODE* parent, UINT target, UINT state) /* Routine to build a decision tree, based on Quinlan's ID3 algorithm. */ { NEGENTROPY negentropy_struct; NODE *node; UINT n_vars = matrix->width, n_samples = matrix->height, i, j, split; REAL **data = matrix->data; REAL best_threshold, min_negentropy, _negentropy; /* Allocate memory for this node */ node = (NODE*) malloc (sizeof(NODE)); if (!node) err_exit (__FILE__, __LINE__); /* Set up links in decision tree */ node->parent = parent; /* Set address of parent node */ if (parent != NULL) /* parent to child; not relevant for root node */ { /* Pass address of this node to the parent node */ if (state == ON) parent->on = node; else if (state == OFF) parent->off = node; } /* Select attribute with lowest negentropy for splitting. Scan through * ALL attributes (except target) and ALL data samples. This is inefficient * for data sets with repeated values, but will do for illustrative purposes */ min_negentropy = 1.0; for (i=0; iidx = i; node->threshold = data[j][i]; /* ...and calculate the negentropy of this partition */ negentropy_struct = negentropy (data, n_samples, node, target); _negentropy = negentropy_struct.ne; /* If this negentropy is lower than any other, retain the index and threshold for future use */ if (_negentropy < min_negentropy) { min_negentropy = _negentropy; split = i; best_threshold = data[j][i]; } } /*if (i != target)*/ } /*for (j=0; jidx = split; node->threshold = best_threshold; /* If the negentropy routine found itself at an end-of-branch * for the decision tree, the 'status' flag in 'negentropy_struct' * is set to ON or OFF and the node labelled accordingly. Otherwise, * ID3 continues to call itself until all end-of-branch nodes are found. */ if (negentropy_struct.status != INACTIVE) { node->on = node->off = NULL; node->idx = negentropy_struct.status; } else { node->on = ID3 (matrix, node, target, ON); node->off = ID3 (matrix, node, target, OFF); } return node; } Listing Four Welcome to ID3 Last compiled on Dec 28 1995, 09:15:09 if { Ringo >= 0.33 then if { George >= 0.63 then if { Paul >= 0.58 then if { George >= 0.99 then OFF else ON } else if { Ringo >= 0.77 then if { Paul >= 0.13 then ON else OFF } else if { John >= 0.52 then if { Ringo >= 0.57 then ON else if { John >= 0.79 then ON else OFF } } else OFF } } } else ON } else if { George >= 0.34 then if { John >= 0.52 then if { Paul >= 0.43 then if { George >= 0.76 then OFF else if { John >= 0.65 then ON else if { John >= 0.52 then if { Paul >= 0.79 then ON else OFF } else ON } } } else OFF } else OFF } else if { John >= 0.49 then ON else if { Ringo >= 0.08 then if { John >= 0.01 then ON else OFF } else OFF } } } } Figure 1: Two different decision trees to answer the question: Does the animal lay eggs? a) Does animal have feathers? Yes: Lays eggs (raven, albatross, ostrich) No : Is animal warmblooded? Yes: Does not lay eggs (koala, dolphin) No: Lays eggs (crocodile) b) Does animal have fur? Yes: Does not lay eggs (koala) No: Does animal have feathers? Yes: Lays eggs (raven, albatross, ostrich) No: Is animal warmblooded? Yes: Does not lay eggs (dolphin) No: Lays eggs (crocodile) Figure 2: Definition of negentropy. p(ON) and p(OFF) are the measured probabilities of an answer being true or false. If p(ON) and p(OFF) are both non-zero: -p(ON) log2 p(ON) - p(OFF) log2 p(OFF) Otherwise: 0 Figure 3: Decision tree for orchids Is the pistil width >= 1.80? Yes: class Setosa No : Is the pistil length >= 5.60? Yes: class Setosa No : class Virgin