import numpy as np

import utils  # assumed helper module from the assignment; provides utils.mode()

# DecisionStumpErrorRate and the entropy() helper are assumed to be defined
# elsewhere in the same assignment codebase; only the info-gain stump is shown here.


class DecisionStumpInfoGain(DecisionStumpErrorRate):
    # This is not required, but one way to simplify the code is
    # to have this class inherit from DecisionStumpErrorRate.
    # Which methods (init, fit, predict) do you need to overwrite?

    y_hat_yes = None
    y_hat_no = None
    j_best = None
    t_best = None

    # Fit using the information gain criterion mentioned in lecture
    def fit(self, X, y):
        n, d = X.shape

        # Baseline prediction: the most common label in y
        count = np.bincount(y, minlength=2)
        y_mode = np.argmax(count)

        self.y_hat_yes = y_mode
        self.y_hat_no = None
        self.j_best = None
        self.t_best = None

        # If all labels are the same, no split can improve on the baseline
        if np.unique(y).size <= 1:
            return

        max_gain = 0  # best information gain found so far
        entropy_H = entropy(count / np.sum(count))  # entropy of the entire dataset, reused in every iteration

        # Loop over features and thresholds looking for the best split
        for j in range(d):
            for i in range(n):
                t = X[i, j]

                # Split on whether the feature value is strictly greater than the threshold
                is_greater = X[:, j] > t

                # Labels of the samples assigned to each side of the split
                class_a_samples = y[is_greater]   # labels of samples above the threshold
                class_b_samples = y[~is_greater]  # labels of samples at or below the threshold

                # Size of each split
                len_a = len(class_a_samples)
                len_b = len(class_b_samples)

                # Skip degenerate splits with an empty side (avoids division by zero)
                if len_a == 0 or len_b == 0:
                    continue

                # Entropy of each group; the arrays passed to entropy() sum to 1
                entropy_a = entropy(np.bincount(class_a_samples, minlength=2) / len_a)
                entropy_b = entropy(np.bincount(class_b_samples, minlength=2) / len_b)

                # Information gain: total entropy minus the size-weighted entropy of the two groups
                info_gain = entropy_H - ((len_a / n) * entropy_a) - ((len_b / n) * entropy_b)

                # Keep this split if it beats the best gain so far
                if info_gain > max_gain:
                    max_gain = info_gain
                    self.j_best = j
                    self.t_best = t
                    self.y_hat_yes = utils.mode(y[is_greater])
                    self.y_hat_no = utils.mode(y[~is_greater])
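
For reference, here is a hypothetical usage sketch (not part of the original paste). It assumes the assignment's entropy() helper and utils module are importable alongside the class, and that the inherited predict() from DecisionStumpErrorRate routes examples with X[:, j_best] > t_best to y_hat_yes and the rest to y_hat_no, as the comments above suggest.

# Hypothetical usage sketch; X, y, and the expected outputs are illustrative only.
import numpy as np

X = np.array([[1.0], [2.0], [3.0], [10.0], [11.0], [12.0]])
y = np.array([0, 0, 0, 1, 1, 1])

model = DecisionStumpInfoGain()
model.fit(X, y)

print(model.j_best, model.t_best)        # expected: feature 0, threshold 3.0
print(model.y_hat_yes, model.y_hat_no)   # expected: 1 above the threshold, 0 otherwise
print(model.predict(X))                  # inherited predict(); expected [0 0 0 1 1 1]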