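# The class below relies on an entropy() helper (as well as utils.mode and
# DecisionStumpErrorRate) that the assignment's starter code is assumed to provide.
# As a reference only, a minimal sketch of such a helper is given here, assuming it
# takes a probability vector p that sums to 1 and treats 0 * log(0) as 0; prefer the
# starter code's own version if one exists.
import numpy as np

def entropy(p):
    """Shannon entropy of a discrete distribution p, with 0 * log(0) treated as 0."""
    p = np.asarray(p, dtype=float)
    nonzero = p > 0
    return -np.sum(p[nonzero] * np.log(p[nonzero]))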
# DecisionStumpErrorRate and utils (for utils.mode) are assumed to come from the
# assignment's starter code.
class DecisionStumpInfoGain(DecisionStumpErrorRate):
    # This is not required, but one way to simplify the code is to have this class
    # inherit from DecisionStumpErrorRate. Only fit() is overridden here; __init__
    # and predict are reused from the parent class.

    y_hat_yes = None
    y_hat_no = None
    j_best = None
    t_best = None

    def fit(self, X, y):
        """Fit the stump using the information gain criterion."""
        n, d = X.shape

        # Baseline prediction: the mode of y, used if no split is selected.
        count = np.bincount(y, minlength=2)
        y_mode = np.argmax(count)

        self.y_hat_yes = y_mode
        self.y_hat_no = None
        self.j_best = None
        self.t_best = None

        # If all labels are identical, no split can improve on the baseline.
        if np.unique(y).size <= 1:
            return

        max_gain = 0  # only accept splits with strictly positive gain
        # Entropy of the full label distribution; reused for every candidate split.
        entropy_H = entropy(count / np.sum(count))

        # Loop over features and candidate thresholds looking for the best split.
        for j in range(d):
            for i in range(n):
                t = X[i, j]
                is_greater_or_equal = X[:, j] > t

                # Labels of the samples assigned to each side of the split.
                class_a_samples = y[is_greater_or_equal]   # labels above the threshold
                class_b_samples = y[~is_greater_or_equal]  # labels at or below the threshold

                # Size of each side of the split.
                len_a = len(class_a_samples)
                len_b = len(class_b_samples)

                # Skip degenerate splits (also avoids division by zero below).
                if len_a == 0 or len_b == 0:
                    continue

                # Entropy of each side; the arrays passed to entropy() sum to 1.
                entropy_a = entropy(np.bincount(class_a_samples, minlength=2) / len_a)
                entropy_b = entropy(np.bincount(class_b_samples, minlength=2) / len_b)

                # Information gain: H(y) - (len_a/n) * H(y_a) - (len_b/n) * H(y_b)
                info_gain = entropy_H - (len_a / n) * entropy_a - (len_b / n) * entropy_b

                # Keep the split with the largest gain seen so far.
                if info_gain > max_gain:
                    max_gain = info_gain
                    self.j_best = j
                    self.t_best = t
                    self.y_hat_yes = utils.mode(y[is_greater_or_equal])
                    self.y_hat_no = utils.mode(y[~is_greater_or_equal])
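
# A quick smoke test of the stump above. This is a sketch under the assumption that
# DecisionStumpErrorRate and utils.mode are available from the assignment's starter
# code; the toy dataset below is made up purely for illustration.
if __name__ == "__main__":
    X_toy = np.array([[1.0], [2.0], [3.0], [10.0], [11.0], [12.0]])
    y_toy = np.array([0, 0, 0, 1, 1, 1])

    stump = DecisionStumpInfoGain()
    stump.fit(X_toy, y_toy)

    # Expect a split on feature 0 at threshold 3 (separating the two clusters),
    # predicting 1 above the threshold and 0 at or below it.
    print("j_best:", stump.j_best)
    print("t_best:", stump.t_best)
    print("y_hat_yes:", stump.y_hat_yes, "y_hat_no:", stump.y_hat_no)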