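# The class below relies on an entropy() helper (as well as utils.mode and
# DecisionStumpErrorRate) that the assignment's starter code is assumed to provide.
# As a reference only, a minimal sketch of such a helper is given here, assuming it
# takes a probability vector p that sums to 1 and treats 0 * log(0) as 0; prefer the
# starter code's own version if one exists.
import numpy as np

def entropy(p):
    """Shannon entropy of a discrete distribution p, with 0 * log(0) treated as 0."""
    p = np.asarray(p, dtype=float)
    nonzero = p > 0
    return -np.sum(p[nonzero] * np.log(p[nonzero]))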
# DecisionStumpErrorRate and utils (for utils.mode) are assumed to come from the
# assignment's starter code.
class DecisionStumpInfoGain(DecisionStumpErrorRate):
    # This is not required, but one way to simplify the code is to have this class
    # inherit from DecisionStumpErrorRate. Only fit() is overridden here; __init__
    # and predict are reused from the parent class.

    y_hat_yes = None
    y_hat_no = None
    j_best = None
    t_best = None

    def fit(self, X, y):
        """Fit the stump using the information gain criterion."""
        n, d = X.shape

        # Baseline prediction: the mode of y, used if no split is selected.
        count = np.bincount(y, minlength=2)
        y_mode = np.argmax(count)

        self.y_hat_yes = y_mode
        self.y_hat_no = None
        self.j_best = None
        self.t_best = None

        # If all labels are identical, no split can improve on the baseline.
        if np.unique(y).size <= 1:
            return

        max_gain = 0  # only accept splits with strictly positive gain
        # Entropy of the full label distribution; reused for every candidate split.
        entropy_H = entropy(count / np.sum(count))

        # Loop over features and candidate thresholds looking for the best split.
        for j in range(d):
            for i in range(n):
                t = X[i, j]
                is_greater_or_equal = X[:, j] > t

                # Labels of the samples assigned to each side of the split.
                class_a_samples = y[is_greater_or_equal]   # labels above the threshold
                class_b_samples = y[~is_greater_or_equal]  # labels at or below the threshold

                # Size of each side of the split.
                len_a = len(class_a_samples)
                len_b = len(class_b_samples)

                # Skip degenerate splits (also avoids division by zero below).
                if len_a == 0 or len_b == 0:
                    continue

                # Entropy of each side; the arrays passed to entropy() sum to 1.
                entropy_a = entropy(np.bincount(class_a_samples, minlength=2) / len_a)
                entropy_b = entropy(np.bincount(class_b_samples, minlength=2) / len_b)

                # Information gain: H(y) - (len_a/n) * H(y_a) - (len_b/n) * H(y_b)
                info_gain = entropy_H - (len_a / n) * entropy_a - (len_b / n) * entropy_b

                # Keep the split with the largest gain seen so far.
                if info_gain > max_gain:
                    max_gain = info_gain
                    self.j_best = j
                    self.t_best = t
                    self.y_hat_yes = utils.mode(y[is_greater_or_equal])
                    self.y_hat_no = utils.mode(y[~is_greater_or_equal])
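
# A quick smoke test of the stump above. This is a sketch under the assumption that
# DecisionStumpErrorRate and utils.mode are available from the assignment's starter
# code; the toy dataset below is made up purely for illustration.
if __name__ == "__main__":
    X_toy = np.array([[1.0], [2.0], [3.0], [10.0], [11.0], [12.0]])
    y_toy = np.array([0, 0, 0, 1, 1, 1])

    stump = DecisionStumpInfoGain()
    stump.fit(X_toy, y_toy)

    # Expect a split on feature 0 at threshold 3 (separating the two clusters),
    # predicting 1 above the threshold and 0 at or below it.
    print("j_best:", stump.j_best)
    print("t_best:", stump.t_best)
    print("y_hat_yes:", stump.y_hat_yes, "y_hat_no:", stump.y_hat_no)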