import numpy as np

class LinearRegression:
    def __init__(self):
        self.coefficients = None  # To store the model coefficients after fitting

    def fit(self, X, y):
        """
        Fit the linear regression model to the data.

        Parameters:
        X (numpy.ndarray): 2D array of shape (n_samples, n_features)
        y (numpy.ndarray): 1D or 2D array of shape (n_samples,) or (n_samples, n_targets)
        """
        # Ensure X is a 2D array
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        
        # Add a column of ones to X for the intercept term
        ones = np.ones((X.shape[0], 1))
        X_b = np.hstack((ones, X))  # X with bias term

        # Compute the coefficients using the Normal Equation
        # β = (XᵀX)^(-1) Xᵀy
        try:
            XTX = X_b.T @ X_b
            XTy = X_b.T @ y
            self.coefficients = np.linalg.inv(XTX) @ XTy
        except np.linalg.LinAlgError:
            # If XTX is singular, use the pseudo-inverse
            self.coefficients = np.linalg.pinv(X_b) @ y

    def predict(self, X):
        """
        Predict using the linear regression model.

        Parameters:
        X (numpy.ndarray): 2D array of shape (n_samples, n_features)

        Returns:
        numpy.ndarray: Predicted values
        """
        if self.coefficients is None:
            raise ValueError("Model has not been fitted yet.")
        
        # Ensure X is a 2D array
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        
        # Add a column of ones to X for the intercept term
        ones = np.ones((X.shape[0], 1))
        X_b = np.hstack((ones, X))  # X with bias term

        # Compute predictions
        return X_b @ self.coefficients

    def score(self, X, y):
        """
        Calculate the coefficient of determination R^2 of the prediction.

        Parameters:
        X (numpy.ndarray): 2D array of shape (n_samples, n_features)
        y (numpy.ndarray): 1D or 2D array of shape (n_samples,) or (n_samples, n_targets)

        Returns:
        float: R^2 score
        """
        y_pred = self.predict(X)
        ss_res = np.sum((y - y_pred) ** 2)
        ss_tot = np.sum((y - np.mean(y, axis=0)) ** 2)  # per-target mean so 2D y is handled correctly
        return 1 - ss_res / ss_tot
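
# --- Minimal usage sketch for LinearRegression (illustrative only) ---
# The synthetic data below is an assumption made purely for demonstration; any
# (n_samples, n_features) array with matching targets works the same way.
if __name__ == "__main__":
    np.random.seed(0)
    X_demo = np.random.randn(100, 2)
    y_demo = 3.0 + 2.0 * X_demo[:, 0] - 1.0 * X_demo[:, 1] + 0.1 * np.random.randn(100)

    lin_reg = LinearRegression()
    lin_reg.fit(X_demo, y_demo)
    print("Coefficients (intercept first):", lin_reg.coefficients)
    print("R^2:", lin_reg.score(X_demo, y_demo))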

import numpy as np

class LogisticRegression:
    def __init__(self, learning_rate=0.01, num_iterations=1000, fit_intercept=True, verbose=False):
        """
        Initialize the Logistic Regression model.

        Parameters:
        - learning_rate (float): The step size for gradient descent updates.
        - num_iterations (int): Number of iterations for training.
        - fit_intercept (bool): Whether to include an intercept term.
        - verbose (bool): If True, prints loss every 100 iterations.
        """
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.fit_intercept = fit_intercept
        self.verbose = verbose
        self.weights = None  # Model weights (the intercept, when fit, is folded into the first weight)

    def __add_intercept(self, X):
        """
        Add an intercept term to the feature matrix.

        Parameters:
        - X (numpy.ndarray): Feature matrix.

        Returns:
        - numpy.ndarray: Feature matrix with intercept term.
        """
        intercept = np.ones((X.shape[0], 1))
        return np.hstack((intercept, X))

    def __sigmoid(self, z):
        """
        Compute the sigmoid function.

        Parameters:
        - z (numpy.ndarray): Linear combination of inputs and weights.

        Returns:
        - numpy.ndarray: Sigmoid of input z.
        """
        return 1 / (1 + np.exp(-z))

    def __loss(self, h, y):
        """
        Compute the loss using binary cross-entropy.

        Parameters:
        - h (numpy.ndarray): Predicted probabilities.
        - y (numpy.ndarray): True labels.

        Returns:
        - float: Loss value.
        """
        m = y.shape[0]
        # To avoid log(0), we clip h to [1e-15, 1 - 1e-15]
        h = np.clip(h, 1e-15, 1 - 1e-15)
        return (-1 / m) * (np.dot(y, np.log(h)) + np.dot((1 - y), np.log(1 - h)))

    def fit(self, X, y):
        """
        Fit the Logistic Regression model to the data using Gradient Descent.

        Parameters:
        - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features).
        - y (numpy.ndarray): Binary labels of shape (n_samples,).
        """
        if self.fit_intercept:
            X = self.__add_intercept(X)
        
        # Initialize weights
        self.weights = np.zeros(X.shape[1])

        for i in range(self.num_iterations):
            z = np.dot(X, self.weights)
            h = self.__sigmoid(z)
            gradient = np.dot(X.T, (h - y)) / y.size
            self.weights -= self.learning_rate * gradient

            if self.verbose and i % 100 == 0:
                loss = self.__loss(h, y)
                print(f'Iteration {i}: loss {loss}')

    def predict_proba(self, X):
        """
        Predict probability estimates for the input data.

        Parameters:
        - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features).

        Returns:
        - numpy.ndarray: Predicted probabilities of shape (n_samples,).
        """
        if self.fit_intercept:
            X = self.__add_intercept(X)
        
        return self.__sigmoid(np.dot(X, self.weights))

    def predict(self, X, threshold=0.5):
        """
        Predict binary labels for the input data.

        Parameters:
        - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features).
        - threshold (float): Threshold for classifying probabilities.

        Returns:
        - numpy.ndarray: Predicted binary labels of shape (n_samples,).
        """
        return (self.predict_proba(X) >= threshold).astype(int)

    def score(self, X, y, threshold=0.5):
        """
        Calculate the accuracy of the model.

        Parameters:
        - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features).
        - y (numpy.ndarray): True binary labels of shape (n_samples,).
        - threshold (float): Threshold for classifying probabilities.

        Returns:
        - float: Accuracy score.
        """
        preds = self.predict(X, threshold)
        return (preds == y).mean()
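
# --- Minimal usage sketch for LogisticRegression (illustrative only) ---
# Assumes a simple linearly separable labeling rule just to exercise the API.
if __name__ == "__main__":
    np.random.seed(1)
    X_demo = np.random.randn(200, 2)
    y_demo = (X_demo[:, 0] + X_demo[:, 1] > 0).astype(int)

    log_reg = LogisticRegression(learning_rate=0.1, num_iterations=2000)
    log_reg.fit(X_demo, y_demo)
    print("Training accuracy:", log_reg.score(X_demo, y_demo))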


import numpy as np

def gini_index(y):
    """
    Calculate the Gini Index for a list of classes.

    Parameters:
    - y (numpy.ndarray): Array of class labels.

    Returns:
    - float: Gini Index.
    """
    classes, counts = np.unique(y, return_counts=True)
    probabilities = counts / counts.sum()
    gini = 1 - np.sum(probabilities ** 2)
    return gini

import numpy as np

def entropy(y):
    """
    Calculate the Entropy for a list of classes.

    Parameters:
    - y (numpy.ndarray): Array of class labels.

    Returns:
    - float: Entropy.
    """
    classes, counts = np.unique(y, return_counts=True)
    probabilities = counts / counts.sum()
    # To handle log(0), we add a small epsilon where probability is zero
    epsilon = 1e-15
    entropy = -np.sum(probabilities * np.log2(probabilities + epsilon))
    return entropy
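
# --- Quick sanity check for gini_index and entropy (illustrative only) ---
# A 50/50 binary label array should give the maximum two-class impurity:
# Gini = 0.5 and entropy = 1.0 bit (up to the epsilon used above).
if __name__ == "__main__":
    labels = np.array([0, 0, 1, 1])
    print("Gini:", gini_index(labels))
    print("Entropy:", entropy(labels))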


import numpy as np

def information_gain(y, X_column, criterion='entropy'):
    """
    Calculate the Information Gain of a dataset for a specific feature.

    Parameters:
    - y (numpy.ndarray): Array of class labels.
    - X_column (numpy.ndarray): Array of feature values.
    - criterion (str): 'entropy' or 'gini' to specify the impurity measure.

    Returns:
    - float: Information Gain.
    """
    # Calculate the base impurity
    if criterion == 'entropy':
        base_impurity = entropy(y)
    elif criterion == 'gini':
        base_impurity = gini_index(y)
    else:
        raise ValueError("Criterion must be 'entropy' or 'gini'")
    
    # Get unique values and their counts
    values, counts = np.unique(X_column, return_counts=True)
    
    # Calculate the weighted impurity after the split
    weighted_impurity = 0
    for v, count in zip(values, counts):
        y_subset = y[X_column == v]
        if criterion == 'entropy':
            impurity = entropy(y_subset)
        else:
            impurity = gini_index(y_subset)
        weighted_impurity += (count / len(y)) * impurity
    
    # Information Gain is the reduction in impurity
    info_gain = base_impurity - weighted_impurity
    return info_gain
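
# --- Minimal usage sketch for information_gain (illustrative only) ---
# A toy categorical feature that perfectly separates the labels should yield
# a gain equal to the parent impurity (~1.0 bit for entropy, 0.5 for Gini).
if __name__ == "__main__":
    y_toy = np.array([0, 0, 1, 1])
    feature_toy = np.array(['a', 'a', 'b', 'b'])
    print("Gain (entropy):", information_gain(y_toy, feature_toy, criterion='entropy'))
    print("Gain (gini):", information_gain(y_toy, feature_toy, criterion='gini'))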

import numpy as np

class DecisionTree:
    def __init__(self, max_depth=None, min_samples_split=2, criterion='entropy'):
        """
        Initialize the Decision Tree.

        Parameters:
        - max_depth (int): Maximum depth of the tree.
        - min_samples_split (int): Minimum number of samples required to split a node.
        - criterion (str): 'entropy' or 'gini' to specify the impurity measure.
        """
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.root = None

    class Node:
        def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
            """
            Initialize a node.

            Parameters:
            - feature (int): Feature index to split on.
            - threshold (float): Threshold value to split.
            - left (Node): Left child node.
            - right (Node): Right child node.
            - value (int/float): Class label if it's a leaf node.
            """
            self.feature = feature
            self.threshold = threshold
            self.left = left
            self.right = right
            self.value = value

    def fit(self, X, y):
        """
        Build the decision tree.

        Parameters:
        - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features).
        - y (numpy.ndarray): Class labels of shape (n_samples,).
        """
        self.root = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        num_samples, num_features = X.shape
        num_labels = len(np.unique(y))

        # Stopping criteria
        if (self.max_depth is not None and depth >= self.max_depth) or num_labels == 1 or num_samples < self.min_samples_split:
            leaf_value = self._most_common_label(y)
            return self.Node(value=leaf_value)

        # Find the best split
        best_feature, best_threshold, best_info_gain = self._best_split(X, y, num_features)

        # Stop if no valid split exists or the best split yields no gain
        if best_feature is None or best_info_gain <= 0:
            leaf_value = self._most_common_label(y)
            return self.Node(value=leaf_value)

        # Split the dataset
        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold

        left = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right = self._build_tree(X[right_indices], y[right_indices], depth + 1)
        return self.Node(feature=best_feature, threshold=best_threshold, left=left, right=right)

    def _best_split(self, X, y, num_features):
        best_info_gain = -1
        best_feature, best_threshold = None, None

        for feature in range(num_features):
            X_column = X[:, feature]
            thresholds = np.unique(X_column)
            for threshold in thresholds:
                # Split
                left_indices = X_column <= threshold
                right_indices = X_column > threshold

                if len(y[left_indices]) == 0 or len(y[right_indices]) == 0:
                    continue

                # Compute the Information Gain of this binary split; the boolean
                # mask acts as a two-valued "feature" (True = left, False = right).
                current_info_gain = information_gain(y, left_indices, criterion=self.criterion)

                if current_info_gain > best_info_gain:
                    best_info_gain = current_info_gain
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold, best_info_gain

    def _most_common_label(self, y):
        """
        Find the most common class label.

        Parameters:
        - y (numpy.ndarray): Array of class labels.

        Returns:
        - int/float: Most common class label.
        """
        values, counts = np.unique(y, return_counts=True)
        return values[np.argmax(counts)]

    def predict(self, X):
        """
        Predict class labels for samples in X.

        Parameters:
        - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features).

        Returns:
        - numpy.ndarray: Predicted class labels.
        """
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        """
        Traverse the tree to make a prediction.

        Parameters:
        - x (numpy.ndarray): Single sample.
        - node (Node): Current node in the tree.

        Returns:
        - int/float: Predicted class label.
        """
        if node.value is not None:
            return node.value

        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        else:
            return self._traverse_tree(x, node.right)

    def score(self, X, y):
        """
        Calculate the accuracy of the model.

        Parameters:
        - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features).
        - y (numpy.ndarray): True class labels.

        Returns:
        - float: Accuracy score.
        """
        preds = self.predict(X)
        return np.mean(preds == y)
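
# --- Minimal usage sketch for DecisionTree (illustrative only) ---
# A tiny, perfectly separable dataset is assumed here purely to exercise
# fit, predict, and score; it is not a meaningful benchmark.
if __name__ == "__main__":
    X_demo = np.array([[2.0, 3.0], [1.0, 1.0], [3.0, 4.0],
                       [6.0, 7.0], [7.0, 8.0], [8.0, 6.0]])
    y_demo = np.array([0, 0, 0, 1, 1, 1])

    tree = DecisionTree(max_depth=3, criterion='gini')
    tree.fit(X_demo, y_demo)
    print("Predictions:", tree.predict(X_demo))
    print("Accuracy:", tree.score(X_demo, y_demo))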

import numpy as np
from collections import Counter

class RandomForest:
    def __init__(self, n_trees=10, max_depth=None, min_samples_split=2, criterion='entropy', max_features='sqrt'):
        """
        Initialize the Random Forest.

        Parameters:
        - n_trees (int): Number of trees in the forest.
        - max_depth (int): Maximum depth of each tree.
        - min_samples_split (int): Minimum number of samples required to split a node.
        - criterion (str): 'entropy' or 'gini' to specify the impurity measure.
        - max_features (str or int): Number of features to consider when looking for the best split.
                                     If 'sqrt', then max_features = sqrt(n_features).
        """
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.max_features = max_features
        self.trees = []

    def fit(self, X, y):
        """
        Build the Random Forest.

        Parameters:
        - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features).
        - y (numpy.ndarray): Class labels of shape (n_samples,).
        """
        self.trees = []
        n_samples, n_features = X.shape

        # Determine number of features to sample
        if self.max_features == 'sqrt':
            max_features = int(np.sqrt(n_features))
        elif isinstance(self.max_features, int):
            max_features = self.max_features
        else:
            max_features = n_features  # Use all features

        for _ in range(self.n_trees):
            # Bootstrap sampling
            indices = np.random.choice(n_samples, size=n_samples, replace=True)
            X_sample = X[indices]
            y_sample = y[indices]

            # Feature sampling
            feature_indices = np.random.choice(n_features, size=max_features, replace=False)

            # Train a Decision Tree on the sampled data and features
            tree = DecisionTree(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                criterion=self.criterion
            )
            tree.fit(X_sample[:, feature_indices], y_sample)

            # Store the tree and the feature indices used
            self.trees.append((tree, feature_indices))

    def predict(self, X):
        """
        Predict class labels for samples in X.

        Parameters:
        - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features).

        Returns:
        - numpy.ndarray: Predicted class labels.
        """
        tree_preds = np.array([
            tree.predict(X[:, feature_indices]) 
            for tree, feature_indices in self.trees
        ])

        # Transpose to have shape (n_samples, n_trees)
        tree_preds = tree_preds.T

        # Majority vote
        y_pred = np.array([Counter(row).most_common(1)[0][0] for row in tree_preds])
        return y_pred

    def score(self, X, y):
        """
        Calculate the accuracy of the model.

        Parameters:
        - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features).
        - y (numpy.ndarray): True class labels.

        Returns:
        - float: Accuracy score.
        """
        preds = self.predict(X)
        return np.mean(preds == y)
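
# --- Minimal usage sketch for RandomForest (illustrative only) ---
# Two Gaussian clusters are assumed as synthetic data; seeding the legacy
# global RNG makes the bootstrap and feature sampling reproducible.
if __name__ == "__main__":
    np.random.seed(42)
    X_demo = np.vstack([np.random.normal(0.0, 1.0, size=(50, 4)),
                        np.random.normal(3.0, 1.0, size=(50, 4))])
    y_demo = np.array([0] * 50 + [1] * 50)

    forest = RandomForest(n_trees=5, max_depth=5, criterion='gini')
    forest.fit(X_demo, y_demo)
    print("Training accuracy:", forest.score(X_demo, y_demo))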

import numpy as np

class PCA:
    def __init__(self, n_components):
        """
        Initialize PCA.

        Parameters:
        - n_components (int): Number of principal components to retain.
        """
        self.n_components = n_components
        self.components = None
        self.mean = None

    def fit(self, X):
        """
        Fit the PCA model to the data.

        Parameters:
        - X (numpy.ndarray): Data matrix of shape (n_samples, n_features).
        """
        # Center the data
        self.mean = np.mean(X, axis=0)
        X_centered = X - self.mean

        # Compute covariance matrix
        covariance_matrix = np.cov(X_centered, rowvar=False)

        # Compute eigenvalues and eigenvectors
        eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)

        # Sort eigenvectors by decreasing eigenvalues
        sorted_idx = np.argsort(eigenvalues)[::-1]
        eigenvectors = eigenvectors[:, sorted_idx]
        eigenvalues = eigenvalues[sorted_idx]

        # Select the top n_components
        self.components = eigenvectors[:, :self.n_components]

    def transform(self, X):
        """
        Project the data onto the principal components.

        Parameters:
        - X (numpy.ndarray): Data matrix of shape (n_samples, n_features).

        Returns:
        - numpy.ndarray: Transformed data of shape (n_samples, n_components).
        """
        X_centered = X - self.mean
        return np.dot(X_centered, self.components)

    def fit_transform(self, X):
        """
        Fit the PCA model and apply the dimensionality reduction on the data.

        Parameters:
        - X (numpy.ndarray): Data matrix of shape (n_samples, n_features).

        Returns:
        - numpy.ndarray: Transformed data of shape (n_samples, n_components).
        """
        self.fit(X)
        return self.transform(X)
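
# --- Minimal usage sketch for PCA (illustrative only) ---
# Correlated 3-D Gaussian data is assumed; most of the variance should land
# in the first principal component.
if __name__ == "__main__":
    np.random.seed(7)
    base = np.random.randn(200, 1)
    X_demo = np.hstack([base,
                        0.5 * base + 0.05 * np.random.randn(200, 1),
                        np.random.randn(200, 1)])

    pca = PCA(n_components=2)
    X_reduced = pca.fit_transform(X_demo)
    print("Reduced shape:", X_reduced.shape)  # expected (200, 2)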