Untitled
import numpy as np class LinearRegression: def __init__(self): self.coefficients = None # To store the model coefficients after fitting def fit(self, X, y): """ Fit the linear regression model to the data. Parameters: X (numpy.ndarray): 2D array of shape (n_samples, n_features) y (numpy.ndarray): 1D or 2D array of shape (n_samples,) or (n_samples, n_targets) """ # Ensure X is a 2D array if X.ndim == 1: X = X.reshape(-1, 1) # Add a column of ones to X for the intercept term ones = np.ones((X.shape[0], 1)) X_b = np.hstack((ones, X)) # X with bias term # Compute the coefficients using the Normal Equation # β = (XᵀX)^(-1) Xᵀy try: XTX = X_b.T @ X_b XTy = X_b.T @ y self.coefficients = np.linalg.inv(XTX) @ XTy except np.linalg.LinAlgError: # If XTX is singular, use the pseudo-inverse self.coefficients = np.linalg.pinv(X_b) @ y def predict(self, X): """ Predict using the linear regression model. Parameters: X (numpy.ndarray): 2D array of shape (n_samples, n_features) Returns: numpy.ndarray: Predicted values """ if self.coefficients is None: raise ValueError("Model has not been fitted yet.") # Ensure X is a 2D array if X.ndim == 1: X = X.reshape(-1, 1) # Add a column of ones to X for the intercept term ones = np.ones((X.shape[0], 1)) X_b = np.hstack((ones, X)) # X with bias term # Compute predictions return X_b @ self.coefficients def score(self, X, y): """ Calculate the coefficient of determination R^2 of the prediction. Parameters: X (numpy.ndarray): 2D array of shape (n_samples, n_features) y (numpy.ndarray): 1D or 2D array of shape (n_samples,) or (n_samples, n_targets) Returns: float: R^2 score """ y_pred = self.predict(X) ss_res = np.sum((y - y_pred) ** 2) ss_tot = np.sum((y - np.mean(y)) ** 2) return 1 - ss_res / ss_tot import numpy as np class LogisticRegression: def __init__(self, learning_rate=0.01, num_iterations=1000, fit_intercept=True, verbose=False): """ Initialize the Logistic Regression model. Parameters: - learning_rate (float): The step size for gradient descent updates. - num_iterations (int): Number of iterations for training. - fit_intercept (bool): Whether to include an intercept term. - verbose (bool): If True, prints loss every 100 iterations. """ self.learning_rate = learning_rate self.num_iterations = num_iterations self.fit_intercept = fit_intercept self.verbose = verbose self.weights = None # Model weights self.bias = None # Model bias def __add_intercept(self, X): """ Add an intercept term to the feature matrix. Parameters: - X (numpy.ndarray): Feature matrix. Returns: - numpy.ndarray: Feature matrix with intercept term. """ intercept = np.ones((X.shape[0], 1)) return np.hstack((intercept, X)) def __sigmoid(self, z): """ Compute the sigmoid function. Parameters: - z (numpy.ndarray): Linear combination of inputs and weights. Returns: - numpy.ndarray: Sigmoid of input z. """ return 1 / (1 + np.exp(-z)) def __loss(self, h, y): """ Compute the loss using binary cross-entropy. Parameters: - h (numpy.ndarray): Predicted probabilities. - y (numpy.ndarray): True labels. Returns: - float: Loss value. """ m = y.shape[0] # To avoid log(0), we clip h to [1e-15, 1 - 1e-15] h = np.clip(h, 1e-15, 1 - 1e-15) return (-1 / m) * (np.dot(y, np.log(h)) + np.dot((1 - y), np.log(1 - h))) def fit(self, X, y): """ Fit the Logistic Regression model to the data using Gradient Descent. Parameters: - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features). - y (numpy.ndarray): Binary labels of shape (n_samples,). """ if self.fit_intercept: X = self.__add_intercept(X) # Initialize weights self.weights = np.zeros(X.shape[1]) for i in range(self.num_iterations): z = np.dot(X, self.weights) h = self.__sigmoid(z) gradient = np.dot(X.T, (h - y)) / y.size self.weights -= self.learning_rate * gradient if self.verbose and i % 100 == 0: loss = self.__loss(h, y) print(f'Iteration {i}: loss {loss}') def predict_proba(self, X): """ Predict probability estimates for the input data. Parameters: - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features). Returns: - numpy.ndarray: Predicted probabilities of shape (n_samples,). """ if self.fit_intercept: X = self.__add_intercept(X) return self.__sigmoid(np.dot(X, self.weights)) def predict(self, X, threshold=0.5): """ Predict binary labels for the input data. Parameters: - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features). - threshold (float): Threshold for classifying probabilities. Returns: - numpy.ndarray: Predicted binary labels of shape (n_samples,). """ return (self.predict_proba(X) >= threshold).astype(int) def score(self, X, y, threshold=0.5): """ Calculate the accuracy of the model. Parameters: - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features). - y (numpy.ndarray): True binary labels of shape (n_samples,). - threshold (float): Threshold for classifying probabilities. Returns: - float: Accuracy score. """ preds = self.predict(X, threshold) return (preds == y).mean() import numpy as np def gini_index(y): """ Calculate the Gini Index for a list of classes. Parameters: - y (numpy.ndarray): Array of class labels. Returns: - float: Gini Index. """ classes, counts = np.unique(y, return_counts=True) probabilities = counts / counts.sum() gini = 1 - np.sum(probabilities ** 2) return gini import numpy as np def entropy(y): """ Calculate the Entropy for a list of classes. Parameters: - y (numpy.ndarray): Array of class labels. Returns: - float: Entropy. """ classes, counts = np.unique(y, return_counts=True) probabilities = counts / counts.sum() # To handle log(0), we add a small epsilon where probability is zero epsilon = 1e-15 entropy = -np.sum(probabilities * np.log2(probabilities + epsilon)) return entropy import numpy as np def information_gain(y, X_column, criterion='entropy'): """ Calculate the Information Gain of a dataset for a specific feature. Parameters: - y (numpy.ndarray): Array of class labels. - X_column (numpy.ndarray): Array of feature values. - criterion (str): 'entropy' or 'gini' to specify the impurity measure. Returns: - float: Information Gain. """ # Calculate the base impurity if criterion == 'entropy': base_impurity = entropy(y) elif criterion == 'gini': base_impurity = gini_index(y) else: raise ValueError("Criterion must be 'entropy' or 'gini'") # Get unique values and their counts values, counts = np.unique(X_column, return_counts=True) # Calculate the weighted impurity after the split weighted_impurity = 0 for v, count in zip(values, counts): y_subset = y[X_column == v] if criterion == 'entropy': impurity = entropy(y_subset) else: impurity = gini_index(y_subset) weighted_impurity += (count / len(y)) * impurity # Information Gain is the reduction in impurity info_gain = base_impurity - weighted_impurity return info_gain import numpy as np class DecisionTree: def __init__(self, max_depth=None, min_samples_split=2, criterion='entropy'): """ Initialize the Decision Tree. Parameters: - max_depth (int): Maximum depth of the tree. - min_samples_split (int): Minimum number of samples required to split a node. - criterion (str): 'entropy' or 'gini' to specify the impurity measure. """ self.max_depth = max_depth self.min_samples_split = min_samples_split self.criterion = criterion self.root = None class Node: def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None): """ Initialize a node. Parameters: - feature (int): Feature index to split on. - threshold (float): Threshold value to split. - left (Node): Left child node. - right (Node): Right child node. - value (int/float): Class label if it's a leaf node. """ self.feature = feature self.threshold = threshold self.left = left self.right = right self.value = value def fit(self, X, y): """ Build the decision tree. Parameters: - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features). - y (numpy.ndarray): Class labels of shape (n_samples,). """ self.root = self._build_tree(X, y) def _build_tree(self, X, y, depth=0): num_samples, num_features = X.shape num_labels = len(np.unique(y)) # Stopping criteria if (self.max_depth is not None and depth >= self.max_depth) or num_labels == 1 or num_samples < self.min_samples_split: leaf_value = self._most_common_label(y) return self.Node(value=leaf_value) # Find the best split best_feature, best_threshold, best_info_gain = self._best_split(X, y, num_features) if best_info_gain == 0: leaf_value = self._most_common_label(y) return self.Node(value=leaf_value) # Split the dataset left_indices = X[:, best_feature] <= best_threshold right_indices = X[:, best_feature] > best_threshold left = self._build_tree(X[left_indices], y[left_indices], depth + 1) right = self._build_tree(X[right_indices], y[right_indices], depth + 1) return self.Node(feature=best_feature, threshold=best_threshold, left=left, right=right) def _best_split(self, X, y, num_features): best_info_gain = -1 best_feature, best_threshold = None, None for feature in range(num_features): X_column = X[:, feature] thresholds = np.unique(X_column) for threshold in thresholds: # Split left_indices = X_column <= threshold right_indices = X_column > threshold if len(y[left_indices]) == 0 or len(y[right_indices]) == 0: continue # Compute Information Gain if self.criterion == 'entropy': current_info_gain = information_gain(y, X_column <= threshold, criterion='entropy') else: current_info_gain = information_gain(y, X_column <= threshold, criterion='gini') if current_info_gain > best_info_gain: best_info_gain = current_info_gain best_feature = feature best_threshold = threshold return best_feature, best_threshold, best_info_gain def _most_common_label(self, y): """ Find the most common class label. Parameters: - y (numpy.ndarray): Array of class labels. Returns: - int/float: Most common class label. """ values, counts = np.unique(y, return_counts=True) return values[np.argmax(counts)] def predict(self, X): """ Predict class labels for samples in X. Parameters: - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features). Returns: - numpy.ndarray: Predicted class labels. """ return np.array([self._traverse_tree(x, self.root) for x in X]) def _traverse_tree(self, x, node): """ Traverse the tree to make a prediction. Parameters: - x (numpy.ndarray): Single sample. - node (Node): Current node in the tree. Returns: - int/float: Predicted class label. """ if node.value is not None: return node.value if x[node.feature] <= node.threshold: return self._traverse_tree(x, node.left) else: return self._traverse_tree(x, node.right) def score(self, X, y): """ Calculate the accuracy of the model. Parameters: - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features). - y (numpy.ndarray): True class labels. Returns: - float: Accuracy score. """ preds = self.predict(X) return np.mean(preds == y) import numpy as np from collections import Counter class RandomForest: def __init__(self, n_trees=10, max_depth=None, min_samples_split=2, criterion='entropy', max_features='sqrt'): """ Initialize the Random Forest. Parameters: - n_trees (int): Number of trees in the forest. - max_depth (int): Maximum depth of each tree. - min_samples_split (int): Minimum number of samples required to split a node. - criterion (str): 'entropy' or 'gini' to specify the impurity measure. - max_features (str or int): Number of features to consider when looking for the best split. If 'sqrt', then max_features = sqrt(n_features). """ self.n_trees = n_trees self.max_depth = max_depth self.min_samples_split = min_samples_split self.criterion = criterion self.max_features = max_features self.trees = [] def fit(self, X, y): """ Build the Random Forest. Parameters: - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features). - y (numpy.ndarray): Class labels of shape (n_samples,). """ self.trees = [] n_samples, n_features = X.shape # Determine number of features to sample if self.max_features == 'sqrt': max_features = int(np.sqrt(n_features)) elif isinstance(self.max_features, int): max_features = self.max_features else: max_features = n_features # Use all features for _ in range(self.n_trees): # Bootstrap sampling indices = np.random.choice(n_samples, size=n_samples, replace=True) X_sample = X[indices] y_sample = y[indices] # Feature sampling feature_indices = np.random.choice(n_features, size=max_features, replace=False) # Train a Decision Tree on the sampled data and features tree = DecisionTree( max_depth=self.max_depth, min_samples_split=self.min_samples_split, criterion=self.criterion ) tree.fit(X_sample[:, feature_indices], y_sample) # Store the tree and the feature indices used self.trees.append((tree, feature_indices)) def predict(self, X): """ Predict class labels for samples in X. Parameters: - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features). Returns: - numpy.ndarray: Predicted class labels. """ tree_preds = np.array([ tree.predict(X[:, feature_indices]) for tree, feature_indices in self.trees ]) # Transpose to have shape (n_samples, n_trees) tree_preds = tree_preds.T # Majority vote y_pred = np.array([Counter(row).most_common(1)[0][0] for row in tree_preds]) return y_pred def score(self, X, y): """ Calculate the accuracy of the model. Parameters: - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features). - y (numpy.ndarray): True class labels. Returns: - float: Accuracy score. """ preds = self.predict(X) return np.mean(preds == y) import numpy as np class PCA: def __init__(self, n_components): """ Initialize PCA. Parameters: - n_components (int): Number of principal components to retain. """ self.n_components = n_components self.components = None self.mean = None def fit(self, X): """ Fit the PCA model to the data. Parameters: - X (numpy.ndarray): Data matrix of shape (n_samples, n_features). """ # Center the data self.mean = np.mean(X, axis=0) X_centered = X - self.mean # Compute covariance matrix covariance_matrix = np.cov(X_centered, rowvar=False) # Compute eigenvalues and eigenvectors eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix) # Sort eigenvectors by decreasing eigenvalues sorted_idx = np.argsort(eigenvalues)[::-1] eigenvectors = eigenvectors[:, sorted_idx] eigenvalues = eigenvalues[sorted_idx] # Select the top n_components self.components = eigenvectors[:, :self.n_components] def transform(self, X): """ Project the data onto the principal components. Parameters: - X (numpy.ndarray): Data matrix of shape (n_samples, n_features). Returns: - numpy.ndarray: Transformed data of shape (n_samples, n_components). """ X_centered = X - self.mean return np.dot(X_centered, self.components) def fit_transform(self, X): """ Fit the PCA model and apply the dimensionality reduction on the data. Parameters: - X (numpy.ndarray): Data matrix of shape (n_samples, n_features). Returns: - numpy.ndarray: Transformed data of shape (n_samples, n_components). """ self.fit(X) return self.transform(X)
Leave a Comment