import numpy as np


class LinearRegression:
    def __init__(self):
        self.coefficients = None  # To store the model coefficients after fitting

    def fit(self, X, y):
        """
        Fit the linear regression model to the data.

        Parameters:
        X (numpy.ndarray): 2D array of shape (n_samples, n_features)
        y (numpy.ndarray): 1D or 2D array of shape (n_samples,) or (n_samples, n_targets)
        """
        # Ensure X is a 2D array
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        # Add a column of ones to X for the intercept term
        ones = np.ones((X.shape[0], 1))
        X_b = np.hstack((ones, X))  # X with bias term
        # Compute the coefficients using the Normal Equation
        # β = (XᵀX)^(-1) Xᵀy
        try:
            XTX = X_b.T @ X_b
            XTy = X_b.T @ y
            self.coefficients = np.linalg.inv(XTX) @ XTy
        except np.linalg.LinAlgError:
            # If XTX is singular, use the pseudo-inverse
            self.coefficients = np.linalg.pinv(X_b) @ y

    def predict(self, X):
        """
        Predict using the linear regression model.

        Parameters:
        X (numpy.ndarray): 2D array of shape (n_samples, n_features)

        Returns:
        numpy.ndarray: Predicted values
        """
        if self.coefficients is None:
            raise ValueError("Model has not been fitted yet.")
        # Ensure X is a 2D array
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        # Add a column of ones to X for the intercept term
        ones = np.ones((X.shape[0], 1))
        X_b = np.hstack((ones, X))  # X with bias term
        # Compute predictions
        return X_b @ self.coefficients

    def score(self, X, y):
        """
        Calculate the coefficient of determination R^2 of the prediction.

        Parameters:
        X (numpy.ndarray): 2D array of shape (n_samples, n_features)
        y (numpy.ndarray): 1D or 2D array of shape (n_samples,) or (n_samples, n_targets)

        Returns:
        float: R^2 score
        """
        y_pred = self.predict(X)
        ss_res = np.sum((y - y_pred) ** 2)
        ss_tot = np.sum((y - np.mean(y)) ** 2)
        return 1 - ss_res / ss_tot
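
# --- Usage sketch (added for illustration, not part of the original paste) ---
# Fits the LinearRegression class above on synthetic data y ≈ 4 + 3x; the data
# and variable names (X_demo, y_demo) are assumptions made for the example.
rng = np.random.default_rng(0)
X_demo = rng.uniform(0, 10, size=(100, 1))
y_demo = 4 + 3 * X_demo[:, 0] + rng.normal(0, 0.5, size=100)

lr = LinearRegression()
lr.fit(X_demo, y_demo)
print("intercept, slope:", lr.coefficients)   # expected roughly [4, 3]
print("R^2:", lr.score(X_demo, y_demo))       # expected close to 1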
import numpy as np


class LogisticRegression:
    def __init__(self, learning_rate=0.01, num_iterations=1000, fit_intercept=True, verbose=False):
        """
        Initialize the Logistic Regression model.

        Parameters:
        - learning_rate (float): The step size for gradient descent updates.
        - num_iterations (int): Number of iterations for training.
        - fit_intercept (bool): Whether to include an intercept term.
        - verbose (bool): If True, prints loss every 100 iterations.
        """
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.fit_intercept = fit_intercept
        self.verbose = verbose
        self.weights = None  # Model weights (the intercept is folded in when fit_intercept=True)

    def __add_intercept(self, X):
        """
        Add an intercept term to the feature matrix.

        Parameters:
        - X (numpy.ndarray): Feature matrix.

        Returns:
        - numpy.ndarray: Feature matrix with intercept term.
        """
        intercept = np.ones((X.shape[0], 1))
        return np.hstack((intercept, X))

    def __sigmoid(self, z):
        """
        Compute the sigmoid function.

        Parameters:
        - z (numpy.ndarray): Linear combination of inputs and weights.

        Returns:
        - numpy.ndarray: Sigmoid of input z.
        """
        return 1 / (1 + np.exp(-z))

    def __loss(self, h, y):
        """
        Compute the loss using binary cross-entropy.

        Parameters:
        - h (numpy.ndarray): Predicted probabilities.
        - y (numpy.ndarray): True labels.

        Returns:
        - float: Loss value.
        """
        m = y.shape[0]
        # To avoid log(0), we clip h to [1e-15, 1 - 1e-15]
        h = np.clip(h, 1e-15, 1 - 1e-15)
        return (-1 / m) * (np.dot(y, np.log(h)) + np.dot((1 - y), np.log(1 - h)))

    def fit(self, X, y):
        """
        Fit the Logistic Regression model to the data using Gradient Descent.

        Parameters:
        - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features).
        - y (numpy.ndarray): Binary labels of shape (n_samples,).
        """
        if self.fit_intercept:
            X = self.__add_intercept(X)
        # Initialize weights
        self.weights = np.zeros(X.shape[1])
        for i in range(self.num_iterations):
            z = np.dot(X, self.weights)
            h = self.__sigmoid(z)
            gradient = np.dot(X.T, (h - y)) / y.size
            self.weights -= self.learning_rate * gradient
            if self.verbose and i % 100 == 0:
                loss = self.__loss(h, y)
                print(f'Iteration {i}: loss {loss}')

    def predict_proba(self, X):
        """
        Predict probability estimates for the input data.

        Parameters:
        - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features).

        Returns:
        - numpy.ndarray: Predicted probabilities of shape (n_samples,).
        """
        if self.fit_intercept:
            X = self.__add_intercept(X)
        return self.__sigmoid(np.dot(X, self.weights))

    def predict(self, X, threshold=0.5):
        """
        Predict binary labels for the input data.

        Parameters:
        - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features).
        - threshold (float): Threshold for classifying probabilities.

        Returns:
        - numpy.ndarray: Predicted binary labels of shape (n_samples,).
        """
        return (self.predict_proba(X) >= threshold).astype(int)

    def score(self, X, y, threshold=0.5):
        """
        Calculate the accuracy of the model.

        Parameters:
        - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features).
        - y (numpy.ndarray): True binary labels of shape (n_samples,).
        - threshold (float): Threshold for classifying probabilities.

        Returns:
        - float: Accuracy score.
        """
        preds = self.predict(X, threshold)
        return (preds == y).mean()
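
# --- Usage sketch (added for illustration, not part of the original paste) ---
# Trains the LogisticRegression class above on a linearly separable toy problem;
# the synthetic clusters and names (X_demo, y_demo, clf) are assumptions.
rng = np.random.default_rng(1)
X_pos = rng.normal(loc=2.0, size=(50, 2))
X_neg = rng.normal(loc=-2.0, size=(50, 2))
X_demo = np.vstack((X_pos, X_neg))
y_demo = np.concatenate((np.ones(50), np.zeros(50)))

clf = LogisticRegression(learning_rate=0.1, num_iterations=1000)
clf.fit(X_demo, y_demo)
print("train accuracy:", clf.score(X_demo, y_demo))  # expected near 1.0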
import numpy as np


def gini_index(y):
    """
    Calculate the Gini Index for a list of classes.

    Parameters:
    - y (numpy.ndarray): Array of class labels.

    Returns:
    - float: Gini Index.
    """
    classes, counts = np.unique(y, return_counts=True)
    probabilities = counts / counts.sum()
    gini = 1 - np.sum(probabilities ** 2)
    return gini
import numpy as np


def entropy(y):
    """
    Calculate the Entropy for a list of classes.

    Parameters:
    - y (numpy.ndarray): Array of class labels.

    Returns:
    - float: Entropy.
    """
    classes, counts = np.unique(y, return_counts=True)
    probabilities = counts / counts.sum()
    # To handle log(0), we add a small epsilon where probability is zero
    epsilon = 1e-15
    entropy = -np.sum(probabilities * np.log2(probabilities + epsilon))
    return entropy
import numpy as np


def information_gain(y, X_column, criterion='entropy'):
    """
    Calculate the Information Gain of a dataset for a specific feature.

    Parameters:
    - y (numpy.ndarray): Array of class labels.
    - X_column (numpy.ndarray): Array of feature values.
    - criterion (str): 'entropy' or 'gini' to specify the impurity measure.

    Returns:
    - float: Information Gain.
    """
    # Calculate the base impurity
    if criterion == 'entropy':
        base_impurity = entropy(y)
    elif criterion == 'gini':
        base_impurity = gini_index(y)
    else:
        raise ValueError("Criterion must be 'entropy' or 'gini'")
    # Get unique values and their counts
    values, counts = np.unique(X_column, return_counts=True)
    # Calculate the weighted impurity after the split
    weighted_impurity = 0
    for v, count in zip(values, counts):
        y_subset = y[X_column == v]
        if criterion == 'entropy':
            impurity = entropy(y_subset)
        else:
            impurity = gini_index(y_subset)
        weighted_impurity += (count / len(y)) * impurity
    # Information Gain is the reduction in impurity
    info_gain = base_impurity - weighted_impurity
    return info_gain
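
# --- Usage sketch (added for illustration, not part of the original paste) ---
# Exercises the three impurity helpers above on a small hand-checked example;
# the arrays below are illustrative only.
y_demo = np.array([0, 0, 1, 1, 1, 1])
feature_demo = np.array([0, 0, 0, 1, 1, 1])  # splits y into [0, 0, 1] and [1, 1, 1]

print("gini:", gini_index(y_demo))            # 1 - (2/6)^2 - (4/6)^2 ≈ 0.444
print("entropy:", entropy(y_demo))            # ≈ 0.918 bits
print("info gain:", information_gain(y_demo, feature_demo))  # ≈ 0.459 bits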
import numpy as np


class DecisionTree:
    def __init__(self, max_depth=None, min_samples_split=2, criterion='entropy'):
        """
        Initialize the Decision Tree.

        Parameters:
        - max_depth (int): Maximum depth of the tree.
        - min_samples_split (int): Minimum number of samples required to split a node.
        - criterion (str): 'entropy' or 'gini' to specify the impurity measure.
        """
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.root = None

    class Node:
        def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
            """
            Initialize a node.

            Parameters:
            - feature (int): Feature index to split on.
            - threshold (float): Threshold value to split.
            - left (Node): Left child node.
            - right (Node): Right child node.
            - value (int/float): Class label if it's a leaf node.
            """
            self.feature = feature
            self.threshold = threshold
            self.left = left
            self.right = right
            self.value = value

    def fit(self, X, y):
        """
        Build the decision tree.

        Parameters:
        - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features).
        - y (numpy.ndarray): Class labels of shape (n_samples,).
        """
        self.root = self._build_tree(X, y)
    def _build_tree(self, X, y, depth=0):
        num_samples, num_features = X.shape
        num_labels = len(np.unique(y))
        # Stopping criteria
        if (self.max_depth is not None and depth >= self.max_depth) or num_labels == 1 or num_samples < self.min_samples_split:
            leaf_value = self._most_common_label(y)
            return self.Node(value=leaf_value)
        # Find the best split
        best_feature, best_threshold, best_info_gain = self._best_split(X, y, num_features)
        # If no valid split exists, or no split reduces the impurity, make a leaf
        if best_feature is None or best_info_gain <= 0:
            leaf_value = self._most_common_label(y)
            return self.Node(value=leaf_value)
        # Split the dataset
        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold
        left = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right = self._build_tree(X[right_indices], y[right_indices], depth + 1)
        return self.Node(feature=best_feature, threshold=best_threshold, left=left, right=right)

    def _best_split(self, X, y, num_features):
        best_info_gain = -1
        best_feature, best_threshold = None, None
        for feature in range(num_features):
            X_column = X[:, feature]
            thresholds = np.unique(X_column)
            for threshold in thresholds:
                # Candidate split
                left_indices = X_column <= threshold
                right_indices = X_column > threshold
                # Skip splits that leave one side empty
                if len(y[left_indices]) == 0 or len(y[right_indices]) == 0:
                    continue
                # Compute Information Gain of the binary split (<= threshold vs. > threshold)
                current_info_gain = information_gain(y, X_column <= threshold, criterion=self.criterion)
                if current_info_gain > best_info_gain:
                    best_info_gain = current_info_gain
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold, best_info_gain
    def _most_common_label(self, y):
        """
        Find the most common class label.

        Parameters:
        - y (numpy.ndarray): Array of class labels.

        Returns:
        - int/float: Most common class label.
        """
        values, counts = np.unique(y, return_counts=True)
        return values[np.argmax(counts)]

    def predict(self, X):
        """
        Predict class labels for samples in X.

        Parameters:
        - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features).

        Returns:
        - numpy.ndarray: Predicted class labels.
        """
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        """
        Traverse the tree to make a prediction.

        Parameters:
        - x (numpy.ndarray): Single sample.
        - node (Node): Current node in the tree.

        Returns:
        - int/float: Predicted class label.
        """
        if node.value is not None:
            return node.value
        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        else:
            return self._traverse_tree(x, node.right)

    def score(self, X, y):
        """
        Calculate the accuracy of the model.

        Parameters:
        - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features).
        - y (numpy.ndarray): True class labels.

        Returns:
        - float: Accuracy score.
        """
        preds = self.predict(X)
        return np.mean(preds == y)
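
# --- Usage sketch (added for illustration, not part of the original paste) ---
# Fits the DecisionTree class above on a tiny 1-D threshold problem; the data
# and variable names are assumptions made for the example.
X_demo = np.array([[1.0], [2.0], [3.0], [10.0], [11.0], [12.0]])
y_demo = np.array([0, 0, 0, 1, 1, 1])

tree = DecisionTree(max_depth=3, criterion='gini')
tree.fit(X_demo, y_demo)
print(tree.predict(np.array([[2.5], [10.5]])))  # expected [0 1]
print("accuracy:", tree.score(X_demo, y_demo))  # expected 1.0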
import numpy as np
from collections import Counter


class RandomForest:
    def __init__(self, n_trees=10, max_depth=None, min_samples_split=2, criterion='entropy', max_features='sqrt'):
        """
        Initialize the Random Forest.

        Parameters:
        - n_trees (int): Number of trees in the forest.
        - max_depth (int): Maximum depth of each tree.
        - min_samples_split (int): Minimum number of samples required to split a node.
        - criterion (str): 'entropy' or 'gini' to specify the impurity measure.
        - max_features (str or int): Number of features to consider when looking for the best split.
          If 'sqrt', then max_features = sqrt(n_features).
        """
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.max_features = max_features
        self.trees = []

    def fit(self, X, y):
        """
        Build the Random Forest.

        Parameters:
        - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features).
        - y (numpy.ndarray): Class labels of shape (n_samples,).
        """
        self.trees = []
        n_samples, n_features = X.shape
        # Determine number of features to sample
        if self.max_features == 'sqrt':
            max_features = int(np.sqrt(n_features))
        elif isinstance(self.max_features, int):
            max_features = self.max_features
        else:
            max_features = n_features  # Use all features
        for _ in range(self.n_trees):
            # Bootstrap sampling
            indices = np.random.choice(n_samples, size=n_samples, replace=True)
            X_sample = X[indices]
            y_sample = y[indices]
            # Feature sampling
            feature_indices = np.random.choice(n_features, size=max_features, replace=False)
            # Train a Decision Tree on the sampled data and features
            tree = DecisionTree(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                criterion=self.criterion
            )
            tree.fit(X_sample[:, feature_indices], y_sample)
            # Store the tree and the feature indices used
            self.trees.append((tree, feature_indices))

    def predict(self, X):
        """
        Predict class labels for samples in X.

        Parameters:
        - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features).

        Returns:
        - numpy.ndarray: Predicted class labels.
        """
        tree_preds = np.array([
            tree.predict(X[:, feature_indices])
            for tree, feature_indices in self.trees
        ])
        # Transpose to have shape (n_samples, n_trees)
        tree_preds = tree_preds.T
        # Majority vote across trees for each sample
        y_pred = np.array([Counter(row).most_common(1)[0][0] for row in tree_preds])
        return y_pred

    def score(self, X, y):
        """
        Calculate the accuracy of the model.

        Parameters:
        - X (numpy.ndarray): Feature matrix of shape (n_samples, n_features).
        - y (numpy.ndarray): True class labels.

        Returns:
        - float: Accuracy score.
        """
        preds = self.predict(X)
        return np.mean(preds == y)
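
# --- Usage sketch (added for illustration, not part of the original paste) ---
# Trains the RandomForest class above on two well-separated 2-D clusters; with
# two features, max_features='sqrt' samples one feature per tree. The data and
# variable names are assumptions made for the example.
rng = np.random.default_rng(2)
X_demo = np.vstack((rng.normal(0, 1, (50, 2)), rng.normal(5, 1, (50, 2))))
y_demo = np.concatenate((np.zeros(50, dtype=int), np.ones(50, dtype=int)))

forest = RandomForest(n_trees=5, max_depth=5, criterion='gini')
forest.fit(X_demo, y_demo)
print("accuracy:", forest.score(X_demo, y_demo))  # expected near 1.0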
import numpy as np


class PCA:
    def __init__(self, n_components):
        """
        Initialize PCA.

        Parameters:
        - n_components (int): Number of principal components to retain.
        """
        self.n_components = n_components
        self.components = None
        self.mean = None

    def fit(self, X):
        """
        Fit the PCA model to the data.

        Parameters:
        - X (numpy.ndarray): Data matrix of shape (n_samples, n_features).
        """
        # Center the data
        self.mean = np.mean(X, axis=0)
        X_centered = X - self.mean
        # Compute covariance matrix
        covariance_matrix = np.cov(X_centered, rowvar=False)
        # Compute eigenvalues and eigenvectors (eigh, since the covariance matrix is symmetric)
        eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)
        # Sort eigenvectors by decreasing eigenvalues
        sorted_idx = np.argsort(eigenvalues)[::-1]
        eigenvectors = eigenvectors[:, sorted_idx]
        eigenvalues = eigenvalues[sorted_idx]
        # Select the top n_components
        self.components = eigenvectors[:, :self.n_components]

    def transform(self, X):
        """
        Project the data onto the principal components.

        Parameters:
        - X (numpy.ndarray): Data matrix of shape (n_samples, n_features).

        Returns:
        - numpy.ndarray: Transformed data of shape (n_samples, n_components).
        """
        X_centered = X - self.mean
        return np.dot(X_centered, self.components)

    def fit_transform(self, X):
        """
        Fit the PCA model and apply the dimensionality reduction on the data.

        Parameters:
        - X (numpy.ndarray): Data matrix of shape (n_samples, n_features).

        Returns:
        - numpy.ndarray: Transformed data of shape (n_samples, n_components).
        """
        self.fit(X)
        return self.transform(X)
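
# --- Usage sketch (added for illustration, not part of the original paste) ---
# Projects correlated 3-D data onto its top two principal components using the
# PCA class above; the synthetic data and names are assumptions.
rng = np.random.default_rng(3)
base = rng.normal(size=(200, 1))
X_demo = np.hstack((base,
                    2 * base + rng.normal(scale=0.1, size=(200, 1)),
                    rng.normal(size=(200, 1))))

pca = PCA(n_components=2)
X_reduced = pca.fit_transform(X_demo)
print(X_reduced.shape)  # (200, 2)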