Untitled
user_3839718
python
a year ago
1.5 kB
14
Indexable
Never
# Plot the training-set MSE of ordinary linear regression as a function of
# the number of training samples, averaged over random subsets of each size.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.datasets import make_regression

# One seed drives dataset generation, the train/test split and the subset
# resampling, so repeated runs draw the same curve.
RANDOM_SEED = 42
# Number of random subsets averaged for each subset size.
N_REPEATS = 10
# Number of distinct subset sizes evaluated, and the smallest one.
N_SUBSET_SIZES = 10
MIN_SUBSET_SIZE = 10


def mean_training_mse(X_train, y_train, size, n_repeats=N_REPEATS, rng=None):
    """Return the average training MSE over random subsets of a given size.

    For each repeat, ``size`` rows are drawn without replacement from the
    training data, a ``LinearRegression`` is fitted on them, and the MSE is
    measured on those same rows (training error, not held-out error).

    Args:
        X_train: Training features, indexable by an integer index array.
        y_train: Training targets aligned with ``X_train``.
        size: Number of rows in each random subset.
        n_repeats: How many random subsets to average over.
        rng: ``numpy.random.Generator`` used for sampling; a fresh unseeded
            generator is created when omitted.

    Returns:
        The mean of the ``n_repeats`` training-subset MSE values.
    """
    if rng is None:
        rng = np.random.default_rng()

    total = 0
    for _ in range(n_repeats):
        subset_idx = rng.choice(len(X_train), size=size, replace=False)
        X_subset, y_subset = X_train[subset_idx], y_train[subset_idx]

        model = LinearRegression()
        model.fit(X_subset, y_subset)

        # Error is evaluated on the very rows the model was fitted on.
        y_pred = model.predict(X_subset)
        total += mean_squared_error(y_subset, y_pred)

    # Divide by the same count that drove the loop so the two cannot drift.
    return total / n_repeats


def training_mse_curve(X_train, y_train, subset_sizes, n_repeats=N_REPEATS,
                       rng=None):
    """Return the averaged training MSE for every size in ``subset_sizes``."""
    if rng is None:
        rng = np.random.default_rng()
    return [
        mean_training_mse(X_train, y_train, size, n_repeats=n_repeats, rng=rng)
        for size in subset_sizes
    ]


def plot_mse_curve(subset_sizes, mse_values):
    """Draw the MSE-versus-sample-count curve and display it."""
    plt.figure(figsize=(10, 6))
    plt.plot(subset_sizes, mse_values, marker='o')
    plt.xlabel('Number of Samples (m)')
    plt.ylabel('Mean Squared Error (MSE)')
    plt.title('MSE on Training Set as a Function of Number of Samples')
    plt.grid(True)
    plt.show()


def main():
    """Generate the data, compute the training-MSE curve and plot it."""
    # Step 1: Generate a synthetic dataset
    X, y = make_regression(
        n_samples=1000, n_features=1, noise=10, random_state=RANDOM_SEED
    )

    # Step 2: Split the data into training and test sets.
    # The test portion is not used below; the split only fixes which rows
    # make up the training pool.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_SEED
    )

    # Step 3: Apply linear regression on subsets of increasing size and
    # calculate MSE. Sizes are evenly spaced integers from MIN_SUBSET_SIZE
    # up to the full training-set size.
    subset_sizes = np.linspace(
        MIN_SUBSET_SIZE, len(X_train), N_SUBSET_SIZES, dtype=int
    )
    rng = np.random.default_rng(RANDOM_SEED)
    mse_values = training_mse_curve(
        X_train, y_train, subset_sizes, n_repeats=N_REPEATS, rng=rng
    )

    # Step 4: Plot the results
    plot_mse_curve(subset_sizes, mse_values)


if __name__ == "__main__":
    main()