# NOTE(review): this span contained paste-site metadata ("Untitled",
# "plain_text", "26 kB", ...) that is not valid Python and prevented the
# file from parsing; it has been replaced by this comment. The file's
# original indentation also appears to have been lost in the paste and
# must be restored before the script can run.
# Operating system libraries
import os
from pprint import pprint
import random
from functools import wraps
# Numerical and math libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Deep Learning libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
# Machine learning libraries
from sklearn.model_selection import KFold
#from sklearn.metrics import make_scorer
#from sklearn import set_config
# --- Hardware-acceleration setup -------------------------------------------
# Feature flags: allow disabling the GPU paths without removing the code.
enable_torch_cuda = True
enable_sklearn_cuda = True # Enable cuML for sklearn if available

if enable_torch_cuda and torch.cuda.is_available():
    print("Detected and enable CUDA support in PyTorch")
    # Newly created tensors/modules default to the GPU.
    torch.set_default_device('cuda')

# Bug fix: the previous code aliased cuML's RandomForestRegressor as
# MultiOutputRegressor, but RandomForestRegressor does not take a base
# estimator as its first argument, so the later
# MultiOutputRegressor(SVR(...)) calls would break on the cuML path.
# Always use sklearn's wrapper and only swap the base regressors.
from sklearn.multioutput import MultiOutputRegressor

if enable_sklearn_cuda:
    try:
        # GPU implementations of the base regressors.
        from cuml.svm import SVR as SVR
        from cuml.svm import LinearSVR as LinearSVR
        print("Detected and enabled CUDA support in cuML (sklearn-cuda)")
    except ImportError:
        print("cuML (sklearn-cuda) not found or not installed with GPU support. Falling back to CPU-based sklearn.")
        from sklearn.svm import SVR as SVR
        from sklearn.svm import LinearSVR as LinearSVR
else:
    from sklearn.svm import SVR as SVR
    from sklearn.svm import LinearSVR as LinearSVR
# Parameters for every model
param_grid_base = {
'modeltype': ['nn', 'svr', 'linearsvr'], # Add 'linearsvr'
}
# Parameters for NN models
# List-valued entries are sampled uniformly (rchoice); dict-valued entries
# elsewhere carry sampling weights (rchoicep).
param_grid_nn = {
**param_grid_base,
'hidden_size': np.linspace(50, 300, 15, dtype=int).tolist(),
'learning_rate': np.logspace(-4, -1, num=10).tolist(),
'weight_decay': np.logspace(-3.5, -2, 5).tolist(),
'num_epochs': np.linspace(15, 1000, 10, dtype=int).tolist(),
'batch_size': [250], # np.logspace(np.log10(1), np.log10(250 + 1), 5, dtype=int).tolist(), ## too slow?
'dropout_prob': np.logspace(-3, -0.1, 3).tolist(),
# NOTE(review): patience/threshold are sampled below but never used inside
# train_model (no early stopping is implemented) -- confirm intent.
'patience': np.linspace(2, 10, 8).tolist(),
'threshold': np.logspace(-5, -3, num=8).tolist(),
}
# Parameters for SVR models
param_grid_svr = {
**param_grid_base,
# dict values here are selection weights consumed by rchoicep(), not params
'kernel': {'linear': 0.4, 'rbf': 0.6 },
'C': np.logspace(-2, 3, 6).tolist(),
'epsilon': [0.01, 0.1, 0.5, 1, 5],
'gamma': {'scale':0.30, 'auto':0.10, 0.01:0.15, 0.1:0.15, 1.0:0.15, 10.0:0.15}
}
# Parameters for LinearSVR models
param_grid_linearsvr = { # Define parameters for LinearSVR
**param_grid_base,
'C': np.logspace(-2, 3, 6).tolist(),
'epsilon': [0.01, 0.1, 0.5, 1, 5],
'loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'] # Example parameters for LinearSVR
}
# Parameters for all the possible models
# Later grids overwrite duplicate keys ('C', 'epsilon'); they hold identical
# values in all grids, so the merge is safe.
param_grid = {**param_grid_nn, **param_grid_svr, **param_grid_linearsvr} # Include param_grid_linearsvr
# Meta-parameters for sampling and cross validation
# n_samples = random-search configurations tried per outer fold
n_samples = 10
outer_kfold = 10
inner_kfold = 5
outputDir = "plots"
randomSeed = 42
# Set the seed for reproducibility
random.seed(randomSeed)
# Get the number of available CPU cores
num_threads = os.cpu_count()
if (num_threads > 1):
print(f"Detected and enable multithread support with {num_threads} threads")
# Set the number of threads for CPU operations
torch.set_num_threads(num_threads)
# NOTE(review): OMP/MKL env vars are typically read at library import time;
# setting them after numpy/torch are imported may have no effect -- verify.
os.environ["OMP_NUM_THREADS"] = str(num_threads)
os.environ["MKL_NUM_THREADS"] = str(num_threads)
torch.set_num_interop_threads(num_threads)
# Set additional environment variables for sklearn
os.environ["JOBLIB_NUM_CPU_THREADS"] = str(-1)
# NOTE(review): "LOOPY_BACKEND" looks like a typo -- joblib's "loky" backend
# is normally selected in code, not via this env var. Confirm.
os.environ["LOOPY_BACKEND"] = "loky"
def set_n_jobs_default(cls):
"""Decorator to set n_jobs=-1 by default for scikit-learn estimators."""
original_init = cls.__init__
@wraps(original_init)
def new_init(self, *args, **kwargs):
if 'n_jobs' not in kwargs:
kwargs['n_jobs'] = -1
original_init(self, *args, **kwargs)
cls.__init__ = new_init
return cls
# Force all-core parallelism on the multi-output wrapper by default.
# NOTE(review): if the cuML fallback bound MultiOutputRegressor to cuml's
# RandomForestRegressor, injecting an n_jobs kwarg here may raise a
# TypeError -- confirm cuML accepts n_jobs.
MultiOutputRegressor = set_n_jobs_default(MultiOutputRegressor)
# Set environment to suppress warnings
#os.environ["PYTHONWARNINGS"] = "ignore:You are using torch.load with weights_only=False"
# Read the data
# ML-CUP CSV layout: col 0 = sample id, middle cols = features, last 3 = targets
data = pd.read_csv("ML-CUP24-TR.csv", comment='#', header=None)
# All columns except the last 3 and the first are input data
X_raw = data.iloc[:, 1:-3].values
# The last 3 columns are the desired outputs
y_raw = data.iloc[:, -3:].values
# Normalize data
def normalize_data(inputs):
    """Standardize features column-wise to zero mean and unit variance.

    Bug fix: the original version had ``return inputs`` as its first
    statement, which made the normalization below unreachable dead code.
    Also guards against constant columns (std == 0) to avoid division
    by zero.

    :param inputs: 2-D array, samples in rows and features in columns.
    :return: array of the same shape with standardized columns.
    """
    mean = np.mean(inputs, axis=0)
    std = np.std(inputs, axis=0)
    # Constant columns would yield std == 0; divide by 1 instead.
    std = np.where(std == 0, 1.0, std)
    return (inputs - mean) / std
# Convert the data to NumPY tensors and PyTorch tensors
# NumPy copies feed the sklearn/cuML models; torch tensors feed the NN.
# Note: torch.tensor(...) places data on the current default device
# ('cuda' when enable_torch_cuda found a GPU above).
X_np = normalize_data(X_raw).astype(np.float32)
y_np = y_raw.astype(np.float32)
X_torch = torch.tensor(normalize_data(X_raw), dtype=torch.float32)
y_torch = torch.tensor(y_raw, dtype=torch.float32)
# Define hyperparameters
# input_size = number of feature columns; output_size = 3 regression targets
input_size = X_torch.shape[1]
output_size = 3
# Define a simple neural network
class NeuralNet(nn.Module):
    """Single-hidden-layer MLP: Linear -> ReLU -> Dropout -> Linear.

    Reads the module-level ``input_size`` / ``output_size`` globals for
    the layer dimensions.

    :param hidden_size: width of the hidden layer.
    :param dropout_prob: dropout probability applied after the activation.
    """
    def __init__(self, hidden_size, dropout_prob):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        # Bug fix: the dropout layer was constructed but never applied, so
        # the sampled `dropout_prob` hyperparameter had no effect at all.
        out = self.dropout(out)
        out = self.fc2(out)
        return out
# Define the Mean Euclidean Error function (MEE)
def mean_euclidean_error(y_true, y_pred):
    """Average Euclidean distance between matching rows of the two arrays."""
    row_distances = np.linalg.norm(y_true - y_pred, axis=1)
    return np.mean(row_distances)
# Create a custom scorer (a negative MEE)
#neg_mee_scorer = make_scorer(mean_euclidean_error, greater_is_better=False)
# Outer K-fold for testing purposes
testFold = KFold(n_splits=outer_kfold, shuffle=True, random_state=randomSeed)
# Track MEE scores for each testing fold
testFold_best_train_mees = []
testFold_best_val_mees = []
testFold_best_parameters = []
testFold_best_model = []
# Outer CV loop: "tval" = train+validation portion used for model selection,
# "test" = the held-out fold used only for the final score.
for fold, (tval_index, test_index) in enumerate(testFold.split(X_np)):
# NumPy slices for the sklearn/cuML models...
tval_X_np, test_X_np = X_np[tval_index], X_np[test_index]
tval_y_np, test_y_np = y_np[tval_index], y_np[test_index]
# ...and the matching torch-tensor slices for the NN.
tval_X_torch, test_X_torch = X_torch[tval_index], X_torch[test_index]
tval_y_torch, test_y_torch = y_torch[tval_index], y_torch[test_index]
# Track MEE scores for this random search
randomSearch_avg_train_mees = []
randomSearch_avg_val_mees = []
randomSearch_parameters = []
def train_model(
        train_X_torch, train_X_np,
        train_y_torch, train_y_np,
        val_X_torch, val_X_np,
        val_y_torch, val_y_np,
        params):
    """Train one model configuration and report its train/validation MEE.

    ``params`` is a tuple whose first element selects the model family:
      'nn'        -> (modeltype, hidden_size, learning_rate, weight_decay,
                      num_epochs, batch_size, dropout_prob, patience, threshold)
      'svr'       -> (modeltype, kernel, C, epsilon, gamma)
      'linearsvr' -> (modeltype, C, epsilon, loss)

    Returns ``(model, final_train_mee, final_val_mee, per_epoch_train_mees,
    per_epoch_val_mees)``; the per-epoch lists are empty for the
    sklearn-style models.

    Raises ValueError on an unrecognized model type (was os.abort()).
    """
    if params[0] == 'nn':
        (modeltype, hidden_size, learning_rate, weight_decay, num_epochs,
         batch_size, dropout_prob, patience, threshold) = params
        model = NeuralNet(hidden_size, dropout_prob)
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
        # Move model to CUDA if available
        if enable_torch_cuda and torch.cuda.is_available():
            model = model.cuda()

        # Split the training data into fixed-size batches.
        def batchify(data, batch_size):
            return [data[i:i + batch_size] for i in range(0, len(data), batch_size)]
        train_X_batches = batchify(train_X_torch, batch_size)
        train_y_batches = batchify(train_y_torch, batch_size)
        n_batches = len(train_X_batches)

        # Track MEE for each epoch
        epoch_train_mees = []
        epoch_val_mees = []
        # Early-stopping state. Bug fix: `patience` and `threshold` were
        # sampled as hyperparameters but never used by the original loop.
        best_val_mee = float('inf')
        epochs_since_improvement = 0
        for epoch in range(num_epochs):
            model.train()
            train_mee = 0.0
            for batch_X, batch_y in zip(train_X_batches, train_y_batches):
                optimizer.zero_grad()
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()
                # Accumulate MEE for the batch (computed on CPU/NumPy).
                train_mee += mean_euclidean_error(batch_y.detach().cpu().numpy(), outputs.detach().cpu().numpy())
            # Average MEE over all batches
            train_mee /= n_batches
            epoch_train_mees.append(train_mee)
            model.eval()
            with torch.no_grad():
                val_mee = mean_euclidean_error(val_y_np, model(val_X_torch).detach().cpu().numpy())
            epoch_val_mees.append(val_mee)
            # Early stopping: quit once validation MEE has not improved by
            # more than `threshold` for `patience` consecutive epochs.
            if best_val_mee - val_mee > threshold:
                best_val_mee = val_mee
                epochs_since_improvement = 0
            else:
                epochs_since_improvement += 1
                if epochs_since_improvement >= int(patience):
                    break
        return model, epoch_train_mees[-1], epoch_val_mees[-1], epoch_train_mees, epoch_val_mees
    elif params[0] == 'svr':
        (modeltype, kernel, C, epsilon, gamma) = params
        svr = MultiOutputRegressor(SVR(kernel=kernel, C=C, epsilon=epsilon, gamma=gamma))
        svr.fit(train_X_np, train_y_np)
        train_mees = mean_euclidean_error(train_y_np, svr.predict(train_X_np))
        val_mees = mean_euclidean_error(val_y_np, svr.predict(val_X_np))
        return svr, train_mees, val_mees, [], []
    elif params[0] == 'linearsvr':
        (modeltype, C, epsilon, loss) = params
        linearsvr = MultiOutputRegressor(LinearSVR(C=C, epsilon=epsilon, loss=loss))
        linearsvr.fit(train_X_np, train_y_np)
        train_mees = mean_euclidean_error(train_y_np, linearsvr.predict(train_X_np))
        val_mees = mean_euclidean_error(val_y_np, linearsvr.predict(val_X_np))
        return linearsvr, train_mees, val_mees, [], []
    else:
        # Fail loudly on an unrecognized model type (os.abort() dumped core
        # with no message).
        raise ValueError(f"Unknown model type: {params[0]}")
# Search over a random sample of the parameter combinations
for i in range(0,n_samples):
# Helper: uniform random choice from a list of candidate values
def rchoice(pg):
return random.choice(pg)
modeltype = rchoice(param_grid['modeltype'])
if (modeltype == 'nn'):
hidden_size = rchoice(param_grid['hidden_size'])
learning_rate = rchoice(param_grid['learning_rate'])
weight_decay = rchoice(param_grid['weight_decay'])
num_epochs = rchoice(param_grid['num_epochs'])
batch_size = rchoice(param_grid['batch_size'])
dropout_prob = rchoice(param_grid['dropout_prob'])
patience = rchoice(param_grid['patience'])
threshold = rchoice(param_grid['threshold'])
# Put all the random parameters into a tuple
# (tuple order must match the key order of param_grid_nn -- unpackParams
# relies on it later)
params = (modeltype,hidden_size,learning_rate,weight_decay,num_epochs,batch_size,dropout_prob,patience,threshold)
# Helper: weighted random choice from a {value: weight} dict
def rchoicep(pg):
return random.choices(list(pg.keys()), weights=list(pg.values()), k=1)[0]
if (modeltype == 'svr'):
kernel = rchoicep(param_grid['kernel'])
C = rchoice(param_grid['C'])
epsilon = rchoice(param_grid['epsilon'])
gamma = rchoicep(param_grid['gamma'])
# Put all the random parameters into a tuple
params = (modeltype,kernel,C,epsilon,gamma)
if (modeltype == 'linearsvr'): # Parameters for LinearSVR
C = rchoice(param_grid['C'])
epsilon = rchoice(param_grid['epsilon'])
loss = rchoice(param_grid_linearsvr['loss'])
# Put all the random parameters into a tuple
params = (modeltype, C, epsilon, loss)
# Same random_state every iteration -> identical inner splits for every
# sampled configuration, so the comparison between configurations is fair.
validFold = KFold(n_splits=inner_kfold, shuffle=True, random_state=randomSeed)
# The MEE scores for each validation fold
validFold_train_mees = []
validFold_val_mees = []
# Inner K-fold for validation purposes
for internalFold, (train_index, val_index) in enumerate(validFold.split(tval_X_np)):
train_X_np, val_X_np = tval_X_np[train_index], tval_X_np[val_index]
train_y_np, val_y_np = tval_y_np[train_index], tval_y_np[val_index]
train_X_torch, val_X_torch = tval_X_torch[train_index], tval_X_torch[val_index]
train_y_torch, val_y_torch = tval_y_torch[train_index], tval_y_torch[val_index]
_, train_mees, val_mees, epoch_train_mees, epoch_val_mees = train_model(
train_X_torch,train_X_np,
train_y_torch,train_y_np,
val_X_torch, val_X_np,
val_y_torch, val_y_np,
params)
validFold_train_mees.append(train_mees)
validFold_val_mees.append(val_mees)
# Score a configuration by its MEE averaged over the inner folds
randomSearch_avg_train_mees.append(np.mean(validFold_train_mees))
randomSearch_avg_val_mees.append(np.mean(validFold_val_mees))
#randomSearch_avg_train_mees.append(np.sqrt(np.mean(np.pow(validFold_train_mees,2)))) # Quadratic mean
#randomSearch_avg_val_mees.append(np.sqrt(np.mean(np.pow(validFold_val_mees,2)))) # Quadratic mean
randomSearch_parameters.append(params)
# Sort the random search results by validation MEE (ascending: best first)
sorted_indices = np.argsort(randomSearch_avg_val_mees)
randomSearch_avg_train_mees = [randomSearch_avg_train_mees[i] for i in sorted_indices]
randomSearch_avg_val_mees = [randomSearch_avg_val_mees[i] for i in sorted_indices]
randomSearch_parameters = [randomSearch_parameters[i] for i in sorted_indices]
print(f"RandomSearch Best Training MEE: {randomSearch_avg_train_mees[0]}")
print(f"RandomSearch Best Validation MEE: {randomSearch_avg_val_mees[0]}")
print(f"RandomSearch Best Parameters: {randomSearch_parameters[0]}")
# Create directory if it doesn't exist
os.makedirs(f'{outputDir}/{fold}', exist_ok=True)
# Find the smallest i such that randomSearch_parameters[i][0] == 'nn'
# (the list was just sorted by validation MEE, so the smallest index is the
# best-ranked configuration of that model family)
try:
nn_index = next(i for i, params in enumerate(randomSearch_parameters) if params[0] == 'nn')
except StopIteration:
print("No 'nn' model found in randomSearch_parameters")
nn_index = -1 # Indicate no NN model found
# Find the smallest i such that randomSearch_parameters[i][0] == 'svr'
try:
svr_index = next(i for i, params in enumerate(randomSearch_parameters) if params[0] == 'svr')
except StopIteration:
print("No 'svr' model found in randomSearch_parameters")
svr_index = -1 # Indicate no SVR model found
# Find the smallest i such that randomSearch_parameters[i][0] == 'linearsvr'
try:
linearsvr_index = next(i for i, params in enumerate(randomSearch_parameters) if params[0] == 'linearsvr')
except StopIteration:
print("No 'linearsvr' model found in randomSearch_parameters")
linearsvr_index = -1 # Indicate no LinearSVR model found
if nn_index != -1:
# Our best NN model, refit on the full train+validation split; the
# held-out test fold is passed in the "val" argument slots, so the
# returned "val" MEE is actually this fold's test MEE.
nn_model, nn_train_mees, nn_val_mees, nn_epoch_train_mees, nn_epoch_val_mees = train_model(
tval_X_torch, tval_X_np,
tval_y_torch, tval_y_np,
test_X_torch, test_X_np,
test_y_torch, test_y_np,
randomSearch_parameters[nn_index])
# Print results for our best NN model
print(f"Best refit NN Training MEE: {nn_train_mees}")
print(f"Best refit NN Testing MEE: {nn_val_mees}")
# Plot learning curve for our best NN model
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(nn_epoch_train_mees) + 1), nn_epoch_train_mees, label='Train MEE')
plt.plot(range(1, len(nn_epoch_val_mees) + 1), nn_epoch_val_mees, label='Test MEE', linestyle='--')
plt.xlabel('Epoch')
plt.ylabel('MEE')
plt.title('NN Learning Curve - Best Model')
plt.legend()
plt.grid()
# Save the plot to a file
# NOTE(review): figures are never closed (plt.close); across 10 outer folds
# and many parameter plots this accumulates open figures -- consider closing
# each figure after savefig.
plt.savefig(f'{outputDir}/{fold}/nn_learning_curve.png')
#plt.show()
else:
nn_train_mees = float('inf') # Assign infinity if no NN model
nn_val_mees = float('inf')
if svr_index != -1:
# Our best SVR model (same refit-on-tval / score-on-test scheme as above)
svr_model, svr_train_mees, svr_val_mees, svr_epoch_train_mees, svr_epoch_val_mees = train_model(
tval_X_torch, tval_X_np,
tval_y_torch, tval_y_np,
test_X_torch, test_X_np,
test_y_torch, test_y_np,
randomSearch_parameters[svr_index])
# Print results for our best SVR model
print(f"Best refit SVR Training MEE: {svr_train_mees}")
print(f"Best refit SVR Testing MEE: {svr_val_mees}")
else:
svr_train_mees = float('inf') # Assign infinity if no SVR model
svr_val_mees = float('inf')
if linearsvr_index != -1:
# Our best LinearSVR model
linearsvr_model, linearsvr_train_mees, linearsvr_val_mees, linearsvr_epoch_train_mees, linearsvr_epoch_val_mees = train_model(
tval_X_torch, tval_X_np,
tval_y_torch, tval_y_np,
test_X_torch, test_X_np,
test_y_torch, test_y_np,
randomSearch_parameters[linearsvr_index])
# Print results for our best LinearSVR model
print(f"Best refit LinearSVR Training MEE: {linearsvr_train_mees}")
print(f"Best refit LinearSVR Testing MEE: {linearsvr_val_mees}")
else:
linearsvr_train_mees = float('inf') # Assign infinity if no LinearSVR model
linearsvr_val_mees = float('inf')
# Per-configuration results for this outer fold
df = pd.DataFrame({
'avg_train_mee': randomSearch_avg_train_mees,
'avg_val_mee': randomSearch_avg_val_mees,
'parameters': randomSearch_parameters,
})
# Map a hyperparameter name back to its position inside a params tuple.
# Relies on each params tuple having been built in the same key order as
# the matching param_grid_* dict; returns None when the model family does
# not have that hyperparameter.
def unpackParams(key,x):
if (x[0] == 'nn' and key in param_grid_nn):
return x[list(param_grid_nn).index(key)]
elif (x[0] == 'svr' and key in param_grid_svr):
return x[list(param_grid_svr).index(key)]
elif (x[0] == 'linearsvr' and key in param_grid_linearsvr):
return x[list(param_grid_linearsvr).index(key)]
else:
return None
# Expand the packed tuples into one DataFrame column per hyperparameter
for i, key in enumerate(param_grid.keys()):
df[key] = df['parameters'].apply(lambda x: unpackParams(key,x))
df.drop('parameters', axis=1, inplace=True)
# Print the DataFrame to a file
df.to_csv(f'{outputDir}/{fold}/results.csv', index=False)
# Select rows in the dataframe where 'modeltype' is 'nn'
nn_df = df[df['modeltype'] == 'nn']
# Plot statistics for each NN parameter
for column in param_grid_nn.keys():
if column == 'modeltype' or column == 'gamma':
continue; # we have nothing to do with modeltype and we don't know how to handle mixed data
# Calculate the average of avg_val_mee for each unique value in the column
average_values = nn_df.groupby(column)['avg_val_mee'].mean().reset_index()
#pprint(average_values)
# Plotting
plt.figure(figsize=(8, 5))
plt.plot(average_values[column], average_values['avg_val_mee'], marker='o', linestyle='-', color='blue')
# Add labels and title
plt.xlabel(column)
# Switch to a log x-axis when the values span more than about a decade
if pd.api.types.is_numeric_dtype(nn_df[column]) and (max(average_values[column])/min(average_values[column]) > 20):
plt.xscale('log')
plt.ylabel('Average Mean Test Error')
plt.title(f'Effect of NN {column} on Average Mean Test Error')
#plt.legend()
#plt.grid(True)
# Save the plot to a file
plt.savefig(f'{outputDir}/{fold}/nn_{column}.png')
#plt.show()
# Select rows in the dataframe where 'modeltype' is 'svr'
nn_svr = df[df['modeltype'] == 'svr']
# Plot statistics for each SVR parameter
for column in param_grid_svr.keys():
if column == 'modeltype' or column == 'gamma':
continue; # we have nothing to do with modeltype and we don't know how to handle mixed data
# Calculate the average of avg_val_mee for each unique value in the column
average_values = nn_svr.groupby(column)['avg_val_mee'].mean().reset_index()
#pprint(average_values)
# Plotting
plt.figure(figsize=(8, 5))
plt.plot(average_values[column], average_values['avg_val_mee'], marker='o', linestyle='-', color='blue')
# Add labels and title
plt.xlabel(column)
if pd.api.types.is_numeric_dtype(nn_svr[column]) and (max(average_values[column])/min(average_values[column]) > 20):
plt.xscale('log')
plt.ylabel('Average Mean Test Error')
plt.title(f'Effect of SVR {column} on Average Mean Test Error')
#plt.legend()
#plt.grid(True)
# Save the plot to a file
plt.savefig(f'{outputDir}/{fold}/svr_{column}.png')
#plt.show()
# Select rows in the dataframe where 'modeltype' is 'linearsvr'
nn_linearsvr = df[df['modeltype'] == 'linearsvr']
# Plot statistics for each LinearSVR parameter
for column in param_grid_linearsvr.keys():
if column == 'modeltype': # or column == 'gamma': LinearSVR doesn't have gamma
continue; # we have nothing to do with modeltype and we don't know how to handle mixed data
# Calculate the average of avg_val_mee for each unique value in the column
average_values = nn_linearsvr.groupby(column)['avg_val_mee'].mean().reset_index()
#pprint(average_values)
# Plotting
plt.figure(figsize=(8, 5))
plt.plot(average_values[column], average_values['avg_val_mee'], marker='o', linestyle='-', color='blue')
# Add labels and title
plt.xlabel(column)
if pd.api.types.is_numeric_dtype(nn_linearsvr[column]) and (max(average_values[column])/min(average_values[column]) > 20):
plt.xscale('log')
plt.ylabel('Average Mean Test Error')
plt.title(f'Effect of LinearSVR {column} on Average Mean Test Error')
#plt.legend()
#plt.grid(True)
# Save the plot to a file
plt.savefig(f'{outputDir}/{fold}/linearsvr_{column}.png')
#plt.show()
# Pick the family with the lowest test MEE for this fold; families that had
# no sampled configuration hold float('inf') and so never win.
best_model_index = np.argmin([nn_val_mees, svr_val_mees, linearsvr_val_mees]) # Find the index of the best model
# NOTE(review): the three variables below are assigned but never read.
best_val_mees_list = [nn_val_mees, svr_val_mees, linearsvr_val_mees]
best_train_mees_list = [nn_train_mees, svr_train_mees, linearsvr_train_mees]
model_types = ['nn', 'svr', 'linearsvr']
if best_model_index == 0 and nn_index != -1:
testFold_best_parameters.append(randomSearch_parameters[nn_index])
testFold_best_train_mees.append(nn_train_mees)
testFold_best_val_mees.append(nn_val_mees)
testFold_best_model.append(nn_model)
elif best_model_index == 1 and svr_index != -1:
testFold_best_parameters.append(randomSearch_parameters[svr_index])
testFold_best_train_mees.append(svr_train_mees)
testFold_best_val_mees.append(svr_val_mees)
testFold_best_model.append(svr_model)
elif best_model_index == 2 and linearsvr_index != -1:
testFold_best_parameters.append(randomSearch_parameters[linearsvr_index])
testFold_best_train_mees.append(linearsvr_train_mees)
testFold_best_val_mees.append(linearsvr_val_mees)
testFold_best_model.append(linearsvr_model)
else: # Fallback to SVR if something went wrong and no better model is found.
if svr_index != -1:
testFold_best_parameters.append(randomSearch_parameters[svr_index])
testFold_best_train_mees.append(svr_train_mees)
testFold_best_val_mees.append(svr_val_mees)
testFold_best_model.append(svr_model)
elif nn_index != -1:
testFold_best_parameters.append(randomSearch_parameters[nn_index])
testFold_best_train_mees.append(nn_train_mees)
testFold_best_val_mees.append(nn_val_mees)
testFold_best_model.append(nn_model)
elif linearsvr_index != -1:
testFold_best_parameters.append(randomSearch_parameters[linearsvr_index])
testFold_best_train_mees.append(linearsvr_train_mees)
testFold_best_val_mees.append(linearsvr_val_mees)
testFold_best_model.append(linearsvr_model)
else:
print("No model found in this fold!")
continue # skip to the next fold if no model was trained in this iteration.
# Sort the best models by validation MEE (after the outer loop: the overall
# winner across all folds ends up at index 0)
sorted_indices = np.argsort(testFold_best_val_mees)
testFold_best_train_mees = [testFold_best_train_mees[i] for i in sorted_indices]
testFold_best_val_mees = [testFold_best_val_mees[i] for i in sorted_indices]
testFold_best_parameters = [testFold_best_parameters[i] for i in sorted_indices]
testFold_best_model = [testFold_best_model[i] for i in sorted_indices]
# Print results of the test fold
df = pd.DataFrame({
'testFold_best_train_mees': testFold_best_train_mees,
'testFold_best_val_mees': testFold_best_val_mees,
'testFold_best_parameters': testFold_best_parameters,
})
# Print the DataFrame to a file
df.to_csv(f'{outputDir}/results.csv', index=False)
# Evaluate the best model
def bestModelEval(X):
    """Evaluate the overall best model (lowest test MEE across folds) on X.

    ``X`` must match the winning model family: a torch tensor for the NN
    model, a NumPy array for the sklearn/cuML regressors.

    :raises ValueError: if the stored model type is unrecognized
        (was print + os.abort()).
    """
    bestModelType = testFold_best_parameters[0][0]
    bestModel = testFold_best_model[0]
    if bestModelType == 'nn':
        # nn.Module instances are callable; run in eval mode without grads.
        bestModel.eval()
        with torch.no_grad():
            return bestModel(X)
    elif bestModelType in ('svr', 'linearsvr'):
        # Bug fix: sklearn/cuML estimators are NOT callable -- the original
        # code invoked bestModel(X) here, which raises TypeError at runtime.
        return bestModel.predict(X)
    else:
        raise ValueError(f"Unknown model type: {bestModelType}")
# NOTE(review): trailing paste-site text ("Leave a Comment") removed.