Untitled
unknown
plain_text
22 days ago
26 kB
0
Indexable
# Operating system libraries import os from pprint import pprint import random from functools import wraps # Numerical and math libraries import pandas as pd import numpy as np import matplotlib.pyplot as plt # Deep Learning libraries import torch import torch.nn as nn import torch.optim as optim from torch.utils.data import DataLoader, TensorDataset # Machine learning libraries from sklearn.model_selection import KFold #from sklearn.metrics import make_scorer #from sklearn import set_config enable_torch_cuda = True enable_sklearn_cuda = True # Enable cuML for sklearn if available if enable_torch_cuda and torch.cuda.is_available(): print("Detected and enable CUDA support in PyTorch") torch.set_default_device('cuda') if enable_sklearn_cuda: try: from cuml.svm import SVR as SVR from cuml.svm import LinearSVR as LinearSVR # Import LinearSVR from cuML from cuml.ensemble import RandomForestRegressor as MultiOutputRegressor print("Detected and enabled CUDA support in cuML (sklearn-cuda)") except ImportError: print("cuML (sklearn-cuda) not found or not installed with GPU support. Falling back to CPU-based sklearn.") from sklearn.svm import SVR as SVR from sklearn.svm import LinearSVR as LinearSVR # Import LinearSVR from sklearn from sklearn.multioutput import MultiOutputRegressor else: from sklearn.svm import SVR as SVR from sklearn.svm import LinearSVR as LinearSVR # Import LinearSVR from sklearn from sklearn.multioutput import MultiOutputRegressor # Parameters for every model param_grid_base = { 'modeltype': ['nn', 'svr', 'linearsvr'], # Add 'linearsvr' } # Parameters for NN models param_grid_nn = { **param_grid_base, 'hidden_size': np.linspace(50, 300, 15, dtype=int).tolist(), 'learning_rate': np.logspace(-4, -1, num=10).tolist(), 'weight_decay': np.logspace(-3.5, -2, 5).tolist(), 'num_epochs': np.linspace(15, 1000, 10, dtype=int).tolist(), 'batch_size': [250], # np.logspace(np.log10(1), np.log10(250 + 1), 5, dtype=int).tolist(), ## too slow? 
    # (continuation of the param_grid_nn dict opened above)
    'dropout_prob': np.logspace(-3, -0.1, 3).tolist(),
    # NOTE(review): 'patience' and 'threshold' are sampled by the random
    # search but never read by train_model — no early stopping is
    # implemented, so these two hyperparameters currently have no effect.
    'patience': np.linspace(2, 10, 8).tolist(),
    'threshold': np.logspace(-5, -3, num=8).tolist(),
}

# Parameters for SVR models.
# A dict value (instead of a list) encodes a weighted categorical choice:
# key -> sampling weight (consumed by rchoicep in the random search).
param_grid_svr = {
    **param_grid_base,
    'kernel': {'linear': 0.4, 'rbf': 0.6 },
    'C': np.logspace(-2, 3, 6).tolist(),
    'epsilon': [0.01, 0.1, 0.5, 1, 5],
    'gamma': {'scale':0.30, 'auto':0.10, 0.01:0.15, 0.1:0.15, 1.0:0.15, 10.0:0.15}
}

# Parameters for LinearSVR models
param_grid_linearsvr = { # Define parameters for LinearSVR
    **param_grid_base,
    'C': np.logspace(-2, 3, 6).tolist(),
    'epsilon': [0.01, 0.1, 0.5, 1, 5],
    'loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'] # Example parameters for LinearSVR
}

# Parameters for all the possible models.
# Later keys win on collision ('C'/'epsilon' overlap between the SVR grids,
# with identical values, so the merge is harmless).
param_grid = {**param_grid_nn, **param_grid_svr, **param_grid_linearsvr} # Include param_grid_linearsvr

# Meta-parameters for sampling and cross validation
n_samples = 10      # random-search configurations tried per outer fold
outer_kfold = 10    # outer CV folds (test estimate)
inner_kfold = 5     # inner CV folds (validation / model selection)
outputDir = "plots"
randomSeed = 42

# Set the seed for reproducibility
random.seed(randomSeed)

# Get the number of available CPU cores
num_threads = os.cpu_count()
if (num_threads > 1):
    print(f"Detected and enable multithread support with {num_threads} threads")
    # Set the number of threads for CPU operations
    torch.set_num_threads(num_threads)
    os.environ["OMP_NUM_THREADS"] = str(num_threads)
    os.environ["MKL_NUM_THREADS"] = str(num_threads)
    torch.set_num_interop_threads(num_threads)
    # Set additional environment variables for sklearn
    # NOTE(review): "JOBLIB_NUM_CPU_THREADS" is not a documented joblib
    # environment variable, and "LOOPY_BACKEND" looks like a typo for a
    # joblib backend setting — confirm either actually has an effect.
    os.environ["JOBLIB_NUM_CPU_THREADS"] = str(-1)
    os.environ["LOOPY_BACKEND"] = "loky"

def set_n_jobs_default(cls):
    """Decorator to set n_jobs=-1 by default for scikit-learn estimators."""
    original_init = cls.__init__
    @wraps(original_init)
    def new_init(self, *args, **kwargs):
        # Only inject the default; an explicit caller-provided n_jobs wins.
        if 'n_jobs' not in kwargs:
            kwargs['n_jobs'] = -1
        original_init(self, *args, **kwargs)
    cls.__init__ = new_init
    return cls

# Make every MultiOutputRegressor use all cores unless told otherwise.
MultiOutputRegressor = set_n_jobs_default(MultiOutputRegressor)

# Set environment to suppress warnings
#os.environ["PYTHONWARNINGS"] = "ignore:You are using torch.load with weights_only=False"
# Read the data
data = pd.read_csv("ML-CUP24-TR.csv", comment='#', header=None)

# All columns except the last 3 and the first are input data
X_raw = data.iloc[:, 1:-3].values
# The last 3 columns are the desired outputs
y_raw = data.iloc[:, -3:].values


def normalize_data(inputs, enabled=False):
    """Optionally z-score-normalize `inputs` column-wise.

    The original function returned the input untouched: an early
    `return inputs` made the mean/std code below it unreachable. That
    (presumably deliberate) behavior is kept as the default; pass
    enabled=True to activate standardization.
    """
    if not enabled:
        return inputs
    mean = np.mean(inputs, axis=0)
    std = np.std(inputs, axis=0)
    return (inputs - mean) / std


# Convert the data to NumPy arrays and PyTorch tensors
X_np = normalize_data(X_raw).astype(np.float32)
y_np = y_raw.astype(np.float32)
X_torch = torch.tensor(normalize_data(X_raw), dtype=torch.float32)
y_torch = torch.tensor(y_raw, dtype=torch.float32)

# Define hyperparameters
input_size = X_torch.shape[1]
output_size = 3


class NeuralNet(nn.Module):
    """Single-hidden-layer MLP: input -> hidden (ReLU + dropout) -> output."""

    def __init__(self, hidden_size, dropout_prob):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x):
        out = self.relu(self.fc1(x))
        # BUG FIX: the dropout layer was constructed (and dropout_prob tuned
        # by the random search) but never applied in forward(), so the
        # hyperparameter had no effect. Apply it between hidden and output;
        # it is automatically disabled in eval() mode.
        out = self.dropout(out)
        return self.fc2(out)


def mean_euclidean_error(y_true, y_pred):
    """Mean Euclidean Error (MEE): average row-wise L2 distance."""
    return np.mean(np.sqrt(np.sum((y_true - y_pred) ** 2, axis=1)))

# Create a custom scorer (a negative MEE)
#neg_mee_scorer = make_scorer(mean_euclidean_error, greater_is_better=False)


def train_model(train_X_torch, train_X_np, train_y_torch, train_y_np,
                val_X_torch, val_X_np, val_y_torch, val_y_np, params):
    """Train one model described by the `params` tuple (first item is the
    model type) and return
    (model, last_train_mee, last_val_mee, epoch_train_mees, epoch_val_mees).
    The per-epoch lists are empty for the SVR variants.

    Hoisted out of the outer fold loop (it was redefined every fold but
    captures nothing fold-specific — all data arrives as arguments).
    """
    modeltype = params[0]
    if modeltype == 'nn':
        (modeltype, hidden_size, learning_rate, weight_decay, num_epochs,
         batch_size, dropout_prob, patience, threshold) = params
        model = NeuralNet(hidden_size, dropout_prob)
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate,
                               weight_decay=weight_decay)
        # Move model to CUDA if available
        if enable_torch_cuda and torch.cuda.is_available():
            model = model.cuda()

        # Split the training data into batches
        def batchify(tensor, size):
            return [tensor[i:i + size] for i in range(0, len(tensor), size)]

        train_X_batches = batchify(train_X_torch, batch_size)
        train_y_batches = batchify(train_y_torch, batch_size)
        n_batches = len(train_X_batches)

        # Track MEE for each epoch
        epoch_train_mees = []
        epoch_val_mees = []
        # BUG FIX: `patience` and `threshold` were sampled by the random
        # search but never used — early stopping on the validation MEE is
        # now implemented (stop after `patience` epochs without an
        # improvement larger than `threshold`).
        best_val_mee = float('inf')
        epochs_without_improvement = 0
        for epoch in range(num_epochs):
            model.train()
            train_mee = 0.0
            for batch_X, batch_y in zip(train_X_batches, train_y_batches):
                optimizer.zero_grad()
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()
                # Calculate MEE for the batch
                train_mee += mean_euclidean_error(batch_y.detach().cpu().numpy(),
                                                  outputs.detach().cpu().numpy())
            # Average MEE over all batches
            train_mee /= n_batches
            epoch_train_mees.append(train_mee)

            model.eval()
            with torch.no_grad():
                val_mee = mean_euclidean_error(
                    val_y_np, model(val_X_torch).detach().cpu().numpy())
            epoch_val_mees.append(val_mee)

            if val_mee < best_val_mee - threshold:
                best_val_mee = val_mee
                epochs_without_improvement = 0
            else:
                epochs_without_improvement += 1
                if epochs_without_improvement >= patience:
                    break
        return model, epoch_train_mees[-1], epoch_val_mees[-1], epoch_train_mees, epoch_val_mees

    elif modeltype == 'svr':
        (modeltype, kernel, C, epsilon, gamma) = params
        svr = MultiOutputRegressor(SVR(kernel=kernel, C=C, epsilon=epsilon, gamma=gamma))
        svr.fit(train_X_np, train_y_np)
        train_mees = mean_euclidean_error(train_y_np, svr.predict(train_X_np))
        val_mees = mean_euclidean_error(val_y_np, svr.predict(val_X_np))
        return svr, train_mees, val_mees, [], []

    elif modeltype == 'linearsvr':
        (modeltype, C, epsilon, loss) = params
        linearsvr = MultiOutputRegressor(LinearSVR(C=C, epsilon=epsilon, loss=loss))
        linearsvr.fit(train_X_np, train_y_np)
        train_mees = mean_euclidean_error(train_y_np, linearsvr.predict(train_X_np))
        val_mees = mean_euclidean_error(val_y_np, linearsvr.predict(val_X_np))
        return linearsvr, train_mees, val_mees, [], []

    else:
        # Was os.abort() (core dump); an exception is debuggable and
        # equally unreachable for the three known model types.
        raise ValueError(f"unknown model type {modeltype!r}")


def _sample_params():
    """Draw one random hyperparameter configuration from param_grid.

    Choice order matches the original code so a fixed random seed yields
    the same sampled configurations.
    """
    def rchoice(pg):
        return random.choice(pg)

    def rchoicep(pg):
        # Weighted categorical choice: dict maps value -> sampling weight.
        return random.choices(list(pg.keys()), weights=list(pg.values()), k=1)[0]

    modeltype = rchoice(param_grid['modeltype'])
    if modeltype == 'nn':
        return (modeltype,
                rchoice(param_grid['hidden_size']),
                rchoice(param_grid['learning_rate']),
                rchoice(param_grid['weight_decay']),
                rchoice(param_grid['num_epochs']),
                rchoice(param_grid['batch_size']),
                rchoice(param_grid['dropout_prob']),
                rchoice(param_grid['patience']),
                rchoice(param_grid['threshold']))
    if modeltype == 'svr':
        return (modeltype,
                rchoicep(param_grid['kernel']),
                rchoice(param_grid['C']),
                rchoice(param_grid['epsilon']),
                rchoicep(param_grid['gamma']))
    # linearsvr
    return (modeltype,
            rchoice(param_grid['C']),
            rchoice(param_grid['epsilon']),
            rchoice(param_grid_linearsvr['loss']))


def _first_index(params_list, modeltype):
    """Smallest i with params_list[i][0] == modeltype, or -1 if absent.

    Replaces three copy-pasted try/except-StopIteration blocks.
    """
    try:
        return next(i for i, params in enumerate(params_list) if params[0] == modeltype)
    except StopIteration:
        print(f"No '{modeltype}' model found in randomSearch_parameters")
        return -1


def unpackParams(key, x):
    """Extract parameter `key` from a params tuple `x`, or None if the
    tuple's model type does not have that parameter. The tuple layout
    mirrors the key order of the corresponding param_grid_* dict."""
    if (x[0] == 'nn' and key in param_grid_nn):
        return x[list(param_grid_nn).index(key)]
    elif (x[0] == 'svr' and key in param_grid_svr):
        return x[list(param_grid_svr).index(key)]
    elif (x[0] == 'linearsvr' and key in param_grid_linearsvr):
        return x[list(param_grid_linearsvr).index(key)]
    else:
        return None


def _plot_param_effects(model_df, grid, label, prefix, fold):
    """Plot mean validation MEE per hyperparameter value for one model type.

    Deduplicates the three near-identical per-parameter plotting loops.
    """
    for column in grid.keys():
        if column == 'modeltype' or column == 'gamma':
            # nothing to plot for modeltype; gamma mixes str and float values
            continue
        # Average of avg_val_mee for each unique value in the column
        average_values = model_df.groupby(column)['avg_val_mee'].mean().reset_index()
        plt.figure(figsize=(8, 5))
        plt.plot(average_values[column], average_values['avg_val_mee'],
                 marker='o', linestyle='-', color='blue')
        plt.xlabel(column)
        # Log x-axis for parameters spanning more than ~an order of magnitude
        if pd.api.types.is_numeric_dtype(model_df[column]) and \
                (max(average_values[column]) / min(average_values[column]) > 20):
            plt.xscale('log')
        plt.ylabel('Average Mean Test Error')
        plt.title(f'Effect of {label} {column} on Average Mean Test Error')
        plt.savefig(f'{outputDir}/{fold}/{prefix}_{column}.png')


# Outer K-fold for testing purposes
testFold = KFold(n_splits=outer_kfold, shuffle=True, random_state=randomSeed)

# Track MEE scores for each testing fold
testFold_best_train_mees = []
testFold_best_val_mees = []
testFold_best_parameters = []
testFold_best_model = []

for fold, (tval_index, test_index) in enumerate(testFold.split(X_np)):
    tval_X_np, test_X_np = X_np[tval_index], X_np[test_index]
    tval_y_np, test_y_np = y_np[tval_index], y_np[test_index]
    tval_X_torch, test_X_torch = X_torch[tval_index], X_torch[test_index]
    tval_y_torch, test_y_torch = y_torch[tval_index], y_torch[test_index]

    # Track MEE scores for this random search
    randomSearch_avg_train_mees = []
    randomSearch_avg_val_mees = []
    randomSearch_parameters = []

    # Search over a random sample of the parameter combinations
    for _ in range(n_samples):
        params = _sample_params()

        # Inner K-fold for validation purposes
        validFold = KFold(n_splits=inner_kfold, shuffle=True, random_state=randomSeed)
        validFold_train_mees = []
        validFold_val_mees = []
        for internalFold, (train_index, val_index) in enumerate(validFold.split(tval_X_np)):
            train_X_np, val_X_np = tval_X_np[train_index], tval_X_np[val_index]
            train_y_np, val_y_np = tval_y_np[train_index], tval_y_np[val_index]
            train_X_torch, val_X_torch = tval_X_torch[train_index], tval_X_torch[val_index]
            train_y_torch, val_y_torch = tval_y_torch[train_index], tval_y_torch[val_index]

            _, train_mees, val_mees, _, _ = train_model(
                train_X_torch, train_X_np, train_y_torch, train_y_np,
                val_X_torch, val_X_np, val_y_torch, val_y_np, params)
            validFold_train_mees.append(train_mees)
            validFold_val_mees.append(val_mees)

        randomSearch_avg_train_mees.append(np.mean(validFold_train_mees))
        randomSearch_avg_val_mees.append(np.mean(validFold_val_mees))
        #randomSearch_avg_train_mees.append(np.sqrt(np.mean(np.pow(validFold_train_mees,2)))) # Quadratic mean
        #randomSearch_avg_val_mees.append(np.sqrt(np.mean(np.pow(validFold_val_mees,2)))) # Quadratic mean
        randomSearch_parameters.append(params)

    # Sort the random search results by validation MEE
    sorted_indices = np.argsort(randomSearch_avg_val_mees)
    randomSearch_avg_train_mees = [randomSearch_avg_train_mees[i] for i in sorted_indices]
    randomSearch_avg_val_mees = [randomSearch_avg_val_mees[i] for i in sorted_indices]
    randomSearch_parameters = [randomSearch_parameters[i] for i in sorted_indices]

    print(f"RandomSearch Best Training MEE: {randomSearch_avg_train_mees[0]}")
    print(f"RandomSearch Best Validation MEE: {randomSearch_avg_val_mees[0]}")
    print(f"RandomSearch Best Parameters: {randomSearch_parameters[0]}")

    # Create directory if it doesn't exist
    os.makedirs(f'{outputDir}/{fold}', exist_ok=True)

    # Best-ranked configuration of each model type (or -1 if none sampled)
    nn_index = _first_index(randomSearch_parameters, 'nn')
    svr_index = _first_index(randomSearch_parameters, 'svr')
    linearsvr_index = _first_index(randomSearch_parameters, 'linearsvr')

    if nn_index != -1:
        # Refit our best NN model on the whole train+validation split and
        # evaluate on the held-out test split
        nn_model, nn_train_mees, nn_val_mees, nn_epoch_train_mees, nn_epoch_val_mees = train_model(
            tval_X_torch, tval_X_np, tval_y_torch, tval_y_np,
            test_X_torch, test_X_np, test_y_torch, test_y_np,
            randomSearch_parameters[nn_index])
        print(f"Best refit NN Training MEE: {nn_train_mees}")
        print(f"Best refit NN Testing MEE: {nn_val_mees}")

        # Plot learning curve for our best NN model
        plt.figure(figsize=(10, 6))
        plt.plot(range(1, len(nn_epoch_train_mees) + 1), nn_epoch_train_mees, label='Train MEE')
        plt.plot(range(1, len(nn_epoch_val_mees) + 1), nn_epoch_val_mees, label='Test MEE', linestyle='--')
        plt.xlabel('Epoch')
        plt.ylabel('MEE')
        plt.title('NN Learning Curve - Best Model')
        plt.legend()
        plt.grid()
        plt.savefig(f'{outputDir}/{fold}/nn_learning_curve.png')
        #plt.show()
    else:
        nn_model = None
        nn_train_mees = float('inf')  # Assign infinity if no NN model
        nn_val_mees = float('inf')

    if svr_index != -1:
        # Refit our best SVR model
        svr_model, svr_train_mees, svr_val_mees, _, _ = train_model(
            tval_X_torch, tval_X_np, tval_y_torch, tval_y_np,
            test_X_torch, test_X_np, test_y_torch, test_y_np,
            randomSearch_parameters[svr_index])
        print(f"Best refit SVR Training MEE: {svr_train_mees}")
        print(f"Best refit SVR Testing MEE: {svr_val_mees}")
    else:
        svr_model = None
        svr_train_mees = float('inf')  # Assign infinity if no SVR model
        svr_val_mees = float('inf')

    if linearsvr_index != -1:
        # Refit our best LinearSVR model
        linearsvr_model, linearsvr_train_mees, linearsvr_val_mees, _, _ = train_model(
            tval_X_torch, tval_X_np, tval_y_torch, tval_y_np,
            test_X_torch, test_X_np, test_y_torch, test_y_np,
            randomSearch_parameters[linearsvr_index])
        print(f"Best refit LinearSVR Training MEE: {linearsvr_train_mees}")
        print(f"Best refit LinearSVR Testing MEE: {linearsvr_val_mees}")
    else:
        linearsvr_model = None
        linearsvr_train_mees = float('inf')  # Assign infinity if no LinearSVR model
        linearsvr_val_mees = float('inf')

    # Dump the full random-search table for this fold
    df = pd.DataFrame({
        'avg_train_mee': randomSearch_avg_train_mees,
        'avg_val_mee': randomSearch_avg_val_mees,
        'parameters': randomSearch_parameters,
    })
    for key in param_grid.keys():
        df[key] = df['parameters'].apply(lambda x: unpackParams(key, x))
    df.drop('parameters', axis=1, inplace=True)
    df.to_csv(f'{outputDir}/{fold}/results.csv', index=False)

    # Per-hyperparameter effect plots, one set per model type
    _plot_param_effects(df[df['modeltype'] == 'nn'], param_grid_nn, 'NN', 'nn', fold)
    _plot_param_effects(df[df['modeltype'] == 'svr'], param_grid_svr, 'SVR', 'svr', fold)
    _plot_param_effects(df[df['modeltype'] == 'linearsvr'], param_grid_linearsvr, 'LinearSVR', 'linearsvr', fold)

    # Pick the model type with the lowest test MEE among those actually
    # sampled this fold (missing types carry inf, so they never win; on a
    # tie the earlier entry wins: nn, then svr, then linearsvr — same
    # preference order as the original argmin + fallback chain).
    candidates = []
    if nn_index != -1:
        candidates.append((nn_val_mees, nn_train_mees, nn_index, nn_model))
    if svr_index != -1:
        candidates.append((svr_val_mees, svr_train_mees, svr_index, svr_model))
    if linearsvr_index != -1:
        candidates.append((linearsvr_val_mees, linearsvr_train_mees, linearsvr_index, linearsvr_model))
    if not candidates:
        print("No model found in this fold!")
        continue  # skip to the next fold if no model was trained in this iteration.
    best_val, best_train, best_idx, best_model = min(candidates, key=lambda c: c[0])
    testFold_best_parameters.append(randomSearch_parameters[best_idx])
    testFold_best_train_mees.append(best_train)
    testFold_best_val_mees.append(best_val)
    testFold_best_model.append(best_model)

# NOTE(review): the collapsed original made the loop boundary ambiguous;
# the summary below reads most naturally as post-loop code (it aggregates
# across all folds and writes a single top-level results.csv) — confirm.

# Sort the best models by validation MEE
sorted_indices = np.argsort(testFold_best_val_mees)
testFold_best_train_mees = [testFold_best_train_mees[i] for i in sorted_indices]
testFold_best_val_mees = [testFold_best_val_mees[i] for i in sorted_indices]
testFold_best_parameters = [testFold_best_parameters[i] for i in sorted_indices]
testFold_best_model = [testFold_best_model[i] for i in sorted_indices]

# Print results of the test folds
df = pd.DataFrame({
    'testFold_best_train_mees': testFold_best_train_mees,
    'testFold_best_val_mees': testFold_best_val_mees,
    'testFold_best_parameters': testFold_best_parameters,
})
df.to_csv(f'{outputDir}/results.csv', index=False)


def bestModelEval(X):
    """Evaluate the overall best model on input X.

    The NN expects a torch tensor and is called directly; the sklearn
    estimators expect a NumPy array and are evaluated via .predict
    (BUG FIX: the original called the sklearn models as bestModel(X),
    which raises TypeError — estimators are not callable).
    """
    bestModelType = testFold_best_parameters[0][0]
    bestModel = testFold_best_model[0]
    if bestModelType == 'nn':
        return bestModel(X)
    elif bestModelType in ('svr', 'linearsvr'):
        return bestModel.predict(X)
    else:
        print("Unknown model type?")
        os.abort()
Editor is loading...
Leave a Comment