Untitled
unknown
python
14 days ago
2.8 kB
2
Indexable
Never
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# NOTE: `np`, `pd`, `X` and `y` are not defined in this cell; they are
# expected to come from earlier in the notebook. `X` is indexed by column
# name below, so it is treated as a pandas DataFrame.

# Fixing randomness, please do not change this line.
np.random.seed(1111)

max_depths = [1, 2, 5, 8, 9, 10, 11, 12, 13, 15, 20]
depth_RMSEs = []

# number of folds
n_folds = 4
num_rows = len(X)
fold_size = num_rows // n_folds

# Rows per fold: every fold gets `fold_size` rows and the last fold
# additionally absorbs the remainder of the integer division.
fold_sizes = np.full(n_folds, fold_size)
fold_sizes[-1] = num_rows - fold_size * (n_folds - 1)

# Convert the datetime column to numbers once, on a new frame, instead of
# re-converting every train/validation slice inside the loops. Assigning a
# whole column on a fresh frame leaves `X` untouched and does not write into
# a filtered copy, so no SettingWithCopyWarning is triggered.
X_numeric = X.assign(pickup_datetime=pd.to_numeric(X['pickup_datetime']))

# 4-fold cross validation
for param in max_depths:
    print(f'Calculating RMSE for tree with a depth {param}...')

    ##### YOUR CODE STARTS #####
    # create a vector of fold indices with one entry per row of the training
    # data: [0, 0, ..., 1, 1, ..., n_folds - 1], then shuffle it in place.
    # The labels are reshuffled for every depth on purpose: shuffling once
    # outside this loop would change the random stream consumed after the
    # seed above and therefore the reported numbers.
    folds_indx = np.repeat(np.arange(n_folds), fold_sizes)
    np.random.shuffle(folds_indx)

    # initialise variable that will hold RMSEs for each fold
    fold_RMSEs = []

    for fold_indx in range(n_folds):
        # split data into train_X, train_y and val_X, val_y depending on the
        # fold; rows labelled `fold_indx` are held out for validation
        val_mask = folds_indx == fold_indx
        train_mask = ~val_mask

        train_X = X_numeric[train_mask]
        train_y = y[train_mask]
        val_X = X_numeric[val_mask]
        val_y = y[val_mask]

        # a few tests that must pass to make sure you are doing the right thing
        # please, do not delete these lines
        assert len(train_X) + len(val_X) == len(X), "Train and validation sets combined should be equal to the size of the original dataset"
        assert len(train_X) > len(val_X), "Train set should be larger than validation set (at least three times)"
        assert len(train_X) == 748518 or len(train_X) == 748519, "Train set should be of size 748518 or 748519"

        # train the decision tree with max_depth = param
        dtr = DecisionTreeRegressor(max_depth=param)

        # fit the decision tree on training data
        dtr.fit(train_X, train_y)

        # predict validation data
        val_predictions = dtr.predict(val_X)

        # calculate RMSE on validation for this fold
        rmse = np.sqrt(mean_squared_error(val_y, val_predictions))
        fold_RMSEs.append(rmse)
    ##### YOUR CODE ENDS #####

    # average the per-fold errors once and reuse the value for both the
    # progress message and the per-depth results list
    mean_rmse = np.mean(fold_RMSEs)
    print(f'Average validation RMSE for {param} of trees is {mean_rmse}')
    depth_RMSEs.append(mean_rmse)
Leave a Comment