# Paste metadata (not code): Untitled · unknown · python · a year ago · 2.8 kB · 12 · Indexable
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
# Fixing randomness, please do not change this line.
np.random.seed(1111)

# Candidate tree depths; one mean cross-validated RMSE is stored per depth,
# in the same order, in depth_RMSEs.
max_depths = [1, 2, 5, 8, 9, 10, 11, 12, 13, 15, 20]
depth_RMSEs = []

# 4-fold cross validation
for param in max_depths:
    print(f'Calculating RMSE for tree with a depth {param}...')

    ##### YOUR CODE STARTS #####
    # number of folds
    n_folds = 4

    # create a vector of fold indices
    # with length of this vector equal to number of rows in training data
    num_rows = len(X)
    fold_size = num_rows // n_folds
    # Every fold gets fold_size labels; the rows left over by the integer
    # division are labelled with the last fold so that no row is dropped.
    folds_indx = np.concatenate([
        np.repeat(np.arange(n_folds), fold_size),
        np.full(num_rows - fold_size * n_folds, n_folds - 1),
    ])
    # Shuffle the labels so that rows are assigned to folds at random.
    np.random.shuffle(folds_indx)

    # initialise variable that will hold RMSEs for each fold
    fold_RMSEs = []

    for fold_indx in range(n_folds):
        # split data into train_X, train_y and val_X, val_y depending on the fold
        # use previously generated folds_indx to fetch the right rows
        val_mask = folds_indx == fold_indx
        train_mask = ~val_mask

        # assign() returns a new frame, so the filtered copies of X are never
        # written into in place: no SettingWithCopyWarning and no
        # version-dependent dtype handling of `.loc[:, col] = ...`.
        # NOTE(review): presumably pickup_datetime is a datetime column that
        # pd.to_numeric turns into integer timestamps -- confirm upstream.
        train_X = X[train_mask].assign(
            pickup_datetime=lambda df: pd.to_numeric(df['pickup_datetime'])
        )
        train_y = y[train_mask]
        val_X = X[val_mask].assign(
            pickup_datetime=lambda df: pd.to_numeric(df['pickup_datetime'])
        )
        val_y = y[val_mask]

        # a few tests that must pass to make sure you are doing the right thing
        # please, do not delete these lines
        assert len(train_X) + len(val_X) == len(X), "Train and validation sets combined should be equal to the size of the original dataset"
        assert len(train_X) > len(val_X), "Train set should be larger than validation set (at least three times)"
        assert len(train_X) == 748518 or len(train_X) == 748519, "Train set should be of size 748518 or 748519"

        # train the decision tree with max_depth = param
        dtr = DecisionTreeRegressor(max_depth=param)

        # fit the decision tree on training data
        dtr.fit(train_X, train_y)

        # predict validation data
        val_predictions = dtr.predict(val_X)

        # calculate RMSE on validation for this fold
        rmse = np.sqrt(mean_squared_error(val_y, val_predictions))
        fold_RMSEs.append(rmse)
    ##### YOUR CODE ENDS #####

    # Average the per-fold RMSEs once and reuse the value for both the
    # progress message and the per-depth result list.
    mean_rmse = np.mean(fold_RMSEs)
    print(f'Average validation RMSE for {param} of trees is {mean_rmse}')
    depth_RMSEs.append(mean_rmse)
# (page footer, not code) Leave a Comment