# Untitled -- pastecode.io paste (python, 2.8 kB).
# The original page header was web-page metadata, not part of the script.
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Fixing randomness, please do not change this line.
np.random.seed(1111)

# Tree depths to evaluate; one averaged validation RMSE is stored per depth.
max_depths = [1, 2, 5, 8, 9, 10, 11, 12, 13, 15, 20]
depth_RMSEs = []

# Number of cross-validation folds (the same for every depth).
n_folds = 4

# Convert the pickup timestamp to a numeric column ONCE, on a new frame.
# The conversion does not depend on the fold or the depth, so there is no need
# to repeat it inside the loops. `assign` returns a new DataFrame, which avoids
# writing through `.loc[:, col]` on a filtered slice (a source of
# SettingWithCopyWarning, and its dtype handling differs between pandas
# releases). X itself is left unmodified.
X_numeric = X.assign(pickup_datetime=pd.to_numeric(X['pickup_datetime']))
num_rows = len(X_numeric)

# 4-fold cross validation
for param in max_depths:
  print(f'Calculating RMSE for tree with a depth {param}...')
  ##### YOUR CODE STARTS #####
  # Create a vector of fold labels with one entry per row of the training data.
  # The first n_folds - 1 folds get `fold_size` rows each; the last fold also
  # absorbs the remainder when num_rows is not divisible by n_folds.
  fold_size = num_rows // n_folds
  fold_counts = np.full(n_folds, fold_size)
  fold_counts[-1] = num_rows - fold_size * (n_folds - 1)
  folds_indx = np.repeat(np.arange(n_folds), fold_counts)

  # Shuffle so rows are assigned to folds at random. This stays inside the
  # depth loop (as in the original), so every depth draws a fresh assignment
  # from the seeded global RNG.
  np.random.shuffle(folds_indx)

  # RMSE of each fold for the current depth
  fold_RMSEs = []
  for fold_indx in np.arange(n_folds):

    # Rows labelled with the current fold are held out for validation;
    # all remaining rows are used for training.
    val_mask = folds_indx == fold_indx
    train_mask = ~val_mask

    train_X = X_numeric[train_mask]
    train_y = y[train_mask]
    val_X = X_numeric[val_mask]
    val_y = y[val_mask]

    # a few tests that must pass to make sure you are doing the right thing
    # please, do not delete these lines
    assert len(train_X) + len(val_X) == len(X), "Train and validation sets combined should be equal to the size of the original dataset"
    assert len(train_X) > len(val_X), "Train set should be larger than validation set (at least three times)"
    assert len(train_X) == 748518 or len(train_X) == 748519, "Train set should be of size 748518 or 748519"

    # train the decision tree with max_depth = param
    dtr = DecisionTreeRegressor(max_depth=param)

    # fit the decision tree on training data
    dtr.fit(train_X, train_y)

    # predict validation data
    val_predictions = dtr.predict(val_X)

    # calculate RMSE on validation for this fold
    rmse = np.sqrt(mean_squared_error(val_y, val_predictions))
    fold_RMSEs.append(rmse)

  ##### YOUR CODE ENDS #####
  # Average the per-fold RMSEs once and reuse the value for reporting/storage.
  mean_rmse = np.mean(fold_RMSEs)
  print(f'Average validation RMSE for {param} of trees is {mean_rmse}')
  depth_RMSEs.append(mean_rmse)
# (pastecode.io page footer "Leave a Comment" -- not part of the script)