import gzip
from pathlib import Path

import numpy as np
import pandas as pd
import simdjson
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

"""
Question:
The goal of this exercise is to implement a simple model that predicts the
minimum price of a search. This model will be used to color the calendar in
the UI of the frontpage. E.g. when a user searches for the flight ZRH -> LON
with departure date 28.02.2024, the model should predict the minimum price
that we will see over all results.

To train the model we can use logs (MBL) from the backend (BE). MBL is stored
as jsonl partitioned by year, month, day, hour, e.g.
/disk3/data/mbl/raw/year=2024/month=02/day=01/hour=00 contains multiple jsonl
files such as
data-2024-02-01-00.00.00.10-p4-analytics-seeker-mbl-flight-0-0000000000581601118.json.gz.

Hints:

=== MBL structure ===
Each line in these files represents a search as json with the following structure:
- message
    - SearchID
    - SearchLegs (list)
        - Origin
        - Destination
        - Date
    - FlightResults (list)
        - PriceOptions (list)
            - TotalPrice
            - ...
    - ...
- context
- ...

=== unzip files ===
with gzip.open(file, 'rb') as f:
    for line in f:

=== parse json lines ===
import simdjson
parser = simdjson.Parser()
raw_search = parser.parse(line)

Follow-up questions:
- how do you handle unseen categorical data, e.g. if we have LON in test but not in train?
- other features?
- how to scale?
- what other models?
"""


def load_data(path: str, max_num_rows=None) -> pd.DataFrame:
    # rglob already searches recursively, so no '**/' prefix is needed
    files = list(Path(path).rglob('*.json.gz'))
    # one parser reused across all lines: every value we need is extracted
    # into Python objects before the next parse, so reuse is safe and avoids
    # constructing a new Parser per line
    parser = simdjson.Parser()
    rows = []
    for file in files:
        with gzip.open(file, 'rb') as f:
            for line in f:
                raw_search = parser.parse(line)

                # search legs: only keep single-leg (one-way) searches for now
                raw_search_legs = raw_search['message']['SearchLegs']
                if len(raw_search_legs) > 1:
                    continue
                raw_search_leg = raw_search_legs[0]
                origin = raw_search_leg['Origin']
                destination = raw_search_leg['Destination']
                departure_date = raw_search_leg['Date']

                # min price over all price options of all results
                search_id = raw_search['message']['SearchID']
                min_price = np.inf
                for raw_result in raw_search['message']['FlightResults']:
                    for raw_price_option in raw_result['PriceOptions']:
                        price = raw_price_option['TotalPrice']
                        if price < min_price:
                            min_price = price

                # drop searches without results (min_price stays inf) and outliers
                if 1_000 <= min_price <= 10_000:
                    rows.append({
                        'search_id': search_id,
                        'origin': origin,
                        'destination': destination,
                        'departure_date': departure_date,
                        'min_price': min_price,
                    })
                    if max_num_rows is not None and len(rows) == max_num_rows:
                        return pd.DataFrame(rows)
                    if len(rows) % 100 == 0:
                        print(len(rows))
    return pd.DataFrame(rows)
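
# --- Follow-up sketch: unseen categorical data (illustrative, not called below) ---
# A minimal answer to the first follow-up question. If the category sets are
# fitted on the training rows only, an airport that appears only at test time
# (e.g. LON in test but not in train) is mapped to NaN, which XGBoost's
# categorical support treats as a missing value rather than an unknown level.
def align_categories(train_df: pd.DataFrame, test_df: pd.DataFrame, cols: list) -> tuple:
    train_df = train_df.copy()
    test_df = test_df.copy()
    for col in cols:
        # category set is defined by the training data only
        dtype = pd.CategoricalDtype(train_df[col].dropna().unique())
        train_df[col] = train_df[col].astype(dtype)
        test_df[col] = test_df[col].astype(dtype)  # unseen values -> NaN
    return train_df, test_df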

if __name__ == '__main__':
    # load data (flip to True to rebuild the csv from the raw MBL logs)
    if False:
        path = '/disk3/data/mbl/raw/year=2024/month=02/day=01'
        df = load_data(path, max_num_rows=10_000)
        print(len(df))
        # index=False keeps read_csv from picking up a spurious index column
        df.to_csv('/disk2/lneukom/tmp/df.csv', index=False)
    df = pd.read_csv('/disk2/lneukom/tmp/df.csv')

    # feature engineering: xgboost can consume pandas categoricals directly
    df['search_id'] = df['search_id'].astype('category')
    df['origin'] = df['origin'].astype('category')
    df['destination'] = df['destination'].astype('category')
    df['departure_date'] = df['departure_date'].astype('category')

    # split data; the previous `len(df) // 2 + 1` silently dropped one row
    train_df = df[:len(df) // 2]
    test_df = df[len(df) // 2:]

    # select features
    features = ['origin', 'destination', 'departure_date']
    target = 'min_price'
    X_train = train_df[features]
    y_train = train_df[target]
    X_test = test_df[features]
    y_test = test_df[target]

    # train
    eval_set = [(X_train, y_train), (X_test, y_test)]
    model = XGBRegressor(
        enable_categorical=True,
        tree_method='hist',  # required for categorical support on older xgboost
        # eval_metric moved from fit() to the constructor (removed from fit in xgboost 2.0)
        eval_metric=['mae', 'rmse'],
        verbosity=2,
    )
    model.fit(
        X_train,
        y_train,
        eval_set=eval_set,
        verbose=True,
    )

    # predict
    y_pred = model.predict(X_test)

    # baseline: constant prediction of the training mean
    y_baseline = [np.mean(y_train)] * len(y_test)

    # eval
    model_mae = mean_absolute_error(y_test, y_pred)
    baseline_mae = mean_absolute_error(y_test, y_baseline)
    print(f'baseline mae: {baseline_mae}')
    print(f'mae: {model_mae}')
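
    # --- Follow-up sketch: a stronger, model-free baseline (illustrative) ---
    # Median training price per (origin, destination) route; routes unseen in
    # train fall back to the global training median. Assumes the train/test
    # frames built above; not part of the original exercise solution.
    # observed=True avoids NaN medians for unobserved category combinations.
    route_median = train_df.groupby(
        ['origin', 'destination'], observed=True
    )['min_price'].median()
    global_median = train_df['min_price'].median()
    y_route = [
        route_median.get((o, d), global_median)
        for o, d in zip(test_df['origin'], test_df['destination'])
    ]
    route_mae = mean_absolute_error(y_test, y_route)
    print(f'route-median mae: {route_mae}')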
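
    # --- Follow-up sketch: date-derived features (illustrative) ---
    # The model above treats departure_date as an opaque category, so it
    # cannot generalise to dates never seen in training. Parsing the date
    # yields ordinal features such as day-of-week and month; errors='coerce'
    # is used because the exact format of the MBL Date field is assumed here.
    # Days-until-departure would likely help too, but it needs the search
    # timestamp, which load_data does not extract.
    dates = pd.to_datetime(test_df['departure_date'].astype(str), errors='coerce')
    date_features = pd.DataFrame({
        'departure_dow': dates.dt.dayofweek,  # weekly seasonality
        'departure_month': dates.dt.month,    # yearly seasonality
    })
    print(date_features.head())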