# Untitled

import gzip
from pathlib import Path

import numpy as np
import pandas as pd
import simdjson
from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_error
"""
Question:

The goal of this exercise is to implement a simple model that predicts the minimum price of a search. This
model will be used to color the calendar in the UI of the frontpage. E.g. when a user searches for the flight
ZRH -> LON with departure date 28.02.2024 the model should predict the minimum price that we will see over 
all results.

To train the model we can use logs (MBL) from the backend (BE). MBL is stored as jsonl partitioned by 
year,month,day,hour, e.g. /disk3/data/mbl/raw/year=2024/month=02/day=01/hour=00 contains multiple jsonl files such
as data-2024-02-01-00.00.00.10-p4-analytics-seeker-mbl-flight-0-0000000000581601118.json.gz.

Hints: 

=== MBL structure ===
Each line in these files represents a search as json with the following structure:
- message
    - SearchID
    - SearchLegs (list)
        - Origin
        - Destination
        - Date
    - FlightResults (list)
        - PriceOptions (list)
            - TotalPrice
            - ...
    - ...
- context
    - ...

=== unzip files  ===
with gzip.open(file, 'rb') as f:
    for line in f:

=== parse json lines  ===
import simdjson
parser = simdjson.Parser()
raw_search = parser.parse(line)


Follow-up questions:
- How do you handle unseen categorical data, e.g. if we have LON in test but not in train?
- other features?
- how to scale?
- what other models?
"""


def load_data(path: str, max_num_rows=None) -> pd.DataFrame:
    """Collect one training row per single-leg search from gzipped MBL logs.

    Every ``*.json.gz`` file below *path* is read line by line and each line
    is parsed as one search. Searches whose ``SearchLegs`` list does not
    contain exactly one leg are skipped. The target ``min_price`` is the
    smallest ``TotalPrice`` over all price options of all flight results.
    Only searches with ``1_000 <= min_price <= 10_000`` are kept, which also
    drops searches that have no price option at all.

    The current row count is printed every 100 collected rows.

    Args:
        path: Root directory that is searched recursively for
            ``*.json.gz`` files.
        max_num_rows: Stop as soon as this many rows have been collected.
            ``None`` (default) reads every file completely; a value
            ``<= 0`` yields an empty frame without reading anything.

    Returns:
        A DataFrame with the columns ``search_id``, ``origin``,
        ``destination``, ``departure_date`` and ``min_price``. The columns
        are present even when no search qualified.
    """
    columns = ['search_id', 'origin', 'destination', 'departure_date', 'min_price']
    rows = []

    if max_num_rows is not None and max_num_rows <= 0:
        return pd.DataFrame(rows, columns=columns)

    # rglob() already recurses, so no leading '**/' is needed. Sorting makes
    # the visiting order (and therefore any max_num_rows sample) reproducible
    # instead of depending on the file system's listing order.
    files = sorted(Path(path).rglob('*.json.gz'))

    for file in files:
        with gzip.open(file, 'rb') as f:
            for line in f:
                # NOTE(review): a fresh parser per line is kept on purpose.
                # pysimdjson is documented to refuse re-using a Parser while
                # objects from a previously parsed document are still
                # referenced; hoisting it would require dropping every such
                # reference first - confirm before optimizing.
                parser = simdjson.Parser()
                raw_search = parser.parse(line)
                message = raw_search['message']

                # Keep only searches with exactly one leg. This also guards
                # against an empty SearchLegs list, which would otherwise
                # fail on the [0] access below.
                search_legs = message['SearchLegs']
                if len(search_legs) != 1:
                    continue
                leg = search_legs[0]

                # Cheapest price over every price option of every result;
                # stays at +inf when the search returned no price at all.
                min_price = min(
                    (
                        price_option['TotalPrice']
                        for result in message['FlightResults']
                        for price_option in result['PriceOptions']
                    ),
                    default=np.inf,
                )

                # Discard outliers and searches without any price.
                if not 1_000 <= min_price <= 10_000:
                    continue

                rows.append({
                    'search_id': message['SearchID'],
                    'origin': leg['Origin'],
                    'destination': leg['Destination'],
                    'departure_date': leg['Date'],
                    'min_price': min_price,
                })

                if max_num_rows is not None and len(rows) >= max_num_rows:
                    return pd.DataFrame(rows, columns=columns)

                # Lightweight progress indicator.
                if len(rows) % 100 == 0:
                    print(len(rows))

    return pd.DataFrame(rows, columns=columns)


# Set to True to rebuild the cached CSV from the raw MBL logs before training.
RELOAD_FROM_RAW = False
RAW_DATA_PATH = '/disk3/data/mbl/raw/year=2024/month=02/day=01'
CACHE_CSV_PATH = '/disk2/lneukom/tmp/df.csv'


def main() -> None:
    """Train an XGBoost regressor on cached search rows and report its MAE.

    Steps: optionally rebuild the CSV cache from the raw logs, load the
    cache, cast the feature columns to pandas categoricals, split the rows
    into a first half (train) and second half (test), fit the model, and
    print the test MAE next to a constant "mean of train" baseline.
    """
    # load data
    if RELOAD_FROM_RAW:
        df = load_data(RAW_DATA_PATH, max_num_rows=10_000)
        print(len(df))
        # index=False keeps a meaningless "Unnamed: 0" column out of the cache.
        df.to_csv(CACHE_CSV_PATH, index=False)

    df = pd.read_csv(CACHE_CSV_PATH)

    features = ['origin', 'destination', 'departure_date']
    target = 'min_price'

    # feature engineering
    # Cast before splitting so both halves carry the same categorical dtype
    # (identical category set) when handed to XGBoost.
    for column in features:
        df[column] = df[column].astype('category')

    # split data: first half for training, second half for testing.
    # The split point itself belongs to the test half so that no row is lost.
    split_at = len(df) // 2
    train_df = df.iloc[:split_at]
    test_df = df.iloc[split_at:]

    X_train = train_df[features]
    y_train = train_df[target]

    X_test = test_df[features]
    y_test = test_df[target]

    # train
    # eval_metric is given to the constructor: passing it to fit() is
    # deprecated since XGBoost 1.6 and no longer accepted by 2.1+.
    model = XGBRegressor(
        enable_categorical=True,
        eval_metric=['mae', 'rmse'],
    )
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=True,
    )

    # predict
    y_pred = model.predict(X_test)

    # baseline: always predict the mean training price
    y_baseline = np.full(len(y_test), y_train.mean())

    # eval
    model_mae = mean_absolute_error(y_test, y_pred)
    baseline_mae = mean_absolute_error(y_test, y_baseline)
    print(f'baseline mae: {baseline_mae}')
    print(f'mae: {model_mae}')


if __name__ == '__main__':
    main()
# Editor is loading...