# Untitled
#
# mail@pastecode.io avatar
# unknown
# plain_text
# 2 years ago
# 1.4 kB
# 2
# Indexable
def make_feature_array(raw_features):
    """Return the values of *raw_features* as an array ordered by sorted key.

    Args:
        raw_features: Mapping from feature name to feature value.

    Returns:
        ``np.array`` holding one value per key, in ascending key order.
    """
    ordered_names = sorted(raw_features)
    return np.array([raw_features[name] for name in ordered_names])

def make_dataset(directory):
    """Build one feature array per object id from every file in *directory*.

    Every entry of the directory is read with ``pd.read_csv``.  For each row
    the ``id`` column selects the object, and the remaining columns are merged
    into that object's feature dict; when the same column is seen again for
    the same id, the value read last wins.  An ``is_bot`` column is stored
    under ``"<file name>_is_bot"`` so that flags coming from different files
    do not overwrite each other.

    Args:
        directory: Path of the directory whose entries are read as CSV files.

    Returns:
        A tuple ``(object_2_features, feature_names)``.  ``object_2_features``
        maps each id to the array built by ``make_feature_array``;
        ``feature_names`` is the sorted list of feature names of the last
        object collected.  Both are empty when no rows were read.
    """
    objects = defaultdict(dict)
    for dataset_path in tqdm(os.listdir(directory)):
        dataset = pd.read_csv(os.path.join(directory, dataset_path))
        for _, row in dataset.iterrows():
            object_id = row.id
            row_dict = dict(row)
            del row_dict["id"]

            if "is_bot" in row_dict:
                # Namespace the flag by its source file so several files can
                # each contribute their own is_bot feature.
                row_dict[f"{dataset_path}_is_bot"] = row_dict.pop("is_bot")

            objects[object_id].update(row_dict)

    object_2_features = {}
    # Stays empty when nothing was read, so the return below cannot hit an
    # unbound loop variable.
    last_features = {}
    for object_id, features in objects.items():
        object_2_features[object_id] = make_feature_array(features)
        last_features = features

    # NOTE(review): the names are taken from the last object only; this matches
    # the array columns of every object only if all ids end up with the same
    # set of features -- confirm against the input files.
    return object_2_features, sorted(last_features)

# Module-level call: build the per-id feature arrays (`res`) and the sorted
# feature names (`f_names`) from the files found in "inference_val".
res, f_names = make_dataset("inference_val")

def merge_dataset(feature_dataset, pd_dataset, train_mode=True):
    """Stack the feature arrays of the ids listed in *pd_dataset*.

    Args:
        feature_dataset: Mapping from id to that id's feature array.
        pd_dataset: DataFrame with an ``id`` column and, when *train_mode* is
            true, a ``label`` column.
        train_mode: When true, labels are collected and returned as well.

    Returns:
        ``(X, y, ids)`` in train mode, otherwise ``(X, ids)``; ``X`` is the
        ``np.vstack`` of the looked-up feature arrays in row order.
    """
    feature_rows = []
    labels = []
    row_ids = []
    for _, record in pd_dataset.iterrows():
        if train_mode:
            labels.append(record.label)
        record_id = record.id
        feature_rows.append(feature_dataset[record_id])
        row_ids.append(record_id)

    if not train_mode:
        return np.vstack(feature_rows), np.array(row_ids)
    return np.vstack(feature_rows), np.array(labels), np.array(row_ids)

# Assemble the feature matrix, labels and ids for the rows of `val_part`
# (train_mode defaults to True, so `val_part` must provide `id` and `label`).
# NOTE(review): `val_part` is not defined in this snippet, and the results are
# named *_train although the inputs look like validation data -- confirm.
X_train, y_train, train_ids = merge_dataset(res, val_part)