def make_feature_array(raw_features):
    """Convert a feature mapping into an array with a deterministic order.

    Args:
        raw_features: Mapping of feature name to feature value. The keys
            must be mutually sortable.

    Returns:
        A NumPy array holding the values of ``raw_features`` ordered by
        their sorted keys, so position ``i`` corresponds to
        ``sorted(raw_features)[i]``.
    """
    # Sorting the keys keeps the column order independent of insertion order.
    return np.array([raw_features[key] for key in sorted(raw_features)])
def make_dataset(directory):
    """Merge every CSV file in ``directory`` into per-object feature arrays.

    Each entry returned by ``os.listdir(directory)`` is read with
    ``pd.read_csv``. Every row must provide an ``id`` column; rows sharing
    the same ``id`` (within one file or across files) are merged into a
    single feature dict, with later values overwriting earlier ones for
    identical column names. An ``is_bot`` column, when present, is stored
    under ``"<file name>_is_bot"`` so that labels coming from different
    files do not overwrite each other.

    Args:
        directory: Path of the directory whose entries are read as CSV.

    Returns:
        A tuple ``(object_2_features, feature_names)`` where
        ``object_2_features`` maps each object id to the array built by
        ``make_feature_array`` and ``feature_names`` is the sorted list of
        feature names of the last object collected. Returns ``({}, [])``
        when no rows were read.

    Note:
        ``feature_names`` lines up with every array only if all objects
        ended up with the same set of feature names.
    """
    merged = defaultdict(dict)
    # NOTE: os.listdir returns entries in arbitrary order, so which file
    # wins on a column-name collision between files is not specified here.
    for dataset_path in tqdm(os.listdir(directory)):
        dataset = pd.read_csv(os.path.join(directory, dataset_path))
        for _, row in dataset.iterrows():
            row_dict = dict(row)
            object_id = row_dict.pop("id")
            if "is_bot" in row_dict:
                # Namespace the label by its source file name.
                row_dict[f"{dataset_path}_is_bot"] = row_dict.pop("is_bot")
            merged[object_id].update(row_dict)

    object_2_features = {}
    # Stays empty when nothing was read, so the return below yields [] instead
    # of failing on an unbound loop variable as the previous version did.
    last_features = {}
    for object_id, last_features in merged.items():
        object_2_features[object_id] = make_feature_array(last_features)
    return object_2_features, sorted(last_features)