Untitled
unknown
plain_text
a year ago
1.4 kB
2
Indexable
Never
def make_feature_array(raw_features):
    """Return the values of *raw_features* as a NumPy array.

    Values are ordered by their sorted keys, so two mappings with the same
    key set always produce arrays with matching column order.
    """
    return np.array([raw_features[key] for key in sorted(raw_features)])


def make_dataset(directory):
    """Merge every CSV file in *directory* into one feature vector per id.

    Each file is read with ``pd.read_csv`` and must contain an ``id`` column.
    All rows sharing an ``id`` (within or across files) are merged into a
    single feature dict; later values overwrite earlier ones for the same
    column name. An ``is_bot`` column, when present, is stored under
    ``f"{dataset_path}_is_bot"`` so that the value from one file does not
    overwrite the value from another.

    Returns:
        A tuple ``(object_2_fetures, feature_names)`` where
        ``object_2_fetures`` maps each id to the array built by
        ``make_feature_array`` and ``feature_names`` is the sorted list of
        feature keys of the last merged object (an empty list when no rows
        were read).

    NOTE(review): ``feature_names`` only describes every array if all ids end
    up with the same set of feature keys -- confirm this holds for the input
    files.
    """
    objects = defaultdict(dict)
    for dataset_path in tqdm(os.listdir(directory)):
        dataset = pd.read_csv(os.path.join(directory, dataset_path))
        for _, row in dataset.iterrows():
            row_dict = dict(row)
            row_id = row_dict.pop("id")
            if "is_bot" in row_dict:
                # Prefix with the file name so per-file labels stay distinct.
                row_dict[f"{dataset_path}_is_bot"] = row_dict.pop("is_bot")
            objects[row_id].update(row_dict)

    object_2_fetures = {
        object_id: make_feature_array(features)
        for object_id, features in objects.items()
    }
    # Feature names come from the last merged object, as before; fall back to
    # an empty mapping so an empty directory no longer raises
    # UnboundLocalError on the leaked loop variable.
    last_features = list(objects.values())[-1] if objects else {}
    return object_2_fetures, sorted(last_features)


res, f_names = make_dataset("inference_val")


def merge_dataset(feature_dataset, pd_dataset, train_mode = True):
    """Align feature vectors with the rows of *pd_dataset*.

    For every row of *pd_dataset* (in order) the feature array stored in
    *feature_dataset* under ``row.id`` is collected; a missing id raises
    ``KeyError``.

    Args:
        feature_dataset: Mapping from id to feature array.
        pd_dataset: DataFrame with an ``id`` column and, when *train_mode*
            is true, a ``label`` column.
        train_mode: When true, labels are collected and returned as well.

    Returns:
        ``(X, y, ids)`` when *train_mode* is true, otherwise ``(X, ids)``.
        ``X`` is the row-wise ``np.vstack`` of the feature arrays.

    Raises:
        ValueError: From ``np.vstack`` when *pd_dataset* has no rows.
    """
    X, y, ids = [], [], []
    for _, row in pd_dataset.iterrows():
        if train_mode:
            y.append(row.label)
        X.append(feature_dataset[row.id])
        ids.append(row.id)

    if train_mode:
        return np.vstack(X), np.array(y), np.array(ids)
    return np.vstack(X), np.array(ids)


# `val_part` is not defined in this snippet; it must already exist in scope.
X_train, y_train, train_ids = merge_dataset(res, val_part)