Untitled
user_5387634
plain_text
9 months ago
653 B
3
Indexable
import json
import os

import datasets
from datasets import load_dataset

# Upper bound on how many training examples are written to the JSONL file.
MAX_TRAIN_EXAMPLES = 100000


def load_cnn(mode="train"):
    """Load one split of CNN/DailyMail 3.0.0 as prompt/completion pairs.

    Args:
        mode: Key of the split to select from the loaded dataset
            (the script below uses "train").

    Returns:
        A list with one dict per record, each holding:
          * 'prompt': the record's 'article' with every whitespace run
            collapsed to a single space, followed by a blank line and the
            instruction "Summarize the above article:".
          * 'completion': the record's 'highlights' with newlines replaced
            by spaces.
    """
    raw_datasets = load_dataset('cnn_dailymail', name='3.0.0')[mode]
    # Each record is read through its 'article' and 'highlights' fields.
    return [
        {
            # split() with no arguments already breaks on newlines, so a
            # separate newline replacement before it is unnecessary.
            'prompt': " ".join(x['article'].split())
            + '\n\nSummarize the above article:',
            'completion': x['highlights'].replace("\n", " "),
        }
        for x in raw_datasets
    ]


mode = "train"
data = load_cnn(mode)
if mode == "train":
    # Only the first MAX_TRAIN_EXAMPLES training records are exported.
    data = data[:MAX_TRAIN_EXAMPLES]

# Write one JSON object per line; the context manager guarantees the file is
# flushed and closed even if serialization fails part-way through.
with open(f"{mode}.jsonl", "w") as fo:
    for i in data:
        fo.write(json.dumps(i) + "\n")
Editor is loading...
Leave a Comment