Untitled
user_5387634
plain_text
a year ago
653 B
5
Indexable
import datasets
from datasets import load_dataset
import json
import os
def load_cnn(mode="train"):
    """Build prompt/completion pairs from one split of cnn_dailymail 3.0.0.

    Args:
        mode: Key of the split to select from the loaded dataset
            (defaults to "train").

    Returns:
        A list with one dict per example, each holding:
          'prompt': the article with every run of whitespace (newlines
              included) collapsed to a single space, followed by a blank
              line and the summarization instruction.
          'completion': the highlights with newlines replaced by spaces.
    """
    raw_datasets = load_dataset('cnn_dailymail', name='3.0.0')[mode]
    # Each example is read through its 'article' and 'highlights' fields.
    return [
        {
            # A no-argument split() already breaks on newlines, so joining
            # its pieces with single spaces flattens the article to one line.
            'prompt': " ".join(x['article'].split()) + '\n\nSummarize the above article:',
            # Only newlines are replaced here; other whitespace is kept as-is.
            'completion': x['highlights'].replace("\n", " "),
        }
        for x in raw_datasets
    ]
# Export the selected split as JSON Lines: one prompt/completion object per line.
mode = "train"
data = load_cnn(mode)
if mode == "train":
    # Cap the training export at the first 100000 examples.
    data = data[:100000]
# The context manager guarantees the file is flushed and closed even if
# serialization fails part-way through.
with open(f"{mode}.jsonl", "w", encoding="utf-8") as fo:
    fo.writelines(json.dumps(item) + "\n" for item in data)
Leave a Comment