Untitled

 avatar
user_5387634
plain_text
9 months ago
653 B
3
Indexable
import datasets
from datasets import load_dataset
import json
import os

def load_cnn(mode="train"):
    """Build prompt/completion records from one split of CNN/DailyMail 3.0.0.

    Args:
        mode: Key used to select a split from the loaded dataset.

    Returns:
        A list of dicts, one per example, each with two keys:
        'prompt' is the article with every run of whitespace collapsed to
        a single space (leading/trailing whitespace removed), followed by
        the summarization instruction; 'completion' is the highlights text
        with newlines replaced by spaces.
    """
    split = load_dataset('cnn_dailymail', name='3.0.0')[mode]
    instruction = '\n\nSummarize the above article:'
    # Each example is read through its 'article' and 'highlights' fields.
    # str.split() with no arguments already splits on newlines, so joining
    # its pieces with a space flattens the article onto a single line.
    return [
        {
            'prompt': " ".join(example['article'].split()) + instruction,
            'completion': example['highlights'].replace("\n", " "),
        }
        for example in split
    ]
# Script section: export one split as JSON Lines to "<mode>.jsonl".
mode = "train"
data = load_cnn(mode)
if mode == "train":
    # Cap the training export at the first 100000 records.
    data = data[:100000]
# The context manager guarantees the file is flushed and closed even if
# serialization fails part-way through; json.dumps emits ASCII-only text by
# default, so pinning the encoding does not alter the bytes written.
with open(f"{mode}.jsonl", "w", encoding="utf-8") as fo:
    for record in data:
        # One JSON object per line.
        fo.write(json.dumps(record) + "\n")
Editor is loading...
Leave a Comment