Read CACM
unknown
python
4 years ago
1.3 kB
6
Indexable
def _cacm_field_marker(line):
    """Return the field marker that *line* consists of, or ``None`` for text.

    A marker is a dot followed by one capital letter (``.T``, ``.W``,
    ``.B`` ...) standing alone on its line.  ``.I`` is the exception: it is
    followed by the numeric document id.  Anything else, including blank
    lines and text that merely starts with a dot, is treated as content.
    """
    tokens = line.split()
    if not tokens:
        return None
    head = tokens[0]
    if len(head) != 2 or head[0] != "." or not "A" <= head[1] <= "Z":
        return None
    if head == ".I":
        return head if len(tokens) >= 2 and tokens[1].isdigit() else None
    return head if len(tokens) == 1 else None


def _parse_cacm_lines(lines):
    """Turn an iterable of CACM lines into a list of ``(doc_id, document)``."""
    records = []   # one (doc_id, title_parts, abstract_parts) per ".I" line
    target = None  # list that text lines are currently collected into

    for raw in lines:
        marker = _cacm_field_marker(raw)
        if marker == ".I":
            records.append((int(raw.split()[1]), [], []))
            target = None
        elif marker is not None:
            # Only the title (.T) and abstract (.W) fields are kept; every
            # other field switches collection off until the next .T / .W.
            if records and marker == ".T":
                target = records[-1][1]
            elif records and marker == ".W":
                target = records[-1][2]
            else:
                target = None
        elif target is not None:
            text = raw.strip()
            if text:  # blank lines inside a field carry no content
                target.append(text)

    docs = []
    for doc_id, title_parts, abstract_parts in records:
        sections = (" ".join(title_parts), " ".join(abstract_parts))
        # Drop the empty section so a record without an abstract is the
        # title alone, with no trailing newline.
        docs.append((doc_id, "\n".join(part for part in sections if part)))
    return docs


def read_cacm_docs(root_folder="./drive/MyDrive/datasets/"):
    r"""Read the CACM documents from ``<root_folder>/cacm.all``.

    Args:
        root_folder: Directory that contains ``cacm.all``.  A trailing path
            separator is optional.

    Returns:
        A list of 2-tuples ``(doc_id, document)`` in file order.  ``doc_id``
        is the integer from the record's ``.I`` line.  ``document`` is a
        single string: the title lines joined by spaces, followed by "\n"
        and the abstract lines joined by spaces.  When the record has no
        abstract, the document is the title only.

    Raises:
        OSError: If ``cacm.all`` cannot be opened (for example
            ``FileNotFoundError`` when the folder is wrong).
    """
    import os  # local import: the snippet has no module-level import block

    path = os.path.join(root_folder, "cacm.all")
    with open(path, "r") as handle:
        return _parse_cacm_lines(handle)


# The default path points into a mounted Drive folder, so only load the
# collection when this code is executed directly (script or notebook cell),
# not whenever the module is imported.
if __name__ == "__main__":
    docs = read_cacm_docs()
Editor is loading...