# Read CACM
# Paste metadata: unknown · python · 5 years ago · 1.3 kB · 9 · Indexable
def read_cacm_docs(root_folder="./drive/MyDrive/datasets/"):
    """
    Read the CACM collection from the file ``cacm.all`` inside *root_folder*.

    The file is a sequence of records. Each record starts with a ``.I <id>``
    line and is followed by sections introduced by a marker line: ``.T`` for
    the title and ``.W`` for the abstract; all other sections are skipped.

    Args:
        root_folder: Folder containing ``cacm.all``. Defaults to
            ``"./drive/MyDrive/datasets/"``.

    Returns:
        A list of 2-tuples ``(doc_id, document)`` in file order. ``document``
        is the title followed by a newline and the abstract, each with its
        lines stripped and joined by single spaces. When a record has no
        abstract, ``document`` is the title only.

    Raises:
        OSError: If ``cacm.all`` cannot be opened.
        ValueError: If a ``.I`` header outside an abstract has no integer id.
    """
    import os  # function-scope import keeps this block self-contained

    # Bare section markers that close an abstract. Text lines that merely
    # begin with "." are still treated as abstract text.
    section_markers = {".T", ".W", ".B", ".A", ".N", ".X", ".K", ".C"}

    def record_id(tokens):
        """Return the id of a ``.I <int>`` header line, else None."""
        if len(tokens) >= 2 and tokens[0] == ".I":
            try:
                return int(tokens[1])
            except ValueError:
                return None
        return None

    def compose(title_parts, abstract_parts):
        """Build the document string from the collected section lines."""
        text = " ".join(title_parts)
        if abstract_parts:
            text += "\n" + " ".join(abstract_parts)
        return text

    path = os.path.join(root_folder, "cacm.all")
    with open(path, "r", encoding="utf-8") as handle:
        lines = handle.readlines()

    docs = []
    doc_id = None         # id of the record being read, None before the first
    title_parts = []
    abstract_parts = []
    section = None        # "title", "abstract" or None (ignored section)

    for raw in lines:
        tokens = raw.split()
        if not tokens:
            # Blank lines carry no text and are never markers.
            continue
        text = raw.strip()

        # Inside a section, consume the line as text unless it closes the
        # section. A title ends at any line beginning with ".".
        if section == "title" and not raw.startswith("."):
            title_parts.append(text)
            continue
        if (
            section == "abstract"
            and text not in section_markers
            and record_id(tokens) is None
        ):
            abstract_parts.append(text)
            continue

        # The line is a marker (or text of a section we do not collect).
        section = None
        marker = tokens[0]
        if marker == ".I":
            new_id = record_id(tokens)
            if new_id is None:
                raise ValueError(f"malformed record header: {text!r}")
            # A new record begins: emit the previous one, even if it had
            # no '.B' section, then start from a clean slate.
            if doc_id is not None:
                docs.append((doc_id, compose(title_parts, abstract_parts)))
            doc_id = new_id
            title_parts = []
            abstract_parts = []
        elif marker == ".T":
            title_parts = []
            section = "title"
        elif marker == ".W":
            section = "abstract"

    # Emit the last record; there is no following '.I' to trigger it.
    if doc_id is not None:
        docs.append((doc_id, compose(title_parts, abstract_parts)))
    return docs
# Load the collection from the default dataset folder.
docs = read_cacm_docs()