Read CACM

mail@pastecode.io avatar
unknown
python
3 years ago
1.3 kB
4
Indexable
Never
def _cacm_document(title_parts, abstract_parts):
    """Join collected title/abstract lines into one document string."""
    document = " ".join(title_parts)
    if abstract_parts:
        # Title and abstract are separated by a single newline; records
        # without an abstract consist of the title only.
        document += "\n" + " ".join(abstract_parts)
    return document


def read_cacm_docs(root_folder = "./drive/MyDrive/datasets/"):
    """
    Read the CACM documents from ``<root_folder>/cacm.all``.

    The file is a sequence of records. Each record starts with a line
    ``.I <id>`` and is followed by sections introduced by a marker on its
    own line (``.T`` title, ``.W`` abstract, ``.B``, ``.A``, ``.N``, ``.X``,
    ...). Only the title and abstract sections are kept.

    Args:
        root_folder: Folder containing the ``cacm.all`` file.

    Returns:
        A list of 2-tuples ``(doc_id, document)`` in file order, where
        ``doc_id`` is an int and ``document`` is a single string made of the
        title and the abstract separated by ``"\\n"``. Multi-line titles and
        abstracts are joined with single spaces. If a record has no abstract,
        the document is the title only.

    Raises:
        OSError: If the file cannot be opened.
    """
    import os

    # Markers that appear alone on a line and start a new section.
    section_markers = {'.T', '.W', '.B', '.A', '.N', '.X', '.K', '.C'}

    docs = []
    doc_id = None
    section = None
    title_parts = []
    abstract_parts = []

    with open(os.path.join(root_folder, 'cacm.all'), 'r') as handle:
        # Single pass: track which section we are in instead of looking
        # ahead by index, so blank lines and end-of-file cannot raise.
        for raw_line in handle:
            text = raw_line.strip()
            if not text:
                continue

            tokens = text.split()
            if tokens[0] == '.I' and len(tokens) == 2 and tokens[1].isdigit():
                # A new record begins: emit the previous one (if any) and
                # reset all per-record state so nothing leaks across records.
                if doc_id is not None:
                    docs.append((doc_id, _cacm_document(title_parts, abstract_parts)))
                doc_id = int(tokens[1])
                section = None
                title_parts = []
                abstract_parts = []
                continue

            if text in section_markers:
                section = text
                continue

            if section == '.T':
                title_parts.append(text)
            elif section == '.W':
                abstract_parts.append(text)
            # Text belonging to any other section is ignored.

    # The last record is not followed by another '.I', so emit it here.
    if doc_id is not None:
        docs.append((doc_id, _cacm_document(title_parts, abstract_parts)))

    return docs
           
# Load the CACM collection at module level, using the default dataset folder
# of read_cacm_docs; `docs` is a list of (doc_id, document) tuples.
docs = read_cacm_docs()