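# --- Paste metadata copied from the pastecode.io page (not part of the program) ---
# Untitled
# mail@pastecode.io avatar
# unknown
# plain_text
# a year ago
# 2.6 kB
# 2
# Indexable
# Never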
### BEGIN SOLUTION
import regex as re
### END SOLUTION

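# Date-expression formats to extract
# (parts in parentheses are optional, *starred* parts are required):
#   in *year*
#   (day) *month* *year*
#   *year* *month* (day)
#   *year* to *year*
#   *year* - *year*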

def extract_date_expressions(sentences):
  """
  Extract date expressions from a list of sentences.

  Each sentence is lower-cased before matching, so matching is
  case-insensitive; month names are capitalised in the output. The
  following forms are recognised:

    * number(,) month(,) (number)   e.g. "1990 May 5", "1990 May"
    * (number)(,) month(,) number   e.g. "5 May 1990", "May 1990"
    * in number                     e.g. "in 1343" -> "1343"
    * number to number / number - number
                                    -> both numbers as separate entries

  Numbers are not validated as plausible days or years; any run of digits
  is accepted. Month names are only accepted as whole words, so "marching"
  or "dismay" do not count as "March" / "May".

  :param sentences: A list of strings, where each string is one sentence
  :return: A pandas DataFrame with the columns
                "Date" (extracted date expressions as a string)
                "Sentence" (sentences from which a date expression was extracted)
           with one row per extracted expression; a sentence that yields
           several expressions is repeated once per expression.
  """
  ### BEGIN SOLUTION

  month_names = ('january|february|march|april|may|june|july|august|'
                 'september|october|november|december')
  # The look-arounds reject month names embedded in a longer word
  # ("marching", "dismay") while still allowing adjacent digits ("5may").
  month = r'(?<![a-z])(' + month_names + r')(?![a-z])'

  # Compile once, outside the per-sentence loop.
  # matches: year(,) month(,) (day)
  number_month = re.compile(r'([0-9]+),*\s*' + month + r',*\s*([0-9]*)')
  # matches: (day)(,) month(,) year
  month_number = re.compile(r'([0-9]*),*\s*' + month + r',*\s*([0-9]+)')
  # matches: in year -- only the number directly after "in" is extracted
  in_year = re.compile(r'\bin\b\s*([0-9]+)')
  # matches: year to/- year
  year_range = re.compile(r'([0-9]+)\s*(?:to|-)\s*([0-9]+)')

  def format_date(first, month_name, last):
    # Join only the parts that are present so that a missing day/year
    # leaves no leading or trailing space in the result.
    parts = (first, month_name.capitalize(), last)
    return ' '.join(part for part in parts if part)

  dates = []
  origins = []

  for sentence in sentences:
    sent = sentence.lower()

    # "number month (number)" expressions
    found = [format_date(*m) for m in number_month.findall(sent)]

    # "(number) month number" expressions. A full "number month number"
    # is matched by both patterns, so skip what the first pass produced.
    from_first_pass = set(found)
    for m in month_number.findall(sent):
      expression = format_date(*m)
      if expression not in from_first_pass:
        found.append(expression)

    # "in <number>"
    # NOTE(review): a number already reported above (e.g. "in 1990 May 5"
    # or "in 1990 to 1995") is reported again here / below as a bare
    # number; kept as in the original solution -- confirm this is wanted.
    found.extend(in_year.findall(sent))

    # "<number> to <number>" / "<number> - <number>": both ends, separately
    for start, end in year_range.findall(sent):
      found.append(start)
      found.append(end)

    # one row per expression, paired with the original (non-lowercased) sentence
    dates.extend(found)
    origins.extend([sentence] * len(found))

  return pd.DataFrame({'Date': dates, 'Sentence': origins})


  ### END SOLUTION

# Apply the function to the tokenized text.
# NOTE(review): `sentence_tokenize_text` and `text_geoffrey` are not defined in
# this snippet -- presumably earlier notebook cells provide them; confirm.
df_dates_geoffrey =  extract_date_expressions(sentence_tokenize_text(text_geoffrey))
df_dates_geoffrey # bare expression left as the last statement; use this for testing