Untitled
unknown
plain_text
2 years ago
2.6 kB
9
Indexable
### BEGIN SOLUTION
import regex as re
### END SOLUTION
# in *year*
# (day) *month* *year*
# *year* *month* (day)
# *year* to *year*
# *year* - *year*
def extract_date_expressions(sentences):
    """
    Extract date expressions from a list of sentences.

    Every sentence is lower-cased before matching, so matching is
    case-insensitive. Month names are capitalized in the output; numbers
    are copied as found (any run of digits is accepted as a year or day).
    Recognized forms and what is recorded for each:

    * ``year(,) month(,) (day)``   -> "1990 March 5" or "1990 March"
    * ``(day)(,) month(,) year``   -> "5 March 1990" or "March 1990"
    * ``in year``                  -> "1990"
    * ``year to year`` / ``year - year`` -> one entry per year

    :param sentences: A list of strings, where each string is one sentence
    :return: A pandas DataFrame with the columns
        "Date" (extracted date expressions as a string)
        "Sentence" (sentences from which a date expression was extracted)
    """
    ### BEGIN SOLUTION
    # A month name must stand on its own: the look-arounds reject names
    # embedded in longer words ("marchers", "dismay") while still allowing
    # digits or punctuation to touch the name.
    month = (
        r'(?<![a-z])'
        r'(january|february|march|april|may|june|july|august|september'
        r'|october|november|december)'
        r'(?![a-z])'
    )
    # matches: year(,) month(,) (day)
    year_month_day = re.compile(r'([0-9]+),*\s*' + month + r',*\s*([0-9]*)')
    # matches: (day)(,) month(,) year
    day_month_year = re.compile(r'([0-9]*),*\s*' + month + r',*\s*([0-9]+)')
    # matches: in year -- only the year is recorded; a following month/day
    # is already covered by the year-month-day pattern above
    in_year = re.compile(r'\bin\b\s*([0-9]+)')
    # matches: year to/- year
    year_range = re.compile(r'([0-9]+)\s*(?:to|-)\s*([0-9]+)')

    def render(first, month_name, last):
        # Join only the parts that are present so an optional (empty)
        # day/year leaves no leading or trailing whitespace behind.
        parts = (first, month_name.capitalize(), last)
        return ' '.join(part for part in parts if part)

    expression_dict = {
        'Date': [],
        'Sentence': []
    }

    for sentence in sentences:
        sent = sentence.lower()
        date_expressions = []

        for groups in year_month_day.findall(sent):
            date_expressions.append(render(*groups))

        for groups in day_month_year.findall(sent):
            expression = render(*groups)
            # Both month patterns match a full "number month number" span;
            # compare the normalized text so it is recorded only once.
            if expression not in date_expressions:
                date_expressions.append(expression)

        date_expressions.extend(in_year.findall(sent))

        for start_year, end_year in year_range.findall(sent):
            date_expressions.append(start_year)
            date_expressions.append(end_year)

        # one output row per extracted expression, paired with the
        # original (not lower-cased) sentence
        expression_dict['Date'].extend(date_expressions)
        expression_dict['Sentence'].extend([sentence] * len(date_expressions))

    return pd.DataFrame(expression_dict)
### END SOLUTION
# Apply the function to the tokenized text:
# (sentence_tokenize_text and text_geoffrey are not defined in this snippet;
# presumably they come from earlier cells -- confirm before running standalone)
df_dates_geoffrey = extract_date_expressions(sentence_tokenize_text(text_geoffrey))
df_dates_geoffrey # use this for testing