### BEGIN SOLUTION
import regex as re
import pandas as pd  # needed for the DataFrame built below (may already be imported elsewhere in the notebook)
### END SOLUTION


# Date expressions to extract:
# in *year*
# (day) *month* *year*
# *year* *month* (day)
# *year* to *year*
# *year* - *year*
def extract_date_expressions(sentences):
    """
    :param sentences: A list of strings, where each string is one sentence
    :return: A pandas DataFrame with the columns
             "Date" (extracted date expressions as a string)
             "Sentence" (sentences from which a date expression was extracted)
    """
    ### BEGIN SOLUTION
    # matches: year(,) month(,) (day)
    pattern1 = r'([0-9]+),*\s*(january|february|march|april|may|june|july|august|september|october|november|december),*\s*([0-9]*)'
    # matches: (day)(,) month(,) year
    pattern2 = r'([0-9]*),*\s*(january|february|march|april|may|june|july|august|september|october|november|december),*\s*([0-9]+)'
    # matches: in year(,) (month)(,) (day)
    pattern3 = r'(\bin\b)\s*([0-9]+),*\s*(january|february|march|april|may|june|july|august|september|october|november|december)*,*\s*([0-9]*)'
    # matches: year to/- year
    pattern4 = r'([0-9]+)\s*(to|-)\s*([0-9]+)'

    expression_dict = {
        'Date': [],
        'Sentence': []
    }

    # iterate over the sentences
    for sentence in sentences:
        sent = sentence.lower()

        # expressions extracted from this sentence
        date_expressions = []

        # match pattern1: year/day month (day/year)
        matches1 = re.findall(pattern1, sent)
        for m in matches1:
            expression = (m[0] + ' ' + m[1].capitalize() + ' ' + m[2]).strip()
            date_expressions.append(expression)

        # match pattern2: (day) month year
        matches2 = re.findall(pattern2, sent)
        for m in matches2:
            expression = (m[0] + ' ' + m[1].capitalize() + ' ' + m[2]).strip()
            # pattern1 and pattern2 overlap; this prevents duplicates
            if expression not in date_expressions:
                date_expressions.append(expression)

        # match pattern3: keep only the year following "in"
        matches3 = re.findall(pattern3, sent)
        for m in matches3:
            date_expressions.append(m[1])

        # match pattern4: keep both years of a range
        matches4 = re.findall(pattern4, sent)
        for m in matches4:
            date_expressions.append(m[0])
            date_expressions.append(m[2])

        # add the expressions found in this sentence to the result dictionary
        expression_dict['Date'].extend(date_expressions)
        expression_dict['Sentence'].extend([sentence] * len(date_expressions))

    return pd.DataFrame(expression_dict)
    ### END SOLUTION


# Apply the function to the tokenized text:
df_dates_geoffrey = extract_date_expressions(sentence_tokenize_text(text_geoffrey))
df_dates_geoffrey  # use this for testing
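

# --- Optional sanity check: a minimal sketch, assuming the sample sentences below,
# which are made up for illustration and are not part of the original notebook data ---
sample_sentences = [
    "The treaty was signed on 4 July 1776 in Philadelphia.",
    "She lived there from 1990 to 1995.",
    "In 2008, the company was restructured.",
]
df_sample = extract_date_expressions(sample_sentences)
# One row per extracted expression, each paired with its source sentence,
# e.g. "4 July 1776", "1990", "1995", "2008".
print(df_sample)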