Untitled

 avatar
unknown
plain_text
2 years ago
522 B
6
Indexable
def preprocess(input):
    """Extract date/time-like tokens and long digit runs from a string.

    The text is split on single space characters. Each resulting token is
    handled in one of two ways:

    * If it contains a digit, one of ``. : / -``, and another digit in a
      row (e.g. ``10:30``, ``2021-05-06``, ``3.14``), the token is kept
      as-is apart from stripping surrounding whitespace.
    * Otherwise every non-digit character is removed and the remaining
      digits are kept only when there are at least three of them.

    Args:
        input: The string to tokenize. (The name shadows the builtin but
            is kept so keyword callers continue to work.)

    Returns:
        A list of the retained tokens, in their original order.
    """
    # Imported locally so the function is self-contained.
    import re

    # Raw strings avoid invalid-escape warnings; compiled once, used per token.
    date_like = re.compile(r'\d[.:/-]\d')
    non_digit = re.compile(r'\D')

    output_list = []
    # Split on a literal space (not arbitrary whitespace): tabs/newlines
    # stay attached to their token and are dealt with below.
    for word in input.split(' '):
        if date_like.search(word):
            output_list.append(word.strip())
            continue

        # Dropping all non-digits also removes symbols and whitespace, so a
        # separate symbol-removal pass is unnecessary.
        digits = non_digit.sub('', word)
        if len(digits) >= 3:
            output_list.append(digits)

    return output_list
Editor is loading...