Untitled
unknown
plain_text
3 years ago
2.1 kB
5
Indexable
"""Turn a CSV of emails into stemmed, stop-word-free bag-of-words index lists."""

from functools import lru_cache

import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


def read_email(email):
    """Split the text of one email into a list of words.

    Splitting is done on any run of whitespace, so repeated spaces, tabs and
    newlines never produce empty-string tokens.

    Args:
        email: The email text as a single string.

    Returns:
        The list of whitespace-separated words (empty for blank input).
    """
    return email.split()


def stemming(list):
    """Replace every word by its Porter stem.

    Args:
        list: Iterable of words. The parameter name shadows the builtin and is
            kept only so existing keyword callers continue to work.

    Returns:
        A new list with the stem of each input word, in the same order.
    """
    ps = PorterStemmer()
    return [ps.stem(word) for word in list]


@lru_cache(maxsize=1)
def _english_stop_words():
    """Load NLTK's English stop-word list once and cache it as a frozenset."""
    return frozenset(stopwords.words('english'))


def remove_stop_words(list):
    """Drop English stop words from a list of (stemmed) words.

    Requires the NLTK ``stopwords`` corpus to be available locally.

    Args:
        list: Iterable of words. The parameter name shadows the builtin and is
            kept only so existing keyword callers continue to work.

    Returns:
        A new list containing the words that are not stop words, in order.
    """
    stop_words = _english_stop_words()
    return [w for w in list if w not in stop_words]


def build_vocabulary(list, vocab):
    """Append every not-yet-known word to the vocabulary.

    A word's position in ``vocab`` is its unique index.

    Args:
        list: Iterable of words to add. The parameter name shadows the builtin
            and is kept only so existing keyword callers continue to work.
        vocab: Existing vocabulary list; it is extended in place.

    Returns:
        The same ``vocab`` list object, after the new words were appended.
    """
    # Set shadow of the vocabulary: O(1) membership tests instead of scanning
    # the list once per word.
    known = set(vocab)
    for word in list:
        if word not in known:
            known.add(word)
            vocab.append(word)
    return vocab


def get_bow(sentence, vocabulary):
    """Map each word of a sentence to its index in the vocabulary.

    Args:
        sentence: Iterable of words.
        vocabulary: List of words; a word's position is its index.

    Returns:
        A list with one vocabulary index per word of ``sentence``, in order.
        Words that do not occur in ``vocabulary`` are skipped.
    """
    # Words are compared by value (the original `is` compared object identity,
    # which misses equal strings stored in different objects).
    # setdefault keeps the first position if the vocabulary has duplicates.
    index_of = {}
    for position, word in enumerate(vocabulary):
        index_of.setdefault(word, position)
    return [index_of[word] for word in sentence if word in index_of]


def read_data():
    """Read the dataset and run every email through the bag-of-words pipeline.

    Each line of ``spam_or_not_spam.csv`` is split on commas; the first field
    is treated as the email text. Every email is tokenized, stemmed and
    stripped of stop words, the vocabulary is built over all emails, and each
    email is converted to its list of vocabulary indices.

    Prints the list of per-email stem lists.

    Returns:
        The parsed rows: one list of comma-separated fields per file line.
    """
    file = 'spam_or_not_spam.csv'
    with open(file, encoding='utf-8') as f:
        parsed = [line.strip().split(',') for line in f]

    # NOTE(review): every line is processed as an email, including a header
    # row if the file has one -- confirm against the dataset.
    vocab = []
    stem_list = []
    for row in parsed:
        words = read_email(row[0])
        stem_words = stemming(words)
        # NOTE(review): stop words are removed after stemming, as the original
        # comments describe; some stemmed stop words may no longer match the
        # unstemmed stop-word list.
        stem_no_stop_words = remove_stop_words(stem_words)
        stem_list.append(stem_no_stop_words)
        vocab = build_vocabulary(stem_no_stop_words, vocab)

    # NOTE(review): the bag-of-words lists are computed but not returned; the
    # return value is kept as `parsed` so existing callers are unaffected.
    bow_list = [get_bow(stems, vocab) for stems in stem_list]

    print(stem_list)
    return parsed


if __name__ == "__main__":
    read_data()
Editor is loading...