# Paste-site header residue (not part of the program), kept as comments:
# Untitled
# unknown
# plain_text
# 4 years ago
# 2.1 kB
# 9
# Indexable
import numpy as np
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
def read_email(email):
    """Split an email's text into a list of words.

    Args:
        email: The email text as a single string.

    Returns:
        The whitespace-separated tokens of ``email``, in their original
        order. Empty or whitespace-only input yields an empty list.
    """
    # split() with no separator breaks on any run of whitespace (spaces,
    # tabs, newlines) and never produces empty strings, whereas split(' ')
    # returned '' for every repeated space and kept newlines inside tokens.
    return email.split()
def stemming(list):
    """Replace each word with its stem using NLTK's ``PorterStemmer``.

    Args:
        list: Iterable of words. The parameter name shadows the builtin
            ``list`` inside this function; it is kept so existing callers
            are unaffected.

    Returns:
        A new list containing the stem of every input word, in input order.
    """
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in list]
def remove_stop_words(list):
    """Drop every word that appears in NLTK's English stop-word list.

    Args:
        list: Iterable of (stemmed) words. The parameter name shadows the
            builtin ``list`` inside this function; it is kept so existing
            callers are unaffected.

    Returns:
        A new list with the words that are not stop words, in input order.
    """
    # Materialise the stop words as a set so each membership test is O(1).
    english_stop_words = set(stopwords.words('english'))
    return [token for token in list if token not in english_stop_words]
def build_vocabulary(list, vocab):
    """Append previously unseen words to the vocabulary.

    A word's position in ``vocab`` serves as its unique index, so words are
    only ever appended and existing positions never change.

    Args:
        list: Iterable of hashable words (strings in this file). The
            parameter name shadows the builtin ``list`` inside this
            function; it is kept so existing callers are unaffected.
        vocab: The current vocabulary as a list. It is extended in place.

    Returns:
        The same ``vocab`` list object, after new words have been appended
        in first-seen order.
    """
    # Mirror the vocabulary in a set: membership tests become O(1) instead
    # of scanning the whole list for every word.
    known = set(vocab)
    for word in list:
        if word not in known:
            known.add(word)
            vocab.append(word)
    return vocab
def get_bow(sentence, vocabulary):
    """Map each word of a sentence to its index in the vocabulary.

    Args:
        sentence: Iterable of hashable words (strings in this file).
        vocabulary: Sequence of known words; a word's position is its index.

    Returns:
        A list with one vocabulary index per word of ``sentence`` that is
        present in ``vocabulary``, in sentence order. Words missing from the
        vocabulary are skipped. If a word occurs more than once in
        ``vocabulary``, its lowest index is used.
    """
    # Build the word -> index table once. setdefault keeps the first (lowest)
    # position, matching a left-to-right scan that stops at the first match.
    index_of = {}
    for position, entry in enumerate(vocabulary):
        index_of.setdefault(entry, position)

    # Dict lookup compares by equality. The previous `is` test compared
    # object identity, so equal strings held in different objects were
    # silently dropped from the result.
    return [index_of[word] for word in sentence if word in index_of]
def read_data():
    """Read the dataset and run every email through the text pipeline.

    Reads ``spam_or_not_spam.csv`` (relative path), splits each line on
    commas, and treats the first field as the email text. Each email is
    tokenised, stemmed, and stripped of stop words; the vocabulary is grown
    from the resulting words, and every email is then converted to its list
    of vocabulary indices.

    Side effects:
        Prints the list of processed word lists (one per line of the file).

    Returns:
        The parsed rows: one list of comma-separated string fields per line
        of the file.
    """
    file = 'spam_or_not_spam.csv'
    with open(file) as f:
        lines = f.readlines()

    # NOTE(review): a plain comma split does not honour CSV quoting, and a
    # header row (if the file has one) is processed like an email - confirm
    # against the actual file format.
    parsed = [l.strip().split(',') for l in lines]

    vocab = []
    stem_list = []
    for row in parsed:
        words = read_email(row[0])
        stem_words = stemming(words)
        # Stop words are removed here; this step previously called
        # stemming() a second time, so stop words were never filtered out.
        stem_no_stop_words = remove_stop_words(stem_words)
        stem_list.append(stem_no_stop_words)
        vocab = build_vocabulary(stem_no_stop_words, vocab)

    # The vocabulary must be complete before any email is encoded.
    # NOTE(review): bow_list is computed but not yet returned or paired with
    # the labels in `parsed`.
    bow_list = [get_bow(stems, vocab) for stems in stem_list]

    print(stem_list)
    return parsed
# Run the pipeline at module level. The stray "Editor is loading..." text
# fused onto this line was paste-site residue and a syntax error.
read_data()