Untitled

mail@pastecode.io avatar
unknown
plain_text
2 years ago
2.1 kB
2
Indexable
Never
import numpy as np
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

def read_email(email):
  """Split an email's text into its individual words.

  Args:
    email: The full email text as a single string.

  Returns:
    A list of the whitespace-separated tokens, in order of appearance.
    An empty or all-whitespace email yields an empty list.
  """
  # split() with no separator splits on any run of whitespace (spaces,
  # tabs, newlines) and never yields empty strings, unlike split(' ').
  return email.split()
  
def stemming(list):
  """Reduce every word to its Porter stem.

  Args:
    list: Iterable of word strings. (The name shadows the builtin; it is
      kept so existing keyword callers continue to work.)

  Returns:
    A new list holding the stem of each input word, in the same order.
  """
  stemmer = PorterStemmer()
  return [stemmer.stem(token) for token in list]

def remove_stop_words(list):
  """Filter English stop words out of a list of (stemmed) words.

  Args:
    list: Iterable of word strings. (The name shadows the builtin; it is
      kept so existing keyword callers continue to work.)

  Returns:
    A new list with the words that are not in NLTK's English stop-word
    list, in their original order.
  """
  # A set gives constant-time membership tests for the filter below.
  english_stops = set(stopwords.words('english'))
  return [token for token in list if token not in english_stops]

def build_vocabulary(list, vocab):
  """Extend a vocabulary with the words it does not contain yet.

  A word's position in the vocabulary list serves as its unique index, so
  new words are appended in order of first appearance and existing
  positions never change.

  Args:
    list: Iterable of word strings to add. (The name shadows the builtin;
      it is kept so existing keyword callers continue to work.)
    vocab: The current vocabulary, a list of unique words. It is modified
      in place.

  Returns:
    The same ``vocab`` list object, after the new words were appended.
  """
  # Mirror the vocabulary in a set so each membership test is O(1) instead
  # of a linear scan over an ever-growing list.
  known = set(vocab)
  for word in list:
    if word not in known:
      known.add(word)
      vocab.append(word)

  return vocab

def get_bow(sentence, vocabulary):
  """Encode a list of words as their vocabulary indices.

  Args:
    sentence: Iterable of word strings (one email's processed words).
    vocabulary: List of words; a word's position in it is its index.

  Returns:
    A list with one vocabulary index per word of ``sentence``, in order.
    Words that do not occur in ``vocabulary`` are skipped. If a word
    appears more than once in ``vocabulary``, its first position is used.
  """
  # Build the word -> index lookup once. setdefault keeps the first
  # position of any duplicated vocabulary entry.
  index_of = {}
  for position, entry in enumerate(vocabulary):
    index_of.setdefault(entry, position)

  # Dictionary lookup compares by equality; the previous identity check
  # (`is`) missed equal words stored in different string objects.
  return [index_of[word] for word in sentence if word in index_of]

def read_data():
  """Read the spam dataset and run the text pre-processing pipeline on it.

  Reads 'spam_or_not_spam.csv' (relative to the current working
  directory), splits every line on commas, and treats the first field of
  each row as the email text. Each email is tokenised, stemmed and
  stripped of stop words; a vocabulary is built over all emails and every
  email is then encoded as a list of vocabulary indices.

  Side effects:
    Prints the list of processed (stemmed, stop-word-free) emails.

  Returns:
    The parsed rows: one list of comma-separated string fields per line
    of the file.
  """
  file = 'spam_or_not_spam.csv'

  with open(file) as f:
    lines = f.readlines()
  # NOTE(review): a plain comma split does not honour CSV quoting, and a
  # header row (if the file has one) is processed like an email -- confirm
  # against the actual file format.
  parsed = [line.strip().split(',') for line in lines]

  vocab = []
  stem_list = []

  for row in parsed:
    words = read_email(row[0])
    stem_words = stemming(words)
    # Remove stop words here; this step previously re-ran stemming() by
    # mistake, so stop words were never filtered out.
    stem_no_stop_words = remove_stop_words(stem_words)
    stem_list.append(stem_no_stop_words)
    vocab = build_vocabulary(stem_no_stop_words, vocab)

  # Encode only after the vocabulary is complete so indices are final.
  # NOTE(review): bow_list is computed but not returned or printed yet.
  bow_list = [get_bow(stems, vocab) for stems in stem_list]

  print(stem_list)
  return parsed

# Script entry point: runs the whole pipeline as soon as this file is
# executed (or imported).
# NOTE(review): consider guarding this call with
# `if __name__ == "__main__":` so importing the module has no side effects.
read_data()