# Untitled
#
# mail@pastecode.io avatar
# unknown
# python
# a month ago
# 1.6 kB
# 3
# Indexable
# Never
import numpy as np
import nltk
import re

from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer
from nltk import ne_chunk
# Download every NLTK resource the pipeline below relies on.
# 'maxent_ne_chunker' and 'words' are required by ne_chunk(); if they are
# missing, the named-entity step raises a LookupError.
# NOTE(review): newer NLTK releases may expect differently named resources
# (e.g. 'punkt_tab') - confirm against the installed version.
for resource in (
    'punkt',                       # sentence / word tokenizer models
    'wordnet',                     # data for WordNetLemmatizer
    'stopwords',                   # stop-word lists
    'averaged_perceptron_tagger',  # model behind nltk.pos_tag
    'maxent_ne_chunker',           # model behind ne_chunk
    'words',                       # word list used by the NE chunker
):
    nltk.download(resource)
# Load the input text and lowercase it.
# The context manager guarantees the file handle is closed once the content
# has been read; errors='ignore' skips bytes that cannot be decoded.
with open('sentence.txt', 'r', errors='ignore') as f:
    raw_code = f.read().lower()

# Tokenize the lowercased text: once into words, once into sentences
word_tokens = word_tokenize(raw_code)
segment = sent_tokenize(raw_code)


# Strip punctuation from every token and drop tokens that end up empty.
# The pattern matches any character that is neither a word character
# (letter, digit, underscore) nor whitespace.
non_word_chars = re.compile(r"[^\w\s]")
clean_data = [
    stripped
    for stripped in (non_word_chars.sub("", token) for token in word_tokens)
    if stripped
]

# Remove English stop words.
# The stop-word list is fetched once and stored in a set: calling
# stopwords.words('english') inside the loop would fetch the whole list again
# for every token and scan it linearly.
english_stopwords = set(stopwords.words('english'))
clean_data_1 = [token for token in clean_data if token not in english_stopwords]

# Stemming and lemmatization (reduce each word to a simpler form)
# Porter stems of the filtered tokens
stemmer = PorterStemmer()
stemmed_data = []
for word in clean_data_1:
    stemmed_data.append(stemmer.stem(word))

# WordNet lemmas of the same filtered tokens
lemmer = WordNetLemmatizer()
lemma_data = [lemmer.lemmatize(token) for token in clean_data_1]


# Run the English Snowball stemmer over the lemmatized words
snow_stemmer = SnowballStemmer(language='english')
stem_words = [snow_stemmer.stem(lemma) for lemma in lemma_data]
    
# Part-of-speech tagging, then named-entity chunking of the tagged words
pos_data = nltk.pos_tag(stem_words)
tagged_tokens = ne_chunk(pos_data, binary=False)
# Bare expression: shown as the cell result in an interactive session,
# has no visible effect when the file is run as a plain script.
tagged_tokens






# Leave a Comment