Untitled
unknown
python
2 years ago
1.6 kB
12
Indexable
import numpy as np
import nltk
import re
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk import ne_chunk
# Download the NLTK resources this script depends on.
# Note: nltk.download() is a no-op if the resource is already present.
nltk.download('punkt')                          # sentence/word tokenizers
nltk.download('wordnet')                        # lemmatizer dictionary (was downloaded twice in the original)
nltk.download('stopwords')                      # English stopword list
nltk.download('averaged_perceptron_tagger')     # POS tagger model
nltk.download('maxent_ne_chunker')              # required by ne_chunk() below
nltk.download('words')                          # word corpus required by the NE chunker
# Load the raw text, lower-case it, then split into sentences and word tokens.
# A context manager guarantees the file handle is closed (the original leaked it).
with open('sentence.txt', 'r', encoding='utf-8', errors='ignore') as f:
    raw_code = f.read().lower()
# Segmentation (sentences) and tokenization (words).
segment = nltk.sent_tokenize(raw_code)
word_tokens = nltk.word_tokenize(raw_code)
# Strip punctuation/symbol characters from each token and drop tokens that
# become empty. The regex keeps word characters (\w) and whitespace (\s).
# (The original loop also created an unused `item = []` each iteration.)
_punct_re = re.compile(r"[^\w\s]")  # compiled once instead of per token
clean_data = [
    stripped
    for stripped in (_punct_re.sub("", words) for words in word_tokens)
    if stripped != ""
]
# Remove English stopwords. Build the stopword set ONCE: the original called
# stopwords.words('english') (a fresh list) for every token, making the
# filter O(tokens * stopwords); a set gives O(1) membership tests.
_english_stopwords = set(stopwords.words('english'))
clean_data_1 = [words for words in clean_data if words not in _english_stopwords]
# Stemming and lemmatization: two ways of reducing words to a base form.
# Porter stemming chops suffixes heuristically; lemmatization maps each
# word to its WordNet dictionary lemma.
stemmer = PorterStemmer()
stemmed_data = [stemmer.stem(word) for word in clean_data_1]
# WordNetLemmatizer is already imported at the top of the file.
lemmer = WordNetLemmatizer()
lemma_data = [lemmer.lemmatize(word) for word in clean_data_1]
# Apply the Snowball ("Porter2") stemmer on top of the lemmatized tokens.
snow_stemmer = SnowballStemmer(language='english')
stem_words = [snow_stemmer.stem(token) for token in lemma_data]
# POS-tag the tokens, then run named-entity chunking over the tagged pairs.
# NOTE(review): tagging stemmed/lower-cased tokens degrades both POS and NE
# accuracy (the tagger and chunker expect natural surface forms with
# capitalization) — consider tagging `word_tokens` instead; kept as-is here.
pos_data = nltk.pos_tag(stem_words)
tagged_tokens = ne_chunk(pos_data)
# The original ended with a bare `tagged_tokens` expression, which only
# displays output in a REPL/notebook; in a script it is a no-op, so print it.
print(tagged_tokens)
Editor is loading...
Leave a Comment