Untitled
unknown
python
a year ago
1.6 kB
7
Indexable
"""Text-preprocessing pipeline for sentence.txt.

Reads the file, lowercases it, then: sentence/word tokenization,
punctuation stripping, English stopword removal, Porter stemming,
WordNet lemmatization, Snowball stemming, POS tagging, and
named-entity chunking.
"""
import numpy as np
import nltk
import re
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk import ne_chunk

# Download required NLTK data (original downloaded 'wordnet' twice).
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# Load and read text. 'with' guarantees the handle is closed
# (the original leaked an open file object).
with open('sentence.txt', 'r', errors='ignore') as f:
    raw_code = f.read().lower()

# Segmentation and tokenization.
segment = nltk.sent_tokenize(raw_code)
word_tokens = nltk.word_tokenize(raw_code)

# Strip characters that are not word characters or whitespace;
# drop tokens that become empty.
clean_data = []
for words in word_tokens:
    result = re.sub(r"[^\w\s]", "", words)
    if result != "":
        clean_data.append(result)

# Remove English stopwords. Build the stopword set ONCE:
# the original called stopwords.words('english') inside the loop,
# re-loading the corpus for every token (quadratic cost).
english_stopwords = set(stopwords.words('english'))
clean_data_1 = [words for words in clean_data if words not in english_stopwords]

# Stemming and lemmatization (simplify words).
stemmer = PorterStemmer()
stemmed_data = [stemmer.stem(word) for word in clean_data_1]

lemmer = nltk.stem.WordNetLemmatizer()
lemma_data = [lemmer.lemmatize(word) for word in clean_data_1]

# Apply the Snowball stemmer to the lemmatized tokens.
snow_stemmer = SnowballStemmer(language='english')
stem_words = [snow_stemmer.stem(word) for word in lemma_data]

# POS tagging and named-entity chunking.
pos_data = nltk.pos_tag(stem_words)
tagged_tokens = ne_chunk(pos_data)
tagged_tokens
Editor is loading...
Leave a Comment