# Paste-site metadata (not part of the program):
# Untitled
# unknown
# python
# 2 years ago
# 884 B
# 6
# Indexable
import pandas as pd
import pymorphy2
# Load the dataset: try the current working directory first, then fall back
# to the absolute /datasets location.
try:
    data = pd.read_csv('toxic_comments.csv')
except FileNotFoundError:
    # Only a missing local file triggers the fallback. Any other failure
    # (parse error, interrupt, permission problem) is reported as-is instead
    # of being masked by a second read attempt.
    data = pd.read_csv('/datasets/toxic_comments.csv')
# Bare expression: in a notebook cell this displays the frame; it has no
# effect when the file is run as a plain script.
data
# Create the pymorphy2 MorphAnalyzer object once at module level;
# lemmatize_text below reads it through the global name `morph`.
morph = pymorphy2.MorphAnalyzer()
def lemmatize_text(text):
    """Return *text* with each whitespace-separated token replaced by its lemma.

    Every token is passed to the module-level ``morph.parse``; the
    ``normal_form`` of the first parse result is taken, and the lemmas are
    joined back together with single spaces.

    Args:
        text: The string to lemmatize.

    Returns:
        The lemmatized string. Non-string input (for example the float NaN
        that pandas uses for missing cells) yields an empty string instead
        of raising ``AttributeError`` on ``.split()``.
    """
    if not isinstance(text, str):
        return ''
    return ' '.join(morph.parse(word)[0].normal_form for word in text.split())
# Upper bound on the number of rows to lemmatize. The positional slice below
# tolerates a bound larger than the dataset and then simply takes every row.
kol1 = 200000
# Apply the function to the 'text' column and store the result in a new frame.
df = pd.DataFrame()
# %%time  -- IPython cell magic: only valid as the first line of a notebook
# cell and a syntax error in a plain .py file, so it is kept here as a comment.
df['лемматизированные_твиты'] = data['text'].iloc[:kol1].apply(lemmatize_text)
# Recorded output of the timed notebook run:
# CPU times: total: 1min 38s
# Wall time: 1min 38s
# Show the shape of the result
df.shape
# Recorded output:
# (159292, 1)
# Leave a Comment  (paste-site footer, not part of the program)