Token Healing Script
Heal your incorrect text input.
unknown
python
3 years ago
2.4 kB
18
Indexable
import nltk
import re
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker
from gingerit.gingerit import GingerIt
from transformers import GPT2LMHeadModel, GPT2Tokenizer
# Step 1: Tokenization
def tokenize_text(text):
    """Return the token list NLTK's ``word_tokenize`` produces for ``text``."""
    tokens = word_tokenize(text)
    return tokens
# Step 2: Spell Checking
def correct_spelling(tokens):
    """Return ``tokens`` with each entry replaced by its spelling correction.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A list with one entry per input token. A token is kept unchanged
        when the spell checker offers no correction for it.
    """
    tokens = list(tokens)
    if not tokens:
        # Nothing to correct, so skip building the spell checker.
        return []

    spell = SpellChecker()
    # ``correction`` may return None when it finds no candidate (recent
    # pyspellchecker releases); a None entry would break a later
    # ``' '.join(...)``, so fall back to the original token.
    return [spell.correction(token) or token for token in tokens]
# Step 3: Grammar Correction
def correct_grammar(text):
    """Return GingerIt's corrected version of ``text``.

    The value is the ``'result'`` entry of the dictionary returned by
    ``GingerIt().parse(text)``.
    """
    return GingerIt().parse(text)['result']
# Step 4: Missing or Extra Words
def correct_missing_or_extra_words(text):
    """Run ``text`` through GPT-2 and keep as many sentences as it contained.

    ``text`` is used as the prompt for GPT-2 sampling. The decoded output is
    split into sentences (spans ending in ``.``, ``!`` or ``?``) and the
    first ``n`` of them are returned, where ``n`` is the number of sentence
    terminators found in ``text``.

    Args:
        text: The text to process.

    Returns:
        The kept sentences, each stripped of surrounding whitespace and
        joined with single spaces. Empty or whitespace-only input is
        returned unchanged. Because sampling is enabled, the output is not
        deterministic.
    """
    if not text.strip():
        # An empty prompt gives the model no tokens to condition on.
        return text

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    input_ids = tokenizer.encode(text, return_tensors="pt")

    # ``max_length`` counts the prompt tokens as well, so a fixed 100 fails
    # for prompts at or beyond that length. Always leave room for at least
    # one newly generated token.
    prompt_length = input_ids.shape[-1]
    outputs = model.generate(
        input_ids,
        max_length=max(100, prompt_length + 1),
        num_return_sequences=1,
        repetition_penalty=1.0,
        do_sample=True,
        # GPT-2 defines no pad token; naming EOS explicitly avoids the
        # per-call warning about the implicit fallback.
        pad_token_id=tokenizer.eos_token_id,
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Number of sentences in the original text.
    num_sentences = len(re.findall(r'[.!?]', text))

    # DOTALL lets a sentence span newlines instead of silently dropping the
    # part before the line break.
    sentences = re.findall(r'.*?[.!?]', generated_text, flags=re.DOTALL)

    # Each match after the first starts with the separating whitespace;
    # strip it so the join below does not produce double spaces.
    kept = [sentence.strip() for sentence in sentences[:num_sentences]]
    return ' '.join(kept)
# Example usage
def main():
    """Prompt for text, run the four correction steps and print the result."""
    # Local import: only this entry point needs the detokenizer.
    from nltk.tokenize.treebank import TreebankWordDetokenizer

    # Strip first so trailing whitespace cannot hide an existing terminator
    # (otherwise "hello. " would become "hello. .").
    input_text = input("Enter some text: ").strip()
    if not input_text.endswith((".", "?", "!")):
        input_text += "."

    # Step 1: Tokenization
    tokens = tokenize_text(input_text)

    # Step 2: Spell Checking
    corrected_tokens = correct_spelling(tokens)
    # Keep the original token wherever no correction came back, so the
    # re-assembly below never sees a None entry.
    corrected_tokens = [
        fixed or original for original, fixed in zip(tokens, corrected_tokens)
    ]

    # Step 3: Grammar Correction
    # Detokenize instead of joining with spaces, so punctuation and
    # contractions are re-attached ("Hello , world ." -> "Hello, world.").
    corrected_text = TreebankWordDetokenizer().detokenize(corrected_tokens)
    corrected_text = correct_grammar(corrected_text)

    # Step 4: Missing or Extra Words
    corrected_text = correct_missing_or_extra_words(corrected_text)

    print("Original text:", input_text)
    print("Corrected text:", corrected_text)


# Run the interactive example only when executed as a script, so importing
# this module does not prompt for input or load models.
if __name__ == "__main__":
    main()
Editor is loading...