Token healing

fix your error prone text[spelling, grammar, missing, extra words]
 avatar
unknown
python
2 years ago
2.8 kB
4
Indexable
import nltk
import re
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker
from gingerit.gingerit import GingerIt
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Step 1: Tokenization
def tokenize_text(text):
    return word_tokenize(text)

# Step 2: Spell Checking
def correct_spelling(tokens):
    spell = SpellChecker()
    corrected_tokens = [spell.correction(token) for token in tokens]
    return corrected_tokens

# Step 3: Grammar Correction
def correct_grammar(text):
    parser = GingerIt()
    result = parser.parse(text)
    corrected_text = result['result']
    return corrected_text

# Step 4: Missing or Extra Words
def correct_missing_or_extra_words(text):
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")

    input_ids = tokenizer.encode(text, return_tensors="pt")
    outputs = model.generate(input_ids, max_length=100, num_return_sequences=1, repetition_penalty=1.0, do_sample=True)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract the number of sentences in the original text
    num_sentences = len(re.findall(r'[.!?]', text))
    
    # Extract the same number of sentences from the generated text
    extracted_sentences = re.findall(r'(.*?[.!?])', generated_text)
    
    # # Handle missing periods at the end of sentences
    # extracted_sentences = [sentence.rstrip('.!?') for sentence in extracted_sentences]
    
    # Take the first 'num_sentences' sentences
    extracted_sentences = extracted_sentences[:num_sentences]
    
    # Join the extracted sentences to form the corrected text
    # Join the extracted sentences to form the corrected text
    corrected_text = ' '.join(sentence.strip() for sentence in extracted_sentences[:num_sentences])

    # # Remove extra spaces between sentences
    # corrected_text = re.sub(r'\s([.!?])', r'\1', corrected_text)
    
    return corrected_text


while True:
    input_text = input("Enter some text (or 'exit' to quit): ")

    if input_text.lower() == 'exit':
        break

    if not input_text.endswith((".", "?", "!")):
        input_text += "."

    # Step 1: Tokenization
    tokens = tokenize_text(input_text)

    # Step 2: Spell Checking
    corrected_tokens = correct_spelling(tokens)

    # Step 3: Grammar Correction
    corrected_text = ' '.join(corrected_tokens)
    corrected_text = correct_grammar(corrected_text)

    # Step 4: Missing or Extra Words
    corrected_text = correct_missing_or_extra_words(corrected_text)

    print("Original text:", input_text)
    print("Corrected text:", corrected_text)
    print()

print("Exiting the program.")
Editor is loading...