Chatbot

from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from transformers.trainer_utils import get_last_checkpoint
from datasets import load_dataset
import os
import re

# Function to preprocess and tokenize the dataset
def preprocess_data(dataset, tokenizer, max_length=128):
    def tokenize_function(examples):
        # Concatenate question and answer for the model to learn both together
        concatenated = [f"Question: {q} Answer: {a}" for q, a in zip(examples['question'], examples['answer'])]
        result = tokenizer(concatenated, truncation=True, padding='max_length', max_length=max_length)
        result['labels'] = result['input_ids']  # Set labels to input_ids for language modeling
        return result

    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    return tokenized_datasets

# Function to train or fine-tune the model
def train_model():
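    # Continue from the previously saved fine-tuned model if one exists; otherwise start from the base GPT-2 weights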
    model_name = "./output/fine-tuned-gpt2" if os.path.exists("./output/fine-tuned-gpt2") else "gpt2"
    print(f"Training model: {model_name}")

    # Load pre-trained or fine-tuned GPT-2 model and tokenizer
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    # GPT-2 has no dedicated padding token, so reuse the end-of-sequence token for padding
    tokenizer.pad_token = tokenizer.eos_token

    # Load the dataset from JSON (this sample uses the same file for the train and validation splits)
    dataset = load_dataset('json', data_files={'train': 'Data/Sample_validation.json', 'validation': 'Data/Sample_validation.json'})
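    # Each record in the JSON file is expected to provide "question" and "answer" fields,
    # e.g. as JSON Lines: {"question": "...", "answer": "..."}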

    # Preprocess the dataset (tokenize and format it)
    tokenized_datasets = preprocess_data(dataset, tokenizer)

    # Set up Data Collator for Language Modeling (helps with padding)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
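    # With mlm=False the collator builds causal language-modeling batches: labels are
    # copied from input_ids and pad-token positions are excluded from the loss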

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./output",
        learning_rate=3e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        evaluation_strategy="epoch",  # Evaluate at the end of each epoch
        num_train_epochs=10,
        weight_decay=0.01,
        save_total_limit=2,  # Keep only the two most recent checkpoints
        logging_dir='./logs',
        logging_strategy="epoch",  # Log at the end of each epoch
        save_steps=500  # Save a checkpoint every 500 steps
    )

    # Trainer setup
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
        data_collator=data_collator
    )

    # Train the model, resuming from the most recent checkpoint in the output directory if one exists
    last_checkpoint = get_last_checkpoint(training_args.output_dir) if os.path.isdir(training_args.output_dir) else None
    trainer.train(resume_from_checkpoint=last_checkpoint)

    # Save the model and tokenizer
    model.save_pretrained("./output/fine-tuned-gpt2")
    tokenizer.save_pretrained("./output/fine-tuned-gpt2")

# Function to load the saved model
def load_model():
    # Load fine-tuned model and tokenizer
    model = GPT2LMHeadModel.from_pretrained("./output/fine-tuned-gpt2")
    tokenizer = GPT2Tokenizer.from_pretrained("./output/fine-tuned-gpt2")
    return model, tokenizer

# Remove repetitive sentences in responses
def remove_repetitive_sentences(text):
    # Split the text into sentences on '.' or '?', avoiding splits after abbreviations and initials
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)

    seen = set()
    result = []

    for sentence in sentences:
        # Clean up the sentence
        cleaned_sentence = re.sub(r'\s+', ' ', sentence.strip().lower())  # Normalize whitespace and case

        # Add only unique sentences
        if cleaned_sentence and cleaned_sentence not in seen:
            result.append(sentence.strip())
            seen.add(cleaned_sentence)

    return ' '.join(result)

# Function to generate response
def generate_response(prompt, model, tokenizer):
    # Encode the prompt; pass the attention mask explicitly since the pad token is the EOS token
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=100,        # Total length (prompt + generated tokens)
        do_sample=True,        # Sample instead of greedy decoding
        top_k=50,              # Keep only the 50 most likely next tokens
        top_p=0.95,            # Nucleus sampling threshold
        temperature=0.5,       # Lower temperature -> more conservative output
        pad_token_id=tokenizer.pad_token_id
    )

    # Decode the generated text
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Remove repetitive sentences
    response = remove_repetitive_sentences(response)

    return response

# Main loop
if __name__ == "__main__":
    # Check if the model is already fine-tuned and saved
    if os.path.exists("./output/fine-tuned-gpt2"):
        print("Saved model found.")
        choice = input("Do you want to train a new model or load the saved model? (train/load): ").strip().lower()

        if choice == 'train':
            train_model()
            print("Model has been trained and saved.")
            model, tokenizer = load_model()
        elif choice == 'load':
            model, tokenizer = load_model()
            print("Model loaded.")
        else:
            print("Invalid choice. Exiting.")
            exit()
    else:
        print("No saved model found. Training a new model...")
        train_model()
        model, tokenizer = load_model()

    # Interactive loop to input questions
    while True:
        user_input = input("Enter your question (or type 'exit' to quit): ")
        if user_input.lower() == 'exit':
            break
        response = generate_response(user_input, model, tokenizer)
        print(f"Response: {response}")