# Chatbot — GPT-2 fine-tuning and interactive Q&A script.
# (Pasted from a code-sharing site; listing metadata removed.)
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
from collections import defaultdict
import os
import re
# Function to preprocess and tokenize the dataset
def preprocess_data(dataset, tokenizer, max_length=128):
    """Tokenize a question/answer dataset for causal-LM fine-tuning.

    Each example's ``question`` and ``answer`` fields are concatenated into a
    single prompt string so the model learns to produce the answer after the
    question. Sequences are truncated/padded to ``max_length``.

    Args:
        dataset: A dataset (or DatasetDict) exposing ``.map``; examples must
            have ``question`` and ``answer`` fields.
        tokenizer: A callable tokenizer returning a dict with ``input_ids``.
        max_length: Fixed sequence length for truncation and padding.

    Returns:
        The mapped dataset with ``input_ids``, attention data from the
        tokenizer, and ``labels`` set for language modeling.
    """
    def tokenize_function(examples):
        # Concatenate question and answer for the model to learn both together
        concatenated = [f"Question: {q} Answer: {a}" for q, a in zip(examples['question'], examples['answer'])]
        result = tokenizer(concatenated, truncation=True, padding='max_length', max_length=max_length)
        # Copy rather than alias input_ids: downstream collators may modify
        # labels in place (e.g. masking pad positions with -100), and sharing
        # the same list object would silently corrupt input_ids too.
        result['labels'] = result['input_ids'].copy()
        return result
    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    return tokenized_datasets
# Function to train or fine-tune the model
def train_model():
    """Fine-tune GPT-2 on the question/answer dataset and save the result.

    If a previously fine-tuned model exists under ./output/fine-tuned-gpt2 it
    is used as the starting point; otherwise the base ``gpt2`` checkpoint is
    downloaded. The fine-tuned model and tokenizer are saved back to
    ./output/fine-tuned-gpt2.
    """
    model_name = "./output/fine-tuned-gpt2" if os.path.exists("./output/fine-tuned-gpt2") else "gpt2"
    print(f"Training model: {model_name}")
    # Load pre-trained or fine-tuned GPT-2 model and tokenizer
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    # GPT-2 has no pad token; reuse EOS so padding works
    tokenizer.pad_token = tokenizer.eos_token
    # Load dataset from JSON files.
    # NOTE(review): train and validation point at the SAME file, so eval loss
    # measures memorization, not generalization — confirm this is intended.
    dataset = load_dataset('json', data_files={'train': 'Data/Sample_validation.json', 'validation': 'Data/Sample_validation.json'})
    # Preprocess the dataset (tokenize and format it)
    tokenized_datasets = preprocess_data(dataset, tokenizer)
    # Set up Data Collator for Language Modeling (mlm=False => causal LM)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    # Training arguments
    training_args = TrainingArguments(
        output_dir="./output",
        learning_rate=3e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        evaluation_strategy="epoch",  # Evaluate based on epochs
        num_train_epochs=10,
        weight_decay=0.01,
        save_total_limit=2,
        logging_dir='./logs',
        logging_strategy="epoch",  # Log at the end of each epoch
        logging_steps=100,  # Log training progress frequently
        save_steps=500,  # Save checkpoints every 500 steps
    )
    # Trainer setup
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
        data_collator=data_collator
    )
    # Resume only when a checkpoint actually exists: `resume_from_checkpoint`
    # must be passed to trainer.train() (it is ignored as a TrainingArguments
    # field), and passing True with no checkpoint present raises an error.
    has_checkpoint = os.path.isdir("./output") and any(
        d.startswith("checkpoint") for d in os.listdir("./output")
    )
    trainer.train(resume_from_checkpoint=True if has_checkpoint else None)
    # Save the model and tokenizer
    model.save_pretrained("./output/fine-tuned-gpt2")
    tokenizer.save_pretrained("./output/fine-tuned-gpt2")
# Function to load the saved model
def load_model():
    """Load the fine-tuned GPT-2 model and its tokenizer from disk."""
    save_dir = "./output/fine-tuned-gpt2"
    return (
        GPT2LMHeadModel.from_pretrained(save_dir),
        GPT2Tokenizer.from_pretrained(save_dir),
    )
# Remove repetitive sentences in responses
def remove_repetitive_sentences(text):
    """Collapse duplicate sentences in *text*, keeping first occurrences.

    Sentences are compared case-insensitively with whitespace normalized, so
    "Hello there." and "hello  THERE." count as duplicates; the original
    casing of the first occurrence is what gets kept.

    Args:
        text: The generated response text.

    Returns:
        The text with duplicate sentences removed, joined by single spaces.
    """
    # Split after '.', '?' or '!' followed by whitespace; the negative
    # lookbehinds avoid splitting inside abbreviations like "e.g." or "Dr.".
    # ('!' added so repeated exclamatory sentences are also deduplicated.)
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s', text)
    seen = set()
    unique = []
    for sentence in sentences:
        # Normalize whitespace and case so near-identical repeats match
        normalized = re.sub(r'\s+', ' ', sentence.strip().lower())
        if normalized and normalized not in seen:
            seen.add(normalized)
            unique.append(sentence.strip())
    return ' '.join(unique)
# Function to generate response
def generate_response(prompt, model, tokenizer):
    """Generate a chatbot reply for *prompt* with the fine-tuned model."""
    # Tokenize the prompt into a tensor batch for the model
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    # Sample a continuation; top-k/top-p sampling keeps output varied
    generated = model.generate(
        input_ids,
        max_length=100,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.5,
        pad_token_id=tokenizer.pad_token_id
    )
    # Decode the sampled token ids back to text
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    # Strip sentences the model repeated verbatim
    return remove_repetitive_sentences(decoded)
# Main loop
if __name__ == "__main__":
    # Decide whether to reuse the saved fine-tuned model or train a fresh one
    saved_model_exists = os.path.exists("./output/fine-tuned-gpt2")
    if saved_model_exists:
        print("Saved model found.")
        choice = input("Do you want to train a new model or load the saved model? (train/load): ").strip().lower()
        if choice == 'train':
            train_model()
            print("Model has been trained and saved.")
            model, tokenizer = load_model()
        elif choice == 'load':
            model, tokenizer = load_model()
            print("Model loaded.")
        else:
            print("Invalid choice. Exiting.")
            exit()
    else:
        print("No saved model found. Training a new model...")
        train_model()
        model, tokenizer = load_model()
    # Chat loop: answer questions until the user types 'exit'
    while True:
        user_input = input("Enter your question (or type 'exit' to quit): ")
        if user_input.lower() == 'exit':
            break
        print(f"Response: {generate_response(user_input, model, tokenizer)}")
# (End of pasted listing.)