Chatbot
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from transformers.trainer_utils import get_last_checkpoint
from datasets import load_dataset
import os
import re


# Preprocess and tokenize the dataset
def preprocess_data(dataset, tokenizer, max_length=128):
    def tokenize_function(examples):
        # Concatenate question and answer so the model learns both together
        concatenated = [f"Question: {q} Answer: {a}" for q, a in zip(examples['question'], examples['answer'])]
        result = tokenizer(concatenated, truncation=True, padding='max_length', max_length=max_length)
        result['labels'] = result['input_ids']  # Labels mirror input_ids for causal language modeling
        return result

    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    return tokenized_datasets


# Train or fine-tune the model
def train_model():
    model_name = "./output/fine-tuned-gpt2" if os.path.exists("./output/fine-tuned-gpt2") else "gpt2"
    print(f"Training model: {model_name}")

    # Load the pre-trained or previously fine-tuned GPT-2 model and tokenizer
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    # GPT-2 has no padding token, so reuse the end-of-text token
    tokenizer.pad_token = tokenizer.eos_token

    # Load dataset from JSON files
    dataset = load_dataset('json', data_files={'train': 'Data/Sample_validation.json',
                                               'validation': 'Data/Sample_validation.json'})

    # Preprocess the dataset (tokenize and format it)
    tokenized_datasets = preprocess_data(dataset, tokenizer)

    # Data collator for causal language modeling (mlm=False); it handles batching and padding
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./output",
        learning_rate=3e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        evaluation_strategy="epoch",   # Evaluate at the end of each epoch
        num_train_epochs=10,
        weight_decay=0.01,
        save_total_limit=2,            # Keep only the two most recent checkpoints
        logging_dir='./logs',
        logging_strategy="epoch",      # Log at the end of each epoch
        save_steps=500,                # Save a checkpoint every 500 steps
    )

    # Trainer setup
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
        data_collator=data_collator,
    )

    # Train, resuming from the last checkpoint in ./output if one exists
    # (resume_from_checkpoint is an argument to trainer.train(), not TrainingArguments)
    last_checkpoint = get_last_checkpoint("./output") if os.path.isdir("./output") else None
    trainer.train(resume_from_checkpoint=last_checkpoint)

    # Save the model and tokenizer
    model.save_pretrained("./output/fine-tuned-gpt2")
    tokenizer.save_pretrained("./output/fine-tuned-gpt2")


# Load the saved model and tokenizer
def load_model():
    model = GPT2LMHeadModel.from_pretrained("./output/fine-tuned-gpt2")
    tokenizer = GPT2Tokenizer.from_pretrained("./output/fine-tuned-gpt2")
    return model, tokenizer


# Remove repetitive sentences from responses
def remove_repetitive_sentences(text):
    # Split the text into sentences; the lookbehinds avoid splitting on abbreviations and initials
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    seen = set()
    result = []
    for sentence in sentences:
        # Normalize whitespace and case so near-duplicates compare equal
        cleaned_sentence = re.sub(r'\s+', ' ', sentence.strip().lower())
        # Keep only the first occurrence of each sentence
        if cleaned_sentence and cleaned_sentence not in seen:
            result.append(sentence.strip())
            seen.add(cleaned_sentence)
    return ' '.join(result)


# Generate a response to a prompt
def generate_response(prompt, model, tokenizer):
    # Encode the prompt; passing the attention mask along avoids ambiguity since pad and eos tokens match
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        **inputs,
        max_length=100,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.5,
        pad_token_id=tokenizer.pad_token_id,
    )

    # Decode the generated text and strip repeated sentences
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = remove_repetitive_sentences(response)
    return response


# Main loop
if __name__ == "__main__":
    # Check whether a fine-tuned model has already been saved
    if os.path.exists("./output/fine-tuned-gpt2"):
        print("Saved model found.")
        choice = input("Do you want to train a new model or load the saved model? (train/load): ").strip().lower()
        if choice == 'train':
            train_model()
            print("Model has been trained and saved.")
            model, tokenizer = load_model()
        elif choice == 'load':
            model, tokenizer = load_model()
            print("Model loaded.")
        else:
            print("Invalid choice. Exiting.")
            exit()
    else:
        print("No saved model found. Training a new model...")
        train_model()
        model, tokenizer = load_model()

    # Interactive loop to input questions
    while True:
        user_input = input("Enter your question (or type 'exit' to quit): ")
        if user_input.lower() == 'exit':
            break
        response = generate_response(user_input, model, tokenizer)
        print(f"Response: {response}")
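
The paste does not include Data/Sample_validation.json, but the tokenizer step reads 'question' and 'answer' fields from it. Below is a minimal, hypothetical sketch of a compatible file, written as JSON Lines (one object per line, a layout load_dataset('json', ...) accepts); the sample rows are placeholders, not the original data.

import json
import os

# Hypothetical sample rows in the field layout preprocess_data() expects
rows = [
    {"question": "What is GPT-2?", "answer": "A generative language model released by OpenAI."},
    {"question": "What does fine-tuning do?", "answer": "It adapts a pre-trained model to a specific dataset."},
]

os.makedirs("Data", exist_ok=True)
with open("Data/Sample_validation.json", "w", encoding="utf-8") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")

Note that the script points both the train and validation splits at the same file, so the reported evaluation loss measures memorization of the training data rather than generalization; separate files would give a more meaningful validation signal.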