import json
import os
from pprint import pprint
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import (
LoraConfig,
PeftConfig,
PeftModel,
get_peft_model,
prepare_model_for_kbit_training
)
from transformers import (
AutoConfig,
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig
)
MODEL_NAME = 'tiiuae/falcon-7b'
TRAIN_DATASET = load_dataset('json', data_files='/nfs/hpc/share/wildma/dataset_prep/datasets/falcon/verbose_labeled.json', field='data')
# MODEL_NAME = 'vilsonrodrigues/falcon-7b-instruct-sharded'
# Load falcon
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
device_map="auto",
trust_remote_code=True,
quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
def print_trainable_parameters(model):
"""
Prints the number of trainable parameters in the model.
"""
trainable_params = 0
all_param = 0
for _, param in model.named_parameters():
all_param += param.numel()
if param.requires_grad:
trainable_params += param.numel()
print(
f"trainable params: {trainable_params} || all params: {all_param} || trainables%: {100 * trainable_params / all_param}"
)
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
config = LoraConfig(
r=16,
lora_alpha=32,
target_modules=["query_key_value"],
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM"
)
model = get_peft_model(model, config)
print_trainable_parameters(model)
# Test original model
generation_config = model.generation_config
generation_config.max_new_tokens = 200
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id
device = "cuda:0"
prompt = """
<human>: Source: {And I say}. Current translation: {}. What's the next translated Spanish word? Say the rest of the translated words if source ends with punctuation or output EOS if the current translation is finished.
<assistant>:
""".strip()
encoding = tokenizer(prompt, return_tensors="pt").to(device)
with torch.inference_mode():
outputs = model.generate(
input_ids = encoding.input_ids,
attention_mask = encoding.attention_mask,
generation_config = generation_config
)
print(f'Current pre-training output: {tokenizer.decode(outputs[0], skip_special_tokens=True)}')
# Prep dataset
def generate_prompt(data_point):
return data_point.strip()
def generate_and_tokenize_prompt(data_point):
full_prompt = generate_prompt(data_point)
tokenized_full_prompt = tokenizer(full_prompt, padding=True, truncation=True)
return tokenized_full_prompt
data = data['train'].shuffle().map(generate_and_tokenize_prompt)
# Finetune the model
# (If this doesn't work, then try to reemplement in Jupyter)
training_args = transformers.TrainingArguments(
per_device_train_batch_size=1,
gradient_accumulation_steps=4,
num_train_epochs=1,
learning_rate=2e-4,
fp16=True,
save_total_limit=3,
logging_steps=1,
output_dir='/nfs/hpc/share/wildma/ml_model_stuff/training/falcon_train_1',
optim="paged_adamw_8bit",
lr_scheduler_type="cosine",
warmup_ratio=0.05,
)
trainer = transformers.Trainer(
model=model,
train_dataset=data,
args=training_args,
data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False
trainer.train()
model.save_pretrained('trained-falcon')
encoding = tokenizer(prompt, return_tensors="pt").to(device)
with torch.inference_mode():
outputs = model.generate(
input_ids = encoding.input_ids,
attention_mask = encoding.attention_mask,
generation_config = generation_config
)
print(f'Post-training output: {tokenizer.decode(outputs[0], skip_special_tokens=True)}')