Falcon 7B training
import json
import os
from pprint import pprint

import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)

MODEL_NAME = 'tiiuae/falcon-7b'
# MODEL_NAME = 'vilsonrodrigues/falcon-7b-instruct-sharded'
TRAIN_DATASET = load_dataset(
    'json',
    data_files='/nfs/hpc/share/wildma/dataset_prep/datasets/falcon/verbose_labeled.json',
    field='data'
)

# Load Falcon in 4-bit (NF4 with double quantization) for QLoRA training
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token


def print_trainable_parameters(model):
    """Prints the number of trainable parameters in the model."""
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || "
        f"trainable%: {100 * trainable_params / all_param}"
    )


model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# LoRA adapters on Falcon's fused attention projection
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

# Test the original (pre-finetuning) model
generation_config = model.generation_config
generation_config.max_new_tokens = 200
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

device = "cuda:0"

prompt = """
<human>: Source: {And I say}. Current translation: {}. What's the next translated Spanish word? Say the rest of the translated words if source ends with punctuation or output EOS if the current translation is finished.
<assistant>: """.strip()

encoding = tokenizer(prompt, return_tensors="pt").to(device)
with torch.inference_mode():
    outputs = model.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        generation_config=generation_config
    )
print(f'Current pre-training output: {tokenizer.decode(outputs[0], skip_special_tokens=True)}')


# Prep dataset
def generate_prompt(data_point):
    # NOTE: datasets' `map` passes each JSON record as a dict, so this assumes
    # the prompt text has already been extracted from the record; if not, index
    # the appropriate text column here before stripping.
    return data_point.strip()


def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point)
    tokenized_full_prompt = tokenizer(full_prompt, padding=True, truncation=True)
    return tokenized_full_prompt


# Shuffle and tokenize the train split of the dataset loaded above
data = TRAIN_DATASET['train'].shuffle().map(generate_and_tokenize_prompt)

# Finetune the model
# (If this doesn't work, then try to reimplement in Jupyter)
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=True,
    save_total_limit=3,
    logging_steps=1,
    output_dir='/nfs/hpc/share/wildma/ml_model_stuff/training/falcon_train_1',
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
)

trainer = transformers.Trainer(
    model=model,
    train_dataset=data,
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False  # caching is incompatible with gradient checkpointing during training
trainer.train()

# Save only the LoRA adapter weights
model.save_pretrained('trained-falcon')

# Regenerate with the same prompt to compare against the pre-training output
encoding = tokenizer(prompt, return_tensors="pt").to(device)
with torch.inference_mode():
    outputs = model.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        generation_config=generation_config
    )
print(f'Post-training output: {tokenizer.decode(outputs[0], skip_special_tokens=True)}')
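
# Optional follow-up: a minimal sketch of reloading the adapter saved above for
# later inference. It assumes the 'trained-falcon' directory written by
# model.save_pretrained() and re-applies the adapter on top of a fresh 4-bit
# base model via peft's PeftModel.from_pretrained (PeftConfig and PeftModel are
# already imported above).
peft_config = PeftConfig.from_pretrained('trained-falcon')
base_model = AutoModelForCausalLM.from_pretrained(
    peft_config.base_model_name_or_path,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config
)
inference_model = PeftModel.from_pretrained(base_model, 'trained-falcon')
inference_model.eval()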