from datasets import load_dataset
from transformers import (
RobertaForSequenceClassification,
RobertaTokenizer,
TrainingArguments,
Trainer,
)
import os
os.environ["WANDB_DISABLED"] = "true"
import torch
import numpy as np
from peft import LoraConfig, get_peft_model
from torch.profiler import profile, ProfilerActivity, ExecutionTraceObserver
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['sentence1'], examples['sentence2'], truncation=True, padding='max_length')
model_name = "FacebookAI/roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2) # Binary classification task
# Load the MRPC dataset from the GLUE benchmark
dataset = load_dataset("glue", "mrpc")
tokenized_dataset = dataset.map(tokenize_function, batched=True)
# Set format for PyTorch tensors
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
train_dataset = tokenized_dataset["train"]
valid_dataset = tokenized_dataset["validation"]
test_dataset = tokenized_dataset["test"]
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    use_rslora=True,
    inference_mode=False,
)
# model = get_peft_model(model, peft_config)
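# Sketch (not active in this run): uncommenting the line above wraps the model with
# LoRA adapters so that only the adapter weights are trained. For sequence
# classification, LoraConfig is typically also given task_type="SEQ_CLS" so the
# classifier head stays trainable and is saved alongside the adapter.
# model = get_peft_model(model, peft_config)
# model.print_trainable_parameters()  # confirms the reduced trainable-parameter count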
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # called "evaluation_strategy" in older transformers releases
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    num_train_epochs=1,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    lr_scheduler_type="linear",
    gradient_accumulation_steps=2,
    logging_steps=1,
    optim="adamw_hf",
    warmup_steps=100,
    seed=0,
)
et = ExecutionTraceObserver()
et.register_callback("et_file0.json")
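# ExecutionTraceObserver captures an operator-level PyTorch execution trace of
# everything run between start() and stop(), writing it to the registered JSON file.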
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
)
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.\n")
et.start()
# with profile(
# activities=[ProfilerActivity.CPU,ProfilerActivity.CUDA],
# record_shapes=True,
# profile_memory=True,
# with_stack=True
# ) as prof:
trainer_stats = trainer.train()
et.stop()
et.unregister_callback()
et = None
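# The execution trace has been written to et_file0.json in the working directory; it is
# archived to Google Drive by the py7zr step at the end of this script.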
# prof.export_chrome_trace("execution_trace_chrome.json")
# used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
# used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
# used_percentage = round(used_memory /max_memory*100, 3)
# lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
# print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
# print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
# print(f"Peak reserved memory = {used_memory} GB.")
# print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
# print(f"Peak reserved memory % of max memory = {used_percentage} %.")
# print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
# Save model 1
# model.save_pretrained("./roberta-mrpc-finetuned-model-1")
# tokenizer.save_pretrained("./roberta-mrpc-finetuned-model-1")
# # Function to calculate accuracy
# def calculate_accuracy(trainer, eval_dataset):
# predictions = trainer.predict(eval_dataset)
# preds = np.argmax(predictions.predictions, axis=-1)
# labels = predictions.label_ids
# accuracy = (preds == labels).mean()
# return accuracy
# accuracy_aggregated = calculate_accuracy(trainer, test_dataset)
# print(f"Accuracy of Aggregated model: {accuracy_aggregated:.4f}")
# Notebook setup cells (these installs need to run before the training script above):
!pip install torch torchvision torchaudio
!pip install transformers datasets peft accelerate
!pip install py7zr
import py7zr
# Specify the files or directory you want to compress
files_to_compress = ['/content', '/content/et_file0.json']
# Path where you want to save the .7z file in Google Drive
output_7z_path = '/content/drive/My Drive/et_0.7z'
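# Note: writing to /content/drive assumes Google Drive is already mounted in this
# Colab session, e.g.:
# from google.colab import drive
# drive.mount('/content/drive')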
# Create the .7z archive; py7zr's write() takes one path at a time, so add each entry
# in a loop (use archive.writeall(path) instead if a directory's contents should be
# included recursively)
with py7zr.SevenZipFile(output_7z_path, 'w') as archive:
    for path in files_to_compress:
        archive.write(path)