from datasets import load_dataset
from transformers import (
    RobertaForSequenceClassification,
    RobertaTokenizer,
    TrainingArguments,
    Trainer,
)
import os
os.environ["WANDB_DISABLED"] = "true"
import torch
import numpy as np
from peft import LoraConfig, get_peft_model
from torch.profiler import profile, ProfilerActivity, ExecutionTraceObserver


# Tokenize the MRPC sentence pairs
def tokenize_function(examples):
    return tokenizer(examples['sentence1'], examples['sentence2'], truncation=True, padding='max_length')


model_name = "FacebookAI/roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)  # binary classification task

# Load the MRPC dataset from the GLUE benchmark
dataset = load_dataset("glue", "mrpc")
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Set format for PyTorch tensors
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

train_dataset = tokenized_dataset["train"]
valid_dataset = tokenized_dataset["validation"]
test_dataset = tokenized_dataset["test"]

peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    use_rslora=True,
    inference_mode=False,
)
# Uncomment to wrap the model with LoRA adapters:
# model = get_peft_model(model, peft_config)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    num_train_epochs=1,
    weight_decay=0.01,
    eval_strategy="epoch",  # use evaluation_strategy="epoch" on older transformers versions
    save_strategy="epoch",
    load_best_model_at_end=True,
    lr_scheduler_type="linear",
    gradient_accumulation_steps=2,
    logging_steps=1,
    optim="adamw_hf",
    warmup_steps=100,
    seed=0,
)

# Record a PyTorch Execution Trace of the training run
et = ExecutionTraceObserver()
et.register_callback("et_file0.json")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
)

gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.\n")

et.start()
# with profile(
#     activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
#     record_shapes=True,
#     profile_memory=True,
#     with_stack=True,
# ) as prof:
trainer_stats = trainer.train()
et.stop()
et.unregister_callback()
et = None

# prof.export_chrome_trace("execution_trace_chrome.json")

# used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
# used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
# used_percentage = round(used_memory / max_memory * 100, 3)
# lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
# print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
# print(f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training.")
# print(f"Peak reserved memory = {used_memory} GB.")
# print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
# print(f"Peak reserved memory % of max memory = {used_percentage} %.")
# print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

# Save model 1
# model.save_pretrained("./roberta-mrpc-finetuned-model-1")
# tokenizer.save_pretrained("./roberta-mrpc-finetuned-model-1")

# # Function to calculate accuracy
# def calculate_accuracy(trainer, eval_dataset):
#     predictions = trainer.predict(eval_dataset)
#     preds = np.argmax(predictions.predictions, axis=-1)
#     labels = predictions.label_ids
#     accuracy = (preds == labels).mean()
#     return accuracy

# accuracy_aggregated = calculate_accuracy(trainer, test_dataset)
# print(f"Accuracy of Aggregated model: {accuracy_aggregated:.4f}")

# ---- Colab/Jupyter shell commands (run as separate notebook cells) ----
!pip install torch torchvision torchaudio
!pip install transformers datasets peft accelerate
!pip install py7zr

import py7zr

# Specify the files or directory you want to compress
files_to_compress = ['/content', '/content/et_file0.json']

# Path where you want to save the .7z file in Google Drive
# (assumes Google Drive is already mounted at /content/drive)
output_7z_path = '/content/drive/My Drive/et_0.7z'

# Create the .7z archive; py7zr's write() takes a single path,
# so iterate and recurse into directories with writeall()
with py7zr.SevenZipFile(output_7z_path, 'w') as archive:
    for path in files_to_compress:
        if os.path.isdir(path):
            archive.writeall(path, arcname=os.path.basename(path))
        else:
            archive.write(path, arcname=os.path.basename(path))
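# Note: as written, the script fine-tunes the full RoBERTa model; the LoRA config above
# is only applied if the get_peft_model line is uncommented. A minimal sketch (using the
# peft APIs already imported above) of enabling it and verifying the adapters are attached:
#
#     model = get_peft_model(model, peft_config)
#     model.print_trainable_parameters()  # reports trainable vs. total parameter counts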