import sagemaker
from sagemaker.inputs import CreateModelInput
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.pytorch import PyTorch, PyTorchModel
from sagemaker.workflow.functions import Join
from sagemaker.workflow.steps import ProcessingStep, TrainingStep, CreateModelStep
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.parameters import ParameterString, ParameterInteger, ParameterFloat, ParameterBoolean
# Define parameters for the pipeline
data_bucket_param = ParameterString(name="DataBucket", default_value="your-bucket")
train_data_key_param = ParameterString(name="TrainDataKey", default_value="your-data-path/train.csv")
validation_data_key_param = ParameterString(name="ValidationDataKey", default_value="your-data-path/validation.csv")
test_data_key_param = ParameterString(name="TestDataKey", default_value="your-data-path/test.csv")
num_classes_param = ParameterInteger(name="NumClasses", default_value=2)
multi_label_param = ParameterBoolean(name="MultiLabel", default_value=False)
train_instance_type_param = ParameterString(name="TrainInstanceType", default_value="ml.m5.large")
deploy_instance_type_param = ParameterString(name="DeployInstanceType", default_value="ml.m5.large")
deploy_model_param = ParameterBoolean(name="DeployModel", default_value=False)
epochs_param = ParameterInteger(name="Epochs", default_value=10)
batch_size_param = ParameterInteger(name="BatchSize", default_value=16)
learning_rate_param = ParameterFloat(name="LearningRate", default_value=0.001)
# Define the processor for data preprocessing
processor = ScriptProcessor(
    image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.6.0-cpu-py36-ubuntu18.04",
    command=["python3"],
    role="YourSageMakerRole",
    instance_count=1,
    instance_type="ml.m5.large"
)
# Define a Processing Step.
# Pipeline parameters cannot be interpolated with Python f-strings (they are
# resolved at execution time), so the S3 URIs are built with Join instead.
# Each ProcessingInput gets its own destination directory inside the container.
step_process = ProcessingStep(
    name="PreprocessData",
    processor=processor,
    inputs=[
        ProcessingInput(
            source=Join(on="", values=["s3://", data_bucket_param, "/", train_data_key_param]),
            destination="/opt/ml/processing/input/train"
        ),
        ProcessingInput(
            source=Join(on="", values=["s3://", data_bucket_param, "/", validation_data_key_param]),
            destination="/opt/ml/processing/input/validation"
        ),
        ProcessingInput(
            source=Join(on="", values=["s3://", data_bucket_param, "/", test_data_key_param]),
            destination="/opt/ml/processing/input/test"
        )
    ],
    outputs=[
        ProcessingOutput(
            source="/opt/ml/processing/output",
            destination=Join(on="", values=["s3://", data_bucket_param, "/processed-data/"])
        )
    ],
    # preprocess.py reads these arguments via argparse
    job_arguments=[
        "--input_dir", "/opt/ml/processing/input",
        "--output_dir", "/opt/ml/processing/output"
    ],
    code="preprocess.py"
)
# Define the estimator for training the SetFit model
estimator = PyTorch(
    entry_point="train.py",
    role="YourSageMakerRole",
    instance_count=1,
    instance_type=train_instance_type_param,
    framework_version="1.6.0",
    py_version="py3",
    hyperparameters={
        "num_classes": num_classes_param,
        "multi_label": multi_label_param,
        "epochs": epochs_param,
        "batch_size": batch_size_param,
        "learning_rate": learning_rate_param
    }
)
# Define a Training Step. Because the S3 URIs are built by hand rather than
# taken from the processing step's properties, depends_on makes the ordering explicit.
step_train = TrainingStep(
    name="TrainModel",
    estimator=estimator,
    depends_on=["PreprocessData"],
    inputs={
        "train": sagemaker.inputs.TrainingInput(
            s3_data=Join(on="", values=["s3://", data_bucket_param, "/processed-data/train.csv"]),
            content_type="csv"
        ),
        "validation": sagemaker.inputs.TrainingInput(
            s3_data=Join(on="", values=["s3://", data_bucket_param, "/processed-data/validation.csv"]),
            content_type="csv"
        ),
        "test": sagemaker.inputs.TrainingInput(
            s3_data=Join(on="", values=["s3://", data_bucket_param, "/processed-data/test.csv"]),
            content_type="csv"
        )
    }
)
# Optionally, create a SageMaker Model from the training output so it can be
# deployed later. A separate serving entry point ("inference.py", assumed to
# sit next to train.py) loads the SetFit model and runs predictions.
model = PyTorchModel(
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    role="YourSageMakerRole",
    entry_point="inference.py",
    framework_version="1.6.0",
    py_version="py3"
)
step_create_model = CreateModelStep(
    name="CreateSetFitModel",
    model=model,
    inputs=CreateModelInput(instance_type=deploy_instance_type_param)
)
# Create the pipeline
pipeline = Pipeline(
    name="SetFitTrainingPipeline",
    parameters=[
        data_bucket_param, train_data_key_param, validation_data_key_param, test_data_key_param,
        num_classes_param, multi_label_param, train_instance_type_param, deploy_instance_type_param,
        deploy_model_param, epochs_param, batch_size_param, learning_rate_param
    ],
    steps=[step_process, step_train]  # Add step_create_model (e.g. behind a ConditionStep on DeployModel) if needed
)
# Submit the pipeline definition to SageMaker (upsert creates or updates it) and start an execution
pipeline.upsert(role_arn="YourSageMakerRole")
execution = pipeline.start()
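
# A sketch of overriding the parameter defaults for a particular run; the keys
# must match the parameter names defined above, and the values are illustrative only.
execution = pipeline.start(
    parameters={
        "Epochs": 20,
        "BatchSize": 32,
        "LearningRate": 0.0005,
        "TrainInstanceType": "ml.m5.xlarge"
    }
)
execution.wait()  # Block until the pipeline execution finishes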

# preprocess.py (runs inside the SageMaker Processing job defined above)
import argparse
import os
import shutil
import pandas as pd

def preprocess(input_dir, output_dir, needs_preprocessing):
    # Paths for input files; each ProcessingInput above is mounted in its own
    # subdirectory of the input directory
    train_path = os.path.join(input_dir, 'train', 'train.csv')
    validation_path = os.path.join(input_dir, 'validation', 'validation.csv')
    test_path = os.path.join(input_dir, 'test', 'test.csv')
    # Paths for output files
    processed_train_path = os.path.join(output_dir, 'train.csv')
    processed_validation_path = os.path.join(output_dir, 'validation.csv')
    processed_test_path = os.path.join(output_dir, 'test.csv')
    if needs_preprocessing:
        # Load data
        train_data = pd.read_csv(train_path)
        validation_data = pd.read_csv(validation_path)
        test_data = pd.read_csv(test_path)
        # Perform your preprocessing here
        # Example: Remove rows with missing values
        train_data.dropna(inplace=True)
        validation_data.dropna(inplace=True)
        test_data.dropna(inplace=True)
        # Save processed data
        train_data.to_csv(processed_train_path, index=False)
        validation_data.to_csv(processed_validation_path, index=False)
        test_data.to_csv(processed_test_path, index=False)
    else:
        # Copy files directly to the output directory if no preprocessing is needed
        shutil.copy(train_path, processed_train_path)
        shutil.copy(validation_path, processed_validation_path)
        shutil.copy(test_path, processed_test_path)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_dir", type=str, required=True)
    parser.add_argument("--output_dir", type=str, required=True)
    # argparse's type=bool treats any non-empty string as True, so expose a flag instead
    parser.add_argument("--needs_preprocessing", action="store_true")
    args = parser.parse_args()
    preprocess(args.input_dir, args.output_dir, args.needs_preprocessing)
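
# Example invocation, matching the job_arguments defined in the pipeline
# (the --needs_preprocessing flag is optional):
#
#   python3 preprocess.py --input_dir /opt/ml/processing/input \
#       --output_dir /opt/ml/processing/output --needs_preprocessing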

# train.py (runs inside the SageMaker Training job defined above)
import argparse
import os
import pandas as pd
from datasets import Dataset
from setfit import SetFitModel, SetFitTrainer
from sklearn.metrics import accuracy_score, f1_score

def load_data(train_path, validation_path, test_path):
    train_data = pd.read_csv(train_path)
    validation_data = pd.read_csv(validation_path)
    test_data = pd.read_csv(test_path)
    return train_data, validation_data, test_data
def train(train_data, validation_data, test_data, num_classes, multi_label, epochs, batch_size, learning_rate):
    # Load a pretrained model. SetFit infers the number of classes from the
    # training labels, so num_classes is kept only for validation/logging.
    model = SetFitModel.from_pretrained(
        "sentence-transformers/paraphrase-mpnet-base-v2",
        multi_target_strategy="one-vs-rest" if multi_label else None
    )
    # Create a SetFit trainer; the datasets must be datasets.Dataset objects
    # with 'text' and 'label' columns
    trainer = SetFitTrainer(
        model=model,
        train_dataset=Dataset.from_dict({"text": train_data['text'].tolist(), "label": train_data['label'].tolist()}),
        eval_dataset=Dataset.from_dict({"text": validation_data['text'].tolist(), "label": validation_data['label'].tolist()}),
        batch_size=batch_size,
        num_epochs=epochs,
        learning_rate=learning_rate
    )
    # Train the model
    trainer.train()
    # Evaluate the model on the held-out test set
    y_pred = model.predict(test_data['text'].tolist())
    accuracy = accuracy_score(test_data['label'], y_pred)
    f1 = f1_score(test_data['label'], y_pred, average='weighted')
    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")
    # Save the model to the directory SageMaker packages as the model artifact
    model_save_path = os.path.join(os.environ.get("SM_MODEL_DIR", "/opt/ml/model"), "setfit_model")
    trainer.model.save_pretrained(model_save_path)
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_classes', type=int, required=True)
    # Hyperparameters arrive as strings, so parse the boolean explicitly
    parser.add_argument('--multi_label', type=lambda v: str(v).lower() in ('true', '1', 'yes'), default=False)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--learning_rate', type=float, default=0.001)
    args = parser.parse_args()
    # SageMaker mounts each input channel under /opt/ml/input/data/<channel>
    train_data, validation_data, test_data = load_data(
        os.path.join(os.environ.get('SM_CHANNEL_TRAIN', '.'), 'train.csv'),
        os.path.join(os.environ.get('SM_CHANNEL_VALIDATION', '.'), 'validation.csv'),
        os.path.join(os.environ.get('SM_CHANNEL_TEST', '.'), 'test.csv'))
    train(train_data, validation_data, test_data, args.num_classes, args.multi_label,
          args.epochs, args.batch_size, args.learning_rate)
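
# inference.py: a minimal sketch of the serving entry point assumed by the
# PyTorchModel above. It is not part of the original code; the JSON request
# format and helper names here are assumptions. The SageMaker PyTorch serving
# container calls model_fn/input_fn/predict_fn/output_fn when they are defined.
import json
import os
from setfit import SetFitModel

def model_fn(model_dir):
    # Load the SetFit model that train.py saved under <model_dir>/setfit_model
    return SetFitModel.from_pretrained(os.path.join(model_dir, "setfit_model"))

def input_fn(request_body, request_content_type):
    # Expect a JSON payload such as {"texts": ["first example", "second example"]}
    if request_content_type == "application/json":
        return json.loads(request_body)["texts"]
    raise ValueError(f"Unsupported content type: {request_content_type}")

def predict_fn(texts, model):
    # Run SetFit inference on the list of input texts
    return model.predict(texts)

def output_fn(predictions, response_content_type):
    # Return predictions as JSON; assumes integer class labels
    return json.dumps({"predictions": [int(p) for p in predictions]})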