# Pipeline definition — wires preprocessing, training, and optional model
# creation into a SageMaker Pipeline
import sagemaker
from sagemaker.inputs import CreateModelInput, TrainingInput
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.pytorch import PyTorch, PyTorchModel
from sagemaker.workflow.functions import Join
from sagemaker.workflow.steps import ProcessingStep, TrainingStep, CreateModelStep
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.parameters import ParameterString, ParameterInteger, ParameterFloat, ParameterBoolean

# Define parameters for the pipeline
data_bucket_param = ParameterString(name="DataBucket", default_value="your-bucket")
train_data_key_param = ParameterString(name="TrainDataKey", default_value="your-data-path/train.csv")
validation_data_key_param = ParameterString(name="ValidationDataKey", default_value="your-data-path/validation.csv")
test_data_key_param = ParameterString(name="TestDataKey", default_value="your-data-path/test.csv")
num_classes_param = ParameterInteger(name="NumClasses", default_value=2)
multi_label_param = ParameterBoolean(name="MultiLabel", default_value=False)
train_instance_type_param = ParameterString(name="TrainInstanceType", default_value="ml.m5.large")
deploy_instance_type_param = ParameterString(name="DeployInstanceType", default_value="ml.m5.large")
deploy_model_param = ParameterBoolean(name="DeployModel", default_value=False)
epochs_param = ParameterInteger(name="Epochs", default_value=10)
batch_size_param = ParameterInteger(name="BatchSize", default_value=16)
learning_rate_param = ParameterFloat(name="LearningRate", default_value=0.001)

# Define the processor for data preprocessing
processor = ScriptProcessor(
    image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.6.0-cpu-py36-ubuntu18.04",
    command=["python3"],
    role="YourSageMakerRole",
    instance_count=1,
    instance_type="ml.m5.large"
)

# Define a Processing Step.
# Pipeline parameters cannot be interpolated with f-strings (they resolve at
# execution time), so the S3 URIs are built with Join. ProcessingInput
# destinations must be directories; each file lands inside its destination
# directory under its original name (e.g. /opt/ml/processing/input/train/train.csv).
step_process = ProcessingStep(
    name="PreprocessData",
    processor=processor,
    inputs=[
        ProcessingInput(
            source=Join(on="", values=["s3://", data_bucket_param, "/", train_data_key_param]),
            destination="/opt/ml/processing/input/train"
        ),
        ProcessingInput(
            source=Join(on="", values=["s3://", data_bucket_param, "/", validation_data_key_param]),
            destination="/opt/ml/processing/input/validation"
        ),
        ProcessingInput(
            source=Join(on="", values=["s3://", data_bucket_param, "/", test_data_key_param]),
            destination="/opt/ml/processing/input/test"
        )
    ],
    outputs=[
        ProcessingOutput(
            output_name="processed",
            source="/opt/ml/processing/output",
            destination=Join(on="", values=["s3://", data_bucket_param, "/processed-data/"])
        )
    ],
    job_arguments=[
        "--input_dir", "/opt/ml/processing/input",
        "--output_dir", "/opt/ml/processing/output",
        "--needs_preprocessing", "true"
    ],
    code="preprocess.py"
)
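# Note: ScriptProcessor stages preprocess.py under /opt/ml/processing/input/code/
# and runs it as
#   python3 /opt/ml/processing/input/code/preprocess.py <job_arguments>
# so the flags in job_arguments above are exactly what the script's argparse sees.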

# Define the estimator for training the SetFit model
estimator = PyTorch(
    entry_point="train.py",
    role="YourSageMakerRole",
    instance_count=1,
    instance_type=train_instance_type_param,
    framework_version="1.6.0",
    py_version="py36",
    hyperparameters={
        "num_classes": num_classes_param,
        "multi_label": multi_label_param,
        "epochs": epochs_param,
        "batch_size": batch_size_param,
        "learning_rate": learning_rate_param
    }
)
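# At runtime SageMaker passes these hyperparameters to train.py as CLI flags,
# roughly: python train.py --num_classes 2 --multi_label false --epochs 10
#          --batch_size 16 --learning_rate 0.001
# (defaults shown; boolean values arrive as the strings "true"/"false")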

# Define a Training Step. The channel URIs point at the processed files, and
# depends_on makes training wait for preprocessing instead of running in parallel.
step_train = TrainingStep(
    name="TrainModel",
    estimator=estimator,
    inputs={
        "train": TrainingInput(
            s3_data=Join(on="", values=["s3://", data_bucket_param, "/processed-data/train.csv"]),
            content_type="text/csv"
        ),
        "validation": TrainingInput(
            s3_data=Join(on="", values=["s3://", data_bucket_param, "/processed-data/validation.csv"]),
            content_type="text/csv"
        ),
        "test": TrainingInput(
            s3_data=Join(on="", values=["s3://", data_bucket_param, "/processed-data/test.csv"]),
            content_type="text/csv"
        )
    },
    depends_on=[step_process]
)
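# Alternative sketch: build the channel URIs from the processing step's output
# properties, which records the data dependency without depends_on (assumes
# the ProcessingOutput is named "processed" as above):
#   s3_data=Join(on="", values=[
#       step_process.properties.ProcessingOutputConfig.Outputs["processed"].S3Output.S3Uri,
#       "train.csv"])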

# Optionally, define a model-creation step. The artifacts come from the
# training step's properties; estimator.create_model() would not resolve
# inside a pipeline, and a second TrainingStep would simply retrain.
# The serving script "inference.py" is assumed to exist alongside the code.
model = PyTorchModel(
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    role="YourSageMakerRole",
    entry_point="inference.py",  # hypothetical serving entry point
    framework_version="1.6.0",
    py_version="py36"
)
step_create_model = CreateModelStep(
    name="CreateModel",
    model=model,
    inputs=CreateModelInput(instance_type=deploy_instance_type_param)
)
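# Sketch: deploying after a successful execution, outside the pipeline, using
# the trained artifact's S3 URI from the training job (path illustrative):
#   PyTorchModel(
#       model_data="s3://your-bucket/.../model.tar.gz",
#       role="YourSageMakerRole", entry_point="inference.py",
#       framework_version="1.6.0", py_version="py36"
#   ).deploy(initial_instance_count=1, instance_type="ml.m5.large")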

# Create the pipeline
pipeline = Pipeline(
    name="SetFitTrainingPipeline",
    parameters=[
        data_bucket_param, train_data_key_param, validation_data_key_param, test_data_key_param,
        num_classes_param, multi_label_param, train_instance_type_param, deploy_instance_type_param,
        deploy_model_param, epochs_param, batch_size_param, learning_rate_param
    ],
    steps=[step_process, step_train]  # Append step_create_model when DeployModel is enabled
)

# Submit the pipeline to SageMaker; upsert creates the pipeline or updates an
# existing definition, so re-running this script is safe
pipeline.upsert(role_arn="YourSageMakerRole")
execution = pipeline.start()
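# Sketch: starting a run with non-default parameters; keys must match the
# Parameter names defined above (values here are illustrative):
#   execution = pipeline.start(parameters={
#       "Epochs": 20,
#       "LearningRate": 0.0005,
#       "MultiLabel": True,
#   })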


# preprocess.py — data preparation script used by the ProcessingStep
import argparse
import os
import shutil
import pandas as pd

def preprocess(input_dir, output_dir, needs_preprocessing):
    # Each ProcessingInput is downloaded into its own subdirectory of input_dir
    train_path = os.path.join(input_dir, 'train', 'train.csv')
    validation_path = os.path.join(input_dir, 'validation', 'validation.csv')
    test_path = os.path.join(input_dir, 'test', 'test.csv')

    # Paths for output files
    os.makedirs(output_dir, exist_ok=True)
    processed_train_path = os.path.join(output_dir, 'train.csv')
    processed_validation_path = os.path.join(output_dir, 'validation.csv')
    processed_test_path = os.path.join(output_dir, 'test.csv')
    if needs_preprocessing:
        # Load data
        train_data = pd.read_csv(train_path)
        validation_data = pd.read_csv(validation_path)
        test_data = pd.read_csv(test_path)

        # Perform your preprocessing here
        # Example: Remove rows with missing values
        train_data.dropna(inplace=True)
        validation_data.dropna(inplace=True)
        test_data.dropna(inplace=True)

        # Save processed data
        train_data.to_csv(processed_train_path, index=False)
        validation_data.to_csv(processed_validation_path, index=False)
        test_data.to_csv(processed_test_path, index=False)
    else:
        # Copy files directly to the output directory if no preprocessing is needed
        shutil.copy(train_path, processed_train_path)
        shutil.copy(validation_path, processed_validation_path)
        shutil.copy(test_path, processed_test_path)

def str2bool(value):
    # argparse's type=bool treats any non-empty string (even "false") as True,
    # so parse the flag explicitly
    return str(value).lower() in ("true", "1", "yes")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_dir", type=str, required=True)
    parser.add_argument("--output_dir", type=str, required=True)
    parser.add_argument("--needs_preprocessing", type=str2bool, default=False)
    args = parser.parse_args()

    preprocess(args.input_dir, args.output_dir, args.needs_preprocessing)
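# Example local run for testing outside SageMaker (hypothetical paths; expects
# train/, validation/, and test/ subdirectories under --input_dir):
#   python preprocess.py --input_dir ./data --output_dir ./processed \
#       --needs_preprocessing true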

# train.py — SetFit training script used as the PyTorch estimator entry point
import argparse
import os
import pandas as pd
from datasets import Dataset
from setfit import SetFitModel, SetFitTrainer
from sklearn.metrics import accuracy_score, f1_score

def load_data(train_path, validation_path, test_path):
    train_data = pd.read_csv(train_path)
    validation_data = pd.read_csv(validation_path)
    test_data = pd.read_csv(test_path)
    return train_data, validation_data, test_data

def train(train_data, validation_data, test_data, num_classes, multi_label, epochs, batch_size, learning_rate):
    # Load a pretrained sentence-transformer backbone; multi-label problems
    # need a multi-target strategy on the classification head. num_classes is
    # inferred from the labels by SetFit and kept here for interface parity.
    model = SetFitModel.from_pretrained(
        "sentence-transformers/paraphrase-mpnet-base-v2",
        multi_target_strategy="one-vs-rest" if multi_label else None
    )

    # Create a SetFit trainer; datasets must be datasets.Dataset objects,
    # not plain dicts
    trainer = SetFitTrainer(
        model=model,
        train_dataset=Dataset.from_dict(
            {"text": train_data['text'].tolist(), "label": train_data['label'].tolist()}
        ),
        eval_dataset=Dataset.from_dict(
            {"text": validation_data['text'].tolist(), "label": validation_data['label'].tolist()}
        ),
        batch_size=batch_size,
        num_epochs=epochs,
        learning_rate=learning_rate
    )

    # Train the model
    trainer.train()

    # Evaluate on the held-out test set; prediction is done with the model,
    # not the trainer
    y_pred = model.predict(test_data['text'].tolist())
    accuracy = accuracy_score(test_data['label'], y_pred)
    f1 = f1_score(test_data['label'], y_pred, average='weighted')
    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")

    # Save the model to the directory SageMaker packages as model.tar.gz
    model_save_path = os.path.join(os.environ.get("SM_MODEL_DIR", "/opt/ml/model"), "setfit_model")
    model.save_pretrained(model_save_path)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_classes', type=int, required=True)
    # booleans arrive as strings ("true"/"false"), so parse them explicitly
    parser.add_argument('--multi_label', type=lambda s: str(s).lower() in ('true', '1'), required=True)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--learning_rate', type=float, default=0.001)
    args = parser.parse_args()
    # SageMaker mounts each input channel under SM_CHANNEL_<NAME>
    train_data, validation_data, test_data = load_data(
        os.path.join(os.environ['SM_CHANNEL_TRAIN'], 'train.csv'),
        os.path.join(os.environ['SM_CHANNEL_VALIDATION'], 'validation.csv'),
        os.path.join(os.environ['SM_CHANNEL_TEST'], 'test.csv'))
    train(train_data, validation_data, test_data, args.num_classes,
          args.multi_label, args.epochs, args.batch_size, args.learning_rate)
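# Sketch: loading the saved model for ad-hoc predictions once the training
# artifact is unpacked (path matches model_save_path above):
#   model = SetFitModel.from_pretrained("/opt/ml/model/setfit_model")
#   print(model.predict(["example text to classify"]))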