# ---------------------------------------------------------------------------
# pipeline.py – define and launch the SageMaker Pipeline
# ---------------------------------------------------------------------------
import sagemaker
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.pytorch import PyTorch
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.functions import Join
from sagemaker.workflow.parameters import (
    ParameterString,
    ParameterInteger,
    ParameterFloat,
    ParameterBoolean,
)

role = "YourSageMakerRole"  # replace with the ARN of your SageMaker execution role

# Define parameters for the pipeline
data_bucket_param = ParameterString(name="DataBucket", default_value="your-bucket")
train_data_key_param = ParameterString(name="TrainDataKey", default_value="your-data-path/train.csv")
validation_data_key_param = ParameterString(name="ValidationDataKey", default_value="your-data-path/validation.csv")
test_data_key_param = ParameterString(name="TestDataKey", default_value="your-data-path/test.csv")
num_classes_param = ParameterInteger(name="NumClasses", default_value=2)
multi_label_param = ParameterBoolean(name="MultiLabel", default_value=False)
train_instance_type_param = ParameterString(name="TrainInstanceType", default_value="ml.m5.large")
deploy_instance_type_param = ParameterString(name="DeployInstanceType", default_value="ml.m5.large")
deploy_model_param = ParameterBoolean(name="DeployModel", default_value=False)
epochs_param = ParameterInteger(name="Epochs", default_value=10)
batch_size_param = ParameterInteger(name="BatchSize", default_value=16)
learning_rate_param = ParameterFloat(name="LearningRate", default_value=0.001)

# Pipeline parameters are resolved at execution time and cannot be embedded in
# f-strings, so the S3 URIs are built with Join instead.
train_input_s3 = Join(on="", values=["s3://", data_bucket_param, "/", train_data_key_param])
validation_input_s3 = Join(on="", values=["s3://", data_bucket_param, "/", validation_data_key_param])
test_input_s3 = Join(on="", values=["s3://", data_bucket_param, "/", test_data_key_param])
processed_prefix_s3 = Join(on="", values=["s3://", data_bucket_param, "/processed-data/"])

# Define the processor for data preprocessing
processor = ScriptProcessor(
    image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.6.0-cpu-py36-ubuntu18.04",
    command=["python3"],
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
)

# Define a Processing Step. Each input is downloaded into its own directory
# inside the container; preprocess.py reads the CSVs from those subdirectories.
step_process = ProcessingStep(
    name="PreprocessData",
    processor=processor,
    inputs=[
        ProcessingInput(source=train_input_s3, destination="/opt/ml/processing/input/train"),
        ProcessingInput(source=validation_input_s3, destination="/opt/ml/processing/input/validation"),
        ProcessingInput(source=test_input_s3, destination="/opt/ml/processing/input/test"),
    ],
    outputs=[
        ProcessingOutput(source="/opt/ml/processing/output", destination=processed_prefix_s3),
    ],
    # preprocess.py requires these arguments; add "--needs_preprocessing", "true"
    # to enable the example cleaning instead of a plain copy-through
    job_arguments=[
        "--input_dir", "/opt/ml/processing/input",
        "--output_dir", "/opt/ml/processing/output",
    ],
    code="preprocess.py",
)

# Define the estimator for training the SetFit model
estimator = PyTorch(
    entry_point="train.py",
    role=role,
    instance_count=1,
    instance_type=train_instance_type_param,
    framework_version="1.6.0",
    py_version="py3",
    hyperparameters={
        "num_classes": num_classes_param,
        "multi_label": multi_label_param,
        "epochs": epochs_param,
        "batch_size": batch_size_param,
        "learning_rate": learning_rate_param,
    },
)

# Define a Training Step. The channels read from the prefix written by the
# processing step, so the explicit depends_on makes the ordering unambiguous.
step_train = TrainingStep(
    name="TrainModel",
    estimator=estimator,
    depends_on=[step_process],
    inputs={
        "train": sagemaker.inputs.TrainingInput(
            s3_data=Join(on="", values=["s3://", data_bucket_param, "/processed-data/train.csv"]),
            content_type="csv",
        ),
        "validation": sagemaker.inputs.TrainingInput(
            s3_data=Join(on="", values=["s3://", data_bucket_param, "/processed-data/validation.csv"]),
            content_type="csv",
        ),
        "test": sagemaker.inputs.TrainingInput(
            s3_data=Join(on="", values=["s3://", data_bucket_param, "/processed-data/test.csv"]),
            content_type="csv",
        ),
    },
)

# Create the pipeline. DeployInstanceType and DeployModel are declared so an
# optional deployment step can be added later (see the sketch below).
pipeline = Pipeline(
    name="SetFitTrainingPipeline",
    parameters=[
        data_bucket_param,
        train_data_key_param,
        validation_data_key_param,
        test_data_key_param,
        num_classes_param,
        multi_label_param,
        train_instance_type_param,
        deploy_instance_type_param,
        deploy_model_param,
        epochs_param,
        batch_size_param,
        learning_rate_param,
    ],
    steps=[step_process, step_train],  # add a deployment step here if required
)

# Submit the pipeline to SageMaker and start an execution
pipeline.create(role_arn=role)
pipeline.start()
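# ---------------------------------------------------------------------------
# Optional deployment step (a minimal sketch, not wired into the pipeline
# above). If the DeployModel flag should actually gate deployment, a
# ConditionStep can wrap a CreateModelStep built from the training step's
# model artifacts. The serving script "inference.py" is a hypothetical
# placeholder; adapt the details and add step_deploy_condition to the
# pipeline's steps list before calling pipeline.create().
# ---------------------------------------------------------------------------
from sagemaker.inputs import CreateModelInput
from sagemaker.pytorch import PyTorchModel
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.conditions import ConditionEquals
from sagemaker.workflow.steps import CreateModelStep

# Wrap the trained artifacts in a PyTorchModel ("inference.py" would be a
# serving script you provide)
setfit_model = PyTorchModel(
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    role=role,
    entry_point="inference.py",
    framework_version="1.6.0",
    py_version="py3",
)

step_create_model = CreateModelStep(
    name="CreateSetFitModel",
    model=setfit_model,
    inputs=CreateModelInput(instance_type=deploy_instance_type_param),
)

# Only register the model when the DeployModel parameter is True
step_deploy_condition = ConditionStep(
    name="CheckDeployFlag",
    conditions=[ConditionEquals(left=deploy_model_param, right=True)],
    if_steps=[step_create_model],
    else_steps=[],
)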
"validation": sagemaker.inputs.TrainingInput( s3_data=f"s3://{data_bucket_param}/processed-data/validation.csv", content_type="csv" ), "test": sagemaker.inputs.TrainingInput( s3_data=f"s3://{data_bucket_param}/processed-data/test.csv", content_type="csv" ) } ) # Create the pipeline pipeline = Pipeline( name="SetFitTrainingPipeline", parameters=[ data_bucket_param, train_data_key_param, validation_data_key_param, test_data_key_param, num_classes_param, multi_label_param, train_instance_type_param, deploy_instance_type_param, deploy_model_param, epochs_param, batch_size_param, learning_rate_param ], steps=[step_process, step_train] # Add step_deploy if deployment is required ) # Submit the pipeline to SageMaker pipeline.create() pipeline.start() import argparse import os import shutil import pandas as pd def preprocess(input_dir, output_dir, needs_preprocessing): # Paths for input files train_path = os.path.join(input_dir, 'train.csv') validation_path = os.path.join(input_dir, 'validation.csv') test_path = os.path.join(input_dir, 'test.csv') # Paths for output files processed_train_path = os.path.join(output_dir, 'train.csv') processed_validation_path = os.path.join(output_dir, 'validation.csv') processed_test_path = os.path.join(output_dir, 'test.csv') if needs_preprocessing: # Load data train_data = pd.read_csv(train_path) validation_data = pd.read_csv(validation_path) test_data = pd.read_csv(test_path) # Perform your preprocessing here # Example: Remove rows with missing values train_data.dropna(inplace=True) validation_data.dropna(inplace=True) test_data.dropna(inplace=True) # Save processed data train_data.to_csv(processed_train_path, index=False) validation_data.to_csv(processed_validation_path, index=False) test_data.to_csv(processed_test_path, index=False) else: # Copy files directly to the output directory if no preprocessing is needed shutil.copy(train_path, processed_train_path) shutil.copy(validation_path, processed_validation_path) shutil.copy(test_path, processed_test_path) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--input_dir", type=str, required=True) parser.add_argument("--output_dir", type=str, required=True) parser.add_argument("--needs_preprocessing", type=bool, default=False) args = parser.parse_args() preprocess(args.input_dir, args.output_dir, args.needs_preprocessing) import argparse import os import pandas as pd from setfit import SetFitModel, SetFitTrainer from sklearn.metrics import accuracy_score, f1_score def load_data(train_path, validation_path, test_path): train_data = pd.read_csv(train_path) validation_data = pd.read_csv(validation_path) test_data = pd.read_csv(test_path) return train_data, validation_data, test_data def train(train_data, validation_data, test_data, num_classes, multi_label, epochs, batch_size, learning_rate): # Load a pretrained model model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2") # Create a SetFit trainer trainer = SetFitTrainer( model=model, train_dataset={"text": train_data['text'].tolist(), "label": train_data['label'].tolist()}, eval_dataset={"text": validation_data['text'].tolist(), "label": validation_data['label'].tolist()}, batch_size=batch_size, num_iterations=epochs, learning_rate=learning_rate ) # Train the model trainer.train() # Evaluate the model y_pred = trainer.predict(test_data['text'].tolist()) accuracy = accuracy_score(test_data['label'], y_pred) f1 = f1_score(test_data['label'], y_pred, average='weighted') print(f"Accuracy: {accuracy}") 
print(f"F1 Score: {f1}") # Save the model model_save_path = os.path.join(os.getenv("SM_MODEL_DIR"), "setfit_model") trainer.model.save_pretrained(model_save_path) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--num_classes', type=int, required=True) parser.add_argument('--multi_label', type=bool, required=True) parser.add_argument('--epochs', type=int, default=10) parser.add_argument('--batch_size', type=int, default=16) parser.add_argument('--learning_rate', type=float, default=0.001) args