# Untitled
#
# mail@pastecode.io avatarunknown
# python
# a month ago
# 3.6 kB
# 2
# Indexable
# Never
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator


# Random Forest classifier for the 'churn' label, reading the 'features_m'
# vector column. Every output column is given an 'rf_' prefixed name.
rf_params = {
    # Input / output columns
    'labelCol': 'churn',
    'featuresCol': 'features_m',
    'predictionCol': 'rf_pred',
    'probabilityCol': 'rf_prob',
    'rawPredictionCol': 'rf_raw_pred',
    # Forest size and tree shape
    'numTrees': 50,
    'maxDepth': 10,
    'minInstancesPerNode': 100,
    'maxBins': 32,
    'impurity': 'gini',
    # Sampling and reproducibility
    'featureSubsetStrategy': "auto",
    'subsamplingRate': 1,
    'seed': 2023,
}
rf = RandomForestClassifier(**rf_params)

# Gradient-boosted trees classifier for the same 'churn' label and
# 'features_m' vector column. Only the prediction column is renamed
# ('gbt_pred'); no probability or raw-prediction column name is set here.
gbt_params = {
    # Input / output columns
    'featuresCol': "features_m",
    'labelCol': 'churn',
    'predictionCol': "gbt_pred",
    # Individual tree shape
    'maxDepth': 5,
    'maxBins': 32,
    'minInstancesPerNode': 1,
    'minInfoGain': 0.0,
    # Boosting schedule
    'lossType': "logistic",
    'maxIter': 20,
    'stepSize': 0.1,
    'subsamplingRate': 1.0,
    'seed': 961928988,
    # Training-time resource settings
    'maxMemoryInMB': 256,
    'cacheNodeIds': True,
    'checkpointInterval': 10,
}
gbt = GBTClassifier(**gbt_params)

# Wrap the 'features_m' column into a vector column named 'features'.
# NOTE(review): rf and gbt are both configured with featuresCol='features_m',
# so no stage in this pipeline reads the 'features' column produced here --
# confirm whether this stage is still needed.
assembler = VectorAssembler(
    inputCols=["features_m"],
    outputCol="features",
)

# Chain the assembler and both classifiers. The stages run one after another
# and each classifier appends its own output columns; nothing in this
# pipeline combines the two models' predictions.
pipeline = Pipeline(
    stages=[assembler, rf, gbt],
)

# Fit every stage on the training split.
model = pipeline.fit(train_m)

# Apply the fitted pipeline to the training and test splits.
train_predictions, test_predictions = (
    model.transform(dataset) for dataset in (train_m, test_m)
)

# Score the fitted pipeline's predictions on the training and test splits.
#
# An evaluator only carries column names and a metric name, so one instance
# per metric is shared by both splits. The per-split evaluator names are kept
# as aliases in case later code refers to them.

# --- Random Forest ---------------------------------------------------------

# Area under the ROC curve, ranked by the 'rf_prob' column.
evaluator_train_roc = evaluator_test_roc = BinaryClassificationEvaluator(
    labelCol='churn', rawPredictionCol='rf_prob', metricName='areaUnderROC')
auc_train_roc = evaluator_train_roc.evaluate(train_predictions)
auc_test_roc = evaluator_test_roc.evaluate(test_predictions)

# Area under the precision-recall curve (PR-AUC; this is not a ROC metric).
evaluator_train_pr = evaluator_test_pr = BinaryClassificationEvaluator(
    labelCol='churn', rawPredictionCol='rf_prob', metricName='areaUnderPR')
auc_train_pr = evaluator_train_pr.evaluate(train_predictions)
auc_test_pr = evaluator_test_pr.evaluate(test_predictions)

# F1-score of the hard predictions in 'rf_pred'.
# NOTE(review): per the Spark docs, MulticlassClassificationEvaluator's 'f1'
# is weighted across label classes rather than the F1 of the positive class
# alone -- confirm this is the intended figure for churn.
evaluator_train_f1 = evaluator_test_f1 = MulticlassClassificationEvaluator(
    labelCol='churn', predictionCol='rf_pred', metricName='f1')
f1_train = evaluator_train_f1.evaluate(train_predictions)
f1_test = evaluator_test_f1.evaluate(test_predictions)

# --- Gradient-boosted trees ------------------------------------------------
# The pipeline also fits and applies the GBT model, so report it as well
# instead of leaving its predictions unscored. The GBT stage only renames its
# prediction column ('gbt_pred'), so its scores are read from the default
# 'rawPrediction' column.
# NOTE(review): assumes a Spark version whose GBT model emits 'rawPrediction'
# -- TODO confirm against the cluster's Spark version.
gbt_evaluator_roc = BinaryClassificationEvaluator(
    labelCol='churn', rawPredictionCol='rawPrediction', metricName='areaUnderROC')
gbt_evaluator_pr = BinaryClassificationEvaluator(
    labelCol='churn', rawPredictionCol='rawPrediction', metricName='areaUnderPR')
gbt_evaluator_f1 = MulticlassClassificationEvaluator(
    labelCol='churn', predictionCol='gbt_pred', metricName='f1')

gbt_auc_train_roc = gbt_evaluator_roc.evaluate(train_predictions)
gbt_auc_test_roc = gbt_evaluator_roc.evaluate(test_predictions)
gbt_auc_train_pr = gbt_evaluator_pr.evaluate(train_predictions)
gbt_auc_test_pr = gbt_evaluator_pr.evaluate(test_predictions)
gbt_f1_train = gbt_evaluator_f1.evaluate(train_predictions)
gbt_f1_test = gbt_evaluator_f1.evaluate(test_predictions)

# Print the evaluation results. The unprefixed lines are the Random Forest
# metrics; the GBT metrics are printed with a "GBT" prefix.
print("Train ROC-AUC: ", auc_train_roc)
print("Test ROC-AUC: ", auc_test_roc)
print("Train PR-AUC: ", auc_train_pr)
print("Test PR-AUC: ", auc_test_pr)
print("Train F1-score: ", f1_train)
print("Test F1-score: ", f1_test)
print("GBT Train ROC-AUC: ", gbt_auc_train_roc)
print("GBT Test ROC-AUC: ", gbt_auc_test_roc)
print("GBT Train PR-AUC: ", gbt_auc_train_pr)
print("GBT Test PR-AUC: ", gbt_auc_test_pr)
print("GBT Train F1-score: ", gbt_f1_train)
print("GBT Test F1-score: ", gbt_f1_test)