from pyspark.ml.classification import RandomForestClassifier, GBTClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Define the Random Forest model
rf = RandomForestClassifier(numTrees=50, maxDepth=10, labelCol='churn',
                            featuresCol='features', predictionCol='rf_pred',
                            probabilityCol='rf_prob', rawPredictionCol='rf_raw_pred',
                            minInstancesPerNode=100, maxBins=32,
                            featureSubsetStrategy='auto', subsamplingRate=1.0,
                            impurity='gini', seed=2023)

# Define the GBT model
gbt = GBTClassifier(featuresCol='features', labelCol='churn',
                    predictionCol='gbt_pred', maxDepth=5, maxBins=32,
                    minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256,
                    cacheNodeIds=True, checkpointInterval=10, lossType='logistic',
                    maxIter=20, stepSize=0.1, seed=961928988, subsamplingRate=1.0)

# Assemble the input column into the 'features' vector that both models consume
# (both models now read the assembler's output, so the stage is not dead code)
assembler = VectorAssembler(inputCols=['features_m'], outputCol='features')

# Create a pipeline: assemble features, then fit both models in sequence
pipeline = Pipeline(stages=[assembler, rf, gbt])

# Fit the pipeline on the training data
model = pipeline.fit(train_m)

# Make predictions; each model appends its own prediction columns
# (the two models are fit independently -- their outputs are not combined here)
train_predictions = model.transform(train_m)
test_predictions = model.transform(test_m)

# Evaluate ROC-AUC of the Random Forest outputs on train and test
# (one evaluator per metric can be reused across DataFrames)
evaluator_roc = BinaryClassificationEvaluator(labelCol='churn',
                                              rawPredictionCol='rf_raw_pred',
                                              metricName='areaUnderROC')
auc_train_roc = evaluator_roc.evaluate(train_predictions)
auc_test_roc = evaluator_roc.evaluate(test_predictions)

# Evaluate PR-AUC (area under the precision-recall curve) on train and test
evaluator_pr = BinaryClassificationEvaluator(labelCol='churn',
                                             rawPredictionCol='rf_raw_pred',
                                             metricName='areaUnderPR')
auc_train_pr = evaluator_pr.evaluate(train_predictions)
auc_test_pr = evaluator_pr.evaluate(test_predictions)

# Evaluate F1-score on train and test
evaluator_f1 = MulticlassClassificationEvaluator(labelCol='churn',
                                                 predictionCol='rf_pred',
                                                 metricName='f1')
f1_train = evaluator_f1.evaluate(train_predictions)
f1_test = evaluator_f1.evaluate(test_predictions)

# Print the evaluation results
print("Train ROC-AUC: ", auc_train_roc)
print("Test ROC-AUC: ", auc_test_roc)
print("Train PR-AUC: ", auc_train_pr)
print("Test PR-AUC: ", auc_test_pr)
print("Train F1-score: ", f1_train)
print("Test F1-score: ", f1_test)
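
# The pipeline above fits both models but never combines them, so only the
# Random Forest is actually scored. A minimal soft-voting sketch follows,
# assuming PySpark >= 3.0 (for pyspark.ml.functions.vector_to_array) and the
# column names defined above; GBT keeps its default probabilityCol
# ('probability') since none was set. The 'rf_p1', 'gbt_p1', 'ens_prob', and
# 'ens_pred' column names are illustrative, not part of the original code.
from pyspark.ml.functions import vector_to_array
from pyspark.sql import functions as F

# Average the positive-class probability from each model, then threshold at 0.5
ensembled = (
    test_predictions
    .withColumn('rf_p1', vector_to_array('rf_prob')[1])
    .withColumn('gbt_p1', vector_to_array('probability')[1])
    .withColumn('ens_prob', (F.col('rf_p1') + F.col('gbt_p1')) / 2.0)
    .withColumn('ens_pred', (F.col('ens_prob') >= 0.5).cast('double'))
)

# BinaryClassificationEvaluator also accepts a double column holding P(label=1),
# so the averaged probability can be scored directly
ens_auc = BinaryClassificationEvaluator(labelCol='churn',
                                        rawPredictionCol='ens_prob',
                                        metricName='areaUnderROC').evaluate(ensembled)
print("Ensemble Test ROC-AUC: ", ens_auc)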