from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, CountVectorizer, StringIndexer
from pyspark.ml.classification import GBTClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Initialize Spark session
spark = SparkSession.builder \
    .appName("Gradient Boosting Example") \
    .config("spark.driver.memory", "16g") \
    .config("spark.executor.memory", "16g") \
    .getOrCreate()
# Load your dataset
df = spark.read.csv('./combined_output.csv', header=True, inferSchema=True)
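# Optional sanity check: inspect the inferred schema and a few rows
# (the 'address' and 'code' columns used below are assumed to exist in the CSV)
df.printSchema()
df.show(5, truncate=False)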
# Data cleaning: drop rows with missing values in any column (e.g. empty address or code)
df_cleaned = df.dropna()
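# Optional: inspect the label distribution on the cleaned data; the classifier used
# below expects a binary label, so 'code' should have exactly two distinct values
df_cleaned.groupBy('code').count().show()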
# Split the cleaned data into training and testing sets
train_data, test_data = df_cleaned.randomSplit([0.8, 0.2], seed=42)
# Tokenize the address text
tokenizer = Tokenizer(inputCol='address', outputCol='words')
# Index the string labels in 'code'; handleInvalid="keep" maps labels unseen at fit time to an extra index
string_indexer = StringIndexer(inputCol='code', outputCol='label', handleInvalid="keep")
# Vectorize the tokenized words
count_vectorizer = CountVectorizer(inputCol='words', outputCol='features')
# Initialize the gradient-boosted trees classifier (note: Spark's GBTClassifier supports binary labels only)
gbt_classifier = GBTClassifier(labelCol='label', featuresCol='features', maxIter=10)
# Create a pipeline
pipeline = Pipeline(stages=[tokenizer, string_indexer, count_vectorizer, gbt_classifier])
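# Optional: maxIter and maxDepth can be tuned with cross-validation before the final fit
# (an illustrative sketch only; the grid values below are assumptions, not tuned settings)
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
param_grid = (ParamGridBuilder()
              .addGrid(gbt_classifier.maxDepth, [3, 5])
              .addGrid(gbt_classifier.maxIter, [10, 20])
              .build())
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=param_grid,
                    evaluator=MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy'),
                    numFolds=3)
# cv_model = cv.fit(train_data)  # uncomment to run the (slower) tuned fit instead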
# Train the Gradient Boosting model
model = pipeline.fit(train_data)
# Make predictions
predictions = model.transform(test_data)
# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
print("Gradient Boosting Accuracy:", accuracy)
# Save the model
model.save('model')
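# The saved PipelineModel can be reloaded later for scoring; a minimal sketch,
# reusing the 'model' path and the held-out test data from above
from pyspark.ml import PipelineModel
loaded_model = PipelineModel.load('model')
loaded_model.transform(test_data).select('address', 'prediction').show(5)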
# Stop the Spark session
spark.stop()