from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, CountVectorizer, StringIndexer
from pyspark.ml.classification import GBTClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Initialize Spark session
spark = SparkSession.builder \
    .appName("Gradient Boosting Example") \
    .config("spark.driver.memory", "16g") \
    .config("spark.executor.memory", "16g") \
    .getOrCreate()
# Load your dataset
df = spark.read.csv('./combined_output.csv', header=True, inferSchema=True)
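# Optional sanity check: inspect the inferred schema and a few rows
# (the 'address' and 'code' columns used below are assumed to exist in the CSV)
df.printSchema()
df.show(5, truncate=False)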
# Data cleaning: drop rows with missing values in any column (e.g. empty address or code)
df_cleaned = df.dropna()
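# Optional: inspect the label distribution on the cleaned data; the classifier used
# below expects a binary label, so 'code' should have exactly two distinct values
df_cleaned.groupBy('code').count().show()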
# Split the cleaned data into training and testing sets
train_data, test_data = df_cleaned.randomSplit([0.8, 0.2], seed=42)
# Tokenize the address text
tokenizer = Tokenizer(inputCol='address', outputCol='words')
# Index the string labels in 'code'; handleInvalid="keep" maps labels unseen at fit time to an extra index
string_indexer = StringIndexer(inputCol='code', outputCol='label', handleInvalid="keep")
# Vectorize the tokenized words
count_vectorizer = CountVectorizer(inputCol='words', outputCol='features')
# Initialize the gradient-boosted trees classifier (note: Spark's GBTClassifier supports binary labels only)
gbt_classifier = GBTClassifier(labelCol='label', featuresCol='features', maxIter=10)
# Create a pipeline
pipeline = Pipeline(stages=[tokenizer, string_indexer, count_vectorizer, gbt_classifier])
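# Optional: maxIter and maxDepth can be tuned with cross-validation before the final fit
# (an illustrative sketch only; the grid values below are assumptions, not tuned settings)
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
param_grid = (ParamGridBuilder()
              .addGrid(gbt_classifier.maxDepth, [3, 5])
              .addGrid(gbt_classifier.maxIter, [10, 20])
              .build())
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=param_grid,
                    evaluator=MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy'),
                    numFolds=3)
# cv_model = cv.fit(train_data)  # uncomment to run the (slower) tuned fit instead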
# Train the Gradient Boosting model
model = pipeline.fit(train_data)
# Make predictions
predictions = model.transform(test_data)
# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
print("Gradient Boosting Accuracy:", accuracy)
# Save the model
model.save('model')
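# The saved PipelineModel can be reloaded later for scoring; a minimal sketch,
# reusing the 'model' path and the held-out test data from above
from pyspark.ml import PipelineModel
loaded_model = PipelineModel.load('model')
loaded_model.transform(test_data).select('address', 'prediction').show(5)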
# Stop the Spark session
spark.stop()