Untitled
unknown
plain_text
5 months ago
1.8 kB
2
Indexable
# Train and evaluate a gradient-boosted tree classifier on address text.
#
# Steps: read ./combined_output.csv, drop rows lacking an address or code,
# split 80/20, fit a Tokenizer -> StringIndexer -> CountVectorizer ->
# GBTClassifier pipeline on the training split, print accuracy on the test
# split, and persist the fitted pipeline under ./model.
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, CountVectorizer, StringIndexer
from pyspark.ml.classification import GBTClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Initialize Spark session
spark = (
    SparkSession.builder
    .appName("Gradient Boosting Example")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "16g")
    .getOrCreate()
)

# Everything after session creation runs under try/finally so the session is
# released even when loading, training, evaluation or saving fails.
try:
    # Load your dataset
    df = spark.read.csv('./combined_output.csv', header=True, inferSchema=True)

    # Data cleaning: only the two columns the pipeline consumes must be
    # present; nulls in unrelated columns should not discard training rows.
    df_cleaned = df.dropna(subset=['address', 'code'])

    # Split the cleaned data into training and testing sets (fixed seed for
    # a reproducible split)
    train_data, test_data = df_cleaned.randomSplit([0.8, 0.2], seed=42)

    # Tokenize the address text
    tokenizer = Tokenizer(inputCol='address', outputCol='words')

    # Index the labels; "keep" maps codes unseen during fit to an extra index
    # instead of raising at transform time
    string_indexer = StringIndexer(
        inputCol='code', outputCol='label', handleInvalid="keep"
    )

    # Vectorize the tokenized words
    count_vectorizer = CountVectorizer(inputCol='words', outputCol='features')

    # Initialize the Gradient Boosting Classifier model
    # NOTE(review): Spark's GBTClassifier supports binary labels only. If
    # 'code' has more than two distinct values, fit() will fail - confirm the
    # label cardinality, or switch to a multiclass-capable classifier.
    gbt_classifier = GBTClassifier(
        labelCol='label', featuresCol='features', maxIter=10
    )

    # Create a pipeline
    pipeline = Pipeline(
        stages=[tokenizer, string_indexer, count_vectorizer, gbt_classifier]
    )

    # Train the Gradient Boosting model
    model = pipeline.fit(train_data)

    # Make predictions
    predictions = model.transform(test_data)

    # Evaluate the model
    evaluator = MulticlassClassificationEvaluator(
        labelCol='label', predictionCol='prediction', metricName='accuracy'
    )
    accuracy = evaluator.evaluate(predictions)
    print("Gradient Boosting Accuracy:", accuracy)

    # Save the model; overwrite so a re-run does not fail after training just
    # because a previous run already wrote to the same path
    model.write().overwrite().save('model')
finally:
    # Stop the Spark session
    spark.stop()
Editor is loading...
Leave a Comment