Untitled
unknown
plain_text
3 years ago
2.9 kB
4
Indexable
# Letter-recognition dataset (UCI): load with an explicit schema, inspect the
# class distribution, split train/eval, and build the feature pipeline
# (label indexing -> vector assembly -> standardisation).
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit schema: first column is the target letter, the remaining 16 are
# integer image-statistics attributes (UCI letter-recognition names).
schema = StructType(
    [StructField("lettr", StringType(), True)]
    + [
        StructField(name, IntegerType(), True)
        for name in [
            "x_box", "y_box", "width", "high", "onpix", "x_bar", "y_bar",
            "x2bar", "y2bar", "xybar", "x2ybr", "xy2br", "x_ege", "xegvy",
            "y_ege", "yegvx",
        ]
    ]
)

spark = SparkSession.builder.appName("ml").getOrCreate()

# header=False: the raw .data file has no header row; the explicit schema
# supplies both names and types (avoids a second pass for inferSchema).
df = spark.read.csv("./letter-recognition.data", header=False, schema=schema)
df.show(10)
df.schema

total_rows = df.count()
print(f"Łączna liczba wystąpień: {total_rows}")

# Per-letter class distribution via Spark SQL over a temp view.
df.createOrReplaceTempView("df")
spark.sql("SELECT lettr, count(*) count from df Group BY lettr ORDER BY lettr").show(10)

# Deterministic 70/30 train/eval split (seed=42).
df_train, df_eval = df.randomSplit([0.7, 0.3], 42)

from pyspark.ml import feature

# Encode the target letter as a numeric "label"; fit on the training split
# only so the eval split cannot influence the index mapping.
idx = feature.StringIndexer(inputCol="lettr", outputCol="label")
idx_t = idx.fit(df_train)
df_train_ = idx_t.transform(df_train)
df_train_.show(10)

# Assemble the 16 numeric attributes (every column except "lettr") into a
# single feature vector.
vect = feature.VectorAssembler(inputCols=df.columns[1:], outputCol="feat")
df_train_ = vect.transform(df_train_)
df_train_ = df_train_.select("label", "feat")
df_train_.show(10)

# Standardise the feature vector; the fitted scaler (scaler_t) is reused on
# the eval split later so both splits share the same scaling statistics.
scaler = feature.StandardScaler(inputCol="feat", outputCol="features")
scaler_t = scaler.fit(df_train_)
df_train_ = scaler_t.transform(df_train_)
df_train_.show(10, truncate=False)
# Train a random forest on the standardised training features and report
# multiclass accuracy on both the training and the held-out eval split.
from pyspark.ml import classification

# Reads the default "features"/"label" columns produced by the pipeline above.
forest = classification.RandomForestClassifier(
    maxDepth=8, minInstancesPerNode=5, seed=42
)
forest_t = forest.fit(df_train_)

pred_train = forest_t.transform(df_train_)
pred_train.show(10)

from pyspark.ml import evaluation

# Accuracy over the default "label"/"prediction" columns.
evaluator = evaluation.MulticlassClassificationEvaluator(metricName="accuracy")
train_accuracy = evaluator.evaluate(pred_train)
print(f"Prawdopodobieństwo prawidłowego dopasowania litery: {train_accuracy}")

# Apply the *fitted* training-stage transformers (indexer, assembler, scaler)
# to the eval split so evaluation uses exactly the same label mapping and
# scaling statistics as training — never refit on eval data.
df_eval_ = idx_t.transform(df_eval)
df_eval_ = vect.transform(df_eval_)
df_eval_ = df_eval_.select("label", "feat")
df_eval_ = scaler_t.transform(df_eval_)

pred_eval = forest_t.transform(df_eval_)
pred_eval.show(10)
eval_accuracy = evaluator.evaluate(pred_eval)
print(f"Prawdopodobieństwo prawidłowego dopasowania litery: {eval_accuracy}")
Editor is loading...