Untitled
unknown
plain_text
2 years ago
1.4 kB
7
Indexable
"""Build a small employee DataFrame and report salary totals/averages by gender."""

from pyspark.sql import SparkSession
# Import the functions module under an alias instead of pulling `sum` and
# `round` into the namespace, which would shadow the Python builtins.
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Create (or reuse) the SparkSession for this script.
spark = SparkSession.builder.appName("CreateDataFrame").getOrCreate()

# Explicit schema: every column is nullable; salary is the only numeric column.
schema = StructType([
    StructField("first_name", StringType(), True),
    StructField("middle_name", StringType(), True),
    StructField("last_name", StringType(), True),
    StructField("ssn", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", IntegerType(), True),
])

# Sample rows matching the schema above.
# NOTE(review): the last row has an empty ssn and a salary of -1; this looks
# like a "missing value" placeholder. It is kept as-is and therefore takes
# part in the sum/average below -- confirm that this is intended.
data = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "", "Rose", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

df = spark.createDataFrame(data, schema=schema)

# Show the input DataFrame.
df.show()

# Group by gender and compute the salary total and the average rounded to one
# decimal place. The output columns are named with alias() directly, rather
# than renaming Spark's auto-generated names ("sum(salary)",
# "round(avg(salary), 1)") afterwards: withColumnRenamed is a silent no-op
# when the name does not match, so that approach can fail without any error.
grouped_df = df.groupBy("gender").agg(
    F.sum("salary").alias("total_salary"),
    F.round(F.avg("salary"), 1).alias("avg_salary"),
)

# Show the aggregated DataFrame (columns: gender, total_salary, avg_salary).
grouped_df.show()
Editor is loading...