Untitled
user_3592770
plain_text
2 years ago
1.4 kB
9
Indexable
# Build a small employee DataFrame and report salary sum/average per gender.
#
# Prerequisite -- run in a shell, NOT as Python code:
#     pip install "pyspark[sql]"
from pyspark.sql import SparkSession
from pyspark.sql import functions as F  # namespaced so builtin sum/round are not shadowed
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# getOrCreate() hands back the already-active session when one exists (pyspark
# shell, notebooks) and otherwise starts a new one, so the script also runs
# stand-alone instead of failing with NameError on `spark`.
spark = SparkSession.builder.getOrCreate()

# Rows are (firstname, middlename, lastname, id, gender, salary).
# NOTE(review): the -1 salary on the last row looks like a placeholder for a
# missing value; it is included in the sum/average below as-is -- confirm
# that this is intended.
data2 = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

# Every column is declared nullable; `id` is kept as a string, matching the data.
schema = StructType([
    StructField("firstname", StringType(), True),
    StructField("middlename", StringType(), True),
    StructField("lastname", StringType(), True),
    StructField("id", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", IntegerType(), True),
])

# Create the DataFrame once and reuse it for both the preview and the
# aggregation (the original rebuilt identical data and DataFrame twice).
df = spark.createDataFrame(data=data2, schema=schema)
df.printSchema()
df.show(truncate=False)

# Group by gender and compute the sum and average of salary, each rounded
# to one decimal place.
result_df = df.groupBy("gender").agg(
    F.round(F.sum("salary"), 1),
    F.round(F.avg("salary"), 1),
)

# Show the result.
result_df.show()
Editor is loading...