Untitled
unknown
plain_text
2 years ago
688 B
9
Indexable
# Build a small sample DataFrame and report the total and average salary
# per gender.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Reuse the active session when one exists (pyspark shell, notebook);
# otherwise start one so the script also runs standalone.
spark = SparkSession.builder.getOrCreate()

# Sample rows: (first_name, middle_name, last_name, ssn, gender, salary).
# NOTE(review): the -1 salary on the last row looks like a placeholder; it is
# included in the sum and average below as-is -- confirm that is intended.
data = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "", "Rose", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]
df = spark.createDataFrame(
    data,
    ["first_name", "middle_name", "last_name", "ssn", "gender", "salary"],
)

# Group by gender and compute the salary sum and the average rounded to one
# decimal place. The Spark column functions are reached through the F
# namespace so the Python builtins sum() and round() are not shadowed.
grouped_df = df.groupBy("gender").agg(
    F.sum("salary").alias("total_salary"),
    F.round(F.avg("salary"), 1).alias("avg_salary"),
)

# Print the aggregated rows to stdout.
grouped_df.show()
Editor is loading...