# Paste-site page metadata (not part of the script):
# Untitled
# unknown
# plain_text
# 3 years ago
# 688 B
# 12
# Indexable
from pyspark.sql.functions import *
# Build a small sample DataFrame and report the total and average salary
# per gender.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Reuse the active session (for example the one a notebook or the pyspark
# shell provides) or start one, so the script also runs standalone.
spark = SparkSession.builder.getOrCreate()

# Sample rows: (first_name, middle_name, last_name, ssn, gender, salary).
# NOTE(review): the -1 salary looks like a placeholder for "unknown"; it is
# fed into the sum and average unchanged -- confirm that is intended.
data = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "", "Rose", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]
df = spark.createDataFrame(
    data,
    ["first_name", "middle_name", "last_name", "ssn", "gender", "salary"],
)

# Aggregate per gender. The column functions are called through the explicit
# ``F`` namespace so they do not depend on a wildcard import shadowing the
# built-in ``sum`` and ``round``.
grouped_df = df.groupBy("gender").agg(
    F.sum("salary").alias("total_salary"),
    # Average salary rounded to one decimal place.
    F.round(F.avg("salary"), 1).alias("avg_salary"),
)

# Print the aggregated rows to stdout.
grouped_df.show()
# Editor is loading...  (paste-site page residue, not part of the script)