# Untitled

from pyspark.sql.functions import *

# Explicit imports come after the wildcard import so these names always win.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Reuse the active SparkSession (notebook / pyspark shell) or start a new one,
# so the script also runs standalone instead of failing with a NameError on
# `spark`.
spark = SparkSession.builder.getOrCreate()

# Sample rows: (first_name, middle_name, last_name, ssn, gender, salary).
# NOTE(review): the last row has an empty ssn and a salary of -1 — presumably
# placeholders for missing values; confirm before relying on the aggregates.
data = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "", "Rose", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

# Column names are matched to the tuple fields by position; no explicit types
# are given, so Spark infers them from the data.
columns = ["first_name", "middle_name", "last_name", "ssn", "gender", "salary"]
df = spark.createDataFrame(data, columns)

# Group the rows by gender and compute, per group, the salary total and the
# salary mean rounded to one decimal place. Every row is aggregated as-is;
# nothing is filtered out beforehand.
# The functions are referenced through the `F` namespace so the aggregation
# does not depend on the wildcard import shadowing Python's built-in `sum`
# and `round`.
grouped_df = df.groupBy("gender").agg(
    F.sum("salary").alias("total_salary"),
    F.round(F.avg("salary"), 1).alias("avg_salary"),
)

# Print the aggregated result to stdout.
grouped_df.show()
# Editor is loading...