# Untitled PySpark snippet: sum and average of salary per gender.

from pyspark.sql.functions import avg, sum



# Sample-data demo: build a small employee DataFrame, then report the total
# and the mean salary for each gender.
from pyspark.sql import SparkSession

# Reuse the active session when one exists (pyspark shell, notebooks);
# otherwise start one so the script also runs standalone.
spark = SparkSession.builder.getOrCreate()

# Column names for the rows below; Spark infers each column's type from the
# data. "gender" and "salary" are the names the aggregation refers to.
# NOTE(review): the other four names are inferred from the sample values --
# confirm against the intended schema.
schema = ["firstname", "middlename", "lastname", "id", "gender", "salary"]

# NOTE(review): the empty strings and the -1 salary look like placeholders
# for missing values; they are aggregated as-is, so the -1 lowers both the
# sum and the average reported for "F".
data2 = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]
df = spark.createDataFrame(data2, schema)

# group by gender and calculate sum and average of salary.
# `sum` here is pyspark.sql.functions.sum (imported at the top of the file,
# shadowing the builtin), so it builds a column expression rather than
# adding Python values.
result_df = df.groupBy("gender").agg(sum("salary"), avg("salary"))

# show the result
result_df.show()
# (web-editor residue from the original paste: "Editor is loading...")