from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, sum
# Build a small employee DataFrame and report the total and average salary
# per gender.

# Reuse the active session when one already exists (pyspark shell, notebook);
# otherwise start one so the script also runs standalone.
spark = SparkSession.builder.getOrCreate()

# Column layout for the rows below. Only "gender" and "salary" are referenced
# later in this script; the remaining field names are descriptive labels
# chosen for the sample data -- TODO confirm against the intended schema.
schema = (
    "firstname: string, middlename: string, lastname: string, "
    "id: string, gender: string, salary: int"
)

# Sample rows, one tuple per person, in the same field order as `schema`.
# NOTE(review): the -1 salary and the empty strings look like placeholders
# for missing values; they are aggregated as-is -- confirm that is intended.
data2 = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]
df = spark.createDataFrame(data2, schema)

# Group by gender and compute the sum and the average of salary.
# `sum` and `avg` here are the pyspark.sql.functions column aggregates
# imported at the top of the file, not the Python builtins.
result_df = df.groupBy("gender").agg(sum("salary"), avg("salary"))

# Print the aggregated rows to stdout.
result_df.show()