# Untitled

# Prerequisites (run these in a shell, not in the Python interpreter):
#   pip install pyspark
#   pip install "pyspark[sql]"

from pyspark.sql.types import StructType, StructField , StringType, IntegerType
# Sample rows in schema order: firstname, middlename, lastname, id, gender,
# salary. Some rows carry "" for a name part or id, and one salary is -1
# (presumably placeholders for missing values -- TODO confirm).
data2 = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

# Column layout for the sample rows: five nullable string columns followed by
# one nullable integer column for the salary.
schema = StructType(
    [
        StructField(column, StringType(), True)
        for column in ("firstname", "middlename", "lastname", "id", "gender")
    ]
    + [StructField("salary", IntegerType(), True)]
)

# `spark` is only predefined in environments that inject it (e.g. the pyspark
# shell or managed notebooks); obtain the session explicitly so the script
# also runs standalone. getOrCreate() returns the existing session when one
# is already active, so this stays safe in those environments too.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Build the DataFrame from the sample rows, then print its schema and rows.
df = spark.createDataFrame(data=data2, schema=schema)
df.printSchema()
df.show(truncate=False)  # truncate=False prints full cell values


from pyspark.sql.functions import avg, sum, round



# Sample rows for the DataFrame created below, in schema order: firstname,
# middlename, lastname, id, gender, salary.
data2 = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]
# Namespaced import so the aggregation does not depend on bare `sum`/`round`
# names that shadow the Python builtins of the same name.
from pyspark.sql import functions as F

# Create the DataFrame from the sample rows and the schema.
df = spark.createDataFrame(data2, schema)

# Group by gender and compute the total and mean salary, each rounded to one
# decimal place. Explicit aliases give the result readable column names
# instead of auto-generated headers such as "round(sum(salary), 1)".
# NOTE(review): the -1 salary in the sample data is included in both
# aggregates; filter it out first if it is a missing-value placeholder.
result_df = df.groupBy("gender").agg(
    F.round(F.sum("salary"), 1).alias("sum_salary"),
    F.round(F.avg("salary"), 1).alias("avg_salary"),
)

# Show the result.
result_df.show()
# (web-editor residue from the paste: "Editor is loading...")