# Untitled

#Importing libraries
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Obtain the SparkSession for the "CreateDataFrame" application
# (getOrCreate returns an existing session if one is already active).
spark = (
    SparkSession.builder
    .appName("CreateDataFrame")
    .getOrCreate()
)

# Schema of the DataFrame: each (name, type) pair below becomes one nullable
# column, in this order. All columns are strings except the integer salary.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("first_name", StringType()),
        ("middle_name", StringType()),
        ("last_name", StringType()),
        ("ssn", StringType()),
        ("gender", StringType()),
        ("salary", IntegerType()),
    )
])

# Input rows, one tuple per person, in schema column order:
# (first_name, middle_name, last_name, ssn, gender, salary).
data = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "", "Rose", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

# Build the DataFrame from the rows using the explicit schema.
df = spark.createDataFrame(data=data, schema=schema)

# Print the DataFrame contents.
df.show()
# Import the functions module under an alias so that Spark's sum/round do not
# shadow Python's built-in sum() and round() for the rest of the module.
from pyspark.sql import functions as F

# Group by gender and compute the total salary and the average salary
# (rounded to one decimal place).
# The output columns are named with alias() inside agg(): renaming Spark's
# auto-generated names (e.g. "sum(salary)") with withColumnRenamed() is
# fragile, because withColumnRenamed() silently does nothing when the given
# source column name is not present.
# NOTE(review): the salary value -1 in the input rows looks like a placeholder
# for a missing salary; it is included in both aggregates as-is — confirm
# that this is intended.
grouped_df = df.groupBy("gender").agg(
    F.sum("salary").alias("total_salary"),
    F.round(F.avg("salary"), 1).alias("avg_salary"),
)

# show the grouped DataFrame
grouped_df.show()
# Editor is loading...