# Untitled
# user_3592770
# plain_text
# 3 years ago
# 1.4 kB
# 12
# Indexable
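# Setup: run these in a shell (not in the Python interpreter) before executing
# the script. The extras spec is quoted so shells such as zsh do not glob it.
#   pip install pyspark
#   pip install "pyspark[sql]"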
from pyspark.sql import SparkSession
# NOTE: `sum` and `round` imported here shadow the Python builtins of the same
# name for the rest of this module.
from pyspark.sql.functions import avg, sum, round
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
# Build a small employee DataFrame with an explicit schema, print it, then
# aggregate salary (sum and average) per gender.

# Reuse the active SparkSession when one already exists (pyspark shell,
# managed notebooks); otherwise start one so the script also runs standalone.
spark = SparkSession.builder.getOrCreate()

# Sample rows in schema order:
# (firstname, middlename, lastname, id, gender, salary).
data2 = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    # NOTE(review): salary -1 looks like a "missing value" sentinel; it is fed
    # into the sum/average below unchanged - confirm that this is intended.
    ("Jen", "Mary", "Brown", "", "F", -1),
]

# Every column is nullable; only salary is numeric.
schema = StructType([
    StructField("firstname", StringType(), True),
    StructField("middlename", StringType(), True),
    StructField("lastname", StringType(), True),
    StructField("id", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", IntegerType(), True),
])

df = spark.createDataFrame(data=data2, schema=schema)
df.printSchema()
df.show(truncate=False)

# Group by gender and compute the sum and the average of salary, each rounded
# to one decimal place. `sum` and `round` are the pyspark.sql.functions
# versions here, not the Python builtins.
result_df = df.groupBy("gender").agg(
    round(sum("salary"), 1),
    round(avg("salary"), 1),
)

# Show the result.
result_df.show()