Task 2
unknown
plain_text
2 years ago
2.7 kB
9
Indexable
# %load -s task_2 assignment2.py
def task_2(data_io, product_data):
    """Derive per-product category / best-sales-rank columns and summarise them.

    Three columns are added to ``product_data``:

    * ``category`` -- ``categories[0][0]``, or null when ``categories`` is
      null, empty, or its first inner array is empty.
    * ``bestSalesCategory`` -- the first key of the ``salesRank`` map, or null
      when the map is null or empty.
    * ``bestSalesRank`` -- the first value of the ``salesRank`` map, or null
      when the map is null or empty.

    Aggregate statistics over those columns are then collected to the driver,
    stored via ``data_io.save`` under the name ``'task_2'`` and returned.

    Args:
        data_io: Object exposing ``save(res, name)``; used only to persist
            the result dict.
        product_data: Spark DataFrame. ``categories`` is accessed as an array
            of arrays and ``salesRank`` as a map.

    Returns:
        dict with the keys ``count_total``, ``mean_bestSalesRank``,
        ``variance_bestSalesRank``, ``numNulls_category``,
        ``countDistinct_category``, ``numNulls_bestSalesCategory`` and
        ``countDistinct_bestSalesCategory``.
    """
    # -----------------------------Column names--------------------------------
    # Inputs:
    salesRank_column = 'salesRank'
    categories_column = 'categories'
    asin_column = 'asin'  # declared by the task template; not read below
    # Outputs:
    category_column = 'category'
    bestSalesCategory_column = 'bestSalesCategory'
    bestSalesRank_column = 'bestSalesRank'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------
    categories = col(categories_column)
    first_category_list = categories.getItem(0)

    # Nested `when` keeps each index access behind its own emptiness check, so
    # `[0]` is only evaluated on arrays known to be non-empty. `when` without
    # `otherwise` already yields null for unmatched rows.
    tf_df = product_data.withColumn(
        category_column,
        when(
            categories.isNotNull() & (size(categories) > 0),
            when(size(first_category_list) > 0, first_category_list.getItem(0)),
        ),
    )

    sales_rank = col(salesRank_column)
    # Guard against empty maps as well as nulls: indexing `[0]` into the empty
    # array produced by map_keys/map_values raises under Spark's ANSI mode.
    has_sales_rank = sales_rank.isNotNull() & (size(sales_rank) > 0)
    first_rank_key = expr("map_keys({})[0]".format(salesRank_column))
    first_rank_value = expr("map_values({})[0]".format(salesRank_column))

    tf_df = tf_df.withColumn(
        bestSalesCategory_column,
        when(has_sales_rank, first_rank_key),
    ).withColumn(
        bestSalesRank_column,
        when(has_sales_rank, first_rank_value),
    )

    # `count` skips nulls, so count(when(<is null>, True)) counts the null rows.
    stats = tf_df.select(
        count('*').alias('count_total'),
        mean(bestSalesRank_column).alias('mean_bestSalesRank'),
        variance(bestSalesRank_column).alias('variance_bestSalesRank'),
        count(when(col(category_column).isNull(), True))
        .alias('numNulls_category'),
        countDistinct(category_column).alias('countDistinct_category'),
        count(when(col(bestSalesCategory_column).isNull(), True))
        .alias('numNulls_bestSalesCategory'),
        countDistinct(bestSalesCategory_column)
        .alias('countDistinct_bestSalesCategory'),
    ).collect()[0]
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': stats['count_total'],
        'mean_bestSalesRank': stats['mean_bestSalesRank'],
        'variance_bestSalesRank': stats['variance_bestSalesRank'],
        'numNulls_category': stats['numNulls_category'],
        'countDistinct_category': stats['countDistinct_category'],
        'numNulls_bestSalesCategory': stats['numNulls_bestSalesCategory'],
        'countDistinct_bestSalesCategory': stats['countDistinct_bestSalesCategory'],
    }
    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_2')
    return res
    # -------------------------------------------------------------------------
Editor is loading...
Leave a Comment