Task 2
unknown
plain_text
a year ago
2.7 kB
8
Indexable
# %load -s task_2 assignment2.py
def task_2(data_io, product_data):
    """Derive category and best-sales columns, then summarise them.

    Three columns are added to ``product_data``:

    * ``category`` -- ``categories[0][0]`` when both the outer and the first
      inner array are non-empty, otherwise null.
    * ``bestSalesCategory`` / ``bestSalesRank`` -- the first key / first value
      of the ``salesRank`` map, or null when the map is null or empty.

    Args:
        data_io: Object whose ``save(res, 'task_2')`` method is called with
            the result dict before returning.
        product_data: DataFrame exposing ``withColumn`` / ``select``
            (presumably a Spark DataFrame with an array-of-arrays
            ``categories`` column and a map ``salesRank`` column -- confirm
            against the loader).

    Returns:
        dict: total row count, mean and variance of ``bestSalesRank``, and
        the null count and distinct count of ``category`` and
        ``bestSalesCategory``.
    """
    # -----------------------------Column names--------------------------------
    # Inputs:
    salesRank_column = 'salesRank'
    categories_column = 'categories'
    asin_column = 'asin'
    # Outputs:
    category_column = 'category'
    bestSalesCategory_column = 'bestSalesCategory'
    bestSalesRank_column = 'bestSalesRank'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------
    categories = col(categories_column)
    first_path = categories.getItem(0)

    # A `when` without `otherwise` yields null for unmatched rows, so null or
    # empty arrays at either nesting level fall through to a null category.
    # The outer guard keeps `categories[0]` from being evaluated on an empty
    # array.
    # NOTE(review): an empty-string first category is kept as-is; confirm
    # whether the task spec wants it mapped to null.
    with_category = product_data.withColumn(
        category_column,
        when(
            categories.isNotNull() & (size(categories) > 0),
            when(size(first_path) > 0, first_path.getItem(0)),
        ),
    )

    sales_rank = col(salesRank_column)
    # Require a non-empty map: indexing [0] into the empty key/value array
    # raises under ANSI SQL mode instead of returning null.
    has_rank = sales_rank.isNotNull() & (size(sales_rank) > 0)
    # NOTE(review): takes the first map entry; presumably each salesRank map
    # holds a single entry -- verify against the data.
    enriched = with_category.withColumn(
        bestSalesCategory_column,
        when(has_rank, expr("map_keys({})[0]".format(salesRank_column))),
    ).withColumn(
        bestSalesRank_column,
        when(has_rank, expr("map_values({})[0]".format(salesRank_column))),
    )

    # All statistics are computed in a single aggregation pass.
    stats = enriched.select(
        count('*').alias('count_total'),
        mean(bestSalesRank_column).alias('mean_bestSalesRank'),
        variance(bestSalesRank_column).alias('variance_bestSalesRank'),
        # count() skips nulls, so only rows where the flag is set are counted.
        count(when(col(category_column).isNull(), True))
        .alias('numNulls_category'),
        countDistinct(category_column).alias('countDistinct_category'),
        count(when(col(bestSalesCategory_column).isNull(), True))
        .alias('numNulls_bestSalesCategory'),
        countDistinct(bestSalesCategory_column)
        .alias('countDistinct_bestSalesCategory'),
    ).collect()[0]
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    result_keys = (
        'count_total',
        'mean_bestSalesRank',
        'variance_bestSalesRank',
        'numNulls_category',
        'countDistinct_category',
        'numNulls_bestSalesCategory',
        'countDistinct_bestSalesCategory',
    )
    # Modify res:
    res = {key: stats[key] for key in result_keys}
    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_2')
    return res
    # -------------------------------------------------------------------------
Editor is loading...
Leave a Comment