Task 2

mail@pastecode.io avatar
unknown
plain_text
a month ago
2.7 kB
4
Indexable
Never
# %load -s task_2 assignment2.py
def task_2(data_io, product_data):
    """Flatten ``categories`` and ``salesRank`` and summarise the new columns.

    Three columns are appended to ``product_data``:

    * ``category``: ``categories[0][0]``, or null when ``categories`` is
      null/empty or its first inner array is null/empty.
    * ``bestSalesCategory`` / ``bestSalesRank``: the first key and the first
      value of the ``salesRank`` map, or null when the map is null or empty.

    Args:
        data_io: Object whose ``save(res, 'task_2')`` method is called with
            the result dict.
        product_data: DataFrame providing ``categories`` (indexed here as an
            array of arrays) and ``salesRank`` (read here as a map).

    Returns:
        dict with the keys ``count_total``, ``mean_bestSalesRank``,
        ``variance_bestSalesRank``, ``numNulls_category``,
        ``countDistinct_category``, ``numNulls_bestSalesCategory`` and
        ``countDistinct_bestSalesCategory``.
    """
    # -----------------------------Column names--------------------------------
    # Inputs:
    salesRank_column = 'salesRank'
    categories_column = 'categories'
    asin_column = 'asin'  # part of the task scaffold; not needed below
    # Outputs:
    category_column = 'category'
    bestSalesCategory_column = 'bestSalesCategory'
    bestSalesRank_column = 'bestSalesRank'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------
    categories = col(categories_column)
    first_group = categories.getItem(0)

    # Each index lookup sits inside a `when` branch whose size check has
    # passed, so the result does not rely on how Spark treats out-of-range
    # indexes. A `when` without `otherwise` yields null for unmatched rows.
    category = when(
        size(categories) > 0,
        when(size(first_group) > 0, first_group.getItem(0)),
    )

    # Same guard for the map: a null or empty salesRank gives null outputs
    # instead of indexing into an empty key/value array.
    has_rank = size(col(salesRank_column)) > 0
    best_category = when(has_rank, expr(f"map_keys({salesRank_column})[0]"))
    best_rank = when(has_rank, expr(f"map_values({salesRank_column})[0]"))

    flattened = (
        product_data
        .withColumn(category_column, category)
        .withColumn(bestSalesCategory_column, best_category)
        .withColumn(bestSalesRank_column, best_rank)
    )

    def null_count(name):
        # count() skips nulls, so only rows where the column IS null are counted.
        return count(when(col(name).isNull(), True))

    stats = flattened.select(
        count('*').alias('count_total'),
        mean(bestSalesRank_column).alias('mean_bestSalesRank'),
        variance(bestSalesRank_column).alias('variance_bestSalesRank'),
        null_count(category_column).alias('numNulls_category'),
        countDistinct(category_column).alias('countDistinct_category'),
        null_count(bestSalesCategory_column).alias('numNulls_bestSalesCategory'),
        countDistinct(bestSalesCategory_column).alias('countDistinct_bestSalesCategory'),
    ).collect()[0]
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': stats['count_total'],
        'mean_bestSalesRank': stats['mean_bestSalesRank'],
        'variance_bestSalesRank': stats['variance_bestSalesRank'],
        'numNulls_category': stats['numNulls_category'],
        'countDistinct_category': stats['countDistinct_category'],
        'numNulls_bestSalesCategory': stats['numNulls_bestSalesCategory'],
        'countDistinct_bestSalesCategory': stats['countDistinct_bestSalesCategory'],
    }
    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_2')
    return res
    # -------------------------------------------------------------------------
Leave a Comment