from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rand, concat, lit, to_date, date_add

spark = SparkSession.builder.appName("Large Data Generation").getOrCreate()

# Number of distinct tokens
distinct_tokens = 5000000  # 5 million

# Total number of records
total_records = 100000000  # 100 million

# Generate a base dataframe with total_records rows (single column "id")
df = spark.range(total_records)

# Add columns:
#  - token_sfx: one of distinct_tokens values, e.g. "token_42"
#  - cust_id:   random customer id in [0, 100000)
#  - event_dt:  random date within one year of 2022-01-01 (passing a Column as
#               the days argument of date_add requires Spark 3.3+)
df = (
    df.withColumn("token_sfx", (col("id") % distinct_tokens).cast("string"))
      .withColumn("token_sfx", concat(lit("token_"), col("token_sfx")))
      .withColumn("cust_id", (rand() * 100000).cast("int"))
      .withColumn("event_dt", to_date(lit("2022-01-01")))
      .withColumn("event_dt", date_add("event_dt", (rand() * 365).cast("int")))
)

df.show(5)


def extract_distinct_tokens(df):
    """Return the distinct token_sfx values as a single-column DataFrame."""
    distinct_tokens_df = df.select("token_sfx").dropDuplicates()
    return distinct_tokens_df


distinct_df = extract_distinct_tokens(df)
distinct_df.show(5)
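A minimal follow-up sketch, not part of the original paste: one way to sanity-check the generated data before using it. It reuses the df and distinct_df variables from the script above; since id % distinct_tokens cycles evenly over 100 million rows, the distinct count should come out to exactly 5 million. The write call and output path are placeholders, kept commented out.

# Verify the generated volumes (these are full scans over 100M rows, so they are slow).
row_count = df.count()            # expected: total_records (100,000,000)
token_count = distinct_df.count() # expected: distinct_tokens (5,000,000)
print(f"rows={row_count}, distinct tokens={token_count}")

# Optionally persist the dataset, e.g. as Parquet partitioned by event_dt;
# the path below is only a placeholder.
# df.write.mode("overwrite").partitionBy("event_dt").parquet("/tmp/large_data")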