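# Untitled: PySpark script that generates a large synthetic dataset
# and extracts the distinct token values from it.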

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rand, concat, lit, to_date, date_add
import random

# Single Spark session for the whole script.
spark = SparkSession.builder.appName("Large Data Generation").getOrCreate()

# Dataset sizing.
distinct_tokens = 5 * 10**6  # 5 million distinct token values
total_records = 10**8  # 100 million rows in total

# Column expressions, named up front so the DataFrame assembly below reads top-down.
# token_sfx: "token_<id mod distinct_tokens>", so the tokens repeat cyclically over the ids.
token_sfx_expr = concat(lit("token_"), (col("id") % distinct_tokens).cast("string"))
# cust_id: rand() scaled by 100000 and cast to int.
cust_id_expr = (rand() * 100000).cast("int")
# event_dt: 2022-01-01 shifted forward by rand() * 365 (cast to int) days.
event_dt_expr = date_add(to_date(lit("2022-01-01")), (rand() * 365).cast("int"))

# spark.range supplies the "id" column (one row per record); the rest are derived from it.
df = (
    spark.range(total_records)
    .withColumn("token_sfx", token_sfx_expr)
    .withColumn("cust_id", cust_id_expr)
    .withColumn("event_dt", event_dt_expr)
)

# Preview the first few generated rows.
df.show(5)

def extract_distinct_tokens(df):
    """Return a single-column DataFrame of the unique ``token_sfx`` values.

    Args:
        df: A DataFrame that has a ``token_sfx`` column.

    Returns:
        The result of selecting ``token_sfx`` from ``df`` and dropping
        duplicate rows.
    """
    # Project first so de-duplication only considers the token column.
    token_column = df.select("token_sfx")
    return token_column.dropDuplicates()

# Reduce the generated dataset to its unique token values and preview a few of them.
distinct_df = extract_distinct_tokens(df)
distinct_df.show(n=5)
# (Paste-site residue, not part of the script: "Editor is loading...")