from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rand, concat, lit, to_date, date_add
spark = SparkSession.builder.appName("Large Data Generation").getOrCreate()
# Number of distinct tokens
distinct_tokens = 5000000 # 5 million
# Total number of records
total_records = 100000000 # 100 million
# Generate a base dataframe with total_records
df = spark.range(total_records)
# Add columns
df = (df.withColumn("token_sfx", (col("id") % distinct_tokens).cast("string"))   # 5M distinct suffixes
      .withColumn("token_sfx", concat(lit("token_"), col("token_sfx")))          # e.g. "token_42"
      .withColumn("cust_id", (rand() * 100000).cast("int"))                      # random customer id in [0, 100000)
      .withColumn("event_dt", to_date(lit("2022-01-01")))
      # passing a Column for days to date_add requires Spark 3.3+; older versions need an int or expr()
      .withColumn("event_dt", date_add("event_dt", (rand() * 365).cast("int"))))  # random date within 2022
df.show(5)
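# Optional sketch (not part of the original script): persist the generated rows
# so they can be reused without regenerating 100 million records. The output path
# "/tmp/generated_events" is a hypothetical placeholder.
df.write.mode("overwrite").parquet("/tmp/generated_events")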
def extract_distinct_tokens(df):
    """Return the distinct token_sfx values as a single-column dataframe."""
    distinct_tokens_df = df.select("token_sfx").dropDuplicates()
    return distinct_tokens_df
distinct_df = extract_distinct_tokens(df)
distinct_df.show(5)
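# Sanity check (a sketch added here, not in the original): id % distinct_tokens over
# 100 million sequential ids covers every residue 0..4,999,999, so the distinct count
# should equal distinct_tokens exactly. Note this triggers a full scan and shuffle.
print(distinct_df.count())  # expected: 5000000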