# Source: pastecode.io paste by mail@pastecode.io (posted a year ago, 2.0 kB)
# This script assumes a CSV file with the following columns: Country, Health_Group, Year, and Population. It uses PySpark to load the data and then performs some basic analysis: counting the number of distinct countries, health groups, and years, and computing the total population and the average population per year.

# Please replace 'path_to_your_dataset.csv' with the actual path to your dataset.

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# Explicit alias so aggregate calls (F.sum, F.avg) do not depend on the
# wildcard import shadowing Python builtins such as sum().
from pyspark.sql import functions as F

# Create a SparkSession
spark = SparkSession.builder.appName('health_group_analysis').getOrCreate()

# Everything that uses the session runs inside try/finally so the session is
# stopped even if loading or one of the queries raises.
try:
    # Load the dataset (first row is the header; column types are inferred)
    df = spark.read.csv('path_to_your_dataset.csv', inferSchema=True, header=True)

    # Check the schema of the dataset
    df.printSchema()

    # Count the number of distinct countries
    # (distinct().count() counts a null value as its own distinct entry)
    num_countries = df.select('Country').distinct().count()
    print("Number of distinct countries: ", num_countries)

    # Count the number of distinct health groups
    num_health_groups = df.select('Health_Group').distinct().count()
    print("Number of distinct health groups: ", num_health_groups)

    # Count the number of distinct years
    num_years = df.select('Year').distinct().count()
    print("Number of distinct years: ", num_years)

    # Compute the total population
    # F.sum is the Spark column aggregate, not Python's builtin sum();
    # the aggregate yields a single row, whose first field is the total.
    total_population = df.select(F.sum('Population')).first()[0]
    print("Total population: ", total_population)

    # Compute the average population per year, ordered by year
    avg_population_per_year = (
        df.groupBy('Year')
        .agg(F.avg('Population').alias('Average_Population'))
        .sort('Year')
    )
    # DataFrames are lazy: show() triggers the computation and prints the result
    avg_population_per_year.show()
finally:
    # Stop the SparkSession
    spark.stop()

# This is a simple analysis. Depending on the specific requirements of your task, you might need more complex queries or data preprocessing.

# Running this code requires a Spark environment. If you're running this locally, you'll need to have Apache Spark installed and configured properly. If you're using a cloud-based service like Databricks or AWS EMR, these environments are already set up to run PySpark code.