Untitled

mail@pastecode.io avatar
unknown
python
7 months ago
1.2 kB
2
Indexable
Never
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the preprocessed data points and report the frame's dimensions.
df = pd.read_csv('../data/preprocessed_data_points.csv')
print(df.shape)
# Bare expression: only renders a preview when an interactive front end
# echoes it; it has no effect when run as a plain script.
df.head(2)

# Short aliases for the "*_100g" column names selected below.
en, su, fa = "energy-kj_100g", "sugars_100g", "saturated-fat_100g"
sa, pr, fi = "sodium_100g", "proteins_100g", "fiber_100g"
fr = "fruits-vegetables-nuts-estimate-from-ingredients_100g"

# Keep only those seven columns, in this order, for all rows.
profile_columns = [en, su, fa, sa, pr, fi, fr]
profiles = df.loc[:, profile_columns]
print(profiles.shape)
profiles.head(2)

# Quantile levels shared by the summary table and the per-column threshold
# lines. Keeping them in one list avoids hand-typed row labels such as '20%'
# that would have to mirror how describe() formats its percentile index.
quantile_levels = [0, 0.20, 0.40, 0.60, 0.80, 1]

# Summary statistics with the custom percentiles. The loop below no longer
# reads this table, but the name is kept in case later code uses it.
quantiles = profiles.describe(percentiles=quantile_levels)

# Maps each column name to its quantile values, ordered like quantile_levels.
profile_dict = {}

# Plotting distribution curves for each column
for col in profiles.columns:
    plt.figure(figsize=(8, 5))
    sns.histplot(profiles[col], kde=True, bins=30, color='green', stat='density')
    plt.title(f'Distribution of {col}')
    plt.xlabel('Values')
    plt.ylabel('Density')

    # Compute the thresholds straight from the column by numeric level, so
    # editing quantile_levels cannot desynchronise levels and lookups.
    each_criteria = profiles[col].quantile(quantile_levels).tolist()

    # Plotting quantile lines on the distribution plot
    for quantile in each_criteria:
        plt.axvline(x=quantile, color='red', linestyle='--', linewidth=1)

    profile_dict[col] = each_criteria

    plt.show()
Leave a Comment