# Untitled

# mail@pastecode.io avatar
# unknown
# plain_text
# a year ago
# 11 kB
# 6
# Indexable
# Never
from pandas import read_csv
from pandas.plotting import register_matplotlib_converters

# Register pandas' date/time converters with matplotlib before any plotting.
register_matplotlib_converters()

# Load the dataset ('na' marks a missing value), discard columns that are
# entirely empty, then remove the 'station' column.
filename = 'small_ca_monthly_data.csv'
data = (
    read_csv(filename, na_values='na')
    .dropna(axis=1, how='all')
    .drop(columns='station')
)
print(data.shape)


from matplotlib.pyplot import figure, savefig, close
from ds_charts import bar_chart

# Chart the number of rows against the number of columns of the dataset.
nr_records, nr_variables = data.shape
values = {'nr records': nr_records, 'nr variables': nr_variables}
figure(figsize=(4, 2))
bar_chart(list(values), list(values.values()), title='Nr of records vs nr variables')
savefig('imagens/record_variables.png')
close()

# Bare expression: only shows something in an interactive session/notebook.
data.dtypes

# Re-encode every object-typed column as a pandas categorical, column by column.
cat_vars = data.select_dtypes(include='object')
data[cat_vars.columns] = cat_vars.apply(lambda column: column.astype('category'))

# Bare expression again, to inspect the dtypes after the conversion.
data.dtypes


from pandas import DataFrame
def get_variable_types(df: DataFrame) -> dict:
    """Group the column names of *df* by kind of variable.

    A column is classified by the first rule that matches:

    1. exactly two distinct non-missing values -> ``'Binary'``
    2. any datetime64 dtype (any unit, with or without timezone) -> ``'Date'``
    3. any integer or floating-point dtype (any width) -> ``'Numeric'``
    4. everything else -> ``'Symbolic'``

    Args:
        df: The data frame to inspect. It is not modified.

    Returns:
        A dict with the keys ``'Numeric'``, ``'Binary'``, ``'Date'`` and
        ``'Symbolic'``, each mapping to the list of matching column names in
        the order they appear in *df*.
    """
    # Local import keeps this block self-contained; pandas is already a
    # dependency of the file.
    from pandas.api.types import (
        is_datetime64_any_dtype,
        is_float_dtype,
        is_integer_dtype,
    )

    variable_types: dict = {
        'Numeric': [],
        'Binary': [],
        'Date': [],
        'Symbolic': []
    }
    for c in df.columns:
        column = df[c]
        uniques = column.dropna().unique()
        if len(uniques) == 2:
            variable_types['Binary'].append(c)
        # Comparing the dtype with the strings 'datetime64', 'int' or 'float'
        # only matches one exact dtype (and never pandas' unit-qualified
        # datetimes), so use the dtype-family predicates instead.
        elif is_datetime64_any_dtype(column.dtype):
            variable_types['Date'].append(c)
        elif is_integer_dtype(column.dtype) or is_float_dtype(column.dtype):
            variable_types['Numeric'].append(c)
        else:
            variable_types['Symbolic'].append(c)

    return variable_types


from matplotlib.pyplot import figure, savefig, close
from ds_charts import bar_chart, get_variable_types

# Count how many variables fall in each type bucket and chart the totals.
variable_types = get_variable_types(data)
print(variable_types)
counts = {kind: len(names) for kind, names in variable_types.items()}
figure(figsize=(4, 2))
bar_chart(list(counts), list(counts.values()), title='Nr of variables per type')
savefig('imagens/variable_types.png')
close()

from matplotlib.pyplot import figure, savefig, close
from ds_charts import bar_chart0
# Missing-value count per column, keeping only columns that have at least one.
missing_per_column = data.isna().sum()
mv = {name: total for name, total in missing_per_column.items() if total > 0}

figure(figsize=(12, 4))
bar_chart0(
    list(mv),
    list(mv.values()),
    title='Nr of missing values per variable',
    xlabel='variables',
    ylabel='nr missing values',
    rotation='vertical',
)
savefig('imagens/mv.png')
close()

# Distribution analysis: summary statistics, boxplots, outliers and histograms

# Print the describe() statistics of PRCP, TMIN and TMAX, one statistic per line.
summary5 = data.describe()
summary_columns = ('PRCP', 'TMIN', 'TMAX')
# (printed label, row label in describe()). Driving every row from this table
# keeps label and statistic in sync: the hand-written version printed TMAX's
# max on the "Min" row and the 75% quantiles on the "Max" row.
summary_rows = (
    ('Count', 'count'),
    ('Mean', 'mean'),
    ('StDev', 'std'),
    ('Min', 'min'),
    ('Q1', '25%'),
    ('Median', '50%'),
    ('Q3', '75%'),
    ('Max', 'max'),
)
print('---- | PRCP | TMIN | TMAX')
for label, statistic in summary_rows:
    print(label + ': ', *(summary5[column][statistic] for column in summary_columns))

from matplotlib.pyplot import savefig, close

# Single figure with the boxplots drawn by DataFrame.boxplot; tick labels are
# rotated 45 degrees.
boxplot_options = {'rot': 45, 'figsize': (20, 20)}
data.boxplot(**boxplot_options)
savefig('imagens/global_boxplot.png')
close()

from matplotlib.pyplot import savefig, close, subplots
from ds_charts import get_variable_types, choose_grid, HEIGHT

# One boxplot per numeric variable, laid out row by row on a grid.
numeric_vars = get_variable_types(data)['Numeric']
if not numeric_vars:
    raise ValueError('There are no numeric variables.')
rows, cols = choose_grid(len(numeric_vars))
fig, axs = subplots(rows, cols, figsize=(cols * HEIGHT, rows * HEIGHT), squeeze=False)
for position, name in enumerate(numeric_vars):
    # divmod maps the running position to its (row, column) cell.
    ax = axs[divmod(position, cols)]
    ax.set_title('Boxplot for %s' % name)
    ax.boxplot(data[name].dropna().values)
savefig('imagens/single_boxplots.png')
close()

from matplotlib.pyplot import figure, savefig, close
from ds_charts import get_variable_types, multiple_bar_chart, HEIGHT

# Number of standard deviations from the mean beyond which a value is counted
# as an outlier by the stdev criterion.
NR_STDEV: int = 2

numeric_vars = get_variable_types(data)['Numeric']
if not numeric_vars:
    raise ValueError('There are no numeric variables.')

summary5 = data.describe(include='number')
outliers_iqr = []
outliers_stdev = []

for var in numeric_vars:
    column = data[var]
    stats = summary5[var]

    # IQR criterion: values more than 1.5 * IQR above Q3 or below Q1.
    iqr_margin = 1.5 * (stats['75%'] - stats['25%'])
    above_q3 = (column > stats['75%'] + iqr_margin).sum()
    below_q1 = (column < stats['25%'] - iqr_margin).sum()
    outliers_iqr.append(above_q3 + below_q1)

    # Stdev criterion: values more than NR_STDEV deviations from the mean.
    std_margin = NR_STDEV * stats['std']
    above_mean = (column > stats['mean'] + std_margin).sum()
    below_mean = (column < stats['mean'] - std_margin).sum()
    outliers_stdev.append(above_mean + below_mean)

outliers = {'iqr': outliers_iqr, 'stdev': outliers_stdev}
figure(figsize=(12, HEIGHT))
multiple_bar_chart(
    numeric_vars,
    outliers,
    title='Nr of outliers per variable',
    xlabel='variables',
    ylabel='nr outliers',
    percentage=False,
)
savefig('imagens/outliers.png')
close()

from matplotlib.pyplot import savefig, close, subplots
from ds_charts import get_variable_types, choose_grid, HEIGHT

# One histogram per numeric variable, laid out row by row on a grid.
numeric_vars = get_variable_types(data)['Numeric']
if not numeric_vars:
    raise ValueError('There are no numeric variables.')

# Size the grid here instead of relying on rows/cols left over from the
# boxplot section, so this section still works if that one changes or is removed.
rows, cols = choose_grid(len(numeric_vars))
fig, axs = subplots(rows, cols, figsize=(cols * HEIGHT, rows * HEIGHT), squeeze=False)
print('single_histograms_numeric')
for n, name in enumerate(numeric_vars):
    print(n)  # progress trace
    ax = axs[divmod(n, cols)]  # (row, column) of the n-th cell
    ax.set_title('Histogram for %s' % name)
    ax.set_xlabel(name)
    ax.set_ylabel("nr records")
    ax.hist(data[name].dropna().values, bins='auto')
savefig('imagens/single_histograms_numeric.png')
close()

from matplotlib.pyplot import savefig, close, subplots
from seaborn import distplot
from ds_charts import HEIGHT, get_variable_types

# One normalised histogram with a density trend per numeric variable.
numeric_vars = get_variable_types(data)['Numeric']
if not numeric_vars:
    raise ValueError('There are no numeric variables.')

# Size the grid here instead of relying on rows/cols left over from the
# boxplot section, so this section still works if that one changes or is removed.
rows, cols = choose_grid(len(numeric_vars))
fig, axs = subplots(rows, cols, figsize=(cols * HEIGHT, rows * HEIGHT), squeeze=False)
print('histograms_trend_numeric')
for n, name in enumerate(numeric_vars):
    print(n)  # progress trace
    ax = axs[divmod(n, cols)]  # (row, column) of the n-th cell
    ax.set_title('Histogram with trend for %s' % name)
    # NOTE(review): seaborn deprecated distplot in favour of histplot/displot;
    # confirm the installed seaborn version still provides it.
    distplot(data[name].dropna().values, norm_hist=True, ax=ax, axlabel=name)
savefig('imagens/histograms_trend_numeric.png')
close()

from numpy import log
from pandas import Series
from scipy.stats import norm, expon, lognorm
from matplotlib.pyplot import savefig, close, subplots, Axes
from ds_charts import HEIGHT, multiple_line_chart, get_variable_types


def compute_known_distributions(x_values: list) -> dict:
    """Fit normal, exponential and log-normal distributions to *x_values*.

    Args:
        x_values: The sample to fit; the fitted densities are evaluated at
            these same points.

    Returns:
        A dict mapping a label that embeds the fitted parameters (for example
        ``'Normal(3.0,1.41)'``) to the fitted density at each point of
        *x_values*. The exponential entry is left out when building its
        label raises ``ZeroDivisionError``.
    """
    fitted = {}

    # Gaussian
    mu, sd = norm.fit(x_values)
    fitted['Normal(%.1f,%.2f)' % (mu, sd)] = norm.pdf(x_values, mu, sd)

    # Exponential: the label shows the rate, i.e. 1 / scale.
    exp_loc, exp_scale = expon.fit(x_values)
    try:
        exp_density = expon.pdf(x_values, exp_loc, exp_scale)
        fitted['Exp(%.2f)' % (1 / exp_scale)] = exp_density
    except ZeroDivisionError:
        pass

    # LogNorm: the label shows log(scale) and the shape parameter.
    shape, ln_loc, ln_scale = lognorm.fit(x_values)
    ln_label = 'LogNor(%.1f,%.2f)' % (log(ln_scale), shape)
    fitted[ln_label] = lognorm.pdf(x_values, shape, ln_loc, ln_scale)
    return fitted


def histogram_with_distributions(ax: Axes, series: Series, var: str):
    """Draw a density histogram of *series* on *ax* with fitted distributions.

    Missing values are dropped before plotting and fitting. Previously only
    the emptiness check ignored them, so a series containing NaN passed the
    check and then fed NaN to the histogram and to the fits.

    Args:
        ax: The axes to draw on.
        series: The values of the variable; may contain missing values.
        var: The variable name, used in the title and the x-axis label.
    """
    values = series.dropna().sort_values().values
    if len(values) == 0:
        ax.set_title(f"{var}: No data available")
        return

    ax.hist(values, 20, density=True)
    distributions = compute_known_distributions(values)
    multiple_line_chart(values, distributions, ax=ax, title='Best fit for %s' % var, xlabel=var, ylabel='')

# One histogram with fitted distributions per numeric variable.
numeric_vars = get_variable_types(data)['Numeric']
if not numeric_vars:
    raise ValueError('There are no numeric variables.')

# Size the grid here instead of relying on rows/cols left over from the
# boxplot section, so this section still works if that one changes or is removed.
rows, cols = choose_grid(len(numeric_vars))
fig, axs = subplots(rows, cols, figsize=(cols * HEIGHT, rows * HEIGHT), squeeze=False)
print('histogram_numeric_distribution')
for n, name in enumerate(numeric_vars):
    print(n)  # progress trace
    # divmod maps the running position to its (row, column) cell.
    histogram_with_distributions(axs[divmod(n, cols)], data[name].dropna(), name)
savefig('imagens/histogram_numeric_distribution.png')
close()

from matplotlib.pyplot import savefig, close, subplots
from ds_charts import HEIGHT, choose_grid, get_variable_types, bar_chart

# One bar chart of value frequencies per symbolic variable.
symbolic_vars = get_variable_types(data)['Symbolic']
if symbolic_vars:
    rows, cols = choose_grid(len(symbolic_vars))
    fig, axs = subplots(rows, cols, figsize=(cols * HEIGHT, rows * HEIGHT), squeeze=False)
    print('histograms_symbolic')
    for n, name in enumerate(symbolic_vars):
        print(n)  # progress trace
        counts = data[name].value_counts()
        bar_chart(
            counts.index.to_list(),
            counts.values,
            ax=axs[divmod(n, cols)],  # (row, column) of the n-th cell
            title='Histogram for %s' % name,
            xlabel=name,
            ylabel='nr records',
            percentage=False,
        )
    savefig('imagens/histograms_symbolic.png')
    close()
else:
    print('There are no symbolic variables.')

# Sparsity analysis: pairwise scatter plots and correlation

from matplotlib.pyplot import subplots, savefig, close
from ds_charts import get_variable_types, HEIGHT
# Scatter plot for every pair of numeric variables, drawn on the upper
# triangle of an (n-1) x (n-1) grid.
numeric_vars = get_variable_types(data)['Numeric']

if not numeric_vars:
    raise ValueError('There are no numeric variables.')
# A single variable has no pairs and would request a 0 x 0 grid, which
# matplotlib rejects with a less helpful error; fail early and clearly instead.
if len(numeric_vars) < 2:
    raise ValueError('At least two numeric variables are needed for a sparsity study.')

rows, cols = len(numeric_vars) - 1, len(numeric_vars) - 1
fig, axs = subplots(rows, cols, figsize=(cols*HEIGHT, rows*HEIGHT), squeeze=False)

print('sparsity_study_numeric.png')
for i, var1 in enumerate(numeric_vars):
    print(i)  # progress trace
    for j in range(i + 1, len(numeric_vars)):
        print(j)
        var2 = numeric_vars[j]
        # Column j-1 because the first variable never appears on the y side.
        ax = axs[i, j - 1]
        ax.set_title("%s x %s" % (var1, var2))
        ax.set_xlabel(var1)
        ax.set_ylabel(var2)
        ax.scatter(data[var1], data[var2])
savefig('imagens/sparsity_study_numeric.png')
close()

from matplotlib.pyplot import savefig, close, subplots
from ds_charts import HEIGHT, get_variable_types

# Scatter plot for every pair of symbolic variables, drawn on the upper
# triangle of an (n-1) x (n-1) grid.
symbolic_vars = get_variable_types(data)['Symbolic']
if not symbolic_vars:
    print('There are no symbolic variables.')
elif len(symbolic_vars) < 2:
    # A single variable has no pairs and would request a 0 x 0 grid, which
    # matplotlib rejects; report it like the empty case instead of crashing.
    print('At least two symbolic variables are needed for a sparsity study.')
else:
    rows, cols = len(symbolic_vars) - 1, len(symbolic_vars) - 1
    fig, axs = subplots(rows, cols, figsize=(cols*HEIGHT, rows*HEIGHT), squeeze=False)
    print('sparsity_study_symbolic.png')
    for i, var1 in enumerate(symbolic_vars):
        print(i)  # progress trace
        for j in range(i + 1, len(symbolic_vars)):
            print(j)
            var2 = symbolic_vars[j]
            # Column j-1 because the first variable never appears on the y side.
            ax = axs[i, j - 1]
            ax.set_title("%s x %s" % (var1, var2))
            ax.set_xlabel(var1)
            ax.set_ylabel(var2)
            ax.scatter(data[var1], data[var2])
    savefig('imagens/sparsity_study_symbolic.png')
    close()

# Absolute pairwise correlation between the numeric/boolean columns.
# The columns are selected explicitly because DataFrame.corr() raises on
# non-numeric (e.g. categorical) columns in pandas versions where
# numeric_only defaults to False.
corr_mtx = data.select_dtypes(include=['number', 'bool']).corr().abs()
print(corr_mtx)

from matplotlib.pyplot import figure, savefig, close, title
from seaborn import heatmap

# Annotated heatmap of the absolute correlation matrix.
fig = figure(figsize=[12, 12])

axis_labels = corr_mtx.columns
heatmap(
    corr_mtx.abs(),
    xticklabels=axis_labels,
    yticklabels=axis_labels,
    annot=True,
    cmap='Blues',
)
title('Correlation analysis')
savefig('imagens/correlation_analysis.png')
close()