Untitled
unknown
plain_text
2 years ago
12 kB
0
Indexable
Never
# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------
from pandas import read_csv
from pandas.plotting import register_matplotlib_converters

register_matplotlib_converters()

filename = 'small_ca_monthly_data.csv'
data = read_csv(filename, na_values='na')
print(data.shape)

# Remove columns that contain no value at all, then the 'station' column.
data = data.dropna(axis=1, how='all')
data = data.drop(columns='station')
print(data.shape)

# ---------------------------------------------------------------------------
# Dimensionality: number of records vs number of variables
# ---------------------------------------------------------------------------
# `show` is imported alongside the other pyplot helpers because it is called
# below; without the import the call raises NameError.
from matplotlib.pyplot import figure, savefig, show, close
from ds_charts import bar_chart

figure(figsize=(4, 2))
values = {'nr records': data.shape[0], 'nr variables': data.shape[1]}
bar_chart(list(values.keys()), list(values.values()), title='Nr of records vs nr variables')
savefig('imagens/record_variables.png')
show()

# ---------------------------------------------------------------------------
# Variable types
# ---------------------------------------------------------------------------
# Printed explicitly: a bare expression displays nothing when run as a script.
print(data.dtypes)

# Convert every object-typed column to the pandas 'category' dtype.
cat_vars = data.select_dtypes(include='object')
data[cat_vars.columns] = cat_vars.apply(lambda x: x.astype('category'))
print(data.dtypes)

from pandas import DataFrame
from pandas.api.types import is_datetime64_any_dtype, is_float_dtype, is_integer_dtype


def get_variable_types(df: DataFrame) -> dict:
    """Group the column names of *df* by kind of variable.

    The checks run in this order for every column:

    * ``'Binary'``   - exactly two distinct non-null values;
    * ``'Date'``     - any datetime64 dtype;
    * ``'Numeric'``  - any integer or float dtype, whatever its width;
    * ``'Symbolic'`` - everything else.

    *df* is only inspected, never modified.

    Returns:
        A dict with the four keys above, each mapping to a list of column
        names in the order they appear in *df*.
    """
    variable_types: dict = {
        'Numeric': [],
        'Binary': [],
        'Date': [],
        'Symbolic': [],
    }
    for c in df.columns:
        column = df[c]
        # nunique() ignores missing values, like dropna().unique() would.
        if column.nunique() == 2:
            variable_types['Binary'].append(c)
        elif is_datetime64_any_dtype(column):
            variable_types['Date'].append(c)
        elif is_integer_dtype(column) or is_float_dtype(column):
            variable_types['Numeric'].append(c)
        else:
            variable_types['Symbolic'].append(c)
    return variable_types


# NOTE: this import rebinds `get_variable_types`, so from here on the
# ds_charts implementation is the one that runs, not the definition above.
from ds_charts import bar_chart, get_variable_types

variable_types = get_variable_types(data)
print(variable_types)
counts = {tp: len(names) for tp, names in variable_types.items()}

figure(figsize=(4, 2))
bar_chart(list(counts.keys()), list(counts.values()), title='Nr of variables per type')
savefig('imagens/variable_types.png')
show()

# ---------------------------------------------------------------------------
# Missing values: count per variable (only variables that have any)
# ---------------------------------------------------------------------------
from ds_charts import bar_chart0

mv = {}
for var in data:
    nr = data[var].isna().sum()
    if nr > 0:
        mv[var] = nr

# The chart itself is drawn on this figure by the statements that follow.
figure(figsize=(12, 4))
# ---------------------------------------------------------------------------
# Missing values chart (drawn on the figure opened just before this point)
# ---------------------------------------------------------------------------
# Import `show` explicitly so the call below is guaranteed to resolve.
from matplotlib.pyplot import show

bar_chart0(list(mv.keys()), list(mv.values()), title='Nr of missing values per variable',
           xlabel='variables', ylabel='nr missing values', rotation='vertical')
savefig('imagens/mv.png')
show()

# ---------------------------------------------------------------------------
# Distribution: five-number summary
# ---------------------------------------------------------------------------
summary5 = data.describe()
# Printed explicitly: a bare expression displays nothing when run as a script.
print(summary5)

print('---- | PRCP | TMIN | TMAX')
# One (label, describe() row) pair per printed line; every line reads the
# same statistic for the three variables, so 'Min' really is the minimum and
# 'Max' really is the maximum of each column.
summary_columns = ('PRCP', 'TMIN', 'TMAX')
summary_lines = (
    ('Count: ', 'count'),
    ('Mean: ', 'mean'),
    ('StDev: ', 'std'),
    ('Min: ', 'min'),
    ('Q1: ', '25%'),
    ('Median: ', '50%'),
    ('Q3: ', '75%'),
    ('Max: ', 'max'),
)
for label, stat in summary_lines:
    print(label, *(summary5[column][stat] for column in summary_columns))

# ---------------------------------------------------------------------------
# Global boxplot
# ---------------------------------------------------------------------------
from matplotlib.pyplot import savefig, close

data.boxplot(rot=45, figsize=(20, 20))
savefig('imagens/global_boxplot.png')
close()

# ---------------------------------------------------------------------------
# One boxplot per numeric variable
# ---------------------------------------------------------------------------
from matplotlib.pyplot import savefig, close, subplots
from ds_charts import get_variable_types, choose_grid, HEIGHT

numeric_vars = get_variable_types(data)['Numeric']
if [] == numeric_vars:
    raise ValueError('There are no numeric variables.')

rows, cols = choose_grid(len(numeric_vars))
fig, axs = subplots(rows, cols, figsize=(cols * HEIGHT, rows * HEIGHT), squeeze=False)
for n, numeric_var in enumerate(numeric_vars):
    # Fill the grid row by row: cell n sits at (n // cols, n % cols).
    ax = axs[n // cols, n % cols]
    ax.set_title('Boxplot for %s' % numeric_var)
    ax.boxplot(data[numeric_var].dropna().values)
savefig('imagens/single_boxplots.png')
close()

# ---------------------------------------------------------------------------
# Outliers per numeric variable (IQR rule vs mean +/- NR_STDEV * std)
# ---------------------------------------------------------------------------
from matplotlib.pyplot import figure, savefig, close
from ds_charts import get_variable_types, multiple_bar_chart, HEIGHT

NR_STDEV: int = 2
numeric_vars = get_variable_types(data)['Numeric']
if [] == numeric_vars:
    raise ValueError('There are no numeric variables.')

outliers_iqr = []
outliers_stdev = []
summary5 = data.describe(include='number')
for var in numeric_vars:
    stats = summary5[var]
    column = data[var]

    # Values further than 1.5 * IQR beyond the quartiles.
    iqr = 1.5 * (stats['75%'] - stats['25%'])
    outliers_iqr.append(
        (column > stats['75%'] + iqr).sum() + (column < stats['25%'] - iqr).sum())

    # Values further than NR_STDEV standard deviations from the mean.
    std = NR_STDEV * stats['std']
    outliers_stdev.append(
        (column > stats['mean'] + std).sum() + (column < stats['mean'] - std).sum())

outliers = {'iqr': outliers_iqr, 'stdev': outliers_stdev}
figure(figsize=(12, HEIGHT))
multiple_bar_chart(numeric_vars, outliers, title='Nr of outliers per variable',
                   xlabel='variables', ylabel='nr outliers', percentage=False)
savefig('imagens/outliers.png')
close()

# ---------------------------------------------------------------------------
# One histogram per numeric variable
# ---------------------------------------------------------------------------
from matplotlib.pyplot import savefig, close, subplots
from ds_charts import get_variable_types, choose_grid, HEIGHT

numeric_vars = get_variable_types(data)['Numeric']
if [] == numeric_vars:
    raise ValueError('There are no numeric variables.')

# Recomputed here so the grid always matches the variables being plotted.
rows, cols = choose_grid(len(numeric_vars))
fig, axs = subplots(rows, cols, figsize=(cols * HEIGHT, rows * HEIGHT), squeeze=False)
print('single_histograms_numeric')
for n, numeric_var in enumerate(numeric_vars):
    print(n)
    ax = axs[n // cols, n % cols]
    ax.set_title('Histogram for %s' % numeric_var)
    ax.set_xlabel(numeric_var)
    ax.set_ylabel("nr records")
    ax.hist(data[numeric_var].dropna().values, 'auto')
savefig('imagens/single_histograms_numeric.png')
close()

# ---------------------------------------------------------------------------
# Histograms with trend line (plotting code is disabled; only the progress
# message is still printed)
# ---------------------------------------------------------------------------
from matplotlib.pyplot import savefig, close, subplots
from seaborn import distplot
from ds_charts import HEIGHT, get_variable_types

numeric_vars = get_variable_types(data)['Numeric']
# if [] == numeric_vars:
#     raise ValueError('There are no numeric variables.')
# fig, axs = subplots(rows, cols, figsize=(cols * HEIGHT, rows * HEIGHT), squeeze=False)
# i, j = 0, 0
print('histograms_trend_numeric')
# for n in range(len(numeric_vars)):
#     print(n)
#     axs[i, j].set_title('Histogram with trend for %s' % numeric_vars[n])
#     distplot(data[numeric_vars[n]].dropna().values, norm_hist=True, ax=axs[i, j], axlabel=numeric_vars[n])
#     i, j = (i + 1, 0) if (n + 1) % cols == 0 else (i, j + 1)
# savefig('imagens/histograms_trend_numeric.png')
# close()

# ---------------------------------------------------------------------------
# Class distribution of the binary variables
# ---------------------------------------------------------------------------
from matplotlib.pyplot import figure, savefig, close, bar, xlabel, ylabel, title
from ds_charts import bar_chart, get_variable_types, HEIGHT

binary_vars = get_variable_types(data)['Binary']
if [] == binary_vars:
    raise ValueError('There are no binary variables.')

for binary_var in binary_vars:
    binary_values = data[binary_var].values
    # Summing the values counts the True/1 entries; the rest are False/0.
    sumCount = binary_values.sum()
    labels = ['False', 'True']
    counts = [len(binary_values) - sumCount, sumCount]
    bar(labels, counts)
    xlabel(binary_var)
    ylabel('Nr of records')
    title('Class Distribution')
    # NOTE(review): every pass saves to the same path, so with several binary
    # variables only the last chart is kept - confirm this is intended.
    savefig('imagens/histogram_binary.png')
    close()

# ---------------------------------------------------------------------------
# Helpers for fitting known distributions to a numeric variable
# ---------------------------------------------------------------------------
from numpy import log
from pandas import Series
from scipy.stats import norm, expon, lognorm
from matplotlib.pyplot import savefig, close, subplots, Axes
from ds_charts import HEIGHT, multiple_line_chart, get_variable_types


def compute_known_distributions(x_values: list) -> dict:
    """Fit Normal, Exponential and LogNormal distributions to *x_values*.

    Each distribution is fitted with ``scipy.stats`` and its density is then
    evaluated at every point of *x_values*.

    Returns:
        A dict mapping a label that embeds the fitted parameters
        (e.g. ``'Normal(mean,sigma)'``) to the array of density values.
        The exponential entry is left out when computing its rate
        ``1 / scale`` raises ``ZeroDivisionError``.
    """
    distributions = dict()

    # Gaussian
    mean, sigma = norm.fit(x_values)
    distributions['Normal(%.1f,%.2f)' % (mean, sigma)] = norm.pdf(x_values, mean, sigma)

    # Exponential
    loc, scale = expon.fit(x_values)
    try:
        distributions['Exp(%.2f)' % (1 / scale)] = expon.pdf(x_values, loc, scale)
    except ZeroDivisionError:
        # A zero scale has no finite rate to label; skip this distribution.
        pass

    # LogNorm
    sigma, loc, scale = lognorm.fit(x_values)
    distributions['LogNor(%.1f,%.2f)' % (log(scale), sigma)] = lognorm.pdf(x_values, sigma, loc, scale)
    return distributions


def histogram_with_distributions(ax: Axes, series: Series, var: str):
    """Draw on *ax* a density histogram of *series* with fitted distributions.

    Missing values are removed before plotting and fitting. When nothing is
    left, the axes only receive a "No data available" title.

    Args:
        ax: axes to draw on.
        series: values of the variable; may contain missing values.
        var: variable name, used in the title and as the x-axis label.
    """
    observed = series.dropna()
    if observed.empty:
        ax.set_title(f"{var}: No data available")
        return

    # Sorted values double as the x coordinates of the fitted curves.
    values = observed.sort_values().values
    ax.hist(values, 20, density=True)
    distributions = compute_known_distributions(values)
    multiple_line_chart(values, distributions, ax=ax, title='Best fit for %s' % var, xlabel=var, ylabel='')
from matplotlib.pyplot import figure, savefig, close, subplots, title
from seaborn import heatmap
from ds_charts import HEIGHT, bar_chart, choose_grid, get_variable_types

# ---------------------------------------------------------------------------
# Histogram + fitted distributions for every numeric variable
# ---------------------------------------------------------------------------
numeric_vars = get_variable_types(data)['Numeric']
if [] == numeric_vars:
    raise ValueError('There are no numeric variables.')

# Computed here so the grid always matches the variables being plotted.
rows, cols = choose_grid(len(numeric_vars))
fig, axs = subplots(rows, cols, figsize=(cols * HEIGHT, rows * HEIGHT), squeeze=False)
print('histogram_numeric_distribution')
for n, numeric_var in enumerate(numeric_vars):
    print(n)
    # Fill the grid row by row: cell n sits at (n // cols, n % cols).
    histogram_with_distributions(axs[n // cols, n % cols], data[numeric_var].dropna(), numeric_var)
savefig('imagens/histogram_numeric_distribution.png')
close()

# ---------------------------------------------------------------------------
# Value counts of every symbolic variable
# ---------------------------------------------------------------------------
symbolic_vars = get_variable_types(data)['Symbolic']
if [] == symbolic_vars:
    print('There are no symbolic variables.')
else:
    rows, cols = choose_grid(len(symbolic_vars))
    fig, axs = subplots(rows, cols, figsize=(cols * HEIGHT, rows * HEIGHT), squeeze=False)
    print('histograms_symbolic')
    for n, symbolic_var in enumerate(symbolic_vars):
        print(n)
        counts = data[symbolic_var].value_counts()
        bar_chart(counts.index.to_list(), counts.values, ax=axs[n // cols, n % cols],
                  title='Histogram for %s' % symbolic_var, xlabel=symbolic_var,
                  ylabel='nr records', percentage=False)
    savefig('imagens/histograms_symbolic.png')
    close()

# ---------------------------------------------------------------------------
# Sparsity: scatter plot for every pair of numeric variables
# ---------------------------------------------------------------------------
numeric_vars = get_variable_types(data)['Numeric']
if [] == numeric_vars:
    raise ValueError('There are no numeric variables.')

if len(numeric_vars) < 2:
    # A single variable has no pair to plot, and a 0 x 0 grid cannot be built.
    print('Sparsity study needs at least two numeric variables.')
else:
    # Pair (i, j) with i < j is drawn in cell (i, j - 1) of the grid.
    rows, cols = len(numeric_vars) - 1, len(numeric_vars) - 1
    fig, axs = subplots(rows, cols, figsize=(cols * HEIGHT, rows * HEIGHT), squeeze=False)
    print('sparsity_study_numeric.png')
    for i, var1 in enumerate(numeric_vars):
        print(i)
        for j in range(i + 1, len(numeric_vars)):
            print(j)
            var2 = numeric_vars[j]
            ax = axs[i, j - 1]
            ax.set_title("%s x %s" % (var1, var2))
            ax.set_xlabel(var1)
            ax.set_ylabel(var2)
            ax.scatter(data[var1], data[var2])
    savefig('imagens/sparsity_study_numeric.png')
    close()

# ---------------------------------------------------------------------------
# Sparsity: scatter plot for every pair of symbolic variables
# ---------------------------------------------------------------------------
symbolic_vars = get_variable_types(data)['Symbolic']
if [] == symbolic_vars:
    print('There are no symbolic variables.')
elif len(symbolic_vars) < 2:
    # Same reason as above: no pair to plot and no valid grid size.
    print('Sparsity study needs at least two symbolic variables.')
else:
    rows, cols = len(symbolic_vars) - 1, len(symbolic_vars) - 1
    fig, axs = subplots(rows, cols, figsize=(cols * HEIGHT, rows * HEIGHT), squeeze=False)
    print('sparsity_study_symbolic.png')
    for i, var1 in enumerate(symbolic_vars):
        print(i)
        for j in range(i + 1, len(symbolic_vars)):
            print(j)
            var2 = symbolic_vars[j]
            ax = axs[i, j - 1]
            ax.set_title("%s x %s" % (var1, var2))
            ax.set_xlabel(var1)
            ax.set_ylabel(var2)
            ax.scatter(data[var1], data[var2])
    savefig('imagens/sparsity_study_symbolic.png')
    close()

# ---------------------------------------------------------------------------
# Correlation analysis
# ---------------------------------------------------------------------------
# Only numeric/boolean columns are correlated: the category columns cannot be
# converted to numbers, and recent pandas versions raise on them instead of
# silently leaving them out.
corr_mtx = data.select_dtypes(include=['number', 'bool']).corr().abs()
print(corr_mtx)

fig = figure(figsize=[12, 12])
heatmap(corr_mtx, xticklabels=corr_mtx.columns, yticklabels=corr_mtx.columns, annot=True, cmap='Blues')
title('Correlation analysis')
savefig('imagens/correlation_analysis.png')
close()