Untitled
# 1) Principal component analysis (PCA)
import pandas as pd
import numpy as np

# A - descriptive requirements
infoTari = pd.read_csv('./dataIN/MiseNatPopTari.csv', index_col=0)
codExtins = pd.read_csv('./dataIN/CoduriTariExtins.csv', index_col=0)

# Analysed variables: every column from the third one onwards.
nume_var = infoTari.columns[2:].values
print(nume_var)
lista_nume_var = list(nume_var)
nume_obs = infoTari.index.values
print(nume_obs)

# Rows whose 'RS' value is negative.
conditie = infoTari['RS'] < 0
print(infoTari[conditie])

# Per-continent mean of every analysed variable (tables joined on index).
t = infoTari.merge(right=codExtins, right_index=True, left_index=True)
t1 = t.groupby('Continent')[lista_nume_var].mean()
print(t1)

# B - standardized PCA
# 1) Variances of the principal components.
data_std = (
    (infoTari[nume_var] - infoTari[nume_var].mean())
    / infoTari[nume_var].std()
)
print(data_std)

from sklearn.decomposition import PCA

pca = PCA()
pca.fit(data_std)
# NOTE(review): this is the *proportion* of variance explained by each
# component (explained_variance_ratio_), not the raw variance.
varianta = pca.explained_variance_ratio_
print(varianta)

# 2) Scores associated with the observations.
nr_componente = len(varianta)
nr_componente2 = len(pca.components_)
print(nr_componente, nr_componente2)
et_componente = ["C" + str(i + 1) for i in range(nr_componente)]
print(et_componente)
# Projection of the standardized data onto the principal axes.
scoruri = np.dot(data_std, pca.components_.T)
t_scoruri = pd.DataFrame(data=scoruri, index=nume_obs, columns=et_componente)
print(t_scoruri)

# 3) Plot of the scores in the first two principal axes.
t_componente = pd.DataFrame(
    data=pca.components_, index=et_componente, columns=nume_var
)
print(t_componente)

from matplotlib import pyplot as plt

print(pca.components_)
plt.figure(figsize=(8, 8))
# Fix: plot the observation scores (C1 vs C2); the previous version
# scattered the rows of pca.components_, i.e. the axis coefficients.
plt.scatter(t_scoruri[et_componente[0]], t_scoruri[et_componente[1]])
plt.xlabel(et_componente[0])
plt.ylabel(et_componente[1])
plt.show()

# 2) Canonical correlation analysis (standardized) - CCA
import numpy as np
import pandas as pd

t_ind = pd.read_csv('./DataIN/Industrie.csv', index_col=0)
t_pop = pd.read_csv('./DataIN/PopulatieLocalitati.csv', index_col=0)

# Industry columns: everything after the first column of the table.
nume_industrii = t_ind.columns[1:].values
print(nume_industrii)
lista_nume_industrii = list(nume_industrii)
t1 = t_ind.merge(right=t_pop, right_index=True, left_index=True)
print(t1)


# Requirement 1 ---------------------------------
def perCapita(t):
    """Return one row with every industry value divided by 'Populatie'.

    Parameters
    ----------
    t : pandas.Series
        A row holding 'Localitate_x', 'Populatie' and the industry columns
        listed in the module-level ``lista_nume_industrii``.

    Returns
    -------
    pandas.Series
        'Localitate_x' followed by the per-capita value of each industry.
    """
    linie = t[lista_nume_industrii].values / t['Populatie']
    rez = [t['Localitate_x']] + list(linie)
    return pd.Series(data=rez, index=['Localitate_x'] + lista_nume_industrii)


t2 = t1[['Localitate_x', 'Populatie'] + lista_nume_industrii].apply(
    func=perCapita, axis=1
)
print(t2)

# Requirement 2 ---------------------------------
t3 = t1[lista_nume_industrii + ['Judet']].groupby('Judet').sum()
print(t3)


def maxCA(t):
    """Return the label and the value of the largest entry of a row.

    Parameters
    ----------
    t : pandas.Series
        A row of numeric values indexed by activity name.

    Returns
    -------
    pandas.Series
        Two entries: 'Activitatea dominanta' (index label of the maximum)
        and 'CA' (the maximum value itself).
    """
    linie = t.values
    max_pe_linie = np.argmax(linie)
    rez = [t.index[max_pe_linie], linie[max_pe_linie]]
    return pd.Series(data=rez, index=['Activitatea dominanta', 'CA'])


t4 = t3[lista_nume_industrii].apply(func=maxCA, axis=1)
print(t4)

# Canonical correlation analysis (standardized) - CCA
# Requirement 3 - standardize the variables of the two subsets.
import matplotlib.pyplot as plt
import sklearn.cross_decomposition as skl

tabel = pd.read_csv('./DataIN/DataSet_34.csv', index_col=0)
print(tabel)
obs = tabel.index.values
tabel_std = (tabel - tabel.mean()) / tabel.std()
print(tabel_std)
# First four columns form set X, the remaining ones form set Y.
coloane_x = tabel.columns[0:4].values
coloane_y = tabel.columns[4:].values
Xstd = tabel_std[coloane_x]
Ystd = tabel_std[coloane_y]

# Requirement 4 --------------------------------
n, p = np.shape(Xstd)
q = Ystd.shape[1]
# Number of canonical pairs is bounded by the smaller set.
m = min(p, q)
print(n, p, q, m)
obiectCCA = skl.CCA(n_components=m)
# Positional arguments: newer scikit-learn releases rename the keyword
# `Y` to `y`, so passing by position works across versions.
obiectCCA.fit(Xstd, Ystd)
z, u = obiectCCA.transform(Xstd, Ystd)
z_df = pd.DataFrame(
    data=z, index=obs, columns=['z' + str(j + 1) for j in range(m)]
)
z_df.to_csv('./DataOUT/Xscores_.csv')
u_df = pd.DataFrame(
    data=u, index=obs, columns=['u' + str(j + 1) for j in range(m)]
)
u_df.to_csv('./DataOUT/Yscores.csv')

# Requirement 5 ----------------------------
Rxz = obiectCCA.x_loadings_
Rxz_df = pd.DataFrame(
    data=Rxz, index=coloane_x, columns=['z' + str(j + 1) for j in range(m)]
)
Rxz_df.to_csv('./DataOUT/Rxz.csv')
Ryu = obiectCCA.y_loadings_
Ryu_df = pd.DataFrame(
    data=Ryu, index=coloane_y, columns=['u' + str(j + 1) for j in range(m)]
)
Ryu_df.to_csv('./DataOUT/Ryu.csv')


# Requirement 6 ----------------------------
def biplot(z, u, obs=None):
    """Scatter the first two columns of both score matrices on one figure.

    Parameters
    ----------
    z : numpy.ndarray
        Scores of set X; columns 0 and 1 are plotted in red.
    u : numpy.ndarray
        Scores of set Y; columns 0 and 1 are plotted in blue.
    obs : sequence, optional
        Observation labels; when given, each point of both sets is
        annotated with its label.
    """
    f = plt.figure(figsize=(11, 8))
    ax = f.add_subplot(1, 1, 1)
    ax.scatter(x=z[:, 0], y=z[:, 1], color="Red", label='Set X')
    ax.scatter(x=u[:, 0], y=u[:, 1], color="Blue", label='Set Y')
    if obs is not None:
        for i, eticheta in enumerate(obs):
            ax.text(x=z[i, 0], y=z[i, 1], s=eticheta)
            ax.text(x=u[i, 0], y=u[i, 1], s=eticheta)
    # Fix: the series labels were set but never displayed.
    ax.legend()


biplot(z, u, obs=obs)
plt.show()

# 3) Cluster analysis
import pandas as pd
import numpy as np

tabel_alcool = pd.read_csv("DateIN/alcohol.csv", index_col=0)
tabel_coduri = pd.read_csv("DateIN/CoduriTariExtins.csv", index_col=0)

# Requirement 1
# Year columns: everything after the first column of the table.
ani = tabel_alcool.columns[1:].values
print(ani)
lista_ani = list(ani)
lista_obs = list(tabel_alcool.index.values)
t = tabel_alcool.copy()
t['Medie'] = tabel_alcool[lista_ani].mean(axis=1)
c1 = t[['Medie', 'Code']].groupby('Code').mean()
c1.to_csv('cer1.csv')
t1 = pd.DataFrame(data=t[['Medie', 'Code']], index=lista_obs, columns=['Medie'])

# Requirement 2 -----------------------------------
t2 = t.merge(right=tabel_coduri, right_index=True, left_index=True)
t3 = t2.groupby('Continent')[lista_ani].mean()
t3['ValoareMaxima'] = t3[lista_ani].max(axis=1)
t3['AnValoareMaxima'] = t3[lista_ani].idxmax(axis=1)
t4 = t3['AnValoareMaxima']
t4.to_csv('cer2.csv')

# B -----------------------
import seaborn as sb
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as hclust
from sklearn.decomposition import PCA

# Clustering
m = len(lista_ani)
n = len(lista_obs)
# Fix: take an explicit copy. `.values` may hand back a view that is
# read-only (pandas Copy-on-Write) or that writes through to the frame,
# and the array is modified in place just below.
x = tabel_alcool[lista_ani].to_numpy(copy=True)
# Replace every missing value with the mean of its column.
nan_indices = np.isnan(x)
column_means = np.nanmean(x, axis=0)
x[nan_indices] = np.take(column_means, np.where(nan_indices)[1])
print(x)

# Build the hierarchy (Ward linkage).
h = hclust.linkage(x, method="ward")
print(h)

# Number of junctions.
p = n - 1
print("jonctiuni", p)

# Index of the largest jump between consecutive junction distances.
k_diff_max = np.argmax(h[1:, 2] - h[:(p - 1), 2])
print("distanta", k_diff_max)

# Number of clusters: derived from k_diff_max by the statements that follow.
# Number of clusters: junction count minus the index of the largest jump.
nr_clusteri = p - k_diff_max
print("nr_clusteri", nr_clusteri)

# Dendrogram of the full hierarchy.
plt.figure(figsize=(10, 10))
hclust.dendrogram(h)
plt.show()

# Threshold placed halfway between the two junctions around the largest jump.
k_diff_max = p - nr_clusteri
prag = (h[k_diff_max, 2] + h[k_diff_max + 1, 2]) / 2

# Replay the first (n - nr_clusteri) junctions: both merged cluster ids are
# relabelled with the id of the cluster created by that junction.
n = p + 1
c = np.arange(n)
for i, (k1, k2) in enumerate(h[:n - nr_clusteri, :2]):
    c[(c == k1) | (c == k2)] = n + i

# Compress the surviving ids to consecutive codes and label them c0, c1, ...
coduri = pd.Categorical(c).codes
partitie_optima = np.array([f"c{cod}" for cod in coduri])
print(partitie_optima)

# 4) Factor analysis
import numpy as np
import pandas as pd

# Requirement 1 ------------------------
vot = pd.read_csv('./dataIN/Vot.csv', index_col=0)
print(vot)
# Voter-category columns: everything after the first column.
numeCol = vot.columns[1:].values
print('\n', numeCol, type(numeCol))
listaCol = list(numeCol)

# For every locality, find the voter category with the lowest turnout
# percentage (value and column name).
minime = vot[listaCol].min(axis=1)
vot['valoare_minima'] = minime
categorii_minime = vot[listaCol].idxmin(axis=1)
vot['Categorie'] = categorii_minime
print(vot)
t = vot[['Localitate', 'Categorie']]
print(t)

# Requirement 2 --------------------------
coduri = pd.read_csv('./dataIN/CoduriLocalitati.csv', index_col=0)
t1 = vot.merge(right=coduri, left_index=True, right_index=True)
print(t1)
t2 = t1.groupby('County')[listaCol].mean()
print(t2)

# B -----------------------------------
# 1) Apply Bartlett's sphericity test. The significance level associated
#    with rejecting/accepting the test (p-value) is computed and printed.
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
from factor_analyzer import FactorAnalyzer
import matplotlib.pyplot as plt

# The data used are the voter-category columns of the `vot` frame.
vot_data = vot[listaCol]

# Run Bartlett's test.
bartlett_statistic, p_value = calculate_bartlett_sphericity(vot_data)
print("Testul Bartlett de relevanță:")
print("Statistică Bartlett:", bartlett_statistic)
print("P-value:", p_value)

# 2) Factor scores. They will be saved to the file f.csv.
# (2 points)
# Factor analysis: fit the model with the library defaults.
factor_analyzer = FactorAnalyzer()
factor_analyzer.fit(vot_data)
print(vot_data)

# Factor scores
factor_scores = factor_analyzer.transform(vot_data)
print('FORMA FACTOR SCRES', factor_scores.shape[1])
# Fix: derive the column labels from the actual number of factors instead of
# hard-coding three (inconsistently spelled) names, and keep the observation
# index so every saved row can be traced back to its locality.
et_factori = [f"Factor{j + 1}" for j in range(factor_scores.shape[1])]
factor_scores_df = pd.DataFrame(
    data=factor_scores, index=vot_data.index, columns=et_factori
)
factor_scores_df.to_csv("./dataOUT/f.csv")

# 3) Plot of the factor scores for the first two factors.
plt.figure(figsize=(8, 6))
plt.scatter(factor_scores[:, 0], factor_scores[:, 1])
plt.xlabel("Factor 1")
plt.ylabel("Factor 2")
plt.title("Scorurile factoriale pentru primii doi factori")
plt.show()

# 5) Discriminant analysis
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import seaborn as sb
import matplotlib.pyplot as plt

tabel_train_test = pd.read_csv("dataIn/hernia.csv", index_col=0)
# Last column is the target; every other column is a predictor.
variabile = list(tabel_train_test.columns)
predictori = variabile[:-1]
tinta = variabile[-1]
x_train, x_test, y_train, y_test = train_test_split(
    tabel_train_test[predictori], tabel_train_test[tinta], test_size=0.4
)

# Build the linear model.
model_lda = LinearDiscriminantAnalysis()
model_lda.fit(x_train, y_train)

# Prediction of the linear model.
# NOTE(review): this predicts on the training split; confirm whether the
# test split (x_test) was intended.
predictie_model_lda = model_lda.predict(x_train)
print(predictie_model_lda)

# Prediction on the application set.
x_apply = pd.read_csv("dataIn/hernia_apply.csv", index_col=0)
predictie_test_aplicatie = model_lda.predict(x_apply[predictori])
print(predictie_test_aplicatie)

# Build the Bayes model.
model_b = GaussianNB()
model_b.fit(x_train, y_train)

# Prediction of the Bayes model.
# Fix: select the predictor columns, as the LDA call above does, so extra or
# reordered columns in the application file cannot break the prediction.
predictie_b_test = model_b.predict(x_apply[predictori])
print(predictie_b_test)

# Discriminant axes
clase = model_lda.classes_
# Number of discriminant functions: at most (classes - 1), and never more
# than the number of predictors.
m = min(len(clase) - 1, len(predictori))
# Scores of the linear model on the test split.
z = model_lda.transform(x_test)
# Save the discriminant scores of the test split.
df_z = pd.DataFrame(
    data=z, index=x_test.index, columns=["Z" + str(i + 1) for i in range(m)]
)
df_z.to_csv("zz.csv")


# Distribution plot --> a single axis
def plot_distributie(z, y, k=0, titlu="Distributie"):
    """Draw the per-class density of the scores on one discriminant axis.

    Parameters
    ----------
    z : numpy.ndarray
        Score matrix; column ``k`` is plotted.
    y : array-like
        Class of every observation, used as the hue variable.
    k : int, optional
        Index of the score column to plot.
    titlu : str, optional
        Figure identifier and axes title. Calls that share the same
        ``titlu`` draw into the same figure.
    """
    fig = plt.figure(titlu, figsize=(9, 9))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_title(titlu)
    sb.kdeplot(x=z[:, k], hue=y, fill=True, ax=ax)


# Fix: give every axis its own title. With a shared title plt.figure()
# returns the same figure each time and the plots are stacked on top of
# each other.
for i in range(m):
    plot_distributie(z, y_test, i, "Distributie - axa " + str(i + 1))


# Scatter of the scores in two axes
def plot_instante(z, y, clase, k1=0, k2=1, titlu="2Axe"):
    """Scatter the observations in the plane of two discriminant axes.

    Parameters
    ----------
    z : numpy.ndarray
        Score matrix; columns ``k1`` and ``k2`` are plotted.
    y : array-like
        Class of every observation, used as the hue variable.
    clase : sequence
        Class labels, fixing the hue order.
    k1, k2 : int, optional
        Indices of the score columns shown on the x and y axes.
    titlu : str, optional
        Figure identifier and axes title. Calls that share the same
        ``titlu`` draw into the same figure.
    """
    fig = plt.figure(titlu, figsize=(9, 9))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_title(titlu)
    ax.set_xlabel("z" + str(k1 + 1))
    ax.set_ylabel("z" + str(k2 + 1))
    sb.scatterplot(x=z[:, k1], y=z[:, k2], hue=y, hue_order=clase, ax=ax)


# One figure per pair of axes (same title-uniqueness fix as above).
for i in range(m - 1):
    for j in range(i + 1, m):
        plot_instante(
            z, y_test, clase, i, j,
            "2Axe - z" + str(i + 1) + " / z" + str(j + 1),
        )
plt.show()
Leave a Comment