# Untitled

unknown
plain_text
a year ago
14 kB
4
Indexable
Never
```DATA MINING LAB:
1. Central tendency:
# Sorted sample for the central-tendency measures below.
sample = [13, 15, 16, 16, 19, 20, 20, 21, 22, 22, 25, 25, 25, 25, 30, 33,
          33, 35, 35, 35, 35, 36, 40, 45, 46, 52, 55, 60, 62, 65, 68, 70]
for value in sample:
    print(value)

# MEAN — accumulate in `total` instead of shadowing the builtin `sum`.
total = 0
for value in sample:
    total = total + value
n = len(sample)
mean = total / n
mean

# MEDIAN — average the two middle values when the count is even.
n = len(sample)
if n % 2 == 0:
    med1 = sample[n // 2]
    med2 = sample[n // 2 - 1]
    median = (med1 + med2) / 2
else:
    median = sample[n // 2]
print(median)
# MODE — most frequent value(s) of `sample` (defined in the cell above).
from collections import Counter

counts = Counter(sample)
max_count = max(counts.values())   # hoisted: the original recomputed this per item
# every value that reaches the maximum frequency is a mode
mode = [value for value, freq in counts.items() if freq == max_count]

# if every value occurs equally often there is no mode (`n` from the cell above)
if len(mode) == n:
    get_mode = "No mode found"
else:
    get_mode = "Mode is:" + ','.join(map(str, mode))

print(get_mode)
# Variance — population variance of `sample`, reusing `mean` and `n` from above.
# NOTE: an explicit loop is used because an earlier cell rebinds the builtin `sum`.
squared_diffs = [(x - mean) ** 2 for x in sample]
total_sq = 0
for d in squared_diffs:
    total_sq = total_sq + d
variance = total_sq / n
variance

# Standard deviation (misspelled local renamed)
import math
standard_deviation = math.sqrt(variance)
standard_deviation

# IQR / 5-number summary calculation
import numpy as np

# NOTE(review): this list is a subset of `sample` above (55..68 dropped, 70
# kept) — confirm that is intentional.
data = [13, 15, 16, 16, 19, 20, 20, 21, 22, 22, 25, 25, 25, 25, 30, 33, 33,
        35, 35, 35, 35, 36, 40, 45, 46, 52, 70]
half = len(data) // 2   # was hard-coded as 13, breaking for other data sizes
# Tukey halves: for an odd-length list the middle element belongs to neither half
Q1 = np.median(data[:half])
Q3 = np.median(data[half + 1:] if len(data) % 2 else data[half:])
IQR = Q3 - Q1
print("IQR", IQR)
print("Q1 =", Q1)
print("Q3 =", Q3)
minimum = np.min(data)
maximum = np.max(data)
print("Minimum:", minimum)
print("Maximum:", maximum)

# Box plot of the full `sample` (sorted copy; np/plt imported in earlier cells).
new_sample = np.sort(sample)
new_sample

import matplotlib.pyplot as plt
plt.boxplot(new_sample)
plt.show()

# 2. BINNING
# NOTE(review): the name `list` shadows the builtin, but the smoothing cell
# below reads it, so the name is kept.
list = [4, 8, 15, 21, 21, 24, 25, 28, 34]
m = 3

def equifreq(arr1, m):
    """Print `m` equal-frequency bins of arr1 and return them as a list of lists.

    The original printed the bins but returned None; returning them is
    backward-compatible and makes the result reusable.
    """
    a = len(arr1)
    n = a // m                      # items per bin
    bins = []
    for i in range(m):
        current = []
        for j in range(i * n, (i + 1) * n):
            if j >= a:              # guard: stop at the end of the data
                break
            current.append(arr1[j])
        print(current)
        bins.append(current)
    return bins

print("Equal binning:")
equalbinning = equifreq(list, m)
equalbinning

import numpy as np

bin1 = np.zeros((3, 3))   # smoothed by bin mean
bin2 = np.zeros((3, 3))   # smoothed by bin median
bin3 = np.zeros((3, 3))   # smoothed by bin boundary

# `list` is the 9-element sorted data from the binning cell above.
# Loop variables renamed: the original reused `m` and `mean`, clobbering
# earlier cells' results.
for k in range(0, 9, 3):
    row = k // 3
    bin_mean = (list[k] + list[k + 1] + list[k + 2]) / 3
    for j in range(3):
        bin1[row, j] = bin_mean
print("BIN MEAN:")
print(bin1)

for i in range(0, 9, 3):
    row = i // 3
    for j in range(3):
        bin2[row, j] = list[i + 1]   # middle element = median of a sorted 3-bin
print("BIN MEDIAN:")
print(bin2)

for i in range(0, 9, 3):
    row = i // 3
    for j in range(3):
        # snap each value to the nearer bin edge
        if (list[i + j] - list[i]) < (list[i + 2] - list[i + j]):
            bin3[row, j] = list[i]
        else:
            bin3[row, j] = list[i + 2]
print("BIN BOUNDARY:")   # fixed label typo "BOUNDRY"
print(bin3)

# 3. NORMALIZATION
# MIN-MAX SCALER
# NOTE(review): `list` shadows the builtin; kept because the cells below read it.
list = [200, 300, 400, 600, 1000]
import numpy as np
lo, hi = min(list), max(list)   # hoisted: original recomputed min/max every iteration
print("minimum", lo)
print("maximum", hi)
n = []
for j in list:
    n.append((j - lo) / (hi - lo))
print("Min max Scaler is:", n)

# Z-SCORE NORMALIZATION of `list` (the 5 values from the min-max cell)
import numpy as np

# mean — explicit loop because an earlier cell rebinds the builtin `sum`
total = 0
for i in list:
    total = total + i
n = len(list)
mean = total / n
mean

SD = np.std(list)   # population standard deviation (ddof=0)
n = []
for p in list:
    n.append((p - mean) / SD)
print("Z-score Normalizatation:\n", n)

# DECIMAL SCALING
# Rule: divide by 10**j where j is the smallest integer with max(|v|)/10**j < 1.
# The original hard-coded j=1 (then a fixed 1000 divisor), which leaves
# max=1000 outside (-1, 1); compute j from the digit count instead.
max_abs = max(abs(v) for v in list)
j = len(str(int(max_abs)))          # digits of the largest magnitude
n = []
for v in list:
    n.append(v / 10 ** j)
print(n)

# equivalent fixed-divisor form
divisor = 10 ** j
n1 = []
for v in list:
    n1.append(v / divisor)
print(n1)

# 4. PCA on the iris data set (column-name typos and the missing subplot
# creation fixed; `finalDF`/`finalDf` casing unified).
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
df = pd.read_csv(url, names=['sepal length', 'sepal width',
                             'petal length', 'petal width', 'target'])

features = ['sepal length', 'sepal width', 'petal length', 'petal width']
x = df.loc[:, features].values
y = df.loc[:, ['target']].values
x = StandardScaler().fit_transform(x)   # PCA is scale-sensitive: standardize first

pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data=principalComponents,
                           columns=['principal component 1', 'principal component 2'])
finalDf = pd.concat([principalDf, df[['target']]], axis=1)

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)   # restored: this line was missing from the listing
ax.set_xlabel('principal component 1', fontsize=15)
ax.set_ylabel('principal component 2', fontsize=15)
ax.set_title('2 component PCA', fontsize=20)
targets = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
colors = ['r', 'g', 'b']
for target, color in zip(targets, colors):
    indicesToKeep = finalDf['target'] == target
    ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1'],
               finalDf.loc[indicesToKeep, 'principal component 2'],
               c=color, s=50)
ax.legend(targets)
ax.grid()

# 5. APRIORI ALGORITHM
import pandas as pd
from collections import Counter

# NOTE(review): `df` (columns TID, IID) must be loaded in an earlier cell —
# it is not defined anywhere in this listing.
df_id = df['TID']
df_iid = df['IID']
df

# each transaction's IID is a comma-separated item string -> list of items
iid = df_iid.to_dict().values()
iid = [x.split(',') for x in iid]
iid

cntr = Counter()
for items in iid:            # renamed from `list` (shadowed the builtin)
    cntr.update(items)
min_support = 2
c1 = dict(sorted(cntr.items(), key=lambda i: i[0]))
l1 = {x: c1[x] for x in c1 if c1[x] >= min_support}
l1 = pd.DataFrame(l1.items(), columns=['TID', 'Support'])
l1

def frequent_itemset(previous_itemset, itemset_length, minimum_support, transactions=None):
    """Return the frequent itemsets of size `itemset_length`.

    previous_itemset: DataFrame whose first column holds the frequent
        (k-1)-itemsets (strings or tuples of item labels).
    itemset_length: target itemset size k.
    minimum_support: candidates below this count are pruned; if none survive,
        a message is printed and None is returned (original behavior).
    transactions: list of item-lists; defaults to the module-level `iid`
        (new optional parameter — backward compatible).
    """
    if transactions is None:
        transactions = iid
    prev_keys = [*previous_itemset.iloc[:, 0]]
    # candidate generation: union every pair of previous keys and keep the
    # unions that reach exactly the target size
    keys = []
    for i in range(len(prev_keys)):
        for j in range(len(prev_keys)):
            for item in prev_keys[j]:
                key_set = set(prev_keys[i]).union(set(item))
                if len(key_set) == itemset_length and key_set not in keys:
                    keys.append(key_set)
    # support counting over the transaction list
    candidate_counts = {tuple(sorted(k)): 0 for k in keys}
    for iid_set in transactions:
        for key in keys:
            if all(item in iid_set for item in key):
                candidate_counts[tuple(sorted(key))] += 1
    candidate_itemset = pd.DataFrame(candidate_counts.items(), columns=['TID', 'Support'])
    freq_itemset = candidate_itemset.loc[candidate_itemset['Support'] >= minimum_support]
    if len(freq_itemset) == 0:
        print("Support values of itemset is less than minimum support", minimum_support)
    else:
        return freq_itemset
l2 = frequent_itemset(l1, 2, 2)
l2

l3 = frequent_itemset(l2, 3, 2)
l3

# fixed: size-4 candidates must be grown from l3, not from the singletons in l1
l4 = frequent_itemset(l3, 4, 2)
l4

frequent_pattern = pd.concat([l1, l2, l3], axis=0)
frequent_pattern

# 6. ID3 ALGORITHM
import numpy as np
import pandas as pd
import math

# NOTE(review): `df` (a table with CLASS/age/INCOME/STUDENT/CREDIT columns)
# must be loaded in an earlier cell — it is not defined in this listing.
count = len(df)
# .tolist() instead of list(...): the builtin `list` is shadowed by an earlier cell
y = df["CLASS"].tolist().count('yes')
n = df["CLASS"].tolist().count('no')

# entropy of the class attribute; `log` was undefined — use math.log
info_d = -(y / count * math.log(y / count, 2)) - (n / count * math.log(n / count, 2))
info_d

# class counts per age value (the listing repeated this cell verbatim; one copy kept)
youth = df["age"].tolist().count('youth')
middle = df["age"].tolist().count('middle')
senior = df["age"].tolist().count('senior')
youth, middle, senior

def gain(info_class, info_feature):
    """Information gain: class entropy minus the feature's conditional entropy."""
    return info_class - info_feature

def infoD(df, feature):
    """Shannon entropy (base 2) of one column of df.

    df: pandas DataFrame; feature: column name. Returns a float >= 0.
    """
    info_d = 0
    total = len(df)
    values = df[feature].tolist()     # hoisted; also avoids the shadowed builtin `list`
    for state in set(values):
        state_count = values.count(state)
        # `log` was previously an undefined name — use math.log explicitly
        info_d += -(state_count / total * math.log(state_count / total, 2))
    return info_d
# entropy of the class labels (the listing repeated this cell; one copy kept)
info_class = infoD(df, 'CLASS')
info_class

def info_feature(df, feature):
    """Expected (weighted) entropy of CLASS after splitting df on `feature`.

    For each value of `feature`, weights the entropy of the CLASS labels in
    that partition by the partition's size. Returns a float >= 0.
    """
    expected = 0   # renamed: the original accumulator shadowed the function's own name
    total = len(df)

    feature_states = set(df[feature].tolist())
    class_states = set(df['CLASS'].tolist())

    for feature_state in feature_states:
        df_feature = df.loc[df[feature] == feature_state]
        feature_count = len(df_feature)
        class_values = df_feature['CLASS'].tolist()
        for state in class_states:
            state_count = class_values.count(state)
            if state_count != 0:   # 0 * log(0) is taken as 0
                # `log` was previously undefined — use math.log explicitly
                expected += (feature_count / total) * -(
                    state_count / feature_count
                    * math.log(state_count / feature_count, 2))
    return expected

gain_income = gain(info_class, info_feature(df, 'INCOME'))
gain_age = gain(info_class, info_feature(df, 'age'))
gain_student = gain(info_class, info_feature(df, 'STUDENT'))
gain_credit = gain(info_class, info_feature(df, 'CREDIT'))

# renamed to `gains`: the original rebound `gain`, clobbering the gain() function
gains = {'INCOME': gain_income, 'STUDENT': gain_student,
         'CREDIT': gain_credit, 'age': gain_age}
gains = pd.DataFrame(gains.items(), columns=['Feature', 'Gain'])

# the attribute with the highest information gain becomes the tree's root
root = gains.loc[gains['Gain'] == max(gains['Gain'])]
root_feature = root['Feature'].tolist()[0]

root_feature

# ID3 program 2: entropy-based decision tree via scikit-learn
from sklearn import tree
from sklearn.datasets import load_iris   # added: `iris` was used but never defined
from matplotlib import pyplot as plt

iris = load_iris()
x = iris.data
y = iris.target
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=4, min_samples_leaf=4)
clf.fit(x, y)

fig, ax = plt.subplots(figsize=(6, 6))
tree.plot_tree(clf, ax=ax,
               feature_names=['Sepal_length', 'sepal_width', 'petal_length', 'petal_width'])
plt.show()

# 7. KNN
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.datasets import make_blobs
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# 4 Gaussian blobs in 2-D as a toy classification set
X, y = make_blobs(n_samples=500, n_features=2, centers=4,
                  cluster_std=1.5, random_state=4)

# matplotlib >= 3.6 renamed the seaborn style sheets
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
plt.figure(figsize=(7, 7))
plt.scatter(X[:, 0], X[:, 1], c=y, marker='*', s=100, edgecolors='black')
plt.show()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# added: knn5 / knn1 were fitted below but never constructed
knn5 = KNeighborsClassifier(n_neighbors=5)
knn1 = KNeighborsClassifier(n_neighbors=1)

knn5.fit(X_train, y_train)
knn1.fit(X_train, y_train)

y_pred_5 = knn5.predict(X_test)
y_pred_1 = knn1.predict(X_test)

from sklearn.metrics import accuracy_score
print("Accuracy with k=5", accuracy_score(y_test, y_pred_5) * 100)
print("Accuracy with k=1", accuracy_score(y_test, y_pred_1) * 100)

plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_pred_5, marker='*', s=100, edgecolors='black')
plt.title("Predicted values with k=5", fontsize=20)

plt.subplot(1, 2, 2)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_pred_1, marker='*', s=100, edgecolors='black')
plt.title("Predicted values with k=1", fontsize=20)
plt.show()

knn2 = KNeighborsClassifier(n_neighbors=2)
knn3 = KNeighborsClassifier(n_neighbors=3)
knn4 = KNeighborsClassifier(n_neighbors=4)

knn2.fit(X_train, y_train)
knn3.fit(X_train, y_train)
knn4.fit(X_train, y_train)

# fixed: each prediction now comes from its own classifier
# (the original reused knn5 / knn1 / knn1 here)
y_pred_2 = knn2.predict(X_test)
y_pred_3 = knn3.predict(X_test)
y_pred_4 = knn4.predict(X_test)

from sklearn.metrics import accuracy_score
print("Accuracy with k=2", accuracy_score(y_test, y_pred_2) * 100)
print("Accuracy with k=3", accuracy_score(y_test, y_pred_3) * 100)
print("Accuracy with k=4", accuracy_score(y_test, y_pred_4) * 100)

plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_pred_2, marker='*', s=100, edgecolors='black')
plt.title("Predicted values with k=2", fontsize=20)

plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_pred_3, marker='*', s=100, edgecolors='black')
plt.title("Predicted values with k=3", fontsize=20)

plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_pred_4, marker='*', s=100, edgecolors='black')
plt.title("Predicted values with k=4", fontsize=20)
plt.show()   # added so the three figures are actually displayed in script mode

8. SIMPLE LINEAR REGRESSION

import numpy as np
import matplotlib.pyplot as plt

def estimate_coef(x, y):
    """Least-squares coefficients (b_0, b_1) for the line y ≈ b_0 + b_1 * x.

    x, y: 1-D numpy arrays of equal length. Returns the (intercept, slope)
    tuple. Assumes x is not constant (SS_xx != 0).
    """
    # number of observations/points
    n = np.size(x)

    # mean of x and y vector
    m_x = np.mean(x)
    m_y = np.mean(y)

    # cross-deviation and deviation about x
    SS_xy = np.sum(y * x) - n * m_y * m_x
    SS_xx = np.sum(x * x) - n * m_x * m_x

    # regression coefficients
    b_1 = SS_xy / SS_xx
    b_0 = m_y - b_1 * m_x

    return (b_0, b_1)

def plot_regression_line(x, y, b):
    """Scatter the (x, y) points and draw the fitted line y = b[0] + b[1] * x.

    b: (intercept, slope) pair, e.g. from estimate_coef. Shows the plot;
    returns None.
    """
    # actual points as a scatter plot
    plt.scatter(x, y, color="m", marker="o", s=30)

    # predicted response vector
    y_pred = b[0] + b[1] * x

    # regression line
    plt.plot(x, y_pred, color="g")

    # axis labels
    plt.xlabel('x')
    plt.ylabel('y')

    # show the plot
    plt.show()

def main():
    """Fit and plot a simple linear regression on a fixed 10-point data set."""
    # observations / data
    x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])

    # estimate coefficients
    b = estimate_coef(x, y)
    print("Estimated coefficients:\nb_0 = {}  \nb_1 = {}".format(b[0], b[1]))

    # plot the regression line
    plot_regression_line(x, y, b)

if __name__ == "__main__":
    main()

# MULTIPLE LINEAR REGRESSION
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import preprocessing

# NOTE(review): `df` must be loaded before this cell — the column names match
# the UCI real-estate valuation data, e.g. df = pd.read_csv("Real estate.csv");
# confirm the path.
df.drop('No', inplace=True, axis=1)   # drop the row-id column

print(df.columns)

sns.scatterplot(x='X4 number of convenience stores',
                y='Y house price of unit area', data=df)

# features = everything except the target column
X = df.drop('Y house price of unit area', axis=1)
y = df['Y house price of unit area']
print(X)
print(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101)

model = LinearRegression()

model.fit(X_train, y_train)

predictions = model.predict(X_test)

print(
    'mean_squared_error : ', mean_squared_error(y_test, predictions))
print(
    'mean_absolute_error : ', mean_absolute_error(y_test, predictions))

# LOGISTIC REGRESSION
import numpy
from sklearn import linear_model

# 1-D feature reshaped to a column vector; y is the binary class label
X = numpy.array([3.78, 2.44, 2.09, 0.14, 1.72, 1.65, 4.92, 4.37,
                 4.96, 4.52, 3.69, 5.88]).reshape(-1, 1)
y = numpy.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

logr = linear_model.LogisticRegression()
logr.fit(X, y)

def logit2prob(logr, x):
    """Convert a fitted logistic model's log-odds at x into probabilities.

    logr: object exposing coef_ and intercept_ (e.g. a fitted
    LogisticRegression). x: scalar or array broadcastable against coef_.
    Returns probabilities in (0, 1) with the broadcast shape.
    """
    log_odds = logr.coef_ * x + logr.intercept_
    odds = numpy.exp(log_odds)
    probability = odds / (1 + odds)
    return (probability)

# probability of class 1 for every training point
probabilities = logit2prob(logr, X)
print(probabilities)

```