Untitled
unknown
plain_text
3 years ago
14 kB
14
Indexable
DATA MINING LAB:
1. Central tendency :
# Observations used by the central-tendency exercises (listed in ascending order).
sample = [
    13, 15, 16, 16, 19, 20, 20, 21, 22, 22, 25, 25, 25, 25, 30, 33,
    33, 35, 35, 35, 35, 36, 40, 45, 46, 52, 55, 60, 62, 65, 68, 70,
]
# Print one observation per line.
for i in sample:
    print(i)
MEAN:
# Arithmetic mean of the observations.
# The running total lives in `total` so the built-in sum() is not overwritten.
total = 0
for value in sample:
    total += value
n = len(sample)
mean = total / n
mean
MEDIAN
# Median: the middle observation, or the average of the two middle
# observations when the count is even.
ordered = sorted(sample)  # do not rely on the input already being sorted
n = len(ordered)
middle = n // 2
if n % 2 == 0:
    median = (ordered[middle - 1] + ordered[middle]) / 2
else:
    median = ordered[middle]
print(median)
MODE:
from collections import Counter

# Mode: the value(s) with the highest frequency.
data = Counter(sample)
highest_count = max(data.values())  # computed once instead of once per key
mode = [value for value, count in data.items() if count == highest_count]
if len(mode) == len(sample):
    # Every observation occurs exactly once, so no value stands out.
    get_mode = "No mode found"
else:
    get_mode = "Mode is:" + ','.join(map(str, mode))
print(get_mode)
Variance:
# Population variance: the mean of the squared deviations from `mean`
# (divides by the number of observations, not by n - 1).
ans = [(value - mean) ** 2 for value in sample]
squared_total = 0  # named so the built-in sum() stays usable
for squared_deviation in ans:
    squared_total += squared_deviation
variance = squared_total / len(sample)
variance
Standard Deviation:
import math

# Standard deviation: the square root of `variance`.
standard_deviation = math.sqrt(variance)
# Alias that keeps the original (misspelled) variable name available.
standard_devatation = standard_deviation
standard_devatation
IQR:
# Five-number summary (minimum, Q1, median, Q3, maximum) and interquartile range.
import numpy as np

data = [13, 15, 16, 16, 19, 20, 20, 21, 22, 22, 25, 25, 25, 25, 30, 33, 33, 35, 35, 35, 35, 36, 40, 45, 46, 52, 70]
ordered = np.sort(data)
half = ordered.size // 2
# With an odd number of observations the middle element is left out of both
# halves, so Q1 and Q3 are medians of equally sized halves.
Q1 = np.median(ordered[:half])
Q3 = np.median(ordered[half + ordered.size % 2:])
IQR = Q3 - Q1
print("IQR", IQR)
print("Q1 =", Q1)
print("Q3 =", Q3)
print("Median:", np.median(ordered))
minimum = np.min(data)
maximum = np.max(data)
print("Minimum:", minimum)
print("Maximum:", maximum)
import matplotlib.pyplot as plt

# Sorted copy of the observations, displayed and then drawn as a box plot.
new_sample = np.array(sample)
new_sample.sort()
new_sample
plt.boxplot(new_sample)
plt.show()
2. BINNING
import numpy as np

# Sorted observations to smooth; named `values` so the built-in list() is not overwritten.
values = [4, 8, 15, 21, 21, 24, 25, 28, 34]
m = 3


def equifreq(arr1, m):
    """Split ``arr1`` into ``m`` consecutive equal-frequency bins.

    Each bin is printed as it is built. When ``len(arr1)`` is not a multiple
    of ``m`` the leftover elements are spread over the first bins, one each,
    so no observation is dropped.

    Args:
        arr1: Indexable sequence of observations, expected in sorted order.
        m: Number of bins; must be at least 1.

    Returns:
        A list of ``m`` lists holding the bin contents in order.

    Raises:
        ValueError: If ``m`` is smaller than 1.
    """
    if m < 1:
        raise ValueError("m must be at least 1")
    size, extra = divmod(len(arr1), m)
    bins = []
    start = 0
    for index in range(m):
        stop = start + size + (1 if index < extra else 0)
        arr = [arr1[j] for j in range(start, stop)]
        print(arr)
        bins.append(arr)
        start = stop
    return bins


print("Equal binning:")
equalbinning = equifreq(values, m)
equalbinning

# One row per bin; every smoothing step below works row-wise.
bins = np.array(equalbinning, dtype=float)
bin_width = bins.shape[1]
lowest = bins[:, :1]    # first (smallest) value of each bin, kept as a column
highest = bins[:, -1:]  # last (largest) value of each bin, kept as a column

# Smoothing by bin means: every value is replaced by the mean of its bin.
bin1 = np.repeat(bins.mean(axis=1, keepdims=True), bin_width, axis=1)
print("BIN MEAN:")
print(bin1)

# Smoothing by bin medians: every value is replaced by the median of its bin.
bin2 = np.repeat(np.median(bins, axis=1, keepdims=True), bin_width, axis=1)
print("BIN MEDIAN:")
print(bin2)

# Smoothing by bin boundaries: every value moves to the closer boundary of
# its bin; a value exactly in the middle goes to the upper boundary.
bin3 = np.where(bins - lowest < highest - bins, lowest, highest)
print("BIN BOUNDRY:")
print(bin3)
3. NORMALIZATION
import numpy as np

# Observations to normalise; named `values` so the built-in list() stays usable.
values = [200, 300, 400, 600, 1000]

# MIN-MAX SCALING: map the observations linearly onto [0, 1].
lowest = min(values)
highest = max(values)
print("minimum", lowest)
print("maximum", highest)
value_range = highest - lowest
# A constant data set has no spread; report 0.0 instead of dividing by zero.
min_max_scaled = [
    (value - lowest) / value_range if value_range else 0.0
    for value in values
]
print("Min max Scaler is:", min_max_scaled)

# Z-SCORE NORMALISATION: distance from the mean in units of standard deviation.
mean = float(np.mean(values))
mean
SD = float(np.std(values))  # np.std defaults to the population form (ddof=0)
# With zero spread every observation equals the mean, so its z-score is 0.
z_scores = [(value - mean) / SD if SD else 0.0 for value in values]
print("Z-score Normalizatation:\n", z_scores)

# DECIMAL SCALING: divide by 10**j, where j is the smallest integer that
# brings the largest absolute value strictly below 1.
largest_magnitude = max(abs(value) for value in values)
j = 0
while largest_magnitude / 10 ** j >= 1:
    j += 1
decimal_scaled = [value / 10 ** j for value in values]
print(decimal_scaled)
4. PCA:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Load the iris measurements; the file has no header row, so column names are supplied here.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
features = ['sepal length', 'sepal width', 'petal length', 'petal width']
df = pd.read_csv(url, names=features + ['target'])

# Separate the measurements from the label and standardise the measurements.
x = df.loc[:, features].values
y = df.loc[:, ['target']].values
x = StandardScaler().fit_transform(x)

# Project the standardised measurements onto two principal components.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
# The same names are used to build the frame and to select its columns when plotting.
component_names = ['principal component 1', 'principal component 2']
principalDf = pd.DataFrame(data=principalComponents, columns=component_names)
finalDf = pd.concat([principalDf, df[['target']]], axis=1)

# Scatter plot of the two components, one colour per label value.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel('principle component 1', fontsize=15)
ax.set_ylabel('principle component 2', fontsize=15)
ax.set_title('2 component PCA', fontsize=20)
targets = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
colors = ['r', 'g', 'b']
for target, color in zip(targets, colors):
    indicesToKeep = finalDf['target'] == target
    ax.scatter(
        finalDf.loc[indicesToKeep, component_names[0]],
        finalDf.loc[indicesToKeep, component_names[1]],
        c=color,
        s=50,
    )
ax.legend(targets)
ax.grid()
plt.show()
5. APRIORI ALGORITHM
import pandas as pd
from collections import Counter

df = pd.read_csv("D:\\python\\MACHINE LEARNING\\DATASETS\\apori.csv")
df_id = df['TID']
df_iid = df['IID']
df
# One list of item names per row: each IID cell is split on commas.
iid = [items.split(',') for items in df_iid]
iid
# Count in how many rows each single item appears.
# The loop variable is `transaction` so the built-in list() is not overwritten.
cntr = Counter()
for transaction in iid:
    cntr.update(transaction)
min_support = 2
# Candidate 1-itemsets ordered by item name, then filtered by minimum support.
c1 = dict(sorted(cntr.items(), key=lambda entry: entry[0]))
l1 = {item: count for item, count in c1.items() if count >= min_support}
l1 = pd.DataFrame(l1.items(), columns=['TID', 'Support'])
l1
def frequent_itemset(previous_itemset, itemset_length, minimum_support, transactions=None):
    """Build the frequent itemsets of size ``itemset_length``.

    Candidates are formed by adding one item taken from any previous itemset
    to any previous itemset and keeping the unions that have exactly
    ``itemset_length`` items. Each candidate's support is the number of
    transactions that contain all of its items.

    Args:
        previous_itemset: DataFrame whose first column holds the itemsets of
            the previous level, either bare item names or tuples of item
            names. ``None`` or an empty frame yields no candidates.
        itemset_length: Number of items every candidate must contain.
        minimum_support: Smallest support count for a candidate to be kept.
        transactions: Iterable of item collections to count against.
            Defaults to the module-level ``iid`` list.

    Returns:
        DataFrame with columns ``'TID'`` (sorted tuple of items) and
        ``'Support'``. When nothing reaches the minimum support a message is
        printed and the returned frame is empty, so results can still be
        passed on to the next level or to ``pd.concat``.
    """
    if transactions is None:
        transactions = iid

    def as_items(key):
        # A 1-itemset is stored as a bare string; wrapping it keeps
        # multi-character names whole (set("I1") would give {"I", "1"}).
        return (key,) if isinstance(key, str) else tuple(key)

    if previous_itemset is None or len(previous_itemset) == 0:
        prev_keys = []
    else:
        prev_keys = [as_items(key) for key in previous_itemset.iloc[:, 0]]

    # Generate candidates in discovery order, skipping duplicates.
    keys = []
    seen = set()
    for left in prev_keys:
        left_items = frozenset(left)
        for right in prev_keys:
            for item in right:
                key_set = left_items | {item}
                if len(key_set) == itemset_length and key_set not in seen:
                    seen.add(key_set)
                    keys.append(key_set)

    # Count the transactions that contain every item of each candidate.
    candidate_itemset = {tuple(sorted(key)): 0 for key in keys}
    for iid_set in transactions:
        present = set(iid_set)
        for key in keys:
            if key <= present:
                candidate_itemset[tuple(sorted(key))] += 1

    candidate_itemset = pd.DataFrame([*candidate_itemset.items()], columns=['TID', 'Support'])
    freq_itemset = candidate_itemset.loc[candidate_itemset['Support'] >= minimum_support]
    if len(freq_itemset) == 0:
        print("Support values of itemset is less than minimum support", minimum_support)
    return freq_itemset
# Grow the itemsets level by level; each level is built from the one before it.
# A level is only computed when the previous call returned a result.
l2 = frequent_itemset(l1, 2, 2)
l2
l3 = frequent_itemset(l2, 3, 2) if l2 is not None else None
l3
# 4-itemsets must come from the 3-itemsets: joining 1-itemsets can never
# produce a set of four items.
l4 = frequent_itemset(l3, 4, 2) if l3 is not None else None
l4
# Stack every level that produced a result into one table.
frequent_pattern = pd.concat(
    [level for level in (l1, l2, l3, l4) if level is not None],
    axis=0,
)
frequent_pattern
6. ID3 ALGORITHM:
import numpy as np
import pandas as pd
import math

df = pd.read_csv("D:\\python\\MACHINE LEARNING\\DATASETS\\ID3.csv")
df.head()
count = len(df)
# Number of rows per class label.
y = int((df["CLASS"] == 'yes').sum())
n = int((df["CLASS"] == 'no').sum())
# Entropy of the class column in bits. A label that never occurs is skipped,
# because log2(0) is undefined and its contribution is zero.
info_d = math.fsum(
    -(label_count / count) * math.log2(label_count / count)
    for label_count in (y, n)
    if label_count
)
info_d
# Number of rows per age group.
youth = int((df["age"] == 'youth').sum())
middle = int((df["age"] == 'middle').sum())
senior = int((df["age"] == 'senior').sum())
youth, middle, senior
def gain(info_class, info_feature):
    """Return the information gain of a feature.

    Args:
        info_class: Entropy of the class column.
        info_feature: Expected entropy of the class column after splitting
            on the feature.

    Returns:
        ``info_class - info_feature``.
    """
    return info_class - info_feature
def infoD(df, feature):
    """Return the entropy, in bits, of one column of ``df``.

    Args:
        df: Table supporting ``len(df)`` and ``df[feature]``.
        feature: Name of the column whose value distribution is measured.

    Returns:
        Sum of ``-p * log2(p)`` over the distinct values of the column, where
        ``p`` is the share of rows holding that value; 0 for an empty table.
    """
    # Imported locally so the function does not depend on a module-level
    # `log`, which the file never imports.
    from collections import Counter
    from math import log2

    total = len(df)
    info_d = 0
    # A single counting pass replaces one full scan of the column per value.
    for state_count in Counter(df[feature]).values():
        probability = state_count / total
        info_d -= probability * log2(probability)
    return info_d
# Entropy of the class column; computed once (the original repeated the same statement pair).
info_class = infoD(df, 'CLASS')
info_class
def info_feature(df, feature):
    """Return the expected class entropy after splitting ``df`` on ``feature``.

    For every distinct value of ``feature`` the entropy of the ``'CLASS'``
    column is computed over the rows holding that value and weighted by the
    share of rows in that group.

    Args:
        df: pandas DataFrame containing ``feature`` and a ``'CLASS'`` column.
        feature: Name of the column to split on.

    Returns:
        The weighted sum of the per-group class entropies, in bits.
    """
    # Imported locally so the function does not depend on a module-level
    # `log`, which the file never imports.
    from collections import Counter
    from math import log2

    total = len(df)
    expected_info = 0
    for feature_state in set(df[feature]):
        df_feature = df.loc[df[feature] == feature_state]
        feature_count = len(df_feature)
        weight = feature_count / total
        # Counter only holds labels that actually occur in the group, so no
        # zero count ever reaches log2.
        for state_count in Counter(df_feature['CLASS']).values():
            fraction = state_count / feature_count
            expected_info += weight * -(fraction * log2(fraction))
    return expected_info
# Information gain of every candidate feature.
gain_income = gain(info_class, info_feature(df, 'INCOME'))
gain_age = gain(info_class, info_feature(df, 'age'))
gain_student = gain(info_class, info_feature(df, 'STUDENT'))
gain_credit = gain(info_class, info_feature(df, 'CREDIT'))
# The table gets its own name so the gain() function is not overwritten and
# this cell can be run again.
gains = {'INCOME': gain_income, 'STUDENT': gain_student, 'CREDIT': gain_credit, 'age': gain_age}
gain_table = pd.DataFrame([*gains.items()], columns=['Feature', 'Gain'])
# The root of the tree is the feature with the largest gain (first one on a tie).
root = gain_table.loc[gain_table['Gain'] == gain_table['Gain'].max()]
root_feature = root['Feature'].tolist()[0]
root_feature
ID3 program 2:
from sklearn.datasets import load_iris
from sklearn import tree
from matplotlib import pyplot as plt
# Fit an entropy-criterion decision tree on the iris data set and draw it.
iris = load_iris()
x, y = iris.data, iris.target
clf = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=4,
    min_samples_leaf=4,
)
clf.fit(x, y)
fig, ax = plt.subplots(figsize=(6, 6))
column_labels = ['Sepal_length', 'sepal_width', 'petal_length', 'petal_width']
tree.plot_tree(clf, ax=ax, feature_names=column_labels)
plt.show()
7. KNN
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
# Synthetic two-feature data: 500 points drawn around 4 centres.
X, y = make_blobs(n_samples=500, n_features=2, centers=4, cluster_std=1.5, random_state=4)
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8*" and
# later dropped the old names; use whichever name this installation offers.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
plt.figure(figsize=(7, 7))
plt.scatter(X[:, 0], X[:, 1], c=y, marker='*', s=100, edgecolors='black')
plt.show()
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Create the two classifiers being compared (they were used without ever being built).
knn5 = KNeighborsClassifier(n_neighbors=5)
knn1 = KNeighborsClassifier(n_neighbors=1)
knn5.fit(X_train, y_train)
knn1.fit(X_train, y_train)

y_pred_5 = knn5.predict(X_test)
y_pred_1 = knn1.predict(X_test)
print("Accuracy with k=5", accuracy_score(y_test, y_pred_5) * 100)
print("Accuracy with k=1", accuracy_score(y_test, y_pred_1) * 100)

# Predicted labels of the test points, side by side for the two models.
plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_pred_5, marker='*', s=100, edgecolors='black')
plt.title("Predicted values with k=5", fontsize=20)
plt.subplot(1, 2, 2)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_pred_1, marker='*', s=100, edgecolors='black')
plt.title("Predicted values with k=1", fontsize=20)
plt.show()
from sklearn.metrics import accuracy_score

# One classifier per neighbourhood size.
knn2 = KNeighborsClassifier(n_neighbors=2)
knn3 = KNeighborsClassifier(n_neighbors=3)
knn4 = KNeighborsClassifier(n_neighbors=4)
knn2.fit(X_train, y_train)
knn3.fit(X_train, y_train)
knn4.fit(X_train, y_train)

# Each prediction comes from the model with the matching k
# (the original predicted with knn5 / knn1 here).
y_pred_2 = knn2.predict(X_test)
y_pred_3 = knn3.predict(X_test)
y_pred_4 = knn4.predict(X_test)
print("Accuracy with k=2", accuracy_score(y_test, y_pred_2) * 100)
print("Accuracy with k=3", accuracy_score(y_test, y_pred_3) * 100)
print("Accuracy with k=4", accuracy_score(y_test, y_pred_4) * 100)

# One figure with a panel per model instead of three half-empty figures.
plt.figure(figsize=(15, 5))
panels = ((2, y_pred_2), (3, y_pred_3), (4, y_pred_4))
for position, (k, predicted) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.scatter(X_test[:, 0], X_test[:, 1], c=predicted, marker='*', s=100, edgecolors='black')
    plt.title("Predicted values with k={}".format(k), fontsize=20)
plt.show()
8. SIMPLE LINEAR REGRESSION
import numpy as np
import matplotlib.pyplot as plt
def estimate_coef(x, y):
    """Estimate the coefficients of the least-squares line ``y = b_0 + b_1 * x``.

    Args:
        x: Array-like of predictor values (a plain list works too).
        y: Array-like of response values, same length as ``x``.

    Returns:
        Tuple ``(b_0, b_1)`` holding the intercept and the slope.
    """
    # Convert once so element-wise arithmetic works for lists as well as arrays.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    # number of observations/points
    n = np.size(x)

    # mean of x and y vector
    m_x = np.mean(x)
    m_y = np.mean(y)

    # cross-deviation of x and y, and deviation about x
    SS_xy = np.sum(y * x) - n * m_y * m_x
    SS_xx = np.sum(x * x) - n * m_x * m_x

    # regression coefficients
    b_1 = SS_xy / SS_xx
    b_0 = m_y - b_1 * m_x
    return (b_0, b_1)
def plot_regression_line(x, y, b):
    """Plot the observations together with the fitted regression line.

    Args:
        x: Array-like of predictor values.
        y: Array-like of observed responses.
        b: Pair ``(b_0, b_1)`` holding the intercept and the slope.
    """
    # plotting the actual points as scatter plot
    plt.scatter(x, y, color="m", marker="o", s=30)

    # predicted response vector; the conversion lets a plain list be passed for x
    y_pred = b[0] + b[1] * np.asarray(x)

    # plotting the regression line
    plt.plot(x, y_pred, color="g")

    # putting labels
    plt.xlabel('x')
    plt.ylabel('y')

    # function to show plot
    plt.show()
def main():
    """Fit a line to the sample observations, print its coefficients and plot it."""
    # observations / data
    x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])

    # estimating coefficients
    b = estimate_coef(x, y)
    print("Estimated coefficients:\nb_0 = {} \nb_1 = {}".format(b[0], b[1]))

    # plotting regression line
    plot_regression_line(x, y, b)


if __name__ == "__main__":
    main()
MULTIPLE LINEAR REGRESSION:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import preprocessing
# Load the data and discard the 'No' column.
df = pd.read_csv('C:\\Users\\chira\\Downloads\\Real-estate1.csv')
df.drop('No', axis=1, inplace=True)
print(df.head())
print(df.columns)

# Relationship between one predictor and the response.
sns.scatterplot(
    data=df,
    x='X4 number of convenience stores',
    y='Y house price of unit area',
)

# Predictors are every remaining column; the response is the unit-area price.
X = df.drop('Y house price of unit area', axis=1)
y = df['Y house price of unit area']
print(X)
print(y)

# Hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101)

model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Report both error measures on the held-out rows.
for label, metric in (
    ('mean_squared_error : ', mean_squared_error),
    ('mean_absolute_error : ', mean_absolute_error),
):
    print(label, metric(y_test, predictions))
LOGISTIC REGRESSION
import numpy
from sklearn import linear_model
# Twelve single-feature observations, reshaped into one column, and their 0/1 labels.
measurements = [3.78, 2.44, 2.09, 0.14, 1.72, 1.65, 4.92, 4.37, 4.96, 4.52, 3.69, 5.88]
X = numpy.reshape(numpy.array(measurements), (-1, 1))
y = numpy.array([0] * 6 + [1] * 6)

# Fit a logistic regression model to the observations.
logr = linear_model.LogisticRegression()
logr.fit(X, y)
def logit2prob(logr, x):
    """Convert the model's log-odds for ``x`` into probabilities.

    Args:
        logr: Fitted model exposing ``coef_`` and ``intercept_``.
        x: Array of inputs, broadcast against ``logr.coef_``.

    Returns:
        Array of ``odds / (1 + odds)`` values, where ``odds`` is
        ``exp(coef_ * x + intercept_)``.
    """
    log_odds = logr.coef_ * x + logr.intercept_
    # odds / (1 + odds) equals 0.5 * (1 + tanh(log_odds / 2)). The tanh form
    # never evaluates exp(), so very large log-odds give 1.0 instead of the
    # NaN produced by inf / inf.
    probability = 0.5 * (1.0 + numpy.tanh(0.5 * log_odds))
    return probability
# Print the probabilities that logit2prob computes for the inputs in X.
print(logit2prob(logr, X))
Editor is loading...