Untitled

mail@pastecode.io avatar
unknown
plain_text
a year ago
14 kB
4
Indexable
Never
DATA MINING LAB:
1.	Central tendency :
# Observations used throughout the central-tendency section (listed in
# ascending order).
sample = [
    13, 15, 16, 16, 19, 20, 20, 21, 22, 22, 25, 25, 25, 25, 30, 33,
    33, 35, 35, 35, 35, 36, 40, 45, 46, 52, 55, 60, 62, 65, 68, 70,
]
# Echo every observation on its own line.
print(*sample, sep="\n")
MEAN:
# Arithmetic mean of `sample`.
# The accumulator is called `total` rather than `sum`, so the built-in sum()
# is not overwritten for the rest of the session.
total = 0
for value in sample:
    total += value
n = len(sample)
mean = total / n
mean
MEDIAN
# Median of `sample`. The positional formula is only valid on ordered data,
# so it is applied to a sorted copy instead of trusting the input order.
ordered = sorted(sample)
n = len(ordered)
middle = n // 2
if n % 2 == 0:
    # Even count: average the two central observations.
    med1 = ordered[middle]
    med2 = ordered[middle - 1]
    median = (med1 + med2) / 2
else:
    # Odd count: the single central observation.
    median = ordered[middle]
print(median)
MODE:
from collections import Counter

# Frequency table: value -> number of occurrences in `sample`.
data = Counter(sample)

# Highest frequency, computed once instead of once per distinct value.
highest_count = max(data.values())
mode = [value for value, count in data.items() if count == highest_count]

# If there are as many "most frequent" values as observations, every value is
# distinct and the sample has no mode. The length is taken from `sample`
# directly instead of a global `n` set elsewhere.
if len(mode) == len(sample):
    get_mode = "No mode found"
else:
    get_mode = "Mode is:" + ','.join(map(str, mode))

print(get_mode)
Variance:
# Population variance: the average squared deviation from `mean`
# (divides by n, not n - 1).
# A dedicated accumulator is used so the built-in sum() is not overwritten.
ans = [(value - mean) ** 2 for value in sample]
squared_total = 0
for squared_deviation in ans:
    squared_total += squared_deviation
variance = squared_total / n
variance
Standard Deviation:
import math

# Standard deviation: square root of the variance computed above.
standard_deviation = math.sqrt(variance)
# The original (misspelled) name is kept as an alias for existing references.
standard_devatation = standard_deviation
standard_devatation

IQR:
# 5 number Summary calculation
import numpy as np
data = [13, 15,16,16,19,20,20,21,22,22,25,25,25,25,30,33,33,35,35,35,35,36,40,45,46,52,70]
# The quartiles below are positional, so make sure the values are ordered.
data = sorted(data)
# Split into a lower and an upper half of equal size. For an odd count the
# middle observation (the median) belongs to neither half.
half = len(data) // 2
Q1 = np.median(data[:half])
Q2 = np.median(data)
Q3 = np.median(data[len(data) - half:])
IQR = Q3 - Q1
print("IQR",IQR)
print("Q1 =",Q1)
print("Q3 =",Q3)
# The median completes the five-number summary (min, Q1, median, Q3, max).
print("Median =",Q2)
minimum = np.min(data)
maximum =np.max(data)
print("Minimum:", minimum)
print("Maximum:", maximum)

# Sorted copy of `sample` as a NumPy array.
new_sample = np.sort(sample)
new_sample

import matplotlib.pyplot as plt
# Box-and-whisker plot of the sorted observations.
plt.boxplot(new_sample)
plt.show()

2.	BINNING
# Sorted data to partition and the number of bins to create.
# NOTE: `list` shadows the built-in type; the name is kept because the
# smoothing steps that follow read it.
list = [4,8,15,21,21,24,25,28,34]
m=3
def equifreq(arr1,m):
    """Split ``arr1`` into ``m`` consecutive equal-frequency bins.

    Each bin is printed as it is built and the bins are also returned.
    When ``len(arr1)`` is not a multiple of ``m`` the first
    ``len(arr1) % m`` bins receive one extra element, so no value is
    dropped.

    Args:
        arr1: Indexable sequence of values, expected in ascending order.
        m: Number of bins; must be a positive integer.

    Returns:
        A list of ``m`` lists holding the elements of ``arr1`` in order.

    Raises:
        ValueError: If ``m`` is not positive.
    """
    if m <= 0:
        raise ValueError("m must be a positive integer")
    base_size, extra = divmod(len(arr1), m)
    bins = []
    start = 0
    for index in range(m):
        # The first `extra` bins absorb the remainder, one element each.
        stop = start + base_size + (1 if index < extra else 0)
        current = [arr1[position] for position in range(start, stop)]
        print(current)
        bins.append(current)
        start = stop
    return bins
print("Equal binning:")
equalbinning= equifreq(list,m)
equalbinning

import numpy as np
# One 3x3 result grid per smoothing method: a row per bin, a column per member.
bin1 = np.zeros((3,3))
bin2 = np.zeros((3,3))
bin3 = np.zeros((3,3))

# Smoothing by bin means: every member of a bin is replaced by the bin's mean.
# The loop temporaries have their own names so the globals `m` and `mean`
# are not overwritten as a side effect.
bin_size = 3
for row, start in enumerate(range(0, 9, bin_size)):
    bin_mean = (list[start] + list[start + 1] + list[start + 2]) / 3
    bin1[row, :] = bin_mean
print("BIN MEAN:")
print(bin1)

# Smoothing by bin medians: each 3-value bin is filled with its middle element.
for start in range(0, 9, 3):
    row = start // 3
    middle_value = list[start + 1]
    for col in range(3):
        bin2[row, col] = middle_value
print("BIN MEDIAN:")
print(bin2)

# Smoothing by bin boundaries: each member is replaced by the bin edge
# (first or last element of the bin) it lies closer to.
for start in range(0, 9, 3):
    row = start // 3
    low = list[start]
    high = list[start + 2]
    for col in range(3):
        value = list[start + col]
        # Strictly closer to the lower edge -> lower edge; ties go to the upper edge.
        bin3[row, col] = low if (value - low) < (high - value) else high
print("BIN BOUNDRY:")
print(bin3)


3.	NORMALIZATION
# Values to normalize; the normalization steps below read this name.
# NOTE(review): `list` shadows the built-in type from here on, so any later
# call such as list(...) will fail — consider renaming together with its users.
list = [200,300,400,600,1000]
MIN MAX SCALER
import numpy as np
# Min-max scaling: v' = (v - min) / (max - min), mapping the values onto [0, 1].
# The extremes are computed once rather than on every loop iteration.
lowest = min(list)
highest = max(list)
print("minimum",lowest)
print("maximum",highest)
value_range = highest - lowest
if value_range == 0:
    # All values identical: avoid dividing by zero and map everything to 0.
    n = [0.0 for _ in list]
else:
    n = [(value - lowest) / value_range for value in list]
print("Min max Scaler is:",n)

Z SCORE NORMALIZATION
import numpy as np
# MEAN:
# Accumulate under a dedicated name so the built-in sum() is not overwritten.
total = 0
for value in list:
    total += value
count = len(list)
mean = total / count
mean
# Population standard deviation (np.std uses ddof=0 unless told otherwise).
SD =np.std(list)
if SD == 0:
    # No spread: every value equals the mean, so every z-score is 0.
    n = [0.0 for _ in list]
else:
    # z = (v - mean) / standard deviation
    n = [(value - mean) / SD for value in list]
print("Z-score Normalizatation:\n", n)

DECIMAL SCALING 
# Decimal scaling: v' = v / 10**j, where j is the smallest integer that brings
# every scaled magnitude strictly below 1. It is derived from the data instead
# of being hard-coded.
largest_magnitude = max((abs(value) for value in list), default=0)
j = 0
while largest_magnitude / 10**j >= 1:
    j += 1
n = [value / 10**j for value in list]
print(n)

# Same scaling with the power of ten computed once up front.
divisor = 10**j
n1 = [value / divisor for value in list]
print(n1)

4.	PCA: 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Iris measurements; column labels are supplied through `names`.
url ="https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
df = pd.read_csv(url,names=['sepal length','sepal width','petal length','petal width','target'])

# The feature labels must match the labels given to read_csv exactly.
features = ['sepal length','sepal width','petal length','petal width']
x = df.loc[:,features].values
y = df.loc[:,['target']].values
# Standardize the features before PCA.
x = StandardScaler().fit_transform(x)

# Project the four standardized features onto two principal components.
pca=PCA(n_components=2)
principleComponent=pca.fit_transform(x)
pricipleDF=pd.DataFrame(data = principleComponent, columns=['principal component 1','principal component 2'])
finalDF = pd.concat([pricipleDF, df[['target']]],axis=1)

fig=plt.figure(figsize=(8,8))
ax=fig.add_subplot(1,1,1)
ax.set_xlabel('principle component 1', fontsize = 15)
ax.set_ylabel('principle component 2', fontsize = 15)
ax.set_title('2 component PCA', fontsize=20)
targets =['Iris-setosa','Iris-versicolor','Iris-virginica']
colors=['r','g','b']
# One scatter series per species; the column names used here are the ones
# given to the component DataFrame above.
for target, color in zip(targets,colors):
    indicesToKeep = finalDF['target'] == target
    ax.scatter(finalDF.loc[indicesToKeep, 'principal component 1']
               , finalDF.loc[indicesToKeep, 'principal component 2']
               , c = color
               , s = 50)
ax.legend(targets)
ax.grid()
plt.show()

5. APRIORI ALGORITHM
import pandas as pd
# Transaction table; the code below reads its 'TID' and 'IID' columns.
# NOTE(review): absolute Windows path — adjust for the local machine.
df = pd.read_csv("D:\\python\\MACHINE LEARNING\\DATASETS\\apori.csv")
df_id = df['TID']
df_iid = df['IID']
df

# One list of item labels per row of the IID column, split on commas.
iid = [entry.split(',') for entry in df_iid.to_dict().values()]
iid

from collections import Counter
# Count how often each item label occurs across all transactions.
# The loop variable is not called `list`, so the built-in type is not rebound
# here (other parts of this file call list(...)).
cntr = Counter()
for transaction in iid:
    cntr.update(transaction)
min_support = 2
# Candidate 1-itemsets ordered by label, then filtered by minimum support.
c1 = dict(sorted(cntr.items(),key = lambda i:i[0]))
l1 = {item: support for item, support in c1.items() if support >= min_support}
l1 = pd.DataFrame(l1.items(),columns=['TID','Support'])
l1

def frequent_itemset(previous_itemset,itemset_length,minimum_support,transactions=None):
    """Build the frequent itemsets of size ``itemset_length``.

    Candidates are formed by adding one item (taken from any previous
    itemset) to each previous itemset; a candidate is kept when it has
    exactly ``itemset_length`` items. Support is the number of transactions
    containing every item of the candidate.

    Args:
        previous_itemset: DataFrame whose first column holds the previous
            level's itemsets, either bare labels (1-itemsets) or tuples.
        itemset_length: Number of items each candidate must contain.
        minimum_support: Minimum transaction count for a candidate to be kept.
        transactions: Iterable of item collections. Defaults to the
            module-level ``iid``.

    Returns:
        DataFrame with columns ``'TID'`` (sorted item tuple) and
        ``'Support'``, or ``None`` (after printing a message) when no
        candidate reaches ``minimum_support``.
    """
    if transactions is None:
        transactions = iid

    def _as_tuple(key):
        # A bare label is a 1-itemset. Treating it as an iterable would split
        # a multi-character label such as 'I1' into single characters.
        if isinstance(key, (tuple, set, frozenset)) or type(key).__name__ == 'list':
            return tuple(key)
        return (key,)

    prev_keys = [_as_tuple(key) for key in previous_itemset.iloc[:, 0]]

    # Distinct items of the previous level, in order of first appearance.
    items = []
    for key in prev_keys:
        for item in key:
            if item not in items:
                items.append(item)

    keys = []
    seen = set()
    for base in prev_keys:
        base_set = frozenset(base)
        for item in items:
            key_set = base_set | {item}
            if len(key_set) == itemset_length and key_set not in seen:
                seen.add(key_set)
                keys.append(key_set)

    if not keys:
        print("Support values of itemset is less than minimum support",minimum_support)
        return None

    candidate_itemset = {tuple(sorted(key)): 0 for key in keys}
    for iid_set in transactions:
        basket = set(iid_set)
        for key in keys:
            if key <= basket:
                candidate_itemset[tuple(sorted(key))] += 1

    candidate_itemset = pd.DataFrame([*candidate_itemset.items()],columns=['TID','Support'])
    freq_itemset = candidate_itemset.loc[candidate_itemset['Support']>=minimum_support]
    if len(freq_itemset)==0:
        print("Support values of itemset is less than minimum support",minimum_support)
        return None
    return freq_itemset
# Level-wise search: each level is grown from the frequent itemsets of the
# previous one. frequent_itemset returns None when a level is empty, so the
# next level is only attempted when the previous one exists.
l2=frequent_itemset(l1,2,2)
l2

l3 = frequent_itemset(l2,3,2) if l2 is not None else None
l3

# 4-itemsets have to be grown from the frequent 3-itemsets; starting from the
# 1-itemsets in l1 can never produce a 4-item candidate.
l4 = frequent_itemset(l3,4,2) if l3 is not None else None
l4

# All frequent itemsets found, skipping the levels that came back empty.
frequent_pattern = pd.concat([level for level in (l1,l2,l3,l4) if level is not None],axis=0)
frequent_pattern



6. ID3 ALGORITHM:
import numpy as np 
import pandas as pd
from math import log

# NOTE(review): absolute Windows path — adjust for the local machine.
df = pd.read_csv("D:\\python\\MACHINE LEARNING\\DATASETS\\ID3.csv")
df.head()
count=len(df)
# Series.tolist() is used instead of list(...) because other parts of this
# file rebind the name `list`.
class_labels = df["CLASS"].tolist()
y=class_labels.count('yes')
n=class_labels.count('no')

# Info(D): entropy of the yes/no class distribution. A class with no rows
# contributes nothing, and skipping it avoids evaluating log(0).
info_d = 0
for class_count in (y, n):
    if class_count:
        info_d -= class_count/count * log(class_count/count,2)
info_d

# Number of rows in each age group.
ages = df["age"].tolist()
youth=ages.count('youth')
middle=ages.count('middle')
senior=ages.count('senior')

youth,middle,senior

def gain(info_class,info_feature):
    """Return the information gain: class entropy minus feature entropy."""
    information_gain = info_class - info_feature
    return information_gain

def infoD(df,feature):
    """Return the entropy (in bits) of column ``feature`` of ``df``.

    Computes ``-sum(p * log2(p))`` over the distinct values of the column,
    where ``p`` is the fraction of rows holding that value.

    Args:
        df: pandas DataFrame containing the column.
        feature: Name of the column to measure.

    Returns:
        The entropy as a number; 0 for an empty frame.
    """
    # Imported locally because no import of `log` is visible in this file.
    from math import log2

    total = len(df)
    info_d = 0
    # value_counts replaces the list(...)/count scan, which breaks once the
    # name `list` has been rebound.
    for state_count in df[feature].value_counts(dropna=False):
        if state_count:
            probability = state_count / total
            info_d -= probability * log2(probability)
    return info_d
# Entropy of the 'CLASS' column over the whole data set.
info_class=infoD(df,'CLASS')
info_class

# NOTE(review): repeats the computation above with the same arguments.
info_class = infoD(df,'CLASS')
info_class

def info_feature(df,feature):
    """Return the expected class entropy (in bits) after splitting on ``feature``.

    For every distinct value of ``feature`` the entropy of the ``'CLASS'``
    column within the matching rows is computed and weighted by the share
    of rows holding that value.

    Args:
        df: pandas DataFrame with a ``'CLASS'`` column and the feature column.
        feature: Name of the column to split on.

    Returns:
        The weighted entropy as a number; 0 for an empty frame.
    """
    # Imported locally because no import of `log` is visible in this file.
    from math import log2

    total = len(df)
    weighted_entropy = 0
    for feature_state in df[feature].unique():
        class_labels = df.loc[df[feature] == feature_state, 'CLASS']
        feature_count = len(class_labels)
        if feature_count == 0:
            # Nothing compares equal to this value (e.g. NaN): no rows to weigh.
            continue
        subset_entropy = 0
        for state_count in class_labels.value_counts(dropna=False):
            if state_count:
                probability = state_count / feature_count
                subset_entropy -= probability * log2(probability)
        weighted_entropy += (feature_count / total) * subset_entropy
    return weighted_entropy

# Information gain of every candidate attribute.
gain_income=gain(info_class,info_feature(df,'INCOME'))
gain_age=gain(info_class,info_feature(df,'age'))
gain_student=gain(info_class,info_feature(df,'STUDENT'))
gain_credit=gain(info_class,info_feature(df,'CREDIT'))

# Collected under `gains` so the function `gain` is not overwritten and can
# still be called afterwards.
gains={'INCOME':gain_income,'STUDENT':gain_student,'CREDIT':gain_credit,'age':gain_age}
gains=pd.DataFrame(gains.items(),columns=['Feature','Gain'])

# The attribute with the largest gain becomes the root; on a tie the first
# matching row is used.
root=gains.loc[gains['Gain']==max(gains['Gain'])]
root_feature=root['Feature'].tolist()[0]

root_feature

ID3 program 2:
from sklearn.datasets import load_iris
from sklearn import tree
from matplotlib import pyplot as plt

# Fit an entropy-criterion decision tree on the bundled iris data.
iris = load_iris()
x, y = iris.data, iris.target
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=4, min_samples_leaf=4)
clf.fit(x, y)

# Draw the fitted tree on a 6x6 inch figure.
column_labels = ['Sepal_length','sepal_width','petal_length', 'petal_width']
fig, ax = plt.subplots(figsize=(6,6))
tree.plot_tree(clf, ax=ax, feature_names=column_labels)
plt.show()

7. KNN
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.datasets import make_blobs
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split


# Synthetic 2-D data set: 500 points drawn around 4 cluster centres.
X, y = make_blobs(n_samples = 500, n_features = 2, centers = 4,cluster_std = 1.5, random_state = 4)


# Newer Matplotlib releases ship the seaborn style sheets under the name
# "seaborn-v0_8"; use whichever name this installation provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
plt.figure(figsize = (7,7))
plt.scatter(X[:,0], X[:,1], c=y, marker= '*',s=100,edgecolors='black')
plt.show()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

# The two classifiers being compared must exist before they can be fitted.
knn5 = KNeighborsClassifier(n_neighbors = 5)
knn1 = KNeighborsClassifier(n_neighbors = 1)

knn5.fit(X_train, y_train)
knn1.fit(X_train, y_train)

y_pred_5 = knn5.predict(X_test)
y_pred_1 = knn1.predict(X_test)


from sklearn.metrics import accuracy_score
print("Accuracy with k=5", accuracy_score(y_test, y_pred_5)*100)
print("Accuracy with k=1", accuracy_score(y_test, y_pred_1)*100)

# Test points coloured by predicted label, one panel per k.
plt.figure(figsize = (15,5))
plt.subplot(1,2,1)
plt.scatter(X_test[:,0], X_test[:,1], c=y_pred_5, marker= '*', s=100,edgecolors='black')
plt.title("Predicted values with k=5", fontsize=20)

plt.subplot(1,2,2)
plt.scatter(X_test[:,0], X_test[:,1], c=y_pred_1, marker= '*', s=100,edgecolors='black')
plt.title("Predicted values with k=1", fontsize=20)
plt.show()

knn2 = KNeighborsClassifier(n_neighbors = 2)
knn3 = KNeighborsClassifier(n_neighbors=3)
knn4 = KNeighborsClassifier(n_neighbors = 4)

knn2.fit(X_train, y_train)
knn3.fit(X_train, y_train)
knn4.fit(X_train, y_train)

# Each prediction has to come from the classifier with the matching k.
y_pred_2 = knn2.predict(X_test)
y_pred_3 = knn3.predict(X_test)
y_pred_4 = knn4.predict(X_test)

from sklearn.metrics import accuracy_score
print("Accuracy with k=2", accuracy_score(y_test, y_pred_2)*100)
print("Accuracy with k=3", accuracy_score(y_test, y_pred_3)*100)
print("Accuracy with k=4", accuracy_score(y_test, y_pred_4)*100)

# Test points coloured by predicted label: one figure, one panel per k.
plt.figure(figsize = (15,5))
for position, (k, predicted) in enumerate(((2, y_pred_2), (3, y_pred_3), (4, y_pred_4)), start=1):
    plt.subplot(1,3,position)
    plt.scatter(X_test[:,0], X_test[:,1], c=predicted, marker= '*', s=100,edgecolors='black')
    plt.title("Predicted values with k=%d" % k, fontsize=20)
plt.show()

8. SIMPLE LINEAR REGRESSION

import numpy as np
import matplotlib.pyplot as plt

def estimate_coef(x, y):
    """Return the least-squares intercept and slope ``(b_0, b_1)`` of y on x."""
    # Number of observations and the two sample means.
    count = np.size(x)
    mean_x, mean_y = np.mean(x), np.mean(y)

    # Sum of cross-deviations of x and y, and of squared deviations of x.
    cross_deviation = np.sum(y*x) - count*mean_y*mean_x
    x_deviation = np.sum(x*x) - count*mean_x*mean_x

    # Slope first; the intercept then follows from the means.
    slope = cross_deviation / x_deviation
    intercept = mean_y - slope*mean_x

    return (intercept, slope)

def plot_regression_line(x, y, b):
    """Scatter the observations and draw the line ``b[0] + b[1]*x`` over them."""
    # Observed points as magenta circles.
    plt.scatter(x, y, color = "m", marker = "o", s = 30)

    # Fitted response for every x, drawn as a green line.
    intercept, slope = b[0], b[1]
    fitted = intercept + slope*x
    plt.plot(x, fitted, color = "g")

    # Axis labels, then display the figure.
    plt.xlabel('x')
    plt.ylabel('y')
    plt.show()

def main():
    """Fit a simple linear regression to ten sample points, report and plot it."""
    # Observations: predictor values 0..9 and their responses.
    x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])

    # Estimate intercept and slope, then report them.
    b_0, b_1 = estimate_coef(x, y)
    print("Estimated coefficients:\nb_0 = {}  \
          \nb_1 = {}".format(b_0, b_1))

    # Draw the data together with the fitted line.
    plot_regression_line(x, y, (b_0, b_1))

if __name__ == "__main__":
    main()


MULTIPLE LINEAR REGRESSION:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import preprocessing

# Load the data and discard the 'No' column.
df = pd.read_csv('C:\\Users\\chira\\Downloads\\Real-estate1.csv')
df = df.drop(columns='No')

print(df.head())
print(df.columns)

target_column = 'Y house price of unit area'

# Quick look at one predictor against the target.
sns.scatterplot(x='X4 number of convenience stores',
                y=target_column, data=df)

# Every remaining column is a predictor; the target is kept separately.
X = df.drop(columns=target_column)
y = df[target_column]
print(X)
print(y)

# Hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101)

model = LinearRegression()
model.fit(X_train, y_train)

predictions = model.predict(X_test)

# Error of the held-out predictions.
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
print('mean_squared_error : ', mse)
print('mean_absolute_error : ', mae)

LOGISTIC REGRESSION

import numpy
from sklearn import linear_model

# Single-feature training data, reshaped into one column, with binary labels:
# the first six samples are class 0, the last six class 1.
measurements = [3.78, 2.44, 2.09, 0.14, 1.72, 1.65, 4.92, 4.37, 4.96, 4.52, 3.69, 5.88]
X = numpy.reshape(numpy.array(measurements), (-1, 1))
y = numpy.array([0] * 6 + [1] * 6)

logr = linear_model.LogisticRegression()
logr.fit(X, y)

def logit2prob(logr,x):
  """Convert the fitted model's log-odds at ``x`` into probabilities.

  Args:
    logr: Object exposing ``coef_`` and ``intercept_`` (for example a fitted
      logistic-regression estimator).
    x: Value or array of feature values to evaluate.

  Returns:
    Probabilities with the broadcast shape of ``coef_ * x + intercept_``.
  """
  log_odds = logr.coef_ * x + logr.intercept_
  # 1 / (1 + e^-z) is algebraically the same as odds / (1 + odds) but stays
  # finite for large z, where exp(z) overflows and inf / inf would give NaN.
  # Overflow of exp(-z) for very negative z correctly yields probability 0.
  with numpy.errstate(over='ignore'):
    probability = 1.0 / (1.0 + numpy.exp(-log_odds))
  return(probability)


# Predicted probability for each training measurement.
probabilities = logit2prob(logr, X)
print(probabilities)