Decision Tree Classification for Play Tennis Dataset with One-Hot Encoding
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import metrics
from category_encoders import OneHotEncoder
# Load the dataset
df1 = pd.read_csv('play_tennis.csv')
print("First 5 rows of dataset:")
print(df1.head())
X = df1.iloc[:, :-1]  # kept as a DataFrame so the encoder can build readable one-hot column names
Y = df1.iloc[:, -1].values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print("\n")
print("\nDataset Information:")
print(f"Shape of Training Data: {X_train.shape}")
print(f"Shape of Testing Data: {X_test.shape}")
print(f"Size of Training Data: {X_train.size}")
print(f"Size of Testing Data: {X_test.size}")
print(f"Data type of Training Data: {X_train.dtype}")
print(f"Data type of Testing Data: {X_test.dtype}")
print("\n")
encoder = OneHotEncoder(handle_unknown='ignore', use_cat_names=True)
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)
algo = DecisionTreeClassifier(criterion="entropy", random_state=42, max_depth=4, min_samples_leaf=5)
algo.fit(X_train, Y_train)
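The script above stops after fitting; a minimal evaluation sketch, reusing the names already defined, could look like this:
# Hedged sketch: score the fitted tree on the held-out split from above.
result = algo.score(X_test, Y_test)
print(f"Test accuracy: {result * 100:.3f}%")
Y_pred = algo.predict(X_test)
print("Confusion Matrix:\n", metrics.confusion_matrix(Y_test, Y_pred))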
Data Cleaning and Exploration of Netflix Titles Dataset
import pandas as pd
# Load the dataset
dataset = pd.read_csv('netflix_titles.csv')
# Display the dataset (first 5 rows for clarity in the project)
print("Dataset (first 5 rows):")
print(dataset.head(), "\n")
print("Missing Values in Each Column:")
missing_values = dataset.isnull().sum()
print(missing_values.to_string(), "\n")
df1 = dataset.copy()
print("Before Cleaning:")
print(f"Dataset Shape: {df1.shape}")
duplicated_rows = df1.duplicated().sum()
print(f"Duplicated Rows: {duplicated_rows}\n")
print("Missing Values After Copying:")
print(df1.isnull().sum(), "\n")
df1.dropna(inplace=True)
print("After Cleaning:")
print(f"Dataset Shape: {df1.shape}")
print(f"Missing Values After Cleaning: {df1.isnull().sum()}\n")
duplicates_after_cleaning = df1.duplicated().sum()
print(f"Duplicated Rows After Cleaning: {duplicates_after_cleaning}\n")
OUTPUT_PATH = "CleanedNetflixData.csv"
df1.to_csv(OUTPUT_PATH, index=False)
# Dataframe info: memory usage and column dtypes (info() prints directly and returns None)
print("Dataframe Info:")
df1.info()
# Display unique values for key columns
print("\nUnique Values in 'type' Column:")
print(df1['type'].unique(), "\n")
print("Unique Values in 'country' Column:")
print(df1['country'].unique(), "\n")
print("Unique Values in 'duration' Column:")
print(df1['duration'].unique(), "\n")
print("Unique Values in 'director' Column:")
print(df1['director'].unique(), "\n")
print("Unique Values in 'rating' Column:")
print(df1['rating'].unique(), "\n")
print("Unique Values in 'listed_in' Column:")
print(df1['listed_in'].unique(), "\n")
print("Unique Values in 'title' Column:")
print(df1['title'].unique(), "\n")
print("Unique Counts in Each Column:")
unique_counts = df1.nunique()
print(unique_counts.to_string(), "\n")
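A plausible next cleaning step (a sketch, assuming 'duration' holds strings such as '90 min' or '2 Seasons', as the unique values printed above suggest):
# Split 'duration' into a numeric value and a unit (min vs. Season/Seasons).
df1[['duration_value', 'duration_unit']] = df1['duration'].str.extract(r'(\d+)\s*(\w+)')
df1['duration_value'] = df1['duration_value'].astype(int)
print(df1[['duration', 'duration_value', 'duration_unit']].head())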
Feature Selection and Data Preprocessing in Machine Learning
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import seaborn as sns
from sklearn.feature_selection import SelectKBest, chi2
dataset = pd.read_csv('pima-indians-diabetes.csv')
print("--- Dataset Overview ---")
print(dataset.head(), "\n")
duplicated_rows = dataset.duplicated().sum()
print(f"Duplicated Rows: {duplicated_rows}\n")
print("--- Unique Values in Each Column ---")
print(dataset.nunique(), "\n")
# Bin glucose into (0, 140] and (140, 199]; zeros (missing readings in this dataset) fall outside the bins and become NaN.
dataset['New Glucose Category'] = pd.cut(dataset['Glucose'], bins=[0, 140, 199], labels=['Normal', 'Prediabetic or Risky'])
print("--- Glucose Categories Added ---")
print(dataset[['Glucose', 'New Glucose Category']].head(), "\n")
dataset['Glucose'] = dataset['New Glucose Category'].values
dataset.drop(['New Glucose Category'], axis=1, inplace=True)
print("--- After Replacing Glucose Values and Dropping 'New Glucose Category' Column ---")
print(dataset.head(), "\n")
print("--- Glucose Category Counts ---")
print(dataset['Glucose'].value_counts(), "\n")
encoded_glucose = pd.get_dummies(dataset['Glucose'])
print("--- One-Hot Encoded Glucose Categories ---")
print(encoded_glucose.head(), "\n")
dbts_new = pd.read_csv('pima-indians-diabetes.csv')
X = dbts_new.iloc[:, 0:8] # Select input variables (8 features)
Y = dbts_new.iloc[:, 8] # Select output variable (Outcome)
test = SelectKBest(score_func=chi2, k=5)
fit = test.fit(X, Y)
print("--- Chi-Square Scores for Each Feature ---")
for i, score in enumerate(fit.scores_):
    print(f"Feature {X.columns[i]}: Chi-Square Score = {score:.5f}")
dbts_ftr_sbset = fit.transform(X)
print("--- Transformed Data with Selected Features (Top 5) ---")
print(dbts_ftr_sbset[:5, :], "\n")
df = pd.read_csv("pima-indians-diabetes.csv")
X = df.drop(columns=['Outcome']) # Features
y = df['Outcome'] # Target variable (Diabetes: 1 or 0)
chi_selector = SelectKBest(score_func=chi2, k=3)
X_new = chi_selector.fit_transform(X, y)
selected_features = X.columns[chi_selector.get_support()]
print("--- Top 3 Selected Features ---")
print("Selected Features:", selected_features)
Data Preprocessing and Feature Scaling for Diabetes Prediction
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
df = pd.read_csv('pima-indians-diabetes.csv')
print(df.head(), "\n")
splitted_data = df.values
X = splitted_data[:, 0:8]
Y = splitted_data[:, 8]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
from sklearn.preprocessing import MinMaxScaler
sclr = MinMaxScaler(feature_range=(0, 1))
scaled_data_X_train = sclr.fit_transform(X_train)
np.set_printoptions(precision=4)
print(scaled_data_X_train[0:5, :], "\n")
from sklearn.preprocessing import StandardScaler
scale_ftrs_stndrd = StandardScaler().fit(X_train)
scaled_stndrd_X_train = scale_ftrs_stndrd.transform(X_train)
np.set_printoptions(precision=3)
print(scaled_stndrd_X_train[0:5, :])
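Both scalers were fitted on X_train only; to use them downstream, the same fitted objects must also transform the test split (a minimal sketch with the names above):
# Apply the already-fitted scalers to the held-out data.
scaled_data_X_test = sclr.transform(X_test)
scaled_stndrd_X_test = scale_ftrs_stndrd.transform(X_test)
print(scaled_data_X_test[0:5, :])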
Decision Tree Classification for Diabetes Prediction
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree, metrics
from category_encoders import OneHotEncoder
df1 = pd.read_csv('pima-indians-diabetes.csv')
X = df1.iloc[:, :-1].values
Y = df1.iloc[:, -1].values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print("Shape of the Training Data:", X_train.shape)
print("Shape of the Testing Data:", X_test.shape)
print("Size of the Training Data:", X_train.size)
print("Size of the Testing Data:", X_test.size)
print("Data type of Training Data:", X_train.dtype)
print("Data type of Testing Data:", X_test.dtype)
# The Pima features are all numeric; category_encoders' OneHotEncoder only encodes
# object/categorical columns by default, so this step leaves the values unchanged.
encoder = OneHotEncoder(handle_unknown='ignore', use_cat_names=True)
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)
algo = DecisionTreeClassifier(criterion="entropy", random_state=42, max_depth=4, min_samples_leaf=5)
algo.fit(X_train, Y_train)
result = algo.score(X_test, Y_test)
print(f"The Decision Tree model has an accuracy of: {result * 100:.3f}%")
plt.figure(figsize=(20, 5))
feature_names = list(df1.columns[:-1])  # original column names for the plot
tree.plot_tree(algo, filled=True, feature_names=feature_names, class_names=['Non Diabetes', 'Diabetes'])  # class order follows sorted labels: 0 = no diabetes, 1 = diabetes
plt.show()
Y_Prdct = algo.predict(X_test)
print(f"Predictions:\n{Y_Prdct}")
confusion_matrix = metrics.confusion_matrix(Y_test, Y_Prdct)
print("Confusion Matrix:\n", confusion_matrix)
print("Shape of Confusion Matrix:", confusion_matrix.shape)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=confusion_matrix, display_labels=[False, True])
cm_display.plot(cmap=plt.cm.Blues)
plt.show()
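Beyond the confusion matrix, a per-class report adds precision, recall and F1 (a sketch reusing the names above):
# Per-class metrics for the test predictions; label order follows the sorted classes 0, 1.
print(metrics.classification_report(Y_test, Y_Prdct, target_names=['Non Diabetes', 'Diabetes']))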
ID3 Decision Tree for Play Tennis Prediction
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree, metrics
from category_encoders import OneHotEncoder
train_data_m = pd.read_csv('play_tennis.csv')
train_data_m.head()
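# For reference, the helpers below compute the standard ID3 quantities (textbook
# definitions, stated here for orientation): entropy E(S) = -sum_c p_c * log2(p_c),
# and information gain IG(S, A) = E(S) - sum_v (|S_v| / |S|) * E(S_v).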
def calc_total_entropy(train_data, label, class_list):
    total_row = train_data.shape[0]
    total_entr = 0
    for c in class_list:
        total_class_count = train_data[train_data[label] == c].shape[0]
        total_class_entr = - (total_class_count / total_row) * np.log2(total_class_count / total_row)
        total_entr += total_class_entr
    return total_entr
def calc_entropy(feature_value_data, label, class_list):
    class_count = feature_value_data.shape[0]
    entropy = 0
    for c in class_list:
        label_class_count = feature_value_data[feature_value_data[label] == c].shape[0]
        entropy_class = 0
        if label_class_count != 0:
            probability_class = label_class_count / class_count
            entropy_class = - probability_class * np.log2(probability_class)
        entropy += entropy_class
    # Debug output: show the subset used for this entropy computation.
    print(feature_value_data)
    print(f"Entropy for this feature: {entropy:.4f}")
    return entropy
def calc_info_gain(feature_name, train_data, label, class_list):
    feature_value_list = train_data[feature_name].unique()
    total_row = train_data.shape[0]
    feature_info = 0.0
    for feature_value in feature_value_list:
        feature_value_data = train_data[train_data[feature_name] == feature_value]
        feature_value_count = feature_value_data.shape[0]
        feature_value_entropy = calc_entropy(feature_value_data, label, class_list)
        feature_value_probability = feature_value_count / total_row
        feature_info += feature_value_probability * feature_value_entropy
    return calc_total_entropy(train_data, label, class_list) - feature_info
def find_most_informative_feature(train_data, label, class_list):
    feature_list = train_data.columns.drop(label)
    max_info_gain = -1
    max_info_feature = None
    for feature in feature_list:
        feature_info_gain = calc_info_gain(feature, train_data, label, class_list)
        if max_info_gain < feature_info_gain:
            max_info_gain = feature_info_gain
            max_info_feature = feature
    return max_info_feature
def generate_sub_tree(feature_name, train_data, label, class_list):
    feature_value_count_dict = train_data[feature_name].value_counts(sort=False)
    tree = {}
    for feature_value, count in feature_value_count_dict.items():
        feature_value_data = train_data[train_data[feature_name] == feature_value]
        assigned_to_node = False
        for c in class_list:
            class_count = feature_value_data[feature_value_data[label] == c].shape[0]
            if class_count == count:  # pure node: every row in this branch has the same class
                tree[feature_value] = c
                train_data = train_data[train_data[feature_name] != feature_value]
                assigned_to_node = True
        if not assigned_to_node:
            tree[feature_value] = "?"  # impure branch: mark it for further expansion
    return tree, train_data
def make_tree(root, prev_feature_value, train_data, label, class_list):
    if train_data.shape[0] != 0:
        max_info_feature = find_most_informative_feature(train_data, label, class_list)
        tree, train_data = generate_sub_tree(max_info_feature, train_data, label, class_list)
        next_root = None
        if prev_feature_value is not None:
            root[prev_feature_value] = dict()
            root[prev_feature_value][max_info_feature] = tree
            next_root = root[prev_feature_value][max_info_feature]
        else:
            root[max_info_feature] = tree
            next_root = root[max_info_feature]
        # Recursively expand every impure ("?") branch.
        for node, branch in list(next_root.items()):
            if branch == "?":
                feature_value_data = train_data[train_data[max_info_feature] == node]
                make_tree(next_root, node, feature_value_data, label, class_list)
def id3(train_data_m, label):
    train_data = train_data_m.copy()
    tree = {}
    class_list = train_data[label].unique()
    make_tree(tree, None, train_data, label, class_list)
    return tree
tree = id3(train_data_m, 'Play Tennis')
print("\nGenerated Decision Tree:")
print(tree)
def predict(tree, instance):
    if not isinstance(tree, dict):
        return tree
    else:
        root_node = next(iter(tree))
        feature_value = instance[root_node]
        if feature_value in tree[root_node]:
            return predict(tree[root_node][feature_value], instance)
        else:
            return None
def evaluate(tree, test_data_m, label):
    correct_predict = 0
    wrong_predict = 0
    for index, row in test_data_m.iterrows():
        result = predict(tree, row)  # predict directly from the row, so non-default indices also work
        if result == row[label]:
            correct_predict += 1
        else:
            wrong_predict += 1
    accuracy = correct_predict / (correct_predict + wrong_predict)
    return accuracy
test_data_m = pd.read_csv('play_tennis.csv')
accuracy = evaluate(tree, test_data_m, 'Play Tennis')
print(f"\nModel Accuracy: {accuracy * 100:.2f}%")
Naïve Bayes Classifier for Play Tennis Prediction
import pandas as pd
from IPython.display import display
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import numpy as np
def main():
    # Specify the file path
    file_path = r"C:\Users\ACER\Desktop\Lab Resources\Lab Resources\Data Sets\play_tennis.csv"
    # Load the dataset
    data = pd.read_csv(file_path)
    # Display the top five rows if the user wants to
    print("Do you want to view the top five data tuples? (yes/no)")
    choice = input().strip().lower()
    if choice == 'yes':
        display(data.head())
    # Convert categorical columns to numerical using label encoding
    le = LabelEncoder()
    data['Outlook'] = le.fit_transform(data['Outlook'])
    data['Temperature'] = le.fit_transform(data['Temperature'])
    data['Humidity'] = le.fit_transform(data['Humidity'])
    data['Wind'] = le.fit_transform(data['Wind'])
    data['Play Tennis'] = le.fit_transform(data['Play Tennis'])
    y = data['Play Tennis']
    del data['Play Tennis']
    # Train-test split
    x_train, x_test, y_train, y_test = train_test_split(data, y, test_size=0.25, random_state=1)
    # Build model and make predictions
    clf = build_model(x_train, y_train)
    prediction_using_model(clf, x_test, y_test)
def build_model(x_train, y_train):
    clf = GaussianNB()
    clf = clf.fit(x_train, y_train)
    return clf
def prediction_using_model(clf, x_test, y_test):
    y_pred = clf.predict(x_test)
    prediction = pd.concat([x_test.reset_index(drop=True), pd.Series(y_pred, name='Predicted Class')], axis=1)
    # Display predictions
    print("Do you want to view the class label prediction for the top five tuples of test data? (yes/no)")
    choice = input().strip().lower()
    if choice == 'yes':
        display(prediction.head())
    # Evaluate the model
    print("Do you want to view the evaluation result of the model? (yes/no)")
    choice = input().strip().lower()
    if choice == 'yes':
        print("Evaluation result of model:")
        model_evaluation(y_test, y_pred)
    else:
        print("Thank you!")
        quit()
def model_evaluation(y_test, y_pred):
    print("Confusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    display(pd.DataFrame(cm))
    score = accuracy_score(y_test, y_pred)
    print(f"Accuracy of Naive Bayes: {score:.4f}")
    print("\nClassification Report:")
    report = classification_report(y_test, y_pred, output_dict=True)
    display(pd.DataFrame(report).transpose())
    cm_df = pd.DataFrame(cm)
    cm_df.to_csv("confusion_matrix.csv")
    print("Confusion matrix saved to 'confusion_matrix.csv'.")
if __name__ == "__main__":
    main()
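Since every feature here is categorical, CategoricalNB is arguably a better match than GaussianNB for label-encoded inputs; a minimal drop-in sketch (a swapped-in technique, not part of the original script):
from sklearn.naive_bayes import CategoricalNB
# Alternative model builder for label-encoded categorical features.
def build_categorical_model(x_train, y_train):
    clf = CategoricalNB()
    return clf.fit(x_train, y_train)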
Spam Detection Using Naïve Bayes Classifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
df = pd.read_csv('spam_ham_dataset.csv', encoding='latin1')
df['New label'] = df.label.map({'ham': 0, 'spam': 1})
df['text_len'] = df.text.apply(len)
X_train, X_test, Y_train, Y_test = train_test_split(df.text, df['New label'], test_size=0.2)
word_freq_count = CountVectorizer()
X_train_count = word_freq_count.fit_transform(X_train.values)
print("Features: ", word_freq_count.get_feature_names_out())
model = MultinomialNB()
model.fit(X_train_count, Y_train)
mail_text = ['Get the children ready we will go to dinner', 'Congratulations you got a massive offer']
mail = word_freq_count.transform(mail_text)
predictions = model.predict(mail)
print("Predictions: ", predictions)
X_test_count = word_freq_count.transform(X_test)
score = model.score(X_test_count, Y_test)
print(f"Model accuracy: {score:.4f}")