Decision Tree Classification for Play Tennis Dataset with One-Hot Encoding

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import metrics
from category_encoders import OneHotEncoder

# Load the dataset
df1 = pd.read_csv('play_tennis.csv')
print("First 5 rows of dataset:")
print(df1.head())

# Features are every column except the last; the last column is the target
X = df1.iloc[:, :-1].values
Y = df1.iloc[:, -1].values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

print("\nDataset Information:")
print(f"Shape of Training Data: {X_train.shape}")
print(f"Shape of Testing Data: {X_test.shape}")
print(f"Size of Training Data: {X_train.size}")
print(f"Size of Testing Data: {X_test.size}")
print(f"Data type of Training Data: {X_train.dtype}")
print(f"Data type of Testing Data: {X_test.dtype}")

# One-hot encode the categorical features; fit on the training split only,
# and ignore categories at transform time that were never seen in training
encoder = OneHotEncoder(handle_unknown='ignore', use_cat_names=True)
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

# Train an entropy-based decision tree (scoring and plotting of the same
# setup are shown in the diabetes decision-tree script later in this file)
algo = DecisionTreeClassifier(criterion="entropy", random_state=42, max_depth=4, min_samples_leaf=5)
algo.fit(X_train, Y_train)


Data Cleaning and Exploration of Netflix Titles Dataset

import pandas as pd

# Load the dataset
dataset = pd.read_csv('netflix_titles.csv')

# Display the first 5 rows for clarity
print("Dataset (first 5 rows):")
print(dataset.head(), "\n")

print("Missing Values in Each Column:")
missing_values = dataset.isnull().sum()
print(missing_values.to_string(), "\n")

# Work on a copy so the raw dataset stays untouched
df1 = dataset.copy()
print("Before Cleaning:")
print(f"Dataset Shape: {df1.shape}")
duplicated_rows = df1.duplicated().sum()
print(f"Duplicated Rows: {duplicated_rows}\n")
print("Missing Values After Copying:")
print(df1.isnull().sum(), "\n")

# Drop every row that contains at least one missing value
df1.dropna(inplace=True)
print("After Cleaning:")
print(f"Dataset Shape: {df1.shape}")
print(f"Missing Values After Cleaning:\n{df1.isnull().sum()}\n")
duplicates_after_cleaning = df1.duplicated().sum()
print(f"Duplicated Rows After Cleaning: {duplicates_after_cleaning}\n")

OUTPUT_PATH = "CleanedNetflixData.csv"
df1.to_csv(OUTPUT_PATH, index=False)

# DataFrame info: memory usage and column dtypes (info() prints directly
# and returns None, so its result is not stored)
print("Dataframe Info:")
df1.info()

# Display unique values for key columns
for col in ['type', 'country', 'duration', 'director', 'rating', 'listed_in', 'title']:
    print(f"Unique Values in '{col}' Column:")
    print(df1[col].unique(), "\n")

print("Unique Counts in Each Column:")
unique_counts = df1.nunique()
print(unique_counts.to_string(), "\n")
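A note on the cleaning step above: dropna() discards every row that has any missing value, so a title missing only its director or country is lost entirely. A gentler alternative (a sketch, not part of the original lab; the 'Unknown' placeholder is an assumption) fills those sparse text columns first:

import pandas as pd

df1 = pd.read_csv('netflix_titles.csv')

# Fill sparsely populated text columns instead of dropping their rows;
# 'Unknown' is an assumed placeholder value, not one from the dataset
for col in ['director', 'country']:
    df1[col] = df1[col].fillna('Unknown')

# Any rows still missing other fields can then be dropped as before
df1.dropna(inplace=True)
print(df1.isnull().sum())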
Feature Selection and Data Preprocessing in Machine Learning

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import seaborn as sns
from sklearn.feature_selection import SelectKBest, chi2

dataset = pd.read_csv('pima-indians-diabetes.csv')
print("--- Dataset Overview ---")
print(dataset.head(), "\n")

duplicated_rows = dataset.duplicated().sum()
print(f"Duplicated Rows: {duplicated_rows}\n")
print("--- Unique Values in Each Column ---")
print(dataset.nunique(), "\n")

# Bin the glucose readings into two clinical categories; readings of 0 or
# above 199 fall outside the bins and become NaN
dataset['New Glucose Category'] = pd.cut(dataset['Glucose'], bins=[0, 140, 199],
                                         labels=['Normal', 'Prediabetic or Risky'])
print("--- Glucose Categories Added ---")
print(dataset[['Glucose', 'New Glucose Category']].head(), "\n")

# Replace the numeric column with its category and drop the helper column
dataset['Glucose'] = dataset['New Glucose Category'].values
dataset.drop(['New Glucose Category'], axis=1, inplace=True)
print("--- After Replacing Glucose Values and Dropping 'New Glucose Category' Column ---")
print(dataset.head(), "\n")
print("--- Glucose Category Counts ---")
print(dataset['Glucose'].value_counts(), "\n")

# One-hot encode the glucose categories
encoded_glucose = pd.get_dummies(dataset['Glucose'])
print("--- One-Hot Encoded Glucose Categories ---")
print(encoded_glucose.head(), "\n")

# Chi-square feature selection on the raw numeric dataset
dbts_new = pd.read_csv('pima-indians-diabetes.csv')
X = dbts_new.iloc[:, 0:8]  # input variables (8 features)
Y = dbts_new.iloc[:, 8]    # output variable (Outcome)
test = SelectKBest(score_func=chi2, k=5)
fit = test.fit(X, Y)
print("--- Chi-Square Scores for Each Feature ---")
for i, score in enumerate(fit.scores_):
    print(f"Feature {X.columns[i]}: Chi-Square Score = {score:.5f}")

dbts_ftr_sbset = fit.transform(X)
print("--- Transformed Data with Selected Features (Top 5) ---")
print(dbts_ftr_sbset[:5, :], "\n")

# Same selection with k=3, recovering the chosen column names via get_support()
df = pd.read_csv("pima-indians-diabetes.csv")
X = df.drop(columns=['Outcome'])  # features
y = df['Outcome']                 # target variable (diabetes: 1 or 0)
chi_selector = SelectKBest(score_func=chi2, k=3)
X_new = chi_selector.fit_transform(X, y)
selected_features = X.columns[chi_selector.get_support()]
print("--- Top 3 Selected Features ---")
print("Selected Features:", selected_features)


Data Preprocessing and Feature Scaling for Diabetes Prediction

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

df = pd.read_csv('pima-indians-diabetes.csv')
print(df.head(), "\n")

splitted_data = df.values
X = splitted_data[:, 0:8]
Y = splitted_data[:, 8]
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

# Min-max scaling to [0, 1]: x' = (x - min) / (max - min), fitted on the training split
sclr = MinMaxScaler(feature_range=(0, 1))
scaled_data_X_train = sclr.fit_transform(X_train)
np.set_printoptions(precision=4)
print(scaled_data_X_train[0:5, :], "\n")

# Standardisation to zero mean and unit variance, also fitted on the training split
scale_ftrs_stndrd = StandardScaler().fit(X_train)
scaled_stndrd_X_train = scale_ftrs_stndrd.transform(X_train)
np.set_printoptions(precision=3)
print(scaled_stndrd_X_train[0:5, :])
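The script above fits and prints both scalers for X_train only. If the test split were used downstream, the same fitted scalers would transform it; a minimal sketch, reusing the variables defined above (not part of the original script):

# Apply the scalers fitted on the training data; refitting on the test
# split would put the two splits on different scales
scaled_data_X_test = sclr.transform(X_test)
scaled_stndrd_X_test = scale_ftrs_stndrd.transform(X_test)
print(scaled_data_X_test[0:5, :])
print(scaled_stndrd_X_test[0:5, :])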
Decision Tree Classification for Diabetes Prediction

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree, metrics
from category_encoders import OneHotEncoder

df1 = pd.read_csv('pima-indians-diabetes.csv')
X = df1.iloc[:, :-1].values
Y = df1.iloc[:, -1].values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

print("Shape of the Training Data:", X_train.shape)
print("Shape of the Testing Data:", X_test.shape)
print("Size of the Training Data:", X_train.size)
print("Size of the Testing Data:", X_test.size)
print("Data type of Training Data:", X_train.dtype)
print("Data type of Testing Data:", X_test.dtype)

# The diabetes features are all numeric, so the encoder detects no categorical
# columns and leaves the values unchanged; it does, however, wrap the arrays in
# DataFrames, which is what makes X_train.columns available for plotting below
encoder = OneHotEncoder(handle_unknown='ignore', use_cat_names=True)
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

algo = DecisionTreeClassifier(criterion="entropy", random_state=42, max_depth=4, min_samples_leaf=5)
algo.fit(X_train, Y_train)

result = algo.score(X_test, Y_test)
print(f"The Decision Tree model has an accuracy of: {result * 100:.3f}%")

# Visualise the fitted tree; class_names follow ascending class order,
# so index 0 is Outcome == 0 (no diabetes) and index 1 is Outcome == 1
plt.figure(figsize=(20, 5))
feature_names = list(X_train.columns)
tree.plot_tree(algo, filled=True, feature_names=feature_names,
               class_names=['Non Diabetes', 'Diabetes'])
plt.show()

Y_Prdct = algo.predict(X_test)
print(f"Predictions:\n{Y_Prdct}")

confusion_matrix = metrics.confusion_matrix(Y_test, Y_Prdct)
print("Confusion Matrix:\n", confusion_matrix)
print("Shape of Confusion Matrix:", confusion_matrix.shape)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=confusion_matrix,
                                            display_labels=[False, True])
cm_display.plot(cmap=plt.cm.Blues)
plt.show()
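The ID3 implementation below is built on the entropy formula H = -sum(p_i * log2(p_i)). As a standalone sanity check, assuming the classic 14-row Play Tennis distribution of 9 'Yes' and 5 'No' labels, the dataset entropy comes out near 0.940 bits:

import numpy as np

# Class counts assumed from the classic Play Tennis dataset: 9 Yes, 5 No
counts = np.array([9, 5])
p = counts / counts.sum()
entropy = -(p * np.log2(p)).sum()
print(f"H(Play Tennis) = {entropy:.3f} bits")  # approximately 0.940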
ID3 Decision Tree for Play Tennis Prediction

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree, metrics
from category_encoders import OneHotEncoder

train_data_m = pd.read_csv('play_tennis.csv')
print(train_data_m.head())

def calc_total_entropy(train_data, label, class_list):
    # Entropy of the class label over the whole (remaining) dataset
    total_row = train_data.shape[0]
    total_entr = 0
    for c in class_list:
        total_class_count = train_data[train_data[label] == c].shape[0]
        total_class_entr = -(total_class_count / total_row) * np.log2(total_class_count / total_row)
        total_entr += total_class_entr
    return total_entr

def calc_entropy(feature_value_data, label, class_list):
    # Entropy of the class label within one feature-value subset
    class_count = feature_value_data.shape[0]
    entropy = 0
    for c in class_list:
        label_class_count = feature_value_data[feature_value_data[label] == c].shape[0]
        entropy_class = 0
        if label_class_count != 0:
            probability_class = label_class_count / class_count
            entropy_class = -probability_class * np.log2(probability_class)
        entropy += entropy_class
    print(feature_value_data)
    print(f"Entropy for this feature: {entropy:.4f}")
    return entropy

def calc_info_gain(feature_name, train_data, label, class_list):
    # Information gain = total entropy - weighted entropy of the feature's subsets
    feature_value_list = train_data[feature_name].unique()
    total_row = train_data.shape[0]
    feature_info = 0.0
    for feature_value in feature_value_list:
        feature_value_data = train_data[train_data[feature_name] == feature_value]
        feature_value_count = feature_value_data.shape[0]
        feature_value_entropy = calc_entropy(feature_value_data, label, class_list)
        feature_value_probability = feature_value_count / total_row
        feature_info += feature_value_probability * feature_value_entropy
    return calc_total_entropy(train_data, label, class_list) - feature_info

def find_most_informative_feature(train_data, label, class_list):
    feature_list = train_data.columns.drop(label)
    max_info_gain = -1
    max_info_feature = None
    for feature in feature_list:
        feature_info_gain = calc_info_gain(feature, train_data, label, class_list)
        if max_info_gain < feature_info_gain:
            max_info_gain = feature_info_gain
            max_info_feature = feature
    return max_info_feature

def generate_sub_tree(feature_name, train_data, label, class_list):
    feature_value_count_dict = train_data[feature_name].value_counts(sort=False)
    tree = {}
    for feature_value, count in feature_value_count_dict.items():
        feature_value_data = train_data[train_data[feature_name] == feature_value]
        assigned_to_node = False
        for c in class_list:
            class_count = feature_value_data[feature_value_data[label] == c].shape[0]
            if class_count == count:
                # Pure node: every row with this feature value has class c
                tree[feature_value] = c
                train_data = train_data[train_data[feature_name] != feature_value]
                assigned_to_node = True
        if not assigned_to_node:
            tree[feature_value] = "?"  # impure: expand this branch later
    return tree, train_data

def make_tree(root, prev_feature_value, train_data, label, class_list):
    if train_data.shape[0] != 0:
        max_info_feature = find_most_informative_feature(train_data, label, class_list)
        tree, train_data = generate_sub_tree(max_info_feature, train_data, label, class_list)
        next_root = None
        if prev_feature_value is not None:
            root[prev_feature_value] = dict()
            root[prev_feature_value][max_info_feature] = tree
            next_root = root[prev_feature_value][max_info_feature]
        else:
            root[max_info_feature] = tree
            next_root = root[max_info_feature]
        # Recurse into every branch that is still marked impure
        for node, branch in list(next_root.items()):
            if branch == "?":
                feature_value_data = train_data[train_data[max_info_feature] == node]
                make_tree(next_root, node, feature_value_data, label, class_list)

def id3(train_data_m, label):
    train_data = train_data_m.copy()
    tree = {}
    class_list = train_data[label].unique()
    make_tree(tree, None, train_data, label, class_list)
    return tree

tree = id3(train_data_m, 'Play Tennis')
print("\nGenerated Decision Tree:")
print(tree)

def predict(tree, instance):
    # Leaves are plain class labels; internal nodes are {feature: {value: subtree}}
    if not isinstance(tree, dict):
        return tree
    root_node = next(iter(tree))
    feature_value = instance[root_node]
    if feature_value in tree[root_node]:
        return predict(tree[root_node][feature_value], instance)
    return None

def evaluate(tree, test_data_m, label):
    correct_predict = 0
    wrong_predict = 0
    for index, row in test_data_m.iterrows():
        result = predict(tree, row)
        if result == row[label]:
            correct_predict += 1
        else:
            wrong_predict += 1
    accuracy = correct_predict / (correct_predict + wrong_predict)
    return accuracy

test_data_m = pd.read_csv('play_tennis.csv')
accuracy = evaluate(tree, test_data_m, 'Play Tennis')
print(f"\nModel Accuracy: {accuracy * 100:.2f}%")
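Besides whole-file evaluation, predict() can classify a single row; a small sketch reusing test_data_m and the tree built above (not part of the original script):

# Classify one instance; here the first CSV row doubles as the query
sample = test_data_m.iloc[0]
print("Actual:", sample['Play Tennis'], "| Predicted:", predict(tree, sample))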
(yes/no)") choice = input().strip().lower() if choice == 'yes': display(data.head()) # Convert categorical columns to numerical using Label Encoding le = LabelEncoder() data['Outlook'] = le.fit_transform(data['Outlook']) data['Temperature'] = le.fit_transform(data['Temperature']) data['Humidity'] = le.fit_transform(data['Humidity']) data['Wind'] = le.fit_transform(data['Wind']) data['Play Tennis'] = le.fit_transform(data['Play Tennis']) y = data['Play Tennis'] del data['Play Tennis'] # Train-test split x_train, x_test, y_train, y_test = train_test_split(data, y, test_size=0.25, random_state=1) # Build model and make predictions clf = build_model(x_train, y_train) prediction_using_model(clf, x_test, y_test) def build_model(x_train, y_train): clf = GaussianNB() clf = clf.fit(x_train, y_train) return clf def prediction_using_model(clf, x_test, y_test): y_pred = clf.predict(x_test) prediction = pd.concat([x_test.reset_index(drop=True), pd.Series(y_pred, name='Predicted Class')], axis=1) # Display predictions print("Do you want to view the class label prediction for top five tuples of test data?") choice = input().strip().lower() if choice == 'yes': display(prediction.head()) # Evaluate the model print("Do you want to view evaluation result of model?") choice = input().strip().lower() if choice == 'yes': print("Evaluation result of model:") model_evaluation(y_test, y_pred) else: print("Thank you!") quit() def model_evaluation(y_test, y_pred): print("Confusion Matrix:") cm = confusion_matrix(y_test, y_pred) display(pd.DataFrame(cm)) score = accuracy_score(y_test, y_pred) print(f"Accuracy of Naive Bayes: {score:.4f}") print("\nClassification Report:") report = classification_report(y_test, y_pred, output_dict=True) display(pd.DataFrame(report).transpose()) cm_df = pd.DataFrame(cm) cm_df.to_csv("confusion_matrix.csv") print("Confusion matrix saved to 'confusion_matrix.csv'.") if __name__ == "__main__": main() Spam Detection Using Naïve Bayes Classifier import pandas as pd from sklearn.model_selection import train_test_split from sklearn.feature_extraction.text import CountVectorizer from sklearn.naive_bayes import MultinomialNB df = pd.read_csv('spam_ham_dataset.csv', encoding='latin1') df['New label'] = df.label.map({'ham': 0, 'spam': 1}) df['text_len'] = df.text.apply(len) X_train, X_test, Y_train, Y_test = train_test_split(df.text, df['New label'], test_size=0.2) word_freq_count = CountVectorizer() X_train_count = word_freq_count.fit_transform(X_train.values) print("Features: ", word_freq_count.get_feature_names_out()) model = MultinomialNB() model.fit(X_train_count, Y_train) mail_text = ['Get the children ready we will go to dinner', 'Congratulations you got a massive offer'] mail = word_freq_count.transform(mail_text) predictions = model.predict(mail) print("Predictions: ", predictions) X_test_count = word_freq_count.transform(X_test) score = model.score(X_test_count, Y_test) print(f"Model accuracy: {score:.4f}")