import pandas as pd
import numpy as np
np.random.seed(0)
import tensorflow as tf
tf.compat.v1.set_random_seed(0)
from tensorflow import keras
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

data = pd.read_csv("drebin-215-dataset-5560malware-9476-benign.csv")
print("Total missing values : ", sum(list(data.isna().sum())))
data  # (notebook cell display)

classes, count = np.unique(data['class'], return_counts=True)

# Perform label encoding on the class column
lbl_enc = LabelEncoder()
print(lbl_enc.fit_transform(classes), classes)
data = data.replace(classes, lbl_enc.fit_transform(classes))

# The dataset contains special characters like '?' and 'S'.
# Set them to NaN and use dropna() to remove those rows.
data = data.replace('[?S]', np.nan, regex=True)
print("Total missing values : ", sum(list(data.isna().sum())))
data.dropna(inplace=True)
for c in data.columns:
    data[c] = pd.to_numeric(data[c])
data  # (notebook cell display)

print("Total Features : ", len(data.columns) - 1)

plt.bar(classes, count)
plt.title("Class Balance")
plt.xlabel("Classes")
plt.ylabel("Count")
plt.show()

from sklearn.utils import resample

# Separate features and labels
X = data.drop("class", axis=1)
y = data["class"]

# Count the occurrences of each class
class_counts = y.value_counts()

# Identify the majority and minority class labels
majority_class = class_counts.idxmax()
minority_class = class_counts.idxmin()

# Separate majority and minority class samples
majority_samples = data[data["class"] == majority_class]
minority_samples = data[data["class"] == minority_class]

# Oversample the minority class to match the majority class
minority_oversampled = resample(minority_samples,
                                replace=True,                     # Sample with replacement
                                n_samples=len(majority_samples),  # Match majority class
                                random_state=0)                   # Reproducibility

# Combine the oversampled minority class with the majority class
balanced_data = pd.concat([majority_samples, minority_oversampled])

# Shuffle the balanced dataset
balanced_data = balanced_data.sample(frac=1, random_state=0)

# balanced_data now contains equal instances of both classes.
# Note: the splits below still use the original `data`, not `balanced_data`;
# see the sketch after this cell for one way to split the balanced set.
balanced_class_counts = balanced_data["class"].value_counts()
print(balanced_class_counts)

from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(
    data[data.columns[:len(data.columns) - 1]].to_numpy(),
    data[data.columns[-1]].to_numpy(),
    test_size=0.2,
    shuffle=True)
print("Train features size : ", len(train_x))
print("Train labels size : ", len(train_y))
print("Test features size : ", len(test_x))
print("Test labels size : ", len(test_y))

print("Train features : ", train_x.shape)
print("Train labels : ", train_y.shape)
print("Test Features : ", test_x.shape)
print("Test labels : ", test_y.shape)

train_y = train_y.reshape((-1, 1))
test_y = test_y.reshape((-1, 1))
print("Train features : ", train_x.shape)
print("Train labels : ", train_y.shape)
print("Test Features : ", test_x.shape)
print("Test labels : ", test_y.shape)

import seaborn as sns

# Plot the correlation heatmap
plt.figure(figsize=(12, 8))
heatmap = sns.heatmap(data.corr(), annot=False, cmap="coolwarm")
plt.title("Correlation Heatmap of the Dataset")
plt.show()
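# --- Hedged sketch, not part of the original notebook ---
# The oversampled `balanced_data` built above is never actually used: every
# model below trains on the original `data`. Assuming the intent was to train
# on the balanced set, a minimal stratified split would look like this.
# Caveat: oversampling *before* splitting duplicates minority rows, so
# identical samples can land in both folds; resampling only the training fold
# avoids that leakage.
from sklearn.model_selection import train_test_split

Xb = balanced_data.drop("class", axis=1)
yb = balanced_data["class"]

# stratify=yb keeps the 50/50 class ratio in both the train and test folds
Xb_train, Xb_test, yb_train, yb_test = train_test_split(
    Xb, yb, test_size=0.2, stratify=yb, random_state=0)
print(yb_train.value_counts(), yb_test.value_counts(), sep="\n")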
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

X = data.drop('class', axis=1)  # Features
y = data['class']               # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
kernel_list = ['linear', 'poly', 'rbf']
for kernel in kernel_list:
    clf = SVC(kernel=kernel, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy with {kernel} kernel: {accuracy:.2f}")

from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features before fitting KNN (distance-based models are scale-sensitive)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

k = 5  # You can set the number of neighbors (k) as needed
clf = KNeighborsClassifier(n_neighbors=k)
clf.fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

X = data.drop('class', axis=1).values  # Features
y = data['class'].values               # Target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Binary classification, so sigmoid activation
])
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Print model summary
model.summary()
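# --- Hedged sketch, not part of the original notebook ---
# The training cell below runs a fixed 10 epochs. If overfitting is a concern,
# Keras's EarlyStopping callback can halt training once validation loss stops
# improving and restore the best weights; pass it to model.fit via callbacks=.
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss',         # Watch the validation loss
                           patience=3,                 # Stop after 3 epochs with no improvement
                           restore_best_weights=True)  # Roll back to the best epoch's weights
# Example usage (same model and data as below, longer epoch budget assumed):
# history = model.fit(X_train, y_train, batch_size=32, epochs=50,
#                     validation_split=0.1, callbacks=[early_stop])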
batch_size = 32
epochs = 10
history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_split=0.1)

y_pred = (model.predict(X_test) > 0.5).astype(np.int32)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Names of the algorithms
algorithms = ['Decision Tree', 'SVM (Linear)', 'SVM (Poly)', 'SVM (RBF)', 'LR', 'KNN', 'Sequential NN']
# Corresponding accuracy values
accuracies = [98, 98, 96, 98, 98, 98, 99]

# Set up the figure and axis
fig, ax = plt.subplots(figsize=(10, 6))

# Map the accuracies into [0, 1] so the colormap spreads across their actual range
norm = plt.Normalize(vmin=min(accuracies), vmax=max(accuracies))
colors = plt.cm.viridis_r(norm(accuracies))
bars = ax.bar(algorithms, accuracies, color=colors)

# Add data labels on the bars
for bar in bars:
    yval = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2, yval + 0.2, f'{yval}%',
            ha='center', va='bottom', fontsize=10)

# Customize plot elements
ax.set_title('Accuracy of Different Classification Algorithms')
ax.set_xlabel('Algorithms')
ax.set_ylabel('Accuracy (%)')
ax.set_ylim(90, 100)  # Set the y-axis limits

# Colorbar legend indicating the accuracy range
sm = plt.cm.ScalarMappable(cmap=plt.cm.viridis_r, norm=norm)
sm.set_array([])
cbar = fig.colorbar(sm, ax=ax, orientation='vertical')
cbar.set_label('Accuracy Range')

# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')

# Display the plot
plt.tight_layout()
plt.show()
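# --- Hedged sketch, not part of the original notebook ---
# For malware detection, accuracy alone hides the false-negative rate (malware
# predicted as benign). A confusion matrix and classification report on the
# neural network's test predictions (y_test / y_pred from the cell above) make
# that trade-off visible:
from sklearn.metrics import classification_report, confusion_matrix

y_pred_flat = y_pred.ravel()  # Flatten the (n, 1) predictions to match y_test
print(confusion_matrix(y_test, y_pred_flat))  # Rows: true class, columns: predicted class
print(classification_report(y_test, y_pred_flat,
                            target_names=['benign', 'malware']))  # Assumes 0 = benign ('B'), 1 = malware ('S')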