#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <limits>
#include <map>
#include <sstream>
#include <string>
#include <tuple>  // fix: std::tuple / std::tie were used via transitive includes only
#include <vector>
using namespace std;
// CART-style decision-tree classifier (Gini impurity) over a 7-feature CSV.
// main() reads the data, mean-imputes -99 "missing" sentinels, trains on the
// first 512 rows and reports test accuracy on the remainder.
// A binary decision-tree node.
// Internal nodes split on `feature` at `threshold` (a sample goes left when
// its value is strictly below the threshold); a node with two null children
// acts as a leaf whose prediction is `positive`.
// n1 / n2 hold the class-1 / class-0 sample counts recorded at this node.
class Node {
public:
    int feature;
    double threshold;
    int n1;
    int n2;
    bool positive;
    Node* left;
    Node* right;

    Node(int feature = -1, double threshold = 0, int n1 = 0, int n2 = 0,
         bool positive = false, Node* left = nullptr, Node* right = nullptr)
        : feature(feature), threshold(threshold), n1(n1), n2(n2),
          positive(positive), left(left), right(right) {}

    // Fix: the tree was never freed. A node now owns its children, so
    // deleting the root releases the entire tree.
    ~Node() {
        delete left;
        delete right;
    }

    // Owning raw pointers make shallow copies unsafe (double delete),
    // so copying is disallowed. No caller copies Node.
    Node(const Node&) = delete;
    Node& operator=(const Node&) = delete;
};
// Computes the Gini impurity of a label multiset: 1 - sum_i(p_i^2).
// Fix: the naive formula yields 1 (maximal impurity) for an EMPTY set, which
// is misleading; an empty partition contains no disagreement, so return 0.
// (Behavior elsewhere is unchanged: empty partitions are weighted by zero in
// find_best_split, and build_tree never passes an empty label set.)
double gini(const std::vector<int>& labels) {
    if (labels.empty()) {
        return 0.0;  // an empty partition is pure by convention
    }
    std::map<int, int> counts;
    for (int label : labels) {
        counts[label]++;
    }
    const double n = static_cast<double>(labels.size());
    double sum_sq = 0.0;
    for (const auto& [label, count] : counts) {
        const double p = count / n;
        sum_sq += p * p;
    }
    return 1.0 - sum_sq;
}
tuple<int, double, double> find_best_split(const vector<vector<double>>& data, const vector<int>& labels) {
int best_feature = -1;
double best_threshold = 0;
double best_gini = numeric_limits<double>::infinity();
for (size_t feature = 0; feature < data[0].size(); ++feature) {
vector<double> feature_values;
for (size_t i = 0; i < data.size(); ++i) {
feature_values.push_back(data[i][feature]);
}
sort(feature_values.begin(), feature_values.end());
feature_values.erase(unique(feature_values.begin(), feature_values.end()), feature_values.end());
for (double threshold : feature_values) {
vector<int> left_labels;
vector<int> right_labels;
for (size_t i = 0; i < data.size(); ++i) {
if (data[i][feature] < threshold) {
left_labels.push_back(labels[i]);
} else {
right_labels.push_back(labels[i]);
}
}
double left_gini = gini(left_labels);
double right_gini = gini(right_labels);
double weighted_gini = (static_cast<double>(left_labels.size()) / labels.size()) * left_gini + (static_cast<double>(right_labels.size()) / labels.size()) * right_gini;
if (weighted_gini < best_gini) {
best_feature = feature;
best_threshold = threshold;
best_gini = weighted_gini;
}
}
}
return {best_feature, best_threshold, best_gini};
}
// Recursively builds a decision tree with Gini-based binary splits.
// Stopping rules:
//   - fewer than 5 samples       -> nullptr (parent acts as a leaf)
//   - pure labels (gini == 0)    -> leaf node
//   - no usable split, or a split leaving one side empty -> leaf node
// NOTE(review): the original zeroes the minority count on every node
// (n2 when majority-positive, n1 otherwise); preserved as-is.
Node* build_tree(const vector<vector<double>>& data, const vector<int>& labels) {
    if (labels.size() < 5) {
        return nullptr;  // too few samples to warrant a node
    }
    int n1 = count(labels.begin(), labels.end(), 1);
    int n2 = count(labels.begin(), labels.end(), 0);
    bool positive = n1 >= n2;
    Node* node = new Node(-1, 0, n1, n2, positive, nullptr, nullptr);
    if (positive) {
        node->n2 = 0;
    } else {
        node->n1 = 0;
    }
    if (gini(labels) == 0) {
        return node;  // pure leaf
    }
    int best_feature;
    double best_threshold, best_gini;
    tie(best_feature, best_threshold, best_gini) = find_best_split(data, labels);
    // Fix: without this guard a -1 sentinel from find_best_split would be
    // used to index data[i][-1] (undefined behavior).
    if (best_feature < 0) {
        return node;  // no usable split: stay a leaf
    }
    node->feature = best_feature;
    node->threshold = best_threshold;
    vector<vector<double>> left_data, right_data;
    vector<int> left_labels, right_labels;
    for (size_t i = 0; i < data.size(); ++i) {
        if (data[i][best_feature] < best_threshold) {
            left_data.push_back(data[i]);
            left_labels.push_back(labels[i]);
        } else {
            right_data.push_back(data[i]);
            right_labels.push_back(labels[i]);
        }
    }
    // Fix: a split that leaves one side empty makes no progress — the
    // non-empty side is identical to the input, so recursing on it would
    // loop forever. Demote this node to a leaf instead.
    if (left_labels.empty() || right_labels.empty()) {
        node->feature = -1;
        return node;
    }
    node->left = build_tree(left_data, left_labels);
    node->right = build_tree(right_data, right_labels);
    return node;
}
bool classify(const Node* node, const vector<double>& subject) {
if (!node) {
// Handle the case when the node is null
return false;
}
if (node->left == nullptr && node->right == nullptr) {
return node->positive;
}
if (subject[node->feature] < node->threshold) {
return classify(node->left, subject);
} else {
return classify(node->right, subject);
}
}
// Splits `s` on `delimiter` and returns the pieces in order.
// Mirrors std::getline semantics: consecutive delimiters yield empty tokens,
// while a trailing delimiter does NOT produce a trailing empty token.
std::vector<std::string> split(const std::string& s, char delimiter) {
    std::vector<std::string> parts;
    std::istringstream stream(s);
    for (std::string piece; std::getline(stream, piece, delimiter); ) {
        parts.push_back(piece);
    }
    return parts;
}
int main() {
ifstream file("Missing_Data_7features.csv");
// Read the CSV file header
string header;
getline(file, header);
// Read and preprocess the data
vector<vector<double>> data;
vector<int> labels;
vector<double> column_sums(7, 0.0);
vector<int> column_counts(7, 0);
string line;
while (getline(file, line)) {
vector<string> tokens = split(line, ',');
vector<double> row;
for (size_t i = 1; i < tokens.size() - 1; ++i) {
double value = stod(tokens[i]);
if (value != -99) {
row.push_back(value);
column_sums[i - 1] += value;
column_counts[i - 1]++;
} else {
row.push_back(nan(""));
}
}
data.push_back(row);
labels.push_back(stoi(tokens.back()));
}
file.close();
// Compute column means and replace NaNs with column means
for (auto& row : data) {
for (size_t i = 0; i < row.size(); ++i) {
if (std::isnan(row[i])) {
row[i] = column_sums[i] / column_counts[i];
}
}
}
// Save the preprocessed data to a new CSV file
ofstream output_file("Estimated_Data_7features_final.csv");
output_file << header << endl;
output_file << fixed << setprecision(8);
for (size_t i = 0; i < data.size(); ++i) {
output_file << i + 1;
for (const auto& value : data[i]) {
output_file << ',' << value;
}
output_file << ',' << labels[i] << endl;
}
output_file.close();
// Compute column means and replace NaNs with column means
for (auto& row : data) {
for (size_t i = 0; i < row.size(); ++i) {
if (std::isnan(row[i])) {
row[i] = column_sums[i] / column_counts[i];
}
}
}
vector<vector<double>> train_data(data.begin(), data.begin() + 512);
vector<int> train_labels(labels.begin(), labels.begin() + 512);
vector<vector<double>> test_data(data.begin() + 512, data.end());
vector<int> test_labels(labels.begin() + 512, labels.end());
// Build the decision tree
Node* root = build_tree(train_data, train_labels);
// Classify the test set using the decision tree
vector<bool> predictions;
for (const auto& subject : test_data) {
predictions.push_back(classify(root, subject));
}
// Calculate the accuracy of the decision tree
int correct = 0;
for (size_t i = 0; i < test_labels.size(); ++i) {
if (predictions[i] == test_labels[i]) {
correct++;
}
}
double accuracy = static_cast<double>(correct) / test_labels.size();
// Print the accuracy of the decision tree
cout << "Accuracy: " << accuracy << endl;
return 0;
}