Untitled

 avatar
unknown
c_cpp
2 years ago
7.4 kB
3
Indexable
#include <iostream>
#include <vector>
#include <fstream>
#include <sstream>
#include <algorithm>
#include <limits>
#include <map>
#include <string>
#include <cmath>
#include <cstdlib>
#include <iomanip>

using namespace std;

// CSV loading and mean-imputation preprocessing happen in main(); the
// decision-tree implementation (Node, gini, find_best_split, build_tree,
// classify) follows below.

// One node of a binary decision tree. Internal nodes route subjects by
// comparing feature `feature` against `threshold`; leaves (both children
// null) carry the class counts seen during training and the majority vote.
class Node {
public:
    int feature{-1};        // splitting feature index; -1 means "no split chosen"
    double threshold{0.0};  // subjects with value < threshold go left
    int n1{0};              // class-1 sample count recorded at this node
    int n2{0};              // class-0 sample count recorded at this node
    bool positive{false};   // majority class at this node
    Node* left{nullptr};    // subtree for feature value < threshold
    Node* right{nullptr};   // subtree for feature value >= threshold

    Node(int feature = -1, double threshold = 0, int n1 = 0, int n2 = 0,
         bool positive = false, Node* left = nullptr, Node* right = nullptr)
        : feature(feature),
          threshold(threshold),
          n1(n1),
          n2(n2),
          positive(positive),
          left(left),
          right(right) {}
};

// Gini impurity of a label multiset: 1 - sum_k p_k^2, where p_k is the
// fraction of samples with label k. 0 means pure; 0.5 is the maximum for
// two balanced classes.
double gini(const vector<int>& labels) {
    // An empty set is pure by convention (the original returned 1 here,
    // which over-reports impurity; callers only ever weighted it by 0).
    if (labels.empty()) {
        return 0.0;
    }

    map<int, int> counts;
    for (int label : labels) {
        counts[label]++;
    }

    const double n = static_cast<double>(labels.size());
    double sum = 0;
    for (const auto& [label, count] : counts) {
        (void)label;  // only the count matters for impurity
        const double p = count / n;
        sum += p * p;
    }
    return 1 - sum;
}

// Exhaustively search every (feature, threshold) pair — thresholds are the
// distinct observed values of each feature — and return the pair with the
// lowest weighted Gini impurity, as {feature, threshold, impurity}.
//
// Returns {-1, 0, +infinity} when no non-degenerate split exists (e.g. all
// rows identical); callers MUST check best_feature >= 0 before using it.
// Preconditions: data is non-empty and rectangular, labels.size() == data.size().
tuple<int, double, double> find_best_split(const vector<vector<double>>& data, const vector<int>& labels) {
    int best_feature = -1;
    double best_threshold = 0;
    double best_gini = numeric_limits<double>::infinity();

    const double total = static_cast<double>(labels.size());

    for (size_t feature = 0; feature < data[0].size(); ++feature) {
        // Candidate thresholds: the sorted distinct values of this feature.
        vector<double> feature_values;
        feature_values.reserve(data.size());
        for (size_t i = 0; i < data.size(); ++i) {
            feature_values.push_back(data[i][feature]);
        }
        sort(feature_values.begin(), feature_values.end());
        feature_values.erase(unique(feature_values.begin(), feature_values.end()), feature_values.end());

        for (double threshold : feature_values) {
            vector<int> left_labels;
            vector<int> right_labels;
            for (size_t i = 0; i < data.size(); ++i) {
                if (data[i][feature] < threshold) {
                    left_labels.push_back(labels[i]);
                } else {
                    right_labels.push_back(labels[i]);
                }
            }

            // Skip degenerate splits (one side empty): they leave the data
            // unchanged, and selecting one sends build_tree into infinite
            // recursion on duplicate rows. The minimum value of each feature
            // always produces such a split.
            if (left_labels.empty() || right_labels.empty()) {
                continue;
            }

            const double weighted_gini =
                (left_labels.size() / total) * gini(left_labels) +
                (right_labels.size() / total) * gini(right_labels);

            if (weighted_gini < best_gini) {
                best_feature = static_cast<int>(feature);
                best_threshold = threshold;
                best_gini = weighted_gini;
            }
        }
    }
    return {best_feature, best_threshold, best_gini};
}

// Recursively grow a decision tree with greedy Gini splits.
// Stopping rules: fewer than 5 samples (returns nullptr — a pruned branch),
// a pure node, or no non-degenerate split available.
// Ownership: nodes are heap-allocated and never freed anywhere in this
// program — acceptable for a run-once process, but note the leak.
Node* build_tree(const vector<vector<double>>& data, const vector<int>& labels) {
    if (labels.size() < 5) {
        return nullptr;  // pre-pruning: too few samples to split further
    }

    int n1 = count(labels.begin(), labels.end(), 1);
    int n2 = count(labels.begin(), labels.end(), 0);
    bool positive = n1 >= n2;

    Node* node = new Node(-1, 0, n1, n2, positive, nullptr, nullptr);

    // Preserved original behavior: only the majority-class count is kept;
    // the minority count is zeroed out.
    if (positive) {
        node->n2 = 0;
    } else {
        node->n1 = 0;
    }

    if (gini(labels) == 0) {
        return node;  // pure node -> leaf
    }

    auto [best_feature, best_threshold, best_gini] = find_best_split(data, labels);
    (void)best_gini;  // impurity value itself is not needed here

    // No usable split found (e.g. all remaining rows are identical): stop as
    // a leaf. Without this guard the original indexed data[i][-1] (UB) and
    // could recurse forever on an unchanged partition.
    if (best_feature < 0) {
        return node;
    }

    node->feature = best_feature;
    node->threshold = best_threshold;

    vector<vector<double>> left_data, right_data;
    vector<int> left_labels, right_labels;

    for (size_t i = 0; i < data.size(); ++i) {
        if (data[i][best_feature] < best_threshold) {
            left_data.push_back(data[i]);
            left_labels.push_back(labels[i]);
        } else {
            right_data.push_back(data[i]);
            right_labels.push_back(labels[i]);
        }
    }

    node->left = build_tree(left_data, left_labels);
    node->right = build_tree(right_data, right_labels);

    return node;
}

bool classify(const Node* node, const vector<double>& subject) {
    if (!node) {
        // Handle the case when the node is null
        return false;
    }

    if (node->left == nullptr && node->right == nullptr) {
        return node->positive;
    }

    if (subject[node->feature] < node->threshold) {
        return classify(node->left, subject);
    } else {
        return classify(node->right, subject);
    }
}

// Split `s` on `delimiter` and return the pieces in order.
// getline semantics apply: a trailing delimiter does NOT produce a trailing
// empty token, and an empty input yields no tokens.
vector<string> split(const string& s, char delimiter) {
    vector<string> parts;
    istringstream stream(s);
    for (string piece; getline(stream, piece, delimiter);) {
        parts.push_back(piece);
    }
    return parts;
}

// Pipeline: load CSV (col 0 = row id, cols 1..7 = features with -99 marking
// missing values, last col = 0/1 label), mean-impute missing values, write
// the imputed data back out, train a decision tree on the first 512 rows,
// and report accuracy on the remainder.
int main() {
    ifstream file("Missing_Data_7features.csv");
    if (!file) {
        // Without this check the original silently processed zero rows and
        // then dereferenced out-of-range iterators below.
        cerr << "Error: cannot open Missing_Data_7features.csv" << endl;
        return 1;
    }

    // Read the CSV file header (kept verbatim for the output file).
    string header;
    getline(file, header);

    vector<vector<double>> data;
    vector<int> labels;
    vector<double> column_sums(7, 0.0);   // per-feature sum of observed values
    vector<int> column_counts(7, 0);      // per-feature count of observed values

    string line;
    while (getline(file, line)) {
        if (line.empty()) {
            continue;  // tolerate blank/trailing lines
        }
        vector<string> tokens = split(line, ',');
        vector<double> row;

        // Skip the id (tokens[0]) and the label (tokens.back()).
        // `i + 1 < tokens.size()` avoids the size_t underflow the original
        // `i < tokens.size() - 1` hit on a line with no commas.
        for (size_t i = 1; i + 1 < tokens.size(); ++i) {
            double value = stod(tokens[i]);
            if (value != -99) {  // -99 is the dataset's missing-value sentinel
                row.push_back(value);
                column_sums[i - 1] += value;
                column_counts[i - 1]++;
            } else {
                row.push_back(nan(""));  // mark missing for imputation below
            }
        }

        data.push_back(row);
        labels.push_back(stoi(tokens.back()));
    }
    file.close();

    // Mean-impute: replace each NaN with its column mean. (The original
    // repeated this whole loop a second time later; that pass was dead code
    // since no NaNs remain after the first.)
    for (auto& row : data) {
        for (size_t i = 0; i < row.size(); ++i) {
            if (std::isnan(row[i])) {
                row[i] = column_sums[i] / column_counts[i];
            }
        }
    }

    // Save the preprocessed data: 1-based row id, features, label.
    ofstream output_file("Estimated_Data_7features_final.csv");
    output_file << header << endl;
    output_file << fixed << setprecision(8);
    for (size_t i = 0; i < data.size(); ++i) {
        output_file << i + 1;
        for (const auto& value : data[i]) {
            output_file << ',' << value;
        }
        output_file << ',' << labels[i] << endl;
    }
    output_file.close();

    // Fixed train/test split: first 512 rows train, the rest test.
    const size_t train_size = 512;
    if (data.size() <= train_size) {
        // Guard the iterator arithmetic below (UB on a short file).
        cerr << "Error: need more than " << train_size << " rows, got "
             << data.size() << endl;
        return 1;
    }
    vector<vector<double>> train_data(data.begin(), data.begin() + train_size);
    vector<int> train_labels(labels.begin(), labels.begin() + train_size);
    vector<vector<double>> test_data(data.begin() + train_size, data.end());
    vector<int> test_labels(labels.begin() + train_size, labels.end());

    // Build the decision tree on the training partition.
    Node* root = build_tree(train_data, train_labels);

    // Classify the held-out test set.
    vector<bool> predictions;
    predictions.reserve(test_data.size());
    for (const auto& subject : test_data) {
        predictions.push_back(classify(root, subject));
    }

    // Accuracy = fraction of test rows predicted correctly.
    int correct = 0;
    for (size_t i = 0; i < test_labels.size(); ++i) {
        if (predictions[i] == test_labels[i]) {
            correct++;
        }
    }
    double accuracy = static_cast<double>(correct) / test_labels.size();

    cout << "Accuracy: " << accuracy << endl;

    // NOTE: the tree nodes are never freed; the process exits immediately
    // and the OS reclaims the memory.
    return 0;
}