Untitled
unknown
c_cpp
2 years ago
7.4 kB
4
Indexable
#include <iostream>
#include <vector>
#include <fstream>
#include <sstream>
#include <algorithm>
#include <limits>
#include <map>
#include <string>
#include <cmath>
#include <cstdlib>
#include <iomanip>
#include <tuple>

using namespace std;

// Binary decision-tree node.
// feature == -1 marks a node with no split (a leaf); otherwise `feature`
// is the column index tested against `threshold` (go left when value < threshold).
class Node {
public:
    int feature;     // split feature index, -1 for a leaf
    double threshold;
    int n1;          // count of label-1 training samples at this node
    int n2;          // count of label-0 training samples at this node
    bool positive;   // majority class at this node (ties -> positive)
    Node* left;
    Node* right;

    Node(int feature = -1, double threshold = 0, int n1 = 0, int n2 = 0,
         bool positive = false, Node* left = nullptr, Node* right = nullptr)
        : feature(feature), threshold(threshold), n1(n1), n2(n2),
          positive(positive), left(left), right(right) {}
};

// Gini impurity of a label multiset: 1 - sum_k p_k^2.
// An empty set has impurity 0 (it is trivially pure); the original code
// returned 1 here, which mis-ranked degenerate (one-sided) splits.
double gini(const vector<int>& labels) {
    if (labels.empty()) {
        return 0.0;
    }
    map<int, int> counts;
    for (int label : labels) {
        counts[label]++;
    }
    double sum = 0;
    for (const auto& [key, value] : counts) {
        double p = static_cast<double>(value) / labels.size();
        sum += p * p;
    }
    return 1 - sum;
}

// Exhaustively search every (feature, threshold) pair and return the one
// minimizing the size-weighted Gini impurity of the two children.
// Thresholds are the distinct observed feature values; samples go left when
// value < threshold. Splits that leave one side empty are skipped — the
// original code allowed them, which could make build_tree recurse forever
// on an unchanged subset. Returns {-1, 0, +inf} when no valid split exists.
tuple<int, double, double> find_best_split(const vector<vector<double>>& data,
                                           const vector<int>& labels) {
    int best_feature = -1;
    double best_threshold = 0;
    double best_gini = numeric_limits<double>::infinity();

    for (size_t feature = 0; feature < data[0].size(); ++feature) {
        // Candidate thresholds: the sorted, deduplicated values of this feature.
        vector<double> feature_values;
        feature_values.reserve(data.size());
        for (size_t i = 0; i < data.size(); ++i) {
            feature_values.push_back(data[i][feature]);
        }
        sort(feature_values.begin(), feature_values.end());
        feature_values.erase(unique(feature_values.begin(), feature_values.end()),
                             feature_values.end());

        for (double threshold : feature_values) {
            vector<int> left_labels;
            vector<int> right_labels;
            for (size_t i = 0; i < data.size(); ++i) {
                if (data[i][feature] < threshold) {
                    left_labels.push_back(labels[i]);
                } else {
                    right_labels.push_back(labels[i]);
                }
            }
            // A one-sided "split" partitions nothing; skip it.
            if (left_labels.empty() || right_labels.empty()) {
                continue;
            }
            double left_gini = gini(left_labels);
            double right_gini = gini(right_labels);
            double weighted_gini =
                (static_cast<double>(left_labels.size()) / labels.size()) * left_gini +
                (static_cast<double>(right_labels.size()) / labels.size()) * right_gini;
            if (weighted_gini < best_gini) {
                best_feature = feature;
                best_threshold = threshold;
                best_gini = weighted_gini;
            }
        }
    }
    return {best_feature, best_threshold, best_gini};
}

// Recursively grow a CART-style tree.
// Stopping rules: fewer than 5 samples (returns nullptr — the parent then acts
// as the leaf), a pure node (gini == 0), or no valid split available.
// NOTE(review): the node zeroes the minority-class count (n1 or n2); that
// mirrors the original code's bookkeeping and is preserved as-is.
// The caller owns the returned tree and must release it with free_tree().
Node* build_tree(const vector<vector<double>>& data, const vector<int>& labels) {
    if (labels.size() < 5) {
        return nullptr;
    }
    int n1 = count(labels.begin(), labels.end(), 1);
    int n2 = count(labels.begin(), labels.end(), 0);
    bool positive = n1 >= n2;
    Node* node = new Node(-1, 0, n1, n2, positive, nullptr, nullptr);
    if (positive) {
        node->n2 = 0;
    } else {
        node->n1 = 0;
    }
    if (gini(labels) == 0) {
        return node; // pure node — nothing left to split
    }

    int best_feature;
    double best_threshold, best_gini;
    tie(best_feature, best_threshold, best_gini) = find_best_split(data, labels);
    if (best_feature == -1) {
        return node; // no feature separates the samples; stop as a leaf
    }
    node->feature = best_feature;
    node->threshold = best_threshold;

    vector<vector<double>> left_data, right_data;
    vector<int> left_labels, right_labels;
    for (size_t i = 0; i < data.size(); ++i) {
        if (data[i][best_feature] < best_threshold) {
            left_data.push_back(data[i]);
            left_labels.push_back(labels[i]);
        } else {
            right_data.push_back(data[i]);
            right_labels.push_back(labels[i]);
        }
    }
    node->left = build_tree(left_data, left_labels);
    node->right = build_tree(right_data, right_labels);
    return node;
}

// Recursively release a tree built by build_tree (fixes the original leak).
void free_tree(Node* node) {
    if (!node) {
        return;
    }
    free_tree(node->left);
    free_tree(node->right);
    delete node;
}

// Walk the tree for one sample and return its predicted class.
// A null node (possible when a child was pruned by the min-size rule)
// conservatively predicts negative, matching the original behavior.
bool classify(const Node* node, const vector<double>& subject) {
    if (!node) {
        return false;
    }
    if (node->left == nullptr && node->right == nullptr) {
        return node->positive;
    }
    if (subject[node->feature] < node->threshold) {
        return classify(node->left, subject);
    } else {
        return classify(node->right, subject);
    }
}

// Split a string on a single-character delimiter into tokens.
vector<string> split(const string& s, char delimiter) {
    vector<string> tokens;
    string token;
    istringstream tokenStream(s);
    while (getline(tokenStream, token, delimiter)) {
        tokens.push_back(token);
    }
    return tokens;
}

int main() {
    // --- Load the CSV: column 0 is an ID, last column is the 0/1 label,
    // --- the 7 columns in between are features; -99 marks a missing value.
    ifstream file("Missing_Data_7features.csv");
    if (!file.is_open()) {
        cerr << "Error: could not open Missing_Data_7features.csv" << endl;
        return 1;
    }

    string header;
    getline(file, header);

    vector<vector<double>> data;
    vector<int> labels;
    vector<double> column_sums(7, 0.0);  // running sums of non-missing values
    vector<int> column_counts(7, 0);     // counts of non-missing values
    string line;
    while (getline(file, line)) {
        vector<string> tokens = split(line, ',');
        vector<double> row;
        for (size_t i = 1; i < tokens.size() - 1; ++i) {
            double value = stod(tokens[i]);
            if (value != -99) {
                row.push_back(value);
                column_sums[i - 1] += value;
                column_counts[i - 1]++;
            } else {
                row.push_back(nan("")); // placeholder, imputed below
            }
        }
        data.push_back(row);
        labels.push_back(stoi(tokens.back()));
    }
    file.close();

    // --- Mean imputation: replace each NaN with its column mean.
    // (The original performed this identical pass twice; the second was a no-op.)
    for (auto& row : data) {
        for (size_t i = 0; i < row.size(); ++i) {
            if (std::isnan(row[i]) && column_counts[i] > 0) {
                row[i] = column_sums[i] / column_counts[i];
            }
        }
    }

    // --- Persist the imputed dataset.
    ofstream output_file("Estimated_Data_7features_final.csv");
    output_file << header << endl;
    output_file << fixed << setprecision(8);
    for (size_t i = 0; i < data.size(); ++i) {
        output_file << i + 1;
        for (const auto& value : data[i]) {
            output_file << ',' << value;
        }
        output_file << ',' << labels[i] << endl;
    }
    output_file.close();

    // --- Train/test split: first 512 rows train, the rest test.
    // Clamped so a short file cannot index past the end (UB in the original).
    const size_t train_size = min<size_t>(512, data.size());
    vector<vector<double>> train_data(data.begin(), data.begin() + train_size);
    vector<int> train_labels(labels.begin(), labels.begin() + train_size);
    vector<vector<double>> test_data(data.begin() + train_size, data.end());
    vector<int> test_labels(labels.begin() + train_size, labels.end());

    // --- Fit the tree and evaluate on the held-out rows.
    Node* root = build_tree(train_data, train_labels);

    vector<bool> predictions;
    predictions.reserve(test_data.size());
    for (const auto& subject : test_data) {
        predictions.push_back(classify(root, subject));
    }

    int correct = 0;
    for (size_t i = 0; i < test_labels.size(); ++i) {
        if (predictions[i] == (test_labels[i] == 1)) {
            correct++;
        }
    }
    double accuracy = test_labels.empty()
                          ? 0.0
                          : static_cast<double>(correct) / test_labels.size();
    cout << "Accuracy: " << accuracy << endl;

    free_tree(root);
    return 0;
}
Editor is loading...