// percentile_removal
//
// mail@pastecode.io avatar
// unknown
// c_cpp
// 2 years ago
// 2.4 kB
// 24
// Indexable
// Never
#include <algorithm>    // std::sort, std::stable_sort
#include <cstddef>      // std::size_t
#include <cstdlib>      // std::rand, std::srand, RAND_MAX
#include <ctime>        // std::time
#include <iomanip>      // std::setprecision
#include <iostream>
#include <numeric>      // std::iota
#include <vector>

// Returns the indices of `v` ordered so that the values they refer to are
// ascending, i.e. v[result[0]] <= v[result[1]] <= ...; `v` itself is not
// modified. (Adapted from an answer found on Stack Overflow.)
//
// Parameters:
//   v - values to rank; elements are only compared with operator<.
// Returns:
//   A vector of v.size() indices into `v`. An empty `v` gives an empty result.
template <typename T>
std::vector<size_t> sort_indexes(const std::vector<T> &v) {
    // Start from the identity permutation 0, 1, ..., v.size() - 1.
    // The seed is a size_t (not an int literal) so the running counter has
    // the same type as the indices being stored.
    std::vector<size_t> idx(v.size());
    std::iota(idx.begin(), idx.end(), size_t{0});

    // Order the indices by the values they point at. std::stable_sort (rather
    // than std::sort) keeps indices of equal values in their original
    // relative order, avoiding unnecessary re-orderings.
    std::stable_sort(idx.begin(), idx.end(),
        [&v](size_t lhs, size_t rhs) { return v[lhs] < v[rhs]; });

    return idx;
}

// Demo driver: fills two equally sized vectors with random values in [0, 1],
// then removes from BOTH vectors the positions whose distr1 value lies below
// distr1's 20th percentile, printing the data before and after the cull.
// The command-line arguments are not used. Always returns 0.
int main(int /*argc*/, const char * /*argv*/[]) {
    const int sample_count = 5;       // number of samples per distribution
    const double cull_fraction = 0.2; // 20%

    // create 2 distributions, but we will only remove elements (of BOTH)
    // based on the percentile of distr1
    std::vector<double> distr1, distr2;
    distr1.reserve(sample_count);
    distr2.reserve(sample_count);

    std::cout << std::setprecision(2); // just to make printout more readable
    std::srand(static_cast<unsigned int>(std::time(nullptr))); // seed random based on time

    // fill vectors randomly (distr1 draws first, then distr2, per sample)
    for (int i = 0; i < sample_count; ++i) {
        distr1.push_back(static_cast<double>(std::rand()) / RAND_MAX);
        distr2.push_back(static_cast<double>(std::rand()) / RAND_MAX);
    }
    for (double d : distr1) {
        std::cout << "distribution 1: " << d << "\n";
    }
    for (double d : distr2) {
        std::cout << "distribution 2: " << d << "\n";
    }

    // indices of distr1 in ascending order of value
    std::vector<size_t> sorted_idx = sort_indexes(distr1);

    // Number of lowest-valued elements to drop; truncation towards zero is
    // intended. Because cull_fraction < 1 and the vectors are non-empty,
    // cull_count is a valid position in sorted_idx.
    const size_t cull_count =
        static_cast<size_t>(cull_fraction * static_cast<double>(distr1.size()));
    std::cout << "deleting based on distr1's 20th percentile, which is "
              << distr1[sorted_idx[cull_count]] << "\n";

    // remove indices corresponding with elements less than percentile
    sorted_idx.erase(
        sorted_idx.begin(),
        sorted_idx.begin() + static_cast<std::vector<size_t>::difference_type>(cull_count));
    // put the surviving indices back into original positional order so the
    // culled vectors keep the element order of the inputs
    std::sort(sorted_idx.begin(), sorted_idx.end());

    std::vector<double> culled_vec1, culled_vec2;
    culled_vec1.reserve(sorted_idx.size());
    culled_vec2.reserve(sorted_idx.size());
    for (size_t kept : sorted_idx) {
        culled_vec1.push_back(distr1[kept]);
        culled_vec2.push_back(distr2[kept]);
    }

    for (double d : culled_vec1) {
        std::cout << "culled vector 1: " << d << "\n";
    }
    for (double d : culled_vec2) {
        std::cout << "culled vector 2: " << d << "\n";
    }
    return 0;
}