Jun 27, 2024

mail@pastecode.io avatar
unknown
c_cpp
15 days ago
16 kB
1
Indexable
Never
#include <iostream>
#include <random>
#include <string>
#include<set>
#include<fstream>

using namespace std;
//std::string randomString;
int hd_si_tau[1000][1000];

string zodd="";
string zeven="";
uint64_t zodd_t, zeven_t;
string type_map[] = {"random", "stress_random_lenk", "stress_ACGT", "stress_AC", "stress_A"};

//RESULTS
double estimator;
int S_obs;
double error_term;
int type = 1;

class SimParams{
    public:
    int k;
    unsigned long L;
    double r;

    SimParams(int k, unsigned long L, double r){
        this->k = k;
        this->L = L;
        this->r = r;
    }
    double get_one_minus_q(){
        return pow(1.0-r, k);
        //(1 − r)^k
    }
    double get_q(){
        return 1 - pow(1.0-r, k);
        //1 − (1 − r)^k
    }
    double get_p(){
        return r/3.0/(1-r);
    }

    unsigned long get_n(){
        return L+k-1;
    }  
};


std::string encode_kmer(const std::string& kmer) {
    std::string encoded_kmer;
    for (char base : kmer) {
        switch (base) {
            case 'A':
                encoded_kmer += "00"; // 'A' is encoded as 00
                break;
            case 'C':
                encoded_kmer += "01"; // 'C' is encoded as 01
                break;
            case 'G':
                encoded_kmer += "10"; // 'G' is encoded as 10
                break;
            case 'T':
                encoded_kmer += "11"; // 'T' is encoded as 11
                break;
            default:
                throw std::invalid_argument("Invalid character in k-mer: " + std::string(1, base));
        }
    }
    return encoded_kmer;
}


vector<uint64_t> stringToBinaryVec(string kmer){
    int K = kmer.length();
    string bv_line = encode_kmer(kmer);
    vector<uint64_t> curr_bv(ceil(K/32));
    for (int i = 0; i< ceil(K/32); i++){
        if(2*K>i*64){
            curr_bv[i] = std::stoull(bv_line.substr(64*i,std::min(64, 2*K-i*64)), nullptr, 2);
        }
    }
    return curr_bv;
}

uint64_t stringToBinary(string kmer){
    string bv_line = encode_kmer(kmer);
    int K = kmer.length();
    uint64_t curr_bv_lo = std::stoull(bv_line.substr(0,std::min(64, 2*K)), nullptr, 2);
    return curr_bv_lo;
} 


std::string generateRandomString(int length, unsigned int seed=0) {
    const std::string charset = "ACGT"; // Characters to choose from
    std::random_device rd;
    std::mt19937 gen(rd());

    //std::mt19937 gen(seed); to set seed
    std::uniform_int_distribution<> dis(0, charset.length() - 1);

    std::string randomString;
    randomString.reserve(length);

    for (int i = 0; i < length; ++i) {
        randomString += charset[dis(gen)];
    }

    return randomString;
}

std::string generateMutatedString(string s, double r=0.1) {
    string notT = "ACG";
    string notG = "ACT";
    string notC = "AGT";
    string notA = "CGT";

    std::random_device rd;
    std::mt19937 gen(rd());
    
    std::string mutatedString;
    mutatedString.reserve(s.length());

    for (int i = 0; i < s.length(); ++i) {
        std::uniform_real_distribution<> dis1(0.0, 1.0);
        // std::uniform_real_distribution<> dis2(0.0, 1.0);
        // std::uniform_real_distribution<> dis3(0.0, 1.0);
        double random_value = dis1(gen);
        
        
        //cout<<i<<" "<<random_value<<" "<<(1-r)<<endl;
        
        if (random_value < (1-r)) {
            mutatedString += s[i]; // Same with probability (1-r) //stay same
        } else {
            std::uniform_int_distribution<> dis(0, 2); // Range: {0, 1, 2}
            if(s[i] == 'A'){
                mutatedString += notA[dis(gen)];
            }else if(s[i] == 'C'){
                mutatedString += notC[dis(gen)];
            }else if(s[i] == 'G'){
                mutatedString += notG[dis(gen)];
            }else if(s[i] == 'T'){
                mutatedString += notT[dis(gen)];
            }
            
        }
    }
    return mutatedString;
}

// Function to calculate the probability that a standard normal variable > c
double probability_greater_than(double c) {
    double cdf_c = 0.5 * (1 + std::erf(c / std::sqrt(2))); // Calculate the CDF of the standard normal distribution at c
    //cout<<"Z"<<1.0 - cdf_c<<endl; // The probability that a standard normal variable > c
    return 1.0 - cdf_c;
}


set<string> kspectrum(string s, int k){
    set<string> kmers;
    for(int i=0; i<s.length()-k+1; i++){
        string kmer = s.substr(i, k);
        kmers.insert(kmer);
    }
    return kmers;
}
// function to calculate Hamming distance 
int hammingDist(string str1, string str2) 
{ 
    int i = 0, count = 0; 
    while (str1[i] != '\0') { 
        if (str1[i] != str2[i]) 
            count++; 
        i++; 
    } 
    return count; 
} 

void tau_join(string kmer, vector<string>& retvec){
    int k = kmer.length();
    
    for (int delta = 1; delta<=k; delta++){ // <= k 
        bool isOverlap = true;
        for(int j = delta-1; j>=0; j--){
            if(kmer[j]!=kmer[k-delta+j]){
                isOverlap = false;
                break;
            }
        }
        if(isOverlap){
            retvec.push_back(kmer+kmer.substr(delta, k-delta));
        }
       
        // kmer[0] == kmer[k-1]

        // kmer[0] = kmer[k-2]
        // kmer[1] = kmer[k-1]

        // kmer[0] = kmer[k-3]
        // kmer[1] = kmer[k-2]
        // kmer[2] = kmer[k-1]
    }
    
}

void precompute_si_tau_hd_result_thm11(SimParams& sim, string& s){
    int k = sim.k;
    set<string> set1 = kspectrum(s, k);
    std::vector<string> s_vector(set1.begin(), set1.end());

    std::vector< vector<string> > tau_joins(s_vector.size());
    for (int i = 0; i <= s.length() - k; ++i) {
        string s_i = (s.substr(i, k));
        for(int j = 0 ; j < s_vector.size(); j++ ){
            string tau = s_vector[j]; //FiX 
            hd_si_tau[i][j] = hammingDist(s_i, tau);
            tau_join(tau, tau_joins[j]);
        }
    }

    double p = sim.get_p();
    int L = s.length() - k + 1;
    double r = sim.r;

    double sum_pr = 0;
    double sum_e_max_tau = 0;
    for(int j = 0 ; j<s_vector.size(); j++){ // not on I, length of |sp(s)|

        double sum_top = 0;
        double sum_bottom1 = 0;
        double sum_bottom2 = 0;

        
        for(int i = 0; i< L; i++){ 
            double pp=pow(p, hd_si_tau[i][j]);
            double pp2 = pow(pp,2);

            sum_top -= pp;
            sum_bottom1 += pp/sim.get_one_minus_q() - pp2;
            //sum_bottom1 += pp/();


            for(int delta = 1; delta<=k; delta++){
                sum_bottom2+= -2*pow(p, hd_si_tau[i][j] + hd_si_tau[i+delta][j] );
            }

            for (int t = 0; t< tau_joins[j].size(); t++){
                string joinedString = tau_joins[j][t];
                int delta = 2*k - joinedString.length();//len = 2k -delta

                //sum_bottom2+= -2*pow(p, hd_si_tau[i][j] + hd_si_tau[i+delta][j] );

                string s_i_len_same_as_joinedString = s.substr(i, joinedString.length() );
                int d = hammingDist(s_i_len_same_as_joinedString, joinedString);
                sum_bottom2 += 2*pow(p,d)/(pow(1-r, delta));
            }
        }
        // cout<<"sum top "<<sum_top<<endl;
        // cout<<"sum b1 "<<sum_bottom1<<endl;
        // cout<<"sum b2 "<<sum_bottom2<<endl;
        // cout<<"sum_top / sqrt(sum_bottom1 + sum_bottom2 "<<sum_top / sqrt(sum_bottom1 + sum_bottom2)<<endl;

        double sigma_tau = sqrt(sum_bottom1 + sum_bottom2);
        
        
        const double pi = M_PI; // Value of pi
        double term1 = pow(2.0 / pi, 1.0 / 4.0);
        double term2 = pow(2 * k, 2.0) * L / pow(sigma_tau, 3.0);
        double term3 = sqrt(28.0) * pow(2 * k, 1.5) * sqrt(L) / (sqrt(pi) * pow(sigma_tau, 2.0));
        double e_max_tau = term1 * sqrt(term2 + term3);
        //cout<<term1<<" "<<term2<<" "<<term3<<" "<<e_max_tau<<endl;
        sum_e_max_tau += e_max_tau;
        sum_pr += probability_greater_than(sum_top / sqrt(sum_bottom1 + sum_bottom2));
    }
    estimator = sum_pr;
    error_term = sum_e_max_tau;

    //cout<<"Sum Pr"<<sum_pr<<"I "<<I<<endl;
}

string generateStressTestString(int length, int k){
    std::string pattern = "ACGT";
    if(type == 1){
        pattern = generateRandomString(k);
    }else if(type == 2){
        pattern = "ACGT";
    }else if(type == 3){
        pattern = "AC";
    }else if(type == 4){
        pattern = "A";
    }

    std::string s = "";
    std::string result;
    for (int i = 0; i < k; ++i) {
        result += pattern[i % pattern.length()];
    }

    for (int i = 0; i < floor(length/k); ++i) {
        s += result;
    }

    int patlen = s.length();
    for (int i = 1; i <= length-patlen; ++i) {
        s += "A";
    }
    //cout<<length<<" " << s.length()<<" "<<"stress test str"<<s<<endl;
    return s;
}

int intersect_size(string s, string t, int k){
    set<string> set1 = kspectrum(s, k);
    set<string> set2 = kspectrum(t, k);

    std::vector<string> intersect;

    std::set_intersection(set1.begin(), set1.end(),
                          set2.begin(), set2.end(),
                          std::back_inserter(intersect));
    return intersect.size();
}

string readString(string filename){
    string randomString;
    std::ifstream file(filename); 
    if (file.is_open()) {
        if (std::getline(file, randomString)) { 
            //std::cout << "Line read from file: " << line << std::endl;
            //L = randomString.length();
        } else {
            std::cerr << "File is empty." << std::endl;
        }
        file.close(); 
    } else {
        std::cerr << "Unable to open file." << std::endl;
    }
    return randomString;
}
string readSubstring(string filename, int L){
    std::ifstream file(filename);
    std::string line;

    if (file.is_open()) {
        file.seekg(0, std::ios::end); // Get the size of the file
        std::streampos fileSize = file.tellg();
        file.seekg(0, std::ios::beg);

        //std::srand(std::time(nullptr)); // Generate a random starting position within the file
        std::streampos startPos = (std::rand() % (int(fileSize) - L));
 
        // Move to the random starting position
        file.seekg(startPos);

        // Read 100 characters from the file
        char buffer[L+1]; // Buffer to hold 100 characters + null terminator
        file.read(buffer, L);
        buffer[L] = '\0'; // Null terminate the string

        // Assign the buffer content to the string
        line = buffer;

        //cout<< "sp" << startPos << " " <<std::rand() <<endl;
        //std::cout << "Random "<< L <<" characters from the file: " << line << std::endl;

        file.close(); // Close the file
    } else {
        std::cerr << "Unable to open file." << std::endl;
    }
    return line;
}

// double calculateMean(const std::vector<int>& numbers) {
//     double sum = 0.0;
//     for (const auto& num : numbers) {
//         sum += num;
//     }
//     return sum / numbers.size();
// }

// double calculateStandardDeviation(const std::vector<int>& numbers) {
//     double mean = calculateMean(numbers);
//     double squaredDiffSum = 0.0;
//     for (const auto& num : numbers) {
//         double diff = num - mean;
//         squaredDiffSum += diff * diff;
//     }
//     double variance = squaredDiffSum / numbers.size();
//     return std::sqrt(variance);
// }

void hammingDistanceGlobalSet(int K){
    for (int i = 0; i < K+2; ++i) { //i<32 for K=30
        zodd += "01";
    }
    for (int i = 0; i < K+2; ++i) {
        zeven += "10";
    }
    zodd_t = std::stoull(zodd, nullptr, 2);
    zeven_t = std::stoull(zeven, nullptr, 2);
}

class ResultAggregator{
    public:
        vector<int> isizes;

    double isize_mean;
    double isize_sd;

    double calculateMean(const std::vector<int>& numbers) {
        double sum = 0.0;
        for (const auto& num : numbers) {
            sum += num;
        }
        return sum / numbers.size();
    }

    double calculateMean(const std::vector<double>& numbers) {
        double sum = 0.0;
        for (const auto& num : numbers) {
            sum += num;
        }
        return sum / numbers.size();
    }

    double calculateStandardDeviation(const std::vector<int>& numbers) {
        double mean = calculateMean(numbers);
        double squaredDiffSum = 0.0;
        for (const auto& num : numbers) {
            double diff = num - mean;
            squaredDiffSum += diff * diff;
        }
        double variance = squaredDiffSum / numbers.size();
        return std::sqrt(variance);
    }

    double calculateStandardDeviation(const std::vector<double>& numbers) {
        double mean = calculateMean(numbers);
        double squaredDiffSum = 0.0;
        for (const auto& num : numbers) {
            double diff = num - mean;
            squaredDiffSum += diff * diff;
        }
        double variance = squaredDiffSum / numbers.size();
        return std::sqrt(variance);
    }

    void calculateMeanSD(const std::vector<int>& numbers, double& mean, double& sd) {
        mean = calculateMean(numbers);
        double squaredDiffSum = 0.0;
        for (const auto& num : numbers) {
            double diff = num - mean;
            squaredDiffSum += diff * diff;
        }
        double variance = squaredDiffSum / numbers.size();
        sd = std::sqrt(variance);
    }

    void calculateMeanSD(const std::vector<double>& numbers, double& mean, double& sd) {
        mean = calculateMean(numbers);
        double squaredDiffSum = 0.0;
        for (const auto& num : numbers) {
            double diff = num - mean;
            squaredDiffSum += diff * diff;
        }
        double variance = squaredDiffSum / numbers.size();
        sd = std::sqrt(variance);
    }
};


int main (int argc, char* argv[]){
    //do for different r
    srand(time(nullptr));

    int L = -1;
    int k = -1;
    double r = -1;

    string filename="";


    int num_replicates = 20; //do this random string generation 100 times
    vector<string> args(argv + 1, argv + argc);
    for (auto i = args.begin(); i != args.end(); ++i) {
            if (*i == "-h" || *i == "--help") {
                cout << "Syntax: -i [input-file] -l <length> -r <mutation-rate> -t <type-of-input-string> -k <kmer-size> -c <num-replicates>" << endl;
                return 0;
            } else if (*i == "-r") {
                r = stod(*++i);
            } else if (*i == "-l") {
                L = stoull(*++i);
            } else if (*i == "-k") {
                k = stoi(*++i);
            } else if (*i == "-t") {
                type = stoi(*++i);
            }else if (*i == "-i") {
                filename = (*++i);
            } else if (*i == "-c") {
                num_replicates = stoi(*++i);
            }
    }

    //hammingDistanceGlobalSet(K);
    
    //SimParams sim(5, 100, 0.01); //k, L, r
    SimParams sim(k, L, r); //k, L, r

    
    ResultAggregator res;

    //do a single string
    std::string randomString;
    filename=""; //filename="/Users/amatur/code/downloads/t2t_chr21.sset";
    int length = sim.get_n();
    if (filename==""){
        if(type == 0){
            randomString = generateRandomString(length);
        }else{
            randomString = generateStressTestString(length, sim.k);
        }
    }else{
        randomString = readSubstring(filename, length);
    }
    
    for(int i=0; i<num_replicates; i++){
        std::string mutatedString = generateMutatedString(randomString, sim.r);
        precompute_si_tau_hd_result_thm11(sim, randomString);
        int Isize = intersect_size(randomString, mutatedString, sim.k);
        res.isizes.push_back(Isize);
    }

    res.calculateMeanSD(res.isizes, res.isize_mean, res.isize_sd);
    
    double r_prime = 1.0 - pow(2, log2(S_obs)/sim.k-log2(sim.L)/sim.k);
    cout<<"num_replicates "<<num_replicates<< " input_s "<< type_map[type] << " L "<<sim.L<<" r "<<sim.r<< " k "<< sim.k <<  " estimate " << estimator << " |I|_mean " << res.isize_mean << " |I|_sd " << res.isize_sd << " L(1-q) "<<sim.L*(1-sim.get_q())<< " r_prime "  << r_prime << " theoretical_err " << error_term << " observed_err "<< abs( res.isize_mean-estimator)<< endl;
    //cout<<endl;

}
Leave a Comment