Untitled

mail@pastecode.io avatar
unknown
c_cpp
a month ago
16 kB
2
Indexable
Never
#include <iostream>
#include <random>
#include <string>
#include<set>
#include<fstream>

using namespace std;
std::string randomString;
std::string mutatedString;
int hd_si_tau[1000][1000];


int K = 30;
string zodd="";
string zeven="";
uint64_t zodd_t, zeven_t;


double estimator;
int S_obs;



class SimParams{
    public:
    int k;
    unsigned long L;
    double r;

    SimParams(int k, unsigned long L, double r){
        this->k = k;
        this->L = L;
        this->r = r;
    }
    double get_one_minus_q(){
        return pow(1.0-r, k);
        //(1 − r)^k
    }
    double get_q(){
        return 1 - pow(1.0-r, k);
        //1 − (1 − r)^k
    }
    double get_p(){
        return r/3.0/(1-r);
    }

    unsigned long get_n(){
        return L+k-1;
    }

    
    
};


std::string encode_kmer(const std::string& kmer) {
    std::string encoded_kmer;
    for (char base : kmer) {
        switch (base) {
            case 'A':
                encoded_kmer += "00"; // 'A' is encoded as 00
                break;
            case 'C':
                encoded_kmer += "01"; // 'C' is encoded as 01
                break;
            case 'G':
                encoded_kmer += "10"; // 'G' is encoded as 10
                break;
            case 'T':
                encoded_kmer += "11"; // 'T' is encoded as 11
                break;
            default:
                throw std::invalid_argument("Invalid character in k-mer: " + std::string(1, base));
        }
    }
    return encoded_kmer;
}


vector<uint64_t> stringToBinaryVec(string kmer){
    int K = kmer.length();
    string bv_line = encode_kmer(kmer);
    vector<uint64_t> curr_bv(ceil(K/32));
    for (int i = 0; i< ceil(K/32); i++){
        if(2*K>i*64){
            curr_bv[i] = std::stoull(bv_line.substr(64*i,std::min(64, 2*K-i*64)), nullptr, 2);
        }
    }
    return curr_bv;
}

uint64_t stringToBinary(string kmer){
    string bv_line = encode_kmer(kmer);
    uint64_t curr_bv_lo = std::stoull(bv_line.substr(0,std::min(64, 2*K)), nullptr, 2);
    return curr_bv_lo;
} 


std::string generateRandomString(int length, unsigned int seed=0) {
    const std::string charset = "ACGT"; // Characters to choose from
    std::random_device rd;
    std::mt19937 gen(rd());

    //std::mt19937 gen(seed); to set seed
    std::uniform_int_distribution<> dis(0, charset.length() - 1);

    std::string randomString;
    randomString.reserve(length);

    for (int i = 0; i < length; ++i) {
        randomString += charset[dis(gen)];
    }

    return randomString;
}

std::string generateMutatedString(string s, double r=0.1) {
    string notT = "ACG";
    string notG = "ACT";
    string notC = "AGT";
    string notA = "CGT";

    std::random_device rd;
    std::mt19937 gen(rd());
    
    std::string mutatedString;
    mutatedString.reserve(s.length());

    for (int i = 0; i < s.length(); ++i) {
        std::uniform_real_distribution<> dis1(0.0, 1.0);
        // std::uniform_real_distribution<> dis2(0.0, 1.0);
        // std::uniform_real_distribution<> dis3(0.0, 1.0);
        double random_value = dis1(gen);
        
        
        //cout<<i<<" "<<random_value<<" "<<(1-r)<<endl;
        
        if (random_value < (1-r)) {
            mutatedString += s[i]; // Same with probability (1-r) //stay same
        } else {
            std::uniform_int_distribution<> dis(0, 2); // Range: {0, 1, 2}
            if(s[i] == 'A'){
                mutatedString += notA[dis(gen)];
            }else if(s[i] == 'C'){
                mutatedString += notC[dis(gen)];
            }else if(s[i] == 'G'){
                mutatedString += notG[dis(gen)];
            }else if(s[i] == 'T'){
                mutatedString += notT[dis(gen)];
            }
            
        }
    }
    return mutatedString;
}

// Function to calculate the probability that a standard normal variable > c
double probability_greater_than(double c) {
    // Calculate the CDF of the standard normal distribution at c
    double cdf_c = 0.5 * (1 + std::erf(c / std::sqrt(2)));
    // The probability that a standard normal variable > c
    //cout<<"Z"<<1.0 - cdf_c<<endl;
    return 1.0 - cdf_c;
}


set<string> kspectrum(string s, int k){
    set<string> kmers;
    for(int i=0; i<s.length()-k+1; i++){
        string kmer = s.substr(i, k);
        kmers.insert(kmer);
    }
    return kmers;
}
// function to calculate Hamming distance 
int hammingDist(string str1, string str2) 
{ 
    int i = 0, count = 0; 
    while (str1[i] != '\0') { 
        if (str1[i] != str2[i]) 
            count++; 
        i++; 
    } 
    return count; 
} 

void tau_join(string kmer, vector<string>& retvec){
    int k = kmer.length();
    
    for (int delta = 1; delta<k; delta++){
        bool isOverlap = true;
        for(int j = delta-1; j>=0; j--){
            if(kmer[j]!=kmer[k-delta+j]){
                isOverlap = false;
                break;
            }else{
                //cout<<delta<<" " << kmer[j]<<kmer[j]<<" ";
            }
        }
        if(isOverlap){
            //cout<<delta<<" ";
            //cout<<kmer+kmer.substr(delta, k-delta);
            retvec.push_back(kmer+kmer.substr(delta, k-delta));
            //cout<<endl;
        }
       
        
        // kmer[0] == kmer[k-1]

        // kmer[0] = kmer[k-2]
        // kmer[1] = kmer[k-1]

        // kmer[0] = kmer[k-3]
        // kmer[1] = kmer[k-2]
        // kmer[2] = kmer[k-1]
    }
    
}

void precompute_si_tau_hd(SimParams& sim){
    //do all pair hd 
    // and do delta
    int k=sim.k;
    string& s = randomString;
    string& t = mutatedString;

    set<string> set1 = kspectrum(s, k);
    set<string> set2 = kspectrum(t, k);

    std::vector<string> intersect;

    std::set_intersection(set1.begin(), set1.end(),
                          set2.begin(), set2.end(),
                          std::back_inserter(intersect));


    // for (int i = 0; i < mu; ++i) {
    //     for (int j = 0; j < k; ++j) {
    //         hd_si_tau[i][j] = hammingDist(std::bitset<64>(s_i).to_string(), std::bitset<64>(tau_i).to_string());
    //     }
    // }

    std::vector< vector<string> > tau_joins(intersect.size());
    for (int i = 0; i <= s.length() - k; ++i) {
        string s_i = (s.substr(i, k));
        for(int j = 0 ; j < intersect.size(); j++ ){
            string tau = intersect[j];
            hd_si_tau[i][j] = hammingDist(s_i, tau);

            tau_join(tau, tau_joins[j]);
        }
    }


    double p = sim.get_p();

    int L = s.length() - k + 1;
    int I = intersect.size();
    double r = sim.r;


    double sum_pr = 0;
    for(int j = 0 ; j<I; j++){

        double sum_top = 0;
        double sum_bottom1 = 0;
        double sum_bottom2 = 0;

        
        for(int i = 0; i< L; i++){
            double pp=pow(p, hd_si_tau[i][j]);
            double pp2 = pow(pp,2);

            sum_top -= pp;
            sum_bottom1 += pp/sim.get_one_minus_q() - pp2;
            //sum_bottom1 += pp/();


            // for(int delta = 1; delta<=k; delta++){
            //     sum_bottom2+= -2*pow(p, hd_si_tau[i][j] + hd_si_tau[i+delta][j] );
            // }

            for (int t = 0; t< tau_joins[j].size(); t++){
                string joinedString = tau_joins[j][t];
                int delta = 2*k - joinedString.length();//len = 2k -delta

                sum_bottom2+= -2*pow(p, hd_si_tau[i][j] + hd_si_tau[i+delta][j] );

                string s_i_joinedStringLen = s.substr(i, joinedString.length() );
                int d = hammingDist(s_i_joinedStringLen, joinedString);
                sum_bottom2 += 2*pow(p,d)/(pow(1-r, delta));
            }
        }
        // cout<<"sum top "<<sum_top<<endl;
        // cout<<"sum b1 "<<sum_bottom1<<endl;
        // cout<<"sum b2 "<<sum_bottom2<<endl;
        // cout<<"sum_top / sqrt(sum_bottom1 + sum_bottom2 "<<sum_top / sqrt(sum_bottom1 + sum_bottom2)<<endl;

        sum_pr += probability_greater_than(sum_top / sqrt(sum_bottom1 + sum_bottom2));

        
    }
    estimator = sum_pr;
    S_obs = I;
    //cout<<"Sum Pr"<<sum_pr<<"I "<<I<<endl;
    

    

}

string generateStressTestString(int length, int k){
    int type=1;
    
    std::string pattern = "ACGT";
    if(type==0){
        pattern = "ACGT";
    }else if(type==1){
        pattern = "AC";
    }else if(type==2){
        pattern = "A";
    }

    std::string s = "";
    std::string result;
    for (int i = 0; i < k; ++i) {
        result += pattern[i % pattern.length()];
    }

    for (int i = 0; i < floor(length/k); ++i) {
        s += result;
    }

    int patlen = s.length();
    for (int i = 1; i <= length-patlen; ++i) {
        s += "A";
    }
    //cout<<length<<" " << s.length()<<" "<<"stress test str"<<s<<endl;
    return s;
}

unsigned long intersect_size(string s, string t, int k){
    set<string> set1 = kspectrum(s, k);
    set<string> set2 = kspectrum(t, k);

    std::vector<string> intersect;

    std::set_intersection(set1.begin(), set1.end(),
                          set2.begin(), set2.end(),
                          std::back_inserter(intersect));
    return intersect.size();
}


// k should be greater than alpha*log2(L)
double get_alpha(double r){
    return 1/(2 + log2(1/(1-r)));
}

class Thm3{

    public:
        int k;
        unsigned long L;
        double r;
        
    Thm3(SimParams p){
        this->k = p.k;
        this->L = p.L;
        this->r = p.r;
    }

    Thm3(int k, unsigned long L, double r){
        this->k = k;
        this->L = L;
        this->r = r;
    }


    // Overloading the assignment operator
    Thm3& operator=(const Thm3& other) {
        if (this != &other) {
            k = other.k;
            L = other.L;
            r = other.r;
        }
        return *this;
    }

    double error_term(){
        return (L - 1)*1.0/(pow(4.0*(1-r),k));
    }

    bool areEqual(double a, double b, double epsilon = 1e-9) {
        return fabs(a - b) < epsilon;
    }

    bool constraint_check(){
        //return areEqual(k/log2(L) - 1/(2 + log2(1/(1-r))) ,0);
        return (k/log2(L) > 1/(2 + log2(1/(1-r))));
    }
};

string readString(string filename){
    string randomString;
    std::ifstream file(filename); 
    if (file.is_open()) {
        if (std::getline(file, randomString)) { 
            //std::cout << "Line read from file: " << line << std::endl;
            //L = randomString.length();
        } else {
            std::cerr << "File is empty." << std::endl;
        }
        file.close(); 
    } else {
        std::cerr << "Unable to open file." << std::endl;
    }
    return randomString;
}
string readSubstring(string filename, int L){
    std::ifstream file(filename);
    std::string line;

    if (file.is_open()) {
        file.seekg(0, std::ios::end); // Get the size of the file
        std::streampos fileSize = file.tellg();
        file.seekg(0, std::ios::beg);

        //std::srand(std::time(nullptr)); // Generate a random starting position within the file
        std::streampos startPos = (std::rand() % (int(fileSize) - L));
 
        // Move to the random starting position
        file.seekg(startPos);

        // Read 100 characters from the file
        char buffer[L+1]; // Buffer to hold 100 characters + null terminator
        file.read(buffer, L);
        buffer[L] = '\0'; // Null terminate the string

        // Assign the buffer content to the string
        line = buffer;

        //cout<< "sp" << startPos << " " <<std::rand() <<endl;
        //std::cout << "Random "<< L <<" characters from the file: " << line << std::endl;

        file.close(); // Close the file
    } else {
        std::cerr << "Unable to open file." << std::endl;
    }
    return line;
}

double calculateMean(const std::vector<int>& numbers) {
    double sum = 0.0;
    for (const auto& num : numbers) {
        sum += num;
    }
    return sum / numbers.size();
}

double calculateStandardDeviation(const std::vector<int>& numbers) {
    double mean = calculateMean(numbers);
    double squaredDiffSum = 0.0;
    for (const auto& num : numbers) {
        double diff = num - mean;
        squaredDiffSum += diff * diff;
    }
    double variance = squaredDiffSum / numbers.size();
    return std::sqrt(variance);
}

void hammingDistanceGlobalSet(int K){
    for (int i = 0; i < K+2; ++i) { //i<32 for K=30
        zodd += "01";
    }
    for (int i = 0; i < K+2; ++i) {
        zeven += "10";
    }
    zodd_t = std::stoull(zodd, nullptr, 2);
    zeven_t = std::stoull(zeven, nullptr, 2);
}






//Setup L = 1000
// k = 1 to 800
// expectation: EL and Pr[ZT>something] should have small diff (spk(s) emax)
 
int main (int argc, char* argv[]){

    
    //do for different r
    srand(time(nullptr));

    unsigned long L = 100;
    double alpha = -1;
    int k = -1;
    double r = 0.01;

    bool justGetMin = false;
    string filename="";

    vector<string> args(argv + 1, argv + argc);
    for (auto i = args.begin(); i != args.end(); ++i) {
            if (*i == "-h" || *i == "--help") {
                cout << "Syntax: -i <input-file> -l <length> -r <mutation-rate> -p [getminalpha and exit] -k <kmer-size>" << endl;
                return 0;
            } else if (*i == "-p") {
                justGetMin = true;
            } else if (*i == "-r") {
                r = stod(*++i);
            } else if (*i == "-l") {
                L = stoull(*++i);
            } else if (*i == "-k") {
                k = stoi(*++i);
            } else if (*i == "-i") {
                filename = (*++i);
            } 
    }



    K=k;
    //hammingDistanceGlobalSet(K);
    // SimParams sim(k, L, r); //k, L, r
    //SimParams sim(50, 100, 0.01); //k, L, r
    SimParams sim(k, L, r); //k, L, r

    //SimParams sim(10, 100, 0.01); //k, L, r

    int length = sim.get_n();


    filename="";
    //filename="/Users/amatur/code/downloads/t2t_chr21.sset";
    if (filename!=""){
        randomString = readSubstring(filename, length);
    }
    double mean = 0;
    int num_replicates = 1; //do this random string generation 100 times
    vector<int> values(num_replicates);
    
    for(int i=0; i<num_replicates; i++){
        
        if (filename==""){
            //randomString = generateRandomString(length);
            randomString = generateStressTestString(length, sim.k);
        }
        //else{
        //     randomString = readSubstring(filename, length);
        // }
        
        //std::string mutatedString;
        mutatedString = generateMutatedString(randomString, sim.r);
        precompute_si_tau_hd(sim);
        // int Isize = intersect_size(randomString, mutatedString, int(k));
        // mean+=Isize;
        // values[i] = Isize;
        // if(i<5)
        // cout<<int(k)<< "|I|: "<<intersect_size(randomString, mutatedString, int(k))<<endl;
    }
    mean /= 1.0*num_replicates;

    // std::cout << "Random string of length " << length << ": " << randomString << std::endl;
    // std::cout << "Mutated string of length " << length << ": " << mutatedString << std::endl;

    // cout<<"k"<< int(k)<< " |I|: "<<mean<<endl;
    // cout<<"L(1-q)"<<sim.L*(1-sim.get_q())<<endl;

    double r_prime = 1.0 - pow(2, log2(S_obs)/sim.k-log2(sim.L)/sim.k);
    cout<<"L "<<sim.L<<" r "<<sim.r<< " k "<< sim.k <<" |I| " <<S_obs << " estimate " << estimator << " diff "<< abs(S_obs-estimator)<< " L(1-q) "<<sim.L*(1-sim.get_q())<<" sd "<<calculateStandardDeviation(values)<< " rprime "  << r_prime<<endl;
    //cout<<endl;

}
Leave a Comment