thm11
unknown
c_cpp
a year ago
17 kB
9
Indexable
#include <iostream>
#include <random>
#include <string>
#include<set>
#include<fstream>
using namespace std;
//std::string randomString;
int hd_si_tau[1000][1000];
string zodd="";
string zeven="";
uint64_t zodd_t, zeven_t;
string type_map[] = {"random", "stress_random_lenk", "stress_ACGT", "stress_AC", "stress_A"};
//RESULTS
double estimator;
double error_term;
int type = 1;
class SimParams{
public:
int k;
unsigned long L;
double r;
SimParams(int k, unsigned long L, double r){
this->k = k;
this->L = L;
this->r = r;
}
double get_one_minus_q(){
return pow(1.0-r, k);
//(1 − r)^k
}
double get_q(){
return 1 - pow(1.0-r, k);
//1 − (1 − r)^k
}
double get_p(){
return r/3.0/(1-r);
}
unsigned long get_n(){
return L+k-1;
}
};
std::string encode_kmer(const std::string& kmer) {
std::string encoded_kmer;
for (char base : kmer) {
switch (base) {
case 'A':
encoded_kmer += "00"; // 'A' is encoded as 00
break;
case 'C':
encoded_kmer += "01"; // 'C' is encoded as 01
break;
case 'G':
encoded_kmer += "10"; // 'G' is encoded as 10
break;
case 'T':
encoded_kmer += "11"; // 'T' is encoded as 11
break;
default:
throw std::invalid_argument("Invalid character in k-mer: " + std::string(1, base));
}
}
return encoded_kmer;
}
vector<uint64_t> stringToBinaryVec(string kmer){
int K = kmer.length();
string bv_line = encode_kmer(kmer);
vector<uint64_t> curr_bv(ceil(K/32));
for (int i = 0; i< ceil(K/32); i++){
if(2*K>i*64){
curr_bv[i] = std::stoull(bv_line.substr(64*i,std::min(64, 2*K-i*64)), nullptr, 2);
}
}
return curr_bv;
}
uint64_t stringToBinary(string kmer){
string bv_line = encode_kmer(kmer);
int K = kmer.length();
uint64_t curr_bv_lo = std::stoull(bv_line.substr(0,std::min(64, 2*K)), nullptr, 2);
return curr_bv_lo;
}
std::string generateRandomString(int length, unsigned int seed=0) {
const std::string charset = "ACGT"; // Characters to choose from
std::random_device rd;
std::mt19937 gen(rd());
//std::mt19937 gen(seed); to set seed
std::uniform_int_distribution<> dis(0, charset.length() - 1);
std::string randomString;
randomString.reserve(length);
for (int i = 0; i < length; ++i) {
randomString += charset[dis(gen)];
}
return randomString;
}
std::string generateMutatedString(string s, double r=0.1) {
string notT = "ACG";
string notG = "ACT";
string notC = "AGT";
string notA = "CGT";
std::random_device rd;
std::mt19937 gen(rd());
std::string mutatedString;
mutatedString.reserve(s.length());
for (int i = 0; i < s.length(); ++i) {
std::uniform_real_distribution<> dis1(0.0, 1.0);
// std::uniform_real_distribution<> dis2(0.0, 1.0);
// std::uniform_real_distribution<> dis3(0.0, 1.0);
double random_value = dis1(gen);
//cout<<i<<" "<<random_value<<" "<<(1-r)<<endl;
if (random_value < (1-r)) {
mutatedString += s[i]; // Same with probability (1-r) //stay same
} else {
std::uniform_int_distribution<> dis(0, 2); // Range: {0, 1, 2}
if(s[i] == 'A'){
mutatedString += notA[dis(gen)];
}else if(s[i] == 'C'){
mutatedString += notC[dis(gen)];
}else if(s[i] == 'G'){
mutatedString += notG[dis(gen)];
}else if(s[i] == 'T'){
mutatedString += notT[dis(gen)];
}
}
}
return mutatedString;
}
// Function to calculate the probability that a standard normal variable > c
double probability_greater_than(double c) {
double cdf_c = 0.5 * (1 + std::erf(c / std::sqrt(2))); // Calculate the CDF of the standard normal distribution at c
//cout<<"Z"<<1.0 - cdf_c<<endl; // The probability that a standard normal variable > c
return 1.0 - cdf_c;
}
set<string> kspectrum(string s, int k){
set<string> kmers;
for(int i=0; i<s.length()-k+1; i++){
string kmer = s.substr(i, k);
kmers.insert(kmer);
}
return kmers;
}
// function to calculate Hamming distance
int hammingDist(string str1, string str2)
{
int i = 0, count = 0;
while (str1[i] != '\0') {
if (str1[i] != str2[i])
count++;
i++;
}
return count;
}
void tau_join(string kmer, vector<string>& retvec){
int k = kmer.length();
for (int delta = 1; delta<=k; delta++){ // <= k
bool isOverlap = true;
for(int j = delta-1; j>=0; j--){
if(kmer[j]!=kmer[k-delta+j]){
isOverlap = false;
break;
}
}
if(isOverlap){
if(delta==k){
retvec.push_back(kmer);
}else{
retvec.push_back(kmer+kmer.substr(delta, k-delta));
}
}
// kmer[0] == kmer[k-1]
// kmer[0] = kmer[k-2]
// kmer[1] = kmer[k-1]
// kmer[0] = kmer[k-3]
// kmer[1] = kmer[k-2]
// kmer[2] = kmer[k-1]
}
}
void precompute_si_tau_hd_result_thm11(SimParams& sim, string& s){
int k = sim.k;
set<string> set1 = kspectrum(s, k);
std::vector<string> s_vector(set1.begin(), set1.end());
std::vector< vector<string> > tau_joins(s_vector.size());
for (int i = 0; i <= s.length() - k; ++i) {
string s_i = (s.substr(i, k));
for(int j = 0 ; j < s_vector.size(); j++ ){
string tau = s_vector[j]; //FiX
hd_si_tau[i][j] = hammingDist(s_i, tau);
tau_join(tau, tau_joins[j]);
}
}
double p = sim.get_p();
int L = s.length() - k + 1;
double r = sim.r;
double sum_pr = 0;
double sum_e_max_tau = 0;
for(int j = 0 ; j<s_vector.size(); j++){ // not on I, length of |sp(s)|
double sum_top = 0;
double sum_bottom1 = 0;
double sum_bottom2 = 0;
double sum_bottom2_p1 = 0;
double sum_bottom2_p2 = 0;
for(int i = 0; i< L; i++){
double pp=pow(p, hd_si_tau[i][j]);
double pp2 = pow(pp,2);
sum_top -= pp;
sum_bottom1 += pp/sim.get_one_minus_q() - pp2;
//sum_bottom1 += pp/();
for(int delta = 1; delta<=k; delta++){
if(i+delta>=L){
break;
}
sum_bottom2_p1+= -2*pow(p, hd_si_tau[i][j] + hd_si_tau[i+delta][j] );
//cout<<sum_bottom2_p1<<" sp "<< " delta " << delta << " k "<<k<< " "<< hd_si_tau[i][j] << " "<< hd_si_tau[i+delta][j] << " " << i+delta<<endl;
}
for (int t = 0; t< tau_joins[j].size(); t++){
string joinedString = tau_joins[j][t];
int delta = 2*k - joinedString.length();//len = 2k -delta
//sum_bottom2+= -2*pow(p, hd_si_tau[i][j] + hd_si_tau[i+delta][j] );
string s_i_len_same_as_joinedString = s.substr(i, joinedString.length() );
if(s_i_len_same_as_joinedString.length()==joinedString.length() ){
int d = hammingDist(s_i_len_same_as_joinedString, joinedString);
sum_bottom2_p2 += 2*pow(p,d)/(pow(1-r, delta));
}
}
}
sum_bottom2= sum_bottom2_p1 + sum_bottom2_p2;
// cout<<"sum top "<<sum_top<<endl;
// cout<<"sum b1 "<<sum_bottom1<<endl;
// cout<<"sum b2 p1 "<<sum_bottom2_p1<<endl;
// cout<<"sum b2 p2 "<<sum_bottom2_p2<<endl;
// cout<<"sum b2 "<<sum_bottom2<<endl;
// cout<<"sum_top / sqrt(sum_bottom1 + sum_bottom2 "<<sum_top / sqrt(sum_bottom1 + sum_bottom2)<<endl;
if (std::isinf(sum_bottom2_p1) && sum_bottom2_p1 < 0) {
cout<<"Error "<<j<<" "<< s_vector[j]<<endl;
exit(1);
}
double sigma_tau = sqrt(sum_bottom1 + sum_bottom2);
const double pi = M_PI; // Value of pi
double term1 = pow(2.0 / pi, 1.0 / 4.0);
double term2 = pow(2 * k, 2.0) * L / pow(sigma_tau, 3.0);
double term3 = sqrt(28.0) * pow(2 * k, 1.5) * sqrt(L) / (sqrt(pi) * pow(sigma_tau, 2.0));
double e_max_tau = term1 * sqrt(term2 + term3);
//cout<<term1<<" "<<term2<<" "<<term3<<" "<<e_max_tau<<endl;
sum_e_max_tau += e_max_tau;
sum_pr += probability_greater_than(sum_top / sqrt(sum_bottom1 + sum_bottom2));
}
estimator = sum_pr;
error_term = sum_e_max_tau;
//cout<<"Sum Pr"<<sum_pr<<"I "<<I<<endl;
}
string generateStressTestString(int length, int k){
std::string pattern = "ACGT";
if(type == 1){
pattern = generateRandomString(k);
}else if(type == 2){
pattern = "ACGT";
}else if(type == 3){
pattern = "AC";
}else if(type == 4){
pattern = "A";
}
std::string s = "";
std::string result;
for (int i = 0; i < k; ++i) {
result += pattern[i % pattern.length()];
}
for (int i = 0; i < floor(length/k); ++i) {
s += result;
}
int patlen = s.length();
for (int i = 1; i <= length-patlen; ++i) {
s += "A";
}
//cout<<length<<" " << s.length()<<" "<<"stress test str"<<s<<endl;
return s;
}
int intersect_size(string s, string t, int k){
set<string> set1 = kspectrum(s, k);
set<string> set2 = kspectrum(t, k);
std::vector<string> intersect;
std::set_intersection(set1.begin(), set1.end(),
set2.begin(), set2.end(),
std::back_inserter(intersect));
return intersect.size();
}
string readString(string filename){
string randomString;
std::ifstream file(filename);
if (file.is_open()) {
if (std::getline(file, randomString)) {
//std::cout << "Line read from file: " << line << std::endl;
//L = randomString.length();
} else {
std::cerr << "File is empty." << std::endl;
}
file.close();
} else {
std::cerr << "Unable to open file." << std::endl;
}
return randomString;
}
string readSubstring(string filename, int L){
std::ifstream file(filename);
std::string line;
if (file.is_open()) {
file.seekg(0, std::ios::end); // Get the size of the file
std::streampos fileSize = file.tellg();
file.seekg(0, std::ios::beg);
//std::srand(std::time(nullptr)); // Generate a random starting position within the file
std::streampos startPos = (std::rand() % (int(fileSize) - L));
// Move to the random starting position
file.seekg(startPos);
// Read 100 characters from the file
char buffer[L+1]; // Buffer to hold 100 characters + null terminator
file.read(buffer, L);
buffer[L] = '\0'; // Null terminate the string
// Assign the buffer content to the string
line = buffer;
//cout<< "sp" << startPos << " " <<std::rand() <<endl;
//std::cout << "Random "<< L <<" characters from the file: " << line << std::endl;
file.close(); // Close the file
} else {
std::cerr << "Unable to open file." << std::endl;
}
return line;
}
// double calculateMean(const std::vector<int>& numbers) {
// double sum = 0.0;
// for (const auto& num : numbers) {
// sum += num;
// }
// return sum / numbers.size();
// }
// double calculateStandardDeviation(const std::vector<int>& numbers) {
// double mean = calculateMean(numbers);
// double squaredDiffSum = 0.0;
// for (const auto& num : numbers) {
// double diff = num - mean;
// squaredDiffSum += diff * diff;
// }
// double variance = squaredDiffSum / numbers.size();
// return std::sqrt(variance);
// }
void hammingDistanceGlobalSet(int K){
for (int i = 0; i < K+2; ++i) { //i<32 for K=30
zodd += "01";
}
for (int i = 0; i < K+2; ++i) {
zeven += "10";
}
zodd_t = std::stoull(zodd, nullptr, 2);
zeven_t = std::stoull(zeven, nullptr, 2);
}
class ResultAggregator{
public:
vector<int> isizes;
double isize_mean;
double isize_sd;
double calculateMean(const std::vector<int>& numbers) {
double sum = 0.0;
for (const auto& num : numbers) {
sum += num;
}
return sum / numbers.size();
}
double calculateMean(const std::vector<double>& numbers) {
double sum = 0.0;
for (const auto& num : numbers) {
sum += num;
}
return sum / numbers.size();
}
double calculateStandardDeviation(const std::vector<int>& numbers) {
double mean = calculateMean(numbers);
double squaredDiffSum = 0.0;
for (const auto& num : numbers) {
double diff = num - mean;
squaredDiffSum += diff * diff;
}
double variance = squaredDiffSum / numbers.size();
return std::sqrt(variance);
}
double calculateStandardDeviation(const std::vector<double>& numbers) {
double mean = calculateMean(numbers);
double squaredDiffSum = 0.0;
for (const auto& num : numbers) {
double diff = num - mean;
squaredDiffSum += diff * diff;
}
double variance = squaredDiffSum / numbers.size();
return std::sqrt(variance);
}
void calculateMeanSD(const std::vector<int>& numbers, double& mean, double& sd) {
mean = calculateMean(numbers);
double squaredDiffSum = 0.0;
for (const auto& num : numbers) {
double diff = num - mean;
squaredDiffSum += diff * diff;
}
double variance = squaredDiffSum / numbers.size();
sd = std::sqrt(variance);
}
void calculateMeanSD(const std::vector<double>& numbers, double& mean, double& sd) {
mean = calculateMean(numbers);
double squaredDiffSum = 0.0;
for (const auto& num : numbers) {
double diff = num - mean;
squaredDiffSum += diff * diff;
}
double variance = squaredDiffSum / numbers.size();
sd = std::sqrt(variance);
}
};
int main (int argc, char* argv[]){
//do for different r
srand(time(nullptr));
int L = -1;
int k = -1;
double r = -1;
string filename="";
int num_replicates = 20; //do this random string generation 100 times
vector<string> args(argv + 1, argv + argc);
for (auto i = args.begin(); i != args.end(); ++i) {
if (*i == "-h" || *i == "--help") {
cout << "Syntax: -i [input-file] -l <length> -r <mutation-rate> -t <type-of-input-string> -k <kmer-size> -c <num-replicates>" << endl;
return 0;
} else if (*i == "-r") {
r = stod(*++i);
} else if (*i == "-l") {
L = stoull(*++i);
} else if (*i == "-k") {
k = stoi(*++i);
} else if (*i == "-t") {
type = stoi(*++i);
}else if (*i == "-i") {
filename = (*++i);
} else if (*i == "-c") {
num_replicates = stoi(*++i);
}
}
//hammingDistanceGlobalSet(K);
//SimParams sim(5, 100, 0.01); //k, L, r
SimParams sim(k, L, r); //k, L, r
ResultAggregator res;
//do a single string
std::string randomString;
filename=""; //filename="/Users/amatur/code/downloads/t2t_chr21.sset";
int length = sim.get_n();
if (filename==""){
if(type == 0){
randomString = generateRandomString(length);
}else{
randomString = generateStressTestString(length, sim.k);
}
}else{
randomString = readSubstring(filename, length);
}
precompute_si_tau_hd_result_thm11(sim, randomString);
for(int i=0; i<num_replicates; i++){
std::string mutatedString = generateMutatedString(randomString, sim.r);
int Isize = intersect_size(randomString, mutatedString, sim.k);
res.isizes.push_back(Isize);
}
res.calculateMeanSD(res.isizes, res.isize_mean, res.isize_sd);
double r_prime = 1.0 - pow(2, log2( res.isize_mean)/sim.k-log2(sim.L)/sim.k);
cout<<"num_replicates "<<num_replicates<< " input_s "<< type_map[type] << " L "<<sim.L<<" r "<<sim.r<< " k "<< sim.k << " estimate " << estimator << " |I|_mean " << res.isize_mean << " |I|_sd " << res.isize_sd << " L(1-q) "<<sim.L*(1-sim.get_q())<< " r_prime " << r_prime << " theoretical_err " << error_term << " observed_err "<< abs( res.isize_mean-estimator)<< endl;
//cout<<endl;
}Editor is loading...
Leave a Comment