Untitled
unknown
c_cpp
a year ago
16 kB
9
Indexable
#include <iostream>
#include <random>
#include <string>
#include<set>
#include<fstream>
using namespace std;
std::string randomString;
std::string mutatedString;
int hd_si_tau[1000][1000];
int K = 30;
string zodd="";
string zeven="";
uint64_t zodd_t, zeven_t;
double estimator;
int S_obs;
class SimParams{
public:
int k;
unsigned long L;
double r;
SimParams(int k, unsigned long L, double r){
this->k = k;
this->L = L;
this->r = r;
}
double get_one_minus_q(){
return pow(1.0-r, k);
//(1 − r)^k
}
double get_q(){
return 1 - pow(1.0-r, k);
//1 − (1 − r)^k
}
double get_p(){
return r/3.0/(1-r);
}
unsigned long get_n(){
return L+k-1;
}
};
std::string encode_kmer(const std::string& kmer) {
std::string encoded_kmer;
for (char base : kmer) {
switch (base) {
case 'A':
encoded_kmer += "00"; // 'A' is encoded as 00
break;
case 'C':
encoded_kmer += "01"; // 'C' is encoded as 01
break;
case 'G':
encoded_kmer += "10"; // 'G' is encoded as 10
break;
case 'T':
encoded_kmer += "11"; // 'T' is encoded as 11
break;
default:
throw std::invalid_argument("Invalid character in k-mer: " + std::string(1, base));
}
}
return encoded_kmer;
}
vector<uint64_t> stringToBinaryVec(string kmer){
int K = kmer.length();
string bv_line = encode_kmer(kmer);
vector<uint64_t> curr_bv(ceil(K/32));
for (int i = 0; i< ceil(K/32); i++){
if(2*K>i*64){
curr_bv[i] = std::stoull(bv_line.substr(64*i,std::min(64, 2*K-i*64)), nullptr, 2);
}
}
return curr_bv;
}
uint64_t stringToBinary(string kmer){
string bv_line = encode_kmer(kmer);
uint64_t curr_bv_lo = std::stoull(bv_line.substr(0,std::min(64, 2*K)), nullptr, 2);
return curr_bv_lo;
}
std::string generateRandomString(int length, unsigned int seed=0) {
const std::string charset = "ACGT"; // Characters to choose from
std::random_device rd;
std::mt19937 gen(rd());
//std::mt19937 gen(seed); to set seed
std::uniform_int_distribution<> dis(0, charset.length() - 1);
std::string randomString;
randomString.reserve(length);
for (int i = 0; i < length; ++i) {
randomString += charset[dis(gen)];
}
return randomString;
}
std::string generateMutatedString(string s, double r=0.1) {
string notT = "ACG";
string notG = "ACT";
string notC = "AGT";
string notA = "CGT";
std::random_device rd;
std::mt19937 gen(rd());
std::string mutatedString;
mutatedString.reserve(s.length());
for (int i = 0; i < s.length(); ++i) {
std::uniform_real_distribution<> dis1(0.0, 1.0);
// std::uniform_real_distribution<> dis2(0.0, 1.0);
// std::uniform_real_distribution<> dis3(0.0, 1.0);
double random_value = dis1(gen);
//cout<<i<<" "<<random_value<<" "<<(1-r)<<endl;
if (random_value < (1-r)) {
mutatedString += s[i]; // Same with probability (1-r) //stay same
} else {
std::uniform_int_distribution<> dis(0, 2); // Range: {0, 1, 2}
if(s[i] == 'A'){
mutatedString += notA[dis(gen)];
}else if(s[i] == 'C'){
mutatedString += notC[dis(gen)];
}else if(s[i] == 'G'){
mutatedString += notG[dis(gen)];
}else if(s[i] == 'T'){
mutatedString += notT[dis(gen)];
}
}
}
return mutatedString;
}
// Function to calculate the probability that a standard normal variable > c
double probability_greater_than(double c) {
// Calculate the CDF of the standard normal distribution at c
double cdf_c = 0.5 * (1 + std::erf(c / std::sqrt(2)));
// The probability that a standard normal variable > c
//cout<<"Z"<<1.0 - cdf_c<<endl;
return 1.0 - cdf_c;
}
set<string> kspectrum(string s, int k){
set<string> kmers;
for(int i=0; i<s.length()-k+1; i++){
string kmer = s.substr(i, k);
kmers.insert(kmer);
}
return kmers;
}
// function to calculate Hamming distance
int hammingDist(string str1, string str2)
{
int i = 0, count = 0;
while (str1[i] != '\0') {
if (str1[i] != str2[i])
count++;
i++;
}
return count;
}
void tau_join(string kmer, vector<string>& retvec){
int k = kmer.length();
for (int delta = 1; delta<k; delta++){
bool isOverlap = true;
for(int j = delta-1; j>=0; j--){
if(kmer[j]!=kmer[k-delta+j]){
isOverlap = false;
break;
}else{
//cout<<delta<<" " << kmer[j]<<kmer[j]<<" ";
}
}
if(isOverlap){
//cout<<delta<<" ";
//cout<<kmer+kmer.substr(delta, k-delta);
retvec.push_back(kmer+kmer.substr(delta, k-delta));
//cout<<endl;
}
// kmer[0] == kmer[k-1]
// kmer[0] = kmer[k-2]
// kmer[1] = kmer[k-1]
// kmer[0] = kmer[k-3]
// kmer[1] = kmer[k-2]
// kmer[2] = kmer[k-1]
}
}
void precompute_si_tau_hd(SimParams& sim){
//do all pair hd
// and do delta
int k=sim.k;
string& s = randomString;
string& t = mutatedString;
set<string> set1 = kspectrum(s, k);
set<string> set2 = kspectrum(t, k);
std::vector<string> intersect;
std::set_intersection(set1.begin(), set1.end(),
set2.begin(), set2.end(),
std::back_inserter(intersect));
// for (int i = 0; i < mu; ++i) {
// for (int j = 0; j < k; ++j) {
// hd_si_tau[i][j] = hammingDist(std::bitset<64>(s_i).to_string(), std::bitset<64>(tau_i).to_string());
// }
// }
std::vector< vector<string> > tau_joins(intersect.size());
for (int i = 0; i <= s.length() - k; ++i) {
string s_i = (s.substr(i, k));
for(int j = 0 ; j < intersect.size(); j++ ){
string tau = intersect[j];
hd_si_tau[i][j] = hammingDist(s_i, tau);
tau_join(tau, tau_joins[j]);
}
}
double p = sim.get_p();
int L = s.length() - k + 1;
int I = intersect.size();
double r = sim.r;
double sum_pr = 0;
for(int j = 0 ; j<I; j++){
double sum_top = 0;
double sum_bottom1 = 0;
double sum_bottom2 = 0;
for(int i = 0; i< L; i++){
double pp=pow(p, hd_si_tau[i][j]);
double pp2 = pow(pp,2);
sum_top -= pp;
sum_bottom1 += pp/sim.get_one_minus_q() - pp2;
//sum_bottom1 += pp/();
// for(int delta = 1; delta<=k; delta++){
// sum_bottom2+= -2*pow(p, hd_si_tau[i][j] + hd_si_tau[i+delta][j] );
// }
for (int t = 0; t< tau_joins[j].size(); t++){
string joinedString = tau_joins[j][t];
int delta = 2*k - joinedString.length();//len = 2k -delta
sum_bottom2+= -2*pow(p, hd_si_tau[i][j] + hd_si_tau[i+delta][j] );
string s_i_joinedStringLen = s.substr(i, joinedString.length() );
int d = hammingDist(s_i_joinedStringLen, joinedString);
sum_bottom2 += 2*pow(p,d)/(pow(1-r, delta));
}
}
// cout<<"sum top "<<sum_top<<endl;
// cout<<"sum b1 "<<sum_bottom1<<endl;
// cout<<"sum b2 "<<sum_bottom2<<endl;
// cout<<"sum_top / sqrt(sum_bottom1 + sum_bottom2 "<<sum_top / sqrt(sum_bottom1 + sum_bottom2)<<endl;
sum_pr += probability_greater_than(sum_top / sqrt(sum_bottom1 + sum_bottom2));
}
estimator = sum_pr;
S_obs = I;
//cout<<"Sum Pr"<<sum_pr<<"I "<<I<<endl;
}
string generateStressTestString(int length, int k){
int type=1;
std::string pattern = "ACGT";
if(type==0){
pattern = "ACGT";
}else if(type==1){
pattern = "AC";
}else if(type==2){
pattern = "A";
}
std::string s = "";
std::string result;
for (int i = 0; i < k; ++i) {
result += pattern[i % pattern.length()];
}
for (int i = 0; i < floor(length/k); ++i) {
s += result;
}
int patlen = s.length();
for (int i = 1; i <= length-patlen; ++i) {
s += "A";
}
//cout<<length<<" " << s.length()<<" "<<"stress test str"<<s<<endl;
return s;
}
unsigned long intersect_size(string s, string t, int k){
set<string> set1 = kspectrum(s, k);
set<string> set2 = kspectrum(t, k);
std::vector<string> intersect;
std::set_intersection(set1.begin(), set1.end(),
set2.begin(), set2.end(),
std::back_inserter(intersect));
return intersect.size();
}
// k should be greater than alpha*log2(L)
double get_alpha(double r){
return 1/(2 + log2(1/(1-r)));
}
class Thm3{
public:
int k;
unsigned long L;
double r;
Thm3(SimParams p){
this->k = p.k;
this->L = p.L;
this->r = p.r;
}
Thm3(int k, unsigned long L, double r){
this->k = k;
this->L = L;
this->r = r;
}
// Overloading the assignment operator
Thm3& operator=(const Thm3& other) {
if (this != &other) {
k = other.k;
L = other.L;
r = other.r;
}
return *this;
}
double error_term(){
return (L - 1)*1.0/(pow(4.0*(1-r),k));
}
bool areEqual(double a, double b, double epsilon = 1e-9) {
return fabs(a - b) < epsilon;
}
bool constraint_check(){
//return areEqual(k/log2(L) - 1/(2 + log2(1/(1-r))) ,0);
return (k/log2(L) > 1/(2 + log2(1/(1-r))));
}
};
string readString(string filename){
string randomString;
std::ifstream file(filename);
if (file.is_open()) {
if (std::getline(file, randomString)) {
//std::cout << "Line read from file: " << line << std::endl;
//L = randomString.length();
} else {
std::cerr << "File is empty." << std::endl;
}
file.close();
} else {
std::cerr << "Unable to open file." << std::endl;
}
return randomString;
}
string readSubstring(string filename, int L){
std::ifstream file(filename);
std::string line;
if (file.is_open()) {
file.seekg(0, std::ios::end); // Get the size of the file
std::streampos fileSize = file.tellg();
file.seekg(0, std::ios::beg);
//std::srand(std::time(nullptr)); // Generate a random starting position within the file
std::streampos startPos = (std::rand() % (int(fileSize) - L));
// Move to the random starting position
file.seekg(startPos);
// Read 100 characters from the file
char buffer[L+1]; // Buffer to hold 100 characters + null terminator
file.read(buffer, L);
buffer[L] = '\0'; // Null terminate the string
// Assign the buffer content to the string
line = buffer;
//cout<< "sp" << startPos << " " <<std::rand() <<endl;
//std::cout << "Random "<< L <<" characters from the file: " << line << std::endl;
file.close(); // Close the file
} else {
std::cerr << "Unable to open file." << std::endl;
}
return line;
}
double calculateMean(const std::vector<int>& numbers) {
double sum = 0.0;
for (const auto& num : numbers) {
sum += num;
}
return sum / numbers.size();
}
double calculateStandardDeviation(const std::vector<int>& numbers) {
double mean = calculateMean(numbers);
double squaredDiffSum = 0.0;
for (const auto& num : numbers) {
double diff = num - mean;
squaredDiffSum += diff * diff;
}
double variance = squaredDiffSum / numbers.size();
return std::sqrt(variance);
}
void hammingDistanceGlobalSet(int K){
for (int i = 0; i < K+2; ++i) { //i<32 for K=30
zodd += "01";
}
for (int i = 0; i < K+2; ++i) {
zeven += "10";
}
zodd_t = std::stoull(zodd, nullptr, 2);
zeven_t = std::stoull(zeven, nullptr, 2);
}
//Setup L = 1000
// k = 1 to 800
// expectation: EL and Pr[ZT>something] should have small diff (spk(s) emax)
int main (int argc, char* argv[]){
//do for different r
srand(time(nullptr));
unsigned long L = 100;
double alpha = -1;
int k = -1;
double r = 0.01;
bool justGetMin = false;
string filename="";
vector<string> args(argv + 1, argv + argc);
for (auto i = args.begin(); i != args.end(); ++i) {
if (*i == "-h" || *i == "--help") {
cout << "Syntax: -i <input-file> -l <length> -r <mutation-rate> -p [getminalpha and exit] -k <kmer-size>" << endl;
return 0;
} else if (*i == "-p") {
justGetMin = true;
} else if (*i == "-r") {
r = stod(*++i);
} else if (*i == "-l") {
L = stoull(*++i);
} else if (*i == "-k") {
k = stoi(*++i);
} else if (*i == "-i") {
filename = (*++i);
}
}
K=k;
//hammingDistanceGlobalSet(K);
// SimParams sim(k, L, r); //k, L, r
//SimParams sim(50, 100, 0.01); //k, L, r
SimParams sim(k, L, r); //k, L, r
//SimParams sim(10, 100, 0.01); //k, L, r
int length = sim.get_n();
filename="";
//filename="/Users/amatur/code/downloads/t2t_chr21.sset";
if (filename!=""){
randomString = readSubstring(filename, length);
}
double mean = 0;
int num_replicates = 1; //do this random string generation 100 times
vector<int> values(num_replicates);
for(int i=0; i<num_replicates; i++){
if (filename==""){
//randomString = generateRandomString(length);
randomString = generateStressTestString(length, sim.k);
}
//else{
// randomString = readSubstring(filename, length);
// }
//std::string mutatedString;
mutatedString = generateMutatedString(randomString, sim.r);
precompute_si_tau_hd(sim);
// int Isize = intersect_size(randomString, mutatedString, int(k));
// mean+=Isize;
// values[i] = Isize;
// if(i<5)
// cout<<int(k)<< "|I|: "<<intersect_size(randomString, mutatedString, int(k))<<endl;
}
mean /= 1.0*num_replicates;
// std::cout << "Random string of length " << length << ": " << randomString << std::endl;
// std::cout << "Mutated string of length " << length << ": " << mutatedString << std::endl;
// cout<<"k"<< int(k)<< " |I|: "<<mean<<endl;
// cout<<"L(1-q)"<<sim.L*(1-sim.get_q())<<endl;
double r_prime = 1.0 - pow(2, log2(S_obs)/sim.k-log2(sim.L)/sim.k);
cout<<"L "<<sim.L<<" r "<<sim.r<< " k "<< sim.k <<" |I| " <<S_obs << " estimate " << estimator << " diff "<< abs(S_obs-estimator)<< " L(1-q) "<<sim.L*(1-sim.get_q())<<" sd "<<calculateStandardDeviation(values)<< " rprime " << r_prime<<endl;
//cout<<endl;
}Editor is loading...
Leave a Comment