Untitled
unknown
c_cpp
2 years ago
8.1 kB
18
Indexable
#include <iostream>
#include <string>
#include <cmath>
#include <algorithm>
#include <iostream>
#include<sstream>
#include <fstream>
#include <vector>
#include <bitset>
using namespace std;
//vector<string> kmers;
vector<int> counts;
vector<uint64_t> kmer_binary_values;
int K = 30;
string zodd="";
string zeven="";
uint64_t zodd_t, zeven_t;
/// @brief murmurhash3 64-bit finalizer
/// @param key
/// @return
inline uint64_t hash64(uint64_t key )
{
int k = K;
uint64_t mask = (1ULL<<k*2) - 1;
key = (~key + (key << 21)) & mask; // key = (key << 21) - key - 1;
key = key ^ key >> 24;
key = ((key + (key << 3)) + (key << 8)) & mask; // key * 265
key = key ^ key >> 14;
key = ((key + (key << 2)) + (key << 4)) & mask; // key * 21
key = key ^ key >> 28;
key = (key + (key << 31)) & mask;
return key;
}
/*
/// @brief returns 0 to 1 real value for a k-mer
/// @param kmer
/// @return
double kmer_to_real_value(const std::string& kmer) {
int k = kmer.length();
double binary_value = 0;
for (int i = 0; i < k; ++i) {
switch (kmer[i]) {
case 'A':
binary_value = binary_value * 4; // 'A' is represented as 00 in binary
break;
case 'C':
binary_value = binary_value * 4 + 1; // 'C' is represented as 01 in binary
break;
case 'G':
binary_value = binary_value * 4 + 2; // 'G' is represented as 10 in binary
break;
case 'T':
binary_value = binary_value * 4 + 3; // 'T' is represented as 11 in binary
break;
default:
throw std::invalid_argument("Invalid character in k-mer: " + kmer[i]);
}
}
// Normalize binary_value to range [0, 1]
//K = k;
double real_value = hash64(binary_value) / std::pow(4, k);
return real_value;
}*/
std::string encode_kmer(const std::string& kmer) {
std::string encoded_kmer;
for (char base : kmer) {
switch (base) {
case 'A':
encoded_kmer += "00"; // 'A' is encoded as 00
break;
case 'C':
encoded_kmer += "01"; // 'C' is encoded as 01
break;
case 'G':
encoded_kmer += "10"; // 'G' is encoded as 10
break;
case 'T':
encoded_kmer += "11"; // 'T' is encoded as 11
break;
default:
throw std::invalid_argument("Invalid character in k-mer: " + std::string(1, base));
}
}
return encoded_kmer;
}
uint64_t stringToBinary(string kmer){
string bv_line = encode_kmer(kmer);
//int K = 30;
uint64_t curr_bv_lo = std::stoull(bv_line.substr(0,std::min(64, 2*K)), nullptr, 2);
// uint64_t curr_bv_hi = 0;
// if(K >= 64){
// curr_bv_hi = std::stoull(bv_line.substr(64,bv_line.length()-64), nullptr, 2);
// }
//int hd = hammingDistance(prev_bv_hi, curr_bv_hi);
cout<<bv_line<<" "<<curr_bv_lo<<endl;
return curr_bv_lo;
}
double kmer_to_real_value(const std::string& kmer) {
return hash64(stringToBinary(kmer))*1.0/std::pow(4, K);
}
double kmer_binary_to_real_value(uint64_t kmer_binary) {
return hash64(kmer_binary)*1.0/std::pow(4, K);
}
//write a function that reads all lines from a file and put in a vector of strings
void read_kmers(string file_name){
ifstream ifs(file_name);
string line;
// vector<string> lines;
while (getline(ifs, line))
{
stringstream ss(line);
string kmer;
int count;
if (ss >> kmer >> count) {
if(kmer_binary_values.empty()){
K = kmer.length();
}
//kmers.push_back(kmer);
kmer_binary_values.push_back(stringToBinary(kmer));
counts.push_back(count);
}
}
ifs.close();
}
// function to calculate Hamming distance
int hammingDist(string str1, string str2)
{
int i = 0, count = 0;
while (str1[i] != '\0') {
if (str1[i] != str2[i])
count++;
i++;
}
return count;
}
void getSketch(double threshold, vector<uint64_t>& sketch_kmers){//threshold is the minimum value of the sketch to be considered
//vector<string> sketch_kmers;
for(int i=0; i< kmer_binary_values.size(); i++){
uint64_t kmer = kmer_binary_values[i];
if(kmer_binary_to_real_value(kmer) <= threshold){
//cout << kmer << " "<< kmer_to_real_value(kmer)<< endl;
//sketch_kmers.push_back(kmer);
sketch_kmers.push_back(kmer_binary_values[i]);
}
}
cout<< "Number of original kmers: " << kmer_binary_values.size() << endl;
cout<< "Number of sketch kmers: " << sketch_kmers.size() << endl;
}
int hammingDistance (uint64_t x, uint64_t y) {
// let zeven = (x with every even position zeroed out) XOR (y with every
// even position zeroed out)
// let zodd be similarly defined
// HD(x,y) = popcount(zeven OR zodd)
// std::bitset<64> xo(x&zodd_t); // Assuming a 64-bit binary representation
// std::bitset<64> yo(y&zodd_t); // Assuming a 64-bit binary representation
// std::bitset<64> xb(x); // Assuming a 64-bit binary representation
// std::bitset<64> yb(y); // Assuming a 64-bit binary representation
// //cout<<xb.to_string()<<" "<<yb.to_string()<<endl;
uint64_t oddt = (x&zodd_t)^(y&zodd_t);
uint64_t event = (x&zeven_t)^(y&zeven_t);
// std::bitset<64> oddtt(oddt); // Assuming a 64-bit binary representation
// cout<<xb.to_string()<<" "<<"x"<<endl;
// cout<<yb.to_string()<<" "<<"y"<<endl;
// cout<<xo.to_string()<<" "<<"x&zo"<<endl;
// cout<<yo.to_string()<<" "<<"y&zo"<<endl;
// cout<<oddtt.to_string()<<" "<<"oddxor"<<endl;
// cout<<oddtt.to_string()<<" "<<"oddxor"<<endl;
// 0101010101010101010101010101010101010101010101010101010101010101
// uint64_t res = x ^ y;
// return __builtin_popcountll (res) ;
return __builtin_popcountll (oddt | (event>>1)) ;
}
// Let x and y be the 2-bit encoding of two kmers.
// let zeven = (x with every even position zeroed out) XOR (y with every
// even position zeroed out)
// let zodd be similarly defined
// HD(x,y) = popcount(zeven OR zodd)
inline unsigned int num_pair(int n){
return (n*n - n)/2;
}
void getHDHistogram(double threshold){
vector<uint64_t> kmers_binary;
getSketch(threshold, kmers_binary);
vector<int> hd_histogram(K+1, 0);
for (int i = 0; i < kmers_binary.size(); i++){
if(counts[i] > 1){
hd_histogram[0] += num_pair(counts[i]);
}
for (int j = i+1; j < kmers_binary.size(); j++){
//int hd = hammingDistance(stringToBinary(kmers[i]), stringToBinary(kmers[j]));
//int hd = hammingDist(kmers[i], kmers[j]);
int hd = hammingDistance(kmers_binary[i], kmers_binary[j]);
//hd_histogram[hd] += 1;
hd_histogram[hd] += counts[i]*counts[j];
}
}
for (int i = 0; i < hd_histogram.size(); i++){
cout << i << " " << hd_histogram[i] << endl;
}
}
int main(int argc, char *argv[]) {
for (int i = 0; i < 32; ++i) {
zodd += "01";
}
for (int i = 0; i < 32; ++i) {
zeven += "10";
}
zodd_t = std::stoull(zodd, nullptr, 2);
zeven_t = std::stoull(zeven, nullptr, 2);
// cout<<kmer_to_real_value("ATTTTTAAAAAAAATATATATGGATATATA");
// exit(1);
// string kmer1="CTTTTTAAAAAAAATATATATGGATATATA";
// string kmer2="CTTTTTGAAAAAAATATATATGGATATAAT";
// cout<<hammingDistance(stringToBinary(kmer1), stringToBinary(kmer2));
// exit(1);
read_kmers("kmers.txt"); // K IS SET
double value = std::stod(argv[1]);
getHDHistogram(value);
//std::string kmer = "AAAAAAAAATAAAAAAAAAA";
//double real_value = kmer_to_real_value(kmer);
//std::cout << "Real value for k-mer " << kmer << ": " << real_value << std::endl;
return 0;
}Editor is loading...
Leave a Comment