Untitled
unknown
c_cpp
2 years ago
6.7 kB
9
Indexable
#include <cstdlib>
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <regex>
using namespace std;
// Function to execute system command
void executeCommand(const string& command) {
int result = system(command.c_str());
if (result != 0) {
cerr << "Command failed: " << command << endl;
exit(EXIT_FAILURE);
}
}
// Function to parse the sorted BAM file and extract mutations
void detectMutationsFromBAM(const string& bamFile, const string& refFile, const string& outputFile) {
// Implementation will use a BAM file parser like samtools or htslib
// Assume we have a function parseBAM that extracts the alignment information
// Here we use samtools to convert BAM to SAM for easier parsing
string samFile = bamFile + ".sam";
string command = "samtools view " + bamFile + " > " + samFile;
executeCommand(command);
ifstream inFile(samFile);
ifstream refFileStream(refFile);
ofstream outFile(outputFile);
string line;
// Write the header for the CSV file
outFile << "Type,Position,Base\n";
if (!inFile.is_open() || !outFile.is_open() || !refFileStream.is_open()) {
cerr << "Failed to open files." << endl;
exit(EXIT_FAILURE);
}
// Read reference sequence
string refLine, refSeq = "";
while (getline(refFileStream, refLine)) {
if (refLine[0] != '>') {
refSeq += refLine;
}
}
regex cigarRegex("([0-9]+)([MIDNSHP=X])");
smatch matches;
while (getline(inFile, line)) {
if (line[0] == '@') continue; // Skip header lines
istringstream iss(line);
vector<string> tokens((istream_iterator<string>(iss)), istream_iterator<string>());
if (tokens.size() > 9 && tokens[1] != "4") { // Aligned sequence
string seq = tokens[9];
int pos = stoi(tokens[3]) - 1; // Starting position of alignment (0-based)
string cigar = tokens[5];
size_t seqIdx = 0;
while (regex_search(cigar, matches, cigarRegex)) {
int len = stoi(matches[1]);
char type = matches[2].str()[0];
switch (type) {
case 'M': // Match or mismatch (check each base)
for (int i = 0; i < len; ++i) {
if (refSeq[pos] != seq[seqIdx]) {
outFile << "Supstitucija,X," << pos << "," << seq[seqIdx] << "\n";
}
pos++;
seqIdx++;
}
break;
case 'X': // Sequence mismatch explicitly
for (int i = 0; i < len; ++i) {
outFile << "Supstitucija,X," << pos << "," << seq[seqIdx] << "\n";
pos++;
seqIdx++;
}
break;
case '=': // Exact match
pos += len;
seqIdx += len;
break;
case 'I': // Insertion to the reference
outFile << "Umetanje,I," << pos << ",";
for (int i = 0; i < len; ++i) {
outFile << seq[seqIdx++];
}
outFile << "\n";
break;
case 'D': // Deletion from the reference
for (int i = 0; i < len; ++i) {
outFile << "Brisanje,D," << pos++ << ",-\n";
}
break;
case 'N': // Skipped region from the reference
case 'S': // Soft clipping
case 'H': // Hard clipping
case 'P': // Padding
break;
}
cigar = matches.suffix().str();
}
}
}
inFile.close();
outFile.close();
}
int main() {
string reference = "data/lambda.fasta";
string reads = "data/lambda_simulated_reads.fasta";
string samOutput = "data/alignment.sam";
string bamOutput = "data/alignment.bam";
string sortedBam = "data/alignment_sorted.bam";
string csvOutput = "data/mutations.csv";
string vcfOutput = "data/variants.vcf";
string indexCmd = "samtools faidx " + reference; // Command to generate index
string markedBam = "data/alignment_marked.bam";
string metricsFile = "data/mark_duplicates_metrics.txt";
// Step 1: Run Minimap2
string minimapCmd = "minimap2 -ax sr " + reference + " " + reads + " > " + samOutput;
cout << "Running Minimap2: " << minimapCmd << endl;
executeCommand(minimapCmd);
// Step 2: Generate index for the reference genome
cout << "Generating index for the reference genome..." << endl;
executeCommand(indexCmd); // Execute the command to generate index
// Step 3: Convert SAM to BAM
string samToBamCmd = "samtools view -bS " + samOutput + " > " + bamOutput;
cout << "Converting SAM to BAM: " << samToBamCmd << endl;
executeCommand(samToBamCmd);
// Step 4: Sort BAM file
string sortCmd = "samtools sort -o " + sortedBam + " " + bamOutput;
cout << "Sorting BAM file: " << sortCmd << endl;
executeCommand(sortCmd);
// Step 5: Mark duplicates
string markDuplicatesCmd = "picard MarkDuplicates I=" + sortedBam + " O=" + markedBam + " M=" + metricsFile + " REMOVE_DUPLICATES=true";
cout << "Marking duplicates: " << markDuplicatesCmd << endl;
executeCommand(markDuplicatesCmd);
// Step 6: Index marked BAM file
string indexMarkedCmd = "samtools index " + markedBam;
cout << "Indexing marked BAM file: " << indexMarkedCmd << endl;
executeCommand(indexMarkedCmd);
// Step 7: Detect mutations from sorted BAM file
detectMutationsFromBAM(sortedBam, reference, csvOutput);
cout << "Mutation detection completed. Results are stored in " << csvOutput << endl;
// Step 8: Run FreeBayes for evaluation with optimization
string freebayesCmd = "freebayes -f " + reference + " --use-best-n-alleles 4 --min-alternate-count 3 " + markedBam + " > " + vcfOutput;
cout << "Running FreeBayes with optimization: " << freebayesCmd << endl;
executeCommand(freebayesCmd);
cout << "Variant calling completed. Results are stored in " << vcfOutput << endl;
return 0;
}
Editor is loading...
Leave a Comment