Untitled
unknown
c_cpp
a year ago
6.7 kB
7
Indexable
#include <cstdlib> #include <iostream> #include <fstream> #include <sstream> #include <string> #include <vector> #include <regex> using namespace std; // Function to execute system command void executeCommand(const string& command) { int result = system(command.c_str()); if (result != 0) { cerr << "Command failed: " << command << endl; exit(EXIT_FAILURE); } } // Function to parse the sorted BAM file and extract mutations void detectMutationsFromBAM(const string& bamFile, const string& refFile, const string& outputFile) { // Implementation will use a BAM file parser like samtools or htslib // Assume we have a function parseBAM that extracts the alignment information // Here we use samtools to convert BAM to SAM for easier parsing string samFile = bamFile + ".sam"; string command = "samtools view " + bamFile + " > " + samFile; executeCommand(command); ifstream inFile(samFile); ifstream refFileStream(refFile); ofstream outFile(outputFile); string line; // Write the header for the CSV file outFile << "Type,Position,Base\n"; if (!inFile.is_open() || !outFile.is_open() || !refFileStream.is_open()) { cerr << "Failed to open files." << endl; exit(EXIT_FAILURE); } // Read reference sequence string refLine, refSeq = ""; while (getline(refFileStream, refLine)) { if (refLine[0] != '>') { refSeq += refLine; } } regex cigarRegex("([0-9]+)([MIDNSHP=X])"); smatch matches; while (getline(inFile, line)) { if (line[0] == '@') continue; // Skip header lines istringstream iss(line); vector<string> tokens((istream_iterator<string>(iss)), istream_iterator<string>()); if (tokens.size() > 9 && tokens[1] != "4") { // Aligned sequence string seq = tokens[9]; int pos = stoi(tokens[3]) - 1; // Starting position of alignment (0-based) string cigar = tokens[5]; size_t seqIdx = 0; while (regex_search(cigar, matches, cigarRegex)) { int len = stoi(matches[1]); char type = matches[2].str()[0]; switch (type) { case 'M': // Match or mismatch (check each base) for (int i = 0; i < len; ++i) { if (refSeq[pos] != seq[seqIdx]) { outFile << "Supstitucija,X," << pos << "," << seq[seqIdx] << "\n"; } pos++; seqIdx++; } break; case 'X': // Sequence mismatch explicitly for (int i = 0; i < len; ++i) { outFile << "Supstitucija,X," << pos << "," << seq[seqIdx] << "\n"; pos++; seqIdx++; } break; case '=': // Exact match pos += len; seqIdx += len; break; case 'I': // Insertion to the reference outFile << "Umetanje,I," << pos << ","; for (int i = 0; i < len; ++i) { outFile << seq[seqIdx++]; } outFile << "\n"; break; case 'D': // Deletion from the reference for (int i = 0; i < len; ++i) { outFile << "Brisanje,D," << pos++ << ",-\n"; } break; case 'N': // Skipped region from the reference case 'S': // Soft clipping case 'H': // Hard clipping case 'P': // Padding break; } cigar = matches.suffix().str(); } } } inFile.close(); outFile.close(); } int main() { string reference = "data/lambda.fasta"; string reads = "data/lambda_simulated_reads.fasta"; string samOutput = "data/alignment.sam"; string bamOutput = "data/alignment.bam"; string sortedBam = "data/alignment_sorted.bam"; string csvOutput = "data/mutations.csv"; string vcfOutput = "data/variants.vcf"; string indexCmd = "samtools faidx " + reference; // Command to generate index string markedBam = "data/alignment_marked.bam"; string metricsFile = "data/mark_duplicates_metrics.txt"; // Step 1: Run Minimap2 string minimapCmd = "minimap2 -ax sr " + reference + " " + reads + " > " + samOutput; cout << "Running Minimap2: " << minimapCmd << endl; executeCommand(minimapCmd); // Step 2: Generate index for the reference genome cout << "Generating index for the reference genome..." << endl; executeCommand(indexCmd); // Execute the command to generate index // Step 3: Convert SAM to BAM string samToBamCmd = "samtools view -bS " + samOutput + " > " + bamOutput; cout << "Converting SAM to BAM: " << samToBamCmd << endl; executeCommand(samToBamCmd); // Step 4: Sort BAM file string sortCmd = "samtools sort -o " + sortedBam + " " + bamOutput; cout << "Sorting BAM file: " << sortCmd << endl; executeCommand(sortCmd); // Step 5: Mark duplicates string markDuplicatesCmd = "picard MarkDuplicates I=" + sortedBam + " O=" + markedBam + " M=" + metricsFile + " REMOVE_DUPLICATES=true"; cout << "Marking duplicates: " << markDuplicatesCmd << endl; executeCommand(markDuplicatesCmd); // Step 6: Index marked BAM file string indexMarkedCmd = "samtools index " + markedBam; cout << "Indexing marked BAM file: " << indexMarkedCmd << endl; executeCommand(indexMarkedCmd); // Step 7: Detect mutations from sorted BAM file detectMutationsFromBAM(sortedBam, reference, csvOutput); cout << "Mutation detection completed. Results are stored in " << csvOutput << endl; // Step 8: Run FreeBayes for evaluation with optimization string freebayesCmd = "freebayes -f " + reference + " --use-best-n-alleles 4 --min-alternate-count 3 " + markedBam + " > " + vcfOutput; cout << "Running FreeBayes with optimization: " << freebayesCmd << endl; executeCommand(freebayesCmd); cout << "Variant calling completed. Results are stored in " << vcfOutput << endl; return 0; }
Editor is loading...
Leave a Comment