Untitled

mail@pastecode.io avatar
unknown
c_cpp
a month ago
6.7 kB
4
Indexable
Never
#include <cstdlib>
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <regex>

using namespace std;

// Function to execute system command
void executeCommand(const string& command) {
    int result = system(command.c_str());
    if (result != 0) {
        cerr << "Command failed: " << command << endl;
        exit(EXIT_FAILURE);
    }
}

// Function to parse the sorted BAM file and extract mutations
void detectMutationsFromBAM(const string& bamFile, const string& refFile, const string& outputFile) {
    // Implementation will use a BAM file parser like samtools or htslib
    // Assume we have a function parseBAM that extracts the alignment information
    // Here we use samtools to convert BAM to SAM for easier parsing
    string samFile = bamFile + ".sam";
    string command = "samtools view " + bamFile + " > " + samFile;
    executeCommand(command);

    ifstream inFile(samFile);
    ifstream refFileStream(refFile);
    ofstream outFile(outputFile);
    string line;

    // Write the header for the CSV file
    outFile << "Type,Position,Base\n";

    if (!inFile.is_open() || !outFile.is_open() || !refFileStream.is_open()) {
        cerr << "Failed to open files." << endl;
        exit(EXIT_FAILURE);
    }

    // Read reference sequence
    string refLine, refSeq = "";
    while (getline(refFileStream, refLine)) {
        if (refLine[0] != '>') {
            refSeq += refLine;
        }
    }

    regex cigarRegex("([0-9]+)([MIDNSHP=X])");
    smatch matches;

    while (getline(inFile, line)) {
        if (line[0] == '@') continue; // Skip header lines

        istringstream iss(line);
        vector<string> tokens((istream_iterator<string>(iss)), istream_iterator<string>());

        if (tokens.size() > 9 && tokens[1] != "4") { // Aligned sequence
            string seq = tokens[9];
            int pos = stoi(tokens[3]) - 1; // Starting position of alignment (0-based)
            string cigar = tokens[5];
            size_t seqIdx = 0;

            while (regex_search(cigar, matches, cigarRegex)) {
                int len = stoi(matches[1]);
                char type = matches[2].str()[0];

                switch (type) {
                    case 'M': // Match or mismatch (check each base)
                        for (int i = 0; i < len; ++i) {
                            if (refSeq[pos] != seq[seqIdx]) {
                                outFile << "Supstitucija,X," << pos << "," << seq[seqIdx] << "\n";
                            }
                            pos++;
                            seqIdx++;
                        }
                        break;
                    case 'X': // Sequence mismatch explicitly
                        for (int i = 0; i < len; ++i) {
                            outFile << "Supstitucija,X," << pos << "," << seq[seqIdx] << "\n";
                            pos++;
                            seqIdx++;
                        }
                        break;
                    case '=': // Exact match
                        pos += len;
                        seqIdx += len;
                        break;
                    case 'I': // Insertion to the reference
                        outFile << "Umetanje,I," << pos << ",";
                        for (int i = 0; i < len; ++i) {
                            outFile << seq[seqIdx++];
                        }
                        outFile << "\n";
                        break;
                    case 'D': // Deletion from the reference
                        for (int i = 0; i < len; ++i) {
                            outFile << "Brisanje,D," << pos++ << ",-\n";
                        }
                        break;
                    case 'N': // Skipped region from the reference
                    case 'S': // Soft clipping
                    case 'H': // Hard clipping
                    case 'P': // Padding
                        break;
                }
                cigar = matches.suffix().str();
            }
        }
    }

    inFile.close();
    outFile.close();
}

int main() {
    string reference = "data/lambda.fasta";
    string reads = "data/lambda_simulated_reads.fasta";
    string samOutput = "data/alignment.sam";
    string bamOutput = "data/alignment.bam";
    string sortedBam = "data/alignment_sorted.bam";
    string csvOutput = "data/mutations.csv";
    string vcfOutput = "data/variants.vcf";
    string indexCmd = "samtools faidx " + reference; // Command to generate index
    string markedBam = "data/alignment_marked.bam"; 
    string metricsFile = "data/mark_duplicates_metrics.txt"; 

    // Step 1: Run Minimap2
    string minimapCmd = "minimap2 -ax sr " + reference + " " + reads + " > " + samOutput;
    cout << "Running Minimap2: " << minimapCmd << endl;
    executeCommand(minimapCmd);

    // Step 2: Generate index for the reference genome
    cout << "Generating index for the reference genome..." << endl;
    executeCommand(indexCmd); // Execute the command to generate index

    // Step 3: Convert SAM to BAM
    string samToBamCmd = "samtools view -bS " + samOutput + " > " + bamOutput;
    cout << "Converting SAM to BAM: " << samToBamCmd << endl;
    executeCommand(samToBamCmd);

    // Step 4: Sort BAM file
    string sortCmd = "samtools sort -o " + sortedBam + " " + bamOutput;
    cout << "Sorting BAM file: " << sortCmd << endl;
    executeCommand(sortCmd);

    // Step 5: Mark duplicates
    string markDuplicatesCmd = "picard MarkDuplicates I=" + sortedBam + " O=" + markedBam + " M=" + metricsFile + " REMOVE_DUPLICATES=true";
    cout << "Marking duplicates: " << markDuplicatesCmd << endl;
    executeCommand(markDuplicatesCmd);

    // Step 6: Index marked BAM file
    string indexMarkedCmd = "samtools index " + markedBam;
    cout << "Indexing marked BAM file: " << indexMarkedCmd << endl;
    executeCommand(indexMarkedCmd);

    // Step 7: Detect mutations from sorted BAM file
    detectMutationsFromBAM(sortedBam, reference, csvOutput);

    cout << "Mutation detection completed. Results are stored in " << csvOutput << endl;

    // Step 8: Run FreeBayes for evaluation with optimization
    string freebayesCmd = "freebayes -f " + reference + " --use-best-n-alleles 4 --min-alternate-count 3 " + markedBam + " > " + vcfOutput;
    cout << "Running FreeBayes with optimization: " << freebayesCmd << endl;
    executeCommand(freebayesCmd);

    cout << "Variant calling completed. Results are stored in " << vcfOutput << endl;

    return 0;
}
Leave a Comment