mbi_l4
unknown
plain_text
2 years ago
5.5 kB
9
Indexable
mkdir ~/mbi_cwiczenie_4/
cd mbi_cwiczenie_4/
mkdir data
cd data
wget https://github.com/NGSchoolEU/2017/raw/master/CNV_detection/data/DGV.tar.gz
wget https://github.com/NGSchoolEU/2017/raw/master/CNV_detection/data/TGP_SV.tar.gz
wget https://github.com/NGSchoolEU/2017/raw/master/CNV_detection/data/bed.tar.gz
wget https://github.com/NGSchoolEU/2017/raw/master/CNV_detection/data/codex_output_all.tar.gz
wget https://github.com/NGSchoolEU/2017/raw/master/CNV_detection/data/coverage.tar.gz
wget https://github.com/NGSchoolEU/2017/raw/master/CNV_detection/data/refFlat.tar.gz
wget https://github.com/NGSchoolEU/2017/raw/master/CNV_detection/data/segDups.tar.gz
cd ~/mbi_cwiczenie_4/
sudo docker run --rm -it -v ~/mbi_cwiczenie_4:/mbi -w /mbi wkusmirek/mbi-lab-4 R
library(data.table)
library(parallel)
library(RCurl)
library(gdata)
library(matrixStats)
library(DNAcopy)
library(GenomicRanges)
library(Rsubread)
library(WES.1KG.WUGSC)
library(CODEX)
# set working directory to workDir
workDir <- "/mbi/"
setwd(workDir)
#set number of available cores
cores <- 4
dirPath <- system.file("extdata", package = "WES.1KG.WUGSC")
bamFile <- list.files(dirPath, pattern = '*.bam$')
bamdir <- file.path(dirPath, bamFile)
sampname <- as.matrix(read.table(file.path(dirPath, "sampname")))
bedFile <- file.path(dirPath, "chr22_400_to_500.bed")
chr <- 22
bambedObj <- getbambed(bamdir = bamdir, bedFile = bedFile,
sampname = sampname, projectname = "CODEX_demo", chr)
bamdir <- bambedObj$bamdir; sampname <- bambedObj$sampname
ref <- bambedObj$ref; projectname <- bambedObj$projectname; chr <- bambedObj$chr
coverageObj <- getcoverage(bambedObj, mapqthres = 20)
Y <- coverageObj$Y; readlength <- coverageObj$readlength
Y_ac <- apply(Y, 2,function(x){(100*x)/width(ref)})
colnames(Y_ac) <- sampname
summary(apply(Y_ac, 2, median))
projectname <- "TGP99"
outputDir <- paste0(workDir,"codex_output/")
dir.create(outputDir)
cfiles <- dir(paste0(workDir, "data/coverage/"), "*bam.coverage*")
cdf <- rbindlist(mclapply(cfiles, function(f){
print(f);
df <- fread(paste0(workDir, "data/coverage/",f));
df$SampleName <- strsplit(f, "\\.")[[1]][1];df
},mc.cores=cores))
colnames(cdf) <- c("Chr", "Start", "Stop", "ReadCount", "SampleName")
cdf <- cdf[order(cdf$SampleName, cdf$Chr, cdf$Start, cdf$Stop),]
dim(cdf)
#[1] 465498 5
head(cdf)
# Chr Start Stop ReadCcdount SampleName
#1: 20 68319 68439 103 NA06985
#2: 20 76611 77091 129 NA06985
#3: 20 123208 123358 69 NA06985
#4: 20 125995 126389 105 NA06985
#5: 20 138119 138269 37 NA06985
#6: 20 139359 139719 156 NA06985
bedFile <- paste0(workDir, "data/bed/20130108.exome.targets.bed")
sampname <- unique(cdf$SampleName)
chr <- "20"
targetsChr <- cdf[which(cdf$Chr==chr & cdf$SampleName == cdf$SampleName[1]),
c("Chr", "Start", "Stop")]
selChr <- cdf[which(cdf$Chr==chr),]
Y <- t(do.call(rbind,lapply(sampname,
function(s){selChr$ReadCount [which(selChr$SampleName == s)]})))
colnames(Y) <- sampname
rownames(Y) <- 1:nrow(Y)
dim(Y)
#[1] 4702 99
dim(targetsChr)
#[1] 4702 3
ref <- IRanges(start = targetsChr$Start, end = targetsChr$Stop)
gc <- getgc(chr, ref)
mapp <- getmapp(chr, ref)
mapp_thresh <- 0.9 # remove exons with mapability < 0.9
cov_thresh_from <- 20 # remove exons covered by less than 20 reads
cov_thresh_to <- 4000 # remove exons covered by more than 4000 reads
length_thresh_from <- 20 # remove exons of size < 20
length_thresh_to <- 2000 # remove exons of size > 2000
gc_thresh_from <- 20 # remove exons with GC < 20
gc_thresh_to <- 80 # or GC > 80
qcObj <- qc(Y, sampname, chr, ref, mapp, gc,
cov_thresh = c(cov_thresh_from, cov_thresh_to),
length_thresh = c(length_thresh_from, length_thresh_to),
mapp_thresh = mapp_thresh,
gc_thresh = c(gc_thresh_from, gc_thresh_to))
Y_qc <- qcObj$Y_qc; sampname_qc <- qcObj$sampname_qc; gc_qc <- qcObj$gc_qc
mapp_qc <- qcObj$mapp_qc; ref_qc <- qcObj$ref_qc; qcmat <- qcObj$qcmat
normObj <- normalize(Y_qc, gc_qc, K = 1:9)
Yhat <- normObj$Yhat; AIC <- normObj$AIC; BIC <- normObj$BIC
RSS <- normObj$RSS; K <- normObj$K
optK <- choiceofK(AIC, BIC, RSS, K,
filename = paste(projectname, "_", chr, "_choiceofK", ".pdf", sep = ""))
finalcall <- CODEX::segment(Y_qc, Yhat, optK = optK,
K = K, sampname_qc,
ref_qc, chr, lmax = 200,
mode = "integer")
finalcall <- data.frame(finalcall, stringsAsFactors=F)
finalcall$targetCount <- as.numeric(finalcall$ed_exon) - as.numeric(finalcall$st_exon)
plotCall <- function(calls, i, Y_qc, Yhat_opt){
startIdx <- as.numeric(calls$st_exon[i])
stopIdx <- as.numeric(calls$ed_exon[i])
sampleName <- calls$sample_name[i]
wd <- 20
startPos <- max(1,(startIdx-wd))
stopPos <- min((stopIdx+wd), nrow(Y_qc))
selQC <- Y_qc[startPos:stopPos,]
selQC[selQC ==0] <- 0.00001
selYhat <- Yhat_opt[startPos:stopPos,]
png(file="cnv.png")
matplot(matrix(rep(startPos:stopPos, ncol(selQC)),
ncol=ncol(selQC)), log(selQC/selYhat,2),
type="l",lty=1, col="dimgrey", lwd=1,
xlab="exon nr", ylab="logratio(Y/Yhat)")
lines(startPos:stopPos,log( selQC[,sampleName]/ selYhat[,sampleName],2), lwd=3, col="red")
dev.off()
}
cnvId <- 3 # indeks zmiany dla której zostanie sporządzony wykres
plotCall(finalcall, cnvId, Y_qc, Yhat[[optK]])
Editor is loading...
Leave a Comment