diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index b09cba3b0799639c7d46d9c1619dbf21f6a0c085..ea9d13742ba0a18fc373694705859836a87e37cc 100755 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -27,6 +27,9 @@ process { withName:fastqc { queue = 'super' } + withName:inferMetadata { + queue = 'super' + } } singularity { diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index e8c1fceafd445b2ac0c3f558157bbd7ea6ce3c42..15e8effaefaa51649f6e71d6d95a2d3c98bec63f 100644 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -39,6 +39,7 @@ referenceBase = "/project/BICF/BICF_Core/shared/gudmap/references" script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh") script_parseMeta = Channel.fromPath("${baseDir}/scripts/parseMeta.py") script_inferMeta = Channel.fromPath("${baseDir}/scripts/inferMeta.sh") +script_aggregateInference = Channel.fromPath("${baseDir}/scripts/aggregateInference.R") /* * splitData: split bdbag files by replicate so fetch can occure in parallel, and rename files to replicate rid @@ -195,11 +196,9 @@ stranded.into { } spike.into{ spike_getRef - spike_rseqc } species.into { species_getRef - species_rseqc } /* @@ -417,12 +416,13 @@ process inferMetadata { input: path script_inferMeta + path script_aggregateInference path reference_rseqc set val (repRID), path (inBam), path (inBai) from dedupBam_rseqc - val species_rseqc - val spike_rseqc output: + path "infer.csv" into inferMetadata + script: """ @@ -434,29 +434,28 @@ process inferMetadata { endness=`bash inferMeta.sh endness ${repRID}.rseqc.log` fail=`bash inferMeta.sh fail ${repRID}.rseqc.log` - echo \${endness} - echo \${fail} if [ \${endness} == "PairEnd" ] then percentF=`bash inferMeta.sh pef ${repRID}.rseqc.log` percentR=`bash inferMeta.sh per ${repRID}.rseqc.log` inner_distance.py -i "${inBam}" -o ${repRID}.insertSize -r ./bed/genome.bed - junction_saturation.py -i "${inBam}" -r ./bed/genome.bed -o ${repRID}.junctionSaturation elif [ \${endness} == "SingleEnd" ] then percentF=`bash inferMeta.sh sef ${repRID}.rseqc.log` percentR=`bash inferMeta.sh ser ${repRID}.rseqc.log` fi - if [ \$percentF > 0.25 ] && [ \$percentR < 0.25 ] + if [ \$percentF -gt 0.25 ] && [ \$percentR -lt 0.25 ] then + stranded="forward" if [ \$endness == "PairEnd" ] then strategy="1++,1--,2+-,2-+" else strategy="++,--" fi - elif [ \$percentR > 0.25 ] && [ \$percentF < 0.25 ] + elif [ \$percentR -gt 0.25 ] && [ \$percentF -lt 0.25 ] then + stranded="reverse" if [ \$endness == "PairEnd" ] then strategy="1+-,1-+,2++,2--" @@ -464,15 +463,38 @@ process inferMetadata { strategy="+-,-+" fi else + stranded="unstranded" strategy="us" fi - if [ \$strategy != "us" ] - then - FPKM_count.py -i "${inBam}" -o ${repRID}.rpkmCount -r ./bed/genome.bed -d \$strategy - RPKM_saturation.py -i "${inBam}" -o ${repRID}.rpkmCount -r ./bed/genome.bed -d \$strategy - else - FPKM_count.py -i "${inBam}" -o ${repRID}.rpkmCount -r ./bed/genome.bed - RPKM_saturation.py -i "${inBam}" -o ${repRID}.rpkmCount -r ./bed/genome.bed - fi + + # calcualte TIN values per feature + tin.py -i "${inBam}" -r ./bed/genome.bed + + # aggregate infered metadata (including generate TIN stats) + Rscript aggregateInference.R --endness "\${endness}" --stranded "\${stranded}" --strategy "\${strategy}" --percentF \${percentF} --percentR \${percentR} --percentFail \${fail} --tin "${inBam.baseName}.tin.xls" """ -} \ No newline at end of file +} + +// Split infered metadata into separate channels +endsMetaI = Channel.create() +strandedI = Channel.create() +strategyI = Channel.create() +percentFI = Channel.create() +percentRI = Channel.create() +percentFailI = Channel.create() +tinMinI = Channel.create() +tinMedI = Channel.create() +tinMaxI = Channel.create() +tinSDI = Channel.create() +inferMetadata.splitCsv(sep: ",", header: false).separate( + endsMetaI, + strandedI, + strategyI, + percentFI, + percentRI, + percentFailI, + tinMinI, + tinMedI, + tinMaxI, + tinSDI +) diff --git a/workflow/scripts/aggregateInference.R b/workflow/scripts/aggregateInference.R new file mode 100644 index 0000000000000000000000000000000000000000..3e8388d8c64beeaadabe2bca7a3795b8e163cb57 --- /dev/null +++ b/workflow/scripts/aggregateInference.R @@ -0,0 +1,43 @@ +gc() +library(optparse) + +option_list=list( + make_option("--endness",action="store",type='character',help="Infered Endness"), + make_option("--stranded",action="store",type='character',help="Infered Strandedness"), + make_option("--strategy",action="store",type='character',help="Infered Sequencing Strategy"), + make_option("--percentF",action="store",type='double',help="Percent Forward Aligned Reads"), + make_option("--percentR",action="store",type='double',help="Percent Reverse Aligned Reads"), + make_option("--percentFail",action="store",type='double',help="Percent Failed Aligned Reads"), + make_option("--tin",action="store",type='character',help="TIN File") +) +opt=parse_args(OptionParser(option_list=option_list)) +rm(option_list) + +if (length(setdiff(c("endness","stranded","strategy","percentF","percentR","percentFail","tin","help"),names(opt))) != 0){ + stop(paste0("No input missing ",setdiff(c("endness","stranded","strategy","percentF","percentR","percentFail","tin","help"),names(opt)),", exiting.")) +} else if (!file.exists(opt$tin)){ + stop("No tin file passed, exiting.") +} + +if (opt$endness == "PairEnd"){ + endness <- "pe" +} else if (opt$endness == "SingleEnd"){ + endness <- "se" +} + +percentF <- round(opt$percentF,digits=2) +percentR <- round(opt$percentR,digits=2) +percentFail <- round(opt$percentFail,digits=2) + +repRID <- basename(gsub(".sorted.deduped.tin.xls","",opt$tin)) + +tin <- read.delim(opt$tin,sep="\t") + +tin.min <- round(min(tin$TIN),digits=2) +tin.med <- round(median(tin$TIN),digits=2) +tin.max <- round(max(tin$TIN),digits=2) +tin.sd <- round(sd(tin$TIN),digits=2) + +output <- paste(endness,opt$stranded,opt$strategy,percentF,percentR,percentFail,tin.min,tin.med,tin.max,tin.sd,sep=",") + +write(output,file="infer.csv")