diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index 88787a7f0d0fba2b92a30c81cac5f392d4e1bfea..b09cba3b0799639c7d46d9c1619dbf21f6a0c085 100755 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -21,10 +21,10 @@ process { withName:alignData { queue = '256GB,256GBv1' } - withName: dedupData { + withName:dedupData { queue = '128GB,256GB,256GBv1' } - withName: fastqc { + withName:fastqc { queue = 'super' } } diff --git a/workflow/nextflow.config b/workflow/nextflow.config index bbe1081fb4379505f3454951743022896bf359af..3a2d89f73d59f9b450bb825a7dd85751a307f9ef 100644 --- a/workflow/nextflow.config +++ b/workflow/nextflow.config @@ -38,7 +38,7 @@ process { withName: fastqc { container = 'bicf/fastqc:2.0.0' } - withName:rseqc{ + withName:inferMetadata{ container = 'bicf/rseqc3.0:2.0.0' } } diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index 41c7c0168f1a146f457d3384baac4d676ac62332..e8c1fceafd445b2ac0c3f558157bbd7ea6ce3c42 100644 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -38,6 +38,7 @@ referenceBase = "/project/BICF/BICF_Core/shared/gudmap/references" // Define script files script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh") script_parseMeta = Channel.fromPath("${baseDir}/scripts/parseMeta.py") +script_inferMeta = Channel.fromPath("${baseDir}/scripts/inferMeta.sh") /* * splitData: split bdbag files by replicate so fetch can occure in parallel, and rename files to replicate rid @@ -305,7 +306,7 @@ process alignData { output: path ("${repRID}.sorted.bam") into rawBam - path ("${repRID}.sorted.bai") into rawBai + path ("${repRID}.sorted.bam.bai") into rawBai path ("${repRID}.align.out") path ("${repRID}.align.err") @@ -326,7 +327,7 @@ process alignData { # convert sam to bam and sort and index samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam 1>>${repRID}.align.out 2>>${repRID}.align.err; samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.bam ${repRID}.bam 1>>${repRID}.align.out 2>>${repRID}.align.err; - samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bai 1>>${repRID}.align.out 2>>${repRID}.align.err; + samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bam.bai 1>>${repRID}.align.out 2>>${repRID}.align.err; """ } @@ -342,7 +343,7 @@ process dedupData { path rawBam output: - tuple val ("${repRID}"), path ("${repRID}.sorted.deduped.bam"), path ("${repRID}.sorted.deduped.bai") into dedupBam + tuple val ("${repRID}"), path ("${repRID}.sorted.deduped.bam"), path ("${repRID}.sorted.deduped.bam.bai") into dedupBam path ("${repRID}.dedup.out") path ("${repRID}.dedup.err") @@ -354,7 +355,7 @@ process dedupData { # remove duplicated reads java -jar /picard/build/libs/picard.jar MarkDuplicates I=${rawBam} O=${repRID}.deduped.bam M=${repRID}.deduped.Metrics.txt REMOVE_DUPLICATES=true 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.deduped.bam ${repRID}.deduped.bam 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err - samtools index -@ `nproc` -b ${repRID}.sorted.deduped.bam ${repRID}.sorted.deduped.bai 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err + samtools index -@ `nproc` -b ${repRID}.sorted.deduped.bam ${repRID}.sorted.deduped.bam.bai 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err """ } @@ -410,11 +411,12 @@ process fastqc { /* *rseqc: run RSeQC to collect stats and infer experimental metadata */ -process rseqc { +process inferMetadata { tag "${repRID}" publishDir "${logsDir}", mode: 'copy', pattern: "*.rseqc.err" input: + path script_inferMeta path reference_rseqc set val (repRID), path (inBam), path (inBai) from dedupBam_rseqc val species_rseqc @@ -427,25 +429,50 @@ process rseqc { hostname >${repRID}.rseqc.err ulimit -a >>${repRID}.rseqc.err - # run set the reference (bed) name - if [ "${species_rseqc}" == "Mus musculus" ] + # infer experimental setting from dedup bam + infer_experiment.py -r ./bed/genome.bed -i "${inBam}" >${repRID}.rseqc.log + + endness=`bash inferMeta.sh endness ${repRID}.rseqc.log` + fail=`bash inferMeta.sh fail ${repRID}.rseqc.log` + echo \${endness} + echo \${fail} + if [ \${endness} == "PairEnd" ] then - reference=\$(echo m${refMoVersion}) - elif [ "${species_rseqc}" == "Homo sapiens" ] + percentF=`bash inferMeta.sh pef ${repRID}.rseqc.log` + percentR=`bash inferMeta.sh per ${repRID}.rseqc.log` + inner_distance.py -i "${inBam}" -o ${repRID}.insertSize -r ./bed/genome.bed + junction_saturation.py -i "${inBam}" -r ./bed/genome.bed -o ${repRID}.junctionSaturation + elif [ \${endness} == "SingleEnd" ] then - reference=\$(echo h${refHuVersion}) + percentF=`bash inferMeta.sh sef ${repRID}.rseqc.log` + percentR=`bash inferMeta.sh ser ${repRID}.rseqc.log` fi - if [ "${spike_rseqc}" == "yes" ] + if [ \$percentF > 0.25 ] && [ \$percentR < 0.25 ] then - reference=\$(echo \${reference}-S) - elif [ "${spike_rseqc}" == "no" ] + if [ \$endness == "PairEnd" ] + then + strategy="1++,1--,2+-,2-+" + else + strategy="++,--" + fi + elif [ \$percentR > 0.25 ] && [ \$percentF < 0.25 ] then - reference=\$(echo \${reference}) + if [ \$endness == "PairEnd" ] + then + strategy="1+-,1-+,2++,2--" + else + strategy="+-,-+" + fi + else + strategy="us" + fi + if [ \$strategy != "us" ] + then + FPKM_count.py -i "${inBam}" -o ${repRID}.rpkmCount -r ./bed/genome.bed -d \$strategy + RPKM_saturation.py -i "${inBam}" -o ${repRID}.rpkmCount -r ./bed/genome.bed -d \$strategy + else + FPKM_count.py -i "${inBam}" -o ${repRID}.rpkmCount -r ./bed/genome.bed + RPKM_saturation.py -i "${inBam}" -o ${repRID}.rpkmCount -r ./bed/genome.bed fi - reference=\$(echo GRC\${reference}) - - # infer experimental setting from dedup bam - echo \${reference} - infer_experiment.py -r "./bed/\${reference}.bed" -i "${inBam}" >${repRID}.rseqc.log """ } \ No newline at end of file diff --git a/workflow/scripts/inferMeta.sh b/workflow/scripts/inferMeta.sh new file mode 100644 index 0000000000000000000000000000000000000000..a8c9839ae2328cbed3dd39b9f2be427be12722ba --- /dev/null +++ b/workflow/scripts/inferMeta.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +if [ "${1}" == "endness" ] +then + awk '/Data/ {print}' "${2}" | sed -e 's/^This is //' -e 's/ Data$//' +elif [ "${1}" == "fail" ] +then + awk '/Fraction of reads failed/ {print}' "${2}" | sed -e 's/^Fraction of reads failed to determine: //' +elif [ "${1}" == "sef" ] +then + awk '/\+\+,--/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "++,--": //' +elif [ "${1}" == "ser" ] +then + awk '/\+-,-\+/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "+-,-+": //' +elif [ "${1}" == "pef" ] +then + awk '/1\+\+,1--,2\+-,2-\+/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "1++,1--,2+-,2-+": //' +elif [ "${1}" == "per" ] +then + awk '/1\+-,1-\+,2\+\+,2--/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "1+-,1-+,2++,2--": //' +fi \ No newline at end of file