Merge branch '35-RSeQC' into 'develop'

Resolve "process_RSeQC" Closes #35 See merge request !23

Merge branch '35-RSeQC' into 'develop'
Resolve "process_RSeQC" Closes #35 See merge request !23
ad35f8df · Venkat Malladi · 6da4a7d1 · 0c28e8d2 · ad35f8df · 6da4a7d1
Commit ad35f8df authored 5 years ago by Venkat Malladi
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
 before_script:
-  - module add  python/3.6.1-2-anaconda
+  - module add  python/3.6.4-anaconda
  - pip install --user pytest-pythonpath==0.7.1 pytest-cov==2.5.1
  - module load singularity/3.0.2
  - module load nextflow/19.09.0
@@ -22,8 +22,8 @@ getData:
  stage: unit
  script:
  - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt
-  - unzip ./test_data/bagit/Replicate_16-1ZX4
-  - singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' sh ./workflow/scripts/bdbagFetch.sh Replicate_16-1ZX4 16-1ZX4
+  - unzip ./test_data/bagit/Study_Q-Y4H0.zip
+  - singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' bash ./workflow/scripts/bdbagFetch.sh Study_Q-Y4H0 Q-Y4H0 TEST
  - pytest -m getData

 parseMetadata:
@@ -53,14 +53,14 @@ trimData:
 alignData:
  stage: unit
  script:
-  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p `nproc` --add-chrname --un-gz Q-Y5JA_1M.se.unal.gz -S Q-Y5JA_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5JA_1M_trimmed.fq.gz
+  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p `nproc` --add-chrname --un-gz Q-Y5JA_1M.se.unal.gz -S Q-Y5JA_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5JA_1M_trimmed.fq.gz --summary-file ${repRID}.alignSummary.txt --new-summary
  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o Q-Y5JA_1M.se.bam Q-Y5JA_1M.se.sam
  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ `nproc` -O BAM -o Q-Y5JA_1M.se.sorted.bam Q-Y5JA_1M.se.bam
-  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ `nproc` -b Q-Y5JA_1M.se.sorted.bam Q-Y5JA_1M.se.sorted.bai
-  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p `nproc` --add-chrname --un-gz Q-Y5JA_1M.pe.unal.gz -S Q-Y5JA_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./test_data/fastq/small/Q-Y5JA_1M_R1_val_1.fq.gz -2 ./test_data/fastq/small/Q-Y5JA_1M_R2_val_2.fq.gz
+  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ `nproc` -b Q-Y5JA_1M.se.sorted.bam Q-Y5JA_1M.se.sorted.bam.bai
+  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p `nproc` --add-chrname --un-gz Q-Y5JA_1M.pe.unal.gz -S Q-Y5JA_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./test_data/fastq/small/Q-Y5JA_1M_R1_val_1.fq.gz -2 ./test_data/fastq/small/Q-Y5JA_1M_R2_val_2.fq.gz --summary-file ${repRID}.alignSummary.txt --new-summary
  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o Q-Y5JA_1M.pe.bam Q-Y5JA_1M.pe.sam
  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ `nproc` -O BAM -o Q-Y5JA_1M.pe.sorted.bam Q-Y5JA_1M.pe.bam
-  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ `nproc` -b Q-Y5JA_1M.pe.sorted.bam Q-Y5JA_1M.pe.sorted.bai
+  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ `nproc` -b Q-Y5JA_1M.pe.sorted.bam Q-Y5JA_1M.pe.sorted.bam.bai
  - pytest -m alignData

 dedupData:
@@ -68,7 +68,7 @@ dedupData:
  script:
  - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5JA_1M.se.sorted.bam O=Q-Y5JA_1M.se.deduped.bam M=Q-Y5JA_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true
  - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools sort -@ `nproc` -O BAM -o Q-Y5JA_1M.se.sorted.deduped.bam ./test_data/bam/small/Q-Y5JA_1M.se.deduped.bam
-  - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools index -@ `nproc` -b ./test_data/bam/small/Q-Y5JA_1M.se.sorted.deduped.bam Q-Y5JA_1M.se.sorted.deduped.bai
+  - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools index -@ `nproc` -b ./test_data/bam/small/Q-Y5JA_1M.se.sorted.deduped.bam Q-Y5JA_1M.se.sorted.deduped.bam.bai
  - pytest -m dedupData
  
 makeBigWig:
@@ -83,6 +83,12 @@ fastqc:
  - singularity run 'docker://bicf/fastqc:2.0.0' ./test_data/fastq/small/Q-Y5JA_1M.R1.fastq.gz -o .
  - pytest -m fastqc

+inferMetadata:
+  stage: unit
+  script:
+  - singularity run 'docker://bicf/rseqc3.0:2.0.0' tin.py -i ./test_data/bam/small/Q-Y5JA_1M.se.sorted.deduped.bam -r /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed
+  - pytest -m inferMetadata
+
 integration_se:
  stage: integration
  script:

--- a/docs/dag.png
+++ b/docs/dag.png
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -24,7 +24,10 @@ process {
  withName: dedupData {
    queue = 'super'
  }
-  withName: fastqc {
+  withName:fastqc {
+    queue = 'super'
+  }
+  withName:inferMetadata {
    queue = 'super'
  }
  withName: makeBigWig {

--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
@@ -17,13 +17,13 @@ process {
  withName:getData {
    container = 'bicf/gudmaprbkfilexfer:1.3'
  }
-  withName:parseMetadata {
+  withName: parseMetadata {
    container = 'bicf/python3:1.3'
  }
-  withName:getRef {
+  withName: getRef {
    container = 'bicf/awscli:1.1'
  }
-  withName:trimData {
+  withName: trimData {
    container = 'bicf/trimgalore:1.1'
  }
  withName: alignData {
@@ -32,11 +32,14 @@ process {
  withName: dedupData {
    container = 'bicf/gudmaprbkdedup:2.0.0'
  }
+  withName: makeBigWig {
+    container = 'bicf/deeptools3.3:2.0.0'
+  }
  withName: fastqc {
    container = 'bicf/fastqc:2.0.0'
  }
-  withName: makeBigWig {
-    container = 'bicf/deeptools3.3:2.0.0'
+  withName:inferMetadata{
+    container = 'bicf/rseqc3.0:2.0.0'
  }
 }


--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -38,6 +38,7 @@ referenceBase = "/project/BICF/BICF_Core/shared/gudmap/references"
 // Define script files
 script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh")
 script_parseMeta = Channel.fromPath("${baseDir}/scripts/parseMeta.py")
+script_inferMeta = Channel.fromPath("${baseDir}/scripts/inferMeta.sh")

 /*
 * splitData: split bdbag files by replicate so fetch can occure in parallel, and rename files to replicate rid
@@ -112,7 +113,7 @@ process getData {
    """
 }

-// Split fastq's
+// Replicate raw fastqs for multiple process inputs
 fastqs.into {
  fastqs_trimData
  fastqs_fastqc
@@ -184,6 +185,7 @@ metadata.splitCsv(sep: ",", header: false).separate(
  spike,
  species
 )
+// Replicate metadata for multiple process inputs
 endsManual.into {
  endsManual_trimData
  endsManual_alignData
@@ -262,6 +264,7 @@ process trimData {

  output:
    path ("*.fq.gz") into fastqs_trimmed
+    path ("*_trimming_report.txt") into trimQC
    path ("${repRID}.trimData.log")
    path ("${repRID}.trimData.err")

@@ -281,8 +284,10 @@ process trimData {
    """
 }

+// Replicate reference for multiple process inputs
 reference.into {
  reference_alignData
+  reference_inferMeta
 }

 /*
@@ -299,8 +304,8 @@ process alignData {
    path reference_alignData

  output:
-    path ("${repRID}.sorted.bam") into rawBam
-    path ("${repRID}.sorted.bai") into rawBai
+    tuple val ("${repRID}"), path ("${repRID}.sorted.bam"), path ("${repRID}.sorted.bam.bai") into rawBam
+    path ("*.alignSummary.txt") into alignQC
    path ("${repRID}.align.out")
    path ("${repRID}.align.err")

@@ -312,19 +317,24 @@ process alignData {
    # align reads
    if [ "${endsManual_alignData}" == "se" ]
    then
-      hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} -U ${fastq[0]} 1>${repRID}.align.out 2>${repRID}.align.err
+      hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} -U ${fastq[0]} --summary-file ${repRID}.alignSummary.txt --new-summary 1>${repRID}.align.out 2>${repRID}.align.err
    elif [ "${endsManual_alignData}" == "pe" ]
    then
-      hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} 1>${repRID}.align.out 2>${repRID}.align.err
+      hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} --summary-file ${repRID}.alignSummary.txt --new-summary 1>${repRID}.align.out 2>${repRID}.align.err
    fi
    
    # convert sam to bam and sort and index
    samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam 1>>${repRID}.align.out 2>>${repRID}.align.err;
    samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.bam ${repRID}.bam 1>>${repRID}.align.out 2>>${repRID}.align.err;
-    samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bai 1>>${repRID}.align.out 2>>${repRID}.align.err;
+    samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bam.bai 1>>${repRID}.align.out 2>>${repRID}.align.err;
    """
 }

+// Replicate rawBam for multiple process inputs
+rawBam.into {
+  rawBam_dedupData
+}
+
 /*
 *dedupData: mark the duplicate reads, specifically focused on PCR or optical duplicates
 */
@@ -334,10 +344,11 @@ process dedupData {
  publishDir "${logsDir}", mode: 'copy', pattern: "*.dedup.{out,err}"

  input:
-    path rawBam
+    set val (repRID), path (inBam), path (inBai) from rawBam_dedupData

  output:
-    tuple val ("${repRID}"), path ("${repRID}.sorted.deduped.bam"), path ("${repRID}.sorted.deduped.bai") into dedupBam
+    tuple val ("${repRID}"), path ("${repRID}.sorted.deduped.bam"), path ("${repRID}.sorted.deduped.bam.bai") into dedupBam
+    path ("*.deduped.Metrics.txt") into dedupQC
    path ("${repRID}.dedup.out")
    path ("${repRID}.dedup.err")

@@ -347,9 +358,34 @@ process dedupData {
    ulimit -a >>${repRID}.dedup.err

    # remove duplicated reads
-    java -jar /picard/build/libs/picard.jar MarkDuplicates I=${rawBam} O=${repRID}.deduped.bam M=${repRID}.deduped.Metrics.txt REMOVE_DUPLICATES=true 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err
+    java -jar /picard/build/libs/picard.jar MarkDuplicates I=${inBam} O=${repRID}.deduped.bam M=${repRID}.deduped.Metrics.txt REMOVE_DUPLICATES=true 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err
    samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.deduped.bam ${repRID}.deduped.bam 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err
-    samtools index -@ `nproc` -b ${repRID}.sorted.deduped.bam ${repRID}.sorted.deduped.bai 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err
+    samtools index -@ `nproc` -b ${repRID}.sorted.deduped.bam ${repRID}.sorted.deduped.bam.bai 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err
+    """
+}
+
+// Replicate dedup bam/bai for multiple process inputs
+dedupBam.into {
+  dedupBam_makeBigWig
+  dedupBam_inferMeta
+}
+
+/*
+ *Make BigWig files for output
+*/
+process makeBigWig {
+  tag "${repRID}"
+  publishDir "${logsDir}", mode: 'copy', pattern: "*.makeBigWig.err"
+
+  input:
+    set val (repRID), path (inBam), path (inBai) from dedupBam_makeBigWig
+
+  output:
+    path ("${repRID}.bw")
+
+  script:
+    """
+    bamCoverage -p `nproc` -b ${inBam} -o ${repRID}.bw
    """
 }

@@ -378,20 +414,70 @@ process fastqc {
 }

 /*
- *Make BigWig files for later processes
+ *rseqc: run RSeQC to collect stats and infer experimental metadata
 */
-process makeBigWig {
+process inferMetadata {
  tag "${repRID}"
-  publishDir "${logsDir}", mode: 'copy', pattern: "*.makeBigWig.err"
+  publishDir "${logsDir}", mode: 'copy', pattern: "*.rseqc.err"

  input:
-    set val (repRID), path (inBam), path (inBai) from dedupBam
+    path script_inferMeta
+    path reference_inferMeta
+    set val (repRID), path (inBam), path (inBai) from dedupBam_inferMeta

  output:
-    path ("${repRID}.bw")
+    path "infer.csv" into inferedMetadata
+    path "${inBam.baseName}.tin.xls" into tin
+    path "${repRID}.insertSize.inner_distance_freq.txt" optional true into innerDistance
+

  script:
    """
-    bamCoverage -p `nproc` -b ${inBam} -o ${repRID}.bw
+    hostname >${repRID}.rseqc.err
+    ulimit -a >>${repRID}.rseqc.err
+
+    # infer experimental setting from dedup bam
+    infer_experiment.py -r ./bed/genome.bed -i "${inBam}" >${repRID}.rseqc.log
+
+    endness=`bash inferMeta.sh endness ${repRID}.rseqc.log`
+    fail=`bash inferMeta.sh fail ${repRID}.rseqc.log`
+    if [ \${endness} == "PairEnd" ] 
+    then
+      percentF=`bash inferMeta.sh pef ${repRID}.rseqc.log`
+      percentR=`bash inferMeta.sh per ${repRID}.rseqc.log`
+      inner_distance.py -i "${inBam}" -o ${repRID}.insertSize -r ./bed/genome.bed
+    elif [ \${endness} == "SingleEnd" ]
+    then
+      percentF=`bash inferMeta.sh sef ${repRID}.rseqc.log`
+      percentR=`bash inferMeta.sh ser ${repRID}.rseqc.log`
+    fi
+    if [ \$percentF -gt 0.25 ] && [ \$percentR -lt 0.25 ]
+    then
+      stranded="forward"
+      if [ \$endness == "PairEnd" ]
+      then
+        strategy="1++,1--,2+-,2-+"
+      else
+        strategy="++,--"
+      fi
+    elif [ \$percentR -gt 0.25 ] && [ \$percentF -lt 0.25 ]
+    then
+      stranded="reverse"
+      if [ \$endness == "PairEnd" ]
+      then
+        strategy="1+-,1-+,2++,2--"
+      else
+        strategy="+-,-+"
+      fi
+    else
+      stranded="unstranded"
+      strategy="us"
+    fi
+
+    # calcualte TIN values per feature
+    tin.py -i "${inBam}" -r ./bed/genome.bed
+
+    # write infered metadata to file
+    echo \${endness},\${stranded},\${strategy},\${percentF},\${percentR},\${fail} > infer.csv
    """
-}
+}
\ No newline at end of file
--- a/workflow/scripts/bdbagFetch.sh
+++ b/workflow/scripts/bdbagFetch.sh
 #!/bin/bash

-bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz $1
-for i in $(find */ -name "*.R*.fastq.gz"); do
-    path=${2}$(echo ${i##*/} | grep -o "\.R.\.fastq\.gz");
-    mv ${i} ./${path}
-done;
\ No newline at end of file
+if [ -z "${3}" ]
+then
+bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz ${1}
+    for i in $(find */ -name "*.R*.fastq.gz")
+    do
+        path=${2}$(echo ${i##*/} | grep -o "\.R.\.fastq\.gz")
+        mv ${i} ./${path}
+    done
+elif [ "${3}" == "TEST" ]
+then
+    bdbag --resolve-fetch all --fetch-filter filename\$*.txt ${1}
+fi
\ No newline at end of file
--- a/workflow/scripts/inferMeta.sh
+++ b/workflow/scripts/inferMeta.sh
+#!/bin/bash
+
+if [ "${1}" == "endness" ]
+then
+    awk '/Data/ {print}' "${2}" | sed -e 's/^This is //' -e 's/ Data$//'
+elif [ "${1}" == "fail" ]
+then
+    awk '/Fraction of reads failed/ {print}' "${2}" | sed -e 's/^Fraction of reads failed to determine: //'
+elif [ "${1}" == "sef" ]
+then
+    awk '/\+\+,--/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "++,--": //'
+elif [ "${1}" == "ser" ]
+then
+    awk '/\+-,-\+/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "+-,-+": //'
+elif [ "${1}" == "pef" ]
+then
+    awk '/1\+\+,1--,2\+-,2-\+/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "1++,1--,2+-,2-+": //'
+elif [ "${1}" == "per" ]
+then
+    awk '/1\+-,1-\+,2\+\+,2--/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "1+-,1-+,2++,2--": //'
+fi
\ No newline at end of file
--- a/workflow/tests/test_alignReads.py
+++ b/workflow/tests/test_alignReads.py
@@ -13,11 +13,11 @@ data_output_path = os.path.dirname(os.path.abspath(__file__)) + \
 def test_alignData_se():
 	assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.unal.gz'))
 	assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.sorted.bam'))
-	assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.sorted.bai'))
+	assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.sorted.bam.bai'))


 @pytest.mark.alignData
 def test_alignData_pe():
 	assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.unal.gz'))
 	assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.sorted.bam'))
-	assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.sorted.bai'))
\ No newline at end of file
+	assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.sorted.bam.bai'))
--- a/workflow/tests/test_dedupReads.py
+++ b/workflow/tests/test_dedupReads.py
@@ -11,4 +11,5 @@ data_output_path = os.path.dirname(os.path.abspath(__file__)) + \

 @pytest.mark.dedupData
 def test_dedupData():
-	assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.deduped.bam'))
\ No newline at end of file
+	assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.sorted.deduped.bam'))
+	assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.sorted.deduped.bam.bai'))
--- a/workflow/tests/test_getData.py
+++ b/workflow/tests/test_getData.py
@@ -10,5 +10,5 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \

 @pytest.mark.getData
 def test_getData():
-    assert os.path.exists(os.path.join(test_output_path, 'Replicate_16-1ZX4/bagit.txt'))
-    assert os.path.exists(os.path.join(test_output_path, '16-1ZX4.R1.fastq.gz'))
\ No newline at end of file
+    assert os.path.exists(os.path.join(test_output_path, 'Study_Q-Y4H0/bagit.txt'))
+    assert os.path.exists(os.path.join(test_output_path, 'Study_Q-Y4H0/data/assets/Study/Q-Y4H0/Experiment/Q-Y4BY/Replicate/Q-Y5F8/hMARIS_SIX2+_RiboDep#1.gene.rpkm.txt'))
\ No newline at end of file
--- a/workflow/tests/test_inferMetadata.py
+++ b/workflow/tests/test_inferMetadata.py
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+                '/../../'
+
+@pytest.mark.inferMetadata
+def test_inferMetadata():
+    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_1M.se.sorted.deduped.tin.xls'))
+