Merge branch '5-dedupReads' into 'develop'

Resolve "process_dedup" Closes #5 See merge request !17

Merge branch '5-dedupReads' into 'develop'
Resolve "process_dedup" Closes #5 See merge request !17
a30baa9c · Gervaise Henry · b1770e35 · 8d75705f · a30baa9c · a30baa9c
Commit a30baa9c authored 5 years ago by Gervaise Henry
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -61,6 +61,12 @@ alignData:
  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ `nproc` -b Q-Y5JA_1M.pe.sorted.bam Q-Y5JA_1M.pe.sorted.bai
  - pytest -m alignData

+dedupData:
+  stage: unit
+  script:
+  - singularity exec 'docker://bicf/picard2.21.7:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5JA_1M.se.sorted.bam O=Q-Y5JA_1M.se.deduped.bam M=Q-Y5JA_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true
+  - pytest -m dedupData
+
 integration_se:
  stage: integration
  script:

--- a/workflow/conf/aws_ondemand.config
+++ b/workflow/conf/aws_ondemand.config
@@ -13,13 +13,22 @@ process {
  cpus = 1
  memory = '1 GB'

+  withName:parseMetadata {
+    cpus = 5
+  }
  withName:getRef {
    cpus = 8
  }
  withName:trimData {
-    cpus = 15
+    cpus = 8
+    memory = '2 GB'
  }
  withName:alignData {
    cpus = 50
+    memory = '10 GB'
+  }
+  withName:dedupData {
+    cpus = 2
+    memory = '20 GB'
  }
 }
\ No newline at end of file
--- a/workflow/conf/aws_spot.config
+++ b/workflow/conf/aws_spot.config
@@ -13,13 +13,22 @@ process {
  cpus = 1
  memory = '1 GB'

+  withName:parseMetadata {
+    cpus = 5
+  }
  withName:getRef {
    cpus = 8
  }
  withName:trimData {
-    cpus = 15
+    cpus = 8
+    memory = '2 GB'
  }
  withName:alignData {
    cpus = 50
+    memory = '10 GB'
+  }
+  withName:dedupData {
+    cpus = 2
+    memory = '20 GB'
  }
 }
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -16,10 +16,13 @@ process {
    executor = 'local'
  }
  withName:trimData {
-    queue = '256GB,256GBv1,384GB'
+    queue = 'super'
  }
  withName:alignData {
-    queue = '256GB,256GBv1,384GB'
+    queue = '256GB,256GBv1'
+  }
+  withName: dedupData {
+    queue = 'super'
  }
 }


--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
@@ -29,6 +29,9 @@ process {
  withName: alignData {
    container = 'bicf/gudmaprbkaligner:2.0.0'
  }
+  withName: dedupData {
+    container = 'bicf/picard2.21.7:2.0.0'
+  }
 }

 trace {
@@ -58,4 +61,4 @@ manifest {
  mainScript = 'rna-seq.nf'
  version = 'v0.0.1_indev'
  nextflowVersion = '>=19.09.0'
-}
+}
\ No newline at end of file
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -209,6 +209,7 @@ process getRef {
    ulimit -a >>${repRID}.getRef.err
    export https_proxy=\${http_proxy}

+    # retrieve appropriate reference from S3 bucket
    if [ "${species_getRef}" == "Mus musculus" ]
    then
      references=\$(echo ${referenceBase}/mouse/${refVersion}/GRCm${refMuVersion})
@@ -275,14 +276,17 @@ process alignData {
    path reference

  output:
-    path ("${repRID}.unal.gz")
-    path ("${repRID}.sorted.bam")
-    path ("${repRID}.sorted.bai")
+    path ("${repRID}.sorted.bam") into rawBam
+    path ("${repRID}.sorted.bai") into rawBai
    path ("${repRID}.align.out")
    path ("${repRID}.align.err")

  script:
    """
+    hostname >${repRID}.align.err
+    ulimit -a >>${repRID}.align.err
+
+    # align reads
    if [ "${endsManual_alignData}" == "se" ]
    then
      hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} -U ${fastq[0]} 1>${repRID}.align.out 2>${repRID}.align.err
@@ -290,8 +294,36 @@ process alignData {
    then
      hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} 1>${repRID}.align.out 2>${repRID}.align.err
    fi
-    samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam 1>>${repRID}.align.out 2>>${repRID}.align.err
-    samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.bam ${repRID}.bam 1>>${repRID}.align.out 2>>${repRID}.align.err
-    samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bai 1>>${repRID}.align.out 2>>${repRID}.align.err
+    
+    # convert sam to bam and sort and index
+    samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam 1>>${repRID}.align.out 2>>${repRID}.align.err;
+    samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.bam ${repRID}.bam 1>>${repRID}.align.out 2>>${repRID}.align.err;
+    samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bai 1>>${repRID}.align.out 2>>${repRID}.align.err;
+    """
+}
+
+/*
+ *dedupData: mark and remove the duplicate reads, specifically focused on PCR or optical duplicates
+*/
+process dedupData {
+  tag "${repRID}"
+  publishDir "${outDir}/bam", mode: 'copy', pattern: "*.deduped.bam"
+  publishDir "${logsDir}", mode: 'copy', pattern: "*.dedup.{out,err}"
+
+  input:
+    path rawBam
+
+  output:
+    path ("${repRID}.deduped.bam") into dedupBam
+    path ("${repRID}.dedup.out")
+    path ("${repRID}.dedup.err")
+
+  script:
+    """
+    hostname >${repRID}.dedup.err
+    ulimit -a >>${repRID}.dedup.err
+
+    # remove duplicate reads
+    java -jar /picard/build/libs/picard.jar MarkDuplicates I=${rawBam} O=${repRID}.deduped.bam M=${repRID}.deduped.Metrics.txt REMOVE_DUPLICATES=true 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err
    """
 }
\ No newline at end of file
--- a/workflow/tests/test_alignReads.py
+++ b/workflow/tests/test_alignReads.py
@@ -20,20 +20,4 @@ def test_alignData_se():
 def test_alignData_pe():
 	assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.unal.gz'))
 	assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.sorted.bam'))
-	assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.sorted.bai'))
-
-
-@pytest.mark.alignLogs
-def test_alignLogs_se():
-	assert os.path.exists(os.path.join(data_output_path, '16-1ZX4.align.err'))
-	assert '34497376 reads; of these:' in open(os.path.join(data_output_path, '16-1ZX4.align.err')).readlines()[0]
-	assert os.path.exists(os.path.join(data_output_path, '16-1ZX4.align.out'))
-
-
-@pytest.mark.alignLogs
-def test_alignLogs_pe():
-	assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA.align.err'))
-	assert utils.count_lines(os.path.join(data_output_path, 'Q-Y5JA.align.err')) == 7
-	assert '15824858 reads; of these:' in open(os.path.join(data_output_path, 'Q-Y5JA.align.err')).readlines()[0]
-	assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA.align.out'))
-	assert utils.count_lines(os.path.join(data_output_path, 'Q-Y5JA.align.out')) == 0
+	assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.sorted.bai'))
\ No newline at end of file
--- a/workflow/tests/test_dedupReads.py
+++ b/workflow/tests/test_dedupReads.py
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+import os
+import utils
+
+data_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+	'/../../'
+
+
+@pytest.mark.dedupData
+def test_dedupData():
+	assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.deduped.bam'))
\ No newline at end of file