Skip to content
Snippets Groups Projects
Commit a30baa9c authored by Gervaise Henry's avatar Gervaise Henry :cowboy:
Browse files

Merge branch '5-dedupReads' into 'develop'

Resolve "process_dedup"

Closes #5

See merge request !17
parents b1770e35 8d75705f
2 merge requests: !37 v0.0.1, !17 Resolve "process_dedup"
Pipeline #5878 passed with stages
in 51 minutes and 51 seconds
...@@ -61,6 +61,12 @@ alignData: ...@@ -61,6 +61,12 @@ alignData:
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ `nproc` -b Q-Y5JA_1M.pe.sorted.bam Q-Y5JA_1M.pe.sorted.bai - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ `nproc` -b Q-Y5JA_1M.pe.sorted.bam Q-Y5JA_1M.pe.sorted.bai
- pytest -m alignData - pytest -m alignData
# Unit-stage CI job for the deduplication step.
dedupData:
stage: unit
script:
# Run Picard MarkDuplicates from the picard container on the small single-end
# test BAM; REMOVE_DUPLICATES=true drops the duplicates from the output BAM.
- singularity exec 'docker://bicf/picard2.21.7:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5JA_1M.se.sorted.bam O=Q-Y5JA_1M.se.deduped.bam M=Q-Y5JA_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true
# Run only the pytest tests marked `dedupData`.
- pytest -m dedupData
integration_se: integration_se:
stage: integration stage: integration
script: script:
......
...@@ -13,13 +13,22 @@ process { ...@@ -13,13 +13,22 @@ process {
cpus = 1 cpus = 1
memory = '1 GB' memory = '1 GB'
// Resource override for the parseMetadata process: request 5 CPUs
// instead of the process-wide default set in this scope.
withName:parseMetadata {
cpus = 5
}
withName:getRef { withName:getRef {
cpus = 8 cpus = 8
} }
withName:trimData { withName:trimData {
cpus = 15 cpus = 8
memory = '2 GB'
} }
withName:alignData { withName:alignData {
cpus = 50 cpus = 50
memory = '10 GB'
}
withName:dedupData {
cpus = 2
memory = '20 GB'
} }
} }
\ No newline at end of file
...@@ -13,13 +13,22 @@ process { ...@@ -13,13 +13,22 @@ process {
cpus = 1 cpus = 1
memory = '1 GB' memory = '1 GB'
// Resource override for the parseMetadata process: request 5 CPUs
// instead of the process-wide default set in this scope.
withName:parseMetadata {
cpus = 5
}
withName:getRef { withName:getRef {
cpus = 8 cpus = 8
} }
withName:trimData { withName:trimData {
cpus = 15 cpus = 8
memory = '2 GB'
} }
withName:alignData { withName:alignData {
cpus = 50 cpus = 50
memory = '10 GB'
}
withName:dedupData {
cpus = 2
memory = '20 GB'
} }
} }
...@@ -16,10 +16,13 @@ process { ...@@ -16,10 +16,13 @@ process {
executor = 'local' executor = 'local'
} }
withName:trimData { withName:trimData {
queue = '256GB,256GBv1,384GB' queue = 'super'
} }
withName:alignData { withName:alignData {
queue = '256GB,256GBv1,384GB' queue = '256GB,256GBv1'
}
withName: dedupData {
queue = 'super'
} }
} }
......
...@@ -29,6 +29,9 @@ process { ...@@ -29,6 +29,9 @@ process {
withName: alignData { withName: alignData {
container = 'bicf/gudmaprbkaligner:2.0.0' container = 'bicf/gudmaprbkaligner:2.0.0'
} }
// Run the dedupData process inside the Picard container image.
withName: dedupData {
container = 'bicf/picard2.21.7:2.0.0'
}
} }
trace { trace {
...@@ -58,4 +61,4 @@ manifest { ...@@ -58,4 +61,4 @@ manifest {
mainScript = 'rna-seq.nf' mainScript = 'rna-seq.nf'
version = 'v0.0.1_indev' version = 'v0.0.1_indev'
nextflowVersion = '>=19.09.0' nextflowVersion = '>=19.09.0'
} }
\ No newline at end of file
...@@ -209,6 +209,7 @@ process getRef { ...@@ -209,6 +209,7 @@ process getRef {
ulimit -a >>${repRID}.getRef.err ulimit -a >>${repRID}.getRef.err
export https_proxy=\${http_proxy} export https_proxy=\${http_proxy}
# retrieve appropriate reference from S3 bucket
if [ "${species_getRef}" == "Mus musculus" ] if [ "${species_getRef}" == "Mus musculus" ]
then then
references=\$(echo ${referenceBase}/mouse/${refVersion}/GRCm${refMuVersion}) references=\$(echo ${referenceBase}/mouse/${refVersion}/GRCm${refMuVersion})
...@@ -275,14 +276,17 @@ process alignData { ...@@ -275,14 +276,17 @@ process alignData {
path reference path reference
output: output:
path ("${repRID}.unal.gz") path ("${repRID}.sorted.bam") into rawBam
path ("${repRID}.sorted.bam") path ("${repRID}.sorted.bai") into rawBai
path ("${repRID}.sorted.bai")
path ("${repRID}.align.out") path ("${repRID}.align.out")
path ("${repRID}.align.err") path ("${repRID}.align.err")
script: script:
""" """
hostname >${repRID}.align.err
ulimit -a >>${repRID}.align.err
# align reads
if [ "${endsManual_alignData}" == "se" ] if [ "${endsManual_alignData}" == "se" ]
then then
hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} -U ${fastq[0]} 1>${repRID}.align.out 2>${repRID}.align.err hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} -U ${fastq[0]} 1>${repRID}.align.out 2>${repRID}.align.err
...@@ -290,8 +294,36 @@ process alignData { ...@@ -290,8 +294,36 @@ process alignData {
then then
hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} 1>${repRID}.align.out 2>${repRID}.align.err hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} 1>${repRID}.align.out 2>${repRID}.align.err
fi fi
samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam 1>>${repRID}.align.out 2>>${repRID}.align.err
samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.bam ${repRID}.bam 1>>${repRID}.align.out 2>>${repRID}.align.err # convert sam to bam and sort and index
samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bai 1>>${repRID}.align.out 2>>${repRID}.align.err samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam 1>>${repRID}.align.out 2>>${repRID}.align.err;
samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.bam ${repRID}.bam 1>>${repRID}.align.out 2>>${repRID}.align.err;
samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bai 1>>${repRID}.align.out 2>>${repRID}.align.err;
"""
}
/*
*dedupData: remove duplicate reads, specifically PCR or optical duplicates
*/
// Deduplicate a BAM file with Picard MarkDuplicates (duplicates are removed,
// not just flagged, because REMOVE_DUPLICATES=true is passed).
//   input : rawBam - the BAM file to deduplicate
//   output: ${repRID}.deduped.bam (emitted on the dedupBam channel) and the
//           ${repRID}.dedup.out / ${repRID}.dedup.err log files
// NOTE(review): repRID is not declared in this process's input block;
// presumably it is defined at script scope - confirm.
process dedupData {
// Label each task with the replicate ID.
tag "${repRID}"
// Copy the deduplicated BAM into the bam/ results directory.
publishDir "${outDir}/bam", mode: 'copy', pattern: "*.deduped.bam"
// Copy the stdout/stderr logs into the logs directory.
publishDir "${logsDir}", mode: 'copy', pattern: "*.dedup.{out,err}"
input:
path rawBam
output:
path ("${repRID}.deduped.bam") into dedupBam
path ("${repRID}.dedup.out")
path ("${repRID}.dedup.err")
// The script first records the host name and resource limits in the .err
// log, then runs Picard with stdout/stderr appended to the .out/.err logs.
// NOTE(review): the metrics file written via M= is not listed under output:
// and matches neither publishDir pattern, so it is not published.
script:
"""
hostname >${repRID}.dedup.err
ulimit -a >>${repRID}.dedup.err
#Remove duplicated reads
java -jar /picard/build/libs/picard.jar MarkDuplicates I=${rawBam} O=${repRID}.deduped.bam M=${repRID}.deduped.Metrics.txt REMOVE_DUPLICATES=true 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err
""" """
} }
\ No newline at end of file
...@@ -20,20 +20,4 @@ def test_alignData_se(): ...@@ -20,20 +20,4 @@ def test_alignData_se():
def test_alignData_pe(): def test_alignData_pe():
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.unal.gz')) assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.unal.gz'))
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.sorted.bam')) assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.sorted.bam'))
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.sorted.bai')) assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.sorted.bai'))
\ No newline at end of file
@pytest.mark.alignLogs
def test_alignLogs_se():
    """Check the single-end alignment logs for replicate 16-1ZX4.

    Asserts that the ``.align.err`` log exists and that its first line
    contains the expected read count, then that the ``.align.out`` log
    exists.
    """
    err_log = os.path.join(data_output_path, '16-1ZX4.align.err')
    out_log = os.path.join(data_output_path, '16-1ZX4.align.out')

    assert os.path.exists(err_log)
    # Read only the first line, and close the handle deterministically
    # instead of leaving it to the garbage collector.
    with open(err_log) as handle:
        first_line = handle.readline()
    assert '34497376 reads; of these:' in first_line
    assert os.path.exists(out_log)
@pytest.mark.alignLogs
def test_alignLogs_pe():
    """Check the paired-end alignment logs for replicate Q-Y5JA.

    Asserts that the ``.align.err`` log exists, has exactly 7 lines and
    starts with the expected read count, then that the ``.align.out`` log
    exists and is empty (0 lines).
    """
    err_log = os.path.join(data_output_path, 'Q-Y5JA.align.err')
    out_log = os.path.join(data_output_path, 'Q-Y5JA.align.out')

    assert os.path.exists(err_log)
    assert utils.count_lines(err_log) == 7
    # Read only the first line, and close the handle deterministically
    # instead of leaving it to the garbage collector.
    with open(err_log) as handle:
        first_line = handle.readline()
    assert '15824858 reads; of these:' in first_line
    assert os.path.exists(out_log)
    assert utils.count_lines(out_log) == 0
#!/usr/bin/env python3
import pytest
import pandas as pd
import os
import utils
# Directory two levels above this test file; the tests below look for their
# expected output files here (presumably the repository root - confirm).
data_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../../'
@pytest.mark.dedupData
def test_dedupData():
    """Check that the deduplicated single-end BAM file was produced."""
    deduped_bam = os.path.join(data_output_path, 'Q-Y5JA_1M.se.deduped.bam')
    assert os.path.exists(deduped_bam)
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment