Skip to content
Snippets Groups Projects
Commit a30baa9c authored by Gervaise Henry's avatar Gervaise Henry :cowboy:
Browse files

Merge branch '5-dedupReads' into 'develop'

Resolve "process_dedup"

Closes #5

See merge request !17
parents b1770e35 8d75705f
Branches
Tags
2 merge requests: !37 v0.0.1, !17 Resolve "process_dedup"
Pipeline #5878 passed with stages
in 51 minutes and 51 seconds
......@@ -61,6 +61,12 @@ alignData:
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ `nproc` -b Q-Y5JA_1M.pe.sorted.bam Q-Y5JA_1M.pe.sorted.bai
- pytest -m alignData
# CI job for the dedup step: runs Picard MarkDuplicates on the small single-end
# test BAM, then runs the pytest tests selected by the `dedupData` marker.
dedupData:
stage: unit
script:
# Runs Picard from the bicf/picard2.21.7 container via singularity; REMOVE_DUPLICATES=true
# is passed, and the output BAM and metrics file are written to the current directory.
- singularity exec 'docker://bicf/picard2.21.7:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5JA_1M.se.sorted.bam O=Q-Y5JA_1M.se.deduped.bam M=Q-Y5JA_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true
# Runs only the tests marked `dedupData`.
- pytest -m dedupData
integration_se:
stage: integration
script:
......
......@@ -13,13 +13,22 @@ process {
cpus = 1
memory = '1 GB'
withName:parseMetadata {
cpus = 5
}
withName:getRef {
cpus = 8
}
withName:trimData {
cpus = 15
cpus = 8
memory = '2 GB'
}
withName:alignData {
cpus = 50
memory = '10 GB'
}
withName:dedupData {
cpus = 2
memory = '20 GB'
}
}
\ No newline at end of file
......@@ -13,13 +13,22 @@ process {
cpus = 1
memory = '1 GB'
withName:parseMetadata {
cpus = 5
}
withName:getRef {
cpus = 8
}
withName:trimData {
cpus = 15
cpus = 8
memory = '2 GB'
}
withName:alignData {
cpus = 50
memory = '10 GB'
}
withName:dedupData {
cpus = 2
memory = '20 GB'
}
}
......@@ -16,10 +16,13 @@ process {
executor = 'local'
}
withName:trimData {
queue = '256GB,256GBv1,384GB'
queue = 'super'
}
withName:alignData {
queue = '256GB,256GBv1,384GB'
queue = '256GB,256GBv1'
}
withName: dedupData {
queue = 'super'
}
}
......
......@@ -29,6 +29,9 @@ process {
withName: alignData {
container = 'bicf/gudmaprbkaligner:2.0.0'
}
withName: dedupData {
container = 'bicf/picard2.21.7:2.0.0'
}
}
trace {
......@@ -58,4 +61,4 @@ manifest {
mainScript = 'rna-seq.nf'
version = 'v0.0.1_indev'
nextflowVersion = '>=19.09.0'
}
}
\ No newline at end of file
......@@ -209,6 +209,7 @@ process getRef {
ulimit -a >>${repRID}.getRef.err
export https_proxy=\${http_proxy}
# retrieve appropriate reference from S3 bucket
if [ "${species_getRef}" == "Mus musculus" ]
then
references=\$(echo ${referenceBase}/mouse/${refVersion}/GRCm${refMuVersion})
......@@ -275,14 +276,17 @@ process alignData {
path reference
output:
path ("${repRID}.unal.gz")
path ("${repRID}.sorted.bam")
path ("${repRID}.sorted.bai")
path ("${repRID}.sorted.bam") into rawBam
path ("${repRID}.sorted.bai") into rawBai
path ("${repRID}.align.out")
path ("${repRID}.align.err")
script:
"""
hostname >${repRID}.align.err
ulimit -a >>${repRID}.align.err
# align reads
if [ "${endsManual_alignData}" == "se" ]
then
hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} -U ${fastq[0]} 1>${repRID}.align.out 2>${repRID}.align.err
......@@ -290,8 +294,36 @@ process alignData {
then
hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} 1>${repRID}.align.out 2>${repRID}.align.err
fi
samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam 1>>${repRID}.align.out 2>>${repRID}.align.err
samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.bam ${repRID}.bam 1>>${repRID}.align.out 2>>${repRID}.align.err
samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bai 1>>${repRID}.align.out 2>>${repRID}.align.err
# convert sam to bam and sort and index
samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam 1>>${repRID}.align.out 2>>${repRID}.align.err;
samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.bam ${repRID}.bam 1>>${repRID}.align.out 2>>${repRID}.align.err;
samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bai 1>>${repRID}.align.out 2>>${repRID}.align.err;
"""
}
/*
*dedupData: mark and remove the duplicate reads, specifically focused on PCR or optical duplicates
*/
process dedupData {
  // Label each task with the replicate ID in the Nextflow log.
  tag "${repRID}"
  // Publish the deduplicated BAM together with the Picard metrics report that describes it.
  publishDir "${outDir}/bam", mode: 'copy', pattern: "*.deduped.{bam,Metrics.txt}"
  // Publish the captured stdout/stderr of this step with the other logs.
  publishDir "${logsDir}", mode: 'copy', pattern: "*.dedup.{out,err}"

  input:
    // Sorted BAM; no explicit `from`, so this presumably reads the `rawBam`
    // channel of the same name (confirm against alignData's outputs).
    path rawBam

  output:
    path ("${repRID}.deduped.bam") into dedupBam
    // Declared so the metrics report is tracked as a task output (and can be
    // published) instead of being left behind in the task work directory.
    path ("${repRID}.deduped.Metrics.txt")
    path ("${repRID}.dedup.out")
    path ("${repRID}.dedup.err")

  script:
    // NOTE(review): repRID is not an input of this process; it is assumed to be
    // defined in the enclosing pipeline scope.
    """
    hostname >${repRID}.dedup.err
    ulimit -a >>${repRID}.dedup.err

    # Remove duplicated reads; stdout/stderr are appended so the host info written above is kept
    java -jar /picard/build/libs/picard.jar MarkDuplicates I=${rawBam} O=${repRID}.deduped.bam M=${repRID}.deduped.Metrics.txt REMOVE_DUPLICATES=true 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err
    """
}
\ No newline at end of file
......@@ -20,20 +20,4 @@ def test_alignData_se():
def test_alignData_pe():
    """Check that each `.pe` alignment output file exists under data_output_path."""
    expected_files = (
        'Q-Y5JA_1M.pe.unal.gz',
        'Q-Y5JA_1M.pe.sorted.bam',
        'Q-Y5JA_1M.pe.sorted.bai',
    )
    for file_name in expected_files:
        assert os.path.exists(os.path.join(data_output_path, file_name))
@pytest.mark.alignLogs
def test_alignLogs_se():
    """Check the 16-1ZX4 alignment logs.

    Both the .err and .out logs must exist under data_output_path, and the
    first line of the .err log must report the expected read count.
    """
    err_log = os.path.join(data_output_path, '16-1ZX4.align.err')
    out_log = os.path.join(data_output_path, '16-1ZX4.align.out')

    assert os.path.exists(err_log)
    # Read just the first line inside a context manager so the file handle is
    # always closed (the bare open(...).readlines() left it to the GC).
    with open(err_log) as handle:
        first_line = handle.readline()
    assert '34497376 reads; of these:' in first_line
    assert os.path.exists(out_log)
@pytest.mark.alignLogs
def test_alignLogs_pe():
    """Check the Q-Y5JA alignment logs.

    The .err log must exist, have 7 lines, and report the expected read count
    on its first line; the .out log must exist and have 0 lines.
    """
    err_log = os.path.join(data_output_path, 'Q-Y5JA.align.err')
    out_log = os.path.join(data_output_path, 'Q-Y5JA.align.out')

    assert os.path.exists(err_log)
    assert utils.count_lines(err_log) == 7
    # Read just the first line inside a context manager so the file handle is
    # always closed (the bare open(...).readlines() left it to the GC).
    with open(err_log) as handle:
        first_line = handle.readline()
    assert '15824858 reads; of these:' in first_line
    assert os.path.exists(out_log)
    assert utils.count_lines(out_log) == 0
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.sorted.bai'))
\ No newline at end of file
#!/usr/bin/env python3
import pytest
import pandas as pd
import os
import utils
# Path two levels above the directory containing this test file; the tests
# below look for their expected files relative to it.
data_output_path = '{}/../../'.format(
    os.path.dirname(os.path.abspath(__file__))
)
@pytest.mark.dedupData
def test_dedupData():
    """Check that the deduplicated BAM file exists under data_output_path."""
    deduped_bam = os.path.join(data_output_path, 'Q-Y5JA_1M.se.deduped.bam')
    assert os.path.exists(deduped_bam)
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment