Skip to content
Snippets Groups Projects
Commit ad35f8df authored by Venkat Malladi's avatar Venkat Malladi
Browse files

Merge branch '35-RSeQC' into 'develop'

Resolve "process_RSeQC"

Closes #35

See merge request !23
parents 6da4a7d1 0c28e8d2
Branches
Tags
3 merge requests!37v0.0.1,!24Develop,!23Resolve "process_RSeQC"
Pipeline #6271 passed with stages
in 3 hours, 46 minutes, and 25 seconds
before_script:
- module add python/3.6.1-2-anaconda
- module add python/3.6.4-anaconda
- pip install --user pytest-pythonpath==0.7.1 pytest-cov==2.5.1
- module load singularity/3.0.2
- module load nextflow/19.09.0
......@@ -22,8 +22,8 @@ getData:
stage: unit
script:
- ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt
- unzip ./test_data/bagit/Replicate_16-1ZX4
- singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' sh ./workflow/scripts/bdbagFetch.sh Replicate_16-1ZX4 16-1ZX4
- unzip ./test_data/bagit/Study_Q-Y4H0.zip
- singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' bash ./workflow/scripts/bdbagFetch.sh Study_Q-Y4H0 Q-Y4H0 TEST
- pytest -m getData
parseMetadata:
......@@ -53,14 +53,14 @@ trimData:
alignData:
stage: unit
script:
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p `nproc` --add-chrname --un-gz Q-Y5JA_1M.se.unal.gz -S Q-Y5JA_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5JA_1M_trimmed.fq.gz
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p `nproc` --add-chrname --un-gz Q-Y5JA_1M.se.unal.gz -S Q-Y5JA_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5JA_1M_trimmed.fq.gz --summary-file ${repRID}.alignSummary.txt --new-summary
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o Q-Y5JA_1M.se.bam Q-Y5JA_1M.se.sam
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ `nproc` -O BAM -o Q-Y5JA_1M.se.sorted.bam Q-Y5JA_1M.se.bam
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ `nproc` -b Q-Y5JA_1M.se.sorted.bam Q-Y5JA_1M.se.sorted.bai
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p `nproc` --add-chrname --un-gz Q-Y5JA_1M.pe.unal.gz -S Q-Y5JA_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./test_data/fastq/small/Q-Y5JA_1M_R1_val_1.fq.gz -2 ./test_data/fastq/small/Q-Y5JA_1M_R2_val_2.fq.gz
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ `nproc` -b Q-Y5JA_1M.se.sorted.bam Q-Y5JA_1M.se.sorted.bam.bai
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p `nproc` --add-chrname --un-gz Q-Y5JA_1M.pe.unal.gz -S Q-Y5JA_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./test_data/fastq/small/Q-Y5JA_1M_R1_val_1.fq.gz -2 ./test_data/fastq/small/Q-Y5JA_1M_R2_val_2.fq.gz --summary-file ${repRID}.alignSummary.txt --new-summary
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o Q-Y5JA_1M.pe.bam Q-Y5JA_1M.pe.sam
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ `nproc` -O BAM -o Q-Y5JA_1M.pe.sorted.bam Q-Y5JA_1M.pe.bam
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ `nproc` -b Q-Y5JA_1M.pe.sorted.bam Q-Y5JA_1M.pe.sorted.bai
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ `nproc` -b Q-Y5JA_1M.pe.sorted.bam Q-Y5JA_1M.pe.sorted.bam.bai
- pytest -m alignData
dedupData:
......@@ -68,7 +68,7 @@ dedupData:
script:
- singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5JA_1M.se.sorted.bam O=Q-Y5JA_1M.se.deduped.bam M=Q-Y5JA_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true
- singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools sort -@ `nproc` -O BAM -o Q-Y5JA_1M.se.sorted.deduped.bam ./test_data/bam/small/Q-Y5JA_1M.se.deduped.bam
- singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools index -@ `nproc` -b ./test_data/bam/small/Q-Y5JA_1M.se.sorted.deduped.bam Q-Y5JA_1M.se.sorted.deduped.bai
- singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools index -@ `nproc` -b ./test_data/bam/small/Q-Y5JA_1M.se.sorted.deduped.bam Q-Y5JA_1M.se.sorted.deduped.bam.bai
- pytest -m dedupData
makeBigWig:
......@@ -83,6 +83,12 @@ fastqc:
- singularity run 'docker://bicf/fastqc:2.0.0' ./test_data/fastq/small/Q-Y5JA_1M.R1.fastq.gz -o .
- pytest -m fastqc
inferMetadata:
stage: unit
script:
- singularity run 'docker://bicf/rseqc3.0:2.0.0' tin.py -i ./test_data/bam/small/Q-Y5JA_1M.se.sorted.deduped.bam -r /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed
- pytest -m inferMetadata
integration_se:
stage: integration
script:
......
docs/dag.png

135 KiB | W: | H:

docs/dag.png

170 KiB | W: | H:

docs/dag.png
docs/dag.png
docs/dag.png
docs/dag.png
  • 2-up
  • Swipe
  • Onion skin
......@@ -24,7 +24,10 @@ process {
withName: dedupData {
queue = 'super'
}
withName: fastqc {
withName:fastqc {
queue = 'super'
}
withName:inferMetadata {
queue = 'super'
}
withName: makeBigWig {
......
......@@ -17,13 +17,13 @@ process {
withName:getData {
container = 'bicf/gudmaprbkfilexfer:1.3'
}
withName:parseMetadata {
withName: parseMetadata {
container = 'bicf/python3:1.3'
}
withName:getRef {
withName: getRef {
container = 'bicf/awscli:1.1'
}
withName:trimData {
withName: trimData {
container = 'bicf/trimgalore:1.1'
}
withName: alignData {
......@@ -32,11 +32,14 @@ process {
withName: dedupData {
container = 'bicf/gudmaprbkdedup:2.0.0'
}
withName: makeBigWig {
container = 'bicf/deeptools3.3:2.0.0'
}
withName: fastqc {
container = 'bicf/fastqc:2.0.0'
}
withName: makeBigWig {
container = 'bicf/deeptools3.3:2.0.0'
withName:inferMetadata{
container = 'bicf/rseqc3.0:2.0.0'
}
}
......
......@@ -38,6 +38,7 @@ referenceBase = "/project/BICF/BICF_Core/shared/gudmap/references"
// Define script files
script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh")
script_parseMeta = Channel.fromPath("${baseDir}/scripts/parseMeta.py")
script_inferMeta = Channel.fromPath("${baseDir}/scripts/inferMeta.sh")
/*
* splitData: split bdbag files by replicate so fetch can occure in parallel, and rename files to replicate rid
......@@ -112,7 +113,7 @@ process getData {
"""
}
// Split fastq's
// Replicate raw fastqs for multiple process inputs
fastqs.into {
fastqs_trimData
fastqs_fastqc
......@@ -184,6 +185,7 @@ metadata.splitCsv(sep: ",", header: false).separate(
spike,
species
)
// Replicate metadata for multiple process inputs
endsManual.into {
endsManual_trimData
endsManual_alignData
......@@ -262,6 +264,7 @@ process trimData {
output:
path ("*.fq.gz") into fastqs_trimmed
path ("*_trimming_report.txt") into trimQC
path ("${repRID}.trimData.log")
path ("${repRID}.trimData.err")
......@@ -281,8 +284,10 @@ process trimData {
"""
}
// Replicate reference for multiple process inputs
reference.into {
reference_alignData
reference_inferMeta
}
/*
......@@ -299,8 +304,8 @@ process alignData {
path reference_alignData
output:
path ("${repRID}.sorted.bam") into rawBam
path ("${repRID}.sorted.bai") into rawBai
tuple val ("${repRID}"), path ("${repRID}.sorted.bam"), path ("${repRID}.sorted.bam.bai") into rawBam
path ("*.alignSummary.txt") into alignQC
path ("${repRID}.align.out")
path ("${repRID}.align.err")
......@@ -312,19 +317,24 @@ process alignData {
# align reads
if [ "${endsManual_alignData}" == "se" ]
then
hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} -U ${fastq[0]} 1>${repRID}.align.out 2>${repRID}.align.err
hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} -U ${fastq[0]} --summary-file ${repRID}.alignSummary.txt --new-summary 1>${repRID}.align.out 2>${repRID}.align.err
elif [ "${endsManual_alignData}" == "pe" ]
then
hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} 1>${repRID}.align.out 2>${repRID}.align.err
hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} --summary-file ${repRID}.alignSummary.txt --new-summary 1>${repRID}.align.out 2>${repRID}.align.err
fi
# convert sam to bam and sort and index
samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam 1>>${repRID}.align.out 2>>${repRID}.align.err;
samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.bam ${repRID}.bam 1>>${repRID}.align.out 2>>${repRID}.align.err;
samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bai 1>>${repRID}.align.out 2>>${repRID}.align.err;
samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bam.bai 1>>${repRID}.align.out 2>>${repRID}.align.err;
"""
}
// Replicate rawBam for multiple process inputs
rawBam.into {
rawBam_dedupData
}
/*
*dedupData: mark the duplicate reads, specifically focused on PCR or optical duplicates
*/
......@@ -334,10 +344,11 @@ process dedupData {
publishDir "${logsDir}", mode: 'copy', pattern: "*.dedup.{out,err}"
input:
path rawBam
set val (repRID), path (inBam), path (inBai) from rawBam_dedupData
output:
tuple val ("${repRID}"), path ("${repRID}.sorted.deduped.bam"), path ("${repRID}.sorted.deduped.bai") into dedupBam
tuple val ("${repRID}"), path ("${repRID}.sorted.deduped.bam"), path ("${repRID}.sorted.deduped.bam.bai") into dedupBam
path ("*.deduped.Metrics.txt") into dedupQC
path ("${repRID}.dedup.out")
path ("${repRID}.dedup.err")
......@@ -347,9 +358,34 @@ process dedupData {
ulimit -a >>${repRID}.dedup.err
# remove duplicated reads
java -jar /picard/build/libs/picard.jar MarkDuplicates I=${rawBam} O=${repRID}.deduped.bam M=${repRID}.deduped.Metrics.txt REMOVE_DUPLICATES=true 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err
java -jar /picard/build/libs/picard.jar MarkDuplicates I=${inBam} O=${repRID}.deduped.bam M=${repRID}.deduped.Metrics.txt REMOVE_DUPLICATES=true 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err
samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.deduped.bam ${repRID}.deduped.bam 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err
samtools index -@ `nproc` -b ${repRID}.sorted.deduped.bam ${repRID}.sorted.deduped.bai 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err
samtools index -@ `nproc` -b ${repRID}.sorted.deduped.bam ${repRID}.sorted.deduped.bam.bai 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err
"""
}
// Replicate dedup bam/bai for multiple process inputs
dedupBam.into {
dedupBam_makeBigWig
dedupBam_inferMeta
}
/*
*Make BigWig files for output
*/
process makeBigWig {
tag "${repRID}"
publishDir "${logsDir}", mode: 'copy', pattern: "*.makeBigWig.err"
input:
set val (repRID), path (inBam), path (inBai) from dedupBam_makeBigWig
output:
path ("${repRID}.bw")
script:
"""
bamCoverage -p `nproc` -b ${inBam} -o ${repRID}.bw
"""
}
......@@ -378,20 +414,70 @@ process fastqc {
}
/*
*Make BigWig files for later processes
*rseqc: run RSeQC to collect stats and infer experimental metadata
*/
process makeBigWig {
process inferMetadata {
tag "${repRID}"
publishDir "${logsDir}", mode: 'copy', pattern: "*.makeBigWig.err"
publishDir "${logsDir}", mode: 'copy', pattern: "*.rseqc.err"
input:
set val (repRID), path (inBam), path (inBai) from dedupBam
path script_inferMeta
path reference_inferMeta
set val (repRID), path (inBam), path (inBai) from dedupBam_inferMeta
output:
path ("${repRID}.bw")
path "infer.csv" into inferedMetadata
path "${inBam.baseName}.tin.xls" into tin
path "${repRID}.insertSize.inner_distance_freq.txt" optional true into innerDistance
script:
"""
bamCoverage -p `nproc` -b ${inBam} -o ${repRID}.bw
hostname >${repRID}.rseqc.err
ulimit -a >>${repRID}.rseqc.err
# infer experimental setting from dedup bam
infer_experiment.py -r ./bed/genome.bed -i "${inBam}" >${repRID}.rseqc.log
endness=`bash inferMeta.sh endness ${repRID}.rseqc.log`
fail=`bash inferMeta.sh fail ${repRID}.rseqc.log`
if [ \${endness} == "PairEnd" ]
then
percentF=`bash inferMeta.sh pef ${repRID}.rseqc.log`
percentR=`bash inferMeta.sh per ${repRID}.rseqc.log`
inner_distance.py -i "${inBam}" -o ${repRID}.insertSize -r ./bed/genome.bed
elif [ \${endness} == "SingleEnd" ]
then
percentF=`bash inferMeta.sh sef ${repRID}.rseqc.log`
percentR=`bash inferMeta.sh ser ${repRID}.rseqc.log`
fi
if [ \$percentF -gt 0.25 ] && [ \$percentR -lt 0.25 ]
then
stranded="forward"
if [ \$endness == "PairEnd" ]
then
strategy="1++,1--,2+-,2-+"
else
strategy="++,--"
fi
elif [ \$percentR -gt 0.25 ] && [ \$percentF -lt 0.25 ]
then
stranded="reverse"
if [ \$endness == "PairEnd" ]
then
strategy="1+-,1-+,2++,2--"
else
strategy="+-,-+"
fi
else
stranded="unstranded"
strategy="us"
fi
# calcualte TIN values per feature
tin.py -i "${inBam}" -r ./bed/genome.bed
# write infered metadata to file
echo \${endness},\${stranded},\${strategy},\${percentF},\${percentR},\${fail} > infer.csv
"""
}
}
\ No newline at end of file
#!/bin/bash
bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz $1
for i in $(find */ -name "*.R*.fastq.gz"); do
path=${2}$(echo ${i##*/} | grep -o "\.R.\.fastq\.gz");
mv ${i} ./${path}
done;
\ No newline at end of file
if [ -z "${3}" ]
then
bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz ${1}
for i in $(find */ -name "*.R*.fastq.gz")
do
path=${2}$(echo ${i##*/} | grep -o "\.R.\.fastq\.gz")
mv ${i} ./${path}
done
elif [ "${3}" == "TEST" ]
then
bdbag --resolve-fetch all --fetch-filter filename\$*.txt ${1}
fi
\ No newline at end of file
#!/bin/bash
if [ "${1}" == "endness" ]
then
awk '/Data/ {print}' "${2}" | sed -e 's/^This is //' -e 's/ Data$//'
elif [ "${1}" == "fail" ]
then
awk '/Fraction of reads failed/ {print}' "${2}" | sed -e 's/^Fraction of reads failed to determine: //'
elif [ "${1}" == "sef" ]
then
awk '/\+\+,--/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "++,--": //'
elif [ "${1}" == "ser" ]
then
awk '/\+-,-\+/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "+-,-+": //'
elif [ "${1}" == "pef" ]
then
awk '/1\+\+,1--,2\+-,2-\+/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "1++,1--,2+-,2-+": //'
elif [ "${1}" == "per" ]
then
awk '/1\+-,1-\+,2\+\+,2--/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "1+-,1-+,2++,2--": //'
fi
\ No newline at end of file
......@@ -13,11 +13,11 @@ data_output_path = os.path.dirname(os.path.abspath(__file__)) + \
def test_alignData_se():
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.unal.gz'))
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.sorted.bam'))
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.sorted.bai'))
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.sorted.bam.bai'))
@pytest.mark.alignData
def test_alignData_pe():
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.unal.gz'))
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.sorted.bam'))
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.sorted.bai'))
\ No newline at end of file
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.pe.sorted.bam.bai'))
......@@ -11,4 +11,5 @@ data_output_path = os.path.dirname(os.path.abspath(__file__)) + \
@pytest.mark.dedupData
def test_dedupData():
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.deduped.bam'))
\ No newline at end of file
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.sorted.deduped.bam'))
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.sorted.deduped.bam.bai'))
......@@ -10,5 +10,5 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
@pytest.mark.getData
def test_getData():
assert os.path.exists(os.path.join(test_output_path, 'Replicate_16-1ZX4/bagit.txt'))
assert os.path.exists(os.path.join(test_output_path, '16-1ZX4.R1.fastq.gz'))
\ No newline at end of file
assert os.path.exists(os.path.join(test_output_path, 'Study_Q-Y4H0/bagit.txt'))
assert os.path.exists(os.path.join(test_output_path, 'Study_Q-Y4H0/data/assets/Study/Q-Y4H0/Experiment/Q-Y4BY/Replicate/Q-Y5F8/hMARIS_SIX2+_RiboDep#1.gene.rpkm.txt'))
\ No newline at end of file
#!/usr/bin/env python3
import pytest
import pandas as pd
from io import StringIO
import os
test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../../'
@pytest.mark.inferMetadata
def test_inferMetadata():
assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_1M.se.sorted.deduped.tin.xls'))
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment