Commit e6676204 authored by Venkat Malladi

Merge branch '62-output.inputBagit' into 'develop'

Resolve "Add input bagit to ouput"

Closes #62

See merge request !38
parents 8a8ce097 a86785e3
Part of 2 merge requests: !39 (v0.0.2), !38 (Resolve "Add input bagit to ouput")
Pipeline #7909 passed with stages in 1 hour, 40 minutes, and 15 seconds
@@ -65,8 +65,8 @@ getRef:
 trimData:
   stage: unit
   script:
-    - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --basename Q-Y5F6_1M.se -j 20 ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
-    - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename Q-Y5F6_1M.pe -j 20 ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
+    - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --basename Q-Y5F6_1M.se ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
+    - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --paired --basename Q-Y5F6_1M.pe ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
     - readLengthSE=$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
     - readLengthPE=$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
    - pytest -m trimData
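For reference, the readLength one-liners above compute the median trimmed read length: the first awk prints the length of each sequence line (every fourth line, offset 2), sort -n orders the lengths, and the second awk emits the middle element, or the mean of the two middle elements for an even count. A minimal sketch of the same pipeline on a toy FASTQ (file name hypothetical):

    # toy.fq holds reads of length 3, 5, and 7, so the median is 5
    printf '@r1\nAAA\n+\nIII\n@r2\nAAAAA\n+\nIIIII\n@r3\nAAAAAAA\n+\nIIIIIII\n' > toy.fq
    awk '{if(NR%4==2) print length($1)}' toy.fq | sort -n \
      | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}'
    # prints: 5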
@@ -130,6 +130,13 @@ dataQC:
     echo "tin.py -i ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.${i}.bam -r /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed; cat Q-Y5F6_1M.se.sorted.deduped.${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \"\\t${i}\\t\";"; done | singularity run 'docker://bicf/rseqc3.0:2.0.1_indev' parallel -j 20 -k >> Q-Y5F6_1M.se.sorted.deduped.tin.xls
     - pytest -m dataQC
+
+outputBag:
+  stage: unit
+  script:
+    - mkdir test
+    - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bdbag test --archiver zip
+    - pytest -m outputBag
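Here `bdbag test --archiver zip` turns the (empty) test directory into a BagIt bag in place and writes test.zip beside it, which is the file the outputBag pytest marker asserts on. A quick local sanity check of the result (a sketch; bdbag's --validate option is standard CLI usage, hedged here rather than taken from this repo):

    unzip -l test.zip            # lists bagit.txt, bag-info.txt, manifest files, data/
    bdbag test --validate fast   # re-check the bagged directory against its manifests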
 integration_se:
   stage: integration

@@ -181,5 +188,4 @@ consistency:
       - assignedPE.txt
       - assignedExpectSE.txt
       - assignedExpectPE.txt
     expire_in: 7 days
\ No newline at end of file
 # v0.0.2 (in development)
 **User Facing**
+* Output:
+  * inputBag
+  * outputBag
 **Background**
 *Known Bugs*
+* outputBag does not contain fetch for processed data
+* Does not include automatic data upload
 <hr>
@@ -62,7 +62,8 @@ To run a set of replicates from study RID:
 ------------------------------------------
 Run in repo root dir:
 * `sh workflow/scripts/splitStudy.sh [studyRID]`
-It will run in parallel in batches of 5 replicate RIDs
+It will run in parallel in batches of 25 replicate RIDs with 30 second delays between launches.\
+NOTE: Nextflow "local" processes for all replicates will run on the node/machine the bash script is launched from; consider running the study script on the BioHPC SLURM cluster (use `sbatch`, as sketched below).
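Because splitStudy.sh now carries its own #SBATCH directives (see the script diff further down), submitting a whole study to SLURM is a one-liner; the study RID here is a hypothetical placeholder:

    sbatch workflow/scripts/splitStudy.sh Q-XXXX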
@@ -115,4 +116,4 @@ Please cite in publications: Pipeline was developed by BICF from funding provide
 Pipeline Directed Acyclic Graph
 -------------------------------
-![dag](docs/dag.png "DAG")
\ No newline at end of file
+![dag](docs/dag.png "DAG")
docs/dag.png: binary image replaced (665 KiB → 673 KiB)
@@ -80,4 +80,8 @@ process {
     cpus = 2
     memory = '1 GB'
   }
+  withName: outputBag {
+    cpus = 1
+    memory = '1 GB'
+  }
 }
@@ -10,7 +10,7 @@ process {
     executor = 'local'
   }
   withName: getData {
-    executor = 'local'
+    queue = 'super'
   }
   withName: parseMetadata {
     executor = 'local'
@@ -19,7 +19,7 @@ process {
     queue = 'super'
   }
   withName: getRefInfer {
-    executor = 'local'
+    queue = 'super'
   }
   withName: downsampleData {
     executor = 'local'
@@ -31,7 +31,7 @@ process {
     queue = 'super'
   }
   withName: getRef {
-    executor = 'local'
+    queue = 'super'
   }
   withName: alignData {
     queue = '256GB,256GBv1'
@@ -54,6 +54,9 @@ process {
   withName: aggrQC {
     executor = 'local'
   }
+  withName: outputBag {
+    executor = 'local'
+  }
 }
 singularity {
@@ -67,21 +67,24 @@ process {
   withName: aggrQC {
     container = 'bicf/multiqc1.8:2.0.1_indev'
   }
+  withName: outputBag {
+    container = 'bicf/gudmaprbkfilexfer:2.0.1_indev'
+  }
 }
 trace {
-  enabled = true
+  enabled = false
   file = 'pipeline_trace.txt'
   fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss'
 }
 timeline {
-  enabled = true
+  enabled = false
   file = 'timeline.html'
 }
 report {
-  enabled = true
+  enabled = false
   file = 'report.html'
 }
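With trace, timeline, and report now disabled by default in the config, the same artifacts can still be produced for an individual run using Nextflow's standard command-line switches (repRID is a placeholder):

    nextflow run workflow/rna-seq.nf --repRID <repRID> -with-trace -with-timeline timeline.html -with-report report.html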
@@ -94,6 +97,6 @@ manifest {
   homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq'
   description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.'
   mainScript = 'rna-seq.nf'
-  version = 'v0.0.1'
+  version = 'v0.0.2_indev'
   nextflowVersion = '>=19.09.0'
 }
@@ -110,6 +110,7 @@ Development : ${params.dev}
 */
 process getBag {
   tag "${repRID}"
+  publishDir "${outDir}/inputBag", mode: 'copy', pattern: "Replicate_*.zip"

   input:
     path credential, stageAs: "credential.json" from deriva
@@ -303,11 +304,11 @@ process trimData {
     echo -e "LOG: trimming ${ends}" >> ${repRID}.trimData.log
     if [ "${ends}" == "se" ]
     then
-      trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID} -j `nproc` ${fastq[0]}
+      trim_galore --gzip -q 25 --length 35 --basename ${repRID} ${fastq[0]}
       readLength=\$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
     elif [ "${ends}" == "pe" ]
     then
-      trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]}
+      trim_galore --gzip -q 25 --length 35 --paired --basename ${repRID} ${fastq[0]} ${fastq[1]}
       readLength=\$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
     fi
     echo -e "LOG: trimmed" >> ${repRID}.trimData.log
@@ -834,7 +835,7 @@ process makeBigWig {
 */
 process countData {
   tag "${repRID}"
-  publishDir "${outDir}/count", mode: 'copy', pattern: "${repRID}*.countTable.csv"
+  publishDir "${outDir}/count", mode: 'copy', pattern: "${repRID}*.tpmTable.csv"

   input:
     path script_calculateTPM
@@ -1057,4 +1058,28 @@ process aggrQC {
   multiqc -c ${multiqcConfig} . -n ${repRID}.multiqc.html
   cp ${repRID}.multiqc_data/multiqc_data.json ${repRID}.multiqc_data.json
   """
 }
\ No newline at end of file
+
+/*
+ * outputBag: create outputBag
+ */
+process outputBag {
+  tag "${repRID}"
+  publishDir "${outDir}/outputBag", mode: 'copy', pattern: "Replicate_${repRID}.outputBag.zip"
+
+  input:
+    path multiqc
+    path multiqcJSON
+
+  output:
+    path ("Replicate_*.zip") into outputBag
+
+  script:
+    """
+    mkdir Replicate_${repRID}.outputBag
+    cp ${multiqc} Replicate_${repRID}.outputBag
+    cp ${multiqcJSON} Replicate_${repRID}.outputBag
+    bdbag Replicate_${repRID}.outputBag --archiver zip
+    """
+}
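The final bdbag call converts the staging directory into a BagIt bag in place (moving the copied files into data/ and adding bagit.txt, bag-info.txt, and checksum manifests) and writes Replicate_<repRID>.outputBag.zip beside it, which publishDir then copies out. A minimal local sketch of the same steps, assuming bdbag is installed (pip install bdbag) and using hypothetical file names:

    mkdir Replicate_EXAMPLE.outputBag
    cp example.multiqc.html example.multiqc_data.json Replicate_EXAMPLE.outputBag/
    bdbag Replicate_EXAMPLE.outputBag --archiver zip
    unzip -l Replicate_EXAMPLE.outputBag.zip   # shows bagit.txt, manifests, and the data/ payload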
 #!/bin/bash
+#SBATCH -p super
+#SBATCH --job-name GUDMAP-RBK_Study
+#SBATCH -t 7-0:0:0
 # query GUDMAP/RBK for study RID
 echo "curl --location --request GET 'https://www.gudmap.org/ermrest/catalog/2/entity/RNASeq:Replicate/Study_RID="${1}"'" | bash > $1_studyRID.json
@@ -10,7 +14,7 @@ python3 ./workflow/scripts/splitStudy.py -s $1
 # run pipeline on replicate RIDs in parallel
 module load nextflow/20.01.0
 module load singularity/3.5.3
-while read repRID; do echo ${repRID}; done < "$1_studyRID.csv" | xargs -P 5 -I {} nextflow run workflow/rna-seq.nf --repRID {}
+while read repRID; do echo ${repRID}; sleep 15; done < "$1_studyRID.csv" | xargs -P 5 -I {} nextflow -q run workflow/rna-seq.nf --repRID {}

 # cleanup study RID files
 rm $1_studyRID.json
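The rewritten launch line staggers submissions: the producer loop echoes one replicate RID every 15 seconds, while xargs keeps at most five quiet-mode (-q) nextflow runs alive at a time (-P 5). The same pattern in isolation, with a hypothetical RID list and echo standing in for the pipeline call:

    printf 'RID1\nRID2\nRID3\n' > rids.txt
    # one new input every 15 s, at most 5 workers running at once
    while read rid; do echo ${rid}; sleep 15; done < rids.txt | xargs -P 5 -I {} echo "nextflow -q run workflow/rna-seq.nf --repRID {}"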
#!/usr/bin/env python3

import pytest
import pandas as pd
from io import StringIO
import os

test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
    '/../../'


@pytest.mark.outputBag
def test_outputBag():
    assert os.path.exists(os.path.join(test_output_path, 'test.zip'))