diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index dc2eab106b9fff3a2d3a00dc5bde49c81046f5b2..9a846f6e432738ec3ec1a11d9dcf4bca2d473ed1 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -65,8 +65,8 @@ getRef:
 trimData:
   stage: unit
   script:
-    - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --basename Q-Y5F6_1M.se -j 20 ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
-    - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename Q-Y5F6_1M.pe -j 20 ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
+    - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --basename Q-Y5F6_1M.se ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
+    - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --paired --basename Q-Y5F6_1M.pe ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
     - readLengthSE=$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
     - readLengthPE=$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
     - pytest -m trimData
@@ -130,6 +130,13 @@ dataQC:
       echo "tin.py -i ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.${i}.bam -r /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed; cat Q-Y5F6_1M.se.sorted.deduped.${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \"\\t${i}\\t\";"; done | singularity run 'docker://bicf/rseqc3.0:2.0.1_indev' parallel -j 20 -k >> Q-Y5F6_1M.se.sorted.deduped.tin.xls
     - pytest -m dataQC
 
+outputBag:
+  stage: unit
+  script:
+    - mkdir test
+    - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bdbag test --archiver zip
+    - pytest -m outputBag
+
 integration_se:
   stage: integration
@@ -181,5 +188,4 @@ consistency:
       - assignedPE.txt
       - assignedExpectSE.txt
      - assignedExpectPE.txt
-    expire_in: 7 days
-
+    expire_in: 7 days
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 904efcc5a2f1a025e82fa1a65d11d33f573809d6..169ed7fed6b9891ea3c6698b914f89bcded6c99f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,9 +1,14 @@
 # v0.0.2 (in development)
 
 **User Facing**
+* Output:
+  * inputBag
+  * outputBag
 
 **Background**
 
 *Known Bugs*
+* outputBag does not contain fetch for processed data
+* Does not include automatic data upload
 
 <hr>
diff --git a/README.md b/README.md
index 23b24d840c661ec6f8c1694629b9024bf9b07caa..2216c2775a2145f8e021cb221b18414f38ed9304 100644
--- a/README.md
+++ b/README.md
@@ -62,7 +62,8 @@ To run a set of replicates from study RID:
 ------------------------------------------
 Run in repo root dir:
 * `sh workflow/scripts/splitStudy.sh [studyRID]`
-It will run in parallel in batches of 5 replicatesRID
+It will run in parallel in batches of 5 replicatesRID with 15 second delays between launches.\
+NOTE: Nextflow "local" processes for all replicates will run on the node/machine the bash script is launched from; consider running the study script on BioHPC's SLURM cluster (use `sbatch`).
@@ -115,4 +116,4 @@ Please cite in publications: Pipeline was developed by BICF from funding provide
 
 Pipeline Directed Acyclic Graph
 -------------------------------
-
+
\ No newline at end of file
diff --git a/docs/dag.png b/docs/dag.png
old mode 100644
new mode 100755
index 8c4896f2f0f6d2c765d6b020cddb4cda23064c97..a360ca8992473d8414143fb38bf13bfb6f6d4cce
Binary files a/docs/dag.png and b/docs/dag.png differ
diff --git a/workflow/conf/aws.config b/workflow/conf/aws.config
index 9ecbfb98f593f167a35650299921adaf2fffbb42..b5054f724c810b4eeaa01ae03e6db1ae421ab0cc 100644
--- a/workflow/conf/aws.config
+++ b/workflow/conf/aws.config
@@ -80,4 +80,8 @@ process {
     cpus = 2
     memory = '1 GB'
   }
+  withName: outputBag {
+    cpus = 1
+    memory = '1 GB'
+  }
 }
diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config
index 338cd5d54d4258e499a60241054c5f28109e7c19..efe86bea962eca3577471efec640248748950625 100755
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -10,7 +10,7 @@ process {
     executor = 'local'
   }
   withName: getData {
-    executor = 'local'
+    queue = 'super'
   }
   withName: parseMetadata {
     executor = 'local'
@@ -19,7 +19,7 @@ process {
     queue = 'super'
   }
   withName: getRefInfer {
-    executor = 'local'
+    queue = 'super'
   }
   withName: downsampleData {
     executor = 'local'
@@ -31,7 +31,7 @@ process {
     queue = 'super'
   }
   withName: getRef {
-    executor = 'local'
+    queue = 'super'
   }
   withName: alignData {
     queue = '256GB,256GBv1'
@@ -54,6 +54,9 @@ process {
   withName: aggrQC {
     executor = 'local'
   }
+  withName: outputBag {
+    executor = 'local'
+  }
 }
 
 singularity {
diff --git a/workflow/nextflow.config b/workflow/nextflow.config
index db422b68cb4a8d1900cbae33801f6d5f5b8eb9a0..660ec331080a9235d0e7eddc630a48500ead61f3 100644
--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
@@ -67,21 +67,24 @@ process {
   withName: aggrQC {
     container = 'bicf/multiqc1.8:2.0.1_indev'
   }
+  withName:outputBag {
+    container = 'bicf/gudmaprbkfilexfer:2.0.1_indev'
+  }
 }
 
 trace {
-  enabled = true
+  enabled = false
   file = 'pipeline_trace.txt'
   fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss'
 }
 
 timeline {
-  enabled = true
+  enabled = false
   file = 'timeline.html'
 }
 
 report {
-  enabled = true
+  enabled = false
   file = 'report.html'
 }
 
@@ -94,6 +97,6 @@ manifest {
   homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq'
   description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.'
   mainScript = 'rna-seq.nf'
-  version = 'v0.0.1'
+  version = 'v0.0.2_indev'
   nextflowVersion = '>=19.09.0'
 }
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index 772d076174400bca13110869a797f87617ff5c89..a311b9e876249c0d4cfbbb1cd13e7d94b1902807 100644
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -110,6 +110,7 @@ Development : ${params.dev}
 */
 process getBag {
   tag "${repRID}"
+  publishDir "${outDir}/inputBag", mode: 'copy', pattern: "Replicate_*.zip"
 
   input:
     path credential, stageAs: "credential.json" from deriva
@@ -303,11 +304,11 @@ process trimData {
     echo -e "LOG: trimming ${ends}" >> ${repRID}.trimData.log
     if [ "${ends}" == "se" ]
     then
-      trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID} -j `nproc` ${fastq[0]}
+      trim_galore --gzip -q 25 --length 35 --basename ${repRID} ${fastq[0]}
       readLength=\$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
     elif [ "${ends}" == "pe" ]
     then
-      trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]}
+      trim_galore --gzip -q 25 --length 35 --paired --basename ${repRID} ${fastq[0]} ${fastq[1]}
       readLength=\$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
     fi
     echo -e "LOG: trimmed" >> ${repRID}.trimData.log
@@ -834,7 +835,7 @@ process makeBigWig {
 */
 process countData {
   tag "${repRID}"
-  publishDir "${outDir}/count", mode: 'copy', pattern: "${repRID}*.countTable.csv"
+  publishDir "${outDir}/count", mode: 'copy', pattern: "${repRID}*.tpmTable.csv"
 
   input:
     path script_calculateTPM
@@ -1057,4 +1058,28 @@ process aggrQC {
     multiqc -c ${multiqcConfig} . -n ${repRID}.multiqc.html
     cp ${repRID}.multiqc_data/multiqc_data.json ${repRID}.multiqc_data.json
     """
-}
\ No newline at end of file
+}
+
+/*
+ *outputBag: create outputBag
+*/
+process outputBag {
+  tag "${repRID}"
+  publishDir "${outDir}/outputBag", mode: 'copy', pattern: "Replicate_${repRID}.outputBag.zip"
+
+  input:
+    path multiqc
+    path multiqcJSON
+
+  output:
+    path ("Replicate_*.zip") into outputBag
+
+  script:
+    """
+    mkdir Replicate_${repRID}.outputBag
+    cp ${multiqc} Replicate_${repRID}.outputBag
+    cp ${multiqcJSON} Replicate_${repRID}.outputBag
+    bdbag Replicate_${repRID}.outputBag --archiver zip
+    """
+}
+
diff --git a/workflow/scripts/splitStudy.sh b/workflow/scripts/splitStudy.sh
index a64b6d9e4cde818d1c6f91fd84144b821febc536..1f82af6132dad6148adf506a34769c0af1fe9992 100644
--- a/workflow/scripts/splitStudy.sh
+++ b/workflow/scripts/splitStudy.sh
@@ -1,5 +1,9 @@
 #!/bin/bash
 
+#SBATCH -p super
+#SBATCH --job-name GUDMAP-RBK_Study
+#SBATCH -t 7-0:0:0
+
 # query GUDMAP/RBK for study RID
 echo "curl --location --request GET 'https://www.gudmap.org/ermrest/catalog/2/entity/RNASeq:Replicate/Study_RID="${1}"'" | bash > $1_studyRID.json
@@ -10,7 +14,7 @@ python3 ./workflow/scripts/splitStudy.py -s $1
 # run pipeline on replicate RIDs in parallel
 module load nextflow/20.01.0
 module load singularity/3.5.3
-while read repRID; do echo ${repRID}; done < "$1_studyRID.csv" | xargs -P 5 -I {} nextflow run workflow/rna-seq.nf --repRID {}
+while read repRID; do echo ${repRID}; sleep 15; done < "$1_studyRID.csv" | xargs -P 5 -I {} nextflow -q run workflow/rna-seq.nf --repRID {}
 
 # cleanup study RID files
 rm $1_studyRID.json
diff --git a/workflow/tests/test_outputBag.py b/workflow/tests/test_outputBag.py
new file mode 100644
index 0000000000000000000000000000000000000000..4132d834996e5557024dbbf587d4aca41594bf9e
--- /dev/null
+++ b/workflow/tests/test_outputBag.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+@pytest.mark.outputBag
+def test_outputBag():
+    assert os.path.exists(os.path.join(test_output_path, 'test.zip'))
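A note on the `readLength`/`readLengthSE`/`readLengthPE` one-liners that appear in both `.gitlab-ci.yml` and the `trimData` process: they pull every sequence line out of a gzipped FASTQ (line 2 of each 4-line record), sort the lengths numerically, and print the median (middle value for an odd count, mean of the two middle values for an even count). A minimal Python sketch of the same computation; the `fastq_median_read_length` helper and the example path are illustrative only, not part of the pipeline:

```python
#!/usr/bin/env python3
# Sketch of what the zcat | awk | sort | awk pipeline computes: the median
# length of the sequence lines (every 4th line, offset 2) of a gzipped
# FASTQ. Helper name and example path are illustrative only.
import gzip
import statistics

def fastq_median_read_length(fastq_gz):
    lengths = []
    with gzip.open(fastq_gz, 'rt') as fq:
        for line_number, line in enumerate(fq, start=1):
            if line_number % 4 == 2:  # sequence line of each FASTQ record
                lengths.append(len(line.strip()))
    # awk prints a[int(NR/2)+1] for an odd count, otherwise the mean of the
    # two middle values -- the standard median, same as statistics.median
    return statistics.median(lengths)

if __name__ == '__main__':
    print(fastq_median_read_length('Q-Y5F6_1M.se_trimmed.fq.gz'))
```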
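The `outputBag` CI job and `workflow/tests/test_outputBag.py` currently only assert that `test.zip` was created. Since `bdbag ... --archiver zip` archives a standard BagIt bag, a stricter test could also inspect the archive contents. Below is a sketch under that assumption; the `test_outputBag_structure` name and its checks are illustrative, not in the repo. Extending it to require a `fetch.txt` entry for processed data would track the known bug listed in the CHANGELOG.

```python
#!/usr/bin/env python3
# Sketch of a stricter check than test_outputBag: the zip produced by
# `bdbag test --archiver zip` should be a BagIt bag, so the archive is
# expected to carry a bag declaration. Names here are assumptions.
import os
import zipfile

import pytest

test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
    '/../../'

@pytest.mark.outputBag
def test_outputBag_structure():
    bag_zip = os.path.join(test_output_path, 'test.zip')
    assert os.path.exists(bag_zip)
    with zipfile.ZipFile(bag_zip) as bag:
        names = bag.namelist()
        # a valid BagIt bag always carries bagit.txt at its top level
        assert any(name.endswith('bagit.txt') for name in names)
        # bdbag also writes bag-info.txt with creation metadata
        assert any(name.endswith('bag-info.txt') for name in names)
```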
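On the `splitStudy.sh` launch line: `echo ${repRID}; sleep 15` feeds replicate RIDs to `xargs -P 5` one every 15 seconds, so at most five `nextflow run` instances are in flight at a time, matching the README's "batches of 5 ... 15 second delays" description. A rough Python equivalent of that idiom, as a sketch only (the repo uses the bash one-liner; `run_replicate` is a hypothetical helper):

```python
#!/usr/bin/env python3
# Rough Python equivalent of the splitStudy.sh launch loop: read replicate
# RIDs from <studyRID>_studyRID.csv, stagger submissions by 15 seconds, and
# keep at most five nextflow runs in flight (mirrors `sleep 15` piped into
# `xargs -P 5`). Illustrative sketch only; the repo uses the bash one-liner.
import subprocess
import sys
import time
from concurrent.futures import ThreadPoolExecutor

def run_replicate(rep_rid):
    # hypothetical helper: one pipeline launch per replicate RID
    subprocess.run(
        ['nextflow', '-q', 'run', 'workflow/rna-seq.nf', '--repRID', rep_rid],
        check=True)

def main(study_rid):
    with open(f'{study_rid}_studyRID.csv') as csv_file:
        rep_rids = [line.strip() for line in csv_file if line.strip()]
    with ThreadPoolExecutor(max_workers=5) as pool:  # xargs -P 5
        for rep_rid in rep_rids:
            pool.submit(run_replicate, rep_rid)
            time.sleep(15)  # stagger launches like `sleep 15`

if __name__ == '__main__':
    main(sys.argv[1])
```

Either way, every launch still executes Nextflow's "local" processes on the submitting node, hence the README's recommendation to run the study script through `sbatch` on BioHPC.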