Skip to content
Snippets Groups Projects
Commit e6676204 authored by Venkat Malladi's avatar Venkat Malladi
Browse files

Merge branch '62-output.inputBagit' into 'develop'

Resolve "Add input bagit to ouput"

Closes #62

See merge request !38
parents 8a8ce097 a86785e3
2 merge requests!39v0.0.2,!38Resolve "Add input bagit to ouput"
Pipeline #7909 passed with stages
in 1 hour, 40 minutes, and 15 seconds
......@@ -65,8 +65,8 @@ getRef:
trimData:
stage: unit
script:
- singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --basename Q-Y5F6_1M.se -j 20 ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
- singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename Q-Y5F6_1M.pe -j 20 ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
- singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --basename Q-Y5F6_1M.se ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
- singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --paired --basename Q-Y5F6_1M.pe ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
- readLengthSE=$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
- readLengthPE=$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
- pytest -m trimData
......@@ -130,6 +130,13 @@ dataQC:
echo "tin.py -i ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.${i}.bam -r /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed; cat Q-Y5F6_1M.se.sorted.deduped.${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \"\\t${i}\\t\";"; done | singularity run 'docker://bicf/rseqc3.0:2.0.1_indev' parallel -j 20 -k >> Q-Y5F6_1M.se.sorted.deduped.tin.xls
- pytest -m dataQC
# CI unit job for the outputBag step: packages a directory into a BDBag
# zip archive (same bdbag invocation the pipeline's outputBag process uses)
# and checks the result with the outputBag-marked pytest.
outputBag:
stage: unit
script:
# Create a dummy directory to bag; bdbag converts it in place and
# --archiver zip produces test.zip alongside it.
- mkdir test
- singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bdbag test --archiver zip
- pytest -m outputBag
integration_se:
stage: integration
......@@ -181,5 +188,4 @@ consistency:
- assignedPE.txt
- assignedExpectSE.txt
- assignedExpectPE.txt
expire_in: 7 days
expire_in: 7 days
\ No newline at end of file
# v0.0.2 (in development)
**User Facing**
* Output:
* inputBag
* outputBag
**Background**
*Known Bugs*
* outputBag does not contain fetch for processed data
* Does not include automatic data upload
<hr>
......
......@@ -62,7 +62,8 @@ To run a set of replicates from study RID:
------------------------------------------
Run in repo root dir:
* `sh workflow/scripts/splitStudy.sh [studyRID]`
It will run in parallel in batches of 5 replicatesRID
It will run in parallel in batches of 25 replicatesRID with 30 second delays between launches.\
NOTE: Nextflow "local" processes for all replicates will run on the node/machine the bash script is launched from... consider running the study script on the BioHPC's SLURM cluster (use `sbatch`).
......@@ -115,4 +116,4 @@ Please cite in publications: Pipeline was developed by BICF from funding provide
Pipeline Directed Acyclic Graph
-------------------------------
![dag](docs/dag.png "DAG")
![dag](docs/dag.png "DAG")
\ No newline at end of file
docs/dag.png

665 KiB | W: | H:

docs/dag.png

673 KiB | W: | H:

docs/dag.png
docs/dag.png
docs/dag.png
docs/dag.png
  • 2-up
  • Swipe
  • Onion skin
......@@ -80,4 +80,8 @@ process {
cpus = 2
memory = '1 GB'
}
withName: outputBag {
cpus = 1
memory = '1 GB'
}
}
......@@ -10,7 +10,7 @@ process {
executor = 'local'
}
withName: getData {
executor = 'local'
queue = 'super'
}
withName: parseMetadata {
executor = 'local'
......@@ -19,7 +19,7 @@ process {
queue = 'super'
}
withName: getRefInfer {
executor = 'local'
queue = 'super'
}
withName: downsampleData {
executor = 'local'
......@@ -31,7 +31,7 @@ process {
queue = 'super'
}
withName: getRef {
executor = 'local'
queue = 'super'
}
withName: alignData {
queue = '256GB,256GBv1'
......@@ -54,6 +54,9 @@ process {
withName: aggrQC {
executor = 'local'
}
withName: outputBag {
executor = 'local'
}
}
singularity {
......
......@@ -67,21 +67,24 @@ process {
withName: aggrQC {
container = 'bicf/multiqc1.8:2.0.1_indev'
}
withName:outputBag {
container = 'bicf/gudmaprbkfilexfer:2.0.1_indev'
}
}
trace {
enabled = true
enabled = false
file = 'pipeline_trace.txt'
fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss'
}
timeline {
enabled = true
enabled = false
file = 'timeline.html'
}
report {
enabled = true
enabled = false
file = 'report.html'
}
......@@ -94,6 +97,6 @@ manifest {
homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq'
description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.'
mainScript = 'rna-seq.nf'
version = 'v0.0.1'
version = 'v0.0.2_indev'
nextflowVersion = '>=19.09.0'
}
......@@ -110,6 +110,7 @@ Development : ${params.dev}
*/
process getBag {
tag "${repRID}"
publishDir "${outDir}/inputBag", mode: 'copy', pattern: "Replicate_*.zip"
input:
path credential, stageAs: "credential.json" from deriva
......@@ -303,11 +304,11 @@ process trimData {
echo -e "LOG: trimming ${ends}" >> ${repRID}.trimData.log
if [ "${ends}" == "se" ]
then
trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID} -j `nproc` ${fastq[0]}
trim_galore --gzip -q 25 --length 35 --basename ${repRID} ${fastq[0]}
readLength=\$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
elif [ "${ends}" == "pe" ]
then
trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]}
trim_galore --gzip -q 25 --length 35 --paired --basename ${repRID} ${fastq[0]} ${fastq[1]}
readLength=\$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
fi
echo -e "LOG: trimmed" >> ${repRID}.trimData.log
......@@ -834,7 +835,7 @@ process makeBigWig {
*/
process countData {
tag "${repRID}"
publishDir "${outDir}/count", mode: 'copy', pattern: "${repRID}*.countTable.csv"
publishDir "${outDir}/count", mode: 'copy', pattern: "${repRID}*.tpmTable.csv"
input:
path script_calculateTPM
......@@ -1057,4 +1058,28 @@ process aggrQC {
multiqc -c ${multiqcConfig} . -n ${repRID}.multiqc.html
cp ${repRID}.multiqc_data/multiqc_data.json ${repRID}.multiqc_data.json
"""
}
\ No newline at end of file
}
/*
 *outputBag: create outputBag
 */
// Bundles the aggregated MultiQC report (HTML + JSON) into a BDBag
// zip archive for the replicate.
process outputBag {
tag "${repRID}"
// Publish only the final zipped bag (not the intermediate bag directory).
publishDir "${outDir}/outputBag", mode: 'copy', pattern: "Replicate_${repRID}.outputBag.zip"
input:
path multiqc
path multiqcJSON
output:
// bdbag --archiver zip emits Replicate_<repRID>.outputBag.zip, which
// this glob captures.
path ("Replicate_*.zip") into outputBag
script:
"""
mkdir Replicate_${repRID}.outputBag
cp ${multiqc} Replicate_${repRID}.outputBag
cp ${multiqcJSON} Replicate_${repRID}.outputBag
bdbag Replicate_${repRID}.outputBag --archiver zip
"""
}
#!/bin/bash
#SBATCH -p super
#SBATCH --job-name GUDMAP-RBK_Study
#SBATCH -t 7-0:0:0
# query GUDMAP/RBK for study RID
echo "curl --location --request GET 'https://www.gudmap.org/ermrest/catalog/2/entity/RNASeq:Replicate/Study_RID="${1}"'" | bash > $1_studyRID.json
......@@ -10,7 +14,7 @@ python3 ./workflow/scripts/splitStudy.py -s $1
# run pipeline on replicate RIDs in parallel
module load nextflow/20.01.0
module load singularity/3.5.3
while read repRID; do echo ${repRID}; done < "$1_studyRID.csv" | xargs -P 5 -I {} nextflow run workflow/rna-seq.nf --repRID {}
while read repRID; do echo ${repRID}; sleep 15; done < "$1_studyRID.csv" | xargs -P 5 -I {} nextflow -q run workflow/rna-seq.nf --repRID {}
# cleanup study RID files
rm $1_studyRID.json
......
#!/usr/bin/env python3
"""Unit test for the CI outputBag job: confirms the bdbag zip archive exists."""

import pytest
import pandas as pd
from io import StringIO
import os

# Repository root: two directory levels above this test module.
test_output_path = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), '..', '..') + os.sep


@pytest.mark.outputBag
def test_outputBag():
    # The CI job runs `bdbag test --archiver zip` in the repo root,
    # which must leave test.zip behind.
    bag_archive = os.path.join(test_output_path, 'test.zip')
    assert os.path.exists(bag_archive)
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment