diff --git a/.gitignore b/.gitignore index 2bc34493af5ad8d30a9ef477283aa7c4b32700b8..12288788210fa386427657fa55ab47b9ac14a6aa 100644 --- a/.gitignore +++ b/.gitignore @@ -1,32 +1,6 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -*.egg-info/ -.installed.cfg -*.egg - -# PyInstaller -# Created by https://www.gitignore.io/api/r,perl,macos,linux,python,windows -# Edit at https://www.gitignore.io/?templates=r,perl,macos,linux,python,windows +# Created by https://www.gitignore.io/api/r,perl,linux,macos,python,windows,microsoftoffice +# Edit at https://www.gitignore.io/?templates=r,perl,linux,macos,python,windows,microsoftoffice ### Linux ### *~ @@ -71,6 +45,27 @@ Network Trash Folder Temporary Items .apdisk +### MicrosoftOffice ### +*.tmp + +# Word temporary +~$*.doc* + +# Word Auto Backup File +Backup of *.doc* + +# Excel temporary +~$*.xls* + +# Excel Backup File +*.xlk + +# PowerPoint temporary +~$*.ppt* + +# Visio autosave temporary files +*.~vsd* + ### Perl ### !Build/ .last_cover_stats @@ -165,15 +160,6 @@ coverage.xml *.mo *.pot -# Django stuff: -*.log -local_settings.py -db.sqlite3 - -# Flask stuff: -instance/ -.webassets-cache - # Scrapy stuff: .scrapy @@ -183,31 +169,22 @@ docs/_build/ # PyBuilder target/ -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - # pyenv .python-version +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. 
+#Pipfile.lock + # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - # Spyder project settings .spyderproject .spyproject @@ -215,6 +192,11 @@ venv.bak/ # Rope project settings .ropeproject +# Mr Developer +.mr.developer.cfg +.project +.pydevproject + # mkdocs documentation /site @@ -226,9 +208,6 @@ dmypy.json # Pyre type checker .pyre/ -### Python Patch ### -.venv/ - ### R ### # History files .Rhistory @@ -237,6 +216,9 @@ dmypy.json # Session Data files .RData +# User-specific files +.Ruserdata + # Example code in package build process *-Ex.R @@ -257,7 +239,7 @@ vignettes/*.pdf .httr-oauth # knitr and R markdown default cache directories -/*_cache/ +*_cache/ /cache/ # Temporary files created by R markdown @@ -271,6 +253,7 @@ vignettes/*.pdf ### Windows ### # Windows thumbnail cache files Thumbs.db +Thumbs.db:encryptable ehthumbs.db ehthumbs_vista.db @@ -293,11 +276,11 @@ $RECYCLE.BIN/ # Windows shortcuts *.lnk -# End of https://www.gitignore.io/api/r,perl,macos,linux,python,windows +# End of https://www.gitignore.io/api/r,perl,linux,macos,python,windows,microsoftoffice # nextflow analysis folders/files /test_data/* -/workflow/docker/images/* +!/test_data/createTestData.sh /workflow/.nextflow/* /workflow/work/* /workflow/output/* @@ -314,6 +297,8 @@ timeline*.html* *.tmp *.swp *.out +*_studyRID.json +*_studyRID.csv run*.sh !.gitkeep diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..dc2eab106b9fff3a2d3a00dc5bde49c81046f5b2 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,185 @@ +before_script: + - module add python/3.6.4-anaconda + - pip install --user pytest-pythonpath==0.7.1 pytest-cov==2.5.1 + - module load singularity/3.5.3 + - module load nextflow/20.01.0 + - ln -sfn /project/BICF/BICF_Core/shared/gudmap/test_data/* ./test_data/ + - mkdir -p ~/.deriva + - mkdir -p ~/.bdbag + +stages: + - unit + - integration + - consistency + +getBag: + stage: unit + script: + - ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json + - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' deriva-download-cli dev.gudmap.org --catalog 2 ./workflow/conf/replicate_export_config.json . 
rid=Q-Y5F6 + - pytest -m getBag + +getData: + stage: unit + script: + - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt + - unzip ./test_data/bagit/Replicate_Q-Y5F6.zip + - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bash ./workflow/scripts/bdbagFetch.sh Replicate_Q-Y5F6 Replicate_Q-Y5F6 TEST + - pytest -m getData + +parseMetadata: + stage: unit + script: + - rep=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p repRID) + - exp=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p expRID) + - study=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p studyRID) + - endsMeta=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsMeta) + - endsManual=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsManual) + - stranded=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p stranded) + - spike=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p spike) + - species=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p species) + - readLength=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.stageNew.csv" -p readLength) + - echo -e "${endsMeta},${endsManual},${stranded},${spike},${species},${readLength},${exp},${study},${rep}" > design.csv + - pytest -m parseMetadata + +inferMetadata: + stage: unit + script: + - > + align=$(echo $(grep "Overall alignment rate" ./test_data/meta/Q-Y5F6_1M.se.alignSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) && + if [[ ${align} == "" ]]; then exit 1; fi + - > + singularity run 'docker://bicf/rseqc3.0:2.0.1_indev' infer_experiment.py -r "/project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed" -i "./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam" 1>> Q-Y5F6_1M.se.inferMetadata.log && + ended=`singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/inferMeta.sh endness Q-Y5F6_1M.se.inferMetadata.log` && + if [[ ${ended} == "" ]]; then exit 1; fi + - pytest -m inferMetadata + +getRef: + stage: unit + script: + - mkdir -p hu + - mkdir -p mo + - cp -R /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2 ./hu/ + - cp -R /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2 ./mo/ + +trimData: + stage: unit + script: + - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --basename Q-Y5F6_1M.se -j 20 ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz + - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename Q-Y5F6_1M.pe -j 20 ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz + - readLengthSE=$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print 
length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}') + - readLengthPE=$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}') + - pytest -m trimData + +downsampleData: + stage: unit + script: + - singularity run 'docker://bicf/seqtk:2.0.1_indev' seqtk sample -s100 ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz 1000 1> sampled.1.fq + - pytest -m downsampleData + +alignData: + stage: unit + script: + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.se.unal.gz -S Q-Y5F6_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz --summary-file Q-Y5F6_1M.se.alignSummary.txt --new-summary + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.bam + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.sorted.bam.bai + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.pe.unal.gz -S Q-Y5F6_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./test_data/fastq/small/Q-Y5F6_1M.pe_R1_val_1.fq.gz -2 ./test_data/fastq/small/Q-Y5F6_1M.pe_R2_val_2.fq.gz --summary-file Q-Y5F6_1M.pe.alignSummary.txt --new-summary + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.pe.bam Q-Y5F6_1M.pe.sam + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.bam + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools index -@ 20 -b Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.sorted.bam.bai + - pytest -m alignData + +dedupData: + stage: unit + script: + - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5F6_1M.se.sorted.bam O=Q-Y5F6_1M.se.deduped.bam M=Q-Y5F6_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true + - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam ./test_data/bam/small/Q-Y5F6_1M.se.deduped.bam + - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools index -@ 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.sorted.deduped.bam.bai + - > + for i in {"chr8","chr4","chrY"}; do + echo "samtools view -b Q-Y5F6_1M.se.sorted.deduped.bam ${i} > Q-Y5F6_1M.se.sorted.deduped.${i}.bam; samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.deduped.${i}.bam Q-Y5F6_1M.se.sorted.deduped.${i}.bam.bai;"; + done | singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' parallel -j 20 -k + - pytest -m dedupData + +countData: + stage: unit + script: + - singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' -o Q-Y5F6_1M.se.featureCounts -s 1 -R SAM --primary --ignoreDup 
./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam + - singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count ./test_data/counts/small/Q-Y5F6_1M.se.featureCounts + - assignedReads=$(grep -m 1 'Assigned' *.summary | grep -oe '\([0-9.]*\)') + - pytest -m makeFeatureCounts + +makeBigWig: + stage: unit + script: + - singularity run 'docker://bicf/deeptools3.3:2.0.1_indev' bamCoverage -p 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -o Q-Y5F6_1M.se.bw + - pytest -m makeBigWig + +fastqc: + stage: unit + script: + - singularity run 'docker://bicf/fastqc:2.0.1_indev' fastqc ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o . + - pytest -m fastqc + +dataQC: + stage: unit + script: + - echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > Q-Y5F6_1M.se.sorted.deduped.tin.xls + - for i in {"chr8","chr4","chrY"}; do + echo "tin.py -i ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.${i}.bam -r /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed; cat Q-Y5F6_1M.se.sorted.deduped.${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \"\\t${i}\\t\";"; done | singularity run 'docker://bicf/rseqc3.0:2.0.1_indev' parallel -j 20 -k >> Q-Y5F6_1M.se.sorted.deduped.tin.xls + - pytest -m dataQC + + +integration_se: + stage: integration + script: + - hostname + - ulimit -a + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 16-1ZX4 -with-dag dag.png --ci true + - find . -type f -name "multiqc_data.json" -exec cp {} ./SE_multiqc_data.json \; + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - output/qc/ + - SE_multiqc_data.json + expire_in: 7 days + +integration_pe: + stage: integration + script: + - hostname + - ulimit -a + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5JA -with-dag dag.png --ci true + - find . 
-type f -name "multiqc_data.json" -exec cp {} ./PE_multiqc_data.json \;
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - dag.png
+      - output/qc/
+      - PE_multiqc_data.json
+    expire_in: 7 days
+
+consistency:
+  stage: consistency
+  script:
+  - grep -m 1 \"Assigned\":.[0-9] SE_multiqc_data.json | grep -oe '\([0-9.]*\)' > assignedSE.txt
+  - grep -m 1 \"Assigned\":.[0-9] PE_multiqc_data.json | grep -oe '\([0-9.]*\)' > assignedPE.txt
+  - echo 7742416 > assignedExpectSE.txt
+  - echo 2599140 > assignedExpectPE.txt
+  - pytest -m consistencySE
+  - pytest -m consistencyPE
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - SE_multiqc_data.json
+      - PE_multiqc_data.json
+      - assignedSE.txt
+      - assignedPE.txt
+      - assignedExpectSE.txt
+      - assignedExpectPE.txt
+    expire_in: 7 days
+
diff --git a/.gitlab/issue_templates/Bug.md b/.gitlab/issue_templates/Bug.md
new file mode 100644
index 0000000000000000000000000000000000000000..3af88f7da9144a3bffffeea91ce7d452cd5d8023
--- /dev/null
+++ b/.gitlab/issue_templates/Bug.md
@@ -0,0 +1,21 @@
+# Summary
+
+
+# Steps to reproduce
+
+
+# Observed bug behavior
+
+
+# Expected behavior
+
+
+# Relevant logs and/or screenshots
+
+
+# Potential fixes
+
+
+
+/label ~bug ~"To Do"
+/cc @ghenry @venkat.malladi @s181706
\ No newline at end of file
diff --git a/.gitlab/merge_request_templates/Merge_Request.md b/.gitlab/merge_request_templates/Merge_Request.md
new file mode 100644
index 0000000000000000000000000000000000000000..88c50aa2a683175a6e6635283724b11ad308e6e5
--- /dev/null
+++ b/.gitlab/merge_request_templates/Merge_Request.md
@@ -0,0 +1,20 @@
+Please fill in the appropriate checklist below (delete those which are not relevant).
+These are the most common things requested on pull requests.
+
+## PR checklist
+ - [ ] This comment contains a description of changes (with reason)
+ - [ ] If you've fixed a bug or added code that should be tested, add tests!
+ - [ ] Documentation in `docs` is updated
+ - [ ] Replace dag.png with the most recent CI pipeline integration_pe artifact
+ - [ ] `CHANGELOG.md` is updated
+ - [ ] `README.md` is updated
+ - [ ] `LICENSE.md` is updated with new contributors
+ - [ ] Docker images moved to production release and changed in pipeline
+ - [ ] Docker images used in the CI unit tests match those used in pipeline
+
+
+* [ ] **Close issue**\
+Closes #
+
+/cc @ghenry @venkat.malladi
+/assign @ghenry
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8ff0911406a181f08e86e495c7530f87d6f43dae..904efcc5a2f1a025e82fa1a65d11d33f573809d6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,14 @@
-# v0.0.1 (in development)
+# v0.0.2 (in development)
 **User Facing**
 
 **Background**
 
 *Known Bugs*
+
+<hr>
+
+# v0.0.1
+**INITIAL BETA VERSION**\
+Does not include automatic data upload\
+This version is for initial upload of test data to GUDMAP/RBK data-hub for internal integration
+<hr>
diff --git a/LICENSE b/LICENSE
index e4cf85dd516f931238ad4ff60f25ea8fa31a5256..af5e9c54a9121cc97781032e2ddbc58d59947ec0 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1 +1,25 @@
-NOT YET LICENSED
+MIT License
+
+Copyright (c) 2019 University of Texas Southwestern Medical Center.
+
+Contributors: Gervaise H. Henry, Jon Gesell, Jeremy Mathews, and Venkat Malladi
+
+Department: Bioinformatic Core Facility, Department of Bioinformatics
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
index abbd229ea18b8093a3e35ccaac9f1426ad09930e..23b24d840c661ec6f8c1694629b9024bf9b07caa 100644
--- a/README.md
+++ b/README.md
@@ -1,33 +1,118 @@
 |*master*|*develop*|
 |:-:|:-:|
-|[](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/commits/master)|[](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/commits/develop)|
-
+|[](https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/commits/master)|[](https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/commits/develop)|
+<!--
 [![DOI]()]()
-
-GUDMAP/RBK RNA-Seq Pipeline
-===========================
+-->
+RNA-Seq Analytic Pipeline for GUDMAP/RBK
+========================================
 
 Introduction
 ------------
+This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub. It is designed to run on the HPC cluster ([BioHPC](https://portal.biohpc.swmed.edu)) at UT Southwestern Medical Center, in conjunction with the standard nextflow profile (config `biohpc.config`).
+
+
+Cloud Compatibility:
+--------------------
+This pipeline is also capable of being run on AWS. To do so:
+* Build an AWS Batch queue and environment either manually or with [aws-cloudformation](https://console.aws.amazon.com/cloudformation/home?#/stacks/new?stackName=Nextflow&templateURL=https://s3.amazonaws.com/aws-genomics-workflows/templates/nextflow/nextflow-aio.template.yaml)
+* Edit one of the aws configs in workflow/conf/
+  * Replace workDir with the S3 bucket generated
+  * Change region if different
+  * Change queue to the AWS Batch queue generated
+* The user must have awscli configured with an appropriate authentication (with `aws configure` and access keys) in the environment in which nextflow will be run
+* Add `-profile` with the name of the AWS config which was customized (an example invocation is shown just before the DAG section below)
 
 To Run:
 -------
-  * Available parameters:
-* FULL EXAMPLE:
-  ```
-  nextflow run workflow/main.nf
-  ```
-* Design example:
+ * `--deriva` active **credential.json** file from [deriva-auth](https://github.com/informatics-isi-edu/gudmap-rbk/wiki/Uploading-files-via-Deriva-client-tools#from-a-remote-server)
+ * `--bdbag` active **cookies.txt** file from [deriva-auth](https://github.com/informatics-isi-edu/gudmap-rbk/wiki/Uploading-files-via-Deriva-client-tools#from-a-remote-server)
+ * `--repRID` mRNA-seq replicate RID
+ * `--source` consortium server source
+   * **dev** = [dev.gudmap.org](dev.gudmap.org) (default, does not contain all data)
+   * **staging** = [staging.gudmap.org](staging.gudmap.org) (does not contain all data)
+   * **production** = [www.gudmap.org](www.gudmap.org) (***does contain all data***)
+ * `--refMoVersion` mouse reference version ***(optional)***
+ * `--refHuVersion` human reference version ***(optional)***
+ * `--refERCCVersion` ERCC reference version ***(optional)***
+ * `-profile` config profile to use ***(optional)***:
+   * default = process on BioHPC cluster
+   * **biohpc** = process on BioHPC cluster
+   * **biohpc_max** = process on high power BioHPC cluster nodes (>= 128GB nodes), for resource testing
+   * **aws_ondemand** = AWS Batch on-demand instance requests
+   * **aws_spot** = AWS Batch spot instance requests
+* NOTES:
+  * once deriva-auth is run and authenticated, the two files above are saved in ```~/.deriva/``` (see official documents from [deriva](https://github.com/informatics-isi-edu/deriva-client#installer-packages-for-windows-and-macosx) on the lifetime of the credentials)
+  * reference version consists of Genome Reference Consortium version, patch release and GENCODE annotation release # (leaving the params blank will use the default version tied to the pipeline version)
+    * *current mouse* **38.p6.vM22** = GRCm38.p6 with GENCODE annotation release M22
+    * *current human* **38.p12.v31** = GRCh38.p12 with GENCODE annotation release 31
+* Tracking parameters ([Tracking Site](http://bicf.pipeline.tracker.s3-website-us-east-1.amazonaws.com/)):
+  * `--ci` boolean (default = false)
+  * `--dev` boolean (default = false)
+
+FULL EXAMPLE:
+-------------
+```
+nextflow run workflow/rna-seq.nf --deriva ./data/credential.json --bdbag ./data/cookies.txt --repRID Q-Y5JA
+```
+
+To run a set of replicates from study RID:
+------------------------------------------
+Run in repo root dir:
+* `sh workflow/scripts/splitStudy.sh [studyRID]`
+It will run in parallel in batches of 5 replicate RIDs (a sketch of this batching pattern follows below)
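The batching described above can be approximated with plain shell. A minimal, hypothetical sketch (not the actual contents of `splitStudy.sh`): it assumes `[studyRID]_studyRID.csv` lists one replicate RID per line in its first column (these `*_studyRID.csv` files are the ones removed by `cleanup.sh` and ignored in `.gitignore`); the flags are the pipeline's real parameters.

```
#!/bin/bash
# Hypothetical batching sketch: launch the pipeline for every replicate in a
# study, keeping at most 5 nextflow runs in flight at a time.
studyRID="$1"
cut -d ',' -f 1 "${studyRID}_studyRID.csv" | xargs -P 5 -I {} \
  nextflow -q run workflow/rna-seq.nf --deriva ./data/credential.json --bdbag ./data/cookies.txt --repRID {}
```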
+
+
+<hr>
 [**CHANGELOG**](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/blob/develop/CHANGELOG.md)
+<hr>
 
 Credits
--------
-This worklow is was developed by [Bioinformatic Core Facility (BICF), Department of Bioinformatics](http://www.utsouthwestern.edu/labs/bioinformatics/)
+=======
+This workflow was developed by [Bioinformatic Core Facility (BICF), Department of Bioinformatics](http://www.utsouthwestern.edu/labs/bioinformatics/)
+
+PI
+--
+Venkat S. Malladi\
+*Faculty Associate & Director*\
+Bioinformatics Core Facility\
+UT Southwestern Medical Center\
+<a href="https://orcid.org/0000-0002-0144-0564" target="orcid.widget" rel="noopener noreferrer" style="vertical-align:top;"><img src="https://orcid.org/sites/default/files/images/orcid_16x16.png" style="width:1em;margin-right:.5em;" alt="ORCID iD icon">orcid.org/0000-0002-0144-0564</a>\
+[venkat.malladi@utsouthwestern.edu](mailto:venkat.malladi@utsouthwestern.edu)
+
+
+Developers
+----------
+Gervaise H. Henry\
+*Computational Biologist*\
+Department of Urology\
+UT Southwestern Medical Center\
+<a href="https://orcid.org/0000-0001-7772-9578" target="orcid.widget" rel="noopener noreferrer" style="vertical-align:top;"><img src="https://orcid.org/sites/default/files/images/orcid_16x16.png" style="width:1em;margin-right:.5em;" alt="ORCID iD icon">orcid.org/0000-0001-7772-9578</a>\
+[gervaise.henry@utsouthwestern.edu](mailto:gervaise.henry@utsouthwestern.edu)
+
+Jonathan Gesell\
+*Computational Biologist*\
+Bioinformatics Core Facility\
+UT Southwestern Medical Center\
+<a href="https://orcid.org/0000-0001-5902-3299" target="orcid.widget" rel="noopener noreferrer" style="vertical-align:top;"><img src="https://orcid.org/sites/default/files/images/orcid_16x16.png" style="width:1em;margin-right:.5em;" alt="ORCID iD icon">orcid.org/0000-0001-5902-3299</a>\
+[jonathan.gesell@utsouthwestern.edu](mailto:jonathan.gesell@utsouthwestern.edu)
+
+Jeremy A. Mathews\
+*Computational Intern*\
+Bioinformatics Core Facility\
+UT Southwestern Medical Center\
+<a href="https://orcid.org/0000-0002-2931-1430" target="orcid.widget" rel="noopener noreferrer" style="vertical-align:top;"><img src="https://orcid.org/sites/default/files/images/orcid_16x16.png" style="width:1em;margin-right:.5em;" alt="ORCID iD icon">orcid.org/0000-0002-2931-1430</a>\
+[jeremy.mathews@utsouthwestern.edu](mailto:jeremy.mathews@utsouthwestern.edu)
 
 Please cite in publications: Pipeline was developed by BICF from funding provided by **Cancer Prevention and Research Institute of Texas (RP150596)**.
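For the AWS route described under Cloud Compatibility, a launch might look like the following. This is a hedged sketch, not a documented invocation: it assumes `workflow/conf/aws.config` has already been edited as described (S3 workDir, region, Batch queue) and that valid access keys are available; the profile names and parameters are the ones defined in `workflow/nextflow.config`.

```
# One-time: register AWS access keys in the environment that will launch nextflow.
aws configure

# Launch on AWS Batch using the customized on-demand profile.
nextflow run workflow/rna-seq.nf -profile aws_ondemand \
  --deriva ./data/credential.json --bdbag ./data/cookies.txt --repRID Q-Y5JA
```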
+ +<hr> +<hr> + +Pipeline Directed Acyclic Graph +------------------------------- + + diff --git a/cleanup.sh b/cleanup.sh index 9569ff54fd71cd94bddde415af03a101820ab514..aa289201c531fa4f4667a04f80fd015d2200e40c 100644 --- a/cleanup.sh +++ b/cleanup.sh @@ -5,3 +5,5 @@ rm timeline*.html* rm .nextflow*.log* rm -r .nextflow/ rm -r work/ +rm *_studyRID.json +rm *_studyRID.csv diff --git a/docs/GUDMAP.RBK Pipeline.docx b/docs/GUDMAP.RBK Pipeline.docx deleted file mode 100755 index deae8a8fbfb7adc32ba2fba03a25eca6af57b4d7..0000000000000000000000000000000000000000 Binary files a/docs/GUDMAP.RBK Pipeline.docx and /dev/null differ diff --git a/docs/RNA-Seq Pipeline Design Flowchart.drawio b/docs/RNA-Seq Pipeline Design Flowchart.drawio new file mode 100644 index 0000000000000000000000000000000000000000..7b65c655592950fdb14c7a8b7c74a4f986816669 --- /dev/null +++ b/docs/RNA-Seq Pipeline Design Flowchart.drawio @@ -0,0 +1 @@ +<mxfile host="Electron" modified="2020-03-23T23:17:50.947Z" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) draw.io/12.6.5 Chrome/80.0.3987.86 Electron/8.0.0 Safari/537.36" etag="EoueA3AcDhzhFmxKZ9Lg" version="12.6.5" type="device"><diagram name="Page-1" id="74e2e168-ea6b-b213-b513-2b3c1d86103e">7V1Zd5u6Fv41Xuveh2QxD49Nk7S9pzlNTpq0PS9dMsg2CQYXcBP3118xCDMIAWZ2nJcYIWGs/e1J2ntrxr9fv35wwGZ1Y+vQnHGM/jrjL2ccJzASj/75LbuwheNlOWxZOoYetrH7hnvjD4wamah1a+jQTXX0bNv0jE26UbMtC2peqg04jv2S7rawzfS3bsAS5hruNWDmW78ZurcKW3mGYfY3PkJjuYq+WsE35kB7Xjr21oq+b8bxi+AvvL0G+FlRf3cFdPsl0cRfzfj3jm174af163to+pOLpy0cd11wN35vB1pelQE3Px8/6OrD5hMrf9S/boXLT/faGX6M6+3whEAdzU90aTveyl7aFjCv9q0XwY+G/mMZdLXv89m2N6iRRY1P0PN2EbHB1rNR08pbm9Fd+Gp43xOff/iPOhejq8vX6MnBxS5xcQsdYw096OA2y3N235MX4ZM4EV/vnxVc4YeFP9r/pYWTiSfG3jpa1Ovix7+vd/zPn9zPP3f2Vn76+SIaZ6wU4d0DzhJ6lLkWYqIjboI2+iHODo1zoAk843f6TUCE6mXcLxr6znHALtFhYxuW5yaefOs3oA6vmM0Y9VyImDTiUUHOIKXCGJ4pGaOq7LnAqfGf0upwQUwORx/CecBXiQndNwX4r8ELotyYF5626w3uj/jd2Zz4gxW5PH8sdVa5+3PPvucerj5yc9eRTDgMf3CyjIDni/vorx6rlAxH2oIOe0lSKbBvOLwXruHUkwapzCHPl1/+dVcf/r57v/g4N73ftzdX5pkg5BmEyErKEAyiCLX1B2GIVCL/OVak8VHD4b0wAiZjS+rDsi34pnjj01f54euvp9fvPyF8+iOrMnv11xmLXYIEb9z9fOJvP/z1+Pjp7yX439L2ft1aZyIzG4A5WJ5TzxVpjzu2nqFFHy5KJajnGX944zH9sMfJ0+jC0xgTMzB5aFVyNbJwLBP2JAgfMqYX2GPv/wT7NmFPVBb8YKjvwcEuM3HGZSHJMpQ0VV6wqqQDReHPomn/DcxtRIgZJ5mImhcLO3jPPYNIv7Y2vnHmBhB/hzqwzOZ1fxN9Wvr/b4BhoZvvEHfsPENz8VPRC4YPDrvhZncDrOIve4nW+fyvE/wJDaHMaLZpO2Gzs5yD/wR3ODQJDPHTf+OBwVMXYG2Yu3D42rZs9A4I8Kku+19J/JEzTswuwYpo3v3WYP0wvsJ0EANKoJZL/7P/YqI/4SKib1lfNu6LufWgx3D7x4Q0j+8gwbbdBDfmYL3vFZIk7uV6jv0crcj6jX6nSMT5l2xw6a7AJhyzfl36k3O+MO0XbQUc71y3te0avX448GVlePA+nHjU/QV1Dp/gT7zfEsg0v2VhmOb7gN74XfiFokFNC/sHr5W5P1dEwVe5YkAp/Kuh48HXSvMa34qndC+9RJ+D9z2TRGB2iRtM8sZLPG3RzcS9VYjx+KaUuIdkXty+TLxClubBZUz4ZGMajlG/HG5jFg3ZEYM8oyfdF2NtgsALSqg2bWWY+mews7e+KnA9oD3jK0QdRPlIKQb8lVak8Qq8f2GCOTQv4jX8kKSx25WEX/i9Pp9eR5x8+QgdHVggao6+UUGXwDSWFvqsIQL7WjQBJ/838boIFV2YpYEU3FG4OS9J8TwQNGWEqH1TXtXFwj5n7LCRXxBhA7dFjavEBgf2IIgqMqEg6sl/Lif/kS6/AMuKVM9Rto15LVRS5RONF+iIk0qYU6GFOSVaRxzTfP0tuewAHK2ZyYlNydBQlEtszgL7ssy8DK6yxmqO9KGhU430XN4aJS7WiRVdMJbliTZw7+vbTEoMIDOQbiZmuqtMBqHNzELalJaLhj1qfcAlVHqk0FN4BHPXNrcefOdoEWaD1v2VkBPzXFdyBQu/ygKcT5OBFQhyhiS748YmcoZIJAznYfzYve/6I3GnTKbsxciPlBQhy5S8FGxRbJDWMWnM0J7UaEb1rtw2B25MQwOe7wf98+my0GvLQC5lEWRZN/QGkC8ArC0wP1mbwCYkiYm0IHEjs0JKywe2H96uytptWGV0TdUlkcPHzR18aw6WhleV6nlSJ3GAqU7xAMkoiOgerWb1QGqOZIN3Revc2+epjCTRJfDARE3weA88M8ltmOTRt/0DNQ9YS39dosbX8YRvk/nMtwETGa4W4o0LXxa5OcrWs6sKIi+kQXX2YI7AQZoc2zdlmlyqqMh5tm1NfgT2fwFOmwdenXB6ME65I8ApL/fgp0qVFehkHVVsI0zYUcV4bt2aXQDX+9Wxybremp5xhg3XAux0Qsfs1niejo
rSjaFK1Z6tUxG9HtADnn0ThCS5HF0RMvfyQo6G6Le68CYmwZT9Dj6rsQbfCmDFQUNxe7GjDjKZcGBmmclUNbyKk4/BZBLaNZmI0U/88XugRCh0B0jyNA+TOtHMtVR6MNnlmhposoY7VrXTMNypUpVg8m3Nww0+/AzTSJqAV8GaW2wC7m+Su997DrD0OgM2xjOs1R9qBoxfab9QTn0GavSnpsB0RUjwcriK4luSIIyaEsgmAd+HlqEB810U4bI2dD0Q1za64xu+qG2F2qA1i4Jrbm3X8Aw7FQ6Dn/I50yF+Gg6gMeEiitfTDGuJGs4CdkqE27BcP6yBIytLWUNpgTMo4r2QM4pw1ZLD5O+mZdiFFFk5LYBmcVWAkSrQan33pw0HIfeq+SALzzHW09/+EZTRuWESd/RGb3vL2WJFU1jJm8LUZMpJ+2Ztb7vczXcPmug+Wupy/rz6fPttd2Wcyc3zFccO0wOshQZ4JM/yMYQB8lIPvppSXUNN1k3DWngabhpVwLa+Mu9Tew31vAs0pY2X+gQebr2eKq8G8TV+acFY4B2Xs9GDYCBt2PXnXBB3xv+Bizx3Tsm1kITxuRbHv8PTnmuB9x5KE4wJsdn0TOShjTmunjGX6d5LTFd+/75AJEzWlsNCr8aO8YC2XAGgp5PVQa1O0EOmB0GakO1jpqo0aT1AtJldz3Rk1ztwAdG8BinvS2ih1y227tox5v2tvTlw4eFmfG3eFtO8TWBtkqXQhhVP11StU3PlJ9RfLyzklF8vPSTOr+d7n23kzll9iZ0pxzdkMFWeoMFW2fRX8OOCuuMxs+XTCn51xYiD98sUo5rXi8QqPtI4bOzuFvBzg7MbhWoG2OGURYMy2G7DVlfaLXw7jRIC3SKbGOGijsR9HP9egFpD2U3WgcQafRqbAVREn1aLe10tboqcXleLaUq+dfckKCl2PQfGeFySukFFMp2Sw+0XXRiG96rc3jnr/ylPf4wvW12QCHX9gvpu03dL1H7zyit8HTGvXBwgr1wWc6Q9fnuR6DDlgUdjktROBKkfnzckqVWxJ21ITsxHUt4g5ruHN7kiDcMdAb7bdpRoU1pJ/U7FUaLaGRN2lmJYn7ylLr2lTuAzvMfUWSm26blMxRQeo9tElgV5UbAGz/DCWH4zploTF99lu8iML/acFDYj6XmlIq9ynfpO4O9b8GDdws3u+9f5w43woG7WcebvG9tYyuOMyhVJS5LYkVDciNhPOQI7svVkEMqEVhNHUzEk6ZJ3wpZkPlWiJVMgoveIynJmcBAdxUDAQXQIQ3846LWaJ/EnEIokaYg1j6GoJzdkWif5nMvmi54TVt3U+iZUcJbqcpZw/CW5o3hS3tQZrCQHJq28+WNYBsqn4rejvQOaz7gLb4McekZzf78NRd4YE70qcuLJ13k9vt2YNtCPQJELQyZRFR8zXnWypyItqbCahrAkWl2D5ruN+fTMItYsN7dYwjZccb3jvs0tVkxjkGdTBlRpf1bp4VDXeJ3qBMsDYEkt+FuKyoLwqG5RybP1UJnt3w8qB03mO05UchVRqYwClWo9VAoCvT8+xvjQ/v2gvnlgzgn15BIvpahXh0C9lN2aL5HF2f5lKM7uJ9bt3wvqsRt1Qv0hqK9+zvx4YC+r9WCf7d8PLE+GcQNY0jIlUuUECiMB+walWtNby/bvB5Qnu7htUJJK5hTHFfS+hsDWdNdyA/rB5aDBQG8bl+wgJyY0B6bAlAzg1IYDeoE+P+iJf7WgPzDMqQXhx7kskV0G4EsgOMiygdBuzYMosv0Ny2PqXncSqMWbcr2LY6HmUkJuQD9QbTf1zF0BZ3PCalFQ5UihymWFahlUcwOkMmxnfbm6A9pmBgNou49fXu7ZL+wXV/vr31fWuCWESsQFo1MsUhA5I1LPCKoaf0N+yq2xgaZh+akJl9D1c5c45hpHy1BP9ilIw5qXJVwlYw+iZKncSTu5LCoHvdsfMA8e5fNUBD30XPFiJl76z0KM7+4DGPI8VjuwgMtEgRAyNOJqu8nAguyp6a2V+FKKq5i7G2BVAQE5COvufYKA4aOKoq6IcTzayjD1z2Bnb/0pcD2gPeOrfIxPWsgjwa4HyXH+RXAC0wUavQw6ZTLw0kEobPTjrsHaMH0KPUJHBxaYpU5QUEgoayN2pX5QkcKq54rExH9sWg4R4rlUYoxRGwFdRJUtEcpi2C+WC9YbEw4R1VWWUVfMI3kqjCYk+3TQTrpMH9UuouK01OOUCEFD5I5TPIpypJUoqJSoLlrGHMNYIpcooC3UDuMLYpS6Mjb2hNdRn+w5KXTrY+gk7oNIO1wWNzn2JEfXwEC6Pwo1jx/TRR2sZplYJ63fTOtXVfqEVWYy155UfqcqP68+ysTMsan8yWn8QoX/1mq1HELO4bQ8eVM5R0zDWkDnpugQ+enpeL7fii1qlYotMuH7ui3YQjY2Bg0v7sXaqGBG0PiiNCij6tIBN0hqR8MifwzFjjgAg7QJJAmgY9P+xUCbkPbn6FXRD1X++cqAAQyuAslHqd2X6X/vOcDSa43YGM+w3gCoGZD+VrOxVhQMdk5ubdfwDJu4o/Y50yF+Gt4dMeEiWtvRDGuJGs4CJiKfRt05Q1ROb1da4Aei9ZG3hoOlKq1vy6mimVRnK0rook7NcVhOx284HVovh8YkpWkqhDAVoow4tn2YA2BJm79yYTRmK6oqmiZS+qEYwB2soWByT2QBpZi8Y1xAIUbWCVyOlJ5hHYH27+SEcIr2Vypof7F37U+cTK6F7JbpHZhAtAhywAyDlojApPJPaQgrpn15+gu2I0+VccuEF57TEuE1FWuBLqCnYS7QIZ0lFONqtgPdATR7B/OORwyg2slJ80Lenj4Kx76TU4mPRbWfzv86QLXT+ae8PIVIyG0t6MmdlHtVKEsnKI8ayvwRQJmXe4ByPKl0TTwVO7XE2piGoUoXzxlCMU+uTVgUmUpt5o4I1mtxZroMSh6O4jsBfqrXoD5FR1NOOuus17DqFpTykSY3U52b8iSevNYtjjfpW8Xy+HTEignO2f69pOKLzUtJvaFU/Mpope3/VLARBzEJJb4eXrP9ceRAYf9Mdn3d/v3wQ7uL/Cd+mFVY4BwrR8hqPY7I9u8Hse2uXfWN2EPRqQN3FXt9nRsaHKGS6piQKsgZpHJ0pGb7H4Xsltpd+jrJbtpcV02lG6a4kFhPcGf791MycDp13lsR0iTeqoDTqrmdJBE9nmLXfqTrucDvK7zwteRpyXCurLIVw6nnAqfGf0qbw3mhj5pxzQ9OegtmThNuGVFd2YHZRWQasQt9eC/sgu2vN8Yuh3EIEfjqyE1+lpVoNcPEkkM+SoZLJcM5IRjeeEw/zIBpfmKGA5mBxWvTo+UGnlpBr5Qb6MOPixu4EzdU54b57kET3UdLXc6fV59vv+2ujDOZkAA+JmZQBN/6qcUAhCGVAFxsZDUc3g8rTHvFaECngqglxr5KKslqji9KFkoJQ0qBLfE056Ph8IZ8gS4d288e23f3QzdubB36Pf4P</diagram></mxfile> \ No newline 
at end of file
diff --git a/docs/RNA-Seq Pipeline Design Flowchart.jpg b/docs/RNA-Seq Pipeline Design Flowchart.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..a56462a2991aecb22d5bbff493c18e3f673f2b0f
Binary files /dev/null and b/docs/RNA-Seq Pipeline Design Flowchart.jpg differ
diff --git a/docs/RNA-Seq Pipeline Design Process Table.docx b/docs/RNA-Seq Pipeline Design Process Table.docx
new file mode 100644
index 0000000000000000000000000000000000000000..21604d8f30662ffd93f8d0605b671a0921864b0c
Binary files /dev/null and b/docs/RNA-Seq Pipeline Design Process Table.docx differ
diff --git a/docs/RNA-Seq Pipeline Design Process Table.pdf b/docs/RNA-Seq Pipeline Design Process Table.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..97f1d5ddfb0ae0848aa0bf8b37681db9efeb3c6b
Binary files /dev/null and b/docs/RNA-Seq Pipeline Design Process Table.pdf differ
diff --git a/docs/dag.png b/docs/dag.png
new file mode 100644
index 0000000000000000000000000000000000000000..8c4896f2f0f6d2c765d6b020cddb4cda23064c97
Binary files /dev/null and b/docs/dag.png differ
diff --git a/docs/gudmap-rbk_logo.png b/docs/gudmap-rbk_logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..840030e2dac96bff2d41805d6697abab29c3e5bb
Binary files /dev/null and b/docs/gudmap-rbk_logo.png differ
diff --git a/nextflow.config b/nextflow.config
deleted file mode 100644
index 28777047bfa85b13d08a0df02a22e6eac6d66540..0000000000000000000000000000000000000000
--- a/nextflow.config
+++ /dev/null
@@ -1,5 +0,0 @@
-profiles {
-  standard {
-    includeConfig 'workflow/conf/biohpc.config'
-  }
-}
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000000000000000000000000000000000000..c8d98f29942ac24e83b741a0e102714b44230eab
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,2 @@
+[pytest]
+python_paths = workflow/scripts
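The two-line pytest.ini above is what lets the CI's `pytest -m <marker>` calls find the pipeline's helper scripts: pytest-pythonpath puts `workflow/scripts` on the test import path, and each unit job selects its stage's tests by marker. A hedged local reproduction of one such job, using only commands that already appear in `.gitlab-ci.yml` (the pinned versions come from its `before_script`):

```
# Install the CI's test dependencies, then run just one stage's marked tests.
pip install --user pytest-pythonpath==0.7.1 pytest-cov==2.5.1
pytest -m parseMetadata
```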
diff --git a/test_data/createTestData.sh b/test_data/createTestData.sh
new file mode 100644
index 0000000000000000000000000000000000000000..47c73d46fba5510f64fe5f9bb5c540eb4ad618cc
--- /dev/null
+++ b/test_data/createTestData.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+
+#This script regenerates test data from replicate RID Q-Y5F6
+
+module load singularity/3.5.3
+module load pigz/2.4
+
+mkdir -p NEW_test_data
+
+ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json
+
+mkdir -p ./NEW_test_data/bagit
+singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' deriva-download-cli dev.gudmap.org --catalog 2 ./workflow/conf/replicate_export_config.json . rid=Q-Y5F6
+cp Replicate_Q-Y5F6.zip ./NEW_test_data/bagit/Replicate_Q-Y5F6.zip
+
+mkdir -p ./NEW_test_data/fastq
+unzip ./test_data/bagit/Replicate_Q-Y5F6.zip
+singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' bash ./workflow/scripts/bdbagFetch.sh Replicate_Q-Y5F6 Replicate_Q-Y5F6
+cp Replicate_Q-Y5F6.R1.fastq.gz ./NEW_test_data/fastq/Replicate_Q-Y5F6.R1.fastq.gz
+cp Replicate_Q-Y5F6.R2.fastq.gz ./NEW_test_data/fastq/Replicate_Q-Y5F6.R2.fastq.gz
+
+mkdir -p ./NEW_test_data/fastq/small
+singularity exec 'docker://bicf/seqtk:2.0.0' seqtk sample -s100 ./NEW_test_data/fastq/Replicate_Q-Y5F6.R1.fastq.gz 1000000 1> Q-Y5F6_1M.R1.fastq
+singularity exec 'docker://bicf/seqtk:2.0.0' seqtk sample -s100 ./NEW_test_data/fastq/Replicate_Q-Y5F6.R2.fastq.gz 1000000 1> Q-Y5F6_1M.R2.fastq
+pigz Q-Y5F6_1M.R1.fastq
+pigz Q-Y5F6_1M.R2.fastq
+cp Q-Y5F6_1M.R1.fastq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
+cp Q-Y5F6_1M.R2.fastq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
+
+mkdir -p ./NEW_test_data/meta
+singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --basename Q-Y5F6_1M.se -j 20 ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
+singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename Q-Y5F6_1M.pe -j 20 ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
+cp Q-Y5F6_1M.se_trimmed.fq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz
+cp Q-Y5F6_1M.pe_R1_val_1.fq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_R1_val_1.fq.gz
+cp Q-Y5F6_1M.pe_R2_val_2.fq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_R2_val_2.fq.gz
+cp Q-Y5F6_1M.R1.fastq.gz_trimming_report.txt ./NEW_test_data/meta/Q-Y5F6_1M.R1.fastq.gz_trimming_report.txt
+cp Q-Y5F6_1M.R2.fastq.gz_trimming_report.txt ./NEW_test_data/meta/Q-Y5F6_1M.R2.fastq.gz_trimming_report.txt
+
+touch metaTest.csv
+echo 'Replicate_RID,Experiment_RID,Study_RID,Paired_End,File_Type,Has_Strand_Specific_Information,Used_Spike_Ins,Species' > metaTest.csv
+echo 'Replicate_RID,Experiment_RID,Study_RID,uk,FastQ,no,no,Homo sapiens' >> metaTest.csv
+cp metaTest.csv ./NEW_test_data/meta/metaTest.csv
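The two `echo` lines above fabricate the minimal metadata sheet that the parseMetadata unit tests read. For reference, this is how the CI exercises `parseMeta.py` against it, taken verbatim in form from the parseMetadata job earlier in this diff:

```
# Pull a single attribute out of the test metadata sheet, exactly as the CI does.
singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py \
  -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p species
```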
+
+mkdir -p ./NEW_test_data/bam
+mkdir -p ./NEW_test_data/bam/small
+singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.se.unal.gz -S Q-Y5F6_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./NEW_test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz --summary-file Q-Y5F6_1M.se.alignSummary.txt --new-summary
+singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam
+singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.bam
+singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.sorted.bam.bai
+singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.pe.unal.gz -S Q-Y5F6_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_R1_val_1.fq.gz -2 ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_R2_val_2.fq.gz --summary-file Q-Y5F6_1M.pe.alignSummary.txt --new-summary
+singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.pe.bam Q-Y5F6_1M.pe.sam
+singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.bam
+singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ 20 -b Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.sorted.bam.bai
+cp Q-Y5F6_1M.se.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.bam
+cp Q-Y5F6_1M.pe.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.pe.bam
+cp Q-Y5F6_1M.se.sorted.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.bam
+cp Q-Y5F6_1M.se.sorted.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.bam.bai
+cp Q-Y5F6_1M.pe.sorted.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.pe.sorted.bam
+cp Q-Y5F6_1M.pe.sorted.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M.pe.sorted.bam.bai
+cp Q-Y5F6_1M.se.alignSummary.txt ./NEW_test_data/meta/Q-Y5F6_1M.se.alignSummary.txt
+cp Q-Y5F6_1M.pe.alignSummary.txt ./NEW_test_data/meta/Q-Y5F6_1M.pe.alignSummary.txt
+
+singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.bam O=Q-Y5F6_1M.se.deduped.bam M=Q-Y5F6_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true
+singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.deduped.bam
+singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.sorted.deduped.bam.bai
+cp Q-Y5F6_1M.se.deduped.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.deduped.bam
+cp Q-Y5F6_1M.se.sorted.deduped.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam
+cp Q-Y5F6_1M.se.sorted.deduped.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam.bai
+cp Q-Y5F6_1M.se.deduped.Metrics.txt ./NEW_test_data/meta/Q-Y5F6_1M.se.deduped.Metrics.txt
+
+for i in {"chr8","chr4","chrY"}; do
+  echo "samtools view -b ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam ${i} > Q-Y5F6_1M.se.sorted.deduped.${i}.bam; samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.deduped.${i}.bam Q-Y5F6_1M.se.sorted.deduped.${i}.bam.bai;";
+  done | singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' parallel -j 20 -k
+cp Q-Y5F6_1M.se.sorted.deduped.chr4.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.chr4.bam
+cp Q-Y5F6_1M.se.sorted.deduped.chr4.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.chr4.bam.bai
+cp Q-Y5F6_1M.se.sorted.deduped.chr8.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.chr8.bam
+cp Q-Y5F6_1M.se.sorted.deduped.chr8.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.chr8.bam.bai
+cp Q-Y5F6_1M.se.sorted.deduped.chrY.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.chrY.bam
+cp Q-Y5F6_1M.se.sorted.deduped.chrY.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.chrY.bam.bai
+
+mkdir -p ./NEW_test_data/counts
+mkdir -p ./NEW_test_data/counts/small
+singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' -o Q-Y5F6_1M.se.featureCounts -s 1 -R SAM --primary --ignoreDup ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam
+singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count Q-Y5F6_1M.se.featureCounts
+cp Q-Y5F6_1M.se.featureCounts
./NEW_test_data/counts/small/Q-Y5F6_1M.se.featureCounts
+cp Q-Y5F6_1M.se.countTable.csv ./NEW_test_data/counts/small/Q-Y5F6_1M.se.countTable.csv
+
+mkdir -p ./NEW_test_data/bw
+mkdir -p ./NEW_test_data/bw/small
+singularity run 'docker://bicf/deeptools3.3:2.0.0' bamCoverage -p 20 -b ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -o Q-Y5F6_1M.se.bw
+cp Q-Y5F6_1M.se.bw ./NEW_test_data/bw/small/Q-Y5F6_1M.se.bw
+
+mkdir -p ./NEW_test_data/fastqc
+mkdir -p ./NEW_test_data/fastqc/small
+singularity run 'docker://bicf/fastqc:2.0.0' fastqc ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o .
+cp Q-Y5F6_1M.R1_fastqc.html ./NEW_test_data/fastqc/small/Q-Y5F6_1M.R1_fastqc.html
+cp Q-Y5F6_1M.R1_fastqc.zip ./NEW_test_data/fastqc/small/Q-Y5F6_1M.R1_fastqc.zip
+
+echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > Q-Y5F6_1M.se.sorted.deduped.tin.xls
+for i in {"chr8","chr4","chrY"}; do
+echo "tin.py -i ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.${i}.bam -r /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed; cat Q-Y5F6_1M.se.sorted.deduped.${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \"\\t${i}\\t\";"; done | singularity run 'docker://bicf/rseqc3.0:2.0.0' parallel -j 20 -k >> Q-Y5F6_1M.se.sorted.deduped.tin.xls
+cp Q-Y5F6_1M.se.sorted.deduped.tin.xls ./NEW_test_data/meta/Q-Y5F6_1M.se.sorted.deduped.tin.xls
+
diff --git a/workflow/conf/aws.config b/workflow/conf/aws.config
new file mode 100644
index 0000000000000000000000000000000000000000..9ecbfb98f593f167a35650299921adaf2fffbb42
--- /dev/null
+++ b/workflow/conf/aws.config
@@ -0,0 +1,83 @@
+workDir = 's3://gudmap-rbk.output/work'
+aws.client.storageEncryption = 'AES256'
+aws {
+  region = 'us-east-2'
+  batch {
+    cliPath = '/home/ec2-user/miniconda/bin/aws'
+  }
+}
+
+process {
+  executor = 'awsbatch'
+  cpus = 1
+  memory = '1 GB'
+
+  withName: trackStart {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName: getBag {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName: getData {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName: parseMetadata {
+    cpus = 15
+    memory = '1 GB'
+  }
+  withName: trimData {
+    cpus = 20
+    memory = '2 GB'
+  }
+  withName: getRefInfer {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName: downsampleData {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName: alignSampleData {
+    cpus = 50
+    memory = '5 GB'
+  }
+  withName: inferMetadata {
+    cpus = 5
+    memory = '1 GB'
+  }
+  withName: getRef {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName: alignData {
+    cpus = 50
+    memory = '10 GB'
+  }
+  withName: dedupData {
+    cpus = 5
+    memory = '20 GB'
+  }
+  withName: countData {
+    cpus = 2
+    memory = '5 GB'
+  }
+  withName: makeBigWig {
+    cpus = 15
+    memory = '5 GB'
+  }
+  withName: fastqc {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName: dataQC {
+    cpus = 15
+    memory = '2 GB'
+  }
+  withName: aggrQC {
+    cpus = 2
+    memory = '1 GB'
+  }
+}
diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config
index 0ea74405dd6faeb0342698df1ef8736804dda5c6..338cd5d54d4258e499a60241054c5f28109e7c19 100755
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -1,31 +1,68 @@
 process {
   executor = 'slurm'
-  queue='super'
+  queue = 'super'
+  clusterOptions = '--hold'
 
-  // Process specific configuration
-  withLabel:getData {
-    executor = 'super'
+  withName: trackStart {
+    executor = 'local'
+  }
+  withName: getBag {
+    executor = 'local'
+  }
+  withName: getData {
+    executor = 'local'
+  }
+  withName: parseMetadata {
+    executor = 'local'
+  }
+  withName: trimData {
+    queue = 'super'
+  }
+  withName: getRefInfer {
+    executor = 'local'
+  }
+  withName: downsampleData {
+    executor = 'local'
+  }
+  withName: alignSampleData {
+    queue = 'super'
+  }
+  withName: inferMetadata {
+    queue = 'super'
+  }
+  withName: getRef {
+    executor = 'local'
+  }
+  withName: alignData {
+    queue = '256GB,256GBv1'
+  }
+  withName: dedupData {
+    queue = 'super'
+  }
+  withName: countData {
+    queue = 'super'
+  }
+  withName: makeBigWig {
+    queue = 'super'
+  }
+  withName: fastqc {
+    queue = 'super'
+  }
+  withName: dataQC {
+    queue = 'super'
+  }
+  withName: aggrQC {
+    executor = 'local'
   }
 }
-
-trace {
-  enabled = true
-  file = 'pipeline_trace.txt'
-  fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss'
-}
-
-timeline {
+singularity {
   enabled = true
-  file = 'timeline.html'
+  cacheDir = '/project/BICF/BICF_Core/shared/gudmap/singularity_cache/'
 }
 
-report {
-  enabled = true
-  file = 'report.html'
+env {
+  http_proxy = 'http://proxy.swmed.edu:3128'
+  https_proxy = 'http://proxy.swmed.edu:3128'
+  all_proxy = 'http://proxy.swmed.edu:3128'
 }
-
-tower {
-  accessToken = '3ade8f325d4855434b49aa387421a44c63e3360f'
-  enabled = true
-}
\ No newline at end of file
diff --git a/workflow/conf/biohpc_max.config b/workflow/conf/biohpc_max.config
new file mode 100755
index 0000000000000000000000000000000000000000..0e93ccf6a0be4c15c076ab6eb955a4bb39d96120
--- /dev/null
+++ b/workflow/conf/biohpc_max.config
@@ -0,0 +1,16 @@
+process {
+  executor = 'slurm'
+  queue = '256GB,256GBv1,384GB,128GB'
+  clusterOptions = '--hold'
+}
+
+singularity {
+  enabled = true
+  cacheDir = '/project/BICF/BICF_Core/shared/gudmap/singularity_cache/'
+}
+
+env {
+  http_proxy = 'http://proxy.swmed.edu:3128'
+  https_proxy = 'http://proxy.swmed.edu:3128'
+  all_proxy = 'http://proxy.swmed.edu:3128'
+}
diff --git a/workflow/conf/conda.env.bdbag.yml b/workflow/conf/conda.env.bdbag.yml
deleted file mode 100644
index 33361d301b3fac561fa39807e3c740583e57d28b..0000000000000000000000000000000000000000
--- a/workflow/conf/conda.env.bdbag.yml
+++ /dev/null
@@ -1,5 +0,0 @@
-name: bdbag
-dependencies:
-  - pandas=0.23.3=py36_0
-  - pip:
-    - bdbag==1.5.5
diff --git a/workflow/conf/multiqc_config.yaml b/workflow/conf/multiqc_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..db6a335a8e5e6f8677dd8c433d391c6fd5224751
--- /dev/null
+++ b/workflow/conf/multiqc_config.yaml
@@ -0,0 +1,114 @@
+custom_logo: './bicf_logo.png'
+custom_logo_url: 'https://utsouthwestern.edu/labs/bioinformatics/'
+custom_logo_title: 'Bioinformatics Core Facility'
+
+report_header_info:
+  - Contact Email: 'bicf@utsouthwestern.edu'
+  - Application Type: 'RNA-Seq Analytic Pipeline for GUDMAP/RBK'
+  - Department: 'Bioinformatic Core Facility, Department of Bioinformatics, University of Texas Southwestern Medical Center'
+
+title: RNA-Seq Analytic Pipeline for GUDMAP/RBK
+
+report_comment: >
+  This report has been generated by the <a href="https://doi.org/10.5281/zenodo.3625056">GUDMAP/RBK RNA-Seq Pipeline</a>
+
+top_modules:
+  - fastqc:
+      name: 'Raw'
+      info: 'Replicate Raw fastq QC Results'
+  - cutadapt:
+      name: 'Trim'
+      info: 'Replicate Trim Adapter QC Results'
+  - hisat2:
+      name: 'Align'
+      info: 'Replicate Alignment QC Results'
+      path_filters:
+        - '*alignSummary*'
+  - picard:
+      name: 'Dedup'
+      info: 'Replicate Alignment Deduplication QC Results'
+  - rseqc:
+      name: 'Inner Distance'
+      info: 'Replicate Paired End Inner Distance Distribution Results'
+      path_filters:
+        - '*insertSize*'
+  - custom_content
+  - featureCounts:
+      name: 'Count'
+      info: 'Replicate Feature Count QC Results'
+  - hisat2:
+      name: 'Inference: Align'
+      info: 'Inference Alignment (1M downsampled reads) QC Results'
+      path_filters:
+        - '*alignSampleSummary*'
+  - rseqc:
+      name: 'Inference: Stranded'
+      info: '1M Downsampled Reads Strandedness Inference Results'
+      path_filters:
+        - '*infer_experiment*'
+
+report_section_order:
+  rid:
+    order: 2000
+  meta:
+    order: 1000
+
+skip_generalstats: true
+
+custom_data:
+  rid:
+    file_format: 'tsv'
+    section_name: 'RID'
+    description: 'These are the identifying RIDs'
+    plot_type: 'table'
+    pconfig:
+      id: 'rid'
+    headers:
+      Replicate RID
+      Experiment RID
+      Study RID
+  meta:
+    file_format: 'tsv'
+    section_name: 'Metadata'
+    description: 'This is the comparison of inferred, submitter-provided, and calculated metadata'
+    plot_type: 'table'
+    pconfig:
+      id: 'meta'
+      format: '{:,.0f}'
+    headers:
+      Source
+      Species
+      Ends
+      Stranded
+      Spike-in
+      Raw Reads
+      Assigned Reads
+      Median Read Length
+      Median TIN
+  tin:
+    file_format: 'tsv'
+    section_name: 'TIN'
+    description: 'This is the distribution of TIN values calculated by the tool RSeQC'
+    plot_type: 'bargraph'
+    pconfig:
+      id: 'tin'
+    headers:
+      chrom
+      0 - 9
+      10 - 19
+      20 - 29
+      30 - 39
+      40 - 49
+      50 - 59
+      60 - 69
+      70 - 79
+      80 - 89
+      90 - 99
+
+sp:
+  rid:
+    fn: 'rid.tsv'
+  meta:
+    fn: 'metadata.tsv'
+  tin:
+    fn: '*.tin.hist.tsv'
diff --git a/workflow/conf/ondemand.config b/workflow/conf/ondemand.config
new file mode 100755
index 0000000000000000000000000000000000000000..131fdbb19e1fedf1bc9e206a03d801f13791b810
--- /dev/null
+++ b/workflow/conf/ondemand.config
@@ -0,0 +1,3 @@
+process {
+  queue = 'highpriority-0ef8afb0-c7ad-11ea-b907-06c94a3c6390'
+}
diff --git a/workflow/conf/replicate_export_config.json b/workflow/conf/replicate_export_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ff17fa513c5bc130a2e2bdaf9aa41b070c99b290
--- /dev/null
+++ b/workflow/conf/replicate_export_config.json
@@ -0,0 +1,97 @@
+{
+  "bag": {
+    "bag_name": "Replicate_{rid}",
+    "bag_algorithms": [
+      "md5"
+    ],
+    "bag_archiver": "zip"
+  },
+  "catalog": {
+    "query_processors": [
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Study",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Study_RID)=(RNASeq:Study:RID)/Study_RID:=RID,Internal_ID,Title,Summary,Overall_Design,GEO_Series_Accession_ID,GEO_Platform_Accession_ID,Funding,Pubmed_ID,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Experiment",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment:RID)/Experiment_RID:=RID,Study_RID,Internal_ID,Name,Description,Experiment_Method,Sequencing_Type,Species,Specimen_Type,Molecule_Type,Pooled_Sample,Pool_Size,Markers,Cell_Count,Treatment_Protocol,Treatment_Protocol_Reference,Isolation_Protocol,Isolation_Protocol_Reference,Growth_Protocol,Growth_Protocol_Reference,Label_Protocol,Label_Protocol_Reference,Hybridization_Protocol,Hybridization_Protocol_Reference,Scan_Protocol,Scan_Protocol_Reference,Data_Processing,Value_Definition,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Experiment Antibodies",
+          "query_path": "/entity/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Antibodies:Experiment_RID)?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Experiment Custom Metadata",
+          "query_path":
"/entity/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Custom_Metadata:Experiment_RID)?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Experiment Settings", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Settings:Experiment_RID)/RID,Experiment_RID,Alignment_Format,Aligner,Aligner_Version,Reference_Genome,Sequence_Trimming,Duplicate_Removal,Pre-alignment_Sequence_Removal,Junction_Reads,Library_Type,Protocol_Reference,Library_Selection,Quantification_Format,Quantification_Software,Expression_Metric,Transcriptome_Model,Sequencing_Platform,Paired_End,Read_Length,Has_Strand_Specific_Information,Used_Spike_Ins,Spike_Ins_Amount,Visualization_Format,Visualization_Software,Visualization_Version,Visualization_Setting,Notes,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Replicate", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/RID,Study_RID,Experiment_RID,Biological_Replicate_Number,Technical_Replicate_Number,Specimen_RID,Collection_Date,Mapped_Reads,GEO_Sample_Accession_ID,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Specimen", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/S:=(Specimen_RID)=(Gene_Expression:Specimen:RID)/T:=left(Stage_ID)=(Vocabulary:Developmental_Stage:ID)/$S/RID,Title,Species,Stage_ID,Stage_Name:=T:Name,Stage_Detail,Assay_Type,Strain,Wild_Type,Sex,Passage,Phenotype,Cell_Line,Parent_Specimen,Upload_Notes,Preparation,Fixation,Embedding,Internal_ID,Principal_Investigator,Consortium,Release_Date,RCT,RMT,GUDMAP2_Accession_ID?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Specimen_Anatomical_Source", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Specimen_RID)=(Gene_Expression:Specimen:RID)/(RID)=(Gene_Expression:Specimen_Tissue:Specimen_RID)/RID,Specimen_RID,Tissue,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Specimen_Cell_Types", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Specimen_RID)=(Gene_Expression:Specimen:RID)/(RID)=(Gene_Expression:Specimen_Cell_Type:Specimen)/RID,Specimen_RID:=Specimen,Cell_Type,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Single Cell Metrics", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:Single_Cell_Metrics:Replicate_RID)/RID,Study_RID,Experiment_RID,Replicate_RID,Reads_%28Millions%29,Reads%2FCell,Detected_Gene_Count,Genes%2FCell,UMI%2FCell,Estimated_Cell_Count,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "File", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/RID,Study_RID,Experiment_RID,Replicate_RID,Caption,File_Type,File_Name,URI,File_size,MD5,GEO_Archival_URL,dbGaP_Accession_ID,Processed,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT,Legacy_File_RID,GUDMAP_NGF_OID,GUDMAP_NGS_OID?limit=none" + } + }, + { + "processor": "fetch", + "processor_params": { + "output_path": "assets/Study/{Study_RID}/Experiment/{Experiment_RID}/Replicate/{Replicate_RID}", + "query_path": 
"/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/url:=URI,length:=File_size,filename:=File_Name,md5:=MD5,Study_RID,Experiment_RID,Replicate_RID?limit=none" + } + } + ] + } +} diff --git a/workflow/conf/spot.config b/workflow/conf/spot.config new file mode 100755 index 0000000000000000000000000000000000000000..d9c7a4c8fa34aadd597da0170f8e3e223923011a --- /dev/null +++ b/workflow/conf/spot.config @@ -0,0 +1,3 @@ +process { + queue = 'default-0ef8afb0-c7ad-11ea-b907-06c94a3c6390' +} diff --git a/workflow/docker/.gitkeep b/workflow/docker/.gitkeep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/workflow/docker/getData b/workflow/docker/getData deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/workflow/docker/images/.gitkeep b/workflow/docker/images/.gitkeep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/workflow/docker/temp b/workflow/docker/temp deleted file mode 100644 index f7dcb3af08981d465bf0838d09de1b38e9e0c5aa..0000000000000000000000000000000000000000 --- a/workflow/docker/temp +++ /dev/null @@ -1,14 +0,0 @@ - - -RUN apt-get install -y python3.7 python3-pip - -RUN wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - bash Miniconda3-latest-Linux-x86_64.sh -p /miniconda -b && \ - rm Miniconda3-latest-Linux-x86_64.sh -ENV PATH=/miniconda/bin:${PATH} -RUN conda config --add channels defaults && \ - conda config --add channels bioconda && \ - conda config --add channels conda-forge && \ - conda update -n base -c defaults -y conda - -RUN pip install --upgrade pip diff --git a/workflow/nextflow.config b/workflow/nextflow.config index 30e47ea1aea37ed6550cc2944d69d26e69887489..db422b68cb4a8d1900cbae33801f6d5f5b8eb9a0 100644 --- a/workflow/nextflow.config +++ b/workflow/nextflow.config @@ -2,4 +2,98 @@ profiles { standard { includeConfig 'conf/biohpc.config' } + biohpc { + includeConfig 'conf/biohpc.config' + } + biohpc_max { + includeConfig 'conf/biohpc_max.config' + } + aws_ondemand { + includeConfig 'conf/aws.config' + includeConfig 'conf/ondemand.config' + } + aws_spot { + includeConfig 'conf/aws.config' + includeConfig 'conf/spot.config' + } +} + +process { + withName:getBag { + container = 'bicf/gudmaprbkfilexfer:2.0.1_indev' + } + withName:getData { + container = 'bicf/gudmaprbkfilexfer:2.0.1_indev' + } + withName: parseMetadata { + container = 'bicf/python3:2.0.1_indev' + } + withName: trimData { + container = 'bicf/trimgalore:1.1' + } + withName: getRefInfer { + container = 'bicf/awscli:1.1' + } + withName: downsampleData { + container = 'bicf/seqtk:2.0.1_indev' + } + withName: alignSampleData { + container = 'bicf/gudmaprbkaligner:2.0.1_indev' + } + withName: inferMetadata { + container = 'bicf/rseqc3.0:2.0.1_indev' + } + withName: getRef { + container = 'bicf/awscli:1.1' + } + withName: alignData { + container = 'bicf/gudmaprbkaligner:2.0.1_indev' + } + withName: dedupData { + container = 'bicf/gudmaprbkdedup:2.0.0' + } + withName: countData { + container = 'bicf/subread2:2.0.0' + } + withName: makeBigWig { + container = 'bicf/deeptools3.3:2.0.1_indev' + } + withName: fastqc { + container = 'bicf/fastqc:2.0.1_indev' + } + withName: dataQC { + container = 'bicf/rseqc3.0:2.0.1_indev' + } + withName: aggrQC { + container = 'bicf/multiqc1.8:2.0.1_indev' + } +} + +trace { + enabled = true + file = 
'pipeline_trace.txt' + fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss' +} + +timeline { + enabled = true + file = 'timeline.html' +} + +report { + enabled = true + file = 'report.html' +} + +tower { + accessToken = '3ade8f325d4855434b49aa387421a44c63e3360f' + enabled = true +} + +manifest { + homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq' + description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.' + mainScript = 'rna-seq.nf' + version = 'v0.0.1' + nextflowVersion = '>=19.09.0' } diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf old mode 100755 new mode 100644 index d839044791bc3aaccc701c9fb62099105c97205b..772d076174400bca13110869a797f87617ff5c89 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -1,49 +1,1060 @@ #!/usr/bin/env nextflow -// Define input variables -params.bdbag = "${baseDir}/../test_data/Study_Q-Y4H0.zip" +// ######## #### ###### ######## +// ## ## ## ## ## ## +// ## ## ## ## ## +// ######## ## ## ###### +// ## ## ## ## ## +// ## ## ## ## ## ## +// ######## #### ###### ## +// Define input variables +params.deriva = "${baseDir}/../test_data/auth/credential.json" +params.bdbag = "${baseDir}/../test_data/auth/cookies.txt" +//params.repRID = "16-1ZX4" +params.repRID = "Q-Y5JA" +params.source = "dev" +params.refMoVersion = "38.p6.vM22" +params.refHuVersion = "38.p12.v31" +params.refERCCVersion = "92" params.outDir = "${baseDir}/../output" // Parse input variables +deriva = Channel + .fromPath(params.deriva) + .ifEmpty { exit 1, "deriva credential file not found: ${params.deriva}" } bdbag = Channel .fromPath(params.bdbag) - .ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" } - + .ifEmpty { exit 1, "deriva cookie file for bdbag not found: ${params.bdbag}" } +repRID = params.repRID +refMoVersion = params.refMoVersion +refHuVersion = params.refHuVersion +refERCCVersion = params.refERCCVersion outDir = params.outDir +logsDir = "${outDir}/Logs" + +// Define fixed files +derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json") +if (params.source == "dev") { + source = "dev.gudmap.org" +} else if (params.source == "staging") { + source = "staging.gudmap.org" +} else if (params.source == "production") { + source = "www.gudmap.org" +} +referenceBase = "s3://bicf-references" +//referenceBase = "/project/BICF/BICF_Core/shared/gudmap/references" +referenceInfer = Channel.fromList(["ERCC","GRCh","GRCm"]) +multiqcConfig = Channel.fromPath("${baseDir}/conf/multiqc_config.yaml") +bicfLogo = Channel.fromPath("${baseDir}/../docs/bicf_logo.png") + +// Define script files +script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh") +script_parseMeta = Channel.fromPath("${baseDir}/scripts/parseMeta.py") +script_inferMeta = Channel.fromPath("${baseDir}/scripts/inferMeta.sh") +script_calculateTPM = Channel.fromPath("${baseDir}/scripts/calculateTPM.R") +script_convertGeneSymbols = Channel.fromPath("${baseDir}/scripts/convertGeneSymbols.R") +script_tinHist = Channel.fromPath("${baseDir}/scripts/tinHist.py") + +/* + * trackStart: track start of pipeline + */ +params.ci = false +params.dev = false +process trackStart { + container 'docker://bicf/bicfbase:2.1.0' + script: + """ + hostname + ulimit -a + + curl -H 'Content-Type: application/json' -X PUT -d \ + '{ \ + "sessionId": "${workflow.sessionId}", \ + "pipeline": "gudmap.rbk_rnaseq", \ + "start": "${workflow.start}", \ + 
"repRID": "${repRID}", \ + "astrocyte": false, \ + "status": "started", \ + "nextflowVersion": "${workflow.nextflow.version}", \ + "pipelineVersion": "${workflow.manifest.version}", \ + "ci": ${params.ci}, \ + "dev": ${params.dev} \ + }' \ + "https://xku43pcwnf.execute-api.us-east-1.amazonaws.com/ProdDeploy/pipeline-tracking" + """ +} + +log.info """\ +==================================== +BICF RNA-seq Pipeline for GUDMAP/RBK +==================================== +Replicate RID : ${params.repRID} +Source Server : ${params.source} +Mouse Reference Version: ${params.refMoVersion} +Human Reference Version: ${params.refHuVersion} +ERCC Reference Version : ${params.refERCCVersion} +Output Directory : ${params.outDir} +------------------------------------ +Nextflow Version : ${workflow.nextflow.version} +Pipeline Version : ${workflow.manifest.version} +Session ID : ${workflow.sessionId} +------------------------------------ +CI : ${params.ci} +Development : ${params.dev} +------------------------------------ +""" +/* + * splitData: split bdbag files by replicate so fetch can occure in parallel, and rename files to replicate rid + */ +process getBag { + tag "${repRID}" + + input: + path credential, stageAs: "credential.json" from deriva + path derivaConfig + + output: + path ("Replicate_*.zip") into bagit + + script: + """ + hostname > ${repRID}.getBag.log + ulimit -a >> ${repRID}.getBag.log + + # link credential file for authentication + echo -e "LOG: linking deriva credentials" >> ${repRID}.getBag.log + mkdir -p ~/.deriva + ln -sf `readlink -e credential.json` ~/.deriva/credential.json + echo -e "LOG: linked" >> ${repRID}.getBag.log + + # deriva-download replicate RID + echo -e "LOG: fetching bagit for ${repRID} in GUDMAP" >> ${repRID}.getBag.log + deriva-download-cli ${source} --catalog 2 ${derivaConfig} . rid=${repRID} + echo -e "LOG: fetched" >> ${repRID}.getBag.log + """ +} /* * getData: fetch study files from consortium with downloaded bdbag.zip - * python must be loaded prior to nextflow run, because conda env create from .yml doesn't work with nextflow loaded module (either process in-line, or config file) */ - process getData { - publishDir "${outDir}/temp/getData", mode: "symlink" - conda "${baseDir}/conf/conda.env.bdbag.yml" - - input: - file bdbag - - output: - file("**/*.R*.fastq.gz") into fastqPaths - file("**/File.csv") into filePaths - file("**/Experiment Settings.csv") into experimentSettingsPaths - file("**/Experiment.csv") into experimentPaths - - script: - """ - hostname - ulimit -a - study=\$(echo "${bdbag}" | cut -d'.' -f1) - echo LOG: \${study} - unzip ${bdbag} - echo LOG: bdgag unzipped - python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study} - echo LOG: fetch file filtered for only .fastq.gz - #bdbag --materialize "\$(echo "${bdbag}" | cut -d'.' 
-    #bdbag --materialize "\$(echo "${bdbag}" | cut -d'.' -f1)"
-    sh ${baseDir}/scripts/bdbagFetch.sh \${study}
-    echo LOG: bdbag fetched
-    sh ${baseDir}/scripts/renameFastq.sh \${study}
-    echo LOG: fastq.gz files renamed to replicate RID
-    """
- }
\ No newline at end of file
+process getData {
+  tag "${repRID}"
+
+  input:
+    path script_bdbagFetch
+    path cookies, stageAs: "deriva-cookies.txt" from bdbag
+    path bagit
+
+  output:
+    path ("*.R{1,2}.fastq.gz") into fastqs
+    path ("**/File.csv") into fileMeta
+    path ("**/Experiment Settings.csv") into experimentSettingsMeta
+    path ("**/Experiment.csv") into experimentMeta
+
+  script:
+    """
+    hostname > ${repRID}.getData.log
+    ulimit -a >> ${repRID}.getData.log
+
+    # link deriva cookie for authentication
+    echo -e "LOG: linking deriva cookie" >> ${repRID}.getData.log
+    mkdir -p ~/.bdbag
+    ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt
+    echo -e "LOG: linked" >> ${repRID}.getData.log
+
+    # get bagit basename
+    replicate=\$(basename "${bagit}" | cut -d "." -f1)
+    echo -e "LOG: bagit replicate name \${replicate}" >> ${repRID}.getData.log
+
+    # unzip bagit
+    echo -e "LOG: unzipping replicate bagit" >> ${repRID}.getData.log
+    unzip ${bagit}
+    echo -e "LOG: unzipped" >> ${repRID}.getData.log
+
+    # bagit fetch fastq's only and rename by repRID
+    echo -e "LOG: fetching replicate bdbag" >> ${repRID}.getData.log
+    sh ${script_bdbagFetch} \${replicate} ${repRID}
+    echo -e "LOG: fetched" >> ${repRID}.getData.log
+    """
+}
+
+// Replicate raw fastq's for multiple process inputs
+fastqs.into {
+  fastqs_trimData
+  fastqs_fastqc
+}
+
+/*
+ * parseMetadata: parses metadata to extract experiment parameters
+*/
+process parseMetadata {
+  tag "${repRID}"
+
+  input:
+    path script_parseMeta
+    path file from fileMeta
+    path experimentSettings, stageAs: "ExperimentSettings.csv" from experimentSettingsMeta
+    path experiment from experimentMeta
+
+  output:
+    path "design.csv" into metadata
+
+  script:
+    """
+    hostname > ${repRID}.parseMetadata.log
+    ulimit -a >> ${repRID}.parseMetadata.log
+
+    # check replicate RID metadata
+    rep=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p repRID)
+    echo -e "LOG: replicate RID metadata parsed: \${rep}" >> ${repRID}.parseMetadata.log
+
+    # get experiment RID metadata
+    exp=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p expRID)
+    echo -e "LOG: experiment RID metadata parsed: \${exp}" >> ${repRID}.parseMetadata.log
+
+    # get study RID metadata
+    study=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p studyRID)
+    echo -e "LOG: study RID metadata parsed: \${study}" >> ${repRID}.parseMetadata.log
+
+    # get endedness metadata
+    endsMeta=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p endsMeta)
+    echo -e "LOG: endedness metadata parsed: \${endsMeta}" >> ${repRID}.parseMetadata.log
+
+    # manually get endedness
+    endsManual=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p endsManual)
+    echo -e "LOG: endedness manually detected: \${endsManual}" >> ${repRID}.parseMetadata.log
+
+    # get strandedness metadata
+    stranded=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p stranded)
+    echo -e "LOG: strandedness metadata parsed: \${stranded}" >> ${repRID}.parseMetadata.log
+
+    # get spike-in metadata
+    spike=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p spike)
+    echo -e "LOG: spike-in metadata parsed: \${spike}" >> ${repRID}.parseMetadata.log
+
+    # get species metadata
+    species=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experiment}" -p species)
+    echo -e "LOG: species metadata parsed: \${species}" >> ${repRID}.parseMetadata.log
+
+    # get read length metadata
+    readLength=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p readLength)
+    if [ "\${readLength}" = "nan" ]
+    then
+      readLength="NA"
+    fi
+    echo -e "LOG: read length metadata parsed: \${readLength}" >> ${repRID}.parseMetadata.log
+
+    # save design file
+    echo -e "\${endsMeta},\${endsManual},\${stranded},\${spike},\${species},\${readLength},\${exp},\${study}" > design.csv
+    """
+}
+
+// Split metadata into separate channels
+endsMeta = Channel.create()
+endsManual = Channel.create()
+strandedMeta = Channel.create()
+spikeMeta = Channel.create()
+speciesMeta = Channel.create()
+readLengthMeta = Channel.create()
+expRID = Channel.create()
+studyRID = Channel.create()
+metadata.splitCsv(sep: ",", header: false).separate(
+  endsMeta,
+  endsManual,
+  strandedMeta,
+  spikeMeta,
+  speciesMeta,
+  readLengthMeta,
+  expRID,
+  studyRID
+)
+// Replicate metadata for multiple process inputs
+endsManual.into {
+  endsManual_trimData
+  endsManual_downsampleData
+  endsManual_alignSampleData
+  endsManual_aggrQC
+}
+
+/*
+ * trimData: trims any adapter or non-host sequences from the data
+*/
+process trimData {
+  tag "${repRID}"
+
+  input:
+    val ends from endsManual_trimData
+    path (fastq) from fastqs_trimData
+
+  output:
+    path ("*.fq.gz") into fastqsTrim
+    path ("*_trimming_report.txt") into trimQC
+    path ("readLength.csv") into inferMetadata_readLength
+
+  script:
+    """
+    hostname > ${repRID}.trimData.log
+    ulimit -a >> ${repRID}.trimData.log
+
+    # trim fastq's using trim_galore and extract median read length
+    echo -e "LOG: trimming ${ends}" >> ${repRID}.trimData.log
+    if [ "${ends}" == "se" ]
+    then
+      trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID} -j `nproc` ${fastq[0]}
+      readLength=\$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
+    elif [ "${ends}" == "pe" ]
+    then
+      trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]}
+      readLength=\$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
+    fi
+    echo -e "LOG: trimmed" >> ${repRID}.trimData.log
+    echo -e "LOG: median trimmed read length: \${readLength}" >> ${repRID}.trimData.log
+
+    # save read length file
+    echo -e "\${readLength}" > readLength.csv
+    """
+}
+
+// Extract calculated read length metadata into channel
+readLengthInfer = Channel.create()
+inferMetadata_readLength.splitCsv(sep: ",", header: false).separate(
+  readLengthInfer
+)
+
+// Replicate trimmed fastq's
+fastqsTrim.into {
+  fastqsTrim_alignData
+  fastqsTrim_downsampleData
+}
+
+/*
+ * getRefInfer: downloads appropriate reference for metadata inference
+*/
+process getRefInfer {
+  tag "${refName}"
+
+  input:
+    val refName from referenceInfer
+
+  output:
+    tuple val (refName), path ("hisat2", type: 'dir'), path ("*.fna"), path ("*.gtf") into refInfer
+    path ("${refName}", type: 'dir') into bedInfer
+
+  script:
+    """
+    hostname > ${repRID}.${refName}.getRefInfer.log
+    ulimit -a >> ${repRID}.${refName}.getRefInfer.log
+
+    # set the reference name
+    if [ "${refName}" == "ERCC" ]
+    then
+      references=\$(echo ${referenceBase}/ERCC${refERCCVersion})
+    elif [ "${refName}" == "GRCm" ]
+    then
+      references=\$(echo ${referenceBase}/GRCm${refMoVersion})
+    elif [ "${refName}" == "GRCh" ]
+    then
+      references=\$(echo ${referenceBase}/GRCh${refHuVersion})
+    else
+      echo -e "LOG: ERROR - References could not be set!\nReference found: ${referenceBase}" >> ${repRID}.${refName}.getRefInfer.log
+      exit 1
+    fi
+    mkdir ${refName}
+
+    # retrieve reference files from the appropriate location
+    echo -e "LOG: fetching ${refName} reference files from ${referenceBase}" >> ${repRID}.${refName}.getRefInfer.log
+    if [ ${referenceBase} == "s3://bicf-references" ]
+    then
+      aws s3 cp "\${references}"/hisat2 ./hisat2 --recursive
+      aws s3 cp "\${references}"/bed ./${refName}/bed --recursive
+      aws s3 cp "\${references}"/genome.fna ./
+      aws s3 cp "\${references}"/genome.gtf ./
+    elif [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references" ]
+    then
+      ln -s "\${references}"/hisat2
+      ln -s "\${references}"/bed ${refName}/bed
+      ln -s "\${references}"/genome.fna
+      ln -s "\${references}"/genome.gtf
+    fi
+    echo -e "LOG: fetched" >> ${repRID}.${refName}.getRefInfer.log
+
+    # make blank bed folder for ERCC
+    echo -e "LOG: making dummy bed folder for ERCC" >> ${repRID}.${refName}.getRefInfer.log
+    if [ "${refName}" == "ERCC" ]
+    then
+      rm -rf ${refName}/bed
+      mkdir ${refName}/bed
+      touch ${refName}/bed/temp
+    fi
+    """
+}
+
+/*
+ * downsampleData: downsample fastq's for metadata inference
+ */
+process downsampleData {
+  tag "${repRID}"
+
+  input:
+    val ends from endsManual_downsampleData
+    path fastq from fastqsTrim_downsampleData
+
+  output:
+    path ("sampled.1.fq") into fastqs1Sample
+    path ("sampled.2.fq") into fastqs2Sample
+
+  script:
+    """
+    hostname > ${repRID}.downsampleData.log
+    ulimit -a >> ${repRID}.downsampleData.log
+
+    if [ "${ends}" == "se" ]
+    then
+      echo -e "LOG: downsampling SE trimmed fastq" >> ${repRID}.downsampleData.log
+      seqtk sample -s100 *trimmed.fq.gz 100000 1> sampled.1.fq
+      touch sampled.2.fq
+    elif [ "${ends}" == "pe" ]
+    then
+      echo -e "LOG: downsampling R1 of PE trimmed fastq" >> ${repRID}.downsampleData.log
+      seqtk sample -s100 *1.fq.gz 1000000 1> sampled.1.fq
+      echo -e "LOG: downsampling R2 of PE trimmed fastq" >> ${repRID}.downsampleData.log
+      seqtk sample -s100 *2.fq.gz 1000000 1> sampled.2.fq
+    fi
+    echo -e "LOG: downsampled" >> ${repRID}.downsampleData.log
+    """
+}
+
+// Replicate the downsampled fastq's and attach them to the references
+inferInput = endsManual_alignSampleData.combine(refInfer.combine(fastqs1Sample.collect().combine(fastqs2Sample.collect())))
+
+/*
+ * alignSampleData: aligns the downsampled reads to a reference database
+*/
+process alignSampleData {
+  tag "${ref}"
+
+  input:
+    tuple val (ends), val (ref), path (hisat2), path (fna), path (gtf), path (fastq1), path (fastq2) from inferInput
+
+  output:
+    path ("${ref}.sampled.sorted.bam") into sampleBam
+    path ("${ref}.sampled.sorted.bam.bai") into sampleBai
+    path ("${ref}.alignSampleSummary.txt") into alignSampleQC
+
+  script:
+    """
+    hostname > ${repRID}.${ref}.alignSampleData.log
+    ulimit -a >> ${repRID}.${ref}.alignSampleData.log
+
+    # align the reads with Hisat2
+    echo -e "LOG: aligning ${ends}" >> ${repRID}.${ref}.alignSampleData.log
+    if [ "${ends}" == "se" ]
+    then
+      hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome -U ${fastq1} --summary-file ${ref}.alignSampleSummary.txt --new-summary
+    elif [ "${ends}" == "pe" ]
+    then
+      hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome --no-mixed --no-discordant -1 ${fastq1} -2 ${fastq2} --summary-file ${ref}.alignSampleSummary.txt --new-summary
+    fi
+    echo -e "LOG: aligned" >> ${repRID}.${ref}.alignSampleData.log
+
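+    # samtools view flags used below: -F 4 drops unmapped reads, -F 8 drops
+    # reads whose mate is unmapped, and -F 256 drops secondary alignments, so
+    # only primary, fully mapped reads feed the downstream inference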
+    # convert the output sam file to a sorted bam file using Samtools
+    echo -e "LOG: converting from sam to bam" >> ${repRID}.${ref}.alignSampleData.log
+    samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${ref}.sampled.bam ${ref}.sampled.sam
+
+    # sort the bam file using Samtools
+    echo -e "LOG: sorting the bam file" >> ${repRID}.${ref}.alignSampleData.log
+    samtools sort -@ `nproc` -O BAM -o ${ref}.sampled.sorted.bam ${ref}.sampled.bam
+
+    # index the sorted bam using Samtools
+    echo -e "LOG: indexing sorted bam file" >> ${repRID}.${ref}.alignSampleData.log
+    samtools index -@ `nproc` -b ${ref}.sampled.sorted.bam ${ref}.sampled.sorted.bam.bai
+    """
+}
+
+alignSampleQC.into {
+  alignSampleQC_inferMetadata
+  alignSampleQC_aggrQC
+}
+
+/*
+ * inferMetadata: infer species, spike-in, endedness, and strandedness from the sampled alignments
+*/
+process inferMetadata {
+  tag "${repRID}"
+
+  input:
+    path script_inferMeta
+    path beds from bedInfer.collect()
+    path bam from sampleBam.collect()
+    path bai from sampleBai.collect()
+    path alignSummary from alignSampleQC_inferMetadata.collect()
+
+  output:
+    path "infer.csv" into inferMetadata
+    path "${repRID}.infer_experiment.txt" into inferExperiment
+
+  script:
+    """
+    hostname > ${repRID}.inferMetadata.log
+    ulimit -a >> ${repRID}.inferMetadata.log
+
+    # collect alignment rates (round down to integers)
+    align_ercc=\$(echo \$(grep "Overall alignment rate" ERCC.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%'))
+    align_ercc=\$(echo \${align_ercc%.*})
+    echo -e "LOG: alignment rate to ERCC: \${align_ercc}" >> ${repRID}.inferMetadata.log
+    align_hu=\$(echo \$(grep "Overall alignment rate" GRCh.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%'))
+    align_hu=\$(echo \${align_hu%.*})
+    echo -e "LOG: alignment rate to GRCh: \${align_hu}" >> ${repRID}.inferMetadata.log
+    align_mo=\$(echo \$(grep "Overall alignment rate" GRCm.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%'))
+    align_mo=\$(echo \${align_mo%.*})
+    echo -e "LOG: alignment rate to GRCm: \${align_mo}" >> ${repRID}.inferMetadata.log
+
+    # determine spike-in
+    if [ 1 -eq \$(echo \$(expr \${align_ercc} ">=" 10)) ]
+    then
+      spike="yes"
+    else
+      spike="no"
+    fi
+    echo -e "LOG: inference of spike-in results in: \${spike}" >> ${repRID}.inferMetadata.log
+
+    # determine species
+    if [ 1 -eq \$(echo \$(expr \${align_hu} ">=" 25)) ] && [ 1 -eq \$(echo \$(expr \${align_mo} "<" 25)) ]
+    then
+      species="Homo sapiens"
+      bam="GRCh.sampled.sorted.bam"
+      bed="./GRCh/bed/genome.bed"
+    elif [ 1 -eq \$(echo \$(expr \${align_mo} ">=" 25)) ] && [ 1 -eq \$(echo \$(expr \${align_hu} "<" 25)) ]
+    then
+      species="Mus musculus"
+      bam="GRCm.sampled.sorted.bam"
+      bed="./GRCm/bed/genome.bed"
+    else
+      echo -e "LOG: ERROR - inference of species returns an ambiguous result: hu=\${align_hu} mo=\${align_mo}" >> ${repRID}.inferMetadata.log
+      exit 1
+    fi
+    echo -e "LOG: inference of species results in: \${species}" >> ${repRID}.inferMetadata.log
+
+    # infer experimental setting from the sampled alignments
+    echo -e "LOG: inferring experimental setting from sampled alignments" >> ${repRID}.inferMetadata.log
+    infer_experiment.py -r "\${bed}" -i "\${bam}" 1>> ${repRID}.infer_experiment.txt
+    echo -e "LOG: inferred" >> ${repRID}.inferMetadata.log
+
+    ended=`bash inferMeta.sh endness ${repRID}.infer_experiment.txt`
+    fail=`bash inferMeta.sh fail ${repRID}.infer_experiment.txt`
+    if [ \${ended} == "PairEnd" ]
+    then
+      ends="pe"
+      percentF=`bash inferMeta.sh pef ${repRID}.infer_experiment.txt`
+      percentR=`bash inferMeta.sh per ${repRID}.infer_experiment.txt`
+    elif [ \${ended} == "SingleEnd" ]
+    then
+      ends="se"
+      percentF=`bash inferMeta.sh sef ${repRID}.infer_experiment.txt`
+      percentR=`bash inferMeta.sh ser ${repRID}.infer_experiment.txt`
+    fi
+    echo -e "LOG: percentage reads in the same direction as gene: \${percentF}" >> ${repRID}.inferMetadata.log
+    echo -e "LOG: percentage reads in the opposite direction as gene: \${percentR}" >> ${repRID}.inferMetadata.log
+    if [ 1 -eq \$(echo \$(expr \${percentF#*.} ">" 2500)) ] && [ 1 -eq \$(echo \$(expr \${percentR#*.} "<" 2500)) ]
+    then
+      stranded="forward"
+    elif [ 1 -eq \$(echo \$(expr \${percentR#*.} ">" 2500)) ] && [ 1 -eq \$(echo \$(expr \${percentF#*.} "<" 2500)) ]
+    then
+      stranded="reverse"
+    else
+      stranded="unstranded"
+    fi
+    echo -e "LOG: strandedness set to: \${stranded}" >> ${repRID}.inferMetadata.log
+
+    # write inferred metadata to file
+    echo "\${ends},\${stranded},\${spike},\${species},\${align_ercc},\${align_hu},\${align_mo},\${percentF},\${percentR},\${fail}" 1>> infer.csv
+    """
+}
+
+// Split metadata into separate channels
+endsInfer = Channel.create()
+strandedInfer = Channel.create()
+spikeInfer = Channel.create()
+speciesInfer = Channel.create()
+align_erccInfer = Channel.create()
+align_huInfer = Channel.create()
+align_moInfer = Channel.create()
+percentFInfer = Channel.create()
+percentRInfer = Channel.create()
+failInfer = Channel.create()
+inferMetadata.splitCsv(sep: ",", header: false).separate(
+  endsInfer,
+  strandedInfer,
+  spikeInfer,
+  speciesInfer,
+  align_erccInfer,
+  align_huInfer,
+  align_moInfer,
+  percentFInfer,
+  percentRInfer,
+  failInfer
+)
+// Replicate metadata for multiple process inputs
+endsInfer.into {
+  endsInfer_alignData
+  endsInfer_countData
+  endsInfer_dataQC
+  endsInfer_aggrQC
+}
+strandedInfer.into {
+  strandedInfer_alignData
+  strandedInfer_countData
+  strandedInfer_aggrQC
+}
+spikeInfer.into {
+  spikeInfer_getRef
+  spikeInfer_aggrQC
+}
+speciesInfer.into {
+  speciesInfer_getRef
+  speciesInfer_aggrQC
+}
+
+/*
+ * getRef: downloads appropriate reference
+*/
+process getRef {
+  tag "${species}"
+
+  input:
+    val spike from spikeInfer_getRef
+    val species from speciesInfer_getRef
+
+  output:
+    tuple path ("hisat2", type: 'dir'), path ("bed", type: 'dir'), path ("*.fna"), path ("*.gtf"), path ("geneID.tsv"), path ("Entrez.tsv") into reference
+
+  script:
+    """
+    hostname > ${repRID}.getRef.log
+    ulimit -a >> ${repRID}.getRef.log
+
+    # set the reference name
+    if [ "${species}" == "Mus musculus" ]
+    then
+      references=\$(echo ${referenceBase}/GRCm${refMoVersion})
+    elif [ "${species}" == "Homo sapiens" ]
+    then
+      references=\$(echo ${referenceBase}/GRCh${refHuVersion})
+    else
+      echo -e "LOG: ERROR - References could not be set!\nSpecies reference found: ${species}" >> ${repRID}.getRef.log
+      exit 1
+    fi
+    if [ "${spike}" == "yes" ]
+    then
+      references=\$(echo \${references}-S/)
+    elif [ "${spike}" == "no" ]
+    then
+      references=\$(echo \${references}/)
+    fi
+    echo -e "LOG: references set to \${references}" >> ${repRID}.getRef.log
+
+    # retrieve reference files from the appropriate location
+    echo -e "LOG: fetching ${species} reference files from ${referenceBase}" >> ${repRID}.getRef.log
+    if [ ${referenceBase} == "s3://bicf-references" ]
+    then
+      echo -e "LOG: grabbing reference files from S3" >> ${repRID}.getRef.log
+      aws s3 cp "\${references}"/hisat2 ./hisat2 --recursive
+      aws s3 cp "\${references}"/bed ./bed --recursive
+      aws s3 cp "\${references}"/genome.fna ./
+      aws s3 cp "\${references}"/genome.gtf ./
+      aws s3 cp "\${references}"/geneID.tsv ./
+      aws s3 cp "\${references}"/Entrez.tsv ./
+    elif [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references" ]
+    then
+      ln -s "\${references}"/hisat2
+      ln -s "\${references}"/bed
+      ln -s "\${references}"/genome.fna
+      ln -s "\${references}"/genome.gtf
+      ln -s "\${references}"/geneID.tsv
+      ln -s "\${references}"/Entrez.tsv
+    fi
+    echo -e "LOG: fetched" >> ${repRID}.getRef.log
+    """
+}
+
+// Replicate reference for multiple process inputs
+reference.into {
+  reference_alignData
+  reference_countData
+  reference_dataQC
+}
+
+/*
+ * alignData: aligns the reads to a reference database
+*/
+process alignData {
+  tag "${repRID}"
+
+  input:
+    val ends from endsInfer_alignData
+    val stranded from strandedInfer_alignData
+    path fastq from fastqsTrim_alignData
+    path reference_alignData
+
+  output:
+    tuple path ("${repRID}.sorted.bam"), path ("${repRID}.sorted.bam.bai") into rawBam
+    path ("*.alignSummary.txt") into alignQC
+
+  script:
+    """
+    hostname > ${repRID}.align.log
+    ulimit -a >> ${repRID}.align.log
+
+    # set stranded param for hisat2
+    if [ "${stranded}" == "unstranded" ]
+    then
+      strandedParam=""
+    elif [ "${stranded}" == "forward" ] && [ "${ends}" == "se" ]
+    then
+      strandedParam="--rna-strandness F"
+    elif [ "${stranded}" == "forward" ] && [ "${ends}" == "pe" ]
+    then
+      strandedParam="--rna-strandness FR"
+    elif [ "${stranded}" == "reverse" ] && [ "${ends}" == "se" ]
+    then
+      strandedParam="--rna-strandness R"
+    elif [ "${stranded}" == "reverse" ] && [ "${ends}" == "pe" ]
+    then
+      strandedParam="--rna-strandness RF"
+    fi
+
+    # align the reads with Hisat2
+    echo -e "LOG: aligning ${ends}" >> ${repRID}.align.log
+    if [ "${ends}" == "se" ]
+    then
+      hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome \${strandedParam} -U ${fastq[0]} --summary-file ${repRID}.alignSummary.txt --new-summary
+    elif [ "${ends}" == "pe" ]
+    then
+      hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome \${strandedParam} --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} --summary-file ${repRID}.alignSummary.txt --new-summary
+    fi
+    echo -e "LOG: aligned" >> ${repRID}.align.log
+
+    # convert the output sam file to a sorted bam file using Samtools
+    echo -e "LOG: converting from sam to bam" >> ${repRID}.align.log
+    samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam
+
+    # sort the bam file using Samtools
+    echo -e "LOG: sorting the bam file" >> ${repRID}.align.log
+    samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.bam ${repRID}.bam
+
+    # index the sorted bam using Samtools
+    echo -e "LOG: indexing sorted bam file" >> ${repRID}.align.log
+    samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bam.bai
+    """
+}
+
+// Replicate rawBam for multiple process inputs
+rawBam.into {
+  rawBam_dedupData
+}
+
+/*
+ *dedupData: remove duplicate reads, specifically focused on PCR or optical duplicates
+*/
+process dedupData {
+  tag "${repRID}"
+  publishDir "${outDir}/bam", mode: 'copy', pattern: "*.deduped.bam"
+
+  input:
+    tuple path (bam), path (bai) from rawBam_dedupData
+
+  output:
+    tuple path ("${repRID}.sorted.deduped.bam"), path ("${repRID}.sorted.deduped.bam.bai") into dedupBam
+    tuple path ("${repRID}.sorted.deduped.*.bam"), path ("${repRID}.sorted.deduped.*.bam.bai") into dedupChrBam
+    path ("*.deduped.Metrics.txt") into dedupQC
+
+  script:
+    """
+    hostname > ${repRID}.dedup.log
+    ulimit -a >> ${repRID}.dedup.log
+
+    # remove duplicated reads using Picard's MarkDuplicates
+    echo -e "LOG: deduplicating reads" >> ${repRID}.dedup.log
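+    # REMOVE_DUPLICATES=true drops duplicate reads from the output BAM entirely
+    # rather than only flagging them, so downstream counts use deduped reads only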
+    java -jar /picard/build/libs/picard.jar MarkDuplicates I=${bam} O=${repRID}.deduped.bam M=${repRID}.deduped.Metrics.txt REMOVE_DUPLICATES=true
+    echo -e "LOG: deduplicated" >> ${repRID}.dedup.log
+
+    # sort the bam file using Samtools
+    echo -e "LOG: sorting the bam file" >> ${repRID}.dedup.log
+    samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.deduped.bam ${repRID}.deduped.bam
+
+    # index the sorted bam using Samtools
+    echo -e "LOG: indexing sorted bam file" >> ${repRID}.dedup.log
+    samtools index -@ `nproc` -b ${repRID}.sorted.deduped.bam ${repRID}.sorted.deduped.bam.bai
+
+    # split the deduped BAM file for multi-threaded tin calculation
+    for i in `samtools view ${repRID}.sorted.deduped.bam | cut -f3 | sort | uniq`;
+    do
+      echo "echo \"LOG: splitting each chromosome into its own BAM and BAI files with Samtools\"; samtools view -b ${repRID}.sorted.deduped.bam \${i} 1>> ${repRID}.sorted.deduped.\${i}.bam; samtools index -@ `nproc` -b ${repRID}.sorted.deduped.\${i}.bam ${repRID}.sorted.deduped.\${i}.bam.bai"
+    done | parallel -j `nproc` -k
+    """
+}
+
+// Replicate dedup bam/bai for multiple process inputs
+dedupBam.into {
+  dedupBam_countData
+  dedupBam_makeBigWig
+  dedupBam_dataQC
+}
+
+/*
+ *makeBigWig: make BigWig files for output
+*/
+process makeBigWig {
+  tag "${repRID}"
+  publishDir "${outDir}/bigwig", mode: 'copy', pattern: "${repRID}.bw"
+
+  input:
+    tuple path (bam), path (bai) from dedupBam_makeBigWig
+
+  output:
+    path ("${repRID}.bw")
+
+  script:
+    """
+    hostname > ${repRID}.makeBigWig.log
+    ulimit -a >> ${repRID}.makeBigWig.log
+
+    # create bigwig
+    echo -e "LOG: creating bigWig" >> ${repRID}.makeBigWig.log
+    bamCoverage -p `nproc` -b ${bam} -o ${repRID}.bw
+    echo -e "LOG: created" >> ${repRID}.makeBigWig.log
+    """
+}
+
+/*
+ *countData: count data and calculate tpm
+*/
+process countData {
+  tag "${repRID}"
+  publishDir "${outDir}/count", mode: 'copy', pattern: "${repRID}*.countTable.csv"
+
+  input:
+    path script_calculateTPM
+    path script_convertGeneSymbols
+    tuple path (bam), path (bai) from dedupBam_countData
+    path ref from reference_countData
+    val ends from endsInfer_countData
+    val stranded from strandedInfer_countData
+
+  output:
+    path ("*.tpmTable.csv") into counts
+    path ("*.countData.summary") into countsQC
+    path ("assignedReads.csv") into inferMetadata_assignedReads
+
+  script:
+    """
+    hostname > ${repRID}.countData.log
+    ulimit -a >> ${repRID}.countData.log
+
+    # determine strandedness and set up stranding for countData
+    stranding=0;
+    if [ "${stranded}" == "unstranded" ]
+    then
+      stranding=0
+      echo -e "LOG: strandedness set to unstranded [0]" >> ${repRID}.countData.log
+    elif [ "${stranded}" == "forward" ]
+    then
+      stranding=1
+      echo -e "LOG: strandedness set to forward stranded [1]" >> ${repRID}.countData.log
+    elif [ "${stranded}" == "reverse" ]
+    then
+      stranding=2
+      echo -e "LOG: strandedness set to reverse stranded [2]" >> ${repRID}.countData.log
+    fi
+
+    # run featureCounts
+    echo -e "LOG: counting ${ends} features" >> ${repRID}.countData.log
+    if [ "${ends}" == "se" ]
+    then
+      featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' -o ${repRID}.countData -s \${stranding} -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam
+    elif [ "${ends}" == "pe" ]
+    then
+      featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' -o ${repRID}.countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam
+    fi
+    echo -e "LOG: counted" >> ${repRID}.countData.log
+
+    # extract assigned reads
+    grep -m 1 'Assigned' *.countData.summary | grep -oe '\\([0-9.]*\\)' > assignedReads.csv
+
+    # calculate TPM from the resulting countData table
+    echo -e "LOG: calculating TPM with R" >> ${repRID}.countData.log
+    Rscript calculateTPM.R --count "${repRID}.countData"
+
+    # convert gene symbols to Entrez id's
+    echo -e "LOG: convert gene symbols to Entrez id's" >> ${repRID}.countData.log
+    Rscript convertGeneSymbols.R --repRID "${repRID}"
+    """
+}
+
+// Extract number of assigned reads metadata into channel
+assignedReadsInfer = Channel.create()
+inferMetadata_assignedReads.splitCsv(sep: ",", header: false).separate(
+  assignedReadsInfer
+)
+
+/*
+ *fastqc: run fastqc on untrimmed fastq's
+*/
+process fastqc {
+  tag "${repRID}"
+
+  input:
+    path (fastq) from fastqs_fastqc
+
+  output:
+    path ("*_fastqc.zip") into fastqc
+    path ("rawReads.csv") into inferMetadata_rawReads
+
+  script:
+    """
+    hostname > ${repRID}.fastqc.log
+    ulimit -a >> ${repRID}.fastqc.log
+
+    # run fastqc
+    echo -e "LOG: running fastqc on raw fastqs" >> ${repRID}.fastqc.log
+    fastqc *.fastq.gz -o .
+
+    # count raw reads
+    zcat *.R1.fastq.gz | echo \$((`wc -l`/4)) > rawReads.csv
+    """
+}
+
+// Extract number of raw reads metadata into channel
+rawReadsInfer = Channel.create()
+inferMetadata_rawReads.splitCsv(sep: ",", header: false).separate(
+  rawReadsInfer
+)
+
+/*
+ *dataQC: calculate transcript integrity numbers (TIN), bin them, and calculate the inner distance of PE replicates
+*/
+process dataQC {
+  tag "${repRID}"
+
+  input:
+    path script_tinHist
+    path ref from reference_dataQC
+    tuple path (bam), path (bai) from dedupBam_dataQC
+    tuple path (chrBam), path (chrBai) from dedupChrBam
+    val ends from endsInfer_dataQC
+
+  output:
+    path "${repRID}.tin.hist.tsv" into tinHist
+    path "${repRID}.tin.med.csv" into inferMetadata_tinMed
+    path "${repRID}.insertSize.inner_distance_freq.txt" into innerDistance
+
+  script:
+    """
+    hostname > ${repRID}.dataQC.log
+    ulimit -a >> ${repRID}.dataQC.log
+
+    # calculate TIN values per feature on each chromosome
+    echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > ${repRID}.sorted.deduped.tin.xls
+    for i in `cat ./bed/genome.bed | cut -f1 | sort | uniq`; do
+      echo "echo \"LOG: running tin.py on \${i}\" >> ${repRID}.dataQC.log; tin.py -i ${repRID}.sorted.deduped.\${i}.bam -r ./bed/genome.bed; cat ${repRID}.sorted.deduped.\${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \\\"\\\\t\${i}\\\\t\\\";";
+    done | parallel -j `nproc` -k 1>> ${repRID}.sorted.deduped.tin.xls
+
+    # bin TIN values
+    echo -e "LOG: binning TINs" >> ${repRID}.dataQC.log
+    python3 ${script_tinHist} -r ${repRID}
+    echo -e "LOG: binned" >> ${repRID}.dataQC.log
+
+    # calculate inner-distances for PE data
+    if [ "${ends}" == "pe" ]
+    then
+      echo -e "LOG: calculating inner distances for ${ends}" >> ${repRID}.dataQC.log
+      inner_distance.py -i "${bam}" -o ${repRID}.insertSize -r ./bed/genome.bed
+      echo -e "LOG: calculated" >> ${repRID}.dataQC.log
+    elif [ "${ends}" == "se" ]
+    then
+      echo -e "LOG: creating dummy inner distance file for ${ends}" >> ${repRID}.dataQC.log
+      touch ${repRID}.insertSize.inner_distance_freq.txt
+    fi
+    """
+}
+
+// Extract median TIN metadata into channel
+tinMedInfer = Channel.create()
+inferMetadata_tinMed.splitCsv(sep: ",", header: false).separate(
+  tinMedInfer
+)
+
+/*
+ *aggrQC: aggregate QC from processes as well as metadata and run MultiQC
+*/
+process aggrQC {
+  tag "${repRID}"
+  publishDir "${outDir}/report", mode: 'copy', pattern: "${repRID}.multiqc.html"
+  publishDir "${outDir}/qc", mode: 'copy', pattern: "${repRID}.multiqc_data.json"
+
+  input:
+    path multiqcConfig
+    path bicfLogo
+    path fastqc
+    path trimQC
+    path alignQC
+    path dedupQC
+    path countsQC
+    path innerDistance
+    path tinHist
+    path alignSampleQCs from alignSampleQC_aggrQC.collect()
+    path inferExperiment
+    val endsManual from endsManual_aggrQC
+    val endsM from endsMeta
+    val strandedM from strandedMeta
+    val spikeM from spikeMeta
+    val speciesM from speciesMeta
+    val endsI from endsInfer_aggrQC
+    val strandedI from strandedInfer_aggrQC
+    val spikeI from spikeInfer_aggrQC
+    val speciesI from speciesInfer_aggrQC
+    val readLengthM from readLengthMeta
+    val readLengthI from readLengthInfer
+    val rawReadsI from rawReadsInfer
+    val assignedReadsI from assignedReadsInfer
+    val tinMedI from tinMedInfer
+    val expRID
+    val studyRID
+
+  output:
+    path "${repRID}.multiqc.html" into multiqc
+    path "${repRID}.multiqc_data.json" into multiqcJSON
+
+  script:
+    """
+    hostname > ${repRID}.aggrQC.log
+    ulimit -a >> ${repRID}.aggrQC.log
+
+    # make RID table
+    echo -e "LOG: creating RID table" >> ${repRID}.aggrQC.log
+    echo -e "Replicate RID\tExperiment RID\tStudy RID" > rid.tsv
+    echo -e "${repRID}\t${expRID}\t${studyRID}" >> rid.tsv
+
+    # make metadata table
+    echo -e "LOG: creating metadata table" >> ${repRID}.aggrQC.log
+    echo -e "Source\tSpecies\tEnds\tStranded\tSpike-in\tRaw Reads\tAssigned Reads\tMedian Read Length\tMedian TIN" > metadata.tsv
+    echo -e "Submitter\t${speciesM}\t${endsM}\t${strandedM}\t${spikeM}\t-\t-\t'${readLengthM}'\t-" >> metadata.tsv
+    echo -e "Inferred\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv
+    echo -e "Measured\t-\t${endsManual}\t-\t-\t'${rawReadsI}'\t'${assignedReadsI}'\t'${readLengthI}'\t'${tinMedI}'" >> metadata.tsv
+
+    # remove inner distance report if it is empty (SE repRID)
+    echo -e "LOG: removing dummy inner distance file" >> ${repRID}.aggrQC.log
+    if [ "${endsI}" == "se" ]
+    then
+      rm -f ${innerDistance}
+    fi
+
+    # run MultiQC
+    echo -e "LOG: running multiqc" >> ${repRID}.aggrQC.log
+    multiqc -c ${multiqcConfig} . 
-n ${repRID}.multiqc.html + cp ${repRID}.multiqc_data/multiqc_data.json ${repRID}.multiqc_data.json + """ +} \ No newline at end of file diff --git a/workflow/scripts/bdbagFetch.sh b/workflow/scripts/bdbagFetch.sh index 28dab3f5338b3b6371b2b8f4ee7ac6bf2e715fa6..606b88397d5a6cf4feb4aa38d7615e3e3ba48735 100644 --- a/workflow/scripts/bdbagFetch.sh +++ b/workflow/scripts/bdbagFetch.sh @@ -1,3 +1,14 @@ -#!/bin +#!/bin/bash -bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz $1 \ No newline at end of file +if [ -z "${3}" ] +then + bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz ${1} + for i in $(find */ -name "*R*.fastq.gz") + do + path=${2}.$(echo ${i##*/} | grep -o "R[1,2].fastq.gz") + cp ${i} ./${path} + done +elif [ "${3}" == "TEST" ] +then + bdbag --resolve-fetch all --fetch-filter filename\$*.txt ${1} +fi diff --git a/workflow/scripts/calculateTPM.R b/workflow/scripts/calculateTPM.R new file mode 100644 index 0000000000000000000000000000000000000000..d4851d4806c5e12e06416a103f2e6972a8b4befe --- /dev/null +++ b/workflow/scripts/calculateTPM.R @@ -0,0 +1,30 @@ +gc() +library(optparse) + +option_list=list( + make_option("--count",action="store",type='character',help="Count File") +) +opt=parse_args(OptionParser(option_list=option_list)) +rm(option_list) + +if (!("count" %in% names(opt))){ + stop("No count file passed, exiting.") +} else if (!file.exists(opt$count)) { + stop("Count file doesn't exist, exiting.") +} + +repRID <- basename(gsub(".featureCounts","",opt$count)) + +count <- read.delim(opt$count, comment.char="#") # if featureCounts file changes structure, be sure to update count and Length columns below +colnames(count)[7] <- "count" + +rpk <- count$count/count$Length/1000 + +scale <- sum(rpk)/1000000 + +tpm <- rpk/scale + +output <- cbind(count,tpm) +colnames(output)[7] <- "count" + +write.table(output,file=paste0(repRID,".countTable.csv"),sep=",",row.names=FALSE,quote=FALSE) diff --git a/workflow/scripts/convertGeneSymbols.R b/workflow/scripts/convertGeneSymbols.R new file mode 100644 index 0000000000000000000000000000000000000000..7b05b92e5d251050a78e7f546aa8b042d994c623 --- /dev/null +++ b/workflow/scripts/convertGeneSymbols.R @@ -0,0 +1,24 @@ +gc() +library(optparse) + +option_list=list( + make_option("--repRID",action="store",type='character',help="Replicate RID") +) +opt=parse_args(OptionParser(option_list=option_list)) +rm(option_list) + +countTable <- read.csv(paste0(opt$repRID,".countData.countTable.csv"), stringsAsFactors=FALSE) +geneID <- read.delim("geneID.tsv", header=FALSE, stringsAsFactors=FALSE) +Entrez <- read.delim("Entrez.tsv", header=FALSE, stringsAsFactors=FALSE) + +convert <- data.frame(geneID=countTable$Geneid) +convert <- merge(x=convert,y=geneID[,1:2],by.x="geneID",by.y="V2",all.x=TRUE) +convert <- merge(x=convert,y=Entrez,by.x="V1",by.y="V1",all.x=TRUE) +convert[is.na(convert$V2),3] <- "" +convert <- convert[,-1] +colnames(convert) <- c("GeneID","EntrezID") +convert <- unique(convert) + +output <- merge(x=convert,y=countTable,by.x="GeneID",by.y="Geneid",all.x=TRUE) + +write.table(output,file=paste0(opt$repRID,".tpmTable.csv"),sep=",",row.names=FALSE,quote=FALSE) \ No newline at end of file diff --git a/workflow/scripts/inferMeta.sh b/workflow/scripts/inferMeta.sh new file mode 100644 index 0000000000000000000000000000000000000000..a8c9839ae2328cbed3dd39b9f2be427be12722ba --- /dev/null +++ b/workflow/scripts/inferMeta.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +if [ "${1}" == "endness" ] +then + awk '/Data/ {print}' "${2}" | sed -e 's/^This 
is //' -e 's/ Data$//'
+elif [ "${1}" == "fail" ]
+then
+  awk '/Fraction of reads failed/ {print}' "${2}" | sed -e 's/^Fraction of reads failed to determine: //'
+elif [ "${1}" == "sef" ]
+then
+  awk '/\+\+,--/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "++,--": //'
+elif [ "${1}" == "ser" ]
+then
+  awk '/\+-,-\+/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "+-,-+": //'
+elif [ "${1}" == "pef" ]
+then
+  awk '/1\+\+,1--,2\+-,2-\+/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "1++,1--,2+-,2-+": //'
+elif [ "${1}" == "per" ]
+then
+  awk '/1\+-,1-\+,2\+\+,2--/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "1+-,1-+,2++,2--": //'
+fi
\ No newline at end of file
diff --git a/workflow/scripts/modifyFetch.py b/workflow/scripts/modifyFetch.py
deleted file mode 100644
index 8a330e539054c8592363bd84bb4e6a0871b750f4..0000000000000000000000000000000000000000
--- a/workflow/scripts/modifyFetch.py
+++ /dev/null
@@ -1,17 +0,0 @@
-import argparse
-import pandas as pd
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('-f', '--fetchFile',help="The fetch file from bdgap.zip.",required=True)
-    args = parser.parse_args()
-    return args
-
-def main():
-    args = get_args()
-    fetch = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None)
-    fetch_filtered = fetch[fetch[2].str[-9:]==".fastq.gz"]
-    fetch_filtered.to_csv(args.fetchFile+"/fetch.txt",sep="\t",header=False,index=False)
-
-if __name__ == '__main__':
-    main()
\ No newline at end of file
diff --git a/workflow/scripts/parseMeta.py b/workflow/scripts/parseMeta.py
new file mode 100644
index 0000000000000000000000000000000000000000..500054264f7c8c50310da92d9995f432930318d3
--- /dev/null
+++ b/workflow/scripts/parseMeta.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+
+import argparse
+import pandas as pd
+import warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-r', '--repRID',help="The replicate RID.",required=True)
+    parser.add_argument('-m', '--metaFile',help="The metadata file to extract.",required=True)
+    parser.add_argument('-p', '--parameter',help="The parameter to extract.",required=True)
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = get_args()
+    metaFile = pd.read_csv(args.metaFile,sep=",",header=0)
+
+    # Check replicate RID metadata from 'File.csv'
+    if (args.parameter == "repRID"):
+        if (len(metaFile.Replicate_RID.unique()) > 1):
+            print("There are multiple replicate RID's in the metadata: " + " ".join(metaFile.Replicate_RID.unique()))
+            exit(1)
+        if not (metaFile.Replicate_RID.unique() == args.repRID):
+            print("Replicate RID in metadata does not match run parameters: " + metaFile.Replicate_RID.unique() + " vs " + args.repRID)
+            exit(1)
+        else:
+            rep=metaFile["Replicate_RID"].unique()[0]
+            print(rep)
+        if (len(metaFile[metaFile["File_Type"] == "FastQ"]) > 2):
+            print("There are more than 2 fastq's in the metadata: " + " ".join(metaFile[metaFile["File_Type"] == "FastQ"].RID))
+            exit(1)
+
+    # Check experiment RID metadata from 'Experiment.csv'
+    if (args.parameter == "expRID"):
+        if (len(metaFile.Experiment_RID.unique()) > 1):
+            print("There are multiple experiment RID's in the metadata: " + " ".join(metaFile.Experiment_RID.unique()))
+            exit(1)
+        else:
+            exp=metaFile["Experiment_RID"].unique()[0]
+            print(exp)
+
+    # Check study RID metadata from 'Experiment.csv'
+    if (args.parameter == "studyRID"):
+        if (len(metaFile.Study_RID.unique()) > 1):
+            print("There are multiple study RID's in the metadata: " + " ".join(metaFile.Study_RID.unique()))
+            exit(1)
+        else:
+            study=metaFile["Study_RID"].unique()[0]
+            print(study)
+
+    # Get endedness metadata from 'Experiment Settings.csv'
+    if (args.parameter == "endsMeta"):
+        if (metaFile.Paired_End.unique() == "Single End"):
+            endsMeta = "se"
+        elif (metaFile.Paired_End.unique() == "Paired End"):
+            endsMeta = "pe"
+        else:
+            endsMeta = "uk"
+        print(endsMeta)
+
+    # Manually determine endedness from the fastq count in 'File.csv'
+    if (args.parameter == "endsManual"):
+        if (len(metaFile[metaFile["File_Type"] == "FastQ"]) == 1):
+            endsManual = "se"
+        elif (len(metaFile[metaFile["File_Type"] == "FastQ"]) == 2):
+            endsManual = "pe"
+        print(endsManual)
+
+    # Get strandedness metadata from 'Experiment Settings.csv'
+    if (args.parameter == "stranded"):
+        if (metaFile.Has_Strand_Specific_Information.unique() == "yes"):
+            stranded = "stranded"
+        elif (metaFile.Has_Strand_Specific_Information.unique() == "no"):
+            stranded = "unstranded"
+        else:
+            print("Stranded metadata does not match expected options: " + metaFile.Has_Strand_Specific_Information.unique())
+            exit(1)
+        print(stranded)
+
+    # Get spike-in metadata from 'Experiment Settings.csv'
+    if (args.parameter == "spike"):
+        if (metaFile.Used_Spike_Ins.unique() == "yes"):
+            spike = "yes"
+        elif (metaFile.Used_Spike_Ins.unique() == "no"):
+            spike = "no"
+        else:
+            print("Spike-ins metadata does not match expected options: " + metaFile.Used_Spike_Ins.unique())
+            exit(1)
+        print(spike)
+
+    # Get species metadata from 'Experiment.csv'
+    if (args.parameter == "species"):
+        if (metaFile.Species.unique() == "Mus musculus"):
+            species = "Mus musculus"
+        elif (metaFile.Species.unique() == "Homo sapiens"):
+            species = "Homo sapiens"
+        else:
+            print("Species metadata does not match expected options: " + metaFile.Species.unique())
+            exit(1)
+        print(species)
+
+    # Get read length metadata from 'Experiment Settings.csv'
+    if (args.parameter == "readLength"):
+        readLength = metaFile.Read_Length.unique()
+        print(str(readLength).strip('[]'))
+
+if __name__ == '__main__':
+    main()
diff --git a/workflow/scripts/renameFastq.sh b/workflow/scripts/renameFastq.sh
deleted file mode 100644
index f5593766b3a3bd645c3f2c8758d3a20fd354c9be..0000000000000000000000000000000000000000
--- a/workflow/scripts/renameFastq.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin
-
-while read loc checksum fileLocation
-do
-    file=$(echo ${fileLocation##*/})
-    fileName=$(echo ${file%.R*.fastq.gz})
-    fileExt=$(echo ${file##${fileName}.})
-    while IFS="," read RID Study_RID Experiment_RID Replicate_RID Caption File_Type File_Name URI File_size MD5 GEO_Archival_URL dbGaP_Accession_ID Processed Notes Principal_Investigator Consortium Release_Date RCT RMT Legacy_File_RID GUDMAP_NGF_OID GUDMAP_NGS_OID
-    do
-        if [ ${file} == ${File_Name} ]
-        then
-            find . -type f -name ${file} -execdir mv {} ${Replicate_RID}.${fileExt} ';'
-        fi
-    done < $1/data/File.csv
-done < $1/fetch.txt
\ No newline at end of file
diff --git a/workflow/scripts/splitStudy.py b/workflow/scripts/splitStudy.py
new file mode 100644
index 0000000000000000000000000000000000000000..82ffc2881857dd5d1d27eee5ea6a381b02d0e9f5
--- /dev/null
+++ b/workflow/scripts/splitStudy.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python3
+
+import argparse
+import pandas as pd
+import warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-s', '--studyRID',help="The study RID.",required=True)
+    args = parser.parse_args()
+    return args
+
+def main():
+    args = get_args()
+    studyRID=pd.read_json(args.studyRID+"_studyRID.json")
+    if studyRID["RID"].count() > 0:
+        studyRID["RID"].to_csv(args.studyRID+"_studyRID.csv",header=False,index=False)
+    else:
+        raise Exception("No associated replicates found: %s" %
+            studyRID)
+
+if __name__ == '__main__':
+    main()
diff --git a/workflow/scripts/splitStudy.sh b/workflow/scripts/splitStudy.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a64b6d9e4cde818d1c6f91fd84144b821febc536
--- /dev/null
+++ b/workflow/scripts/splitStudy.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# query GUDMAP/RBK for study RID
+echo "curl --location --request GET 'https://www.gudmap.org/ermrest/catalog/2/entity/RNASeq:Replicate/Study_RID="${1}"'" | bash > $1_studyRID.json
+
+# extract replicate RIDs
+module load python/3.6.4-anaconda
+python3 ./workflow/scripts/splitStudy.py -s $1
+
+# run pipeline on replicate RIDs in parallel
+module load nextflow/20.01.0
+module load singularity/3.5.3
+while read repRID; do echo ${repRID}; done < "$1_studyRID.csv" | xargs -P 5 -I {} nextflow run workflow/rna-seq.nf --repRID {}
+
+# cleanup study RID files
+rm $1_studyRID.json
+rm $1_studyRID.csv
diff --git a/workflow/scripts/tinHist.py b/workflow/scripts/tinHist.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d292c2eb8cadb3b16466c6b19d0574184d439d7
--- /dev/null
+++ b/workflow/scripts/tinHist.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+
+import argparse
+import pandas as pd
+import numpy as np
+import warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-r', '--repRID',help="The replicate RID.",required=True)
+    args = parser.parse_args()
+    return args
+
+def main():
+    args = get_args()
+    tin = pd.read_csv(args.repRID + '.sorted.deduped.tin.xls',sep="\t",header=0)
+
+    hist = pd.cut(tin['TIN'],bins=pd.interval_range(start=0,freq=10,end=100,closed='right')).value_counts(sort=False)
+    labels = ["{0} - {1}".format(i, i + 9) for i in range(1, 100, 10)]
+    #labels[0] = '0 - 10'
+    binned = tin.assign(Bins=lambda x: pd.cut(tin['TIN'],range(0,105,10),labels=labels,include_lowest=False,right=True))
+    # zero-pad single-digit chromosome names so columns sort naturally
+    binned['chrom'] = binned['chrom'].replace('chr1','chr01')
+    binned['chrom'] = binned['chrom'].replace('chr2','chr02')
+    binned['chrom'] = binned['chrom'].replace('chr3','chr03')
+    binned['chrom'] = binned['chrom'].replace('chr4','chr04')
+    binned['chrom'] = binned['chrom'].replace('chr5','chr05')
+    binned['chrom'] = binned['chrom'].replace('chr6','chr06')
+    binned['chrom'] = binned['chrom'].replace('chr7','chr07')
+    binned['chrom'] = binned['chrom'].replace('chr8','chr08')
+    binned['chrom'] = binned['chrom'].replace('chr9','chr09')
+    hist = pd.pivot_table(binned, 
values='geneID', index = 'Bins', columns = 'chrom', aggfunc=np.size) + hist['TOTAL'] = hist.sum(axis=1) + hist = hist[['TOTAL'] + [ i for i in hist.columns if i != 'TOTAL']] + hist = hist.T.fillna(0.0).astype(int) + #hist = hist.apply(lambda x: x/x.sum()*100, axis=1) + hist.to_csv(args.repRID + '.tin.hist.tsv',sep='\t') + medFile = open(args.repRID + '.tin.med.csv',"w") + medFile.write(str(round(tin['TIN'][(tin['TIN']!=0)].median(),2))) + medFile.close() + +if __name__ == '__main__': + main() diff --git a/workflow/scripts/utils.py b/workflow/scripts/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5e4478e3b1cef96e9b4c05033f75122f87aad06e --- /dev/null +++ b/workflow/scripts/utils.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 + +# +# * -------------------------------------------------------------------------- +# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md) +# * -------------------------------------------------------------------------- +# + +'''General utilities.''' + + +import shlex +import logging +import subprocess +import sys +import os + + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = True + + +def run_pipe(steps, outfile=None): + from subprocess import Popen, PIPE + p = None + p_next = None + first_step_n = 1 + last_step_n = len(steps) + for n, step in enumerate(steps, start=first_step_n): + logger.debug("step %d: %s" % (n, step)) + if n == first_step_n: + if n == last_step_n and outfile: # one-step pipeline with outfile + with open(outfile, 'w') as fh: + print("one step shlex: %s to file: %s" % (shlex.split(step), outfile)) + p = Popen(shlex.split(step), stdout=fh) + break + print("first step shlex to stdout: %s" % (shlex.split(step))) + + p = Popen(shlex.split(step), stdout=PIPE) + elif n == last_step_n and outfile: # only treat the last step specially if you're sending stdout to a file + with open(outfile, 'w') as fh: + print("last step shlex: %s to file: %s" % (shlex.split(step), outfile)) + p_last = Popen(shlex.split(step), stdin=p.stdout, stdout=fh) + p.stdout.close() + p = p_last + else: # handles intermediate steps and, in the case of a pipe to stdout, the last step + print("intermediate step %d shlex to stdout: %s" % (n, shlex.split(step))) + p_next = Popen(shlex.split(step), stdin=p.stdout, stdout=PIPE) + p.stdout.close() + p = p_next + out, err = p.communicate() + return out, err + + +def block_on(command): + process = subprocess.Popen(shlex.split(command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE) + for line in iter(process.stdout.readline, b''): + sys.stdout.write(line.decode('utf-8')) + process.communicate() + return process.returncode + + +def strip_extensions(filename, extensions): + '''Strips extensions to get basename of file.''' + + basename = filename + for extension in extensions: + basename = basename.rpartition(extension)[0] or basename + + return basename + + +def count_lines(filename): + import mimetypes + compressed_mimetypes = [ + "compress", + "bzip2", + "gzip" + ] + mime_type = mimetypes.guess_type(filename)[1] + if mime_type in compressed_mimetypes: + catcommand = 'gzip -dc' + else: + catcommand = 'cat' + out, err = run_pipe([ + '%s %s' % (catcommand, filename), + 'wc -l' + ]) + return int(out) + + +def rescale_scores(filename, scores_col, new_min=10, new_max=1000): + sorted_fn = 'sorted-%s' % (filename) + rescaled_fn = 'rescaled-%s' % (filename) + + out, err = run_pipe([ + 'sort -k %dgr,%dgr %s' % (scores_col, 
scores_col, filename), + r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (NF != 0) print $0}'"""], + sorted_fn) + + out, err = run_pipe([ + 'head -n 1 %s' % (sorted_fn), + 'cut -f %s' % (scores_col)]) + max_score = float(out.strip()) + logger.info("rescale_scores: max_score = %s", max_score) + + out, err = run_pipe([ + 'tail -n 1 %s' % (sorted_fn), + 'cut -f %s' % (scores_col)]) + min_score = float(out.strip()) + logger.info("rescale_scores: min_score = %s", min_score) + + a = min_score + b = max_score + x = new_min + y = new_max + + if min_score == max_score: # give all peaks new_min + rescale_formula = "x" + else: # n is the unscaled score from scores_col + rescale_formula = "((n-a)*(y-x)/(b-a))+x" + + out, err = run_pipe( + [ + 'cat %s' % (sorted_fn), + r"""awk 'BEGIN{OFS="\t"}{n=$%d;a=%d;b=%d;x=%d;y=%d}""" + % (scores_col, a, b, x, y) + + r"""{$%d=int(%s) ; print $0}'""" + % (scores_col, rescale_formula) + ], + rescaled_fn) + + os.remove(sorted_fn) + + return rescaled_fn diff --git a/workflow/tests/test_alignReads.py b/workflow/tests/test_alignReads.py new file mode 100644 index 0000000000000000000000000000000000000000..eae8780f5641404ac9caabe32089369a3b0b1ea2 --- /dev/null +++ b/workflow/tests/test_alignReads.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 + +import pytest +import pandas as pd +import os +import utils + +data_output_path = os.path.dirname(os.path.abspath(__file__)) + \ + '/../../' + + +@pytest.mark.alignData +def test_alignData_se(): + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.unal.gz')) + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.bam')) + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.bam.bai')) + + +@pytest.mark.alignData +def test_alignData_pe(): + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.pe.unal.gz')) + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.pe.sorted.bam')) + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.pe.sorted.bam.bai')) diff --git a/workflow/tests/test_consistency.py b/workflow/tests/test_consistency.py new file mode 100644 index 0000000000000000000000000000000000000000..073b12826b798ac94d16fda4291dfba2c1a42203 --- /dev/null +++ b/workflow/tests/test_consistency.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 + +import pytest +import pandas as pd +from io import StringIO +import os + +test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ + '/../../' + +@pytest.mark.consistencySE +def test_consistencySE(): + assert os.path.exists(os.path.join(test_output_path, 'SE_multiqc_data.json')) + assert readAssigned("assignedSE.txt","assignedExpectSE.txt") + +@pytest.mark.consistencyPE +def test_consistencyPE(): + assert os.path.exists(os.path.join(test_output_path, 'PE_multiqc_data.json')) + assert readAssigned("assignedPE.txt","assignedExpectPE.txt") + +def readAssigned(fileAssigned,fileExpectAssigned): + data = False + assigned = open(fileAssigned, "r") + expect = open(fileExpectAssigned, "r") + lineAssigned = assigned.readline() + lineExpect = expect.readline() + if int(lineAssigned.strip()) < (int(lineExpect.strip())+(int(lineExpect.strip())*0.00001)) and int(lineAssigned.strip()) > (int(lineExpect.strip())-(int(lineExpect.strip())*0.00001)): + data = True + + return data diff --git a/workflow/tests/test_dataQC.py b/workflow/tests/test_dataQC.py new file mode 100644 index 0000000000000000000000000000000000000000..e77d4680fd8eac61c3a9b9a8fd175136a61244b9 --- /dev/null +++ b/workflow/tests/test_dataQC.py @@ -0,0 +1,23 @@ 
diff --git a/workflow/tests/test_dataQC.py b/workflow/tests/test_dataQC.py
new file mode 100644
index 0000000000000000000000000000000000000000..e77d4680fd8eac61c3a9b9a8fd175136a61244b9
--- /dev/null
+++ b/workflow/tests/test_dataQC.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+@pytest.mark.dataQC
+def test_dataQC():
+    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5F6_1M.se.sorted.deduped.tin.xls'))
+    assert countLines(os.path.join(test_output_path, 'Q-Y5F6_1M.se.sorted.deduped.tin.xls'))
+
+def countLines(fileName):
+    # The TIN table should carry data rows past its header: line 8 must exist
+    # and must not be a repeated 'geneID' header line.
+    with open(fileName, "r") as file:
+        lines = file.readlines()
+
+    return len(lines) > 7 and not lines[7].startswith("geneID")
diff --git a/workflow/tests/test_dedupReads.py b/workflow/tests/test_dedupReads.py
new file mode 100644
index 0000000000000000000000000000000000000000..49cf420e2f0e2de923ae580c51dcdf42549f0713
--- /dev/null
+++ b/workflow/tests/test_dedupReads.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+import os
+import utils
+
+data_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+
+@pytest.mark.dedupData
+def test_dedupData():
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.bam'))
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.bam.bai'))
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr8.bam'))
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr8.bam.bai'))
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr4.bam'))
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr4.bam.bai'))
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chrY.bam'))
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chrY.bam.bai'))
diff --git a/workflow/tests/test_downsampleData.py b/workflow/tests/test_downsampleData.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd42c49e169e387dd9662903b20866c40aec8907
--- /dev/null
+++ b/workflow/tests/test_downsampleData.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+@pytest.mark.downsampleData
+def test_downsampleData():
+    assert os.path.exists(os.path.join(test_output_path, 'sampled.1.fq'))
\ No newline at end of file
diff --git a/workflow/tests/test_fastqc.py b/workflow/tests/test_fastqc.py
new file mode 100644
index 0000000000000000000000000000000000000000..89303fe78e081a26fd6a8ea633997dffb8f920a6
--- /dev/null
+++ b/workflow/tests/test_fastqc.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+@pytest.mark.fastqc
+def test_fastqc():
+    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5F6_1M.R1_fastqc.zip'))
diff --git a/workflow/tests/test_getBag.py b/workflow/tests/test_getBag.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c63c9d95ac33aceeaa965852ede7bfc5e86bdc7
--- /dev/null
+++ b/workflow/tests/test_getBag.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+@pytest.mark.getBag
+def test_getBag():
+    assert os.path.exists(os.path.join(test_output_path, 'Replicate_Q-Y5F6.zip'))
diff --git a/workflow/tests/test_getData.py b/workflow/tests/test_getData.py
new file mode 100644
index 0000000000000000000000000000000000000000..a14be93ea8103aadf44aca3156ee28036cb6113e
--- /dev/null
+++ b/workflow/tests/test_getData.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+@pytest.mark.getData
+def test_getData():
+    assert os.path.exists(os.path.join(test_output_path, 'Replicate_Q-Y5F6/bagit.txt'))
+    assert os.path.exists(os.path.join(test_output_path, 'Replicate_Q-Y5F6/data/assets/Study/Q-Y4GY/Experiment/Q-Y4DP/Replicate/Q-Y5F6/mMARIS_Six2-#3.gene.rpkm.txt'))
diff --git a/workflow/tests/test_inferMetadata.py b/workflow/tests/test_inferMetadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..518664ced5ab4dd4e713dede9c58b9d87594799a
--- /dev/null
+++ b/workflow/tests/test_inferMetadata.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+@pytest.mark.inferMetadata
+def test_inferMetadata():
+    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5F6_1M.se.inferMetadata.log'))
+
diff --git a/workflow/tests/test_makeBigWig.py b/workflow/tests/test_makeBigWig.py
new file mode 100644
index 0000000000000000000000000000000000000000..9292ac64714017fde8371927cee616374a457b3a
--- /dev/null
+++ b/workflow/tests/test_makeBigWig.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+import os
+import utils
+
+data_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+
+@pytest.mark.makeBigWig
+def test_makeBigWig():
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.bw'))
diff --git a/workflow/tests/test_makeFeatureCounts.py b/workflow/tests/test_makeFeatureCounts.py
new file mode 100644
index 0000000000000000000000000000000000000000..d741a2f4ac0f33fe15af8f13300c95002a00a887
--- /dev/null
+++ b/workflow/tests/test_makeFeatureCounts.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+import os
+import utils
+
+data_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+
+@pytest.mark.makeFeatureCounts
+def test_makeFeatureCounts():
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.featureCounts'))
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.countTable.csv'))
diff --git a/workflow/tests/test_parseMetadata.py b/workflow/tests/test_parseMetadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..59677bbba7d40058bdeb78ccceeeeddba4565a14
--- /dev/null
+++ b/workflow/tests/test_parseMetadata.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+@pytest.mark.parseMetadata
+def test_parseMetadata():
+    assert os.path.exists(os.path.join(test_output_path, 'design.csv'))
+    assert readLine(os.path.join(test_output_path, 'design.csv'))
+
+def readLine(fileName):
+    # The design file's first line must match the expected metadata exactly.
+    with open(fileName, "r") as file:
+        line = file.readline()
+
+    return line.strip() == "uk,se,unstranded,no,Homo sapiens,75,Experiment_RID,Study_RID,Replicate_RID"
diff --git a/workflow/tests/test_trimData.py b/workflow/tests/test_trimData.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba0eeda481262647abc9e4a8bf362515ac0dc0e7
--- /dev/null
+++ b/workflow/tests/test_trimData.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+@pytest.mark.trimData
+def test_trimData_se():
+    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5F6_1M.se_trimmed.fq.gz'))
+
+
+@pytest.mark.trimData
+def test_trimData_pe():
+    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5F6_1M.pe_R1_val_1.fq.gz'))
+    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5F6_1M.pe_R2_val_2.fq.gz'))
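Finally, a minimal sketch of driving one of these marked test groups programmatically; the path is assumed relative to the repository root, and the command-line equivalent is "pytest -v -m trimData workflow/tests/test_trimData.py".

#!/usr/bin/env python3
# Sketch only: select tests by their pytest mark.
import sys
import pytest

# '-m trimData' collects only tests decorated with @pytest.mark.trimData.
sys.exit(pytest.main(['-v', '-m', 'trimData', 'workflow/tests/test_trimData.py']))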