Commit 0a0b51e4 authored by Gervaise Henry

Merge branch '78-tool_version' into develop

parents f232c58b df286d4f
Part of merge requests !58 (Develop) and !48 (Resolve "Output tool version to report in multiqc")
@@ -9,6 +9,7 @@ before_script:
stages:
  - unit
  - aggregation
  - reference
  - integration
  - consistency
@@ -22,8 +23,15 @@ getBag:
    - merge_requests
  script:
    - ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json
    - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' deriva-download-cli --version > version_deriva.txt
    - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' deriva-download-cli dev.gudmap.org --catalog 2 ./workflow/conf/replicate_export_config.json . rid=Q-Y5F6
    - pytest -m getBag
  artifacts:
    name: "$CI_JOB_NAME"
    when: always
    paths:
      - version_deriva.txt
    expire_in: 7 days
getData:
  stage: unit
@@ -33,10 +41,17 @@ getData:
  except:
    - merge_requests
  script:
    - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bdbag --version > version_bdbag.txt
    - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt
    - unzip ./test_data/bag/Replicate_Q-Y5F6.zip
    - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bash ./workflow/scripts/bdbagFetch.sh Replicate_Q-Y5F6 Replicate_Q-Y5F6 TEST
    - pytest -m getData
  artifacts:
    name: "$CI_JOB_NAME"
    when: always
    paths:
      - version_bdbag.txt
    expire_in: 7 days
parseMetadata:
  stage: unit
@@ -46,6 +61,7 @@ parseMetadata:
  except:
    - merge_requests
  script:
    - singularity run 'docker://bicf/python3:2.0.1_indev' python3 --version > version_python.txt
    - rep=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p repRID)
    - exp=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p expRID)
    - study=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p studyRID)
@@ -57,6 +73,12 @@ parseMetadata:
    - readLength=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.stageNew.csv" -p readLength)
    - echo -e "${endsMeta},${endsManual},${stranded},${spike},${species},${readLength},${exp},${study},${rep}" > design.csv
    - pytest -m parseMetadata
  artifacts:
    name: "$CI_JOB_NAME"
    when: always
    paths:
      - version_python.txt
    expire_in: 7 days
inferMetadata:
  stage: unit
@@ -66,6 +88,7 @@ inferMetadata:
  except:
    - merge_requests
  script:
    - singularity run 'docker://bicf/rseqc3.0:2.0.1_indev' infer_experiment.py --version > version_rseqc.txt
    - >
      align=$(echo $(grep "Overall alignment rate" ./test_data/meta/Q-Y5F6_1M.se.alignSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) &&
      if [[ ${align} == "" ]]; then exit 1; fi
@@ -74,6 +97,12 @@ inferMetadata:
      ended=`singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/inferMeta.sh endness Q-Y5F6_1M.se.inferMetadata.log` &&
      if [[ ${ended} == "" ]]; then exit 1; fi
    - pytest -m inferMetadata
  artifacts:
    name: "$CI_JOB_NAME"
    when: always
    paths:
      - version_rseqc.txt
    expire_in: 7 days
trimData:
  stage: unit
@@ -83,11 +112,18 @@ trimData:
  except:
    - merge_requests
  script:
    - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --version > version_trimgalore.txt
    - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --basename Q-Y5F6_1M.se ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
    - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --paired --basename Q-Y5F6_1M.pe ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
    - readLengthSE=$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
    - readLengthPE=$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
    - pytest -m trimData
  artifacts:
    name: "$CI_JOB_NAME"
    when: always
    paths:
      - version_trimgalore.txt
    expire_in: 7 days
downsampleData:
  stage: unit
@@ -100,6 +136,7 @@ downsampleData:
    - singularity run 'docker://bicf/seqtk:2.0.1_indev' seqtk sample -s100 ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz 1000 1> sampled.1.fq
    - pytest -m downsampleData

alignData:
  stage: unit
  only:
@@ -108,6 +145,8 @@ alignData:
  except:
    - merge_requests
  script:
    - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' hisat2 --version > version_hisat2.txt
    - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools --version > version_samtools.txt
    - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.se.unal.gz -S Q-Y5F6_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz --summary-file Q-Y5F6_1M.se.alignSummary.txt --new-summary
    - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam
    - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.bam
@@ -117,6 +156,14 @@ alignData:
    - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.bam
    - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools index -@ 20 -b Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.sorted.bam.bai
    - pytest -m alignData
  artifacts:
    name: "$CI_JOB_NAME"
    when: always
    paths:
      - version_hisat2.txt
      - version_samtools.txt
    expire_in: 7 days
dedupData:
  stage: unit
@@ -126,6 +173,8 @@ dedupData:
  except:
    - merge_requests
  script:
    - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools --version > version_samtools.txt
    - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates --version 2> version_markdups.txt&
    - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5F6_1M.se.sorted.bam O=Q-Y5F6_1M.se.deduped.bam M=Q-Y5F6_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true
    - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam ./test_data/bam/small/Q-Y5F6_1M.se.deduped.bam
    - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools index -@ 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.sorted.deduped.bam.bai
@@ -134,6 +183,13 @@ dedupData:
      echo "samtools view -b Q-Y5F6_1M.se.sorted.deduped.bam ${i} > Q-Y5F6_1M.se.sorted.deduped.${i}.bam; samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.deduped.${i}.bam Q-Y5F6_1M.se.sorted.deduped.${i}.bam.bai;";
      done | singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' parallel -j 20 -k
    - pytest -m dedupData
  artifacts:
    name: "$CI_JOB_NAME"
    when: always
    paths:
      - version_markdups.txt
      - version_samtools.txt
    expire_in: 7 days
countData:
  stage: unit
@@ -149,7 +205,16 @@ countData:
    - singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count ./test_data/counts/small/Q-Y5F6_1M.se.countData
    - singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se
    - assignedReads=$(grep -m 1 'Assigned' *.summary | grep -oe '\([0-9.]*\)')
    - singularity run 'docker://bicf/subread2:2.0.0' featureCounts -v &> version_featurecounts.txt
    - singularity run 'docker://bicf/subread2:2.0.0' R --version > version_r.txt
    - pytest -m makeFeatureCounts
  artifacts:
    name: "$CI_JOB_NAME"
    when: always
    paths:
      - version_featurecounts.txt
      - version_r.txt
    expire_in: 7 days
makeBigWig:
  stage: unit
@@ -159,8 +224,15 @@ makeBigWig:
  except:
    - merge_requests
  script:
    - singularity run 'docker://bicf/deeptools3.3:2.0.1_indev' deeptools --version > version_deeptools.txt
    - singularity run 'docker://bicf/deeptools3.3:2.0.1_indev' bamCoverage -p 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -o Q-Y5F6_1M.se.bw
    - pytest -m makeBigWig
  artifacts:
    name: "$CI_JOB_NAME"
    when: always
    paths:
      - version_deeptools.txt
    expire_in: 7 days
fastqc:
  stage: unit
@@ -170,8 +242,16 @@ fastqc:
  except:
    - merge_requests
  script:
    - singularity run 'docker://bicf/fastqc:2.0.1_indev' fastqc --version > version_fastqc.txt
    - singularity run 'docker://bicf/fastqc:2.0.1_indev' fastqc ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o .
    - pytest -m fastqc
  artifacts:
    name: "$CI_JOB_NAME"
    when: always
    paths:
      - version_fastqc.txt
    expire_in: 7 days
dataQC:
  stage: unit
@@ -199,6 +279,26 @@ outputBag:
    - pytest -m outputBag

generateVersions:
  stage: aggregation
  only:
    - push
    - tags
  except:
    - merge_requests
  script:
    - singularity run 'docker://bicf/multiqc1.8:2.0.1_indev' multiqc --version > version_multiqc.txt
    - python ./workflow/scripts/generate_versions.py -o software_versions
    - python ./workflow/scripts/generate_references.py -r ./docs/references.md -o software_references
  artifacts:
    name: "$CI_JOB_NAME"
    when: always
    paths:
      - software_references_mqc.yaml
      - software_versions_mqc.yaml
    expire_in: 7 days
humanBioHPC:
  stage: reference
  only:
...
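For local debugging, the new generateVersions job above can be approximated outside CI. The following is a minimal sketch, assuming the repository root as the working directory, the scripts' dependencies available locally (numpy, plus pandoc on PATH for generate_references.py), and no particular set of version_*.txt files present; tools whose version file is missing are simply reported as "Not Run" by generate_versions.py.

# Hypothetical local smoke test; not part of the pipeline or the CI configuration.
import subprocess
from pathlib import Path

subprocess.check_call([
    "python", "./workflow/scripts/generate_references.py",
    "-r", "./docs/references.md", "-o", "software_references",
])
subprocess.check_call([
    "python", "./workflow/scripts/generate_versions.py",
    "-o", "software_versions",
])

# The CI job uploads these two files as artifacts for the MultiQC report to consume.
for name in ("software_references_mqc.yaml", "software_versions_mqc.yaml"):
    assert Path(name).is_file(), "{} was not produced".format(name)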
@@ -6,6 +6,8 @@ These are the most common things requested on pull requests.
- [ ] If you've fixed a bug or added code that should be tested, add tests!
- [ ] Documentation in `docs` is updated
- [ ] Replace dag.png with the most recent CI pipeline integrated_pe artifact
- [ ] Replace software_versions_mqc.yaml with the most recent CI pipeline generateVersions artifact
- [ ] Replace software_references_mqc.yaml with the most recent CI pipeline generateVersions artifact
- [ ] `CHANGELOG.md` is updated
- [ ] `README.md` is updated
- [ ] `LICENSE.md` is updated with new contributors
...
@@ -2,6 +2,7 @@
**User Facing**
* Add option to pull references from datahub
* Add option to send email on workflow error, with pipeline error message
* Add versions and paper references of the software used to the report

**Background**
* Remove (comment out) option to pull references from S3
...
docs/dag.png: binary image updated (733 KiB → 767 KiB)
### References
1. **python**:
* Anaconda (Anaconda Software Distribution, [https://anaconda.com](https://anaconda.com))
2. **DERIVA**:
* Bugacov, A., Czajkowski, K., Kesselman, C., Kumar, A., Schuler, R. E. and Tangmunarunkit, H. 2017 Experiences with DERIVA: An Asset Management Platform for Accelerating eScience. IEEE 13th International Conference on e-Science (e-Science), Auckland, 2017, pp. 79-88, doi:[10.1109/eScience.2017.20](https://doi.org/10.1109/eScience.2017.20).
3. **BDBag**:
* D'Arcy, M., Chard, K., Foster, I., Kesselman, C., Madduri, R., Saint, N., & Wagner, R.. 2019. Big Data Bags: A Scalable Packaging Format for Science. Zenodo. doi:[10.5281/zenodo.3338725](http://doi.org/10.5281/zenodo.3338725).
4. **RSeQC**:
* Wang, L., Wang, S., Li, W. 2012 RSeQC: quality control of RNA-seq experiments. Bioinformatics. Aug 15;28(16):2184-5. doi:[10.1093/bioinformatics/bts356](https://doi.org/10.1093/bioinformatics/bts356).
5. **trimgalore**:
* trimgalore [https://github.com/FelixKrueger/TrimGalore](https://github.com/FelixKrueger/TrimGalore)
6. **hisat2**:
* Kim, D., Paggi, J.M., Park, C., Bennett, C., Salzberg, S.L. 2019 Graph-based genome alignment and genotyping with HISAT2 and HISAT-genotype. Nat Biotechnol. Aug;37(8):907-915. doi:[10.1038/s41587-019-0201-4](https://doi.org/10.1038/s41587-019-0201-4).
7. **samtools**:
* Li H., B. Handsaker, A. Wysoker, T. Fennell, J. Ruan, N. Homer, G. Marth, G. Abecasis, R. Durbin, and 1000 Genome Project Data Processing Subgroup. 2009. The Sequence alignment/map (SAM) format and SAMtools. Bioinformatics 25: 2078-9. doi:[10.1093/bioinformatics/btp352](http://dx.doi.org/10.1093/bioinformatics/btp352)
8. **picard**:
* “Picard Toolkit.” 2019. Broad Institute, GitHub Repository. [http://broadinstitute.github.io/picard/](http://broadinstitute.github.io/picard/); Broad Institute
9. **featureCounts**:
* Liao, Y., Smyth, G.K., Shi, W. 2014 featureCounts: an efficient general purpose program for assigning sequence reads to genomic features. Bioinformatics. Apr 1;30(7):923-30. doi:[10.1093/bioinformatics/btt656](https://doi.org/10.1093/bioinformatics/btt656).
10. **R**:
* R Core Team 2014. R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL:[http://www.R-project.org/](http://www.R-project.org/).
11. **deeptools**:
* Ramírez, F., D. P. Ryan, B. Grüning, V. Bhardwaj, F. Kilpert, A. S. Richter, S. Heyne, F. Dündar, and T. Manke. 2016. deepTools2: a next generation web server for deep-sequencing data analysis. Nucleic Acids Research 44: W160-165. doi:[10.1093/nar/gkw257](http://dx.doi.org/10.1093/nar/gkw257)
12. **FastQC**
* FastQC [https://www.bioinformatics.babraham.ac.uk/projects/fastqc/](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)
13. **MultiQC**:
* Ewels P., Magnusson M., Lundin S. and Käller M. 2016. MultiQC: Summarize analysis results for multiple tools and samples in a single report. Bioinformatics 32(19): 3047–3048. doi:[10.1093/bioinformatics/btw354](https://dx.doi.org/10.1093/bioinformatics/btw354)
14. **Nextflow**:
* Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., and Notredame, C. 2017. Nextflow enables reproducible computational workflows. Nature biotechnology, 35(4), 316.
id: 'software_references'
section_name: 'Software References'
description: 'This section describes references for the tools used.'
plot_type: 'html'
data: |
<h3 id="references">References</h3>
<ol style="list-style-type: decimal">
<li><strong>python</strong>:</li>
</ol>
<ul>
<li>Anaconda (Anaconda Software Distribution, <a href="https://anaconda.com" class="uri">https://anaconda.com</a>)</li>
</ul>
<ol start="2" style="list-style-type: decimal">
<li><strong>DERIVA</strong>:</li>
</ol>
<ul>
<li>Bugacov, A., Czajkowski, K., Kesselman, C., Kumar, A., Schuler, R. E. and Tangmunarunkit, H. 2017 Experiences with DERIVA: An Asset Management Platform for Accelerating eScience. IEEE 13th International Conference on e-Science (e-Science), Auckland, 2017, pp. 79-88, doi:<a href="https://doi.org/10.1109/eScience.2017.20">10.1109/eScience.2017.20</a>.</li>
</ul>
<ol start="3" style="list-style-type: decimal">
<li><strong>BDBag</strong>:<br />
</li>
</ol>
<ul>
<li>D'Arcy, M., Chard, K., Foster, I., Kesselman, C., Madduri, R., Saint, N., &amp; Wagner, R.. 2019. Big Data Bags: A Scalable Packaging Format for Science. Zenodo. doi:<a href="http://doi.org/10.5281/zenodo.3338725">10.5281/zenodo.3338725</a>.</li>
</ul>
<ol start="4" style="list-style-type: decimal">
<li><strong>RSeQC</strong>:</li>
</ol>
<ul>
<li>Wang, L., Wang, S., Li, W. 2012 RSeQC: quality control of RNA-seq experiments. Bioinformatics. Aug 15;28(16):2184-5. doi:<a href="https://doi.org/10.1093/bioinformatics/bts356">10.1093/bioinformatics/bts356</a>.</li>
</ul>
<ol start="5" style="list-style-type: decimal">
<li><strong>trimgalore</strong>:</li>
</ol>
<ul>
<li>trimgalore <a href="https://github.com/FelixKrueger/TrimGalore" class="uri">https://github.com/FelixKrueger/TrimGalore</a></li>
</ul>
<ol start="6" style="list-style-type: decimal">
<li><strong>hisat2</strong>:</li>
</ol>
<ul>
<li>Kim, D., Paggi, J.M., Park, C., Bennett, C., Salzberg, S.L. 2019 Graph-based genome alignment and genotyping with HISAT2 and HISAT-genotype. Nat Biotechnol. Aug;37(8):907-915. doi:<a href="https://doi.org/10.1038/s41587-019-0201-4">10.1038/s41587-019-0201-4</a></li>
</ul>
<ol start="7" style="list-style-type: decimal">
<li><strong>samtools</strong>:</li>
</ol>
<ul>
<li>Li H., B. Handsaker, A. Wysoker, T. Fennell, J. Ruan, N. Homer, G. Marth, G. Abecasis, R. Durbin, and 1000 Genome Project Data Processing Subgroup. 2009. The Sequence alignment/map (SAM) format and SAMtools. Bioinformatics 25: 2078-9. doi:<a href="http://dx.doi.org/10.1093/bioinformatics/btp352">10.1093/bioinformatics/btp352</a></li>
</ul>
<ol start="8" style="list-style-type: decimal">
<li><strong>picard</strong>:</li>
</ol>
<ul>
<li>“Picard Toolkit.” 2019. Broad Institute, GitHub Repository. <a href="http://broadinstitute.github.io/picard/" class="uri">http://broadinstitute.github.io/picard/</a>; Broad Institute</li>
</ul>
<ol start="9" style="list-style-type: decimal">
<li><strong>featureCounts</strong>:</li>
</ol>
<ul>
<li>Liao, Y., Smyth, G.K., Shi, W. 2014 featureCounts: an efficient general purpose program for assigning sequence reads to genomic features. Bioinformatics. Apr 1;30(7):923-30. doi:<a href="https://doi.org/10.1093/bioinformatics/btt656">10.1093/bioinformatics/btt656</a>.</li>
</ul>
<ol start="10" style="list-style-type: decimal">
<li><strong>R</strong>:</li>
</ol>
<ul>
<li>R Core Team 2014. R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL:<a href="http://www.R-project.org/" class="uri">http://www.R-project.org/</a>.</li>
</ul>
<ol start="11" style="list-style-type: decimal">
<li><strong>deeptools</strong>:</li>
</ol>
<ul>
<li>Ramírez, F., D. P. Ryan, B. Grüning, V. Bhardwaj, F. Kilpert, A. S. Richter, S. Heyne, F. Dündar, and T. Manke. 2016. deepTools2: a next generation web server for deep-sequencing data analysis. Nucleic Acids Research 44: W160-165. doi:<a href="http://dx.doi.org/10.1093/nar/gkw257">10.1093/nar/gkw257</a></li>
</ul>
<ol start="12" style="list-style-type: decimal">
<li><strong>FastQC</strong></li>
</ol>
<ul>
<li>FastQC <a href="https://www.bioinformatics.babraham.ac.uk/projects/fastqc/" class="uri">https://www.bioinformatics.babraham.ac.uk/projects/fastqc/</a></li>
</ul>
<ol start="13" style="list-style-type: decimal">
<li><strong>MultiQC</strong>:</li>
</ol>
<ul>
<li>Ewels P., Magnusson M., Lundin S. and Käller M. 2016. MultiQC: Summarize analysis results for multiple tools and samples in a single report. Bioinformatics 32(19): 3047–3048. doi:<a href="https://dx.doi.org/10.1093/bioinformatics/btw354">10.1093/bioinformatics/btw354</a></li>
</ul>
<ol start="14" style="list-style-type: decimal">
<li><strong>Nextflow</strong>:</li>
</ol>
<ul>
<li>Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., and Notredame, C. 2017. Nextflow enables reproducible computational workflows. Nature biotechnology, 35(4), 316.</li>
</ul>
id: 'software_versions'
section_name: 'Software Versions'
section_href: 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/blob/78-tool_version/docs/RNA-Seq%20Pipeline%20Design%20Process%20Table.pdf'
plot_type: 'html'
description: 'are collected for pipeline version.'
data: |
<dl class="dl-horizontal">
<dt>Python</dt><dd>v3.7.7</dd>
<dt>DERIVA</dt><dd>v1.0.0</dd>
<dt>BDBag</dt><dd>v1.5.6</dd>
<dt>RSeQC</dt><dd>v3.0.1</dd>
<dt>Trim Galore!</dt><dd>v0.6.4</dd>
<dt>HISAT2</dt><dd>v2.1.0</dd>
<dt>Samtools</dt><dd>v1.9</dd>
<dt>picard (MarkDuplicates)</dt><dd>v2.23.0-SNAPSHOT</dd>
<dt>featureCounts</dt><dd>v2.0.0</dd>
<dt>R</dt><dd>v3.6.3</dd>
<dt>deepTools</dt><dd>v3.3.2</dd>
<dt>FastQC</dt><dd>v0.11.9</dd>
<dt>MultiQC</dt><dd>v1.8</dd>
<dt>Pipeline Version</dt><dd>v0.0.4_indev</dd>
</dl>
@@ -56,6 +56,10 @@ report_section_order:
    order: 2000
  ref:
    order: 1000
  software_versions:
    order: -1000
  software_references:
    order: -2000
skip_generalstats: true
@@ -152,4 +156,4 @@ sp:
  ref:
    fn: 'reference.tsv'
  tin:
    fn: '*.tin.hist.tsv'
\ No newline at end of file
@@ -70,6 +70,8 @@ if (params.refSource == "biohpc") {
referenceInfer = Channel.fromList(["ERCC","GRCh","GRCm"])
multiqcConfig = Channel.fromPath("${baseDir}/conf/multiqc_config.yaml")
bicfLogo = Channel.fromPath("${baseDir}/../docs/bicf_logo.png")
softwareReferences = Channel.fromPath("${baseDir}/../docs/software_references_mqc.yaml")
softwareVersions = Channel.fromPath("${baseDir}/../docs/software_versions_mqc.yaml")

// Define script files
script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh")
@@ -1126,6 +1128,8 @@ process aggrQC {
  input:
    path multiqcConfig
    path bicfLogo
    path softwareReferences
    path softwareVersions
    path fastqc
    path trimQC
    path alignQC
...
#!/usr/bin/env python3
#generate_references.py
#*
#* --------------------------------------------------------------------------
#* Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/cellranger_count/blob/develop/LICENSE)
#* --------------------------------------------------------------------------
#*

import argparse
import subprocess
import shlex
import logging

EPILOG = '''
For more details:
%(prog)s --help
'''

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)


def get_args():
    '''Define arguments.'''

    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('-r', '--reference',
                        help="The reference file (markdown format).",
                        required=True)

    parser.add_argument('-o', '--output',
                        help="The out file name.",
                        default='references')

    args = parser.parse_args()
    return args


def main():
    args = get_args()
    reference = args.reference
    output = args.output

    out_filename = output + '_mqc.yaml'

    # Header for HTML
    print(
        '''
id: 'software_references'
section_name: 'Software References'
description: 'This section describes references for the tools used.'
plot_type: 'html'
data: |
'''
        , file = open(out_filename, "w")
    )

    # Turn Markdown into HTML
    references_html = 'bash -c "pandoc -p {} | sed \'s/^/ /\' >> {}"'
    references_html = references_html.format(reference, out_filename)
    subprocess.check_call(shlex.split(references_html))


if __name__ == '__main__':
    main()
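As an illustration of the output shape this script produces, the sketch below writes the same MultiQC custom-content header and nests a single hand-written HTML line under the data block; in the script itself the HTML comes from pandoc and the nesting comes from the sed step, so the indentation width used here is an assumption.

# Illustrative only; mimics the layout written by generate_references.py with a
# stand-in HTML line instead of pandoc's real output.
header = (
    "id: 'software_references'\n"
    "section_name: 'Software References'\n"
    "description: 'This section describes references for the tools used.'\n"
    "plot_type: 'html'\n"
    "data: |\n"
)
sample_html = ['<h3 id="references">References</h3>']

with open("software_references_mqc.yaml", "w") as out_file:
    out_file.write(header)
    for line in sample_html:
        # Indent so the HTML sits inside the 'data' block.
        out_file.write("    " + line + "\n")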
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/blob/master/LICENSE)
# * --------------------------------------------------------------------------
#

'''Make YAML of software versions.'''

from __future__ import print_function
from collections import OrderedDict
import re
import os
import logging
import glob
import argparse
import numpy as np

EPILOG = '''
For more details:
%(prog)s --help
'''

# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)

SOFTWARE_REGEX = {
    'Python': ['version_python.txt', r"Python (\S+)"],
    'DERIVA': ['version_deriva.txt', r"(\S+)"],
    'BDBag': ['version_bdbag.txt', r"BDBag (\S+) \(Bagit \S+\)"],
    'RSeQC': ['version_rseqc.txt', r"infer_experiment.py (\S+)"],
    'Trim Galore!': ['version_trimgalore.txt', r"version (\S+)"],
    'HISAT2': ['version_hisat2.txt', r"version (\S+)"],
    'Samtools': ['version_samtools.txt', r"samtools (\S+)"],
    'picard (MarkDuplicates)': ['version_markdups.txt', r"(\S\.\S{2}\.\S+)"],
    'featureCounts': ['version_featurecounts.txt', r"featureCounts v(\S+)"],
    'R': ['version_r.txt', r"R version (\S+)"],
    'deepTools': ['version_deeptools.txt', r"deeptools (\S+)"],
    'FastQC': ['version_fastqc.txt', r"FastQC v(\S+)"],
    'MultiQC': ['version_multiqc.txt', r"multiqc, version (\S+)"],
    'Pipeline Version': ['./workflow/nextflow.config', r"version = 'v(\S+)'"]
}


def get_args():
    '''Define arguments.'''

    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('-o', '--output',
                        help="The out file name.",
                        required=True)

    parser.add_argument('-t', '--test',
                        help='Used for testing purposes',
                        default=False,
                        action='store_true')

    args = parser.parse_args()
    return args


def check_files(files, test):
    '''Check if version files are found.'''

    logger.info("Running file check.")

    software_files = np.array(list(SOFTWARE_REGEX.values()))[:,0]

    extra_files = set(files) - set(software_files)

    if len(extra_files) > 0 and test:
        logger.error('Missing regex: %s', list(extra_files))
        raise Exception("Missing regex: %s" % list(extra_files))


def main():
    args = get_args()
    output = args.output
    test = args.test

    out_filename = output + '_mqc.yaml'

    results = OrderedDict()
    results['Python'] = '<span style="color:#999999;\">Not Run</span>'
    results['DERIVA'] = '<span style="color:#999999;\">Not Run</span>'
    results['BDBag'] = '<span style="color:#999999;\">Not Run</span>'
    results['RSeQC'] = '<span style="color:#999999;\">Not Run</span>'
    results['Trim Galore!'] = '<span style="color:#999999;\">Not Run</span>'
    results['HISAT2'] = '<span style="color:#999999;\">Not Run</span>'
    results['Samtools'] = '<span style="color:#999999;\">Not Run</span>'
    results['picard (MarkDuplicates)'] = '<span style="color:#999999;\">Not Run</span>'
    results['featureCounts'] = '<span style="color:#999999;\">Not Run</span>'
    results['R'] = '<span style="color:#999999;\">Not Run</span>'
    results['deepTools'] = '<span style="color:#999999;\">Not Run</span>'
    results['FastQC'] = '<span style="color:#999999;\">Not Run</span>'
    results['MultiQC'] = '<span style="color:#999999;\">Not Run</span>'
    results['Pipeline Version'] = '<span style="color:#999999;\">Not Run</span>'

    # list all files
    files = glob.glob('**/*.txt', recursive=True)

    # Check for version files:
    check_files(files, test)

    # Search each file using its regex
    for k, v in SOFTWARE_REGEX.items():
        if os.path.isfile(v[0]):
            with open(v[0]) as x:
                versions = x.read()
                match = re.search(v[1], versions)
                if match:
                    results[k] = "v{}".format(match.group(1))

    # Dump to YAML
    print(
        '''
id: 'software_versions'
section_name: 'Software Versions'
section_href: 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/blob/78-tool_version/docs/RNA-Seq%20Pipeline%20Design%20Process%20Table.pdf'
plot_type: 'html'
description: 'are collected for pipeline version.'
data: |
    <dl class="dl-horizontal">
'''
        , file = open(out_filename, "w"))

    for k, v in results.items():
        print(" <dt>{}</dt><dd>{}</dd>".format(k, v), file = open(out_filename, "a"))
    print(" </dl>", file = open(out_filename, "a"))


if __name__ == '__main__':
    main()
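To make the SOFTWARE_REGEX lookup concrete, the snippet below runs three of the patterns against sample version strings; the sample lines are assumptions modeled on the versions recorded in software_versions_mqc.yaml rather than captured tool output.

# Illustration of the regex matching performed in main(); the sample strings are assumed.
import re

samples = {
    'FastQC': ("FastQC v0.11.9", r"FastQC v(\S+)"),
    'MultiQC': ("multiqc, version 1.8", r"multiqc, version (\S+)"),
    'featureCounts': ("featureCounts v2.0.0", r"featureCounts v(\S+)"),
}

for tool, (text, pattern) in samples.items():
    match = re.search(pattern, text)
    # Mirrors the "Not Run" default used in main() above.
    print(tool, "v{}".format(match.group(1)) if match else "Not Run")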