From 10cd6a4cb92f160fbdb57e4eab47949f432c7ceb Mon Sep 17 00:00:00 2001
From: Venkat Malladi <venkat.malladi@utsouthwestern.edu>
Date: Mon, 9 Nov 2020 17:23:51 -0600
Subject: [PATCH] Add in software versions and references.

---
 .gitlab-ci.yml                                | 116 +++++++++++++-
 .../merge_request_templates/Merge_Request.md  |   2 +
 docs/references.md                            |  47 ++++++
 workflow/conf/multiqc_config.yaml             |   7 +-
 workflow/rna-seq.nf                           |  72 +++++----
 workflow/scripts/generate_references.py       |  70 +++++++++
 workflow/scripts/generate_versions.py         | 146 ++++++++++++++++++
 7 files changed, 420 insertions(+), 40 deletions(-)
 create mode 100644 workflow/scripts/generate_references.py
 create mode 100644 workflow/scripts/generate_versions.py

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 63a773a..65319d7 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -9,6 +9,7 @@ before_script:
 
 stages:
   - unit
+  - aggregation
   - reference
   - integration
   - consistency
@@ -22,8 +23,15 @@ getBag:
     - merge_requests
   script:
   - ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json
+  - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' deriva-download-cli --version > version_deriva.txt
   - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' deriva-download-cli dev.gudmap.org --catalog 2 ./workflow/conf/replicate_export_config.json . rid=Q-Y5F6
   - pytest -m getBag
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - version_deriva.txt
+    expire_in: 7 days
 
 getData:
   stage: unit
@@ -33,10 +41,17 @@ getData:
   except:
     - merge_requests
   script:
+  - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bdbag --version > version_bdbag.txt
   - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt
   - unzip ./test_data/bag/Replicate_Q-Y5F6.zip
   - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bash ./workflow/scripts/bdbagFetch.sh Replicate_Q-Y5F6 Replicate_Q-Y5F6 TEST
   - pytest -m getData
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - version_bdbag.txt
+    expire_in: 7 days
 
 parseMetadata:
   stage: unit
@@ -46,6 +61,7 @@ parseMetadata:
   except:
     - merge_requests
   script:
+  - singularity run 'docker://bicf/python3:2.0.1_indev' python3 --version > version_python.txt
   - rep=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p repRID)
   - exp=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p expRID)
   - study=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p studyRID)
@@ -57,6 +73,12 @@ parseMetadata:
   - readLength=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.stageNew.csv" -p readLength)
   - echo -e "${endsMeta},${endsManual},${stranded},${spike},${species},${readLength},${exp},${study},${rep}" > design.csv
   - pytest -m parseMetadata
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - version_python.txt
+    expire_in: 7 days
 
 inferMetadata:
   stage: unit
@@ -66,6 +88,7 @@ inferMetadata:
   except:
     - merge_requests
   script:
+  - singularity run 'docker://bicf/rseqc3.0:2.0.1_indev' infer_experiment.py --version > version_rseqc.txt
   - >
     align=$(echo $(grep "Overall alignment rate" ./test_data/meta/Q-Y5F6_1M.se.alignSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) &&
     if [[ ${align} == "" ]]; then exit 1; fi
@@ -74,6 +97,12 @@ inferMetadata:
     ended=`singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/inferMeta.sh endness Q-Y5F6_1M.se.inferMetadata.log` &&
     if [[ ${ended} == "" ]]; then exit 1; fi
   - pytest -m inferMetadata
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - version_rseqc.txt
+    expire_in: 7 days
 
 trimData:
   stage: unit
@@ -83,11 +112,18 @@ trimData:
   except:
     - merge_requests
   script:
+  - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --version > version_trimgalore.txt
   - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --basename Q-Y5F6_1M.se ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
   - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --paired --basename Q-Y5F6_1M.pe ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
   - readLengthSE=$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
   - readLengthPE=$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
   - pytest -m trimData
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - version_trimgalore.txt
+    expire_in: 7 days
 
 downsampleData:
   stage: unit
@@ -97,8 +133,15 @@ downsampleData:
   except:
     - merge_requests
   script:
+  - singularity run 'docker://bicf/seqtk:2.0.1_indev' seqtk 2> version_seqtk.txt || true
   - singularity run 'docker://bicf/seqtk:2.0.1_indev' seqtk sample -s100 ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz 1000 1> sampled.1.fq
   - pytest -m downsampleData
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - version_seqtk.txt
+    expire_in: 7 days
 
 alignData:
   stage: unit
@@ -108,6 +151,8 @@ alignData:
   except:
     - merge_requests
   script:
+  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' hisat2 --version > version_hisat2.txt
+  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools --version > version_samtools.txt
   - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.se.unal.gz -S Q-Y5F6_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz --summary-file Q-Y5F6_1M.se.alignSummary.txt --new-summary
   - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam
   - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.bam
@@ -117,6 +162,14 @@ alignData:
   - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.bam
   - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools index -@ 20 -b Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.sorted.bam.bai
   - pytest -m alignData
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - version_hisat2.txt
+      - version_samtools.txt
+    expire_in: 7 days
+
 
 dedupData:
   stage: unit
@@ -126,14 +179,23 @@ dedupData:
   except:
     - merge_requests
   script:
+  - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates --version 2> version_markdups.txt || true
+  - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools --version > version_samtools.txt
   - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5F6_1M.se.sorted.bam O=Q-Y5F6_1M.se.deduped.bam M=Q-Y5F6_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true
   - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam ./test_data/bam/small/Q-Y5F6_1M.se.deduped.bam
   - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools index -@ 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.sorted.deduped.bam.bai
   - >
-    for i in {"chr8","chr4","chrY"}; do 
+    for i in {"chr8","chr4","chrY"}; do
       echo "samtools view -b Q-Y5F6_1M.se.sorted.deduped.bam ${i} > Q-Y5F6_1M.se.sorted.deduped.${i}.bam; samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.deduped.${i}.bam Q-Y5F6_1M.se.sorted.deduped.${i}.bam.bai;";
       done | singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' parallel -j 20 -k
   - pytest -m dedupData
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - version_markdups.txt
+      - version_samtools.txt
+    expire_in: 7 days
 
 countData:
   stage: unit
@@ -145,11 +207,20 @@ countData:
   script:
   - ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/geneID.tsv
   - ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/Entrez.tsv
-  - singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se.countData -s 1 -R SAM --primary --ignoreDup ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam 
+  - singularity run 'docker://bicf/subread2:2.0.0' featureCounts -v 2> version_featurecounts.txt
+  - singularity run 'docker://bicf/subread2:2.0.0' R --version > version_r.txt
+  - singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se.countData -s 1 -R SAM --primary --ignoreDup ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam
   - singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count ./test_data/counts/small/Q-Y5F6_1M.se.countData
   - singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se
   - assignedReads=$(grep -m 1 'Assigned' *.summary | grep -oe '\([0-9.]*\)')
   - pytest -m makeFeatureCounts
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - version_featurecounts.txt
+      - version_r.txt
+    expire_in: 7 days
 
 makeBigWig:
   stage: unit
@@ -159,8 +230,15 @@ makeBigWig:
   except:
     - merge_requests
   script:
+  - singularity run 'docker://bicf/deeptools3.3:2.0.1_indev' deeptools --version > version_deeptools.txt
   - singularity run 'docker://bicf/deeptools3.3:2.0.1_indev' bamCoverage -p 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -o Q-Y5F6_1M.se.bw
   - pytest -m makeBigWig
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - version_deeptools.txt
+    expire_in: 7 days
 
 fastqc:
   stage: unit
@@ -170,8 +248,16 @@ fastqc:
   except:
     - merge_requests
   script:
+  - singularity run 'docker://bicf/fastqc:2.0.1_indev' fastqc --version > version_fastqc.txt
   - singularity run 'docker://bicf/fastqc:2.0.1_indev' fastqc ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o .
   - pytest -m fastqc
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - version_fastqc.txt
+    expire_in: 7 days
+
 
 dataQC:
   stage: unit
@@ -199,6 +285,26 @@ outputBag:
   - pytest -m outputBag
 
 
+generateVersions:
+  stage: aggregation
+  only:
+    - push
+    - tags
+  except:
+    - merge_requests
+  script:
+  - singularity run 'docker://bicf/multiqc1.8:2.0.1_indev' multiqc --version > version_multiqc.txt
+  - python3 ./workflow/scripts/generate_versions.py -o software_versions
+  - python3 ./workflow/scripts/generate_references.py -r ./docs/references.md -o software_references
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - software_references_mqc.yaml
+      - software_versions_mqc.yaml
+    expire_in: 7 days
+
+
 humanBioHPC:
   stage: reference
   only:
@@ -366,7 +472,7 @@ override_fastq:
     max: 1
     when:
       - always
-    
+
 override_species:
   stage: integration
   only: [merge_requests]
@@ -388,7 +494,7 @@ override_species:
     max: 1
     when:
       - always
-  
+
 
 consistency:
   stage: consistency
@@ -413,4 +519,4 @@ consistency:
       - assignedPE.txt
       - assignedExpectSE.txt
       - assignedExpectPE.txt
-    expire_in: 7 days
\ No newline at end of file
+    expire_in: 7 days
diff --git a/.gitlab/merge_request_templates/Merge_Request.md b/.gitlab/merge_request_templates/Merge_Request.md
index 88c50aa..4d3a6b0 100644
--- a/.gitlab/merge_request_templates/Merge_Request.md
+++ b/.gitlab/merge_request_templates/Merge_Request.md
@@ -6,6 +6,8 @@ These are the most common things requested on pull requests.
  - [ ] If you've fixed a bug or added code that should be tested, add tests!
  - [ ] Documentation in `docs` is updated
  - [ ] Replace dag.png with the most recent CI pipleine integrated_pe artifact
+ - [ ] Replace software_versions_mqc.yaml with the most recent CI pipeline generateVersions artifact
+ - [ ] Replace software_references_mqc.yaml with the most recent CI pipeline generateVersions artifact
  - [ ] `CHANGELOG.md` is updated
  - [ ] `README.md` is updated
  - [ ] `LICENSE.md` is updated with new contributors
diff --git a/docs/references.md b/docs/references.md
index 89002c5..78d9e54 100644
--- a/docs/references.md
+++ b/docs/references.md
@@ -1 +1,48 @@
 ### References
+
+1. **python**:
+  * Anaconda (Anaconda Software Distribution, [https://anaconda.com](https://anaconda.com))
+
+2. **DERIVA**:
+  * Bugacov, A., Czajkowski, K., Kesselman, C., Kumar,  A., Schuler, R. E. and Tangmunarunkit, H. 2017 Experiences with DERIVA: An Asset Management Platform for Accelerating eScience. IEEE 13th International Conference on e-Science (e-Science), Auckland, 2017, pp. 79-88, doi:[10.1109/eScience.2017.20](https://doi.org/10.1109/eScience.2017.20).
+
+3. **BDBag**:  
+  * D'Arcy, M., Chard, K., Foster, I., Kesselman, C., Madduri, R., Saint, N., & Wagner, R.. 2019. Big Data Bags: A Scalable Packaging Format for Science. Zenodo. doi:[10.5281/zenodo.3338725](http://doi.org/10.5281/zenodo.3338725).
+
+4. **RSeQC**:
+  * Wang, L., Wang, S., Li, W. 2012 RSeQC: quality control of RNA-seq experiments. Bioinformatics. Aug 15;28(16):2184-5. doi:[10.1093/bioinformatics/bts356](https://doi.org/10.1093/bioinformatics/bts356).
+
+5. **trimgalore**:
+  * trimgalore [https://github.com/FelixKrueger/TrimGalore](https://github.com/FelixKrueger/TrimGalore)
+
+6. **seqtk**:
+  * seqtk [https://github.com/lh3/seqtk](https://github.com/lh3/seqtk)
+
+7. **hisat2**:
+  * Kim, D., Paggi, J.M., Park, C., Bennett, C., Salzberg, S.L. 2019. Graph-based genome alignment and genotyping with HISAT2 and HISAT-genotype. Nat Biotechnol. Aug;37(8):907-915. doi:[10.1038/s41587-019-0201-4](https://doi.org/10.1038/s41587-019-0201-4).
+
+
+
+8. **samtools**:
+  * Li H., B. Handsaker, A. Wysoker, T. Fennell, J. Ruan, N. Homer, G. Marth, G. Abecasis, R. Durbin, and 1000 Genome Project Data Processing Subgroup. 2009. The Sequence alignment/map (SAM) format and SAMtools. Bioinformatics 25: 2078-9. doi:[10.1093/bioinformatics/btp352](http://dx.doi.org/10.1093/bioinformatics/btp352)
+
+9. **picard**:
+  * “Picard Toolkit.” 2019. Broad Institute, GitHub Repository. [http://broadinstitute.github.io/picard/](http://broadinstitute.github.io/picard/); Broad Institute
+
+10. **featureCounts**:
+  * Liao, Y., Smyth, G.K., Shi, W. 2014 featureCounts: an efficient general purpose program for assigning sequence reads to genomic features. Bioinformatics. Apr 1;30(7):923-30. doi:[10.1093/bioinformatics/btt656](https://doi.org/10.1093/bioinformatics/btt656).
+
+11. **R**:
+  * R Core Team 2014. R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL:[http://www.R-project.org/](http://www.R-project.org/).
+
+12. **deeptools**:
+  * Ramírez, F., D. P. Ryan, B. Grüning, V. Bhardwaj, F. Kilpert, A. S. Richter, S. Heyne, F. Dündar, and T. Manke. 2016. deepTools2: a next generation web server for deep-sequencing data analysis. Nucleic Acids Research 44: W160-165. doi:[10.1093/nar/gkw257](http://dx.doi.org/10.1093/nar/gkw257)
+
+13. **FastQC**:
+  * FastQC [https://www.bioinformatics.babraham.ac.uk/projects/fastqc/](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)
+
+14. **MultiQC**:
+  * Ewels P., Magnusson M., Lundin S. and Käller M. 2016. MultiQC: Summarize analysis results for multiple tools and samples in a single report. Bioinformatics 32(19): 3047–3048. doi:[10.1093/bioinformatics/btw354](https://dx.doi.org/10.1093/bioinformatics/btw354)
+
+15. **Nextflow**:
+  * Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., and Notredame, C. 2017. Nextflow enables reproducible computational workflows. Nature biotechnology, 35(4), 316.
diff --git a/workflow/conf/multiqc_config.yaml b/workflow/conf/multiqc_config.yaml
index 0c780d9..627b51d 100644
--- a/workflow/conf/multiqc_config.yaml
+++ b/workflow/conf/multiqc_config.yaml
@@ -56,6 +56,11 @@ report_section_order:
       order: 2000
     ref:
       order: 1000
+    Software_Versions:
+      order: -1200
+    Software_References:
+      order: -1300
+
 
 skip_generalstats: true
 
@@ -152,4 +157,4 @@ sp:
     ref:
         fn: 'reference.tsv'
     tin:
-        fn: '*.tin.hist.tsv'
\ No newline at end of file
+        fn: '*.tin.hist.tsv'
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index a940778..45653c4 100644
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -1,12 +1,12 @@
 #!/usr/bin/env nextflow
 
-//  ########  ####  ######  ######## 
-//  ##     ##  ##  ##    ## ##       
-//  ##     ##  ##  ##       ##       
-//  ########   ##  ##       ######   
-//  ##     ##  ##  ##       ##       
-//  ##     ##  ##  ##    ## ##       
-//  ########  ####  ######  ##       
+//  ########  ####  ######  ########
+//  ##     ##  ##  ##    ## ##
+//  ##     ##  ##  ##       ##
+//  ########   ##  ##       ######
+//  ##     ##  ##  ##       ##
+//  ##     ##  ##  ##    ## ##
+//  ########  ####  ######  ##
 
 // Define input variables
 params.deriva = "${baseDir}/../test_data/auth/credential.json"
@@ -66,6 +66,8 @@ if (params.refSource == "biohpc") {
 referenceInfer = Channel.fromList(["ERCC","GRCh","GRCm"])
 multiqcConfig = Channel.fromPath("${baseDir}/conf/multiqc_config.yaml")
 bicfLogo = Channel.fromPath("${baseDir}/../docs/bicf_logo.png")
+softwareReferences = Channel.fromPath("${baseDir}/../docs/software_references_mqc.yaml")
+softwareVersions = Channel.fromPath("${baseDir}/../docs/software_versions_mqc.yaml")
 
 // Define script files
 script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh")
@@ -89,7 +91,7 @@ process trackStart {
   """
   hostname
   ulimit -a
-  
+
   curl -H 'Content-Type: application/json' -X PUT -d \
     '{ \
       "sessionId": "${workflow.sessionId}", \
@@ -199,16 +201,16 @@ process getData {
     mkdir -p ~/.bdbag
     ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt
     echo -e "LOG: linked" >> ${repRID}.getData.log
-    
+
     # get bag basename
     replicate=\$(basename "${inputBag}" | cut -d "." -f1)
     echo -e "LOG: bag replicate name \${replicate}" >> ${repRID}.getData.log
-    
+
     # unzip bag
     echo -e "LOG: unzipping replicate bag" >> ${repRID}.getData.log
     unzip ${inputBag}
     echo -e "LOG: unzipped" >> ${repRID}.getData.log
-    
+
     # bag fetch fastq's only and rename by repRID
     echo -e "LOG: fetching replicate bdbag" >> ${repRID}.getData.log
     sh ${script_bdbagFetch} \${replicate} ${repRID}
@@ -259,7 +261,7 @@ process parseMetadata {
     # get experiment RID metadata
     exp=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p expRID)
     echo -e "LOG: experiment RID metadata parsed: \${exp}" >> ${repRID}.parseMetadata.log
-    
+
     # get study RID metadata
     study=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p studyRID)
     echo -e "LOG: study RID metadata parsed: \${study}" >> ${repRID}.parseMetadata.log
@@ -267,7 +269,7 @@ process parseMetadata {
     # get endedness metadata
     endsMeta=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p endsMeta)
     echo -e "LOG: endedness metadata parsed: \${endsMeta}" >> ${repRID}.parseMetadata.log
-    
+
     # ganually get endness
     endsManual=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p endsManual)
     echo -e "LOG: endedness manually detected: \${endsManual}" >> ${repRID}.parseMetadata.log
@@ -275,11 +277,11 @@ process parseMetadata {
     # get strandedness metadata
     stranded=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p stranded)
     echo -e "LOG: strandedness metadata parsed: \${stranded}" >> ${repRID}.parseMetadata.log
-    
+
     # get spike-in metadata
     spike=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p spike)
     echo -e "LOG: spike-in metadata parsed: \${spike}" >> ${repRID}.parseMetadata.log
-    
+
     # get species metadata
     species=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experiment}" -p species)
     echo -e "LOG: species metadata parsed: \${species}" >> ${repRID}.parseMetadata.log
@@ -358,7 +360,7 @@ process trimData {
     fi
     echo -e "LOG: trimmed" >> ${repRID}.trimData.log
     echo -e "LOG: average trimmed read length: \${readLength}" >> ${repRID}.trimData.log
-    
+
     # save read length file
     echo -e "\${readLength}" > readLength.csv
     """
@@ -381,7 +383,7 @@ getRefInferInput = referenceInfer.combine(deriva_getRefInfer.combine(script_refD
 
 /*
   * getRefInfer: dowloads appropriate reference for metadata inference
-*/  
+*/
 process getRefInfer {
   tag "${refName}"
 
@@ -391,7 +393,7 @@ process getRefInfer {
   output:
     tuple val (refName), path ("hisat2", type: 'dir'), path ("*.fna"), path ("*.gtf")  into refInfer
     path ("${refName}", type: 'dir') into bedInfer
- 
+
   script:
     """
     hostname > ${repRID}.${refName}.getRefInfer.log
@@ -532,14 +534,14 @@ process alignSampleData {
     echo -e "LOG: aligning ${ends}" >> ${repRID}.${ref}.alignSampleData.log
     if [ "${ends}" == "se" ]
     then
-     
+
       hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome -U ${fastq1} --summary-file ${ref}.alignSampleSummary.txt --new-summary
     elif [ "${ends}" == "pe" ]
     then
       hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome --no-mixed --no-discordant -1 ${fastq1} -2 ${fastq2} --summary-file ${ref}.alignSampleSummary.txt --new-summary
     fi
     echo -e "LOG: aliged" >> ${repRID}.${ref}.alignSampleData.log
-    
+
     # convert the output sam file to a sorted bam file using Samtools
     echo -e "LOG: converting from sam to bam" >> ${repRID}.${ref}.alignSampleData.log
     samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${ref}.sampled.bam ${ref}.sampled.sam
@@ -639,7 +641,7 @@ process inferMetadata {
 
     ended=`bash inferMeta.sh endness ${repRID}.infer_experiment.txt`
     fail=`bash inferMeta.sh fail ${repRID}.infer_experiment.txt`
-    if [ \${ended} == "PairEnd" ] 
+    if [ \${ended} == "PairEnd" ]
     then
       ends="pe"
       percentF=`bash inferMeta.sh pef ${repRID}.infer_experiment.txt`
@@ -728,7 +730,7 @@ process getRef {
 
   output:
     tuple path ("hisat2", type: 'dir'), path ("bed", type: 'dir'), path ("*.fna"), path ("*.gtf"), path ("geneID.tsv"), path ("Entrez.tsv")  into reference
- 
+
   script:
     """
     hostname > ${repRID}.getRef.log
@@ -847,7 +849,7 @@ process alignData {
         strandedParam="--rna-strandness R"
     elif [ "${stranded}" == "reverse" ] && [ "${ends}" == "pe" ]
     then
-      strandedParam="--rna-strandness RF"    
+      strandedParam="--rna-strandness RF"
     fi
 
     # align the reads with Hisat2
@@ -860,7 +862,7 @@ process alignData {
       hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome \${strandedParam} --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} --summary-file ${repRID}.alignSummary.txt --new-summary
     fi
     echo -e "LOG: alignined" >> ${repRID}.align.log
-    
+
     # convert the output sam file to a sorted bam file using Samtools
     echo -e "LOG: converting from sam to bam" >> ${repRID}.align.log
     samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam
@@ -892,7 +894,7 @@ process dedupData {
 
   output:
     tuple path ("${repRID}.sorted.deduped.bam"), path ("${repRID}.sorted.deduped.bam.bai") into dedupBam
-    tuple path ("${repRID}.sorted.deduped.*.bam"), path ("${repRID}.sorted.deduped.*.bam.bai") into dedupChrBam 
+    tuple path ("${repRID}.sorted.deduped.*.bam"), path ("${repRID}.sorted.deduped.*.bam.bai") into dedupChrBam
     path ("*.deduped.Metrics.txt") into dedupQC
 
   script:
@@ -908,7 +910,7 @@ process dedupData {
     # sort the bam file using Samtools
     echo -e "LOG: sorting the bam file" >> ${repRID}.dedup.log
     samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.deduped.bam ${repRID}.deduped.bam
-    
+
     # index the sorted bam using Samtools
     echo -e "LOG: indexing sorted bam file" >> ${repRID}.dedup.log
     samtools index -@ `nproc` -b ${repRID}.sorted.deduped.bam ${repRID}.sorted.deduped.bam.bai
@@ -1004,7 +1006,7 @@ process countData {
       featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}.countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam
     fi
     echo -e "LOG: counted" >> ${repRID}.countData.log
-    
+
     # extract assigned reads
     grep -m 1 'Assigned' *.countData.summary | grep -oe '\\([0-9.]*\\)' > assignedReads.csv
 
@@ -1069,12 +1071,12 @@ process dataQC {
     tuple path (bam), path (bai) from dedupBam_dataQC
     tuple path (chrBam), path (chrBai) from dedupChrBam
     val ends from endsInfer_dataQC
-    
+
   output:
     path "${repRID}.tin.hist.tsv" into tinHist
     path "${repRID}.tin.med.csv" into inferMetadata_tinMed
     path "${repRID}.insertSize.inner_distance_freq.txt" into innerDistance
-  
+
   script:
     """
     hostname > ${repRID}.dataQC.log
@@ -1122,6 +1124,8 @@ process aggrQC {
   input:
     path multiqcConfig
     path bicfLogo
+    path softwareReferences
+    path softwareVersions
     path fastqc
     path trimQC
     path alignQC
@@ -1179,8 +1183,8 @@ process aggrQC {
     echo -e "LOG: creating run table" >> ${repRID}.aggrQC.log
     echo -e "Session\tSession ID\tStart Time\tPipeline Version\tInput" > run.tsv
     echo -e "Session\t${workflow.sessionId}\t${workflow.start}\t${workflow.manifest.version}\t\${input}" >> run.tsv
-    
-    
+
+
     # make RID table
     echo -e "LOG: creating RID table" >> ${repRID}.aggrQC.log
     echo -e "Replicate\tReplicate RID\tExperiment RID\tStudy RID" > rid.tsv
@@ -1224,11 +1228,11 @@ process aggrQC {
 process outputBag {
   tag "${repRID}"
   publishDir "${outDir}/outputBag", mode: 'copy', pattern: "Replicate_${repRID}.outputBag.zip"
-  
+
   input:
     path multiqc
     path multiqcJSON
-  
+
   output:
     path ("Replicate_*.zip") into outputBag
 
@@ -1239,4 +1243,4 @@ process outputBag {
   cp ${multiqcJSON} Replicate_${repRID}.outputBag
   bdbag Replicate_${repRID}.outputBag --archiver zip
   """
-}
\ No newline at end of file
+}
diff --git a/workflow/scripts/generate_references.py b/workflow/scripts/generate_references.py
new file mode 100644
index 0000000..8e809f1
--- /dev/null
+++ b/workflow/scripts/generate_references.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+#generate_references.py
+#*
+#* --------------------------------------------------------------------------
+#* Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/cellranger_count/blob/develop/LICENSE)
+#* --------------------------------------------------------------------------
+#*
+
+import argparse
+import subprocess
+import shlex
+import logging
+
+EPILOG = '''
+For more details:
+        %(prog)s --help
+'''
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+logger.propagate = False
+logger.setLevel(logging.INFO)
+
+
+def get_args():
+    '''Define arguments.'''
+
+    parser = argparse.ArgumentParser(
+        description=__doc__, epilog=EPILOG,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+
+    parser.add_argument('-r', '--reference',
+                        help="The reference file (markdown format).",
+                        required=True)
+
+    parser.add_argument('-o', '--output',
+                        help="The out file name.",
+                        default='references')
+
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = get_args()
+    reference = args.reference
+    output = args.output
+
+    out_filename = output + '_mqc.yaml'
+
+    # Header for HTML
+    print(
+        '''
+        id: 'software_references'
+        section_name: 'Software References'
+        description: 'This section describes references for the tools used.'
+        plot_type: 'html'
+        data: |
+        '''
+    , file = open(out_filename, "w")
+    )
+
+    # Turn Markdown into HTML
+    references_html = 'bash -c "pandoc -p {} | sed \'s/^/                /\' >> {}"'
+    references_html = references_html.format(reference, out_filename)
+    subprocess.check_call(shlex.split(references_html))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/workflow/scripts/generate_versions.py b/workflow/scripts/generate_versions.py
new file mode 100644
index 0000000..08a239f
--- /dev/null
+++ b/workflow/scripts/generate_versions.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+#
+# * --------------------------------------------------------------------------
+# * Licensed under MIT (https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/blob/master/LICENSE)
+# * --------------------------------------------------------------------------
+#
+
+'''Make YAML of software versions.'''
+
+from __future__ import print_function
+from collections import OrderedDict
+import re
+import os
+import logging
+import glob
+import argparse
+import numpy as np
+
+EPILOG = '''
+For more details:
+        %(prog)s --help
+'''
+
+# SETTINGS
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+logger.propagate = False
+logger.setLevel(logging.INFO)
+
+SOFTWARE_REGEX = {
+    'Python': ['version_python.txt', r"Python (\S+)"],
+    'DERIVA': ['version_deriva.txt', r"(\S+)"],
+    'BDBag': ['version_bdbag.txt', r"BDBag (\S+) \(Bagit \S+\)"],
+    'RSeQC': ['version_rseqc.txt', r"infer_experiment.py (\S+)"],
+    'Trim Galore!': ['version_trimgalore.txt', r"version (\S+)"],
+    'seqtk': ['version_seqtk.txt', r"Version: (\S+)"],
+    'HISAT2': ['version_hisat2.txt', r"/hisat2-2.1.0/hisat2-align-s version (\S+)"],
+    'Samtools': ['version_samtools.txt', r"samtools (\S+)"],
+    'picard (MarkDuplicates)': ['version_markdups.txt', r"(\S+)"],
+    'featureCounts': ['version_featurecounts.txt', r"featureCounts v(\S+)"],
+    'R': ['version_r.txt', r"R version (\S+)"],
+    'deepTools': ['version_deeptools.txt', r"deeptools (\S+)"],
+    'FastQC': ['version_fastqc.txt', r"FastQC v(\S+)"],
+    'MultiQC': ['version_multiqc.txt', r"multiqc, version (\S+)"],
+    "Pipeline Version": ['./workflow/nextflow.config', r"version = '(\S+)'"]
+}
+
+
+def get_args():
+    '''Define arguments.'''
+
+    parser = argparse.ArgumentParser(
+        description=__doc__, epilog=EPILOG,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+
+    parser.add_argument('-o', '--output',
+                        help="The out file name.",
+                        required=True)
+
+    parser.add_argument('-t', '--test',
+                        help='Used for testing purposes',
+                        default=False,
+                        action='store_true')
+
+    args = parser.parse_args()
+    return args
+
+
+def check_files(files, test):
+    '''Check if version files are found.'''
+
+    logger.info("Running file check.")
+
+    software_files = np.array(list(SOFTWARE_REGEX.values()))[:,0]
+
+    extra_files =  set(files) - set(software_files)
+
+    if len(extra_files) > 0 and test:
+            logger.error('Missing regex: %s', list(extra_files))
+            raise Exception("Missing regex: %s" % list(extra_files))
+
+
+def main():
+    args = get_args()
+    output = args.output
+    test = args.test
+
+    out_filename = output + '_mqc.yaml'
+
+    results = OrderedDict()
+    # Seed with every tool this pipeline uses so absent version files render as "Not Run"
+    results['Nextflow'] = '<span style="color:#999999;\">Not Run</span>'
+    results['Pipeline Version'] = '<span style="color:#999999;\">Not Run</span>'
+    results['Python'] = '<span style="color:#999999;\">Not Run</span>'
+    results['DERIVA'] = '<span style="color:#999999;\">Not Run</span>'
+    results['BDBag'] = '<span style="color:#999999;\">Not Run</span>'
+    results['RSeQC'] = '<span style="color:#999999;\">Not Run</span>'
+    results['Trim Galore!'] = '<span style="color:#999999;\">Not Run</span>'
+    results['seqtk'] = '<span style="color:#999999;\">Not Run</span>'
+    results['HISAT2'] = '<span style="color:#999999;\">Not Run</span>'
+    results['Samtools'] = '<span style="color:#999999;\">Not Run</span>'
+    results['picard (MarkDuplicates)'] = '<span style="color:#999999;\">Not Run</span>'
+    results['featureCounts'] = '<span style="color:#999999;\">Not Run</span>'
+    results['R'] = '<span style="color:#999999;\">Not Run</span>'
+    results['deepTools'] = '<span style="color:#999999;\">Not Run</span>'
+    results['FastQC'] = '<span style="color:#999999;\">Not Run</span>'
+    results['MultiQC'] = '<span style="color:#999999;\">Not Run</span>'
+
+    # list all files
+    files = glob.glob('**/*.txt', recursive=True)
+
+    # Check for version files:
+    check_files(files, test)
+
+    # Search each file using its regex
+    for k, v in SOFTWARE_REGEX.items():
+        if os.path.isfile(v[0]):
+            with open(v[0]) as x:
+                versions = x.read()
+                match = re.search(v[1], versions)
+                if match:
+                    results[k] = "v{}".format(match.group(1))
+
+    # Dump to YAML
+    print(
+        '''
+        id: 'software_versions'
+        section_name: 'Software Versions'
+        section_href: 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/'
+        plot_type: 'html'
+        description: 'are collected at run time from the software output.'
+        data: |
+            <dl class="dl-horizontal">
+        '''
+    , file = open(out_filename, "w"))
+
+    for k, v in results.items():
+        print("            <dt>{}</dt><dd>{}</dd>".format(k, v), file = open(out_filename, "a"))
+    print("            </dl>", file = open(out_filename, "a"))
+
+
+if __name__ == '__main__':
+    main()
-- 
GitLab