diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 91c766d055e333005dbc44c0cdc4a7f444e771c9..3a60c64384aadb5de50677736fb18f6866f52d34 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -9,6 +9,7 @@ before_script: stages: - unit + - aggregation - reference - integration - consistency @@ -22,8 +23,15 @@ getBag: - merge_requests script: - ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json + - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' deriva-download-cli --version > version_deriva.txt - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' deriva-download-cli dev.gudmap.org --catalog 2 ./workflow/conf/replicate_export_config.json . rid=Q-Y5F6 - pytest -m getBag + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - version_deriva.txt + expire_in: 7 days getData: stage: unit @@ -33,10 +41,17 @@ getData: except: - merge_requests script: + - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bdbag --version > version_bdbag.txt - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt - unzip ./test_data/bag/Replicate_Q-Y5F6.zip - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bash ./workflow/scripts/bdbagFetch.sh Replicate_Q-Y5F6 Replicate_Q-Y5F6 TEST - pytest -m getData + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - version_bdbag.txt + expire_in: 7 days parseMetadata: stage: unit @@ -46,6 +61,7 @@ parseMetadata: except: - merge_requests script: + - singularity run 'docker://bicf/python3:2.0.1_indev' python3 --version > version_python.txt - rep=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p repRID) - exp=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p expRID) - study=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 
./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p studyRID) @@ -57,6 +73,12 @@ parseMetadata: - readLength=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.stageNew.csv" -p readLength) - echo -e "${endsMeta},${endsManual},${stranded},${spike},${species},${readLength},${exp},${study},${rep}" > design.csv - pytest -m parseMetadata + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - version_python.txt + expire_in: 7 days inferMetadata: stage: unit @@ -66,6 +88,7 @@ inferMetadata: except: - merge_requests script: + - singularity run 'docker://bicf/rseqc3.0:2.0.1_indev' infer_experiment.py --version > version_rseqc.txt - > align=$(echo $(grep "Overall alignment rate" ./test_data/meta/Q-Y5F6_1M.se.alignSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) && if [[ ${align} == "" ]]; then exit 1; fi @@ -74,6 +97,12 @@ inferMetadata: ended=`singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/inferMeta.sh endness Q-Y5F6_1M.se.inferMetadata.log` && if [[ ${ended} == "" ]]; then exit 1; fi - pytest -m inferMetadata + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - version_rseqc.txt + expire_in: 7 days trimData: stage: unit @@ -83,11 +112,18 @@ trimData: except: - merge_requests script: + - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --version > version_trimgalore.txt - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --basename Q-Y5F6_1M.se ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --paired --basename Q-Y5F6_1M.pe ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz - readLengthSE=$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk 
'{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}') - readLengthPE=$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}') - pytest -m trimData + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - version_trimgalore.txt + expire_in: 7 days downsampleData: stage: unit @@ -100,6 +136,7 @@ downsampleData: - singularity run 'docker://bicf/seqtk:2.0.1_indev' seqtk sample -s100 ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz 1000 1> sampled.1.fq - pytest -m downsampleData + alignData: stage: unit only: @@ -108,6 +145,8 @@ alignData: except: - merge_requests script: + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' hisat2 --version > version_hisat2.txt + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools --version > version_samtools.txt - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.se.unal.gz -S Q-Y5F6_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz --summary-file Q-Y5F6_1M.se.alignSummary.txt --new-summary - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.bam @@ -117,6 +156,14 @@ alignData: - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.bam - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools index -@ 20 -b Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.sorted.bam.bai - pytest -m alignData + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - version_hisat2.txt + - version_samtools.txt + expire_in: 7 days + dedupData: stage: unit 
@@ -126,6 +173,8 @@ dedupData: except: - merge_requests script: + - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools --version > version_samtools.txt + - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates --version 2> version_markdups.txt& - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5F6_1M.se.sorted.bam O=Q-Y5F6_1M.se.deduped.bam M=Q-Y5F6_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam ./test_data/bam/small/Q-Y5F6_1M.se.deduped.bam - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools index -@ 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.sorted.deduped.bam.bai @@ -134,6 +183,13 @@ dedupData: echo "samtools view -b Q-Y5F6_1M.se.sorted.deduped.bam ${i} > Q-Y5F6_1M.se.sorted.deduped.${i}.bam; samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.deduped.${i}.bam Q-Y5F6_1M.se.sorted.deduped.${i}.bam.bai;"; done | singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' parallel -j 20 -k - pytest -m dedupData + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - version_markdups.txt + - version_samtools.txt + expire_in: 7 days countData: stage: unit @@ -149,7 +205,16 @@ countData: - singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count ./test_data/counts/small/Q-Y5F6_1M.se.countData - singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se - assignedReads=$(grep -m 1 'Assigned' *.summary | grep -oe '\([0-9.]*\)') + - singularity run 'docker://bicf/subread2:2.0.0' featureCounts -v &> version_featurecounts.txt + - singularity run 'docker://bicf/subread2:2.0.0' R --version > version_r.txt - pytest -m makeFeatureCounts + artifacts: + name: "$CI_JOB_NAME" + when: 
always + paths: + - version_featurecounts.txt + - version_r.txt + expire_in: 7 days makeBigWig: stage: unit @@ -159,8 +224,15 @@ makeBigWig: except: - merge_requests script: + - singularity run 'docker://bicf/deeptools3.3:2.0.1_indev' deeptools --version > version_deeptools.txt - singularity run 'docker://bicf/deeptools3.3:2.0.1_indev' bamCoverage -p 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -o Q-Y5F6_1M.se.bw - pytest -m makeBigWig + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - version_deeptools.txt + expire_in: 7 days fastqc: stage: unit @@ -170,8 +242,16 @@ fastqc: except: - merge_requests script: + - singularity run 'docker://bicf/fastqc:2.0.1_indev' fastqc --version > version_fastqc.txt - singularity run 'docker://bicf/fastqc:2.0.1_indev' fastqc ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o . - pytest -m fastqc + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - version_fastqc.txt + expire_in: 7 days + dataQC: stage: unit @@ -199,6 +279,26 @@ outputBag: - pytest -m outputBag +generateVersions: + stage: aggregation + only: + - push + - tags + except: + - merge_requests + script: + - singularity run 'docker://bicf/multiqc1.8:2.0.1_indev' multiqc --version > version_multiqc.txt + - python ./workflow/scripts/generate_versions.py -o software_versions + - python ./workflow/scripts/generate_references.py -r ./docs/references.md -o software_references + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - software_references_mqc.yaml + - software_versions_mqc.yaml + expire_in: 7 days + + humanBioHPC: stage: reference only: diff --git a/.gitlab/merge_request_templates/Merge_Request.md b/.gitlab/merge_request_templates/Merge_Request.md index 88c50aa2a683175a6e6635283724b11ad308e6e5..4d3a6b013244af0d542a58bf528d4ad972a0e828 100644 --- a/.gitlab/merge_request_templates/Merge_Request.md +++ b/.gitlab/merge_request_templates/Merge_Request.md @@ -6,6 +6,8 @@ These are the most common things requested on pull 
requests. - [ ] If you've fixed a bug or added code that should be tested, add tests! - [ ] Documentation in `docs` is updated - [ ] Replace dag.png with the most recent CI pipleine integrated_pe artifact + - [ ] Replace software_versions_mqc.yaml with the most recent CI pipeline generateVersions artifact + - [ ] Replace software_references_mqc.yaml with the most recent CI pipeline generateVersions artifact - [ ] `CHANGELOG.md` is updated - [ ] `README.md` is updated - [ ] `LICENSE.md` is updated with new contributors diff --git a/CHANGELOG.md b/CHANGELOG.md index 33465e4166f2e1f2bb8ab989f9c4a89e7643fc8d..5b47ef9673ccdecefa03bb17e3d1a2850d83e55e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ **User Facing** * Add option to pull references from datahub * Add option to send email on workflow error, with pipeline error message +* Add versions and paper references of software used to report **Background** * Remove (comment out) option to pull references from S3 diff --git a/docs/dag.png b/docs/dag.png index bbc8bffc0fed5b15c7542c8170caacec76a57727..e29fa207116d63af5ed95931b7e45bd618260ff3 100644 Binary files a/docs/dag.png and b/docs/dag.png differ diff --git a/docs/references.md b/docs/references.md index 89002c58f628df65f713dfd752bacdef9a8913ad..4ea1690ec755b51c923070352d4078634bc5e515 100644 --- a/docs/references.md +++ b/docs/references.md @@ -1 +1,43 @@ ### References + +1. **python**: + * Anaconda (Anaconda Software Distribution, [https://anaconda.com](https://anaconda.com)) + +2. **DERIVA**: + * Bugacov, A., Czajkowski, K., Kesselman, C., Kumar, A., Schuler, R. E. and Tangmunarunkit, H. 2017 Experiences with DERIVA: An Asset Management Platform for Accelerating eScience. IEEE 13th International Conference on e-Science (e-Science), Auckland, 2017, pp. 79-88, doi:[10.1109/eScience.2017.20](https://doi.org/10.1109/eScience.2017.20). + +3. **BDBag**: + * D'Arcy, M., Chard, K., Foster, I., Kesselman, C., Madduri, R., Saint, N., & Wagner, R. 2019. 
Big Data Bags: A Scalable Packaging Format for Science. Zenodo. doi:[10.5281/zenodo.3338725](http://doi.org/10.5281/zenodo.3338725). + +4. **RSeQC**: + * Wang, L., Wang, S., Li, W. 2012 RSeQC: quality control of RNA-seq experiments. Bioinformatics. Aug 15;28(16):2184-5. doi:[10.1093/bioinformatics/bts356](https://doi.org/10.1093/bioinformatics/bts356). + +5. **trimgalore**: + * trimgalore [https://github.com/FelixKrueger/TrimGalore](https://github.com/FelixKrueger/TrimGalore) + +6. **hisat2**: + * Kim, D., Paggi, J.M., Park, C., Bennett, C., Salzberg, S.L. 2019 Graph-based genome alignment and genotyping with HISAT2 and HISAT-genotype. Nat Biotechnol. Aug;37(8):907-915. doi:[10.1038/s41587-019-0201-4](https://doi.org/10.1038/s41587-019-0201-4). + +7. **samtools**: + * Li H., B. Handsaker, A. Wysoker, T. Fennell, J. Ruan, N. Homer, G. Marth, G. Abecasis, R. Durbin, and 1000 Genome Project Data Processing Subgroup. 2009. The Sequence alignment/map (SAM) format and SAMtools. Bioinformatics 25: 2078-9. doi:[10.1093/bioinformatics/btp352](http://dx.doi.org/10.1093/bioinformatics/btp352) + +8. **picard**: + * “Picard Toolkit.” 2019. Broad Institute, GitHub Repository. [http://broadinstitute.github.io/picard/](http://broadinstitute.github.io/picard/); Broad Institute + +9. **featureCounts**: + * Liao, Y., Smyth, G.K., Shi, W. 2014 featureCounts: an efficient general purpose program for assigning sequence reads to genomic features. Bioinformatics. Apr 1;30(7):923-30. doi:[10.1093/bioinformatics/btt656](https://doi.org/10.1093/bioinformatics/btt656). + +10. **R**: + * R Core Team 2014. R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL:[http://www.R-project.org/](http://www.R-project.org/). + +11. **deeptools**: + * Ramírez, F., D. P. Ryan, B. Grüning, V. Bhardwaj, F. Kilpert, A. S. Richter, S. Heyne, F. Dündar, and T. Manke. 2016. deepTools2: a next generation web server for deep-sequencing data analysis. 
Nucleic Acids Research 44: W160-165. doi:[10.1093/nar/gkw257](http://dx.doi.org/10.1093/nar/gkw257) + +12. **FastQC**: + * FastQC [https://www.bioinformatics.babraham.ac.uk/projects/fastqc/](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) + +13. **MultiQC**: + * Ewels P., Magnusson M., Lundin S. and Käller M. 2016. MultiQC: Summarize analysis results for multiple tools and samples in a single report. Bioinformatics 32(19): 3047–3048. doi:[10.1093/bioinformatics/btw354](https://dx.doi.org/10.1093/bioinformatics/btw354) + +14. **Nextflow**: + * Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., and Notredame, C. 2017. Nextflow enables reproducible computational workflows. Nature biotechnology, 35(4), 316. diff --git a/docs/software_references_mqc.yaml b/docs/software_references_mqc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..675745fdb642ee27b2aa223bf5dfae78c1bb3897 --- /dev/null +++ b/docs/software_references_mqc.yaml @@ -0,0 +1,93 @@ + + id: 'software_references' + section_name: 'Software References' + description: 'This section describes references for the tools used.' + plot_type: 'html' + data: | + + <h3 id="references">References</h3> + <ol style="list-style-type: decimal"> + <li><strong>python</strong>:</li> + </ol> + <ul> + <li>Anaconda (Anaconda Software Distribution, <a href="https://anaconda.com" class="uri">https://anaconda.com</a>)</li> + </ul> + <ol start="2" style="list-style-type: decimal"> + <li><strong>DERIVA</strong>:</li> + </ol> + <ul> + <li>Bugacov, A., Czajkowski, K., Kesselman, C., Kumar, A., Schuler, R. E. and Tangmunarunkit, H. 2017 Experiences with DERIVA: An Asset Management Platform for Accelerating eScience. IEEE 13th International Conference on e-Science (e-Science), Auckland, 2017, pp. 
79-88, doi:<a href="https://doi.org/10.1109/eScience.2017.20">10.1109/eScience.2017.20</a>.</li> + </ul> + <ol start="3" style="list-style-type: decimal"> + <li><strong>BDBag</strong>:<br /> + </li> + </ol> + <ul> + <li>D'Arcy, M., Chard, K., Foster, I., Kesselman, C., Madduri, R., Saint, N., & Wagner, R.. 2019. Big Data Bags: A Scalable Packaging Format for Science. Zenodo. doi:<a href="http://doi.org/10.5281/zenodo.3338725">10.5281/zenodo.3338725</a>.</li> + </ul> + <ol start="4" style="list-style-type: decimal"> + <li><strong>RSeQC</strong>:</li> + </ol> + <ul> + <li>Wang, L., Wang, S., Li, W. 2012 RSeQC: quality control of RNA-seq experiments. Bioinformatics. Aug 15;28(16):2184-5. doi:<a href="https://doi.org/10.1093/bioinformatics/bts356">10.1093/bioinformatics/bts356</a>.</li> + </ul> + <ol start="5" style="list-style-type: decimal"> + <li><strong>trimgalore</strong>:</li> + </ol> + <ul> + <li>trimgalore <a href="https://github.com/FelixKrueger/TrimGalore" class="uri">https://github.com/FelixKrueger/TrimGalore</a></li> + </ul> + <ol start="6" style="list-style-type: decimal"> + <li><strong>hisat2</strong>:</li> + </ol> + <ul> + <li>Kim ,D.,Paggi, J.M., Park, C., Bennett, C., Salzberg, S.L. Graph-based genome alignment and genotyping with HISAT2 and HISAT-genotype. 2019 Nat Biotechnol. 2019 Aug;37(8):907-915. doi:<a href="https://doi.org/10.1038/s41587-019-0201-4">10.1038/s41587-019-0201-4</a></li> + </ul> + <ol start="7" style="list-style-type: decimal"> + <li><strong>samtools</strong>:</li> + </ol> + <ul> + <li>Li H., B. Handsaker, A. Wysoker, T. Fennell, J. Ruan, N. Homer, G. Marth, G. Abecasis, R. Durbin, and 1000 Genome Project Data Processing Subgroup. 2009. The Sequence alignment/map (SAM) format and SAMtools. Bioinformatics 25: 2078-9. 
doi:<a href="http://dx.doi.org/10.1093/bioinformatics/btp352">10.1093/bioinformatics/btp352</a></li> + </ul> + <ol start="8" style="list-style-type: decimal"> + <li><strong>picard</strong>:</li> + </ol> + <ul> + <li>“Picard Toolkit.†2019. Broad Institute, GitHub Repository. <a href="http://broadinstitute.github.io/picard/" class="uri">http://broadinstitute.github.io/picard/</a>; Broad Institute</li> + </ul> + <ol start="9" style="list-style-type: decimal"> + <li><strong>featureCounts</strong>:</li> + </ol> + <ul> + <li>Liao, Y., Smyth, G.K., Shi, W. 2014 featureCounts: an efficient general purpose program for assigning sequence reads to genomic features. Bioinformatics. Apr 1;30(7):923-30. doi:<a href="https://doi.org/10.1093/bioinformatics/btt656">10.1093/bioinformatics/btt656</a>.</li> + </ul> + <ol start="10" style="list-style-type: decimal"> + <li><strong>R</strong>:</li> + </ol> + <ul> + <li>R Core Team 2014. R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL:<a href="http://www.R-project.org/" class="uri">http://www.R-project.org/</a>.</li> + </ul> + <ol start="11" style="list-style-type: decimal"> + <li><strong>deeptools</strong>:</li> + </ol> + <ul> + <li>RamÃrez, F., D. P. Ryan, B. Grüning, V. Bhardwaj, F. Kilpert, A. S. Richter, S. Heyne, F. Dündar, and T. Manke. 2016. deepTools2: a next generation web server for deep-sequencing data analysis. Nucleic Acids Research 44: W160-165. doi:<a href="http://dx.doi.org/10.1093/nar/gkw257">10.1093/nar/gkw257</a></li> + </ul> + <ol start="12" style="list-style-type: decimal"> + <li><strong>FastQC</strong></li> + </ol> + <ul> + <li>FastQC <a href="https://www.bioinformatics.babraham.ac.uk/projects/fastqc/" class="uri">https://www.bioinformatics.babraham.ac.uk/projects/fastqc/</a></li> + </ul> + <ol start="13" style="list-style-type: decimal"> + <li><strong>MultiQC</strong>:</li> + </ol> + <ul> + <li>Ewels P., Magnusson M., Lundin S. and Käller M. 
2016. MultiQC: Summarize analysis results for multiple tools and samples in a single report. Bioinformatics 32(19): 3047–3048. doi:<a href="https://dx.doi.org/10.1093/bioinformatics/btw354">10.1093/bioinformatics/btw354</a></li> + </ul> + <ol start="14" style="list-style-type: decimal"> + <li><strong>Nextflow</strong>:</li> + </ol> + <ul> + <li>Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., and Notredame, C. 2017. Nextflow enables reproducible computational workflows. Nature biotechnology, 35(4), 316.</li> + </ul> diff --git a/docs/software_versions_mqc.yaml b/docs/software_versions_mqc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7b44cf8c8b898ceb03770670a6acd4d7396a2550 --- /dev/null +++ b/docs/software_versions_mqc.yaml @@ -0,0 +1,24 @@ + + id: 'software_versions' + section_name: 'Software Versions' + section_href: 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/blob/78-tool_version/docs/RNA-Seq%20Pipeline%20Design%20Process%20Table.pdf' + plot_type: 'html' + description: 'are collected for pipeline version.' 
+ data: | + <dl class="dl-horizontal"> + + <dt>Python</dt><dd>v3.7.7</dd> + <dt>DERIVA</dt><dd>v1.0.0</dd> + <dt>BDBag</dt><dd>v1.5.6</dd> + <dt>RSeQC</dt><dd>v3.0.1</dd> + <dt>Trim Galore!</dt><dd>v0.6.4</dd> + <dt>HISAT2</dt><dd>v2.1.0</dd> + <dt>Samtools</dt><dd>v1.9</dd> + <dt>picard (MarkDuplicates)</dt><dd>v2.23.0-SNAPSHOT</dd> + <dt>featureCounts</dt><dd>v2.0.0</dd> + <dt>R</dt><dd>v3.6.3</dd> + <dt>deepTools</dt><dd>v3.3.2</dd> + <dt>FastQC</dt><dd>v0.11.9</dd> + <dt>MultiQC</dt><dd>v1.8</dd> + <dt>Pipeline Version</dt><dd>v0.0.4_indev</dd> + </dl> diff --git a/workflow/conf/multiqc_config.yaml b/workflow/conf/multiqc_config.yaml index 0c780d967f2859c1c76e02a63f3348c26049c694..8a91a75b919a858cdb43c27d0349290bef04b967 100644 --- a/workflow/conf/multiqc_config.yaml +++ b/workflow/conf/multiqc_config.yaml @@ -56,6 +56,10 @@ report_section_order: order: 2000 ref: order: 1000 + software_versions: + order: -1000 + software_references: + order: -2000 skip_generalstats: true @@ -152,4 +156,4 @@ sp: ref: fn: 'reference.tsv' tin: - fn: '*.tin.hist.tsv' \ No newline at end of file + fn: '*.tin.hist.tsv' diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index 7a7207ec05d8e44d4f1ab8824c8ba6a20490295c..551f18d262dca6808ae3b514c0f4a6e36d23cfad 100644 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -70,6 +70,8 @@ if (params.refSource == "biohpc") { referenceInfer = Channel.fromList(["ERCC","GRCh","GRCm"]) multiqcConfig = Channel.fromPath("${baseDir}/conf/multiqc_config.yaml") bicfLogo = Channel.fromPath("${baseDir}/../docs/bicf_logo.png") +softwareReferences = Channel.fromPath("${baseDir}/../docs/software_references_mqc.yaml") +softwareVersions = Channel.fromPath("${baseDir}/../docs/software_versions_mqc.yaml") // Define script files script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh") @@ -1126,6 +1128,8 @@ process aggrQC { input: path multiqcConfig path bicfLogo + path softwareReferences + path softwareVersions path fastqc path trimQC path 
#!/usr/bin/env python3
#generate_references.py
#*
#* --------------------------------------------------------------------------
#* Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/cellranger_count/blob/develop/LICENSE)
#* --------------------------------------------------------------------------
#*

'''Make the MultiQC "Software References" YAML section from a Markdown file.'''

import argparse
import logging
import subprocess
import textwrap

EPILOG = '''
For more details:
        %(prog)s --help
'''

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)

# Keys of the MultiQC custom-content section. Only the *relative* indentation
# matters to a YAML parser: the HTML payload must sit deeper than `data: |`.
YAML_HEADER = (
    "\n"
    "    id: 'software_references'\n"
    "    section_name: 'Software References'\n"
    "    description: 'This section describes references for the tools used.'\n"
    "    plot_type: 'html'\n"
    "    data: |\n"
)

# Prefix applied to every non-blank HTML line placed under `data: |`.
DATA_INDENT = ' ' * 8


def get_args(argv=None):
    '''Define and parse the command-line arguments.

    Args:
        argv: Optional list of argument strings. When None (the default)
            argparse reads ``sys.argv[1:]``, exactly as before.

    Returns:
        argparse.Namespace with ``reference`` (path of the Markdown file)
        and ``output`` (prefix of the file to write).
    '''

    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('-r', '--reference',
                        help="The reference file (markdown format).",
                        required=True)

    parser.add_argument('-o', '--output',
                        help="The out file name.",
                        default='references')

    return parser.parse_args(argv)


def markdown_to_html(reference):
    '''Convert the Markdown file *reference* to an HTML fragment with pandoc.

    pandoc is invoked directly with an argument list (no shell), so paths
    containing spaces or shell metacharacters are passed through untouched.

    Raises:
        subprocess.CalledProcessError: pandoc exited with a non-zero status.
        OSError: pandoc could not be started (e.g. it is not installed).
    '''
    # `-p` is carried over unchanged from the original command line.
    raw_html = subprocess.check_output(['pandoc', '-p', reference])
    # Decode explicitly instead of trusting the locale of the CI runner; the
    # references contain non-ASCII author names.
    return raw_html.decode('utf-8')


def render_references_yaml(references_html):
    '''Return the complete YAML document embedding *references_html*.

    Every non-blank line of the HTML is indented so that it belongs to the
    ``data: |`` block scalar of the header.
    '''
    body = textwrap.indent(references_html, DATA_INDENT)
    if body and not body.endswith('\n'):
        body += '\n'
    return YAML_HEADER + body


def main(argv=None):
    '''Write ``<output>_mqc.yaml`` containing the references rendered as HTML.

    The conversion runs before the output file is opened, so a pandoc
    failure raises instead of leaving a header-only file behind.
    '''
    args = get_args(argv)
    out_filename = args.output + '_mqc.yaml'

    document = render_references_yaml(markdown_to_html(args.reference))

    # One managed handle: the file is flushed and closed deterministically.
    with open(out_filename, 'w', encoding='utf-8') as handle:
        handle.write(document)


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/blob/master/LICENSE)
# * --------------------------------------------------------------------------
#

'''Make YAML of software versions.'''

from collections import OrderedDict
import re
import os
import logging
import glob
import argparse

EPILOG = '''
For more details:
        %(prog)s --help
'''

# SETTINGS

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)

# Display name -> [file holding the tool's version output, regex whose first
# group captures the version]. The order here is the order of the report rows.
SOFTWARE_REGEX = OrderedDict([
    ('Python', ['version_python.txt', r"Python (\S+)"]),
    ('DERIVA', ['version_deriva.txt', r"(\S+)"]),
    ('BDBag', ['version_bdbag.txt', r"BDBag (\S+) \(Bagit \S+\)"]),
    ('RSeQC', ['version_rseqc.txt', r"infer_experiment.py (\S+)"]),
    ('Trim Galore!', ['version_trimgalore.txt', r"version (\S+)"]),
    ('HISAT2', ['version_hisat2.txt', r"version (\S+)"]),
    ('Samtools', ['version_samtools.txt', r"samtools (\S+)"]),
    ('picard (MarkDuplicates)', ['version_markdups.txt', r"(\S\.\S{2}\.\S+)"]),
    ('featureCounts', ['version_featurecounts.txt', r"featureCounts v(\S+)"]),
    ('R', ['version_r.txt', r"R version (\S+)"]),
    ('deepTools', ['version_deeptools.txt', r"deeptools (\S+)"]),
    ('FastQC', ['version_fastqc.txt', r"FastQC v(\S+)"]),
    ('MultiQC', ['version_multiqc.txt', r"multiqc, version (\S+)"]),
    ('Pipeline Version', ['./workflow/nextflow.config', r"version = 'v(\S+)'"]),
])

# Placeholder shown for a tool whose version file is absent or unparsable.
NOT_RUN = '<span style="color:#999999;">Not Run</span>'

# Prefix of every HTML line placed under `data: |`; it only has to be deeper
# than the indentation of the keys in YAML_HEADER.
DATA_INDENT = ' ' * 8

YAML_HEADER = (
    "\n"
    "    id: 'software_versions'\n"
    "    section_name: 'Software Versions'\n"
    "    section_href: 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/blob/78-tool_version/docs/RNA-Seq%20Pipeline%20Design%20Process%20Table.pdf'\n"
    "    plot_type: 'html'\n"
    "    description: 'are collected for pipeline version.'\n"
    "    data: |\n"
    + DATA_INDENT + '<dl class="dl-horizontal">\n'
    "\n"
)


def get_args(argv=None):
    '''Define and parse the command-line arguments.

    Args:
        argv: Optional list of argument strings. When None (the default)
            argparse reads ``sys.argv[1:]``, exactly as before.

    Returns:
        argparse.Namespace with ``output`` (prefix of the file to write) and
        ``test`` (bool, makes unknown ``.txt`` files an error).
    '''

    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('-o', '--output',
                        help="The out file name.",
                        required=True)

    parser.add_argument('-t', '--test',
                        help='Used for testing purposes',
                        default=False,
                        action='store_true')

    return parser.parse_args(argv)


def check_files(files, test):
    '''Check that every file in *files* has an entry in SOFTWARE_REGEX.

    Args:
        files: Iterable of file paths found in the working tree.
        test: When true, files without a matching regex are an error;
            otherwise they are ignored.

    Raises:
        Exception: *test* is true and at least one file is not listed in
            SOFTWARE_REGEX.
    '''

    logger.info("Running file check.")

    software_files = {entry[0] for entry in SOFTWARE_REGEX.values()}

    # Sorted so the error message is reproducible between runs.
    extra_files = sorted(set(files) - software_files)

    if extra_files and test:
        logger.error('Missing regex: %s', extra_files)
        raise Exception("Missing regex: %s" % extra_files)


def collect_versions():
    '''Return an OrderedDict mapping each tool name to ``v<version>``.

    Each version file is looked up relative to the current working
    directory. Tools whose file is missing, or whose regex does not match,
    keep the NOT_RUN placeholder.
    '''
    results = OrderedDict((name, NOT_RUN) for name in SOFTWARE_REGEX)

    for name, (filename, pattern) in SOFTWARE_REGEX.items():
        if not os.path.isfile(filename):
            continue
        with open(filename) as handle:
            match = re.search(pattern, handle.read())
        if match:
            results[name] = "v{}".format(match.group(1))

    return results


def render_versions_yaml(results):
    '''Return the complete YAML document for the name -> version mapping.'''
    rows = [
        "{}<dt>{}</dt><dd>{}</dd>".format(DATA_INDENT, name, version)
        for name, version in results.items()
    ]
    rows.append(DATA_INDENT + "</dl>")
    return YAML_HEADER + "\n".join(rows) + "\n"


def main(argv=None):
    '''Write ``<output>_mqc.yaml`` listing the version of every tool.'''
    args = get_args(argv)
    out_filename = args.output + '_mqc.yaml'

    # list all files
    files = glob.glob('**/*.txt', recursive=True)

    # Check for version files:
    check_files(files, args.test)

    # Dump to YAML through one managed handle rather than re-opening (and
    # never closing) the output file for every row.
    with open(out_filename, "w") as handle:
        handle.write(render_versions_yaml(collect_versions()))


if __name__ == '__main__':
    main()