Skip to content
Snippets Groups Projects
Commit 10cd6a4c authored by Venkat Malladi's avatar Venkat Malladi
Browse files

Add in software versions and references.

parent c114a629
Branches
Tags
2 merge requests!58Develop,!48Resolve "Output tool version to report in multiqc"
Pipeline #8304 failed
......@@ -9,6 +9,7 @@ before_script:
stages:
- unit
- aggregation
- reference
- integration
- consistency
......@@ -22,8 +23,15 @@ getBag:
- merge_requests
script:
- ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json
- singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' deriva-download-cli --version > version_deriva.txt
- singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' deriva-download-cli dev.gudmap.org --catalog 2 ./workflow/conf/replicate_export_config.json . rid=Q-Y5F6
- pytest -m getBag
artifacts:
name: "$CI_JOB_NAME"
when: always
paths:
- version_deriva.txt
expire_in: 7 days
getData:
stage: unit
......@@ -33,10 +41,17 @@ getData:
except:
- merge_requests
script:
- singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bdbag --version > version_bdbag.txt
- ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt
- unzip ./test_data/bag/Replicate_Q-Y5F6.zip
- singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bash ./workflow/scripts/bdbagFetch.sh Replicate_Q-Y5F6 Replicate_Q-Y5F6 TEST
- pytest -m getData
artifacts:
name: "$CI_JOB_NAME"
when: always
paths:
- version_bdbag.txt
expire_in: 7 days
parseMetadata:
stage: unit
......@@ -46,6 +61,7 @@ parseMetadata:
except:
- merge_requests
script:
- singularity run 'docker://bicf/python3:2.0.1_indev' python3 --version > version_python.txt
- rep=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p repRID)
- exp=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p expRID)
- study=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p studyRID)
......@@ -57,6 +73,12 @@ parseMetadata:
- readLength=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.stageNew.csv" -p readLength)
- echo -e "${endsMeta},${endsManual},${stranded},${spike},${species},${readLength},${exp},${study},${rep}" > design.csv
- pytest -m parseMetadata
artifacts:
name: "$CI_JOB_NAME"
when: always
paths:
- version_python.txt
expire_in: 7 days
inferMetadata:
stage: unit
......@@ -66,6 +88,7 @@ inferMetadata:
except:
- merge_requests
script:
- singularity run 'docker://bicf/rseqc3.0:2.0.1_indev' infer_experiment.py --version > version_rseqc.txt
- >
align=$(echo $(grep "Overall alignment rate" ./test_data/meta/Q-Y5F6_1M.se.alignSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) &&
if [[ ${align} == "" ]]; then exit 1; fi
......@@ -74,6 +97,12 @@ inferMetadata:
ended=`singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/inferMeta.sh endness Q-Y5F6_1M.se.inferMetadata.log` &&
if [[ ${ended} == "" ]]; then exit 1; fi
- pytest -m inferMetadata
artifacts:
name: "$CI_JOB_NAME"
when: always
paths:
- version_rseqc.txt
expire_in: 7 days
trimData:
stage: unit
......@@ -83,11 +112,18 @@ trimData:
except:
- merge_requests
script:
- singularity run 'docker://bicf/trimgalore:1.1' trim_galore --version > version_trimgalore.txt
- singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --basename Q-Y5F6_1M.se ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
- singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --paired --basename Q-Y5F6_1M.pe ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
- readLengthSE=$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
- readLengthPE=$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
- pytest -m trimData
artifacts:
name: "$CI_JOB_NAME"
when: always
paths:
- version_trimgalore.txt
expire_in: 7 days
downsampleData:
stage: unit
......@@ -97,8 +133,15 @@ downsampleData:
except:
- merge_requests
script:
- singularity run 'docker://bicf/seqtk:2.0.1_indev' seqtk > version_seqtk.txt
- singularity run 'docker://bicf/seqtk:2.0.1_indev' seqtk sample -s100 ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz 1000 1> sampled.1.fq
- pytest -m downsampleData
artifacts:
name: "$CI_JOB_NAME"
when: always
paths:
- version_seqtk.txt
expire_in: 7 days
alignData:
stage: unit
......@@ -108,6 +151,8 @@ alignData:
except:
- merge_requests
script:
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' hisat2 --version > version_hisat2.txt
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools --version > version_samtools.txt
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.se.unal.gz -S Q-Y5F6_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz --summary-file Q-Y5F6_1M.se.alignSummary.txt --new-summary
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.bam
......@@ -117,6 +162,14 @@ alignData:
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.bam
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools index -@ 20 -b Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.sorted.bam.bai
- pytest -m alignData
artifacts:
name: "$CI_JOB_NAME"
when: always
paths:
- version_hisat2.txt
- version_samtools.txt
expire_in: 7 days
dedupData:
stage: unit
......@@ -126,14 +179,23 @@ dedupData:
except:
- merge_requests
script:
- singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates --version > version_markdups.txt
- singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools --version > version_samtools.txt
- singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5F6_1M.se.sorted.bam O=Q-Y5F6_1M.se.deduped.bam M=Q-Y5F6_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true
- singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam ./test_data/bam/small/Q-Y5F6_1M.se.deduped.bam
- singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools index -@ 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.sorted.deduped.bam.bai
- >
for i in {"chr8","chr4","chrY"}; do
for i in {"chr8","chr4","chrY"}; do
echo "samtools view -b Q-Y5F6_1M.se.sorted.deduped.bam ${i} > Q-Y5F6_1M.se.sorted.deduped.${i}.bam; samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.deduped.${i}.bam Q-Y5F6_1M.se.sorted.deduped.${i}.bam.bai;";
done | singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' parallel -j 20 -k
- pytest -m dedupData
artifacts:
name: "$CI_JOB_NAME"
when: always
paths:
- version_markdups.txt
- version_samtools.txt
expire_in: 7 days
countData:
stage: unit
......@@ -145,11 +207,20 @@ countData:
script:
- ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/geneID.tsv
- ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/Entrez.tsv
- singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se.countData -s 1 -R SAM --primary --ignoreDup ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam
- singularity run 'docker://bicf/subread2:2.0.0' featureCounts -v > version_featurecounts.txt
- singularity run 'docker://bicf/subread2:2.0.0' R --version > version_r.txt
- singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se.countData -s 1 -R SAM --primary --ignoreDup ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam
- singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count ./test_data/counts/small/Q-Y5F6_1M.se.countData
- singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se
- assignedReads=$(grep -m 1 'Assigned' *.summary | grep -oe '\([0-9.]*\)')
- pytest -m makeFeatureCounts
artifacts:
name: "$CI_JOB_NAME"
when: always
paths:
- version_featurecounts.txt
- version_r.txt
expire_in: 7 days
makeBigWig:
stage: unit
......@@ -159,8 +230,15 @@ makeBigWig:
except:
- merge_requests
script:
- singularity run 'docker://bicf/deeptools3.3:2.0.1_indev' deeptools --version > version_deeptools.txt
- singularity run 'docker://bicf/deeptools3.3:2.0.1_indev' bamCoverage -p 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -o Q-Y5F6_1M.se.bw
- pytest -m makeBigWig
artifacts:
name: "$CI_JOB_NAME"
when: always
paths:
- version_deeptools.txt
expire_in: 7 days
fastqc:
stage: unit
......@@ -170,8 +248,16 @@ fastqc:
except:
- merge_requests
script:
- singularity run 'docker://bicf/fastqc:2.0.1_indev' fastqc --version > version_fastqc.txt
- singularity run 'docker://bicf/fastqc:2.0.1_indev' fastqc ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o .
- pytest -m fastqc
artifacts:
name: "$CI_JOB_NAME"
when: always
paths:
- version_fastqc.txt
expire_in: 7 days
dataQC:
stage: unit
......@@ -199,6 +285,26 @@ outputBag:
- pytest -m outputBag
# Collects the version_*.txt artifacts written by the unit-stage jobs and turns
# them, together with docs/references.md, into MultiQC custom-content YAML.
generateVersions:
  # Must name a stage declared under `stages:`; the declared stage is
  # `aggregation` (an undeclared stage name makes the pipeline invalid).
  stage: aggregation
  only:
    - push
    - tags
  except:
    - merge_requests
  script:
    - singularity run 'docker://bicf/multiqc1.8:2.0.1_indev' multiqc --version > version_multiqc.txt
    - python ./workflow/scripts/generate_versions.py -o software_versions
    - python ./workflow/scripts/generate_references.py -r ./docs/references.md -o software_references
  artifacts:
    name: "$CI_JOB_NAME"
    # Keep the generated files even when the job fails, to help debugging.
    when: always
    paths:
      - software_references_mqc.yaml
      - software_versions_mqc.yaml
    expire_in: 7 days
humanBioHPC:
stage: reference
only:
......@@ -366,7 +472,7 @@ override_fastq:
max: 1
when:
- always
override_species:
stage: integration
only: [merge_requests]
......@@ -388,7 +494,7 @@ override_species:
max: 1
when:
- always
consistency:
stage: consistency
......@@ -413,4 +519,4 @@ consistency:
- assignedPE.txt
- assignedExpectSE.txt
- assignedExpectPE.txt
expire_in: 7 days
\ No newline at end of file
expire_in: 7 days
......@@ -6,6 +6,8 @@ These are the most common things requested on pull requests.
- [ ] If you've fixed a bug or added code that should be tested, add tests!
- [ ] Documentation in `docs` is updated
- [ ] Replace dag.png with the most recent CI pipeline integrated_pe artifact
- [ ] Replace software_versions_mqc.yaml with the most recent CI pipeline generateVersions artifact
- [ ] Replace software_references_mqc.yaml with the most recent CI pipeline generateVersions artifact
- [ ] `CHANGELOG.md` is updated
- [ ] `README.md` is updated
- [ ] `LICENSE.md` is updated with new contributors
......
### References
1. **python**:
* Anaconda (Anaconda Software Distribution, [https://anaconda.com](https://anaconda.com))
2. **DERIVA**:
* Bugacov, A., Czajkowski, K., Kesselman, C., Kumar, A., Schuler, R. E. and Tangmunarunkit, H. 2017 Experiences with DERIVA: An Asset Management Platform for Accelerating eScience. IEEE 13th International Conference on e-Science (e-Science), Auckland, 2017, pp. 79-88, doi:[10.1109/eScience.2017.20](https://doi.org/10.1109/eScience.2017.20).
3. **BDBag**:
* D'Arcy, M., Chard, K., Foster, I., Kesselman, C., Madduri, R., Saint, N., & Wagner, R.. 2019. Big Data Bags: A Scalable Packaging Format for Science. Zenodo. doi:[10.5281/zenodo.3338725](http://doi.org/10.5281/zenodo.3338725).
4. **RSeQC**:
* Wang, L., Wang, S., Li, W. 2012 RSeQC: quality control of RNA-seq experiments. Bioinformatics. Aug 15;28(16):2184-5. doi:[10.1093/bioinformatics/bts356](https://doi.org/10.1093/bioinformatics/bts356).
5. **trimgalore**:
* trimgalore [https://github.com/FelixKrueger/TrimGalore](https://github.com/FelixKrueger/TrimGalore)
6. **seqtk**:
* seqtk [https://github.com/lh3/seqtk](https://github.com/lh3/seqtk)
7. **hisat2**:
* Kim, D., Paggi, J.M., Park, C., Bennett, C., Salzberg, S.L. Graph-based genome alignment and genotyping with HISAT2 and HISAT-genotype. Nat Biotechnol. 2019 Aug;37(8):907-915. doi: 10.1038/s41587-019-0201-4.
Kim D, Paggi JM, Park C, Bennett C, Salzberg SL. Graph-based genome alignment and genotyping with HISAT2 and HISAT-genotype. 2019 Nat Biotechnol. Aug;37(8):907-915. doi:[10.1038/s41587-019-0201-4](https://doi.org/10.1038/s41587-019-0201-4).
8. **samtools**:
* Li H., B. Handsaker, A. Wysoker, T. Fennell, J. Ruan, N. Homer, G. Marth, G. Abecasis, R. Durbin, and 1000 Genome Project Data Processing Subgroup. 2009. The Sequence alignment/map (SAM) format and SAMtools. Bioinformatics 25: 2078-9. doi:[10.1093/bioinformatics/btp352](http://dx.doi.org/10.1093/bioinformatics/btp352)
9. **picard**:
* “Picard Toolkit.” 2019. Broad Institute, GitHub Repository. [http://broadinstitute.github.io/picard/](http://broadinstitute.github.io/picard/); Broad Institute
10. **featureCounts**:
* Liao, Y., Smyth, G.K., Shi, W. 2014 featureCounts: an efficient general purpose program for assigning sequence reads to genomic features. Bioinformatics. Apr 1;30(7):923-30. doi:[10.1093/bioinformatics/btt656](https://doi.org/10.1093/bioinformatics/btt656).
11. **R**:
* R Core Team 2014. R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL:[http://www.R-project.org/](http://www.R-project.org/).
12. **deeptools**:
* Ramírez, F., D. P. Ryan, B. Grüning, V. Bhardwaj, F. Kilpert, A. S. Richter, S. Heyne, F. Dündar, and T. Manke. 2016. deepTools2: a next generation web server for deep-sequencing data analysis. Nucleic Acids Research 44: W160-165. doi:[10.1093/nar/gkw257](http://dx.doi.org/10.1093/nar/gkw257)
13. **FastQC**:
* FastQC [https://www.bioinformatics.babraham.ac.uk/projects/fastqc/](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)
14. **MultiQC**:
* Ewels P., Magnusson M., Lundin S. and Käller M. 2016. MultiQC: Summarize analysis results for multiple tools and samples in a single report. Bioinformatics 32(19): 3047–3048. doi:[10.1093/bioinformatics/btw354](https://dx.doi.org/10.1093/bioinformatics/btw354)
15. **Nextflow**:
* Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., and Notredame, C. 2017. Nextflow enables reproducible computational workflows. Nature biotechnology, 35(4), 316.
......@@ -56,6 +56,11 @@ report_section_order:
order: 2000
ref:
order: 1000
Software_Versions:
order: -1200
Software_References:
order: -1300
skip_generalstats: true
......@@ -152,4 +157,4 @@ sp:
ref:
fn: 'reference.tsv'
tin:
fn: '*.tin.hist.tsv'
\ No newline at end of file
fn: '*.tin.hist.tsv'
#!/usr/bin/env nextflow
// ######## #### ###### ########
// ## ## ## ## ## ##
// ## ## ## ## ##
// ######## ## ## ######
// ## ## ## ## ##
// ## ## ## ## ## ##
// ######## #### ###### ##
// ######## #### ###### ########
// ## ## ## ## ## ##
// ## ## ## ## ##
// ######## ## ## ######
// ## ## ## ## ##
// ## ## ## ## ## ##
// ######## #### ###### ##
// Define input variables
params.deriva = "${baseDir}/../test_data/auth/credential.json"
......@@ -66,6 +66,8 @@ if (params.refSource == "biohpc") {
referenceInfer = Channel.fromList(["ERCC","GRCh","GRCm"])
multiqcConfig = Channel.fromPath("${baseDir}/conf/multiqc_config.yaml")
bicfLogo = Channel.fromPath("${baseDir}/../docs/bicf_logo.png")
softwareReferences = Channel.fromPath("${baseDir}/../docs/software_references_mqc.yaml")
softwareVersions = Channel.fromPath("${baseDir}/../docs/software_versions_mqc.yaml")
// Define script files
script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh")
......@@ -89,7 +91,7 @@ process trackStart {
"""
hostname
ulimit -a
curl -H 'Content-Type: application/json' -X PUT -d \
'{ \
"sessionId": "${workflow.sessionId}", \
......@@ -199,16 +201,16 @@ process getData {
mkdir -p ~/.bdbag
ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt
echo -e "LOG: linked" >> ${repRID}.getData.log
# get bag basename
replicate=\$(basename "${inputBag}" | cut -d "." -f1)
echo -e "LOG: bag replicate name \${replicate}" >> ${repRID}.getData.log
# unzip bag
echo -e "LOG: unzipping replicate bag" >> ${repRID}.getData.log
unzip ${inputBag}
echo -e "LOG: unzipped" >> ${repRID}.getData.log
# bag fetch fastq's only and rename by repRID
echo -e "LOG: fetching replicate bdbag" >> ${repRID}.getData.log
sh ${script_bdbagFetch} \${replicate} ${repRID}
......@@ -259,7 +261,7 @@ process parseMetadata {
# get experiment RID metadata
exp=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p expRID)
echo -e "LOG: experiment RID metadata parsed: \${exp}" >> ${repRID}.parseMetadata.log
# get study RID metadata
study=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p studyRID)
echo -e "LOG: study RID metadata parsed: \${study}" >> ${repRID}.parseMetadata.log
......@@ -267,7 +269,7 @@ process parseMetadata {
# get endedness metadata
endsMeta=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p endsMeta)
echo -e "LOG: endedness metadata parsed: \${endsMeta}" >> ${repRID}.parseMetadata.log
# manually get endedness
endsManual=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p endsManual)
echo -e "LOG: endedness manually detected: \${endsManual}" >> ${repRID}.parseMetadata.log
......@@ -275,11 +277,11 @@ process parseMetadata {
# get strandedness metadata
stranded=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p stranded)
echo -e "LOG: strandedness metadata parsed: \${stranded}" >> ${repRID}.parseMetadata.log
# get spike-in metadata
spike=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p spike)
echo -e "LOG: spike-in metadata parsed: \${spike}" >> ${repRID}.parseMetadata.log
# get species metadata
species=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experiment}" -p species)
echo -e "LOG: species metadata parsed: \${species}" >> ${repRID}.parseMetadata.log
......@@ -358,7 +360,7 @@ process trimData {
fi
echo -e "LOG: trimmed" >> ${repRID}.trimData.log
echo -e "LOG: average trimmed read length: \${readLength}" >> ${repRID}.trimData.log
# save read length file
echo -e "\${readLength}" > readLength.csv
"""
......@@ -381,7 +383,7 @@ getRefInferInput = referenceInfer.combine(deriva_getRefInfer.combine(script_refD
/*
* getRefInfer: downloads appropriate reference for metadata inference
*/
*/
process getRefInfer {
tag "${refName}"
......@@ -391,7 +393,7 @@ process getRefInfer {
output:
tuple val (refName), path ("hisat2", type: 'dir'), path ("*.fna"), path ("*.gtf") into refInfer
path ("${refName}", type: 'dir') into bedInfer
script:
"""
hostname > ${repRID}.${refName}.getRefInfer.log
......@@ -532,14 +534,14 @@ process alignSampleData {
echo -e "LOG: aligning ${ends}" >> ${repRID}.${ref}.alignSampleData.log
if [ "${ends}" == "se" ]
then
hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome -U ${fastq1} --summary-file ${ref}.alignSampleSummary.txt --new-summary
elif [ "${ends}" == "pe" ]
then
hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome --no-mixed --no-discordant -1 ${fastq1} -2 ${fastq2} --summary-file ${ref}.alignSampleSummary.txt --new-summary
fi
echo -e "LOG: aliged" >> ${repRID}.${ref}.alignSampleData.log
# convert the output sam file to a sorted bam file using Samtools
echo -e "LOG: converting from sam to bam" >> ${repRID}.${ref}.alignSampleData.log
samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${ref}.sampled.bam ${ref}.sampled.sam
......@@ -639,7 +641,7 @@ process inferMetadata {
ended=`bash inferMeta.sh endness ${repRID}.infer_experiment.txt`
fail=`bash inferMeta.sh fail ${repRID}.infer_experiment.txt`
if [ \${ended} == "PairEnd" ]
if [ \${ended} == "PairEnd" ]
then
ends="pe"
percentF=`bash inferMeta.sh pef ${repRID}.infer_experiment.txt`
......@@ -728,7 +730,7 @@ process getRef {
output:
tuple path ("hisat2", type: 'dir'), path ("bed", type: 'dir'), path ("*.fna"), path ("*.gtf"), path ("geneID.tsv"), path ("Entrez.tsv") into reference
script:
"""
hostname > ${repRID}.getRef.log
......@@ -847,7 +849,7 @@ process alignData {
strandedParam="--rna-strandness R"
elif [ "${stranded}" == "reverse" ] && [ "${ends}" == "pe" ]
then
strandedParam="--rna-strandness RF"
strandedParam="--rna-strandness RF"
fi
# align the reads with Hisat2
......@@ -860,7 +862,7 @@ process alignData {
hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome \${strandedParam} --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} --summary-file ${repRID}.alignSummary.txt --new-summary
fi
echo -e "LOG: alignined" >> ${repRID}.align.log
# convert the output sam file to a sorted bam file using Samtools
echo -e "LOG: converting from sam to bam" >> ${repRID}.align.log
samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam
......@@ -892,7 +894,7 @@ process dedupData {
output:
tuple path ("${repRID}.sorted.deduped.bam"), path ("${repRID}.sorted.deduped.bam.bai") into dedupBam
tuple path ("${repRID}.sorted.deduped.*.bam"), path ("${repRID}.sorted.deduped.*.bam.bai") into dedupChrBam
tuple path ("${repRID}.sorted.deduped.*.bam"), path ("${repRID}.sorted.deduped.*.bam.bai") into dedupChrBam
path ("*.deduped.Metrics.txt") into dedupQC
script:
......@@ -908,7 +910,7 @@ process dedupData {
# sort the bam file using Samtools
echo -e "LOG: sorting the bam file" >> ${repRID}.dedup.log
samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.deduped.bam ${repRID}.deduped.bam
# index the sorted bam using Samtools
echo -e "LOG: indexing sorted bam file" >> ${repRID}.dedup.log
samtools index -@ `nproc` -b ${repRID}.sorted.deduped.bam ${repRID}.sorted.deduped.bam.bai
......@@ -1004,7 +1006,7 @@ process countData {
featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}.countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam
fi
echo -e "LOG: counted" >> ${repRID}.countData.log
# extract assigned reads
grep -m 1 'Assigned' *.countData.summary | grep -oe '\\([0-9.]*\\)' > assignedReads.csv
......@@ -1069,12 +1071,12 @@ process dataQC {
tuple path (bam), path (bai) from dedupBam_dataQC
tuple path (chrBam), path (chrBai) from dedupChrBam
val ends from endsInfer_dataQC
output:
path "${repRID}.tin.hist.tsv" into tinHist
path "${repRID}.tin.med.csv" into inferMetadata_tinMed
path "${repRID}.insertSize.inner_distance_freq.txt" into innerDistance
script:
"""
hostname > ${repRID}.dataQC.log
......@@ -1122,6 +1124,8 @@ process aggrQC {
input:
path multiqcConfig
path bicfLogo
path softwareReferences
path softwareVersions
path fastqc
path trimQC
path alignQC
......@@ -1179,8 +1183,8 @@ process aggrQC {
echo -e "LOG: creating run table" >> ${repRID}.aggrQC.log
echo -e "Session\tSession ID\tStart Time\tPipeline Version\tInput" > run.tsv
echo -e "Session\t${workflow.sessionId}\t${workflow.start}\t${workflow.manifest.version}\t\${input}" >> run.tsv
# make RID table
echo -e "LOG: creating RID table" >> ${repRID}.aggrQC.log
echo -e "Replicate\tReplicate RID\tExperiment RID\tStudy RID" > rid.tsv
......@@ -1224,11 +1228,11 @@ process aggrQC {
process outputBag {
tag "${repRID}"
publishDir "${outDir}/outputBag", mode: 'copy', pattern: "Replicate_${repRID}.outputBag.zip"
input:
path multiqc
path multiqcJSON
output:
path ("Replicate_*.zip") into outputBag
......@@ -1239,4 +1243,4 @@ process outputBag {
cp ${multiqcJSON} Replicate_${repRID}.outputBag
bdbag Replicate_${repRID}.outputBag --archiver zip
"""
}
\ No newline at end of file
}
#!/usr/bin/env python3
#generate_references.py
#*
#* --------------------------------------------------------------------------
#* Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/cellranger_count/blob/develop/LICENSE)
#* --------------------------------------------------------------------------
#*
import argparse
import subprocess
import shlex
import logging
# Text passed as the argparse epilog in get_args(); argparse substitutes
# %(prog)s with the program name when it renders the help.
EPILOG = '''
For more details:
%(prog)s --help
'''
# Module logger. Only a NullHandler is attached and propagation to the root
# logger is disabled, so records emitted through it are discarded.
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
    '''Parse the command line and return the resulting namespace.

    Options:
        -r/--reference  reference file in markdown format (required)
        -o/--output     output file name stem (default: 'references')
    '''
    option_specs = (
        (('-r', '--reference'),
         {'help': "The reference file (markdown format).",
          'required': True}),
        (('-o', '--output'),
         {'help': "The out file name.",
          'default': 'references'}),
    )

    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=EPILOG,
        description=__doc__)
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)

    return cli.parse_args()
def main():
    '''Write the MultiQC custom-content YAML that holds the software references.

    The markdown file given by -r/--reference is converted to HTML with
    pandoc and stored as the `data` block of `<output>_mqc.yaml`.

    Raises:
        subprocess.CalledProcessError: pandoc exited with a non-zero status.
        OSError: pandoc could not be started or the output could not be written.
    '''
    args = get_args()
    reference = args.reference
    out_filename = args.output + '_mqc.yaml'

    # Header for HTML
    header = '''
id: 'software_references'
section_name: 'Software References'
description: 'This section describes references for the tools used.'
plot_type: 'html'
data: |
'''

    # Turn Markdown into HTML. pandoc is invoked with an argument list (no
    # shell), so a reference path containing spaces or quotes is passed through
    # untouched, and a pandoc failure raises instead of being hidden by a pipe.
    # It runs before the output is opened so a failed conversion does not leave
    # a header-only file behind.
    references_html = subprocess.check_output(['pandoc', '-p', reference])

    # Bytes are written as-is so non-ASCII author names survive whatever the
    # locale is. Every HTML line gets a one-space prefix so that it sits inside
    # the YAML block scalar opened by `data: |`.
    with open(out_filename, 'wb') as out_file:
        out_file.write((header + '\n').encode('utf-8'))
        for line in references_html.splitlines(True):
            out_file.write(b' ' + line)


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/blob/master/LICENSE)
# * --------------------------------------------------------------------------
#
'''Make YAML of software versions.'''
from __future__ import print_function
from collections import OrderedDict
import re
import os
import logging
import glob
import argparse
import numpy as np
# Text passed as the argparse epilog in get_args(); argparse substitutes
# %(prog)s with the program name when it renders the help.
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
# Module logger. Only a NullHandler is attached and propagation to the root
# logger is disabled, so records emitted through it are discarded.
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
# Maps each reported tool name to [file holding the captured version output,
# regex whose first group is the version string]. main() applies each regex
# with re.search, so a pattern may match anywhere in the file.
SOFTWARE_REGEX = {
    'Python': ['version_python.txt', r"Python (\S+)"],
    'DERIVA': ['version_deriva.txt', r"(\S+)"],
    'BDBag': ['version_bdbag.txt', r"BDBag (\S+) \(Bagit \S+\)"],
    'RSeQC': ['version_rseqc.txt', r"infer_experiment.py (\S+)"],
    'Trim Galore!': ['version_trimgalore.txt', r"version (\S+)"],
    'seqtk': ['version_seqtk.txt', r"Version: (\S+)"],
    # The CI job writes `version_hisat2.txt`; the pattern is anchored on the
    # binary name only so it does not depend on the install directory.
    'HISAT2': ['version_hisat2.txt', r"hisat2-align-s version (\S+)"],
    'Samtools': ['version_samtools.txt', r"samtools (\S+)"],
    'picard (MarkDuplicates)': ['version_markdups.txt', r"(\S+)"],
    'featureCounts': ['version_featurecounts.txt', r"featureCounts v(\S+)"],
    'R': ['version_r.txt', r"R version (\S+)"],
    'deepTools': ['version_deeptools.txt', r"deeptools (\S+)"],
    'FastQC': ['version_fastqc.txt', r"FastQC v(\S+)"],
    'MultiQC': ['version_multiqc.txt', r"multiqc, version (\S+)"],
    "Pipeline Version": ['./workflow/nextflow.config', r"version = '(\S+)'"]
}
def get_args():
    '''Parse the command line and return the resulting namespace.

    Options:
        -o/--output  output file name stem (required)
        -t/--test    flag, off by default; used for testing purposes
    '''
    option_specs = (
        (('-o', '--output'),
         {'help': "The out file name.",
          'required': True}),
        (('-t', '--test'),
         {'help': 'Used for testing purposes',
          'default': False,
          'action': 'store_true'}),
    )

    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=EPILOG,
        description=__doc__)
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)

    return cli.parse_args()
def check_files(files, test):
    '''Check that every discovered file has an entry in SOFTWARE_REGEX.

    Args:
        files: iterable of file paths found by the caller.
        test: when true, files without a SOFTWARE_REGEX entry are an error;
            when false, they are ignored.

    Raises:
        Exception: `test` is true and at least one path in `files` is not
            listed as a version file in SOFTWARE_REGEX.
    '''
    logger.info("Running file check.")

    # First element of each SOFTWARE_REGEX value is the version file name.
    software_files = {entry[0] for entry in SOFTWARE_REGEX.values()}
    # Sorted so the log line and the exception text are deterministic.
    extra_files = sorted(set(files) - software_files)

    if extra_files and test:
        logger.error('Missing regex: %s', extra_files)
        raise Exception("Missing regex: %s" % extra_files)
def main():
    '''Collect tool versions and write them as MultiQC custom-content YAML.

    Every tool listed in SOFTWARE_REGEX gets a row. A tool whose version file
    is missing, or whose regex does not match, is reported as "Not Run". The
    result is written to `<output>_mqc.yaml`.
    '''
    args = get_args()
    out_filename = args.output + '_mqc.yaml'

    # Seed one placeholder row per tool this pipeline actually tracks, in
    # SOFTWARE_REGEX order, instead of a list of unrelated ChIP-seq tools.
    not_run = '<span style="color:#999999;\">Not Run</span>'
    results = OrderedDict((name, not_run) for name in SOFTWARE_REGEX)

    # list all files
    files = glob.glob('**/*.txt', recursive=True)

    # Check for version files:
    check_files(files, args.test)

    # Search each file using its regex
    for name, (version_file, pattern) in SOFTWARE_REGEX.items():
        if not os.path.isfile(version_file):
            continue
        with open(version_file) as handle:
            match = re.search(pattern, handle.read())
        if match:
            results[name] = "v{}".format(match.group(1))

    # Dump to YAML. The HTML lines carry a leading space so that they belong
    # to the block scalar opened by `data: |`.
    header = '''
id: 'Software Versions'
section_name: 'Software Versions'
section_href: 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/'
plot_type: 'html'
description: 'are collected at run time from the software output.'
data: |
 <dl class="dl-horizontal">
'''
    # One handle for the whole file, closed on exit, rather than reopening
    # the file for every row.
    with open(out_filename, "w") as out_file:
        print(header, file=out_file)
        for name, version in results.items():
            print(" <dt>{}</dt><dd>{}</dd>".format(name, version), file=out_file)
        print(" </dl>", file=out_file)


if __name__ == '__main__':
    main()
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment