diff --git a/.gitignore b/.gitignore index 2bc34493af5ad8d30a9ef477283aa7c4b32700b8..12288788210fa386427657fa55ab47b9ac14a6aa 100644 --- a/.gitignore +++ b/.gitignore @@ -1,32 +1,6 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -*.egg-info/ -.installed.cfg -*.egg - -# PyInstaller -# Created by https://www.gitignore.io/api/r,perl,macos,linux,python,windows -# Edit at https://www.gitignore.io/?templates=r,perl,macos,linux,python,windows +# Created by https://www.gitignore.io/api/r,perl,linux,macos,python,windows,microsoftoffice +# Edit at https://www.gitignore.io/?templates=r,perl,linux,macos,python,windows,microsoftoffice ### Linux ### *~ @@ -71,6 +45,27 @@ Network Trash Folder Temporary Items .apdisk +### MicrosoftOffice ### +*.tmp + +# Word temporary +~$*.doc* + +# Word Auto Backup File +Backup of *.doc* + +# Excel temporary +~$*.xls* + +# Excel Backup File +*.xlk + +# PowerPoint temporary +~$*.ppt* + +# Visio autosave temporary files +*.~vsd* + ### Perl ### !Build/ .last_cover_stats @@ -165,15 +160,6 @@ coverage.xml *.mo *.pot -# Django stuff: -*.log -local_settings.py -db.sqlite3 - -# Flask stuff: -instance/ -.webassets-cache - # Scrapy stuff: .scrapy @@ -183,31 +169,22 @@ docs/_build/ # PyBuilder target/ -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - # pyenv .python-version +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. 
+#Pipfile.lock + # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - # Spyder project settings .spyderproject .spyproject @@ -215,6 +192,11 @@ venv.bak/ # Rope project settings .ropeproject +# Mr Developer +.mr.developer.cfg +.project +.pydevproject + # mkdocs documentation /site @@ -226,9 +208,6 @@ dmypy.json # Pyre type checker .pyre/ -### Python Patch ### -.venv/ - ### R ### # History files .Rhistory @@ -237,6 +216,9 @@ dmypy.json # Session Data files .RData +# User-specific files +.Ruserdata + # Example code in package build process *-Ex.R @@ -257,7 +239,7 @@ vignettes/*.pdf .httr-oauth # knitr and R markdown default cache directories -/*_cache/ +*_cache/ /cache/ # Temporary files created by R markdown @@ -271,6 +253,7 @@ vignettes/*.pdf ### Windows ### # Windows thumbnail cache files Thumbs.db +Thumbs.db:encryptable ehthumbs.db ehthumbs_vista.db @@ -293,11 +276,11 @@ $RECYCLE.BIN/ # Windows shortcuts *.lnk -# End of https://www.gitignore.io/api/r,perl,macos,linux,python,windows +# End of https://www.gitignore.io/api/r,perl,linux,macos,python,windows,microsoftoffice # nextflow analysis folders/files /test_data/* -/workflow/docker/images/* +!/test_data/createTestData.sh /workflow/.nextflow/* /workflow/work/* /workflow/output/* @@ -314,6 +297,8 @@ timeline*.html* *.tmp *.swp *.out +*_studyRID.json +*_studyRID.csv run*.sh !.gitkeep diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..dc2eab106b9fff3a2d3a00dc5bde49c81046f5b2 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,185 @@ +before_script: + - module add python/3.6.4-anaconda + - pip install --user pytest-pythonpath==0.7.1 pytest-cov==2.5.1 + - module load singularity/3.5.3 + - module load nextflow/20.01.0 + - ln -sfn /project/BICF/BICF_Core/shared/gudmap/test_data/* ./test_data/ + - mkdir -p ~/.deriva + - mkdir -p ~/.bdbag + +stages: + - unit + - integration + - consistency + +getBag: + stage: unit + script: + - ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json + - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' deriva-download-cli dev.gudmap.org --catalog 2 ./workflow/conf/replicate_export_config.json . 
rid=Q-Y5F6 + - pytest -m getBag + +getData: + stage: unit + script: + - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt + - unzip ./test_data/bagit/Replicate_Q-Y5F6.zip + - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bash ./workflow/scripts/bdbagFetch.sh Replicate_Q-Y5F6 Replicate_Q-Y5F6 TEST + - pytest -m getData + +parseMetadata: + stage: unit + script: + - rep=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p repRID) + - exp=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p expRID) + - study=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p studyRID) + - endsMeta=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsMeta) + - endsManual=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsManual) + - stranded=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p stranded) + - spike=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p spike) + - species=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p species) + - readLength=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.stageNew.csv" -p readLength) + - echo -e "${endsMeta},${endsManual},${stranded},${spike},${species},${readLength},${exp},${study},${rep}" > design.csv + - pytest -m parseMetadata + +inferMetadata: + stage: unit + script: + - > + align=$(echo $(grep "Overall alignment rate" ./test_data/meta/Q-Y5F6_1M.se.alignSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) && + if [[ ${align} == "" ]]; then exit 1; fi + - > + singularity run 'docker://bicf/rseqc3.0:2.0.1_indev' infer_experiment.py -r "/project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed" -i "./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam" 1>> Q-Y5F6_1M.se.inferMetadata.log && + ended=`singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/inferMeta.sh endness Q-Y5F6_1M.se.inferMetadata.log` && + if [[ ${ended} == "" ]]; then exit 1; fi + - pytest -m inferMetadata + +getRef: + stage: unit + script: + - mkdir -p hu + - mkdir -p mo + - cp -R /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2 ./hu/ + - cp -R /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2 ./mo/ + +trimData: + stage: unit + script: + - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --basename Q-Y5F6_1M.se -j 20 ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz + - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename Q-Y5F6_1M.pe -j 20 ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz + - readLengthSE=$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print 
length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}') + - readLengthPE=$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}') + - pytest -m trimData + +downsampleData: + stage: unit + script: + - singularity run 'docker://bicf/seqtk:2.0.1_indev' seqtk sample -s100 ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz 1000 1> sampled.1.fq + - pytest -m downsampleData + +alignData: + stage: unit + script: + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.se.unal.gz -S Q-Y5F6_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz --summary-file Q-Y5F6_1M.se.alignSummary.txt --new-summary + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.bam + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.sorted.bam.bai + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.pe.unal.gz -S Q-Y5F6_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./test_data/fastq/small/Q-Y5F6_1M.pe_R1_val_1.fq.gz -2 ./test_data/fastq/small/Q-Y5F6_1M.pe_R2_val_2.fq.gz --summary-file Q-Y5F6_1M.pe.alignSummary.txt --new-summary + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.pe.bam Q-Y5F6_1M.pe.sam + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.bam + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools index -@ 20 -b Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.sorted.bam.bai + - pytest -m alignData + +dedupData: + stage: unit + script: + - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5F6_1M.se.sorted.bam O=Q-Y5F6_1M.se.deduped.bam M=Q-Y5F6_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true + - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam ./test_data/bam/small/Q-Y5F6_1M.se.deduped.bam + - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools index -@ 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.sorted.deduped.bam.bai + - > + for i in {"chr8","chr4","chrY"}; do + echo "samtools view -b Q-Y5F6_1M.se.sorted.deduped.bam ${i} > Q-Y5F6_1M.se.sorted.deduped.${i}.bam; samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.deduped.${i}.bam Q-Y5F6_1M.se.sorted.deduped.${i}.bam.bai;"; + done | singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' parallel -j 20 -k + - pytest -m dedupData + +countData: + stage: unit + script: + - singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' -o Q-Y5F6_1M.se.featureCounts -s 1 -R SAM --primary --ignoreDup 
./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam + - singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count ./test_data/counts/small/Q-Y5F6_1M.se.featureCounts + - assignedReads=$(grep -m 1 'Assigned' *.summary | grep -oe '\([0-9.]*\)') + - pytest -m makeFeatureCounts + +makeBigWig: + stage: unit + script: + - singularity run 'docker://bicf/deeptools3.3:2.0.1_indev' bamCoverage -p 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -o Q-Y5F6_1M.se.bw + - pytest -m makeBigWig + +fastqc: + stage: unit + script: + - singularity run 'docker://bicf/fastqc:2.0.1_indev' fastqc ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o . + - pytest -m fastqc + +dataQC: + stage: unit + script: + - echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > Q-Y5F6_1M.se.sorted.deduped.tin.xls + - for i in {"chr8","chr4","chrY"}; do + echo "tin.py -i ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.${i}.bam -r /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed; cat Q-Y5F6_1M.se.sorted.deduped.${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \"\\t${i}\\t\";"; done | singularity run 'docker://bicf/rseqc3.0:2.0.1_indev' parallel -j 20 -k >> Q-Y5F6_1M.se.sorted.deduped.tin.xls + - pytest -m dataQC + + +integration_se: + stage: integration + script: + - hostname + - ulimit -a + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 16-1ZX4 -with-dag dag.png --ci true + - find . -type f -name "multiqc_data.json" -exec cp {} ./SE_multiqc_data.json \; + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - output/qc/ + - SE_multiqc_data.json + expire_in: 7 days + +integration_pe: + stage: integration + script: + - hostname + - ulimit -a + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5JA -with-dag dag.png --ci true + - find . 
-type f -name "multiqc_data.json" -exec cp {} ./PE_multiqc_data.json \;
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - dag.png
+      - output/qc/
+      - PE_multiqc_data.json
+    expire_in: 7 days
+
+consistency:
+  stage: consistency
+  script:
+  - grep -m 1 \"Assigned\":.[0-9] SE_multiqc_data.json | grep -oe '\([0-9.]*\)' > assignedSE.txt
+  - grep -m 1 \"Assigned\":.[0-9] PE_multiqc_data.json | grep -oe '\([0-9.]*\)' > assignedPE.txt
+  - echo 7742416 > assignedExpectSE.txt
+  - echo 2599140 > assignedExpectPE.txt
+  - pytest -m consistencySE
+  - pytest -m consistencyPE
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - SE_multiqc_data.json
+      - PE_multiqc_data.json
+      - assignedSE.txt
+      - assignedPE.txt
+      - assignedExpectSE.txt
+      - assignedExpectPE.txt
+    expire_in: 7 days
+
diff --git a/.gitlab/issue_templates/Bug.md b/.gitlab/issue_templates/Bug.md
new file mode 100644
index 0000000000000000000000000000000000000000..3af88f7da9144a3bffffeea91ce7d452cd5d8023
--- /dev/null
+++ b/.gitlab/issue_templates/Bug.md
@@ -0,0 +1,21 @@
+# Summary
+
+
+# Steps to reproduce
+
+
+# Observed bug behavior
+
+
+# Expected behavior
+
+
+# Relevant logs and/or screenshots
+
+
+# Potential fixes
+
+
+
+/label ~bug ~"To Do"
+/cc @ghenry @venkat.malladi @s181706
\ No newline at end of file
diff --git a/.gitlab/merge_request_templates/Merge_Request.md b/.gitlab/merge_request_templates/Merge_Request.md
new file mode 100644
index 0000000000000000000000000000000000000000..88c50aa2a683175a6e6635283724b11ad308e6e5
--- /dev/null
+++ b/.gitlab/merge_request_templates/Merge_Request.md
@@ -0,0 +1,20 @@
+Please fill in the appropriate checklist below (delete those which are not relevant).
+These are the most common things requested on pull requests.
+
+## PR checklist
+ - [ ] This comment contains a description of changes (with reason)
+ - [ ] If you've fixed a bug or added code that should be tested, add tests!
+ - [ ] Documentation in `docs` is updated
+ - [ ] Replace dag.png with the most recent CI pipeline integration_pe artifact
+ - [ ] `CHANGELOG.md` is updated
+ - [ ] `README.md` is updated
+ - [ ] `LICENSE.md` is updated with new contributors
+ - [ ] Docker images moved to production release and changed in pipeline
+ - [ ] Docker images used in the CI unit tests match those used in pipeline
+
+
+* [ ] **Close issue**\
+Closes #
+
+/cc @ghenry @venkat.malladi
+/assign @ghenry
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8ff0911406a181f08e86e495c7530f87d6f43dae..904efcc5a2f1a025e82fa1a65d11d33f573809d6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,14 @@
-# v0.0.1 (in development)
+# v0.0.2 (in development)
 **User Facing**
 
 **Background**
 
 *Known Bugs*
+
+<hr>
+
+# v0.0.1
+**INITIAL BETA VERSION**\
+Does not include automatic data upload\
+This version is for initial upload of test data to GUDMAP/RBK data-hub for internal integration
+<hr>
diff --git a/LICENSE b/LICENSE
index e4cf85dd516f931238ad4ff60f25ea8fa31a5256..af5e9c54a9121cc97781032e2ddbc58d59947ec0 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1 +1,25 @@
-NOT YET LICENSED
+MIT License
+
+Copyright (c) 2019 University of Texas Southwestern Medical Center.
+
+Contributors: Gervaise H. Henry, Jon Gesell, Jeremy Mathews, and Venkat Malladi
+
+Department: Bioinformatic Core Facility, Department of Bioinformatics
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
index abbd229ea18b8093a3e35ccaac9f1426ad09930e..23b24d840c661ec6f8c1694629b9024bf9b07caa 100644
--- a/README.md
+++ b/README.md
@@ -1,33 +1,118 @@
 |*master*|*develop*|
 |:-:|:-:|
-|[](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/commits/master)|[](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/commits/develop)|
-
+|[](https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/commits/master)|[](https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/commits/develop)|
+<!--
 [![DOI]()]()
-
-GUDMAP/RBK RNA-Seq Pipeline
-===========================
+-->
+RNA-Seq Analytic Pipeline for GUDMAP/RBK
+========================================
 
 Introduction
 ------------
+This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub. It is designed to run on the HPC cluster ([BioHPC](https://portal.biohpc.swmed.edu)) at UT Southwestern Medical Center, in conjunction with the standard nextflow profile (config `biohpc.config`).
+
+
+Cloud Compatibility:
+--------------------
+This pipeline is also capable of being run on AWS. To do so:
+* Build an AWS Batch queue and environment either manually or with [aws-cloudformation](https://console.aws.amazon.com/cloudformation/home?#/stacks/new?stackName=Nextflow&templateURL=https://s3.amazonaws.com/aws-genomics-workflows/templates/nextflow/nextflow-aio.template.yaml)
+* Edit one of the aws configs in workflow/conf/
+  * Replace workDir with the S3 bucket generated
+  * Change region if different
+  * Change queue to the AWS Batch queue generated
+* The user must have awscli configured with an appropriate authentication (with `aws configure` and access keys) in the environment in which nextflow will be run
+* Add `-profile` with the name of the AWS config which was customized (an example invocation is shown just before the DAG section below)
 
 To Run:
 -------
-  * Available parameters:
-* FULL EXAMPLE:
-  ```
-  nextflow run workflow/main.nf
-  ```
-* Design example:
+ * `--deriva` active **credential.json** file from [deriva-auth](https://github.com/informatics-isi-edu/gudmap-rbk/wiki/Uploading-files-via-Deriva-client-tools#from-a-remote-server)
+ * `--bdbag` active **cookies.txt** file from [deriva-auth](https://github.com/informatics-isi-edu/gudmap-rbk/wiki/Uploading-files-via-Deriva-client-tools#from-a-remote-server)
+ * `--repRID` mRNA-seq replicate RID
+ * `--source` consortium server source
+   * **dev** = [dev.gudmap.org](dev.gudmap.org) (default, does not contain all data)
+   * **staging** = [staging.gudmap.org](staging.gudmap.org) (does not contain all data)
+   * **production** = [www.gudmap.org](www.gudmap.org) (***does contain all data***)
+ * `--refMoVersion` mouse reference version ***(optional)***
+ * `--refHuVersion` human reference version ***(optional)***
+ * `--refERCCVersion` ERCC reference version ***(optional)***
+ * `-profile` config profile to use ***(optional)***:
+   * default = process on BioHPC cluster
+   * **biohpc** = process on BioHPC cluster
+   * **biohpc_max** = process on high power BioHPC cluster nodes (>= 128GB nodes), for resource testing
+   * **aws_ondemand** = AWS Batch on-demand instance requests
+   * **aws_spot** = AWS Batch spot instance requests
+* NOTES:
+  * once deriva-auth is run and authenticated, the two files above are saved in ```~/.deriva/``` (see official documents from [deriva](https://github.com/informatics-isi-edu/deriva-client#installer-packages-for-windows-and-macosx) on the lifetime of the credentials)
+  * reference version consists of Genome Reference Consortium version, patch release and GENCODE annotation release # (leaving the params blank will use the default version tied to the pipeline version)
+    * *current mouse* **38.p6.vM22** = GRCm38.p6 with GENCODE annotation release M22
+    * *current human* **38.p12.v31** = GRCh38.p12 with GENCODE annotation release 31
+* Tracking parameters ([Tracking Site](http://bicf.pipeline.tracker.s3-website-us-east-1.amazonaws.com/)):
+  * `--ci` boolean (default = false)
+  * `--dev` boolean (default = false)
+
+FULL EXAMPLE:
+-------------
+```
+nextflow run workflow/rna-seq.nf --deriva ./data/credential.json --bdbag ./data/cookies.txt --repRID Q-Y5JA
+```
+
+To run a set of replicates from study RID:
+------------------------------------------
+Run in repo root dir:
+* `sh workflow/scripts/splitStudy.sh [studyRID]`
+It will run in parallel in batches of 5 replicate RIDs (a sketch of this batching pattern follows below)
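The batching described above can be approximated with plain shell. A minimal, hypothetical sketch (not the actual contents of `splitStudy.sh`): it assumes `[studyRID]_studyRID.csv` lists one replicate RID per line in its first column (these `*_studyRID.csv` files are the ones removed by `cleanup.sh` and ignored in `.gitignore`); the flags are the pipeline's real parameters.

```
#!/bin/bash
# Hypothetical batching sketch: launch the pipeline for every replicate in a
# study, keeping at most 5 nextflow runs in flight at a time.
studyRID="$1"
cut -d ',' -f 1 "${studyRID}_studyRID.csv" | xargs -P 5 -I {} \
  nextflow -q run workflow/rna-seq.nf --deriva ./data/credential.json --bdbag ./data/cookies.txt --repRID {}
```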
+
+
+<hr>
 [**CHANGELOG**](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/blob/develop/CHANGELOG.md)
+<hr>
 
 Credits
--------
-This worklow is was developed by [Bioinformatic Core Facility (BICF), Department of Bioinformatics](http://www.utsouthwestern.edu/labs/bioinformatics/)
+=======
+This workflow was developed by [Bioinformatic Core Facility (BICF), Department of Bioinformatics](http://www.utsouthwestern.edu/labs/bioinformatics/)
+
+PI
+--
+Venkat S. Malladi\
+*Faculty Associate & Director*\
+Bioinformatics Core Facility\
+UT Southwestern Medical Center\
+<a href="https://orcid.org/0000-0002-0144-0564" target="orcid.widget" rel="noopener noreferrer" style="vertical-align:top;"><img src="https://orcid.org/sites/default/files/images/orcid_16x16.png" style="width:1em;margin-right:.5em;" alt="ORCID iD icon">orcid.org/0000-0002-0144-0564</a>\
+[venkat.malladi@utsouthwestern.edu](mailto:venkat.malladi@utsouthwestern.edu)
+
+
+Developers
+----------
+Gervaise H. Henry\
+*Computational Biologist*\
+Department of Urology\
+UT Southwestern Medical Center\
+<a href="https://orcid.org/0000-0001-7772-9578" target="orcid.widget" rel="noopener noreferrer" style="vertical-align:top;"><img src="https://orcid.org/sites/default/files/images/orcid_16x16.png" style="width:1em;margin-right:.5em;" alt="ORCID iD icon">orcid.org/0000-0001-7772-9578</a>\
+[gervaise.henry@utsouthwestern.edu](mailto:gervaise.henry@utsouthwestern.edu)
+
+Jonathan Gesell\
+*Computational Biologist*\
+Bioinformatics Core Facility\
+UT Southwestern Medical Center\
+<a href="https://orcid.org/0000-0001-5902-3299" target="orcid.widget" rel="noopener noreferrer" style="vertical-align:top;"><img src="https://orcid.org/sites/default/files/images/orcid_16x16.png" style="width:1em;margin-right:.5em;" alt="ORCID iD icon">orcid.org/0000-0001-5902-3299</a>\
+[jonathan.gesell@utsouthwestern.edu](mailto:jonathan.gesell@utsouthwestern.edu)
+
+Jeremy A. Mathews\
+*Computational Intern*\
+Bioinformatics Core Facility\
+UT Southwestern Medical Center\
+<a href="https://orcid.org/0000-0002-2931-1430" target="orcid.widget" rel="noopener noreferrer" style="vertical-align:top;"><img src="https://orcid.org/sites/default/files/images/orcid_16x16.png" style="width:1em;margin-right:.5em;" alt="ORCID iD icon">orcid.org/0000-0002-2931-1430</a>\
+[jeremy.mathews@utsouthwestern.edu](mailto:jeremy.mathews@utsouthwestern.edu)
 
 Please cite in publications: Pipeline was developed by BICF from funding provided by **Cancer Prevention and Research Institute of Texas (RP150596)**.
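For the AWS route described under Cloud Compatibility, a launch might look like the following. This is a hedged sketch, not a documented invocation: it assumes `workflow/conf/aws.config` has already been edited as described (S3 workDir, region, Batch queue) and that valid access keys are available; the profile names and parameters are the ones defined in `workflow/nextflow.config`.

```
# One-time: register AWS access keys in the environment that will launch nextflow.
aws configure

# Launch on AWS Batch using the customized on-demand profile.
nextflow run workflow/rna-seq.nf -profile aws_ondemand \
  --deriva ./data/credential.json --bdbag ./data/cookies.txt --repRID Q-Y5JA
```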
+ +<hr> +<hr> + +Pipeline Directed Acyclic Graph +------------------------------- + + diff --git a/cleanup.sh b/cleanup.sh index 9569ff54fd71cd94bddde415af03a101820ab514..aa289201c531fa4f4667a04f80fd015d2200e40c 100644 --- a/cleanup.sh +++ b/cleanup.sh @@ -5,3 +5,5 @@ rm timeline*.html* rm .nextflow*.log* rm -r .nextflow/ rm -r work/ +rm *_studyRID.json +rm *_studyRID.csv diff --git a/docs/GUDMAP.RBK Pipeline.docx b/docs/GUDMAP.RBK Pipeline.docx deleted file mode 100755 index deae8a8fbfb7adc32ba2fba03a25eca6af57b4d7..0000000000000000000000000000000000000000 Binary files a/docs/GUDMAP.RBK Pipeline.docx and /dev/null differ diff --git a/docs/RNA-Seq Pipeline Design Flowchart.drawio b/docs/RNA-Seq Pipeline Design Flowchart.drawio new file mode 100644 index 0000000000000000000000000000000000000000..7b65c655592950fdb14c7a8b7c74a4f986816669 --- /dev/null +++ b/docs/RNA-Seq Pipeline Design Flowchart.drawio @@ -0,0 +1 @@ +<mxfile host="Electron" modified="2020-03-23T23:17:50.947Z" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) draw.io/12.6.5 Chrome/80.0.3987.86 Electron/8.0.0 Safari/537.36" etag="EoueA3AcDhzhFmxKZ9Lg" version="12.6.5" type="device"><diagram name="Page-1" id="74e2e168-ea6b-b213-b513-2b3c1d86103e">7V1Zd5u6Fv41Xuveh2QxD49Nk7S9pzlNTpq0PS9dMsg2CQYXcBP3118xCDMIAWZ2nJcYIWGs/e1J2ntrxr9fv35wwGZ1Y+vQnHGM/jrjL2ccJzASj/75LbuwheNlOWxZOoYetrH7hnvjD4wamah1a+jQTXX0bNv0jE26UbMtC2peqg04jv2S7rawzfS3bsAS5hruNWDmW78ZurcKW3mGYfY3PkJjuYq+WsE35kB7Xjr21oq+b8bxi+AvvL0G+FlRf3cFdPsl0cRfzfj3jm174af163to+pOLpy0cd11wN35vB1pelQE3Px8/6OrD5hMrf9S/boXLT/faGX6M6+3whEAdzU90aTveyl7aFjCv9q0XwY+G/mMZdLXv89m2N6iRRY1P0PN2EbHB1rNR08pbm9Fd+Gp43xOff/iPOhejq8vX6MnBxS5xcQsdYw096OA2y3N235MX4ZM4EV/vnxVc4YeFP9r/pYWTiSfG3jpa1Ovix7+vd/zPn9zPP3f2Vn76+SIaZ6wU4d0DzhJ6lLkWYqIjboI2+iHODo1zoAk843f6TUCE6mXcLxr6znHALtFhYxuW5yaefOs3oA6vmM0Y9VyImDTiUUHOIKXCGJ4pGaOq7LnAqfGf0upwQUwORx/CecBXiQndNwX4r8ELotyYF5626w3uj/jd2Zz4gxW5PH8sdVa5+3PPvucerj5yc9eRTDgMf3CyjIDni/vorx6rlAxH2oIOe0lSKbBvOLwXruHUkwapzCHPl1/+dVcf/r57v/g4N73ftzdX5pkg5BmEyErKEAyiCLX1B2GIVCL/OVak8VHD4b0wAiZjS+rDsi34pnjj01f54euvp9fvPyF8+iOrMnv11xmLXYIEb9z9fOJvP/z1+Pjp7yX439L2ft1aZyIzG4A5WJ5TzxVpjzu2nqFFHy5KJajnGX944zH9sMfJ0+jC0xgTMzB5aFVyNbJwLBP2JAgfMqYX2GPv/wT7NmFPVBb8YKjvwcEuM3HGZSHJMpQ0VV6wqqQDReHPomn/DcxtRIgZJ5mImhcLO3jPPYNIv7Y2vnHmBhB/hzqwzOZ1fxN9Wvr/b4BhoZvvEHfsPENz8VPRC4YPDrvhZncDrOIve4nW+fyvE/wJDaHMaLZpO2Gzs5yD/wR3ODQJDPHTf+OBwVMXYG2Yu3D42rZs9A4I8Kku+19J/JEzTswuwYpo3v3WYP0wvsJ0EANKoJZL/7P/YqI/4SKib1lfNu6LufWgx3D7x4Q0j+8gwbbdBDfmYL3vFZIk7uV6jv0crcj6jX6nSMT5l2xw6a7AJhyzfl36k3O+MO0XbQUc71y3te0avX448GVlePA+nHjU/QV1Dp/gT7zfEsg0v2VhmOb7gN74XfiFokFNC/sHr5W5P1dEwVe5YkAp/Kuh48HXSvMa34qndC+9RJ+D9z2TRGB2iRtM8sZLPG3RzcS9VYjx+KaUuIdkXty+TLxClubBZUz4ZGMajlG/HG5jFg3ZEYM8oyfdF2NtgsALSqg2bWWY+mews7e+KnA9oD3jK0QdRPlIKQb8lVak8Qq8f2GCOTQv4jX8kKSx25WEX/i9Pp9eR5x8+QgdHVggao6+UUGXwDSWFvqsIQL7WjQBJ/838boIFV2YpYEU3FG4OS9J8TwQNGWEqH1TXtXFwj5n7LCRXxBhA7dFjavEBgf2IIgqMqEg6sl/Lif/kS6/AMuKVM9Rto15LVRS5RONF+iIk0qYU6GFOSVaRxzTfP0tuewAHK2ZyYlNydBQlEtszgL7ssy8DK6yxmqO9KGhU430XN4aJS7WiRVdMJbliTZw7+vbTEoMIDOQbiZmuqtMBqHNzELalJaLhj1qfcAlVHqk0FN4BHPXNrcefOdoEWaD1v2VkBPzXFdyBQu/ygKcT5OBFQhyhiS748YmcoZIJAznYfzYve/6I3GnTKbsxciPlBQhy5S8FGxRbJDWMWnM0J7UaEb1rtw2B25MQwOe7wf98+my0GvLQC5lEWRZN/QGkC8ArC0wP1mbwCYkiYm0IHEjs0JKywe2H96uytptWGV0TdUlkcPHzR18aw6WhleV6nlSJ3GAqU7xAMkoiOgerWb1QGqOZIN3Revc2+epjCTRJfDARE3weA88M8ltmOTRt/0DNQ9YS39dosbX8YRvk/nMtwETGa4W4o0LXxa5OcrWs6sKIi+kQXX2YI7AQZoc2zdlmlyqqMh5tm1NfgT2fwFOmwdenXB6ME65I8ApL/fgp0qVFehkHVVsI0zYUcV4bt2aXQDX+9Wxybremp5xhg3XAux0Qsfs1niejo
rSjaFK1Z6tUxG9HtADnn0ThCS5HF0RMvfyQo6G6Le68CYmwZT9Dj6rsQbfCmDFQUNxe7GjDjKZcGBmmclUNbyKk4/BZBLaNZmI0U/88XugRCh0B0jyNA+TOtHMtVR6MNnlmhposoY7VrXTMNypUpVg8m3Nww0+/AzTSJqAV8GaW2wC7m+Su997DrD0OgM2xjOs1R9qBoxfab9QTn0GavSnpsB0RUjwcriK4luSIIyaEsgmAd+HlqEB810U4bI2dD0Q1za64xu+qG2F2qA1i4Jrbm3X8Aw7FQ6Dn/I50yF+Gg6gMeEiitfTDGuJGs4CdkqE27BcP6yBIytLWUNpgTMo4r2QM4pw1ZLD5O+mZdiFFFk5LYBmcVWAkSrQan33pw0HIfeq+SALzzHW09/+EZTRuWESd/RGb3vL2WJFU1jJm8LUZMpJ+2Ztb7vczXcPmug+Wupy/rz6fPttd2Wcyc3zFccO0wOshQZ4JM/yMYQB8lIPvppSXUNN1k3DWngabhpVwLa+Mu9Tew31vAs0pY2X+gQebr2eKq8G8TV+acFY4B2Xs9GDYCBt2PXnXBB3xv+Bizx3Tsm1kITxuRbHv8PTnmuB9x5KE4wJsdn0TOShjTmunjGX6d5LTFd+/75AJEzWlsNCr8aO8YC2XAGgp5PVQa1O0EOmB0GakO1jpqo0aT1AtJldz3Rk1ztwAdG8BinvS2ih1y227tox5v2tvTlw4eFmfG3eFtO8TWBtkqXQhhVP11StU3PlJ9RfLyzklF8vPSTOr+d7n23kzll9iZ0pxzdkMFWeoMFW2fRX8OOCuuMxs+XTCn51xYiD98sUo5rXi8QqPtI4bOzuFvBzg7MbhWoG2OGURYMy2G7DVlfaLXw7jRIC3SKbGOGijsR9HP9egFpD2U3WgcQafRqbAVREn1aLe10tboqcXleLaUq+dfckKCl2PQfGeFySukFFMp2Sw+0XXRiG96rc3jnr/ylPf4wvW12QCHX9gvpu03dL1H7zyit8HTGvXBwgr1wWc6Q9fnuR6DDlgUdjktROBKkfnzckqVWxJ21ITsxHUt4g5ruHN7kiDcMdAb7bdpRoU1pJ/U7FUaLaGRN2lmJYn7ylLr2lTuAzvMfUWSm26blMxRQeo9tElgV5UbAGz/DCWH4zploTF99lu8iML/acFDYj6XmlIq9ynfpO4O9b8GDdws3u+9f5w43woG7WcebvG9tYyuOMyhVJS5LYkVDciNhPOQI7svVkEMqEVhNHUzEk6ZJ3wpZkPlWiJVMgoveIynJmcBAdxUDAQXQIQ3846LWaJ/EnEIokaYg1j6GoJzdkWif5nMvmi54TVt3U+iZUcJbqcpZw/CW5o3hS3tQZrCQHJq28+WNYBsqn4rejvQOaz7gLb4McekZzf78NRd4YE70qcuLJ13k9vt2YNtCPQJELQyZRFR8zXnWypyItqbCahrAkWl2D5ruN+fTMItYsN7dYwjZccb3jvs0tVkxjkGdTBlRpf1bp4VDXeJ3qBMsDYEkt+FuKyoLwqG5RybP1UJnt3w8qB03mO05UchVRqYwClWo9VAoCvT8+xvjQ/v2gvnlgzgn15BIvpahXh0C9lN2aL5HF2f5lKM7uJ9bt3wvqsRt1Qv0hqK9+zvx4YC+r9WCf7d8PLE+GcQNY0jIlUuUECiMB+walWtNby/bvB5Qnu7htUJJK5hTHFfS+hsDWdNdyA/rB5aDBQG8bl+wgJyY0B6bAlAzg1IYDeoE+P+iJf7WgPzDMqQXhx7kskV0G4EsgOMiygdBuzYMosv0Ny2PqXncSqMWbcr2LY6HmUkJuQD9QbTf1zF0BZ3PCalFQ5UihymWFahlUcwOkMmxnfbm6A9pmBgNou49fXu7ZL+wXV/vr31fWuCWESsQFo1MsUhA5I1LPCKoaf0N+yq2xgaZh+akJl9D1c5c45hpHy1BP9ilIw5qXJVwlYw+iZKncSTu5LCoHvdsfMA8e5fNUBD30XPFiJl76z0KM7+4DGPI8VjuwgMtEgRAyNOJqu8nAguyp6a2V+FKKq5i7G2BVAQE5COvufYKA4aOKoq6IcTzayjD1z2Bnb/0pcD2gPeOrfIxPWsgjwa4HyXH+RXAC0wUavQw6ZTLw0kEobPTjrsHaMH0KPUJHBxaYpU5QUEgoayN2pX5QkcKq54rExH9sWg4R4rlUYoxRGwFdRJUtEcpi2C+WC9YbEw4R1VWWUVfMI3kqjCYk+3TQTrpMH9UuouK01OOUCEFD5I5TPIpypJUoqJSoLlrGHMNYIpcooC3UDuMLYpS6Mjb2hNdRn+w5KXTrY+gk7oNIO1wWNzn2JEfXwEC6Pwo1jx/TRR2sZplYJ63fTOtXVfqEVWYy155UfqcqP68+ysTMsan8yWn8QoX/1mq1HELO4bQ8eVM5R0zDWkDnpugQ+enpeL7fii1qlYotMuH7ui3YQjY2Bg0v7sXaqGBG0PiiNCij6tIBN0hqR8MifwzFjjgAg7QJJAmgY9P+xUCbkPbn6FXRD1X++cqAAQyuAslHqd2X6X/vOcDSa43YGM+w3gCoGZD+VrOxVhQMdk5ubdfwDJu4o/Y50yF+Gt4dMeEiWtvRDGuJGs4CJiKfRt05Q1ROb1da4Aei9ZG3hoOlKq1vy6mimVRnK0rook7NcVhOx284HVovh8YkpWkqhDAVoow4tn2YA2BJm79yYTRmK6oqmiZS+qEYwB2soWByT2QBpZi8Y1xAIUbWCVyOlJ5hHYH27+SEcIr2Vypof7F37U+cTK6F7JbpHZhAtAhywAyDlojApPJPaQgrpn15+gu2I0+VccuEF57TEuE1FWuBLqCnYS7QIZ0lFONqtgPdATR7B/OORwyg2slJ80Lenj4Kx76TU4mPRbWfzv86QLXT+ae8PIVIyG0t6MmdlHtVKEsnKI8ayvwRQJmXe4ByPKl0TTwVO7XE2piGoUoXzxlCMU+uTVgUmUpt5o4I1mtxZroMSh6O4jsBfqrXoD5FR1NOOuus17DqFpTykSY3U52b8iSevNYtjjfpW8Xy+HTEignO2f69pOKLzUtJvaFU/Mpope3/VLARBzEJJb4eXrP9ceRAYf9Mdn3d/v3wQ7uL/Cd+mFVY4BwrR8hqPY7I9u8Hse2uXfWN2EPRqQN3FXt9nRsaHKGS6piQKsgZpHJ0pGb7H4Xsltpd+jrJbtpcV02lG6a4kFhPcGf791MycDp13lsR0iTeqoDTqrmdJBE9nmLXfqTrucDvK7zwteRpyXCurLIVw6nnAqfGf0qbw3mhj5pxzQ9OegtmThNuGVFd2YHZRWQasQt9eC/sgu2vN8Yuh3EIEfjqyE1+lpVoNcPEkkM+SoZLJcM5IRjeeEw/zIBpfmKGA5mBxWvTo+UGnlpBr5Qb6MOPixu4EzdU54b57kET3UdLXc6fV59vv+2ujDOZkAA+JmZQBN/6qcUAhCGVAFxsZDUc3g8rTHvFaECngqglxr5KKslqji9KFkoJQ0qBLfE056Ph8IZ8gS4d288e23f3QzdubB36Pf4P</diagram></mxfile> \ No newline 
at end of file
diff --git a/docs/RNA-Seq Pipeline Design Flowchart.jpg b/docs/RNA-Seq Pipeline Design Flowchart.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..a56462a2991aecb22d5bbff493c18e3f673f2b0f
Binary files /dev/null and b/docs/RNA-Seq Pipeline Design Flowchart.jpg differ
diff --git a/docs/RNA-Seq Pipeline Design Process Table.docx b/docs/RNA-Seq Pipeline Design Process Table.docx
new file mode 100644
index 0000000000000000000000000000000000000000..21604d8f30662ffd93f8d0605b671a0921864b0c
Binary files /dev/null and b/docs/RNA-Seq Pipeline Design Process Table.docx differ
diff --git a/docs/RNA-Seq Pipeline Design Process Table.pdf b/docs/RNA-Seq Pipeline Design Process Table.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..97f1d5ddfb0ae0848aa0bf8b37681db9efeb3c6b
Binary files /dev/null and b/docs/RNA-Seq Pipeline Design Process Table.pdf differ
diff --git a/docs/dag.png b/docs/dag.png
new file mode 100644
index 0000000000000000000000000000000000000000..8c4896f2f0f6d2c765d6b020cddb4cda23064c97
Binary files /dev/null and b/docs/dag.png differ
diff --git a/docs/gudmap-rbk_logo.png b/docs/gudmap-rbk_logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..840030e2dac96bff2d41805d6697abab29c3e5bb
Binary files /dev/null and b/docs/gudmap-rbk_logo.png differ
diff --git a/nextflow.config b/nextflow.config
deleted file mode 100644
index 28777047bfa85b13d08a0df02a22e6eac6d66540..0000000000000000000000000000000000000000
--- a/nextflow.config
+++ /dev/null
@@ -1,5 +0,0 @@
-profiles {
-  standard {
-    includeConfig 'workflow/conf/biohpc.config'
-  }
-}
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000000000000000000000000000000000000..c8d98f29942ac24e83b741a0e102714b44230eab
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,2 @@
+[pytest]
+python_paths = workflow/scripts
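The two-line pytest.ini above is what lets the CI's `pytest -m <marker>` calls find the pipeline's helper scripts: pytest-pythonpath puts `workflow/scripts` on the test import path, and each unit job selects its stage's tests by marker. A hedged local reproduction of one such job, using only commands that already appear in `.gitlab-ci.yml` (the pinned versions come from its `before_script`):

```
# Install the CI's test dependencies, then run just one stage's marked tests.
pip install --user pytest-pythonpath==0.7.1 pytest-cov==2.5.1
pytest -m parseMetadata
```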
diff --git a/test_data/createTestData.sh b/test_data/createTestData.sh
new file mode 100644
index 0000000000000000000000000000000000000000..47c73d46fba5510f64fe5f9bb5c540eb4ad618cc
--- /dev/null
+++ b/test_data/createTestData.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+
+#This script regenerates test data from replicate RID Q-Y5F6
+
+module load singularity/3.5.3
+module load pigz/2.4
+
+mkdir -p NEW_test_data
+
+ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json
+
+mkdir -p ./NEW_test_data/bagit
+singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' deriva-download-cli dev.gudmap.org --catalog 2 ./workflow/conf/replicate_export_config.json . rid=Q-Y5F6
+cp Replicate_Q-Y5F6.zip ./NEW_test_data/bagit/Replicate_Q-Y5F6.zip
+
+mkdir -p ./NEW_test_data/fastq
+unzip ./test_data/bagit/Replicate_Q-Y5F6.zip
+singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' bash ./workflow/scripts/bdbagFetch.sh Replicate_Q-Y5F6 Replicate_Q-Y5F6
+cp Replicate_Q-Y5F6.R1.fastq.gz ./NEW_test_data/fastq/Replicate_Q-Y5F6.R1.fastq.gz
+cp Replicate_Q-Y5F6.R2.fastq.gz ./NEW_test_data/fastq/Replicate_Q-Y5F6.R2.fastq.gz
+
+mkdir -p ./NEW_test_data/fastq/small
+singularity exec 'docker://bicf/seqtk:2.0.0' seqtk sample -s100 ./NEW_test_data/fastq/Replicate_Q-Y5F6.R1.fastq.gz 1000000 1> Q-Y5F6_1M.R1.fastq
+singularity exec 'docker://bicf/seqtk:2.0.0' seqtk sample -s100 ./NEW_test_data/fastq/Replicate_Q-Y5F6.R2.fastq.gz 1000000 1> Q-Y5F6_1M.R2.fastq
+pigz Q-Y5F6_1M.R1.fastq
+pigz Q-Y5F6_1M.R2.fastq
+cp Q-Y5F6_1M.R1.fastq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
+cp Q-Y5F6_1M.R2.fastq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
+
+mkdir -p ./NEW_test_data/meta
+singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --basename Q-Y5F6_1M.se -j 20 ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
+singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename Q-Y5F6_1M.pe -j 20 ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
+cp Q-Y5F6_1M.se_trimmed.fq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz
+cp Q-Y5F6_1M.pe_R1_val_1.fq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_R1_val_1.fq.gz
+cp Q-Y5F6_1M.pe_R2_val_2.fq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_R2_val_2.fq.gz
+cp Q-Y5F6_1M.R1.fastq.gz_trimming_report.txt ./NEW_test_data/meta/Q-Y5F6_1M.R1.fastq.gz_trimming_report.txt
+cp Q-Y5F6_1M.R2.fastq.gz_trimming_report.txt ./NEW_test_data/meta/Q-Y5F6_1M.R2.fastq.gz_trimming_report.txt
+
+touch metaTest.csv
+echo 'Replicate_RID,Experiment_RID,Study_RID,Paired_End,File_Type,Has_Strand_Specific_Information,Used_Spike_Ins,Species' > metaTest.csv
+echo 'Replicate_RID,Experiment_RID,Study_RID,uk,FastQ,no,no,Homo sapiens' >> metaTest.csv
+cp metaTest.csv ./NEW_test_data/meta/metaTest.csv
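The two `echo` lines above fabricate the minimal metadata sheet that the parseMetadata unit tests read. For reference, this is how the CI exercises `parseMeta.py` against it, taken verbatim in form from the parseMetadata job earlier in this diff:

```
# Pull a single attribute out of the test metadata sheet, exactly as the CI does.
singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py \
  -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p species
```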
+
+mkdir -p ./NEW_test_data/bam
+mkdir -p ./NEW_test_data/bam/small
+singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.se.unal.gz -S Q-Y5F6_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./NEW_test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz --summary-file Q-Y5F6_1M.se.alignSummary.txt --new-summary
+singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam
+singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.bam
+singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.sorted.bam.bai
+singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.pe.unal.gz -S Q-Y5F6_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_R1_val_1.fq.gz -2 ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_R2_val_2.fq.gz --summary-file Q-Y5F6_1M.pe.alignSummary.txt --new-summary
+singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.pe.bam Q-Y5F6_1M.pe.sam
+singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.bam
+singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ 20 -b Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.sorted.bam.bai
+cp Q-Y5F6_1M.se.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.bam
+cp Q-Y5F6_1M.pe.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.pe.bam
+cp Q-Y5F6_1M.se.sorted.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.bam
+cp Q-Y5F6_1M.se.sorted.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.bam.bai
+cp Q-Y5F6_1M.pe.sorted.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.pe.sorted.bam
+cp Q-Y5F6_1M.pe.sorted.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M.pe.sorted.bam.bai
+cp Q-Y5F6_1M.se.alignSummary.txt ./NEW_test_data/meta/Q-Y5F6_1M.se.alignSummary.txt
+cp Q-Y5F6_1M.pe.alignSummary.txt ./NEW_test_data/meta/Q-Y5F6_1M.pe.alignSummary.txt
+
+singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.bam O=Q-Y5F6_1M.se.deduped.bam M=Q-Y5F6_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true
+singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.deduped.bam
+singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.sorted.deduped.bam.bai
+cp Q-Y5F6_1M.se.deduped.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.deduped.bam
+cp Q-Y5F6_1M.se.sorted.deduped.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam
+cp Q-Y5F6_1M.se.sorted.deduped.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam.bai
+cp Q-Y5F6_1M.se.deduped.Metrics.txt ./NEW_test_data/meta/Q-Y5F6_1M.se.deduped.Metrics.txt
+
+for i in {"chr8","chr4","chrY"}; do
+  echo "samtools view -b ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam ${i} > Q-Y5F6_1M.se.sorted.deduped.${i}.bam; samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.deduped.${i}.bam Q-Y5F6_1M.se.sorted.deduped.${i}.bam.bai;";
+  done | singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' parallel -j 20 -k
+cp Q-Y5F6_1M.se.sorted.deduped.chr4.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.chr4.bam
+cp Q-Y5F6_1M.se.sorted.deduped.chr4.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.chr4.bam.bai
+cp Q-Y5F6_1M.se.sorted.deduped.chr8.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.chr8.bam
+cp Q-Y5F6_1M.se.sorted.deduped.chr8.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.chr8.bam.bai
+cp Q-Y5F6_1M.se.sorted.deduped.chrY.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.chrY.bam
+cp Q-Y5F6_1M.se.sorted.deduped.chrY.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.chrY.bam.bai
+
+mkdir -p ./NEW_test_data/counts
+mkdir -p ./NEW_test_data/counts/small
+singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' -o Q-Y5F6_1M.se.featureCounts -s 1 -R SAM --primary --ignoreDup ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam
+singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count Q-Y5F6_1M.se.featureCounts
+cp Q-Y5F6_1M.se.featureCounts
./NEW_test_data/counts/small/Q-Y5F6_1M.se.featureCounts
+cp Q-Y5F6_1M.se.countTable.csv ./NEW_test_data/counts/small/Q-Y5F6_1M.se.countTable.csv
+
+mkdir -p ./NEW_test_data/bw
+mkdir -p ./NEW_test_data/bw/small
+singularity run 'docker://bicf/deeptools3.3:2.0.0' bamCoverage -p 20 -b ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -o Q-Y5F6_1M.se.bw
+cp Q-Y5F6_1M.se.bw ./NEW_test_data/bw/small/Q-Y5F6_1M.se.bw
+
+mkdir -p ./NEW_test_data/fastqc
+mkdir -p ./NEW_test_data/fastqc/small
+singularity run 'docker://bicf/fastqc:2.0.0' fastqc ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o .
+cp Q-Y5F6_1M.R1_fastqc.html ./NEW_test_data/fastqc/small/Q-Y5F6_1M.R1_fastqc.html
+cp Q-Y5F6_1M.R1_fastqc.zip ./NEW_test_data/fastqc/small/Q-Y5F6_1M.R1_fastqc.zip
+
+echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > Q-Y5F6_1M.se.sorted.deduped.tin.xls
+for i in {"chr8","chr4","chrY"}; do
+echo "tin.py -i ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.${i}.bam -r /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed; cat Q-Y5F6_1M.se.sorted.deduped.${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \"\\t${i}\\t\";"; done | singularity run 'docker://bicf/rseqc3.0:2.0.0' parallel -j 20 -k >> Q-Y5F6_1M.se.sorted.deduped.tin.xls
+cp Q-Y5F6_1M.se.sorted.deduped.tin.xls ./NEW_test_data/meta/Q-Y5F6_1M.se.sorted.deduped.tin.xls
+
diff --git a/workflow/conf/aws.config b/workflow/conf/aws.config
new file mode 100644
index 0000000000000000000000000000000000000000..9ecbfb98f593f167a35650299921adaf2fffbb42
--- /dev/null
+++ b/workflow/conf/aws.config
@@ -0,0 +1,83 @@
+workDir = 's3://gudmap-rbk.output/work'
+aws.client.storageEncryption = 'AES256'
+aws {
+  region = 'us-east-2'
+  batch {
+    cliPath = '/home/ec2-user/miniconda/bin/aws'
+  }
+}
+
+process {
+  executor = 'awsbatch'
+  cpus = 1
+  memory = '1 GB'
+
+  withName: trackStart {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName: getBag {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName: getData {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName: parseMetadata {
+    cpus = 15
+    memory = '1 GB'
+  }
+  withName: trimData {
+    cpus = 20
+    memory = '2 GB'
+  }
+  withName: getRefInfer {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName: downsampleData {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName: alignSampleData {
+    cpus = 50
+    memory = '5 GB'
+  }
+  withName: inferMetadata {
+    cpus = 5
+    memory = '1 GB'
+  }
+  withName: getRef {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName: alignData {
+    cpus = 50
+    memory = '10 GB'
+  }
+  withName: dedupData {
+    cpus = 5
+    memory = '20 GB'
+  }
+  withName: countData {
+    cpus = 2
+    memory = '5 GB'
+  }
+  withName: makeBigWig {
+    cpus = 15
+    memory = '5 GB'
+  }
+  withName: fastqc {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName: dataQC {
+    cpus = 15
+    memory = '2 GB'
+  }
+  withName: aggrQC {
+    cpus = 2
+    memory = '1 GB'
+  }
+}
diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config
index 0ea74405dd6faeb0342698df1ef8736804dda5c6..338cd5d54d4258e499a60241054c5f28109e7c19 100755
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -1,31 +1,68 @@
 process {
   executor = 'slurm'
-  queue='super'
+  queue = 'super'
+  clusterOptions = '--hold'
 
-  // Process specific configuration
-  withLabel:getData {
-    executor = 'super'
+  withName: trackStart {
+    executor = 'local'
+  }
+  withName: getBag {
+    executor = 'local'
+  }
+  withName: getData {
+    executor = 'local'
+  }
+  withName: parseMetadata {
+    executor = 'local'
+  }
+  withName: trimData {
+    queue = 'super'
+  }
+  withName: getRefInfer {
+    executor = 'local'
+  }
+  withName: downsampleData {
+    executor = 'local'
+  }
+  withName: alignSampleData {
+    queue = 'super'
+  }
+  withName: inferMetadata {
+    queue = 'super'
+  }
+  withName: getRef {
+    executor = 'local'
+  }
+  withName: alignData {
+    queue = '256GB,256GBv1'
+  }
+  withName: dedupData {
+    queue = 'super'
+  }
+  withName: countData {
+    queue = 'super'
+  }
+  withName: makeBigWig {
+    queue = 'super'
+  }
+  withName: fastqc {
+    queue = 'super'
+  }
+  withName: dataQC {
+    queue = 'super'
+  }
+  withName: aggrQC {
+    executor = 'local'
   }
 }
-
-trace {
-  enabled = true
-  file = 'pipeline_trace.txt'
-  fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss'
-}
-
-timeline {
+singularity {
   enabled = true
-  file = 'timeline.html'
+  cacheDir = '/project/BICF/BICF_Core/shared/gudmap/singularity_cache/'
 }
 
-report {
-  enabled = true
-  file = 'report.html'
+env {
+  http_proxy = 'http://proxy.swmed.edu:3128'
+  https_proxy = 'http://proxy.swmed.edu:3128'
+  all_proxy = 'http://proxy.swmed.edu:3128'
 }
-
-tower {
-  accessToken = '3ade8f325d4855434b49aa387421a44c63e3360f'
-  enabled = true
-}
\ No newline at end of file
diff --git a/workflow/conf/biohpc_max.config b/workflow/conf/biohpc_max.config
new file mode 100755
index 0000000000000000000000000000000000000000..0e93ccf6a0be4c15c076ab6eb955a4bb39d96120
--- /dev/null
+++ b/workflow/conf/biohpc_max.config
@@ -0,0 +1,16 @@
+process {
+  executor = 'slurm'
+  queue = '256GB,256GBv1,384GB,128GB'
+  clusterOptions = '--hold'
+}
+
+singularity {
+  enabled = true
+  cacheDir = '/project/BICF/BICF_Core/shared/gudmap/singularity_cache/'
+}
+
+env {
+  http_proxy = 'http://proxy.swmed.edu:3128'
+  https_proxy = 'http://proxy.swmed.edu:3128'
+  all_proxy = 'http://proxy.swmed.edu:3128'
+}
diff --git a/workflow/conf/conda.env.bdbag.yml b/workflow/conf/conda.env.bdbag.yml
deleted file mode 100644
index 33361d301b3fac561fa39807e3c740583e57d28b..0000000000000000000000000000000000000000
--- a/workflow/conf/conda.env.bdbag.yml
+++ /dev/null
@@ -1,5 +0,0 @@
-name: bdbag
-dependencies:
-  - pandas=0.23.3=py36_0
-  - pip:
-    - bdbag==1.5.5
diff --git a/workflow/conf/multiqc_config.yaml b/workflow/conf/multiqc_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..db6a335a8e5e6f8677dd8c433d391c6fd5224751
--- /dev/null
+++ b/workflow/conf/multiqc_config.yaml
@@ -0,0 +1,114 @@
+custom_logo: './bicf_logo.png'
+custom_logo_url: 'https://utsouthwestern.edu/labs/bioinformatics/'
+custom_logo_title: 'Bioinformatics Core Facility'
+
+report_header_info:
+  - Contact Email: 'bicf@utsouthwestern.edu'
+  - Application Type: 'RNA-Seq Analytic Pipeline for GUDMAP/RBK'
+  - Department: 'Bioinformatic Core Facility, Department of Bioinformatics, University of Texas Southwestern Medical Center'
+
+title: RNA-Seq Analytic Pipeline for GUDMAP/RBK
+
+report_comment: >
+  This report has been generated by the <a href="https://doi.org/10.5281/zenodo.3625056">GUDMAP/RBK RNA-Seq Pipeline</a>
+
+top_modules:
+  - fastqc:
+      name: 'Raw'
+      info: 'Replicate Raw fastq QC Results'
+  - cutadapt:
+      name: 'Trim'
+      info: 'Replicate Trim Adapter QC Results'
+  - hisat2:
+      name: 'Align'
+      info: 'Replicate Alignment QC Results'
+      path_filters:
+        - '*alignSummary*'
+  - picard:
+      name: 'Dedup'
+      info: 'Replicate Alignment Deduplication QC Results'
+  - rseqc:
+      name: 'Inner Distance'
+      info: 'Replicate Paired End Inner Distance Distribution Results'
+      path_filters:
+        - '*insertSize*'
+  - custom_content
+  - featureCounts:
+      name: 'Count'
+      info: 'Replicate Feature Count QC Results'
+  - hisat2:
+      name: 'Inference: Align'
+      info: 'Inference Alignment (1M downsampled reads) QC Results'
+      path_filters:
+        - '*alignSampleSummary*'
+  - rseqc:
+      name: 'Inference: Stranded'
+      info: '1M Downsampled Reads Strandedness Inference Results'
+      path_filters:
+        - '*infer_experiment*'
+
+report_section_order:
+  rid:
+    order: 2000
+  meta:
+    order: 1000
+
+skip_generalstats: true
+
+custom_data:
+  rid:
+    file_format: 'tsv'
+    section_name: 'RID'
+    description: 'These are the identifying RIDs'
+    plot_type: 'table'
+    pconfig:
+      id: 'rid'
+    headers:
+      Replicate RID
+      Experiment RID
+      Study RID
+  meta:
+    file_format: 'tsv'
+    section_name: 'Metadata'
+    description: 'This is the comparison of inferred, submitter-provided, and calculated metadata'
+    plot_type: 'table'
+    pconfig:
+      id: 'meta'
+      format: '{:,.0f}'
+    headers:
+      Source
+      Species
+      Ends
+      Stranded
+      Spike-in
+      Raw Reads
+      Assigned Reads
+      Median Read Length
+      Median TIN
+  tin:
+    file_format: 'tsv'
+    section_name: 'TIN'
+    description: 'This is the distribution of TIN values calculated by the tool RSeQC'
+    plot_type: 'bargraph'
+    pconfig:
+      id: 'tin'
+    headers:
+      chrom
+      0 - 9
+      10 - 19
+      20 - 29
+      30 - 39
+      40 - 49
+      50 - 59
+      60 - 69
+      70 - 79
+      80 - 89
+      90 - 99
+
+sp:
+  rid:
+    fn: 'rid.tsv'
+  meta:
+    fn: 'metadata.tsv'
+  tin:
+    fn: '*.tin.hist.tsv'
diff --git a/workflow/conf/ondemand.config b/workflow/conf/ondemand.config
new file mode 100755
index 0000000000000000000000000000000000000000..131fdbb19e1fedf1bc9e206a03d801f13791b810
--- /dev/null
+++ b/workflow/conf/ondemand.config
@@ -0,0 +1,3 @@
+process {
+  queue = 'highpriority-0ef8afb0-c7ad-11ea-b907-06c94a3c6390'
+}
diff --git a/workflow/conf/replicate_export_config.json b/workflow/conf/replicate_export_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ff17fa513c5bc130a2e2bdaf9aa41b070c99b290
--- /dev/null
+++ b/workflow/conf/replicate_export_config.json
@@ -0,0 +1,97 @@
+{
+  "bag": {
+    "bag_name": "Replicate_{rid}",
+    "bag_algorithms": [
+      "md5"
+    ],
+    "bag_archiver": "zip"
+  },
+  "catalog": {
+    "query_processors": [
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Study",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Study_RID)=(RNASeq:Study:RID)/Study_RID:=RID,Internal_ID,Title,Summary,Overall_Design,GEO_Series_Accession_ID,GEO_Platform_Accession_ID,Funding,Pubmed_ID,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Experiment",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment:RID)/Experiment_RID:=RID,Study_RID,Internal_ID,Name,Description,Experiment_Method,Sequencing_Type,Species,Specimen_Type,Molecule_Type,Pooled_Sample,Pool_Size,Markers,Cell_Count,Treatment_Protocol,Treatment_Protocol_Reference,Isolation_Protocol,Isolation_Protocol_Reference,Growth_Protocol,Growth_Protocol_Reference,Label_Protocol,Label_Protocol_Reference,Hybridization_Protocol,Hybridization_Protocol_Reference,Scan_Protocol,Scan_Protocol_Reference,Data_Processing,Value_Definition,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Experiment Antibodies",
+          "query_path": "/entity/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Antibodies:Experiment_RID)?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Experiment Custom Metadata",
+          "query_path":
"/entity/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Custom_Metadata:Experiment_RID)?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Experiment Settings", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Settings:Experiment_RID)/RID,Experiment_RID,Alignment_Format,Aligner,Aligner_Version,Reference_Genome,Sequence_Trimming,Duplicate_Removal,Pre-alignment_Sequence_Removal,Junction_Reads,Library_Type,Protocol_Reference,Library_Selection,Quantification_Format,Quantification_Software,Expression_Metric,Transcriptome_Model,Sequencing_Platform,Paired_End,Read_Length,Has_Strand_Specific_Information,Used_Spike_Ins,Spike_Ins_Amount,Visualization_Format,Visualization_Software,Visualization_Version,Visualization_Setting,Notes,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Replicate", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/RID,Study_RID,Experiment_RID,Biological_Replicate_Number,Technical_Replicate_Number,Specimen_RID,Collection_Date,Mapped_Reads,GEO_Sample_Accession_ID,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Specimen", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/S:=(Specimen_RID)=(Gene_Expression:Specimen:RID)/T:=left(Stage_ID)=(Vocabulary:Developmental_Stage:ID)/$S/RID,Title,Species,Stage_ID,Stage_Name:=T:Name,Stage_Detail,Assay_Type,Strain,Wild_Type,Sex,Passage,Phenotype,Cell_Line,Parent_Specimen,Upload_Notes,Preparation,Fixation,Embedding,Internal_ID,Principal_Investigator,Consortium,Release_Date,RCT,RMT,GUDMAP2_Accession_ID?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Specimen_Anatomical_Source", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Specimen_RID)=(Gene_Expression:Specimen:RID)/(RID)=(Gene_Expression:Specimen_Tissue:Specimen_RID)/RID,Specimen_RID,Tissue,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Specimen_Cell_Types", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Specimen_RID)=(Gene_Expression:Specimen:RID)/(RID)=(Gene_Expression:Specimen_Cell_Type:Specimen)/RID,Specimen_RID:=Specimen,Cell_Type,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Single Cell Metrics", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:Single_Cell_Metrics:Replicate_RID)/RID,Study_RID,Experiment_RID,Replicate_RID,Reads_%28Millions%29,Reads%2FCell,Detected_Gene_Count,Genes%2FCell,UMI%2FCell,Estimated_Cell_Count,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "File", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/RID,Study_RID,Experiment_RID,Replicate_RID,Caption,File_Type,File_Name,URI,File_size,MD5,GEO_Archival_URL,dbGaP_Accession_ID,Processed,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT,Legacy_File_RID,GUDMAP_NGF_OID,GUDMAP_NGS_OID?limit=none" + } + }, + { + "processor": "fetch", + "processor_params": { + "output_path": "assets/Study/{Study_RID}/Experiment/{Experiment_RID}/Replicate/{Replicate_RID}", + "query_path": 
"/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/url:=URI,length:=File_size,filename:=File_Name,md5:=MD5,Study_RID,Experiment_RID,Replicate_RID?limit=none" + } + } + ] + } +} diff --git a/workflow/conf/spot.config b/workflow/conf/spot.config new file mode 100755 index 0000000000000000000000000000000000000000..d9c7a4c8fa34aadd597da0170f8e3e223923011a --- /dev/null +++ b/workflow/conf/spot.config @@ -0,0 +1,3 @@ +process { + queue = 'default-0ef8afb0-c7ad-11ea-b907-06c94a3c6390' +} diff --git a/workflow/docker/.gitkeep b/workflow/docker/.gitkeep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/workflow/docker/getData b/workflow/docker/getData deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/workflow/docker/images/.gitkeep b/workflow/docker/images/.gitkeep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/workflow/docker/temp b/workflow/docker/temp deleted file mode 100644 index f7dcb3af08981d465bf0838d09de1b38e9e0c5aa..0000000000000000000000000000000000000000 --- a/workflow/docker/temp +++ /dev/null @@ -1,14 +0,0 @@ - - -RUN apt-get install -y python3.7 python3-pip - -RUN wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - bash Miniconda3-latest-Linux-x86_64.sh -p /miniconda -b && \ - rm Miniconda3-latest-Linux-x86_64.sh -ENV PATH=/miniconda/bin:${PATH} -RUN conda config --add channels defaults && \ - conda config --add channels bioconda && \ - conda config --add channels conda-forge && \ - conda update -n base -c defaults -y conda - -RUN pip install --upgrade pip diff --git a/workflow/nextflow.config b/workflow/nextflow.config index 30e47ea1aea37ed6550cc2944d69d26e69887489..db422b68cb4a8d1900cbae33801f6d5f5b8eb9a0 100644 --- a/workflow/nextflow.config +++ b/workflow/nextflow.config @@ -2,4 +2,98 @@ profiles { standard { includeConfig 'conf/biohpc.config' } + biohpc { + includeConfig 'conf/biohpc.config' + } + biohpc_max { + includeConfig 'conf/biohpc_max.config' + } + aws_ondemand { + includeConfig 'conf/aws.config' + includeConfig 'conf/ondemand.config' + } + aws_spot { + includeConfig 'conf/aws.config' + includeConfig 'conf/spot.config' + } +} + +process { + withName:getBag { + container = 'bicf/gudmaprbkfilexfer:2.0.1_indev' + } + withName:getData { + container = 'bicf/gudmaprbkfilexfer:2.0.1_indev' + } + withName: parseMetadata { + container = 'bicf/python3:2.0.1_indev' + } + withName: trimData { + container = 'bicf/trimgalore:1.1' + } + withName: getRefInfer { + container = 'bicf/awscli:1.1' + } + withName: downsampleData { + container = 'bicf/seqtk:2.0.1_indev' + } + withName: alignSampleData { + container = 'bicf/gudmaprbkaligner:2.0.1_indev' + } + withName: inferMetadata { + container = 'bicf/rseqc3.0:2.0.1_indev' + } + withName: getRef { + container = 'bicf/awscli:1.1' + } + withName: alignData { + container = 'bicf/gudmaprbkaligner:2.0.1_indev' + } + withName: dedupData { + container = 'bicf/gudmaprbkdedup:2.0.0' + } + withName: countData { + container = 'bicf/subread2:2.0.0' + } + withName: makeBigWig { + container = 'bicf/deeptools3.3:2.0.1_indev' + } + withName: fastqc { + container = 'bicf/fastqc:2.0.1_indev' + } + withName: dataQC { + container = 'bicf/rseqc3.0:2.0.1_indev' + } + withName: aggrQC { + container = 'bicf/multiqc1.8:2.0.1_indev' + } +} + +trace { + enabled = true + file = 
'pipeline_trace.txt' + fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss' +} + +timeline { + enabled = true + file = 'timeline.html' +} + +report { + enabled = true + file = 'report.html' +} + +tower { + accessToken = '3ade8f325d4855434b49aa387421a44c63e3360f' + enabled = true +} + +manifest { + homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq' + description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.' + mainScript = 'rna-seq.nf' + version = 'v0.0.1' + nextflowVersion = '>=19.09.0' } diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf old mode 100755 new mode 100644 index d839044791bc3aaccc701c9fb62099105c97205b..772d076174400bca13110869a797f87617ff5c89 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -1,49 +1,1060 @@ #!/usr/bin/env nextflow -// Define input variables -params.bdbag = "${baseDir}/../test_data/Study_Q-Y4H0.zip" +// ######## #### ###### ######## +// ## ## ## ## ## ## +// ## ## ## ## ## +// ######## ## ## ###### +// ## ## ## ## ## +// ## ## ## ## ## ## +// ######## #### ###### ## +// Define input variables +params.deriva = "${baseDir}/../test_data/auth/credential.json" +params.bdbag = "${baseDir}/../test_data/auth/cookies.txt" +//params.repRID = "16-1ZX4" +params.repRID = "Q-Y5JA" +params.source = "dev" +params.refMoVersion = "38.p6.vM22" +params.refHuVersion = "38.p12.v31" +params.refERCCVersion = "92" params.outDir = "${baseDir}/../output" // Parse input variables +deriva = Channel + .fromPath(params.deriva) + .ifEmpty { exit 1, "deriva credential file not found: ${params.deriva}" } bdbag = Channel .fromPath(params.bdbag) - .ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" } - + .ifEmpty { exit 1, "deriva cookie file for bdbag not found: ${params.bdbag}" } +repRID = params.repRID +refMoVersion = params.refMoVersion +refHuVersion = params.refHuVersion +refERCCVersion = params.refERCCVersion outDir = params.outDir +logsDir = "${outDir}/Logs" + +// Define fixed files +derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json") +if (params.source == "dev") { + source = "dev.gudmap.org" +} else if (params.source == "staging") { + source = "staging.gudmap.org" +} else if (params.source == "production") { + source = "www.gudmap.org" +} +referenceBase = "s3://bicf-references" +//referenceBase = "/project/BICF/BICF_Core/shared/gudmap/references" +referenceInfer = Channel.fromList(["ERCC","GRCh","GRCm"]) +multiqcConfig = Channel.fromPath("${baseDir}/conf/multiqc_config.yaml") +bicfLogo = Channel.fromPath("${baseDir}/../docs/bicf_logo.png") + +// Define script files +script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh") +script_parseMeta = Channel.fromPath("${baseDir}/scripts/parseMeta.py") +script_inferMeta = Channel.fromPath("${baseDir}/scripts/inferMeta.sh") +script_calculateTPM = Channel.fromPath("${baseDir}/scripts/calculateTPM.R") +script_convertGeneSymbols = Channel.fromPath("${baseDir}/scripts/convertGeneSymbols.R") +script_tinHist = Channel.fromPath("${baseDir}/scripts/tinHist.py") + +/* + * trackStart: track start of pipeline + */ +params.ci = false +params.dev = false +process trackStart { + container 'docker://bicf/bicfbase:2.1.0' + script: + """ + hostname + ulimit -a + + curl -H 'Content-Type: application/json' -X PUT -d \ + '{ \ + "sessionId": "${workflow.sessionId}", \ + "pipeline": "gudmap.rbk_rnaseq", \ + "start": "${workflow.start}", \ + 
"repRID": "${repRID}", \ + "astrocyte": false, \ + "status": "started", \ + "nextflowVersion": "${workflow.nextflow.version}", \ + "pipelineVersion": "${workflow.manifest.version}", \ + "ci": ${params.ci}, \ + "dev": ${params.dev} \ + }' \ + "https://xku43pcwnf.execute-api.us-east-1.amazonaws.com/ProdDeploy/pipeline-tracking" + """ +} + +log.info """\ +==================================== +BICF RNA-seq Pipeline for GUDMAP/RBK +==================================== +Replicate RID : ${params.repRID} +Source Server : ${params.source} +Mouse Reference Version: ${params.refMoVersion} +Human Reference Version: ${params.refHuVersion} +ERCC Reference Version : ${params.refERCCVersion} +Output Directory : ${params.outDir} +------------------------------------ +Nextflow Version : ${workflow.nextflow.version} +Pipeline Version : ${workflow.manifest.version} +Session ID : ${workflow.sessionId} +------------------------------------ +CI : ${params.ci} +Development : ${params.dev} +------------------------------------ +""" +/* + * splitData: split bdbag files by replicate so fetch can occure in parallel, and rename files to replicate rid + */ +process getBag { + tag "${repRID}" + + input: + path credential, stageAs: "credential.json" from deriva + path derivaConfig + + output: + path ("Replicate_*.zip") into bagit + + script: + """ + hostname > ${repRID}.getBag.log + ulimit -a >> ${repRID}.getBag.log + + # link credential file for authentication + echo -e "LOG: linking deriva credentials" >> ${repRID}.getBag.log + mkdir -p ~/.deriva + ln -sf `readlink -e credential.json` ~/.deriva/credential.json + echo -e "LOG: linked" >> ${repRID}.getBag.log + + # deriva-download replicate RID + echo -e "LOG: fetching bagit for ${repRID} in GUDMAP" >> ${repRID}.getBag.log + deriva-download-cli ${source} --catalog 2 ${derivaConfig} . rid=${repRID} + echo -e "LOG: fetched" >> ${repRID}.getBag.log + """ +} /* * getData: fetch study files from consortium with downloaded bdbag.zip - * python must be loaded prior to nextflow run, because conda env create from .yml doesn't work with nextflow loaded module (either process in-line, or config file) */ - process getData { - publishDir "${outDir}/temp/getData", mode: "symlink" - conda "${baseDir}/conf/conda.env.bdbag.yml" - - input: - file bdbag - - output: - file("**/*.R*.fastq.gz") into fastqPaths - file("**/File.csv") into filePaths - file("**/Experiment Settings.csv") into experimentSettingsPaths - file("**/Experiment.csv") into experimentPaths - - script: - """ - hostname - ulimit -a - study=\$(echo "${bdbag}" | cut -d'.' -f1) - echo LOG: \${study} - unzip ${bdbag} - echo LOG: bdgag unzipped - python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study} - echo LOG: fetch file filtered for only .fastq.gz - #bdbag --materialize "\$(echo "${bdbag}" | cut -d'.' 
-    #bdbag --materialize "\$(echo "${bdbag}" | cut -d'.' -f1)"
-    sh ${baseDir}/scripts/bdbagFetch.sh \${study}
-    echo LOG: bdbag fetched
-    sh ${baseDir}/scripts/renameFastq.sh \${study}
-    echo LOG: fastq.gz files renamed to replicate RID
-    """
- }
\ No newline at end of file
+process getData {
+  tag "${repRID}"
+
+  input:
+    path script_bdbagFetch
+    path cookies, stageAs: "deriva-cookies.txt" from bdbag
+    path bagit
+
+  output:
+    path ("*.R{1,2}.fastq.gz") into fastqs
+    path ("**/File.csv") into fileMeta
+    path ("**/Experiment Settings.csv") into experimentSettingsMeta
+    path ("**/Experiment.csv") into experimentMeta
+
+  script:
+    """
+    hostname > ${repRID}.getData.log
+    ulimit -a >> ${repRID}.getData.log
+
+    # link deriva cookie for authentication
+    echo -e "LOG: linking deriva cookie" >> ${repRID}.getData.log
+    mkdir -p ~/.bdbag
+    ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt
+    echo -e "LOG: linked" >> ${repRID}.getData.log
+
+    # get bagit basename
+    replicate=\$(basename "${bagit}" | cut -d "." -f1)
+    echo -e "LOG: bagit replicate name \${replicate}" >> ${repRID}.getData.log
+
+    # unzip bagit
+    echo -e "LOG: unzipping replicate bagit" >> ${repRID}.getData.log
+    unzip ${bagit}
+    echo -e "LOG: unzipped" >> ${repRID}.getData.log
+
+    # bagit fetch fastq's only and rename by repRID
+    echo -e "LOG: fetching replicate bdbag" >> ${repRID}.getData.log
+    sh ${script_bdbagFetch} \${replicate} ${repRID}
+    echo -e "LOG: fetched" >> ${repRID}.getData.log
+    """
+}
+
+// Replicate raw fastq's for multiple process inputs
+fastqs.into {
+  fastqs_trimData
+  fastqs_fastqc
+}
+
+/*
+ * parseMetadata: parses metadata to extract experiment parameters
+*/
+process parseMetadata {
+  tag "${repRID}"
+
+  input:
+    path script_parseMeta
+    path file from fileMeta
+    path experimentSettings, stageAs: "ExperimentSettings.csv" from experimentSettingsMeta
+    path experiment from experimentMeta
+
+  output:
+    path "design.csv" into metadata
+
+  script:
+    """
+    hostname > ${repRID}.parseMetadata.log
+    ulimit -a >> ${repRID}.parseMetadata.log
+
+    # check replicate RID metadata
+    rep=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p repRID)
+    echo -e "LOG: replicate RID metadata parsed: \${rep}" >> ${repRID}.parseMetadata.log
+
+    # get experiment RID metadata
+    exp=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p expRID)
+    echo -e "LOG: experiment RID metadata parsed: \${exp}" >> ${repRID}.parseMetadata.log
+
+    # get study RID metadata
+    study=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p studyRID)
+    echo -e "LOG: study RID metadata parsed: \${study}" >> ${repRID}.parseMetadata.log
+
+    # get endedness metadata
+    endsMeta=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p endsMeta)
+    echo -e "LOG: endedness metadata parsed: \${endsMeta}" >> ${repRID}.parseMetadata.log
+
+    # manually get endedness
+    endsManual=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p endsManual)
+    echo -e "LOG: endedness manually detected: \${endsManual}" >> ${repRID}.parseMetadata.log
+
+    # get strandedness metadata
+    stranded=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p stranded)
+    echo -e "LOG: strandedness metadata parsed: \${stranded}" >> ${repRID}.parseMetadata.log
+
+    # get spike-in metadata
+    spike=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p spike)
+    echo -e "LOG: spike-in metadata parsed: \${spike}" >> ${repRID}.parseMetadata.log
+
+    # get species metadata
+    species=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experiment}" -p species)
+    echo -e "LOG: species metadata parsed: \${species}" >> ${repRID}.parseMetadata.log
+
+    # get read length metadata
+    readLength=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p readLength)
+    if [ "\${readLength}" = "nan" ]
+    then
+      readLength="NA"
+    fi
+    echo -e "LOG: read length metadata parsed: \${readLength}" >> ${repRID}.parseMetadata.log
+
+    # save design file
+    echo -e "\${endsMeta},\${endsManual},\${stranded},\${spike},\${species},\${readLength},\${exp},\${study}" > design.csv
+    """
+}
+
+// Split metadata into separate channels
+endsMeta = Channel.create()
+endsManual = Channel.create()
+strandedMeta = Channel.create()
+spikeMeta = Channel.create()
+speciesMeta = Channel.create()
+readLengthMeta = Channel.create()
+expRID = Channel.create()
+studyRID = Channel.create()
+metadata.splitCsv(sep: ",", header: false).separate(
+  endsMeta,
+  endsManual,
+  strandedMeta,
+  spikeMeta,
+  speciesMeta,
+  readLengthMeta,
+  expRID,
+  studyRID
+)
+// Replicate metadata for multiple process inputs
+endsManual.into {
+  endsManual_trimData
+  endsManual_downsampleData
+  endsManual_alignSampleData
+  endsManual_aggrQC
+}
+
+/*
+ * trimData: trims any adapter or non-host sequences from the data
+*/
+process trimData {
+  tag "${repRID}"
+
+  input:
+    val ends from endsManual_trimData
+    path (fastq) from fastqs_trimData
+
+  output:
+    path ("*.fq.gz") into fastqsTrim
+    path ("*_trimming_report.txt") into trimQC
+    path ("readLength.csv") into inferMetadata_readLength
+
+  script:
+    """
+    hostname > ${repRID}.trimData.log
+    ulimit -a >> ${repRID}.trimData.log
+
+    # trim fastq's using trim_galore and extract median read length
+    echo -e "LOG: trimming ${ends}" >> ${repRID}.trimData.log
+    if [ "${ends}" == "se" ]
+    then
+      trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID} -j `nproc` ${fastq[0]}
+      readLength=\$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
+    elif [ "${ends}" == "pe" ]
+    then
+      trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]}
+      readLength=\$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
+    fi
+    echo -e "LOG: trimmed" >> ${repRID}.trimData.log
+    echo -e "LOG: median trimmed read length: \${readLength}" >> ${repRID}.trimData.log
+
+    # save read length file
+    echo -e "\${readLength}" > readLength.csv
+    """
+}
+
+// Extract calculated read length metadata into channel
+readLengthInfer = Channel.create()
+inferMetadata_readLength.splitCsv(sep: ",", header: false).separate(
+  readLengthInfer
+)
+
+// Replicate trimmed fastq's
+fastqsTrim.into {
+  fastqsTrim_alignData
+  fastqsTrim_downsampleData
+}
+
+/*
+ * getRefInfer: downloads appropriate reference for metadata inference
+*/
+process getRefInfer {
+  tag "${refName}"
+
+  input:
+    val refName from referenceInfer
+
+  output:
+    tuple val (refName), path ("hisat2", type: 'dir'), path ("*.fna"), path ("*.gtf") into refInfer
+    path ("${refName}", type: 'dir') into bedInfer
+
+  script:
+    """
+    hostname > ${repRID}.${refName}.getRefInfer.log
+    ulimit -a >> ${repRID}.${refName}.getRefInfer.log
+
+    # set the reference name
+    if [ "${refName}" == "ERCC" ]
+    then
+      references=\$(echo ${referenceBase}/ERCC${refERCCVersion})
+    elif [ "${refName}" == "GRCm" ]
+    then
+      references=\$(echo ${referenceBase}/GRCm${refMoVersion})
+    elif [ "${refName}" == "GRCh" ]
+    then
+      references=\$(echo ${referenceBase}/GRCh${refHuVersion})
+    else
+      echo -e "LOG: ERROR - References could not be set!\nReference found: ${referenceBase}" >> ${repRID}.${refName}.getRefInfer.log
+      exit 1
+    fi
+    mkdir ${refName}
+
+    # retrieve reference files from the appropriate location
+    echo -e "LOG: fetching ${refName} reference files from ${referenceBase}" >> ${repRID}.${refName}.getRefInfer.log
+    if [ ${referenceBase} == "s3://bicf-references" ]
+    then
+      aws s3 cp "\${references}"/hisat2 ./hisat2 --recursive
+      aws s3 cp "\${references}"/bed ./${refName}/bed --recursive
+      aws s3 cp "\${references}"/genome.fna ./
+      aws s3 cp "\${references}"/genome.gtf ./
+    elif [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references" ]
+    then
+      ln -s "\${references}"/hisat2
+      ln -s "\${references}"/bed ${refName}/bed
+      ln -s "\${references}"/genome.fna
+      ln -s "\${references}"/genome.gtf
+    fi
+    echo -e "LOG: fetched" >> ${repRID}.${refName}.getRefInfer.log
+
+    # make blank bed folder for ERCC
+    echo -e "LOG: making dummy bed folder for ERCC" >> ${repRID}.${refName}.getRefInfer.log
+    if [ "${refName}" == "ERCC" ]
+    then
+      rm -rf ${refName}/bed
+      mkdir ${refName}/bed
+      touch ${refName}/bed/temp
+    fi
+    """
+}
+
+/*
+ * downsampleData: downsample fastq's for metadata inference
+ */
+process downsampleData {
+  tag "${repRID}"
+
+  input:
+    val ends from endsManual_downsampleData
+    path fastq from fastqsTrim_downsampleData
+
+  output:
+    path ("sampled.1.fq") into fastqs1Sample
+    path ("sampled.2.fq") into fastqs2Sample
+
+  script:
+    """
+    hostname > ${repRID}.downsampleData.log
+    ulimit -a >> ${repRID}.downsampleData.log
+
+    if [ "${ends}" == "se" ]
+    then
+      echo -e "LOG: downsampling SE trimmed fastq" >> ${repRID}.downsampleData.log
+      seqtk sample -s100 *trimmed.fq.gz 100000 1> sampled.1.fq
+      touch sampled.2.fq
+    elif [ "${ends}" == "pe" ]
+    then
+      echo -e "LOG: downsampling R1 of PE trimmed fastq" >> ${repRID}.downsampleData.log
+      seqtk sample -s100 *1.fq.gz 1000000 1> sampled.1.fq
+      echo -e "LOG: downsampling R2 of PE trimmed fastq" >> ${repRID}.downsampleData.log
+      seqtk sample -s100 *2.fq.gz 1000000 1> sampled.2.fq
+    fi
+    echo -e "LOG: downsampled" >> ${repRID}.downsampleData.log
+    """
+}
+
+// Replicate the downsampled fastq's and attach them to the references
+inferInput = endsManual_alignSampleData.combine(refInfer.combine(fastqs1Sample.collect().combine(fastqs2Sample.collect())))
+
+/*
+ * alignSampleData: aligns the downsampled reads to a reference database
+*/
+process alignSampleData {
+  tag "${ref}"
+
+  input:
+    tuple val (ends), val (ref), path (hisat2), path (fna), path (gtf), path (fastq1), path (fastq2) from inferInput
+
+  output:
+    path ("${ref}.sampled.sorted.bam") into sampleBam
+    path ("${ref}.sampled.sorted.bam.bai") into sampleBai
+    path ("${ref}.alignSampleSummary.txt") into alignSampleQC
+
+  script:
+    """
+    hostname > ${repRID}.${ref}.alignSampleData.log
+    ulimit -a >> ${repRID}.${ref}.alignSampleData.log
+
+    # align the reads with Hisat2
+    echo -e "LOG: aligning ${ends}" >> ${repRID}.${ref}.alignSampleData.log
+    if [ "${ends}" == "se" ]
+    then
+      hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome -U ${fastq1} --summary-file ${ref}.alignSampleSummary.txt --new-summary
+    elif [ "${ends}" == "pe" ]
+    then
+      hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome --no-mixed --no-discordant -1 ${fastq1} -2 ${fastq2} --summary-file ${ref}.alignSampleSummary.txt --new-summary
+    fi
+    echo -e "LOG: aligned" >> ${repRID}.${ref}.alignSampleData.log
+
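+    # samtools view flags used below: -F 4 drops unmapped reads, -F 8 drops
+    # reads whose mate is unmapped, and -F 256 drops secondary alignments, so
+    # only primary, fully mapped reads feed the downstream inference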
+    # convert the output sam file to a sorted bam file using Samtools
+    echo -e "LOG: converting from sam to bam" >> ${repRID}.${ref}.alignSampleData.log
+    samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${ref}.sampled.bam ${ref}.sampled.sam
+
+    # sort the bam file using Samtools
+    echo -e "LOG: sorting the bam file" >> ${repRID}.${ref}.alignSampleData.log
+    samtools sort -@ `nproc` -O BAM -o ${ref}.sampled.sorted.bam ${ref}.sampled.bam
+
+    # index the sorted bam using Samtools
+    echo -e "LOG: indexing sorted bam file" >> ${repRID}.${ref}.alignSampleData.log
+    samtools index -@ `nproc` -b ${ref}.sampled.sorted.bam ${ref}.sampled.sorted.bam.bai
+    """
+}
+
+alignSampleQC.into {
+  alignSampleQC_inferMetadata
+  alignSampleQC_aggrQC
+}
+
+/*
+ * inferMetadata: infer species, spike-in, endedness, and strandedness from the sampled alignments
+*/
+process inferMetadata {
+  tag "${repRID}"
+
+  input:
+    path script_inferMeta
+    path beds from bedInfer.collect()
+    path bam from sampleBam.collect()
+    path bai from sampleBai.collect()
+    path alignSummary from alignSampleQC_inferMetadata.collect()
+
+  output:
+    path "infer.csv" into inferMetadata
+    path "${repRID}.infer_experiment.txt" into inferExperiment
+
+  script:
+    """
+    hostname > ${repRID}.inferMetadata.log
+    ulimit -a >> ${repRID}.inferMetadata.log
+
+    # collect alignment rates (round down to integers)
+    align_ercc=\$(echo \$(grep "Overall alignment rate" ERCC.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%'))
+    align_ercc=\$(echo \${align_ercc%.*})
+    echo -e "LOG: alignment rate to ERCC: \${align_ercc}" >> ${repRID}.inferMetadata.log
+    align_hu=\$(echo \$(grep "Overall alignment rate" GRCh.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%'))
+    align_hu=\$(echo \${align_hu%.*})
+    echo -e "LOG: alignment rate to GRCh: \${align_hu}" >> ${repRID}.inferMetadata.log
+    align_mo=\$(echo \$(grep "Overall alignment rate" GRCm.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%'))
+    align_mo=\$(echo \${align_mo%.*})
+    echo -e "LOG: alignment rate to GRCm: \${align_mo}" >> ${repRID}.inferMetadata.log
+
+    # determine spike-in
+    if [ 1 -eq \$(echo \$(expr \${align_ercc} ">=" 10)) ]
+    then
+      spike="yes"
+    else
+      spike="no"
+    fi
+    echo -e "LOG: inference of spike-in results in: \${spike}" >> ${repRID}.inferMetadata.log
+
+    # determine species
+    if [ 1 -eq \$(echo \$(expr \${align_hu} ">=" 25)) ] && [ 1 -eq \$(echo \$(expr \${align_mo} "<" 25)) ]
+    then
+      species="Homo sapiens"
+      bam="GRCh.sampled.sorted.bam"
+      bed="./GRCh/bed/genome.bed"
+    elif [ 1 -eq \$(echo \$(expr \${align_mo} ">=" 25)) ] && [ 1 -eq \$(echo \$(expr \${align_hu} "<" 25)) ]
+    then
+      species="Mus musculus"
+      bam="GRCm.sampled.sorted.bam"
+      bed="./GRCm/bed/genome.bed"
+    else
+      echo -e "LOG: ERROR - inference of species returns an ambiguous result: hu=\${align_hu} mo=\${align_mo}" >> ${repRID}.inferMetadata.log
+      exit 1
+    fi
+    echo -e "LOG: inference of species results in: \${species}" >> ${repRID}.inferMetadata.log
+
+    # infer experimental setting from the sampled alignments
+    echo -e "LOG: inferring experimental setting from sampled alignments" >> ${repRID}.inferMetadata.log
+    infer_experiment.py -r "\${bed}" -i "\${bam}" 1>> ${repRID}.infer_experiment.txt
+    echo -e "LOG: inferred" >> ${repRID}.inferMetadata.log
+
+    ended=`bash inferMeta.sh endness ${repRID}.infer_experiment.txt`
+    fail=`bash inferMeta.sh fail ${repRID}.infer_experiment.txt`
+    if [ \${ended} == "PairEnd" ]
+    then
+      ends="pe"
+      percentF=`bash inferMeta.sh pef ${repRID}.infer_experiment.txt`
+      percentR=`bash inferMeta.sh per ${repRID}.infer_experiment.txt`
+    elif [ \${ended} == "SingleEnd" ]
+    then
+      ends="se"
+      percentF=`bash inferMeta.sh sef ${repRID}.infer_experiment.txt`
+      percentR=`bash inferMeta.sh ser ${repRID}.infer_experiment.txt`
+    fi
+    echo -e "LOG: percentage reads in the same direction as gene: \${percentF}" >> ${repRID}.inferMetadata.log
+    echo -e "LOG: percentage reads in the opposite direction as gene: \${percentR}" >> ${repRID}.inferMetadata.log
+    if [ 1 -eq \$(echo \$(expr \${percentF#*.} ">" 2500)) ] && [ 1 -eq \$(echo \$(expr \${percentR#*.} "<" 2500)) ]
+    then
+      stranded="forward"
+    elif [ 1 -eq \$(echo \$(expr \${percentR#*.} ">" 2500)) ] && [ 1 -eq \$(echo \$(expr \${percentF#*.} "<" 2500)) ]
+    then
+      stranded="reverse"
+    else
+      stranded="unstranded"
+    fi
+    echo -e "LOG: strandedness set to: \${stranded}" >> ${repRID}.inferMetadata.log
+
+    # write inferred metadata to file
+    echo "\${ends},\${stranded},\${spike},\${species},\${align_ercc},\${align_hu},\${align_mo},\${percentF},\${percentR},\${fail}" 1>> infer.csv
+    """
+}
+
+// Split metadata into separate channels
+endsInfer = Channel.create()
+strandedInfer = Channel.create()
+spikeInfer = Channel.create()
+speciesInfer = Channel.create()
+align_erccInfer = Channel.create()
+align_huInfer = Channel.create()
+align_moInfer = Channel.create()
+percentFInfer = Channel.create()
+percentRInfer = Channel.create()
+failInfer = Channel.create()
+inferMetadata.splitCsv(sep: ",", header: false).separate(
+  endsInfer,
+  strandedInfer,
+  spikeInfer,
+  speciesInfer,
+  align_erccInfer,
+  align_huInfer,
+  align_moInfer,
+  percentFInfer,
+  percentRInfer,
+  failInfer
+)
+// Replicate metadata for multiple process inputs
+endsInfer.into {
+  endsInfer_alignData
+  endsInfer_countData
+  endsInfer_dataQC
+  endsInfer_aggrQC
+}
+strandedInfer.into {
+  strandedInfer_alignData
+  strandedInfer_countData
+  strandedInfer_aggrQC
+}
+spikeInfer.into {
+  spikeInfer_getRef
+  spikeInfer_aggrQC
+}
+speciesInfer.into {
+  speciesInfer_getRef
+  speciesInfer_aggrQC
+}
+
+/*
+ * getRef: downloads appropriate reference
+*/
+process getRef {
+  tag "${species}"
+
+  input:
+    val spike from spikeInfer_getRef
+    val species from speciesInfer_getRef
+
+  output:
+    tuple path ("hisat2", type: 'dir'), path ("bed", type: 'dir'), path ("*.fna"), path ("*.gtf"), path ("geneID.tsv"), path ("Entrez.tsv") into reference
+
+  script:
+    """
+    hostname > ${repRID}.getRef.log
+    ulimit -a >> ${repRID}.getRef.log
+
+    # set the reference name
+    if [ "${species}" == "Mus musculus" ]
+    then
+      references=\$(echo ${referenceBase}/GRCm${refMoVersion})
+    elif [ "${species}" == "Homo sapiens" ]
+    then
+      references=\$(echo ${referenceBase}/GRCh${refHuVersion})
+    else
+      echo -e "LOG: ERROR - References could not be set!\nSpecies reference found: ${species}" >> ${repRID}.getRef.log
+      exit 1
+    fi
+    if [ "${spike}" == "yes" ]
+    then
+      references=\$(echo \${references}-S/)
+    elif [ "${spike}" == "no" ]
+    then
+      references=\$(echo \${references}/)
+    fi
+    echo -e "LOG: references set to \${references}" >> ${repRID}.getRef.log
+
+    # retrieve reference files from the appropriate location
+    echo -e "LOG: fetching ${species} reference files from ${referenceBase}" >> ${repRID}.getRef.log
+    if [ ${referenceBase} == "s3://bicf-references" ]
+    then
+      echo -e "LOG: grabbing reference files from S3" >> ${repRID}.getRef.log
+      aws s3 cp "\${references}"/hisat2 ./hisat2 --recursive
+      aws s3 cp "\${references}"/bed ./bed --recursive
+      aws s3 cp "\${references}"/genome.fna ./
+      aws s3 cp "\${references}"/genome.gtf ./
+      aws s3 cp "\${references}"/geneID.tsv ./
+      aws s3 cp "\${references}"/Entrez.tsv ./
+    elif [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references" ]
+    then
+      ln -s "\${references}"/hisat2
+      ln -s "\${references}"/bed
+      ln -s "\${references}"/genome.fna
+      ln -s "\${references}"/genome.gtf
+      ln -s "\${references}"/geneID.tsv
+      ln -s "\${references}"/Entrez.tsv
+    fi
+    echo -e "LOG: fetched" >> ${repRID}.getRef.log
+    """
+}
+
+// Replicate reference for multiple process inputs
+reference.into {
+  reference_alignData
+  reference_countData
+  reference_dataQC
+}
+
+/*
+ * alignData: aligns the reads to a reference database
+*/
+process alignData {
+  tag "${repRID}"
+
+  input:
+    val ends from endsInfer_alignData
+    val stranded from strandedInfer_alignData
+    path fastq from fastqsTrim_alignData
+    path reference_alignData
+
+  output:
+    tuple path ("${repRID}.sorted.bam"), path ("${repRID}.sorted.bam.bai") into rawBam
+    path ("*.alignSummary.txt") into alignQC
+
+  script:
+    """
+    hostname > ${repRID}.align.log
+    ulimit -a >> ${repRID}.align.log
+
+    # set stranded param for hisat2
+    if [ "${stranded}" == "unstranded" ]
+    then
+      strandedParam=""
+    elif [ "${stranded}" == "forward" ] && [ "${ends}" == "se" ]
+    then
+      strandedParam="--rna-strandness F"
+    elif [ "${stranded}" == "forward" ] && [ "${ends}" == "pe" ]
+    then
+      strandedParam="--rna-strandness FR"
+    elif [ "${stranded}" == "reverse" ] && [ "${ends}" == "se" ]
+    then
+      strandedParam="--rna-strandness R"
+    elif [ "${stranded}" == "reverse" ] && [ "${ends}" == "pe" ]
+    then
+      strandedParam="--rna-strandness RF"
+    fi
+
+    # align the reads with Hisat2
+    echo -e "LOG: aligning ${ends}" >> ${repRID}.align.log
+    if [ "${ends}" == "se" ]
+    then
+      hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome \${strandedParam} -U ${fastq[0]} --summary-file ${repRID}.alignSummary.txt --new-summary
+    elif [ "${ends}" == "pe" ]
+    then
+      hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome \${strandedParam} --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} --summary-file ${repRID}.alignSummary.txt --new-summary
+    fi
+    echo -e "LOG: aligned" >> ${repRID}.align.log
+
+    # convert the output sam file to a sorted bam file using Samtools
+    echo -e "LOG: converting from sam to bam" >> ${repRID}.align.log
+    samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam
+
+    # sort the bam file using Samtools
+    echo -e "LOG: sorting the bam file" >> ${repRID}.align.log
+    samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.bam ${repRID}.bam
+
+    # index the sorted bam using Samtools
+    echo -e "LOG: indexing sorted bam file" >> ${repRID}.align.log
+    samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bam.bai
+    """
+}
+
+// Replicate rawBam for multiple process inputs
+rawBam.into {
+  rawBam_dedupData
+}
+
+/*
+ *dedupData: remove duplicate reads, specifically focused on PCR or optical duplicates
+*/
+process dedupData {
+  tag "${repRID}"
+  publishDir "${outDir}/bam", mode: 'copy', pattern: "*.deduped.bam"
+
+  input:
+    tuple path (bam), path (bai) from rawBam_dedupData
+
+  output:
+    tuple path ("${repRID}.sorted.deduped.bam"), path ("${repRID}.sorted.deduped.bam.bai") into dedupBam
+    tuple path ("${repRID}.sorted.deduped.*.bam"), path ("${repRID}.sorted.deduped.*.bam.bai") into dedupChrBam
+    path ("*.deduped.Metrics.txt") into dedupQC
+
+  script:
+    """
+    hostname > ${repRID}.dedup.log
+    ulimit -a >> ${repRID}.dedup.log
+
+    # remove duplicated reads using Picard's MarkDuplicates
+    echo -e "LOG: deduplicating reads" >> ${repRID}.dedup.log
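+    # REMOVE_DUPLICATES=true drops duplicate reads from the output BAM entirely
+    # rather than only flagging them, so downstream counts use deduped reads only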
+    java -jar /picard/build/libs/picard.jar MarkDuplicates I=${bam} O=${repRID}.deduped.bam M=${repRID}.deduped.Metrics.txt REMOVE_DUPLICATES=true
+    echo -e "LOG: deduplicated" >> ${repRID}.dedup.log
+
+    # sort the bam file using Samtools
+    echo -e "LOG: sorting the bam file" >> ${repRID}.dedup.log
+    samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.deduped.bam ${repRID}.deduped.bam
+
+    # index the sorted bam using Samtools
+    echo -e "LOG: indexing sorted bam file" >> ${repRID}.dedup.log
+    samtools index -@ `nproc` -b ${repRID}.sorted.deduped.bam ${repRID}.sorted.deduped.bam.bai
+
+    # split the deduped BAM file for multi-threaded tin calculation
+    for i in `samtools view ${repRID}.sorted.deduped.bam | cut -f3 | sort | uniq`;
+    do
+      echo "echo \"LOG: splitting each chromosome into its own BAM and BAI files with Samtools\"; samtools view -b ${repRID}.sorted.deduped.bam \${i} 1>> ${repRID}.sorted.deduped.\${i}.bam; samtools index -@ `nproc` -b ${repRID}.sorted.deduped.\${i}.bam ${repRID}.sorted.deduped.\${i}.bam.bai"
+    done | parallel -j `nproc` -k
+    """
+}
+
+// Replicate dedup bam/bai for multiple process inputs
+dedupBam.into {
+  dedupBam_countData
+  dedupBam_makeBigWig
+  dedupBam_dataQC
+}
+
+/*
+ *makeBigWig: make BigWig files for output
+*/
+process makeBigWig {
+  tag "${repRID}"
+  publishDir "${outDir}/bigwig", mode: 'copy', pattern: "${repRID}.bw"
+
+  input:
+    tuple path (bam), path (bai) from dedupBam_makeBigWig
+
+  output:
+    path ("${repRID}.bw")
+
+  script:
+    """
+    hostname > ${repRID}.makeBigWig.log
+    ulimit -a >> ${repRID}.makeBigWig.log
+
+    # create bigwig
+    echo -e "LOG: creating bigWig" >> ${repRID}.makeBigWig.log
+    bamCoverage -p `nproc` -b ${bam} -o ${repRID}.bw
+    echo -e "LOG: created" >> ${repRID}.makeBigWig.log
+    """
+}
+
+/*
+ *countData: count data and calculate tpm
+*/
+process countData {
+  tag "${repRID}"
+  publishDir "${outDir}/count", mode: 'copy', pattern: "${repRID}*.countTable.csv"
+
+  input:
+    path script_calculateTPM
+    path script_convertGeneSymbols
+    tuple path (bam), path (bai) from dedupBam_countData
+    path ref from reference_countData
+    val ends from endsInfer_countData
+    val stranded from strandedInfer_countData
+
+  output:
+    path ("*.tpmTable.csv") into counts
+    path ("*.countData.summary") into countsQC
+    path ("assignedReads.csv") into inferMetadata_assignedReads
+
+  script:
+    """
+    hostname > ${repRID}.countData.log
+    ulimit -a >> ${repRID}.countData.log
+
+    # determine strandedness and set up stranding for countData
+    stranding=0;
+    if [ "${stranded}" == "unstranded" ]
+    then
+      stranding=0
+      echo -e "LOG: strandedness set to unstranded [0]" >> ${repRID}.countData.log
+    elif [ "${stranded}" == "forward" ]
+    then
+      stranding=1
+      echo -e "LOG: strandedness set to forward stranded [1]" >> ${repRID}.countData.log
+    elif [ "${stranded}" == "reverse" ]
+    then
+      stranding=2
+      echo -e "LOG: strandedness set to reverse stranded [2]" >> ${repRID}.countData.log
+    fi
+
+    # run featureCounts
+    echo -e "LOG: counting ${ends} features" >> ${repRID}.countData.log
+    if [ "${ends}" == "se" ]
+    then
+      featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' -o ${repRID}.countData -s \${stranding} -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam
+    elif [ "${ends}" == "pe" ]
+    then
+      featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' -o ${repRID}.countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam
+    fi
+    echo -e "LOG: counted" >> ${repRID}.countData.log
+
+    # extract assigned reads
+    grep -m 1 'Assigned' *.countData.summary | grep -oe '\\([0-9.]*\\)' > assignedReads.csv
+
+    # calculate TPM from the resulting countData table
+    echo -e "LOG: calculating TPM with R" >> ${repRID}.countData.log
+    Rscript calculateTPM.R --count "${repRID}.countData"
+
+    # convert gene symbols to Entrez id's
+    echo -e "LOG: convert gene symbols to Entrez id's" >> ${repRID}.countData.log
+    Rscript convertGeneSymbols.R --repRID "${repRID}"
+    """
+}
+
+// Extract number of assigned reads metadata into channel
+assignedReadsInfer = Channel.create()
+inferMetadata_assignedReads.splitCsv(sep: ",", header: false).separate(
+  assignedReadsInfer
+)
+
+/*
+ *fastqc: run fastqc on untrimmed fastq's
+*/
+process fastqc {
+  tag "${repRID}"
+
+  input:
+    path (fastq) from fastqs_fastqc
+
+  output:
+    path ("*_fastqc.zip") into fastqc
+    path ("rawReads.csv") into inferMetadata_rawReads
+
+  script:
+    """
+    hostname > ${repRID}.fastqc.log
+    ulimit -a >> ${repRID}.fastqc.log
+
+    # run fastqc
+    echo -e "LOG: running fastqc on raw fastqs" >> ${repRID}.fastqc.log
+    fastqc *.fastq.gz -o .
+
+    # count raw reads
+    zcat *.R1.fastq.gz | echo \$((`wc -l`/4)) > rawReads.csv
+    """
+}
+
+// Extract number of raw reads metadata into channel
+rawReadsInfer = Channel.create()
+inferMetadata_rawReads.splitCsv(sep: ",", header: false).separate(
+  rawReadsInfer
+)
+
+/*
+ *dataQC: calculate transcript integrity numbers (TIN), bin them, and calculate the inner distance of PE replicates
+*/
+process dataQC {
+  tag "${repRID}"
+
+  input:
+    path script_tinHist
+    path ref from reference_dataQC
+    tuple path (bam), path (bai) from dedupBam_dataQC
+    tuple path (chrBam), path (chrBai) from dedupChrBam
+    val ends from endsInfer_dataQC
+
+  output:
+    path "${repRID}.tin.hist.tsv" into tinHist
+    path "${repRID}.tin.med.csv" into inferMetadata_tinMed
+    path "${repRID}.insertSize.inner_distance_freq.txt" into innerDistance
+
+  script:
+    """
+    hostname > ${repRID}.dataQC.log
+    ulimit -a >> ${repRID}.dataQC.log
+
+    # calculate TIN values per feature on each chromosome
+    echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > ${repRID}.sorted.deduped.tin.xls
+    for i in `cat ./bed/genome.bed | cut -f1 | sort | uniq`; do
+      echo "echo \"LOG: running tin.py on \${i}\" >> ${repRID}.dataQC.log; tin.py -i ${repRID}.sorted.deduped.\${i}.bam -r ./bed/genome.bed; cat ${repRID}.sorted.deduped.\${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \\\"\\\\t\${i}\\\\t\\\";";
+    done | parallel -j `nproc` -k 1>> ${repRID}.sorted.deduped.tin.xls
+
+    # bin TIN values
+    echo -e "LOG: binning TINs" >> ${repRID}.dataQC.log
+    python3 ${script_tinHist} -r ${repRID}
+    echo -e "LOG: binned" >> ${repRID}.dataQC.log
+
+    # calculate inner-distances for PE data
+    if [ "${ends}" == "pe" ]
+    then
+      echo -e "LOG: calculating inner distances for ${ends}" >> ${repRID}.dataQC.log
+      inner_distance.py -i "${bam}" -o ${repRID}.insertSize -r ./bed/genome.bed
+      echo -e "LOG: calculated" >> ${repRID}.dataQC.log
+    elif [ "${ends}" == "se" ]
+    then
+      echo -e "LOG: creating dummy inner distance file for ${ends}" >> ${repRID}.dataQC.log
+      touch ${repRID}.insertSize.inner_distance_freq.txt
+    fi
+    """
+}
+
+// Extract median TIN metadata into channel
+tinMedInfer = Channel.create()
+inferMetadata_tinMed.splitCsv(sep: ",", header: false).separate(
+  tinMedInfer
+)
+
+/*
+ *aggrQC: aggregate QC from processes as well as metadata and run MultiQC
+*/
+process aggrQC {
+  tag "${repRID}"
+  publishDir "${outDir}/report", mode: 'copy', pattern: "${repRID}.multiqc.html"
+  publishDir "${outDir}/qc", mode: 'copy', pattern: "${repRID}.multiqc_data.json"
+
+  input:
+    path multiqcConfig
+    path bicfLogo
+    path fastqc
+    path trimQC
+    path alignQC
+    path dedupQC
+    path countsQC
+    path innerDistance
+    path tinHist
+    path alignSampleQCs from alignSampleQC_aggrQC.collect()
+    path inferExperiment
+    val endsManual from endsManual_aggrQC
+    val endsM from endsMeta
+    val strandedM from strandedMeta
+    val spikeM from spikeMeta
+    val speciesM from speciesMeta
+    val endsI from endsInfer_aggrQC
+    val strandedI from strandedInfer_aggrQC
+    val spikeI from spikeInfer_aggrQC
+    val speciesI from speciesInfer_aggrQC
+    val readLengthM from readLengthMeta
+    val readLengthI from readLengthInfer
+    val rawReadsI from rawReadsInfer
+    val assignedReadsI from assignedReadsInfer
+    val tinMedI from tinMedInfer
+    val expRID
+    val studyRID
+
+  output:
+    path "${repRID}.multiqc.html" into multiqc
+    path "${repRID}.multiqc_data.json" into multiqcJSON
+
+  script:
+    """
+    hostname > ${repRID}.aggrQC.log
+    ulimit -a >> ${repRID}.aggrQC.log
+
+    # make RID table
+    echo -e "LOG: creating RID table" >> ${repRID}.aggrQC.log
+    echo -e "Replicate RID\tExperiment RID\tStudy RID" > rid.tsv
+    echo -e "${repRID}\t${expRID}\t${studyRID}" >> rid.tsv
+
+    # make metadata table
+    echo -e "LOG: creating metadata table" >> ${repRID}.aggrQC.log
+    echo -e "Source\tSpecies\tEnds\tStranded\tSpike-in\tRaw Reads\tAssigned Reads\tMedian Read Length\tMedian TIN" > metadata.tsv
+    echo -e "Submitter\t${speciesM}\t${endsM}\t${strandedM}\t${spikeM}\t-\t-\t'${readLengthM}'\t-" >> metadata.tsv
+    echo -e "Inferred\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv
+    echo -e "Measured\t-\t${endsManual}\t-\t-\t'${rawReadsI}'\t'${assignedReadsI}'\t'${readLengthI}'\t'${tinMedI}'" >> metadata.tsv
+
+    # remove inner distance report if it is empty (SE repRID)
+    echo -e "LOG: removing dummy inner distance file" >> ${repRID}.aggrQC.log
+    if [ "${endsI}" == "se" ]
+    then
+      rm -f ${innerDistance}
+    fi
+
+    # run MultiQC
+    echo -e "LOG: running multiqc" >> ${repRID}.aggrQC.log
+    multiqc -c ${multiqcConfig} . 
-n ${repRID}.multiqc.html + cp ${repRID}.multiqc_data/multiqc_data.json ${repRID}.multiqc_data.json + """ +} \ No newline at end of file diff --git a/workflow/scripts/bdbagFetch.sh b/workflow/scripts/bdbagFetch.sh index 28dab3f5338b3b6371b2b8f4ee7ac6bf2e715fa6..606b88397d5a6cf4feb4aa38d7615e3e3ba48735 100644 --- a/workflow/scripts/bdbagFetch.sh +++ b/workflow/scripts/bdbagFetch.sh @@ -1,3 +1,14 @@ -#!/bin +#!/bin/bash -bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz $1 \ No newline at end of file +if [ -z "${3}" ] +then + bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz ${1} + for i in $(find */ -name "*R*.fastq.gz") + do + path=${2}.$(echo ${i##*/} | grep -o "R[1,2].fastq.gz") + cp ${i} ./${path} + done +elif [ "${3}" == "TEST" ] +then + bdbag --resolve-fetch all --fetch-filter filename\$*.txt ${1} +fi diff --git a/workflow/scripts/calculateTPM.R b/workflow/scripts/calculateTPM.R new file mode 100644 index 0000000000000000000000000000000000000000..d4851d4806c5e12e06416a103f2e6972a8b4befe --- /dev/null +++ b/workflow/scripts/calculateTPM.R @@ -0,0 +1,30 @@ +gc() +library(optparse) + +option_list=list( + make_option("--count",action="store",type='character',help="Count File") +) +opt=parse_args(OptionParser(option_list=option_list)) +rm(option_list) + +if (!("count" %in% names(opt))){ + stop("No count file passed, exiting.") +} else if (!file.exists(opt$count)) { + stop("Count file doesn't exist, exiting.") +} + +repRID <- basename(gsub(".featureCounts","",opt$count)) + +count <- read.delim(opt$count, comment.char="#") # if featureCounts file changes structure, be sure to update count and Length columns below +colnames(count)[7] <- "count" + +rpk <- count$count/count$Length/1000 + +scale <- sum(rpk)/1000000 + +tpm <- rpk/scale + +output <- cbind(count,tpm) +colnames(output)[7] <- "count" + +write.table(output,file=paste0(repRID,".countTable.csv"),sep=",",row.names=FALSE,quote=FALSE) diff --git a/workflow/scripts/convertGeneSymbols.R b/workflow/scripts/convertGeneSymbols.R new file mode 100644 index 0000000000000000000000000000000000000000..7b05b92e5d251050a78e7f546aa8b042d994c623 --- /dev/null +++ b/workflow/scripts/convertGeneSymbols.R @@ -0,0 +1,24 @@ +gc() +library(optparse) + +option_list=list( + make_option("--repRID",action="store",type='character',help="Replicate RID") +) +opt=parse_args(OptionParser(option_list=option_list)) +rm(option_list) + +countTable <- read.csv(paste0(opt$repRID,".countData.countTable.csv"), stringsAsFactors=FALSE) +geneID <- read.delim("geneID.tsv", header=FALSE, stringsAsFactors=FALSE) +Entrez <- read.delim("Entrez.tsv", header=FALSE, stringsAsFactors=FALSE) + +convert <- data.frame(geneID=countTable$Geneid) +convert <- merge(x=convert,y=geneID[,1:2],by.x="geneID",by.y="V2",all.x=TRUE) +convert <- merge(x=convert,y=Entrez,by.x="V1",by.y="V1",all.x=TRUE) +convert[is.na(convert$V2),3] <- "" +convert <- convert[,-1] +colnames(convert) <- c("GeneID","EntrezID") +convert <- unique(convert) + +output <- merge(x=convert,y=countTable,by.x="GeneID",by.y="Geneid",all.x=TRUE) + +write.table(output,file=paste0(opt$repRID,".tpmTable.csv"),sep=",",row.names=FALSE,quote=FALSE) \ No newline at end of file diff --git a/workflow/scripts/inferMeta.sh b/workflow/scripts/inferMeta.sh new file mode 100644 index 0000000000000000000000000000000000000000..a8c9839ae2328cbed3dd39b9f2be427be12722ba --- /dev/null +++ b/workflow/scripts/inferMeta.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +if [ "${1}" == "endness" ] +then + awk '/Data/ {print}' "${2}" | sed -e 's/^This 
is //' -e 's/ Data$//'
+elif [ "${1}" == "fail" ]
+then
+  awk '/Fraction of reads failed/ {print}' "${2}" | sed -e 's/^Fraction of reads failed to determine: //'
+elif [ "${1}" == "sef" ]
+then
+  awk '/\+\+,--/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "++,--": //'
+elif [ "${1}" == "ser" ]
+then
+  awk '/\+-,-\+/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "+-,-+": //'
+elif [ "${1}" == "pef" ]
+then
+  awk '/1\+\+,1--,2\+-,2-\+/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "1++,1--,2+-,2-+": //'
+elif [ "${1}" == "per" ]
+then
+  awk '/1\+-,1-\+,2\+\+,2--/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "1+-,1-+,2++,2--": //'
+fi
\ No newline at end of file
diff --git a/workflow/scripts/modifyFetch.py b/workflow/scripts/modifyFetch.py
deleted file mode 100644
index 8a330e539054c8592363bd84bb4e6a0871b750f4..0000000000000000000000000000000000000000
--- a/workflow/scripts/modifyFetch.py
+++ /dev/null
@@ -1,17 +0,0 @@
-import argparse
-import pandas as pd
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('-f', '--fetchFile',help="The fetch file from bdgap.zip.",required=True)
-    args = parser.parse_args()
-    return args
-
-def main():
-    args = get_args()
-    fetch = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None)
-    fetch_filtered = fetch[fetch[2].str[-9:]==".fastq.gz"]
-    fetch_filtered.to_csv(args.fetchFile+"/fetch.txt",sep="\t",header=False,index=False)
-
-if __name__ == '__main__':
-    main()
\ No newline at end of file
diff --git a/workflow/scripts/parseMeta.py b/workflow/scripts/parseMeta.py
new file mode 100644
index 0000000000000000000000000000000000000000..500054264f7c8c50310da92d9995f432930318d3
--- /dev/null
+++ b/workflow/scripts/parseMeta.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+
+import argparse
+import pandas as pd
+import warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-r', '--repRID',help="The replicate RID.",required=True)
+    parser.add_argument('-m', '--metaFile',help="The metadata file to extract.",required=True)
+    parser.add_argument('-p', '--parameter',help="The parameter to extract.",required=True)
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = get_args()
+    metaFile = pd.read_csv(args.metaFile,sep=",",header=0)
+
+    # Check replicate RID metadata from 'File.csv'
+    if (args.parameter == "repRID"):
+        if (len(metaFile.Replicate_RID.unique()) > 1):
+            print("There are multiple replicate RID's in the metadata: " + " ".join(metaFile.Replicate_RID.unique()))
+            exit(1)
+        if not (metaFile.Replicate_RID.unique() == args.repRID):
+            print("Replicate RID in metadata does not match run parameters: " + metaFile.Replicate_RID.unique() + " vs " + args.repRID)
+            exit(1)
+        else:
+            rep=metaFile["Replicate_RID"].unique()[0]
+            print(rep)
+        if (len(metaFile[metaFile["File_Type"] == "FastQ"]) > 2):
+            print("There are more than 2 fastq's in the metadata: " + " ".join(metaFile[metaFile["File_Type"] == "FastQ"].RID))
+            exit(1)
+
+    # Check experiment RID metadata from 'Experiment.csv'
+    if (args.parameter == "expRID"):
+        if (len(metaFile.Experiment_RID.unique()) > 1):
+            print("There are multiple experiment RID's in the metadata: " + " ".join(metaFile.Experiment_RID.unique()))
+            exit(1)
+        else:
+            exp=metaFile["Experiment_RID"].unique()[0]
+            print(exp)
+
+    # Check study RID metadata from 'Experiment.csv'
+    if (args.parameter == "studyRID"):
+        if (len(metaFile.Study_RID.unique()) > 1):
+            print("There are multiple study RID's in the metadata: " + " ".join(metaFile.Study_RID.unique()))
+            exit(1)
+        else:
+            study=metaFile["Study_RID"].unique()[0]
+            print(study)
+
+    # Get endedness metadata from 'Experiment Settings.csv'
+    if (args.parameter == "endsMeta"):
+        if (metaFile.Paired_End.unique() == "Single End"):
+            endsMeta = "se"
+        elif (metaFile.Paired_End.unique() == "Paired End"):
+            endsMeta = "pe"
+        else:
+            endsMeta = "uk"
+        print(endsMeta)
+
+    # Manually determine endedness from the fastq count in 'File.csv'
+    if (args.parameter == "endsManual"):
+        if (len(metaFile[metaFile["File_Type"] == "FastQ"]) == 1):
+            endsManual = "se"
+        elif (len(metaFile[metaFile["File_Type"] == "FastQ"]) == 2):
+            endsManual = "pe"
+        print(endsManual)
+
+    # Get strandedness metadata from 'Experiment Settings.csv'
+    if (args.parameter == "stranded"):
+        if (metaFile.Has_Strand_Specific_Information.unique() == "yes"):
+            stranded = "stranded"
+        elif (metaFile.Has_Strand_Specific_Information.unique() == "no"):
+            stranded = "unstranded"
+        else:
+            print("Stranded metadata does not match expected options: " + metaFile.Has_Strand_Specific_Information.unique())
+            exit(1)
+        print(stranded)
+
+    # Get spike-in metadata from 'Experiment Settings.csv'
+    if (args.parameter == "spike"):
+        if (metaFile.Used_Spike_Ins.unique() == "yes"):
+            spike = "yes"
+        elif (metaFile.Used_Spike_Ins.unique() == "no"):
+            spike = "no"
+        else:
+            print("Spike-ins metadata does not match expected options: " + metaFile.Used_Spike_Ins.unique())
+            exit(1)
+        print(spike)
+
+    # Get species metadata from 'Experiment.csv'
+    if (args.parameter == "species"):
+        if (metaFile.Species.unique() == "Mus musculus"):
+            species = "Mus musculus"
+        elif (metaFile.Species.unique() == "Homo sapiens"):
+            species = "Homo sapiens"
+        else:
+            print("Species metadata does not match expected options: " + metaFile.Species.unique())
+            exit(1)
+        print(species)
+
+    # Get read length metadata from 'Experiment Settings.csv'
+    if (args.parameter == "readLength"):
+        readLength = metaFile.Read_Length.unique()
+        print(str(readLength).strip('[]'))
+
+if __name__ == '__main__':
+    main()
diff --git a/workflow/scripts/renameFastq.sh b/workflow/scripts/renameFastq.sh
deleted file mode 100644
index f5593766b3a3bd645c3f2c8758d3a20fd354c9be..0000000000000000000000000000000000000000
--- a/workflow/scripts/renameFastq.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin
-
-while read loc checksum fileLocation
-do
-    file=$(echo ${fileLocation##*/})
-    fileName=$(echo ${file%.R*.fastq.gz})
-    fileExt=$(echo ${file##${fileName}.})
-    while IFS="," read RID Study_RID Experiment_RID Replicate_RID Caption File_Type File_Name URI File_size MD5 GEO_Archival_URL dbGaP_Accession_ID Processed Notes Principal_Investigator Consortium Release_Date RCT RMT Legacy_File_RID GUDMAP_NGF_OID GUDMAP_NGS_OID
-    do
-        if [ ${file} == ${File_Name} ]
-        then
-            find . -type f -name ${file} -execdir mv {} ${Replicate_RID}.${fileExt} ';'
-        fi
-    done < $1/data/File.csv
-done < $1/fetch.txt
\ No newline at end of file
diff --git a/workflow/scripts/splitStudy.py b/workflow/scripts/splitStudy.py
new file mode 100644
index 0000000000000000000000000000000000000000..82ffc2881857dd5d1d27eee5ea6a381b02d0e9f5
--- /dev/null
+++ b/workflow/scripts/splitStudy.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python3
+
+import argparse
+import pandas as pd
+import warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-s', '--studyRID',help="The study RID.",required=True)
+    args = parser.parse_args()
+    return args
+
+def main():
+    args = get_args()
+    studyRID=pd.read_json(args.studyRID+"_studyRID.json")
+    if studyRID["RID"].count() > 0:
+        studyRID["RID"].to_csv(args.studyRID+"_studyRID.csv",header=False,index=False)
+    else:
+        raise Exception("No associated replicates found: %s" %
+            studyRID)
+
+if __name__ == '__main__':
+    main()
diff --git a/workflow/scripts/splitStudy.sh b/workflow/scripts/splitStudy.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a64b6d9e4cde818d1c6f91fd84144b821febc536
--- /dev/null
+++ b/workflow/scripts/splitStudy.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# query GUDMAP/RBK for study RID
+echo "curl --location --request GET 'https://www.gudmap.org/ermrest/catalog/2/entity/RNASeq:Replicate/Study_RID="${1}"'" | bash > $1_studyRID.json
+
+# extract replicate RIDs
+module load python/3.6.4-anaconda
+python3 ./workflow/scripts/splitStudy.py -s $1
+
+# run pipeline on replicate RIDs in parallel
+module load nextflow/20.01.0
+module load singularity/3.5.3
+while read repRID; do echo ${repRID}; done < "$1_studyRID.csv" | xargs -P 5 -I {} nextflow run workflow/rna-seq.nf --repRID {}
+
+# cleanup study RID files
+rm $1_studyRID.json
+rm $1_studyRID.csv
diff --git a/workflow/scripts/tinHist.py b/workflow/scripts/tinHist.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d292c2eb8cadb3b16466c6b19d0574184d439d7
--- /dev/null
+++ b/workflow/scripts/tinHist.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+
+import argparse
+import pandas as pd
+import numpy as np
+import warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-r', '--repRID',help="The replicate RID.",required=True)
+    args = parser.parse_args()
+    return args
+
+def main():
+    args = get_args()
+    tin = pd.read_csv(args.repRID + '.sorted.deduped.tin.xls',sep="\t",header=0)
+
+    hist = pd.cut(tin['TIN'],bins=pd.interval_range(start=0,freq=10,end=100,closed='right')).value_counts(sort=False)
+    labels = ["{0} - {1}".format(i, i + 9) for i in range(1, 100, 10)]
+    #labels[0] = '0 - 10'
+    binned = tin.assign(Bins=lambda x: pd.cut(tin['TIN'],range(0,105,10),labels=labels,include_lowest=False,right=True))
+    # zero-pad single-digit chromosome names so columns sort naturally
+    binned['chrom'] = binned['chrom'].replace('chr1','chr01')
+    binned['chrom'] = binned['chrom'].replace('chr2','chr02')
+    binned['chrom'] = binned['chrom'].replace('chr3','chr03')
+    binned['chrom'] = binned['chrom'].replace('chr4','chr04')
+    binned['chrom'] = binned['chrom'].replace('chr5','chr05')
+    binned['chrom'] = binned['chrom'].replace('chr6','chr06')
+    binned['chrom'] = binned['chrom'].replace('chr7','chr07')
+    binned['chrom'] = binned['chrom'].replace('chr8','chr08')
+    binned['chrom'] = binned['chrom'].replace('chr9','chr09')
+    hist = pd.pivot_table(binned, 
values='geneID', index = 'Bins', columns = 'chrom', aggfunc=np.size) + hist['TOTAL'] = hist.sum(axis=1) + hist = hist[['TOTAL'] + [ i for i in hist.columns if i != 'TOTAL']] + hist = hist.T.fillna(0.0).astype(int) + #hist = hist.apply(lambda x: x/x.sum()*100, axis=1) + hist.to_csv(args.repRID + '.tin.hist.tsv',sep='\t') + medFile = open(args.repRID + '.tin.med.csv',"w") + medFile.write(str(round(tin['TIN'][(tin['TIN']!=0)].median(),2))) + medFile.close() + +if __name__ == '__main__': + main() diff --git a/workflow/scripts/utils.py b/workflow/scripts/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5e4478e3b1cef96e9b4c05033f75122f87aad06e --- /dev/null +++ b/workflow/scripts/utils.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 + +# +# * -------------------------------------------------------------------------- +# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md) +# * -------------------------------------------------------------------------- +# + +'''General utilities.''' + + +import shlex +import logging +import subprocess +import sys +import os + + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = True + + +def run_pipe(steps, outfile=None): + from subprocess import Popen, PIPE + p = None + p_next = None + first_step_n = 1 + last_step_n = len(steps) + for n, step in enumerate(steps, start=first_step_n): + logger.debug("step %d: %s" % (n, step)) + if n == first_step_n: + if n == last_step_n and outfile: # one-step pipeline with outfile + with open(outfile, 'w') as fh: + print("one step shlex: %s to file: %s" % (shlex.split(step), outfile)) + p = Popen(shlex.split(step), stdout=fh) + break + print("first step shlex to stdout: %s" % (shlex.split(step))) + + p = Popen(shlex.split(step), stdout=PIPE) + elif n == last_step_n and outfile: # only treat the last step specially if you're sending stdout to a file + with open(outfile, 'w') as fh: + print("last step shlex: %s to file: %s" % (shlex.split(step), outfile)) + p_last = Popen(shlex.split(step), stdin=p.stdout, stdout=fh) + p.stdout.close() + p = p_last + else: # handles intermediate steps and, in the case of a pipe to stdout, the last step + print("intermediate step %d shlex to stdout: %s" % (n, shlex.split(step))) + p_next = Popen(shlex.split(step), stdin=p.stdout, stdout=PIPE) + p.stdout.close() + p = p_next + out, err = p.communicate() + return out, err + + +def block_on(command): + process = subprocess.Popen(shlex.split(command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE) + for line in iter(process.stdout.readline, b''): + sys.stdout.write(line.decode('utf-8')) + process.communicate() + return process.returncode + + +def strip_extensions(filename, extensions): + '''Strips extensions to get basename of file.''' + + basename = filename + for extension in extensions: + basename = basename.rpartition(extension)[0] or basename + + return basename + + +def count_lines(filename): + import mimetypes + compressed_mimetypes = [ + "compress", + "bzip2", + "gzip" + ] + mime_type = mimetypes.guess_type(filename)[1] + if mime_type in compressed_mimetypes: + catcommand = 'gzip -dc' + else: + catcommand = 'cat' + out, err = run_pipe([ + '%s %s' % (catcommand, filename), + 'wc -l' + ]) + return int(out) + + +def rescale_scores(filename, scores_col, new_min=10, new_max=1000): + sorted_fn = 'sorted-%s' % (filename) + rescaled_fn = 'rescaled-%s' % (filename) + + out, err = run_pipe([ + 'sort -k %dgr,%dgr %s' % (scores_col, 
scores_col, filename), + r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (NF != 0) print $0}'"""], + sorted_fn) + + out, err = run_pipe([ + 'head -n 1 %s' % (sorted_fn), + 'cut -f %s' % (scores_col)]) + max_score = float(out.strip()) + logger.info("rescale_scores: max_score = %s", max_score) + + out, err = run_pipe([ + 'tail -n 1 %s' % (sorted_fn), + 'cut -f %s' % (scores_col)]) + min_score = float(out.strip()) + logger.info("rescale_scores: min_score = %s", min_score) + + a = min_score + b = max_score + x = new_min + y = new_max + + if min_score == max_score: # give all peaks new_min + rescale_formula = "x" + else: # n is the unscaled score from scores_col + rescale_formula = "((n-a)*(y-x)/(b-a))+x" + + out, err = run_pipe( + [ + 'cat %s' % (sorted_fn), + r"""awk 'BEGIN{OFS="\t"}{n=$%d;a=%d;b=%d;x=%d;y=%d}""" + % (scores_col, a, b, x, y) + + r"""{$%d=int(%s) ; print $0}'""" + % (scores_col, rescale_formula) + ], + rescaled_fn) + + os.remove(sorted_fn) + + return rescaled_fn diff --git a/workflow/tests/test_alignReads.py b/workflow/tests/test_alignReads.py new file mode 100644 index 0000000000000000000000000000000000000000..eae8780f5641404ac9caabe32089369a3b0b1ea2 --- /dev/null +++ b/workflow/tests/test_alignReads.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 + +import pytest +import pandas as pd +import os +import utils + +data_output_path = os.path.dirname(os.path.abspath(__file__)) + \ + '/../../' + + +@pytest.mark.alignData +def test_alignData_se(): + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.unal.gz')) + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.bam')) + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.bam.bai')) + + +@pytest.mark.alignData +def test_alignData_pe(): + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.pe.unal.gz')) + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.pe.sorted.bam')) + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.pe.sorted.bam.bai')) diff --git a/workflow/tests/test_consistency.py b/workflow/tests/test_consistency.py new file mode 100644 index 0000000000000000000000000000000000000000..073b12826b798ac94d16fda4291dfba2c1a42203 --- /dev/null +++ b/workflow/tests/test_consistency.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 + +import pytest +import pandas as pd +from io import StringIO +import os + +test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ + '/../../' + +@pytest.mark.consistencySE +def test_consistencySE(): + assert os.path.exists(os.path.join(test_output_path, 'SE_multiqc_data.json')) + assert readAssigned("assignedSE.txt","assignedExpectSE.txt") + +@pytest.mark.consistencyPE +def test_consistencyPE(): + assert os.path.exists(os.path.join(test_output_path, 'PE_multiqc_data.json')) + assert readAssigned("assignedPE.txt","assignedExpectPE.txt") + +def readAssigned(fileAssigned,fileExpectAssigned): + data = False + assigned = open(fileAssigned, "r") + expect = open(fileExpectAssigned, "r") + lineAssigned = assigned.readline() + lineExpect = expect.readline() + if int(lineAssigned.strip()) < (int(lineExpect.strip())+(int(lineExpect.strip())*0.00001)) and int(lineAssigned.strip()) > (int(lineExpect.strip())-(int(lineExpect.strip())*0.00001)): + data = True + + return data diff --git a/workflow/tests/test_dataQC.py b/workflow/tests/test_dataQC.py new file mode 100644 index 0000000000000000000000000000000000000000..e77d4680fd8eac61c3a9b9a8fd175136a61244b9 --- /dev/null +++ b/workflow/tests/test_dataQC.py @@ -0,0 +1,23 @@ 
diff --git a/workflow/tests/test_dataQC.py b/workflow/tests/test_dataQC.py
new file mode 100644
index 0000000000000000000000000000000000000000..e77d4680fd8eac61c3a9b9a8fd175136a61244b9
--- /dev/null
+++ b/workflow/tests/test_dataQC.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+@pytest.mark.dataQC
+def test_dataQC():
+    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5F6_1M.se.sorted.deduped.tin.xls'))
+    assert countLines(os.path.join(test_output_path, 'Q-Y5F6_1M.se.sorted.deduped.tin.xls'))
+
+def countLines(fileName):
+    # The TIN table should carry data rows past its header: line 8 must exist
+    # and must not be a repeated 'geneID' header line.
+    with open(fileName, "r") as file:
+        lines = file.readlines()
+
+    return len(lines) > 7 and not lines[7].startswith("geneID")
diff --git a/workflow/tests/test_dedupReads.py b/workflow/tests/test_dedupReads.py
new file mode 100644
index 0000000000000000000000000000000000000000..49cf420e2f0e2de923ae580c51dcdf42549f0713
--- /dev/null
+++ b/workflow/tests/test_dedupReads.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+import os
+import utils
+
+data_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+
+@pytest.mark.dedupData
+def test_dedupData():
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.bam'))
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.bam.bai'))
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr8.bam'))
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr8.bam.bai'))
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr4.bam'))
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr4.bam.bai'))
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chrY.bam'))
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chrY.bam.bai'))
diff --git a/workflow/tests/test_downsampleData.py b/workflow/tests/test_downsampleData.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd42c49e169e387dd9662903b20866c40aec8907
--- /dev/null
+++ b/workflow/tests/test_downsampleData.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+@pytest.mark.downsampleData
+def test_downsampleData():
+    assert os.path.exists(os.path.join(test_output_path, 'sampled.1.fq'))
\ No newline at end of file
diff --git a/workflow/tests/test_fastqc.py b/workflow/tests/test_fastqc.py
new file mode 100644
index 0000000000000000000000000000000000000000..89303fe78e081a26fd6a8ea633997dffb8f920a6
--- /dev/null
+++ b/workflow/tests/test_fastqc.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+@pytest.mark.fastqc
+def test_fastqc():
+    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5F6_1M.R1_fastqc.zip'))
diff --git a/workflow/tests/test_getBag.py b/workflow/tests/test_getBag.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c63c9d95ac33aceeaa965852ede7bfc5e86bdc7
--- /dev/null
+++ b/workflow/tests/test_getBag.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+@pytest.mark.getBag
+def test_getBag():
+    assert os.path.exists(os.path.join(test_output_path, 'Replicate_Q-Y5F6.zip'))
diff --git a/workflow/tests/test_getData.py b/workflow/tests/test_getData.py
new file mode 100644
index 0000000000000000000000000000000000000000..a14be93ea8103aadf44aca3156ee28036cb6113e
--- /dev/null
+++ b/workflow/tests/test_getData.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+@pytest.mark.getData
+def test_getData():
+    assert os.path.exists(os.path.join(test_output_path, 'Replicate_Q-Y5F6/bagit.txt'))
+    assert os.path.exists(os.path.join(test_output_path, 'Replicate_Q-Y5F6/data/assets/Study/Q-Y4GY/Experiment/Q-Y4DP/Replicate/Q-Y5F6/mMARIS_Six2-#3.gene.rpkm.txt'))
diff --git a/workflow/tests/test_inferMetadata.py b/workflow/tests/test_inferMetadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..518664ced5ab4dd4e713dede9c58b9d87594799a
--- /dev/null
+++ b/workflow/tests/test_inferMetadata.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+@pytest.mark.inferMetadata
+def test_inferMetadata():
+    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5F6_1M.se.inferMetadata.log'))
+
diff --git a/workflow/tests/test_makeBigWig.py b/workflow/tests/test_makeBigWig.py
new file mode 100644
index 0000000000000000000000000000000000000000..9292ac64714017fde8371927cee616374a457b3a
--- /dev/null
+++ b/workflow/tests/test_makeBigWig.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+import os
+import utils
+
+data_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+
+@pytest.mark.makeBigWig
+def test_makeBigWig():
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.bw'))
diff --git a/workflow/tests/test_makeFeatureCounts.py b/workflow/tests/test_makeFeatureCounts.py
new file mode 100644
index 0000000000000000000000000000000000000000..d741a2f4ac0f33fe15af8f13300c95002a00a887
--- /dev/null
+++ b/workflow/tests/test_makeFeatureCounts.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+import os
+import utils
+
+data_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+
+@pytest.mark.makeFeatureCounts
+def test_makeFeatureCounts():
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.featureCounts'))
+    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.countTable.csv'))
diff --git a/workflow/tests/test_parseMetadata.py b/workflow/tests/test_parseMetadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..59677bbba7d40058bdeb78ccceeeeddba4565a14
--- /dev/null
+++ b/workflow/tests/test_parseMetadata.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+@pytest.mark.parseMetadata
+def test_parseMetadata():
+    assert os.path.exists(os.path.join(test_output_path, 'design.csv'))
+    assert readLine(os.path.join(test_output_path, 'design.csv'))
+
+def readLine(fileName):
+    # The design file's first line must match the expected metadata exactly.
+    with open(fileName, "r") as file:
+        line = file.readline()
+
+    return line.strip() == "uk,se,unstranded,no,Homo sapiens,75,Experiment_RID,Study_RID,Replicate_RID"
diff --git a/workflow/tests/test_trimData.py b/workflow/tests/test_trimData.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba0eeda481262647abc9e4a8bf362515ac0dc0e7
--- /dev/null
+++ b/workflow/tests/test_trimData.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+@pytest.mark.trimData
+def test_trimData_se():
+    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5F6_1M.se_trimmed.fq.gz'))
+
+
+@pytest.mark.trimData
+def test_trimData_pe():
+    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5F6_1M.pe_R1_val_1.fq.gz'))
+    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5F6_1M.pe_R2_val_2.fq.gz'))
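Finally, a minimal sketch of driving one of these marked test groups programmatically; the path is assumed relative to the repository root, and the command-line equivalent is "pytest -v -m trimData workflow/tests/test_trimData.py".

#!/usr/bin/env python3
# Sketch only: select tests by their pytest mark.
import sys
import pytest

# '-m trimData' collects only tests decorated with @pytest.mark.trimData.
sys.exit(pytest.main(['-v', '-m', 'trimData', 'workflow/tests/test_trimData.py']))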