diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 344138f7b7822f578f1efba09709c1c355f59709..bf7355aa6e19f1c29ab020d8524ff204ffc9e7cc 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -63,8 +63,8 @@ getBag:
     - schedules
   script:
     - ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json
-    - singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-download-cli --version > version_deriva.txt
-    - singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-download-cli staging.gudmap.org --catalog 2 ./workflow/conf/Replicate_For_Input_Bag.json . rid=Q-Y5F6
+    - singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-download-cli --version > version_deriva.txt
+    - singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-download-cli staging.gudmap.org --catalog 2 ./workflow/conf/Replicate_For_Input_Bag.json . rid=Q-Y5F6
     - pytest -m getBag
   artifacts:
     name: "$CI_JOB_NAME"
@@ -82,10 +82,10 @@ getData:
     - merge_requests
     - schedules
   script:
-    - singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' bdbag --version > version_bdbag.txt
+    - singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' bdbag --version > version_bdbag.txt
     - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt
     - unzip ./test_data/bag/Q-Y5F6_inputBag_xxxxtest.zip
-    - singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' bash ./workflow/scripts/bdbag_fetch.sh Q-Y5F6_inputBag Q-Y5F6
+    - singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' bash ./workflow/scripts/bdbag_fetch.sh Q-Y5F6_inputBag Q-Y5F6
     - pytest -m getData
   artifacts:
     name: "$CI_JOB_NAME"
@@ -331,12 +331,12 @@ uploadInputBag:
     - >
      md5=$(md5sum ./test.txt | awk '{ print $1 }') &&
      size=$(wc -c < ./test.txt) &&
-     exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Input_Bag/File_MD5=${md5}) &&
+     exist=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Input_Bag/File_MD5=${md5}) &&
      if [ "${exist}" == "[]" ]; then
      cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') &&
      cookie=${cookie:11:-1} &&
-     loc=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host staging.gudmap.org put ./test.txt /hatrac/resources/rnaseq/pipeline/input_bag/TEST/test.txt --parents) &&
-     rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/upload_input_bag.py -f test.txt -l ${loc} -s ${md5} -b ${size} -n 'This is a test input bag' -o staging.gudmap.org -c ${cookie}) &&
+     loc=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-hatrac-cli --host staging.gudmap.org put ./test.txt /hatrac/resources/rnaseq/pipeline/input_bag/TEST/test.txt --parents) &&
+     rid=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' python3 ./workflow/scripts/upload_input_bag.py -f test.txt -l ${loc} -s ${md5} -b ${size} -n 'This is a test input bag' -o staging.gudmap.org -c ${cookie}) &&
      echo ${rid} test input bag created
      else
      rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT') &&
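Note on the cookie handling in the hunk above: the job recovers the auth token from credential.json with grep plus fixed-offset bash slicing (`cookie=${cookie:11:-1}`), which silently breaks if key order or spacing in the file changes. A minimal sketch of a sturdier extraction, assuming credential.json maps hostnames to objects carrying a `cookie` field (the layout deriva-auth writes):

```python
#!/usr/bin/env python3
# Sketch: read the DERIVA auth cookie from credential.json as JSON
# instead of grep/substring slicing; assumes the deriva-auth layout
# {"staging.gudmap.org": {"cookie": "webauthn=..."}, ...}.
import json

def get_cookie(path="credential.json", host="staging.gudmap.org"):
    with open(path) as f:
        credentials = json.load(f)
    return credentials[host]["cookie"]

if __name__ == "__main__":
    print(get_cookie())
```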
@@ -355,16 +355,16 @@ uploadExecutionRun:
   script:
     - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json
     - >
-     exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Execution_Run/Replicate=17-BTFJ) &&
+     exist=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Execution_Run/Replicate=17-BTFJ) &&
      cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') &&
      cookie=${cookie:11:-1} &&
      if [ "${exist}" == "[]" ]; then
-     rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/upload_execution_run.py -r 17-BTFJ -w 17-BV2Y -g 17-BV90 -i 17-BTFT -s Success -d 'This is a test execution run' -o staging.gudmap.org -c ${cookie} -u F) &&
+     rid=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' python3 ./workflow/scripts/upload_execution_run.py -r 17-BTFJ -w 17-BV2Y -g 17-BV90 -i 17-BTFT -s Success -d 'This is a test execution run' -o staging.gudmap.org -c ${cookie} -u F) &&
      echo ${rid} test execution run created
      else
      rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT') &&
      rid=${rid:7:-6} &&
-     rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/upload_execution_run.py -r 17-BTFJ -w 17-BV2Y -g 17-BV90 -i 17-BTFT -s Success -d 'This is a test execution run' -o staging.gudmap.org -c ${cookie} -u ${rid}) &&
+     rid=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' python3 ./workflow/scripts/upload_execution_run.py -r 17-BTFJ -w 17-BV2Y -g 17-BV90 -i 17-BTFT -s Success -d 'This is a test execution run' -o staging.gudmap.org -c ${cookie} -u ${rid}) &&
      echo ${rid} test execution run already exists
      fi
@@ -379,17 +379,17 @@ uploadQC:
   script:
     - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json
     - >
-     exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:mRNA_QC/Replicate=17-BTFJ) &&
+     exist=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:mRNA_QC/Replicate=17-BTFJ) &&
      cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') &&
      cookie=${cookie:11:-1} &&
      if [ "${exist}" != "[]" ]; then
      rids=$(echo ${exist} | grep -o '\"RID\":\".\{7\}' | sed 's/^.\{7\}//') &&
      for rid in ${rids}; do
-     singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/delete_entry.py -r ${rid} -t mRNA_QC -o staging.gudmap.org -c ${cookie}
+     singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' python3 ./workflow/scripts/delete_entry.py -r ${rid} -t mRNA_QC -o staging.gudmap.org -c ${cookie}
      done
      echo all old mRNA QC RIDs deleted
      fi
-     rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/upload_qc.py -r 17-BTFJ -e 17-BVDJ -p "Single Read" -s forward -l 35 -w 5 -f 1 -n "This is a test mRNA QC" -o staging.gudmap.org -c ${cookie} -u F)
+     rid=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' python3 ./workflow/scripts/upload_qc.py -r 17-BTFJ -e 17-BVDJ -p "Single Read" -s forward -l 35 -w 5 -f 1 -t 1 -n "This is a test mRNA QC" -o staging.gudmap.org -c ${cookie} -u F)
      echo ${rid} test mRNA QC created

 uploadProcessedFile:
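The else-branches above scrape RIDs out of the ermrest JSON response with grep and fixed-offset slicing (`rid=${rid:7:-6}`). A sketch of the same lookup done as JSON, assuming the entity endpoint returns an array of row objects each carrying a `RID` field (the URL is the one used in the hunk):

```python
# Sketch: fetch existing Execution_Run rows for a replicate and pull
# their RIDs from parsed JSON rather than regex-scraping the text.
import requests

url = ("https://staging.gudmap.org/ermrest/catalog/2/entity/"
       "RNASeq:Execution_Run/Replicate=17-BTFJ")
rows = requests.get(url, timeout=30).json()
rids = [row["RID"] for row in rows]
print(rids if rids else "no existing execution run")
```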
'\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') && cookie=${cookie:11:-1} && if [ "${exist}" != "[]" ]; then rids=$(echo ${exist} | grep -o '\"RID\":\".\{7\}' | sed 's/^.\{7\}//') && for rid in ${rids}; do - singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/delete_entry.py -r ${rid} -t Processed_File -o staging.gudmap.org -c ${cookie} + singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' python3 ./workflow/scripts/delete_entry.py -r ${rid} -t Processed_File -o staging.gudmap.org -c ${cookie} done echo all old processed file RIDs deleted fi - singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-upload-cli --catalog 2 --token ${cookie:9} staging.gudmap.org ./deriva + singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-upload-cli --catalog 2 --token ${cookie:9} staging.gudmap.org ./deriva echo test processed file uploaded - mkdir test - - singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' bdbag test --archiver zip + - singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' bdbag test --archiver zip - echo test output bag created - pytest -m outputBag @@ -437,12 +437,12 @@ uploadOutputBag: - > md5=$(md5sum ./test.txt | awk '{ print $1 }') && size=$(wc -c < ./test.txt) && - exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Output_Bag/File_MD5=${md5}) && + exist=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Output_Bag/File_MD5=${md5}) && if [ "${exist}" == "[]" ]; then cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') && cookie=${cookie:11:-1} && - loc=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host staging.gudmap.org put ./test.txt /hatrac/resources/rnaseq/pipeline/output_bag/TEST/test.txt --parents) && - rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/upload_output_bag.py -e 17-BVDJ -f test.txt -l ${loc} -s ${md5} -b ${size} -n 'This is a test output bag' -o staging.gudmap.org -c ${cookie}) && + loc=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-hatrac-cli --host staging.gudmap.org put ./test.txt /hatrac/resources/rnaseq/pipeline/output_bag/TEST/test.txt --parents) && + rid=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' python3 ./workflow/scripts/upload_output_bag.py -e 17-BVDJ -f test.txt -l ${loc} -s ${md5} -b ${size} -n 'This is a test output bag' -o staging.gudmap.org -c ${cookie}) && echo ${rid} test output bag created else rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT') && @@ -518,7 +518,7 @@ human_dev: - loc=$(dirname ${refURL}) - if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') - - test=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/) + - test=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/) - test=$(echo ${test} | grep -o ${filename}) - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi @@ -544,7 +544,7 @@ mouse_dev: - loc=$(dirname ${refURL}) - if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') - - test=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host 
@@ -544,7 +544,7 @@ mouse_dev:
     - loc=$(dirname ${refURL})
     - if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi
     - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)')
-    - test=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/)
+    - test=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/)
     - test=$(echo ${test} | grep -o ${filename})
     - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi
@@ -570,7 +570,7 @@ human_staging:
     - loc=$(dirname ${refURL})
     - if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi
     - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)')
-    - test=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/)
+    - test=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/)
     - test=$(echo ${test} | grep -o ${filename})
     - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi
@@ -597,7 +597,7 @@ mouse_staging:
     - loc=$(dirname ${refURL})
     - if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi
     - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)')
-    - test=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/)
+    - test=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/)
     - test=$(echo ${test} | grep -o ${filename})
     - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi
@@ -623,7 +623,7 @@ human_prod:
     - loc=$(dirname ${refURL})
     - if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi
     - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)')
-    - test=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/)
+    - test=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/)
     - test=$(echo ${test} | grep -o ${filename})
     - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi
@@ -650,7 +650,7 @@ mouse_prod:
     - loc=$(dirname ${refURL})
     - if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi
     - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)')
-    - test=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/)
+    - test=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/)
     - test=$(echo ${test} | grep -o ${filename})
     - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi
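Every hunk in this file makes the same mechanical deriva1.3 → deriva1.4 edit, so a missed occurrence of the old tag is the main risk of a bump like this. A quick sketch for auditing the tree for stragglers (the tag string and file extensions are assumptions for illustration):

```python
# Sketch: list files that still reference the retired container tag.
import pathlib

stale = "gudmaprbk/deriva1.3"
hits = [str(p) for p in pathlib.Path(".").rglob("*")
        if p.is_file() and p.suffix in {".yml", ".yaml", ".config", ".nf", ".sh"}
        and stale in p.read_text(errors="ignore")]
print("\n".join(hits) if hits else "no stale container tags")
```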
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6c77f50591fc2503b8598505a2a88783eb75fca9..1bbd3c07775d72a1028eb875b9b81b2bceadeb46 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,20 @@
-# v0.1.0 (in development)
+# v1.0.0 (in development)
+**User Facing**
+* Add link to reference builder script
+* Output median TIN to mRNA_QC table
+
+**Background**
+* Change consistency test to check if within +/- 5% of standard
+* Change tool version checker for badges to use latest tag
+* Utilize pipeline tracking and QC AWS tables
+
+*Known Bugs*
+* Override params (inputBag, fastq, species) aren't checked for integrity
+* Authentication files and tokens must remain active (auth client kept open) for the duration of the pipeline run (until long-lived token support is added)
+
+<hr>
+
+# v0.1.0
 **User Facing**
 * Add option to pull references from datahub
 * Add option to send email on workflow error, with pipeline error message
@@ -32,7 +48,6 @@
 * Make inputBag export config to create inputBag with only small txt file for CI unit test of getData (and update test)

 *Known Bugs*
-* Datahub reference pull uses dev.gudmap.org as source until referencencs are placed on production
 * Override params (inputBag, fastq, species) aren't checked for integrity

 <hr>
diff --git a/README.md b/README.md
index 6f6581c9647eb5bdac8f0fe660ae1bd8308a7855..4639e51ee03b52d261274d95a9a3c72a7fd9a434 100644
--- a/README.md
+++ b/README.md
@@ -4,9 +4,10 @@
 |[](https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/tree/master)|[](https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/tree/develop)|
 |[](https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/tree/master)|[](https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/tree/develop)|

-<!--
-[![DOI]()]()
--->
+
+[](https://doi.org/10.5281/zenodo.4429316)
+

 RNA-Seq Analytic Pipeline for GUDMAP/RBK
 ========================================
@@ -14,16 +15,9 @@
 Introduction
 ------------
 This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub. It is designed to run on the HPC cluster ([BioHPC](https://portal.biohpc.swmed.edu)) at UT Southwestern Medical Center (in conjunction with the standard nextflow profile: config `biohpc.config`)

-Cloud Compatibility:
---------------------
-This pipeline is also capable of being run on AWS. To do so:
-* Build a AWS batch queue and environment either manually or with [aws-cloudformantion](https://console.aws.amazon.com/cloudformation/home?#/stacks/new?stackName=Nextflow&templateURL=https://s3.amazonaws.com/aws-genomics-workflows/templates/nextflow/nextflow-aio.template.yaml)
-* Edit one of the aws configs in workflow/config/
-  * Replace workDir with the S3 bucket generated
-  * Change region if different
-  * Change queue to the aws batch queue generated
-* The user must have awscli configured with an appropriate authentication (with `aws configure` and access keys) in the environment which nextflow will be run
-* Add `-profile` with the name aws config which was customized
+Authentication:
+----------------
+The consortium server being used must be authenticated with the [deriva authentication client](https://github.com/informatics-isi-edu/gudmap-rbk/wiki/) and must remain authenticated until the end of the pipeline run. Prematurely closing the client invalidates the tokens and may cause the pipeline to fail. The use of long-lived "globus" tokens is on the roadmap.

 To Run:
 -------
@@ -49,7 +43,7 @@ To Run:
   * **aws_spot** = AWS Batch spot instance requests
 * `--email` email address(es) to send failure notification (comma separated) ***(optional)***:
   * e.g: `--email 'Venkat.Malladi@utsouthwestern.edu,Gervaise.Henry@UTSouthwestern.edu'`
-
+
 * NOTES:
   * once deriva-auth is run and authenticated, the two files above are saved in ```~/.deriva/``` (see official documents from [deriva](https://github.com/informatics-isi-edu/deriva-client#installer-packages-for-windows-and-macosx) on the lifetime of the credentials)
   * reference version consists of Genome Reference Consortium version, patch release and GENCODE annotation release # (leaving the params blank will use the default version tied to the pipeline version)
@@ -75,12 +69,22 @@ FULL EXAMPLE:
 ```
 nextflow run workflow/rna-seq.nf --repRID Q-Y5JA --source production --deriva ./data/credential.json --bdbag ./data/cookies.txt --dev false --upload true -profile biohpc
 ```

-To run a set of replicates from study RID:
+Cloud Compatibility:
+--------------------
+This pipeline is also capable of being run on AWS. To do so:
+* Build an AWS Batch queue and environment, either manually or with [aws-cloudformation](https://console.aws.amazon.com/cloudformation/home?#/stacks/new?stackName=Nextflow&templateURL=https://s3.amazonaws.com/aws-genomics-workflows/templates/nextflow/nextflow-aio.template.yaml)
+* Edit one of the aws configs in workflow/config/
+  * Replace workDir with the S3 bucket generated
+  * Change region if different
+  * Change queue to the aws batch queue generated
+* The user must have awscli configured with appropriate authentication (with `aws configure` and access keys) in the environment in which nextflow will be run
+* Add `-profile` with the name of the customized aws config
+
+To generate your own references or new references:
 ------------------------------------------
-Run in repo root dir:
-* `sh workflow/scripts/splitStudy.sh [studyRID]`
-It will run in parallel in batches of 5 replicatesRID with 30 second delays between launches.\
-NOTE: Nextflow "local" processes for all replicates will run on the node/machine the bash script is launched from... consider running the study script on the BioHPC's SLURM cluster (use `sbatch`).
+Download the [reference creation script](https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/snippets/31).
+This script automatically creates human and mouse references from GENCODE. It can also create ERCC92 spike-in references and concatenate them to the GENCODE references, and it can build references from manually downloaded FASTA and GTF files.
+
 Errors:
 -------
@@ -95,7 +99,7 @@ Errors:
 |**Submitted metadata does not match inferred**|All required metadata for analysis of the data is internally inferred by the pipeline, if any of those do not match the submitted metadata, this error is detected to notify of a potential error.|

 <hr>
-[**CHANGELOG**](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/blob/develop/CHANGELOG.md)
+[**CHANGELOG**](CHANGELOG.md)
 <hr>

 Credits
@@ -129,7 +133,7 @@ UT Southwestern Medical Center\
 [johnathan.gesell@utsouthwestern.edu](mailto:jonathn.gesell@utsouthwestern.edu)

 Jeremy A. Mathews\
-*Computational Intern*\
+*Computational Biologist*\
 Bioinformatics Core Facility\
 UT Southwestern Medical Center\
 <a href="https://orcid.org/0000-0002-2931-1430" target="orcid.widget" rel="noopener noreferrer" style="vertical-align:top;"><img src="https://orcid.org/sites/default/files/images/orcid_16x16.png" style="width:1em;margin-right:.5em;" alt="ORCID iD icon">orcid.org/0000-0002-2931-1430</a>\
diff --git a/docs/dag.png b/docs/dag.png
index 5a3a8e635b8195e9ff15a09748f41627001157e7..48ae4dfa28fc0eceb4aaeb83332b22da8633ca29 100755
Binary files a/docs/dag.png and b/docs/dag.png differ
diff --git a/docs/software_versions_mqc.yaml b/docs/software_versions_mqc.yaml
index 4f0f08bfb5d317421a1ea9f7dd26a844cdd02cb7..feec6dd7c15e1bfd9e5d9c5d66e08f8f1b81d7f9 100755
--- a/docs/software_versions_mqc.yaml
+++ b/docs/software_versions_mqc.yaml
@@ -8,7 +8,7 @@
     <dl class="dl-horizontal">
         <dt>Python</dt><dd>v3.8.3</dd>
-        <dt>DERIVA</dt><dd>v1.3.0</dd>
+        <dt>DERIVA</dt><dd>v1.4.3</dd>
         <dt>BDBag</dt><dd>v1.5.6</dd>
         <dt>RSeQC</dt><dd>v4.0.0</dd>
         <dt>Trim Galore!</dt><dd>v0.6.4_dev</dd>
@@ -20,5 +20,5 @@
         <dt>deepTools</dt><dd>v3.5.0</dd>
         <dt>FastQC</dt><dd>v0.11.9</dd>
         <dt>MultiQC</dt><dd>v1.9</dd>
-        <dt>Pipeline Version</dt><dd>v0.1.0</dd>
+        <dt>Pipeline Version</dt><dd>v1.0.0</dd>
     </dl>
diff --git a/workflow/nextflow.config b/workflow/nextflow.config
index 8828d1ad62ec30d929643cda8e0aa5e89812c042..3b982bfbd4b12896c957e6f0af7cbc130e4c3039 100644
--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
@@ -20,10 +20,10 @@ profiles {

 process {
   withName:getBag {
-    container = 'gudmaprbk/deriva1.3:1.0.0'
+    container = 'gudmaprbk/deriva1.4:1.0.0'
   }
   withName:getData {
-    container = 'gudmaprbk/deriva1.3:1.0.0'
+    container = 'gudmaprbk/deriva1.4:1.0.0'
   }
   withName:parseMetadata {
     container = 'gudmaprbk/python3:1.0.0'
@@ -32,7 +32,7 @@ process {
   withName:trimData {
     container = 'gudmaprbk/trimgalore0.6.5:1.0.0'
   }
   withName:getRefInfer {
-    container = 'gudmaprbk/deriva1.3:1.0.0'
+    container = 'gudmaprbk/deriva1.4:1.0.0'
   }
   withName:downsampleData {
     container = 'gudmaprbk/seqtk1.3:1.0.0'
@@ -47,7 +47,7 @@ process {
     container = 'gudmaprbk/gudmap-rbk_base:1.0.0'
   }
   withName:getRef {
-    container = 'gudmaprbk/deriva1.3:1.0.0'
+    container = 'gudmaprbk/deriva1.4:1.0.0'
   }
   withName:alignData {
     container = 'gudmaprbk/hisat2.2.1:1.0.0'
@@ -71,28 +71,28 @@ process {
     container = 'gudmaprbk/multiqc1.9:1.0.0'
   }
   withName:uploadInputBag {
-    container = 'gudmaprbk/deriva1.3:1.0.0'
+    container = 'gudmaprbk/deriva1.4:1.0.0'
   }
   withName:uploadExecutionRun {
-    container = 'gudmaprbk/deriva1.3:1.0.0'
+    container = 'gudmaprbk/deriva1.4:1.0.0'
   }
   withName:uploadQC {
-    container = 'gudmaprbk/deriva1.3:1.0.0'
+    container = 'gudmaprbk/deriva1.4:1.0.0'
   }
   withName:uploadProcessedFile {
-    container = 'gudmaprbk/deriva1.3:1.0.0'
+    container = 'gudmaprbk/deriva1.4:1.0.0'
   }
   withName:uploadOutputBag {
-    container = 'gudmaprbk/deriva1.3:1.0.0'
+    container = 'gudmaprbk/deriva1.4:1.0.0'
   }
   withName:finalizeExecutionRun {
-    container = 'gudmaprbk/deriva1.3:1.0.0'
+    container = 'gudmaprbk/deriva1.4:1.0.0'
   }
   withName:failPreExecutionRun {
-    container = 'gudmaprbk/deriva1.3:1.0.0'
+    container = 'gudmaprbk/deriva1.4:1.0.0'
   }
   withName:failExecutionRun {
-    container = 'gudmaprbk/deriva1.3:1.0.0'
+    container = 'gudmaprbk/deriva1.4:1.0.0'
   }
 }
@@ -122,6 +122,6 @@ manifest {
   homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq'
   description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.'
   mainScript = 'rna-seq.nf'
-  version = 'v0.1.0'
+  version = 'v1.0.0'
   nextflowVersion = '>=19.09.0'
 }
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index 8c247c2508b426992237fa5cfe5fb496e0f99579..0990fdcb602ebc0e9a66b9ad1fe3c0dec4ede773 100644
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -129,6 +129,18 @@ process trackStart {
       "dev": ${params.dev} \
     }' \
     "https://xku43pcwnf.execute-api.us-east-1.amazonaws.com/ProdDeploy/pipeline-tracking"
+
+  curl -H 'Content-Type: application/json' -X PUT -d \
+    '{ \
+      "ID": "${workflow.sessionId}", \
+      "repRID": "${repRID}", \
+      "PipelineVersion": "${workflow.manifest.version}", \
+      "Server": "${params.source}", \
+      "Queued": "NA", \
+      "CheckedOut": "NA", \
+      "Started": "${workflow.start}" \
+    }' \
+    "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track"
   """
 }
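The added tracking call PUTs a JSON record keyed by the workflow session ID. The same request in plain Python, for reference; the endpoint and field names come from the hunk above, while the values are placeholders for the Nextflow interpolations:

```python
# Sketch of the pipeline-tracking PUT added to trackStart.
import requests

payload = {
    "ID": "00000000-0000-0000-0000-000000000000",  # workflow.sessionId
    "repRID": "Q-Y5F6",
    "PipelineVersion": "v1.0.0",
    "Server": "staging.gudmap.org",
    "Queued": "NA",
    "CheckedOut": "NA",
    "Started": "2021-01-08T00:00:00Z",             # workflow.start
}
response = requests.put(
    "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track",
    json=payload, timeout=30)
response.raise_for_status()
```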
@@ -1243,6 +1255,13 @@ process uploadExecutionRun {
     fi

     echo "\${executionRun_rid}" > executionRunRID.csv
+
+    curl -H 'Content-Type: application/json' -X PUT -d \
+      '{ \
+        "ID": "${workflow.sessionId}", \
+        "ExecutionRunRID": "'\${executionRun_rid}'" \
+      }' \
+      "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track"
   """
 }
@@ -1734,6 +1753,12 @@ tinMedInfer_fl.splitCsv(sep: ",", header: false).separate(
   tinMedInfer
 )

+// Replicate inferred median TIN for multiple process inputs
+tinMedInfer.into {
+  tinMedInfer_aggrQC
+  tinMedInfer_uploadQC
+}
+
 /*
  *aggrQC: aggregate QC from processes as well as metadata and run MultiQC
  */
@@ -1769,7 +1794,7 @@ process aggrQC {
     val readLengthI from readLengthInfer_aggrQC
     val rawReadsI from rawReadsInfer_aggrQC
     val assignedReadsI from assignedReadsInfer_aggrQC
-    val tinMedI from tinMedInfer
+    val tinMedI from tinMedInfer_aggrQC
     val studyRID from studyRID_aggrQC
     val expRID from expRID_aggrQC
     val fastqCountError_aggrQC
@@ -1850,7 +1875,11 @@ process aggrQC {
   echo -e "LOG: running multiqc" >> ${repRID}.aggrQC.log
   multiqc -c ${multiqcConfig} . -n ${repRID}.multiqc.html
   cp ${repRID}.multiqc_data/multiqc_data.json ${repRID}.multiqc_data.json
-  """
+
+  curl -H 'Content-Type: application/json' -X PUT -d \
+    @./${repRID}.multiqc_data.json \
+    "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/qc"
+  """
 }
@@ -1869,6 +1898,7 @@ process uploadQC {
     val length from readLengthInfer_uploadQC
     val rawCount from rawReadsInfer_uploadQC
     val finalCount from assignedReadsInfer_uploadQC
+    val tinMed from tinMedInfer_uploadQC
     val fastqCountError_uploadQC
     val fastqReadError_uploadQC
     val speciesError_uploadQC
@@ -1912,7 +1942,7 @@ process uploadQC {
     echo LOG: all old mRNA QC RIDs deleted >> ${repRID}.uploadQC.log
   fi

-  qc_rid=\$(python3 ${script_uploadQC} -r ${repRID} -e ${executionRunRID} -p "\${end}" -s ${stranded} -l ${length} -w ${rawCount} -f ${finalCount} -o ${source} -c \${cookie} -u F)
+  qc_rid=\$(python3 ${script_uploadQC} -r ${repRID} -e ${executionRunRID} -p "\${end}" -s ${stranded} -l ${length} -w ${rawCount} -f ${finalCount} -t ${tinMed} -o ${source} -c \${cookie} -u F)
   echo LOG: mRNA QC RID uploaded - \${qc_rid} >> ${repRID}.uploadQC.log

   echo "\${qc_rid}" > qcRID.csv
@@ -2124,6 +2154,14 @@ process finalizeExecutionRun {
     rid=\$(python3 ${script_uploadExecutionRun_finalizeExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Success -d 'Run Successful' -o ${source} -c \${cookie} -u ${executionRunRID})
     echo LOG: execution run RID marked as successful - \${rid} >> ${repRID}.finalizeExecutionRun.log
+
+    dt=`date +%FT%T.%3N%:z`
+    curl -H 'Content-Type: application/json' -X PUT -d \
+      '{ \
+        "ID": "${workflow.sessionId}", \
+        "Complete": "\${dt}" \
+      }' \
+      "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track"
   """
 }
@@ -2206,6 +2244,14 @@ process failPreExecutionRun {
       executionRun_rid==\$(python3 ${script_uploadExecutionRun_failPreExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d "\${errorDetails}" -o ${source} -c \${cookie} -u \${rid})
       echo LOG: execution run RID updated - \${executionRun_rid} >> ${repRID}.failPreExecutionRun.log
     fi
+
+    dt=`date +%FT%T.%3N%:z`
+    curl -H 'Content-Type: application/json' -X PUT -d \
+      '{ \
+        "ID": "${workflow.sessionId}", \
+        "Failure": "\${dt}" \
+      }' \
+      "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track"
   """
 }
@@ -2289,6 +2335,14 @@ process failExecutionRun {
       rid=\$(python3 ${script_uploadExecutionRun_failExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d "\${pipelineError_details}" -o ${source} -c \${cookie} -u ${executionRunRID})
       echo LOG: execution run RID marked as error - \${rid} >> ${repRID}.failExecutionRun.log
     fi
+
+    dt=`date +%FT%T.%3N%:z`
+    curl -H 'Content-Type: application/json' -X PUT -d \
+      '{ \
+        "ID": "${workflow.sessionId}", \
+        "Failure": "\${dt}" \
+      }' \
+      "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track"
   """
 }
diff --git a/workflow/scripts/get_updated_badge_info.sh b/workflow/scripts/get_updated_badge_info.sh
index 5081fe2716af90f5ec4987a8c6f317a4ca08f7aa..4b929272f2ea80ede5d47b84cd55bad2c6a3fa7b 100644
--- a/workflow/scripts/get_updated_badge_info.sh
+++ b/workflow/scripts/get_updated_badge_info.sh
@@ -10,19 +10,19 @@ develop_pipeline_version=$(git show origin/develop:workflow/nextflow.config | gr
 develop_nextflow_version=$(git show origin/develop:workflow/nextflow.config | grep -o nextflowVersion.* | grep -oP "(?<=').*(?=')")

 echo "collecting tool version for badges"
-python_version=$(git show origin/develop:docs/software_versions_mqc.yaml | grep -o Python.* | grep -oP "(?<=d>).*(?=\<)")
-deriva_version=$(git show origin/develop:docs/software_versions_mqc.yaml | grep -o DERIVA.* | grep -oP "(?<=d>).*(?=\<)")
-bdbag_version=$(git show origin/develop:docs/software_versions_mqc.yaml | grep -o BDBag.* | grep -oP "(?<=d>).*(?=\<)")
-rseqc_version=$(git show origin/develop:docs/software_versions_mqc.yaml | grep -o RSeQC.* | grep -oP "(?<=d>).*(?=\<)")
-trimgalore_version=$(git show origin/develop:docs/software_versions_mqc.yaml | grep -o 'Trim Galore!'.* | grep -oP "(?<=d>).*(?=\<)")
-hisat2_version=$(git show origin/develop:docs/software_versions_mqc.yaml | grep -o HISAT2.* | grep -oP "(?<=d>).*(?=\<)")
-samtools_version=$(git show origin/develop:docs/software_versions_mqc.yaml | grep -o Samtools.* | grep -oP "(?<=d>).*(?=\<)")
-picard_version=$(git show origin/develop:docs/software_versions_mqc.yaml | grep -o 'picard (MarkDuplicates)'.* | grep -oP "(?<=d>).*(?=\<)")
-featurecounts_version=$(git show origin/develop:docs/software_versions_mqc.yaml | grep -o featureCounts.* | grep -oP "(?<=d>).*(?=\<)")
-r_version=$(git show origin/develop:docs/software_versions_mqc.yaml | grep -o '>R<'.* | grep -oP "(?<=d>).*(?=\<)")
-deeptools_version=$(git show origin/develop:docs/software_versions_mqc.yaml | grep -o deepTools.* | grep -oP "(?<=d>).*(?=\<)")
-fastqc_version=$(git show origin/develop:docs/software_versions_mqc.yaml | grep -o FastQC.* | grep -oP "(?<=d>).*(?=\<)")
-multiqc_version=$(git show origin/develop:docs/software_versions_mqc.yaml | grep -o MultiQC.* | grep -oP "(?<=d>).*(?=\<)")
+python_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o Python.* | grep -oP "(?<=d>).*(?=\<)")
+deriva_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o DERIVA.* | grep -oP "(?<=d>).*(?=\<)")
+bdbag_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o BDBag.* | grep -oP "(?<=d>).*(?=\<)")
+rseqc_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o RSeQC.* | grep -oP "(?<=d>).*(?=\<)")
+trimgalore_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o 'Trim Galore!'.* | grep -oP "(?<=d>).*(?=\<)")
+hisat2_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o HISAT2.* | grep -oP "(?<=d>).*(?=\<)")
+samtools_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o Samtools.* | grep -oP "(?<=d>).*(?=\<)")
+picard_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o 'picard (MarkDuplicates)'.* | grep -oP "(?<=d>).*(?=\<)")
+featurecounts_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o featureCounts.* | grep -oP "(?<=d>).*(?=\<)")
+r_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o '>R<'.* | grep -oP "(?<=d>).*(?=\<)")
+deeptools_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o deepTools.* | grep -oP "(?<=d>).*(?=\<)")
+fastqc_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o FastQC.* | grep -oP "(?<=d>).*(?=\<)")
+multiqc_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o MultiQC.* | grep -oP "(?<=d>).*(?=\<)")

 echo "collecting badges"
 mkdir -p ./badges/tools
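The badge script now pins tool versions to the latest release tag rather than the tip of develop. Its `grep -oP "(?<=d>).*(?=\<)"` scrape of the `<dt>/<dd>` pairs, shown as a sketch in Python (the tag name is a placeholder for `${latest_release_tag}`):

```python
# Sketch: pull a tool version out of docs/software_versions_mqc.yaml
# as stored at a given release tag.
import re
import subprocess

tag = "v1.0.0"  # placeholder for ${latest_release_tag}
text = subprocess.run(
    ["git", "show", f"{tag}:docs/software_versions_mqc.yaml"],
    capture_output=True, text=True, check=True).stdout
deriva = re.search(r"<dt>DERIVA</dt><dd>(.*?)</dd>", text).group(1)
print(deriva)  # e.g. v1.4.3
```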
diff --git a/workflow/scripts/split_study.py b/workflow/scripts/split_study.py
deleted file mode 100644
index bf1129eb4913efe366f6fc148b55474d934b72a9..0000000000000000000000000000000000000000
--- a/workflow/scripts/split_study.py
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import pandas as pd
-import warnings
-warnings.simplefilter(action='ignore', category=FutureWarning)
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('-s', '--studyRID',
-                        help="The study RID.", required=True)
-    args = parser.parse_args()
-    return args
-
-
-def main():
-    args = get_args()
-    studyRID = pd.read_json(args.studyRID+"_studyRID.json")
-    if studyRID["RID"].count() > 0:
-        studyRID["RID"].to_csv(
-            args.studyRID+"_studyRID.csv", header=False, index=False)
-    else:
-        raise Exception("No associated replicates found: %s" %
-                        studyRID)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/workflow/scripts/split_study.sh b/workflow/scripts/split_study.sh
deleted file mode 100644
index aeec0fa1d1fc8e7fe41eaa2c332eae55fe2f0c3e..0000000000000000000000000000000000000000
--- a/workflow/scripts/split_study.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-
-#SBATCH -p super
-#SBATCH --job-name GUDMAP-RBK_Study
-#SBATCH -t 7-0:0:0
-
-# query GUDMAP/RBK for study RID
-echo "curl --location --request GET 'https://www.gudmap.org/ermrest/catalog/2/entity/RNASeq:Replicate/Study_RID="${1}"'" | bash > $1_studyRID.json
-
-# extract replicate RIDs
-module load python/3.6.4-anaconda
-python3 ./workflow/scripts/split_study.py -s $1
-
-# run pipeline on replicate RIDs in parallel
-module load nextflow/20.01.0
-module load singularity/3.5.3
-while read repRID; do echo ${repRID}; sleep 30; done < "$1_studyRID.csv" | xargs -P 5 -I {} nextflow -q run workflow/rna-seq.nf --repRID {} --source production --deriva /project/BICF/BICF_Core/shared/gudmap/test_data/auth/credential.json --bdbag /project/BICF/BICF_Core/shared/gudmap/test_data/auth/cookies.txt --dev false --upload true --email gervaise.henry@utsouthwestern.edu -with-report ./output/{}_report.html -with-timeline ./output/{}_timeline.html
-
-# cleanup study RID files
-rm $1_studyRID.json
-#rm $1_studyRID.csv
diff --git a/workflow/scripts/upload_qc.py b/workflow/scripts/upload_qc.py
index 930896d3abce8882aca7985a4ad304904f6b3a44..b842a7a36cc47fa4f599ab086a5c1b3dbece437a 100644
--- a/workflow/scripts/upload_qc.py
+++ b/workflow/scripts/upload_qc.py
@@ -12,6 +12,7 @@ def get_args():
     parser.add_argument('-l', '--length', help="median read length", required=True)
     parser.add_argument('-w', '--rawCount', help="raw count", required=True)
     parser.add_argument('-f', '--assignedCount', help="final assigned count", required=True)
+    parser.add_argument('-t', '--tin', help="median TIN", required=True)
     parser.add_argument('-n', '--notes', help="notes", default="", required=False)
     parser.add_argument('-o', '--host', help="datahub host", required=True)
     parser.add_argument('-c', '--cookie', help="cookie token", required=True)
@@ -33,6 +34,7 @@ def main(hostname, catalog_number, credential):
         "Median_Read_Length": args.length,
         "Raw_Count": args.rawCount,
         "Final_Count": args.assignedCount,
+        "Median_TIN": args.tin,
         "Notes": args.notes
     }
     entities = run_table.insert([run_data])
@@ -47,6 +49,7 @@ def main(hostname, catalog_number, credential):
         "Median_Read_Length": args.length,
         "Raw_Count": args.rawCount,
         "Final_Count": args.assignedCount,
+        "Median_TIN": args.tin,
         "Notes": args.notes
     }
     entities = run_table.update([run_data])
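Since `-t`/`--tin` is declared `required=True`, every caller of upload_qc.py must now pass a median TIN (the CI job above sends `-t 1`, and uploadQC in rna-seq.nf passes `-t ${tinMed}`). A usage sketch; the RIDs and cookie are placeholders:

```python
# Sketch: invoking upload_qc.py with the new median-TIN flag.
import subprocess

subprocess.run([
    "python3", "./workflow/scripts/upload_qc.py",
    "-r", "17-BTFJ", "-e", "17-BVDJ",
    "-p", "Single Read", "-s", "forward",
    "-l", "35", "-w", "5", "-f", "1",
    "-t", "1",                      # median TIN, the new required field
    "-o", "staging.gudmap.org",
    "-c", "<cookie token>",         # placeholder
    "-u", "F",
], check=True)
```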
diff --git a/workflow/tests/test_consistency.py b/workflow/tests/test_consistency.py
index 2ee7b83f0c52d18d3256e1be6e3ed7f7783b1c80..aa04f19bd23e3749532b87b598179b8f98b2218b 100644
--- a/workflow/tests/test_consistency.py
+++ b/workflow/tests/test_consistency.py
@@ -19,7 +19,11 @@ def test_consistencySE():
             test_output_path, 'SE_multiqc_data.json')) as f:
         assigned_reads_json = json.load(f)
     assigned_reads = assigned_reads_json['report_general_stats_data'][4]['16-1ZX4_sorted']['Assigned']
-    assert assigned_reads == 7746121
+    baseline = 7746121
+    baseline_hi = baseline+(baseline*0.05)
+    baseline_lo = baseline-(baseline*0.05)
+    assert (assigned_reads >= baseline_lo)
+    assert (assigned_reads <= baseline_hi)


 @pytest.mark.consistencyPE
@@ -31,4 +35,8 @@ def test_consistencyPE():
             test_output_path, 'PE_multiqc_data.json')) as f:
         assigned_reads_json = json.load(f)
     assigned_reads = assigned_reads_json['report_general_stats_data'][4]['Q-Y5JA_sorted']['Assigned']
-    assert assigned_reads == 2596053
+    baseline = 2596053
+    baseline_hi = baseline+(baseline*0.05)
+    baseline_lo = baseline-(baseline*0.05)
+    assert (assigned_reads >= baseline_lo)
+    assert (assigned_reads <= baseline_hi)
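The hand-rolled hi/lo bounds implement the ±5% window described in the CHANGELOG. For reference, pytest's built-in helper expresses the same tolerance more compactly; a sketch, not part of this changeset:

```python
# Sketch: the same +/- 5% tolerance using pytest.approx.
import pytest

def test_consistency_tolerance():
    assigned_reads = 7700000  # stand-in for the value parsed from MultiQC
    assert assigned_reads == pytest.approx(7746121, rel=0.05)
```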