From 09bd8e2b7de4d781e0fe823d0bdac64b67e27389 Mon Sep 17 00:00:00 2001
From: "Gervaise H. Henry" <gervaise.henry@utsouthwestern.edu>
Date: Fri, 19 Mar 2021 15:29:02 -0500
Subject: [PATCH] Finalize aws env ci + docs

---
 .gitlab-ci.yml               |   6 +-
 CHANGELOG.md                 |   1 +
 README.md                    |  54 ++++++++++-----
 docs/awsExample.json         |   1 +
 docs/nxf_aws-ci-test.json    |   1 +
 nextflow.config              |  10 +--
 nextflowConf/aws.config      |  87 ++++++++++++------------
 nextflowConf/aws.temp.config | 126 -----------------------------------
 nextflowConf/ondemand.config |   3 -
 nextflowConf/spot.config     |   3 -
 10 files changed, 86 insertions(+), 206 deletions(-)
 create mode 100644 docs/awsExample.json
 create mode 100644 docs/nxf_aws-ci-test.json
 delete mode 100644 nextflowConf/aws.temp.config
 delete mode 100755 nextflowConf/ondemand.config
 delete mode 100755 nextflowConf/spot.config

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 030ed62..7b5a337 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1080,7 +1080,7 @@ aws:
        --job-name nf-GUDMAP_RBK_ci-env\
        --job-queue default-bicf\
        --job-definition nextflow-bicf-nextflow\
-        --container-overrides command="utsw-bicf/gudmap_rbk.rna-seq","-r","env.ci","-profile","aws","--deriva","s3://bicf-output/ci-env/auth/credential.json","--bdbag","s3://bicf-output/ci-env/auth/cookies.txt","--repRID","Q-Y5F6","--source","staging","--upload","false","--dev","false","--ci","true","--track","false","-with-report","s3://bicf-output/ci-env/output/Q-Y5F6_fastqoverride_report.html","--refSource","datahub","--outDir","s3://bicf-output/ci-env/output/Q-Y5F6_fastqoverride","--fastqsForce","s3://bicf-output/ci-env/input/*.fastq.gz")
+        --container-overrides command=`cat ../docs/nxf_aws-ci-test.json`)
      id=$(echo ${id}| grep -oP "jobId\K.*" | tr -d '"' | tr -d ":" | tr -d " " | tr -d "}")
    - >
      status=$(aws batch describe-jobs --jobs ${id} | grep -oP "status\": \K.*" | tr -d '"' | tr -d ',' | tr -d " " ) &&
@@ -1091,9 +1091,9 @@ aws:
      done
    - >
      if [ "${status}" == "SUCCEEDED" ]; then
-      curl --request GET https://img.shields.io/badge/Envronment%3A%20AWS-run%20succesful-success?style=flat > ./badges/env/dnanexus.svg
+      curl --request GET https://img.shields.io/badge/Envronment%3A%20AWS-run%20succesful-success?style=flat > ./badges/env/aws.svg
      else
-      curl --request GET https://img.shields.io/badge/Envronment%3A%20AWS-run%20failed-critical?style=flat > ./badges/env/dnanexus.svg
+      curl --request GET https://img.shields.io/badge/Envronment%3A%20AWS-run%20failed-critical?style=flat > ./badges/env/aws.svg
      fi
  after_script:
    - module load awscli/1.11.139
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 09b6f43..36020f0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -36,6 +36,7 @@
 * Override fastq count to override counts
 * Change ambiguous species ci to wrong species
 * Add test for DNAnexus env
+* Add test for AWS env
 
 *Known Bugs*
 * Override params (inputBag, fastq, species) aren't checked for integrity
diff --git a/README.md b/README.md
index 419dc68..6bbf831 100644
--- a/README.md
+++ b/README.md
@@ -74,29 +74,40 @@ FULL EXAMPLE:
 ```
 nextflow run workflow/rna-seq.nf --repRID Q-Y5JA --source production --deriva ./data/credential.json --bdbag ./data/cookies.txt --dev false --upload true -profile biohpc
 ```
-
+<hr>
 Cloud Compatibility:
 --------------------
 This pipeline is also capable of being run on AWS and DNAnexus. To do so:
-### [AWS](https://aws.amazon.com/)
-* Build a AWS batch queue and environment either manually or with [aws-cloudformantion](https://console.aws.amazon.com/cloudformation/home?#/stacks/new?stackName=Nextflow&templateURL=https://s3.amazonaws.com/aws-genomics-workflows/templates/nextflow/nextflow-aio.template.yaml)
-* Edit one of the aws configs in workflow/config/
-  * Replace workDir with the S3 bucket generated
-  * Change region if different
-  * Change queue to the aws batch queue generated
-* The user must have awscli configured with an appropriate authentication (with `aws configure` and access keys) in the environment which nextflow will be run
-* Add `-profile` with the name aws config which was customized
-### DNAnexus (utilizes the [DNAnexus extension package for Nextflow (XPACK-DNANEXUS)](https://github.com/seqeralabs/xpack-dnanexus))
-* Follow the istructions from [XPACK-DNANEXUS](https://github.com/seqeralabs/xpack-dnanexus) about installing and authenticating (a valid license must be available for the extension package from Seqera Labs, as well as a subsription with DNAnexus)
-* The nf-dxapp needs to be built with a custom scm config to allow nextflow to pull the pipeline from the UTSW self-hosted GitLab server (git.biohpc.swmed.edu)
-```
-providers {
+* The Nextflow binary needs to contain a custom scm config to allow nextflow to pull the pipeline from the UTSW self-hosted GitLab server (git.biohpc.swmed.edu)
+  ```
+  providers {
     bicf {
       server = 'https://git.biohpc.swmed.edu'
       platform = 'gitlab'
     }
-}
-```
+  }
+  ```
+  This is required for the use of `nextflow run` or `nextflow pull` pointed directly at the git repo, and also for use in the AWS and DNAnexus environments, as both of those use `nextflow run` directly against that repo. To get around this requirement, there is a clone of the repo hosted on [GitHub](https://github.com/utsw-bicf/gudmap_rbk.rna-seq) which can be used... but the currency of that clone cannot be guaranteed!
+### [AWS](https://aws.amazon.com/)
+* Build an AWS Batch queue and environment, either manually or with a template such as [Genomics Workflows on AWS](https://docs.opendata.aws/genomics-workflows/)
+* The user must have awscli configured with appropriate authentication (via `aws configure` and access keys) in the environment in which nextflow will be run
+* Follow the instructions from [AWS](https://docs.aws.amazon.com/cli/latest/reference/batch/submit-job.html) about launching runs using the AWS CLI. A template *json* file has been included ([awsExample.json](docs/awsExample.json))
+  * `[version]` should be replaced with the pipeline version required (eg: `v2.0.0`)
+  * `[credential.json]` should be replaced with the location of the credential file output by authentication with Deriva
+  * `[cookies.txt]` should be replaced with the location of the cookies file output by authentication with Deriva for BDBag
+  * `[repRID]` should be replaced with the replicate RID to be analyzed (eg: `Q-Y5F6`)
+  * `[outDir]` should be replaced with the location to save local outputs of the pipeline
+
+  example `aws batch submit-job` command (replacing the parameters in `[]` with the appropriate values)
+  ```
+  aws batch submit-job\
+    --job-name [Job Name]\
+    --job-queue [Queue]\
+    --job-definition [Job Definition]\
+    --container-overrides command=`cat docs/awsExample.json`
+  ```
+### [DNAnexus](https://dnanexus.com/) (utilizes the [DNAnexus extension package for Nextflow (XPACK-DNANEXUS)](https://github.com/seqeralabs/xpack-dnanexus))
+* Follow the instructions from [XPACK-DNANEXUS](https://github.com/seqeralabs/xpack-dnanexus) about installing and authenticating (a valid license must be available for the extension package from Seqera Labs, as well as a subscription with DNAnexus)
 * Follow the instructions from [XPACK-DNANEXUS](https://github.com/seqeralabs/xpack-dnanexus) about launching runs. A template *json* file has been included ([dnanexusExample.json](docs/dnanexusExample.json))
 * `[version]` should be replaced with the pipeline version required (eg: `v2.0.0`)
 * `[credential.json]` should be replaced with the location of the credential file outpted by authentification with Deriva
@@ -104,12 +115,19 @@ providers {
 * `[repRID]` should be replaced with the replicate RID to be analized (eg: `Q-Y5F6`)
 * `[outDir]` should be replaced with the location to save local outputs of the pipeline
+
+  example `dx-run` command
+  ```
+  dx run nf-dxapp-bicf \
+    --delay-workspace-destruction \
+    --instance-type mem1_ssd1_v2_x16 \
+    --input-json "$(envsubst < ./docs/nxf_dnanexus-ci-test.json)"
+  ```
+<hr>
 To generate you own references or new references:
 ------------------------------------------
 Download the [reference creation script](https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/snippets/31). This script will auto create human and mouse references from GENCODE. It can also create ERCC92 spike-in references as well as concatenate them to GENCODE references automatically. In addition, it can create references from manually downloaded FASTA and GTF files.
-
-
+<hr>
 Errors:
 -------
 Error reported back to the data-hub are (they aren't thrown on the command line by the pipeline, but rather are submitted (if `--upload true`) to the data-hub for that replicate in the execution run submission):
diff --git a/docs/awsExample.json b/docs/awsExample.json
new file mode 100644
index 0000000..8e8f39f
--- /dev/null
+++ b/docs/awsExample.json
@@ -0,0 +1 @@
+["https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq","-r","[Version]","-profile","aws","--deriva","[credential.json]","--bdbag","[cookies.txt]","--repRID","[repRID]","--outDir","[outDir]"]
diff --git a/docs/nxf_aws-ci-test.json b/docs/nxf_aws-ci-test.json
new file mode 100644
index 0000000..de8752b
--- /dev/null
+++ b/docs/nxf_aws-ci-test.json
@@ -0,0 +1 @@
+["utsw-bicf/gudmap_rbk.rna-seq","-r","env.ci","-profile","aws","--deriva","s3://bicf-output/ci-env/auth/credential.json","--bdbag","s3://bicf-output/ci-env/auth/cookies.txt","--repRID","Q-Y5F6","--source","staging","--upload","false","--dev","false","--ci","true","--track","false","-with-report","s3://bicf-output/ci-env/output/Q-Y5F6_fastqoverride_report.html","--refSource","datahub","--outDir","s3://bicf-output/ci-env/output/Q-Y5F6_fastqoverride","--fastqsForce","s3://bicf-output/ci-env/input/*.fastq.gz"]
diff --git a/nextflow.config b/nextflow.config
index 2d00ca9..c8983df 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -8,19 +8,11 @@ profiles {
   biohpc_max {
     includeConfig 'nextflowConf/biohpc_max.config'
   }
-  aws_ondemand {
-    includeConfig 'nextflowConf/aws.config'
-    includeConfig 'nextflowConf/ondemand.config'
-  }
-  aws_spot {
-    includeConfig 'nextflowConf/aws.config'
-    includeConfig 'nextflowConf/spot.config'
-  }
   dnanexus {
     includeConfig 'nextflowConf/dnanexus.config'
   }
   aws {
-    includeConfig 'nextflowConf/aws.temp.config'
+    includeConfig 'nextflowConf/aws.config'
   }
 }
diff --git a/nextflowConf/aws.config b/nextflowConf/aws.config
index bf5b59c..a5133aa 100644
--- a/nextflowConf/aws.config
+++ b/nextflowConf/aws.config
@@ -1,21 +1,8 @@
 params {
-  refSource = "aws"
-}
-
-workDir = 's3://gudmap-rbk.output/work'
-aws.client.storageEncryption = 'AES256'
-aws {
-  region = 'us-east-2'
-  batch {
-    cliPath = '/home/ec2-user/miniconda/bin/aws'
-  }
+  refSource = "datahub"
 }
 
 process {
-  executor = 'awsbatch'
-  cpus = 1
-  memory = '1 GB'
-
   withName:trackStart {
     cpus = 1
     memory = '1 GB'
@@ -25,67 +12,75 @@ process {
     memory = '1 GB'
   }
   withName:getData {
-    cpus = 1
-    memory = '1 GB'
+    cpus = 16
+    memory = '32 GB'
   }
   withName:parseMetadata {
-    cpus = 15
+    cpus = 1
     memory = '1 GB'
   }
-  withName:trimData {
-    cpus = 20
-    memory = '2 GB'
+  withName:getRefERCC {
+    cpus = 16
+    memory = '32 GB'
+  }
+  withName:getRef {
+    cpus = 16
+    memory = '32 GB'
   }
-  withName:getRefInfer {
+  withName:fastqc {
+    cpus = 16
+    memory = '32 GB'
+  }
+  withName:seqwho {
     cpus = 1
     memory = '1 GB'
   }
+  withName:trimData {
+    cpus = 16
+    memory = '32 GB'
+  }
   withName:downsampleData {
     cpus = 1
     memory = '1 GB'
   }
+  withName:alignSampleDataERCC {
+    cpus = 16
+    memory = '32 GB'
+  }
   withName:alignSampleData {
-    cpus = 50
-    memory = '5 GB'
+    cpus = 16
+    memory = '32 GB'
   }
   withName:inferMetadata {
-    cpus = 5
-    memory = '1 GB'
+    cpus = 16
+    memory = '32 GB'
   }
   withName:checkMetadata {
     cpus = 1
     memory = '1 GB'
   }
-  withName:getRef {
-    cpus = 1
-    memory = '1 GB'
-  }
   withName:alignData {
-    cpus = 50
-    memory = '10 GB'
+    cpus = 16
+    memory = '32 GB'
   }
   withName:dedupData {
-    cpus = 5
-    memory = '20 GB'
+    cpus = 16
+    memory = '32 GB'
   }
   withName:countData {
-    cpus = 2
-    memory = '5 GB'
+    cpus = 16
+    memory = '32 GB'
   }
   withName:makeBigWig {
-    cpus = 15
-    memory = '5 GB'
-  }
-  withName:fastqc {
-    cpus = 1
-    memory = '1 GB'
+    cpus = 16
+    memory = '32 GB'
   }
   withName:dataQC {
-    cpus = 15
-    memory = '2 GB'
+    cpus = 16
+    memory = '32 GB'
   }
   withName:aggrQC {
-    cpus = 2
+    cpus = 1
     memory = '1 GB'
   }
   withName:uploadInputBag {
@@ -125,3 +120,7 @@ process {
     memory = '1 GB'
   }
 }
+
+docker {
+  enabled = true
+}
diff --git a/nextflowConf/aws.temp.config b/nextflowConf/aws.temp.config
deleted file mode 100644
index a5133aa..0000000
--- a/nextflowConf/aws.temp.config
+++ /dev/null
@@ -1,126 +0,0 @@
-params {
-  refSource = "datahub"
-}
-
-process {
-  withName:trackStart {
-    cpus = 1
-    memory = '1 GB'
-  }
-  withName:getBag {
-    cpus = 1
-    memory = '1 GB'
-  }
-  withName:getData {
-    cpus = 16
-    memory = '32 GB'
-  }
-  withName:parseMetadata {
-    cpus = 1
-    memory = '1 GB'
-  }
-  withName:getRefERCC {
-    cpus = 16
-    memory = '32 GB'
-  }
-  withName:getRef {
-    cpus = 16
-    memory = '32 GB'
-  }
-  withName:fastqc {
-    cpus = 16
-    memory = '32 GB'
-  }
-  withName:seqwho {
-    cpus = 1
-    memory = '1 GB'
-  }
-  withName:trimData {
-    cpus = 16
-    memory = '32 GB'
-  }
-  withName:downsampleData {
-    cpus = 1
-    memory = '1 GB'
-  }
-  withName:alignSampleDataERCC {
-    cpus = 16
-    memory = '32 GB'
-  }
-  withName:alignSampleData {
-    cpus = 16
-    memory = '32 GB'
-  }
-  withName:inferMetadata {
-    cpus = 16
-    memory = '32 GB'
-  }
-  withName:checkMetadata {
-    cpus = 1
-    memory = '1 GB'
-  }
-  withName:alignData {
-    cpus = 16
-    memory = '32 GB'
-  }
-  withName:dedupData {
-    cpus = 16
-    memory = '32 GB'
-  }
-  withName:countData {
-    cpus = 16
-    memory = '32 GB'
-  }
-  withName:makeBigWig {
-    cpus = 16
-    memory = '32 GB'
-  }
-  withName:dataQC {
-    cpus = 16
-    memory = '32 GB'
-  }
-  withName:aggrQC {
-    cpus = 1
-    memory = '1 GB'
-  }
-  withName:uploadInputBag {
-    cpus = 1
-    memory = '1 GB'
-  }
-  withName:uploadExecutionRun {
-    cpus = 1
-    memory = '1 GB'
-  }
-  withName:uploadQC {
-    cpus = 1
-    memory = '1 GB'
-  }
-  withName:uploadProcessedFile {
-    cpus = 1
-    memory = '1 GB'
-  }
-  withName:uploadOutputBag {
-    cpus = 1
-    memory = '1 GB'
-  }
-  withName:finalizeExecutionRun {
-    cpus = 1
-    memory = '1 GB'
-  }
-  withName:failPreExecutionRun {
-    cpus = 1
-    memory = '1 GB'
-  }
-  withName:failExecutionRun {
-    cpus = 1
-    memory = '1 GB'
-  }
-  withName:uploadQC_fail {
-    cpus = 1
-    memory = '1 GB'
-  }
-}
-
-docker {
-  enabled = true
-}
diff --git a/nextflowConf/ondemand.config b/nextflowConf/ondemand.config
deleted file mode 100755
index 131fdbb..0000000
--- a/nextflowConf/ondemand.config
+++ /dev/null
@@ -1,3 +0,0 @@
-process {
-  queue = 'highpriority-0ef8afb0-c7ad-11ea-b907-06c94a3c6390'
-}
diff --git a/nextflowConf/spot.config b/nextflowConf/spot.config
deleted file mode 100755
index d9c7a4c..0000000
--- a/nextflowConf/spot.config
+++ /dev/null
@@ -1,3 +0,0 @@
-process {
-  queue = 'default-0ef8afb0-c7ad-11ea-b907-06c94a3c6390'
-}
-- 
GitLab
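For reference, the submit-and-poll flow used by the `aws` CI job is split across the two `.gitlab-ci.yml` hunks above (the `until` line itself falls in the gap between them). Below is a minimal sketch of that same flow, assuming an illustrative job name, a placeholder polling interval of 60 seconds, and the AWS CLI's standard `--query`/`--output` options in place of the CI script's `grep`/`tr` parsing; the queue and job definition names are the ones used by the CI job, and the command array is read from the `docs/awsExample.json` template added in this patch (with its `[]` fields filled in).

```
# Submit the Nextflow run as an AWS Batch job and capture its job id.
# "nf-GUDMAP_RBK_example" is a hypothetical job name; queue and job
# definition match the CI job in this patch.
id=$(aws batch submit-job \
      --job-name nf-GUDMAP_RBK_example \
      --job-queue default-bicf \
      --job-definition nextflow-bicf-nextflow \
      --container-overrides command=`cat docs/awsExample.json` \
      --query 'jobId' --output text)

# Poll AWS Batch until the job reaches a terminal state.
status=$(aws batch describe-jobs --jobs ${id} --query 'jobs[0].status' --output text)
until [[ "${status}" == "SUCCEEDED" || "${status}" == "FAILED" ]]; do
  sleep 60
  status=$(aws batch describe-jobs --jobs ${id} --query 'jobs[0].status' --output text)
done
echo "AWS Batch job ${id} finished with status ${status}"
```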