From 2f239170500642249707b8fef39217b671b03692 Mon Sep 17 00:00:00 2001 From: "Gervaise H. Henry" <gervaise.henry@utsouthwestern.edu> Date: Sun, 13 Sep 2020 15:11:43 -0500 Subject: [PATCH] Update datahub ref download --- workflow/conf/biohpc.config | 2 +- workflow/rna-seq.nf | 90 +++++++++++++++++------------- workflow/scripts/extractRefData.py | 4 +- 3 files changed, 54 insertions(+), 42 deletions(-) diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index d2a933f..57b72fd 100755 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -1,5 +1,5 @@ params { - refSource = "biohpc + refSource = "biohpc" } process { diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index 641ecda..d020f1f 100644 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -53,8 +53,8 @@ if (params.source == "dev") { } if (params.refSource == "biohpc") { referenceBase = "/project/BICF/BICF_Core/shared/gudmap/references" -} else if (params.refSource == "aws") { - referenceBase = "s3://bicf-references" +//} else if (params.refSource == "aws") { +// referenceBase = "s3://bicf-references" } else if (params.refSource == "datahub") { referenceBase = "dev.gudmap.org" } @@ -403,30 +403,35 @@ process getRefInfer { # retreive appropriate reference appropriate location echo -e "LOG: fetching ${refName} reference files from ${referenceBase}" >> ${repRID}.${refName}.getRefInfer.log - if [ ${referenceBase} == "s3://bicf-references" ] - then - aws s3 cp "\${references}"/hisat2 ./hisat2 --recursive - aws s3 cp "\${references}"/bed ./${refName}/bed --recursive - aws s3 cp "\${references}"/genome.fna ./ - aws s3 cp "\${references}"/genome.gtf ./ - elif [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references" ] + if [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references" ] then ln -s "\${references}"/hisat2 ln -s "\${references}"/bed ${refName}/bed ln -s "\${references}"/genome.fna ln -s "\${references}"/genome.gtf + #elif [ ${referenceBase} == "s3://bicf-references" ] + #then + # aws s3 cp "\${references}"/hisat2 ./hisat2 --recursive + # aws s3 cp "\${references}"/bed ./${refName}/bed --recursive + # aws s3 cp "\${references}"/genome.fna ./ + # aws s3 cp "\${references}"/genome.gtf ./ elif [ ${referenceBase} == "dev.gudmap.org" ] then - GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1) - GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2) - GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f3) - query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE}) - curl --request GET ${query} > refQuery.json - refURL=$(python ./workflow/scripts/extractRefData.py --returnParam URL) - loc=$(dirname ${refURL}) - if [ "${loc}" = "/hatrac/*" ]; then echo "LOG: Reference not present in hatrac"; exit 1; fi - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') - deriva-hatrac-cli --host ${referenceBase} get ${refURL} + GRCv=\$(echo \${references} | grep -o ${refName}.* | cut -d '.' -f1) + GRCp=\$(echo \${references} | grep -o ${refName}.* | cut -d '.' -f2) + GENCODE=\$(echo \${references} | grep -o ${refName}.* | cut -d '.' -f3) + query=\$(echo 'https://'"${referenceBase}"'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='\${GRCv}'.'\${GRCp}'/Annotation_Version=GENCODE%20'\${GENCODE}) + curl --request GET \${query} > refQuery.json + refURL=\$(python ./workflow/scripts/extractRefData.py --returnParam URL) + loc=\$(dirname \${refURL}) + fName=\$(python ./workflow/scripts/extractRefData.py --returnParam fName) + fName=\${fName%.*} + if [ "\${loc}" = "/hatrac/*" ]; then echo "LOG: Reference not present in hatrac"; exit 1; fi + filename=\$(echo \$(basename \${refURL}) | grep -oP '.*(?=:)') + deriva-hatrac-cli --host ${referenceBase} get \${refURL} + unzip \$(basename \${refURL}) + mkdir -p \${references} + mv \${fName}/* \${references}/ fi echo -e "LOG: fetched" >> ${repRID}.${refName}.getRefInfer.log @@ -724,36 +729,41 @@ process getRef { # retreive appropriate reference appropriate location echo -e "LOG: fetching ${species} reference files from ${referenceBase}" >> ${repRID}.getRef.log - if [ ${referenceBase} == "s3://bicf-references" ] - then - echo -e "LOG: grabbing reference files from S3" >> ${repRID}.getRef.log - aws s3 cp "\${references}"/hisat2 ./hisat2 --recursive - aws s3 cp "\${references}"/bed ./bed --recursive - aws s3 cp "\${references}"/genome.fna ./ - aws s3 cp "\${references}"/genome.gtf ./ - aws s3 cp "\${references}"/geneID.tsv ./ - aws s3 cp "\${references}"/Entrez.tsv ./ - elif [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references" ] + if [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references" ] then + echo -e "LOG: grabbing reference files from local (BioHPC)" >> ${repRID}.getRef.log ln -s "\${references}"/hisat2 ln -s "\${references}"/bed ln -s "\${references}"/genome.fna ln -s "\${references}"/genome.gtf ln -s "\${references}"/geneID.tsv ln -s "\${references}"/Entrez.tsv + #elif [ ${referenceBase} == "s3://bicf-references" ] + #then + # echo -e "LOG: grabbing reference files from S3" >> ${repRID}.getRef.log + # aws s3 cp "\${references}"/hisat2 ./hisat2 --recursive + # aws s3 cp "\${references}"/bed ./bed --recursive + # aws s3 cp "\${references}"/genome.fna ./ + # aws s3 cp "\${references}"/genome.gtf ./ + # aws s3 cp "\${references}"/geneID.tsv ./ + # aws s3 cp "\${references}"/Entrez.tsv ./ elif [ ${referenceBase} == "dev.gudmap.org" ] then - GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1) - GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2) - GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f3) - query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE}) - curl --request GET ${query} > refQuery.json - refURL=$(python ./workflow/scripts/extractRefData.py --returnParam URL) - loc=$(dirname ${refURL}) - if [ "${loc}" = "/hatrac/*" ]; then echo "LOG: Reference not present in hatrac"; exit 1; fi - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') - deriva-hatrac-cli --host ${referenceBase} get ${refURL} - fi + GRCv=\$(echo \${references} | grep -o ${refName}.* | cut -d '.' -f1) + GRCp=\$(echo \${references} | grep -o ${refName}.* | cut -d '.' -f2) + GENCODE=\$(echo \${references} | grep -o ${refName}.* | cut -d '.' -f3) + query=\$(echo 'https://'"${referenceBase}"'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='\${GRCv}'.'\${GRCp}'/Annotation_Version=GENCODE%20'\${GENCODE}) + curl --request GET \${query} > refQuery.json + refURL=\$(python ./workflow/scripts/extractRefData.py --returnParam URL) + loc=\$(dirname \${refURL}) + fName=\$(python ./workflow/scripts/extractRefData.py --returnParam fName) + fName=\${fName%.*} + if [ "\${loc}" = "/hatrac/*" ]; then echo "LOG: Reference not present in hatrac"; exit 1; fi + filename=\$(echo \$(basename \${refURL}) | grep -oP '.*(?=:)') + deriva-hatrac-cli --host ${referenceBase} get \${refURL} + unzip \$(basename \${refURL}) + mkdir -p \${references} + mv \${fName}/* \${references}/ fi echo -e "LOG: fetched" >> ${repRID}.getRef.log """ diff --git a/workflow/scripts/extractRefData.py b/workflow/scripts/extractRefData.py index 4e69b93..fb3668e 100644 --- a/workflow/scripts/extractRefData.py +++ b/workflow/scripts/extractRefData.py @@ -17,7 +17,9 @@ def main(): if refQuery["File_URL"].count() == 1: if args.returnParam == "URL": print(refQuery["File_URL"].values[0]) - elif args.returnParam == "": + elif args.returnParam == "fName": + print(refQuery["File_Name"].values[0]) + elif args.returnParam == "MD5": print(refQuery["File_MD5"].values[0]) else: raise Exception("Multple references found: \n%s" % -- GitLab