diff --git a/.gitignore b/.gitignore
index f500ef73462d50e596722c6cbca6e15bdfe0a741..12288788210fa386427657fa55ab47b9ac14a6aa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -297,6 +297,8 @@ timeline*.html*
 *.tmp
 *.swp
 *.out
+*_studyRID.json
+*_studyRID.csv
 run*.sh
 
 !.gitkeep
diff --git a/cleanup.sh b/cleanup.sh
index 9569ff54fd71cd94bddde415af03a101820ab514..aa289201c531fa4f4667a04f80fd015d2200e40c 100644
--- a/cleanup.sh
+++ b/cleanup.sh
@@ -5,3 +5,5 @@ rm timeline*.html*
 rm .nextflow*.log*
 rm -r .nextflow/
 rm -r work/
+rm *_studyRID.json
+rm *_studyRID.csv
diff --git a/workflow/scripts/splitStudy.py b/workflow/scripts/splitStudy.py
new file mode 100644
--- /dev/null
+++ b/workflow/scripts/splitStudy.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+
+import argparse
+import pandas as pd
+import warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+
+def get_args():
+    """Parse the command-line arguments.
+
+    Returns:
+        argparse.Namespace: parsed arguments; ``studyRID`` holds the study RID.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-s', '--studyRID', help="The study RID.", required=True)
+    return parser.parse_args()
+
+
+def main():
+    """Write the replicate RIDs of a study to <studyRID>_studyRID.csv.
+
+    Reads <studyRID>_studyRID.json and writes its "RID" column, one value
+    per line, without header or index.
+
+    Raises:
+        Exception: if the JSON contains no "RID" values.
+    """
+    args = get_args()
+    studyRID = pd.read_json(args.studyRID + "_studyRID.json")
+    # An empty JSON array parses to a frame without any columns, so check
+    # that "RID" exists before indexing it (otherwise a KeyError is raised).
+    if "RID" not in studyRID.columns or studyRID["RID"].count() == 0:
+        # Report the study RID that was asked for, not the (empty) DataFrame.
+        raise Exception("No associated replicates found: %s" % args.studyRID)
+    studyRID["RID"].to_csv(args.studyRID + "_studyRID.csv", header=False, index=False)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/workflow/scripts/splitStudy.sh b/workflow/scripts/splitStudy.sh
new file mode 100644
--- /dev/null
+++ b/workflow/scripts/splitStudy.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# Run the RNA-seq pipeline on every replicate of a study.
+# Usage: splitStudy.sh <study RID>
+
+if [ -z "${1}" ]; then
+  echo "Usage: ${0} <study RID>" >&2
+  exit 1
+fi
+
+# query GUDMAP/RBK for study RID
+# (curl is called directly: echoing the command into a second bash would let the RID be evaluated as shell code)
+curl --location --request GET "https://www.gudmap.org/ermrest/catalog/2/entity/RNASeq:Replicate/Study_RID=${1}" > "${1}_studyRID.json"
+
+# extract replicate RIDs; stop if the extraction fails (e.g. no replicates found)
+module load python/3.6.4-anaconda
+python3 ./workflow/scripts/splitStudy.py -s "${1}" || { rm -f "${1}_studyRID.json"; exit 1; }
+
+# run pipeline on replicate RIDs in parallel
+module load nextflow/20.01.0
+module load singularity/3.5.3
+xargs -P 5 -I {} nextflow run workflow/rna-seq.nf --repRID {} < "${1}_studyRID.csv"
+
+# cleanup study RID files
+rm "${1}_studyRID.json"
+rm "${1}_studyRID.csv"