Commit 67e3eb6e authored by Gervaise Henry

Merge branch 'develop' into 'master'

v0.0.1

See merge request !37
parents 4442b00e 8a8ce097
Pipeline #7843 passed with stages in 1 hour, 19 minutes, and 39 seconds
Showing with 1613 additions and 71 deletions
process {
executor = 'slurm'
queue = '256GB,256GBv1,384GB,128GB'
clusterOptions = '--hold'
}
singularity {
enabled = true
cacheDir = '/project/BICF/BICF_Core/shared/gudmap/singularity_cache/'
}
env {
http_proxy = 'http://proxy.swmed.edu:3128'
https_proxy = 'http://proxy.swmed.edu:3128'
all_proxy = 'http://proxy.swmed.edu:3128'
}
name: bdbag
dependencies:
- pandas=0.23.3=py36_0
- pip:
- bdbag==1.5.5
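This environment spec can be materialized with conda in the usual way; a minimal sketch, assuming the YAML is saved as environment.yml (the filename is not shown in this diff):
conda env create -f environment.yml   # creates the 'bdbag' env named above
conda activate bdbag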
custom_logo: './bicf_logo.png'
custom_logo_url: 'https://utsouthwestern.edu/labs/bioinformatics/'
custom_logo_title: 'Bioinformatics Core Facility'
report_header_info:
- Contact Email: 'bicf@utsouthwestern.edu'
- Application Type: 'RNA-Seq Analytic Pipeline for GUDMAP/RBK'
- Department: 'Bioinformatics Core Facility, Department of Bioinformatics, University of Texas Southwestern Medical Center'
title: RNA-Seq Analytic Pipeline for GUDMAP/RBK
report_comment: >
This report has been generated by the <a href="https://doi.org/10.5281/zenodo.3625056">GUDMAP/RBK RNA-Seq Pipeline</a>
top_modules:
- fastqc:
name: 'Raw'
info: 'Replicate Raw fastq QC Results'
- cutadapt:
name: 'Trim'
info: 'Replicate Trim Adapter QC Results'
- hisat2:
name: 'Align'
info: 'Replicate Alignment QC Results'
path_filters:
- '*alignSummary*'
- picard:
name: 'Dedup'
info: 'Replicate Alignment Deduplication QC Results'
- rseqc:
name: 'Inner Distance'
info: 'Replicate Paired End Inner Distance Distribution Results'
path_filters:
- '*insertSize*'
- custom_content
- featureCounts:
name: 'Count'
info: 'Replicate Feature Count QC Results'
- hisat2:
name: 'Inference: Align'
info: 'Inference Alignment (1M downsampled reads) QC Results'
path_filters:
- '*alignSampleSummary*'
- rseqc:
name: 'Inference: Stranded'
info: '1M Downsampled Reads Strandedness Inference Results'
path_filters:
- '*infer_experiment*'
report_section_order:
rid:
order: 2000
meta:
order: 1000
skip_generalstats: true
custom_data:
rid:
file_format: 'tsv'
section_name: 'RID'
description: 'These are the identifying RIDs'
plot_type: 'table'
pconfig:
id: 'rid'
headers:
Replicate RID
Experiment RID
Study RID
meta:
file_format: 'tsv'
section_name: 'Metadata'
description: 'This is the comparison of inferred, submitter-provided, and calculated metadata'
plot_type: 'table'
pconfig:
id: 'meta'
format: '{:,.0f}'
headers:
Source
Species
Ends
Stranded
Spike-in
Raw Reads
Assigned Reads
Median Read Length
Median TIN
tin:
file_format: 'tsv'
section_name: 'TIN'
description: 'This is the distribution of TIN values calculated by the tool RSeQC'
plot_type: 'bargraph'
pconfig:
id: 'tin'
headers:
chrom
0 - 9
10 - 19
20 - 29
30 - 39
40 - 49
50 - 59
60 - 69
70 - 79
80 - 89
90 - 99
sp:
rid:
fn: 'rid.tsv'
meta:
fn: 'metadata.tsv'
tin:
fn: '*.tin.hist.tsv'
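A hedged usage sketch for this MultiQC configuration, assuming it is saved as multiqc_config.yaml (filename assumed; -c is MultiQC's standard flag for a custom config):
multiqc -c multiqc_config.yaml .   # aggregate the pipeline's QC outputs in the current directory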
process {
queue = 'highpriority-0ef8afb0-c7ad-11ea-b907-06c94a3c6390'
}
{
"bag": {
"bag_name": "Replicate_{rid}",
"bag_algorithms": [
"md5"
],
"bag_archiver": "zip"
},
"catalog": {
"query_processors": [
{
"processor": "csv",
"processor_params": {
"output_path": "Study",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Study_RID)=(RNASeq:Study:RID)/Study_RID:=RID,Internal_ID,Title,Summary,Overall_Design,GEO_Series_Accession_ID,GEO_Platform_Accession_ID,Funding,Pubmed_ID,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Experiment",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment:RID)/Experiment_RID:=RID,Study_RID,Internal_ID,Name,Description,Experiment_Method,Sequencing_Type,Species,Specimen_Type,Molecule_Type,Pooled_Sample,Pool_Size,Markers,Cell_Count,Treatment_Protocol,Treatment_Protocol_Reference,Isolation_Protocol,Isolation_Protocol_Reference,Growth_Protocol,Growth_Protocol_Reference,Label_Protocol,Label_Protocol_Reference,Hybridization_Protocol,Hybridization_Protocol_Reference,Scan_Protocol,Scan_Protocol_Reference,Data_Processing,Value_Definition,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Experiment Antibodies",
"query_path": "/entity/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Antibodies:Experiment_RID)?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Experiment Custom Metadata",
"query_path": "/entity/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Custom_Metadata:Experiment_RID)?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Experiment Settings",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Settings:Experiment_RID)/RID,Experiment_RID,Alignment_Format,Aligner,Aligner_Version,Reference_Genome,Sequence_Trimming,Duplicate_Removal,Pre-alignment_Sequence_Removal,Junction_Reads,Library_Type,Protocol_Reference,Library_Selection,Quantification_Format,Quantification_Software,Expression_Metric,Transcriptome_Model,Sequencing_Platform,Paired_End,Read_Length,Has_Strand_Specific_Information,Used_Spike_Ins,Spike_Ins_Amount,Visualization_Format,Visualization_Software,Visualization_Version,Visualization_Setting,Notes,RCT,RMT?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Replicate",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/RID,Study_RID,Experiment_RID,Biological_Replicate_Number,Technical_Replicate_Number,Specimen_RID,Collection_Date,Mapped_Reads,GEO_Sample_Accession_ID,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Specimen",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/S:=(Specimen_RID)=(Gene_Expression:Specimen:RID)/T:=left(Stage_ID)=(Vocabulary:Developmental_Stage:ID)/$S/RID,Title,Species,Stage_ID,Stage_Name:=T:Name,Stage_Detail,Assay_Type,Strain,Wild_Type,Sex,Passage,Phenotype,Cell_Line,Parent_Specimen,Upload_Notes,Preparation,Fixation,Embedding,Internal_ID,Principal_Investigator,Consortium,Release_Date,RCT,RMT,GUDMAP2_Accession_ID?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Specimen_Anatomical_Source",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Specimen_RID)=(Gene_Expression:Specimen:RID)/(RID)=(Gene_Expression:Specimen_Tissue:Specimen_RID)/RID,Specimen_RID,Tissue,RCT,RMT?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Specimen_Cell_Types",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Specimen_RID)=(Gene_Expression:Specimen:RID)/(RID)=(Gene_Expression:Specimen_Cell_Type:Specimen)/RID,Specimen_RID:=Specimen,Cell_Type,RCT,RMT?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Single Cell Metrics",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:Single_Cell_Metrics:Replicate_RID)/RID,Study_RID,Experiment_RID,Replicate_RID,Reads_%28Millions%29,Reads%2FCell,Detected_Gene_Count,Genes%2FCell,UMI%2FCell,Estimated_Cell_Count,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "File",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/RID,Study_RID,Experiment_RID,Replicate_RID,Caption,File_Type,File_Name,URI,File_size,MD5,GEO_Archival_URL,dbGaP_Accession_ID,Processed,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT,Legacy_File_RID,GUDMAP_NGF_OID,GUDMAP_NGS_OID?limit=none"
}
},
{
"processor": "fetch",
"processor_params": {
"output_path": "assets/Study/{Study_RID}/Experiment/{Experiment_RID}/Replicate/{Replicate_RID}",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/url:=URI,length:=File_size,filename:=File_Name,md5:=MD5,Study_RID,Experiment_RID,Replicate_RID?limit=none"
}
}
]
}
}
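Each query_path above has {rid} substituted before the bag is materialized, and the same ermrest paths can be previewed over plain HTTP. A minimal sketch, echoing the curl pattern used in the study-split script at the end of this diff (the RID value is a hypothetical placeholder):
RID=16-1ZX4   # hypothetical replicate RID
curl -s "https://www.gudmap.org/ermrest/catalog/2/attribute/M:=RNASeq:Replicate/RID=${RID}/RID,Study_RID,Experiment_RID?limit=none"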
process {
queue = 'default-0ef8afb0-c7ad-11ea-b907-06c94a3c6390'
}
RUN apt-get update && apt-get install -y python3.7 python3-pip
RUN wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash Miniconda3-latest-Linux-x86_64.sh -p /miniconda -b && \
rm Miniconda3-latest-Linux-x86_64.sh
ENV PATH=/miniconda/bin:${PATH}
RUN conda config --add channels defaults && \
conda config --add channels bioconda && \
conda config --add channels conda-forge && \
conda update -n base -c defaults -y conda
RUN pip install --upgrade pip
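A hedged build sketch for a Dockerfile like this one (the tag is borrowed from the container names referenced below; which Dockerfile builds which image is not shown in this diff):
docker build -t bicf/python3:2.0.1_indev .   # tag name is an assumption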
@@ -2,4 +2,98 @@ profiles {
standard {
includeConfig 'conf/biohpc.config'
}
biohpc {
includeConfig 'conf/biohpc.config'
}
biohpc_max {
includeConfig 'conf/biohpc_max.config'
}
aws_ondemand {
includeConfig 'conf/aws.config'
includeConfig 'conf/ondemand.config'
}
aws_spot {
includeConfig 'conf/aws.config'
includeConfig 'conf/spot.config'
}
}
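With these profiles in place, a run selects its configuration via -profile; a minimal sketch using the mainScript declared in the manifest later in this config (the --repRID value is a placeholder):
nextflow run workflow/rna-seq.nf -profile biohpc --repRID <replicate RID>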
process {
withName:getBag {
container = 'bicf/gudmaprbkfilexfer:2.0.1_indev'
}
withName:getData {
container = 'bicf/gudmaprbkfilexfer:2.0.1_indev'
}
withName: parseMetadata {
container = 'bicf/python3:2.0.1_indev'
}
withName: trimData {
container = 'bicf/trimgalore:1.1'
}
withName: getRefInfer {
container = 'bicf/awscli:1.1'
}
withName: downsampleData {
container = 'bicf/seqtk:2.0.1_indev'
}
withName: alignSampleData {
container = 'bicf/gudmaprbkaligner:2.0.1_indev'
}
withName: inferMetadata {
container = 'bicf/rseqc3.0:2.0.1_indev'
}
withName: getRef {
container = 'bicf/awscli:1.1'
}
withName: alignData {
container = 'bicf/gudmaprbkaligner:2.0.1_indev'
}
withName: dedupData {
container = 'bicf/gudmaprbkdedup:2.0.0'
}
withName: countData {
container = 'bicf/subread2:2.0.0'
}
withName: makeBigWig {
container = 'bicf/deeptools3.3:2.0.1_indev'
}
withName: fastqc {
container = 'bicf/fastqc:2.0.1_indev'
}
withName: dataQC {
container = 'bicf/rseqc3.0:2.0.1_indev'
}
withName: aggrQC {
container = 'bicf/multiqc1.8:2.0.1_indev'
}
}
trace {
enabled = true
file = 'pipeline_trace.txt'
fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss'
}
timeline {
enabled = true
file = 'timeline.html'
}
report {
enabled = true
file = 'report.html'
}
tower {
accessToken = '3ade8f325d4855434b49aa387421a44c63e3360f'
enabled = true
}
manifest {
homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq'
description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.'
mainScript = 'rna-seq.nf'
version = 'v0.0.1'
nextflowVersion = '>=19.09.0'
}
workflow/rna-seq.nf 100755 → 100644
#!/bin/bash
if [ -z "${3}" ]
then
bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz ${1}
for i in $(find */ -name "*R*.fastq.gz")
do
path=${2}.$(echo ${i##*/} | grep -o "R[12]\.fastq\.gz")
cp ${i} ./${path}
done
elif [ "${3}" == "TEST" ]
then
bdbag --resolve-fetch all --fetch-filter filename\$*.txt ${1}
fi
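A hedged invocation sketch for this fetch script (the script name is assumed; per the code above, ${1} is the bag, ${2} is the output fastq prefix, and an optional third argument of TEST switches to fetching *.txt files instead):
bash bdbagFetch.sh Replicate_16-1ZX4 16-1ZX4        # hypothetical script and bag names
bash bdbagFetch.sh Replicate_16-1ZX4 16-1ZX4 TEST   # TEST mode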
gc()
library(optparse)
option_list=list(
make_option("--count",action="store",type='character',help="Count File")
)
opt=parse_args(OptionParser(option_list=option_list))
rm(option_list)
if (!("count" %in% names(opt))){
stop("No count file passed, exiting.")
} else if (!file.exists(opt$count)) {
stop("Count file doesn't exist, exiting.")
}
repRID <- basename(gsub(".featureCounts","",opt$count))
count <- read.delim(opt$count, comment.char="#") # if featureCounts file changes structure, be sure to update count and Length columns below
colnames(count)[7] <- "count"
rpk <- count$count/count$Length/1000 # per-base read density; any constant scaling factor cancels in the TPM normalization below
scale <- sum(rpk)/1000000
tpm <- rpk/scale
output <- cbind(count,tpm)
colnames(output)[7] <- "count"
write.table(output,file=paste0(repRID,".countTable.csv"),sep=",",row.names=FALSE,quote=FALSE)
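A hedged usage sketch for this TPM calculation (script filename assumed; --count expects a featureCounts table named <repRID>.featureCounts, per the basename/gsub above):
Rscript calculateTPM.R --count ./16-1ZX4.featureCounts   # hypothetical names; writes 16-1ZX4.countTable.csv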
gc()
library(optparse)
option_list=list(
make_option("--repRID",action="store",type='character',help="Replicate RID")
)
opt=parse_args(OptionParser(option_list=option_list))
rm(option_list)
countTable <- read.csv(paste0(opt$repRID,".countData.countTable.csv"), stringsAsFactors=FALSE)
geneID <- read.delim("geneID.tsv", header=FALSE, stringsAsFactors=FALSE)
Entrez <- read.delim("Entrez.tsv", header=FALSE, stringsAsFactors=FALSE)
convert <- data.frame(geneID=countTable$Geneid)
convert <- merge(x=convert,y=geneID[,1:2],by.x="geneID",by.y="V2",all.x=TRUE)
convert <- merge(x=convert,y=Entrez,by.x="V1",by.y="V1",all.x=TRUE)
convert[is.na(convert$V2),3] <- ""
convert <- convert[,-1]
colnames(convert) <- c("GeneID","EntrezID")
convert <- unique(convert)
output <- merge(x=convert,y=countTable,by.x="GeneID",by.y="Geneid",all.x=TRUE)
write.table(output,file=paste0(opt$repRID,".tpmTable.csv"),sep=",",row.names=FALSE,quote=FALSE)
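A hedged usage sketch for the gene-ID conversion step (script filename assumed; it expects <repRID>.countData.countTable.csv, geneID.tsv, and Entrez.tsv in the working directory, per the read calls above):
Rscript convertGeneSymbols.R --repRID 16-1ZX4   # hypothetical names; writes 16-1ZX4.tpmTable.csv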
#!/bin/bash
if [ "${1}" == "endness" ]
then
awk '/Data/ {print}' "${2}" | sed -e 's/^This is //' -e 's/ Data$//'
elif [ "${1}" == "fail" ]
then
awk '/Fraction of reads failed/ {print}' "${2}" | sed -e 's/^Fraction of reads failed to determine: //'
elif [ "${1}" == "sef" ]
then
awk '/\+\+,--/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "++,--": //'
elif [ "${1}" == "ser" ]
then
awk '/\+-,-\+/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "+-,-+": //'
elif [ "${1}" == "pef" ]
then
awk '/1\+\+,1--,2\+-,2-\+/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "1++,1--,2+-,2-+": //'
elif [ "${1}" == "per" ]
then
awk '/1\+-,1-\+,2\+\+,2--/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "1+-,1-+,2++,2--": //'
fi
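A hedged usage sketch for this parser (script filename assumed; the first argument selects which value to pull out of an RSeQC infer_experiment.py log):
bash parseInfer.sh endness 16-1ZX4.infer_experiment.txt   # hypothetical names; prints e.g. 'PairEnd'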
#!/usr/bin/env python3
import argparse
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('-r', '--repRID',help="The replicate RID.",required=True)
parser.add_argument('-m', '--metaFile',help="The metadata file to extract.",required=True)
parser.add_argument('-p', '--parameter',help="The parameter to extract.",required=True)
args = parser.parse_args()
return args
def main():
args = get_args()
metaFile = pd.read_csv(args.metaFile,sep=",",header=0)
# Check replicate RID metadata from 'File.csv'
if (args.parameter == "repRID"):
if (len(metaFile.Replicate_RID.unique()) > 1):
print("There are multiple replicate RID's in the metadata: " + " ".join(metaFile.Replicate_RID.unique()))
exit(1)
if not (metaFile.Replicate_RID.unique() == args.repRID):
print("Replicate RID in metadata does not match run parameters: " + metaFile.Replicate_RID.unique() + " vs " + args.repRID)
exit(1)
else:
rep=metaFile["Replicate_RID"].unique()[0]
print(rep)
if (len(metaFile[metaFile["File_Type"] == "FastQ"]) > 2):
print("There are more then 2 fastq's in the metadata: " + " ".join(metaFile[metaFile["File_Type"] == "FastQ"].RID))
exit(1)
# Check experiment RID metadata from 'Experiment.csv'
if (args.parameter == "expRID"):
if (len(metaFile.Experiment_RID.unique()) > 1):
print("There are multiple experoment RID's in the metadata: " + " ".join(metaFile.Experiment_RID.unique()))
exit(1)
else:
exp=metaFile["Experiment_RID"].unique()[0]
print(exp)
# Check study RID metadata from 'Experiment.csv'
if (args.parameter == "studyRID"):
if (len(metaFile.Study_RID.unique()) > 1):
print("There are multiple study RID's in the metadata: " + " ".join(metaFile.Study_RID.unique()))
exit(1)
else:
study=metaFile["Study_RID"].unique()[0]
print(study)
# Get endedness metadata from 'Experiment Settings.csv'
if (args.parameter == "endsMeta"):
if (metaFile.Paired_End.unique() == "Single End"):
endsMeta = "se"
elif (metaFile.Paired_End.unique() == "Paired End"):
endsMeta = "pe"
else:
endsMeta = "uk"
print(endsMeta)
# Manually determine endedness from the FastQ count in 'File.csv'
if (args.parameter == "endsManual"):
if (len(metaFile[metaFile["File_Type"] == "FastQ"]) == 1):
endsManual = "se"
elif (len(metaFile[metaFile["File_Type"] == "FastQ"]) == 2):
endsManual = "pe"
else:
endsManual = "uk" # fall back to unknown so the print below never hits an undefined name
print(endsManual)
# Get strandedness metadata from 'Experiment Settings.csv'
if (args.parameter == "stranded"):
if (metaFile.Has_Strand_Specific_Information.unique() == "yes"):
stranded = "stranded"
elif (metaFile.Has_Strand_Specific_Information.unique() == "no"):
stranded = "unstranded"
else:
print("Stranded metadata not match expected options: " + metaFile.Has_Strand_Specific_Information.unique())
exit(1)
print(stranded)
# Get spike-in metadata from 'Experiment Settings.csv'
if (args.parameter == "spike"):
if (metaFile.Used_Spike_Ins.unique() == "yes"):
spike = "yes"
elif (metaFile.Used_Spike_Ins.unique() == "no"):
spike = "no"
else:
print("Spike-ins metadata not match expected options: " + metaFile.Used_Spike_Ins.unique())
exit(1)
print(spike)
# Get species metadata from 'Experiment.csv'
if (args.parameter == "species"):
if (metaFile.Species.unique() == "Mus musculus"):
species = "Mus musculus"
elif (metaFile.Species.unique() == "Homo sapiens"):
species = "Homo sapiens"
else:
print("Species metadata not match expected options: " + metaFile.Species.unique())
exit(1)
print(species)
# Get read length metadata from 'Experiment Settings.csv'
if (args.parameter == "readLength"):
readLength = metaFile.Read_Length.unique()
print(str(readLength).strip('[]'))
if __name__ == '__main__':
main()
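A hedged usage sketch (script filename assumed; the CSVs named in the comments above come out of the bag export spec earlier in this diff):
python3 parseMetadata.py -r 16-1ZX4 -m File.csv -p endsManual   # hypothetical names; prints 'se' or 'pe'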
#!/bin/bash
while read loc checksum fileLocation
do
file=$(echo ${fileLocation##*/})
fileName=$(echo ${file%.R*.fastq.gz})
fileExt=$(echo ${file##${fileName}.})
while IFS="," read RID Study_RID Experiment_RID Replicate_RID Caption File_Type File_Name URI File_size MD5 GEO_Archival_URL dbGaP_Accession_ID Processed Notes Principal_Investigator Consortium Release_Date RCT RMT Legacy_File_RID GUDMAP_NGF_OID GUDMAP_NGS_OID
do
if [ ${file} == ${File_Name} ]
then
find . -type f -name ${file} -execdir mv {} ${Replicate_RID}.${fileExt} ';'
fi
done < $1/data/File.csv
done < $1/fetch.txt
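A hedged invocation sketch (script name assumed; the single positional argument is the bag directory holding fetch.txt and data/File.csv, per the redirects above):
bash renameFastqs.sh Replicate_16-1ZX4   # hypothetical names; renames each fastq to <Replicate_RID>.<ext>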
#!/usr/bin/env python3
import argparse
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('-f', '--fetchFile',help="The fetch file from the bdbag zip.",required=True)
parser.add_argument('-s', '--studyRID',help="The study RID.",required=True)
args = parser.parse_args()
return args
def main():
args = get_args()
fetch = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None)
fetch_filtered = fetch[fetch[2].str[-9:]==".fastq.gz"]
fetch_filtered.to_csv(args.fetchFile+"/fetch.txt",sep="\t",header=False,index=False)
studyRID=pd.read_json(args.studyRID+"_studyRID.json")
if studyRID["RID"].count() > 0:
studyRID["RID"].to_csv(args.studyRID+"_studyRID.csv",header=False,index=False)
else:
raise Exception("No associated replicates found: %s" %
studyRID)
if __name__ == '__main__':
main()
#!/bin/bash
# query GUDMAP/RBK for study RID
echo "curl --location --request GET 'https://www.gudmap.org/ermrest/catalog/2/entity/RNASeq:Replicate/Study_RID="${1}"'" | bash > $1_studyRID.json
# extract replicate RIDs
module load python/3.6.4-anaconda
python3 ./workflow/scripts/splitStudy.py -s $1
# run pipeline on replicate RIDs in parallel
module load nextflow/20.01.0
module load singularity/3.5.3
while read repRID; do echo ${repRID}; done < "$1_studyRID.csv" | xargs -P 5 -I {} nextflow run workflow/rna-seq.nf --repRID {}
# cleanup study RID files
rm $1_studyRID.json
rm $1_studyRID.csv
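A hedged usage sketch for this study-level wrapper (the shell script's path is assumed to sit beside workflow/scripts/splitStudy.py referenced above):
bash workflow/scripts/splitStudy.sh <Study_RID>   # runs up to 5 replicates in parallel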