diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ef034806ec36810bf6ee22e065caf805d5ede6ca..8df331b6f4ebe20f6f880bfcd948c7a48986b94d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -123,7 +123,7 @@ parseMetadata: - version_python.txt expire_in: 7 days -inferMetadata: +fastqc: stage: unit only: - push @@ -132,22 +132,37 @@ inferMetadata: - merge_requests - schedules script: - - singularity run 'docker://gudmaprbk/rseqc4.0.0:1.0.0' infer_experiment.py --version > version_rseqc.txt - - > - align=$(echo $(grep "Overall alignment rate" ./test_data/meta/Q-Y5F6_1M.se.alignSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) && - if [[ ${align} == "" ]]; then exit 1; fi - - > - singularity run 'docker://gudmaprbk/rseqc4.0.0:1.0.0' infer_experiment.py -r "/project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/annotation/genome.bed" -i "./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam" 1>> Q-Y5F6_1M.se.inferMetadata.log && - ended=`singularity run 'gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/infer_meta.sh endness Q-Y5F6_1M.se.inferMetadata.log` && - if [[ ${ended} == "" ]]; then exit 1; fi - - pytest -m inferMetadata + - singularity run 'docker://gudmaprbk/fastqc0.11.9:1.0.0' fastqc --version > version_fastqc.txt + - singularity run 'docker://gudmaprbk/fastqc0.11.9:1.0.0' fastqc ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o . + - pytest -m fastqc + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - version_fastqc.txt + expire_in: 7 days + +seqwho: + stage: unit + only: + - push + - tags + except: + - merge_requests + - schedules + script: + - wget -O SeqWho.ix https://cloud.biohpc.swmed.edu/index.php/s/eeNWqZz8jqN5zWY/download + - singularity run 'docker://gudmaprbk/seqwho0.0.1:1.0.0' seqwho.py | grep -o Version.* > version_seqwho.txt + - singularity run 'docker://gudmaprbk/seqwho0.0.1:1.0.0' seqwho.py -f ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -x SeqWho.ix + - pytest -m seqwho artifacts: name: "$CI_JOB_NAME" when: always paths: - - version_rseqc.txt + - version_seqwho.txt expire_in: 7 days + trimData: stage: unit only: @@ -182,6 +197,31 @@ downsampleData: - singularity run 'docker://gudmaprbk/seqtk1.3:1.0.0' seqtk sample -s100 ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz 1000 1> sampled.1.fq - pytest -m downsampleData + inferMetadata: + stage: unit + only: + - push + - tags + except: + - merge_requests + - schedules + script: + - singularity run 'docker://gudmaprbk/rseqc4.0.0:1.0.0' infer_experiment.py --version > version_rseqc.txt + - > + align=$(echo $(grep "Overall alignment rate" ./test_data/meta/Q-Y5F6_1M.se.alignSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) && + if [[ ${align} == "" ]]; then exit 1; fi + - > + singularity run 'docker://gudmaprbk/rseqc4.0.0:1.0.0' infer_experiment.py -r "/project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/annotation/genome.bed" -i "./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam" 1>> Q-Y5F6_1M.se.inferMetadata.log && + ended=`singularity run 'gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/infer_meta.sh endness Q-Y5F6_1M.se.inferMetadata.log` && + if [[ ${ended} == "" ]]; then exit 1; fi + - pytest -m inferMetadata + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - version_rseqc.txt + expire_in: 7 days + alignData: stage: unit only: @@ -282,26 +322,6 @@ makeBigWig: - version_deeptools.txt expire_in: 7 days -fastqc: - stage: unit - only: - - push - - tags - except: - - merge_requests - - schedules - script: - - singularity run 
'docker://gudmaprbk/fastqc0.11.9:1.0.0' fastqc --version > version_fastqc.txt - - singularity run 'docker://gudmaprbk/fastqc0.11.9:1.0.0' fastqc ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o . - - pytest -m fastqc - artifacts: - name: "$CI_JOB_NAME" - when: always - paths: - - version_fastqc.txt - expire_in: 7 days - - dataQC: stage: unit only: diff --git a/docs/software_references_mqc.yaml b/docs/software_references_mqc.yaml index d9d18558b7df3f626ff89cdb01c610228db92a8b..1456ff7a0e42f6b00caf4726844a398e6bf18a6a 100755 --- a/docs/software_references_mqc.yaml +++ b/docs/software_references_mqc.yaml @@ -80,12 +80,18 @@ <li>FastQC <a href="https://www.bioinformatics.babraham.ac.uk/projects/fastqc/" class="uri">https://www.bioinformatics.babraham.ac.uk/projects/fastqc/</a></li> </ul> <ol start="13" style="list-style-type: decimal"> + <li><strong>SeqWho</strong></li> + </ol> + <ul> + <li>SeqWho <a href="https://git.biohpc.swmed.edu/s181649/seqwho" class="uri">https://git.biohpc.swmed.edu/s181649/seqwho/</a></li> + </ul> + <ol start="14" style="list-style-type: decimal"> <li><strong>MultiQC</strong>:</li> </ol> <ul> <li>Ewels P., Magnusson M., Lundin S. and Käller M. 2016. MultiQC: Summarize analysis results for multiple tools and samples in a single report. Bioinformatics 32(19): 3047–3048. doi:<a href="https://dx.doi.org/10.1093/bioinformatics/btw354">10.1093/bioinformatics/btw354</a></li> </ul> - <ol start="14" style="list-style-type: decimal"> + <ol start="15" style="list-style-type: decimal"> <li><strong>Nextflow</strong>:</li> </ol> <ul> diff --git a/workflow/conf/aws.config b/workflow/conf/aws.config index b745882bfa12898838cd1449013b499698cc5b54..7f6003cad373c86d9aa67e8afcb70e9696964754 100644 --- a/workflow/conf/aws.config +++ b/workflow/conf/aws.config @@ -32,6 +32,18 @@ process { cpus = 15 memory = '1 GB' } + withName:getRefERCC { + cpus = 1 + memory = '1 GB' + } + withName:getRef { + cpus = 1 + memory = '1 GB' + } + withName:fastqc { + cpus = 1 + memory = '1 GB' + } withName:seqwho { cpus = 1 memory = '1 GB' @@ -40,14 +52,14 @@ process { cpus = 20 memory = '2 GB' } - withName:getRefInfer { - cpus = 1 - memory = '1 GB' - } withName:downsampleData { cpus = 1 memory = '1 GB' } + withName:alignSampleDataERCC { + cpus = 50 + memory = '5 GB' + } withName:alignSampleData { cpus = 50 memory = '5 GB' @@ -60,10 +72,6 @@ process { cpus = 1 memory = '1 GB' } - withName:getRef { - cpus = 1 - memory = '1 GB' - } withName:alignData { cpus = 50 memory = '10 GB' @@ -80,10 +88,6 @@ process { cpus = 15 memory = '5 GB' } - withName:fastqc { - cpus = 1 - memory = '1 GB' - } withName:dataQC { cpus = 15 memory = '2 GB' diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index b71c2d652594a3ca16a04b6bf7f84c501126c616..dff28cb4ae54ee54ad63204ec8bd88e2441eb71b 100755 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -22,18 +22,27 @@ process { withName:parseMetadata { executor = 'local' } + withName:getRefERCC { + queue = 'super' + } + withName:getRef { + queue = 'super' + } + withName:fastqc { + queue = 'super' + } withName:seqwho { executor = 'local' } withName:trimData { queue = 'super' } - withName:getRefInfer { - queue = 'super' - } withName:downsampleData { executor = 'local' } + withName:alignSampleDataERCC { + queue = '128GB,256GB,256GBv1,384GB' + } withName:alignSampleData { queue = '128GB,256GB,256GBv1,384GB' } @@ -43,9 +52,6 @@ process { withName:checkMetadata { executor = 'local' } - withName:getRef { - queue = 'super' - } withName:alignData { queue = 
'256GB,256GBv1' } @@ -58,9 +64,6 @@ process { withName:makeBigWig { queue = 'super' } - withName:fastqc { - queue = 'super' - } withName:dataQC { queue = 'super' } diff --git a/workflow/nextflow.config b/workflow/nextflow.config index 868dac461a3a74e4a15e7ecdfd6ac81443c05d61..40da6fe4a4e9398be973003e7ae2dcb58f92b2c4 100644 --- a/workflow/nextflow.config +++ b/workflow/nextflow.config @@ -28,18 +28,27 @@ process { withName:parseMetadata { container = 'gudmaprbk/python3:1.0.0' } + withName:getRefERCC { + container = 'gudmaprbk/deriva1.4:1.0.0' + } + withName:getRef { + container = 'gudmaprbk/deriva1.4:1.0.0' + } + withName:fastqc { + container = 'gudmaprbk/fastqc0.11.9:1.0.0' + } withName:seqwho { container = 'gudmaprbk/seqwho0.0.1:1.0.0' } withName:trimData { container = 'gudmaprbk/trimgalore0.6.5:1.0.0' } - withName:getRefInfer { - container = 'gudmaprbk/deriva1.4:1.0.0' - } withName:downsampleData { container = 'gudmaprbk/seqtk1.3:1.0.0' } + withName:alignSampleDataERCC { + container = 'gudmaprbk/hisat2.2.1:1.0.0' + } withName:alignSampleData { container = 'gudmaprbk/hisat2.2.1:1.0.0' } @@ -49,9 +58,6 @@ process { withName:checkMetadata { container = 'gudmaprbk/gudmap-rbk_base:1.0.0' } - withName:getRef { - container = 'gudmaprbk/deriva1.4:1.0.0' - } withName:alignData { container = 'gudmaprbk/hisat2.2.1:1.0.0' } @@ -64,9 +70,6 @@ process { withName:makeBigWig { container = 'gudmaprbk/deeptools3.5.0:1.0.0' } - withName:fastqc { - container = 'gudmaprbk/fastqc0.11.9:1.0.0' - } withName:dataQC { container = 'gudmaprbk/rseqc4.0.0:1.0.0' } diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index 7545b2522e9c9a161b30130dff1bee7b550f875c..d7f59ed5df22397c3487c2223f3066cc406d4b97 100644 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -22,7 +22,6 @@ params.upload = false params.email = "" params.track = false - // Define override input variable params.refSource = "biohpc" params.inputBagForce = "" @@ -35,14 +34,13 @@ params.spikeForce = "" params.ci = false params.dev = true - // Parse input variables deriva = Channel .fromPath(params.deriva) .ifEmpty { exit 1, "deriva credential file not found: ${params.deriva}" } deriva.into { deriva_getBag - deriva_getRefInfer + deriva_getRefERCC deriva_getRef deriva_uploadInputBag deriva_uploadExecutionRun @@ -220,7 +218,7 @@ process getBag { """ } -// Set inputBag to downloaded or forced input +// Set inputBag to downloaded or forced input and replicate them for multiple process inputs if (inputBagForce != "") { inputBag = Channel .fromPath(inputBagForce) @@ -234,7 +232,7 @@ inputBag.into { } /* - * getData: fetch replicate files from consortium with downloaded bdbag.zip + * getData: fetch replicate files from consortium with downloaded input bag */ process getData { tag "${repRID}" @@ -297,11 +295,15 @@ if (fastqsForce != "") { .fromPath(fastqsForce) .ifEmpty { exit 1, "override inputBag file not found: ${fastqsForce}" } .collect().into { + fastqs_seqwho + fastqs_trimData fastqs_parseMetadata fastqs_fastqc } } else { fastqs.collect().into { + fastqs_seqwho + fastqs_trimData fastqs_parseMetadata fastqs_fastqc } @@ -309,7 +311,7 @@ if (fastqsForce != "") { /* * parseMetadata: parses metadata to extract experiment parameters -*/ + */ process parseMetadata { tag "${repRID}" @@ -464,7 +466,7 @@ process parseMetadata { """ } -// Split metadata into separate channels +// Split metadata into separate channels and replicate them for multiple process inputs endsMeta = Channel.create() endsRaw = Channel.create() endsManual = Channel.create() @@ -485,8 
+487,6 @@ metadata_fl.splitCsv(sep: ",", header: false).separate( expRID, studyRID ) - -// Replicate metadata for multiple process inputs endsMeta.into { endsMeta_checkMetadata endsMeta_aggrQC @@ -496,6 +496,7 @@ endsManual.into { endsManual_seqwho endsManual_trimData endsManual_downsampleData + endsManual_alignSampleDataERCC endsManual_alignSampleData endsManual_aggrQC } @@ -511,6 +512,7 @@ spikeMeta.into { spikeMeta_failExecutionRun } speciesMeta.into { + speciesMeta_seqwho speciesMeta_checkMetadata speciesMeta_aggrQC speciesMeta_failPreExecutionRun @@ -527,7 +529,7 @@ expRID.into { expRID_uploadProcessedFile } -// Split fastq count error into separate channel +// Split fastq count error into separate channel and replicate them for multiple process inputs fastqCountError = Channel.create() fastqCountError_details = Channel.create() fastqReadError = Channel.create() @@ -538,74 +540,74 @@ fastqError_fl.splitCsv(sep: ",", header: false).separate( fastqReadError, fastqReadError_details ) - -// Replicate errors for multiple process inputs fastqCountError.into { fastqCountError_fastqc fastqCountError_seqwho + fastqCountError_getRefERCC + fastqCountError_getRef fastqCountError_trimData - fastqCountError_getRefInfer fastqCountError_downsampleData + fastqCountError_alignSampleDataERCC fastqCountError_alignSampleData fastqCountError_inferMetadata fastqCountError_checkMetadata - fastqCountError_uploadExecutionRun - fastqCountError_getRef fastqCountError_alignData fastqCountError_dedupData fastqCountError_makeBigWig fastqCountError_countData fastqCountError_dataQC fastqCountError_aggrQC + fastqCountError_uploadExecutionRun fastqCountError_uploadQC - fastqCountError_uploadQC_fail fastqCountError_uploadProcessedFile fastqCountError_uploadOutputBag - fastqCountError_failPreExecutionRun_fastq + fastqCountError_finalizeExecutionRun + fastqCountError_uploadQC_fail } fastqReadError.into { fastqReadError_fastqc fastqReadError_seqwho + fastqReadError_getRefERCC + fastqReadError_getRef fastqReadError_trimData - fastqReadError_getRefInfer fastqReadError_downsampleData + fastqReadError_alignSampleDataERCC fastqReadError_alignSampleData fastqReadError_inferMetadata fastqReadError_checkMetadata - fastqReadError_uploadExecutionRun - fastqReadError_getRef fastqReadError_alignData fastqReadError_dedupData fastqReadError_makeBigWig fastqReadError_countData fastqReadError_dataQC fastqReadError_aggrQC + fastqReadError_uploadExecutionRun fastqReadError_uploadQC - fastqReadError_uploadQC_fail fastqReadError_uploadProcessedFile fastqReadError_uploadOutputBag - fastqReadError_failPreExecutionRun_fastq + fastqReadError_finalizeExecutionRun + fastqReadError_uploadQC_fail } /* - *fastqc: run fastqc on untrimmed fastq's -*/ + * fastqc: run fastqc on untrimmed fastq's + */ process fastqc { tag "${repRID}" input: - path (fastq) from fastqs_fastqc.collect() - val fastqCountError_fastqc - val fastqReadError_fastqc + path (fastq) from fastqs_fastqc + val fastqCountError from fastqCountError_fastqc + val fastqReadError from fastqReadError_fastqc output: - path ("*.R{1,2}.fastq.gz", includeInputs:true) into fastqs_out path ("*_fastqc.zip") into fastqc path ("rawReads.csv") into rawReadsInfer_fl path "fastqFileError.csv" into fastqFileError_fl when: - fastqCountError_fastqc == 'false' && fastqReadError_fastqc == 'false' + fastqCountError == 'false' + fastqReadError == 'false' script: """ @@ -636,76 +638,65 @@ process fastqc { """ } -// Replicate fastqs for downstream process inputs -fastqs_out.into { - fastqs_seqwho - fastqs_trimData 
-} - -// Extract number of raw reads metadata into channel +// Extract number of raw reads metadata into channel and replicate them for multiple process inputs rawReadsInfer = Channel.create() rawReadsInfer_fl.splitCsv(sep: ",", header: false).separate( rawReadsInfer ) - -// Replicate inferred raw reads for multiple process inputs rawReadsInfer.into { rawReadsInfer_aggrQC rawReadsInfer_uploadQC } -// Split fastq count error into separate channel +// Split fastq file error into separate channel and replicate them for multiple process inputs fastqFileError = Channel.create() fastqFileError_details = Channel.create() fastqFileError_fl.splitCsv(sep: ",", header: false).separate( fastqFileError, fastqFileError_details ) - -// Replicate errors for multiple process inputs fastqFileError.into { - fastqFileError_seqwho fastqFileError_trimData - fastqFileError_getRefInfer + fastqFileError_getRef fastqFileError_downsampleData + fastqFileError_alignSampleDataERCC fastqFileError_alignSampleData fastqFileError_inferMetadata fastqFileError_checkMetadata - fastqFileError_uploadExecutionRun - fastqFileError_getRef fastqFileError_alignData fastqFileError_dedupData fastqFileError_makeBigWig fastqFileError_countData fastqFileError_dataQC fastqFileError_aggrQC + fastqFileError_uploadExecutionRun fastqFileError_uploadQC - fastqFileError_uploadQC_fail fastqFileError_uploadProcessedFile fastqFileError_uploadOutputBag - fastqFileError_failPreExecutionRun_fastqFile + fastqFileError_finalizeExecutionRun + fastqFileError_uploadQC_fail } /* - * seqwho: use seqwho to infer species and seq type -*/ + * seqwho: run seqwho to infer species and seq type + */ process seqwho { tag "${repRID}" input: path (fastq) from fastqs_seqwho val ends from endsManual_seqwho - val fastqCountError_seqwho - val fastqReadError_seqwho - val fastqFileError_seqwho + val speciesMeta from speciesMeta_seqwho + val fastqCountError from fastqCountError_seqwho + val fastqReadError from fastqReadError_seqwho output: + path "inferSpecies.csv" into inferSpecies_fl path "inferError.csv" into inferError_fl when: - fastqCountError_seqwho == "false" - fastqReadError_seqwho == "false" - fastqFileError_seqwho == "false" + fastqCountError == "false" + fastqReadError == "false" script: """ @@ -785,21 +776,28 @@ process seqwho { echo -e "LOG: confidence converted to string" >> ${repRID}.seqwho.log # detect errors - speciesInferError=false - speciesInferError_details="" - seqtypeInferError=false - seqtypeInferError_details="" + speciesErrorSeqwho=false + speciesErrorSeqwho_details="" + seqtypeError=false + seqtypeError_details="" if [ "\${confidenceR1}" == "high" ] && [ "\${confidenceR2}" == "high" ] then echo -e "LOG: high confidence inference detected" >> ${repRID}.seqwho.log if [ "\${speciesR1}" == "\${speciesR2}" ] then speciesInfer=\${speciesR1} + if [ "\${speciesInfer}" == "human" ] + then + speciesInfer="Homo sapiens" + elif [ "\${speciesInfer}" == "mouse" ] + then + speciesInfer="Mus musculus" + fi echo -e "LOG: concordant species inference: \${speciesInfer}" >> ${repRID}.seqwho.log else - speciesInferError=true - speciesInferError_details="**Infered species does not match for R1 and R2:** Infered R1 = \${speciesR1} and infered R2 = \${speciesR2}" - echo -e "LOG: inference error: \${speciesInferError_details}" >> ${repRID}.seqwho.log + speciesErrorSeqwho=true + speciesErrorSeqwho_details="**Inferred species does not match for R1 and R2:** Inferred R1 = \${speciesR1} and inferred R2 = \${speciesR2}" + echo -e "LOG: inference error: 
\${speciesErrorSeqwho_details}" >> ${repRID}.seqwho.log fi if [ "\${seqtypeR1}" == "\${seqtypeR2}" ] then @@ -808,76 +806,191 @@ process seqwho { seqtpeInfer="rnaseq" echo -e "LOG: concordant rnaseq seq type inference detected" >> ${repRID}.seqwho.log else - seqtypeInferError=true - seqtypeInferError_details="**Infered sequencing type is not mRNA-seq:** Infered = \${seqtypeR1}" - echo -e "LOG: inference error: \${seqtypeInferError_details}" >> ${repRID}.seqwho.log + seqtypeError=true + seqtypeError_details="**Inferred sequencing type is not mRNA-seq:** Inferred = \${seqtypeR1}" + echo -e "LOG: inference error: \${seqtypeError_details}" >> ${repRID}.seqwho.log fi else - seqtypeInferError=true - seqtypeInferError_details="**Infered sequencing type does not match for R1 and R2:** Infered R1 = \${seqtypeR1} and infered R2 = \${seqtypeR2}" - echo -e "LOG: inference error: \${seqtypeInferError_details}" >> ${repRID}.seqwho.log + seqtypeError=true + seqtypeError_details="**Inferred sequencing type does not match for R1 and R2:** Inferred R1 = \${seqtypeR1} and inferred R2 = \${seqtypeR2}" + echo -e "LOG: inference error: \${seqtypeError_details}" >> ${repRID}.seqwho.log fi else - speciesInferError=true - speciesInferError_details=\$(echo "**Infered species and or sequencing type confidence is low:**\\n") - speciesInferError_details=\$(echo \${speciesInferError_details}|fastq|Infered species confidence|Infered sequencing type confidence|\\n) - speciesInferError_details=\$(echo \${speciesInferError_details}|:--|:--:|:--:|\\n) - speciesInferError_details=\$(echo \${speciesInferError_details}|Read 1|\${speciesConfidenceR1}|\${seqtypeConfidenceR1}|\\n) + speciesErrorSeqwho=true + speciesErrorSeqwho_details=\$(echo "**Inferred species and/or sequencing type confidence is low:**\\n") + speciesErrorSeqwho_details=\$(echo \${speciesErrorSeqwho_details}|fastq|Inferred species confidence|Inferred sequencing type confidence|\\n) + speciesErrorSeqwho_details=\$(echo \${speciesErrorSeqwho_details}|:--|:--:|:--:|\\n) + speciesErrorSeqwho_details=\$(echo \${speciesErrorSeqwho_details}|Read 1|\${speciesConfidenceR1}|\${seqtypeConfidenceR1}|\\n) if [ "${ends}" == "pe" ] then - speciesInferError_details=\$(echo \${speciesInferError_details}|Read 2|\${speciesConfidenceR2}|\${seqtypeConfidenceR2}|\\n) + speciesErrorSeqwho_details=\$(echo \${speciesErrorSeqwho_details}|Read 2|\${speciesConfidenceR2}|\${seqtypeConfidenceR2}|\\n) + fi + echo -e "LOG: inference error: \${speciesErrorSeqwho_details}" >> ${repRID}.seqwho.log + fi + + if [ "${speciesMeta}" != "\${speciesInfer}" ] + then + if [ "${params.speciesForce}" != "" ] + then + speciesError=false + echo -e "LOG: species forced: Submitted=${speciesMeta}; Inferred=\${speciesInfer}; Forced=${params.speciesForce}" >> ${repRID}.seqwho.log + else + speciesError=true + echo -e "LOG: species does not match: Submitted=${speciesMeta}; Inferred=\${speciesInfer}" >> ${repRID}.seqwho.log fi - echo -e "LOG: inference error: \${speciesInferError_details}" >> ${repRID}.seqwho.log + else + speciesError=false + echo -e "LOG: species matches: Submitted=${speciesMeta}; Inferred=\${speciesInfer}" >> ${repRID}.seqwho.log fi + + # save species file + echo "\${speciesInfer}" > inferSpecies.csv # save error file - echo "\${seqtypeInferError},\${seqtypeInferError_details}" > inferError.csv + echo "\${seqtypeError},\${seqtypeError_details},\${speciesErrorSeqwho},\${speciesErrorSeqwho_details},\${speciesError}" > inferError.csv """ } -// Split species error into separate channel -seqtypeInferError = 
Channel.create() -seqtypeInferError_details = Channel.create() +// Extract inferred species metadata into channel and replicate them for multiple process inputs +speciesInfer = Channel.create() +inferSpecies_fl.splitCsv(sep: ",", header: false).separate( + speciesInfer +) +speciesInfer.into { + speciesInfer_getRef + speciesInfer_alignSampleData + speciesInfer_checkMetadata + speciesInfer_aggrQC + speciesInfer_uploadExecutionRun + speciesInfer_uploadProcessedFile + speciesInfer_failExecutionRun +} + +// Extract seq type and species errors into separate channels and replicate them for multiple process inputs +seqtypeError = Channel.create() +seqtypeError_details = Channel.create() +speciesErrorSeqwho = Channel.create() +speciesErrorSeqwho_details = Channel.create() +speciesError = Channel.create() inferError_fl.splitCsv(sep: ",", header: false).separate( - seqtypeInferError, - seqtypeInferError_details + seqtypeError, + seqtypeError_details, + speciesErrorSeqwho, + speciesErrorSeqwho_details, + speciesError ) +seqtypeError.into { + seqtypeError_getRef + seqtypeError_downsampleData + seqtypeError_alignSampleDataERCC + seqtypeError_alignSampleData + seqtypeError_inferMetadata + seqtypeError_checkMetadata + seqtypeError_alignData + seqtypeError_dedupData + seqtypeError_makeBigWig + seqtypeError_countData + seqtypeError_dataQC + seqtypeError_aggrQC + seqtypeError_uploadExecutionRun + seqtypeError_uploadQC + seqtypeError_uploadProcessedFile + seqtypeError_uploadOutputBag + seqtypeError_finalizeExecutionRun + seqtypeError_uploadQC_fail +} +speciesError.into { + speciesError_checkMetadata + speciesError_uploadExecutionRun + speciesError_getRef + speciesError_alignSampleData + speciesError_inferMetadata + speciesError_alignData + speciesError_dedupData + speciesError_makeBigWig + speciesError_countData + speciesError_fastqc + speciesError_dataQC + speciesError_aggrQC + speciesError_uploadQC + speciesError_uploadQC_fail + speciesError_uploadProcessedFile + speciesError_uploadOutputBag + speciesError_finalizeExecutionRun + speciesError_failPreExecutionRun_species +} + +/* + * getRefERCC: downloads ERCC reference for spike metadata inference + */ +process getRefERCC { + tag "${repRID}" + + input: + path (credential, stageAs: "credential.json") from deriva_getRefERCC + path script_refDataInfer + val fastqCountError from fastqCountError_getRefERCC + val fastqReadError from fastqReadError_getRefERCC + + output: + tuple path ("hisat2", type: 'dir'), path ("*.fna"), path ("*.gtf") into refERCC + + when: + fastqCountError == "false" + fastqReadError == "false" + + script: + """ + hostname > ${repRID}.getRefERCC.log + ulimit -a >> ${repRID}.getRefERCC.log + + # link credential file for authentication + echo -e "LOG: linking deriva credentials" >> ${repRID}.getRefERCC.log + mkdir -p ~/.deriva + ln -sf `readlink -e credential.json` ~/.deriva/credential.json + echo -e "LOG: linked" >> ${repRID}.getRefERCC.log + + # set the reference name + references=\$(echo ${referenceBase}/ERCC${refERCCVersion}) -// Replicate errors for multiple process inputs -seqtypeInferError.into { - seqtypeInferError_trimData - seqtypeInferError_getRefInfer - seqtypeInferError_downsampleData - seqtypeInferError_alignSampleData - seqtypeInferError_inferMetadata - seqtypeInferError_checkMetadata - seqtypeInferError_uploadExecutionRun - seqtypeInferError_getRef - seqtypeInferError_alignData - seqtypeInferError_dedupData - seqtypeInferError_makeBigWig - seqtypeInferError_countData - seqtypeInferError_dataQC - seqtypeInferError_aggrQC - 
seqtypeInferError_uploadQC - seqtypeInferError_uploadQC_fail - seqtypeInferError_uploadProcessedFile - seqtypeInferError_uploadOutputBag + # retrieve reference from appropriate location + echo -e "LOG: fetching ERCC reference files from ${referenceBase}" >> ${repRID}.getRefERCC.log + if [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references/new" ] + then + unzip \${references}.zip + mv \$(basename \${references})/data/* . + elif [ "${params.refSource}" == "datahub" ] + then + query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version=ERCC${refERCCVersion}/Annotation_Version=ERCC${refERCCVersion}') + curl --request GET \${query} > refQuery.json + refURL=\$(python ${script_refDataInfer} --returnParam URL) + loc=\$(dirname \${refURL}) + fName=\$(python ${script_refDataInfer} --returnParam fName) + fName=\${fName%.*} + if [ "\${loc}" = "/hatrac/*" ]; then echo "LOG: Reference not present in hatrac"; exit 1; fi + filename=\$(echo \$(basename \${refURL}) | grep -oP '.*(?=:)') + deriva-hatrac-cli --host ${referenceBase} get \${refURL} + unzip \$(basename \${refURL}) + mv \${fName}/data/* . + fi + mv ./annotation/genome.gtf . + mv ./sequence/genome.fna . + echo -e "LOG: fetched" >> ${repRID}.getRefERCC.log + """ } /* * trimData: trims any adapter or non-host sequences from the data -*/ + */ process trimData { tag "${repRID}" input: path (fastq) from fastqs_trimData val ends from endsManual_trimData - val fastqCountError_trimData - val fastqReadError_trimData - val fastqFileError_trimData - val seqtypeInferError_trimData + val fastqCountError from fastqCountError_trimData + val fastqReadError from fastqReadError_trimData + val fastqFileError from fastqFileError_trimData output: path ("*.fq.gz") into fastqsTrim path ("readLength.csv") into readLengthInfer_fl when: - fastqCountError_trimData == "false" - fastqReadError_trimData == "false" - fastqFileError_trimData == "false" - seqtypeInferError_trimData == "false" + fastqCountError == "false" + fastqReadError == "false" + fastqFileError == "false" script: """ @@ -914,108 +1026,19 @@ process trimData { """ } -// Extract calculated read length metadata into channel +// Extract calculated read length metadata into channel and replicate them for multiple process inputs readLengthInfer = Channel.create() readLengthInfer_fl.splitCsv(sep: ",", header: false).separate( readLengthInfer ) - -// Replicate inferred read length for multiple process inputs readLengthInfer.into { readLengthInfer_aggrQC readLengthInfer_uploadQC } // Replicate trimmed fastq's for multiple process inputs fastqsTrim.into { - fastqsTrim_alignData fastqsTrim_downsampleData -} - -// Combine inputs of getRefInfer getRefInferInput = referenceInfer.combine(deriva_getRefInfer.combine(script_refDataInfer.combine(fastqCountError_getRefInfer.combine(fastqReadError_getRefInfer.combine(fastqFileError_getRefInfer.combine(seqtypeInferError_getRefInfer)))))) -/* - * getRefInfer: dowloads appropriate reference for metadata inference -*/ -process getRefInfer { - tag "${refName}" - - input: - tuple val (refName), path (credential, stageAs: "credential.json"), path (script_refDataInfer), val (fastqCountError), val (fastqReadError), val (fastqFileError), val (seqtypeInferError) from getRefInferInput - - output: - tuple val (refName), path ("hisat2", type: 'dir'), path ("*.fna"), path ("*.gtf") into refInfer - path ("${refName}", type: 'dir') into bedInfer - - when: - fastqCountError == "false" - 
fastqReadError == "false" - fastqFileError == "false" - seqtypeInferError == "false" - - script: - """ - hostname > ${repRID}.${refName}.getRefInfer.log - ulimit -a >> ${repRID}.${refName}.getRefInfer.log - - # link credential file for authentication - echo -e "LOG: linking deriva credentials" >> ${repRID}.${refName}.getRefInfer.log - mkdir -p ~/.deriva - ln -sf `readlink -e credential.json` ~/.deriva/credential.json - echo -e "LOG: linked" >> ${repRID}.${refName}.getRefInfer.log - - # set the reference name - if [ "${refName}" == "ERCC" ] - then - references=\$(echo ${referenceBase}/ERCC${refERCCVersion}) - elif [ "${refName}" == "GRCm" ] - then - references=\$(echo ${referenceBase}/GRCm${refMoVersion}) - elif [ '${refName}' == "GRCh" ] - then - references=\$(echo ${referenceBase}/GRCh${refHuVersion}) - else - echo -e "LOG: ERROR - References could not be set!\nReference found: ${referenceBase}" >> ${repRID}.${refName}.getRefInfer.log - exit 1 - fi - - # retreive appropriate reference appropriate location - echo -e "LOG: fetching ${refName} reference files from ${referenceBase}" >> ${repRID}.${refName}.getRefInfer.log - if [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references/new" ] - then - unzip \${references}.zip - mv \$(basename \${references})/data/* . - elif [ params.refSource == "datahub" ] - then - GRCv=\$(echo \${references} | grep -o ${refName}.* | cut -d '.' -f1) - GRCp=\$(echo \${references} | grep -o ${refName}.* | cut -d '.' -f2) - GENCODE=\$(echo \${references} | grep -o ${refName}.* | cut -d '.' -f3) - if [ "${refName}" != "ERCC" ] - then - query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='\${GRCv}'.'\${GRCp}'/Annotation_Version=GENCODE%20'\${GENCODE}) - else - query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version=${refName}${refERCCVersion}/Annotation_Version=${refName}${refERCCVersion}') - fi - curl --request GET \${query} > refQuery.json - refURL=\$(python ${script_refDataInfer} --returnParam URL) - loc=\$(dirname \${refURL}) - fName=\$(python ${script_refDataInfer} --returnParam fName) - fName=\${fName%.*} - if [ "\${loc}" = "/hatrac/*" ]; then echo "LOG: Reference not present in hatrac"; exit 1; fi - filename=\$(echo \$(basename \${refURL}) | grep -oP '.*(?=:)') - deriva-hatrac-cli --host ${referenceBase} get \${refURL} - unzip \$(basename \${refURL}) - mv \${fName}/data/* . - fi - mv ./annotation/genome.gtf . - mv ./sequence/genome.fna . 
- mkdir ${refName} - if [ "${refName}" != "ERCC" ] - then - mv ./annotation/genome.bed ./${refName} - fi - echo -e "LOG: fetched" >> ${repRID}.${refName}.getRefInfer.log - """ + fastqsTrim_alignData } /* @@ -1027,20 +1050,19 @@ process downsampleData { input: path fastq from fastqsTrim_downsampleData val ends from endsManual_downsampleData - val fastqCountError_downsampleData - val fastqReadError_downsampleData - val fastqFileError_downsampleData - val seqtypeInferError_downsampleData + val fastqCountError from fastqCountError_downsampleData + val fastqReadError from fastqReadError_downsampleData + val fastqFileError from fastqFileError_downsampleData + val seqtypeError from seqtypeError_downsampleData output: - path ("sampled.1.fq") into fastqs1Sample - path ("sampled.2.fq") into fastqs2Sample + path ("sampled.{1,2}.fq") into fastqsSample when: - fastqCountError_downsampleData == "false" - fastqReadError_downsampleData == "false" - fastqFileError_downsampleData == "false" - seqtypeInferError_downsampleData == "false" + fastqCountError == "false" + fastqReadError == "false" + fastqFileError == "false" + seqtypeError == "false" script: """ @@ -1063,169 +1085,313 @@ """ } -// Replicate the dowsampled fastq's and attatched to the references -inferInput = endsManual_alignSampleData.combine(refInfer.combine(fastqs1Sample.collect().combine(fastqs2Sample.collect().combine(fastqCountError_alignSampleData.combine(fastqReadError_alignSampleData.combine(fastqFileError_alignSampleData.combine(seqtypeInferError_alignSampleData))))))) +// Replicate sampled fastq's for multiple process inputs +fastqsSample.into { + fastqsSample_alignSampleDataERCC + fastqsSample_alignSampleData +} /* - * alignSampleData: aligns the downsampled reads to a reference database -*/ -process alignSampleData { - tag "${ref}" + * alignSampleDataERCC: aligns the downsampled reads to the ERCC reference and infers spike-in + */ +process alignSampleDataERCC { + tag "${repRID}" input: - tuple val (ends), val (ref), path (hisat2), path (fna), path (gtf), path (fastq1), path (fastq2), val (fastqCountError), val (fastqReadError), val (fastqFileError), val (seqtypeInferError) from inferInput + val ends from endsManual_alignSampleDataERCC + tuple path (hisat2), path (fna), path (gtf) from refERCC + path fastq from fastqsSample_alignSampleDataERCC + val spikeForce + val fastqCountError from fastqCountError_alignSampleDataERCC + val fastqReadError from fastqReadError_alignSampleDataERCC + val fastqFileError from fastqFileError_alignSampleDataERCC + val seqtypeError from seqtypeError_alignSampleDataERCC output: - path ("${ref}.sampled.sorted.bam") into sampleBam - path ("${ref}.sampled.sorted.bam.bai") into sampleBai - path ("${ref}.alignSampleSummary.txt") into alignSampleQC + path "inferSpike.csv" into inferSpike_fl + path ("ERCC.alignSampleSummary.txt") into alignSampleQC_ERCC when: fastqCountError == "false" fastqReadError == "false" fastqFileError == "false" - seqtypeInferError == "false" + seqtypeError == "false" script: """ - hostname > ${repRID}.${ref}.alignSampleData.log - ulimit -a >> ${repRID}.${ref}.alignSampleData.log + hostname > ${repRID}.alignSampleDataERCC.log + ulimit -a >> ${repRID}.alignSampleDataERCC.log # align the reads with Hisat2 - echo -e "LOG: aligning ${ends}" >> ${repRID}.${ref}.alignSampleData.log + echo -e "LOG: aligning ${ends}" >> ${repRID}.alignSampleDataERCC.log if [ "${ends}" == "se" ] then - hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome -U ${fastq1} 
--summary-file ${ref}.alignSampleSummary.txt --new-summary + hisat2 -p `nproc` --add-chrname -S ERCC.sampled.sam -x hisat2/genome -U ${fastq[0]} --summary-file ERCC.alignSampleSummary.txt --new-summary elif [ "${ends}" == "pe" ] then - hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome --no-mixed --no-discordant -1 ${fastq1} -2 ${fastq2} --summary-file ${ref}.alignSampleSummary.txt --new-summary + hisat2 -p `nproc` --add-chrname -S ERCC.sampled.sam -x hisat2/genome --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} --summary-file ERCC.alignSampleSummary.txt --new-summary fi - echo -e "LOG: aliged" >> ${repRID}.${ref}.alignSampleData.log + echo -e "LOG: aligned" >> ${repRID}.alignSampleDataERCC.log # convert the output sam file to a sorted bam file using Samtools - echo -e "LOG: converting from sam to bam" >> ${repRID}.${ref}.alignSampleData.log - samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${ref}.sampled.bam ${ref}.sampled.sam + echo -e "LOG: converting from sam to bam" >> ${repRID}.alignSampleDataERCC.log + samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ERCC.sampled.bam ERCC.sampled.sam # sort the bam file using Samtools - echo -e "LOG: sorting the bam file" >> ${repRID}.${ref}.alignSampleData.log + echo -e "LOG: sorting the bam file" >> ${repRID}.alignSampleDataERCC.log proc=\$(expr `nproc` - 1) mem=\$(vmstat -s -S K | grep 'total memory' | grep -o '[0-9]*') mem=\$(expr \${mem} / \${proc} \\* 85 / 100) - samtools sort -@ \${proc} -m \${mem}K -O BAM -o ${ref}.sampled.sorted.bam ${ref}.sampled.bam + samtools sort -@ \${proc} -m \${mem}K -O BAM -o ERCC.sampled.sorted.bam ERCC.sampled.bam # index the sorted bam using Samtools - echo -e "LOG: indexing sorted bam file" >> ${repRID}.${ref}.alignSampleData.log - samtools index -@ `nproc` -b ${ref}.sampled.sorted.bam ${ref}.sampled.sorted.bam.bai + echo -e "LOG: indexing sorted bam file" >> ${repRID}.alignSampleDataERCC.log + samtools index -@ `nproc` -b ERCC.sampled.sorted.bam ERCC.sampled.sorted.bam.bai + + # collect alignment rates (round down to integers) + align=\$(echo \$(grep "Overall alignment rate" ERCC.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) + align=\$(echo \${align%.*}) + echo -e "LOG: alignment rate to ERCC: \${align}" >> ${repRID}.alignSampleDataERCC.log + + # determine spike-in + if [ 1 -eq \$(echo \$(expr \${align} ">=" 10)) ] + then + spike="true" + else + spike="false" + fi + echo -e "LOG: inference of spike-in results is: \${spike}" >> ${repRID}.alignSampleDataERCC.log + if [ "${spikeForce}" != "" ] + then + spike=${spikeForce} + echo -e "LOG: spike-in metadata forced: \${spike}" >> ${repRID}.alignSampleDataERCC.log + fi + + # write inferred spike metadata to file + echo "\${spike},\${align}" > inferSpike.csv """ } -alignSampleQC.into { - alignSampleQC_inferMetadata - alignSampleQC_aggrQC +// Extract spike-in metadata and % aligned to ERCC into channel and replicate them for multiple process inputs +spikeInfer = Channel.create() +alignInferERCC = Channel.create() +inferSpike_fl.splitCsv(sep: ",", header: false).separate( + spikeInfer, + alignInferERCC ) +spikeInfer.into { + spikeInfer_getRef + spikeInfer_checkMetadata + spikeInfer_aggrQC + spikeInfer_uploadExecutionRun + spikeInfer_failExecutionRun } -process inferMetadata { - tag "${repRID}" +/* + * getRef: downloads appropriate reference + */ +process getRef { + tag "${species}" input: - path script_inferMeta - path beds from bedInfer.collect() - path bam from sampleBam.collect() - path bai from 
sampleBai.collect() - path alignSummary from alignSampleQC_inferMetadata.collect() - val strandedForce - val spikeForce - val fastqCountError_inferMetadata - val fastqReadError_inferMetadata - val fastqFileError_inferMetadata - val seqtypeInferError_inferMetadata + path script_refData + path credential, stageAs: "credential.json" from deriva_getRef + val spike from spikeInfer_getRef + val species from speciesInfer_getRef + val fastqCountError from fastqCountError_getRef + val fastqReadError from fastqReadError_getRef + val fastqFileError from fastqFileError_getRef + val seqtypeError from seqtypeError_getRef + val speciesError from speciesError_getRef output: - path "infer.csv" into inferMetadata_fl - path "${repRID}.infer_experiment.txt" into inferExperiment - path "speciesError.csv" into speciesError_fl + tuple path ("hisat2", type: 'dir'), path ("*.bed"), path ("*.fna"), path ("*.gtf"), path ("geneID.tsv"), path ("Entrez.tsv") into reference when: - fastqCountError_inferMetadata == "false" - fastqReadError_inferMetadata == "false" - fastqFileError_inferMetadata == "false" - seqtypeInferError_inferMetadata == "false" + fastqCountError == "false" + fastqReadError == "false" + fastqFileError == "false" + seqtypeError == "false" + speciesError == "false" script: """ - hostname > ${repRID}.inferMetadata.log - ulimit -a >> ${repRID}.inferMetadata.log + hostname > ${repRID}.getRef.log + ulimit -a >> ${repRID}.getRef.log - # collect alignment rates (round down to integers) - align_ercc=\$(echo \$(grep "Overall alignment rate" ERCC.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) - align_ercc=\$(echo \${align_ercc%.*}) - echo -e "LOG: alignment rate to ERCC: \${align_ercc}" >> ${repRID}.inferMetadata.log - align_hu=\$(echo \$(grep "Overall alignment rate" GRCh.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) - align_hu=\$(echo \${align_hu%.*}) - echo -e "LOG: alignment rate to GRCh: \${align_hu}" >> ${repRID}.inferMetadata.log - align_mo=\$(echo \$(grep "Overall alignment rate" GRCm.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) - align_mo=\$(echo \${align_mo%.*}) - echo -e "LOG: alignment rate to GRCm: \${align_mo}" >> ${repRID}.inferMetadata.log + # link credential file for authentication + echo -e "LOG: linking deriva credentials" >> ${repRID}.getRef.log + mkdir -p ~/.deriva + ln -sf `readlink -e credential.json` ~/.deriva/credential.json + echo -e "LOG: linked" >> ${repRID}.getRef.log - # determine spike-in - if [ 1 -eq \$(echo \$(expr \${align_ercc} ">=" 10)) ] + # set the reference name + if [ "${species}" == "Mus musculus" ] then - spike="true" + reference=\$(echo ${referenceBase}/GRCm${refMoVersion}) + refName=GRCm + elif [ '${species}' == "Homo sapiens" ] + then + reference=\$(echo ${referenceBase}/GRCh${refHuVersion}) + refName=GRCh else - spike="false" + echo -e "LOG: ERROR - References could not be set!\nSpecies reference found: ${species}" >> ${repRID}.getRef.log + exit 1 fi - echo -e "LOG: inference of strandedness results is: \${spike}" >> ${repRID}.inferMetadata.log - if [ "${spikeForce}" != "" ] + if [ "${spike}" == "true" ] then - spike=${spikeForce} - echo -e "LOG: spike-in metadata forced: \${spike}" >> ${repRID}.parseMetadata.log - fi + reference=\$(echo \${reference}-S) + elif [ "${spike}" == "false" ] + then + reference=\$(echo \${reference}) + fi + echo -e "LOG: species set to \${reference}" >> ${repRID}.getRef.log - speciesError=false - speciesError_details="" - # determine species - if [ 1 -eq \$(echo 
\$(expr \${align_hu} ">=" 40)) ] && [ 1 -eq \$(echo \$(expr \${align_mo} "<" 40)) ] + # retrieve reference from appropriate location + echo -e "LOG: fetching ${species} reference files from ${referenceBase}" >> ${repRID}.getRef.log + if [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references/new" ] then - species="Homo sapiens" - bam="GRCh.sampled.sorted.bam" - bed="./GRCh/genome.bed" - echo -e "LOG: inference of species results in: \${species}" >> ${repRID}.inferMetadata.log - elif [ 1 -eq \$(echo \$(expr \${align_mo} ">=" 40)) ] && [ 1 -eq \$(echo \$(expr \${align_hu} "<" 40)) ] + echo -e "LOG: grabbing reference files from local (BioHPC)" >> ${repRID}.getRef.log + unzip \${reference}.zip + mv \$(basename \${reference})/data/* . + elif [ "${params.refSource}" == "datahub" ] then - species="Mus musculus" - bam="GRCm.sampled.sorted.bam" - bed="./GRCm/genome.bed" - echo -e "LOG: inference of species results in: \${species}" >> ${repRID}.inferMetadata.log - else - echo -e "LOG: ERROR - inference of species returns an ambiguous result: hu=\${align_hu} mo=\${align_mo}" >> ${repRID}.inferMetadata.log - if [ "${speciesForce}" == "" ] - then - speciesError=true - speciesError_details="**Inference of species returns an ambiguous result:** Percent aligned to human = \${align_hu} and percent aligned to mouse = \${align_mo}" - fi + echo -e "LOG: grabbing reference files from datahub" >> ${repRID}.getRef.log + GRCv=\$(echo \${reference} | grep -o \${refName}.* | cut -d '.' -f1) + GRCp=\$(echo \${reference} | grep -o \${refName}.* | cut -d '.' -f2) + GENCODE=\$(echo \${reference} | grep -o \${refName}.* | cut -d '.' -f3) + query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='\${GRCv}'.'\${GRCp}'/Annotation_Version=GENCODE%20'\${GENCODE}) + curl --request GET \${query} > refQuery.json + refURL=\$(python ${script_refData} --returnParam URL) + loc=\$(dirname \${refURL}) + fName=\$(python ${script_refData} --returnParam fName) + fName=\${fName%.*} + if [ "\${loc}" = "/hatrac/*" ]; then echo "LOG: Reference not present in hatrac"; exit 1; fi + filename=\$(echo \$(basename \${refURL}) | grep -oP '.*(?=:)') + deriva-hatrac-cli --host ${referenceBase} get \${refURL} + unzip \$(basename \${refURL}) + mv \${fName}/data/* . fi - if [ "${speciesForce}" != "" ] + echo -e "LOG: fetched" >> ${repRID}.getRef.log + + mv ./annotation/genome.gtf . + mv ./sequence/genome.fna . + mv ./annotation/genome.bed . + mv ./metadata/Entrez.tsv . + mv ./metadata/geneID.tsv . 
+ """ +} + +// Replicate reference for multiple process inputs +reference.into { + reference_alignSampleData + reference_inferMetadata + reference_alignData + reference_countData + reference_dataQC +} +/* + * alignSampleData: aligns the downsampled reads to the appripriate species reference + */ +process alignSampleData { + tag "${repRID}" + + input: + path fastqSample from fastqsSample_alignSampleData + path reference_alignSampleData + val endsManual from endsManual_alignSampleData + val speciesInfer from speciesInfer_alignSampleData + val fastqCountError from fastqCountError_alignSampleData + val fastqReadError from fastqReadError_alignSampleData + val fastqFileError from fastqFileError_alignSampleData + val seqtypeError from seqtypeError_alignSampleData + val speciesError from speciesError_alignSampleData + + output: + path ("sampled.bam") into sampledBam + path "align.csv" into align_fl + path ("*.alignSampleSummary.txt") into alignSampleQC + + when: + fastqCountError == "false" + fastqReadError == "false" + fastqFileError == "false" + seqtypeError == "false" + speciesError == "false" + + script: + """ + hostname > ${repRID}.alignSampleData.log + ulimit -a >> ${repRID}.alignSampleData.log + + # align the sampled reads with Hisat2 + species="${speciesInfer}" + species=\${species// /_} + echo -e "LOG: aligning ${endsManual}" >> ${repRID}.alignSampleData.log + if [ "${endsManual}" == "se" ] then - speciesError=false - echo -e "LOG: species overridden to: ${speciesForce}" - species="${speciesForce}" - if [ "${speciesForce}" == "Homo sapiens" ] - then - bam="GRCh.sampled.sorted.bam" - bed="./GRCh/genome.bed" - elif [ "${speciesForce}" == "Mus musculus" ] - then - bam="GRCm.sampled.sorted.bam" - bed="./GRCm/genome.bed" - fi + hisat2 -p `nproc` --add-chrname -S sampled.sam -x hisat2/genome -U ${fastqSample[0]} --summary-file \${species}.alignSampleSummary.txt --new-summary + elif [ "${endsManual}" == "pe" ] + then + hisat2 -p `nproc` --add-chrname -S sampled.sam -x hisat2/genome --no-mixed --no-discordant -1 ${fastqSample[0]} -2 ${fastqSample[1]} --summary-file \${species}.alignSampleSummary.txt --new-summary fi + echo -e "LOG: aligned sampled reads" >> ${repRID}.alignSampleData.log + + # collect alignment rates (round down to integers) + align=\$(echo \$(grep "Overall alignment rate" \${species}.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) + align=\$(echo \${align%.*}) + + # convert the sampled read output sam file to a sorted bam file using Samtools + echo -e "LOG: converting sampled reads from sam to bam" >> ${repRID}.alignSampleData.log + samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o sampled.bam sampled.sam + + echo "\${align}" > align.csv + """ +} + +// Extract % aligned to appropriate reference into channel +alignInfer = Channel.create() +align_fl.splitCsv(sep: ",", header: false).separate( + alignInfer +) + +/* + * inferMetadata: infers strandedness and endness from the aligned downsampled reads + */ +process inferMetadata { + tag "${repRID}" + + input: + path sampledBam + path reference_inferMetadata + path script_inferMeta + val strandedForce + val fastqCountError from fastqCountError_inferMetadata + val fastqReadError from fastqReadError_inferMetadata + val fastqFileError from fastqFileError_inferMetadata + val seqtypeError from seqtypeError_inferMetadata + val speciesError from speciesError_inferMetadata + + output: + path "infer.csv" into inferMetadata_fl + path "${repRID}.infer_experiment.txt" into inferExperiment + + when: + fastqCountError == "false" 
+ fastqReadError == "false" + fastqFileError == "false" + seqtypeError == "false" + speciesError == "false" + + script: + """ + hostname > ${repRID}.inferMetadata.log + ulimit -a >> ${repRID}.inferMetadata.log - if [ "\${speciesError}" == false ] - then # infer experimental setting from dedup bam - echo -e "LOG: infer experimental setting from dedup bam" >> ${repRID}.inferMetadata.log - infer_experiment.py -r "\${bed}" -i "\${bam}" 1>> ${repRID}.infer_experiment.txt + echo -e "LOG: infer experimental setting from bam" >> ${repRID}.inferMetadata.log + infer_experiment.py -r ./genome.bed -i ${sampledBam} 1>> ${repRID}.infer_experiment.txt echo -e "LOG: inferred" >> ${repRID}.inferMetadata.log ended=`bash ${script_inferMeta} endness ${repRID}.infer_experiment.txt` @@ -1258,50 +1424,25 @@ process inferMetadata { stranded=${strandedForce} echo -e "LOG: spike-in metadata forced: \${stranded}" >> ${repRID}.inferMetadata.log fi - else - ends="" - stranded="" - spike="" - species="" - percentF="" - percentR="" - fail="" - touch ${repRID}.infer_experiment.txt - fi # write inferred metadata to file - echo "\${ends},\${stranded},\${spike},\${species},\${align_ercc},\${align_hu},\${align_mo},\${percentF},\${percentR},\${fail}" > infer.csv - - # save species error file - echo "\${speciesError},\${speciesError_details}" > speciesError.csv + echo "\${ends},\${stranded},\${percentF},\${percentR},\${fail}" > infer.csv """ } -// Split metadata into separate channels +// Extract metadata and replicate them for multiple process inputs endsInfer = Channel.create() strandedInfer = Channel.create() -spikeInfer = Channel.create() -speciesInfer = Channel.create() -align_erccInfer = Channel.create() -align_huInfer = Channel.create() -align_moInfer = Channel.create() percentFInfer = Channel.create() percentRInfer = Channel.create() failInfer = Channel.create() inferMetadata_fl.splitCsv(sep: ",", header: false).separate( endsInfer, strandedInfer, - spikeInfer, - speciesInfer, - align_erccInfer, - align_huInfer, - align_moInfer, percentFInfer, percentRInfer, failInfer ) - -// Replicate metadata for multiple process inputs endsInfer.into { endsInfer_checkMetadata endsInfer_alignData @@ -1319,52 +1460,10 @@ strandedInfer.into { strandedInfer_uploadQC strandedInfer_failExecutionRun } -spikeInfer.into{ - spikeInfer_checkMetadata - spikeInfer_getRef - spikeInfer_aggrQC - spikeInfer_uploadExecutionRun - spikeInfer_failExecutionRun -} -speciesInfer.into { - speciesInfer_checkMetadata - speciesInfer_getRef - speciesInfer_aggrQC - speciesInfer_uploadExecutionRun - speciesInfer_uploadProcessedFile - speciesInfer_failExecutionRun -} - -// Split species count error into separate channel -speciesError = Channel.create() -speciesError_details = Channel.create() -speciesError_fl.splitCsv(sep: ",", header: false).separate( - speciesError, - speciesError_details -) - -// Replicate errors for multiple process inputs -speciesError.into { - speciesError_checkMetadata - speciesError_uploadExecutionRun - speciesError_getRef - speciesError_alignData - speciesError_dedupData - speciesError_makeBigWig - speciesError_countData - speciesError_fastqc - speciesError_dataQC - speciesError_aggrQC - speciesError_uploadQC - speciesError_uploadQC_fail - speciesError_uploadProcessedFile - speciesError_uploadOutputBag - speciesError_failPreExecutionRun_species -} /* - * checkMetadata: checks the submitted metada against inferred -*/ + * checkMetadata: checks the submitted metadata against inferred + */ process checkMetadata { tag "${repRID}" @@ -1377,23 
+1476,22 @@ process checkMetadata { val strandedInfer from strandedInfer_checkMetadata val spikeInfer from spikeInfer_checkMetadata val speciesInfer from speciesInfer_checkMetadata - val fastqCountError_checkMetadata - val fastqReadError_checkMetadata - val seqtypeInferError_checkMetadata - val fastqFileError_checkMetadata - - val speciesError_checkMetadata + val fastqCountError from fastqCountError_checkMetadata + val fastqReadError from fastqReadError_checkMetadata + val fastqFileError from fastqFileError_checkMetadata + val seqtypeError from seqtypeError_checkMetadata + val speciesError from speciesError_checkMetadata output: path ("check.csv") into checkMetadata_fl path ("outputBagRID.csv") optional true into outputBagRID_fl_dummy when: - fastqCountError_checkMetadata == "false" - fastqReadError_checkMetadata == "false" - fastqFileError_checkMetadata == "false" - seqtypeInferError_checkMetadata == "false" - speciesError_checkMetadata == "false" + fastqCountError == "false" + fastqReadError == "false" + fastqFileError == "false" + seqtypeError == "false" + speciesError == "false" script: """ @@ -1454,348 +1552,57 @@ process checkMetadata { pipelineError=true pipelineError_spike=true echo -e "LOG: spike does not match: Submitted=${spikeMeta}; Inferred=${spikeInfer}" >> ${repRID}.checkMetadata.log - fi - else - pipelineError_spike=false - echo -e "LOG: spike matches: Submitted=${spikeMeta}; Inferred=${spikeInfer}" >> ${repRID}.checkMetadata.log - fi - if [ "${speciesMeta}" != "${speciesInfer}" ] - then - if [[ "${params.speciesForce}" != "" ]] - then - pipelineError_species=false - echo -e "LOG: species forced: Submitted=${speciesMeta}; Inferred=${speciesInfer}" >> ${repRID}.checkMetadata.log - else - pipelineError=true - pipelineError_species=true - echo -e "LOG: species does not match: Submitted=${speciesMeta}; Inferred=${speciesInfer}" >> ${repRID}.checkMetadata.log - fi - else - pipelineError_species=false - echo -e "LOG: species matches: Submitted=${speciesMeta}; Inferred=${speciesInfer}" >> ${repRID}.checkMetadata.log - fi - - # create dummy output bag rid if failure - if [ \${pipelineError} == true ] - then - echo "fail" > outputBagRID.csv - fi - - # write checks to file - echo "\${pipelineError},\${pipelineError_ends},\${pipelineError_stranded},\${pipelineError_spike},\${pipelineError_species}" > check.csv - """ -} - -// Split errors into separate channels -pipelineError = Channel.create() -pipelineError_ends = Channel.create() -pipelineError_stranded = Channel.create() -pipelineError_spike = Channel.create() -pipelineError_species = Channel.create() -checkMetadata_fl.splitCsv(sep: ",", header: false).separate( - pipelineError, - pipelineError_ends, - pipelineError_stranded, - pipelineError_spike, - pipelineError_species -) - -// Replicate errors for multiple process inputs -pipelineError.into { - pipelineError_getRef - pipelineError_alignData - pipelineError_dedupData - pipelineError_makeBigWig - pipelineError_countData - pipelineError_fastqc - pipelineError_dataQC - pipelineError_aggrQC - pipelineError_uploadQC - pipelineError_uploadQC_fail - pipelineError_uploadProcessedFile - pipelineError_uploadOutputBag - pipelineError_failExecutionRun -} - -/* - * uploadInputBag: uploads the input bag -*/ -process uploadInputBag { - tag "${repRID}" - - input: - path script_uploadInputBag - path credential, stageAs: "credential.json" from deriva_uploadInputBag - path inputBag from inputBag_uploadInputBag - val studyRID from studyRID_uploadInputBag - - output: - path ("inputBagRID.csv") into 
inputBagRID_fl - - when: - upload - - script: - """ - hostname > ${repRID}.uploadInputBag.log - ulimit -a >> ${repRID}.uploadInputBag.log - - yr=\$(date +'%Y') - mn=\$(date +'%m') - dy=\$(date +'%d') - - file=\$(basename -a ${inputBag}) - md5=\$(md5sum ./\${file} | awk '{ print \$1 }') - echo LOG: ${repRID} input bag md5 sum - \${md5} >> ${repRID}.uploadInputBag.log - size=\$(wc -c < ./\${file}) - echo LOG: ${repRID} input bag size - \${size} bytes >> ${repRID}.uploadInputBag.log - - exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Input_Bag/File_MD5=\${md5}) - if [ "\${exist}" == "[]" ] - then - cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') - cookie=\${cookie:11:-1} - - loc=\$(deriva-hatrac-cli --host ${source} put ./\${file} /hatrac/resources/rnaseq/pipeline/input_bag/study/${studyRID}/replicate/${repRID}/\${file} --parents) - inputBag_rid=\$(python3 ${script_uploadInputBag} -f \${file} -l \${loc} -s \${md5} -b \${size} -o ${source} -c \${cookie}) - echo LOG: input bag RID uploaded - \${inputBag_rid} >> ${repRID}.uploadInputBag.log - rid=\${inputBag_rid} - else - exist=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') - exist=\${exist:7:-6} - echo LOG: input bag RID already exists - \${exist} >> ${repRID}.uploadInputBag.log - rid=\${exist} - fi - - echo "\${rid}" > inputBagRID.csv - """ -} - -// Extract input bag RID into channel -inputBagRID = Channel.create() -inputBagRID_fl.splitCsv(sep: ",", header: false).separate( - inputBagRID -) - -// Replicate input bag RID for multiple process inputs -inputBagRID.into { - inputBagRID_uploadExecutionRun - inputBagRID_finalizeExecutionRun - inputBagRID_failPreExecutionRun - inputBagRID_failExecutionRun -} - -/* - * uploadExecutionRun: uploads the execution run -*/ -process uploadExecutionRun { - tag "${repRID}" - - input: - path script_uploadExecutionRun_uploadExecutionRun - path credential, stageAs: "credential.json" from deriva_uploadExecutionRun - val spike from spikeInfer_uploadExecutionRun - val species from speciesInfer_uploadExecutionRun - val inputBagRID from inputBagRID_uploadExecutionRun - val fastqCountError_uploadExecutionRun - val fastqReadError_uploadExecutionRun - val fastqFileError_uploadExecutionRun - val seqtypeInferError_uploadExecutionRun - val speciesError_uploadExecutionRun - - output: - path ("executionRunRID.csv") into executionRunRID_fl - - when: - upload - fastqCountError_uploadExecutionRun == "false" - fastqReadError_uploadExecutionRun == "false" - fastqFileError_uploadExecutionRun == "false" - seqtypeInferError_uploadExecutionRun == "false" - speciesError_uploadExecutionRun == "false" - - script: - """ - hostname > ${repRID}.uploadExecutionRun.log - ulimit -a >> ${repRID}.uploadExecutionRun.log - - echo LOG: searching for workflow RID - BICF mRNA ${workflow.manifest.version} >> ${repRID}.uploadExecutionRun.log - workflow=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Workflow/Name=BICF%20mRNA%20Replicate/Version=${workflow.manifest.version}) - workflow=\$(echo \${workflow} | grep -o '\\"RID\\":\\".*\\",\\"RCT') - workflow=\${workflow:7:-6} - echo LOG: workflow RID extracted - \${workflow} >> ${repRID}.uploadExecutionRun.log - - if [ "${species}" == "Homo sapiens" ] - then - genomeName=\$(echo GRCh${refHuVersion}) - elif [ "${species}" == "Mus musculus" ] - then - genomeName=\$(echo GRCm${refMoVersion}) - fi - if [ "${spike}" == "true" ] - then - genomeName=\$(echo \${genomeName}-S) - fi - echo LOG: searching for genome name - 
\${genomeName} >> ${repRID}.uploadExecutionRun.log - genome=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Name=\${genomeName}) - genome=\$(echo \${genome} | grep -o '\\"RID\\":\\".*\\",\\"RCT') - genome=\${genome:7:-6} - echo LOG: genome RID extracted - \${genome} >> ${repRID}.uploadExecutionRun.log - - cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') - cookie=\${cookie:11:-1} - - exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=\${workflow}/Replicate=${repRID}/Input_Bag=${inputBagRID}) - echo \${exist} >> ${repRID}.uploadExecutionRun.log - if [ "\${exist}" == "[]" ] - then - executionRun_rid=\$(python3 ${script_uploadExecutionRun_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s In-progress -d 'Run in process' -o ${source} -c \${cookie} -u F) - echo LOG: execution run RID uploaded - \${executionRun_rid} >> ${repRID}.uploadExecutionRun.log - else - rid=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') - rid=\${rid:7:-6} - echo \${rid} >> ${repRID}.uploadExecutionRun.log - executionRun_rid=\$(python3 ${script_uploadExecutionRun_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s In-progress -d 'Run in process' -o ${source} -c \${cookie} -u \${rid}) - echo LOG: execution run RID updated - \${executionRun_rid} >> ${repRID}.uploadExecutionRun.log - fi - - echo "\${executionRun_rid}" > executionRunRID.csv - - if [ ${params.track} == true ] - then - curl -H 'Content-Type: application/json' -X PUT -d \ - '{ \ - "ID": "${workflow.sessionId}", \ - "ExecutionRunRID": "'\${executionRun_rid}'" \ - }' \ - "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track" - fi - """ -} - -// Extract execution run RID into channel -executionRunRID = Channel.create() -executionRunRID_fl.splitCsv(sep: ",", header: false).separate( - executionRunRID -) - -// Replicate execution run RID for multiple process inputs -executionRunRID.into { - executionRunRID_uploadQC - executionRunRID_uploadProcessedFile - executionRunRID_uploadOutputBag - executionRunRID_finalizeExecutionRun - executionRunRID_failExecutionRun - executionRunRID_fail -} - -/* - * getRef: downloads appropriate reference -*/ -process getRef { - tag "${species}" - - input: - path script_refData - path credential, stageAs: "credential.json" from deriva_getRef - val spike from spikeInfer_getRef - val species from speciesInfer_getRef - val fastqCountError_getRef - val fastqReadError_getRef - val fastqFileError_getRef - val seqtypeInferError_getRef - val speciesError_getRef - val pipelineError_getRef - - output: - tuple path ("hisat2", type: 'dir'), path ("*.bed"), path ("*.fna"), path ("*.gtf"), path ("geneID.tsv"), path ("Entrez.tsv") into reference - - when: - fastqCountError_getRef == "false" - fastqReadError_getRef == "false" - fastqFileError_getRef == "false" - seqtypeInferError_getRef == "false" - speciesError_getRef == "false" - pipelineError_getRef == "false" - - script: - """ - hostname > ${repRID}.getRef.log - ulimit -a >> ${repRID}.getRef.log - - # link credential file for authentication - echo -e "LOG: linking deriva credentials" >> ${repRID}.getRef.log - mkdir -p ~/.deriva - ln -sf `readlink -e credential.json` ~/.deriva/credential.json - echo -e "LOG: linked" >> ${repRID}.getRef.log - - # set the reference name - if [ "${species}" == "Mus musculus" ] - then - reference=\$(echo ${referenceBase}/GRCm${refMoVersion}) - refName=GRCm - elif [ 
"${species}" == "Homo sapiens" ]
-    then
-      reference=\$(echo ${referenceBase}/GRCh${refHuVersion})
-      refName=GRCh
-    else
-      echo -e "LOG: ERROR - References could not be set!\nSpecies reference found: ${species}" >> ${repRID}.getRef.log
-      exit 1
-    fi
-    if [ "${spike}" == "true" ]
-    then
-      reference=\$(echo \${reference}-S)
-    elif [ "${spike}" == "false" ]
-    then
-      reference=\$(echo \${reference})
-    fi
-    echo -e "LOG: species set to \${reference}" >> ${repRID}.getRef.log
-
-    # retreive appropriate reference appropriate location
-    echo -e "LOG: fetching ${species} reference files from ${referenceBase}" >> ${repRID}.getRef.log
-    if [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references/new" ]
-    then
-      echo -e "LOG: grabbing reference files from local (BioHPC)" >> ${repRID}.getRef.log
-      unzip \${reference}.zip
-      mv \$(basename \${reference})/data/* .
-    elif [ ${params.refSource} == "datahub" ]
-    then
-      echo -e "LOG: grabbing reference files from datahub" >> ${repRID}.getRef.log
-      GRCv=\$(echo \${reference} | grep -o \${refName}.* | cut -d '.' -f1)
-      GRCp=\$(echo \${reference} | grep -o \${refName}.* | cut -d '.' -f2)
-      GENCODE=\$(echo \${reference} | grep -o \${refName}.* | cut -d '.' -f3)
-      query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='\${GRCv}'.'\${GRCp}'/Annotation_Version=GENCODE%20'\${GENCODE})
-      curl --request GET \${query} > refQuery.json
-      refURL=\$(python ${script_refData} --returnParam URL)
-      loc=\$(dirname \${refURL})
-      fName=\$(python ${script_refData} --returnParam fName)
-      fName=\${fName%.*}
-      if [ "\${loc}" = "/hatrac/*" ]; then echo "LOG: Reference not present in hatrac"; exit 1; fi
-      filename=\$(echo \$(basename \${refURL}) | grep -oP '.*(?=:)')
-      deriva-hatrac-cli --host ${referenceBase} get \${refURL}
-      unzip \$(basename \${refURL})
-      mv \${fName}/data/* .
+      fi
+    else
+      pipelineError_spike=false
+      echo -e "LOG: spike matches: Submitted=${spikeMeta}; Inferred=${spikeInfer}" >> ${repRID}.checkMetadata.log
    fi
-    echo -e "LOG: fetched" >> ${repRID}.getRef.log
-    mv ./annotation/genome.gtf .
-    mv ./sequence/genome.fna .
-    mv ./annotation/genome.bed .
-    mv ./metadata/Entrez.tsv .
-    mv ./metadata/geneID.tsv .
+    # create dummy output bag rid if failure
+    if [ \${pipelineError} == true ]
+    then
+      echo "fail" > outputBagRID.csv
+    fi
+
+    # write checks to file
+    echo "\${pipelineError},\${pipelineError_ends},\${pipelineError_stranded},\${pipelineError_spike},\${pipelineError_species}" > check.csv
    """
}
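Editorial note: `checkMetadata` now serializes its five error flags into a single line of `check.csv`, which the channel code just below fans back out with `splitCsv`/`separate` and clones with `into{}`. For readers less familiar with this DSL1 idiom, here is a minimal sketch of the same round trip; the `flags_fl`, `flagA`, and `flagB` names are hypothetical stand-ins, not channels from this pipeline:

```groovy
// Minimal Nextflow DSL1 sketch of the CSV flag round trip used below.
// A process would normally write "true,false" into a file channel;
// a literal string channel stands in for it here.
flags_fl = Channel.from("true,false")

flagA = Channel.create()
flagB = Channel.create()

// splitCsv parses each emission into a list; separate assigns the list
// elements, in order, to the pre-created channels.
flags_fl.splitCsv(sep: ",", header: false).separate(
  flagA,
  flagB
)

// into{} duplicates a channel so several processes can each consume a
// copy; a DSL1 queue channel can otherwise be read only once.
flagA.into {
  flagA_proc1
  flagA_proc2
}
```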
-// Replicate reference for multiple process inputs
-reference.into {
-  reference_alignData
-  reference_countData
-  reference_dataQC
+// Split errors into separate channels and replicate them for multiple process inputs
+pipelineError = Channel.create()
+pipelineError_ends = Channel.create()
+pipelineError_stranded = Channel.create()
+pipelineError_spike = Channel.create()
+pipelineError_species = Channel.create()
+checkMetadata_fl.splitCsv(sep: ",", header: false).separate(
+  pipelineError,
+  pipelineError_ends,
+  pipelineError_stranded,
+  pipelineError_spike,
+  pipelineError_species
+)
+pipelineError.into {
+  pipelineError_getRef
+  pipelineError_alignData
+  pipelineError_dedupData
+  pipelineError_makeBigWig
+  pipelineError_countData
+  pipelineError_fastqc
+  pipelineError_dataQC
+  pipelineError_aggrQC
+  pipelineError_uploadQC
+  pipelineError_uploadProcessedFile
+  pipelineError_uploadOutputBag
+  pipelineError_failExecutionRun
+  pipelineError_uploadExecutionRun
+  pipelineError_finalizeExecutionRun
+  pipelineError_uploadQC_fail
 }
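Reviewer note on the `when:` blocks used throughout this file (including `alignData` below): Nextflow evaluates a `when:` declaration as a single Groovy expression, and a stack of bare comparisons on separate lines returns only the value of the last line, so it is worth double-checking that every flag actually gates the task. Joining the conditions with `&&` is the unambiguous form. A hedged sketch with a hypothetical process and channel names (illustrative only, not part of this diff):

```groovy
// Hypothetical DSL1 process: explicit && chaining makes every error flag
// gate execution; stacked bare comparisons only return the final value.
process gatedExample {
  input:
    val fastqCountError from fastqCountError_example
    val pipelineError from pipelineError_example

  when:
    fastqCountError == "false" && pipelineError == "false"

  script:
    """
    echo "runs only when both error flags are false"
    """
}
```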
echo -e "LOG: alignined" >> ${repRID}.alignData.log # convert the output sam file to a sorted bam file using Samtools - echo -e "LOG: converting from sam to bam" >> ${repRID}.align.log + echo -e "LOG: converting from sam to bam" >> ${repRID}.alignData.log samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam # sort the bam file using Samtools - echo -e "LOG: sorting the bam file" >> ${repRID}.align.log + echo -e "LOG: sorting the bam file" >> ${repRID}.alignData.log proc=\$(expr `nproc` - 1) mem=\$(vmstat -s -S K | grep 'total memory' | grep -o '[0-9]*') mem=\$(expr \${mem} / \${proc} \\* 75 / 100) samtools sort -@ \${proc} -m \${mem}K -O BAM -o ${repRID}.sorted.bam ${repRID}.bam # index the sorted bam using Samtools - echo -e "LOG: indexing sorted bam file" >> ${repRID}.align.log + echo -e "LOG: indexing sorted bam file" >> ${repRID}.alignData.log samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bam.bai """ } -// Replicate rawBam for multiple process inputs -rawBam.set { - rawBam_dedupData -} - /* - *dedupData: mark the duplicate reads, specifically focused on PCR or optical duplicates -*/ + * dedupData: mark the duplicate reads, specifically focused on PCR or optical duplicates + */ process dedupData { tag "${repRID}" - publishDir "${outDir}/bam", mode: 'copy', pattern: "*.deduped.bam" + publishDir "${outDir}/bam", mode: 'copy', pattern: "*.deduped.{bam,bai}" input: - tuple path (bam), path (bai) from rawBam_dedupData - val fastqCountError_dedupData - val fastqReadError_dedupData - val fastqFileError_dedupData - val seqtypeInferError_dedupData - val speciesError_dedupData - val pipelineError_dedupData + tuple path (bam), path (bai) from rawBam + val fastqCountError from fastqCountError_dedupData + val fastqReadError from fastqReadError_dedupData + val fastqFileError from fastqFileError_dedupData + val seqtypeError from seqtypeError_dedupData + val speciesError from speciesError_dedupData + val pipelineError from pipelineError_dedupData output: tuple path ("${repRID}_sorted.deduped.bam"), path ("${repRID}_sorted.deduped.bam.bai") into dedupBam @@ -1901,12 +1701,12 @@ process dedupData { path ("*.deduped.Metrics.txt") into dedupQC when: - fastqCountError_dedupData == "false" - fastqReadError_dedupData == "false" - fastqFileError_dedupData == "false" - seqtypeInferError_dedupData == "false" - speciesError_dedupData == "false" - pipelineError_dedupData == "false" + fastqCountError == "false" + fastqReadError == "false" + fastqFileError == "false" + seqtypeError == "false" + speciesError == "false" + pipelineError == "false" script: """ @@ -1943,31 +1743,31 @@ dedupBam.into { } /* - *makeBigWig: make BigWig files for output -*/ + * makeBigWig: make BigWig files for output + */ process makeBigWig { tag "${repRID}" publishDir "${outDir}/bigwig", mode: 'copy', pattern: "${repRID}.bw" input: tuple path (bam), path (bai) from dedupBam_makeBigWig - val fastqCountError_makeBigWig - val fastqReadError_makeBigWig - val fastqFileError_makeBigWig - val seqtypeInferError_makeBigWig - val speciesError_makeBigWig - val pipelineError_makeBigWig + val fastqCountError from fastqCountError_makeBigWig + val fastqReadError from fastqReadError_makeBigWig + val fastqFileError from fastqFileError_makeBigWig + val seqtypeError from seqtypeError_makeBigWig + val speciesError from speciesError_makeBigWig + val pipelineError from pipelineError_makeBigWig output: path ("${repRID}_sorted.deduped.bw") into bigwig when: - fastqCountError_makeBigWig == "false" - 
fastqReadError_makeBigWig == "false" - fastqFileError_makeBigWig == "false" - seqtypeInferError_makeBigWig == "false" - speciesError_makeBigWig == "false" - pipelineError_makeBigWig == "false" + fastqCountError == "false" + fastqReadError == "false" + fastqFileError == "false" + seqtypeError == "false" + speciesError == "false" + pipelineError == "false" script: """ @@ -1982,8 +1782,8 @@ process makeBigWig { } /* - *countData: count data and calculate tpm -*/ + * countData: count data and calculate tpm + */ process countData { tag "${repRID}" publishDir "${outDir}/count", mode: 'copy', pattern: "${repRID}*_tpmTable.csv" @@ -1995,12 +1795,12 @@ process countData { path ref from reference_countData val ends from endsInfer_countData val stranded from strandedInfer_countData - val fastqCountError_countData - val fastqReadError_countData - val fastqFileError_countData - val seqtypeInferError_countData - val speciesError_countData - val pipelineError_countData + val fastqCountError from fastqCountError_countData + val fastqReadError from fastqReadError_countData + val fastqFileError from fastqFileError_countData + val seqtypeError from seqtypeError_countData + val speciesError from speciesError_countData + val pipelineError from pipelineError_countData output: path ("*_tpmTable.csv") into counts @@ -2008,12 +1808,12 @@ process countData { path ("assignedReads.csv") into assignedReadsInfer_fl when: - fastqCountError_countData == "false" - fastqReadError_countData == "false" - fastqFileError_countData == "false" - seqtypeInferError_countData == "false" - speciesError_countData == "false" - pipelineError_countData == "false" + fastqCountError == "false" + fastqReadError == "false" + fastqFileError == "false" + seqtypeError == "false" + speciesError == "false" + pipelineError == "false" script: """ @@ -2060,21 +1860,19 @@ process countData { """ } -// Extract number of assigned reads metadata into channel +// Extract number of assigned reads metadata into channel and replicate them for multiple process inputs assignedReadsInfer = Channel.create() assignedReadsInfer_fl.splitCsv(sep: ",", header: false).separate( assignedReadsInfer ) - -// Replicate inferred assigned reads for multiple process inputs assignedReadsInfer.into { assignedReadsInfer_aggrQC assignedReadsInfer_uploadQC } /* - *dataQC: calculate transcript integrity numbers (TIN) and bin as well as calculate innerdistance of PE replicates -*/ + * dataQC: calculate transcript integrity numbers (TIN) and bin as well as calculate innerdistance of PE replicates + */ process dataQC { tag "${repRID}" @@ -2084,12 +1882,12 @@ process dataQC { tuple path (bam), path (bai) from dedupBam_dataQC tuple path (chrBam), path (chrBai) from dedupChrBam val ends from endsInfer_dataQC - val fastqCountError_dataQC - val fastqReadError_dataQC - val fastqFileError_dataQC - val seqtypeInferError_dataQC - val speciesError_dataQC - val pipelineError_dataQC + val fastqCountError from fastqCountError_dataQC + val fastqReadError from fastqReadError_dataQC + val fastqFileError from fastqFileError_dataQC + val seqtypeError from seqtypeError_dataQC + val speciesError from speciesError_dataQC + val pipelineError from pipelineError_dataQC output: path "${repRID}_tin.hist.tsv" into tinHist @@ -2097,12 +1895,12 @@ process dataQC { path "${repRID}_insertSize.inner_distance_freq.txt" into innerDistance when: - fastqCountError_dataQC == "false" - fastqReadError_dataQC == "false" - fastqFileError_dataQC == "false" - seqtypeInferError_dataQC == "false" - speciesError_dataQC == 
"false" - pipelineError_dataQC == "false" + fastqCountError == "false" + fastqReadError == "false" + fastqFileError == "false" + seqtypeError == "false" + speciesError == "false" + pipelineError == "false" script: """ @@ -2134,21 +1932,19 @@ process dataQC { """ } -// Extract median TIN metadata into channel +// Extract median TIN metadata into channel and replicate them for multiple process inputs tinMedInfer = Channel.create() tinMedInfer_fl.splitCsv(sep: ",", header: false).separate( tinMedInfer ) - -// Replicate inferred median TIN for multiple process inputs tinMedInfer.into { tinMedInfer_aggrQC tinMedInfer_uploadQC } /* - *aggrQC: aggregate QC from processes as well as metadata and run MultiQC -*/ + * aggrQC: aggregate QC from processes as well as metadata and run MultiQC + */ process aggrQC { tag "${repRID}" publishDir "${outDir}/report", mode: 'copy', pattern: "${repRID}.multiqc.html" @@ -2166,7 +1962,8 @@ process aggrQC { path countsQC path innerDistance path tinHist - path alignSampleQCs from alignSampleQC_aggrQC.collect() + path alignSampleQC_ERCC from alignSampleQC_ERCC + path alignSampleQC from alignSampleQC path inferExperiment val endsManual from endsManual_aggrQC val endsM from endsMeta_aggrQC @@ -2184,24 +1981,24 @@ process aggrQC { val tinMedI from tinMedInfer_aggrQC val studyRID from studyRID_aggrQC val expRID from expRID_aggrQC - val fastqCountError_aggrQC - val fastqReadError_aggrQC - val fastqFileError_aggrQC - val seqtypeInferError_aggrQC - val speciesError_aggrQC - val pipelineError_aggrQC + val fastqCountError from fastqCountError_aggrQC + val fastqReadError from fastqReadError_aggrQC + val fastqFileError from fastqFileError_aggrQC + val seqtypeError from seqtypeError_aggrQC + val speciesError from speciesError_aggrQC + val pipelineError from pipelineError_aggrQC output: path "${repRID}.multiqc.html" into multiqc path "${repRID}.multiqc_data.json" into multiqcJSON when: - fastqCountError_aggrQC == "false" - fastqReadError_aggrQC == "false" - fastqFileError_aggrQC == "false" - seqtypeInferError_aggrQC == "false" - speciesError_aggrQC == "false" - pipelineError_aggrQC == "false" + fastqCountError == "false" + fastqReadError == "false" + fastqFileError == "false" + seqtypeError == "false" + speciesError == "false" + pipelineError == "false" script: """ @@ -2298,9 +2095,179 @@ process aggrQC { """ } +/* + * uploadInputBag: uploads the input bag + */ +process uploadInputBag { + tag "${repRID}" + + input: + path script_uploadInputBag + path credential, stageAs: "credential.json" from deriva_uploadInputBag + path inputBag from inputBag_uploadInputBag + val studyRID from studyRID_uploadInputBag + + output: + path ("inputBagRID.csv") into inputBagRID_fl + + when: + upload + + script: + """ + hostname > ${repRID}.uploadInputBag.log + ulimit -a >> ${repRID}.uploadInputBag.log + + yr=\$(date +'%Y') + mn=\$(date +'%m') + dy=\$(date +'%d') + + file=\$(basename -a ${inputBag}) + md5=\$(md5sum ./\${file} | awk '{ print \$1 }') + echo LOG: ${repRID} input bag md5 sum - \${md5} >> ${repRID}.uploadInputBag.log + size=\$(wc -c < ./\${file}) + echo LOG: ${repRID} input bag size - \${size} bytes >> ${repRID}.uploadInputBag.log + + exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Input_Bag/File_MD5=\${md5}) + if [ "\${exist}" == "[]" ] + then + cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') + cookie=\${cookie:11:-1} + + loc=\$(deriva-hatrac-cli --host ${source} put ./\${file} 
/hatrac/resources/rnaseq/pipeline/input_bag/study/${studyRID}/replicate/${repRID}/\${file} --parents) + inputBag_rid=\$(python3 ${script_uploadInputBag} -f \${file} -l \${loc} -s \${md5} -b \${size} -o ${source} -c \${cookie}) + echo LOG: input bag RID uploaded - \${inputBag_rid} >> ${repRID}.uploadInputBag.log + rid=\${inputBag_rid} + else + exist=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + exist=\${exist:7:-6} + echo LOG: input bag RID already exists - \${exist} >> ${repRID}.uploadInputBag.log + rid=\${exist} + fi + + echo "\${rid}" > inputBagRID.csv + """ +} + +// Extract input bag RID into channel and replicate them for multiple process inputs +inputBagRID = Channel.create() +inputBagRID_fl.splitCsv(sep: ",", header: false).separate( + inputBagRID +) +inputBagRID.into { + inputBagRID_uploadExecutionRun + inputBagRID_finalizeExecutionRun + inputBagRID_failPreExecutionRun + inputBagRID_failExecutionRun +} + +/* + * uploadExecutionRun: uploads the execution run + */ +process uploadExecutionRun { + tag "${repRID}" + + input: + path script_uploadExecutionRun_uploadExecutionRun + path credential, stageAs: "credential.json" from deriva_uploadExecutionRun + val spike from spikeInfer_uploadExecutionRun + val species from speciesInfer_uploadExecutionRun + val inputBagRID from inputBagRID_uploadExecutionRun + val fastqCountError from fastqCountError_uploadExecutionRun + val fastqReadError from fastqReadError_uploadExecutionRun + val fastqFileError from fastqFileError_uploadExecutionRun + val seqtypeError from seqtypeError_uploadExecutionRun + val speciesError from speciesError_uploadExecutionRun + val pipelineError from pipelineError_uploadExecutionRun + + output: + path ("executionRunRID.csv") into executionRunRID_fl + + when: + upload + fastqCountError == "false" + fastqReadError == "false" + fastqFileError == "false" + seqtypeError == "false" + speciesError == "false" + pipelineError == "false" + + script: + """ + hostname > ${repRID}.uploadExecutionRun.log + ulimit -a >> ${repRID}.uploadExecutionRun.log + + echo LOG: searching for workflow RID - BICF mRNA ${workflow.manifest.version} >> ${repRID}.uploadExecutionRun.log + workflow=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Workflow/Name=BICF%20mRNA%20Replicate/Version=${workflow.manifest.version}) + workflow=\$(echo \${workflow} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + workflow=\${workflow:7:-6} + echo LOG: workflow RID extracted - \${workflow} >> ${repRID}.uploadExecutionRun.log + + if [ "${species}" == "Homo sapiens" ] + then + genomeName=\$(echo GRCh${refHuVersion}) + elif [ "${species}" == "Mus musculus" ] + then + genomeName=\$(echo GRCm${refMoVersion}) + fi + if [ "${spike}" == "true" ] + then + genomeName=\$(echo \${genomeName}-S) + fi + echo LOG: searching for genome name - \${genomeName} >> ${repRID}.uploadExecutionRun.log + genome=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Name=\${genomeName}) + genome=\$(echo \${genome} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + genome=\${genome:7:-6} + echo LOG: genome RID extracted - \${genome} >> ${repRID}.uploadExecutionRun.log + + cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') + cookie=\${cookie:11:-1} + + exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=\${workflow}/Replicate=${repRID}/Input_Bag=${inputBagRID}) + echo \${exist} >> ${repRID}.uploadExecutionRun.log + if [ "\${exist}" == "[]" ] + then + executionRun_rid=\$(python3 
${script_uploadExecutionRun_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s In-progress -d 'Run in process' -o ${source} -c \${cookie} -u F) + echo LOG: execution run RID uploaded - \${executionRun_rid} >> ${repRID}.uploadExecutionRun.log + else + rid=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + rid=\${rid:7:-6} + echo \${rid} >> ${repRID}.uploadExecutionRun.log + executionRun_rid=\$(python3 ${script_uploadExecutionRun_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s In-progress -d 'Run in process' -o ${source} -c \${cookie} -u \${rid}) + echo LOG: execution run RID updated - \${executionRun_rid} >> ${repRID}.uploadExecutionRun.log + fi + + echo "\${executionRun_rid}" > executionRunRID.csv + + if [ ${params.track} == true ] + then + curl -H 'Content-Type: application/json' -X PUT -d \ + '{ \ + "ID": "${workflow.sessionId}", \ + "ExecutionRunRID": "'\${executionRun_rid}'" \ + }' \ + "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track" + fi + """ +} + +// Extract execution run RID into channel and replicate them for multiple process inputs +executionRunRID = Channel.create() +executionRunRID_fl.splitCsv(sep: ",", header: false).separate( + executionRunRID +) +executionRunRID.into { + executionRunRID_uploadQC + executionRunRID_uploadProcessedFile + executionRunRID_uploadOutputBag + executionRunRID_finalizeExecutionRun + executionRunRID_failExecutionRun + executionRunRID_fail +} + /* * uploadQC: uploads the mRNA QC -*/ + */ process uploadQC { tag "${repRID}" @@ -2315,24 +2282,24 @@ process uploadQC { val rawCount from rawReadsInfer_uploadQC val finalCount from assignedReadsInfer_uploadQC val tinMed from tinMedInfer_uploadQC - val fastqCountError_uploadQC - val fastqReadError_uploadQC - val fastqFileError_uploadQC - val seqtypeInferError_uploadQC - val speciesError_uploadQC - val pipelineError_uploadQC + val fastqCountError from fastqCountError_uploadQC + val fastqReadError from fastqReadError_uploadQC + val fastqFileError from fastqFileError_uploadQC + val seqtypeError from seqtypeError_uploadQC + val speciesError from speciesError_uploadQC + val pipelineError from pipelineError_uploadQC output: path ("qcRID.csv") into qcRID_fl when: upload - fastqCountError_uploadQC == "false" - fastqReadError_uploadQC == "false" - fastqFileError_uploadQC == "false" - seqtypeInferError_uploadQC == "false" - speciesError_uploadQC == "false" - pipelineError_uploadQC == "false" + fastqCountError == "false" + fastqReadError == "false" + fastqFileError == "false" + seqtypeError == "false" + speciesError == "false" + pipelineError == "false" script: """ @@ -2370,8 +2337,8 @@ process uploadQC { } /* - *uploadProcessedFile: uploads the processed files -*/ + * uploadProcessedFile: uploads the processed files + */ process uploadProcessedFile { tag "${repRID}" publishDir "${outDir}/outputBag", mode: 'copy', pattern: "Replicate_${repRID}.outputBag.zip" @@ -2389,24 +2356,24 @@ process uploadProcessedFile { val studyRID from studyRID_uploadProcessedFile val expRID from expRID_uploadProcessedFile val executionRunRID from executionRunRID_uploadProcessedFile - val fastqCountError_uploadProcessedFile - val fastqReadError_uploadProcessedFile - val fastqFileError_uploadProcessedFile - val seqtypeInferError_uploadProcessedFile - val speciesError_uploadProcessedFile - val pipelineError_uploadProcessedFile + val fastqCountError from fastqCountError_uploadProcessedFile + val fastqReadError from fastqReadError_uploadProcessedFile + 
val fastqFileError from fastqFileError_uploadProcessedFile + val seqtypeError from seqtypeError_uploadProcessedFile + val speciesError from speciesError_uploadProcessedFile + val pipelineError from pipelineError_uploadProcessedFile output: path ("${repRID}_Output_Bag.zip") into outputBag when: upload - fastqCountError_uploadProcessedFile == "false" - fastqReadError_uploadProcessedFile == "false" - fastqFileError_uploadProcessedFile == "false" - seqtypeInferError_uploadProcessedFile == "false" - speciesError_uploadProcessedFile == "false" - pipelineError_uploadProcessedFile == "false" + fastqCountError == "false" + fastqReadError == "false" + fastqFileError == "false" + seqtypeError == "false" + speciesError == "false" + pipelineError == "false" script: """ @@ -2475,7 +2442,7 @@ process uploadProcessedFile { /* * uploadOutputBag: uploads the output bag -*/ + */ process uploadOutputBag { tag "${repRID}" @@ -2485,24 +2452,24 @@ process uploadOutputBag { path outputBag val studyRID from studyRID_uploadOutputBag val executionRunRID from executionRunRID_uploadOutputBag - val fastqCountError_uploadOutputBag - val fastqReadError_uploadOutputBag - val fastqFileError_uploadOutputBag - val seqtypeInferError_uploadOutputBag - val speciesError_uploadOutputBag - val pipelineError_uploadOutputBag + val fastqCountError from fastqCountError_uploadOutputBag + val fastqReadError from fastqReadError_uploadOutputBag + val fastqFileError from fastqFileError_uploadOutputBag + val seqtypeError from seqtypeError_uploadOutputBag + val speciesError from speciesError_uploadOutputBag + val pipelineError from pipelineError_uploadOutputBag output: path ("outputBagRID.csv") into outputBagRID_fl when: upload - fastqCountError_uploadOutputBag == "false" - fastqReadError_uploadOutputBag == "false" - fastqFileError_uploadOutputBag == "false" - seqtypeInferError_uploadOutputBag == "false" - speciesError_uploadOutputBag == "false" - pipelineError_uploadOutputBag == "false" + fastqCountError == "false" + fastqReadError == "false" + fastqFileError == "false" + seqtypeError == "false" + speciesError == "false" + pipelineError == "false" script: """ @@ -2555,7 +2522,7 @@ outputBagRID_fl.splitCsv(sep: ",", header: false).separate( /* * finalizeExecutionRun: finalizes the execution run -*/ + */ process finalizeExecutionRun { tag "${repRID}" @@ -2565,9 +2532,21 @@ process finalizeExecutionRun { val executionRunRID from executionRunRID_finalizeExecutionRun val inputBagRID from inputBagRID_finalizeExecutionRun val outputBagRID + val fastqCountError from fastqCountError_finalizeExecutionRun + val fastqReadError from fastqReadError_finalizeExecutionRun + val fastqFileError from fastqFileError_finalizeExecutionRun + val seqtypeError from seqtypeError_finalizeExecutionRun + val speciesError from speciesError_finalizeExecutionRun + val pipelineError from pipelineError_finalizeExecutionRun when: upload + fastqCountError == "false" + fastqReadError == "false" + fastqFileError == "false" + seqtypeError == "false" + speciesError == "false" + pipelineError == "false" script: """ @@ -2598,16 +2577,16 @@ process finalizeExecutionRun { } // Combine errors -error_meta = fastqCountError_uploadQC_fail.ifEmpty(false).combine(fastqReadError_uploadQC_fail.ifEmpty(false).combine(fastqFileError_uploadQC_fail.ifEmpty(false).combine(seqtypeInferError_uploadQC_fail.ifEmpty(false).combine(speciesError_uploadQC_fail.ifEmpty(false).combine(pipelineError_uploadQC_fail.ifEmpty(false)))))) -error_meta. 
into{ +error_meta = fastqCountError_uploadQC_fail.ifEmpty(false).combine(fastqReadError_uploadQC_fail.ifEmpty(false).combine(fastqFileError_uploadQC_fail.ifEmpty(false).combine(seqtypeError_uploadQC_fail.ifEmpty(false).combine(speciesError_uploadQC_fail.ifEmpty(false).combine(pipelineError_uploadQC_fail.ifEmpty(false)))))) +error_meta. into { error_failPreExecutionRun error_uploadQC_fail } -errorDetails = fastqCountError_details.ifEmpty("").combine(fastqReadError_details.ifEmpty("").combine(fastqFileError_details.ifEmpty("").combine(seqtypeInferError_details.ifEmpty("").combine(speciesError_details.ifEmpty(""))))) +errorDetails = fastqCountError_details.ifEmpty("").combine(fastqReadError_details.ifEmpty("").combine(fastqFileError_details.ifEmpty("").combine(seqtypeError_details.ifEmpty("").combine(speciesErrorSeqwho_details.ifEmpty(""))))) /* - * failPreExecutionRun_fastq: fail the execution run prematurely for fastq errors -*/ + * failPreExecutionRun: fail the execution run prematurely for fastq errors + */ process failPreExecutionRun { tag "${repRID}" @@ -2617,15 +2596,15 @@ process failPreExecutionRun { val spike from spikeMeta_failPreExecutionRun val species from speciesMeta_failPreExecutionRun val inputBagRID from inputBagRID_failPreExecutionRun - tuple val (fastqCountError), val (fastqReadError), val (fastqFileError), val (seqtypeInferError), val (speciesError), val (pipelineError) from error_failPreExecutionRun - tuple val (fastqCountError_details), val (fastqReadError_details), val (fastqFileError_details), val (seqtypeInferError_details), val (speciesError_details) from errorDetails + tuple val (fastqCountError), val (fastqReadError), val (fastqFileError), val (seqtypeError), val (speciesError), val (pipelineError) from error_failPreExecutionRun + tuple val (fastqCountError_details), val (fastqReadError_details), val (fastqFileError_details), val (seqtypeError_details), val (speciesError_details) from errorDetails output: path ("executionRunRID.csv") into executionRunRID_preFail_fl when: upload - fastqCountError == "true" || fastqReadError == "true" || fastqFileError == "true" || seqtypeInferError == "true" || speciesError == "true" + fastqCountError == "true" || fastqReadError == "true" || fastqFileError == "true" || seqtypeError == "true" || speciesError == "true" script: """ @@ -2642,9 +2621,9 @@ process failPreExecutionRun { elif [ ${fastqFileError} == true ] then errorDetails=\$(echo \$(errorDetails)${fastqFileError_details}"\\n") - elif [ ${seqtypeInferError} == true ] + elif [ ${seqtypeError} == true ] then - errorDetails=\$(echo \$(errorDetails)${seqtypeInferError_details}"\\n") + errorDetails=\$(echo \$(errorDetails)${seqtypeError_details}"\\n") elif [ ${speciesError} == true ] then errorDetails=\$(echo \$(errorDetails)${speciesError_details}"\\n") @@ -2715,7 +2694,7 @@ failExecutionRunRID = executionRunRID_fail.ifEmpty('').mix(executionRunRID_preFa /* * failExecutionRun: fail the execution run -*/ + */ process failExecutionRun { tag "${repRID}" @@ -2810,7 +2789,7 @@ process failExecutionRun { /* * uploadQC_fail: uploads the mRNA QC on failed execution run -*/ + */ process uploadQC_fail { tag "${repRID}" @@ -2819,11 +2798,11 @@ process uploadQC_fail { path script_uploadQC_fail path credential, stageAs: "credential.json" from deriva_uploadQC_fail val executionRunRID from failExecutionRunRID - tuple val (fastqCountError), val (fastqReadError), val (fastqFileError), val (speciesError), val (pipelineError) from error_uploadQC_fail + tuple val (fastqCountError), val 
(fastqReadError), val (fastqFileError), val (seqtypeError), val (speciesError), val (pipelineError) from error_uploadQC_fail when: upload - fastqCountError == 'true' || fastqReadError == 'true' || fastqFileError == 'true' || speciesError == 'true' || pipelineError == 'true' + fastqCountError == "true" || fastqReadError == "true" || fastqFileError == "true" || seqtypeError == "true" || speciesError == "true" || pipelineError == 'true' script: """ @@ -2852,7 +2831,6 @@ process uploadQC_fail { """ } - workflow.onError = { subject = "$workflow.manifest.name FAILED: $params.repRID" diff --git a/workflow/scripts/generate_versions.py b/workflow/scripts/generate_versions.py index 09447d17a62a439a418753398e1cd77716ceaa74..ecaeb7c44a920bbd3cadcb73d7cccdaed7d8ab31 100644 --- a/workflow/scripts/generate_versions.py +++ b/workflow/scripts/generate_versions.py @@ -34,6 +34,7 @@ SOFTWARE_REGEX = { 'Python': ['version_python.txt', r"Python (\S+)"], 'DERIVA': ['version_deriva.txt', r"(\S+)"], 'BDBag': ['version_bdbag.txt', r"BDBag (\S+) \(Bagit \S+\)"], + 'SeqWho': ['version_seqwho.txt', r"Version: (\S+)"], 'RSeQC': ['version_rseqc.txt', r"infer_experiment.py (\S+)"], 'Trim Galore!': ['version_trimgalore.txt', r"version (\S+)"], 'HISAT2': ['version_hisat2.txt', r"version (\S+)"], @@ -93,6 +94,7 @@ def main(): results['Python'] = '<span style="color:#999999;\">Not Run</span>' results['DERIVA'] = '<span style="color:#999999;\">Not Run</span>' results['BDBag'] = '<span style="color:#999999;\">Not Run</span>' + results['SeqWho'] = '<span style="color:#999999;\">Not Run</span>' results['RSeQC'] = '<span style="color:#999999;\">Not Run</span>' results['Trim Galore!'] = '<span style="color:#999999;\">Not Run</span>' results['HISAT2'] = '<span style="color:#999999;\">Not Run</span>' diff --git a/workflow/scripts/get_updated_badge_info.sh b/workflow/scripts/get_updated_badge_info.sh index 4b929272f2ea80ede5d47b84cd55bad2c6a3fa7b..a8c40333bfda4828240b961dd50dd27e6300f458 100644 --- a/workflow/scripts/get_updated_badge_info.sh +++ b/workflow/scripts/get_updated_badge_info.sh @@ -13,6 +13,7 @@ echo "collecting tool version for badges" python_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o Python.* | grep -oP "(?<=d>).*(?=\<)") deriva_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o DERIVA.* | grep -oP "(?<=d>).*(?=\<)") bdbag_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o BDBag.* | grep -oP "(?<=d>).*(?=\<)") +seqwho_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o SeqWho.* | grep -oP "(?<=d>).*(?=\<)") rseqc_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o RSeQC.* | grep -oP "(?<=d>).*(?=\<)") trimgalore_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o 'Trim Galore!'.* | grep -oP "(?<=d>).*(?=\<)") hisat2_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o HISAT2.* | grep -oP "(?<=d>).*(?=\<)") @@ -36,6 +37,7 @@ curl --request GET https://img.shields.io/badge/Nextflow%20Version-${develop_nex curl --request GET https://img.shields.io/badge/Python%20Version-${python_version}-blueviolet?style=flat > ./badges/tools/python.svg curl --request GET https://img.shields.io/badge/DERIVA%20Version-${deriva_version}-blueviolet?style=flat > ./badges/tools/deriva.svg +curl --request GET https://img.shields.io/badge/SeqWho%20Version-${seqwho_version}-blueviolet?style=flat > ./badges/tools/seqwho.svg 
curl --request GET https://img.shields.io/badge/BDBag%20Version-${bdbag_version}-blueviolet?style=flat > ./badges/tools/bdbag.svg curl --request GET https://img.shields.io/badge/RSeQC%20Version-${rseqc_version}-blueviolet?style=flat > ./badges/tools/rseqc.svg curl --request GET https://img.shields.io/badge/Trim%20Galore%20Version-${trimgalore_version}-blueviolet?style=flat > ./badges/tools/trimgalore.svg diff --git a/workflow/tests/test_seqwho.py b/workflow/tests/test_seqwho.py new file mode 100644 index 0000000000000000000000000000000000000000..051cc4b379bc2378b2effff22f4737592d9b54cd --- /dev/null +++ b/workflow/tests/test_seqwho.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 + +import pytest +import pandas as pd +from io import StringIO +import os + +test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ + '/../../' + + +@pytest.mark.seqwho +def test_seqwho(): + assert os.path.exists(os.path.join( + test_output_path, 'SeqWho_call.tsv'))
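A closing note on the error-flag merge introduced above for `failPreExecutionRun` and `uploadQC_fail`: `ifEmpty(false)` substitutes a default value when an upstream error channel never emitted, so the nested `combine` calls can still assemble a single tuple of flags for the failure path. A minimal sketch of the idiom; the `errA`/`errB`/`errC` channels are hypothetical, not the pipeline's:

```groovy
// Hypothetical flags: one upstream check emitted, two never fired.
errA = Channel.from("true")
errB = Channel.empty()
errC = Channel.empty()

// ifEmpty(false) injects a default so combine() can still pair values;
// the nested combines flatten into one tuple per emission.
error_all = errA.ifEmpty(false)
                .combine(errB.ifEmpty(false)
                .combine(errC.ifEmpty(false)))

error_all.view()  // prints: [true, false, false]
```

Downstream, a process can destructure the combined tuple the same way the diff does, e.g. `tuple val (a), val (b), val (c) from error_all`.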