Merge branch '0.0.3' into 'develop'

0.0.3 Closes #74, #73, #72, #71, and #61 See merge request !42

Merge branch '0.0.3' into 'develop'
0.0.3 Closes #74, #73, #72, #71, and #61 See merge request !42
d14cdf77 · Venkat Malladi · 64c587de · 1bfecd57 · d14cdf77 · d14cdf77
Commit d14cdf77 authored 4 years ago by Venkat Malladi
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -14,6 +14,11 @@ stages:

 getBag:
  stage: unit
+  only:
+    - push
+    - tags
+  except:
+    - merge_requests
  script:
  - ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json
  - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' deriva-download-cli dev.gudmap.org --catalog 2 ./workflow/conf/replicate_export_config.json . rid=Q-Y5F6
@@ -21,14 +26,24 @@ getBag:

 getData:
  stage: unit
+  only:
+    - push
+    - tags
+  except:
+    - merge_requests
  script:
  - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt
-  - unzip ./test_data/bagit/Replicate_Q-Y5F6.zip
+  - unzip ./test_data/bag/Replicate_Q-Y5F6.zip
  - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bash ./workflow/scripts/bdbagFetch.sh Replicate_Q-Y5F6 Replicate_Q-Y5F6 TEST
  - pytest -m getData

 parseMetadata:
  stage: unit
+  only:
+    - push
+    - tags
+  except:
+    - merge_requests
  script:
  - rep=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p repRID)
  - exp=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p expRID)
@@ -44,6 +59,11 @@ parseMetadata:

 inferMetadata:
  stage: unit
+  only:
+    - push
+    - tags
+  except:
+    - merge_requests
  script:
  - >
    align=$(echo $(grep "Overall alignment rate" ./test_data/meta/Q-Y5F6_1M.se.alignSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) &&
@@ -56,6 +76,11 @@ inferMetadata:

 getRef:
  stage: unit
+  only:
+    - push
+    - tags
+  except:
+    - merge_requests
  script:
  - mkdir -p hu
  - mkdir -p mo
@@ -64,6 +89,11 @@ getRef:

 trimData:
  stage: unit
+  only:
+    - push
+    - tags
+  except:
+    - merge_requests
  script:
  - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --basename Q-Y5F6_1M.se ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
  - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --paired --basename Q-Y5F6_1M.pe ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
@@ -73,12 +103,22 @@ trimData:

 downsampleData:
  stage: unit
+  only:
+    - push
+    - tags
+  except:
+    - merge_requests
  script:
  - singularity run 'docker://bicf/seqtk:2.0.1_indev' seqtk sample -s100 ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz 1000 1> sampled.1.fq
  - pytest -m downsampleData

 alignData:
  stage: unit
+  only:
+    - push
+    - tags
+  except:
+    - merge_requests
  script:
  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.se.unal.gz -S Q-Y5F6_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz --summary-file Q-Y5F6_1M.se.alignSummary.txt --new-summary
  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam
@@ -92,6 +132,11 @@ alignData:

 dedupData:
  stage: unit
+  only:
+    - push
+    - tags
+  except:
+    - merge_requests
  script:
  - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5F6_1M.se.sorted.bam O=Q-Y5F6_1M.se.deduped.bam M=Q-Y5F6_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true
  - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam ./test_data/bam/small/Q-Y5F6_1M.se.deduped.bam
@@ -104,26 +149,49 @@ dedupData:

 countData:
  stage: unit
+  only:
+    - push
+    - tags
+  except:
+    - merge_requests
  script:
-  - singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' -o Q-Y5F6_1M.se.featureCounts -s 1 -R SAM --primary --ignoreDup ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam 
-  - singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count ./test_data/counts/small/Q-Y5F6_1M.se.featureCounts
+  - ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/geneID.tsv
+  - ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/Entrez.tsv
+  - singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se.countData -s 1 -R SAM --primary --ignoreDup ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam 
+  - singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count ./test_data/counts/small/Q-Y5F6_1M.se.countData
+  - singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se
  - assignedReads=$(grep -m 1 'Assigned' *.summary | grep -oe '\([0-9.]*\)')
  - pytest -m makeFeatureCounts

 makeBigWig:
  stage: unit
+  only:
+    - push
+    - tags
+  except:
+    - merge_requests
  script:
  - singularity run 'docker://bicf/deeptools3.3:2.0.1_indev' bamCoverage -p 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -o Q-Y5F6_1M.se.bw
  - pytest -m makeBigWig

 fastqc:
  stage: unit
+  only:
+    - push
+    - tags
+  except:
+    - merge_requests
  script:
  - singularity run 'docker://bicf/fastqc:2.0.1_indev' fastqc ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o .
  - pytest -m fastqc

 dataQC:
  stage: unit
+  only:
+    - push
+    - tags
+  except:
+    - merge_requests
  script:
  - echo -e  "geneID\tchrom\ttx_start\ttx_end\tTIN" > Q-Y5F6_1M.se.sorted.deduped.tin.xls
  - for i in {"chr8","chr4","chrY"}; do
@@ -132,6 +200,11 @@ dataQC:

 outputBag:
  stage: unit
+  only:
+    - push
+    - tags
+  except:
+    - merge_requests
  script:
  - mkdir test
  - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bdbag test --archiver zip
@@ -140,6 +213,11 @@ outputBag:

 integration_se:
  stage: integration
+  only:
+    - merge_requests
+  except:
+    refs:
+      - master
  script:
  - hostname
  - ulimit -a
@@ -150,11 +228,21 @@ integration_se:
    when: always
    paths:
      - output/qc/
+      - output/report/
      - SE_multiqc_data.json
    expire_in: 7 days
+  retry:
+    max: 1
+    when:
+      - always

 integration_pe:
  stage: integration
+  only:
+    - merge_requests
+  except:
+    refs:
+      - master
  script:
  - hostname
  - ulimit -a
@@ -166,11 +254,91 @@ integration_pe:
    paths:
      - dag.png
      - output/qc/
+      - output/report/
      - PE_multiqc_data.json
    expire_in: 7 days
+  retry:
+    max: 1
+    when:
+      - always
+
+override_inputBag:
+  stage: integration
+  only:
+    - merge_requests
+  except:
+    refs:
+      - master
+  script:
+  - hostname
+  - ulimit -a
+  - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --inputBagForce ./test_data/bag/Replicate_Q-Y5F6.zip --ci true
+  - find . -type f -name "multiqc_data.json" -exec cp {} ./inputBagOverride_PE_multiqc_data.json \;
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - inputBagOverride_PE_multiqc_data.json
+    expire_in: 7 days
+  retry:
+    max: 1
+    when:
+      - always
+
+override_fastq:
+  stage: integration
+  only:
+    - merge_requests
+  except:
+    refs:
+      - master
+  script:
+  - hostname
+  - ulimit -a
+  - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --fastqsForce './test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz' --ci true
+  - find . -type f -name "multiqc_data.json" -exec cp {} ./fastqOverride_PE_multiqc_data.json \;
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - fastqOverride_PE_multiqc_data.json
+    expire_in: 7 days
+  retry:
+    max: 1
+    when:
+      - always
+    
+override_species:
+  stage: integration
+  only:
+    - merge_requests
+  except:
+    refs:
+      - master
+  script:
+  - hostname
+  - ulimit -a
+  - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ER --speciesForce 'Homo sapiens' --ci true
+  - find . -type f -name "multiqc_data.json" -exec cp {} ./speciesOverride_PE_multiqc_data.json \;
+  artifacts:
+    name: "$CI_JOB_NAME"
+    when: always
+    paths:
+      - speciesOverride_PE_multiqc_data.json
+    expire_in: 7 days
+  retry:
+    max: 1
+    when:
+      - always
+  

 consistency:
  stage: consistency
+  only:
+    - merge_requests
+  except:
+    refs:
+      - master
  script:
  - grep -m 1 \"Assigned\":.[0-9] SE_multiqc_data.json | grep -oe '\([0-9.]*\)' > assignedSE.txt
  - grep -m 1 \"Assigned\":.[0-9] PE_multiqc_data.json | grep -oe '\([0-9.]*\)' > assignedPE.txt

--- a/CHANGELOG.md
+++ b/CHANGELOG.md
-# v0.0.2 (in development)
+# v0.0.3 (in development)
+**User Facing**
+* TPM table:
+  * Add Ensembl Gene ID
+  * Rename columns: *GENCODE_Gene_Symbol*, *Ensembl_GeneID*, *NCBI_GeneID*
+* MultiQC output custom tables (html+JSON):
+  * Run table: *Session ID* and *Pipeline Version*
+  * Reference Table: *Species*, *Genome Reference Consortium Build*, *Genome Reference Consortium Patch*, *GENCODE Annotation Release* (outputs both human and mouse versions)
+* Add inputBag override param (`inputBagForce`) [`*.zip`]
+  * Uses provided inputBag instead of downloading from data-hub
+  * Still requires matching repRID input param
+* Add fastq override param (`fastqsForce`) [`R1`,`R2`]
+  * Uses provided fastq instead of downloading from data-hub
+  * Still requires matching repRID input param and will pull inputBag from data-hub to access submitted metadata for reporting
+* Add species override param (`speciesForce`) [`Mus musculus` or `Homo sapiens`]
+  * forces the use of the provided species
+  * ignores inferred ambiguous species
+
+**Background**
+* Add GeneSymbol/EnsemblID/EntrezID translation files to references
+
+*Known Bugs*
+* outputBag does not contain fetch for processed data
+* Does not include automatic data upload
+* Override params (inputBag, fastq, species) aren't checked for integrity
+
+<hr>
+
+# v0.0.2
 **User Facing**
 * Output:
  * inputBag
@@ -19,4 +47,4 @@
 **INITIAL BETA VERSION**\
 Does not include automatic data upload\
 This version is for initial upload of test data to GUDMAP/RBK data-hub for internal integration
-<hr>
+<hr>
\ No newline at end of file
--- a/README.md
+++ b/README.md
@@ -48,6 +48,13 @@ To Run:
  * reference version consists of Genome Reference Consortium version, patch release and GENCODE annotation release # (leaving the params blank will use the default version tied to the pipeline version)
    * *current mouse* **38.p6.vM22** = GRCm38.p6 with GENCODE annotation release M22
    * *current human* **38.p12.v31** = GRCh38.p12 with GENCODE annotation release 31
+* ***Optional*** input overrides
+  * `--inputBagForce` utilizes a local replicate inputBag instead of downloading from the data-hub (still requires accurate repRID input)
+    * eg: `--inputBagForce test_data/bag/Replicate_Q-Y5F6.zip` (must be the expected bag structure)
+  * `--fastqsForce` utilizes local fastq's instead of downloading from the data-hub (still requires accurate repRID input)
+    * eg: `--fastqsForce 'test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz'` (note the quotes around fastq's which must be named in the correct standard [*\*.R1.fastq.gz and/or \*.R2.fastq.gz*] and in the correct order)
+  * `--speciesForce` forces the species to be "Mus musculus" or "Homo sapiens", it bypasses ambiguous species error
+    * eg: `--speciesForce 'Mus musculus'`
 * Tracking parameters ([Tracking Site](http://bicf.pipeline.tracker.s3-website-us-east-1.amazonaws.com/)):
  * `--ci` boolean (default = false)
  * `--dev` boolean (default = false)

--- a/docs/dag.png
+++ b/docs/dag.png
--- a/test_data/createTestData.sh
+++ b/test_data/createTestData.sh
@@ -9,12 +9,12 @@ mkdir -p NEW_test_data

 ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json

-mkdir -p ./NEW_test_data/bagit
+mkdir -p ./NEW_test_data/bag
 singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' deriva-download-cli dev.gudmap.org --catalog 2 ./workflow/conf/replicate_export_config.json . rid=Q-Y5F6
-cp Replicate_Q-Y5F6.zip ./NEW_test_data/bagit/Replicate_Q-Y5F6.zip
+cp Replicate_Q-Y5F6.zip ./NEW_test_data/bag/Replicate_Q-Y5F6.zip

 mkdir -p ./NEW_test_data/fastq
-unzip ./test_data/bagit/Replicate_Q-Y5F6.zip
+unzip ./test_data/bag/Replicate_Q-Y5F6.zip
 singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' bash ./workflow/scripts/bdbagFetch.sh Replicate_Q-Y5F6 Replicate_Q-Y5F6
 cp Replicate_Q-Y5F6.R1.fastq.gz ./NEW_test_data/fastq/Replicate_Q-Y5F6.R1.fastq.gz
 cp Replicate_Q-Y5F6.R2.fastq.gz ./NEW_test_data/fastq/Replicate_Q-Y5F6.R2.fastq.gz
@@ -81,10 +81,14 @@ cp Q-Y5F6_1M.se.sorted.deduped.chrY.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M.

 mkdir -p ./NEW_test_data/counts
 mkdir -p ./NEW_test_data/counts/small
-singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' -o Q-Y5F6_1M.se.featureCounts -s 1 -R SAM --primary --ignoreDup ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam 
-singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count Q-Y5F6_1M.se.featureCounts
-cp Q-Y5F6_1M.se.featureCounts ./NEW_test_data/counts/small/Q-Y5F6_1M.se.featureCounts
+ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/geneID.tsv
+ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/Entrez.tsv
+singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se.countData -s 1 -R SAM --primary --ignoreDup ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam 
+singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count Q-Y5F6_1M.se.countData
+singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se
+cp Q-Y5F6_1M.se.countData ./NEW_test_data/counts/small/Q-Y5F6_1M.se.countData  # featureCounts output is named by -o above
 cp Q-Y5F6_1M.se.countTable.csv ./NEW_test_data/counts/small/Q-Y5F6_1M.se.countTable.csv
+cp Q-Y5F6_1M.se.tpmTable.csv ./NEW_test_data/counts/small/Q-Y5F6_1M.se.tpmTable.csv  # written by convertGeneSymbols.R

 mkdir -p ./NEW_test_data/bw
 mkdir -p ./NEW_test_data/bw/small

--- a/workflow/conf/multiqc_config.yaml
+++ b/workflow/conf/multiqc_config.yaml
@@ -48,14 +48,32 @@ top_modules:
        - '*infer_experiment*'

 report_section_order:
+    run:
+      order: 4000
    rid:
-      order: 2000
+      order: 3000
    meta:
+      order: 2000
+    ref:
      order: 1000

 skip_generalstats: true

 custom_data:
+    run:
+        file_format: 'tsv'
+        section_name: 'Run'
+        description: 'This is the run information'
+        plot_type: 'table'
+        pconfig:
+            id: 'run'
+            scale: false
+            format: '{}'
+        headers:
+            Session
+            Session ID
+            Pipeline Version
+            Input
    rid:
        file_format: 'tsv'
        section_name: 'RID'
@@ -63,7 +81,10 @@ custom_data:
        plot_type: 'table'
        pconfig:
            id: 'rid'
+            scale: false
+            format: '{}'
        headers:
+            Replicate
            Replicate RID
            Experiment RID
            Study RID
@@ -74,6 +95,7 @@ custom_data:
        plot_type: 'table'
        pconfig:
            id: 'meta'
+            scale: false
            format: '{:,.0f}'
        headers:
            Source
@@ -85,6 +107,21 @@ custom_data:
            Assigned Reads
            Median Read Length
            Median TIN
+            Pipeline Version
+    ref:  # table of reference genome/annotation versions
+        file_format: 'tsv'
+        section_name: 'Reference'
+        description: 'This is the reference version information'
+        plot_type: 'table'
+        pconfig:
+            id: 'ref'
+            scale: false
+            format: '{}'
+        headers:
+            Species
+            Genome Reference Consortium Build
+            Genome Reference Consortium Patch
+            GENCODE Annotation Release
    tin:
        file_format: 'tsv'
        section_name: 'TIN'
@@ -106,9 +143,13 @@ custom_data:
            90 - 99

 sp:
+    run:
+        fn: "run.tsv"
    rid:
        fn: 'rid.tsv'
    meta:
        fn: 'metadata.tsv'
+    ref:
+        fn: 'reference.tsv'
    tin:
-        fn: '*.tin.hist.tsv'
+        fn: '*.tin.hist.tsv'
\ No newline at end of file
--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
@@ -97,6 +97,6 @@ manifest {
  homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq'
  description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.'
  mainScript = 'rna-seq.nf'
-  version = 'v0.0.2_indev'
+  version = 'v0.0.3_indev'
  nextflowVersion = '>=19.09.0'
 }
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -19,6 +19,11 @@ params.refHuVersion = "38.p12.v31"
 params.refERCCVersion = "92"
 params.outDir = "${baseDir}/../output"

+// Define override input variable
+params.inputBagForce = ""
+params.fastqsForce = ""
+params.speciesForce = ""
+
 // Parse input variables
 deriva = Channel
  .fromPath(params.deriva)
@@ -32,6 +37,9 @@ refHuVersion = params.refHuVersion
 refERCCVersion = params.refERCCVersion
 outDir = params.outDir
 logsDir = "${outDir}/Logs"
+inputBagForce = params.inputBagForce
+fastqsForce = params.fastqsForce
+speciesForce = params.speciesForce

 // Define fixed files
 derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json")
@@ -117,7 +125,10 @@ process getBag {
    path derivaConfig

  output:
-    path ("Replicate_*.zip") into bagit
+    path ("Replicate_*.zip") into bag
+
+  when:
+    inputBagForce == ""

  script:
    """
@@ -131,12 +142,21 @@ process getBag {
    echo -e "LOG: linked" >> ${repRID}.getBag.log

    # deriva-download replicate RID
-    echo -e "LOG: fetching bagit for ${repRID} in GUDMAP" >> ${repRID}.getBag.log
+    echo -e "LOG: fetching bag for ${repRID} in GUDMAP" >> ${repRID}.getBag.log
    deriva-download-cli ${source} --catalog 2 ${derivaConfig} . rid=${repRID}
    echo -e "LOG: fetched" >> ${repRID}.getBag.log
    """
 }

+// Set inputBag to downloaded or forced input
+if (inputBagForce != "") {
+  inputBag = Channel
+    .fromPath(inputBagForce)
+    .ifEmpty { exit 1, "override inputBag file not found: ${inputBagForce}" }
+} else {
+  inputBag = bag
+}
+
 /*
 * getData: fetch study files from consortium with downloaded bdbag.zip
 */
@@ -146,7 +166,7 @@ process getData {
  input:
    path script_bdbagFetch
    path cookies, stageAs: "deriva-cookies.txt" from bdbag
-    path bagit
+    path inputBag

  output:
    path ("*.R{1,2}.fastq.gz") into fastqs
@@ -158,33 +178,43 @@ process getData {
    """
    hostname > ${repRID}.getData.log
    ulimit -a >> ${repRID}.getData.log
-    
+
    # link deriva cookie for authentication
    echo -e "LOG: linking deriva cookie" >> ${repRID}.getData.log
    mkdir -p ~/.bdbag
    ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt
    echo -e "LOG: linked" >> ${repRID}.getData.log
    
-    # get bagit basename
-    replicate=\$(basename "${bagit}" | cut -d "." -f1)
-    echo -e "LOG: bagit replicate name \${replicate}" >> ${repRID}.getData.log
+    # get bag basename
+    replicate=\$(basename "${inputBag}" | cut -d "." -f1)
+    echo -e "LOG: bag replicate name \${replicate}" >> ${repRID}.getData.log
    
-    # unzip bagit
-    echo -e "LOG: unzipping replicate bagit" >> ${repRID}.getData.log
-    unzip ${bagit}
+    # unzip bag
+    echo -e "LOG: unzipping replicate bag" >> ${repRID}.getData.log
+    unzip ${inputBag}
    echo -e "LOG: unzipped" >> ${repRID}.getData.log
    
-    # bagit fetch fastq's only and rename by repRID
+    # bag fetch fastq's only and rename by repRID
    echo -e "LOG: fetching replicate bdbag" >> ${repRID}.getData.log
    sh ${script_bdbagFetch} \${replicate} ${repRID}
    echo -e "LOG: fetched" >> ${repRID}.getData.log
    """
 }

-// Replicate raw fastq's for multiple process inputs
-fastqs.into {
-  fastqs_trimData
-  fastqs_fastqc
+// Use the forced local fastq's when --fastqsForce is set, otherwise the ones fetched by getData; either way feed both trimData and fastqc
+if (fastqsForce != "") {
+  Channel
+    .fromPath(fastqsForce)
+    .ifEmpty { exit 1, "override fastq file(s) not found: ${fastqsForce}" }
+    .collect().into {
+      fastqs_trimData
+      fastqs_fastqc
+    }
+} else {
+  fastqs.into {
+    fastqs_trimData
+    fastqs_fastqc
+  }
 }

 /*
@@ -533,7 +563,24 @@ process inferMetadata {
      bed="./GRCm/bed/genome.bed"
    else
      echo -e "LOG: ERROR - inference of species returns an ambiguous result: hu=\${align_hu} mo=\${align_mo}" >> ${repRID}.inferMetadata.log
-      exit 1
+      if [ "${speciesForce}" == "" ]
+      then
+        exit 1
+      fi
+    fi
+    if [ "${speciesForce}" != "" ]  # a forced species replaces whatever was inferred above
+    then
+      echo -e "LOG: species overridden to: ${speciesForce}" >> ${repRID}.inferMetadata.log
+      species="${speciesForce}"
+      if [ "${speciesForce}" == "Homo sapiens" ]
+      then
+        bam="GRCh.sampled.sorted.bam"
+        bed="./GRCh/bed/genome.bed"
+      elif [ "${speciesForce}" == "Mus musculus" ]
+      then
+        bam="GRCm.sampled.sorted.bam"
+        bed="./GRCm/bed/genome.bed"
+      fi
     fi
    echo -e "LOG: inference of species results in: \${species}" >> ${repRID}.inferMetadata.log

@@ -875,10 +922,10 @@ process countData {
    echo -e "LOG: counting ${ends} features" >> ${repRID}.countData.log
    if [ "${ends}" == "se" ]
    then
-      featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' -o ${repRID}.countData -s \${stranding} -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam
+      featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}.countData -s \${stranding} -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam
    elif [ "${ends}" == "pe" ]
    then
-      featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' -o ${repRID}.countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam
+      featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}.countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam
    fi
    echo -e "LOG: counted" >> ${repRID}.countData.log
    
@@ -1034,18 +1081,53 @@ process aggrQC {
    hostname > ${repRID}.aggrQC.log
    ulimit -a >> ${repRID}.aggrQC.log

+    # make run table
+    if [ "${params.inputBagForce}" == "" ] && [ "${params.fastqsForce}" == "" ] && [ "${params.speciesForce}" == "" ]
+    then
+      input="default"
+    else
+      input="override:"
+      if [ "${params.inputBagForce}" != "" ]
+      then
+        input=\$(echo \${input} inputBag)
+      fi
+      if [ "${params.fastqsForce}" != "" ]
+      then
+        input=\$(echo \${input} fastq)
+      fi
+      if [ "${params.speciesForce}" != "" ]
+      then
+        input=\$(echo \${input} species)
+      fi
+    fi
+    echo -e "LOG: creating run table" >> ${repRID}.aggrQC.log
+    echo -e "Session\tSession ID\tPipeline Version\tInput" > run.tsv
+    echo -e "Session\t${workflow.sessionId}\t${workflow.manifest.version}\t\${input}" >> run.tsv
+    
+    
    # make RID table
    echo -e "LOG: creating RID table" >> ${repRID}.aggrQC.log
-    echo -e "Replicate RID\tExperiment RID\tStudy RID" > rid.tsv
-    echo -e "${repRID}\t${expRID}\t${studyRID}" >> rid.tsv
+    echo -e "Replicate\tReplicate RID\tExperiment RID\tStudy RID" > rid.tsv
+    echo -e "Replicate\t${repRID}\t${expRID}\t${studyRID}" >> rid.tsv

    # make metadata table
    echo -e "LOG: creating metadata table" >> ${repRID}.aggrQC.log
    echo -e "Source\tSpecies\tEnds\tStranded\tSpike-in\tRaw Reads\tAssigned Reads\tMedian Read Length\tMedian TIN" > metadata.tsv
    echo -e "Submitter\t${speciesM}\t${endsM}\t${strandedM}\t${spikeM}\t-\t-\t'${readLengthM}'\t-" >> metadata.tsv
-    echo -e "Infered\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv
+    if [ "${params.speciesForce}" == "" ]
+    then
+      echo -e "Infered\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv
+    else
+      echo -e "Infered\t${speciesI} (FORCED)\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv
+    fi
    echo -e "Measured\t-\t${endsManual}\t-\t-\t'${rawReadsI}'\t'${assignedReadsI}'\t'${readLengthI}'\t'${tinMedI}'" >> metadata.tsv

+    # make reference table
+    echo -e "LOG: creating reference table" >> ${repRID}.aggrQC.log
+    echo -e "Species\tGenome Reference Consortium Build\tGenome Reference Consortium Patch\tGENCODE Annotation Release" > reference.tsv
+    echo -e "Human\tGRCh\$(echo `echo ${params.refHuVersion} | cut -d "." -f 1`)\t\$(echo `echo ${params.refHuVersion} | cut -d "." -f 2`)\t'\$(echo `echo ${params.refHuVersion} | cut -d "." -f 3 | sed "s/^v//"`)'" >> reference.tsv
+    echo -e "Mouse\tGRCm\$(echo `echo ${params.refMoVersion} | cut -d "." -f 1`)\t\$(echo `echo ${params.refMoVersion} | cut -d "." -f 2`)\t'\$(echo `echo ${params.refMoVersion} | cut -d "." -f 3 | sed "s/^v//"`)'" >> reference.tsv
+
    # remove inner distance report if it is empty (SE repRID)
    echo -e "LOG: removing dummy inner distance file" >> ${repRID}.aggrQC.log
    if [ "${endsM}" == "se" ]
@@ -1081,5 +1163,4 @@ process outputBag {
  cp ${multiqcJSON} Replicate_${repRID}.outputBag
  bdbag Replicate_${repRID}.outputBag --archiver zip
  """
-}
-
+}
\ No newline at end of file
--- a/workflow/scripts/calculateTPM.R
+++ b/workflow/scripts/calculateTPM.R
@@ -13,10 +13,13 @@ if (!("count" %in% names(opt))){
  stop("Count file doesn't exist, exiting.")
 }

-repRID <- basename(gsub(".featureCounts","",opt$count))
+repRID <- basename(gsub(".countData","",opt$count))

 count <- read.delim(opt$count, comment.char="#") # if featureCounts file changes structure, be sure to update count and Length columns below
-colnames(count)[7] <- "count"
+colnames(count)[1] <- "gene_name"
+colnames(count)[7] <- "gene_id"
+colnames(count)[8] <- "count"
+count <- count[,c(1,7,2:6,8)]

 rpk <- count$count/count$Length/1000


--- a/workflow/scripts/convertGeneSymbols.R
+++ b/workflow/scripts/convertGeneSymbols.R
@@ -7,18 +7,20 @@ option_list=list(
 opt=parse_args(OptionParser(option_list=option_list))
 rm(option_list)

-countTable <- read.csv(paste0(opt$repRID,".countData.countTable.csv"), stringsAsFactors=FALSE)
+countTable <- read.csv(paste0(opt$repRID,".countTable.csv"), stringsAsFactors=FALSE)
 geneID <- read.delim("geneID.tsv", header=FALSE, stringsAsFactors=FALSE)
 Entrez <- read.delim("Entrez.tsv", header=FALSE, stringsAsFactors=FALSE)

-convert <- data.frame(geneID=countTable$Geneid)
-convert <- merge(x=convert,y=geneID[,1:2],by.x="geneID",by.y="V2",all.x=TRUE)
+convert <- data.frame(gene_name=countTable$gene_name)
+convert <- merge(x=convert,y=geneID[,1:2],by.x="gene_name",by.y="V2",all.x=TRUE)
 convert <- merge(x=convert,y=Entrez,by.x="V1",by.y="V1",all.x=TRUE)
 convert[is.na(convert$V2),3] <- ""
 convert <- convert[,-1]
 colnames(convert) <- c("GeneID","EntrezID")
 convert <- unique(convert)

-output <- merge(x=convert,y=countTable[,c("Geneid","count","tpm")],by.x="GeneID",by.y="Geneid",all.x=TRUE)
+output <- merge(x=convert,y=countTable[,c("gene_name","gene_id","count","tpm")],by.x="GeneID",by.y="gene_name",all.x=TRUE)
+colnames(output) <- c("GENCODE_Gene_Symbol","NCBI_GeneID","Ensembl_GeneID","count","tpm")
+output <- output[,c(1,3,2,4:5)]

 write.table(output,file=paste0(opt$repRID,".tpmTable.csv"),sep=",",row.names=FALSE,quote=FALSE)
--- a/workflow/tests/test_makeFeatureCounts.py
+++ b/workflow/tests/test_makeFeatureCounts.py
@@ -11,5 +11,6 @@ data_output_path = os.path.dirname(os.path.abspath(__file__)) + \

 @pytest.mark.makeFeatureCounts
 def test_makeFeatureCounts():
-	assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.featureCounts'))
+	assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.countData'))
 	assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.countTable.csv'))
+	assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.tpmTable.csv'))