Commit fe73c982 authored by Venkat Malladi

Merge branch '6-processCounts' into 'develop'

Resolve "process_count"

See merge request !22
parents ad35f8df 5acdbca8
2 merge requests: !37 v0.0.1, !22 Resolve "process_count"
Pipeline #6319 passed with stages in 3 hours, 58 minutes, and 49 seconds
@@ -77,6 +77,13 @@ makeBigWig:
- singularity run 'docker://bicf/deeptools3.3:2.0.0' bamCoverage -p `nproc` -b ./test_data/bam/small/Q-Y5JA_1M.se.sorted.deduped.bam -o Q-Y5JA_1M.se.bw
- pytest -m makeBigWig
makeFeatureCounts:
stage: unit
script:
- singularity run 'docker://bicf/subread2:2.0.0' featureCounts -R SAM -p -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -T `nproc` -s 1 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -o Q-Y5JA_1M.se.featureCounts -g 'gene_name' --primary --ignoreDup -B ./test_data/bam/small/Q-Y5JA_1M.se.sorted.deduped.bam
- singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count Q-Y5JA_1M.se.featureCounts
- pytest -m makeFeatureCounts
fastqc:
stage: unit
script:
@@ -92,11 +99,13 @@ inferMetadata:
integration_se:
stage: integration
script:
- ulimit -u 16384
- nextflow run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 16-1ZX4
integration_pe:
stage: integration
script:
- ulimit -u 16384
- nextflow run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5JA -with-dag dag.png
artifacts:
name: "$CI_JOB_NAME"
......
dag.png 0 → 100644 (157 KiB)

@@ -41,6 +41,9 @@ process {
withName:inferMetadata{
container = 'bicf/rseqc3.0:2.0.0'
}
withName: makeFeatureCounts {
container = 'bicf/subread2:2.0.0'
}
}
trace {
......
@@ -38,6 +38,7 @@ referenceBase = "/project/BICF/BICF_Core/shared/gudmap/references"
// Define script files
script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh")
script_parseMeta = Channel.fromPath("${baseDir}/scripts/parseMeta.py")
script_calculateTPM = Channel.fromPath("${baseDir}/scripts/calculateTPM.R")
script_inferMeta = Channel.fromPath("${baseDir}/scripts/inferMeta.sh")
/*
@@ -45,7 +46,7 @@ script_inferMeta = Channel.fromPath("${baseDir}/scripts/inferMeta.sh")
*/
process getBag {
tag "${repRID}"
publishDir "${logsDir}", mode: "copy", pattern: "*.getBag.err"
publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.getBag.{out,err}"
input:
path credential, stageAs: "credential.json" from deriva
@@ -57,16 +58,17 @@ process getBag {
script:
"""
hostname >>${repRID}.getBag.err
ulimit -a >>${repRID}.getBag.err
hostname > ${repRID}.getBag.err
ulimit -a >> ${repRID}.getBag.err
export https_proxy=\${http_proxy}
# link credential file for authentication
ln -sf `readlink -e credential.json` ~/.deriva/credential.json 2>>${repRID}.getBag.err
echo "LOG: deriva credentials linked" >>${repRID}.getBag.err
ln -sf `readlink -e credential.json` ~/.deriva/credential.json 1>> ${repRID}.getBag.out 2>> ${repRID}.getBag.err
echo "LOG: deriva credentials linked" >> ${repRID}.getBag.err
# deriva-download replicate RID
deriva-download-cli dev.gudmap.org --catalog 2 ${derivaConfig} . rid=${repRID} 2>>${repRID}.getBag.err
echo "LOG: fetching deriva catalog for selected RID in GUDMAP." >> ${repRID}.getBag.err
deriva-download-cli dev.gudmap.org --catalog 2 ${derivaConfig} . rid=${repRID} 1>> ${repRID}.getBag.out 2>> ${repRID}.getBag.err
"""
}
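Note the logging convention introduced throughout this merge: the first write to a process log truncates with ">", subsequent writes append with ">>", stdout and stderr are split into .out/.err files, and the widened publishDir glob "${repRID}.getBag.{out,err}" publishes both. A minimal standalone sketch of the pattern, with a hypothetical log prefix:

#!/bin/bash
# sketch of the .out/.err logging convention; "example.getBag" stands in for ${repRID}.getBag
log="example.getBag"
hostname > "${log}.err"                      # first write truncates any stale log
ulimit -a >> "${log}.err"                    # later writes append
echo "LOG: step starting" >> "${log}.err"
ls -l / 1>> "${log}.out" 2>> "${log}.err"    # stdout and stderr kept separate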
@@ -75,7 +77,7 @@ process getBag {
*/
process getData {
tag "${repRID}"
publishDir "${logsDir}", mode: "copy", pattern: "*.getData.err"
publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.getData.{out,err}"
input:
path script_bdbagFetch
@@ -87,29 +89,29 @@ process getData {
path ("**/File.csv") into fileMeta
path ("**/Experiment Settings.csv") into experimentSettingsMeta
path ("**/Experiment.csv") into experimentMeta
path ("${repRID}.getData.err")
path ("${repRID}.getData.{out,err}")
script:
"""
hostname >>${repRID}.getData.err
ulimit -a >>${repRID}.getData.err
hostname > ${repRID}.getData.err
ulimit -a >> ${repRID}.getData.err
export https_proxy=\${http_proxy}
# link deriva cookie for authentication
ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt >>${repRID}.getData.err
echo "LOG: deriva cookie linked" >>${repRID}.getData.err
ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt 1>> ${repRID}.getData.out 2>> ${repRID}.getData.err
echo "LOG: deriva cookie linked" >> ${repRID}.getData.err
# get bagit basename
replicate=\$(basename "${bagit}" | cut -d "." -f1)
echo "LOG: \${replicate}" >>${repRID}.getData.err
replicate=\$(basename "${bagit}" | cut -d "." -f1) 1>> ${repRID}.getData.out 2>> ${repRID}.getData.err
echo "LOG: \${replicate}" >> ${repRID}.getData.err
# unzip bagit
unzip ${bagit} 2>>${repRID}.getData.err
echo "LOG: replicate bdbag unzipped" >>${repRID}.getData.err
unzip ${bagit} 1>> ${repRID}.getData.out 2>> ${repRID}.getData.err
echo "LOG: replicate bdbag unzipped" >> ${repRID}.getData.err
# bagit fetch fastqs only and rename by repRID
sh ${script_bdbagFetch} \${replicate} ${repRID} 2>>${repRID}.getData.err
echo "LOG: replicate bdbag fetched" >>${repRID}.getData.err
sh ${script_bdbagFetch} \${replicate} ${repRID} 1>> ${repRID}.getData.out 2>> ${repRID}.getData.err
echo "LOG: replicate bdbag fetched" >> ${repRID}.getData.err
"""
}
@@ -124,7 +126,7 @@ fastqs.into {
*/
process parseMetadata {
tag "${repRID}"
publishDir "${logsDir}", mode: "copy", pattern: "*.parseMetadata.err"
publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.parseMetadata.{out,err}"
input:
path script_parseMeta
@@ -135,35 +137,36 @@ process parseMetadata {
output:
path "design.csv" into metadata
path "${repRID}.parseMetadata.{out,err}"
script:
"""
hostname >>${repRID}.parseMetadata.err
ulimit -a >>${repRID}.parseMetadata.err
hostname > ${repRID}.parseMetadata.err
ulimit -a >> ${repRID}.parseMetadata.err
# Check replicate RID metadata
rep=\$(python3 ${script_parseMeta} -r ${repRID} -m "${fileMeta}" -p repRID)
echo "LOG: replicate RID metadata parsed: \${rep}" >>${repRID}.parseMetadata.err
echo "LOG: replicate RID metadata parsed: \${rep}" >> ${repRID}.parseMetadata.err
# Get endedness metadata
endsMeta=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p endsMeta)
echo "LOG: endedness metadata parsed: \${endsMeta}" >>${repRID}.parseMetadata.err
echo "LOG: endedness metadata parsed: \${endsMeta}" >> ${repRID}.parseMetadata.err
# Manually get endedness
endsManual=\$(python3 ${script_parseMeta} -r ${repRID} -m "${fileMeta}" -p endsManual)
echo "LOG: endedness manually detected: \${endsManual}" >>${repRID}.parseMetadata.err
echo "LOG: endedness manually detected: \${endsManual}" >> ${repRID}.parseMetadata.err
# Get strandedness metadata
stranded=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p stranded -e \${endsManual})
echo "LOG: strandedness metadata parsed: \${stranded}" >>${repRID}.parseMetadata.err
echo "LOG: strandedness metadata parsed: \${stranded}" >> ${repRID}.parseMetadata.err
# Get spike-in metadata
spike=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p spike)
echo "LOG: spike-in metadata parsed: \${spike}" >>${repRID}.parseMetadata.err
echo "LOG: spike-in metadata parsed: \${spike}" >> ${repRID}.parseMetadata.err
# Get species metadata
species=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentMeta}" -p species)
echo "LOG: species metadata parsed: \${species}" >>${repRID}.parseMetadata.err
echo "LOG: species metadata parsed: \${species}" >> ${repRID}.parseMetadata.err
# Save design file
echo "\${rep},\${endsMeta},\${endsManual},\${stranded},\${spike},\${species}" > design.csv
@@ -189,9 +192,11 @@ metadata.splitCsv(sep: ",", header: false).separate(
endsManual.into {
endsManual_trimData
endsManual_alignData
endsManual_featureCounts
}
stranded.into {
stranded_alignData
stranded_featureCounts
}
spike.into{
spike_getRef
@@ -205,19 +210,20 @@ species.into {
*/
process getRef {
tag "${species_getRef}"
publishDir "${logsDir}", mode: "copy", pattern: "*.getRef.err"
publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.getRef.{out,err}"
input:
val spike_getRef
val species_getRef
output:
path ("*") into reference
tuple path ("hisat2", type: 'dir'), path ("bed", type: 'dir'), path ("*.fna"), path ("*.gtf") into reference
path ("${repRID}.getRef.{out,err}")
script:
"""
hostname >>${repRID}.getRef.err
ulimit -a >>${repRID}.getRef.err
hostname > ${repRID}.getRef.err
ulimit -a >> ${repRID}.getRef.err
export https_proxy=\${http_proxy}
# set the reference name
@@ -228,6 +234,7 @@ process getRef {
then
references=\$(echo ${referenceBase}/GRCh${refHuVersion})
else
echo -e "LOG: ERROR - References could not be set!\nSpecies reference found: ${species_getRef}" >> ${repRID}.getRef.err
exit 1
fi
if [ "${spike_getRef}" == "yes" ]
@@ -237,26 +244,40 @@
then
reference=\$(echo \${references}/)
fi
echo "LOG: species set to \${references}" >> ${repRID}.getRef.err
# retrieve reference files from the appropriate location
if [ ${referenceBase} == "s3://bicf-references" ]
then
aws s3 cp "\${references}"/hisat2 ./ --recursive >>${repRID}.getRef.err
aws s3 cp "\${references}"/bed ./ --recursive >>${repRID}.getRef.err
echo "LOG: grabbing reference files from S3" >> ${repRID}.getRef.err
aws s3 cp "\${references}"/hisat2 ./ --recursive 1>> ${repRID}.getRef.out 2>> ${repRID}.getRef.err
aws s3 cp "\${references}"/bed ./ --recursive 1>> ${repRID}.getRef.out 2>> ${repRID}.getRef.err
aws s3 cp "\${references}"/*.fna ./ --recursive 1>> ${repRID}.getRef.out 2>> ${repRID}.getRef.err
aws s3 cp "\${references}"/*.gtf ./ --recursive 1>> ${repRID}.getRef.out 2>> ${repRID}.getRef.err
elif [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references" ]
then
cp -R "\${references}"/hisat2 ./ >>${repRID}.getRef.err
cp -R "\${references}"/bed ./ >>${repRID}.getRef.err
echo "LOG: using pre-defined locations for reference files" >> ${repRID}.getRef.err
ln -s "\${references}"/hisat2 1>> ${repRID}.getRef.out 2>> ${repRID}.getRef.err
ln -s "\${references}"/bed 1>> ${repRID}.getRef.out 2>> ${repRID}.getRef.err
ln -s "\${references}"/genome.fna 1>> ${repRID}.getRef.out 2>> ${repRID}.getRef.err
ln -s "\${references}"/genome.gtf 1>> ${repRID}.getRef.out 2>> ${repRID}.getRef.err
fi
"""
}
// Replicate reference for multiple process inputs
reference.into {
reference_alignData
reference_makeFeatureCounts
reference_inferMeta
}
/*
* trimData: trims any adapter or non-host sequences from the data
*/
process trimData {
tag "${repRID}"
publishDir "${logsDir}", mode: "copy", pattern: "*.trimData.*"
publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.trimData.{out,err}"
input:
val endsManual_trimData
@@ -265,37 +286,32 @@ process trimData {
output:
path ("*.fq.gz") into fastqs_trimmed
path ("*_trimming_report.txt") into trimQC
path ("${repRID}.trimData.log")
path ("${repRID}.trimData.err")
path ("${repRID}.trimData.{out,err}")
script:
"""
hostname >>${repRID}.trimData.err
ulimit -a >>${repRID}.trimData.err
hostname > ${repRID}.trimData.err
ulimit -a >> ${repRID}.trimData.err
# trim fastqs
#Trim fastqs using trim_galore
if [ "${endsManual_trimData}" == "se" ]
then
trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID} -j `nproc` ${fastq[0]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err
echo "LOG: running trim_galore using single-end settings" >> ${repRID}.trimData.err
trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID} -j `nproc` ${fastq[0]} 1>> ${repRID}.trimData.out 2>> ${repRID}.trimData.err
elif [ "${endsManual_trimData}" == "pe" ]
then
trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err
echo "LOG: running trim_galore using paired-end settings" >> ${repRID}.trimData.err
trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]} 1>> ${repRID}.trimData.out 2>> ${repRID}.trimData.err
fi
"""
}
// Replicate reference for multiple process inputs
reference.into {
reference_alignData
reference_inferMeta
}
/*
* alignData: aligns the reads to a reference database
*/
process alignData {
tag "${repRID}"
publishDir "${logsDir}", mode: "copy", pattern: "*.align.{out,err}"
publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.align.{out,err}"
input:
val endsManual_alignData
@@ -306,27 +322,35 @@ process alignData {
output:
tuple val ("${repRID}"), path ("${repRID}.sorted.bam"), path ("${repRID}.sorted.bam.bai") into rawBam
path ("*.alignSummary.txt") into alignQC
path ("${repRID}.align.out")
path ("${repRID}.align.err")
path ("${repRID}.align.{out,err}")
script:
"""
hostname >${repRID}.align.err
ulimit -a >>${repRID}.align.err
hostname > ${repRID}.align.err
ulimit -a >> ${repRID}.align.err
# align reads
#Align the reads with Hisat 2
if [ "${endsManual_alignData}" == "se" ]
then
hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} -U ${fastq[0]} --summary-file ${repRID}.alignSummary.txt --new-summary 1>${repRID}.align.out 2>${repRID}.align.err
echo "LOG: running Hisat2 with single-end settings" >> ${repRID}.align.err
hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} -U ${fastq[0]} --summary-file ${repRID}.alignSummary.txt --new-summary 1>> ${repRID}.align.out 2>> ${repRID}.align.err
elif [ "${endsManual_alignData}" == "pe" ]
then
hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} --summary-file ${repRID}.alignSummary.txt --new-summary 1>${repRID}.align.out 2>${repRID}.align.err
echo "LOG: running Hisat2 with paired-end settings" >> ${repRID}.align.err
hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} --summary-file ${repRID}.alignSummary.txt --new-summary 1>> ${repRID}.align.out 2>> ${repRID}.align.err
fi
# convert sam to bam and sort and index
samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam 1>>${repRID}.align.out 2>>${repRID}.align.err;
samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.bam ${repRID}.bam 1>>${repRID}.align.out 2>>${repRID}.align.err;
samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bam.bai 1>>${repRID}.align.out 2>>${repRID}.align.err;
#Convert the output sam file to a sorted bam file using Samtools
echo "LOG: converting from sam to bam" >> ${repRID}.align.err
samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam 1>> ${repRID}.align.out 2>> ${repRID}.align.err;
#Sort the bam file using Samtools
echo "LOG: sorting the bam file" >> ${repRID}.align.err
samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.bam ${repRID}.bam 1>> ${repRID}.align.out 2>> ${repRID}.align.err;
#Index the sorted bam using Samtools
echo "LOG: indexing sorted bam file" >> ${repRID}.align.err
samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bam.bai 1>> ${repRID}.align.out 2>> ${repRID}.align.err;
"""
}
@@ -341,31 +365,38 @@ rawBam.into {
process dedupData {
tag "${repRID}"
publishDir "${outDir}/bam", mode: 'copy', pattern: "*.deduped.bam"
publishDir "${logsDir}", mode: 'copy', pattern: "*.dedup.{out,err}"
publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.dedup.{out,err}"
input:
set val (repRID), path (inBam), path (inBai) from rawBam_dedupData
output:
tuple val ("${repRID}"), path ("${repRID}.sorted.deduped.bam"), path ("${repRID}.sorted.deduped.bam.bai") into dedupBam
tuple path ("${repRID}.sorted.deduped.bam"), path ("${repRID}.sorted.deduped.bam.bai") into dedupBam
path ("*.deduped.Metrics.txt") into dedupQC
path ("${repRID}.dedup.out")
path ("${repRID}.dedup.err")
path ("${repRID}.dedup.{out,err}")
script:
"""
hostname >${repRID}.dedup.err
ulimit -a >>${repRID}.dedup.err
hostname > ${repRID}.dedup.err
ulimit -a >> ${repRID}.dedup.err
# remove duplicated reads using Picard's MarkDuplicates
echo "LOG: running picard MarkDuplicates to remove duplicate reads" >> ${repRID}.dedup.err
java -jar /picard/build/libs/picard.jar MarkDuplicates I=${inBam} O=${repRID}.deduped.bam M=${repRID}.deduped.Metrics.txt REMOVE_DUPLICATES=true 1>> ${repRID}.dedup.out 2>> ${repRID}.dedup.err
# remove duplicated reads
java -jar /picard/build/libs/picard.jar MarkDuplicates I=${inBam} O=${repRID}.deduped.bam M=${repRID}.deduped.Metrics.txt REMOVE_DUPLICATES=true 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err
samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.deduped.bam ${repRID}.deduped.bam 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err
samtools index -@ `nproc` -b ${repRID}.sorted.deduped.bam ${repRID}.sorted.deduped.bam.bai 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err
#Use SamTools to sort the now deduped bam file
echo "LOG: sorting the deduped bam file" >> ${repRID}.dedup.err
samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.deduped.bam ${repRID}.deduped.bam 1>> ${repRID}.dedup.out 2>> ${repRID}.dedup.err
#Use SamTools to index the now sorted deduped bam file
echo "LOG: indexing the sorted deduped bam file" >> ${repRID}.dedup.err
samtools index -@ `nproc` -b ${repRID}.sorted.deduped.bam ${repRID}.sorted.deduped.bam.bai 1>> ${repRID}.dedup.out 2>> ${repRID}.dedup.err
"""
}
// Replicate dedup bam/bai for multiple process inputs
dedupBam.into {
dedupBam_makeFeatureCounts
dedupBam_makeBigWig
dedupBam_inferMeta
}
@@ -375,17 +406,74 @@ dedupBam.into {
*/
process makeBigWig {
tag "${repRID}"
publishDir "${logsDir}", mode: 'copy', pattern: "*.makeBigWig.err"
publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.makeBigWig.{out,err}"
publishDir "${outDir}/bigwig", mode: 'copy', pattern: "${repRID}.bw"
input:
set val (repRID), path (inBam), path (inBai) from dedupBam_makeBigWig
set path (inBam), path (inBai) from dedupBam_makeBigWig
output:
path ("${repRID}.bw")
path ("${repRID}.makeBigWig.{out,err}")
script:
"""
hostname > ${repRID}.makeBigWig.err
ulimit -a >> ${repRID}.makeBigWig.err
#Run bamCoverage
echo "LOG: Running bigWig bamCoverage" >> ${repRID}.makeBigWig.err
bamCoverage -p `nproc` -b ${inBam} -o ${repRID}.bw 1>> ${repRID}.makeBigWig.out 2>> ${repRID}.makeBigWig.err
"""
}
/*
*makeFeatureCounts: run featureCounts to generate counts and calculate TPM
*/
process makeFeatureCounts {
tag "${repRID}"
publishDir "${outDir}/featureCounts", mode: 'copy', pattern: "${repRID}*.countTable.csv"
publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.makeFetureCounts.{out,err}"
input:
path script_calculateTPM
tuple path (bam), path (bai) from dedupBam_makeFeatureCounts
path reference_makeFeatureCounts
val endsManual_featureCounts
output:
path ("*.countTable.csv") into counts
path ("*.featureCounts.summary") into countsQC
path ("${repRID}.makeFeatureCounts.{out,err}")
script:
"""
hostname > ${repRID}.makeFeatureCounts.err
ulimit -a >> ${repRID}.makeFeatureCounts.err
#Determine strandedness and set up stranding for featureCounts
stranding=0;
if [ "${stranded_featureCounts}" == "--rna-strandness F" ] || [ "${stranded_featureCounts}" == "--rna-strandness FR" ]
then
stranding=1
echo "LOG: strandedness set to stranded [1]" >> ${repRID}.makeFeatureCounts.err
else
stranding=0
echo "LOG: strandedness set to unstranded [0]" >> ${repRID}.makeFeatureCounts.err
fi;
#Run featureCounts
echo "LOG: running featureCounts on the data" >> ${repRID}.makeFeatureCounts.err
if [ "${endsManual_featureCounts }" == "se" ]
then
featureCounts -R SAM -p -G ./genome.fna -T `nproc` -s \${stranding} -a ./genome.gtf -o ${repRID}.featureCounts -g 'gene_name' --primary --ignoreDup ${repRID}.sorted.deduped.bam 1>> ${repRID}.makeFeatureCounts.out 2>> ${repRID}.makeFeatureCounts.err
elif [ "${endsManual_featureCounts }" == "pe" ]
then
featureCounts -R SAM -p -G ./genome.fna -T `nproc` -s \${stranding} -a ./genome.gtf -o ${repRID}.featureCounts -g 'gene_name' --primary --ignoreDup -B ${repRID}.sorted.deduped.bam 1>> ${repRID}.makeFeatureCounts.out 2>> ${repRID}.makeFeatureCounts.err
fi
#Calculate TPM from the resulting featureCounts table
echo "LOG: calculating TPM with R" >> ${repRID}.makeFeatureCounts.err
Rscript calculateTPM.R --count "${repRID}.featureCounts" 1>> ${repRID}.makeFeatureCounts.out 2>> ${repRID}.makeFeatureCounts.err
"""
}
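The if/else above maps the hisat2-style flag carried in stranded_featureCounts onto featureCounts' numeric -s code (0 = unstranded, 1 = forward-stranded). A standalone sketch of the same mapping, with a hypothetical helper name:

#!/bin/bash
# hypothetical helper mirroring the stranding translation in makeFeatureCounts
to_featurecounts_s() {
  case "$1" in
    "--rna-strandness F"|"--rna-strandness FR") echo 1 ;;  # forward-stranded
    *) echo 0 ;;                                           # unstranded
  esac
}
to_featurecounts_s "--rna-strandness FR"  # prints 1
to_featurecounts_s ""                     # prints 0

Note that a reverse-stranded library (hisat2 R/RF, featureCounts -s 2) is not distinguished by this branch and falls back to unstranded.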
@@ -395,61 +483,66 @@ process makeBigWig {
process fastqc {
tag "${repRID}"
publishDir "${outDir}/fastqc", mode: 'copy', pattern: "*_fastqc.zip"
publishDir "${logsDir}", mode: 'copy', pattern: "*.fastq.err"
publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.fastqc.{out,err}"
input:
path (fastq) from fastqs_fastqc
output:
path ("*_fastqc.zip") into fastqc
path ("${repRID}.fastqc.{out,err}")
script:
"""
hostname >${repRID}.fastqc.err
ulimit -a >>${repRID}.fastqc.err
hostname > ${repRID}.fastqc.err
ulimit -a >> ${repRID}.fastqc.err
# run fastqc
fastqc *.fastq.gz -o . >>${repRID}.fastqc.err
echo "LOG: beginning FastQC analysis of the data" >> ${repRID}.fastqc.err
fastqc *.fastq.gz -o . 1>> ${repRID}.fastqc.out 2>> ${repRID}.fastqc.err
"""
}
/*
*rseqc: run RSeQC to collect stats and infer experimental metadata
*inferMetadata: run RSeQC to collect stats and infer experimental metadata
*/
process inferMetadata {
tag "${repRID}"
publishDir "${logsDir}", mode: 'copy', pattern: "*.rseqc.err"
publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.rseqc.{out,err}"
input:
path script_inferMeta
path reference_inferMeta
set val (repRID), path (inBam), path (inBai) from dedupBam_inferMeta
set path (inBam), path (inBai) from dedupBam_inferMeta
output:
path "infer.csv" into inferedMetadata
path "${inBam.baseName}.tin.xls" into tin
path "${repRID}.insertSize.inner_distance_freq.txt" optional true into innerDistance
path "${repRID}.rseqc.{out,err}" optional true
script:
"""
hostname >${repRID}.rseqc.err
ulimit -a >>${repRID}.rseqc.err
hostname > ${repRID}.rseqc.err
ulimit -a >> ${repRID}.rseqc.err
# infer experimental setting from dedup bam
infer_experiment.py -r ./bed/genome.bed -i "${inBam}" >${repRID}.rseqc.log
echo "LOG: running inference from bed file" >> ${repRID}.rseqc.err
infer_experiment.py -r ./bed/genome.bed -i "${inBam}" > ${repRID}.rseqc.log 2>> ${repRID}.rseqc.err
endness=`bash inferMeta.sh endness ${repRID}.rseqc.log`
fail=`bash inferMeta.sh fail ${repRID}.rseqc.log`
echo "LOG: determining endedness and strandedness from file" >> ${repRID}.rseqc.err
endness=`bash inferMeta.sh endness ${repRID}.rseqc.log` 1>> ${repRID}.rseqc.out 2>> ${repRID}.rseqc.err
fail=`bash inferMeta.sh fail ${repRID}.rseqc.log` 1>> ${repRID}.rseqc.out 2>> ${repRID}.rseqc.err
if [ \${endness} == "PairEnd" ]
then
percentF=`bash inferMeta.sh pef ${repRID}.rseqc.log`
percentR=`bash inferMeta.sh per ${repRID}.rseqc.log`
inner_distance.py -i "${inBam}" -o ${repRID}.insertSize -r ./bed/genome.bed
percentF=`bash inferMeta.sh pef ${repRID}.rseqc.log` 1>> ${repRID}.rseqc.out 2>> ${repRID}.rseqc.err
percentR=`bash inferMeta.sh per ${repRID}.rseqc.log` 1>> ${repRID}.rseqc.out 2>> ${repRID}.rseqc.err
inner_distance.py -i "${inBam}" -o ${repRID}.insertSize -r ./bed/genome.bed 1>> ${repRID}.rseqc.out 2>> ${repRID}.rseqc.err
elif [ \${endness} == "SingleEnd" ]
then
percentF=`bash inferMeta.sh sef ${repRID}.rseqc.log`
percentR=`bash inferMeta.sh ser ${repRID}.rseqc.log`
percentF=`bash inferMeta.sh sef ${repRID}.rseqc.log` 1>> ${repRID}.rseqc.out 2>> ${repRID}.rseqc.err
percentR=`bash inferMeta.sh ser ${repRID}.rseqc.log` 1>> ${repRID}.rseqc.out 2>> ${repRID}.rseqc.err
fi
if [ \$percentF -gt 0.25 ] && [ \$percentR -lt 0.25 ]
then
@@ -473,11 +566,12 @@ process inferMetadata
stranded="unstranded"
strategy="us"
fi
echo -e "LOG: strategy set to \${strategy}\nStranded set to ${stranded}" >> ${repRID}.rseqc.err
# calculate TIN values per feature
tin.py -i "${inBam}" -r ./bed/genome.bed
tin.py -i "${inBam}" -r ./bed/genome.bed 1>> ${repRID}.rseqc.out 2>> ${repRID}.rseqc.err
# write inferred metadata to file
echo \${endness},\${stranded},\${strategy},\${percentF},\${percentR},\${fail} > infer.csv
echo \${endness},\${stranded},\${strategy},\${percentF},\${percentR},\${fail} > infer.csv 2>> ${repRID}.rseqc.err
"""
}
workflow/scripts/calculateTPM.R 0 → 100644
gc()
library(optparse)
# parse the --count argument pointing at the featureCounts output table
option_list=list(
make_option("--count",action="store",type='character',help="Count File")
)
opt=parse_args(OptionParser(option_list=option_list))
rm(option_list)
# validate that a count file was passed and that it exists
if (!("count" %in% names(opt))){
stop("No count file passed, exiting.")
} else if (!file.exists(opt$count)) {
stop("Count file doesn't exist, exiting.")
}
# derive the replicate RID from the featureCounts file name
repRID <- basename(gsub(".featureCounts","",opt$count))
count <- read.delim(opt$count, comment.char="#") # if featureCounts file changes structure, be sure to update count and Length columns below
colnames(count)[7] <- "count"
# length-normalize counts, then rescale so the TPM values sum to one million;
# any constant factor in rpk cancels in the normalization
rpk <- count$count/count$Length/1000
scale <- sum(rpk)/1000000
tpm <- rpk/scale
output <- cbind(count,tpm)
colnames(output)[7] <- "count"
write.table(output,file=paste0(repRID,".countTable.csv"),sep=",",row.names=FALSE,quote=FALSE)
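As a sanity check on the arithmetic in calculateTPM.R: TPM length-normalizes each gene's count and rescales so the values sum to one million, so any constant factor in rpk (including the extra /1000 above) cancels. A toy check with made-up counts:

# toy TPM check: gene A = 10 reads / 1 kb, gene B = 90 reads / 3 kb (hypothetical numbers)
awk 'BEGIN {
  c["A"] = 10; len["A"] = 1000
  c["B"] = 90; len["B"] = 3000
  for (g in c) { rpk[g] = c[g] / len[g]; total += rpk[g] }  # length-normalized rate
  for (g in c) printf "%s TPM = %.0f\n", g, rpk[g] / total * 1e6
}'
# prints A TPM = 250000 and B TPM = 750000; scaling every rpk by the same
# constant leaves the final TPM values unchanged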
#!/usr/bin/env python3

import pytest
import pandas as pd
import os
import utils

# pipeline outputs land two directories above this test file
data_output_path = os.path.dirname(os.path.abspath(__file__)) + \
    '/../../'

@pytest.mark.makeFeatureCounts
def test_makeFeatureCounts():
    # the featureCounts table and the derived TPM count table should both exist
    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.featureCounts'))
    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.countTable.csv'))
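Because the test is gated behind a pytest marker, it can be run in isolation, which is exactly what the makeFeatureCounts CI job above does:

# run only the tests marked makeFeatureCounts
pytest -m makeFeatureCounts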