From c41b4788274d7b5b389ca70759468e7c71ce2359 Mon Sep 17 00:00:00 2001
From: "Gervaise H. Henry" <gervaise.henry@utsouthwestern.edu>
Date: Mon, 10 Aug 2020 16:09:27 -0500
Subject: [PATCH] Convert gene symbols to EntrezID in count/tpm table

---
 workflow/rna-seq.nf                   | 22 +++++++++++++++++-----
 workflow/scripts/convertGeneSymbols.R | 24 ++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 5 deletions(-)
 create mode 100644 workflow/scripts/convertGeneSymbols.R

diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index efeb0f1..772d076 100644
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -53,6 +53,7 @@ script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh")
 script_parseMeta = Channel.fromPath("${baseDir}/scripts/parseMeta.py")
 script_inferMeta = Channel.fromPath("${baseDir}/scripts/inferMeta.sh")
 script_calculateTPM = Channel.fromPath("${baseDir}/scripts/calculateTPM.R")
+script_convertGeneSymbols = Channel.fromPath("${baseDir}/scripts/convertGeneSymbols.R")
 script_tinHist = Channel.fromPath("${baseDir}/scripts/tinHist.py")
 
 /*
@@ -323,7 +324,6 @@ inferMetadata_readLength.splitCsv(sep: ",", header: false).separate(
   readLengthInfer
 )
 
-
 // Replicate trimmed fastq's
 fastqsTrim.into {
   fastqsTrim_alignData
@@ -629,7 +629,7 @@ process getRef {
     val species from speciesInfer_getRef
 
   output:
-    tuple path ("hisat2", type: 'dir'), path ("bed", type: 'dir'), path ("*.fna"), path ("*.gtf")  into reference
+    tuple path ("hisat2", type: 'dir'), path ("bed", type: 'dir'), path ("*.fna"), path ("*.gtf"), path ("geneID.tsv"), path ("Entrez.tsv")  into reference
  
   script:
     """
@@ -665,12 +665,16 @@ process getRef {
       aws s3 cp "\${references}"/bed ./bed --recursive
       aws s3 cp "\${references}"/genome.fna ./
       aws s3 cp "\${references}"/genome.gtf ./
+      aws s3 cp "\${references}"/geneID.tsv ./
+      aws s3 cp "\${references}"/Entrez.tsv ./
     elif [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references" ]
     then
       ln -s "\${references}"/hisat2
       ln -s "\${references}"/bed
       ln -s "\${references}"/genome.fna
       ln -s "\${references}"/genome.gtf
+      ln -s "\${references}"/geneID.tsv
+      ln -s "\${references}"/Entrez.tsv
     fi
     echo -e "LOG: fetched" >> ${repRID}.getRef.log
     """
@@ -834,13 +838,14 @@ process countData {
 
   input:
     path script_calculateTPM
+    path script_convertGeneSymbols
     tuple path (bam), path (bai) from dedupBam_countData
     path ref from reference_countData
     val ends from endsInfer_countData
     val stranded from strandedInfer_countData
 
   output:
-    path ("*.countTable.csv") into counts
+    path ("*.tpmTable.csv") into counts
     path ("*.countData.summary") into countsQC
     path ("assignedReads.csv") into inferMetadata_assignedReads
 
@@ -882,6 +887,10 @@ process countData {
     # calculate TPM from the resulting countData table
     echo -e "LOG: calculating TPM with R" >> ${repRID}.countData.log
     Rscript calculateTPM.R --count "${repRID}.countData"
+
+    # convert gene symbols to Entrez id's
+    echo -e "LOG: convert gene symbols to Entrez id's" >> ${repRID}.countData.log
+    Rscript convertGeneSymbols.R --repRID "${repRID}"
     """
 }
 
@@ -983,7 +992,8 @@ inferMetadata_tinMed.splitCsv(sep: ",", header: false).separate(
 */
 process aggrQC {
   tag "${repRID}"
-  publishDir "${outDir}/qc", mode: 'copy', pattern: "${repRID}.multiqc.html"
+  publishDir "${outDir}/report", mode: 'copy', pattern: "${repRID}.multiqc.html"
+  publishDir "${outDir}/qc", mode: 'copy', pattern: "${repRID}.multiqc_data.json"
 
   input:
     path multiqcConfig
@@ -1016,6 +1026,7 @@ process aggrQC {
 
   output:
     path "${repRID}.multiqc.html" into multiqc
+    path "${repRID}.multiqc_data.json" into multiqcJSON
 
   script:
     """
@@ -1044,5 +1055,6 @@ process aggrQC {
     # run MultiQC
     echo -e "LOG: running multiqc" >> ${repRID}.aggrQC.log
     multiqc -c ${multiqcConfig} . -n ${repRID}.multiqc.html
+    cp ${repRID}.multiqc_data/multiqc_data.json ${repRID}.multiqc_data.json
     """
-}
+}
\ No newline at end of file
diff --git a/workflow/scripts/convertGeneSymbols.R b/workflow/scripts/convertGeneSymbols.R
new file mode 100644
index 0000000..7b05b92
--- /dev/null
+++ b/workflow/scripts/convertGeneSymbols.R
@@ -0,0 +1,24 @@
+gc()
+library(optparse)
+
+option_list=list(
+  make_option("--repRID",action="store",type='character',help="Replicate RID")
+)
+opt=parse_args(OptionParser(option_list=option_list))
+rm(option_list)
+
+countTable <- read.csv(paste0(opt$repRID,".countData.countTable.csv"), stringsAsFactors=FALSE)
+geneID <- read.delim("geneID.tsv", header=FALSE, stringsAsFactors=FALSE)
+Entrez <- read.delim("Entrez.tsv", header=FALSE, stringsAsFactors=FALSE)
+
+convert <- data.frame(geneID=countTable$Geneid)
+convert <- merge(x=convert,y=geneID[,1:2],by.x="geneID",by.y="V2",all.x=TRUE)
+convert <- merge(x=convert,y=Entrez,by.x="V1",by.y="V1",all.x=TRUE)
+convert[is.na(convert$V2),3] <- ""
+convert <- convert[,-1]
+colnames(convert) <- c("GeneID","EntrezID")
+convert <- unique(convert)
+
+output <- merge(x=convert,y=countTable,by.x="GeneID",by.y="Geneid",all.x=TRUE)
+
+write.table(output,file=paste0(opt$repRID,".tpmTable.csv"),sep=",",row.names=FALSE,quote=FALSE)
\ No newline at end of file
-- 
GitLab