From d5c23e26f2a338fb87ad7576cc2df7b09b9e3c96 Mon Sep 17 00:00:00 2001
From: "Gervaise H. Henry" <gervaise.henry@utsouthwestern.edu>
Date: Mon, 24 Aug 2020 14:17:26 -0500
Subject: [PATCH] Update tpm table #73

---
 workflow/rna-seq.nf                   |  4 ++--
 workflow/scripts/calculateTPM.R       |  7 +++++--
 workflow/scripts/convertGeneSymbols.R | 10 ++++++----
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index a311b9e..d3459fd 100644
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -875,10 +875,10 @@ process countData {
     echo -e "LOG: counting ${ends} features" >> ${repRID}.countData.log
     if [ "${ends}" == "se" ]
     then
-      featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' -o ${repRID}.countData -s \${stranding} -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam
+      featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}.countData -s \${stranding} -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam
     elif [ "${ends}" == "pe" ]
     then
-      featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' -o ${repRID}.countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam
+      featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}.countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam
     fi
     echo -e "LOG: counted" >> ${repRID}.countData.log
     
diff --git a/workflow/scripts/calculateTPM.R b/workflow/scripts/calculateTPM.R
index d4851d4..a26bf94 100644
--- a/workflow/scripts/calculateTPM.R
+++ b/workflow/scripts/calculateTPM.R
@@ -13,10 +13,13 @@ if (!("count" %in% names(opt))){
   stop("Count file doesn't exist, exiting.")
 }
 
-repRID <- basename(gsub(".featureCounts","",opt$count))
+repRID <- basename(gsub(".countData","",opt$count))
 
 count <- read.delim(opt$count, comment.char="#") # if featureCounts file changes structure, be sure to update count and Length columns below
-colnames(count)[7] <- "count"
+colnames(count)[1] <- "gene_name"
+colnames(count)[7] <- "gene_id"
+colnames(count)[8] <- "count"
+count <- count[,c(1,7,2:6,8)]
 
 rpk <- count$count/count$Length/1000
 
diff --git a/workflow/scripts/convertGeneSymbols.R b/workflow/scripts/convertGeneSymbols.R
index 92d3c3e..49752f1 100644
--- a/workflow/scripts/convertGeneSymbols.R
+++ b/workflow/scripts/convertGeneSymbols.R
@@ -7,18 +7,20 @@ option_list=list(
 opt=parse_args(OptionParser(option_list=option_list))
 rm(option_list)
 
-countTable <- read.csv(paste0(opt$repRID,".countData.countTable.csv"), stringsAsFactors=FALSE)
+countTable <- read.csv(paste0(opt$repRID,".countTable.csv"), stringsAsFactors=FALSE)
 geneID <- read.delim("geneID.tsv", header=FALSE, stringsAsFactors=FALSE)
 Entrez <- read.delim("Entrez.tsv", header=FALSE, stringsAsFactors=FALSE)
 
-convert <- data.frame(geneID=countTable$Geneid)
-convert <- merge(x=convert,y=geneID[,1:2],by.x="geneID",by.y="V2",all.x=TRUE)
+convert <- data.frame(gene_name=countTable$gene_name)
+convert <- merge(x=convert,y=geneID[,1:2],by.x="gene_name",by.y="V2",all.x=TRUE)
 convert <- merge(x=convert,y=Entrez,by.x="V1",by.y="V1",all.x=TRUE)
 convert[is.na(convert$V2),3] <- ""
 convert <- convert[,-1]
 colnames(convert) <- c("GeneID","EntrezID")
 convert <- unique(convert)
 
-output <- merge(x=convert,y=countTable[,c("Geneid","count","tpm")],by.x="GeneID",by.y="Geneid",all.x=TRUE)
+output <- merge(x=convert,y=countTable[,c("gene_name","gene_id","count","tpm")],by.x="GeneID",by.y="gene_name",all.x=TRUE)
+colnames(output) <- c("GENCODE_Gene_Symbol","NCBI_GeneID","Ensembl_GeneID","count","tpm")
+output <- output[,c(1,3,2,4:5)]
 
 write.table(output,file=paste0(opt$repRID,".tpmTable.csv"),sep=",",row.names=FALSE,quote=FALSE)
-- 
GitLab