From d5c23e26f2a338fb87ad7576cc2df7b09b9e3c96 Mon Sep 17 00:00:00 2001 From: "Gervaise H. Henry" <gervaise.henry@utsouthwestern.edu> Date: Mon, 24 Aug 2020 14:17:26 -0500 Subject: [PATCH] Update tpm table #73 --- workflow/rna-seq.nf | 4 ++-- workflow/scripts/calculateTPM.R | 7 +++++-- workflow/scripts/convertGeneSymbols.R | 10 ++++++---- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index a311b9e..d3459fd 100644 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -875,10 +875,10 @@ process countData { echo -e "LOG: counting ${ends} features" >> ${repRID}.countData.log if [ "${ends}" == "se" ] then - featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' -o ${repRID}.countData -s \${stranding} -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam + featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}.countData -s \${stranding} -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam elif [ "${ends}" == "pe" ] then - featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' -o ${repRID}.countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam + featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}.countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam fi echo -e "LOG: counted" >> ${repRID}.countData.log diff --git a/workflow/scripts/calculateTPM.R b/workflow/scripts/calculateTPM.R index d4851d4..a26bf94 100644 --- a/workflow/scripts/calculateTPM.R +++ b/workflow/scripts/calculateTPM.R @@ -13,10 +13,13 @@ if (!("count" %in% names(opt))){ stop("Count file doesn't exist, exiting.") } -repRID <- basename(gsub(".featureCounts","",opt$count)) +repRID <- basename(gsub(".countData","",opt$count)) count <- read.delim(opt$count, comment.char="#") # if featureCounts file changes structure, be sure to update count and Length columns below -colnames(count)[7] <- "count" +colnames(count)[1] <- "gene_name" +colnames(count)[7] <- "gene_id" +colnames(count)[8] <- "count" +count <- count[,c(1,7,2:6,8)] rpk <- count$count/count$Length/1000 diff --git a/workflow/scripts/convertGeneSymbols.R b/workflow/scripts/convertGeneSymbols.R index 92d3c3e..49752f1 100644 --- a/workflow/scripts/convertGeneSymbols.R +++ b/workflow/scripts/convertGeneSymbols.R @@ -7,18 +7,20 @@ option_list=list( opt=parse_args(OptionParser(option_list=option_list)) rm(option_list) -countTable <- read.csv(paste0(opt$repRID,".countData.countTable.csv"), stringsAsFactors=FALSE) +countTable <- read.csv(paste0(opt$repRID,".countTable.csv"), stringsAsFactors=FALSE) geneID <- read.delim("geneID.tsv", header=FALSE, stringsAsFactors=FALSE) Entrez <- read.delim("Entrez.tsv", header=FALSE, stringsAsFactors=FALSE) -convert <- data.frame(geneID=countTable$Geneid) -convert <- merge(x=convert,y=geneID[,1:2],by.x="geneID",by.y="V2",all.x=TRUE) +convert <- data.frame(gene_name=countTable$gene_name) +convert <- merge(x=convert,y=geneID[,1:2],by.x="gene_name",by.y="V2",all.x=TRUE) convert <- merge(x=convert,y=Entrez,by.x="V1",by.y="V1",all.x=TRUE) convert[is.na(convert$V2),3] <- "" convert <- convert[,-1] colnames(convert) <- c("GeneID","EntrezID") convert <- unique(convert) -output <- merge(x=convert,y=countTable[,c("Geneid","count","tpm")],by.x="GeneID",by.y="Geneid",all.x=TRUE) +output <- merge(x=convert,y=countTable[,c("gene_name","gene_id","count","tpm")],by.x="GeneID",by.y="gene_name",all.x=TRUE) +colnames(output) <- c("GENCODE_Gene_Symbol","NCBI_GeneID","Ensembl_GeneID","count","tpm") +output <- output[,c(1,3,2,4:5)] write.table(output,file=paste0(opt$repRID,".tpmTable.csv"),sep=",",row.names=FALSE,quote=FALSE) -- GitLab