diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index a311b9e876249c0d4cfbbb1cd13e7d94b1902807..d3459fd26c6c8c67053cfee3a5fcd8fe8610dc23 100644 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -875,10 +875,10 @@ process countData { echo -e "LOG: counting ${ends} features" >> ${repRID}.countData.log if [ "${ends}" == "se" ] then - featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' -o ${repRID}.countData -s \${stranding} -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam + featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}.countData -s \${stranding} -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam elif [ "${ends}" == "pe" ] then - featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' -o ${repRID}.countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam + featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}.countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam fi echo -e "LOG: counted" >> ${repRID}.countData.log diff --git a/workflow/scripts/calculateTPM.R b/workflow/scripts/calculateTPM.R index d4851d4806c5e12e06416a103f2e6972a8b4befe..a26bf943dce7d082ed03b77939390abb022bba82 100644 --- a/workflow/scripts/calculateTPM.R +++ b/workflow/scripts/calculateTPM.R @@ -13,10 +13,13 @@ if (!("count" %in% names(opt))){ stop("Count file doesn't exist, exiting.") } -repRID <- basename(gsub(".featureCounts","",opt$count)) +repRID <- basename(gsub(".countData","",opt$count)) count <- read.delim(opt$count, comment.char="#") # if featureCounts file changes structure, be sure to update count and Length columns below -colnames(count)[7] <- "count" +colnames(count)[1] <- "gene_name" +colnames(count)[7] <- "gene_id" +colnames(count)[8] <- "count" +count <- count[,c(1,7,2:6,8)] rpk <- count$count/count$Length/1000 diff --git a/workflow/scripts/convertGeneSymbols.R b/workflow/scripts/convertGeneSymbols.R index 92d3c3e2e8ad97ebacbca3c5f9d0fc185ceafb7e..49752f1bba5a4dd8d91ed8609c6f2f82b8fafacc 100644 --- a/workflow/scripts/convertGeneSymbols.R +++ b/workflow/scripts/convertGeneSymbols.R @@ -7,18 +7,20 @@ option_list=list( opt=parse_args(OptionParser(option_list=option_list)) rm(option_list) -countTable <- read.csv(paste0(opt$repRID,".countData.countTable.csv"), stringsAsFactors=FALSE) +countTable <- read.csv(paste0(opt$repRID,".countTable.csv"), stringsAsFactors=FALSE) geneID <- read.delim("geneID.tsv", header=FALSE, stringsAsFactors=FALSE) Entrez <- read.delim("Entrez.tsv", header=FALSE, stringsAsFactors=FALSE) -convert <- data.frame(geneID=countTable$Geneid) -convert <- merge(x=convert,y=geneID[,1:2],by.x="geneID",by.y="V2",all.x=TRUE) +convert <- data.frame(gene_name=countTable$gene_name) +convert <- merge(x=convert,y=geneID[,1:2],by.x="gene_name",by.y="V2",all.x=TRUE) convert <- merge(x=convert,y=Entrez,by.x="V1",by.y="V1",all.x=TRUE) convert[is.na(convert$V2),3] <- "" convert <- convert[,-1] colnames(convert) <- c("GeneID","EntrezID") convert <- unique(convert) -output <- merge(x=convert,y=countTable[,c("Geneid","count","tpm")],by.x="GeneID",by.y="Geneid",all.x=TRUE) +output <- merge(x=convert,y=countTable[,c("gene_name","gene_id","count","tpm")],by.x="GeneID",by.y="gene_name",all.x=TRUE) +colnames(output) <- c("GENCODE_Gene_Symbol","NCBI_GeneID","Ensembl_GeneID","count","tpm") +output <- output[,c(1,3,2,4:5)] write.table(output,file=paste0(opt$repRID,".tpmTable.csv"),sep=",",row.names=FALSE,quote=FALSE)