From c41b4788274d7b5b389ca70759468e7c71ce2359 Mon Sep 17 00:00:00 2001 From: "Gervaise H. Henry" <gervaise.henry@utsouthwestern.edu> Date: Mon, 10 Aug 2020 16:09:27 -0500 Subject: [PATCH] Convert gene symbols to EntrezID in count/tpm table --- workflow/rna-seq.nf | 22 +++++++++++++++++----- workflow/scripts/convertGeneSymbols.R | 24 ++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 5 deletions(-) create mode 100644 workflow/scripts/convertGeneSymbols.R diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index efeb0f1..772d076 100644 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -53,6 +53,7 @@ script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh") script_parseMeta = Channel.fromPath("${baseDir}/scripts/parseMeta.py") script_inferMeta = Channel.fromPath("${baseDir}/scripts/inferMeta.sh") script_calculateTPM = Channel.fromPath("${baseDir}/scripts/calculateTPM.R") +script_convertGeneSymbols = Channel.fromPath("${baseDir}/scripts/convertGeneSymbols.R") script_tinHist = Channel.fromPath("${baseDir}/scripts/tinHist.py") /* @@ -323,7 +324,6 @@ inferMetadata_readLength.splitCsv(sep: ",", header: false).separate( readLengthInfer ) - // Replicate trimmed fastq's fastqsTrim.into { fastqsTrim_alignData @@ -629,7 +629,7 @@ process getRef { val species from speciesInfer_getRef output: - tuple path ("hisat2", type: 'dir'), path ("bed", type: 'dir'), path ("*.fna"), path ("*.gtf") into reference + tuple path ("hisat2", type: 'dir'), path ("bed", type: 'dir'), path ("*.fna"), path ("*.gtf"), path ("geneID.tsv"), path ("Entrez.tsv") into reference script: """ @@ -665,12 +665,16 @@ process getRef { aws s3 cp "\${references}"/bed ./bed --recursive aws s3 cp "\${references}"/genome.fna ./ aws s3 cp "\${references}"/genome.gtf ./ + aws s3 cp "\${references}"/geneID.tsv ./ + aws s3 cp "\${references}"/Entrez.tsv ./ elif [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references" ] then ln -s "\${references}"/hisat2 ln -s "\${references}"/bed ln -s "\${references}"/genome.fna ln -s "\${references}"/genome.gtf + ln -s "\${references}"/geneID.tsv + ln -s "\${references}"/Entrez.tsv fi echo -e "LOG: fetched" >> ${repRID}.getRef.log """ @@ -834,13 +838,14 @@ process countData { input: path script_calculateTPM + path script_convertGeneSymbols tuple path (bam), path (bai) from dedupBam_countData path ref from reference_countData val ends from endsInfer_countData val stranded from strandedInfer_countData output: - path ("*.countTable.csv") into counts + path ("*.tpmTable.csv") into counts path ("*.countData.summary") into countsQC path ("assignedReads.csv") into inferMetadata_assignedReads @@ -882,6 +887,10 @@ process countData { # calculate TPM from the resulting countData table echo -e "LOG: calculating TPM with R" >> ${repRID}.countData.log Rscript calculateTPM.R --count "${repRID}.countData" + + # convert gene symbols to Entrez id's + echo -e "LOG: convert gene symbols to Entrez id's" >> ${repRID}.countData.log + Rscript convertGeneSymbols.R --repRID "${repRID}" """ } @@ -983,7 +992,8 @@ inferMetadata_tinMed.splitCsv(sep: ",", header: false).separate( */ process aggrQC { tag "${repRID}" - publishDir "${outDir}/qc", mode: 'copy', pattern: "${repRID}.multiqc.html" + publishDir "${outDir}/report", mode: 'copy', pattern: "${repRID}.multiqc.html" + publishDir "${outDir}/qc", mode: 'copy', pattern: "${repRID}.multiqc_data.json" input: path multiqcConfig @@ -1016,6 +1026,7 @@ process aggrQC { output: path "${repRID}.multiqc.html" into multiqc + path "${repRID}.multiqc_data.json" into multiqcJSON script: """ @@ -1044,5 +1055,6 @@ process aggrQC { # run MultiQC echo -e "LOG: running multiqc" >> ${repRID}.aggrQC.log multiqc -c ${multiqcConfig} . -n ${repRID}.multiqc.html + cp ${repRID}.multiqc_data/multiqc_data.json ${repRID}.multiqc_data.json """ -} +} \ No newline at end of file diff --git a/workflow/scripts/convertGeneSymbols.R b/workflow/scripts/convertGeneSymbols.R new file mode 100644 index 0000000..7b05b92 --- /dev/null +++ b/workflow/scripts/convertGeneSymbols.R @@ -0,0 +1,24 @@ +gc() +library(optparse) + +option_list=list( + make_option("--repRID",action="store",type='character',help="Replicate RID") +) +opt=parse_args(OptionParser(option_list=option_list)) +rm(option_list) + +countTable <- read.csv(paste0(opt$repRID,".countData.countTable.csv"), stringsAsFactors=FALSE) +geneID <- read.delim("geneID.tsv", header=FALSE, stringsAsFactors=FALSE) +Entrez <- read.delim("Entrez.tsv", header=FALSE, stringsAsFactors=FALSE) + +convert <- data.frame(geneID=countTable$Geneid) +convert <- merge(x=convert,y=geneID[,1:2],by.x="geneID",by.y="V2",all.x=TRUE) +convert <- merge(x=convert,y=Entrez,by.x="V1",by.y="V1",all.x=TRUE) +convert[is.na(convert$V2),3] <- "" +convert <- convert[,-1] +colnames(convert) <- c("GeneID","EntrezID") +convert <- unique(convert) + +output <- merge(x=convert,y=countTable,by.x="GeneID",by.y="Geneid",all.x=TRUE) + +write.table(output,file=paste0(opt$repRID,".tpmTable.csv"),sep=",",row.names=FALSE,quote=FALSE) \ No newline at end of file -- GitLab