diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config
index 7c1835f242c3601c79e4e34d00e83a0009c121f2..338cd5d54d4258e499a60241054c5f28109e7c19 100755
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -3,7 +3,7 @@ process {
   queue = 'super'
   clusterOptions = '--hold'
 
-  withLabel: trackStart {
+  withName: trackStart {
     executor = 'local'
   }
   withName: getBag {
@@ -51,6 +51,9 @@ process {
   withName: dataQC {
     queue = 'super'
   }
+  withName: aggrQC {
+    executor = 'local'
+  }
 }
 
 singularity {
diff --git a/workflow/conf/multiqc_config.yaml b/workflow/conf/multiqc_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..934ee6aeb2b47a69dc5cae48fca8a6ca2132888d
--- /dev/null
+++ b/workflow/conf/multiqc_config.yaml
@@ -0,0 +1,78 @@
+custom_logo: '../../docs/bicf_logo.png'
+custom_logo_url: 'https://utsouthwestern.edu/labs/bioinformatics/'
+custom_logo_title: 'Bioinformatics Core Facility'
+
+report_header_info:
+  - Contact Email: 'bicf@utsouthwestern.edu'
+  - Application Type: 'RNA-Seq Analytic Pipeline for GUDMAP/RBK'
+  - Department: 'Bioinformatic Core Facility, Department of Bioinformatics, University of Texas Southwestern Medical Center'
+
+title: RNA-Seq Analytic Pipeline for GUDMAP/RBK
+
+report_comment: >
+  This report has been generated by the <a href="https://doi.org/10.5281/zenodo.3625056">GUDMAP/RBK RNA-Seq Pipeline</a>
+
+top_modules:
+  - fastqc:
+      name: 'Raw'
+      info: 'Replicate Raw fastq QC Results'
+  - cutadapt:
+      name: 'Trim'
+      info: 'Replicate Trim Adapter QC Results'
+  - hisat2:
+      name: 'Align'
+      info: 'Replicate Alignment QC Results'
+      path_filters:
+        - '*alignSummary*'
+  - picard:
+      name: 'Dedup'
+      info: 'Replicate Alignment Deduplication QC Results'
+  - featureCounts:
+      name: 'Count'
+      info: 'Replicate Feature Count QC Results'
+  - rseqc:
+      name: 'Inner Distance'
+      info: 'Replicate Paired End Inner Distance Distribution Results'
+      path_filters:
+        - '*insertSize*'
+  - custom_content:
+      name: 'TIN'
+      info: 'Transcript Integrity Score Distribution Results'
+  - hisat2:
+      name: 'Inference: Align'
+      info: 'Inference Alignment (1M downsampled reads) QC Results'
+      path_filters:
+        - '*alignSampleSummary*'
+  - rseqc:
+      name: 'Inference: Stranded'
+      info: '1M Downsampled Reads Strandedness Inference Results'
+      path_filters:
+        - '*infer_experiment*'
+
+skip_generalstats: true
+
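+# custom_data and sp below register the binned TIN table written by
+# scripts/tinHist.py as a custom-content bargraph, matched via *.tin.hist.tsv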
+custom_data:
+  tin:
+    file_format: 'tsv'
+    section_name: 'TIN'
+    description: 'Distribution of TIN values calculated by RSeQC'
+    plot_type: 'bargraph'
+    pconfig:
+      id: 'tin'
+      headers:
+        - chrom
+        - '0 - 9'
+        - '10 - 19'
+        - '20 - 29'
+        - '30 - 39'
+        - '40 - 49'
+        - '50 - 59'
+        - '60 - 69'
+        - '70 - 79'
+        - '80 - 89'
+        - '90 - 99'
+sp:
+  tin:
+    fn: '*.tin.hist.tsv'
diff --git a/workflow/nextflow.config b/workflow/nextflow.config
index 0888063278039f3e3e7e280735670cdb94852fd4..b56aa1680442050a6f08c57a859f2cfcd1df92b8 100644
--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
@@ -56,6 +56,9 @@ process {
   withName: dataQC {
    container = 'bicf/rseqc3.0:2.0.0'
  }
+  withName: aggrQC {
+    container = 'bicf/multiqc:2.0.0'
+  }
 }
 
 trace {
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index 716e532564e60ef8d62e56905af754ba9d7476c5..b6f7d0e28ea6747bca57c6b684ed4831f75f02e1 100644
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -37,12 +37,14 @@
 derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json")
 //referenceBase = "s3://bicf-references"
 referenceBase = "/project/BICF/BICF_Core/shared/gudmap/references"
 referenceInfer = Channel.fromList(["ERCC","GRCh","GRCm"])
+multiqcConfig = Channel.fromPath("${baseDir}/conf/multiqc_config.yaml")
 
 // Define script files
 script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh")
 script_parseMeta = Channel.fromPath("${baseDir}/scripts/parseMeta.py")
-script_calculateTPM = Channel.fromPath("${baseDir}/scripts/calculateTPM.R")
 script_inferMeta = Channel.fromPath("${baseDir}/scripts/inferMeta.sh")
+script_calculateTPM = Channel.fromPath("${baseDir}/scripts/calculateTPM.R")
+script_tinHist = Channel.fromPath("${baseDir}/scripts/tinHist.py")
 
 /*
  * trackStart: track start of pipeline
@@ -56,17 +58,19 @@ process trackStart {
     ulimit -a
     export https_proxy=\${http_proxy}
 
-    curl -H 'Content-Type: application/json' -X PUT -d '{ \
+    curl -H 'Content-Type: application/json' -X PUT -d \
+      '{ \
       "sessionId": "${workflow.sessionId}", \
       "pipeline": "gudmap.rbk_rnaseq", \
       "start": "${workflow.start}", \
-      "repRID": ${reRID} \
+      "repRID": "${repRID}", \
       "astrocyte": false, \
       "status": "started", \
       "nextflowVersion": "${workflow.nextflow.version}", \
       "ci": ${params.ci}, \
-      "dev": ${params.dev}}' \
-      "https://xku43pcwnf.execute-api.us-east-1.amazonaws.com/ProdDeploy/pipeline-tracking"
+      "dev": ${params.dev} \
+      }' \
+      "https://xku43pcwnf.execute-api.us-east-1.amazonaws.com/ProdDeploy/pipeline-tracking"
     """
 }
 
@@ -409,6 +413,12 @@ process alignSampleData {
   """
 }
 
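+// Nextflow channels can be consumed only once, so split alignSampleQC for its two consumers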
+alignSampleQC.into {
+  alignSampleQC_inferMetadata
+  alignSampleQC_aggrQC
+}
+
 process inferMetadata {
   tag "${repRID}"
   publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.inferMetadata.{out,err}"
@@ -418,10 +427,11 @@ process inferMetadata {
     path beds from bedInfer.collect()
     path bam from sampleBam.collect()
     path bai from sampleBai.collect()
-    path alignSummary from alignSampleQC.collect()
+    path alignSummary from alignSampleQC_inferMetadata.collect()
 
   output:
     path "infer.csv" into inferMetadata
+    path "${repRID}.infer_experiment.txt" into inferExperiment
     path "${repRID}.inferMetadata.{out,err}" optional true
 
   script:
@@ -462,21 +472,21 @@ process inferMetadata {
     # infer experimental setting from dedup bam
     echo "LOG: infer experimental setting from dedup bam" >> ${repRID}.inferMetadata.err
-    infer_experiment.py -r "\${bed}" -i "\${bam}" 1>> ${repRID}.inferMetadata.log 2>> ${repRID}.inferMetadata.err
+    infer_experiment.py -r "\${bed}" -i "\${bam}" 1>> ${repRID}.infer_experiment.txt 2>> ${repRID}.inferMetadata.err
 
     echo "LOG: determining endedness and strandedness from file" >> ${repRID}.inferMetadata.err
-    ended=`bash inferMeta.sh endness ${repRID}.inferMetadata.log` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err
-    fail=`bash inferMeta.sh fail ${repRID}.inferMetadata.log` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err
+    ended=`bash inferMeta.sh endness ${repRID}.infer_experiment.txt` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err
+    fail=`bash inferMeta.sh fail ${repRID}.infer_experiment.txt` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err
     if [ \${ended} == "PairEnd" ]
     then
       ends="pe"
-      percentF=`bash inferMeta.sh pef ${repRID}.inferMetadata.log` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err
-      percentR=`bash inferMeta.sh per ${repRID}.inferMetadata.log` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err
+      percentF=`bash inferMeta.sh pef ${repRID}.infer_experiment.txt` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err
+      percentR=`bash inferMeta.sh per ${repRID}.infer_experiment.txt` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err
     elif [ \${ended} == "SingleEnd" ]
     then
       ends="se"
-      percentF=`bash inferMeta.sh sef ${repRID}.inferMetadata.log` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err
-      percentR=`bash inferMeta.sh ser ${repRID}.inferMetadata.log` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err
+      percentF=`bash inferMeta.sh sef ${repRID}.infer_experiment.txt` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err
+      percentR=`bash inferMeta.sh ser ${repRID}.infer_experiment.txt` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err
     fi
     if [ 1 -eq \$(echo \$(expr \${percentF} ">" 0.25)) ] && [ 1 -eq \$(echo \$(expr \${percentR} "<" 0.25)) ]
     then
@@ -522,6 +532,7 @@ inferMetadata.splitCsv(sep: ",", header: false).separate(
 endsInfer.into {
   endsInfer_alignData
   endsInfer_countData
+  endsInfer_dataQC
 }
 strandedInfer.into {
   strandedInfer_alignData
@@ -682,7 +693,7 @@ process dedupData {
   publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.dedup.{out,err}"
 
   input:
-    set path (bam), path (bai) from rawBam_dedupData
+    tuple path (bam), path (bai) from rawBam_dedupData
 
   output:
     tuple path ("${repRID}.sorted.deduped.bam"), path ("${repRID}.sorted.deduped.bam.bai") into dedupBam
@@ -717,6 +728,7 @@
 dedupBam.into {
   dedupBam_countData
   dedupBam_makeBigWig
+  dedupBam_dataQC
 }
 
 /*
@@ -728,7 +740,7 @@ process makeBigWig {
   publishDir "${outDir}/bigwig", mode: 'copy', pattern: "${repRID}.bw"
 
   input:
-    set path (bam), path (bai) from dedupBam_makeBigWig
+    tuple path (bam), path (bai) from dedupBam_makeBigWig
 
   output:
     path ("${repRID}.bw")
@@ -830,18 +842,21 @@ process fastqc {
 /*
  *dataQC: run RSeQC to calculate transcript integrity numbers (TIN)
 */
-
 process dataQC {
   tag "${repRID}"
   publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.dataQC.{out,err}"
 
   input:
+    path script_tinHist
     path ref from reference_dataQC
-    set path (chrBam), path (chrBai) from dedupChrBam
-
+    tuple path (bam), path (bai) from dedupBam_dataQC
+    tuple path (chrBam), path (chrBai) from dedupChrBam
+    val ends from endsInfer_dataQC
+
   output:
-    path "${repRID}.sorted.deduped.tin.xls" into tin
-    path "${repRID}.dataQC.{out,err}" optional true
+    path "${repRID}.tin.hist.tsv" into tin
+    path "${repRID}.insertSize.inner_distance_freq.txt" into innerDistance
+    path "${repRID}.dataQC.{out,err}"
 
   script:
     """
@@ -851,7 +866,55 @@ process dataQC {
     # calcualte TIN values per feature on each chromosome
     echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > ${repRID}.sorted.deduped.tin.xls
     for i in `cat ./bed/genome.bed | cut -f1 | sort | uniq`; do
-      echo "echo \"LOG: running tin.py on \${i}\" >> ${repRID}.rseqc.err; tin.py -i ${repRID}.sorted.deduped.\${i}.bam -r ./bed/genome.bed 1>>${repRID}.rseqc.log 2>>${repRID}.rseqc.err; cat ${repRID}.sorted.deduped.\${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \\\"\\\\t\${i}\\\\t\\\";";
-    done | parallel -j `nproc` -k 1>> ${repRID}.sorted.deduped.tin.xls 2>>${repRID}.rseqc.err
+      echo "echo \"LOG: running tin.py on \${i}\" >> ${repRID}.dataQC.err; tin.py -i ${repRID}.sorted.deduped.\${i}.bam -r ./bed/genome.bed 1>>${repRID}.dataQC.log 2>>${repRID}.dataQC.err; cat ${repRID}.sorted.deduped.\${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \\\"\\\\t\${i}\\\\t\\\";";
+    done | parallel -j `nproc` -k 1>> ${repRID}.sorted.deduped.tin.xls 2>>${repRID}.dataQC.err
+
+    # bin TIN values
+    python3 ${script_tinHist} -r ${repRID}
+
+    # calculate inner-distances for PE data
+    if [ "${ends}" == "pe" ]
+    then
+      inner_distance.py -i "${bam}" -o ${repRID}.insertSize -r ./bed/genome.bed 1>>${repRID}.dataQC.out 2>>${repRID}.dataQC.err
+    else
+      touch ${repRID}.insertSize.inner_distance_freq.txt
+    fi
+    """
+}
+
+/*
+ *aggrQC: aggregate QC from processes as well as metadata and run MultiQC
+*/
+process aggrQC {
+  tag "${repRID}"
+  publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.aggrQC.{out,err}"
+
+  input:
+    path multiqcConfig
+    path fastqc
+    path trimQC
+    path alignQC
+    path dedupQC
+    path countsQC
+    path tin
+    path innerDistance
+    path alignSampleQCs from alignSampleQC_aggrQC.collect()
+    path inferExperiment
+
+  output:
+    path "${repRID}.aggrQC.{out,err}" optional true
+
+  script:
+    """
+    hostname > ${repRID}.aggrQC.err
+    ulimit -a >> ${repRID}.aggrQC.err
+
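+    # single-end data leaves only an empty inner-distance placeholder; drop it
+    # so MultiQC does not try to parse an empty RSeQC result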
+    if [ \$(wc -l < ${innerDistance}) -eq 0 ]
+    then
+      rm -f ${innerDistance}
+    fi
+    multiqc -c ${multiqcConfig} .
     """
 }
diff --git a/workflow/scripts/tinHist.py b/workflow/scripts/tinHist.py
new file mode 100644
index 0000000000000000000000000000000000000000..df32a985c165e2b266fb26fc347bd13eb1580999
--- /dev/null
+++ b/workflow/scripts/tinHist.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+
+import argparse
+import pandas as pd
+import numpy as np
+import warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-r', '--repRID', help="The replicate RID.", required=True)
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = get_args()
+    tin = pd.read_csv(args.repRID + '.sorted.deduped.tin.xls', sep="\t", header=0)
+
+    # bin TIN values into ten equal-width bins spanning 0-100
+    labels = ["{0} - {1}".format(i, i + 9) for i in range(1, 100, 10)]
+    binned = tin.assign(Bins=lambda x: pd.cut(tin['TIN'], range(0, 105, 10), labels=labels, include_lowest=False, right=True))
+
+    # zero-pad single-digit chromosome names so columns sort naturally
+    binned['chrom'] = binned['chrom'].replace({'chr{}'.format(i): 'chr0{}'.format(i) for i in range(1, 10)})
+
+    # count genes per bin per chromosome, add a TOTAL column, and list it first
+    hist = pd.pivot_table(binned, values='geneID', index='Bins', columns='chrom', aggfunc=np.size)
+    hist['TOTAL'] = hist.sum(axis=1)
+    hist = hist[['TOTAL'] + [i for i in hist.columns if i != 'TOTAL']]
+    hist = hist.T.fillna(0.0).astype(int)
+    hist.to_csv(args.repRID + '.tin.hist.tsv', sep='\t')
+
+
+if __name__ == '__main__':
+    main()