From b5d905220108d688b1e6317cad3a9cea543452ce Mon Sep 17 00:00:00 2001 From: Venkat Malladi <venkat.malladi@utsouthwestern.edu> Date: Mon, 23 Oct 2017 20:55:21 -0500 Subject: [PATCH] Add in xcor step and qc values. --- workflow/conf/biohpc.config | 4 + workflow/conf/install_R_packages.R | 7 -- workflow/main.nf | 30 +++++++ workflow/scripts/xcor.py | 127 +++++++++++++++++++++++++++++ workflow/tests/test_xcor.py | 21 +++++ 5 files changed, 182 insertions(+), 7 deletions(-) delete mode 100644 workflow/conf/install_R_packages.R create mode 100644 workflow/scripts/xcor.py create mode 100644 workflow/tests/test_xcor.py diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index 2bea1b7..82f2916 100644 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -27,6 +27,10 @@ process { module = ['python/3.6.1-2-anaconda', 'samtools/1.4.1', 'bedtools/2.26.0'] cpus = 32 } + $crossReads { + module = ['python/3.6.1-2-anaconda', 'phantompeakqualtools/1.2'] + cpus = 32 + } } params { diff --git a/workflow/conf/install_R_packages.R b/workflow/conf/install_R_packages.R deleted file mode 100644 index 5641c4f..0000000 --- a/workflow/conf/install_R_packages.R +++ /dev/null @@ -1,7 +0,0 @@ -install.packages("snow", repos="http://cran.us.r-project.org") -install.packages("snowfall", repos="http://cran.us.r-project.org") -install.packages("bitops", repos="http://cran.us.r-project.org") -install.packages("caTools", repos="http://cran.us.r-project.org") -source("http://bioconductor.org/biocLite.R") -biocLite("Rsamtools",suppressUpdates=TRUE) -install.packages("../../phantompeakqualtools/spp_1.14.tar.gz") diff --git a/workflow/main.nf b/workflow/main.nf index 70ea405..e97d587 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -219,3 +219,33 @@ process convertReads { } } + +// Calculate Cross-correlation using phantompeaktools +process crossReads { + + tag "$sampleId-$replicate" + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + + set sampleId, seTagAlign, tagAlign, biosample, factor, treatment, replicate, controlId from tagReads + + output: + + set sampleId, seTagAlign, tagAlign, file('*.cc.qc') biosample, factor, treatment, replicate, controlId into xcorReads + set file('*.cc.qc'), file('*.cc.plot.pdf') into xcorReadsStats + + script: + + if (pairedEnd) { + """ + python3 $baseDir/scripts/xcor.py -b $seTagAlign -p + """ + } + else { + """ + python3 $baseDir/scripts/xcor.py.py -b $seTagAlign + """ + } + +} diff --git a/workflow/scripts/xcor.py b/workflow/scripts/xcor.py new file mode 100644 index 0000000..484e84a --- /dev/null +++ b/workflow/scripts/xcor.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 + +'''Compute cross-correlation analysis.''' + +import os +import argparse +import shutil +import shlex +import logging +from multiprocessing import cpu_count +import utils + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +# SETTINGS + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + + +def get_args(): + '''Define arguments.''' + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-t', '--tag', + help="The tagAlign file to qc on.", + required=True) + + parser.add_argument('-p', '--paired', + help="True/False if paired-end or single end.", + default=False, + action='store_true') + + args = parser.parse_args() + return args + +# Functions + + +def check_tools(): + '''Checks for required componenets on user system''' + + logger.info('Checking for required libraries and components on this system') + + r_path = shutil.which("R") + if r_path: + logger.info('Found R: %s', r_path) + else: + logger.error('Missing R') + raise Exception('Missing R') + + +def xcor(tag, paired): + '''Use spp to calculate cross-correlation stats.''' + + tag_basename = tag.rstrip('.gz') + + uncompressed_tag_filename = tag_basename + out, err = utils.run_pipe(['gzip -d %s' % (tag_basename)]) + + # Subsample tagAlign file + NREADS = 15000000 + + subsampled_tag_filename = \ + tag_basename + \ + ".%d.tagAlign.gz" % (NREADS/1000000) + steps = [ + 'grep -v "chrM" %s' % (uncompressed_tag_filename), + 'shuf -n %d --random-source=%s' % (NREADS, uncompressed_tag_filename) + ] + if paired: + steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""]) + + steps.extend(['gzip -nc']) + + out, err = utils.run_pipe(steps, outfile=subsampled_tag_filename) + + # Calculate Cross-correlation QC scores + cc_scores_filename = subsampled_tag_filename + ".cc.qc" + cc_plot_filename = subsampled_tag_filename + ".cc.plot.pdf" + + # CC_SCORE FILE format + # Filename <tab> + # numReads <tab> + # estFragLen <tab> + # corr_estFragLen <tab> + # PhantomPeak <tab> + # corr_phantomPeak <tab> + # argmin_corr <tab> + # min_corr <tab> + # phantomPeakCoef <tab> + # relPhantomPeakCoef <tab> + # QualityTag + + run_spp_command = shutil.which("run_spp.R") + out, err = utils.run_pipe([ + "Rscript %s -c=%s -p=%d -filtchr=chrM -savp=%s -out=%s" % + (run_spp_command, subsampled_tag_filename, cpu_count(), + cc_plot_filename, cc_scores_filename) + ]) + + +def main(): + args = get_args() + paired = args.paired + tag = args.tag + + # Create a file handler + handler = logging.FileHandler('xcor.log') + logger.addHandler(handler) + + # Check if tools are present + check_tools() + + # Calculate Cross-correlation + xcor(tag, paired) + + +if __name__ == '__main__': + main() diff --git a/workflow/tests/test_xcor.py b/workflow/tests/test_xcor.py new file mode 100644 index 0000000..51f109a --- /dev/null +++ b/workflow/tests/test_xcor.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 + +import pytest +import os +import pandas as pd + +test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ + '/../output/crossReads/' + + +def test_convert_reads_singleend(): + assert os.path.exists(os.path.join(test_output_path, 'ENCFF833BLU.filt.nodup.tagAlign.15.tagAlign.gz.cc.plot.pdf')) + df_xcor = pd.read_csv("ENCFF833BLU.filt.nodup.tagAlign.15.tagAlign.gz.cc.qc", sep="\t", header=None) + assert df_xcor[2].iloc[0] = '195,215,230' + assert df_xcor[8].iloc[0] = 1.024836 + assert df_xcor[9].iloc[0] = 1.266678 + + +def test_map_qc_pairedend(): + # Do the same thing for paired end data + pass -- GitLab