Merge branch '5-phantompeakcalls' into 'master'

Resolve "Include Phantompeakqualtools step" Closes #5 See merge request !11

Merge branch '5-phantompeakcalls' into 'master'
Resolve "Include Phantompeakqualtools step" Closes #5 See merge request !11
17b7de97 · Venkat Malladi · ea889a62 · 48fa7dbb · 17b7de97 · 17b7de97
Commit 17b7de97 authored 7 years ago by Venkat Malladi
--- a/.gitmodules
+++ b/.gitmodules
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -27,6 +27,10 @@ process {
    module = ['python/3.6.1-2-anaconda',  'samtools/1.4.1', 'bedtools/2.26.0']
    cpus = 32
  }
+  $crossReads {
+    module = ['python/3.6.1-2-anaconda', 'phantompeakqualtools/1.2']
+    cpus = 32
+  }
 }
 params {

--- a/workflow/conf/install_R_packages.R
+++ b/workflow/conf/install_R_packages.R
-install.packages("snow", repos="http://cran.us.r-project.org")
-install.packages("snowfall", repos="http://cran.us.r-project.org")
-install.packages("bitops", repos="http://cran.us.r-project.org")
-install.packages("caTools", repos="http://cran.us.r-project.org")
-source("http://bioconductor.org/biocLite.R")
-biocLite("Rsamtools",suppressUpdates=TRUE)
-install.packages("../../phantompeakqualtools/spp_1.14.tar.gz")
--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -219,3 +219,33 @@ process convertReads {
  }
 }
+// Calculate Cross-correlation using phantompeaktools
+process crossReads {
+  tag "$sampleId-$replicate"
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+  input:
+  set sampleId, seTagAlign, tagAlign, biosample, factor, treatment, replicate, controlId from tagReads
+  output:
+  set sampleId, seTagAlign, tagAlign, file('*.cc.qc'), biosample, factor, treatment, replicate, controlId into xcorReads
+  set file('*.cc.qc'), file('*.cc.plot.pdf') into xcorReadsStats
+  script:
+  if (pairedEnd) {
+    """
+    python3 $baseDir/scripts/xcor.py -t $seTagAlign -p
+    """
+  }
+  else {
+    """
+    python3 $baseDir/scripts/xcor.py -t $seTagAlign
+    """
+  }
+}
--- a/workflow/scripts/xcor.py
+++ b/workflow/scripts/xcor.py
+#!/usr/bin/env python3
+'''Compute cross-correlation analysis.'''
+import os
+import argparse
+import shutil
+import logging
+from multiprocessing import cpu_count
+import utils
+EPILOG = '''
+For more details:
+        %(prog)s --help
+'''
+# SETTINGS
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+logger.propagate = False
+logger.setLevel(logging.INFO)
+def get_args():
+    '''Define arguments.'''
+    parser = argparse.ArgumentParser(
+        description=__doc__, epilog=EPILOG,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument('-t', '--tag',
+                        help="The tagAlign file to qc on.",
+                        required=True)
+    parser.add_argument('-p', '--paired',
+                        help="True/False if paired-end or single end.",
+                        default=False,
+                        action='store_true')
+    args = parser.parse_args()
+    return args
+# Functions
+def check_tools():
+    '''Checks for required componenets on user system'''
+    logger.info('Checking for required libraries and components on this system')
+    r_path = shutil.which("R")
+    if r_path:
+        logger.info('Found R: %s', r_path)
+    else:
+        logger.error('Missing R')
+        raise Exception('Missing R')
+def xcor(tag, paired):
+    '''Use spp to calculate cross-correlation stats.'''
+    tag_basename = os.path.basename(utils.strip_extensions(tag, ['.gz']))
+    uncompressed_tag_filename = tag_basename
+    out, err = utils.run_pipe([
+        'gzip -dc %s > ' % (tag)], outfile=uncompressed_tag_filename)
+    # Subsample tagAlign file
+    NREADS = 15000000
+    subsampled_tag_filename = \
+        tag_basename + ".%d.tagAlign.gz" % (NREADS/1000000)
+    steps = [
+        'grep -v "chrM" %s' % (uncompressed_tag_filename),
+        'shuf -n %d --random-source=%s' % (NREADS, uncompressed_tag_filename)
+    ]
+    if paired:
+        steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])
+    steps.extend(['gzip -nc'])
+    out, err = utils.run_pipe(steps, outfile=subsampled_tag_filename)
+    # Calculate Cross-correlation QC scores
+    cc_scores_filename = subsampled_tag_filename + ".cc.qc"
+    cc_plot_filename = subsampled_tag_filename + ".cc.plot.pdf"
+    # CC_SCORE FILE format
+    # Filename <tab>
+    # numReads <tab>
+    # estFragLen <tab>
+    # corr_estFragLen <tab>
+    # PhantomPeak <tab>
+    # corr_phantomPeak <tab>
+    # argmin_corr <tab>
+    # min_corr <tab>
+    # phantomPeakCoef <tab>
+    # relPhantomPeakCoef <tab>
+    # QualityTag
+    run_spp_command = shutil.which("run_spp.R")
+    out, err = utils.run_pipe([
+        "Rscript %s -c=%s -p=%d -filtchr=chrM -savp=%s -out=%s" %
+        (run_spp_command, subsampled_tag_filename, cpu_count(),
+         cc_plot_filename, cc_scores_filename)
+    ])
+def main():
+    args = get_args()
+    paired = args.paired
+    tag = args.tag
+    # Create a file handler
+    handler = logging.FileHandler('xcor.log')
+    logger.addHandler(handler)
+    # Check if tools are present
+    check_tools()
+    # Calculate Cross-correlation
+    xcor(tag, paired)
+if __name__ == '__main__':
+    main()
--- a/workflow/tests/test_convert_reads.py
+++ b/workflow/tests/test_convert_reads.py
@@ -8,8 +8,8 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
 def test_convert_reads_singleend():
-    assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.tagAlign.gz'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.tagAlign.gz'))
-    assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.bedse.gz'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bedse.gz'))
 def test_map_qc_pairedend():

--- a/workflow/tests/test_xcor.py
+++ b/workflow/tests/test_xcor.py
+#!/usr/bin/env python3
+import pytest
+import os
+import pandas as pd
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+                '/../output/crossReads/'
+def test_convert_reads_singleend():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCFF833BLU.filt.nodup.tagAlign.15.tagAlign.gz.cc.plot.pdf'))
+    qc_file = os.path.join(test_output_path,"ENCFF833BLU.filt.nodup.tagAlign.15.tagAlign.gz.cc.qc")
+    df_xcor = pd.read_csv(qc_file, sep="\t", header=None)
+    assert df_xcor[2].iloc[0] == '195,215,230'
+    assert df_xcor[8].iloc[0] == 1.024836
+    assert df_xcor[9].iloc[0] == 1.266678
+def test_map_qc_pairedend():
+    # Do the same thing for paired end data
+    pass