Fix xcor

4e7b319a · Holly Ruess · f08db8a9 · 4e7b319a · 4e7b319a · 4e7b319a
Commit 4e7b319a authored 5 years ago by Holly Ruess
--- a/README.md
+++ b/README.md
@@ -70,7 +70,8 @@ experimentQC | coverage.pdf | plot to assess the sequencing depth of a given sam
 experimentQC | heatmeap_SpearmanCorr.pdf | plot of Spearman correlation between samples
 experimentQC | heatmeap_PearsonCorr.pdf | plot of Pearson correlation between samples
 experimentQC | sample_mbs.npz | array of multiple BAM summaries
+crossReads | *.cc.plot.pdf | Plot of cross-correlation to assess signal-to-noise ratios
+crossReads | *.cc.qc | cross-correlation metrics. File [HEADER](docs/xcor_header.txt)
 ## Common Errors
 If you find an error, please let the [BICF](mailto:BICF@UTSouthwestern.edu) know and we will add it here.

--- a/docs/xcor_header.txt
+++ b/docs/xcor_header.txt
+See https://github.com/crazyhottommy/phantompeakqualtools for more details
+COL1: Filename: tagAlign/BAM filename
+COL2: numReads: effective sequencing depth i.e. total number of mapped reads in input file
+COL3: estFragLen: comma separated strand cross-correlation peak(s) in decreasing order of correlation.
+	  The top 3 local maxima locations that are within 90% of the maximum cross-correlation value are output.
+	  In almost all cases, the top (first) value in the list represents the predominant fragment length.
+	  If you want to keep only the top value simply run
+	  sed -r 's/,[^\t]+//g' <outFile> > <newOutFile>
+COL4: corr_estFragLen: comma separated strand cross-correlation value(s) in decreasing order (col2 follows the same order)
+COL5: phantomPeak: Read length/phantom peak strand shift
+COL6: corr_phantomPeak: Correlation value at phantom peak
+COL7: argmin_corr: strand shift at which cross-correlation is lowest
+COL8: min_corr: minimum value of cross-correlation
+COL9: Normalized strand cross-correlation coefficient (NSC) = COL4 / COL8
+COL10: Relative strand cross-correlation coefficient (RSC) = (COL4 - COL8) / (COL6 - COL8)
+COL11: QualityTag: Quality tag based on thresholded RSC (codes: -2:veryLow,-1:Low,0:Medium,1:High,2:veryHigh)
--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -352,38 +352,53 @@ process convertReads {
 // Calculate Cross-correlation using phantompeaktools
 process crossReads {
-  tag "$sampleId-$replicate"
+  tag "${sampleId}-${replicate}"
-  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+  publishDir "${outDir}/${task.process}/${sampleId}", mode: 'copy'
  input:
+    set sampleId, tagAlign, experimentId, replicate from tagReads
-  set sampleId, tagAlign, experimentId, replicate from tagReads
  output:
+    set sampleId, tagAlign, file('*.cc.qc'), experimentId, replicate into xcorReads
-  set sampleId, tagAlign, file('*.cc.qc'), experimentId, replicate into xcorReads
+    set file('*.cc.qc'), file('*.cc.plot.pdf') into xcorReadsStats
-  set file('*.cc.qc'), file('*.cc.plot.pdf') into xcorReadsStats
  script:
+    if (params.astrocyte == true) {
-  if (pairedEnd) {
+      if (pairedEnd) {
-    """
+        """
-    python3 $baseDir/scripts/xcor.py -t $tagAlign -p
+        module load python/3.6.1-2-anaconda
-    """
+        module load phantompeakqualtools/1.2
-  }
+        python3 ${baseDir}/scripts/xcor.py -t ${tagAlign} -p
-  else {
+        """
-    """
+      }
-    python3 $baseDir/scripts/xcor.py -t $tagAlign
+      else {
-    """
+        """
-  }
+        module load python/3.6.1-2-anaconda
+        module load phantompeakqualtools/1.2
+        python3 ${baseDir}/scripts/xcor.py -t ${tagAlign}
+        """
+      }
+    }
+    else {
+      if (pairedEnd) {
+        """
+        python3 ${baseDir}/scripts/xcor.py -t ${tagAlign} -p
+        """
+      }
+      else {
+        """
+        python3 ${baseDir}/scripts/xcor.py -t ${tagAlign}
+        """
+      }
+    }
 }
 // Define channel collecting tagAlign and xcor into design file
 xcorDesign = xcorReads
              .map{ sampleId, tagAlign, xcor, experimentId, replicate ->
-              "$sampleId\t$tagAlign\t$xcor\t$experimentId\t$replicate\n"}
+              "${sampleId}\t${tagAlign}\t${xcor}\t${experimentId}\t${replicate}\n" }
-              .collectFile(name:'design_xcor.tsv', seed:"sample_id\ttag_align\txcor\texperiment_id\treplicate\n", storeDir:"$baseDir/output/design")
+              .collectFile(name:'design_xcor.tsv', seed:"sample_id\ttag_align\txcor\texperiment_id\treplicate\n", storeDir:"${outDir}/design")
 // Make Experiment design files to be read in for downstream analysis
 process defineExpDesignFiles {

--- a/workflow/scripts/xcor.py
+++ b/workflow/scripts/xcor.py
@@ -5,6 +5,7 @@
 import os
 import argparse
 import shutil
+import subprocess
 import logging
 from multiprocessing import cpu_count
 import utils
@@ -22,6 +23,13 @@ logger.propagate = False
 logger.setLevel(logging.INFO)
+# the order of this list is important.
+# strip_extensions strips from the right inward, so
+# the expected right-most extensions should appear first (like .gz)
+# Modified from J. Seth Strattan
+STRIP_EXTENSIONS = ['.gz', '.tagAlign']
 def get_args():
    '''Define arguments.'''
@@ -52,29 +60,55 @@ def check_tools():
    r_path = shutil.which("R")
    if r_path:
        logger.info('Found R: %s', r_path)
+        # Get Version
+        r_version_command = "R --version"
+        r_version = subprocess.check_output(r_version_command, shell=True)
+        # Write to file
+        r_file = open("version_r.txt", "wb")
+        r_file.write(r_version)
+        r_file.close()
    else:
        logger.error('Missing R')
        raise Exception('Missing R')
+    phantompeak_path = shutil.which("run_spp.R")
+    if phantompeak_path:
+        logger.info('Found phantompeak: %s', phantompeak_path)
+        # Get Version
+        spp_version_command = "R -e \"packageVersion('spp')\""
+        spp_version = subprocess.check_output(spp_version_command, shell=True)
+        # Write to file
+        spp_file = open("version_spp.txt", "wb")
+        spp_file.write(spp_version)
+        spp_file.close()
+    else:
+        logger.error('Missing phantompeak')
+        raise Exception('Missing phantompeak')
 def xcor(tag, paired):
    '''Use spp to calculate cross-correlation stats.'''
-    tag_basename = os.path.basename(utils.strip_extensions(tag, ['.gz']))
+    tag_basename = os.path.basename(utils.strip_extensions(tag, STRIP_EXTENSIONS))
    uncompressed_tag_filename = tag_basename
    out, err = utils.run_pipe([
        'gzip -dc %s > ' % (tag)], outfile=uncompressed_tag_filename)
    # Subsample tagAlign file
    numReads = 15000000
    subsampled_tag_filename = \
        tag_basename + ".%d.tagAlign.gz" % (numReads/1000000)
    steps = [
-        'grep -v "chrM" %s' % (uncompressed_tag_filename),
+        'zcat %s' % (tag),
-        'shuf -n %d --random-source=%s' % (numReads, uncompressed_tag_filename)
+        'shuf -n %d --random-source=%s' % (number_reads, tag),
-    ]
+        ]
    if paired:
        steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])
@@ -83,8 +117,8 @@ def xcor(tag, paired):
    out, err = utils.run_pipe(steps, outfile=subsampled_tag_filename)
    # Calculate Cross-correlation QC scores
-    cc_scores_filename = subsampled_tag_filename + ".cc.qc"
+    cc_scores_filename = tag_basename + ".cc.qc"
-    cc_plot_filename = subsampled_tag_filename + ".cc.plot.pdf"
+    cc_plot_filename = tag_basename + ".cc.plot.pdf"
    # CC_SCORE FILE format
    # Filename <tab>
@@ -122,7 +156,7 @@ def main():
    check_tools()
    # Calculate Cross-correlation
-    xcor_filename = xcor(tag, paired)
+    xcor(tag, paired)
 if __name__ == '__main__':

--- a/workflow/tests/test_xcor.py
+++ b/workflow/tests/test_xcor.py
@@ -8,18 +8,29 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
                '/../output/crossReads/'
-@pytest.mark.integration
+@pytest.mark.singleend_human
-def test_convert_reads_singleend():
+def test_cross_plot_singleend_human():
-    #assert os.path.exists(os.path.join(test_output_path, 'ENCFF833BLU.filt.nodup.tagAlign.15.tagAlign.gz.cc.plot.pdf'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB969KTX/ENCLB969KTX.cc.plot.pdf'))
-    #qc_file = os.path.join(test_output_path,"ENCFF833BLU.filt.nodup.tagAlign.15.tagAlign.gz.cc.qc")
-    #df_xcor = pd.read_csv(qc_file, sep="\t", header=None)
-    #assert df_xcor[2].iloc[0] == '195,215,230'
+@pytest.mark.singleend_human
-    #assert df_xcor[8].iloc[0] == 1.024836
+def test_cross_qc_singleend_human():
-    #assert df_xcor[9].iloc[0] == 1.266678
+    qc_file = os.path.join(test_output_path,"ENCLB622FZX/ENCLB622FZX.cc.qc")
-    pass
+    df_xcor = pd.read_csv(qc_file, sep="\t", header=None)
+    assert df_xcor[2].iloc[0] == '0'
+    assert df_xcor[8].iloc[0] == 1.347895
-@pytest.mark.integration
+    assert df_xcor[9].iloc[0] == 1.970438
-def test_map_qc_pairedend():
-    # Do the same thing for paired end data
-    pass
+@pytest.mark.pairedend_mouse
+def test_cross_qc_pairedend_mouse():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB122XDP/ENCLB122XDP.cc.plot.pdf'))
+@pytest.mark.pairedend_mouse
+def test_cross_plot_pairedend_mouse():
+    qc_file = os.path.join(test_output_path,"ENCLB749GLW/ENCLB749GLW.cc.qc")
+    df_xcor = pd.read_csv(qc_file, sep="\t", header=None)
+    assert df_xcor[2].iloc[0] == '0,65,75'
+    assert df_xcor[8].iloc[0] == 1.55347
+    assert df_xcor[9].iloc[0] == 1.285233