diff --git a/README.md b/README.md index 5390db419a3eed6b3e5f45e36a8a43aeee0afcab..71df1acc498a2cb373a394ab52744f88442ffbb5 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,8 @@ experimentQC | coverage.pdf | plot to assess the sequencing depth of a given sam experimentQC | heatmeap_SpearmanCorr.pdf | plot of Spearman correlation between samples experimentQC | heatmeap_PearsonCorr.pdf | plot of Pearson correlation between samples experimentQC | sample_mbs.npz | array of multiple BAM summaries - +crossReads | *.cc.plot.pdf | Plot of cross-correlation to assess signal-to-noise ratios +crossReads | *.cc.qc | cross-correlation metrics. File [HEADER](docs/xcor_header.txt) ## Common Errors If you find an error, please let the [BICF](mailto:BICF@UTSouthwestern.edu) know and we will add it here. diff --git a/docs/xcor_header.txt b/docs/xcor_header.txt new file mode 100644 index 0000000000000000000000000000000000000000..c4c712ba102e3ff79142ae8536003dba04d5eb25 --- /dev/null +++ b/docs/xcor_header.txt @@ -0,0 +1,17 @@ +See https://github.com/crazyhottommy/phantompeakqualtools for more details + +COL1: Filename: tagAlign/BAM filename +COL2: numReads: effective sequencing depth i.e. total number of mapped reads in input file +COL3: estFragLen: comma separated strand cross-correlation peak(s) in decreasing order of correlation. + The top 3 local maxima locations that are within 90% of the maximum cross-correlation value are output. + In almost all cases, the top (first) value in the list represents the predominant fragment length. + If you want to keep only the top value simply run + sed -r 's/,[^\t]+//g' <outFile> > <newOutFile> +COL4: corr_estFragLen: comma separated strand cross-correlation value(s) in decreasing order (col2 follows the same order) +COL5: phantomPeak: Read length/phantom peak strand shift +COL6: corr_phantomPeak: Correlation value at phantom peak +COL7: argmin_corr: strand shift at which cross-correlation is lowest +COL8: min_corr: minimum value of cross-correlation +COL9: Normalized strand cross-correlation coefficient (NSC) = COL4 / COL8 +COL10: Relative strand cross-correlation coefficient (RSC) = (COL4 - COL8) / (COL6 - COL8) +COL11: QualityTag: Quality tag based on thresholded RSC (codes: -2:veryLow,-1:Low,0:Medium,1:High,2:veryHigh) diff --git a/workflow/main.nf b/workflow/main.nf index be55e1c48e89ab9a43bf599b09f3388cada2b4cb..25d89a051b306f6f65d9c81ea82ec815d988a49b 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -352,38 +352,53 @@ process convertReads { // Calculate Cross-correlation using phantompeaktools process crossReads { - tag "$sampleId-$replicate" - publishDir "$baseDir/output/${task.process}", mode: 'copy' + tag "${sampleId}-${replicate}" + publishDir "${outDir}/${task.process}/${sampleId}", mode: 'copy' input: - - set sampleId, tagAlign, experimentId, replicate from tagReads + set sampleId, tagAlign, experimentId, replicate from tagReads output: - - set sampleId, tagAlign, file('*.cc.qc'), experimentId, replicate into xcorReads - set file('*.cc.qc'), file('*.cc.plot.pdf') into xcorReadsStats + set sampleId, tagAlign, file('*.cc.qc'), experimentId, replicate into xcorReads + set file('*.cc.qc'), file('*.cc.plot.pdf') into xcorReadsStats script: - - if (pairedEnd) { - """ - python3 $baseDir/scripts/xcor.py -t $tagAlign -p - """ - } - else { - """ - python3 $baseDir/scripts/xcor.py -t $tagAlign - """ - } + if (params.astrocyte == true) { + if (pairedEnd) { + """ + module load python/3.6.1-2-anaconda + module load phantompeakqualtools/1.2 + python3 ${baseDir}/scripts/xcor.py -t ${tagAlign} -p + """ + } + else { + """ + module load python/3.6.1-2-anaconda + module load phantompeakqualtools/1.2 + python3 ${baseDir}/scripts/xcor.py -t ${tagAlign} + """ + } + } + else { + if (pairedEnd) { + """ + python3 ${baseDir}/scripts/xcor.py -t ${tagAlign} -p + """ + } + else { + """ + python3 ${baseDir}/scripts/xcor.py -t ${tagAlign} + """ + } + } } // Define channel collecting tagAlign and xcor into design file xcorDesign = xcorReads .map{ sampleId, tagAlign, xcor, experimentId, replicate -> - "$sampleId\t$tagAlign\t$xcor\t$experimentId\t$replicate\n"} - .collectFile(name:'design_xcor.tsv', seed:"sample_id\ttag_align\txcor\texperiment_id\treplicate\n", storeDir:"$baseDir/output/design") + "${sampleId}\t${tagAlign}\t${xcor}\t${experimentId}\t${replicate}\n" } + .collectFile(name:'design_xcor.tsv', seed:"sample_id\ttag_align\txcor\texperiment_id\treplicate\n", storeDir:"${outDir}/design") // Make Experiment design files to be read in for downstream analysis process defineExpDesignFiles { diff --git a/workflow/scripts/xcor.py b/workflow/scripts/xcor.py index 09e2cb6268347485b578b1233e5c1d5df30e5764..8d4a977da1228363ce04d47d35a91fa35b6d07f5 100644 --- a/workflow/scripts/xcor.py +++ b/workflow/scripts/xcor.py @@ -5,6 +5,7 @@ import os import argparse import shutil +import subprocess import logging from multiprocessing import cpu_count import utils @@ -22,6 +23,13 @@ logger.propagate = False logger.setLevel(logging.INFO) +# the order of this list is important. +# strip_extensions strips from the right inward, so +# the expected right-most extensions should appear first (like .gz) +# Modified from J. Seth Strattan +STRIP_EXTENSIONS = ['.gz', '.tagAlign'] + + def get_args(): '''Define arguments.''' @@ -52,29 +60,55 @@ def check_tools(): r_path = shutil.which("R") if r_path: logger.info('Found R: %s', r_path) + + # Get Version + r_version_command = "R --version" + r_version = subprocess.check_output(r_version_command, shell=True) + + # Write to file + r_file = open("version_r.txt", "wb") + r_file.write(r_version) + r_file.close() else: logger.error('Missing R') raise Exception('Missing R') + phantompeak_path = shutil.which("run_spp.R") + if phantompeak_path: + logger.info('Found phantompeak: %s', phantompeak_path) + + # Get Version + spp_version_command = "R -e \"packageVersion('spp')\"" + spp_version = subprocess.check_output(spp_version_command, shell=True) + + # Write to file + spp_file = open("version_spp.txt", "wb") + spp_file.write(spp_version) + spp_file.close() + else: + logger.error('Missing phantompeak') + raise Exception('Missing phantompeak') + def xcor(tag, paired): '''Use spp to calculate cross-correlation stats.''' - tag_basename = os.path.basename(utils.strip_extensions(tag, ['.gz'])) - + tag_basename = os.path.basename(utils.strip_extensions(tag, STRIP_EXTENSIONS)) uncompressed_tag_filename = tag_basename + out, err = utils.run_pipe([ 'gzip -dc %s > ' % (tag)], outfile=uncompressed_tag_filename) # Subsample tagAlign file numReads = 15000000 - subsampled_tag_filename = \ tag_basename + ".%d.tagAlign.gz" % (numReads/1000000) + steps = [ - 'grep -v "chrM" %s' % (uncompressed_tag_filename), - 'shuf -n %d --random-source=%s' % (numReads, uncompressed_tag_filename) - ] + 'zcat %s' % (tag), + 'shuf -n %d --random-source=%s' % (number_reads, tag), + ] + if paired: steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""]) @@ -83,8 +117,8 @@ def xcor(tag, paired): out, err = utils.run_pipe(steps, outfile=subsampled_tag_filename) # Calculate Cross-correlation QC scores - cc_scores_filename = subsampled_tag_filename + ".cc.qc" - cc_plot_filename = subsampled_tag_filename + ".cc.plot.pdf" + cc_scores_filename = tag_basename + ".cc.qc" + cc_plot_filename = tag_basename + ".cc.plot.pdf" # CC_SCORE FILE format # Filename <tab> @@ -122,7 +156,7 @@ def main(): check_tools() # Calculate Cross-correlation - xcor_filename = xcor(tag, paired) + xcor(tag, paired) if __name__ == '__main__': diff --git a/workflow/tests/test_xcor.py b/workflow/tests/test_xcor.py index 7d00b92f16d494b352a6507ec5058299a6b41b13..7e837afff321af6694253d62f15c0ba4e81b54a2 100644 --- a/workflow/tests/test_xcor.py +++ b/workflow/tests/test_xcor.py @@ -8,18 +8,29 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ '/../output/crossReads/' -@pytest.mark.integration -def test_convert_reads_singleend(): - #assert os.path.exists(os.path.join(test_output_path, 'ENCFF833BLU.filt.nodup.tagAlign.15.tagAlign.gz.cc.plot.pdf')) - #qc_file = os.path.join(test_output_path,"ENCFF833BLU.filt.nodup.tagAlign.15.tagAlign.gz.cc.qc") - #df_xcor = pd.read_csv(qc_file, sep="\t", header=None) - #assert df_xcor[2].iloc[0] == '195,215,230' - #assert df_xcor[8].iloc[0] == 1.024836 - #assert df_xcor[9].iloc[0] == 1.266678 - pass - - -@pytest.mark.integration -def test_map_qc_pairedend(): - # Do the same thing for paired end data - pass +@pytest.mark.singleend_human +def test_cross_plot_singleend_human(): + assert os.path.exists(os.path.join(test_output_path, 'ENCLB969KTX/ENCLB969KTX.cc.plot.pdf')) + + +@pytest.mark.singleend_human +def test_cross_qc_singleend_human(): + qc_file = os.path.join(test_output_path,"ENCLB622FZX/ENCLB622FZX.cc.qc") + df_xcor = pd.read_csv(qc_file, sep="\t", header=None) + assert df_xcor[2].iloc[0] == '0' + assert df_xcor[8].iloc[0] == 1.347895 + assert df_xcor[9].iloc[0] == 1.970438 + + +@pytest.mark.pairedend_mouse +def test_cross_qc_pairedend_mouse(): + assert os.path.exists(os.path.join(test_output_path, 'ENCLB122XDP/ENCLB122XDP.cc.plot.pdf')) + + +@pytest.mark.pairedend_mouse +def test_cross_plot_pairedend_mouse(): + qc_file = os.path.join(test_output_path,"ENCLB749GLW/ENCLB749GLW.cc.qc") + df_xcor = pd.read_csv(qc_file, sep="\t", header=None) + assert df_xcor[2].iloc[0] == '0,65,75' + assert df_xcor[8].iloc[0] == 1.55347 + assert df_xcor[9].iloc[0] == 1.285233