Skip to content
Snippets Groups Projects
Commit 4e7b319a authored by Holly Ruess's avatar Holly Ruess
Browse files

Fix xcor

parent f08db8a9
1 merge request!12Resolve "Fix Pool and Pseudoreps"
Pipeline #5328 failed with stages
in 1 hour, 16 minutes, and 20 seconds
...@@ -70,7 +70,8 @@ experimentQC | coverage.pdf | plot to assess the sequencing depth of a given sam ...@@ -70,7 +70,8 @@ experimentQC | coverage.pdf | plot to assess the sequencing depth of a given sam
experimentQC | heatmeap_SpearmanCorr.pdf | plot of Spearman correlation between samples experimentQC | heatmeap_SpearmanCorr.pdf | plot of Spearman correlation between samples
experimentQC | heatmeap_PearsonCorr.pdf | plot of Pearson correlation between samples experimentQC | heatmeap_PearsonCorr.pdf | plot of Pearson correlation between samples
experimentQC | sample_mbs.npz | array of multiple BAM summaries experimentQC | sample_mbs.npz | array of multiple BAM summaries
crossReads | *.cc.plot.pdf | Plot of cross-correlation to assess signal-to-noise ratios
crossReads | *.cc.qc | cross-correlation metrics. File [HEADER](docs/xcor_header.txt)
## Common Errors ## Common Errors
If you find an error, please let the [BICF](mailto:BICF@UTSouthwestern.edu) know and we will add it here. If you find an error, please let the [BICF](mailto:BICF@UTSouthwestern.edu) know and we will add it here.
......
See https://github.com/crazyhottommy/phantompeakqualtools for more details
COL1: Filename: tagAlign/BAM filename
COL2: numReads: effective sequencing depth i.e. total number of mapped reads in input file
COL3: estFragLen: comma separated strand cross-correlation peak(s) in decreasing order of correlation.
The top 3 local maxima locations that are within 90% of the maximum cross-correlation value are output.
In almost all cases, the top (first) value in the list represents the predominant fragment length.
If you want to keep only the top value simply run
sed -r 's/,[^\t]+//g' <outFile> > <newOutFile>
COL4: corr_estFragLen: comma separated strand cross-correlation value(s) in decreasing order (col2 follows the same order)
COL5: phantomPeak: Read length/phantom peak strand shift
COL6: corr_phantomPeak: Correlation value at phantom peak
COL7: argmin_corr: strand shift at which cross-correlation is lowest
COL8: min_corr: minimum value of cross-correlation
COL9: Normalized strand cross-correlation coefficient (NSC) = COL4 / COL8
COL10: Relative strand cross-correlation coefficient (RSC) = (COL4 - COL8) / (COL6 - COL8)
COL11: QualityTag: Quality tag based on thresholded RSC (codes: -2:veryLow,-1:Low,0:Medium,1:High,2:veryHigh)
...@@ -352,38 +352,53 @@ process convertReads { ...@@ -352,38 +352,53 @@ process convertReads {
// Calculate Cross-correlation using phantompeaktools // Calculate Cross-correlation using phantompeaktools
process crossReads { process crossReads {
tag "$sampleId-$replicate" tag "${sampleId}-${replicate}"
publishDir "$baseDir/output/${task.process}", mode: 'copy' publishDir "${outDir}/${task.process}/${sampleId}", mode: 'copy'
input: input:
set sampleId, tagAlign, experimentId, replicate from tagReads
set sampleId, tagAlign, experimentId, replicate from tagReads
output: output:
set sampleId, tagAlign, file('*.cc.qc'), experimentId, replicate into xcorReads
set sampleId, tagAlign, file('*.cc.qc'), experimentId, replicate into xcorReads set file('*.cc.qc'), file('*.cc.plot.pdf') into xcorReadsStats
set file('*.cc.qc'), file('*.cc.plot.pdf') into xcorReadsStats
script: script:
if (params.astrocyte == true) {
if (pairedEnd) { if (pairedEnd) {
""" """
python3 $baseDir/scripts/xcor.py -t $tagAlign -p module load python/3.6.1-2-anaconda
""" module load phantompeakqualtools/1.2
} python3 ${baseDir}/scripts/xcor.py -t ${tagAlign} -p
else { """
""" }
python3 $baseDir/scripts/xcor.py -t $tagAlign else {
""" """
} module load python/3.6.1-2-anaconda
module load phantompeakqualtools/1.2
python3 ${baseDir}/scripts/xcor.py -t ${tagAlign}
"""
}
}
else {
if (pairedEnd) {
"""
python3 ${baseDir}/scripts/xcor.py -t ${tagAlign} -p
"""
}
else {
"""
python3 ${baseDir}/scripts/xcor.py -t ${tagAlign}
"""
}
}
} }
// Define channel collecting tagAlign and xcor into design file // Define channel collecting tagAlign and xcor into design file
xcorDesign = xcorReads xcorDesign = xcorReads
.map{ sampleId, tagAlign, xcor, experimentId, replicate -> .map{ sampleId, tagAlign, xcor, experimentId, replicate ->
"$sampleId\t$tagAlign\t$xcor\t$experimentId\t$replicate\n"} "${sampleId}\t${tagAlign}\t${xcor}\t${experimentId}\t${replicate}\n" }
.collectFile(name:'design_xcor.tsv', seed:"sample_id\ttag_align\txcor\texperiment_id\treplicate\n", storeDir:"$baseDir/output/design") .collectFile(name:'design_xcor.tsv', seed:"sample_id\ttag_align\txcor\texperiment_id\treplicate\n", storeDir:"${outDir}/design")
// Make Experiment design files to be read in for downstream analysis // Make Experiment design files to be read in for downstream analysis
process defineExpDesignFiles { process defineExpDesignFiles {
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
import os import os
import argparse import argparse
import shutil import shutil
import subprocess
import logging import logging
from multiprocessing import cpu_count from multiprocessing import cpu_count
import utils import utils
...@@ -22,6 +23,13 @@ logger.propagate = False ...@@ -22,6 +23,13 @@ logger.propagate = False
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.gz', '.tagAlign']
def get_args(): def get_args():
'''Define arguments.''' '''Define arguments.'''
...@@ -52,29 +60,55 @@ def check_tools(): ...@@ -52,29 +60,55 @@ def check_tools():
r_path = shutil.which("R") r_path = shutil.which("R")
if r_path: if r_path:
logger.info('Found R: %s', r_path) logger.info('Found R: %s', r_path)
# Get Version
r_version_command = "R --version"
r_version = subprocess.check_output(r_version_command, shell=True)
# Write to file
r_file = open("version_r.txt", "wb")
r_file.write(r_version)
r_file.close()
else: else:
logger.error('Missing R') logger.error('Missing R')
raise Exception('Missing R') raise Exception('Missing R')
phantompeak_path = shutil.which("run_spp.R")
if phantompeak_path:
logger.info('Found phantompeak: %s', phantompeak_path)
# Get Version
spp_version_command = "R -e \"packageVersion('spp')\""
spp_version = subprocess.check_output(spp_version_command, shell=True)
# Write to file
spp_file = open("version_spp.txt", "wb")
spp_file.write(spp_version)
spp_file.close()
else:
logger.error('Missing phantompeak')
raise Exception('Missing phantompeak')
def xcor(tag, paired): def xcor(tag, paired):
'''Use spp to calculate cross-correlation stats.''' '''Use spp to calculate cross-correlation stats.'''
tag_basename = os.path.basename(utils.strip_extensions(tag, ['.gz'])) tag_basename = os.path.basename(utils.strip_extensions(tag, STRIP_EXTENSIONS))
uncompressed_tag_filename = tag_basename uncompressed_tag_filename = tag_basename
out, err = utils.run_pipe([ out, err = utils.run_pipe([
'gzip -dc %s > ' % (tag)], outfile=uncompressed_tag_filename) 'gzip -dc %s > ' % (tag)], outfile=uncompressed_tag_filename)
# Subsample tagAlign file # Subsample tagAlign file
numReads = 15000000 numReads = 15000000
subsampled_tag_filename = \ subsampled_tag_filename = \
tag_basename + ".%d.tagAlign.gz" % (numReads/1000000) tag_basename + ".%d.tagAlign.gz" % (numReads/1000000)
steps = [ steps = [
'grep -v "chrM" %s' % (uncompressed_tag_filename), 'zcat %s' % (tag),
'shuf -n %d --random-source=%s' % (numReads, uncompressed_tag_filename) 'shuf -n %d --random-source=%s' % (number_reads, tag),
] ]
if paired: if paired:
steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""]) steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])
...@@ -83,8 +117,8 @@ def xcor(tag, paired): ...@@ -83,8 +117,8 @@ def xcor(tag, paired):
out, err = utils.run_pipe(steps, outfile=subsampled_tag_filename) out, err = utils.run_pipe(steps, outfile=subsampled_tag_filename)
# Calculate Cross-correlation QC scores # Calculate Cross-correlation QC scores
cc_scores_filename = subsampled_tag_filename + ".cc.qc" cc_scores_filename = tag_basename + ".cc.qc"
cc_plot_filename = subsampled_tag_filename + ".cc.plot.pdf" cc_plot_filename = tag_basename + ".cc.plot.pdf"
# CC_SCORE FILE format # CC_SCORE FILE format
# Filename <tab> # Filename <tab>
...@@ -122,7 +156,7 @@ def main(): ...@@ -122,7 +156,7 @@ def main():
check_tools() check_tools()
# Calculate Cross-correlation # Calculate Cross-correlation
xcor_filename = xcor(tag, paired) xcor(tag, paired)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -8,18 +8,29 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ ...@@ -8,18 +8,29 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../output/crossReads/' '/../output/crossReads/'
@pytest.mark.integration @pytest.mark.singleend_human
def test_convert_reads_singleend(): def test_cross_plot_singleend_human():
#assert os.path.exists(os.path.join(test_output_path, 'ENCFF833BLU.filt.nodup.tagAlign.15.tagAlign.gz.cc.plot.pdf')) assert os.path.exists(os.path.join(test_output_path, 'ENCLB969KTX/ENCLB969KTX.cc.plot.pdf'))
#qc_file = os.path.join(test_output_path,"ENCFF833BLU.filt.nodup.tagAlign.15.tagAlign.gz.cc.qc")
#df_xcor = pd.read_csv(qc_file, sep="\t", header=None)
#assert df_xcor[2].iloc[0] == '195,215,230' @pytest.mark.singleend_human
#assert df_xcor[8].iloc[0] == 1.024836 def test_cross_qc_singleend_human():
#assert df_xcor[9].iloc[0] == 1.266678 qc_file = os.path.join(test_output_path,"ENCLB622FZX/ENCLB622FZX.cc.qc")
pass df_xcor = pd.read_csv(qc_file, sep="\t", header=None)
assert df_xcor[2].iloc[0] == '0'
assert df_xcor[8].iloc[0] == 1.347895
@pytest.mark.integration assert df_xcor[9].iloc[0] == 1.970438
def test_map_qc_pairedend():
# Do the same thing for paired end data
pass @pytest.mark.pairedend_mouse
def test_cross_qc_pairedend_mouse():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB122XDP/ENCLB122XDP.cc.plot.pdf'))
@pytest.mark.pairedend_mouse
def test_cross_plot_pairedend_mouse():
qc_file = os.path.join(test_output_path,"ENCLB749GLW/ENCLB749GLW.cc.qc")
df_xcor = pd.read_csv(qc_file, sep="\t", header=None)
assert df_xcor[2].iloc[0] == '0,65,75'
assert df_xcor[8].iloc[0] == 1.55347
assert df_xcor[9].iloc[0] == 1.285233
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment