Skip to content
Snippets Groups Projects
Commit 6e39fc29 authored by Holly Ruess's avatar Holly Ruess
Browse files

fix MACS

parent 586ddd0c
1 merge request!13Resolve "fix consensus peaks"
Pipeline #5504 failed with stages
in 6 hours, 33 minutes, and 47 seconds
......@@ -74,6 +74,11 @@ experimentQC | heatmeap_PearsonCorr.pdf | plot of Pearson correlation between sa
experimentQC | sample_mbs.npz | array of multiple BAM summaries
crossReads | *.cc.plot.pdf | Plot of cross-correlation to assess signal-to-noise ratios
crossReads | *.cc.qc | cross-correlation metrics. File [HEADER](docs/xcor_header.txt)
callPeaksMACS | pooled/*pooled.fc_signal.bw | bigwig data file; raw fold enrichment of sample/control
callPeaksMACS | pooled/*pooled_peaks.xls | Excel file of peaks
callPeaksMACS | pooled/*.pvalue_signal.bw | bigwig data file; sample/control signal adjusted for pvalue significance
callPeaksMACS | pooled/*_pooled.narrowPeak | peaks file; see [HERE](https://genome.ucsc.edu/FAQ/FAQformat.html#format12) for ENCODE narrowPeak header format
## Common Errors
If you find an error, please let the [BICF](mailto:BICF@UTSouthwestern.edu) know and we will add it here.
......
......@@ -478,28 +478,53 @@ experimentRows = experimentPoolObjs
// Call Peaks using MACS
process callPeaksMACS {
tag "$sampleId-$replicate"
publishDir "$baseDir/output/${task.process}", mode: 'copy'
tag "${sampleId}-${replicate}"
publishDir "${outDir}/${task.process}/${experimentId}/${replicate}", mode: 'copy'
input:
set sampleId, tagAlign, xcor, experimentId, replicate from experimentRows
set sampleId, tagAlign, xcor, experimentId, replicate from experimentRows
output:
set sampleId, file('*.narrowPeak'), file('*.fc_signal.bw'), file('*.pvalue_signal.bw'), experimentId, replicate into experimentPeaks
set sampleId, file('*.narrowPeak'), file('*.fc_signal.bw'), file('*.pvalue_signal.bw'), experimentId, replicate into experimentPeaks
file '*.xls' into callPeaksMACSsummit
file('version_*.txt') into callPeaksMACSVersions
script:
if (params.astrocyte == true) {
if (pairedEnd) {
"""
module load python/3.6.1-2-anaconda
module load macs/2.1.0-20151222
module load UCSC_userApps/v317
module load bedtools/2.26.0
module load phantompeakqualtools/1.2
python3 ${baseDir}/scripts/call_peaks_macs.py -t ${tagAlign} -x ${xcor} -s ${sampleId} -g ${genomeSize} -z ${chromSizes} -p
"""
}
else {
"""
module load python/3.6.1-2-anaconda
module load macs/2.1.0-20151222
module load UCSC_userApps/v317
module load bedtools/2.26.0
module load phantompeakqualtools/1.2
python3 ${baseDir}/scripts/call_peaks_macs.py -t ${tagAlign} -x ${xcor} -s ${sampleId} -g ${genomeSize} -z ${chromSizes}
"""
}
}
else {
if (pairedEnd) {
"""
python3 ${baseDir}/scripts/call_peaks_macs.py -t ${tagAlign} -x ${xcor} -s ${sampleId} -g ${genomeSize} -z ${chromSizes} -p
"""
}
else {
"""
python3 ${baseDir}/scripts/call_peaks_macs.py -t ${tagAlign} -x ${xcor} -s ${sampleId} -g ${genomeSize} -z ${chromSizes}
"""
}
}
if (pairedEnd) {
"""
python3 $baseDir/scripts/call_peaks_macs.py -t $tagAlign -x $xcor -s $sampleId -g $genomeSize -z $chromSizes -p -a
"""
}
else {
"""
python3 $baseDir/scripts/call_peaks_macs.py -t $tagAlign -x $xcor -s $sampleId -g $genomeSize -z $chromSizes -a
"""
}
}
......
......@@ -5,8 +5,8 @@
import os
import argparse
import shutil
import subprocess
import logging
from multiprocessing import cpu_count
import utils
from xcor import xcor as calculate_xcor
......@@ -38,10 +38,6 @@ def get_args():
help="The cross-correlation file (if already calculated).",
required=True)
parser.add_argument('-c', '--con',
help="The control tagAling file used for peak calling.",
default='none')
parser.add_argument('-s', '--sample',
help="The sample id to name the peak files.",
required=True)
......@@ -59,11 +55,6 @@ def get_args():
default=False,
action='store_true')
parser.add_argument('-a', '--atac',
help="True/False if ATAC-seq or ChIP-seq.",
default=False,
action='store_true')
args = parser.parse_args()
return args
......@@ -75,16 +66,19 @@ def check_tools():
logger.info('Checking for required libraries and components on this system')
r_path = shutil.which("R")
if r_path:
logger.info('Found R: %s', r_path)
else:
logger.error('Missing R')
raise Exception('Missing R')
macs_path = shutil.which("macs2")
if r_path:
if macs_path:
logger.info('Found MACS2: %s', macs_path)
# Get Version
macs_version_command = "macs2 --version"
macs_version = subprocess.check_output(macs_version_command, shell=True, stderr=subprocess.STDOUT)
# Write to file
macs_file = open("version_macs.txt", "wb")
macs_file.write(macs_version)
macs_file.close()
else:
logger.error('Missing MACS2')
raise Exception('Missing MACS2')
......@@ -92,6 +86,18 @@ def check_tools():
bg_bw_path = shutil.which("bedGraphToBigWig")
if bg_bw_path:
logger.info('Found bedGraphToBigWig: %s', bg_bw_path)
# Get Version
bg_bw_version_command = "bedGraphToBigWig"
try:
subprocess.check_output(bg_bw_version_command, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
bg_bw_version = e.output
# Write to file
bg_bw_file = open("version_bedGraphToBigWig.txt", "wb")
bg_bw_file.write(bg_bw_version)
bg_bw_file.close()
else:
logger.error('Missing bedGraphToBigWig')
raise Exception('Missing bedGraphToBigWig')
......@@ -99,37 +105,30 @@ def check_tools():
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes, atac):
# Extract the fragment length estimate from column 3 of the
# cross-correlation scores file
if not atac:
with open(xcor, 'r') as xcor_fh:
firstline = xcor_fh.readline()
frag_lengths = firstline.split()[2] # third column
fragment_length = frag_lengths.split(',')[0] # grab first value
logger.info("Fraglen %s" % (fragment_length))
else:
fragment_length = int(round(float(73)/2.0))
logger.info("Fraglen %s" % (fragment_length))
def call_peaks_macs(experiment, xcor, prefix, genome_size, chrom_sizes):
'''Call peaks and signal tracks'''
# Generate narrow peaks and preliminary signal tracks
if atac:
command = 'macs2 callpeak ' + \
# For ATAC-seq use a shift of 37 and extsize of 73
command = 'macs2 callpeak ' + \
'-t %s ' % (experiment) + \
'-f BED -n %s ' % (prefix) + \
'-g %s -p 1e-2 --nomodel --shift 73 --extsize %s --keep-dup all -B --SPMR' % (genome_size, fragment_length)
else:
command = 'macs2 callpeak ' + \
'-t %s -c %s ' % (experiment, control) + \
'-f BED -n %s ' % (prefix) + \
'-g %s -p 1e-2 --nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' % (genome_size, fragment_length)
'-g %s -p 1e-2 --nomodel --shift 37 --extsize 73 --keep-dup all -B --SPMR' % (genome_size)
logger.info(command)
returncode = utils.block_on(command)
......@@ -138,12 +137,11 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes,
# MACS2 sometimes calls features off the end of chromosomes.
# Remove coordinates outside chromosome sizes
narrowpeak_fn = '%s_peaks.narrowPeak' % (prefix)
int_narrowpeak_fn = '%s_peaks.narrowPeak' % (prefix)
narrowpeak_fn = '%s.narrowPeak' % (prefix)
clipped_narrowpeak_fn = 'clipped-%s' % (narrowpeak_fn)
steps = ['slopBed -i %s -g %s -b 0' % (narrowpeak_fn, chrom_sizes),
steps = ['slopBed -i %s -g %s -b 0' % (int_narrowpeak_fn, chrom_sizes),
'bedClip stdin %s %s' % (chrom_sizes, clipped_narrowpeak_fn)]
out, err = utils.run_pipe(steps)
......@@ -155,10 +153,9 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes,
# Sort by Col8 in descending order and replace long peak names in Column 4
# with Peak_<peakRank>
steps = [
'sort -k 8gr,8gr %s' % (rescaled_narrowpeak_fn),
r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'"""
]
steps = ['sort -k 8gr,8gr %s' % (rescaled_narrowpeak_fn),
r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
]
out, err = utils.run_pipe(steps, '%s' % (narrowpeak_fn))
......@@ -168,10 +165,10 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes,
# Col2 (chromosome size in bp).
command = 'macs2 bdgcmp ' + \
'-t %s_treat_pileup.bdg ' % (prefix) + \
'-c %s_control_lambda.bdg ' % (prefix) + \
'-o %s_FE.bdg ' % (prefix) + \
'-m FE'
'-t %s_treat_pileup.bdg ' % (prefix) + \
'-c %s_control_lambda.bdg ' % (prefix) + \
'-o %s_FE.bdg ' % (prefix) + \
'-m FE'
logger.info(command)
returncode = utils.block_on(command)
......@@ -193,9 +190,9 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes,
# Convert bedgraph to bigwig
command = 'bedGraphToBigWig ' + \
'%s ' % (fc_bedgraph_sorted_fn) + \
'%s ' % (chrom_sizes) + \
'%s' % (fc_signal_fn)
'%s ' % (fc_bedgraph_sorted_fn) + \
'%s ' % (chrom_sizes) + \
'%s' % (fc_signal_fn)
logger.info(command)
returncode = utils.block_on(command)
......@@ -203,31 +200,11 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes,
assert returncode == 0, "bedGraphToBigWig non-zero return"
# For -log10(p-value) signal tracks
if atac:
command = 'macs2 bdgcmp ' + \
'-t %s_treat_pileup.bdg ' % (prefix) + \
'-c %s_control_lambda.bdg ' % (prefix) + \
'-o %s_ppois.bdg ' % (prefix) + \
'-m FE'
else:
# Compute sval =
# min(no. of reads in ChIP, no. of reads in control) / 1,000,000
out, err = utils.run_pipe(['gzip -dc %s' % (experiment), 'wc -l'])
chip_reads = out.strip()
out, err = utils.run_pipe(['gzip -dc %s' % (control), 'wc -l'])
control_reads = out.strip()
sval = str(min(float(chip_reads), float(control_reads)) / 1000000)
logger.info("chip_reads = %s, control_reads = %s, sval = %s" %
(chip_reads, control_reads, sval))
command = 'macs2 bdgcmp ' + \
command = 'macs2 bdgcmp ' + \
'-t %s_treat_pileup.bdg ' % (prefix) + \
'-c %s_control_lambda.bdg ' % (prefix) + \
'-o %s_ppois.bdg ' % (prefix) + \
'-m ppois -S %s' % (sval)
'-m FE'
logger.info(command)
returncode = utils.block_on(command)
......@@ -248,9 +225,9 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes,
# Convert bedgraph to bigwig
command = 'bedGraphToBigWig ' + \
'%s ' % (pvalue_bedgraph_sorted_fn) + \
'%s ' % (chrom_sizes) + \
'%s' % (pvalue_signal_fn)
'%s ' % (pvalue_bedgraph_sorted_fn) + \
'%s ' % (chrom_sizes) + \
'%s' % (pvalue_signal_fn)
logger.info(command)
returncode = utils.block_on(command)
......@@ -260,18 +237,16 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes,
# Remove temporary files
os.remove(clipped_narrowpeak_fn)
os.remove(rescaled_narrowpeak_fn)
os.remove(int_narrowpeak_fn)
def main():
args = get_args()
tag = args.tag
xcor = args.xcor
con = args.con
sample = args.sample
genome_size = args.genome
chrom_size = args.size
paired = args.paired
atac = args.atac
# Create a file handler
handler = logging.FileHandler('call_peaks.log')
......@@ -281,13 +256,14 @@ def main():
check_tools()
# Calculate Cross-correlation if not already calcualted
if xcor == 'Calculate' and not atac:
if xcor == 'Calculate':
xcor_file = calculate_xcor(tag, paired)
else:
xcor_file = xcor
# Call Peaks using MACS2
call_peaks_macs(tag, xcor_file, con, sample, genome_size, chrom_size, atac)
call_peaks_macs(tag, xcor_file, sample, genome_size, chrom_size)
if __name__ == '__main__':
......
......@@ -8,15 +8,18 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../output/callPeaksMACS/'
@pytest.mark.integration
def test_call_peaks_macs_singleend():
#assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT.fc_signal.bw'))
#assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT.pvalue_signal.bw'))
#peak_file = test_output_path + 'ENCLB144FDT_peaks.narrowPeak'
#assert utils.count_lines(peak_file) == 210349
pass
@pytest.mark.singleend_human
def test_call_peaks_macs_singleend_human():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR265ZXX/1/', 'ENCLB170KFO.fc_signal.bw'))
assert os.path.exists(os.path.join(test_output_path, 'ENCSR265ZXX/1/', 'ENCLB170KFO.pvalue_signal.bw'))
assert os.path.exists(os.path.join(test_output_path, 'ENCSR265ZXX/1/', 'ENCLB170KFO_peaks.xls'))
peak_file = test_output_path + 'ENCSR265ZXX/1/' + 'ENCLB170KFO.narrowPeak'
assert utils.count_lines(peak_file) == 287210
@pytest.mark.integration
def test_call_peaks_macs_pairedend():
# Do the same thing for paired end data
pass
@pytest.mark.pairedend_mouse
def test_call_peaks_macs_pairedend_mouse():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR451NAE/1/', 'ENCLB749GLW.fc_signal.bw'))
assert os.path.exists(os.path.join(test_output_path, 'ENCSR451NAE/1/', 'ENCLB749GLW.pvalue_signal.bw'))
assert os.path.exists(os.path.join(test_output_path, 'ENCSR451NAE/1/', 'ENCLB749GLW_peaks.xls'))
peak_file = test_output_path + 'ENCSR451NAE/1/' + 'ENCLB749GLW.narrowPeak'
assert utils.count_lines(peak_file) == 487955
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment