diff --git a/README.md b/README.md index d969755c58572b202a07035ccc9c5bc74a260835..03622746e8db40f3080e63e886c802447b04ca85 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,11 @@ experimentQC | heatmeap_PearsonCorr.pdf | plot of Pearson correlation between sa experimentQC | sample_mbs.npz | array of multiple BAM summaries crossReads | *.cc.plot.pdf | Plot of cross-correlation to assess signal-to-noise ratios crossReads | *.cc.qc | cross-correlation metrics. File [HEADER](docs/xcor_header.txt) +callPeaksMACS | pooled/*pooled.fc_signal.bw | bigwig data file; raw fold enrichment of sample/control +callPeaksMACS | pooled/*pooled_peaks.xls | Excel file of peaks +callPeaksMACS | pooled/*.pvalue_signal.bw | bigwig data file; sample/control signal adjusted for pvalue significance +callPeaksMACS | pooled/*_pooled.narrowPeak | peaks file; see [HERE](https://genome.ucsc.edu/FAQ/FAQformat.html#format12) for ENCODE narrowPeak header format + ## Common Errors If you find an error, please let the [BICF](mailto:BICF@UTSouthwestern.edu) know and we will add it here. diff --git a/workflow/main.nf b/workflow/main.nf index 551641f8e8c38db62a5a8a8b5105e6231c01fa4a..a35ae3bcb3fcdfbc00ddc454605d18d87b0237c7 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -478,28 +478,53 @@ experimentRows = experimentPoolObjs // Call Peaks using MACS process callPeaksMACS { - tag "$sampleId-$replicate" - publishDir "$baseDir/output/${task.process}", mode: 'copy' + tag "${sampleId}-${replicate}" + publishDir "${outDir}/${task.process}/${experimentId}/${replicate}", mode: 'copy' input: - set sampleId, tagAlign, xcor, experimentId, replicate from experimentRows + set sampleId, tagAlign, xcor, experimentId, replicate from experimentRows output: - - set sampleId, file('*.narrowPeak'), file('*.fc_signal.bw'), file('*.pvalue_signal.bw'), experimentId, replicate into experimentPeaks + set sampleId, file('*.narrowPeak'), file('*.fc_signal.bw'), file('*.pvalue_signal.bw'), experimentId, replicate into experimentPeaks + file '*.xls' into callPeaksMACSsummit + file('version_*.txt') into callPeaksMACSVersions script: + if (params.astrocyte == true) { + if (pairedEnd) { + """ + module load python/3.6.1-2-anaconda + module load macs/2.1.0-20151222 + module load UCSC_userApps/v317 + module load bedtools/2.26.0 + module load phantompeakqualtools/1.2 + python3 ${baseDir}/scripts/call_peaks_macs.py -t ${tagAlign} -x ${xcor} -s ${sampleId} -g ${genomeSize} -z ${chromSizes} -p + """ + } + else { + """ + module load python/3.6.1-2-anaconda + module load macs/2.1.0-20151222 + module load UCSC_userApps/v317 + module load bedtools/2.26.0 + module load phantompeakqualtools/1.2 + python3 ${baseDir}/scripts/call_peaks_macs.py -t ${tagAlign} -x ${xcor} -s ${sampleId} -g ${genomeSize} -z ${chromSizes} + """ + } + } + else { + if (pairedEnd) { + """ + python3 ${baseDir}/scripts/call_peaks_macs.py -t ${tagAlign} -x ${xcor} -s ${sampleId} -g ${genomeSize} -z ${chromSizes} -p + """ + } + else { + """ + python3 ${baseDir}/scripts/call_peaks_macs.py -t ${tagAlign} -x ${xcor} -s ${sampleId} -g ${genomeSize} -z ${chromSizes} + """ + } + } - if (pairedEnd) { - """ - python3 $baseDir/scripts/call_peaks_macs.py -t $tagAlign -x $xcor -s $sampleId -g $genomeSize -z $chromSizes -p -a - """ - } - else { - """ - python3 $baseDir/scripts/call_peaks_macs.py -t $tagAlign -x $xcor -s $sampleId -g $genomeSize -z $chromSizes -a - """ - } } diff --git a/workflow/scripts/call_peaks_macs.py b/workflow/scripts/call_peaks_macs.py index cffea47cb0c8197d225cb377a0426e9940fd520f..7f7d4674984620d401172a6876d6d7591426c59f 100644 --- a/workflow/scripts/call_peaks_macs.py +++ b/workflow/scripts/call_peaks_macs.py @@ -5,8 +5,8 @@ import os import argparse import shutil +import subprocess import logging -from multiprocessing import cpu_count import utils from xcor import xcor as calculate_xcor @@ -38,10 +38,6 @@ def get_args(): help="The cross-correlation file (if already calculated).", required=True) - parser.add_argument('-c', '--con', - help="The control tagAling file used for peak calling.", - default='none') - parser.add_argument('-s', '--sample', help="The sample id to name the peak files.", required=True) @@ -59,11 +55,6 @@ def get_args(): default=False, action='store_true') - parser.add_argument('-a', '--atac', - help="True/False if ATAC-seq or ChIP-seq.", - default=False, - action='store_true') - args = parser.parse_args() return args @@ -75,16 +66,19 @@ def check_tools(): logger.info('Checking for required libraries and components on this system') - r_path = shutil.which("R") - if r_path: - logger.info('Found R: %s', r_path) - else: - logger.error('Missing R') - raise Exception('Missing R') - macs_path = shutil.which("macs2") - if r_path: + if macs_path: logger.info('Found MACS2: %s', macs_path) + + # Get Version + macs_version_command = "macs2 --version" + macs_version = subprocess.check_output(macs_version_command, shell=True, stderr=subprocess.STDOUT) + + # Write to file + macs_file = open("version_macs.txt", "wb") + macs_file.write(macs_version) + macs_file.close() + else: logger.error('Missing MACS2') raise Exception('Missing MACS2') @@ -92,6 +86,18 @@ def check_tools(): bg_bw_path = shutil.which("bedGraphToBigWig") if bg_bw_path: logger.info('Found bedGraphToBigWig: %s', bg_bw_path) + + # Get Version + bg_bw_version_command = "bedGraphToBigWig" + try: + subprocess.check_output(bg_bw_version_command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + bg_bw_version = e.output + + # Write to file + bg_bw_file = open("version_bedGraphToBigWig.txt", "wb") + bg_bw_file.write(bg_bw_version) + bg_bw_file.close() else: logger.error('Missing bedGraphToBigWig') raise Exception('Missing bedGraphToBigWig') @@ -99,37 +105,30 @@ def check_tools(): bedtools_path = shutil.which("bedtools") if bedtools_path: logger.info('Found bedtools: %s', bedtools_path) + + # Get Version + bedtools_version_command = "bedtools --version" + bedtools_version = subprocess.check_output(bedtools_version_command, shell=True) + + # Write to file + bedtools_file = open("version_bedtools.txt", "wb") + bedtools_file.write(bedtools_version) + bedtools_file.close() + else: logger.error('Missing bedtools') raise Exception('Missing bedtools') -def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes, atac): - - # Extract the fragment length estimate from column 3 of the - # cross-correlation scores file - if not atac: - with open(xcor, 'r') as xcor_fh: - firstline = xcor_fh.readline() - frag_lengths = firstline.split()[2] # third column - fragment_length = frag_lengths.split(',')[0] # grab first value - logger.info("Fraglen %s" % (fragment_length)) - else: - fragment_length = int(round(float(73)/2.0)) - logger.info("Fraglen %s" % (fragment_length)) +def call_peaks_macs(experiment, xcor, prefix, genome_size, chrom_sizes): + '''Call peaks and signal tracks''' # Generate narrow peaks and preliminary signal tracks - - if atac: - command = 'macs2 callpeak ' + \ + # For ATAC-seq use a shift of 37 and extsize of 73 + command = 'macs2 callpeak ' + \ '-t %s ' % (experiment) + \ '-f BED -n %s ' % (prefix) + \ - '-g %s -p 1e-2 --nomodel --shift 73 --extsize %s --keep-dup all -B --SPMR' % (genome_size, fragment_length) - else: - command = 'macs2 callpeak ' + \ - '-t %s -c %s ' % (experiment, control) + \ - '-f BED -n %s ' % (prefix) + \ - '-g %s -p 1e-2 --nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' % (genome_size, fragment_length) + '-g %s -p 1e-2 --nomodel --shift 37 --extsize 73 --keep-dup all -B --SPMR' % (genome_size) logger.info(command) returncode = utils.block_on(command) @@ -138,12 +137,11 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes, # MACS2 sometimes calls features off the end of chromosomes. # Remove coordinates outside chromosome sizes - - narrowpeak_fn = '%s_peaks.narrowPeak' % (prefix) + int_narrowpeak_fn = '%s_peaks.narrowPeak' % (prefix) + narrowpeak_fn = '%s.narrowPeak' % (prefix) clipped_narrowpeak_fn = 'clipped-%s' % (narrowpeak_fn) - - steps = ['slopBed -i %s -g %s -b 0' % (narrowpeak_fn, chrom_sizes), + steps = ['slopBed -i %s -g %s -b 0' % (int_narrowpeak_fn, chrom_sizes), 'bedClip stdin %s %s' % (chrom_sizes, clipped_narrowpeak_fn)] out, err = utils.run_pipe(steps) @@ -155,10 +153,9 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes, # Sort by Col8 in descending order and replace long peak names in Column 4 # with Peak_<peakRank> - steps = [ - 'sort -k 8gr,8gr %s' % (rescaled_narrowpeak_fn), - r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""" - ] + steps = ['sort -k 8gr,8gr %s' % (rescaled_narrowpeak_fn), + r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""", + ] out, err = utils.run_pipe(steps, '%s' % (narrowpeak_fn)) @@ -168,10 +165,10 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes, # Col2 (chromosome size in bp). command = 'macs2 bdgcmp ' + \ - '-t %s_treat_pileup.bdg ' % (prefix) + \ - '-c %s_control_lambda.bdg ' % (prefix) + \ - '-o %s_FE.bdg ' % (prefix) + \ - '-m FE' + '-t %s_treat_pileup.bdg ' % (prefix) + \ + '-c %s_control_lambda.bdg ' % (prefix) + \ + '-o %s_FE.bdg ' % (prefix) + \ + '-m FE' logger.info(command) returncode = utils.block_on(command) @@ -193,9 +190,9 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes, # Convert bedgraph to bigwig command = 'bedGraphToBigWig ' + \ - '%s ' % (fc_bedgraph_sorted_fn) + \ - '%s ' % (chrom_sizes) + \ - '%s' % (fc_signal_fn) + '%s ' % (fc_bedgraph_sorted_fn) + \ + '%s ' % (chrom_sizes) + \ + '%s' % (fc_signal_fn) logger.info(command) returncode = utils.block_on(command) @@ -203,31 +200,11 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes, assert returncode == 0, "bedGraphToBigWig non-zero return" # For -log10(p-value) signal tracks - - if atac: - command = 'macs2 bdgcmp ' + \ - '-t %s_treat_pileup.bdg ' % (prefix) + \ - '-c %s_control_lambda.bdg ' % (prefix) + \ - '-o %s_ppois.bdg ' % (prefix) + \ - '-m FE' - else: - - # Compute sval = - # min(no. of reads in ChIP, no. of reads in control) / 1,000,000 - out, err = utils.run_pipe(['gzip -dc %s' % (experiment), 'wc -l']) - chip_reads = out.strip() - out, err = utils.run_pipe(['gzip -dc %s' % (control), 'wc -l']) - control_reads = out.strip() - sval = str(min(float(chip_reads), float(control_reads)) / 1000000) - - logger.info("chip_reads = %s, control_reads = %s, sval = %s" % - (chip_reads, control_reads, sval)) - - command = 'macs2 bdgcmp ' + \ + command = 'macs2 bdgcmp ' + \ '-t %s_treat_pileup.bdg ' % (prefix) + \ '-c %s_control_lambda.bdg ' % (prefix) + \ '-o %s_ppois.bdg ' % (prefix) + \ - '-m ppois -S %s' % (sval) + '-m FE' logger.info(command) returncode = utils.block_on(command) @@ -248,9 +225,9 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes, # Convert bedgraph to bigwig command = 'bedGraphToBigWig ' + \ - '%s ' % (pvalue_bedgraph_sorted_fn) + \ - '%s ' % (chrom_sizes) + \ - '%s' % (pvalue_signal_fn) + '%s ' % (pvalue_bedgraph_sorted_fn) + \ + '%s ' % (chrom_sizes) + \ + '%s' % (pvalue_signal_fn) logger.info(command) returncode = utils.block_on(command) @@ -260,18 +237,16 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes, # Remove temporary files os.remove(clipped_narrowpeak_fn) os.remove(rescaled_narrowpeak_fn) - + os.remove(int_narrowpeak_fn) def main(): args = get_args() tag = args.tag xcor = args.xcor - con = args.con sample = args.sample genome_size = args.genome chrom_size = args.size paired = args.paired - atac = args.atac # Create a file handler handler = logging.FileHandler('call_peaks.log') @@ -281,13 +256,14 @@ def main(): check_tools() # Calculate Cross-correlation if not already calcualted - if xcor == 'Calculate' and not atac: + if xcor == 'Calculate': xcor_file = calculate_xcor(tag, paired) else: xcor_file = xcor + # Call Peaks using MACS2 - call_peaks_macs(tag, xcor_file, con, sample, genome_size, chrom_size, atac) + call_peaks_macs(tag, xcor_file, sample, genome_size, chrom_size) if __name__ == '__main__': diff --git a/workflow/tests/test_call_peaks_macs.py b/workflow/tests/test_call_peaks_macs.py index 2f40429d4c3dfb7d14f12d25fb7cafe09188fdb5..d1686841fa7258086bddbede64a6e51e2d701967 100644 --- a/workflow/tests/test_call_peaks_macs.py +++ b/workflow/tests/test_call_peaks_macs.py @@ -8,15 +8,18 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ '/../output/callPeaksMACS/' -@pytest.mark.integration -def test_call_peaks_macs_singleend(): - #assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT.fc_signal.bw')) - #assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT.pvalue_signal.bw')) - #peak_file = test_output_path + 'ENCLB144FDT_peaks.narrowPeak' - #assert utils.count_lines(peak_file) == 210349 - pass +@pytest.mark.singleend_human +def test_call_peaks_macs_singleend_human(): + assert os.path.exists(os.path.join(test_output_path, 'ENCSR265ZXX/1/', 'ENCLB170KFO.fc_signal.bw')) + assert os.path.exists(os.path.join(test_output_path, 'ENCSR265ZXX/1/', 'ENCLB170KFO.pvalue_signal.bw')) + assert os.path.exists(os.path.join(test_output_path, 'ENCSR265ZXX/1/', 'ENCLB170KFO_peaks.xls')) + peak_file = test_output_path + 'ENCSR265ZXX/1/' + 'ENCLB170KFO.narrowPeak' + assert utils.count_lines(peak_file) == 287210 -@pytest.mark.integration -def test_call_peaks_macs_pairedend(): - # Do the same thing for paired end data - pass +@pytest.mark.pairedend_mouse +def test_call_peaks_macs_pairedend_mouse(): + assert os.path.exists(os.path.join(test_output_path, 'ENCSR451NAE/1/', 'ENCLB749GLW.fc_signal.bw')) + assert os.path.exists(os.path.join(test_output_path, 'ENCSR451NAE/1/', 'ENCLB749GLW.pvalue_signal.bw')) + assert os.path.exists(os.path.join(test_output_path, 'ENCSR451NAE/1/', 'ENCLB749GLW_peaks.xls')) + peak_file = test_output_path + 'ENCSR451NAE/1/' + 'ENCLB749GLW.narrowPeak' + assert utils.count_lines(peak_file) == 487955