Skip to content
Snippets Groups Projects
Commit a2609dc8 authored by Venkat Malladi's avatar Venkat Malladi
Browse files

Add in naive overlap peaks.

parent 0ee7bca0
Branches
Tags
No related merge requests found
......@@ -43,6 +43,10 @@ process {
module = ['python/3.6.1-2-anaconda', 'phantompeakqualtools/1.2', 'macs/2.1.0-20151222', 'UCSC_userApps/v317', 'bedtools/2.26.0']
cpus = 32
}
$callPeaksMACS {
module = ['python/3.6.1-2-anaconda', 'bedtools/2.26.0']
cpus = local
}
}
params {
......
......@@ -170,11 +170,12 @@ process filterReads {
}
// Define channel collecting dedup reads intp new design file
dedupDesign = dedupReads
.map{ sampleId, bam, bai, experimentId, biosample, factor, treatment, replicate, controlId ->
"$sampleId\t$bam\t$bai\texperimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"}
.collectFile(name:'design_dedup.tsv', seed:"sample_id\tbam_reads\tbam_index\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design")
// Define channel collecting dedup reads into new design file
dedupReads
.map{ sampleId, bam, bai, experimentId, biosample, factor, treatment, replicate, controlId ->
"$sampleId\t$bam\t$bai\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"}
.collectFile(name:'design_dedup.tsv', seed:"sample_id\tbam_reads\tbam_index\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design")
.into { dedupDesign; preDiffDesign }
// Quality Metrics using deeptools
process experimentQC {
......@@ -345,3 +346,32 @@ process callPeaksMACS {
}
}
// Define channel collecting peaks into design file
peaksDesign = experimentRows
.map{ sampleId, peak, fcSignal, pvalueSignal, experimentId, biosample, factor, treatment, replicate, controlId ->
"$sampleId\t$peak\t$fcSignal\t$pvalueSignal\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"}
.collectFile(name:'design_peak.tsv', seed:"sample_id\tpeak\txcor\tfcSignal\tpvalueSignal\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design")
// Calculate Consensus Peaks
process consensusPeaks {
publishDir "$baseDir/output/${task.process}", mode: 'copy'
input:
file peaksDesign
file preDiffDesign
output:
file '*.narrowPeak' into consensusPeaks
file("design_diffPeaks.tsv") into designFilePaths
script:
"""
python3 $baseDir/scripts/overlap_peaks.py -d $peaksDesign -f preDiffDesign
"""
}
#!/usr/bin/env python3
'''Generate naive overlap peak files and design file for downstream processing.'''
import argparse
import logging
import pandas as pd
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-d', '--design',
help="The design file of peaks (tsv format).",
required=True)
parser.add_argument('-f', '--files',
help="The design file of with bam files (tsv format).",
required=True)
args = parser.parse_args()
return args
def check_tools():
'''Checks for required componenets on user system.'''
logger.info('Checking for required libraries and components on this system')
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
def update_design(design):
'''Update design file for diffBind and remove controls.'''
logger.info("Running control file update.")
file_dict = design[['sample_id', 'bam_reads']] \
.set_index('sample_id').T.to_dict()
design['control_bam_reads'] = design['control_id'] \
.apply(lambda x: file_dict[x]['bam_reads'])
logger.info("Removing rows that are there own control.")
design = design[design['control_id'] != design['sample_id']]
logger.info("Removing columns that are there own control.")
design = design.drop('bam_index', axis=1)
logger.info("Adding peaks column.")
design = design.assign(peak = "", peak_caller = 'bed')
return design
def overlap(experiment, design):
'''Calculate the overlap of peaks'''
logger.info("Determining consenus peaks for experiment %s.", experiment)
# Output File names
peak_type = 'narrowPeak'
overlapping_peaks_fn = '%s.replicated.%s' %(experiment, peak_type)
rejected_peaks_fn = '%s.rejected.%s' %(experiment, peak_type)
# Intermediate File names
overlap_tr_fn = 'replicated_tr.%s' %(peak_type)
overlap_pr_fn = 'replicated_pr.%s' %(peak_type)
# Assign Pooled and Psuedoreplicate peaks
pool_peaks = design.loc[design.replicate == 'pooled', 'peak']
pr1_peaks = design.loc[design.replicate == '1_pr', 'peak']
pr2_peaks = design.loc[design.replicate == '2_pr', 'peak']
# Remove non true replicate rows
not_replicates = ['1_pr', '2_pr', 'pooled']
design_true_reps = design[~design['replicate'].isin(not_replicates)]
true_rep_peaks = design_true_reps.peaks.unique()
# Find overlaps
awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
cut_command = 'cut -f 1-10'
# Find pooled peaks that overlap Rep1 and Rep2
# where overlap is defined as the fractional overlap
# with any one of the overlapping peak pairs >= 0.5
steps_true = ['intersectBed -wo -a %s -b %s' %(pool_peaks, true_rep_peaks[0]),
awk_command,
cut_command,
'sort -u']
iter_true_peaks = iter(true_rep_peaks)
next(iter_true_peaks)
if len(true_rep_peaks) > 1:
for true_peak in true_rep_peaks[1:]:
steps_true.extend(['intersectBed -wo -a stdin -b %s' %(true_peak),
awk_command,
cut_command,
'sort -u'])
out, err = utils.run_pipe(steps_true, outfile=overlap_tr_fn)
print "%d peaks overlap with both true replicates" %(utils.count_lines(overlap_tr_fn))
# Find pooled peaks that overlap PseudoRep1 and PseudoRep2
# where overlap is defined as the fractional overlap
# with any one of the overlapping peak pairs >= 0.5
steps_pseudo = ['intersectBed -wo -a %s -b %s' %(pool_peaks, pr1_peaks),
awk_command,
cut_command,
'sort -u',
'intersectBed -wo -a stdin -b %s' %(pr2_peaks),
awk_command,
cut_command,
'sort -u']
out, err = utils.run_pipe(steps_pseudo, outfile=overlap_pr_fn)
print "%d peaks overlap with both pooled pseudoreplicates" \
% (utils.count_lines(overlap_pr_fn))
# Make union of peak lists
out, err = utils.run_pipe([
'cat %s %s' %(overlap_tr_fn, overlap_pr_fn),
'sort -u'
], overlapping_peaks_fn)
print "%d peaks overlap with true replicates or with pooled pseudorepliates" \
% (utils.count_lines(overlapping_peaks_fn))
# Make rejected peak list
out, err = utils.run_pipe([
'intersectBed -wa -v -a %s -b %s' %(pool_peaks, overlapping_peaks_fn)
], rejected_peaks_fn)
print "%d peaks were rejected" %(utils.count_lines(rejected_peaks_fn))
# Remove temporary files
os.remove(overlap_tr_fn)
os.remove(overlap_pr_fn)
return overlapping_peaks_fn
def main():
args = get_args()
design = args.design
files = args.design
# Create a file handler
handler = logging.FileHandler('consensus_peaks.log')
logger.addHandler(handler)
# Read files as dataframes
design_peaks_df = pd.read_csv(design, sep='\t')
design_files_df = pd.read_csv(files, sep='\t')
# Make a design file for
design_diff = update_design(design_files_df)
# Find consenus overlap peaks for each experiment
for experiment, df_experiment in design_peaks_df.groupby('experiment_id'):
replicated_peak = overlap(experiment, df_experiment)
design_diff.loc[design_diff.experiment_id == experiment, "peaks"] = replicated_peak
# Write out file
design_diff.columns = ['SampleID',
'bamReads',
'Condition',
'Tissue',
'Factor',
'Treatment',
'Replicate',
'ControlID',
'bamControl',
'Peaks',
'PeakCaller']
design_diff.to_csv("design_diffPeaks.tsv", header=True, sep='\t', index=False)
if __name__ == '__main__':
main()
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment