Skip to content
Snippets Groups Projects
Commit 225c1fd0 authored by Venkat Malladi's avatar Venkat Malladi
Browse files

Fix errors in overlap peaks and design generation.

parent 163c967f
No related merge requests found
...@@ -354,7 +354,7 @@ process callPeaksMACS { ...@@ -354,7 +354,7 @@ process callPeaksMACS {
peaksDesign = experimentPeaks peaksDesign = experimentPeaks
.map{ sampleId, peak, fcSignal, pvalueSignal, experimentId, biosample, factor, treatment, replicate, controlId -> .map{ sampleId, peak, fcSignal, pvalueSignal, experimentId, biosample, factor, treatment, replicate, controlId ->
"$sampleId\t$peak\t$fcSignal\t$pvalueSignal\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"} "$sampleId\t$peak\t$fcSignal\t$pvalueSignal\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"}
.collectFile(name:'design_peak.tsv', seed:"sample_id\tpeak\txcor\tfcSignal\tpvalueSignal\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design") .collectFile(name:'design_peak.tsv', seed:"sample_id\tpeaks\tfc_signal\tpvalue_signal\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design")
// Calculate Consensus Peaks // Calculate Consensus Peaks
process consensusPeaks { process consensusPeaks {
......
...@@ -2,9 +2,12 @@ ...@@ -2,9 +2,12 @@
'''Generate naive overlap peak files and design file for downstream processing.''' '''Generate naive overlap peak files and design file for downstream processing.'''
import os
import argparse import argparse
import logging import logging
import shutil
import pandas as pd import pandas as pd
import utils
EPILOG = ''' EPILOG = '''
For more details: For more details:
...@@ -72,7 +75,7 @@ def update_design(design): ...@@ -72,7 +75,7 @@ def update_design(design):
logger.info("Adding peaks column.") logger.info("Adding peaks column.")
design = design.assign(peak = "", peak_caller = 'bed') design = design.assign(peak='', peak_caller='bed')
return design return design
...@@ -84,17 +87,17 @@ def overlap(experiment, design): ...@@ -84,17 +87,17 @@ def overlap(experiment, design):
# Output File names # Output File names
peak_type = 'narrowPeak' peak_type = 'narrowPeak'
overlapping_peaks_fn = '%s.replicated.%s' %(experiment, peak_type) overlapping_peaks_fn = '%s.replicated.%s' % (experiment, peak_type)
rejected_peaks_fn = '%s.rejected.%s' %(experiment, peak_type) rejected_peaks_fn = '%s.rejected.%s' % (experiment, peak_type)
# Intermediate File names # Intermediate File names
overlap_tr_fn = 'replicated_tr.%s' %(peak_type) overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
overlap_pr_fn = 'replicated_pr.%s' %(peak_type) overlap_pr_fn = 'replicated_pr.%s' % (peak_type)
# Assign Pooled and Psuedoreplicate peaks # Assign Pooled and Psuedoreplicate peaks
pool_peaks = design.loc[design.replicate == 'pooled', 'peak'] pool_peaks = design.loc[design.replicate == 'pooled', 'peaks'].values[0]
pr1_peaks = design.loc[design.replicate == '1_pr', 'peak'] pr1_peaks = design.loc[design.replicate == '1_pr', 'peaks'].values[0]
pr2_peaks = design.loc[design.replicate == '2_pr', 'peak'] pr2_peaks = design.loc[design.replicate == '2_pr', 'peaks'].values[0]
# Remove non true replicate rows # Remove non true replicate rows
not_replicates = ['1_pr', '2_pr', 'pooled'] not_replicates = ['1_pr', '2_pr', 'pooled']
...@@ -102,14 +105,14 @@ def overlap(experiment, design): ...@@ -102,14 +105,14 @@ def overlap(experiment, design):
true_rep_peaks = design_true_reps.peaks.unique() true_rep_peaks = design_true_reps.peaks.unique()
# Find overlaps # Find overlaps
awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'""" awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
cut_command = 'cut -f 1-10' cut_command = 'cut -f 1-10'
# Find pooled peaks that overlap Rep1 and Rep2 # Find pooled peaks that overlap Rep1 and Rep2
# where overlap is defined as the fractional overlap # where overlap is defined as the fractional overlap
# with any one of the overlapping peak pairs >= 0.5 # with any one of the overlapping peak pairs >= 0.5
steps_true = ['intersectBed -wo -a %s -b %s' %(pool_peaks, true_rep_peaks[0]), steps_true = ['intersectBed -wo -a %s -b %s' % (pool_peaks, true_rep_peaks[0]),
awk_command, awk_command,
cut_command, cut_command,
'sort -u'] 'sort -u']
...@@ -119,44 +122,45 @@ def overlap(experiment, design): ...@@ -119,44 +122,45 @@ def overlap(experiment, design):
if len(true_rep_peaks) > 1: if len(true_rep_peaks) > 1:
for true_peak in true_rep_peaks[1:]: for true_peak in true_rep_peaks[1:]:
steps_true.extend(['intersectBed -wo -a stdin -b %s' %(true_peak), steps_true.extend(['intersectBed -wo -a stdin -b %s' % (true_peak),
awk_command, awk_command,
cut_command, cut_command,
'sort -u']) 'sort -u'])
out, err = utils.run_pipe(steps_true, outfile=overlap_tr_fn) out, err = utils.run_pipe(steps_true, outfile=overlap_tr_fn)
print "%d peaks overlap with both true replicates" %(utils.count_lines(overlap_tr_fn)) print("%d peaks overlap with both true replicates" %
(utils.count_lines(overlap_tr_fn)))
# Find pooled peaks that overlap PseudoRep1 and PseudoRep2 # Find pooled peaks that overlap PseudoRep1 and PseudoRep2
# where overlap is defined as the fractional overlap # where overlap is defined as the fractional overlap
# with any one of the overlapping peak pairs >= 0.5 # with any one of the overlapping peak pairs >= 0.5
steps_pseudo = ['intersectBed -wo -a %s -b %s' %(pool_peaks, pr1_peaks), steps_pseudo = ['intersectBed -wo -a %s -b %s' % (pool_peaks, pr1_peaks),
awk_command, awk_command,
cut_command, cut_command,
'sort -u', 'sort -u',
'intersectBed -wo -a stdin -b %s' %(pr2_peaks), 'intersectBed -wo -a stdin -b %s' % (pr2_peaks),
awk_command, awk_command,
cut_command, cut_command,
'sort -u'] 'sort -u']
out, err = utils.run_pipe(steps_pseudo, outfile=overlap_pr_fn) out, err = utils.run_pipe(steps_pseudo, outfile=overlap_pr_fn)
print "%d peaks overlap with both pooled pseudoreplicates" \ print("%d peaks overlap with both pooled pseudoreplicates"
% (utils.count_lines(overlap_pr_fn)) % (utils.count_lines(overlap_pr_fn)))
# Make union of peak lists # Make union of peak lists
out, err = utils.run_pipe([ out, err = utils.run_pipe([
'cat %s %s' %(overlap_tr_fn, overlap_pr_fn), 'cat %s %s' % (overlap_tr_fn, overlap_pr_fn),
'sort -u' 'sort -u'
], overlapping_peaks_fn) ], overlapping_peaks_fn)
print "%d peaks overlap with true replicates or with pooled pseudorepliates" \ print("%d peaks overlap with true replicates or with pooled pseudorepliates"
% (utils.count_lines(overlapping_peaks_fn)) % (utils.count_lines(overlapping_peaks_fn)))
# Make rejected peak list # Make rejected peak list
out, err = utils.run_pipe([ out, err = utils.run_pipe([
'intersectBed -wa -v -a %s -b %s' %(pool_peaks, overlapping_peaks_fn) 'intersectBed -wa -v -a %s -b %s' % (pool_peaks, overlapping_peaks_fn)
], rejected_peaks_fn) ], rejected_peaks_fn)
print "%d peaks were rejected" %(utils.count_lines(rejected_peaks_fn)) print("%d peaks were rejected" % (utils.count_lines(rejected_peaks_fn)))
# Remove temporary files # Remove temporary files
os.remove(overlap_tr_fn) os.remove(overlap_tr_fn)
...@@ -168,14 +172,14 @@ def overlap(experiment, design): ...@@ -168,14 +172,14 @@ def overlap(experiment, design):
def main(): def main():
args = get_args() args = get_args()
design = args.design design = args.design
files = args.design files = args.files
# Create a file handler # Create a file handler
handler = logging.FileHandler('consensus_peaks.log') handler = logging.FileHandler('consensus_peaks.log')
logger.addHandler(handler) logger.addHandler(handler)
# Read files as dataframes # Read files as dataframes
design_peaks_df = pd.read_csv(design, sep='\t') design_peaks_df = pd.read_csv('../design/design_peak.tsv', sep='\t')
design_files_df = pd.read_csv(files, sep='\t') design_files_df = pd.read_csv(files, sep='\t')
# Make a design file for # Make a design file for
...@@ -184,7 +188,7 @@ def main(): ...@@ -184,7 +188,7 @@ def main():
# Find consenus overlap peaks for each experiment # Find consenus overlap peaks for each experiment
for experiment, df_experiment in design_peaks_df.groupby('experiment_id'): for experiment, df_experiment in design_peaks_df.groupby('experiment_id'):
replicated_peak = overlap(experiment, df_experiment) replicated_peak = overlap(experiment, df_experiment)
design_diff.loc[design_diff.experiment_id == experiment, "peaks"] = replicated_peak design_diff.loc[design_diff.experiment_id == experiment, "peak"] = replicated_peak
# Write out file # Write out file
design_diff.columns = ['SampleID', design_diff.columns = ['SampleID',
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment