diff --git a/astrocyte_pkg.yml b/astrocyte_pkg.yml
index 62c62971b76222f9bcd7786906b10d450ad3a5b8..7db5da7e1e95561cd005a3b725ddc8633fb76e13 100644
--- a/astrocyte_pkg.yml
+++ b/astrocyte_pkg.yml
@@ -48,6 +48,9 @@ workflow_modules:
   - 'sambamba/0.6.6'
   - 'bedtools/2.26.0'
   - 'deeptools/2.5.0.1'
+  - 'phantompeakqualtools/1.2'
+  - 'macs/2.1.0-20151222'
+  - 'UCSC_userApps/v317'
 
 # A list of parameters used by the workflow, defining how to present them,
 # options etc in the web interface. For each parameter:
diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config
index 74795c41b9035e8a9bf8241ac0fcd54cfcd1449c..d27698eaef3e1fc2f086eba759d37580f21136e1 100644
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -37,21 +37,41 @@ process {
   }
   $poolAndPsuedoReads {
     module = ['python/3.6.1-2-anaconda']
+    executor = 'local'
+  }
+  $callPeaksMACS {
+    module = ['python/3.6.1-2-anaconda', 'phantompeakqualtools/1.2', 'macs/2.1.0-20151222', 'UCSC_userApps/v317', 'bedtools/2.26.0']
     cpus = 32
   }
+  $consensusPeaks {
+    module = ['python/3.6.1-2-anaconda', 'bedtools/2.26.0']
+    executor = 'local'
+  }
 }

params {
  // Reference file paths on BioHPC
  genomes {
-    'GRCh38' { bwa = '/project/shared/bicf_workflow_ref/GRCh38' }
-    'GRCh37' { bwa = '/project/shared/bicf_workflow_ref/GRCh37' }
-    'GRCm38' { bwa = '/project/shared/bicf_workflow_ref/GRCm38' }
+    'GRCh38' {
+      bwa = '/project/shared/bicf_workflow_ref/GRCh38'
+      genomesize = 'hs'
+      chromsizes = '/project/shared/bicf_workflow_ref/GRCh38/genomefile.txt'
+    }
+    'GRCh37' {
+      bwa = '/project/shared/bicf_workflow_ref/GRCh37'
+      genomesize = 'hs'
+      chromsizes = '/project/shared/bicf_workflow_ref/GRCh37/genomefile.txt'
+    }
+    'GRCm38' {
+      bwa = '/project/shared/bicf_workflow_ref/GRCm38'
+      genomesize = 'mm'
+      chromsizes = '/project/shared/bicf_workflow_ref/GRCm38/genomefile.txt'
+    }
  }
}

trace {
-  enabled = true
-  file = 'pipeline_trace.txt'
-  fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss'
+  enabled = true
+  file = 'pipeline_trace.txt'
+  fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss'
}
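The new genomesize/chromsizes keys feed MACS2's -g option and the UCSC tools respectively. The chromsizes path should point at a two-column, tab-separated file (chromosome name, size in bp), the format bedClip and bedGraphToBigWig expect downstream. A minimal sketch of a reader for that format, useful for sanity-checking a genomefile.txt by hand (the path below is illustrative):

    # Sketch: parse a UCSC-style chrom sizes file (two tab-separated
    # columns: chromosome name, length in bp). Path is illustrative.
    def read_chrom_sizes(path='genomefile.txt'):
        sizes = {}
        with open(path) as fh:
            for line in fh:
                chrom, size = line.strip().split('\t')
                sizes[chrom] = int(size)
        return sizes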
diff --git a/workflow/main.nf b/workflow/main.nf
index 87dc34b1c6107ebefedda274a19a7a57429443fe..f96063110812e0b17032310cbf6a8d4b7de95b2e 100644
--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -10,6 +10,8 @@ params.designFile = "$baseDir/../test_data/design_ENCSR238SGC_SE.txt"
 params.genome = 'GRCm38'
 params.genomes = []
 params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false
+params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ?: false : false
+params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false
 params.cutoffRatio = 1.2
 
 // Check inputs
@@ -31,6 +33,8 @@ readsList = Channel
 // Define regular variables
 pairedEnd = params.pairedEnd
 designFile = params.designFile
+genomeSize = params.genomeSize
+chromSizes = params.chromSizes
 cutoffRatio = params.cutoffRatio
 
 process checkDesignFile {
@@ -166,11 +170,12 @@
 }
 
-// Define channel collecting dedup reads intp new design file
-dedupDesign = dedupReads
-  .map{ sampleId, bam, bai, experimentId, biosample, factor, treatment, replicate, controlId ->
-  "$sampleId\t$bam\t$bai\texperimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"}
-  .collectFile(name:'design_dedup.tsv', seed:"sample_id\tbam_reads\tbam_index\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design")
+// Define channel collecting dedup reads into new design file
+dedupReads
+  .map{ sampleId, bam, bai, experimentId, biosample, factor, treatment, replicate, controlId ->
+  "$sampleId\t$bam\t$bai\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"}
+  .collectFile(name:'design_dedup.tsv', seed:"sample_id\tbam_reads\tbam_index\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design")
+  .into { dedupDesign; preDiffDesign }
 
 // Quality Metrics using deeptools
 process experimentQC {
@@ -283,6 +288,8 @@ process defineExpDesignFiles {
 
 // Make Experiment design files to be read in for downstream analysis
 process poolAndPsuedoReads {
+
+  tag "${experimentObjs.baseName}"
 
   publishDir "$baseDir/output/design", mode: 'copy'
 
   input:
@@ -307,3 +314,68 @@
   }
 
 }
+
+// Collect list of experiment design files into a single channel
+experimentRows = experimentPoolObjs
+  .collect()
+  .splitCsv(sep:'\t', header: true)
+  .flatten()
+  .map { row -> [ row.sample_id, row.tag_align, row.xcor, row.experiment_id, row.biosample, row.factor, row.treatment, row.replicate, row.control_id, row.control_tag_align] }
+
+// Call Peaks using MACS
+process callPeaksMACS {
+
+  tag "$sampleId-$replicate"
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+
+  input:
+
+  set sampleId, tagAlign, xcor, experimentId, biosample, factor, treatment, replicate, controlId, controlTagAlign from experimentRows
+
+  output:
+
+  set sampleId, file('*.narrowPeak'), file('*.fc_signal.bw'), file('*.pvalue_signal.bw'), experimentId, biosample, factor, treatment, replicate, controlId into experimentPeaks
+
+  script:
+
+  if (pairedEnd) {
+    """
+    python3 $baseDir/scripts/call_peaks_macs.py -t $tagAlign -x $xcor -c $controlTagAlign -s $sampleId -g $genomeSize -z $chromSizes -p
+    """
+  }
+  else {
+    """
+    python3 $baseDir/scripts/call_peaks_macs.py -t $tagAlign -x $xcor -c $controlTagAlign -s $sampleId -g $genomeSize -z $chromSizes
+    """
+  }
+
+}
+
+// Define channel collecting peaks into design file
+peaksDesign = experimentPeaks
+  .map{ sampleId, peak, fcSignal, pvalueSignal, experimentId, biosample, factor, treatment, replicate, controlId ->
+  "$sampleId\t$peak\t$fcSignal\t$pvalueSignal\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"}
+  .collectFile(name:'design_peak.tsv', seed:"sample_id\tpeaks\tfc_signal\tpvalue_signal\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design")
+
+// Calculate Consensus Peaks
+process consensusPeaks {
+
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+
+  input:
+
+  file peaksDesign
+  file preDiffDesign
+
+  output:
+
+  file '*.replicated.*' into consensusPeaks
+  file '*.rejected.*' into rejectedPeaks
+  file("design_diffPeaks.tsv") into designDiffPeaks
+
+  script:
+
+  """
+  python3 $baseDir/scripts/overlap_peaks.py -d $peaksDesign -f $preDiffDesign
+  """
+
+}
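Each experiment design file written by poolAndPsuedoReads is re-read by splitCsv(header: true) and every row becomes the 10-field tuple consumed by callPeaksMACS. A rough Python equivalent of that row-to-tuple mapping, handy when inspecting one of these design files by hand (the design.tsv path is illustrative):

    import csv

    # Sketch: mirror splitCsv(sep:'\t', header: true) plus the map{} above.
    # 'design.tsv' is an illustrative path to one experiment design file.
    with open('design.tsv') as fh:
        for row in csv.DictReader(fh, delimiter='\t'):
            record = (row['sample_id'], row['tag_align'], row['xcor'],
                      row['experiment_id'], row['biosample'], row['factor'],
                      row['treatment'], row['replicate'], row['control_id'],
                      row['control_tag_align'])
            print(record)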
diff --git a/workflow/scripts/call_peaks_macs.py b/workflow/scripts/call_peaks_macs.py
new file mode 100644
index 0000000000000000000000000000000000000000..23174e07936ec09ccf494d1c713bd71bb4d72713
--- /dev/null
+++ b/workflow/scripts/call_peaks_macs.py
@@ -0,0 +1,271 @@
+#!/usr/bin/env python3
+
+'''Generate peaks from tagAlign data using MACS2.'''
+
+import os
+import argparse
+import shutil
+import logging
+import utils
+from xcor import xcor as calculate_xcor
+
+EPILOG = '''
+For more details:
+        %(prog)s --help
+'''
+
+# SETTINGS
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+logger.propagate = False
+logger.setLevel(logging.INFO)
+
+
+def get_args():
+    '''Define arguments.'''
+
+    parser = argparse.ArgumentParser(
+        description=__doc__, epilog=EPILOG,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+
+    parser.add_argument('-t', '--tag',
+                        help="The tagAlign file to perform peak calling on.",
+                        required=True)
+
+    parser.add_argument('-x', '--xcor',
+                        help="The cross-correlation file (if already calculated).",
+                        required=True)
+
+    parser.add_argument('-c', '--con',
+                        help="The control tagAlign file used for peak calling.",
+                        required=True)
+
+    parser.add_argument('-s', '--sample',
+                        help="The sample id used to name the peak files.",
+                        required=True)
+
+    parser.add_argument('-g', '--genome',
+                        help="The effective genome size of the reference genome (MACS2 -g).",
+                        required=True)
+
+    parser.add_argument('-z', '--size',
+                        help="The file with chromosome sizes of the reference genome.",
+                        required=True)
+
+    parser.add_argument('-p', '--paired',
+                        help="True/False if paired-end or single end.",
+                        default=False,
+                        action='store_true')
+
+    args = parser.parse_args()
+    return args
+
+# Functions
+
+
+def check_tools():
+    '''Checks for required components on user system.'''
+
+    logger.info('Checking for required libraries and components on this system')
+
+    r_path = shutil.which("R")
+    if r_path:
+        logger.info('Found R: %s', r_path)
+    else:
+        logger.error('Missing R')
+        raise Exception('Missing R')
+
+    macs_path = shutil.which("macs2")
+    if macs_path:
+        logger.info('Found MACS2: %s', macs_path)
+    else:
+        logger.error('Missing MACS2')
+        raise Exception('Missing MACS2')
+
+    bg_bw_path = shutil.which("bedGraphToBigWig")
+    if bg_bw_path:
+        logger.info('Found bedGraphToBigWig: %s', bg_bw_path)
+    else:
+        logger.error('Missing bedGraphToBigWig')
+        raise Exception('Missing bedGraphToBigWig')
+
+    bedtools_path = shutil.which("bedtools")
+    if bedtools_path:
+        logger.info('Found bedtools: %s', bedtools_path)
+    else:
+        logger.error('Missing bedtools')
+        raise Exception('Missing bedtools')
+
+
+def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes):
+    '''Call peaks with MACS2 and build fold-enrichment and p-value signal tracks.'''
+
+    # Extract the fragment length estimate from column 3 of the
+    # cross-correlation scores file; take the top-ranked value
+    with open(xcor, 'r') as xcor_fh:
+        firstline = xcor_fh.readline()
+        frag_lengths = firstline.split()[2]  # third column
+        fragment_length = frag_lengths.split(',')[0]  # grab first value
+        logger.info("Fraglen %s" % (fragment_length))
+
+    # Generate narrow peaks and preliminary signal tracks
+    command = 'macs2 callpeak ' + \
+        '-t %s -c %s ' % (experiment, control) + \
+        '-f BED -n %s ' % (prefix) + \
+        '-g %s -p 1e-2 --nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' % (genome_size, fragment_length)
+
+    logger.info(command)
+    returncode = utils.block_on(command)
+    logger.info("MACS2 exited with returncode %d" % (returncode))
+    assert returncode == 0, "MACS2 non-zero return"
+
+    # MACS2 sometimes calls features off the end of chromosomes.
+    # Remove coordinates outside chromosome sizes; chrom_sizes is a
+    # tab-delimited file with 2 columns: Col1 (chromosome name),
+    # Col2 (chromosome size in bp).
+    narrowpeak_fn = '%s_peaks.narrowPeak' % (prefix)
+    clipped_narrowpeak_fn = 'clipped-%s' % (narrowpeak_fn)
+
+    steps = ['slopBed -i %s -g %s -b 0' % (narrowpeak_fn, chrom_sizes),
+             'bedClip stdin %s %s' % (chrom_sizes, clipped_narrowpeak_fn)]
+
+    out, err = utils.run_pipe(steps)
+
+    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format
+    # (score must be <1000)
+    rescaled_narrowpeak_fn = utils.rescale_scores(
+        clipped_narrowpeak_fn, scores_col=5)
+
+    # Sort by Col8 in descending order and replace long peak names in Column 4
+    # with Peak_<peakRank>
+    steps = [
+        'sort -k 8gr,8gr %s' % (rescaled_narrowpeak_fn),
+        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'"""
+    ]
+
+    out, err = utils.run_pipe(steps, '%s' % (narrowpeak_fn))
+
+    # Generate fold enrichment signal tracks
+    command = 'macs2 bdgcmp ' + \
+        '-t %s_treat_pileup.bdg ' % (prefix) + \
+        '-c %s_control_lambda.bdg ' % (prefix) + \
+        '-o %s_FE.bdg ' % (prefix) + \
+        '-m FE'
+
+    logger.info(command)
+    returncode = utils.block_on(command)
+    logger.info("MACS2 exited with returncode %d" % (returncode))
+    assert returncode == 0, "MACS2 non-zero return"
+
+    # Remove coordinates outside chromosome sizes (MACS2 bug)
+    fc_bedgraph_fn = '%s.fc.signal.bedgraph' % (prefix)
+    fc_bedgraph_sorted_fn = 'sorted-%s' % (fc_bedgraph_fn)
+    fc_signal_fn = "%s.fc_signal.bw" % (prefix)
+    steps = ['slopBed -i %s_FE.bdg -g %s -b 0' % (prefix, chrom_sizes),
+             'bedClip stdin %s %s' % (chrom_sizes, fc_bedgraph_fn)]
+
+    out, err = utils.run_pipe(steps)
+
+    # Sort file
+    out, err = utils.run_pipe([
+        'bedSort %s %s' % (fc_bedgraph_fn, fc_bedgraph_sorted_fn)])
+
+    # Convert bedgraph to bigwig
+    command = 'bedGraphToBigWig ' + \
+        '%s ' % (fc_bedgraph_sorted_fn) + \
+        '%s ' % (chrom_sizes) + \
+        '%s' % (fc_signal_fn)
+
+    logger.info(command)
+    returncode = utils.block_on(command)
+    logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
+    assert returncode == 0, "bedGraphToBigWig non-zero return"
+
+    # Generate -log10(p-value) signal tracks
+
+    # Compute sval =
+    # min(no. of reads in ChIP, no. of reads in control) / 1,000,000
+    out, err = utils.run_pipe(['gzip -dc %s' % (experiment), 'wc -l'])
+    chip_reads = out.strip()
+    out, err = utils.run_pipe(['gzip -dc %s' % (control), 'wc -l'])
+    control_reads = out.strip()
+    sval = str(min(float(chip_reads), float(control_reads)) / 1000000)
+
+    logger.info("chip_reads = %s, control_reads = %s, sval = %s" %
+                (chip_reads, control_reads, sval))
+
+    command = 'macs2 bdgcmp ' + \
+        '-t %s_treat_pileup.bdg ' % (prefix) + \
+        '-c %s_control_lambda.bdg ' % (prefix) + \
+        '-o %s_ppois.bdg ' % (prefix) + \
+        '-m ppois -S %s' % (sval)
+
+    logger.info(command)
+    returncode = utils.block_on(command)
+    assert returncode == 0, "MACS2 non-zero return"
+
+    # Remove coordinates outside chromosome sizes (MACS2 bug)
+    pvalue_bedgraph_fn = '%s.pval.signal.bedgraph' % (prefix)
+    pvalue_bedgraph_sorted_fn = 'sort-%s' % (pvalue_bedgraph_fn)
+    pvalue_signal_fn = "%s.pvalue_signal.bw" % (prefix)
+    steps = ['slopBed -i %s_ppois.bdg -g %s -b 0' % (prefix, chrom_sizes),
+             'bedClip stdin %s %s' % (chrom_sizes, pvalue_bedgraph_fn)]
+
+    out, err = utils.run_pipe(steps)
+
+    # Sort file
+    out, err = utils.run_pipe([
+        'bedSort %s %s' % (pvalue_bedgraph_fn, pvalue_bedgraph_sorted_fn)])
+
+    # Convert bedgraph to bigwig
+    command = 'bedGraphToBigWig ' + \
+        '%s ' % (pvalue_bedgraph_sorted_fn) + \
+        '%s ' % (chrom_sizes) + \
+        '%s' % (pvalue_signal_fn)
+
+    logger.info(command)
+    returncode = utils.block_on(command)
+    logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
+    assert returncode == 0, "bedGraphToBigWig non-zero return"
+
+    # Remove temporary files
+    os.remove(clipped_narrowpeak_fn)
+    os.remove(rescaled_narrowpeak_fn)
+
+
+def main():
+    args = get_args()
+    tag = args.tag
+    xcor = args.xcor
+    con = args.con
+    sample = args.sample
+    genome_size = args.genome
+    chrom_size = args.size
+    paired = args.paired
+
+    # Create a file handler
+    handler = logging.FileHandler('call_peaks.log')
+    logger.addHandler(handler)
+
+    # Check if tools are present
+    check_tools()
+
+    # Calculate cross-correlation if not already calculated
+    if xcor == 'Calculate':
+        xcor_file = calculate_xcor(tag, paired)
+    else:
+        xcor_file = xcor
+
+    # Call peaks using MACS2
+    call_peaks_macs(tag, xcor_file, con, sample, genome_size, chrom_size)
+
+
+if __name__ == '__main__':
+    main()
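Taken together, call_peaks_macs.py produces the narrowPeak file, the fold-enrichment bigWig, and the -log10(p-value) bigWig for one sample; the p-value track is scaled by sval = min(ChIP reads, control reads) / 1,000,000. A sketch of a hand-run invocation mirroring what the callPeaksMACS process executes (all paths and the sample id are illustrative):

    import subprocess

    # Sketch: what callPeaksMACS runs for one single-end sample.
    # Every path and id below is illustrative.
    subprocess.run([
        'python3', 'scripts/call_peaks_macs.py',
        '-t', 'ENCLB144FDT.tagAlign.gz',   # experiment tagAlign
        '-x', 'ENCLB144FDT.cc.qc',         # xcor scores, or 'Calculate'
        '-c', 'control.tagAlign.gz',       # control tagAlign
        '-s', 'ENCLB144FDT',               # sample id / output prefix
        '-g', 'hs',                        # effective genome size (MACS2 -g)
        '-z', 'genomefile.txt',            # chrom sizes file
    ], check=True)                         # append '-p' for paired-end data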
diff --git a/workflow/scripts/overlap_peaks.py b/workflow/scripts/overlap_peaks.py
new file mode 100644
index 0000000000000000000000000000000000000000..379f8841c205b70a664002d6ac6ddae3917928d7
--- /dev/null
+++ b/workflow/scripts/overlap_peaks.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python3
+
+'''Generate naive overlap peak files and design file for downstream processing.'''
+
+import os
+import argparse
+import logging
+import shutil
+import pandas as pd
+import utils
+
+EPILOG = '''
+For more details:
+        %(prog)s --help
+'''
+
+# SETTINGS
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+logger.propagate = False
+logger.setLevel(logging.INFO)
+
+
+def get_args():
+    '''Define arguments.'''
+
+    parser = argparse.ArgumentParser(
+        description=__doc__, epilog=EPILOG,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+
+    parser.add_argument('-d', '--design',
+                        help="The design file of peaks (tsv format).",
+                        required=True)
+
+    parser.add_argument('-f', '--files',
+                        help="The design file with bam files (tsv format).",
+                        required=True)
+
+    args = parser.parse_args()
+    return args
+
+
+def check_tools():
+    '''Checks for required components on user system.'''
+
+    logger.info('Checking for required libraries and components on this system')
+
+    bedtools_path = shutil.which("bedtools")
+    if bedtools_path:
+        logger.info('Found bedtools: %s', bedtools_path)
+    else:
+        logger.error('Missing bedtools')
+        raise Exception('Missing bedtools')
+
+
+def update_design(design):
+    '''Update design file for diffBind and remove controls.'''
+
+    logger.info("Running control file update.")
+
+    file_dict = design[['sample_id', 'bam_reads']] \
+                .set_index('sample_id').T.to_dict()
+
+    design['control_bam_reads'] = design['control_id'] \
+                                .apply(lambda x: file_dict[x]['bam_reads'])
+
+    logger.info("Removing rows that are their own control.")
+
+    design = design[design['control_id'] != design['sample_id']]
+
+    logger.info("Dropping the bam_index column.")
+
+    design = design.drop('bam_index', axis=1)
+
+    logger.info("Adding peaks column.")
+
+    design = design.assign(peak='', peak_caller='bed')
+
+    return design
+
+
+def overlap(experiment, design):
+    '''Calculate the overlap of peaks.'''
+
+    logger.info("Determining consensus peaks for experiment %s.", experiment)
+
+    # Output file names
+    peak_type = 'narrowPeak'
+    overlapping_peaks_fn = '%s.replicated.%s' % (experiment, peak_type)
+    rejected_peaks_fn = '%s.rejected.%s' % (experiment, peak_type)
+
+    # Intermediate file names
+    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
+    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)
+
+    # Assign pooled and pseudoreplicate peaks
+    pool_peaks = design.loc[design.replicate == 'pooled', 'peaks'].values[0]
+    pr1_peaks = design.loc[design.replicate == '1_pr', 'peaks'].values[0]
+    pr2_peaks = design.loc[design.replicate == '2_pr', 'peaks'].values[0]
+
+    # Remove non true replicate rows
+    not_replicates = ['1_pr', '2_pr', 'pooled']
+    design_true_reps = design[~design['replicate'].isin(not_replicates)]
+    true_rep_peaks = design_true_reps.peaks.unique()
+
+    # Find overlaps
+    awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
+    cut_command = 'cut -f 1-10'
+
+    # Find pooled peaks that overlap Rep1 and Rep2
+    # where overlap is defined as the fractional overlap
+    # with any one of the overlapping peak pairs >= 0.5
+    steps_true = ['intersectBed -wo -a %s -b %s' % (pool_peaks, true_rep_peaks[0]),
+                  awk_command,
+                  cut_command,
+                  'sort -u']
+
+    if len(true_rep_peaks) > 1:
+        for true_peak in true_rep_peaks[1:]:
+            steps_true.extend(['intersectBed -wo -a stdin -b %s' % (true_peak),
+                               awk_command,
+                               cut_command,
+                               'sort -u'])
+
+    out, err = utils.run_pipe(steps_true, outfile=overlap_tr_fn)
+    print("%d peaks overlap with both true replicates" %
+          (utils.count_lines(overlap_tr_fn)))
+
+    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2
+    # where overlap is defined as the fractional overlap
+    # with any one of the overlapping peak pairs >= 0.5
+    steps_pseudo = ['intersectBed -wo -a %s -b %s' % (pool_peaks, pr1_peaks),
+                    awk_command,
+                    cut_command,
+                    'sort -u',
+                    'intersectBed -wo -a stdin -b %s' % (pr2_peaks),
+                    awk_command,
+                    cut_command,
+                    'sort -u']
+
+    out, err = utils.run_pipe(steps_pseudo, outfile=overlap_pr_fn)
+    print("%d peaks overlap with both pooled pseudoreplicates"
+          % (utils.count_lines(overlap_pr_fn)))
+
+    # Make union of peak lists
+    out, err = utils.run_pipe([
+        'cat %s %s' % (overlap_tr_fn, overlap_pr_fn),
+        'sort -u'
+    ], overlapping_peaks_fn)
+    print("%d peaks overlap with true replicates or with pooled pseudoreplicates"
+          % (utils.count_lines(overlapping_peaks_fn)))
+
+    # Make rejected peak list
+    out, err = utils.run_pipe([
+        'intersectBed -wa -v -a %s -b %s' % (pool_peaks, overlapping_peaks_fn)
+    ], rejected_peaks_fn)
+    print("%d peaks were rejected" % (utils.count_lines(rejected_peaks_fn)))
+
+    # Remove temporary files
+    os.remove(overlap_tr_fn)
+    os.remove(overlap_pr_fn)
+
+    return overlapping_peaks_fn
+
+
+def main():
+    args = get_args()
+    design = args.design
+    files = args.files
+
+    # Create a file handler
+    handler = logging.FileHandler('consensus_peaks.log')
+    logger.addHandler(handler)
+
+    # Check if tools are present
+    check_tools()
+
+    # Read files as dataframes
+    design_peaks_df = pd.read_csv(design, sep='\t')
+    design_files_df = pd.read_csv(files, sep='\t')
+
+    # Make a design file for differential binding analysis
+    design_diff = update_design(design_files_df)
+
+    # Find consensus overlap peaks for each experiment
+    for experiment, df_experiment in design_peaks_df.groupby('experiment_id'):
+        replicated_peak = overlap(experiment, df_experiment)
+        design_diff.loc[design_diff.experiment_id == experiment, "peak"] = replicated_peak
+
+    # Write out file
+    design_diff.columns = ['SampleID',
+                           'bamReads',
+                           'Condition',
+                           'Tissue',
+                           'Factor',
+                           'Treatment',
+                           'Replicate',
+                           'ControlID',
+                           'bamControl',
+                           'Peaks',
+                           'PeakCaller']
+
+    design_diff.to_csv("design_diffPeaks.tsv", header=True, sep='\t', index=False)
+
+
+if __name__ == '__main__':
+    main()
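The awk filter in overlap() keeps a peak pair when the overlap width reported by intersectBed -wo (column 21) covers at least half of either peak. The same criterion in plain Python for a single pair of intervals, as a sketch only:

    def replicated(peak_a, peak_b, min_frac=0.5):
        # peak_a/peak_b are (start, end) pairs on the same chromosome
        overlap = min(peak_a[1], peak_b[1]) - max(peak_a[0], peak_b[0])
        if overlap <= 0:
            return False
        frac_a = overlap / (peak_a[1] - peak_a[0])
        frac_b = overlap / (peak_b[1] - peak_b[0])
        return max(frac_a, frac_b) >= min_frac

    assert replicated((100, 200), (150, 400))      # 50/100 covers half of A
    assert not replicated((100, 200), (190, 400))  # only 10 bp shared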
diff --git a/workflow/scripts/pool_and_psuedoreplicate.py b/workflow/scripts/pool_and_psuedoreplicate.py
index 6dbe7c9169b849ae17163408215a2ea74e3eaffe..1ee66a712ee352f2a089ce77e1317d21045e4a69 100644
--- a/workflow/scripts/pool_and_psuedoreplicate.py
+++ b/workflow/scripts/pool_and_psuedoreplicate.py
@@ -190,17 +190,20 @@
     replicate = design_df.at[0, 'replicate']
     design_new_df = design_df.loc[np.repeat(design_df.index, 4)].reset_index()
     design_new_df['replicate'] = design_new_df['replicate'].astype(str)
-    design_new_df.at[1, 'sample_id'] = experiment_id + '_pr'
+    design_new_df.at[1, 'sample_id'] = experiment_id + '_pr1'
     design_new_df.at[1, 'replicate'] = '1_pr'
-    design_new_df.at[2, 'sample_id'] = experiment_id + '_pr'
+    design_new_df.at[1, 'xcor'] = 'Calculate'
+    design_new_df.at[2, 'sample_id'] = experiment_id + '_pr2'
     design_new_df.at[2, 'replicate'] = '2_pr'
-    design_new_df.at[3, 'sample_id'] = experiment_id + '_pr'
+    design_new_df.at[2, 'xcor'] = 'Calculate'
+    design_new_df.at[3, 'sample_id'] = experiment_id + '_pooled'
     design_new_df.at[3, 'replicate'] = 'pooled'
+    design_new_df.at[3, 'xcor'] = 'Calculate'
 
     # Make 2 self psuedoreplicates
     self_pseudoreplicates_dict = {}
     for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']):
-        replicate_prefix = experiment_id + '_' + rep
+        replicate_prefix = experiment_id + '_' + str(rep)
         self_pseudoreplicates_dict = \
             self_psuedoreplication(tag_file, replicate_prefix, paired)
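With this change a two-replicate experiment now expands into distinct derived sample ids, and setting xcor to 'Calculate' makes call_peaks_macs.py re-estimate fragment length for the pseudoreplicates and the pool. Assuming an experiment_id of 'ENCSR238SGC' (the repository's test dataset), the derived rows would look roughly like:

    # Sketch: sample_id / replicate / xcor values produced for the
    # derived rows of one experiment (experiment id illustrative).
    derived_rows = [
        ('ENCSR238SGC_pr1',    '1_pr',   'Calculate'),
        ('ENCSR238SGC_pr2',    '2_pr',   'Calculate'),
        ('ENCSR238SGC_pooled', 'pooled', 'Calculate'),
    ]
    for sample_id, replicate, xcor in derived_rows:
        print(sample_id, replicate, xcor, sep='\t')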
diff --git a/workflow/scripts/utils.py b/workflow/scripts/utils.py
index 2643c6e409938d30453d1bc93359df81651de5fe..a50167367eec13fc0548bfbffbfa49fa8db58c60 100644
--- a/workflow/scripts/utils.py
+++ b/workflow/scripts/utils.py
@@ -5,6 +5,9 @@
 
 import shlex
 import logging
+import subprocess
+import sys
+import os
 
 logger = logging.getLogger(__name__)
@@ -45,6 +48,14 @@
     return out, err
 
 
+def block_on(command):
+    '''Run a command to completion, streaming its output to stdout.'''
+
+    process = subprocess.Popen(shlex.split(command),
+                               stderr=subprocess.STDOUT,
+                               stdout=subprocess.PIPE)
+    for line in iter(process.stdout.readline, b''):
+        sys.stdout.write(line.decode('utf-8'))
+    process.communicate()
+    return process.returncode
+
+
 def strip_extensions(filename, extensions):
     '''Strips extensions to get basename of file.'''
 
@@ -72,3 +83,49 @@
         'wc -l'
     ])
     return int(out)
+
+
+def rescale_scores(filename, scores_col, new_min=10, new_max=1000):
+    '''Linearly rescale scores in the given column to [new_min, new_max].'''
+
+    sorted_fn = 'sorted-%s' % (filename)
+    rescaled_fn = 'rescaled-%s' % (filename)
+
+    out, err = run_pipe([
+        'sort -k %dgr,%dgr %s' % (scores_col, scores_col, filename),
+        r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (NF != 0) print $0}'"""],
+        sorted_fn)
+
+    out, err = run_pipe([
+        'head -n 1 %s' % (sorted_fn),
+        'cut -f %s' % (scores_col)])
+    max_score = float(out.strip())
+    logger.info("rescale_scores: max_score = %s" % (max_score))
+
+    out, err = run_pipe([
+        'tail -n 1 %s' % (sorted_fn),
+        'cut -f %s' % (scores_col)])
+    min_score = float(out.strip())
+    logger.info("rescale_scores: min_score = %s" % (min_score))
+
+    a = min_score
+    b = max_score
+    x = new_min
+    y = new_max
+
+    if min_score == max_score:  # give all peaks new_min
+        rescale_formula = "x"
+    else:  # n is the unscaled score from scores_col
+        rescale_formula = "((n-a)*(y-x)/(b-a))+x"
+
+    out, err = run_pipe(
+        [
+            'cat %s' % (sorted_fn),
+            r"""awk 'BEGIN{OFS="\t"}{n=$%d;a=%d;b=%d;x=%d;y=%d}"""
+            % (scores_col, a, b, x, y) +
+            r"""{$%d=int(%s) ; print $0}'"""
+            % (scores_col, rescale_formula)
+        ],
+        rescaled_fn)
+
+    os.remove(sorted_fn)
+
+    return rescaled_fn
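rescale_scores maps a score n from the observed range [a, b] onto [x, y] = [new_min, new_max] via ((n-a)*(y-x)/(b-a))+x, falling back to new_min for every row when all scores are equal. A quick check of that formula in Python (score values invented):

    def rescale(n, a, b, x=10, y=1000):
        # same linear map the awk program applies per row
        return ((n - a) * (y - x) / (b - a)) + x

    assert rescale(2.0, a=2.0, b=42.0) == 10      # min score -> new_min
    assert rescale(42.0, a=2.0, b=42.0) == 1000   # max score -> new_max
    assert rescale(22.0, a=2.0, b=42.0) == 505    # midpoint -> midpoint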
diff --git a/workflow/scripts/xcor.py b/workflow/scripts/xcor.py
index 0bf51e6400d05d2ea9cdb978883fed3c765d3100..52838594571b351de72fe03afb05f17840bbdc2a 100644
--- a/workflow/scripts/xcor.py
+++ b/workflow/scripts/xcor.py
@@ -24,7 +24,7 @@ logger.setLevel(logging.INFO)
 
 def get_args():
     '''Define arguments.'''
-
+
     parser = argparse.ArgumentParser(
         description=__doc__, epilog=EPILOG,
         formatter_class=argparse.RawDescriptionHelpFormatter)
@@ -106,6 +106,8 @@
             cc_plot_filename, cc_scores_filename)
     ])
 
+    return cc_scores_filename
+
 
 def main():
     args = get_args()
@@ -120,7 +122,7 @@
     check_tools()
 
     # Calculate Cross-correlation
-    xcor(tag, paired)
+    xcor_filename = xcor(tag, paired)
 
 
 if __name__ == '__main__':
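call_peaks_macs.py consumes the returned cc_scores_filename by reading the third tab-separated column, which (per the parsing logic there) holds the comma-separated fragment-length estimates, and keeping the first, top-ranked value. A sketch of that parse on an invented scores line:

    # Sketch: extract the top fragment-length estimate from an xcor
    # scores line (the line content here is invented for illustration).
    line = 'ENCLB144FDT.tagAlign.gz\t14105823\t195,215,230\t0.42'
    frag_lengths = line.split('\t')[2]           # third column
    fragment_length = int(frag_lengths.split(',')[0])
    assert fragment_length == 195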
diff --git a/workflow/tests/test_call_peaks_macs.py b/workflow/tests/test_call_peaks_macs.py
new file mode 100644
index 0000000000000000000000000000000000000000..aba35702c689e3114442648763a855d01611c0cb
--- /dev/null
+++ b/workflow/tests/test_call_peaks_macs.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+
+import pytest
+import os
+import utils
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+                '/../output/callPeaksMACS/'
+
+
+def test_call_peaks_macs_singleend():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT.fc_signal.bw'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT.pvalue_signal.bw'))
+    peak_file = test_output_path + 'ENCLB144FDT_peaks.narrowPeak'
+    assert utils.count_lines(peak_file) == 210349
+
+
+def test_call_peaks_macs_pairedend():
+    # Do the same thing for paired end data
+    pass
"""sample_id\tbam_reads\tbam_index\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id +A_1\tA_1.bam\tA_1.bai\tA\tLiver\tH3K27ac\tNone\t1\tB_1 +A_2\tA_2.bam\tA_2.bai\tA\tLiver\tH3K27ac\tNone\t2\tB_2 +B_1\tB_1.bam\tB_1.bai\tB\tLiver\tH3K27ac\tNone\t1\tB_1 +B_2\tB_2.bam\tB_2.bai\tB\tLiver\tH3K27ac\tNone\t2\tB_2 +""" + + +@pytest.fixture +def design_diff(): + design_file = StringIO(DESIGN_STRING) + design_df = pd.read_csv(design_file, sep="\t") + return design_df + + +def test_check_update_design(design_diff): + new_design = overlap_peaks.update_design(design_diff) + assert new_design.shape[0] == 2 + assert new_design.loc[0, 'control_bam_reads'] == "B_1.bam" + assert new_design.loc[0, 'peak_caller'] == "bed" + + +def test_overlap_peaks_singleend(): + assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC.rejected.narrowPeak')) + peak_file = test_output_path + 'ENCSR238SGC.replicated.narrowPeak' + assert utils.count_lines(peak_file) == 150096 + + +def test_call_peaks_macs_pairedend(): + # Do the same thing for paired end data + pass