diff --git a/workflow/scripts/call_peaks_macs.py b/workflow/scripts/call_peaks_macs.py
index 28c546c5c606c9a534a80de2f363a4e7adb52adf..b5170e249bca5ff9c9fd69f9268d59d542adecb5 100644
--- a/workflow/scripts/call_peaks_macs.py
+++ b/workflow/scripts/call_peaks_macs.py
@@ -6,7 +6,6 @@ import os
 import argparse
 import shutil
 import logging
-from multiprocessing import cpu_count
 import utils
 from xcor import xcor as calculate_xcor
 
@@ -100,6 +99,7 @@ def check_tools():
 
 
 def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes):
+    '''Call peaks and signal tracks'''
 
     # Extract the fragment length estimate from column 3 of the
     # cross-correlation scores file
@@ -107,7 +107,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
         firstline = xcor_fh.readline()
         frag_lengths = firstline.split()[2]  # third column
         fragment_length = frag_lengths.split(',')[0]  # grab first value
-        logger.info("Fraglen %s" % (fragment_length))
+        logger.info("Fraglen %s", fragment_length)
 
     # Generate narrow peaks and preliminary signal tracks
 
@@ -119,7 +119,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
     logger.info(command)
     returncode = utils.block_on(command)
-    logger.info("MACS2 exited with returncode %d" % (returncode))
+    logger.info("MACS2 exited with returncode %d", returncode)
     assert returncode == 0, "MACS2 non-zero return"
 
     # MACS2 sometimes calls features off the end of chromosomes.
@@ -130,7 +130,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
 
     steps = ['slopBed -i %s -g %s -b 0' % (narrowpeak_fn, chrom_sizes),
-            'bedClip stdin %s %s' % (chrom_sizes, clipped_narrowpeak_fn)]
+             'bedClip stdin %s %s' % (chrom_sizes, clipped_narrowpeak_fn)]
 
     out, err = utils.run_pipe(steps)
 
@@ -161,7 +161,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
     logger.info(command)
     returncode = utils.block_on(command)
-    logger.info("MACS2 exited with returncode %d" % (returncode))
+    logger.info("MACS2 exited with returncode %d", returncode)
     assert returncode == 0, "MACS2 non-zero return"
 
     # Remove coordinates outside chromosome sizes (MACS2 bug)
@@ -169,7 +169,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
     fc_bedgraph_sorted_fn = 'sorted-%s' % (fc_bedgraph_fn)
     fc_signal_fn = "%s.fc_signal.bw" % (prefix)
     steps = ['slopBed -i %s_FE.bdg -g %s -b 0' % (prefix, chrom_sizes),
-            'bedClip stdin %s %s' % (chrom_sizes, fc_bedgraph_fn)]
+             'bedClip stdin %s %s' % (chrom_sizes, fc_bedgraph_fn)]
 
     out, err = utils.run_pipe(steps)
 
@@ -185,7 +185,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
     logger.info(command)
     returncode = utils.block_on(command)
-    logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
+    logger.info("bedGraphToBigWig exited with returncode %d", returncode)
     assert returncode == 0, "bedGraphToBigWig non-zero return"
 
     # For -log10(p-value) signal tracks
@@ -199,7 +199,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
     sval = str(min(float(chip_reads), float(control_reads)) / 1000000)
 
     logger.info("chip_reads = %s, control_reads = %s, sval = %s" %
-        (chip_reads, control_reads, sval))
+                (chip_reads, control_reads, sval))
 
     command = 'macs2 bdgcmp ' + \
         '-t %s_treat_pileup.bdg ' % (prefix) + \
@@ -216,7 +216,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
     pvalue_bedgraph_sorted_fn = 'sort-%s' % (pvalue_bedgraph_fn)
     pvalue_signal_fn = "%s.pvalue_signal.bw" % (prefix)
     steps = ['slopBed -i %s_ppois.bdg -g %s -b 0' % (prefix, chrom_sizes),
-            'bedClip stdin %s %s' % (chrom_sizes, pvalue_bedgraph_fn)]
+             'bedClip stdin %s %s' % (chrom_sizes, pvalue_bedgraph_fn)]
 
     out, err = utils.run_pipe(steps)
 
@@ -232,7 +232,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
     logger.info(command)
     returncode = utils.block_on(command)
-    logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
+    logger.info("bedGraphToBigWig exited with returncode %d", returncode)
     assert returncode == 0, "bedGraphToBigWig non-zero return"
 
     # Remove temporary files
diff --git a/workflow/scripts/check_design.py b/workflow/scripts/check_design.py
index 6eef6a13d5e041946efe4322565f7f2aa1336307..35af1f469826933578ba0df5694f45644836b8b2 100644
--- a/workflow/scripts/check_design.py
+++ b/workflow/scripts/check_design.py
@@ -5,7 +5,6 @@ import argparse
 import logging
 import pandas as pd
-import re
 
 EPILOG = '''
 For more details:
@@ -84,7 +83,7 @@ def check_samples(design):
     malformated_samples = []
     chars = set('-.')
     for sample in samples.index.values:
-        if ( any(char.isspace() for char in sample) | any((char in chars) for char in sample) ):
+        if(any(char.isspace() for char in sample) | any((char in chars) for char in sample)):
             malformated_samples.append(sample)
 
     if len(malformated_samples) > 0:
@@ -104,7 +103,7 @@ def check_experiments(design):
     malformated_experiments = []
     chars = set('-.')
     for experiment in experiments.index.values:
-        if ( any(char.isspace() for char in experiment) | any((char in chars) for char in experiment) ):
+        if(any(char.isspace() for char in experiment) | any((char in chars) for char in experiment)):
             malformated_experiments.append(experiment)
 
     if len(malformated_experiments) > 0:
@@ -187,8 +186,8 @@ def main():
     logger.addHandler(handler)
 
     # Read files as dataframes
-    design_df = pd.read_csv(args.design, sep='\t')
-    fastq_df = pd.read_csv(args.fastq, sep='\t', names=['name', 'path'])
+    design_df = pd.read_csv(design, sep='\t')
+    fastq_df = pd.read_csv(fastq, sep='\t', names=['name', 'path'])
 
     # Check design file
     check_design_headers(design_df, paired)
diff --git a/workflow/scripts/convert_reads.py b/workflow/scripts/convert_reads.py
index d235cfb628ec1922b27c041e97a4faee9dd2dad1..bba7c6f16e49b7ea608d50546ec0b5af21ed4448 100644
--- a/workflow/scripts/convert_reads.py
+++ b/workflow/scripts/convert_reads.py
@@ -91,8 +91,7 @@ def convert_mapped_pe(bam, bam_basename):
     out, err = utils.run_pipe([
         "bamToBed -bedpe -mate1 -i %s" % (nmsrt_bam_filename),
-        "gzip -nc"],
-        outfile=bedpe_filename)
+        "gzip -nc"], outfile=bedpe_filename)
 
 
 def main():
@@ -117,7 +116,7 @@ def main():
     if paired:  # paired-end data
         convert_mapped_pe(bam, bam_basename)
     else:
-            bedse_filename = bam_basename + ".bedse.gz"
+        bedse_filename = bam_basename + ".bedse.gz"
         shutil.copy(tag_filename, bedse_filename)
diff --git a/workflow/scripts/experiment_qc.py b/workflow/scripts/experiment_qc.py
index 7386fcffd8e982d4d432e99becfc2aacb23ad2f2..c5cd47caa9a57c67059ef0c7a074599b1c5605d3 100644
--- a/workflow/scripts/experiment_qc.py
+++ b/workflow/scripts/experiment_qc.py
@@ -7,8 +7,9 @@ import argparse
 import logging
 import subprocess
 import shutil
-import pandas as pd
 from multiprocessing import cpu_count
+import pandas as pd
+
 
 EPILOG = '''
 For more details:
@@ -173,13 +174,13 @@ def main():
     # Run enrichment
     new_design_df = update_controls(design_df)
-    for index, row in new_design_df.iterrows():
+    for row in new_design_df.iterrows():
         check_enrichment(
-                row['sample_id'],
-                row['control_id'],
-                row['bam_reads'],
-                row['control_reads'],
-                extension)
+            row['sample_id'],
+            row['control_id'],
+            row['bam_reads'],
+            row['control_reads'],
+            extension)
 
 
 if __name__ == '__main__':
diff --git a/workflow/scripts/map_qc.py b/workflow/scripts/map_qc.py
index 78b9298282a59be5c94d7a62d544fb4aebb21deb..031034a99407223ff09df055fb6a86ef9b49bee4 100644
--- a/workflow/scripts/map_qc.py
+++ b/workflow/scripts/map_qc.py
@@ -106,7 +106,7 @@ def filter_mapped_pe(bam, bam_basename):
         # Will produce name sorted BAM
         "samtools sort -n -@ %d -o %s" % (cpu_count(), tmp_filt_bam_filename)])
     if err:
-        logger.error("samtools filter error: %s" % (err))
+        logger.error("samtools filter error: %s", err)
 
     # Remove orphan reads (pair was removed)
     # and read pairs mapping to different chromosomes
@@ -136,7 +136,7 @@ def filter_mapped_se(bam, bam_basename):
     # not primary alignment, reads failing platform
     # Remove low MAPQ reads
     # Obtain name sorted BAM file
-    with open(filt_bam_filename, 'w') as fh:
+    with open(filt_bam_filename, 'w') as temp_file:
         samtools_filter_command = (
             "samtools view -F 1804 -q 30 -b %s" % (bam)
@@ -144,7 +144,7 @@ def filter_mapped_se(bam, bam_basename):
         logger.info(samtools_filter_command)
         subprocess.check_call(
             shlex.split(samtools_filter_command),
-            stdout=fh)
+            stdout=temp_file)
 
     return filt_bam_filename
 
@@ -157,7 +157,7 @@ def dedup_mapped(bam, bam_basename, paired):
     tmp_dup_mark_filename = bam_basename + ".dupmark.bam"
     sambamba_params = "--hash-table-size=17592186044416" + \
         " --overflow-list-size=20000000 --io-buffer-size=256"
-    with open(dup_file_qc_filename, 'w') as fh:
+    with open(dup_file_qc_filename, 'w') as temp_file:
         sambamba_markdup_command = (
             "sambamba markdup -t %d %s --tmpdir=%s %s %s"
             % (cpu_count(), sambamba_params, os.getcwd(), bam, tmp_dup_mark_filename)
@@ -165,7 +165,7 @@ def dedup_mapped(bam, bam_basename, paired):
         logger.info(sambamba_markdup_command)
         subprocess.check_call(
             shlex.split(sambamba_markdup_command),
-            stderr=fh)
+            stderr=temp_file)
 
     # Remove duplicates
 
@@ -179,11 +179,11 @@ def dedup_mapped(bam, bam_basename, paired):
         samtools_dedupe_command = \
             "samtools view -F 1804 -b %s" % (tmp_dup_mark_filename)
 
-    with open(final_bam_filename, 'w') as fh:
+    with open(final_bam_filename, 'w') as temp_file:
         logger.info(samtools_dedupe_command)
         subprocess.check_call(
             shlex.split(samtools_dedupe_command),
-            stdout=fh)
+            stdout=temp_file)
 
     # Index final bam file
     sambamba_index_command = \
@@ -192,12 +192,12 @@ def dedup_mapped(bam, bam_basename, paired):
     subprocess.check_output(shlex.split(sambamba_index_command))
 
     # Generate mapping statistics
-    final_bam_file_mapstats_filename = final_bam_prefix + ".flagstat.qc"
-    with open(final_bam_file_mapstats_filename, 'w') as fh:
+    mapstats_filename = final_bam_prefix + ".flagstat.qc"
+    with open(mapstats_filename, 'w') as temp_file:
         flagstat_command = "sambamba flagstat -t %d %s" \
             % (cpu_count(), final_bam_filename)
         logger.info(flagstat_command)
-        subprocess.check_call(shlex.split(flagstat_command), stdout=fh)
+        subprocess.check_call(shlex.split(flagstat_command), stdout=temp_file)
 
     os.remove(bam)
     return tmp_dup_mark_filename
@@ -223,12 +223,12 @@ def compute_complexity(bam, paired, bam_basename):
     # PBC1=OnePair/Distinct [tab]
     # PBC2=OnePair/TwoPair
     pbc_headers = ['TotalReadPairs',
-                    'DistinctReadPairs',
-                    'OneReadPair',
-                    'TwoReadPairs',
-                    'NRF',
-                    'PBC1',
-                    'PBC2']
+                   'DistinctReadPairs',
+                   'OneReadPair',
+                   'TwoReadPairs',
+                   'NRF',
+                   'PBC1',
+                   'PBC2']
 
     if paired:
         steps = [
@@ -247,11 +247,11 @@ def compute_complexity(bam, paired, bam_basename):
         ])
     out, err = utils.run_pipe(steps, tmp_pbc_file_qc_filename)
     if err:
-        logger.error("PBC file error: %s" % (err))
+        logger.error("PBC file error: %s", err)
 
     # Add headers
     pbc_file = pd.read_csv(tmp_pbc_file_qc_filename, sep='\t', header=None,
-                        names=pbc_headers)
+                           names=pbc_headers)
     pbc_file.to_csv(pbc_file_qc_filename, header=True, sep='\t', index=False)
     os.remove(bam)
     os.remove(bam + '.bai')
diff --git a/workflow/scripts/map_reads.py b/workflow/scripts/map_reads.py
index b8a2ec54dc307949b374beec5f9af007af08685a..3c20ac81ddf051f11d5b7e4e37e15629991ffa31 100644
--- a/workflow/scripts/map_reads.py
+++ b/workflow/scripts/map_reads.py
@@ -9,7 +9,6 @@ import shutil
 import shlex
 import logging
 from multiprocessing import cpu_count
-import json
 import utils
 
 EPILOG = '''
@@ -24,6 +23,12 @@ logger.addHandler(logging.NullHandler())
 logger.propagate = False
 logger.setLevel(logging.INFO)
 
+# the order of this list is important.
+# strip_extensions strips from the right inward, so
+# the expected right-most extensions should appear first (like .gz)
+# Modified from J. Seth Strattan
+STRIP_EXTENSIONS = ['.gz', '.fq', '.fastq', '_trimmed']
+
 
 def get_args():
@@ -110,7 +115,7 @@ def align_se(fastq, sai, reference, fastq_basename):
 
     out, err = utils.run_pipe(steps)
     if err:
-        logger.error("samse/samtools error: %s" % (err))
+        logger.error("samse/samtools error: %s", err)
 
     return bam_filename
 
@@ -124,17 +129,17 @@ def align_pe(fastq, sai, reference, fastq_basename):
     # Remove read pairs with bad CIGAR strings and sort by position
     steps = [
-            "bwa sampe -P %s %s %s %s %s"
-            % (reference, sai[0], sai[1],
-               fastq[0], fastq[1]),
-            "tee %s" % (sam_filename),
-            r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
-            "sort",
-            "uniq"]
+        "bwa sampe -P %s %s %s %s %s"
+        % (reference, sai[0], sai[1],
+           fastq[0], fastq[1]),
+        "tee %s" % (sam_filename),
+        r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
+        "sort",
+        "uniq"]
 
     out, err = utils.run_pipe(steps, badcigar_filename)
     if err:
-        logger.error("sampe error: %s" % (err))
+        logger.error("sampe error: %s", err)
 
     steps = [
         "cat %s" % (sam_filename),
@@ -145,7 +150,7 @@ def align_pe(fastq, sai, reference, fastq_basename):
 
     out, err = utils.run_pipe(steps)
     if err:
-        logger.error("samtools error: %s" % (err))
+        logger.error("samtools error: %s", err)
 
     return bam_filename
 
@@ -166,8 +171,8 @@ def main():
     # Run Suffix Array generation
     sai = []
-    for fq in fastq:
-        sai_filename = generate_sa(fq, reference)
+    for fastq_file in fastq:
+        sai_filename = generate_sa(fastq_file, reference)
         sai.append(sai_filename)
 
     # Make file basename
@@ -181,10 +186,10 @@ def main():
         bam_filename = align_se(fastq, sai, reference, fastq_basename)
 
     bam_mapstats_filename = '%s.flagstat.qc' % (fastq_basename)
-    with open(bam_mapstats_filename, 'w') as fh:
+    with open(bam_mapstats_filename, 'w') as temp_file:
         subprocess.check_call(
             shlex.split("samtools flagstat %s" % (bam_filename)),
-            stdout=fh)
+            stdout=temp_file)
 
     # Remove sai files
     for sai_file in sai:
diff --git a/workflow/scripts/motif_search.py b/workflow/scripts/motif_search.py
index ab80819a7fc04984952b015cd2542574d5dd00c0..a7f6d1a9dab989b6786358e671193a685f03f284 100644
--- a/workflow/scripts/motif_search.py
+++ b/workflow/scripts/motif_search.py
@@ -3,16 +3,13 @@
 '''Call Motifs on called peaks.'''
 
 import os
-import sys
-import re
-from re import sub
-import string
 import argparse
 import logging
-import subprocess
+import shutil
+from multiprocessing import Pool
 import pandas as pd
 import utils
-from multiprocessing import Pool
+
 
 EPILOG = '''
 For more details:
@@ -31,7 +28,7 @@ logger.setLevel(logging.INFO)
 # strip_extensions strips from the right inward, so
 # the expected right-most extensions should appear first (like .gz)
 # Modified from J. Seth Strattan
-STRIP_EXTENSIONS = ['.narrowPeak', '.replicated' ]
+STRIP_EXTENSIONS = ['.narrowPeak', '.replicated']
 
 
 def get_args():
@@ -58,8 +55,29 @@ def get_args():
 
 # Functions
 
+def check_tools():
+    '''Checks for required componenets on user system'''
+
+    logger.info('Checking for required libraries and components on this system')
+
+    meme_path = shutil.which("meme")
+    if meme_path:
+        logger.info('Found meme: %s', meme_path)
+    else:
+        logger.error('Missing meme')
+        raise Exception('Missing meme')
+
+    bedtools_path = shutil.which("bedtools")
+    if bedtools_path:
+        logger.info('Found bedtools: %s', bedtools_path)
+    else:
+        logger.error('Missing bedtools')
+        raise Exception('Missing bedtools')
+
+
 def run_wrapper(args):
-  motif_search(*args)
+    motif_search(*args)
+
 
 def motif_search(filename, genome, experiment, peak):
     '''Run motif serach on peaks.'''
@@ -82,9 +100,16 @@ def motif_search(filename, genome, experiment, peak):
     out, err = utils.run_pipe([
         'bedtools getfasta -fi %s -bed %s -fo %s' % (genome, sorted_fn, out_fa)])
+    if err:
+        logger.error("bedtools error: %s", err)
+
+    #Call memechip
     out, err = utils.run_pipe([
         'meme-chip -oc %s -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 %s -norand' % (out_motif, out_fa)])
+    if err:
+        logger.error("meme-chip error: %s", err)
+
 
 def main():
@@ -92,14 +117,22 @@ def main():
     genome = args.genome
     peak = args.peak
 
+    # Create a file handler
+    handler = logging.FileHandler('motif.log')
+    logger.addHandler(handler)
+
+    # Check if tools are present
+    check_tools()
+
     # Read files
     design_df = pd.read_csv(design, sep='\t')
 
-    meme_arglist = zip(design_df['Peaks'].tolist(), [genome]*design_df.shape[0], design_df['Condition'].tolist(), [peak]*design_df.shape[0])
-    work_pool = Pool(min(12,design_df.shape[0]))
-    return_list = work_pool.map(run_wrapper, meme_arglist)
+    meme_arglist = zip(design_df['Peaks'].tolist(), [genome]*design_df.shape[0], design_df['Condition'].tolist(), [peak]*design_df.shape[0])
+    work_pool = Pool(min(12, design_df.shape[0]))
+    return_list = work_pool.map(run_wrapper, meme_arglist)
     work_pool.close()
     work_pool.join()
 
-if __name__=="__main__":
-    main()
+
+if __name__ == '__main__':
+    main()
diff --git a/workflow/scripts/overlap_peaks.py b/workflow/scripts/overlap_peaks.py
index 434caa08799a2d6b3bf460ea8520d08a5b92c852..61f0c2e6d11553e54bd9cf5a1cc5c629fa9acb5c 100644
--- a/workflow/scripts/overlap_peaks.py
+++ b/workflow/scripts/overlap_peaks.py
@@ -113,9 +113,9 @@ def overlap(experiment, design):
     # with any one of the overlapping peak pairs >= 0.5
     steps_true = ['intersectBed -wo -a %s -b %s' % (pool_peaks, true_rep_peaks[0]),
-                awk_command,
-                cut_command,
-                'sort -u']
+                  awk_command,
+                  cut_command,
+                  'sort -u']
 
     iter_true_peaks = iter(true_rep_peaks)
     next(iter_true_peaks)
@@ -123,13 +123,13 @@ def overlap(experiment, design):
     if len(true_rep_peaks) > 1:
         for true_peak in true_rep_peaks[1:]:
             steps_true.extend(['intersectBed -wo -a stdin -b %s' % (true_peak),
-                            awk_command,
-                            cut_command,
-                            'sort -u'])
+                               awk_command,
+                               cut_command,
+                               'sort -u'])
 
     out, err = utils.run_pipe(steps_true, outfile=overlap_tr_fn)
     print("%d peaks overlap with both true replicates" %
-        (utils.count_lines(overlap_tr_fn)))
+          (utils.count_lines(overlap_tr_fn)))
 
     # Find pooled peaks that overlap PseudoRep1 and PseudoRep2
     # where overlap is defined as the fractional overlap
@@ -146,7 +146,7 @@ def overlap(experiment, design):
     out, err = utils.run_pipe(steps_pseudo, outfile=overlap_pr_fn)
     print("%d peaks overlap with both pooled pseudoreplicates"
-        % (utils.count_lines(overlap_pr_fn)))
+          % (utils.count_lines(overlap_pr_fn)))
 
     # Make union of peak lists
     out, err = utils.run_pipe([
@@ -154,7 +154,7 @@ def overlap(experiment, design):
         'sort -u'
     ], overlapping_peaks_fn)
     print("%d peaks overlap with true replicates or with pooled pseudorepliates"
-        % (utils.count_lines(overlapping_peaks_fn)))
+          % (utils.count_lines(overlapping_peaks_fn)))
 
     # Make rejected peak list
     out, err = utils.run_pipe([
@@ -187,7 +187,7 @@ def main():
     # Make a design file for annotating Peaks
     anno_cols = ['Condition', 'Peaks']
-    design_anno = pd.DataFrame(columns = anno_cols)
+    design_anno = pd.DataFrame(columns=anno_cols)
 
     # Find consenus overlap peaks for each experiment
     for experiment, df_experiment in design_peaks_df.groupby('experiment_id'):
@@ -197,16 +197,16 @@ def main():
 
     # Write out design files
     design_diff.columns = ['SampleID',
-                        'bamReads',
-                        'Condition',
-                        'Tissue',
-                        'Factor',
-                        'Treatment',
-                        'Replicate',
-                        'ControlID',
-                        'bamControl',
-                        'Peaks',
-                        'PeakCaller']
+                           'bamReads',
+                           'Condition',
+                           'Tissue',
+                           'Factor',
+                           'Treatment',
+                           'Replicate',
+                           'ControlID',
+                           'bamControl',
+                           'Peaks',
+                           'PeakCaller']
 
     design_diff.to_csv("design_diffPeaks.csv", header=True, sep=',', index=False)
     design_anno.to_csv("design_annotatePeaks.tsv", header=True, sep='\t', index=False)
diff --git a/workflow/scripts/pool_and_psuedoreplicate.py b/workflow/scripts/pool_and_psuedoreplicate.py
index 61864a4d0e32b7b783e2a81c9a372a17f0a59163..91f89b7579fd34fae72916192e8a6f2006624a5b 100644
--- a/workflow/scripts/pool_and_psuedoreplicate.py
+++ b/workflow/scripts/pool_and_psuedoreplicate.py
@@ -3,11 +3,11 @@
 '''Generate pooled and pseudoreplicate from data.'''
 
 import argparse
-import utils
 import logging
+import os
 import pandas as pd
 import numpy as np
-import os
+import utils
 
 EPILOG = '''
 For more details:
@@ -25,7 +25,7 @@
 # strip_extensions strips from the right inward, so
 # the expected right-most extensions should appear first (like .gz)
 # Modified from J. Seth Strattan
-STRIP_EXTENSIONS = ['.gz', '.tagAlign', '.bedse', 'bedpe' ]
+STRIP_EXTENSIONS = ['.gz', '.tagAlign', '.bedse', 'bedpe']
 
 
 def get_args():
@@ -98,8 +98,7 @@ def pool(tag_files, outfile, paired):
     # Merge files
     out, err = utils.run_pipe([
         'gzip -dc %s' % (' '.join(tag_files)),
-        'gzip -cn'],
-        outfile=pooled_filename)
+        'gzip -cn'], outfile=pooled_filename)
 
     return pooled_filename
 
@@ -219,7 +218,7 @@ def main():
         # Update tagAlign with single end data
         if paired:
             design_new_df['tag_align'] = design_new_df['se_tag_align']
-            design_new_df.drop(labels = 'se_tag_align', axis = 1, inplace = True)
+            design_new_df.drop(labels='se_tag_align', axis=1, inplace=True)
 
         design_new_df['replicate'] = design_new_df['replicate'].astype(str)
         design_new_df.at[1, 'sample_id'] = experiment_id + '_pr1'
@@ -281,7 +280,7 @@ def main():
         # Update tagAlign with single end data
         if paired:
             design_new_df['tag_align'] = design_new_df['se_tag_align']
-            design_new_df.drop(labels = 'se_tag_align', axis = 1, inplace = True)
+            design_new_df.drop(labels='se_tag_align', axis=1, inplace=True)
 
         # Check controls against cutoff_ratio
         # if so replace with pool_control
         # unless single control was used
@@ -290,7 +289,7 @@ def main():
         path_to_pool_control = cwd + '/' + pool_control
         if control_df.values.max() > cutoff_ratio:
             logger.info("Number of reads in controls differ by " +
-                " > factor of %f. Using pooled controls." % (cutoff_ratio))
+                        " > factor of %f. Using pooled controls." % (cutoff_ratio))
             design_new_df['control_tag_align'] = path_to_pool_control
         else:
             for index, row in design_new_df.iterrows():
@@ -307,13 +306,13 @@ def main():
                 control = row['control_tag_align']
                 control_basename = os.path.basename(
                     utils.strip_extensions(control, STRIP_EXTENSIONS))
-                control_tmp = bedpe_to_tagalign(control , "control_basename")
+                control_tmp = bedpe_to_tagalign(control, control_basename)
                 path_to_control = cwd + '/' + control_tmp
                 design_new_df.loc[index, 'control_tag_align'] = \
                     path_to_control
     else:
-            path_to_pool_control = pool_control
+        path_to_pool_control = pool_control
 
     # Add in pseudo replicates
     tmp_metadata = design_new_df.loc[0].copy()
@@ -337,7 +336,7 @@ def main():
 
     # Write out new dataframe
     design_new_df.to_csv(experiment_id + '_ppr.tsv',
-                        header=True, sep='\t', index=False)
+                         header=True, sep='\t', index=False)
 
 
 if __name__ == '__main__':
diff --git a/workflow/scripts/runDeepTools.py b/workflow/scripts/runDeepTools.py
deleted file mode 100644
index 800544dc87c89329bf050369367e0cba9b805f28..0000000000000000000000000000000000000000
--- a/workflow/scripts/runDeepTools.py
+++ /dev/null
@@ -1,81 +0,0 @@
-#!/usr/bin/python
-# programmer : bbc
-# usage:
-
-import sys
-import argparse as ap
-import logging
-import subprocess
-import pandas as pd
-from multiprocessing import Pool
-logging.basicConfig(level=10)
-
-
-def prepare_argparser():
-    description = "Make wig file for given bed using bam"
-    epilog = "For command line options of each command, type %(prog)% COMMAND -h"
-    argparser = ap.ArgumentParser(description=description, epilog = epilog)
-    argparser.add_argument("-i","--input",dest = "infile",type=str,required=True, help="input BAM file")
-    argparser.add_argument("-g","--genome",dest = "genome",type=str,required=True, help="genome", default="hg19")
-    #argparser.add_argument("-b","--bed",dest="bedfile",type=str,required=True, help = "Gene locus in bed format")
-    #argparser.add_argument("-s","--strandtype",dest="stranded",type=str,default="none", choices=["none","reverse","yes"])
-    #argparser.add_argument("-n","--name",dest="trackName",type=str,default="UserTrack",help = "track name for bedgraph header")
-    return(argparser)
-
-def run_qc(files, controls, labels):
-    mbs_command = "multiBamSummary bins --bamfiles "+' '.join(files)+" -out sample_mbs.npz"
-    p = subprocess.Popen(mbs_command, shell=True)
-    #logging.debug(mbs_command)
-    p.communicate()
-    pcor_command = "plotCorrelation -in sample_mbs.npz --corMethod spearman --skipZeros --plotTitle \"Spearman Correlation of Read Counts\" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers -o experiment.deeptools.heatmap_spearmanCorr_readCounts_v2.png --labels "+" ".join(labels)
-    #logging.debug(pcor_command)
-    p = subprocess.Popen(pcor_command, shell=True)
-    p.communicate()
-    #plotCoverage
-    pcov_command = "plotCoverage -b "+" ".join(files)+" --plotFile experiment.deeptools_coverage.png -n 1000000 --plotTitle \"sample coverage\" --ignoreDuplicates --minMappingQuality 10"
-    p = subprocess.Popen(pcov_command, shell=True)
-    p.communicate()
-    #draw fingerprints plots
-    for treat,ctrl,name in zip(files,controls,labels):
-        fp_command = "plotFingerprint -b "+treat+" "+ctrl+" --labels "+name+" control --plotFile "+name+".deeptools_fingerprints.png"
-        p = subprocess.Popen(fp_command, shell=True)
-        p.communicate()
-
-def bam2bw_wrapper(command):
-    p = subprocess.Popen(command, shell=True)
-    p.communicate()
-
-def run_signal(files, labels, genome):
-    #compute matrix and draw profile and heatmap
-    gene_bed = genome+"/gene.bed"#"/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/genome/"+genome+"/gene.bed"
-    bw_commands = []
-    for f in files:
-        bw_commands.append("bamCoverage -bs 10 -b "+f+" -o "+f.replace("bam","bw"))
-    work_pool = Pool(min(len(files), 12))
-    work_pool.map(bam2bw_wrapper, bw_commands)
-    work_pool.close()
-    work_pool.join()
-
-    cm_command = "computeMatrix scale-regions -R "+gene_bed+" -a 3000 -b 3000 --regionBodyLength 5000 --skipZeros -S *.bw -o samples.deeptools_generegionscalematrix.gz"
-    p = subprocess.Popen(cm_command, shell=True)
-    p.communicate()
-    hm_command = "plotHeatmap -m samples.deeptools_generegionscalematrix.gz -out samples.deeptools_readsHeatmap.png"
-    p = subprocess.Popen(hm_command, shell=True)
-    p.communicate()
-
-def run(dfile,genome):
-    #parse dfile, suppose data files are the same folder as design file
-    dfile = pd.read_csv(dfile)
-    #QC: multiBamSummary and plotCorrelation
-    run_qc(dfile['bamReads'], dfile['bamControl'], dfile['SampleID'])
-    #signal plots
-    run_signal(dfile['bamReads'],dfile['SampleID'],genome)
-
-def main():
-    argparser = prepare_argparser()
-    args = argparser.parse_args()
-    run(args.infile, args.genome)
-
-
-if __name__=="__main__":
-    main()
diff --git a/workflow/scripts/utils.py b/workflow/scripts/utils.py
index a50167367eec13fc0548bfbffbfa49fa8db58c60..6c2da13b056b386e454a309552f5a3623b1ef254 100644
--- a/workflow/scripts/utils.py
+++ b/workflow/scripts/utils.py
@@ -16,7 +16,6 @@ logger.propagate = True
 
 
 def run_pipe(steps, outfile=None):
-    # TODO: capture stderr
     from subprocess import Popen, PIPE
     p = None
     p_next = None
@@ -98,13 +97,13 @@ def rescale_scores(filename, scores_col, new_min=10, new_max=1000):
         'head -n 1 %s' % (sorted_fn),
         'cut -f %s' % (scores_col)])
     max_score = float(out.strip())
-    logger.info("rescale_scores: max_score = %s" % (max_score))
+    logger.info("rescale_scores: max_score = %s", max_score)
 
     out, err = run_pipe([
         'tail -n 1 %s' % (sorted_fn),
         'cut -f %s' % (scores_col)])
     min_score = float(out.strip())
-    logger.info("rescale_scores: min_score = %s" % (min_score))
+    logger.info("rescale_scores: min_score = %s", min_score)
 
     a = min_score
     b = max_score
diff --git a/workflow/scripts/xcor.py b/workflow/scripts/xcor.py
index 2737984df794dca446f1f3f5bfb60c68ace48621..d0637391ac8fe00bc271a482f898ccfe3cde5a11 100644
--- a/workflow/scripts/xcor.py
+++ b/workflow/scripts/xcor.py
@@ -26,7 +26,7 @@ logger.setLevel(logging.INFO)
 # strip_extensions strips from the right inward, so
 # the expected right-most extensions should appear first (like .gz)
 # Modified from J. Seth Strattan
-STRIP_EXTENSIONS = ['.gz', '.tagAlign', '.bedse', 'bedpe' ]
+STRIP_EXTENSIONS = ['.gz', '.tagAlign', '.bedse', 'bedpe']
 
 
 def get_args():
@@ -67,21 +67,20 @@ def check_tools():
 
 def xcor(tag, paired):
     '''Use spp to calculate cross-correlation stats.'''
-    extension
 
     tag_basename = os.path.basename(utils.strip_extensions(tag, STRIP_EXTENSIONS))
     uncompressed_tag_filename = tag_basename
 
     # Subsample tagAlign file
-    NREADS = 15000000
+    number_reads = 15000000
     subsampled_tag_filename = \
-        tag_basename + ".%d.tagAlign.gz" % (NREADS/1000000)
+        tag_basename + ".%d.tagAlign.gz" % (number_reads/1000000)
 
     steps = [
         'zcat %s' % (tag),
         'grep -v "chrM"',
-        'shuf -n %d --random-source=%s' % (NREADS, tag)]
+        'shuf -n %d --random-source=%s' % (number_reads, tag)]
 
     if paired:
         steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])
@@ -114,7 +113,6 @@ def xcor(tag, paired):
         cc_plot_filename, cc_scores_filename)
     ])
 
-    return cc_scores_filename
 
 def main():
@@ -130,7 +128,7 @@ def main():
     check_tools()
 
     # Calculate Cross-correlation
-    xcor_filename = xcor(tag, paired)
+    xcor(tag, paired)
 
 
 if __name__ == '__main__':