#!/usr/bin/env python3 # # * -------------------------------------------------------------------------- # * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md) # * -------------------------------------------------------------------------- # '''Experiment correlation and enrichment of reads over genome-wide bins.''' import argparse import logging import subprocess import shutil from multiprocessing import cpu_count import pandas as pd EPILOG = ''' For more details: %(prog)s --help ''' # SETTINGS logger = logging.getLogger(__name__) logger.addHandler(logging.NullHandler()) logger.propagate = False logger.setLevel(logging.INFO) def get_args(): '''Define arguments.''' parser = argparse.ArgumentParser( description=__doc__, epilog=EPILOG, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument('-d', '--design', help="The design file to run QC (tsv format).", required=True) parser.add_argument('-e', '--extension', help="Number of base pairs to extend the reads", type=int, required=True) args = parser.parse_args() return args def check_tools(): '''Checks for required componenets on user system.''' logger.info('Checking for required libraries and components on this system') deeptools_path = shutil.which("deeptools") if deeptools_path: logger.info('Found deeptools: %s', deeptools_path) # Get Version deeptools_version_command = "deeptools --version" deeptools_version = subprocess.check_output(deeptools_version_command, shell=True, stderr=subprocess.STDOUT) # Write to file deeptools_file = open("version_deeptools.txt", "wb") deeptools_file.write(deeptools_version) deeptools_file.close() else: logger.error('Missing deeptools') raise Exception('Missing deeptools') def generate_read_summary(design, extension): '''Generate summary of data based on read counts.''' bam_files = ' '.join(design['bam_reads']) labels = ' '.join(design['sample_id']) mbs_filename = 'sample_mbs.npz' mbs_command = ( "multiBamSummary bins -p %d --bamfiles %s --extendReads %d --labels %s -out %s" % (cpu_count(), bam_files, extension, labels, mbs_filename) ) logger.info("Running deeptools with %s", mbs_command) read_summary = subprocess.Popen(mbs_command, shell=True) out, err = read_summary.communicate() return mbs_filename def check_spearman_correlation(mbs): '''Check Spearman pairwise correlation of samples based on read counts.''' spearman_filename = 'heatmap_SpearmanCorr.pdf' spearman_params = "--corMethod spearman --skipZero" + \ " --plotTitle \"Spearman Correlation of Read Counts\"" + \ " --whatToPlot heatmap --colorMap RdYlBu --plotNumbers" spearman_command = ( "plotCorrelation -in %s %s -o %s" % (mbs, spearman_params, spearman_filename) ) logger.info("Running deeptools with %s", spearman_command) spearman_correlation = subprocess.Popen(spearman_command, shell=True) out, err = spearman_correlation.communicate() def check_pearson_correlation(mbs): '''Check Pearson pairwise correlation of samples based on read counts.''' pearson_filename = 'heatmap_PearsonCorr.pdf' pearson_params = "--corMethod pearson --skipZero" + \ " --plotTitle \"Pearson Correlation of Read Counts\"" + \ " --whatToPlot heatmap --colorMap RdYlBu --plotNumbers" pearson_command = ( "plotCorrelation -in %s %s -o %s" % (mbs, pearson_params, pearson_filename) ) logger.info("Running deeptools with %s", pearson_command) pearson_correlation = subprocess.Popen(pearson_command, shell=True) out, err = pearson_correlation.communicate() def check_coverage(design, extension): '''Asses the sequencing depth of samples.''' bam_files = ' '.join(design['bam_reads']) labels = ' '.join(design['sample_id']) coverage_filename = 'coverage.pdf' coverage_params = "-n 1000000 --plotTitle \"Sample Coverage\"" + \ " --ignoreDuplicates --minMappingQuality 10" coverage_command = ( "plotCoverage -b %s --extendReads %d --labels %s %s --plotFile %s" % (bam_files, extension, labels, coverage_params, coverage_filename) ) logger.info("Running deeptools with %s", coverage_command) coverage_summary = subprocess.Popen(coverage_command, shell=True) out, err = coverage_summary.communicate() def update_controls(design): '''Update design file to append controls list.''' logger.info("Running control file update.") file_dict = design[['sample_id', 'bam_reads']] \ .set_index('sample_id').T.to_dict() design['control_reads'] = design['control_id'] \ .apply(lambda x: file_dict[x]['bam_reads']) logger.info("Removing rows that are there own control.") design = design[design['control_id'] != design['sample_id']] return design def check_enrichment(sample_id, control_id, sample_reads, control_reads, extension): '''Asses the enrichment per sample.''' fingerprint_filename = sample_id + '_fingerprint.pdf' fingerprint_command = ( "plotFingerprint -b %s %s --extendReads %d --labels %s %s --plotFile %s" % (sample_reads, control_reads, extension, sample_id, control_id, fingerprint_filename) ) logger.info("Running deeptools with %s", fingerprint_command) fingerprint_summary = subprocess.Popen(fingerprint_command, shell=True) out, err = fingerprint_summary.communicate() def main(): args = get_args() design = args.design extension = args.extension # Create a file handler handler = logging.FileHandler('experiment_qc.log') logger.addHandler(handler) # Check if tools are present check_tools() # Read files design_df = pd.read_csv(design, sep='\t') # Run correlation mbs_filename = generate_read_summary(design_df, extension) check_spearman_correlation(mbs_filename) check_pearson_correlation(mbs_filename) # Run coverage check_coverage(design_df, extension) # Run enrichment new_design_df = update_controls(design_df) for index, row in new_design_df.iterrows(): check_enrichment( row['sample_id'], row['control_id'], row['bam_reads'], row['control_reads'], extension) if __name__ == '__main__': main()