diff --git a/astrocyte_pkg.yml b/astrocyte_pkg.yml index 023968245603d637a7d8a9d4de07108ef961dcf0..62c62971b76222f9bcd7786906b10d450ad3a5b8 100644 --- a/astrocyte_pkg.yml +++ b/astrocyte_pkg.yml @@ -47,6 +47,7 @@ workflow_modules: - 'samtools/1.4.1' - 'sambamba/0.6.6' - 'bedtools/2.26.0' + - 'deeptools/2.5.0.1' # A list of parameters used by the workflow, defining how to present them, # options etc in the web interface. For each parameter: diff --git a/test_data/design_ENCSR238SGC_SE.txt b/test_data/design_ENCSR238SGC_SE.txt index d024a4caf8ed06f50cff74bc47787bc4cca844c7..45cf863983dd5c697c36092a0e0d7a8826a64bc3 100644 --- a/test_data/design_ENCSR238SGC_SE.txt +++ b/test_data/design_ENCSR238SGC_SE.txt @@ -1,5 +1,5 @@ sample_id experiment_id biosample factor treatment replicate control_id fastq_read1 -ENCBS844FSC ENCSR238SGC limb H3K4me1 None 1 ENCBS844FSC ENCFF833BLU.fastq.gz -ENCBS892NXC ENCSR238SGC limb H3K4me1 None 2 ENCBS892NXC ENCFF646LXU.fastq.gz -ENCBS844FSC ENCSR687ALB limb Control None 1 ENCBS844FSC ENCFF524CAC.fastq.gz -ENCBS892NXC ENCSR687ALB limb Control None 2 ENCBS892NXC ENCFF163AJI.fastq.gz +ENCLB144FDT ENCSR238SGC limb H3K4me1 None 1 ENCLB304SBJ ENCFF833BLU.fastq.gz +ENCLB831RUI ENCSR238SGC limb H3K4me1 None 2 ENCLB410VVO ENCFF646LXU.fastq.gz +ENCLB304SBJ ENCSR687ALB limb Control None 1 ENCLB304SBJ ENCFF524CAC.fastq.gz +ENCLB410VVO ENCSR687ALB limb Control None 2 ENCLB410VVO ENCFF163AJI.fastq.gz diff --git a/test_data/design_ENCSR729LGA_PE.txt b/test_data/design_ENCSR729LGA_PE.txt index 0e33564dcbc3f88971bf0c9cc7f90c181c2eca76..1d74d0351ec49ee03b3c002facecc37ac486fba0 100644 --- a/test_data/design_ENCSR729LGA_PE.txt +++ b/test_data/design_ENCSR729LGA_PE.txt @@ -1,5 +1,5 @@ sample_id experiment_id biosample factor treatment replicate control_id fastq_read1 fastq_read2 -ENCBS609QTY ENCSR729LGA MCF-7 SP1 None 1 ENCBS216AOQ ENCFF957SQS.fastq.gz ENCFF582IOZ.fastq.gz -ENCBS200IWR ENCSR729LGA MCF-7 SP1 None 2 ENCBS034XKZ ENCFF330MCZ.fastq.gz ENCFF293YFE.fastq.gz -ENCBS216AOQ ENCSR217LRF MCF-7 Control None 1 ENCBS216AOQ ENCFF002DTU.fastq.gz ENCFF002EFI.fastq.gz -ENCBS034XKZ ENCSR217LRF MCF-7 Control None 2 ENCBS034XKZ ENCFF002EFG.fastq.gz ENCFF002DTS.fastq.gz +ENCLB637LZP ENCSR729LGA MCF-7 SP1 None 1 ENCLB678IDC ENCFF957SQS.fastq.gz ENCFF582IOZ.fastq.gz +ENCLB568IYX ENCSR729LGA MCF-7 SP1 None 2 ENCLB336TVW ENCFF330MCZ.fastq.gz ENCFF293YFE.fastq.gz +ENCLB678IDC ENCSR217LRF MCF-7 Control None 1 ENCLB678IDC ENCFF002DTU.fastq.gz ENCFF002EFI.fastq.gz +ENCLB336TVW ENCSR217LRF MCF-7 Control None 2 ENCLB336TVW ENCFF002EFG.fastq.gz ENCFF002DTS.fastq.gz diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index 3268a6efb9431382e9f12887539ee242615e4ee1..f9d95c2d75fbf1d1e483af1064a52599f8bc925d 100644 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -19,6 +19,10 @@ process { module = ['python/3.6.1-2-anaconda', 'samtools/1.4.1', 'sambamba/0.6.6', 'bedtools/2.26.0'] cpus = 32 } + $experimentQC { + module = ['python/3.6.1-2-anaconda', 'deeptools/2.5.0.1'] + executor = 'local' + } } params { diff --git a/workflow/main.nf b/workflow/main.nf index fd0c59d503f1c60e7f751db3058baa4e95f3b6be..384596cb4603af3a5d84586c7d9462b2b2c68790 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -162,3 +162,28 @@ process filterReads { } } + +// Define channel collecting new design file +dedupDesign = dedupReads. + collectFile(name:'design_dedup.tsv', seed:"sample_id\tbam_reads\tbam_index\tbiosample\tfactor\ttreatment\treplicate\controlId\n", storeDir:""$baseDir/output/design") + +// Quality Metrics using deeptools +process experimentQC { + + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + + file dedupDesign + + output: + + file '*.{png,npz}' into deepToolsStats + + script: + + """ + python3 $baseDir/scripts/experiment_qc.py -d $dedupDesign + """ + +} diff --git a/workflow/scripts/experiment_qc.py b/workflow/scripts/experiment_qc.py index 75655172cac2cfc1a113ab81870da66407254ba6..47a398344fa789fb2a5c5abf159db5375b500efe 100644 --- a/workflow/scripts/experiment_qc.py +++ b/workflow/scripts/experiment_qc.py @@ -1,81 +1,177 @@ -#!/usr/bin/python -# programmer : bbc -# usage: +#!/usr/bin/env python3 -import sys -import argparse as ap +'''Experiment correlation and enrichment of reads over genome-wide bins.''' + + +import argparse import logging import subprocess +import shutil import pandas as pd -from multiprocessing import Pool -logging.basicConfig(level=10) - - -def prepare_argparser(): - description = "Make wig file for given bed using bam" - epilog = "For command line options of each command, type %(prog)% COMMAND -h" - argparser = ap.ArgumentParser(description=description, epilog = epilog) - argparser.add_argument("-i","--input",dest = "infile",type=str,required=True, help="input BAM file") - argparser.add_argument("-g","--genome",dest = "genome",type=str,required=True, help="genome", default="hg19") - #argparser.add_argument("-b","--bed",dest="bedfile",type=str,required=True, help = "Gene locus in bed format") - #argparser.add_argument("-s","--strandtype",dest="stranded",type=str,default="none", choices=["none","reverse","yes"]) - #argparser.add_argument("-n","--name",dest="trackName",type=str,default="UserTrack",help = "track name for bedgraph header") - return(argparser) - -def run_qc(files, controls, labels): - mbs_command = "multiBamSummary bins --bamfiles "+' '.join(files)+" -out sample_mbs.npz" - p = subprocess.Popen(mbs_command, shell=True) - #logging.debug(mbs_command) - p.communicate() - pcor_command = "plotCorrelation -in sample_mbs.npz --corMethod spearman --skipZeros --plotTitle \"Spearman Correlation of Read Counts\" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers -o experiment.deeptools.heatmap_spearmanCorr_readCounts_v2.png --labels "+" ".join(labels) - #logging.debug(pcor_command) - p = subprocess.Popen(pcor_command, shell=True) - p.communicate() - #plotCoverage - pcov_command = "plotCoverage -b "+" ".join(files)+" --plotFile experiment.deeptools_coverage.png -n 1000000 --plotTitle \"sample coverage\" --ignoreDuplicates --minMappingQuality 10" - p = subprocess.Popen(pcov_command, shell=True) - p.communicate() - #draw fingerprints plots - for treat,ctrl,name in zip(files,controls,labels): - fp_command = "plotFingerprint -b "+treat+" "+ctrl+" --labels "+name+" control --plotFile "+name+".deeptools_fingerprints.png" - p = subprocess.Popen(fp_command, shell=True) - p.communicate() - -def bam2bw_wrapper(command): - p = subprocess.Popen(command, shell=True) - p.communicate() - -def run_signal(files, labels, genome): - #compute matrix and draw profile and heatmap - gene_bed = genome+"/gene.bed"#"/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/genome/"+genome+"/gene.bed" - bw_commands = [] - for f in files: - bw_commands.append("bamCoverage -bs 10 -b "+f+" -o "+f.replace("bam","bw")) - work_pool = Pool(min(len(files), 12)) - work_pool.map(bam2bw_wrapper, bw_commands) - work_pool.close() - work_pool.join() - - cm_command = "computeMatrix scale-regions -R "+gene_bed+" -a 3000 -b 3000 --regionBodyLength 5000 --skipZeros -S *.bw -o samples.deeptools_generegionscalematrix.gz" - p = subprocess.Popen(cm_command, shell=True) - p.communicate() - hm_command = "plotHeatmap -m samples.deeptools_generegionscalematrix.gz -out samples.deeptools_readsHeatmap.png" - p = subprocess.Popen(hm_command, shell=True) - p.communicate() - -def run(dfile,genome): - #parse dfile, suppose data files are the same folder as design file - dfile = pd.read_csv(dfile) - #QC: multiBamSummary and plotCorrelation - run_qc(dfile['bamReads'], dfile['bamControl'], dfile['SampleID']) - #signal plots - run_signal(dfile['bamReads'],dfile['SampleID'],genome) +from multiprocessing import cpu_count + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +# SETTINGS + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + + +def get_args(): + '''Define arguments.''' + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-d', '--design', + help="The design file to run QC (tsv format).", + required=True) + + args = parser.parse_args() + return args + + +def check_tools(): + '''Checks for required componenets on user system.''' + + logger.info('Checking for required libraries and components on this system') + + deeptools_path = shutil.which("deeptools") + if deeptools_path: + logger.info('Found deeptools: %s', deeptools_path) + else: + logger.error('Missing deeptools') + raise Exception('Missing deeptools') + + +def generate_read_summary(design): + '''Generate summary of data based on read counts.''' + + bam_files = ' '.join(design['bam_reads']) + labels = ' '.join(design['sample_id']) + mbs_filename = 'sample_mbs.npz' + + mbs_command = ( + "multiBamSummary bins -p %d --bamfiles %s --labels %s -out %s" + % (cpu_count(), bam_files, labels, mbs_filename) + ) + + logger.info("Running deeptools with %s", mbs_command) + + read_summary = subprocess.Popen(mbs_command, shell=True) + out, err = read_summary.communicate() + + return mbs_filename + + +def check_correlation(mbs): + '''Check Spearman pairwise correlation of samples based on read counts.''' + + spearman_filename = 'heatmap_SpearmanCorr.png' + spearman_params = "--corMethod spearman --skipZero" + \ + " --plotTitle \"Spearman Correlation of Read Counts\"" + \ + " --whatToPlot heatmap --colorMap RdYlBu --plotNumbers" + + spearman_command = ( + "plotCorrelation -in %s %s -o %s" + % (mbs, spearman_params, spearman_filename) + ) + + logger.info("Running deeptools with %s", spearman_command) + + spearman_correlation = subprocess.Popen(spearman_command, shell=True) + out, err = spearman_correlation.communicate() + + +def check_coverage(design): + '''Asses the sequencing depth of samples.''' + + bam_files = ' '.join(design['bam_reads']) + labels = ' '.join(design['sample_id']) + coverage_filename = 'coverage.png' + coverage_params = "-n 1000000 --plotTitle \"Sample Coverage\"" + \ + " --ignoreDuplicates --minMappingQuality 10" + + coverage_command = ( + "plotCoverage -b %s --labels %s %s --plotFile %s" + % (bam_files, labels, coverage_params, coverage_filename) + ) + + logger.info("Running deeptools with %s", coverage_command) + + coverage_summary = subprocess.Popen(coverage_command, shell=True) + out, err = coverage_summary.communicate() + + +def update_controls(design): + '''Update design file to append controls list.''' + + logger.info("Running control file update.") + + file_dict = design[['sample_id', 'bam_reads']] \ + .set_index('sample_id').T.to_dict() + + design['control_reads'] = design['control_id'] \ + .apply(lambda x: file_dict[x]['bam_reads']) + + logger.info("Removing rows that are there own control.") + + design = design[design['control_id'] != design['sample_id']] + + return design + + +def check_enrichment(sample_id, control_id, sample_reads, control_reads): + '''Asses the enrichment per sample.''' + + fingerprint_filename = sample_id + '_fingerprint.png' + + fingerprint_command = ( + "plotFingerprint -b %s %s --labels %s %s --plotFile %s" + % (sample_reads, control_reads, sample_id, control_id, fingerprint_filename) + ) + + logger.info("Running deeptools with %s", fingerprint_command) + + fingerprint_summary = subprocess.Popen(fingerprint_command, shell=True) + out, err = fingerprint_summary.communicate() + def main(): - argparser = prepare_argparser() - args = argparser.parse_args() - run(args.infile, args.genome) - + args = get_args() + + # Create a file handler + handler = logging.FileHandler('experiment_qc.log') + logger.addHandler(handler) + + # Check if tools are present + check_tools() + + # Read files + design_file = pd.read_csv(args.design, sep='\t') + + # Run correlation + mbs_filename = generate_read_summary(design_file) + check_correlation(mbs_filename) + + # Run coverage + check_coverage(design_file) + + # Run enrichment + new_design = update_controls(design_file) + for index, row in new_design.iterrows(): + check_enrichment( + row['sample_id'], + row['control_id'], + row['bam_reads'], + row['control_reads']) + -if __name__=="__main__": - main() +if __name__ == '__main__': + main() diff --git a/workflow/tests/test_experiment_qc.py b/workflow/tests/test_experiment_qc.py new file mode 100644 index 0000000000000000000000000000000000000000..ad5a8675380b6c5d6c056b4c52e7b94aef920a30 --- /dev/null +++ b/workflow/tests/test_experiment_qc.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 + +import pytest +import os +import pandas as pd +from io import StringIO +import experiment_qc + +test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ + '/../output/filterReads/' + +DESIGN_STRING = """sample_id\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\tbam_reads +A_1\tA\tLiver\tH3K27ac\tNone\t1\tB_1\tA_1.bam +A_2\tA\tLiver\tH3K27ac\tNone\t2\tB_2\tA_2.bam +B_1\tB\tLiver\tInput\tNone\t1\tB_1\tB_1.bam +B_2\tB\tLiver\tInput\tNone\t2\tB_2\tB_2.bam +""" + + +@pytest.fixture +def design_bam(): + design_file = StringIO(DESIGN_STRING) + design_df = pd.read_csv(design_file, sep="\t") + return design_df + + +def test_check_update_controls(design_bam): + new_design = experiment_qc.update_controls(design_bam) + assert new_design.loc[0, 'control_reads'] == "B_1.bam" + + +def test_experiment_qc_singleend(): + assert os.path.exists(os.path.join(test_output_path, 'sample_mbs.npz')) + assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.png')) + assert os.path.exists(os.path.join(test_output_path, 'coverage.png')) + assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT_fingerprint.png')) + assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI_fingerprint.png')) + + +def test_experiment_qc_pairedend(): + # Do the same thing for paired end data + pass diff --git a/workflow/tests/test_utils.py b/workflow/tests/test_utils.py index b5a18a67de45fc3aede2fe009d4444ad8393da08..0abba1be016686a3df4053ba72a763e6488fb99a 100644 --- a/workflow/tests/test_utils.py +++ b/workflow/tests/test_utils.py @@ -30,7 +30,7 @@ def steps_2(steps_1): def test_run_one_step(steps_1, capsys): - check_output = 'ENCBS844FSC\tENCSR238SGC\tlimb\tH3K4me1\tNone\t1\tENCBS844FSC\tENCFF833BLU.fastq.gz'.encode('UTF-8') + check_output = 'ENCLB144FDT\tENCSR238SGC\tlimb\tH3K4me1\tNone\t1\tENCLB304SBJ\tENCFF833BLU.fastq.gz'.encode('UTF-8') out, err = utils.run_pipe(steps_1) output, errors = capsys.readouterr() assert "first step shlex to stdout" in output