Skip to content
Snippets Groups Projects
Commit 4e68354e authored by Venkat Malladi's avatar Venkat Malladi
Browse files

Add in deeptools code to generate QC from deeptools.

parent 471773a6
Branches
Tags
1 merge request!8Resolve "Update deeptools"
Pipeline #1072 failed with stage
in 15 seconds
...@@ -47,6 +47,7 @@ workflow_modules: ...@@ -47,6 +47,7 @@ workflow_modules:
- 'samtools/1.4.1' - 'samtools/1.4.1'
- 'sambamba/0.6.6' - 'sambamba/0.6.6'
- 'bedtools/2.26.0' - 'bedtools/2.26.0'
- 'deeptools/2.5.0.1'
# A list of parameters used by the workflow, defining how to present them, # A list of parameters used by the workflow, defining how to present them,
# options etc in the web interface. For each parameter: # options etc in the web interface. For each parameter:
......
sample_id experiment_id biosample factor treatment replicate control_id fastq_read1 sample_id experiment_id biosample factor treatment replicate control_id fastq_read1
ENCBS844FSC ENCSR238SGC limb H3K4me1 None 1 ENCBS844FSC ENCFF833BLU.fastq.gz ENCLB144FDT ENCSR238SGC limb H3K4me1 None 1 ENCLB304SBJ ENCFF833BLU.fastq.gz
ENCBS892NXC ENCSR238SGC limb H3K4me1 None 2 ENCBS892NXC ENCFF646LXU.fastq.gz ENCLB831RUI ENCSR238SGC limb H3K4me1 None 2 ENCLB410VVO ENCFF646LXU.fastq.gz
ENCBS844FSC ENCSR687ALB limb Control None 1 ENCBS844FSC ENCFF524CAC.fastq.gz ENCLB304SBJ ENCSR687ALB limb Control None 1 ENCLB304SBJ ENCFF524CAC.fastq.gz
ENCBS892NXC ENCSR687ALB limb Control None 2 ENCBS892NXC ENCFF163AJI.fastq.gz ENCLB410VVO ENCSR687ALB limb Control None 2 ENCLB410VVO ENCFF163AJI.fastq.gz
sample_id experiment_id biosample factor treatment replicate control_id fastq_read1 fastq_read2 sample_id experiment_id biosample factor treatment replicate control_id fastq_read1 fastq_read2
ENCBS609QTY ENCSR729LGA MCF-7 SP1 None 1 ENCBS216AOQ ENCFF957SQS.fastq.gz ENCFF582IOZ.fastq.gz ENCLB637LZP ENCSR729LGA MCF-7 SP1 None 1 ENCLB678IDC ENCFF957SQS.fastq.gz ENCFF582IOZ.fastq.gz
ENCBS200IWR ENCSR729LGA MCF-7 SP1 None 2 ENCBS034XKZ ENCFF330MCZ.fastq.gz ENCFF293YFE.fastq.gz ENCLB568IYX ENCSR729LGA MCF-7 SP1 None 2 ENCLB336TVW ENCFF330MCZ.fastq.gz ENCFF293YFE.fastq.gz
ENCBS216AOQ ENCSR217LRF MCF-7 Control None 1 ENCBS216AOQ ENCFF002DTU.fastq.gz ENCFF002EFI.fastq.gz ENCLB678IDC ENCSR217LRF MCF-7 Control None 1 ENCLB678IDC ENCFF002DTU.fastq.gz ENCFF002EFI.fastq.gz
ENCBS034XKZ ENCSR217LRF MCF-7 Control None 2 ENCBS034XKZ ENCFF002EFG.fastq.gz ENCFF002DTS.fastq.gz ENCLB336TVW ENCSR217LRF MCF-7 Control None 2 ENCLB336TVW ENCFF002EFG.fastq.gz ENCFF002DTS.fastq.gz
...@@ -19,6 +19,10 @@ process { ...@@ -19,6 +19,10 @@ process {
module = ['python/3.6.1-2-anaconda', 'samtools/1.4.1', 'sambamba/0.6.6', 'bedtools/2.26.0'] module = ['python/3.6.1-2-anaconda', 'samtools/1.4.1', 'sambamba/0.6.6', 'bedtools/2.26.0']
cpus = 32 cpus = 32
} }
$experimentQC {
module = ['python/3.6.1-2-anaconda', 'deeptools/2.5.0.1']
executor = 'local'
}
} }
params { params {
......
...@@ -162,3 +162,28 @@ process filterReads { ...@@ -162,3 +162,28 @@ process filterReads {
} }
} }
// Define channel collecting new design file
dedupDesign = dedupReads.
collectFile(name:'design_dedup.tsv', seed:"sample_id\tbam_reads\tbam_index\tbiosample\tfactor\ttreatment\treplicate\controlId\n", storeDir:""$baseDir/output/design")
// Quality Metrics using deeptools
process experimentQC {
publishDir "$baseDir/output/${task.process}", mode: 'copy'
input:
file dedupDesign
output:
file '*.{png,npz}' into deepToolsStats
script:
"""
python3 $baseDir/scripts/experiment_qc.py -d $dedupDesign
"""
}
#!/usr/bin/python #!/usr/bin/env python3
# programmer : bbc
# usage:
import sys '''Experiment correlation and enrichment of reads over genome-wide bins.'''
import argparse as ap
import argparse
import logging import logging
import subprocess import subprocess
import shutil
import pandas as pd import pandas as pd
from multiprocessing import Pool from multiprocessing import cpu_count
logging.basicConfig(level=10)
EPILOG = '''
For more details:
def prepare_argparser(): %(prog)s --help
description = "Make wig file for given bed using bam" '''
epilog = "For command line options of each command, type %(prog)% COMMAND -h"
argparser = ap.ArgumentParser(description=description, epilog = epilog) # SETTINGS
argparser.add_argument("-i","--input",dest = "infile",type=str,required=True, help="input BAM file")
argparser.add_argument("-g","--genome",dest = "genome",type=str,required=True, help="genome", default="hg19") logger = logging.getLogger(__name__)
#argparser.add_argument("-b","--bed",dest="bedfile",type=str,required=True, help = "Gene locus in bed format") logger.addHandler(logging.NullHandler())
#argparser.add_argument("-s","--strandtype",dest="stranded",type=str,default="none", choices=["none","reverse","yes"]) logger.propagate = False
#argparser.add_argument("-n","--name",dest="trackName",type=str,default="UserTrack",help = "track name for bedgraph header") logger.setLevel(logging.INFO)
return(argparser)
def run_qc(files, controls, labels): def get_args():
mbs_command = "multiBamSummary bins --bamfiles "+' '.join(files)+" -out sample_mbs.npz" '''Define arguments.'''
p = subprocess.Popen(mbs_command, shell=True) parser = argparse.ArgumentParser(
#logging.debug(mbs_command) description=__doc__, epilog=EPILOG,
p.communicate() formatter_class=argparse.RawDescriptionHelpFormatter)
pcor_command = "plotCorrelation -in sample_mbs.npz --corMethod spearman --skipZeros --plotTitle \"Spearman Correlation of Read Counts\" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers -o experiment.deeptools.heatmap_spearmanCorr_readCounts_v2.png --labels "+" ".join(labels)
#logging.debug(pcor_command) parser.add_argument('-d', '--design',
p = subprocess.Popen(pcor_command, shell=True) help="The design file to run QC (tsv format).",
p.communicate() required=True)
#plotCoverage
pcov_command = "plotCoverage -b "+" ".join(files)+" --plotFile experiment.deeptools_coverage.png -n 1000000 --plotTitle \"sample coverage\" --ignoreDuplicates --minMappingQuality 10" args = parser.parse_args()
p = subprocess.Popen(pcov_command, shell=True) return args
p.communicate()
#draw fingerprints plots
for treat,ctrl,name in zip(files,controls,labels): def check_tools():
fp_command = "plotFingerprint -b "+treat+" "+ctrl+" --labels "+name+" control --plotFile "+name+".deeptools_fingerprints.png" '''Checks for required componenets on user system.'''
p = subprocess.Popen(fp_command, shell=True)
p.communicate() logger.info('Checking for required libraries and components on this system')
def bam2bw_wrapper(command): deeptools_path = shutil.which("deeptools")
p = subprocess.Popen(command, shell=True) if deeptools_path:
p.communicate() logger.info('Found deeptools: %s', deeptools_path)
else:
def run_signal(files, labels, genome): logger.error('Missing deeptools')
#compute matrix and draw profile and heatmap raise Exception('Missing deeptools')
gene_bed = genome+"/gene.bed"#"/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/genome/"+genome+"/gene.bed"
bw_commands = []
for f in files: def generate_read_summary(design):
bw_commands.append("bamCoverage -bs 10 -b "+f+" -o "+f.replace("bam","bw")) '''Generate summary of data based on read counts.'''
work_pool = Pool(min(len(files), 12))
work_pool.map(bam2bw_wrapper, bw_commands) bam_files = ' '.join(design['bam_reads'])
work_pool.close() labels = ' '.join(design['sample_id'])
work_pool.join() mbs_filename = 'sample_mbs.npz'
cm_command = "computeMatrix scale-regions -R "+gene_bed+" -a 3000 -b 3000 --regionBodyLength 5000 --skipZeros -S *.bw -o samples.deeptools_generegionscalematrix.gz" mbs_command = (
p = subprocess.Popen(cm_command, shell=True) "multiBamSummary bins -p %d --bamfiles %s --labels %s -out %s"
p.communicate() % (cpu_count(), bam_files, labels, mbs_filename)
hm_command = "plotHeatmap -m samples.deeptools_generegionscalematrix.gz -out samples.deeptools_readsHeatmap.png" )
p = subprocess.Popen(hm_command, shell=True)
p.communicate() logger.info("Running deeptools with %s", mbs_command)
def run(dfile,genome): read_summary = subprocess.Popen(mbs_command, shell=True)
#parse dfile, suppose data files are the same folder as design file out, err = read_summary.communicate()
dfile = pd.read_csv(dfile)
#QC: multiBamSummary and plotCorrelation return mbs_filename
run_qc(dfile['bamReads'], dfile['bamControl'], dfile['SampleID'])
#signal plots
run_signal(dfile['bamReads'],dfile['SampleID'],genome) def check_correlation(mbs):
'''Check Spearman pairwise correlation of samples based on read counts.'''
spearman_filename = 'heatmap_SpearmanCorr.png'
spearman_params = "--corMethod spearman --skipZero" + \
" --plotTitle \"Spearman Correlation of Read Counts\"" + \
" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers"
spearman_command = (
"plotCorrelation -in %s %s -o %s"
% (mbs, spearman_params, spearman_filename)
)
logger.info("Running deeptools with %s", spearman_command)
spearman_correlation = subprocess.Popen(spearman_command, shell=True)
out, err = spearman_correlation.communicate()
def check_coverage(design):
'''Asses the sequencing depth of samples.'''
bam_files = ' '.join(design['bam_reads'])
labels = ' '.join(design['sample_id'])
coverage_filename = 'coverage.png'
coverage_params = "-n 1000000 --plotTitle \"Sample Coverage\"" + \
" --ignoreDuplicates --minMappingQuality 10"
coverage_command = (
"plotCoverage -b %s --labels %s %s --plotFile %s"
% (bam_files, labels, coverage_params, coverage_filename)
)
logger.info("Running deeptools with %s", coverage_command)
coverage_summary = subprocess.Popen(coverage_command, shell=True)
out, err = coverage_summary.communicate()
def update_controls(design):
'''Update design file to append controls list.'''
logger.info("Running control file update.")
file_dict = design[['sample_id', 'bam_reads']] \
.set_index('sample_id').T.to_dict()
design['control_reads'] = design['control_id'] \
.apply(lambda x: file_dict[x]['bam_reads'])
logger.info("Removing rows that are there own control.")
design = design[design['control_id'] != design['sample_id']]
return design
def check_enrichment(sample_id, control_id, sample_reads, control_reads):
'''Asses the enrichment per sample.'''
fingerprint_filename = sample_id + '_fingerprint.png'
fingerprint_command = (
"plotFingerprint -b %s %s --labels %s %s --plotFile %s"
% (sample_reads, control_reads, sample_id, control_id, fingerprint_filename)
)
logger.info("Running deeptools with %s", fingerprint_command)
fingerprint_summary = subprocess.Popen(fingerprint_command, shell=True)
out, err = fingerprint_summary.communicate()
def main(): def main():
argparser = prepare_argparser() args = get_args()
args = argparser.parse_args()
run(args.infile, args.genome) # Create a file handler
handler = logging.FileHandler('experiment_qc.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Read files
design_file = pd.read_csv(args.design, sep='\t')
# Run correlation
mbs_filename = generate_read_summary(design_file)
check_correlation(mbs_filename)
# Run coverage
check_coverage(design_file)
# Run enrichment
new_design = update_controls(design_file)
for index, row in new_design.iterrows():
check_enrichment(
row['sample_id'],
row['control_id'],
row['bam_reads'],
row['control_reads'])
if __name__=="__main__": if __name__ == '__main__':
main() main()
#!/usr/bin/env python3
import pytest
import os
import pandas as pd
from io import StringIO
import experiment_qc
test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../output/filterReads/'
DESIGN_STRING = """sample_id\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\tbam_reads
A_1\tA\tLiver\tH3K27ac\tNone\t1\tB_1\tA_1.bam
A_2\tA\tLiver\tH3K27ac\tNone\t2\tB_2\tA_2.bam
B_1\tB\tLiver\tInput\tNone\t1\tB_1\tB_1.bam
B_2\tB\tLiver\tInput\tNone\t2\tB_2\tB_2.bam
"""
@pytest.fixture
def design_bam():
design_file = StringIO(DESIGN_STRING)
design_df = pd.read_csv(design_file, sep="\t")
return design_df
def test_check_update_controls(design_bam):
new_design = experiment_qc.update_controls(design_bam)
assert new_design.loc[0, 'control_reads'] == "B_1.bam"
def test_experiment_qc_singleend():
assert os.path.exists(os.path.join(test_output_path, 'sample_mbs.npz'))
assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.png'))
assert os.path.exists(os.path.join(test_output_path, 'coverage.png'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT_fingerprint.png'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI_fingerprint.png'))
def test_experiment_qc_pairedend():
# Do the same thing for paired end data
pass
...@@ -30,7 +30,7 @@ def steps_2(steps_1): ...@@ -30,7 +30,7 @@ def steps_2(steps_1):
def test_run_one_step(steps_1, capsys): def test_run_one_step(steps_1, capsys):
check_output = 'ENCBS844FSC\tENCSR238SGC\tlimb\tH3K4me1\tNone\t1\tENCBS844FSC\tENCFF833BLU.fastq.gz'.encode('UTF-8') check_output = 'ENCLB144FDT\tENCSR238SGC\tlimb\tH3K4me1\tNone\t1\tENCLB304SBJ\tENCFF833BLU.fastq.gz'.encode('UTF-8')
out, err = utils.run_pipe(steps_1) out, err = utils.run_pipe(steps_1)
output, errors = capsys.readouterr() output, errors = capsys.readouterr()
assert "first step shlex to stdout" in output assert "first step shlex to stdout" in output
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment