Skip to content
Snippets Groups Projects
Commit e914ddd5 authored by Venkat Malladi's avatar Venkat Malladi
Browse files

Merge branch '6-deeptools' into 'master'

Resolve "Update deeptools"

Closes #6

See merge request !8
parents a1fdbeab b8c85d1e
Branches
Tags
1 merge request!8Resolve "Update deeptools"
Pipeline #1081 canceled with stage
in 46 seconds
......@@ -47,6 +47,7 @@ workflow_modules:
- 'samtools/1.4.1'
- 'sambamba/0.6.6'
- 'bedtools/2.26.0'
- 'deeptools/2.5.0.1'
# A list of parameters used by the workflow, defining how to present them,
# options etc in the web interface. For each parameter:
......
sample_id experiment_id biosample factor treatment replicate control_id fastq_read1
ENCBS844FSC ENCSR238SGC limb H3K4me1 None 1 ENCBS844FSC ENCFF833BLU.fastq.gz
ENCBS892NXC ENCSR238SGC limb H3K4me1 None 2 ENCBS892NXC ENCFF646LXU.fastq.gz
ENCBS844FSC ENCSR687ALB limb Control None 1 ENCBS844FSC ENCFF524CAC.fastq.gz
ENCBS892NXC ENCSR687ALB limb Control None 2 ENCBS892NXC ENCFF163AJI.fastq.gz
ENCLB144FDT ENCSR238SGC limb H3K4me1 None 1 ENCLB304SBJ ENCFF833BLU.fastq.gz
ENCLB831RUI ENCSR238SGC limb H3K4me1 None 2 ENCLB410VVO ENCFF646LXU.fastq.gz
ENCLB304SBJ ENCSR687ALB limb Control None 1 ENCLB304SBJ ENCFF524CAC.fastq.gz
ENCLB410VVO ENCSR687ALB limb Control None 2 ENCLB410VVO ENCFF163AJI.fastq.gz
sample_id experiment_id biosample factor treatment replicate control_id fastq_read1 fastq_read2
ENCBS609QTY ENCSR729LGA MCF-7 SP1 None 1 ENCBS216AOQ ENCFF957SQS.fastq.gz ENCFF582IOZ.fastq.gz
ENCBS200IWR ENCSR729LGA MCF-7 SP1 None 2 ENCBS034XKZ ENCFF330MCZ.fastq.gz ENCFF293YFE.fastq.gz
ENCBS216AOQ ENCSR217LRF MCF-7 Control None 1 ENCBS216AOQ ENCFF002DTU.fastq.gz ENCFF002EFI.fastq.gz
ENCBS034XKZ ENCSR217LRF MCF-7 Control None 2 ENCBS034XKZ ENCFF002EFG.fastq.gz ENCFF002DTS.fastq.gz
ENCLB637LZP ENCSR729LGA MCF-7 SP1 None 1 ENCLB678IDC ENCFF957SQS.fastq.gz ENCFF582IOZ.fastq.gz
ENCLB568IYX ENCSR729LGA MCF-7 SP1 None 2 ENCLB336TVW ENCFF330MCZ.fastq.gz ENCFF293YFE.fastq.gz
ENCLB678IDC ENCSR217LRF MCF-7 Control None 1 ENCLB678IDC ENCFF002DTU.fastq.gz ENCFF002EFI.fastq.gz
ENCLB336TVW ENCSR217LRF MCF-7 Control None 2 ENCLB336TVW ENCFF002EFG.fastq.gz ENCFF002DTS.fastq.gz
......@@ -19,6 +19,10 @@ process {
module = ['python/3.6.1-2-anaconda', 'samtools/1.4.1', 'sambamba/0.6.6', 'bedtools/2.26.0']
cpus = 32
}
$experimentQC {
module = ['python/3.6.1-2-anaconda', 'deeptools/2.5.0.1']
executor = 'local'
}
}
params {
......
......@@ -162,3 +162,30 @@ process filterReads {
}
}
// Define channel collecting new design file
dedupDesign = dedupReads
.map{ sampleId, bam, bai, biosample, factor, treatment, replicate, controlId ->
"$sampleId\t$bam\t$bai\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"}
.collectFile(name:'design_dedup.tsv', seed:"sample_id\tbam_reads\tbam_index\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design")
// Quality Metrics using deeptools
process experimentQC {
publishDir "$baseDir/output/${task.process}", mode: 'copy'
input:
file dedupDesign
output:
file '*.{png,npz}' into deepToolsStats
script:
"""
python3 $baseDir/scripts/experiment_qc.py -d $dedupDesign
"""
}
#!/usr/bin/env python3
'''Experiment correlation and enrichment of reads over genome-wide bins.'''
import argparse
import logging
import subprocess
import shutil
import pandas as pd
from multiprocessing import cpu_count
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-d', '--design',
help="The design file to run QC (tsv format).",
required=True)
args = parser.parse_args()
return args
def check_tools():
'''Checks for required componenets on user system.'''
logger.info('Checking for required libraries and components on this system')
deeptools_path = shutil.which("deeptools")
if deeptools_path:
logger.info('Found deeptools: %s', deeptools_path)
else:
logger.error('Missing deeptools')
raise Exception('Missing deeptools')
def generate_read_summary(design):
'''Generate summary of data based on read counts.'''
bam_files = ' '.join(design['bam_reads'])
labels = ' '.join(design['sample_id'])
mbs_filename = 'sample_mbs.npz'
mbs_command = (
"multiBamSummary bins -p %d --bamfiles %s --labels %s -out %s"
% (cpu_count(), bam_files, labels, mbs_filename)
)
logger.info("Running deeptools with %s", mbs_command)
read_summary = subprocess.Popen(mbs_command, shell=True)
out, err = read_summary.communicate()
return mbs_filename
def check_correlation(mbs):
'''Check Spearman pairwise correlation of samples based on read counts.'''
spearman_filename = 'heatmap_SpearmanCorr.png'
spearman_params = "--corMethod spearman --skipZero" + \
" --plotTitle \"Spearman Correlation of Read Counts\"" + \
" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers"
spearman_command = (
"plotCorrelation -in %s %s -o %s"
% (mbs, spearman_params, spearman_filename)
)
logger.info("Running deeptools with %s", spearman_command)
spearman_correlation = subprocess.Popen(spearman_command, shell=True)
out, err = spearman_correlation.communicate()
def check_coverage(design):
'''Asses the sequencing depth of samples.'''
bam_files = ' '.join(design['bam_reads'])
labels = ' '.join(design['sample_id'])
coverage_filename = 'coverage.png'
coverage_params = "-n 1000000 --plotTitle \"Sample Coverage\"" + \
" --ignoreDuplicates --minMappingQuality 10"
coverage_command = (
"plotCoverage -b %s --labels %s %s --plotFile %s"
% (bam_files, labels, coverage_params, coverage_filename)
)
logger.info("Running deeptools with %s", coverage_command)
coverage_summary = subprocess.Popen(coverage_command, shell=True)
out, err = coverage_summary.communicate()
def update_controls(design):
'''Update design file to append controls list.'''
logger.info("Running control file update.")
file_dict = design[['sample_id', 'bam_reads']] \
.set_index('sample_id').T.to_dict()
design['control_reads'] = design['control_id'] \
.apply(lambda x: file_dict[x]['bam_reads'])
logger.info("Removing rows that are there own control.")
design = design[design['control_id'] != design['sample_id']]
return design
def check_enrichment(sample_id, control_id, sample_reads, control_reads):
'''Asses the enrichment per sample.'''
fingerprint_filename = sample_id + '_fingerprint.png'
fingerprint_command = (
"plotFingerprint -b %s %s --labels %s %s --plotFile %s"
% (sample_reads, control_reads, sample_id, control_id, fingerprint_filename)
)
logger.info("Running deeptools with %s", fingerprint_command)
fingerprint_summary = subprocess.Popen(fingerprint_command, shell=True)
out, err = fingerprint_summary.communicate()
def main():
args = get_args()
# Create a file handler
handler = logging.FileHandler('experiment_qc.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Read files
design_file = pd.read_csv(args.design, sep='\t')
# Run correlation
mbs_filename = generate_read_summary(design_file)
check_correlation(mbs_filename)
# Run coverage
check_coverage(design_file)
# Run enrichment
new_design = update_controls(design_file)
for index, row in new_design.iterrows():
check_enrichment(
row['sample_id'],
row['control_id'],
row['bam_reads'],
row['control_reads'])
if __name__ == '__main__':
main()
......@@ -154,12 +154,12 @@ def dedup_mapped(bam, bam_basename, paired):
# Markduplicates
dup_file_qc_filename = bam_basename + ".dup.qc"
tmp_dup_mark_filename = bam_basename + ".dupmark.bam"
sambamba_parms = "--hash-table-size=17592186044416" + \
sambamba_params = "--hash-table-size=17592186044416" + \
" --overflow-list-size=20000000 --io-buffer-size=256"
with open(dup_file_qc_filename, 'w') as fh:
sambamba_markdup_command = (
"sambamba markdup -t %d %s %s %s"
% (cpu_count(), sambamba_parms, bam, tmp_dup_mark_filename)
% (cpu_count(), sambamba_params, bam, tmp_dup_mark_filename)
)
logger.info(sambamba_markdup_command)
subprocess.check_call(
......
#!/usr/bin/python
# programmer : bbc
# usage:
import sys
import argparse as ap
import logging
import subprocess
import pandas as pd
from multiprocessing import Pool
logging.basicConfig(level=10)
def prepare_argparser():
description = "Make wig file for given bed using bam"
epilog = "For command line options of each command, type %(prog)% COMMAND -h"
argparser = ap.ArgumentParser(description=description, epilog = epilog)
argparser.add_argument("-i","--input",dest = "infile",type=str,required=True, help="input BAM file")
argparser.add_argument("-g","--genome",dest = "genome",type=str,required=True, help="genome", default="hg19")
#argparser.add_argument("-b","--bed",dest="bedfile",type=str,required=True, help = "Gene locus in bed format")
#argparser.add_argument("-s","--strandtype",dest="stranded",type=str,default="none", choices=["none","reverse","yes"])
#argparser.add_argument("-n","--name",dest="trackName",type=str,default="UserTrack",help = "track name for bedgraph header")
return(argparser)
def run_qc(files, controls, labels):
mbs_command = "multiBamSummary bins --bamfiles "+' '.join(files)+" -out sample_mbs.npz"
p = subprocess.Popen(mbs_command, shell=True)
#logging.debug(mbs_command)
p.communicate()
pcor_command = "plotCorrelation -in sample_mbs.npz --corMethod spearman --skipZeros --plotTitle \"Spearman Correlation of Read Counts\" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers -o experiment.deeptools.heatmap_spearmanCorr_readCounts_v2.png --labels "+" ".join(labels)
#logging.debug(pcor_command)
p = subprocess.Popen(pcor_command, shell=True)
p.communicate()
#plotCoverage
pcov_command = "plotCoverage -b "+" ".join(files)+" --plotFile experiment.deeptools_coverage.png -n 1000000 --plotTitle \"sample coverage\" --ignoreDuplicates --minMappingQuality 10"
p = subprocess.Popen(pcov_command, shell=True)
p.communicate()
#draw fingerprints plots
for treat,ctrl,name in zip(files,controls,labels):
fp_command = "plotFingerprint -b "+treat+" "+ctrl+" --labels "+name+" control --plotFile "+name+".deeptools_fingerprints.png"
p = subprocess.Popen(fp_command, shell=True)
p.communicate()
def bam2bw_wrapper(command):
p = subprocess.Popen(command, shell=True)
p.communicate()
def run_signal(files, labels, genome):
#compute matrix and draw profile and heatmap
gene_bed = genome+"/gene.bed"#"/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/genome/"+genome+"/gene.bed"
bw_commands = []
for f in files:
bw_commands.append("bamCoverage -bs 10 -b "+f+" -o "+f.replace("bam","bw"))
work_pool = Pool(min(len(files), 12))
work_pool.map(bam2bw_wrapper, bw_commands)
work_pool.close()
work_pool.join()
cm_command = "computeMatrix scale-regions -R "+gene_bed+" -a 3000 -b 3000 --regionBodyLength 5000 --skipZeros -S *.bw -o samples.deeptools_generegionscalematrix.gz"
p = subprocess.Popen(cm_command, shell=True)
p.communicate()
hm_command = "plotHeatmap -m samples.deeptools_generegionscalematrix.gz -out samples.deeptools_readsHeatmap.png"
p = subprocess.Popen(hm_command, shell=True)
p.communicate()
def run(dfile,genome):
#parse dfile, suppose data files are the same folder as design file
dfile = pd.read_csv(dfile)
#QC: multiBamSummary and plotCorrelation
run_qc(dfile['bamReads'], dfile['bamControl'], dfile['SampleID'])
#signal plots
run_signal(dfile['bamReads'],dfile['SampleID'],genome)
def main():
argparser = prepare_argparser()
args = argparser.parse_args()
run(args.infile, args.genome)
if __name__=="__main__":
main()
#!/usr/bin/env python3
import pytest
import os
import pandas as pd
from io import StringIO
import experiment_qc
test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../output/experimentQC/'
DESIGN_STRING = """sample_id\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\tbam_reads
A_1\tA\tLiver\tH3K27ac\tNone\t1\tB_1\tA_1.bam
A_2\tA\tLiver\tH3K27ac\tNone\t2\tB_2\tA_2.bam
B_1\tB\tLiver\tInput\tNone\t1\tB_1\tB_1.bam
B_2\tB\tLiver\tInput\tNone\t2\tB_2\tB_2.bam
"""
@pytest.fixture
def design_bam():
design_file = StringIO(DESIGN_STRING)
design_df = pd.read_csv(design_file, sep="\t")
return design_df
def test_check_update_controls(design_bam):
new_design = experiment_qc.update_controls(design_bam)
assert new_design.loc[0, 'control_reads'] == "B_1.bam"
def test_experiment_qc_singleend():
assert os.path.exists(os.path.join(test_output_path, 'sample_mbs.npz'))
assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.png'))
assert os.path.exists(os.path.join(test_output_path, 'coverage.png'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT_fingerprint.png'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI_fingerprint.png'))
def test_experiment_qc_pairedend():
# Do the same thing for paired end data
pass
......@@ -30,7 +30,7 @@ def steps_2(steps_1):
def test_run_one_step(steps_1, capsys):
check_output = 'ENCBS844FSC\tENCSR238SGC\tlimb\tH3K4me1\tNone\t1\tENCBS844FSC\tENCFF833BLU.fastq.gz'.encode('UTF-8')
check_output = 'ENCLB144FDT\tENCSR238SGC\tlimb\tH3K4me1\tNone\t1\tENCLB304SBJ\tENCFF833BLU.fastq.gz'.encode('UTF-8')
out, err = utils.run_pipe(steps_1)
output, errors = capsys.readouterr()
assert "first step shlex to stdout" in output
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment