Skip to content
Snippets Groups Projects
Commit 3a9a868b authored by Venkat Malladi's avatar Venkat Malladi
Browse files

Add in steps to remove dups and filter reads.

parent 4357eb15
Branches
Tags
No related merge requests found
......@@ -44,6 +44,9 @@ workflow_modules:
- 'cutadapt/1.9.1'
- 'fastqc/0.11.2'
- 'bwa/intel/0.7.12'
- 'samtools/intel/1.3'
- 'sambamba/0.6.6'
- 'bedtools/2.26.0'
# A list of parameters used by the workflow, defining how to present them,
# options etc in the web interface. For each parameter:
......
......@@ -15,6 +15,10 @@ process {
module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.12', 'samtools/intel/1.3']
cpus = 32
}
$filterReads{
module = ['python/3.6.1-2-anaconda', 'samtools/intel/1.3', 'sambamba/0.6.6', 'bedtools/2.26.0']
cpus = 32
}
}
params {
......
......@@ -114,7 +114,7 @@ process alignReads {
output:
set sampleId, file('*.bam'), biosample, factor, treatment, replicate, controlId into mappedReads
set file('*.srt.bam.flagstat.qc')
file '*.srt.bam.flagstat.qc' into mappedReadsStats
script:
......@@ -130,3 +130,34 @@ process alignReads {
}
}
// Dedup reads using sambamba and filter unmapped/low-quality reads.
// Emits the filtered BAM plus flagstat and library-complexity (PBC) QC files.
process filterReads {
  tag "$sampleId-$replicate"
  publishDir "$baseDir/output/${task.process}", mode: 'copy'

  input:
  set sampleId, mapped, biosample, factor, treatment, replicate, controlId from mappedReads

  output:
  set sampleId, file('*.bam'), biosample, factor, treatment, replicate, controlId into dedupReads
  set file('*flagstat.qc') into dedupReadsStats
  set file('*pbc.qc') into dedupReadsComplexity

  script:
  // map_qc.py declares -b/--bam for the input file and -p for paired-end
  if (pairedEnd) {
    """
    python3 $baseDir/scripts/map_qc.py -b $mapped -p
    """
  }
  else {
    """
    python3 $baseDir/scripts/map_qc.py -b $mapped
    """
  }
}
#!/usr/bin/env python3
'''Remove duplicates and filter unmapped reads.'''
import os
import subprocess
import argparse
import shutil
import shlex
import logging
from multiprocessing import cpu_count
import json
import utils
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.bam', '.srt']
def get_args():
    '''Parse and return the command-line arguments.'''
    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # Input BAM to filter and QC.
    parser.add_argument('-b', '--bam',
                        required=True,
                        help="The bam file to run filtering and qc on.")

    # Flag: set when the library is paired-end.
    parser.add_argument('-p', '--paired',
                        default=False,
                        action='store_true',
                        help="True/False if paired-end or single end.")

    return parser.parse_args()
# Functions
def check_tools():
    '''Check for required components on the user's system.

    Raises:
        Exception: if samtools, sambamba, or bedtools is not on PATH.
    '''
    logger.info('Checking for required libraries and components on this system')

    samtools_path = shutil.which("samtools")
    if samtools_path:
        logger.info('Found samtools: %s', samtools_path)
    else:
        logger.error('Missing samtools')
        raise Exception('Missing samtools')

    # Original code tested the undefined name `bwa_path` here (NameError).
    sambamba_path = shutil.which("sambamba")
    if sambamba_path:
        logger.info('Found sambamba: %s', sambamba_path)
    else:
        logger.error('Missing sambamba')
        raise Exception('Missing sambamba')

    # Original code tested `bwa_path` and reported "sambamba" for bedtools.
    bedtools_path = shutil.which("bedtools")
    if bedtools_path:
        logger.info('Found bedtools: %s', bedtools_path)
    else:
        logger.error('Missing bedtools')
        raise Exception('Missing bedtools')
def filter_mapped_pe(bam, bam_basename):
    '''Use samtools to filter unmapped reads for PE data.

    Produces a coordinate-sorted, quality-filtered BAM named
    "<bam_basename>.filt.srt.bam" and returns its filename. A temporary
    name-sorted BAM is created and removed along the way.
    '''
    filt_bam_prefix = bam_basename + ".filt.srt"
    filt_bam_filename = filt_bam_prefix + ".bam"
    tmp_filt_bam_prefix = "tmp.%s" % (filt_bam_prefix)
    tmp_filt_bam_filename = tmp_filt_bam_prefix + ".bam"

    # Remove unmapped, mate unmapped
    # not primary alignment, reads failing platform
    # Remove low MAPQ reads
    # Only keep properly paired reads
    # Obtain name sorted BAM file
    # NOTE: was common.run_pipe, but only `utils` is imported by this module.
    out, err = utils.run_pipe([
        # filter: -F 1804 FLAG bits to exclude; -f 2 FLAG bits to require;
        # -q 30 exclude MAPQ < 30; -u uncompressed output
        # exclude FLAG 1804: unmapped, next segment unmapped, secondary
        # alignments, not passing platform q, PCR or optical duplicates
        # require FLAG 2: properly aligned
        "samtools view -F 1804 -f 2 -q 30 -u %s" % (bam),
        # sort: -n sort by name; - take input from stdin;
        # out to specified filename
        # Will produce name sorted BAM
        "samtools sort -n -@%d -o %s" % (cpu_count(), tmp_filt_bam_filename)])
    if err:
        logger.error("samtools filter error: %s" % (err))

    # Remove orphan reads (pair was removed)
    # and read pairs mapping to different chromosomes
    # Obtain position sorted BAM
    out, err = utils.run_pipe([
        # fill in mate coordinates, ISIZE and mate-related flags
        # fixmate requires name-sorted alignment; -r removes secondary and
        # unmapped (redundant here because already done above?)
        # - send output to stdout
        "samtools fixmate -r %s -" % (tmp_filt_bam_filename),
        # repeat filtering after mate repair
        "samtools view -F 1804 -f 2 -u -",
        # produce the coordinate-sorted BAM
        "samtools sort -@%d -o %s" % (cpu_count(), filt_bam_filename)])

    # Temporary name-sorted BAM is no longer needed.
    os.remove(tmp_filt_bam_filename)
    return filt_bam_filename
def filter_mapped_se(bam, bam_basename):
    '''Use samtools to filter unmapped reads for SE data.

    Produces a filtered BAM named "<bam_basename>.filt.srt.bam" and
    returns its filename.
    '''
    filt_bam_prefix = bam_basename + ".filt.srt"
    filt_bam_filename = filt_bam_prefix + ".bam"

    # Remove unmapped, mate unmapped
    # not primary alignment, reads failing platform
    # Remove low MAPQ reads
    # Obtain name sorted BAM file
    with open(filt_bam_filename, 'w') as fh:
        # -F 1804: exclude unmapped/secondary/failing-QC/duplicate reads;
        # -q 30: exclude MAPQ < 30; -b: BAM output (written to fh).
        # NOTE: original formatted with two args (one of them undefined)
        # into a single %s; the only input here is `bam`.
        samtools_filter_command = (
            "samtools view -F 1804 -q 30 -b %s"
            % (bam)
        )
        logger.info(samtools_filter_command)
        subprocess.check_call(
            shlex.split(samtools_filter_command),
            stdout=fh)
    return filt_bam_filename
def dedup_mapped(bam, bam_basename, paired):
    '''Use sambamba and samtools to remove duplicates.

    Marks duplicates with sambamba, removes them (and remaining
    low-quality reads) with samtools, indexes the final BAM and writes
    flagstat QC. Returns the final BAM filename. The input `bam` and the
    intermediate dup-marked BAM are deleted.
    '''
    # Mark duplicates; the markdup report goes to the .dup.qc file.
    dup_file_qc_filename = bam_basename + ".dup.qc"
    tmp_dup_mark_filename = bam_basename + ".dupmark.bam"
    sambamba_parms = ("--hash-table-size=17592186044416 "
                      "--overflow-list-size=20000000 --io-buffer-size=256")
    with open(dup_file_qc_filename, 'w') as fh:
        sambamba_markdup_command = (
            "sambamba markdup -t %d %s %s %s"
            # was: cpu_count (uncalled function object) -> TypeError in %d
            % (cpu_count(), sambamba_parms, bam, tmp_dup_mark_filename)
        )
        logger.info(sambamba_markdup_command)
        subprocess.check_call(
            # was: shlex.split(sambamba_markdup) -> undefined name
            shlex.split(sambamba_markdup_command),
            stdout=fh)
    # sambamba markdup auto-creates an index for the temp file; drop it.
    os.remove(tmp_dup_mark_filename + '.bai')

    # Remove duplicates (and, for PE, improperly paired reads) from the
    # dup-marked BAM. was: referenced undefined `filt_bam_filename`.
    final_bam_prefix = bam_basename + ".filt.nodup"
    final_bam_filename = final_bam_prefix + ".bam"
    if paired:  # paired-end data
        samtools_dedupe_command = \
            "samtools view -F 1804 -f 2 -b %s" % (tmp_dup_mark_filename)
    else:
        samtools_dedupe_command = \
            "samtools view -F 1804 -b %s" % (tmp_dup_mark_filename)
    with open(final_bam_filename, 'w') as fh:
        logger.info(samtools_dedupe_command)
        subprocess.check_call(
            shlex.split(samtools_dedupe_command),
            stdout=fh)

    # Index final bam file.
    # was: "samtools index -t" -- samtools 1.3 index has no -t option;
    # the threaded indexer here is sambamba.
    sambamba_index_command = \
        "sambamba index -t %d %s" % (cpu_count(), final_bam_filename)
    logger.info(sambamba_index_command)
    subprocess.check_output(shlex.split(sambamba_index_command))

    # Generate mapping statistics.
    final_bam_file_mapstats_filename = final_bam_prefix + ".flagstat.qc"
    with open(final_bam_file_mapstats_filename, 'w') as fh:
        # was: assignment split across two lines without continuation
        # (SyntaxError)
        flagstat_command = (
            "sambamba flagstat -t %d %s"
            % (cpu_count(), final_bam_filename))
        logger.info(flagstat_command)
        subprocess.check_call(shlex.split(flagstat_command), stdout=fh)

    # Clean up intermediates: the filtered input BAM and the dup-marked BAM.
    # was: os.remove(filt_bam_filename) -> undefined name
    os.remove(bam)
    os.remove(tmp_dup_mark_filename)
    return final_bam_filename
def compute_complexity(bam, paired, bam_basename):
    '''Calculate library complexity.

    Writes PBC metrics to "<bam_basename>filt.nodup.pbc.qc".
    '''
    pbc_file_qc_filename = bam_basename + "filt.nodup.pbc.qc"

    # Sort by name
    # convert to bedPE and obtain fragment coordinates
    # sort by position and strand
    # Obtain unique count statistics

    # PBC File output columns:
    # TotalReadPairs [tab]
    # DistinctReadPairs [tab]
    # OneReadPair [tab]
    # TwoReadPairs [tab]
    # NRF=Distinct/Total [tab]
    # PBC1=OnePair/Distinct [tab]
    # PBC2=OnePair/TwoPair
    if paired:
        # NOTE(review): sambamba sort writes to a file by default; verify
        # that run_pipe receives the sorted records on stdout as intended.
        steps = [
            "sambamba sort -t %d -n %s" % (cpu_count(), bam),
            "bamToBed -bedpe -i stdin",
            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'"""]
    else:
        steps = [
            "bamToBed -i %s" % (bam),
            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'"""]
    steps.extend([
        # Drop mitochondrial fragments, then count occurrences per
        # distinct fragment location and compute the PBC ratios.
        "grep -v 'chrM'",
        "sort",
        "uniq -c",
        r"""awk 'BEGIN{mt=0;m0=0;m1=0;m2=0} ($1==1){m1=m1+1} ($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} END{printf "%d\t%d\t%d\t%d\t%f\t%f\t%f\n",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}'"""
    ])
    # NOTE: was common.run_pipe, but only `utils` is imported by this module.
    out, err = utils.run_pipe(steps, pbc_file_qc_filename)
    if err:
        logger.error("PBC file error: %s" % (err))
def main():
    '''Entry point: filter, deduplicate and QC the input BAM.'''
    args = get_args()

    # Send this module's log output to a file for the pipeline to collect.
    logger.addHandler(logging.FileHandler('filter_qc.log'))

    # Fail fast if any required external tool is absent.
    check_tools()

    # Derive the output basename by stripping known BAM extensions.
    stripped = utils.strip_extensions(args.bam, STRIP_EXTENSIONS)
    bam_basename = os.path.basename(stripped)

    # Filter unmapped/low-quality reads, PE and SE paths differ.
    if args.paired:
        filter_bam_filename = filter_mapped_pe(args.bam, bam_basename)
    else:
        filter_bam_filename = filter_mapped_se(args.bam, bam_basename)

    # Remove duplicates, then compute library-complexity metrics.
    dedup_bam_filename = dedup_mapped(
        filter_bam_filename, bam_basename, args.paired)
    compute_complexity(dedup_bam_filename, args.paired, bam_basename)
# Script entry point.
if __name__ == '__main__':
    main()
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment