Commit 3a9a868b authored by Venkat Malladi

Add in steps to remove dups and filter reads.

parent 4357eb15
1 merge request: !6 Resolve "Bam Stats and Filter"
Pipeline #1054 failed with stage in 54 minutes and 16 seconds
@@ -44,6 +44,9 @@ workflow_modules:
   - 'cutadapt/1.9.1'
   - 'fastqc/0.11.2'
   - 'bwa/intel/0.7.12'
+  - 'samtools/intel/1.3'
+  - 'sambamba/0.6.6'
+  - 'bedtools/2.26.0'
 # A list of parameters used by the workflow, defining how to present them,
 # options etc in the web interface. For each parameter:
......
@@ -15,6 +15,10 @@ process {
     module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.12', 'samtools/intel/1.3']
     cpus = 32
   }
+  $filterReads {
+    module = ['python/3.6.1-2-anaconda', 'samtools/intel/1.3', 'sambamba/0.6.6', 'bedtools/2.26.0']
+    cpus = 32
+  }
 }
 params {
......
@@ -114,7 +114,7 @@ process alignReads {
   output:
   set sampleId, file('*.bam'), biosample, factor, treatment, replicate, controlId into mappedReads
-  set file('*.srt.bam.flagstat.qc') into mappedReadsStats
+  file '*.srt.bam.flagstat.qc' into mappedReadsStats
   script:
@@ -130,3 +130,34 @@ process alignReads {
  }
}
// Dedup reads using sambamba
process filterReads {

  tag "$sampleId-$replicate"
  publishDir "$baseDir/output/${task.process}", mode: 'copy'

  input:

  set sampleId, file(mapped), biosample, factor, treatment, replicate, controlId from mappedReads

  output:

  set sampleId, file('*.bam'), biosample, factor, treatment, replicate, controlId into dedupReads
  set file('*flagstat.qc') into dedupReadsStats
  set file('*pbc.qc') into dedupReadsComplexity

  script:

  if (pairedEnd) {
    """
    python3 $baseDir/scripts/map_qc.py -b $mapped -p
    """
  }
  else {
    """
    python3 $baseDir/scripts/map_qc.py -b $mapped
    """
  }

}
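
The map_qc.py script the process calls is shown below. It can also be exercised outside Nextflow; a hypothetical direct invocation on a coordinate-sorted paired-end BAM (the sample name is illustrative):

python3 scripts/map_qc.py -b sample1.srt.bam -p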
#!/usr/bin/env python3
'''Remove duplicates and filter unmapped reads.'''
import os
import subprocess
import argparse
import shutil
import shlex
import logging
from multiprocessing import cpu_count
import utils
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.bam', '.srt']
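# For illustration only: utils.strip_extensions (from this repo's utils
# module) is assumed to strip matching extensions from the right inward,
# e.g. strip_extensions('sample1.srt.bam', STRIP_EXTENSIONS) -> 'sample1'
# ('.bam' is removed first, then '.srt').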
def get_args():
    '''Define arguments.'''

    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('-b', '--bam',
                        help="The bam file to run filtering and qc on.",
                        required=True)

    parser.add_argument('-p', '--paired',
                        help="True/False if paired-end or single-end.",
                        default=False,
                        action='store_true')

    args = parser.parse_args()
    return args
# Functions
def check_tools():
    '''Checks for required components on user system.'''

    logger.info('Checking for required libraries and components on this system')

    samtools_path = shutil.which("samtools")
    if samtools_path:
        logger.info('Found samtools: %s', samtools_path)
    else:
        logger.error('Missing samtools')
        raise Exception('Missing samtools')

    sambamba_path = shutil.which("sambamba")
    if sambamba_path:
        logger.info('Found sambamba: %s', sambamba_path)
    else:
        logger.error('Missing sambamba')
        raise Exception('Missing sambamba')

    bedtools_path = shutil.which("bedtools")
    if bedtools_path:
        logger.info('Found bedtools: %s', bedtools_path)
    else:
        logger.error('Missing bedtools')
        raise Exception('Missing bedtools')
def filter_mapped_pe(bam, bam_basename):
    '''Use samtools to filter unmapped reads for PE data.'''

    filt_bam_prefix = bam_basename + ".filt.srt"
    filt_bam_filename = filt_bam_prefix + ".bam"
    tmp_filt_bam_prefix = "tmp.%s" % (filt_bam_prefix)
    tmp_filt_bam_filename = tmp_filt_bam_prefix + ".bam"

    # Remove unmapped reads and mates, non-primary alignments,
    # and reads failing platform QC
    # Remove low-MAPQ reads
    # Only keep properly paired reads
    # Obtain name-sorted BAM file
    out, err = utils.run_pipe([
        # filter: -F 1804 FLAG bits to exclude; -f 2 FLAG bits to require;
        # -q 30 exclude MAPQ < 30; -u uncompressed output
        # exclude FLAG 1804: unmapped, next segment unmapped, secondary
        # alignments, not passing platform QC, PCR or optical duplicates
        # require FLAG 2: properly aligned
        "samtools view -F 1804 -f 2 -q 30 -u %s" % (bam),
        # sort: -n sort by name; reads from stdin;
        # -o writes to the specified filename
        # Will produce a name-sorted BAM
        "samtools sort -n -@%d -o %s" % (cpu_count(), tmp_filt_bam_filename)])
    if err:
        logger.error("samtools filter error: %s" % (err))

    # Remove orphan reads (pair was removed)
    # and read pairs mapping to different chromosomes
    # Obtain position-sorted BAM
    out, err = utils.run_pipe([
        # fill in mate coordinates, ISIZE and mate-related flags
        # fixmate requires name-sorted alignment; -r removes secondary and
        # unmapped reads (redundant here because already done above?)
        # - sends output to stdout
        "samtools fixmate -r %s -" % (tmp_filt_bam_filename),
        # repeat filtering after mate repair
        "samtools view -F 1804 -f 2 -u -",
        # produce the coordinate-sorted BAM
        "samtools sort -@%d -o %s" % (cpu_count(), filt_bam_filename)])

    os.remove(tmp_filt_bam_filename)
    return filt_bam_filename
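# For reference, the -F 1804 exclusion mask used above decomposes as
# 4 (read unmapped) + 8 (mate unmapped) + 256 (not primary alignment)
# + 512 (fails platform/vendor QC) + 1024 (PCR/optical duplicate) = 1804.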
def filter_mapped_se(bam, bam_basename):
    '''Use samtools to filter unmapped reads for SE data.'''

    filt_bam_prefix = bam_basename + ".filt.srt"
    filt_bam_filename = filt_bam_prefix + ".bam"

    # Remove unmapped reads, non-primary alignments,
    # and reads failing platform QC
    # Remove low-MAPQ reads
    # Output keeps the input's (coordinate) sort order
    with open(filt_bam_filename, 'w') as fh:
        samtools_filter_command = (
            "samtools view -F 1804 -q 30 -b %s"
            % (bam)
            )
        logger.info(samtools_filter_command)
        subprocess.check_call(
            shlex.split(samtools_filter_command),
            stdout=fh)

    return filt_bam_filename
def dedup_mapped(bam, bam_basename, paired):
    '''Use sambamba and samtools to remove duplicates.'''

    # Mark duplicates
    dup_file_qc_filename = bam_basename + ".dup.qc"
    tmp_dup_mark_filename = bam_basename + ".dupmark.bam"
    sambamba_parms = "--hash-table-size=17592186044416 " \
        "--overflow-list-size=20000000 --io-buffer-size=256"
    with open(dup_file_qc_filename, 'w') as fh:
        sambamba_markdup_command = (
            "sambamba markdup -t %d %s %s %s"
            % (cpu_count(), sambamba_parms, bam, tmp_dup_mark_filename)
            )
        logger.info(sambamba_markdup_command)
        subprocess.check_call(
            shlex.split(sambamba_markdup_command),
            stderr=fh)  # sambamba prints its duplication report on stderr
    os.remove(tmp_dup_mark_filename + '.bai')

    # Remove duplicates
    final_bam_prefix = bam_basename + ".filt.nodup"
    final_bam_filename = final_bam_prefix + ".bam"

    if paired:  # paired-end data
        samtools_dedupe_command = \
            "samtools view -F 1804 -f 2 -b %s" % (tmp_dup_mark_filename)
    else:
        samtools_dedupe_command = \
            "samtools view -F 1804 -b %s" % (tmp_dup_mark_filename)

    with open(final_bam_filename, 'w') as fh:
        logger.info(samtools_dedupe_command)
        subprocess.check_call(
            shlex.split(samtools_dedupe_command),
            stdout=fh)

    # Index final bam file
    sambamba_index_command = \
        "sambamba index -t %d %s" % (cpu_count(), final_bam_filename)
    logger.info(sambamba_index_command)
    subprocess.check_output(shlex.split(sambamba_index_command))

    # Generate mapping statistics
    final_bam_file_mapstats_filename = final_bam_prefix + ".flagstat.qc"
    with open(final_bam_file_mapstats_filename, 'w') as fh:
        flagstat_command = "sambamba flagstat -t %d %s" % (
            cpu_count(), final_bam_filename)
        logger.info(flagstat_command)
        subprocess.check_call(shlex.split(flagstat_command), stdout=fh)

    os.remove(bam)
    os.remove(tmp_dup_mark_filename)
    return final_bam_filename
def compute_complexity(bam, paired, bam_basename):
    '''Calculate library complexity.'''

    pbc_file_qc_filename = bam_basename + ".filt.nodup.pbc.qc"

    # Sort by name
    # Convert to bedPE and obtain fragment coordinates
    # Sort by position and strand
    # Obtain unique count statistics
    # PBC file output columns:
    # TotalReadPairs [tab]
    # DistinctReadPairs [tab]
    # OneReadPair [tab]
    # TwoReadPairs [tab]
    # NRF=Distinct/Total [tab]
    # PBC1=OnePair/Distinct [tab]
    # PBC2=OnePair/TwoPair
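    # Worked example with hypothetical counts: mt=100 total read pairs,
    # m0=80 distinct positions, m1=60 seen once, m2=15 seen twice gives
    # NRF = 80/100 = 0.80, PBC1 = 60/80 = 0.75, PBC2 = 60/15 = 4.0.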
    if paired:
        steps = [
            # -o /dev/stdout keeps the sorted output on the pipe
            # (sambamba sort otherwise writes to a file)
            "sambamba sort -t %d -n %s -o /dev/stdout" % (cpu_count(), bam),
            "bamToBed -bedpe -i stdin",
            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'"""]
    else:
        steps = [
            "bamToBed -i %s" % (bam),
            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'"""]
    steps.extend([
        "grep -v 'chrM'",
        "sort",
        "uniq -c",
        r"""awk 'BEGIN{mt=0;m0=0;m1=0;m2=0} ($1==1){m1=m1+1} ($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} END{printf "%d\t%d\t%d\t%d\t%f\t%f\t%f\n",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}'"""
        ])

    out, err = utils.run_pipe(steps, pbc_file_qc_filename)
    if err:
        logger.error("PBC file error: %s" % (err))
def main():
    args = get_args()
    paired = args.paired
    bam = args.bam

    # Create a file handler
    handler = logging.FileHandler('filter_qc.log')
    logger.addHandler(handler)

    # Check if tools are present
    check_tools()

    # Run filtering for either PE or SE
    bam_basename = os.path.basename(
        utils.strip_extensions(bam, STRIP_EXTENSIONS))
    if paired:  # paired-end data
        filter_bam_filename = filter_mapped_pe(bam, bam_basename)
    else:
        filter_bam_filename = filter_mapped_se(bam, bam_basename)

    # Remove duplicates
    dedup_bam_filename = dedup_mapped(filter_bam_filename, bam_basename, paired)

    # Compute library complexity
    compute_complexity(dedup_bam_filename, paired, bam_basename)


if __name__ == '__main__':
    main()
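
For a hypothetical paired-end input named sample1.srt.bam, the run leaves sample1.filt.nodup.bam (the filtered, deduplicated alignment) plus the QC files sample1.dup.qc, sample1.filt.nodup.flagstat.qc, and sample1.filt.nodup.pbc.qc, which is what the filterReads process above collects into its output channels.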