diff --git a/astrocyte_pkg.yml b/astrocyte_pkg.yml
index f8b8dcf0bd0e0684ac9042aeeff55e5f2c7e628a..51b4b74b8894dcfd6949ec24a50a88b8e4a0df84 100644
--- a/astrocyte_pkg.yml
+++ b/astrocyte_pkg.yml
@@ -44,6 +44,9 @@ workflow_modules:
   - 'cutadapt/1.9.1'
   - 'fastqc/0.11.2'
   - 'bwa/intel/0.7.12'
+  - 'samtools/intel/1.3'
+  - 'sambamba/0.6.6'
+  - 'bedtools/2.26.0'
 
 # A list of parameters used by the workflow, defining how to present them,
 # options etc in the web interface. For each parameter:
diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config
index c92f9f5ecb8b174f5a6db24c177638466451c90d..ab289999da73dafcaee5173d057d06a58fe16296 100644
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -15,6 +15,10 @@ process {
     module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.12', 'samtools/intel/1.3']
     cpus = 32
   }
+  $filterReads{
+    module = ['python/3.6.1-2-anaconda', 'samtools/intel/1.3', 'sambamba/0.6.6', 'bedtools/2.26.0']
+    cpus = 32
+  }
 }
 
 params {
diff --git a/workflow/main.nf b/workflow/main.nf
index 90763c5aa18db0c0417bcc83c6002c11b5b15946..40fdf1c6b8013184ea78a56fc0b7737ad0d8ce32 100644
--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -114,7 +114,7 @@ process alignReads {
   output:
 
   set sampleId, file('*.bam'), biosample, factor, treatment, replicate, controlId into mappedReads
-  set file('*.srt.bam.flagstat.qc')
+  file '*.srt.bam.flagstat.qc' into mappedReadsStats
 
   script:
 
@@ -130,3 +130,34 @@ process alignReads {
   }
 
 }
+
+// Dedup reads using sambamba
+process filterReads {
+
+  tag "$sampleId-$replicate"
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+
+  input:
+
+  set sampleId, mapped, biosample, factor, treatment, replicate, controlId from mappedReads
+
+  output:
+
+  set sampleId, file('*.bam'), biosample, factor, treatment, replicate, controlId into dedupReads
+  set file('*flagstat.qc') into dedupReadsStats
+  set file('*pbc.qc') into dedupReadsComplexity
+
+  script:
+
+  if (pairedEnd) {
+    """
+    python3 $baseDir/scripts/map_qc.py -b $mapped -p
+    """
+  }
+  else {
+    """
+    python3 $baseDir/scripts/map_qc.py -b $mapped
+    """
+  }
+
+}
diff --git a/workflow/scripts/map_qc.py b/workflow/scripts/map_qc.py
new file mode 100644
index 0000000000000000000000000000000000000000..20f02aecc7e5354061a29dc931129fb1fbc4829b
--- /dev/null
+++ b/workflow/scripts/map_qc.py
@@ -0,0 +1,275 @@
+#!/usr/bin/env python3
+
+'''Remove duplicates and filter unmapped reads.'''
+
+import os
+import subprocess
+import argparse
+import shutil
+import shlex
+import logging
+from multiprocessing import cpu_count
+import json
+import utils
+
+EPILOG = '''
+For more details:
+        %(prog)s --help
+'''
+
+# SETTINGS
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+logger.propagate = False
+logger.setLevel(logging.INFO)
+
+
+# The order of this list is important.
+# strip_extensions strips from the right inward, so
+# the expected right-most extensions should appear first (like .gz)
+# Modified from J. Seth Strattan
+STRIP_EXTENSIONS = ['.bam', '.srt']
+
+
+def get_args():
+    '''Define and parse command-line arguments.'''
+    parser = argparse.ArgumentParser(
+        description=__doc__, epilog=EPILOG,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+
+    parser.add_argument('-b', '--bam',
+                        help="The bam file to run filtering and qc on.",
+                        required=True)
+
+    parser.add_argument('-p', '--paired',
+                        help="True/False if paired-end or single end.",
+                        default=False,
+                        action='store_true')
+
+    args = parser.parse_args()
+    return args
+
+# Functions
+
+
+def check_tools():
+    '''Check for required components (samtools, sambamba, bedtools) on PATH.'''
+
+    logger.info('Checking for required libraries and components on this system')
+
+    samtools_path = shutil.which("samtools")
+    if samtools_path:
+        logger.info('Found samtools: %s', samtools_path)
+    else:
+        logger.error('Missing samtools')
+        raise Exception('Missing samtools')
+
+    sambamba_path = shutil.which("sambamba")
+    if sambamba_path:
+        logger.info('Found sambamba: %s', sambamba_path)
+    else:
+        logger.error('Missing sambamba')
+        raise Exception('Missing sambamba')
+
+    bedtools_path = shutil.which("bedtools")
+    if bedtools_path:
+        logger.info('Found bedtools: %s', bedtools_path)
+    else:
+        logger.error('Missing bedtools')
+        raise Exception('Missing bedtools')
+
+
+def filter_mapped_pe(bam, bam_basename):
+    '''Use samtools to filter unmapped reads for PE data.
+
+    Returns the filename of the filtered, coordinate-sorted BAM.
+    '''
+
+    filt_bam_prefix = bam_basename + ".filt.srt"
+    filt_bam_filename = filt_bam_prefix + ".bam"
+    tmp_filt_bam_prefix = "tmp.%s" % (filt_bam_prefix)
+    tmp_filt_bam_filename = tmp_filt_bam_prefix + ".bam"
+
+    # Remove unmapped, mate unmapped,
+    # not primary alignment, reads failing platform QC.
+    # Remove low MAPQ reads.
+    # Only keep properly paired reads.
+    # Obtain name-sorted BAM file.
+    out, err = utils.run_pipe([
+        # filter: -F 1804 FLAG bits to exclude; -f 2 FLAG bits to require;
+        # -q 30 exclude MAPQ < 30; -u uncompressed output
+        # exclude FLAG 1804: unmapped, next segment unmapped, secondary
+        # alignments, not passing platform q, PCR or optical duplicates
+        # require FLAG 2: properly aligned
+        "samtools view -F 1804 -f 2 -q 30 -u %s" % (bam),
+        # sort: -n sort by name; take input from stdin;
+        # out to specified filename
+        # Will produce name sorted BAM
+        "samtools sort -n -@%d -o %s" % (cpu_count(), tmp_filt_bam_filename)])
+    if err:
+        logger.error("samtools filter error: %s", err)
+
+    # Remove orphan reads (pair was removed)
+    # and read pairs mapping to different chromosomes.
+    # Obtain position-sorted BAM.
+    out, err = utils.run_pipe([
+        # fill in mate coordinates, ISIZE and mate-related flags
+        # fixmate requires name-sorted alignment; -r removes secondary and
+        # unmapped (redundant here because already done above?)
+        # - send output to stdout
+        "samtools fixmate -r %s -" % (tmp_filt_bam_filename),
+        # repeat filtering after mate repair
+        "samtools view -F 1804 -f 2 -u -",
+        # produce the coordinate-sorted BAM
+        "samtools sort -@%d -o %s" % (cpu_count(), filt_bam_filename)])
+    if err:
+        logger.error("samtools fixmate error: %s", err)
+
+    os.remove(tmp_filt_bam_filename)
+    return filt_bam_filename
+
+
+def filter_mapped_se(bam, bam_basename):
+    '''Use samtools to filter unmapped reads for SE data.
+
+    Returns the filename of the filtered BAM.
+    '''
+
+    filt_bam_prefix = bam_basename + ".filt.srt"
+    filt_bam_filename = filt_bam_prefix + ".bam"
+
+    # Remove unmapped,
+    # not primary alignment, reads failing platform QC.
+    # Remove low MAPQ reads.
+    with open(filt_bam_filename, 'w') as fh:
+        samtools_filter_command = (
+            "samtools view -F 1804 -q 30 -b %s"
+            % (bam)
+        )
+        logger.info(samtools_filter_command)
+        subprocess.check_call(
+            shlex.split(samtools_filter_command),
+            stdout=fh)
+
+    return filt_bam_filename
+
+
+def dedup_mapped(bam, bam_basename, paired):
+    '''Use sambamba and samtools to remove duplicates.
+
+    Marks duplicates with sambamba, drops them with samtools -F 1804,
+    indexes the final BAM and writes flagstat QC. Returns the final
+    deduplicated BAM filename.
+    '''
+
+    # Mark duplicates
+    dup_file_qc_filename = bam_basename + ".dup.qc"
+    tmp_dup_mark_filename = bam_basename + ".dupmark.bam"
+    sambamba_params = "--hash-table-size=17592186044416 " \
+        "--overflow-list-size=20000000 --io-buffer-size=256"
+    with open(dup_file_qc_filename, 'w') as fh:
+        sambamba_markdup_command = (
+            "sambamba markdup -t %d %s %s %s"
+            % (cpu_count(), sambamba_params, bam, tmp_dup_mark_filename)
+        )
+        logger.info(sambamba_markdup_command)
+        subprocess.check_call(
+            shlex.split(sambamba_markdup_command),
+            stdout=fh)
+
+    # markdup writes its own index; the final BAM is re-indexed below
+    os.remove(tmp_dup_mark_filename + '.bai')
+
+    # Remove duplicates (FLAG 1024 is part of the 1804 exclusion mask)
+    final_bam_prefix = bam_basename + ".filt.nodup"
+    final_bam_filename = final_bam_prefix + ".bam"
+
+    if paired:  # paired-end data: additionally require properly paired
+        samtools_dedupe_command = \
+            "samtools view -F 1804 -f 2 -b %s" % (tmp_dup_mark_filename)
+    else:
+        samtools_dedupe_command = \
+            "samtools view -F 1804 -b %s" % (tmp_dup_mark_filename)
+
+    with open(final_bam_filename, 'w') as fh:
+        logger.info(samtools_dedupe_command)
+        subprocess.check_call(
+            shlex.split(samtools_dedupe_command),
+            stdout=fh)
+
+    # Index final bam file
+    sambamba_index_command = \
+        "sambamba index -t %d %s" % (cpu_count(), final_bam_filename)
+    logger.info(sambamba_index_command)
+    subprocess.check_output(shlex.split(sambamba_index_command))
+
+    # Generate mapping statistics
+    final_bam_file_mapstats_filename = final_bam_prefix + ".flagstat.qc"
+    with open(final_bam_file_mapstats_filename, 'w') as fh:
+        flagstat_command = (
+            "sambamba flagstat -t %d %s"
+            % (cpu_count(), final_bam_filename))
+        logger.info(flagstat_command)
+        subprocess.check_call(shlex.split(flagstat_command), stdout=fh)
+
+    os.remove(tmp_dup_mark_filename)
+    return final_bam_filename
+
+
+def compute_complexity(bam, paired, bam_basename):
+    '''Calculate library complexity (PBC/NRF metrics) for the dedup BAM.'''
+
+    pbc_file_qc_filename = bam_basename + ".filt.nodup.pbc.qc"
+
+    # Sort by name,
+    # convert to bedPE and obtain fragment coordinates,
+    # sort by position and strand,
+    # obtain unique count statistics.
+
+    # PBC File output columns:
+    # TotalReadPairs [tab]
+    # DistinctReadPairs [tab]
+    # OneReadPair [tab]
+    # TwoReadPairs [tab]
+    # NRF=Distinct/Total [tab]
+    # PBC1=OnePair/Distinct [tab]
+    # PBC2=OnePair/TwoPair
+
+    if paired:
+        steps = [
+            "sambamba sort -t %d -n %s" % (cpu_count(), bam),
+            "bamToBed -bedpe -i stdin",
+            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'"""]
+    else:
+        steps = [
+            "bamToBed -i %s" % (bam),
+            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'"""]
+    steps.extend([
+        "grep -v 'chrM'",
+        "sort",
+        "uniq -c",
+        r"""awk 'BEGIN{mt=0;m0=0;m1=0;m2=0} ($1==1){m1=m1+1} ($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} END{printf "%d\t%d\t%d\t%d\t%f\t%f\t%f\n",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}'"""
+    ])
+    out, err = utils.run_pipe(steps, pbc_file_qc_filename)
+    if err:
+        logger.error("PBC file error: %s", err)
+
+
+def main():
+    args = get_args()
+    paired = args.paired
+    bam = args.bam
+
+    # Create a file handler
+    handler = logging.FileHandler('filter_qc.log')
+    logger.addHandler(handler)
+
+    # Check if tools are present
+    check_tools()
+
+    # Run filtering for either PE or SE
+    bam_basename = os.path.basename(
+        utils.strip_extensions(bam, STRIP_EXTENSIONS))
+    if paired:  # paired-end data
+        filter_bam_filename = filter_mapped_pe(bam, bam_basename)
+    else:
+        filter_bam_filename = filter_mapped_se(bam, bam_basename)
+
+    # Remove duplicates
+    dedup_bam_filename = dedup_mapped(filter_bam_filename, bam_basename, paired)
+
+    # Compute library complexity
+    compute_complexity(dedup_bam_filename, paired, bam_basename)
+
+
+if __name__ == '__main__':
+    main()