diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index acb133b5be9f6f5e79c352f6704633fbe716d907..bc3e9e879c8d95e072d2918d9b04e69c69601852 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -6,5 +6,5 @@ before_script: test: script: - - nextflow run workflow/main.nf + - nextflow run workflow/main.nf - pytest diff --git a/astrocyte_pkg.yml b/astrocyte_pkg.yml index f8b8dcf0bd0e0684ac9042aeeff55e5f2c7e628a..023968245603d637a7d8a9d4de07108ef961dcf0 100644 --- a/astrocyte_pkg.yml +++ b/astrocyte_pkg.yml @@ -44,6 +44,9 @@ workflow_modules: - 'cutadapt/1.9.1' - 'fastqc/0.11.2' - 'bwa/intel/0.7.12' + - 'samtools/1.4.1' + - 'sambamba/0.6.6' + - 'bedtools/2.26.0' # A list of parameters used by the workflow, defining how to present them, # options etc in the web interface. For each parameter: diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index c92f9f5ecb8b174f5a6db24c177638466451c90d..3268a6efb9431382e9f12887539ee242615e4ee1 100644 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -15,6 +15,10 @@ process { module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.12', 'samtools/intel/1.3'] cpus = 32 } + $filterReads{ + module = ['python/3.6.1-2-anaconda', 'samtools/1.4.1', 'sambamba/0.6.6', 'bedtools/2.26.0'] + cpus = 32 + } } params { diff --git a/workflow/main.nf b/workflow/main.nf index 90763c5aa18db0c0417bcc83c6002c11b5b15946..fd0c59d503f1c60e7f751db3058baa4e95f3b6be 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -114,7 +114,7 @@ process alignReads { output: set sampleId, file('*.bam'), biosample, factor, treatment, replicate, controlId into mappedReads - set file('*.srt.bam.flagstat.qc') + file '*.srt.bam.flagstat.qc' into mappedReadsStats script: @@ -130,3 +130,35 @@ process alignReads { } } + +// Dedup reads using sambamba +process filterReads { + + tag "$sampleId-$replicate" + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + + set sampleId, mapped, biosample, factor, treatment, replicate, controlId from mappedReads + + output: + + set sampleId, file('*.bam'), file('*.bai'), biosample, factor, treatment, replicate, controlId into dedupReads + file '*flagstat.qc' into dedupReadsStats + file '*pbc.qc' into dedupReadsComplexity + file '*dup.qc' into dupReads + + script: + + if (pairedEnd) { + """ + python3 $baseDir/scripts/map_qc.py -b $mapped -p + """ + } + else { + """ + python3 $baseDir/scripts/map_qc.py -b $mapped + """ + } + +} diff --git a/workflow/scripts/check_design.py b/workflow/scripts/check_design.py index acf4f014f2c4ed24d25da9bd5e268ace9bd2fadd..52185d6f6adde130a34cc593b8fe4f113e9b3879 100644 --- a/workflow/scripts/check_design.py +++ b/workflow/scripts/check_design.py @@ -26,11 +26,11 @@ def get_args(): formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument('-d', '--design', - help="The design file to run QC (TSV format).", + help="The design file to run QC (tsv format).", required=True) parser.add_argument('-f', '--fastq', - help="File with list of fastq files (csv format).", + help="File with list of fastq files (tsv format).", required=True) parser.add_argument('-p', '--paired', diff --git a/workflow/scripts/map_qc.py b/workflow/scripts/map_qc.py new file mode 100644 index 0000000000000000000000000000000000000000..f44b6e4cd39895fa09984a13ce066cbf8d5299b8 --- /dev/null +++ b/workflow/scripts/map_qc.py @@ -0,0 +1,289 @@ +#!/usr/bin/env python3 + +'''Remove duplicates and filter unmapped reads.''' + +import os +import subprocess +import argparse +import shutil +import shlex +import logging +from 
multiprocessing import cpu_count
+import utils
+import pandas as pd
+
+EPILOG = '''
+For more details:
+        %(prog)s --help
+'''
+
+# SETTINGS
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+logger.propagate = False
+logger.setLevel(logging.INFO)
+
+
+# The order of this list is important.
+# strip_extensions strips from the right inward, so
+# the expected right-most extensions should appear first (like .gz)
+# Modified from J. Seth Strattan
+STRIP_EXTENSIONS = ['.bam', '.srt']
+
+
+def get_args():
+    '''Define arguments.'''
+    parser = argparse.ArgumentParser(
+        description=__doc__, epilog=EPILOG,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+
+    parser.add_argument('-b', '--bam',
+                        help="The BAM file to run filtering and QC on.",
+                        required=True)
+
+    parser.add_argument('-p', '--paired',
+                        help="Set if the data are paired-end (default: single-end).",
+                        default=False,
+                        action='store_true')
+
+    args = parser.parse_args()
+    return args
+
+# Functions
+
+
+def check_tools():
+    '''Checks for required components on the user's system.'''
+
+    logger.info('Checking for required libraries and components on this system')
+
+    samtools_path = shutil.which("samtools")
+    if samtools_path:
+        logger.info('Found samtools: %s', samtools_path)
+    else:
+        logger.error('Missing samtools')
+        raise Exception('Missing samtools')
+
+    sambamba_path = shutil.which("sambamba")
+    if sambamba_path:
+        logger.info('Found sambamba: %s', sambamba_path)
+    else:
+        logger.error('Missing sambamba')
+        raise Exception('Missing sambamba')
+
+    bedtools_path = shutil.which("bedtools")
+    if bedtools_path:
+        logger.info('Found bedtools: %s', bedtools_path)
+    else:
+        logger.error('Missing bedtools')
+        raise Exception('Missing bedtools')
+
+
+def filter_mapped_pe(bam, bam_basename):
+    '''Use samtools to filter unmapped reads for PE data.'''
+
+    filt_bam_prefix = bam_basename + ".filt.srt"
+    filt_bam_filename = filt_bam_prefix + ".bam"
+    tmp_filt_bam_prefix = "tmp.%s" % (filt_bam_prefix)
+    tmp_filt_bam_filename = tmp_filt_bam_prefix + ".bam"
+
+    # Remove unmapped, mate unmapped,
+    # not primary alignment, reads failing platform QC
+    # Remove low MAPQ reads
+    # Only keep properly paired reads
+    # Obtain name sorted BAM file
+    out, err = utils.run_pipe([
+        # filter: -F 1804 FLAG bits to exclude; -f 2 FLAG bits to require;
+        # -q 30 exclude MAPQ < 30; -u uncompressed output
+        # exclude FLAG 1804: unmapped, next segment unmapped, secondary
+        # alignments, not passing platform QC, PCR or optical duplicates
+        # require FLAG 2: properly aligned
+        "samtools view -F 1804 -f 2 -q 30 -u %s" % (bam),
+        # sort: -n sort by name; - take input from stdin;
+        # out to specified filename
+        # Will produce name sorted BAM
+        "samtools sort -n -@ %d -o %s" % (cpu_count(), tmp_filt_bam_filename)])
+    if err:
+        logger.error("samtools filter error: %s" % (err))
+
+    # Remove orphan reads (pair was removed)
+    # and read pairs mapping to different chromosomes
+    # Obtain position sorted BAM
+    out, err = utils.run_pipe([
+        # fill in mate coordinates, ISIZE and mate-related flags;
+        # fixmate requires name-sorted alignment; -r removes secondary and
+        # unmapped (redundant here because already done above?)
+        # - send output to stdout
+        "samtools fixmate -r %s -" % (tmp_filt_bam_filename),
+        # repeat filtering after mate repair
+        "samtools view -F 1804 -f 2 -u -",
+        # produce the coordinate-sorted BAM
+        "samtools sort -@ %d -o %s" % (cpu_count(), filt_bam_filename)])
+
+    os.remove(tmp_filt_bam_filename)
+    return filt_bam_filename
+
+
+def filter_mapped_se(bam, bam_basename):
+    '''Use samtools to filter unmapped reads for SE data.'''
+
+    filt_bam_prefix = bam_basename + ".filt.srt"
+    filt_bam_filename = filt_bam_prefix + ".bam"
+
+    # Remove unmapped, mate unmapped,
+    # not primary alignment, reads failing platform QC
+    # Remove low MAPQ reads
+    # Input is already position sorted, so the output stays sorted
+    with open(filt_bam_filename, 'w') as fh:
+        samtools_filter_command = (
+            "samtools view -F 1804 -q 30 -b %s"
+            % (bam)
+            )
+        logger.info(samtools_filter_command)
+        subprocess.check_call(
+            shlex.split(samtools_filter_command),
+            stdout=fh)
+
+    return filt_bam_filename
+
+
+def dedup_mapped(bam, bam_basename, paired):
+    '''Use sambamba and samtools to remove duplicates.'''
+
+    # Mark duplicates
+    dup_file_qc_filename = bam_basename + ".dup.qc"
+    tmp_dup_mark_filename = bam_basename + ".dupmark.bam"
+    sambamba_params = "--hash-table-size=17592186044416" + \
+        " --overflow-list-size=20000000 --io-buffer-size=256"
+    with open(dup_file_qc_filename, 'w') as fh:
+        sambamba_markdup_command = (
+            "sambamba markdup -t %d %s %s %s"
+            % (cpu_count(), sambamba_params, bam, tmp_dup_mark_filename)
+            )
+        logger.info(sambamba_markdup_command)
+        subprocess.check_call(
+            shlex.split(sambamba_markdup_command),
+            stderr=fh)
+
+
+    # Remove duplicates
+    final_bam_prefix = bam_basename + ".filt.nodup"
+    final_bam_filename = final_bam_prefix + ".bam"
+
+    if paired:  # paired-end data
+        samtools_dedupe_command = \
+            "samtools view -F 1804 -f 2 -b %s" % (tmp_dup_mark_filename)
+    else:
+        samtools_dedupe_command = \
+            "samtools view -F 1804 -b %s" % (tmp_dup_mark_filename)
+
+    with open(final_bam_filename, 'w') as fh:
+        logger.info(samtools_dedupe_command)
+        subprocess.check_call(
+            shlex.split(samtools_dedupe_command),
+            stdout=fh)
+
+    # Index final BAM file (with samtools, not sambamba)
+    samtools_index_command = \
+        "samtools index -@ %d %s" % (cpu_count(), final_bam_filename)
+    logger.info(samtools_index_command)
+    subprocess.check_output(shlex.split(samtools_index_command))
+
+    # Generate mapping statistics
+    final_bam_file_mapstats_filename = final_bam_prefix + ".flagstat.qc"
+    with open(final_bam_file_mapstats_filename, 'w') as fh:
+        flagstat_command = "sambamba flagstat -t %d %s" \
+            % (cpu_count(), final_bam_filename)
+        logger.info(flagstat_command)
+        subprocess.check_call(shlex.split(flagstat_command), stdout=fh)
+
+    os.remove(bam)
+    return tmp_dup_mark_filename
+
+
+def compute_complexity(bam, paired, bam_basename):
+    '''Calculate library complexity.'''
+
+    pbc_file_qc_filename = bam_basename + ".filt.nodup.pbc.qc"
+    tmp_pbc_file_qc_filename = "tmp.%s" % (pbc_file_qc_filename)
+
+    # Sort by name
+    # convert to bedPE and obtain fragment coordinates
+    # sort by position and strand
+    # Obtain unique count statistics
+
+    # PBC file output columns:
+    # TotalReadPairs [tab]
+    # DistinctReadPairs [tab]
+    # OneReadPair [tab]
+    # TwoReadPairs [tab]
+    # NRF=Distinct/Total [tab]
+    # PBC1=OnePair/Distinct [tab]
+    # PBC2=OnePair/TwoPair
+    pbc_headers = ['TotalReadPairs',
+                   'DistinctReadPairs',
+                   'OneReadPair',
+                   'TwoReadPairs',
+                   'NRF',
+                   'PBC1',
+                   'PBC2']
+
+    if paired:
+        steps = [
+            "sambamba sort -t %d -n %s" % (cpu_count(), bam),
+            "bamToBed -bedpe -i stdin",
+            r"""awk 'BEGIN{OFS="\t"}{print
$1,$2,$4,$6,$9,$10}'"""] + else: + steps = [ + "bamToBed -i %s" % (bam), + r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'"""] + steps.extend([ + "grep -v 'chrM'", + "sort", + "uniq -c", + r"""awk 'BEGIN{mt=0;m0=0;m1=0;m2=0} ($1==1){m1=m1+1} ($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} END{printf "%d\t%d\t%d\t%d\t%f\t%f\t%f\n",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}'""" + ]) + out, err = utils.run_pipe(steps, tmp_pbc_file_qc_filename) + if err: + logger.error("PBC file error: %s" % (err)) + + # Add headers + pbc_file = pd.read_csv(tmp_pbc_file_qc_filename, sep='\t', header=None, + names=pbc_headers) + pbc_file.to_csv(pbc_file_qc_filename, header=True, sep='\t', index=False) + os.remove(bam) + os.remove(bam + '.bai') + os.remove(tmp_pbc_file_qc_filename) + + +def main(): + args = get_args() + paired = args.paired + bam = args.bam + + # Create a file handler + handler = logging.FileHandler('map_qc.log') + logger.addHandler(handler) + + # Check if tools are present + check_tools() + + # Run filtering for either PE or SE + bam_basename = os.path.basename( + utils.strip_extensions(bam, STRIP_EXTENSIONS)) + if paired: # paired-end data + filter_bam_filename = filter_mapped_pe(bam, bam_basename) + + else: + filter_bam_filename = filter_mapped_se(bam, bam_basename) + + # Remove duplicates + markdup_bam_filename = dedup_mapped(filter_bam_filename, bam_basename, paired) + + # Compute library complexity + compute_complexity(markdup_bam_filename, paired, bam_basename) + + +if __name__ == '__main__': + main() diff --git a/workflow/tests/test_map_qc.py b/workflow/tests/test_map_qc.py new file mode 100644 index 0000000000000000000000000000000000000000..ebc2cd5aae2808986fb59a48250cf53eadb162c4 --- /dev/null +++ b/workflow/tests/test_map_qc.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 + +import pytest +import os +import pandas as pd + +test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ + '/../output/filterReads/' + + +def test_map_qc_singleend(): + assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bam')) + assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bam.bai')) + filtered_reads_report = test_output_path + 'ENCFF646LXU.filt.nodup.flagstat.qc' + samtools_report = open(filtered_reads_report).readlines() + assert '64962570 + 0 in total' in samtools_report[0] + assert '64962570 + 0 mapped (100.00%:N/A)' in samtools_report[4] + library_complexity = test_output_path + 'ENCFF646LXU.filt.nodup.pbc.qc' + df_library_complexity = pd.read_csv(library_complexity, sep='\t') + assert df_library_complexity["NRF"].iloc[0] == 0.926192 + assert df_library_complexity["PBC1"].iloc[0] == 0.926775 + assert df_library_complexity["PBC2"].iloc[0] == 13.706885 + + +def test_map_qc_pairedend(): + # Do the same thing for paired end data + pass diff --git a/workflow/tests/test_map_reads.py b/workflow/tests/test_map_reads.py index d5a83667a39e768c0c66698b298a53f1aa681ce2..04ffd08084999dfeb2eebde88055e73e9832951f 100644 --- a/workflow/tests/test_map_reads.py +++ b/workflow/tests/test_map_reads.py @@ -3,10 +3,13 @@ import pytest import os +test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ + '/../output/alignReads/' + def test_map_reads_singleend(): - aligned_reads_report = os.path.dirname(os.path.abspath(__file__)) + \ - '/../output/alignReads/ENCFF646LXU.srt.bam.flagstat.qc' + assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.srt.bam')) + aligned_reads_report = test_output_path + 'ENCFF646LXU.srt.bam.flagstat.qc' samtools_report = 
open(aligned_reads_report).readlines() assert '80795025 + 0 in total' in samtools_report[0] assert '80050072 + 0 mapped (99.08% : N/A)' in samtools_report[4] diff --git a/workflow/tests/test_trim_reads.py b/workflow/tests/test_trim_reads.py index e9680f681251d9d9a9db07ce679e5e5d6a0eba2f..7ca03750cd6f585e078de78d550e3f7fbac824bd 100644 --- a/workflow/tests/test_trim_reads.py +++ b/workflow/tests/test_trim_reads.py @@ -3,14 +3,17 @@ import pytest import os +test_data_path = os.path.dirname(os.path.abspath(__file__)) + \ + '/../../test_data/' +test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ + '/../output/trimReads/' + def test_trim_reads_singleend(): - raw_fastq = os.path.dirname(os.path.abspath(__file__)) + \ - '/../../test_data/ENCFF833BLU.fastq.gz' - trimmed_fastq = os.path.dirname(os.path.abspath(__file__)) + \ - '/../output/trimReads/ENCFF833BLU_trimmed.fq.gz' - trimmed_fastq_report = os.path.dirname(os.path.abspath(__file__)) + \ - '/../output/trimReads/ENCFF833BLU.fastq.gz_trimming_report.txt' + raw_fastq = test_data_path + 'ENCFF833BLU.fastq.gz' + trimmed_fastq = test_output_path + 'ENCFF833BLU_trimmed.fq.gz' + trimmed_fastq_report = test_output_path + \ + 'ENCFF833BLU.fastq.gz_trimming_report.txt' assert os.path.getsize(raw_fastq) != os.path.getsize(trimmed_fastq) assert os.path.getsize(trimmed_fastq) == 2512853101 assert 'Trimming mode: single-end' in open(trimmed_fastq_report).readlines()[4]
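
Note on utils.run_pipe(): map_qc.py chains its samtools commands through a run_pipe() helper from workflow/scripts/utils.py, which is not part of this diff. To make the call sites reviewable on their own, here is a minimal sketch of what such a helper could look like, inferred only from the two usages run_pipe(steps) and run_pipe(steps, outfile); the implementation details are assumptions, not the pipeline's actual code.

```python
# Hypothetical sketch of a run_pipe(steps, outfile=None) helper matching
# the call sites in map_qc.py; not the actual utils.py implementation.
import shlex
import subprocess


def run_pipe(steps, outfile=None):
    '''Chain commands, piping each stage's stdout into the next stage's stdin.'''
    processes = []
    prev_stdout = None
    for i, step in enumerate(steps):
        last = (i == len(steps) - 1)
        # The last stage either writes to a file or is captured in memory.
        stdout = open(outfile, 'w') if (last and outfile) else subprocess.PIPE
        proc = subprocess.Popen(shlex.split(step),
                                stdin=prev_stdout,
                                stdout=stdout,
                                # Only the final stage's stderr is collected,
                                # matching the `if err:` checks in map_qc.py.
                                stderr=subprocess.PIPE if last else None)
        if prev_stdout is not None:
            prev_stdout.close()  # let SIGPIPE propagate to upstream stages
        prev_stdout = proc.stdout
        processes.append(proc)
    out, err = processes[-1].communicate()
    if outfile:
        stdout.close()
    return out, err
```

With this shape, the first utils.run_pipe() call in filter_mapped_pe() behaves like `samtools view ... | samtools sort ...`, with `err` coming from the final sort stage.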
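
Note on the complexity metrics: the awk END block in compute_complexity() condenses the library-complexity arithmetic into one line. The same computation in plain Python, for readability only; `counts` is assumed to be the list of per-location read (pair) counts that the `sort | uniq -c` stage emits (chrM already excluded by the grep step):

```python
# Plain-Python restatement of the awk END block in compute_complexity().
def pbc_metrics(counts):
    mt = sum(counts)                       # TotalReadPairs
    m0 = len(counts)                       # DistinctReadPairs
    m1 = sum(1 for c in counts if c == 1)  # OneReadPair
    m2 = sum(1 for c in counts if c == 2)  # TwoReadPairs
    return {'NRF': m0 / mt,    # non-redundant fraction
            'PBC1': m1 / m0,   # PCR bottlenecking coefficient 1
            'PBC2': m1 / m2}   # PCR bottlenecking coefficient 2
```

For reference, the single-end fixture in test_map_qc.py expects NRF = 0.926192, PBC1 = 0.926775 and PBC2 = 13.706885 from this calculation.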
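
Note on the paired-end test stub: test_map_qc_pairedend() is left as a pass-through until paired-end fixtures exist. A sketch of the shape it will presumably take, mirroring test_map_qc_singleend(); the accession 'SAMPLE' and the loosened assertions are placeholders to be replaced with real fixture values:

```python
# Hypothetical paired-end counterpart of test_map_qc_singleend();
# 'SAMPLE' and all asserted values are placeholders.
import os
import pandas as pd

test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
    '/../output/filterReads/'


def test_map_qc_pairedend():
    bam = os.path.join(test_output_path, 'SAMPLE.filt.nodup.bam')
    assert os.path.exists(bam)
    assert os.path.exists(bam + '.bai')
    filtered_reads_report = test_output_path + 'SAMPLE.filt.nodup.flagstat.qc'
    samtools_report = open(filtered_reads_report).readlines()
    assert 'in total' in samtools_report[0]  # pin the exact count here
    library_complexity = test_output_path + 'SAMPLE.filt.nodup.pbc.qc'
    df_library_complexity = pd.read_csv(library_complexity, sep='\t')
    # Pin exact NRF/PBC1/PBC2 values once paired-end fixtures are generated
    assert 0.0 <= df_library_complexity["NRF"].iloc[0] <= 1.0
```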