diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index 6579c9aaedc710b8c6d3ace646298a0c4d10e0fe..de84d909077549b22b89da491545bc9ca993160e 100644 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -23,6 +23,10 @@ process { module = ['python/3.6.1-2-anaconda', 'deeptools/2.5.0.1'] executor = 'local' } + $convertReads { + module = ['python/3.6.1-2-anaconda', 'samtools/1.4.1', 'bedtools/2.26.0'] + cpus = 32 + } } params { diff --git a/workflow/main.nf b/workflow/main.nf index 0e07dd5e8bf6ff568eef02e7ccc7f4968bebc1a5..fda32915c51e3bb91aeba118ce566fb1476bb83d 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -144,6 +144,7 @@ process filterReads { output: set sampleId, file('*.bam'), file('*.bai'), biosample, factor, treatment, replicate, controlId into dedupReads + set sampleId, file('*.bam'), biosample, factor, treatment, replicate, controlId into convertReads file '*flagstat.qc' into dedupReadsStats file '*pbc.qc' into dedupReadsComplexity file '*dup.qc' into dupReads @@ -189,3 +190,32 @@ process experimentQC { """ } + +// Convert reads to bam +process convertReads { + + tag "$sampleId-$replicate" + publishDir "$baseDir/output/${task.process}", mode: 'copy' + + input: + + set sampleId, deduped, biosample, factor, treatment, replicate, controlId from convertReads + + output: + + set sampleId, file('*.tagAlign.gz'), file('*.bedpe.gz'), biosample, factor, treatment, replicate, controlId into dedupReads + + script: + + if (pairedEnd) { + """ + python3 $baseDir/scripts/convert_reads.py -b $deduped -p + """ + } + else { + """ + python3 $baseDir/scripts/convert_reads.py -b $deduped + """ + } + +} diff --git a/workflow/scripts/convert_reads.py b/workflow/scripts/convert_reads.py new file mode 100644 index 0000000000000000000000000000000000000000..8019f15cb792dcdee6bc0a34f6854684642cf5a0 --- /dev/null +++ b/workflow/scripts/convert_reads.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 + +'''Convert tagAlign files for further processing.''' + +import os +import argparse +import shutil +import shlex +import logging +from multiprocessing import cpu_count +import utils + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +# SETTINGS + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + + +def get_args(): + '''Define arguments.''' + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-b', '--bam', + help="The bam file to run filtering and qc on.", + required=True) + + parser.add_argument('-p', '--paired', + help="True/False if paired-end or single end.", + default=False, + action='store_true') + + args = parser.parse_args() + return args + +# Functions + + +def check_tools(): + '''Checks for required componenets on user system''' + + logger.info('Checking for required libraries and components on this system') + + bedtools_path = shutil.which("bedtools") + if bedtools_path: + logger.info('Found bedtools: %s', bedtools_path) + else: + logger.error('Missing bedtools') + raise Exception('Missing bedtools') + + samtools_path = shutil.which("samtools") + if samtools_path: + logger.info('Found samtools: %s', samtools_path) + else: + logger.error('Missing samtools') + raise Exception('Missing samtools') + + +def convert_mapped(bam, bam_basename): + '''Use bedtools to convert to tagAlign.''' + + tag_filename = bam_basename + '.tagAlign.gz' + out, err = utils.run_pipe([ + "bamToBed -i %s" % (bam), + r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""", + "gzip -nc"], outfile=tag_filename) + + +def convert_mapped_pe(bam, bam_basename): + '''Use bedtools to convert to tagAlign PE data.''' + + bedpe_filename = bam_basename + ".bedpe.gz" + + # Name sort bam to make BEDPE + nmsrt_bam_filename = bam_basename + ".nmsrt.bam" + samtools_sort_command = \ + "samtools sort -n -@%d -o %s %s" \ + % (cpu_count(), nmsrt_bam_filename, bam) + + logger.info(samtools_sort_command) + utils.check_output(shlex.split(samtools_sort_command)) + + out, err = utils.run_pipe([ + "bamToBed -bedpe -mate1 -i %s" % (nmsrt_bam_filename), + "gzip -nc"], + outfile=bedpe_filename) + + +def main(): + args = get_args() + paired = args.paired + bam = args.bam + + # Create a file handler + handler = logging.FileHandler('convert_reads.log') + logger.addHandler(handler) + + # Check if tools are present + check_tools() + + # Convert PE or SE to tagAlign + bam_basename = os.path.basename( + utils.strip_extensions(bam, ['.bam'])) + + convert_mapped(bam, bam_basename) + + if paired: # paired-end data + convert_mapped_pe(bam, bam_basename) + + +if __name__ == '__main__': + main() diff --git a/workflow/tests/test_convert_reads.py b/workflow/tests/test_convert_reads.py new file mode 100644 index 0000000000000000000000000000000000000000..8d25fbc093faed5b4a4a7184fcbd499386da9734 --- /dev/null +++ b/workflow/tests/test_convert_reads.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +import pytest +import os + +test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ + '/../output/convertReads/' + + +def test_convert_reads_singleend(): + assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.tagAlign.gz')) + + +def test_map_qc_pairedend(): + # Do the same thing for paired end data + # Also check that bedpe exists + pass