diff --git a/astrocyte_pkg.yml b/astrocyte_pkg.yml index c7ff94f607a1aadf0c894c1b149acca748952005..568a7d115104d45d83f12e35efa696912411cb33 100644 --- a/astrocyte_pkg.yml +++ b/astrocyte_pkg.yml @@ -9,7 +9,7 @@ # A unique identifier for the workflow package, text/underscores only name: 'chipseq_analysis_bicf' # Who wrote this? -author: 'Beibei Chen' +author: 'Beibei Chen and Venkat Malladi' # A contact email address for questions email: 'biohpc-help@utsouthwestern.edu' # A more informative title for the workflow package @@ -17,8 +17,7 @@ title: 'BICF ChIP-seq Analysis Workflow' # A summary of the workflow package in plain text description: | This is a workflow package for the BioHPC/BICF ChIP-seq workflow system. - It implements a simple ChIP-seq analysis workflow and - visualization application. + It implements ChIP-seq analysis workflow and visualization application. # ----------------------------------------------------------------------------- # DOCUMENTATION @@ -42,9 +41,9 @@ documentation_files: workflow_modules: - 'python/3.6.1-2-anaconda' - 'trimgalore/0.4.1' - - 'fastqc/0.11.5' - - 'deeptools/2.3.5' - - 'meme/4.11.1-gcc-openmpi' + - 'cutadapt/1.9.1' + - 'fastqc/0.11.2' + - 'bwa/intel/0.7.15' # A list of parameters used by the workflow, defining how to present them, # options etc in the web interface. For each parameter: @@ -84,10 +83,11 @@ workflow_parameters: type: files required: true description: | - Fastq files of all samples - regex: "*_{1,2}.fastq.gz" + One or more input FASTQ files from a ChIP-seq expereiment and a design + file with the link bewetwen the same file name and sample id + regex: ".*(fastq|fq)*" - - id: singleEnd + - id: pairedEnd type: select required: true choices: @@ -100,6 +100,24 @@ workflow_parameters: read length, and then starts another round of reading from the opposite end of the fragment. + - id: design + type: file + required: true + description: | + A design file listing sample id, fastq files, corresponding control id + and additional information about the sample. + +- id: genome + type: select + choices: + - [ 'GRCh38', 'Human GRCh38'] + - [ 'GRCh37', 'Human GRCh37'] + - [ 'GRCh38', 'Mouse GRCh38'] + required: true + description: | + Reference species and genome used for alignment and subsequent analysis. + + # ----------------------------------------------------------------------------- # SHINY APP CONFIGURATION # ----------------------------------------------------------------------------- diff --git a/workflow/conf/biohpc.conf b/workflow/conf/biohpc.conf index 3587d62820c67993ed939318bc4f5aa865e29b26..9f4bf081e96bf6f31b3e1f1d815e86fa7c5dd081 100644 --- a/workflow/conf/biohpc.conf +++ b/workflow/conf/biohpc.conf @@ -10,4 +10,18 @@ process { module = ['python/3.6.1-2-anaconda', 'trimgalore/0.4.1'] cpus = 32 } + $alignReads{ + module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.15', 'samtools/intel/1.3'] + cpus = 32 + memory = 12.GB + } +} + +params { + // Reference file paths on BioHPC + genomes { + 'GRCh38' { bwa = '/project/shared/bicf_workflow_ref/GRCh38' } + 'GRCh37' { bwa = '/project/shared/bicf_workflow_ref/GRCh37' } + 'GRCm38' { bwa = '/project/shared/bicf_workflow_ref/GRCm38' } + } } diff --git a/workflow/main.nf b/workflow/main.nf index 6ee87c6c6c923739295ab73b0033f395d984d05a..4494a861280aee1b56c4775de4bb29a59fd7a63d 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -7,6 +7,18 @@ params.reads = "$baseDir/../test_data/*.fastq.gz" params.pairedEnd = false params.designFile = "$baseDir/../test_data/design_ENCSR238SGC_SE.txt" +params.genome = 'GRCh38' +params.genomes = [] +params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false + +// Check inputs +if( params.bwaIndex ){ + bwaIndex = Channel + .fromPath(params.bwaIndex) + .ifEmpty { exit 1, "BWA index not found: ${params.bwaIndex}" } +} else { + exit 1, "No reference genome specified." +} // Define List of Files readsList = Channel @@ -55,14 +67,14 @@ if (pairedEnd) { } else { rawReads = designFilePaths .splitCsv(sep: '\t', header: true) - .map { row -> [ row.sample_id, [row.fastq_read1, row.fastq_read1], row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] } + .map { row -> [ row.sample_id, [row.fastq_read1], row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] } } // Trim raw reads using trimgalore process trimReads { tag "$sampleId-$replicate" - publishDir "$baseDir/output/{task.process}/$sampleId-$replicate/", mode: 'copy' + publishDir "$baseDir/output/${task.process}", mode: 'copy' input: @@ -77,12 +89,42 @@ process trimReads { if (pairedEnd) { """ - python $baseDir/scripts/trim_reads.py -f $reads -p + python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -p + """ + } + else { + """ + python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} + """ + } + +} + +// Align trimmed reads using bwa +process alignReads { + + tag "$sampleId-$replicate" + publishDir "$baseDir/output/{task.process}", mode: 'copy' + + input: + + set sampleId, reads, biosample, factor, treatment, replicate, controlId from trimmedReads + file index from bwaIndex.first() + + output: + + set sampleId, file('*.bam'), biosample, factor, treatment, replicate, controlId into mappedReads + + script: + + if (pairedEnd) { + """ + python $baseDir/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -p """ } else { """ - python $baseDir/scripts/check_design.py -f $reads + python $baseDir/scripts/map_reads.py -f ${reads[0]} -r ${index}/genome.fa """ } diff --git a/workflow/scripts/map_reads.py b/workflow/scripts/map_reads.py new file mode 100644 index 0000000000000000000000000000000000000000..e3a8d466f7fbd04fb4dd4d47cde7cae220dc380a --- /dev/null +++ b/workflow/scripts/map_reads.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python3 + +'''Align reads to reference genome.''' + +import os +import subprocess +import argparse +import shutil +import logging +import sys +from multiprocessing import cpu_count +import json +import utils + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +## SETTINGS + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + +# the order of this list is important. +# strip_extensions strips from the right inward, so +# the expected right-most extensions should appear first (like .gz) +# Modified from J. Seth Strattan +STRIP_EXTENSIONS = ['.gz', '.fq', '.fastq', '_trimmed'] + + +def get_args(): + '''Define arguments.''' + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-f', '--fastq', + help="The fastq file to run triming on.", + nargs='+', + required=True) + + parser.add_argument('-r', '--reference', + help="The bwa index of the reference genome.", + required=True) + + parser.add_argument('-p', '--paired', + help="True/False if paired-end or single end.", + default=False, + action='store_true') + + args = parser.parse_args() + return args + + + +## Functions + +def strip_extensions(filename, extensions): + '''Strips extensions to get basename of file.''' + + basename = filename + for extension in extensions: + basename = basename.rpartition(extension)[0] or basename + + return basename + + +def check_tools(): + '''Checks for required componenets on user system''' + + logger.info('Checking for required libraries and components on this system') + + bwa_path = shutil.which("bwa") + if trimgalore_path: + logger.info('Found bwa: %s', bwa_path) + else: + logger.error('Missing bwa') + raise Exception('Missing bwa') + + samtools_path = shutil.which("samtools") + if samtools_path: + logger.info('Found samtools: %s', samtools_path) + else: + logger.error('Missing samtools') + raise Exception('Missing samtools') + + +def generate_sa(fastq, reference): + '''Use BWA to generate Suffix Arrays.''' + + fastq_basename = os.path.basename(strip_extensions(fastq, STRIP_EXTENSIONS)) + + bwa_aln_params = '-q 5 -l 32 -k 2' + + sai = '%s.sai' % (fastq_basename) + with open(sai, 'w') as sai_file: + bwa_command = "bwa aln %s -t %d %s %s" \ + % (bwa_aln_params, cpu_count(), + reference, fastq_basename) + + logger.info("Running bwa with %s", bwa_command) + subprocess.check_call(shlex.split(bwa_command), stdout=sai_file) + + return sai + + +def align_se(fastq, sai, reference, fastq_basename): + '''Use BWA to align SE data.''' + + sam_filename = "%s.sam" % (fastq_basename) + + steps = [ + "bwa samse %s %s %s" + % (reference, fastq[0], sai[0]), + "samtools view -@%d -Su -" % (cpu_count()), + "samtools sort -@%d -o %s" + % (cpu_count(), fastq_basename)] + + out, err = utils.run_pipe(steps) + if err: + logger.error("samse/samtools error: %s" % (err)) + + +def align_pe(fastq, sai, reference, fastq_basename): + '''Use BWA to align PE data.''' + + sam_filename = "%s.sam" % (fastq_basename) + badcigar_filename = "%s.badReads" % (fastq_basename) + + # Remove read pairs with bad CIGAR strings and sort by position + steps = [ + "bwa sampe -P %s %s %s %s %s" + % (reference, fastq[0], fastq[1], + sai[0], sai[1]), + "tee %s" % (sam_filename), + r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""", + "sort", + "uniq"] + + out, err = utils.run_pipe(steps, badcigar_filename) + if err: + logger.error("sampe error: %s" % (err)) + + steps = [ + "cat %s" % (sam_filename), + "grep -v -F -f %s" % (badcigar_filename), + "samtools view -@%d -Su -" % (cpu_count()), + "samtools sort -@%d -o %s" + % (cpu_count(), fastq_basename)] + + out, err = utils.run_pipe(steps) + if err: + logger.error("samtools error: %s" % (err)) + + +def main(): + args = get_args() + + # Create a file handler + handler = logging.FileHandler('map.log') + logger.addHandler(handler) + + # Check if tools are present + check_tools() + + # Run Suffix Array generation + sai = [] + for fastq in args.fastq: + sai_filename = generate_sa(fastq, args.reference) + sai.append(sai_filename) + + # Run alignment for either PE or SE + if paired: # paired-end data + fastq_r1_basename = os.path.basename( + strip_extensions(fastq[0], STRIP_EXTENSIONS)) + fastq_r2_basename = os.path.basename( + strip_extensions(fastq[1], STRIP_EXTENSIONS)) + fastq_basename = fastq_r1_basename + fastq_r2_basename + + align_pe(fastq, sai, fastq_basename) + + else: + fastq_basename = os.path.basename( + strip_extensions(fastq[0], STRIP_EXTENSIONS)) + + align_se(fastq, sai, fastq_basename) + + bam_mapstats_filename = '%s.raw.srt.bam.flagstat.qc' % (fastq_basename) + with open(raw_bam_mapstats_filename, 'w') as fh: + subprocess.check_call( + shlex.split("%s flagstat %s" % (samtools, bam_mapstats_filename)), + stdout=fh) + + # Remove sai files + for sai_file in sai: + os.remove(sai_file) + + +if __name__ == '__main__': + main() diff --git a/workflow/scripts/utils.py b/workflow/scripts/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e8984472acc5bdf468cda3ad0d2893ea9f703ec1 --- /dev/null +++ b/workflow/scripts/utils.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 + +'''General utilities.''' + + +import sys +import os +import subprocess +import shlex +import logging + + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = True + + +def run_pipe(steps, outfile=None): + # TODO: capture stderr + from subprocess import Popen, PIPE + p = None + p_next = None + first_step_n = 1 + last_step_n = len(steps) + for n, step in enumerate(steps, start=first_step_n): + logger.debug("step %d: %s" % (n, step)) + if n == first_step_n: + if n == last_step_n and outfile: # one-step pipeline with outfile + with open(outfile, 'w') as fh: + print("one step shlex: %s to file: %s" % (shlex.split(step), outfile)) + p = Popen(shlex.split(step), stdout=fh) + break + print("first step shlex to stdout: %s" % (shlex.split(step))) + + p = Popen(shlex.split(step), stdout=PIPE) + elif n == last_step_n and outfile: # only treat the last step specially if you're sending stdout to a file + with open(outfile, 'w') as fh: + print("last step shlex: %s to file: %s" % (shlex.split(step), outfile)) + p_last = Popen(shlex.split(step), stdin=p.stdout, stdout=fh) + p.stdout.close() + p = p_last + else: # handles intermediate steps and, in the case of a pipe to stdout, the last step + print("intermediate step %d shlex to stdout: %s" % (n, shlex.split(step))) + p_next = Popen(shlex.split(step), stdin=p.stdout, stdout=PIPE) + p.stdout.close() + p = p_next + out, err = p.communicate() + return out, err