Add in maping step with apporpriate configurations.

14299993 · Venkat Malladi · 38f026a1 · 14299993 · 14299993 · 14299993
Commit 14299993 authored 7 years ago by Venkat Malladi
--- a/astrocyte_pkg.yml
+++ b/astrocyte_pkg.yml
@@ -9,7 +9,7 @@
 # A unique identifier for the workflow package, text/underscores only
 name: 'chipseq_analysis_bicf'
 # Who wrote this?
-author: 'Beibei Chen'
+author: 'Beibei Chen and Venkat Malladi'
 # A contact email address for questions
 email: 'biohpc-help@utsouthwestern.edu'
 # A more informative title for the workflow package
@@ -17,8 +17,7 @@ title: 'BICF ChIP-seq Analysis Workflow'
 # A summary of the workflow package in plain text
 description: |
  This is a workflow package for the BioHPC/BICF ChIP-seq workflow system.
-  It implements a simple ChIP-seq analysis workflow and
-  visualization application.
+  It implements ChIP-seq analysis workflow and visualization application.

 # -----------------------------------------------------------------------------
 # DOCUMENTATION
@@ -42,9 +41,9 @@ documentation_files:
 workflow_modules:
  - 'python/3.6.1-2-anaconda'
  - 'trimgalore/0.4.1'
-  - 'fastqc/0.11.5'
-  - 'deeptools/2.3.5'
-  - 'meme/4.11.1-gcc-openmpi'
+  - 'cutadapt/1.9.1'
+  - 'fastqc/0.11.2'
+  - 'bwa/intel/0.7.15'

 # A list of parameters used by the workflow, defining how to present them,
 # options etc in the web interface. For each parameter:
@@ -84,10 +83,11 @@ workflow_parameters:
    type: files
    required: true
    description: |
-      Fastq files of all  samples
-    regex: "*_{1,2}.fastq.gz"
+      One or more input FASTQ files from a ChIP-seq expereiment and a design
+      file with the link bewetwen the same file name and sample id
+    regex: ".*(fastq|fq)*"

-  - id: singleEnd
+  - id: pairedEnd
    type: select
    required: true
    choices:
@@ -100,6 +100,24 @@ workflow_parameters:
      read length, and then starts another round of reading from the opposite
      end of the fragment.

+ - id: design
+    type: file
+    required: true
+    description: |
+      A design file listing sample id, fastq files, corresponding control id
+      and additional information about the sample.
+
+- id: genome
+  type: select
+  choices:
+    - [ 'GRCh38', 'Human GRCh38']
+    - [ 'GRCh37', 'Human GRCh37']
+    - [ 'GRCh38', 'Mouse GRCh38']
+  required: true
+  description: |
+    Reference species and genome used for alignment and subsequent analysis.
+
+
 # -----------------------------------------------------------------------------
 # SHINY APP CONFIGURATION
 # -----------------------------------------------------------------------------

--- a/workflow/conf/biohpc.conf
+++ b/workflow/conf/biohpc.conf
@@ -10,4 +10,18 @@ process {
    module = ['python/3.6.1-2-anaconda', 'trimgalore/0.4.1']
    cpus = 32
  }
+  $alignReads{
+    module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.15', 'samtools/intel/1.3']
+    cpus = 32
+    memory = 12.GB
+  }
+}
+
+params {
+  // Reference file paths on BioHPC
+  genomes {
+    'GRCh38' { bwa = '/project/shared/bicf_workflow_ref/GRCh38' }
+    'GRCh37' { bwa = '/project/shared/bicf_workflow_ref/GRCh37' }
+    'GRCm38' { bwa = '/project/shared/bicf_workflow_ref/GRCm38' }
+  }
 }
--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -7,6 +7,18 @@
 params.reads = "$baseDir/../test_data/*.fastq.gz"
 params.pairedEnd = false
 params.designFile = "$baseDir/../test_data/design_ENCSR238SGC_SE.txt"
+params.genome = 'GRCh38'
+params.genomes = []
+params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false
+
+// Check inputs
+if( params.bwaIndex ){
+  bwaIndex = Channel
+    .fromPath(params.bwaIndex)
+    .ifEmpty { exit 1, "BWA index not found: ${params.bwaIndex}" }
+} else {
+  exit 1, "No reference genome specified."
+}

 // Define List of Files
 readsList = Channel
@@ -55,14 +67,14 @@ if (pairedEnd) {
 } else {
 rawReads = designFilePaths
  .splitCsv(sep: '\t', header: true)
-  .map { row -> [ row.sample_id, [row.fastq_read1, row.fastq_read1], row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] }
+  .map { row -> [ row.sample_id, [row.fastq_read1], row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] }
 }

 // Trim raw reads using trimgalore
 process trimReads {

  tag "$sampleId-$replicate"
-  publishDir "$baseDir/output/{task.process}/$sampleId-$replicate/", mode: 'copy'
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'

  input:

@@ -77,12 +89,42 @@ process trimReads {

  if (pairedEnd) {
    """
-    python $baseDir/scripts/trim_reads.py -f $reads -p
+    python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -p
+    """
+  }
+  else {
+    """
+    python3 $baseDir/scripts/trim_reads.py -f ${reads[0]}
+    """
+  }
+
+}
+
+// Align trimmed reads using bwa
+process alignReads {
+
+  tag "$sampleId-$replicate"
+  publishDir "$baseDir/output/{task.process}", mode: 'copy'
+
+  input:
+
+  set sampleId, reads, biosample, factor, treatment, replicate, controlId from trimmedReads
+  file index from bwaIndex.first()
+
+  output:
+
+  set sampleId, file('*.bam'), biosample, factor, treatment, replicate, controlId into mappedReads
+
+  script:
+
+  if (pairedEnd) {
+    """
+    python $baseDir/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -p
    """
  }
  else {
    """
-    python $baseDir/scripts/check_design.py -f $reads
+    python $baseDir/scripts/map_reads.py -f ${reads[0]} -r ${index}/genome.fa
    """
  }


--- a/workflow/scripts/map_reads.py
+++ b/workflow/scripts/map_reads.py
+#!/usr/bin/env python3
+
+'''Align reads to reference genome.'''
+
+import os
+import subprocess
+import argparse
+import shutil
+import logging
+import sys
+from multiprocessing import cpu_count
+import json
+import utils
+
+EPILOG = '''
+For more details:
+        %(prog)s --help
+'''
+
+## SETTINGS
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+logger.propagate = False
+logger.setLevel(logging.INFO)
+
+# the order of this list is important.
+# strip_extensions strips from the right inward, so
+# the expected right-most extensions should appear first (like .gz)
+# Modified from J. Seth Strattan
+STRIP_EXTENSIONS = ['.gz', '.fq', '.fastq', '_trimmed']
+
+
+def get_args():
+    '''Define arguments.'''
+    parser = argparse.ArgumentParser(
+        description=__doc__, epilog=EPILOG,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+
+    parser.add_argument('-f', '--fastq',
+                        help="The fastq file to run triming on.",
+                        nargs='+',
+                        required=True)
+
+    parser.add_argument('-r', '--reference',
+                        help="The bwa index of the reference genome.",
+                        required=True)
+
+    parser.add_argument('-p', '--paired',
+                        help="True/False if paired-end or single end.",
+                        default=False,
+                        action='store_true')
+
+    args = parser.parse_args()
+    return args
+
+
+
+## Functions
+
+def strip_extensions(filename, extensions):
+    '''Strips extensions to get basename of file.'''
+
+    basename = filename
+    for extension in extensions:
+        basename = basename.rpartition(extension)[0] or basename
+
+    return basename
+
+
+def check_tools():
+    '''Checks for required componenets on user system'''
+
+    logger.info('Checking for required libraries and components on this system')
+
+    bwa_path = shutil.which("bwa")
+    if trimgalore_path:
+        logger.info('Found bwa: %s', bwa_path)
+    else:
+        logger.error('Missing bwa')
+        raise Exception('Missing bwa')
+
+    samtools_path = shutil.which("samtools")
+    if samtools_path:
+        logger.info('Found samtools: %s', samtools_path)
+    else:
+        logger.error('Missing samtools')
+        raise Exception('Missing samtools')
+
+
+def generate_sa(fastq, reference):
+    '''Use BWA to generate Suffix Arrays.'''
+
+    fastq_basename = os.path.basename(strip_extensions(fastq, STRIP_EXTENSIONS))
+
+    bwa_aln_params = '-q 5 -l 32 -k 2'
+
+    sai = '%s.sai' % (fastq_basename)
+    with open(sai, 'w') as sai_file:
+        bwa_command = "bwa aln %s -t %d %s %s" \
+                % (bwa_aln_params, cpu_count(),
+                   reference, fastq_basename)
+
+        logger.info("Running bwa with %s", bwa_command)
+        subprocess.check_call(shlex.split(bwa_command), stdout=sai_file)
+
+    return sai
+
+
+def align_se(fastq, sai, reference, fastq_basename):
+    '''Use BWA to align SE data.'''
+
+    sam_filename = "%s.sam" % (fastq_basename)
+
+    steps = [
+        "bwa samse %s %s %s"
+        % (reference, fastq[0], sai[0]),
+        "samtools view -@%d -Su -" % (cpu_count()),
+        "samtools sort -@%d -o %s"
+        % (cpu_count(), fastq_basename)]
+
+    out, err = utils.run_pipe(steps)
+    if err:
+        logger.error("samse/samtools error: %s" % (err))
+
+
+def align_pe(fastq, sai, reference, fastq_basename):
+    '''Use BWA to align PE data.'''
+
+    sam_filename = "%s.sam" % (fastq_basename)
+    badcigar_filename = "%s.badReads" % (fastq_basename)
+
+    # Remove read pairs with bad CIGAR strings and sort by position
+    steps = [
+            "bwa sampe -P %s %s %s %s %s"
+            % (reference, fastq[0], fastq[1],
+               sai[0], sai[1]),
+            "tee %s" % (sam_filename),
+            r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
+            "sort",
+            "uniq"]
+
+    out, err = utils.run_pipe(steps, badcigar_filename)
+    if err:
+        logger.error("sampe error: %s" % (err))
+
+    steps = [
+        "cat %s" % (sam_filename),
+        "grep -v -F -f %s" % (badcigar_filename),
+        "samtools view -@%d -Su -" % (cpu_count()),
+        "samtools sort -@%d -o %s"
+        % (cpu_count(), fastq_basename)]
+
+    out, err = utils.run_pipe(steps)
+    if err:
+        logger.error("samtools error: %s" % (err))
+
+
+def main():
+    args = get_args()
+
+    # Create a file handler
+    handler = logging.FileHandler('map.log')
+    logger.addHandler(handler)
+
+    # Check if tools are present
+    check_tools()
+
+    # Run Suffix Array generation
+    sai = []
+    for fastq in args.fastq:
+        sai_filename = generate_sa(fastq, args.reference)
+        sai.append(sai_filename)
+
+    # Run alignment for either PE or SE
+    if paired: # paired-end data
+        fastq_r1_basename = os.path.basename(
+            strip_extensions(fastq[0], STRIP_EXTENSIONS))
+        fastq_r2_basename = os.path.basename(
+            strip_extensions(fastq[1], STRIP_EXTENSIONS))
+        fastq_basename = fastq_r1_basename + fastq_r2_basename
+
+        align_pe(fastq, sai, fastq_basename)
+
+    else:
+        fastq_basename = os.path.basename(
+            strip_extensions(fastq[0], STRIP_EXTENSIONS))
+
+        align_se(fastq, sai, fastq_basename)
+
+    bam_mapstats_filename = '%s.raw.srt.bam.flagstat.qc' % (fastq_basename)
+    with open(raw_bam_mapstats_filename, 'w') as fh:
+        subprocess.check_call(
+            shlex.split("%s flagstat %s" % (samtools, bam_mapstats_filename)),
+            stdout=fh)
+
+    # Remove sai files
+    for sai_file in sai:
+        os.remove(sai_file)
+
+
+if __name__ == '__main__':
+    main()
--- a/workflow/scripts/utils.py
+++ b/workflow/scripts/utils.py
+#!/usr/bin/env python3
+
+'''General utilities.'''
+
+
+import sys
+import os
+import subprocess
+import shlex
+import logging
+
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+logger.propagate = True
+
+
+def run_pipe(steps, outfile=None):
+    # TODO:  capture stderr
+    from subprocess import Popen, PIPE
+    p = None
+    p_next = None
+    first_step_n = 1
+    last_step_n = len(steps)
+    for n, step in enumerate(steps, start=first_step_n):
+        logger.debug("step %d: %s" % (n, step))
+        if n == first_step_n:
+            if n == last_step_n and outfile:  # one-step pipeline with outfile
+                with open(outfile, 'w') as fh:
+                    print("one step shlex: %s to file: %s" % (shlex.split(step), outfile))
+                    p = Popen(shlex.split(step), stdout=fh)
+                break
+            print("first step shlex to stdout: %s" % (shlex.split(step)))
+
+            p = Popen(shlex.split(step), stdout=PIPE)
+        elif n == last_step_n and outfile:  # only treat the last step specially if you're sending stdout to a file
+            with open(outfile, 'w') as fh:
+                print("last step shlex: %s to file: %s" % (shlex.split(step), outfile))
+                p_last = Popen(shlex.split(step), stdin=p.stdout, stdout=fh)
+                p.stdout.close()
+                p = p_last
+        else:  # handles intermediate steps and, in the case of a pipe to stdout, the last step
+            print("intermediate step %d shlex to stdout: %s" % (n, shlex.split(step)))
+            p_next = Popen(shlex.split(step), stdin=p.stdout, stdout=PIPE)
+            p.stdout.close()
+            p = p_next
+    out, err = p.communicate()
+    return out, err