Merge branch '3-mapping-trimming' into 'master'

Resolve "Add mapping and trimming" Closes #3 See merge request !5

Merge branch '3-mapping-trimming' into 'master'
Resolve "Add mapping and trimming" Closes #3 See merge request !5
4357eb15 · Venkat Malladi · c9f0a2c4 · 034d2f58 · 4357eb15 · 4357eb15
Commit 4357eb15 authored 7 years ago by Venkat Malladi
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
 before_script:
  - module add  python/3.6.1-2-anaconda
  - pip install --user pytest-pythonpath
+  - module load nextflow/0.24.1-SNAPSHOT
+  - ln -s /project/shared/bicf_workflow_ref/workflow_testdata/chipseq/*fastq.gz test_data/

 test:
  script:
+  - nextflow run workflow/main.nf
  - pytest
--- a/astrocyte_pkg.yml
+++ b/astrocyte_pkg.yml
@@ -9,7 +9,7 @@
 # A unique identifier for the workflow package, text/underscores only
 name: 'chipseq_analysis_bicf'
 # Who wrote this?
-author: 'Beibei Chen'
+author: 'Beibei Chen and Venkat Malladi'
 # A contact email address for questions
 email: 'biohpc-help@utsouthwestern.edu'
 # A more informative title for the workflow package
@@ -17,8 +17,7 @@ title: 'BICF ChIP-seq Analysis Workflow'
 # A summary of the workflow package in plain text
 description: |
  This is a workflow package for the BioHPC/BICF ChIP-seq workflow system.
-  It implements a simple ChIP-seq analysis workflow and
-  visualization application.
+  It implements ChIP-seq analysis workflow and visualization application.

 # -----------------------------------------------------------------------------
 # DOCUMENTATION
@@ -40,9 +39,11 @@ documentation_files:
 # A list of cluster environment modules that this workflow requires to run.
 # Specify versioned module names to ensure reproducability.
 workflow_modules:
-  - 'fastqc/0.11.5'
-  - 'deeptools/2.3.5'
-  - 'meme/4.11.1-gcc-openmpi'
+  - 'python/3.6.1-2-anaconda'
+  - 'trimgalore/0.4.1'
+  - 'cutadapt/1.9.1'
+  - 'fastqc/0.11.2'
+  - 'bwa/intel/0.7.12'

 # A list of parameters used by the workflow, defining how to present them,
 # options etc in the web interface. For each parameter:
@@ -82,10 +83,11 @@ workflow_parameters:
    type: files
    required: true
    description: |
-      Fastq files of all  samples
-    regex: "*_{1,2}.fastq.gz"
+      One or more input FASTQ files from a ChIP-seq expereiment and a design
+      file with the link bewetwen the same file name and sample id
+    regex: ".*(fastq|fq)*"

-  - id: singleEnd
+  - id: pairedEnd
    type: select
    required: true
    choices:
@@ -98,6 +100,24 @@ workflow_parameters:
      read length, and then starts another round of reading from the opposite
      end of the fragment.

+  - id: design
+    type: file
+    required: true
+    description: |
+      A design file listing sample id, fastq files, corresponding control id
+      and additional information about the sample.
+
+  - id: genome
+    type: select
+    choices:
+      - [ 'GRCh38', 'Human GRCh38']
+      - [ 'GRCh37', 'Human GRCh37']
+      - [ 'GRCh38', 'Mouse GRCh38']
+    required: true
+    description: |
+      Reference species and genome used for alignment and subsequent analysis.
+
+
 # -----------------------------------------------------------------------------
 # SHINY APP CONFIGURATION
 # -----------------------------------------------------------------------------
@@ -114,7 +134,7 @@ vizapp_cran_packages:
  - shiny
  - shinyFiles

-# List of any Bioconductor packages, not provided by the modules, 
+# List of any Bioconductor packages, not provided by the modules,
 # that must be made available to the vizapp
 vizapp_bioc_packages:
  - qusage

--- a/nextflow.config
+++ b/nextflow.config
-process.executor='slurm'
-process.queue='super'
-process.time='5d'
-
-
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
+process {
+  executor = 'slurm'
+  queue='super'
+
+  // Process specific configuration
+  $checkDesignFile {
+    module = ['python/3.6.1-2-anaconda']
+    executor = 'local'
+  }
+  $trim_galore {
+    module = ['python/3.6.1-2-anaconda', 'trimgalore/0.4.1']
+    cpus = 32
+  }
+  $alignReads{
+    module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.12', 'samtools/intel/1.3']
+    cpus = 32
+  }
+}
+
+params {
+  // Reference file paths on BioHPC
+  genomes {
+    'GRCh38' { bwa = '/project/shared/bicf_workflow_ref/GRCh38' }
+    'GRCh37' { bwa = '/project/shared/bicf_workflow_ref/GRCh37' }
+    'GRCm38' { bwa = '/project/shared/bicf_workflow_ref/GRCm38' }
+  }
+}
+
+trace {
+    enabled = true
+    file = 'pipeline_trace.txt'
+    fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss'
+}
--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -7,6 +7,18 @@
 params.reads = "$baseDir/../test_data/*.fastq.gz"
 params.pairedEnd = false
 params.designFile = "$baseDir/../test_data/design_ENCSR238SGC_SE.txt"
+params.genome = 'GRCm38'
+params.genomes = []
+params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false
+
+// Check inputs
+if( params.bwaIndex ){
+  bwaIndex = Channel
+    .fromPath(params.bwaIndex)
+    .ifEmpty { exit 1, "BWA index not found: ${params.bwaIndex}" }
+} else {
+  exit 1, "No reference genome specified."
+}

 // Define List of Files
 readsList = Channel
@@ -36,7 +48,7 @@ process checkDesignFile {

  if (pairedEnd) {
    """
-    python $baseDir/scripts/check_design.py -d $designFile -f $readsList -p
+    python3 $baseDir/scripts/check_design.py -d $designFile -f $readsList -p
    """
  }
  else {
@@ -55,14 +67,14 @@ if (pairedEnd) {
 } else {
 rawReads = designFilePaths
  .splitCsv(sep: '\t', header: true)
-  .map { row -> [ row.sample_id, [row.fastq_read1, row.fastq_read1], row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] }
+  .map { row -> [ row.sample_id, [row.fastq_read1], row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] }
 }

-process fastQc {
+// Trim raw reads using trimgalore
+process trimReads {

  tag "$sampleId-$replicate"
-  publishDir "$baseDir/output/", mode: 'copy',
-    saveAs: {filename -> filename.indexOf(".zip") > 0 ? "zips/$filename" : "$filename"}
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'

  input:

@@ -70,11 +82,51 @@ process fastQc {

  output:

-  file '*_fastqc.{zip,html}' into fastqc_results
+  set sampleId, file('*.fq.gz'), biosample, factor, treatment, replicate, controlId into trimmedReads
+  file('*trimming_report.txt') into trimgalore_results

  script:

-  """
-  python $baseDir/scripts/qc_fastq.py -f $reads
-  """
+  if (pairedEnd) {
+    """
+    python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -p
+    """
+  }
+  else {
+    """
+    python3 $baseDir/scripts/trim_reads.py -f ${reads[0]}
+    """
+  }
+
+}
+
+// Align trimmed reads using bwa
+process alignReads {
+
+  tag "$sampleId-$replicate"
+  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+
+  input:
+
+  set sampleId, reads, biosample, factor, treatment, replicate, controlId from trimmedReads
+  file index from bwaIndex.first()
+
+  output:
+
+  set sampleId, file('*.bam'), biosample, factor, treatment, replicate, controlId into mappedReads
+  set file('*.srt.bam.flagstat.qc')
+
+  script:
+
+  if (pairedEnd) {
+    """
+    python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa -p
+    """
+  }
+  else {
+    """
+    python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa
+    """
+  }
+
 }
--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
+profiles {
+  standard {
+    includeConfig 'conf/biohpc.config'
+  }
+}
--- a/workflow/scripts/check_design.py
+++ b/workflow/scripts/check_design.py
@@ -11,7 +11,7 @@ For more details:
        %(prog)s --help
 '''

-## SETTINGS
+# SETTINGS

 logger = logging.getLogger(__name__)
 logger.addHandler(logging.NullHandler())
@@ -57,7 +57,7 @@ def check_design_headers(design, paired):

    design_headers = list(design.columns.values)

-    if paired: # paired-end data
+    if paired:  # paired-end data
        design_template.extend(['fastq_read2'])

    # Check if headers
@@ -79,7 +79,8 @@ def check_controls(design):

    if len(missing_controls) > 0:
        logger.error('Missing control experiments: %s', list(missing_controls))
-        raise Exception("Missing control experiments: %s" % list(missing_controls))
+        raise Exception("Missing control experiments: %s" %
+                        list(missing_controls))


 def check_files(design, fastq, paired):
@@ -87,9 +88,9 @@ def check_files(design, fastq, paired):

    logger.info("Running file check.")

-    if paired: # paired-end data
+    if paired:  # paired-end data
        files = list(design['fastq_read1']) + list(design['fastq_read2'])
-    else: # single-end data
+    else:  # single-end data
        files = design['fastq_read1']

    files_found = fastq['name']
@@ -98,13 +99,14 @@ def check_files(design, fastq, paired):

    if len(missing_files) > 0:
        logger.error('Missing files from design file: %s', list(missing_files))
-        raise Exception("Missing files from design file: %s" % list(missing_files))
+        raise Exception("Missing files from design file: %s" %
+                        list(missing_files))
    else:
        file_dict = fastq.set_index('name').T.to_dict()

        design['fastq_read1'] = design['fastq_read1'] \
                                .apply(lambda x: file_dict[x]['path'])
-        if paired: # paired-end data
+        if paired:  # paired-end data
            design['fastq_read2'] = design['fastq_read2'] \
                                    .apply(lambda x: file_dict[x]['path'])
    return design

--- a/workflow/scripts/map_reads.py
+++ b/workflow/scripts/map_reads.py
+#!/usr/bin/env python3
+
+'''Align reads to reference genome.'''
+
+import os
+import subprocess
+import argparse
+import shutil
+import shlex
+import logging
+from multiprocessing import cpu_count
+import json
+import utils
+
+EPILOG = '''
+For more details:
+        %(prog)s --help
+'''
+
+# SETTINGS
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+logger.propagate = False
+logger.setLevel(logging.INFO)
+
+# the order of this list is important.
+# strip_extensions strips from the right inward, so
+# the expected right-most extensions should appear first (like .gz)
+# Modified from J. Seth Strattan
+STRIP_EXTENSIONS = ['.gz', '.fq', '.fastq', '_trimmed']
+
+
+def get_args():
+    '''Define arguments.'''
+    parser = argparse.ArgumentParser(
+        description=__doc__, epilog=EPILOG,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+
+    parser.add_argument('-f', '--fastq',
+                        help="The fastq file to run triming on.",
+                        nargs='+',
+                        required=True)
+
+    parser.add_argument('-r', '--reference',
+                        help="The bwa index of the reference genome.",
+                        required=True)
+
+    parser.add_argument('-p', '--paired',
+                        help="True/False if paired-end or single end.",
+                        default=False,
+                        action='store_true')
+
+    args = parser.parse_args()
+    return args
+
+
+# Functions
+
+
+def check_tools():
+    '''Checks for required componenets on user system'''
+
+    logger.info('Checking for required libraries and components on this system')
+
+    bwa_path = shutil.which("bwa")
+    if bwa_path:
+        logger.info('Found bwa: %s', bwa_path)
+    else:
+        logger.error('Missing bwa')
+        raise Exception('Missing bwa')
+
+    samtools_path = shutil.which("samtools")
+    if samtools_path:
+        logger.info('Found samtools: %s', samtools_path)
+    else:
+        logger.error('Missing samtools')
+        raise Exception('Missing samtools')
+
+
+def generate_sa(fastq, reference):
+    '''Use BWA to generate Suffix Arrays.'''
+
+    fastq_basename = os.path.basename(utils.strip_extensions(fastq, STRIP_EXTENSIONS))
+
+    bwa_aln_params = '-q 5 -l 32 -k 2'
+
+    sai = '%s.sai' % (fastq_basename)
+    with open(sai, 'w') as sai_file:
+        bwa_command = "bwa aln %s -t %d %s %s" \
+                % (bwa_aln_params, cpu_count(),
+                   reference, fastq)
+
+        logger.info("Running bwa with %s", bwa_command)
+        subprocess.check_call(shlex.split(bwa_command), stdout=sai_file)
+
+    return sai
+
+
+def align_se(fastq, sai, reference, fastq_basename):
+    '''Use BWA to align SE data.'''
+
+    bam_filename = '%s.srt.bam' % (fastq_basename)
+
+    steps = [
+        "bwa samse %s %s %s"
+        % (reference, sai[0], fastq[0]),
+        "samtools view -@%d -Su -" % (cpu_count()),
+        "samtools sort -@%d -o %s"
+        % (cpu_count(), bam_filename)]
+
+    out, err = utils.run_pipe(steps)
+    if err:
+        logger.error("samse/samtools error: %s" % (err))
+
+    return bam_filename
+
+
+def align_pe(fastq, sai, reference, fastq_basename):
+    '''Use BWA to align PE data.'''
+
+    sam_filename = "%s.sam" % (fastq_basename)
+    badcigar_filename = "%s.badReads" % (fastq_basename)
+    bam_filename = '%s.srt.bam' % (fastq_basename)
+
+    # Remove read pairs with bad CIGAR strings and sort by position
+    steps = [
+            "bwa sampe -P %s %s %s %s %s"
+            % (reference, sai[0], sai[1],
+               fastq[0], fastq[1]),
+            "tee %s" % (sam_filename),
+            r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
+            "sort",
+            "uniq"]
+
+    out, err = utils.run_pipe(steps, badcigar_filename)
+    if err:
+        logger.error("sampe error: %s" % (err))
+
+    steps = [
+        "cat %s" % (sam_filename),
+        "grep -v -F -f %s" % (badcigar_filename),
+        "samtools view -@%d -Su -" % (cpu_count()),
+        "samtools sort -@%d -o %s"
+        % (cpu_count(), bam_filename)]
+
+    out, err = utils.run_pipe(steps)
+    if err:
+        logger.error("samtools error: %s" % (err))
+
+    return bam_filename
+
+
+def main():
+    args = get_args()
+    paired = args.paired
+    fastq = args.fastq
+    reference = args.reference
+
+    # Create a file handler
+    handler = logging.FileHandler('map.log')
+    logger.addHandler(handler)
+
+    # Check if tools are present
+    check_tools()
+
+    # Run Suffix Array generation
+    sai = []
+    for fq in fastq:
+        sai_filename = generate_sa(fq, reference)
+        sai.append(sai_filename)
+
+    # Run alignment for either PE or SE
+    if paired:  # paired-end data
+        fastq_r1_basename = os.path.basename(
+            utils.strip_extensions(fastq[0], STRIP_EXTENSIONS))
+        fastq_r2_basename = os.path.basename(
+            utils.strip_extensions(fastq[1], STRIP_EXTENSIONS))
+        fastq_basename = fastq_r1_basename + fastq_r2_basename
+
+        bam_filename = align_pe(fastq, sai, reference, fastq_basename)
+
+    else:
+        fastq_basename = os.path.basename(
+            utils.strip_extensions(fastq[0], STRIP_EXTENSIONS))
+
+        bam_filename = align_se(fastq, sai, reference, fastq_basename)
+
+    bam_mapstats_filename = '%s.srt.bam.flagstat.qc' % (fastq_basename)
+    with open(bam_mapstats_filename, 'w') as fh:
+        subprocess.check_call(
+            shlex.split("samtools flagstat %s" % (bam_filename)),
+            stdout=fh)
+
+    # Remove sai files
+    for sai_file in sai:
+        os.remove(sai_file)
+
+
+if __name__ == '__main__':
+    main()
--- a/workflow/scripts/qc_fastq.py
+++ b/workflow/scripts/qc_fastq.py
@@ -66,7 +66,7 @@ def main():

    # Create a file handler
    handler = logging.FileHandler('qc.log')
-    LOGGER.addHandler(handler)
+    logger.addHandler(handler)

    # Check if tools are present
    check_tools()

--- a/workflow/scripts/trim_reads.py
+++ b/workflow/scripts/trim_reads.py
+#!/usr/bin/env python3
+
+'''Trim low quality reads and remove sequences less than 35 base pairs.'''
+
+import subprocess
+import argparse
+import shutil
+import logging
+
+EPILOG = '''
+For more details:
+        %(prog)s --help
+'''
+
+# SETTINGS
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+logger.propagate = False
+logger.setLevel(logging.INFO)
+
+
+def get_args():
+    '''Define arguments.'''
+    parser = argparse.ArgumentParser(
+        description=__doc__, epilog=EPILOG,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+
+    parser.add_argument('-f', '--fastq',
+                        help="The fastq file to run triming on.",
+                        nargs='+',
+                        required=True)
+
+    parser.add_argument('-p', '--paired',
+                        help="True/False if paired-end or single end.",
+                        default=False,
+                        action='store_true')
+
+    args = parser.parse_args()
+    return args
+
+
+def check_tools():
+    '''Checks for required componenets on user system.'''
+
+    logger.info('Checking for required libraries and components on this system')
+
+    trimgalore_path = shutil.which("trim_galore")
+    if trimgalore_path:
+        logger.info('Found trimgalore: %s', trimgalore_path)
+    else:
+        logger.error('Missing trimgalore')
+        raise Exception('Missing trimgalore')
+
+    cutadapt_path = shutil.which("cutadapt")
+    if cutadapt_path:
+        logger.info('Found cutadapt: %s', cutadapt_path)
+    else:
+        logger.error('Missing cutadapt')
+        raise Exception('Missing cutadapt')
+
+
+def trim_reads(fastq, paired):
+    '''Run trim_galore on 1 or 2 files.'''
+
+    if paired:  # paired-end data
+        trim_params = '--paired -q 25 --illumina --gzip --length 35'
+        trim_command = "trim_galore %s %s %s " \
+                    % (trim_params, fastq[0], fastq[1])
+    else:
+        trim_params = '-q 25 --illumina --gzip --length 35'
+        trim_command = "trim_galore %s %s " \
+                    % (trim_params, fastq[0])
+
+    logger.info("Running trim_galore with %s", trim_command)
+
+    trim = subprocess.Popen(trim_command, shell=True)
+    out, err = trim.communicate()
+
+
+def main():
+    args = get_args()
+
+    # Create a file handler
+    handler = logging.FileHandler('trim.log')
+    logger.addHandler(handler)
+
+    # Check if tools are present
+    check_tools()
+
+    # Run trim_reads
+    trim_reads(args.fastq, args.paired)
+
+
+if __name__ == '__main__':
+    main()
--- a/workflow/scripts/utils.py
+++ b/workflow/scripts/utils.py
+#!/usr/bin/env python3
+
+'''General utilities.'''
+
+
+import shlex
+import logging
+
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+logger.propagate = True
+
+
+def run_pipe(steps, outfile=None):
+    # TODO:  capture stderr
+    from subprocess import Popen, PIPE
+    p = None
+    p_next = None
+    first_step_n = 1
+    last_step_n = len(steps)
+    for n, step in enumerate(steps, start=first_step_n):
+        logger.debug("step %d: %s" % (n, step))
+        if n == first_step_n:
+            if n == last_step_n and outfile:  # one-step pipeline with outfile
+                with open(outfile, 'w') as fh:
+                    print("one step shlex: %s to file: %s" % (shlex.split(step), outfile))
+                    p = Popen(shlex.split(step), stdout=fh)
+                break
+            print("first step shlex to stdout: %s" % (shlex.split(step)))
+
+            p = Popen(shlex.split(step), stdout=PIPE)
+        elif n == last_step_n and outfile:  # only treat the last step specially if you're sending stdout to a file
+            with open(outfile, 'w') as fh:
+                print("last step shlex: %s to file: %s" % (shlex.split(step), outfile))
+                p_last = Popen(shlex.split(step), stdin=p.stdout, stdout=fh)
+                p.stdout.close()
+                p = p_last
+        else:  # handles intermediate steps and, in the case of a pipe to stdout, the last step
+            print("intermediate step %d shlex to stdout: %s" % (n, shlex.split(step)))
+            p_next = Popen(shlex.split(step), stdin=p.stdout, stdout=PIPE)
+            p.stdout.close()
+            p = p_next
+    out, err = p.communicate()
+    return out, err
+
+
+def strip_extensions(filename, extensions):
+    '''Strips extensions to get basename of file.'''
+
+    basename = filename
+    for extension in extensions:
+        basename = basename.rpartition(extension)[0] or basename
+
+    return basename
--- a/workflow/tests/test_check_design.py
+++ b/workflow/tests/test_check_design.py
 #!/usr/bin/env python3

-import os
 import pytest
 import pandas as pd
 from io import StringIO
 import check_design
-import sys


 DESIGN_STRING = """sample_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\tfastq_read1
@@ -49,11 +47,12 @@ def design_2(design):
    design_df = design.drop(design.index[2])
    return design_df

+
 @pytest.fixture
 def design_3(design):
    # Drop A_2 and B_2 and append as fastq_read2
-    design_df = design.drop(design.index[[1,3]])
-    design_df['fastq_read2'] = design.loc[[1,3],'fastq_read1'].values
+    design_df = design.drop(design.index[[1, 3]])
+    design_df['fastq_read2'] = design.loc[[1, 3], 'fastq_read1'].values
    return design_df


@@ -94,10 +93,10 @@ def test_check_files_missing_files(design, fastq_files_1):
 def test_check_files_output_singleend(design, fastq_files):
    paired = False
    new_design = check_design.check_files(design, fastq_files, paired)
-    assert new_design.loc[0,'fastq_read1'] == "/path/to/file/A_1.fastq.gz"
+    assert new_design.loc[0, 'fastq_read1'] == "/path/to/file/A_1.fastq.gz"


 def test_check_files_output_pairedend(design_3, fastq_files):
    paired = True
    new_design = check_design.check_files(design_3, fastq_files, paired)
-    assert new_design.loc[0,'fastq_read2'] == "/path/to/file/A_2.fastq.gz"
+    assert new_design.loc[0, 'fastq_read2'] == "/path/to/file/A_2.fastq.gz"
--- a/workflow/tests/test_map_reads.py
+++ b/workflow/tests/test_map_reads.py
+#!/usr/bin/env python3
+
+import pytest
+import os
+
+
+def test_map_reads_singleend():
+    aligned_reads_report = os.path.dirname(os.path.abspath(__file__)) + \
+                '/../output/alignReads/ENCFF646LXU.srt.bam.flagstat.qc'
+    samtools_report = open(aligned_reads_report).readlines()
+    assert '80795025 + 0 in total' in samtools_report[0]
+    assert '80050072 + 0 mapped (99.08% : N/A)' in samtools_report[4]
+
+
+def test_map_reads_pairedend():
+    # Do the same thing for paired end data
+    pass
--- a/workflow/tests/test_trim_reads.py
+++ b/workflow/tests/test_trim_reads.py
+#!/usr/bin/env python3
+
+import pytest
+import os
+
+
+def test_trim_reads_singleend():
+    raw_fastq = os.path.dirname(os.path.abspath(__file__)) + \
+                '/../../test_data/ENCFF833BLU.fastq.gz'
+    trimmed_fastq = os.path.dirname(os.path.abspath(__file__)) + \
+                '/../output/trimReads/ENCFF833BLU_trimmed.fq.gz'
+    trimmed_fastq_report = os.path.dirname(os.path.abspath(__file__)) + \
+                '/../output/trimReads/ENCFF833BLU.fastq.gz_trimming_report.txt'
+    assert os.path.getsize(raw_fastq) != os.path.getsize(trimmed_fastq)
+    assert os.path.getsize(trimmed_fastq) == 2512853101
+    assert 'Trimming mode: single-end' in open(trimmed_fastq_report).readlines()[4]
+
+
+def test_trim_reads_pairedend():
+    # Do the same thing for paired end data
+    pass
--- a/workflow/tests/test_utils.py
+++ b/workflow/tests/test_utils.py
+#!/usr/bin/env python3
+
+import pytest
+import utils
+
+
+STRIP_EXTENSIONS = ['.gz', '.fq', '.fastq', '.fa', '.fasta']
+
+
+@pytest.fixture
+def steps():
+    steps = []
+    return steps
+
+
+@pytest.fixture
+def steps_1(steps):
+    design_file = "test_data/design_ENCSR238SGC_SE.txt"
+    step = [
+        "grep H3K4me1 %s " % (design_file)]
+    return step
+
+
+@pytest.fixture
+def steps_2(steps_1):
+    steps_1.extend([
+        "cut -f7"
+    ])
+    return steps_1
+
+
+def test_run_one_step(steps_1, capsys):
+    check_output = 'ENCSR238SGC\tlimb\tH3K4me1\tNone\t1\tENCSR687ALB\tENCFF833BLU.fastq.gz'.encode('UTF-8')
+    out, err = utils.run_pipe(steps_1)
+    output, errors = capsys.readouterr()
+    assert "first step shlex to stdout" in output
+    assert check_output in out
+
+
+def test_run_two_step(steps_2, capsys):
+    check_output = 'ENCFF833BLU.fastq.gz\nENCFF646LXU.fastq.gz'.encode('UTF-8')
+    out, err = utils.run_pipe(steps_2)
+    output, errors = capsys.readouterr()
+    assert "intermediate step 2 shlex to stdout" in output
+    assert check_output in out
+
+
+def test_run_last_step_file(steps_2, capsys, tmpdir):
+    check_output = 'ENCFF833BLU.fastq.gz\nENCFF646LXU.fastq.gz'
+    tmp_outfile = tmpdir.join('output.txt')
+    out, err = utils.run_pipe(steps_2, tmp_outfile.strpath)
+    output, errors = capsys.readouterr()
+    assert "last step shlex" in output
+    assert check_output in tmp_outfile.read()
+
+
+def test_strip_extensions():
+    filename = utils.strip_extensions('ENCFF833BLU.fastq.gz', STRIP_EXTENSIONS)
+    assert filename == 'ENCFF833BLU'
+
+
+def test_strip_extensions_not_valid():
+    filename = utils.strip_extensions('ENCFF833BLU.not.valid', STRIP_EXTENSIONS)
+    assert filename == 'ENCFF833BLU.not.valid'
+
+
+def test_strip_extensions_missing_basename():
+    filename = utils.strip_extensions('.fastq.gz', STRIP_EXTENSIONS)
+    assert filename == '.fastq'