Merge branch '1-fastqc' into 'master'

Resolve "Add in Fastqc" Closes #1 See merge request !4

Merge branch '1-fastqc' into 'master'
Resolve "Add in Fastqc" Closes #1 See merge request !4
c9f0a2c4 · Venkat Malladi · 2728fc7e · cb70f489 · c9f0a2c4 · c9f0a2c4
Commit c9f0a2c4 authored 7 years ago by Venkat Malladi
--- a/.gitignore
+++ b/.gitignore
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
+before_script:
+  - module add  python/3.6.1-2-anaconda
+  - pip install --user pytest-pythonpath
+
 test:
  script:
-    - module add  python/3.6.1-2-anaconda
+  - pytest
--- a/astrocyte_pkg.yml
+++ b/astrocyte_pkg.yml
@@ -17,7 +17,8 @@ title: 'BICF ChIP-seq Analysis Workflow'
 # A summary of the workflow package in plain text
 description: |
  This is a workflow package for the BioHPC/BICF ChIP-seq workflow system.
-  It implements a simple ChIP-seq analysis workflow using deepTools, Diffbind, ChipSeeker and MEME-ChIP, visualization application.
+  It implements a simple ChIP-seq analysis workflow and
+  visualization application.

 # -----------------------------------------------------------------------------
 # DOCUMENTATION
@@ -33,12 +34,13 @@ documentation_files:
 # NEXTFLOW WORKFLOW CONFIGURATION
 # -----------------------------------------------------------------------------

-# Remember - The workflow file is always named 'workflow/main.f'
+# Remember - The workflow file is always named 'workflow/main.nf'
 #            The workflow must publish all final output into $baseDir

-# A list of clueter environment modules that this workflow requires to run.
+# A list of cluster environment modules that this workflow requires to run.
 # Specify versioned module names to ensure reproducability.
 workflow_modules:
+  - 'fastqc/0.11.5'
  - 'deeptools/2.3.5'
  - 'meme/4.11.1-gcc-openmpi'

@@ -76,45 +78,25 @@ workflow_modules:

 workflow_parameters:

-  - id: bams
+  - id: reads
    type: files
    required: true
    description: |
-      Bam files of all samples
-    regex: ".*(bam|BAM)"
+      Fastq files of all  samples
+    regex: "*_{1,2}.fastq.gz"

-  - id: peaks
-    type: files
-    required: true
-    description: |
-      Peak files of all samples. Peaks should be sorted by user using either p_value or intensity of the signals.Bed format.
-    regex: ".*(narrowPeak|broadPeak|bed|BED)"
-
-
-  - id: design
-    type: files
-    required: true
-    regex: ".*(csv)"
-    description: |
-      A design file listing pairs of sample name and sample group. Must be in csv format
-      Columns must include: SampleID,Tissue, Factor, Condition, Replicate, Peaks, bamReads, bamControl, ControlID, PeakCaller
-
-  - id: genomepath
+  - id: singleEnd
    type: select
-    choices:
-      - [ '/project/shared/bicf_workflow_ref/GRCh38', 'human GRCh38']
-      - [ '/project/shared/bicf_workflow_ref/GRCh37', 'human GRCh37']
-      - [ '/project/shared/bicf_workflow_ref/GRCm38', 'mouse GRCm38']
-    required: true
-    description: |
-      Reference genome for annotation
-
-  - id: toppeakcount
-    type: integer
    required: true
-    default: -1
+    choices:
+      - [ 'true', 'True']
+      - [ 'false', 'False']
    description: |
-      The number of top peaks to use for motif discovery. This program will nott sort peak BED files for you, so please make sure your peak files are already sorted.If want all peaks to be used, use -1.
+      In single-end sequencing, the sequencer reads a fragment from only one
+      end to the other, generating the sequence of base pairs. In paired-end
+      reading it starts at one read, finishes this direction at the specified
+      read length, and then starts another round of reading from the opposite
+      end of the fragment.

 # -----------------------------------------------------------------------------
 # SHINY APP CONFIGURATION
@@ -132,11 +114,10 @@ vizapp_cran_packages:
  - shiny
  - shinyFiles

-# # List of any Bioconductor packages, not provided by the modules, that must be made
-# available to the vizapp
+# List of any Bioconductor packages, not provided by the modules, 
+# that must be made available to the vizapp
 vizapp_bioc_packages:
  - qusage
 #  - ballgown
 vizapp_github_packages:
  - js229/Vennerable
-
--- a/index.md
+++ b/index.md
-this is a read me
--- a/pytest.ini
+++ b/pytest.ini
+[pytest]
+python_paths = workflow/scripts
--- a/test_data/design_ENCSR238SGC_SE.txt
+++ b/test_data/design_ENCSR238SGC_SE.txt
+sample_id	biosample	factor	treatment	replicate	control_id	fastq_read1
+ENCSR238SGC	limb	H3K4me1	None	1	ENCSR687ALB	ENCFF833BLU.fastq.gz
+ENCSR238SGC	limb	H3K4me1	None	2	ENCSR687ALB	ENCFF646LXU.fastq.gz
+ENCSR687ALB	limb	Control	None	1	ENCSR687ALB	ENCFF524CAC.fastq.gz
+ENCSR687ALB	limb	Control	None	2	ENCSR687ALB	ENCFF163AJI.fastq.gz
--- a/test_data/design_ENCSR729LGA_PE.txt
+++ b/test_data/design_ENCSR729LGA_PE.txt
+sample_id	biosample	factor	treatment	replicate	control_id	fastq_read1	fastq_read2
+ENCSR729LGA	MCF-7	SP1	None	1	ENCSR217LRF	ENCFF957SQS.fastq.gz	ENCFF582IOZ.fastq.gz
+ENCSR729LGA	MCF-7	SP1	None	2	ENCSR217LRF	ENCFF330MCZ.fastq.gz	ENCFF293YFE.fastq.gz
+ENCSR217LRF	MCF-7	Control	None	1	ENCSR217LRF	ENCFF002DTU.fastq.gz	ENCFF002EFI.fastq.gz
+ENCSR217LRF	MCF-7	Control	None	1	ENCSR217LRF	ENCFF002EFG.fastq.gz	ENCFF002DTS.fastq.gz
--- a/test_data/fetch_test_data.sh
+++ b/test_data/fetch_test_data.sh
+echo "Downloading Single-end data set Mouse ENCSR238SGC and ENCSR687ALB"
+wget https://www.encodeproject.org/files/ENCFF833BLU/@@download/ENCFF833BLU.fastq.gz
+wget https://www.encodeproject.org/files/ENCFF646LXU/@@download/ENCFF646LXU.fastq.gz
+wget https://www.encodeproject.org/files/ENCFF524CAC/@@download/ENCFF524CAC.fastq.gz
+wget https://www.encodeproject.org/files/ENCFF163AJI/@@download/ENCFF163AJI.fastq.gz
+echo "Done with Single-end"
+
+echo "Downloading Paired-end data set Human ENCSR729LGA and ENCSR217LRF"
+wget https://www.encodeproject.org/files/ENCFF957SQS/@@download/ENCFF957SQS.fastq.gz
+wget https://www.encodeproject.org/files/ENCFF582IOZ/@@download/ENCFF582IOZ.fastq.gz
+wget https://www.encodeproject.org/files/ENCFF330MCZ/@@download/ENCFF330MCZ.fastq.gz
+wget https://www.encodeproject.org/files/ENCFF293YFE/@@download/ENCFF293YFE.fastq.gz
+wget https://www.encodeproject.org/files/ENCFF002DTU/@@download/ENCFF002DTU.fastq.gz
+wget https://www.encodeproject.org/files/ENCFF002EFI/@@download/ENCFF002EFI.fastq.gz
+wget https://www.encodeproject.org/files/ENCFF002EFG/@@download/ENCFF002EFG.fastq.gz
+wget https://www.encodeproject.org/files/ENCFF002DTS/@@download/ENCFF002DTS.fastq.gz
+echo "Done with Paired-end"
--- a/workflow/main.nf
+++ b/workflow/main.nf
 #!/usr/bin/env nextflow
-   params.design="$baseDir/../test_data/samplesheet.csv"
-   params.bams = "$baseDir/../test_data/*.bam"
-   params.peaks = "$baseDir/../test_data/*.broadPeak"
-   params.genomepath="/project/shared/bicf_workflow_ref/GRCh37"
-   toppeakcount = -1
-   design_file = file(params.design)
-   deeptools_design = Channel.fromPath(params.design)
-   diffbind_design = Channel.fromPath(params.design)
-   chipseeker_design = Channel.fromPath(params.design)
-   meme_design = Channel.fromPath(params.design)
-   index_bams = Channel.fromPath(params.bams)
-   deeptools_bams = Channel.fromPath(params.bams) 
-   deeptools_peaks = Channel.fromPath(params.peaks) 
-   chipseeker_peaks = Channel.fromPath(params.peaks) 
-   diffbind_bams = Channel.fromPath(params.bams) 
-   diffbind_peaks = Channel.fromPath(params.peaks) 
-   meme_peaks = Channel.fromPath(params.peaks)
-
-process bamindex {
-   publishDir "$baseDir/output/", mode: 'copy'
-   input:
-     file index_bam_files from index_bams
-   output:
-     file "*bai" into deeptools_bamindex
-     file "*bai" into diffbind_bamindex
-
-   script:
-     """
-     module load python/2.7.x-anaconda
-     module load R/3.3.2-gccmkl
-     module load samtools/intel/1.3
-     samtools index $index_bam_files
-     """
-}

-process run_deeptools {
-   publishDir "$baseDir/output", mode: 'copy'
-   input:
-     file deeptools_design_file from deeptools_design
-     file deeptools_bam_files from deeptools_bams.toList()
-     file deeptools_peak_files from deeptools_peaks.toList()
-     file deeptools_bam_indexes from deeptools_bamindex.toList()
-   output:
-     file "*deeptools*" into deeptools_output
-     script:
-     """
-     module load python/2.7.x-anaconda
-     module load R/3.3.2-gccmkl
-     module load deeptools/2.3.5
-     python $baseDir/scripts/runDeepTools.py -i ${params.design} -g ${params.genomepath}}
-"""
-}
+// Path to an input file, or a pattern for multiple inputs
+// Note - $baseDir is the location of this workflow file main.nf

+// Define Input variables
+params.reads = "$baseDir/../test_data/*.fastq.gz"
+params.pairedEnd = false
+params.designFile = "$baseDir/../test_data/design_ENCSR238SGC_SE.txt"

-process run_diffbind {
-   publishDir "$baseDir/output", mode: 'copy'
-   input:
-     file diffbind_design_file from diffbind_design
-     file diffbind_bam_files from diffbind_bams.toList()
-     file diffbind_peak_files from diffbind_peaks.toList()
-     file diffbind_bam_indexes from diffbind_bamindex.toList()
-   output:
-     file "diffpeak.design" into diffpeaksdesign_chipseeker
-     file "diffpeak.design" into diffpeaksdesign_meme
-     file "*_diffbind.bed" into diffpeaks_meme
-     file "*_diffbind.bed" into diffpeaks_chipseeker
-   script:
-     """
-     module load python/2.7.x-anaconda
-     module load R/3.3.2-gccmkl
-     Rscript $baseDir/scripts/runDiffBind.R $diffbind_design_file
-"""
-}
+// Define List of Files
+readsList = Channel
+  .fromPath( params.reads )
+  .flatten()
+  .map { file -> [ file.getFileName().toString(), file.toString() ].join("\t")}
+  .collectFile( name: 'fileList.tsv', newLine: true )

-process run_chipseeker_diffpeak {
-   publishDir "$baseDir/output", mode: 'copy'
-   input:
-     file diffpeak_design_file from diffpeaksdesign_chipseeker
-     file diffpeaks from diffpeaks_chipseeker
-   output:
-     file "*chipseeker*" into chipseeker_diffpeak_output
-   script:
-     """
-     module load python/2.7.x-anaconda
-     module load R/3.3.2-gccmkl
-     Rscript $baseDir/scripts/runChipseeker.R $diffpeak_design_file ${params.genomepath}
-"""
-}
+// Define regular variables
+pairedEnd = params.pairedEnd
+designFile = params.designFile

-process run_chipseeker_originalpeak {
-   publishDir "$baseDir/output", mode: 'copy'
-   input:
-     file design_file from chipseeker_design
-     file chipseeker_peak_files from chipseeker_peaks.toList()
-   output:
-     file "*chipseeker*" into chipseeker_originalpeak_output
-   script:
-     """
-     module load python/2.7.x-anaconda
-     module load R/3.3.2-gccmkl
-     Rscript $baseDir/scripts/runChipseeker.R $design_file ${params.genomepath}
-"""
-}
+process checkDesignFile {
+
+  publishDir "$baseDir/output/design", mode: 'copy'
+
+  input:
+
+  designFile
+  file readsList
+
+  output:
+
+  file("design.tsv") into designFilePaths
+
+  script:
+
+  if (pairedEnd) {
+    """
+    python $baseDir/scripts/check_design.py -d $designFile -f $readsList -p
+    """
+  }
+  else {
+    """
+    python $baseDir/scripts/check_design.py -d $designFile -f $readsList
+    """
+  }

-process run_meme_original {
-   publishDir "$baseDir/output", mode: 'copy'
-   input:
-     file design_meme from meme_design
-     file meme_peak_files from meme_peaks.toList()
-   output:
-     file "*meme*" into meme_original_output
-   script:
-     """
-     module load python/2.7.x-anaconda
-     module load R/3.3.2-gccmkl
-     module add deeptools/2.3.5
-     module load meme/4.11.1-gcc-openmpi
-     python $baseDir/scripts/runMemechip.py -i $design_meme -g ${params.genomepath} -l ${toppeakcount}
-"""
 }

-process run_meme_diffpeak {
-   publishDir "$baseDir/output", mode: 'copy'
-   input:
-     file peaks_meme from diffpeaks_meme
-     file diffpeak_design from diffpeaksdesign_meme
-   output:
-     file "*meme*" into meme_diffpeak_output
-   script:
-     """
-     module load python/2.7.x-anaconda
-     module load R/3.3.2-gccmkl
-     module add deeptools/2.3.5
-     module load meme/4.11.1-gcc-openmpi
-     python $baseDir/scripts/runMemechip.py -i $diffpeak_design -g ${params.genomepath} -l ${toppeakcount}
-"""
+// Define channel for raw reads
+if (pairedEnd) {
+  rawReads = designFilePaths
+    .splitCsv(sep: '\t', header: true)
+    .map { row -> [ row.sample_id, [row.fastq_read1, row.fastq_read2], row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] }
+} else {
+rawReads = designFilePaths
+  .splitCsv(sep: '\t', header: true)
+  .map { row -> [ row.sample_id, [row.fastq_read1, row.fastq_read1], row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] }
 }

+process fastQc {
+
+  tag "$sampleId-$replicate"
+  publishDir "$baseDir/output/", mode: 'copy',
+    saveAs: {filename -> filename.indexOf(".zip") > 0 ? "zips/$filename" : "$filename"}
+
+  input:
+
+  set sampleId, reads, biosample, factor, treatment, replicate, controlId from rawReads
+
+  output:
+
+  file '*_fastqc.{zip,html}' into fastqc_results
+
+  script:
+
+  """
+  python $baseDir/scripts/qc_fastq.py -f $reads
+  """
+}
--- a/workflow/scripts/__init__.py
+++ b/workflow/scripts/__init__.py
-
--- a/workflow/scripts/check_design.py
+++ b/workflow/scripts/check_design.py
+#!/usr/bin/env python3
+
+'''Check if design file is correctly formatted and matches files list.'''
+
+import argparse
+import logging
+import pandas as pd
+
+EPILOG = '''
+For more details:
+        %(prog)s --help
+'''
+
+## SETTINGS
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+logger.propagate = False
+logger.setLevel(logging.INFO)
+
+
+def get_args():
+    '''Define arguments.'''
+    parser = argparse.ArgumentParser(
+        description=__doc__, epilog=EPILOG,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+
+    parser.add_argument('-d', '--design',
+                        help="The design file to run QC (TSV format).",
+                        required=True)
+
+    parser.add_argument('-f', '--fastq',
+                        help="File with list of fastq files (csv format).",
+                        required=True)
+
+    parser.add_argument('-p', '--paired',
+                        help="True/False if paired-end or single end.",
+                        default=False,
+                        action='store_true')
+
+    args = parser.parse_args()
+    return args
+
+
+def check_design_headers(design, paired):
+    '''Check if design file conforms to sequencing type.'''
+
+    # Default headers
+    design_template = [
+        'sample_id',
+        'biosample',
+        'factor',
+        'treatment',
+        'replicate',
+        'control_id',
+        'fastq_read1']
+
+    design_headers = list(design.columns.values)
+
+    if paired: # paired-end data
+        design_template.extend(['fastq_read2'])
+
+    # Check if headers
+    logger.info("Running header check.")
+
+    missing_headers = set(design_template) - set(design_headers)
+
+    if len(missing_headers) > 0:
+        logger.error('Missing column headers: %s', list(missing_headers))
+        raise Exception("Missing column headers: %s" % list(missing_headers))
+
+
+def check_controls(design):
+    '''Check if design file has the correct control mapping.'''
+
+    logger.info("Running control check.")
+
+    missing_controls = set(design['control_id']) - set(design['sample_id'])
+
+    if len(missing_controls) > 0:
+        logger.error('Missing control experiments: %s', list(missing_controls))
+        raise Exception("Missing control experiments: %s" % list(missing_controls))
+
+
+def check_files(design, fastq, paired):
+    '''Check if design file has the files found.'''
+
+    logger.info("Running file check.")
+
+    if paired: # paired-end data
+        files = list(design['fastq_read1']) + list(design['fastq_read2'])
+    else: # single-end data
+        files = design['fastq_read1']
+
+    files_found = fastq['name']
+
+    missing_files = set(files) - set(files_found)
+
+    if len(missing_files) > 0:
+        logger.error('Missing files from design file: %s', list(missing_files))
+        raise Exception("Missing files from design file: %s" % list(missing_files))
+    else:
+        file_dict = fastq.set_index('name').T.to_dict()
+
+        design['fastq_read1'] = design['fastq_read1'] \
+                                .apply(lambda x: file_dict[x]['path'])
+        if paired: # paired-end data
+            design['fastq_read2'] = design['fastq_read2'] \
+                                    .apply(lambda x: file_dict[x]['path'])
+    return design
+
+
+def main():
+    args = get_args()
+
+    # Create a file handler
+    handler = logging.FileHandler('design.log')
+    logger.addHandler(handler)
+
+    # Read files
+    design_file = pd.read_csv(args.design, sep='\t')
+    fastq_file = pd.read_csv(args.fastq, sep='\t', names=['name', 'path'])
+
+    # Check design file
+    check_design_headers(design_file, args.paired)
+    check_controls(design_file)
+    new_design = check_files(design_file, fastq_file, args.paired)
+
+    # Write out new design file
+    new_design.to_csv('design.tsv', header=True, sep='\t', index=False)
+
+
+if __name__ == '__main__':
+    main()
--- a/workflow/scripts/qc_fastq.py
+++ b/workflow/scripts/qc_fastq.py
+#!/usr/bin/env python3
+
+'''QC check of raw .fastq files using FASTQC.'''
+
+import os
+import subprocess
+import argparse
+import shutil
+import logging
+import sys
+import json
+
+EPILOG = '''
+For more details:
+        %(prog)s --help
+'''
+
+## SETTINGS
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+logger.propagate = False
+logger.setLevel(logging.INFO)
+
+
+def get_args():
+    '''Define arguments.'''
+    parser = argparse.ArgumentParser(
+        description=__doc__, epilog=EPILOG,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+
+    parser.add_argument('-f', '--fastq',
+                        help="The fastq file to run QC check on.",
+                        nargs='+',
+                        required=True)
+
+    args = parser.parse_args()
+    return args
+
+
+def check_tools():
+    '''Checks for required componenets on user system'''
+
+    logger.info('Checking for required libraries and components on this system')
+
+    fastqc_path = shutil.which("fastqc")
+    if fastqc_path:
+        logger.info('Found fastqc: %s', fastqc_path)
+    else:
+        logger.error('Missing fastqc')
+        raise Exception('Missing fastqc')
+
+
+def check_qual_fastq(fastq):
+    '''Run fastqc on 1 or 2 files.'''
+    qc_command = "fastqc -t -f fastq " + " ".join(fastq)
+
+    logger.info("Running fastqc with %s", qc_command)
+
+    qual_fastq = subprocess.Popen(qc_command, shell=True)
+    out, err = qual_fastq.communicate()
+
+
+def main():
+    args = get_args()
+
+    # Create a file handler
+    handler = logging.FileHandler('qc.log')
+    LOGGER.addHandler(handler)
+
+    # Check if tools are present
+    check_tools()
+
+    # Run quality checks
+    check_qual_fastq(args.fastq)
+
+
+if __name__ == '__main__':
+    main()
--- a/workflow/scripts/runDeepTools.pyc
+++ b/workflow/scripts/runDeepTools.pyc
--- a/workflow/scripts/runMemechip.pyc
+++ b/workflow/scripts/runMemechip.pyc
--- a/workflow/tests/test_check_design.py
+++ b/workflow/tests/test_check_design.py
+#!/usr/bin/env python3
+
+import os
+import pytest
+import pandas as pd
+from io import StringIO
+import check_design
+import sys
+
+
+DESIGN_STRING = """sample_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\tfastq_read1
+A_1\tLiver\tH3K27ac\tNone\t1\tB_1\tA_1.fastq.gz
+A_2\tLiver\tH3K27ac\tNone\t2\tB_2\tA_2.fastq.gz
+B_1\tLiver\tInput\tNone\t1\tB_1\tB_1.fastq.gz
+B_2\tLiver\tInput\tNone\t2\tB_2\tB_2.fastq.gz
+"""
+
+FASTQ_STRING = """
+A_1.fastq.gz\t/path/to/file/A_1.fastq.gz
+A_2.fastq.gz\t/path/to/file/A_2.fastq.gz
+B_1.fastq.gz\t/path/to/file/B_1.fastq.gz
+B_2.fastq.gz\t/path/to/file/B_2.fastq.gz
+"""
+
+
+@pytest.fixture
+def design():
+    design_file = StringIO(DESIGN_STRING)
+    design_df = pd.read_csv(design_file, sep="\t")
+    return design_df
+
+
+@pytest.fixture
+def fastq_files():
+    fastq_file = StringIO(FASTQ_STRING)
+    fastq_df = pd.read_csv(fastq_file, sep='\t', names=['name', 'path'])
+    return fastq_df
+
+
+@pytest.fixture
+def design_1(design):
+    design_df = design.drop('fastq_read1', axis=1)
+    return design_df
+
+
+@pytest.fixture
+def design_2(design):
+    # Drop Control B_1
+    design_df = design.drop(design.index[2])
+    return design_df
+
+@pytest.fixture
+def design_3(design):
+    # Drop A_2 and B_2 and append as fastq_read2
+    design_df = design.drop(design.index[[1,3]])
+    design_df['fastq_read2'] = design.loc[[1,3],'fastq_read1'].values
+    return design_df
+
+
+@pytest.fixture
+def fastq_files_1(fastq_files):
+    # Drop B_2.fastq.gz
+    fastq_df = fastq_files.drop(fastq_files.index[3])
+    return fastq_df
+
+
+def test_check_headers_singleend(design_1):
+    paired = False
+    with pytest.raises(Exception) as excinfo:
+        check_design.check_design_headers(design_1, paired)
+    assert str(excinfo.value) == "Missing column headers: ['fastq_read1']"
+
+
+def test_check_headers_pairedend(design):
+    paired = True
+    with pytest.raises(Exception) as excinfo:
+        check_design.check_design_headers(design, paired)
+    assert str(excinfo.value) == "Missing column headers: ['fastq_read2']"
+
+
+def test_check_controls(design_2):
+    with pytest.raises(Exception) as excinfo:
+        check_design.check_controls(design_2)
+    assert str(excinfo.value) == "Missing control experiments: ['B_1']"
+
+
+def test_check_files_missing_files(design, fastq_files_1):
+    paired = False
+    with pytest.raises(Exception) as excinfo:
+        new_design = check_design.check_files(design, fastq_files_1, paired)
+    assert str(excinfo.value) == "Missing files from design file: ['B_2.fastq.gz']"
+
+
+def test_check_files_output_singleend(design, fastq_files):
+    paired = False
+    new_design = check_design.check_files(design, fastq_files, paired)
+    assert new_design.loc[0,'fastq_read1'] == "/path/to/file/A_1.fastq.gz"
+
+
+def test_check_files_output_pairedend(design_3, fastq_files):
+    paired = True
+    new_design = check_design.check_files(design_3, fastq_files, paired)
+    assert new_design.loc[0,'fastq_read2'] == "/path/to/file/A_2.fastq.gz"