Compare revisions

Changes are shown as if the source revision was being merged into the target revision.
Showing with 755 additions and 260 deletions
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''Make YAML of software versions.'''
from __future__ import print_function
from collections import OrderedDict
import re
import os
import logging
import glob
import argparse
import numpy as np
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
SOFTWARE_REGEX = {
    'Nextflow': ['version_nextflow.txt', r"(\S+)"],
    'Trim Galore!': ['trimReads_vf/version_trimgalore.txt', r"version (\S+)"],
    'Cutadapt': ['trimReads_vf/version_cutadapt.txt', r"Version (\S+)"],
    'BWA': ['alignReads_vf/version_bwa.txt', r"Version: (\S+)"],
    'Samtools': ['alignReads_vf/version_samtools.txt', r"samtools (\S+)"],
    'Sambamba': ['filterReads_vf/version_sambamba.txt', r"sambamba (\S+)"],
    'BEDTools': ['convertReads_vf/version_bedtools.txt', r"bedtools v(\S+)"],
    'R': ['crossReads_vf/version_r.txt', r"R version (\S+)"],
    'SPP': ['crossReads_vf/version_spp.txt', r"\[1\] ‘(1.14)’"],
    'MACS2': ['callPeaksMACS_vf/version_macs.txt', r"macs2 (\S+)"],
    'bedGraphToBigWig': ['callPeaksMACS_vf/version_bedGraphToBigWig.txt', r"bedGraphToBigWig v (\S+)"],
    'ChIPseeker': ['peakAnnotation_vf/version_ChIPseeker.txt', r"Version (\S+)\""],
    'MEME-ChIP': ['motifSearch_vf/version_memechip.txt', r"Version (\S+)"],
    'DiffBind': ['diffPeaks_vf/version_DiffBind.txt', r"Version (\S+)\""],
    'deepTools': ['experimentQC_vf/version_deeptools.txt', r"deeptools (\S+)"],
    'MultiQC': ['version_multiqc.txt', r"multiqc, version (\S+)"],
}
def get_args():
    '''Define arguments.'''
    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-o', '--output',
                        help="The out file name.",
                        required=True)
    parser.add_argument('-t', '--test',
                        help='Used for testing purposes',
                        default=False,
                        action='store_true')
    args = parser.parse_args()
    return args


def check_files(files, test):
    '''Check if version files are found.'''
    logger.info("Running file check.")
    software_files = np.array(list(SOFTWARE_REGEX.values()))[:, 0]
    extra_files = set(files) - set(software_files)
    if len(extra_files) > 0 and test:
        logger.error('Missing regex: %s', list(extra_files))
        raise Exception("Missing regex: %s" % list(extra_files))
def main():
    args = get_args()
    output = args.output
    test = args.test
    out_filename = output + '_mqc.yaml'

    # Default every tool to "Not Run" until its version file is found
    results = OrderedDict()
    for tool in SOFTWARE_REGEX:
        results[tool] = '<span style="color:#999999;">Not Run</span>'

    # List all version files
    files = glob.glob('**/*.txt', recursive=True)

    # Check for version files
    check_files(files, test)

    # Search each file using its regex
    for tool, (version_file, regex) in SOFTWARE_REGEX.items():
        if os.path.isfile(version_file):
            with open(version_file) as handle:
                versions = handle.read()
            match = re.search(regex, versions)
            if match:
                results[tool] = "v{}".format(match.group(1))

    # Dump to YAML, writing through one handle instead of reopening the file per line
    with open(out_filename, "w") as yaml_file:
        print(
            '''
id: 'Software Versions'
section_name: 'Software Versions'
section_href: 'https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/'
plot_type: 'html'
description: 'are collected at run time from the software output.'
data: |
    <dl class="dl-horizontal">
''', file=yaml_file)
        for tool, version in results.items():
            print("        <dt>{}</dt><dd>{}</dd>".format(tool, version), file=yaml_file)
        print("    </dl>", file=yaml_file)


if __name__ == '__main__':
    main()
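For reference, each entry in SOFTWARE_REGEX pairs a version file with a pattern, and main() applies them exactly as in this minimal sketch; the file contents below are a hypothetical BWA banner, not pipeline output.

import re

versions = "Program: bwa\nVersion: 0.7.12-r1039\n"  # made-up file contents
match = re.search(r"Version: (\S+)", versions)
if match:
    print("v{}".format(match.group(1)))  # prints: v0.7.12-r1039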
......@@ -62,6 +62,15 @@ def check_tools():
samtools_path = shutil.which("samtools")
if samtools_path:
logger.info('Found samtools: %s', samtools_path)
# Get Version
samtools_version_command = "samtools --version"
samtools_version = subprocess.check_output(samtools_version_command, shell=True)
# Write to file
samtools_file = open("version_samtools.txt", "wb")
samtools_file.write(samtools_version)
samtools_file.close()
else:
logger.error('Missing samtools')
raise Exception('Missing samtools')
......@@ -69,6 +78,18 @@ def check_tools():
sambamba_path = shutil.which("sambamba")
if sambamba_path:
logger.info('Found sambamba: %s', sambamba_path)
# Get Version
sambamba_version_command = "sambamba"
try:
subprocess.check_output(sambamba_version_command, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
sambamba_version = e.output
# Write to file
sambamba_file = open("version_sambamba.txt", "wb")
sambamba_file.write(sambamba_version)
sambamba_file.close()
else:
logger.error('Missing sambamba')
raise Exception('Missing sambamba')
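The version-capture blocks added above all repeat one pattern. A hypothetical helper covering both cases — tools that print a version on stdout, and tools like sambamba and bwa that print usage to stderr and exit nonzero — might look like this; a sketch, not part of the pipeline:

import subprocess

def capture_version(command, outfile):
    '''Run a version command and save its output, tolerating tools
    that print to stderr and exit nonzero (e.g. sambamba, bwa).'''
    try:
        output = subprocess.check_output(command, shell=True,
                                         stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as error:
        output = error.output
    with open(outfile, "wb") as handle:
        handle.write(output)

# e.g. capture_version("samtools --version", "version_samtools.txt")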
......@@ -76,6 +97,15 @@ def check_tools():
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
......@@ -106,7 +136,7 @@ def filter_mapped_pe(bam, bam_basename):
# Will produce name sorted BAM
"samtools sort -n -@ %d -o %s" % (cpu_count(), tmp_filt_bam_filename)])
if err:
logger.error("samtools filter error: %s" % (err))
logger.error("samtools filter error: %s", err)
# Remove orphan reads (pair was removed)
# and read pairs mapping to different chromosomes
......@@ -136,7 +166,7 @@ def filter_mapped_se(bam, bam_basename):
# not primary alignment, reads failing platform
# Remove low MAPQ reads
# Obtain name sorted BAM file
with open(filt_bam_filename, 'w') as fh:
with open(filt_bam_filename, 'w') as temp_file:
samtools_filter_command = (
"samtools view -F 1804 -q 30 -b %s"
% (bam)
......@@ -144,7 +174,7 @@ def filter_mapped_se(bam, bam_basename):
logger.info(samtools_filter_command)
subprocess.check_call(
shlex.split(samtools_filter_command),
stdout=fh)
stdout=temp_file)
return filt_bam_filename
......@@ -153,11 +183,11 @@ def dedup_mapped(bam, bam_basename, paired):
'''Use sambamba and samtools to remove duplicates.'''
# Markduplicates
dup_file_qc_filename = bam_basename + ".dup.qc"
dup_file_qc_filename = bam_basename + ".dedup.qc"
tmp_dup_mark_filename = bam_basename + ".dupmark.bam"
sambamba_params = "--hash-table-size=17592186044416" + \
" --overflow-list-size=20000000 --io-buffer-size=256"
with open(dup_file_qc_filename, 'w') as fh:
with open(dup_file_qc_filename, 'w') as temp_file:
sambamba_markdup_command = (
"sambamba markdup -t %d %s --tmpdir=%s %s %s"
% (cpu_count(), sambamba_params, os.getcwd(), bam, tmp_dup_mark_filename)
......@@ -165,11 +195,10 @@ def dedup_mapped(bam, bam_basename, paired):
logger.info(sambamba_markdup_command)
subprocess.check_call(
shlex.split(sambamba_markdup_command),
stderr=fh)
stderr=temp_file)
# Remove duplicates
final_bam_prefix = bam_basename + ".filt.nodup"
final_bam_prefix = bam_basename + ".dedup"
final_bam_filename = final_bam_prefix + ".bam"
if paired: # paired-end data
......@@ -179,11 +208,11 @@ def dedup_mapped(bam, bam_basename, paired):
samtools_dedupe_command = \
"samtools view -F 1804 -b %s" % (tmp_dup_mark_filename)
with open(final_bam_filename, 'w') as fh:
with open(final_bam_filename, 'w') as temp_file:
logger.info(samtools_dedupe_command)
subprocess.check_call(
shlex.split(samtools_dedupe_command),
stdout=fh)
stdout=temp_file)
# Index final bam file
sambamba_index_command = \
......@@ -192,12 +221,12 @@ def dedup_mapped(bam, bam_basename, paired):
subprocess.check_output(shlex.split(sambamba_index_command))
# Generate mapping statistics
final_bam_file_mapstats_filename = final_bam_prefix + ".flagstat.qc"
with open(final_bam_file_mapstats_filename, 'w') as fh:
mapstats_filename = final_bam_prefix + ".flagstat.qc"
with open(mapstats_filename, 'w') as temp_file:
flagstat_command = "sambamba flagstat -t %d %s" \
% (cpu_count(), final_bam_filename)
logger.info(flagstat_command)
subprocess.check_call(shlex.split(flagstat_command), stdout=fh)
subprocess.check_call(shlex.split(flagstat_command), stdout=temp_file)
os.remove(bam)
return tmp_dup_mark_filename
......@@ -206,7 +235,7 @@ def dedup_mapped(bam, bam_basename, paired):
def compute_complexity(bam, paired, bam_basename):
'''Calculate library complexity.'''
pbc_file_qc_filename = bam_basename + ".filt.nodup.pbc.qc"
pbc_file_qc_filename = bam_basename + ".pbc.qc"
tmp_pbc_file_qc_filename = "tmp.%s" % (pbc_file_qc_filename)
# Sort by name
......@@ -215,6 +244,7 @@ def compute_complexity(bam, paired, bam_basename):
# Obtain unique count statistics
# PBC File output
# Sample Name[tab]
# TotalReadPairs [tab]
# DistinctReadPairs [tab]
# OneReadPair [tab]
......@@ -223,12 +253,12 @@ def compute_complexity(bam, paired, bam_basename):
# PBC1=OnePair/Distinct [tab]
# PBC2=OnePair/TwoPair
pbc_headers = ['TotalReadPairs',
'DistinctReadPairs',
'OneReadPair',
'TwoReadPairs',
'NRF',
'PBC1',
'PBC2']
'DistinctReadPairs',
'OneReadPair',
'TwoReadPairs',
'NRF',
'PBC1',
'PBC2']
if paired:
steps = [
......@@ -247,11 +277,15 @@ def compute_complexity(bam, paired, bam_basename):
])
out, err = utils.run_pipe(steps, tmp_pbc_file_qc_filename)
if err:
logger.error("PBC file error: %s" % (err))
logger.error("PBC file error: %s", err)
# Add headers
# Add Sample Name and headers
pbc_file = pd.read_csv(tmp_pbc_file_qc_filename, sep='\t', header=None,
names=pbc_headers)
names=pbc_headers)
pbc_file['Sample'] = bam_basename
pbc_headers_new = list(pbc_file)
pbc_headers_new.insert(0, pbc_headers_new.pop(pbc_headers_new.index('Sample')))
pbc_file = pbc_file[pbc_headers_new]
pbc_file.to_csv(pbc_file_qc_filename, header=True, sep='\t', index=False)
os.remove(bam)
os.remove(bam + '.bai')
......
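For reference, the PBC quantities named in the header comment of compute_complexity() reduce to three ratios; a worked illustration with made-up counts (NRF = Distinct/Total per the standard ENCODE definition, PBC1 and PBC2 as stated in the comments above):

def pbc_metrics(total_pairs, distinct_pairs, one_pair, two_pair):
    '''NRF, PBC1 and PBC2 as written to the .pbc.qc file.'''
    nrf = distinct_pairs / total_pairs   # Non-Redundant Fraction
    pbc1 = one_pair / distinct_pairs     # PCR Bottlenecking Coefficient 1
    pbc2 = one_pair / two_pair           # PCR Bottlenecking Coefficient 2
    return nrf, pbc1, pbc2

# Made-up counts: pbc_metrics(100, 95, 92, 2) -> (0.95, 0.968..., 46.0)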
......@@ -9,7 +9,6 @@ import shutil
import shlex
import logging
from multiprocessing import cpu_count
import json
import utils
EPILOG = '''
......@@ -33,7 +32,7 @@ STRIP_EXTENSIONS = ['.gz', '.fq', '.fastq', '_trimmed']
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
......@@ -47,6 +46,10 @@ def get_args():
help="The bwa index of the reference genome.",
required=True)
parser.add_argument('-s', '--sample',
help="The name of the sample.",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
......@@ -67,6 +70,18 @@ def check_tools():
bwa_path = shutil.which("bwa")
if bwa_path:
logger.info('Found bwa: %s', bwa_path)
# Get Version
bwa_version_command = "bwa"
try:
subprocess.check_output(bwa_version_command, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
bwa_version = e.output
# Write to file
bwa_file = open("version_bwa.txt", "wb")
bwa_file.write(bwa_version)
bwa_file.close()
else:
logger.error('Missing bwa')
raise Exception('Missing bwa')
......@@ -74,6 +89,15 @@ def check_tools():
samtools_path = shutil.which("samtools")
if samtools_path:
logger.info('Found samtools: %s', samtools_path)
# Get Version
samtools_version_command = "samtools --version"
samtools_version = subprocess.check_output(samtools_version_command, shell=True)
# Write to file
samtools_file = open("version_samtools.txt", "wb")
samtools_file.write(samtools_version)
samtools_file.close()
else:
logger.error('Missing samtools')
raise Exception('Missing samtools')
......@@ -101,7 +125,7 @@ def generate_sa(fastq, reference):
def align_se(fastq, sai, reference, fastq_basename):
'''Use BWA to align SE data.'''
bam_filename = '%s.srt.bam' % (fastq_basename)
bam_filename = '%s.bam' % (fastq_basename)
steps = [
"bwa samse %s %s %s"
......@@ -112,7 +136,7 @@ def align_se(fastq, sai, reference, fastq_basename):
out, err = utils.run_pipe(steps)
if err:
logger.error("samse/samtools error: %s" % (err))
logger.error("samse/samtools error: %s", err)
return bam_filename
......@@ -122,21 +146,21 @@ def align_pe(fastq, sai, reference, fastq_basename):
sam_filename = "%s.sam" % (fastq_basename)
badcigar_filename = "%s.badReads" % (fastq_basename)
bam_filename = '%s.srt.bam' % (fastq_basename)
bam_filename = '%s.bam' % (fastq_basename)
# Remove read pairs with bad CIGAR strings and sort by position
steps = [
"bwa sampe -P %s %s %s %s %s"
% (reference, sai[0], sai[1],
fastq[0], fastq[1]),
"tee %s" % (sam_filename),
r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
"sort",
"uniq"]
"bwa sampe -P %s %s %s %s %s"
% (reference, sai[0], sai[1],
fastq[0], fastq[1]),
"tee %s" % (sam_filename),
r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
"sort",
"uniq"]
out, err = utils.run_pipe(steps, badcigar_filename)
if err:
logger.error("sampe error: %s" % (err))
logger.error("sampe error: %s", err)
steps = [
"cat %s" % (sam_filename),
......@@ -147,7 +171,7 @@ def align_pe(fastq, sai, reference, fastq_basename):
out, err = utils.run_pipe(steps)
if err:
logger.error("samtools error: %s" % (err))
logger.error("samtools error: %s", err)
return bam_filename
......@@ -157,6 +181,7 @@ def main():
paired = args.paired
fastq = args.fastq
reference = args.reference
sample = args.sample
# Create a file handler
handler = logging.FileHandler('map.log')
......@@ -167,31 +192,25 @@ def main():
# Run Suffix Array generation
sai = []
for fq in fastq:
sai_filename = generate_sa(fq, reference)
for fastq_file in fastq:
sai_filename = generate_sa(fastq_file, reference)
sai.append(sai_filename)
# Make file basename
fastq_basename = sample
# Run alignment for either PE or SE
if paired: # paired-end data
fastq_r1_basename = os.path.basename(
utils.strip_extensions(fastq[0], STRIP_EXTENSIONS))
fastq_r2_basename = os.path.basename(
utils.strip_extensions(fastq[1], STRIP_EXTENSIONS))
fastq_basename = fastq_r1_basename + fastq_r2_basename
bam_filename = align_pe(fastq, sai, reference, fastq_basename)
else:
fastq_basename = os.path.basename(
utils.strip_extensions(fastq[0], STRIP_EXTENSIONS))
bam_filename = align_se(fastq, sai, reference, fastq_basename)
bam_mapstats_filename = '%s.srt.bam.flagstat.qc' % (fastq_basename)
with open(bam_mapstats_filename, 'w') as fh:
bam_mapstats_filename = '%s.flagstat.qc' % (fastq_basename)
with open(bam_mapstats_filename, 'w') as temp_file:
subprocess.check_call(
shlex.split("samtools flagstat %s" % (bam_filename)),
stdout=fh)
stdout=temp_file)
# Remove sai files
for sai_file in sai:
......
......@@ -3,16 +3,14 @@
'''Call Motifs on called peaks.'''
import os
import sys
import re
from re import sub
import string
import argparse
import logging
import shutil
import subprocess
from multiprocessing import Pool
import pandas as pd
import utils
from multiprocessing import Pool
EPILOG = '''
For more details:
......@@ -27,6 +25,13 @@ logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.narrowPeak', '.replicated']
def get_args():
'''Define arguments.'''
......@@ -51,19 +56,66 @@ def get_args():
# Functions
def check_tools():
'''Checks for required components on the user system'''
logger.info('Checking for required libraries and components on this system')
meme_path = shutil.which("meme")
if meme_path:
logger.info('Found meme: %s', meme_path)
# Get Version
memechip_version_command = "meme-chip --version"
memechip_version = subprocess.check_output(memechip_version_command, shell=True)
# Write to file
meme_file = open("version_memechip.txt", "wb")
meme_file.write(b"Version %s" % (memechip_version))
meme_file.close()
else:
logger.error('Missing meme')
raise Exception('Missing meme')
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
def run_wrapper(args):
motif_search(*args)
motif_search(*args)
def motif_search(filename, genome, experiment, peak):
'''Run motif search on peaks.'''
file_basename = os.path.basename(
utils.strip_extensions(filename, STRIP_EXTENSIONS))
file_basename = os.path.basename(filename)
sorted_fn = 'sorted-%s' % (file_basename)
out_fa = '%s.fa' % (experiment)
out_motif = '%s_memechip' % (experiment)
# Sort Bed file and limit number of peaks
if peak == -1:
peak = utils.count_lines(filename)
peak_no = 'all'
else:
peak_no = peak
sorted_fn = '%s.%s.narrowPeak' % (file_basename, peak_no)
out, err = utils.run_pipe([
'sort -k %dgr,%dgr %s' % (5, 5, filename),
......@@ -73,9 +125,15 @@ def motif_search(filename, genome, experiment, peak):
out, err = utils.run_pipe([
'bedtools getfasta -fi %s -bed %s -fo %s' % (genome, sorted_fn, out_fa)])
#Call memechip
if err:
logger.error("bedtools error: %s", err)
# Call memechip
out, err = utils.run_pipe([
'meme-chip -oc %s -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 %s -norand' % (out_motif, out_fa)])
if err:
logger.error("meme-chip error: %s", err)
def main():
args = get_args()
......@@ -83,14 +141,22 @@ def main():
genome = args.genome
peak = args.peak
# Create a file handler
handler = logging.FileHandler('motif.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Read files
design_df = pd.read_csv(design, sep='\t')
meme_arglist = zip(design_df['Peaks'].tolist(), [genome]*design_df.shape[0], design_df['Condition'].tolist(), [peak]*design_df.shape[0])
work_pool = Pool(min(12,design_df.shape[0]))
return_list = work_pool.map(run_wrapper, meme_arglist)
meme_arglist = zip(design_df['Peaks'].tolist(), [genome]*design_df.shape[0], design_df['Condition'].tolist(), [peak]*design_df.shape[0])
work_pool = Pool(min(12, design_df.shape[0]))
return_list = work_pool.map(run_wrapper, meme_arglist)
work_pool.close()
work_pool.join()
if __name__=="__main__":
main()
if __name__ == '__main__':
main()
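The STRIP_EXTENSIONS comment above notes that utils.strip_extensions strips from the right inward, which is why the right-most extensions are listed first; the helper itself is not part of this diff. A minimal sketch consistent with that comment:

def strip_extensions(filename, extensions):
    '''Strip listed extensions from the right inward, one pass each.'''
    basename = filename
    for extension in extensions:
        if basename.endswith(extension):
            basename = basename[:-len(extension)]
    return basename

# strip_extensions('sample.tagAlign.gz', ['.gz', '.tagAlign']) -> 'sample'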
......@@ -6,6 +6,7 @@ import os
import argparse
import logging
import shutil
import subprocess
import pandas as pd
import utils
......@@ -49,6 +50,15 @@ def check_tools():
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
......@@ -113,9 +123,9 @@ def overlap(experiment, design):
# with any one of the overlapping peak pairs >= 0.5
steps_true = ['intersectBed -wo -a %s -b %s' % (pool_peaks, true_rep_peaks[0]),
awk_command,
cut_command,
'sort -u']
awk_command,
cut_command,
'sort -u']
iter_true_peaks = iter(true_rep_peaks)
next(iter_true_peaks)
......@@ -123,13 +133,13 @@ def overlap(experiment, design):
if len(true_rep_peaks) > 1:
for true_peak in true_rep_peaks[1:]:
steps_true.extend(['intersectBed -wo -a stdin -b %s' % (true_peak),
awk_command,
cut_command,
'sort -u'])
awk_command,
cut_command,
'sort -u'])
out, err = utils.run_pipe(steps_true, outfile=overlap_tr_fn)
print("%d peaks overlap with both true replicates" %
(utils.count_lines(overlap_tr_fn)))
(utils.count_lines(overlap_tr_fn)))
# Find pooled peaks that overlap PseudoRep1 and PseudoRep2
# where overlap is defined as the fractional overlap
......@@ -146,7 +156,7 @@ def overlap(experiment, design):
out, err = utils.run_pipe(steps_pseudo, outfile=overlap_pr_fn)
print("%d peaks overlap with both pooled pseudoreplicates"
% (utils.count_lines(overlap_pr_fn)))
% (utils.count_lines(overlap_pr_fn)))
# Make union of peak lists
out, err = utils.run_pipe([
......@@ -154,7 +164,7 @@ def overlap(experiment, design):
'sort -u'
], overlapping_peaks_fn)
print("%d peaks overlap with true replicates or with pooled pseudorepliates"
% (utils.count_lines(overlapping_peaks_fn)))
% (utils.count_lines(overlapping_peaks_fn)))
# Make rejected peak list
out, err = utils.run_pipe([
......@@ -178,6 +188,9 @@ def main():
handler = logging.FileHandler('consensus_peaks.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Read files as dataframes
design_peaks_df = pd.read_csv(design, sep='\t')
design_files_df = pd.read_csv(files, sep='\t')
......@@ -187,7 +200,7 @@ def main():
# Make a design file for annotating Peaks
anno_cols = ['Condition', 'Peaks']
design_anno = pd.DataFrame(columns = anno_cols)
design_anno = pd.DataFrame(columns=anno_cols)
# Find consenus overlap peaks for each experiment
for experiment, df_experiment in design_peaks_df.groupby('experiment_id'):
......@@ -197,16 +210,16 @@ def main():
# Write out design files
design_diff.columns = ['SampleID',
'bamReads',
'Condition',
'Tissue',
'Factor',
'Treatment',
'Replicate',
'ControlID',
'bamControl',
'Peaks',
'PeakCaller']
'bamReads',
'Condition',
'Tissue',
'Factor',
'Treatment',
'Replicate',
'ControlID',
'bamControl',
'Peaks',
'PeakCaller']
design_diff.to_csv("design_diffPeaks.csv", header=True, sep=',', index=False)
design_anno.to_csv("design_annotatePeaks.tsv", header=True, sep='\t', index=False)
......
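The >= 0.5 fractional-overlap rule that overlap() applies above is implemented via awk_command (defined outside these hunks); restated in plain Python for clarity, as an illustration of the criterion rather than the pipeline's code:

def passes_fractional_overlap(len_a, len_b, overlap_len, cutoff=0.5):
    '''True if the overlap covers at least `cutoff` of either peak.'''
    return (overlap_len / len_a >= cutoff) or (overlap_len / len_b >= cutoff)

# e.g. a 200 bp and a 1000 bp peak sharing 150 bp pass (150/200 >= 0.5)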
......@@ -3,11 +3,14 @@
'''Generate pooled and pseudoreplicate from data.'''
import argparse
import utils
import logging
import os
import subprocess
import shutil
import shlex
import pandas as pd
import numpy as np
import os
import utils
EPILOG = '''
For more details:
......@@ -21,6 +24,12 @@ logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.gz', '.tagAlign', '.bedse', '.bedpe']
def get_args():
'''Define arguments.'''
......@@ -87,21 +96,23 @@ def pool(tag_files, outfile, paired):
else:
file_extension = '.bedse.gz'
pooled_filename = outfile + file_extension
pool_basename = os.path.basename(
utils.strip_extensions(outfile, STRIP_EXTENSIONS))
pooled_filename = pool_basename + file_extension
# Merge files
out, err = utils.run_pipe([
'gzip -dc %s' % (' '.join(tag_files)),
'gzip -cn'],
outfile=pooled_filename)
'gzip -cn'], outfile=pooled_filename)
return pooled_filename
def bedpe_to_tagalign(tag_file, outfile):
'''Convert read pairs to reads itno standard tagAlign file.'''
'''Convert read pairs to reads into standard tagAlign file.'''
se_tag_filename = outfile + "bedse.tagAlign.gz"
se_tag_filename = outfile + ".tagAlign.gz"
# Convert read pairs to reads into standard tagAlign file
tag_steps = ["zcat -f %s" % (tag_file)]
......@@ -122,7 +133,7 @@ def self_psuedoreplication(tag_file, prefix, paired):
lines_per_rep = (no_lines+1)/2
# Make a dict of pseudoreplicate file names
pseudoreplicate_dict = {r: prefix + '.pr' + str(r) + '.bedse.tagAlign.gz'
pseudoreplicate_dict = {r: prefix + '.pr' + str(r) + '.tagAlign.gz'
for r in [0, 1]}
# Shuffle and split file into equal parts
......@@ -132,21 +143,25 @@ def self_psuedoreplication(tag_file, prefix, paired):
splits_prefix = 'temp_split'
out, err = utils.run_pipe([
'gzip -dc %s' % (tag_file),
'shuf --random-source=%s' % (tag_file),
'split -d -l %d - %s' % (lines_per_rep, splits_prefix)])
psuedo_command = 'bash -c "zcat {} | shuf --random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f {} | wc -c) -nosalt </dev/zero 2>/dev/null) | '
psuedo_command += 'split -d -l {} - {}."'
psuedo_command = psuedo_command.format(
tag_file,
tag_file,
int(lines_per_rep),
splits_prefix)
logger.info("Running psuedo with %s", psuedo_command)
subprocess.check_call(shlex.split(psuedo_command))
# Convert read pairs to reads into standard tagAlign file
for i, index in enumerate([0, 1]):
string_index = '0' + str(index)
string_index = '.0' + str(index)
steps = ['cat %s' % (splits_prefix + string_index)]
if paired:
steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""])
steps.extend(['gzip -cn'])
out, err = utils.run_pipe(steps, outfile=pseudoreplicate_dict[i])
os.remove(splits_prefix + string_index)
return pseudoreplicate_dict
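The openssl-seeded shuf added above exists to make the shuffle reproducible: the random stream is keyed on the input's byte count, so rerunning over the same file yields identical pseudoreplicates. An equivalent pure-Python illustration of that idea (not what the pipeline runs):

import gzip
import random

def deterministic_split(tag_file, splits_prefix):
    '''Reproducibly shuffle a gzipped tagAlign and split it in two.'''
    with gzip.open(tag_file, 'rt') as handle:
        lines = handle.readlines()
    # Seed on input size so reruns over the same file agree
    random.Random(len(lines)).shuffle(lines)
    half = (len(lines) + 1) // 2
    for index, chunk in enumerate([lines[:half], lines[half:]]):
        with open('%s.0%d' % (splits_prefix, index), 'w') as out:
            out.writelines(chunk)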
......@@ -213,7 +228,7 @@ def main():
# Update tagAlign with single end data
if paired:
design_new_df['tag_align'] = design_new_df['se_tag_align']
design_new_df.drop(labels = 'se_tag_align', axis = 1, inplace = True)
design_new_df.drop(labels='se_tag_align', axis=1, inplace=True)
design_new_df['replicate'] = design_new_df['replicate'].astype(str)
design_new_df.at[1, 'sample_id'] = experiment_id + '_pr1'
......@@ -243,8 +258,6 @@ def main():
# Drop index column
design_new_df.drop(labels='index', axis=1, inplace=True)
else:
# Make pool of replicates
replicate_files = design_df.tag_align.unique()
......@@ -269,7 +282,7 @@ def main():
pool_pseudoreplicates_dict = {}
for index, row in pseudoreplicates_df.iterrows():
replicate_id = index + 1
pr_filename = experiment_id + ".pr" + str(replicate_id) + '.bedse.gz'
pr_filename = experiment_id + ".pr" + str(replicate_id) + '.tagAlign.gz'
pool_replicate = pool(row, pr_filename, False)
pool_pseudoreplicates_dict[replicate_id] = pool_replicate
......@@ -277,16 +290,16 @@ def main():
# Update tagAlign with single end data
if paired:
design_new_df['tag_align'] = design_new_df['se_tag_align']
design_new_df.drop(labels = 'se_tag_align', axis = 1, inplace = True)
design_new_df.drop(labels='se_tag_align', axis=1, inplace=True)
# Check controls against cutoff_ratio
# if so replace with pool_control
# unless single control was used
if not single_control:
path_to_pool_control = cwd + '/' + pool_control
if control_df.values.max() > 1.2:
if control_df.values.max() > cutoff_ratio:
logger.info("Number of reads in controls differ by " +
" > factor of %f. Using pooled controls." % (cutoff_ratio))
" > factor of %f. Using pooled controls." % (cutoff_ratio))
design_new_df['control_tag_align'] = path_to_pool_control
else:
for index, row in design_new_df.iterrows():
......@@ -302,14 +315,14 @@ def main():
if paired:
control = row['control_tag_align']
control_basename = os.path.basename(
utils.strip_extensions(control, ['.filt.nodup.bedpe.gz']))
control_tmp = bedpe_to_tagalign(control , "control_basename")
utils.strip_extensions(control, STRIP_EXTENSIONS))
control_tmp = bedpe_to_tagalign(control, control_basename)
path_to_control = cwd + '/' + control_tmp
design_new_df.loc[index, 'control_tag_align'] = \
path_to_control
else:
path_to_pool_control = pool_control
path_to_pool_control = pool_control
# Add in pseudo replicates
tmp_metadata = design_new_df.loc[0].copy()
......@@ -330,10 +343,9 @@ def main():
tmp_metadata['tag_align'] = path_to_file
design_new_df = design_new_df.append(tmp_metadata)
# Write out new dataframe
design_new_df.to_csv(experiment_id + '_ppr.tsv',
header=True, sep='\t', index=False)
header=True, sep='\t', index=False)
if __name__ == '__main__':
......
#!/usr/bin/python
# programmer : bbc
# usage:
import sys
import argparse as ap
import logging
import subprocess
import pandas as pd
from multiprocessing import Pool
logging.basicConfig(level=10)
def prepare_argparser():
description = "Make wig file for given bed using bam"
epilog = "For command line options of each command, type %(prog)s COMMAND -h"
argparser = ap.ArgumentParser(description=description, epilog = epilog)
argparser.add_argument("-i","--input",dest = "infile",type=str,required=True, help="input BAM file")
argparser.add_argument("-g","--genome",dest = "genome",type=str,required=True, help="genome", default="hg19")
#argparser.add_argument("-b","--bed",dest="bedfile",type=str,required=True, help = "Gene locus in bed format")
#argparser.add_argument("-s","--strandtype",dest="stranded",type=str,default="none", choices=["none","reverse","yes"])
#argparser.add_argument("-n","--name",dest="trackName",type=str,default="UserTrack",help = "track name for bedgraph header")
return(argparser)
def run_qc(files, controls, labels):
mbs_command = "multiBamSummary bins --bamfiles "+' '.join(files)+" -out sample_mbs.npz"
p = subprocess.Popen(mbs_command, shell=True)
#logging.debug(mbs_command)
p.communicate()
pcor_command = "plotCorrelation -in sample_mbs.npz --corMethod spearman --skipZeros --plotTitle \"Spearman Correlation of Read Counts\" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers -o experiment.deeptools.heatmap_spearmanCorr_readCounts_v2.png --labels "+" ".join(labels)
#logging.debug(pcor_command)
p = subprocess.Popen(pcor_command, shell=True)
p.communicate()
#plotCoverage
pcov_command = "plotCoverage -b "+" ".join(files)+" --plotFile experiment.deeptools_coverage.png -n 1000000 --plotTitle \"sample coverage\" --ignoreDuplicates --minMappingQuality 10"
p = subprocess.Popen(pcov_command, shell=True)
p.communicate()
#draw fingerprints plots
for treat,ctrl,name in zip(files,controls,labels):
fp_command = "plotFingerprint -b "+treat+" "+ctrl+" --labels "+name+" control --plotFile "+name+".deeptools_fingerprints.png"
p = subprocess.Popen(fp_command, shell=True)
p.communicate()
def bam2bw_wrapper(command):
p = subprocess.Popen(command, shell=True)
p.communicate()
def run_signal(files, labels, genome):
#compute matrix and draw profile and heatmap
gene_bed = genome + "/gene.bed"  # "/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/genome/"+genome+"/gene.bed"
bw_commands = []
for f in files:
bw_commands.append("bamCoverage -bs 10 -b "+f+" -o "+f.replace("bam","bw"))
work_pool = Pool(min(len(files), 12))
work_pool.map(bam2bw_wrapper, bw_commands)
work_pool.close()
work_pool.join()
cm_command = "computeMatrix scale-regions -R "+gene_bed+" -a 3000 -b 3000 --regionBodyLength 5000 --skipZeros -S *.bw -o samples.deeptools_generegionscalematrix.gz"
p = subprocess.Popen(cm_command, shell=True)
p.communicate()
hm_command = "plotHeatmap -m samples.deeptools_generegionscalematrix.gz -out samples.deeptools_readsHeatmap.png"
p = subprocess.Popen(hm_command, shell=True)
p.communicate()
def run(dfile,genome):
#parse dfile, suppose data files are the same folder as design file
dfile = pd.read_csv(dfile)
#QC: multiBamSummary and plotCorrelation
run_qc(dfile['bamReads'], dfile['bamControl'], dfile['SampleID'])
#signal plots
run_signal(dfile['bamReads'],dfile['SampleID'],genome)
def main():
argparser = prepare_argparser()
args = argparser.parse_args()
run(args.infile, args.genome)
if __name__=="__main__":
main()
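run() above reads the design file with pd.read_csv and indexes its bamReads, bamControl and SampleID columns. A hypothetical design file (sample names, paths, and the output file name are all made up) could be built like this:

import pandas as pd

design = pd.DataFrame({
    'SampleID':   ['sampleA', 'sampleB'],
    'bamReads':   ['sampleA.dedup.bam', 'sampleB.dedup.bam'],
    'bamControl': ['controlA.dedup.bam', 'controlB.dedup.bam'],
})
design.to_csv('design_experimentQC.csv', index=False)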
......@@ -5,6 +5,7 @@
import subprocess
import argparse
import shutil
import os
import logging
EPILOG = '''
......@@ -32,6 +33,10 @@ def get_args():
nargs='+',
required=True)
parser.add_argument('-s', '--sample',
help="The name of the sample.",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
......@@ -49,6 +54,15 @@ def check_tools():
trimgalore_path = shutil.which("trim_galore")
if trimgalore_path:
logger.info('Found trimgalore: %s', trimgalore_path)
# Get Version
trim_version_command = "trim_galore --version"
trimgalore_version = subprocess.check_output(trim_version_command, shell=True)
# Write to file
trimgalore_file = open("version_trimgalore.txt", "wb")
trimgalore_file.write(trimgalore_version)
trimgalore_file.close()
else:
logger.error('Missing trimgalore')
raise Exception('Missing trimgalore')
......@@ -56,11 +70,46 @@ def check_tools():
cutadapt_path = shutil.which("cutadapt")
if cutadapt_path:
logger.info('Found cutadapt: %s', cutadapt_path)
# Get Version
cutadapt_version_command = "cutadapt --version"
cutadapt_version = subprocess.check_output(cutadapt_version_command, shell=True)
# Write to file
cutadapt_file = open("version_cutadapt.txt", "wb")
cutadapt_file.write(b"Version %s" % (cutadapt_version))
cutadapt_file.close()
else:
logger.error('Missing cutadapt')
raise Exception('Missing cutadapt')
def rename_reads(fastq, sample, paired):
'''Rename fastq files by sample name.'''
# Get current directory to build paths
cwd = os.getcwd()
renamed_fastq = []
if paired: # paired-end data
# Set file names
renamed_fastq.append(cwd + '/' + sample + '_R1.fastq.gz')
renamed_fastq.append(cwd + '/' + sample + '_R2.fastq.gz')
# Create symbolic links
os.symlink(fastq[0], renamed_fastq[0])
os.symlink(fastq[1], renamed_fastq[1])
else:
# Set file names
renamed_fastq.append(cwd + '/' + sample + '_R1.fastq.gz')
# Create symbolic links
os.symlink(fastq[0], renamed_fastq[0])
return renamed_fastq
def trim_reads(fastq, paired):
'''Run trim_galore on 1 or 2 files.'''
......@@ -82,6 +131,7 @@ def trim_reads(fastq, paired):
def main():
args = get_args()
fastq = args.fastq
sample = args.sample
paired = args.paired
# Create a file handler
......@@ -91,8 +141,11 @@ def main():
# Check if tools are present
check_tools()
# Rename fastq files by sample
fastq_rename = rename_reads(fastq, sample, paired)
# Run trim_reads
trim_reads(fastq, paired)
trim_reads(fastq_rename, paired)
if __name__ == '__main__':
......
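For reference, a hypothetical call to the new rename_reads() above for paired-end input (paths and sample name invented):

renamed = rename_reads(
    ['/path/to/A_R1.fastq.gz', '/path/to/A_R2.fastq.gz'],
    'sampleA',
    True)
# -> ['<cwd>/sampleA_R1.fastq.gz', '<cwd>/sampleA_R2.fastq.gz'], created as symlinks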
......@@ -16,7 +16,6 @@ logger.propagate = True
def run_pipe(steps, outfile=None):
# TODO: capture stderr
from subprocess import Popen, PIPE
p = None
p_next = None
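The body of run_pipe is truncated in this hunk; a sketch of how such a step-chainer typically works — each step's stdout feeding the next step's stdin via Popen — is shown below. This is an illustration under assumptions, not the pipeline's exact implementation:

import shlex
from subprocess import Popen, PIPE

def run_pipe_sketch(steps, outfile=None):
    '''Chain shell steps together, optionally writing the final stdout to a file.'''
    processes = []
    previous_stdout = None
    for step in steps:
        processes.append(Popen(shlex.split(step),
                               stdin=previous_stdout, stdout=PIPE))
        previous_stdout = processes[-1].stdout
    out, err = processes[-1].communicate()
    if outfile:
        with open(outfile, 'wb') as handle:
            handle.write(out)
    return out, err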
......@@ -98,13 +97,13 @@ def rescale_scores(filename, scores_col, new_min=10, new_max=1000):
'head -n 1 %s' % (sorted_fn),
'cut -f %s' % (scores_col)])
max_score = float(out.strip())
logger.info("rescale_scores: max_score = %s" % (max_score))
logger.info("rescale_scores: max_score = %s", max_score)
out, err = run_pipe([
'tail -n 1 %s' % (sorted_fn),
'cut -f %s' % (scores_col)])
min_score = float(out.strip())
logger.info("rescale_scores: min_score = %s" % (min_score))
logger.info("rescale_scores: min_score = %s", min_score)
a = min_score
b = max_score
......
......@@ -5,6 +5,7 @@
import os
import argparse
import shutil
import subprocess
import logging
from multiprocessing import cpu_count
import utils
......@@ -22,6 +23,13 @@ logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.gz', '.tagAlign', '.bedse', '.bedpe']
def get_args():
'''Define arguments.'''
......@@ -52,28 +60,51 @@ def check_tools():
r_path = shutil.which("R")
if r_path:
logger.info('Found R: %s', r_path)
# Get Version
r_version_command = "R --version"
r_version = subprocess.check_output(r_version_command, shell=True)
# Write to file
r_file = open("version_r.txt", "wb")
r_file.write(r_version)
r_file.close()
else:
logger.error('Missing R')
raise Exception('Missing R')
phantompeak_path = shutil.which("run_spp.R")
if phantompeak_path:
logger.info('Found phantompeak: %s', phantompeak_path)
# Get Version
spp_version_command = "R -e \"packageVersion('spp')\""
spp_version = subprocess.check_output(spp_version_command, shell=True)
# Write to file
spp_file = open("version_spp.txt", "wb")
spp_file.write(spp_version)
spp_file.close()
else:
logger.error('Missing phantompeak')
raise Exception('Missing phantompeak')
def xcor(tag, paired):
'''Use spp to calculate cross-correlation stats.'''
tag_basename = os.path.basename(utils.strip_extensions(tag, ['.gz']))
tag_basename = os.path.basename(utils.strip_extensions(tag, STRIP_EXTENSIONS))
uncompressed_tag_filename = tag_basename
# Subsample tagAlign file
NREADS = 15000000
number_reads = 15000000
subsampled_tag_filename = \
tag_basename + ".%d.tagAlign.gz" % (NREADS/1000000)
tag_basename + ".%d.tagAlign.gz" % (number_reads/1000000)
steps = [
'zcat %s' % (tag),
'grep -v "chrM"',
'shuf -n %d --random-source=%s' % (NREADS, tag)]
'shuf -n %d --random-source=%s' % (number_reads, tag)]
if paired:
steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])
......@@ -83,8 +114,8 @@ def xcor(tag, paired):
out, err = utils.run_pipe(steps, outfile=subsampled_tag_filename)
# Calculate Cross-correlation QC scores
cc_scores_filename = subsampled_tag_filename + ".cc.qc"
cc_plot_filename = subsampled_tag_filename + ".cc.plot.pdf"
cc_scores_filename = tag_basename + ".cc.qc"
cc_plot_filename = tag_basename + ".cc.plot.pdf"
# CC_SCORE FILE format
# Filename <tab>
......@@ -122,7 +153,7 @@ def main():
check_tools()
# Calculate Cross-correlation
xcor_filename = xcor(tag, paired)
xcor(tag, paired)
if __name__ == '__main__':
......
......@@ -11,18 +11,34 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
@pytest.mark.singleend
def test_annotate_peaks_singleend():
def test_pie_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC.chipseeker_pie.pdf'))
@pytest.mark.singleend
def test_upsetplot_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC.chipseeker_upsetplot.pdf'))
@pytest.mark.singleend
def test_annotation_singleend():
annotation_file = test_output_path + 'ENCSR238SGC.chipseeker_annotation.tsv'
assert os.path.exists(annotation_file)
assert utils.count_lines(annotation_file) == 152840
assert utils.count_lines(annotation_file) == 152255
@pytest.mark.pairedend
def test_annotate_peaks_pairedend():
def test_pie_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA.chipseeker_pie.pdf'))
@pytest.mark.pairedend
def test_upsetplot_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA.chipseeker_upsetplot.pdf'))
@pytest.mark.pairedend
def test_annotation_pairedend():
annotation_file = test_output_path + 'ENCSR729LGA.chipseeker_annotation.tsv'
assert os.path.exists(annotation_file)
assert utils.count_lines(annotation_file) == 25391
assert utils.count_lines(annotation_file) == 25494
......@@ -9,16 +9,42 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
@pytest.mark.singleend
def test_call_peaks_macs_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT.fc_signal.bw'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT.pvalue_signal.bw'))
peak_file = test_output_path + 'ENCLB144FDT_peaks.narrowPeak'
def test_fc_signal_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC/1/', 'ENCLB144FDT.fc_signal.bw'))
@pytest.mark.singleend
def test_pvalue_signal_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC/1/', 'ENCLB144FDT.pvalue_signal.bw'))
@pytest.mark.singleend
def test_peaks_xls_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC/1/', 'ENCLB144FDT_peaks.xls'))
@pytest.mark.singleend
def test_peaks_bed_singleend():
peak_file = test_output_path + 'ENCSR238SGC/1/' + 'ENCLB144FDT.narrowPeak'
assert utils.count_lines(peak_file) == 227389
@pytest.mark.pairedend
def test_call_peaks_macs_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX.fc_signal.bw'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX.pvalue_signal.bw'))
peak_file = test_output_path + 'ENCLB568IYX_peaks.narrowPeak'
assert utils.count_lines(peak_file) == 112652
def test_fc_signal_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA/2/', 'ENCLB568IYX.fc_signal.bw'))
@pytest.mark.pairedend
def test_pvalue_signal_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA/2/', 'ENCLB568IYX.pvalue_signal.bw'))
@pytest.mark.pairedend
def test_peaks_xls_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA/2/', 'ENCLB568IYX_peaks.xls'))
@pytest.mark.pairedend
def test_peaks_bed_pairedend():
peak_file = test_output_path + 'ENCSR729LGA/2/' + 'ENCLB568IYX.narrowPeak'
assert utils.count_lines(peak_file) == 113821
......@@ -63,6 +63,24 @@ def design_4(design):
return design
@pytest.fixture
def design_5(design):
# Update sample_id to have -, spaces or periods
design.loc[design['sample_id'] == 'A_1', 'sample_id'] = 'A 1'
design.loc[design['sample_id'] == 'A_2', 'sample_id'] = 'A.2'
design.loc[design['sample_id'] == 'B_1', 'sample_id'] = 'B-1'
return design
@pytest.fixture
def design_6(design):
# Update experiment_id to have -, spaces or periods
design.loc[design['sample_id'] == 'A_1', 'experiment_id'] = 'A ChIP'
design.loc[design['sample_id'] == 'A_2', 'experiment_id'] = 'A.ChIP'
design.loc[design['sample_id'] == 'B_1', 'experiment_id'] = 'B-ChIP'
return design
@pytest.fixture
def fastq_files_1(fastq_files):
# Drop B_2.fastq.gz
......@@ -115,10 +133,25 @@ def test_check_files_output_pairedend(design_3, fastq_files):
assert new_design.loc[0, 'fastq_read2'] == "/path/to/file/A_2.fastq.gz"
@pytest.mark.unit
def test_check_replicates(design_4):
paired = False
with pytest.raises(Exception) as excinfo:
new_design = check_design.check_replicates(design_4)
assert str(excinfo.value) == "Duplicate replicates in experiments: ['B']"
@pytest.mark.unit
def test_check_samples(design_5):
paired = False
with pytest.raises(Exception) as excinfo:
new_design = check_design.check_samples(design_5)
assert str(excinfo.value) == "Malformed samples from design file: ['A 1', 'A.2', 'B-1']"
@pytest.mark.unit
def test_check_experiments(design_6):
paired = False
with pytest.raises(Exception) as excinfo:
new_design = check_design.check_experiments(design_6)
assert str(excinfo.value) == "Malformed experiment from design file: ['A ChIP', 'A.ChIP', 'B-ChIP']"
......@@ -8,12 +8,20 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
@pytest.mark.singleend
def test_convert_reads_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.tagAlign.gz'))
assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bedse.gz'))
def test_tag_reads_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI/ENCLB831RUI.tagAlign.gz'))
@pytest.mark.singleend
def test_bed_reads_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI/ENCLB831RUI.bedse.gz'))
@pytest.mark.pairedend
def test_tag_reads_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX/ENCLB568IYX.tagAlign.gz'))
@pytest.mark.pairedend
def test_map_qc_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.tagAlign.gz'))
assert os.path.exists(os.path.join(test_output_path, 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.bedpe.gz'))
def test_bed_reads_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX/ENCLB568IYX.bedpe.gz'))
......@@ -10,30 +10,65 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../output/diffPeaks/'
@pytest.mark.singleend
@pytest.mark.singleskip_true
def test_diff_peaks_singleend_single_rep():
assert os.path.isdir(test_output_path) == False
@pytest.mark.pairedend
def test_annotate_peaks_pairedend_single_rep():
def test_diff_peaks_pairedend_single_rep():
assert os.path.isdir(test_output_path) == False
@pytest.mark.singlediff
def test_diff_peaks_singleend_multiple_rep():
def test_heatmap_singleend_multiple_rep():
assert os.path.exists(os.path.join(test_output_path, 'heatmap.pdf'))
@pytest.mark.singlediff
def test_pca_singleend_multiple_rep():
assert os.path.exists(os.path.join(test_output_path, 'pca.pdf'))
@pytest.mark.singlediff
def test_normcount_singleend_multiple_rep():
assert os.path.exists(os.path.join(test_output_path, 'normcount_peaksets.txt'))
assert os.path.exists(os.path.join(test_output_path, 'ENCSR272GNQ_vs_ENCSR238SGC_diffbind.bed'))
diffbind_file = test_output_path + 'ENCSR272GNQ_vs_ENCSR238SGC_diffbind.csv'
@pytest.mark.singlediff
def test_diffbind_singleend_multiple_rep():
if os.path.isfile(os.path.join(test_output_path, 'ENCSR272GNQ_vs_ENCSR238SGC_diffbind.bed')):
assert os.path.exists(os.path.join(test_output_path, 'ENCSR272GNQ_vs_ENCSR238SGC_diffbind.bed'))
diffbind_file = test_output_path + 'ENCSR272GNQ_vs_ENCSR238SGC_diffbind.csv'
elif os.path.isfile(os.path.join(test_output_path, 'ENCSR238SGC_vs_ENCSR272GNQ_diffbind.bed')):
assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC_vs_ENCSR272GNQ_diffbind.bed'))
diffbind_file = test_output_path + 'ENCSR238SGC_vs_ENCSR272GNQ_diffbind.csv'
assert os.path.exists(diffbind_file)
assert utils.count_lines(diffbind_file) == 201039
assert utils.count_lines(diffbind_file) == 197471
@pytest.mark.paireddiff
def test_annotate_peaks_pairedend_single_rep():
def test_heatmap_pairedend_single_rep():
assert os.path.exists(os.path.join(test_output_path, 'heatmap.pdf'))
@pytest.mark.paireddiff
def test_pca_pairedend_single_rep():
assert os.path.exists(os.path.join(test_output_path, 'pca.pdf'))
@pytest.mark.paireddiff
def test_normcount_pairedend_single_rep():
assert os.path.exists(os.path.join(test_output_path, 'normcount_peaksets.txt'))
assert os.path.exists(os.path.join(test_output_path, 'ENCSR757EMK_vs_ENCSR729LGA_diffbind.bed'))
diffbind_file = test_output_path + 'ENCSR757EMK_vs_ENCSR729LGA_diffbind.csv'
@pytest.mark.paireddiff
def test_diffbind_pairedend_single_rep():
if os.path.isfile(os.path.join(test_output_path, 'ENCSR757EMK_vs_ENCSR729LGA_diffbind.bed')):
assert os.path.exists(os.path.join(test_output_path, 'ENCSR757EMK_vs_ENCSR729LGA_diffbind.bed'))
diffbind_file = test_output_path + 'ENCSR757EMK_vs_ENCSR729LGA_diffbind.csv'
elif os.path.isfile(os.path.join(test_output_path, 'ENCSR729LGA_vs_ENCSR757EMK_diffbind.bed')):
assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA_vs_ENCSR757EMK_diffbind.bed'))
diffbind_file = test_output_path + 'ENCSR729LGA_vs_ENCSR757EMK_diffbind.csv'
assert os.path.exists(diffbind_file)
assert utils.count_lines(diffbind_file) == 66201
......@@ -31,17 +31,44 @@ def test_check_update_controls(design_bam):
@pytest.mark.singleend
def test_experiment_qc_singleend():
def test_coverage_singleend():
assert os.path.exists(os.path.join(test_output_path, 'sample_mbs.npz'))
assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.png'))
assert os.path.exists(os.path.join(test_output_path, 'coverage.png'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT_fingerprint.png'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI_fingerprint.png'))
assert os.path.exists(os.path.join(test_output_path, 'coverage.pdf'))
@pytest.mark.singleend
def test_spearman_singleend():
assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.pdf'))
@pytest.mark.singleend
def test_pearson_singleend():
assert os.path.exists(os.path.join(test_output_path, 'heatmap_PearsonCorr.pdf'))
@pytest.mark.singleend
def test_fingerprint_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT_fingerprint.pdf'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI_fingerprint.pdf'))
@pytest.mark.pairdend
def test_experiment_qc_pairedend():
def test_coverage_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'sample_mbs.npz'))
assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.png'))
assert os.path.exists(os.path.join(test_output_path, 'coverage.png'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX_fingerprint.png'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB637LZP_fingerprint.png'))
assert os.path.exists(os.path.join(test_output_path, 'coverage.pdf'))
@pytest.mark.pairdend
def test_spearman_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.pdf'))
@pytest.mark.pairdend
def test_pearson_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'heatmap_PearsonCorr.pdf'))
@pytest.mark.pairdend
def test_fingerprint_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX_fingerprint.pdf'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB637LZP_fingerprint.pdf'))
#!/usr/bin/env python3
import pytest
import os
import utils
import yaml
test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../output/multiqcReport/'
@pytest.mark.singleend
def test_software_references():
assert os.path.exists(os.path.join(test_output_path, 'software_references_mqc.yaml'))
@pytest.mark.singleend
def test_software_references_output():
software_references = os.path.join(test_output_path, 'software_references_mqc.yaml')
with open(software_references, 'r') as stream:
data_loaded = yaml.load(stream)
assert len(data_loaded['data'].split('<dt>')) == 17
#!/usr/bin/env python3
import pytest
import os
import utils
import yaml
test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../output/multiqcReport/'
@pytest.mark.singleend
def test_software_versions():
assert os.path.exists(os.path.join(test_output_path, 'software_versions_mqc.yaml'))
@pytest.mark.singleend
def test_software_versions_output():
software_versions = os.path.join(test_output_path, 'software_versions_mqc.yaml')
with open(software_versions, 'r') as stream:
data_loaded = yaml.load(stream)
assert len(data_loaded['data'].split('<dt>')) == 17
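The 17 in these assertions follows from the 16 tools in the versions table: splitting on '<dt>' yields one piece per tag plus the text before the first tag. A quick check:

parts = ('<dl>' + '<dt>tool</dt>' * 16 + '</dl>').split('<dt>')
assert len(parts) == 17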
......@@ -8,31 +8,49 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../output/filterReads/'
@pytest.mark.singleend
def test_dedup_files_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI/ENCLB831RUI.dedup.bam'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI/ENCLB831RUI.dedup.bam.bai'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI/ENCLB831RUI.dedup.qc'))
@pytest.mark.singleend
def test_map_qc_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bam'))
assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bam.bai'))
filtered_reads_report = test_output_path + 'ENCFF646LXU.filt.nodup.flagstat.qc'
filtered_reads_report = test_output_path + 'ENCLB831RUI/ENCLB831RUI.dedup.flagstat.qc'
samtools_report = open(filtered_reads_report).readlines()
assert '64962570 + 0 in total' in samtools_report[0]
assert '64962570 + 0 mapped (100.00%:N/A)' in samtools_report[4]
library_complexity = test_output_path + 'ENCFF646LXU.filt.nodup.pbc.qc'
@pytest.mark.singleend
def test_library_complexity_singleend():
library_complexity = test_output_path + 'ENCLB831RUI/ENCLB831RUI.pbc.qc'
df_library_complexity = pd.read_csv(library_complexity, sep='\t')
assert df_library_complexity["NRF"].iloc[0] == 0.926192
assert df_library_complexity["PBC1"].iloc[0] == 0.926775
assert df_library_complexity["PBC2"].iloc[0] == 13.706885
@pytest.mark.pairedend
def test_dedup_files_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX/ENCLB568IYX.dedup.bam'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX/ENCLB568IYX.dedup.bam.bai'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX/ENCLB568IYX.dedup.qc'))
@pytest.mark.pairedend
def test_map_qc_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.bam'))
assert os.path.exists(os.path.join(test_output_path, 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.bam.bai'))
filtered_reads_report = test_output_path + 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.flagstat.qc'
filtered_reads_report = test_output_path + 'ENCLB568IYX/ENCLB568IYX.dedup.flagstat.qc'
samtools_report = open(filtered_reads_report).readlines()
assert '47389080 + 0 in total' in samtools_report[0]
assert '47389080 + 0 mapped (100.00%:N/A)' in samtools_report[4]
library_complexity = test_output_path + 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.pbc.qc'
assert '47388510 + 0 in total' in samtools_report[0]
assert '47388510 + 0 mapped (100.00%:N/A)' in samtools_report[4]
@pytest.mark.pairedend
def test_library_complexity_pairedend():
library_complexity = test_output_path + 'ENCLB568IYX/ENCLB568IYX.pbc.qc'
df_library_complexity = pd.read_csv(library_complexity, sep='\t')
assert df_library_complexity["NRF"].iloc[0] == 0.947064
assert df_library_complexity["PBC1"].iloc[0] == 0.946724
assert df_library_complexity["PBC2"].iloc[0] == 18.643039
assert round(df_library_complexity["PBC1"].iloc[0],6) == 0.946723
assert round(df_library_complexity["PBC2"].iloc[0],6) == 18.642645
......@@ -9,8 +9,8 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
@pytest.mark.singleend
def test_map_reads_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.srt.bam'))
aligned_reads_report = test_output_path + 'ENCFF646LXU.srt.bam.flagstat.qc'
assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI/ENCLB831RUI.bam'))
aligned_reads_report = test_output_path + 'ENCLB831RUI/ENCLB831RUI.flagstat.qc'
samtools_report = open(aligned_reads_report).readlines()
assert '80795025 + 0 in total' in samtools_report[0]
assert '80050072 + 0 mapped (99.08% : N/A)' in samtools_report[4]
......@@ -18,8 +18,8 @@ def test_map_reads_singleend():
@pytest.mark.pairedend
def test_map_reads_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCFF002DTU_val_1ENCFF002EFI_val_2.srt.bam'))
aligned_reads_report = test_output_path + 'ENCFF002DTU_val_1ENCFF002EFI_val_2.srt.bam.flagstat.qc'
assert os.path.exists(os.path.join(test_output_path, 'ENCLB678IDC/ENCLB678IDC.bam'))
aligned_reads_report = test_output_path + 'ENCLB678IDC/ENCLB678IDC.flagstat.qc'
samtools_report = open(aligned_reads_report).readlines()
assert '72660890 + 0 in total' in samtools_report[0]
assert '72053925 + 0 mapped (99.16% : N/A)' in samtools_report[4]
......