#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Generate peaks from data.'''
import os
import argparse
import shutil
import subprocess
import logging
import utils
from xcor import xcor as calculate_xcor
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-t', '--tag',
help="The tagAlign file to perform peak calling on.",
required=True)
parser.add_argument('-x', '--xcor',
help="The cross-correlation file (if already calculated).",
required=True)
parser.add_argument('-c', '--con',
help="The control tagAling file used for peak calling.",
required=True)
parser.add_argument('-s', '--sample',
help="The sample id to name the peak files.",
required=True)
parser.add_argument('-g', '--genome',
help="The genome size of reference genome.",
required=True)
parser.add_argument('-z', '--size',
help="The file with chromosome sizes of reference genome.",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
action='store_true')
args = parser.parse_args()
return args
# Functions
def check_tools():
'''Checks for required components on user system'''
logger.info('Checking for required libraries and components on this system')
macs_path = shutil.which("macs2")
if macs_path:
logger.info('Found MACS2: %s', macs_path)
# Get Version
macs_version_command = "macs2 --version"
macs_version = subprocess.check_output(macs_version_command, shell=True, stderr=subprocess.STDOUT)
# Write to file
macs_file = open("version_macs.txt", "wb")
macs_file.write(macs_version)
macs_file.close()
else:
logger.error('Missing MACS2')
raise Exception('Missing MACS2')
bg_bw_path = shutil.which("bedGraphToBigWig")
if bg_bw_path:
logger.info('Found bedGraphToBigWig: %s', bg_bw_path)
# Get Version
bg_bw_version_command = "bedGraphToBigWig"
try:
bg_bw_version = subprocess.check_output(bg_bw_version_command, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
# bedGraphToBigWig prints its usage and exits non-zero when run with no arguments
bg_bw_version = e.output
# Write to file
bg_bw_file = open("version_bedGraphToBigWig.txt", "wb")
bg_bw_file.write(bg_bw_version)
bg_bw_file.close()
else:
logger.error('Missing bedGraphToBigWig')
raise Exception('Missing bedGraphToBigWig')
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes):
'''Call peaks and signal tracks'''
# Extract the fragment length estimate from column 3 of the
# cross-correlation scores file
with open(xcor, 'r') as xcor_fh:
firstline = xcor_fh.readline()
frag_lengths = firstline.split()[2] # third column
frag_lengths_array = frag_lengths.split(',')
fragment_length = 0
fragment = False
# Loop through all values of fragment length
for fragment_length in frag_lengths_array:
logger.info("Fraglen %s", fragment_length)
if int(fragment_length) > 50:
fragment = True
break
if not fragment:
logger.info('Error in cross-correlation analysis: %s', frag_lengths_array)
raise Exception("Error in cross-correlation analysis: %s" % frag_lengths_array)
# Generate narrow peaks and preliminary signal tracks
command = 'macs2 callpeak ' + \
'-t %s -c %s ' % (experiment, control) + \
'-f BED -n %s ' % (prefix) + \
'-g %s -p 1e-2 --nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' % (genome_size, fragment_length)
logger.info(command)
returncode = utils.block_on(command)
logger.info("MACS2 exited with returncode %d", returncode)
assert returncode == 0, "MACS2 non-zero return"
# MACS2 sometimes calls features off the end of chromosomes.
# Remove coordinates outside chromosome sizes
int_narrowpeak_fn = '%s_peaks.narrowPeak' % (prefix)
narrowpeak_fn = '%s.narrowPeak' % (prefix)
clipped_narrowpeak_fn = 'clipped-%s' % (narrowpeak_fn)
steps = ['slopBed -i %s -g %s -b 0' % (int_narrowpeak_fn, chrom_sizes),
'bedClip stdin %s %s' % (chrom_sizes, clipped_narrowpeak_fn)]
out, err = utils.run_pipe(steps)
# Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format
# (score must be <1000)
rescaled_narrowpeak_fn = utils.rescale_scores(
clipped_narrowpeak_fn, scores_col=5)
# Sort by Col8 in descending order and replace long peak names in Column 4
# with Peak_<peakRank>
steps = [
'sort -k 8gr,8gr %s' % (rescaled_narrowpeak_fn),
r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'"""
]
out, err = utils.run_pipe(steps, '%s' % (narrowpeak_fn))
# For fold enrichment signal tracks
# The chromosome sizes file is tab delimited with 2 columns: Col1 (chromosome name),
# Col2 (chromosome size in bp).
command = 'macs2 bdgcmp ' + \
'-t %s_treat_pileup.bdg ' % (prefix) + \
'-c %s_control_lambda.bdg ' % (prefix) + \
'-o %s_FE.bdg ' % (prefix) + \
'-m FE'
logger.info(command)
returncode = utils.block_on(command)
logger.info("MACS2 exited with returncode %d", returncode)
assert returncode == 0, "MACS2 non-zero return"
# Remove coordinates outside chromosome sizes (MACS2 bug)
fc_bedgraph_fn = '%s.fc.signal.bedgraph' % (prefix)
fc_bedgraph_sorted_fn = 'sorted-%s' % (fc_bedgraph_fn)
fc_signal_fn = "%s.fc_signal.bw" % (prefix)
steps = ['slopBed -i %s_FE.bdg -g %s -b 0' % (prefix, chrom_sizes),
'bedClip stdin %s %s' % (chrom_sizes, fc_bedgraph_fn)]
out, err = utils.run_pipe(steps)
# Sort file
out, err = utils.run_pipe([
'bedSort %s %s' % (fc_bedgraph_fn, fc_bedgraph_sorted_fn)])
# Convert bedgraph to bigwig
command = 'bedGraphToBigWig ' + \
'%s ' % (fc_bedgraph_sorted_fn) + \
'%s ' % (chrom_sizes) + \
'%s' % (fc_signal_fn)
logger.info(command)
returncode = utils.block_on(command)
logger.info("bedGraphToBigWig exited with returncode %d", returncode)
assert returncode == 0, "bedGraphToBigWig non-zero return"
# For -log10(p-value) signal tracks
# Compute sval =
# min(no. of reads in ChIP, no. of reads in control) / 1,000,000
out, err = utils.run_pipe(['gzip -dc %s' % (experiment), 'wc -l'])
chip_reads = out.strip()
out, err = utils.run_pipe(['gzip -dc %s' % (control), 'wc -l'])
control_reads = out.strip()
sval = str(min(float(chip_reads), float(control_reads)) / 1000000)
logger.info("chip_reads = %s, control_reads = %s, sval = %s" %
(chip_reads, control_reads, sval))
command = 'macs2 bdgcmp ' + \
'-t %s_treat_pileup.bdg ' % (prefix) + \
'-c %s_control_lambda.bdg ' % (prefix) + \
'-o %s_ppois.bdg ' % (prefix) + \
'-m ppois -S %s' % (sval)
logger.info(command)
returncode = utils.block_on(command)
assert returncode == 0, "MACS2 non-zero return"
# Remove coordinates outside chromosome sizes (MACS2 bug)
pvalue_bedgraph_fn = '%s.pval.signal.bedgraph' % (prefix)
pvalue_bedgraph_sorted_fn = 'sort-%s' % (pvalue_bedgraph_fn)
pvalue_signal_fn = "%s.pvalue_signal.bw" % (prefix)
steps = ['slopBed -i %s_ppois.bdg -g %s -b 0' % (prefix, chrom_sizes),
'bedClip stdin %s %s' % (chrom_sizes, pvalue_bedgraph_fn)]
out, err = utils.run_pipe(steps)
# Sort file
out, err = utils.run_pipe([
'bedSort %s %s' % (pvalue_bedgraph_fn, pvalue_bedgraph_sorted_fn)])
# Convert bedgraph to bigwig
command = 'bedGraphToBigWig ' + \
'%s ' % (pvalue_bedgraph_sorted_fn) + \
'%s ' % (chrom_sizes) + \
'%s' % (pvalue_signal_fn)
logger.info(command)
returncode = utils.block_on(command)
logger.info("bedGraphToBigWig exited with returncode %d", returncode)
assert returncode == 0, "bedGraphToBigWig non-zero return"
# Remove temporary files
os.remove(clipped_narrowpeak_fn)
os.remove(rescaled_narrowpeak_fn)
os.remove(int_narrowpeak_fn)
def main():
args = get_args()
tag = args.tag
xcor = args.xcor
con = args.con
sample = args.sample
genome_size = args.genome
chrom_size = args.size
paired = args.paired
# Create a file handler
handler = logging.FileHandler('call_peaks.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Calculate cross-correlation if not already calculated
if xcor == 'Calculate':
xcor_file = calculate_xcor(tag, paired)
else:
xcor_file = xcor
# Call Peaks using MACS2
call_peaks_macs(tag, xcor_file, con, sample, genome_size, chrom_size)
if __name__ == '__main__':
main()
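For reference, the fragment-length selection above scans the comma-separated estimates in column 3 of the cross-correlation scores file and keeps the first one greater than 50 bp. A minimal standalone sketch of that rule; the input line here is fabricated, not real xcor output:

# Minimal sketch of the fragment-length selection; the line below is fabricated.
xcor_line = "sample.tagAlign.gz\t10000000\t40,185,200"

frag_lengths = xcor_line.split()[2]  # third column, comma-separated estimates
fragment_length = next((f for f in frag_lengths.split(',') if int(f) > 50), None)
if fragment_length is None:
    raise ValueError("Error in cross-correlation analysis: %s" % frag_lengths)
print(fragment_length)  # -> 185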
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Check if design file is correctly formatted and matches files list.'''
import argparse
import logging
import pandas as pd
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-d', '--design',
help="The design file to run QC (tsv format).",
required=True)
parser.add_argument('-f', '--fastq',
help="File with list of fastq files (tsv format).",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
action='store_true')
args = parser.parse_args()
return args
def check_design_headers(design, paired):
'''Check if design file conforms to sequencing type.'''
# Default headers
design_template = [
'sample_id',
'experiment_id',
'biosample',
'factor',
'treatment',
'replicate',
'control_id',
'fastq_read1']
design_headers = list(design.columns.values)
if paired: # paired-end data
design_template.extend(['fastq_read2'])
# Check for missing headers
logger.info("Running header check.")
missing_headers = set(design_template) - set(design_headers)
if len(missing_headers) > 0:
logger.error('Missing column headers: %s', list(missing_headers))
raise Exception("Missing column headers: %s" % list(missing_headers))
def check_samples(design):
'''Check if design file has the correct sample name mapping.'''
logger.info("Running sample check.")
samples = design.groupby('sample_id') \
.apply(list)
malformed_samples = []
chars = set('-.')
for sample in samples.index.values:
if any(char.isspace() for char in sample) or any(char in chars for char in sample):
malformed_samples.append(sample)
if len(malformed_samples) > 0:
logger.error('Malformed samples from design file: %s', list(malformed_samples))
raise Exception("Malformed samples from design file: %s" %
list(malformed_samples))
def check_experiments(design):
'''Check if design file has the correct experiment name mapping.'''
logger.info("Running experiment check.")
experiments = design.groupby('experiment_id') \
.apply(list)
malformed_experiments = []
chars = set('-.')
for experiment in experiments.index.values:
if any(char.isspace() for char in experiment) or any(char in chars for char in experiment):
malformed_experiments.append(experiment)
if len(malformed_experiments) > 0:
logger.error('Malformed experiment from design file: %s', list(malformed_experiments))
raise Exception("Malformed experiment from design file: %s" %
list(malformed_experiments))
def check_controls(design):
'''Check if design file has the correct control mapping.'''
logger.info("Running control check.")
missing_controls = set(design['control_id']) - set(design['sample_id'])
if len(missing_controls) > 0:
logger.error('Missing control experiments: %s', list(missing_controls))
raise Exception("Missing control experiments: %s" %
list(missing_controls))
def check_replicates(design):
'''Check if design file has unique replicate numbers for an experiment.'''
logger.info("Running replicate check.")
experiment_replicates = design.groupby('experiment_id')['replicate'] \
.apply(list)
duplicated_replicates = []
for experiment in experiment_replicates.index.values:
replicates = experiment_replicates[experiment]
unique_replicates = set(replicates)
if len(replicates) != len(unique_replicates):
duplicated_replicates.append(experiment)
if len(duplicated_replicates) > 0:
logger.error('Duplicate replicates in experiments: %s', list(duplicated_replicates))
raise Exception("Duplicate replicates in experiments: %s" %
list(duplicated_replicates))
def check_files(design, fastq, paired):
'''Check if design file has the files found.'''
logger.info("Running file check.")
if paired: # paired-end data
files = list(design['fastq_read1']) + list(design['fastq_read2'])
else: # single-end data
files = design['fastq_read1']
files_found = fastq['name']
missing_files = set(files) - set(files_found)
if len(missing_files) > 0:
logger.error('Missing files from design file: %s', list(missing_files))
raise Exception("Missing files from design file: %s" %
list(missing_files))
else:
file_dict = fastq.set_index('name').T.to_dict()
design['fastq_read1'] = design['fastq_read1'] \
.apply(lambda x: file_dict[x]['path'])
if paired: # paired-end data
design['fastq_read2'] = design['fastq_read2'] \
.apply(lambda x: file_dict[x]['path'])
return design
def main():
args = get_args()
design = args.design
fastq = args.fastq
paired = args.paired
# Create a file handler
handler = logging.FileHandler('design.log')
logger.addHandler(handler)
# Read files as dataframes
design_df = pd.read_csv(design, sep='\t')
fastq_df = pd.read_csv(fastq, sep='\t', names=['name', 'path'])
# Check design file
check_design_headers(design_df, paired)
check_controls(design_df)
check_replicates(design_df)
new_design_df = check_files(design_df, fastq_df, paired)
# Write out new design file
new_design_df.to_csv('design.tsv', header=True, sep='\t', index=False)
if __name__ == '__main__':
main()
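A small sketch of the replicate-uniqueness rule enforced by check_replicates(), run on an in-memory toy design table rather than a real design file:

import pandas as pd

# Toy design; experiment 'B' deliberately repeats replicate 1.
design = pd.DataFrame({
    'experiment_id': ['A', 'A', 'B', 'B'],
    'replicate': [1, 2, 1, 1]})

replicates = design.groupby('experiment_id')['replicate'].apply(list)
duplicated = [exp for exp, reps in replicates.items() if len(reps) != len(set(reps))]
print(duplicated)  # -> ['B']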
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Convert tagAlign files for further processing.'''
import os
import argparse
import shutil
import subprocess
import shlex
import logging
from multiprocessing import cpu_count
import utils
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-b', '--bam',
help="The bam file to convert.",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
action='store_true')
args = parser.parse_args()
return args
# Functions
def check_tools():
'''Checks for required components on user system'''
logger.info('Checking for required libraries and components on this system')
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
samtools_path = shutil.which("samtools")
if samtools_path:
logger.info('Found samtools: %s', samtools_path)
# Get Version
samtools_version_command = "samtools --version"
samtools_version = subprocess.check_output(samtools_version_command, shell=True)
# Write to file
samtools_file = open("version_samtools.txt", "wb")
samtools_file.write(samtools_version)
samtools_file.close()
else:
logger.error('Missing samtools')
raise Exception('Missing samtools')
def convert_mapped(bam, tag_filename):
'''Use bedtools to convert to tagAlign.'''
out, err = utils.run_pipe([
"bamToBed -i %s" % (bam),
r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""",
"gzip -nc"], outfile=tag_filename)
def convert_mapped_pe(bam, bam_basename):
'''Use bedtools to convert to tagAlign PE data.'''
bedpe_filename = bam_basename + ".bedpe.gz"
# Name sort bam to make BEDPE
nmsrt_bam_filename = bam_basename + ".nmsrt.bam"
samtools_sort_command = \
"samtools sort -n -@%d -o %s %s" \
% (cpu_count(), nmsrt_bam_filename, bam)
logger.info(samtools_sort_command)
subprocess.check_output(shlex.split(samtools_sort_command))
out, err = utils.run_pipe([
"bamToBed -bedpe -mate1 -i %s" % (nmsrt_bam_filename),
"gzip -nc"], outfile=bedpe_filename)
def main():
args = get_args()
paired = args.paired
bam = args.bam
# Create a file handler
handler = logging.FileHandler('convert_reads.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Convert PE or SE to tagAlign
bam_basename = os.path.basename(
utils.strip_extensions(bam, ['.bam', '.dedup']))
tag_filename = bam_basename + '.tagAlign.gz'
convert_mapped(bam, tag_filename)
if paired: # paired-end data
convert_mapped_pe(bam, bam_basename)
else:
bedse_filename = bam_basename + ".bedse.gz"
shutil.copy(tag_filename, bedse_filename)
if __name__ == '__main__':
main()
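The awk step inside convert_mapped() blanks the read name and fixes the score, which is what turns BED6 records into tagAlign records. A hedged sketch of the same transform on one fabricated BED line:

# Fabricated BED6 line as bamToBed might emit it (tab-separated).
bed_line = "chr1\t100\t136\tread_1/1\t37\t+"

fields = bed_line.split('\t')
fields[3] = 'N'     # tagAlign drops the read name
fields[4] = '1000'  # fixed score, matching the awk above
print('\t'.join(fields))  # -> chr1  100  136  N  1000  +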
#!/bin/Rscript
#*
#* --------------------------------------------------------------------------
#* Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
#* --------------------------------------------------------------------------
#*
# Load libraries
library("DiffBind")
# Create parser object
args <- commandArgs(trailingOnly=TRUE)
# Check input args
if (length(args) != 1) {
stop("Usage: diff_peaks.R annotate_design.tsv ", call.=FALSE)
}
# Output version of DiffBind
diffbind_version = packageVersion('DiffBind')
write.table(paste("Version", diffbind_version), file = "version_DiffBind.txt", sep = "\t",
row.names = FALSE, col.names = FALSE)
# Build DBA object from design file
data <- dba(sampleSheet=args[1])
data <- dba.count(data)
data <- dba.contrast(data, minMembers = 2, categories=DBA_CONDITION)
data <- dba.analyze(data)
# Draw figures
pdf("heatmap.pdf")
plot(data)
dev.off()
pdf("diffbind.samples.pca.pdf")
pdf("pca.pdf")
dba.plotPCA(data, DBA_TISSUE, label=DBA_CONDITION)
dev.off()
# Save peak reads count
normcount <- dba.peakset(data, bRetrieve=T)
write.table(as.data.frame(normcount),"diffbind.normcount.txt",sep="\t",quote=F,row.names=F)
write.table(as.data.frame(normcount),"normcount_peaksets.txt",sep="\t",quote=F,row.names=F)
# Retrieving the differentially bound sites
# Make new design file for peakAnnotation at the same time
new_SampleID = c()
new_Peaks = c()
for (i in c(1:length(data$contrasts))) {
contrast_bed_name = paste(data$contrasts[[i]]$name1,"vs",
data$contrasts[[i]]$name2,"diffbind.bed",sep="_")
contrast_name = paste(data$contrasts[[i]]$name1,"vs",
data$contrasts[[i]]$name2,"diffbind.csv",sep="_")
new_SampleID = c(new_SampleID,paste(data$contrasts[[i]]$name1,"vs",data$contrasts[[i]]$name2,sep="_"))
new_Peaks = c(new_Peaks, contrast_bed_name)
report <- dba.report(data, contrast=i, th=1, bCount=TRUE)
report <- as.data.frame(report)
print(head(report))
colnames(report)[1:5]<-c("chrom","peak_start","peak_stop","peak_width","peak_strand")
#print(head(report))
write.table(report,contrast_name,sep="\t",quote=F,row.names=F)
write.table(report[,c(1:3)],contrast_bed_name,sep="\t",quote=F,row.names=F, col.names=F)
}
# Write new design file
newdesign = data.frame(SampleID=new_SampleID, Peaks=new_Peaks)
write.csv(newdesign,"diffpeak.design",row.names=F,quote=F)
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Generate experiment design files for downstream processing.'''
import argparse
import logging
import pandas as pd
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-d', '--design',
help="The design file to make experiemnts (tsv format).",
required=True)
args = parser.parse_args()
return args
def update_controls(design):
'''Update design file to append controls list.'''
logger.info("Running control file update.")
file_dict = design[['sample_id', 'tag_align']] \
.set_index('sample_id').T.to_dict()
design['control_tag_align'] = design['control_id'] \
.apply(lambda x: file_dict[x]['tag_align'])
logger.info("Removing rows that are there own control.")
design = design[design['control_id'] != design['sample_id']]
return design
def make_experiment_design(design):
'''Make design file by grouping for each experiment'''
logger.info("Running experiment design generation.")
for experiment, df_experiment in design.groupby('experiment_id'):
experiment_file = experiment + '.tsv'
df_experiment.to_csv(experiment_file, header=True, sep='\t', index=False)
def main():
args = get_args()
design = args.design
# Create a file handler
handler = logging.FileHandler('experiment_generation.log')
logger.addHandler(handler)
# Read files as dataframes
design_df = pd.read_csv(design, sep='\t')
# Update design file for check_controls
new_design_df = update_controls(design_df)
# write out experiment design files
make_experiment_design(new_design_df)
if __name__ == '__main__':
main()
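A compact sketch of what update_controls() does to a design table: look up each row's control tagAlign path by control_id, then drop rows that are their own control (toy data):

import pandas as pd

# Toy design: 'input1' is its own control and should be dropped.
design = pd.DataFrame({
    'sample_id': ['chip1', 'input1'],
    'control_id': ['input1', 'input1'],
    'tag_align': ['chip1.tagAlign.gz', 'input1.tagAlign.gz']})

paths = design.set_index('sample_id')['tag_align'].to_dict()
design['control_tag_align'] = design['control_id'].map(paths)
design = design[design['control_id'] != design['sample_id']]
print(design[['sample_id', 'control_tag_align']])
# -> chip1  input1.tagAlign.gz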
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Experiment correlation and enrichment of reads over genome-wide bins.'''
import argparse
import logging
import subprocess
import shutil
from multiprocessing import cpu_count
import pandas as pd
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-d', '--design',
help="The design file to run QC (tsv format).",
required=True)
parser.add_argument('-e', '--extension',
help="Number of base pairs to extend the reads",
type=int,
required=True)
args = parser.parse_args()
return args
def check_tools():
'''Checks for required components on user system.'''
logger.info('Checking for required libraries and components on this system')
deeptools_path = shutil.which("deeptools")
if deeptools_path:
logger.info('Found deeptools: %s', deeptools_path)
# Get Version
deeptools_version_command = "deeptools --version"
deeptools_version = subprocess.check_output(deeptools_version_command, shell=True, stderr=subprocess.STDOUT)
# Write to file
deeptools_file = open("version_deeptools.txt", "wb")
deeptools_file.write(deeptools_version)
deeptools_file.close()
else:
logger.error('Missing deeptools')
raise Exception('Missing deeptools')
def generate_read_summary(design, extension):
'''Generate summary of data based on read counts.'''
bam_files = ' '.join(design['bam_reads'])
labels = ' '.join(design['sample_id'])
mbs_filename = 'sample_mbs.npz'
mbs_command = (
"multiBamSummary bins -p %d --bamfiles %s --extendReads %d --labels %s -out %s"
% (cpu_count(), bam_files, extension, labels, mbs_filename)
)
logger.info("Running deeptools with %s", mbs_command)
read_summary = subprocess.Popen(mbs_command, shell=True)
out, err = read_summary.communicate()
return mbs_filename
def check_spearman_correlation(mbs):
'''Check Spearman pairwise correlation of samples based on read counts.'''
spearman_filename = 'heatmap_SpearmanCorr.pdf'
spearman_params = "--corMethod spearman --skipZero" + \
" --plotTitle \"Spearman Correlation of Read Counts\"" + \
" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers"
spearman_command = (
"plotCorrelation -in %s %s -o %s"
% (mbs, spearman_params, spearman_filename)
)
logger.info("Running deeptools with %s", spearman_command)
spearman_correlation = subprocess.Popen(spearman_command, shell=True)
out, err = spearman_correlation.communicate()
def check_pearson_correlation(mbs):
'''Check Pearson pairwise correlation of samples based on read counts.'''
pearson_filename = 'heatmap_PearsonCorr.pdf'
pearson_params = "--corMethod pearson --skipZero" + \
" --plotTitle \"Pearson Correlation of Read Counts\"" + \
" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers"
pearson_command = (
"plotCorrelation -in %s %s -o %s"
% (mbs, pearson_params, pearson_filename)
)
logger.info("Running deeptools with %s", pearson_command)
pearson_correlation = subprocess.Popen(pearson_command, shell=True)
out, err = pearson_correlation.communicate()
def check_coverage(design, extension):
'''Assess the sequencing depth of samples.'''
bam_files = ' '.join(design['bam_reads'])
labels = ' '.join(design['sample_id'])
coverage_filename = 'coverage.pdf'
coverage_params = "-n 1000000 --plotTitle \"Sample Coverage\"" + \
" --ignoreDuplicates --minMappingQuality 10"
coverage_command = (
"plotCoverage -b %s --extendReads %d --labels %s %s --plotFile %s"
% (bam_files, extension, labels, coverage_params, coverage_filename)
)
logger.info("Running deeptools with %s", coverage_command)
coverage_summary = subprocess.Popen(coverage_command, shell=True)
out, err = coverage_summary.communicate()
def update_controls(design):
'''Update design file to append controls list.'''
logger.info("Running control file update.")
file_dict = design[['sample_id', 'bam_reads']] \
.set_index('sample_id').T.to_dict()
design['control_reads'] = design['control_id'] \
.apply(lambda x: file_dict[x]['bam_reads'])
logger.info("Removing rows that are there own control.")
design = design[design['control_id'] != design['sample_id']]
return design
def check_enrichment(sample_id, control_id, sample_reads, control_reads, extension):
'''Assess the enrichment per sample.'''
fingerprint_filename = sample_id + '_fingerprint.pdf'
fingerprint_command = (
"plotFingerprint -b %s %s --extendReads %d --labels %s %s --plotFile %s"
% (sample_reads, control_reads, extension, sample_id, control_id, fingerprint_filename)
)
logger.info("Running deeptools with %s", fingerprint_command)
fingerprint_summary = subprocess.Popen(fingerprint_command, shell=True)
out, err = fingerprint_summary.communicate()
def main():
args = get_args()
design = args.design
extension = args.extension
# Create a file handler
handler = logging.FileHandler('experiment_qc.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Read files
design_df = pd.read_csv(design, sep='\t')
# Run correlation
mbs_filename = generate_read_summary(design_df, extension)
check_spearman_correlation(mbs_filename)
check_pearson_correlation(mbs_filename)
# Run coverage
check_coverage(design_df, extension)
# Run enrichment
new_design_df = update_controls(design_df)
for index, row in new_design_df.iterrows():
check_enrichment(
row['sample_id'],
row['control_id'],
row['bam_reads'],
row['control_reads'],
extension)
if __name__ == '__main__':
main()
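One caveat with the QC functions above: subprocess.Popen(...).communicate() discards the exit status, so a failed deepTools run passes silently. A hedged alternative sketch using subprocess.run (capture_output requires Python 3.7+); the command shown is only a harmless stand-in for the plot commands above:

import subprocess

# Stand-in deepTools invocation; any of the plot commands above fits this pattern.
result = subprocess.run("plotCoverage --version", shell=True,
                        capture_output=True, text=True)
if result.returncode != 0:
    raise RuntimeError("deepTools call failed: %s" % result.stderr)
print(result.stdout.strip())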
#!/usr/bin/env python
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Make header for HTML of references.'''
import argparse
import subprocess
import shlex
import logging
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-r', '--reference',
help="The reference file (markdown format).",
required=True)
parser.add_argument('-o', '--output',
help="The out file name.",
default='references')
args = parser.parse_args()
return args
def main():
args = get_args()
reference = args.reference
output = args.output
out_filename = output + '_mqc.yaml'
# Header for HTML
print('''
id: 'Software References'
section_name: 'Software References'
description: 'This section describes references for the tools used.'
plot_type: 'html'
data: |
'''
, file = open(out_filename, "w")
)
# Turn Markdown into HTML
references_html = 'bash -c "pandoc -p {} | sed \'s/^/ /\' >> {}"'
references_html = references_html.format(reference, out_filename)
subprocess.check_call(shlex.split(references_html))
if __name__ == '__main__':
main()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Make YAML of software versions.'''
from __future__ import print_function
from collections import OrderedDict
import re
import os
import logging
import glob
import argparse
import numpy as np
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
SOFTWARE_REGEX = {
'Nextflow': ['version_nextflow.txt', r"(\S+)"],
'Trim Galore!': ['trimReads_vf/version_trimgalore.txt', r"version (\S+)"],
'Cutadapt': ['trimReads_vf/version_cutadapt.txt', r"Version (\S+)"],
'BWA': ['alignReads_vf/version_bwa.txt', r"Version: (\S+)"],
'Samtools': ['alignReads_vf/version_samtools.txt', r"samtools (\S+)"],
'Sambamba': ['filterReads_vf/version_sambamba.txt', r"sambamba (\S+)"],
'BEDTools': ['convertReads_vf/version_bedtools.txt', r"bedtools v(\S+)"],
'R': ['crossReads_vf/version_r.txt', r"R version (\S+)"],
'SPP': ['crossReads_vf/version_spp.txt', r"\[1\] ‘(1.14)’"],
'MACS2': ['callPeaksMACS_vf/version_macs.txt', r"macs2 (\S+)"],
'bedGraphToBigWig': ['callPeaksMACS_vf/version_bedGraphToBigWig.txt', r"bedGraphToBigWig v (\S+)"],
'ChIPseeker': ['peakAnnotation_vf/version_ChIPseeker.txt', r"Version (\S+)\""],
'MEME-ChIP': ['motifSearch_vf/version_memechip.txt', r"Version (\S+)"],
'DiffBind': ['diffPeaks_vf/version_DiffBind.txt', r"Version (\S+)\""],
'deepTools': ['experimentQC_vf/version_deeptools.txt', r"deeptools (\S+)"],
'Python': ['version_python.txt', r"Python (\S+)"],
'MultiQC': ['version_multiqc.txt', r"multiqc, version (\S+)"],
}
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-o', '--output',
help="The out file name.",
required=True)
parser.add_argument('-t', '--test',
help='Used for testing purposes',
default=False,
action='store_true')
args = parser.parse_args()
return args
def check_files(files, test):
'''Check if version files are found.'''
logger.info("Running file check.")
software_files = np.array(list(SOFTWARE_REGEX.values()))[:,0]
extra_files = set(files) - set(software_files)
if len(extra_files) > 0 and test:
logger.error('Missing regex: %s', list(extra_files))
raise Exception("Missing regex: %s" % list(extra_files))
def main():
args = get_args()
output = args.output
test = args.test
out_filename = output + '_mqc.yaml'
results = OrderedDict()
results['Nextflow'] = '<span style="color:#999999;\">Not Run</span>'
results['Trim Galore!'] = '<span style="color:#999999;\">Not Run</span>'
results['Cutadapt'] = '<span style="color:#999999;\">Not Run</span>'
results['BWA'] = '<span style="color:#999999;\">Not Run</span>'
results['Samtools'] = '<span style="color:#999999;\">Not Run</span>'
results['Sambamba'] = '<span style="color:#999999;\">Not Run</span>'
results['BEDTools'] = '<span style="color:#999999;\">Not Run</span>'
results['R'] = '<span style="color:#999999;\">Not Run</span>'
results['SPP'] = '<span style="color:#999999;\">Not Run</span>'
results['MACS2'] = '<span style="color:#999999;\">Not Run</span>'
results['bedGraphToBigWig'] = '<span style="color:#999999;\">Not Run</span>'
results['ChIPseeker'] = '<span style="color:#999999;\">Not Run</span>'
results['MEME-ChIP'] = '<span style="color:#999999;\">Not Run</span>'
results['DiffBind'] = '<span style="color:#999999;\">Not Run</span>'
results['deepTools'] = '<span style="color:#999999;\">Not Run</span>'
results['MultiQC'] = '<span style="color:#999999;\">Not Run</span>'
results['Python'] = '<span style="color:#999999;\">Not Run</span>'
# list all files
files = glob.glob('**/*.txt', recursive=True)
# Check for version files:
check_files(files, test)
# Search each file using its regex
for k, v in SOFTWARE_REGEX.items():
if os.path.isfile(v[0]):
with open(v[0]) as x:
versions = x.read()
match = re.search(v[1], versions)
if match:
results[k] = "v{}".format(match.group(1))
# Dump to YAML
print(
'''
id: 'Software Versions'
section_name: 'Software Versions'
section_href: 'https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/'
plot_type: 'html'
description: 'are collected at run time from the software output.'
data: |
<dl class="dl-horizontal">
'''
, file = open(out_filename, "w"))
for k, v in results.items():
print(" <dt>{}</dt><dd>{}</dd>".format(k, v), file = open(out_filename, "a"))
print(" </dl>", file = open(out_filename, "a"))
if __name__ == '__main__':
main()
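To see how the SOFTWARE_REGEX table above is meant to behave, here is a tiny sketch that applies one registered pattern to a fabricated samtools version banner:

import re

# Fabricated version output; the pattern is the one registered for Samtools above.
versions = "samtools 1.9\nUsing htslib 1.9"
match = re.search(r"samtools (\S+)", versions)
if match:
    print("v{}".format(match.group(1)))  # -> v1.9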
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Remove duplicates and filter unmapped reads.'''
import os
import subprocess
import argparse
import shutil
import shlex
import logging
from multiprocessing import cpu_count
import utils
import pandas as pd
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.bam', '.srt']
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-b', '--bam',
help="The bam file to run filtering and qc on.",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
action='store_true')
args = parser.parse_args()
return args
# Functions
def check_tools():
'''Checks for required components on user system'''
logger.info('Checking for required libraries and components on this system')
samtools_path = shutil.which("samtools")
if samtools_path:
logger.info('Found samtools: %s', samtools_path)
# Get Version
samtools_version_command = "samtools --version"
samtools_version = subprocess.check_output(samtools_version_command, shell=True)
# Write to file
samtools_file = open("version_samtools.txt", "wb")
samtools_file.write(samtools_version)
samtools_file.close()
else:
logger.error('Missing samtools')
raise Exception('Missing samtools')
sambamba_path = shutil.which("sambamba")
if sambamba_path:
logger.info('Found sambamba: %s', sambamba_path)
# Get Version
sambamba_version_command = "sambamba"
try:
sambamba_version = subprocess.check_output(sambamba_version_command, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
# sambamba prints its usage and exits non-zero when run with no arguments
sambamba_version = e.output
# Write to file
sambamba_file = open("version_sambamba.txt", "wb")
sambamba_file.write(sambamba_version)
sambamba_file.close()
else:
logger.error('Missing sambamba')
raise Exception('Missing sambamba')
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
def filter_mapped_pe(bam, bam_basename):
'''Use samtools to filter unmapped reads for PE data.'''
filt_bam_prefix = bam_basename + ".filt.srt"
filt_bam_filename = filt_bam_prefix + ".bam"
tmp_filt_bam_prefix = "tmp.%s" % (filt_bam_prefix)
tmp_filt_bam_filename = tmp_filt_bam_prefix + ".bam"
# Remove unmapped, mate unmapped
# not primary alignment, reads failing platform
# Remove low MAPQ reads
# Only keep properly paired reads
# Obtain name sorted BAM file
out, err = utils.run_pipe([
# filter: -F 1804 FLAG bits to exclude; -f 2 FLAG bits to require;
# -q 30 exclude MAPQ < 30; -u uncompressed output
# exclude FLAG 1804: unmapped, next segment unmapped, secondary
# alignments, not passing platform q, PCR or optical duplicates
# require FLAG 2: properly aligned
"samtools view -F 1804 -f 2 -q 30 -u %s" % (bam),
# sort: -n sort by name; - take input from stdin;
# out to specified filename
# Will produce name sorted BAM
"samtools sort -n -@ %d -o %s" % (cpu_count(), tmp_filt_bam_filename)])
if err:
logger.error("samtools filter error: %s", err)
# Remove orphan reads (pair was removed)
# and read pairs mapping to different chromosomes
# Obtain position sorted BAM
out, err = utils.run_pipe([
# fill in mate coordinates, ISIZE and mate-related flags
# fixmate requires name-sorted alignment; -r removes secondary and
# unmapped (redundant here because already done above?)
# - send output to stdout
"samtools fixmate -r %s -" % (tmp_filt_bam_filename),
# repeat filtering after mate repair
"samtools view -F 1804 -f 2 -u -",
# produce the coordinate-sorted BAM
"samtools sort -@ %d -o %s" % (cpu_count(), filt_bam_filename)])
os.remove(tmp_filt_bam_filename)
return filt_bam_filename
def filter_mapped_se(bam, bam_basename):
'''Use samtools to filter unmapped reads for SE data.'''
filt_bam_prefix = bam_basename + ".filt.srt"
filt_bam_filename = filt_bam_prefix + ".bam"
# Remove unmapped, mate unmapped
# not primary alignment, reads failing platform
# Remove low MAPQ reads
# Obtain name sorted BAM file
with open(filt_bam_filename, 'w') as temp_file:
samtools_filter_command = (
"samtools view -F 1804 -q 30 -b %s"
% (bam)
)
logger.info(samtools_filter_command)
subprocess.check_call(
shlex.split(samtools_filter_command),
stdout=temp_file)
return filt_bam_filename
def dedup_mapped(bam, bam_basename, paired):
'''Use sambamba and samtools to remove duplicates.'''
# Markduplicates
dup_file_qc_filename = bam_basename + ".dedup.qc"
tmp_dup_mark_filename = bam_basename + ".dupmark.bam"
sambamba_params = "--hash-table-size=17592186044416" + \
" --overflow-list-size=20000000 --io-buffer-size=256"
with open(dup_file_qc_filename, 'w') as temp_file:
sambamba_markdup_command = (
"sambamba markdup -t %d %s --tmpdir=%s %s %s"
% (cpu_count(), sambamba_params, os.getcwd(), bam, tmp_dup_mark_filename)
)
logger.info(sambamba_markdup_command)
subprocess.check_call(
shlex.split(sambamba_markdup_command),
stderr=temp_file)
# Remove duplicates
final_bam_prefix = bam_basename + ".dedup"
final_bam_filename = final_bam_prefix + ".bam"
if paired: # paired-end data
samtools_dedupe_command = \
"samtools view -F 1804 -f 2 -b %s" % (tmp_dup_mark_filename)
else:
samtools_dedupe_command = \
"samtools view -F 1804 -b %s" % (tmp_dup_mark_filename)
with open(final_bam_filename, 'w') as temp_file:
logger.info(samtools_dedupe_command)
subprocess.check_call(
shlex.split(samtools_dedupe_command),
stdout=temp_file)
# Index final bam file
samtools_index_command = \
"samtools index -@ %d %s" % (cpu_count(), final_bam_filename)
logger.info(samtools_index_command)
subprocess.check_output(shlex.split(samtools_index_command))
# Generate mapping statistics
mapstats_filename = final_bam_prefix + ".flagstat.qc"
with open(mapstats_filename, 'w') as temp_file:
flagstat_command = "sambamba flagstat -t %d %s" \
% (cpu_count(), final_bam_filename)
logger.info(flagstat_command)
subprocess.check_call(shlex.split(flagstat_command), stdout=temp_file)
os.remove(bam)
return tmp_dup_mark_filename
def compute_complexity(bam, paired, bam_basename):
'''Calculate library complexity.'''
pbc_file_qc_filename = bam_basename + ".pbc.qc"
tmp_pbc_file_qc_filename = "tmp.%s" % (pbc_file_qc_filename)
# Sort by name
# convert to bedPE and obtain fragment coordinates
# sort by position and strand
# Obtain unique count statistics
# PBC File output
# Sample Name[tab]
# TotalReadPairs [tab]
# DistinctReadPairs [tab]
# OneReadPair [tab]
# TwoReadPairs [tab]
# NRF=Distinct/Total [tab]
# PBC1=OnePair/Distinct [tab]
# PBC2=OnePair/TwoPair
pbc_headers = ['TotalReadPairs',
'DistinctReadPairs',
'OneReadPair',
'TwoReadPairs',
'NRF',
'PBC1',
'PBC2']
if paired:
steps = [
"samtools sort -@%d -n %s" % (cpu_count(), bam),
"bamToBed -bedpe -i stdin",
r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'"""]
else:
steps = [
"bamToBed -i %s" % (bam),
r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'"""]
steps.extend([
"grep -v 'chrM'",
"sort",
"uniq -c",
r"""awk 'BEGIN{mt=0;m0=0;m1=0;m2=0} ($1==1){m1=m1+1} ($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} END{printf "%d\t%d\t%d\t%d\t%f\t%f\t%f\n",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}'"""
])
out, err = utils.run_pipe(steps, tmp_pbc_file_qc_filename)
if err:
logger.error("PBC file error: %s", err)
# Add Sample Name and headers
pbc_file = pd.read_csv(tmp_pbc_file_qc_filename, sep='\t', header=None,
names=pbc_headers)
pbc_file['Sample'] = bam_basename
pbc_headers_new = list(pbc_file)
pbc_headers_new.insert(0, pbc_headers_new.pop(pbc_headers_new.index('Sample')))
pbc_file = pbc_file[pbc_headers_new]
pbc_file.to_csv(pbc_file_qc_filename, header=True, sep='\t', index=False)
os.remove(bam)
os.remove(bam + '.bai')
os.remove(tmp_pbc_file_qc_filename)
def main():
args = get_args()
paired = args.paired
bam = args.bam
# Create a file handler
handler = logging.FileHandler('map_qc.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Run filtering for either PE or SE
bam_basename = os.path.basename(
utils.strip_extensions(bam, STRIP_EXTENSIONS))
if paired: # paired-end data
filter_bam_filename = filter_mapped_pe(bam, bam_basename)
else:
filter_bam_filename = filter_mapped_se(bam, bam_basename)
# Remove duplicates
markdup_bam_filename = dedup_mapped(filter_bam_filename, bam_basename, paired)
# Compute library complexity
compute_complexity(markdup_bam_filename, paired, bam_basename)
if __name__ == '__main__':
main()
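The awk one-liner in compute_complexity() counts, over distinct read positions, the totals behind the PBC metrics: NRF = distinct/total, PBC1 = once/distinct, PBC2 = once/twice. A worked sketch of the same arithmetic on toy positions:

from collections import Counter

# Toy read positions (chrom, start, strand); the duplicate mimics a PCR copy.
positions = [('chr1', 100, '+'), ('chr1', 100, '+'),
             ('chr1', 200, '+'), ('chr2', 50, '-')]

counts = Counter(positions)
mt = sum(counts.values())                        # total reads
m0 = len(counts)                                 # distinct positions
m1 = sum(1 for c in counts.values() if c == 1)   # positions seen once
m2 = sum(1 for c in counts.values() if c == 2)   # positions seen twice
print("NRF=%.2f PBC1=%.2f PBC2=%.2f" % (m0 / mt, m1 / m0, m1 / m2))
# -> NRF=0.75 PBC1=0.67 PBC2=2.00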
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Align reads to reference genome.'''
import os
import subprocess
import argparse
import shutil
import shlex
import logging
from multiprocessing import cpu_count
import utils
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.gz', '.fq', '.fastq', '_trimmed']
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-f', '--fastq',
help="The fastq file to run triming on.",
nargs='+',
required=True)
parser.add_argument('-r', '--reference',
help="The bwa index of the reference genome.",
required=True)
parser.add_argument('-s', '--sample',
help="The name of the sample.",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
action='store_true')
args = parser.parse_args()
return args
# Functions
def check_tools():
'''Checks for required components on user system'''
logger.info('Checking for required libraries and components on this system')
bwa_path = shutil.which("bwa")
if bwa_path:
logger.info('Found bwa: %s', bwa_path)
# Get Version
bwa_version_command = "bwa"
try:
bwa_version = subprocess.check_output(bwa_version_command, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
# bwa prints its usage and exits non-zero when run with no arguments
bwa_version = e.output
# Write to file
bwa_file = open("version_bwa.txt", "wb")
bwa_file.write(bwa_version)
bwa_file.close()
else:
logger.error('Missing bwa')
raise Exception('Missing bwa')
samtools_path = shutil.which("samtools")
if samtools_path:
logger.info('Found samtools: %s', samtools_path)
# Get Version
samtools_version_command = "samtools --version"
samtools_version = subprocess.check_output(samtools_version_command, shell=True)
# Write to file
samtools_file = open("version_samtools.txt", "wb")
samtools_file.write(samtools_version)
samtools_file.close()
else:
logger.error('Missing samtools')
raise Exception('Missing samtools')
def generate_sa(fastq, reference):
'''Use BWA to generate Suffix Arrays.'''
fastq_basename = os.path.basename(utils.strip_extensions(fastq, STRIP_EXTENSIONS))
bwa_aln_params = '-q 5 -l 32 -k 2'
sai = '%s.sai' % (fastq_basename)
with open(sai, 'w') as sai_file:
bwa_command = "bwa aln %s -t %d %s %s" \
% (bwa_aln_params, cpu_count(),
reference, fastq)
logger.info("Running bwa with %s", bwa_command)
subprocess.check_call(shlex.split(bwa_command), stdout=sai_file)
return sai
def align_se(fastq, sai, reference, fastq_basename):
'''Use BWA to align SE data.'''
bam_filename = '%s.bam' % (fastq_basename)
steps = [
"bwa samse %s %s %s"
% (reference, sai[0], fastq[0]),
"samtools view -@%d -Su -" % (cpu_count()),
"samtools sort -@%d -o %s"
% (cpu_count(), bam_filename)]
out, err = utils.run_pipe(steps)
if err:
logger.error("samse/samtools error: %s", err)
return bam_filename
def align_pe(fastq, sai, reference, fastq_basename):
'''Use BWA to align PE data.'''
sam_filename = "%s.sam" % (fastq_basename)
badcigar_filename = "%s.badReads" % (fastq_basename)
bam_filename = '%s.bam' % (fastq_basename)
# Remove read pairs with bad CIGAR strings and sort by position
steps = [
"bwa sampe -P %s %s %s %s %s"
% (reference, sai[0], sai[1],
fastq[0], fastq[1]),
"tee %s" % (sam_filename),
r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
"sort",
"uniq"]
out, err = utils.run_pipe(steps, badcigar_filename)
if err:
logger.error("sampe error: %s", err)
steps = [
"cat %s" % (sam_filename),
"grep -v -F -f %s" % (badcigar_filename),
"samtools view -@%d -Su -" % (cpu_count()),
"samtools sort -@%d -o %s"
% (cpu_count(), bam_filename)]
out, err = utils.run_pipe(steps)
if err:
logger.error("samtools error: %s", err)
return bam_filename
def main():
args = get_args()
paired = args.paired
fastq = args.fastq
reference = args.reference
sample = args.sample
# Create a file handler
handler = logging.FileHandler('map.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Run Suffix Array generation
sai = []
for fastq_file in fastq:
sai_filename = generate_sa(fastq_file, reference)
sai.append(sai_filename)
# Make file basename
fastq_basename = sample
# Run alignment for either PE or SE
if paired: # paired-end data
bam_filename = align_pe(fastq, sai, reference, fastq_basename)
else:
bam_filename = align_se(fastq, sai, reference, fastq_basename)
bam_mapstats_filename = '%s.flagstat.qc' % (fastq_basename)
with open(bam_mapstats_filename, 'w') as temp_file:
subprocess.check_call(
shlex.split("samtools flagstat %s" % (bam_filename)),
stdout=temp_file)
# Genome/bad fastq file check: fail if the mapped-read percentage is too low
file_check = open(bam_mapstats_filename).readlines()
percent = file_check[4].split('(')[1]
percent = percent.split('%')[0]
if float(percent) < 10:
raise Exception('Mapped reads too low: check for correct reference genome')
# Remove sai files
for sai_file in sai:
os.remove(sai_file)
if __name__ == '__main__':
main()
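The awk filter in align_pe() marks a read as bad when its CIGAR operation lengths, after removing deletions, do not sum to the sequence length. The same test in plain Python on fabricated CIGAR strings:

import re

def cigar_matches_seqlen(cigar, seqlen):
    """Mirror the awk check: drop D operations, then sum the operation lengths."""
    cigar = re.sub(r"[0-9]+D", "", cigar)
    return sum(int(n) for n in re.findall(r"([0-9]+)[A-Z]", cigar)) == seqlen

print(cigar_matches_seqlen("20M5D30M", 50))  # True: deletions consume no read bases
print(cigar_matches_seqlen("20M5I30M", 50))  # False: 55 summed vs a 50 bp read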
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Call Motifs on called peaks.'''
import os
import argparse
import logging
import shutil
import subprocess
from multiprocessing import Pool
import pandas as pd
import utils
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.narrowPeak', '.replicated']
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-d', '--design',
help="The design file to run motif search.",
required=True)
parser.add_argument('-g', '--genome',
help="The genome FASTA file.",
required=True)
parser.add_argument('-p', '--peak',
help="The number of peaks to use.",
required=True)
args = parser.parse_args()
return args
# Functions
def check_tools():
'''Checks for required components on user system'''
logger.info('Checking for required libraries and components on this system')
meme_path = shutil.which("meme")
if meme_path:
logger.info('Found meme: %s', meme_path)
# Get Version
memechip_version_command = "meme-chip --version"
memechip_version = subprocess.check_output(memechip_version_command, shell=True)
# Write to file
meme_file = open("version_memechip.txt", "wb")
meme_file.write(b"Version %s" % (memechip_version))
meme_file.close()
else:
logger.error('Missing meme')
raise Exception('Missing meme')
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
def run_wrapper(args):
motif_search(*args)
def motif_search(filename, genome, experiment, peak):
'''Run motif search on peaks.'''
file_basename = os.path.basename(
utils.strip_extensions(filename, STRIP_EXTENSIONS))
out_fa = '%s.fa' % (experiment)
out_motif = '%s_memechip' % (experiment)
# Sort Bed file and limit number of peaks
if peak == -1:
peak = utils.count_lines(filename)
peak_no = 'all'
else:
peak_no = peak
sorted_fn = '%s.%s.narrowPeak' % (file_basename, peak_no)
out, err = utils.run_pipe([
'sort -k %dgr,%dgr %s' % (5, 5, filename),
'head -n %s' % (peak)], outfile=sorted_fn)
# Get fasta file
out, err = utils.run_pipe([
'bedtools getfasta -fi %s -bed %s -fo %s' % (genome, sorted_fn, out_fa)])
if err:
logger.error("bedtools error: %s", err)
# Call memechip
out, err = utils.run_pipe([
'meme-chip -oc %s -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 %s -norand' % (out_motif, out_fa)])
if err:
logger.error("meme-chip error: %s", err)
def main():
args = get_args()
design = args.design
genome = args.genome
peak = args.peak
# Create a file handler
handler = logging.FileHandler('motif.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Read files
design_df = pd.read_csv(design, sep='\t')
meme_arglist = zip(design_df['Peaks'].tolist(), [genome]*design_df.shape[0], design_df['Condition'].tolist(), [peak]*design_df.shape[0])
work_pool = Pool(min(12, design_df.shape[0]))
return_list = work_pool.map(run_wrapper, meme_arglist)
work_pool.close()
work_pool.join()
if __name__ == '__main__':
main()
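motif_search() limits input to the highest-scoring peaks with 'sort -k5gr' piped to 'head -n N'. An equivalent pandas sketch on a toy narrowPeak-like frame (column names are illustrative):

import pandas as pd

# Toy peaks; 'score' stands in for narrowPeak column 5.
peaks = pd.DataFrame({
    'chrom': ['chr1', 'chr1', 'chr2'],
    'start': [100, 500, 900],
    'end': [200, 600, 1000],
    'name': ['Peak_1', 'Peak_2', 'Peak_3'],
    'score': [850, 990, 120]})

top = peaks.sort_values('score', ascending=False).head(2)
print(top['name'].tolist())  # -> ['Peak_2', 'Peak_1']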
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Generate naive overlap peak files and design file for downstream processing.'''
import os
import argparse
import logging
import shutil
import subprocess
import pandas as pd
import utils
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-d', '--design',
help="The design file of peaks (tsv format).",
required=True)
parser.add_argument('-f', '--files',
help="The design file of with bam files (tsv format).",
required=True)
args = parser.parse_args()
return args
def check_tools():
'''Checks for required components on user system.'''
logger.info('Checking for required libraries and components on this system')
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
def update_design(design):
'''Update design file for DiffBind and remove controls.'''
logger.info("Running control file update.")
file_dict = design[['sample_id', 'bam_reads']] \
.set_index('sample_id').T.to_dict()
design['control_bam_reads'] = design['control_id'] \
.apply(lambda x: file_dict[x]['bam_reads'])
logger.info("Removing rows that are there own control.")
design = design[design['control_id'] != design['sample_id']]
logger.info("Removing columns that are there own control.")
design = design.drop('bam_index', axis=1)
logger.info("Adding peaks column.")
design = design.assign(peak='', peak_caller='bed')
return design
def overlap(experiment, design):
'''Calculate the overlap of peaks'''
logger.info("Determining consenus peaks for experiment %s.", experiment)
# Output File names
peak_type = 'narrowPeak'
overlapping_peaks_fn = '%s.replicated.%s' % (experiment, peak_type)
rejected_peaks_fn = '%s.rejected.%s' % (experiment, peak_type)
# Intermediate File names
overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
overlap_pr_fn = 'replicated_pr.%s' % (peak_type)
# Assign pooled and pseudoreplicate peaks
pool_peaks = design.loc[design.replicate == 'pooled', 'peaks'].values[0]
pr1_peaks = design.loc[design.replicate == '1_pr', 'peaks'].values[0]
pr2_peaks = design.loc[design.replicate == '2_pr', 'peaks'].values[0]
# Remove non true replicate rows
not_replicates = ['1_pr', '2_pr', 'pooled']
design_true_reps = design[~design['replicate'].isin(not_replicates)]
true_rep_peaks = design_true_reps.peaks.unique()
# Find overlaps
awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
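# With two 10-column narrowPeak inputs, intersectBed -wo appends the
# overlap width as column 21; s1 and s2 are the two peak widths, so a
# pair is kept when at least half of either peak is covered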
cut_command = 'cut -f 1-10'
# Find pooled peaks that overlap Rep1 and Rep2
# where overlap is defined as the fractional overlap
# with any one of the overlapping peak pairs >= 0.5
steps_true = ['intersectBed -wo -a %s -b %s' % (pool_peaks, true_rep_peaks[0]),
awk_command,
cut_command,
'sort -u']
if len(true_rep_peaks) > 1:
for true_peak in true_rep_peaks[1:]:
steps_true.extend(['intersectBed -wo -a stdin -b %s' % (true_peak),
awk_command,
cut_command,
'sort -u'])
out, err = utils.run_pipe(steps_true, outfile=overlap_tr_fn)
print("%d peaks overlap with both true replicates" %
(utils.count_lines(overlap_tr_fn)))
# Find pooled peaks that overlap PseudoRep1 and PseudoRep2
# where overlap is defined as the fractional overlap
# with any one of the overlapping peak pairs >= 0.5
steps_pseudo = ['intersectBed -wo -a %s -b %s' % (pool_peaks, pr1_peaks),
awk_command,
cut_command,
'sort -u',
'intersectBed -wo -a stdin -b %s' % (pr2_peaks),
awk_command,
cut_command,
'sort -u']
out, err = utils.run_pipe(steps_pseudo, outfile=overlap_pr_fn)
print("%d peaks overlap with both pooled pseudoreplicates"
% (utils.count_lines(overlap_pr_fn)))
# Make union of peak lists
out, err = utils.run_pipe([
'cat %s %s' % (overlap_tr_fn, overlap_pr_fn),
'sort -u'
], overlapping_peaks_fn)
print("%d peaks overlap with true replicates or with pooled pseudorepliates"
% (utils.count_lines(overlapping_peaks_fn)))
# Make rejected peak list
out, err = utils.run_pipe([
'intersectBed -wa -v -a %s -b %s' % (pool_peaks, overlapping_peaks_fn)
], rejected_peaks_fn)
print("%d peaks were rejected" % (utils.count_lines(rejected_peaks_fn)))
# Remove temporary files
os.remove(overlap_tr_fn)
os.remove(overlap_pr_fn)
return os.path.abspath(overlapping_peaks_fn)
def main():
args = get_args()
design = args.design
files = args.files
# Create a file handler
handler = logging.FileHandler('consensus_peaks.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Read files as dataframes
design_peaks_df = pd.read_csv(design, sep='\t')
design_files_df = pd.read_csv(files, sep='\t')
# Make a design file for differential binding
design_diff = update_design(design_files_df)
# Make a design file for annotating Peaks
anno_cols = ['Condition', 'Peaks']
design_anno = pd.DataFrame(columns=anno_cols)
# Find consensus overlap peaks for each experiment
for experiment, df_experiment in design_peaks_df.groupby('experiment_id'):
replicated_peak = overlap(experiment, df_experiment)
design_diff.loc[design_diff.experiment_id == experiment, "peak"] = replicated_peak
design_anno.loc[experiment] = [experiment, replicated_peak]
# Write out design files
design_diff.columns = ['SampleID',
'bamReads',
'Condition',
'Tissue',
'Factor',
'Treatment',
'Replicate',
'ControlID',
'bamControl',
'Peaks',
'PeakCaller']
design_diff.to_csv("design_diffPeaks.csv", header=True, sep=',', index=False)
design_anno.to_csv("design_annotatePeaks.tsv", header=True, sep='\t', index=False)
# Write the unique conditions
unique_experiments = pd.DataFrame(design_diff['Condition'].unique().tolist(), columns=['Condition'])
unique_experiments.to_csv('unique_experiments.csv', index=False)
if __name__ == '__main__':
main()
#!/bin/bash
#plot_profile.sh
script_name="plot_profile.sh"
#Help function
usage() {
echo "-h --Help documentation for $script_name"
echo "-g --File path to gtf/bed files"
echo "Example: $script_name -g 'genome.gtf'"
exit 1
}
raise()
{
echo "${1}" >&2
}
check_tools() {
raise "
Checking for required libraries and components on this system
"
deeptools --version &> version_deeptools.txt
if [ $? -gt 0 ]
then
raise "Missing deeptools"
return 1
fi
}
compute_matrix() {
raise "
Computing matrix on ${1} using ${2}
"
computeMatrix reference-point \
--referencePoint TSS \
-S ${1} \
-R ${2} \
--skipZeros \
-o computeMatrix.gz \
-p max/2
if [ $? -gt 0 ]
then
raise "Problem building matrix"
return 1
fi
}
plot_profile() {
raise "
Plotting profile
"
plotProfile -m computeMatrix.gz \
-out plotProfile.png
if [ $? -gt 0 ]
then
raise "Problem plotting"
return 1
fi
}
run_main() {
# Parsing options
OPTIND=1 # Reset OPTIND
while getopts :g:h opt
do
case $opt in
g) gtf=$OPTARG;;
h) usage;;
esac
done
shift $(($OPTIND -1))
# Check for mandatory options
if [[ -z $gtf ]]; then
usage
fi
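# Collect the pooled fold-change signal tracks produced upstream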
bws=$(ls *pooled.fc_signal.bw)
check_tools || exit 1
compute_matrix "${bws}" "${gtf}" || return 1
plot_profile || return 1
raise "ALL COMPLETE"
}
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]
then
run_main "$@"
if [ $? -gt 0 ]
then
exit 1
fi
fi
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Generate pooled and pseudoreplicate from data.'''
import argparse
import logging
import os
import subprocess
import shutil
import shlex
import pandas as pd
import numpy as np
import utils
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.gz', '.tagAlign', '.bedse', '.bedpe']
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-d', '--design',
help="The design file to make experiemnts (tsv format).",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
action='store_true')
parser.add_argument('-c', '--cutoff',
help="Cutoff ratio used for choosing controls.",
type=float,
default=1.2)
args = parser.parse_args()
return args
def check_replicates(design):
'''Check the number of replicates for the experiment.'''
no_rep = design.shape[0]
return no_rep
def check_controls(design):
'''Check the number of controls for the experiment.'''
no_controls = len(design.control_tag_align.unique())
return no_controls
def get_read_count_ratio(design):
'''Get the ratio of read counts for unique controls.'''
controls = design.control_tag_align.unique()
control_dict = {}
for con in controls:
no_reads = utils.count_lines(con)
control_dict[con] = no_reads
# Build a matrix whose (i, j) entry is count_i / count_j: every column
# starts as the raw counts, then each column is divided by the count of
# the control that names it
control_matrix = {c: control_dict for c in controls}
control_df = pd.DataFrame.from_dict(control_matrix)
control_ratio = control_df.divide(pd.Series(control_dict), axis=1)
return control_ratio
def pool(tag_files, outfile, paired):
'''Pool files together.'''
if paired:
file_extension = '.bedpe.gz'
else:
file_extension = '.bedse.gz'
pool_basename = os.path.basename(
utils.strip_extensions(outfile, STRIP_EXTENSIONS))
pooled_filename = pool_basename + file_extension
# Merge files
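# Decompress and concatenate all inputs, then recompress so the pool is
# a single well-formed gzip stream (-n omits the gzip timestamp)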
out, err = utils.run_pipe([
'gzip -dc %s' % (' '.join(tag_files)),
'gzip -cn'], outfile=pooled_filename)
return pooled_filename
def bedpe_to_tagalign(tag_file, outfile):
'''Convert read pairs to reads into standard tagAlign file.'''
se_tag_filename = outfile + ".tagAlign.gz"
# Convert read pairs to reads into standard tagAlign file
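# Each BEDPE record holds mate 1 in columns 1-3/9 and mate 2 in columns
# 4-6/10; the awk step emits one 6-column tagAlign line per mate with a
# placeholder name (N) and score (1000)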
tag_steps = ["zcat -f %s" % (tag_file)]
tag_steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""])
tag_steps.extend(['gzip -cn'])
out, err = utils.run_pipe(tag_steps, outfile=se_tag_filename)
return se_tag_filename
def self_psuedoreplication(tag_file, prefix, paired):
'''Make 2 self-pseudoreplicates.'''
# Get total number of reads
no_lines = utils.count_lines(tag_file)
# Number of lines per replicate, rounded up so no read is dropped
lines_per_rep = (no_lines + 1) // 2
# Map each pseudoreplicate index to its output file name
pseudoreplicate_dict = {r: prefix + '.pr' + str(r) + '.tagAlign.gz'
for r in [0, 1]}
# Shuffle and split file into equal parts
# by using the input to seed shuf we ensure multiple runs with the same
# input will produce the same output
# Produces two files named splits_prefix0n, n=1,2
splits_prefix = 'temp_split'
pseudo_command = 'bash -c "zcat {} | shuf --random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f {} | wc -c) -nosalt </dev/zero 2>/dev/null) | '
pseudo_command += 'split -d -l {} - {}."'
pseudo_command = pseudo_command.format(
tag_file,
tag_file,
lines_per_rep,
splits_prefix)
logger.info("Running pseudoreplication with %s", pseudo_command)
subprocess.check_call(shlex.split(pseudo_command))
# Convert read pairs to reads into standard tagAlign file
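# split -d names the two halves <prefix>.00 and <prefix>.01; each half
# becomes one self-pseudoreplicate after optional BEDPE-to-SE conversion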
for i, index in enumerate([0, 1]):
string_index = '.0' + str(index)
steps = ['cat %s' % (splits_prefix + string_index)]
if paired:
steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""])
steps.extend(['gzip -cn'])
out, err = utils.run_pipe(steps, outfile=pseudoreplicate_dict[i])
return pseudoreplicate_dict
def generate_design(paired, cutoff_ratio, design_df, cwd, no_reps, no_unique_controls):
'''Build a new design that adds pooled, pseudoreplicate and control rows.'''
if no_reps == 1:
logger.info("No other replicate specified "
"so processing as an unreplicated experiment.")
replicated = False
else:
logger.info("Multiple replicates specified "
"so processing as a replicated experiment.")
replicated = True
if no_unique_controls == 1 and replicated:
logger.info("Only a single control was specified "
"so using same control for replicates, pool and psuedoreplicates.")
single_control = True
else:
logger.info("Will merge only unique controls for pooled.")
single_control = False
# Pool the controls for checking
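# With multiple controls, compare their read depths: the ratio matrix
# decides further below whether a pooled control replaces an individual one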
if not single_control:
control_df = get_read_count_ratio(design_df)
control_files = design_df.control_tag_align.unique()
pool_control = pool(control_files, "pool_control", paired)
else:
pool_control = design_df.control_tag_align.unique()[0]
# if paired_end make tagAlign
if paired:
pool_control_tmp = bedpe_to_tagalign(pool_control, "pool_control")
pool_control = pool_control_tmp
# Duplicate rows for pool and pseudoreplicates; update tagAlign paths with single-end data
experiment_id = design_df.at[0, 'experiment_id']
replicate_files = design_df.tag_align.unique()
pool_experiment = pool(replicate_files, experiment_id + "_pooled", paired)
# Make 2 self psuedoreplicates
pseudoreplicates_dict = {}
for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']):
replicate_prefix = experiment_id + '_' + str(rep)
pr_dict = self_psuedoreplication(tag_file, replicate_prefix, paired)
pseudoreplicates_dict[rep] = pr_dict
# Update design to include new self pseudo replicates
pseudoreplicates_df = pd.DataFrame.from_dict(pseudoreplicates_dict)
pool_pseudoreplicates_dict = {}
for index, row in pseudoreplicates_df.iterrows():
replicate_id = index + 1
pr_filename = experiment_id + ".pr" + str(replicate_id) + '.tagAlign.gz'
pool_replicate = pool(row, pr_filename, False)
pool_pseudoreplicates_dict[replicate_id] = pool_replicate
design_new_df = design_df
# Update tagAlign with single end data
if paired:
design_new_df['tag_align'] = design_new_df['se_tag_align']
design_new_df.drop(labels='se_tag_align', axis=1, inplace=True)
# If paired change to single End
if paired:
pool_experiment_se = bedpe_to_tagalign(pool_experiment, experiment_id + "_pooled")
else:
pool_experiment_se = pool_experiment
# Check controls against cutoff_ratio
# if so replace with pool_control
# unless single control was used
if not single_control:
path_to_pool_control = cwd + '/' + pool_control
if control_df.values.max() > cutoff_ratio:
logger.info("Number of reads in controls differ by " +
" > factor of %f. Using pooled controls." % (cutoff_ratio))
design_new_df['control_tag_align'] = path_to_pool_control
else:
for index, row in design_new_df.iterrows():
exp_no_reads = utils.count_lines(row['tag_align'])
con_no_reads = utils.count_lines(row['control_tag_align'])
if con_no_reads < exp_no_reads:
logger.info("Fewer reads in control than experiment." +
"Using pooled controls for replicate %s."
% row['replicate'])
design_new_df.loc[index, 'control_tag_align'] = \
path_to_pool_control
else:
if paired:
control = row['control_tag_align']
control_basename = os.path.basename(
utils.strip_extensions(control, STRIP_EXTENSIONS))
control_tmp = bedpe_to_tagalign(control, control_basename)
path_to_control = cwd + '/' + control_tmp
design_new_df.loc[index, 'control_tag_align'] = \
path_to_control
else:
if paired:
path_to_pool_control = cwd + '/' + pool_control
else:
path_to_pool_control = pool_control
design_new_df['control_tag_align'] = path_to_pool_control
# Add in pseudo replicates
tmp_metadata = design_new_df.loc[0].copy()
tmp_metadata['control_tag_align'] = path_to_pool_control
for rep, pseudorep_file in pool_pseudoreplicates_dict.items():
tmp_metadata['sample_id'] = experiment_id + '_pr' + str(rep)
tmp_metadata['replicate'] = str(rep) + '_pr'
tmp_metadata['xcor'] = 'Calculate'
path_to_file = cwd + '/' + pseudorep_file
tmp_metadata['tag_align'] = path_to_file
design_new_df = design_new_df.append(tmp_metadata)
# Add in pool experiment
tmp_metadata['sample_id'] = experiment_id + '_pooled'
tmp_metadata['replicate'] = 'pooled'
tmp_metadata['xcor'] = 'Calculate'
path_to_file = cwd + '/' + pool_experiment_se
tmp_metadata['tag_align'] = path_to_file
design_new_df = design_new_df.append(tmp_metadata)
return design_new_df
def main():
args = get_args()
paired = args.paired
design = args.design
cutoff_ratio = args.cutoff
# Create a file handler
handler = logging.FileHandler('experiment_generation.log')
logger.addHandler(handler)
# Read files as dataframes
design_df = pd.read_csv(design, sep='\t')
# Get current directory to build paths
cwd = os.getcwd()
# Check Number of replicates and replicates
no_reps = check_replicates(design_df)
no_unique_controls = check_controls(design_df)
# Generate new design file
design_new_df = generate_design(paired, cutoff_ratio, design_df, cwd, no_reps, no_unique_controls)
# Write out new dataframe
experiment_id = design_df.at[0, 'experiment_id']
design_new_df.to_csv(experiment_id + '_ppr.tsv',
header=True, sep='\t', index=False)
if __name__ == '__main__':
main()
#!/usr/bin/python
# programmer : bbc
# usage: main function to call all the procedures for chip-seq analysis
import sys
import os
import argparse as ap
import logging
import pandas as pd
import glob
import subprocess
from multiprocessing import Pool
import runDeepTools
import runMemechip
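# level=10 is logging.DEBUG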
logging.basicConfig(level=10)
def prepare_argparser():
description = "Make wig file for given bed using bam"
epilog = "For command line options of each command, type %(prog)% COMMAND -h"
argparser = ap.ArgumentParser(description=description, epilog = epilog)
argparser.add_argument("-i","--input",dest = "infile",type=str,required=True, help="input design file")
argparser.add_argument("-g","--genome",dest = "genome",type=str,required=True, help="genome", default="hg19")
argparser.add_argument("--top-peak",dest="toppeak",type=int, default=-1, help = "Only use top peaks for motif call")
#argparser.add_argument("-s","--strandtype",dest="stranded",type=str,default="none", choices=["none","reverse","yes"])
#argparser.add_argument("-n","--name",dest="trackName",type=str,default="UserTrack",help = "track name for bedgraph header")
return(argparser)
def memechip_wrapper(args):
#print args
runMemechip.run(*args)
def main():
argparser = prepare_argparser()
args = argparser.parse_args()
#dfile = pd.read_csv(args.infile)
#for testing, add testing path to all input files
test_path = "/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/"
designfile = pd.read_csv(args.infile)
designfile['Peaks'] = designfile['Peaks'].apply(lambda x: test_path+x)
designfile['bamReads'] = designfile['bamReads'].apply(lambda x: test_path+x)
designfile['bamControl'] = designfile['bamControl'].apply(lambda x: test_path+x)
designfile.to_csv(args.infile+"_new",index=False)
dfile = pd.read_csv(args.infile+"_new")
#call deeptools
runDeepTools.run(args.infile+"_new", args.genome)
#call diffbind
folder = os.path.dirname(os.path.abspath(__file__))
diffbind_command = "Rscript "+folder+"/runDiffBind.R "+args.infile+"_new"
#logging.debug(diffbind_command)
p = subprocess.Popen(diffbind_command, shell=True)
p.communicate()
#call chipseeker on original peaks and overlapping peaks
chipseeker_command = "Rscript "+folder+"/runChipseeker.R "+",".join(dfile['Peaks'].tolist())+" "+",".join(dfile['SampleID'])
p = subprocess.Popen(chipseeker_command, shell=True)
p.communicate()
overlapping_peaks = glob.glob('*diffbind.bed')
overlapping_peak_names = []
for pn in overlapping_peaks:
overlapping_peak_names.append(pn.split("_diffbind")[0].replace("!","non"))
chipseeker_overlap_command = "Rscript "+folder+"/runChipseeker.R "+",".join(overlapping_peaks)+" "+",".join(overlapping_peak_names)
p = subprocess.Popen(chipseeker_overlap_command, shell=True)
p.communicate()
#MEME-chip on all peaks
meme_arglist = zip(dfile['Peaks'].tolist(),[test_path+"hg19.2bit"]*dfile.shape[0],[str(args.toppeak)]*dfile.shape[0],dfile['SampleID'].tolist())
work_pool = Pool(min(12,dfile.shape[0]))
resultList = work_pool.map(memechip_wrapper, meme_arglist)
work_pool.close()
work_pool.join()
if __name__=="__main__":
main()
args = commandArgs(trailingOnly=TRUE)
library(ChIPseeker)
#Parse the genome path and get genome version
path_elements = unlist(strsplit(args[2],"[/]"))
genome = path_elements[length(path_elements)]
if(genome=="GRCh37")
{
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
}
if(genome=="GRCm38")
{
library(TxDb.Hsapiens.UCSC.mm10.knownGene)
txdb <- TxDb.Hsapiens.UCSC.mm10.knownGene
}
if(genome=="GRCh38")
{
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
}
design<-read.csv(args[1])
files<-as.list(as.character(design$Peaks))
names(files)<-design$SampleID
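# annotatePeak assigns each peak to its nearest gene model in the TxDb;
# the +/-3kb tssRegion window controls what counts as a promoter hit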
peakAnnoList <- lapply(files, annotatePeak, TxDb=txdb, tssRegion=c(-3000, 3000), verbose=FALSE)
for(index in seq_along(peakAnnoList))
{
filename<-paste(names(files)[index],".chipseeker_annotation.xls",sep="")
write.table(as.data.frame(peakAnnoList[[index]]),filename,sep="\t",quote=F)
#draw individual plot
pie_name <- paste(names(files)[index],".chipseeker_pie.pdf",sep="")
vennpie_name <- paste(names(files)[index],".chipseeker_vennpie.pdf",sep="")
upsetplot_name <- paste(names(files)[index],".chipseeker_upsetplot.pdf",sep="")
pdf(pie_name)
plotAnnoPie(peakAnnoList[[index]])
dev.off()
pdf(vennpie_name)
vennpie(peakAnnoList[[index]])
dev.off()
pdf(upsetplot_name)
upsetplot(peakAnnoList[[index]])
dev.off()
}