Skip to content
Snippets Groups Projects
Commit a8df6a7a authored by Spencer Barnes's avatar Spencer Barnes
Browse files

adding modified dastk scripts for atac seq analysis

parents
Branches
Tags
No related merge requests found
File added
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 2
}
import DAStk.process_atac as p
import DAStk.differential_md_score as d
def process_atac():
p.main()
def differential_md_score():
d.main()
atac_peaks_filename='type proper path to the ATAC-seq peaks file here'
tf_motif_path='type proper path to the directory containing motif files here'
#! /usr/bin/env/python
from __future__ import print_function
import argparse
import datetime
import numpy as np
import sys
import matplotlib as mpl
# to prevent display-related issues
mpl.use('Agg')
import matplotlib.pyplot as plt
plt.ioff()
from matplotlib import cm
from adjustText import adjust_text
from scipy.stats import norm
from argparse import RawTextHelpFormatter
import pandas as pd
# Usage:
#
# $ python differential_md_score.py -x brg1fl -1 control -2 tamoxifen -p 0.000000000001
#
def main():
parser = argparse.ArgumentParser(description='This script produces an MA plot of TFs from ATAC-Seq data, for DMSO vs. treatment conditions.', \
epilog="IMPORTANT: Please ensure that ALL files used with this script are sorted by the same criteria.\n\nExample:\nFor your files:\n * mcf7_DMSO_md_scores.txt\n * mcf7_Nutlin_md_scores.txt\n... you can use the following arguments to generate an MA plot with barcodes at a p-value cutoff of 1e-4:\n\n$ python differential_md_score.py -x mcf7 -1 DMSO -2 Nutlin -p 0.0001 -b\n\n", formatter_class=RawTextHelpFormatter)
parser.add_argument('-x', '--prefix', dest='output_prefix', metavar='CELL_TYPE', \
help='Cell type (k562, imr90, etc), or any other appropriate output file prefix', required=True)
parser.add_argument('-p', '--p-value', dest='p_value', metavar='P_VALUE', \
help='p-value cutoff to define which motifs to label in the MA plot. Defaults to 0.00001.', default=0.00001, required=False)
parser.add_argument('-1', '--assay-1', dest='assay_1', metavar='ASSAY_1', \
help='Conditions label for the reference/control assay of the differential pair (e.g. "DMSO", "control", "wildtype"). Used to find the proper file with the calculated MD-scores and on the plot labels.', required=True)
parser.add_argument('-2', '--assay-2', dest='assay_2', metavar='ASSAY_2', \
help='Conditions label for the perturbation assay of the differential pair (e.g., "doxycyclin", "p53_knockout"). Used to find the proper file with the calculated MD-scores and on the plot labels.', required=True)
parser.add_argument('-b', '--barcodes', dest='gen_barcode', action='store_true', \
help='Generate a barcode plot for each significant motif', default=False, required=False)
args = parser.parse_args()
HISTOGRAM_BINS = 100
P_VALUE_CUTOFF = float(args.p_value)
print('Starting --- ' + str(datetime.datetime.now()))
control_mds = {}
control_nr_peaks = {}
control_barcode = {}
labels = []
control_fd = open('%s_%s_md_scores.txt' % (args.output_prefix, args.assay_1))
for line in control_fd:
line_chunks = line.split(',')
if '.bed' in line_chunks[0]:
control_mds[line_chunks[0][:-4]] = float(line_chunks[1])
labels.append(line_chunks[0][:-4])
control_nr_peaks[line_chunks[0][:-4]] = int(line_chunks[3])
control_barcode[line_chunks[0][:-4]] = line_chunks[5]
perturbation_mds = {}
perturbation_nr_peaks = {}
perturbation_barcode = {}
perturbation_fd = open('%s_%s_md_scores.txt' % (args.output_prefix, args.assay_2))
for line in perturbation_fd:
line_chunks = line.split(',')
if '.bed' in line_chunks[0]:
assert(line_chunks[0][:-4] in labels)
perturbation_mds[line_chunks[0][:-4]] = float(line_chunks[1])
perturbation_nr_peaks[line_chunks[0][:-4]] = int(line_chunks[3])
perturbation_barcode[line_chunks[0][:-4]] = line_chunks[5]
print('Done gathering data, ready to plot --- ' + str(datetime.datetime.now()))
control = []
perturbation = []
nr_peaks = []
fold_change = []
p_values = []
colors = []
sizes = []
for label in sorted(labels):
if control_mds[label] == 0 and perturbation_mds[label] == 0:
# Skip this motif, it will only skew the graph and won't provide any useful info
continue
control.append(float(control_mds[label]))
perturbation.append(float(perturbation_mds[label]))
nr_peaks.append(np.log2(float(control_nr_peaks[label]) + float(perturbation_nr_peaks[label])))
fold_change.append(float(perturbation_mds[label]) - float(control_mds[label]))
p1 = float(control_mds[label])
p2 = float(perturbation_mds[label])
n1 = float(control_nr_peaks[label])
n2 = float(perturbation_nr_peaks[label])
x1 = p1 * n1
x2 = p2 * n2
if n1 == 0:
print('%s had an MD-score of 0 in %s' % (label, args.assay_1))
n1 = 1
if n2 == 0:
print('%s had an MD-score of 0 in %s' % (label, args.assay_2))
n2 = 1
pooled = (x1 + x2)/(n1 + n2)
z_value = (p1 - p2) / np.sqrt(pooled * (1 - pooled) * ((1/n1) + (1/n2)))
p_value = norm.sf(abs(z_value))*2
p_values.append(p_value)
if p_value < (P_VALUE_CUTOFF / 10) and float(perturbation_mds[label]) > float(control_mds[label]):
colors.append('#c64e50')
sizes.append(70)
elif p_value < P_VALUE_CUTOFF and float(perturbation_mds[label]) > float(control_mds[label]):
colors.append('maroon')
sizes.append(70)
elif p_value < (P_VALUE_CUTOFF / 10) and float(perturbation_mds[label]) < float(control_mds[label]):
colors.append('darkviolet')
sizes.append(70)
elif p_value < P_VALUE_CUTOFF and float(perturbation_mds[label]) < float(control_mds[label]):
colors.append('purple')
sizes.append(70)
else:
colors.append('#4e74ae')
sizes.append(70)
np_control, np_perturbation, np_labels = np.array(control), np.array(perturbation), np.array(sorted(labels))
#add script to output scores as csv
md_enrichment_df = pd.DataFrame(zip(np_labels,nr_peaks, fold_change, p_values), columns = ['TFs', 'log2(Motif Enrichment)','Fold-Change MD-Score', 'p-value'])
md_enrichment_df.to_csv('%s_MA_%s_to_%s_md_score.csv'%(args.output_prefix, args.assay_1, args.assay_2),index = False)
# MA (mean/average) plot
most_relevant_tfs = []
plt.clf()
fig, ax = plt.subplots()
ax.scatter(nr_peaks, fold_change, s=sizes, edgecolor='white', linewidth=0.5, color=colors)
texts = []
for x, y, text, p_value in zip(nr_peaks, fold_change, np_labels, p_values):
if p_value < P_VALUE_CUTOFF:
#if (y > 0.02 or y < -0.02) and x > 12:
print('%s (%.3f, p-value = %.2E)' % (text, y, p_value))
label_color = 'maroon'
if p_value < (P_VALUE_CUTOFF/10):
label_color = '#c64e50'
if y < 0:
label_color = 'purple'
if p_value < (P_VALUE_CUTOFF/10):
label_color = 'darkviolet'
short_text = text.replace('HO_', '')
short_text = short_text.replace('_HUMAN.H10MO', '')
short_text = short_text.replace('_MOUSE.H10MO', '')
texts.append(ax.text(x, y, u'%s' % short_text, fontsize=8, color=label_color))
most_relevant_tfs.append(text)
#adjust_text(texts, force_points=1, on_basemap=True, expand_points=(5,5), expand_text=(3,3), arrowprops=dict(arrowstyle="-", lw=1, color='grey', alpha=0.5))
adjust_text(texts, force_points=1, expand_points=(2,2), expand_text=(2,2), arrowprops=dict(arrowstyle="-", lw=1, color='black', alpha=0.8))
plt.title(u'MA for %s vs. %s MD-scores\n(%s, p-value cutoff: %.2E)' % \
(args.assay_1, args.assay_2, args.output_prefix, P_VALUE_CUTOFF), fontsize=12)
plt.xlabel(u'$\log_2$(Sum #peaks overlapping 3kbp window)', fontsize=14)
plt.ylabel(u'${\Delta}$ MD-score', fontsize=14)
plt.xlim(np.min(nr_peaks), np.max(nr_peaks) + 1)
plt.xscale('log',basex=2)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.tick_params(axis='x',reset=False,which='both',length=5,width=1)
y_bound = max(np.abs(np.min(fold_change)), np.max(fold_change)) + 0.01
plt.ylim(-1 * y_bound, y_bound)
plt.savefig('%s_MA_%s_to_%s_md_score.png' % (args.output_prefix, args.assay_1, args.assay_2), dpi=600)
if args.gen_barcode:
# Generate barcodes for each relevant TF, for both conditions
print('Generating barcode plots for significant motifs...')
for relevant_tf in most_relevant_tfs:
plt.clf()
plt.title('Barcode plots for %s' % relevant_tf)
fig, (ax0, ax1) = plt.subplots(ncols=2)
control_bc_data = np.array(control_barcode[relevant_tf].split(';'))
# Condense the barcode to half the bins for prettier display
control_bc_data = control_bc_data.astype(int)
heat_m = np.nan * np.empty(shape=(int(HISTOGRAM_BINS/4), HISTOGRAM_BINS))
for row in range(HISTOGRAM_BINS/4):
heat_m[row] = control_bc_data
ax0.matshow(heat_m, cmap=cm.YlGnBu)
ax0.axis('off')
ax0.text(HISTOGRAM_BINS/2, HISTOGRAM_BINS/2, 'N(total) = %d\nMD-score = %.3f' % (control_nr_peaks[relevant_tf], control_mds[relevant_tf]), ha='center', size=18, zorder=0)
ax0.text(HISTOGRAM_BINS/2, -10, args.assay_1, ha='center', size=18, zorder=0)
perturbation_bc_data = np.array(perturbation_barcode[relevant_tf].split(';'))
perturbation_bc_data = perturbation_bc_data.astype(int)
heat_m = np.nan * np.empty(shape=(int(HISTOGRAM_BINS/4), HISTOGRAM_BINS))
for row in range(HISTOGRAM_BINS/4):
heat_m[row] = perturbation_bc_data
ax1.matshow(heat_m, cmap=cm.YlGnBu)
ax1.axis('off')
ax1.text(HISTOGRAM_BINS/2, HISTOGRAM_BINS/2, 'N(total) = %d\nMD-score = %.3f' % (perturbation_nr_peaks[relevant_tf], perturbation_mds[relevant_tf]), ha='center', size=18, zorder=0)
ax1.text(HISTOGRAM_BINS/2, -10, args.assay_2, ha='center', size=18, zorder=0)
plt.savefig('%s_%s_barcode_%s_vs_%s.png' % (args.output_prefix, relevant_tf, args.assay_1, args.assay_2), dpi=600)
print('All done --- ' + str(datetime.datetime.now()))
sys.exit(0)
if __name__=='__main__':
main()
#! /usr/bin/env/python
from __future__ import print_function
import argparse
import csv
import datetime
import glob
import imp
import multiprocessing
import numpy as np
import os.path
import pandas as pd
import sys
import matplotlib as mpl
# to prevent DISPLAY weirdness when running in the cluster:
mpl.use('Agg')
import matplotlib.pyplot as plt
plt.ioff()
from functools import partial
from operator import itemgetter
# returns ith column from the given matrix
def get_column(matrix,i):
f = itemgetter(i)
return map(f,matrix)
# return whether the given interval is within a window of size window_size
# around the given mean
def is_in_window(motif_interval, atac_median, window_size):
start = int(motif_interval.start)
end = int(motif_interval.end)
if (end >= (atac_median - window_size) and end <= (atac_median + window_size)) \
or (start >= (atac_median - window_size) and start <= (atac_median + window_size)):
return True
else:
return False
# this will be ran in parallel
def find_motifs_in_chrom(current_chrom, files):
tf_motif_filename, atac_peaks_filename = files
H = 1500 # in bps, the MD-score parameter (large window)
h = 150 # in bps, the MD-score parameter (small window)
REPRESSOR_MARGIN = 500 # in bps, distance from the large window boundaries
atac_df = pd.read_csv(atac_peaks_filename, header=None, comment='#', sep="\t", usecols=[0, 1, 2], \
names=['chrom', 'start', 'end'], na_filter=False, dtype={'chrom':'str', 'start':'int', 'end':'int'})
atac_iter = atac_df[(atac_df.chrom == current_chrom)].itertuples()
motif_df = pd.read_csv(tf_motif_filename, header=None, comment='#', sep="\t", usecols=[0, 1, 2], \
names=['chrom', 'start', 'end'], na_filter=False, dtype={'chrom':'str', 'start':'int', 'end':'int'})
if len(motif_df) == 0:
return None
motif_iter = motif_df[(motif_df.chrom == current_chrom)].itertuples()
last_motif = None
g_h = 0
g_H = 0
total_motif_sites = 0
tf_distances = []
motif_seqs = []
putative_activator_count = 0
putative_repressor_count = 0
try:
motif_region = next(motif_iter) # this gets the next motif in the list
except StopIteration:
print('No motifs for chromosome ' + current_chrom + ' on file ' + tf_motif_filename)
return None
peak_count_overlapping_motif = 0
for atac_peak in atac_iter:
motifs_within_region = True
tf_distance = 0
atac_median = atac_peak.start + (atac_peak.end - atac_peak.start)/2
# check the last motif, too, in case any of them falls within the region of
# interest of two sequential ATAC-Seq peaks
if last_motif:
if is_in_window(last_motif, atac_median, h):
g_h = g_h + 1
putative_activator_count += 1
if is_in_window(last_motif, atac_median, H):
g_H = g_H + 1
tf_median = last_motif.start + \
(last_motif.end - last_motif.start)/2
tf_distance = atac_median - tf_median
if not is_in_window(last_motif, atac_median, H - REPRESSOR_MARGIN):
putative_repressor_count += 1
while motifs_within_region:
total_motif_sites += 1
# account for those within the smaller window (h)
if is_in_window(motif_region, atac_median, h):
g_h = g_h + 1
putative_activator_count += 1
# account for those within the larger window (H)
if is_in_window(motif_region, atac_median, H):
g_H = g_H + 1
tf_median = motif_region.start + (motif_region.end - \
motif_region.start)/2
tf_distances.append(atac_median - tf_median)
#motif_seqs.append(motif_region.sequence)
if not is_in_window(motif_region, atac_median, H - REPRESSOR_MARGIN):
putative_repressor_count += 1
if motif_region.start <= (atac_median + H):
try:
motif_region = next(motif_iter) # this gets the next motif in the list
except StopIteration:
# No more TF motifs for this chromosome
break
last_motif = motif_region
else:
motifs_within_region = False
# Count any remaining TF motif sites after the last ATAC peak
while(len(motif_region) > 0):
try:
motif_region = next(motif_iter) # this gets the next motif in the list
total_motif_sites += 1
except StopIteration:
break
return [tf_distances, motif_seqs, g_h, g_H, total_motif_sites]
def get_md_score(tf_motif_filename, mp_threads, atac_peaks_filename):
HISTOGRAM_BINS = 100
# Smart way to make this organism-specific?
CHROMOSOMES = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', \
'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', \
'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', \
'chrX', 'chrY']
pool = multiprocessing.Pool(mp_threads)
results = pool.map(partial( find_motifs_in_chrom, \
files=[tf_motif_filename, atac_peaks_filename]), \
CHROMOSOMES)
pool.close()
pool.join()
results_matching_motif = [x for x in results if x is not None]
if len(results_matching_motif) > 0:
tf_distances = get_column(results_matching_motif, 0)
motif_seqs = get_column(results_matching_motif, 1)
sums = np.sum(results_matching_motif, axis=0)
overall_g_h = sums[2]
overall_g_H = sums[3]
overall_motif_sites = sums[4]
# Calculate the heatmap for this motif's barcode
tf_distances = results[0][0]
heatmap, xedges = np.histogram(tf_distances, bins=HISTOGRAM_BINS)
str_heatmap = np.char.mod('%d', heatmap)
# TODO: Use the motif sequences to generate a logo for each motif, based
# only on the overlapping ATAC-Seq peaks
if overall_g_H > 0:
return [float(overall_g_h)/overall_g_H, \
overall_g_h, \
overall_g_H, \
overall_motif_sites, \
';'.join(str_heatmap)]
else:
return [0, 0, 0, overall_motif_sites, np.zeros(HISTOGRAM_BINS)]
else:
return None
def main():
parser = argparse.ArgumentParser(description='This script analyzes ATAC-Seq and GRO-Seq data and produces various plots for further data analysis.', epilog='IMPORTANT: Please ensure that ALL bed files used with this script are sorted by the same criteria.')
parser.add_argument('-x', '--prefix', dest='output_prefix', metavar='CELL_TYPE', \
help='Cell type (k562, imr90, etc), or any other appropriate output file prefix', required=True)
parser.add_argument('-e', '--atac-peaks', dest='atac_peaks_filename', \
help='Full path to the ATAC-Seq broadPeak file.', \
default='', required=True)
parser.add_argument('-m', '--motif-path', dest='tf_motif_path', \
help='Path to the location of the motif sites for the desired reference genome (i.e., "/usr/local/motifs/human/hg19/*").', \
default='', required=True)
parser.add_argument('-t', '--threads', dest='mp_threads', \
help='Number of CPUs to use for multiprocessing of MD-score calculations. Depends on your hardware architecture.', \
default='1', required=False)
args = parser.parse_args()
#evaluation_radius = 750 # in bps
ATAC_WIDTH_THRESHOLD = 5000 # in bp
print('Starting --- ' + str(datetime.datetime.now()))
atac_peaks_file = open(args.atac_peaks_filename)
atac_csv_reader = csv.reader(atac_peaks_file, delimiter='\t')
atac_line = next(atac_csv_reader)
atac_peak_count = 0
# skip the BedGraph headers
while(atac_line[0][0] == '#'):
atac_line = next(atac_csv_reader)
# Determine the evaluation radius from the data itself
all_widths = []
while(atac_line): # count the rest of ATAC peaks
try:
new_peak_width = int(atac_line[2]) - int(atac_line[1])
if new_peak_width < ATAC_WIDTH_THRESHOLD:
all_widths.append(new_peak_width)
atac_line = next(atac_csv_reader)
except StopIteration:
break
except IndexError:
print("\nThere was an error with the ATAC-seq peaks file.\nPlease verify it conforms with a BedGraph-like format\n(tab-separated columns, any other lines commented with a leading '#')")
sys.exit(1)
atac_peak_mean = np.mean(all_widths)
atac_peak_std = np.std(all_widths)
evaluation_radius = (int(atac_peak_mean) + 2 * int(atac_peak_std)) / 2
print('ATAC mean width: %d bp (std: %d bp). Determined an evaluation radius of %d bp' % \
(atac_peak_mean, atac_peak_std, evaluation_radius))
# Start reading from the top of the ATAC-Seq BedGraph again
atac_peaks_file.seek(0)
atac_csv_reader = csv.reader(atac_peaks_file, delimiter='\t')
atac_line = next(atac_csv_reader)
while(atac_line[0][0] == '#'):
atac_line = next(atac_csv_reader)
motif_stats = []
sorted_motif_stats = []
tf_motif_path = args.tf_motif_path + '/*'
motif_filenames = glob.glob(tf_motif_path)
motif_count = len(motif_filenames)
print("Processing motif files in %s" % tf_motif_path)
for filename in motif_filenames:
filename_no_path = filename.split('/')[-1]
if os.path.getsize(filename) > 0 and \
os.path.basename(filename).endswith(tuple(['.bed', '.BedGraph', '.txt'])):
[md_score, small_window, large_window, motif_site_count, heat] = get_md_score(filename, int(args.mp_threads), args.atac_peaks_filename)
print('The MD-score for ATAC reads vs %s is %.6f' % (filename_no_path, md_score))
motif_stats.append({ 'motif_file': filename_no_path, \
'md_score': md_score, \
'small_window': small_window, \
'large_window': large_window, \
'motif_site_count': motif_site_count, \
'heat': heat })
# sort the stats dictionary by MD-score, descending order
sorted_motif_stats = sorted(motif_stats, key=itemgetter('md_score'), reverse=True)
md_score_fp = open("%s_md_scores.txt" % args.output_prefix, 'w')
for stat in sorted_motif_stats:
md_score_fp.write("%s,%s,%s,%s,%s,%s\n" % \
(stat['motif_file'], stat['md_score'], stat['small_window'], \
stat['large_window'], stat['motif_site_count'], stat['heat']))
md_score_fp.close()
print('All done --- %s' % str(datetime.datetime.now()))
sys.exit(0)
if __name__=='__main__':
main()
#!/bin/sh
if [[ $# < 6 ]]; then
echo "\nUsage:\n";
echo "$0 PREFIX CONDITION_1 CONDITION_2 ATAC_PEAKS_FILE_1 ATAC_PEAKS_FILE_2 MOTIFS_PATH [NUMBER_OF_PROCESSES]";
echo "\n(Only provide the number of processes if in a multiprocessing-able environment)";
exit 1;
fi
PREFIX=$1
COND1=$2
COND2=$3
ATACPEAKS1=$4
ATACPEAKS2=$5
MOTIFS=$6
THREADS=$7 # optional
PREFIX1=`echo "${PREFIX}_${COND1}"`
PREFIX2=`echo "${PREFIX}_${COND2}"`
if [[ -z $7 ]]; then
THREADS=1
fi
#original
#python process_atac.py --prefix $PREFIX1 --threads $THREADS --atac-peaks $ATACPEAKS1 --motif-path $MOTIFS
#python process_atac.py --prefix $PREFIX2 --threads $THREADS --atac-peaks $ATACPEAKS2 --motif-path $MOTIFS
#python differential_md_score.py --prefix $PREFIX -1 $COND1 -2 $COND2
#modified path
python ../DAStk/process_atac.py --prefix $PREFIX1 --threads $THREADS --atac-peaks $ATACPEAKS1 --motif-path $MOTIFS
python ../DAStk/process_atac.py --prefix $PREFIX2 --threads $THREADS --atac-peaks $ATACPEAKS2 --motif-path $MOTIFS
python ../DAStk/differential_md_score.py --prefix $PREFIX -1 $COND1 -2 $COND2
LICENSE 0 → 100644
BSD 3-Clause License
Copyright (c) 2018, dowelllab
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
README.md 0 → 100644
# DAStk
The Differential [ATAC-seq](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4374986/) Toolkit (DAStk) is a set of scripts to aid analyzing differential ATAC-Seq data. By leveraging changes in accessible chromatin, we can detect significant changes in transcription factor (TF) activity. This is a simple but powerful tool for cellular perturbation analysis.
You will need the following inputs:
- A pair of files listing peaks of ATAC-seq signal in two biological conditions (e.g. DMSO and drug-treated) in any BedGraph-compatible format (tab-delimited)
- A set of files listing the putative binding sites across the reference genome of choice, one file per transcription factor motif, also in any BedGraph-like format. These are normally generated from position weight matrices (PWMs) available at TF model databases like [HOCOMOCO](http://hocomoco11.autosome.ru). These files are expected to have a `.bed`, `.BedGraph` or `.txt` extension.
**IMPORTANT: All files mentioned above (ATAC-seq peaks and computed motif sites) MUST be sorted by the same criteria. Different bioinformatics tools use different lexical sorting criteria (e.g. chr10 after chr1, or chr2 after chr1) so please ensure the sorting criteria is uniform.**
### Install
You can install DAStk using `pip`:
$ pip install DAStk
This is the simplest option, and it will also create the executable commands `process_atac` and `differential_md_score`. Alternatively, you can clone this repository by running:
$ git clone https://biof-git.colorado.edu/dowelllab/DAStk
### Required Python libraries (can be installed thru `pip`):
* numpy
* argparse
* matplotlib
* scipy
* adjustText
* pandas
* multiprocessing
These scripts feature comprehensive help when called with the `--help` argument. Every argument provides a short and long form (i.e. `-t` or `--threads`), and can either be provided as input arguments or via a configuration file, depending on your preference. The are normally two steps in a typical workflow:
1. Process the ATAC-seq peak files to calculate the [MD-score statistic](https://genome.cshlp.org/content/28/3/334.short) for each motif provided.
2. Detect the most statistically significant changes in MD-score between both biological conditions, and generate MA and barcode plots.
### TL;DR;
If you satisfy all the Python library requirements, you can simply clone this repository and run `tf_activity_changes` with the following syntax:
$ ./tf_activity_changes PREFIX CONDITION_1_NAME CONDITION_2_NAME CONDITION_1_ATAC_PEAKS_FILE \
CONDITION_2_ATAC_PEAKS_FILE PATH_TO_MOTIF_FILES [NUMBER_OF_THREADS]
... then use `differential_md_score` (instructions below, or via `--help`) to explore which TFs are changing the most in activity for different p-value cutoffs.
### Usage examples
Unpack the motif files (see below for how to create your own, instead):
$ mkdir /path/to/hg19_motifs
$ tar xvfz motifs/human_motifs.tar.gz --directory /path/to/hg19_motifs
Calculate the MD-scores for the first biological condition:
$ process_atac --prefix 'mcf7_DMSO' --threads 8 --atac-peaks /path/to/DMSO/ATAC/peaks/file \
--motif-path /path/to/directory/containing/motif/files
The above command generates a file called `mcf7_DMSO_md_scores.txt`. The required prefix is a good way to keep track of what these new files represent. It's expected to be some sort of assay identifier and the biological condition, separated by a `_`; it's generally a good idea to use the cell type (or sample number) and a brief condition description (e.g. `k562_DMSO` or `SRR1234123_Metronidazole`). Alternatively, this could have been executed as:
$ process_atac --prefix 'mcf7_DMSO' --threads 8 --config /path/to/config/file.py
... where the contents of this configuration file `file.py` would look like:
atac_peaks_filename = '/path/to/DMSO/ATAC/peaks/file'
tf_motif_path = '/path/to/directory/containing/motif/files'
We would then generate the same file, for the other condition we are comparing against:
$ process_atac --prefix 'mcf7_Treatment' --threads 8 --atac-peaks /path/to/treatment/ATAC/peaks/file \
--motif-path /path/to/directory/containing/motif/files
The above generates a file called `mcf7_Treatment_md_scores.txt`. Finally:
$ differential_md_score --prefix mcf7 --assay-1 DMSO --assay-2 Treatment --p-value 0.0000001 -b
The above generates an MA plot that labels the most significant TF activity changes, at a p-value cutoff of 1e-7. Note that the condition names (DMSO and Treatment) were the same ones used earlier as the second half of the prefix. The plots look like the example below:
![Sample MA plot](./doc_files/sample_MA_plot.png)
The `-b` flag also generates a "barcode plot" of each of these statistically significat motif differences that depicts how close the ATAC-seq peak centers were to the motif centers, within a 1500 base-pair radius of the motif center:
![Sample barcode plot](./doc_files/sample_barcode_plot.png)
This entire process can be executed in this order by calling `tf_activity_changes`. If you can take advantage of multiprocessing, you can calculate MD-scores for both conditions simultaneously, assigning several threads to each, then generate the plots once both `*_md_scores.txt` files are ready.
### Motif Files
Feel free to use the motif files provided, [human_motifs.tar.gz](http://dowell.colorado.edu/pubs/DAStk/human_motifs.tar.gz) and [mouse_motifs.tar.gz](http://dowell.colorado.edu/pubs/DAStk/mouse_motifs.tar.gz) for the `hg19` and `mm10` reference genomes, respectively. They have been generated from HOCOMOCO's v10 mononucleotide model. To generate your own files for each motif, you can use FIMO in combination with the downloaded `.meme` files from your TF database of choice. For example, if using HOCOMOCO, you can create the motif file for TP53 using their mononucleotide model with a p-value threshold of 0.000001 by:
$ fimo -max-stored-scores 10000000 --thresh 1e-6 -oc /path/to/output/directory -motif /path/to/motif/file \
/path/to/HOCOMOCOv11_HUMAN_mono_meme_format.meme /path/to/whole_genome.fa
-----
### Citation
Please cite DAStk if you have used it in your research!
*Tripodi, I.J.; Allen, M.A.; Dowell, R.D. Detecting Differential Transcription Factor Activity from ATAC-Seq Data. Molecules 2018, 23, 1136.*
For any questions or bug reports, please use the Issue Tracker or email us:
*Ignacio Tripodi (ignacio.tripodi at colorado.edu)*
*Computer Science Department, BioFrontiers Institute*
*University of Colorado, Boulder, USA*
#!/bin/bash
# Create a conda environment
conda create -n dastk_env python=2.7
source activate dastk_env
pip install adjustText
doc_files/sample_MA_plot.png

72.2 KiB

doc_files/sample_barcode_plot.png

23.8 KiB

[metadata]
license_file = LICENSE
[bdist_wheel]
# support both Python 2 and Python 3
universal=1
setup.py 0 → 100644
#!/usr/bin/env python
# Always prefer setuptools over distutils
from setuptools import setup, find_packages
# To use a consistent encoding
from codecs import open
from os import path
here = path.abspath(path.dirname(__file__))
# Get the long description from the README file
with open(path.join(here, 'README.md'), encoding='utf-8') as f:
long_description = f.read()
setup(
name='DAStk',
version='0.1.5',
description='Differential ATAC-seq toolkit',
long_description=long_description,
license='BSD',
url='https://biof-git.colorado.edu/dowelllab/DAStk',
author='Ignacio Tripodi',
author_email='ignacio.tripodi@colorado.edu',
python_requires='>=2.7',
classifiers=[
'Development Status :: 4 - Beta',
'Environment :: Console',
'Intended Audience :: Developers',
'Intended Audience :: Science/Research',
'Topic :: Scientific/Engineering :: Bio-Informatics',
'License :: OSI Approved :: BSD License',
'Programming Language :: Python :: 2',
'Programming Language :: Python :: 3',
],
keywords='bioinformatics genomics chromatin ATAC-seq motif transcription_factor',
package_dir={'DAStk' : 'DAStk'},
packages=['DAStk'],
install_requires=[
'argparse',
'datetime',
'numpy',
'matplotlib',
'adjustText',
'scipy',
'pandas',
],
entry_points={
'console_scripts': [
'process_atac=DAStk:process_atac',
'differential_md_score=DAStk:differential_md_score',
],
},
project_urls={
'Bug Reports': 'https://biof-git.colorado.edu/dowelllab/DAStk/issues',
'Source': 'https://biof-git.colorado.edu/dowelllab/DAStk',
'Human Motif Sites': 'http://dowell.colorado.edu/pubs/DAStk/human_motifs.tar.gz',
'Mouse Motif Sites': 'http://dowell.colorado.edu/pubs/DAStk/mouse_motifs.tar.gz',
},
)
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment