Skip to content
Snippets Groups Projects
Commit f1b61220 authored by Felix Perez's avatar Felix Perez
Browse files

FP push final changes to repo before moving.

parent fbc41119
3 merge requests!3Containerized version of chip-seq analysis,!2Added new Docker file for motif search (named meme-5.5.4) installed from...,!1Added new Docker file for motif search (named meme-5.5.4) installed from...
......@@ -66,7 +66,7 @@ workflow_containers:
- docker://git.biohpc.swmed.edu:5050/s219741/chipseq-analysis-containerized/r:3.3.2
- docker://git.biohpc.swmed.edu:5050/s219741/chipseq-analysis-containerized/python:3.6.1
- docker://git.biohpc.swmed.edu:5050/s219741/chipseq-analysis-containerized/chipseq:1.0.0
- docker://git.biohpc.swmed.edu:5050/s219741/chipseq-analysis-containerized/motif-search:1.0.1
- docker://git.biohpc.swmed.edu:5050/s219741/chipseq-analysis-containerized/motif-search:meme-4.11.1
# A list of parameters used by the workflow, defining how to present them,
# options etc in the web interface. For each parameter:
......
FROM continuumio/miniconda3
FROM continuumio/miniconda3:24.3.0-0
LABEL IMAGE="chipseq" \
IMAGE_VERSION="1.0.0" \
......
FROM continuumio/miniconda3
LABEL IMAGE="chipseq-analysis-motifSearch" \
IMAGE_VERSION="1.0.0" \
AUTHORS="Felix Perez,Peng Lian" \
EMAIL="biohpc-help@utsouthwestern.edu"
WORKDIR /motif-search/
# Environment variables for the campus proxy.
ENV http_proxy="http://proxy.swmed.edu:3128" \
https_proxy="http://proxy.swmed.edu:3128"
COPY entrypoint.sh .
RUN eval "$(conda shell.bash hook)" && \
conda create -y -c bioconda -c conda-forge -c defaults -n motifSearch python=3.6.1 && \
conda activate motifSearch && \
# Install pandas and numpy.
pip install numpy==1.12.1 && \
pip install pandas==0.20.1 && \
# Install MEME 4.11.1
conda install -y --solver=libmamba -c conda-forge -c bioconda meme=4.11.2 && \
meme -version && \
# Install BEDTools 2.26 .0
conda install -y --solver=libmamba -c bioconda bedtools=2.26.0 && \
bedtools --version && \
conda deactivate
ENTRYPOINT ["/motif-search/entrypoint.sh"]
FROM continuumio/miniconda3
FROM continuumio/miniconda3:24.3.0-0
LABEL IMAGE="chipseq-analysis-motifSearch" \
IMAGE_VERSION="1.0.0" \
......@@ -7,54 +7,48 @@ LABEL IMAGE="chipseq-analysis-motifSearch" \
WORKDIR /motif-search/
# Environment variables for the campus proxy.
ENV http_proxy="http://proxy.swmed.edu:3128" \
https_proxy="http://proxy.swmed.edu:3128"
# Original module load order:
# module load python/3.6.1-2-anaconda
# module load meme/4.11.1-gcc-openmpi
# module load bedtools/2.26.0
# Environment variables for the campus proxy.
ENV http_proxy="http://proxy.swmed.edu:3128" \
https_proxy="http://proxy.swmed.edu:3128"
COPY entrypoint.sh .
# Enable the use of bash-specific conda commands in the layer.
RUN eval "$(conda shell.bash hook)" && \
chmod +x entrypoint.sh && \
conda create -y -c bioconda -c conda-forge -c defaults -n motifSearch python=2.7.12 pip=9.0.1 && \
echo "python ==2.7.12" >> /opt/conda/envs/motifSearch/conda-meta/pinned && \
conda activate motifSearch && \
python --version && \
# Install pandas and numpy.
pip install numpy==1.12.1 && \
pip install pandas==0.20.1 && \
# Install BEDTools 2.26 .0
# Install MEME 4.11.1
conda install -y --solver=libmamba -c conda-forge -c bioconda meme=4.11.1 && \
meme -version && \
# Install BEDTools 2.26.0
conda install -y --solver=libmamba -c bioconda bedtools=2.26.0 && \
bedtools --version && \
conda clean -y --all && \
# The stringi installation below solves a dependency issue encountered when DREME is used (meme-4.11.1)
conda install -y --solver=libmamba -c conda-forge r-stringi=1.2.4 && \
conda deactivate
# Install MEME 4.11.1
RUN eval "$(conda shell.bash hook)" && \
apt-get update && apt-get install -y build-essential libxml2 libxml2-dev libxslt1-dev && \
conda activate motifSearch && \
conda install -y --solver=libmamba -c conda-forge perl libgcc-ng && \
wget https://meme-suite.org/meme/meme-software/4.11.1/meme_4.11.1.tar.gz && \
tar -zxf meme_4.11.1.tar.gz && \
rm meme_4.11.1.tar.gz && \
cd meme_4.11.1 && \
./configure --prefix=/motif-search/meme --with-url="http://meme-suite.org" --enable-build-libxml2 --enable-build-libxslt && \
make && \
make test && \
make install && \
# Check that all relevant software has been installed correctly.
export PATH=/motif-search/meme/bin/:$PATH && \
meme -version && \
meme-chip --version && \
spamo --version && \
centrimo --version && \
apt-get remove -y build-essential libxml2 libxml2-dev libxslt1-dev && \
apt-get clean
# Create a Python 3.6.1 conda environment and initialize its use.
conda create -y -c bioconda -c defaults -c conda-forge -n py361 python=3.6.1 && \
echo "python ==3.6.1" >> /opt/conda/envs/py361/conda-meta/pinned && \
conda update conda -y && \
conda activate py361 && \
# Install the versions of numpy and pandas required for further software installs.
pip install numpy==1.12.1 && \
pip install pandas==0.20.1 && \
conda deactivate && \
ln -s /opt/conda/envs/py361/bin/python3 /opt/conda/envs/motifSearch/bin/python3
RUN chmod +x entrypoint.sh
ENTRYPOINT ["/motif-search/entrypoint.sh"]
FROM continuumio/miniconda3
FROM continuumio/miniconda3:24.3.0-0
LABEL IMAGE="Python/3.6.1" \
IMAGE_VERSION="0.0.1" \
......
FROM continuumio/miniconda3
FROM continuumio/miniconda3:24.3.0-0
LABEL IMAGE="R-3.3.2" \
IMAGE_VERSION="1.0.0" \
......
......@@ -79,7 +79,7 @@ process {
}
withName: motifSearch {
executor = 'local'
container = 'docker://git.biohpc.swmed.edu:5050/s219741/chipseq-analysis-containerized/motif-search:1.0.1'
container = 'docker://git.biohpc.swmed.edu:5050/s219741/chipseq-analysis-containerized/motif-search:meme-4.11.1'
//cpus = 32
}
withName: multiqcReport {
......
......@@ -578,10 +578,9 @@ process motifSearch {
"""
source /motif-search/entrypoint.sh
python3 $baseDir/scripts/motif_search.py -d $designMotifSearch -g $fasta -p $topPeakCount
python $baseDir/scripts/motif_search.py -d $designMotifSearch -g $fasta -p $topPeakCount
"""
}
/*
// Define channel to find number of unique experiments
uniqueExperimentsList = uniqueExperiments
......@@ -657,5 +656,4 @@ process multiqcReport {
python3 $baseDir/scripts/generate_versions.py -o software_versions
multiqc -c $multiqc .
"""
}
*/
\ No newline at end of file
}
\ No newline at end of file
#!/usr/bin/env python3
#!/usr/bin/env python2
#
# * --------------------------------------------------------------------------
......@@ -15,7 +15,7 @@ import shutil
import subprocess
from multiprocessing import Pool
import pandas as pd
import utils
import utils2 as utils
EPILOG = '''
......@@ -67,8 +67,10 @@ def check_tools():
'''Checks for required componenets on user system'''
logger.info('Checking for required libraries and components on this system')
meme_path = shutil.which("meme")
stream = os.popen('which meme')
output = stream.read()
meme_path = output.strip()
#meme_path = shutil.which("meme")
if meme_path:
logger.info('Found meme: %s', meme_path)
......@@ -83,8 +85,10 @@ def check_tools():
else:
logger.error('Missing meme')
raise Exception('Missing meme')
bedtools_path = shutil.which("bedtools")
stream = os.popen('which bedtools')
output = stream.read()
bedtools_path = output.strip()
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
......
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Call Motifs on called peaks.'''
import os
import argparse
import logging
import shutil
import subprocess
from multiprocessing import Pool
import pandas as pd
import utils
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.narrowPeak', '.replicated']
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-d', '--design',
help="The design file to run motif search.",
required=True)
parser.add_argument('-g', '--genome',
help="The genome FASTA file.",
required=True)
parser.add_argument('-p', '--peak',
help="The number of peaks to use.",
required=True)
args = parser.parse_args()
return args
# Functions
def check_tools():
'''Checks for required componenets on user system'''
logger.info('Checking for required libraries and components on this system')
meme_path = shutil.which("meme")
if meme_path:
logger.info('Found meme: %s', meme_path)
# Get Version
memechip_version_command = "meme-chip --version"
memechip_version = subprocess.check_output(memechip_version_command, shell=True)
# Write to file
meme_file = open("version_memechip.txt", "wb")
meme_file.write(b"Version %s" % (memechip_version))
meme_file.close()
else:
logger.error('Missing meme')
raise Exception('Missing meme')
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
def run_wrapper(args):
motif_search(*args)
def motif_search(filename, genome, experiment, peak):
'''Run motif serach on peaks.'''
file_basename = os.path.basename(
utils.strip_extensions(filename, STRIP_EXTENSIONS))
out_fa = '%s.fa' % (experiment)
out_motif = '%s_memechip' % (experiment)
# Sort Bed file and limit number of peaks
if peak == -1:
peak = utils.count_lines(filename)
peak_no = 'all'
else:
peak_no = peak
sorted_fn = '%s.%s.narrowPeak' % (file_basename, peak_no)
out, err = utils.run_pipe([
'sort -k %dgr,%dgr %s' % (5, 5, filename),
'head -n %s' % (peak)], outfile=sorted_fn)
# Get fasta file
out, err = utils.run_pipe([
'bedtools getfasta -fi %s -bed %s -fo %s' % (genome, sorted_fn, out_fa)])
if err:
logger.error("bedtools error: %s", err)
# Call memechip
out, err = utils.run_pipe([
'meme-chip -oc %s -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 %s -norand' % (out_motif, out_fa)])
if err:
logger.error("meme-chip error: %s", err)
def main():
args = get_args()
design = args.design
genome = args.genome
peak = args.peak
# Create a file handler
handler = logging.FileHandler('motif.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Read files
design_df = pd.read_csv(design, sep='\t')
meme_arglist = zip(design_df['Peaks'].tolist(), [genome]*design_df.shape[0], design_df['Condition'].tolist(), [peak]*design_df.shape[0])
work_pool = Pool(min(12, design_df.shape[0]))
return_list = work_pool.map(run_wrapper, meme_arglist)
work_pool.close()
work_pool.join()
if __name__ == '__main__':
main()
#!/usr/bin/env python2
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''General utilities.'''
import shlex
import logging
import subprocess
import sys
import os
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = True
def run_pipe(steps, outfile=None):
from subprocess import Popen, PIPE
p = None
p_next = None
first_step_n = 1
last_step_n = len(steps)
for n, step in enumerate(steps, start=first_step_n):
logger.debug("step %d: %s" % (n, step))
if n == first_step_n:
if n == last_step_n and outfile: # one-step pipeline with outfile
with open(outfile, 'w') as fh:
print("one step shlex: %s to file: %s" % (shlex.split(step), outfile))
p = Popen(shlex.split(step), stdout=fh)
break
print("first step shlex to stdout: %s" % (shlex.split(step)))
p = Popen(shlex.split(step), stdout=PIPE)
elif n == last_step_n and outfile: # only treat the last step specially if you're sending stdout to a file
with open(outfile, 'w') as fh:
print("last step shlex: %s to file: %s" % (shlex.split(step), outfile))
p_last = Popen(shlex.split(step), stdin=p.stdout, stdout=fh)
p.stdout.close()
p = p_last
else: # handles intermediate steps and, in the case of a pipe to stdout, the last step
print("intermediate step %d shlex to stdout: %s" % (n, shlex.split(step)))
p_next = Popen(shlex.split(step), stdin=p.stdout, stdout=PIPE)
p.stdout.close()
p = p_next
out, err = p.communicate()
return out, err
def block_on(command):
process = subprocess.Popen(shlex.split(command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
for line in iter(process.stdout.readline, b''):
sys.stdout.write(line.decode('utf-8'))
process.communicate()
return process.returncode
def strip_extensions(filename, extensions):
'''Strips extensions to get basename of file.'''
basename = filename
for extension in extensions:
basename = basename.rpartition(extension)[0] or basename
return basename
def count_lines(filename):
import mimetypes
compressed_mimetypes = [
"compress",
"bzip2",
"gzip"
]
mime_type = mimetypes.guess_type(filename)[1]
if mime_type in compressed_mimetypes:
catcommand = 'gzip -dc'
else:
catcommand = 'cat'
out, err = run_pipe([
'%s %s' % (catcommand, filename),
'wc -l'
])
return int(out)
def rescale_scores(filename, scores_col, new_min=10, new_max=1000):
sorted_fn = 'sorted-%s' % (filename)
rescaled_fn = 'rescaled-%s' % (filename)
out, err = run_pipe([
'sort -k %dgr,%dgr %s' % (scores_col, scores_col, filename),
r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (NF != 0) print $0}'"""],
sorted_fn)
out, err = run_pipe([
'head -n 1 %s' % (sorted_fn),
'cut -f %s' % (scores_col)])
max_score = float(out.strip())
logger.info("rescale_scores: max_score = %s", max_score)
out, err = run_pipe([
'tail -n 1 %s' % (sorted_fn),
'cut -f %s' % (scores_col)])
min_score = float(out.strip())
logger.info("rescale_scores: min_score = %s", min_score)
a = min_score
b = max_score
x = new_min
y = new_max
if min_score == max_score: # give all peaks new_min
rescale_formula = "x"
else: # n is the unscaled score from scores_col
rescale_formula = "((n-a)*(y-x)/(b-a))+x"
out, err = run_pipe(
[
'cat %s' % (sorted_fn),
r"""awk 'BEGIN{OFS="\t"}{n=$%d;a=%d;b=%d;x=%d;y=%d}"""
% (scores_col, a, b, x, y) +
r"""{$%d=int(%s) ; print $0}'"""
% (scores_col, rescale_formula)
],
rescaled_fn)
os.remove(sorted_fn)
return rescaled_fn
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment