FP push final changes to repo before moving.

f1b61220 · Felix Perez · fbc41119 · f1b61220 · f1b61220 · fbc41119
Commit f1b61220 authored 10 months ago by Felix Perez
--- a/astrocyte_pkg.yml
+++ b/astrocyte_pkg.yml
@@ -66,7 +66,7 @@ workflow_containers:
  - docker://git.biohpc.swmed.edu:5050/s219741/chipseq-analysis-containerized/r:3.3.2
  - docker://git.biohpc.swmed.edu:5050/s219741/chipseq-analysis-containerized/python:3.6.1
  - docker://git.biohpc.swmed.edu:5050/s219741/chipseq-analysis-containerized/chipseq:1.0.0
-  - docker://git.biohpc.swmed.edu:5050/s219741/chipseq-analysis-containerized/motif-search:1.0.1
+  - docker://git.biohpc.swmed.edu:5050/s219741/chipseq-analysis-containerized/motif-search:meme-4.11.1

 # A list of parameters used by the workflow, defining how to present them,
 # options etc in the web interface. For each parameter:

--- a/building/docker/chipseq/Dockerfile
+++ b/building/docker/chipseq/Dockerfile
-FROM continuumio/miniconda3
+FROM continuumio/miniconda3:24.3.0-0

 LABEL IMAGE="chipseq" \
    IMAGE_VERSION="1.0.0" \

--- a/building/docker/motif-search/Dockerfile-bak2
+++ b/building/docker/motif-search/Dockerfile-bak2
-FROM continuumio/miniconda3
-
-LABEL IMAGE="chipseq-analysis-motifSearch" \
-    IMAGE_VERSION="1.0.0" \
-    AUTHORS="Felix Perez,Peng Lian" \
-    EMAIL="biohpc-help@utsouthwestern.edu"
-
-WORKDIR /motif-search/
-
-# Environment variables for the campus proxy.
-ENV http_proxy="http://proxy.swmed.edu:3128" \
-    https_proxy="http://proxy.swmed.edu:3128"
-
-
-COPY entrypoint.sh .
-
-RUN eval "$(conda shell.bash hook)" && \
-    conda create -y -c bioconda -c conda-forge -c defaults -n motifSearch python=3.6.1 && \
-    conda activate motifSearch && \
-    # Install pandas and numpy.
-    pip install numpy==1.12.1 && \
-    pip install pandas==0.20.1 && \
-    # Install MEME 4.11.1
-    conda install -y --solver=libmamba -c conda-forge -c bioconda meme=4.11.2 && \
-    meme -version && \
-    # Install BEDTools 2.26 .0
-    conda install -y --solver=libmamba -c bioconda bedtools=2.26.0 && \
-    bedtools --version && \
-    conda deactivate
-
-ENTRYPOINT ["/motif-search/entrypoint.sh"]
--- a/building/docker/motif-search/Dockerfile
+++ b/building/docker/motif-search/Dockerfile
-FROM continuumio/miniconda3
+FROM continuumio/miniconda3:24.3.0-0

 LABEL IMAGE="chipseq-analysis-motifSearch" \
    IMAGE_VERSION="1.0.0" \
@@ -7,54 +7,48 @@ LABEL IMAGE="chipseq-analysis-motifSearch" \

 WORKDIR /motif-search/

+# Environment variables for the campus proxy.
+ENV http_proxy="http://proxy.swmed.edu:3128" \
+    https_proxy="http://proxy.swmed.edu:3128"
+

 # Original module load order:
 # module load python/3.6.1-2-anaconda
 # module load meme/4.11.1-gcc-openmpi
 # module load bedtools/2.26.0

-# Environment variables for the campus proxy.
-ENV http_proxy="http://proxy.swmed.edu:3128" \
-    https_proxy="http://proxy.swmed.edu:3128"
-

 COPY entrypoint.sh .

-    # Enable the use of bash-specific conda commands in the layer.
 RUN eval "$(conda shell.bash hook)" && \
-    chmod +x entrypoint.sh && \
    conda create -y -c bioconda -c conda-forge -c defaults -n motifSearch python=2.7.12 pip=9.0.1 && \
+    echo "python ==2.7.12" >> /opt/conda/envs/motifSearch/conda-meta/pinned && \
    conda activate motifSearch && \
-    python --version && \
    # Install pandas and numpy.
    pip install numpy==1.12.1 && \
    pip install pandas==0.20.1 && \
-    # Install BEDTools 2.26 .0
+    # Install MEME 4.11.1
+    conda install -y --solver=libmamba -c conda-forge -c bioconda meme=4.11.1 && \
+    meme -version && \
+    # Install BEDTools 2.26.0
    conda install -y --solver=libmamba -c bioconda bedtools=2.26.0 && \
    bedtools --version && \
-    conda clean -y --all && \
+    # The stringi installation below solves a dependency issue encountered when DREME is used (meme-4.11.1)
+    conda install -y --solver=libmamba -c conda-forge r-stringi=1.2.4 && \
    conda deactivate

-    # Install MEME 4.11.1
 RUN eval "$(conda shell.bash hook)" && \
-    apt-get update && apt-get install -y build-essential libxml2 libxml2-dev libxslt1-dev && \
-    conda activate motifSearch && \
-    conda install -y --solver=libmamba -c conda-forge perl libgcc-ng && \
-    wget https://meme-suite.org/meme/meme-software/4.11.1/meme_4.11.1.tar.gz && \
-    tar -zxf meme_4.11.1.tar.gz && \
-    rm meme_4.11.1.tar.gz && \
-    cd meme_4.11.1 && \
-    ./configure --prefix=/motif-search/meme --with-url="http://meme-suite.org" --enable-build-libxml2 --enable-build-libxslt && \
-    make && \
-    make test && \
-    make install && \
-    # Check that all relevant software has been installed correctly.
-    export PATH=/motif-search/meme/bin/:$PATH && \
-    meme -version && \
-    meme-chip --version && \
-    spamo --version && \
-    centrimo --version && \
-    apt-get remove -y build-essential libxml2 libxml2-dev libxslt1-dev && \
-    apt-get clean
+    # Create a Python 3.6.1 conda environment and initialize its use.
+    conda create -y -c bioconda -c defaults -c conda-forge -n py361 python=3.6.1 && \
+    echo "python ==3.6.1" >> /opt/conda/envs/py361/conda-meta/pinned && \
+    conda update conda -y && \
+    conda activate py361 && \
+    # Install the versions of numpy and pandas required for further software installs.
+    pip install numpy==1.12.1 && \
+    pip install pandas==0.20.1 && \
+    conda deactivate && \
+    ln -s /opt/conda/envs/py361/bin/python3 /opt/conda/envs/motifSearch/bin/python3
+
+RUN chmod +x entrypoint.sh

 ENTRYPOINT ["/motif-search/entrypoint.sh"]
--- a/building/docker/motif-search/entrypoint.sh
+++ b/building/docker/motif-search/entrypoint.sh
--- a/building/docker/python-3.6.1/Dockerfile
+++ b/building/docker/python-3.6.1/Dockerfile
-FROM continuumio/miniconda3
+FROM continuumio/miniconda3:24.3.0-0

 LABEL IMAGE="Python/3.6.1" \
    IMAGE_VERSION="0.0.1" \

--- a/building/docker/r-3.3.2/Dockerfile
+++ b/building/docker/r-3.3.2/Dockerfile
-FROM continuumio/miniconda3
+FROM continuumio/miniconda3:24.3.0-0

 LABEL IMAGE="R-3.3.2" \
    IMAGE_VERSION="1.0.0" \

--- a/workflow/configs/biohpc.config
+++ b/workflow/configs/biohpc.config
@@ -79,7 +79,7 @@ process {
  }
  withName: motifSearch {
    executor = 'local'
-    container = 'docker://git.biohpc.swmed.edu:5050/s219741/chipseq-analysis-containerized/motif-search:1.0.1'
+    container = 'docker://git.biohpc.swmed.edu:5050/s219741/chipseq-analysis-containerized/motif-search:meme-4.11.1'
    //cpus = 32
  }
  withName: multiqcReport {

--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -578,10 +578,9 @@ process motifSearch {

  """
  source /motif-search/entrypoint.sh
-  python3 $baseDir/scripts/motif_search.py -d $designMotifSearch -g $fasta -p $topPeakCount
+  python $baseDir/scripts/motif_search.py -d $designMotifSearch -g $fasta -p $topPeakCount
  """
 }
-/*

 // Define channel to find number of unique experiments
 uniqueExperimentsList = uniqueExperiments
@@ -657,5 +656,4 @@ process multiqcReport {
  python3 $baseDir/scripts/generate_versions.py -o software_versions
  multiqc -c $multiqc .
  """
-}
-*/
\ No newline at end of file
+}
\ No newline at end of file
--- a/workflow/scripts/motif_search.py
+++ b/workflow/scripts/motif_search.py
-#!/usr/bin/env python3
+#!/usr/bin/env python2

 #
 # * --------------------------------------------------------------------------
@@ -15,7 +15,7 @@ import shutil
 import subprocess
 from multiprocessing import Pool
 import pandas as pd
-import utils
+import utils2 as utils


 EPILOG = '''
@@ -67,8 +67,10 @@ def check_tools():
    '''Checks for required componenets on user system'''

    logger.info('Checking for required libraries and components on this system')
-
-    meme_path = shutil.which("meme")
+    stream = os.popen('which meme')
+    output = stream.read()
+    meme_path = output.strip()
+    #meme_path = shutil.which("meme")
    if meme_path:
        logger.info('Found meme: %s', meme_path)

@@ -83,8 +85,10 @@ def check_tools():
    else:
        logger.error('Missing meme')
        raise Exception('Missing meme')
-
-    bedtools_path = shutil.which("bedtools")
+    stream = os.popen('which bedtools')
+    output = stream.read()
+    bedtools_path = output.strip()
+    
    if bedtools_path:
        logger.info('Found bedtools: %s', bedtools_path)


--- a/workflow/scripts/motif_search.py.original
+++ b/workflow/scripts/motif_search.py.original
+#!/usr/bin/env python3
+
+#
+# * --------------------------------------------------------------------------
+# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
+# * --------------------------------------------------------------------------
+#
+
+'''Call Motifs on called peaks.'''
+
+import os
+import argparse
+import logging
+import shutil
+import subprocess
+from multiprocessing import Pool
+import pandas as pd
+import utils
+
+
+EPILOG = '''
+For more details:
+        %(prog)s --help
+'''
+
+# SETTINGS
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+logger.propagate = False
+logger.setLevel(logging.INFO)
+
+
+# the order of this list is important.
+# strip_extensions strips from the right inward, so
+# the expected right-most extensions should appear first (like .gz)
+# Modified from J. Seth Strattan
+STRIP_EXTENSIONS = ['.narrowPeak', '.replicated']
+
+
+def get_args():
+    '''Define arguments.'''
+
+    parser = argparse.ArgumentParser(
+        description=__doc__, epilog=EPILOG,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+
+    parser.add_argument('-d', '--design',
+                        help="The design file to run motif search.",
+                        required=True)
+
+    parser.add_argument('-g', '--genome',
+                        help="The genome FASTA file.",
+                        required=True)
+
+    parser.add_argument('-p', '--peak',
+                        help="The number of peaks to use.",
+                        required=True)
+
+    args = parser.parse_args()
+    return args
+
+# Functions
+
+
+def check_tools():
+    '''Checks for required componenets on user system'''
+
+    logger.info('Checking for required libraries and components on this system')
+
+    meme_path = shutil.which("meme")
+    if meme_path:
+        logger.info('Found meme: %s', meme_path)
+
+        # Get Version
+        memechip_version_command = "meme-chip --version"
+        memechip_version = subprocess.check_output(memechip_version_command, shell=True)
+
+        # Write to file
+        meme_file = open("version_memechip.txt", "wb")
+        meme_file.write(b"Version %s" % (memechip_version))
+        meme_file.close()
+    else:
+        logger.error('Missing meme')
+        raise Exception('Missing meme')
+
+    bedtools_path = shutil.which("bedtools")
+    if bedtools_path:
+        logger.info('Found bedtools: %s', bedtools_path)
+
+        # Get Version
+        bedtools_version_command = "bedtools --version"
+        bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
+
+        # Write to file
+        bedtools_file = open("version_bedtools.txt", "wb")
+        bedtools_file.write(bedtools_version)
+        bedtools_file.close()
+    else:
+        logger.error('Missing bedtools')
+        raise Exception('Missing bedtools')
+
+
+def run_wrapper(args):
+    motif_search(*args)
+
+
+def motif_search(filename, genome, experiment, peak):
+    '''Run motif serach on peaks.'''
+
+    file_basename = os.path.basename(
+        utils.strip_extensions(filename, STRIP_EXTENSIONS))
+
+    out_fa = '%s.fa' % (experiment)
+    out_motif = '%s_memechip' % (experiment)
+
+    # Sort Bed file and limit number of peaks
+    if peak == -1:
+        peak = utils.count_lines(filename)
+        peak_no = 'all'
+    else:
+        peak_no = peak
+
+    sorted_fn = '%s.%s.narrowPeak' % (file_basename, peak_no)
+
+    out, err = utils.run_pipe([
+        'sort -k %dgr,%dgr %s' % (5, 5, filename),
+        'head -n %s' % (peak)], outfile=sorted_fn)
+
+    # Get fasta file
+    out, err = utils.run_pipe([
+        'bedtools getfasta -fi %s -bed %s -fo %s' % (genome, sorted_fn, out_fa)])
+
+    if err:
+        logger.error("bedtools error: %s", err)
+
+    # Call memechip
+    out, err = utils.run_pipe([
+        'meme-chip -oc %s -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 %s -norand' % (out_motif, out_fa)])
+    if err:
+        logger.error("meme-chip error: %s", err)
+
+
+def main():
+    args = get_args()
+    design = args.design
+    genome = args.genome
+    peak = args.peak
+
+    # Create a file handler
+    handler = logging.FileHandler('motif.log')
+    logger.addHandler(handler)
+
+    # Check if tools are present
+    check_tools()
+
+    # Read files
+    design_df = pd.read_csv(design, sep='\t')
+
+    meme_arglist = zip(design_df['Peaks'].tolist(), [genome]*design_df.shape[0], design_df['Condition'].tolist(), [peak]*design_df.shape[0])
+    work_pool = Pool(min(12, design_df.shape[0]))
+    return_list = work_pool.map(run_wrapper, meme_arglist)
+    work_pool.close()
+    work_pool.join()
+
+
+if __name__ == '__main__':
+    main()
--- a/workflow/scripts/utils2.py
+++ b/workflow/scripts/utils2.py
+#!/usr/bin/env python2
+
+#
+# * --------------------------------------------------------------------------
+# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
+# * --------------------------------------------------------------------------
+#
+
+'''General utilities.'''
+
+
+import shlex
+import logging
+import subprocess
+import sys
+import os
+
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+logger.propagate = True
+
+
+def run_pipe(steps, outfile=None):
+    from subprocess import Popen, PIPE
+    p = None
+    p_next = None
+    first_step_n = 1
+    last_step_n = len(steps)
+    for n, step in enumerate(steps, start=first_step_n):
+        logger.debug("step %d: %s" % (n, step))
+        if n == first_step_n:
+            if n == last_step_n and outfile:  # one-step pipeline with outfile
+                with open(outfile, 'w') as fh:
+                    print("one step shlex: %s to file: %s" % (shlex.split(step), outfile))
+                    p = Popen(shlex.split(step), stdout=fh)
+                break
+            print("first step shlex to stdout: %s" % (shlex.split(step)))
+
+            p = Popen(shlex.split(step), stdout=PIPE)
+        elif n == last_step_n and outfile:  # only treat the last step specially if you're sending stdout to a file
+            with open(outfile, 'w') as fh:
+                print("last step shlex: %s to file: %s" % (shlex.split(step), outfile))
+                p_last = Popen(shlex.split(step), stdin=p.stdout, stdout=fh)
+                p.stdout.close()
+                p = p_last
+        else:  # handles intermediate steps and, in the case of a pipe to stdout, the last step
+            print("intermediate step %d shlex to stdout: %s" % (n, shlex.split(step)))
+            p_next = Popen(shlex.split(step), stdin=p.stdout, stdout=PIPE)
+            p.stdout.close()
+            p = p_next
+    out, err = p.communicate()
+    return out, err
+
+
+def block_on(command):
+    process = subprocess.Popen(shlex.split(command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
+    for line in iter(process.stdout.readline, b''):
+        sys.stdout.write(line.decode('utf-8'))
+    process.communicate()
+    return process.returncode
+
+
+def strip_extensions(filename, extensions):
+    '''Strips extensions to get basename of file.'''
+
+    basename = filename
+    for extension in extensions:
+        basename = basename.rpartition(extension)[0] or basename
+
+    return basename
+
+
+def count_lines(filename):
+    import mimetypes
+    compressed_mimetypes = [
+        "compress",
+        "bzip2",
+        "gzip"
+        ]
+    mime_type = mimetypes.guess_type(filename)[1]
+    if mime_type in compressed_mimetypes:
+        catcommand = 'gzip -dc'
+    else:
+        catcommand = 'cat'
+    out, err = run_pipe([
+        '%s %s' % (catcommand, filename),
+        'wc -l'
+        ])
+    return int(out)
+
+
+def rescale_scores(filename, scores_col, new_min=10, new_max=1000):
+    sorted_fn = 'sorted-%s' % (filename)
+    rescaled_fn = 'rescaled-%s' % (filename)
+
+    out, err = run_pipe([
+        'sort -k %dgr,%dgr %s' % (scores_col, scores_col, filename),
+        r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (NF != 0) print $0}'"""],
+        sorted_fn)
+
+    out, err = run_pipe([
+        'head -n 1 %s' % (sorted_fn),
+        'cut -f %s' % (scores_col)])
+    max_score = float(out.strip())
+    logger.info("rescale_scores: max_score = %s", max_score)
+
+    out, err = run_pipe([
+        'tail -n 1 %s' % (sorted_fn),
+        'cut -f %s' % (scores_col)])
+    min_score = float(out.strip())
+    logger.info("rescale_scores: min_score = %s", min_score)
+
+    a = min_score
+    b = max_score
+    x = new_min
+    y = new_max
+
+    if min_score == max_score:  # give all peaks new_min
+        rescale_formula = "x"
+    else:  # n is the unscaled score from scores_col
+        rescale_formula = "((n-a)*(y-x)/(b-a))+x"
+
+    out, err = run_pipe(
+        [
+            'cat %s' % (sorted_fn),
+            r"""awk 'BEGIN{OFS="\t"}{n=$%d;a=%d;b=%d;x=%d;y=%d}"""
+            % (scores_col, a, b, x, y) +
+            r"""{$%d=int(%s) ; print $0}'"""
+            % (scores_col, rescale_formula)
+        ],
+        rescaled_fn)
+
+    os.remove(sorted_fn)
+
+    return rescaled_fn