Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • BICF/Astrocyte/chipseq_analysis
  • s190984/chipseq_analysis
  • astrocyte/workflows/bicf/chipseq_analysis
  • s219741/chipseq-analysis-containerized
Show changes
singularity {
enabled = true
runOptions = '' // Please do NOT use "--disable-cache" in this runOptions.
// Starting from version 2.0.0, the astrocyte_cli will clean up the cache automatically.
// runOptions = '--bind /vagrant:/vagrant' // Use this one for vagrant development env only
cacheDir = "$baseDir/images/singularity" // Singularity images specified in `workflow_containers` of astrocyte_pkg.yml will be saved to
// this folder automatically, before running the workflow. The images will be renamed as
// "NAME-TAG.img", e.g. ubuntu-latest.img, centos-centos8.img, r-4.1.1.img, etc.
}
process {
executor = 'slurm'
queue = 'super'
......@@ -5,69 +15,73 @@ process {
beforeScript= 'ulimit -Ss unlimited'
// Process specific configuration
withName: checkDesignFile {
module = ['python/3.6.1-2-anaconda']
withName: trackStart {
executor = 'local'
}
withName: checkDesignFile {
container = 'docker://git.biohpc.swmed.edu:5050/astrocyte/workflows/bicf/chipseq_analysis/py36:1.0.0'
queue = 'super'
}
withName: trimReads {
module = ['python/3.6.1-2-anaconda', 'trimgalore/0.4.1']
cpus = 32
container = 'docker://git.biohpc.swmed.edu:5050/astrocyte/workflows/bicf/chipseq_analysis/chipseq:1.0.0'
queue = '256GB,256GBv1'
}
withName: alignReads{
module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.12', 'samtools/1.6']
queue = '128GB,256GB,256GBv1'
container = 'docker://git.biohpc.swmed.edu:5050/astrocyte/workflows/bicf/chipseq_analysis/chipseq:1.0.0'
queue = '256GB,256GBv1'
}
withName: filterReads{
module = ['python/3.6.1-2-anaconda', 'samtools/1.6', 'sambamba/0.6.6', 'bedtools/2.26.0']
queue = '128GB,256GB,256GBv1'
container = 'docker://git.biohpc.swmed.edu:5050/astrocyte/workflows/bicf/chipseq_analysis/chipseq:1.0.0'
queue = '256GB,256GBv1'
}
withName: experimentQC {
module = ['python/3.6.1-2-anaconda', 'deeptools/2.5.0.1']
queue = '128GB,256GB,256GBv1'
container = 'docker://git.biohpc.swmed.edu:5050/astrocyte/workflows/bicf/chipseq_analysis/chipseq:1.0.0'
queue = '256GB,256GBv1'
}
withName: convertReads {
module = ['python/3.6.1-2-anaconda', 'samtools/1.6', 'bedtools/2.26.0']
queue = '128GB,256GB,256GBv1'
container = 'docker://git.biohpc.swmed.edu:5050/astrocyte/workflows/bicf/chipseq_analysis/chipseq:1.0.0'
queue = '256GB,256GBv1'
}
withName: crossReads {
module = ['python/3.6.1-2-anaconda', 'phantompeakqualtools/1.2']
cpus = 32
container = 'docker://git.biohpc.swmed.edu:5050/astrocyte/workflows/bicf/chipseq_analysis/chipseq:1.0.0'
queue = 'super'
}
withName: defineExpDesignFiles {
module = ['python/3.6.1-2-anaconda']
executor = 'local'
container = 'docker://git.biohpc.swmed.edu:5050/astrocyte/workflows/bicf/chipseq_analysis/py36:1.0.0'
queue = 'super'
}
withName: poolAndPsuedoReads {
module = ['python/3.6.1-2-anaconda']
executor = 'local'
container = 'docker://git.biohpc.swmed.edu:5050/astrocyte/workflows/bicf/chipseq_analysis/py36:1.0.0'
queue = 'super'
}
withName: callPeaksMACS {
module = ['python/3.6.1-2-anaconda', 'macs/2.1.0-20151222', 'UCSC_userApps/v317', 'bedtools/2.26.0', 'phantompeakqualtools/1.2']
container = 'docker://git.biohpc.swmed.edu:5050/astrocyte/workflows/bicf/chipseq_analysis/chipseq:1.0.0'
queue = '128GB,256GB,256GBv1'
}
withName: plotProfile {
module = ['deeptools/2.5.0.1']
cpus = 32
container = 'docker://git.biohpc.swmed.edu:5050/astrocyte/workflows/bicf/chipseq_analysis/chipseq:1.0.0'
queue = 'super'
}
withName: consensusPeaks {
module = ['python/3.6.1-2-anaconda', 'bedtools/2.26.0']
executor = 'local'
container = 'docker://git.biohpc.swmed.edu:5050/astrocyte/workflows/bicf/chipseq_analysis/chipseq:1.0.0'
queue = 'super'
}
withName: peakAnnotation {
module = ['R/3.3.2-gccmkl']
executor = 'local'
container = 'docker://git.biohpc.swmed.edu:5050/astrocyte/workflows/bicf/chipseq_analysis/r:3.3.2'
queue = 'super'
}
withName: diffPeaks {
module = ['R/3.3.2-gccmkl']
cpus = 32
container = 'docker://git.biohpc.swmed.edu:5050/astrocyte/workflows/bicf/chipseq_analysis/r:3.3.2'
queue = '128GB,256GB,256GBv1'
}
withName: motifSearch {
module = ['python/3.6.1-2-anaconda', 'meme/4.11.1-gcc-openmpi', 'bedtools/2.26.0']
cpus = 32
container = 'docker://git.biohpc.swmed.edu:5050/astrocyte/workflows/bicf/chipseq_analysis/motif-search:meme-5.5.4'
queue = 'super'
}
withName: multiqcReport {
module = ['python/3.6.1-2-anaconda', 'pandoc/2.7', 'singularity/3.0.2']
executor = 'local'
container = 'docker://git.biohpc.swmed.edu:5050/astrocyte/workflows/bicf/chipseq_analysis/chipseq:1.0.0'
queue = 'super'
}
}
......@@ -116,3 +130,16 @@ report {
enabled = true
file = 'report.html'
}
env {
http_proxy = 'http://proxy.swmed.edu:3128'
https_proxy = 'http://proxy.swmed.edu:3128'
all_proxy = 'http://proxy.swmed.edu:3128'
}
manifest {
mainScript = 'main.nf'
version = '2.0.1'
nextflowVersion = '>=19.07.0'
}
......@@ -27,7 +27,7 @@ params.skipDiff = false
params.skipMotif = false
params.skipPlotProfile = false
params.references = "$baseDir/../docs/references.md"
params.multiqc = "$baseDir/conf/multiqc_config.yaml"
params.multiqc = "$baseDir/configs/multiqc_config.yaml"
params.ci = false
params.dev = false
......@@ -147,14 +147,14 @@ process checkDesignFile {
if (pairedEnd) {
"""
module load python/3.6.1-2-anaconda
source /python361/entrypoint.sh
python3 $baseDir/scripts/check_design.py -d $designFile -f $readsList -p
"""
}
else {
"""
module load python/3.6.1-2-anaconda
python $baseDir/scripts/check_design.py -d $designFile -f $readsList
source /python361/entrypoint.sh
python3 $baseDir/scripts/check_design.py -d $designFile -f $readsList
"""
}
......@@ -176,7 +176,7 @@ process trimReads {
tag "$sampleId-$replicate"
publishDir "$outDir/${task.process}/${sampleId}", mode: 'copy'
maxForks 4
input:
set sampleId, reads, experimentId, biosample, factor, treatment, replicate, controlId from rawReads
......@@ -191,15 +191,13 @@ process trimReads {
if (pairedEnd) {
"""
module load python/3.6.1-2-anaconda
module load trimgalore/0.4.1
source /chipseq/entrypoint.sh
python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -s $sampleId -p
"""
}
else {
"""
module load python/3.6.1-2-anaconda
module load trimgalore/0.4.1
source /chipseq/entrypoint.sh
python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} -s $sampleId
"""
}
......@@ -209,10 +207,10 @@ process trimReads {
// Align trimmed reads using bwa
process alignReads {
queue '128GB,256GB,256GBv1'
queue '256GB,256GBv1'
tag "$sampleId-$replicate"
publishDir "$outDir/${task.process}/${sampleId}", mode: 'copy'
maxForks 4
input:
set sampleId, reads, experimentId, biosample, factor, treatment, replicate, controlId from trimmedReads
......@@ -228,17 +226,15 @@ process alignReads {
if (pairedEnd) {
"""
module load python/3.6.1-2-anaconda
module load bwa/intel/0.7.12
module load samtools/1.6
source /chipseq/entrypoint.sh
free -hm
python3 $baseDir/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -s $sampleId -p
"""
}
else {
"""
module load python/3.6.1-2-anaconda
module load bwa/intel/0.7.12
module load samtools/1.6
source /chipseq/entrypoint.sh
free -hm
python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa -s $sampleId
"""
}
......@@ -251,7 +247,7 @@ process filterReads {
queue '128GB,256GB,256GBv1'
tag "$sampleId-$replicate"
publishDir "$outDir/${task.process}/${sampleId}", mode: 'copy'
maxForks 4
input:
set sampleId, mapped, experimentId, biosample, factor, treatment, replicate, controlId from mappedReads
......@@ -269,19 +265,13 @@ process filterReads {
if (pairedEnd) {
"""
module load python/3.6.1-2-anaconda
module load samtools/1.6
module load sambamba/0.6.6
module load bedtools/2.26.0
source /chipseq/entrypoint.sh
python3 $baseDir/scripts/map_qc.py -b $mapped -p
"""
}
else {
"""
module load python/3.6.1-2-anaconda
module load samtools/1.6
module load sambamba/0.6.6
module load bedtools/2.26.0
source /chipseq/entrypoint.sh
python3 $baseDir/scripts/map_qc.py -b $mapped
"""
}
......@@ -313,8 +303,7 @@ process experimentQC {
script:
"""
module load python/3.6.1-2-anaconda
module load deeptools/2.5.0.1
source /chipseq/entrypoint.sh
python3 $baseDir/scripts/experiment_qc.py -d $dedupDesign -e $extendReadsLen
"""
......@@ -340,17 +329,13 @@ process convertReads {
if (pairedEnd) {
"""
module load python/3.6.1-2-anaconda
module load samtools/1.6
module load bedtools/2.26.0
source /chipseq/entrypoint.sh
python3 $baseDir/scripts/convert_reads.py -b $deduped -p
"""
}
else {
"""
module load python/3.6.1-2-anaconda
module load samtools/1.6
module load bedtools/2.26.0
source /chipseq/entrypoint.sh
python3 $baseDir/scripts/convert_reads.py -b $deduped
"""
}
......@@ -377,15 +362,13 @@ process crossReads {
if (pairedEnd) {
"""
module load python/3.6.1-2-anaconda
module load phantompeakqualtools/1.2
source /chipseq/entrypoint.sh
python3 $baseDir/scripts/xcor.py -t $seTagAlign -p
"""
}
else {
"""
module load python/3.6.1-2-anaconda
module load phantompeakqualtools/1.2
source /chipseq/entrypoint.sh
python3 $baseDir/scripts/xcor.py -t $seTagAlign
"""
}
......@@ -414,7 +397,7 @@ process defineExpDesignFiles {
script:
"""
module load python/3.6.1-2-anaconda
source /python361/entrypoint.sh
python3 $baseDir/scripts/experiment_design.py -d $xcorDesign
"""
......@@ -440,13 +423,13 @@ process poolAndPsuedoReads {
if (pairedEnd) {
"""
module load python/3.6.1-2-anaconda
source /python361/entrypoint.sh
python3 $baseDir/scripts/pool_and_psuedoreplicate.py -d $experimentObjs -c $cutoffRatio -p
"""
}
else {
"""
module load python/3.6.1-2-anaconda
source /python361/entrypoint.sh
python3 $baseDir/scripts/pool_and_psuedoreplicate.py -d $experimentObjs -c $cutoffRatio
"""
}
......@@ -478,21 +461,13 @@ process callPeaksMACS {
if (pairedEnd) {
"""
module load python/3.6.1-2-anaconda
module load macs/2.1.0-20151222
module load UCSC_userApps/v317
module load bedtools/2.26.0
module load phantompeakqualtools/1.2
source /chipseq/entrypoint.sh
python3 $baseDir/scripts/call_peaks_macs.py -t $tagAlign -x $xcor -c $controlTagAlign -s $sampleId -g $genomeSize -z $chromSizes -p
"""
}
else {
"""
module load python/3.6.1-2-anaconda
module load macs/2.1.0-20151222
module load UCSC_userApps/v317
module load bedtools/2.26.0
module load phantompeakqualtools/1.2
source /chipseq/entrypoint.sh
python3 $baseDir/scripts/call_peaks_macs.py -t $tagAlign -x $xcor -c $controlTagAlign -s $sampleId -g $genomeSize -z $chromSizes
"""
}
......@@ -523,8 +498,8 @@ process plotProfile {
script:
"""
module load deeptools/2.5.0.1
bash $baseDir/scripts/plot_profile.sh -g $gtfFile
source /chipseq/entrypoint.sh
/bin/bash $baseDir/scripts/plot_profile.sh -g $gtfFile
"""
}
......@@ -551,8 +526,7 @@ process consensusPeaks {
script:
"""
module load python/3.6.1-2-anaconda
module load bedtools/2.26.0
source /chipseq/entrypoint.sh
python3 $baseDir/scripts/overlap_peaks.py -d $peaksDesign -f $preDiffDesign
"""
......@@ -575,7 +549,7 @@ process peakAnnotation {
script:
"""
module load R/3.3.2-gccmkl
source /r-332/entrypoint.sh
Rscript $baseDir/scripts/annotate_peaks.R $designAnnotatePeaks $gtfFile $geneNames
"""
......@@ -603,10 +577,8 @@ process motifSearch {
script:
"""
module load python/3.6.1-2-anaconda
module load meme/4.11.1-gcc-openmpi
module load bedtools/2.26.0
python3 $baseDir/scripts/motif_search.py -d $designMotifSearch -g $fasta -p $topPeakCount
source /motif-search/entrypoint.sh
python $baseDir/scripts/motif_search.py -d $designMotifSearch -g $fasta -p $topPeakCount
"""
}
......@@ -639,13 +611,12 @@ process diffPeaks {
script:
"""
module load python/3.6.1-2-anaconda
module load R/3.3.2-gccmkl
source /r-332/entrypoint.sh
Rscript $baseDir/scripts/diff_peaks.R $designDiffPeaks
"""
}
// Generate Multiqc Report, gerernate Software Versions and references
// Generate Multiqc Report, generate software versions and references
process multiqcReport {
publishDir "$outDir/${task.process}", mode: 'copy'
......@@ -659,8 +630,8 @@ process multiqcReport {
file ('callPeaksMACS_vf/*') from callPeaksMACSVersions.first()
file ('consensusPeaks_vf/*') from consensusPeaksVersions.first()
file ('peakAnnotation_vf/*') from peakAnnotationVersions.first()
file ('motifSearch_vf/*') from motifSearchVersions.first().ifEmpty()
file ('diffPeaks_vf/*') from diffPeaksVersions.first().ifEmpty()
file ('motifSearch_vf/*') from motifSearchVersions.first().ifEmpty("No Meme-chip is found")
file ('diffPeaks_vf/*') from diffPeaksVersions.first().ifEmpty("No Rstudio is found")
file ('experimentQC_vf/*') from experimentQCVersions.first()
file ('trimReads/*') from trimgaloreResults.collect()
file ('alignReads/*') from mappedReadsStats.collect()
......@@ -677,14 +648,12 @@ process multiqcReport {
script:
"""
module load python/3.6.1-2-anaconda
module load pandoc/2.7
module load singularity/3.0.2
source /chipseq/entrypoint.sh
echo $workflow.nextflow.version > version_nextflow.txt
singularity exec /project/shared/bicf_workflow_ref/singularity_images/bicf-multiqc-2.0.0.img multiqc --version > version_multiqc.txt
multiqc --version > version_multiqc.txt
python --version &> version_python.txt
python3 $baseDir/scripts/generate_references.py -r $references -o software_references
python3 $baseDir/scripts/generate_versions.py -o software_versions
singularity exec /project/shared/bicf_workflow_ref/singularity_images/bicf-multiqc-2.0.0.img multiqc -c $multiqc .
multiqc -c $multiqc .
"""
}
}
\ No newline at end of file
profiles {
standard {
includeConfig 'conf/biohpc.config'
}
}
trace {
enabled = true
file = 'pipeline_trace.txt'
fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss'
}
timeline {
enabled = true
file = 'timeline.html'
}
report {
enabled = true
file = 'report.html'
}
manifest {
name = 'chipseq_analysis'
description = 'BICF ChIP-seq Analysis Workflow.'
homePage = 'https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis'
version = '1.1.2'
mainScript = 'main.nf'
nextflowVersion = '>=0.31.0'
}
......@@ -106,9 +106,16 @@ def convert_mapped_pe(bam, bam_basename):
# Name sort bam to make BEDPE
nmsrt_bam_filename = bam_basename + ".nmsrt.bam"
# Allocate memory for each thread used during samtools sort.
# Determine the available memory on the host system (kB).
avail_sys_mem = int(subprocess.check_output("grep MemTotal /proc/meminfo | grep -o '[0-9]*'", shell=True).decode("utf-8").strip())
# Divide the total by the number of processors and apply a scaling factor (0.85) to not use all memory.
mem_per_thread = math.floor((avail_sys_mem ) * 0.7)
# DEBUGGING
mem_per_thread = mem_per_thread * 4
samtools_sort_command = \
"samtools sort -n -@%d -o %s %s" \
% (cpu_count(), nmsrt_bam_filename, bam)
"samtools sort -m %d -n -@%d -o %s %s" \
% (mem_per_thread, cpu_count(), nmsrt_bam_filename, bam)
logger.info(samtools_sort_command)
subprocess.check_output(shlex.split(samtools_sort_command))
......
......@@ -14,6 +14,7 @@ import argparse
import shutil
import shlex
import logging
import math
from multiprocessing import cpu_count
import utils
......@@ -133,12 +134,18 @@ def align_se(fastq, sai, reference, fastq_basename):
bam_filename = '%s.bam' % (fastq_basename)
# Allocate memory for each thread used during samtools sort.
# Determine the available memory on the host system (kB).
avail_sys_mem = int(subprocess.check_output("grep MemTotal /proc/meminfo | grep -o '[0-9]*'", shell=True).decode("utf-8").strip())
# Divide the total by the number of processors and apply a scaling factor (0.85) to not use all memory.
mem_per_thread = math.floor((avail_sys_mem) * 0.7)
steps = [
"bwa samse %s %s %s"
% (reference, sai[0], fastq[0]),
"samtools view -@%d -Su -" % (cpu_count()),
"samtools sort -@%d -o %s"
% (cpu_count(), bam_filename)]
"samtools sort -m %d -@%d -o %s"
% (mem_per_thread, cpu_count(), bam_filename)]
out, err = utils.run_pipe(steps)
if err:
......@@ -168,12 +175,19 @@ def align_pe(fastq, sai, reference, fastq_basename):
if err:
logger.error("sampe error: %s", err)
# Allocate memory for each thread used during samtools sort.
# Determine the available memory on the host system (kB).
avail_sys_mem = int(subprocess.check_output("grep MemTotal /proc/meminfo | grep -o '[0-9]*'", shell=True).decode("utf-8").strip())
# Divide the total by the number of processors and apply a scaling factor (0.85) to not use all memory.
mem_per_thread = math.floor((avail_sys_mem // cpu_count()) * 0.85)
# DEBUGGING
mem_per_thread = mem_per_thread * 4
steps = [
"cat %s" % (sam_filename),
"grep -v -F -f %s" % (badcigar_filename),
"samtools view -@%d -Su -" % (cpu_count()),
"samtools sort -@%d -o %s"
% (cpu_count(), bam_filename)]
"samtools sort -m %d -@%d -o %s"
% (mem_per_thread, cpu_count(), bam_filename)]
out, err = utils.run_pipe(steps)
if err:
......
#!/usr/bin/env python3
#!/usr/bin/env python2
#
# * --------------------------------------------------------------------------
......@@ -15,7 +15,7 @@ import shutil
import subprocess
from multiprocessing import Pool
import pandas as pd
import utils
import utils2 as utils
EPILOG = '''
......@@ -67,8 +67,10 @@ def check_tools():
'''Checks for required componenets on user system'''
logger.info('Checking for required libraries and components on this system')
meme_path = shutil.which("meme")
stream = os.popen('which meme')
output = stream.read()
meme_path = output.strip()
#meme_path = shutil.which("meme")
if meme_path:
logger.info('Found meme: %s', meme_path)
......@@ -83,8 +85,10 @@ def check_tools():
else:
logger.error('Missing meme')
raise Exception('Missing meme')
bedtools_path = shutil.which("bedtools")
stream = os.popen('which bedtools')
output = stream.read()
bedtools_path = output.strip()
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
......@@ -135,8 +139,10 @@ def motif_search(filename, genome, experiment, peak):
logger.error("bedtools error: %s", err)
# Call memechip
#out, err = utils.run_pipe([
# 'meme-chip -oc %s -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 %s -norand' % (out_motif, out_fa)])
out, err = utils.run_pipe([
'meme-chip -oc %s -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 %s -norand' % (out_motif, out_fa)])
'meme-chip -oc %s -minw 5 -maxw 15 -meme-nmotifs 10 %s -meme-norand ' % (out_motif, out_fa)])
if err:
logger.error("meme-chip error: %s", err)
......
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Call Motifs on called peaks.'''
import os
import argparse
import logging
import shutil
import subprocess
from multiprocessing import Pool
import pandas as pd
import utils
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.narrowPeak', '.replicated']
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-d', '--design',
help="The design file to run motif search.",
required=True)
parser.add_argument('-g', '--genome',
help="The genome FASTA file.",
required=True)
parser.add_argument('-p', '--peak',
help="The number of peaks to use.",
required=True)
args = parser.parse_args()
return args
# Functions
def check_tools():
'''Checks for required componenets on user system'''
logger.info('Checking for required libraries and components on this system')
meme_path = shutil.which("meme")
if meme_path:
logger.info('Found meme: %s', meme_path)
# Get Version
memechip_version_command = "meme-chip --version"
memechip_version = subprocess.check_output(memechip_version_command, shell=True)
# Write to file
meme_file = open("version_memechip.txt", "wb")
meme_file.write(b"Version %s" % (memechip_version))
meme_file.close()
else:
logger.error('Missing meme')
raise Exception('Missing meme')
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
def run_wrapper(args):
motif_search(*args)
def motif_search(filename, genome, experiment, peak):
'''Run motif serach on peaks.'''
file_basename = os.path.basename(
utils.strip_extensions(filename, STRIP_EXTENSIONS))
out_fa = '%s.fa' % (experiment)
out_motif = '%s_memechip' % (experiment)
# Sort Bed file and limit number of peaks
if peak == -1:
peak = utils.count_lines(filename)
peak_no = 'all'
else:
peak_no = peak
sorted_fn = '%s.%s.narrowPeak' % (file_basename, peak_no)
out, err = utils.run_pipe([
'sort -k %dgr,%dgr %s' % (5, 5, filename),
'head -n %s' % (peak)], outfile=sorted_fn)
# Get fasta file
out, err = utils.run_pipe([
'bedtools getfasta -fi %s -bed %s -fo %s' % (genome, sorted_fn, out_fa)])
if err:
logger.error("bedtools error: %s", err)
# Call memechip
out, err = utils.run_pipe([
'meme-chip -oc %s -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 %s -norand' % (out_motif, out_fa)])
if err:
logger.error("meme-chip error: %s", err)
def main():
args = get_args()
design = args.design
genome = args.genome
peak = args.peak
# Create a file handler
handler = logging.FileHandler('motif.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Read files
design_df = pd.read_csv(design, sep='\t')
meme_arglist = zip(design_df['Peaks'].tolist(), [genome]*design_df.shape[0], design_df['Condition'].tolist(), [peak]*design_df.shape[0])
work_pool = Pool(min(12, design_df.shape[0]))
return_list = work_pool.map(run_wrapper, meme_arglist)
work_pool.close()
work_pool.join()
if __name__ == '__main__':
main()
#!/usr/bin/env python2
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''General utilities.'''
import shlex
import logging
import subprocess
import sys
import os
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = True
def run_pipe(steps, outfile=None):
from subprocess import Popen, PIPE
p = None
p_next = None
first_step_n = 1
last_step_n = len(steps)
for n, step in enumerate(steps, start=first_step_n):
logger.debug("step %d: %s" % (n, step))
if n == first_step_n:
if n == last_step_n and outfile: # one-step pipeline with outfile
with open(outfile, 'w') as fh:
print("one step shlex: %s to file: %s" % (shlex.split(step), outfile))
p = Popen(shlex.split(step), stdout=fh)
break
print("first step shlex to stdout: %s" % (shlex.split(step)))
p = Popen(shlex.split(step), stdout=PIPE)
elif n == last_step_n and outfile: # only treat the last step specially if you're sending stdout to a file
with open(outfile, 'w') as fh:
print("last step shlex: %s to file: %s" % (shlex.split(step), outfile))
p_last = Popen(shlex.split(step), stdin=p.stdout, stdout=fh)
p.stdout.close()
p = p_last
else: # handles intermediate steps and, in the case of a pipe to stdout, the last step
print("intermediate step %d shlex to stdout: %s" % (n, shlex.split(step)))
p_next = Popen(shlex.split(step), stdin=p.stdout, stdout=PIPE)
p.stdout.close()
p = p_next
out, err = p.communicate()
return out, err
def block_on(command):
process = subprocess.Popen(shlex.split(command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
for line in iter(process.stdout.readline, b''):
sys.stdout.write(line.decode('utf-8'))
process.communicate()
return process.returncode
def strip_extensions(filename, extensions):
'''Strips extensions to get basename of file.'''
basename = filename
for extension in extensions:
basename = basename.rpartition(extension)[0] or basename
return basename
def count_lines(filename):
import mimetypes
compressed_mimetypes = [
"compress",
"bzip2",
"gzip"
]
mime_type = mimetypes.guess_type(filename)[1]
if mime_type in compressed_mimetypes:
catcommand = 'gzip -dc'
else:
catcommand = 'cat'
out, err = run_pipe([
'%s %s' % (catcommand, filename),
'wc -l'
])
return int(out)
def rescale_scores(filename, scores_col, new_min=10, new_max=1000):
sorted_fn = 'sorted-%s' % (filename)
rescaled_fn = 'rescaled-%s' % (filename)
out, err = run_pipe([
'sort -k %dgr,%dgr %s' % (scores_col, scores_col, filename),
r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (NF != 0) print $0}'"""],
sorted_fn)
out, err = run_pipe([
'head -n 1 %s' % (sorted_fn),
'cut -f %s' % (scores_col)])
max_score = float(out.strip())
logger.info("rescale_scores: max_score = %s", max_score)
out, err = run_pipe([
'tail -n 1 %s' % (sorted_fn),
'cut -f %s' % (scores_col)])
min_score = float(out.strip())
logger.info("rescale_scores: min_score = %s", min_score)
a = min_score
b = max_score
x = new_min
y = new_max
if min_score == max_score: # give all peaks new_min
rescale_formula = "x"
else: # n is the unscaled score from scores_col
rescale_formula = "((n-a)*(y-x)/(b-a))+x"
out, err = run_pipe(
[
'cat %s' % (sorted_fn),
r"""awk 'BEGIN{OFS="\t"}{n=$%d;a=%d;b=%d;x=%d;y=%d}"""
% (scores_col, a, b, x, y) +
r"""{$%d=int(%s) ; print $0}'"""
% (scores_col, rescale_formula)
],
rescaled_fn)
os.remove(sorted_fn)
return rescaled_fn