Skip to content
Snippets Groups Projects
Commit 14299993 authored by Venkat Malladi's avatar Venkat Malladi
Browse files

Add in maping step with apporpriate configurations.

parent 38f026a1
Branches
Tags
No related merge requests found
......@@ -9,7 +9,7 @@
# A unique identifier for the workflow package, text/underscores only
name: 'chipseq_analysis_bicf'
# Who wrote this?
author: 'Beibei Chen'
author: 'Beibei Chen and Venkat Malladi'
# A contact email address for questions
email: 'biohpc-help@utsouthwestern.edu'
# A more informative title for the workflow package
......@@ -17,8 +17,7 @@ title: 'BICF ChIP-seq Analysis Workflow'
# A summary of the workflow package in plain text
description: |
This is a workflow package for the BioHPC/BICF ChIP-seq workflow system.
It implements a simple ChIP-seq analysis workflow and
visualization application.
It implements ChIP-seq analysis workflow and visualization application.
# -----------------------------------------------------------------------------
# DOCUMENTATION
......@@ -42,9 +41,9 @@ documentation_files:
workflow_modules:
- 'python/3.6.1-2-anaconda'
- 'trimgalore/0.4.1'
- 'fastqc/0.11.5'
- 'deeptools/2.3.5'
- 'meme/4.11.1-gcc-openmpi'
- 'cutadapt/1.9.1'
- 'fastqc/0.11.2'
- 'bwa/intel/0.7.15'
# A list of parameters used by the workflow, defining how to present them,
# options etc in the web interface. For each parameter:
......@@ -84,10 +83,11 @@ workflow_parameters:
type: files
required: true
description: |
Fastq files of all samples
regex: "*_{1,2}.fastq.gz"
One or more input FASTQ files from a ChIP-seq expereiment and a design
file with the link bewetwen the same file name and sample id
regex: ".*(fastq|fq)*"
- id: singleEnd
- id: pairedEnd
type: select
required: true
choices:
......@@ -100,6 +100,24 @@ workflow_parameters:
read length, and then starts another round of reading from the opposite
end of the fragment.
- id: design
type: file
required: true
description: |
A design file listing sample id, fastq files, corresponding control id
and additional information about the sample.
- id: genome
type: select
choices:
- [ 'GRCh38', 'Human GRCh38']
- [ 'GRCh37', 'Human GRCh37']
- [ 'GRCh38', 'Mouse GRCh38']
required: true
description: |
Reference species and genome used for alignment and subsequent analysis.
# -----------------------------------------------------------------------------
# SHINY APP CONFIGURATION
# -----------------------------------------------------------------------------
......
......@@ -10,4 +10,18 @@ process {
module = ['python/3.6.1-2-anaconda', 'trimgalore/0.4.1']
cpus = 32
}
$alignReads{
module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.15', 'samtools/intel/1.3']
cpus = 32
memory = 12.GB
}
}
params {
// Reference file paths on BioHPC
genomes {
'GRCh38' { bwa = '/project/shared/bicf_workflow_ref/GRCh38' }
'GRCh37' { bwa = '/project/shared/bicf_workflow_ref/GRCh37' }
'GRCm38' { bwa = '/project/shared/bicf_workflow_ref/GRCm38' }
}
}
......@@ -7,6 +7,18 @@
params.reads = "$baseDir/../test_data/*.fastq.gz"
params.pairedEnd = false
params.designFile = "$baseDir/../test_data/design_ENCSR238SGC_SE.txt"
params.genome = 'GRCh38'
params.genomes = []
params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false
// Check inputs
if( params.bwaIndex ){
bwaIndex = Channel
.fromPath(params.bwaIndex)
.ifEmpty { exit 1, "BWA index not found: ${params.bwaIndex}" }
} else {
exit 1, "No reference genome specified."
}
// Define List of Files
readsList = Channel
......@@ -55,14 +67,14 @@ if (pairedEnd) {
} else {
rawReads = designFilePaths
.splitCsv(sep: '\t', header: true)
.map { row -> [ row.sample_id, [row.fastq_read1, row.fastq_read1], row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] }
.map { row -> [ row.sample_id, [row.fastq_read1], row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] }
}
// Trim raw reads using trimgalore
process trimReads {
tag "$sampleId-$replicate"
publishDir "$baseDir/output/{task.process}/$sampleId-$replicate/", mode: 'copy'
publishDir "$baseDir/output/${task.process}", mode: 'copy'
input:
......@@ -77,12 +89,42 @@ process trimReads {
if (pairedEnd) {
"""
python $baseDir/scripts/trim_reads.py -f $reads -p
python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -p
"""
}
else {
"""
python3 $baseDir/scripts/trim_reads.py -f ${reads[0]}
"""
}
}
// Align trimmed reads using bwa
process alignReads {
tag "$sampleId-$replicate"
publishDir "$baseDir/output/{task.process}", mode: 'copy'
input:
set sampleId, reads, biosample, factor, treatment, replicate, controlId from trimmedReads
file index from bwaIndex.first()
output:
set sampleId, file('*.bam'), biosample, factor, treatment, replicate, controlId into mappedReads
script:
if (pairedEnd) {
"""
python $baseDir/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -p
"""
}
else {
"""
python $baseDir/scripts/check_design.py -f $reads
python $baseDir/scripts/map_reads.py -f ${reads[0]} -r ${index}/genome.fa
"""
}
......
#!/usr/bin/env python3
'''Align reads to reference genome.'''
import os
import subprocess
import argparse
import shutil
import logging
import sys
from multiprocessing import cpu_count
import json
import utils
EPILOG = '''
For more details:
%(prog)s --help
'''
## SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.gz', '.fq', '.fastq', '_trimmed']
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-f', '--fastq',
help="The fastq file to run triming on.",
nargs='+',
required=True)
parser.add_argument('-r', '--reference',
help="The bwa index of the reference genome.",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
action='store_true')
args = parser.parse_args()
return args
## Functions
def strip_extensions(filename, extensions):
'''Strips extensions to get basename of file.'''
basename = filename
for extension in extensions:
basename = basename.rpartition(extension)[0] or basename
return basename
def check_tools():
'''Checks for required componenets on user system'''
logger.info('Checking for required libraries and components on this system')
bwa_path = shutil.which("bwa")
if trimgalore_path:
logger.info('Found bwa: %s', bwa_path)
else:
logger.error('Missing bwa')
raise Exception('Missing bwa')
samtools_path = shutil.which("samtools")
if samtools_path:
logger.info('Found samtools: %s', samtools_path)
else:
logger.error('Missing samtools')
raise Exception('Missing samtools')
def generate_sa(fastq, reference):
'''Use BWA to generate Suffix Arrays.'''
fastq_basename = os.path.basename(strip_extensions(fastq, STRIP_EXTENSIONS))
bwa_aln_params = '-q 5 -l 32 -k 2'
sai = '%s.sai' % (fastq_basename)
with open(sai, 'w') as sai_file:
bwa_command = "bwa aln %s -t %d %s %s" \
% (bwa_aln_params, cpu_count(),
reference, fastq_basename)
logger.info("Running bwa with %s", bwa_command)
subprocess.check_call(shlex.split(bwa_command), stdout=sai_file)
return sai
def align_se(fastq, sai, reference, fastq_basename):
'''Use BWA to align SE data.'''
sam_filename = "%s.sam" % (fastq_basename)
steps = [
"bwa samse %s %s %s"
% (reference, fastq[0], sai[0]),
"samtools view -@%d -Su -" % (cpu_count()),
"samtools sort -@%d -o %s"
% (cpu_count(), fastq_basename)]
out, err = utils.run_pipe(steps)
if err:
logger.error("samse/samtools error: %s" % (err))
def align_pe(fastq, sai, reference, fastq_basename):
'''Use BWA to align PE data.'''
sam_filename = "%s.sam" % (fastq_basename)
badcigar_filename = "%s.badReads" % (fastq_basename)
# Remove read pairs with bad CIGAR strings and sort by position
steps = [
"bwa sampe -P %s %s %s %s %s"
% (reference, fastq[0], fastq[1],
sai[0], sai[1]),
"tee %s" % (sam_filename),
r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
"sort",
"uniq"]
out, err = utils.run_pipe(steps, badcigar_filename)
if err:
logger.error("sampe error: %s" % (err))
steps = [
"cat %s" % (sam_filename),
"grep -v -F -f %s" % (badcigar_filename),
"samtools view -@%d -Su -" % (cpu_count()),
"samtools sort -@%d -o %s"
% (cpu_count(), fastq_basename)]
out, err = utils.run_pipe(steps)
if err:
logger.error("samtools error: %s" % (err))
def main():
args = get_args()
# Create a file handler
handler = logging.FileHandler('map.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Run Suffix Array generation
sai = []
for fastq in args.fastq:
sai_filename = generate_sa(fastq, args.reference)
sai.append(sai_filename)
# Run alignment for either PE or SE
if paired: # paired-end data
fastq_r1_basename = os.path.basename(
strip_extensions(fastq[0], STRIP_EXTENSIONS))
fastq_r2_basename = os.path.basename(
strip_extensions(fastq[1], STRIP_EXTENSIONS))
fastq_basename = fastq_r1_basename + fastq_r2_basename
align_pe(fastq, sai, fastq_basename)
else:
fastq_basename = os.path.basename(
strip_extensions(fastq[0], STRIP_EXTENSIONS))
align_se(fastq, sai, fastq_basename)
bam_mapstats_filename = '%s.raw.srt.bam.flagstat.qc' % (fastq_basename)
with open(raw_bam_mapstats_filename, 'w') as fh:
subprocess.check_call(
shlex.split("%s flagstat %s" % (samtools, bam_mapstats_filename)),
stdout=fh)
# Remove sai files
for sai_file in sai:
os.remove(sai_file)
if __name__ == '__main__':
main()
#!/usr/bin/env python3
'''General utilities.'''
import sys
import os
import subprocess
import shlex
import logging
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = True
def run_pipe(steps, outfile=None):
# TODO: capture stderr
from subprocess import Popen, PIPE
p = None
p_next = None
first_step_n = 1
last_step_n = len(steps)
for n, step in enumerate(steps, start=first_step_n):
logger.debug("step %d: %s" % (n, step))
if n == first_step_n:
if n == last_step_n and outfile: # one-step pipeline with outfile
with open(outfile, 'w') as fh:
print("one step shlex: %s to file: %s" % (shlex.split(step), outfile))
p = Popen(shlex.split(step), stdout=fh)
break
print("first step shlex to stdout: %s" % (shlex.split(step)))
p = Popen(shlex.split(step), stdout=PIPE)
elif n == last_step_n and outfile: # only treat the last step specially if you're sending stdout to a file
with open(outfile, 'w') as fh:
print("last step shlex: %s to file: %s" % (shlex.split(step), outfile))
p_last = Popen(shlex.split(step), stdin=p.stdout, stdout=fh)
p.stdout.close()
p = p_last
else: # handles intermediate steps and, in the case of a pipe to stdout, the last step
print("intermediate step %d shlex to stdout: %s" % (n, shlex.split(step)))
p_next = Popen(shlex.split(step), stdin=p.stdout, stdout=PIPE)
p.stdout.close()
p = p_next
out, err = p.communicate()
return out, err
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment