Skip to content
Snippets Groups Projects
Commit 14299993 authored by Venkat Malladi's avatar Venkat Malladi
Browse files

Add in maping step with apporpriate configurations.

parent 38f026a1
Branches
Tags
1 merge request!5Resolve "Add mapping and trimming"
Pipeline #1035 canceled with stage
......@@ -9,7 +9,7 @@
# A unique identifier for the workflow package, text/underscores only
name: 'chipseq_analysis_bicf'
# Who wrote this?
author: 'Beibei Chen'
author: 'Beibei Chen and Venkat Malladi'
# A contact email address for questions
email: 'biohpc-help@utsouthwestern.edu'
# A more informative title for the workflow package
......@@ -17,8 +17,7 @@ title: 'BICF ChIP-seq Analysis Workflow'
# A summary of the workflow package in plain text
description: |
This is a workflow package for the BioHPC/BICF ChIP-seq workflow system.
It implements a simple ChIP-seq analysis workflow and
visualization application.
It implements ChIP-seq analysis workflow and visualization application.
# -----------------------------------------------------------------------------
# DOCUMENTATION
......@@ -42,9 +41,9 @@ documentation_files:
workflow_modules:
- 'python/3.6.1-2-anaconda'
- 'trimgalore/0.4.1'
- 'fastqc/0.11.5'
- 'deeptools/2.3.5'
- 'meme/4.11.1-gcc-openmpi'
- 'cutadapt/1.9.1'
- 'fastqc/0.11.2'
- 'bwa/intel/0.7.15'
# A list of parameters used by the workflow, defining how to present them,
# options etc in the web interface. For each parameter:
......@@ -84,10 +83,11 @@ workflow_parameters:
type: files
required: true
description: |
Fastq files of all samples
regex: "*_{1,2}.fastq.gz"
One or more input FASTQ files from a ChIP-seq expereiment and a design
file with the link bewetwen the same file name and sample id
regex: ".*(fastq|fq)*"
- id: singleEnd
- id: pairedEnd
type: select
required: true
choices:
......@@ -100,6 +100,24 @@ workflow_parameters:
read length, and then starts another round of reading from the opposite
end of the fragment.
- id: design
type: file
required: true
description: |
A design file listing sample id, fastq files, corresponding control id
and additional information about the sample.
- id: genome
type: select
choices:
- [ 'GRCh38', 'Human GRCh38']
- [ 'GRCh37', 'Human GRCh37']
- [ 'GRCh38', 'Mouse GRCh38']
required: true
description: |
Reference species and genome used for alignment and subsequent analysis.
# -----------------------------------------------------------------------------
# SHINY APP CONFIGURATION
# -----------------------------------------------------------------------------
......
......@@ -10,4 +10,18 @@ process {
module = ['python/3.6.1-2-anaconda', 'trimgalore/0.4.1']
cpus = 32
}
$alignReads{
module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.15', 'samtools/intel/1.3']
cpus = 32
memory = 12.GB
}
}
params {
// Reference file paths on BioHPC
genomes {
'GRCh38' { bwa = '/project/shared/bicf_workflow_ref/GRCh38' }
'GRCh37' { bwa = '/project/shared/bicf_workflow_ref/GRCh37' }
'GRCm38' { bwa = '/project/shared/bicf_workflow_ref/GRCm38' }
}
}
......@@ -7,6 +7,18 @@
params.reads = "$baseDir/../test_data/*.fastq.gz"
params.pairedEnd = false
params.designFile = "$baseDir/../test_data/design_ENCSR238SGC_SE.txt"
params.genome = 'GRCh38'
params.genomes = []
params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false
// Check inputs
if( params.bwaIndex ){
bwaIndex = Channel
.fromPath(params.bwaIndex)
.ifEmpty { exit 1, "BWA index not found: ${params.bwaIndex}" }
} else {
exit 1, "No reference genome specified."
}
// Define List of Files
readsList = Channel
......@@ -55,14 +67,14 @@ if (pairedEnd) {
} else {
rawReads = designFilePaths
.splitCsv(sep: '\t', header: true)
.map { row -> [ row.sample_id, [row.fastq_read1, row.fastq_read1], row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] }
.map { row -> [ row.sample_id, [row.fastq_read1], row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] }
}
// Trim raw reads using trimgalore
process trimReads {
tag "$sampleId-$replicate"
publishDir "$baseDir/output/{task.process}/$sampleId-$replicate/", mode: 'copy'
publishDir "$baseDir/output/${task.process}", mode: 'copy'
input:
......@@ -77,12 +89,42 @@ process trimReads {
if (pairedEnd) {
"""
python $baseDir/scripts/trim_reads.py -f $reads -p
python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -p
"""
}
else {
"""
python3 $baseDir/scripts/trim_reads.py -f ${reads[0]}
"""
}
}
// Align trimmed reads using bwa
process alignReads {
tag "$sampleId-$replicate"
publishDir "$baseDir/output/{task.process}", mode: 'copy'
input:
set sampleId, reads, biosample, factor, treatment, replicate, controlId from trimmedReads
file index from bwaIndex.first()
output:
set sampleId, file('*.bam'), biosample, factor, treatment, replicate, controlId into mappedReads
script:
if (pairedEnd) {
"""
python $baseDir/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -p
"""
}
else {
"""
python $baseDir/scripts/check_design.py -f $reads
python $baseDir/scripts/map_reads.py -f ${reads[0]} -r ${index}/genome.fa
"""
}
......
#!/usr/bin/env python3
'''Align reads to reference genome.'''
import os
import subprocess
import argparse
import shutil
import logging
import sys
from multiprocessing import cpu_count
import json
import utils
EPILOG = '''
For more details:
%(prog)s --help
'''
## SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.gz', '.fq', '.fastq', '_trimmed']
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-f', '--fastq',
help="The fastq file to run triming on.",
nargs='+',
required=True)
parser.add_argument('-r', '--reference',
help="The bwa index of the reference genome.",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
action='store_true')
args = parser.parse_args()
return args
## Functions
def strip_extensions(filename, extensions):
'''Strips extensions to get basename of file.'''
basename = filename
for extension in extensions:
basename = basename.rpartition(extension)[0] or basename
return basename
def check_tools():
'''Checks for required componenets on user system'''
logger.info('Checking for required libraries and components on this system')
bwa_path = shutil.which("bwa")
if trimgalore_path:
logger.info('Found bwa: %s', bwa_path)
else:
logger.error('Missing bwa')
raise Exception('Missing bwa')
samtools_path = shutil.which("samtools")
if samtools_path:
logger.info('Found samtools: %s', samtools_path)
else:
logger.error('Missing samtools')
raise Exception('Missing samtools')
def generate_sa(fastq, reference):
'''Use BWA to generate Suffix Arrays.'''
fastq_basename = os.path.basename(strip_extensions(fastq, STRIP_EXTENSIONS))
bwa_aln_params = '-q 5 -l 32 -k 2'
sai = '%s.sai' % (fastq_basename)
with open(sai, 'w') as sai_file:
bwa_command = "bwa aln %s -t %d %s %s" \
% (bwa_aln_params, cpu_count(),
reference, fastq_basename)
logger.info("Running bwa with %s", bwa_command)
subprocess.check_call(shlex.split(bwa_command), stdout=sai_file)
return sai
def align_se(fastq, sai, reference, fastq_basename):
'''Use BWA to align SE data.'''
sam_filename = "%s.sam" % (fastq_basename)
steps = [
"bwa samse %s %s %s"
% (reference, fastq[0], sai[0]),
"samtools view -@%d -Su -" % (cpu_count()),
"samtools sort -@%d -o %s"
% (cpu_count(), fastq_basename)]
out, err = utils.run_pipe(steps)
if err:
logger.error("samse/samtools error: %s" % (err))
def align_pe(fastq, sai, reference, fastq_basename):
'''Use BWA to align PE data.'''
sam_filename = "%s.sam" % (fastq_basename)
badcigar_filename = "%s.badReads" % (fastq_basename)
# Remove read pairs with bad CIGAR strings and sort by position
steps = [
"bwa sampe -P %s %s %s %s %s"
% (reference, fastq[0], fastq[1],
sai[0], sai[1]),
"tee %s" % (sam_filename),
r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
"sort",
"uniq"]
out, err = utils.run_pipe(steps, badcigar_filename)
if err:
logger.error("sampe error: %s" % (err))
steps = [
"cat %s" % (sam_filename),
"grep -v -F -f %s" % (badcigar_filename),
"samtools view -@%d -Su -" % (cpu_count()),
"samtools sort -@%d -o %s"
% (cpu_count(), fastq_basename)]
out, err = utils.run_pipe(steps)
if err:
logger.error("samtools error: %s" % (err))
def main():
args = get_args()
# Create a file handler
handler = logging.FileHandler('map.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Run Suffix Array generation
sai = []
for fastq in args.fastq:
sai_filename = generate_sa(fastq, args.reference)
sai.append(sai_filename)
# Run alignment for either PE or SE
if paired: # paired-end data
fastq_r1_basename = os.path.basename(
strip_extensions(fastq[0], STRIP_EXTENSIONS))
fastq_r2_basename = os.path.basename(
strip_extensions(fastq[1], STRIP_EXTENSIONS))
fastq_basename = fastq_r1_basename + fastq_r2_basename
align_pe(fastq, sai, fastq_basename)
else:
fastq_basename = os.path.basename(
strip_extensions(fastq[0], STRIP_EXTENSIONS))
align_se(fastq, sai, fastq_basename)
bam_mapstats_filename = '%s.raw.srt.bam.flagstat.qc' % (fastq_basename)
with open(raw_bam_mapstats_filename, 'w') as fh:
subprocess.check_call(
shlex.split("%s flagstat %s" % (samtools, bam_mapstats_filename)),
stdout=fh)
# Remove sai files
for sai_file in sai:
os.remove(sai_file)
if __name__ == '__main__':
main()
#!/usr/bin/env python3
'''General utilities.'''
import sys
import os
import subprocess
import shlex
import logging
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = True
def run_pipe(steps, outfile=None):
# TODO: capture stderr
from subprocess import Popen, PIPE
p = None
p_next = None
first_step_n = 1
last_step_n = len(steps)
for n, step in enumerate(steps, start=first_step_n):
logger.debug("step %d: %s" % (n, step))
if n == first_step_n:
if n == last_step_n and outfile: # one-step pipeline with outfile
with open(outfile, 'w') as fh:
print("one step shlex: %s to file: %s" % (shlex.split(step), outfile))
p = Popen(shlex.split(step), stdout=fh)
break
print("first step shlex to stdout: %s" % (shlex.split(step)))
p = Popen(shlex.split(step), stdout=PIPE)
elif n == last_step_n and outfile: # only treat the last step specially if you're sending stdout to a file
with open(outfile, 'w') as fh:
print("last step shlex: %s to file: %s" % (shlex.split(step), outfile))
p_last = Popen(shlex.split(step), stdin=p.stdout, stdout=fh)
p.stdout.close()
p = p_last
else: # handles intermediate steps and, in the case of a pipe to stdout, the last step
print("intermediate step %d shlex to stdout: %s" % (n, shlex.split(step)))
p_next = Popen(shlex.split(step), stdin=p.stdout, stdout=PIPE)
p.stdout.close()
p = p_next
out, err = p.communicate()
return out, err
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment