Skip to content
Snippets Groups Projects
Commit 4357eb15 authored by Venkat Malladi's avatar Venkat Malladi
Browse files

Merge branch '3-mapping-trimming' into 'master'

Resolve "Add mapping and trimming"

Closes #3

See merge request !5
parents c9f0a2c4 034d2f58
Branches
Tags
1 merge request!5Resolve "Add mapping and trimming"
Pipeline #1053 passed with stage
in 1 hour, 8 minutes, and 5 seconds
before_script:
- module add python/3.6.1-2-anaconda
- pip install --user pytest-pythonpath
- module load nextflow/0.24.1-SNAPSHOT
- ln -s /project/shared/bicf_workflow_ref/workflow_testdata/chipseq/*fastq.gz test_data/
test:
script:
- nextflow run workflow/main.nf
- pytest
......@@ -9,7 +9,7 @@
# A unique identifier for the workflow package, text/underscores only
name: 'chipseq_analysis_bicf'
# Who wrote this?
author: 'Beibei Chen'
author: 'Beibei Chen and Venkat Malladi'
# A contact email address for questions
email: 'biohpc-help@utsouthwestern.edu'
# A more informative title for the workflow package
......@@ -17,8 +17,7 @@ title: 'BICF ChIP-seq Analysis Workflow'
# A summary of the workflow package in plain text
description: |
This is a workflow package for the BioHPC/BICF ChIP-seq workflow system.
It implements a simple ChIP-seq analysis workflow and
visualization application.
It implements ChIP-seq analysis workflow and visualization application.
# -----------------------------------------------------------------------------
# DOCUMENTATION
......@@ -40,9 +39,11 @@ documentation_files:
# A list of cluster environment modules that this workflow requires to run.
# Specify versioned module names to ensure reproducability.
workflow_modules:
- 'fastqc/0.11.5'
- 'deeptools/2.3.5'
- 'meme/4.11.1-gcc-openmpi'
- 'python/3.6.1-2-anaconda'
- 'trimgalore/0.4.1'
- 'cutadapt/1.9.1'
- 'fastqc/0.11.2'
- 'bwa/intel/0.7.12'
# A list of parameters used by the workflow, defining how to present them,
# options etc in the web interface. For each parameter:
......@@ -82,10 +83,11 @@ workflow_parameters:
type: files
required: true
description: |
Fastq files of all samples
regex: "*_{1,2}.fastq.gz"
One or more input FASTQ files from a ChIP-seq expereiment and a design
file with the link bewetwen the same file name and sample id
regex: ".*(fastq|fq)*"
- id: singleEnd
- id: pairedEnd
type: select
required: true
choices:
......@@ -98,6 +100,24 @@ workflow_parameters:
read length, and then starts another round of reading from the opposite
end of the fragment.
- id: design
type: file
required: true
description: |
A design file listing sample id, fastq files, corresponding control id
and additional information about the sample.
- id: genome
type: select
choices:
- [ 'GRCh38', 'Human GRCh38']
- [ 'GRCh37', 'Human GRCh37']
- [ 'GRCh38', 'Mouse GRCh38']
required: true
description: |
Reference species and genome used for alignment and subsequent analysis.
# -----------------------------------------------------------------------------
# SHINY APP CONFIGURATION
# -----------------------------------------------------------------------------
......@@ -114,7 +134,7 @@ vizapp_cran_packages:
- shiny
- shinyFiles
# List of any Bioconductor packages, not provided by the modules,
# List of any Bioconductor packages, not provided by the modules,
# that must be made available to the vizapp
vizapp_bioc_packages:
- qusage
......
process.executor='slurm'
process.queue='super'
process.time='5d'
process {
executor = 'slurm'
queue='super'
// Process specific configuration
$checkDesignFile {
module = ['python/3.6.1-2-anaconda']
executor = 'local'
}
$trim_galore {
module = ['python/3.6.1-2-anaconda', 'trimgalore/0.4.1']
cpus = 32
}
$alignReads{
module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.12', 'samtools/intel/1.3']
cpus = 32
}
}
params {
// Reference file paths on BioHPC
genomes {
'GRCh38' { bwa = '/project/shared/bicf_workflow_ref/GRCh38' }
'GRCh37' { bwa = '/project/shared/bicf_workflow_ref/GRCh37' }
'GRCm38' { bwa = '/project/shared/bicf_workflow_ref/GRCm38' }
}
}
trace {
enabled = true
file = 'pipeline_trace.txt'
fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss'
}
......@@ -7,6 +7,18 @@
params.reads = "$baseDir/../test_data/*.fastq.gz"
params.pairedEnd = false
params.designFile = "$baseDir/../test_data/design_ENCSR238SGC_SE.txt"
params.genome = 'GRCm38'
params.genomes = []
params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false
// Check inputs
if( params.bwaIndex ){
bwaIndex = Channel
.fromPath(params.bwaIndex)
.ifEmpty { exit 1, "BWA index not found: ${params.bwaIndex}" }
} else {
exit 1, "No reference genome specified."
}
// Define List of Files
readsList = Channel
......@@ -36,7 +48,7 @@ process checkDesignFile {
if (pairedEnd) {
"""
python $baseDir/scripts/check_design.py -d $designFile -f $readsList -p
python3 $baseDir/scripts/check_design.py -d $designFile -f $readsList -p
"""
}
else {
......@@ -55,14 +67,14 @@ if (pairedEnd) {
} else {
rawReads = designFilePaths
.splitCsv(sep: '\t', header: true)
.map { row -> [ row.sample_id, [row.fastq_read1, row.fastq_read1], row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] }
.map { row -> [ row.sample_id, [row.fastq_read1], row.biosample, row.factor, row.treatment, row.replicate, row.control_id ] }
}
process fastQc {
// Trim raw reads using trimgalore
process trimReads {
tag "$sampleId-$replicate"
publishDir "$baseDir/output/", mode: 'copy',
saveAs: {filename -> filename.indexOf(".zip") > 0 ? "zips/$filename" : "$filename"}
publishDir "$baseDir/output/${task.process}", mode: 'copy'
input:
......@@ -70,11 +82,51 @@ process fastQc {
output:
file '*_fastqc.{zip,html}' into fastqc_results
set sampleId, file('*.fq.gz'), biosample, factor, treatment, replicate, controlId into trimmedReads
file('*trimming_report.txt') into trimgalore_results
script:
"""
python $baseDir/scripts/qc_fastq.py -f $reads
"""
if (pairedEnd) {
"""
python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -p
"""
}
else {
"""
python3 $baseDir/scripts/trim_reads.py -f ${reads[0]}
"""
}
}
// Align trimmed reads using bwa
process alignReads {
tag "$sampleId-$replicate"
publishDir "$baseDir/output/${task.process}", mode: 'copy'
input:
set sampleId, reads, biosample, factor, treatment, replicate, controlId from trimmedReads
file index from bwaIndex.first()
output:
set sampleId, file('*.bam'), biosample, factor, treatment, replicate, controlId into mappedReads
set file('*.srt.bam.flagstat.qc')
script:
if (pairedEnd) {
"""
python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa -p
"""
}
else {
"""
python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa
"""
}
}
profiles {
standard {
includeConfig 'conf/biohpc.config'
}
}
......@@ -11,7 +11,7 @@ For more details:
%(prog)s --help
'''
## SETTINGS
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
......@@ -57,7 +57,7 @@ def check_design_headers(design, paired):
design_headers = list(design.columns.values)
if paired: # paired-end data
if paired: # paired-end data
design_template.extend(['fastq_read2'])
# Check if headers
......@@ -79,7 +79,8 @@ def check_controls(design):
if len(missing_controls) > 0:
logger.error('Missing control experiments: %s', list(missing_controls))
raise Exception("Missing control experiments: %s" % list(missing_controls))
raise Exception("Missing control experiments: %s" %
list(missing_controls))
def check_files(design, fastq, paired):
......@@ -87,9 +88,9 @@ def check_files(design, fastq, paired):
logger.info("Running file check.")
if paired: # paired-end data
if paired: # paired-end data
files = list(design['fastq_read1']) + list(design['fastq_read2'])
else: # single-end data
else: # single-end data
files = design['fastq_read1']
files_found = fastq['name']
......@@ -98,13 +99,14 @@ def check_files(design, fastq, paired):
if len(missing_files) > 0:
logger.error('Missing files from design file: %s', list(missing_files))
raise Exception("Missing files from design file: %s" % list(missing_files))
raise Exception("Missing files from design file: %s" %
list(missing_files))
else:
file_dict = fastq.set_index('name').T.to_dict()
design['fastq_read1'] = design['fastq_read1'] \
.apply(lambda x: file_dict[x]['path'])
if paired: # paired-end data
if paired: # paired-end data
design['fastq_read2'] = design['fastq_read2'] \
.apply(lambda x: file_dict[x]['path'])
return design
......
#!/usr/bin/env python3
'''Align reads to reference genome.'''
import os
import subprocess
import argparse
import shutil
import shlex
import logging
from multiprocessing import cpu_count
import json
import utils
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.gz', '.fq', '.fastq', '_trimmed']
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-f', '--fastq',
help="The fastq file to run triming on.",
nargs='+',
required=True)
parser.add_argument('-r', '--reference',
help="The bwa index of the reference genome.",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
action='store_true')
args = parser.parse_args()
return args
# Functions
def check_tools():
'''Checks for required componenets on user system'''
logger.info('Checking for required libraries and components on this system')
bwa_path = shutil.which("bwa")
if bwa_path:
logger.info('Found bwa: %s', bwa_path)
else:
logger.error('Missing bwa')
raise Exception('Missing bwa')
samtools_path = shutil.which("samtools")
if samtools_path:
logger.info('Found samtools: %s', samtools_path)
else:
logger.error('Missing samtools')
raise Exception('Missing samtools')
def generate_sa(fastq, reference):
'''Use BWA to generate Suffix Arrays.'''
fastq_basename = os.path.basename(utils.strip_extensions(fastq, STRIP_EXTENSIONS))
bwa_aln_params = '-q 5 -l 32 -k 2'
sai = '%s.sai' % (fastq_basename)
with open(sai, 'w') as sai_file:
bwa_command = "bwa aln %s -t %d %s %s" \
% (bwa_aln_params, cpu_count(),
reference, fastq)
logger.info("Running bwa with %s", bwa_command)
subprocess.check_call(shlex.split(bwa_command), stdout=sai_file)
return sai
def align_se(fastq, sai, reference, fastq_basename):
'''Use BWA to align SE data.'''
bam_filename = '%s.srt.bam' % (fastq_basename)
steps = [
"bwa samse %s %s %s"
% (reference, sai[0], fastq[0]),
"samtools view -@%d -Su -" % (cpu_count()),
"samtools sort -@%d -o %s"
% (cpu_count(), bam_filename)]
out, err = utils.run_pipe(steps)
if err:
logger.error("samse/samtools error: %s" % (err))
return bam_filename
def align_pe(fastq, sai, reference, fastq_basename):
'''Use BWA to align PE data.'''
sam_filename = "%s.sam" % (fastq_basename)
badcigar_filename = "%s.badReads" % (fastq_basename)
bam_filename = '%s.srt.bam' % (fastq_basename)
# Remove read pairs with bad CIGAR strings and sort by position
steps = [
"bwa sampe -P %s %s %s %s %s"
% (reference, sai[0], sai[1],
fastq[0], fastq[1]),
"tee %s" % (sam_filename),
r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
"sort",
"uniq"]
out, err = utils.run_pipe(steps, badcigar_filename)
if err:
logger.error("sampe error: %s" % (err))
steps = [
"cat %s" % (sam_filename),
"grep -v -F -f %s" % (badcigar_filename),
"samtools view -@%d -Su -" % (cpu_count()),
"samtools sort -@%d -o %s"
% (cpu_count(), bam_filename)]
out, err = utils.run_pipe(steps)
if err:
logger.error("samtools error: %s" % (err))
return bam_filename
def main():
args = get_args()
paired = args.paired
fastq = args.fastq
reference = args.reference
# Create a file handler
handler = logging.FileHandler('map.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Run Suffix Array generation
sai = []
for fq in fastq:
sai_filename = generate_sa(fq, reference)
sai.append(sai_filename)
# Run alignment for either PE or SE
if paired: # paired-end data
fastq_r1_basename = os.path.basename(
utils.strip_extensions(fastq[0], STRIP_EXTENSIONS))
fastq_r2_basename = os.path.basename(
utils.strip_extensions(fastq[1], STRIP_EXTENSIONS))
fastq_basename = fastq_r1_basename + fastq_r2_basename
bam_filename = align_pe(fastq, sai, reference, fastq_basename)
else:
fastq_basename = os.path.basename(
utils.strip_extensions(fastq[0], STRIP_EXTENSIONS))
bam_filename = align_se(fastq, sai, reference, fastq_basename)
bam_mapstats_filename = '%s.srt.bam.flagstat.qc' % (fastq_basename)
with open(bam_mapstats_filename, 'w') as fh:
subprocess.check_call(
shlex.split("samtools flagstat %s" % (bam_filename)),
stdout=fh)
# Remove sai files
for sai_file in sai:
os.remove(sai_file)
if __name__ == '__main__':
main()
......@@ -66,7 +66,7 @@ def main():
# Create a file handler
handler = logging.FileHandler('qc.log')
LOGGER.addHandler(handler)
logger.addHandler(handler)
# Check if tools are present
check_tools()
......
#!/usr/bin/env python3
'''Trim low quality reads and remove sequences less than 35 base pairs.'''
import subprocess
import argparse
import shutil
import logging
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-f', '--fastq',
help="The fastq file to run triming on.",
nargs='+',
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
action='store_true')
args = parser.parse_args()
return args
def check_tools():
'''Checks for required componenets on user system.'''
logger.info('Checking for required libraries and components on this system')
trimgalore_path = shutil.which("trim_galore")
if trimgalore_path:
logger.info('Found trimgalore: %s', trimgalore_path)
else:
logger.error('Missing trimgalore')
raise Exception('Missing trimgalore')
cutadapt_path = shutil.which("cutadapt")
if cutadapt_path:
logger.info('Found cutadapt: %s', cutadapt_path)
else:
logger.error('Missing cutadapt')
raise Exception('Missing cutadapt')
def trim_reads(fastq, paired):
'''Run trim_galore on 1 or 2 files.'''
if paired: # paired-end data
trim_params = '--paired -q 25 --illumina --gzip --length 35'
trim_command = "trim_galore %s %s %s " \
% (trim_params, fastq[0], fastq[1])
else:
trim_params = '-q 25 --illumina --gzip --length 35'
trim_command = "trim_galore %s %s " \
% (trim_params, fastq[0])
logger.info("Running trim_galore with %s", trim_command)
trim = subprocess.Popen(trim_command, shell=True)
out, err = trim.communicate()
def main():
args = get_args()
# Create a file handler
handler = logging.FileHandler('trim.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Run trim_reads
trim_reads(args.fastq, args.paired)
if __name__ == '__main__':
main()
#!/usr/bin/env python3
'''General utilities.'''
import shlex
import logging
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = True
def run_pipe(steps, outfile=None):
# TODO: capture stderr
from subprocess import Popen, PIPE
p = None
p_next = None
first_step_n = 1
last_step_n = len(steps)
for n, step in enumerate(steps, start=first_step_n):
logger.debug("step %d: %s" % (n, step))
if n == first_step_n:
if n == last_step_n and outfile: # one-step pipeline with outfile
with open(outfile, 'w') as fh:
print("one step shlex: %s to file: %s" % (shlex.split(step), outfile))
p = Popen(shlex.split(step), stdout=fh)
break
print("first step shlex to stdout: %s" % (shlex.split(step)))
p = Popen(shlex.split(step), stdout=PIPE)
elif n == last_step_n and outfile: # only treat the last step specially if you're sending stdout to a file
with open(outfile, 'w') as fh:
print("last step shlex: %s to file: %s" % (shlex.split(step), outfile))
p_last = Popen(shlex.split(step), stdin=p.stdout, stdout=fh)
p.stdout.close()
p = p_last
else: # handles intermediate steps and, in the case of a pipe to stdout, the last step
print("intermediate step %d shlex to stdout: %s" % (n, shlex.split(step)))
p_next = Popen(shlex.split(step), stdin=p.stdout, stdout=PIPE)
p.stdout.close()
p = p_next
out, err = p.communicate()
return out, err
def strip_extensions(filename, extensions):
'''Strips extensions to get basename of file.'''
basename = filename
for extension in extensions:
basename = basename.rpartition(extension)[0] or basename
return basename
#!/usr/bin/env python3
import os
import pytest
import pandas as pd
from io import StringIO
import check_design
import sys
DESIGN_STRING = """sample_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\tfastq_read1
......@@ -49,11 +47,12 @@ def design_2(design):
design_df = design.drop(design.index[2])
return design_df
@pytest.fixture
def design_3(design):
# Drop A_2 and B_2 and append as fastq_read2
design_df = design.drop(design.index[[1,3]])
design_df['fastq_read2'] = design.loc[[1,3],'fastq_read1'].values
design_df = design.drop(design.index[[1, 3]])
design_df['fastq_read2'] = design.loc[[1, 3], 'fastq_read1'].values
return design_df
......@@ -94,10 +93,10 @@ def test_check_files_missing_files(design, fastq_files_1):
def test_check_files_output_singleend(design, fastq_files):
paired = False
new_design = check_design.check_files(design, fastq_files, paired)
assert new_design.loc[0,'fastq_read1'] == "/path/to/file/A_1.fastq.gz"
assert new_design.loc[0, 'fastq_read1'] == "/path/to/file/A_1.fastq.gz"
def test_check_files_output_pairedend(design_3, fastq_files):
paired = True
new_design = check_design.check_files(design_3, fastq_files, paired)
assert new_design.loc[0,'fastq_read2'] == "/path/to/file/A_2.fastq.gz"
assert new_design.loc[0, 'fastq_read2'] == "/path/to/file/A_2.fastq.gz"
#!/usr/bin/env python3
import pytest
import os
def test_map_reads_singleend():
aligned_reads_report = os.path.dirname(os.path.abspath(__file__)) + \
'/../output/alignReads/ENCFF646LXU.srt.bam.flagstat.qc'
samtools_report = open(aligned_reads_report).readlines()
assert '80795025 + 0 in total' in samtools_report[0]
assert '80050072 + 0 mapped (99.08% : N/A)' in samtools_report[4]
def test_map_reads_pairedend():
# Do the same thing for paired end data
pass
#!/usr/bin/env python3
import pytest
import os
def test_trim_reads_singleend():
raw_fastq = os.path.dirname(os.path.abspath(__file__)) + \
'/../../test_data/ENCFF833BLU.fastq.gz'
trimmed_fastq = os.path.dirname(os.path.abspath(__file__)) + \
'/../output/trimReads/ENCFF833BLU_trimmed.fq.gz'
trimmed_fastq_report = os.path.dirname(os.path.abspath(__file__)) + \
'/../output/trimReads/ENCFF833BLU.fastq.gz_trimming_report.txt'
assert os.path.getsize(raw_fastq) != os.path.getsize(trimmed_fastq)
assert os.path.getsize(trimmed_fastq) == 2512853101
assert 'Trimming mode: single-end' in open(trimmed_fastq_report).readlines()[4]
def test_trim_reads_pairedend():
# Do the same thing for paired end data
pass
#!/usr/bin/env python3
import pytest
import utils
STRIP_EXTENSIONS = ['.gz', '.fq', '.fastq', '.fa', '.fasta']
@pytest.fixture
def steps():
steps = []
return steps
@pytest.fixture
def steps_1(steps):
design_file = "test_data/design_ENCSR238SGC_SE.txt"
step = [
"grep H3K4me1 %s " % (design_file)]
return step
@pytest.fixture
def steps_2(steps_1):
steps_1.extend([
"cut -f7"
])
return steps_1
def test_run_one_step(steps_1, capsys):
check_output = 'ENCSR238SGC\tlimb\tH3K4me1\tNone\t1\tENCSR687ALB\tENCFF833BLU.fastq.gz'.encode('UTF-8')
out, err = utils.run_pipe(steps_1)
output, errors = capsys.readouterr()
assert "first step shlex to stdout" in output
assert check_output in out
def test_run_two_step(steps_2, capsys):
check_output = 'ENCFF833BLU.fastq.gz\nENCFF646LXU.fastq.gz'.encode('UTF-8')
out, err = utils.run_pipe(steps_2)
output, errors = capsys.readouterr()
assert "intermediate step 2 shlex to stdout" in output
assert check_output in out
def test_run_last_step_file(steps_2, capsys, tmpdir):
check_output = 'ENCFF833BLU.fastq.gz\nENCFF646LXU.fastq.gz'
tmp_outfile = tmpdir.join('output.txt')
out, err = utils.run_pipe(steps_2, tmp_outfile.strpath)
output, errors = capsys.readouterr()
assert "last step shlex" in output
assert check_output in tmp_outfile.read()
def test_strip_extensions():
filename = utils.strip_extensions('ENCFF833BLU.fastq.gz', STRIP_EXTENSIONS)
assert filename == 'ENCFF833BLU'
def test_strip_extensions_not_valid():
filename = utils.strip_extensions('ENCFF833BLU.not.valid', STRIP_EXTENSIONS)
assert filename == 'ENCFF833BLU.not.valid'
def test_strip_extensions_missing_basename():
filename = utils.strip_extensions('.fastq.gz', STRIP_EXTENSIONS)
assert filename == '.fastq'
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment