Skip to content
Snippets Groups Projects
Commit 2074cdc8 authored by Venkat Malladi's avatar Venkat Malladi
Browse files

Merge branch '16-FixAlignReads' into 'master'

Resolve "Fix Align Reads"

Closes #16

See merge request !8
parents e6657f34 32201ef4
1 merge request!8Resolve "Fix Align Reads"
Pipeline #5287 passed with stages
in 6 hours and 16 minutes
...@@ -13,6 +13,7 @@ All notable changes to this project will be documented in this file. ...@@ -13,6 +13,7 @@ All notable changes to this project will be documented in this file.
- Added new CI/CD tests for better coverage - Added new CI/CD tests for better coverage
- Added punctuation check in design file - Added punctuation check in design file
- Added sequence (fastq1) length into design file for better mapping - Added sequence (fastq1) length into design file for better mapping
- Raw fastq1 sequence length determines mapper
## [publish_1.0.0 ] - 2019-12-03 ## [publish_1.0.0 ] - 2019-12-03
Initial release of pipeline Initial release of pipeline
......
...@@ -46,6 +46,7 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git ...@@ -46,6 +46,7 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
+ There are ??? steps to the pipeline + There are ??? steps to the pipeline
1. Check input files 1. Check input files
2. Trim adaptors with TrimGalore! 2. Trim adaptors with TrimGalore!
3. Map reads with BWA, filter with SamTools, and sort with Sambamba
## Output Files ## Output Files
......
...@@ -9,9 +9,9 @@ ...@@ -9,9 +9,9 @@
# A unique identifier for the workflow package, text/underscores only # A unique identifier for the workflow package, text/underscores only
name: 'atacseq_analysis_bicf' name: 'atacseq_analysis_bicf'
# Who wrote this? # Who wrote this?
author: 'Venkat Malladi' author: 'Holly Ruess and Venkat Malladi'
# A contact email address for questions # A contact email address for questions
email: 'bicfp@utsouthwestern.edu' email: 'bicf@utsouthwestern.edu'
# A more informative title for the workflow package # A more informative title for the workflow package
title: 'BICF ATAC-seq Analysis Workflow' title: 'BICF ATAC-seq Analysis Workflow'
# A summary of the workflow package in plain text # A summary of the workflow package in plain text
......
...@@ -12,7 +12,7 @@ process { ...@@ -12,7 +12,7 @@ process {
cpus = 32 cpus = 32
} }
$alignReads{ $alignReads{
module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.12', 'samtools/intel/1.3'] module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.12', 'samtools/intel/1.3', 'sambamba/0.6.6']
queue = '128GB,256GB,256GBv1' queue = '128GB,256GB,256GBv1'
} }
$filterReads{ $filterReads{
......
...@@ -146,32 +146,51 @@ process trimReads { ...@@ -146,32 +146,51 @@ process trimReads {
// Align trimmed reads using bwa // Align trimmed reads using bwa
process alignReads { process alignReads {
tag "$sampleId-$replicate" tag "${sampleId}-${replicate}"
publishDir "$baseDir/output/${task.process}", mode: 'copy' publishDir "${outDir}/${task.process}/${sampleId}", mode: 'copy'
input: input:
set sampleId, reads, experimentId, replicate, fqLength from trimmedReads
set sampleId, reads, experimentId, replicate from trimmedReads file index from bwaIndex.first()
file index from bwaIndex.first()
output: output:
set sampleId, file('*.bam'), experimentId, replicate into mappedReads
set sampleId, file('*.bam'), experimentId, replicate into mappedReads file '*.flagstat.qc' into mappedReadsStats
file '*.srt.bam.flagstat.qc' into mappedReadsStats
script: script:
if (params.astrocyte == true) {
if (pairedEnd) { if (pairedEnd) {
""" """
python3 $baseDir/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -p module load python/3.6.1-2-anaconda
""" module load bwa/intel/0.7.12
} module load samtools/intel/1.3
else { module load sambamba/0.6.6
""" python3 ${baseDir}/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -s $sampleId -l ${fqLength} -p
python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa """
""" }
} else {
"""
module load python/3.6.1-2-anaconda
module load bwa/intel/0.7.12
module load samtools/intel/1.3
module load sambamba/0.6.6
python3 ${baseDir}/scripts/map_reads.py -f ${reads} -r ${index}/genome.fa -s $sampleId -l ${fqLength}
"""
}
}
else {
if (pairedEnd) {
"""
python3 ${baseDir}/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -s $sampleId -l ${fqLength} -p
"""
}
else {
"""
python3 ${baseDir}/scripts/map_reads.py -f ${reads} -r ${index}/genome.fa -s $sampleId -l ${fqLength}
"""
}
}
} }
......
...@@ -9,6 +9,7 @@ import shutil ...@@ -9,6 +9,7 @@ import shutil
import shlex import shlex
import logging import logging
from multiprocessing import cpu_count from multiprocessing import cpu_count
from psutil import virtual_memory
import utils import utils
EPILOG = ''' EPILOG = '''
...@@ -46,6 +47,14 @@ def get_args(): ...@@ -46,6 +47,14 @@ def get_args():
help="The bwa index of the reference genome.", help="The bwa index of the reference genome.",
required=True) required=True)
parser.add_argument('-s', '--sample',
help="The name of the sample.",
required=True)
parser.add_argument('-l', '--fqlength',
help="The length of untrimmed reads.",
required=True)
parser.add_argument('-p', '--paired', parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.", help="True/False if paired-end or single end.",
default=False, default=False,
...@@ -56,8 +65,6 @@ def get_args(): ...@@ -56,8 +65,6 @@ def get_args():
# Functions # Functions
def check_tools(): def check_tools():
'''Checks for required componenets on user system''' '''Checks for required componenets on user system'''
...@@ -66,6 +73,18 @@ def check_tools(): ...@@ -66,6 +73,18 @@ def check_tools():
bwa_path = shutil.which("bwa") bwa_path = shutil.which("bwa")
if bwa_path: if bwa_path:
logger.info('Found bwa: %s', bwa_path) logger.info('Found bwa: %s', bwa_path)
# Get Version
bwa_version_command = "bwa"
try:
subprocess.check_output(bwa_version_command, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
bwa_version = e.output
# Write to file
bwa_file = open("version_bwa.txt", "wb")
bwa_file.write(bwa_version)
bwa_file.close()
else: else:
logger.error('Missing bwa') logger.error('Missing bwa')
raise Exception('Missing bwa') raise Exception('Missing bwa')
...@@ -73,10 +92,38 @@ def check_tools(): ...@@ -73,10 +92,38 @@ def check_tools():
samtools_path = shutil.which("samtools") samtools_path = shutil.which("samtools")
if samtools_path: if samtools_path:
logger.info('Found samtools: %s', samtools_path) logger.info('Found samtools: %s', samtools_path)
# Get Version
samtools_version_command = "samtools --version"
samtools_version = subprocess.check_output(samtools_version_command, shell=True)
# Write to file
samtools_file = open("version_samtools.txt", "wb")
samtools_file.write(samtools_version)
samtools_file.close()
else: else:
logger.error('Missing samtools') logger.error('Missing samtools')
raise Exception('Missing samtools') raise Exception('Missing samtools')
sambamba_path = shutil.which("sambamba")
if sambamba_path:
logger.info('Found sambamba: %s', sambamba_path)
# Get Version
sambamba_version_command = "sambamba"
try:
subprocess.check_output(sambamba_version_command, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
sambamba_version = e.output
# Write to file
sambamba_file = open("version_sambamba.txt", "wb")
sambamba_file.write(sambamba_version)
sambamba_file.close()
else:
logger.error('Missing sambamba')
raise Exception('Missing sambamba')
def generate_sa(fastq, reference): def generate_sa(fastq, reference):
'''Use BWA to generate Suffix Arrays.''' '''Use BWA to generate Suffix Arrays.'''
...@@ -97,31 +144,55 @@ def generate_sa(fastq, reference): ...@@ -97,31 +144,55 @@ def generate_sa(fastq, reference):
return sai return sai
def align_se(fastq, sai, reference, fastq_basename): def short_align_se(fastq, sai, reference, fastq_basename):
'''Use BWA to align SE data.''' '''Use BWA samse to align short SE data.'''
bam_filename = '%s.srt.bam' % (fastq_basename) bam_filename = '%s.bam' % (fastq_basename)
steps = [ steps = [
"bwa samse %s %s %s" "bwa samse %s %s %s"
% (reference, sai[0], fastq[0]), % (reference, sai[0], fastq[0]),
"samtools view -@%d -Su -" % (cpu_count()), "samtools view -@%d -Su -" % (cpu_count()),
"samtools sort -@%d -o %s" "sambamba sort -t %d -m %dKB -o %s /dev/stdin"
% (cpu_count(), bam_filename)] % (cpu_count(), virtual_memory()[1], bam_filename),
]
out, err = utils.run_pipe(steps) out, err = utils.run_pipe(steps)
logger.info("Running bwa alignment with %s", steps)
if err: if err:
logger.error("samse/samtools error: %s", err) logger.error("samse/samtools error: %s", err)
return bam_filename return bam_filename
def align_pe(fastq, sai, reference, fastq_basename): def long_align_se(fastq, reference, fastq_basename):
'''Use BWA to align PE data.''' '''Use BWA mem to align long SE data.'''
bam_filename = '%s.bam' % (fastq_basename)
steps = [
"bwa mem %s %s"
% (reference, fastq[0]),
"samtools view -@%d -Su -" % (cpu_count()),
"sambamba sort -t %d -m %dKB -o %s /dev/stdin"
% (cpu_count(), virtual_memory()[1], bam_filename)]
out, err = utils.run_pipe(steps)
logger.info("Running bwa alignment with %s", steps)
if err:
logger.error("mem/samtools error: %s", err)
return bam_filename
def short_align_pe(fastq, sai, reference, fastq_basename):
'''Use BWA samse to align short PE data.'''
sam_filename = "%s.sam" % (fastq_basename) sam_filename = "%s.sam" % (fastq_basename)
badcigar_filename = "%s.badReads" % (fastq_basename) badcigar_filename = "%s.badReads" % (fastq_basename)
bam_filename = '%s.srt.bam' % (fastq_basename) bam_filename = '%s.bam' % (fastq_basename)
# Remove read pairs with bad CIGAR strings and sort by position # Remove read pairs with bad CIGAR strings and sort by position
steps = [ steps = [
...@@ -131,9 +202,12 @@ def align_pe(fastq, sai, reference, fastq_basename): ...@@ -131,9 +202,12 @@ def align_pe(fastq, sai, reference, fastq_basename):
"tee %s" % (sam_filename), "tee %s" % (sam_filename),
r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""", r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
"sort", "sort",
"uniq"] "uniq",
]
out, err = utils.run_pipe(steps, badcigar_filename) out, err = utils.run_pipe(steps, badcigar_filename)
logger.info("Running bwa alignment with %s", steps)
if err: if err:
logger.error("sampe error: %s", err) logger.error("sampe error: %s", err)
...@@ -141,21 +215,47 @@ def align_pe(fastq, sai, reference, fastq_basename): ...@@ -141,21 +215,47 @@ def align_pe(fastq, sai, reference, fastq_basename):
"cat %s" % (sam_filename), "cat %s" % (sam_filename),
"grep -v -F -f %s" % (badcigar_filename), "grep -v -F -f %s" % (badcigar_filename),
"samtools view -@%d -Su -" % (cpu_count()), "samtools view -@%d -Su -" % (cpu_count()),
"samtools sort -@%d -o %s" "sambamba sort -t %d -m %dKB -o %s /dev/stdin"
% (cpu_count(), bam_filename)] % (cpu_count(), virtual_memory()[1], bam_filename),
]
out, err = utils.run_pipe(steps) out, err = utils.run_pipe(steps)
logger.info("Running bwa alignment with %s", steps)
if err: if err:
logger.error("samtools error: %s", err) logger.error("samtools error: %s", err)
return bam_filename return bam_filename
def long_align_pe(fastq, reference, fastq_basename):
'''Use BWA mem to align long PE data.'''
bam_filename = '%s.bam' % (fastq_basename)
steps = [
"bwa mem %s %s %s"
% (reference, fastq[0], fastq[1]),
"samtools view -@%d -Su -" % (cpu_count()),
"sambamba sort -t %d -m %dKB -o %s /dev/stdin"
% (cpu_count(), virtual_memory()[1], bam_filename)]
out, err = utils.run_pipe(steps)
logger.info("Running bwa alignment with %s", steps)
if err:
logger.error("mem/samtools error: %s", err)
return bam_filename
def main(): def main():
args = get_args() args = get_args()
paired = args.paired paired = args.paired
fastq = args.fastq fastq = args.fastq
reference = args.reference reference = args.reference
sample = args.sample
fqlength = args.fqlength
# Create a file handler # Create a file handler
handler = logging.FileHandler('map.log') handler = logging.FileHandler('map.log')
...@@ -164,38 +264,47 @@ def main(): ...@@ -164,38 +264,47 @@ def main():
# Check if tools are present # Check if tools are present
check_tools() check_tools()
# Run Suffix Array generation # Make file basename
sai = [] fastq_basename = sample
for fq in fastq:
sai_filename = generate_sa(fq, reference)
sai.append(sai_filename)
# Run alignment for either PE or SE # Run alignment for either PE or SE
if paired: # paired-end data if paired: # paired-end data
fastq_r1_basename = os.path.basename( if int(fqlength) < 70:
utils.strip_extensions(fastq[0], STRIP_EXTENSIONS)) sai = []
fastq_r2_basename = os.path.basename( for fastq_file in fastq:
utils.strip_extensions(fastq[1], STRIP_EXTENSIONS)) sai_filename = generate_sa(fastq_file, reference)
fastq_basename = fastq_r1_basename + fastq_r2_basename sai.append(sai_filename)
bam_filename = short_align_pe(fastq, sai, reference, fastq_basename)
bam_filename = align_pe(fastq, sai, reference, fastq_basename) else:
bam_filename = long_align_pe(fastq, reference, fastq_basename)
else: else:
fastq_basename = os.path.basename( if int(fqlength) < 70:
utils.strip_extensions(fastq[0], STRIP_EXTENSIONS)) sai = []
for fastq_file in fastq:
bam_filename = align_se(fastq, sai, reference, fastq_basename) sai_filename = generate_sa(fastq_file, reference)
sai.append(sai_filename)
bam_mapstats_filename = '%s.srt.bam.flagstat.qc' % (fastq_basename) bam_filename = short_align_se(fastq, sai, reference, fastq_basename)
with open(bam_mapstats_filename, 'w') as out_file: else:
bam_filename = long_align_se(fastq, reference, fastq_basename)
bam_mapstats_filename = '%s.flagstat.qc' % (fastq_basename)
with open(bam_mapstats_filename, 'w') as temp_file:
subprocess.check_call( subprocess.check_call(
shlex.split("samtools flagstat %s" % (bam_filename)), shlex.split("samtools flagstat %s" % (bam_filename)),
stdout=out_file) stdout=temp_file)
# Remove sai files #Genome/Bad fastq File Check
for sai_file in sai: file_check = open(bam_mapstats_filename).readlines()
os.remove(sai_file) percent = file_check[4].split('(')[1]
percent = percent.split('%')[0]
if float(percent) < 10:
raise Exception ('Mapped Genes too low: Check for correct Genotype')
# Remove sai files
if int(fqlength) < 70:
for sai_file in sai:
os.remove(sai_file)
if __name__ == '__main__': if __name__ == '__main__':
main() main()
...@@ -7,17 +7,20 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ ...@@ -7,17 +7,20 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../output/alignReads/' '/../output/alignReads/'
@pytest.mark.integration @pytest.mark.pairedend_mouse
def test_map_reads_singleend(): def test_align_reads_pairedend_mouse():
#assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.srt.bam')) assert os.path.exists(os.path.join(test_output_path, 'ENCLB122XDP/ENCLB122XDP.bam'))
#aligned_reads_report = test_output_path + 'ENCFF646LXU.srt.bam.flagstat.qc' aligned_reads_report = test_output_path + 'ENCLB122XDP/ENCLB122XDP.flagstat.qc'
#samtools_report = open(aligned_reads_report).readlines() samtools_report = open(aligned_reads_report).readlines()
#assert '80795025 + 0 in total' in samtools_report[0] assert '62618664 + 0 in total' in samtools_report[0]
#assert '80050072 + 0 mapped (99.08% : N/A)' in samtools_report[4] assert '59858415 + 0 mapped (95.59% : N/A)' in samtools_report[4]
pass assert '55827837 + 0 properly paired (89.16% : N/A)' in samtools_report[8]
@pytest.mark.integration @pytest.mark.singleend_human
def test_map_reads_pairedend(): def test_align_reads_singleend_human():
# Do the same thing for paired end data assert os.path.exists(os.path.join(test_output_path, 'ENCLB969KTX/ENCLB969KTX.bam'))
pass aligned_reads_report = test_output_path + 'ENCLB969KTX/ENCLB969KTX.flagstat.qc'
samtools_report = open(aligned_reads_report).readlines()
assert '61046225 + 0 in total' in samtools_report[0]
assert '59599010 + 0 mapped (97.63% : N/A)' in samtools_report[4]
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment