Skip to content
Snippets Groups Projects
Commit 2074cdc8 authored by Venkat Malladi's avatar Venkat Malladi
Browse files

Merge branch '16-FixAlignReads' into 'master'

Resolve "Fix Align Reads"

Closes #16

See merge request !8
parents e6657f34 32201ef4
1 merge request!8Resolve "Fix Align Reads"
Pipeline #5287 passed with stages
in 6 hours and 16 minutes
......@@ -13,6 +13,7 @@ All notable changes to this project will be documented in this file.
- Added new CI/CD tests for better coverage
- Added punctuation check in design file
- Added sequence (fastq1) length into design file for better mapping
- Raw fastq1 sequence length determines mapper
## [publish_1.0.0 ] - 2019-12-03
Initial release of pipeline
......
......@@ -46,6 +46,7 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
+ There are ??? steps to the pipeline
1. Check input files
2. Trim adaptors with TrimGalore!
3. Map reads with BWA, filter with SamTools, and sort with Sambamba
## Output Files
......
......@@ -9,9 +9,9 @@
# A unique identifier for the workflow package, text/underscores only
name: 'atacseq_analysis_bicf'
# Who wrote this?
author: 'Venkat Malladi'
author: 'Holly Ruess and Venkat Malladi'
# A contact email address for questions
email: 'bicfp@utsouthwestern.edu'
email: 'bicf@utsouthwestern.edu'
# A more informative title for the workflow package
title: 'BICF ATAC-seq Analysis Workflow'
# A summary of the workflow package in plain text
......
......@@ -12,7 +12,7 @@ process {
cpus = 32
}
$alignReads{
module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.12', 'samtools/intel/1.3']
module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.12', 'samtools/intel/1.3', 'sambamba/0.6.6']
queue = '128GB,256GB,256GBv1'
}
$filterReads{
......
......@@ -146,32 +146,51 @@ process trimReads {
// Align trimmed reads using bwa
process alignReads {
tag "$sampleId-$replicate"
publishDir "$baseDir/output/${task.process}", mode: 'copy'
tag "${sampleId}-${replicate}"
publishDir "${outDir}/${task.process}/${sampleId}", mode: 'copy'
input:
set sampleId, reads, experimentId, replicate from trimmedReads
file index from bwaIndex.first()
set sampleId, reads, experimentId, replicate, fqLength from trimmedReads
file index from bwaIndex.first()
output:
set sampleId, file('*.bam'), experimentId, replicate into mappedReads
file '*.srt.bam.flagstat.qc' into mappedReadsStats
set sampleId, file('*.bam'), experimentId, replicate into mappedReads
file '*.flagstat.qc' into mappedReadsStats
script:
if (pairedEnd) {
"""
python3 $baseDir/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -p
"""
}
else {
"""
python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa
"""
}
if (params.astrocyte == true) {
if (pairedEnd) {
"""
module load python/3.6.1-2-anaconda
module load bwa/intel/0.7.12
module load samtools/intel/1.3
module load sambamba/0.6.6
python3 ${baseDir}/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -s $sampleId -l ${fqLength} -p
"""
}
else {
"""
module load python/3.6.1-2-anaconda
module load bwa/intel/0.7.12
module load samtools/intel/1.3
module load sambamba/0.6.6
python3 ${baseDir}/scripts/map_reads.py -f ${reads} -r ${index}/genome.fa -s $sampleId -l ${fqLength}
"""
}
}
else {
if (pairedEnd) {
"""
python3 ${baseDir}/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -s $sampleId -l ${fqLength} -p
"""
}
else {
"""
python3 ${baseDir}/scripts/map_reads.py -f ${reads} -r ${index}/genome.fa -s $sampleId -l ${fqLength}
"""
}
}
}
......
......@@ -9,6 +9,7 @@ import shutil
import shlex
import logging
from multiprocessing import cpu_count
from psutil import virtual_memory
import utils
EPILOG = '''
......@@ -46,6 +47,14 @@ def get_args():
help="The bwa index of the reference genome.",
required=True)
parser.add_argument('-s', '--sample',
help="The name of the sample.",
required=True)
parser.add_argument('-l', '--fqlength',
help="The length of untrimmed reads.",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
......@@ -56,8 +65,6 @@ def get_args():
# Functions
def check_tools():
'''Checks for required componenets on user system'''
......@@ -66,6 +73,18 @@ def check_tools():
bwa_path = shutil.which("bwa")
if bwa_path:
logger.info('Found bwa: %s', bwa_path)
# Get Version
bwa_version_command = "bwa"
try:
subprocess.check_output(bwa_version_command, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
bwa_version = e.output
# Write to file
bwa_file = open("version_bwa.txt", "wb")
bwa_file.write(bwa_version)
bwa_file.close()
else:
logger.error('Missing bwa')
raise Exception('Missing bwa')
......@@ -73,10 +92,38 @@ def check_tools():
samtools_path = shutil.which("samtools")
if samtools_path:
logger.info('Found samtools: %s', samtools_path)
# Get Version
samtools_version_command = "samtools --version"
samtools_version = subprocess.check_output(samtools_version_command, shell=True)
# Write to file
samtools_file = open("version_samtools.txt", "wb")
samtools_file.write(samtools_version)
samtools_file.close()
else:
logger.error('Missing samtools')
raise Exception('Missing samtools')
sambamba_path = shutil.which("sambamba")
if sambamba_path:
logger.info('Found sambamba: %s', sambamba_path)
# Get Version
sambamba_version_command = "sambamba"
try:
subprocess.check_output(sambamba_version_command, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
sambamba_version = e.output
# Write to file
sambamba_file = open("version_sambamba.txt", "wb")
sambamba_file.write(sambamba_version)
sambamba_file.close()
else:
logger.error('Missing sambamba')
raise Exception('Missing sambamba')
def generate_sa(fastq, reference):
'''Use BWA to generate Suffix Arrays.'''
......@@ -97,31 +144,55 @@ def generate_sa(fastq, reference):
return sai
def align_se(fastq, sai, reference, fastq_basename):
'''Use BWA to align SE data.'''
def short_align_se(fastq, sai, reference, fastq_basename):
'''Use BWA samse to align short SE data.'''
bam_filename = '%s.srt.bam' % (fastq_basename)
bam_filename = '%s.bam' % (fastq_basename)
steps = [
"bwa samse %s %s %s"
% (reference, sai[0], fastq[0]),
"samtools view -@%d -Su -" % (cpu_count()),
"samtools sort -@%d -o %s"
% (cpu_count(), bam_filename)]
"sambamba sort -t %d -m %dKB -o %s /dev/stdin"
% (cpu_count(), virtual_memory()[1], bam_filename),
]
out, err = utils.run_pipe(steps)
logger.info("Running bwa alignment with %s", steps)
if err:
logger.error("samse/samtools error: %s", err)
return bam_filename
def align_pe(fastq, sai, reference, fastq_basename):
'''Use BWA to align PE data.'''
def long_align_se(fastq, reference, fastq_basename):
'''Use BWA mem to align long SE data.'''
bam_filename = '%s.bam' % (fastq_basename)
steps = [
"bwa mem %s %s"
% (reference, fastq[0]),
"samtools view -@%d -Su -" % (cpu_count()),
"sambamba sort -t %d -m %dKB -o %s /dev/stdin"
% (cpu_count(), virtual_memory()[1], bam_filename)]
out, err = utils.run_pipe(steps)
logger.info("Running bwa alignment with %s", steps)
if err:
logger.error("mem/samtools error: %s", err)
return bam_filename
def short_align_pe(fastq, sai, reference, fastq_basename):
'''Use BWA samse to align short PE data.'''
sam_filename = "%s.sam" % (fastq_basename)
badcigar_filename = "%s.badReads" % (fastq_basename)
bam_filename = '%s.srt.bam' % (fastq_basename)
bam_filename = '%s.bam' % (fastq_basename)
# Remove read pairs with bad CIGAR strings and sort by position
steps = [
......@@ -131,9 +202,12 @@ def align_pe(fastq, sai, reference, fastq_basename):
"tee %s" % (sam_filename),
r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
"sort",
"uniq"]
"uniq",
]
out, err = utils.run_pipe(steps, badcigar_filename)
logger.info("Running bwa alignment with %s", steps)
if err:
logger.error("sampe error: %s", err)
......@@ -141,21 +215,47 @@ def align_pe(fastq, sai, reference, fastq_basename):
"cat %s" % (sam_filename),
"grep -v -F -f %s" % (badcigar_filename),
"samtools view -@%d -Su -" % (cpu_count()),
"samtools sort -@%d -o %s"
% (cpu_count(), bam_filename)]
"sambamba sort -t %d -m %dKB -o %s /dev/stdin"
% (cpu_count(), virtual_memory()[1], bam_filename),
]
out, err = utils.run_pipe(steps)
logger.info("Running bwa alignment with %s", steps)
if err:
logger.error("samtools error: %s", err)
return bam_filename
def long_align_pe(fastq, reference, fastq_basename):
'''Use BWA mem to align long PE data.'''
bam_filename = '%s.bam' % (fastq_basename)
steps = [
"bwa mem %s %s %s"
% (reference, fastq[0], fastq[1]),
"samtools view -@%d -Su -" % (cpu_count()),
"sambamba sort -t %d -m %dKB -o %s /dev/stdin"
% (cpu_count(), virtual_memory()[1], bam_filename)]
out, err = utils.run_pipe(steps)
logger.info("Running bwa alignment with %s", steps)
if err:
logger.error("mem/samtools error: %s", err)
return bam_filename
def main():
args = get_args()
paired = args.paired
fastq = args.fastq
reference = args.reference
sample = args.sample
fqlength = args.fqlength
# Create a file handler
handler = logging.FileHandler('map.log')
......@@ -164,38 +264,47 @@ def main():
# Check if tools are present
check_tools()
# Run Suffix Array generation
sai = []
for fq in fastq:
sai_filename = generate_sa(fq, reference)
sai.append(sai_filename)
# Make file basename
fastq_basename = sample
# Run alignment for either PE or SE
if paired: # paired-end data
fastq_r1_basename = os.path.basename(
utils.strip_extensions(fastq[0], STRIP_EXTENSIONS))
fastq_r2_basename = os.path.basename(
utils.strip_extensions(fastq[1], STRIP_EXTENSIONS))
fastq_basename = fastq_r1_basename + fastq_r2_basename
bam_filename = align_pe(fastq, sai, reference, fastq_basename)
if int(fqlength) < 70:
sai = []
for fastq_file in fastq:
sai_filename = generate_sa(fastq_file, reference)
sai.append(sai_filename)
bam_filename = short_align_pe(fastq, sai, reference, fastq_basename)
else:
bam_filename = long_align_pe(fastq, reference, fastq_basename)
else:
fastq_basename = os.path.basename(
utils.strip_extensions(fastq[0], STRIP_EXTENSIONS))
bam_filename = align_se(fastq, sai, reference, fastq_basename)
bam_mapstats_filename = '%s.srt.bam.flagstat.qc' % (fastq_basename)
with open(bam_mapstats_filename, 'w') as out_file:
if int(fqlength) < 70:
sai = []
for fastq_file in fastq:
sai_filename = generate_sa(fastq_file, reference)
sai.append(sai_filename)
bam_filename = short_align_se(fastq, sai, reference, fastq_basename)
else:
bam_filename = long_align_se(fastq, reference, fastq_basename)
bam_mapstats_filename = '%s.flagstat.qc' % (fastq_basename)
with open(bam_mapstats_filename, 'w') as temp_file:
subprocess.check_call(
shlex.split("samtools flagstat %s" % (bam_filename)),
stdout=out_file)
stdout=temp_file)
# Remove sai files
for sai_file in sai:
os.remove(sai_file)
#Genome/Bad fastq File Check
file_check = open(bam_mapstats_filename).readlines()
percent = file_check[4].split('(')[1]
percent = percent.split('%')[0]
if float(percent) < 10:
raise Exception ('Mapped Genes too low: Check for correct Genotype')
# Remove sai files
if int(fqlength) < 70:
for sai_file in sai:
os.remove(sai_file)
if __name__ == '__main__':
main()
......@@ -7,17 +7,20 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../output/alignReads/'
@pytest.mark.integration
def test_map_reads_singleend():
#assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.srt.bam'))
#aligned_reads_report = test_output_path + 'ENCFF646LXU.srt.bam.flagstat.qc'
#samtools_report = open(aligned_reads_report).readlines()
#assert '80795025 + 0 in total' in samtools_report[0]
#assert '80050072 + 0 mapped (99.08% : N/A)' in samtools_report[4]
pass
@pytest.mark.pairedend_mouse
def test_align_reads_pairedend_mouse():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB122XDP/ENCLB122XDP.bam'))
aligned_reads_report = test_output_path + 'ENCLB122XDP/ENCLB122XDP.flagstat.qc'
samtools_report = open(aligned_reads_report).readlines()
assert '62618664 + 0 in total' in samtools_report[0]
assert '59858415 + 0 mapped (95.59% : N/A)' in samtools_report[4]
assert '55827837 + 0 properly paired (89.16% : N/A)' in samtools_report[8]
@pytest.mark.integration
def test_map_reads_pairedend():
# Do the same thing for paired end data
pass
@pytest.mark.singleend_human
def test_align_reads_singleend_human():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB969KTX/ENCLB969KTX.bam'))
aligned_reads_report = test_output_path + 'ENCLB969KTX/ENCLB969KTX.flagstat.qc'
samtools_report = open(aligned_reads_report).readlines()
assert '61046225 + 0 in total' in samtools_report[0]
assert '59599010 + 0 mapped (97.63% : N/A)' in samtools_report[4]
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment