Newer
Older
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Align reads to reference genome.'''
import os
import subprocess
import argparse
import shutil
import logging
from multiprocessing import cpu_count
import utils
EPILOG = '''
For more details:
%(prog)s --help
'''
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.gz', '.fq', '.fastq', '_trimmed']
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-f', '--fastq',
help="The fastq file to run triming on.",
nargs='+',
required=True)
parser.add_argument('-r', '--reference',
help="The bwa index of the reference genome.",
required=True)
parser.add_argument('-s', '--sample',
help="The name of the sample.",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
action='store_true')
args = parser.parse_args()
return args
def check_tools():
'''Checks for required componenets on user system'''
logger.info('Checking for required libraries and components on this system')
bwa_path = shutil.which("bwa")
logger.info('Found bwa: %s', bwa_path)
# Get Version
bwa_version_command = "bwa"
try:
subprocess.check_output(bwa_version_command, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
bwa_version = e.output
# Write to file
bwa_file.write(bwa_version)
bwa_file.close()
else:
logger.error('Missing bwa')
raise Exception('Missing bwa')
samtools_path = shutil.which("samtools")
if samtools_path:
logger.info('Found samtools: %s', samtools_path)
# Get Version
samtools_version_command = "samtools --version"
samtools_version = subprocess.check_output(samtools_version_command, shell=True)
# Write to file
samtools_file = open("version_samtools.txt", "wb")
samtools_file.write(samtools_version)
samtools_file.close()
else:
logger.error('Missing samtools')
raise Exception('Missing samtools')
def generate_sa(fastq, reference):
'''Use BWA to generate Suffix Arrays.'''
fastq_basename = os.path.basename(utils.strip_extensions(fastq, STRIP_EXTENSIONS))
bwa_aln_params = '-q 5 -l 32 -k 2'
sai = '%s.sai' % (fastq_basename)
with open(sai, 'w') as sai_file:
% (bwa_aln_params, cpu_count(),
logger.info("Running bwa with %s", bwa_command)
subprocess.check_call(shlex.split(bwa_command), stdout=sai_file)
return sai
def align_se(fastq, sai, reference, fastq_basename):
'''Use BWA to align SE data.'''
bam_filename = '%s.bam' % (fastq_basename)
steps = [
"bwa samse %s %s %s"
"samtools view -@%d -Su -" % (cpu_count()),
"samtools sort -@%d -o %s"
out, err = utils.run_pipe(steps)
if err:
def align_pe(fastq, sai, reference, fastq_basename):
'''Use BWA to align PE data.'''
sam_filename = "%s.sam" % (fastq_basename)
badcigar_filename = "%s.badReads" % (fastq_basename)
bam_filename = '%s.bam' % (fastq_basename)
# Remove read pairs with bad CIGAR strings and sort by position
steps = [
"bwa sampe -P %s %s %s %s %s"
% (reference, sai[0], sai[1],
fastq[0], fastq[1]),
"tee %s" % (sam_filename),
r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
"sort",
"uniq"]
out, err = utils.run_pipe(steps, badcigar_filename)
if err:
steps = [
"cat %s" % (sam_filename),
"grep -v -F -f %s" % (badcigar_filename),
"samtools view -@%d -Su -" % (cpu_count()),
"samtools sort -@%d -o %s"
out, err = utils.run_pipe(steps)
if err:
def main():
args = get_args()
paired = args.paired
fastq = args.fastq
reference = args.reference
# Create a file handler
handler = logging.FileHandler('map.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Run Suffix Array generation
sai = []
for fastq_file in fastq:
sai_filename = generate_sa(fastq_file, reference)
sai.append(sai_filename)
# Make file basename
fastq_basename = sample
# Run alignment for either PE or SE
bam_filename = align_pe(fastq, sai, reference, fastq_basename)
bam_filename = align_se(fastq, sai, reference, fastq_basename)
bam_mapstats_filename = '%s.flagstat.qc' % (fastq_basename)
subprocess.check_call(
shlex.split("samtools flagstat %s" % (bam_filename)),
#Genome/Bad fastq File Check
file_check = open(bam_mapstats_filename).readlines()
percent = file_check[4].split('(')[1]
percent = percent.split('%')[0]
if float(percent) < 10:
raise Exception ('Mapped Genes too low: Check for correct Genotype')
# Remove sai files
for sai_file in sai:
os.remove(sai_file)
if __name__ == '__main__':
main()