diff --git a/CHANGELOG.md b/CHANGELOG.md index e09bcd3010aa0c4c2e5bc55f8b30f3d8e666f19f..3f3f686151c0b472366f34aeef106d1edd434494 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ All notable changes to this project will be documented in this file. - Added new CI/CD tests for better coverage - Added punctuation check in design file - Added sequence (fastq1) length into design file for better mapping + - Raw fastq1 sequence length determines mapper ## [publish_1.0.0 ] - 2019-12-03 Initial release of pipeline diff --git a/README.md b/README.md index 50ef590a5b9df27330d7ac509fda029d7edcb26f..74a4bbc6be332a96259220d4fc3bb1691fe169e3 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,7 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git + There are ??? steps to the pipeline 1. Check input files 2. Trim adaptors with TrimGalore! + 3. Map reads with BWA, filter with SamTools, and sort with Sambamba ## Output Files diff --git a/astrocyte_pkg.yml b/astrocyte_pkg.yml index 5194f24d84a89965fdbfdcf9c464050d00b6ce8d..8ec854f84cf588e37337496a2a5e8a60711a3418 100644 --- a/astrocyte_pkg.yml +++ b/astrocyte_pkg.yml @@ -9,9 +9,9 @@ # A unique identifier for the workflow package, text/underscores only name: 'atacseq_analysis_bicf' # Who wrote this? -author: 'Venkat Malladi' +author: 'Holly Ruess and Venkat Malladi' # A contact email address for questions -email: 'bicfp@utsouthwestern.edu' +email: 'bicf@utsouthwestern.edu' # A more informative title for the workflow package title: 'BICF ATAC-seq Analysis Workflow' # A summary of the workflow package in plain text diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index a2853f0c7a1d0d7468c05e0935b1186ec3215e26..7f124276b72f6fdce7e315641d5cc255ed04e343 100644 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -12,7 +12,7 @@ process { cpus = 32 } $alignReads{ - module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.12', 'samtools/intel/1.3'] + module = ['python/3.6.1-2-anaconda', 'bwa/intel/0.7.12', 'samtools/intel/1.3', 'sambamba/0.6.6'] queue = '128GB,256GB,256GBv1' } $filterReads{ diff --git a/workflow/main.nf b/workflow/main.nf index c1a07fb0885a8192388d7e163399b9c964d89aff..7c0298135160c4b70b19b2d227bf8842679eec8f 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -146,32 +146,51 @@ process trimReads { // Align trimmed reads using bwa process alignReads { - tag "$sampleId-$replicate" - publishDir "$baseDir/output/${task.process}", mode: 'copy' + tag "${sampleId}-${replicate}" + publishDir "${outDir}/${task.process}/${sampleId}", mode: 'copy' input: - - set sampleId, reads, experimentId, replicate from trimmedReads - file index from bwaIndex.first() + set sampleId, reads, experimentId, replicate, fqLength from trimmedReads + file index from bwaIndex.first() output: - - set sampleId, file('*.bam'), experimentId, replicate into mappedReads - file '*.srt.bam.flagstat.qc' into mappedReadsStats + set sampleId, file('*.bam'), experimentId, replicate into mappedReads + file '*.flagstat.qc' into mappedReadsStats script: - - if (pairedEnd) { - """ - python3 $baseDir/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -p - """ - } - else { - """ - python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa - """ - } + if (params.astrocyte == true) { + if (pairedEnd) { + """ + module load python/3.6.1-2-anaconda + module load bwa/intel/0.7.12 + module load samtools/intel/1.3 + module load sambamba/0.6.6 + python3 ${baseDir}/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -s $sampleId -l ${fqLength} -p + """ + } + else { + """ + module load python/3.6.1-2-anaconda + module load bwa/intel/0.7.12 + module load samtools/intel/1.3 + module load sambamba/0.6.6 + python3 ${baseDir}/scripts/map_reads.py -f ${reads} -r ${index}/genome.fa -s $sampleId -l ${fqLength} + """ + } + } + else { + if (pairedEnd) { + """ + python3 ${baseDir}/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -s $sampleId -l ${fqLength} -p + """ + } + else { + """ + python3 ${baseDir}/scripts/map_reads.py -f ${reads} -r ${index}/genome.fa -s $sampleId -l ${fqLength} + """ + } + } } diff --git a/workflow/scripts/map_reads.py b/workflow/scripts/map_reads.py index dee86d53913f85dd5fbdd0b813820ebf93d61b33..9fc6fa71a9b9d08b58dc635a07ab2eb7cb0eb9d6 100644 --- a/workflow/scripts/map_reads.py +++ b/workflow/scripts/map_reads.py @@ -9,6 +9,7 @@ import shutil import shlex import logging from multiprocessing import cpu_count +from psutil import virtual_memory import utils EPILOG = ''' @@ -46,6 +47,14 @@ def get_args(): help="The bwa index of the reference genome.", required=True) + parser.add_argument('-s', '--sample', + help="The name of the sample.", + required=True) + + parser.add_argument('-l', '--fqlength', + help="The length of untrimmed reads.", + required=True) + parser.add_argument('-p', '--paired', help="True/False if paired-end or single end.", default=False, @@ -56,8 +65,6 @@ def get_args(): # Functions - - def check_tools(): '''Checks for required componenets on user system''' @@ -66,6 +73,18 @@ def check_tools(): bwa_path = shutil.which("bwa") if bwa_path: logger.info('Found bwa: %s', bwa_path) + + # Get Version + bwa_version_command = "bwa" + try: + subprocess.check_output(bwa_version_command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + bwa_version = e.output + + # Write to file + bwa_file = open("version_bwa.txt", "wb") + bwa_file.write(bwa_version) + bwa_file.close() else: logger.error('Missing bwa') raise Exception('Missing bwa') @@ -73,10 +92,38 @@ def check_tools(): samtools_path = shutil.which("samtools") if samtools_path: logger.info('Found samtools: %s', samtools_path) + + # Get Version + samtools_version_command = "samtools --version" + samtools_version = subprocess.check_output(samtools_version_command, shell=True) + + # Write to file + samtools_file = open("version_samtools.txt", "wb") + samtools_file.write(samtools_version) + samtools_file.close() else: logger.error('Missing samtools') raise Exception('Missing samtools') + sambamba_path = shutil.which("sambamba") + if sambamba_path: + logger.info('Found sambamba: %s', sambamba_path) + + # Get Version + sambamba_version_command = "sambamba" + try: + subprocess.check_output(sambamba_version_command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + sambamba_version = e.output + + # Write to file + sambamba_file = open("version_sambamba.txt", "wb") + sambamba_file.write(sambamba_version) + sambamba_file.close() + else: + logger.error('Missing sambamba') + raise Exception('Missing sambamba') + def generate_sa(fastq, reference): '''Use BWA to generate Suffix Arrays.''' @@ -97,31 +144,55 @@ def generate_sa(fastq, reference): return sai -def align_se(fastq, sai, reference, fastq_basename): - '''Use BWA to align SE data.''' +def short_align_se(fastq, sai, reference, fastq_basename): + '''Use BWA samse to align short SE data.''' - bam_filename = '%s.srt.bam' % (fastq_basename) + bam_filename = '%s.bam' % (fastq_basename) steps = [ "bwa samse %s %s %s" % (reference, sai[0], fastq[0]), "samtools view -@%d -Su -" % (cpu_count()), - "samtools sort -@%d -o %s" - % (cpu_count(), bam_filename)] + "sambamba sort -t %d -m %dKB -o %s /dev/stdin" + % (cpu_count(), virtual_memory()[1], bam_filename), + ] out, err = utils.run_pipe(steps) + logger.info("Running bwa alignment with %s", steps) + if err: logger.error("samse/samtools error: %s", err) return bam_filename -def align_pe(fastq, sai, reference, fastq_basename): - '''Use BWA to align PE data.''' +def long_align_se(fastq, reference, fastq_basename): + '''Use BWA mem to align long SE data.''' + + bam_filename = '%s.bam' % (fastq_basename) + + steps = [ + "bwa mem %s %s" + % (reference, fastq[0]), + "samtools view -@%d -Su -" % (cpu_count()), + "sambamba sort -t %d -m %dKB -o %s /dev/stdin" + % (cpu_count(), virtual_memory()[1], bam_filename)] + + out, err = utils.run_pipe(steps) + logger.info("Running bwa alignment with %s", steps) + + if err: + logger.error("mem/samtools error: %s", err) + + return bam_filename + + +def short_align_pe(fastq, sai, reference, fastq_basename): + '''Use BWA samse to align short PE data.''' sam_filename = "%s.sam" % (fastq_basename) badcigar_filename = "%s.badReads" % (fastq_basename) - bam_filename = '%s.srt.bam' % (fastq_basename) + bam_filename = '%s.bam' % (fastq_basename) # Remove read pairs with bad CIGAR strings and sort by position steps = [ @@ -131,9 +202,12 @@ def align_pe(fastq, sai, reference, fastq_basename): "tee %s" % (sam_filename), r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""", "sort", - "uniq"] + "uniq", + ] out, err = utils.run_pipe(steps, badcigar_filename) + logger.info("Running bwa alignment with %s", steps) + if err: logger.error("sampe error: %s", err) @@ -141,21 +215,47 @@ def align_pe(fastq, sai, reference, fastq_basename): "cat %s" % (sam_filename), "grep -v -F -f %s" % (badcigar_filename), "samtools view -@%d -Su -" % (cpu_count()), - "samtools sort -@%d -o %s" - % (cpu_count(), bam_filename)] + "sambamba sort -t %d -m %dKB -o %s /dev/stdin" + % (cpu_count(), virtual_memory()[1], bam_filename), + ] out, err = utils.run_pipe(steps) + logger.info("Running bwa alignment with %s", steps) + if err: logger.error("samtools error: %s", err) return bam_filename +def long_align_pe(fastq, reference, fastq_basename): + '''Use BWA mem to align long PE data.''' + + bam_filename = '%s.bam' % (fastq_basename) + + steps = [ + "bwa mem %s %s %s" + % (reference, fastq[0], fastq[1]), + "samtools view -@%d -Su -" % (cpu_count()), + "sambamba sort -t %d -m %dKB -o %s /dev/stdin" + % (cpu_count(), virtual_memory()[1], bam_filename)] + + out, err = utils.run_pipe(steps) + logger.info("Running bwa alignment with %s", steps) + + if err: + logger.error("mem/samtools error: %s", err) + + return bam_filename + + def main(): args = get_args() paired = args.paired fastq = args.fastq reference = args.reference + sample = args.sample + fqlength = args.fqlength # Create a file handler handler = logging.FileHandler('map.log') @@ -164,38 +264,47 @@ def main(): # Check if tools are present check_tools() - # Run Suffix Array generation - sai = [] - for fq in fastq: - sai_filename = generate_sa(fq, reference) - sai.append(sai_filename) + # Make file basename + fastq_basename = sample # Run alignment for either PE or SE if paired: # paired-end data - fastq_r1_basename = os.path.basename( - utils.strip_extensions(fastq[0], STRIP_EXTENSIONS)) - fastq_r2_basename = os.path.basename( - utils.strip_extensions(fastq[1], STRIP_EXTENSIONS)) - fastq_basename = fastq_r1_basename + fastq_r2_basename - - bam_filename = align_pe(fastq, sai, reference, fastq_basename) + if int(fqlength) < 70: + sai = [] + for fastq_file in fastq: + sai_filename = generate_sa(fastq_file, reference) + sai.append(sai_filename) + bam_filename = short_align_pe(fastq, sai, reference, fastq_basename) + else: + bam_filename = long_align_pe(fastq, reference, fastq_basename) else: - fastq_basename = os.path.basename( - utils.strip_extensions(fastq[0], STRIP_EXTENSIONS)) - - bam_filename = align_se(fastq, sai, reference, fastq_basename) - - bam_mapstats_filename = '%s.srt.bam.flagstat.qc' % (fastq_basename) - with open(bam_mapstats_filename, 'w') as out_file: + if int(fqlength) < 70: + sai = [] + for fastq_file in fastq: + sai_filename = generate_sa(fastq_file, reference) + sai.append(sai_filename) + bam_filename = short_align_se(fastq, sai, reference, fastq_basename) + else: + bam_filename = long_align_se(fastq, reference, fastq_basename) + + bam_mapstats_filename = '%s.flagstat.qc' % (fastq_basename) + with open(bam_mapstats_filename, 'w') as temp_file: subprocess.check_call( shlex.split("samtools flagstat %s" % (bam_filename)), - stdout=out_file) + stdout=temp_file) - # Remove sai files - for sai_file in sai: - os.remove(sai_file) + #Genome/Bad fastq File Check + file_check = open(bam_mapstats_filename).readlines() + percent = file_check[4].split('(')[1] + percent = percent.split('%')[0] + if float(percent) < 10: + raise Exception ('Mapped Genes too low: Check for correct Genotype') + # Remove sai files + if int(fqlength) < 70: + for sai_file in sai: + os.remove(sai_file) if __name__ == '__main__': main() diff --git a/workflow/tests/test_map_reads.py b/workflow/tests/test_map_reads.py index dc0112d2c8fb88c871d0451c92203aa3811ecea4..37eeae763cbdf4825a0d36ab8deea24d9bce4ccc 100644 --- a/workflow/tests/test_map_reads.py +++ b/workflow/tests/test_map_reads.py @@ -7,17 +7,20 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ '/../output/alignReads/' -@pytest.mark.integration -def test_map_reads_singleend(): - #assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.srt.bam')) - #aligned_reads_report = test_output_path + 'ENCFF646LXU.srt.bam.flagstat.qc' - #samtools_report = open(aligned_reads_report).readlines() - #assert '80795025 + 0 in total' in samtools_report[0] - #assert '80050072 + 0 mapped (99.08% : N/A)' in samtools_report[4] - pass +@pytest.mark.pairedend_mouse +def test_align_reads_pairedend_mouse(): + assert os.path.exists(os.path.join(test_output_path, 'ENCLB122XDP/ENCLB122XDP.bam')) + aligned_reads_report = test_output_path + 'ENCLB122XDP/ENCLB122XDP.flagstat.qc' + samtools_report = open(aligned_reads_report).readlines() + assert '62618664 + 0 in total' in samtools_report[0] + assert '59858415 + 0 mapped (95.59% : N/A)' in samtools_report[4] + assert '55827837 + 0 properly paired (89.16% : N/A)' in samtools_report[8] -@pytest.mark.integration -def test_map_reads_pairedend(): - # Do the same thing for paired end data - pass +@pytest.mark.singleend_human +def test_align_reads_singleend_human(): + assert os.path.exists(os.path.join(test_output_path, 'ENCLB969KTX/ENCLB969KTX.bam')) + aligned_reads_report = test_output_path + 'ENCLB969KTX/ENCLB969KTX.flagstat.qc' + samtools_report = open(aligned_reads_report).readlines() + assert '61046225 + 0 in total' in samtools_report[0] + assert '59599010 + 0 mapped (97.63% : N/A)' in samtools_report[4]