From f08db8a9b0b43616353328425b67978ead0604d0 Mon Sep 17 00:00:00 2001 From: Holly Ruess <s185797@biohpcwsc012.biohpc.swmed.edu> Date: Thu, 19 Dec 2019 15:22:48 -0600 Subject: [PATCH] Fix convert reads --- CHANGELOG.md | 2 + README.md | 3 +- workflow/main.nf | 87 +++++++++------------- workflow/scripts/convert_reads.py | 104 ++++++++++----------------- workflow/tests/test_convert_reads.py | 15 ++-- 5 files changed, 80 insertions(+), 131 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 84794a0..b713d74 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ All notable changes to this project will be documented in this file. - Removed biosample, factor, treatment from design file - Updated documentation - Output graphics are in pdf format + - Not optional to remove chrM while converting to tagAlign + - Not optional to do a tn5 shift on tagAlign files ### Added - Changelog diff --git a/README.md b/README.md index 6a965f5..5390db4 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,7 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git 3. Map reads with BWA, filter with SamTools, and sort with Sambamba 4. Mark duplicates with Sambamba, Filter reads with SamTools, and calculate library complexity with SamTools and bedtools 5. QC metrics with deep tools + 6. Convert bam files to tagAlign files; remove chrM and adds tn5 shift ## Output Files @@ -64,8 +65,8 @@ filterReads | *.dedup.bam | filtered bam file with duplicate reads removed filterReads | *.dedup.bam.bai | indexed filtered bam file filterReads | *.dedup.flagstat.qc | QC metrics of filtered bam file (mapping stats, samtools) filterReads | *.dedup.pbc.qc | QC metrics of library complexity +convertReads | *.tagAlign.gz | bed alignent in BEDPE or BEDSE format experimentQC | coverage.pdf | plot to assess the sequencing depth of a given sample -experimentQC | *_fingerprint.pdf | plot to determine if the antibody-treatment enriched sufficiently experimentQC | heatmeap_SpearmanCorr.pdf | plot of Spearman correlation between samples experimentQC | heatmeap_PearsonCorr.pdf | plot of Pearson correlation between samples experimentQC | sample_mbs.npz | array of multiple BAM summaries diff --git a/workflow/main.nf b/workflow/main.nf index ab19e27..be55e1c 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -6,8 +6,6 @@ // Define Input variables params.reads = "$baseDir/../test_data/*.fastq.gz" params.pairedEnd = false -params.chrM = true -params.tn5 = true params.designFile = "$baseDir/../test_data/design_ENCSR265ZXX_SE.txt" params.genome = 'GRCh38' params.genomes = [] @@ -36,8 +34,6 @@ readsList = Channel // Define regular variables pairedEnd = params.pairedEnd -chrM = params.chrM -tn5 = params.tn5 designFile = params.designFile genomeSize = params.genomeSize chromSizes = params.chromSizes @@ -286,64 +282,47 @@ process experimentQC { // Convert reads to bam process convertReads { - tag "$sampleId-$replicate" - publishDir "$baseDir/output/${task.process}", mode: 'copy' + tag "${sampleId}-${replicate}" + publishDir "${outDir}/${task.process}", mode: 'copy' input: - - set sampleId, deduped, bai, experimentId, replicate from convertReads + set sampleId, deduped, bai, experimentId, replicate from convertReads output: - - set sampleId, file('*.tagAlign.gz'), experimentId, replicate into tagReads - + set sampleId, file('*.tagAlign.gz'), experimentId, replicate into tagReads + file('version_*.txt') into convertReadsVersions script: - - if (pairedEnd) { - if (tn5 && chrM){ - """ - python3 $baseDir/scripts/convert_reads.py -b $deduped -p -m -t - """ - } - else if (tn5){ - """ - python3 $baseDir/scripts/convert_reads.py -b $deduped -p -t - """ - } - else if (chrM){ - """ - python3 $baseDir/scripts/convert_reads.py -b $deduped -p -m - """ - } - else{ - """ - python3 $baseDir/scripts/convert_reads.py -b $deduped -p - """ - } - } - else { - if (tn5 && chrM){ - """ - python3 $baseDir/scripts/convert_reads.py -b $deduped -m -t - """ - } - else if (tn5){ - """ - python3 $baseDir/scripts/convert_reads.py -b $deduped -t - """ - } - else if (chrM){ - """ - python3 $baseDir/scripts/convert_reads.py -b $deduped -m - """ + if (params.astrocyte == true) { + if (pairedEnd) { + """ + module load python/3.6.1-2-anaconda + module load samtools/1.4.1 + module load bedtools/2.26.0 + python3 ${baseDir}/scripts/convert_reads.py -b ${deduped} -p + """ + } + else { + """ + module load python/3.6.1-2-anaconda + module load samtools/1.4.1 + module load bedtools/2.26.0 + python3 ${baseDir}/scripts/convert_reads.py -b ${deduped} + """ + } } - else{ - """ - python3 $baseDir/scripts/convert_reads.py -b $deduped - """ + else { + if (pairedEnd) { + """ + python3 ${baseDir}/scripts/convert_reads.py -b ${deduped} -p + """ + } + else { + """ + python3 ${baseDir}/scripts/convert_reads.py -b ${deduped} + """ + } } - } } diff --git a/workflow/scripts/convert_reads.py b/workflow/scripts/convert_reads.py index 6f94fb2..6497e84 100644 --- a/workflow/scripts/convert_reads.py +++ b/workflow/scripts/convert_reads.py @@ -40,16 +40,6 @@ def get_args(): default=False, action='store_true') - parser.add_argument('-m', '--chrm', - help="True/False if to remove ChrM.", - default=False, - action='store_true') - - parser.add_argument('-t', '--tn5', - help='Enable TN5 shifting for ATAC-Seq.', - default=False, - action='store_true') - args = parser.parse_args() return args @@ -64,6 +54,15 @@ def check_tools(): bedtools_path = shutil.which("bedtools") if bedtools_path: logger.info('Found bedtools: %s', bedtools_path) + + # Get Version + bedtools_version_command = "bedtools --version" + bedtools_version = subprocess.check_output(bedtools_version_command, shell=True) + + # Write to file + bedtools_file = open("version_bedtools.txt", "wb") + bedtools_file.write(bedtools_version) + bedtools_file.close() else: logger.error('Missing bedtools') raise Exception('Missing bedtools') @@ -71,31 +70,34 @@ def check_tools(): samtools_path = shutil.which("samtools") if samtools_path: logger.info('Found samtools: %s', samtools_path) + + # Get Version + samtools_version_command = "samtools --version" + samtools_version = subprocess.check_output(samtools_version_command, shell=True) + + # Write to file + samtools_file = open("version_samtools.txt", "wb") + samtools_file.write(samtools_version) + samtools_file.close() else: logger.error('Missing samtools') raise Exception('Missing samtools') -def convert_mapped(bam, tag_filename, chrm): +def convert_mapped(bam, tag_filename): '''Use bedtools to convert to tagAlign.''' - steps = [ + out, err = utils.run_pipe([ "bamToBed -i %s" % (bam), - r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""] - - if chrm: - steps.extend(["grep -v 'chrM'"]) - - steps.extend(["gzip -nc"]) - - out, err = utils.run_pipe(steps, outfile=tag_filename) + r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""", + "grep -v 'chrM'", + r"""awk 'BEGIN {OFS = "\t"}{ if ($6 == "+") {$2 = $2 + 4} else if ($6 == "-") {$3 = $3 - 5} print $0}'""", + "gzip -nc"], outfile=tag_filename) -def convert_mapped_pe(bam, bam_basename, tag_filename, chrm): +def convert_mapped_pe(bam, bam_basename, tag_filename): '''Use bedtools to convert to tagAlign PE data.''' - bedpe_filename = bam_basename + ".bedpe.gz" - # Name sort bam to make BEDPE nmsrt_bam_filename = bam_basename + ".nmsrt.bam" samtools_sort_command = \ @@ -105,42 +107,22 @@ def convert_mapped_pe(bam, bam_basename, tag_filename, chrm): logger.info(samtools_sort_command) subprocess.check_output(shlex.split(samtools_sort_command)) - steps = ["bamToBed -bedpe -mate1 -i %s" % (nmsrt_bam_filename)] - - steps.extend(["gzip -nc"]) + # Steps: 1) bamToBed, 2) remove mito, + # 3)make paired reads on separate lines, 4)tn5 shift, 5)compress + out, err = utils.run_pipe([ + "bamToBed -bedpe -mate1 -i %s" % (nmsrt_bam_filename), + "grep -v 'chrM'", + r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'""", + r"""awk 'BEGIN {OFS = "\t"}{ if ($6 == "+") {$2 = $2 + 4} else if ($6 == "-") {$3 = $3 - 5} print $0}'""", + "gzip -nc"], outfile=tag_filename) - out, err = utils.run_pipe(steps, outfile=bedpe_filename) os.remove(nmsrt_bam_filename) - if chrm: - steps.extend(["grep -v 'chrM'"]) - - # Convert read pairs to reads into standard tagAlign file - tag_steps = ["zcat -f %s" % (bedpe_filename)] - tag_steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""]) - if chrm: - tag_steps.extend(["grep -v 'chrM'"]) - tag_steps.extend(['gzip -cn']) - out, err = utils.run_pipe(tag_steps, outfile=tag_filename) - os.remove(bedpe_filename) - - -def tn5_shift_tagalign(tag_filename, shitfted_tag_filename): - '''Shift tagalign file for Tn5.''' - - steps = [ - "zcat -f %s" % (tag_filename), - r"""awk 'BEGIN {OFS = "\t"}{ if ($6 == "+") {$2 = $2 + 4} else if ($6 == "-") {$3 = $3 - 5} print $0}'""", - "gzip -nc"] - out, err = utils.run_pipe(steps, outfile=shitfted_tag_filename) - def main(): args = get_args() paired = args.paired bam = args.bam - chrm = args.chrm - tn5 = args.tn5 # Create a file handler handler = logging.FileHandler('convert_reads.log') @@ -149,28 +131,16 @@ def main(): # Check if tools are present check_tools() - # Convert PE or SE to tagAlign + # Get bam basename bam_basename = os.path.basename( - utils.strip_extensions(bam, ['.bam'])) + utils.strip_extensions(bam, ['.bam', '.dedup'])) tag_filename = bam_basename + '.tagAlign.gz' - shitfted_tag_filename = bam_basename + ".tn5.tagAlign.gz" - - if paired: # paired-end data - convert_mapped_pe(bam, bam_basename, tag_filename, chrm) + convert_mapped_pe(bam, bam_basename, tag_filename) else: - convert_mapped(bam, tag_filename, chrm) - - if tn5: - logger.info("TN5-shifting TAGALIGN...") - tn5_shift_tagalign(tag_filename, shitfted_tag_filename) - else: - tn5_shift = ["cp %s %s" % (tag_filename, shitfted_tag_filename)] - out, err = utils.run_pipe(tn5_shift) - - os.remove(tag_filename) + convert_mapped(bam, tag_filename) if __name__ == '__main__': main() diff --git a/workflow/tests/test_convert_reads.py b/workflow/tests/test_convert_reads.py index 5aa515f..e4bdf43 100644 --- a/workflow/tests/test_convert_reads.py +++ b/workflow/tests/test_convert_reads.py @@ -7,14 +7,11 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ '/../output/convertReads/' -@pytest.mark.integration -def test_convert_reads_singleend(): - assert os.path.exists(os.path.join(test_output_path, 'ENCFF115PAE.filt.nodup.tagAlign.gz')) - assert os.path.exists(os.path.join(test_output_path, 'ENCFF115PAE.filt.nodup.bedse.gz')) +@pytest.mark.singleend_human +def test_convert_reads_singleend_human(): + assert os.path.exists(os.path.join(test_output_path, 'ENCLB622FZX.tagAlign.gz')) -@pytest.mark.integration -def test_map_qc_pairedend(): - # Do the same thing for paired end data - # Also check that bedpe exists - pass +@pytest.mark.pairedend_mouse +def test_convert_reads_pairedend_mouse(): + assert os.path.exists(os.path.join(test_output_path, 'ENCLB749GLW.tagAlign.gz')) -- GitLab