diff --git a/CHANGELOG.md b/CHANGELOG.md index 84794a0b19763ea48cf1b318df133a2051ad67fd..b713d74390968295f6b3eb1370b26ba4461a3e60 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ All notable changes to this project will be documented in this file. - Removed biosample, factor, treatment from design file - Updated documentation - Output graphics are in pdf format + - Not optional to remove chrM while converting to tagAlign + - Not optional to do a tn5 shift on tagAlign files ### Added - Changelog diff --git a/README.md b/README.md index 6a965f5e744cb8f6278b38f92a9054c397909927..5390db419a3eed6b3e5f45e36a8a43aeee0afcab 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,7 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git 3. Map reads with BWA, filter with SamTools, and sort with Sambamba 4. Mark duplicates with Sambamba, Filter reads with SamTools, and calculate library complexity with SamTools and bedtools 5. QC metrics with deep tools + 6. Convert bam files to tagAlign files; remove chrM and adds tn5 shift ## Output Files @@ -64,8 +65,8 @@ filterReads | *.dedup.bam | filtered bam file with duplicate reads removed filterReads | *.dedup.bam.bai | indexed filtered bam file filterReads | *.dedup.flagstat.qc | QC metrics of filtered bam file (mapping stats, samtools) filterReads | *.dedup.pbc.qc | QC metrics of library complexity +convertReads | *.tagAlign.gz | bed alignent in BEDPE or BEDSE format experimentQC | coverage.pdf | plot to assess the sequencing depth of a given sample -experimentQC | *_fingerprint.pdf | plot to determine if the antibody-treatment enriched sufficiently experimentQC | heatmeap_SpearmanCorr.pdf | plot of Spearman correlation between samples experimentQC | heatmeap_PearsonCorr.pdf | plot of Pearson correlation between samples experimentQC | sample_mbs.npz | array of multiple BAM summaries diff --git a/workflow/main.nf b/workflow/main.nf index ab19e278d5edd1c310b5cf097b8bc6cc273f2a02..be55e1c48e89ab9a43bf599b09f3388cada2b4cb 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -6,8 +6,6 @@ // Define Input variables params.reads = "$baseDir/../test_data/*.fastq.gz" params.pairedEnd = false -params.chrM = true -params.tn5 = true params.designFile = "$baseDir/../test_data/design_ENCSR265ZXX_SE.txt" params.genome = 'GRCh38' params.genomes = [] @@ -36,8 +34,6 @@ readsList = Channel // Define regular variables pairedEnd = params.pairedEnd -chrM = params.chrM -tn5 = params.tn5 designFile = params.designFile genomeSize = params.genomeSize chromSizes = params.chromSizes @@ -286,64 +282,47 @@ process experimentQC { // Convert reads to bam process convertReads { - tag "$sampleId-$replicate" - publishDir "$baseDir/output/${task.process}", mode: 'copy' + tag "${sampleId}-${replicate}" + publishDir "${outDir}/${task.process}", mode: 'copy' input: - - set sampleId, deduped, bai, experimentId, replicate from convertReads + set sampleId, deduped, bai, experimentId, replicate from convertReads output: - - set sampleId, file('*.tagAlign.gz'), experimentId, replicate into tagReads - + set sampleId, file('*.tagAlign.gz'), experimentId, replicate into tagReads + file('version_*.txt') into convertReadsVersions script: - - if (pairedEnd) { - if (tn5 && chrM){ - """ - python3 $baseDir/scripts/convert_reads.py -b $deduped -p -m -t - """ - } - else if (tn5){ - """ - python3 $baseDir/scripts/convert_reads.py -b $deduped -p -t - """ - } - else if (chrM){ - """ - python3 $baseDir/scripts/convert_reads.py -b $deduped -p -m - """ - } - else{ - """ - python3 $baseDir/scripts/convert_reads.py -b $deduped -p - """ - } - } - else { - if (tn5 && chrM){ - """ - python3 $baseDir/scripts/convert_reads.py -b $deduped -m -t - """ - } - else if (tn5){ - """ - python3 $baseDir/scripts/convert_reads.py -b $deduped -t - """ - } - else if (chrM){ - """ - python3 $baseDir/scripts/convert_reads.py -b $deduped -m - """ + if (params.astrocyte == true) { + if (pairedEnd) { + """ + module load python/3.6.1-2-anaconda + module load samtools/1.4.1 + module load bedtools/2.26.0 + python3 ${baseDir}/scripts/convert_reads.py -b ${deduped} -p + """ + } + else { + """ + module load python/3.6.1-2-anaconda + module load samtools/1.4.1 + module load bedtools/2.26.0 + python3 ${baseDir}/scripts/convert_reads.py -b ${deduped} + """ + } } - else{ - """ - python3 $baseDir/scripts/convert_reads.py -b $deduped - """ + else { + if (pairedEnd) { + """ + python3 ${baseDir}/scripts/convert_reads.py -b ${deduped} -p + """ + } + else { + """ + python3 ${baseDir}/scripts/convert_reads.py -b ${deduped} + """ + } } - } } diff --git a/workflow/scripts/convert_reads.py b/workflow/scripts/convert_reads.py index 6f94fb256a9c979f33be06f69a5ff0f231a84384..6497e84cf3c63d9a94dbd2cdb91d04ebf5c1691b 100644 --- a/workflow/scripts/convert_reads.py +++ b/workflow/scripts/convert_reads.py @@ -40,16 +40,6 @@ def get_args(): default=False, action='store_true') - parser.add_argument('-m', '--chrm', - help="True/False if to remove ChrM.", - default=False, - action='store_true') - - parser.add_argument('-t', '--tn5', - help='Enable TN5 shifting for ATAC-Seq.', - default=False, - action='store_true') - args = parser.parse_args() return args @@ -64,6 +54,15 @@ def check_tools(): bedtools_path = shutil.which("bedtools") if bedtools_path: logger.info('Found bedtools: %s', bedtools_path) + + # Get Version + bedtools_version_command = "bedtools --version" + bedtools_version = subprocess.check_output(bedtools_version_command, shell=True) + + # Write to file + bedtools_file = open("version_bedtools.txt", "wb") + bedtools_file.write(bedtools_version) + bedtools_file.close() else: logger.error('Missing bedtools') raise Exception('Missing bedtools') @@ -71,31 +70,34 @@ def check_tools(): samtools_path = shutil.which("samtools") if samtools_path: logger.info('Found samtools: %s', samtools_path) + + # Get Version + samtools_version_command = "samtools --version" + samtools_version = subprocess.check_output(samtools_version_command, shell=True) + + # Write to file + samtools_file = open("version_samtools.txt", "wb") + samtools_file.write(samtools_version) + samtools_file.close() else: logger.error('Missing samtools') raise Exception('Missing samtools') -def convert_mapped(bam, tag_filename, chrm): +def convert_mapped(bam, tag_filename): '''Use bedtools to convert to tagAlign.''' - steps = [ + out, err = utils.run_pipe([ "bamToBed -i %s" % (bam), - r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""] - - if chrm: - steps.extend(["grep -v 'chrM'"]) - - steps.extend(["gzip -nc"]) - - out, err = utils.run_pipe(steps, outfile=tag_filename) + r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""", + "grep -v 'chrM'", + r"""awk 'BEGIN {OFS = "\t"}{ if ($6 == "+") {$2 = $2 + 4} else if ($6 == "-") {$3 = $3 - 5} print $0}'""", + "gzip -nc"], outfile=tag_filename) -def convert_mapped_pe(bam, bam_basename, tag_filename, chrm): +def convert_mapped_pe(bam, bam_basename, tag_filename): '''Use bedtools to convert to tagAlign PE data.''' - bedpe_filename = bam_basename + ".bedpe.gz" - # Name sort bam to make BEDPE nmsrt_bam_filename = bam_basename + ".nmsrt.bam" samtools_sort_command = \ @@ -105,42 +107,22 @@ def convert_mapped_pe(bam, bam_basename, tag_filename, chrm): logger.info(samtools_sort_command) subprocess.check_output(shlex.split(samtools_sort_command)) - steps = ["bamToBed -bedpe -mate1 -i %s" % (nmsrt_bam_filename)] - - steps.extend(["gzip -nc"]) + # Steps: 1) bamToBed, 2) remove mito, + # 3)make paired reads on separate lines, 4)tn5 shift, 5)compress + out, err = utils.run_pipe([ + "bamToBed -bedpe -mate1 -i %s" % (nmsrt_bam_filename), + "grep -v 'chrM'", + r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'""", + r"""awk 'BEGIN {OFS = "\t"}{ if ($6 == "+") {$2 = $2 + 4} else if ($6 == "-") {$3 = $3 - 5} print $0}'""", + "gzip -nc"], outfile=tag_filename) - out, err = utils.run_pipe(steps, outfile=bedpe_filename) os.remove(nmsrt_bam_filename) - if chrm: - steps.extend(["grep -v 'chrM'"]) - - # Convert read pairs to reads into standard tagAlign file - tag_steps = ["zcat -f %s" % (bedpe_filename)] - tag_steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""]) - if chrm: - tag_steps.extend(["grep -v 'chrM'"]) - tag_steps.extend(['gzip -cn']) - out, err = utils.run_pipe(tag_steps, outfile=tag_filename) - os.remove(bedpe_filename) - - -def tn5_shift_tagalign(tag_filename, shitfted_tag_filename): - '''Shift tagalign file for Tn5.''' - - steps = [ - "zcat -f %s" % (tag_filename), - r"""awk 'BEGIN {OFS = "\t"}{ if ($6 == "+") {$2 = $2 + 4} else if ($6 == "-") {$3 = $3 - 5} print $0}'""", - "gzip -nc"] - out, err = utils.run_pipe(steps, outfile=shitfted_tag_filename) - def main(): args = get_args() paired = args.paired bam = args.bam - chrm = args.chrm - tn5 = args.tn5 # Create a file handler handler = logging.FileHandler('convert_reads.log') @@ -149,28 +131,16 @@ def main(): # Check if tools are present check_tools() - # Convert PE or SE to tagAlign + # Get bam basename bam_basename = os.path.basename( - utils.strip_extensions(bam, ['.bam'])) + utils.strip_extensions(bam, ['.bam', '.dedup'])) tag_filename = bam_basename + '.tagAlign.gz' - shitfted_tag_filename = bam_basename + ".tn5.tagAlign.gz" - - if paired: # paired-end data - convert_mapped_pe(bam, bam_basename, tag_filename, chrm) + convert_mapped_pe(bam, bam_basename, tag_filename) else: - convert_mapped(bam, tag_filename, chrm) - - if tn5: - logger.info("TN5-shifting TAGALIGN...") - tn5_shift_tagalign(tag_filename, shitfted_tag_filename) - else: - tn5_shift = ["cp %s %s" % (tag_filename, shitfted_tag_filename)] - out, err = utils.run_pipe(tn5_shift) - - os.remove(tag_filename) + convert_mapped(bam, tag_filename) if __name__ == '__main__': main() diff --git a/workflow/tests/test_convert_reads.py b/workflow/tests/test_convert_reads.py index 5aa515f1ec470641431889f66abd1fd6010e54e6..e4bdf43f011c13fec170fe9330fdb98c4636a476 100644 --- a/workflow/tests/test_convert_reads.py +++ b/workflow/tests/test_convert_reads.py @@ -7,14 +7,11 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ '/../output/convertReads/' -@pytest.mark.integration -def test_convert_reads_singleend(): - assert os.path.exists(os.path.join(test_output_path, 'ENCFF115PAE.filt.nodup.tagAlign.gz')) - assert os.path.exists(os.path.join(test_output_path, 'ENCFF115PAE.filt.nodup.bedse.gz')) +@pytest.mark.singleend_human +def test_convert_reads_singleend_human(): + assert os.path.exists(os.path.join(test_output_path, 'ENCLB622FZX.tagAlign.gz')) -@pytest.mark.integration -def test_map_qc_pairedend(): - # Do the same thing for paired end data - # Also check that bedpe exists - pass +@pytest.mark.pairedend_mouse +def test_convert_reads_pairedend_mouse(): + assert os.path.exists(os.path.join(test_output_path, 'ENCLB749GLW.tagAlign.gz'))