From 4cadae4e4053bfda8b1dcecf0c956d32ee089f55 Mon Sep 17 00:00:00 2001 From: Holly Ruess <s185797@biohpcwsc012.biohpc.swmed.edu> Date: Tue, 17 Dec 2019 11:59:50 -0600 Subject: [PATCH] fix filter reads --- README.md | 13 +++++-- workflow/main.nf | 66 +++++++++++++++++++++++------------ workflow/scripts/map_qc.py | 51 ++++++++++++++++++++++----- workflow/tests/test_map_qc.py | 62 +++++++++++++++++++++----------- 4 files changed, 139 insertions(+), 53 deletions(-) diff --git a/README.md b/README.md index 74a4bbc..1e206ec 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Astrocyte ATAC-seq analysis Workflow Package -[](https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/commits/master) -[](https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/commits/master) +[](https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/commits/master) +[](https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/commits/master) [](https://www.nextflow.io/) [](https://astrocyte-test.biohpc.swmed.edu/static/docs/index.html) @@ -47,6 +47,7 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git 1. Check input files 2. Trim adaptors with TrimGalore! 3. Map reads with BWA, filter with SamTools, and sort with Sambamba + 4. 
Mark duplicates with Sambamba, Filter reads with SamTools, and calculate library complexity with SamTools and bedtools ## Output Files @@ -55,6 +56,14 @@ Folder | File | Description design | N/A | Inputs used for analysis; can ignore trimReads | *_trimming_report.txt | report detailing how many reads were trimmed trimReads | *_trimmed.fq.gz | trimmed fastq files used for analysis +alignReads | *.flagstat.qc | QC metrics from the mapping process +alignReads | *.bam | sorted bam file +filterReads | *.dedup.qc | QC metrics of find duplicate reads (sambamba) +filterReads | *.dedup.bam | filtered bam file with duplicate reads removed +filterReads | *.dedup.bam.bai | indexed filtered bam file +filterReads | *.dedup.flagstat.qc | QC metrics of filtered bam file (mapping stats, samtools) +filterReads | *.dedup.pbc.qc | QC metrics of library complexity + ## Common Errors diff --git a/workflow/main.nf b/workflow/main.nf index 7c02981..379169f 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -156,7 +156,7 @@ process alignReads { output: set sampleId, file('*.bam'), experimentId, replicate into mappedReads file '*.flagstat.qc' into mappedReadsStats - + file('version_*.txt') into alignReadsVersions script: if (params.astrocyte == true) { @@ -197,34 +197,54 @@ process alignReads { // Dedup reads using sambamba process filterReads { - tag "$sampleId-$replicate" - publishDir "$baseDir/output/${task.process}", mode: 'copy' + tag "${sampleId}-${replicate}" + publishDir "${outDir}/${task.process}/${sampleId}", mode: 'copy' input: - - set sampleId, mapped, experimentId, replicate from mappedReads + set sampleId, mapped, experimentId, replicate from mappedReads output: - - set sampleId, file('*.bam'), file('*.bai'), experimentId, replicate into dedupReads - set sampleId, file('*.bam'), file('*.bai'), experimentId, replicate into tssEnrich - set sampleId, file('*.bam'), file('*.bai'), experimentId, replicate into convertReads - file '*flagstat.qc' into dedupReadsStats - file 
'*pbc.qc' into dedupReadsComplexity - file '*dup.qc' into dupReads + set sampleId, file('*.bam'), file('*.bai'), experimentId, replicate into dedupReads + set sampleId, file('*.bam'), file('*.bai'), experimentId, replicate into tssEnrich + set sampleId, file('*.bam'), file('*.bai'), experimentId, replicate into convertReads + file '*flagstat.qc' into dedupReadsStats + file '*pbc.qc' into dedupReadsComplexity + file '*dup.qc' into dupReads + file('version_*.txt') into filterReadsVersions script: - - if (pairedEnd) { - """ - python3 $baseDir/scripts/map_qc.py -b $mapped -p - """ - } - else { - """ - python3 $baseDir/scripts/map_qc.py -b $mapped - """ - } + if (params.astrocyte == true) { + if (pairedEnd) { + """ + module load python/3.6.1-2-anaconda + module load samtools/1.4.1 + module load sambamba/0.6.6 + module load bedtools/2.26.0 + python3 ${baseDir}/scripts/map_qc.py -b ${mapped} -p + """ + } + else { + """ + module load python/3.6.1-2-anaconda + module load samtools/1.4.1 + module load sambamba/0.6.6 + module load bedtools/2.26.0 + python3 ${baseDir}/scripts/map_qc.py -b ${mapped} + """ + } + } + else { + if (pairedEnd) { + """ + python3 ${baseDir}/scripts/map_qc.py -b ${mapped} -p + """ + } + else { + """ + python3 ${baseDir}/scripts/map_qc.py -b ${mapped} + """ + } + } } diff --git a/workflow/scripts/map_qc.py b/workflow/scripts/map_qc.py index d58ae8f..af12e4d 100644 --- a/workflow/scripts/map_qc.py +++ b/workflow/scripts/map_qc.py @@ -12,7 +12,6 @@ from multiprocessing import cpu_count import pandas as pd import utils - EPILOG = ''' For more details: %(prog)s --help @@ -63,6 +62,15 @@ def check_tools(): samtools_path = shutil.which("samtools") if samtools_path: logger.info('Found samtools: %s', samtools_path) + + # Get Version + samtools_version_command = "samtools --version" + samtools_version = subprocess.check_output(samtools_version_command, shell=True) + + # Write to file + samtools_file = open("version_samtools.txt", "wb") + 
samtools_file.write(samtools_version) + samtools_file.close() else: logger.error('Missing samtools') raise Exception('Missing samtools') @@ -70,6 +78,18 @@ def check_tools(): sambamba_path = shutil.which("sambamba") if sambamba_path: logger.info('Found sambamba: %s', sambamba_path) + + # Get Version + sambamba_version_command = "sambamba" + try: + subprocess.check_output(sambamba_version_command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + sambamba_version = e.output + + # Write to file + sambamba_file = open("version_sambamba.txt", "wb") + sambamba_file.write(sambamba_version) + sambamba_file.close() else: logger.error('Missing sambamba') raise Exception('Missing sambamba') @@ -77,6 +97,15 @@ def check_tools(): bedtools_path = shutil.which("bedtools") if bedtools_path: logger.info('Found bedtools: %s', bedtools_path) + + # Get Version + bedtools_version_command = "bedtools --version" + bedtools_version = subprocess.check_output(bedtools_version_command, shell=True) + + # Write to file + bedtools_file = open("version_bedtools.txt", "wb") + bedtools_file.write(bedtools_version) + bedtools_file.close() else: logger.error('Missing bedtools') raise Exception('Missing bedtools') @@ -168,9 +197,8 @@ def dedup_mapped(bam, bam_basename, paired): shlex.split(sambamba_markdup_command), stderr=dup_qc) - # Remove duplicates - final_bam_prefix = bam_basename + ".filt.nodup" + final_bam_prefix = bam_basename + ".dedup" final_bam_filename = final_bam_prefix + ".bam" if paired: # paired-end data @@ -207,7 +235,7 @@ def dedup_mapped(bam, bam_basename, paired): def compute_complexity(bam, paired, bam_basename): '''Calculate library complexity .''' - pbc_file_qc_filename = bam_basename + ".filt.nodup.pbc.qc" + pbc_file_qc_filename = bam_basename + ".pbc.qc" tmp_pbc_file_qc_filename = "tmp.%s" % (pbc_file_qc_filename) # Sort by name @@ -230,17 +258,20 @@ def compute_complexity(bam, paired, bam_basename): 'TwoReadPairs', 'NRF', 'PBC1', - 'PBC2'] 
+ 'PBC2', + ] if paired: steps = [ "samtools sort -@%d -n %s" % (cpu_count(), bam), "bamToBed -bedpe -i stdin", - r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'"""] + r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'""", + ] else: steps = [ "bamToBed -i %s" % (bam), - r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'"""] + r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'""", + ] steps.extend([ "grep -v 'chrM'", "sort", @@ -251,9 +282,13 @@ def compute_complexity(bam, paired, bam_basename): if err: logger.error("PBC file error: %s", err) - # Add headers + # Add Sample Name and headers pbc_file = pd.read_csv(tmp_pbc_file_qc_filename, sep='\t', header=None, names=pbc_headers) + pbc_file['Sample'] = bam_basename + pbc_headers_new = list(pbc_file) + pbc_headers_new.insert(0, pbc_headers_new.pop(pbc_headers_new.index('Sample'))) + pbc_file = pbc_file[pbc_headers_new] pbc_file.to_csv(pbc_file_qc_filename, header=True, sep='\t', index=False) os.remove(bam) os.remove(bam + '.bai') diff --git a/workflow/tests/test_map_qc.py b/workflow/tests/test_map_qc.py index 4c3525b..086a1d7 100644 --- a/workflow/tests/test_map_qc.py +++ b/workflow/tests/test_map_qc.py @@ -8,23 +8,45 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ '/../output/filterReads/' -@pytest.mark.integration -def test_map_qc_singleend(): - #assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bam')) - #assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bam.bai')) - #filtered_reads_report = test_output_path + 'ENCFF646LXU.filt.nodup.flagstat.qc' - #samtools_report = open(filtered_reads_report).readlines() - #assert '64962570 + 0 in total' in samtools_report[0] - #assert '64962570 + 0 mapped (100.00%:N/A)' in samtools_report[4] - #library_complexity = test_output_path + 'ENCFF646LXU.filt.nodup.pbc.qc' - #df_library_complexity = pd.read_csv(library_complexity, sep='\t') - #assert df_library_complexity["NRF"].iloc[0] == 0.926192 - #assert 
df_library_complexity["PBC1"].iloc[0] == 0.926775
-    #assert df_library_complexity["PBC2"].iloc[0] == 13.706885
-    pass
-
-
-@pytest.mark.integration
-def test_map_qc_pairedend():
-    # Do the same thing for paired end data
-    pass
+@pytest.mark.singleend_human
+def test_align_reads_singleend_human():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB969KTX/ENCLB969KTX.dedup.bam'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB969KTX/ENCLB969KTX.dedup.bam.bai'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB969KTX/ENCLB969KTX.dedup.qc'))
+
+@pytest.mark.singleend_human
+def test_map_qc_singleend_human():
+    filtered_reads_report = test_output_path + 'ENCLB969KTX/ENCLB969KTX.dedup.flagstat.qc'
+    samtools_report = open(filtered_reads_report).readlines()
+    assert '28118054 + 0 in total' in samtools_report[0]
+    assert '28118054 + 0 mapped (100.00%:N/A)' in samtools_report[4]
+
+@pytest.mark.singleend_human
+def test_library_complexity_singleend():
+    library_complexity = test_output_path + 'ENCLB969KTX/ENCLB969KTX.pbc.qc'
+    df_library_complexity = pd.read_csv(library_complexity, sep='\t')
+    assert df_library_complexity["NRF"].iloc[0] == 0.608847
+    assert df_library_complexity["PBC1"].iloc[0] == 0.641176
+    assert df_library_complexity["PBC2"].iloc[0] == 2.887092
+
+
+@pytest.mark.pairedend_mouse
+def test_align_reads_pairedend_mouse():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB122XDP/ENCLB122XDP.dedup.bam'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB122XDP/ENCLB122XDP.dedup.bam.bai'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB122XDP/ENCLB122XDP.dedup.qc'))
+
+@pytest.mark.pairedend_mouse
+def test_map_qc_pairedend_mouse():
+    filtered_reads_report = test_output_path + 'ENCLB122XDP/ENCLB122XDP.dedup.flagstat.qc'
+    samtools_report = open(filtered_reads_report).readlines()
+    assert '34342124 + 0 in total' in samtools_report[0]
+    assert '34342124 + 0 mapped (100.00%:N/A)' in 
samtools_report[4]
+
+@pytest.mark.pairedend_mouse
+def test_library_complexity_pairedend():
+    library_complexity = test_output_path + 'ENCLB122XDP/ENCLB122XDP.pbc.qc'
+    df_library_complexity = pd.read_csv(library_complexity, sep='\t')
+    assert df_library_complexity["NRF"].iloc[0] == 0.912926
+    assert round(df_library_complexity["PBC1"].iloc[0],6) == 0.913898
+    assert round(df_library_complexity["PBC2"].iloc[0],6) == 11.654689
-- 
GitLab