Commit a14bd619 authored by Venkat Malladi

Merge branch '15-FixFilterReads' into 'master'

fix filter reads

Closes #15

See merge request !9
parents 2074cdc8 4cadae4e
Pipeline #5289 passed with stages
in 9 hours, 44 minutes, and 4 seconds
# Astrocyte ATAC-seq analysis Workflow Package
-[![Build Status](https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/badges/master/build.svg)](https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/commits/master)
-[![Coverage Report](https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/badges/master/coverage.svg)](https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/commits/master)
+[![Build Status](https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/badges/master/build.svg)](https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/commits/master)
+[![Coverage Report](https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/badges/master/coverage.svg)](https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/commits/master)
[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A50.24.0-brightgreen.svg)](https://www.nextflow.io/)
[![Astrocyte](https://img.shields.io/badge/astrocyte-%E2%89%A50.1.0-blue.svg)](https://astrocyte-test.biohpc.swmed.edu/static/docs/index.html)
@@ -47,6 +47,7 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
1. Check input files
2. Trim adaptors with TrimGalore!
3. Map reads with BWA, filter with SamTools, and sort with Sambamba
4. Mark duplicates with Sambamba, filter reads with SamTools, and calculate library complexity with SamTools and bedtools
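
The mapping and filtering steps above reduce to a short chain of tool invocations. Below is a minimal sketch of that chain driven from Python, assuming single-end data and illustrative file names (`genome.fa`, `sample_trimmed.fq.gz`); the exact flags are assumptions for illustration, since the real settings live in the workflow's Nextflow processes and `scripts/map_qc.py`.

```python
# Minimal sketch of steps 3-4 above; file names and flags are
# illustrative assumptions, not the pipeline's actual settings.
import subprocess

def run(cmd):
    """Run a shell pipeline and raise if any stage fails."""
    subprocess.run(cmd, shell=True, check=True)

# Step 3: map with BWA, filter/convert with SamTools, sort with Sambamba
run("bwa mem -t 4 genome.fa sample_trimmed.fq.gz | "
    "samtools view -b -q 30 - > sample.bam")
run("sambamba sort -t 4 -o sample.sorted.bam sample.bam")

# Step 4: mark duplicates with Sambamba, drop them with SamTools,
# then index the deduplicated bam
run("sambamba markdup -t 4 sample.sorted.bam sample.dup_marked.bam")
run("samtools view -b -F 1024 sample.dup_marked.bam > sample.dedup.bam")
run("samtools index sample.dedup.bam")
```
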
## Output Files
@@ -55,6 +56,14 @@ Folder | File | Description
design | N/A | Inputs used for analysis; can ignore
trimReads | *_trimming_report.txt | report detailing how many reads were trimmed
trimReads | *_trimmed.fq.gz | trimmed fastq files used for analysis
alignReads | *.flagstat.qc | QC metrics from the mapping process
alignReads | *.bam | sorted bam file
filterReads | *.dedup.qc | QC metrics from marking duplicate reads (sambamba)
filterReads | *.dedup.bam | filtered bam file with duplicate reads removed
filterReads | *.dedup.bam.bai | indexed filtered bam file
filterReads | *.dedup.flagstat.qc | QC metrics of filtered bam file (mapping stats, samtools)
filterReads | *.dedup.pbc.qc | QC metrics of library complexity
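
Each `*.pbc.qc` report is a single-row, tab-separated table, so it can be inspected directly with pandas. A small sketch, using a hypothetical sample directory under `output/filterReads/`:

```python
# Sketch of reading a library-complexity report; the SAMPLE path is a
# hypothetical placeholder, not a file shipped with the workflow.
import pandas as pd

pbc = pd.read_csv("output/filterReads/SAMPLE/SAMPLE.pbc.qc", sep="\t")
print(pbc[["Sample", "NRF", "PBC1", "PBC2"]].to_string(index=False))
```
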
## Common Errors
@@ -156,7 +156,7 @@ process alignReads {
output:
set sampleId, file('*.bam'), experimentId, replicate into mappedReads
file '*.flagstat.qc' into mappedReadsStats
file('version_*.txt') into alignReadsVersions
script:
if (params.astrocyte == true) {
@@ -197,34 +197,54 @@ process alignReads {
// Dedup reads using sambamba
process filterReads {
tag "$sampleId-$replicate"
publishDir "$baseDir/output/${task.process}", mode: 'copy'
tag "${sampleId}-${replicate}"
publishDir "${outDir}/${task.process}/${sampleId}", mode: 'copy'
input:
set sampleId, mapped, experimentId, replicate from mappedReads
output:
set sampleId, file('*.bam'), file('*.bai'), experimentId, replicate into dedupReads
set sampleId, file('*.bam'), file('*.bai'), experimentId, replicate into tssEnrich
set sampleId, file('*.bam'), file('*.bai'), experimentId, replicate into convertReads
file '*flagstat.qc' into dedupReadsStats
file '*pbc.qc' into dedupReadsComplexity
file '*dup.qc' into dupReads
file('version_*.txt') into filterReadsVersions
script:
-    if (pairedEnd) {
-      """
-      python3 $baseDir/scripts/map_qc.py -b $mapped -p
-      """
-    }
-    else {
-      """
-      python3 $baseDir/scripts/map_qc.py -b $mapped
-      """
-    }
+    if (params.astrocyte == true) {
+      if (pairedEnd) {
+        """
+        module load python/3.6.1-2-anaconda
+        module load samtools/1.4.1
+        module load sambamba/0.6.6
+        module load bedtools/2.26.0
+        python3 ${baseDir}/scripts/map_qc.py -b ${mapped} -p
+        """
+      }
+      else {
+        """
+        module load python/3.6.1-2-anaconda
+        module load samtools/1.4.1
+        module load sambamba/0.6.6
+        module load bedtools/2.26.0
+        python3 ${baseDir}/scripts/map_qc.py -b ${mapped}
+        """
+      }
+    }
+    else {
+      if (pairedEnd) {
+        """
+        python3 ${baseDir}/scripts/map_qc.py -b ${mapped} -p
+        """
+      }
+      else {
+        """
+        python3 ${baseDir}/scripts/map_qc.py -b ${mapped}
+        """
+      }
+    }
}
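
The process calls `map_qc.py` with `-b` for the sorted bam and `-p` when the data are paired-end. The diff does not show how the script defines those options; the following argparse sketch is a hypothetical reconstruction of that interface, not the script's actual code.

```python
# Hypothetical sketch of the -b / -p command-line interface used above;
# the committed map_qc.py may define these options differently.
import argparse

def get_args():
    parser = argparse.ArgumentParser(
        description="Mark duplicates, filter the bam, and compute library complexity.")
    parser.add_argument('-b', '--bam', required=True,
                        help="Sorted bam file to filter.")
    parser.add_argument('-p', '--paired', action='store_true',
                        help="Treat reads as paired-end.")
    return parser.parse_args()

if __name__ == '__main__':
    args = get_args()
    print("bam: %s, paired-end: %s" % (args.bam, args.paired))
```
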
@@ -12,7 +12,6 @@ from multiprocessing import cpu_count
import pandas as pd
import utils
EPILOG = '''
For more details:
%(prog)s --help
@@ -63,6 +62,15 @@ def check_tools():
samtools_path = shutil.which("samtools")
if samtools_path:
logger.info('Found samtools: %s', samtools_path)
# Get Version
samtools_version_command = "samtools --version"
samtools_version = subprocess.check_output(samtools_version_command, shell=True)
# Write to file
samtools_file = open("version_samtools.txt", "wb")
samtools_file.write(samtools_version)
samtools_file.close()
else:
logger.error('Missing samtools')
raise Exception('Missing samtools')
@@ -70,6 +78,18 @@ def check_tools():
sambamba_path = shutil.which("sambamba")
if sambamba_path:
logger.info('Found sambamba: %s', sambamba_path)
# Get Version
sambamba_version_command = "sambamba"
try:
subprocess.check_output(sambamba_version_command, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
sambamba_version = e.output
# Write to file
sambamba_file = open("version_sambamba.txt", "wb")
sambamba_file.write(sambamba_version)
sambamba_file.close()
else:
logger.error('Missing sambamba')
raise Exception('Missing sambamba')
@@ -77,6 +97,15 @@ def check_tools():
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
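
The three version checks added above repeat one capture-and-write pattern; sambamba is the odd one out only because it exits non-zero when run without arguments, so its banner arrives on the raised `CalledProcessError`. A sketch of how the pattern could be factored into a shared helper (illustrative only, not part of the committed script):

```python
# Illustrative helper consolidating the version checks above; not part
# of the committed map_qc.py.
import subprocess

def write_tool_version(tool, version_command):
    """Capture a tool's version banner and write it to version_<tool>.txt."""
    try:
        output = subprocess.check_output(version_command, shell=True,
                                         stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        # sambamba prints its banner and exits non-zero when run bare,
        # so the useful text is attached to the exception.
        output = e.output
    with open("version_%s.txt" % tool, "wb") as version_file:
        version_file.write(output)

write_tool_version("samtools", "samtools --version")
write_tool_version("sambamba", "sambamba")
write_tool_version("bedtools", "bedtools --version")
```
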
@@ -168,9 +197,8 @@ def dedup_mapped(bam, bam_basename, paired):
shlex.split(sambamba_markdup_command),
stderr=dup_qc)
# Remove duplicates
-final_bam_prefix = bam_basename + ".filt.nodup"
+final_bam_prefix = bam_basename + ".dedup"
final_bam_filename = final_bam_prefix + ".bam"
if paired: # paired-end data
@@ -207,7 +235,7 @@ def dedup_mapped(bam, bam_basename, paired):
def compute_complexity(bam, paired, bam_basename):
'''Calculate library complexity.'''
-pbc_file_qc_filename = bam_basename + ".filt.nodup.pbc.qc"
+pbc_file_qc_filename = bam_basename + ".pbc.qc"
tmp_pbc_file_qc_filename = "tmp.%s" % (pbc_file_qc_filename)
# Sort by name
@@ -230,17 +258,20 @@ def compute_complexity(bam, paired, bam_basename):
'TwoReadPairs',
'NRF',
'PBC1',
-'PBC2']
+'PBC2',
+]
if paired:
steps = [
"samtools sort -@%d -n %s" % (cpu_count(), bam),
"bamToBed -bedpe -i stdin",
r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'"""]
r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'""",
]
else:
steps = [
"bamToBed -i %s" % (bam),
r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'"""]
r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'""",
]
steps.extend([
"grep -v 'chrM'",
"sort",
@@ -251,9 +282,13 @@ def compute_complexity(bam, paired, bam_basename):
if err:
logger.error("PBC file error: %s", err)
-# Add headers
+# Add Sample Name and headers
pbc_file = pd.read_csv(tmp_pbc_file_qc_filename, sep='\t', header=None,
names=pbc_headers)
pbc_file['Sample'] = bam_basename
pbc_headers_new = list(pbc_file)
pbc_headers_new.insert(0, pbc_headers_new.pop(pbc_headers_new.index('Sample')))
pbc_file = pbc_file[pbc_headers_new]
pbc_file.to_csv(pbc_file_qc_filename, header=True, sep='\t', index=False)
os.remove(bam)
os.remove(bam + '.bai')
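
The NRF/PBC1/PBC2 columns written above follow the usual ENCODE-style library-complexity definitions: NRF is distinct genomic locations over total read pairs, PBC1 is locations seen exactly once over distinct locations, and PBC2 is locations seen once over locations seen twice. A small sketch of those ratios with made-up counts (the committed script derives the actual counts from the bamToBed/sort/uniq pipeline above):

```python
# Sketch of the ratios behind NRF, PBC1 and PBC2, using made-up counts;
# the committed script computes the counts with its awk/sort/uniq steps.
def complexity_metrics(total_pairs, distinct_pairs, one_read_pairs, two_read_pairs):
    nrf = distinct_pairs / total_pairs       # non-redundant fraction
    pbc1 = one_read_pairs / distinct_pairs   # PCR bottlenecking coefficient 1
    pbc2 = one_read_pairs / two_read_pairs   # PCR bottlenecking coefficient 2
    return nrf, pbc1, pbc2

print(complexity_metrics(total_pairs=1000000, distinct_pairs=900000,
                         one_read_pairs=850000, two_read_pairs=40000))
```
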
@@ -8,23 +8,45 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../output/filterReads/'
-@pytest.mark.integration
-def test_map_qc_singleend():
-    #assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bam'))
-    #assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bam.bai'))
-    #filtered_reads_report = test_output_path + 'ENCFF646LXU.filt.nodup.flagstat.qc'
-    #samtools_report = open(filtered_reads_report).readlines()
-    #assert '64962570 + 0 in total' in samtools_report[0]
-    #assert '64962570 + 0 mapped (100.00%:N/A)' in samtools_report[4]
-    #library_complexity = test_output_path + 'ENCFF646LXU.filt.nodup.pbc.qc'
-    #df_library_complexity = pd.read_csv(library_complexity, sep='\t')
-    #assert df_library_complexity["NRF"].iloc[0] == 0.926192
-    #assert df_library_complexity["PBC1"].iloc[0] == 0.926775
-    #assert df_library_complexity["PBC2"].iloc[0] == 13.706885
-    pass
-
-@pytest.mark.integration
-def test_map_qc_pairedend():
-    # Do the same thing for paired end data
-    pass
+@pytest.mark.singleend_human
+def test_filter_reads_singleend_human():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB969KTX/ENCLB969KTX.dedup.bam'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB969KTX/ENCLB969KTX.dedup.bam.bai'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB969KTX/ENCLB969KTX.dedup.qc'))
+
+@pytest.mark.singleend_human
+def test_filter_reads_flagstat_singleend_human():
+    filtered_reads_report = test_output_path + 'ENCLB969KTX/ENCLB969KTX.dedup.flagstat.qc'
+    samtools_report = open(filtered_reads_report).readlines()
+    assert '28118054 + 0 in total' in samtools_report[0]
+    assert '28118054 + 0 mapped (100.00%:N/A)' in samtools_report[4]
+
+@pytest.mark.singleend_human
+def test_library_complexity_singleend_human():
+    library_complexity = test_output_path + 'ENCLB969KTX/ENCLB969KTX.pbc.qc'
+    df_library_complexity = pd.read_csv(library_complexity, sep='\t')
+    assert df_library_complexity["NRF"].iloc[0] == 0.608847
+    assert df_library_complexity["PBC1"].iloc[0] == 0.641176
+    assert df_library_complexity["PBC2"].iloc[0] == 2.887092
+
+@pytest.mark.pairedend_mouse
+def test_filter_reads_pairedend_mouse():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB122XDP/ENCLB122XDP.dedup.bam'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB122XDP/ENCLB122XDP.dedup.bam.bai'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB122XDP/ENCLB122XDP.dedup.qc'))
+
+@pytest.mark.pairedend_mouse
+def test_filter_reads_flagstat_pairedend_mouse():
+    filtered_reads_report = test_output_path + 'ENCLB122XDP/ENCLB122XDP.dedup.flagstat.qc'
+    samtools_report = open(filtered_reads_report).readlines()
+    assert '34342124 + 0 in total' in samtools_report[0]
+    assert '34342124 + 0 mapped (100.00%:N/A)' in samtools_report[4]
+
+@pytest.mark.pairedend_mouse
+def test_library_complexity_pairedend_mouse():
+    library_complexity = test_output_path + 'ENCLB122XDP/ENCLB122XDP.pbc.qc'
+    df_library_complexity = pd.read_csv(library_complexity, sep='\t')
+    assert df_library_complexity["NRF"].iloc[0] == 0.912926
+    assert round(df_library_complexity["PBC1"].iloc[0], 6) == 0.913898
+    assert round(df_library_complexity["PBC2"].iloc[0], 6) == 11.654689