From 4cadae4e4053bfda8b1dcecf0c956d32ee089f55 Mon Sep 17 00:00:00 2001
From: Holly Ruess <s185797@biohpcwsc012.biohpc.swmed.edu>
Date: Tue, 17 Dec 2019 11:59:50 -0600
Subject: [PATCH] fix filter reads

---
 README.md                     | 13 +++++--
 workflow/main.nf              | 66 +++++++++++++++++++++++------------
 workflow/scripts/map_qc.py    | 51 ++++++++++++++++++++++-----
 workflow/tests/test_map_qc.py | 62 +++++++++++++++++++++-----------
 4 files changed, 139 insertions(+), 53 deletions(-)

diff --git a/README.md b/README.md
index 74a4bbc..1e206ec 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # Astrocyte ATAC-seq analysis Workflow Package
 
-[![Build Status](https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/badges/master/build.svg)](https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/commits/master)
-[![Coverage Report](https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/badges/master/coverage.svg)](https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/commits/master)
+[![Build Status](https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/badges/master/build.svg)](https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/commits/master)
+[![Coverage Report](https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/badges/master/coverage.svg)](https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/commits/master)
 [![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A50.24.0-brightgreen.svg
 )](https://www.nextflow.io/)
 [![Astrocyte](https://img.shields.io/badge/astrocyte-%E2%89%A50.1.0-blue.svg)](https://astrocyte-test.biohpc.swmed.edu/static/docs/index.html)
@@ -47,6 +47,7 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
     1. Check input files
     2. Trim adaptors with TrimGalore!
     3. Map reads with BWA, filter with SamTools, and sort with Sambamba
+    4. Mark duplicates with Sambamba, filter reads with SamTools, and calculate library complexity with SamTools and bedtools
 
 
 ## Output Files
@@ -55,6 +56,14 @@ Folder | File | Description
 design | N/A | Inputs used for analysis; can ignore
 trimReads | *_trimming_report.txt | report detailing how many reads were trimmed
 trimReads | *_trimmed.fq.gz | trimmed fastq files used for analysis
+alignReads | *.flagstat.qc | QC metrics from the mapping process
+alignReads | *.bam | sorted bam file
+filterReads | *.dedup.qc | QC metrics from duplicate marking (sambamba markdup log)
+filterReads | *.dedup.bam | filtered bam file with duplicate reads removed
+filterReads | *.dedup.bam.bai | indexed filtered bam file
+filterReads | *.dedup.flagstat.qc | mapping statistics for the filtered bam file (samtools flagstat)
+filterReads | *.pbc.qc | library complexity metrics (NRF, PBC1, PBC2)
+
 
 
 ## Common Errors
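For orientation, step 4 above reduces to a handful of tool invocations. A minimal Python sketch of the flow, assuming paired-end data; the helper name, the -F 1804 filter flag, and the intermediate .marked.bam name are illustrative, while the real logic lives in workflow/scripts/map_qc.py:

    import shlex
    import subprocess

    def filter_reads_sketch(bam, prefix):
        # Mark duplicates with sambamba; its log becomes *.dedup.qc
        with open(prefix + '.dedup.qc', 'w') as dup_qc:
            subprocess.check_call(
                shlex.split('sambamba markdup %s %s.marked.bam' % (bam, prefix)),
                stderr=dup_qc)

        # Drop duplicate/unmapped/failed reads (the flag value is illustrative;
        # map_qc.py picks its filters based on single- vs paired-end input)
        subprocess.check_call(shlex.split(
            'samtools view -F 1804 -b -o %s.dedup.bam %s.marked.bam' % (prefix, prefix)))

        # Index the filtered bam and collect mapping stats (*.dedup.flagstat.qc)
        subprocess.check_call(shlex.split('samtools index %s.dedup.bam' % (prefix)))
        with open(prefix + '.dedup.flagstat.qc', 'w') as flagstat_qc:
            subprocess.check_call(
                shlex.split('samtools flagstat %s.dedup.bam' % (prefix)),
                stdout=flagstat_qc)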
diff --git a/workflow/main.nf b/workflow/main.nf
index 7c02981..379169f 100644
--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -156,7 +156,7 @@ process alignReads {
   output:
     set sampleId, file('*.bam'), experimentId, replicate into mappedReads
     file '*.flagstat.qc' into mappedReadsStats
-
+    file('version_*.txt') into alignReadsVersions
 
   script:
     if (params.astrocyte == true) {
@@ -197,34 +197,54 @@ process alignReads {
 // Dedup reads using sambamba
 process filterReads {
 
-  tag "$sampleId-$replicate"
-  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+  tag "${sampleId}-${replicate}"
+  publishDir "${outDir}/${task.process}/${sampleId}", mode: 'copy'
 
   input:
-
-  set sampleId, mapped, experimentId, replicate from mappedReads
+    set sampleId, mapped, experimentId, replicate from mappedReads
 
   output:
-
-  set sampleId, file('*.bam'), file('*.bai'), experimentId, replicate into dedupReads
-  set sampleId, file('*.bam'), file('*.bai'), experimentId, replicate into tssEnrich
-  set sampleId, file('*.bam'), file('*.bai'), experimentId, replicate into convertReads
-  file '*flagstat.qc' into dedupReadsStats
-  file '*pbc.qc' into dedupReadsComplexity
-  file '*dup.qc' into dupReads
+    set sampleId, file('*.bam'), file('*.bai'), experimentId, replicate into dedupReads
+    set sampleId, file('*.bam'), file('*.bai'), experimentId, replicate into tssEnrich
+    set sampleId, file('*.bam'), file('*.bai'), experimentId, replicate into convertReads
+    file '*flagstat.qc' into dedupReadsStats
+    file '*pbc.qc' into dedupReadsComplexity
+    file '*dup.qc' into dupReads
+    file('version_*.txt') into filterReadsVersions
 
   script:
-
-  if (pairedEnd) {
-    """
-    python3 $baseDir/scripts/map_qc.py -b $mapped -p
-    """
-  }
-  else {
-    """
-    python3 $baseDir/scripts/map_qc.py -b $mapped
-    """
-  }
+    if (params.astrocyte == true) {
+      if (pairedEnd) {
+        """
+        module load python/3.6.1-2-anaconda
+        module load samtools/1.4.1
+        module load sambamba/0.6.6
+        module load bedtools/2.26.0
+        python3 ${baseDir}/scripts/map_qc.py -b ${mapped} -p
+        """
+      }
+      else {
+        """
+        module load python/3.6.1-2-anaconda
+        module load samtools/1.4.1
+        module load sambamba/0.6.6
+        module load bedtools/2.26.0
+        python3 ${baseDir}/scripts/map_qc.py -b ${mapped}
+        """
+      }
+    }
+    else {
+      if (pairedEnd) {
+        """
+        python3 ${baseDir}/scripts/map_qc.py -b ${mapped} -p
+        """
+      }
+      else {
+        """
+        python3 ${baseDir}/scripts/map_qc.py -b ${mapped}
+        """
+      }
+    }
 
 }
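Both branches invoke map_qc.py with -b for the mapped bam and an optional -p flag for paired-end data. The script's argument parser is not part of this patch; a minimal sketch of the interface it implies (long-option spellings and help text are assumptions):

    import argparse

    def get_args():
        parser = argparse.ArgumentParser(description='Filter and QC mapped reads.')
        parser.add_argument('-b', '--bam', required=True,
                            help='Sorted bam file to deduplicate and filter.')
        parser.add_argument('-p', '--paired', action='store_true',
                            help='Treat the input as paired-end data.')
        return parser.parse_args()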
 
diff --git a/workflow/scripts/map_qc.py b/workflow/scripts/map_qc.py
index d58ae8f..af12e4d 100644
--- a/workflow/scripts/map_qc.py
+++ b/workflow/scripts/map_qc.py
@@ -12,7 +12,6 @@ from multiprocessing import cpu_count
 import pandas as pd
 import utils
 
-
 EPILOG = '''
 For more details:
         %(prog)s --help
@@ -63,6 +62,15 @@ def check_tools():
     samtools_path = shutil.which("samtools")
     if samtools_path:
         logger.info('Found samtools: %s', samtools_path)
+
+        # Get Version
+        samtools_version_command = "samtools --version"
+        samtools_version = subprocess.check_output(samtools_version_command, shell=True)
+
+        # Write to file
+        samtools_file = open("version_samtools.txt", "wb")
+        samtools_file.write(samtools_version)
+        samtools_file.close()
     else:
         logger.error('Missing samtools')
         raise Exception('Missing samtools')
@@ -70,6 +78,18 @@ def check_tools():
     sambamba_path = shutil.which("sambamba")
     if sambamba_path:
         logger.info('Found sambamba: %s', sambamba_path)
+
+        # Get Version
+        sambamba_version_command = "sambamba"
+        try:
+            sambamba_version = subprocess.check_output(sambamba_version_command, shell=True, stderr=subprocess.STDOUT)
+        except subprocess.CalledProcessError as e:
+            sambamba_version = e.output  # sambamba prints usage (with version) and exits nonzero
+
+        # Write to file
+        sambamba_file = open("version_sambamba.txt", "wb")
+        sambamba_file.write(sambamba_version)
+        sambamba_file.close()
     else:
         logger.error('Missing sambamba')
         raise Exception('Missing sambamba')
@@ -77,6 +97,15 @@ def check_tools():
     bedtools_path = shutil.which("bedtools")
     if bedtools_path:
         logger.info('Found bedtools: %s', bedtools_path)
+
+        # Get Version
+        bedtools_version_command = "bedtools --version"
+        bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
+
+        # Write to file
+        bedtools_file = open("version_bedtools.txt", "wb")
+        bedtools_file.write(bedtools_version)
+        bedtools_file.close()
     else:
         logger.error('Missing bedtools')
         raise Exception('Missing bedtools')
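The three version-capture blocks above share one pattern and could be collapsed into a single helper. A sketch, assuming a hypothetical write_version_file name; the allow_nonzero escape hatch covers sambamba, which prints its banner and exits nonzero:

    import subprocess

    def write_version_file(tool, command, allow_nonzero=False):
        '''Run a version command and write its output to version_<tool>.txt.'''
        try:
            version = subprocess.check_output(command, shell=True,
                                              stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as error:
            if not allow_nonzero:
                raise
            version = error.output  # tool exited nonzero after printing its banner
        with open("version_%s.txt" % (tool), "wb") as version_file:
            version_file.write(version)

    write_version_file('samtools', 'samtools --version')
    write_version_file('sambamba', 'sambamba', allow_nonzero=True)
    write_version_file('bedtools', 'bedtools --version')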
@@ -168,9 +197,8 @@ def dedup_mapped(bam, bam_basename, paired):
             shlex.split(sambamba_markdup_command),
             stderr=dup_qc)
 
-
     # Remove duplicates
-    final_bam_prefix = bam_basename + ".filt.nodup"
+    final_bam_prefix = bam_basename + ".dedup"
     final_bam_filename = final_bam_prefix + ".bam"
 
     if paired:  # paired-end data
@@ -207,7 +235,7 @@ def dedup_mapped(bam, bam_basename, paired):
 def compute_complexity(bam, paired, bam_basename):
     '''Calculate library complexity.'''
 
-    pbc_file_qc_filename = bam_basename + ".filt.nodup.pbc.qc"
+    pbc_file_qc_filename = bam_basename + ".pbc.qc"
     tmp_pbc_file_qc_filename = "tmp.%s" % (pbc_file_qc_filename)
 
     # Sort by name
@@ -230,17 +258,20 @@ def compute_complexity(bam, paired, bam_basename):
         'TwoReadPairs',
         'NRF',
         'PBC1',
-        'PBC2']
+        'PBC2',
+        ]
 
     if paired:
         steps = [
             "samtools sort -@%d -n %s" % (cpu_count(), bam),
             "bamToBed -bedpe -i stdin",
-            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'"""]
+            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'""",
+            ]
     else:
         steps = [
             "bamToBed -i %s" % (bam),
-            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'"""]
+            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'""",
+            ]
     steps.extend([
         "grep -v 'chrM'",
         "sort",
@@ -251,9 +282,13 @@ def compute_complexity(bam, paired, bam_basename):
     if err:
         logger.error("PBC file error: %s", err)
 
-    # Add headers
+    # Add Sample Name and headers
     pbc_file = pd.read_csv(tmp_pbc_file_qc_filename, sep='\t', header=None,
                            names=pbc_headers)
+    pbc_file['Sample'] = bam_basename
+    pbc_headers_new = list(pbc_file)
+    pbc_headers_new.insert(0, pbc_headers_new.pop(pbc_headers_new.index('Sample')))
+    pbc_file = pbc_file[pbc_headers_new]
     pbc_file.to_csv(pbc_file_qc_filename, header=True, sep='\t', index=False)
     os.remove(bam)
     os.remove(bam + '.bai')
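The columns written to *.pbc.qc follow the ENCODE library-complexity definitions: NRF is distinct genomic locations over total read pairs, PBC1 is locations seen exactly once over distinct locations, and PBC2 is locations seen once over locations seen twice. A worked example with made-up counts:

    # Hypothetical counts, for illustration only
    total_read_pairs = 1000000
    distinct_read_pairs = 610000
    one_read_pair = 560000
    two_read_pairs = 40000

    nrf = distinct_read_pairs / total_read_pairs   # 0.61
    pbc1 = one_read_pair / distinct_read_pairs     # ~0.92
    pbc2 = one_read_pair / two_read_pairs          # 14.0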
diff --git a/workflow/tests/test_map_qc.py b/workflow/tests/test_map_qc.py
index 4c3525b..086a1d7 100644
--- a/workflow/tests/test_map_qc.py
+++ b/workflow/tests/test_map_qc.py
@@ -8,23 +8,45 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
                 '/../output/filterReads/'
 
 
-@pytest.mark.integration
-def test_map_qc_singleend():
-    #assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bam'))
-    #assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bam.bai'))
-    #filtered_reads_report = test_output_path + 'ENCFF646LXU.filt.nodup.flagstat.qc'
-    #samtools_report = open(filtered_reads_report).readlines()
-    #assert '64962570 + 0 in total' in samtools_report[0]
-    #assert '64962570 + 0 mapped (100.00%:N/A)' in samtools_report[4]
-    #library_complexity = test_output_path + 'ENCFF646LXU.filt.nodup.pbc.qc'
-    #df_library_complexity = pd.read_csv(library_complexity, sep='\t')
-    #assert  df_library_complexity["NRF"].iloc[0] == 0.926192
-    #assert  df_library_complexity["PBC1"].iloc[0] == 0.926775
-    #assert  df_library_complexity["PBC2"].iloc[0] == 13.706885
-    pass
-
-
-@pytest.mark.integration
-def test_map_qc_pairedend():
-    # Do the same thing for paired end data
-    pass
+@pytest.mark.singleend_human
+def test_map_qc_files_singleend_human():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB969KTX/ENCLB969KTX.dedup.bam'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB969KTX/ENCLB969KTX.dedup.bam.bai'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB969KTX/ENCLB969KTX.dedup.qc'))
+
+@pytest.mark.singleend_human
+def test_map_qc_flagstat_singleend_human():
+    filtered_reads_report = test_output_path + 'ENCLB969KTX/ENCLB969KTX.dedup.flagstat.qc'
+    samtools_report = open(filtered_reads_report).readlines()
+    assert '28118054 + 0 in total' in samtools_report[0]
+    assert '28118054 + 0 mapped (100.00%:N/A)' in samtools_report[4]
+
+@pytest.mark.singleend_human
+def test_library_complexity_singleend_human():
+    library_complexity = test_output_path + 'ENCLB969KTX/ENCLB969KTX.pbc.qc'
+    df_library_complexity = pd.read_csv(library_complexity, sep='\t')
+    assert df_library_complexity["NRF"].iloc[0] == 0.608847
+    assert df_library_complexity["PBC1"].iloc[0] == 0.641176
+    assert df_library_complexity["PBC2"].iloc[0] == 2.887092
+
+
+@pytest.mark.pairedend_mouse
+def test_map_qc_files_pairedend_mouse():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB122XDP/ENCLB122XDP.dedup.bam'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB122XDP/ENCLB122XDP.dedup.bam.bai'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB122XDP/ENCLB122XDP.dedup.qc'))
+
+@pytest.mark.pairedend_mouse
+def test_map_qc_flagstat_pairedend_mouse():
+    filtered_reads_report = test_output_path + 'ENCLB122XDP/ENCLB122XDP.dedup.flagstat.qc'
+    samtools_report = open(filtered_reads_report).readlines()
+    assert '34342124 + 0 in total' in samtools_report[0]
+    assert '34342124 + 0 mapped (100.00%:N/A)' in samtools_report[4]
+
+@pytest.mark.pairedend_mouse
+def test_library_complexity_pairedend_mouse():
+    library_complexity = test_output_path + 'ENCLB122XDP/ENCLB122XDP.pbc.qc'
+    df_library_complexity = pd.read_csv(library_complexity, sep='\t')
+    assert df_library_complexity["NRF"].iloc[0] == 0.912926
+    assert round(df_library_complexity["PBC1"].iloc[0], 6) == 0.913898
+    assert round(df_library_complexity["PBC2"].iloc[0], 6) == 11.654689
-- 
GitLab