added blacklist removal

946bd51c · Holly Ruess · a093f6e5 · 946bd51c · 946bd51c · 946bd51c
Commit 946bd51c authored 4 years ago by Holly Ruess
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,6 +23,7 @@ All notable changes to this project will be documented in this file.
 - Added FRiP score (per sample: reads in replicated peaks/total reads per)
 - Added TSS enrichment
 - Added Fragment/Insert Length size
+ - Removal of blacklist regions

 ## [publish_1.0.0 ] - 2019-12-03
 Initial release of pipeline

--- a/README.md
+++ b/README.md
@@ -43,14 +43,14 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git


 ## Pipeline
-  + There are ??? steps to the pipeline
+  + There are 9 steps to the pipeline
    1. Check input files
    2. Trim adaptors with TrimGalore!
    3. Map reads with BWA, filter with SamTools, and sort with Sambamba
    4. Mark duplicates with Sambamba, Filter reads with SamTools, calculate percentage of reads in mitochondria, and calculate library complexity with SamTools and bedtools
    5. Calculate cross-correlation using PhantomPeakQualTools
    6. Call peaks with MACS2 from overlaps of pooled replicates
-    7. Call consensus peaks
+    7. Call consensus peaks and optional removal of blacklist peaks
    8. QC metrics
    9. MultiQC report

@@ -79,6 +79,7 @@ callPeaksMACS | pooled/*.pvalue_signal.bw | bigwig data file; sample/control sig
 callPeaksMACS | pooled/*_pooled.narrowPeak | peaks file; see [HERE](https://genome.ucsc.edu/FAQ/FAQformat.html#format12) for ENCODE narrowPeak header format
 consensusPeaks | *.rejected.narrowPeak | peaks not supported by multiple testing (replicates and pseudo-replicates)
 consensusPeaks | *.replicated.narrowPeak | peaks supported by multiple testing (replicates and pseudo-replicates)
+consensusPeaks | *.replicated_noblacklist.narrowPeak | peaks supported by multiple testing (replicates and pseudo-replicates) with blacklist regions removed
 experimentQC | coverage.pdf | plot to assess the sequencing depth of a given sample
 experimentQC | heatmeap_SpearmanCorr.pdf | plot of Spearman correlation between samples
 experimentQC | heatmeap_PearsonCorr.pdf | plot of Pearson correlation between samples

--- a/astrocyte_pkg.yml
+++ b/astrocyte_pkg.yml
@@ -110,6 +110,19 @@ workflow_parameters:
      read length, and then starts another round of reading from the opposite
      end of the fragment.

+  - id: blacklist
+    type: select
+    required: true
+    choices:
+      - [ 'true', 'True']
+      - [ 'false', 'False']
+    description: |
+      The Blacklisted Regions aim to identify a comprehensive set of regions 
+      that have anomalous, unstructured, high signal/read counts in next gen
+      sequencing experiments independent of cell line and type of experiment.
+      If you would like these regions excluded from replicated peaks, select
+      True. (recommended)
+
  - id: design
    type: file
    required: true

--- a/docs/index.md
+++ b/docs/index.md
@@ -43,14 +43,14 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git


 ## Pipeline
-  + There are ??? steps to the pipeline
+  + There are 9 steps to the pipeline
    1. Check input files
    2. Trim adaptors with TrimGalore!
    3. Map reads with BWA, filter with SamTools, and sort with Sambamba
    4. Mark duplicates with Sambamba, Filter reads with SamTools, calculate percentage of reads in mitochondria, and calculate library complexity with SamTools and bedtools
    5. Calculate cross-correlation using PhantomPeakQualTools
    6. Call peaks with MACS2 from overlaps of pooled replicates
-    7. Call consensus peaks
+    7. Call consensus peaks and optional removal of blacklist peaks
    8. QC metrics
    9. MultiQC report

@@ -79,6 +79,7 @@ callPeaksMACS | pooled/*.pvalue_signal.bw | bigwig data file; sample/control sig
 callPeaksMACS | pooled/*_pooled.narrowPeak | peaks file; see [HERE](https://genome.ucsc.edu/FAQ/FAQformat.html#format12) for ENCODE narrowPeak header format
 consensusPeaks | *.rejected.narrowPeak | peaks not supported by multiple testing (replicates and pseudo-replicates)
 consensusPeaks | *.replicated.narrowPeak | peaks supported by multiple testing (replicates and pseudo-replicates)
+consensusPeaks | *.replicated_noblacklist.narrowPeak | peaks supported by multiple testing (replicates and pseudo-replicates) with blacklist regions removed
 experimentQC | coverage.pdf | plot to assess the sequencing depth of a given sample
 experimentQC | heatmeap_SpearmanCorr.pdf | plot of Spearman correlation between samples
 experimentQC | heatmeap_PearsonCorr.pdf | plot of Pearson correlation between samples

--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -18,11 +18,11 @@ process {
    queue = '256GB,256GBv1'
  }
  withName: filterReads {
-    module = ['python/3.6.1-2-anaconda', 'samtools/1.4.1', 'sambamba/0.6.6', 'bedtools/2.26.0']
+    module = ['python/3.6.1-2-anaconda', 'samtools/1.4.1', 'sambamba/0.6.6', 'bedtools/2.25.0']
    queue = '128GB,256GB,256GBv1'
  }
  withName: convertReads {
-    module = ['python/3.6.1-2-anaconda', 'samtools/1.4.1', 'bedtools/2.26.0']
+    module = ['python/3.6.1-2-anaconda', 'samtools/1.4.1', 'bedtools/2.25.0']
    queue = '128GB,256GB,256GBv1'
  }
  withName: crossReads {
@@ -38,15 +38,15 @@ process {
    executor = 'local'
  }
  withName: callPeaksMACS {
-    module = ['python/3.6.1-2-anaconda', 'macs/2.1.0-20151222', 'phantompeakqualtools/1.2', 'UCSC_userApps/v317', 'bedtools/2.26.0']
+    module = ['python/3.6.1-2-anaconda', 'macs/2.1.0-20151222', 'phantompeakqualtools/1.2', 'UCSC_userApps/v317', 'bedtools/2.25.0']
    queue = '128GB,256GB,256GBv1'
  }
  withName: consensusPeaks {
-    module = ['python/3.6.1-2-anaconda', 'bedtools/2.26.0']
+    module = ['python/3.6.1-2-anaconda', 'bedtools/2.25.0']
    executor = 'local'
  }
  withName: experimentQC {
-    module = ['python/3.6.1-2-anaconda', 'deeptools/2.5.0.1', 'samtools/1.4.1', 'bedtools/2.26.0', 'singularity/2.6.1']
+    module = ['python/3.6.1-2-anaconda', 'deeptools/2.5.0.1', 'samtools/1.4.1', 'bedtools/2.25.0', 'singularity/2.6.1']
    queue = '128GB,256GB,256GBv1'
  }
  withName: multiqcReport {
@@ -64,12 +64,14 @@ params {
      genomesize = 'hs'
      chromsizes = '/project/shared/bicf_workflow_ref/GRCh38/genomefile.txt'
      gtffile = '/project/shared/bicf_workflow_ref/GRCh38/gencode.v25.chr_patch_hapl_scaff.annotation.gtf'
+      blacklistfile = '/project/shared/bicf_workflow_ref/GRCh38/ENCFF356LFX.bed'
    }
    'GRCm38' {
      bwa = '/project/shared/bicf_workflow_ref/GRCm38'
      genomesize = 'mm'
      chromsizes = '/project/shared/bicf_workflow_ref/GRCm38/genomefile.txt'
      gtffile = '/project/shared/bicf_workflow_ref/GRCm38/gencode.vM20.annotation.gtf'
+      blacklistfile = '/project/shared/bicf_workflow_ref/GRCm38/ENCFF999QPV.bed'
    }
  }
 }

--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -6,6 +6,7 @@
 // Define Input variables
 params.reads = "$baseDir/../test_data/*.fastq.gz"
 params.pairedEnd = false
+params.blacklist = false
 params.designFile = "$baseDir/../test_data/design_ENCSR265ZXX_SE.txt"
 params.genome = 'GRCh38'
 params.genomes = []
@@ -13,6 +14,7 @@ params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false :
 params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ?: false : false
 params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false
 params.gtfFile = params.genome ? params.genomes[ params.genome ].gtffile ?: false : false
+params.blacklistFile = params.genome ? params.genomes[ params.genome ].blacklistfile ?: false : false
 params.astrocyte = false
 params.outDir= "${baseDir}/output"
 params.references = "${baseDir}/../docs/references.md"
@@ -36,6 +38,7 @@ readsList = Channel

 // Define regular variables
 pairedEnd = params.pairedEnd
+blacklist = params.blacklist
 designFile = params.designFile
 genomeSize = params.genomeSize
 chromSizes = params.chromSizes
@@ -43,6 +46,7 @@ outDir = params.outDir
 references = params.references
 multiqc = params.multiqc
 gtfFile = params.gtfFile
+blacklistFile = params.blacklistFile

 process checkDesignFile {

@@ -508,16 +512,32 @@ process consensusPeaks {

  script:
    if (params.astrocyte == true) {
-      """
-      module load python/3.6.1-2-anaconda
-      module load bedtools/2.26.0
-      python3 ${baseDir}/scripts/overlap_peaks.py -d ${peaksDesign} -f ${preDiffDesign}
-      """
+        if (blacklist) {
+          """
+          module load python/3.6.1-2-anaconda
+          module load bedtools/2.26.0
+          python3 ${baseDir}/scripts/overlap_peaks.py -d ${peaksDesign} -f ${preDiffDesign} -b ${blacklistFile}
+          """
+        }
+        else {
+          """
+          module load python/3.6.1-2-anaconda
+          module load bedtools/2.26.0
+          python3 ${baseDir}/scripts/overlap_peaks.py -d ${peaksDesign} -f ${preDiffDesign}
+          """
+        }
    }
    else {
-      """
-      python3 ${baseDir}/scripts/overlap_peaks.py -d ${peaksDesign} -f ${preDiffDesign}
-      """
+        if (blacklist) {
+          """
+          python3 ${baseDir}/scripts/overlap_peaks.py -d ${peaksDesign} -f ${preDiffDesign} -b ${blacklistFile}
+          """
+        }
+        else {
+          """
+          python3 ${baseDir}/scripts/overlap_peaks.py -d ${peaksDesign} -f ${preDiffDesign}
+          """
+        }
    }

 }
@@ -543,7 +563,7 @@ process experimentQC {

  output:
    file '*.{pdf,npz}' into deepToolsStats
-    file '*.fragment_length_count.txt' into fdlStats
+    file '*.fragment_length_count.txt' optional true into fdlStats
    file('version_*.txt') into experimentQCVersions
    file '*.FRiPscore.tsv' into FRiPscore
    file '*.TSSenrichment.tsv' into TSSenrichment

--- a/workflow/scripts/overlap_peaks.py
+++ b/workflow/scripts/overlap_peaks.py
@@ -38,6 +38,10 @@ def get_args():
                        help="The design file of with bam files (tsv format).",
                        required=True)

+    parser.add_argument('-b', '--blacklist',
+                        help="Bed file of blacklisted regions to remove",
+                        required=False)
+
    args = parser.parse_args()
    return args

@@ -173,10 +177,27 @@ def overlap(experiment, design):
    return os.path.abspath(overlapping_peaks_fn)


+def blacklist_peaks(experiment, blacklist):
+    '''Remove blacklist peaks from replicated peaks'''
+
+    logger.info("Removing blacklist regions from replicated peaks")
+
+    # Assign output
+    blpo_filename = "%s.replicated_noblacklist.narrowPeak" % (experiment)
+
+    # Remove
+    steps = [
+        "bedtools intersect -v -a %s -b %s" % (experiment, blacklist),
+        "awk 'BEGIN{OFS='\t'} {if ($5>1000) $5=1000; print $0}'",
+        ]
+    out, err = utils.run_pipe(steps, blpo_filename)
+
+
 def main():
    args = get_args()
    design = args.design
    files = args.files
+    blacklist = args.files

    # Create a file handler
    handler = logging.FileHandler('consensus_peaks.log')
@@ -207,6 +228,10 @@ def main():

    design_diff.to_csv("design_exQC.tsv", header=True, sep='\t', index=False)

+    # Remove blacklist regions; if blacklist = True
+    if blacklist:
+        for experiment, df_experiment in design_peaks_df.groupby('experiment_id'):
+            bl_peaks = blacklist_peaks(experiment, blacklist)

 if __name__ == '__main__':
    main()
--- a/workflow/tests/test_overlap_peaks.py
+++ b/workflow/tests/test_overlap_peaks.py
@@ -44,3 +44,6 @@ def test_overlap_peaks_pairedend_mouse():
    assert os.path.exists(os.path.join(test_output_path, 'ENCSR451NAE.rejected.narrowPeak'))
    peak_file = test_output_path + 'ENCSR451NAE.replicated.narrowPeak'
    assert utils.count_lines(peak_file) >= 78039
+    assert os.path.exists(os.path.join(test_output_path, 'ENCSR451NAE.replicated_noblacklist.narrowPeak'))
+    bl_peak_file = test_output_path + 'ENCSR451NAE.replicated_noblacklist.narrowPeak'
+    assert utils.count_lines(bl_peak_file) >= 77532