Commit 7b79b436 authored by Venkat Malladi

Fix merge conflicts.

parents cf41e35b 9eb09c23
Merge request !25: Resolve "Test Astrocyte"
Showing with 632 additions and 49 deletions
@@ -97,3 +97,6 @@ ENV/
# mypy
.mypy_cache/
# Mac OS
.DS_Store
before_script:
- module add python/3.6.1-2-anaconda
-    - pip install --user pytest-pythonpath pytest-cov
-    - module load nextflow/0.31.0
-    - ln -s /work/BICF/s163035/chipseq/*fastq.gz test_data/
+    - pip install --user pytest-pythonpath==0.7.1 pytest-cov==2.5.1
+    - module load nextflow/0.31.0
+    - ln -s /project/shared/bicf_workflow_ref/workflow_testdata/chipseq/*fastq.gz test_data/
stages:
- unit
- astrocyte
- single
- multiple
- skip
user_configuration:
stage: unit
@@ -28,6 +29,8 @@ astrocyte:
single_end_mouse:
stage: single
only:
- master
script:
- nextflow run workflow/main.nf -resume
- pytest -m singleend
@@ -36,6 +39,10 @@ single_end_mouse:
paired_end_human:
stage: single
only:
- branches
except:
- master
script:
- nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_ENCSR729LGA_PE.txt" --genome 'GRCh38' --pairedEnd true -resume
- pytest -m pairedend
@@ -44,6 +51,10 @@ paired_end_human:
single_end_diff:
stage: multiple
only:
- branches
except:
- master
script:
- nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_SE.txt" --genome 'GRCm38' -resume
- pytest -m singlediff
@@ -51,9 +62,21 @@ single_end_diff:
expire_in: 2 days
paired_end_diff:
only:
- master
stage: multiple
script:
- nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_PE.txt" --genome 'GRCh38' --pairedEnd true -resume
- pytest -m paireddiff
artifacts:
expire_in: 2 days
single_end_skip:
stage: skip
only:
- master
script:
- nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_SE.txt" --genome 'GRCm38' --skipDiff true --skipMotif true -resume
- pytest -m singleskip_true
artifacts:
expire_in: 2 days
### References
1. **python**:
* Anaconda (Anaconda Software Distribution, [https://anaconda.com](https://anaconda.com))
2. **trimgalore**:
* trimgalore [https://github.com/FelixKrueger/TrimGalore](https://github.com/FelixKrueger/TrimGalore)
3. **cutadapt**:
* Marcel, M. 2011. Cutadapt removes adapter sequences from high-throughput sequencing reads. EMBnet.journal 17(1):10-12. doi:[10.14806/ej.17.1.200](http://dx.doi.org/10.14806/ej.17.1.200)
4. **bwa**:
* Li H., and R. Durbin. 2009. Fast and accurate short read alignment with Burrows-Wheeler Transform. Bioinformatics 25: 1754-60. doi:[10.1093/bioinformatics/btp324](http://dx.doi.org/10.1093/bioinformatics/btp324)
5. **samtools**:
* Li H., B. Handsaker, A. Wysoker, T. Fennell, J. Ruan, N. Homer, G. Marth, G. Abecasis, R. Durbin, and 1000 Genome Project Data Processing Subgroup. 2009. The Sequence alignment/map (SAM) format and SAMtools. Bioinformatics 25: 2078-9. doi:[10.1093/bioinformatics/btp352](http://dx.doi.org/10.1093/bioinformatics/btp352)
6. **sambamba**:
* Tarasov, A., A. J. Vilella, E. Cuppen, I. J. Nijman, and P. Prins. 2015. Sambamba: fast processing of NGS alignment formats. Bioinformatics 31(12): 2032-2034. doi:[10.1093/bioinformatics/btv098](http://dx.doi.org/10.1093/bioinformatics/btv098)
7. **bedtools**:
* Quinlan, A. R., and I. M. Hall. 2010. BEDTools: a flexible suite of utilities for comparing genomic features. Bioinformatics 26(6): 841-842. doi:[10.1093/bioinformatics/btq033](http://dx.doi.org/10.1093/bioinformatics/btq033)
8. **deeptools**:
* Ramírez, F., D. P. Ryan, B. Grüning, V. Bhardwaj, F. Kilpert, A. S. Richter, S. Heyne, F. Dündar, and T. Manke. 2016. deepTools2: a next generation web server for deep-sequencing data analysis. Nucleic Acids Research 44: W160-165. doi:[10.1093/nar/gkw257](http://dx.doi.org/10.1093/nar/gkw257)
9. **phantompeakqualtools**:
* Landt S. G., G. K. Marinov, A. Kundaje, et al. 2012. ChIP-seq guidelines and practices of the ENCODE and modENCODE consortia. Genome Res 9: 1813-31. doi:[10.1101/gr.136184.111](http://dx.doi.org/10.1101/gr.136184.111)
* Kharchenko P. K., M. Y. Tolstorukov, and P. J. Park. 2008. Design and analysis of ChIP-seq experiments for DNA-binding proteins. Nat Biotechnol 26(12): 1351-1359. doi:[10.1038/nbt.1508](https://dx.doi.org/10.1038/nbt.1508)
10. **macs**:
* Zhang Y., T. Liu, C. A. Meyer, J. Eeckhoute, D. S. Johnson, B. E. Bernstein, C. Nusbaum, R. M. Myers, M. Brown, W. Li, and X. S. Liu. 2008. Model-based Analysis of ChIP-Seq (MACS). Genome Biol 9: R137. doi:[10.1186/gb-2008-9-9-r137](https://dx.doi.org/10.1186/gb-2008-9-9-r137)
11. **UCSC(bedGraphToBigWig)**:
* Kent W. J., A. S. Zweig, G. Barber, A. S. Hinrichs, and D. Karolchik. 2010. BigWig and BigBed: enabling browsing of large distributed data sets. Bioinformatics 26(17): 2204-2207. doi:[10.1093/bioinformatics/btq351](https://dx.doi.org/10.1093/bioinformatics/btq351)
12. **R**:
* R Core Team 2014. R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL:[http://www.R-project.org/](http://www.R-project.org/).
13. **meme**:
* Bailey T. L., M. Bodén, F. A. Buske, M. Frith, C. E. Grant, L. Clementi, J. Ren, W. W. Li, and W. S. Noble. 2009. MEME SUITE: tools for motif discovery and searching. Nucleic Acids Research 37: W202-W208. doi:[10.1093/nar/gkp335](https://dx.doi.org/10.1093/nar/gkp335)
* Machanick P., and T. L. Bailey. 2011. MEME-ChIP: motif analysis of large DNA datasets. Bioinformatics 27(12): 1696-1697. doi:[10.1093/bioinformatics/btr189](https://dx.doi.org/10.1093/bioinformatics/btr189)
14. **ChIPseeker**:
* Yu G., L. Wang, and Q. He. 2015. ChIPseeker: an R/Bioconductor package for ChIP peak annotation, comparison and visualization. Bioinformatics 31(14): 2382-2383. doi:[10.1093/bioinformatics/btv145](https://dx.doi.org/10.1093/bioinformatics/btv145)
15. **DiffBind**:
* Stark R., and G. Brown. 2011. DiffBind: differential binding analysis of ChIP-Seq peak data. [http://bioconductor.org/packages/release/bioc/vignettes/DiffBind/inst/doc/DiffBind.pdf](http://bioconductor.org/packages/release/bioc/vignettes/DiffBind/inst/doc/DiffBind.pdf). doi:[10.18129/B9.bioc.DiffBind](https://dx.doi.org/10.18129/B9.bioc.DiffBind)
* Ross-Innes C. S., R. Stark, A. E. Teschendorff, K. A. Holmes, H. R. Ali, M. J. Dunning, G. D. Brown, O. Gojis, I. O. Ellis, A. R. Green, S. Ali, S. Chin, C. Palmieri, C. Caldas, and J. S. Carroll. 2012. Differential oestrogen receptor binding is associated with clinical outcome in breast cancer. Nature 481: 389-393. doi:[10.1038/nature10730](https://dx.doi.org/10.1038/nature10730)
@@ -41,7 +41,7 @@ process {
executor = 'local'
}
withName: callPeaksMACS {
- module = ['python/3.6.1-2-anaconda', 'macs/2.1.0-20151222', 'phantompeakqualtools/1.2', 'UCSC_userApps/v317', 'bedtools/2.26.0']
+ module = ['python/3.6.1-2-anaconda', 'macs/2.1.0-20151222', 'UCSC_userApps/v317', 'bedtools/2.26.0', 'phantompeakqualtools/1.2']
queue = '128GB,256GB,256GBv1'
}
withName: consensusPeaks {
@@ -60,7 +60,10 @@ process {
module = ['python/3.6.1-2-anaconda', 'meme/4.11.1-gcc-openmpi', 'bedtools/2.26.0']
cpus = 32
}
withName: softwareReport {
module = ['python/3.6.1-2-anaconda', 'pandoc/2.7']
executor = 'local'
}
}
params {
......
# Title to use for the report.
title: BICF ChIP-seq Analysis Report
report_comment: >
This report has been generated by the <a href="https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/" target="_blank">BICF/chipseq_analysis</a>
pipeline.
report_section_order:
software_versions:
order: -1000
software_references:
order: -1000
extra_fn_clean_exts:
- '_R1'
- '_R2'
- 'pbc.qc'
fn_ignore_files:
- '*dedup.flagstat.qc'
custom_data:
library_complexity:
file_format: 'tsv'
id: 'library_complexity'
contents: 'TotalReadPairs DistinctReadPairs OneReadPair TwoReadPairs NRF PBC1 PBC2'
section_name: 'Library complexity'
plot_type: 'generalstats'
sp:
phantompeakqualtools/out:
fn: '*cc.qc'
library_complexity:
fn: '*pbc.qc'
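The `library_complexity` section above expects whitespace-separated files whose columns match the seven names declared in `contents`; the file itself carries only the data row, and MultiQC supplies the column names. A hypothetical `*.pbc.qc` row (all values invented for illustration) would look like:

```
24061749 22440079 21569921 634054 0.93 0.96 34.02
```

Here NRF, PBC1, and PBC2 are the last three fields, matching the order given in `contents`.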
@@ -12,6 +12,9 @@ params.outDir= "$baseDir/output"
params.extendReadsLen = 100
params.topPeakCount = 600
params.astrocyte = 'false'
params.skipDiff = false
params.skipMotif = false
params.references = "$baseDir/../docs/references.md"
// Assign variables if astrocyte
params.genome = 'GRCm38'
@@ -59,6 +62,9 @@ cutoffRatio = params.cutoffRatio
outDir = params.outDir
extendReadsLen = params.extendReadsLen
topPeakCount = params.topPeakCount
skipDiff = params.skipDiff
skipMotif = params.skipMotif
references = params.references
if (params.pairedEnd == 'false'){
pairedEnd = false
@@ -110,7 +116,7 @@ rawReads = designFilePaths
process trimReads {
tag "$sampleId-$replicate"
- publishDir "$outDir/${task.process}", mode: 'copy'
+ publishDir "$outDir/${task.process}/${sampleId}", mode: 'copy'
input:
@@ -119,7 +125,8 @@ process trimReads {
output:
set sampleId, file('*.fq.gz'), experimentId, biosample, factor, treatment, replicate, controlId into trimmedReads
- file('*trimming_report.txt') into trimgalore_results
+ file('*trimming_report.txt') into trimgaloreResults
+ file('version_*.txt') into trimReadsVersions
script:
@@ -140,7 +147,7 @@ process trimReads {
process alignReads {
tag "$sampleId-$replicate"
- publishDir "$outDir/${task.process}", mode: 'copy'
+ publishDir "$outDir/${task.process}/${sampleId}", mode: 'copy'
input:
@@ -151,6 +158,7 @@ process alignReads {
set sampleId, file('*.bam'), experimentId, biosample, factor, treatment, replicate, controlId into mappedReads
file '*.flagstat.qc' into mappedReadsStats
file('version_*.txt') into alignReadsVersions
script:
@@ -171,7 +179,7 @@ process alignReads {
process filterReads {
tag "$sampleId-$replicate"
- publishDir "$outDir/${task.process}", mode: 'copy'
+ publishDir "$outDir/${task.process}/${sampleId}", mode: 'copy'
input:
@@ -184,6 +192,7 @@ process filterReads {
file '*.flagstat.qc' into dedupReadsStats
file '*.pbc.qc' into dedupReadsComplexity
file '*.dedup.qc' into dupReads
file('version_*.txt') into filterReadsVersions
script:
@@ -218,7 +227,8 @@ process experimentQC {
output:
- file '*.{png,npz}' into deepToolsStats
+ file '*.{pdf,npz}' into experimentQCStats
+ file('version_*.txt') into experimentQCVersions
script:
@@ -232,7 +242,7 @@ process experimentQC {
process convertReads {
tag "$sampleId-$replicate"
- publishDir "$outDir/${task.process}", mode: 'copy'
+ publishDir "$outDir/${task.process}/${sampleId}", mode: 'copy'
input:
@@ -241,6 +251,7 @@ process convertReads {
output:
set sampleId, file('*.tagAlign.gz'), file('*.bed{pe,se}.gz'), experimentId, biosample, factor, treatment, replicate, controlId into tagReads
file('version_*.txt') into convertReadsVersions
script:
@@ -261,7 +272,7 @@ process convertReads {
process crossReads {
tag "$sampleId-$replicate"
- publishDir "$outDir/${task.process}", mode: 'copy'
+ publishDir "$outDir/${task.process}/${sampleId}", mode: 'copy'
input:
@@ -270,7 +281,8 @@ process crossReads {
output:
set sampleId, seTagAlign, tagAlign, file('*.cc.qc'), experimentId, biosample, factor, treatment, replicate, controlId into xcorReads
- set file('*.cc.qc'), file('*.cc.plot.pdf') into xcorReadsStats
+ set file('*.cc.qc'), file('*.cc.plot.pdf') into crossReadsStats
+ file('version_*.txt') into crossReadsVersions
script:
@@ -354,7 +366,7 @@ experimentRows = experimentPoolObjs
process callPeaksMACS {
tag "$sampleId-$replicate"
- publishDir "$outDir/${task.process}", mode: 'copy'
+ publishDir "$outDir/${task.process}/${experimentId}/${replicate}", mode: 'copy'
input:
set sampleId, tagAlign, xcor, experimentId, biosample, factor, treatment, replicate, controlId, controlTagAlign from experimentRows
@@ -362,7 +374,8 @@ process callPeaksMACS {
output:
set sampleId, file('*.narrowPeak'), file('*.fc_signal.bw'), file('*.pvalue_signal.bw'), experimentId, biosample, factor, treatment, replicate, controlId into experimentPeaks
- file '*.xls' into summit
+ file '*.xls' into callPeaksMACSsummit
+ file('version_*.txt') into callPeaksMACSVersions
script:
@@ -403,6 +416,7 @@ process consensusPeaks {
file 'design_diffPeaks.csv' into designDiffPeaks
file 'design_annotatePeaks.tsv' into designAnnotatePeaks, designMotifSearch
file 'unique_experiments.csv' into uniqueExperiments
file('version_*.txt') into consensusPeaksVersions
script:
@@ -415,7 +429,7 @@ process consensusPeaks {
// Annotate Peaks
process peakAnnotation {
- publishDir "$baseDir/output/${task.process}", mode: 'copy'
+ publishDir "$outDir/${task.process}", mode: 'copy'
input:
@@ -424,6 +438,7 @@ process peakAnnotation {
output:
file "*chipseeker*" into peakAnnotation
file('version_*.txt') into peakAnnotationVersions
script:
@@ -433,10 +448,10 @@ process peakAnnotation {
}
// Motif Search Peaks
process motifSearch {
- publishDir "$baseDir/output/${task.process}", mode: 'copy'
+ publishDir "$outDir/${task.process}", mode: 'copy'
input:
@@ -446,6 +461,10 @@ process motifSearch {
file "*memechip" into motifSearch
file "*narrowPeak" into filteredPeaks
file('version_*.txt') into motifSearchVersions
when:
!skipMotif
script:
@@ -461,7 +480,7 @@ uniqueExperimentsList = uniqueExperiments
// Calculate Differential Binding Activity
process diffPeaks {
- publishDir "$baseDir/output/${task.process}", mode: 'copy'
+ publishDir "$outDir/${task.process}", mode: 'copy'
input:
@@ -474,12 +493,47 @@ process diffPeaks {
file '*_diffbind.csv' into diffPeaksCounts
file '*.pdf' into diffPeaksStats
file 'normcount_peaksets.txt' into normCountPeaks
file('version_*.txt') into diffPeaksVersions
when:
- noUniqueExperiments > 1
+ noUniqueExperiments > 1 && !skipDiff
script:
"""
Rscript $baseDir/scripts/diff_peaks.R $designDiffPeaks
"""
}
// Collect Software Versions and references
process softwareReport {
publishDir "$outDir/${task.process}", mode: 'copy'
input:
file ('trimReads_vf/*') from trimReadsVersions.first()
file ('alignReads_vf/*') from alignReadsVersions.first()
file ('filterReads_vf/*') from filterReadsVersions.first()
file ('convertReads_vf/*') from convertReadsVersions.first()
file ('crossReads_vf/*') from crossReadsVersions.first()
file ('callPeaksMACS_vf/*') from callPeaksMACSVersions.first()
file ('consensusPeaks_vf/*') from consensusPeaksVersions.first()
file ('peakAnnotation_vf/*') from peakAnnotationVersions.first()
file ('motifSearch_vf/*') from motifSearchVersions.first().ifEmpty()
file ('diffPeaks_vf/*') from diffPeaksVersions.first().ifEmpty()
file ('experimentQC_vf/*') from experimentQCVersions.first()
output:
file('software_versions_mqc.yaml') into softwareVersions
file('software_references_mqc.yaml') into softwareReferences
script:
"""
echo $workflow.nextflow.version > version_nextflow.txt
python3 $baseDir/scripts/generate_references.py -r $references -o software_references
python3 $baseDir/scripts/generate_versions.py -o software_versions
"""
}
@@ -35,6 +35,11 @@ if(genome_assembly=='GRCh37') {
annodb <- 'org.Hs.eg.db'
}
# Output version of ChIPseeker
chipseeker_version = packageVersion('ChIPseeker')
write.table(paste("Version", chipseeker_version), file = "version_ChIPseeker.txt", sep = "\t",
row.names = FALSE, col.names = FALSE)
# Load design file
design <- read.csv(design_file, sep ='\t')
files <- as.list(as.character(design$Peaks))
......
@@ -5,6 +5,7 @@
import os
import argparse
import shutil
import subprocess
import logging
import utils
from xcor import xcor as calculate_xcor
@@ -69,16 +70,19 @@ def check_tools():
logger.info('Checking for required libraries and components on this system')
r_path = shutil.which("R")
if r_path:
logger.info('Found R: %s', r_path)
else:
logger.error('Missing R')
raise Exception('Missing R')
macs_path = shutil.which("macs2")
- if r_path:
+ if macs_path:
logger.info('Found MACS2: %s', macs_path)
# Get Version
macs_version_command = "macs2 --version"
macs_version = subprocess.check_output(macs_version_command, shell=True, stderr=subprocess.STDOUT)
# Write to file
macs_file = open("version_macs.txt", "wb")
macs_file.write(macs_version)
macs_file.close()
else:
logger.error('Missing MACS2')
raise Exception('Missing MACS2')
@@ -86,6 +90,18 @@ def check_tools():
bg_bw_path = shutil.which("bedGraphToBigWig")
if bg_bw_path:
logger.info('Found bedGraphToBigWig: %s', bg_bw_path)
# Get Version
bg_bw_version_command = "bedGraphToBigWig"
bg_bw_version = b''  # default, so the write below cannot hit an unbound name
try:
subprocess.check_output(bg_bw_version_command, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
bg_bw_version = e.output
# Write to file
bg_bw_file = open("version_bedGraphToBigWig.txt", "wb")
bg_bw_file.write(bg_bw_version)
bg_bw_file.close()
else:
logger.error('Missing bedGraphToBigWig')
raise Exception('Missing bedGraphToBigWig')
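Some of the wrapped tools (bedGraphToBigWig here, bwa and sambamba in the scripts below) print their usage banner and exit nonzero when run without arguments, so the version text must be recovered from the raised `CalledProcessError` rather than from a normal return value. A minimal, self-contained sketch of that pattern, using a stand-in shell command rather than the real tool:

```python
import subprocess

# Stand-in for a tool that prints its banner but exits with a nonzero status
# (the command and version string here are invented for illustration).
command = "echo 'toolname v1.0'; exit 255"

version = b''
try:
    subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
    version = e.output  # banner text captured from the failed call

print(version.decode().strip())
```

Initializing `version` before the `try` block guards against an unbound name if the command unexpectedly succeeds.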
@@ -93,6 +109,16 @@ def check_tools():
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
@@ -109,7 +135,6 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
fragment_length = frag_lengths.split(',')[0] # grab first value
logger.info("Fraglen %s", fragment_length)
# Generate narrow peaks and preliminary signal tracks
command = 'macs2 callpeak ' + \
@@ -129,7 +154,6 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
narrowpeak_fn = '%s.narrowPeak' % (prefix)
clipped_narrowpeak_fn = 'clipped-%s' % (narrowpeak_fn)
steps = ['slopBed -i %s -g %s -b 0' % (int_narrowpeak_fn, chrom_sizes),
'bedClip stdin %s %s' % (chrom_sizes, clipped_narrowpeak_fn)]
......
@@ -54,6 +54,15 @@ def check_tools():
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
@@ -61,6 +70,15 @@ def check_tools():
samtools_path = shutil.which("samtools")
if samtools_path:
logger.info('Found samtools: %s', samtools_path)
# Get Version
samtools_version_command = "samtools --version"
samtools_version = subprocess.check_output(samtools_version_command, shell=True)
# Write to file
samtools_file = open("version_samtools.txt", "wb")
samtools_file.write(samtools_version)
samtools_file.close()
else:
logger.error('Missing samtools')
raise Exception('Missing samtools')
......
@@ -11,6 +11,11 @@ if (length(args) != 1) {
stop("Usage: diff_peaks.R annotate_design.tsv ", call.=FALSE)
}
# Output version of DiffBind
diffbind_version = packageVersion('DiffBind')
write.table(paste("Version", diffbind_version), file = "version_DiffBind.txt", sep = "\t",
row.names = FALSE, col.names = FALSE)
# Build DBA object from design file
data <- dba(sampleSheet=args[1])
data <- dba.count(data)
......
@@ -52,6 +52,15 @@ def check_tools():
deeptools_path = shutil.which("deeptools")
if deeptools_path:
logger.info('Found deeptools: %s', deeptools_path)
# Get Version
deeptools_version_command = "deeptools --version"
deeptools_version = subprocess.check_output(deeptools_version_command, shell=True, stderr=subprocess.STDOUT)
# Write to file
deeptools_file = open("version_deeptools.txt", "wb")
deeptools_file.write(deeptools_version)
deeptools_file.close()
else:
logger.error('Missing deeptools')
raise Exception('Missing deeptools')
@@ -77,10 +86,10 @@ def generate_read_summary(design, extension):
return mbs_filename
- def check_correlation(mbs):
+ def check_spearman_correlation(mbs):
'''Check Spearman pairwise correlation of samples based on read counts.'''
- spearman_filename = 'heatmap_SpearmanCorr.png'
+ spearman_filename = 'heatmap_SpearmanCorr.pdf'
spearman_params = "--corMethod spearman --skipZero" + \
" --plotTitle \"Spearman Correlation of Read Counts\"" + \
" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers"
@@ -96,12 +105,31 @@ def check_correlation(mbs):
out, err = spearman_correlation.communicate()
def check_pearson_correlation(mbs):
'''Check Pearson pairwise correlation of samples based on read counts.'''
pearson_filename = 'heatmap_PearsonCorr.pdf'
pearson_params = "--corMethod pearson --skipZero" + \
" --plotTitle \"Pearson Correlation of Read Counts\"" + \
" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers"
pearson_command = (
"plotCorrelation -in %s %s -o %s"
% (mbs, pearson_params, pearson_filename)
)
logger.info("Running deeptools with %s", pearson_command)
pearson_correlation = subprocess.Popen(pearson_command, shell=True)
out, err = pearson_correlation.communicate()
def check_coverage(design, extension):
'''Assess the sequencing depth of samples.'''
bam_files = ' '.join(design['bam_reads'])
labels = ' '.join(design['sample_id'])
- coverage_filename = 'coverage.png'
+ coverage_filename = 'coverage.pdf'
coverage_params = "-n 1000000 --plotTitle \"Sample Coverage\"" + \
" --ignoreDuplicates --minMappingQuality 10"
@@ -137,7 +165,7 @@ def update_controls(design):
def check_enrichment(sample_id, control_id, sample_reads, control_reads, extension):
'''Assess the enrichment per sample.'''
- fingerprint_filename = sample_id + '_fingerprint.png'
+ fingerprint_filename = sample_id + '_fingerprint.pdf'
fingerprint_command = (
"plotFingerprint -b %s %s --extendReads %d --labels %s %s --plotFile %s"
@@ -167,7 +195,8 @@ def main():
# Run correlation
mbs_filename = generate_read_summary(design_df, extension)
- check_correlation(mbs_filename)
+ check_spearman_correlation(mbs_filename)
+ check_pearson_correlation(mbs_filename)
# Run coverage
check_coverage(design_df, extension)
......
#!/usr/bin/env python
'''Make header for HTML of references.'''
import argparse
import subprocess
import shlex
import logging
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-r', '--reference',
help="The reference file (markdown format).",
required=True)
parser.add_argument('-o', '--output',
help="The out file name.",
default='references')
args = parser.parse_args()
return args
def main():
args = get_args()
reference = args.reference
output = args.output
out_filename = output + '_mqc.yaml'
# Header for HTML
print('''
id: 'Software References'
section_name: 'Software References'
description: 'This section describes references for the tools used.'
plot_type: 'html'
data: |
'''
, file = open(out_filename, "w")
)
# Turn Markdown into HTML
references_html = 'bash -c "pandoc -p {} | sed \'s/^/ /\' >> {}"'
references_html = references_html.format(reference, out_filename)
subprocess.check_call(shlex.split(references_html))
if __name__ == '__main__':
main()
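The pandoc-plus-sed pipeline above converts the Markdown references to HTML and then indents every line so the fragment nests under the YAML `data: |` key. The indentation step alone, in plain Python (the HTML fragment is a made-up stand-in for pandoc's output, and four spaces is an assumed indent width since the exact count in the sed call was lost in rendering):

```python
# Hypothetical HTML fragment standing in for pandoc's output.
html = "<ol>\n<li>python</li>\n<li>trimgalore</li>\n</ol>"

# Mirror of the sed step: prefix each line so the block nests
# under the YAML `data: |` key.
indented = "\n".join("    " + line for line in html.splitlines())

print(indented)
```

Any consistent indent deeper than the `data:` key keeps the YAML block scalar valid.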
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''Make YAML of software versions.'''
from __future__ import print_function
from collections import OrderedDict
import re
import os
import logging
import glob
import argparse
import numpy as np
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
SOFTWARE_REGEX = {
'Nextflow': ['version_nextflow.txt', r"(\S+)"],
'Trim Galore!': ['trimReads_vf/version_trimgalore.txt', r"version (\S+)"],
'Cutadapt': ['trimReads_vf/version_cutadapt.txt', r"Version (\S+)"],
'BWA': ['alignReads_vf/version_bwa.txt', r"Version: (\S+)"],
'Samtools': ['alignReads_vf/version_samtools.txt', r"samtools (\S+)"],
'Sambamba': ['filterReads_vf/version_sambamba.txt', r"sambamba (\S+)"],
'BEDTools': ['convertReads_vf/version_bedtools.txt', r"bedtools v(\S+)"],
'R': ['crossReads_vf/version_r.txt', r"R version (\S+)"],
'SPP': ['crossReads_vf/version_spp.txt', r"\[1\] ‘(1.14)’"],
'MACS2': ['callPeaksMACS_vf/version_macs.txt', r"macs2 (\S+)"],
'bedGraphToBigWig': ['callPeaksMACS_vf/version_bedGraphToBigWig.txt', r"bedGraphToBigWig v (\S+)"],
'ChIPseeker': ['peakAnnotation_vf/version_ChIPseeker.txt', r"Version (\S+)\""],
'MEME-ChIP': ['motifSearch_vf/version_memechip.txt', r"Version (\S+)"],
'DiffBind': ['diffPeaks_vf/version_DiffBind.txt', r"Version (\S+)\""],
'deepTools': ['experimentQC_vf/version_deeptools.txt', r"deeptools (\S+)"],
}
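Each entry in the SOFTWARE_REGEX table above pairs a version file with a capture pattern. The sketch below applies one of those patterns to sample text (the `samtools 1.6` line is a hypothetical excerpt of what `samtools --version` prints):

```python
import re

# Pattern taken from the SOFTWARE_REGEX table; the sample text is a
# hypothetical excerpt of `samtools --version` output.
pattern = r"samtools (\S+)"
sample = "samtools 1.6\nUsing htslib 1.6\n"

match = re.search(pattern, sample)
if match:
    # Formatted the same way the results dict stores versions below.
    version = "v{}".format(match.group(1))
```

If no pattern matches, the entry keeps its "Not Run" placeholder in the results dict.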
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-o', '--output',
help="The out file name.",
required=True)
parser.add_argument('-t', '--test',
help='Used for testing purposes',
default=False,
action='store_true')
args = parser.parse_args()
return args
def check_files(files, test):
'''Check if version files are found.'''
logger.info("Running file check.")
software_files = np.array(list(SOFTWARE_REGEX.values()))[:,0]
extra_files = set(files) - set(software_files)
if len(extra_files) > 0 and test:
logger.error('Missing regex: %s', list(extra_files))
raise Exception("Missing regex: %s" % list(extra_files))
def main():
args = get_args()
output = args.output
test = args.test
out_filename = output + '_mqc.yaml'
results = OrderedDict()
results['Nextflow'] = '<span style="color:#999999;\">Not Run</span>'
results['Trim Galore!'] = '<span style="color:#999999;\">Not Run</span>'
results['Cutadapt'] = '<span style="color:#999999;\">Not Run</span>'
results['BWA'] = '<span style="color:#999999;\">Not Run</span>'
results['Samtools'] = '<span style="color:#999999;\">Not Run</span>'
results['Sambamba'] = '<span style="color:#999999;\">Not Run</span>'
results['BEDTools'] = '<span style="color:#999999;\">Not Run</span>'
results['R'] = '<span style="color:#999999;\">Not Run</span>'
results['SPP'] = '<span style="color:#999999;\">Not Run</span>'
results['MACS2'] = '<span style="color:#999999;\">Not Run</span>'
results['bedGraphToBigWig'] = '<span style="color:#999999;\">Not Run</span>'
results['ChIPseeker'] = '<span style="color:#999999;\">Not Run</span>'
results['MEME-ChIP'] = '<span style="color:#999999;\">Not Run</span>'
results['DiffBind'] = '<span style="color:#999999;\">Not Run</span>'
results['deepTools'] = '<span style="color:#999999;\">Not Run</span>'
# list all files
files = glob.glob('**/*.txt', recursive=True)
# Check for version files:
check_files(files, test)
# Search each file using its regex
for k, v in SOFTWARE_REGEX.items():
if os.path.isfile(v[0]):
with open(v[0]) as x:
versions = x.read()
match = re.search(v[1], versions)
if match:
results[k] = "v{}".format(match.group(1))
# Dump to YAML
print(
'''
id: 'Software Versions'
section_name: 'Software Versions'
section_href: 'https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/'
plot_type: 'html'
description: 'are collected at run time from the software output.'
data: |
<dl class="dl-horizontal">
'''
, file = open(out_filename, "w"))
for k, v in results.items():
print(" <dt>{}</dt><dd>{}</dd>".format(k, v), file = open(out_filename, "a"))
print(" </dl>", file = open(out_filename, "a"))
if __name__ == '__main__':
main()
@@ -62,6 +62,15 @@ def check_tools():
samtools_path = shutil.which("samtools")
if samtools_path:
logger.info('Found samtools: %s', samtools_path)
# Get Version
samtools_version_command = "samtools --version"
samtools_version = subprocess.check_output(samtools_version_command, shell=True)
# Write to file
samtools_file = open("version_samtools.txt", "wb")
samtools_file.write(samtools_version)
samtools_file.close()
else:
logger.error('Missing samtools')
raise Exception('Missing samtools')
@@ -69,6 +78,18 @@ def check_tools():
sambamba_path = shutil.which("sambamba")
if sambamba_path:
logger.info('Found sambamba: %s', sambamba_path)
# Get Version
sambamba_version_command = "sambamba"
sambamba_version = b''  # default, so the write below cannot hit an unbound name
try:
subprocess.check_output(sambamba_version_command, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
sambamba_version = e.output
# Write to file
sambamba_file = open("version_sambamba.txt", "wb")
sambamba_file.write(sambamba_version)
sambamba_file.close()
else:
logger.error('Missing sambamba')
raise Exception('Missing sambamba')
@@ -76,6 +97,15 @@ def check_tools():
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
@@ -167,7 +197,6 @@ def dedup_mapped(bam, bam_basename, paired):
shlex.split(sambamba_markdup_command),
stderr=temp_file)
# Remove duplicates
final_bam_prefix = bam_basename + ".dedup"
final_bam_filename = final_bam_prefix + ".bam"
......
@@ -70,6 +70,18 @@ def check_tools():
bwa_path = shutil.which("bwa")
if bwa_path:
logger.info('Found bwa: %s', bwa_path)
# Get Version
bwa_version_command = "bwa"
bwa_version = b''  # default, so the write below cannot hit an unbound name
try:
subprocess.check_output(bwa_version_command, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
bwa_version = e.output
# Write to file
bwa_file = open("version_bwa.txt", "wb")
bwa_file.write(bwa_version)
bwa_file.close()
else:
logger.error('Missing bwa')
raise Exception('Missing bwa')
@@ -77,6 +89,15 @@ def check_tools():
samtools_path = shutil.which("samtools")
if samtools_path:
logger.info('Found samtools: %s', samtools_path)
# Get Version
samtools_version_command = "samtools --version"
samtools_version = subprocess.check_output(samtools_version_command, shell=True)
# Write to file
samtools_file = open("version_samtools.txt", "wb")
samtools_file.write(samtools_version)
samtools_file.close()
else:
logger.error('Missing samtools')
raise Exception('Missing samtools')
......
@@ -6,6 +6,7 @@ import os
import argparse
import logging
import shutil
import subprocess
from multiprocessing import Pool
import pandas as pd
import utils
@@ -55,6 +56,7 @@ def get_args():
# Functions
def check_tools():
'''Checks for required components on user system'''
@@ -63,6 +65,15 @@ def check_tools():
meme_path = shutil.which("meme")
if meme_path:
logger.info('Found meme: %s', meme_path)
# Get Version
memechip_version_command = "meme-chip --version"
memechip_version = subprocess.check_output(memechip_version_command, shell=True)
# Write to file
meme_file = open("version_memechip.txt", "wb")
meme_file.write(b"Version %s" % (memechip_version))
meme_file.close()
else:
logger.error('Missing meme')
raise Exception('Missing meme')
@@ -70,6 +81,15 @@ def check_tools():
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
@@ -95,7 +115,7 @@ def motif_search(filename, genome, experiment, peak):
else:
peak_no = peak
- sorted_fn = '%s.%s.narrowPeak' % (file_basename, peak)
+ sorted_fn = '%s.%s.narrowPeak' % (file_basename, peak_no)
out, err = utils.run_pipe([
'sort -k %dgr,%dgr %s' % (5, 5, filename),
@@ -108,8 +128,7 @@ def motif_search(filename, genome, experiment, peak):
if err:
logger.error("bedtools error: %s", err)
#Call memechip
# Call memechip
out, err = utils.run_pipe([
'meme-chip -oc %s -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 %s -norand' % (out_motif, out_fa)])
if err:
......
......@@ -6,6 +6,7 @@ import os
import argparse
import logging
import shutil
import subprocess
import pandas as pd
import utils
......@@ -49,6 +50,15 @@ def check_tools():
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
......@@ -178,6 +188,9 @@ def main():
handler = logging.FileHandler('consensus_peaks.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Read files as dataframes
design_peaks_df = pd.read_csv(design, sep='\t')
design_files_df = pd.read_csv(files, sep='\t')
......
......@@ -5,6 +5,9 @@
import argparse
import logging
import os
import subprocess
import shutil
import shlex
import pandas as pd
import numpy as np
import utils
......@@ -140,21 +143,25 @@ def self_psuedoreplication(tag_file, prefix, paired):
splits_prefix = 'temp_split'
out, err = utils.run_pipe([
'gzip -dc %s' % (tag_file),
'shuf --random-source=%s' % (tag_file),
'split -d -l %d - %s' % (lines_per_rep, splits_prefix)])
psuedo_command = 'bash -c "zcat {} | shuf --random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f {} | wc -c) -nosalt </dev/zero 2>/dev/null) | '
psuedo_command += 'split -d -l {} - {}."'
psuedo_command = psuedo_command.format(
tag_file,
tag_file,
int(lines_per_rep),
splits_prefix)
logger.info("Running psuedo with %s", psuedo_command)
subprocess.check_call(shlex.split(psuedo_command))
# Convert read pairs to reads into standard tagAlign file
for i, index in enumerate([0, 1]):
string_index = '0' + str(index)
string_index = '.0' + str(index)
steps = ['cat %s' % (splits_prefix + string_index)]
if paired:
steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""])
steps.extend(['gzip -cn'])
out, err = utils.run_pipe(steps, outfile=pseudoreplicate_dict[i])
os.remove(splits_prefix + string_index)
return pseudoreplicate_dict
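The openssl-seeded `shuf` above makes the pseudoreplicate split reproducible: the random stream is derived from the input file itself, so rerunning on the same tagAlign yields the same split. A minimal Python sketch of the same idea, seeding from the input's size instead of an AES keystream (illustrative only, not the pipeline's implementation):

```python
import random

def deterministic_split(lines, n_splits):
    '''Shuffle lines with a seed derived from the input itself,
    then deal them into n_splits roughly equal chunks.'''
    rng = random.Random(len(lines))  # input-derived seed -> reproducible
    shuffled = list(lines)
    rng.shuffle(shuffled)
    per_split = -(-len(shuffled) // n_splits)  # ceiling division
    return [shuffled[i * per_split:(i + 1) * per_split]
            for i in range(n_splits)]
```

Because the seed depends only on the input, two runs over the same file produce identical pseudoreplicates, which keeps downstream peak calls comparable across pipeline reruns.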
......@@ -336,7 +343,6 @@ def main():
tmp_metadata['tag_align'] = path_to_file
design_new_df = design_new_df.append(tmp_metadata)
# Write out new dataframe
design_new_df.to_csv(experiment_id + '_ppr.tsv',
header=True, sep='\t', index=False)
......
......@@ -54,6 +54,15 @@ def check_tools():
trimgalore_path = shutil.which("trim_galore")
if trimgalore_path:
logger.info('Found trimgalore: %s', trimgalore_path)
# Get Version
trim_version_command = "trim_galore --version"
trimgalore_version = subprocess.check_output(trim_version_command, shell=True)
# Write to file
trimgalore_file = open("version_trimgalore.txt", "wb")
trimgalore_file.write(trimgalore_version)
trimgalore_file.close()
else:
logger.error('Missing trimgalore')
raise Exception('Missing trimgalore')
......@@ -61,6 +70,15 @@ def check_tools():
cutadapt_path = shutil.which("cutadapt")
if cutadapt_path:
logger.info('Found cutadapt: %s', cutadapt_path)
# Get Version
cutadapt_version_command = "cutadapt --version"
cutadapt_version = subprocess.check_output(cutadapt_version_command, shell=True)
# Write to file
cutadapt_file = open("version_cutadapt.txt", "wb")
cutadapt_file.write(b"Version %s" % (cutadapt_version))
cutadapt_file.close()
else:
logger.error('Missing cutadapt')
raise Exception('Missing cutadapt')
......
......@@ -5,6 +5,7 @@
import os
import argparse
import shutil
import subprocess
import logging
from multiprocessing import cpu_count
import utils
......@@ -59,10 +60,35 @@ def check_tools():
r_path = shutil.which("R")
if r_path:
logger.info('Found R: %s', r_path)
# Get Version
r_version_command = "R --version"
r_version = subprocess.check_output(r_version_command, shell=True)
# Write to file
r_file = open("version_r.txt", "wb")
r_file.write(r_version)
r_file.close()
else:
logger.error('Missing R')
raise Exception('Missing R')
phantompeak_path = shutil.which("run_spp.R")
if phantompeak_path:
logger.info('Found phantompeak: %s', phantompeak_path)
# Get Version
spp_version_command = "R -e \"packageVersion('spp')\""
spp_version = subprocess.check_output(spp_version_command, shell=True)
# Write to file
spp_file = open("version_spp.txt", "wb")
spp_file.write(spp_version)
spp_file.close()
else:
logger.error('Missing phantompeak')
raise Exception('Missing phantompeak')
def xcor(tag, paired):
'''Use spp to calculate cross-correlation stats.'''
......@@ -70,13 +96,11 @@ def xcor(tag, paired):
tag_basename = os.path.basename(utils.strip_extensions(tag, STRIP_EXTENSIONS))
uncompressed_tag_filename = tag_basename
# Subsample tagAlign file
number_reads = 15000000
subsampled_tag_filename = \
tag_basename + ".%d.tagAlign.gz" % (number_reads/1000000)
steps = [
'zcat %s' % (tag),
'grep -v "chrM"',
......@@ -116,7 +140,6 @@ def xcor(tag, paired):
return cc_scores_filename
def main():
args = get_args()
paired = args.paired
......