cec86acd · b1c3fb87 · 4c3add75 · e57b3605 · cf00e0e5 · f760c6df
--- a/.gitignore
+++ b/.gitignore
@@ -97,3 +97,6 @@ ENV/

 # mypy
 .mypy_cache/
+
+# Mac OS
+.DS_Store
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
 before_script:
  - module add  python/3.6.1-2-anaconda
-  - pip install --user pytest-pythonpath pytest-cov
+  - pip install --user pytest-pythonpath==0.7.1 pytest-cov==2.5.1
  - module load  nextflow/0.31.0
-  - ln -s /work/BICF/s163035/chipseq/*fastq.gz test_data/
+  - ln -s /project/shared/bicf_workflow_ref/workflow_testdata/chipseq/*fastq.gz test_data/

 stages:
  - unit
+  - astrocyte
  - single
  - multiple
+  - skip

 user_configuration:
  stage: unit
@@ -15,34 +17,66 @@ user_configuration:
  - pytest -m unit
  - pytest -m unit --cov=./workflow/scripts

+astrocyte:
+  stage: astrocyte
+  script:
+  - module load astrocyte/0.1.0
+  - module unload nextflow
+  - cd ..
+  - astrocyte_cli validate chipseq_analysis
+  artifacts:
+    expire_in: 2 days
+
 single_end_mouse:
  stage: single
+  only:
+    - master
  script:
-  - nextflow run workflow/main.nf -resume
+  - nextflow run workflow/main.nf --astrocyte true -resume
  - pytest -m singleend
  artifacts:
    expire_in: 2 days

 paired_end_human:
  stage: single
+  only:
+    - branches
+  except:
+    - master
  script:
-  - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_ENCSR729LGA_PE.txt" --genome 'GRCh38' --pairedEnd true -resume
+  - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_ENCSR729LGA_PE.txt" --genome 'GRCh38' --pairedEnd true --astrocyte false -resume
  - pytest -m pairedend
  artifacts:
    expire_in: 2 days

 single_end_diff:
  stage: multiple
+  only:
+    - branches
+  except:
+    - master
  script:
-  - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_SE.txt" --genome 'GRCm38' -resume
+  - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_SE.txt" --genome 'GRCm38' --astrocyte false -resume
  - pytest -m singlediff
  artifacts:
    expire_in: 2 days

 paired_end_diff:
+  only:
+    - master
  stage: multiple
  script:
-  - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_PE.txt" --genome 'GRCh38' --pairedEnd true -resume
+  - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_PE.txt" --genome 'GRCh38' --pairedEnd true --astrocyte false -resume
  - pytest -m paireddiff
  artifacts:
    expire_in: 2 days
+
+single_end_skip:
+  stage: skip
+  only:
+    - master
+  script:
+  - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_SE.txt" --genome 'GRCm38' --skipDiff true --skipMotif true --astrocyte false -resume
+  - pytest -m singleskip_true
+  artifacts:
+    expire_in: 2 days
--- a/README.md
+++ b/README.md
@@ -5,6 +5,7 @@
 [![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A50.24.0-brightgreen.svg
 )](https://www.nextflow.io/)
 [![Astrocyte](https://img.shields.io/badge/astrocyte-%E2%89%A50.1.0-blue.svg)](https://astrocyte-test.biohpc.swmed.edu/static/docs/index.html)
+[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.2648845.svg)](https://doi.org/10.5281/zenodo.2648845)


 ## Introduction

--- a/astrocyte_pkg.yml
+++ b/astrocyte_pkg.yml
@@ -11,7 +11,7 @@ name: 'chipseq_analysis_bicf'
 # Who wrote this?
 author: 'Beibei Chen and Venkat Malladi'
 # A contact email address for questions
-email: 'biohpc-help@utsouthwestern.edu'
+email: 'bicf@utsouthwestern.edu'
 # A more informative title for the workflow package
 title: 'BICF ChIP-seq Analysis Workflow'
 # A summary of the workflow package in plain text
@@ -27,7 +27,7 @@ description: |
 # web interface. These files are in the 'docs' subdirectory. The first file
 # listed will be used as a documentation index and is index.md by convention
 documentation_files:
-  - ['index.md', 'chipseq-analysis']
+  - 'index.md'

 # -----------------------------------------------------------------------------
 # NEXTFLOW WORKFLOW CONFIGURATION
@@ -42,15 +42,17 @@ workflow_modules:
  - 'python/3.6.1-2-anaconda'
  - 'trimgalore/0.4.1'
  - 'bwa/intel/0.7.12'
+  - 'samtools/1.6'
  - 'sambamba/0.6.6'
  - 'bedtools/2.26.0'
  - 'deeptools/2.5.0.1'
  - 'phantompeakqualtools/1.2'
  - 'macs/2.1.0-20151222'
  - 'UCSC_userApps/v317'
-  - 'R/3.4.1-gccmkl'
+  - 'R/3.3.2-gccmkl'
  - 'meme/4.11.1-gcc-openmpi'
-  - 'python/2.7.x-anaconda'
+  - 'pandoc/2.7'
+

 # A list of parameters used by the workflow, defining how to present them,
 # options etc in the web interface. For each parameter:
@@ -92,20 +94,21 @@ workflow_parameters:
    description: |
      One or more input FASTQ files from a ChIP-seq expereiment and a design
      file with the link bewetwen the same file name and sample id
-    regex: ".*(fastq|fq)*"
+    regex: ".*(fastq|fq)*gz"
+    min: 2

  - id: pairedEnd
    type: select
    required: true
    choices:
-      - [ 'true', 'True']
-      - [ 'false', 'False']
+      - [ 'true', 'true']
+      - [ 'false', 'false']
    description: |
      In single-end sequencing, the sequencer reads a fragment from only one
      end to the other, generating the sequence of base pairs. In paired-end
      reading it starts at one read, finishes this direction at the specified
      read length, and then starts another round of reading from the opposite
-      end of the fragment.
+      end of the fragment. (Paired-end: True, Single-end: False)

  - id: design
    type: file
@@ -113,18 +116,45 @@ workflow_parameters:
    description: |
      A design file listing sample id, fastq files, corresponding control id
      and additional information about the sample.
-    regex: ".*tsv"
+    regex: ".*txt"

  - id: genome
    type: select
+    required: true
    choices:
      - [ 'GRCh38', 'Human GRCh38']
      - [ 'GRCh37', 'Human GRCh37']
      - [ 'GRCm38', 'Mouse GRCm38']
-    required: true
    description: |
      Reference species and genome used for alignment and subsequent analysis.

+  - id: skipDiff
+    type: select
+    required: true
+    choices:
+      - [ 'true', 'true']
+      - [ 'false', 'false']
+    description: |
+      Run differential peak analysis
+
+  - id: skipMotif
+    type: select
+    required: true
+    choices:
+      - [ 'true', 'true']
+      - [ 'false', 'false']
+    description: |
+      Run motif calling
+
+  - id: astrocyte
+    type: select
+    choices:
+      - [ 'true', 'true' ]
+    required: true
+    default: 'true'
+    description: |
+      Ensure configuraton for astrocyte.
+

 # -----------------------------------------------------------------------------
 # SHINY APP CONFIGURATION
@@ -144,8 +174,4 @@ vizapp_cran_packages:

 # List of any Bioconductor packages, not provided by the modules,
 # that must be made available to the vizapp
-vizapp_bioc_packages:
-  - qusage
-#  - ballgown
-vizapp_github_packages:
-  - js229/Vennerable
+vizapp_bioc_packages: []
--- a/docs/design_example.csv
+++ b/docs/design_example.csv
-SampleID,Tissue,Factor,Condition,Replicate,Peaks,bamReads,bamControl,ControlID,PeakCaller
-A_1,A,H3K27AC,A,1,A_1.broadPeak,A_1.bam,A_1_input.bam,A_1_input,bed
-A_2,A,H3K27AC,A,2,A_2.broadPeak,A_2.bam,A_2_input.bam,A_2_input,bed
-B_1,B,H3K27AC,B,1,B_1.broadPeak,B_1.bam,B_1_input.bam,B_1_input,bed
-B_2,B,H3K27AC,B,2,B_2.broadPeak,B_2.bam,B_2_input.bam,B_2_input,bed
-C_1,C,H3K27AC,C,1,C_1.broadPeak,C_1.bam,C_1_input.bam,C_1_input,bed
-C_2,C,H3K27AC,C,2,C_2.broadPeak,C_2.bam,C_2_input.bam,C_2_input,bed
--- a/docs/design_example.txt
+++ b/docs/design_example.txt
+sample_id	experiment_id	biosample	factor	treatment	replicate	control_id	fastq_read1
+A1	A	tissueA	H3K27AC	None	1	B1	A1.fastq.gz
+A2	A	tissueA	H3K27AC	None	2	B2	A2.fastq.gz
+B1	B	tissueB	Input	None	1	B1	B1.fastq.gz
+B2	A	tissueB	Input	None	2	B2	B2.fastq.gz
--- a/docs/index.md
+++ b/docs/index.md
-# Astrocyte ChIPseq analysis Workflow Package
+# BICF ChIP-seq Analysis Workflow

 ## Introduction
 **ChIP-seq Analysis** is a bioinformatics best-practice analysis pipeline used for chromatin immunoprecipitation (ChIP-seq) data analysis.
@@ -7,16 +7,16 @@ The pipeline uses [Nextflow](https://www.nextflow.io), a bioinformatics workflow

 ### Pipeline Steps

-1) Trim adaptors TrimGalore!
-2) Align with BWA
-3) Filter reads with Sambamba  S
-4) Quality control with DeepTools
-5) Calculate Cross-correlation using SPP and PhantomPeakQualTools
-6) Signal profiling using MACS2
-7) Call consenus peaks
-8) Annotate all peaks using ChipSeeker
-9) Use MEME-ChIP to find motifs in original peaks
-10) Find differential expressed peaks using DiffBind (If more than 1 experiment)
+  1) Trim adaptors TrimGalore!
+  2) Align with BWA
+  3) Filter reads with Sambamba  S
+  4) Quality control with DeepTools
+  5) Calculate Cross-correlation using SPP and PhantomPeakQualTools
+  6) Signal profiling using MACS2
+  7) Call consenus peaks
+  8) Annotate all peaks using ChipSeeker
+  9) Use MEME-ChIP to find motifs in original peaks
+  10) Find differential expressed peaks using DiffBind (If more than 1 experiment)


 ## Workflow Parameters
@@ -25,41 +25,35 @@ The pipeline uses [Nextflow](https://www.nextflow.io), a bioinformatics workflow
    pairedEnd - Choose True/False if data is paired-end
    design - Choose the file with the experiment design information. TSV format
    genome - Choose a genomic reference (genome).
+    skipDiff - Choose True/False if data if you want to run Differential Peaks
+    skipMotif - Choose True/False if data if you want to run Motif Calling


 ## Design file

- The following columns are necessary, must be named as in template. An design file template can be downloaded [HERE](https://git.biohpc.swmed.edu/bchen4/chipseq_analysis/raw/master/docs/design_example.csv)
-
-    SampleID
-        The id of the sample. This will be the header in output files, please make sure it is concise
-    Tissue
-        Tissue of the sample
-    Factor
-        Factor of the experiment
-    Condition
-	    This is the group that will be used for pairwise differential expression analysis
-	Replicate
-	    Replicate id
-    Peaks
-        The file name of the peak file for this sample
-    bamReads
-        The file name of the IP BAM for this sample
-    bamControl
-        The file name of the control BAM for this sample
-    ContorlID
-        The id of the control sample
-    PeakCaller
-        The peak caller used
-
+ The following columns are necessary, must be named as in template. An design file template can be downloaded [HERE](https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/blob/master/docs/design_example.txt)
+
+    sample_id
+        The id of the sample. This will be the name used in output files, please make sure it is concise and informative.
+    experiment_id
+        The id of the experiment. Used for grouping replicates.
+    biosample
+        The name of the biological sample.
+    factor
+        Factor of the experiment.
+    treatment
+        Treatment used in experiment.
+    replicate
+        Replicate number.
+    control_id
+	    The sample_id of the control used for this sample.
+    fastq_read1
+      File name of fastq file, if paired-end this is read1.
+    fastq_read2
+      File name of read2 (for paired-end), not needed for single-end data.


 ### Credits
-This example worklow is derived from original scripts kindly contributed by the Bioinformatic Core Facility (BICF), Department of Bioinformatics
-
-### References
+This worklow is was developed jointly with the [Bioinformatic Core Facility (BICF), Department of Bioinformatics](http://www.utsouthwestern.edu/labs/bioinformatics/)

-* ChipSeeker: http://bioconductor.org/packages/release/bioc/html/ChIPseeker.html
-* DiffBind: http://bioconductor.org/packages/release/bioc/html/DiffBind.html
-* Deeptools: https://deeptools.github.io/
-* MEME-ChIP: http://meme-suite.org/doc/meme-chip.html
+Please cite in publications: Pipeline was developed by BICF from funding provided by **Cancer Prevention and Research Institute of Texas (RP150596)**.
--- a/docs/references.md
+++ b/docs/references.md
+### References
+
+1. **python**:
+  * Anaconda (Anaconda Software Distribution, [https://anaconda.com](https://anaconda.com))
+
+2. **trimgalore**:
+  * trimgalore [https://github.com/FelixKrueger/TrimGalore](https://github.com/FelixKrueger/TrimGalore)
+
+3. **cutadapt**:
+  * Marcel, M. 2011. Cutadapt removes adapter sequences from high-throughput sequencing reads. EMBnet.journal 17(1):10-12. doi:[10.14806/ej.17.1.200](http://dx.doi.org/10.14806/ej.17.1.200)
+
+4. **bwa**:
+  * Li H., and R. Durbin. 2009. Fast and accurate short read alignment with Burrows-Wheeler Transform. Bioinformatics 25: 1754-60. doi:[10.1093/bioinformatics/btp324](http://dx.doi.org/10.1093/bioinformatics/btp324)
+
+5. **samtools**:
+  * Li H., B. Handsaker, A. Wysoker, T. Fennell, J. Ruan, N. Homer, G. Marth, G. Abecasis, R. Durbin, and 1000 Genome Project Data Processing Subgroup. 2009. The Sequence alignment/map (SAM) format and SAMtools. Bioinformatics 25: 2078-9. doi:[10.1093/bioinformatics/btp352](http://dx.doi.org/10.1093/bioinformatics/btp352)
+
+6. **sambamba**:
+  * Tarasov, A., A. J. Vilella, E. Cuppen, I. J. Nijman, and P. Prins. 2015 Sambamba: fast processing of NGS alignment formats. Bioinformatics 31(12): 2032-2034. doi:[10.1093/bioinformatics/btv098](http://dx.doi.org/10.1093/bioinformatics/btv098)
+
+7. **bedtools**:
+  * Quinlan, A. R., and I. M. Hall. 2010. BEDTools: a flexible suite of utilities for comparing genomic feautures. Bioinformatics 26(6): 841-842. doi:[10.1093/bioinformatics/btq033](http://dx.doi.org/10.1093/bioinformatics/btq033)
+
+8. **deeptools**:
+  * Ramírez, F., D. P. Ryan, B. Grüning, V. Bhardwaj, F. Kilpert, A. S. Richter, S. Heyne, F. Dündar, and T. Manke. 2016. deepTools2: a next generation web server for deep-sequencing data analysis. Nucleic Acids Research 44: W160-165. doi:[10.1093/nar/gkw257](http://dx.doi.org/10.1093/nar/gkw257)
+
+9. **phantompeakqualtools**:
+  * Landt S. G., G. K. Marinov, A. Kundaje, et al. 2012. ChIP-seq guidelines and practices of the ENCODE and modENCODE consortia. Genome Res 9: 1813-31. doi:[10.1101/gr.136184.111](http://dx.doi.org/10.1101/gr.136184.111)
+  * Kharchenko P. K., M. Y. Tolstorukov, and P. J. Park. 2008. Design and analysis of ChIP-seq experiments for DNA-binding proteins. Nat Biotechnol 26(12): 1351-1359. doi:[10.1038/nbt.1508](https://dx.doi.org/10.1038/nbt.1508)
+
+10. **macs**:
+  * Zhang Y., T. Liu, C. A. Meyer, J. Eeckhoute, D. S. Johnson, B. E. Bernstein, C. Nusbaum, R. M. Myers, M. Brown, W. Li, and X. S. Liu. 2008. Model-based Analysis of ChIP-Seq (MACS). Genome Biol 9: R137. doi:[10.1186/gb-2008-9-9-r137](https://dx.doi.org/10.1186/gb-2008-9-9-r137)
+
+11. **UCSC(bedGraphToBigWig)**:
+  * Kent W. J., A. S. Zweig, G. Barber, A. S. Hinrichs, and D. Karolchik. BigWig and BigBed: enabling browsing of large distributed data sets. Bioinformatics 26(17): 2204-2207. doi:[10.1093/bioinformatics/btq351](https://dx.doi.org/10.1093/bioinformatics/btq351)
+
+12. **R**:
+  * R Core Team 2014. R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL:[http://www.R-project.org/](http://www.R-project.org/).
+
+13. **meme**:
+  * Bailey T. L., M. Bodén, F. A. Buske, M. Frith, C. E. Grant, L. Clementi, J. Ren, W. W. Li, and W. S. Noble. 2009. MEME SUITE: tools for motif discovery and searching. Nucleic Acids Research 37: W202-W208. doi:[10.1093/nar/gkp335](https://dx.doi.org/10.1093/nar/gkp335)
+  * Machanick P., and T. L. Bailey. 2011. MEME-ChIP: motif analysis of large DNA datasets. Bioinformatics 27(12): 1696-1697. doi:[10.1093/bioinformatics/btr189](https://dx.doi.org/10.1093/bioinformatics/btr189)
+
+14. **ChIPseeker**:
+  * Yu G., L. Wang, and Q. He. 2015. ChIPseeker: an R/Bioconductor package for ChIP peak annotation, comparison and visualization. Bioinformatics 31(14): 2382-2383. doi:[10.1093/bioinformatics/btv145](https://dx.doi.org/10.1093/bioinformatics/btv145)
+
+15. **DiffBind**:
+  * Stark R., and G. Brown. 2011. DiffBind: differential binding analysis of ChIP-Seq peak data. [http://bioconductor.org/packages/release/bioc/vignettes/DiffBind/inst/doc/DiffBind.pdf](http://bioconductor.org/packages/release/bioc/vignettes/DiffBind/inst/doc/DiffBind.pdf). doi:[10.18129/B9.bioc.DiffBind](https://dx.doi.org/10.18129/B9.bioc.DiffBind)
+  * Ross-Innes C. S., R. Stark, A. E. Teschendorff, K. A. Holmes, H. R. Ali, M. J. Dunning,  G. D. Brown, O. Gojis, I. O. Ellis, A. R. Green, S. Ali, S. Chin, C. Palmieri, C. Caldas, and J. S. Carroll. 2012. Differential oestrogen receptor binding is associated with clinical outcome in breast cancer. Nature 481: 389-393. doi:[10.1038/nature10730](https://dx.doi.org/10.1038/nature10730)
+
+16. **MultiQc**:
+  * Ewels P., Magnusson M., Lundin S. and Käller M. 2016. MultiQC: Summarize analysis results for multiple tools and samples in a single report. Bioinformatics 32(19): 3047–3048. doi:[10.1093/bioinformatics/btw354 ](https://dx.doi.org/10.1093/bioinformatics/btw354)
+
+17. **BICF ChIP-seq Analysis Workflow**:
+  * Venkat S. Malladi and Beibei Chen. (2019). BICF ChIP-seq Analysis Workflow (publish_1.0.0). Zenodo. doi:[10.5281/zenodo.2648845](https://doi.org/10.5281/zenodo.2648845)
+
+Please cite in publications: Pipeline was developed by BICF from funding provided by **Cancer Prevention and Research Institute of Texas (RP150596)**.
--- a/env.md
+++ b/env.md
-## Create new env in specific folder
-```shell
-conda create -p /project/shared/bicf_workflow_ref/chipseq_bchen4/ -c r r-essentials
-#Add channels
-conda config --add channels conda-forge
-conda config --add channels r
-conda config --add channels bioconda
-pip install --user twobitreader
-conda install -c r r-xml
-```
-
-Install bioconductor in R console:
-```R
-source("http://bioconductor.org/biocLite.R")
-biocLite()
-biocLite(c("DiffBind","ChIPseeker"))
-```
\ No newline at end of file
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -41,7 +41,7 @@ process {
    executor = 'local'
  }
  withName: callPeaksMACS {
-    module = ['python/3.6.1-2-anaconda', 'macs/2.1.0-20151222', 'phantompeakqualtools/1.2', 'UCSC_userApps/v317', 'bedtools/2.26.0']
+    module = ['python/3.6.1-2-anaconda', 'macs/2.1.0-20151222', 'UCSC_userApps/v317', 'bedtools/2.26.0', 'phantompeakqualtools/1.2']
    queue = '128GB,256GB,256GBv1'
  }
  withName: consensusPeaks {
@@ -60,7 +60,10 @@ process {
    module = ['python/3.6.1-2-anaconda', 'meme/4.11.1-gcc-openmpi', 'bedtools/2.26.0']
    cpus = 32
  }
-
+  withName: multiqcReport {
+    module = ['python/3.6.1-2-anaconda', 'pandoc/2.7', 'multiqc/1.7']
+    executor = 'local'
+  }
 }

 params {

--- a/workflow/conf/multiqc_config.yaml
+++ b/workflow/conf/multiqc_config.yaml
+# Title to use for the report.
+title: BICF ChIP-seq Analysis Report
+
+report_comment: >
+  This report has been generated by the <a href="https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/" target="_blank">BICF/chipseq_analysis</a>
+  pipeline.
+
+extra_fn_clean_exts:
+  - 'pbc.qc'
+  - 'cc.qc'
+
+fn_ignore_files:
+  - '*dedup.flagstat.qc'
+
+custom_data:
+  library_complexity:
+    file_format: 'tsv'
+    id: 'library_complexity'
+    contents: 'TotalReadPairs  DistinctReadPairs       OneReadPair     TwoReadPairs    NRF     PBC1    PBC2'
+    section_name: 'Library complexity'
+    plot_type: 'generalstats'
+    pconfig:
+        TotalReadPairs:
+          decimalPlaces: 0
+          shared_key: read_count
+        DistinctReadPairs:
+          decimalPlaces: 0
+          shared_key: read_count
+        NRF:
+          decimalPlaces: 2
+        PBC1:
+          decimalPlaces: 2
+        PBC2:
+          decimalPlaces: 2
+
+sp:
+    phantompeakqualtools/out:
+      fn: '*cc.qc'
+    library_complexity:
+      fn: '*pbc.qc'
+    macs2:
+      fn: '*_peaks.xls'
+
+
+report_section_order:
+    cutadapt:
+      order: -1000
+    Samtools:
+      order: -1100
+    Software_Versions:
+      order: -1200
+    Software_References:
+      order: -1300
+
+table_columns_placement:
+    library_complexity:
+      TotalReadPairs: 1100
+      DistinctReadPairs: 1200
+      NRF: 1300
+      PBC1: 1400
+      PBC2: 1500
+    phantompeakqualtools:
+      Estimated_Fragment_Length_bp: 1600
+      NSC: 1700
+      RSC: 1800
+
+table_columns_visible:
+  cutadapt:
+    percent_trimmed: False
+  library_complexity:
+    OneReadPair: False
+    TwoReadPairs: False
+
+table_cond_formatting_rules:
+    library_complexity_mqc-generalstats-library_complexity-NRF:
+      pass:
+        - gt: 0.8
+      warn:
+        - lt: 0.8
+      fail:
+        - lt: 0.5
+    library_complexity_mqc-generalstats-library_complexity-PBC1:
+      pass:
+        - gt: 0.8
+      warn:
+        - lt: 0.8
+      fail:
+        - lt: 0.5
+    library_complexity_mqc-generalstats-library_complexity-PBC2:
+      pass:
+        - gt: 3
+      warn:
+        - lt: 3
+      fail:
+        - lt: 1
+
+thousandsSep_format: ''
--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -8,15 +8,36 @@ params.reads = "$baseDir/../test_data/*.fastq.gz"
 params.pairedEnd = false
 params.designFile = "$baseDir/../test_data/design_ENCSR238SGC_SE.txt"
 params.genome = 'GRCm38'
-params.genomes = []
-params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false
-params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ?: false : false
-params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false
-params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false
 params.cutoffRatio = 1.2
 params.outDir= "$baseDir/output"
 params.extendReadsLen = 100
 params.topPeakCount = 600
+params.astrocyte = false
+params.skipDiff = false
+params.skipMotif = false
+params.references = "$baseDir/../docs/references.md"
+params.multiqc =  "$baseDir/conf/multiqc_config.yaml"
+
+// Assign variables if astrocyte
+if (params.astrocyte) {
+  print("Running under astrocyte")
+  referenceLocation = "/project/shared/bicf_workflow_ref"
+  params.bwaIndex = "$referenceLocation/$params.genome"
+  params.chromSizes = "$referenceLocation/$params.genome/genomefile.txt"
+  params.fasta = "$referenceLocation/$params.genome/genome.fa.txt"
+  if (params.genome == 'GRCh37' || params.genome == 'GRCh38') {
+    params.genomeSize = 'hs'
+  } else if (params.genome == 'GRCm38') {
+    params.genomeSize = 'mm'
+  }
+} else {
+    params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false
+    params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ?: false : false
+    params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false
+    params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false
+}
+
+

 // Check inputs
 if( params.bwaIndex ){
@@ -45,6 +66,10 @@ cutoffRatio = params.cutoffRatio
 outDir = params.outDir
 extendReadsLen = params.extendReadsLen
 topPeakCount = params.topPeakCount
+skipDiff = params.skipDiff
+skipMotif = params.skipMotif
+references = params.references
+multiqc = params.multiqc

 // Check design file for errors
 process checkDesignFile {
@@ -64,11 +89,13 @@ process checkDesignFile {

  if (pairedEnd) {
    """
+    module load python/3.6.1-2-anaconda
    python3 $baseDir/scripts/check_design.py -d $designFile -f $readsList -p
    """
  }
  else {
    """
+    module load python/3.6.1-2-anaconda
    python $baseDir/scripts/check_design.py -d $designFile -f $readsList
    """
  }
@@ -90,7 +117,7 @@ rawReads = designFilePaths
 process trimReads {

  tag "$sampleId-$replicate"
-  publishDir "$outDir/${task.process}", mode: 'copy'
+  publishDir "$outDir/${task.process}/${sampleId}", mode: 'copy'

  input:

@@ -99,18 +126,23 @@ process trimReads {
  output:

  set sampleId, file('*.fq.gz'), experimentId, biosample, factor, treatment, replicate, controlId into trimmedReads
-  file('*trimming_report.txt') into trimgalore_results
+  file('*trimming_report.txt') into trimgaloreResults
+  file('version_*.txt') into trimReadsVersions

  script:

  if (pairedEnd) {
    """
-    python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -p
+    module load python/3.6.1-2-anaconda
+    module load trimgalore/0.4.1
+    python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -s $sampleId -p
    """
  }
  else {
    """
-    python3 $baseDir/scripts/trim_reads.py -f ${reads[0]}
+    module load python/3.6.1-2-anaconda
+    module load trimgalore/0.4.1
+    python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} -s $sampleId
    """
  }

@@ -119,8 +151,9 @@ process trimReads {
 // Align trimmed reads using bwa
 process alignReads {

+  queue '128GB,256GB,256GBv1'
  tag "$sampleId-$replicate"
-  publishDir "$outDir/${task.process}", mode: 'copy'
+  publishDir "$outDir/${task.process}/${sampleId}", mode: 'copy'

  input:

@@ -130,18 +163,25 @@ process alignReads {
  output:

  set sampleId, file('*.bam'), experimentId, biosample, factor, treatment, replicate, controlId into mappedReads
-  file '*.srt.bam.flagstat.qc' into mappedReadsStats
+  file '*.flagstat.qc' into mappedReadsStats
+  file('version_*.txt') into alignReadsVersions

  script:

  if (pairedEnd) {
    """
-    python3 $baseDir/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -p
+    module load python/3.6.1-2-anaconda
+    module load bwa/intel/0.7.12
+    module load samtools/1.6
+    python3 $baseDir/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -s $sampleId -p
    """
  }
  else {
    """
-    python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa
+    module load python/3.6.1-2-anaconda
+    module load bwa/intel/0.7.12
+    module load samtools/1.6
+    python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa -s $sampleId
    """
  }

@@ -150,8 +190,9 @@ process alignReads {
 // Dedup reads using sambamba
 process filterReads {

+  queue '128GB,256GB,256GBv1'
  tag "$sampleId-$replicate"
-  publishDir "$outDir/${task.process}", mode: 'copy'
+  publishDir "$outDir/${task.process}/${sampleId}", mode: 'copy'

  input:

@@ -161,19 +202,28 @@ process filterReads {

  set sampleId, file('*.bam'), file('*.bai'), experimentId, biosample, factor, treatment, replicate, controlId into dedupReads
  set sampleId, file('*.bam'), experimentId, biosample, factor, treatment, replicate, controlId into convertReads
-  file '*flagstat.qc' into dedupReadsStats
-  file '*pbc.qc' into dedupReadsComplexity
-  file '*dup.qc' into dupReads
+  file '*.flagstat.qc' into dedupReadsStats
+  file '*.pbc.qc' into dedupReadsComplexity
+  file '*.dedup.qc' into dupReads
+  file('version_*.txt') into filterReadsVersions

  script:

  if (pairedEnd) {
    """
+    module load python/3.6.1-2-anaconda
+    module load samtools/1.6
+    module load sambamba/0.6.6
+    module load bedtools/2.26.0
    python3 $baseDir/scripts/map_qc.py -b $mapped -p
    """
  }
  else {
    """
+    module load python/3.6.1-2-anaconda
+    module load samtools/1.6
+    module load sambamba/0.6.6
+    module load bedtools/2.26.0
    python3 $baseDir/scripts/map_qc.py -b $mapped
    """
  }
@@ -190,6 +240,7 @@ dedupReads
 // Quality Metrics using deeptools
 process experimentQC {

+  queue '128GB,256GB,256GBv1'
  publishDir "$outDir/${task.process}", mode: 'copy'

  input:
@@ -198,11 +249,14 @@ process experimentQC {

  output:

-  file '*.{png,npz}' into deepToolsStats
+  file '*.{pdf,npz}' into experimentQCStats
+  file('version_*.txt') into experimentQCVersions

  script:

  """
+  module load python/3.6.1-2-anaconda
+  module load deeptools/2.5.0.1
  python3 $baseDir/scripts/experiment_qc.py -d $dedupDesign -e $extendReadsLen
  """

@@ -211,8 +265,9 @@ process experimentQC {
 // Convert reads to bam
 process convertReads {

+  queue '128GB,256GB,256GBv1'
  tag "$sampleId-$replicate"
-  publishDir "$outDir/${task.process}", mode: 'copy'
+  publishDir "$outDir/${task.process}/${sampleId}", mode: 'copy'

  input:

@@ -221,16 +276,23 @@ process convertReads {
  output:

  set sampleId, file('*.tagAlign.gz'), file('*.bed{pe,se}.gz'), experimentId, biosample, factor, treatment, replicate, controlId into tagReads
+  file('version_*.txt') into convertReadsVersions

  script:

  if (pairedEnd) {
    """
+    module load python/3.6.1-2-anaconda
+    module load samtools/1.6
+    module load bedtools/2.26.0
    python3 $baseDir/scripts/convert_reads.py -b $deduped -p
    """
  }
  else {
    """
+    module load python/3.6.1-2-anaconda
+    module load samtools/1.6
+    module load bedtools/2.26.0
    python3 $baseDir/scripts/convert_reads.py -b $deduped
    """
  }
@@ -241,7 +303,7 @@ process convertReads {
 process crossReads {

  tag "$sampleId-$replicate"
-  publishDir "$outDir/${task.process}", mode: 'copy'
+  publishDir "$outDir/${task.process}/${sampleId}", mode: 'copy'

  input:

@@ -250,17 +312,22 @@ process crossReads {
  output:

  set sampleId, seTagAlign, tagAlign, file('*.cc.qc'), experimentId, biosample, factor, treatment, replicate, controlId into xcorReads
-  set file('*.cc.qc'), file('*.cc.plot.pdf') into xcorReadsStats
+  set file('*.cc.qc'), file('*.cc.plot.pdf') into crossReadsStats
+  file('version_*.txt') into crossReadsVersions

  script:

  if (pairedEnd) {
    """
+    module load python/3.6.1-2-anaconda
+    module load phantompeakqualtools/1.2
    python3 $baseDir/scripts/xcor.py -t $seTagAlign -p
    """
  }
  else {
    """
+    module load python/3.6.1-2-anaconda
+    module load phantompeakqualtools/1.2
    python3 $baseDir/scripts/xcor.py -t $seTagAlign
    """
  }
@@ -289,6 +356,7 @@ process defineExpDesignFiles {
  script:

  """
+  module load python/3.6.1-2-anaconda
  python3 $baseDir/scripts/experiment_design.py -d $xcorDesign
  """

@@ -314,11 +382,13 @@ process poolAndPsuedoReads {

  if (pairedEnd) {
    """
+    module load python/3.6.1-2-anaconda
    python3 $baseDir/scripts/pool_and_psuedoreplicate.py -d $experimentObjs -c $cutoffRatio -p
    """
  }
  else {
    """
+    module load python/3.6.1-2-anaconda
    python3 $baseDir/scripts/pool_and_psuedoreplicate.py -d $experimentObjs -c $cutoffRatio
    """
  }
@@ -334,7 +404,7 @@ experimentRows = experimentPoolObjs
 process callPeaksMACS {

  tag "$sampleId-$replicate"
-  publishDir "$outDir/${task.process}", mode: 'copy'
+  publishDir "$outDir/${task.process}/${experimentId}/${replicate}", mode: 'copy'

  input:
  set sampleId, tagAlign, xcor, experimentId, biosample, factor, treatment, replicate, controlId, controlTagAlign from experimentRows
@@ -342,16 +412,28 @@ process callPeaksMACS {
  output:

  set sampleId, file('*.narrowPeak'), file('*.fc_signal.bw'), file('*.pvalue_signal.bw'), experimentId, biosample, factor, treatment, replicate, controlId into experimentPeaks
+  file '*.xls' into callPeaksMACSsummit
+  file('version_*.txt') into callPeaksMACSVersions

  script:

  if (pairedEnd) {
    """
+    module load python/3.6.1-2-anaconda
+    module load macs/2.1.0-20151222
+    module load UCSC_userApps/v317
+    module load bedtools/2.26.0
+    module load phantompeakqualtools/1.2
    python3 $baseDir/scripts/call_peaks_macs.py -t $tagAlign -x $xcor -c $controlTagAlign -s $sampleId -g $genomeSize -z $chromSizes -p
    """
  }
  else {
    """
+    module load python/3.6.1-2-anaconda
+    module load macs/2.1.0-20151222
+    module load UCSC_userApps/v317
+    module load bedtools/2.26.0
+    module load phantompeakqualtools/1.2
    python3 $baseDir/scripts/call_peaks_macs.py -t $tagAlign -x $xcor -c $controlTagAlign -s $sampleId -g $genomeSize -z $chromSizes
    """
  }
@@ -368,6 +450,7 @@ peaksDesign = experimentPeaks
 process consensusPeaks {

  publishDir "$outDir/${task.process}", mode: 'copy'
+  publishDir "$outDir/design", mode: 'copy',  pattern: '*.{csv|tsv}'

  input:

@@ -381,10 +464,13 @@ process consensusPeaks {
  file 'design_diffPeaks.csv'  into designDiffPeaks
  file 'design_annotatePeaks.tsv'  into designAnnotatePeaks, designMotifSearch
  file 'unique_experiments.csv' into uniqueExperiments
+  file('version_*.txt') into consensusPeaksVersions

  script:

  """
+  module load python/3.6.1-2-anaconda
+  module load bedtools/2.26.0
  python3 $baseDir/scripts/overlap_peaks.py -d $peaksDesign -f $preDiffDesign
  """

@@ -393,7 +479,7 @@ process consensusPeaks {
 // Annotate Peaks
 process peakAnnotation {

-  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+  publishDir "$outDir/${task.process}", mode: 'copy'

  input:

@@ -402,19 +488,21 @@ process peakAnnotation {
  output:

  file "*chipseeker*" into peakAnnotation
+  file('version_*.txt') into peakAnnotationVersions

  script:

  """
+  module load R/3.3.2-gccmkl
  Rscript $baseDir/scripts/annotate_peaks.R $designAnnotatePeaks $genome
  """

 }

-// Motif Search  Peaks
+// Motif Search Peaks
 process motifSearch {

-  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+  publishDir "$outDir/${task.process}", mode: 'copy'

  input:

@@ -423,11 +511,17 @@ process motifSearch {
  output:

  file "*memechip" into motifSearch
-  file "sorted-*" into filteredPeaks
+  file "*narrowPeak" into filteredPeaks
+  file('version_*.txt') into motifSearchVersions
+
+  when:
+
+  !skipMotif

  script:

  """
+  module load R/3.3.2-gccmkl
  python3 $baseDir/scripts/motif_search.py -d $designMotifSearch -g $fasta -p $topPeakCount
  """
 }
@@ -439,7 +533,7 @@ uniqueExperimentsList = uniqueExperiments
 // Calculate Differential Binding Activity
 process diffPeaks {

-  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+  publishDir "$outDir/${task.process}", mode: 'copy'

  input:

@@ -452,13 +546,62 @@ process diffPeaks {
  file '*_diffbind.csv' into diffPeaksCounts
  file '*.pdf' into diffPeaksStats
  file 'normcount_peaksets.txt' into normCountPeaks
+  file('version_*.txt') into diffPeaksVersions

  when:
-  noUniqueExperiments > 1

+  noUniqueExperiments > 1 && !skipDiff

  script:
+
  """
+  module load python/3.6.1-2-anaconda
+  module load meme/4.11.1-gcc-openmpi
+  module load bedtools/2.26.0
  Rscript $baseDir/scripts/diff_peaks.R $designDiffPeaks
  """
 }
+
+// Generate Multiqc Report, gerernate Software Versions and references
+process multiqcReport {
+
+  publishDir "$outDir/${task.process}", mode: 'copy'
+
+  input:
+
+  file ('trimReads_vf/*') from trimReadsVersions.first()
+  file ('alignReads_vf/*') from alignReadsVersions.first()
+  file ('filterReads_vf/*') from filterReadsVersions.first()
+  file ('convertReads_vf/*') from convertReadsVersions.first()
+  file ('crossReads_vf/*') from crossReadsVersions.first()
+  file ('callPeaksMACS_vf/*') from callPeaksMACSVersions.first()
+  file ('consensusPeaks_vf/*') from consensusPeaksVersions.first()
+  file ('peakAnnotation_vf/*') from peakAnnotationVersions.first()
+  file ('motifSearch_vf/*') from motifSearchVersions.first().ifEmpty()
+  file ('diffPeaks_vf/*') from diffPeaksVersions.first().ifEmpty()
+  file ('experimentQC_vf/*') from experimentQCVersions.first()
+  file ('trimReads/*') from trimgaloreResults.collect()
+  file ('alignReads/*') from mappedReadsStats.collect()
+  file ('filterReads/*') from dedupReadsComplexity.collect()
+  file ('crossReads/*') from crossReadsStats.collect()
+
+  output:
+
+  file('software_versions_mqc.yaml') into softwareVersions
+  file('software_references_mqc.yaml') into softwareReferences
+  file "multiqc_report.html" into multiqcReport
+  file "*_data" into multiqcData
+
+  script:
+
+  """
+  module load python/3.6.1-2-anaconda
+  module load pandoc/2.7
+  module load multiqc/1.7
+  echo $workflow.nextflow.version > version_nextflow.txt
+  multiqc --version > version_multiqc.txt
+  python3 $baseDir/scripts/generate_references.py -r $references -o software_references
+  python3 $baseDir/scripts/generate_versions.py -o software_versions
+  multiqc -c $multiqc .
+  """
+}
--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
@@ -3,3 +3,12 @@ profiles {
    includeConfig 'conf/biohpc.config'
  }
 }
+
+manifest {
+  name = 'chipseq_analysis'
+  description = 'BICF ChIP-seq Analysis Workflow.'
+  homePage = 'https://github.com/nf-core/rnaseq'
+  version = '1.0.0'
+  mainScript = 'main.nf'
+  nextflowVersion = '>=0.31.0'
+}
--- a/workflow/scripts/annotate_peaks.R
+++ b/workflow/scripts/annotate_peaks.R
@@ -17,7 +17,7 @@ args <- commandArgs(trailingOnly=TRUE)

 # Check input args
 if (length(args) != 2) {
-  stop("Usage: annotate_peaks.R [ annotate_design.tsv ] [ genome_assembly ]", call.=FALSE)
+  stop("Usage: annotate_peaks.R annotate_design.tsv genome_assembly", call.=FALSE)
 }

 design_file <- args[1]
@@ -35,6 +35,11 @@ if(genome_assembly=='GRCh37') {
    annodb <- 'org.Hs.eg.db'
 }

+# Output version of ChIPseeker
+chipseeker_version = packageVersion('ChIPseeker')
+write.table(paste("Version", chipseeker_version), file = "version_ChIPseeker.txt", sep = "\t",
+            row.names = FALSE, col.names = FALSE)
+
 # Load design file
 design <- read.csv(design_file, sep ='\t')
 files <- as.list(as.character(design$Peaks))

--- a/workflow/scripts/call_peaks_macs.py
+++ b/workflow/scripts/call_peaks_macs.py
@@ -5,8 +5,8 @@
 import os
 import argparse
 import shutil
+import subprocess
 import logging
-from multiprocessing import cpu_count
 import utils
 from xcor import xcor as calculate_xcor

@@ -70,16 +70,19 @@ def check_tools():

    logger.info('Checking for required libraries and components on this system')

-    r_path = shutil.which("R")
-    if r_path:
-        logger.info('Found R: %s', r_path)
-    else:
-        logger.error('Missing R')
-        raise Exception('Missing R')
-
    macs_path = shutil.which("macs2")
-    if r_path:
+    if macs_path:
        logger.info('Found MACS2: %s', macs_path)
+
+        # Get Version
+        macs_version_command = "macs2  --version"
+        macs_version = subprocess.check_output(macs_version_command, shell=True, stderr=subprocess.STDOUT)
+
+        # Write to file
+        macs_file = open("version_macs.txt", "wb")
+        macs_file.write(macs_version)
+        macs_file.close()
+
    else:
        logger.error('Missing MACS2')
        raise Exception('Missing MACS2')
@@ -87,6 +90,18 @@ def check_tools():
    bg_bw_path = shutil.which("bedGraphToBigWig")
    if bg_bw_path:
        logger.info('Found bedGraphToBigWig: %s', bg_bw_path)
+
+        # Get Version
+        bg_bw_version_command = "bedGraphToBigWig"
+        try:
+            subprocess.check_output(bg_bw_version_command, shell=True, stderr=subprocess.STDOUT)
+        except subprocess.CalledProcessError as e:
+            bg_bw_version = e.output
+
+        # Write to file
+        bg_bw_file = open("version_bedGraphToBigWig.txt", "wb")
+        bg_bw_file.write(bg_bw_version)
+        bg_bw_file.close()
    else:
        logger.error('Missing bedGraphToBigWig')
        raise Exception('Missing bedGraphToBigWig')
@@ -94,12 +109,23 @@ def check_tools():
    bedtools_path = shutil.which("bedtools")
    if bedtools_path:
        logger.info('Found bedtools: %s', bedtools_path)
+
+        # Get Version
+        bedtools_version_command = "bedtools --version"
+        bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
+
+        # Write to file
+        bedtools_file = open("version_bedtools.txt", "wb")
+        bedtools_file.write(bedtools_version)
+        bedtools_file.close()
+
    else:
        logger.error('Missing bedtools')
        raise Exception('Missing bedtools')


 def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes):
+    '''Call peaks and signal tracks'''

    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file
@@ -107,8 +133,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
        firstline = xcor_fh.readline()
        frag_lengths = firstline.split()[2]  # third column
        fragment_length = frag_lengths.split(',')[0]  # grab first value
-        logger.info("Fraglen %s" % (fragment_length))
-
+        logger.info("Fraglen %s", fragment_length)

    # Generate narrow peaks and preliminary signal tracks

@@ -119,18 +144,18 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)

    logger.info(command)
    returncode = utils.block_on(command)
-    logger.info("MACS2 exited with returncode %d" % (returncode))
+    logger.info("MACS2 exited with returncode %d", returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # MACS2 sometimes calls features off the end of chromosomes.
    # Remove coordinates outside chromosome sizes

-    narrowpeak_fn = '%s_peaks.narrowPeak' % (prefix)
+    int_narrowpeak_fn = '%s_peaks.narrowPeak' % (prefix)
+    narrowpeak_fn = '%s.narrowPeak' % (prefix)
    clipped_narrowpeak_fn = 'clipped-%s' % (narrowpeak_fn)

-
-    steps = ['slopBed -i %s -g %s -b 0' % (narrowpeak_fn, chrom_sizes),
-            'bedClip stdin %s %s' % (chrom_sizes, clipped_narrowpeak_fn)]
+    steps = ['slopBed -i %s -g %s -b 0' % (int_narrowpeak_fn, chrom_sizes),
+             'bedClip stdin %s %s' % (chrom_sizes, clipped_narrowpeak_fn)]

    out, err = utils.run_pipe(steps)

@@ -161,7 +186,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)

    logger.info(command)
    returncode = utils.block_on(command)
-    logger.info("MACS2 exited with returncode %d" % (returncode))
+    logger.info("MACS2 exited with returncode %d", returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (MACS2 bug)
@@ -169,7 +194,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
    fc_bedgraph_sorted_fn = 'sorted-%s' % (fc_bedgraph_fn)
    fc_signal_fn = "%s.fc_signal.bw" % (prefix)
    steps = ['slopBed -i %s_FE.bdg -g %s -b 0' % (prefix, chrom_sizes),
-            'bedClip stdin %s %s' % (chrom_sizes, fc_bedgraph_fn)]
+             'bedClip stdin %s %s' % (chrom_sizes, fc_bedgraph_fn)]

    out, err = utils.run_pipe(steps)

@@ -185,7 +210,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)

    logger.info(command)
    returncode = utils.block_on(command)
-    logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
+    logger.info("bedGraphToBigWig exited with returncode %d", returncode)
    assert returncode == 0, "bedGraphToBigWig non-zero return"

    # For -log10(p-value) signal tracks
@@ -199,7 +224,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
    sval = str(min(float(chip_reads), float(control_reads)) / 1000000)

    logger.info("chip_reads = %s, control_reads = %s, sval = %s" %
-            (chip_reads, control_reads, sval))
+                (chip_reads, control_reads, sval))

    command = 'macs2 bdgcmp ' + \
          '-t %s_treat_pileup.bdg ' % (prefix) + \
@@ -216,7 +241,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
    pvalue_bedgraph_sorted_fn = 'sort-%s' % (pvalue_bedgraph_fn)
    pvalue_signal_fn = "%s.pvalue_signal.bw" % (prefix)
    steps = ['slopBed -i %s_ppois.bdg -g %s -b 0' % (prefix, chrom_sizes),
-            'bedClip stdin %s %s' % (chrom_sizes, pvalue_bedgraph_fn)]
+             'bedClip stdin %s %s' % (chrom_sizes, pvalue_bedgraph_fn)]

    out, err = utils.run_pipe(steps)

@@ -232,12 +257,13 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)

    logger.info(command)
    returncode = utils.block_on(command)
-    logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
+    logger.info("bedGraphToBigWig exited with returncode %d", returncode)
    assert returncode == 0, "bedGraphToBigWig non-zero return"

    # Remove temporary files
    os.remove(clipped_narrowpeak_fn)
    os.remove(rescaled_narrowpeak_fn)
+    os.remove(int_narrowpeak_fn)


 def main():

--- a/workflow/scripts/check_design.py
+++ b/workflow/scripts/check_design.py
@@ -72,6 +72,46 @@ def check_design_headers(design, paired):
        raise Exception("Missing column headers: %s" % list(missing_headers))


+def check_samples(design):
+    '''Check if design file has the correct sample name mapping.'''
+
+    logger.info("Running sample check.")
+
+    samples = design.groupby('sample_id') \
+                            .apply(list)
+
+    malformated_samples = []
+    chars = set('-.')
+    for sample in samples.index.values:
+        if(any(char.isspace() for char in sample) | any((char in chars) for char in sample)):
+            malformated_samples.append(sample)
+
+    if len(malformated_samples) > 0:
+        logger.error('Malformed samples from design file: %s', list(malformated_samples))
+        raise Exception("Malformed samples from design file: %s" %
+                        list(malformated_samples))
+
+
+def check_experiments(design):
+    '''Check if design file has the correct experiment name mapping.'''
+
+    logger.info("Running experiment check.")
+
+    experiments = design.groupby('experiment_id') \
+                            .apply(list)
+
+    malformated_experiments = []
+    chars = set('-.')
+    for experiment in experiments.index.values:
+        if(any(char.isspace() for char in experiment) | any((char in chars) for char in experiment)):
+            malformated_experiments.append(experiment)
+
+    if len(malformated_experiments) > 0:
+        logger.error('Malformed experiment from design file: %s', list(malformated_experiments))
+        raise Exception("Malformed experiment from design file: %s" %
+                        list(malformated_experiments))
+
+
 def check_controls(design):
    '''Check if design file has the correct control mapping.'''

@@ -146,8 +186,8 @@ def main():
    logger.addHandler(handler)

    # Read files as dataframes
-    design_df = pd.read_csv(args.design, sep='\t')
-    fastq_df = pd.read_csv(args.fastq, sep='\t', names=['name', 'path'])
+    design_df = pd.read_csv(design, sep='\t')
+    fastq_df = pd.read_csv(fastq, sep='\t', names=['name', 'path'])

    # Check design file
    check_design_headers(design_df, paired)

--- a/workflow/scripts/convert_reads.py
+++ b/workflow/scripts/convert_reads.py
@@ -54,6 +54,15 @@ def check_tools():
    bedtools_path = shutil.which("bedtools")
    if bedtools_path:
        logger.info('Found bedtools: %s', bedtools_path)
+
+        # Get Version
+        bedtools_version_command = "bedtools --version"
+        bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
+
+        # Write to file
+        bedtools_file = open("version_bedtools.txt", "wb")
+        bedtools_file.write(bedtools_version)
+        bedtools_file.close()
    else:
        logger.error('Missing bedtools')
        raise Exception('Missing bedtools')
@@ -61,6 +70,15 @@ def check_tools():
    samtools_path = shutil.which("samtools")
    if samtools_path:
        logger.info('Found samtools: %s', samtools_path)
+
+        # Get Version
+        samtools_version_command = "samtools --version"
+        samtools_version = subprocess.check_output(samtools_version_command, shell=True)
+
+        # Write to file
+        samtools_file = open("version_samtools.txt", "wb")
+        samtools_file.write(samtools_version)
+        samtools_file.close()
    else:
        logger.error('Missing samtools')
        raise Exception('Missing samtools')
@@ -91,8 +109,7 @@ def convert_mapped_pe(bam, bam_basename):

    out, err = utils.run_pipe([
        "bamToBed -bedpe -mate1 -i %s" % (nmsrt_bam_filename),
-        "gzip -nc"],
-        outfile=bedpe_filename)
+        "gzip -nc"], outfile=bedpe_filename)


 def main():
@@ -109,7 +126,7 @@ def main():

    # Convert PE or SE to tagAlign
    bam_basename = os.path.basename(
-        utils.strip_extensions(bam, ['.bam']))
+        utils.strip_extensions(bam, ['.bam', '.dedup']))

    tag_filename = bam_basename + '.tagAlign.gz'
    convert_mapped(bam, tag_filename)
@@ -117,7 +134,7 @@ def main():
    if paired:  # paired-end data
        convert_mapped_pe(bam, bam_basename)
    else:
-        bedse_filename =  bam_basename + ".bedse.gz"
+        bedse_filename = bam_basename + ".bedse.gz"
        shutil.copy(tag_filename, bedse_filename)



--- a/workflow/scripts/diff_peaks.R
+++ b/workflow/scripts/diff_peaks.R
@@ -8,9 +8,14 @@ args <- commandArgs(trailingOnly=TRUE)

 # Check input args
 if (length(args) != 1) {
-  stop("Usage: diff_peaks.R [ annotate_design.tsv ] ", call.=FALSE)
+  stop("Usage: diff_peaks.R annotate_design.tsv ", call.=FALSE)
 }

+# Output version of DiffBind
+diffibind_version = packageVersion('DiffBind')
+write.table(paste("Version", diffibind_version), file = "version_DiffBind.txt", sep = "\t",
+            row.names = FALSE, col.names = FALSE)
+
 # Build DBA object from design file
 data <- dba(sampleSheet=args[1])
 data <- dba.count(data)

--- a/workflow/scripts/experiment_qc.py
+++ b/workflow/scripts/experiment_qc.py
@@ -7,8 +7,9 @@ import argparse
 import logging
 import subprocess
 import shutil
-import pandas as pd
 from multiprocessing import cpu_count
+import pandas as pd
+

 EPILOG = '''
 For more details:
@@ -51,6 +52,15 @@ def check_tools():
    deeptools_path = shutil.which("deeptools")
    if deeptools_path:
        logger.info('Found deeptools: %s', deeptools_path)
+
+        # Get Version
+        deeptools_version_command = "deeptools --version"
+        deeptools_version = subprocess.check_output(deeptools_version_command, shell=True, stderr=subprocess.STDOUT)
+
+        # Write to file
+        deeptools_file = open("version_deeptools.txt", "wb")
+        deeptools_file.write(deeptools_version)
+        deeptools_file.close()
    else:
        logger.error('Missing deeptools')
        raise Exception('Missing deeptools')
@@ -76,10 +86,10 @@ def generate_read_summary(design, extension):
    return mbs_filename


-def check_correlation(mbs):
+def check_spearman_correlation(mbs):
    '''Check Spearman pairwise correlation of samples based on read counts.'''

-    spearman_filename = 'heatmap_SpearmanCorr.png'
+    spearman_filename = 'heatmap_SpearmanCorr.pdf'
    spearman_params = "--corMethod spearman --skipZero" + \
                    " --plotTitle \"Spearman Correlation of Read Counts\"" + \
                    " --whatToPlot heatmap --colorMap RdYlBu --plotNumbers"
@@ -95,12 +105,31 @@ def check_correlation(mbs):
    out, err = spearman_correlation.communicate()


+def check_pearson_correlation(mbs):
+    '''Check Pearson pairwise correlation of samples based on read counts.'''
+
+    pearson_filename = 'heatmap_PearsonCorr.pdf'
+    pearson_params = "--corMethod pearson --skipZero" + \
+                    " --plotTitle \"Pearson Correlation of Read Counts\"" + \
+                    " --whatToPlot heatmap --colorMap RdYlBu --plotNumbers"
+
+    pearson_command = (
+        "plotCorrelation -in %s %s -o %s"
+        % (mbs, pearson_params, pearson_filename)
+    )
+
+    logger.info("Running deeptools with %s", pearson_command)
+
+    pearson_correlation = subprocess.Popen(pearson_command, shell=True)
+    out, err = pearson_correlation.communicate()
+
+
 def check_coverage(design, extension):
    '''Asses the sequencing depth of samples.'''

    bam_files = ' '.join(design['bam_reads'])
    labels = ' '.join(design['sample_id'])
-    coverage_filename = 'coverage.png'
+    coverage_filename = 'coverage.pdf'
    coverage_params = "-n 1000000 --plotTitle \"Sample Coverage\"" + \
                    " --ignoreDuplicates --minMappingQuality 10"

@@ -136,7 +165,7 @@ def update_controls(design):
 def check_enrichment(sample_id, control_id, sample_reads, control_reads, extension):
    '''Asses the enrichment per sample.'''

-    fingerprint_filename = sample_id + '_fingerprint.png'
+    fingerprint_filename = sample_id + '_fingerprint.pdf'

    fingerprint_command = (
        "plotFingerprint -b %s %s --extendReads %d --labels %s %s --plotFile %s"
@@ -166,7 +195,8 @@ def main():

    # Run correlation
    mbs_filename = generate_read_summary(design_df, extension)
-    check_correlation(mbs_filename)
+    check_spearman_correlation(mbs_filename)
+    check_pearson_correlation(mbs_filename)

    # Run coverage
    check_coverage(design_df, extension)
@@ -175,11 +205,11 @@ def main():
    new_design_df = update_controls(design_df)
    for index, row in new_design_df.iterrows():
        check_enrichment(
-                            row['sample_id'],
-                            row['control_id'],
-                            row['bam_reads'],
-                            row['control_reads'],
-                            extension)
+            row['sample_id'],
+            row['control_id'],
+            row['bam_reads'],
+            row['control_reads'],
+            extension)


 if __name__ == '__main__':

--- a/workflow/scripts/generate_references.py
+++ b/workflow/scripts/generate_references.py
+#!/usr/bin/env python
+
+'''Make header for HTML of references.'''
+
+import argparse
+import subprocess
+import shlex
+import logging
+
+EPILOG = '''
+For more details:
+        %(prog)s --help
+'''
+
+# SETTINGS
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+logger.propagate = False
+logger.setLevel(logging.INFO)
+
+
+def get_args():
+    '''Define arguments.'''
+
+    parser = argparse.ArgumentParser(
+        description=__doc__, epilog=EPILOG,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+
+    parser.add_argument('-r', '--reference',
+                        help="The reference file (markdown format).",
+                        required=True)
+
+    parser.add_argument('-o', '--output',
+                        help="The out file name.",
+                        default='references')
+
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = get_args()
+    reference = args.reference
+    output = args.output
+
+    out_filename = output + '_mqc.yaml'
+
+    # Header for HTML
+    print('''
+        id: 'Software References'
+        section_name: 'Software References'
+        description: 'This section describes references for the tools used.'
+        plot_type: 'html'
+        data: |
+        '''
+    , file = open(out_filename, "w")
+    )
+
+    # Turn Markdown into HTML
+    references_html = 'bash -c "pandoc -p {} | sed \'s/^/                /\' >> {}"'
+    references_html = references_html.format(reference, out_filename)
+    subprocess.check_call(shlex.split(references_html))
+
+
+if __name__ == '__main__':
+    main()
No results found