Merge branch '39-astrocyte_cli' into 'master'

Resolve "Tests for astrocyte" Closes #39 See merge request !27

Merge branch '39-astrocyte_cli' into 'master'
Resolve "Tests for astrocyte" Closes #39 See merge request !27
54529fc9 · Holly Ruess · ad75fe9a · c34d2612 · 54529fc9 · 54529fc9
Commit 54529fc9 authored 4 years ago by Holly Ruess
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -6,21 +6,31 @@ before_script:
 stages:
  - unit
+  - astrocyte
  - integration
 user_configuration:
  stage: unit
  script:
-  - pytest -m unit
  - pytest -m unit --cov=./workflow/scripts
+astrocyte:
+  stage: astrocyte
+  script:
+  - module load astrocyte/0.2.0
+  - module unload nextflow
+  - cd ..
+  - astrocyte_cli validate atacseq_analysis
 single_end_human:
  stage: integration
  only:
    - branches
    - master
  script:
-  - NXF_OPTS="-Dleveldb.mmap=false" nextflow run workflow/main.nf
+  - NXF_OPTS="-Dleveldb.mmap=false" nextflow run workflow/main.nf --ci true --dev true
  - pytest -m singleend_human
@@ -30,5 +40,5 @@ paired_end_mouse:
    - branches
    - master
  script:
-  - NXF_OPTS="-Dleveldb.mmap=false" nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_ENCSR451NAE_PE.txt" --genome 'GRCm38' --pairedEnd true --blacklist true
+  - NXF_OPTS="-Dleveldb.mmap=false" nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_ENCSR451NAE_PE.txt" --genome 'GRCm38' --pairedEnd true --blacklist true --astrocyte true --ci true --dev true
  - pytest -m pairedend_mouse
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,7 +2,7 @@
 All notable changes to this project will be documented in this file.
-## [Unreleased]
+## [publish_2.0.0 ] - 2020-06-12
 ### Fixed
 - Removed biosample, factor, treatment from design file
 - Updated documentation
@@ -28,4 +28,3 @@ All notable changes to this project will be documented in this file.
 ## [publish_1.0.0 ] - 2019-12-03
 Initial release of pipeline
--- a/README.md
+++ b/README.md
@@ -25,11 +25,11 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
 ```
 ## Input files
-##### 1) Fastq Files
+### Fastq Files
  + You will need the full path to the files for the Bash Scipt
-## Design file
+### Design file
  + The Design file is a tab-delimited file with 4 columns for Single-End and 5 columns for Paired-End.  Letter, numbers, and underlines can be used in the names. However, the names must begin with a letter. Columns must be as follows:
    1. sample_id - The id of the sample. This will be the header in output files, please make sure it is concise
@@ -37,7 +37,7 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
    3. replicate - Replicate number
    4. fastq_read1 - Name of fastq file 1 for SE or PE data
    5. fastq_read2 - Name of fastq file 2 for PE data
  + See [HERE](/docs/design_ENCSR451NAE_PE.txt) for an example design file, paired-end
  + See [HERE](/docs/design_ENCSR265ZXX_SE.txt) for an example design file, single-end
@@ -112,9 +112,8 @@ If you find an error, please let the [BICF](mailto:BICF@UTSouthwestern.edu) know
 ## Citation
-Please cite individual programs and versions used [HERE](docs/references.md), and the pipeline doi: coming soon. Please cite in publications: Pipeline was developed by BICF from funding provided by Cancer Prevention and Research Institute of Texas (RP150596).
+Please cite individual programs and versions of pipeline used [HERE](docs/references.md), and the overall pipeline doi: 10.5281/zenodo.3526149. Please cite in publications: Pipeline was developed by BICF from funding provided by Cancer Prevention and Research Institute of Texas (RP150596).
 ### Credits
 This example worklow is derived from original scripts kindly contributed by the Bioinformatic Core Facility ([BICF](https://www.utsouthwestern.edu/labs/bioinformatics/)), in the [Department of Bioinformatics](https://www.utsouthwestern.edu/departments/bioinformatics/).
--- a/astrocyte_pkg.yml
+++ b/astrocyte_pkg.yml
@@ -46,12 +46,12 @@ workflow_modules:
  - 'bwa/intel/0.7.12'
  - 'samtools/1.4.1'
  - 'sambamba/0.6.6'
-  - 'bedtools/2.26.0'
+  - 'bedtools/2.25.0'
  - 'deeptools/2.5.0.1'
  - 'phantompeakqualtools/1.2'
  - 'macs/2.1.0-20151222'
  - 'UCSC_userApps/v317'
-  - 'singularity/2.6.1'
+  - 'R/3.3.2-gccmkl'
  - 'pandoc/2.7'
  - 'singularity/3.0.2'
@@ -95,7 +95,7 @@ workflow_parameters:
    description: |
      One or more input FASTQ files from a ATAC-seq expereiment and a design
      file with the link bewetwen the same file name and sample id
-    regex: ".*(fastq|fq)*"
+    regex: ".*(fastq|fq)*gz"
  - id: pairedEnd
    type: select
@@ -117,7 +117,7 @@ workflow_parameters:
      - [ 'true', 'True']
      - [ 'false', 'False']
    description: |
-      The Blacklisted Regions aim to identify a comprehensive set of regions 
+      The Blacklisted Regions aim to identify a comprehensive set of regions
      that have anomalous, unstructured, high signal/read counts in next gen
      sequencing experiments independent of cell line and type of experiment.
      If you would like these regions excluded from replicated peaks, select
@@ -134,12 +134,22 @@ workflow_parameters:
    type: select
    choices:
      - [ 'GRCh38', 'Human GRCh38']
-      - [ 'GRCh38', 'Mouse GRCh38']
+      - [ 'GRCm38', 'Mouse GRCm38']
    required: true
    description: |
      Reference species and genome used for alignment and subsequent analysis.
+  - id: astrocyte
+    type: select
+    choices:
+      - [ 'true', 'true' ]
+    required: true
+    default: 'true'
+    description: |
+      Ensure configuration for astrocyte.
 # -----------------------------------------------------------------------------
 # SHINY APP CONFIGURATION
 # -----------------------------------------------------------------------------
@@ -148,7 +158,7 @@ workflow_parameters:
 #            The workflow must publish all final output into $baseDir
 # Name of the R module that the vizapp will run against
-vizapp_r_module: 'R/3.2.1-intel'
+vizapp_r_module: 'R/3.4.1-gccmkl'
 # List of any CRAN packages, not provided by the modules, that must be made
 # available to the vizapp
@@ -158,8 +168,4 @@ vizapp_cran_packages:
 # List of any Bioconductor packages, not provided by the modules,
 # that must be made available to the vizapp
-vizapp_bioc_packages:
+vizapp_bioc_packages: []
-#  - qusage
-#  - ballgown
-vizapp_github_packages:
-  - js229/Vennerable
--- a/docs/index.md
+++ b/docs/index.md
-# Astrocyte ATAC-seq analysis Workflow Package
+# BICF ATAC-seq Analysis Workflow
-[![pipeline Status](https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/badges/master/pipeline.svg)](https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/commits/master)
-[![Coverage Report](https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/badges/master/coverage.svg)](https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis/commits/master)
-[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A50.24.0-brightgreen.svg
-)](https://www.nextflow.io/)
-[![Astrocyte](https://img.shields.io/badge/astrocyte-%E2%89%A50.1.0-blue.svg)](https://astrocyte-test.biohpc.swmed.edu/static/docs/index.html)
 ## Introduction
@@ -12,22 +6,10 @@ BICF ATAC-seq is a bioinformatics best-practice analysis pipeline used for ATAC-
 The pipeline uses [Nextflow](https://www.nextflow.io), a bioinformatics workflow tool. It pre-processes raw data from FastQ inputs, aligns the reads and performs extensive quality-control on the results.
-This pipeline is primarily used with a SLURM cluster on the [BioHPC Cluster](https://portal.biohpc.swmed.edu/content/). However, the pipeline should be able to run on any system that supports Nextflow.
-Additionally, the pipeline is designed to work with [Astrocyte Workflow System](https://astrocyte.biohpc.swmed.edu/static/docs/index.html) using a simple web interface.
-Current version of the software and issue reports are at
-https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis
-To download the current (working not tagged) version of the software
-```bash
-$ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
-```
 ## Input files
 ##### 1) Fastq Files
-  + You will need the full path to the files for the Bash Scipt
+  + One or more input FASTQ files from an ATAC-seq experiment
 ## Design file
  + The Design file is a tab-delimited file with 4 columns for Single-End and 5 columns for Paired-End.  Letter, numbers, and underlines can be used in the names. However, the names must begin with a letter. Columns must be as follows:
@@ -37,7 +19,7 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
    3. replicate - Replicate number
    4. fastq_read1 - Name of fastq file 1 for SE or PE data
    5. fastq_read2 - Name of fastq file 2 for PE data
  + See [HERE](/docs/design_ENCSR451NAE_PE.txt) for an example design file, paired-end
  + See [HERE](/docs/design_ENCSR265ZXX_SE.txt) for an example design file, single-end
@@ -112,9 +94,8 @@ If you find an error, please let the [BICF](mailto:BICF@UTSouthwestern.edu) know
 ## Citation
-Please cite individual programs and versions used [HERE](docs/references.md), and the pipeline doi: coming soon. Please cite in publications: Pipeline was developed by BICF from funding provided by Cancer Prevention and Research Institute of Texas (RP150596).
+Please cite individual programs and versions of pipeline used [HERE](docs/references.md), and the overall pipeline doi: 10.5281/zenodo.3526149. Please cite in publications: Pipeline was developed by BICF from funding provided by Cancer Prevention and Research Institute of Texas (RP150596).
 ### Credits
 This example worklow is derived from original scripts kindly contributed by the Bioinformatic Core Facility ([BICF](https://www.utsouthwestern.edu/labs/bioinformatics/)), in the [Department of Bioinformatics](https://www.utsouthwestern.edu/departments/bioinformatics/).
--- a/docs/references.md
+++ b/docs/references.md
@@ -40,7 +40,11 @@
 13. **MultiQc**:
  * Ewels P., Magnusson M., Lundin S. and Käller M. 2016. MultiQC: Summarize analysis results for multiple tools and samples in a single report. Bioinformatics 32(19): 3047–3048. doi:[10.1093/bioinformatics/btw354](https://dx.doi.org/10.1093/bioinformatics/btw354)
-14. **Nextflow**:
+14. **BICF ATAC-seq Analysis Workflow**:
+  *  Holly Ruess, Spencer D. Barnes and Venkat S. Malladi. 2020. BICF ATAC-seq Analysis Workflow (publish_2.0.0). Zenodo. doi:[10.5281/zenodo.3891417](https://doi.org/10.5281/zenodo.3891417)
+15. **Nextflow**:
  * Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., and Notredame, C. 2017. Nextflow enables reproducible computational workflows. Nature biotechnology, 35(4), 316.
 Please cite in publications: Pipeline was developed by BICF from funding provided by **Cancer Prevention and Research Institute of Texas (RP150596)**.
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -50,7 +50,7 @@ process {
    executor = 'local'
  }
  withName: experimentQC {
-    module = ['python/3.6.1-2-anaconda', 'deeptools/2.5.0.1', 'samtools/1.4.1', 'bedtools/2.25.0', 'singularity/2.6.1']
+    module = ['python/3.6.1-2-anaconda', 'deeptools/2.5.0.1', 'samtools/1.4.1', 'bedtools/2.25.0', 'singularity/3.0.2']
    queue = '128GB,256GB,256GBv1'
  }
  withName: multiqcReport {
@@ -81,19 +81,3 @@ params {
    }
  }
 }
-trace {
-  enabled = true
-  file = 'pipeline_trace.txt'
-  fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss'
-}
-timeline {
-  enabled = true
-  file = 'timeline.html'
-}
-report {
-  enabled = true
-  file = 'report.html'
-}
--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -20,6 +20,8 @@ params.astrocyte = false
 params.outDir= "${baseDir}/output"
 params.references = "${baseDir}/../docs/references.md"
 params.multiqc =  "${baseDir}/conf/multiqc_config.yaml"
+params.ci = false
+params.dev = false
 // Check inputs
 if(params.bwaIndex) {
@@ -50,6 +52,33 @@ gtfFile = params.gtfFile
 blacklistFile = params.blacklistFile
 geneNames = params.geneNames
+/*
+ * trackStart: track start of pipeline
+ */
+process trackStart {
+  script:
+  """
+  hostname
+  ulimit -a
+  export https_proxy=\${http_proxy}
+  curl -H 'Content-Type: application/json' -X PUT -d '{ \
+      "sessionId": "${workflow.sessionId}", \
+      "pipeline": "atacseq_analysis", \
+      "start": "${workflow.start}", \
+      "astrocyte": ${params.astrocyte}, \
+      "status": "started", \
+      "nextflowVersion": "${workflow.nextflow.version}", \
+      "pipelineVersion": "2.0.0", \
+      "ci": ${params.ci}, \
+      "dev": ${params.dev}}' \
+  "https://xku43pcwnf.execute-api.us-east-1.amazonaws.com/ProdDeploy/pipeline-tracking"
+  """
+}
 process checkDesignFile {
  publishDir "${outDir}/design", mode: 'copy'
@@ -97,7 +126,7 @@ if (pairedEnd) {
  rawReads = designFilePaths
    .splitCsv(sep: '\t', header: true)
    .map { row -> [ row.sample_id, [row.fastq_read1, row.fastq_read2], row.experiment_id, row.replicate, row.fq_length ] }
-} 
+}
 else {
 rawReads = designFilePaths
  .splitCsv(sep: '\t', header: true)
@@ -518,14 +547,14 @@ process consensusPeaks {
        if (blacklist) {
          """
          module load python/3.6.1-2-anaconda
-          module load bedtools/2.26.0
+          module load bedtools/2.25.0
          python3 ${baseDir}/scripts/overlap_peaks.py -d ${peaksDesign} -f ${preDiffDesign} -b ${blacklistFile}
          """
        }
        else {
          """
          module load python/3.6.1-2-anaconda
-          module load bedtools/2.26.0
+          module load bedtools/2.25.0
          python3 ${baseDir}/scripts/overlap_peaks.py -d ${peaksDesign} -f ${preDiffDesign}
          """
        }
@@ -602,7 +631,7 @@ process experimentQC {
          module load bedtools/2.26.0
          python3 ${baseDir}/scripts/experiment_qc.py -d ${designExperimentQC} -p
          bash ${baseDir}/scripts/make_tss.sh ${gtfFile}
-          module load singularity/2.6.1
+          module load singularity/3.0.2
          singularity run /project/shared/bicf_workflow_ref/singularity_images/metaseq.simg ${baseDir}/scripts/atac_qc.py -d ${designExperimentQC} -l ${fqlengthDesign} -t gencode.tss -c ${chromSizes}
          """
      }
@@ -614,7 +643,7 @@ process experimentQC {
          module load bedtools/2.26.0
          python3 ${baseDir}/scripts/experiment_qc.py -d ${designExperimentQC}
          bash ${baseDir}/scripts/make_tss.sh ${gtfFile}
-          module load singularity/2.6.1
+          module load singularity/3.0.2
          singularity run /project/shared/bicf_workflow_ref/singularity_images/metaseq.simg ${baseDir}/scripts/atac_qc.py -d ${designExperimentQC} -l ${fqlengthDesign} -t gencode.tss -c ${chromSizes}
          """
      }
@@ -693,4 +722,3 @@ process multiqcReport {
    }
 }
--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
@@ -3,3 +3,29 @@ profiles {
    includeConfig 'conf/biohpc.config'
  }
 }
+trace {
+  enabled = true
+  file = 'pipeline_trace.txt'
+  fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss'
+}
+timeline {
+  enabled = true
+  file = 'timeline.html'
+}
+report {
+  enabled = true
+  file = 'report.html'
+}
+manifest {
+  name = 'atacseq_analysis'
+  description = 'BICF ATAC-seq Analysis Workflow.'
+  homePage = 'https://git.biohpc.swmed.edu/BICF/Astrocyte/atacseq_analysis'
+  version = '2.0.0'
+  mainScript = 'main.nf'
+  nextflowVersion = '>=0.31.0'
+}
--- a/workflow/scripts/overlap_peaks.py
+++ b/workflow/scripts/overlap_peaks.py
@@ -40,8 +40,7 @@ def get_args():
    parser.add_argument('-b', '--blacklist',
                        help="Bed file of blacklisted regions to remove",
-                        required=False,
+                        required=False)
-                        default="None")
    args = parser.parse_args()
    return args
@@ -240,18 +239,19 @@ def main():
    design_anno = pd.DataFrame(columns=anno_cols)
    # Find consenus overlap peaks for each experiment
    for experiment, df_experiment in design_peaks_df.groupby('experiment_id'):
        replicated_peak, chr_peak = overlap(experiment, df_experiment)
        design_diff.loc[design_diff.experiment_id == experiment, "peak"] = replicated_peak
        design_anno.loc[experiment] = [experiment, chr_peak]
-    # Remove blacklist regions; if blacklist = True
+        # Remove blacklist regions; if blacklist = True
-    if os.path.exists(blacklist):
+        if blacklist and os.path.exists(blacklist):
-        for experiment, df_experiment in design_peaks_df.groupby('experiment_id'):
            bl_peaks, bl_chr_peak = blacklist_peaks(experiment, blacklist)
            design_diff.loc[design_diff.experiment_id == experiment, "peak"] = bl_peaks
            design_anno.loc[design_anno.Condition == experiment, "Peaks"] = bl_chr_peak
    # Write out file
    design_diff.columns = ['SampleID',
                           'bamReads',

--- a/workflow/tests/test_generate_software_references.py
+++ b/workflow/tests/test_generate_software_references.py
@@ -17,7 +17,7 @@ def test_software_references_singleend_human():
    software_references = os.path.join(test_output_path, 'software_references_mqc.yaml')
    with open(software_references, 'r') as stream:
        data_loaded = yaml.load(stream)
-    assert len(data_loaded['data'].split('<ul>')) == 15
+    assert len(data_loaded['data'].split('<ul>')) == 16
    multiqc_report = os.path.join(test_output_path, 'multiqc_report.html')
    html_file = open(multiqc_report, 'r')
@@ -25,7 +25,7 @@ def test_software_references_singleend_human():
    multiqc_html = BeautifulSoup(source_code, 'html.parser')
    references = multiqc_html.find(id="mqc-module-section-Software_References")
    assert references is not None
-    assert len(references.find_all('ul')) == 14
+    assert len(references.find_all('ul')) == 15
@@ -35,7 +35,7 @@ def test_software_references_pairedend_mouse():
    software_references = os.path.join(test_output_path, 'software_references_mqc.yaml')
    with open(software_references, 'r') as stream:
        data_loaded = yaml.load(stream)
-    assert len(data_loaded['data'].split('<ul>')) == 15
+    assert len(data_loaded['data'].split('<ul>')) == 16
    multiqc_report = os.path.join(test_output_path, 'multiqc_report.html')
    html_file = open(multiqc_report, 'r')
@@ -43,5 +43,4 @@ def test_software_references_pairedend_mouse():
    multiqc_html = BeautifulSoup(source_code, 'html.parser')
    references = multiqc_html.find(id="mqc-module-section-Software_References")
    assert references is not None
-    assert len(references.find_all('ul')) == 14
+    assert len(references.find_all('ul')) == 15