Skip to content
Snippets Groups Projects
Commit 20daf0f9 authored by Gervaise Henry's avatar Gervaise Henry :cowboy:
Browse files

Merge branch 'develop' into '15-auto.bagit.download'

Develop

See merge request !6
parents f6b9f1f2 0147ece8
Branches
Tags
4 merge requests!37v0.0.1,!11Develop,!8Resolve "Add automated download of bagit through deriva",!6Develop
Pipeline #5263 passed with stage
in 58 seconds
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Created by https://www.gitignore.io/api/r,perl,macos,linux,python,windows
# Edit at https://www.gitignore.io/?templates=r,perl,macos,linux,python,windows
# Created by https://www.gitignore.io/api/r,perl,linux,macos,python,windows,microsoftoffice
# Edit at https://www.gitignore.io/?templates=r,perl,linux,macos,python,windows,microsoftoffice
### Linux ###
*~
......@@ -71,6 +45,27 @@ Network Trash Folder
Temporary Items
.apdisk
### MicrosoftOffice ###
*.tmp
# Word temporary
~$*.doc*
# Word Auto Backup File
Backup of *.doc*
# Excel temporary
~$*.xls*
# Excel Backup File
*.xlk
# PowerPoint temporary
~$*.ppt*
# Visio autosave temporary files
*.~vsd*
### Perl ###
!Build/
.last_cover_stats
......@@ -165,15 +160,6 @@ coverage.xml
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
......@@ -183,31 +169,22 @@ docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
......@@ -215,6 +192,11 @@ venv.bak/
# Rope project settings
.ropeproject
# Mr Developer
.mr.developer.cfg
.project
.pydevproject
# mkdocs documentation
/site
......@@ -226,9 +208,6 @@ dmypy.json
# Pyre type checker
.pyre/
### Python Patch ###
.venv/
### R ###
# History files
.Rhistory
......@@ -237,6 +216,9 @@ dmypy.json
# Session Data files
.RData
# User-specific files
.Ruserdata
# Example code in package build process
*-Ex.R
......@@ -257,7 +239,7 @@ vignettes/*.pdf
.httr-oauth
# knitr and R markdown default cache directories
/*_cache/
*_cache/
/cache/
# Temporary files created by R markdown
......@@ -271,6 +253,7 @@ vignettes/*.pdf
### Windows ###
# Windows thumbnail cache files
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db
......@@ -293,7 +276,7 @@ $RECYCLE.BIN/
# Windows shortcuts
*.lnk
# End of https://www.gitignore.io/api/r,perl,macos,linux,python,windows
# End of https://www.gitignore.io/api/r,perl,linux,macos,python,windows,microsoftoffice
# nextflow analysis folders/files
/test_data/*
......
# GitLab CI fragment: shared setup plus a single "unit" stage.
# NOTE(review): indentation appears flattened by the page scrape — confirm
# nesting against the original .gitlab-ci.yml before reusing this text.
before_script:
# Load cluster modules for Python/pytest tooling and Singularity.
- module add python/3.6.1-2-anaconda
- pip install --user pytest-pythonpath==0.7.1 pytest-cov==2.5.1
- module load singularity/3.0.2
# Symlink shared deriva cookies and a known test bdbag zip into test_data/.
- ln -sfn /project/BICF/BICF_Core/shared/gudmap/cookies/deriva-cookies.txt ./test_data/
- ln -sfn /project/BICF/BICF_Core/shared/gudmap/test_data/File_Q-Y53P.zip ./test_data/
stages:
- unit
# unit job: materialize and validate the test bdbag, then compare the md5sum
# of one fetched file against the checksum recorded in File.csv; exits 21 on
# mismatch so the pipeline fails with a distinguishable code.
unit:
stage: unit
script:
- singularity run 'docker://bicf/gudmaprbkfilexfer:1.1' bdbag --materialize ./test_data/File_Q-Y53P.zip
- singularity run 'docker://bicf/gudmaprbkfilexfer:1.1' bdbag --validate full ./test_data/File_Q-Y53P/
- if [[ $(md5sum test_data/File_Q-Y53P/data/assets/Study/Q-Y4H0/Experiment/Q-Y4BY/Replicate/Q-Y5F8/hMARIS_SIX2+_RiboDep#1.gene.rpkm.txt | awk '{ print $1 }') != $(cat test_data/File_Q-Y53P/data/File.csv | cut -d ',' -f10 | tail -1) ]]; then exit 21; fi;
......@@ -2,5 +2,6 @@
**User Facing**
**Background**
* Implementation of CI
**Known Bugs**
......@@ -5,10 +5,14 @@ process {
// Process specific configuration
// NOTE(review): each withName block below shows two `container` assignments;
// this looks like the old/new sides of a rendered diff. Only one should exist
// in the real file — presumably the gudmaprbkfilexfer:1.1 line (the new side);
// confirm against the post-merge nextflow.config.
withName:splitData {
container = 'docker://bicf/bdbag:1.0'
container = 'docker://bicf/gudmaprbkfilexfer:1.1'
}
withName:getData {
container = 'docker://bicf/bdbag:1.0'
container = 'docker://bicf/gudmaprbkfilexfer:1.1'
}
// trimData additionally requests large-memory queues for trim_galore.
withName:trimData {
container = 'docker://bicf/trimgalore:1.1'
queue = '256GB,256GBv1,384GB'
}
}
......
# Conda environment fragment for the bdbag tooling: pandas pinned to a py36
# build, bdbag itself installed via pip.
# NOTE(review): channels section not visible in this fragment — confirm the
# full environment.yml before reuse.
name: bdbag
dependencies:
- pandas=0.23.3=py36_0
- pip:
- bdbag==1.5.5
......@@ -13,12 +13,15 @@ bdbag = Channel
.ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" }
outDir = params.outDir
logsDir = "${outDir}/Logs"
/*
 * splitData: split bdbag files by replicate so fetch can occur in parallel, and rename files to replicate rid
*/
process splitData {
tag "${bdbag.baseName}"
executor 'local'
publishDir "${logsDir}/splitData", mode: 'symlink', pattern: "${bdbag.baseName}.splitData.err"
input:
file bdbag
......@@ -29,23 +32,24 @@ process splitData {
file("${bdbag.baseName}/data/File.csv") into fileMeta
file("${bdbag.baseName}/data/Experiment Settings.csv") into experimentSettingsMeta
file("${bdbag.baseName}/data/Experiment.csv") into experimentMeta
file ("${bdbag.baseName}.splitData.err")
script:
"""
hostname
ulimit -a
ln -sf `readlink -e cookies.txt` ~/.bdbag/deriva-cookies.txt
echo "LOG: deriva cookie linked"
study=`echo "${bdbag}" | cut -d '.' -f1`
echo "LOG: \${study}"
unzip ${bdbag}
echo "LOG: bdgag unzipped"
python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study}
echo "LOG: fetch file filtered for only .fastq.gz"
python3 ${baseDir}/scripts/splitFetch.py --fetchFile \${study}
echo "LOG: fetch file split by replicates"
sh ${baseDir}/scripts/splitBag.sh \${study}
echo "LOG: bag recreated with replicate split fetch file"
hostname >> ${bdbag.baseName}.splitData.err
ulimit -a >> ${bdbag.baseName}.splitData.err
ln -sf `readlink -e cookies.txt` ~/.bdbag/deriva-cookies.txt 2>>${bdbag.baseName}.splitData.err
echo "LOG: deriva cookie linked" >> ${bdbag.baseName}.splitData.err
study=`echo "${bdbag}" | cut -d '.' -f1` 2>>${bdbag.baseName}.splitData.err
echo "LOG: \${study}" >> ${bdbag.baseName}.splitData.err
unzip ${bdbag} 2>>${bdbag.baseName}.splitData.err
echo "LOG: bdgag unzipped" >> ${bdbag.baseName}.splitData.err
python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study} 2>>${bdbag.baseName}.splitData.err
echo "LOG: fetch file filtered for only .fastq.gz" >> ${bdbag.baseName}.splitData.err
python3 ${baseDir}/scripts/splitFetch.py --fetchFile \${study} 2>>${bdbag.baseName}.splitData.err
echo "LOG: fetch file split by replicates" >> ${bdbag.baseName}.splitData.err
sh ${baseDir}/scripts/splitBag.sh \${study} 2>>${bdbag.baseName}.splitData.err
echo "LOG: bag recreated with replicate split fetch file" >> ${bdbag.baseName}.splitData.err
"""
}
......@@ -54,24 +58,45 @@ process splitData {
*/
/*
 * getData: fetch the data files for one replicate bdbag produced upstream.
 * NOTE(review): this span is a rendered merge diff interleaving old and new
 * lines — e.g. two output declarations and duplicated hostname/ulimit/echo
 * commands (the `>>*.getData.err` variants appear to be the merged result).
 * Confirm against the post-merge main.nf before treating this as runnable.
 */
process getData {
tag "${rep.baseName}"
// Publish fetched fastqs and the per-replicate stderr log as symlinks.
publishDir "${outDir}/tempOut/fastqs", mode: "symlink"
publishDir "${logsDir}/getData", mode: 'symlink', pattern: "${rep.baseName}.getData.err"
input:
each rep from bdbagSplit
output:
// NOTE(review): old-side bare fastq path vs new-side (repID, reads) tuple
// feeding the trimming channel — presumably only the tuple survives the merge.
path ("*.R*.fastq.gz", type: 'file', maxDepth: '0') into fastq
set val ("${rep.baseName}"), file ("*.R{1,2}.fastq.gz") into trimming
// Shell script: unzip the replicate bdbag and fetch its payload via
// scripts/bdbagFetch.sh, appending progress/errors to <rep>.getData.err.
script:
"""
hostname
ulimit -a
hostname >>${rep.baseName}.getData.err
ulimit -a >>${rep.baseName}.getData.err
export https_proxy=\${http_proxy}
replicate=\$(basename "${rep}" | cut -d '.' -f1)
echo "LOG: \${replicate}"
unzip ${rep}
echo "LOG: replicate bdbag unzipped"
sh ${baseDir}/scripts/bdbagFetch.sh \${replicate}
echo "LOG: replicate bdbag fetched"
echo "LOG: \${replicate}" >>${rep.baseName}.getData.err
unzip ${rep} 2>>${rep.baseName}.getData.err
echo "LOG: replicate bdbag unzipped" >>${rep.baseName}.getData.err
sh ${baseDir}/scripts/bdbagFetch.sh \${replicate} 2>>${rep.baseName}.getData.err
echo "LOG: replicate bdbag fetched" >>${rep.baseName}.getData.err
"""
}
}
/*
 * trimData: trims adapters from the paired-end reads of one replicate using
 * trim_galore; emits gzipped *_val_{1,2}.fq.gz files.
 */
process trimData {
tag "trim-${repID}"
// Trimmed fastqs go to tempOut/trimmed; logs are published separately.
publishDir "${outDir}/tempOut/trimmed", mode: "symlink", pattern: "*_val_{1,2}.fq.gz"
// NOTE(review): the escaped \${rep} here is a shell variable, not a Groovy
// one, so this publishDir pattern is the literal glob "${rep}.trimData.*";
// also the .log/.err files are not declared in `output:` — verify these log
// files actually get published.
publishDir "${logsDir}/trimData", mode: 'symlink', pattern: "\${rep}.trimData.*"
input:
// (repID, paired fastq files) tuple from the getData trimming channel.
set repID, reads from trimming
output:
path ("*_val_{1,2}.fq.gz", type: 'file', maxDepth: '0')
// Shell: strip the leading token from repID (text before the first '_') to
// name outputs, then run trim_galore in paired mode on reads[0]/reads[1],
// capturing stdout/stderr to <rep>.trimData.log/.err.
script:
"""
rep=`echo ${repID} | cut -f2- -d '_'`;
trim_galore --gzip --max_n 1 --paired --basename \${rep} -j `nproc` ${reads[0]} ${reads[1]} 1>>\${rep}.trimData.log 2>>\${rep}.trimData.err;
"""
}
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment