Merge branch 'develop' into '2-process_createManifest'

Develop See merge request !11

Merge branch 'develop' into '2-process_createManifest'
Develop See merge request !11
47fc8ba7 · Gervaise Henry · 0bd693e4 · 947e7543 · 47fc8ba7 · 47fc8ba7
Commit 47fc8ba7 authored 5 years ago by Gervaise Henry
--- a/.gitignore
+++ b/.gitignore
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class

-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-
-# PyInstaller
-# Created by https://www.gitignore.io/api/r,perl,macos,linux,python,windows
-# Edit at https://www.gitignore.io/?templates=r,perl,macos,linux,python,windows
+# Created by https://www.gitignore.io/api/r,perl,linux,macos,python,windows,microsoftoffice
+# Edit at https://www.gitignore.io/?templates=r,perl,linux,macos,python,windows,microsoftoffice

 ### Linux ###
 *~
@@ -71,6 +45,27 @@ Network Trash Folder
 Temporary Items
 .apdisk

+### MicrosoftOffice ###
+*.tmp
+
+# Word temporary
+~$*.doc*
+
+# Word Auto Backup File
+Backup of *.doc*
+
+# Excel temporary
+~$*.xls*
+
+# Excel Backup File
+*.xlk
+
+# PowerPoint temporary
+~$*.ppt*
+
+# Visio autosave temporary files
+*.~vsd*
+
 ### Perl ###
 !Build/
 .last_cover_stats
@@ -165,15 +160,6 @@ coverage.xml
 *.mo
 *.pot

-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-
-# Flask stuff:
-instance/
-.webassets-cache
-
 # Scrapy stuff:
 .scrapy

@@ -183,31 +169,22 @@ docs/_build/
 # PyBuilder
 target/

-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
 # pyenv
 .python-version

+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
 # celery beat schedule file
 celerybeat-schedule

 # SageMath parsed files
 *.sage.py

-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
 # Spyder project settings
 .spyderproject
 .spyproject
@@ -215,6 +192,11 @@ venv.bak/
 # Rope project settings
 .ropeproject

+# Mr Developer
+.mr.developer.cfg
+.project
+.pydevproject
+
 # mkdocs documentation
 /site

@@ -226,9 +208,6 @@ dmypy.json
 # Pyre type checker
 .pyre/

-### Python Patch ###
-.venv/
-
 ### R ###
 # History files
 .Rhistory
@@ -237,6 +216,9 @@ dmypy.json
 # Session Data files
 .RData

+# User-specific files
+.Ruserdata
+
 # Example code in package build process
 *-Ex.R

@@ -257,7 +239,7 @@ vignettes/*.pdf
 .httr-oauth

 # knitr and R markdown default cache directories
-/*_cache/
+*_cache/
 /cache/

 # Temporary files created by R markdown
@@ -271,6 +253,7 @@ vignettes/*.pdf
 ### Windows ###
 # Windows thumbnail cache files
 Thumbs.db
+Thumbs.db:encryptable
 ehthumbs.db
 ehthumbs_vista.db

@@ -293,7 +276,7 @@ $RECYCLE.BIN/
 # Windows shortcuts
 *.lnk

-# End of https://www.gitignore.io/api/r,perl,macos,linux,python,windows
+# End of https://www.gitignore.io/api/r,perl,linux,macos,python,windows,microsoftoffice

 # nextflow analysis folders/files
 /test_data/*

--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
+# Commands run before every job: load the Python, Singularity and Nextflow
+# modules, install the pytest plugins, link the shared test data into
+# ./test_data/ and create the directories that the credential and cookie
+# symlinks in the jobs below are written to.
+before_script:
+  - module add  python/3.6.1-2-anaconda
+  - pip install --user pytest-pythonpath==0.7.1 pytest-cov==2.5.1
+  - module load singularity/3.0.2
+  - module load nextflow/19.09.0
+  - ln -sfn /project/BICF/BICF_Core/shared/gudmap/test_data/* ./test_data/
+  - mkdir -p ~/.deriva
+  - mkdir -p ~/.bdbag
+
+# Unit jobs exercise one pipeline step each; integration jobs run the whole
+# Nextflow workflow.
+stages:
+  - unit
+  - integration
+
+# Downloads the bag for replicate 16-1ZX4 with deriva-download-cli inside the
+# file-transfer container, then runs the pytest tests marked getBag.
+getBag:
+  stage: unit
+  script:
+  - ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json
+  - singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' deriva-download-cli dev.gudmap.org --catalog 2 ./workflow/conf/replicate_export_config.json . rid=16-1ZX4
+  - pytest -m getBag
+
+# Unzips the pre-staged bag from the test data, fetches its files with
+# bdbagFetch.sh, then runs the pytest tests marked getData.
+getData:
+  stage: unit
+  script:
+  - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt
+  - unzip ./test_data/bagit/Replicate_16-1ZX4
+  - singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' sh ./workflow/scripts/bdbagFetch.sh Replicate_16-1ZX4 16-1ZX4
+  - pytest -m getData
+
+# Runs trim_galore once on a single fastq (16-1ZX4) and once with --paired
+# on an R1/R2 pair (Q-Y5JA); the thread count is capped at 8.
+trimData:
+  stage: unit
+  script:
+  - if [ `nproc` -gt 8 ]; then ncore=8; else ncore=`nproc`; fi
+  - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --basename 16-1ZX4 -j ${ncore} ./test_data/fastq/16-1ZX4.R1.fastq.gz
+  - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename Q-Y5JA -j ${ncore} ./test_data/fastq/Q-Y5JA.R1.fastq.gz ./test_data/fastq/Q-Y5JA.R2.fastq.gz
+  - pytest -m trimData
+
+# Full pipeline run for replicate 16-1ZX4 (the replicate trimmed without
+# --paired in the trimData job above).
+integration_se:
+  stage: integration
+  script:
+  - nextflow run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 16-1ZX4
+
+# Full pipeline run for replicate Q-Y5JA (the replicate trimmed with --paired
+# in the trimData job above).
+integration_pe:
+  stage: integration
+  script:
+  - nextflow run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5JA
\ No newline at end of file
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,5 +2,6 @@
 **User Facing**

 **Background**
+* Implementation of CI

 *Known Bugs*
--- a/README.md
+++ b/README.md
-<!--
 |*master*|*develop*|
 |:-:|:-:|
 |[![Build Status](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/badges/master/build.svg)](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/commits/master)|[![Build Status](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/badges/develop/build.svg)](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/commits/develop)|
-
+<!--
 [![DOI]()]()
 -->
 GUDMAP/RBK RNA-Seq Pipeline
@@ -10,16 +9,19 @@ GUDMAP/RBK RNA-Seq Pipeline

 Introduction
 ------------
-
+This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.

 To Run:
 -------
 * Available parameters:
+  * *--deriva* active ```credential.json``` file from [deriva-auth](https://github.com/informatics-isi-edu/gudmap-rbk/wiki/Uploading-files-via-Deriva-client-tools#from-a-remote-server)
+  * *--bdbag* active ```cookies.txt``` file from [deriva-auth](https://github.com/informatics-isi-edu/gudmap-rbk/wiki/Uploading-files-via-Deriva-client-tools#from-a-remote-server)
+  * *--repRID* mRNA-seq replicate RID\
+  note: once deriva-auth is run and authenticated, the two files above are saved in ```~/.deriva/``` (see official documents from [deriva](https://github.com/informatics-isi-edu/deriva-client#installer-packages-for-windows-and-macosx) on the lifetime of the credentials)
 * FULL EXAMPLE:
  ```
-  nextflow run workflow/rna-seq.nf
+  nextflow run workflow/rna-seq.nf --deriva ./data/credential.json --bdbag ./data/cookies.txt --repRID Q-Y5JA
  ```
-* Design example:




--- a/workflow/conf/aws_ondemand.config
+++ b/workflow/conf/aws_ondemand.config
+// Settings included by the aws_ondemand profile: tasks are submitted to AWS
+// Batch and the Nextflow work directory lives in the S3 bucket below.
+workDir = 's3://gudmap.rbk/work'
+aws.client.storageEncryption = 'AES256'
+aws {
+  region = 'us-east-2'
+  batch {
+    // Path of the aws command line tool used by the batch jobs
+    cliPath = '/home/ec2-user/miniconda/bin/aws'
+  }
+}
+
+process {
+  // Defaults for every process; the withName blocks below override them
+  executor = 'awsbatch'
+  // NOTE(review): presumably the on-demand Batch queue (aws_spot.config uses 'default-...') - confirm
+  queue = 'highpriority-3278a8b0-1fc8-11ea-b1ac-021e2396e2cc'
+  cpus = 1
+  memory = '1 GB'
+
+  // Per-process container images; trimData also requests more CPUs
+  withName:getBag {
+    container = 'bicf/gudmaprbkfilexfer:1.3'
+  }
+  withName:getData {
+    container = 'bicf/gudmaprbkfilexfer:1.3'
+  }
+  withName:trimData {
+    container = 'bicf/trimgalore:1.1'
+    cpus = 15
+  }
+}
\ No newline at end of file
--- a/workflow/conf/aws_spot.config
+++ b/workflow/conf/aws_spot.config
+// Settings included by the aws_spot profile: tasks are submitted to AWS
+// Batch and the Nextflow work directory lives in the S3 bucket below.
+workDir = 's3://gudmap.rbk/work'
+aws.client.storageEncryption = 'AES256'
+aws {
+  region = 'us-east-2'
+  batch {
+    // Path of the aws command line tool used by the batch jobs
+    cliPath = '/home/ec2-user/miniconda/bin/aws'
+  }
+}
+
+process {
+  // Defaults for every process; the withName blocks below override them
+  executor = 'awsbatch'
+  // NOTE(review): presumably the spot-instance Batch queue (aws_ondemand.config uses 'highpriority-...') - confirm
+  queue = 'default-3278a8b0-1fc8-11ea-b1ac-021e2396e2cc'
+  cpus = 1
+  memory = '1 GB'
+
+  // Per-process container images; trimData also requests more CPUs
+  withName:getBag {
+    container = 'bicf/gudmaprbkfilexfer:1.3'
+  }
+  withName:getData {
+    container = 'bicf/gudmaprbkfilexfer:1.3'
+  }
+  withName:trimData {
+    container = 'bicf/trimgalore:1.1'
+    cpus = 15
+  }
+}
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -3,41 +3,20 @@ process {
  queue = 'super'
  clusterOptions = '--hold'

-  // Process specific configuration
-  withName:splitData {
-    container = 'docker://bicf/gudmaprbkfilexfer:1.0'
+  withName:getBag {
+    executor = 'local'
+    container = 'docker://bicf/gudmaprbkfilexfer:1.3'
  }
  withName:getData {
-    container = 'docker://bicf/gudmaprbkfilexfer:1.0'
+    executor = 'local'
+    container = 'docker://bicf/gudmaprbkfilexfer:1.3'
  }
  withName:trimData {
-    container = 'docker://bicf/trimgalore:1.0'
+    container = 'docker://bicf/trimgalore:1.1'
    queue = '256GB,256GBv1,384GB'
  }
 }

-
-trace {
-  enabled = true
-  file = 'pipeline_trace.txt'
-  fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss'
-}
-
-timeline {
-  enabled = true
-  file = 'timeline.html'
-}
-
-report {
-  enabled = true
-  file = 'report.html'
-}
-
-tower {
-  accessToken = '3ade8f325d4855434b49aa387421a44c63e3360f'
-  enabled = true
-}
-
 singularity {
  enabled = true
  cacheDir = '/project/shared/bicf_workflow_ref/singularity_images/'
@@ -47,4 +26,4 @@ env {
  http_proxy = 'http://proxy.swmed.edu:3128'
  https_proxy = 'http://proxy.swmed.edu:3128'
  all_proxy = 'http://proxy.swmed.edu:3128'
-}
+}
\ No newline at end of file
--- a/workflow/conf/conda.env.bdbag.yml
+++ b/workflow/conf/conda.env.bdbag.yml
-name: bdbag
-dependencies:
-  - pandas=0.23.3=py36_0
-  - pip:
-    - bdbag==1.5.5
--- a/workflow/conf/replicate_export_config.json
+++ b/workflow/conf/replicate_export_config.json
+{
+  "bag": {
+    "bag_name": "Replicate_{rid}",
+    "bag_algorithms": [
+      "md5"
+    ],
+    "bag_archiver": "zip"
+  },
+  "catalog": {
+    "query_processors": [
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Study",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Study_RID)=(RNASeq:Study:RID)/Study_RID:=RID,Internal_ID,Title,Summary,Overall_Design,GEO_Series_Accession_ID,GEO_Platform_Accession_ID,Funding,Pubmed_ID,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Experiment",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment:RID)/Experiment_RID:=RID,Study_RID,Internal_ID,Name,Description,Experiment_Method,Sequencing_Type,Species,Specimen_Type,Molecule_Type,Pooled_Sample,Pool_Size,Markers,Cell_Count,Treatment_Protocol,Treatment_Protocol_Reference,Isolation_Protocol,Isolation_Protocol_Reference,Growth_Protocol,Growth_Protocol_Reference,Label_Protocol,Label_Protocol_Reference,Hybridization_Protocol,Hybridization_Protocol_Reference,Scan_Protocol,Scan_Protocol_Reference,Data_Processing,Value_Definition,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Experiment Antibodies",
+          "query_path": "/entity/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Antibodies:Experiment_RID)?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Experiment Custom Metadata",
+          "query_path": "/entity/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Custom_Metadata:Experiment_RID)?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Experiment Settings",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Settings:Experiment_RID)/RID,Experiment_RID,Alignment_Format,Aligner,Aligner_Version,Reference_Genome,Sequence_Trimming,Duplicate_Removal,Pre-alignment_Sequence_Removal,Junction_Reads,Library_Type,Protocol_Reference,Library_Selection,Quantification_Format,Quantification_Software,Expression_Metric,Transcriptome_Model,Sequencing_Platform,Paired_End,Read_Length,Has_Strand_Specific_Information,Used_Spike_Ins,Spike_Ins_Amount,Visualization_Format,Visualization_Software,Visualization_Version,Visualization_Setting,Notes,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Replicate",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/RID,Study_RID,Experiment_RID,Biological_Replicate_Number,Technical_Replicate_Number,Specimen_RID,Collection_Date,Mapped_Reads,GEO_Sample_Accession_ID,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Specimen",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/S:=(Specimen_RID)=(Gene_Expression:Specimen:RID)/T:=left(Stage_ID)=(Vocabulary:Developmental_Stage:ID)/$S/RID,Title,Species,Stage_ID,Stage_Name:=T:Name,Stage_Detail,Assay_Type,Strain,Wild_Type,Sex,Passage,Phenotype,Cell_Line,Parent_Specimen,Upload_Notes,Preparation,Fixation,Embedding,Internal_ID,Principal_Investigator,Consortium,Release_Date,RCT,RMT,GUDMAP2_Accession_ID?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Specimen_Anatomical_Source",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Specimen_RID)=(Gene_Expression:Specimen:RID)/(RID)=(Gene_Expression:Specimen_Tissue:Specimen_RID)/RID,Specimen_RID,Tissue,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Specimen_Cell_Types",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Specimen_RID)=(Gene_Expression:Specimen:RID)/(RID)=(Gene_Expression:Specimen_Cell_Type:Specimen)/RID,Specimen_RID:=Specimen,Cell_Type,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Single Cell Metrics",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:Single_Cell_Metrics:Replicate_RID)/RID,Study_RID,Experiment_RID,Replicate_RID,Reads_%28Millions%29,Reads%2FCell,Detected_Gene_Count,Genes%2FCell,UMI%2FCell,Estimated_Cell_Count,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "File",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/RID,Study_RID,Experiment_RID,Replicate_RID,Caption,File_Type,File_Name,URI,File_size,MD5,GEO_Archival_URL,dbGaP_Accession_ID,Processed,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT,Legacy_File_RID,GUDMAP_NGF_OID,GUDMAP_NGS_OID?limit=none"
+        }
+      },
+      {
+        "processor": "fetch",
+        "processor_params": {
+          "output_path": "assets/Study/{Study_RID}/Experiment/{Experiment_RID}/Replicate/{Replicate_RID}",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/url:=URI,length:=File_size,filename:=File_Name,md5:=MD5,Study_RID,Experiment_RID,Replicate_RID?limit=none"
+        }
+      }
+    ]
+  }
+}
--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
@@ -2,4 +2,39 @@ profiles {
  standard {
    includeConfig 'conf/biohpc.config'
  }
+  aws_ondemand {
+    includeConfig 'conf/aws_ondemand.config'
+  }
+  aws_spot {
+    includeConfig 'conf/aws_spot.config'
+  }
+}
+
+// Execution trace: one row per task with the listed fields, written to
+// pipeline_trace.txt.
+trace {
+  enabled = true
+  file = 'pipeline_trace.txt'
+  fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss'
+}
+
+// Timeline report written to timeline.html.
+timeline {
+  enabled = true
+  file = 'timeline.html'
+}
+	
+// Execution report written to report.html.
+report {
+  enabled = true
+  file = 'report.html'
+}
+
+// Nextflow Tower monitoring.
+// The access token is a secret and must not be committed: it is read from the
+// TOWER_ACCESS_TOKEN environment variable of the launching shell instead.
+// NOTE(review): the token that was previously hard-coded here is exposed in
+// the repository history and should be revoked.
+tower {
+  accessToken = System.getenv('TOWER_ACCESS_TOKEN')
+  enabled = true
 }
+
+// Pipeline metadata: entry script, version label and the minimum Nextflow
+// version required to run it.
+manifest {
+  // NOTE(review): the README badge links use .../BICF/gudmap_rbk/rna-seq - confirm this URL is correct
+  homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq'
+  description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.'
+  mainScript = 'rna-seq.nf'
+  version = 'v0.0.1_indev'
+  nextflowVersion = '>=19.09.0'
+}
\ No newline at end of file
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
 #!/usr/bin/env nextflow

 // Define input variables
-params.deriva = "/project/BICF/BICF_Core/shared/gudmap/cookies/deriva-cookies.txt"
-params.bdbag = "${baseDir}/../test_data/Study_Q-Y4H0.zip"
+// params.deriva: deriva credential.json file (staged into the getBag process)
+// params.bdbag:  deriva cookies.txt file used by bdbag (staged into the getData process)
+// params.repRID: RID of the replicate to process
+// NOTE(review): the CI jobs pass ./test_data/auth/credential.json and
+// ./test_data/auth/cookies.txt - confirm whether these defaults should also
+// point into test_data/auth/
+params.deriva = "${baseDir}/../test_data/credential.json"
+params.bdbag = "${baseDir}/../test_data/cookies.txt"
+//params.repRID = "16-1ZX4"
+params.repRID = "Q-Y5JA"

 params.outDir = "${baseDir}/../output"

 // Parse input variables
-deriva = file(params.deriva, checkIfExists: 'true')
+// Both credential files are wrapped in channels; the run exits with status 1
+// when a channel turns out to be empty.
+deriva = Channel
+  .fromPath(params.deriva)
+  .ifEmpty { exit 1, "deriva credential file not found: ${params.deriva}" }
 bdbag = Channel
  .fromPath(params.bdbag)
-  .ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" }
+  .ifEmpty { exit 1, "deriva cookie file for bdbag not found: ${params.bdbag}" }
+
+// The replicate RID is copied into one channel per process that consumes it
+Channel.from(params.repRID)
+  .into {
+    repRID_getBag
+    repRID_getData
+    repRID_trimData
+  }

 outDir = params.outDir
 logsDir = "${outDir}/Logs"

+// Define fixed files
+// Export configuration handed to deriva-download-cli in the getBag process
+derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json")
+
 /*
- * splitData: split bdbag files by replicate so fetch can occure in parallel, and rename files to replicate rid
+ * getBag: get bagit file from consortium
 */
-process splitData {
-  tag "${bdbag.baseName}"
+process getBag {
+  // Downloads the bag archive for one replicate RID with deriva-download-cli.
+  // Inputs:  replicate RID, deriva credential file, export configuration JSON
+  // Outputs: Replicate_*.zip (channel bagit) and the <RID>.getBag.err log,
+  //          which is published to ${logsDir}/getBag
+  // NOTE(review): the script links the credential into ~/.deriva/ - that
+  // directory presumably has to exist already (the CI before_script creates it)
  executor 'local'
-  publishDir "${logsDir}/splitData", mode: 'symlink', pattern: "${bdbag.baseName}.splitData.err"
+  tag "${repRID_getBag}"
+  publishDir "${logsDir}/getBag", mode: 'symlink', pattern: "${repRID_getBag}.getBag.err"

  input:
-    file bdbag
-    path cookies, stageAs: 'cookies.txt' from deriva
+    val repRID_getBag
+    path credential, stageAs: 'credential.json' from deriva
+    path derivaConfig

  output:
-    file("Replicate_*.zip") into bdbagSplit mode flatten
-    file("${bdbag.baseName}/data/File.csv") into fileMeta
-    file("${bdbag.baseName}/data/Experiment Settings.csv") into experimentSettingsMeta
-    file("${bdbag.baseName}/data/Experiment.csv") into experimentMeta
-    file ("${bdbag.baseName}.splitData.err")
+    path ("Replicate_*.zip") into bagit
+    file ("${repRID_getBag}.getBag.err")

  script:
    """
-    hostname >> ${bdbag.baseName}.splitData.err
-    ulimit -a >> ${bdbag.baseName}.splitData.err
-    ln -sf `readlink -e cookies.txt` ~/.bdbag/deriva-cookies.txt 2>>${bdbag.baseName}.splitData.err
-    echo "LOG: deriva cookie linked" >> ${bdbag.baseName}.splitData.err 
-    study=`echo "${bdbag}" | cut -d '.' -f1` 2>>${bdbag.baseName}.splitData.err
-    echo "LOG: \${study}" >> ${bdbag.baseName}.splitData.err
-    unzip ${bdbag} 2>>${bdbag.baseName}.splitData.err
-    echo "LOG: bdgag unzipped" >> ${bdbag.baseName}.splitData.err
-    python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study} 2>>${bdbag.baseName}.splitData.err
-    echo "LOG: fetch file filtered for only .fastq.gz" >> ${bdbag.baseName}.splitData.err
-    python3 ${baseDir}/scripts/splitFetch.py --fetchFile \${study} 2>>${bdbag.baseName}.splitData.err
-    echo "LOG: fetch file split by replicates" >> ${bdbag.baseName}.splitData.err
-    sh ${baseDir}/scripts/splitBag.sh \${study} 2>>${bdbag.baseName}.splitData.err
-    echo "LOG: bag recreated with replicate split fetch file" >> ${bdbag.baseName}.splitData.err
+    hostname >>${repRID_getBag}.getBag.err
+    ulimit -a >>${repRID_getBag}.getBag.err
+    export https_proxy=\${http_proxy}
+    ln -sf `readlink -e credential.json` ~/.deriva/credential.json 2>>${repRID_getBag}.getBag.err
+    echo "LOG: deriva credentials linked" >>${repRID_getBag}.getBag.err
+    deriva-download-cli dev.gudmap.org --catalog 2 ${derivaConfig} . rid=${repRID_getBag} 2>>${repRID_getBag}.getBag.err
    """
 }

@@ -57,26 +61,36 @@ process splitData {
 * getData: fetch study files from consortium with downloaded bdbag.zip
 */
 process getData {
-  tag "${rep.baseName}"
-  publishDir "${logsDir}/getData", mode: 'symlink', pattern: "${rep.baseName}.getData.err"
+  // Unzips the downloaded bag and fetches the replicate's fastq files with
+  // bdbagFetch.sh.
+  // Inputs:  replicate RID, deriva cookie file, bag archive from getBag
+  // Outputs: *.R{1,2}.fastq.gz (channel fastqs), the File / Experiment
+  //          Settings / Experiment csv files from the bag, and the
+  //          <RID>.getData.err log published to ${logsDir}/getData
+  // The executor directive belongs with the other directives; it is not an
+  // input declaration, so it must come before the input: block.
+  executor 'local'
+  tag "${repRID_getData}"
+  publishDir "${logsDir}/getData", mode: 'symlink', pattern: "${repRID_getData}.getData.err"

  input:
-    each rep from bdbagSplit
+    val repRID_getData
+    path cookies, stageAs: 'deriva-cookies.txt' from bdbag
+    path bagit

  output:
-    set val ("${rep.baseName}"), file ("*.R{1,2}.fastq.gz") into trimming
+    path ("*.R{1,2}.fastq.gz") into fastqs
+    file("**/File.csv") into fileMeta
+    file("**/Experiment Settings.csv") into experimentSettingsMeta
+    file("**/Experiment.csv") into experimentMeta
+    file ("${repRID_getData}.getData.err")
+

  script:
    """
-    hostname >>${rep.baseName}.getData.err
-    ulimit -a >>${rep.baseName}.getData.err
+    hostname >>${repRID_getData}.getData.err
+    ulimit -a >>${repRID_getData}.getData.err
    export https_proxy=\${http_proxy}
-    replicate=\$(basename "${rep}" | cut -d '.' -f1)
-    echo "LOG: \${replicate}" >>${rep.baseName}.getData.err
-    unzip ${rep} 2>>${rep.baseName}.getData.err
-    echo "LOG: replicate bdbag unzipped" >>${rep.baseName}.getData.err
-    sh ${baseDir}/scripts/bdbagFetch.sh \${replicate} 2>>${rep.baseName}.getData.err
-    echo "LOG: replicate bdbag fetched" >>${rep.baseName}.getData.err
+    ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt 2>>${repRID_getData}.getData.err
+    echo "LOG: deriva cookie linked" >>${repRID_getData}.getData.err
+    replicate=\$(basename "${bagit}" | cut -d '.' -f1)
+    echo "LOG: \${replicate}" >>${repRID_getData}.getData.err
+    unzip ${bagit} 2>>${repRID_getData}.getData.err
+    echo "LOG: replicate bdbag unzipped" >>${repRID_getData}.getData.err
+    sh ${baseDir}/scripts/bdbagFetch.sh \${replicate} ${repRID_getData} 2>>${repRID_getData}.getData.err
+    echo "LOG: replicate bdbag fetched" >>${repRID_getData}.getData.err
    """
 }

@@ -84,19 +98,34 @@ process getData {
 * trimData: trims any adapter or non-host sequences from the data
 */
 process trimData {
-  tag "trim-${repID}"
-  publishDir "${outDir}/tempOut/trimmed", mode: "symlink", pattern: "*_val_{1,2}.fq.gz"
-  publishDir "${logsDir}/trimData", mode: 'symlink', pattern: "\${rep}.trimData.*"
+  // Runs trim_galore on the fetched fastq files, single-end when only one
+  // file was staged and paired-end otherwise, using at most 8 threads.
+  // Inputs:  replicate RID, fastq file(s) from getData
+  // Outputs: trimmed *.fq.gz (channel fastqs_trimmed) plus the
+  //          <RID>.trimData.log / <RID>.trimData.err files
+  // The publishDir pattern interpolates the Nextflow variable; with the
+  // dollar sign escaped the glob contained a literal '$' and never matched
+  // the log files.
+  tag "${repRID_trimData}"
+  publishDir "${logsDir}/trimData", mode: 'symlink', pattern: "${repRID_trimData}.trimData.*"

  input:
-    set repID, reads from trimming
+    val repRID_trimData
+    file(fastq) from fastqs

  output:
-    path ("*_val_{1,2}.fq.gz", type: 'file', maxDepth: '0')
+    path ("*.fq.gz") into fastqs_trimmed
+    // NOTE(review): 'ends' is only assigned as a shell variable inside the
+    // script below, not as a Nextflow variable - confirm this output resolves
+    val ends
+    file ("${repRID_trimData}.trimData.log")
+    file ("${repRID_trimData}.trimData.err")

  script:
    """
-    rep=`echo ${repID} | cut -f2- -d '_'`;
-    trim_galore --gzip --max_n 1 --paired --basename \${rep} -j `nproc` ${reads[0]} ${reads[1]} 1>>\${rep}.trimData.log 2>>\${rep}.trimData.err;
+    if [ `nproc` -gt 8 ]
+    then
+      ncore=8
+    else
+      ncore=`nproc`
+    fi
+    if [ '${fastq[1]}' == 'null' ]
+    then
+      ends='se'
+      trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID_trimData} -j \${ncore} ${fastq[0]} 1>>${repRID_trimData}.trimData.log 2>>${repRID_trimData}.trimData.err;
+    else
+      ends='pe'
+      trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID_trimData} -j \${ncore} ${fastq[0]} ${fastq[1]} 1>>${repRID_trimData}.trimData.log 2>>${repRID_trimData}.trimData.err;
+    fi
    """
-}
+}
\ No newline at end of file
--- a/workflow/scripts/bdbagFetch.sh
+++ b/workflow/scripts/bdbagFetch.sh
 #!/bin/bash

-bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz $1 &&
-for i in $(find */ -name "*.R*.fastq.gz"); do
-  mv ${i} .;
-done;
+# Usage: bdbagFetch.sh <bag directory> <replicate RID>
+#
+# Fetches the remote files of the bag in $1 whose filename ends in fastq.gz,
+# then moves every fetched *.R*.fastq.gz file into the current directory,
+# renamed to <replicate RID>.R<n>.fastq.gz.
+#
+# All expansions are quoted so paths containing whitespace or glob characters
+# are handled as single arguments. The loop reads find's output line by line
+# rather than word-splitting a command substitution. The pipeline invokes
+# this script with `sh`, so only POSIX constructs are used.
+bdbag --resolve-fetch all --fetch-filter 'filename$*fastq.gz' "$1"
+find */ -name "*.R*.fastq.gz" | while IFS= read -r i; do
+    # Keep only the ".R<n>.fastq.gz" suffix of the fetched file's basename
+    suffix=$(printf '%s\n' "${i##*/}" | grep -o "\.R.\.fastq\.gz")
+    mv "${i}" "./${2}${suffix}"
+done
\ No newline at end of file
--- a/workflow/scripts/splitBag.sh
+++ b/workflow/scripts/splitBag.sh
-#!/bin
-
-for i in $(ls -d Replicate_*)
-do
-rsync -r $1/ ${i} --exclude=fetch.txt
-zip -r ${i}.zip ${i}
-done
\ No newline at end of file
--- a/workflow/tests/test_getBag.py
+++ b/workflow/tests/test_getBag.py
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+                '/../../'
+
+@pytest.mark.getBag
+def test_getBag():
+    """The replicate bag archive exists two directories above this test file."""
+    bag_archive = os.path.join(test_output_path, 'Replicate_16-1ZX4.zip')
+    assert os.path.exists(bag_archive)
\ No newline at end of file
--- a/workflow/tests/test_getData.py
+++ b/workflow/tests/test_getData.py
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+                '/../../'
+
+@pytest.mark.getData
+def test_getData():
+    """The unzipped bag and the fetched R1 fastq exist under test_output_path."""
+    expected_paths = (
+        'Replicate_16-1ZX4/bagit.txt',
+        '16-1ZX4.R1.fastq.gz',
+    )
+    for relative_path in expected_paths:
+        assert os.path.exists(os.path.join(test_output_path, relative_path))
\ No newline at end of file
--- a/workflow/tests/test_trimData.py
+++ b/workflow/tests/test_trimData.py
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+                '/../../'
+
+@pytest.mark.trimData
+def test_trimData():
+    """The trimmed single-end and paired-end fastq files exist under test_output_path."""
+    expected_files = (
+        '16-1ZX4_trimmed.fq.gz',
+        'Q-Y5JA_R1_val_1.fq.gz',
+        'Q-Y5JA_R2_val_2.fq.gz',
+    )
+    for file_name in expected_files:
+        assert os.path.exists(os.path.join(test_output_path, file_name))
\ No newline at end of file