Skip to content
Snippets Groups Projects
Commit 47fc8ba7 authored by Gervaise Henry's avatar Gervaise Henry :cowboy:
Browse files

Merge branch 'develop' into '2-process_createManifest'

Develop

See merge request !11
parents 0bd693e4 947e7543
Branches
Tags
3 merge requests!37v0.0.1,!14Resolve "process_createManifest",!11Develop
Pipeline #5710 passed with stages
in 22 minutes and 32 seconds
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions # Created by https://www.gitignore.io/api/r,perl,linux,macos,python,windows,microsoftoffice
*.so # Edit at https://www.gitignore.io/?templates=r,perl,linux,macos,python,windows,microsoftoffice
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Created by https://www.gitignore.io/api/r,perl,macos,linux,python,windows
# Edit at https://www.gitignore.io/?templates=r,perl,macos,linux,python,windows
### Linux ### ### Linux ###
*~ *~
...@@ -71,6 +45,27 @@ Network Trash Folder ...@@ -71,6 +45,27 @@ Network Trash Folder
Temporary Items Temporary Items
.apdisk .apdisk
### MicrosoftOffice ###
*.tmp
# Word temporary
~$*.doc*
# Word Auto Backup File
Backup of *.doc*
# Excel temporary
~$*.xls*
# Excel Backup File
*.xlk
# PowerPoint temporary
~$*.ppt*
# Visio autosave temporary files
*.~vsd*
### Perl ### ### Perl ###
!Build/ !Build/
.last_cover_stats .last_cover_stats
...@@ -165,15 +160,6 @@ coverage.xml ...@@ -165,15 +160,6 @@ coverage.xml
*.mo *.mo
*.pot *.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff: # Scrapy stuff:
.scrapy .scrapy
...@@ -183,31 +169,22 @@ docs/_build/ ...@@ -183,31 +169,22 @@ docs/_build/
# PyBuilder # PyBuilder
target/ target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv # pyenv
.python-version .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# celery beat schedule file # celery beat schedule file
celerybeat-schedule celerybeat-schedule
# SageMath parsed files # SageMath parsed files
*.sage.py *.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings # Spyder project settings
.spyderproject .spyderproject
.spyproject .spyproject
...@@ -215,6 +192,11 @@ venv.bak/ ...@@ -215,6 +192,11 @@ venv.bak/
# Rope project settings # Rope project settings
.ropeproject .ropeproject
# Mr Developer
.mr.developer.cfg
.project
.pydevproject
# mkdocs documentation # mkdocs documentation
/site /site
...@@ -226,9 +208,6 @@ dmypy.json ...@@ -226,9 +208,6 @@ dmypy.json
# Pyre type checker # Pyre type checker
.pyre/ .pyre/
### Python Patch ###
.venv/
### R ### ### R ###
# History files # History files
.Rhistory .Rhistory
...@@ -237,6 +216,9 @@ dmypy.json ...@@ -237,6 +216,9 @@ dmypy.json
# Session Data files # Session Data files
.RData .RData
# User-specific files
.Ruserdata
# Example code in package build process # Example code in package build process
*-Ex.R *-Ex.R
...@@ -257,7 +239,7 @@ vignettes/*.pdf ...@@ -257,7 +239,7 @@ vignettes/*.pdf
.httr-oauth .httr-oauth
# knitr and R markdown default cache directories # knitr and R markdown default cache directories
/*_cache/ *_cache/
/cache/ /cache/
# Temporary files created by R markdown # Temporary files created by R markdown
...@@ -271,6 +253,7 @@ vignettes/*.pdf ...@@ -271,6 +253,7 @@ vignettes/*.pdf
### Windows ### ### Windows ###
# Windows thumbnail cache files # Windows thumbnail cache files
Thumbs.db Thumbs.db
Thumbs.db:encryptable
ehthumbs.db ehthumbs.db
ehthumbs_vista.db ehthumbs_vista.db
...@@ -293,7 +276,7 @@ $RECYCLE.BIN/ ...@@ -293,7 +276,7 @@ $RECYCLE.BIN/
# Windows shortcuts # Windows shortcuts
*.lnk *.lnk
# End of https://www.gitignore.io/api/r,perl,macos,linux,python,windows # End of https://www.gitignore.io/api/r,perl,linux,macos,python,windows,microsoftoffice
# nextflow analysis folders/files # nextflow analysis folders/files
/test_data/* /test_data/*
......
# GitLab CI for the GUDMAP/RBK RNA-seq pipeline.
# Unit jobs exercise individual pipeline steps inside Singularity containers;
# integration jobs run the full nextflow workflow end-to-end.
# NOTE(review): indentation below restores standard GitLab CI nesting, which
# was flattened in the extracted text (bare `stage:`/`script:` keys would
# otherwise be duplicate top-level keys and invalid YAML).

before_script:
  # BioHPC environment modules and pytest plugins required by all jobs.
  - module add python/3.6.1-2-anaconda
  - pip install --user pytest-pythonpath==0.7.1 pytest-cov==2.5.1
  - module load singularity/3.0.2
  - module load nextflow/19.09.0
  # Link shared test data and prepare deriva/bdbag credential directories.
  - ln -sfn /project/BICF/BICF_Core/shared/gudmap/test_data/* ./test_data/
  - mkdir -p ~/.deriva
  - mkdir -p ~/.bdbag

stages:
  - unit
  - integration

# Unit test: download the bagit bag for one replicate RID.
getBag:
  stage: unit
  script:
    - ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json
    - singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' deriva-download-cli dev.gudmap.org --catalog 2 ./workflow/conf/replicate_export_config.json . rid=16-1ZX4
    - pytest -m getBag

# Unit test: fetch fastq data referenced by a downloaded bag.
getData:
  stage: unit
  script:
    - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt
    - unzip ./test_data/bagit/Replicate_16-1ZX4
    - singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' sh ./workflow/scripts/bdbagFetch.sh Replicate_16-1ZX4 16-1ZX4
    - pytest -m getData

# Unit test: adapter/quality trimming for single- and paired-end fastqs.
trimData:
  stage: unit
  script:
    - if [ `nproc` -gt 8 ]; then ncore=8; else ncore=`nproc`; fi
    - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --basename 16-1ZX4 -j ${ncore} ./test_data/fastq/16-1ZX4.R1.fastq.gz
    - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename Q-Y5JA -j ${ncore} ./test_data/fastq/Q-Y5JA.R1.fastq.gz ./test_data/fastq/Q-Y5JA.R2.fastq.gz
    - pytest -m trimData

# Integration: full workflow against a single-end replicate.
integration_se:
  stage: integration
  script:
    - nextflow run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 16-1ZX4

# Integration: full workflow against a paired-end replicate.
integration_pe:
  stage: integration
  script:
    - nextflow run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5JA
\ No newline at end of file
...@@ -2,5 +2,6 @@ ...@@ -2,5 +2,6 @@
**User Facing** **User Facing**
**Background** **Background**
* Implementation of CI
*Known Bugs* *Known Bugs*
<!--
|*master*|*develop*| |*master*|*develop*|
|:-:|:-:| |:-:|:-:|
|[![Build Status](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/badges/master/build.svg)](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/commits/master)|[![Build Status](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/badges/develop/build.svg)](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/commits/develop)| |[![Build Status](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/badges/master/build.svg)](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/commits/master)|[![Build Status](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/badges/develop/build.svg)](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/commits/develop)|
<!--
[![DOI]()]() [![DOI]()]()
--> -->
GUDMAP/RBK RNA-Seq Pipeline GUDMAP/RBK RNA-Seq Pipeline
...@@ -10,16 +9,19 @@ GUDMAP/RBK RNA-Seq Pipeline ...@@ -10,16 +9,19 @@ GUDMAP/RBK RNA-Seq Pipeline
Introduction Introduction
------------ ------------
This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.
To Run: To Run:
------- -------
* Available parameters: * Available parameters:
* *--deriva* active ```credential.json``` file from [deriva-auth](https://github.com/informatics-isi-edu/gudmap-rbk/wiki/Uploading-files-via-Deriva-client-tools#from-a-remote-server)
* *--bdbag* active ```cookies.txt``` file from [deriva-auth](https://github.com/informatics-isi-edu/gudmap-rbk/wiki/Uploading-files-via-Deriva-client-tools#from-a-remote-server)
* *--repRID* mRNA-seq replicate RID\
note: once deriva-auth is run and authenticated, the two files above are saved in ```~/.deriva/``` (see official documents from [deriva](https://github.com/informatics-isi-edu/deriva-client#installer-packages-for-windows-and-macosx) on the lifetime of the credentials)
* FULL EXAMPLE: * FULL EXAMPLE:
``` ```
nextflow run workflow/rna-seq.nf nextflow run workflow/rna-seq.nf --deriva ./data/credential.json --bdbag ./data/cookies.txt --repRID Q-Y5JA
``` ```
* Design example:
......
// Nextflow configuration for AWS Batch execution — on-demand (high-priority) queue.

// Intermediate work files are staged in S3.
workDir = 's3://gudmap.rbk/work'
// Server-side encryption for objects written to S3.
aws.client.storageEncryption = 'AES256'

aws {
  region = 'us-east-2'
  batch {
    // Path to the AWS CLI inside the Batch compute-environment AMI.
    cliPath = '/home/ec2-user/miniconda/bin/aws'
  }
}

process {
  executor = 'awsbatch'
  // High-priority (on-demand) Batch job queue.
  queue = 'highpriority-3278a8b0-1fc8-11ea-b1ac-021e2396e2cc'
  // Defaults for all processes; overridden per-process below.
  cpus = 1
  memory = '1 GB'
  withName:getBag {
    container = 'bicf/gudmaprbkfilexfer:1.3'
  }
  withName:getData {
    container = 'bicf/gudmaprbkfilexfer:1.3'
  }
  withName:trimData {
    container = 'bicf/trimgalore:1.1'
    // trim_galore is multi-threaded; give it more cores.
    cpus = 15
  }
}
\ No newline at end of file
// Nextflow configuration for AWS Batch execution — default (spot) queue.

// Intermediate work files are staged in S3.
workDir = 's3://gudmap.rbk/work'
// Server-side encryption for objects written to S3.
aws.client.storageEncryption = 'AES256'

aws {
  region = 'us-east-2'
  batch {
    // Path to the AWS CLI inside the Batch compute-environment AMI.
    cliPath = '/home/ec2-user/miniconda/bin/aws'
  }
}

process {
  executor = 'awsbatch'
  // Default Batch job queue (cheaper than the high-priority queue in
  // aws_ondemand.config, which this file otherwise mirrors).
  queue = 'default-3278a8b0-1fc8-11ea-b1ac-021e2396e2cc'
  // Defaults for all processes; overridden per-process below.
  cpus = 1
  memory = '1 GB'
  withName:getBag {
    container = 'bicf/gudmaprbkfilexfer:1.3'
  }
  withName:getData {
    container = 'bicf/gudmaprbkfilexfer:1.3'
  }
  withName:trimData {
    container = 'bicf/trimgalore:1.1'
    // trim_galore is multi-threaded; give it more cores.
    cpus = 15
  }
}
...@@ -3,41 +3,20 @@ process { ...@@ -3,41 +3,20 @@ process {
queue = 'super' queue = 'super'
clusterOptions = '--hold' clusterOptions = '--hold'
// Process specific configuration withName:getBag {
withName:splitData { executor = 'local'
container = 'docker://bicf/gudmaprbkfilexfer:1.0' container = 'docker://bicf/gudmaprbkfilexfer:1.3'
} }
withName:getData { withName:getData {
container = 'docker://bicf/gudmaprbkfilexfer:1.0' executor = 'local'
container = 'docker://bicf/gudmaprbkfilexfer:1.3'
} }
withName:trimData { withName:trimData {
container = 'docker://bicf/trimgalore:1.0' container = 'docker://bicf/trimgalore:1.1'
queue = '256GB,256GBv1,384GB' queue = '256GB,256GBv1,384GB'
} }
} }
trace {
enabled = true
file = 'pipeline_trace.txt'
fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss'
}
timeline {
enabled = true
file = 'timeline.html'
}
report {
enabled = true
file = 'report.html'
}
tower {
accessToken = '3ade8f325d4855434b49aa387421a44c63e3360f'
enabled = true
}
singularity { singularity {
enabled = true enabled = true
cacheDir = '/project/shared/bicf_workflow_ref/singularity_images/' cacheDir = '/project/shared/bicf_workflow_ref/singularity_images/'
...@@ -47,4 +26,4 @@ env { ...@@ -47,4 +26,4 @@ env {
http_proxy = 'http://proxy.swmed.edu:3128' http_proxy = 'http://proxy.swmed.edu:3128'
https_proxy = 'http://proxy.swmed.edu:3128' https_proxy = 'http://proxy.swmed.edu:3128'
all_proxy = 'http://proxy.swmed.edu:3128' all_proxy = 'http://proxy.swmed.edu:3128'
} }
\ No newline at end of file
# Conda environment for the bdbag fetch step: pandas for metadata parsing,
# bdbag itself installed via pip (it is not packaged for conda).
# NOTE(review): the pip sub-list must be nested under `- pip:`; in the
# extracted text it sat at the same level, which parses `pip` as null.
name: bdbag
dependencies:
  - pandas=0.23.3=py36_0
  - pip:
      - bdbag==1.5.5
{
"bag": {
"bag_name": "Replicate_{rid}",
"bag_algorithms": [
"md5"
],
"bag_archiver": "zip"
},
"catalog": {
"query_processors": [
{
"processor": "csv",
"processor_params": {
"output_path": "Study",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Study_RID)=(RNASeq:Study:RID)/Study_RID:=RID,Internal_ID,Title,Summary,Overall_Design,GEO_Series_Accession_ID,GEO_Platform_Accession_ID,Funding,Pubmed_ID,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Experiment",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment:RID)/Experiment_RID:=RID,Study_RID,Internal_ID,Name,Description,Experiment_Method,Sequencing_Type,Species,Specimen_Type,Molecule_Type,Pooled_Sample,Pool_Size,Markers,Cell_Count,Treatment_Protocol,Treatment_Protocol_Reference,Isolation_Protocol,Isolation_Protocol_Reference,Growth_Protocol,Growth_Protocol_Reference,Label_Protocol,Label_Protocol_Reference,Hybridization_Protocol,Hybridization_Protocol_Reference,Scan_Protocol,Scan_Protocol_Reference,Data_Processing,Value_Definition,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Experiment Antibodies",
"query_path": "/entity/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Antibodies:Experiment_RID)?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Experiment Custom Metadata",
"query_path": "/entity/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Custom_Metadata:Experiment_RID)?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Experiment Settings",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Settings:Experiment_RID)/RID,Experiment_RID,Alignment_Format,Aligner,Aligner_Version,Reference_Genome,Sequence_Trimming,Duplicate_Removal,Pre-alignment_Sequence_Removal,Junction_Reads,Library_Type,Protocol_Reference,Library_Selection,Quantification_Format,Quantification_Software,Expression_Metric,Transcriptome_Model,Sequencing_Platform,Paired_End,Read_Length,Has_Strand_Specific_Information,Used_Spike_Ins,Spike_Ins_Amount,Visualization_Format,Visualization_Software,Visualization_Version,Visualization_Setting,Notes,RCT,RMT?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Replicate",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/RID,Study_RID,Experiment_RID,Biological_Replicate_Number,Technical_Replicate_Number,Specimen_RID,Collection_Date,Mapped_Reads,GEO_Sample_Accession_ID,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Specimen",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/S:=(Specimen_RID)=(Gene_Expression:Specimen:RID)/T:=left(Stage_ID)=(Vocabulary:Developmental_Stage:ID)/$S/RID,Title,Species,Stage_ID,Stage_Name:=T:Name,Stage_Detail,Assay_Type,Strain,Wild_Type,Sex,Passage,Phenotype,Cell_Line,Parent_Specimen,Upload_Notes,Preparation,Fixation,Embedding,Internal_ID,Principal_Investigator,Consortium,Release_Date,RCT,RMT,GUDMAP2_Accession_ID?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Specimen_Anatomical_Source",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Specimen_RID)=(Gene_Expression:Specimen:RID)/(RID)=(Gene_Expression:Specimen_Tissue:Specimen_RID)/RID,Specimen_RID,Tissue,RCT,RMT?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Specimen_Cell_Types",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Specimen_RID)=(Gene_Expression:Specimen:RID)/(RID)=(Gene_Expression:Specimen_Cell_Type:Specimen)/RID,Specimen_RID:=Specimen,Cell_Type,RCT,RMT?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Single Cell Metrics",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:Single_Cell_Metrics:Replicate_RID)/RID,Study_RID,Experiment_RID,Replicate_RID,Reads_%28Millions%29,Reads%2FCell,Detected_Gene_Count,Genes%2FCell,UMI%2FCell,Estimated_Cell_Count,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "File",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/RID,Study_RID,Experiment_RID,Replicate_RID,Caption,File_Type,File_Name,URI,File_size,MD5,GEO_Archival_URL,dbGaP_Accession_ID,Processed,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT,Legacy_File_RID,GUDMAP_NGF_OID,GUDMAP_NGS_OID?limit=none"
}
},
{
"processor": "fetch",
"processor_params": {
"output_path": "assets/Study/{Study_RID}/Experiment/{Experiment_RID}/Replicate/{Replicate_RID}",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/url:=URI,length:=File_size,filename:=File_Name,md5:=MD5,Study_RID,Experiment_RID,Replicate_RID?limit=none"
}
}
]
}
}
...@@ -2,4 +2,39 @@ profiles { ...@@ -2,4 +2,39 @@ profiles {
standard { standard {
includeConfig 'conf/biohpc.config' includeConfig 'conf/biohpc.config'
} }
aws_ondemand {
includeConfig 'conf/aws_ondemand.config'
}
aws_spot {
includeConfig 'conf/aws_spot.config'
}
}
trace {
enabled = true
file = 'pipeline_trace.txt'
fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss'
}
timeline {
enabled = true
file = 'timeline.html'
}
report {
enabled = true
file = 'report.html'
}
tower {
accessToken = '3ade8f325d4855434b49aa387421a44c63e3360f'
enabled = true
} }
manifest {
homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq'
description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.'
mainScript = 'rna-seq.nf'
version = 'v0.0.1_indev'
nextflowVersion = '>=19.09.0'
}
\ No newline at end of file
#!/usr/bin/env nextflow #!/usr/bin/env nextflow
// Define input variables // Define input variables
params.deriva = "/project/BICF/BICF_Core/shared/gudmap/cookies/deriva-cookies.txt" params.deriva = "${baseDir}/../test_data/credential.json"
params.bdbag = "${baseDir}/../test_data/Study_Q-Y4H0.zip" params.bdbag = "${baseDir}/../test_data/cookies.txt"
//params.repRID = "16-1ZX4"
params.repRID = "Q-Y5JA"
params.outDir = "${baseDir}/../output" params.outDir = "${baseDir}/../output"
// Parse input variables // Parse input variables
deriva = file(params.deriva, checkIfExists: 'true') deriva = Channel
.fromPath(params.deriva)
.ifEmpty { exit 1, "deriva credential file not found: ${params.deriva}" }
bdbag = Channel bdbag = Channel
.fromPath(params.bdbag) .fromPath(params.bdbag)
.ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" } .ifEmpty { exit 1, "deriva cookie file for bdbag not found: ${params.bdbag}" }
Channel.from(params.repRID)
.into {
repRID_getBag
repRID_getData
repRID_trimData
}
outDir = params.outDir outDir = params.outDir
logsDir = "${outDir}/Logs" logsDir = "${outDir}/Logs"
// Define fixed files
derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json")
/* /*
* splitData: split bdbag files by replicate so fetch can occure in parallel, and rename files to replicate rid * getData: get bagit file from consortium
*/ */
process splitData { process getBag {
tag "${bdbag.baseName}"
executor 'local' executor 'local'
publishDir "${logsDir}/splitData", mode: 'symlink', pattern: "${bdbag.baseName}.splitData.err" tag "${repRID_getBag}"
publishDir "${logsDir}/getBag", mode: 'symlink', pattern: "${repRID_getBag}.getBag.err"
input: input:
file bdbag val repRID_getBag
path cookies, stageAs: 'cookies.txt' from deriva path credential, stageAs: 'credential.json' from deriva
path derivaConfig
output: output:
file("Replicate_*.zip") into bdbagSplit mode flatten path ("Replicate_*.zip") into bagit
file("${bdbag.baseName}/data/File.csv") into fileMeta file ("${repRID_getBag}.getBag.err")
file("${bdbag.baseName}/data/Experiment Settings.csv") into experimentSettingsMeta
file("${bdbag.baseName}/data/Experiment.csv") into experimentMeta
file ("${bdbag.baseName}.splitData.err")
script: script:
""" """
hostname >> ${bdbag.baseName}.splitData.err hostname >>${repRID_getBag}.getBag.err
ulimit -a >> ${bdbag.baseName}.splitData.err ulimit -a >>${repRID_getBag}.getBag.err
ln -sf `readlink -e cookies.txt` ~/.bdbag/deriva-cookies.txt 2>>${bdbag.baseName}.splitData.err export https_proxy=\${http_proxy}
echo "LOG: deriva cookie linked" >> ${bdbag.baseName}.splitData.err ln -sf `readlink -e credential.json` ~/.deriva/credential.json 2>>${repRID_getBag}.getBag.err
study=`echo "${bdbag}" | cut -d '.' -f1` 2>>${bdbag.baseName}.splitData.err echo "LOG: deriva credentials linked" >>${repRID_getBag}.getBag.err
echo "LOG: \${study}" >> ${bdbag.baseName}.splitData.err deriva-download-cli dev.gudmap.org --catalog 2 ${derivaConfig} . rid=${repRID_getBag} 2>>${repRID_getBag}.getBag.err
unzip ${bdbag} 2>>${bdbag.baseName}.splitData.err
echo "LOG: bdgag unzipped" >> ${bdbag.baseName}.splitData.err
python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study} 2>>${bdbag.baseName}.splitData.err
echo "LOG: fetch file filtered for only .fastq.gz" >> ${bdbag.baseName}.splitData.err
python3 ${baseDir}/scripts/splitFetch.py --fetchFile \${study} 2>>${bdbag.baseName}.splitData.err
echo "LOG: fetch file split by replicates" >> ${bdbag.baseName}.splitData.err
sh ${baseDir}/scripts/splitBag.sh \${study} 2>>${bdbag.baseName}.splitData.err
echo "LOG: bag recreated with replicate split fetch file" >> ${bdbag.baseName}.splitData.err
""" """
} }
...@@ -57,26 +61,36 @@ process splitData { ...@@ -57,26 +61,36 @@ process splitData {
* getData: fetch study files from consortium with downloaded bdbag.zip * getData: fetch study files from consortium with downloaded bdbag.zip
*/ */
process getData { process getData {
tag "${rep.baseName}" tag "${repRID_getData}"
publishDir "${logsDir}/getData", mode: 'symlink', pattern: "${rep.baseName}.getData.err" publishDir "${logsDir}/getData", mode: 'symlink', pattern: "${repRID_getData}.getData.err"
input: input:
each rep from bdbagSplit val repRID_getData
executor 'local'
path cookies, stageAs: 'deriva-cookies.txt' from bdbag
path bagit
output: output:
set val ("${rep.baseName}"), file ("*.R{1,2}.fastq.gz") into trimming path ("*.R{1,2}.fastq.gz") into fastqs
file("**/File.csv") into fileMeta
file("**/Experiment Settings.csv") into experimentSettingsMeta
file("**/Experiment.csv") into experimentMeta
file ("${repRID_getData}.getData.err")
script: script:
""" """
hostname >>${rep.baseName}.getData.err hostname >>${repRID_getData}.getData.err
ulimit -a >>${rep.baseName}.getData.err ulimit -a >>${repRID_getData}.getData.err
export https_proxy=\${http_proxy} export https_proxy=\${http_proxy}
replicate=\$(basename "${rep}" | cut -d '.' -f1) ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt >>${repRID_getData}.getData.err
echo "LOG: \${replicate}" >>${rep.baseName}.getData.err echo "LOG: deriva cookie linked" >>${repRID_getData}.getData.err
unzip ${rep} 2>>${rep.baseName}.getData.err replicate=\$(basename "${bagit}" | cut -d '.' -f1)
echo "LOG: replicate bdbag unzipped" >>${rep.baseName}.getData.err echo "LOG: \${replicate}" >>${repRID_getData}.getData.err
sh ${baseDir}/scripts/bdbagFetch.sh \${replicate} 2>>${rep.baseName}.getData.err unzip ${bagit} 2>>${repRID_getData}.getData.err
echo "LOG: replicate bdbag fetched" >>${rep.baseName}.getData.err echo "LOG: replicate bdbag unzipped" >>${repRID_getData}.getData.err
sh ${baseDir}/scripts/bdbagFetch.sh \${replicate} ${repRID_getData} 2>>${repRID_getData}.getData.err
echo "LOG: replicate bdbag fetched" >>${repRID_getData}.getData.err
""" """
} }
...@@ -84,19 +98,34 @@ process getData { ...@@ -84,19 +98,34 @@ process getData {
* trimData: trims any adapter or non-host sequences from the data * trimData: trims any adapter or non-host sequences from the data
*/ */
process trimData { process trimData {
tag "trim-${repID}" tag "${repRID_trimData}"
publishDir "${outDir}/tempOut/trimmed", mode: "symlink", pattern: "*_val_{1,2}.fq.gz" publishDir "${logsDir}/trimData", mode: 'symlink', pattern: "\${repRID_trimData}.trimData.*"
publishDir "${logsDir}/trimData", mode: 'symlink', pattern: "\${rep}.trimData.*"
input: input:
set repID, reads from trimming val repRID_trimData
file(fastq) from fastqs
output: output:
path ("*_val_{1,2}.fq.gz", type: 'file', maxDepth: '0') path ("*.fq.gz") into fastqs_trimmed
val ends
file ("${repRID_trimData}.trimData.log")
file ("${repRID_trimData}.trimData.err")
script: script:
""" """
rep=`echo ${repID} | cut -f2- -d '_'`; if [ `nproc` -gt 8 ]
trim_galore --gzip --max_n 1 --paired --basename \${rep} -j `nproc` ${reads[0]} ${reads[1]} 1>>\${rep}.trimData.log 2>>\${rep}.trimData.err; then
ncore=8
else
ncore=`nproc`
fi
if [ '${fastq[1]}' == 'null' ]
then
ends='se'
trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID_trimData} -j \${ncore} ${fastq[0]} 1>>${repRID_trimData}.trimData.log 2>>${repRID_trimData}.trimData.err;
else
ends='pe'
trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID_trimData} -j \${ncore} ${fastq[0]} ${fastq[1]} 1>>${repRID_trimData}.trimData.log 2>>${repRID_trimData}.trimData.err;
fi
""" """
} }
\ No newline at end of file
#!/bin/bash #!/bin/bash
bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz $1 && bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz $1
for i in $(find */ -name "*.R*.fastq.gz"); do for i in $(find */ -name "*.R*.fastq.gz"); do
mv ${i} .; path=${2}$(echo ${i##*/} | grep -o "\.R.\.fastq\.gz");
done; mv ${i} ./${path}
done;
\ No newline at end of file
#!/bin/bash
# splitBag.sh: rebuild one bdbag zip per replicate.
# Usage: splitBag.sh <source_bag_dir>
# Copies the bag scaffolding from <source_bag_dir> into every Replicate_*
# directory (excluding fetch.txt, which is already split per replicate),
# then zips each directory into Replicate_*.zip.
# Fix: shebang was `#!/bin`, which is a directory, not an interpreter;
# variables are now quoted to survive paths with spaces.
for i in $(ls -d Replicate_*)
do
  rsync -r "$1"/ "${i}" --exclude=fetch.txt
  zip -r "${i}.zip" "${i}"
done
\ No newline at end of file
#!/usr/bin/env python3

import pytest
import pandas as pd
from io import StringIO
import os

# Artifacts are resolved relative to the repository root, two levels
# above this test file.
test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
    '/../../'


@pytest.mark.getBag
def test_getBag():
    """getBag should have produced a bagit zip for replicate 16-1ZX4."""
    bag_zip = os.path.join(test_output_path, 'Replicate_16-1ZX4.zip')
    assert os.path.exists(bag_zip)
\ No newline at end of file
#!/usr/bin/env python3

import pytest
import pandas as pd
from io import StringIO
import os

# Artifacts are resolved relative to the repository root, two levels
# above this test file.
test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
    '/../../'


@pytest.mark.getData
def test_getData():
    """getData should have unpacked the bag and fetched the fastq."""
    expected = [
        'Replicate_16-1ZX4/bagit.txt',
        '16-1ZX4.R1.fastq.gz',
    ]
    for relpath in expected:
        assert os.path.exists(os.path.join(test_output_path, relpath))
\ No newline at end of file
#!/usr/bin/env python3

import pytest
import pandas as pd
from io import StringIO
import os

# Artifacts are resolved relative to the repository root, two levels
# above this test file.
test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
    '/../../'


@pytest.mark.trimData
def test_trimData():
    """trimData should emit trimmed fastqs for both SE and PE replicates."""
    expected = [
        '16-1ZX4_trimmed.fq.gz',      # single-end output
        'Q-Y5JA_R1_val_1.fq.gz',      # paired-end R1 output
        'Q-Y5JA_R2_val_2.fq.gz',      # paired-end R2 output
    ]
    for relpath in expected:
        assert os.path.exists(os.path.join(test_output_path, relpath))
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment