diff --git a/.gitignore b/.gitignore index 8b4b1eadf6253fc94cefe75b485a051ed8f3d71e..9b75a201f875574a221bd0e4bf073e5a0d0db406 100644 --- a/.gitignore +++ b/.gitignore @@ -1,32 +1,6 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -*.egg-info/ -.installed.cfg -*.egg - -# PyInstaller -# Created by https://www.gitignore.io/api/r,perl,macos,linux,python,windows -# Edit at https://www.gitignore.io/?templates=r,perl,macos,linux,python,windows +# Created by https://www.gitignore.io/api/r,perl,linux,macos,python,windows,microsoftoffice +# Edit at https://www.gitignore.io/?templates=r,perl,linux,macos,python,windows,microsoftoffice ### Linux ### *~ @@ -71,6 +45,27 @@ Network Trash Folder Temporary Items .apdisk +### MicrosoftOffice ### +*.tmp + +# Word temporary +~$*.doc* + +# Word Auto Backup File +Backup of *.doc* + +# Excel temporary +~$*.xls* + +# Excel Backup File +*.xlk + +# PowerPoint temporary +~$*.ppt* + +# Visio autosave temporary files +*.~vsd* + ### Perl ### !Build/ .last_cover_stats @@ -165,15 +160,6 @@ coverage.xml *.mo *.pot -# Django stuff: -*.log -local_settings.py -db.sqlite3 - -# Flask stuff: -instance/ -.webassets-cache - # Scrapy stuff: .scrapy @@ -183,31 +169,22 @@ docs/_build/ # PyBuilder target/ -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - # pyenv .python-version +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. 
+#Pipfile.lock + # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - # Spyder project settings .spyderproject .spyproject @@ -215,6 +192,11 @@ venv.bak/ # Rope project settings .ropeproject +# Mr Developer +.mr.developer.cfg +.project +.pydevproject + # mkdocs documentation /site @@ -226,9 +208,6 @@ dmypy.json # Pyre type checker .pyre/ -### Python Patch ### -.venv/ - ### R ### # History files .Rhistory @@ -237,6 +216,9 @@ dmypy.json # Session Data files .RData +# User-specific files +.Ruserdata + # Example code in package build process *-Ex.R @@ -257,7 +239,7 @@ vignettes/*.pdf .httr-oauth # knitr and R markdown default cache directories -/*_cache/ +*_cache/ /cache/ # Temporary files created by R markdown @@ -271,6 +253,7 @@ vignettes/*.pdf ### Windows ### # Windows thumbnail cache files Thumbs.db +Thumbs.db:encryptable ehthumbs.db ehthumbs_vista.db @@ -293,7 +276,7 @@ $RECYCLE.BIN/ # Windows shortcuts *.lnk -# End of https://www.gitignore.io/api/r,perl,macos,linux,python,windows +# End of https://www.gitignore.io/api/r,perl,linux,macos,python,windows,microsoftoffice # nextflow analysis folders/files /test_data/* diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..f35caee3a7ce07870b96d60475903727a785d2f6 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,45 @@ +before_script: + - module add python/3.6.1-2-anaconda + - pip install --user pytest-pythonpath==0.7.1 pytest-cov==2.5.1 + - module load singularity/3.0.2 + - module load nextflow/19.09.0 + - ln -sfn /project/BICF/BICF_Core/shared/gudmap/test_data/* ./test_data/ + - mkdir -p ~/.deriva + - mkdir -p ~/.bdbag + +stages: + - unit + - integration + +getBag: + stage: unit + script: + - ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json + - singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' deriva-download-cli dev.gudmap.org 
--catalog 2 ./workflow/conf/replicate_export_config.json . rid=16-1ZX4 + - pytest -m getBag + +getData: + stage: unit + script: + - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt + - unzip ./test_data/bagit/Replicate_16-1ZX4 + - singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' sh ./workflow/scripts/bdbagFetch.sh Replicate_16-1ZX4 16-1ZX4 + - pytest -m getData + +trimData: + stage: unit + script: + - if [ `nproc` -gt 8 ]; then ncore=8; else ncore=`nproc`; fi + - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --basename 16-1ZX4 -j ${ncore} ./test_data/fastq/16-1ZX4.R1.fastq.gz + - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename Q-Y5JA -j ${ncore} ./test_data/fastq/Q-Y5JA.R1.fastq.gz ./test_data/fastq/Q-Y5JA.R2.fastq.gz + - pytest -m trimData + +integration_se: + stage: integration + script: + - nextflow run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 16-1ZX4 + +integration_pe: + stage: integration + script: + - nextflow run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5JA \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ff0911406a181f08e86e495c7530f87d6f43dae..2dec9e320556503056f41874817d032ad618f3af 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,5 +2,6 @@ **User Facing** **Background** +* Implementation of CI *Known Bugs* diff --git a/README.md b/README.md index b2a3fecf8142b15790274609eb38a35a318af72e..0cdfaa9aaea69df9d66493cf27d50f1f1653722f 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,7 @@ -<!-- |*master*|*develop*| |:-:|:-:| |[](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/commits/master)|[](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/commits/develop)| - +<!-- [![DOI]()]() --> GUDMAP/RBK RNA-Seq Pipeline @@ -10,16 +9,19 @@ 
GUDMAP/RBK RNA-Seq Pipeline Introduction ------------ - +This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub. To Run: ------- * Available parameters: + * *--deriva* active ```credential.json``` file from [deriva-auth](https://github.com/informatics-isi-edu/gudmap-rbk/wiki/Uploading-files-via-Deriva-client-tools#from-a-remote-server) + * *--bdbag* active ```cookies.txt``` file from [deriva-auth](https://github.com/informatics-isi-edu/gudmap-rbk/wiki/Uploading-files-via-Deriva-client-tools#from-a-remote-server) + * *--repRID* mRNA-seq replicate RID\ + note: once deriva-auth is run and authenticated, the two files above are saved in ```~/.deriva/``` (see official documents from [deriva](https://github.com/informatics-isi-edu/deriva-client#installer-packages-for-windows-and-macosx) on the lifetime of the credentials) * FULL EXAMPLE: ``` - nextflow run workflow/rna-seq.nf + nextflow run workflow/rna-seq.nf --deriva ./data/credential.json --bdbag ./data/cookies.txt --repRID Q-Y5JA ``` -* Design example: diff --git a/workflow/conf/aws_ondemand.config b/workflow/conf/aws_ondemand.config new file mode 100755 index 0000000000000000000000000000000000000000..1a14ebf3dc44d33198c8472a231796f980e312da --- /dev/null +++ b/workflow/conf/aws_ondemand.config @@ -0,0 +1,26 @@ +workDir = 's3://gudmap.rbk/work' +aws.client.storageEncryption = 'AES256' +aws { + region = 'us-east-2' + batch { + cliPath = '/home/ec2-user/miniconda/bin/aws' + } +} + +process { + executor = 'awsbatch' + queue = 'highpriority-3278a8b0-1fc8-11ea-b1ac-021e2396e2cc' + cpus = 1 + memory = '1 GB' + + withName:getBag { + container = 'bicf/gudmaprbkfilexfer:1.3' + } + withName:getData { + container = 'bicf/gudmaprbkfilexfer:1.3' + } + withName:trimData { + container = 'bicf/trimgalore:1.1' + cpus = 15 + } +} \ No newline at end of file diff --git a/workflow/conf/aws_spot.config b/workflow/conf/aws_spot.config new file mode 100755 
index 0000000000000000000000000000000000000000..b5239a2388616beb2936e41020e5c387f87118a6 --- /dev/null +++ b/workflow/conf/aws_spot.config @@ -0,0 +1,26 @@ +workDir = 's3://gudmap.rbk/work' +aws.client.storageEncryption = 'AES256' +aws { + region = 'us-east-2' + batch { + cliPath = '/home/ec2-user/miniconda/bin/aws' + } +} + +process { + executor = 'awsbatch' + queue = 'default-3278a8b0-1fc8-11ea-b1ac-021e2396e2cc' + cpus = 1 + memory = '1 GB' + + withName:getBag { + container = 'bicf/gudmaprbkfilexfer:1.3' + } + withName:getData { + container = 'bicf/gudmaprbkfilexfer:1.3' + } + withName:trimData { + container = 'bicf/trimgalore:1.1' + cpus = 15 + } +} diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index 2e90c4f157e816cfccb05ec5eb68b4417c651fd0..20da91a7f7a241e610708d7186d299d397958c41 100755 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -3,41 +3,20 @@ process { queue = 'super' clusterOptions = '--hold' - // Process specific configuration - withName:splitData { - container = 'docker://bicf/gudmaprbkfilexfer:1.0' + withName:getBag { + executor = 'local' + container = 'docker://bicf/gudmaprbkfilexfer:1.3' } withName:getData { - container = 'docker://bicf/gudmaprbkfilexfer:1.0' + executor = 'local' + container = 'docker://bicf/gudmaprbkfilexfer:1.3' } withName:trimData { - container = 'docker://bicf/trimgalore:1.0' + container = 'docker://bicf/trimgalore:1.1' queue = '256GB,256GBv1,384GB' } } - -trace { - enabled = true - file = 'pipeline_trace.txt' - fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss' -} - -timeline { - enabled = true - file = 'timeline.html' -} - -report { - enabled = true - file = 'report.html' -} - -tower { - accessToken = '3ade8f325d4855434b49aa387421a44c63e3360f' - enabled = true -} - singularity { enabled = true cacheDir = '/project/shared/bicf_workflow_ref/singularity_images/' @@ -47,4 +26,4 @@ env { http_proxy = 
'http://proxy.swmed.edu:3128' https_proxy = 'http://proxy.swmed.edu:3128' all_proxy = 'http://proxy.swmed.edu:3128' -} +} \ No newline at end of file diff --git a/workflow/conf/conda.env.bdbag.yml b/workflow/conf/conda.env.bdbag.yml deleted file mode 100644 index 33361d301b3fac561fa39807e3c740583e57d28b..0000000000000000000000000000000000000000 --- a/workflow/conf/conda.env.bdbag.yml +++ /dev/null @@ -1,5 +0,0 @@ -name: bdbag -dependencies: - - pandas=0.23.3=py36_0 - - pip: - - bdbag==1.5.5 diff --git a/workflow/conf/replicate_export_config.json b/workflow/conf/replicate_export_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ff17fa513c5bc130a2e2bdaf9aa41b070c99b290 --- /dev/null +++ b/workflow/conf/replicate_export_config.json @@ -0,0 +1,97 @@ +{ + "bag": { + "bag_name": "Replicate_{rid}", + "bag_algorithms": [ + "md5" + ], + "bag_archiver": "zip" + }, + "catalog": { + "query_processors": [ + { + "processor": "csv", + "processor_params": { + "output_path": "Study", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Study_RID)=(RNASeq:Study:RID)/Study_RID:=RID,Internal_ID,Title,Summary,Overall_Design,GEO_Series_Accession_ID,GEO_Platform_Accession_ID,Funding,Pubmed_ID,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Experiment", + "query_path": 
"/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment:RID)/Experiment_RID:=RID,Study_RID,Internal_ID,Name,Description,Experiment_Method,Sequencing_Type,Species,Specimen_Type,Molecule_Type,Pooled_Sample,Pool_Size,Markers,Cell_Count,Treatment_Protocol,Treatment_Protocol_Reference,Isolation_Protocol,Isolation_Protocol_Reference,Growth_Protocol,Growth_Protocol_Reference,Label_Protocol,Label_Protocol_Reference,Hybridization_Protocol,Hybridization_Protocol_Reference,Scan_Protocol,Scan_Protocol_Reference,Data_Processing,Value_Definition,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Experiment Antibodies", + "query_path": "/entity/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Antibodies:Experiment_RID)?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Experiment Custom Metadata", + "query_path": "/entity/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Custom_Metadata:Experiment_RID)?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Experiment Settings", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Settings:Experiment_RID)/RID,Experiment_RID,Alignment_Format,Aligner,Aligner_Version,Reference_Genome,Sequence_Trimming,Duplicate_Removal,Pre-alignment_Sequence_Removal,Junction_Reads,Library_Type,Protocol_Reference,Library_Selection,Quantification_Format,Quantification_Software,Expression_Metric,Transcriptome_Model,Sequencing_Platform,Paired_End,Read_Length,Has_Strand_Specific_Information,Used_Spike_Ins,Spike_Ins_Amount,Visualization_Format,Visualization_Software,Visualization_Version,Visualization_Setting,Notes,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Replicate", + "query_path": 
"/attribute/M:=RNASeq:Replicate/RID={rid}/RID,Study_RID,Experiment_RID,Biological_Replicate_Number,Technical_Replicate_Number,Specimen_RID,Collection_Date,Mapped_Reads,GEO_Sample_Accession_ID,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Specimen", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/S:=(Specimen_RID)=(Gene_Expression:Specimen:RID)/T:=left(Stage_ID)=(Vocabulary:Developmental_Stage:ID)/$S/RID,Title,Species,Stage_ID,Stage_Name:=T:Name,Stage_Detail,Assay_Type,Strain,Wild_Type,Sex,Passage,Phenotype,Cell_Line,Parent_Specimen,Upload_Notes,Preparation,Fixation,Embedding,Internal_ID,Principal_Investigator,Consortium,Release_Date,RCT,RMT,GUDMAP2_Accession_ID?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Specimen_Anatomical_Source", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Specimen_RID)=(Gene_Expression:Specimen:RID)/(RID)=(Gene_Expression:Specimen_Tissue:Specimen_RID)/RID,Specimen_RID,Tissue,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Specimen_Cell_Types", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Specimen_RID)=(Gene_Expression:Specimen:RID)/(RID)=(Gene_Expression:Specimen_Cell_Type:Specimen)/RID,Specimen_RID:=Specimen,Cell_Type,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Single Cell Metrics", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:Single_Cell_Metrics:Replicate_RID)/RID,Study_RID,Experiment_RID,Replicate_RID,Reads_%28Millions%29,Reads%2FCell,Detected_Gene_Count,Genes%2FCell,UMI%2FCell,Estimated_Cell_Count,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "File", + "query_path": 
"/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/RID,Study_RID,Experiment_RID,Replicate_RID,Caption,File_Type,File_Name,URI,File_size,MD5,GEO_Archival_URL,dbGaP_Accession_ID,Processed,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT,Legacy_File_RID,GUDMAP_NGF_OID,GUDMAP_NGS_OID?limit=none" + } + }, + { + "processor": "fetch", + "processor_params": { + "output_path": "assets/Study/{Study_RID}/Experiment/{Experiment_RID}/Replicate/{Replicate_RID}", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/url:=URI,length:=File_size,filename:=File_Name,md5:=MD5,Study_RID,Experiment_RID,Replicate_RID?limit=none" + } + } + ] + } +} diff --git a/workflow/nextflow.config b/workflow/nextflow.config index 30e47ea1aea37ed6550cc2944d69d26e69887489..37584999cf8152c9776b676d0b013f8aeb5e8709 100644 --- a/workflow/nextflow.config +++ b/workflow/nextflow.config @@ -2,4 +2,39 @@ profiles { standard { includeConfig 'conf/biohpc.config' } + aws_ondemand { + includeConfig 'conf/aws_ondemand.config' + } + aws_spot { + includeConfig 'conf/aws_spot.config' + } +} + +trace { + enabled = true + file = 'pipeline_trace.txt' + fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss' +} + +timeline { + enabled = true + file = 'timeline.html' +} + +report { + enabled = true + file = 'report.html' +} + +tower { + accessToken = System.getenv('TOWER_ACCESS_TOKEN') // secret comes from the environment, never from the repository + enabled = System.getenv('TOWER_ACCESS_TOKEN') != null // Tower reporting is on only when a token is supplied } + +manifest { + homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq' + description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.' 
+ mainScript = 'rna-seq.nf' + version = 'v0.0.1_indev' + nextflowVersion = '>=19.09.0' +} \ No newline at end of file diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index 035faa8ee8b3f2b95f298e4edc8f074a0b695587..c1d72efac168ca372471a35e4baa9ccfd96aae18 100755 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -1,55 +1,59 @@ #!/usr/bin/env nextflow // Define input variables -params.deriva = "/project/BICF/BICF_Core/shared/gudmap/cookies/deriva-cookies.txt" -params.bdbag = "${baseDir}/../test_data/Study_Q-Y4H0.zip" +params.deriva = "${baseDir}/../test_data/credential.json" +params.bdbag = "${baseDir}/../test_data/cookies.txt" +//params.repRID = "16-1ZX4" +params.repRID = "Q-Y5JA" params.outDir = "${baseDir}/../output" // Parse input variables -deriva = file(params.deriva, checkIfExists: 'true') +deriva = Channel + .fromPath(params.deriva) + .ifEmpty { exit 1, "deriva credential file not found: ${params.deriva}" } bdbag = Channel .fromPath(params.bdbag) - .ifEmpty { exit 1, "bdbag zip file not found: ${params.bdbag}" } + .ifEmpty { exit 1, "deriva cookie file for bdbag not found: ${params.bdbag}" } + +Channel.from(params.repRID) + .into { + repRID_getBag + repRID_getData + repRID_trimData + } outDir = params.outDir logsDir = "${outDir}/Logs" +// Define fixed files +derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json") + /* - * splitData: split bdbag files by replicate so fetch can occure in parallel, and rename files to replicate rid + * getBag: get bagit file from consortium */ -process splitData { - tag "${bdbag.baseName}" +process getBag { executor 'local' - publishDir "${logsDir}/splitData", mode: 'symlink', pattern: "${bdbag.baseName}.splitData.err" + tag "${repRID_getBag}" + publishDir "${logsDir}/getBag", mode: 'symlink', pattern: "${repRID_getBag}.getBag.err" input: - file bdbag - path cookies, stageAs: 'cookies.txt' from deriva + val repRID_getBag + path credential, stageAs: 'credential.json' from deriva + path 
derivaConfig output: - file("Replicate_*.zip") into bdbagSplit mode flatten - file("${bdbag.baseName}/data/File.csv") into fileMeta - file("${bdbag.baseName}/data/Experiment Settings.csv") into experimentSettingsMeta - file("${bdbag.baseName}/data/Experiment.csv") into experimentMeta - file ("${bdbag.baseName}.splitData.err") + path ("Replicate_*.zip") into bagit + file ("${repRID_getBag}.getBag.err") script: """ - hostname >> ${bdbag.baseName}.splitData.err - ulimit -a >> ${bdbag.baseName}.splitData.err - ln -sf `readlink -e cookies.txt` ~/.bdbag/deriva-cookies.txt 2>>${bdbag.baseName}.splitData.err - echo "LOG: deriva cookie linked" >> ${bdbag.baseName}.splitData.err - study=`echo "${bdbag}" | cut -d '.' -f1` 2>>${bdbag.baseName}.splitData.err - echo "LOG: \${study}" >> ${bdbag.baseName}.splitData.err - unzip ${bdbag} 2>>${bdbag.baseName}.splitData.err - echo "LOG: bdgag unzipped" >> ${bdbag.baseName}.splitData.err - python3 ${baseDir}/scripts/modifyFetch.py --fetchFile \${study} 2>>${bdbag.baseName}.splitData.err - echo "LOG: fetch file filtered for only .fastq.gz" >> ${bdbag.baseName}.splitData.err - python3 ${baseDir}/scripts/splitFetch.py --fetchFile \${study} 2>>${bdbag.baseName}.splitData.err - echo "LOG: fetch file split by replicates" >> ${bdbag.baseName}.splitData.err - sh ${baseDir}/scripts/splitBag.sh \${study} 2>>${bdbag.baseName}.splitData.err - echo "LOG: bag recreated with replicate split fetch file" >> ${bdbag.baseName}.splitData.err + hostname >>${repRID_getBag}.getBag.err + ulimit -a >>${repRID_getBag}.getBag.err + export https_proxy=\${http_proxy} + ln -sf `readlink -e credential.json` ~/.deriva/credential.json 2>>${repRID_getBag}.getBag.err + echo "LOG: deriva credentials linked" >>${repRID_getBag}.getBag.err + deriva-download-cli dev.gudmap.org --catalog 2 ${derivaConfig} . 
rid=${repRID_getBag} 2>>${repRID_getBag}.getBag.err """ } @@ -57,26 +61,36 @@ process splitData { * getData: fetch study files from consortium with downloaded bdbag.zip */ process getData { - tag "${rep.baseName}" - publishDir "${logsDir}/getData", mode: 'symlink', pattern: "${rep.baseName}.getData.err" + executor 'local' // directive: must be declared before the input block + tag "${repRID_getData}" + publishDir "${logsDir}/getData", mode: 'symlink', pattern: "${repRID_getData}.getData.err" input: - each rep from bdbagSplit + val repRID_getData + path cookies, stageAs: 'deriva-cookies.txt' from bdbag + path bagit output: - set val ("${rep.baseName}"), file ("*.R{1,2}.fastq.gz") into trimming + path ("*.R{1,2}.fastq.gz") into fastqs + file("**/File.csv") into fileMeta + file("**/Experiment Settings.csv") into experimentSettingsMeta + file("**/Experiment.csv") into experimentMeta + file ("${repRID_getData}.getData.err") + script: """ - hostname >>${rep.baseName}.getData.err - ulimit -a >>${rep.baseName}.getData.err + hostname >>${repRID_getData}.getData.err + ulimit -a >>${repRID_getData}.getData.err export https_proxy=\${http_proxy} - replicate=\$(basename "${rep}" | cut -d '.' -f1) - echo "LOG: \${replicate}" >>${rep.baseName}.getData.err - unzip ${rep} 2>>${rep.baseName}.getData.err - echo "LOG: replicate bdbag unzipped" >>${rep.baseName}.getData.err - sh ${baseDir}/scripts/bdbagFetch.sh \${replicate} 2>>${rep.baseName}.getData.err - echo "LOG: replicate bdbag fetched" >>${rep.baseName}.getData.err + ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt 2>>${repRID_getData}.getData.err + echo "LOG: deriva cookie linked" >>${repRID_getData}.getData.err + replicate=\$(basename "${bagit}" | cut -d '.' 
-f1) + echo "LOG: \${replicate}" >>${repRID_getData}.getData.err + unzip ${bagit} 2>>${repRID_getData}.getData.err + echo "LOG: replicate bdbag unzipped" >>${repRID_getData}.getData.err + sh ${baseDir}/scripts/bdbagFetch.sh \${replicate} ${repRID_getData} 2>>${repRID_getData}.getData.err + echo "LOG: replicate bdbag fetched" >>${repRID_getData}.getData.err """ } @@ -84,19 +98,34 @@ process getData { * trimData: trims any adapter or non-host sequences from the data */ process trimData { - tag "trim-${repID}" - publishDir "${outDir}/tempOut/trimmed", mode: "symlink", pattern: "*_val_{1,2}.fq.gz" - publishDir "${logsDir}/trimData", mode: 'symlink', pattern: "\${rep}.trimData.*" + tag "${repRID_trimData}" + publishDir "${logsDir}/trimData", mode: 'symlink', pattern: "${repRID_trimData}.trimData.*" // unescaped so the replicate RID is interpolated and the .log/.err files match input: - set repID, reads from trimming + val repRID_trimData + file(fastq) from fastqs output: - path ("*_val_{1,2}.fq.gz", type: 'file', maxDepth: '0') + path ("*.fq.gz") into fastqs_trimmed + val ends + file ("${repRID_trimData}.trimData.log") + file ("${repRID_trimData}.trimData.err") script: """ - rep=`echo ${repID} | cut -f2- -d '_'`; - trim_galore --gzip --max_n 1 --paired --basename \${rep} -j `nproc` ${reads[0]} ${reads[1]} 1>>\${rep}.trimData.log 2>>\${rep}.trimData.err; + if [ `nproc` -gt 8 ] + then + ncore=8 + else + ncore=`nproc` + fi + if [ '${fastq[1]}' == 'null' ] + then + ends='se' + trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID_trimData} -j \${ncore} ${fastq[0]} 1>>${repRID_trimData}.trimData.log 2>>${repRID_trimData}.trimData.err; + else + ends='pe' + trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID_trimData} -j \${ncore} ${fastq[0]} ${fastq[1]} 1>>${repRID_trimData}.trimData.log 2>>${repRID_trimData}.trimData.err; + fi """ -} +} \ No newline at end of file diff --git a/workflow/scripts/bdbagFetch.sh b/workflow/scripts/bdbagFetch.sh index 
9af4eb46c0e716e0e1db7cb66e9f027f63611218..902222a2ebb6aa7e978f0a820ad3c04472395848 100644 --- a/workflow/scripts/bdbagFetch.sh +++ b/workflow/scripts/bdbagFetch.sh @@ -1,6 +1,7 @@ #!/bin/bash -bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz $1 && +bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz $1 for i in $(find */ -name "*.R*.fastq.gz"); do - mv ${i} .; -done; + path=${2}$(echo ${i##*/} | grep -o "\.R.\.fastq\.gz"); + mv ${i} ./${path} +done; \ No newline at end of file diff --git a/workflow/scripts/splitBag.sh b/workflow/scripts/splitBag.sh deleted file mode 100644 index 3f6f6cdb610c684bdb57f666822dc0deb864fb04..0000000000000000000000000000000000000000 --- a/workflow/scripts/splitBag.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin - -for i in $(ls -d Replicate_*) -do -rsync -r $1/ ${i} --exclude=fetch.txt -zip -r ${i}.zip ${i} -done \ No newline at end of file diff --git a/workflow/tests/test_getBag.py b/workflow/tests/test_getBag.py new file mode 100644 index 0000000000000000000000000000000000000000..78d8478c7b7b50565de467ab3d1c07cbfea697ff --- /dev/null +++ b/workflow/tests/test_getBag.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python3 + +import pytest +import pandas as pd +from io import StringIO +import os + +test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ + '/../../' + +@pytest.mark.getBag +def test_getBag(): + assert os.path.exists(os.path.join(test_output_path, 'Replicate_16-1ZX4.zip')) \ No newline at end of file diff --git a/workflow/tests/test_getData.py b/workflow/tests/test_getData.py new file mode 100644 index 0000000000000000000000000000000000000000..36d0b22155fc5ef860497bf887dc595ca368c6ad --- /dev/null +++ b/workflow/tests/test_getData.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python3 + +import pytest +import pandas as pd +from io import StringIO +import os + +test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ + '/../../' + +@pytest.mark.getData +def test_getData(): + assert 
os.path.exists(os.path.join(test_output_path, 'Replicate_16-1ZX4/bagit.txt')) + assert os.path.exists(os.path.join(test_output_path, '16-1ZX4.R1.fastq.gz')) \ No newline at end of file diff --git a/workflow/tests/test_trimData.py b/workflow/tests/test_trimData.py new file mode 100644 index 0000000000000000000000000000000000000000..ea75252f558adc95e5feb564551afbe3f8858cb0 --- /dev/null +++ b/workflow/tests/test_trimData.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 + +import pytest +import pandas as pd +from io import StringIO +import os + +test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ + '/../../' + +@pytest.mark.trimData +def test_trimData(): + assert os.path.exists(os.path.join(test_output_path, '16-1ZX4_trimmed.fq.gz')) + assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_R1_val_1.fq.gz')) + assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_R2_val_2.fq.gz')) \ No newline at end of file