diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c2b63ce58bde32bc80c7bfbd5ee2792bad99fdb4..1f93dcef7aced3bfb7ed15c9faadfaa59f1d0d2e 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,6 +1,6 @@ before_script: - module load python/3.6.4-anaconda - - pip install --user attrs==19.1.0 pytest-pythonpath==0.7.1 pytest-cov==2.5.1 + - pip install --user attrs==20.3.0 pytest==6.2.2 pytest-pythonpath==0.7.3 pytest-cov==2.11.1 - module load singularity/3.5.3 - module load nextflow/20.01.0 - ln -sfn /project/BICF/BICF_Core/shared/gudmap/test_data/* ./test_data/ @@ -389,7 +389,7 @@ uploadQC: done echo all old mRNA QC RIDs deleted fi - rid=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' python3 ./workflow/scripts/upload_qc.py -r 17-BTFJ -e 17-BVDJ -p "Single Read" -s forward -l 35 -w 5 -f 1 -t 1 -n "This is a test mRNA QC" -o staging.gudmap.org -c ${cookie} -u F) + rid=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' python3 ./workflow/scripts/upload_qc.py -r 17-BTFJ -e 17-BVDJ -p "Single End" -s forward -l 35 -w 5 -f 1 -t 1 -n "This is a test mRNA QC" -o staging.gudmap.org -c ${cookie} -u F) echo ${rid} test mRNA QC created uploadProcessedFile: @@ -660,12 +660,13 @@ integration_se: only: [merge_requests] except: variables: - - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 16-1ZX4 --source staging --upload true -with-dag dag.png --dev false --ci true --email 'venkat.malladi@utsouthwestern.edu,Gervaise.Henry@UTSouthwestern.edu' + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 16-1ZX4 --source staging --upload true -with-dag dag.png --dev false --ci true --track true -with-report ./SE_report.html - find . -type f -name "multiqc_data.json" -exec cp {} ./SE_multiqc_data.json \; + - pytest -m completionMultiqc --filename SE_multiqc_data.json artifacts: name: "$CI_JOB_NAME" when: always @@ -684,12 +685,13 @@ integration_pe: only: [merge_requests] except: variables: - - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5JA --source staging --upload true -with-dag dag.png --dev false --ci true + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5JA --source staging --upload true -with-dag dag.png --dev false --ci true --track true -with-report ./PE_report.html - find . 
-type f -name "multiqc_data.json" -exec cp {} ./PE_multiqc_data.json \; + - pytest -m completionMultiqc --filename PE_multiqc_data.json artifacts: name: "$CI_JOB_NAME" when: always @@ -710,11 +712,11 @@ failAmbiguousSpecies: only: [merge_requests] except: variables: - - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ER --source staging --upload true -with-dag dag.png --dev false --ci true + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ER --source staging --upload true -with-dag dag.png --dev false --ci true --track true -with-report ./failAmbiguousSpecies_report.html retry: max: 0 when: @@ -725,11 +727,11 @@ failTrunkation: only: [merge_requests] except: variables: - - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ET --source staging --upload true -with-dag dag.png --dev false --ci true + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ET --source staging --upload true -with-dag dag.png --dev false --ci true --track true -with-report ./failTrunkation_report.html retry: max: 0 when: @@ -740,11 +742,11 @@ failMismatchR1R2: only: [merge_requests] except: variables: - - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 16-CWH4 --source staging --upload true -with-dag dag.png --dev false --ci true + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 16-CWH4 --source staging --upload true -with-dag dag.png --dev false --ci true --track true -with-report ./failMismatchR1R2_report.html retry: max: 0 when: @@ -755,11 +757,11 @@ failUnexpectedMeta: only: [merge_requests] except: variables: - - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 14-3R4R --source staging --upload true -with-dag dag.png --dev false --ci true + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 14-3R4R --source staging --upload true -with-dag dag.png --dev false --ci true --track true -with-report ./failUnexpectedMeta_report.html retry: max: 0 when: @@ -770,11 +772,11 @@ failFileStructure: only: [merge_requests] except: variables: - - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5HT --source staging --upload true -with-dag dag.png --dev false --ci true + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json 
--bdbag ./test_data/auth/cookies.txt --repRID Q-Y5HT --source staging --upload true -with-dag dag.png --dev false --ci true --track true -with-report ./failFileStructure_report.html retry: max: 0 when: @@ -785,17 +787,20 @@ override_inputBag: only: [merge_requests] except: variables: - - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --source staging --inputBagForce ./test_data/bag/Q-Y5F6_inputBag_xxxxxxxx.zip --upload false --dev false --ci true - - find . -type f -name "multiqc_data.json" -exec cp {} ./inputBagOverride_PE_multiqc_data.json \; + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --source staging --inputBagForce ./test_data/bag/Q-Y5F6_inputBag_xxxxxxxx.zip --upload false --dev false --ci true --track false -with-report ./inputBagOverride_report.html + - find . -type f -name "multiqc_data.json" -exec cp {} ./inputBagOverride_multiqc_data.json \; + - find ./**/report/ -type f -name "*multiqc.html" -exec cp {} ./inputBagOverride_multiqc.html \; + - pytest -m completionMultiqc --filename inputBagOverride_multiqc_data.json artifacts: name: "$CI_JOB_NAME" when: always paths: - - inputBagOverride_PE_multiqc_data.json + - inputBagOverride_multiqc_data.json + - inputBagOverride_multiqc.html expire_in: 7 days retry: max: 0 @@ -807,17 +812,20 @@ override_fastq: only: [merge_requests] except: variables: - - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --source staging --fastqsForce './test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz' --upload false --dev false --ci true - - find . -type f -name "multiqc_data.json" -exec cp {} ./fastqOverride_PE_multiqc_data.json \; + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --source staging --fastqsForce './test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz' --upload false --dev false --ci true --track false -with-report ./fastqOverride_report.html + - find . -type f -name "multiqc_data.json" -exec cp {} ./fastqOverride_multiqc_data.json \; + - find ./**/report/ -type f -name "*multiqc.html" -exec cp {} ./fastqOverride_multiqc.html \; + - pytest -m completionMultiqc --filename fastqOverride_multiqc_data.json artifacts: name: "$CI_JOB_NAME" when: always paths: - - fastqOverride_PE_multiqc_data.json + - fastqOverride_multiqc_data.json + - fastqOverride_multiqc.html expire_in: 7 days retry: max: 0 @@ -829,17 +837,70 @@ override_species: only: [merge_requests] except: variables: - - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + script: + - hostname + - ulimit -a + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5EW --source staging --speciesForce 'Homo sapiens' --upload true --dev false --ci true --track false -with-report ./speciesOverride_report.html + - find . 
-type f -name "multiqc_data.json" -exec cp {} ./speciesOverride_multiqc_data.json \; + - find ./**/report/ -type f -name "*multiqc.html" -exec cp {} ./speciesOverride_multiqc.html \; + - pytest -m completionMultiqc --filename speciesOverride_multiqc_data.json + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - speciesOverride_multiqc_data.json + - speciesOverride_multiqc.html + expire_in: 7 days + retry: + max: 0 + when: + - always + +override_stranded: + stage: integration + only: [merge_requests] + except: + variables: + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + script: + - hostname + - ulimit -a + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5EY --source staging --strandedForce unstranded --upload true --dev false --ci true --track false -with-report ./strandedOverride_report.html + - find . -type f -name "multiqc_data.json" -exec cp {} ./strandedOverride_multiqc_data.json \; + - find ./**/report/ -type f -name "*multiqc.html" -exec cp {} ./strandedOverride_multiqc.html \; + - pytest -m completionMultiqc --filename strandedOverride_multiqc_data.json + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - strandedOverride_multiqc_data.json + - strandedOverride_multiqc.html + expire_in: 7 days + retry: + max: 0 + when: + - always + +override_spike: + stage: integration + only: [merge_requests] + except: + variables: + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ER --source staging --speciesForce 'Homo sapiens' --upload false --dev false --ci true - - find . -type f -name "multiqc_data.json" -exec cp {} ./speciesOverride_PE_multiqc_data.json \; + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F0 --source staging --spikeForce true --upload true --dev false --ci true --track false -with-report ./spikeOverride_report.html + - find . 
-type f -name "multiqc_data.json" -exec cp {} ./spikeOverride_multiqc_data.json \; + - find ./**/report/ -type f -name "*multiqc.html" -exec cp {} ./spikeOverride_multiqc.html \; + - pytest -m completionMultiqc --filename spikeOverride_multiqc_data.json artifacts: name: "$CI_JOB_NAME" when: always paths: - - speciesOverride_PE_multiqc_data.json + - spikedOverride_multiqc_data.json + - spikeOverride_multiqc.html expire_in: 7 days retry: max: 0 diff --git a/CHANGELOG.md b/CHANGELOG.md index 3382228f8fdee34882c81df36e2773641a82a604..21fb63812a42a7580996eacbd70c0a03f4866648 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ -# v1.0.3 +# v2.0.0rc01 **User Facing** +* Endness metadata "Single Read" changed to "Single End" in data-hub, pipeline updated to handle (#110) ("Single Read" still acceptable for backwards compatibility) +* Strandedness metadata "yes"/"no" changed to boolean "t"/"f" in data-hub, pipeline updated to handle (#70) ("yes"/"no" still acceptable for backwards compatibility) +* Upload empty mRNA_QC entry if data error (#111) +* Allow forcing of strandedness and spike (#100) **Background** * Add memory limit (75%) per thread for samtools sort (#108) @@ -15,10 +19,16 @@ * Detect malformed fastq's (#107) * Restrict sampled alignment process to use >32GB nodes on BioHPC (#108) * Use nproc**-1** for alignment processes (#108) +* Data-hub column title change from "Sequencing_Type" to "Experiment_Type" (#114) +* Data-hub column title change from "Has_Strand_Specific_Information" to "Strandedness" (#115) +* Merge data error pre-inference execution run upload/finalize to 1 process +* Change uploadOutputBag logic to change reuse hatrac file if alread exists (re-uses Output_Bag entry by reassigning Execution_Run RID) (#112) +* Add new CI py tests for override and integration *Known Bugs* * Override params (inputBag, fastq, species) aren't checked for integrity * Authentication files and tokens must be active (active auth client) for the duration of the pipeline run (until long-lived token utilization included) +* Check for outputBag in hatrac doesn't check for any uploaded by chaise <hr> diff --git a/README.md b/README.md index 90878a2234e5fe326cefa12c915d5c75fab81bb2..7b715d4b94ce92f8cd93806dd21bd04481f83b0a 100644 --- a/README.md +++ b/README.md @@ -57,8 +57,12 @@ To Run: * eg: `--inputBagForce test_data/bag/Q-Y5F6_inputBag_xxxxxxxx.zip` (must be the expected bag structure, this example will not work because it is a test bag) * `--fastqsForce` utilizes local fastq's instead of downloading from the data-hub (still requires accurate repRID input) * eg: `--fastqsForce 'test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz'` (note the quotes around fastq's which must me named in the correct standard [*\*.R1.fastq.gz and/or \*.R2.fastq.gz*] and in the correct order) - * `--speciesForce` forces the species to be "Mus musculus" or "Homo sapiens", it bypasses ambiguous species error + * `--speciesForce` forces the species to be "Mus musculus" or "Homo sapiens", it bypasses a metadata mismatch or an ambiguous species error * eg: `--speciesForce 'Mus musculus'` + * `--strandedForce` forces the strandedness to be "forward", "reverse" or "unstranded", it bypasses a metadata mismatch error + * eg: `--strandedForce 'unstranded'` + * `--spikeForce` forces the spike-in to be "false" or "true", it bypasses a metadata mismatch error + * eg: `--spikeForce 'true'` * Tracking parameters ([Tracking Site](http://bicf.pipeline.tracker.s3-website-us-east-1.amazonaws.com/)): * `--ci` boolean (default = false) 
diff --git a/conftest.py b/conftest.py new file mode 100644 index 0000000000000000000000000000000000000000..289b4930cebfbe0f9850a46260c90e7d2f794d8f --- /dev/null +++ b/conftest.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python3 +import pytest + + +def pytest_addoption(parser): + parser.addoption("--filename", action="store") + +@pytest.fixture(scope='session') +def filename(request): + filename_value = request.config.option.filename + if filename_value is None: + pytest.skip() + return filename_value \ No newline at end of file diff --git a/docs/dag.png b/docs/dag.png index 7eb7c79f097a7fcac368a58ef63cfda42b154f41..a19e02c47d0ca333f420061965ffda893ae42c83 100755 Binary files a/docs/dag.png and b/docs/dag.png differ diff --git a/test_data/Replicate_For_Input_Bag(test).json b/test_data/Replicate_For_Input_Bag(test).json index 46fefe878c7c370792b403c4fb89d3ac79fd5c69..2feaa8ccac3dcf7149e9af8f381dbc4c156ea207 100644 --- a/test_data/Replicate_For_Input_Bag(test).json +++ b/test_data/Replicate_For_Input_Bag(test).json @@ -19,7 +19,7 @@ "processor": "csv", "processor_params": { "output_path": "Experiment", - "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment:RID)/Experiment_RID:=RID,Study_RID,Internal_ID,Name,Description,Experiment_Method,Sequencing_Type,Species,Specimen_Type,Molecule_Type,Pooled_Sample,Pool_Size,Markers,Cell_Count,Treatment_Protocol,Treatment_Protocol_Reference,Isolation_Protocol,Isolation_Protocol_Reference,Growth_Protocol,Growth_Protocol_Reference,Label_Protocol,Label_Protocol_Reference,Hybridization_Protocol,Hybridization_Protocol_Reference,Scan_Protocol,Scan_Protocol_Reference,Data_Processing,Value_Definition,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none" + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment:RID)/Experiment_RID:=RID,Study_RID,Internal_ID,Name,Description,Experiment_Method,Experiment_Type,Species,Specimen_Type,Molecule_Type,Pooled_Sample,Pool_Size,Markers,Cell_Count,Treatment_Protocol,Treatment_Protocol_Reference,Isolation_Protocol,Isolation_Protocol_Reference,Growth_Protocol,Growth_Protocol_Reference,Label_Protocol,Label_Protocol_Reference,Hybridization_Protocol,Hybridization_Protocol_Reference,Scan_Protocol,Scan_Protocol_Reference,Data_Processing,Value_Definition,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none" } }, { @@ -40,7 +40,7 @@ "processor": "csv", "processor_params": { "output_path": "Experiment Settings", - "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Settings:Experiment_RID)/RID,Experiment_RID,Alignment_Format,Aligner,Aligner_Version,Reference_Genome,Sequence_Trimming,Duplicate_Removal,Pre-alignment_Sequence_Removal,Junction_Reads,Library_Type,Protocol_Reference,Library_Selection,Quantification_Format,Quantification_Software,Expression_Metric,Transcriptome_Model,Sequencing_Platform,Paired_End,Read_Length,Has_Strand_Specific_Information,Used_Spike_Ins,Spike_Ins_Amount,Visualization_Format,Visualization_Software,Visualization_Version,Visualization_Setting,Notes,RCT,RMT?limit=none" + "query_path": 
"/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Settings:Experiment_RID)/RID,Experiment_RID,Alignment_Format,Aligner,Aligner_Version,Reference_Genome,Sequence_Trimming,Duplicate_Removal,Pre-alignment_Sequence_Removal,Junction_Reads,Library_Type,Protocol_Reference,Library_Selection,Quantification_Format,Quantification_Software,Expression_Metric,Transcriptome_Model,Sequencing_Platform,Paired_End,Read_Length,Strandedness,Used_Spike_Ins,Spike_Ins_Amount,Visualization_Format,Visualization_Software,Visualization_Version,Visualization_Setting,Notes,RCT,RMT?limit=none" } }, { diff --git a/test_data/createTestData.sh b/test_data/createTestData.sh index 35fa2a4f467627a09bedd6f2675df04971c341f1..5d876ed032790d0e3442aed94a0fd79e5e430e60 100644 --- a/test_data/createTestData.sh +++ b/test_data/createTestData.sh @@ -41,8 +41,8 @@ cp Q-Y5F6_1M.R1.fastq.gz_trimming_report.txt ./NEW_test_data/meta/Q-Y5F6_1M.R1.f cp Q-Y5F6_1M.R2.fastq.gz_trimming_report.txt ./NEW_test_data/meta/Q-Y5F6_1M.R2.fastq.gz_trimming_report.txt touch metaTest.csv -echo 'Replicate_RID,Experiment_RID,Study_RID,Paired_End,File_Type,Has_Strand_Specific_Information,Used_Spike_Ins,Species,Read_Length' > metaTest.csv -echo 'Replicate_RID,Experiment_RID,Study_RID,uk,FastQ,no,no,Homo sapiens,75' >> metaTest.csv +echo 'Replicate_RID,Experiment_RID,Study_RID,Paired_End,File_Type,Strandedness,Used_Spike_Ins,Species,Read_Length' > metaTest.csv +echo 'Replicate_RID,Experiment_RID,Study_RID,uk,FastQ,unstranded,f,Homo sapiens,75' >> metaTest.csv cp metaTest.csv ./NEW_test_data/meta/metaTest.csv mkdir -p ./NEW_test_data/bam diff --git a/workflow/conf/Replicate_For_Input_Bag.json b/workflow/conf/Replicate_For_Input_Bag.json index 842cf62fbb5237481a62173ff88be71fb22d04a4..508a0245051534fae39020792719b04d78947613 100644 --- a/workflow/conf/Replicate_For_Input_Bag.json +++ b/workflow/conf/Replicate_For_Input_Bag.json @@ -19,7 +19,7 @@ "processor": "csv", "processor_params": { "output_path": "Experiment", - "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment:RID)/Experiment_RID:=RID,Study_RID,Internal_ID,Name,Description,Experiment_Method,Sequencing_Type,Species,Specimen_Type,Molecule_Type,Pooled_Sample,Pool_Size,Markers,Cell_Count,Treatment_Protocol,Treatment_Protocol_Reference,Isolation_Protocol,Isolation_Protocol_Reference,Growth_Protocol,Growth_Protocol_Reference,Label_Protocol,Label_Protocol_Reference,Hybridization_Protocol,Hybridization_Protocol_Reference,Scan_Protocol,Scan_Protocol_Reference,Data_Processing,Value_Definition,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none" + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment:RID)/Experiment_RID:=RID,Study_RID,Internal_ID,Name,Description,Experiment_Method,Experiment_Type,Species,Specimen_Type,Molecule_Type,Pooled_Sample,Pool_Size,Markers,Cell_Count,Treatment_Protocol,Treatment_Protocol_Reference,Isolation_Protocol,Isolation_Protocol_Reference,Growth_Protocol,Growth_Protocol_Reference,Label_Protocol,Label_Protocol_Reference,Hybridization_Protocol,Hybridization_Protocol_Reference,Scan_Protocol,Scan_Protocol_Reference,Data_Processing,Value_Definition,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none" } }, { @@ -40,7 +40,7 @@ "processor": "csv", "processor_params": { "output_path": "Experiment Settings", - "query_path": 
"/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Settings:Experiment_RID)/RID,Experiment_RID,Alignment_Format,Aligner,Aligner_Version,Reference_Genome,Sequence_Trimming,Duplicate_Removal,Pre-alignment_Sequence_Removal,Junction_Reads,Library_Type,Protocol_Reference,Library_Selection,Quantification_Format,Quantification_Software,Expression_Metric,Transcriptome_Model,Sequencing_Platform,Paired_End,Read_Length,Has_Strand_Specific_Information,Used_Spike_Ins,Spike_Ins_Amount,Visualization_Format,Visualization_Software,Visualization_Version,Visualization_Setting,Notes,RCT,RMT?limit=none" + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Settings:Experiment_RID)/RID,Experiment_RID,Alignment_Format,Aligner,Aligner_Version,Reference_Genome,Sequence_Trimming,Duplicate_Removal,Pre-alignment_Sequence_Removal,Junction_Reads,Library_Type,Protocol_Reference,Library_Selection,Quantification_Format,Quantification_Software,Expression_Metric,Transcriptome_Model,Sequencing_Platform,Paired_End,Read_Length,Strandedness,Used_Spike_Ins,Spike_Ins_Amount,Visualization_Format,Visualization_Software,Visualization_Version,Visualization_Setting,Notes,RCT,RMT?limit=none" } }, { diff --git a/workflow/conf/aws.config b/workflow/conf/aws.config index d87669599a7e70add448f0cc7f0636dd8bef499b..bf5b59c7cf9db00606a5db9f97c706d53f21137f 100644 --- a/workflow/conf/aws.config +++ b/workflow/conf/aws.config @@ -112,20 +112,15 @@ process { cpus = 1 memory = '1 GB' } - withName:failPreExecutionRun_fastq { + withName:failPreExecutionRun { cpus = 1 memory = '1 GB' } - withName:failPreExecutionRun_fastqFile { - cpus = 1 - memory = '1 GB' - } - withName:failPreExecutionRun_species { - { + withName:failExecutionRun { cpus = 1 memory = '1 GB' } - withName:failExecutionRun { + withName:uploadQC_fail { cpus = 1 memory = '1 GB' } diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index cc058dfef74bc49adf3932554994678934ac7a44..a12f2a704b3c63df9031789c2bb05d11e04d6b3a 100755 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -82,16 +82,13 @@ process { withName:finalizeExecutionRun { executor = 'local' } - withName:failPreExecutionRun_fastq { + withName:failPreExecutionRun { executor = 'local' } - withName:failPreExecutionRun_fastqFile { - executor = 'local' - } - withName:failPreExecutionRun_species { + withName:failExecutionRun { executor = 'local' } - withName:failExecutionRun { + withName:uploadQC_fail { executor = 'local' } } diff --git a/workflow/nextflow.config b/workflow/nextflow.config index 29c8515a852fb3356335b89a71f4ddf4f2c7a78b..44f2df5255691ee4eaf11ecf9cee1af2fa27f743 100644 --- a/workflow/nextflow.config +++ b/workflow/nextflow.config @@ -88,16 +88,13 @@ process { withName:finalizeExecutionRun { container = 'gudmaprbk/deriva1.4:1.0.0' } - withName:failPreExecutionRun_fastq { + withName:failPreExecutionRun { container = 'gudmaprbk/deriva1.4:1.0.0' } - withName:failPreExecutionRun_fastqFile { - container = 'gudmaprbk/deriva1.4:1.0.0' - } - withName:failPreExecutionRun_species { + withName:failExecutionRun { container = 'gudmaprbk/deriva1.4:1.0.0' } - withName:failExecutionRun { + withName:uploadQC_fail { container = 'gudmaprbk/deriva1.4:1.0.0' } } @@ -128,6 +125,6 @@ manifest { homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq' description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.' 
mainScript = 'rna-seq.nf' - version = 'v1.0.3' + version = 'v2.0.0rc01' nextflowVersion = '>=19.09.0' } diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index cac1b878d53b22685e973560cfc1c01ac0713e52..e3690bfcce75dbce98d553648995e023c97af905 100644 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -20,6 +20,7 @@ params.refERCCVersion = "92" params.outDir = "${baseDir}/../output" params.upload = false params.email = "" +params.track = false // Define override input variable @@ -27,6 +28,8 @@ params.refSource = "biohpc" params.inputBagForce = "" params.fastqsForce = "" params.speciesForce = "" +params.strandedForce = "" +params.spikeForce = "" // Define tracking input variables params.ci = false @@ -44,12 +47,11 @@ deriva.into { deriva_uploadInputBag deriva_uploadExecutionRun deriva_uploadQC + deriva_uploadQC_fail deriva_uploadProcessedFile deriva_uploadOutputBag deriva_finalizeExecutionRun - deriva_failPreExecutionRun_fastq - deriva_failPreExecutionRun_fastqFile - deriva_failPreExecutionRun_species + deriva_failPreExecutionRun deriva_failExecutionRun } bdbag = Channel @@ -65,6 +67,8 @@ upload = params.upload inputBagForce = params.inputBagForce fastqsForce = params.fastqsForce speciesForce = params.speciesForce +strandedForce = params.strandedForce +spikeForce = params.spikeForce email = params.email // Define fixed files and variables @@ -100,13 +104,13 @@ script_tinHist = Channel.fromPath("${baseDir}/scripts/tin_hist.py") script_uploadInputBag = Channel.fromPath("${baseDir}/scripts/upload_input_bag.py") script_uploadExecutionRun_uploadExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py") script_uploadExecutionRun_finalizeExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py") -script_uploadExecutionRun_failPreExecutionRun_fastq = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py") -script_uploadExecutionRun_failPreExecutionRun_fastqFile = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py") -script_uploadExecutionRun_failPreExecutionRun_species = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py") +script_uploadExecutionRun_failPreExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py") script_uploadExecutionRun_failExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py") script_uploadQC = Channel.fromPath("${baseDir}/scripts/upload_qc.py") +script_uploadQC_fail = Channel.fromPath("${baseDir}/scripts/upload_qc.py") script_uploadOutputBag = Channel.fromPath("${baseDir}/scripts/upload_output_bag.py") script_deleteEntry_uploadQC = Channel.fromPath("${baseDir}/scripts/delete_entry.py") +script_deleteEntry_uploadQC_fail = Channel.fromPath("${baseDir}/scripts/delete_entry.py") script_deleteEntry_uploadProcessedFile = Channel.fromPath("${baseDir}/scripts/delete_entry.py") /* @@ -115,37 +119,40 @@ script_deleteEntry_uploadProcessedFile = Channel.fromPath("${baseDir}/scripts/de process trackStart { container 'docker://gudmaprbk/gudmap-rbk_base:1.0.0' script: - """ - hostname - ulimit -a - - curl -H 'Content-Type: application/json' -X PUT -d \ - '{ \ - "sessionId": "${workflow.sessionId}", \ - "pipeline": "gudmap.rbk_rnaseq", \ - "start": "${workflow.start}", \ - "repRID": "${repRID}", \ - "astrocyte": false, \ - "status": "started", \ - "nextflowVersion": "${workflow.nextflow.version}", \ - "pipelineVersion": "${workflow.manifest.version}", \ - "ci": ${params.ci}, \ - "dev": ${params.dev} \ - }' \ - 
"https://xku43pcwnf.execute-api.us-east-1.amazonaws.com/ProdDeploy/pipeline-tracking" - - curl -H 'Content-Type: application/json' -X PUT -d \ - '{ \ - "ID": "${workflow.sessionId}", \ - "repRID": "${repRID}", \ - "PipelineVersion": "${workflow.manifest.version}", \ - "Server": "${params.source}", \ - "Queued": "NA", \ - "CheckedOut": "NA", \ - "Started": "${workflow.start}" \ - }' \ - "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track" - """ + """ + hostname + ulimit -a + + curl -H 'Content-Type: application/json' -X PUT -d \ + '{ \ + "sessionId": "${workflow.sessionId}", \ + "pipeline": "gudmap.rbk_rnaseq", \ + "start": "${workflow.start}", \ + "repRID": "${repRID}", \ + "astrocyte": false, \ + "status": "started", \ + "nextflowVersion": "${workflow.nextflow.version}", \ + "pipelineVersion": "${workflow.manifest.version}", \ + "ci": ${params.ci}, \ + "dev": ${params.dev} \ + }' \ + "https://xku43pcwnf.execute-api.us-east-1.amazonaws.com/ProdDeploy/pipeline-tracking" + + if [ ${params.track} == true ] + then + curl -H 'Content-Type: application/json' -X PUT -d \ + '{ \ + "ID": "${workflow.sessionId}", \ + "repRID": "${repRID}", \ + "PipelineVersion": "${workflow.manifest.version}", \ + "Server": "${params.source}", \ + "Queued": "NA", \ + "CheckedOut": "NA", \ + "Started": "${workflow.start}" \ + }' \ + "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track" + fi + """ } log.info """\ @@ -160,6 +167,7 @@ ERCC Reference Version : ${params.refERCCVersion} Reference source : ${params.refSource} Output Directory : ${params.outDir} Upload : ${upload} +Track : ${params.track} ------------------------------------ Nextflow Version : ${workflow.nextflow.version} Pipeline Version : ${workflow.manifest.version} @@ -337,12 +345,16 @@ process parseMetadata { # get endedness metadata endsRaw=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p endsMeta) echo -e "LOG: endedness metadata parsed: \${endsRaw}" >> ${repRID}.parseMetadata.log - if [ "\${endsRaw}" == "Single Read" ] + if [ "\${endsRaw}" == "Single End" ] then endsMeta="se" elif [ "\${endsRaw}" == "Paired End" ] then endsMeta="pe" + elif [ "\${endsRaw}" == "Single Read" ] + # "Single Read" depreciated as of Jan 2021, this option is present for backwards compatibility + then + endsMeta="se" elif [ "\${endsRaw}" == "nan" ] then endsRaw="_No value_" @@ -361,14 +373,45 @@ process parseMetadata { # get strandedness metadata stranded=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p stranded) echo -e "LOG: strandedness metadata parsed: \${stranded}" >> ${repRID}.parseMetadata.log + if [ "\${stranded}" == "nan" ] + then + stranded="_No value_" + fi # get spike-in metadata spike=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p spike) echo -e "LOG: spike-in metadata parsed: \${spike}" >> ${repRID}.parseMetadata.log + if [ "\${spike}" == "nan" ] + then + spike="_No value_" + fi + if [ "\${spike}" == "f" ] + then + spike="false" + elif [ "\${spike}" == "t" ] + then + spike="true" + elif [ "\${spike}" == "no" ] + # "yes"/"no" depreciated as of Jan 2021, this option is present for backwards compatibility + then + spike="false" + elif [ "\${spike}" == "yes" ] + # "yes"/"no" depreciated as of Jan 2021, this option is present for backwards compatibility + then + spike="true" + elif [ "\${spike}" == "nan" ] + then + endsRaw="_No value_" + endsMeta="NA" + fi # get species metadata species=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experiment}" -p 
species) echo -e "LOG: species metadata parsed: \${species}" >> ${repRID}.parseMetadata.log + if [ "\${species}" == "nan" ] + then + species="_No value_" + fi # get read length metadata readLength=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p readLength) @@ -463,17 +506,13 @@ strandedMeta.into { spikeMeta.into { spikeMeta_checkMetadata spikeMeta_aggrQC - spikeMeta_failPreExecutionRun_fastq - spikeMeta_failPreExecutionRun_fastqFile - spikeMeta_failPreExecutionRun_species + spikeMeta_failPreExecutionRun spikeMeta_failExecutionRun } speciesMeta.into { speciesMeta_checkMetadata speciesMeta_aggrQC - speciesMeta_failPreExecutionRun_fastq - speciesMeta_failPreExecutionRun_fastqFile - speciesMeta_failPreExecutionRun_species + speciesMeta_failPreExecutionRun speciesMeta_failExecutionRun } studyRID.into { @@ -517,6 +556,7 @@ fastqCountError.into { fastqCountError_dataQC fastqCountError_aggrQC fastqCountError_uploadQC + fastqCountError_uploadQC_fail fastqCountError_uploadProcessedFile fastqCountError_uploadOutputBag fastqCountError_failPreExecutionRun_fastq @@ -538,6 +578,7 @@ fastqReadError.into { fastqReadError_dataQC fastqReadError_aggrQC fastqReadError_uploadQC + fastqReadError_uploadQC_fail fastqReadError_uploadProcessedFile fastqReadError_uploadOutputBag fastqReadError_failPreExecutionRun_fastq @@ -630,6 +671,7 @@ fastqFileError.into { fastqFileError_dataQC fastqFileError_aggrQC fastqFileError_uploadQC + fastqFileError_uploadQC_fail fastqFileError_uploadProcessedFile fastqFileError_uploadOutputBag fastqFileError_failPreExecutionRun_fastqFile @@ -898,6 +940,8 @@ process inferMetadata { path bam from sampleBam.collect() path bai from sampleBai.collect() path alignSummary from alignSampleQC_inferMetadata.collect() + val strandedForce + val spikeForce val fastqCountError_inferMetadata val fastqReadError_inferMetadata val fastqFileError_inferMetadata @@ -931,11 +975,16 @@ process inferMetadata { # determine spike-in if [ 1 -eq \$(echo \$(expr \${align_ercc} ">=" 10)) ] then - spike="yes" + spike="true" else - spike="no" + spike="false" fi echo -e "LOG: inference of strandedness results is: \${spike}" >> ${repRID}.inferMetadata.log + if [ "${spikeForce}" != "" ] + then + spike=${spikeForce} + echo -e "LOG: spike-in metadata forced: \${spike}" >> ${repRID}.parseMetadata.log + fi speciesError=false speciesError_details="" @@ -1008,6 +1057,11 @@ process inferMetadata { stranded="unstranded" fi echo -e "LOG: stradedness set to: \${stranded}" >> ${repRID}.inferMetadata.log + if [ "${strandedForce}" != "" ] + then + stranded=${strandedForce} + echo -e "LOG: spike-in metadata forced: \${stranded}" >> ${repRID}.inferMetadata.log + fi else ends="" stranded="" @@ -1106,6 +1160,7 @@ speciesError.into { speciesError_dataQC speciesError_aggrQC speciesError_uploadQC + speciesError_uploadQC_fail speciesError_uploadProcessedFile speciesError_uploadOutputBag speciesError_failPreExecutionRun_species @@ -1147,52 +1202,75 @@ process checkMetadata { ulimit -a >> ${repRID}.checkMetadata.log pipelineError=false + pipelineError_ends=false + pipelineError_stranded=false + pipelineError_spike=false + pipelineError_species=false # check if submitted metadata matches inferred - if [ "${endsMeta}" != "${endsInfer}" ] - then - pipelineError=true - pipelineError_ends=true - echo -e "LOG: ends do not match: Submitted=${endsMeta}; Inferred=${endsInfer}" >> ${repRID}.checkMetadata.log - else - pipelineError_ends=false - echo -e "LOG: ends matches: Submitted=${endsMeta}; Inferred=${endsInfer}" >> 
${repRID}.checkMetadata.log - fi if [ "${strandedMeta}" != "${strandedInfer}" ] then - pipelineError=true - pipelineError_stranded=true - if [ "${strandedMeta}" == "stranded" ] + if [ "${params.strandedForce}" != "" ] then - if [[ "${strandedInfer}" == "forward" ]] || [[ "${strandedInfer}" == "reverse" ]] + pipelineError=false + pipelineError_stranded=false + echo -e "LOG: stranded forced: Submitted=${strandedMeta}; Inferred=${strandedInfer}" >> ${repRID}.checkMetadata.log + else + pipelineError=true + pipelineError_stranded=true + if [ "${strandedMeta}" == "stranded" ] then - pipelineError=false - pipelineError_stranded=false - echo -e "LOG: stranded matches: Submitted=${strandedMeta}; Inferred=${strandedInfer}" >> ${repRID}.checkMetadata.log + if [[ "${strandedInfer}" == "forward" ]] || [[ "${strandedInfer}" == "reverse" ]] + then + pipelineError=false + pipelineError_stranded=false + echo -e "LOG: stranded matches: Submitted=${strandedMeta}; Inferred=${strandedInfer}" >> ${repRID}.checkMetadata.log + else + echo -e "LOG: stranded does not match: Submitted=${strandedMeta}; Inferred=${strandedInfer}" >> ${repRID}.checkMetadata.log + fi else echo -e "LOG: stranded does not match: Submitted=${strandedMeta}; Inferred=${strandedInfer}" >> ${repRID}.checkMetadata.log fi - else - echo -e "LOG: stranded does not match: Submitted=${strandedMeta}; Inferred=${strandedInfer}" >> ${repRID}.checkMetadata.log fi else pipelineError=false pipelineError_stranded=false echo -e "LOG: stranded matches: Submitted=${strandedMeta}; Inferred=${strandedInfer}" >> ${repRID}.checkMetadata.log fi - if [ "${spikeMeta}" != "${spikeInfer}" ] + if [ "${endsMeta}" != "${endsInfer}" ] then pipelineError=true - pipelineError_spike=true - echo -e "LOG: spike does not match: Submitted=${spikeMeta}; Inferred=${spikeInfer}" >> ${repRID}.checkMetadata.log + pipelineError_ends=true + echo -e "LOG: ends do not match: Submitted=${endsMeta}; Inferred=${endsInfer}" >> ${repRID}.checkMetadata.log + else + pipelineError_ends=false + echo -e "LOG: ends matches: Submitted=${endsMeta}; Inferred=${endsInfer}" >> ${repRID}.checkMetadata.log + fi + if [ "${spikeMeta}" != "${spikeInfer}" ] + then + if [[ "${params.spikeForce}" != "" ]] + then + pipelineError_spike=false + echo -e "LOG: spike forced: Submitted=${spikeMeta}; Inferred=${spikeInfer}" >> ${repRID}.checkMetadata.log + else + pipelineError=true + pipelineError_spike=true + echo -e "LOG: spike does not match: Submitted=${spikeMeta}; Inferred=${spikeInfer}" >> ${repRID}.checkMetadata.log + fi else pipelineError_spike=false - echo -e "LOG: stranded matches: Submitted=${spikeMeta}; Inferred=${spikeInfer}" >> ${repRID}.checkMetadata.log + echo -e "LOG: spike matches: Submitted=${spikeMeta}; Inferred=${spikeInfer}" >> ${repRID}.checkMetadata.log fi if [ "${speciesMeta}" != "${speciesInfer}" ] then - pipelineError=true - pipelineError_species=true - echo -e "LOG: species does not match: Submitted=${speciesMeta}; Inferred=${speciesInfer}" >> ${repRID}.checkMetadata.log + if [[ "${params.speciesForce}" != "" ]] + then + pipelineError_species=false + echo -e "LOG: species forced: Submitted=${speciesMeta}; Inferred=${speciesInfer}" >> ${repRID}.checkMetadata.log + else + pipelineError=true + pipelineError_species=true + echo -e "LOG: species does not match: Submitted=${speciesMeta}; Inferred=${speciesInfer}" >> ${repRID}.checkMetadata.log + fi else pipelineError_species=false echo -e "LOG: species matches: Submitted=${speciesMeta}; Inferred=${speciesInfer}" >> ${repRID}.checkMetadata.log @@ 
-1234,6 +1312,7 @@ pipelineError.into { pipelineError_dataQC pipelineError_aggrQC pipelineError_uploadQC + pipelineError_uploadQC_fail pipelineError_uploadProcessedFile pipelineError_uploadOutputBag pipelineError_failExecutionRun @@ -1258,39 +1337,39 @@ process uploadInputBag { upload script: - """ - hostname > ${repRID}.uploadInputBag.log - ulimit -a >> ${repRID}.uploadInputBag.log - - yr=\$(date +'%Y') - mn=\$(date +'%m') - dy=\$(date +'%d') - - file=\$(basename -a ${inputBag}) - md5=\$(md5sum ./\${file} | awk '{ print \$1 }') - echo LOG: ${repRID} input bag md5 sum - \${md5} >> ${repRID}.uploadInputBag.log - size=\$(wc -c < ./\${file}) - echo LOG: ${repRID} input bag size - \${size} bytes >> ${repRID}.uploadInputBag.log - - exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Input_Bag/File_MD5=\${md5}) - if [ "\${exist}" == "[]" ] - then - cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') - cookie=\${cookie:11:-1} - - loc=\$(deriva-hatrac-cli --host ${source} put ./\${file} /hatrac/resources/rnaseq/pipeline/input_bag/study/${studyRID}/replicate/${repRID}/\${file} --parents) - inputBag_rid=\$(python3 ${script_uploadInputBag} -f \${file} -l \${loc} -s \${md5} -b \${size} -o ${source} -c \${cookie}) - echo LOG: input bag RID uploaded - \${inputBag_rid} >> ${repRID}.uploadInputBag.log - rid=\${inputBag_rid} - else - exist=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') - exist=\${exist:7:-6} - echo LOG: input bag RID already exists - \${exist} >> ${repRID}.uploadInputBag.log - rid=\${exist} - fi + """ + hostname > ${repRID}.uploadInputBag.log + ulimit -a >> ${repRID}.uploadInputBag.log - echo "\${rid}" > inputBagRID.csv - """ + yr=\$(date +'%Y') + mn=\$(date +'%m') + dy=\$(date +'%d') + + file=\$(basename -a ${inputBag}) + md5=\$(md5sum ./\${file} | awk '{ print \$1 }') + echo LOG: ${repRID} input bag md5 sum - \${md5} >> ${repRID}.uploadInputBag.log + size=\$(wc -c < ./\${file}) + echo LOG: ${repRID} input bag size - \${size} bytes >> ${repRID}.uploadInputBag.log + + exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Input_Bag/File_MD5=\${md5}) + if [ "\${exist}" == "[]" ] + then + cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') + cookie=\${cookie:11:-1} + + loc=\$(deriva-hatrac-cli --host ${source} put ./\${file} /hatrac/resources/rnaseq/pipeline/input_bag/study/${studyRID}/replicate/${repRID}/\${file} --parents) + inputBag_rid=\$(python3 ${script_uploadInputBag} -f \${file} -l \${loc} -s \${md5} -b \${size} -o ${source} -c \${cookie}) + echo LOG: input bag RID uploaded - \${inputBag_rid} >> ${repRID}.uploadInputBag.log + rid=\${inputBag_rid} + else + exist=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + exist=\${exist:7:-6} + echo LOG: input bag RID already exists - \${exist} >> ${repRID}.uploadInputBag.log + rid=\${exist} + fi + + echo "\${rid}" > inputBagRID.csv + """ } // Extract input bag RID into channel @@ -1303,9 +1382,7 @@ inputBagRID_fl.splitCsv(sep: ",", header: false).separate( inputBagRID.into { inputBagRID_uploadExecutionRun inputBagRID_finalizeExecutionRun - inputBagRID_failPreExecutionRun_fastq - inputBagRID_failPreExecutionRun_fastqFile - inputBagRID_failPreExecutionRun_species + inputBagRID_failPreExecutionRun inputBagRID_failExecutionRun } @@ -1337,59 +1414,62 @@ process uploadExecutionRun { speciesError_uploadExecutionRun == "false" script: - """ - hostname > ${repRID}.uploadExecutionRun.log - ulimit -a >> 
${repRID}.uploadExecutionRun.log - - echo LOG: searching for workflow RID - BICF mRNA ${workflow.manifest.version} >> ${repRID}.uploadExecutionRun.log - workflow=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Workflow/Name=BICF%20mRNA%20Replicate/Version=${workflow.manifest.version}) - workflow=\$(echo \${workflow} | grep -o '\\"RID\\":\\".*\\",\\"RCT') - workflow=\${workflow:7:-6} - echo LOG: workflow RID extracted - \${workflow} >> ${repRID}.uploadExecutionRun.log - - if [ "${species}" == "Homo sapiens" ] - then - genomeName=\$(echo GRCh${refHuVersion}) - elif [ "${species}" == "Mus musculus" ] - then - genomeName=\$(echo GRCm${refMoVersion}) - fi - if [ "${spike}" == "yes" ] - then - genomeName=\$(echo \${genomeName}-S) - fi - echo LOG: searching for genome name - \${genomeName} >> ${repRID}.uploadExecutionRun.log - genome=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Name=\${genomeName}) - genome=\$(echo \${genome} | grep -o '\\"RID\\":\\".*\\",\\"RCT') - genome=\${genome:7:-6} - echo LOG: genome RID extracted - \${genome} >> ${repRID}.uploadExecutionRun.log - - cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') - cookie=\${cookie:11:-1} - - exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=\${workflow}/Replicate=${repRID}/Input_Bag=${inputBagRID}) - echo \${exist} >> ${repRID}.uploadExecutionRun.log - if [ "\${exist}" == "[]" ] - then - executionRun_rid=\$(python3 ${script_uploadExecutionRun_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s In-progress -d 'Run in process' -o ${source} -c \${cookie} -u F) - echo LOG: execution run RID uploaded - \${executionRun_rid} >> ${repRID}.uploadExecutionRun.log - else - rid=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') - rid=\${rid:7:-6} - echo \${rid} >> ${repRID}.uploadExecutionRun.log - executionRun_rid=\$(python3 ${script_uploadExecutionRun_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s In-progress -d 'Run in process' -o ${source} -c \${cookie} -u \${rid}) - echo LOG: execution run RID updated - \${executionRun_rid} >> ${repRID}.uploadExecutionRun.log - fi - - echo "\${executionRun_rid}" > executionRunRID.csv - - curl -H 'Content-Type: application/json' -X PUT -d \ - '{ \ - "ID": "${workflow.sessionId}", \ - "ExecutionRunRID": "'\${executionRun_rid}'" \ - }' \ - "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track" - """ + """ + hostname > ${repRID}.uploadExecutionRun.log + ulimit -a >> ${repRID}.uploadExecutionRun.log + + echo LOG: searching for workflow RID - BICF mRNA ${workflow.manifest.version} >> ${repRID}.uploadExecutionRun.log + workflow=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Workflow/Name=BICF%20mRNA%20Replicate/Version=${workflow.manifest.version}) + workflow=\$(echo \${workflow} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + workflow=\${workflow:7:-6} + echo LOG: workflow RID extracted - \${workflow} >> ${repRID}.uploadExecutionRun.log + + if [ "${species}" == "Homo sapiens" ] + then + genomeName=\$(echo GRCh${refHuVersion}) + elif [ "${species}" == "Mus musculus" ] + then + genomeName=\$(echo GRCm${refMoVersion}) + fi + if [ "${spike}" == "true" ] + then + genomeName=\$(echo \${genomeName}-S) + fi + echo LOG: searching for genome name - \${genomeName} >> ${repRID}.uploadExecutionRun.log + genome=\$(curl -s 
https://${source}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Name=\${genomeName}) + genome=\$(echo \${genome} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + genome=\${genome:7:-6} + echo LOG: genome RID extracted - \${genome} >> ${repRID}.uploadExecutionRun.log + + cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') + cookie=\${cookie:11:-1} + + exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=\${workflow}/Replicate=${repRID}/Input_Bag=${inputBagRID}) + echo \${exist} >> ${repRID}.uploadExecutionRun.log + if [ "\${exist}" == "[]" ] + then + executionRun_rid=\$(python3 ${script_uploadExecutionRun_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s In-progress -d 'Run in process' -o ${source} -c \${cookie} -u F) + echo LOG: execution run RID uploaded - \${executionRun_rid} >> ${repRID}.uploadExecutionRun.log + else + rid=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + rid=\${rid:7:-6} + echo \${rid} >> ${repRID}.uploadExecutionRun.log + executionRun_rid=\$(python3 ${script_uploadExecutionRun_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s In-progress -d 'Run in process' -o ${source} -c \${cookie} -u \${rid}) + echo LOG: execution run RID updated - \${executionRun_rid} >> ${repRID}.uploadExecutionRun.log + fi + + echo "\${executionRun_rid}" > executionRunRID.csv + + if [ ${params.track} == true ] + then + curl -H 'Content-Type: application/json' -X PUT -d \ + '{ \ + "ID": "${workflow.sessionId}", \ + "ExecutionRunRID": "'\${executionRun_rid}'" \ + }' \ + "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track" + fi + """ } // Extract execution run RID into channel @@ -1405,6 +1485,7 @@ executionRunRID.into { executionRunRID_uploadOutputBag executionRunRID_finalizeExecutionRun executionRunRID_failExecutionRun + executionRunRID_fail } /* @@ -1448,24 +1529,24 @@ process getRef { # set the reference name if [ "${species}" == "Mus musculus" ] then - references=\$(echo ${referenceBase}/GRCm${refMoVersion}) + reference=\$(echo ${referenceBase}/GRCm${refMoVersion}) refName=GRCm elif [ '${species}' == "Homo sapiens" ] then - references=\$(echo ${referenceBase}/GRCh${refHuVersion}) + reference=\$(echo ${referenceBase}/GRCh${refHuVersion}) refName=GRCh else echo -e "LOG: ERROR - References could not be set!\nSpecies reference found: ${species}" >> ${repRID}.getRef.log exit 1 fi - if [ "${spike}" == "yes" ] + if [ "${spike}" == "true" ] then - references=\$(echo \${reference}-S) - elif [ "${spike}" == "no" ] + reference=\$(echo \${reference}-S) + elif [ "${spike}" == "false" ] then - reference=\$(echo \${references}) + reference=\$(echo \${reference}) fi - echo -e "LOG: species set to \${references}" >> ${repRID}.getRef.log + echo -e "LOG: species set to \${reference}" >> ${repRID}.getRef.log # retreive appropriate reference appropriate location echo -e "LOG: fetching ${species} reference files from ${referenceBase}" >> ${repRID}.getRef.log @@ -1477,9 +1558,9 @@ process getRef { elif [ arams.refSource == "datahub" ] then echo -e "LOG: grabbing reference files from datahub" >> ${repRID}.getRef.log - GRCv=\$(echo \${references} | grep -o \${refName}.* | cut -d '.' -f1) - GRCp=\$(echo \${references} | grep -o \${refName}.* | cut -d '.' -f2) - GENCODE=\$(echo \${references} | grep -o \${refName}.* | cut -d '.' -f3) + GRCv=\$(echo \${reference} | grep -o \${refName}.* | cut -d '.' 
-f1) + GRCp=\$(echo \${reference} | grep -o \${refName}.* | cut -d '.' -f2) + GENCODE=\$(echo \${reference} | grep -o \${refName}.* | cut -d '.' -f3) query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='\${GRCv}'.'\${GRCp}'/Annotation_Version=GENCODE%20'\${GENCODE}) curl --request GET \${query} > refQuery.json refURL=\$(python ${script_refData} --returnParam URL) @@ -1913,7 +1994,7 @@ process aggrQC { ulimit -a >> ${repRID}.aggrQC.log # make run table - if [ "${params.inputBagForce}" == "" ] && [ "${params.fastqsForce}" == "" ] && [ "${params.speciesForce}" == "" ] + if [ "${params.inputBagForce}" == "" ] && [ "${params.fastqsForce}" == "" ] && [ "${params.speciesForce}" == "" ] && [ "${params.strandedForce}" == "" ] && [ "${params.spikeForce}" == "" ] then input="default" else @@ -1930,6 +2011,14 @@ process aggrQC { then input=\$(echo \${input} species) fi + if [ "${params.strandedForce}" != "" ] + then + input=\$(echo \${input} stranded) + fi + if [ "${params.spikeForce}" != "" ] + then + input=\$(echo \${input} spike) + fi fi echo -e "LOG: creating run table" >> ${repRID}.aggrQC.log echo -e "Session\tSession ID\tStart Time\tPipeline Version\tInput" > run.tsv @@ -1947,10 +2036,24 @@ process aggrQC { echo -e "Submitter\t${speciesM}\t${endsM}\t${strandedM}\t${spikeM}\t-\t-\t'${readLengthM}'\t-" >> metadata.tsv if [ "${params.speciesForce}" == "" ] then - echo -e "Inferred\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv + input=\$(echo "Inferred\\t${speciesI}\\t") + else + input=\$(echo "Inferred\\t${speciesI} (FORCED)\\t") + fi + input=\$(echo \${input}"${endsI}\\t") + if [ "${params.strandedForce}" == "" ] + then + input=\$(echo \${input}"${strandedI}\\t") else - echo -e "Inferred\t${speciesI} (FORCED)\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv + input=\$(echo \${input}"${strandedI} (FORCED)\\t") fi + if [ "${params.spikeForce}" == "" ] + then + input=\$(echo \${input}"${spikeI}\\t-\\t-\\t-\\t-") + else + input=\$(echo \${input}"${spikeI} (FORCED)\\t-\\t-\\t-\\t-") + fi + echo -e \${input} >> metadata.tsv echo -e "Measured\t-\t${endsManual}\t-\t-\t'${rawReadsI}'\t'${assignedReadsI}'\t'${readLengthI}'\t'${tinMedI}'" >> metadata.tsv # make reference table @@ -1971,10 +2074,13 @@ process aggrQC { multiqc -c ${multiqcConfig} . 
-n ${repRID}.multiqc.html cp ${repRID}.multiqc_data/multiqc_data.json ${repRID}.multiqc_data.json - curl -H 'Content-Type: application/json' -X PUT -d \ - @./${repRID}.multiqc_data.json \ - "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/qc" - """ + if [ ${params.track} == true ] + then + curl -H 'Content-Type: application/json' -X PUT -d \ + @./${repRID}.multiqc_data.json \ + "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/qc" + fi + """ } /* @@ -2012,48 +2118,42 @@ process uploadQC { pipelineError_uploadQC == 'false' script: - """ - hostname > ${repRID}.uploadQC.log - ulimit -a >> ${repRID}.uploadQC.log - - if [ "${ends}" == "pe" ] - then - end="Paired End" - elif [ "${ends}" == "se" ] - then - end="Single Read" - fi - - cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') - cookie=\${cookie:11:-1} - - exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:mRNA_QC/Replicate=${repRID}) - if [ "\${exist}" != "[]" ] - then - rids=\$(echo \${exist} | grep -o '\\"RID\\":\\".\\{7\\}' | sed 's/^.\\{7\\}//') - for rid in \${rids} - do - python3 ${script_deleteEntry_uploadQC} -r \${rid} -t mRNA_QC -o ${source} -c \${cookie} - echo LOG: old mRNA QC RID deleted - \${rid} >> ${repRID}.uploadQC.log - done - echo LOG: all old mRNA QC RIDs deleted >> ${repRID}.uploadQC.log - fi - - qc_rid=\$(python3 ${script_uploadQC} -r ${repRID} -e ${executionRunRID} -p "\${end}" -s ${stranded} -l ${length} -w ${rawCount} -f ${finalCount} -t ${tinMed} -o ${source} -c \${cookie} -u F) - echo LOG: mRNA QC RID uploaded - \${qc_rid} >> ${repRID}.uploadQC.log - - echo "\${qc_rid}" > qcRID.csv - """ -} + """ + hostname > ${repRID}.uploadQC.log + ulimit -a >> ${repRID}.uploadQC.log -// Extract mRNA qc RID into channel -qcRID = Channel.create() -qcRID_fl.splitCsv(sep: ",", header: false).separate( - qcRID -) + if [ "${ends}" == "pe" ] + then + end="Paired End" + elif [ "${ends}" == "se" ] + then + end="Single End" + fi + + cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') + cookie=\${cookie:11:-1} + + exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:mRNA_QC/Replicate=${repRID}) + if [ "\${exist}" != "[]" ] + then + rids=\$(echo \${exist} | grep -o '\\"RID\\":\\".\\{7\\}' | sed 's/^.\\{7\\}//') + for rid in \${rids} + do + python3 ${script_deleteEntry_uploadQC} -r \${rid} -t mRNA_QC -o ${source} -c \${cookie} + echo LOG: old mRNA QC RID deleted - \${rid} >> ${repRID}.uploadQC.log + done + echo LOG: all old mRNA QC RIDs deleted >> ${repRID}.uploadQC.log + fi + + qc_rid=\$(python3 ${script_uploadQC} -r ${repRID} -e ${executionRunRID} -p "\${end}" -s ${stranded} -l ${length} -w ${rawCount} -f ${finalCount} -t ${tinMed} -o ${source} -c \${cookie} -u F) + echo LOG: mRNA QC RID uploaded - \${qc_rid} >> ${repRID}.uploadQC.log + + echo "\${qc_rid}" > qcRID.csv + """ +} /* - *ouputBag: create ouputBag + *uploadProcessedFile: uploads the processed files */ process uploadProcessedFile { tag "${repRID}" @@ -2090,68 +2190,68 @@ process uploadProcessedFile { pipelineError_uploadProcessedFile == 'false' script: - """ - hostname > ${repRID}.outputBag.log - ulimit -a >> ${repRID}.outputBag.log - - mkdir -p ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ - cp ${bam} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ - cp ${bai} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ - cp ${bigwig} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ - cp ${counts} 
./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ - - cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') - cookie=\${cookie:11:-1} - - exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Processed_File/Replicate=${repRID}) - if [ "\${exist}" != "[]" ] - then - rids=\$(echo \${exist} | grep -o '\\"RID\\":\\".\\{7\\}' | sed 's/^.\\{7\\}//') - for rid in \${rids} - do - python3 ${script_deleteEntry_uploadProcessedFile} -r \${rid} -t Processed_File -o ${source} -c \${cookie} - done - echo LOG: all old processed file RIDs deleted >> ${repRID}.uploadQC.log - fi - - deriva-upload-cli --catalog 2 --token \${cookie:9} ${source} ./deriva - echo LOG: processed files uploaded >> ${repRID}.outputBag.log - - deriva-download-cli --catalog 2 --token \${cookie:9} ${source} ${executionRunExportConfig} . rid=${executionRunRID} - echo LOG: execution run bag downloaded >> ${repRID}.outputBag.log - - echo -e "### Run Details" >> runDetails.md - echo -e "**Workflow URL:** https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq" >> runDetails.md - echo -e "**Workflow Version:** ${workflow.manifest.version}" >> runDetails.md - echo -e "**Description:** ${workflow.manifest.description}" >> runDetails.md - if [ "${species}" == "Mus musculus" ]; then - genome=\$(echo GRCm${refMoVersion} | cut -d '.' -f1) - patch=\$(echo ${refMoVersion} | cut -d '.' -f2) - annotation=\$(echo ${refMoVersion} | cut -d '.' -f3 | tr -d 'v') - elif [ "${species}" == "Homo sapiens" ]; then - genome=\$(echo GRCh${refHuVersion} | cut -d '.' -f1) - patch=\$(echo ${refHuVersion} | cut -d '.' -f2) - annotation=\$(echo ${refHuVersion} | cut -d '.' -f3 | tr -d 'v') - fi - echo -e "**Genome Assembly Version:** \${genome} patch \${patch}" >> runDetails.md - echo -e "**Annotation Version:** GENCODE release \${annotation}" >> runDetails.md - echo -e "**Run ID:** ${repRID}" >> runDetails.md - echo LOG: runDetails.md created >> ${repRID}.outputBag.log - - unzip Execution_Run_${executionRunRID}.zip - yr=\$(date +'%Y') - mn=\$(date +'%m') - dy=\$(date +'%d') - mv Execution_Run_${executionRunRID} ${repRID}_Output_Bag_\${yr}\${mn}\${dy} - loc=./${repRID}_Output_Bag/data/assets/Study/${studyRID}/Experiment/${expRID}/Replicate/${repRID}/Execution_Run/${executionRunRID}/Output_Files/ - mkdir -p \${loc} - cp runDetails.md \${loc} - cp ${multiqc} \${loc} - cp ${multiqcJSON} \${loc} - - bdbag ./${repRID}_Output_Bag/ --update --archiver zip --debug - echo LOG: output bag created >> ${repRID}.outputBag.log - """ + """ + hostname > ${repRID}.outputBag.log + ulimit -a >> ${repRID}.outputBag.log + + mkdir -p ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ + cp ${bam} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ + cp ${bai} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ + cp ${bigwig} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ + cp ${counts} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ + + cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') + cookie=\${cookie:11:-1} + + exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Processed_File/Replicate=${repRID}) + if [ "\${exist}" != "[]" ] + then + rids=\$(echo \${exist} | grep -o '\\"RID\\":\\".\\{7\\}' | sed 's/^.\\{7\\}//') + for rid in \${rids} + do + python3 ${script_deleteEntry_uploadProcessedFile} -r \${rid} -t Processed_File -o ${source} -c \${cookie} + done + echo LOG: all old processed file RIDs deleted >> ${repRID}.outputBag.log + fi + + 

 /*
@@ -2184,39 +2284,46 @@ process uploadOutputBag {
     pipelineError_uploadOutputBag == 'false'

   script:
-    """
-    hostname > ${repRID}.uploadOutputBag.log
-    ulimit -a >> ${repRID}.uploadOutputBag.log
-
-    yr=\$(date +'%Y')
-    mn=\$(date +'%m')
-    dy=\$(date +'%d')
-
-    file=\$(basename -a ${outputBag})
-    md5=\$(md5sum ./\${file} | awk '{ print \$1 }')
-    echo LOG: ${repRID} output bag md5 sum - \${md5} >> ${repRID}.uploadOutputBag.log
-    size=\$(wc -c < ./\${file})
-    echo LOG: ${repRID} output bag size - \${size} bytes >> ${repRID}.uploadOutputBag.log
+    """
+    hostname > ${repRID}.uploadOutputBag.log
+    ulimit -a >> ${repRID}.uploadOutputBag.log
+
+    yr=\$(date +'%Y')
+    mn=\$(date +'%m')
+    dy=\$(date +'%d')
+
+    file=\$(basename -a ${outputBag})
+    md5=\$(md5sum ./\${file} | awk '{ print \$1 }')
+    echo LOG: ${repRID} output bag md5 sum - \${md5} >> ${repRID}.uploadOutputBag.log
+    size=\$(wc -c < ./\${file})
+    echo LOG: ${repRID} output bag size - \${size} bytes >> ${repRID}.uploadOutputBag.log

-    exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Output_Bag/File_MD5=\${md5})
-    if [ "\${exist}" == "[]" ]
-    then
+    loc=\$(deriva-hatrac-cli --host ${source} put ./\${file} /hatrac/resources/rnaseq/pipeline/output_bag/study/${studyRID}/replicate/${repRID}/\${file} --parents)
+    echo LOG: output bag uploaded - \${loc} >> ${repRID}.uploadOutputBag.log
+    # url-ify the location
+    loc=\${loc//\\//%2F}
+    loc=\${loc//:/%3A}
+    loc=\${loc// /%20}
+
     cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"')
-      cookie=\${cookie:11:-1}
+    cookie=\${cookie:11:-1}

-      loc=\$(deriva-hatrac-cli --host ${source} put ./\${file} /hatrac/resources/rnaseq/pipeline/output_bag/study/${studyRID}/replicate/${repRID}/\${file} --parents)
-      outputBag_rid=\$(python3 ${script_uploadOutputBag} -e ${executionRunRID} -f \${file} -l \${loc} -s \${md5} -b \${size} -o ${source} -c \${cookie})
+    exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Output_Bag/File_URL=\${loc})
+    if [ "\${exist}" == "[]" ]
+    then
+      outputBag_rid=\$(python3 ${script_uploadOutputBag} -e ${executionRunRID} -f \${file} -l \${loc} -s \${md5} -b \${size} -o ${source} -c \${cookie} -u F)
       echo LOG: output bag RID uploaded - \${outputBag_rid} >> ${repRID}.uploadOutputBag.log
       rid=\${outputBag_rid}
-    else
+    else
       exist=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
       exist=\${exist:8:-6}
+      outputBag_rid=\$(python3 ${script_uploadOutputBag} -e ${executionRunRID} -o ${source} -c \${cookie} -u \${exist})
       echo LOG: output bag RID already exists - \${exist} >> ${repRID}.uploadOutputBag.log
       rid=\${exist}
-    fi
+    fi

-    echo "\${rid}" > outputBagRID.csv
-    """
+    echo "\${rid}" > outputBagRID.csv
+    """
 }
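The `# url-ify the location` expansions percent-encode the hatrac path by hand before it is stored in File_URL and reused as an ermrest filter. `urllib.parse.quote` covers the same three characters (and everything else) in one call; the path below is illustrative:

```python
#!/usr/bin/env python3
# One-call equivalent of the hand-rolled /, :, and space encoding above;
# the hatrac path is an illustrative example.
from urllib.parse import quote

loc = "/hatrac/resources/rnaseq/pipeline/output_bag/study/S-0001/replicate/16-1ZX4/bag.zip"
print(quote(loc, safe=""))  # %2Fhatrac%2Fresources%2Frnaseq%2F...
```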

 // Extract output bag RID into channel
@@ -2242,277 +2349,145 @@ process finalizeExecutionRun {
     upload

   script:
-    """
-    hostname > ${repRID}.finalizeExecutionRun.log
-    ulimit -a >> ${repRID}.finalizeExecutionRun.log
-
-    executionRun=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/RID=${executionRunRID})
-    workflow=\$(echo \${executionRun} | grep -o '\\"Workflow\\":.*\\"Reference' | grep -oP '(?<=\\"Workflow\\":\\").*(?=\\",\\"Reference)')
-    genome=\$(echo \${executionRun} | grep -o '\\"Reference_Genome\\":.*\\"Input_Bag' | grep -oP '(?<=\\"Reference_Genome\\":\\").*(?=\\",\\"Input_Bag)')
-
-    cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"')
-    cookie=\${cookie:11:-1}
-
-    rid=\$(python3 ${script_uploadExecutionRun_finalizeExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Success -d 'Run Successful' -o ${source} -c \${cookie} -u ${executionRunRID})
-    echo LOG: execution run RID marked as successful - \${rid} >> ${repRID}.finalizeExecutionRun.log
-
-    dt=`date +%FT%T.%3N%:z`
-    curl -H 'Content-Type: application/json' -X PUT -d \
-      '{ \
-        "ID": "${workflow.sessionId}", \
-        "Complete": "'\${dt}'" \
-      }' \
-      "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track"
-    """
-}

-/*
- * failPreExecutionRun_fastq: fail the execution run prematurely for fastq errors
-*/
-process failPreExecutionRun_fastq {
-  tag "${repRID}"
-
-  input:
-    path script_uploadExecutionRun from script_uploadExecutionRun_failPreExecutionRun_fastq
-    path credential, stageAs: "credential.json" from deriva_failPreExecutionRun_fastq
-    val spike from spikeMeta_failPreExecutionRun_fastq
-    val species from speciesMeta_failPreExecutionRun_fastq
-    val inputBagRID from inputBagRID_failPreExecutionRun_fastq
-    val fastqCountError from fastqCountError_failPreExecutionRun_fastq
-    val fastqCountError_details
-    val fastqReadError from fastqReadError_failPreExecutionRun_fastq
-    val fastqReadError_details
-
-  when:
-    upload
-    fastqCountError == 'true' || fastqReadError == 'true'
-
-  script:
-    """
-    hostname > ${repRID}.failPreExecutionRun_fastq.log
-    ulimit -a >> ${repRID}.failPreExecutionRun_fastq.log
-
-    errorDetails=""
-    if [ ${fastqCountError} == true ]
-    then
-      errorDetails=\$(echo ${fastqCountError_details}"\\n")
-    elif [ ${fastqReadError} == true ]
-    then
-      errorDetails=\$(echo \$(errorDetails)${fastqReadError_details}"\\n")
-    fi
-
-    echo LOG: searching for workflow RID - BICF mRNA ${workflow.manifest.version} >> ${repRID}.failPreExecutionRun_fastq.log
-    workflow=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Workflow/Name=BICF%20mRNA%20Replicate/Version=${workflow.manifest.version})
-    workflow=\$(echo \${workflow} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
-    workflow=\${workflow:7:-6}
-    echo LOG: workflow RID extracted - \${workflow} >> ${repRID}.failPreExecutionRun_fastq.log
-
-    if [ "${species}" == "Homo sapiens" ]
-    then
-      genomeName=\$(echo GRCh${refHuVersion})
-    elif [ "${species}" == "Mus musculus" ]
-    then
-      genomeName=\$(echo GRCm${refMoVersion})
-    fi
-    if [ "${spike}" == "yes" ]
-    then
-      genomeName=\$(echo \${genomeName}-S)
-    fi
-    echo LOG: searching for genome name - \${genomeName} >> ${repRID}.failPreExecutionRun_fastq.log
-    genome=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Name=\${genomeName})
-    genome=\$(echo \${genome} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
-    genome=\${genome:7:-6}
-    echo LOG: genome RID extracted - \${genome} >> ${repRID}.failPreExecutionRun_fastq.log
-
-    cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"')
-    cookie=\${cookie:11:-1}
-
-    exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=\${workflow}/Replicate=${repRID}/Input_Bag=${inputBagRID})
-    echo \${exist} >> ${repRID}.failPreExecutionRun_fastq.log
-    if [ "\${exist}" == "[]" ]
-    then
-      rid=\$(python3 ${script_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d "\${errorDetails}" -o ${source} -c \${cookie} -u F)
-      echo LOG: execution run RID uploaded - \${rid} >> ${repRID}.failPreExecutionRun_fastq.log
-    else
-      rid=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
-      rid=\${rid:7:-6}
-      echo \${rid} >> ${repRID}.failPreExecutionRun_fastq.log
-      executionRun_rid==\$(python3 ${script_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d "\${errorDetails}" -o ${source} -c \${cookie} -u \${rid})
-      echo LOG: execution run RID updated - \${executionRun_rid} >> ${repRID}.failPreExecutionRun_fastq.log
-    fi
-
-    dt=`date +%FT%T.%3N%:z`
-    curl -H 'Content-Type: application/json' -X PUT -d \
-      '{ \
-        "ID": "${workflow.sessionId}", \
-        "ExecutionRunRID": "'\${rid}'", \
-        "Failure": "'\${dt}'" \
-      }' \
-      "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track"
-    """
+    """
+    hostname > ${repRID}.finalizeExecutionRun.log
+    ulimit -a >> ${repRID}.finalizeExecutionRun.log
+
+    executionRun=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/RID=${executionRunRID})
+    workflow=\$(echo \${executionRun} | grep -o '\\"Workflow\\":.*\\"Reference' | grep -oP '(?<=\\"Workflow\\":\\").*(?=\\",\\"Reference)')
+    genome=\$(echo \${executionRun} | grep -o '\\"Reference_Genome\\":.*\\"Input_Bag' | grep -oP '(?<=\\"Reference_Genome\\":\\").*(?=\\",\\"Input_Bag)')
+
+    cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"')
+    cookie=\${cookie:11:-1}
+
+    rid=\$(python3 ${script_uploadExecutionRun_finalizeExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Success -d 'Run Successful' -o ${source} -c \${cookie} -u ${executionRunRID})
+    echo LOG: execution run RID marked as successful - \${rid} >> ${repRID}.finalizeExecutionRun.log
+
+    if [ ${params.track} == true ]
+    then
+      dt=`date +%FT%T.%3N%:z`
+      curl -H 'Content-Type: application/json' -X PUT -d \
+        '{ \
+          "ID": "${workflow.sessionId}", \
+          "Complete": "'\${dt}'" \
+        }' \
+        "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track"
+    fi
+    """
+}
+
+// Combine errors
+error_meta = fastqCountError_uploadQC_fail.ifEmpty(false).combine(fastqReadError_uploadQC_fail.ifEmpty(false).combine(fastqFileError_uploadQC_fail.ifEmpty(false).combine(speciesError_uploadQC_fail.ifEmpty(false).combine(pipelineError_uploadQC_fail.ifEmpty(false)))))
+error_meta.into {
+  error_failPreExecutionRun
+  error_uploadQC_fail
+}
+errorDetails = fastqCountError_details.ifEmpty("").combine(fastqReadError_details.ifEmpty("").combine(fastqFileError_details.ifEmpty("").combine(speciesError_details.ifEmpty(""))))

 /*
- * failPreExecutionRun_fastqFile: fail the execution run prematurely for fastqFile errors
+ * failPreExecutionRun_fastq: fail the execution run prematurely for fastq errors
 */
-process failPreExecutionRun_fastqFile {
+process failPreExecutionRun {
   tag "${repRID}"

   input:
-    path script_uploadExecutionRun from script_uploadExecutionRun_failPreExecutionRun_fastqFile
-    path credential, stageAs: "credential.json" from deriva_failPreExecutionRun_fastqFile
-    val spike from spikeMeta_failPreExecutionRun_fastqFile
-    val species from speciesMeta_failPreExecutionRun_fastqFile
-    val inputBagRID from inputBagRID_failPreExecutionRun_fastqFile
-    val fastqFileError from fastqFileError_failPreExecutionRun_fastqFile
-    val fastqFileError_details
+    path script_uploadExecutionRun from script_uploadExecutionRun_failPreExecutionRun
+    path credential, stageAs: "credential.json" from deriva_failPreExecutionRun
+    val spike from spikeMeta_failPreExecutionRun
+    val species from speciesMeta_failPreExecutionRun
+    val inputBagRID from inputBagRID_failPreExecutionRun
+    tuple val (fastqCountError), val (fastqReadError), val (fastqFileError), val (speciesError), val (pipelineError) from error_failPreExecutionRun
+    tuple val (fastqCountError_details), val (fastqReadError_details), val (fastqFileError_details), val (speciesError_details) from errorDetails
+
+  output:
+    path ("executionRunRID.csv") into executionRunRID_preFail_fl

   when:
     upload
-    fastqFileError == 'true'
+    fastqCountError == 'true' || fastqReadError == 'true' || fastqFileError == 'true' || speciesError == 'true'

   script:
-    """
-    hostname > ${repRID}.failPreExecutionRun_fastqfile.log
-    ulimit -a >> ${repRID}.failPreExecutionRun_fastqfile.log
-
-    errorDetails=""
-    if [ ${fastqFileError} == true ]
-    then
-      errorDetails=\$(echo \$(errorDetails)${fastqFileError_details}"\\n")
-    fi
-
-    echo LOG: searching for workflow RID - BICF mRNA ${workflow.manifest.version} >> ${repRID}.failPreExecutionRun_fastqfile.log
-    workflow=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Workflow/Name=BICF%20mRNA%20Replicate/Version=${workflow.manifest.version})
-    workflow=\$(echo \${workflow} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
-    workflow=\${workflow:7:-6}
-    echo LOG: workflow RID extracted - \${workflow} >> ${repRID}.failPreExecutionRun_fastqfile.log
-
-    if [ "${species}" == "Homo sapiens" ]
-    then
-      genomeName=\$(echo GRCh${refHuVersion})
-    elif [ "${species}" == "Mus musculus" ]
-    then
-      genomeName=\$(echo GRCm${refMoVersion})
-    fi
-    if [ "${spike}" == "yes" ]
-    then
-      genomeName=\$(echo \${genomeName}-S)
-    fi
-    echo LOG: searching for genome name - \${genomeName} >> ${repRID}.failPreExecutionRun_fastqfile.log
-    genome=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Name=\${genomeName})
-    genome=\$(echo \${genome} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
-    genome=\${genome:7:-6}
-    echo LOG: genome RID extracted - \${genome} >> ${repRID}.failPreExecutionRun_fastqfile.log
-
-    cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"')
-    cookie=\${cookie:11:-1}
-
-    exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=\${workflow}/Replicate=${repRID}/Input_Bag=${inputBagRID})
-    echo \${exist} >> ${repRID}.failPreExecutionRun_fastqfile.log
-    if [ "\${exist}" == "[]" ]
-    then
-      rid=\$(python3 ${script_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d "\${errorDetails}" -o ${source} -c \${cookie} -u F)
-      echo LOG: execution run RID uploaded - \${rid} >> ${repRID}.failPreExecutionRun_fastqfile.log
-    else
-      rid=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
-      rid=\${rid:7:-6}
-      echo \${rid} >> ${repRID}.failPreExecutionRun_fastqfile.log
-      executionRun_rid==\$(python3 ${script_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d "\${errorDetails}" -o ${source} -c \${cookie} -u \${rid})
-      echo LOG: execution run RID updated - \${executionRun_rid} >> ${repRID}.failPreExecutionRun_fastqfile.log
-    fi
-
-    dt=`date +%FT%T.%3N%:z`
-    curl -H 'Content-Type: application/json' -X PUT -d \
-      '{ \
-        "ID": "${workflow.sessionId}", \
-        "ExecutionRunRID": "'\${rid}'", \
-        "Failure": "'\${dt}'" \
-      }' \
-      "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track"
-    """
-}

-/*
- * failPreExecutionRun_species: fail the execution run prematurely for species error
-*/
-process failPreExecutionRun_species {
-  tag "${repRID}"
-
-  input:
-    path script_uploadExecutionRun from script_uploadExecutionRun_failPreExecutionRun_species
-    path credential, stageAs: "credential.json" from deriva_failPreExecutionRun_species
-    val spike from spikeMeta_failPreExecutionRun_species
-    val species from speciesMeta_failPreExecutionRun_species
-    val inputBagRID from inputBagRID_failPreExecutionRun_species
-    val speciesError from speciesError_failPreExecutionRun_species
-    val speciesError_details
-
-  when:
-    upload
-    speciesError == 'true'
-
-  script:
-    """
-    hostname > ${repRID}.failPreExecutionRun_species.log
-    ulimit -a >> ${repRID}.failPreExecutionRun_species.log
-
-    errorDetails=""
-    if [ ${speciesError} == true ]
-    then
-      errorDetails=\$(echo \$(errorDetails)${speciesError_details}"\\n")
-    fi
-
-    echo LOG: searching for workflow RID - BICF mRNA ${workflow.manifest.version} >> ${repRID}.failPreExecutionRun_species.log
-    workflow=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Workflow/Name=BICF%20mRNA%20Replicate/Version=${workflow.manifest.version})
-    workflow=\$(echo \${workflow} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
-    workflow=\${workflow:7:-6}
-    echo LOG: workflow RID extracted - \${workflow} >> ${repRID}.failPreExecutionRun_species.log
-
-    if [ "${species}" == "Homo sapiens" ]
-    then
-      genomeName=\$(echo GRCh${refHuVersion})
-    elif [ "${species}" == "Mus musculus" ]
-    then
-      genomeName=\$(echo GRCm${refMoVersion})
-    fi
-    if [ "${spike}" == "yes" ]
-    then
-      genomeName=\$(echo \${genomeName}-S)
-    fi
-    echo LOG: searching for genome name - \${genomeName} >> ${repRID}.failPreExecutionRun_species.log
-    genome=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Name=\${genomeName})
-    genome=\$(echo \${genome} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
-    genome=\${genome:7:-6}
-    echo LOG: genome RID extracted - \${genome} >> ${repRID}.failPreExecutionRun_species.log
-
-    cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"')
-    cookie=\${cookie:11:-1}
-
-    exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=\${workflow}/Replicate=${repRID}/Input_Bag=${inputBagRID})
-    echo \${exist} >> ${repRID}.failPreExecutionRun_species.log
-    if [ "\${exist}" == "[]" ]
-    then
-      rid=\$(python3 ${script_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d "\${errorDetails}" -o ${source} -c \${cookie} -u F)
-      echo LOG: execution run RID uploaded - \${rid} >> ${repRID}.failPreExecutionRun_species.log
-    else
-      rid=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
-      rid=\${rid:7:-6}
-      echo \${rid} >> ${repRID}.failPreExecutionRun_species.log
-      executionRun_rid==\$(python3 ${script_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d "\${errorDetails}" -o ${source} -c \${cookie} -u \${rid})
-      echo LOG: execution run RID updated - \${executionRun_rid} >> ${repRID}.failPreExecutionRun_species.log
-    fi
-
-    dt=`date +%FT%T.%3N%:z`
-    curl -H 'Content-Type: application/json' -X PUT -d \
-      '{ \
-        "ID": "${workflow.sessionId}", \
-        "ExecutionRunRID": "'\${rid}'", \
-        "Failure": "'\${dt}'" \
-      }' \
-      "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track"
+    """
+    hostname > ${repRID}.failPreExecutionRun.log
+    ulimit -a >> ${repRID}.failPreExecutionRun.log
+
+    errorDetails=""
+    if [ ${fastqCountError} == true ]
+    then
+      errorDetails=\$(echo ${fastqCountError_details}"\\n")
+    elif [ ${fastqReadError} == true ]
+    then
+      errorDetails=\$(echo \${errorDetails}${fastqReadError_details}"\\n")
+    elif [ ${fastqFileError} == true ]
+    then
+      errorDetails=\$(echo \${errorDetails}${fastqFileError_details}"\\n")
+    elif [ ${speciesError} == true ]
+    then
+      errorDetails=\$(echo \${errorDetails}${speciesError_details}"\\n")
+    fi
+
+    echo LOG: searching for workflow RID - BICF mRNA ${workflow.manifest.version} >> ${repRID}.failPreExecutionRun.log
+    workflow=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Workflow/Name=BICF%20mRNA%20Replicate/Version=${workflow.manifest.version})
+    workflow=\$(echo \${workflow} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
+    workflow=\${workflow:7:-6}
+    echo LOG: workflow RID extracted - \${workflow} >> ${repRID}.failPreExecutionRun.log
+
+    if [ "${species}" == "Homo sapiens" ]
+    then
+      genomeName=\$(echo GRCh${refHuVersion})
+    elif [ "${species}" == "Mus musculus" ]
+    then
+      genomeName=\$(echo GRCm${refMoVersion})
+    fi
+    if [ "${spike}" == "true" ]
+    then
+      genomeName=\$(echo \${genomeName}-S)
+    fi
+    echo LOG: searching for genome name - \${genomeName} >> ${repRID}.failPreExecutionRun.log
+    genome=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Name=\${genomeName})
+    genome=\$(echo \${genome} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
+    genome=\${genome:7:-6}
+    echo LOG: genome RID extracted - \${genome} >> ${repRID}.failPreExecutionRun.log
+
+    cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"')
+    cookie=\${cookie:11:-1}
+
+    exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=\${workflow}/Replicate=${repRID}/Input_Bag=${inputBagRID})
+    echo \${exist} >> ${repRID}.failPreExecutionRun.log
+    if [ "\${exist}" == "[]" ]
+    then
+      rid=\$(python3 ${script_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d "\${errorDetails}" -o ${source} -c \${cookie} -u F)
+      echo LOG: execution run RID uploaded - \${rid} >> ${repRID}.failPreExecutionRun.log
+    else
+      rid=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
+      rid=\${rid:7:-6}
+      echo \${rid} >> ${repRID}.failPreExecutionRun.log
+      executionRun_rid=\$(python3 ${script_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d "\${errorDetails}" -o ${source} -c \${cookie} -u \${rid})
+      echo LOG: execution run RID updated - \${executionRun_rid} >> ${repRID}.failPreExecutionRun.log
+    fi
+
+    echo "\${rid}" > executionRunRID.csv
+
+    if [ ${params.track} == true ]
+    then
+      dt=`date +%FT%T.%3N%:z`
+      curl -H 'Content-Type: application/json' -X PUT -d \
+        '{ \
+          "ID": "${workflow.sessionId}", \
+          "ExecutionRunRID": "'\${rid}'", \
+          "Failure": "'\${dt}'" \
+        }' \
+        "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track"
+    fi
     """
 }

+// Extract execution run RID into channel
+executionRunRID_preFail = Channel.create()
+executionRunRID_preFail_fl.splitCsv(sep: ",", header: false).separate(
+  executionRunRID_preFail
+)
+
+failExecutionRunRID = executionRunRID_fail.ifEmpty('').mix(executionRunRID_preFail.ifEmpty('')).filter { it != "" }
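failExecutionRunRID picks up whichever failure RID was actually emitted — the in-run failure or the pre-execution failure — padding each empty source with '' and filtering the placeholders back out. A rough Python analogue of that merge, with lists standing in for channels (the RID is illustrative):

```python
#!/usr/bin/env python3
# Rough analogue of ifEmpty('').mix(...).filter { it != "" }: lists
# stand in for channels, '' marks a channel that emitted nothing.
executionRunRID_fail = []               # no in-run failure emitted
executionRunRID_preFail = ["17-ABCD"]   # illustrative pre-execution failure RID

merged = (executionRunRID_fail or [""]) + (executionRunRID_preFail or [""])
failExecutionRunRID = [rid for rid in merged if rid != ""]
print(failExecutionRunRID)  # ['17-ABCD']
```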

 /*
 * failExecutionRun: fail the execution run
@@ -2545,65 +2520,112 @@ process failExecutionRun {
     pipelineError == 'true'

   script:
-    """
-    hostname > ${repRID}.failExecutionRun.log
-    ulimit -a >> ${repRID}.failExecutionRun.log
-
-    executionRun=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/RID=${executionRunRID})
-    workflow=\$(echo \${executionRun} | grep -o '\\"Workflow\\":.*\\"Reference' | grep -oP '(?<=\\"Workflow\\":\\").*(?=\\",\\"Reference)')
-    genome=\$(echo \${executionRun} | grep -o '\\"Reference_Genome\\":.*\\"Input_Bag' | grep -oP '(?<=\\"Reference_Genome\\":\\").*(?=\\",\\"Input_Bag)')
-
-    cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"')
-    cookie=\${cookie:11:-1}
-
-    errorDetails=""
-    if [ ${pipelineError} == false ]
-    then
-      rid=\$(python3 ${script_uploadExecutionRun_failExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Success -d 'Run Successful' -o ${source} -c \${cookie} -u ${executionRunRID})
-      echo LOG: execution run RID marked as successful - \${rid} >> ${repRID}.failExecutionRun.log
-    else
-      pipelineError_details=\$(echo "**Submitted metadata does not match inferred:**\\n")
-      pipelineError_details=\$(echo \${pipelineError_details}"|Metadata|Submitted value|Inferred value|\\n")
-      pipelineError_details=\$(echo \${pipelineError_details}"|:-:|-:|-:|\\n")
-      if ${pipelineError_ends}
-      then
-        if [ "${endsInfer}" == "se" ]
-        then
-          endInfer="Single End"
-        elif [ "${endsInfer}" == "pe" ]
-        then
-          endInfer="Paired End"
-        else
-          endInfer="unknown"
-        fi
-        pipelineError_details=\$(echo \${pipelineError_details}"|Paired End|${endsRaw}|"\${endInfer}"|\\n")
-      fi
-      if ${pipelineError_stranded}
-      then
-        pipelineError_details=\$(echo \${pipelineError_details}"|Strandedness|${strandedMeta}|${strandedInfer}|\\n")
-      fi
-      if ${pipelineError_spike}
-      then
-        pipelineError_details=\$(echo \${pipelineError_details}"|Used Spike Ins|${spikeMeta}|${spikeInfer}|\\n")
-      fi
-      if ${pipelineError_species}
-      then
-        pipelineError_details=\$(echo \${pipelineError_details}"|Species|${speciesMeta}|${speciesInfer}|\\n")
-      fi
-      pipelineError_details=\${pipelineError_details::-2}
-      rid=\$(python3 ${script_uploadExecutionRun_failExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d "\${pipelineError_details}" -o ${source} -c \${cookie} -u ${executionRunRID})
-      echo LOG: execution run RID marked as error - \${rid} >> ${repRID}.failExecutionRun.log
-    fi
-
-    dt=`date +%FT%T.%3N%:z`
-    curl -H 'Content-Type: application/json' -X PUT -d \
-      '{ \
-        "ID": "${workflow.sessionId}", \
-        "ExecutionRunRID": "'\${rid}'", \
-        "Failure": "'\${dt}'" \
-      }' \
-      "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track"
-    """
+    """
+    hostname > ${repRID}.failExecutionRun.log
+    ulimit -a >> ${repRID}.failExecutionRun.log
+
+    executionRun=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/RID=${executionRunRID})
+    workflow=\$(echo \${executionRun} | grep -o '\\"Workflow\\":.*\\"Reference' | grep -oP '(?<=\\"Workflow\\":\\").*(?=\\",\\"Reference)')
+    genome=\$(echo \${executionRun} | grep -o '\\"Reference_Genome\\":.*\\"Input_Bag' | grep -oP '(?<=\\"Reference_Genome\\":\\").*(?=\\",\\"Input_Bag)')
+
+    cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"')
+    cookie=\${cookie:11:-1}
+
+    errorDetails=""
+    if [ ${pipelineError} == false ]
+    then
+      rid=\$(python3 ${script_uploadExecutionRun_failExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Success -d 'Run Successful' -o ${source} -c \${cookie} -u ${executionRunRID})
+      echo LOG: execution run RID marked as successful - \${rid} >> ${repRID}.failExecutionRun.log
+    else
+      pipelineError_details=\$(echo "**Submitted metadata does not match inferred:**\\n")
+      pipelineError_details=\$(echo \${pipelineError_details}"|Metadata|Submitted value|Inferred value|\\n")
+      pipelineError_details=\$(echo \${pipelineError_details}"|:-:|-:|-:|\\n")
+      if ${pipelineError_ends}
+      then
+        if [ "${endsInfer}" == "se" ]
+        then
+          endInfer="Single End"
+        elif [ "${endsInfer}" == "pe" ]
+        then
+          endInfer="Paired End"
+        else
+          endInfer="unknown"
+        fi
+        pipelineError_details=\$(echo \${pipelineError_details}"|Paired End|${endsRaw}|"\${endInfer}"|\\n")
+      fi
+      if ${pipelineError_stranded}
+      then
+        pipelineError_details=\$(echo \${pipelineError_details}"|Strandedness|${strandedMeta}|${strandedInfer}|\\n")
+      fi
+      if ${pipelineError_spike}
+      then
+        pipelineError_details=\$(echo \${pipelineError_details}"|Used Spike Ins|${spikeMeta}|${spikeInfer}|\\n")
+      fi
+      if ${pipelineError_species}
+      then
+        pipelineError_details=\$(echo \${pipelineError_details}"|Species|${speciesMeta}|${speciesInfer}|\\n")
+      fi
+      pipelineError_details=\${pipelineError_details::-2}
+      rid=\$(python3 ${script_uploadExecutionRun_failExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d "\${pipelineError_details}" -o ${source} -c \${cookie} -u ${executionRunRID})
+      echo LOG: execution run RID marked as error - \${rid} >> ${repRID}.failExecutionRun.log
+    fi
+
+    if [ ${params.track} == true ]
+    then
+      dt=`date +%FT%T.%3N%:z`
+      curl -H 'Content-Type: application/json' -X PUT -d \
+        '{ \
+          "ID": "${workflow.sessionId}", \
+          "ExecutionRunRID": "'\${rid}'", \
+          "Failure": "'\${dt}'" \
+        }' \
+        "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track"
+    fi
+    """
+}
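The pipelineError_details block above assembles a Markdown table of submitted-versus-inferred metadata one echo at a time. The same table in a short Python sketch (the mismatch rows here are illustrative):

```python
#!/usr/bin/env python3
# Sketch of the Markdown table failExecutionRun posts as error details;
# the rows are illustrative values.
rows = [
    ("Strandedness", "stranded", "unstranded"),
    ("Species", "Homo sapiens", "Mus musculus"),
]

lines = ["**Submitted metadata does not match inferred:**",
         "|Metadata|Submitted value|Inferred value|",
         "|:-:|-:|-:|"]
lines += ["|{}|{}|{}|".format(*row) for row in rows]
print("\n".join(lines))
```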
+
+/*
+ * uploadQC_fail: uploads the mRNA QC on failed execution run
+*/
+process uploadQC_fail {
+  tag "${repRID}"
+
+  input:
+    path script_deleteEntry_uploadQC_fail
+    path script_uploadQC_fail
+    path credential, stageAs: "credential.json" from deriva_uploadQC_fail
+    val executionRunRID from failExecutionRunRID
+    tuple val (fastqCountError), val (fastqReadError), val (fastqFileError), val (speciesError), val (pipelineError) from error_uploadQC_fail
+
+  when:
+    upload
+    fastqCountError == 'true' || fastqReadError == 'true' || fastqFileError == 'true' || speciesError == 'true' || pipelineError == 'true'
+
+  script:
+    """
+    hostname > ${repRID}.uploadQC.log
+    ulimit -a >> ${repRID}.uploadQC.log
+
+    cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"')
+    cookie=\${cookie:11:-1}
+
+    exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:mRNA_QC/Replicate=${repRID})
+    if [ "\${exist}" != "[]" ]
+    then
+      rids=\$(echo \${exist} | grep -o '\\"RID\\":\\".\\{7\\}' | sed 's/^.\\{7\\}//')
+      for rid in \${rids}
+      do
+        python3 ${script_deleteEntry_uploadQC_fail} -r \${rid} -t mRNA_QC -o ${source} -c \${cookie}
+        echo LOG: old mRNA QC RID deleted - \${rid} >> ${repRID}.uploadQC.log
+      done
+      echo LOG: all old mRNA QC RIDs deleted >> ${repRID}.uploadQC.log
+    fi
+
+    qc_rid=\$(python3 ${script_uploadQC_fail} -r ${repRID} -e ${executionRunRID} -o ${source} -c \${cookie} -u E)
+    echo LOG: mRNA QC RID uploaded - \${qc_rid} >> ${repRID}.uploadQC.log
+
+    echo "\${qc_rid}" > qcRID.csv
+    """
+}
diff --git a/workflow/scripts/parse_meta.py b/workflow/scripts/parse_meta.py
index fdbc86c12a2fb6832217ec0f08263d0102c9e566..52f0f18200525f776fe73fcf2f4cd5be8db35045 100644
--- a/workflow/scripts/parse_meta.py
+++ b/workflow/scripts/parse_meta.py
@@ -63,32 +63,17 @@ def main():

     # Get strandedness metadata from 'Experiment Settings.csv'
     if (args.parameter == "stranded"):
-        if (metaFile.Has_Strand_Specific_Information.unique() == "yes"):
-            stranded = "stranded"
-        elif (metaFile.Has_Strand_Specific_Information.unique() == "no"):
-            stranded = "unstranded"
-        else:
-            stranded = metaFile.Has_Strand_Specific_Information.unique()[0]
+        stranded = metaFile.Strandedness.unique()[0]
         print(stranded)

     # Get spike-in metadata from 'Experiment Settings.csv'
     if (args.parameter == "spike"):
-        if (metaFile.Used_Spike_Ins.unique() == "yes"):
-            spike = "yes"
-        elif (metaFile.Used_Spike_Ins.unique() == "no"):
-            spike = "no"
-        else:
-            spike = metaFile.Used_Spike_Ins.unique()[0]
+        spike = metaFile.Used_Spike_Ins.unique()[0]
         print(spike)

     # Get species metadata from 'Experiment.csv'
     if (args.parameter == "species"):
-        if (metaFile.Species.unique() == "Mus musculus"):
-            species = "Mus musculus"
-        elif (metaFile.Species.unique() == "Homo sapiens"):
-            species = "Homo sapiens"
-        else:
-            species = metaFile.Species.unique()[0]
+        species = metaFile.Species.unique()[0]
         print(species)

     # Get read length metadata from 'Experiment Settings.csv'
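The simplified parse_meta.py branches now take `unique()[0]` directly, so a column with conflicting values silently yields whichever appears first instead of going through an explicit yes/no mapping. A small runnable illustration (the frame is made up):

```python
#!/usr/bin/env python3
# Illustration of the new parse_meta.py behavior: unique() returns the
# distinct values in order of appearance, and [0] keeps the first.
import pandas as pd

metaFile = pd.DataFrame({"Strandedness": ["stranded", "stranded"],
                         "Used_Spike_Ins": ["f", "f"],
                         "Species": ["Homo sapiens", "Homo sapiens"]})

print(metaFile.Strandedness.unique()[0])    # stranded
print(metaFile.Used_Spike_Ins.unique()[0])  # f
print(metaFile.Species.unique()[0])         # Homo sapiens
```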
diff --git a/workflow/scripts/upload_execution_run.py b/workflow/scripts/upload_execution_run.py
index 2e8ea8de7745a3f048b580486f20e25d8904dd0c..405c81aa9c2aa31c500157bee3db39413789a7fa 100644
--- a/workflow/scripts/upload_execution_run.py
+++ b/workflow/scripts/upload_execution_run.py
@@ -48,7 +48,6 @@ def main(hostname, catalog_number, credential):
         }
         entities = run_table.update([run_data])
         rid = args.update
-
     print(rid)
diff --git a/workflow/scripts/upload_output_bag.py b/workflow/scripts/upload_output_bag.py
index 397658c0ccef21af86e529a040a6dcb2ac506833..e1e1fc1a7fc59e2b003f3c6602f06182c9a3b054 100644
--- a/workflow/scripts/upload_output_bag.py
+++ b/workflow/scripts/upload_output_bag.py
@@ -14,6 +14,7 @@ def get_args():
     parser.add_argument('-n', '--notes', help="notes", default="", required=False)
     parser.add_argument('-o', '--host', help="datahub host", required=True)
     parser.add_argument('-c', '--cookie', help="cookie token", required=True)
+    parser.add_argument('-u', '--update', help="update?", default="F", required=False)
     args = parser.parse_args()
     return args
@@ -22,19 +23,27 @@ def main(hostname, catalog_number, credential):
     pb = catalog.getPathBuilder()
     outputBag_table = pb.RNASeq.Output_Bag

-    outputBag_data = {
-        "Execution_Run": args.executionRunRID,
-        "File_Name": args.file,
-        "File_URL": args.loc,
-        "File_MD5": args.md5,
-        "File_Bytes": args.bytes,
-        "File_Creation_Time": datetime.now().replace(microsecond=0).isoformat(),
-        "Notes": args.notes,
-        "Bag_Type": "mRNA_Replicate_Analysis"
-    }
-
-    entities = outputBag_table.insert([outputBag_data])
-    rid = entities[0]["RID"]
+    if args.update == "F":
+        outputBag_data = {
+            "Execution_Run": args.executionRunRID,
+            "File_Name": args.file,
+            "File_URL": args.loc,
+            "File_MD5": args.md5,
+            "File_Bytes": args.bytes,
+            "File_Creation_Time": datetime.now().replace(microsecond=0).isoformat(),
+            "Notes": args.notes,
+            "Bag_Type": "mRNA_Replicate_Analysis"
+        }
+        entities = outputBag_table.insert([outputBag_data])
+        rid = entities[0]["RID"]
+    else:
+        outputBag_data = {
+            "RID": args.update,
+            "Execution_Run": args.executionRunRID
+        }
+        entities = outputBag_table.update([outputBag_data])
+        rid = entities[0]["RID"]

     print(rid)
diff --git a/workflow/scripts/upload_qc.py b/workflow/scripts/upload_qc.py
index b842a7a36cc47fa4f599ab086a5c1b3dbece437a..29fac063d9812ad05877d3e8f8f0d865d52eca14 100644
--- a/workflow/scripts/upload_qc.py
+++ b/workflow/scripts/upload_qc.py
@@ -7,12 +7,12 @@ def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('-r', '--repRID', help="replicate RID", required=True)
     parser.add_argument('-e', '--executionRunRID', help="exection run RID", required=True)
-    parser.add_argument('-p', '--ends', help="single/paired ends", required=True)
-    parser.add_argument('-s', '--stranded', help="stranded?", required=True)
-    parser.add_argument('-l', '--length', help="median read length", required=True)
-    parser.add_argument('-w', '--rawCount', help="raw count", required=True)
-    parser.add_argument('-f', '--assignedCount', help="final assigned count", required=True)
-    parser.add_argument('-t', '--tin', help="median TIN", required=True)
+    parser.add_argument('-p', '--ends', help="single/paired ends", required=False)
+    parser.add_argument('-s', '--stranded', help="stranded?", required=False)
+    parser.add_argument('-l', '--length', help="median read length", required=False)
+    parser.add_argument('-w', '--rawCount', help="raw count", required=False)
+    parser.add_argument('-f', '--assignedCount', help="final assigned count", required=False)
+    parser.add_argument('-t', '--tin', help="median TIN", required=False)
     parser.add_argument('-n', '--notes', help="notes", default="", required=False)
     parser.add_argument('-o', '--host', help="datahub host", required=True)
     parser.add_argument('-c', '--cookie', help="cookie token", required=True)
@@ -39,6 +39,13 @@ def main(hostname, catalog_number, credential):
         }
         entities = run_table.insert([run_data])
         rid = entities[0]["RID"]
+    elif args.update == "E":
+        run_data = {
+            "Execution_Run": args.executionRunRID,
+            "Replicate": args.repRID
+        }
+        entities = run_table.insert([run_data])
+        rid = entities[0]["RID"]
     else:
         run_data = {
             "RID": args.update,
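upload_output_bag.py now branches between datapath insert() and update(); update() matches rows on RID, which is why the -u branch has to carry the existing bag's RID. A hedged sketch of the update path (requires deriva-py and a valid credential; the RID values and host are placeholders):

```python
#!/usr/bin/env python3
# Hedged sketch of the update branch (deriva-py datapath); the RIDs are
# fake and this needs a real credential on the host to run.
from deriva.core import ErmrestCatalog, get_credential

host = "staging.gudmap.org"
catalog = ErmrestCatalog("https", host, 2, get_credential(host))
outputBag_table = catalog.getPathBuilder().RNASeq.Output_Bag

entities = outputBag_table.update([{"RID": "17-ABCD",            # existing bag (fake RID)
                                    "Execution_Run": "17-BVDJ"}])  # fake run RID
print(entities[0]["RID"])
```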
diff --git a/workflow/tests/test_alignReads.py b/workflow/tests/test_alignReads.py
index 11f0f3d0d09236a3f3494a2c851fb4294f2a1323..06429a5b8c11daa539499fe1460ef34085031793 100644
--- a/workflow/tests/test_alignReads.py
+++ b/workflow/tests/test_alignReads.py
@@ -5,25 +5,25 @@ import pandas as pd
 import os
 import utils

-data_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
     '/../../'

 @pytest.mark.alignData
 def test_alignData_se():
     assert os.path.exists(os.path.join(
-        data_output_path, 'Q-Y5F6_1M.se.unal.gz'))
+        test_output_path, 'Q-Y5F6_1M.se.unal.gz'))
     assert os.path.exists(os.path.join(
-        data_output_path, 'Q-Y5F6_1M.se.sorted.bam'))
+        test_output_path, 'Q-Y5F6_1M.se.sorted.bam'))
     assert os.path.exists(os.path.join(
-        data_output_path, 'Q-Y5F6_1M.se.sorted.bam.bai'))
+        test_output_path, 'Q-Y5F6_1M.se.sorted.bam.bai'))

 @pytest.mark.alignData
 def test_alignData_pe():
     assert os.path.exists(os.path.join(
-        data_output_path, 'Q-Y5F6_1M.pe.unal.gz'))
+        test_output_path, 'Q-Y5F6_1M.pe.unal.gz'))
     assert os.path.exists(os.path.join(
-        data_output_path, 'Q-Y5F6_1M.pe.sorted.bam'))
+        test_output_path, 'Q-Y5F6_1M.pe.sorted.bam'))
     assert os.path.exists(os.path.join(
-        data_output_path, 'Q-Y5F6_1M.pe.sorted.bam.bai'))
+        test_output_path, 'Q-Y5F6_1M.pe.sorted.bam.bai'))
diff --git a/workflow/tests/test_completion.py b/workflow/tests/test_completion.py
new file mode 100644
index 0000000000000000000000000000000000000000..25a9941de634d36b84ad750c5968fa75009dfd27
--- /dev/null
+++ b/workflow/tests/test_completion.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+import json
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+@pytest.mark.completionMultiqc
+def test_multiqcExist(filename):
+    assert os.path.exists(os.path.join(
+        test_output_path, filename))
\ No newline at end of file
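test_completion.py takes `filename` as a pytest fixture, and the CI jobs invoke `pytest -m completionMultiqc --filename SE_multiqc_data.json`, so a conftest.py along these lines is assumed to register the option — it is not part of this diff:

```python
#!/usr/bin/env python3
# Assumed conftest.py sketch: registers the --filename option that
# test_completion.py consumes as a fixture. Not part of this diff.
import pytest

def pytest_addoption(parser):
    parser.addoption("--filename", action="store",
                     default="multiqc_data.json",
                     help="multiqc data json to check for")

@pytest.fixture
def filename(request):
    return request.config.getoption("--filename")
```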
diff --git a/workflow/tests/test_dedupReads.py b/workflow/tests/test_dedupReads.py
index 89fc2b10fa4db847ccc16d5cce664bf551b29ee3..15e227d4586334721257bc6382d60cf0709bac62 100644
--- a/workflow/tests/test_dedupReads.py
+++ b/workflow/tests/test_dedupReads.py
@@ -5,25 +5,25 @@ import pandas as pd
 import os
 import utils

-data_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
     '/../../'

 @pytest.mark.dedupData
 def test_dedupData():
     assert os.path.exists(os.path.join(
-        data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.bam'))
+        test_output_path, 'Q-Y5F6_1M.se.sorted.deduped.bam'))
     assert os.path.exists(os.path.join(
-        data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.bam.bai'))
+        test_output_path, 'Q-Y5F6_1M.se.sorted.deduped.bam.bai'))
     assert os.path.exists(os.path.join(
-        data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr8.bam'))
+        test_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr8.bam'))
     assert os.path.exists(os.path.join(
-        data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr8.bam.bai'))
+        test_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr8.bam.bai'))
     assert os.path.exists(os.path.join(
-        data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr4.bam'))
+        test_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr4.bam'))
     assert os.path.exists(os.path.join(
-        data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr4.bam.bai'))
+        test_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr4.bam.bai'))
     assert os.path.exists(os.path.join(
-        data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chrY.bam'))
+        test_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chrY.bam'))
     assert os.path.exists(os.path.join(
-        data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chrY.bam.bai'))
+        test_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chrY.bam.bai'))
diff --git a/workflow/tests/test_makeBigWig.py b/workflow/tests/test_makeBigWig.py
index d8f62f5edfb3b57868d0b4b18ed6a0deb6bd651e..273b2cdbb892a464a26b152db2c5d5c0a46922bf 100644
--- a/workflow/tests/test_makeBigWig.py
+++ b/workflow/tests/test_makeBigWig.py
@@ -5,10 +5,10 @@ import pandas as pd
 import os
 import utils

-data_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
     '/../../'

 @pytest.mark.makeBigWig
 def test_makeBigWig():
-    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.bw'))
+    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5F6_1M.se.bw'))
diff --git a/workflow/tests/test_makeFeatureCounts.py b/workflow/tests/test_makeFeatureCounts.py
index e14793511b226a6c82d502ce2f84867c087bc41a..43e6810482d6131b064b3604c95e89e9296db603 100644
--- a/workflow/tests/test_makeFeatureCounts.py
+++ b/workflow/tests/test_makeFeatureCounts.py
@@ -5,15 +5,15 @@ import pandas as pd
 import os
 import utils

-data_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
     '/../../'

 @pytest.mark.makeFeatureCounts
 def test_makeFeatureCounts():
     assert os.path.exists(os.path.join(
-        data_output_path, 'Q-Y5F6_1M.se_countData'))
+        test_output_path, 'Q-Y5F6_1M.se_countData'))
     assert os.path.exists(os.path.join(
-        data_output_path, 'Q-Y5F6_1M.se.countTable.csv'))
+        test_output_path, 'Q-Y5F6_1M.se.countTable.csv'))
     assert os.path.exists(os.path.join(
-        data_output_path, 'Q-Y5F6_1M.se_tpmTable.csv'))
+        test_output_path, 'Q-Y5F6_1M.se_tpmTable.csv'))
diff --git a/workflow/tests/test_parseMetadata.py b/workflow/tests/test_parseMetadata.py
index 5a14fcd885b79d944e46de5d936d17fc941def7b..738f5ec66ee4e08d4e64400da719b759a7876a37 100644
--- a/workflow/tests/test_parseMetadata.py
+++ b/workflow/tests/test_parseMetadata.py
@@ -19,7 +19,7 @@ def readLine(fileName):
     data = False
     file = open(fileName, "r")
     line = file.readline()
-    if line.strip() == "uk,uk,se,unstranded,no,Homo sapiens,75,Experiment_RID,Study_RID,Replicate_RID":
+    if line.strip() == "uk,uk,se,unstranded,f,Homo sapiens,75,Experiment_RID,Study_RID,Replicate_RID":
         data = True
     return data