diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index bf7355aa6e19f1c29ab020d8524ff204ffc9e7cc..34062a6bedaf41b7d145f2556276cb8d425ddbb1 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -705,6 +705,51 @@ integration_pe: - always +failAmbiguousSpecies: + stage: integration + only: [merge_requests] + except: + variables: + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + script: + - hostname + - ulimit -a + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ER --source staging --upload true -with-dag dag.png --dev false --ci true + retry: + max: 0 + when: + - always + +failTrunkation: + stage: integration + only: [merge_requests] + except: + variables: + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + script: + - hostname + - ulimit -a + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ET --source staging --upload true -with-dag dag.png --dev false --ci true + retry: + max: 0 + when: + - always + +failMismatchR1R2: + stage: integration + only: [merge_requests] + except: + variables: + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + script: + - hostname + - ulimit -a + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 16-CWH4 --source staging --upload true -with-dag dag.png --dev false --ci true + retry: + max: 0 + when: + - always + override_inputBag: stage: integration only: [merge_requests] diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bbd3c07775d72a1028eb875b9b81b2bceadeb46..9fdafbc361104eb9c96601ed1b41d4b703495ee6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,20 @@ -# v1.0.0 (in development) +# v1.0.1 (in development) +**User Facing** +* + +**Background** +* Split non-metadata mismatch error handling process into 2, 1 to handle fastq errors and one for species errors (BUG FIX #101) +* Add known errors 
to integration CI tests (ambiguous species, truncated fastq, R1/R2 mismatch) (#103) +* Fix pre-execution run fails uploading of execution run RID to tracking site (#96, #97) +* Change CI replicate count badge to count all execution runs that match major version + +*Known Bugs* +* Override params (inputBag, fastq, species) aren't checked for integrity +* Authentication files and tokens must be active (active auth client) for the duration of the pipeline run (until long-lived token utilization included) + +<hr> + +# v1.0.0 **User Facing** * Add link to reference builder script * Output median TIN to mRNA_QC table diff --git a/docs/dag.png b/docs/dag.png index 48ae4dfa28fc0eceb4aaeb83332b22da8633ca29..74bf1dbbf26f30d9cee216682aa795393db24ae8 100755 Binary files a/docs/dag.png and b/docs/dag.png differ diff --git a/docs/software_versions_mqc.yaml b/docs/software_versions_mqc.yaml index feec6dd7c15e1bfd9e5d9c5d66e08f8f1b81d7f9..b1982d393f7f24cf549a830d48be20afc749533b 100755 --- a/docs/software_versions_mqc.yaml +++ b/docs/software_versions_mqc.yaml @@ -20,5 +20,5 @@ <dt>deepTools</dt><dd>v3.5.0</dd> <dt>FastQC</dt><dd>v0.11.9</dd> <dt>MultiQC</dt><dd>v1.9</dd> - <dt>Pipeline Version</dt><dd>v1.0.0</dd> + <dt>Pipeline Version</dt><dd>v1.0.1</dd> </dl> diff --git a/workflow/nextflow.config b/workflow/nextflow.config index 3b982bfbd4b12896c957e6f0af7cbc130e4c3039..8d23a6c8136bcc357afc54a1626f69ad50eeb805 100644 --- a/workflow/nextflow.config +++ b/workflow/nextflow.config @@ -122,6 +122,6 @@ manifest { homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq' description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.' 
mainScript = 'rna-seq.nf' - version = 'v1.0.0' + version = 'v1.0.1' nextflowVersion = '>=19.09.0' } diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index dcfc7186b431c5e26e609ec36a2dd07fb658cc44..fb7158f9272d8731a98b6f7a4eb74bd03fe8b11b 100644 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -47,7 +47,8 @@ deriva.into { deriva_uploadProcessedFile deriva_uploadOutputBag deriva_finalizeExecutionRun - deriva_failPreExecutionRun + deriva_failPreExecutionRun_fastq + deriva_failPreExecutionRun_species deriva_failExecutionRun } bdbag = Channel @@ -98,7 +99,8 @@ script_tinHist = Channel.fromPath("${baseDir}/scripts/tin_hist.py") script_uploadInputBag = Channel.fromPath("${baseDir}/scripts/upload_input_bag.py") script_uploadExecutionRun_uploadExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py") script_uploadExecutionRun_finalizeExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py") -script_uploadExecutionRun_failPreExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py") +script_uploadExecutionRun_failPreExecutionRun_fastq = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py") +script_uploadExecutionRun_failPreExecutionRun_species = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py") script_uploadExecutionRun_failExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py") script_uploadQC = Channel.fromPath("${baseDir}/scripts/upload_qc.py") script_uploadOutputBag = Channel.fromPath("${baseDir}/scripts/upload_output_bag.py") @@ -448,13 +450,15 @@ strandedMeta.into { spikeMeta.into { spikeMeta_checkMetadata spikeMeta_aggrQC - spikeMeta_failPreExecutionRun + spikeMeta_failPreExecutionRun_fastq + spikeMeta_failPreExecutionRun_species spikeMeta_failExecutionRun } speciesMeta.into { speciesMeta_checkMetadata speciesMeta_aggrQC - speciesMeta_failPreExecutionRun + speciesMeta_failPreExecutionRun_fastq + speciesMeta_failPreExecutionRun_species 
speciesMeta_failExecutionRun } studyRID.into { @@ -500,7 +504,7 @@ fastqCountError.into { fastqCountError_uploadQC fastqCountError_uploadProcessedFile fastqCountError_uploadOutputBag - fastqCountError_failPreExecutionRun + fastqCountError_failPreExecutionRun_fastq } fastqReadError.into { fastqReadError_trimData @@ -521,7 +525,7 @@ fastqReadError.into { fastqReadError_uploadQC fastqReadError_uploadProcessedFile fastqReadError_uploadOutputBag - fastqReadError_failPreExecutionRun + fastqReadError_failPreExecutionRun_fastq } /* @@ -987,7 +991,7 @@ speciesError.into { speciesError_uploadQC speciesError_uploadProcessedFile speciesError_uploadOutputBag - speciesError_failPreExecutionRun + speciesError_failPreExecutionRun_species } /* @@ -1180,7 +1184,8 @@ inputBagRID_fl.splitCsv(sep: ",", header: false).separate( inputBagRID.into { inputBagRID_uploadExecutionRun inputBagRID_finalizeExecutionRun - inputBagRID_failPreExecutionRun + inputBagRID_failPreExecutionRun_fastq + inputBagRID_failPreExecutionRun_species inputBagRID_failExecutionRun } @@ -2166,32 +2171,30 @@ process finalizeExecutionRun { } /* - * failPreExecutionRun: fail the execution run prematurely + * failPreExecutionRun_fastq: fail the execution run prematurely for fastq errors */ -process failPreExecutionRun { +process failPreExecutionRun_fastq { tag "${repRID}" input: - path script_uploadExecutionRun_failPreExecutionRun - path credential, stageAs: "credential.json" from deriva_failPreExecutionRun - val spike from spikeMeta_failPreExecutionRun - val species from speciesMeta_failPreExecutionRun - val inputBagRID from inputBagRID_failPreExecutionRun - val fastqCountError from fastqCountError_failPreExecutionRun + path script_uploadExecutionRun from script_uploadExecutionRun_failPreExecutionRun_fastq + path credential, stageAs: "credential.json" from deriva_failPreExecutionRun_fastq + val spike from spikeMeta_failPreExecutionRun_fastq + val species from speciesMeta_failPreExecutionRun_fastq + val inputBagRID from 
inputBagRID_failPreExecutionRun_fastq + val fastqCountError from fastqCountError_failPreExecutionRun_fastq val fastqCountError_details - val fastqReadError from fastqReadError_failPreExecutionRun + val fastqReadError from fastqReadError_failPreExecutionRun_fastq val fastqReadError_details - val speciesError from speciesError_failPreExecutionRun - val speciesError_details when: upload - fastqCountError == 'true' || fastqReadError == 'true' || speciesError == 'true' + fastqCountError == 'true' || fastqReadError == 'true' script: """ - hostname > ${repRID}.failPreExecutionRun.log - ulimit -a >> ${repRID}.failPreExecutionRun.log + hostname > ${repRID}.failPreExecutionRun_fastq.log + ulimit -a >> ${repRID}.failPreExecutionRun_fastq.log errorDetails="" if [ ${fastqCountError} == true ] @@ -2200,16 +2203,95 @@ process failPreExecutionRun { elif [ ${fastqReadError} == true ] then errorDetails=\$(echo \$(errorDetails)${fastqReadError_details}"\\n") - elif [ ${speciesError} == true ] + fi + + echo LOG: searching for workflow RID - BICF mRNA ${workflow.manifest.version} >> ${repRID}.failPreExecutionRun_fastq.log + workflow=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Workflow/Name=BICF%20mRNA%20Replicate/Version=${workflow.manifest.version}) + workflow=\$(echo \${workflow} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + workflow=\${workflow:7:-6} + echo LOG: workflow RID extracted - \${workflow} >> ${repRID}.failPreExecutionRun_fastq.log + + if [ "${species}" == "Homo sapiens" ] + then + genomeName=\$(echo GRCh${refHuVersion}) + elif [ "${species}" == "Mus musculus" ] + then + genomeName=\$(echo GRCm${refMoVersion}) + fi + if [ "${spike}" == "yes" ] + then + genomeName=\$(echo \${genomeName}-S) + fi + echo LOG: searching for genome name - \${genomeName} >> ${repRID}.failPreExecutionRun_fastq.log + genome=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Name=\${genomeName}) + genome=\$(echo \${genome} | grep -o '\\"RID\\":\\".*\\",\\"RCT') 
+ genome=\${genome:7:-6} + echo LOG: genome RID extracted - \${genome} >> ${repRID}.failPreExecutionRun_fastq.log + + cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') + cookie=\${cookie:11:-1} + + exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=\${workflow}/Replicate=${repRID}/Input_Bag=${inputBagRID}) + echo \${exist} >> ${repRID}.failPreExecutionRun_fastq.log + if [ "\${exist}" == "[]" ] + then + rid=\$(python3 ${script_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d "\${errorDetails}" -o ${source} -c \${cookie} -u F) + echo LOG: execution run RID uploaded - \${rid} >> ${repRID}.failPreExecutionRun_fastq.log + else + rid=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + rid=\${rid:7:-6} + echo \${rid} >> ${repRID}.failPreExecutionRun_fastq.log + executionRun_rid=\$(python3 ${script_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d "\${errorDetails}" -o ${source} -c \${cookie} -u \${rid}) + echo LOG: execution run RID updated - \${executionRun_rid} >> ${repRID}.failPreExecutionRun_fastq.log + fi + + dt=`date +%FT%T.%3N%:z` + curl -H 'Content-Type: application/json' -X PUT -d \ + '{ \ + "ID": "${workflow.sessionId}", \ + "ExecutionRunRID": "'\${rid}'", \ + "Failure": "'\${dt}'" \ + }' \ + "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track" + """ +} + + +/* + * failPreExecutionRun_species: fail the execution run prematurely for species error +*/ +process failPreExecutionRun_species { + tag "${repRID}" + + input: + path script_uploadExecutionRun from script_uploadExecutionRun_failPreExecutionRun_species + path credential, stageAs: "credential.json" from deriva_failPreExecutionRun_species + val spike from spikeMeta_failPreExecutionRun_species + val species from speciesMeta_failPreExecutionRun_species + val inputBagRID from inputBagRID_failPreExecutionRun_species 
+ val speciesError from speciesError_failPreExecutionRun_species + val speciesError_details + + when: + upload + speciesError == 'true' + + script: + """ + hostname > ${repRID}.failPreExecutionRun_species.log + ulimit -a >> ${repRID}.failPreExecutionRun_species.log + + errorDetails="" + if [ ${speciesError} == true ] then errorDetails=\$(echo \$(errorDetails)${speciesError_details}"\\n") fi - echo LOG: searching for workflow RID - BICF mRNA ${workflow.manifest.version} >> ${repRID}.failPreExecutionRun.log + echo LOG: searching for workflow RID - BICF mRNA ${workflow.manifest.version} >> ${repRID}.failPreExecutionRun_species.log workflow=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Workflow/Name=BICF%20mRNA%20Replicate/Version=${workflow.manifest.version}) workflow=\$(echo \${workflow} | grep -o '\\"RID\\":\\".*\\",\\"RCT') workflow=\${workflow:7:-6} - echo LOG: workflow RID extracted - \${workflow} >> ${repRID}.failPreExecutionRun.log + echo LOG: workflow RID extracted - \${workflow} >> ${repRID}.failPreExecutionRun_species.log if [ "${species}" == "Homo sapiens" ] then @@ -2222,33 +2304,34 @@ process failPreExecutionRun { then genomeName=\$(echo \${genomeName}-S) fi - echo LOG: searching for genome name - \${genomeName} >> ${repRID}.failPreExecutionRun.log + echo LOG: searching for genome name - \${genomeName} >> ${repRID}.failPreExecutionRun_species.log genome=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Name=\${genomeName}) genome=\$(echo \${genome} | grep -o '\\"RID\\":\\".*\\",\\"RCT') genome=\${genome:7:-6} - echo LOG: genome RID extracted - \${genome} >> ${repRID}.failPreExecutionRun.log + echo LOG: genome RID extracted - \${genome} >> ${repRID}.failPreExecutionRun_species.log cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') cookie=\${cookie:11:-1} exist=\$(curl -s 
https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=\${workflow}/Replicate=${repRID}/Input_Bag=${inputBagRID}) - echo \${exist} >> ${repRID}.failPreExecutionRun.log + echo \${exist} >> ${repRID}.failPreExecutionRun_species.log if [ "\${exist}" == "[]" ] then - rid=\$(python3 ${script_uploadExecutionRun_failPreExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d "\${errorDetails}" -o ${source} -c \${cookie} -u F) - echo LOG: execution run RID uploaded - \${rid} >> ${repRID}.failPreExecutionRun.log + rid=\$(python3 ${script_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d "\${errorDetails}" -o ${source} -c \${cookie} -u F) + echo LOG: execution run RID uploaded - \${rid} >> ${repRID}.failPreExecutionRun_species.log else rid=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') rid=\${rid:7:-6} - echo \${rid} >> ${repRID}.failPreExecutionRun.log - executionRun_rid==\$(python3 ${script_uploadExecutionRun_failPreExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d "\${errorDetails}" -o ${source} -c \${cookie} -u \${rid}) - echo LOG: execution run RID updated - \${executionRun_rid} >> ${repRID}.failPreExecutionRun.log + echo \${rid} >> ${repRID}.failPreExecutionRun_species.log + executionRun_rid=\$(python3 ${script_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d "\${errorDetails}" -o ${source} -c \${cookie} -u \${rid}) + echo LOG: execution run RID updated - \${executionRun_rid} >> ${repRID}.failPreExecutionRun_species.log fi dt=`date +%FT%T.%3N%:z` curl -H 'Content-Type: application/json' -X PUT -d \ '{ \ "ID": "${workflow.sessionId}", \ + "ExecutionRunRID": "'\${rid}'", \ "Failure": "'\${dt}'" \ }' \ "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track" @@ -2340,6 +2423,7 @@ process failExecutionRun { curl -H 'Content-Type: application/json' -X PUT -d \ '{ \ "ID": 
"${workflow.sessionId}", \ + "ExecutionRunRID": "'\${rid}'", \ "Failure": "'\${dt}'" \ }' \ "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track" diff --git a/workflow/scripts/get_updated_rep_count.sh b/workflow/scripts/get_updated_rep_count.sh index be054ea1a8531cb2166436f21aab2e42a09065f4..daeb0575d08f2126b40f2db089ae82af4f01ed0c 100644 --- a/workflow/scripts/get_updated_rep_count.sh +++ b/workflow/scripts/get_updated_rep_count.sh @@ -3,19 +3,34 @@ echo "collecting stats for badges" latest_release_tag=$(git tag --sort=-committerdate -l *.*.* | head -1) current_pipeline_version=$(git show ${latest_release_tag}:workflow/nextflow.config | grep -o version.* | grep -oP "(?<=').*(?=')") +current_pipeline_versionMajor=$(echo ${current_pipeline_version} | cut -f1 -d".") +current_pipeline_versionMajor=$(echo ${current_pipeline_versionMajor}".") +echo "Major pipeline version for search: "${current_pipeline_versionMajor} echo "collecting workflow RIDs from servers" -dev_workflow_RID=$(curl -s https://dev.gudmap.org/ermrest/catalog/2/entity/RNASeq:Workflow/Version=${current_pipeline_version} | grep -o '\"RID\":\".*\",\"RCT') -dev_workflow_RID=${dev_workflow_RID:7:-6} -staging_workflow_RID=$(curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Workflow/Version=${current_pipeline_version} | grep -o '\"RID\":\".*\",\"RCT') -staging_workflow_RID=${staging_workflow_RID:7:-6} -prod_workflow_RID=$(curl -s https://www.gudmap.org/ermrest/catalog/2/entity/RNASeq:Workflow/Version=${current_pipeline_version} | grep -o '\"RID\":\".*\",\"RCT') -prod_workflow_RID=${prod_workflow_RID:7:-6} +dev_workflow_RID=$(curl -s https://dev.gudmap.org/ermrest/catalog/2/entity/RNASeq:Workflow/Version::ciregexp::%5E${current_pipeline_versionMajor} | grep -o '\"RID\":\".*\",\"RCT' | cut -f4 -d"\"") +staging_workflow_RID=$(curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Workflow/Version::ciregexp::%5E${current_pipeline_versionMajor} | grep -o 
'\"RID\":\".*\",\"RCT' | cut -f4 -d"\"") +prod_workflow_RID=$(curl -s https://www.gudmap.org/ermrest/catalog/2/entity/RNASeq:Workflow/Version::ciregexp::%5E${current_pipeline_versionMajor} | grep -o '\"RID\":\".*\",\"RCT' | cut -f4 -d"\"") echo "collecting unique replicates with successful execution runs" -dev_count=$(curl -s https://dev.gudmap.org/ermrest/catalog/2/entity/RNASeq:Execution_Run/Execution_Status=Success/Workflow=${dev_workflow_RID} | grep -o \"Replicate\".*,\"Workflow | grep -oP "(?<=\"Replicate\":\").*(?=\",\"Workflow)" | sort | uniq | wc -l) -staging_count=$(curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Execution_Run/Execution_Status=Success/Workflow=${staging_workflow_RID} | grep -o \"Replicate\".*,\"Workflow | grep -oP "(?<=\"Replicate\":\").*(?=\",\"Workflow)" | sort | uniq | wc -l) -prod_count=$(curl -s https://www.gudmap.org/ermrest/catalog/2/entity/RNASeq:Execution_Run/Execution_Status=Success/Workflow=${prod_workflow_RID} | grep -o \"Replicate\".*,\"Workflow | grep -oP "(?<=\"Replicate\":\").*(?=\",\"Workflow)" | sort | uniq | wc -l) +dev_count=0 +for rid in ${dev_workflow_RID} +do + temp_count=$(curl -s https://dev.gudmap.org/ermrest/catalog/2/entity/RNASeq:Execution_Run/Execution_Status=Success/Workflow=${rid} | grep -o \"Replicate\".*,\"Workflow | grep -oP "(?<=\"Replicate\":\").*(?=\",\"Workflow)" | sort | uniq | wc -l) + dev_count=$(expr ${dev_count} + ${temp_count}) +done +staging_count=0 +for rid in ${staging_workflow_RID} +do + temp_count=$(curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Execution_Run/Execution_Status=Success/Workflow=${rid} | grep -o \"Replicate\".*,\"Workflow | grep -oP "(?<=\"Replicate\":\").*(?=\",\"Workflow)" | sort | uniq | wc -l) + staging_count=$(expr ${staging_count} + ${temp_count}) +done +prod_count=0 +for rid in ${prod_workflow_RID} +do + temp_count=$(curl -s 
https://www.gudmap.org/ermrest/catalog/2/entity/RNASeq:Execution_Run/Execution_Status=Success/Workflow=${rid} | grep -o \"Replicate\".*,\"Workflow | grep -oP "(?<=\"Replicate\":\").*(?=\",\"Workflow)" | sort | uniq | wc -l) + prod_count=$(expr ${prod_count} + ${temp_count}) +done echo "collecting badges" mkdir -p ./badges/counts