diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1f93dcef7aced3bfb7ed15c9faadfaa59f1d0d2e..da81becce22ae6a06ae0419b5770b87a1039dee9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -2,38 +2,154 @@ before_script: - module load python/3.6.4-anaconda - pip install --user attrs==20.3.0 pytest==6.2.2 pytest-pythonpath==0.7.3 pytest-cov==2.11.1 - module load singularity/3.5.3 + - export SINGULARITY_CACHEDIR=${dir}cache/ - module load nextflow/20.01.0 - ln -sfn /project/BICF/BICF_Core/shared/gudmap/test_data/* ./test_data/ - mkdir -p ~/.deriva - mkdir -p ~/.bdbag +after_script: + - unset SINGULARITY_CACHEDIR + variables: refMoVersion: "38.p6.vM25" refHuVersion: "38.p13.v36" refERCCVersion: "92" + dir: "/project/BICF/BICF_Core/shared/gudmap/singularity_cache/" stages: + - environment + - singularity + - versions + - aggregation - badges - deploy - unit - - aggregation - reference - integration - consistency + +img_cache: + stage: singularity + script: + - mkdir -p ${dir}cache/ + - cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | xargs -P 1 -I {} bash -c "singularity pull --dir ${dir} 'docker://'{} || true" + - wait + - echo images cached + +collect: + stage: versions + script: + - derivaImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f1) + - derivaVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${derivaImg}_${derivaVar}.sif + - singularity run ${dir}${derivaImg}_${derivaVar}.sif deriva-download-cli --version > version_deriva.txt + - singularity run ${dir}${derivaImg}_${derivaVar}.sif bdbag --version > version_bdbag.txt + - pythonImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep python | cut -d"/" -f2 | cut -d":" -f1) + - pythonVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep python | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${pythonImg}_${pythonVar}.sif + - singularity run ${dir}${pythonImg}_${pythonVar}.sif python3 --version > version_python.txt + - fastqcImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep fastqc | cut -d"/" -f2 | cut -d":" -f1) + - fastqcVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep fastqc | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${fastqcImg}_${fastqcVar}.sif + - singularity run ${dir}${fastqcImg}_${fastqcVar}.sif fastqc --version > version_fastqc.txt + - seqwhoImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep seqwho | cut -d"/" -f2 | cut -d":" -f1) + - seqwhoVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep seqwho | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${seqwhoImg}_${seqwhoVar}.sif + - singularity run ${dir}${seqwhoImg}_${seqwhoVar}.sif seqwho.py -h | grep -o Version.* > version_seqwho.txt & + - trimgaloreImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep trimgalore | cut -d"/" -f2 | cut -d":" -f1) + - trimgaloreVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep trimgalore | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${trimgaloreImg}_${trimgaloreVar}.sif + - singularity run ${dir}${trimgaloreImg}_${trimgaloreVar}.sif trim_galore --version > version_trimgalore.txt + - seqtkImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | 
sort | uniq | grep seqtk | cut -d"/" -f2 | cut -d":" -f1) + - seqtkVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep seqtk | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${seqtkImg}_${seqtkVar}.sif + - singularity run ${dir}${seqtkImg}_${seqtkVar}.sif seqtk 2>&1 | grep -o Version.* > version_seqtk.txt & + - rseqcImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep rseqc | cut -d"/" -f2 | cut -d":" -f1) + - rseqcVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep rseqc | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${rseqcImg}_${rseqcVar}.sif + - singularity run ${dir}${rseqcImg}_${rseqcVar}.sif infer_experiment.py --version > version_rseqc.txt + - hisatImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep hisat | cut -d"/" -f2 | cut -d":" -f1) + - hisatVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep hisat | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${hisatImg}_${hisatVar}.sif + - singularity run ${dir}${hisatImg}_${hisatVar}.sif hisat2 --version > version_hisat2.txt + - singularity run ${dir}${hisatImg}_${hisatVar}.sif samtools --version > version_samtools.txt + - picardImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep picard | cut -d"/" -f2 | cut -d":" -f1) + - picardVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep picard | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${picardImg}_${picardVar}.sif + - singularity run ${dir}${picardImg}_${picardVar}.sif java -jar /picard/build/libs/picard.jar MarkDuplicates --version 2> version_markdups.txt & + - subreadImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep subread | cut -d"/" -f2 | cut -d":" -f1) + - subreadVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep subread | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${subreadImg}_${subreadVar}.sif + - singularity run ${dir}${subreadImg}_${subreadVar}.sif featureCounts -v &> version_featurecounts.txt + - singularity run ${dir}${subreadImg}_${subreadVar}.sif R --version > version_r.txt + - deeptoolsImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deeptools | cut -d"/" -f2 | cut -d":" -f1) + - deeptoolsVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deeptools | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${deeptoolsImg}_${deeptoolsVar}.sif + - singularity run ${dir}${deeptoolsImg}_${deeptoolsVar}.sif deeptools --version > version_deeptools.txt + - multiqcImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep multiqc | cut -d"/" -f2 | cut -d":" -f1) + - multiqcVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep multiqc | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${multiqcImg}_${multiqcVar}.sif + - singularity run ${dir}${multiqcImg}_${multiqcVar}.sif multiqc --version > version_multiqc.txt + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - version_deriva.txt + - version_bdbag.txt + - version_python.txt + - version_fastqc.txt + - version_seqwho.txt + - version_trimgalore.txt + - version_seqtk.txt + - version_rseqc.txt + - version_hisat2.txt + - version_samtools.txt + - version_markdups.txt + - version_featurecounts.txt + - version_r.txt + - version_deeptools.txt + - version_multiqc.txt + 
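Note on the cached image paths used by these jobs: singularity pull names the downloaded image <name>_<tag>.sif by default, so pulling into the shared ${dir} lets every job address the cache as ${dir}${img}_${var}.sif. A minimal sketch of the convention, using the deriva image from nextflow.config as the example:

# pull once into the shared cache; the default output name is <name>_<tag>.sif
singularity pull --dir /project/BICF/BICF_Core/shared/gudmap/singularity_cache/ docker://gudmaprbk/deriva1.4:1.0.0
# downstream jobs then run the cached file directly instead of the docker:// URI
singularity run /project/BICF/BICF_Core/shared/gudmap/singularity_cache/deriva1.4_1.0.0.sif deriva-download-cli --version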
expire_in: 7 days + +generateVersions: + stage: aggregation + only: + - push + - tags + except: + - merge_requests + - schedules + script: + - python ./workflow/scripts/generate_versions.py -o software_versions + - python ./workflow/scripts/generate_references.py -r ./docs/references.md -o software_references + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - software_references_mqc.yaml + - software_versions_mqc.yaml + expire_in: 7 days + + build_badges: stage: badges only: - master - develop - tags + - schedules before_script: - module load singularity/3.5.3 - chmod +x ./workflow/scripts/get_updated_badge_info.sh script: - echo "Building badges" - - singularity run 'docker://gudmaprbk/gudmap-rbk_base:1.0.0' bash ./workflow/scripts/get_updated_badge_info.sh - - singularity run 'docker://gudmaprbk/gudmap-rbk_base:1.0.0' bash ./workflow/scripts/get_updated_rep_count.sh + - baseImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep gudmap-rbk_base | cut -d"/" -f2 | cut -d":" -f1) + - baseVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep gudmap-rbk_base | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${baseImg}_${baseVar}.sif + - singularity run ${dir}${baseImg}_${baseVar}.sif bash ./workflow/scripts/get_updated_badge_info.sh + - singularity run ${dir}${baseImg}_${baseVar}.sif bash ./workflow/scripts/get_updated_rep_count.sh artifacts: paths: - badges/ @@ -44,6 +160,7 @@ pages: - master - develop - tags + - schedules dependencies: - build_badges script: @@ -62,16 +179,12 @@ getBag: - merge_requests - schedules script: - - ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json - - singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-download-cli --version > version_deriva.txt - - singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-download-cli staging.gudmap.org --catalog 2 ./workflow/conf/Replicate_For_Input_Bag.json . rid=Q-Y5F6 - - pytest -m getBag - artifacts: - name: "$CI_JOB_NAME" - when: always - paths: - - version_deriva.txt - expire_in: 7 days + - derivaImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f1) + - derivaVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${derivaImg}_${derivaVar}.sif + - ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json + - singularity run ${dir}${derivaImg}_${derivaVar}.sif deriva-download-cli staging.gudmap.org --catalog 2 ./workflow/conf/Replicate_For_Input_Bag.json . 
rid=Q-Y5F6 + - pytest -m getBag getData: stage: unit @@ -82,17 +195,13 @@ getData: - merge_requests - schedules script: - - singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' bdbag --version > version_bdbag.txt - - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt - - unzip ./test_data/bag/Q-Y5F6_inputBag_xxxxtest.zip - - singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' bash ./workflow/scripts/bdbag_fetch.sh Q-Y5F6_inputBag Q-Y5F6 - - pytest -m getData - artifacts: - name: "$CI_JOB_NAME" - when: always - paths: - - version_bdbag.txt - expire_in: 7 days + - derivaImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f1) + - derivaVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${derivaImg}_${derivaVar}.sif + - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt + - unzip ./test_data/bag/Q-Y5F6_inputBag_xxxxtest.zip + - singularity run ${dir}${derivaImg}_${derivaVar}.sif bash ./workflow/scripts/bdbag_fetch.sh Q-Y5F6_inputBag Q-Y5F6 + - pytest -m getData parseMetadata: stage: unit @@ -103,27 +212,23 @@ parseMetadata: - merge_requests - schedules script: - - singularity run 'docker://gudmaprbk/python3:1.0.0' python3 --version > version_python.txt - - rep=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p repRID) - - exp=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p expRID) - - study=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p studyRID) - - endsRaw=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsMeta) - - endsMeta="uk" - - endsManual="se" - - stranded=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p stranded) - - spike=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p spike) - - species=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p species) - - readLength=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p readLength) - - echo -e "${endsMeta},${endsRaw},${endsManual},${stranded},${spike},${species},${readLength},${exp},${study},${rep}" > design.csv - - pytest -m parseMetadata - artifacts: - name: "$CI_JOB_NAME" - when: always - paths: - - version_python.txt - expire_in: 7 days + - pythonImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep python | cut -d"/" -f2 | cut -d":" -f1) + - pythonVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep python | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${pythonImg}_${pythonVar}.sif + - rep=$(singularity run ${dir}${pythonImg}_${pythonVar}.sif python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p repRID) + - 
exp=$(singularity run ${dir}${pythonImg}_${pythonVar}.sif python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p expRID) + - study=$(singularity run ${dir}${pythonImg}_${pythonVar}.sif python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p studyRID) + - endsRaw=$(singularity run ${dir}${pythonImg}_${pythonVar}.sif python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsMeta) + - stranded=$(singularity run ${dir}${pythonImg}_${pythonVar}.sif python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p stranded) + - spike=$(singularity run ${dir}${pythonImg}_${pythonVar}.sif python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p spike) + - species=$(singularity run ${dir}${pythonImg}_${pythonVar}.sif python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p species) + - readLength=$(singularity run ${dir}${pythonImg}_${pythonVar}.sif python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p readLength) + - endsMeta="uk" + - endsManual="se" + - echo -e "${endsMeta},${endsRaw},${endsManual},${stranded},${spike},${species},${readLength},${exp},${study},${rep}" > design.csv + - pytest -m parseMetadata -inferMetadata: +fastqc: stage: unit only: - push @@ -132,23 +237,13 @@ inferMetadata: - merge_requests - schedules script: - - singularity run 'docker://gudmaprbk/rseqc4.0.0:1.0.0' infer_experiment.py --version > version_rseqc.txt - - > - align=$(echo $(grep "Overall alignment rate" ./test_data/meta/Q-Y5F6_1M.se.alignSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) && - if [[ ${align} == "" ]]; then exit 1; fi - - > - singularity run 'docker://gudmaprbk/rseqc4.0.0:1.0.0' infer_experiment.py -r "/project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/annotation/genome.bed" -i "./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam" 1>> Q-Y5F6_1M.se.inferMetadata.log && - ended=`singularity run 'gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/infer_meta.sh endness Q-Y5F6_1M.se.inferMetadata.log` && - if [[ ${ended} == "" ]]; then exit 1; fi - - pytest -m inferMetadata - artifacts: - name: "$CI_JOB_NAME" - when: always - paths: - - version_rseqc.txt - expire_in: 7 days + - fastqcImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep fastqc | cut -d"/" -f2 | cut -d":" -f1) + - fastqcVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep fastqc | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${fastqcImg}_${fastqcVar}.sif + - singularity run ${dir}${fastqcImg}_${fastqcVar}.sif fastqc ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o . 
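Every job repeats the same grep/cut pipeline to turn a nextflow.config container entry into an image name and tag. A hedged sketch of how that lookup could be collapsed into a single helper; the function name sif_path is illustrative only, not something the repository defines:

# sif_path <pattern>: print the cached .sif path for the first nextflow.config
# container whose name matches <pattern> (e.g. deriva, fastqc, hisat)
sif_path () {
  local image
  image=$(grep -oP "container = \K.*" nextflow.config | tr -d "'" | sort -u | grep "$1" | cut -d"/" -f2 | head -n 1)
  echo "${dir}${image%%:*}_${image##*:}.sif"
}
# usage: singularity run "$(sif_path fastqc)" fastqc --version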
+ - pytest -m fastqc -trimData: +seqwho: stage: unit only: - push @@ -157,20 +252,15 @@ trimData: - merge_requests - schedules script: - - singularity run 'docker://gudmaprbk/trimgalore0.6.5:1.0.0' trim_galore --version > version_trimgalore.txt - - singularity run 'docker://gudmaprbk/trimgalore0.6.5:1.0.0' trim_galore --gzip -q 25 --length 35 --basename Q-Y5F6_1M.se ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz - - singularity run 'docker://gudmaprbk/trimgalore0.6.5:1.0.0' trim_galore --gzip -q 25 --length 35 --paired --basename Q-Y5F6_1M.pe ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz - - readLengthSE=$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}') - - readLengthPE=$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}') - - pytest -m trimData - artifacts: - name: "$CI_JOB_NAME" - when: always - paths: - - version_trimgalore.txt - expire_in: 7 days + - seqwhoImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep seqwho | cut -d"/" -f2 | cut -d":" -f1) + - seqwhoVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep seqwho | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${seqwhoImg}_${seqwhoVar}.sif + - wget -O SeqWho.ix https://cloud.biohpc.swmed.edu/index.php/s/eeNWqZz8jqN5zWY/download + - mkdir -p SeqWho_call_plots/test_data/fastq/small/ + - singularity run ${dir}${seqwhoImg}_${seqwhoVar}.sif seqwho.py -f test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -x SeqWho.ix + - pytest -m seqwho -downsampleData: +trimData: stage: unit only: - push @@ -179,10 +269,16 @@ downsampleData: - merge_requests - schedules script: - - singularity run 'docker://gudmaprbk/seqtk1.3:1.0.0' seqtk sample -s100 ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz 1000 1> sampled.1.fq - - pytest -m downsampleData + - trimgaloreImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep trimgalore | cut -d"/" -f2 | cut -d":" -f1) + - trimgaloreVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep trimgalore | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${trimgaloreImg}_${trimgaloreVar}.sif + - singularity run ${dir}${trimgaloreImg}_${trimgaloreVar}.sif trim_galore --gzip -q 25 --length 35 --basename Q-Y5F6_1M.se ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz + - singularity run ${dir}${trimgaloreImg}_${trimgaloreVar}.sif trim_galore --gzip -q 25 --length 35 --paired --basename Q-Y5F6_1M.pe ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz + - readLengthSE=$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}') + - readLengthPE=$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}') + - pytest -m trimData -alignData: +downsampleData: stage: unit only: - push @@ -191,26 +287,13 @@ alignData: - merge_requests - schedules script: - - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' hisat2 --version > version_hisat2.txt - - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools --version > version_samtools.txt - - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.se.unal.gz -S 
Q-Y5F6_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz --summary-file Q-Y5F6_1M.se.alignSummary.txt --new-summary - - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam - - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.bam - - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.sorted.bam.bai - - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.pe.unal.gz -S Q-Y5F6_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./test_data/fastq/small/Q-Y5F6_1M.pe_val_1.fq.gz -2 ./test_data/fastq/small/Q-Y5F6_1M.pe_val_2.fq.gz --summary-file Q-Y5F6_1M.pe.alignSummary.txt --new-summary - - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.pe.bam Q-Y5F6_1M.pe.sam - - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.bam - - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools index -@ 20 -b Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.sorted.bam.bai - - pytest -m alignData - artifacts: - name: "$CI_JOB_NAME" - when: always - paths: - - version_hisat2.txt - - version_samtools.txt - expire_in: 7 days + - seqtkImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep seqtk | cut -d"/" -f2 | cut -d":" -f1) + - seqtkVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep seqtk | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${seqtkImg}_${seqtkVar}.sif + - singularity run ${dir}${seqtkImg}_${seqtkVar}.sif seqtk sample -s100 ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz 1000 1> sampled.1.fq + - pytest -m downsampleData -dedupData: +inferMetadata: stage: unit only: - push @@ -219,25 +302,18 @@ dedupData: - merge_requests - schedules script: - - singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' samtools --version > version_samtools.txt - - singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates --version 2> version_markdups.txt& - - singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5F6_1M.se.sorted.bam O=Q-Y5F6_1M.se.deduped.bam M=Q-Y5F6_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true - - singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam ./test_data/bam/small/Q-Y5F6_1M.se.deduped.bam - - singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' samtools index -@ 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.sorted.deduped.bam.bai - - > - for i in {"chr8","chr4","chrY"}; do - echo "samtools view -b Q-Y5F6_1M.se.sorted.deduped.bam ${i} > Q-Y5F6_1M.se.sorted.deduped.${i}.bam; samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.deduped.${i}.bam Q-Y5F6_1M.se.sorted.deduped.${i}.bam.bai;"; - done | singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' parallel -j 20 -k - - pytest -m dedupData - artifacts: - name: "$CI_JOB_NAME" - when: always - paths: - - version_markdups.txt - - version_samtools.txt - 
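The trimData job above reports the post-trim read length as a median over the FASTQ; the same awk one-liner, shown standalone with the steps spelled out:

# median read length of a gzipped FASTQ
# NR%4==2 keeps only sequence lines; sort -n orders the lengths;
# the final awk prints the middle value (or the mean of the two middle values)
zcat *_trimmed.fq.gz \
  | awk '{if(NR%4==2) print length($1)}' \
  | sort -n \
  | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}'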
expire_in: 7 days - -countData: + - rseqcImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep rseqc | cut -d"/" -f2 | cut -d":" -f1) + - rseqcVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep rseqc | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${rseqcImg}_${rseqcVar}.sif + - > + align=$(echo $(grep "Overall alignment rate" ./test_data/meta/Q-Y5F6_1M.se.alignSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) && + if [[ ${align} == "" ]]; then exit 1; fi + - singularity run ${dir}${rseqcImg}_${rseqcVar}.sif infer_experiment.py -r "/project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/annotation/genome.bed" -i "./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam" 1>> Q-Y5F6_1M.se.inferMetadata.log && + - ended=`singularity run ${dir}${rseqcImg}_${rseqcVar}.sif python3 ./workflow/scripts/infer_meta.sh endness Q-Y5F6_1M.se.inferMetadata.log` && + if [[ ${ended} == "" ]]; then exit 1; fi + - pytest -m inferMetadata + +alignData: stage: unit only: - push @@ -246,24 +322,20 @@ countData: - merge_requests - schedules script: - - ln -s /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/metadata/geneID.tsv - - ln -s /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/metadata/Entrez.tsv - - singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/annotation/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/sequence/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se_countData -s 1 -R SAM --primary --ignoreDup ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam - - singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' Rscript ./workflow/scripts/calculateTPM.R --count ./test_data/counts/small/Q-Y5F6_1M.se_countData - - singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' Rscript ./workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se - - assignedReads=$(grep -m 1 'Assigned' *.summary | grep -oe '\([0-9.]*\)') - - singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' featureCounts -v &> version_featurecounts.txt - - singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' R --version > version_r.txt - - pytest -m makeFeatureCounts - artifacts: - name: "$CI_JOB_NAME" - when: always - paths: - - version_featurecounts.txt - - version_r.txt - expire_in: 7 days + - hisatImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep hisat | cut -d"/" -f2 | cut -d":" -f1) + - hisatVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep hisat | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${hisatImg}_${hisatVar}.sif + - singularity run ${dir}${hisatImg}_${hisatVar}.sif hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.se.unal.gz -S Q-Y5F6_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz --summary-file Q-Y5F6_1M.se.alignSummary.txt --new-summary + - singularity run ${dir}${hisatImg}_${hisatVar}.sif samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam + - singularity run ${dir}${hisatImg}_${hisatVar}.sif samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.bam + - singularity run ${dir}${hisatImg}_${hisatVar}.sif samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.sorted.bam.bai + - 
singularity run ${dir}${hisatImg}_${hisatVar}.sif hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.pe.unal.gz -S Q-Y5F6_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./test_data/fastq/small/Q-Y5F6_1M.pe_val_1.fq.gz -2 ./test_data/fastq/small/Q-Y5F6_1M.pe_val_2.fq.gz --summary-file Q-Y5F6_1M.pe.alignSummary.txt --new-summary + - singularity run ${dir}${hisatImg}_${hisatVar}.sif samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.pe.bam Q-Y5F6_1M.pe.sam + - singularity run ${dir}${hisatImg}_${hisatVar}.sif samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.bam + - singularity run ${dir}${hisatImg}_${hisatVar}.sif samtools index -@ 20 -b Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.sorted.bam.bai + - pytest -m alignData -makeBigWig: +dedupData: stage: unit only: - push @@ -272,17 +344,18 @@ makeBigWig: - merge_requests - schedules script: - - singularity run 'docker://gudmaprbk/deeptools3.5.0:1.0.0' deeptools --version > version_deeptools.txt - - singularity run 'docker://gudmaprbk/deeptools3.5.0:1.0.0' bamCoverage -p 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -o Q-Y5F6_1M.se.bw - - pytest -m makeBigWig - artifacts: - name: "$CI_JOB_NAME" - when: always - paths: - - version_deeptools.txt - expire_in: 7 days + - picardImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep picard | cut -d"/" -f2 | cut -d":" -f1) + - picardVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep picard | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${picardImg}_${picardVar}.sif + - singularity run ${dir}${picardImg}_${picardVar}.sif java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5F6_1M.se.sorted.bam O=Q-Y5F6_1M.se.deduped.bam M=Q-Y5F6_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true + - singularity run ${dir}${picardImg}_${picardVar}.sif samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam ./test_data/bam/small/Q-Y5F6_1M.se.deduped.bam + - singularity run ${dir}${picardImg}_${picardVar}.sif samtools index -@ 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.sorted.deduped.bam.bai + - for i in {"chr8","chr4","chrY"}; do + echo "samtools view -b Q-Y5F6_1M.se.sorted.deduped.bam ${i} > Q-Y5F6_1M.se.sorted.deduped.${i}.bam; samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.deduped.${i}.bam Q-Y5F6_1M.se.sorted.deduped.${i}.bam.bai;"; + done | singularity run ${dir}${picardImg}_${picardVar}.sif parallel -j 20 -k + - pytest -m dedupData -fastqc: +countData: stage: unit only: - push @@ -291,60 +364,51 @@ fastqc: - merge_requests - schedules script: - - singularity run 'docker://gudmaprbk/fastqc0.11.9:1.0.0' fastqc --version > version_fastqc.txt - - singularity run 'docker://gudmaprbk/fastqc0.11.9:1.0.0' fastqc ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o . 
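For reference, the samtools view calls in alignData above filter reads by SAM flag: -F 4 excludes unmapped reads, -F 8 excludes reads whose mate is unmapped, and -F 256 excludes secondary alignments, leaving only primary alignments of mapped reads. The same call with the options commented:

# -1      fast BAM compression
# -@ 20   use 20 threads
# -F 4    exclude unmapped reads
# -F 8    exclude reads whose mate is unmapped
# -F 256  exclude secondary (non-primary) alignments
samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam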
- - pytest -m fastqc - artifacts: - name: "$CI_JOB_NAME" - when: always - paths: - - version_fastqc.txt - expire_in: 7 days + - ln -s /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/metadata/geneID.tsv + - ln -s /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/metadata/Entrez.tsv + - subreadImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep subread | cut -d"/" -f2 | cut -d":" -f1) + - subreadVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep subread | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${subreadImg}_${subreadVar}.sif + - singularity run ${dir}${subreadImg}_${subreadVar}.sif featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/annotation/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/sequence/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se_countData -s 1 -R SAM --primary --ignoreDup ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam + - singularity run ${dir}${subreadImg}_${subreadVar}.sif Rscript ./workflow/scripts/calculateTPM.R --count ./test_data/counts/small/Q-Y5F6_1M.se_countData + - singularity run ${dir}${subreadImg}_${subreadVar}.sif Rscript ./workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se + - assignedReads=$(grep -m 1 'Assigned' *.summary | grep -oe '\([0-9.]*\)') + - pytest -m makeFeatureCounts - -dataQC: +makeBigWig: stage: unit only: - push - tags except: - merge_requests + - schedules script: - - echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > Q-Y5F6_1M.se.sorted.deduped.tin.xls - - > - for i in {"chr8","chr4","chrY"}; do - echo "tin.py -i ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.${i}.bam -r /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/annotation/genome.bed; cat Q-Y5F6_1M.se.sorted.deduped.${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \"\\t${i}\\t\";" - done | singularity run 'docker://gudmaprbk/rseqc4.0.0:1.0.0' parallel -j 20 -k >> Q-Y5F6_1M.se.sorted.deduped.tin.xls - - pytest -m dataQC + - deeptoolsImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deeptools | cut -d"/" -f2 | cut -d":" -f1) + - deeptoolsVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deeptools | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${deeptoolsImg}_${deeptoolsVar}.sif + - singularity run ${dir}${deeptoolsImg}_${deeptoolsVar}.sif bamCoverage -p 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -o Q-Y5F6_1M.se.bw + - pytest -m makeBigWig -uploadInputBag: +dataQC: stage: unit only: - push - tags except: - merge_requests - - schedules script: - - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json - - echo THIS IS A TEST FILE > test.txt - - > - md5=$(md5sum ./test.txt | awk '{ print $1 }') && - size=$(wc -c < ./test.txt) && - exist=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Input_Bag/File_MD5=${md5}) && - if [ "${exist}" == "[]" ]; then - cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') && - cookie=${cookie:11:-1} && - loc=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-hatrac-cli --host staging.gudmap.org put ./test.txt /hatrac/resources/rnaseq/pipeline/input_bag/TEST/test.txt --parents) && - rid=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' python3 
./workflow/scripts/upload_input_bag.py -f test.txt -l ${loc} -s ${md5} -b ${size} -n 'This is a test input bag' -o staging.gudmap.org -c ${cookie}) && - echo ${rid} test input bag created - else - rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT') && - rid=${rid:8:-6} && - echo ${rid} test input bag already exists - fi + - rseqcImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep rseqc | cut -d"/" -f2 | cut -d":" -f1) + - rseqcVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep rseqc | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${rseqcImg}_${rseqcVar}.sif + - echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > Q-Y5F6_1M.se.sorted.deduped.tin.xls + - > + for i in {"chr8","chr4","chrY"}; do + echo "tin.py -i ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.${i}.bam -r /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/annotation/genome.bed; cat Q-Y5F6_1M.se.sorted.deduped.${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \"\\t${i}\\t\";" + done | singularity run ${dir}${rseqcImg}_${rseqcVar}.sif parallel -j 20 -k >> Q-Y5F6_1M.se.sorted.deduped.tin.xls + - pytest -m dataQC -uploadExecutionRun: +uploadInputBag: stage: unit only: - push @@ -353,22 +417,28 @@ uploadExecutionRun: - merge_requests - schedules script: - - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json - - > - exist=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Execution_Run/Replicate=17-BTFJ) && - cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') && - cookie=${cookie:11:-1} && - if [ "${exist}" == "[]" ]; then - rid=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' python3 ./workflow/scripts/upload_execution_run.py -r 17-BTFJ -w 17-BV2Y -g 17-BV90 -i 17-BTFT -s Success -d 'This is a test execution run' -o staging.gudmap.org -c ${cookie} -u F) && - echo ${rid} test execution run created - else - rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT') && - rid=${rid:7:-6} && - rid=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' python3 ./workflow/scripts/upload_execution_run.py -r 17-BTFJ -w 17-BV2Y -g 17-BV90 -i 17-BTFT -s Success -d 'This is a test execution run' -o staging.gudmap.org -c ${cookie} -u ${rid}) && - echo ${rid} test execution run already exists - fi + - derivaImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f1) + - derivaVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${derivaImg}_${derivaVar}.sif + - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json + - echo THIS IS A TEST FILE > test.txt + - md5=$(md5sum ./test.txt | awk '{ print $1 }') && + - size=$(wc -c < ./test.txt) && + - > + exist=$(singularity run ${dir}${derivaImg}_${derivaVar}.sif curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Input_Bag/File_MD5=${md5}) && + if [ "${exist}" == "[]" ]; then + cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') && + cookie=${cookie:11:-1} && + loc=$(singularity run ${dir}${derivaImg}_${derivaVar}.sif deriva-hatrac-cli --host staging.gudmap.org put ./test.txt /hatrac/resources/rnaseq/pipeline/input_bag/TEST/test.txt --parents) && + rid=$(singularity run ${dir}${derivaImg}_${derivaVar}.sif python3 
./workflow/scripts/upload_input_bag.py -f test.txt -l ${loc} -s ${md5} -b ${size} -n 'This is a test input bag' -o staging.gudmap.org -c ${cookie}) && + echo ${rid} test input bag created + else + rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT') && + rid=${rid:8:-6} && + echo ${rid} test input bag already exists + fi -uploadQC: +uploadExecutionRun: stage: unit only: - push @@ -377,22 +447,25 @@ uploadQC: - merge_requests - schedules script: - - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json - - > - exist=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:mRNA_QC/Replicate=17-BTFJ) && - cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') && - cookie=${cookie:11:-1} && - if [ "${exist}" != "[]" ]; then - rids=$(echo ${exist} | grep -o '\"RID\":\".\{7\}' | sed 's/^.\{7\}//') && - for rid in ${rids}; do - singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' python3 ./workflow/scripts/delete_entry.py -r ${rid} -t mRNA_QC -o staging.gudmap.org -c ${cookie} - done - echo all old mRNA QC RIDs deleted - fi - rid=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' python3 ./workflow/scripts/upload_qc.py -r 17-BTFJ -e 17-BVDJ -p "Single End" -s forward -l 35 -w 5 -f 1 -t 1 -n "This is a test mRNA QC" -o staging.gudmap.org -c ${cookie} -u F) - echo ${rid} test mRNA QC created + - derivaImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f1) + - derivaVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${derivaImg}_${derivaVar}.sif + - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json + - > + exist=$(singularity run ${dir}${derivaImg}_${derivaVar}.sif curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Execution_Run/Replicate=17-BTFJ) && + cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') && + cookie=${cookie:11:-1} && + if [ "${exist}" == "[]" ]; then + rid=$(singularity run ${dir}${derivaImg}_${derivaVar}.sif python3 ./workflow/scripts/upload_execution_run.py -r 17-BTFJ -w 17-BV2Y -g 17-BV90 -i 17-BTFT -s Success -d 'This is a test execution run' -o staging.gudmap.org -c ${cookie} -u F) && + echo ${rid} test execution run created + else + rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT') && + rid=${rid:7:-6} && + rid=$(singularity run ${dir}${derivaImg}_${derivaVar}.sif python3 ./workflow/scripts/upload_execution_run.py -r 17-BTFJ -w 17-BV2Y -g 17-BV90 -i 17-BTFT -s Success -d 'This is a test execution run' -o staging.gudmap.org -c ${cookie} -u ${rid}) && + echo ${rid} test execution run already exists + fi -uploadProcessedFile: +uploadQC: stage: unit only: - push @@ -401,29 +474,25 @@ uploadProcessedFile: - merge_requests - schedules script: - - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json - - echo THIS IS A TEST FILE > 17-BTFJ_test.csv - - mkdir -p ./deriva/Seq/pipeline/17-BTFE/17-BVDJ/ - - mv 17-BTFJ_test.csv ./deriva/Seq/pipeline/17-BTFE/17-BVDJ/17-BTFJ_test.csv - - > - exist=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Processed_File/Replicate=17-BTFJ) && - cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') && - cookie=${cookie:11:-1} && - 
if [ "${exist}" != "[]" ]; then - rids=$(echo ${exist} | grep -o '\"RID\":\".\{7\}' | sed 's/^.\{7\}//') && - for rid in ${rids}; do - singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' python3 ./workflow/scripts/delete_entry.py -r ${rid} -t Processed_File -o staging.gudmap.org -c ${cookie} - done - echo all old processed file RIDs deleted - fi - singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-upload-cli --catalog 2 --token ${cookie:9} staging.gudmap.org ./deriva - echo test processed file uploaded - - mkdir test - - singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' bdbag test --archiver zip - - echo test output bag created - - pytest -m outputBag + - derivaImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f1) + - derivaVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${derivaImg}_${derivaVar}.sif + - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json + - > + exist=$(singularity run ${dir}${derivaImg}_${derivaVar}.sif curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:mRNA_QC/Replicate=17-BTFJ) && + cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') && + cookie=${cookie:11:-1} && + if [ "${exist}" != "[]" ]; then + rids=$(echo ${exist} | grep -o '\"RID\":\".\{7\}' | sed 's/^.\{7\}//') && + for rid in ${rids}; do + singularity run ${dir}${derivaImg}_${derivaVar}.sif python3 ./workflow/scripts/delete_entry.py -r ${rid} -t mRNA_QC -o staging.gudmap.org -c ${cookie} + done + echo all old mRNA QC RIDs deleted + fi + rid=$(singularity run ${dir}${derivaImg}_${derivaVar}.sif python3 ./workflow/scripts/upload_qc.py -r 17-BTFJ -e 17-BVDJ -p "Single End" -s forward -l 35 -w 5 -f 1 -t 1 -n "This is a test mRNA QC" -o staging.gudmap.org -c ${cookie} -u F) + echo ${rid} test mRNA QC created -uploadOutputBag: +uploadProcessedFile: stage: unit only: - push @@ -432,27 +501,33 @@ uploadOutputBag: - merge_requests - schedules script: - - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json - - echo THIS IS A TEST FILE > test.txt - - > - md5=$(md5sum ./test.txt | awk '{ print $1 }') && - size=$(wc -c < ./test.txt) && - exist=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Output_Bag/File_MD5=${md5}) && - if [ "${exist}" == "[]" ]; then + - derivaImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f1) + - derivaVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${derivaImg}_${derivaVar}.sif + - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json + - echo THIS IS A TEST FILE > 17-BTFJ_test.csv + - mkdir -p ./deriva/Seq/pipeline/17-BTFE/17-BVDJ/ + - mv 17-BTFJ_test.csv ./deriva/Seq/pipeline/17-BTFE/17-BVDJ/17-BTFJ_test.csv + - > + exist=$(singularity run ${dir}${derivaImg}_${derivaVar}.sif curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Processed_File/Replicate=17-BTFJ) && cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') && cookie=${cookie:11:-1} && - loc=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-hatrac-cli --host staging.gudmap.org put ./test.txt 
/hatrac/resources/rnaseq/pipeline/output_bag/TEST/test.txt --parents) && - rid=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' python3 ./workflow/scripts/upload_output_bag.py -e 17-BVDJ -f test.txt -l ${loc} -s ${md5} -b ${size} -n 'This is a test output bag' -o staging.gudmap.org -c ${cookie}) && - echo ${rid} test output bag created - else - rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT') && - rid=${rid:8:-6} && - echo ${rid} test output bag already exists - fi - + if [ "${exist}" != "[]" ]; then + rids=$(echo ${exist} | grep -o '\"RID\":\".\{7\}' | sed 's/^.\{7\}//') && + for rid in ${rids}; do + singularity run ${dir}${derivaImg}_${derivaVar}.sif python3 ./workflow/scripts/delete_entry.py -r ${rid} -t Processed_File -o staging.gudmap.org -c ${cookie} + done + echo all old processed file RIDs deleted + fi + singularity run ${dir}${derivaImg}_${derivaVar}.sif deriva-upload-cli --catalog 2 --token ${cookie:9} staging.gudmap.org ./deriva + echo test processed file uploaded + - mkdir test + - singularity run ${dir}${derivaImg}_${derivaVar}.sif bdbag test --archiver zip + - echo test output bag created + - pytest -m outputBag -generateVersions: - stage: aggregation +uploadOutputBag: + stage: unit only: - push - tags @@ -460,16 +535,26 @@ generateVersions: - merge_requests - schedules script: - - singularity run 'docker://gudmaprbk/multiqc1.9:1.0.0' multiqc --version > version_multiqc.txt - - python ./workflow/scripts/generate_versions.py -o software_versions - - python ./workflow/scripts/generate_references.py -r ./docs/references.md -o software_references - artifacts: - name: "$CI_JOB_NAME" - when: always - paths: - - software_references_mqc.yaml - - software_versions_mqc.yaml - expire_in: 7 days + - derivaImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f1) + - derivaVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${derivaImg}_${derivaVar}.sif + - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json + - echo THIS IS A TEST FILE > test.txt + - > + md5=$(md5sum ./test.txt | awk '{ print $1 }') && + size=$(wc -c < ./test.txt) && + exist=$(singularity run ${dir}${derivaImg}_${derivaVar}.sif curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Output_Bag/File_MD5=${md5}) && + if [ "${exist}" == "[]" ]; then + cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') && + cookie=${cookie:11:-1} && + loc=$(singularity run ${dir}${derivaImg}_${derivaVar}.sif deriva-hatrac-cli --host staging.gudmap.org put ./test.txt /hatrac/resources/rnaseq/pipeline/output_bag/TEST/test.txt --parents) && + rid=$(singularity run ${dir}${derivaImg}_${derivaVar}.sif python3 ./workflow/scripts/upload_output_bag.py -e 17-BVDJ -f test.txt -l ${loc} -s ${md5} -b ${size} -n 'This is a test output bag' -o staging.gudmap.org -c ${cookie}) && + echo ${rid} test output bag created + else + rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT') && + rid=${rid:8:-6} && + echo ${rid} test output bag already exists + fi human_BioHPC: @@ -481,8 +566,8 @@ human_BioHPC: - merge_requests - schedules script: - - mkdir -p hu - - cp -R /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/hisat2 ./hu/ + - mkdir -p hu + - cp -R /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/hisat2 ./hu/ mouse_BioHPC: stage: reference @@ -493,8 +578,8 @@ 
mouse_BioHPC: - merge_requests - schedules script: - - mkdir -p mo - - cp -R /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/hisat2 ./mo/ + - mkdir -p mo + - cp -R /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/hisat2 ./mo/ human_dev: stage: reference @@ -505,22 +590,25 @@ human_dev: - merge_requests - schedules script: - - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt - - referenceBase=dev.gudmap.org - - refName=GRCh - - references=$(echo ${referenceBase}/${refName}${refHuVersion}) - - GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1) - - GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2) - - GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f3) - - query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE}/Used_Spike_Ins=false) - - curl --request GET ${query} > refQuery.json - - refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL) - - loc=$(dirname ${refURL}) - - if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi - - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') - - test=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/) - - test=$(echo ${test} | grep -o ${filename}) - - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi + - derivaImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f1) + - derivaVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${derivaImg}_${derivaVar}.sif + - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt + - referenceBase=dev.gudmap.org + - refName=GRCh + - references=$(echo ${referenceBase}/${refName}${refHuVersion}) + - GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1) + - GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2) + - GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f3) + - query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE}/Used_Spike_Ins=false) + - curl --request GET ${query} > refQuery.json + - refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL) + - loc=$(dirname ${refURL}) + - if [ "${loc}" == "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi + - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') + - test=$(singularity run ${dir}${derivaImg}_${derivaVar}.sif deriva-hatrac-cli --host ${referenceBase} ls ${loc}/) + - test=$(echo ${test} | grep -o ${filename}) + - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi mouse_dev: stage: reference @@ -531,22 +619,25 @@ mouse_dev: - merge_requests - schedules script: - - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt - - referenceBase=dev.gudmap.org - - refName=GRCm - - references=$(echo ${referenceBase}/${refName}${refMoVersion}) - - GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1) - - GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2) - - GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' 
-f3) - - query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE}/Used_Spike_Ins=false) - - curl --request GET ${query} > refQuery.json - - refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL) - - loc=$(dirname ${refURL}) - - if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi - - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') - - test=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/) - - test=$(echo ${test} | grep -o ${filename}) - - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi + - derivaImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f1) + - derivaVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${derivaImg}_${derivaVar}.sif + - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt + - referenceBase=dev.gudmap.org + - refName=GRCm + - references=$(echo ${referenceBase}/${refName}${refMoVersion}) + - GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1) + - GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2) + - GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f3) + - query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE}/Used_Spike_Ins=false) + - curl --request GET ${query} > refQuery.json + - refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL) + - loc=$(dirname ${refURL}) + - if [ "${loc}" == "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi + - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') + - test=$(singularity run ${dir}${derivaImg}_${derivaVar}.sif deriva-hatrac-cli --host ${referenceBase} ls ${loc}/) + - test=$(echo ${test} | grep -o ${filename}) + - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi human_staging: stage: reference @@ -557,22 +648,25 @@ human_staging: - merge_requests - schedules script: - - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt - - referenceBase=staging.gudmap.org - - refName=GRCh - - references=$(echo ${referenceBase}/${refName}${refHuVersion}) - - GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1) - - GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2) - - GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' 
-f3) - - query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE}/Used_Spike_Ins=false) - - curl --request GET ${query} > refQuery.json - - refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL) - - loc=$(dirname ${refURL}) - - if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi - - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') - - test=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/) - - test=$(echo ${test} | grep -o ${filename}) - - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi + - derivaImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f1) + - derivaVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${derivaImg}_${derivaVar}.sif + - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt + - referenceBase=staging.gudmap.org + - refName=GRCh + - references=$(echo ${referenceBase}/${refName}${refHuVersion}) + - GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1) + - GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2) + - GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f3) + - query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE}/Used_Spike_Ins=false) + - curl --request GET ${query} > refQuery.json + - refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL) + - loc=$(dirname ${refURL}) + - if [ "${loc}" == "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi + - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') + - test=$(singularity run ${dir}${derivaImg}_${derivaVar}.sif deriva-hatrac-cli --host ${referenceBase} ls ${loc}/) + - test=$(echo ${test} | grep -o ${filename}) + - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi mouse_staging: stage: reference @@ -583,23 +677,26 @@ mouse_staging: - merge_requests - schedules script: - - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt - - referenceBase=staging.gudmap.org - - refName=GRCm - - refHuVersion=38.p6.vM22 - - references=$(echo ${referenceBase}/${refName}${refMoVersion}) - - GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1) - - GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2) - - GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' 
-f3) - - query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE}/Used_Spike_Ins=false) - - curl --request GET ${query} > refQuery.json - - refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL) - - loc=$(dirname ${refURL}) - - if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi - - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') - - test=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/) - - test=$(echo ${test} | grep -o ${filename}) - - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi + - derivaImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f1) + - derivaVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${derivaImg}_${derivaVar}.sif + - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt + - referenceBase=staging.gudmap.org + - refName=GRCm + - refHuVersion=38.p6.vM22 + - references=$(echo ${referenceBase}/${refName}${refMoVersion}) + - GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1) + - GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2) + - GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f3) + - query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE}/Used_Spike_Ins=false) + - curl --request GET ${query} > refQuery.json + - refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL) + - loc=$(dirname ${refURL}) + - if [ "${loc}" == "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi + - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') + - test=$(singularity run ${dir}${derivaImg}_${derivaVar}.sif deriva-hatrac-cli --host ${referenceBase} ls ${loc}/) + - test=$(echo ${test} | grep -o ${filename}) + - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi human_prod: stage: reference @@ -610,22 +707,25 @@ human_prod: - merge_requests - schedules script: - - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt - - referenceBase=www.gudmap.org - - refName=GRCh - - references=$(echo ${referenceBase}/${refName}${refHuVersion}) - - GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1) - - GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2) - - GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' 
-f3) - - query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE}/Used_Spike_Ins=false) - - curl --request GET ${query} > refQuery.json - - refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL) - - loc=$(dirname ${refURL}) - - if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi - - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') - - test=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/) - - test=$(echo ${test} | grep -o ${filename}) - - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi + - derivaImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f1) + - derivaVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${derivaImg}_${derivaVar}.sif + - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt + - referenceBase=www.gudmap.org + - refName=GRCh + - references=$(echo ${referenceBase}/${refName}${refHuVersion}) + - GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1) + - GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2) + - GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f3) + - query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE}/Used_Spike_Ins=false) + - curl --request GET ${query} > refQuery.json + - refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL) + - loc=$(dirname ${refURL}) + - if [ "${loc}" == "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi + - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') + - test=$(singularity run ${dir}${derivaImg}_${derivaVar}.sif deriva-hatrac-cli --host ${referenceBase} ls ${loc}/) + - test=$(echo ${test} | grep -o ${filename}) + - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi mouse_prod: stage: reference @@ -636,23 +736,26 @@ mouse_prod: - merge_requests - schedules script: - - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt - - referenceBase=www.gudmap.org - - refName=GRCm - - refHuVersion=38.p6.vM22 - - references=$(echo ${referenceBase}/${refName}${refMoVersion}) - - GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1) - - GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2) - - GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' 
-f3) - - query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE}/Used_Spike_Ins=false) - - curl --request GET ${query} > refQuery.json - - refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL) - - loc=$(dirname ${refURL}) - - if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi - - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') - - test=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/) - - test=$(echo ${test} | grep -o ${filename}) - - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi + - derivaImg=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f1) + - derivaVar=$(cat nextflow.config | grep -oP "container = \K.*" | tr -d "'" | sort | uniq | grep deriva | cut -d"/" -f2 | cut -d":" -f2) + - echo ${dir}${derivaImg}_${derivaVar}.sif + - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt + - referenceBase=www.gudmap.org + - refName=GRCm + - refHuVersion=38.p6.vM22 + - references=$(echo ${referenceBase}/${refName}${refMoVersion}) + - GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1) + - GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2) + - GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f3) + - query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE}/Used_Spike_Ins=false) + - curl --request GET ${query} > refQuery.json + - refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL) + - loc=$(dirname ${refURL}) + - if [ "${loc}" == "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi + - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') + - test=$(singularity run ${dir}${derivaImg}_${derivaVar}.sif deriva-hatrac-cli --host ${referenceBase} ls ${loc}/) + - test=$(echo ${test} | grep -o ${filename}) + - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi integration_se: @@ -662,11 +765,11 @@ integration_se: variables: - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - - hostname - - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 16-1ZX4 --source staging --upload true -with-dag dag.png --dev false --ci true --track true -with-report ./SE_report.html - - find . -type f -name "multiqc_data.json" -exec cp {} ./SE_multiqc_data.json \; - - pytest -m completionMultiqc --filename SE_multiqc_data.json + - hostname + - ulimit -a + - nextflow -q run ./rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 16-1ZX4 --source staging --refSource datahub --upload true -with-dag dag.png --dev false --ci true --track true -with-report ./SE_report.html + - find . 
-type f -name "multiqc_data.json" -exec cp {} ./SE_multiqc_data.json \; + - pytest -m completionMultiqc --filename SE_multiqc_data.json artifacts: name: "$CI_JOB_NAME" when: always @@ -687,11 +790,11 @@ integration_pe: variables: - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - - hostname - - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5JA --source staging --upload true -with-dag dag.png --dev false --ci true --track true -with-report ./PE_report.html - - find . -type f -name "multiqc_data.json" -exec cp {} ./PE_multiqc_data.json \; - - pytest -m completionMultiqc --filename PE_multiqc_data.json + - hostname + - ulimit -a + - nextflow -q run ./rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5JA --source staging --upload true -with-dag dag.png --dev false --ci true --track true -with-report ./PE_report.html + - find . -type f -name "multiqc_data.json" -exec cp {} ./PE_multiqc_data.json \; + - pytest -m completionMultiqc --filename PE_multiqc_data.json artifacts: name: "$CI_JOB_NAME" when: always @@ -707,76 +810,76 @@ integration_pe: - always -failAmbiguousSpecies: +failTrunkation: stage: integration only: [merge_requests] except: variables: - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - - hostname - - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ER --source staging --upload true -with-dag dag.png --dev false --ci true --track true -with-report ./failAmbiguousSpecies_report.html + - hostname + - ulimit -a + - nextflow -q run ./rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ET --source staging --upload true -with-dag dag.png --dev false --ci true --track true -with-report ./failTrunkation_report.html retry: max: 0 when: - always -failTrunkation: +failMismatchR1R2: stage: integration only: [merge_requests] except: variables: - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - - hostname - - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ET --source staging --upload true -with-dag dag.png --dev false --ci true --track true -with-report ./failTrunkation_report.html + - hostname + - ulimit -a + - nextflow -q run ./rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 16-CWH4 --source staging --upload true -with-dag dag.png --dev false --ci true --track true -with-report ./failMismatchR1R2_report.html retry: max: 0 when: - always -failMismatchR1R2: +failUnexpectedMeta: stage: integration only: [merge_requests] except: variables: - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - - hostname - - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 16-CWH4 --source staging --upload true -with-dag dag.png --dev false --ci true --track true -with-report ./failMismatchR1R2_report.html + - hostname + - ulimit -a + - nextflow -q run ./rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 14-3R4R --source staging --upload true -with-dag dag.png --dev false --ci true --track true -with-report ./failUnexpectedMeta_report.html retry: max: 0 when: - always -failUnexpectedMeta: 
+failFileStructure: stage: integration only: [merge_requests] except: variables: - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - - hostname - - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 14-3R4R --source staging --upload true -with-dag dag.png --dev false --ci true --track true -with-report ./failUnexpectedMeta_report.html + - hostname + - ulimit -a + - nextflow -q run ./rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5HT --source staging --upload true -with-dag dag.png --dev false --ci true --track true -with-report ./failFileStructure_report.html retry: max: 0 when: - always -failFileStructure: +failSeqType: stage: integration only: [merge_requests] except: variables: - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - - hostname - - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5HT --source staging --upload true -with-dag dag.png --dev false --ci true --track true -with-report ./failFileStructure_report.html + - hostname + - ulimit -a + - nextflow -q run ./rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 16-DNDJ --source staging --upload true -with-dag dag.png --dev false --ci true --track true -with-report ./failSeqType_report.html retry: max: 0 when: @@ -789,12 +892,12 @@ override_inputBag: variables: - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - - hostname - - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --source staging --inputBagForce ./test_data/bag/Q-Y5F6_inputBag_xxxxxxxx.zip --upload false --dev false --ci true --track false -with-report ./inputBagOverride_report.html - - find . -type f -name "multiqc_data.json" -exec cp {} ./inputBagOverride_multiqc_data.json \; - - find ./**/report/ -type f -name "*multiqc.html" -exec cp {} ./inputBagOverride_multiqc.html \; - - pytest -m completionMultiqc --filename inputBagOverride_multiqc_data.json + - hostname + - ulimit -a + - nextflow -q run ./rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --source staging --inputBagForce ./test_data/bag/Q-Y5F6_inputBag_xxxxxxxx.zip --upload false --dev false --ci true --track false -with-report ./inputBagOverride_report.html + - find . -type f -name "multiqc_data.json" -exec cp {} ./inputBagOverride_multiqc_data.json \; + - find ./**/report/ -type f -name "*multiqc.html" -exec cp {} ./inputBagOverride_multiqc.html \; + - pytest -m completionMultiqc --filename inputBagOverride_multiqc_data.json artifacts: name: "$CI_JOB_NAME" when: always @@ -814,12 +917,12 @@ override_fastq: variables: - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - - hostname - - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --source staging --fastqsForce './test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz' --upload false --dev false --ci true --track false -with-report ./fastqOverride_report.html - - find . 
-type f -name "multiqc_data.json" -exec cp {} ./fastqOverride_multiqc_data.json \; - - find ./**/report/ -type f -name "*multiqc.html" -exec cp {} ./fastqOverride_multiqc.html \; - - pytest -m completionMultiqc --filename fastqOverride_multiqc_data.json + - hostname + - ulimit -a + - nextflow -q run ./rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --source staging --fastqsForce './test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz' --upload false --dev false --ci true --track false -with-report ./fastqOverride_report.html + - find . -type f -name "multiqc_data.json" -exec cp {} ./fastqOverride_multiqc_data.json \; + - find ./**/report/ -type f -name "*multiqc.html" -exec cp {} ./fastqOverride_multiqc.html \; + - pytest -m completionMultiqc --filename fastqOverride_multiqc_data.json artifacts: name: "$CI_JOB_NAME" when: always @@ -839,12 +942,12 @@ override_species: variables: - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - - hostname - - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5EW --source staging --speciesForce 'Homo sapiens' --upload true --dev false --ci true --track false -with-report ./speciesOverride_report.html - - find . -type f -name "multiqc_data.json" -exec cp {} ./speciesOverride_multiqc_data.json \; - - find ./**/report/ -type f -name "*multiqc.html" -exec cp {} ./speciesOverride_multiqc.html \; - - pytest -m completionMultiqc --filename speciesOverride_multiqc_data.json + - hostname + - ulimit -a + - nextflow -q run ./rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5EW --source staging --speciesForce 'Homo sapiens' --upload true --dev false --ci true --track false -with-report ./speciesOverride_report.html + - find . -type f -name "multiqc_data.json" -exec cp {} ./speciesOverride_multiqc_data.json \; + - find ./**/report/ -type f -name "*multiqc.html" -exec cp {} ./speciesOverride_multiqc.html \; + - pytest -m completionMultiqc --filename speciesOverride_multiqc_data.json artifacts: name: "$CI_JOB_NAME" when: always @@ -864,12 +967,12 @@ override_stranded: variables: - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - - hostname - - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5EY --source staging --strandedForce unstranded --upload true --dev false --ci true --track false -with-report ./strandedOverride_report.html - - find . -type f -name "multiqc_data.json" -exec cp {} ./strandedOverride_multiqc_data.json \; - - find ./**/report/ -type f -name "*multiqc.html" -exec cp {} ./strandedOverride_multiqc.html \; - - pytest -m completionMultiqc --filename strandedOverride_multiqc_data.json + - hostname + - ulimit -a + - nextflow -q run ./rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5EY --source staging --strandedForce unstranded --upload true --dev false --ci true --track false -with-report ./strandedOverride_report.html + - find . 
-type f -name "multiqc_data.json" -exec cp {} ./strandedOverride_multiqc_data.json \; + - find ./**/report/ -type f -name "*multiqc.html" -exec cp {} ./strandedOverride_multiqc.html \; + - pytest -m completionMultiqc --filename strandedOverride_multiqc_data.json artifacts: name: "$CI_JOB_NAME" when: always @@ -889,12 +992,12 @@ override_spike: variables: - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - - hostname - - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F0 --source staging --spikeForce true --upload true --dev false --ci true --track false -with-report ./spikeOverride_report.html - - find . -type f -name "multiqc_data.json" -exec cp {} ./spikeOverride_multiqc_data.json \; - - find ./**/report/ -type f -name "*multiqc.html" -exec cp {} ./spikeOverride_multiqc.html \; - - pytest -m completionMultiqc --filename spikeOverride_multiqc_data.json + - hostname + - ulimit -a + - nextflow -q run ./rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F0 --source staging --spikeForce true --upload true --dev false --ci true --track false -with-report ./spikeOverride_report.html + - find . -type f -name "multiqc_data.json" -exec cp {} ./spikeOverride_multiqc_data.json \; + - find ./**/report/ -type f -name "*multiqc.html" -exec cp {} ./spikeOverride_multiqc.html \; + - pytest -m completionMultiqc --filename spikeOverride_multiqc_data.json artifacts: name: "$CI_JOB_NAME" when: always @@ -915,8 +1018,8 @@ consistency: variables: - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - - pytest -m consistencySE - - pytest -m consistencyPE + - pytest -m consistencySE + - pytest -m consistencyPE artifacts: name: "$CI_JOB_NAME" when: always @@ -924,3 +1027,104 @@ consistency: - SE_multiqc_data.json - PE_multiqc_data.json expire_in: 7 days + + +dnanexus: + stage: environment + only: + variables: + - $dnanexusEnv == "true" + except: + - push + - tags + - merge_requests + script: + - hostname + - ulimit -a + - mkdir -p ./badges/env + - curl --request GET https://img.shields.io/badge/Envronment%3A%20DNAnexus-inactive-critical?style=flat > ./badges/env/dnanexus.svg + - module load dxtoolkit/python27/0.294.0 + - export NXF_XPACK_LICENSE=${nxf_license} + - dx upload ./test_data/auth/c* --path /ci-env/auth/ --parents --auth-token ${dnanexus_authToken} --project-context-id ${dnanexus_workspace} + - dx upload ./test_data/fastq/xsmall/Q-Y5F6_10K.R{1,2}.fastq.gz --path /ci-env/input/ --parents --auth-token ${dnanexus_authToken} --project-context-id ${dnanexus_workspace} + - latest_release_tag=$(git tag --sort=-committerdate -l *.*.* | head -1) + - > + dx run nf-dxapp-bicf --auth-token ${dnanexus_authToken} --project-context-id ${dnanexus_workspace} \ + --delay-workspace-destruction \ + --instance-type mem1_ssd1_v2_x16 \ + --input-json "$(envsubst < ./docs/nxf_dnanexus-ci-test.json)" \ + > dx.log + - > + jobID=$(cat dx.log | grep -oP "Job ID: \K.*") + - dx watch ${jobID} --auth-token ${dnanexus_authToken} --project-context-id ${dnanexus_workspace} + - status=$(dx find executions --id ${jobID} --state failed --brief --auth-token ${dnanexus_authToken} --project-context-id ${dnanexus_workspace}) + - > + if [ "${status}" == "" ]; then + curl --request GET https://img.shields.io/badge/Envronment%3A%20DNAnexus-run%20succesful-success?style=flat > ./badges/env/dnanexus.svg + else + curl --request GET 
https://img.shields.io/badge/Envronment%3A%20DNAnexus-run%20failed-critical?style=flat > ./badges/env/dnanexus.svg + fi + after_script: + - module load dxtoolkit/python27/0.294.0 + - dx rm /ci-env/auth/* --all --auth-token ${dnanexus_authToken} --project-context-id ${dnanexus_workspace} + - dx rm /ci-env/input/* --all --auth-token ${dnanexus_authToken} --project-context-id ${dnanexus_workspace} + artifacts: + when: always + paths: + - badges/ + allow_failure: true + +aws: + stage: environment + only: + variables: + - $awsEnv == "true" + except: + - push + - tags + - merge_requests + script: + - hostname + - ulimit -a + - mkdir -p ./badges/env + - curl --request GET https://img.shields.io/badge/Envronment%3A%20AWS-inactive-critical?style=flat > ./badges/env/aws.svg + - module load awscli/1.11.139 + - export AWS_ACCESS_KEY_ID=${aws_accesskeyid} + - export AWS_SECRET_ACCESS_KEY=${aws_secretaccesskey} + - aws configure set region ${aws_region} + - aws s3 cp ./test_data/auth/ s3://bicf-output/ci-env/auth/ --exclude "*" --include "c*" --recursive + - aws s3 cp ./test_data/fastq/xsmall/ s3://bicf-output/ci-env/input/ --exclude "*" --include "Q-Y5F6_10K.R*.fastq.gz" --recursive + - latest_release_tag=$(git tag --sort=-committerdate -l *.*.* | head -1) + - > + id=$(aws batch submit-job\ + --job-name nf-GUDMAP_RBK_ci-env\ + --job-queue default-bicf\ + --job-definition nextflow-bicf-nextflow\ + --container-overrides command=$(envsubst < ./docs/nxf_aws-ci-test.json)) + id=$(echo ${id}| grep -oP "jobId\K.*" | tr -d '"' | tr -d ":" | tr -d " " | tr -d "}") + - > + status=$(aws batch describe-jobs --jobs ${id} | grep -oP "status\": \K.*" | tr -d '"' | tr -d ',' | tr -d " " ) && + until [[ "${status}" == "SUCCEEDED" || "${status}" == "FAILED" ]]; do + status=$(aws batch describe-jobs --jobs ${id} | grep -oP "status\": \K.*" | tr -d '"' | tr -d ',' | tr -d " " ) && + echo ${status} && + sleep 5m + done + - > + if [ "${status}" == "SUCCEEDED" ]; then + curl --request GET https://img.shields.io/badge/Envronment%3A%20AWS-run%20succesful-success?style=flat > ./badges/env/aws.svg + else + curl --request GET https://img.shields.io/badge/Envronment%3A%20AWS-run%20failed-critical?style=flat > ./badges/env/aws.svg + fi + after_script: + - module load awscli/1.11.139 + - > + export AWS_ACCESS_KEY_ID=${aws_accesskeyid} + export AWS_SECRET_ACCESS_KEY=${aws_secretaccesskey} + aws configure set region ${aws_region} + - aws s3 rm s3://bicf-output/ci-env/auth/ --recursive + - aws s3 rm s3://bicf-output/ci-env/input/ --recursive + artifacts: + when: always + paths: + - badges/ + allow_failure: true \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 21fb63812a42a7580996eacbd70c0a03f4866648..f11168501c90b6c80d9b0c2bc51d44480896c9a0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,15 @@ -# v2.0.0rc01 +# v2.0.0 **User Facing** * Endness metadata "Single Read" changed to "Single End" in data-hub, pipeline updated to handle (#110) ("Single Read" still acceptable for backwards compatibility) * Strandedness metadata "yes"/"no" changed to boolean "t"/"f" in data-hub, pipeline updated to handle (#70) ("yes"/"no" still acceptable for backwards compatibility) * Upload empty mRNA_QC entry if data error (#111) * Allow forcing of strandedness and spike (#100) +* Add seqwho +* Add seqwho results to multiqc report +* Modify repository structure to allow for use with XPACK-DNANEXUS +* Add override for endness +* Add seqtk to references +* Update software versions to latest (containers) **Background** * Add 
memory limit (75%) per thread for samtools sort (#108)
@@ -24,11 +30,22 @@
* Merge data error pre-inference execution run upload/finalize to 1 process
* Change uploadOutputBag logic to change reuse hatrac file if alread exists (re-uses Output_Bag entry by reassigning Execution_Run RID) (#112)
* Add new CI py tests for override and integration
+* Fix fastq file and species error status detail bug (#118)
+* Make compatible with XPACK-DNANEXUS
+* Don't download fastq's if fastq override present
+* Override fastq count to override counts
+* Change ambiguous species ci to wrong species
+* Add test for DNAnexus env
+* Add test for AWS env
*Known Bugs*
* Override params (inputBag, fastq, species) aren't checked for integrity
* Authentication files and tokens must be active (active auth client) for the duration of the pipeline run (until long-lived token utilization included)
* Check for outputBag in hatrac doesn't check for any uploaded by chaise
+* CI container cache will fail if cache folder is not owned by CI runner user
+* CI container cache will not error if container failed to pull
+* CI (container cache, version collection, and unit tests) will not work correctly if containers referred to in nextflow.config aren't prefixed perfectly with: "container = "
+ * also, it is assumed that the containers are on dockerhub and don't have the "docker://" prefix
<hr>
diff --git a/README.md b/README.md
index 7b715d4b94ce92f8cd93806dd21bd04481f83b0a..c4ec0fdeeeecfa381ad6780193fa0fe37f4f4ba4 100644
--- a/README.md
+++ b/README.md
@@ -56,12 +56,14 @@ To Run:
* `--inputBagForce` utilizes a local replicate inputBag instead of downloading from the data-hub (still requires accurate repRID input)
* eg: `--inputBagForce test_data/bag/Q-Y5F6_inputBag_xxxxxxxx.zip` (must be the expected bag structure, this example will not work because it is a test bag)
* `--fastqsForce` utilizes local fastq's instead of downloading from the data-hub (still requires accurate repRID input)
- * eg: `--fastqsForce 'test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz'` (note the quotes around fastq's which must me named in the correct standard [*\*.R1.fastq.gz and/or \*.R2.fastq.gz*] and in the correct order)
+ * eg: `--fastqsForce 'test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz'` (note the quotes around fastq's, which must be named in the correct standard [*\*.R1.fastq.gz and/or \*.R2.fastq.gz*] and in the correct order; also consider using `endsForce` if the endness doesn't match the submitted value)
* `--speciesForce` forces the species to be "Mus musculus" or "Homo sapiens", it bypasses a metadata mismatch or an ambiguous species error
* eg: `--speciesForce 'Mus musculus'`
+ * `--endsForce` forces the endness to be "se" or "pe", it bypasses a metadata mismatch error
+ * eg: `--endsForce 'pe'`
* `--strandedForce` forces the strandedness to be "forward", "reverse" or "unstranded", it bypasses a metadata mismatch error
* eg: `--strandedForce 'unstranded'`
- * `--spikeForce` forces the spike-in to be "false" or "true", it bypasses a metadata mismatch error
+ * `--spikeForce` forces the spike-in to be "false" or "true", it bypasses a metadata mismatch error
* eg: `--spikeForce 'true'`
* Tracking parameters ([Tracking Site](http://bicf.pipeline.tracker.s3-website-us-east-1.amazonaws.com/)):
* `--ci` boolean (default = false)
@@ -72,24 +74,62 @@ FULL EXAMPLE:
```
nextflow run workflow/rna-seq.nf --repRID Q-Y5JA --source production --deriva ./data/credential.json --bdbag ./data/cookies.txt --dev false --upload true -profile biohpc
```
-
+<hr>
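As a minimal illustrative sketch only (the fastq path under `./data/fastq/` is a hypothetical placeholder; the repRID, auth files, and override values are taken from the examples above and are not a recommended configuration), the force parameters can be combined in a single run, for example to reprocess locally staged fastq's while overriding endness and strandedness without uploading results:
```
nextflow run workflow/rna-seq.nf --repRID Q-Y5F6 --source staging --deriva ./data/credential.json --bdbag ./data/cookies.txt --fastqsForce './data/fastq/Q-Y5F6.R{1,2}.fastq.gz' --endsForce 'pe' --strandedForce 'unstranded' --upload false --dev false -profile biohpc
```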
Cloud Compatibility:
--------------------
-This pipeline is also capable of being run on AWS. To do so:
-* Build a AWS batch queue and environment either manually or with [aws-cloudformantion](https://console.aws.amazon.com/cloudformation/home?#/stacks/new?stackName=Nextflow&templateURL=https://s3.amazonaws.com/aws-genomics-workflows/templates/nextflow/nextflow-aio.template.yaml)
-* Edit one of the aws configs in workflow/config/
- * Replace workDir with the S3 bucket generated
- * Change region if different
- * Change queue to the aws batch queue generated
-* The user must have awscli configured with an appropriate authentication (with `aws configure` and access keys) in the environment which nextflow will be run
-* Add `-profile` with the name aws config which was customized
-
+This pipeline is also capable of being run on AWS and DNAnexus. To do so:
+* The Nextflow binary needs to contain a custom scm config to allow nextflow to pull the pipeline from the UTSW self-hosted GitLab server (git.biohpc.swmed.edu)
+ ```
+ providers {
+   bicf {
+     server = 'https://git.biohpc.swmed.edu'
+     platform = 'gitlab'
+   }
+ }
+ ```
+ This is required for the use of `nextflow run` or `nextflow pull` pointed directly at the git repo, but also for use in AWS or DNAnexus environments, as both use `nextflow run` directly against that repo. To get around this requirement, there is a clone of the repo hosted on [GitHub](https://github.com/utsw-bicf/gudmap_rbk.rna-seq) which can be used... but the currency of that clone cannot be guaranteed!
+### [AWS](https://aws.amazon.com/)
+* Build an AWS batch queue and environment either manually or with a template, such as: [Genomics Workflows on AWS](https://docs.opendata.aws/genomics-workflows/)
+* The user must have awscli configured with an appropriate authentication (with `aws configure` and access keys) in the environment in which nextflow will be run
+* Follow the instructions from [AWS](https://docs.aws.amazon.com/cli/latest/reference/batch/submit-job.html) about launching runs using the AWS CLI. A template *json* file has been included ([awsExample.json](docs/awsExample.json))
+ * `[version]` should be replaced with the pipeline version required (eg: `v2.0.0`)
+ * `[credential.json]` should be replaced with the location of the credential file output by authentication with Deriva
+ * `[cookies.txt]` should be replaced with the location of the cookies file output by authentication with Deriva for BDBag
+ * `[repRID]` should be replaced with the replicate RID to be analyzed (eg: `Q-Y5F6`)
+ * `[outDir]` should be replaced with the location to save local outputs of the pipeline
+
+ example `aws batch submit-job` command (replacing the parameters in `[]` with the appropriate values)
+ ```
+ aws batch submit-job\
+ --job-name [Job Name]\
+ --job-queue [Queue]\
+ --job-definition [Job Definition]\
+ --container-overrides command=$(envsubst < ./docs/nxf_aws-ci-test.json)
+ ```
+### [DNAnexus](https://dnanexus.com/) (utilizes the [DNAnexus extension package for Nextflow (XPACK-DNANEXUS)](https://github.com/seqeralabs/xpack-dnanexus))
+* Follow the instructions from [XPACK-DNANEXUS](https://github.com/seqeralabs/xpack-dnanexus) about installing and authenticating (a valid license must be available for the extension package from Seqera Labs, as well as a subscription with DNAnexus)
+* Follow the instructions from [XPACK-DNANEXUS](https://github.com/seqeralabs/xpack-dnanexus) about launching runs.
A template *json* file has been included ([dnanexusExample.json](docs/dnanexusExample.json))
+ * `[version]` should be replaced with the pipeline version required (eg: `v2.0.0`)
+ * `[credential.json]` should be replaced with the location of the credential file output by authentication with Deriva
+ * `[cookies.txt]` should be replaced with the location of the cookies file output by authentication with Deriva for BDBag
+ * `[repRID]` should be replaced with the replicate RID to be analyzed (eg: `Q-Y5F6`)
+ * `[outDir]` should be replaced with the location to save local outputs of the pipeline
+
+ example `dx run` command
+ ```
+ dx run nf-dxapp-bicf \
+ --delay-workspace-destruction \
+ --instance-type mem1_ssd1_v2_x16 \
+ --input-json "$(envsubst < ./docs/nxf_dnanexus-ci-test.json)"
+ ```
+### NOTE:
+* File locations used in cloud deployments (auth files and output folder) need to be accessible in that environment (eg s3 location, or DNAnexus location). Local paths cannot be read by the cloud environment.
+<hr>
To generate you own references or new references:
------------------------------------------
Download the [reference creation script](https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/snippets/31). This script will auto create human and mouse references from GENCODE. It can also create ERCC92 spike-in references as well as concatenate them to GENCODE references automatically. In addition, it can create references from manually downloaded FASTA and GTF files.
-
-
+<hr>
Errors:
-------
Error reported back to the data-hub are (they aren't thrown on the command line by the pipeline, but rather are submitted (if `--upload true`) to the data-hub for that replicate in the execution run submission):
@@ -101,7 +141,11 @@ Error reported back to the data-hub are (they aren't thrown on the command line
|**Number of fastqs detected does not match submitted endness**|Single-end sequenced replicates can only have one fastq, while paried\-end can only have two (see above).|
|**Number of reads do not match for R1 and R2**|For paired\-end sequenced studies the number of reads in read\-1 fastq must match that of read\-2. This error is usually indicative of uploading of currupted, trunkated, or wrong fastq files.|
|**There is an error with the structure of the fastq**|The fastq's fail a test of their structure. This error is usually indicative of uploading of currupted, trunkated, or wrong fastq files.|
-|**Inference of species returns an ambiguous result**|Species of the replicate is done by aligning a random subset of 1 million reads from the data to both the human and mouse reference genomes. If there isn't a clear difference between the alignment rates (`>=40%` of one species, but `<40%` of the other), then this error is detected.|
+|**Inferred species does not match for R1 and R2**|The species inferred from each read does not match. This error is usually indicative of uploading of wrong fastq files.|
+|**Inferred species confidence is low**|The confidence of the species inference call is low. This is usually indicative of very low quality samples.|
+|**Inferred sequencing type is not mRNA-seq**|The sequence type inferred is not mRNA-seq. This is usually indicative of uploading wrong fastq files.|
+|**Inferred sequencing type does not match for R1 and R2**|The sequencing type inferred from each read does not match.
This error is usually indicative of uploading of wrong fastq files.|
+|**Inferred species confidence is low**|The confidence of the species inference call is low AND 3 sets of a random sampling of the fastq's do not match. This is usually indicative of very low quality samples.|
|**Submitted metadata does not match inferred**|All required metadata for analysis of the data is internally inferred by the pipeline, if any of those do not match the submitted metadata, this error is detected to notify of a potential error. The mismatched metadata will be listed.|
<hr>
diff --git a/cleanup.sh b/cleanup.sh
index aa289201c531fa4f4667a04f80fd015d2200e40c..0d61cfe0c4ae911824335431d8590eebfc07e70a 100644
--- a/cleanup.sh
+++ b/cleanup.sh
@@ -1,7 +1,7 @@
rm *.out
rm pipeline_trace*.txt*
-rm report*.html*
-rm timeline*.html*
+rm *report*.html*
+rm *timeline*.html*
rm .nextflow*.log*
rm -r .nextflow/
rm -r work/
diff --git a/docs/awsExample.json b/docs/awsExample.json
new file mode 100644
index 0000000000000000000000000000000000000000..8e8f39f7785559eb3c941aa46dbb577bea1a6bf4
--- /dev/null
+++ b/docs/awsExample.json
@@ -0,0 +1 @@
+["https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq","-r","[version]","-profile","aws","--deriva","[credential.json]","--bdbag","[cookies.txt]","--repRID","[repRID]","--outDir","[outDir]"]
diff --git a/docs/dag.png b/docs/dag.png
index a19e02c47d0ca333f420061965ffda893ae42c83..f80f38b64729f19e6c78b29ee2af427eebe4aaba 100755
Binary files a/docs/dag.png and b/docs/dag.png differ
diff --git a/docs/dnanexusExample.json b/docs/dnanexusExample.json
new file mode 100644
index 0000000000000000000000000000000000000000..e03a6bccfba19f39afb59f62f711f428e6919248
--- /dev/null
+++ b/docs/dnanexusExample.json
@@ -0,0 +1,5 @@
+{
+ "pipeline_url": "https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq -r [version]",
+ "args": "-profile dnanexus --deriva [credential.json] --bdbag [cookies.txt] --repRID [repRID] --outDir [outDir]",
+ "license": "$NXF_XPACK_LICENSE"
+}
diff --git a/docs/nxf_aws-ci-test.json b/docs/nxf_aws-ci-test.json
new file mode 100644
index 0000000000000000000000000000000000000000..1de31087dd7fbaa917ec79c1dcda43f92899a6b9
--- /dev/null
+++ b/docs/nxf_aws-ci-test.json
@@ -0,0 +1 @@
+["utsw-bicf/gudmap_rbk.rna-seq","-r","${latest_release_tag}","-profile","aws","--deriva","s3://bicf-output/ci-env/auth/credential.json","--bdbag","s3://bicf-output/ci-env/auth/cookies.txt","--repRID","Q-Y5F6","--source","staging","--upload","false","--dev","false","--ci","true","--track","false","-with-report","s3://bicf-output/ci-env/output/Q-Y5F6_fastqoverride_report.html","--refSource","datahub","--outDir","s3://bicf-output/ci-env/output/Q-Y5F6_fastqoverride","--fastqsForce","s3://bicf-output/ci-env/input/*.fastq.gz"]
diff --git a/docs/nxf_dnanexus-ci-test.json b/docs/nxf_dnanexus-ci-test.json
new file mode 100644
index 0000000000000000000000000000000000000000..e4dde61dc430803f11cb5266b83ab66d92f3f720
--- /dev/null
+++ b/docs/nxf_dnanexus-ci-test.json
@@ -0,0 +1,5 @@
+{
+ "pipeline_url": "https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq.git -r ${latest_release_tag}",
+ "args": "-profile dnanexus --deriva dx://NextFlow_Prototype:/ci-env/auth/credential.json --bdbag dx://NextFlow_Prototype:/ci-env/auth/cookies.txt --repRID Q-Y5F6 --source staging --upload false --dev false --ci true --track false -with-report dx://NextFlow_Prototype:/ci-env/output/Q-Y5F6_fastqoverride_report.html --refSource datahub --outDir dx://NextFlow_Prototype:ci-env/output/Q-Y5F6_fastqoverride --fastqsForce 
dx://NextFlow_Prototype:/ci-env/input/*.fastq.gz", + "license": "${NXF_XPACK_LICENSE}" +} diff --git a/docs/references.md b/docs/references.md index 4ea1690ec755b51c923070352d4078634bc5e515..3aa5e67f4b5a5bf680fe88e2f4e5d8e2a4b67f62 100644 --- a/docs/references.md +++ b/docs/references.md @@ -4,40 +4,46 @@ * Anaconda (Anaconda Software Distribution, [https://anaconda.com](https://anaconda.com)) 2. **DERIVA**: - * Bugacov, A., Czajkowski, K., Kesselman, C., Kumar, A., Schuler, R. E. and Tangmunarunkit, H. 2017 Experiences with DERIVA: An Asset Management Platform for Accelerating eScience. IEEE 13th International Conference on e-Science (e-Science), Auckland, 2017, pp. 79-88, doi:[10.1109/eScience.2017.20](https://doi.org/10.1109/eScience.2017.20). + * Bugacov, A., Czajkowski, K., Kesselman, C., Kumar, A., Schuler, R. E., & Tangmunarunkit, H. (2017, October). Experiences with DERIVA: An asset management platform for accelerating eScience. In 2017 IEEE 13th International Conference on e-Science (e-Science) (pp. 79-88). IEEE. doi:[10.1109/eScience.2017.20](https://doi.org/10.1109/eScience.2017.20). 3. **BDBag**: - * D'Arcy, M., Chard, K., Foster, I., Kesselman, C., Madduri, R., Saint, N., & Wagner, R.. 2019. Big Data Bags: A Scalable Packaging Format for Science. Zenodo. doi:[10.5281/zenodo.3338725](http://doi.org/10.5281/zenodo.3338725). + * Madduri, R., Chard, K., DÂ’Arcy, M., Jung, S. C., Rodriguez, A., Sulakhe, D., ... & Foster, I. (2019). Reproducible big data science: A case study in continuous FAIRness. PloS one, 14(4), e0213013. doi:[10.1371/journal.pone.0213013](https://doi.org/10.1371/journal.pone.0213013). -4. **RSeQC**: - * Wang, L., Wang, S., Li, W. 2012 RSeQC: quality control of RNA-seq experiments. Bioinformatics. Aug 15;28(16):2184-5. doi:[10.1093/bioinformatics/bts356](https://doi.org/10.1093/bioinformatics/bts356). - -5. **trimgalore**: +4. **trimgalore**: * trimgalore [https://github.com/FelixKrueger/TrimGalore](https://github.com/FelixKrueger/TrimGalore) -6. **hisat2**: - * Kim ,D.,Paggi, J.M., Park, C., Bennett, C., Salzberg, S.L. 2019 Graph-based genome alignment and genotyping with HISAT2 and HISAT-genotype. Nat Biotechnol. Aug;37(8):907-915. doi:[10.1038/s41587-019-0201-4](https://doi.org/10.1038/s41587-019-0201-4). +5. **hisat2**: + * Kim, D., Paggi, J. M., Park, C., Bennett, C., & Salzberg, S. L. (2019). Graph-based genome alignment and genotyping with HISAT2 and HISAT-genotype. Nature biotechnology, 37(8), 907-915. doi:[10.1038/s41587-019-0201-4](https://doi.org/10.1038/s41587-019-0201-4). -7. **samtools**: - * Li H., B. Handsaker, A. Wysoker, T. Fennell, J. Ruan, N. Homer, G. Marth, G. Abecasis, R. Durbin, and 1000 Genome Project Data Processing Subgroup. 2009. The Sequence alignment/map (SAM) format and SAMtools. Bioinformatics 25: 2078-9. doi:[10.1093/bioinformatics/btp352](http://dx.doi.org/10.1093/bioinformatics/btp352) +6. **samtools**: + * Li, H., Handsaker, B., Wysoker, A., Fennell, T., Ruan, J., Homer, N., ... & Durbin, R. (2009). The sequence alignment/map format and SAMtools. Bioinformatics, 25(16), 2078-2079. doi:[10.1093/bioinformatics/btp352](http://dx.doi.org/10.1093/bioinformatics/btp352) -8. **picard**: +7. **picard**: * “Picard Toolkit.†2019. Broad Institute, GitHub Repository. [http://broadinstitute.github.io/picard/](http://broadinstitute.github.io/picard/); Broad Institute -9. **featureCounts**: - * Liao, Y., Smyth, G.K., Shi, W. 2014 featureCounts: an efficient general purpose program for assigning sequence reads to genomic features. 
Bioinformatics. Apr 1;30(7):923-30. doi:[10.1093/bioinformatics/btt656](https://doi.org/10.1093/bioinformatics/btt656). +8. **featureCounts**: + * Liao, Y., Smyth, G. K., & Shi, W. (2014). featureCounts: an efficient general purpose program for assigning sequence reads to genomic features. Bioinformatics, 30(7), 923-930. doi:[10.1093/bioinformatics/btt656](https://doi.org/10.1093/bioinformatics/btt656). -10. **R**: - * R Core Team 2014. R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL:[http://www.R-project.org/](http://www.R-project.org/). +9. **deeptools**: + * RamÃrez, F., Ryan, D. P., Grüning, B., Bhardwaj, V., Kilpert, F., Richter, A. S., ... & Manke, T. (2016). deepTools2: a next generation web server for deep-sequencing data analysis. Nucleic acids research, 44(W1), W160-W165. doi:[10.1093/nar/gkw257](http://dx.doi.org/10.1093/nar/gkw257) + +10. **Seqtk**: + * Seqtk [https://github.com/lh3/seqtk](https://github.com/lh3/seqtk) -11. **deeptools**: - * RamÃrez, F., D. P. Ryan, B. Grüning, V. Bhardwaj, F. Kilpert, A. S. Richter, S. Heyne, F. Dündar, and T. Manke. 2016. deepTools2: a next generation web server for deep-sequencing data analysis. Nucleic Acids Research 44: W160-165. doi:[10.1093/nar/gkw257](http://dx.doi.org/10.1093/nar/gkw257) +11. **R**: + * R Core Team 2014. R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL:[http://www.R-project.org/](http://www.R-project.org/). 12. **FastQC** * FastQC [https://www.bioinformatics.babraham.ac.uk/projects/fastqc/](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) -13. **MultiQC**: - * Ewels P., Magnusson M., Lundin S. and Käller M. 2016. MultiQC: Summarize analysis results for multiple tools and samples in a single report. Bioinformatics 32(19): 3047–3048. doi:[10.1093/bioinformatics/btw354](https://dx.doi.org/10.1093/bioinformatics/btw354) +13. **SeqWho** + * Bennett, C., Thornton, M., Park, C., Henry, G., Zhang, Y., Malladi, V. S., & Kim, D. (2021). SeqWho: Reliable, rapid determination of sequence file identity using k-mer frequencies. bioRxiv, 2021.2003.2010.434827. doi:[10.1101/2021.03.10.434827](https://doi.org/10.1101/2021.03.10.434827) + +14. **RSeQC**: + * Wang, L., Wang, S., Li, W. 2012 RSeQC: quality control of RNA-seq experiments. Bioinformatics. Aug 15;28(16):2184-5. doi:[10.1093/bioinformatics/bts356](https://doi.org/10.1093/bioinformatics/bts356). + +15. **MultiQC**: + * Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics, 32(19), 3047-3048. doi:[10.1093/bioinformatics/btw354](https://dx.doi.org/10.1093/bioinformatics/btw354) -14. **Nextflow**: - * Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., and Notredame, C. 2017. Nextflow enables reproducible computational workflows. Nature biotechnology, 35(4), 316. +16. **Nextflow**: + * Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature biotechnology, 35(4), 316-319. 
\ No newline at end of file diff --git a/docs/software_references_mqc.yaml b/docs/software_references_mqc.yaml old mode 100755 new mode 100644 index d9d18558b7df3f626ff89cdb01c610228db92a8b..825f21fc153d952ebe7b654936f8eb9af76ae302 --- a/docs/software_references_mqc.yaml +++ b/docs/software_references_mqc.yaml @@ -16,62 +16,62 @@ <li><strong>DERIVA</strong>:</li> </ol> <ul> - <li>Bugacov, A., Czajkowski, K., Kesselman, C., Kumar, A., Schuler, R. E. and Tangmunarunkit, H. 2017 Experiences with DERIVA: An Asset Management Platform for Accelerating eScience. IEEE 13th International Conference on e-Science (e-Science), Auckland, 2017, pp. 79-88, doi:<a href="https://doi.org/10.1109/eScience.2017.20">10.1109/eScience.2017.20</a>.</li> + <li>Bugacov, A., Czajkowski, K., Kesselman, C., Kumar, A., Schuler, R. E., & Tangmunarunkit, H. (2017, October). Experiences with DERIVA: An asset management platform for accelerating eScience. In 2017 IEEE 13th International Conference on e-Science (e-Science) (pp. 79-88). IEEE. doi:<a href="https://doi.org/10.1109/eScience.2017.20">10.1109/eScience.2017.20</a>.</li> </ul> <ol start="3" style="list-style-type: decimal"> <li><strong>BDBag</strong>:<br /> </li> </ol> <ul> - <li>D'Arcy, M., Chard, K., Foster, I., Kesselman, C., Madduri, R., Saint, N., & Wagner, R.. 2019. Big Data Bags: A Scalable Packaging Format for Science. Zenodo. doi:<a href="http://doi.org/10.5281/zenodo.3338725">10.5281/zenodo.3338725</a>.</li> + <li>Madduri, R., Chard, K., DÂ’Arcy, M., Jung, S. C., Rodriguez, A., Sulakhe, D., ... & Foster, I. (2019). Reproducible big data science: A case study in continuous FAIRness. PloS one, 14(4), e0213013. doi:<a href="https://doi.org/10.1371/journal.pone.0213013">10.1371/journal.pone.0213013</a>.</li> </ul> <ol start="4" style="list-style-type: decimal"> - <li><strong>RSeQC</strong>:</li> + <li><strong>trimgalore</strong>:</li> </ol> <ul> - <li>Wang, L., Wang, S., Li, W. 2012 RSeQC: quality control of RNA-seq experiments. Bioinformatics. Aug 15;28(16):2184-5. doi:<a href="https://doi.org/10.1093/bioinformatics/bts356">10.1093/bioinformatics/bts356</a>.</li> + <li>trimgalore <a href="https://github.com/FelixKrueger/TrimGalore" class="uri">https://github.com/FelixKrueger/TrimGalore</a></li> </ul> <ol start="5" style="list-style-type: decimal"> - <li><strong>trimgalore</strong>:</li> + <li><strong>hisat2</strong>:</li> </ol> <ul> - <li>trimgalore <a href="https://github.com/FelixKrueger/TrimGalore" class="uri">https://github.com/FelixKrueger/TrimGalore</a></li> + <li>Kim, D., Paggi, J. M., Park, C., Bennett, C., & Salzberg, S. L. (2019). Graph-based genome alignment and genotyping with HISAT2 and HISAT-genotype. Nature biotechnology, 37(8), 907-915. doi:<a href="https://doi.org/10.1038/s41587-019-0201-4">10.1038/s41587-019-0201-4</a>.</li> </ul> <ol start="6" style="list-style-type: decimal"> - <li><strong>hisat2</strong>:</li> + <li><strong>samtools</strong>:</li> </ol> <ul> - <li>Kim ,D.,Paggi, J.M., Park, C., Bennett, C., Salzberg, S.L. 2019 Graph-based genome alignment and genotyping with HISAT2 and HISAT-genotype. Nat Biotechnol. Aug;37(8):907-915. doi:<a href="https://doi.org/10.1038/s41587-019-0201-4">10.1038/s41587-019-0201-4</a>.</li> + <li>Li, H., Handsaker, B., Wysoker, A., Fennell, T., Ruan, J., Homer, N., ... & Durbin, R. (2009). The sequence alignment/map format and SAMtools. Bioinformatics, 25(16), 2078-2079. 
doi:<a href="http://dx.doi.org/10.1093/bioinformatics/btp352">10.1093/bioinformatics/btp352</a></li> </ul> <ol start="7" style="list-style-type: decimal"> - <li><strong>samtools</strong>:</li> + <li><strong>picard</strong>:</li> </ol> <ul> - <li>Li H., B. Handsaker, A. Wysoker, T. Fennell, J. Ruan, N. Homer, G. Marth, G. Abecasis, R. Durbin, and 1000 Genome Project Data Processing Subgroup. 2009. The Sequence alignment/map (SAM) format and SAMtools. Bioinformatics 25: 2078-9. doi:<a href="http://dx.doi.org/10.1093/bioinformatics/btp352">10.1093/bioinformatics/btp352</a></li> + <li>“Picard Toolkit.†2019. Broad Institute, GitHub Repository. <a href="http://broadinstitute.github.io/picard/" class="uri">http://broadinstitute.github.io/picard/</a>; Broad Institute</li> </ul> <ol start="8" style="list-style-type: decimal"> - <li><strong>picard</strong>:</li> + <li><strong>featureCounts</strong>:</li> </ol> <ul> - <li>“Picard Toolkit.†2019. Broad Institute, GitHub Repository. <a href="http://broadinstitute.github.io/picard/" class="uri">http://broadinstitute.github.io/picard/</a>; Broad Institute</li> + <li>Liao, Y., Smyth, G. K., & Shi, W. (2014). featureCounts: an efficient general purpose program for assigning sequence reads to genomic features. Bioinformatics, 30(7), 923-930. doi:<a href="https://doi.org/10.1093/bioinformatics/btt656">10.1093/bioinformatics/btt656</a>.</li> </ul> <ol start="9" style="list-style-type: decimal"> - <li><strong>featureCounts</strong>:</li> + <li><strong>deeptools</strong>:</li> </ol> <ul> - <li>Liao, Y., Smyth, G.K., Shi, W. 2014 featureCounts: an efficient general purpose program for assigning sequence reads to genomic features. Bioinformatics. Apr 1;30(7):923-30. doi:<a href="https://doi.org/10.1093/bioinformatics/btt656">10.1093/bioinformatics/btt656</a>.</li> + <li>RamÃrez, F., Ryan, D. P., Grüning, B., Bhardwaj, V., Kilpert, F., Richter, A. S., ... & Manke, T. (2016). deepTools2: a next generation web server for deep-sequencing data analysis. Nucleic acids research, 44(W1), W160-W165. doi:<a href="http://dx.doi.org/10.1093/nar/gkw257">10.1093/nar/gkw257</a></li> </ul> <ol start="10" style="list-style-type: decimal"> - <li><strong>R</strong>:</li> + <li><strong>Seqtk</strong>:</li> </ol> <ul> - <li>R Core Team 2014. R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL:<a href="http://www.R-project.org/" class="uri">http://www.R-project.org/</a>.</li> + <li>Seqtk <a href="https://github.com/lh3/seqtk" class="uri">https://github.com/lh3/seqtk</a></li> </ul> <ol start="11" style="list-style-type: decimal"> - <li><strong>deeptools</strong>:</li> + <li><strong>R</strong>:</li> </ol> <ul> - <li>RamÃrez, F., D. P. Ryan, B. Grüning, V. Bhardwaj, F. Kilpert, A. S. Richter, S. Heyne, F. Dündar, and T. Manke. 2016. deepTools2: a next generation web server for deep-sequencing data analysis. Nucleic Acids Research 44: W160-165. doi:<a href="http://dx.doi.org/10.1093/nar/gkw257">10.1093/nar/gkw257</a></li> + <li>R Core Team 2014. R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. 
URL:<a href="http://www.R-project.org/" class="uri">http://www.R-project.org/</a>.</li> </ul> <ol start="12" style="list-style-type: decimal"> <li><strong>FastQC</strong></li> @@ -80,14 +80,26 @@ <li>FastQC <a href="https://www.bioinformatics.babraham.ac.uk/projects/fastqc/" class="uri">https://www.bioinformatics.babraham.ac.uk/projects/fastqc/</a></li> </ul> <ol start="13" style="list-style-type: decimal"> - <li><strong>MultiQC</strong>:</li> + <li><strong>SeqWho</strong></li> </ol> <ul> - <li>Ewels P., Magnusson M., Lundin S. and Käller M. 2016. MultiQC: Summarize analysis results for multiple tools and samples in a single report. Bioinformatics 32(19): 3047–3048. doi:<a href="https://dx.doi.org/10.1093/bioinformatics/btw354">10.1093/bioinformatics/btw354</a></li> + <li>Bennett, C., Thornton, M., Park, C., Henry, G., Zhang, Y., Malladi, V. S., & Kim, D. (2021). SeqWho: Reliable, rapid determination of sequence file identity using k-mer frequencies. bioRxiv, 2021.2003.2010.434827. doi:<a href="https://doi.org/10.1101/2021.03.10.434827">10.1101/2021.03.10.434827</a></li> </ul> <ol start="14" style="list-style-type: decimal"> + <li><strong>RSeQC</strong>:</li> + </ol> + <ul> + <li>Wang, L., Wang, S., Li, W. 2012 RSeQC: quality control of RNA-seq experiments. Bioinformatics. Aug 15;28(16):2184-5. doi:<a href="https://doi.org/10.1093/bioinformatics/bts356">10.1093/bioinformatics/bts356</a>.</li> + </ul> + <ol start="15" style="list-style-type: decimal"> + <li><strong>MultiQC</strong>:</li> + </ol> + <ul> + <li>Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics, 32(19), 3047-3048. doi:<a href="https://dx.doi.org/10.1093/bioinformatics/btw354">10.1093/bioinformatics/btw354</a></li> + </ul> + <ol start="16" style="list-style-type: decimal"> <li><strong>Nextflow</strong>:</li> </ol> <ul> - <li>Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., and Notredame, C. 2017. Nextflow enables reproducible computational workflows. Nature biotechnology, 35(4), 316.</li> + <li>Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature biotechnology, 35(4), 316-319.</li> </ul> diff --git a/docs/software_versions_mqc.yaml b/docs/software_versions_mqc.yaml old mode 100755 new mode 100644 index 5eb233d1b251787f3ad3cd14b2b7133259383e02..06585acbc9964a76d4ba1dff62a4d4464b73bd8e --- a/docs/software_versions_mqc.yaml +++ b/docs/software_versions_mqc.yaml @@ -1,24 +1,26 @@ id: 'software_versions' section_name: 'Software Versions' - section_href: 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/blob/78-tool_version/docs/RNA-Seq%20Pipeline%20Design%20Process%20Table.pdf' + section_href: 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/wikis/Pipeline/Tool-Versions' plot_type: 'html' description: 'are collected for pipeline version.' 
data: | <dl class="dl-horizontal"> - <dt>Python</dt><dd>v3.8.3</dd> - <dt>DERIVA</dt><dd>v1.4.3</dd> - <dt>BDBag</dt><dd>v1.5.6</dd> - <dt>RSeQC</dt><dd>v4.0.0</dd> - <dt>Trim Galore!</dt><dd>v0.6.4_dev</dd> + <dt>Python</dt><dd>v3.8.5</dd> + <dt>DERIVA</dt><dd>v1.4.5</dd> + <dt>BDBag</dt><dd>v1.6.0</dd> + <dt>Trim Galore!</dt><dd>v0.6.6</dd> <dt>HISAT2</dt><dd>v2.2.1</dd> <dt>Samtools</dt><dd>v1.11</dd> - <dt>picard (MarkDuplicates)</dt><dd>v2.23.9</dd> + <dt>picard (MarkDuplicates)</dt><dd>v2.25.0</dd> <dt>featureCounts</dt><dd>v2.0.1</dd> - <dt>R</dt><dd>v4.0.3</dd> <dt>deepTools</dt><dd>v3.5.0</dd> + <dt>Seqtk</dt><dd>v1.3-r106</dd> + <dt>R</dt><dd>v4.0.4</dd> <dt>FastQC</dt><dd>v0.11.9</dd> - <dt>MultiQC</dt><dd>v1.9</dd> - <dt>Pipeline Version</dt><dd>v1.0.2</dd> + <dt>SeqWho</dt><dd>vBeta-1.0.0</dd> + <dt>RSeQC</dt><dd>v4.0.0</dd> + <dt>MultiQC</dt><dd>v1.10</dd> + <dt>Pipeline Version</dt><dd>v2.0.0rc01</dd> </dl> diff --git a/nextflow.config b/nextflow.config index 44f2df5255691ee4eaf11ecf9cee1af2fa27f743..f715f6b004039a8edf349b473f1b0b242005da33 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,101 +1,108 @@ profiles { standard { - includeConfig 'conf/biohpc.config' + includeConfig 'nextflowConf/biohpc.config' } biohpc { - includeConfig 'conf/biohpc.config' + includeConfig 'nextflowConf/biohpc.config' } biohpc_max { - includeConfig 'conf/biohpc_max.config' + includeConfig 'nextflowConf/biohpc_max.config' } - aws_ondemand { - includeConfig 'conf/aws.config' - includeConfig 'conf/ondemand.config' + dnanexus { + includeConfig 'nextflowConf/dnanexus.config' } - aws_spot { - includeConfig 'conf/aws.config' - includeConfig 'conf/spot.config' + aws { + includeConfig 'nextflowConf/aws.config' } } process { + withName:trackStart { + container = 'gudmaprbk/gudmap-rbk_base:1.0.1' + } withName:getBag { - container = 'gudmaprbk/deriva1.4:1.0.0' + container = 'gudmaprbk/deriva1.4:1.0.1' } withName:getData { - container = 'gudmaprbk/deriva1.4:1.0.0' + container = 'gudmaprbk/deriva1.4:1.0.1' } withName:parseMetadata { - container = 'gudmaprbk/python3:1.0.0' + container = 'gudmaprbk/python3:1.0.1' } - withName:trimData { - container = 'gudmaprbk/trimgalore0.6.5:1.0.0' + withName:getRefERCC { + container = 'gudmaprbk/deriva1.4:1.0.1' } - withName:getRefInfer { - container = 'gudmaprbk/deriva1.4:1.0.0' + withName:getRef { + container = 'gudmaprbk/deriva1.4:1.0.1' + } + withName:fastqc { + container = 'gudmaprbk/fastqc0.11.9:1.0.1' + } + withName:seqwho { + container = 'gudmaprbk/seqwho1.0.0:1.0.0' + } + withName:trimData { + container = 'gudmaprbk/trimgalore0.6.6:1.0.0' } withName:downsampleData { - container = 'gudmaprbk/seqtk1.3:1.0.0' + container = 'gudmaprbk/seqtk1.3:1.0.1' + } + withName:alignSampleDataERCC { + container = 'gudmaprbk/hisat2.2.1:1.0.1' } withName:alignSampleData { - container = 'gudmaprbk/hisat2.2.1:1.0.0' + container = 'gudmaprbk/hisat2.2.1:1.0.1' } withName:inferMetadata { - container = 'gudmaprbk/rseqc4.0.0:1.0.0' + container = 'gudmaprbk/rseqc4.0.0:1.0.1' } withName:checkMetadata { - container = 'gudmaprbk/gudmap-rbk_base:1.0.0' - } - withName:getRef { - container = 'gudmaprbk/deriva1.4:1.0.0' + container = 'gudmaprbk/gudmap-rbk_base:1.0.1' } withName:alignData { - container = 'gudmaprbk/hisat2.2.1:1.0.0' + container = 'gudmaprbk/hisat2.2.1:1.0.1' } withName:dedupData { - container = 'gudmaprbk/picard2.23.9:1.0.0' + container = 'gudmaprbk/picard2.25.0:1.0.0' } withName:countData { - container = 'gudmaprbk/subread2.0.1:1.0.0' + container = 
'gudmaprbk/subread2.0.1:1.0.1' } withName:makeBigWig { - container = 'gudmaprbk/deeptools3.5.0:1.0.0' - } - withName:fastqc { - container = 'gudmaprbk/fastqc0.11.9:1.0.0' + container = 'gudmaprbk/deeptools3.5.0:1.0.1' } withName:dataQC { - container = 'gudmaprbk/rseqc4.0.0:1.0.0' + container = 'gudmaprbk/rseqc4.0.0:1.0.1' } withName:aggrQC { - container = 'gudmaprbk/multiqc1.9:1.0.0' + container = 'gudmaprbk/multiqc1.10:1.0.0' } withName:uploadInputBag { - container = 'gudmaprbk/deriva1.4:1.0.0' + container = 'gudmaprbk/deriva1.4:1.0.1' } withName:uploadExecutionRun { - container = 'gudmaprbk/deriva1.4:1.0.0' + container = 'gudmaprbk/deriva1.4:1.0.1' } withName:uploadQC { - container = 'gudmaprbk/deriva1.4:1.0.0' + container = 'gudmaprbk/deriva1.4:1.0.1' } withName:uploadProcessedFile { - container = 'gudmaprbk/deriva1.4:1.0.0' + container = 'gudmaprbk/deriva1.4:1.0.1' } withName:uploadOutputBag { - container = 'gudmaprbk/deriva1.4:1.0.0' + container = 'gudmaprbk/deriva1.4:1.0.1' } withName:finalizeExecutionRun { - container = 'gudmaprbk/deriva1.4:1.0.0' + container = 'gudmaprbk/deriva1.4:1.0.1' } withName:failPreExecutionRun { - container = 'gudmaprbk/deriva1.4:1.0.0' + container = 'gudmaprbk/deriva1.4:1.0.1' } withName:failExecutionRun { - container = 'gudmaprbk/deriva1.4:1.0.0' + container = 'gudmaprbk/deriva1.4:1.0.1' } withName:uploadQC_fail { - container = 'gudmaprbk/deriva1.4:1.0.0' + container = 'gudmaprbk/deriva1.4:1.0.1' } } @@ -125,6 +132,6 @@ manifest { homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq' description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.' mainScript = 'rna-seq.nf' - version = 'v2.0.0rc01' + version = 'v2.0.0' nextflowVersion = '>=19.09.0' } diff --git a/main.nf b/nextflowConf/.gitkeep similarity index 100% rename from main.nf rename to nextflowConf/.gitkeep diff --git a/workflow/conf/aws.config b/nextflowConf/aws.config similarity index 58% rename from workflow/conf/aws.config rename to nextflowConf/aws.config index bf5b59c7cf9db00606a5db9f97c706d53f21137f..659e7f21c471f5039c11ea4c23e69c7a7c6829d3 100644 --- a/workflow/conf/aws.config +++ b/nextflowConf/aws.config @@ -1,127 +1,142 @@ params { - refSource = "aws" -} - -workDir = 's3://gudmap-rbk.output/work' -aws.client.storageEncryption = 'AES256' -aws { - region = 'us-east-2' - batch { - cliPath = '/home/ec2-user/miniconda/bin/aws' - } + refSource = "datahub" } process { - executor = 'awsbatch' - cpus = 1 - memory = '1 GB' - withName:trackStart { + executor = 'local' cpus = 1 memory = '1 GB' } withName:getBag { + executor = 'local' cpus = 1 memory = '1 GB' } withName:getData { - cpus = 1 - memory = '1 GB' + cpus = 16 + memory = '32 GB' } withName:parseMetadata { - cpus = 15 + executor = 'local' + cpus = 1 memory = '1 GB' } - withName:trimData { - cpus = 20 - memory = '2 GB' + withName:getRefERCC { + cpus = 16 + memory = '32 GB' + } + withName:getRef { + cpus = 16 + memory = '32 GB' } - withName:getRefInfer { + withName:fastqc { + cpus = 16 + memory = '32 GB' + } + withName:seqwho { + executor = 'local' cpus = 1 memory = '1 GB' } + withName:trimData { + cpus = 16 + memory = '32 GB' + } withName:downsampleData { + executor = 'local' cpus = 1 memory = '1 GB' } + withName:alignSampleDataERCC { + cpus = 16 + memory = '32 GB' + } withName:alignSampleData { - cpus = 50 - memory = '5 GB' + cpus = 16 + memory = '32 GB' } withName:inferMetadata { - cpus = 5 - memory = '1 GB' + cpus = 16 + memory = '32 GB' } 
withName:checkMetadata { - cpus = 1 - memory = '1 GB' - } - withName:getRef { + executor = 'local' cpus = 1 memory = '1 GB' } withName:alignData { - cpus = 50 - memory = '10 GB' + cpus = 16 + memory = '32 GB' } withName:dedupData { - cpus = 5 - memory = '20 GB' + cpus = 16 + memory = '32 GB' } withName:countData { - cpus = 2 - memory = '5 GB' + cpus = 16 + memory = '32 GB' } withName:makeBigWig { - cpus = 15 - memory = '5 GB' - } - withName:fastqc { - cpus = 1 - memory = '1 GB' + cpus = 16 + memory = '32 GB' } withName:dataQC { - cpus = 15 - memory = '2 GB' + cpus = 16 + memory = '32 GB' } withName:aggrQC { - cpus = 2 + executor = 'local' + cpus = 1 memory = '1 GB' } withName:uploadInputBag { + executor = 'local' cpus = 1 memory = '1 GB' } withName:uploadExecutionRun { + executor = 'local' cpus = 1 memory = '1 GB' } withName:uploadQC { + executor = 'local' cpus = 1 memory = '1 GB' } withName:uploadProcessedFile { + executor = 'local' cpus = 1 memory = '1 GB' } withName:uploadOutputBag { + executor = 'local' cpus = 1 memory = '1 GB' } withName:finalizeExecutionRun { + executor = 'local' cpus = 1 memory = '1 GB' } withName:failPreExecutionRun { + executor = 'local' cpus = 1 memory = '1 GB' } withName:failExecutionRun { + executor = 'local' cpus = 1 memory = '1 GB' } withName:uploadQC_fail { + executor = 'local' cpus = 1 memory = '1 GB' } } + +docker { + enabled = true +} diff --git a/workflow/conf/biohpc.config b/nextflowConf/biohpc.config similarity index 92% rename from workflow/conf/biohpc.config rename to nextflowConf/biohpc.config index a12f2a704b3c63df9031789c2bb05d11e04d6b3a..dff28cb4ae54ee54ad63204ec8bd88e2441eb71b 100755 --- a/workflow/conf/biohpc.config +++ b/nextflowConf/biohpc.config @@ -22,15 +22,27 @@ process { withName:parseMetadata { executor = 'local' } - withName:trimData { + withName:getRefERCC { queue = 'super' } - withName:getRefInfer { + withName:getRef { + queue = 'super' + } + withName:fastqc { + queue = 'super' + } + withName:seqwho { + executor = 'local' + } + withName:trimData { queue = 'super' } withName:downsampleData { executor = 'local' } + withName:alignSampleDataERCC { + queue = '128GB,256GB,256GBv1,384GB' + } withName:alignSampleData { queue = '128GB,256GB,256GBv1,384GB' } @@ -40,9 +52,6 @@ process { withName:checkMetadata { executor = 'local' } - withName:getRef { - queue = 'super' - } withName:alignData { queue = '256GB,256GBv1' } @@ -55,9 +64,6 @@ process { withName:makeBigWig { queue = 'super' } - withName:fastqc { - queue = 'super' - } withName:dataQC { queue = 'super' } diff --git a/nextflowConf/biohpc_local.config b/nextflowConf/biohpc_local.config new file mode 100755 index 0000000000000000000000000000000000000000..d3a6c3a38689c234d65288c07d81d5b7286404c4 --- /dev/null +++ b/nextflowConf/biohpc_local.config @@ -0,0 +1,14 @@ +process { + executor = 'local' +} + +singularity { + enabled = true + cacheDir = '/project/BICF/BICF_Core/shared/gudmap/singularity_cache/' +} + +env { + http_proxy = 'http://proxy.swmed.edu:3128' + https_proxy = 'http://proxy.swmed.edu:3128' + all_proxy = 'http://proxy.swmed.edu:3128' +} diff --git a/workflow/conf/biohpc_max.config b/nextflowConf/biohpc_max.config similarity index 100% rename from workflow/conf/biohpc_max.config rename to nextflowConf/biohpc_max.config diff --git a/nextflowConf/dnanexus.config b/nextflowConf/dnanexus.config new file mode 100755 index 0000000000000000000000000000000000000000..d7c10297d2548536ba7deb7c9501b5f1f8de0836 --- /dev/null +++ b/nextflowConf/dnanexus.config @@ -0,0 +1,155 @@ +params { + 
refSource = "datahub" +} + +process { + withName:trackStart { + executor = 'dnanexus' + cpus = 1 + memory = '1 GB' + } + withName:getBag { + executor = 'dnanexus' + cpus = 1 + memory = '1 GB' + } + withName:getData { + machineType = 'mem1_ssd1_v2_x16' + cpus = 16 + memory = '32 GB' + } + withName:parseMetadata { + executor = 'dnanexus' + cpus = 1 + memory = '1 GB' + } + withName:getRefERCC { + machineType = 'mem1_ssd1_v2_x16' + cpus = 16 + memory = '32 GB' + } + withName:getRef { + machineType = 'mem1_ssd1_v2_x16' + cpus = 16 + memory = '32 GB' + } + withName:fastqc { + machineType = 'mem1_ssd1_v2_x16' + cpus = 16 + memory = '32 GB' + } + withName:seqwho { + executor = 'dnanexus' + cpus = 1 + memory = '1 GB' + } + withName:trimData { + machineType = 'mem1_ssd1_v2_x16' + cpus = 16 + memory = '32 GB' + } + withName:downsampleData { + executor = 'dnanexus' + cpus = 1 + memory = '1 GB' + } + withName:alignSampleDataERCC { + machineType = 'mem3_ssd1_v2_x16' + cpus = 16 + memory = '32 GB' + } + withName:alignSampleData { + machineType = 'mem3_ssd1_v2_x16' + cpus = 16 + memory = '32 GB' + } + withName:inferMetadata { + machineType = 'mem1_ssd1_v2_x16' + cpus = 16 + memory = '32 GB' + } + withName:checkMetadata { + executor = 'dnanexus' + cpus = 1 + memory = '1 GB' + } + withName:alignData { + machineType = 'mem3_ssd1_v2_x32' + cpus = 16 + memory = '32 GB' + } + withName:dedupData { + machineType = 'mem1_ssd1_v2_x16' + cpus = 16 + memory = '32 GB' + } + withName:countData { + machineType = 'mem1_ssd1_v2_x16' + cpus = 16 + memory = '32 GB' + } + withName:makeBigWig { + machineType = 'mem1_ssd1_v2_x16' + cpus = 16 + memory = '32 GB' + } + withName:dataQC { + machineType = 'mem1_ssd1_v2_x16' + cpus = 16 + memory = '32 GB' + } + withName:aggrQC { + executor = 'dnanexus' + cpus = 1 + memory = '1 GB' + } + withName:uploadInputBag { + executor = 'dnanexus' + cpus = 1 + memory = '1 GB' + } + withName:uploadExecutionRun { + executor = 'dnanexus' + cpus = 1 + memory = '1 GB' + } + withName:uploadQC { + executor = 'dnanexus' + cpus = 1 + memory = '1 GB' + } + withName:uploadProcessedFile { + executor = 'dnanexus' + cpus = 1 + memory = '1 GB' + } + withName:uploadOutputBag { + executor = 'dnanexus' + cpus = 1 + memory = '1 GB' + } + withName:finalizeExecutionRun { + executor = 'dnanexus' + cpus = 1 + memory = '1 GB' + } + withName:failPreExecutionRun { + executor = 'dnanexus' + cpus = 1 + memory = '1 GB' + } + withName:failExecutionRun { + executor = 'dnanexus' + cpus = 1 + memory = '1 GB' + } + withName:uploadQC_fail { + executor = 'dnanexus' + cpus = 1 + memory = '1 GB' + } +} + +docker { + enabled = true +} diff --git a/nextflowConf/local.config b/nextflowConf/local.config new file mode 100755 index 0000000000000000000000000000000000000000..0e4c34de957ced30e3923a9cf8fbb510d0dcd0a2 --- /dev/null +++ b/nextflowConf/local.config @@ -0,0 +1,7 @@ +process { + executor = 'local' +} + +docker { + enabled = true +} diff --git a/rna-seq.nf b/rna-seq.nf index ec289fc6242bbff2f7a74a2f6e57f8728ecc487c..9f4be25cb0d8df7623574e397198c46621719718 100644 --- a/rna-seq.nf +++ b/rna-seq.nf @@ -9,24 +9,24 @@ // ######## #### ###### ## // Define input variables -params.deriva = "${baseDir}/../test_data/auth/credential.json" -params.bdbag = "${baseDir}/../test_data/auth/cookies.txt" +params.deriva = "${baseDir}/test_data/auth/credential.json" +params.bdbag = "${baseDir}/test_data/auth/cookies.txt" //params.repRID = "16-1ZX4" params.repRID = "Q-Y5F6" params.source = "dev" params.refMoVersion = "38.p6.vM25" params.refHuVersion = 
"38.p13.v36" params.refERCCVersion = "92" -params.outDir = "${baseDir}/../output" +params.outDir = "${baseDir}/output" params.upload = false params.email = "" params.track = false - // Define override input variable params.refSource = "biohpc" params.inputBagForce = "" params.fastqsForce = "" +params.endsForce = "" params.speciesForce = "" params.strandedForce = "" params.spikeForce = "" @@ -35,14 +35,13 @@ params.spikeForce = "" params.ci = false params.dev = true - // Parse input variables deriva = Channel .fromPath(params.deriva) .ifEmpty { exit 1, "deriva credential file not found: ${params.deriva}" } deriva.into { deriva_getBag - deriva_getRefInfer + deriva_getRefERCC deriva_getRef deriva_uploadInputBag deriva_uploadExecutionRun @@ -66,14 +65,16 @@ logsDir = "${outDir}/Logs" upload = params.upload inputBagForce = params.inputBagForce fastqsForce = params.fastqsForce +endsForce = params.endsForce speciesForce = params.speciesForce strandedForce = params.strandedForce spikeForce = params.spikeForce email = params.email // Define fixed files and variables -replicateExportConfig = Channel.fromPath("${baseDir}/conf/Replicate_For_Input_Bag.json") -executionRunExportConfig = Channel.fromPath("${baseDir}/conf/Execution_Run_For_Output_Bag.json") +bdbagConfig = Channel.fromPath("${baseDir}/workflow/conf/bdbag.json") +replicateExportConfig = Channel.fromPath("${baseDir}/workflow/conf/Replicate_For_Input_Bag.json") +executionRunExportConfig = Channel.fromPath("${baseDir}/workflow/conf/Execution_Run_For_Output_Bag.json") if (params.source == "dev") { source = "dev.gudmap.org" } else if (params.source == "staging") { @@ -86,38 +87,36 @@ if (params.refSource == "biohpc") { } else if (params.refSource == "datahub") { referenceBase = "www.gudmap.org" } -referenceInfer = Channel.fromList(["ERCC","GRCh","GRCm"]) -multiqcConfig = Channel.fromPath("${baseDir}/conf/multiqc_config.yaml") -bicfLogo = Channel.fromPath("${baseDir}/../docs/bicf_logo.png") -softwareReferences = Channel.fromPath("${baseDir}/../docs/software_references_mqc.yaml") -softwareVersions = Channel.fromPath("${baseDir}/../docs/software_versions_mqc.yaml") +multiqcConfig = Channel.fromPath("${baseDir}/workflow/conf/multiqc_config.yaml") +bicfLogo = Channel.fromPath("${baseDir}/docs/bicf_logo.png") +softwareReferences = Channel.fromPath("${baseDir}/docs/software_references_mqc.yaml") +softwareVersions = Channel.fromPath("${baseDir}/docs/software_versions_mqc.yaml") // Define script files -script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbag_fetch.sh") -script_parseMeta = Channel.fromPath("${baseDir}/scripts/parse_meta.py") -script_inferMeta = Channel.fromPath("${baseDir}/scripts/infer_meta.sh") -script_refDataInfer = Channel.fromPath("${baseDir}/scripts/extract_ref_data.py") -script_refData = Channel.fromPath("${baseDir}/scripts/extract_ref_data.py") -script_calculateTPM = Channel.fromPath("${baseDir}/scripts/calculateTPM.R") -script_convertGeneSymbols = Channel.fromPath("${baseDir}/scripts/convertGeneSymbols.R") -script_tinHist = Channel.fromPath("${baseDir}/scripts/tin_hist.py") -script_uploadInputBag = Channel.fromPath("${baseDir}/scripts/upload_input_bag.py") -script_uploadExecutionRun_uploadExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py") -script_uploadExecutionRun_finalizeExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py") -script_uploadExecutionRun_failPreExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py") 
-script_uploadExecutionRun_failExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py") -script_uploadQC = Channel.fromPath("${baseDir}/scripts/upload_qc.py") -script_uploadQC_fail = Channel.fromPath("${baseDir}/scripts/upload_qc.py") -script_uploadOutputBag = Channel.fromPath("${baseDir}/scripts/upload_output_bag.py") -script_deleteEntry_uploadQC = Channel.fromPath("${baseDir}/scripts/delete_entry.py") -script_deleteEntry_uploadQC_fail = Channel.fromPath("${baseDir}/scripts/delete_entry.py") -script_deleteEntry_uploadProcessedFile = Channel.fromPath("${baseDir}/scripts/delete_entry.py") +script_bdbagFetch = Channel.fromPath("${baseDir}/workflow/scripts/bdbag_fetch.sh") +script_parseMeta = Channel.fromPath("${baseDir}/workflow/scripts/parse_meta.py") +script_inferMeta = Channel.fromPath("${baseDir}/workflow/scripts/infer_meta.sh") +script_refDataInfer = Channel.fromPath("${baseDir}/workflow/scripts/extract_ref_data.py") +script_refData = Channel.fromPath("${baseDir}/workflow/scripts/extract_ref_data.py") +script_calculateTPM = Channel.fromPath("${baseDir}/workflow/scripts/calculateTPM.R") +script_convertGeneSymbols = Channel.fromPath("${baseDir}/workflow/scripts/convertGeneSymbols.R") +script_tinHist = Channel.fromPath("${baseDir}/workflow/scripts/tin_hist.py") +script_uploadInputBag = Channel.fromPath("${baseDir}/workflow/scripts/upload_input_bag.py") +script_uploadExecutionRun_uploadExecutionRun = Channel.fromPath("${baseDir}/workflow/scripts/upload_execution_run.py") +script_uploadExecutionRun_finalizeExecutionRun = Channel.fromPath("${baseDir}/workflow/scripts/upload_execution_run.py") +script_uploadExecutionRun_failPreExecutionRun = Channel.fromPath("${baseDir}/workflow/scripts/upload_execution_run.py") +script_uploadExecutionRun_failExecutionRun = Channel.fromPath("${baseDir}/workflow/scripts/upload_execution_run.py") +script_uploadQC = Channel.fromPath("${baseDir}/workflow/scripts/upload_qc.py") +script_uploadQC_fail = Channel.fromPath("${baseDir}/workflow/scripts/upload_qc.py") +script_uploadOutputBag = Channel.fromPath("${baseDir}/workflow/scripts/upload_output_bag.py") +script_deleteEntry_uploadQC = Channel.fromPath("${baseDir}/workflow/scripts/delete_entry.py") +script_deleteEntry_uploadQC_fail = Channel.fromPath("${baseDir}/workflow/scripts/delete_entry.py") +script_deleteEntry_uploadProcessedFile = Channel.fromPath("${baseDir}/workflow/scripts/delete_entry.py") /* * trackStart: track start of pipeline */ process trackStart { - container 'docker://gudmaprbk/gudmap-rbk_base:1.0.0' script: """ hostname @@ -211,8 +210,7 @@ process getBag { deriva-download-cli ${source} --catalog 2 ${replicateExportConfig} . rid=${repRID} echo -e "LOG: fetched" >> ${repRID}.getBag.log - name=\$(ls *.zip) - name=\$(basename \${name} | cut -d "." 
-f1) + name=${repRID}_inputBag yr=\$(date +'%Y') mn=\$(date +'%m') dy=\$(date +'%d') @@ -220,7 +218,7 @@ process getBag { """ } -// Set inputBag to downloaded or forced input +// Set inputBag to downloaded or forced input and replicate them for multiple process inputs if (inputBagForce != "") { inputBag = Channel .fromPath(inputBagForce) @@ -234,12 +232,13 @@ inputBag.into { } /* - * getData: fetch replicate files from consortium with downloaded bdbag.zip + * getData: fetch replicate files from consortium with downloaded input bag */ process getData { tag "${repRID}" input: + path bdbagConfig path script_bdbagFetch path cookies, stageAs: "deriva-cookies.txt" from bdbag path inputBag from inputBag_getData @@ -247,7 +246,7 @@ process getData { output: path ("*.R{1,2}.fastq.gz") into fastqs path ("**/File.csv") into fileMeta - path ("**/Experiment Settings.csv") into experimentSettingsMeta + path ("ExperimentSettings.csv") into experimentSettingsMeta path ("**/Experiment.csv") into experimentMeta path "fastqCount.csv" into fastqCount_fl @@ -256,12 +255,6 @@ process getData { hostname > ${repRID}.getData.log ulimit -a >> ${repRID}.getData.log - # link deriva cookie for authentication - echo -e "LOG: linking deriva cookie" >> ${repRID}.getData.log - mkdir -p ~/.bdbag - ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt - echo -e "LOG: linked" >> ${repRID}.getData.log - # get bag basename replicate=\$(basename "${inputBag}") echo -e "LOG: bag replicate name \${replicate}" >> ${repRID}.getData.log @@ -272,23 +265,32 @@ process getData { echo -e "LOG: unzipped" >> ${repRID}.getData.log # bag fetch fastq's only and rename by repRID - echo -e "LOG: fetching replicate bdbag" >> ${repRID}.getData.log - sh ${script_bdbagFetch} \${replicate::-13} ${repRID} - echo -e "LOG: fetched" >> ${repRID}.getData.log - - fastqCount=\$(ls *.fastq.gz | wc -l) + if [ "${params.fastqsForce}" == "" ] + then + echo -e "LOG: fetching replicate bdbag" >> ${repRID}.getData.log + fastqCount=\$(sh ${script_bdbagFetch} \${replicate::-13} ${repRID}) + echo -e "LOG: fetched" >> ${repRID}.getData.log + else + echo -e "LOG: fastq override detected, not fetching fastqs" >> ${repRID}.getData.log + fastqCount="0" + fi + if [ "\${fastqCount}" == "0" ] then touch dummy.R1.fastq.gz + touch dummy.R2.fastq.gz fi echo "\${fastqCount}" > fastqCount.csv + + cp "${repRID}_inputBag/data/Experiment Settings.csv" ExperimentSettings.csv """ } // Split fastq count into channel +fastqCountTemp = Channel.create() fastqCount = Channel.create() fastqCount_fl.splitCsv(sep: ",", header: false).separate( - fastqCount + fastqCountTemp ) // Set raw fastq to downloaded or forced input and replicate them for multiple process inputs @@ -297,19 +299,31 @@ if (fastqsForce != "") { .fromPath(fastqsForce) .ifEmpty { exit 1, "override inputBag file not found: ${fastqsForce}" } .collect().into { + fastqs_seqwho + fastqs_trimData fastqs_parseMetadata fastqs_fastqc } + Channel + .fromPath(fastqsForce) + .count().set { + fastqCount + } } else { fastqs.collect().into { + fastqs_seqwho + fastqs_trimData fastqs_parseMetadata fastqs_fastqc } + fastqCountTemp.set { + fastqCount + } } /* * parseMetadata: parses metadata to extract experiment parameters -*/ + */ process parseMetadata { tag "${repRID}" @@ -464,7 +478,7 @@ process parseMetadata { """ } -// Split metadata into separate channels +// Split metadata into separate channels and replicate them for multiple process inputs endsMeta = Channel.create() endsRaw = Channel.create() endsManual = 
Channel.create() @@ -485,16 +499,16 @@ metadata_fl.splitCsv(sep: ",", header: false).separate( expRID, studyRID ) - -// Replicate metadata for multiple process inputs endsMeta.into { endsMeta_checkMetadata endsMeta_aggrQC endsMeta_failExecutionRun } endsManual.into { + endsManual_seqwho endsManual_trimData endsManual_downsampleData + endsManual_alignSampleDataERCC endsManual_alignSampleData endsManual_aggrQC } @@ -510,6 +524,7 @@ spikeMeta.into { spikeMeta_failExecutionRun } speciesMeta.into { + speciesMeta_seqwho speciesMeta_checkMetadata speciesMeta_aggrQC speciesMeta_failPreExecutionRun @@ -526,7 +541,7 @@ expRID.into { expRID_uploadProcessedFile } -// Split fastq count error into separate channel +// Split fastq count error into separate channel and replicate them for multiple process inputs fastqCountError = Channel.create() fastqCountError_details = Channel.create() fastqReadError = Channel.create() @@ -537,72 +552,73 @@ fastqError_fl.splitCsv(sep: ",", header: false).separate( fastqReadError, fastqReadError_details ) - -// Replicate errors for multiple process inputs fastqCountError.into { fastqCountError_fastqc + fastqCountError_seqwho + fastqCountError_getRefERCC + fastqCountError_getRef fastqCountError_trimData - fastqCountError_getRefInfer fastqCountError_downsampleData + fastqCountError_alignSampleDataERCC fastqCountError_alignSampleData fastqCountError_inferMetadata fastqCountError_checkMetadata - fastqCountError_uploadExecutionRun - fastqCountError_getRef fastqCountError_alignData fastqCountError_dedupData fastqCountError_makeBigWig fastqCountError_countData fastqCountError_dataQC fastqCountError_aggrQC + fastqCountError_uploadExecutionRun fastqCountError_uploadQC - fastqCountError_uploadQC_fail fastqCountError_uploadProcessedFile fastqCountError_uploadOutputBag - fastqCountError_failPreExecutionRun_fastq + fastqCountError_finalizeExecutionRun + fastqCountError_uploadQC_fail } fastqReadError.into { fastqReadError_fastqc + fastqReadError_seqwho + fastqReadError_getRefERCC + fastqReadError_getRef fastqReadError_trimData - fastqReadError_getRefInfer fastqReadError_downsampleData + fastqReadError_alignSampleDataERCC fastqReadError_alignSampleData fastqReadError_inferMetadata fastqReadError_checkMetadata - fastqReadError_uploadExecutionRun - fastqReadError_getRef fastqReadError_alignData fastqReadError_dedupData fastqReadError_makeBigWig fastqReadError_countData fastqReadError_dataQC fastqReadError_aggrQC + fastqReadError_uploadExecutionRun fastqReadError_uploadQC - fastqReadError_uploadQC_fail fastqReadError_uploadProcessedFile fastqReadError_uploadOutputBag - fastqReadError_failPreExecutionRun_fastq + fastqReadError_finalizeExecutionRun + fastqReadError_uploadQC_fail } /* - *fastqc: run fastqc on untrimmed fastq's -*/ + * fastqc: run fastqc on untrimmed fastq's + */ process fastqc { tag "${repRID}" input: - path (fastq) from fastqs_fastqc.collect() - val fastqCountError_fastqc - val fastqReadError_fastqc + path (fastq) from fastqs_fastqc + val fastqCountError from fastqCountError_fastqc + val fastqReadError from fastqReadError_fastqc output: - path ("*.R{1,2}.fastq.gz", includeInputs:true) into fastqs_trimData path ("*_fastqc.zip") into fastqc path ("rawReads.csv") into rawReadsInfer_fl path "fastqFileError.csv" into fastqFileError_fl when: - fastqCountError_fastqc == 'false' && fastqReadError_fastqc == 'false' + fastqCountError == "false" && fastqReadError == "false" script: """ @@ -633,178 +649,429 @@ process fastqc { """ } -// Extract number of raw reads metadata into channel 
+// Extract number of raw reads metadata into channel and replicate them for multiple process inputs rawReadsInfer = Channel.create() rawReadsInfer_fl.splitCsv(sep: ",", header: false).separate( rawReadsInfer ) - -// Replicate inferred raw reads for multiple process inputs rawReadsInfer.into { rawReadsInfer_aggrQC rawReadsInfer_uploadQC } -// Split fastq count error into separate channel +// Split fastq file error into separate channel and replicate them for multiple process inputs fastqFileError = Channel.create() fastqFileError_details = Channel.create() fastqFileError_fl.splitCsv(sep: ",", header: false).separate( fastqFileError, fastqFileError_details ) - -// Replicate errors for multiple process inputs fastqFileError.into { - fastqFileError_fastqc fastqFileError_trimData - fastqFileError_getRefInfer + fastqFileError_getRef fastqFileError_downsampleData + fastqFileError_alignSampleDataERCC fastqFileError_alignSampleData fastqFileError_inferMetadata fastqFileError_checkMetadata - fastqFileError_uploadExecutionRun - fastqFileError_getRef fastqFileError_alignData fastqFileError_dedupData fastqFileError_makeBigWig fastqFileError_countData fastqFileError_dataQC fastqFileError_aggrQC + fastqFileError_uploadExecutionRun fastqFileError_uploadQC - fastqFileError_uploadQC_fail fastqFileError_uploadProcessedFile fastqFileError_uploadOutputBag - fastqFileError_failPreExecutionRun_fastqFile + fastqFileError_finalizeExecutionRun + fastqFileError_uploadQC_fail } /* - * trimData: trims any adapter or non-host sequences from the data -*/ -process trimData { + * seqwho: run seqwho to infer species and seq type + */ +process seqwho { tag "${repRID}" input: - path (fastq) from fastqs_trimData - val ends from endsManual_trimData - val fastqCountError_trimData - val fastqReadError_trimData - val fastqFileError_trimData + path (fastq) from fastqs_seqwho + val ends from endsManual_seqwho + val speciesMeta from speciesMeta_seqwho + val fastqCountError from fastqCountError_seqwho + val fastqReadError from fastqReadError_seqwho output: - path ("*.fq.gz") into fastqsTrim - path ("*_trimming_report.txt") into trimQC - path ("readLength.csv") into readLengthInfer_fl + path "seqwhoInfer.tsv" into seqwhoInfer + path "inferSpecies.csv" into inferSpecies_fl + path "inferError.csv" into inferError_fl when: - fastqCountError_trimData == "false" - fastqReadError_trimData == "false" - fastqFileError_trimData == "false" + fastqCountError == "false" && fastqReadError == "false" script: """ - hostname > ${repRID}.trimData.log - ulimit -a >> ${repRID}.trimData.log + hostname > ${repRID}.seqwho.log + ulimit -a >> ${repRID}.seqwho.log - # trim fastq's using trim_galore and extract median read length - echo -e "LOG: trimming ${ends}" >> ${repRID}.trimData.log - if [ "${ends}" == "se" ] + # get seqwho index + wget -O SeqWho.ix https://cloud.biohpc.swmed.edu/index.php/s/eeNWqZz8jqN5zWY/download + echo -e "LOG: seqwho index downloaded" >> ${repRID}.seqwho.log + + # run seqwho + seqwho.py -f *.fastq.gz -x SeqWho.ix + echo -e "LOG: seqwho ran" >> ${repRID}.seqwho.log + + # parse inference from R1 + speciesR1=\$(cat SeqWho_call.tsv | grep ${fastq[0]} | cut -f18 -d\$'\t' | cut -f2 -d":" | tr -d " ") + seqtypeR1=\$(cat SeqWho_call.tsv | grep ${fastq[0]} | cut -f19 -d\$'\t' | cut -f2 -d":" | tr -d " ") + confidenceR1=\$(cat SeqWho_call.tsv | grep ${fastq[0]} | cut -f17 -d\$'\t' | cut -f2 -d":" | tr -d " ") + if [ "\${confidenceR1}" == "low" ] + then + speciesConfidenceR1=\$(cat SeqWho_call.tsv | grep ${fastq[0]} | cut -f17 -d\$'\t' | cut 
-f3 -d":" | tr -d " ")
+ seqtypeConfidenceR1=\$(cat SeqWho_call.tsv | grep ${fastq[0]} | cut -f17 -d\$'\t' | cut -f4 -d":" | tr -d " ")
+ else
+ speciesConfidenceR1="1"
+ seqtypeConfidenceR1="1"
+ fi
+ echo -e "LOG: R1 inference parsed" >> ${repRID}.seqwho.log
+
+ # parse inference from R2
+ if [ "${ends}" == "pe" ]
 then
- trim_galore --gzip -q 25 --length 35 --basename ${repRID} ${fastq[0]}
- readLength=\$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
- elif [ "${ends}" == "pe" ]
+ speciesR2=\$(cat SeqWho_call.tsv | grep ${fastq[1]} | cut -f18 -d\$'\t' | cut -f2 -d":" | tr -d " ")
+ seqtypeR2=\$(cat SeqWho_call.tsv | grep ${fastq[1]} | cut -f19 -d\$'\t' | cut -f2 -d":" | tr -d " ")
+ confidenceR2=\$(cat SeqWho_call.tsv | grep ${fastq[1]} | cut -f17 -d\$'\t' | cut -f2 -d":" | tr -d " ")
+ if [ "\${confidenceR2}" == "low" ]
+ then
+ speciesConfidenceR2=\$(cat SeqWho_call.tsv | grep ${fastq[1]} | cut -f17 -d\$'\t' | cut -f3 -d":" | tr -d " ")
+ seqtypeConfidenceR2=\$(cat SeqWho_call.tsv | grep ${fastq[1]} | cut -f17 -d\$'\t' | cut -f4 -d":" | tr -d " ")
+ else
+ speciesConfidenceR2="1"
+ seqtypeConfidenceR2="1"
+ fi
+ echo -e "LOG: R2 inference parsed" >> ${repRID}.seqwho.log
+ else
+ speciesR2=\${speciesR1}
+ seqtypeR2=\${seqtypeR1}
+ confidenceR2=\${confidenceR1}
+ speciesConfidenceR2="1"
+ seqtypeConfidenceR2="1"
+ fi
+ cp SeqWho_call.tsv SeqWho_call_full.tsv
+
+ speciesErrorSeqwho=false
+ speciesErrorSeqwho_details=""
+ seqtypeError=false
+ seqtypeError_details=""
+
+ # convert numeric confidence to string
+ if [ \${speciesConfidenceR1} == "1" ]
 then
- trim_galore --gzip -q 25 --length 35 --paired --basename ${repRID} ${fastq[0]} ${fastq[1]}
- readLength=\$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
+ speciesConfidenceR1="high"
+ else
+ speciesConfidenceR1="low"
 fi
- echo -e "LOG: trimmed" >> ${repRID}.trimData.log
- echo -e "LOG: average trimmed read length: \${readLength}" >> ${repRID}.trimData.log
+ if [ \${speciesConfidenceR2} == "1" ]
+ then
+ speciesConfidenceR2="high"
+ else
+ speciesConfidenceR2="low"
+ fi
+ if [ \${seqtypeConfidenceR1} == "1" ]
+ then
+ seqtypeConfidenceR1="high"
+ else
+ seqtypeConfidenceR1="low"
+ fi
+ if [ \${seqtypeConfidenceR2} == "1" ]
+ then
+ seqtypeConfidenceR2="high"
+ else
+ seqtypeConfidenceR2="low"
+ fi
+ echo -e "LOG: confidence converted to string" >> ${repRID}.seqwho.log
- # save read length file
- echo "\${readLength}" > readLength.csv
+ # set species
+ if [ "\${speciesR1}" == "\${speciesR2}" ]
+ then
+ speciesInfer=\${speciesR1}
+ if [ "\${speciesInfer}" == "human" ]
+ then
+ speciesInfer="Homo sapiens"
+ elif [ "\${speciesInfer}" == "mouse" ]
+ then
+ speciesInfer="Mus musculus"
+ fi
+ echo -e "LOG: concordant species inference: \${speciesInfer}" >> ${repRID}.seqwho.log
+ else
+ speciesErrorSeqwho=true
+ speciesErrorSeqwho_details="**Inferred species does not match for R1 and R2:** Inferred R1 = \${speciesR1} and inferred R2 = \${speciesR2}"
+ echo -e "LOG: inference error: \${speciesErrorSeqwho_details}" >> ${repRID}.seqwho.log
+ fi
+
+ # detect species confidence errors
+ if [ "\${speciesConfidenceR1}" == "high" ] && [ "\${speciesConfidenceR2}" == "high" ]
+ then
+ echo -e "LOG: high confidence species inference detected" >> ${repRID}.seqwho.log
+ else
+ speciesErrorSeqwho=true
+ speciesErrorSeqwho_details=\$(echo "**Inferred species confidence is low:**\\n")
+ speciesErrorSeqwho_details=\$(echo \${speciesErrorSeqwho_details}"|fastq|Inferred species confidence|\\n")
+ speciesErrorSeqwho_details=\$(echo \${speciesErrorSeqwho_details}"|:--|:--:|\\n")
+ speciesErrorSeqwho_details=\$(echo \${speciesErrorSeqwho_details}"|Read 1|\${speciesConfidenceR1}|\\n")
+ if [ "${ends}" == "pe" ]
+ then
+ speciesErrorSeqwho_details=\$(echo \${speciesErrorSeqwho_details}"|Read 2|\${speciesConfidenceR2}|\\n")
+ fi
+ echo -e "LOG: inference error: \${speciesErrorSeqwho_details}" >> ${repRID}.seqwho.log
+ fi
+
+ # detect seq type errors and set type
+ if [ "\${seqtypeConfidenceR1}" == "high" ] && [ "\${seqtypeConfidenceR2}" == "high" ]
+ then
+ echo -e "LOG: high confidence seq type inference detected" >> ${repRID}.seqwho.log
+ # set seq type
+ if [ "\${seqtypeR1}" == "\${seqtypeR2}" ]
+ then
+ if [ "\${seqtypeR1}" == "rnaseq" ]
+ then
+ seqtypeInfer="rnaseq"
+ echo -e "LOG: concordant rnaseq seq type inference detected" >> ${repRID}.seqwho.log
+ else
+ seqtypeError=true
+ seqtypeError_details="**Inferred sequencing type is not mRNA-seq:** Inferred = \${seqtypeR1}"
+ echo -e "LOG: inference error: \${seqtypeError_details}" >> ${repRID}.seqwho.log
+ fi
+ else
+ seqtypeError=true
+ seqtypeError_details="**Inferred sequencing type does not match for R1 and R2:** Inferred R1 = \${seqtypeR1} and inferred R2 = \${seqtypeR2}"
+ echo -e "LOG: inference error: \${seqtypeError_details}" >> ${repRID}.seqwho.log
+ fi
+ consensus="-"
+ else
+ echo -e "LOG: low confidence seq type inference detected" >> ${repRID}.seqwho.log
+ seqtk sample -s100 ${fastq[0]} 1000000 1> sampled.1.seed100.fastq &
+ seqtk sample -s200 ${fastq[0]} 1000000 1> sampled.1.seed200.fastq &
+ seqtk sample -s300 ${fastq[0]} 1000000 1> sampled.1.seed300.fastq &
+ wait
+ gzip sampled.1.seed100.fastq &
+ gzip sampled.1.seed200.fastq &
+ gzip sampled.1.seed300.fastq &
+ wait
+ seqwho.py -f sampled.1.seed*.fastq.gz -x SeqWho.ix
+ seqtypeR1_1=\$(cat SeqWho_call.tsv | grep sampled.1.seed100.fastq.gz | cut -f19 -d\$'\t' | cut -f2 -d":" | tr -d " ")
+ seqtypeR1_2=\$(cat SeqWho_call.tsv | grep sampled.1.seed200.fastq.gz | cut -f19 -d\$'\t' | cut -f2 -d":" | tr -d " ")
+ seqtypeR1_3=\$(cat SeqWho_call.tsv | grep sampled.1.seed300.fastq.gz | cut -f19 -d\$'\t' | cut -f2 -d":" | tr -d " ")
+ cp SeqWho_call.tsv SeqWho_call_sampledR1.tsv
+ if [ "\${seqtypeR1_1}" == "\${seqtypeR1}" ] && [ "\${seqtypeR1_2}" == "\${seqtypeR1}" ] && [ "\${seqtypeR1_3}" == "\${seqtypeR1}" ]
+ then
+ consensus=true
+ else
+ consensus=false
+ fi
+ if [ "${ends}" == "pe" ]
+ then
+ seqtk sample -s100 ${fastq[1]} 1000000 1> sampled.2.seed100.fastq &
+ seqtk sample -s200 ${fastq[1]} 1000000 1> sampled.2.seed200.fastq &
+ seqtk sample -s300 ${fastq[1]} 1000000 1> sampled.2.seed300.fastq &
+ wait
+ gzip sampled.2.seed100.fastq &
+ gzip sampled.2.seed200.fastq &
+ gzip sampled.2.seed300.fastq &
+ wait
+ seqwho.py -f sampled.2.seed*.fastq.gz -x SeqWho.ix
+ seqtypeR2_1=\$(cat SeqWho_call.tsv | grep sampled.2.seed100.fastq.gz | cut -f19 -d\$'\t' | cut -f2 -d":" | tr -d " ")
+ seqtypeR2_2=\$(cat SeqWho_call.tsv | grep sampled.2.seed200.fastq.gz | cut -f19 -d\$'\t' | cut -f2 -d":" | tr -d " ")
+ seqtypeR2_3=\$(cat SeqWho_call.tsv | grep sampled.2.seed300.fastq.gz | cut -f19 -d\$'\t' | cut -f2 -d":" | tr -d " ")
+ cp SeqWho_call.tsv SeqWho_call_sampledR2.tsv
+ if [ "\${seqtypeR2_1}" == "\${seqtypeR1}" ] && [ "\${seqtypeR2_2}" == "\${seqtypeR1}" ] && [ "\${seqtypeR2_3}" == "\${seqtypeR1}" ]
+ then
+ consensus=\${consensus}
+ else
+ consensus=false
+ fi
+ fi
+ if [ \${consensus} == false ]
+ then
+ seqtypeError=true
+ seqtypeError_details=\$(echo "**Inferred sequencing type confidence is low:**\\n")
+ seqtypeError_details=\$(echo \${seqtypeError_details}"|fastq|Inferred seq type|Inferred seq type confidence|\\n")
+ seqtypeError_details=\$(echo \${seqtypeError_details}"|:--|:--:|:--:|\\n")
+ seqtypeError_details=\$(echo \${seqtypeError_details}"|Read 1|\${seqtypeR1}|\${seqtypeConfidenceR1}|\\n")
+ if [ "${ends}" == "pe" ]
+ then
+ seqtypeError_details=\$(echo \${seqtypeError_details}"|Read 2|\${seqtypeR2}|\${seqtypeConfidenceR2}|\\n")
+ fi
+ echo -e "LOG: inference error: \${seqtypeError_details}" >> ${repRID}.seqwho.log
+ fi
+ fi
+
+ # check for species match error
+ if [ "${speciesMeta}" != "\${speciesInfer}" ]
+ then
+ if [ "${params.speciesForce}" != "" ]
+ then
+ speciesError=false
+ echo -e "LOG: species forced: Submitted=${speciesMeta}; Inferred=\${speciesInfer}; Forced=${params.speciesForce}" >> ${repRID}.seqwho.log
+ else
+ speciesError=true
+ echo -e "LOG: species does not match: Submitted=${speciesMeta}; Inferred=\${speciesInfer}" >> ${repRID}.seqwho.log
+ fi
+ else
+ speciesError=false
+ echo -e "LOG: species matches: Submitted=${speciesMeta}; Inferred=\${speciesInfer}" >> ${repRID}.seqwho.log
+ fi
+
+ # save seqwho multiqc report
+ echo -e "Read\tSeq Type\tSpecies\tSeq Type Confidence\tSeq Type Consensus\tSpecies Confidence" > seqwhoInfer.tsv
+ echo -e "Read 1\t\${seqtypeR1}\t\${speciesR1}\t\${seqtypeConfidenceR1}\t\${consensus}\t\${speciesConfidenceR1}" >> seqwhoInfer.tsv
+ if [ "${ends}" == "pe" ]
+ then
+ echo -e "Read 2\t\${seqtypeR2}\t\${speciesR2}\t\${seqtypeConfidenceR2}\t\${consensus}\t\${speciesConfidenceR2}" >> seqwhoInfer.tsv
+ fi
+
+ # save species file
+ echo "\${speciesInfer}" > inferSpecies.csv
+
+ # save error file
+ echo "\${seqtypeError},\${seqtypeError_details},\${speciesErrorSeqwho},\${speciesErrorSeqwho_details},\${speciesError}" > inferError.csv
 """
 }
-// Extract calculated read length metadata into channel
-readLengthInfer = Channel.create()
-readLengthInfer_fl.splitCsv(sep: ",", header: false).separate(
- readLengthInfer
+// Extract inferred species metadata into channel and replicate them for multiple process inputs
+speciesInfer = Channel.create()
+inferSpecies_fl.splitCsv(sep: ",", header: false).separate(
+ speciesInfer
 )
+speciesInfer.into {
+ speciesInfer_getRef
+ speciesInfer_alignSampleData
+ speciesInfer_checkMetadata
+ speciesInfer_aggrQC
+ speciesInfer_uploadExecutionRun
+ speciesInfer_uploadProcessedFile
+ speciesInfer_failExecutionRun
+}
-// Replicate inferred read length for multiple process inputs
-readLengthInfer.into {
- readLengthInfer_aggrQC
- readLengthInfer_uploadQC
+// Extract seq type and species error into separate channels and replicate them for multiple process inputs
+seqtypeError = Channel.create()
+seqtypeError_details = Channel.create()
+speciesErrorSeqwho = Channel.create()
+speciesErrorSeqwho_details = Channel.create()
+speciesError = Channel.create()
+inferError_fl.splitCsv(sep: ",", header: false).separate(
+ seqtypeError,
+ seqtypeError_details,
+ speciesErrorSeqwho,
+ speciesErrorSeqwho_details,
+ speciesError
+)
+seqtypeError.into {
+ seqtypeError_trimData
+ seqtypeError_getRef
+ seqtypeError_downsampleData
+ seqtypeError_alignSampleDataERCC
+ seqtypeError_alignSampleData
+ seqtypeError_inferMetadata
+ seqtypeError_checkMetadata
+ seqtypeError_alignData
+ seqtypeError_dedupData
+ seqtypeError_makeBigWig
+ seqtypeError_countData
+ seqtypeError_dataQC
+ seqtypeError_aggrQC
+
seqtypeError_uploadExecutionRun + seqtypeError_uploadQC + seqtypeError_uploadProcessedFile + seqtypeError_uploadOutputBag + seqtypeError_finalizeExecutionRun + seqtypeError_uploadQC_fail } -// Replicate trimmed fastq's for multiple process inputs -fastqsTrim.into { - fastqsTrim_alignData - fastqsTrim_downsampleData +speciesErrorSeqwho.into { + speciesErrorSeqwho_trimData + speciesErrorSeqwho_getRef + speciesErrorSeqwho_downsampleData + speciesErrorSeqwho_alignSampleDataERCC + speciesErrorSeqwho_alignSampleData + speciesErrorSeqwho_inferMetadata + speciesErrorSeqwho_checkMetadata + speciesErrorSeqwho_alignData + speciesErrorSeqwho_dedupData + speciesErrorSeqwho_makeBigWig + speciesErrorSeqwho_countData + speciesErrorSeqwho_dataQC + speciesErrorSeqwho_aggrQC + speciesErrorSeqwho_uploadExecutionRun + speciesErrorSeqwho_uploadQC + speciesErrorSeqwho_uploadProcessedFile + speciesErrorSeqwho_uploadOutputBag + speciesErrorSeqwho_finalizeExecutionRun + speciesErrorSeqwho_uploadQC_fail +} +speciesError.into { + speciesError_trimData + speciesError_getRef + speciesError_downsampleData + speciesError_alignSampleDataERCC + speciesError_alignSampleData + speciesError_inferMetadata + speciesError_checkMetadata + speciesError_alignData + speciesError_dedupData + speciesError_makeBigWig + speciesError_countData + speciesError_dataQC + speciesError_aggrQC + speciesError_uploadExecutionRun + speciesError_uploadQC + speciesError_uploadProcessedFile + speciesError_uploadOutputBag + speciesError_finalizeExecutionRun + speciesError_uploadQC_fail } - -// Combine inputs of getRefInfer -getRefInferInput = referenceInfer.combine(deriva_getRefInfer.combine(script_refDataInfer.combine(fastqCountError_getRefInfer.combine(fastqReadError_getRefInfer.combine(fastqFileError_getRefInfer))))) /* - * getRefInfer: dowloads appropriate reference for metadata inference -*/ -process getRefInfer { - tag "${refName}" + * getRefERCC: downloads ERCC reference for spike metadata inference + */ +process getRefERCC { + tag "${repRID}" input: - tuple val (refName), path (credential, stageAs: "credential.json"), path (script_refDataInfer), val (fastqCountError), val (fastqReadError), val (fastqFileError) from getRefInferInput + path (credential, stageAs: "credential.json") from deriva_getRefERCC + path script_refDataInfer + val fastqCountError from fastqCountError_getRefERCC + val fastqReadError from fastqReadError_getRefERCC output: - tuple val (refName), path ("hisat2", type: 'dir'), path ("*.fna"), path ("*.gtf") into refInfer - path ("${refName}", type: 'dir') into bedInfer + tuple path ("hisat2", type: 'dir'), path ("*.fna"), path ("*.gtf") into refERCC when: - fastqCountError == "false" - fastqReadError == "false" - fastqFileError == "false" + fastqCountError == "false" && fastqReadError == "false" script: """ - hostname > ${repRID}.${refName}.getRefInfer.log - ulimit -a >> ${repRID}.${refName}.getRefInfer.log + hostname > ${repRID}.getRefERCC.log + ulimit -a >> ${repRID}.getRefERCC.log # link credential file for authentication - echo -e "LOG: linking deriva credentials" >> ${repRID}.${refName}.getRefInfer.log + echo -e "LOG: linking deriva credentials" >> ${repRID}.getRefERCC.log mkdir -p ~/.deriva ln -sf `readlink -e credential.json` ~/.deriva/credential.json - echo -e "LOG: linked" >> ${repRID}.${refName}.getRefInfer.log + echo -e "LOG: linked" >> ${repRID}.getRefERCC.log # set the reference name - if [ "${refName}" == "ERCC" ] - then - references=\$(echo ${referenceBase}/ERCC${refERCCVersion}) - elif [ "${refName}" == "GRCm" ] 
- then - references=\$(echo ${referenceBase}/GRCm${refMoVersion}) - elif [ '${refName}' == "GRCh" ] - then - references=\$(echo ${referenceBase}/GRCh${refHuVersion}) - else - echo -e "LOG: ERROR - References could not be set!\nReference found: ${referenceBase}" >> ${repRID}.${refName}.getRefInfer.log - exit 1 - fi + references=\$(echo ${referenceBase}/ERCC${refERCCVersion}) # retreive appropriate reference appropriate location - echo -e "LOG: fetching ${refName} reference files from ${referenceBase}" >> ${repRID}.${refName}.getRefInfer.log - if [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references/new" ] + echo -e "LOG: fetching ERCC reference files from ${referenceBase}" >> ${repRID}.getRefERCC.log + if [ "${referenceBase}" == "/project/BICF/BICF_Core/shared/gudmap/references/new" ] then unzip \${references}.zip mv \$(basename \${references})/data/* . - elif [ params.refSource == "datahub" ] + elif [ "${params.refSource}" == "datahub" ] then - GRCv=\$(echo \${references} | grep -o ${refName}.* | cut -d '.' -f1) - GRCp=\$(echo \${references} | grep -o ${refName}.* | cut -d '.' -f2) - GENCODE=\$(echo \${references} | grep -o ${refName}.* | cut -d '.' -f3) - if [ "${refName}" != "ERCC" ] - then - query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='\${GRCv}'.'\${GRCp}'/Annotation_Version=GENCODE%20'\${GENCODE}) - else - query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version=${refName}${refERCCVersion}/Annotation_Version=${refName}${refERCCVersion}') - fi + query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='ERCC${refERCCVersion}'/Annotation_Version='ERCC${refERCCVersion}'/Used_Spike_Ins=false') curl --request GET \${query} > refQuery.json refURL=\$(python ${script_refDataInfer} --returnParam URL) loc=\$(dirname \${refURL}) @@ -818,15 +1085,74 @@ process getRefInfer { fi mv ./annotation/genome.gtf . mv ./sequence/genome.fna . 
- mkdir ${refName} - if [ "${refName}" != "ERCC" ] + echo -e "LOG: fetched" >> ${repRID}.getRefERCC.log + """ +} + +/* + * trimData: trims any adapter or non-host sequences from the data + */ +process trimData { + tag "${repRID}" + + input: + path (fastq) from fastqs_trimData + val ends from endsManual_trimData + val fastqCountError from fastqCountError_trimData + val fastqReadError from fastqReadError_trimData + val fastqFileError from fastqFileError_trimData + val seqtypeError from seqtypeError_trimData + val speciesErrorSeqwho from speciesErrorSeqwho_trimData + val speciesError from speciesError_trimData + + output: + path ("*.fq.gz") into fastqsTrim + path ("*_trimming_report.txt") into trimQC + path ("readLength.csv") into readLengthInfer_fl + + when: + fastqCountError == "false" && fastqReadError == "false" && fastqFileError == "false" && seqtypeError == "false" && speciesErrorSeqwho == "false" && speciesError == "false" + + script: + """ + hostname > ${repRID}.trimData.log + ulimit -a >> ${repRID}.trimData.log + echo fastqFileError ${fastqFileError} + + # trim fastq's using trim_galore and extract median read length + echo -e "LOG: trimming ${ends}" >> ${repRID}.trimData.log + if [ "${ends}" == "se" ] then - mv ./annotation/genome.bed ./${refName} + trim_galore --gzip -q 25 --length 35 --basename ${repRID} ${fastq[0]} + readLength=\$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}') + elif [ "${ends}" == "pe" ] + then + trim_galore --gzip -q 25 --length 35 --paired --basename ${repRID} ${fastq[0]} ${fastq[1]} + readLength=\$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}') fi - echo -e "LOG: fetched" >> ${repRID}.${refName}.getRefInfer.log + echo -e "LOG: trimmed" >> ${repRID}.trimData.log + echo -e "LOG: average trimmed read length: \${readLength}" >> ${repRID}.trimData.log + + # save read length file + echo "\${readLength}" > readLength.csv """ } +// Extract calculated read length metadata into channel and replicate them for multiple process inputs +readLengthInfer = Channel.create() +readLengthInfer_fl.splitCsv(sep: ",", header: false).separate( + readLengthInfer +) +readLengthInfer.into { + readLengthInfer_aggrQC + readLengthInfer_uploadQC +} +// Replicate trimmed fastq's for multiple process inputs +fastqsTrim.into { + fastqsTrim_downsampleData + fastqsTrim_alignData +} + /* * downsampleData: downsample fastq's for metadata inference */ @@ -836,18 +1162,18 @@ process downsampleData { input: path fastq from fastqsTrim_downsampleData val ends from endsManual_downsampleData - val fastqCountError_downsampleData - val fastqReadError_downsampleData - val fastqFileError_downsampleData + val fastqCountError from fastqCountError_downsampleData + val fastqReadError from fastqReadError_downsampleData + val fastqFileError from fastqFileError_downsampleData + val seqtypeError from seqtypeError_downsampleData + val speciesErrorSeqwho from speciesErrorSeqwho_downsampleData + val speciesError from speciesError_downsampleData output: - path ("sampled.1.fq") into fastqs1Sample - path ("sampled.2.fq") into fastqs2Sample + path ("sampled.{1,2}.fq") into fastqsSample when: - fastqCountError_downsampleData == "false" - fastqReadError_downsampleData == "false" - fastqFileError_downsampleData == "false" + fastqCountError == "false" && fastqReadError == "false" && fastqFileError == "false" && seqtypeError == 
"false" && speciesErrorSeqwho == "false" && speciesError == "false" script: """ @@ -870,242 +1196,366 @@ process downsampleData { """ } -// Replicate the dowsampled fastq's and attatched to the references -inferInput = endsManual_alignSampleData.combine(refInfer.combine(fastqs1Sample.collect().combine(fastqs2Sample.collect().combine(fastqCountError_alignSampleData.combine(fastqReadError_alignSampleData.combine(fastqFileError_alignSampleData)))))) +// Replicate sampled fastq's for multiple process inputs +fastqsSample.into { + fastqsSample_alignSampleDataERCC + fastqsSample_alignSampleData +} /* - * alignSampleData: aligns the downsampled reads to a reference database -*/ -process alignSampleData { - tag "${ref}" + * alignSampleDataERCC: aligns the downsampled reads to the ERCC reference and infers spike in + */ +process alignSampleDataERCC { + tag "${repRID}" input: - tuple val (ends), val (ref), path (hisat2), path (fna), path (gtf), path (fastq1), path (fastq2), val (fastqCountError), val (fastqReadError), val (fastqFileError) from inferInput + val ends from endsManual_alignSampleDataERCC + tuple path (hisat2), path (fna), path (gtf) from refERCC + path fastq from fastqsSample_alignSampleDataERCC + val spikeForce + val fastqCountError from fastqCountError_alignSampleDataERCC + val fastqReadError from fastqReadError_alignSampleDataERCC + val fastqFileError from fastqFileError_alignSampleDataERCC + val seqtypeError from seqtypeError_alignSampleDataERCC + val speciesErrorSeqwho from speciesErrorSeqwho_alignSampleDataERCC + val speciesError from speciesError_alignSampleDataERCC output: - path ("${ref}.sampled.sorted.bam") into sampleBam - path ("${ref}.sampled.sorted.bam.bai") into sampleBai - path ("${ref}.alignSampleSummary.txt") into alignSampleQC + path "inferSpike.csv" into inferSpike_fl + path ("ERCC.alignSampleSummary.txt") into alignSampleQC_ERCC when: - fastqCountError == "false" - fastqReadError == "false" - fastqFileError == "false" + fastqCountError == "false" && fastqReadError == "false" && fastqFileError == "false" && seqtypeError == "false" && speciesError == "false" script: """ - hostname > ${repRID}.${ref}.alignSampleData.log - ulimit -a >> ${repRID}.${ref}.alignSampleData.log + hostname > ${repRID}.alignSampleDataERCC.log + ulimit -a >> ${repRID}.alignSampleDataERCC.log # align the reads with Hisat2 - echo -e "LOG: aligning ${ends}" >> ${repRID}.${ref}.alignSampleData.log + echo -e "LOG: aligning ${ends}" >> ${repRID}.alignSampleDataERCC.log if [ "${ends}" == "se" ] then - hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome -U ${fastq1} --summary-file ${ref}.alignSampleSummary.txt --new-summary + hisat2 -p `nproc` --add-chrname -S ERCC.sampled.sam -x hisat2/genome -U ${fastq[0]} --summary-file ERCC.alignSampleSummary.txt --new-summary elif [ "${ends}" == "pe" ] then - hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome --no-mixed --no-discordant -1 ${fastq1} -2 ${fastq2} --summary-file ${ref}.alignSampleSummary.txt --new-summary + hisat2 -p `nproc` --add-chrname -S ERCC.sampled.sam -x hisat2/genome --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} --summary-file ERCC.alignSampleSummary.txt --new-summary fi - echo -e "LOG: aliged" >> ${repRID}.${ref}.alignSampleData.log + echo -e "LOG: aliged" >> ${repRID}.alignSampleDataERCC.log # convert the output sam file to a sorted bam file using Samtools - echo -e "LOG: converting from sam to bam" >> ${repRID}.${ref}.alignSampleData.log - samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o 
${ref}.sampled.bam ${ref}.sampled.sam + echo -e "LOG: converting from sam to bam" >> ${repRID}.alignSampleDataERCC.log + samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ERCC.sampled.bam ERCC.sampled.sam # sort the bam file using Samtools - echo -e "LOG: sorting the bam file" >> ${repRID}.${ref}.alignSampleData.log + echo -e "LOG: sorting the bam file" >> ${repRID}.alignSampleDataERCC.log proc=\$(expr `nproc` - 1) mem=\$(vmstat -s -S K | grep 'total memory' | grep -o '[0-9]*') mem=\$(expr \${mem} / \${proc} \\* 85 / 100) - samtools sort -@ \${proc} -m \${mem}K -O BAM -o ${ref}.sampled.sorted.bam ${ref}.sampled.bam + samtools sort -@ \${proc} -m \${mem}K -O BAM -o ERCC.sampled.sorted.bam ERCC.sampled.bam # index the sorted bam using Samtools - echo -e "LOG: indexing sorted bam file" >> ${repRID}.${ref}.alignSampleData.log - samtools index -@ `nproc` -b ${ref}.sampled.sorted.bam ${ref}.sampled.sorted.bam.bai - """ -} + echo -e "LOG: indexing sorted bam file" >> ${repRID}.alignSampleDataERCC.log + samtools index -@ `nproc` -b ERCC.sampled.sorted.bam ERCC.sampled.sorted.bam.bai -alignSampleQC.into { - alignSampleQC_inferMetadata - alignSampleQC_aggrQC -} + # collect alignment rates (round down to integers) + align=\$(echo \$(grep "Overall alignment rate" ERCC.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) + align=\$(echo \${align%.*}) + echo -e "LOG: alignment rate to ERCC: \${align}" >> ${repRID}.alignSampleDataERCC.log -process inferMetadata { - tag "${repRID}" + # determine spike-in + if [ 1 -eq \$(echo \$(expr \${align} ">=" 10)) ] + then + spike="true" + else + spike="false" + fi + echo -e "LOG: inference of strandedness results is: \${spike}" >> ${repRID}.alignSampleDataERCC.log + if [ "${spikeForce}" != "" ] + then + spike=${spikeForce} + echo -e "LOG: spike-in metadata forced: \${spike}" >> ${repRID}.alignSampleDataERCC.log + fi - input: - path script_inferMeta - path beds from bedInfer.collect() - path bam from sampleBam.collect() - path bai from sampleBai.collect() - path alignSummary from alignSampleQC_inferMetadata.collect() - val strandedForce - val spikeForce - val fastqCountError_inferMetadata - val fastqReadError_inferMetadata - val fastqFileError_inferMetadata + # write inferred spike metadata to file + echo "\${spike},\${align}" > inferSpike.csv + """ +} + +// Extract spike in metadata and % aligned to ERCC into channel and replicate them for multiple process inputs +spikeInfer = Channel.create() +alignInferERCC = Channel.create() +inferSpike_fl.splitCsv(sep: ",", header: false).separate( + spikeInfer, + alignInferERCC +) +spikeInfer.into { + spikeInfer_getRef + spikeInfer_checkMetadata + spikeInfer_aggrQC + spikeInfer_uploadExecutionRun + spikeInfer_failExecutionRun +} + +/* + * getRef: downloads appropriate reference + */ +process getRef { + tag "${species}" + + input: + path script_refData + path credential, stageAs: "credential.json" from deriva_getRef + val spike from spikeInfer_getRef + val species from speciesInfer_getRef + val fastqCountError from fastqCountError_getRef + val fastqReadError from fastqReadError_getRef + val fastqFileError from fastqFileError_getRef + val seqtypeError from seqtypeError_getRef + val speciesErrorSeqwho from speciesErrorSeqwho_getRef + val speciesError from speciesError_getRef output: - path "infer.csv" into inferMetadata_fl - path "${repRID}.infer_experiment.txt" into inferExperiment - path "speciesError.csv" into speciesError_fl + tuple path ("hisat2", type: 'dir'), path ("*.bed"), path ("*.fna"), path ("*.gtf"), 
path ("geneID.tsv"), path ("Entrez.tsv") into reference when: - fastqCountError_inferMetadata == "false" - fastqReadError_inferMetadata == "false" - fastqFileError_inferMetadata == "false" + fastqCountError == "false" && fastqReadError == "false" && fastqFileError == "false" && seqtypeError == "false" && speciesErrorSeqwho == "false" && speciesError == "false" script: """ - hostname > ${repRID}.inferMetadata.log - ulimit -a >> ${repRID}.inferMetadata.log + hostname > ${repRID}.getRef.log + ulimit -a >> ${repRID}.getRef.log - # collect alignment rates (round down to integers) - align_ercc=\$(echo \$(grep "Overall alignment rate" ERCC.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) - align_ercc=\$(echo \${align_ercc%.*}) - echo -e "LOG: alignment rate to ERCC: \${align_ercc}" >> ${repRID}.inferMetadata.log - align_hu=\$(echo \$(grep "Overall alignment rate" GRCh.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) - align_hu=\$(echo \${align_hu%.*}) - echo -e "LOG: alignment rate to GRCh: \${align_hu}" >> ${repRID}.inferMetadata.log - align_mo=\$(echo \$(grep "Overall alignment rate" GRCm.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) - align_mo=\$(echo \${align_mo%.*}) - echo -e "LOG: alignment rate to GRCm: \${align_mo}" >> ${repRID}.inferMetadata.log + # link credential file for authentication + echo -e "LOG: linking deriva credentials" >> ${repRID}.getRef.log + mkdir -p ~/.deriva + ln -sf `readlink -e credential.json` ~/.deriva/credential.json + echo -e "LOG: linked" >> ${repRID}.getRef.log - # determine spike-in - if [ 1 -eq \$(echo \$(expr \${align_ercc} ">=" 10)) ] + # set the reference name + if [ "${species}" == "Mus musculus" ] then - spike="true" + reference=\$(echo ${referenceBase}/GRCm${refMoVersion}) + refName=GRCm + elif [ '${species}' == "Homo sapiens" ] + then + reference=\$(echo ${referenceBase}/GRCh${refHuVersion}) + refName=GRCh else - spike="false" + echo -e "LOG: ERROR - References could not be set!\nSpecies reference found: ${species}" >> ${repRID}.getRef.log + exit 1 fi - echo -e "LOG: inference of strandedness results is: \${spike}" >> ${repRID}.inferMetadata.log - if [ "${spikeForce}" != "" ] + if [ "${spike}" == "true" ] then - spike=${spikeForce} - echo -e "LOG: spike-in metadata forced: \${spike}" >> ${repRID}.parseMetadata.log + reference=\$(echo \${reference}-S) + elif [ "${spike}" == "false" ] + then + reference=\$(echo \${reference}) fi + echo -e "LOG: species set to \${reference}" >> ${repRID}.getRef.log - speciesError=false - speciesError_details="" - # determine species - if [ 1 -eq \$(echo \$(expr \${align_hu} ">=" 40)) ] && [ 1 -eq \$(echo \$(expr \${align_mo} "<" 40)) ] + # retreive appropriate reference appropriate location + echo -e "LOG: fetching ${species} reference files from ${referenceBase}" >> ${repRID}.getRef.log + if [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references/new" ] then - species="Homo sapiens" - bam="GRCh.sampled.sorted.bam" - bed="./GRCh/genome.bed" - echo -e "LOG: inference of species results in: \${species}" >> ${repRID}.inferMetadata.log - elif [ 1 -eq \$(echo \$(expr \${align_mo} ">=" 40)) ] && [ 1 -eq \$(echo \$(expr \${align_hu} "<" 40)) ] + echo -e "LOG: grabbing reference files from local (BioHPC)" >> ${repRID}.getRef.log + unzip \${reference}.zip + mv \$(basename \${reference})/data/* . 
+ elif [ "${params.refSource}" == "datahub" ] then - species="Mus musculus" - bam="GRCm.sampled.sorted.bam" - bed="./GRCm/genome.bed" - echo -e "LOG: inference of species results in: \${species}" >> ${repRID}.inferMetadata.log - else - echo -e "LOG: ERROR - inference of species returns an ambiguous result: hu=\${align_hu} mo=\${align_mo}" >> ${repRID}.inferMetadata.log - if [ "${speciesForce}" == "" ] + echo -e "LOG: grabbing reference files from datahub" >> ${repRID}.getRef.log + GRCv=\$(echo \${reference} | grep -o \${refName}.* | cut -d '.' -f1) + GRCp=\$(echo \${reference} | grep -o \${refName}.* | cut -d '.' -f2) + GENCODE=\$(echo \${reference} | grep -o \${refName}.* | cut -d '.' -f3) + if [ "${spike}" == "true" ] then - speciesError=true - speciesError_details="**Inference of species returns an ambiguous result:** Percent aligned to human = \${align_hu} and percent aligned to mouse = \${align_mo}" + query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='\${GRCv}'.'\${GRCp}'/Annotation_Version=GENCODE%20'\${GENCODE}'/Used_Spike_Ins=true') + else + query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='\${GRCv}'.'\${GRCp}'/Annotation_Version=GENCODE%20'\${GENCODE}'/Used_Spike_Ins=false') fi + curl --request GET \${query} > refQuery.json + refURL=\$(python ${script_refData} --returnParam URL) + loc=\$(dirname \${refURL}) + fName=\$(python ${script_refData} --returnParam fName) + fName=\${fName%.*} + if [ "\${loc}" = "/hatrac/*" ]; then echo "LOG: Reference not present in hatrac"; exit 1; fi + filename=\$(echo \$(basename \${refURL}) | grep -oP '.*(?=:)') + deriva-hatrac-cli --host ${referenceBase} get \${refURL} + unzip \$(basename \${refURL}) + mv \${fName}/data/* . fi - if [ "${speciesForce}" != "" ] + echo -e "LOG: fetched" >> ${repRID}.getRef.log + + mv ./annotation/genome.gtf . + mv ./sequence/genome.fna . + mv ./annotation/genome.bed . + mv ./metadata/Entrez.tsv . + mv ./metadata/geneID.tsv . 
+ """ +} + +// Replicate reference for multiple process inputs +reference.into { + reference_alignSampleData + reference_inferMetadata + reference_alignData + reference_countData + reference_dataQC +} +/* + * alignSampleData: aligns the downsampled reads to the appripriate species reference + */ +process alignSampleData { + tag "${repRID}" + + input: + path fastqSample from fastqsSample_alignSampleData + path reference_alignSampleData + val endsManual from endsManual_alignSampleData + val speciesInfer from speciesInfer_alignSampleData + val fastqCountError from fastqCountError_alignSampleData + val fastqReadError from fastqReadError_alignSampleData + val fastqFileError from fastqFileError_alignSampleData + val seqtypeError from seqtypeError_alignSampleData + val speciesErrorSeqwho from speciesErrorSeqwho_alignSampleData + val speciesError from speciesError_alignSampleData + + output: + path ("sampled.bam") into sampledBam + path "align.csv" into align_fl + path ("*.alignSampleSummary.txt") into alignSampleQC + + when: + fastqCountError == "false" && fastqReadError == "false" && fastqFileError == "false" && seqtypeError == "false" && speciesErrorSeqwho == "false" && speciesError == "false" + + script: + """ + hostname > ${repRID}.alignSampleData.log + ulimit -a >> ${repRID}.alignSampleData.log + + # align the sampled reads with Hisat2 + species="${speciesInfer}" + species=\${species// /_} + echo -e "LOG: aligning ${endsManual}" >> ${repRID}.alignSampleData.log + if [ "${endsManual}" == "se" ] then - speciesError=false - echo -e "LOG: species overridden to: ${speciesForce}" - species="${speciesForce}" - if [ "${speciesForce}" == "Homo sapiens" ] - then - bam="GRCh.sampled.sorted.bam" - bed="./GRCh/genome.bed" - elif [ "${speciesForce}" == "Mus musculus" ] - then - bam="GRCm.sampled.sorted.bam" - bed="./GRCm/genome.bed" - fi + hisat2 -p `nproc` --add-chrname -S sampled.sam -x hisat2/genome -U ${fastqSample[0]} --summary-file \${species}.alignSampleSummary.txt --new-summary + elif [ "${endsManual}" == "pe" ] + then + hisat2 -p `nproc` --add-chrname -S sampled.sam -x hisat2/genome --no-mixed --no-discordant -1 ${fastqSample[0]} -2 ${fastqSample[1]} --summary-file \${species}.alignSampleSummary.txt --new-summary fi + echo -e "LOG: aligned sampled reads" >> ${repRID}.alignSampleData.log - if [ "\${speciesError}" == false ] - then - # infer experimental setting from dedup bam - echo -e "LOG: infer experimental setting from dedup bam" >> ${repRID}.inferMetadata.log - infer_experiment.py -r "\${bed}" -i "\${bam}" 1>> ${repRID}.infer_experiment.txt - echo -e "LOG: inferred" >> ${repRID}.inferMetadata.log + # collect alignment rates (round down to integers) + align=\$(echo \$(grep "Overall alignment rate" \${species}.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) + align=\$(echo \${align%.*}) - ended=`bash ${script_inferMeta} endness ${repRID}.infer_experiment.txt` - fail=`bash ${script_inferMeta} fail ${repRID}.infer_experiment.txt` - if [ \${ended} == "PairEnd" ] - then - ends="pe" - percentF=`bash ${script_inferMeta} pef ${repRID}.infer_experiment.txt` - percentR=`bash ${script_inferMeta} per ${repRID}.infer_experiment.txt` - elif [ \${ended} == "SingleEnd" ] - then - ends="se" - percentF=`bash ${script_inferMeta} sef ${repRID}.infer_experiment.txt` - percentR=`bash ${script_inferMeta} ser ${repRID}.infer_experiment.txt` - fi - echo -e "LOG: percentage reads in the same direction as gene: \${percentF}" >> ${repRID}.inferMetadata.log - echo -e "LOG: percentage reads in the 
opposite direction as gene: \${percentR}" >> ${repRID}.inferMetadata.log - if [ 1 -eq \$(echo \$(expr \${percentF#*.} ">" 2500)) ] && [ 1 -eq \$(echo \$(expr \${percentR#*.} "<" 2500)) ] - then - stranded="forward" - elif [ 1 -eq \$(echo \$(expr \${percentR#*.} ">" 2500)) ] && [ 1 -eq \$(echo \$(expr \${percentF#*.} "<" 2500)) ] - then - stranded="reverse" - else - stranded="unstranded" - fi - echo -e "LOG: stradedness set to: \${stranded}" >> ${repRID}.inferMetadata.log - if [ "${strandedForce}" != "" ] - then - stranded=${strandedForce} - echo -e "LOG: spike-in metadata forced: \${stranded}" >> ${repRID}.inferMetadata.log - fi + # convert the sampled read output sam file to a sorted bam file using Samtools + echo -e "LOG: converting sampled reads from sam to bam" >> ${repRID}.alignSampleData.log + samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o sampled.bam sampled.sam + + echo "\${align}" > align.csv + """ +} + +// Extract % aligned to appropriate reference into channel +alignInfer = Channel.create() +align_fl.splitCsv(sep: ",", header: false).separate( + alignInfer +) + +/* + * inferMetadata: infers strandedness and endness from the aligned downsampled reads + */ +process inferMetadata { + tag "${repRID}" + + input: + path sampledBam + path reference_inferMetadata + path script_inferMeta + val endsForce + val strandedForce + val fastqCountError from fastqCountError_inferMetadata + val fastqReadError from fastqReadError_inferMetadata + val fastqFileError from fastqFileError_inferMetadata + val seqtypeError from seqtypeError_inferMetadata + val speciesErrorSeqwho from speciesErrorSeqwho_inferMetadata + val speciesError from speciesError_inferMetadata + + output: + path "infer.csv" into inferMetadata_fl + path "${repRID}.infer_experiment.txt" into inferExperiment + + when: + fastqCountError == "false" && fastqReadError == "false" && fastqFileError == "false" && seqtypeError == "false" && speciesErrorSeqwho == "false" && speciesError == "false" + + script: + """ + hostname > ${repRID}.inferMetadata.log + ulimit -a >> ${repRID}.inferMetadata.log + + # infer experimental setting from dedup bam + echo -e "LOG: infer experimental setting from bam" >> ${repRID}.inferMetadata.log + infer_experiment.py -r ./genome.bed -i ${sampledBam} 1>> ${repRID}.infer_experiment.txt + echo -e "LOG: inferred" >> ${repRID}.inferMetadata.log + + ended=`bash ${script_inferMeta} endness ${repRID}.infer_experiment.txt` + fail=`bash ${script_inferMeta} fail ${repRID}.infer_experiment.txt` + if [ \${ended} == "PairEnd" ] + then + ends="pe" + percentF=`bash ${script_inferMeta} pef ${repRID}.infer_experiment.txt` + percentR=`bash ${script_inferMeta} per ${repRID}.infer_experiment.txt` + elif [ \${ended} == "SingleEnd" ] + then + ends="se" + percentF=`bash ${script_inferMeta} sef ${repRID}.infer_experiment.txt` + percentR=`bash ${script_inferMeta} ser ${repRID}.infer_experiment.txt` + fi + echo -e "LOG: percentage reads in the same direction as gene: \${percentF}" >> ${repRID}.inferMetadata.log + echo -e "LOG: percentage reads in the opposite direction as gene: \${percentR}" >> ${repRID}.inferMetadata.log + if [ 1 -eq \$(echo \$(expr \${percentF#*.} ">" 2500)) ] && [ 1 -eq \$(echo \$(expr \${percentR#*.} "<" 2500)) ] + then + stranded="forward" + elif [ 1 -eq \$(echo \$(expr \${percentR#*.} ">" 2500)) ] && [ 1 -eq \$(echo \$(expr \${percentF#*.} "<" 2500)) ] + then + stranded="reverse" else - ends="" - stranded="" - spike="" - species="" - percentF="" - percentR="" - fail="" - touch ${repRID}.infer_experiment.txt + 
stranded="unstranded" + fi + echo -e "LOG: ends set to: \${ends}" >> ${repRID}.inferMetadata.log + if [ "${endsForce}" != "" ] + then + ends=${endsForce} + echo -e "LOG: ends metadata forced: \${ends}" >> ${repRID}.inferMetadata.log + fi + echo -e "LOG: stradedness set to: \${stranded}" >> ${repRID}.inferMetadata.log + if [ "${strandedForce}" != "" ] + then + stranded=${strandedForce} + echo -e "LOG: spike-in metadata forced: \${stranded}" >> ${repRID}.inferMetadata.log fi # write inferred metadata to file - echo "\${ends},\${stranded},\${spike},\${species},\${align_ercc},\${align_hu},\${align_mo},\${percentF},\${percentR},\${fail}" > infer.csv - - # save species error file - echo "\${speciesError},\${speciesError_details}" > speciesError.csv + echo "\${ends},\${stranded},\${percentF},\${percentR},\${fail}" > infer.csv """ } -// Split metadata into separate channels +// Extract metadata and replicate them for multiple process inputs endsInfer = Channel.create() strandedInfer = Channel.create() -spikeInfer = Channel.create() -speciesInfer = Channel.create() -align_erccInfer = Channel.create() -align_huInfer = Channel.create() -align_moInfer = Channel.create() percentFInfer = Channel.create() percentRInfer = Channel.create() failInfer = Channel.create() inferMetadata_fl.splitCsv(sep: ",", header: false).separate( endsInfer, strandedInfer, - spikeInfer, - speciesInfer, - align_erccInfer, - align_huInfer, - align_moInfer, percentFInfer, percentRInfer, failInfer ) - -// Replicate metadata for multiple process inputs endsInfer.into { endsInfer_checkMetadata endsInfer_alignData @@ -1123,52 +1573,10 @@ strandedInfer.into { strandedInfer_uploadQC strandedInfer_failExecutionRun } -spikeInfer.into{ - spikeInfer_checkMetadata - spikeInfer_getRef - spikeInfer_aggrQC - spikeInfer_uploadExecutionRun - spikeInfer_failExecutionRun -} -speciesInfer.into { - speciesInfer_checkMetadata - speciesInfer_getRef - speciesInfer_aggrQC - speciesInfer_uploadExecutionRun - speciesInfer_uploadProcessedFile - speciesInfer_failExecutionRun -} - -// Split species count error into separate channel -speciesError = Channel.create() -speciesError_details = Channel.create() -speciesError_fl.splitCsv(sep: ",", header: false).separate( - speciesError, - speciesError_details -) - -// Replicate errors for multiple process inputs -speciesError.into { - speciesError_checkMetadata - speciesError_uploadExecutionRun - speciesError_getRef - speciesError_alignData - speciesError_dedupData - speciesError_makeBigWig - speciesError_countData - speciesError_fastqc - speciesError_dataQC - speciesError_aggrQC - speciesError_uploadQC - speciesError_uploadQC_fail - speciesError_uploadProcessedFile - speciesError_uploadOutputBag - speciesError_failPreExecutionRun_species -} /* - * checkMetadata: checks the submitted metada against inferred -*/ + * checkMetadata: checks the submitted metadata against inferred + */ process checkMetadata { tag "${repRID}" @@ -1181,20 +1589,19 @@ process checkMetadata { val strandedInfer from strandedInfer_checkMetadata val spikeInfer from spikeInfer_checkMetadata val speciesInfer from speciesInfer_checkMetadata - val fastqCountError_checkMetadata - val fastqReadError_checkMetadata - val fastqFileError_checkMetadata - val speciesError_checkMetadata + val fastqCountError from fastqCountError_checkMetadata + val fastqReadError from fastqReadError_checkMetadata + val fastqFileError from fastqFileError_checkMetadata + val seqtypeError from seqtypeError_checkMetadata + val speciesErrorSeqwho from 
speciesErrorSeqwho_checkMetadata + val speciesError from speciesError_checkMetadata output: path ("check.csv") into checkMetadata_fl path ("outputBagRID.csv") optional true into outputBagRID_fl_dummy when: - fastqCountError_checkMetadata == "false" - fastqReadError_checkMetadata == "false" - fastqFileError_checkMetadata == "false" - speciesError_checkMetadata == "false" + fastqCountError == "false" && fastqReadError == "false" && fastqFileError == "false" && seqtypeError == "false" && speciesErrorSeqwho == "false" && speciesError == "false" script: """ @@ -1238,9 +1645,16 @@ process checkMetadata { fi if [ "${endsMeta}" != "${endsInfer}" ] then - pipelineError=true - pipelineError_ends=true - echo -e "LOG: ends do not match: Submitted=${endsMeta}; Inferred=${endsInfer}" >> ${repRID}.checkMetadata.log + if [ "${params.endsForce}" != "" ] + then + pipelineError=false + pipelineError_ends=false + echo -e "LOG: ends forced: Submitted=${endsMeta}; Inferred=${endsInfer}" >> ${repRID}.checkMetadata.log + else + pipelineError=true + pipelineError_ends=true + echo -e "LOG: ends do not match: Submitted=${endsMeta}; Inferred=${endsInfer}" >> ${repRID}.checkMetadata.log + fi else pipelineError_ends=false echo -e "LOG: ends matches: Submitted=${endsMeta}; Inferred=${endsInfer}" >> ${repRID}.checkMetadata.log @@ -1260,339 +1674,48 @@ process checkMetadata { pipelineError_spike=false echo -e "LOG: spike matches: Submitted=${spikeMeta}; Inferred=${spikeInfer}" >> ${repRID}.checkMetadata.log fi - if [ "${speciesMeta}" != "${speciesInfer}" ] - then - if [[ "${params.speciesForce}" != "" ]] - then - pipelineError_species=false - echo -e "LOG: species forced: Submitted=${speciesMeta}; Inferred=${speciesInfer}" >> ${repRID}.checkMetadata.log - else - pipelineError=true - pipelineError_species=true - echo -e "LOG: species does not match: Submitted=${speciesMeta}; Inferred=${speciesInfer}" >> ${repRID}.checkMetadata.log - fi - else - pipelineError_species=false - echo -e "LOG: species matches: Submitted=${speciesMeta}; Inferred=${speciesInfer}" >> ${repRID}.checkMetadata.log - fi - - # create dummy output bag rid if failure - if [ \${pipelineError} == true ] - then - echo "fail" > outputBagRID.csv - fi - - # write checks to file - echo "\${pipelineError},\${pipelineError_ends},\${pipelineError_stranded},\${pipelineError_spike},\${pipelineError_species}" > check.csv - """ -} - -// Split errors into separate channels -pipelineError = Channel.create() -pipelineError_ends = Channel.create() -pipelineError_stranded = Channel.create() -pipelineError_spike = Channel.create() -pipelineError_species = Channel.create() -checkMetadata_fl.splitCsv(sep: ",", header: false).separate( - pipelineError, - pipelineError_ends, - pipelineError_stranded, - pipelineError_spike, - pipelineError_species -) - -// Replicate errors for multiple process inputs -pipelineError.into { - pipelineError_getRef - pipelineError_alignData - pipelineError_dedupData - pipelineError_makeBigWig - pipelineError_countData - pipelineError_fastqc - pipelineError_dataQC - pipelineError_aggrQC - pipelineError_uploadQC - pipelineError_uploadQC_fail - pipelineError_uploadProcessedFile - pipelineError_uploadOutputBag - pipelineError_failExecutionRun -} - -/* - * uploadInputBag: uploads the input bag -*/ -process uploadInputBag { - tag "${repRID}" - - input: - path script_uploadInputBag - path credential, stageAs: "credential.json" from deriva_uploadInputBag - path inputBag from inputBag_uploadInputBag - val studyRID from studyRID_uploadInputBag - - output: - 
path ("inputBagRID.csv") into inputBagRID_fl - - when: - upload - - script: - """ - hostname > ${repRID}.uploadInputBag.log - ulimit -a >> ${repRID}.uploadInputBag.log - - yr=\$(date +'%Y') - mn=\$(date +'%m') - dy=\$(date +'%d') - - file=\$(basename -a ${inputBag}) - md5=\$(md5sum ./\${file} | awk '{ print \$1 }') - echo LOG: ${repRID} input bag md5 sum - \${md5} >> ${repRID}.uploadInputBag.log - size=\$(wc -c < ./\${file}) - echo LOG: ${repRID} input bag size - \${size} bytes >> ${repRID}.uploadInputBag.log - - exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Input_Bag/File_MD5=\${md5}) - if [ "\${exist}" == "[]" ] - then - cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') - cookie=\${cookie:11:-1} - - loc=\$(deriva-hatrac-cli --host ${source} put ./\${file} /hatrac/resources/rnaseq/pipeline/input_bag/study/${studyRID}/replicate/${repRID}/\${file} --parents) - inputBag_rid=\$(python3 ${script_uploadInputBag} -f \${file} -l \${loc} -s \${md5} -b \${size} -o ${source} -c \${cookie}) - echo LOG: input bag RID uploaded - \${inputBag_rid} >> ${repRID}.uploadInputBag.log - rid=\${inputBag_rid} - else - exist=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') - exist=\${exist:7:-6} - echo LOG: input bag RID already exists - \${exist} >> ${repRID}.uploadInputBag.log - rid=\${exist} - fi - - echo "\${rid}" > inputBagRID.csv - """ -} - -// Extract input bag RID into channel -inputBagRID = Channel.create() -inputBagRID_fl.splitCsv(sep: ",", header: false).separate( - inputBagRID -) - -// Replicate input bag RID for multiple process inputs -inputBagRID.into { - inputBagRID_uploadExecutionRun - inputBagRID_finalizeExecutionRun - inputBagRID_failPreExecutionRun - inputBagRID_failExecutionRun -} - -/* - * uploadExecutionRun: uploads the execution run -*/ -process uploadExecutionRun { - tag "${repRID}" - - input: - path script_uploadExecutionRun_uploadExecutionRun - path credential, stageAs: "credential.json" from deriva_uploadExecutionRun - val spike from spikeInfer_uploadExecutionRun - val species from speciesInfer_uploadExecutionRun - val inputBagRID from inputBagRID_uploadExecutionRun - val fastqCountError_uploadExecutionRun - val fastqReadError_uploadExecutionRun - val fastqFileError_uploadExecutionRun - val speciesError_uploadExecutionRun - - output: - path ("executionRunRID.csv") into executionRunRID_fl - - when: - upload - fastqCountError_uploadExecutionRun == "false" - fastqReadError_uploadExecutionRun == "false" - fastqFileError_uploadExecutionRun == "false" - speciesError_uploadExecutionRun == "false" - - script: - """ - hostname > ${repRID}.uploadExecutionRun.log - ulimit -a >> ${repRID}.uploadExecutionRun.log - - echo LOG: searching for workflow RID - BICF mRNA ${workflow.manifest.version} >> ${repRID}.uploadExecutionRun.log - workflow=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Workflow/Name=BICF%20mRNA%20Replicate/Version=${workflow.manifest.version}) - workflow=\$(echo \${workflow} | grep -o '\\"RID\\":\\".*\\",\\"RCT') - workflow=\${workflow:7:-6} - echo LOG: workflow RID extracted - \${workflow} >> ${repRID}.uploadExecutionRun.log - - if [ "${species}" == "Homo sapiens" ] - then - genomeName=\$(echo GRCh${refHuVersion}) - elif [ "${species}" == "Mus musculus" ] - then - genomeName=\$(echo GRCm${refMoVersion}) - fi - if [ "${spike}" == "true" ] - then - genomeName=\$(echo \${genomeName}-S) - fi - echo LOG: searching for genome name - \${genomeName} >> ${repRID}.uploadExecutionRun.log - 
genome=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Name=\${genomeName}) - genome=\$(echo \${genome} | grep -o '\\"RID\\":\\".*\\",\\"RCT') - genome=\${genome:7:-6} - echo LOG: genome RID extracted - \${genome} >> ${repRID}.uploadExecutionRun.log - - cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') - cookie=\${cookie:11:-1} - - exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=\${workflow}/Replicate=${repRID}/Input_Bag=${inputBagRID}) - echo \${exist} >> ${repRID}.uploadExecutionRun.log - if [ "\${exist}" == "[]" ] - then - executionRun_rid=\$(python3 ${script_uploadExecutionRun_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s In-progress -d 'Run in process' -o ${source} -c \${cookie} -u F) - echo LOG: execution run RID uploaded - \${executionRun_rid} >> ${repRID}.uploadExecutionRun.log - else - rid=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') - rid=\${rid:7:-6} - echo \${rid} >> ${repRID}.uploadExecutionRun.log - executionRun_rid=\$(python3 ${script_uploadExecutionRun_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s In-progress -d 'Run in process' -o ${source} -c \${cookie} -u \${rid}) - echo LOG: execution run RID updated - \${executionRun_rid} >> ${repRID}.uploadExecutionRun.log - fi - - echo "\${executionRun_rid}" > executionRunRID.csv - - if [ ${params.track} == true ] - then - curl -H 'Content-Type: application/json' -X PUT -d \ - '{ \ - "ID": "${workflow.sessionId}", \ - "ExecutionRunRID": "'\${executionRun_rid}'" \ - }' \ - "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track" - fi - """ -} - -// Extract execution run RID into channel -executionRunRID = Channel.create() -executionRunRID_fl.splitCsv(sep: ",", header: false).separate( - executionRunRID -) - -// Replicate execution run RID for multiple process inputs -executionRunRID.into { - executionRunRID_uploadQC - executionRunRID_uploadProcessedFile - executionRunRID_uploadOutputBag - executionRunRID_finalizeExecutionRun - executionRunRID_failExecutionRun - executionRunRID_fail -} - -/* - * getRef: downloads appropriate reference -*/ -process getRef { - tag "${species}" - - input: - path script_refData - path credential, stageAs: "credential.json" from deriva_getRef - val spike from spikeInfer_getRef - val species from speciesInfer_getRef - val fastqCountError_getRef - val fastqReadError_getRef - val fastqFileError_getRef - val speciesError_getRef - val pipelineError_getRef - - output: - tuple path ("hisat2", type: 'dir'), path ("*.bed"), path ("*.fna"), path ("*.gtf"), path ("geneID.tsv"), path ("Entrez.tsv") into reference - - when: - fastqCountError_getRef == "false" - fastqReadError_getRef == "false" - fastqFileError_getRef == "false" - speciesError_getRef == "false" - pipelineError_getRef == "false" - - script: - """ - hostname > ${repRID}.getRef.log - ulimit -a >> ${repRID}.getRef.log - - # link credential file for authentication - echo -e "LOG: linking deriva credentials" >> ${repRID}.getRef.log - mkdir -p ~/.deriva - ln -sf `readlink -e credential.json` ~/.deriva/credential.json - echo -e "LOG: linked" >> ${repRID}.getRef.log - - # set the reference name - if [ "${species}" == "Mus musculus" ] - then - reference=\$(echo ${referenceBase}/GRCm${refMoVersion}) - refName=GRCm - elif [ '${species}' == "Homo sapiens" ] - then - reference=\$(echo ${referenceBase}/GRCh${refHuVersion}) - refName=GRCh - else - 
echo -e "LOG: ERROR - References could not be set!\nSpecies reference found: ${species}" >> ${repRID}.getRef.log - exit 1 - fi - if [ "${spike}" == "true" ] - then - reference=\$(echo \${reference}-S) - elif [ "${spike}" == "false" ] - then - reference=\$(echo \${reference}) - fi - echo -e "LOG: species set to \${reference}" >> ${repRID}.getRef.log - - # retreive appropriate reference appropriate location - echo -e "LOG: fetching ${species} reference files from ${referenceBase}" >> ${repRID}.getRef.log - if [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references/new" ] - then - echo -e "LOG: grabbing reference files from local (BioHPC)" >> ${repRID}.getRef.log - unzip \${reference}.zip - mv \$(basename \${reference})/data/* . - elif [ arams.refSource == "datahub" ] + + # create dummy output bag rid if failure + if [ \${pipelineError} == true ] then - echo -e "LOG: grabbing reference files from datahub" >> ${repRID}.getRef.log - GRCv=\$(echo \${reference} | grep -o \${refName}.* | cut -d '.' -f1) - GRCp=\$(echo \${reference} | grep -o \${refName}.* | cut -d '.' -f2) - GENCODE=\$(echo \${reference} | grep -o \${refName}.* | cut -d '.' -f3) - query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='\${GRCv}'.'\${GRCp}'/Annotation_Version=GENCODE%20'\${GENCODE}) - curl --request GET \${query} > refQuery.json - refURL=\$(python ${script_refData} --returnParam URL) - loc=\$(dirname \${refURL}) - fName=\$(python ${script_refData} --returnParam fName) - fName=\${fName%.*} - if [ "\${loc}" = "/hatrac/*" ]; then echo "LOG: Reference not present in hatrac"; exit 1; fi - filename=\$(echo \$(basename \${refURL}) | grep -oP '.*(?=:)') - deriva-hatrac-cli --host ${referenceBase} get \${refURL} - unzip \$(basename \${refURL}) - mv \${fName}/data/* . + echo "fail" > outputBagRID.csv fi - echo -e "LOG: fetched" >> ${repRID}.getRef.log - mv ./annotation/genome.gtf . - mv ./sequence/genome.fna . - mv ./annotation/genome.bed . - mv ./metadata/Entrez.tsv . - mv ./metadata/geneID.tsv . 
+ # write checks to file + echo "\${pipelineError},\${pipelineError_ends},\${pipelineError_stranded},\${pipelineError_spike},\${pipelineError_species}" > check.csv """ } -// Replicate reference for multiple process inputs -reference.into { - reference_alignData - reference_countData - reference_dataQC +// Split errors into separate channels and replicate them for multiple process inputs +pipelineError = Channel.create() +pipelineError_ends = Channel.create() +pipelineError_stranded = Channel.create() +pipelineError_spike = Channel.create() +pipelineError_species = Channel.create() +checkMetadata_fl.splitCsv(sep: ",", header: false).separate( + pipelineError, + pipelineError_ends, + pipelineError_stranded, + pipelineError_spike, + pipelineError_species +) +pipelineError.into { + pipelineError_dedupData + pipelineError_makeBigWig + pipelineError_countData + pipelineError_dataQC + pipelineError_aggrQC + pipelineError_uploadQC + pipelineError_uploadProcessedFile + pipelineError_uploadOutputBag + pipelineError_failExecutionRun + pipelineError_finalizeExecutionRun + pipelineError_uploadQC_fail } /* - * alignData: aligns the reads to a reference database -*/ + * alignData: aligns the reads to the appripriate species reference + */ process alignData { tag "${repRID}" @@ -1601,27 +1724,24 @@ process alignData { path reference_alignData val ends from endsInfer_alignData val stranded from strandedInfer_alignData - val fastqCountError_alignData - val fastqReadError_alignData - val fastqFileError_alignData - val speciesError_alignData - val pipelineError_alignData + val fastqCountError from fastqCountError_alignData + val fastqReadError from fastqReadError_alignData + val fastqFileError from fastqFileError_alignData + val seqtypeError from seqtypeError_alignData + val speciesErrorSeqwho from speciesErrorSeqwho_alignData + val speciesError from speciesError_alignData output: tuple path ("${repRID}.sorted.bam"), path ("${repRID}.sorted.bam.bai") into rawBam path ("*.alignSummary.txt") into alignQC when: - fastqCountError_alignData == "false" - fastqReadError_alignData == "false" - fastqFileError_alignData == "false" - speciesError_alignData == "false" - pipelineError_alignData == "false" + fastqCountError == "false" && fastqReadError == "false" && fastqFileError == "false" && seqtypeError == "false" && speciesErrorSeqwho == "false" && speciesError == "false" script: """ - hostname > ${repRID}.align.log - ulimit -a >> ${repRID}.align.log + hostname > ${repRID}.alignData.log + ulimit -a >> ${repRID}.alignData.log # set stranded param for hisat2 if [ "${stranded}"=="unstranded" ] @@ -1642,7 +1762,7 @@ process alignData { fi # align the reads with Hisat2 - echo -e "LOG: aligning ${ends}" >> ${repRID}.align.log + echo -e "LOG: aligning ${ends}" >> ${repRID}.alignData.log if [ "${ends}" == "se" ] then hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome \${strandedParam} -U ${fastq[0]} --summary-file ${repRID}.alignSummary.txt --new-summary @@ -1650,44 +1770,41 @@ process alignData { then hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome \${strandedParam} --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} --summary-file ${repRID}.alignSummary.txt --new-summary fi - echo -e "LOG: alignined" >> ${repRID}.align.log + echo -e "LOG: alignined" >> ${repRID}.alignData.log # convert the output sam file to a sorted bam file using Samtools - echo -e "LOG: converting from sam to bam" >> ${repRID}.align.log + echo -e "LOG: converting 
from sam to bam" >> ${repRID}.alignData.log samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam # sort the bam file using Samtools - echo -e "LOG: sorting the bam file" >> ${repRID}.align.log + echo -e "LOG: sorting the bam file" >> ${repRID}.alignData.log proc=\$(expr `nproc` - 1) mem=\$(vmstat -s -S K | grep 'total memory' | grep -o '[0-9]*') mem=\$(expr \${mem} / \${proc} \\* 75 / 100) samtools sort -@ \${proc} -m \${mem}K -O BAM -o ${repRID}.sorted.bam ${repRID}.bam # index the sorted bam using Samtools - echo -e "LOG: indexing sorted bam file" >> ${repRID}.align.log + echo -e "LOG: indexing sorted bam file" >> ${repRID}.alignData.log samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bam.bai """ } -// Replicate rawBam for multiple process inputs -rawBam.set { - rawBam_dedupData -} - /* - *dedupData: mark the duplicate reads, specifically focused on PCR or optical duplicates -*/ + * dedupData: mark the duplicate reads, specifically focused on PCR or optical duplicates + */ process dedupData { tag "${repRID}" - publishDir "${outDir}/bam", mode: 'copy', pattern: "*.deduped.bam" + publishDir "${outDir}/bam", mode: 'copy', pattern: "*.deduped.{bam,bai}" input: - tuple path (bam), path (bai) from rawBam_dedupData - val fastqCountError_dedupData - val fastqReadError_dedupData - val fastqFileError_dedupData - val speciesError_dedupData - val pipelineError_dedupData + tuple path (bam), path (bai) from rawBam + val fastqCountError from fastqCountError_dedupData + val fastqReadError from fastqReadError_dedupData + val fastqFileError from fastqFileError_dedupData + val seqtypeError from seqtypeError_dedupData + val speciesErrorSeqwho from speciesErrorSeqwho_dedupData + val speciesError from speciesError_dedupData + val pipelineError from pipelineError_dedupData output: tuple path ("${repRID}_sorted.deduped.bam"), path ("${repRID}_sorted.deduped.bam.bai") into dedupBam @@ -1695,11 +1812,7 @@ process dedupData { path ("*.deduped.Metrics.txt") into dedupQC when: - fastqCountError_dedupData == 'false' - fastqReadError_dedupData == 'false' - fastqFileError_dedupData == 'false' - speciesError_dedupData == 'false' - pipelineError_dedupData == 'false' + fastqCountError == "false" && fastqReadError == "false" && fastqFileError == "false" && seqtypeError == "false" && speciesErrorSeqwho == "false" && speciesError == "false" && pipelineError == "false" script: """ @@ -1736,29 +1849,27 @@ dedupBam.into { } /* - *makeBigWig: make BigWig files for output -*/ + * makeBigWig: make BigWig files for output + */ process makeBigWig { tag "${repRID}" - publishDir "${outDir}/bigwig", mode: 'copy', pattern: "${repRID}.bw" + publishDir "${outDir}/bigwig", mode: 'copy', pattern: "${repRID}_sorted.deduped.bw" input: tuple path (bam), path (bai) from dedupBam_makeBigWig - val fastqCountError_makeBigWig - val fastqReadError_makeBigWig - val fastqFileError_makeBigWig - val speciesError_makeBigWig - val pipelineError_makeBigWig + val fastqCountError from fastqCountError_makeBigWig + val fastqReadError from fastqReadError_makeBigWig + val fastqFileError from fastqFileError_makeBigWig + val seqtypeError from seqtypeError_makeBigWig + val speciesErrorSeqwho from speciesErrorSeqwho_makeBigWig + val speciesError from speciesError_makeBigWig + val pipelineError from pipelineError_makeBigWig output: path ("${repRID}_sorted.deduped.bw") into bigwig when: - fastqCountError_makeBigWig == 'false' - fastqReadError_makeBigWig == 'false' - fastqFileError_makeBigWig == 'false' - 
speciesError_makeBigWig == 'false' - pipelineError_makeBigWig == 'false' + fastqCountError == "false" && fastqReadError == "false" && fastqFileError == "false" && seqtypeError == "false" && speciesErrorSeqwho == "false" && speciesError == "false" && pipelineError == "false" script: """ @@ -1773,8 +1884,8 @@ process makeBigWig { } /* - *countData: count data and calculate tpm -*/ + * countData: count data and calculate tpm + */ process countData { tag "${repRID}" publishDir "${outDir}/count", mode: 'copy', pattern: "${repRID}*_tpmTable.csv" @@ -1786,11 +1897,13 @@ process countData { path ref from reference_countData val ends from endsInfer_countData val stranded from strandedInfer_countData - val fastqCountError_countData - val fastqReadError_countData - val fastqFileError_countData - val speciesError_countData - val pipelineError_countData + val fastqCountError from fastqCountError_countData + val fastqReadError from fastqReadError_countData + val fastqFileError from fastqFileError_countData + val seqtypeError from seqtypeError_countData + val speciesErrorSeqwho from speciesErrorSeqwho_countData + val speciesError from speciesError_countData + val pipelineError from pipelineError_countData output: path ("*_tpmTable.csv") into counts @@ -1798,11 +1911,7 @@ process countData { path ("assignedReads.csv") into assignedReadsInfer_fl when: - fastqCountError_countData == 'false' - fastqReadError_countData == 'false' - fastqFileError_countData == 'false' - speciesError_countData == 'false' - pipelineError_countData == 'false' + fastqCountError == "false" && fastqReadError == "false" && fastqFileError == "false" && seqtypeError == "false" && speciesErrorSeqwho == "false" && speciesError == "false" && pipelineError == "false" script: """ @@ -1849,21 +1958,19 @@ process countData { """ } -// Extract number of assigned reads metadata into channel +// Extract number of assigned reads metadata into channel and replicate them for multiple process inputs assignedReadsInfer = Channel.create() assignedReadsInfer_fl.splitCsv(sep: ",", header: false).separate( assignedReadsInfer ) - -// Replicate inferred assigned reads for multiple process inputs assignedReadsInfer.into { assignedReadsInfer_aggrQC assignedReadsInfer_uploadQC } /* - *dataQC: calculate transcript integrity numbers (TIN) and bin as well as calculate innerdistance of PE replicates -*/ + * dataQC: calculate transcript integrity numbers (TIN) and bin as well as calculate innerdistance of PE replicates + */ process dataQC { tag "${repRID}" @@ -1873,11 +1980,13 @@ process dataQC { tuple path (bam), path (bai) from dedupBam_dataQC tuple path (chrBam), path (chrBai) from dedupChrBam val ends from endsInfer_dataQC - val fastqCountError_dataQC - val fastqReadError_dataQC - val fastqFileError_dataQC - val speciesError_dataQC - val pipelineError_dataQC + val fastqCountError from fastqCountError_dataQC + val fastqReadError from fastqReadError_dataQC + val fastqFileError from fastqFileError_dataQC + val seqtypeError from seqtypeError_dataQC + val speciesErrorSeqwho from speciesErrorSeqwho_dataQC + val speciesError from speciesError_dataQC + val pipelineError from pipelineError_dataQC output: path "${repRID}_tin.hist.tsv" into tinHist @@ -1885,11 +1994,7 @@ process dataQC { path "${repRID}_insertSize.inner_distance_freq.txt" into innerDistance when: - fastqCountError_dataQC == 'false' - fastqReadError_dataQC == 'false' - fastqFileError_dataQC == 'false' - speciesError_dataQC == 'false' - pipelineError_dataQC == 'false' + fastqCountError == "false" && 
fastqReadError == "false" && fastqFileError == "false" && seqtypeError == "false" && speciesErrorSeqwho == "false" && speciesError == "false" && pipelineError == "false" script: """ @@ -1921,21 +2026,19 @@ process dataQC { """ } -// Extract median TIN metadata into channel +// Extract median TIN metadata into channel and replicate them for multiple process inputs tinMedInfer = Channel.create() tinMedInfer_fl.splitCsv(sep: ",", header: false).separate( tinMedInfer ) - -// Replicate inferred median TIN for multiple process inputs tinMedInfer.into { tinMedInfer_aggrQC tinMedInfer_uploadQC } /* - *aggrQC: aggregate QC from processes as well as metadata and run MultiQC -*/ + * aggrQC: aggregate QC from processes as well as metadata and run MultiQC + */ process aggrQC { tag "${repRID}" publishDir "${outDir}/report", mode: 'copy', pattern: "${repRID}.multiqc.html" @@ -1944,6 +2047,7 @@ process aggrQC { input: path multiqcConfig path bicfLogo + path seqwhoInfer path softwareReferences path softwareVersions path fastqc @@ -1953,7 +2057,8 @@ process aggrQC { path countsQC path innerDistance path tinHist - path alignSampleQCs from alignSampleQC_aggrQC.collect() + path alignSampleQC_ERCC from alignSampleQC_ERCC + path alignSampleQC from alignSampleQC path inferExperiment val endsManual from endsManual_aggrQC val endsM from endsMeta_aggrQC @@ -1971,22 +2076,20 @@ process aggrQC { val tinMedI from tinMedInfer_aggrQC val studyRID from studyRID_aggrQC val expRID from expRID_aggrQC - val fastqCountError_aggrQC - val fastqReadError_aggrQC - val fastqFileError_aggrQC - val speciesError_aggrQC - val pipelineError_aggrQC + val fastqCountError from fastqCountError_aggrQC + val fastqReadError from fastqReadError_aggrQC + val fastqFileError from fastqFileError_aggrQC + val seqtypeError from seqtypeError_aggrQC + val speciesErrorSeqwho from speciesErrorSeqwho_aggrQC + val speciesError from speciesError_aggrQC + val pipelineError from pipelineError_aggrQC output: path "${repRID}.multiqc.html" into multiqc path "${repRID}.multiqc_data.json" into multiqcJSON when: - fastqCountError_aggrQC == 'false' - fastqReadError_aggrQC == 'false' - fastqFileError_aggrQC == 'false' - speciesError_aggrQC == 'false' - pipelineError_aggrQC == 'false' + fastqCountError == "false" && fastqReadError == "false" && fastqFileError == "false" && seqtypeError == "false" && speciesErrorSeqwho == "false" && speciesError == "false" && pipelineError == "false" script: """ @@ -2082,9 +2185,186 @@ process aggrQC { """ } +/* + * uploadInputBag: uploads the input bag + */ +process uploadInputBag { + tag "${repRID}" + + input: + path script_uploadInputBag + path credential, stageAs: "credential.json" from deriva_uploadInputBag + path inputBag from inputBag_uploadInputBag + val studyRID from studyRID_uploadInputBag + + output: + path ("inputBagRID.csv") into inputBagRID_fl + + when: + upload + + script: + """ + hostname > ${repRID}.uploadInputBag.log + ulimit -a >> ${repRID}.uploadInputBag.log + + # link credential file for authentication + echo -e "LOG: linking deriva credentials" >> ${repRID}.uploadInputBag.log + mkdir -p ~/.deriva + ln -sf `readlink -e credential.json` ~/.deriva/credential.json + echo -e "LOG: linked" >> ${repRID}.uploadInputBag.log + + yr=\$(date +'%Y') + mn=\$(date +'%m') + dy=\$(date +'%d') + + file=\$(basename -a ${inputBag}) + md5=\$(md5sum ./\${file} | awk '{ print \$1 }') + echo LOG: ${repRID} input bag md5 sum - \${md5} >> ${repRID}.uploadInputBag.log + size=\$(wc -c < ./\${file}) + echo LOG: ${repRID} input bag size - 
\${size} bytes >> ${repRID}.uploadInputBag.log + + exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Input_Bag/File_MD5=\${md5}) + if [ "\${exist}" == "[]" ] + then + cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') + cookie=\${cookie:11:-1} + + loc=\$(deriva-hatrac-cli --host ${source} put ./\${file} /hatrac/resources/rnaseq/pipeline/input_bag/study/${studyRID}/replicate/${repRID}/\${file} --parents) + inputBag_rid=\$(python3 ${script_uploadInputBag} -f \${file} -l \${loc} -s \${md5} -b \${size} -o ${source} -c \${cookie}) + echo LOG: input bag RID uploaded - \${inputBag_rid} >> ${repRID}.uploadInputBag.log + rid=\${inputBag_rid} + else + exist=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + exist=\${exist:7:-6} + echo LOG: input bag RID already exists - \${exist} >> ${repRID}.uploadInputBag.log + rid=\${exist} + fi + + echo "\${rid}" > inputBagRID.csv + """ +} + +// Extract input bag RID into channel and replicate them for multiple process inputs +inputBagRID = Channel.create() +inputBagRID_fl.splitCsv(sep: ",", header: false).separate( + inputBagRID +) +inputBagRID.into { + inputBagRID_uploadExecutionRun + inputBagRID_finalizeExecutionRun + inputBagRID_failPreExecutionRun + inputBagRID_failExecutionRun +} + +/* + * uploadExecutionRun: uploads the execution run + */ +process uploadExecutionRun { + tag "${repRID}" + + input: + path script_uploadExecutionRun_uploadExecutionRun + path credential, stageAs: "credential.json" from deriva_uploadExecutionRun + val spike from spikeInfer_uploadExecutionRun + val species from speciesInfer_uploadExecutionRun + val inputBagRID from inputBagRID_uploadExecutionRun + val fastqCountError from fastqCountError_uploadExecutionRun + val fastqReadError from fastqReadError_uploadExecutionRun + val fastqFileError from fastqFileError_uploadExecutionRun + val seqtypeError from seqtypeError_uploadExecutionRun + val speciesErrorSeqwho from speciesErrorSeqwho_uploadExecutionRun + val speciesError from speciesError_uploadExecutionRun + + output: + path ("executionRunRID.csv") into executionRunRID_fl + + when: + upload + fastqCountError == "false" && fastqReadError == "false" && fastqFileError == "false" && seqtypeError == "false" && speciesErrorSeqwho == "false" && speciesError == "false" + + script: + """ + hostname > ${repRID}.uploadExecutionRun.log + ulimit -a >> ${repRID}.uploadExecutionRun.log + + # link credential file for authentication + echo -e "LOG: linking deriva credentials" >> ${repRID}.uploadExecutionRun.log + mkdir -p ~/.deriva + ln -sf `readlink -e credential.json` ~/.deriva/credential.json + echo -e "LOG: linked" >> ${repRID}.uploadExecutionRun.log + + echo LOG: searching for workflow RID - BICF mRNA ${workflow.manifest.version} >> ${repRID}.uploadExecutionRun.log + workflow=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Workflow/Name=BICF%20mRNA%20Replicate/Version=${workflow.manifest.version}) + workflow=\$(echo \${workflow} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + workflow=\${workflow:7:-6} + echo LOG: workflow RID extracted - \${workflow} >> ${repRID}.uploadExecutionRun.log + + if [ "${species}" == "Homo sapiens" ] + then + genomeName=\$(echo GRCh${refHuVersion}) + elif [ "${species}" == "Mus musculus" ] + then + genomeName=\$(echo GRCm${refMoVersion}) + fi + if [ "${spike}" == "true" ] + then + genomeName=\$(echo \${genomeName}-S) + fi + echo LOG: searching for genome name - \${genomeName} >> ${repRID}.uploadExecutionRun.log + genome=\$(curl -s 
https://${source}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Name=\${genomeName}) + genome=\$(echo \${genome} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + genome=\${genome:7:-6} + echo LOG: genome RID extracted - \${genome} >> ${repRID}.uploadExecutionRun.log + + cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') + cookie=\${cookie:11:-1} + + exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=\${workflow}/Replicate=${repRID}/Input_Bag=${inputBagRID}) + echo \${exist} >> ${repRID}.uploadExecutionRun.log + if [ "\${exist}" == "[]" ] + then + executionRun_rid=\$(python3 ${script_uploadExecutionRun_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s In-progress -d 'Run in process' -o ${source} -c \${cookie} -u F) + echo LOG: execution run RID uploaded - \${executionRun_rid} >> ${repRID}.uploadExecutionRun.log + else + rid=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + rid=\${rid:7:-6} + echo \${rid} >> ${repRID}.uploadExecutionRun.log + executionRun_rid=\$(python3 ${script_uploadExecutionRun_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s In-progress -d 'Run in process' -o ${source} -c \${cookie} -u \${rid}) + echo LOG: execution run RID updated - \${executionRun_rid} >> ${repRID}.uploadExecutionRun.log + fi + + echo "\${executionRun_rid}" > executionRunRID.csv + + if [ ${params.track} == true ] + then + curl -H 'Content-Type: application/json' -X PUT -d \ + '{ \ + "ID": "${workflow.sessionId}", \ + "ExecutionRunRID": "'\${executionRun_rid}'" \ + }' \ + "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track" + fi + """ +} + +// Extract execution run RID into channel and replicate them for multiple process inputs +executionRunRID = Channel.create() +executionRunRID_fl.splitCsv(sep: ",", header: false).separate( + executionRunRID +) +executionRunRID.into { + executionRunRID_uploadQC + executionRunRID_uploadProcessedFile + executionRunRID_uploadOutputBag + executionRunRID_finalizeExecutionRun + executionRunRID_failExecutionRun + executionRunRID_fail +} + /* * uploadQC: uploads the mRNA QC -*/ + */ process uploadQC { tag "${repRID}" @@ -2099,28 +2379,32 @@ process uploadQC { val rawCount from rawReadsInfer_uploadQC val finalCount from assignedReadsInfer_uploadQC val tinMed from tinMedInfer_uploadQC - val fastqCountError_uploadQC - val fastqReadError_uploadQC - val fastqFileError_uploadQC - val speciesError_uploadQC - val pipelineError_uploadQC + val fastqCountError from fastqCountError_uploadQC + val fastqReadError from fastqReadError_uploadQC + val fastqFileError from fastqFileError_uploadQC + val seqtypeError from seqtypeError_uploadQC + val speciesErrorSeqwho from speciesErrorSeqwho_uploadQC + val speciesError from speciesError_uploadQC + val pipelineError from pipelineError_uploadQC output: path ("qcRID.csv") into qcRID_fl when: upload - fastqCountError_uploadQC == 'false' - fastqReadError_uploadQC == 'false' - fastqFileError_uploadQC == 'false' - speciesError_uploadQC == 'false' - pipelineError_uploadQC == 'false' + fastqCountError == "false" && fastqReadError == "false" && fastqFileError == "false" && seqtypeError == "false" && speciesErrorSeqwho == "false" && speciesError == "false" && pipelineError == "false" script: """ hostname > ${repRID}.uploadQC.log ulimit -a >> ${repRID}.uploadQC.log + # link credential file for authentication + echo -e "LOG: linking deriva credentials" >> ${repRID}.uploadQC.log + mkdir 
-p ~/.deriva + ln -sf `readlink -e credential.json` ~/.deriva/credential.json + echo -e "LOG: linked" >> ${repRID}.uploadQC.log + if [ "${ends}" == "pe" ] then end="Paired End" @@ -2152,8 +2436,8 @@ process uploadQC { } /* - *uploadProcessedFile: uploads the processed files -*/ + * uploadProcessedFile: uploads the processed files + */ process uploadProcessedFile { tag "${repRID}" publishDir "${outDir}/outputBag", mode: 'copy', pattern: "Replicate_${repRID}.outputBag.zip" @@ -2171,27 +2455,32 @@ process uploadProcessedFile { val studyRID from studyRID_uploadProcessedFile val expRID from expRID_uploadProcessedFile val executionRunRID from executionRunRID_uploadProcessedFile - val fastqCountError_uploadProcessedFile - val fastqReadError_uploadProcessedFile - val fastqFileError_uploadProcessedFile - val speciesError_uploadProcessedFile - val pipelineError_uploadProcessedFile + val fastqCountError from fastqCountError_uploadProcessedFile + val fastqReadError from fastqReadError_uploadProcessedFile + val fastqFileError from fastqFileError_uploadProcessedFile + val seqtypeError from seqtypeError_uploadProcessedFile + val speciesErrorSeqwho from speciesErrorSeqwho_uploadProcessedFile + val speciesError from speciesError_uploadProcessedFile + val pipelineError from pipelineError_uploadProcessedFile output: path ("${repRID}_Output_Bag.zip") into outputBag when: upload - fastqCountError_uploadProcessedFile == 'false' - fastqReadError_uploadProcessedFile == 'false' - fastqFileError_uploadProcessedFile == 'false' - speciesError_uploadProcessedFile == 'false' - pipelineError_uploadProcessedFile == 'false' + fastqCountError == "false" && fastqReadError == "false" && fastqFileError == "false" && seqtypeError == "false" && speciesErrorSeqwho == "false" && speciesError == "false" && pipelineError == "false" script: """ - hostname > ${repRID}.outputBag.log - ulimit -a >> ${repRID}.outputBag.log + + hostname > ${repRID}.uploadProcessedFile.log + ulimit -a >> ${repRID}.uploadProcessedFile.log + + # link credential file for authentication + echo -e "LOG: linking deriva credentials" >> ${repRID}.uploadProcessedFile.log + mkdir -p ~/.deriva + ln -sf `readlink -e credential.json` ~/.deriva/credential.json + echo -e "LOG: linked" >> ${repRID}.uploadProcessedFile.log mkdir -p ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ cp ${bam} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ @@ -2210,14 +2499,14 @@ process uploadProcessedFile { do python3 ${script_deleteEntry_uploadProcessedFile} -r \${rid} -t Processed_File -o ${source} -c \${cookie} done - echo LOG: all old processed file RIDs deleted >> ${repRID}.uploadQC.log + echo LOG: all old processed file RIDs deleted >> ${repRID}.uploadProcessedFile.log fi deriva-upload-cli --catalog 2 --token \${cookie:9} ${source} ./deriva - echo LOG: processed files uploaded >> ${repRID}.outputBag.log + echo LOG: processed files uploaded >> ${repRID}.outpuploadProcessedFileutBag.log deriva-download-cli --catalog 2 --token \${cookie:9} ${source} ${executionRunExportConfig} . 
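# A worked example (hypothetical cookie value) of the credential.json parsing used throughout
# the upload steps: grep pulls the quoted cookie field, then bash substring expansion strips the
# surrounding JSON syntax, and a second expansion strips the assumed "webauthn=" prefix to give
# the bare value passed to --token (negative-length substrings need bash 4.2+).
cookie='"cookie": "webauthn=0123456789abcdef"'   # as returned by the grep above
cookie=${cookie:11:-1}                            # -> webauthn=0123456789abcdef
echo ${cookie:9}                                  # -> 0123456789abcdef (used with --token)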
rid=${executionRunRID} - echo LOG: execution run bag downloaded >> ${repRID}.outputBag.log + echo LOG: execution run bag downloaded >> ${repRID}.uploadProcessedFile.log echo -e "### Run Details" >> runDetails.md echo -e "**Workflow URL:** https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq" >> runDetails.md @@ -2235,7 +2524,7 @@ process uploadProcessedFile { echo -e "**Genome Assembly Version:** \${genome} patch \${patch}" >> runDetails.md echo -e "**Annotation Version:** GENCODE release \${annotation}" >> runDetails.md echo -e "**Run ID:** ${repRID}" >> runDetails.md - echo LOG: runDetails.md created >> ${repRID}.outputBag.log + echo LOG: runDetails.md created >> ${repRID}.uploadProcessedFile.log unzip Execution_Run_${executionRunRID}.zip yr=\$(date +'%Y') @@ -2249,13 +2538,13 @@ process uploadProcessedFile { cp ${multiqcJSON} \${loc} bdbag ./${repRID}_Output_Bag/ --update --archiver zip --debug - echo LOG: output bag created >> ${repRID}.outputBag.log + echo LOG: output bag created >> ${repRID}.uploadProcessedFile.log """ } /* * uploadOutputBag: uploads the output bag -*/ + */ process uploadOutputBag { tag "${repRID}" @@ -2265,28 +2554,32 @@ process uploadOutputBag { path outputBag val studyRID from studyRID_uploadOutputBag val executionRunRID from executionRunRID_uploadOutputBag - val fastqCountError_uploadOutputBag - val fastqReadError_uploadOutputBag - val fastqFileError_uploadOutputBag - val speciesError_uploadOutputBag - val pipelineError_uploadOutputBag + val fastqCountError from fastqCountError_uploadOutputBag + val fastqReadError from fastqReadError_uploadOutputBag + val fastqFileError from fastqFileError_uploadOutputBag + val seqtypeError from seqtypeError_uploadOutputBag + val speciesErrorSeqwho from speciesErrorSeqwho_uploadOutputBag + val speciesError from speciesError_uploadOutputBag + val pipelineError from pipelineError_uploadOutputBag output: path ("outputBagRID.csv") into outputBagRID_fl when: upload - fastqCountError_uploadOutputBag == 'false' - fastqReadError_uploadOutputBag == 'false' - fastqFileError_uploadOutputBag == 'false' - speciesError_uploadOutputBag == 'false' - pipelineError_uploadOutputBag == 'false' + fastqCountError == "false" && fastqReadError == "false" && fastqFileError == "false" && seqtypeError == "false" && speciesErrorSeqwho == "false" && speciesError == "false" && pipelineError == "false" script: """ hostname > ${repRID}.uploadOutputBag.log ulimit -a >> ${repRID}.uploadOutputBag.log + # link credential file for authentication + echo -e "LOG: linking deriva credentials" >> ${repRID}.uploadOutputBag.log + mkdir -p ~/.deriva + ln -sf `readlink -e credential.json` ~/.deriva/credential.json + echo -e "LOG: linked" >> ${repRID}.uploadOutputBag.log + yr=\$(date +'%Y') mn=\$(date +'%m') dy=\$(date +'%d') @@ -2333,7 +2626,7 @@ outputBagRID_fl.splitCsv(sep: ",", header: false).separate( /* * finalizeExecutionRun: finalizes the execution run -*/ + */ process finalizeExecutionRun { tag "${repRID}" @@ -2343,9 +2636,17 @@ process finalizeExecutionRun { val executionRunRID from executionRunRID_finalizeExecutionRun val inputBagRID from inputBagRID_finalizeExecutionRun val outputBagRID + val fastqCountError from fastqCountError_finalizeExecutionRun + val fastqReadError from fastqReadError_finalizeExecutionRun + val fastqFileError from fastqFileError_finalizeExecutionRun + val seqtypeError from seqtypeError_finalizeExecutionRun + val speciesErrorSeqwho from speciesErrorSeqwho_finalizeExecutionRun + val speciesError from speciesError_finalizeExecutionRun + val 
pipelineError from pipelineError_finalizeExecutionRun when: upload + fastqCountError == "false" && fastqReadError == "false" && fastqFileError == "false" && seqtypeError == "false" && speciesErrorSeqwho == "false" && speciesError == "false" && pipelineError == "false" script: """ @@ -2376,16 +2677,16 @@ process finalizeExecutionRun { } // Combine errors -error_meta = fastqCountError_uploadQC_fail.ifEmpty(false).combine(fastqReadError_uploadQC_fail.ifEmpty(false).combine(fastqFileError_uploadQC_fail.ifEmpty(false).combine(speciesError_uploadQC_fail.ifEmpty(false).combine(pipelineError_uploadQC_fail.ifEmpty(false))))) -error_meta. into{ +error_meta = fastqCountError_uploadQC_fail.ifEmpty(false).combine(fastqReadError_uploadQC_fail.ifEmpty(false).combine(fastqFileError_uploadQC_fail.ifEmpty(false).combine(seqtypeError_uploadQC_fail.ifEmpty(false).combine(speciesErrorSeqwho_uploadQC_fail.ifEmpty(false).combine(speciesError_uploadQC_fail.ifEmpty(false).combine(pipelineError_uploadQC_fail.ifEmpty(false))))))) +error_meta. into { error_failPreExecutionRun error_uploadQC_fail } -errorDetails = fastqCountError_details.ifEmpty("").combine(fastqReadError_details.ifEmpty("").combine(fastqFileError_details.ifEmpty("").combine(speciesError_details.ifEmpty("")))) +errorDetails = fastqCountError_details.ifEmpty("").combine(fastqReadError_details.ifEmpty("").combine(fastqFileError_details.ifEmpty("").combine(seqtypeError_details.ifEmpty("").combine(speciesErrorSeqwho_details.ifEmpty(""))))) /* - * failPreExecutionRun_fastq: fail the execution run prematurely for fastq errors -*/ + * failPreExecutionRun: fail the execution run prematurely for fastq errors + */ process failPreExecutionRun { tag "${repRID}" @@ -2395,15 +2696,15 @@ process failPreExecutionRun { val spike from spikeMeta_failPreExecutionRun val species from speciesMeta_failPreExecutionRun val inputBagRID from inputBagRID_failPreExecutionRun - tuple val (fastqCountError), val (fastqReadError), val (fastqFileError), val (speciesError), val (pipelineError) from error_failPreExecutionRun - tuple val (fastqCountError_details), val (fastqReadError_details), val (fastqFileError_details), val (speciesError_details) from errorDetails + tuple val (fastqCountError), val (fastqReadError), val (fastqFileError), val (seqtypeError), val (speciesErrorSeqwho), val (speciesError), val (pipelineError) from error_failPreExecutionRun + tuple val (fastqCountError_details), val (fastqReadError_details), val (fastqFileError_details), val (seqtypeError_details), val (speciesError_details) from errorDetails output: path ("executionRunRID.csv") into executionRunRID_preFail_fl when: upload - fastqCountError == 'true' || fastqReadError == 'true' || fastqFileError == 'true' || speciesError == 'true' + fastqCountError == "true" || fastqReadError == "true" || fastqFileError == "true" || seqtypeError == "true" || speciesError == "true" script: """ @@ -2419,10 +2720,13 @@ process failPreExecutionRun { errorDetails=\$(echo \$(errorDetails)${fastqReadError_details}"\\n") elif [ ${fastqFileError} == true ] then - errorDetails=\$(echo \$(errorDetails)${fastqReadError_details}"\\n") + errorDetails=\$(echo \$(errorDetails)${fastqFileError_details}"\\n") + elif [ ${seqtypeError} == true ] + then + errorDetails=\$(echo \$(errorDetails)${seqtypeError_details}"\\n") elif [ ${speciesError} == true ] then - errorDetails=\$(echo \$(errorDetails)${fastqReadError_details}"\\n") + errorDetails=\$(echo \$(errorDetails)${speciesError_details}"\\n") fi echo LOG: searching for workflow RID - BICF 
mRNA ${workflow.manifest.version} >> ${repRID}.failPreExecutionRun.log
@@ -2490,7 +2794,7 @@ failExecutionRunRID = executionRunRID_fail.ifEmpty('').mix(executionRunRID_preFa
 /*
 * failExecutionRun: fail the execution run
-*/
+ */
 process failExecutionRun {
   tag "${repRID}"
@@ -2516,7 +2820,7 @@ process failExecutionRun {
   when:
     upload
-    pipelineError == 'true'
+    pipelineError == "true"

   script:
     """
@@ -2585,7 +2889,7 @@ process failExecutionRun {
 /*
 * uploadQC_fail: uploads the mRNA QC on failed execution run
-*/
+ */
 process uploadQC_fail {
   tag "${repRID}"
@@ -2594,17 +2898,23 @@ process uploadQC_fail {
     path script_uploadQC_fail
     path credential, stageAs: "credential.json" from deriva_uploadQC_fail
     val executionRunRID from failExecutionRunRID
-    tuple val (fastqCountError), val (fastqReadError), val (fastqFileError), val (speciesError), val (pipelineError) from error_uploadQC_fail
+    tuple val (fastqCountError), val (fastqReadError), val (fastqFileError), val (seqtypeError), val (speciesErrorSeqwho), val (speciesError), val (pipelineError) from error_uploadQC_fail

   when:
     upload
-    fastqCountError == 'true' || fastqReadError == 'true' || fastqFileError == 'true' || speciesError == 'true' || pipelineError == 'true'
+    fastqCountError == "true" || fastqReadError == "true" || fastqFileError == "true" || seqtypeError == "true" || speciesErrorSeqwho == "true" || speciesError == "true" || pipelineError == "true"

   script:
     """
     hostname > ${repRID}.uploadQC.log
     ulimit -a >> ${repRID}.uploadQC.log

+    # link credential file for authentication
+    echo -e "LOG: linking deriva credentials" >> ${repRID}.uploadQC.log
+    mkdir -p ~/.deriva
+    ln -sf `readlink -e credential.json` ~/.deriva/credential.json
+    echo -e "LOG: linked" >> ${repRID}.uploadQC.log
+
     cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"')
     cookie=\${cookie:11:-1}
@@ -2627,7 +2937,6 @@ process uploadQC_fail {
     """
 }
-
 workflow.onError = {
   subject = "$workflow.manifest.name FAILED: $params.repRID"
diff --git a/test_data/createTestData.sh b/test_data/createTestData.sh
index 5d876ed032790d0e3442aed94a0fd79e5e430e60..0f0454112ad278e5e032582bf30042f5a41495d2 100644
--- a/test_data/createTestData.sh
+++ b/test_data/createTestData.sh
@@ -31,6 +31,14 @@ pigz Q-Y5F6_1M.R2.fastq
 cp Q-Y5F6_1M.R1.fastq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
 cp Q-Y5F6_1M.R2.fastq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
+mkdir -p ./NEW_test_data/fastq/xsmall
+singularity exec 'docker://gudmaprbk/seqtk1.3:1.0.0' seqtk sample -s100 ./NEW_test_data/fastq/Q-Y5F6.R1.fastq.gz 10000 1> Q-Y5F6_10K.R1.fastq
+singularity exec 'docker://gudmaprbk/seqtk1.3:1.0.0' seqtk sample -s100 ./NEW_test_data/fastq/Q-Y5F6.R2.fastq.gz 10000 1> Q-Y5F6_10K.R2.fastq
+pigz Q-Y5F6_10K.R1.fastq
+pigz Q-Y5F6_10K.R2.fastq
+cp Q-Y5F6_10K.R1.fastq.gz ./NEW_test_data/fastq/xsmall/Q-Y5F6_10K.R1.fastq.gz
+cp Q-Y5F6_10K.R2.fastq.gz ./NEW_test_data/fastq/xsmall/Q-Y5F6_10K.R2.fastq.gz
+
 mkdir -p ./NEW_test_data/meta
 singularity run 'docker://gudmaprbk/trimgalore0.6.5:1.0.0' trim_galore --gzip -q 25 --illumina --length 35 --basename Q-Y5F6_1M.se -j 20 ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
 singularity run 'docker://gudmaprbk/trimgalore0.6.5:1.0.0' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename Q-Y5F6_1M.pe -j 20 ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
diff --git a/workflow/conf/bdbag.json b/workflow/conf/bdbag.json
new file mode 100644
index 0000000000000000000000000000000000000000..2c2ab245e7d3470d8bb341136dac278360b4d99f
--- /dev/null
+++ b/workflow/conf/bdbag.json
@@ -0,0 +1,28 @@
+{
+  "fetch_config": {
+    "http": {
+      "http_cookies": {
+        "file_names": [
+          "*cookies.txt"
+        ],
+        "scan_for_cookie_files": true,
+        "search_paths": [
+          "."
+        ],
+        "search_paths_filter": "*cookies.txt"
+      }
+    },
+    "https": {
+      "http_cookies": {
+        "file_names": [
+          "*cookies.txt"
+        ],
+        "scan_for_cookie_files": true,
+        "search_paths": [
+          "."
+        ],
+        "search_paths_filter": "*cookies.txt"
+      }
+    }
+  }
+}
diff --git a/workflow/conf/multiqc_config.yaml b/workflow/conf/multiqc_config.yaml
index ed1375aed47a454394029e5057695b0c15babd8c..89059e01682adfcd9354e3c7d78b6a65a87bf569 100644
--- a/workflow/conf/multiqc_config.yaml
+++ b/workflow/conf/multiqc_config.yaml
@@ -27,17 +27,17 @@ top_modules:
   - picard:
       name: 'Dedup'
       info: 'Replicate Alignement Deduplication QC Results'
+  - featureCounts:
+      name: 'Count'
+      info: 'Replicate Feature Count QC Results'
   - rseqc:
       name: 'Inner Distance'
       info: 'Replicate Paired End Inner Distance Distribution Results'
       path_filters:
        - '*insertSize*'
   - custom_content
-  - featureCounts:
-      name: 'Count'
-      info: 'Replicate Feature Count QC Results'
   - hisat2:
-      name: 'Inference: Align'
+      name: 'Inference: Spike-in'
       info: 'Inference Alignment (1M downsampled reads) QC Results'
       path_filters:
        - '*alignSampleSummary*'
@@ -131,8 +131,8 @@ custom_data:
   ref:
     file_format: 'tsv'
-    section_name: 'Reference'
-    description: 'This is the reference version information'
+    section_name: 'Genome Reference'
+    description: 'This is the genome reference version information'
     plot_type: 'table'
     pconfig:
       id: 'ref'
@@ -166,6 +166,28 @@ custom_data:
       71 - 80
       81 - 90
       91 - 100
+  seqwho:
+    file_format: 'tsv'
+    section_name: 'Inference: Sequencing Type & Species'
+    description: 'These are the inference results from the SeqWho tool'
+    plot_type: 'table'
+    pconfig:
+      id: 'ref'
+      scale: false
+      format: '{}'
+    headers:
+      Read:
+        description: 'Sequencing read (R1/R2)'
+      Seq Type:
+        description: 'Inferred Sequence Type'
+      Species:
+        description: 'Inferred Species'
+      Seq Type Confidence:
+        description: 'Sequence type call confidence'
+      Seq Type Consensus:
+        description: 'Low confidence sequence type call sampling inference consensus'
+      Species Confidence:
+        description: 'Species call confidence'

 sp:
   run:
@@ -178,3 +200,5 @@ sp:
     fn: 'reference.tsv'
   tin:
     fn: '*_tin.hist.tsv'
+  seqwho:
+    fn: 'seqwhoInfer.tsv'
diff --git a/workflow/conf/ondemand.config b/workflow/conf/ondemand.config
deleted file mode 100755
index 131fdbb19e1fedf1bc9e206a03d801f13791b810..0000000000000000000000000000000000000000
--- a/workflow/conf/ondemand.config
+++ /dev/null
@@ -1,3 +0,0 @@
-process {
-  queue = 'highpriority-0ef8afb0-c7ad-11ea-b907-06c94a3c6390'
-}
diff --git a/workflow/conf/spot.config b/workflow/conf/spot.config
deleted file mode 100755
index d9c7a4c8fa34aadd597da0170f8e3e223923011a..0000000000000000000000000000000000000000
--- a/workflow/conf/spot.config
+++ /dev/null
@@ -1,3 +0,0 @@
-process {
-  queue = 'default-0ef8afb0-c7ad-11ea-b907-06c94a3c6390'
-}
diff --git a/workflow/nextflow.config b/workflow/nextflow.config
deleted file mode 100644
index 44f2df5255691ee4eaf11ecf9cee1af2fa27f743..0000000000000000000000000000000000000000
--- a/workflow/nextflow.config
+++ /dev/null
@@ -1,130 +0,0 @@
-profiles {
-  standard {
-    includeConfig 'conf/biohpc.config'
-  }
-  biohpc {
-    includeConfig 'conf/biohpc.config'
-  }
-  biohpc_max {
-    includeConfig 'conf/biohpc_max.config'
-  }
-  aws_ondemand {
-    includeConfig 'conf/aws.config'
-    includeConfig 'conf/ondemand.config'
-  }
-  aws_spot {
-    includeConfig 'conf/aws.config'
-    includeConfig 'conf/spot.config'
-  }
-}
-
-process {
-  withName:getBag {
-    container = 'gudmaprbk/deriva1.4:1.0.0'
-  }
-  withName:getData {
-    container = 'gudmaprbk/deriva1.4:1.0.0'
-  }
-  withName:parseMetadata {
-    container = 'gudmaprbk/python3:1.0.0'
-  }
-  withName:trimData {
-    container = 'gudmaprbk/trimgalore0.6.5:1.0.0'
-  }
-  withName:getRefInfer {
-    container = 'gudmaprbk/deriva1.4:1.0.0'
-  }
-  withName:downsampleData {
-    container = 'gudmaprbk/seqtk1.3:1.0.0'
-  }
-  withName:alignSampleData {
-    container = 'gudmaprbk/hisat2.2.1:1.0.0'
-  }
-  withName:inferMetadata {
-    container = 'gudmaprbk/rseqc4.0.0:1.0.0'
-  }
-  withName:checkMetadata {
-    container = 'gudmaprbk/gudmap-rbk_base:1.0.0'
-  }
-  withName:getRef {
-    container = 'gudmaprbk/deriva1.4:1.0.0'
-  }
-  withName:alignData {
-    container = 'gudmaprbk/hisat2.2.1:1.0.0'
-  }
-  withName:dedupData {
-    container = 'gudmaprbk/picard2.23.9:1.0.0'
-  }
-  withName:countData {
-    container = 'gudmaprbk/subread2.0.1:1.0.0'
-  }
-  withName:makeBigWig {
-    container = 'gudmaprbk/deeptools3.5.0:1.0.0'
-  }
-  withName:fastqc {
-    container = 'gudmaprbk/fastqc0.11.9:1.0.0'
-  }
-  withName:dataQC {
-    container = 'gudmaprbk/rseqc4.0.0:1.0.0'
-  }
-  withName:aggrQC {
-    container = 'gudmaprbk/multiqc1.9:1.0.0'
-  }
-  withName:uploadInputBag {
-    container = 'gudmaprbk/deriva1.4:1.0.0'
-  }
-  withName:uploadExecutionRun {
-    container = 'gudmaprbk/deriva1.4:1.0.0'
-  }
-  withName:uploadQC {
-    container = 'gudmaprbk/deriva1.4:1.0.0'
-  }
-  withName:uploadProcessedFile {
-    container = 'gudmaprbk/deriva1.4:1.0.0'
-  }
-  withName:uploadOutputBag {
-    container = 'gudmaprbk/deriva1.4:1.0.0'
-  }
-  withName:finalizeExecutionRun {
-    container = 'gudmaprbk/deriva1.4:1.0.0'
-  }
-  withName:failPreExecutionRun {
-    container = 'gudmaprbk/deriva1.4:1.0.0'
-  }
-  withName:failExecutionRun {
-    container = 'gudmaprbk/deriva1.4:1.0.0'
-  }
-  withName:uploadQC_fail {
-    container = 'gudmaprbk/deriva1.4:1.0.0'
-  }
-}
-
-trace {
-  enabled = false
-  file = 'trace.txt'
-  fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss'
-}
-
-timeline {
-  enabled = false
-  file = 'timeline.html'
-}
-
-report {
-  enabled = false
-  file = 'report.html'
-}
-
-tower {
-  accessToken = '3ade8f325d4855434b49aa387421a44c63e3360f'
-  enabled = true
-}
-
-manifest {
-  name = 'gudmap_rbk/rna-seq'
-  homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq'
-  description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.'
-  mainScript = 'rna-seq.nf'
-  version = 'v2.0.0rc01'
-  nextflowVersion = '>=19.09.0'
-}
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
deleted file mode 100644
index ec289fc6242bbff2f7a74a2f6e57f8728ecc487c..0000000000000000000000000000000000000000
--- a/workflow/rna-seq.nf
+++ /dev/null
@@ -1,2649 +0,0 @@
-#!/usr/bin/env nextflow
-
-// ######## #### ###### ########
-// ## ## ## ## ## ##
-// ## ## ## ## ##
-// ######## ## ## ######
-// ## ## ## ## ##
-// ## ## ## ## ## ##
-// ######## #### ###### ##
-
-// Define input variables
-params.deriva = "${baseDir}/../test_data/auth/credential.json"
-params.bdbag = "${baseDir}/../test_data/auth/cookies.txt"
-//params.repRID = "16-1ZX4"
-params.repRID = "Q-Y5F6"
-params.source = "dev"
-params.refMoVersion = "38.p6.vM25"
-params.refHuVersion = "38.p13.v36"
-params.refERCCVersion = "92"
-params.outDir = "${baseDir}/../output"
-params.upload = false
-params.email = ""
-params.track = false
-
-
-// Define override input variable
-params.refSource = "biohpc"
-params.inputBagForce = ""
-params.fastqsForce = ""
-params.speciesForce = ""
-params.strandedForce = ""
-params.spikeForce = ""
-
-// Define tracking input variables
-params.ci = false
-params.dev = true
-
-
-// Parse input variables
-deriva = Channel
-  .fromPath(params.deriva)
-  .ifEmpty { exit 1, "deriva credential file not found: ${params.deriva}" }
-deriva.into {
-  deriva_getBag
-  deriva_getRefInfer
-  deriva_getRef
-  deriva_uploadInputBag
-  deriva_uploadExecutionRun
-  deriva_uploadQC
-  deriva_uploadQC_fail
-  deriva_uploadProcessedFile
-  deriva_uploadOutputBag
-  deriva_finalizeExecutionRun
-  deriva_failPreExecutionRun
-  deriva_failExecutionRun
-}
-bdbag = Channel
-  .fromPath(params.bdbag)
-  .ifEmpty { exit 1, "deriva cookie file for bdbag not found: ${params.bdbag}" }
-repRID = params.repRID
-refMoVersion = params.refMoVersion
-refHuVersion = params.refHuVersion
-refERCCVersion = params.refERCCVersion
-outDir = params.outDir
-logsDir = "${outDir}/Logs"
-upload = params.upload
-inputBagForce = params.inputBagForce
-fastqsForce = params.fastqsForce
-speciesForce = params.speciesForce
-strandedForce = params.strandedForce
-spikeForce = params.spikeForce
-email = params.email
-
-// Define fixed files and variables
-replicateExportConfig = Channel.fromPath("${baseDir}/conf/Replicate_For_Input_Bag.json")
-executionRunExportConfig = Channel.fromPath("${baseDir}/conf/Execution_Run_For_Output_Bag.json")
-if (params.source == "dev") {
-  source = "dev.gudmap.org"
-} else if (params.source == "staging") {
-  source = "staging.gudmap.org"
-} else if (params.source == "production") {
-  source = "www.gudmap.org"
-}
-if (params.refSource == "biohpc") {
-  referenceBase = "/project/BICF/BICF_Core/shared/gudmap/references/new"
-} else if (params.refSource == "datahub") {
-  referenceBase = "www.gudmap.org"
-}
-referenceInfer = Channel.fromList(["ERCC","GRCh","GRCm"])
-multiqcConfig = Channel.fromPath("${baseDir}/conf/multiqc_config.yaml")
-bicfLogo = Channel.fromPath("${baseDir}/../docs/bicf_logo.png")
-softwareReferences = Channel.fromPath("${baseDir}/../docs/software_references_mqc.yaml")
-softwareVersions = Channel.fromPath("${baseDir}/../docs/software_versions_mqc.yaml")
-
-// Define script files
-script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbag_fetch.sh")
-script_parseMeta = Channel.fromPath("${baseDir}/scripts/parse_meta.py")
-script_inferMeta = Channel.fromPath("${baseDir}/scripts/infer_meta.sh")
-script_refDataInfer = Channel.fromPath("${baseDir}/scripts/extract_ref_data.py")
-script_refData = Channel.fromPath("${baseDir}/scripts/extract_ref_data.py") -script_calculateTPM = Channel.fromPath("${baseDir}/scripts/calculateTPM.R") -script_convertGeneSymbols = Channel.fromPath("${baseDir}/scripts/convertGeneSymbols.R") -script_tinHist = Channel.fromPath("${baseDir}/scripts/tin_hist.py") -script_uploadInputBag = Channel.fromPath("${baseDir}/scripts/upload_input_bag.py") -script_uploadExecutionRun_uploadExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py") -script_uploadExecutionRun_finalizeExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py") -script_uploadExecutionRun_failPreExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py") -script_uploadExecutionRun_failExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py") -script_uploadQC = Channel.fromPath("${baseDir}/scripts/upload_qc.py") -script_uploadQC_fail = Channel.fromPath("${baseDir}/scripts/upload_qc.py") -script_uploadOutputBag = Channel.fromPath("${baseDir}/scripts/upload_output_bag.py") -script_deleteEntry_uploadQC = Channel.fromPath("${baseDir}/scripts/delete_entry.py") -script_deleteEntry_uploadQC_fail = Channel.fromPath("${baseDir}/scripts/delete_entry.py") -script_deleteEntry_uploadProcessedFile = Channel.fromPath("${baseDir}/scripts/delete_entry.py") - -/* - * trackStart: track start of pipeline - */ -process trackStart { - container 'docker://gudmaprbk/gudmap-rbk_base:1.0.0' - script: - """ - hostname - ulimit -a - - curl -H 'Content-Type: application/json' -X PUT -d \ - '{ \ - "sessionId": "${workflow.sessionId}", \ - "pipeline": "gudmap.rbk_rnaseq", \ - "start": "${workflow.start}", \ - "repRID": "${repRID}", \ - "astrocyte": false, \ - "status": "started", \ - "nextflowVersion": "${workflow.nextflow.version}", \ - "pipelineVersion": "${workflow.manifest.version}", \ - "ci": ${params.ci}, \ - "dev": ${params.dev} \ - }' \ - "https://xku43pcwnf.execute-api.us-east-1.amazonaws.com/ProdDeploy/pipeline-tracking" - - if [ ${params.track} == true ] - then - curl -H 'Content-Type: application/json' -X PUT -d \ - '{ \ - "ID": "${workflow.sessionId}", \ - "repRID": "${repRID}", \ - "PipelineVersion": "${workflow.manifest.version}", \ - "Server": "${params.source}", \ - "Queued": "NA", \ - "CheckedOut": "NA", \ - "Started": "${workflow.start}" \ - }' \ - "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track" - fi - """ -} - -log.info """\ -==================================== -BICF RNA-seq Pipeline for GUDMAP/RBK -==================================== -Replicate RID : ${params.repRID} -Source Server : ${params.source} -Mouse Reference Version: ${params.refMoVersion} -Human Reference Version: ${params.refHuVersion} -ERCC Reference Version : ${params.refERCCVersion} -Reference source : ${params.refSource} -Output Directory : ${params.outDir} -Upload : ${upload} -Track : ${params.track} ------------------------------------- -Nextflow Version : ${workflow.nextflow.version} -Pipeline Version : ${workflow.manifest.version} -Session ID : ${workflow.sessionId} ------------------------------------- -CI : ${params.ci} -Development : ${params.dev} ------------------------------------- -""" - -/* - * getBag: download input bag - */ -process getBag { - tag "${repRID}" - publishDir "${outDir}/inputBag", mode: 'copy', pattern: "*_inputBag_*.zip" - - input: - path credential, stageAs: "credential.json" from deriva_getBag - path replicateExportConfig - - output: - path ("*.zip") into bag - - when: - inputBagForce == "" - - 
script: - """ - hostname > ${repRID}.getBag.log - ulimit -a >> ${repRID}.getBag.log - - # link credential file for authentication - echo -e "LOG: linking deriva credentials" >> ${repRID}.getBag.log - mkdir -p ~/.deriva - ln -sf `readlink -e credential.json` ~/.deriva/credential.json - echo -e "LOG: linked" >> ${repRID}.getBag.log - - # deriva-download replicate RID - echo -e "LOG: fetching bag for ${repRID} in GUDMAP" >> ${repRID}.getBag.log - deriva-download-cli ${source} --catalog 2 ${replicateExportConfig} . rid=${repRID} - echo -e "LOG: fetched" >> ${repRID}.getBag.log - - name=\$(ls *.zip) - name=\$(basename \${name} | cut -d "." -f1) - yr=\$(date +'%Y') - mn=\$(date +'%m') - dy=\$(date +'%d') - mv \${name}.zip \${name}_\${yr}\${mn}\${dy}.zip - """ -} - -// Set inputBag to downloaded or forced input -if (inputBagForce != "") { - inputBag = Channel - .fromPath(inputBagForce) - .ifEmpty { exit 1, "override inputBag file not found: ${inputBagForce}" } -} else { - inputBag = bag -} -inputBag.into { - inputBag_getData - inputBag_uploadInputBag -} - -/* - * getData: fetch replicate files from consortium with downloaded bdbag.zip - */ -process getData { - tag "${repRID}" - - input: - path script_bdbagFetch - path cookies, stageAs: "deriva-cookies.txt" from bdbag - path inputBag from inputBag_getData - - output: - path ("*.R{1,2}.fastq.gz") into fastqs - path ("**/File.csv") into fileMeta - path ("**/Experiment Settings.csv") into experimentSettingsMeta - path ("**/Experiment.csv") into experimentMeta - path "fastqCount.csv" into fastqCount_fl - - script: - """ - hostname > ${repRID}.getData.log - ulimit -a >> ${repRID}.getData.log - - # link deriva cookie for authentication - echo -e "LOG: linking deriva cookie" >> ${repRID}.getData.log - mkdir -p ~/.bdbag - ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt - echo -e "LOG: linked" >> ${repRID}.getData.log - - # get bag basename - replicate=\$(basename "${inputBag}") - echo -e "LOG: bag replicate name \${replicate}" >> ${repRID}.getData.log - - # unzip bag - echo -e "LOG: unzipping replicate bag" >> ${repRID}.getData.log - unzip ${inputBag} - echo -e "LOG: unzipped" >> ${repRID}.getData.log - - # bag fetch fastq's only and rename by repRID - echo -e "LOG: fetching replicate bdbag" >> ${repRID}.getData.log - sh ${script_bdbagFetch} \${replicate::-13} ${repRID} - echo -e "LOG: fetched" >> ${repRID}.getData.log - - fastqCount=\$(ls *.fastq.gz | wc -l) - if [ "\${fastqCount}" == "0" ] - then - touch dummy.R1.fastq.gz - fi - echo "\${fastqCount}" > fastqCount.csv - """ -} - -// Split fastq count into channel -fastqCount = Channel.create() -fastqCount_fl.splitCsv(sep: ",", header: false).separate( - fastqCount -) - -// Set raw fastq to downloaded or forced input and replicate them for multiple process inputs -if (fastqsForce != "") { - Channel - .fromPath(fastqsForce) - .ifEmpty { exit 1, "override inputBag file not found: ${fastqsForce}" } - .collect().into { - fastqs_parseMetadata - fastqs_fastqc - } -} else { - fastqs.collect().into { - fastqs_parseMetadata - fastqs_fastqc - } -} - -/* - * parseMetadata: parses metadata to extract experiment parameters -*/ -process parseMetadata { - tag "${repRID}" - - input: - path script_parseMeta - path file from fileMeta - path experimentSettings, stageAs: "ExperimentSettings.csv" from experimentSettingsMeta - path experiment from experimentMeta - path (fastq) from fastqs_parseMetadata.collect() - val fastqCount - - output: - path "design.csv" into metadata_fl - path "fastqError.csv" into 
fastqError_fl - - script: - """ - hostname > ${repRID}.parseMetadata.log - ulimit -a >> ${repRID}.parseMetadata.log - - # check replicate RID metadata - rep=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p repRID) - echo -e "LOG: replicate RID metadata parsed: \${rep}" >> ${repRID}.parseMetadata.log - - # get experiment RID metadata - exp=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p expRID) - echo -e "LOG: experiment RID metadata parsed: \${exp}" >> ${repRID}.parseMetadata.log - - # get study RID metadata - study=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p studyRID) - echo -e "LOG: study RID metadata parsed: \${study}" >> ${repRID}.parseMetadata.log - - # get endedness metadata - endsRaw=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p endsMeta) - echo -e "LOG: endedness metadata parsed: \${endsRaw}" >> ${repRID}.parseMetadata.log - if [ "\${endsRaw}" == "Single End" ] - then - endsMeta="se" - elif [ "\${endsRaw}" == "Paired End" ] - then - endsMeta="pe" - elif [ "\${endsRaw}" == "Single Read" ] - # "Single Read" depreciated as of Jan 2021, this option is present for backwards compatibility - then - endsMeta="se" - elif [ "\${endsRaw}" == "nan" ] - then - endsRaw="_No value_" - endsMeta="NA" - fi - - # ganually get endness - if [ "${fastqCount}" == "1" ] - then - endsManual="se" - else - endsManual="pe" - fi - echo -e "LOG: endedness manually detected: ${fastqCount}" >> ${repRID}.parseMetadata.log - - # get strandedness metadata - stranded=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p stranded) - echo -e "LOG: strandedness metadata parsed: \${stranded}" >> ${repRID}.parseMetadata.log - if [ "\${stranded}" == "nan" ] - then - stranded="_No value_" - fi - - # get spike-in metadata - spike=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p spike) - echo -e "LOG: spike-in metadata parsed: \${spike}" >> ${repRID}.parseMetadata.log - if [ "\${spike}" == "nan" ] - then - spike="_No value_" - fi - if [ "\${spike}" == "f" ] - then - spike="false" - elif [ "\${spike}" == "t" ] - then - spike="true" - elif [ "\${spike}" == "no" ] - # "yes"/"no" depreciated as of Jan 2021, this option is present for backwards compatibility - then - spike="false" - elif [ "\${spike}" == "yes" ] - # "yes"/"no" depreciated as of Jan 2021, this option is present for backwards compatibility - then - spike="true" - elif [ "\${spike}" == "nan" ] - then - endsRaw="_No value_" - endsMeta="NA" - fi - - # get species metadata - species=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experiment}" -p species) - echo -e "LOG: species metadata parsed: \${species}" >> ${repRID}.parseMetadata.log - if [ "\${species}" == "nan" ] - then - species="_No value_" - fi - - # get read length metadata - readLength=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p readLength) - if [ "\${readLength}" = "nan" ] - then - readLength="NA" - fi - echo -e "LOG: read length metadata parsed: \${readLength}" >> ${repRID}.parseMetadata.log - - # check not incorrect number of fastqs - fastqCountError=false - fastqCountError_details="" - if [ "${fastqCount}" -gt "2" ] - then - fastqCountError=true - fastqCountError_details="**Too many fastqs detected (>2)**" - elif [ "${fastqCount}" -eq "0" ] - then - fastqCountError=true - fastqCountError_details="**No valid fastqs detected \\(may not match {_.}R{12}.fastq.gz convention\\)**" - elif [ "\${endsMeta}" == "se" ] && [ "${fastqCount}" -ne "1" ] - then - 
fastqCountError=true - fastqCountError_details="**Number of fastqs detected does not match submitted endness**" - elif [ "\${endsMeta}" == "pe" ] && [ "${fastqCount}" -ne "2" ] - then - fastqCountError=true - fastqCountError_details="**Number of fastqs detected does not match submitted endness**" - fi - - # check read counts match for fastqs - fastqReadError=false - fastqReadError_details="" - if [ "\${endsManual}" == "pe" ] - then - r1Count=\$(zcat ${fastq[0]} | wc -l) - r2Count=\$(zcat ${fastq[1]} | wc -l) - if [ "\${r1Count}" -ne "\${r2Count}" ] - then - fastqReadError=true - fastqReadError_details="**Number of reads do not match for R1 and R2:** there may be a trunkation or mismatch of fastq files" - fi - fi - - # save design file - echo "\${endsMeta},\${endsRaw},\${endsManual},\${stranded},\${spike},\${species},\${readLength},\${exp},\${study}" > design.csv - - # save fastq error file - echo "\${fastqCountError},\${fastqCountError_details},\${fastqReadError},\${fastqReadError_details}" > fastqError.csv - """ -} - -// Split metadata into separate channels -endsMeta = Channel.create() -endsRaw = Channel.create() -endsManual = Channel.create() -strandedMeta = Channel.create() -spikeMeta = Channel.create() -speciesMeta = Channel.create() -readLengthMeta = Channel.create() -expRID = Channel.create() -studyRID = Channel.create() -metadata_fl.splitCsv(sep: ",", header: false).separate( - endsMeta, - endsRaw, - endsManual, - strandedMeta, - spikeMeta, - speciesMeta, - readLengthMeta, - expRID, - studyRID -) - -// Replicate metadata for multiple process inputs -endsMeta.into { - endsMeta_checkMetadata - endsMeta_aggrQC - endsMeta_failExecutionRun -} -endsManual.into { - endsManual_trimData - endsManual_downsampleData - endsManual_alignSampleData - endsManual_aggrQC -} -strandedMeta.into { - strandedMeta_checkMetadata - strandedMeta_aggrQC - strandedMeta_failExecutionRun -} -spikeMeta.into { - spikeMeta_checkMetadata - spikeMeta_aggrQC - spikeMeta_failPreExecutionRun - spikeMeta_failExecutionRun -} -speciesMeta.into { - speciesMeta_checkMetadata - speciesMeta_aggrQC - speciesMeta_failPreExecutionRun - speciesMeta_failExecutionRun -} -studyRID.into { - studyRID_aggrQC - studyRID_uploadInputBag - studyRID_uploadProcessedFile - studyRID_uploadOutputBag -} -expRID.into { - expRID_aggrQC - expRID_uploadProcessedFile -} - -// Split fastq count error into separate channel -fastqCountError = Channel.create() -fastqCountError_details = Channel.create() -fastqReadError = Channel.create() -fastqReadError_details = Channel.create() -fastqError_fl.splitCsv(sep: ",", header: false).separate( - fastqCountError, - fastqCountError_details, - fastqReadError, - fastqReadError_details -) - -// Replicate errors for multiple process inputs -fastqCountError.into { - fastqCountError_fastqc - fastqCountError_trimData - fastqCountError_getRefInfer - fastqCountError_downsampleData - fastqCountError_alignSampleData - fastqCountError_inferMetadata - fastqCountError_checkMetadata - fastqCountError_uploadExecutionRun - fastqCountError_getRef - fastqCountError_alignData - fastqCountError_dedupData - fastqCountError_makeBigWig - fastqCountError_countData - fastqCountError_dataQC - fastqCountError_aggrQC - fastqCountError_uploadQC - fastqCountError_uploadQC_fail - fastqCountError_uploadProcessedFile - fastqCountError_uploadOutputBag - fastqCountError_failPreExecutionRun_fastq -} -fastqReadError.into { - fastqReadError_fastqc - fastqReadError_trimData - fastqReadError_getRefInfer - fastqReadError_downsampleData - 
fastqReadError_alignSampleData - fastqReadError_inferMetadata - fastqReadError_checkMetadata - fastqReadError_uploadExecutionRun - fastqReadError_getRef - fastqReadError_alignData - fastqReadError_dedupData - fastqReadError_makeBigWig - fastqReadError_countData - fastqReadError_dataQC - fastqReadError_aggrQC - fastqReadError_uploadQC - fastqReadError_uploadQC_fail - fastqReadError_uploadProcessedFile - fastqReadError_uploadOutputBag - fastqReadError_failPreExecutionRun_fastq -} - -/* - *fastqc: run fastqc on untrimmed fastq's -*/ -process fastqc { - tag "${repRID}" - - input: - path (fastq) from fastqs_fastqc.collect() - val fastqCountError_fastqc - val fastqReadError_fastqc - - output: - path ("*.R{1,2}.fastq.gz", includeInputs:true) into fastqs_trimData - path ("*_fastqc.zip") into fastqc - path ("rawReads.csv") into rawReadsInfer_fl - path "fastqFileError.csv" into fastqFileError_fl - - when: - fastqCountError_fastqc == 'false' && fastqReadError_fastqc == 'false' - - script: - """ - hostname > ${repRID}.fastqc.log - ulimit -a >> ${repRID}.fastqc.log - - # run fastqc - echo -e "LOG: running fastq on raw fastqs" >> ${repRID}.fastqc.log - fastqc *.fastq.gz -o . &> fastqc.out || true - fastqcErrorOut=\$(cat fastqc.out | grep -c 'Failed to process file') || fastqcErrorOut=0 - fastqFileError=false - fastqFileError_details="" - if [ "\${fastqcErrorOut}" -ne "0" ] - then - fastqFileError=true - fastqFileError_details="**There is an error with the structure of the fastq**" - echo -e "LOG: There is an error with the structure of the fastq" >> ${repRID}.fastqc.log - touch dummy_fastqc.zip - else - echo -e "LOG: The structure of the fastq is correct" >> ${repRID}.fastqc.log - fi - - # count raw reads - zcat *.R1.fastq.gz | echo \$((`wc -l`/4)) > rawReads.csv - - # save fastq error file - echo "\${fastqFileError},\${fastqFileError_details}" > fastqFileError.csv - """ -} - -// Extract number of raw reads metadata into channel -rawReadsInfer = Channel.create() -rawReadsInfer_fl.splitCsv(sep: ",", header: false).separate( - rawReadsInfer -) - -// Replicate inferred raw reads for multiple process inputs -rawReadsInfer.into { - rawReadsInfer_aggrQC - rawReadsInfer_uploadQC -} - -// Split fastq count error into separate channel -fastqFileError = Channel.create() -fastqFileError_details = Channel.create() -fastqFileError_fl.splitCsv(sep: ",", header: false).separate( - fastqFileError, - fastqFileError_details -) - -// Replicate errors for multiple process inputs -fastqFileError.into { - fastqFileError_fastqc - fastqFileError_trimData - fastqFileError_getRefInfer - fastqFileError_downsampleData - fastqFileError_alignSampleData - fastqFileError_inferMetadata - fastqFileError_checkMetadata - fastqFileError_uploadExecutionRun - fastqFileError_getRef - fastqFileError_alignData - fastqFileError_dedupData - fastqFileError_makeBigWig - fastqFileError_countData - fastqFileError_dataQC - fastqFileError_aggrQC - fastqFileError_uploadQC - fastqFileError_uploadQC_fail - fastqFileError_uploadProcessedFile - fastqFileError_uploadOutputBag - fastqFileError_failPreExecutionRun_fastqFile -} - -/* - * trimData: trims any adapter or non-host sequences from the data -*/ -process trimData { - tag "${repRID}" - - input: - path (fastq) from fastqs_trimData - val ends from endsManual_trimData - val fastqCountError_trimData - val fastqReadError_trimData - val fastqFileError_trimData - - output: - path ("*.fq.gz") into fastqsTrim - path ("*_trimming_report.txt") into trimQC - path ("readLength.csv") into readLengthInfer_fl - - when: 
- fastqCountError_trimData == "false" - fastqReadError_trimData == "false" - fastqFileError_trimData == "false" - - script: - """ - hostname > ${repRID}.trimData.log - ulimit -a >> ${repRID}.trimData.log - - # trim fastq's using trim_galore and extract median read length - echo -e "LOG: trimming ${ends}" >> ${repRID}.trimData.log - if [ "${ends}" == "se" ] - then - trim_galore --gzip -q 25 --length 35 --basename ${repRID} ${fastq[0]} - readLength=\$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}') - elif [ "${ends}" == "pe" ] - then - trim_galore --gzip -q 25 --length 35 --paired --basename ${repRID} ${fastq[0]} ${fastq[1]} - readLength=\$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}') - fi - echo -e "LOG: trimmed" >> ${repRID}.trimData.log - echo -e "LOG: average trimmed read length: \${readLength}" >> ${repRID}.trimData.log - - # save read length file - echo "\${readLength}" > readLength.csv - """ -} - -// Extract calculated read length metadata into channel -readLengthInfer = Channel.create() -readLengthInfer_fl.splitCsv(sep: ",", header: false).separate( - readLengthInfer -) - -// Replicate inferred read length for multiple process inputs -readLengthInfer.into { - readLengthInfer_aggrQC - readLengthInfer_uploadQC -} -// Replicate trimmed fastq's for multiple process inputs -fastqsTrim.into { - fastqsTrim_alignData - fastqsTrim_downsampleData -} - -// Combine inputs of getRefInfer -getRefInferInput = referenceInfer.combine(deriva_getRefInfer.combine(script_refDataInfer.combine(fastqCountError_getRefInfer.combine(fastqReadError_getRefInfer.combine(fastqFileError_getRefInfer))))) - -/* - * getRefInfer: dowloads appropriate reference for metadata inference -*/ -process getRefInfer { - tag "${refName}" - - input: - tuple val (refName), path (credential, stageAs: "credential.json"), path (script_refDataInfer), val (fastqCountError), val (fastqReadError), val (fastqFileError) from getRefInferInput - - output: - tuple val (refName), path ("hisat2", type: 'dir'), path ("*.fna"), path ("*.gtf") into refInfer - path ("${refName}", type: 'dir') into bedInfer - - when: - fastqCountError == "false" - fastqReadError == "false" - fastqFileError == "false" - - script: - """ - hostname > ${repRID}.${refName}.getRefInfer.log - ulimit -a >> ${repRID}.${refName}.getRefInfer.log - - # link credential file for authentication - echo -e "LOG: linking deriva credentials" >> ${repRID}.${refName}.getRefInfer.log - mkdir -p ~/.deriva - ln -sf `readlink -e credential.json` ~/.deriva/credential.json - echo -e "LOG: linked" >> ${repRID}.${refName}.getRefInfer.log - - # set the reference name - if [ "${refName}" == "ERCC" ] - then - references=\$(echo ${referenceBase}/ERCC${refERCCVersion}) - elif [ "${refName}" == "GRCm" ] - then - references=\$(echo ${referenceBase}/GRCm${refMoVersion}) - elif [ '${refName}' == "GRCh" ] - then - references=\$(echo ${referenceBase}/GRCh${refHuVersion}) - else - echo -e "LOG: ERROR - References could not be set!\nReference found: ${referenceBase}" >> ${repRID}.${refName}.getRefInfer.log - exit 1 - fi - - # retreive appropriate reference appropriate location - echo -e "LOG: fetching ${refName} reference files from ${referenceBase}" >> ${repRID}.${refName}.getRefInfer.log - if [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references/new" ] - then - unzip \${references}.zip - mv 
\$(basename \${references})/data/* . - elif [ params.refSource == "datahub" ] - then - GRCv=\$(echo \${references} | grep -o ${refName}.* | cut -d '.' -f1) - GRCp=\$(echo \${references} | grep -o ${refName}.* | cut -d '.' -f2) - GENCODE=\$(echo \${references} | grep -o ${refName}.* | cut -d '.' -f3) - if [ "${refName}" != "ERCC" ] - then - query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='\${GRCv}'.'\${GRCp}'/Annotation_Version=GENCODE%20'\${GENCODE}) - else - query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version=${refName}${refERCCVersion}/Annotation_Version=${refName}${refERCCVersion}') - fi - curl --request GET \${query} > refQuery.json - refURL=\$(python ${script_refDataInfer} --returnParam URL) - loc=\$(dirname \${refURL}) - fName=\$(python ${script_refDataInfer} --returnParam fName) - fName=\${fName%.*} - if [ "\${loc}" = "/hatrac/*" ]; then echo "LOG: Reference not present in hatrac"; exit 1; fi - filename=\$(echo \$(basename \${refURL}) | grep -oP '.*(?=:)') - deriva-hatrac-cli --host ${referenceBase} get \${refURL} - unzip \$(basename \${refURL}) - mv \${fName}/data/* . - fi - mv ./annotation/genome.gtf . - mv ./sequence/genome.fna . - mkdir ${refName} - if [ "${refName}" != "ERCC" ] - then - mv ./annotation/genome.bed ./${refName} - fi - echo -e "LOG: fetched" >> ${repRID}.${refName}.getRefInfer.log - """ -} - -/* - * downsampleData: downsample fastq's for metadata inference - */ -process downsampleData { - tag "${repRID}" - - input: - path fastq from fastqsTrim_downsampleData - val ends from endsManual_downsampleData - val fastqCountError_downsampleData - val fastqReadError_downsampleData - val fastqFileError_downsampleData - - output: - path ("sampled.1.fq") into fastqs1Sample - path ("sampled.2.fq") into fastqs2Sample - - when: - fastqCountError_downsampleData == "false" - fastqReadError_downsampleData == "false" - fastqFileError_downsampleData == "false" - - script: - """ - hostname > ${repRID}.downsampleData.log - ulimit -a >> ${repRID}.downsampleData.log - - if [ "${ends}" == "se" ] - then - echo -e "LOG: downsampling SE trimmed fastq" >> ${repRID}.downsampleData.log - seqtk sample -s100 *trimmed.fq.gz 100000 1> sampled.1.fq - touch sampled.2.fq - elif [ "${ends}" == "pe" ] - then - echo -e "LOG: downsampling R1 of PE trimmed fastq" >> ${repRID}.downsampleData.log - seqtk sample -s100 *1.fq.gz 1000000 1> sampled.1.fq - echo -e "LOG: downsampling R2 of PE trimmed fastq" >> ${repRID}.downsampleData.log - seqtk sample -s100 *2.fq.gz 1000000 1> sampled.2.fq - fi - echo -e "LOG: downsampled" >> ${repRID}.downsampleData.log - """ -} - -// Replicate the dowsampled fastq's and attatched to the references -inferInput = endsManual_alignSampleData.combine(refInfer.combine(fastqs1Sample.collect().combine(fastqs2Sample.collect().combine(fastqCountError_alignSampleData.combine(fastqReadError_alignSampleData.combine(fastqFileError_alignSampleData)))))) - -/* - * alignSampleData: aligns the downsampled reads to a reference database -*/ -process alignSampleData { - tag "${ref}" - - input: - tuple val (ends), val (ref), path (hisat2), path (fna), path (gtf), path (fastq1), path (fastq2), val (fastqCountError), val (fastqReadError), val (fastqFileError) from inferInput - - output: - path ("${ref}.sampled.sorted.bam") into sampleBam - path ("${ref}.sampled.sorted.bam.bai") into sampleBai - path ("${ref}.alignSampleSummary.txt") into alignSampleQC - - when: - fastqCountError == "false" 
- fastqReadError == "false" - fastqFileError == "false" - - script: - """ - hostname > ${repRID}.${ref}.alignSampleData.log - ulimit -a >> ${repRID}.${ref}.alignSampleData.log - - # align the reads with Hisat2 - echo -e "LOG: aligning ${ends}" >> ${repRID}.${ref}.alignSampleData.log - if [ "${ends}" == "se" ] - then - - hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome -U ${fastq1} --summary-file ${ref}.alignSampleSummary.txt --new-summary - elif [ "${ends}" == "pe" ] - then - hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome --no-mixed --no-discordant -1 ${fastq1} -2 ${fastq2} --summary-file ${ref}.alignSampleSummary.txt --new-summary - fi - echo -e "LOG: aliged" >> ${repRID}.${ref}.alignSampleData.log - - # convert the output sam file to a sorted bam file using Samtools - echo -e "LOG: converting from sam to bam" >> ${repRID}.${ref}.alignSampleData.log - samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${ref}.sampled.bam ${ref}.sampled.sam - - # sort the bam file using Samtools - echo -e "LOG: sorting the bam file" >> ${repRID}.${ref}.alignSampleData.log - proc=\$(expr `nproc` - 1) - mem=\$(vmstat -s -S K | grep 'total memory' | grep -o '[0-9]*') - mem=\$(expr \${mem} / \${proc} \\* 85 / 100) - samtools sort -@ \${proc} -m \${mem}K -O BAM -o ${ref}.sampled.sorted.bam ${ref}.sampled.bam - - # index the sorted bam using Samtools - echo -e "LOG: indexing sorted bam file" >> ${repRID}.${ref}.alignSampleData.log - samtools index -@ `nproc` -b ${ref}.sampled.sorted.bam ${ref}.sampled.sorted.bam.bai - """ -} - -alignSampleQC.into { - alignSampleQC_inferMetadata - alignSampleQC_aggrQC -} - -process inferMetadata { - tag "${repRID}" - - input: - path script_inferMeta - path beds from bedInfer.collect() - path bam from sampleBam.collect() - path bai from sampleBai.collect() - path alignSummary from alignSampleQC_inferMetadata.collect() - val strandedForce - val spikeForce - val fastqCountError_inferMetadata - val fastqReadError_inferMetadata - val fastqFileError_inferMetadata - - output: - path "infer.csv" into inferMetadata_fl - path "${repRID}.infer_experiment.txt" into inferExperiment - path "speciesError.csv" into speciesError_fl - - when: - fastqCountError_inferMetadata == "false" - fastqReadError_inferMetadata == "false" - fastqFileError_inferMetadata == "false" - - script: - """ - hostname > ${repRID}.inferMetadata.log - ulimit -a >> ${repRID}.inferMetadata.log - - # collect alignment rates (round down to integers) - align_ercc=\$(echo \$(grep "Overall alignment rate" ERCC.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) - align_ercc=\$(echo \${align_ercc%.*}) - echo -e "LOG: alignment rate to ERCC: \${align_ercc}" >> ${repRID}.inferMetadata.log - align_hu=\$(echo \$(grep "Overall alignment rate" GRCh.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) - align_hu=\$(echo \${align_hu%.*}) - echo -e "LOG: alignment rate to GRCh: \${align_hu}" >> ${repRID}.inferMetadata.log - align_mo=\$(echo \$(grep "Overall alignment rate" GRCm.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) - align_mo=\$(echo \${align_mo%.*}) - echo -e "LOG: alignment rate to GRCm: \${align_mo}" >> ${repRID}.inferMetadata.log - - # determine spike-in - if [ 1 -eq \$(echo \$(expr \${align_ercc} ">=" 10)) ] - then - spike="true" - else - spike="false" - fi - echo -e "LOG: inference of strandedness results is: \${spike}" >> ${repRID}.inferMetadata.log - if [ "${spikeForce}" != "" ] - then - spike=${spikeForce} - echo -e 
"LOG: spike-in metadata forced: \${spike}" >> ${repRID}.parseMetadata.log - fi - - speciesError=false - speciesError_details="" - # determine species - if [ 1 -eq \$(echo \$(expr \${align_hu} ">=" 40)) ] && [ 1 -eq \$(echo \$(expr \${align_mo} "<" 40)) ] - then - species="Homo sapiens" - bam="GRCh.sampled.sorted.bam" - bed="./GRCh/genome.bed" - echo -e "LOG: inference of species results in: \${species}" >> ${repRID}.inferMetadata.log - elif [ 1 -eq \$(echo \$(expr \${align_mo} ">=" 40)) ] && [ 1 -eq \$(echo \$(expr \${align_hu} "<" 40)) ] - then - species="Mus musculus" - bam="GRCm.sampled.sorted.bam" - bed="./GRCm/genome.bed" - echo -e "LOG: inference of species results in: \${species}" >> ${repRID}.inferMetadata.log - else - echo -e "LOG: ERROR - inference of species returns an ambiguous result: hu=\${align_hu} mo=\${align_mo}" >> ${repRID}.inferMetadata.log - if [ "${speciesForce}" == "" ] - then - speciesError=true - speciesError_details="**Inference of species returns an ambiguous result:** Percent aligned to human = \${align_hu} and percent aligned to mouse = \${align_mo}" - fi - fi - if [ "${speciesForce}" != "" ] - then - speciesError=false - echo -e "LOG: species overridden to: ${speciesForce}" - species="${speciesForce}" - if [ "${speciesForce}" == "Homo sapiens" ] - then - bam="GRCh.sampled.sorted.bam" - bed="./GRCh/genome.bed" - elif [ "${speciesForce}" == "Mus musculus" ] - then - bam="GRCm.sampled.sorted.bam" - bed="./GRCm/genome.bed" - fi - fi - - if [ "\${speciesError}" == false ] - then - # infer experimental setting from dedup bam - echo -e "LOG: infer experimental setting from dedup bam" >> ${repRID}.inferMetadata.log - infer_experiment.py -r "\${bed}" -i "\${bam}" 1>> ${repRID}.infer_experiment.txt - echo -e "LOG: inferred" >> ${repRID}.inferMetadata.log - - ended=`bash ${script_inferMeta} endness ${repRID}.infer_experiment.txt` - fail=`bash ${script_inferMeta} fail ${repRID}.infer_experiment.txt` - if [ \${ended} == "PairEnd" ] - then - ends="pe" - percentF=`bash ${script_inferMeta} pef ${repRID}.infer_experiment.txt` - percentR=`bash ${script_inferMeta} per ${repRID}.infer_experiment.txt` - elif [ \${ended} == "SingleEnd" ] - then - ends="se" - percentF=`bash ${script_inferMeta} sef ${repRID}.infer_experiment.txt` - percentR=`bash ${script_inferMeta} ser ${repRID}.infer_experiment.txt` - fi - echo -e "LOG: percentage reads in the same direction as gene: \${percentF}" >> ${repRID}.inferMetadata.log - echo -e "LOG: percentage reads in the opposite direction as gene: \${percentR}" >> ${repRID}.inferMetadata.log - if [ 1 -eq \$(echo \$(expr \${percentF#*.} ">" 2500)) ] && [ 1 -eq \$(echo \$(expr \${percentR#*.} "<" 2500)) ] - then - stranded="forward" - elif [ 1 -eq \$(echo \$(expr \${percentR#*.} ">" 2500)) ] && [ 1 -eq \$(echo \$(expr \${percentF#*.} "<" 2500)) ] - then - stranded="reverse" - else - stranded="unstranded" - fi - echo -e "LOG: stradedness set to: \${stranded}" >> ${repRID}.inferMetadata.log - if [ "${strandedForce}" != "" ] - then - stranded=${strandedForce} - echo -e "LOG: spike-in metadata forced: \${stranded}" >> ${repRID}.inferMetadata.log - fi - else - ends="" - stranded="" - spike="" - species="" - percentF="" - percentR="" - fail="" - touch ${repRID}.infer_experiment.txt - fi - - # write inferred metadata to file - echo "\${ends},\${stranded},\${spike},\${species},\${align_ercc},\${align_hu},\${align_mo},\${percentF},\${percentR},\${fail}" > infer.csv - - # save species error file - echo "\${speciesError},\${speciesError_details}" > 
speciesError.csv - """ -} - -// Split metadata into separate channels -endsInfer = Channel.create() -strandedInfer = Channel.create() -spikeInfer = Channel.create() -speciesInfer = Channel.create() -align_erccInfer = Channel.create() -align_huInfer = Channel.create() -align_moInfer = Channel.create() -percentFInfer = Channel.create() -percentRInfer = Channel.create() -failInfer = Channel.create() -inferMetadata_fl.splitCsv(sep: ",", header: false).separate( - endsInfer, - strandedInfer, - spikeInfer, - speciesInfer, - align_erccInfer, - align_huInfer, - align_moInfer, - percentFInfer, - percentRInfer, - failInfer -) - -// Replicate metadata for multiple process inputs -endsInfer.into { - endsInfer_checkMetadata - endsInfer_alignData - endsInfer_countData - endsInfer_dataQC - endsInfer_aggrQC - endsInfer_uploadQC - endsInfer_failExecutionRun -} -strandedInfer.into { - strandedInfer_checkMetadata - strandedInfer_alignData - strandedInfer_countData - strandedInfer_aggrQC - strandedInfer_uploadQC - strandedInfer_failExecutionRun -} -spikeInfer.into{ - spikeInfer_checkMetadata - spikeInfer_getRef - spikeInfer_aggrQC - spikeInfer_uploadExecutionRun - spikeInfer_failExecutionRun -} -speciesInfer.into { - speciesInfer_checkMetadata - speciesInfer_getRef - speciesInfer_aggrQC - speciesInfer_uploadExecutionRun - speciesInfer_uploadProcessedFile - speciesInfer_failExecutionRun -} - -// Split species count error into separate channel -speciesError = Channel.create() -speciesError_details = Channel.create() -speciesError_fl.splitCsv(sep: ",", header: false).separate( - speciesError, - speciesError_details -) - -// Replicate errors for multiple process inputs -speciesError.into { - speciesError_checkMetadata - speciesError_uploadExecutionRun - speciesError_getRef - speciesError_alignData - speciesError_dedupData - speciesError_makeBigWig - speciesError_countData - speciesError_fastqc - speciesError_dataQC - speciesError_aggrQC - speciesError_uploadQC - speciesError_uploadQC_fail - speciesError_uploadProcessedFile - speciesError_uploadOutputBag - speciesError_failPreExecutionRun_species -} - -/* - * checkMetadata: checks the submitted metada against inferred -*/ -process checkMetadata { - tag "${repRID}" - - input: - val endsMeta from endsMeta_checkMetadata - val strandedMeta from strandedMeta_checkMetadata - val spikeMeta from spikeMeta_checkMetadata - val speciesMeta from speciesMeta_checkMetadata - val endsInfer from endsInfer_checkMetadata - val strandedInfer from strandedInfer_checkMetadata - val spikeInfer from spikeInfer_checkMetadata - val speciesInfer from speciesInfer_checkMetadata - val fastqCountError_checkMetadata - val fastqReadError_checkMetadata - val fastqFileError_checkMetadata - val speciesError_checkMetadata - - output: - path ("check.csv") into checkMetadata_fl - path ("outputBagRID.csv") optional true into outputBagRID_fl_dummy - - when: - fastqCountError_checkMetadata == "false" - fastqReadError_checkMetadata == "false" - fastqFileError_checkMetadata == "false" - speciesError_checkMetadata == "false" - - script: - """ - hostname > ${repRID}.checkMetadata.log - ulimit -a >> ${repRID}.checkMetadata.log - - pipelineError=false - pipelineError_ends=false - pipelineError_stranded=false - pipelineError_spike=false - pipelineError_species=false - # check if submitted metadata matches inferred - if [ "${strandedMeta}" != "${strandedInfer}" ] - then - if [ "${params.strandedForce}" != "" ] - then - pipelineError=false - pipelineError_stranded=false - echo -e "LOG: stranded forced: 
Submitted=${strandedMeta}; Inferred=${strandedInfer}" >> ${repRID}.checkMetadata.log - else - pipelineError=true - pipelineError_stranded=true - if [ "${strandedMeta}" == "stranded" ] - then - if [[ "${strandedInfer}" == "forward" ]] || [[ "${strandedInfer}" == "reverse" ]] - then - pipelineError=false - pipelineError_stranded=false - echo -e "LOG: stranded matches: Submitted=${strandedMeta}; Inferred=${strandedInfer}" >> ${repRID}.checkMetadata.log - else - echo -e "LOG: stranded does not match: Submitted=${strandedMeta}; Inferred=${strandedInfer}" >> ${repRID}.checkMetadata.log - fi - else - echo -e "LOG: stranded does not match: Submitted=${strandedMeta}; Inferred=${strandedInfer}" >> ${repRID}.checkMetadata.log - fi - fi - else - pipelineError=false - pipelineError_stranded=false - echo -e "LOG: stranded matches: Submitted=${strandedMeta}; Inferred=${strandedInfer}" >> ${repRID}.checkMetadata.log - fi - if [ "${endsMeta}" != "${endsInfer}" ] - then - pipelineError=true - pipelineError_ends=true - echo -e "LOG: ends do not match: Submitted=${endsMeta}; Inferred=${endsInfer}" >> ${repRID}.checkMetadata.log - else - pipelineError_ends=false - echo -e "LOG: ends matches: Submitted=${endsMeta}; Inferred=${endsInfer}" >> ${repRID}.checkMetadata.log - fi - if [ "${spikeMeta}" != "${spikeInfer}" ] - then - if [[ "${params.spikeForce}" != "" ]] - then - pipelineError_spike=false - echo -e "LOG: spike forced: Submitted=${spikeMeta}; Inferred=${spikeInfer}" >> ${repRID}.checkMetadata.log - else - pipelineError=true - pipelineError_spike=true - echo -e "LOG: spike does not match: Submitted=${spikeMeta}; Inferred=${spikeInfer}" >> ${repRID}.checkMetadata.log - fi - else - pipelineError_spike=false - echo -e "LOG: spike matches: Submitted=${spikeMeta}; Inferred=${spikeInfer}" >> ${repRID}.checkMetadata.log - fi - if [ "${speciesMeta}" != "${speciesInfer}" ] - then - if [[ "${params.speciesForce}" != "" ]] - then - pipelineError_species=false - echo -e "LOG: species forced: Submitted=${speciesMeta}; Inferred=${speciesInfer}" >> ${repRID}.checkMetadata.log - else - pipelineError=true - pipelineError_species=true - echo -e "LOG: species does not match: Submitted=${speciesMeta}; Inferred=${speciesInfer}" >> ${repRID}.checkMetadata.log - fi - else - pipelineError_species=false - echo -e "LOG: species matches: Submitted=${speciesMeta}; Inferred=${speciesInfer}" >> ${repRID}.checkMetadata.log - fi - - # create dummy output bag rid if failure - if [ \${pipelineError} == true ] - then - echo "fail" > outputBagRID.csv - fi - - # write checks to file - echo "\${pipelineError},\${pipelineError_ends},\${pipelineError_stranded},\${pipelineError_spike},\${pipelineError_species}" > check.csv - """ -} - -// Split errors into separate channels -pipelineError = Channel.create() -pipelineError_ends = Channel.create() -pipelineError_stranded = Channel.create() -pipelineError_spike = Channel.create() -pipelineError_species = Channel.create() -checkMetadata_fl.splitCsv(sep: ",", header: false).separate( - pipelineError, - pipelineError_ends, - pipelineError_stranded, - pipelineError_spike, - pipelineError_species -) - -// Replicate errors for multiple process inputs -pipelineError.into { - pipelineError_getRef - pipelineError_alignData - pipelineError_dedupData - pipelineError_makeBigWig - pipelineError_countData - pipelineError_fastqc - pipelineError_dataQC - pipelineError_aggrQC - pipelineError_uploadQC - pipelineError_uploadQC_fail - pipelineError_uploadProcessedFile - pipelineError_uploadOutputBag - 
pipelineError_failExecutionRun -} - -/* - * uploadInputBag: uploads the input bag -*/ -process uploadInputBag { - tag "${repRID}" - - input: - path script_uploadInputBag - path credential, stageAs: "credential.json" from deriva_uploadInputBag - path inputBag from inputBag_uploadInputBag - val studyRID from studyRID_uploadInputBag - - output: - path ("inputBagRID.csv") into inputBagRID_fl - - when: - upload - - script: - """ - hostname > ${repRID}.uploadInputBag.log - ulimit -a >> ${repRID}.uploadInputBag.log - - yr=\$(date +'%Y') - mn=\$(date +'%m') - dy=\$(date +'%d') - - file=\$(basename -a ${inputBag}) - md5=\$(md5sum ./\${file} | awk '{ print \$1 }') - echo LOG: ${repRID} input bag md5 sum - \${md5} >> ${repRID}.uploadInputBag.log - size=\$(wc -c < ./\${file}) - echo LOG: ${repRID} input bag size - \${size} bytes >> ${repRID}.uploadInputBag.log - - exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Input_Bag/File_MD5=\${md5}) - if [ "\${exist}" == "[]" ] - then - cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') - cookie=\${cookie:11:-1} - - loc=\$(deriva-hatrac-cli --host ${source} put ./\${file} /hatrac/resources/rnaseq/pipeline/input_bag/study/${studyRID}/replicate/${repRID}/\${file} --parents) - inputBag_rid=\$(python3 ${script_uploadInputBag} -f \${file} -l \${loc} -s \${md5} -b \${size} -o ${source} -c \${cookie}) - echo LOG: input bag RID uploaded - \${inputBag_rid} >> ${repRID}.uploadInputBag.log - rid=\${inputBag_rid} - else - exist=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') - exist=\${exist:7:-6} - echo LOG: input bag RID already exists - \${exist} >> ${repRID}.uploadInputBag.log - rid=\${exist} - fi - - echo "\${rid}" > inputBagRID.csv - """ -} - -// Extract input bag RID into channel -inputBagRID = Channel.create() -inputBagRID_fl.splitCsv(sep: ",", header: false).separate( - inputBagRID -) - -// Replicate input bag RID for multiple process inputs -inputBagRID.into { - inputBagRID_uploadExecutionRun - inputBagRID_finalizeExecutionRun - inputBagRID_failPreExecutionRun - inputBagRID_failExecutionRun -} - -/* - * uploadExecutionRun: uploads the execution run -*/ -process uploadExecutionRun { - tag "${repRID}" - - input: - path script_uploadExecutionRun_uploadExecutionRun - path credential, stageAs: "credential.json" from deriva_uploadExecutionRun - val spike from spikeInfer_uploadExecutionRun - val species from speciesInfer_uploadExecutionRun - val inputBagRID from inputBagRID_uploadExecutionRun - val fastqCountError_uploadExecutionRun - val fastqReadError_uploadExecutionRun - val fastqFileError_uploadExecutionRun - val speciesError_uploadExecutionRun - - output: - path ("executionRunRID.csv") into executionRunRID_fl - - when: - upload - fastqCountError_uploadExecutionRun == "false" - fastqReadError_uploadExecutionRun == "false" - fastqFileError_uploadExecutionRun == "false" - speciesError_uploadExecutionRun == "false" - - script: - """ - hostname > ${repRID}.uploadExecutionRun.log - ulimit -a >> ${repRID}.uploadExecutionRun.log - - echo LOG: searching for workflow RID - BICF mRNA ${workflow.manifest.version} >> ${repRID}.uploadExecutionRun.log - workflow=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Workflow/Name=BICF%20mRNA%20Replicate/Version=${workflow.manifest.version}) - workflow=\$(echo \${workflow} | grep -o '\\"RID\\":\\".*\\",\\"RCT') - workflow=\${workflow:7:-6} - echo LOG: workflow RID extracted - \${workflow} >> ${repRID}.uploadExecutionRun.log - - if [ "${species}" == 
"Homo sapiens" ] - then - genomeName=\$(echo GRCh${refHuVersion}) - elif [ "${species}" == "Mus musculus" ] - then - genomeName=\$(echo GRCm${refMoVersion}) - fi - if [ "${spike}" == "true" ] - then - genomeName=\$(echo \${genomeName}-S) - fi - echo LOG: searching for genome name - \${genomeName} >> ${repRID}.uploadExecutionRun.log - genome=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Name=\${genomeName}) - genome=\$(echo \${genome} | grep -o '\\"RID\\":\\".*\\",\\"RCT') - genome=\${genome:7:-6} - echo LOG: genome RID extracted - \${genome} >> ${repRID}.uploadExecutionRun.log - - cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') - cookie=\${cookie:11:-1} - - exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=\${workflow}/Replicate=${repRID}/Input_Bag=${inputBagRID}) - echo \${exist} >> ${repRID}.uploadExecutionRun.log - if [ "\${exist}" == "[]" ] - then - executionRun_rid=\$(python3 ${script_uploadExecutionRun_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s In-progress -d 'Run in process' -o ${source} -c \${cookie} -u F) - echo LOG: execution run RID uploaded - \${executionRun_rid} >> ${repRID}.uploadExecutionRun.log - else - rid=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') - rid=\${rid:7:-6} - echo \${rid} >> ${repRID}.uploadExecutionRun.log - executionRun_rid=\$(python3 ${script_uploadExecutionRun_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s In-progress -d 'Run in process' -o ${source} -c \${cookie} -u \${rid}) - echo LOG: execution run RID updated - \${executionRun_rid} >> ${repRID}.uploadExecutionRun.log - fi - - echo "\${executionRun_rid}" > executionRunRID.csv - - if [ ${params.track} == true ] - then - curl -H 'Content-Type: application/json' -X PUT -d \ - '{ \ - "ID": "${workflow.sessionId}", \ - "ExecutionRunRID": "'\${executionRun_rid}'" \ - }' \ - "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track" - fi - """ -} - -// Extract execution run RID into channel -executionRunRID = Channel.create() -executionRunRID_fl.splitCsv(sep: ",", header: false).separate( - executionRunRID -) - -// Replicate execution run RID for multiple process inputs -executionRunRID.into { - executionRunRID_uploadQC - executionRunRID_uploadProcessedFile - executionRunRID_uploadOutputBag - executionRunRID_finalizeExecutionRun - executionRunRID_failExecutionRun - executionRunRID_fail -} - -/* - * getRef: downloads appropriate reference -*/ -process getRef { - tag "${species}" - - input: - path script_refData - path credential, stageAs: "credential.json" from deriva_getRef - val spike from spikeInfer_getRef - val species from speciesInfer_getRef - val fastqCountError_getRef - val fastqReadError_getRef - val fastqFileError_getRef - val speciesError_getRef - val pipelineError_getRef - - output: - tuple path ("hisat2", type: 'dir'), path ("*.bed"), path ("*.fna"), path ("*.gtf"), path ("geneID.tsv"), path ("Entrez.tsv") into reference - - when: - fastqCountError_getRef == "false" - fastqReadError_getRef == "false" - fastqFileError_getRef == "false" - speciesError_getRef == "false" - pipelineError_getRef == "false" - - script: - """ - hostname > ${repRID}.getRef.log - ulimit -a >> ${repRID}.getRef.log - - # link credential file for authentication - echo -e "LOG: linking deriva credentials" >> ${repRID}.getRef.log - mkdir -p ~/.deriva - ln -sf `readlink -e credential.json` 
~/.deriva/credential.json - echo -e "LOG: linked" >> ${repRID}.getRef.log - - # set the reference name - if [ "${species}" == "Mus musculus" ] - then - reference=\$(echo ${referenceBase}/GRCm${refMoVersion}) - refName=GRCm - elif [ '${species}' == "Homo sapiens" ] - then - reference=\$(echo ${referenceBase}/GRCh${refHuVersion}) - refName=GRCh - else - echo -e "LOG: ERROR - References could not be set!\nSpecies reference found: ${species}" >> ${repRID}.getRef.log - exit 1 - fi - if [ "${spike}" == "true" ] - then - reference=\$(echo \${reference}-S) - elif [ "${spike}" == "false" ] - then - reference=\$(echo \${reference}) - fi - echo -e "LOG: species set to \${reference}" >> ${repRID}.getRef.log - - # retreive appropriate reference appropriate location - echo -e "LOG: fetching ${species} reference files from ${referenceBase}" >> ${repRID}.getRef.log - if [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references/new" ] - then - echo -e "LOG: grabbing reference files from local (BioHPC)" >> ${repRID}.getRef.log - unzip \${reference}.zip - mv \$(basename \${reference})/data/* . - elif [ arams.refSource == "datahub" ] - then - echo -e "LOG: grabbing reference files from datahub" >> ${repRID}.getRef.log - GRCv=\$(echo \${reference} | grep -o \${refName}.* | cut -d '.' -f1) - GRCp=\$(echo \${reference} | grep -o \${refName}.* | cut -d '.' -f2) - GENCODE=\$(echo \${reference} | grep -o \${refName}.* | cut -d '.' -f3) - query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='\${GRCv}'.'\${GRCp}'/Annotation_Version=GENCODE%20'\${GENCODE}) - curl --request GET \${query} > refQuery.json - refURL=\$(python ${script_refData} --returnParam URL) - loc=\$(dirname \${refURL}) - fName=\$(python ${script_refData} --returnParam fName) - fName=\${fName%.*} - if [ "\${loc}" = "/hatrac/*" ]; then echo "LOG: Reference not present in hatrac"; exit 1; fi - filename=\$(echo \$(basename \${refURL}) | grep -oP '.*(?=:)') - deriva-hatrac-cli --host ${referenceBase} get \${refURL} - unzip \$(basename \${refURL}) - mv \${fName}/data/* . - fi - echo -e "LOG: fetched" >> ${repRID}.getRef.log - - mv ./annotation/genome.gtf . - mv ./sequence/genome.fna . - mv ./annotation/genome.bed . - mv ./metadata/Entrez.tsv . - mv ./metadata/geneID.tsv . 
- """ -} - -// Replicate reference for multiple process inputs -reference.into { - reference_alignData - reference_countData - reference_dataQC -} - -/* - * alignData: aligns the reads to a reference database -*/ -process alignData { - tag "${repRID}" - - input: - path fastq from fastqsTrim_alignData - path reference_alignData - val ends from endsInfer_alignData - val stranded from strandedInfer_alignData - val fastqCountError_alignData - val fastqReadError_alignData - val fastqFileError_alignData - val speciesError_alignData - val pipelineError_alignData - - output: - tuple path ("${repRID}.sorted.bam"), path ("${repRID}.sorted.bam.bai") into rawBam - path ("*.alignSummary.txt") into alignQC - - when: - fastqCountError_alignData == "false" - fastqReadError_alignData == "false" - fastqFileError_alignData == "false" - speciesError_alignData == "false" - pipelineError_alignData == "false" - - script: - """ - hostname > ${repRID}.align.log - ulimit -a >> ${repRID}.align.log - - # set stranded param for hisat2 - if [ "${stranded}"=="unstranded" ] - then - strandedParam="" - elif [ "${stranded}" == "forward" ] && [ "${ends}" == "se" ] - then - strandedParam="--rna-strandness F" - elif [ "${stranded}" == "forward" ] && [ "${ends}" == "pe" ] - then - strandedParam="--rna-strandness FR" - elif [ "${stranded}" == "reverse" ] && [ "${ends}" == "se" ] - then - strandedParam="--rna-strandness R" - elif [ "${stranded}" == "reverse" ] && [ "${ends}" == "pe" ] - then - strandedParam="--rna-strandness RF" - fi - - # align the reads with Hisat2 - echo -e "LOG: aligning ${ends}" >> ${repRID}.align.log - if [ "${ends}" == "se" ] - then - hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome \${strandedParam} -U ${fastq[0]} --summary-file ${repRID}.alignSummary.txt --new-summary - elif [ "${ends}" == "pe" ] - then - hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome \${strandedParam} --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} --summary-file ${repRID}.alignSummary.txt --new-summary - fi - echo -e "LOG: alignined" >> ${repRID}.align.log - - # convert the output sam file to a sorted bam file using Samtools - echo -e "LOG: converting from sam to bam" >> ${repRID}.align.log - samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam - - # sort the bam file using Samtools - echo -e "LOG: sorting the bam file" >> ${repRID}.align.log - proc=\$(expr `nproc` - 1) - mem=\$(vmstat -s -S K | grep 'total memory' | grep -o '[0-9]*') - mem=\$(expr \${mem} / \${proc} \\* 75 / 100) - samtools sort -@ \${proc} -m \${mem}K -O BAM -o ${repRID}.sorted.bam ${repRID}.bam - - # index the sorted bam using Samtools - echo -e "LOG: indexing sorted bam file" >> ${repRID}.align.log - samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bam.bai - """ -} - -// Replicate rawBam for multiple process inputs -rawBam.set { - rawBam_dedupData -} - -/* - *dedupData: mark the duplicate reads, specifically focused on PCR or optical duplicates -*/ -process dedupData { - tag "${repRID}" - publishDir "${outDir}/bam", mode: 'copy', pattern: "*.deduped.bam" - - input: - tuple path (bam), path (bai) from rawBam_dedupData - val fastqCountError_dedupData - val fastqReadError_dedupData - val fastqFileError_dedupData - val speciesError_dedupData - val pipelineError_dedupData - - output: - tuple path ("${repRID}_sorted.deduped.bam"), path ("${repRID}_sorted.deduped.bam.bai") into dedupBam - tuple path ("${repRID}_sorted.deduped.*.bam"), 
path ("${repRID}_sorted.deduped.*.bam.bai") into dedupChrBam - path ("*.deduped.Metrics.txt") into dedupQC - - when: - fastqCountError_dedupData == 'false' - fastqReadError_dedupData == 'false' - fastqFileError_dedupData == 'false' - speciesError_dedupData == 'false' - pipelineError_dedupData == 'false' - - script: - """ - hostname > ${repRID}.dedup.log - ulimit -a >> ${repRID}.dedup.log - - # remove duplicated reads using Picard's MarkDuplicates - echo -e "LOG: deduplicating reads" >> ${repRID}.dedup.log - java -jar /picard/build/libs/picard.jar MarkDuplicates I=${bam} O=${repRID}.deduped.bam M=${repRID}.deduped.Metrics.txt REMOVE_DUPLICATES=true - echo -e "LOG: deduplicated" >> ${repRID}.dedup.log - - # sort the bam file using Samtools - echo -e "LOG: sorting the bam file" >> ${repRID}.dedup.log - samtools sort -@ `nproc` -O BAM -o ${repRID}_sorted.deduped.bam ${repRID}.deduped.bam - - # index the sorted bam using Samtools - echo -e "LOG: indexing sorted bam file" >> ${repRID}.dedup.log - samtools index -@ `nproc` -b ${repRID}_sorted.deduped.bam ${repRID}_sorted.deduped.bam.bai - - # split the deduped BAM file for multi-threaded tin calculation - for i in `samtools view ${repRID}_sorted.deduped.bam | cut -f3 | grep -o chr.[0-9]* | sort | uniq`; - do - echo "echo \"LOG: splitting each chromosome into its own BAM and BAI files with Samtools\"; samtools view -b ${repRID}_sorted.deduped.bam \${i} 1>> ${repRID}_sorted.deduped.\${i}.bam; samtools index -@ `nproc` -b ${repRID}_sorted.deduped.\${i}.bam ${repRID}_sorted.deduped.\${i}.bam.bai" - done | parallel -j `nproc` -k - """ -} - -// Replicate dedup bam/bai for multiple process inputs -dedupBam.into { - dedupBam_countData - dedupBam_makeBigWig - dedupBam_dataQC - dedupBam_uploadProcessedFile -} - -/* - *makeBigWig: make BigWig files for output -*/ -process makeBigWig { - tag "${repRID}" - publishDir "${outDir}/bigwig", mode: 'copy', pattern: "${repRID}.bw" - - input: - tuple path (bam), path (bai) from dedupBam_makeBigWig - val fastqCountError_makeBigWig - val fastqReadError_makeBigWig - val fastqFileError_makeBigWig - val speciesError_makeBigWig - val pipelineError_makeBigWig - - output: - path ("${repRID}_sorted.deduped.bw") into bigwig - - when: - fastqCountError_makeBigWig == 'false' - fastqReadError_makeBigWig == 'false' - fastqFileError_makeBigWig == 'false' - speciesError_makeBigWig == 'false' - pipelineError_makeBigWig == 'false' - - script: - """ - hostname > ${repRID}.makeBigWig.log - ulimit -a >> ${repRID}.makeBigWig.log - - # create bigwig - echo -e "LOG: creating bibWig" >> ${repRID}.makeBigWig.log - bamCoverage -p `nproc` -b ${bam} -o ${repRID}_sorted.deduped.bw - echo -e "LOG: created" >> ${repRID}.makeBigWig.log - """ -} - -/* - *countData: count data and calculate tpm -*/ -process countData { - tag "${repRID}" - publishDir "${outDir}/count", mode: 'copy', pattern: "${repRID}*_tpmTable.csv" - - input: - path script_calculateTPM - path script_convertGeneSymbols - tuple path (bam), path (bai) from dedupBam_countData - path ref from reference_countData - val ends from endsInfer_countData - val stranded from strandedInfer_countData - val fastqCountError_countData - val fastqReadError_countData - val fastqFileError_countData - val speciesError_countData - val pipelineError_countData - - output: - path ("*_tpmTable.csv") into counts - path ("*_countData.summary") into countsQC - path ("assignedReads.csv") into assignedReadsInfer_fl - - when: - fastqCountError_countData == 'false' - fastqReadError_countData == 'false' - 
fastqFileError_countData == 'false' - speciesError_countData == 'false' - pipelineError_countData == 'false' - - script: - """ - hostname > ${repRID}.countData.log - ulimit -a >> ${repRID}.countData.log - - # determine strandedness and setup strandig for countData - stranding=0; - if [ "${stranded}" == "unstranded" ] - then - stranding=0 - echo -e "LOG: strandedness set to unstranded [0]" >> ${repRID}.countData.log - elif [ "${stranded}" == "forward" ] - then - stranding=1 - echo -e "LOG: strandedness set to forward stranded [1]" >> ${repRID}.countData.log - elif [ "${stranded}" == "reverse" ] - then - stranding=2 - echo -e "LOG: strandedness set to reverse stranded [2]" >> ${repRID}.countData.log - fi - - # run featureCounts - echo -e "LOG: counting ${ends} features" >> ${repRID}.countData.log - if [ "${ends}" == "se" ] - then - featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}_countData -s \${stranding} -R SAM --primary --ignoreDup ${repRID}_sorted.deduped.bam - elif [ "${ends}" == "pe" ] - then - featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}_countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}_sorted.deduped.bam - fi - echo -e "LOG: counted" >> ${repRID}.countData.log - - # extract assigned reads - grep -m 1 'Assigned' *_countData.summary | grep -oe '\\([0-9.]*\\)' > assignedReads.csv - - # calculate TPM from the resulting countData table - echo -e "LOG: calculating TPM with R" >> ${repRID}.countData.log - Rscript ${script_calculateTPM} --count "${repRID}_countData" - - # convert gene symbols to Entrez id's - echo -e "LOG: convert gene symbols to Entrez id's" >> ${repRID}.countData.log - Rscript ${script_convertGeneSymbols} --repRID "${repRID}" - """ -} - -// Extract number of assigned reads metadata into channel -assignedReadsInfer = Channel.create() -assignedReadsInfer_fl.splitCsv(sep: ",", header: false).separate( - assignedReadsInfer -) - -// Replicate inferred assigned reads for multiple process inputs -assignedReadsInfer.into { - assignedReadsInfer_aggrQC - assignedReadsInfer_uploadQC -} - -/* - *dataQC: calculate transcript integrity numbers (TIN) and bin as well as calculate innerdistance of PE replicates -*/ -process dataQC { - tag "${repRID}" - - input: - path script_tinHist - path ref from reference_dataQC - tuple path (bam), path (bai) from dedupBam_dataQC - tuple path (chrBam), path (chrBai) from dedupChrBam - val ends from endsInfer_dataQC - val fastqCountError_dataQC - val fastqReadError_dataQC - val fastqFileError_dataQC - val speciesError_dataQC - val pipelineError_dataQC - - output: - path "${repRID}_tin.hist.tsv" into tinHist - path "${repRID}_tin.med.csv" into tinMedInfer_fl - path "${repRID}_insertSize.inner_distance_freq.txt" into innerDistance - - when: - fastqCountError_dataQC == 'false' - fastqReadError_dataQC == 'false' - fastqFileError_dataQC == 'false' - speciesError_dataQC == 'false' - pipelineError_dataQC == 'false' - - script: - """ - hostname > ${repRID}.dataQC.log - ulimit -a >> ${repRID}.dataQC.log - - # calcualte TIN values per feature on each chromosome - echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > ${repRID}_sorted.deduped.tin.xls - for i in `cat ./genome.bed | cut -f1 | grep -o chr.[0-9]* | sort | uniq`; do - echo "echo \"LOG: running tin.py on \${i}\" >> ${repRID}.dataQC.log; tin.py -i ${repRID}_sorted.deduped.\${i}.bam -r ./genome.bed; cat ${repRID}_sorted.deduped.\${i}.tin.xls | tr -s \"\\w\" 
\"\\t\" | grep -P \\\"\\\\t\${i}\\\\t\\\";"; - done | parallel -j `nproc` -k 1>> ${repRID}_sorted.deduped.tin.xls - - # bin TIN values - echo -e "LOG: binning TINs" >> ${repRID}.dataQC.log - python3 ${script_tinHist} -r ${repRID} - echo -e "LOG: binned" >> ${repRID}.dataQC.log - - # calculate inner-distances for PE data - if [ "${ends}" == "pe" ] - then - echo -e "LOG: calculating inner distances for ${ends}" >> ${repRID}.dataQC.log - inner_distance.py -i "${bam}" -o ${repRID}_insertSize -r ./genome.bed - echo -e "LOG: calculated" >> ${repRID}.dataQC.log - elif [ "${ends}" == "se" ] - then - echo -e "LOG: creating dummy inner distance file for ${ends}" >> ${repRID}.dataQC.log - touch ${repRID}_insertSize.inner_distance_freq.txt - fi - """ -} - -// Extract median TIN metadata into channel -tinMedInfer = Channel.create() -tinMedInfer_fl.splitCsv(sep: ",", header: false).separate( - tinMedInfer -) - -// Replicate inferred median TIN for multiple process inputs -tinMedInfer.into { - tinMedInfer_aggrQC - tinMedInfer_uploadQC -} - -/* - *aggrQC: aggregate QC from processes as well as metadata and run MultiQC -*/ -process aggrQC { - tag "${repRID}" - publishDir "${outDir}/report", mode: 'copy', pattern: "${repRID}.multiqc.html" - publishDir "${outDir}/qc", mode: 'copy', pattern: "${repRID}.multiqc_data.json" - - input: - path multiqcConfig - path bicfLogo - path softwareReferences - path softwareVersions - path fastqc - path trimQC - path alignQC - path dedupQC - path countsQC - path innerDistance - path tinHist - path alignSampleQCs from alignSampleQC_aggrQC.collect() - path inferExperiment - val endsManual from endsManual_aggrQC - val endsM from endsMeta_aggrQC - val strandedM from strandedMeta_aggrQC - val spikeM from spikeMeta_aggrQC - val speciesM from speciesMeta_aggrQC - val endsI from endsInfer_aggrQC - val strandedI from strandedInfer_aggrQC - val spikeI from spikeInfer_aggrQC - val speciesI from speciesInfer_aggrQC - val readLengthM from readLengthMeta - val readLengthI from readLengthInfer_aggrQC - val rawReadsI from rawReadsInfer_aggrQC - val assignedReadsI from assignedReadsInfer_aggrQC - val tinMedI from tinMedInfer_aggrQC - val studyRID from studyRID_aggrQC - val expRID from expRID_aggrQC - val fastqCountError_aggrQC - val fastqReadError_aggrQC - val fastqFileError_aggrQC - val speciesError_aggrQC - val pipelineError_aggrQC - - output: - path "${repRID}.multiqc.html" into multiqc - path "${repRID}.multiqc_data.json" into multiqcJSON - - when: - fastqCountError_aggrQC == 'false' - fastqReadError_aggrQC == 'false' - fastqFileError_aggrQC == 'false' - speciesError_aggrQC == 'false' - pipelineError_aggrQC == 'false' - - script: - """ - hostname > ${repRID}.aggrQC.log - ulimit -a >> ${repRID}.aggrQC.log - - # make run table - if [ "${params.inputBagForce}" == "" ] && [ "${params.fastqsForce}" == "" ] && [ "${params.speciesForce}" == "" ] && [ "${params.strandedForce}" == "" ] && [ "${params.spikeForce}" == "" ] - then - input="default" - else - input="override:" - if [ "${params.inputBagForce}" != "" ] - then - input=\$(echo \${input} inputBag) - fi - if [ "${params.fastqsForce}" != "" ] - then - input=\$(echo \${input} fastq) - fi - if [ "${params.speciesForce}" != "" ] - then - input=\$(echo \${input} species) - fi - if [ "${params.strandedForce}" != "" ] - then - input=\$(echo \${input} stranded) - fi - if [ "${params.spikeForce}" != "" ] - then - input=\$(echo \${input} spike) - fi - fi - echo -e "LOG: creating run table" >> ${repRID}.aggrQC.log - echo -e "Session\tSession ID\tStart 
Time\tPipeline Version\tInput" > run.tsv - echo -e "Session\t${workflow.sessionId}\t${workflow.start}\t${workflow.manifest.version}\t\${input}" >> run.tsv - - # make RID table - echo -e "LOG: creating RID table" >> ${repRID}.aggrQC.log - echo -e "Replicate\tReplicate RID\tExperiment RID\tStudy RID" > rid.tsv - echo -e "Replicate\t${repRID}\t${expRID}\t${studyRID}" >> rid.tsv - - # make metadata table - echo -e "LOG: creating metadata table" >> ${repRID}.aggrQC.log - echo -e "Source\tSpecies\tEnds\tStranded\tSpike-in\tRaw Reads\tAssigned Reads\tMedian Read Length\tMedian TIN" > metadata.tsv - echo -e "Submitter\t${speciesM}\t${endsM}\t${strandedM}\t${spikeM}\t-\t-\t'${readLengthM}'\t-" >> metadata.tsv - if [ "${params.speciesForce}" == "" ] - then - input=\$(echo "Inferred\\t${speciesI}\\t") - else - input=\$(echo "Inferred\\t${speciesI} (FORCED)\\t") - fi - input=\$(echo \${input}"${endsI}\\t") - if [ "${params.strandedForce}" == "" ] - then - input=\$(echo \${input}"${strandedI}\\t") - else - input=\$(echo \${input}"${strandedI} (FORCED)\\t") - fi - if [ "${params.spikeForce}" == "" ] - then - input=\$(echo \${input}"${spikeI}\\t-\\t-\\t-\\t-") - else - input=\$(echo \${input}"${spikeI} (FORCED)\\t-\\t-\\t-\\t-") - fi - echo -e \${input} >> metadata.tsv - echo -e "Measured\t-\t${endsManual}\t-\t-\t'${rawReadsI}'\t'${assignedReadsI}'\t'${readLengthI}'\t'${tinMedI}'" >> metadata.tsv - - # make reference table - echo -e "LOG: creating referencerun table" >> ${repRID}.aggrQC.log - echo -e "Species\tGenome Reference Consortium Build\tGenome Reference Consortium Patch\tGENCODE Annotation Release" > reference.tsv - echo -e "Human\tGRCh\$(echo `echo ${params.refHuVersion} | cut -d "." -f 1`)\t\$(echo `echo ${params.refHuVersion} | cut -d "." -f 2`)\t'\$(echo `echo ${params.refHuVersion} | cut -d "." -f 3 | sed "s/^v//"`)'" >> reference.tsv - echo -e "Mouse\tGRCm\$(echo `echo ${params.refMoVersion} | cut -d "." -f 1`)\t\$(echo `echo ${params.refMoVersion} | cut -d "." -f 2`)\t'\$(echo `echo ${params.refMoVersion} | cut -d "." -f 3 | sed "s/^v//"`)'" >> reference.tsv - - # remove inner distance report if it is empty (SE repRID) - echo -e "LOG: removing dummy inner distance file" >> ${repRID}.aggrQC.log - if [ "${endsM}" == "se" ] - then - rm -f ${innerDistance} - fi - - # run MultiQC - echo -e "LOG: running multiqc" >> ${repRID}.aggrQC.log - multiqc -c ${multiqcConfig} . 
-n ${repRID}.multiqc.html - cp ${repRID}.multiqc_data/multiqc_data.json ${repRID}.multiqc_data.json - - if [ ${params.track} == true ] - then - curl -H 'Content-Type: application/json' -X PUT -d \ - @./${repRID}.multiqc_data.json \ - "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/qc" - fi - """ -} - -/* - * uploadQC: uploads the mRNA QC -*/ -process uploadQC { - tag "${repRID}" - - input: - path script_deleteEntry_uploadQC - path script_uploadQC - path credential, stageAs: "credential.json" from deriva_uploadQC - val executionRunRID from executionRunRID_uploadQC - val ends from endsInfer_uploadQC - val stranded from strandedInfer_uploadQC - val length from readLengthInfer_uploadQC - val rawCount from rawReadsInfer_uploadQC - val finalCount from assignedReadsInfer_uploadQC - val tinMed from tinMedInfer_uploadQC - val fastqCountError_uploadQC - val fastqReadError_uploadQC - val fastqFileError_uploadQC - val speciesError_uploadQC - val pipelineError_uploadQC - - output: - path ("qcRID.csv") into qcRID_fl - - when: - upload - fastqCountError_uploadQC == 'false' - fastqReadError_uploadQC == 'false' - fastqFileError_uploadQC == 'false' - speciesError_uploadQC == 'false' - pipelineError_uploadQC == 'false' - - script: - """ - hostname > ${repRID}.uploadQC.log - ulimit -a >> ${repRID}.uploadQC.log - - if [ "${ends}" == "pe" ] - then - end="Paired End" - elif [ "${ends}" == "se" ] - then - end="Single End" - fi - - cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') - cookie=\${cookie:11:-1} - - exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:mRNA_QC/Replicate=${repRID}) - if [ "\${exist}" != "[]" ] - then - rids=\$(echo \${exist} | grep -o '\\"RID\\":\\".\\{7\\}' | sed 's/^.\\{7\\}//') - for rid in \${rids} - do - python3 ${script_deleteEntry_uploadQC} -r \${rid} -t mRNA_QC -o ${source} -c \${cookie} - echo LOG: old mRNA QC RID deleted - \${rid} >> ${repRID}.uploadQC.log - done - echo LOG: all old mRNA QC RIDs deleted >> ${repRID}.uploadQC.log - fi - - qc_rid=\$(python3 ${script_uploadQC} -r ${repRID} -e ${executionRunRID} -p "\${end}" -s ${stranded} -l ${length} -w ${rawCount} -f ${finalCount} -t ${tinMed} -o ${source} -c \${cookie} -u F) - echo LOG: mRNA QC RID uploaded - \${qc_rid} >> ${repRID}.uploadQC.log - - echo "\${qc_rid}" > qcRID.csv - """ -} - -/* - *uploadProcessedFile: uploads the processed files -*/ -process uploadProcessedFile { - tag "${repRID}" - publishDir "${outDir}/outputBag", mode: 'copy', pattern: "Replicate_${repRID}.outputBag.zip" - - input: - path script_deleteEntry_uploadProcessedFile - path credential, stageAs: "credential.json" from deriva_uploadProcessedFile - path executionRunExportConfig - path multiqc - path multiqcJSON - tuple path (bam),path (bai) from dedupBam_uploadProcessedFile - path bigwig - path counts - val species from speciesInfer_uploadProcessedFile - val studyRID from studyRID_uploadProcessedFile - val expRID from expRID_uploadProcessedFile - val executionRunRID from executionRunRID_uploadProcessedFile - val fastqCountError_uploadProcessedFile - val fastqReadError_uploadProcessedFile - val fastqFileError_uploadProcessedFile - val speciesError_uploadProcessedFile - val pipelineError_uploadProcessedFile - - output: - path ("${repRID}_Output_Bag.zip") into outputBag - - when: - upload - fastqCountError_uploadProcessedFile == 'false' - fastqReadError_uploadProcessedFile == 'false' - fastqFileError_uploadProcessedFile == 'false' - speciesError_uploadProcessedFile == 'false' - 
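The uploadQC process above follows a query-then-replace pattern against the ERMrest catalog: look up any existing RNASeq:mRNA_QC rows for the replicate, delete their RIDs, then insert a fresh record. A rough standalone sketch of the lookup step, with a placeholder host and replicate RID, reusing the same grep-based RID scraping the pipeline uses (echoing instead of deleting):

    #!/bin/bash
    # Sketch: list existing mRNA_QC entries for a replicate via ERMrest.
    source="dev.gudmap.org"   # placeholder catalog host
    repRID="16-ABCD"          # placeholder replicate RID

    exist=$(curl -s "https://${source}/ermrest/catalog/2/entity/RNASeq:mRNA_QC/Replicate=${repRID}")
    if [ "${exist}" != "[]" ]
    then
        # pull each 7-character RID out of the JSON response
        rids=$(echo "${exist}" | grep -o '"RID":".\{7\}' | sed 's/^.\{7\}//')
        for rid in ${rids}
        do
            echo "would delete stale mRNA_QC entry ${rid}"
        done
    fi
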
pipelineError_uploadProcessedFile == 'false' - - script: - """ - hostname > ${repRID}.outputBag.log - ulimit -a >> ${repRID}.outputBag.log - - mkdir -p ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ - cp ${bam} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ - cp ${bai} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ - cp ${bigwig} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ - cp ${counts} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ - - cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') - cookie=\${cookie:11:-1} - - exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Processed_File/Replicate=${repRID}) - if [ "\${exist}" != "[]" ] - then - rids=\$(echo \${exist} | grep -o '\\"RID\\":\\".\\{7\\}' | sed 's/^.\\{7\\}//') - for rid in \${rids} - do - python3 ${script_deleteEntry_uploadProcessedFile} -r \${rid} -t Processed_File -o ${source} -c \${cookie} - done - echo LOG: all old processed file RIDs deleted >> ${repRID}.uploadQC.log - fi - - deriva-upload-cli --catalog 2 --token \${cookie:9} ${source} ./deriva - echo LOG: processed files uploaded >> ${repRID}.outputBag.log - - deriva-download-cli --catalog 2 --token \${cookie:9} ${source} ${executionRunExportConfig} . rid=${executionRunRID} - echo LOG: execution run bag downloaded >> ${repRID}.outputBag.log - - echo -e "### Run Details" >> runDetails.md - echo -e "**Workflow URL:** https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq" >> runDetails.md - echo -e "**Workflow Version:** ${workflow.manifest.version}" >> runDetails.md - echo -e "**Description:** ${workflow.manifest.description}" >> runDetails.md - if [ "${species}" == "Mus musculus" ]; then - genome=\$(echo GRCm${refMoVersion} | cut -d '.' -f1) - patch=\$(echo ${refMoVersion} | cut -d '.' -f2) - annotation=\$(echo ${refMoVersion} | cut -d '.' -f3 | tr -d 'v') - elif [ "${species}" == "Homo sapiens" ]; then - genome=\$(echo GRCh${refHuVersion} | cut -d '.' -f1) - patch=\$(echo ${refHuVersion} | cut -d '.' -f2) - annotation=\$(echo ${refHuVersion} | cut -d '.' 
-f3 | tr -d 'v') - fi - echo -e "**Genome Assembly Version:** \${genome} patch \${patch}" >> runDetails.md - echo -e "**Annotation Version:** GENCODE release \${annotation}" >> runDetails.md - echo -e "**Run ID:** ${repRID}" >> runDetails.md - echo LOG: runDetails.md created >> ${repRID}.outputBag.log - - unzip Execution_Run_${executionRunRID}.zip - yr=\$(date +'%Y') - mn=\$(date +'%m') - dy=\$(date +'%d') - mv Execution_Run_${executionRunRID} ${repRID}_Output_Bag_\${yr}\${mn}\${dy} - loc=./${repRID}_Output_Bag/data/assets/Study/${studyRID}/Experiment/${expRID}/Replicate/${repRID}/Execution_Run/${executionRunRID}/Output_Files/ - mkdir -p \${loc} - cp runDetails.md \${loc} - cp ${multiqc} \${loc} - cp ${multiqcJSON} \${loc} - - bdbag ./${repRID}_Output_Bag/ --update --archiver zip --debug - echo LOG: output bag created >> ${repRID}.outputBag.log - """ -} - -/* - * uploadOutputBag: uploads the output bag -*/ -process uploadOutputBag { - tag "${repRID}" - - input: - path script_uploadOutputBag - path credential, stageAs: "credential.json" from deriva_uploadOutputBag - path outputBag - val studyRID from studyRID_uploadOutputBag - val executionRunRID from executionRunRID_uploadOutputBag - val fastqCountError_uploadOutputBag - val fastqReadError_uploadOutputBag - val fastqFileError_uploadOutputBag - val speciesError_uploadOutputBag - val pipelineError_uploadOutputBag - - output: - path ("outputBagRID.csv") into outputBagRID_fl - - when: - upload - fastqCountError_uploadOutputBag == 'false' - fastqReadError_uploadOutputBag == 'false' - fastqFileError_uploadOutputBag == 'false' - speciesError_uploadOutputBag == 'false' - pipelineError_uploadOutputBag == 'false' - - script: - """ - hostname > ${repRID}.uploadOutputBag.log - ulimit -a >> ${repRID}.uploadOutputBag.log - - yr=\$(date +'%Y') - mn=\$(date +'%m') - dy=\$(date +'%d') - - file=\$(basename -a ${outputBag}) - md5=\$(md5sum ./\${file} | awk '{ print \$1 }') - echo LOG: ${repRID} output bag md5 sum - \${md5} >> ${repRID}.uploadOutputBag.log - size=\$(wc -c < ./\${file}) - echo LOG: ${repRID} output bag size - \${size} bytes >> ${repRID}.uploadOutputBag.log - - loc=\$(deriva-hatrac-cli --host ${source} put ./\${file} /hatrac/resources/rnaseq/pipeline/output_bag/study/${studyRID}/replicate/${repRID}/\${file} --parents) - echo LOG: output bag uploaded - \${loc} >> ${repRID}.uploadOutputBag.log - # url-ify the location - loc=\${loc//\\//%2F} - loc=\${loc//:/%3A} - loc=\${loc// /@20} - - cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') - cookie=\${cookie:11:-1} - - exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Output_Bag/File_URL=\${loc}) - if [ "\${exist}" == "[]" ] - then - outputBag_rid=\$(python3 ${script_uploadOutputBag} -e ${executionRunRID} -f \${file} -l \${loc} -s \${md5} -b \${size} -o ${source} -c \${cookie} -u F) - echo LOG: output bag RID uploaded - \${outputBag_rid} >> ${repRID}.uploadOutputBag.log - rid=\${outputBag_rid} - else - exist=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') - exist=\${exist:8:-6} - outputBag_rid=\$(python3 ${script_uploadOutputBag} -e ${executionRunRID} -o ${source} -c \${cookie} -u \${exist}) - echo LOG: output bag RID already exists - \${exist} >> ${repRID}.uploadOutputBag.log - rid=\${exist} - fi - - echo "\${rid}" > outputBagRID.csv - """ -} - -// Extract output bag RID into channel -outputBagRID = Channel.create() -outputBagRID_fl.splitCsv(sep: ",", header: false).separate( - outputBagRID -) - -/* - * finalizeExecutionRun: 
finalizes the execution run -*/ -process finalizeExecutionRun { - tag "${repRID}" - - input: - path script_uploadExecutionRun_finalizeExecutionRun - path credential, stageAs: "credential.json" from deriva_finalizeExecutionRun - val executionRunRID from executionRunRID_finalizeExecutionRun - val inputBagRID from inputBagRID_finalizeExecutionRun - val outputBagRID - - when: - upload - - script: - """ - hostname > ${repRID}.finalizeExecutionRun.log - ulimit -a >> ${repRID}.finalizeExecutionRun.log - - executionRun=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/RID=${executionRunRID}) - workflow=\$(echo \${executionRun} | grep -o '\\"Workflow\\":.*\\"Reference' | grep -oP '(?<=\\"Workflow\\":\\").*(?=\\",\\"Reference)') - genome=\$(echo \${executionRun} | grep -o '\\"Reference_Genome\\":.*\\"Input_Bag' | grep -oP '(?<=\\"Reference_Genome\\":\\").*(?=\\",\\"Input_Bag)') - - cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') - cookie=\${cookie:11:-1} - - rid=\$(python3 ${script_uploadExecutionRun_finalizeExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Success -d 'Run Successful' -o ${source} -c \${cookie} -u ${executionRunRID}) - echo LOG: execution run RID marked as successful - \${rid} >> ${repRID}.finalizeExecutionRun.log - - if [ ${params.track} == true ] - then - dt=`date +%FT%T.%3N%:z` - curl -H 'Content-Type: application/json' -X PUT -d \ - '{ \ - "ID": "${workflow.sessionId}", \ - "Complete": "'\${dt}'" \ - }' \ - "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track" - fi - """ -} - -// Combine errors -error_meta = fastqCountError_uploadQC_fail.ifEmpty(false).combine(fastqReadError_uploadQC_fail.ifEmpty(false).combine(fastqFileError_uploadQC_fail.ifEmpty(false).combine(speciesError_uploadQC_fail.ifEmpty(false).combine(pipelineError_uploadQC_fail.ifEmpty(false))))) -error_meta. 
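finalizeExecutionRun, like the other DERIVA-facing processes here, pulls single fields back out of the ERMrest JSON with grep look-arounds rather than a JSON parser. A small self-contained illustration of that idiom (the sample record below is made up; the field names match the ones the pipeline extracts):

    #!/bin/bash
    # Sketch: extract Workflow and Reference_Genome RIDs from an Execution_Run record.
    executionRun='[{"RID":"1-ABCD","Workflow":"1-WXYZ","Reference_Genome":"1-GENO","Input_Bag":"1-BAGG"}]'

    workflow=$(echo "${executionRun}" | grep -o '"Workflow":.*"Reference' | grep -oP '(?<="Workflow":").*(?=","Reference)')
    genome=$(echo "${executionRun}" | grep -o '"Reference_Genome":.*"Input_Bag' | grep -oP '(?<="Reference_Genome":").*(?=","Input_Bag)')

    echo "workflow RID: ${workflow}"   # 1-WXYZ
    echo "genome RID: ${genome}"       # 1-GENO
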
into{ - error_failPreExecutionRun - error_uploadQC_fail -} -errorDetails = fastqCountError_details.ifEmpty("").combine(fastqReadError_details.ifEmpty("").combine(fastqFileError_details.ifEmpty("").combine(speciesError_details.ifEmpty("")))) - -/* - * failPreExecutionRun_fastq: fail the execution run prematurely for fastq errors -*/ -process failPreExecutionRun { - tag "${repRID}" - - input: - path script_uploadExecutionRun from script_uploadExecutionRun_failPreExecutionRun - path credential, stageAs: "credential.json" from deriva_failPreExecutionRun - val spike from spikeMeta_failPreExecutionRun - val species from speciesMeta_failPreExecutionRun - val inputBagRID from inputBagRID_failPreExecutionRun - tuple val (fastqCountError), val (fastqReadError), val (fastqFileError), val (speciesError), val (pipelineError) from error_failPreExecutionRun - tuple val (fastqCountError_details), val (fastqReadError_details), val (fastqFileError_details), val (speciesError_details) from errorDetails - - output: - path ("executionRunRID.csv") into executionRunRID_preFail_fl - - when: - upload - fastqCountError == 'true' || fastqReadError == 'true' || fastqFileError == 'true' || speciesError == 'true' - - script: - """ - hostname > ${repRID}.failPreExecutionRun.log - ulimit -a >> ${repRID}.failPreExecutionRun.log - - errorDetails="" - if [ ${fastqCountError} == true ] - then - errorDetails=\$(echo ${fastqCountError_details}"\\n") - elif [ ${fastqReadError} == true ] - then - errorDetails=\$(echo \$(errorDetails)${fastqReadError_details}"\\n") - elif [ ${fastqFileError} == true ] - then - errorDetails=\$(echo \$(errorDetails)${fastqReadError_details}"\\n") - elif [ ${speciesError} == true ] - then - errorDetails=\$(echo \$(errorDetails)${fastqReadError_details}"\\n") - fi - - echo LOG: searching for workflow RID - BICF mRNA ${workflow.manifest.version} >> ${repRID}.failPreExecutionRun.log - workflow=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Workflow/Name=BICF%20mRNA%20Replicate/Version=${workflow.manifest.version}) - workflow=\$(echo \${workflow} | grep -o '\\"RID\\":\\".*\\",\\"RCT') - workflow=\${workflow:7:-6} - echo LOG: workflow RID extracted - \${workflow} >> ${repRID}.failPreExecutionRun.log - - if [ "${species}" == "Homo sapiens" ] - then - genomeName=\$(echo GRCh${refHuVersion}) - elif [ "${species}" == "Mus musculus" ] - then - genomeName=\$(echo GRCm${refMoVersion}) - fi - if [ "${spike}" == "true" ] - then - genomeName=\$(echo \${genomeName}-S) - fi - echo LOG: searching for genome name - \${genomeName} >> ${repRID}.failPreExecutionRun.log - genome=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Name=\${genomeName}) - genome=\$(echo \${genome} | grep -o '\\"RID\\":\\".*\\",\\"RCT') - genome=\${genome:7:-6} - echo LOG: genome RID extracted - \${genome} >> ${repRID}.failPreExecutionRun.log - - cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') - cookie=\${cookie:11:-1} - - exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=\${workflow}/Replicate=${repRID}/Input_Bag=${inputBagRID}) - echo \${exist} >> ${repRID}.failPreExecutionRun.log - if [ "\${exist}" == "[]" ] - then - rid=\$(python3 ${script_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d "\${errorDetails}" -o ${source} -c \${cookie} -u F) - echo LOG: execution run RID uploaded - \${rid} >> ${repRID}.failPreExecutionRun.log - else - rid=\$(echo \${exist} | grep -o 
'\\"RID\\":\\".*\\",\\"RCT') - rid=\${rid:7:-6} - echo \${rid} >> ${repRID}.failPreExecutionRun.log - executionRun_rid=\$(python3 ${script_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d "\${errorDetails}" -o ${source} -c \${cookie} -u \${rid}) - echo LOG: execution run RID updated - \${executionRun_rid} >> ${repRID}.failPreExecutionRun.log - fi - - echo "\${rid}" > executionRunRID.csv - - if [ ${params.track} == true ] - then - dt=`date +%FT%T.%3N%:z` - curl -H 'Content-Type: application/json' -X PUT -d \ - '{ \ - "ID": "${workflow.sessionId}", \ - "ExecutionRunRID": "'\${rid}'", \ - "Failure": "'\${dt}'" \ - }' \ - "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track" - fi - """ -} -// Extract execution run RID into channel -executionRunRID_preFail = Channel.create() -executionRunRID_preFail_fl.splitCsv(sep: ",", header: false).separate( - executionRunRID_preFail -) - -failExecutionRunRID = executionRunRID_fail.ifEmpty('').mix(executionRunRID_preFail.ifEmpty('')).filter { it != "" } - -/* - * failExecutionRun: fail the execution run -*/ -process failExecutionRun { - tag "${repRID}" - - input: - path script_uploadExecutionRun_failExecutionRun - path credential, stageAs: "credential.json" from deriva_failExecutionRun - val executionRunRID from executionRunRID_failExecutionRun - val inputBagRID from inputBagRID_failExecutionRun - val endsMeta from endsMeta_failExecutionRun - val endsRaw - val strandedMeta from strandedMeta_failExecutionRun - val spikeMeta from spikeMeta_failExecutionRun - val speciesMeta from speciesMeta_failExecutionRun - val endsInfer from endsInfer_failExecutionRun - val strandedInfer from strandedInfer_failExecutionRun - val spikeInfer from spikeInfer_failExecutionRun - val speciesInfer from speciesInfer_failExecutionRun - val pipelineError from pipelineError_failExecutionRun - val pipelineError_ends - val pipelineError_stranded - val pipelineError_spike - val pipelineError_species - - when: - upload - pipelineError == 'true' - - script: - """ - hostname > ${repRID}.failExecutionRun.log - ulimit -a >> ${repRID}.failExecutionRun.log - - executionRun=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/RID=${executionRunRID}) - workflow=\$(echo \${executionRun} | grep -o '\\"Workflow\\":.*\\"Reference' | grep -oP '(?<=\\"Workflow\\":\\").*(?=\\",\\"Reference)') - genome=\$(echo \${executionRun} | grep -o '\\"Reference_Genome\\":.*\\"Input_Bag' | grep -oP '(?<=\\"Reference_Genome\\":\\").*(?=\\",\\"Input_Bag)') - - cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') - cookie=\${cookie:11:-1} - - errorDetails="" - if [ ${pipelineError} == false ] - then - rid=\$(python3 ${script_uploadExecutionRun_failExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Success -d 'Run Successful' -o ${source} -c \${cookie} -u ${executionRunRID}) - echo LOG: execution run RID marked as successful - \${rid} >> ${repRID}.failExecutionRun.log - else - pipelineError_details=\$(echo "**Submitted metadata does not match inferred:**\\n") - pipelineError_details=\$(echo \${pipelineError_details}"|Metadata|Submitted value|Inferred value|\\n") - pipelineError_details=\$(echo \${pipelineError_details}"|:-:|-:|-:|\\n") - if ${pipelineError_ends} - then - if [ "${endsInfer}" == "se" ] - then - endInfer="Single End" - elif [ "${endsInfer}" == "pe" ] - then - endInfer="Paired End" - else - endInfer="unknown" - fi - pipelineError_details=\$(echo 
\${pipelineError_details}"|Paired End|${endsRaw}|"\${endInfer}"|\\n") - fi - if ${pipelineError_stranded} - then - pipelineError_details=\$(echo \${pipelineError_details}"|Strandedness|${strandedMeta}|${strandedInfer}|\\n") - fi - if ${pipelineError_spike} - then - pipelineError_details=\$(echo \${pipelineError_details}"|Used Spike Ins|${spikeMeta}|${spikeInfer}|\\n") - fi - if ${pipelineError_species} - then - pipelineError_details=\$(echo \${pipelineError_details}"|Species|${speciesMeta}|${speciesInfer}|\\n") - fi - pipelineError_details=\${pipelineError_details::-2} - rid=\$(python3 ${script_uploadExecutionRun_failExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d "\${pipelineError_details}" -o ${source} -c \${cookie} -u ${executionRunRID}) - echo LOG: execution run RID marked as error - \${rid} >> ${repRID}.failExecutionRun.log - fi - - if [ ${params.track} == true ] - then - dt=`date +%FT%T.%3N%:z` - curl -H 'Content-Type: application/json' -X PUT -d \ - '{ \ - "ID": "${workflow.sessionId}", \ - "ExecutionRunRID": "'\${rid}'", \ - "Failure": "'\${dt}'" \ - }' \ - "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track" - fi - """ -} - -/* - * uploadQC_fail: uploads the mRNA QC on failed execution run -*/ -process uploadQC_fail { - tag "${repRID}" - - input: - path script_deleteEntry_uploadQC_fail - path script_uploadQC_fail - path credential, stageAs: "credential.json" from deriva_uploadQC_fail - val executionRunRID from failExecutionRunRID - tuple val (fastqCountError), val (fastqReadError), val (fastqFileError), val (speciesError), val (pipelineError) from error_uploadQC_fail - - when: - upload - fastqCountError == 'true' || fastqReadError == 'true' || fastqFileError == 'true' || speciesError == 'true' || pipelineError == 'true' - - script: - """ - hostname > ${repRID}.uploadQC.log - ulimit -a >> ${repRID}.uploadQC.log - - cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') - cookie=\${cookie:11:-1} - - exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:mRNA_QC/Replicate=${repRID}) - if [ "\${exist}" != "[]" ] - then - rids=\$(echo \${exist} | grep -o '\\"RID\\":\\".\\{7\\}' | sed 's/^.\\{7\\}//') - for rid in \${rids} - do - python3 ${script_deleteEntry_uploadQC_fail} -r \${rid} -t mRNA_QC -o ${source} -c \${cookie} - echo LOG: old mRNA QC RID deleted - \${rid} >> ${repRID}.uploadQC.log - done - echo LOG: all old mRNA QC RIDs deleted >> ${repRID}.uploadQC.log - fi - - qc_rid=\$(python3 ${script_uploadQC_fail} -r ${repRID} -e ${executionRunRID} -o ${source} -c \${cookie} -u E) - echo LOG: mRNA QC RID uploaded - \${qc_rid} >> ${repRID}.uploadQC.log - - echo "\${qc_rid}" > qcRID.csv - """ -} - - -workflow.onError = { - subject = "$workflow.manifest.name FAILED: $params.repRID" - - def msg = """\ - - Pipeline error summary - --------------------------- - RID : ${params.repRID} - Version : ${workflow.manifest.version} - Duration : ${workflow.duration} - Nf Version : ${workflow.nextflow.version} - Message : ${workflow.errorMessage} - exit status : ${workflow.exitStatus} - """ - .stripIndent() - if (email != '') { - sendMail(to: email, subject: subject , body: msg) - } -} diff --git a/workflow/scripts/bdbag_fetch.sh b/workflow/scripts/bdbag_fetch.sh index 45ee14a7da409e011494921bafa204b44e96f795..d336829125c030563c96aaf6c354b2f00bdb5a47 100644 --- a/workflow/scripts/bdbag_fetch.sh +++ b/workflow/scripts/bdbag_fetch.sh @@ -9,7 +9,7 @@ then n=0 until [ "${n}" -ge "3" ] do - bdbag 
--resolve-fetch missing --validate full ${1} --debug && validate=$(tail -n validate.txt | grep -o 'is valid') && break
+    bdbag --resolve-fetch missing --validate full ${1} --debug --config-file bdbag.json && validate=$(tail -n validate.txt | grep -o 'is valid') && break
     n=$((n+1))
     sleep 15
 done
@@ -18,8 +18,10 @@ if [ "${validate}" != "is valid" ]
 then
     exit 1
 fi
+count=$(find */ -name "*[_.]R[1-2].fastq.gz" | wc -l)
 for i in $(find */ -name "*[_.]R[1-2].fastq.gz")
 do
     path=${2}.$(echo ${i##*/} | grep -o "R[1,2].fastq.gz")
     cp ${i} ./${path}
-done
\ No newline at end of file
+done
+echo ${count}
diff --git a/workflow/scripts/generate_versions.py b/workflow/scripts/generate_versions.py
index 09447d17a62a439a418753398e1cd77716ceaa74..2ff498659cc9acbf989ec45e61e8b755b9cc3a66 100644
--- a/workflow/scripts/generate_versions.py
+++ b/workflow/scripts/generate_versions.py
@@ -34,17 +34,19 @@ SOFTWARE_REGEX = {
     'Python': ['version_python.txt', r"Python (\S+)"],
     'DERIVA': ['version_deriva.txt', r"(\S+)"],
     'BDBag': ['version_bdbag.txt', r"BDBag (\S+) \(Bagit \S+\)"],
-    'RSeQC': ['version_rseqc.txt', r"infer_experiment.py (\S+)"],
     'Trim Galore!': ['version_trimgalore.txt', r"version (\S+)"],
     'HISAT2': ['version_hisat2.txt', r"version (\S+)"],
     'Samtools': ['version_samtools.txt', r"samtools (\S+)"],
     'picard (MarkDuplicates)': ['version_markdups.txt', r"Version:(\S+)"],
     'featureCounts': ['version_featurecounts.txt', r"featureCounts v(\S+)"],
-    'R': ['version_r.txt', r"R version (\S+)"],
     'deepTools': ['version_deeptools.txt', r"deeptools (\S+)"],
+    'Seqtk': ['version_seqtk.txt', r"Version: (\S+)"],
+    'R': ['version_r.txt', r"R version (\S+)"],
     'FastQC': ['version_fastqc.txt', r"FastQC v(\S+)"],
+    'SeqWho': ['version_seqwho.txt', r"Version (\S+)"],
+    'RSeQC': ['version_rseqc.txt', r"infer_experiment.py (\S+)"],
     'MultiQC': ['version_multiqc.txt', r"multiqc, version (\S+)"],
-    'Pipeline Version': ['./workflow/nextflow.config', r"version = 'v(\S+)'"]
+    'Pipeline Version': ['./nextflow.config', r"version = 'v(\S+)'"]
 }
 
 
@@ -93,15 +95,17 @@ def main():
     results['Python'] = '<span style="color:#999999;\">Not Run</span>'
     results['DERIVA'] = '<span style="color:#999999;\">Not Run</span>'
     results['BDBag'] = '<span style="color:#999999;\">Not Run</span>'
-    results['RSeQC'] = '<span style="color:#999999;\">Not Run</span>'
     results['Trim Galore!'] = '<span style="color:#999999;\">Not Run</span>'
     results['HISAT2'] = '<span style="color:#999999;\">Not Run</span>'
     results['Samtools'] = '<span style="color:#999999;\">Not Run</span>'
     results['picard (MarkDuplicates)'] = '<span style="color:#999999;\">Not Run</span>'
     results['featureCounts'] = '<span style="color:#999999;\">Not Run</span>'
-    results['R'] = '<span style="color:#999999;\">Not Run</span>'
     results['deepTools'] = '<span style="color:#999999;\">Not Run</span>'
+    results['Seqtk'] = '<span style="color:#999999;\">Not Run</span>'
+    results['R'] = '<span style="color:#999999;\">Not Run</span>'
     results['FastQC'] = '<span style="color:#999999;\">Not Run</span>'
+    results['SeqWho'] = '<span style="color:#999999;\">Not Run</span>'
+    results['RSeQC'] = '<span style="color:#999999;\">Not Run</span>'
     results['MultiQC'] = '<span style="color:#999999;\">Not Run</span>'
     results['Pipeline Version'] = '<span style="color:#999999;\">Not Run</span>'
 
@@ -125,7 +129,7 @@ def main():
     '''
     id: 'software_versions'
     section_name: 'Software Versions'
-    section_href: 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/blob/78-tool_version/docs/RNA-Seq%20Pipeline%20Design%20Process%20Table.pdf'
+
section_href: 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/wikis/Pipeline/Tool-Versions' plot_type: 'html' description: 'are collected for pipeline version.' data: | diff --git a/workflow/scripts/get_updated_badge_info.sh b/workflow/scripts/get_updated_badge_info.sh index 4b929272f2ea80ede5d47b84cd55bad2c6a3fa7b..3a5df46c52a6e1fe0cbd41946cdea09c67d1e08e 100644 --- a/workflow/scripts/get_updated_badge_info.sh +++ b/workflow/scripts/get_updated_badge_info.sh @@ -2,27 +2,29 @@ echo "collecting stats for badges" latest_release_tag=$(git tag --sort=-committerdate -l *.*.* | head -1) -current_pipeline_version=$(git show ${latest_release_tag}:workflow/nextflow.config | grep -o version.* | grep -oP "(?<=').*(?=')") -current_nextflow_version=$(git show ${latest_release_tag}:workflow/nextflow.config | grep -o nextflowVersion.* | grep -oP "(?<=').*(?=')") -master_pipeline_version=$(git show origin/master:workflow/nextflow.config | grep -o version.* | grep -oP "(?<=').*(?=')") -master_nextflow_version=$(git show origin/master:workflow/nextflow.config | grep -o nextflowVersion.* | grep -oP "(?<=').*(?=')") -develop_pipeline_version=$(git show origin/develop:workflow/nextflow.config | grep -o version.* | grep -oP "(?<=').*(?=')") -develop_nextflow_version=$(git show origin/develop:workflow/nextflow.config | grep -o nextflowVersion.* | grep -oP "(?<=').*(?=')") +current_pipeline_version=$(git show ${latest_release_tag}:nextflow.config | grep -o version.* | grep -oP "(?<=').*(?=')" | tr "-" _) +current_nextflow_version=$(git show ${latest_release_tag}:nextflow.config | grep -o nextflowVersion.* | grep -oP "(?<=').*(?=')" | tr "-" _) +master_pipeline_version=$(git show origin/master:nextflow.config | grep -o version.* | grep -oP "(?<=').*(?=')" | tr "-" _) +master_nextflow_version=$(git show origin/master:nextflow.config | grep -o nextflowVersion.* | grep -oP "(?<=').*(?=')" | tr "-" _) +develop_pipeline_version=$(git show origin/develop:nextflow.config | grep -o version.* | grep -oP "(?<=').*(?=')") +develop_nextflow_version=$(git show origin/develop:nextflow.config | grep -o nextflowVersion.* | grep -oP "(?<=').*(?=')" | tr "-" _) echo "collecting tool version for badges" -python_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o Python.* | grep -oP "(?<=d>).*(?=\<)") -deriva_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o DERIVA.* | grep -oP "(?<=d>).*(?=\<)") -bdbag_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o BDBag.* | grep -oP "(?<=d>).*(?=\<)") -rseqc_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o RSeQC.* | grep -oP "(?<=d>).*(?=\<)") -trimgalore_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o 'Trim Galore!'.* | grep -oP "(?<=d>).*(?=\<)") -hisat2_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o HISAT2.* | grep -oP "(?<=d>).*(?=\<)") -samtools_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o Samtools.* | grep -oP "(?<=d>).*(?=\<)") -picard_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o 'picard (MarkDuplicates)'.* | grep -oP "(?<=d>).*(?=\<)") -featurecounts_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o featureCounts.* | grep -oP "(?<=d>).*(?=\<)") -r_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o '>R<'.* | grep -oP "(?<=d>).*(?=\<)") 
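Every version captured in get_updated_badge_info.sh is now passed through tr "-" _ before being embedded in a badge URL: shields.io static badges use the dash to separate the label, message, and colour fields, so a literal dash inside a version string (for example a "-rc1" suffix) would otherwise be read as a field separator. A small sketch of fetching one such badge, with a placeholder tool name and version:

    #!/bin/bash
    # Sketch: build a shields.io static badge for a tool version.
    version="2.1.0-rc1"                          # placeholder version string
    safe_version=$(echo "${version}" | tr "-" _) # dashes would split the badge fields

    mkdir -p ./badges/tools
    curl --request GET \
        "https://img.shields.io/badge/Example%20Version-${safe_version}-blueviolet?style=flat" \
        > ./badges/tools/example.svg
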
-deeptools_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o deepTools.* | grep -oP "(?<=d>).*(?=\<)") -fastqc_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o FastQC.* | grep -oP "(?<=d>).*(?=\<)") -multiqc_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o MultiQC.* | grep -oP "(?<=d>).*(?=\<)") +python_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o Python.* | grep -oP "(?<=d>).*(?=\<)" | tr "-" _) +deriva_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o DERIVA.* | grep -oP "(?<=d>).*(?=\<)" | tr "-" _) +bdbag_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o BDBag.* | grep -oP "(?<=d>).*(?=\<)" | tr "-" _) +trimgalore_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o 'Trim Galore!'.* | grep -oP "(?<=d>).*(?=\<)" | tr "-" _) +hisat2_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o HISAT2.* | grep -oP "(?<=d>).*(?=\<)" | tr "-" _) +samtools_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o Samtools.* | grep -oP "(?<=d>).*(?=\<)" | tr "-" _) +picard_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o 'picard (MarkDuplicates)'.* | grep -oP "(?<=d>).*(?=\<)" | tr "-" _) +featurecounts_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o featureCounts.* | grep -oP "(?<=d>).*(?=\<)" | tr "-" _) +deeptools_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o deepTools.* | grep -oP "(?<=d>).*(?=\<)" | tr "-" _) +seqtk_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o Seqtk.* | grep -oP "(?<=d>).*(?=\<)" | tr "-" _) +r_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o '>R<'.* | grep -oP "(?<=d>).*(?=\<)" | tr "-" _) +fastqc_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o FastQC.* | grep -oP "(?<=d>).*(?=\<)" | tr "-" _) +seqwho_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o SeqWho.* | grep -oP "(?<=d>).*(?=\<)" | tr "-" _) +rseqc_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o RSeQC.* | grep -oP "(?<=d>).*(?=\<)" | tr "-" _) +multiqc_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o MultiQC.* | grep -oP "(?<=d>).*(?=\<)" | tr "-" _) echo "collecting badges" mkdir -p ./badges/tools @@ -37,13 +39,34 @@ curl --request GET https://img.shields.io/badge/Nextflow%20Version-${develop_nex curl --request GET https://img.shields.io/badge/Python%20Version-${python_version}-blueviolet?style=flat > ./badges/tools/python.svg curl --request GET https://img.shields.io/badge/DERIVA%20Version-${deriva_version}-blueviolet?style=flat > ./badges/tools/deriva.svg curl --request GET https://img.shields.io/badge/BDBag%20Version-${bdbag_version}-blueviolet?style=flat > ./badges/tools/bdbag.svg -curl --request GET https://img.shields.io/badge/RSeQC%20Version-${rseqc_version}-blueviolet?style=flat > ./badges/tools/rseqc.svg curl --request GET https://img.shields.io/badge/Trim%20Galore%20Version-${trimgalore_version}-blueviolet?style=flat > ./badges/tools/trimgalore.svg curl --request GET https://img.shields.io/badge/HISAT2%20Version-${hisat2_version}-blueviolet?style=flat > ./badges/tools/hisat2.svg curl --request GET 
https://img.shields.io/badge/Samtools%20Version-${samtools_version}-blueviolet?style=flat > ./badges/tools/samtools.svg curl --request GET https://img.shields.io/badge/picard%20Version-${picard_version}-blueviolet?style=flat > ./badges/tools/picard.svg curl --request GET https://img.shields.io/badge/featureCounts%20Version-${featurecounts_version}-blueviolet?style=flat > ./badges/tools/featurecounts.svg -curl --request GET https://img.shields.io/badge/R%20Version-${r_version}-blueviolet?style=flat > ./badges/tools/r.svg curl --request GET https://img.shields.io/badge/deepTools%20Version-${deeptools_version}-blueviolet?style=flat > ./badges/tools/deeptools.svg +curl --request GET https://img.shields.io/badge/Seqtk%20Version-${seqtk_version}-blueviolet?style=flat > ./badges/tools/seqtk.svg +curl --request GET https://img.shields.io/badge/R%20Version-${r_version}-blueviolet?style=flat > ./badges/tools/r.svg curl --request GET https://img.shields.io/badge/FastQC%20Version-${fastqc_version}-blueviolet?style=flat > ./badges/tools/fastqc.svg -curl --request GET https://img.shields.io/badge/MultiQC%20Version-${multiqc_version}-blueviolet?style=flat > ./badges/tools/multiqc.svg \ No newline at end of file +curl --request GET https://img.shields.io/badge/SeqWho%20Version-${seqwho_version}-blueviolet?style=flat > ./badges/tools/seqwho.svg +curl --request GET https://img.shields.io/badge/RSeQC%20Version-${rseqc_version}-blueviolet?style=flat > ./badges/tools/rseqc.svg +curl --request GET https://img.shields.io/badge/MultiQC%20Version-${multiqc_version}-blueviolet?style=flat > ./badges/tools/multiqc.svg + +echo "creating blank env badges if not tested" +mkdir -p ./badges/env +if [ ! -f ./badges/env/dnanexus.svg ] +then +curl --request GET https://img.shields.io/badge/Envronment%3A%20DNAnexus-not_tested-important?style=flat > ./badges/env/dnanexus.svg +fi +if [ ! -f ./badges/env/aws.svg ] +then +curl --request GET https://img.shields.io/badge/Envronment%3A%20AWS-not_tested-important?style=flat > ./badges/env/aws.svg +fi +if [ ! -f ./badges/env/azure.svg ] +then +curl --request GET https://img.shields.io/badge/Envronment%3A%20Azure-not_tested-important?style=flat > ./badges/env/azure.svg +fi +if [ ! 
-f ./badges/env/gcp.svg ] +then +curl --request GET https://img.shields.io/badge/Envronment%3A%20GCP-not_tested-important?style=flat > ./badges/env/gcp.svg +fi \ No newline at end of file diff --git a/workflow/scripts/get_updated_rep_count.sh b/workflow/scripts/get_updated_rep_count.sh index daeb0575d08f2126b40f2db089ae82af4f01ed0c..4805a9c61e83c39d40a3f5fc2bf33ffabc30e552 100644 --- a/workflow/scripts/get_updated_rep_count.sh +++ b/workflow/scripts/get_updated_rep_count.sh @@ -2,7 +2,7 @@ echo "collecting stats for badges" latest_release_tag=$(git tag --sort=-committerdate -l *.*.* | head -1) -current_pipeline_version=$(git show ${latest_release_tag}:workflow/nextflow.config | grep -o version.* | grep -oP "(?<=').*(?=')") +current_pipeline_version=$(git show ${latest_release_tag}:nextflow.config | grep -o version.* | grep -oP "(?<=').*(?=')") current_pipeline_versionMajor=$(echo ${current_pipeline_version} | cut -f1 -d".") current_pipeline_versionMajor=$(echo ${current_pipeline_versionMajor}".") echo "Major pipeline version for search: "${current_pipeline_versionMajor} @@ -13,27 +13,52 @@ staging_workflow_RID=$(curl -s https://staging.gudmap.org/ermrest/catalog/2/enti prod_workflow_RID=$(curl -s https://www.gudmap.org/ermrest/catalog/2/entity/RNASeq:Workflow/Version::ciregexp::%5E${current_pipeline_versionMajor} | grep -o '\"RID\":\".*\",\"RCT' | cut -f4 -d"\"") echo "collecting unique replicates with successful execution runs" -dev_count=0 +dev_total=0 +dev_success=0 +dev_error=0 for rid in ${dev_workflow_RID} do - temp_count=$(curl -s https://dev.gudmap.org/ermrest/catalog/2/entity/RNASeq:Execution_Run/Execution_Status=Success/Workflow=${rid} | grep -o \"Replicate\".*,\"Workflow | grep -oP "(?<=\"Replicate\":\").*(?=\",\"Workflow)" | sort | uniq | wc -l) - dev_count=$(expr ${dev_count} + ${temp_count}) + temp_total=$(curl -s https://dev.gudmap.org/ermrest/catalog/2/entity/Q:=RNASeq:mRNA_QC/E:=\(Execution_Run\)=\(RNASeq:Execution_Run:RID\)/Workflow=${rid}/\!Execution_Status=In-progress/\$Q | grep -o \"Replicate\".*,\"Paired_End | grep -oP "(?<=\"Replicate\":\").*(?=\",\"Paired_End)" | sort | uniq | wc -l) + dev_total=$(expr ${dev_total} + ${temp_total}) + temp_success=$(curl -s https://dev.gudmap.org/ermrest/catalog/2/entity/Q:=RNASeq:mRNA_QC/E:=\(Execution_Run\)=\(RNASeq:Execution_Run:RID\)/Workflow=${rid}/Execution_Status=Success/\$Q | grep -o \"Replicate\".*,\"Paired_End | grep -oP "(?<=\"Replicate\":\").*(?=\",\"Paired_End)" | sort | uniq | wc -l) + dev_success=$(expr ${dev_success} + ${temp_success}) + temp_error=$(curl -s https://dev.gudmap.org/ermrest/catalog/2/entity/Q:=RNASeq:mRNA_QC/E:=\(Execution_Run\)=\(RNASeq:Execution_Run:RID\)/Workflow=${rid}/Execution_Status=Error/\$Q | grep -o \"Replicate\".*,\"Paired_End | grep -oP "(?<=\"Replicate\":\").*(?=\",\"Paired_End)" | sort | uniq | wc -l) + dev_error=$(expr ${dev_error} + ${temp_error}) done -staging_count=0 +staging_total=0 +staging_success=0 +staging_error=0 for rid in ${staging_workflow_RID} do - temp_count=$(curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Execution_Run/Execution_Status=Success/Workflow=${rid} | grep -o \"Replicate\".*,\"Workflow | grep -oP "(?<=\"Replicate\":\").*(?=\",\"Workflow)" | sort | uniq | wc -l) - staging_count=$(expr ${staging_count} + ${temp_count}) + temp_total=$(curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/Q:=RNASeq:mRNA_QC/E:=\(Execution_Run\)=\(RNASeq:Execution_Run:RID\)/Workflow=${rid}/\!Execution_Status=In-progress/\$Q | grep -o 
\"Replicate\".*,\"Paired_End | grep -oP "(?<=\"Replicate\":\").*(?=\",\"Paired_End)" | sort | uniq | wc -l) + staging_total=$(expr ${staging_total} + ${temp_total}) + temp_success=$(curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/Q:=RNASeq:mRNA_QC/E:=\(Execution_Run\)=\(RNASeq:Execution_Run:RID\)/Workflow=${rid}/Execution_Status=Success/\$Q | grep -o \"Replicate\".*,\"Paired_End | grep -oP "(?<=\"Replicate\":\").*(?=\",\"Paired_End)" | sort | uniq | wc -l) + staging_success=$(expr ${staging_success} + ${temp_success}) + temp_error=$(curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/Q:=RNASeq:mRNA_QC/E:=\(Execution_Run\)=\(RNASeq:Execution_Run:RID\)/Workflow=${rid}/Execution_Status=Error/\$Q | grep -o \"Replicate\".*,\"Paired_End | grep -oP "(?<=\"Replicate\":\").*(?=\",\"Paired_End)" | sort | uniq | wc -l) + staging_error=$(expr ${staging_error} + ${temp_error}) done -prod_count=0 +prod_total=0 +prod_success=0 +prod_error=0 for rid in ${prod_workflow_RID} do - temp_count=$(curl -s https://www.gudmap.org/ermrest/catalog/2/entity/RNASeq:Execution_Run/Execution_Status=Success/Workflow=${rid} | grep -o \"Replicate\".*,\"Workflow | grep -oP "(?<=\"Replicate\":\").*(?=\",\"Workflow)" | sort | uniq | wc -l) - prod_count=$(expr ${prod_count} + ${temp_count}) + temp_total=$(curl -s https://www.gudmap.org/ermrest/catalog/2/entity/Q:=RNASeq:mRNA_QC/E:=\(Execution_Run\)=\(RNASeq:Execution_Run:RID\)/Workflow=${rid}/\!Execution_Status=In-progress/\$Q | grep -o \"Replicate\".*,\"Paired_End | grep -oP "(?<=\"Replicate\":\").*(?=\",\"Paired_End)" | sort | uniq | wc -l) + prod_total=$(expr ${prod_total} + ${temp_total}) + temp_success=$(curl -s https://www.gudmap.org/ermrest/catalog/2/entity/Q:=RNASeq:mRNA_QC/E:=\(Execution_Run\)=\(RNASeq:Execution_Run:RID\)/Workflow=${rid}/Execution_Status=Success/\$Q | grep -o \"Replicate\".*,\"Paired_End | grep -oP "(?<=\"Replicate\":\").*(?=\",\"Paired_End)" | sort | uniq | wc -l) + prod_success=$(expr ${prod_success} + ${temp_success}) + temp_error=$(curl -s https://www.gudmap.org/ermrest/catalog/2/entity/Q:=RNASeq:mRNA_QC/E:=\(Execution_Run\)=\(RNASeq:Execution_Run:RID\)/Workflow=${rid}/Execution_Status=Error/\$Q | grep -o \"Replicate\".*,\"Paired_End | grep -oP "(?<=\"Replicate\":\").*(?=\",\"Paired_End)" | sort | uniq | wc -l) + prod_error=$(expr ${prod_error} + ${temp_error}) done echo "collecting badges" mkdir -p ./badges/counts -curl --request GET https://img.shields.io/badge/Development%20Replicate%20Count-${dev_count}-lightgrey?style=flat > ./badges/counts/dev_counts.svg -curl --request GET https://img.shields.io/badge/Staging%20Replicate%20Count-${staging_count}-lightgrey?style=flat > ./badges/counts/staging_counts.svg -curl --request GET https://img.shields.io/badge/Production%20Replicate%20Count-${prod_count}-lightgrey?style=flat > ./badges/counts/prod_counts.svg +curl --request GET https://img.shields.io/badge/Development%20Successful%20Replicates-${dev_success}/${dev_total}-green?style=flat > ./badges/counts/dev_success.svg +curl --request GET https://img.shields.io/badge/Development%20Erroneous%20Replicates-${dev_error}/${dev_total}-red?style=flat > ./badges/counts/dev_error.svg +curl --request GET https://img.shields.io/badge/Staging%20Successful%20Replicates-${staging_success}/${staging_total}-green?style=flat > ./badges/counts/staging_success.svg +curl --request GET https://img.shields.io/badge/Staging%20Erroneous%20Replicates-${staging_error}/${staging_total}-red?style=flat > ./badges/counts/staging_error.svg +curl --request GET 
https://img.shields.io/badge/Production%20Successful%20Replicates-${prod_success}/${prod_total}-green?style=flat > ./badges/counts/prod_success.svg
+curl --request GET https://img.shields.io/badge/Production%20Erroneous%20Replicates-${prod_error}/${prod_total}-red?style=flat > ./badges/counts/prod_error.svg
+
+curl --request GET https://img.shields.io/badge/Development%20Replicate%20Count-${dev_success}-lightgrey?style=flat > ./badges/counts/dev_counts.svg
+curl --request GET https://img.shields.io/badge/Staging%20Replicate%20Count-${staging_success}-lightgrey?style=flat > ./badges/counts/staging_counts.svg
+curl --request GET https://img.shields.io/badge/Production%20Replicate%20Count-${prod_success}-lightgrey?style=flat > ./badges/counts/prod_counts.svg
diff --git a/workflow/tests/test_seqwho.py b/workflow/tests/test_seqwho.py
new file mode 100644
index 0000000000000000000000000000000000000000..051cc4b379bc2378b2effff22f4737592d9b54cd
--- /dev/null
+++ b/workflow/tests/test_seqwho.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+
+@pytest.mark.seqwho
+def test_seqwho():
+    assert os.path.exists(os.path.join(
+        test_output_path, 'SeqWho_call.tsv'))
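
The replicate-count script above builds one ERMrest entity path per workflow RID: it starts from RNASeq:mRNA_QC, joins to RNASeq:Execution_Run, filters on Execution_Status, and resets the projection back to the mRNA_QC alias with $Q before counting distinct Replicate values. A standalone sketch of a single such count, with a placeholder host and workflow RID and a simpler per-record grep than the Paired_End-anchored scrape the script itself uses:

    #!/bin/bash
    # Sketch: count distinct replicates with a successful execution run for one workflow RID.
    host="dev.gudmap.org"   # placeholder catalog host
    rid="1-WXYZ"            # placeholder workflow RID

    count=$(curl -s "https://${host}/ermrest/catalog/2/entity/Q:=RNASeq:mRNA_QC/E:=(Execution_Run)=(RNASeq:Execution_Run:RID)/Workflow=${rid}/Execution_Status=Success/\$Q" \
        | grep -o '"Replicate":"[^"]*"' \
        | sort | uniq | wc -l)
    echo "successful replicates for workflow ${rid}: ${count}"

The new workflow/tests/test_seqwho.py that closes the patch simply asserts that SeqWho_call.tsv was written two directories above the test file (the repository root) after a test run; it is presumably selected in CI by its marker, along the lines of pytest -m seqwho workflow/tests/test_seqwho.py.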