diff --git a/.gitignore b/.gitignore index 12288788210fa386427657fa55ab47b9ac14a6aa..c154fe7a68856b7217057a894e93fdd0ef803328 100644 --- a/.gitignore +++ b/.gitignore @@ -281,6 +281,7 @@ $RECYCLE.BIN/ # nextflow analysis folders/files /test_data/* !/test_data/createTestData.sh +!/test_data/Replicate_For_Input_Bag(test).json /workflow/.nextflow/* /workflow/work/* /workflow/output/* @@ -301,4 +302,4 @@ timeline*.html* *_studyRID.csv run*.sh -!.gitkeep +!.gitkeep \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1cf1103821fac0e31e30bafbb9901591bdf8c919..bf7355aa6e19f1c29ab020d8524ff204ffc9e7cc 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,17 +1,58 @@ before_script: - - module add python/3.6.4-anaconda - - pip install --user pytest-pythonpath==0.7.1 pytest-cov==2.5.1 + - module load python/3.6.4-anaconda + - pip install --user attrs==19.1.0 pytest-pythonpath==0.7.1 pytest-cov==2.5.1 deriva==1.3.0 - module load singularity/3.5.3 - module load nextflow/20.01.0 - ln -sfn /project/BICF/BICF_Core/shared/gudmap/test_data/* ./test_data/ - mkdir -p ~/.deriva - mkdir -p ~/.bdbag +variables: + refMoVersion: "38.p6.vM25" + refHuVersion: "38.p13.v36" + refERCCVersion: "92" + stages: + - badges + - deploy - unit + - aggregation + - reference - integration - consistency +build_badges: + stage: badges + only: + - master + - develop + - tags + before_script: + - module load singularity/3.5.3 + - chmod +x ./workflow/scripts/get_updated_badge_info.sh + script: + - echo "Building badges" + - singularity run 'docker://gudmaprbk/gudmap-rbk_base:1.0.0' bash ./workflow/scripts/get_updated_badge_info.sh + - singularity run 'docker://gudmaprbk/gudmap-rbk_base:1.0.0' bash ./workflow/scripts/get_updated_rep_count.sh + artifacts: + paths: + - badges/ + +pages: + stage: deploy + only: + - master + - develop + - tags + dependencies: + - build_badges + script: + - mkdir -p public + - mv badges/ public/badges/ + artifacts: + paths: + - public + getBag: stage: unit only: @@ -19,10 +60,18 @@ getBag: - tags except: - merge_requests + - schedules script: - ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json - - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' deriva-download-cli dev.gudmap.org --catalog 2 ./workflow/conf/replicate_export_config.json . rid=Q-Y5F6 + - singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-download-cli --version > version_deriva.txt + - singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-download-cli staging.gudmap.org --catalog 2 ./workflow/conf/Replicate_For_Input_Bag.json . 
rid=Q-Y5F6 - pytest -m getBag + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - version_deriva.txt + expire_in: 7 days getData: stage: unit @@ -31,11 +80,19 @@ getData: - tags except: - merge_requests + - schedules script: + - singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' bdbag --version > version_bdbag.txt - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt - - unzip ./test_data/bag/Replicate_Q-Y5F6.zip - - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bash ./workflow/scripts/bdbagFetch.sh Replicate_Q-Y5F6 Replicate_Q-Y5F6 TEST + - unzip ./test_data/bag/Q-Y5F6_inputBag_xxxxtest.zip + - singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' bash ./workflow/scripts/bdbag_fetch.sh Q-Y5F6_inputBag Q-Y5F6 - pytest -m getData + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - version_bdbag.txt + expire_in: 7 days parseMetadata: stage: unit @@ -44,18 +101,27 @@ parseMetadata: - tags except: - merge_requests + - schedules script: - - rep=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p repRID) - - exp=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p expRID) - - study=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p studyRID) - - endsMeta=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsMeta) - - endsManual=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsManual) - - stranded=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p stranded) - - spike=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p spike) - - species=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p species) - - readLength=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.stageNew.csv" -p readLength) - - echo -e "${endsMeta},${endsManual},${stranded},${spike},${species},${readLength},${exp},${study},${rep}" > design.csv + - singularity run 'docker://gudmaprbk/python3:1.0.0' python3 --version > version_python.txt + - rep=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p repRID) + - exp=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p expRID) + - study=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p studyRID) + - endsRaw=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsMeta) + - endsMeta="uk" + - endsManual=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 
./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsManual) + - stranded=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p stranded) + - spike=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p spike) + - species=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p species) + - readLength=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p readLength) + - echo -e "${endsMeta},${endsRaw},${endsManual},${stranded},${spike},${species},${readLength},${exp},${study},${rep}" > design.csv - pytest -m parseMetadata + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - version_python.txt + expire_in: 7 days inferMetadata: stage: unit @@ -64,28 +130,23 @@ inferMetadata: - tags except: - merge_requests + - schedules script: + - singularity run 'docker://gudmaprbk/rseqc4.0.0:1.0.0' infer_experiment.py --version > version_rseqc.txt - > align=$(echo $(grep "Overall alignment rate" ./test_data/meta/Q-Y5F6_1M.se.alignSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) && if [[ ${align} == "" ]]; then exit 1; fi - > - singularity run 'docker://bicf/rseqc3.0:2.0.1_indev' infer_experiment.py -r "/project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed" -i "./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam" 1>> Q-Y5F6_1M.se.inferMetadata.log && - ended=`singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/inferMeta.sh endness Q-Y5F6_1M.se.inferMetadata.log` && + singularity run 'docker://gudmaprbk/rseqc4.0.0:1.0.0' infer_experiment.py -r "/project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/annotation/genome.bed" -i "./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam" 1>> Q-Y5F6_1M.se.inferMetadata.log && + ended=`singularity run 'gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/infer_meta.sh endness Q-Y5F6_1M.se.inferMetadata.log` && if [[ ${ended} == "" ]]; then exit 1; fi - pytest -m inferMetadata - -getRef: - stage: unit - only: - - push - - tags - except: - - merge_requests - script: - - mkdir -p hu - - mkdir -p mo - - cp -R /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2 ./hu/ - - cp -R /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2 ./mo/ + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - version_rseqc.txt + expire_in: 7 days trimData: stage: unit @@ -94,12 +155,20 @@ trimData: - tags except: - merge_requests + - schedules script: - - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --basename Q-Y5F6_1M.se ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz - - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --paired --basename Q-Y5F6_1M.pe ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz + - singularity run 'docker://gudmaprbk/trimgalore0.6.5:1.0.0' trim_galore --version > version_trimgalore.txt + - singularity run 'docker://gudmaprbk/trimgalore0.6.5:1.0.0' trim_galore --gzip -q 25 --length 35 --basename Q-Y5F6_1M.se ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz + - singularity run 'docker://gudmaprbk/trimgalore0.6.5:1.0.0' 
trim_galore --gzip -q 25 --length 35 --paired --basename Q-Y5F6_1M.pe ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz - readLengthSE=$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}') - readLengthPE=$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}') - pytest -m trimData + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - version_trimgalore.txt + expire_in: 7 days downsampleData: stage: unit @@ -108,8 +177,9 @@ downsampleData: - tags except: - merge_requests + - schedules script: - - singularity run 'docker://bicf/seqtk:2.0.1_indev' seqtk sample -s100 ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz 1000 1> sampled.1.fq + - singularity run 'docker://gudmaprbk/seqtk1.3:1.0.0' seqtk sample -s100 ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz 1000 1> sampled.1.fq - pytest -m downsampleData alignData: @@ -119,16 +189,26 @@ alignData: - tags except: - merge_requests + - schedules script: - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.se.unal.gz -S Q-Y5F6_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz --summary-file Q-Y5F6_1M.se.alignSummary.txt --new-summary - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.bam - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.sorted.bam.bai - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.pe.unal.gz -S Q-Y5F6_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./test_data/fastq/small/Q-Y5F6_1M.pe_R1_val_1.fq.gz -2 ./test_data/fastq/small/Q-Y5F6_1M.pe_R2_val_2.fq.gz --summary-file Q-Y5F6_1M.pe.alignSummary.txt --new-summary - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.pe.bam Q-Y5F6_1M.pe.sam - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.bam - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools index -@ 20 -b Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.sorted.bam.bai + - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' hisat2 --version > version_hisat2.txt + - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools --version > version_samtools.txt + - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.se.unal.gz -S Q-Y5F6_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz --summary-file Q-Y5F6_1M.se.alignSummary.txt --new-summary + - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam + - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools sort -@ 20 -O BAM -o 
Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.bam + - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.sorted.bam.bai + - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.pe.unal.gz -S Q-Y5F6_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./test_data/fastq/small/Q-Y5F6_1M.pe_val_1.fq.gz -2 ./test_data/fastq/small/Q-Y5F6_1M.pe_val_2.fq.gz --summary-file Q-Y5F6_1M.pe.alignSummary.txt --new-summary + - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.pe.bam Q-Y5F6_1M.pe.sam + - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.bam + - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools index -@ 20 -b Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.sorted.bam.bai - pytest -m alignData + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - version_hisat2.txt + - version_samtools.txt + expire_in: 7 days dedupData: stage: unit @@ -137,15 +217,25 @@ dedupData: - tags except: - merge_requests + - schedules script: - - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5F6_1M.se.sorted.bam O=Q-Y5F6_1M.se.deduped.bam M=Q-Y5F6_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true - - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam ./test_data/bam/small/Q-Y5F6_1M.se.deduped.bam - - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools index -@ 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.sorted.deduped.bam.bai + - singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' samtools --version > version_samtools.txt + - singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates --version 2> version_markdups.txt& + - singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5F6_1M.se.sorted.bam O=Q-Y5F6_1M.se.deduped.bam M=Q-Y5F6_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true + - singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam ./test_data/bam/small/Q-Y5F6_1M.se.deduped.bam + - singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' samtools index -@ 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.sorted.deduped.bam.bai - > - for i in {"chr8","chr4","chrY"}; do + for i in {"chr8","chr4","chrY"}; do echo "samtools view -b Q-Y5F6_1M.se.sorted.deduped.bam ${i} > Q-Y5F6_1M.se.sorted.deduped.${i}.bam; samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.deduped.${i}.bam Q-Y5F6_1M.se.sorted.deduped.${i}.bam.bai;"; - done | singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' parallel -j 20 -k + done | singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' parallel -j 20 -k - pytest -m dedupData + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - version_markdups.txt + - version_samtools.txt + expire_in: 7 days countData: stage: unit @@ -154,14 +244,24 @@ countData: - tags except: - merge_requests + - schedules script: - - ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/geneID.tsv - - ln -s 
/project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/Entrez.tsv - - singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se.countData -s 1 -R SAM --primary --ignoreDup ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam - - singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count ./test_data/counts/small/Q-Y5F6_1M.se.countData - - singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se + - ln -s /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/metadata/geneID.tsv + - ln -s /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/metadata/Entrez.tsv + - singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/annotation/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/sequence/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se_countData -s 1 -R SAM --primary --ignoreDup ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam + - singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' Rscript ./workflow/scripts/calculateTPM.R --count ./test_data/counts/small/Q-Y5F6_1M.se_countData + - singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' Rscript ./workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se - assignedReads=$(grep -m 1 'Assigned' *.summary | grep -oe '\([0-9.]*\)') + - singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' featureCounts -v &> version_featurecounts.txt + - singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' R --version > version_r.txt - pytest -m makeFeatureCounts + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - version_featurecounts.txt + - version_r.txt + expire_in: 7 days makeBigWig: stage: unit @@ -170,9 +270,17 @@ makeBigWig: - tags except: - merge_requests + - schedules script: - - singularity run 'docker://bicf/deeptools3.3:2.0.1_indev' bamCoverage -p 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -o Q-Y5F6_1M.se.bw + - singularity run 'docker://gudmaprbk/deeptools3.5.0:1.0.0' deeptools --version > version_deeptools.txt + - singularity run 'docker://gudmaprbk/deeptools3.5.0:1.0.0' bamCoverage -p 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -o Q-Y5F6_1M.se.bw - pytest -m makeBigWig + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - version_deeptools.txt + expire_in: 7 days fastqc: stage: unit @@ -181,9 +289,18 @@ fastqc: - tags except: - merge_requests + - schedules script: - - singularity run 'docker://bicf/fastqc:2.0.1_indev' fastqc ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o . + - singularity run 'docker://gudmaprbk/fastqc0.11.9:1.0.0' fastqc --version > version_fastqc.txt + - singularity run 'docker://gudmaprbk/fastqc0.11.9:1.0.0' fastqc ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o . 
- pytest -m fastqc + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - version_fastqc.txt + expire_in: 7 days + dataQC: stage: unit @@ -194,22 +311,349 @@ dataQC: - merge_requests script: - echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > Q-Y5F6_1M.se.sorted.deduped.tin.xls - - for i in {"chr8","chr4","chrY"}; do - echo "tin.py -i ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.${i}.bam -r /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed; cat Q-Y5F6_1M.se.sorted.deduped.${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \"\\t${i}\\t\";"; done | singularity run 'docker://bicf/rseqc3.0:2.0.1_indev' parallel -j 20 -k >> Q-Y5F6_1M.se.sorted.deduped.tin.xls + - > + for i in {"chr8","chr4","chrY"}; do + echo "tin.py -i ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.${i}.bam -r /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/annotation/genome.bed; cat Q-Y5F6_1M.se.sorted.deduped.${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \"\\t${i}\\t\";" + done | singularity run 'docker://gudmaprbk/rseqc4.0.0:1.0.0' parallel -j 20 -k >> Q-Y5F6_1M.se.sorted.deduped.tin.xls - pytest -m dataQC -outputBag: +uploadInputBag: + stage: unit + only: + - push + - tags + except: + - merge_requests + - schedules + script: + - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json + - echo THIS IS A TEST FILE > test.txt + - > + md5=$(md5sum ./test.txt | awk '{ print $1 }') && + size=$(wc -c < ./test.txt) && + exist=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Input_Bag/File_MD5=${md5}) && + if [ "${exist}" == "[]" ]; then + cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') && + cookie=${cookie:11:-1} && + loc=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-hatrac-cli --host staging.gudmap.org put ./test.txt /hatrac/resources/rnaseq/pipeline/input_bag/TEST/test.txt --parents) && + rid=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' python3 ./workflow/scripts/upload_input_bag.py -f test.txt -l ${loc} -s ${md5} -b ${size} -n 'This is a test input bag' -o staging.gudmap.org -c ${cookie}) && + echo ${rid} test input bag created + else + rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT') && + rid=${rid:8:-6} && + echo ${rid} test input bag already exists + fi + +uploadExecutionRun: + stage: unit + only: + - push + - tags + except: + - merge_requests + - schedules + script: + - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json + - > + exist=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Execution_Run/Replicate=17-BTFJ) && + cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') && + cookie=${cookie:11:-1} && + if [ "${exist}" == "[]" ]; then + rid=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' python3 ./workflow/scripts/upload_execution_run.py -r 17-BTFJ -w 17-BV2Y -g 17-BV90 -i 17-BTFT -s Success -d 'This is a test execution run' -o staging.gudmap.org -c ${cookie} -u F) && + echo ${rid} test execution run created + else + rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT') && + rid=${rid:7:-6} && + rid=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' python3 ./workflow/scripts/upload_execution_run.py -r 17-BTFJ -w 17-BV2Y -g 17-BV90 -i 17-BTFT -s Success -d 'This is a test execution run' -o staging.gudmap.org -c ${cookie} -u ${rid}) && + 
echo ${rid} test execution run already exists + fi + +uploadQC: + stage: unit + only: + - push + - tags + except: + - merge_requests + - schedules + script: + - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json + - > + exist=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:mRNA_QC/Replicate=17-BTFJ) && + cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') && + cookie=${cookie:11:-1} && + if [ "${exist}" != "[]" ]; then + rids=$(echo ${exist} | grep -o '\"RID\":\".\{7\}' | sed 's/^.\{7\}//') && + for rid in ${rids}; do + singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' python3 ./workflow/scripts/delete_entry.py -r ${rid} -t mRNA_QC -o staging.gudmap.org -c ${cookie} + done + echo all old mRNA QC RIDs deleted + fi + rid=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' python3 ./workflow/scripts/upload_qc.py -r 17-BTFJ -e 17-BVDJ -p "Single Read" -s forward -l 35 -w 5 -f 1 -t 1 -n "This is a test mRNA QC" -o staging.gudmap.org -c ${cookie} -u F) + echo ${rid} test mRNA QC created + +uploadProcessedFile: stage: unit only: - push - tags except: - merge_requests + - schedules script: + - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json + - echo THIS IS A TEST FILE > 17-BTFJ_test.csv + - mkdir -p ./deriva/Seq/pipeline/17-BTFE/17-BVDJ/ + - mv 17-BTFJ_test.csv ./deriva/Seq/pipeline/17-BTFE/17-BVDJ/17-BTFJ_test.csv + - > + exist=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Processed_File/Replicate=17-BTFJ) && + cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') && + cookie=${cookie:11:-1} && + if [ "${exist}" != "[]" ]; then + rids=$(echo ${exist} | grep -o '\"RID\":\".\{7\}' | sed 's/^.\{7\}//') && + for rid in ${rids}; do + singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' python3 ./workflow/scripts/delete_entry.py -r ${rid} -t Processed_File -o staging.gudmap.org -c ${cookie} + done + echo all old processed file RIDs deleted + fi + singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-upload-cli --catalog 2 --token ${cookie:9} staging.gudmap.org ./deriva + echo test processed file uploaded - mkdir test - - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bdbag test --archiver zip + - singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' bdbag test --archiver zip + - echo test output bag created - pytest -m outputBag +uploadOutputBag: + stage: unit + only: + - push + - tags + except: + - merge_requests + - schedules + script: + - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json + - echo THIS IS A TEST FILE > test.txt + - > + md5=$(md5sum ./test.txt | awk '{ print $1 }') && + size=$(wc -c < ./test.txt) && + exist=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Output_Bag/File_MD5=${md5}) && + if [ "${exist}" == "[]" ]; then + cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') && + cookie=${cookie:11:-1} && + loc=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-hatrac-cli --host staging.gudmap.org put ./test.txt /hatrac/resources/rnaseq/pipeline/output_bag/TEST/test.txt --parents) && + rid=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' python3 ./workflow/scripts/upload_output_bag.py -e 17-BVDJ -f test.txt -l 
${loc} -s ${md5} -b ${size} -n 'This is a test output bag' -o staging.gudmap.org -c ${cookie}) && + echo ${rid} test output bag created + else + rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT') && + rid=${rid:8:-6} && + echo ${rid} test output bag already exists + fi + + +generateVersions: + stage: aggregation + only: + - push + - tags + except: + - merge_requests + - schedules + script: + - singularity run 'docker://gudmaprbk/multiqc1.9:1.0.0' multiqc --version > version_multiqc.txt + - python ./workflow/scripts/generate_versions.py -o software_versions + - python ./workflow/scripts/generate_references.py -r ./docs/references.md -o software_references + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - software_references_mqc.yaml + - software_versions_mqc.yaml + expire_in: 7 days + + +human_BioHPC: + stage: reference + only: + - push + - tags + except: + - merge_requests + - schedules + script: + - mkdir -p hu + - cp -R /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/hisat2 ./hu/ + +mouse_BioHPC: + stage: reference + only: + - push + - tags + except: + - merge_requests + - schedules + script: + - mkdir -p mo + - cp -R /project/BICF/BICF_Core/shared/gudmap/references/new/GRCh38.p13.v36/data/hisat2 ./mo/ + +human_dev: + stage: reference + only: + - push + - tags + except: + - merge_requests + - schedules + script: + - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt + - referenceBase=dev.gudmap.org + - refName=GRCh + - references=$(echo ${referenceBase}/${refName}${refHuVersion}) + - GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1) + - GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2) + - GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f3) + - query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE}/Used_Spike_Ins=false) + - curl --request GET ${query} > refQuery.json + - refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL) + - loc=$(dirname ${refURL}) + - if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi + - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') + - test=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/) + - test=$(echo ${test} | grep -o ${filename}) + - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi + +mouse_dev: + stage: reference + only: + - push + - tags + except: + - merge_requests + - schedules + script: + - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt + - referenceBase=dev.gudmap.org + - refName=GRCm + - references=$(echo ${referenceBase}/${refName}${refMoVersion}) + - GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1) + - GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2) + - GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' 
-f3) + - query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE}/Used_Spike_Ins=false) + - curl --request GET ${query} > refQuery.json + - refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL) + - loc=$(dirname ${refURL}) + - if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi + - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') + - test=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/) + - test=$(echo ${test} | grep -o ${filename}) + - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi + +human_staging: + stage: reference + only: + - push + - tags + except: + - merge_requests + - schedules + script: + - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt + - referenceBase=staging.gudmap.org + - refName=GRCh + - references=$(echo ${referenceBase}/${refName}${refHuVersion}) + - GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1) + - GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2) + - GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f3) + - query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE}/Used_Spike_Ins=false) + - curl --request GET ${query} > refQuery.json + - refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL) + - loc=$(dirname ${refURL}) + - if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi + - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') + - test=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/) + - test=$(echo ${test} | grep -o ${filename}) + - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi + +mouse_staging: + stage: reference + only: + - push + - tags + except: + - merge_requests + - schedules + script: + - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt + - referenceBase=staging.gudmap.org + - refName=GRCm + - refHuVersion=38.p6.vM22 + - references=$(echo ${referenceBase}/${refName}${refMoVersion}) + - GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1) + - GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2) + - GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' 
-f3) + - query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE}/Used_Spike_Ins=false) + - curl --request GET ${query} > refQuery.json + - refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL) + - loc=$(dirname ${refURL}) + - if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi + - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') + - test=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/) + - test=$(echo ${test} | grep -o ${filename}) + - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi + +human_prod: + stage: reference + only: + - push + - tags + except: + - merge_requests + - schedules + script: + - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt + - referenceBase=www.gudmap.org + - refName=GRCh + - references=$(echo ${referenceBase}/${refName}${refHuVersion}) + - GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1) + - GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2) + - GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f3) + - query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE}/Used_Spike_Ins=false) + - curl --request GET ${query} > refQuery.json + - refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL) + - loc=$(dirname ${refURL}) + - if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi + - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') + - test=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/) + - test=$(echo ${test} | grep -o ${filename}) + - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi + +mouse_prod: + stage: reference + only: + - push + - tags + except: + - merge_requests + - schedules + script: + - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt + - referenceBase=www.gudmap.org + - refName=GRCm + - refHuVersion=38.p6.vM22 + - references=$(echo ${referenceBase}/${refName}${refMoVersion}) + - GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1) + - GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2) + - GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' 
-f3) + - query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE}/Used_Spike_Ins=false) + - curl --request GET ${query} > refQuery.json + - refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL) + - loc=$(dirname ${refURL}) + - if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi + - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') + - test=$(singularity run 'docker://gudmaprbk/deriva1.4:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/) + - test=$(echo ${test} | grep -o ${filename}) + - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi + integration_se: stage: integration @@ -220,7 +664,7 @@ integration_se: script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 16-1ZX4 -with-dag dag.png --ci true + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 16-1ZX4 --source staging --upload true -with-dag dag.png --dev false --ci true --email 'venkat.malladi@utsouthwestern.edu,Gervaise.Henry@UTSouthwestern.edu' - find . -type f -name "multiqc_data.json" -exec cp {} ./SE_multiqc_data.json \; artifacts: name: "$CI_JOB_NAME" @@ -231,7 +675,7 @@ integration_se: - SE_multiqc_data.json expire_in: 7 days retry: - max: 1 + max: 0 when: - always @@ -244,7 +688,7 @@ integration_pe: script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5JA -with-dag dag.png --ci true + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5JA --source staging --upload true -with-dag dag.png --dev false --ci true - find . -type f -name "multiqc_data.json" -exec cp {} ./PE_multiqc_data.json \; artifacts: name: "$CI_JOB_NAME" @@ -256,10 +700,11 @@ integration_pe: - PE_multiqc_data.json expire_in: 7 days retry: - max: 1 + max: 0 when: - always + override_inputBag: stage: integration only: [merge_requests] @@ -269,7 +714,7 @@ override_inputBag: script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --inputBagForce ./test_data/bag/Replicate_Q-Y5F6.zip --ci true + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --source staging --inputBagForce ./test_data/bag/Q-Y5F6_inputBag_xxxxxxxx.zip --upload false --dev false --ci true - find . 
-type f -name "multiqc_data.json" -exec cp {} ./inputBagOverride_PE_multiqc_data.json \; artifacts: name: "$CI_JOB_NAME" @@ -278,7 +723,7 @@ override_inputBag: - inputBagOverride_PE_multiqc_data.json expire_in: 7 days retry: - max: 1 + max: 0 when: - always @@ -291,7 +736,7 @@ override_fastq: script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --fastqsForce './test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz' --ci true + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --source staging --fastqsForce './test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz' --upload false --dev false --ci true - find . -type f -name "multiqc_data.json" -exec cp {} ./fastqOverride_PE_multiqc_data.json \; artifacts: name: "$CI_JOB_NAME" @@ -300,10 +745,10 @@ override_fastq: - fastqOverride_PE_multiqc_data.json expire_in: 7 days retry: - max: 1 + max: 0 when: - always - + override_species: stage: integration only: [merge_requests] @@ -313,7 +758,7 @@ override_species: script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ER --speciesForce 'Homo sapiens' --ci true + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ER --source staging --speciesForce 'Homo sapiens' --upload false --dev false --ci true - find . -type f -name "multiqc_data.json" -exec cp {} ./speciesOverride_PE_multiqc_data.json \; artifacts: name: "$CI_JOB_NAME" @@ -322,10 +767,10 @@ override_species: - speciesOverride_PE_multiqc_data.json expire_in: 7 days retry: - max: 1 + max: 0 when: - always - + consistency: stage: consistency @@ -334,10 +779,6 @@ consistency: variables: - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - - grep -m 1 \"Assigned\":.[0-9] SE_multiqc_data.json | grep -oe '\([0-9.]*\)' > assignedSE.txt - - grep -m 1 \"Assigned\":.[0-9] PE_multiqc_data.json | grep -oe '\([0-9.]*\)' > assignedPE.txt - - echo 7742416 > assignedExpectSE.txt - - echo 2599140 > assignedExpectPE.txt - pytest -m consistencySE - pytest -m consistencyPE artifacts: @@ -346,8 +787,4 @@ consistency: paths: - SE_multiqc_data.json - PE_multiqc_data.json - - assignedSE.txt - - assignedPE.txt - - assignedExpectSE.txt - - assignedExpectPE.txt - expire_in: 7 days \ No newline at end of file + expire_in: 7 days diff --git a/.gitlab/merge_request_templates/Merge_Request.md b/.gitlab/merge_request_templates/Merge_Request.md index 88c50aa2a683175a6e6635283724b11ad308e6e5..9106ab37a0d604eca393e2d7f700a31f47e86c29 100644 --- a/.gitlab/merge_request_templates/Merge_Request.md +++ b/.gitlab/merge_request_templates/Merge_Request.md @@ -5,7 +5,9 @@ These are the most common things requested on pull requests. - [ ] This comment contains a description of changes (with reason) - [ ] If you've fixed a bug or added code that should be tested, add tests! 
- [ ] Documentation in `docs` is updated - - [ ] Replace dag.png with the most recent CI pipleine integrated_pe artifact + - [ ] Replace dag.png with the most recent CI pipeline integrated_pe artifact + - [ ] Replace software_versions_mqc.yaml with the most recent CI pipeline generateVersions artifact + - [ ] Replace software_references_mqc.yaml with the most recent CI pipeline generateVersions artifact - [ ] `CHANGELOG.md` is updated - [ ] `README.md` is updated - [ ] `LICENSE.md` is updated with new contributors diff --git a/CHANGELOG.md b/CHANGELOG.md index 4f6c9f5510c455c6d6a6e6f4ab449740afa92361..025eec05f1490b1c1d2a7a786c7611228e38f1d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,57 @@ +# v1.0.0 +**User Facing** +* Add link to reference builder script +* Output median TIN to mRNA_QC table + +**Background** +* Change consistency test to check if within +/- 5% of standard +* Change tool version checker for badges to use latest tag +* Utilize pipeline tracking and qc AWS tables + +*Known Bugs* +* Override params (inputBag, fastq, species) aren't checked for integrity +* Authentication files and tokens must be active (active auth client) for the duration of the pipeline run (until long-lived token utilization included) + +<hr> + +# v0.1.0 +**User Facing** +* Add option to pull references from datahub +* Add option to send email on workflow error, with pipeline error message +* Add versions and paper references of software used to report +* Upload input bag +* Upload execution run +* Upload mRNA QC +* Create and upload output bag +* Add option to not upload +* Update references to use bags +* Update to newer references (GRCh38.p13.v36 and GRCm38.p6.vM25) +* Use production server for data-hub reference call +* Error pipeline if submitted does not match inferred +* Update execution run with "Success" or "Error" +* Error if fastq error (>2, if pe != 2, if se !=1) +* Error if pe and line count of R1 != R2 +* Error if ambiguous species inference +* Remove non-fastq files from inputBag at the export bag config level + +**Background** +* Remove (comment out) option to pull references from S3 +* Make pull references from BioHPC default (including in biohpc.config) +* Start using new gudmaprbk dockerhub (images autobuilt) +* Moved consistency checks to be fully python +* Changed order of steps so that fastqc is done after the trim step +* Change docker images to production +* Add automated version badges +* Only calculate/report tin values on regular chromosomes (from gtf) +* Change inputBag fetch to manifest then validate (if fail fetch missing and revalidate up to 3 times) +* Retry getData and trimData processes up to once +* Make inputBag export config to create inputBag with only small txt file for CI unit test of getData (and update test) + +*Known Bugs* +* Override params (inputBag, fastq, species) aren't checked for integrity + +<hr> + # v0.0.3 **User Facing** * TPM table: diff --git a/README.md b/README.md index 49069c53ada7b1f8351684cea8ba161830369208..4639e51ee03b52d261274d95a9a3c72a7fd9a434 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,13 @@ -|*master*|*develop*| |:-:|:-:| |[](https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/commits/master)|[](https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/commits/develop)| -<!-- -[![DOI]()]() ---> +|[](https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/tree/master)|[](https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/tree/develop)| 
+|[](https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/tree/master)|[](https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/tree/develop)| + + +[](https://doi.org/10.5281/zenodo.4429316) + + RNA-Seq Analytic Pipeline for GUDMAP/RBK ======================================== @@ -11,18 +15,9 @@ Introduction ------------ This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub. It is designed to run on the HPC cluster ([BioHPC](https://portal.biohpc.swmed.edu)) at UT Southwestern Medical Center (in conjunction with the standard nextflow profile: config `biohpc.config`) - - -Cloud Compatibility: -------------------- -This pipeline is also capable of being run on AWS. To do so: -* Build a AWS batch queue and environment either manually or with [aws-cloudformantion](https://console.aws.amazon.com/cloudformation/home?#/stacks/new?stackName=Nextflow&templateURL=https://s3.amazonaws.com/aws-genomics-workflows/templates/nextflow/nextflow-aio.template.yaml) -* Edit one of the aws configs in workflow/config/ - * Replace workDir with the S3 bucket generated - * Change region if different - * Change queue to the aws batch queue generated -* The user must have awscli configured with an appropriate authentication (with `aws configure` and access keys) in the environment which nextflow will be run -* Add `-profile` with the name aws config which was customized +Authentication: +---------------- +The consortium server used must be authenticated with the [deriva authentication client](https://github.com/informatics-isi-edu/gudmap-rbk/wiki/), and must remain authenticated until the end of the pipeline run. Prematurely closing the client will invalidate the tokens and may result in pipeline failure. The use of long-lived "globus" tokens is on the roadmap for future use. 
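A minimal pre-flight sketch (illustrative only, not part of the pipeline; the file locations are the defaults used by this repository's CI and noted under NOTES below, and may differ on your system) to confirm the two authentication files are in place before launching a run:

```bash
#!/bin/bash
# Illustrative check only: verify the DERIVA credential and BDBag cookie files
# exist and are non-empty before starting nextflow, since missing or expired
# tokens cause the pipeline to fail mid-run.
CRED="${HOME}/.deriva/credential.json"      # assumed default location
COOKIES="${HOME}/.bdbag/deriva-cookies.txt" # assumed default location

for f in "${CRED}" "${COOKIES}"; do
  if [ ! -s "${f}" ]; then
    echo "Missing or empty authentication file: ${f} (re-run the deriva authentication client)" >&2
    exit 1
  fi
done
echo "Authentication files found; keep the authentication client open for the full run."
```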
To Run: ------- @@ -34,48 +29,77 @@ To Run: * **dev** = [dev.gudmap.org](dev.gudmap.org) (default, does not contain all data) * **staging** = [staging.gudmap.org](staging.gudmap.org) (does not contain all data) * **production** = [www.gudmap.org](www.gudmap.org) (***does contain all data***) - * `--refMoVersion` mouse reference version ***(optional)*** - * `--refHuVersion` human reference version ***(optional)*** - * `--refERCCVersion` human reference version ***(optional)*** + * `--refMoVersion` mouse reference version ***(optional, default = 38.p6.vM25)*** + * `--refHuVersion` human reference version ***(optional, default = 38.p13.v36)*** + * `--refERCCVersion` ERCC spike-in reference version ***(optional, default = 92)*** + * `--upload` option to upload outputs back to the data-hub ***(optional, default = false)*** + * **true** = upload outputs to the data-hub + * **false** = do *NOT* upload outputs to the data-hub * `-profile` config profile to use ***(optional)***: * default = processes on BioHPC cluster * **biohpc** = process on BioHPC cluster * **biohpc_max** = process on high power BioHPC cluster nodes (=> 128GB nodes), for resource testing * **aws_ondemand** = AWS Batch on-demand instant requests * **aws_spot** = AWS Batch spot instance requests + * `--email` email address(es) to send failure notification (comma separated) ***(optional)***: + * e.g: `--email 'Venkat.Malladi@utsouthwestern.edu,Gervaise.Henry@UTSouthwestern.edu'` + * NOTES: * once deriva-auth is run and authenticated, the two files above are saved in ```~/.deriva/``` (see official documents from [deriva](https://github.com/informatics-isi-edu/deriva-client#installer-packages-for-windows-and-macosx) on the lifetime of the credentials) * reference version consists of Genome Reference Consortium version, patch release and GENCODE annotation release # (leaving the params blank will use the default version tied to the pipeline version) - * *current mouse* **38.p6.vM22** = GRCm38.p6 with GENCODE annotation release M22 - * *current human* **38.p6.v31** = GRCh38.p12 with GENCODE annotation release 31 + * *current mouse* **38.p6.vM25** = GRCm38.p6 with GENCODE annotation release M25 + * *current human* **38.p13.v36** = GRCh38.p13 with GENCODE annotation release 36 * ***Optional*** input overrides + * `--refSource` source for pulling references + * **biohpc** = source references from the BICF_Core gudmap local reference location (workflow must be run on the BioHPC system) + * **datahub** = source references from the GUDMAP/RBK reference_table location (currently uses dev.gudmap.org) * `--inputBagForce` utilizes a local replicate inputBag instead of downloading from the data-hub (still requires accurate repRID input) - * eg: `--inputBagForce test_data/bag/Replicate_Q-Y5F6.zip` (must be the expected bag structure) + * eg: `--inputBagForce test_data/bag/Q-Y5F6_inputBag_xxxxxxxx.zip` (must be the expected bag structure, this example will not work because it is a test bag) * `--fastqsForce` utilizes local fastq's instead of downloading from the data-hub (still requires accurate repRID input) * eg: `--fastqsForce 'test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz'` (note the quotes around fastq's which must be named in the correct standard [*\*.R1.fastq.gz and/or \*.R2.fastq.gz*] and in the correct order) * `--speciesForce` forces the species to be "Mus musculus" or "Homo sapiens", it bypasses ambiguous species error * eg: `--speciesForce 'Mus musculus'` * Tracking parameters ([Tracking 
Site](http://bicf.pipeline.tracker.s3-website-us-east-1.amazonaws.com/)): * `--ci` boolean (default = false) - * `--dev` boolean (default = false) + * `--dev` boolean (default = true) FULL EXAMPLE: ------------- ``` -nextflow run workflow/rna-seq.nf --deriva ./data/credential.json --bdbag ./data/cookies.txt --repRID Q-Y5JA +nextflow run workflow/rna-seq.nf --repRID Q-Y5JA --source production --deriva ./data/credential.json --bdbag ./data/cookies.txt --dev false --upload true -profile biohpc ``` -To run a set of replicates from study RID: +Cloud Compatibility: +-------------------- +This pipeline is also capable of being run on AWS. To do so: +* Build an AWS Batch queue and environment either manually or with [aws-cloudformation](https://console.aws.amazon.com/cloudformation/home?#/stacks/new?stackName=Nextflow&templateURL=https://s3.amazonaws.com/aws-genomics-workflows/templates/nextflow/nextflow-aio.template.yaml) +* Edit one of the aws configs in workflow/config/ + * Replace workDir with the S3 bucket generated + * Change region if different + * Change queue to the aws batch queue generated +* The user must have awscli configured with an appropriate authentication (with `aws configure` and access keys) in the environment from which nextflow will be run +* Add `-profile` with the name of the aws config which was customized + +To generate your own references or new references: ------------------------------------------ -Run in repo root dir: -* `sh workflow/scripts/splitStudy.sh [studyRID]` -It will run in parallel in batches of 25 replicatesRID with 30 second delays between launches.\ -NOTE: Nextflow "local" processes for all replicates will run on the node/machine the bash script is launched from... consider running the study script on the BioHPC's SLURM cluster (use `sbatch`). +Download the [reference creation script](https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/snippets/31). +This script will automatically create human and mouse references from GENCODE. It can also create ERCC92 spike-in references and concatenate them to GENCODE references. In addition, it can create references from manually downloaded FASTA and GTF files. +Errors: +------- +Errors reported back to the data-hub are (they aren't thrown on the command line by the pipeline, but rather are submitted, if `--upload true`, to the data-hub for that replicate in the execution run submission): + +|Error|Description| +|:-|:-:| +|**Too many fastqs detected (>2)**|The data-hub standard, and that of this pipeline, is one read\-1 fastq and, if paired\-end, one read\-2 fastq. As a result, the maximum number of fastqs per replicate cannot be more than 2.| +|**Number of fastqs detected does not match submitted endness**|Single-end sequenced replicates can only have one fastq, while paired\-end can only have two (see above).| +|**Number of reads do not match for R1 and R2**|For paired\-end sequenced studies the number of reads in the read\-1 fastq must match that of read\-2. This error is usually indicative of uploading corrupted, truncated, or wrong fastq files.| +|**Inference of species returns an ambiguous result**|Species inference for the replicate is done by aligning a random subset of 1 million reads from the data to both the human and mouse reference genomes. 
If there isn't a clear difference between the alignment rates (`>=40%` of one species, but `<40%` of the other), then this error is detected.| +|**Submitted metadata does not match inferred**|All required metadata for analysis of the data is internally inferred by the pipeline; if any of those do not match the submitted metadata, this error is detected to flag a potential problem.| <hr> -[**CHANGELOG**](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/blob/develop/CHANGELOG.md) +[**CHANGELOG**](CHANGELOG.md) <hr> Credits ======= @@ -109,7 +133,7 @@ UT Southwestern Medical Center\ [johnathan.gesell@utsouthwestern.edu](mailto:jonathn.gesell@utsouthwestern.edu) Jeremy A. Mathews\ -*Computational Intern*\ +*Computational Biologist*\ Bioinformatics Core Facility\ UT Southwestern Medical Center\ <a href="https://orcid.org/0000-0002-2931-1430" target="orcid.widget" rel="noopener noreferrer" style="vertical-align:top;"><img src="https://orcid.org/sites/default/files/images/orcid_16x16.png" style="width:1em;margin-right:.5em;" alt="ORCID iD icon">orcid.org/0000-0002-2931-1430</a>\ @@ -123,4 +147,4 @@ Please cite in publications: Pipeline was developed by BICF from funding provide Pipeline Directed Acyclic Graph ------------------------------- - \ No newline at end of file + diff --git a/docs/RNA-Seq Pipeline Design Flowchart.drawio b/docs/RNA-Seq Pipeline Design Flowchart.drawio deleted file mode 100644 index 7b65c655592950fdb14c7a8b7c74a4f986816669..0000000000000000000000000000000000000000 --- a/docs/RNA-Seq Pipeline Design Flowchart.drawio +++ /dev/null @@ -1 +0,0 @@ -(single line of compressed draw.io diagram XML omitted)
id="74e2e168-ea6b-b213-b513-2b3c1d86103e">7V1Zd5u6Fv41Xuveh2QxD49Nk7S9pzlNTpq0PS9dMsg2CQYXcBP3118xCDMIAWZ2nJcYIWGs/e1J2ntrxr9fv35wwGZ1Y+vQnHGM/jrjL2ccJzASj/75LbuwheNlOWxZOoYetrH7hnvjD4wamah1a+jQTXX0bNv0jE26UbMtC2peqg04jv2S7rawzfS3bsAS5hruNWDmW78ZurcKW3mGYfY3PkJjuYq+WsE35kB7Xjr21oq+b8bxi+AvvL0G+FlRf3cFdPsl0cRfzfj3jm174af163to+pOLpy0cd11wN35vB1pelQE3Px8/6OrD5hMrf9S/boXLT/faGX6M6+3whEAdzU90aTveyl7aFjCv9q0XwY+G/mMZdLXv89m2N6iRRY1P0PN2EbHB1rNR08pbm9Fd+Gp43xOff/iPOhejq8vX6MnBxS5xcQsdYw096OA2y3N235MX4ZM4EV/vnxVc4YeFP9r/pYWTiSfG3jpa1Ovix7+vd/zPn9zPP3f2Vn76+SIaZ6wU4d0DzhJ6lLkWYqIjboI2+iHODo1zoAk843f6TUCE6mXcLxr6znHALtFhYxuW5yaefOs3oA6vmM0Y9VyImDTiUUHOIKXCGJ4pGaOq7LnAqfGf0upwQUwORx/CecBXiQndNwX4r8ELotyYF5626w3uj/jd2Zz4gxW5PH8sdVa5+3PPvucerj5yc9eRTDgMf3CyjIDni/vorx6rlAxH2oIOe0lSKbBvOLwXruHUkwapzCHPl1/+dVcf/r57v/g4N73ftzdX5pkg5BmEyErKEAyiCLX1B2GIVCL/OVak8VHD4b0wAiZjS+rDsi34pnjj01f54euvp9fvPyF8+iOrMnv11xmLXYIEb9z9fOJvP/z1+Pjp7yX439L2ft1aZyIzG4A5WJ5TzxVpjzu2nqFFHy5KJajnGX944zH9sMfJ0+jC0xgTMzB5aFVyNbJwLBP2JAgfMqYX2GPv/wT7NmFPVBb8YKjvwcEuM3HGZSHJMpQ0VV6wqqQDReHPomn/DcxtRIgZJ5mImhcLO3jPPYNIv7Y2vnHmBhB/hzqwzOZ1fxN9Wvr/b4BhoZvvEHfsPENz8VPRC4YPDrvhZncDrOIve4nW+fyvE/wJDaHMaLZpO2Gzs5yD/wR3ODQJDPHTf+OBwVMXYG2Yu3D42rZs9A4I8Kku+19J/JEzTswuwYpo3v3WYP0wvsJ0EANKoJZL/7P/YqI/4SKib1lfNu6LufWgx3D7x4Q0j+8gwbbdBDfmYL3vFZIk7uV6jv0crcj6jX6nSMT5l2xw6a7AJhyzfl36k3O+MO0XbQUc71y3te0avX448GVlePA+nHjU/QV1Dp/gT7zfEsg0v2VhmOb7gN74XfiFokFNC/sHr5W5P1dEwVe5YkAp/Kuh48HXSvMa34qndC+9RJ+D9z2TRGB2iRtM8sZLPG3RzcS9VYjx+KaUuIdkXty+TLxClubBZUz4ZGMajlG/HG5jFg3ZEYM8oyfdF2NtgsALSqg2bWWY+mews7e+KnA9oD3jK0QdRPlIKQb8lVak8Qq8f2GCOTQv4jX8kKSx25WEX/i9Pp9eR5x8+QgdHVggao6+UUGXwDSWFvqsIQL7WjQBJ/838boIFV2YpYEU3FG4OS9J8TwQNGWEqH1TXtXFwj5n7LCRXxBhA7dFjavEBgf2IIgqMqEg6sl/Lif/kS6/AMuKVM9Rto15LVRS5RONF+iIk0qYU6GFOSVaRxzTfP0tuewAHK2ZyYlNydBQlEtszgL7ssy8DK6yxmqO9KGhU430XN4aJS7WiRVdMJbliTZw7+vbTEoMIDOQbiZmuqtMBqHNzELalJaLhj1qfcAlVHqk0FN4BHPXNrcefOdoEWaD1v2VkBPzXFdyBQu/ygKcT5OBFQhyhiS748YmcoZIJAznYfzYve/6I3GnTKbsxciPlBQhy5S8FGxRbJDWMWnM0J7UaEb1rtw2B25MQwOe7wf98+my0GvLQC5lEWRZN/QGkC8ArC0wP1mbwCYkiYm0IHEjs0JKywe2H96uytptWGV0TdUlkcPHzR18aw6WhleV6nlSJ3GAqU7xAMkoiOgerWb1QGqOZIN3Revc2+epjCTRJfDARE3weA88M8ltmOTRt/0DNQ9YS39dosbX8YRvk/nMtwETGa4W4o0LXxa5OcrWs6sKIi+kQXX2YI7AQZoc2zdlmlyqqMh5tm1NfgT2fwFOmwdenXB6ME65I8ApL/fgp0qVFehkHVVsI0zYUcV4bt2aXQDX+9Wxybremp5xhg3XAux0Qsfs1niejorSjaFK1Z6tUxG9HtADnn0ThCS5HF0RMvfyQo6G6Le68CYmwZT9Dj6rsQbfCmDFQUNxe7GjDjKZcGBmmclUNbyKk4/BZBLaNZmI0U/88XugRCh0B0jyNA+TOtHMtVR6MNnlmhposoY7VrXTMNypUpVg8m3Nww0+/AzTSJqAV8GaW2wC7m+Su997DrD0OgM2xjOs1R9qBoxfab9QTn0GavSnpsB0RUjwcriK4luSIIyaEsgmAd+HlqEB810U4bI2dD0Q1za64xu+qG2F2qA1i4Jrbm3X8Aw7FQ6Dn/I50yF+Gg6gMeEiitfTDGuJGs4CdkqE27BcP6yBIytLWUNpgTMo4r2QM4pw1ZLD5O+mZdiFFFk5LYBmcVWAkSrQan33pw0HIfeq+SALzzHW09/+EZTRuWESd/RGb3vL2WJFU1jJm8LUZMpJ+2Ztb7vczXcPmug+Wupy/rz6fPttd2Wcyc3zFccO0wOshQZ4JM/yMYQB8lIPvppSXUNN1k3DWngabhpVwLa+Mu9Tew31vAs0pY2X+gQebr2eKq8G8TV+acFY4B2Xs9GDYCBt2PXnXBB3xv+Bizx3Tsm1kITxuRbHv8PTnmuB9x5KE4wJsdn0TOShjTmunjGX6d5LTFd+/75AJEzWlsNCr8aO8YC2XAGgp5PVQa1O0EOmB0GakO1jpqo0aT1AtJldz3Rk1ztwAdG8BinvS2ih1y227tox5v2tvTlw4eFmfG3eFtO8TWBtkqXQhhVP11StU3PlJ9RfLyzklF8vPSTOr+d7n23kzll9iZ0pxzdkMFWeoMFW2fRX8OOCuuMxs+XTCn51xYiD98sUo5rXi8QqPtI4bOzuFvBzg7MbhWoG2OGURYMy2G7DVlfaLXw7jRIC3SKbGOGijsR9HP9egFpD2U3WgcQafRqbAVREn1aLe10tboqcXleLaUq+dfckKCl2PQfGeFySukFFMp2Sw+0XXRiG96rc3jnr/ylPf4wvW12QCHX9gvpu03dL1H7zyit8HTGvXBwgr1wWc6Q9fnuR6DDlgUdjktROBKkfnzckqVWxJ21ITsxHUt4g5ruHN7kiDcMdAb7bdpRoU1pJ/U7FUaLaGRN2lmJYn7ylLr2lTuAzvMfUWSm26blMxRQeo9tElgV5UbAGz/DCWH4zploTF99lu8iML/acFDYj6XmlIq9ynfpO4O9b8GDdws3u+9f5w43woG7WcebvG9tYyuOMy
hVJS5LYkVDciNhPOQI7svVkEMqEVhNHUzEk6ZJ3wpZkPlWiJVMgoveIynJmcBAdxUDAQXQIQ3846LWaJ/EnEIokaYg1j6GoJzdkWif5nMvmi54TVt3U+iZUcJbqcpZw/CW5o3hS3tQZrCQHJq28+WNYBsqn4rejvQOaz7gLb4McekZzf78NRd4YE70qcuLJ13k9vt2YNtCPQJELQyZRFR8zXnWypyItqbCahrAkWl2D5ruN+fTMItYsN7dYwjZccb3jvs0tVkxjkGdTBlRpf1bp4VDXeJ3qBMsDYEkt+FuKyoLwqG5RybP1UJnt3w8qB03mO05UchVRqYwClWo9VAoCvT8+xvjQ/v2gvnlgzgn15BIvpahXh0C9lN2aL5HF2f5lKM7uJ9bt3wvqsRt1Qv0hqK9+zvx4YC+r9WCf7d8PLE+GcQNY0jIlUuUECiMB+walWtNby/bvB5Qnu7htUJJK5hTHFfS+hsDWdNdyA/rB5aDBQG8bl+wgJyY0B6bAlAzg1IYDeoE+P+iJf7WgPzDMqQXhx7kskV0G4EsgOMiygdBuzYMosv0Ny2PqXncSqMWbcr2LY6HmUkJuQD9QbTf1zF0BZ3PCalFQ5UihymWFahlUcwOkMmxnfbm6A9pmBgNou49fXu7ZL+wXV/vr31fWuCWESsQFo1MsUhA5I1LPCKoaf0N+yq2xgaZh+akJl9D1c5c45hpHy1BP9ilIw5qXJVwlYw+iZKncSTu5LCoHvdsfMA8e5fNUBD30XPFiJl76z0KM7+4DGPI8VjuwgMtEgRAyNOJqu8nAguyp6a2V+FKKq5i7G2BVAQE5COvufYKA4aOKoq6IcTzayjD1z2Bnb/0pcD2gPeOrfIxPWsgjwa4HyXH+RXAC0wUavQw6ZTLw0kEobPTjrsHaMH0KPUJHBxaYpU5QUEgoayN2pX5QkcKq54rExH9sWg4R4rlUYoxRGwFdRJUtEcpi2C+WC9YbEw4R1VWWUVfMI3kqjCYk+3TQTrpMH9UuouK01OOUCEFD5I5TPIpypJUoqJSoLlrGHMNYIpcooC3UDuMLYpS6Mjb2hNdRn+w5KXTrY+gk7oNIO1wWNzn2JEfXwEC6Pwo1jx/TRR2sZplYJ63fTOtXVfqEVWYy155UfqcqP68+ysTMsan8yWn8QoX/1mq1HELO4bQ8eVM5R0zDWkDnpugQ+enpeL7fii1qlYotMuH7ui3YQjY2Bg0v7sXaqGBG0PiiNCij6tIBN0hqR8MifwzFjjgAg7QJJAmgY9P+xUCbkPbn6FXRD1X++cqAAQyuAslHqd2X6X/vOcDSa43YGM+w3gCoGZD+VrOxVhQMdk5ubdfwDJu4o/Y50yF+Gt4dMeEiWtvRDGuJGs4CJiKfRt05Q1ROb1da4Aei9ZG3hoOlKq1vy6mimVRnK0rook7NcVhOx284HVovh8YkpWkqhDAVoow4tn2YA2BJm79yYTRmK6oqmiZS+qEYwB2soWByT2QBpZi8Y1xAIUbWCVyOlJ5hHYH27+SEcIr2Vypof7F37U+cTK6F7JbpHZhAtAhywAyDlojApPJPaQgrpn15+gu2I0+VccuEF57TEuE1FWuBLqCnYS7QIZ0lFONqtgPdATR7B/OORwyg2slJ80Lenj4Kx76TU4mPRbWfzv86QLXT+ae8PIVIyG0t6MmdlHtVKEsnKI8ayvwRQJmXe4ByPKl0TTwVO7XE2piGoUoXzxlCMU+uTVgUmUpt5o4I1mtxZroMSh6O4jsBfqrXoD5FR1NOOuus17DqFpTykSY3U52b8iSevNYtjjfpW8Xy+HTEignO2f69pOKLzUtJvaFU/Mpope3/VLARBzEJJb4eXrP9ceRAYf9Mdn3d/v3wQ7uL/Cd+mFVY4BwrR8hqPY7I9u8Hse2uXfWN2EPRqQN3FXt9nRsaHKGS6piQKsgZpHJ0pGb7H4Xsltpd+jrJbtpcV02lG6a4kFhPcGf791MycDp13lsR0iTeqoDTqrmdJBE9nmLXfqTrucDvK7zwteRpyXCurLIVw6nnAqfGf0qbw3mhj5pxzQ9OegtmThNuGVFd2YHZRWQasQt9eC/sgu2vN8Yuh3EIEfjqyE1+lpVoNcPEkkM+SoZLJcM5IRjeeEw/zIBpfmKGA5mBxWvTo+UGnlpBr5Qb6MOPixu4EzdU54b57kET3UdLXc6fV59vv+2ujDOZkAA+JmZQBN/6qcUAhCGVAFxsZDUc3g8rTHvFaECngqglxr5KKslqji9KFkoJQ0qBLfE056Ph8IZ8gS4d288e23f3QzdubB36Pf4P</diagram></mxfile> \ No newline at end of file diff --git a/docs/RNA-Seq Pipeline Design Flowchart.jpg b/docs/RNA-Seq Pipeline Design Flowchart.jpg deleted file mode 100644 index a56462a2991aecb22d5bbff493c18e3f673f2b0f..0000000000000000000000000000000000000000 Binary files a/docs/RNA-Seq Pipeline Design Flowchart.jpg and /dev/null differ diff --git a/docs/RNA-Seq Pipeline Design Process Table.docx b/docs/RNA-Seq Pipeline Design Process Table.docx deleted file mode 100644 index 21604d8f30662ffd93f8d0605b671a0921864b0c..0000000000000000000000000000000000000000 Binary files a/docs/RNA-Seq Pipeline Design Process Table.docx and /dev/null differ diff --git a/docs/RNA-Seq Pipeline Design Process Table.pdf b/docs/RNA-Seq Pipeline Design Process Table.pdf deleted file mode 100644 index 97f1d5ddfb0ae0848aa0bf8b37681db9efeb3c6b..0000000000000000000000000000000000000000 Binary files a/docs/RNA-Seq Pipeline Design Process Table.pdf and /dev/null differ diff --git a/docs/bicf_logo.png b/docs/bicf_logo.png index 0d8015590c5a94f92c39ec2470bd02baa3d09077..3c6366884ae1707d8f32e8a2bdd77c5258a9bfbc 100644 Binary files a/docs/bicf_logo.png and b/docs/bicf_logo.png differ diff --git a/docs/dag.png b/docs/dag.png index 
2a2ec56ccc0615d8b1c9c1f6ce73201c0073874d..48ae4dfa28fc0eceb4aaeb83332b22da8633ca29 100755 Binary files a/docs/dag.png and b/docs/dag.png differ diff --git a/docs/references.md b/docs/references.md index 89002c58f628df65f713dfd752bacdef9a8913ad..4ea1690ec755b51c923070352d4078634bc5e515 100644 --- a/docs/references.md +++ b/docs/references.md @@ -1 +1,43 @@ ### References + +1. **python**: + * Anaconda (Anaconda Software Distribution, [https://anaconda.com](https://anaconda.com)) + +2. **DERIVA**: + * Bugacov, A., Czajkowski, K., Kesselman, C., Kumar, A., Schuler, R. E. and Tangmunarunkit, H. 2017 Experiences with DERIVA: An Asset Management Platform for Accelerating eScience. IEEE 13th International Conference on e-Science (e-Science), Auckland, 2017, pp. 79-88, doi:[10.1109/eScience.2017.20](https://doi.org/10.1109/eScience.2017.20). + +3. **BDBag**: + * D'Arcy, M., Chard, K., Foster, I., Kesselman, C., Madduri, R., Saint, N., & Wagner, R.. 2019. Big Data Bags: A Scalable Packaging Format for Science. Zenodo. doi:[10.5281/zenodo.3338725](http://doi.org/10.5281/zenodo.3338725). + +4. **RSeQC**: + * Wang, L., Wang, S., Li, W. 2012 RSeQC: quality control of RNA-seq experiments. Bioinformatics. Aug 15;28(16):2184-5. doi:[10.1093/bioinformatics/bts356](https://doi.org/10.1093/bioinformatics/bts356). + +5. **trimgalore**: + * trimgalore [https://github.com/FelixKrueger/TrimGalore](https://github.com/FelixKrueger/TrimGalore) + +6. **hisat2**: + * Kim ,D.,Paggi, J.M., Park, C., Bennett, C., Salzberg, S.L. 2019 Graph-based genome alignment and genotyping with HISAT2 and HISAT-genotype. Nat Biotechnol. Aug;37(8):907-915. doi:[10.1038/s41587-019-0201-4](https://doi.org/10.1038/s41587-019-0201-4). + +7. **samtools**: + * Li H., B. Handsaker, A. Wysoker, T. Fennell, J. Ruan, N. Homer, G. Marth, G. Abecasis, R. Durbin, and 1000 Genome Project Data Processing Subgroup. 2009. The Sequence alignment/map (SAM) format and SAMtools. Bioinformatics 25: 2078-9. doi:[10.1093/bioinformatics/btp352](http://dx.doi.org/10.1093/bioinformatics/btp352) + +8. **picard**: + * “Picard Toolkit.” 2019. Broad Institute, GitHub Repository. [http://broadinstitute.github.io/picard/](http://broadinstitute.github.io/picard/); Broad Institute + +9. **featureCounts**: + * Liao, Y., Smyth, G.K., Shi, W. 2014 featureCounts: an efficient general purpose program for assigning sequence reads to genomic features. Bioinformatics. Apr 1;30(7):923-30. doi:[10.1093/bioinformatics/btt656](https://doi.org/10.1093/bioinformatics/btt656). + +10. **R**: + * R Core Team 2014. R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL:[http://www.R-project.org/](http://www.R-project.org/). + +11. **deeptools**: + * Ramírez, F., D. P. Ryan, B. Grüning, V. Bhardwaj, F. Kilpert, A. S. Richter, S. Heyne, F. Dündar, and T. Manke. 2016. deepTools2: a next generation web server for deep-sequencing data analysis. Nucleic Acids Research 44: W160-165. doi:[10.1093/nar/gkw257](http://dx.doi.org/10.1093/nar/gkw257) + +12. **FastQC** + * FastQC [https://www.bioinformatics.babraham.ac.uk/projects/fastqc/](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) + +13. **MultiQC**: + * Ewels P., Magnusson M., Lundin S. and Käller M. 2016. MultiQC: Summarize analysis results for multiple tools and samples in a single report. Bioinformatics 32(19): 3047–3048. doi:[10.1093/bioinformatics/btw354](https://dx.doi.org/10.1093/bioinformatics/btw354) + +14.
**Nextflow**: + * Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., and Notredame, C. 2017. Nextflow enables reproducible computational workflows. Nature biotechnology, 35(4), 316. diff --git a/docs/software_references_mqc.yaml b/docs/software_references_mqc.yaml new file mode 100755 index 0000000000000000000000000000000000000000..d9d18558b7df3f626ff89cdb01c610228db92a8b --- /dev/null +++ b/docs/software_references_mqc.yaml @@ -0,0 +1,93 @@ + + id: 'software_references' + section_name: 'Software References' + description: 'This section describes references for the tools used.' + plot_type: 'html' + data: | + + <h3 id="references">References</h3> + <ol style="list-style-type: decimal"> + <li><strong>python</strong>:</li> + </ol> + <ul> + <li>Anaconda (Anaconda Software Distribution, <a href="https://anaconda.com" class="uri">https://anaconda.com</a>)</li> + </ul> + <ol start="2" style="list-style-type: decimal"> + <li><strong>DERIVA</strong>:</li> + </ol> + <ul> + <li>Bugacov, A., Czajkowski, K., Kesselman, C., Kumar, A., Schuler, R. E. and Tangmunarunkit, H. 2017 Experiences with DERIVA: An Asset Management Platform for Accelerating eScience. IEEE 13th International Conference on e-Science (e-Science), Auckland, 2017, pp. 79-88, doi:<a href="https://doi.org/10.1109/eScience.2017.20">10.1109/eScience.2017.20</a>.</li> + </ul> + <ol start="3" style="list-style-type: decimal"> + <li><strong>BDBag</strong>:<br /> + </li> + </ol> + <ul> + <li>D'Arcy, M., Chard, K., Foster, I., Kesselman, C., Madduri, R., Saint, N., & Wagner, R.. 2019. Big Data Bags: A Scalable Packaging Format for Science. Zenodo. doi:<a href="http://doi.org/10.5281/zenodo.3338725">10.5281/zenodo.3338725</a>.</li> + </ul> + <ol start="4" style="list-style-type: decimal"> + <li><strong>RSeQC</strong>:</li> + </ol> + <ul> + <li>Wang, L., Wang, S., Li, W. 2012 RSeQC: quality control of RNA-seq experiments. Bioinformatics. Aug 15;28(16):2184-5. doi:<a href="https://doi.org/10.1093/bioinformatics/bts356">10.1093/bioinformatics/bts356</a>.</li> + </ul> + <ol start="5" style="list-style-type: decimal"> + <li><strong>trimgalore</strong>:</li> + </ol> + <ul> + <li>trimgalore <a href="https://github.com/FelixKrueger/TrimGalore" class="uri">https://github.com/FelixKrueger/TrimGalore</a></li> + </ul> + <ol start="6" style="list-style-type: decimal"> + <li><strong>hisat2</strong>:</li> + </ol> + <ul> + <li>Kim ,D.,Paggi, J.M., Park, C., Bennett, C., Salzberg, S.L. 2019 Graph-based genome alignment and genotyping with HISAT2 and HISAT-genotype. Nat Biotechnol. Aug;37(8):907-915. doi:<a href="https://doi.org/10.1038/s41587-019-0201-4">10.1038/s41587-019-0201-4</a>.</li> + </ul> + <ol start="7" style="list-style-type: decimal"> + <li><strong>samtools</strong>:</li> + </ol> + <ul> + <li>Li H., B. Handsaker, A. Wysoker, T. Fennell, J. Ruan, N. Homer, G. Marth, G. Abecasis, R. Durbin, and 1000 Genome Project Data Processing Subgroup. 2009. The Sequence alignment/map (SAM) format and SAMtools. Bioinformatics 25: 2078-9. doi:<a href="http://dx.doi.org/10.1093/bioinformatics/btp352">10.1093/bioinformatics/btp352</a></li> + </ul> + <ol start="8" style="list-style-type: decimal"> + <li><strong>picard</strong>:</li> + </ol> + <ul> + <li>“Picard Toolkit.” 2019. Broad Institute, GitHub Repository.
<a href="http://broadinstitute.github.io/picard/" class="uri">http://broadinstitute.github.io/picard/</a>; Broad Institute</li> + </ul> + <ol start="9" style="list-style-type: decimal"> + <li><strong>featureCounts</strong>:</li> + </ol> + <ul> + <li>Liao, Y., Smyth, G.K., Shi, W. 2014 featureCounts: an efficient general purpose program for assigning sequence reads to genomic features. Bioinformatics. Apr 1;30(7):923-30. doi:<a href="https://doi.org/10.1093/bioinformatics/btt656">10.1093/bioinformatics/btt656</a>.</li> + </ul> + <ol start="10" style="list-style-type: decimal"> + <li><strong>R</strong>:</li> + </ol> + <ul> + <li>R Core Team 2014. R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL:<a href="http://www.R-project.org/" class="uri">http://www.R-project.org/</a>.</li> + </ul> + <ol start="11" style="list-style-type: decimal"> + <li><strong>deeptools</strong>:</li> + </ol> + <ul> + <li>Ramírez, F., D. P. Ryan, B. Grüning, V. Bhardwaj, F. Kilpert, A. S. Richter, S. Heyne, F. Dündar, and T. Manke. 2016. deepTools2: a next generation web server for deep-sequencing data analysis. Nucleic Acids Research 44: W160-165. doi:<a href="http://dx.doi.org/10.1093/nar/gkw257">10.1093/nar/gkw257</a></li> + </ul> + <ol start="12" style="list-style-type: decimal"> + <li><strong>FastQC</strong></li> + </ol> + <ul> + <li>FastQC <a href="https://www.bioinformatics.babraham.ac.uk/projects/fastqc/" class="uri">https://www.bioinformatics.babraham.ac.uk/projects/fastqc/</a></li> + </ul> + <ol start="13" style="list-style-type: decimal"> + <li><strong>MultiQC</strong>:</li> + </ol> + <ul> + <li>Ewels P., Magnusson M., Lundin S. and Käller M. 2016. MultiQC: Summarize analysis results for multiple tools and samples in a single report. Bioinformatics 32(19): 3047–3048. doi:<a href="https://dx.doi.org/10.1093/bioinformatics/btw354">10.1093/bioinformatics/btw354</a></li> + </ul> + <ol start="14" style="list-style-type: decimal"> + <li><strong>Nextflow</strong>:</li> + </ol> + <ul> + <li>Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., and Notredame, C. 2017. Nextflow enables reproducible computational workflows. Nature biotechnology, 35(4), 316.</li> + </ul> diff --git a/docs/software_versions_mqc.yaml b/docs/software_versions_mqc.yaml new file mode 100755 index 0000000000000000000000000000000000000000..feec6dd7c15e1bfd9e5d9c5d66e08f8f1b81d7f9 --- /dev/null +++ b/docs/software_versions_mqc.yaml @@ -0,0 +1,24 @@ + + id: 'software_versions' + section_name: 'Software Versions' + section_href: 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/blob/78-tool_version/docs/RNA-Seq%20Pipeline%20Design%20Process%20Table.pdf' + plot_type: 'html' + description: 'are collected for pipeline version.'
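Both files above use MultiQC's custom-content convention: any file whose name ends in _mqc.yaml and that declares an id, section_name, and plot_type is picked up from MultiQC's search path and rendered as an extra report section, which is how the aggrQC step injects the software references and versions into the run report. A minimal sketch of exercising the two files outside the pipeline, using the MultiQC container introduced later in this diff (the scratch directory and report name are illustrative only):

  # Stage the custom-content YAML next to any existing QC output so MultiQC can discover it by suffix
  mkdir -p multiqc_test
  cp docs/software_references_mqc.yaml docs/software_versions_mqc.yaml multiqc_test/
  # report_section_order in workflow/conf/multiqc_config.yaml pins these sections to the end of the report
  singularity run 'docker://gudmaprbk/multiqc1.9:1.0.0' multiqc -c workflow/conf/multiqc_config.yaml -n test_report.html multiqc_test/

The id values only need to match the software_versions and software_references keys added to report_section_order in multiqc_config.yaml further down in this diff.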
+ data: | + <dl class="dl-horizontal"> + + <dt>Python</dt><dd>v3.8.3</dd> + <dt>DERIVA</dt><dd>v1.4.3</dd> + <dt>BDBag</dt><dd>v1.5.6</dd> + <dt>RSeQC</dt><dd>v4.0.0</dd> + <dt>Trim Galore!</dt><dd>v0.6.4_dev</dd> + <dt>HISAT2</dt><dd>v2.2.1</dd> + <dt>Samtools</dt><dd>v1.11</dd> + <dt>picard (MarkDuplicates)</dt><dd>v2.23.9</dd> + <dt>featureCounts</dt><dd>v2.0.1</dd> + <dt>R</dt><dd>v4.0.3</dd> + <dt>deepTools</dt><dd>v3.5.0</dd> + <dt>FastQC</dt><dd>v0.11.9</dd> + <dt>MultiQC</dt><dd>v1.9</dd> + <dt>Pipeline Version</dt><dd>v1.0.0</dd> + </dl> diff --git a/workflow/conf/replicate_export_config.json b/test_data/Replicate_For_Input_Bag(test).json similarity index 96% rename from workflow/conf/replicate_export_config.json rename to test_data/Replicate_For_Input_Bag(test).json index ff17fa513c5bc130a2e2bdaf9aa41b070c99b290..46fefe878c7c370792b403c4fb89d3ac79fd5c69 100644 --- a/workflow/conf/replicate_export_config.json +++ b/test_data/Replicate_For_Input_Bag(test).json @@ -1,6 +1,6 @@ { "bag": { - "bag_name": "Replicate_{rid}", + "bag_name": "{rid}_inputBag", "bag_algorithms": [ "md5" ], @@ -89,7 +89,7 @@ "processor": "fetch", "processor_params": { "output_path": "assets/Study/{Study_RID}/Experiment/{Experiment_RID}/Replicate/{Replicate_RID}", - "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/url:=URI,length:=File_size,filename:=File_Name,md5:=MD5,Study_RID,Experiment_RID,Replicate_RID?limit=none" + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/File_Type=txt/url:=URI,length:=File_size,filename:=File_Name,md5:=MD5,Study_RID,Experiment_RID,Replicate_RID?limit=none" } } ] diff --git a/test_data/createTestData.sh b/test_data/createTestData.sh index aec4e91bb71d5bb79d87cd76994f6253cfe63ea3..35fa2a4f467627a09bedd6f2675df04971c341f1 100644 --- a/test_data/createTestData.sh +++ b/test_data/createTestData.sh @@ -5,52 +5,56 @@ module load singularity/3.5.3 module load pigz/2.4 +ln -sfn /project/BICF/BICF_Core/shared/gudmap/test_data/* ../test_data/ + mkdir -p NEW_test_data -ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json +ln -sfn ./test_data/auth/credential.json ~/.deriva/credential.json mkdir -p ./NEW_test_data/bag -singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' deriva-download-cli dev.gudmap.org --catalog 2 ./workflow/conf/replicate_export_config.json . rid=Q-Y5F6 -cp Replicate_Q-Y5F6.zip ./NEW_test_data/bag/Replicate_Q-Y5F6.zip +singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-download-cli staging.gudmap.org --catalog 2 './Replicate_For_Input_Bag(test).json' . rid=Q-Y5F6 +cp Q-Y5F6_inputBag.zip ./NEW_test_data/bag/Q-Y5F6_inputBag_xxxxtest.zip +singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-download-cli staging.gudmap.org --catalog 2 ../workflow/conf/Replicate_For_Input_Bag.json . 
rid=Q-Y5F6 +cp Q-Y5F6_inputBag.zip ./NEW_test_data/bag/Q-Y5F6_inputBag_xxxxxxxx.zip mkdir -p ./NEW_test_data/fastq -unzip ./test_data/bag/Replicate_Q-Y5F6.zip -singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' bash ./workflow/scripts/bdbagFetch.sh Replicate_Q-Y5F6 Replicate_Q-Y5F6 -cp Replicate_Q-Y5F6.R1.fastq.gz ./NEW_test_data/fastq/Replicate_Q-Y5F6.R1.fastq.gz -cp Replicate_Q-Y5F6.R2.fastq.gz ./NEW_test_data/fastq/Replicate_Q-Y5F6.R2.fastq.gz +unzip ./Q-Y5F6_inputBag.zip +singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' bash ../workflow/scripts/bdbag_fetch.sh Q-Y5F6_inputBag Q-Y5F6 +cp Q-Y5F6.R1.fastq.gz ./NEW_test_data/fastq/Q-Y5F6.R1.fastq.gz +cp Q-Y5F6.R2.fastq.gz ./NEW_test_data/fastq/Q-Y5F6.R2.fastq.gz mkdir -p ./NEW_test_data/fastq/small -singularity exec 'docker://bicf/seqtk:2.0.0' seqtk sample -s100 ./NEW_test_data/fastq/Replicate_Q-Y5F6.R1.fastq.gz 1000000 1> Q-Y5F6_1M.R1.fastq -singularity exec 'docker://bicf/seqtk:2.0.0' seqtk sample -s100 ./NEW_test_data/fastq/Replicate_Q-Y5F6.R2.fastq.gz 1000000 1> Q-Y5F6_1M.R2.fastq +singularity exec 'docker://gudmaprbk/seqtk1.3:1.0.0' seqtk sample -s100 ./NEW_test_data/fastq/Q-Y5F6.R1.fastq.gz 1000000 1> Q-Y5F6_1M.R1.fastq +singularity exec 'docker://gudmaprbk/seqtk1.3:1.0.0' seqtk sample -s100 ./NEW_test_data/fastq/Q-Y5F6.R2.fastq.gz 1000000 1> Q-Y5F6_1M.R2.fastq pigz Q-Y5F6_1M.R1.fastq pigz Q-Y5F6_1M.R2.fastq cp Q-Y5F6_1M.R1.fastq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz cp Q-Y5F6_1M.R2.fastq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz mkdir -p ./NEW_test_data/meta -singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --basename Q-Y5F6_1M.se -j 20 ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename Q-Y5F6_1M.pe -j 20 ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz +singularity run 'docker://gudmaprbk/trimgalore0.6.5:1.0.0' trim_galore --gzip -q 25 --illumina --length 35 --basename Q-Y5F6_1M.se -j 20 ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz +singularity run 'docker://gudmaprbk/trimgalore0.6.5:1.0.0' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename Q-Y5F6_1M.pe -j 20 ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz cp Q-Y5F6_1M.se_trimmed.fq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz -cp Q-Y5F6_1M.pe_R1_val_1.fq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_R1_val_1.fq.gz -cp Q-Y5F6_1M.pe_R2_val_2.fq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_R2_val_2.fq.gz +cp Q-Y5F6_1M.pe_val_1.fq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_val_1.fq.gz +cp Q-Y5F6_1M.pe_val_2.fq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_val_2.fq.gz cp Q-Y5F6_1M.R1.fastq.gz_trimming_report.txt ./NEW_test_data/meta/Q-Y5F6_1M.R1.fastq.gz_trimming_report.txt cp Q-Y5F6_1M.R2.fastq.gz_trimming_report.txt ./NEW_test_data/meta/Q-Y5F6_1M.R2.fastq.gz_trimming_report.txt touch metaTest.csv -echo 'Replicate_RID,Experiment_RID,Study_RID,Paired_End,File_Type,Has_Strand_Specific_Information,Used_Spike_Ins,Species' > metaTest.csv -echo 'Replicate_RID,Experiment_RID,Study_RID,uk,FastQ,no,no,Homo sapiens' >> metaTest.csv +echo 'Replicate_RID,Experiment_RID,Study_RID,Paired_End,File_Type,Has_Strand_Specific_Information,Used_Spike_Ins,Species,Read_Length' > metaTest.csv +echo 'Replicate_RID,Experiment_RID,Study_RID,uk,FastQ,no,no,Homo sapiens,75' >> metaTest.csv cp 
metaTest.csv ./NEW_test_data/meta/metaTest.csv mkdir -p ./NEW_test_data/bam mkdir -p ./NEW_test_data/bam/small -singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.se.unal.gz -S Q-Y5F6_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./NEW_test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz --summary-file Q-Y5F6_1M.se.alignSummary.txt --new-summary -singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam -singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.bam -singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.sorted.bam.bai -singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.pe.unal.gz -S Q-Y5F6_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_R1_val_1.fq.gz -2 ./test_data/fastq/small/Q-Y5F6_1M.pe_R2_val_2.fq.gz --summary-file Q-Y5F6_1M.pe.alignSummary.txt --new-summary -singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.pe.bam Q-Y5F6_1M.pe.sam -singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.bam -singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ 20 -b Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.sorted.bam.bai +singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.se.unal.gz -S Q-Y5F6_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/new/GRCm38.p6.vM25/data/hisat2/genome --rna-strandness F -U ./NEW_test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz --summary-file Q-Y5F6_1M.se.alignSummary.txt --new-summary +singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam +singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.bam +singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.sorted.bam.bai +singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.pe.unal.gz -S Q-Y5F6_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/new/GRCm38.p6.vM25/data/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_val_1.fq.gz -2 ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_val_2.fq.gz --summary-file Q-Y5F6_1M.pe.alignSummary.txt --new-summary +singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.pe.bam Q-Y5F6_1M.pe.sam +singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.bam +singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools index -@ 20 -b Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.sorted.bam.bai cp Q-Y5F6_1M.se.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.bam cp Q-Y5F6_1M.pe.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.pe.bam cp Q-Y5F6_1M.se.sorted.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.bam @@ -60,18 +64,17 @@ cp Q-Y5F6_1M.pe.sorted.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M.pe.sorted.bam 
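In the alignment steps above, the repeated -F options to samtools view exclude reads that are unmapped (0x4), have an unmapped mate (0x8), or are secondary alignments (0x100); the three flags combine into a single mask of 268. A small sketch for decoding and reusing that mask with stock samtools, run against one of the sorted test BAMs produced above:

  # Decode the exclusion mask used by the filtering step (4 + 8 + 256 = 268)
  samtools flags UNMAP,MUNMAP,SECONDARY
  # Count the reads that would survive the same filter, expressed as one combined mask
  samtools view -c -F 268 Q-Y5F6_1M.se.sorted.bam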
cp Q-Y5F6_1M.se.alignSummary.txt ./NEW_test_data/meta/Q-Y5F6_1M.se.alignSummary.txt cp Q-Y5F6_1M.pe.alignSummary.txt ./NEW_test_data/meta/Q-Y5F6_1M.pe.alignSummary.txt -singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.bam O=Q-Y5F6_1M.se.deduped.bam M=Q-Y5F6_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true -singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.deduped.bam -singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.sorted.deduped.bam.bai +singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.bam O=Q-Y5F6_1M.se.deduped.bam M=Q-Y5F6_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true +singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.deduped.bam +singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.sorted.deduped.bam.bai cp Q-Y5F6_1M.se.deduped.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.deduped.bam cp Q-Y5F6_1M.se.sorted.deduped.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam cp Q-Y5F6_1M.se.sorted.deduped.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam.bai -cp Q-Y5F6_1M.se.deduped.Metrics.txt /NEW_test_data/meta/Q-Y5F6_1M.se.deduped.Metrics.txt cp Q-Y5F6_1M.se.deduped.Metrics.txt ./NEW_test_data/meta/Q-Y5F6_1M.se.deduped.Metrics.txt for i in {"chr8","chr4","chrY"}; do echo "samtools view -b ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam ${i} > Q-Y5F6_1M.se.sorted.deduped.${i}.bam; samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.deduped.${i}.bam Q-Y5F6_1M.se.sorted.deduped.${i}.bam.bai;"; - done | singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' parallel -j 20 -k + done | singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' parallel -j 20 -k cp Q-Y5F6_1M.se.sorted.deduped.chr4.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.chr4.bam cp Q-Y5F6_1M.se.sorted.deduped.chr4.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.chr4.bam.bai cp Q-Y5F6_1M.se.sorted.deduped.chr8.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.chr8.bam @@ -81,28 +84,30 @@ cp Q-Y5F6_1M.se.sorted.deduped.chrY.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M. 
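MarkDuplicates runs with REMOVE_DUPLICATES=true, so the deduplicated BAM should carry no duplicate-flagged reads, and the chr4/chr8/chrY subsets exist only to keep the downstream TIN tests small. A quick sanity check on the generated files, assuming samtools is available on the path:

  # Each subset should report mapped reads on its target contig only (idxstats columns: name, length, mapped, unmapped)
  samtools idxstats Q-Y5F6_1M.se.sorted.deduped.chr8.bam | awk '$3 > 0'
  # With REMOVE_DUPLICATES=true the duplicate counters in flagstat should read zero
  samtools flagstat Q-Y5F6_1M.se.sorted.deduped.bam | grep duplicates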
mkdir -p ./NEW_test_data/counts mkdir -p ./NEW_test_data/counts/small -ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/geneID.tsv -ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/Entrez.tsv -singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se.countData -s 1 -R SAM --primary --ignoreDup ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count Q-Y5F6_1M.se.countData -singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se -cp Q-Y5F6_1M.se.featureCounts ./NEW_test_data/counts/small/Q-Y5F6_1M.se.countData +ln -s /project/BICF/BICF_Core/shared/gudmap/references/new/GRCm38.p6.vM25/data/metadata/geneID.tsv +ln -s /project/BICF/BICF_Core/shared/gudmap/references/new/GRCm38.p6.vM25/data/metadata/Entrez.tsv +singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/new/GRCm38.p6.vM25/data/annotation/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/new/GRCm38.p6.vM25/data/sequence/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se_countData -s 1 -R SAM --primary --ignoreDup ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam +singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' Rscript ../workflow/scripts/calculateTPM.R --count Q-Y5F6_1M.se_countData +singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' Rscript ../workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se +cp Q-Y5F6_1M.se_countData ./NEW_test_data/counts/small/Q-Y5F6_1M.se_countData cp Q-Y5F6_1M.se.countTable.csv ./NEW_test_data/counts/small/Q-Y5F6_1M.se.countTable.csv -cp Q-Y5F6_1M.se.countTable.csv ./NEW_test_data/counts/small/Q-Y5F6_1M.se.tpmTable.csv +cp Q-Y5F6_1M.se_tpmTable.csv ./NEW_test_data/counts/small/Q-Y5F6_1M.se_tpmTable.csv mkdir -p ./NEW_test_data/bw mkdir -p ./NEW_test_data/bw/small -singularity run 'docker://bicf/deeptools3.3:2.0.0' bamCoverage -p 20 -b ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -o Q-Y5F6_1M.se.bw +singularity run 'docker://gudmaprbk/deeptools3.5.0:1.0.0' bamCoverage -p 20 -b ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -o Q-Y5F6_1M.se.bw cp Q-Y5F6_1M.se.bw ./NEW_test_data/bw/small/Q-Y5F6_1M.se.bw mkdir -p ./NEW_test_data/fastqc mkdir -p ./NEW_test_data/fastqc/small -singularity run 'docker://bicf/fastqc:2.0.0' ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o . +singularity run 'docker://gudmaprbk/fastqc0.11.9:1.0.0' fastqc ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o . 
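calculateTPM.R itself is not shown in this diff, but the transformation it applies to the featureCounts table is the standard TPM normalization: divide each gene's count by its length to get a rate, then rescale the rates so they sum to one million (TPM_i = rate_i / sum(rate) * 1e6). The awk sketch below reproduces that arithmetic assuming the default featureCounts layout, where Length is column 6 and the count is the last column; the extra gene_id attribute requested with --extraAttributes shifts the columns, so treat this as an illustration rather than the pipeline's own code:

  # Rough TPM estimate from a featureCounts table (line 1 is a comment, line 2 the header)
  awk 'NR > 2 { rate[NR] = $NF / $6; id[NR] = $1; total += rate[NR] }
       END { for (i in rate) printf "%s\t%.3f\n", id[i], rate[i] / total * 1e6 }' Q-Y5F6_1M.se_countData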
cp Q-Y5F6_1M.R1_fastqc.html ./NEW_test_data/fastqc/small/Q-Y5F6_1M.R1_fastqc.html cp Q-Y5F6_1M.R1_fastqc.zip ./NEW_test_data/fastqc/small/Q-Y5F6_1M.R1_fastqc.zip echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > Q-Y5F6_1M.se.sorted.deduped.tin.xls for i in {"chr8","chr4","chrY"}; do -echo "tin.py -i ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.${i}.bam -r /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed; cat Q-Y5F6_1M.se.sorted.deduped.${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \"\\t${i}\\t\";"; done | singularity run 'docker://bicf/rseqc3.0:2.0.0' parallel -j 20 -k >> Q-Y5F6_1M.se.sorted.deduped.tin.xls +echo "tin.py -i ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.${i}.bam -r /project/BICF/BICF_Core/shared/gudmap/references/new/GRCm38.p6.vM25/data/annotation/genome.bed; cat Q-Y5F6_1M.se.sorted.deduped.${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \"\\t${i}\\t\";"; done | singularity run 'docker://gudmaprbk/rseqc4.0.0:1.0.0' parallel -j 20 -k >> Q-Y5F6_1M.se.sorted.deduped.tin.xls cp Q-Y5F6_1M.se.sorted.deduped.tin.xls ./NEW_test_data/meta/Q-Y5F6_1M.se.sorted.deduped.tin.xls +chgrp -R BICF_Core ./NEW_test_data +chmod -R 750 ./NEW_test_data diff --git a/workflow/conf/Execution_Run_For_Output_Bag.json b/workflow/conf/Execution_Run_For_Output_Bag.json new file mode 100755 index 0000000000000000000000000000000000000000..5945b1eb8c4c5e3ec862840f232ed7a8e386d770 --- /dev/null +++ b/workflow/conf/Execution_Run_For_Output_Bag.json @@ -0,0 +1,64 @@ +{ + "bag": { + "bag_name": "Execution_Run_{rid}", + "bag_algorithms": [ + "md5" + ], + "bag_archiver": "zip", + "bag_metadata": {} + }, + "catalog": { + "catalog_id": "2", + "query_processors": [ + { + "processor": "csv", + "processor_params": { + "output_path": "Execution_Run", + "query_path": "/attribute/M:=RNASeq:Execution_Run/RID=17-BPAG/RID,Replicate_RID:=Replicate,Workflow_RID:=Workflow,Reference_Genone_RID:=Reference_Genome,Input_Bag_RID:=Input_Bag,Notes,Execution_Status,Execution_Status_Detail,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Workflow", + "query_path": "/entity/M:=RNASeq:Execution_Run/RID=17-BPAG/RNASeq:Workflow?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Reference_Genome", + "query_path": "/entity/M:=RNASeq:Execution_Run/RID=17-BPAG/RNASeq:Reference_Genome?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Input_Bag", + "query_path": "/entity/M:=RNASeq:Execution_Run/RID=17-BPAG/RNASeq:Input_Bag?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "mRNA_QC", + "query_path": "/attribute/M:=RNASeq:Execution_Run/RID=17-BPAG/(RID)=(RNASeq:mRNA_QC:Execution_Run)/RID,Execution_Run_RID:=Execution_Run,Replicate_RID:=Replicate,Paired_End,Strandedness,Median_Read_Length,Raw_Count,Final_Count,Notes,RCT,RMT?limit=none" + } + }, + { + "processor": "fetch", + "processor_params": { + "output_path": "assets/Study/{Study_RID}/Experiment/{Experiment_RID}/Replicate/{Replicate_RID}/Execution_Run/{Execution_Run_RID}/Output_Files", + "query_path": "/attribute/M:=RNASeq:Execution_Run/RID=17-BPAG/R:=RNASeq:Replicate/$M/(RID)=(RNASeq:Processed_File:Execution_Run)/url:=File_URL,length:=File_Bytes,filename:=File_Name,md5:=File_MD5,Execution_Run_RID:=M:RID,Study_RID:=R:Study_RID,Experiment_RID:=R:Experiment_RID,Replicate_RID:=R:RID?limit=none" + } + }, + { + "processor": "fetch", + "processor_params": { + "output_path": 
"assets/Study/{Study_RID}/Experiment/{Experiment_RID}/Replicate/{Replicate_RID}/Execution_Run/{Execution_Run_RID}/Input_Bag", + "query_path": "/attribute/M:=RNASeq:Execution_Run/RID=17-BPAG/R:=RNASeq:Replicate/$M/RNASeq:Input_Bag/url:=File_URL,length:=File_Bytes,filename:=File_Name,md5:=File_MD5,Execution_Run_RID:=M:RID,Study_RID:=R:Study_RID,Experiment_RID:=R:Experiment_RID,Replicate_RID:=R:RID?limit=none" + } + } + ] + } +} \ No newline at end of file diff --git a/workflow/conf/Replicate_For_Input_Bag.json b/workflow/conf/Replicate_For_Input_Bag.json new file mode 100644 index 0000000000000000000000000000000000000000..278d0bf4d9d9f5074d7e3c4ef948287eb97ed767 --- /dev/null +++ b/workflow/conf/Replicate_For_Input_Bag.json @@ -0,0 +1,97 @@ +{ + "bag": { + "bag_name": "{rid}_inputBag", + "bag_algorithms": [ + "md5" + ], + "bag_archiver": "zip" + }, + "catalog": { + "query_processors": [ + { + "processor": "csv", + "processor_params": { + "output_path": "Study", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Study_RID)=(RNASeq:Study:RID)/Study_RID:=RID,Internal_ID,Title,Summary,Overall_Design,GEO_Series_Accession_ID,GEO_Platform_Accession_ID,Funding,Pubmed_ID,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Experiment", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment:RID)/Experiment_RID:=RID,Study_RID,Internal_ID,Name,Description,Experiment_Method,Sequencing_Type,Species,Specimen_Type,Molecule_Type,Pooled_Sample,Pool_Size,Markers,Cell_Count,Treatment_Protocol,Treatment_Protocol_Reference,Isolation_Protocol,Isolation_Protocol_Reference,Growth_Protocol,Growth_Protocol_Reference,Label_Protocol,Label_Protocol_Reference,Hybridization_Protocol,Hybridization_Protocol_Reference,Scan_Protocol,Scan_Protocol_Reference,Data_Processing,Value_Definition,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Experiment Antibodies", + "query_path": "/entity/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Antibodies:Experiment_RID)?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Experiment Custom Metadata", + "query_path": "/entity/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Custom_Metadata:Experiment_RID)?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Experiment Settings", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Settings:Experiment_RID)/RID,Experiment_RID,Alignment_Format,Aligner,Aligner_Version,Reference_Genome,Sequence_Trimming,Duplicate_Removal,Pre-alignment_Sequence_Removal,Junction_Reads,Library_Type,Protocol_Reference,Library_Selection,Quantification_Format,Quantification_Software,Expression_Metric,Transcriptome_Model,Sequencing_Platform,Paired_End,Read_Length,Has_Strand_Specific_Information,Used_Spike_Ins,Spike_Ins_Amount,Visualization_Format,Visualization_Software,Visualization_Version,Visualization_Setting,Notes,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Replicate", + "query_path": 
"/attribute/M:=RNASeq:Replicate/RID={rid}/RID,Study_RID,Experiment_RID,Biological_Replicate_Number,Technical_Replicate_Number,Specimen_RID,Collection_Date,Mapped_Reads,GEO_Sample_Accession_ID,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Specimen", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/S:=(Specimen_RID)=(Gene_Expression:Specimen:RID)/T:=left(Stage_ID)=(Vocabulary:Developmental_Stage:ID)/$S/RID,Title,Species,Stage_ID,Stage_Name:=T:Name,Stage_Detail,Assay_Type,Strain,Wild_Type,Sex,Passage,Phenotype,Cell_Line,Parent_Specimen,Upload_Notes,Preparation,Fixation,Embedding,Internal_ID,Principal_Investigator,Consortium,Release_Date,RCT,RMT,GUDMAP2_Accession_ID?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Specimen_Anatomical_Source", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Specimen_RID)=(Gene_Expression:Specimen:RID)/(RID)=(Gene_Expression:Specimen_Tissue:Specimen_RID)/RID,Specimen_RID,Tissue,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Specimen_Cell_Types", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Specimen_RID)=(Gene_Expression:Specimen:RID)/(RID)=(Gene_Expression:Specimen_Cell_Type:Specimen)/RID,Specimen_RID:=Specimen,Cell_Type,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Single Cell Metrics", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:Single_Cell_Metrics:Replicate_RID)/RID,Study_RID,Experiment_RID,Replicate_RID,Reads_%28Millions%29,Reads%2FCell,Detected_Gene_Count,Genes%2FCell,UMI%2FCell,Estimated_Cell_Count,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "File", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/RID,Study_RID,Experiment_RID,Replicate_RID,Caption,File_Type,File_Name,URI,File_size,MD5,GEO_Archival_URL,dbGaP_Accession_ID,Processed,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT,Legacy_File_RID,GUDMAP_NGF_OID,GUDMAP_NGS_OID?limit=none" + } + }, + { + "processor": "fetch", + "processor_params": { + "output_path": "assets/Study/{Study_RID}/Experiment/{Experiment_RID}/Replicate/{Replicate_RID}", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/File_Type=FastQ/url:=URI,length:=File_size,filename:=File_Name,md5:=MD5,Study_RID,Experiment_RID,Replicate_RID?limit=none" + } + } + ] + } +} diff --git a/workflow/conf/aws.config b/workflow/conf/aws.config index b5054f724c810b4eeaa01ae03e6db1ae421ab0cc..cdd91da7bb7ede67aa9a004ab4d617186edee334 100644 --- a/workflow/conf/aws.config +++ b/workflow/conf/aws.config @@ -1,3 +1,7 @@ +params { + refSource = "aws" +} + workDir = 's3://gudmap-rbk.output/work' aws.client.storageEncryption = 'AES256' aws { @@ -12,75 +16,107 @@ process { cpus = 1 memory = '1 GB' - withName: trackStart { + withName:trackStart { cpus = 1 memory = '1 GB' } - withName: getBag { + withName:getBag { cpus = 1 memory = '1 GB' } - withName: getData { + withName:getData { cpus = 1 memory = '1 GB' } - withName: parseMetadata { + withName:parseMetadata { cpus = 15 memory = '1 GB' } - withName: trimData { + withName:trimData { cpus = 20 memory = '2 GB' } - withName: getRefInfer { + withName:getRefInfer { cpus = 1 memory = '1 GB' } - withName: downsampleData { + 
withName:downsampleData { cpus = 1 memory = '1 GB' } - withName: alignSampleData { + withName:alignSampleData { cpus = 50 memory = '5 GB' } - withName: inferMetadata { + withName:inferMetadata { cpus = 5 memory = '1 GB' } - withName: getRef { + withName:checkMetadata { cpus = 1 memory = '1 GB' } - withName: alignData { + withName:getRef { + cpus = 1 + memory = '1 GB' + } + withName:alignData { cpus = 50 memory = '10 GB' } - withName: dedupData { + withName:dedupData { cpus = 5 memory = '20 GB' } - withName: countData { + withName:countData { cpus = 2 memory = '5 GB' } - withName: makeBigWig { + withName:makeBigWig { cpus = 15 memory = '5 GB' } - withName: fastqc { + withName:fastqc { cpus = 1 memory = '1 GB' } - withName: dataQC { + withName:dataQC { cpus = 15 memory = '2 GB' } - withName: aggrQC { + withName:aggrQC { cpus = 2 memory = '1 GB' } - withName: outputBag { + withName:uploadInputBag { + cpus = 1 + memory = '1 GB' + } + withName:uploadExecutionRun { + cpus = 1 + memory = '1 GB' + } + withName:uploadQC { + cpus = 1 + memory = '1 GB' + } + withName:uploadProcessedFile { + cpus = 1 + memory = '1 GB' + } + withName:uploadOutputBag { + cpus = 1 + memory = '1 GB' + } + withName:finalizeExecutionRun { + cpus = 1 + memory = '1 GB' + } + withName:failPreExecutionRun { + cpus = 1 + memory = '1 GB' + } + withName:failExecutionRun { cpus = 1 memory = '1 GB' } diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index efe86bea962eca3577471efec640248748950625..8cfc34ad4e8deea2735e0e49f613dfa48bb6defc 100755 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -1,60 +1,94 @@ +params { + refSource = "biohpc" +} + process { executor = 'slurm' queue = 'super' clusterOptions = '--hold' + time = '4h' + errorStrategy = 'retry' + maxRetries = 1 - withName: trackStart { + withName:trackStart { executor = 'local' } - withName: getBag { + withName:getBag { executor = 'local' } - withName: getData { + withName:getData { queue = 'super' } - withName: parseMetadata { + withName:parseMetadata { executor = 'local' } - withName: trimData { + withName:trimData { queue = 'super' } - withName: getRefInfer { + withName:getRefInfer { queue = 'super' } - withName: downsampleData { + withName:downsampleData { executor = 'local' } - withName: alignSampleData { + withName:alignSampleData { queue = 'super' } - withName: inferMetadata { + withName:inferMetadata { queue = 'super' } - withName: getRef { + withName:checkMetadata { + executor = 'local' + } + withName:getRef { queue = 'super' } - withName: alignData { + withName:alignData { queue = '256GB,256GBv1' } - withName: dedupData { + withName:dedupData { queue = 'super' } - withName: countData { + withName:countData { queue = 'super' } - withName: makeBigWig { + withName:makeBigWig { queue = 'super' } - withName: fastqc { + withName:fastqc { queue = 'super' } - withName: dataQC { + withName:dataQC { queue = 'super' } - withName: aggrQC { + withName:aggrQC { + executor = 'local' + } + withName:uploadInputBag { + executor = 'local' + } + withName:uploadExecutionRun { + executor = 'local' + } + withName:uploadQC { + executor = 'local' + } + withName:uploadProcessedFile { + executor = 'local' + } + withName:uploadOutputBag { + executor = 'local' + } + withName:finalizeExecutionRun { + executor = 'local' + } + withName:failPreExecutionRun { + executor = 'local' + } + withName:failPreExecutionRun { executor = 'local' } - withName: outputBag { + withName:failExecutionRun { executor = 'local' } } diff --git 
a/workflow/conf/multiqc_config.yaml b/workflow/conf/multiqc_config.yaml index 0c780d967f2859c1c76e02a63f3348c26049c694..ed1375aed47a454394029e5057695b0c15babd8c 100644 --- a/workflow/conf/multiqc_config.yaml +++ b/workflow/conf/multiqc_config.yaml @@ -56,6 +56,10 @@ report_section_order: order: 2000 ref: order: 1000 + software_versions: + order: -1000 + software_references: + order: -2000 skip_generalstats: true @@ -70,10 +74,14 @@ custom_data: scale: false format: '{}' headers: - Session - Session ID - Pipeline Version - Input + Session: + description: '' + Session ID: + description: 'Nextflow session ID' + Pipeline Version: + description: 'BICF pipeline version' + Input: + description: 'Input overrides' rid: file_format: 'tsv' section_name: 'RID' @@ -84,10 +92,14 @@ custom_data: scale: false format: '{}' headers: - Replicate - Replicate RID - Experiment RID - Study RID + Replicate: + description: '' + Replicate RID: + description: 'Replicate RID' + Experiment RID: + description: 'Experiment RID' + Study RID: + description: 'Study RID' meta: file_format: 'tsv' section_name: 'Metadata' @@ -98,30 +110,43 @@ custom_data: scale: false format: '{:,.0f}' headers: - Source - Species - Ends - Stranded - Spike-in - Raw Reads - Assigned Reads - Median Read Length - Median TIN - Pipeline Version + Source: + description: 'Metadata source' + Species: + description: 'Species' + Ends: + description: 'Single or paired end sequencing' + Stranded: + description: 'Stranded (forward/reverse) or unstranded library prep' + Spike-in: + description: 'ERCC spike in' + Raw Reads: + description: 'Number of reads of the sequencer' + Assigned Reads: + description: 'Final reads after fintering' + Median Read Length: + description: 'Average read length' + Median TIN: + description: 'Average transcript integrity number' + ref: file_format: 'tsv' section_name: 'Reference' - description: 'This is the referenec version information' + description: 'This is the reference version information' plot_type: 'table' pconfig: id: 'ref' scale: false format: '{}' headers: - Species - Genome Reference Consortium Build - Genome Reference Consortium Patch - GENCODE Annotation Release" + Species: + description: 'Reference species' + Genome Reference Consortium Build: + description: 'Reference source build' + Genome Reference Consortium Patch: + description: 'Reference source patch version' + GENCODE Annotation Release: + description: 'Annotation release version' tin: file_format: 'tsv' section_name: 'TIN' @@ -131,16 +156,16 @@ custom_data: id: 'tin' headers: chrom - 0 - 9 - 10 - 19 - 20 - 29 - 30 - 39 - 40 - 49 - 50 - 59 - 60 - 69 - 70 - 79 - 80 - 89 - 90 - 99 + 1 - 10 + 11 - 20 + 21 - 30 + 31 - 40 + 41 - 50 + 51 - 60 + 61 - 70 + 71 - 80 + 81 - 90 + 91 - 100 sp: run: @@ -152,4 +177,4 @@ sp: ref: fn: 'reference.tsv' tin: - fn: '*.tin.hist.tsv' \ No newline at end of file + fn: '*_tin.hist.tsv' diff --git a/workflow/nextflow.config b/workflow/nextflow.config index 14798a1f5b0ca1fda7aa79b178bd9f556875be42..3b982bfbd4b12896c957e6f0af7cbc130e4c3039 100644 --- a/workflow/nextflow.config +++ b/workflow/nextflow.config @@ -20,61 +20,85 @@ profiles { process { withName:getBag { - container = 'bicf/gudmaprbkfilexfer:2.0.1_indev' + container = 'gudmaprbk/deriva1.4:1.0.0' } withName:getData { - container = 'bicf/gudmaprbkfilexfer:2.0.1_indev' + container = 'gudmaprbk/deriva1.4:1.0.0' } - withName: parseMetadata { - container = 'bicf/python3:2.0.1_indev' + withName:parseMetadata { + container = 'gudmaprbk/python3:1.0.0' } - withName: trimData { - 
container = 'bicf/trimgalore:1.1' + withName:trimData { + container = 'gudmaprbk/trimgalore0.6.5:1.0.0' } - withName: getRefInfer { - container = 'bicf/awscli:1.1' + withName:getRefInfer { + container = 'gudmaprbk/deriva1.4:1.0.0' } - withName: downsampleData { - container = 'bicf/seqtk:2.0.1_indev' + withName:downsampleData { + container = 'gudmaprbk/seqtk1.3:1.0.0' } - withName: alignSampleData { - container = 'bicf/gudmaprbkaligner:2.0.1_indev' + withName:alignSampleData { + container = 'gudmaprbk/hisat2.2.1:1.0.0' } - withName: inferMetadata { - container = 'bicf/rseqc3.0:2.0.1_indev' + withName:inferMetadata { + container = 'gudmaprbk/rseqc4.0.0:1.0.0' } - withName: getRef { - container = 'bicf/awscli:1.1' + withName:checkMetadata { + container = 'gudmaprbk/gudmap-rbk_base:1.0.0' } - withName: alignData { - container = 'bicf/gudmaprbkaligner:2.0.1_indev' + withName:getRef { + container = 'gudmaprbk/deriva1.4:1.0.0' } - withName: dedupData { - container = 'bicf/gudmaprbkdedup:2.0.0' + withName:alignData { + container = 'gudmaprbk/hisat2.2.1:1.0.0' } - withName: countData { - container = 'bicf/subread2:2.0.0' + withName:dedupData { + container = 'gudmaprbk/picard2.23.9:1.0.0' } - withName: makeBigWig { - container = 'bicf/deeptools3.3:2.0.1_indev' + withName:countData { + container = 'gudmaprbk/subread2.0.1:1.0.0' } - withName: fastqc { - container = 'bicf/fastqc:2.0.1_indev' + withName:makeBigWig { + container = 'gudmaprbk/deeptools3.5.0:1.0.0' } - withName: dataQC { - container = 'bicf/rseqc3.0:2.0.1_indev' + withName:fastqc { + container = 'gudmaprbk/fastqc0.11.9:1.0.0' } - withName: aggrQC { - container = 'bicf/multiqc1.8:2.0.1_indev' + withName:dataQC { + container = 'gudmaprbk/rseqc4.0.0:1.0.0' } - withName:outputBag { - container = 'bicf/gudmaprbkfilexfer:2.0.1_indev' + withName:aggrQC { + container = 'gudmaprbk/multiqc1.9:1.0.0' + } + withName:uploadInputBag { + container = 'gudmaprbk/deriva1.4:1.0.0' + } + withName:uploadExecutionRun { + container = 'gudmaprbk/deriva1.4:1.0.0' + } + withName:uploadQC { + container = 'gudmaprbk/deriva1.4:1.0.0' + } + withName:uploadProcessedFile { + container = 'gudmaprbk/deriva1.4:1.0.0' + } + withName:uploadOutputBag { + container = 'gudmaprbk/deriva1.4:1.0.0' + } + withName:finalizeExecutionRun { + container = 'gudmaprbk/deriva1.4:1.0.0' + } + withName:failPreExecutionRun { + container = 'gudmaprbk/deriva1.4:1.0.0' + } + withName:failExecutionRun { + container = 'gudmaprbk/deriva1.4:1.0.0' } } trace { enabled = false - file = 'pipeline_trace.txt' + file = 'trace.txt' fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss' } @@ -82,7 +106,7 @@ timeline { enabled = false file = 'timeline.html' } - + report { enabled = false file = 'report.html' @@ -94,9 +118,10 @@ tower { } manifest { + name = 'gudmap_rbk/rna-seq' homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq' description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.' 
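With every process now pinned to a versioned gudmaprbk container and the manifest bumped to v1.0.0, a run only needs the parameters defined in rna-seq.nf below plus one of the execution profiles that these config files back. The invocation below is a sketch, not a documented command: it assumes the profile is named biohpc to match conf/biohpc.config, and it reuses the credential and cookie defaults already present in the params block.

  # Hypothetical run of replicate Q-Y5F6 against the staging catalog with BioHPC-hosted references
  nextflow run workflow/rna-seq.nf \
    -profile biohpc \
    --deriva ./test_data/auth/credential.json \
    --bdbag ./test_data/auth/cookies.txt \
    --repRID Q-Y5F6 \
    --source staging \
    --refSource biohpc \
    --outDir ./output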
mainScript = 'rna-seq.nf' - version = 'v0.0.3' + version = 'v1.0.0' nextflowVersion = '>=19.09.0' } diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index d5c86e2cc0a4bddbeaa6d674cfd0eb8944c83362..9ee700aa0a42a07936f7f1295fed3a082cb903f4 100644 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -1,33 +1,55 @@ #!/usr/bin/env nextflow -// ######## #### ###### ######## -// ## ## ## ## ## ## -// ## ## ## ## ## -// ######## ## ## ###### -// ## ## ## ## ## -// ## ## ## ## ## ## -// ######## #### ###### ## +// ######## #### ###### ######## +// ## ## ## ## ## ## +// ## ## ## ## ## +// ######## ## ## ###### +// ## ## ## ## ## +// ## ## ## ## ## ## +// ######## #### ###### ## // Define input variables params.deriva = "${baseDir}/../test_data/auth/credential.json" params.bdbag = "${baseDir}/../test_data/auth/cookies.txt" //params.repRID = "16-1ZX4" -params.repRID = "Q-Y5JA" +params.repRID = "Q-Y5F6" params.source = "dev" -params.refMoVersion = "38.p6.vM22" -params.refHuVersion = "38.p12.v31" +params.refMoVersion = "38.p6.vM25" +params.refHuVersion = "38.p13.v36" params.refERCCVersion = "92" params.outDir = "${baseDir}/../output" +params.upload = false +params.email = "" + // Define override input variable +params.refSource = "biohpc" params.inputBagForce = "" params.fastqsForce = "" params.speciesForce = "" +// Define tracking input variables +params.ci = false +params.dev = true + + // Parse input variables deriva = Channel .fromPath(params.deriva) .ifEmpty { exit 1, "deriva credential file not found: ${params.deriva}" } +deriva.into { + deriva_getBag + deriva_getRefInfer + deriva_getRef + deriva_uploadInputBag + deriva_uploadExecutionRun + deriva_uploadQC + deriva_uploadProcessedFile + deriva_uploadOutputBag + deriva_finalizeExecutionRun + deriva_failPreExecutionRun + deriva_failExecutionRun +} bdbag = Channel .fromPath(params.bdbag) .ifEmpty { exit 1, "deriva cookie file for bdbag not found: ${params.bdbag}" } @@ -37,12 +59,15 @@ refHuVersion = params.refHuVersion refERCCVersion = params.refERCCVersion outDir = params.outDir logsDir = "${outDir}/Logs" +upload = params.upload inputBagForce = params.inputBagForce fastqsForce = params.fastqsForce speciesForce = params.speciesForce +email = params.email -// Define fixed files -derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json") +// Define fixed files and variables +replicateExportConfig = Channel.fromPath("${baseDir}/conf/Replicate_For_Input_Bag.json") +executionRunExportConfig = Channel.fromPath("${baseDir}/conf/Execution_Run_For_Output_Bag.json") if (params.source == "dev") { source = "dev.gudmap.org" } else if (params.source == "staging") { @@ -50,32 +75,46 @@ if (params.source == "dev") { } else if (params.source == "production") { source = "www.gudmap.org" } -referenceBase = "s3://bicf-references" -//referenceBase = "/project/BICF/BICF_Core/shared/gudmap/references" +if (params.refSource == "biohpc") { + referenceBase = "/project/BICF/BICF_Core/shared/gudmap/references/new" +} else if (params.refSource == "datahub") { + referenceBase = "www.gudmap.org" +} referenceInfer = Channel.fromList(["ERCC","GRCh","GRCm"]) multiqcConfig = Channel.fromPath("${baseDir}/conf/multiqc_config.yaml") bicfLogo = Channel.fromPath("${baseDir}/../docs/bicf_logo.png") +softwareReferences = Channel.fromPath("${baseDir}/../docs/software_references_mqc.yaml") +softwareVersions = Channel.fromPath("${baseDir}/../docs/software_versions_mqc.yaml") // Define script files -script_bdbagFetch = 
Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh") -script_parseMeta = Channel.fromPath("${baseDir}/scripts/parseMeta.py") -script_inferMeta = Channel.fromPath("${baseDir}/scripts/inferMeta.sh") +script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbag_fetch.sh") +script_parseMeta = Channel.fromPath("${baseDir}/scripts/parse_meta.py") +script_inferMeta = Channel.fromPath("${baseDir}/scripts/infer_meta.sh") +script_refDataInfer = Channel.fromPath("${baseDir}/scripts/extract_ref_data.py") +script_refData = Channel.fromPath("${baseDir}/scripts/extract_ref_data.py") script_calculateTPM = Channel.fromPath("${baseDir}/scripts/calculateTPM.R") script_convertGeneSymbols = Channel.fromPath("${baseDir}/scripts/convertGeneSymbols.R") -script_tinHist = Channel.fromPath("${baseDir}/scripts/tinHist.py") +script_tinHist = Channel.fromPath("${baseDir}/scripts/tin_hist.py") +script_uploadInputBag = Channel.fromPath("${baseDir}/scripts/upload_input_bag.py") +script_uploadExecutionRun_uploadExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py") +script_uploadExecutionRun_finalizeExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py") +script_uploadExecutionRun_failPreExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py") +script_uploadExecutionRun_failExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py") +script_uploadQC = Channel.fromPath("${baseDir}/scripts/upload_qc.py") +script_uploadOutputBag = Channel.fromPath("${baseDir}/scripts/upload_output_bag.py") +script_deleteEntry_uploadQC = Channel.fromPath("${baseDir}/scripts/delete_entry.py") +script_deleteEntry_uploadProcessedFile = Channel.fromPath("${baseDir}/scripts/delete_entry.py") /* * trackStart: track start of pipeline */ -params.ci = false -params.dev = false process trackStart { - container 'docker://bicf/bicfbase:2.1.0' + container 'docker://gudmaprbk/gudmap-rbk_base:1.0.0' script: """ hostname ulimit -a - + curl -H 'Content-Type: application/json' -X PUT -d \ '{ \ "sessionId": "${workflow.sessionId}", \ @@ -90,6 +129,18 @@ process trackStart { "dev": ${params.dev} \ }' \ "https://xku43pcwnf.execute-api.us-east-1.amazonaws.com/ProdDeploy/pipeline-tracking" + + curl -H 'Content-Type: application/json' -X PUT -d \ + '{ \ + "ID": "${workflow.sessionId}", \ + "repRID": "${repRID}", \ + "PipelineVersion": "${workflow.manifest.version}", \ + "Server": "${params.source}", \ + "Queued": "NA", \ + "CheckedOut": "NA", \ + "Started": "${workflow.start}" \ + }' \ + "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track" """ } @@ -102,7 +153,9 @@ Source Server : ${params.source} Mouse Reference Version: ${params.refMoVersion} Human Reference Version: ${params.refHuVersion} ERCC Reference Version : ${params.refERCCVersion} +Reference source : ${params.refSource} Output Directory : ${params.outDir} +Upload : ${upload} ------------------------------------ Nextflow Version : ${workflow.nextflow.version} Pipeline Version : ${workflow.manifest.version} @@ -114,18 +167,18 @@ Development : ${params.dev} """ /* - * splitData: split bdbag files by replicate so fetch can occure in parallel, and rename files to replicate rid + * getBag: download input bag */ process getBag { tag "${repRID}" - publishDir "${outDir}/inputBag", mode: 'copy', pattern: "Replicate_*.zip" + publishDir "${outDir}/inputBag", mode: 'copy', pattern: "*_inputBag_*.zip" input: - path credential, stageAs: "credential.json" from deriva - path derivaConfig + path credential, stageAs: 
"credential.json" from deriva_getBag + path replicateExportConfig output: - path ("Replicate_*.zip") into bag + path ("*.zip") into bag when: inputBagForce == "" @@ -143,8 +196,15 @@ process getBag { # deriva-download replicate RID echo -e "LOG: fetching bag for ${repRID} in GUDMAP" >> ${repRID}.getBag.log - deriva-download-cli ${source} --catalog 2 ${derivaConfig} . rid=${repRID} + deriva-download-cli ${source} --catalog 2 ${replicateExportConfig} . rid=${repRID} echo -e "LOG: fetched" >> ${repRID}.getBag.log + + name=\$(ls *.zip) + name=\$(basename \${name} | cut -d "." -f1) + yr=\$(date +'%Y') + mn=\$(date +'%m') + dy=\$(date +'%d') + mv \${name}.zip \${name}_\${yr}\${mn}\${dy}.zip """ } @@ -156,9 +216,13 @@ if (inputBagForce != "") { } else { inputBag = bag } +inputBag.into { + inputBag_getData + inputBag_uploadInputBag +} /* - * getData: fetch study files from consortium with downloaded bdbag.zip + * getData: fetch replicate files from consortium with downloaded bdbag.zip */ process getData { tag "${repRID}" @@ -166,13 +230,14 @@ process getData { input: path script_bdbagFetch path cookies, stageAs: "deriva-cookies.txt" from bdbag - path inputBag + path inputBag from inputBag_getData output: path ("*.R{1,2}.fastq.gz") into fastqs path ("**/File.csv") into fileMeta path ("**/Experiment Settings.csv") into experimentSettingsMeta path ("**/Experiment.csv") into experimentMeta + path "fastqCount.csv" into fastqCount_fl script: """ @@ -184,36 +249,45 @@ process getData { mkdir -p ~/.bdbag ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt echo -e "LOG: linked" >> ${repRID}.getData.log - + # get bag basename - replicate=\$(basename "${inputBag}" | cut -d "." -f1) + replicate=\$(basename "${inputBag}") echo -e "LOG: bag replicate name \${replicate}" >> ${repRID}.getData.log - + # unzip bag echo -e "LOG: unzipping replicate bag" >> ${repRID}.getData.log unzip ${inputBag} echo -e "LOG: unzipped" >> ${repRID}.getData.log - + # bag fetch fastq's only and rename by repRID echo -e "LOG: fetching replicate bdbag" >> ${repRID}.getData.log - sh ${script_bdbagFetch} \${replicate} ${repRID} + sh ${script_bdbagFetch} \${replicate::-13} ${repRID} echo -e "LOG: fetched" >> ${repRID}.getData.log + + fastqCount=\$(ls *.fastq.gz | wc -l) + echo "\${fastqCount}" > fastqCount.csv """ } +// Split fastq count into channel +fastqCount = Channel.create() +fastqCount_fl.splitCsv(sep: ",", header: false).separate( + fastqCount +) + // Set raw fastq to downloaded or forced input and replicate them for multiple process inputs if (fastqsForce != "") { Channel .fromPath(fastqsForce) .ifEmpty { exit 1, "override inputBag file not found: ${fastqsForce}" } .collect().into { + fastqs_parseMetadata fastqs_trimData - fastqs_fastqc } } else { fastqs.into { + fastqs_parseMetadata fastqs_trimData - fastqs_fastqc } } @@ -228,9 +302,12 @@ process parseMetadata { path file from fileMeta path experimentSettings, stageAs: "ExperimentSettings.csv" from experimentSettingsMeta path experiment from experimentMeta + path (fastq) from fastqs_parseMetadata + val fastqCount output: - path "design.csv" into metadata + path "design.csv" into metadata_fl + path "fastqError.csv" into fastqError_fl script: """ @@ -244,15 +321,28 @@ process parseMetadata { # get experiment RID metadata exp=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p expRID) echo -e "LOG: experiment RID metadata parsed: \${exp}" >> ${repRID}.parseMetadata.log - + # get study RID metadata study=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" 
-p studyRID) echo -e "LOG: study RID metadata parsed: \${study}" >> ${repRID}.parseMetadata.log # get endedness metadata - endsMeta=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p endsMeta) - echo -e "LOG: endedness metadata parsed: \${endsMeta}" >> ${repRID}.parseMetadata.log - + endsRaw=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p endsMeta) + echo -e "LOG: endedness metadata parsed: \${endsRaw}" >> ${repRID}.parseMetadata.log + if [ "\${endsRaw}" == "Single Read" ] + then + endsMeta="se" + elif [ "\${endsRaw}" == "Paired End" ] + then + endsMeta="pe" + else + endsMeta="unknown" + fi + if [ "\${endsRaw}" == "" ] + then + endsRaw="_No value_" + fi + # ganually get endness endsManual=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p endsManual) echo -e "LOG: endedness manually detected: \${endsManual}" >> ${repRID}.parseMetadata.log @@ -260,30 +350,65 @@ process parseMetadata { # get strandedness metadata stranded=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p stranded) echo -e "LOG: strandedness metadata parsed: \${stranded}" >> ${repRID}.parseMetadata.log - + # get spike-in metadata spike=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p spike) echo -e "LOG: spike-in metadata parsed: \${spike}" >> ${repRID}.parseMetadata.log - + # get species metadata species=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experiment}" -p species) echo -e "LOG: species metadata parsed: \${species}" >> ${repRID}.parseMetadata.log # get read length metadata readLength=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p readLength) - if [ "\${readLength}" = "nan"] + if [ "\${readLength}" = "nan" ] then readLength="NA" fi echo -e "LOG: read length metadata parsed: \${readLength}" >> ${repRID}.parseMetadata.log + # check not incorrect number of fastqs + fastqCountError=false + fastqCountError_details="" + if [ "${fastqCount}" -gt "2" ] + then + fastqCountError=true + fastqCountError_details="**Too many fastqs detected (>2)**" + elif [ "\${endsMeta}" == "se" ] && [ "${fastqCount}" -ne "1" ] + then + fastqCountError=true + fastqCountError_details="**Number of fastqs detected does not match submitted endness**" + elif [ "\${endsMeta}" == "pe" ] && [ "${fastqCount}" -ne "2" ] + then + fastqCountError=true + fastqCountError_details="**Number of fastqs detected does not match submitted endness**" + fi + + # check read counts match for fastqs + fastqReadError=false + fastqReadError_details="" + if [ "\${endsManual}" == "pe" ] + then + r1Count=\$(zcat ${fastq[0]} | wc -l) + r2Count=\$(zcat ${fastq[1]} | wc -l) + if [ "\${r1Count}" -ne "\${r2Count}" ] + then + fastqReadError=true + fastqReadError_details="**Number of reads do not match for R1 and R2:** there may be a trunkation or mismatch of fastq files" + fi + fi + # save design file - echo -e "\${endsMeta},\${endsManual},\${stranded},\${spike},\${species},\${readLength},\${exp},\${study}" > design.csv + echo "\${endsMeta},\${endsRaw},\${endsManual},\${stranded},\${spike},\${species},\${readLength},\${exp},\${study}" > design.csv + + # save fastq error file + echo "\${fastqCountError},\${fastqCountError_details},\${fastqReadError},\${fastqReadError_details}" > fastqError.csv """ } // Split metadata into separate channels endsMeta = Channel.create() +endsRaw = Channel.create() endsManual = Channel.create() strandedMeta = Channel.create() spikeMeta = Channel.create() @@ -291,8 +416,9 @@ speciesMeta = Channel.create() readLengthMeta 
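The paired-end consistency check added to parseMetadata compares raw line counts of R1 and R2; an equivalent standalone sketch, counting whole records instead of lines (file names are placeholders):

# R1 and R2 of a paired-end replicate must contain the same number of reads
r1=sample.R1.fastq.gz
r2=sample.R2.fastq.gz
r1Count=$(( $(zcat "${r1}" | wc -l) / 4 ))   # 4 lines per FASTQ record
r2Count=$(( $(zcat "${r2}" | wc -l) / 4 ))
if [ "${r1Count}" -ne "${r2Count}" ]
then
  echo "read counts differ (R1=${r1Count}, R2=${r2Count}): possible truncation or mismatched files" >&2
fi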
= Channel.create() expRID = Channel.create() studyRID = Channel.create() -metadata.splitCsv(sep: ",", header: false).separate( +metadata_fl.splitCsv(sep: ",", header: false).separate( endsMeta, + endsRaw, endsManual, strandedMeta, spikeMeta, @@ -301,14 +427,102 @@ metadata.splitCsv(sep: ",", header: false).separate( expRID, studyRID ) + // Replicate metadata for multiple process inputs +endsMeta.into { + endsMeta_checkMetadata + endsMeta_aggrQC + endsMeta_failExecutionRun +} endsManual.into { endsManual_trimData endsManual_downsampleData endsManual_alignSampleData endsManual_aggrQC } +strandedMeta.into { + strandedMeta_checkMetadata + strandedMeta_aggrQC + strandedMeta_failExecutionRun +} +spikeMeta.into { + spikeMeta_checkMetadata + spikeMeta_aggrQC + spikeMeta_failPreExecutionRun + spikeMeta_failExecutionRun +} +speciesMeta.into { + speciesMeta_checkMetadata + speciesMeta_aggrQC + speciesMeta_failPreExecutionRun + speciesMeta_failExecutionRun +} +studyRID.into { + studyRID_aggrQC + studyRID_uploadInputBag + studyRID_uploadProcessedFile + studyRID_uploadOutputBag +} +expRID.into { + expRID_aggrQC + expRID_uploadProcessedFile +} + +// Split fastq count error into separate channel +fastqCountError = Channel.create() +fastqCountError_details = Channel.create() +fastqReadError = Channel.create() +fastqReadError_details = Channel.create() +fastqError_fl.splitCsv(sep: ",", header: false).separate( + fastqCountError, + fastqCountError_details, + fastqReadError, + fastqReadError_details +) +// Replicate errors for multiple process inputs +fastqCountError.into { + fastqCountError_trimData + fastqCountError_getRefInfer + fastqCountError_downsampleData + fastqCountError_alignSampleData + fastqCountError_inferMetadata + fastqCountError_checkMetadata + fastqCountError_uploadExecutionRun + fastqCountError_getRef + fastqCountError_alignData + fastqCountError_dedupData + fastqCountError_makeBigWig + fastqCountError_countData + fastqCountError_fastqc + fastqCountError_dataQC + fastqCountError_aggrQC + fastqCountError_uploadQC + fastqCountError_uploadProcessedFile + fastqCountError_uploadOutputBag + fastqCountError_failPreExecutionRun +} +fastqReadError.into { + fastqReadError_trimData + fastqReadError_getRefInfer + fastqReadError_downsampleData + fastqReadError_alignSampleData + fastqReadError_inferMetadata + fastqReadError_checkMetadata + fastqReadError_uploadExecutionRun + fastqReadError_getRef + fastqReadError_alignData + fastqReadError_dedupData + fastqReadError_makeBigWig + fastqReadError_countData + fastqReadError_fastqc + fastqReadError_dataQC + fastqReadError_aggrQC + fastqReadError_uploadQC + fastqReadError_uploadProcessedFile + fastqReadError_uploadOutputBag + fastqReadError_failPreExecutionRun +} /* * trimData: trims any adapter or non-host sequences from the data @@ -317,13 +531,20 @@ process trimData { tag "${repRID}" input: - val ends from endsManual_trimData path (fastq) from fastqs_trimData + val ends from endsManual_trimData + val fastqCountError_trimData + val fastqReadError_trimData output: path ("*.fq.gz") into fastqsTrim + path ("*.fastq.gz", includeInputs:true) into fastqs_fastqc path ("*_trimming_report.txt") into trimQC - path ("readLength.csv") into inferMetadata_readLength + path ("readLength.csv") into readLengthInfer_fl + + when: + fastqCountError_trimData == "false" + fastqReadError_trimData == "false" script: """ @@ -343,24 +564,32 @@ process trimData { fi echo -e "LOG: trimmed" >> ${repRID}.trimData.log echo -e "LOG: average trimmed read length: \${readLength}" >> 
${repRID}.trimData.log - + # save read length file - echo -e "\${readLength}" > readLength.csv + echo "\${readLength}" > readLength.csv """ } // Extract calculated read length metadata into channel readLengthInfer = Channel.create() -inferMetadata_readLength.splitCsv(sep: ",", header: false).separate( +readLengthInfer_fl.splitCsv(sep: ",", header: false).separate( readLengthInfer ) -// Replicate trimmed fastq's +// Replicate inferred read length for multiple process inputs +readLengthInfer.into { + readLengthInfer_aggrQC + readLengthInfer_uploadQC +} +// Replicate trimmed fastq's for multiple process inputs fastqsTrim.into { fastqsTrim_alignData fastqsTrim_downsampleData } +// Combine inputs of getRefInfer +getRefInferInput = referenceInfer.combine(deriva_getRefInfer.combine(script_refDataInfer.combine(fastqCountError_getRefInfer.combine(fastqReadError_getRefInfer)))) + /* * getRefInfer: dowloads appropriate reference for metadata inference */ @@ -368,17 +597,27 @@ process getRefInfer { tag "${refName}" input: - val refName from referenceInfer + tuple val (refName), path (credential, stageAs: "credential.json"), path (script_refDataInfer), val (fastqCountError), val (fastqReadError) from getRefInferInput output: tuple val (refName), path ("hisat2", type: 'dir'), path ("*.fna"), path ("*.gtf") into refInfer path ("${refName}", type: 'dir') into bedInfer - + + when: + fastqCountError == "false" + fastqReadError == "false" + script: """ hostname > ${repRID}.${refName}.getRefInfer.log ulimit -a >> ${repRID}.${refName}.getRefInfer.log + # link credential file for authentication + echo -e "LOG: linking deriva credentials" >> ${repRID}.${refName}.getRefInfer.log + mkdir -p ~/.deriva + ln -sf `readlink -e credential.json` ~/.deriva/credential.json + echo -e "LOG: linked" >> ${repRID}.${refName}.getRefInfer.log + # set the reference name if [ "${refName}" == "ERCC" ] then @@ -393,33 +632,43 @@ process getRefInfer { echo -e "LOG: ERROR - References could not be set!\nReference found: ${referenceBase}" >> ${repRID}.${refName}.getRefInfer.log exit 1 fi - mkdir ${refName} # retreive appropriate reference appropriate location echo -e "LOG: fetching ${refName} reference files from ${referenceBase}" >> ${repRID}.${refName}.getRefInfer.log - if [ ${referenceBase} == "s3://bicf-references" ] + if [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references/new" ] then - aws s3 cp "\${references}"/hisat2 ./hisat2 --recursive - aws s3 cp "\${references}"/bed ./${refName}/bed --recursive - aws s3 cp "\${references}"/genome.fna ./ - aws s3 cp "\${references}"/genome.gtf ./ - elif [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references" ] + unzip \${references}.zip + mv \$(basename \${references})/data/* . + elif [ params.refSource == "datahub" ] then - ln -s "\${references}"/hisat2 - ln -s "\${references}"/bed ${refName}/bed - ln -s "\${references}"/genome.fna - ln -s "\${references}"/genome.gtf + GRCv=\$(echo \${references} | grep -o ${refName}.* | cut -d '.' -f1) + GRCp=\$(echo \${references} | grep -o ${refName}.* | cut -d '.' -f2) + GENCODE=\$(echo \${references} | grep -o ${refName}.* | cut -d '.' 
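getRefInfer derives the assembly, patch, and GENCODE components from the reference label with grep and cut; a small sketch of that parsing on a representative label (the path is a placeholder):

# split a reference label such as "GRCm38.p6.vM25" into assembly / patch / annotation
references="/project/example/references/GRCm38.p6.vM25"   # placeholder path
refName="GRCm"
label=$(echo "${references}" | grep -o "${refName}.*")
GRCv=$(echo "${label}" | cut -d '.' -f1)      # GRCm38
GRCp=$(echo "${label}" | cut -d '.' -f2)      # p6
GENCODE=$(echo "${label}" | cut -d '.' -f3)   # vM25
echo "${GRCv} ${GRCp} ${GENCODE}"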
-f3) + if [ "${refName}" != "ERCC" ] + then + query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='\${GRCv}'.'\${GRCp}'/Annotation_Version=GENCODE%20'\${GENCODE}) + else + query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version=${refName}${refERCCVersion}/Annotation_Version=${refName}${refERCCVersion}') + fi + curl --request GET \${query} > refQuery.json + refURL=\$(python ${script_refDataInfer} --returnParam URL) + loc=\$(dirname \${refURL}) + fName=\$(python ${script_refDataInfer} --returnParam fName) + fName=\${fName%.*} + if [ "\${loc}" = "/hatrac/*" ]; then echo "LOG: Reference not present in hatrac"; exit 1; fi + filename=\$(echo \$(basename \${refURL}) | grep -oP '.*(?=:)') + deriva-hatrac-cli --host ${referenceBase} get \${refURL} + unzip \$(basename \${refURL}) + mv \${fName}/data/* . fi - echo -e "LOG: fetched" >> ${repRID}.${refName}.getRefInfer.log - - # make blank bed folder for ERCC - echo -e "LOG: making dummy bed folder for ERCC" >> ${repRID}.${refName}.getRefInfer.log - if [ "${refName}" == "ERCC" ] + mv ./annotation/genome.gtf . + mv ./sequence/genome.fna . + mkdir ${refName} + if [ "${refName}" != "ERCC" ] then - rm -rf ${refName}/bed - mkdir ${refName}/bed - touch ${refName}/bed/temp + mv ./annotation/genome.bed ./${refName} fi + echo -e "LOG: fetched" >> ${repRID}.${refName}.getRefInfer.log """ } @@ -430,13 +679,19 @@ process downsampleData { tag "${repRID}" input: - val ends from endsManual_downsampleData path fastq from fastqsTrim_downsampleData + val ends from endsManual_downsampleData + val fastqCountError_downsampleData + val fastqReadError_downsampleData output: path ("sampled.1.fq") into fastqs1Sample path ("sampled.2.fq") into fastqs2Sample + when: + fastqCountError_downsampleData == "false" + fastqReadError_downsampleData == "false" + script: """ hostname > ${repRID}.downsampleData.log @@ -459,7 +714,7 @@ process downsampleData { } // Replicate the dowsampled fastq's and attatched to the references -inferInput = endsManual_alignSampleData.combine(refInfer.combine(fastqs1Sample.collect().combine(fastqs2Sample.collect()))) +inferInput = endsManual_alignSampleData.combine(refInfer.combine(fastqs1Sample.collect().combine(fastqs2Sample.collect().combine(fastqCountError_alignSampleData.combine(fastqReadError_alignSampleData))))) /* * alignSampleData: aligns the downsampled reads to a reference database @@ -468,13 +723,17 @@ process alignSampleData { tag "${ref}" input: - tuple val (ends), val (ref), path (hisat2), path (fna), path (gtf), path (fastq1), path (fastq2) from inferInput + tuple val (ends), val (ref), path (hisat2), path (fna), path (gtf), path (fastq1), path (fastq2), val (fastqCountError), val (fastqReadError) from inferInput output: path ("${ref}.sampled.sorted.bam") into sampleBam path ("${ref}.sampled.sorted.bam.bai") into sampleBai path ("${ref}.alignSampleSummary.txt") into alignSampleQC + when: + fastqCountError == "false" + fastqReadError == "false" + script: """ hostname > ${repRID}.${ref}.alignSampleData.log @@ -484,14 +743,14 @@ process alignSampleData { echo -e "LOG: aligning ${ends}" >> ${repRID}.${ref}.alignSampleData.log if [ "${ends}" == "se" ] then - + hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome -U ${fastq1} --summary-file ${ref}.alignSampleSummary.txt --new-summary elif [ "${ends}" == "pe" ] then hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome --no-mixed --no-discordant -1 ${fastq1} 
-2 ${fastq2} --summary-file ${ref}.alignSampleSummary.txt --new-summary fi echo -e "LOG: aliged" >> ${repRID}.${ref}.alignSampleData.log - + # convert the output sam file to a sorted bam file using Samtools echo -e "LOG: converting from sam to bam" >> ${repRID}.${ref}.alignSampleData.log samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${ref}.sampled.bam ${ref}.sampled.sam @@ -520,10 +779,17 @@ process inferMetadata { path bam from sampleBam.collect() path bai from sampleBai.collect() path alignSummary from alignSampleQC_inferMetadata.collect() + val fastqCountError_inferMetadata + val fastqReadError_inferMetadata output: - path "infer.csv" into inferMetadata + path "infer.csv" into inferMetadata_fl path "${repRID}.infer_experiment.txt" into inferExperiment + path "speciesError.csv" into speciesError_fl + + when: + fastqCountError_inferMetadata == "false" + fastqReadError_inferMetadata == "false" script: """ @@ -550,74 +816,93 @@ process inferMetadata { fi echo -e "LOG: inference of strandedness results is: \${spike}" >> ${repRID}.inferMetadata.log + speciesError=false + speciesError_details="" # determine species - if [ 1 -eq \$(echo \$(expr \${align_hu} ">=" 25)) ] && [ 1 -eq \$(echo \$(expr \${align_mo} "<" 25)) ] + if [ 1 -eq \$(echo \$(expr \${align_hu} ">=" 40)) ] && [ 1 -eq \$(echo \$(expr \${align_mo} "<" 40)) ] then species="Homo sapiens" bam="GRCh.sampled.sorted.bam" - bed="./GRCh/bed/genome.bed" - elif [ 1 -eq \$(echo \$(expr \${align_mo} ">=" 25)) ] && [ 1 -eq \$(echo \$(expr \${align_hu} "<" 25)) ] + bed="./GRCh/genome.bed" + echo -e "LOG: inference of species results in: \${species}" >> ${repRID}.inferMetadata.log + elif [ 1 -eq \$(echo \$(expr \${align_mo} ">=" 40)) ] && [ 1 -eq \$(echo \$(expr \${align_hu} "<" 40)) ] then species="Mus musculus" bam="GRCm.sampled.sorted.bam" - bed="./GRCm/bed/genome.bed" + bed="./GRCm/genome.bed" + echo -e "LOG: inference of species results in: \${species}" >> ${repRID}.inferMetadata.log else echo -e "LOG: ERROR - inference of species returns an ambiguous result: hu=\${align_hu} mo=\${align_mo}" >> ${repRID}.inferMetadata.log if [ "${speciesForce}" == "" ] then - exit 1 + speciesError=true + speciesError_details="**Inference of species returns an ambiguous result:** Percent aligned to human = \${align_hu} and percent aligned to mouse = \${align_mo}" fi fi if [ "${speciesForce}" != "" ] then + speciesError=false echo -e "LOG: species overridden to: ${speciesForce}" species="${speciesForce}" if [ "${speciesForce}" == "Homo sapiens" ] then bam="GRCh.sampled.sorted.bam" - bed="./GRCh/bed/genome.bed" + bed="./GRCh/genome.bed" elif [ "${speciesForce}" == "Mus musculus" ] then bam="GRCm.sampled.sorted.bam" - bed="./GRCm/bed/genome.bed" + bed="./GRCm/genome.bed" fi fi - echo -e "LOG: inference of species results in: \${species}" >> ${repRID}.inferMetadata.log - - # infer experimental setting from dedup bam - echo -e "LOG: infer experimental setting from dedup bam" >> ${repRID}.inferMetadata.log - infer_experiment.py -r "\${bed}" -i "\${bam}" 1>> ${repRID}.infer_experiment.txt - echo -e "LOG: infered" >> ${repRID}.inferMetadata.log - ended=`bash inferMeta.sh endness ${repRID}.infer_experiment.txt` - fail=`bash inferMeta.sh fail ${repRID}.infer_experiment.txt` - if [ \${ended} == "PairEnd" ] - then - ends="pe" - percentF=`bash inferMeta.sh pef ${repRID}.infer_experiment.txt` - percentR=`bash inferMeta.sh per ${repRID}.infer_experiment.txt` - elif [ \${ended} == "SingleEnd" ] - then - ends="se" - percentF=`bash inferMeta.sh sef 
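inferMetadata now calls the species from the two downsampled alignment rates using a 40% threshold; a float-safe sketch of that decision (the helper function and example values are illustrative, the pipeline itself compares the HISAT2 summary percentages with expr):

# call the species whose alignment rate is >= 40% while the other stays below 40%
align_hu=42.1
align_mo=3.8
ge40() { awk -v x="$1" 'BEGIN { exit !(x >= 40) }'; }   # float-safe comparison helper (illustrative)
if ge40 "${align_hu}" && ! ge40 "${align_mo}"
then
  species="Homo sapiens"
elif ge40 "${align_mo}" && ! ge40 "${align_hu}"
then
  species="Mus musculus"
else
  species="ambiguous"   # recorded as a speciesError unless speciesForce overrides it
fi
echo "${species}"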
${repRID}.infer_experiment.txt` - percentR=`bash inferMeta.sh ser ${repRID}.infer_experiment.txt` - fi - echo -e "LOG: percentage reads in the same direction as gene: \${percentF}" >> ${repRID}.inferMetadata.log - echo -e "LOG: percentage reads in the opposite direction as gene: \${percentR}" >> ${repRID}.inferMetadata.log - if [ 1 -eq \$(echo \$(expr \${percentF#*.} ">" 2500)) ] && [ 1 -eq \$(echo \$(expr \${percentR#*.} "<" 2500)) ] - then - stranded="forward" - elif [ 1 -eq \$(echo \$(expr \${percentR#*.} ">" 2500)) ] && [ 1 -eq \$(echo \$(expr \${percentF#*.} "<" 2500)) ] + if [ "\${speciesError}" == false ] then - stranded="reverse" - + # infer experimental setting from dedup bam + echo -e "LOG: infer experimental setting from dedup bam" >> ${repRID}.inferMetadata.log + infer_experiment.py -r "\${bed}" -i "\${bam}" 1>> ${repRID}.infer_experiment.txt + echo -e "LOG: inferred" >> ${repRID}.inferMetadata.log + + ended=`bash ${script_inferMeta} endness ${repRID}.infer_experiment.txt` + fail=`bash ${script_inferMeta} fail ${repRID}.infer_experiment.txt` + if [ \${ended} == "PairEnd" ] + then + ends="pe" + percentF=`bash ${script_inferMeta} pef ${repRID}.infer_experiment.txt` + percentR=`bash ${script_inferMeta} per ${repRID}.infer_experiment.txt` + elif [ \${ended} == "SingleEnd" ] + then + ends="se" + percentF=`bash ${script_inferMeta} sef ${repRID}.infer_experiment.txt` + percentR=`bash ${script_inferMeta} ser ${repRID}.infer_experiment.txt` + fi + echo -e "LOG: percentage reads in the same direction as gene: \${percentF}" >> ${repRID}.inferMetadata.log + echo -e "LOG: percentage reads in the opposite direction as gene: \${percentR}" >> ${repRID}.inferMetadata.log + if [ 1 -eq \$(echo \$(expr \${percentF#*.} ">" 2500)) ] && [ 1 -eq \$(echo \$(expr \${percentR#*.} "<" 2500)) ] + then + stranded="forward" + elif [ 1 -eq \$(echo \$(expr \${percentR#*.} ">" 2500)) ] && [ 1 -eq \$(echo \$(expr \${percentF#*.} "<" 2500)) ] + then + stranded="reverse" + else + stranded="unstranded" + fi + echo -e "LOG: stradedness set to: \${stranded}" >> ${repRID}.inferMetadata.log else - stranded="unstranded" + ends="" + stranded="" + spike="" + species="" + percentF="" + percentR="" + fail="" + touch ${repRID}.infer_experiment.txt fi - echo -e "LOG: stradedness set to: \${stranded}" >> ${repRID}.inferMetadata.log - # write infered metadata to file - echo "\${ends},\${stranded},\${spike},\${species},\${align_ercc},\${align_hu},\${align_mo},\${percentF},\${percentR},\${fail}" 1>> infer.csv + # write inferred metadata to file + echo "\${ends},\${stranded},\${spike},\${species},\${align_ercc},\${align_hu},\${align_mo},\${percentF},\${percentR},\${fail}" > infer.csv + + # save species error file + echo "\${speciesError},\${speciesError_details}" > speciesError.csv """ } @@ -632,7 +917,7 @@ align_moInfer = Channel.create() percentFInfer = Channel.create() percentRInfer = Channel.create() failInfer = Channel.create() -inferMetadata.splitCsv(sep: ",", header: false).separate( +inferMetadata_fl.splitCsv(sep: ",", header: false).separate( endsInfer, strandedInfer, spikeInfer, @@ -644,27 +929,356 @@ inferMetadata.splitCsv(sep: ",", header: false).separate( percentRInfer, failInfer ) + // Replicate metadata for multiple process inputs endsInfer.into { + endsInfer_checkMetadata endsInfer_alignData endsInfer_countData endsInfer_dataQC endsInfer_aggrQC + endsInfer_uploadQC + endsInfer_failExecutionRun } strandedInfer.into { + strandedInfer_checkMetadata strandedInfer_alignData strandedInfer_countData strandedInfer_aggrQC + 
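The strandedness call compares the digits after the decimal point of the infer_experiment.py fractions against 2500, i.e. roughly a 0.25 cutoff; an awk sketch of the same rule (example values only):

# percentF / percentR are the fractions of reads oriented with / against annotated genes
percentF=0.8742
percentR=0.0136
stranded=$(awk -v f="${percentF}" -v r="${percentR}" 'BEGIN {
  if (f > 0.25 && r < 0.25)      print "forward"
  else if (r > 0.25 && f < 0.25) print "reverse"
  else                           print "unstranded"
}')
echo "${stranded}"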
strandedInfer_uploadQC + strandedInfer_failExecutionRun } spikeInfer.into{ + spikeInfer_checkMetadata spikeInfer_getRef spikeInfer_aggrQC + spikeInfer_uploadExecutionRun + spikeInfer_failExecutionRun } speciesInfer.into { + speciesInfer_checkMetadata speciesInfer_getRef speciesInfer_aggrQC + speciesInfer_uploadExecutionRun + speciesInfer_uploadProcessedFile + speciesInfer_failExecutionRun +} + +// Split species count error into separate channel +speciesError = Channel.create() +speciesError_details = Channel.create() +speciesError_fl.splitCsv(sep: ",", header: false).separate( + speciesError, + speciesError_details +) + +// Replicate errors for multiple process inputs +speciesError.into { + speciesError_checkMetadata + speciesError_uploadExecutionRun + speciesError_getRef + speciesError_alignData + speciesError_dedupData + speciesError_makeBigWig + speciesError_countData + speciesError_fastqc + speciesError_dataQC + speciesError_aggrQC + speciesError_uploadQC + speciesError_uploadProcessedFile + speciesError_uploadOutputBag + speciesError_failPreExecutionRun +} + +/* + * checkMetadata: checks the submitted metada against inferred +*/ +process checkMetadata { + tag "${repRID}" + + input: + val endsMeta from endsMeta_checkMetadata + val strandedMeta from strandedMeta_checkMetadata + val spikeMeta from spikeMeta_checkMetadata + val speciesMeta from speciesMeta_checkMetadata + val endsInfer from endsInfer_checkMetadata + val strandedInfer from strandedInfer_checkMetadata + val spikeInfer from spikeInfer_checkMetadata + val speciesInfer from speciesInfer_checkMetadata + val fastqCountError_checkMetadata + val fastqReadError_checkMetadata + val speciesError_checkMetadata + + output: + path ("check.csv") into checkMetadata_fl + path ("outputBagRID.csv") optional true into outputBagRID_fl_dummy + + when: + fastqCountError_checkMetadata == "false" + fastqReadError_checkMetadata == "false" + speciesError_checkMetadata == "false" + + script: + """ + hostname > ${repRID}.checkMetadata.log + ulimit -a >> ${repRID}.checkMetadata.log + + pipelineError=false + # check if submitted metadata matches inferred + if [ "${endsMeta}" != "${endsInfer}" ] + then + pipelineError=true + pipelineError_ends=true + echo -e "LOG: ends do not match: Submitted=${endsMeta}; Inferred=${endsInfer}" >> ${repRID}.checkMetadata.log + else + pipelineError_ends=false + echo -e "LOG: ends matches: Submitted=${endsMeta}; Inferred=${endsInfer}" >> ${repRID}.checkMetadata.log + fi + if [ "${strandedMeta}" != "${strandedInfer}" ] + then + pipelineError=true + pipelineError_stranded=true + if [ "${strandedMeta}" == "stranded" ] + then + if [[ "${strandedInfer}" == "forward" ]] || [[ "${strandedInfer}" == "reverse" ]] + then + pipelineError=false + pipelineError_stranded=false + echo -e "LOG: stranded matches: Submitted=${strandedMeta}; Inferred=${strandedInfer}" >> ${repRID}.checkMetadata.log + else + echo -e "LOG: stranded does not match: Submitted=${strandedMeta}; Inferred=${strandedInfer}" >> ${repRID}.checkMetadata.log + fi + else + echo -e "LOG: stranded does not match: Submitted=${strandedMeta}; Inferred=${strandedInfer}" >> ${repRID}.checkMetadata.log + fi + else + pipelineError=false + pipelineError_stranded=false + echo -e "LOG: stranded matches: Submitted=${strandedMeta}; Inferred=${strandedInfer}" >> ${repRID}.checkMetadata.log + fi + if [ "${spikeMeta}" != "${spikeInfer}" ] + then + pipelineError=true + pipelineError_spike=true + echo -e "LOG: spike does not match: Submitted=${spikeMeta}; Inferred=${spikeInfer}" >> 
${repRID}.checkMetadata.log + else + pipelineError_spike=false + echo -e "LOG: stranded matches: Submitted=${spikeMeta}; Inferred=${spikeInfer}" >> ${repRID}.checkMetadata.log + fi + if [ "${speciesMeta}" != "${speciesInfer}" ] + then + pipelineError=true + pipelineError_species=true + echo -e "LOG: species does not match: Submitted=${speciesMeta}; Inferred=${speciesInfer}" >> ${repRID}.checkMetadata.log + else + pipelineError_species=false + echo -e "LOG: species matches: Submitted=${speciesMeta}; Inferred=${speciesInfer}" >> ${repRID}.checkMetadata.log + fi + + # create dummy output bag rid if failure + if [ \${pipelineError} == true ] + then + echo "fail" > outputBagRID.csv + fi + + # write checks to file + echo "\${pipelineError},\${pipelineError_ends},\${pipelineError_stranded},\${pipelineError_spike},\${pipelineError_species}" > check.csv + """ +} + +// Split errors into separate channels +pipelineError = Channel.create() +pipelineError_ends = Channel.create() +pipelineError_stranded = Channel.create() +pipelineError_spike = Channel.create() +pipelineError_species = Channel.create() +checkMetadata_fl.splitCsv(sep: ",", header: false).separate( + pipelineError, + pipelineError_ends, + pipelineError_stranded, + pipelineError_spike, + pipelineError_species +) + +// Replicate errors for multiple process inputs +pipelineError.into { + pipelineError_getRef + pipelineError_alignData + pipelineError_dedupData + pipelineError_makeBigWig + pipelineError_countData + pipelineError_fastqc + pipelineError_dataQC + pipelineError_aggrQC + pipelineError_uploadQC + pipelineError_uploadProcessedFile + pipelineError_uploadOutputBag + pipelineError_failExecutionRun +} + +/* + * uploadInputBag: uploads the input bag +*/ +process uploadInputBag { + tag "${repRID}" + + input: + path script_uploadInputBag + path credential, stageAs: "credential.json" from deriva_uploadInputBag + path inputBag from inputBag_uploadInputBag + val studyRID from studyRID_uploadInputBag + + output: + path ("inputBagRID.csv") into inputBagRID_fl + + when: + upload + + script: + """ + hostname > ${repRID}.uploadInputBag.log + ulimit -a >> ${repRID}.uploadInputBag.log + + yr=\$(date +'%Y') + mn=\$(date +'%m') + dy=\$(date +'%d') + + file=\$(basename -a ${inputBag}) + md5=\$(md5sum ./\${file} | awk '{ print \$1 }') + echo LOG: ${repRID} input bag md5 sum - \${md5} >> ${repRID}.uploadInputBag.log + size=\$(wc -c < ./\${file}) + echo LOG: ${repRID} input bag size - \${size} bytes >> ${repRID}.uploadInputBag.log + + exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Input_Bag/File_MD5=\${md5}) + if [ "\${exist}" == "[]" ] + then + cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') + cookie=\${cookie:11:-1} + + loc=\$(deriva-hatrac-cli --host ${source} put ./\${file} /hatrac/resources/rnaseq/pipeline/input_bag/study/${studyRID}/replicate/${repRID}/\${file} --parents) + inputBag_rid=\$(python3 ${script_uploadInputBag} -f \${file} -l \${loc} -s \${md5} -b \${size} -o ${source} -c \${cookie}) + echo LOG: input bag RID uploaded - \${inputBag_rid} >> ${repRID}.uploadInputBag.log + rid=\${inputBag_rid} + else + exist=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + exist=\${exist:7:-6} + echo LOG: input bag RID already exists - \${exist} >> ${repRID}.uploadInputBag.log + rid=\${exist} + fi + + echo "\${rid}" > inputBagRID.csv + """ +} + +// Extract input bag RID into channel +inputBagRID = Channel.create() +inputBagRID_fl.splitCsv(sep: ",", header: false).separate( + 
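uploadInputBag fingerprints the bag and asks the catalog whether that MD5 is already registered before uploading; a standalone sketch of that pre-check (host and file name are placeholders):

# decide whether an input bag still needs to be uploaded
source="staging.gudmap.org"
file="Q-Y5F6_inputBag_20210101.zip"
md5=$(md5sum "./${file}" | awk '{ print $1 }')
size=$(wc -c < "./${file}")
exist=$(curl -s "https://${source}/ermrest/catalog/2/entity/RNASeq:Input_Bag/File_MD5=${md5}")
if [ "${exist}" == "[]" ]
then
  echo "no Input_Bag with MD5 ${md5} (${size} bytes); upload required"
else
  echo "Input_Bag already registered: ${exist}"
fi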
inputBagRID +) + +// Replicate input bag RID for multiple process inputs +inputBagRID.into { + inputBagRID_uploadExecutionRun + inputBagRID_finalizeExecutionRun + inputBagRID_failPreExecutionRun + inputBagRID_failExecutionRun +} + +/* + * uploadExecutionRun: uploads the execution run +*/ +process uploadExecutionRun { + tag "${repRID}" + + input: + path script_uploadExecutionRun_uploadExecutionRun + path credential, stageAs: "credential.json" from deriva_uploadExecutionRun + val spike from spikeInfer_uploadExecutionRun + val species from speciesInfer_uploadExecutionRun + val inputBagRID from inputBagRID_uploadExecutionRun + val fastqCountError_uploadExecutionRun + val fastqReadError_uploadExecutionRun + val speciesError_uploadExecutionRun + + output: + path ("executionRunRID.csv") into executionRunRID_fl + + when: + upload + fastqCountError_uploadExecutionRun == "false" + fastqReadError_uploadExecutionRun == "false" + speciesError_uploadExecutionRun == "false" + + script: + """ + hostname > ${repRID}.uploadExecutionRun.log + ulimit -a >> ${repRID}.uploadExecutionRun.log + + echo LOG: searching for workflow RID - BICF mRNA ${workflow.manifest.version} >> ${repRID}.uploadExecutionRun.log + workflow=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Workflow/Name=BICF%20mRNA%20Replicate/Version=${workflow.manifest.version}) + workflow=\$(echo \${workflow} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + workflow=\${workflow:7:-6} + echo LOG: workflow RID extracted - \${workflow} >> ${repRID}.uploadExecutionRun.log + + if [ "${species}" == "Homo sapiens" ] + then + genomeName=\$(echo GRCh${refHuVersion}) + elif [ "${species}" == "Mus musculus" ] + then + genomeName=\$(echo GRCm${refMoVersion}) + fi + if [ "${spike}" == "yes" ] + then + genomeName=\$(echo \${genomeName}-S) + fi + echo LOG: searching for genome name - \${genomeName} >> ${repRID}.uploadExecutionRun.log + genome=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Name=\${genomeName}) + genome=\$(echo \${genome} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + genome=\${genome:7:-6} + echo LOG: genome RID extracted - \${genome} >> ${repRID}.uploadExecutionRun.log + + cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') + cookie=\${cookie:11:-1} + + exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=\${workflow}/Replicate=${repRID}/Input_Bag=${inputBagRID}) + echo \${exist} >> ${repRID}.uploadExecutionRun.log + if [ "\${exist}" == "[]" ] + then + executionRun_rid=\$(python3 ${script_uploadExecutionRun_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s In-progress -d 'Run in process' -o ${source} -c \${cookie} -u F) + echo LOG: execution run RID uploaded - \${executionRun_rid} >> ${repRID}.uploadExecutionRun.log + else + rid=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + rid=\${rid:7:-6} + echo \${rid} >> ${repRID}.uploadExecutionRun.log + executionRun_rid=\$(python3 ${script_uploadExecutionRun_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s In-progress -d 'Run in process' -o ${source} -c \${cookie} -u \${rid}) + echo LOG: execution run RID updated - \${executionRun_rid} >> ${repRID}.uploadExecutionRun.log + fi + + echo "\${executionRun_rid}" > executionRunRID.csv + + curl -H 'Content-Type: application/json' -X PUT -d \ + '{ \ + "ID": "${workflow.sessionId}", \ + "ExecutionRunRID": "'\${executionRun_rid}'" \ + }' \ + 
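uploadExecutionRun resolves the Workflow RID by name and version, then slices the RID out of the ERMrest JSON reply; the same lookup as a standalone sketch (host and version are placeholders):

# resolve the Workflow RID for a given pipeline version
source="staging.gudmap.org"
version="v2.0.0"
workflow=$(curl -s "https://${source}/ermrest/catalog/2/entity/RNASeq:Workflow/Name=BICF%20mRNA%20Replicate/Version=${version}")
workflow=$(echo "${workflow}" | grep -o '"RID":".*","RCT')
workflow=${workflow:7:-6}   # strip the leading '"RID":"' and the trailing '","RCT', as the pipeline does
echo "workflow RID: ${workflow}"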
"https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track" + """ } +// Extract execution run RID into channel +executionRunRID = Channel.create() +executionRunRID_fl.splitCsv(sep: ",", header: false).separate( + executionRunRID +) + +// Replicate execution run RID for multiple process inputs +executionRunRID.into { + executionRunRID_uploadQC + executionRunRID_uploadProcessedFile + executionRunRID_uploadOutputBag + executionRunRID_finalizeExecutionRun + executionRunRID_failExecutionRun +} /* * getRef: downloads appropriate reference @@ -673,58 +1287,89 @@ process getRef { tag "${species}" input: + path script_refData + path credential, stageAs: "credential.json" from deriva_getRef val spike from spikeInfer_getRef val species from speciesInfer_getRef + val fastqCountError_getRef + val fastqReadError_getRef + val speciesError_getRef + val pipelineError_getRef output: - tuple path ("hisat2", type: 'dir'), path ("bed", type: 'dir'), path ("*.fna"), path ("*.gtf"), path ("geneID.tsv"), path ("Entrez.tsv") into reference - + tuple path ("hisat2", type: 'dir'), path ("*.bed"), path ("*.fna"), path ("*.gtf"), path ("geneID.tsv"), path ("Entrez.tsv") into reference + + when: + fastqCountError_getRef == "false" + fastqReadError_getRef == "false" + speciesError_getRef == "false" + pipelineError_getRef == "false" + script: """ hostname > ${repRID}.getRef.log ulimit -a >> ${repRID}.getRef.log + # link credential file for authentication + echo -e "LOG: linking deriva credentials" >> ${repRID}.getRef.log + mkdir -p ~/.deriva + ln -sf `readlink -e credential.json` ~/.deriva/credential.json + echo -e "LOG: linked" >> ${repRID}.getRef.log + # set the reference name if [ "${species}" == "Mus musculus" ] then references=\$(echo ${referenceBase}/GRCm${refMoVersion}) + refName=GRCm elif [ '${species}' == "Homo sapiens" ] then references=\$(echo ${referenceBase}/GRCh${refHuVersion}) + refName=GRCh else echo -e "LOG: ERROR - References could not be set!\nSpecies reference found: ${species}" >> ${repRID}.getRef.log exit 1 fi if [ "${spike}" == "yes" ] then - references=\$(echo \${reference}-S/) + references=\$(echo \${reference}-S) elif [ "${spike}" == "no" ] then - reference=\$(echo \${references}/) + reference=\$(echo \${references}) fi echo -e "LOG: species set to \${references}" >> ${repRID}.getRef.log # retreive appropriate reference appropriate location echo -e "LOG: fetching ${species} reference files from ${referenceBase}" >> ${repRID}.getRef.log - if [ ${referenceBase} == "s3://bicf-references" ] + if [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references/new" ] then - echo -e "LOG: grabbing reference files from S3" >> ${repRID}.getRef.log - aws s3 cp "\${references}"/hisat2 ./hisat2 --recursive - aws s3 cp "\${references}"/bed ./bed --recursive - aws s3 cp "\${references}"/genome.fna ./ - aws s3 cp "\${references}"/genome.gtf ./ - aws s3 cp "\${references}"/geneID.tsv ./ - aws s3 cp "\${references}"/Entrez.tsv ./ - elif [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references" ] + echo -e "LOG: grabbing reference files from local (BioHPC)" >> ${repRID}.getRef.log + unzip \${reference}.zip + mv \$(basename \${reference})/data/* . 
+ elif [ arams.refSource == "datahub" ] then - ln -s "\${references}"/hisat2 - ln -s "\${references}"/bed - ln -s "\${references}"/genome.fna - ln -s "\${references}"/genome.gtf - ln -s "\${references}"/geneID.tsv - ln -s "\${references}"/Entrez.tsv + echo -e "LOG: grabbing reference files from datahub" >> ${repRID}.getRef.log + GRCv=\$(echo \${references} | grep -o \${refName}.* | cut -d '.' -f1) + GRCp=\$(echo \${references} | grep -o \${refName}.* | cut -d '.' -f2) + GENCODE=\$(echo \${references} | grep -o \${refName}.* | cut -d '.' -f3) + query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='\${GRCv}'.'\${GRCp}'/Annotation_Version=GENCODE%20'\${GENCODE}) + curl --request GET \${query} > refQuery.json + refURL=\$(python ${script_refData} --returnParam URL) + loc=\$(dirname \${refURL}) + fName=\$(python ${script_refData} --returnParam fName) + fName=\${fName%.*} + if [ "\${loc}" = "/hatrac/*" ]; then echo "LOG: Reference not present in hatrac"; exit 1; fi + filename=\$(echo \$(basename \${refURL}) | grep -oP '.*(?=:)') + deriva-hatrac-cli --host ${referenceBase} get \${refURL} + unzip \$(basename \${refURL}) + mv \${fName}/data/* . fi echo -e "LOG: fetched" >> ${repRID}.getRef.log + + mv ./annotation/genome.gtf . + mv ./sequence/genome.fna . + mv ./annotation/genome.bed . + mv ./metadata/Entrez.tsv . + mv ./metadata/geneID.tsv . """ } @@ -742,15 +1387,25 @@ process alignData { tag "${repRID}" input: - val ends from endsInfer_alignData - val stranded from strandedInfer_alignData path fastq from fastqsTrim_alignData path reference_alignData + val ends from endsInfer_alignData + val stranded from strandedInfer_alignData + val fastqCountError_alignData + val fastqReadError_alignData + val speciesError_alignData + val pipelineError_alignData output: tuple path ("${repRID}.sorted.bam"), path ("${repRID}.sorted.bam.bai") into rawBam path ("*.alignSummary.txt") into alignQC + when: + fastqCountError_alignData == "false" + fastqReadError_alignData == "false" + speciesError_alignData == "false" + pipelineError_alignData == "false" + script: """ hostname > ${repRID}.align.log @@ -771,7 +1426,7 @@ process alignData { strandedParam="--rna-strandness R" elif [ "${stranded}" == "reverse" ] && [ "${ends}" == "pe" ] then - strandedParam="--rna-strandness RF" + strandedParam="--rna-strandness RF" fi # align the reads with Hisat2 @@ -784,7 +1439,7 @@ process alignData { hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome \${strandedParam} --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} --summary-file ${repRID}.alignSummary.txt --new-summary fi echo -e "LOG: alignined" >> ${repRID}.align.log - + # convert the output sam file to a sorted bam file using Samtools echo -e "LOG: converting from sam to bam" >> ${repRID}.align.log samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam @@ -800,7 +1455,7 @@ process alignData { } // Replicate rawBam for multiple process inputs -rawBam.into { +rawBam.set { rawBam_dedupData } @@ -813,12 +1468,22 @@ process dedupData { input: tuple path (bam), path (bai) from rawBam_dedupData + val fastqCountError_dedupData + val fastqReadError_dedupData + val speciesError_dedupData + val pipelineError_dedupData output: - tuple path ("${repRID}.sorted.deduped.bam"), path ("${repRID}.sorted.deduped.bam.bai") into dedupBam - tuple path ("${repRID}.sorted.deduped.*.bam"), path ("${repRID}.sorted.deduped.*.bam.bai") into dedupChrBam + tuple path 
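alignData chooses the HISAT2 --rna-strandness flag from the inferred endedness and strandedness; a compact sketch of that mapping (the forward, paired-end case is assumed to be FR, matching HISAT2's convention):

# translate endedness + strandedness into the HISAT2 strandedness flag
ends="pe"
stranded="reverse"
strandedParam=""
if [ "${stranded}" == "forward" ]
then
  if [ "${ends}" == "se" ]; then strandedParam="--rna-strandness F"; else strandedParam="--rna-strandness FR"; fi
elif [ "${stranded}" == "reverse" ]
then
  if [ "${ends}" == "se" ]; then strandedParam="--rna-strandness R"; else strandedParam="--rna-strandness RF"; fi
fi
echo "hisat2 ... ${strandedParam}"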
("${repRID}_sorted.deduped.bam"), path ("${repRID}_sorted.deduped.bam.bai") into dedupBam + tuple path ("${repRID}_sorted.deduped.*.bam"), path ("${repRID}_sorted.deduped.*.bam.bai") into dedupChrBam path ("*.deduped.Metrics.txt") into dedupQC + when: + fastqCountError_dedupData == 'false' + fastqReadError_dedupData == 'false' + speciesError_dedupData == 'false' + pipelineError_dedupData == 'false' + script: """ hostname > ${repRID}.dedup.log @@ -831,16 +1496,16 @@ process dedupData { # sort the bam file using Samtools echo -e "LOG: sorting the bam file" >> ${repRID}.dedup.log - samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.deduped.bam ${repRID}.deduped.bam - + samtools sort -@ `nproc` -O BAM -o ${repRID}_sorted.deduped.bam ${repRID}.deduped.bam + # index the sorted bam using Samtools echo -e "LOG: indexing sorted bam file" >> ${repRID}.dedup.log - samtools index -@ `nproc` -b ${repRID}.sorted.deduped.bam ${repRID}.sorted.deduped.bam.bai + samtools index -@ `nproc` -b ${repRID}_sorted.deduped.bam ${repRID}_sorted.deduped.bam.bai # split the deduped BAM file for multi-threaded tin calculation - for i in `samtools view ${repRID}.sorted.deduped.bam | cut -f3 | sort | uniq`; + for i in `samtools view ${repRID}_sorted.deduped.bam | cut -f3 | grep -o chr.[0-9]* | sort | uniq`; do - echo "echo \"LOG: splitting each chromosome into its own BAM and BAI files with Samtools\"; samtools view -b ${repRID}.sorted.deduped.bam \${i} 1>> ${repRID}.sorted.deduped.\${i}.bam; samtools index -@ `nproc` -b ${repRID}.sorted.deduped.\${i}.bam ${repRID}.sorted.deduped.\${i}.bam.bai" + echo "echo \"LOG: splitting each chromosome into its own BAM and BAI files with Samtools\"; samtools view -b ${repRID}_sorted.deduped.bam \${i} 1>> ${repRID}_sorted.deduped.\${i}.bam; samtools index -@ `nproc` -b ${repRID}_sorted.deduped.\${i}.bam ${repRID}_sorted.deduped.\${i}.bam.bai" done | parallel -j `nproc` -k """ } @@ -850,6 +1515,7 @@ dedupBam.into { dedupBam_countData dedupBam_makeBigWig dedupBam_dataQC + dedupBam_uploadProcessedFile } /* @@ -861,9 +1527,19 @@ process makeBigWig { input: tuple path (bam), path (bai) from dedupBam_makeBigWig + val fastqCountError_makeBigWig + val fastqReadError_makeBigWig + val speciesError_makeBigWig + val pipelineError_makeBigWig output: - path ("${repRID}.bw") + path ("${repRID}_sorted.deduped.bw") into bigwig + + when: + fastqCountError_makeBigWig == 'false' + fastqReadError_makeBigWig == 'false' + speciesError_makeBigWig == 'false' + pipelineError_makeBigWig == 'false' script: """ @@ -872,7 +1548,7 @@ process makeBigWig { # create bigwig echo -e "LOG: creating bibWig" >> ${repRID}.makeBigWig.log - bamCoverage -p `nproc` -b ${bam} -o ${repRID}.bw + bamCoverage -p `nproc` -b ${bam} -o ${repRID}_sorted.deduped.bw echo -e "LOG: created" >> ${repRID}.makeBigWig.log """ } @@ -882,7 +1558,7 @@ process makeBigWig { */ process countData { tag "${repRID}" - publishDir "${outDir}/count", mode: 'copy', pattern: "${repRID}*.tpmTable.csv" + publishDir "${outDir}/count", mode: 'copy', pattern: "${repRID}*_tpmTable.csv" input: path script_calculateTPM @@ -891,11 +1567,21 @@ process countData { path ref from reference_countData val ends from endsInfer_countData val stranded from strandedInfer_countData + val fastqCountError_countData + val fastqReadError_countData + val speciesError_countData + val pipelineError_countData output: - path ("*.tpmTable.csv") into counts - path ("*.countData.summary") into countsQC - path ("assignedReads.csv") into inferMetadata_assignedReads + path ("*_tpmTable.csv") 
into counts + path ("*_countData.summary") into countsQC + path ("assignedReads.csv") into assignedReadsInfer_fl + + when: + fastqCountError_countData == 'false' + fastqReadError_countData == 'false' + speciesError_countData == 'false' + pipelineError_countData == 'false' script: """ @@ -922,32 +1608,38 @@ process countData { echo -e "LOG: counting ${ends} features" >> ${repRID}.countData.log if [ "${ends}" == "se" ] then - featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}.countData -s \${stranding} -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam + featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}_countData -s \${stranding} -R SAM --primary --ignoreDup ${repRID}_sorted.deduped.bam elif [ "${ends}" == "pe" ] then - featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}.countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam + featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}_countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}_sorted.deduped.bam fi echo -e "LOG: counted" >> ${repRID}.countData.log - + # extract assigned reads - grep -m 1 'Assigned' *.countData.summary | grep -oe '\\([0-9.]*\\)' > assignedReads.csv + grep -m 1 'Assigned' *_countData.summary | grep -oe '\\([0-9.]*\\)' > assignedReads.csv # calculate TPM from the resulting countData table echo -e "LOG: calculating TPM with R" >> ${repRID}.countData.log - Rscript calculateTPM.R --count "${repRID}.countData" + Rscript ${script_calculateTPM} --count "${repRID}_countData" # convert gene symbols to Entrez id's echo -e "LOG: convert gene symbols to Entrez id's" >> ${repRID}.countData.log - Rscript convertGeneSymbols.R --repRID "${repRID}" + Rscript ${script_convertGeneSymbols} --repRID "${repRID}" """ } // Extract number of assigned reads metadata into channel assignedReadsInfer = Channel.create() -inferMetadata_assignedReads.splitCsv(sep: ",", header: false).separate( +assignedReadsInfer_fl.splitCsv(sep: ",", header: false).separate( assignedReadsInfer ) +// Replicate inferred assigned reads for multiple process inputs +assignedReadsInfer.into { + assignedReadsInfer_aggrQC + assignedReadsInfer_uploadQC +} + /* *fastqc: run fastqc on untrimmed fastq's */ @@ -956,10 +1648,20 @@ process fastqc { input: path (fastq) from fastqs_fastqc + val fastqCountError_fastqc + val fastqReadError_fastqc + val speciesError_fastqc + val pipelineError_fastqc output: path ("*_fastqc.zip") into fastqc - path ("rawReads.csv") into inferMetadata_rawReads + path ("rawReads.csv") into rawReadsInfer_fl + + when: + fastqCountError_fastqc == 'false' + fastqReadError_fastqc == 'false' + speciesError_fastqc == 'false' + pipelineError_fastqc == 'false' script: """ @@ -977,10 +1679,16 @@ process fastqc { // Extract number of raw reads metadata into channel rawReadsInfer = Channel.create() -inferMetadata_rawReads.splitCsv(sep: ",", header: false).separate( +rawReadsInfer_fl.splitCsv(sep: ",", header: false).separate( rawReadsInfer ) +// Replicate inferred raw reads for multiple process inputs +rawReadsInfer.into { + rawReadsInfer_aggrQC + rawReadsInfer_uploadQC +} + /* *dataQC: calculate transcript integrity numbers (TIN) and bin as well as calculate innerdistance of PE replicates */ @@ -993,22 +1701,32 @@ process dataQC { tuple path (bam), path (bai) from 
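countData passes a numeric strandedness code to featureCounts via -s; a sketch of the mapping assumed here, following featureCounts' convention of 0 = unstranded, 1 = forward, 2 = reverse (the variable name mirrors the ${stranding} used in the calls above):

stranded="reverse"   # example value
if [ "${stranded}" == "unstranded" ]; then stranding=0
elif [ "${stranded}" == "forward" ]; then stranding=1
elif [ "${stranded}" == "reverse" ]; then stranding=2
fi
echo "featureCounts -s ${stranding} ..."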
dedupBam_dataQC tuple path (chrBam), path (chrBai) from dedupChrBam val ends from endsInfer_dataQC - + val fastqCountError_dataQC + val fastqReadError_dataQC + val speciesError_dataQC + val pipelineError_dataQC + output: - path "${repRID}.tin.hist.tsv" into tinHist - path "${repRID}.tin.med.csv" into inferMetadata_tinMed - path "${repRID}.insertSize.inner_distance_freq.txt" into innerDistance - + path "${repRID}_tin.hist.tsv" into tinHist + path "${repRID}_tin.med.csv" into tinMedInfer_fl + path "${repRID}_insertSize.inner_distance_freq.txt" into innerDistance + + when: + fastqCountError_dataQC == 'false' + fastqReadError_dataQC == 'false' + speciesError_dataQC == 'false' + pipelineError_dataQC == 'false' + script: """ hostname > ${repRID}.dataQC.log ulimit -a >> ${repRID}.dataQC.log # calcualte TIN values per feature on each chromosome - echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > ${repRID}.sorted.deduped.tin.xls - for i in `cat ./bed/genome.bed | cut -f1 | sort | uniq`; do - echo "echo \"LOG: running tin.py on \${i}\" >> ${repRID}.dataQC.log; tin.py -i ${repRID}.sorted.deduped.\${i}.bam -r ./bed/genome.bed; cat ${repRID}.sorted.deduped.\${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \\\"\\\\t\${i}\\\\t\\\";"; - done | parallel -j `nproc` -k 1>> ${repRID}.sorted.deduped.tin.xls + echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > ${repRID}_sorted.deduped.tin.xls + for i in `cat ./genome.bed | cut -f1 | grep -o chr.[0-9]* | sort | uniq`; do + echo "echo \"LOG: running tin.py on \${i}\" >> ${repRID}.dataQC.log; tin.py -i ${repRID}_sorted.deduped.\${i}.bam -r ./genome.bed; cat ${repRID}_sorted.deduped.\${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \\\"\\\\t\${i}\\\\t\\\";"; + done | parallel -j `nproc` -k 1>> ${repRID}_sorted.deduped.tin.xls # bin TIN values echo -e "LOG: binning TINs" >> ${repRID}.dataQC.log @@ -1019,22 +1737,28 @@ process dataQC { if [ "${ends}" == "pe" ] then echo -e "LOG: calculating inner distances for ${ends}" >> ${repRID}.dataQC.log - inner_distance.py -i "${bam}" -o ${repRID}.insertSize -r ./bed/genome.bed + inner_distance.py -i "${bam}" -o ${repRID}_insertSize -r ./genome.bed echo -e "LOG: calculated" >> ${repRID}.dataQC.log elif [ "${ends}" == "se" ] then echo -e "LOG: creating dummy inner distance file for ${ends}" >> ${repRID}.dataQC.log - touch ${repRID}.insertSize.inner_distance_freq.txt + touch ${repRID}_insertSize.inner_distance_freq.txt fi """ } // Extract median TIN metadata into channel tinMedInfer = Channel.create() -inferMetadata_tinMed.splitCsv(sep: ",", header: false).separate( +tinMedInfer_fl.splitCsv(sep: ",", header: false).separate( tinMedInfer ) +// Replicate inferred median TIN for multiple process inputs +tinMedInfer.into { + tinMedInfer_aggrQC + tinMedInfer_uploadQC +} + /* *aggrQC: aggregate QC from processes as well as metadata and run MultiQC */ @@ -1046,6 +1770,8 @@ process aggrQC { input: path multiqcConfig path bicfLogo + path softwareReferences + path softwareVersions path fastqc path trimQC path alignQC @@ -1056,26 +1782,36 @@ process aggrQC { path alignSampleQCs from alignSampleQC_aggrQC.collect() path inferExperiment val endsManual from endsManual_aggrQC - val endsM from endsMeta - val strandedM from strandedMeta - val spikeM from spikeMeta - val speciesM from speciesMeta + val endsM from endsMeta_aggrQC + val strandedM from strandedMeta_aggrQC + val spikeM from spikeMeta_aggrQC + val speciesM from speciesMeta_aggrQC val endsI from endsInfer_aggrQC val strandedI from strandedInfer_aggrQC val spikeI from spikeInfer_aggrQC val 
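The per-feature TIN table written by dataQC carries the TIN value in its fifth column; a heavily simplified sketch of deriving a median TIN from it (file name and column layout are assumptions based on the header written above, since the pipeline's own binning script is not shown in this hunk):

tail -n +2 sample_sorted.deduped.tin.xls | sort -t $'\t' -k5,5n | awk -F '\t' '
  { v[++n] = $5 }
  END {
    if (n % 2) print v[(n + 1) / 2]
    else       print (v[n / 2] + v[n / 2 + 1]) / 2
  }'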
speciesI from speciesInfer_aggrQC val readLengthM from readLengthMeta - val readLengthI from readLengthInfer - val rawReadsI from rawReadsInfer - val assignedReadsI from assignedReadsInfer - val tinMedI from tinMedInfer - val expRID - val studyRID + val readLengthI from readLengthInfer_aggrQC + val rawReadsI from rawReadsInfer_aggrQC + val assignedReadsI from assignedReadsInfer_aggrQC + val tinMedI from tinMedInfer_aggrQC + val studyRID from studyRID_aggrQC + val expRID from expRID_aggrQC + val fastqCountError_aggrQC + val fastqReadError_aggrQC + val speciesError_aggrQC + val pipelineError_aggrQC output: path "${repRID}.multiqc.html" into multiqc path "${repRID}.multiqc_data.json" into multiqcJSON + when: + fastqCountError_aggrQC == 'false' + fastqReadError_aggrQC == 'false' + speciesError_aggrQC == 'false' + pipelineError_aggrQC == 'false' + script: """ hostname > ${repRID}.aggrQC.log @@ -1103,8 +1839,7 @@ process aggrQC { echo -e "LOG: creating run table" >> ${repRID}.aggrQC.log echo -e "Session\tSession ID\tStart Time\tPipeline Version\tInput" > run.tsv echo -e "Session\t${workflow.sessionId}\t${workflow.start}\t${workflow.manifest.version}\t\${input}" >> run.tsv - - + # make RID table echo -e "LOG: creating RID table" >> ${repRID}.aggrQC.log echo -e "Replicate\tReplicate RID\tExperiment RID\tStudy RID" > rid.tsv @@ -1116,9 +1851,9 @@ process aggrQC { echo -e "Submitter\t${speciesM}\t${endsM}\t${strandedM}\t${spikeM}\t-\t-\t'${readLengthM}'\t-" >> metadata.tsv if [ "${params.speciesForce}" == "" ] then - echo -e "Infered\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv + echo -e "Inferred\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv else - echo -e "Infered\t${speciesI} (FORCED)\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv + echo -e "Inferred\t${speciesI} (FORCED)\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv fi echo -e "Measured\t-\t${endsManual}\t-\t-\t'${rawReadsI}'\t'${assignedReadsI}'\t'${readLengthI}'\t'${tinMedI}'" >> metadata.tsv @@ -1139,28 +1874,494 @@ process aggrQC { echo -e "LOG: running multiqc" >> ${repRID}.aggrQC.log multiqc -c ${multiqcConfig} . 
-n ${repRID}.multiqc.html cp ${repRID}.multiqc_data/multiqc_data.json ${repRID}.multiqc_data.json - """ + + curl -H 'Content-Type: application/json' -X PUT -d \ + @./${repRID}.multiqc_data.json \ + "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/qc" + """ } +/* + * uploadQC: uploads the mRNA QC +*/ +process uploadQC { + tag "${repRID}" + + input: + path script_deleteEntry_uploadQC + path script_uploadQC + path credential, stageAs: "credential.json" from deriva_uploadQC + val executionRunRID from executionRunRID_uploadQC + val ends from endsInfer_uploadQC + val stranded from strandedInfer_uploadQC + val length from readLengthInfer_uploadQC + val rawCount from rawReadsInfer_uploadQC + val finalCount from assignedReadsInfer_uploadQC + val tinMed from tinMedInfer_uploadQC + val fastqCountError_uploadQC + val fastqReadError_uploadQC + val speciesError_uploadQC + val pipelineError_uploadQC + + output: + path ("qcRID.csv") into qcRID_fl + + when: + upload + fastqCountError_uploadQC == 'false' + fastqReadError_uploadQC == 'false' + speciesError_uploadQC == 'false' + pipelineError_uploadQC == 'false' + + script: + """ + hostname > ${repRID}.uploadQC.log + ulimit -a >> ${repRID}.uploadQC.log + + if [ "${ends}" == "pe" ] + then + end="Paired End" + elif [ "${ends}" == "se" ] + then + end="Single Read" + fi + + cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') + cookie=\${cookie:11:-1} + + exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:mRNA_QC/Replicate=${repRID}) + if [ "\${exist}" != "[]" ] + then + rids=\$(echo \${exist} | grep -o '\\"RID\\":\\".\\{7\\}' | sed 's/^.\\{7\\}//') + for rid in \${rids} + do + python3 ${script_deleteEntry_uploadQC} -r \${rid} -t mRNA_QC -o ${source} -c \${cookie} + echo LOG: old mRNA QC RID deleted - \${rid} >> ${repRID}.uploadQC.log + done + echo LOG: all old mRNA QC RIDs deleted >> ${repRID}.uploadQC.log + fi + + qc_rid=\$(python3 ${script_uploadQC} -r ${repRID} -e ${executionRunRID} -p "\${end}" -s ${stranded} -l ${length} -w ${rawCount} -f ${finalCount} -t ${tinMed} -o ${source} -c \${cookie} -u F) + echo LOG: mRNA QC RID uploaded - \${qc_rid} >> ${repRID}.uploadQC.log + + echo "\${qc_rid}" > qcRID.csv + """ +} + +// Extract mRNA qc RID into channel +qcRID = Channel.create() +qcRID_fl.splitCsv(sep: ",", header: false).separate( + qcRID +) + /* *ouputBag: create ouputBag */ -process outputBag { +process uploadProcessedFile { tag "${repRID}" publishDir "${outDir}/outputBag", mode: 'copy', pattern: "Replicate_${repRID}.outputBag.zip" - + input: + path script_deleteEntry_uploadProcessedFile + path credential, stageAs: "credential.json" from deriva_uploadProcessedFile + path executionRunExportConfig path multiqc path multiqcJSON - + tuple path (bam),path (bai) from dedupBam_uploadProcessedFile + path bigwig + path counts + val species from speciesInfer_uploadProcessedFile + val studyRID from studyRID_uploadProcessedFile + val expRID from expRID_uploadProcessedFile + val executionRunRID from executionRunRID_uploadProcessedFile + val fastqCountError_uploadProcessedFile + val fastqReadError_uploadProcessedFile + val speciesError_uploadProcessedFile + val pipelineError_uploadProcessedFile + output: - path ("Replicate_*.zip") into outputBag + path ("${repRID}_Output_Bag.zip") into outputBag + + when: + upload + fastqCountError_uploadProcessedFile == 'false' + fastqReadError_uploadProcessedFile == 'false' + speciesError_uploadProcessedFile == 'false' + pipelineError_uploadProcessedFile == 
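Before re-uploading QC, uploadQC lists any mRNA_QC rows already attached to the replicate and collects their RIDs for deletion; a standalone sketch of that lookup (host and replicate RID are placeholders, and the actual delete is done by the pipeline's delete-entry helper):

source="staging.gudmap.org"
repRID="16-XXXX"
exist=$(curl -s "https://${source}/ermrest/catalog/2/entity/RNASeq:mRNA_QC/Replicate=${repRID}")
if [ "${exist}" != "[]" ]
then
  rids=$(echo "${exist}" | grep -o '"RID":".\{7\}' | sed 's/^.\{7\}//')   # 7-character RIDs
  for rid in ${rids}
  do
    echo "stale mRNA_QC row to delete: ${rid}"
  done
fi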
'false' + + script: + """ + hostname > ${repRID}.outputBag.log + ulimit -a >> ${repRID}.outputBag.log + + mkdir -p ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ + cp ${bam} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ + cp ${bai} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ + cp ${bigwig} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ + cp ${counts} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ + + cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') + cookie=\${cookie:11:-1} + + exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Processed_File/Replicate=${repRID}) + if [ "\${exist}" != "[]" ] + then + rids=\$(echo \${exist} | grep -o '\\"RID\\":\\".\\{7\\}' | sed 's/^.\\{7\\}//') + for rid in \${rids} + do + python3 ${script_deleteEntry_uploadProcessedFile} -r \${rid} -t Processed_File -o ${source} -c \${cookie} + done + echo LOG: all old processed file RIDs deleted >> ${repRID}.uploadQC.log + fi + + deriva-upload-cli --catalog 2 --token \${cookie:9} ${source} ./deriva + echo LOG: processed files uploaded >> ${repRID}.outputBag.log + + deriva-download-cli --catalog 2 --token \${cookie:9} ${source} ${executionRunExportConfig} . rid=${executionRunRID} + echo LOG: execution run bag downloaded >> ${repRID}.outputBag.log + + echo -e "### Run Details" >> runDetails.md + echo -e "**Workflow URL:** https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq" >> runDetails.md + echo -e "**Workflow Version:** ${workflow.manifest.version}" >> runDetails.md + echo -e "**Description:** ${workflow.manifest.description}" >> runDetails.md + if [ "${species}" == "Mus musculus" ]; then + genome=\$(echo GRCm${refMoVersion} | cut -d '.' -f1) + patch=\$(echo ${refMoVersion} | cut -d '.' -f2) + annotation=\$(echo ${refMoVersion} | cut -d '.' -f3 | tr -d 'v') + elif [ "${species}" == "Homo sapiens" ]; then + genome=\$(echo GRCh${refHuVersion} | cut -d '.' -f1) + patch=\$(echo ${refHuVersion} | cut -d '.' -f2) + annotation=\$(echo ${refHuVersion} | cut -d '.' 
-f3 | tr -d 'v') + fi + echo -e "**Genome Assembly Version:** \${genome} patch \${patch}" >> runDetails.md + echo -e "**Annotation Version:** GENCODE release \${annotation}" >> runDetails.md + echo -e "**Run ID:** ${repRID}" >> runDetails.md + echo LOG: runDetails.md created >> ${repRID}.outputBag.log + + unzip Execution_Run_${executionRunRID}.zip + yr=\$(date +'%Y') + mn=\$(date +'%m') + dy=\$(date +'%d') + mv Execution_Run_${executionRunRID} ${repRID}_Output_Bag_\${yr}\${mn}\${dy} + loc=./${repRID}_Output_Bag/data/assets/Study/${studyRID}/Experiment/${expRID}/Replicate/${repRID}/Execution_Run/${executionRunRID}/Output_Files/ + mkdir -p \${loc} + cp runDetails.md \${loc} + cp ${multiqc} \${loc} + cp ${multiqcJSON} \${loc} + + bdbag ./${repRID}_Output_Bag/ --update --archiver zip --debug + echo LOG: output bag created >> ${repRID}.outputBag.log + """ +} + +/* + * uploadOutputBag: uploads the output bag +*/ +process uploadOutputBag { + tag "${repRID}" + + input: + path script_uploadOutputBag + path credential, stageAs: "credential.json" from deriva_uploadOutputBag + path outputBag + val studyRID from studyRID_uploadOutputBag + val executionRunRID from executionRunRID_uploadOutputBag + val fastqCountError_uploadOutputBag + val fastqReadError_uploadOutputBag + val speciesError_uploadOutputBag + val pipelineError_uploadOutputBag + + output: + path ("outputBagRID.csv") into outputBagRID_fl + + when: + upload + fastqCountError_uploadOutputBag == 'false' + fastqReadError_uploadOutputBag == 'false' + speciesError_uploadOutputBag == 'false' + pipelineError_uploadOutputBag == 'false' + + script: + """ + hostname > ${repRID}.uploadOutputBag.log + ulimit -a >> ${repRID}.uploadOutputBag.log + + yr=\$(date +'%Y') + mn=\$(date +'%m') + dy=\$(date +'%d') + + file=\$(basename -a ${outputBag}) + md5=\$(md5sum ./\${file} | awk '{ print \$1 }') + echo LOG: ${repRID} output bag md5 sum - \${md5} >> ${repRID}.uploadOutputBag.log + size=\$(wc -c < ./\${file}) + echo LOG: ${repRID} output bag size - \${size} bytes >> ${repRID}.uploadOutputBag.log + + exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Output_Bag/File_MD5=\${md5}) + if [ "\${exist}" == "[]" ] + then + cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') + cookie=\${cookie:11:-1} + + loc=\$(deriva-hatrac-cli --host ${source} put ./\${file} /hatrac/resources/rnaseq/pipeline/output_bag/study/${studyRID}/replicate/${repRID}/\${file} --parents) + outputBag_rid=\$(python3 ${script_uploadOutputBag} -e ${executionRunRID} -f \${file} -l \${loc} -s \${md5} -b \${size} -o ${source} -c \${cookie}) + echo LOG: output bag RID uploaded - \${outputBag_rid} >> ${repRID}.uploadOutputBag.log + rid=\${outputBag_rid} + else + exist=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + exist=\${exist:8:-6} + echo LOG: output bag RID already exists - \${exist} >> ${repRID}.uploadOutputBag.log + rid=\${exist} + fi + + echo "\${rid}" > outputBagRID.csv + """ +} + +// Extract output bag RID into channel +outputBagRID = Channel.create() +outputBagRID_fl.splitCsv(sep: ",", header: false).separate( + outputBagRID +) + +/* + * finalizeExecutionRun: finalizes the execution run +*/ +process finalizeExecutionRun { + tag "${repRID}" + + input: + path script_uploadExecutionRun_finalizeExecutionRun + path credential, stageAs: "credential.json" from deriva_finalizeExecutionRun + val executionRunRID from executionRunRID_finalizeExecutionRun + val inputBagRID from inputBagRID_finalizeExecutionRun + val outputBagRID + + 
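uploadProcessedFile finishes by re-manifesting the downloaded execution-run bag and archiving it with bdbag; a minimal sketch of that last step (the bag directory name is a placeholder and is assumed to already contain the staged payload under data/):

bagDir="Q-Y5F6_Output_Bag_20210101"
cp runDetails.md "${bagDir}/data/"             # payload assumed to be staged already
bdbag "./${bagDir}/" --update --archiver zip   # refresh manifests in place, then write ${bagDir}.zip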
when: + upload + + script: + """ + hostname > ${repRID}.finalizeExecutionRun.log + ulimit -a >> ${repRID}.finalizeExecutionRun.log + + executionRun=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/RID=${executionRunRID}) + workflow=\$(echo \${executionRun} | grep -o '\\"Workflow\\":.*\\"Reference' | grep -oP '(?<=\\"Workflow\\":\\").*(?=\\",\\"Reference)') + genome=\$(echo \${executionRun} | grep -o '\\"Reference_Genome\\":.*\\"Input_Bag' | grep -oP '(?<=\\"Reference_Genome\\":\\").*(?=\\",\\"Input_Bag)') + + cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') + cookie=\${cookie:11:-1} + + rid=\$(python3 ${script_uploadExecutionRun_finalizeExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Success -d 'Run Successful' -o ${source} -c \${cookie} -u ${executionRunRID}) + echo LOG: execution run RID marked as successful - \${rid} >> ${repRID}.finalizeExecutionRun.log + + dt=`date +%FT%T.%3N%:z` + curl -H 'Content-Type: application/json' -X PUT -d \ + '{ \ + "ID": "${workflow.sessionId}", \ + "Complete": "\${dt}" \ + }' \ + "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track" + """ +} + +/* + * failPreExecutionRun: fail the execution run prematurely +*/ +process failPreExecutionRun { + tag "${repRID}" + + input: + path script_uploadExecutionRun_failPreExecutionRun + path credential, stageAs: "credential.json" from deriva_failPreExecutionRun + val spike from spikeMeta_failPreExecutionRun + val species from speciesMeta_failPreExecutionRun + val inputBagRID from inputBagRID_failPreExecutionRun + val fastqCountError from fastqCountError_failPreExecutionRun + val fastqCountError_details + val fastqReadError from fastqReadError_failPreExecutionRun + val fastqReadError_details + val speciesError from speciesError_failPreExecutionRun + val speciesError_details + + when: + upload + fastqCountError == 'true' || fastqReadError == 'true' || speciesError == 'true' script: """ - mkdir Replicate_${repRID}.outputBag - cp ${multiqc} Replicate_${repRID}.outputBag - cp ${multiqcJSON} Replicate_${repRID}.outputBag - bdbag Replicate_${repRID}.outputBag --archiver zip + hostname > ${repRID}.failPreExecutionRun.log + ulimit -a >> ${repRID}.failPreExecutionRun.log + + errorDetails="" + if [ ${fastqCountError} == true ] + then + errorDetails=\$(echo ${fastqCountError_details}"\\n") + elif [ ${fastqReadError} == true ] + then + errorDetails=\$(echo \${errorDetails}${fastqReadError_details}"\\n") + elif [ ${speciesError} == true ] + then + errorDetails=\$(echo \${errorDetails}${speciesError_details}"\\n") + fi + + echo LOG: searching for workflow RID - BICF mRNA ${workflow.manifest.version} >> ${repRID}.failPreExecutionRun.log + workflow=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Workflow/Name=BICF%20mRNA%20Replicate/Version=${workflow.manifest.version}) + workflow=\$(echo \${workflow} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + workflow=\${workflow:7:-6} + echo LOG: workflow RID extracted - \${workflow} >> ${repRID}.failPreExecutionRun.log + + if [ "${species}" == "Homo sapiens" ] + then + genomeName=\$(echo GRCh${refHuVersion}) + elif [ "${species}" == "Mus musculus" ] + then + genomeName=\$(echo GRCm${refMoVersion}) + fi + if [ "${spike}" == "yes" ] + then + genomeName=\$(echo \${genomeName}-S) + fi + echo LOG: searching for genome name - \${genomeName} >> ${repRID}.failPreExecutionRun.log + genome=\$(curl -s 
https://${source}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Name=\${genomeName}) + genome=\$(echo \${genome} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + genome=\${genome:7:-6} + echo LOG: genome RID extracted - \${genome} >> ${repRID}.failPreExecutionRun.log + + cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') + cookie=\${cookie:11:-1} + + exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=\${workflow}/Replicate=${repRID}/Input_Bag=${inputBagRID}) + echo \${exist} >> ${repRID}.failPreExecutionRun.log + if [ "\${exist}" == "[]" ] + then + rid=\$(python3 ${script_uploadExecutionRun_failPreExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d "\${errorDetails}" -o ${source} -c \${cookie} -u F) + echo LOG: execution run RID uploaded - \${rid} >> ${repRID}.failPreExecutionRun.log + else + rid=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + rid=\${rid:7:-6} + echo \${rid} >> ${repRID}.failPreExecutionRun.log + executionRun_rid=\$(python3 ${script_uploadExecutionRun_failPreExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d "\${errorDetails}" -o ${source} -c \${cookie} -u \${rid}) + echo LOG: execution run RID updated - \${executionRun_rid} >> ${repRID}.failPreExecutionRun.log + fi + + dt=`date +%FT%T.%3N%:z` + curl -H 'Content-Type: application/json' -X PUT -d \ + '{ \ + "ID": "${workflow.sessionId}", \ + "Failure": "\${dt}" \ + }' \ + "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track" + """ +} + +/* + * failExecutionRun: fail the execution run +*/ +process failExecutionRun { + tag "${repRID}" + + input: + path script_uploadExecutionRun_failExecutionRun + path credential, stageAs: "credential.json" from deriva_failExecutionRun + val executionRunRID from executionRunRID_failExecutionRun + val inputBagRID from inputBagRID_failExecutionRun + val endsMeta from endsMeta_failExecutionRun + val endsRaw + val strandedMeta from strandedMeta_failExecutionRun + val spikeMeta from spikeMeta_failExecutionRun + val speciesMeta from speciesMeta_failExecutionRun + val endsInfer from endsInfer_failExecutionRun + val strandedInfer from strandedInfer_failExecutionRun + val spikeInfer from spikeInfer_failExecutionRun + val speciesInfer from speciesInfer_failExecutionRun + val pipelineError from pipelineError_failExecutionRun + val pipelineError_ends + val pipelineError_stranded + val pipelineError_spike + val pipelineError_species + + when: + upload + pipelineError == 'true' + + script: """ -} \ No newline at end of file + hostname > ${repRID}.failExecutionRun.log + ulimit -a >> ${repRID}.failExecutionRun.log + + executionRun=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/RID=${executionRunRID}) + workflow=\$(echo \${executionRun} | grep -o '\\"Workflow\\":.*\\"Reference' | grep -oP '(?<=\\"Workflow\\":\\").*(?=\\",\\"Reference)') + genome=\$(echo \${executionRun} | grep -o '\\"Reference_Genome\\":.*\\"Input_Bag' | grep -oP '(?<=\\"Reference_Genome\\":\\").*(?=\\",\\"Input_Bag)') + + cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') + cookie=\${cookie:11:-1} + + errorDetails="" + if [ ${pipelineError} == false ] + then + rid=\$(python3 ${script_uploadExecutionRun_failExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Success -d 'Run Successful' -o ${source} -c \${cookie} -u ${executionRunRID}) + echo LOG: execution run RID 
marked as successful - \${rid} >> ${repRID}.failExecutionRun.log + else + pipelineError_details=\$(echo "**Submitted metadata does not match inferred:**\\n") + pipelineError_details=\$(echo \${pipelineError_details}"|Metadata|Submitted value|Inferred value|\\n") + pipelineError_details=\$(echo \${pipelineError_details}"|:-:|-:|-:|\\n") + if ${pipelineError_ends} + then + if [ "${endsInfer}" == "se" ] + then + endInfer="Single End" + elif [ "${endsInfer}" == "pe" ] + then + endInfer="Paired End" + else + endInfer="unknown" + fi + pipelineError_details=\$(echo \${pipelineError_details}"|Paired End|${endsRaw}|"\${endInfer}"|\\n") + fi + if ${pipelineError_stranded} + then + pipelineError_details=\$(echo \${pipelineError_details}"|Strandedness|${strandedMeta}|${strandedInfer}|\\n") + fi + if ${pipelineError_spike} + then + pipelineError_details=\$(echo \${pipelineError_details}"|Used Spike Ins|${spikeMeta}|${spikeInfer}|\\n") + fi + if ${pipelineError_species} + then + pipelineError_details=\$(echo \${pipelineError_details}"|Species|${speciesMeta}|${speciesInfer}|\\n") + fi + pipelineError_details=\${pipelineError_details::-2} + rid=\$(python3 ${script_uploadExecutionRun_failExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d "\${pipelineError_details}" -o ${source} -c \${cookie} -u ${executionRunRID}) + echo LOG: execution run RID marked as error - \${rid} >> ${repRID}.failExecutionRun.log + fi + + dt=`date +%FT%T.%3N%:z` + curl -H 'Content-Type: application/json' -X PUT -d \ + '{ \ + "ID": "${workflow.sessionId}", \ + "Failure": "\${dt}" \ + }' \ + "https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track" + """ +} + + +workflow.onError = { + subject = "$workflow.manifest.name FAILED: $params.repRID" + + def msg = """\ + + Pipeline error summary + --------------------------- + RID : ${params.repRID} + Version : ${workflow.manifest.version} + Duration : ${workflow.duration} + Nf Version : ${workflow.nextflow.version} + Message : ${workflow.errorMessage} + exit status : ${workflow.exitStatus} + """ + .stripIndent() + if (email != '') { + sendMail(to: email, subject: subject , body: msg) + } +} diff --git a/workflow/scripts/bdbagFetch.sh b/workflow/scripts/bdbagFetch.sh deleted file mode 100644 index 606b88397d5a6cf4feb4aa38d7615e3e3ba48735..0000000000000000000000000000000000000000 --- a/workflow/scripts/bdbagFetch.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -if [ -z "${3}" ] -then - bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz ${1} - for i in $(find */ -name "*R*.fastq.gz") - do - path=${2}.$(echo ${i##*/} | grep -o "R[1,2].fastq.gz") - cp ${i} ./${path} - done -elif [ "${3}" == "TEST" ] -then - bdbag --resolve-fetch all --fetch-filter filename\$*.txt ${1} -fi diff --git a/workflow/scripts/bdbag_fetch.sh b/workflow/scripts/bdbag_fetch.sh new file mode 100644 index 0000000000000000000000000000000000000000..c34dc756d0cc5a47382fb9f96267e378c19ae79a --- /dev/null +++ b/workflow/scripts/bdbag_fetch.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +bdbag --materialize ${1} --debug +validate="" +bdbag --validate full ${1} 2> validate.txt +validate=$(tail -n1 validate.txt | grep -o 'is valid') +if [ "${validate}" != "is valid" ] +then + n=0 + until [ "${n}" -ge "3" ] + do + bdbag --resolve-fetch missing --validate full ${1} --debug && validate=$(tail -n validate.txt | grep -o 'is valid') && break + n=$((n+1)) + sleep 15 + done +fi +if [ "${validate}" != "is valid" ] +then + exit 1 +fi +for i in $(find */ -name "*R*.fastq.gz") +do + path=${2}.$(echo 
${i##*/} | grep -o "R[1,2].fastq.gz") + cp ${i} ./${path} +done \ No newline at end of file diff --git a/workflow/scripts/convertGeneSymbols.R b/workflow/scripts/convertGeneSymbols.R index 49752f1bba5a4dd8d91ed8609c6f2f82b8fafacc..6cc5c0a1089881ae1dfd32e248a1ffbcbcd7b24a 100644 --- a/workflow/scripts/convertGeneSymbols.R +++ b/workflow/scripts/convertGeneSymbols.R @@ -23,4 +23,4 @@ output <- merge(x=convert,y=countTable[,c("gene_name","gene_id","count","tpm")], colnames(output) <- c("GENCODE_Gene_Symbol","NCBI_GeneID","Ensembl_GeneID","count","tpm") output <- output[,c(1,3,2,4:5)] -write.table(output,file=paste0(opt$repRID,".tpmTable.csv"),sep=",",row.names=FALSE,quote=FALSE) +write.table(output,file=paste0(opt$repRID,"_tpmTable.csv"),sep=",",row.names=FALSE,quote=FALSE) diff --git a/workflow/scripts/delete_entry.py b/workflow/scripts/delete_entry.py new file mode 100644 index 0000000000000000000000000000000000000000..1b26509a8c1541f65a39e660ec7e6ec158194ef1 --- /dev/null +++ b/workflow/scripts/delete_entry.py @@ -0,0 +1,37 @@ +import argparse +from deriva.core import ErmrestCatalog, get_credential, BaseCLI +import sys +import csv + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-r', '--RID', help="replicate RID", required=True) + parser.add_argument('-t', '--table', help="source table", required=True) + parser.add_argument('-o', '--host', help="datahub host", required=True) + parser.add_argument('-c', '--cookie', help="cookie token", required=True) + args = parser.parse_args() + return args + +def main(hostname, catalog_number, credential): + catalog = ErmrestCatalog('https', hostname, catalog_number, credential) + pb = catalog.getPathBuilder() + if args.table == 'mRNA_QC': + run_table = pb.RNASeq.mRNA_QC + elif args.table == "Processed_File": + run_table = pb.RNASeq.Processed_File + + path = run_table.filter(run_table.RID == args.RID) + path.delete() + rid = args.RID + + + print(rid + " deleted") + + +if __name__ == '__main__': + args = get_args() + cli = BaseCLI("Custom RNASeq query", None, 1) + cli.remove_options(["--config-file"]) + host = args.host + credentials = {"cookie": args.cookie} + main(host, 2, credentials) \ No newline at end of file diff --git a/workflow/scripts/extract_ref_data.py b/workflow/scripts/extract_ref_data.py new file mode 100644 index 0000000000000000000000000000000000000000..bf06c95696354e6fe5d445718fbfb0769df4182d --- /dev/null +++ b/workflow/scripts/extract_ref_data.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 + +import argparse +import pandas as pd +import warnings +warnings.simplefilter(action='ignore', category=FutureWarning) + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-r', '--returnParam', + help="The parameter to return (URL or MD5).", required=True) + args = parser.parse_args() + return args + + +def main(): + args = get_args() + refQuery = pd.read_json("refQuery.json") + if refQuery["File_URL"].count() == 1: + if args.returnParam == "URL": + print(refQuery["File_URL"].values[0]) + elif args.returnParam == "fName": + print(refQuery["File_Name"].values[0]) + elif args.returnParam == "MD5": + print(refQuery["File_MD5"].values[0]) + else: + raise Exception("Multple references found: \n%s" % + refQuery["RID"]) + + +if __name__ == '__main__': + main() diff --git a/workflow/scripts/generate_references.py b/workflow/scripts/generate_references.py new file mode 100644 index 0000000000000000000000000000000000000000..8e809f1e4471d3393ec4960778d2a210d54c11d1 --- /dev/null +++ 
b/workflow/scripts/generate_references.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +#generate_references.py +#* +#* -------------------------------------------------------------------------- +#* Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/cellranger_count/blob/develop/LICENSE) +#* -------------------------------------------------------------------------- +#* + +import argparse +import subprocess +import shlex +import logging + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + + +def get_args(): + '''Define arguments.''' + + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-r', '--reference', + help="The reference file (markdown format).", + required=True) + + parser.add_argument('-o', '--output', + help="The out file name.", + default='references') + + args = parser.parse_args() + return args + + +def main(): + args = get_args() + reference = args.reference + output = args.output + + out_filename = output + '_mqc.yaml' + + # Header for HTML + print( + ''' + id: 'software_references' + section_name: 'Software References' + description: 'This section describes references for the tools used.' + plot_type: 'html' + data: | + ''' + , file = open(out_filename, "w") + ) + + # Turn Markdown into HTML + references_html = 'bash -c "pandoc -p {} | sed \'s/^/ /\' >> {}"' + references_html = references_html.format(reference, out_filename) + subprocess.check_call(shlex.split(references_html)) + + +if __name__ == '__main__': + main() diff --git a/workflow/scripts/generate_versions.py b/workflow/scripts/generate_versions.py new file mode 100644 index 0000000000000000000000000000000000000000..09447d17a62a439a418753398e1cd77716ceaa74 --- /dev/null +++ b/workflow/scripts/generate_versions.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# +# * -------------------------------------------------------------------------- +# * Licensed under MIT (https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/blob/master/LICENSE) +# * -------------------------------------------------------------------------- +# + +'''Make YAML of software versions.''' + +from __future__ import print_function +from collections import OrderedDict +import re +import os +import logging +import glob +import argparse +import numpy as np + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +# SETTINGS + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + +SOFTWARE_REGEX = { + 'Python': ['version_python.txt', r"Python (\S+)"], + 'DERIVA': ['version_deriva.txt', r"(\S+)"], + 'BDBag': ['version_bdbag.txt', r"BDBag (\S+) \(Bagit \S+\)"], + 'RSeQC': ['version_rseqc.txt', r"infer_experiment.py (\S+)"], + 'Trim Galore!': ['version_trimgalore.txt', r"version (\S+)"], + 'HISAT2': ['version_hisat2.txt', r"version (\S+)"], + 'Samtools': ['version_samtools.txt', r"samtools (\S+)"], + 'picard (MarkDuplicates)': ['version_markdups.txt', r"Version:(\S+)"], + 'featureCounts': ['version_featurecounts.txt', r"featureCounts v(\S+)"], + 'R': ['version_r.txt', r"R version (\S+)"], + 'deepTools': ['version_deeptools.txt', r"deeptools (\S+)"], + 'FastQC': ['version_fastqc.txt', r"FastQC v(\S+)"], + 'MultiQC': ['version_multiqc.txt', r"multiqc, version (\S+)"], + 'Pipeline Version': 
['./workflow/nextflow.config', r"version = 'v(\S+)'"] +} + + +def get_args(): + '''Define arguments.''' + + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-o', '--output', + help="The out file name.", + required=True) + + parser.add_argument('-t', '--test', + help='Used for testing purposes', + default=False, + action='store_true') + + args = parser.parse_args() + return args + + +def check_files(files, test): + '''Check if version files are found.''' + + logger.info("Running file check.") + + software_files = np.array(list(SOFTWARE_REGEX.values()))[:,0] + + extra_files = set(files) - set(software_files) + + if len(extra_files) > 0 and test: + logger.error('Missing regex: %s', list(extra_files)) + raise Exception("Missing regex: %s" % list(extra_files)) + + +def main(): + args = get_args() + output = args.output + test = args.test + + out_filename = output + '_mqc.yaml' + + results = OrderedDict() + results['Python'] = '<span style="color:#999999;\">Not Run</span>' + results['DERIVA'] = '<span style="color:#999999;\">Not Run</span>' + results['BDBag'] = '<span style="color:#999999;\">Not Run</span>' + results['RSeQC'] = '<span style="color:#999999;\">Not Run</span>' + results['Trim Galore!'] = '<span style="color:#999999;\">Not Run</span>' + results['HISAT2'] = '<span style="color:#999999;\">Not Run</span>' + results['Samtools'] = '<span style="color:#999999;\">Not Run</span>' + results['picard (MarkDuplicates)'] = '<span style="color:#999999;\">Not Run</span>' + results['featureCounts'] = '<span style="color:#999999;\">Not Run</span>' + results['R'] = '<span style="color:#999999;\">Not Run</span>' + results['deepTools'] = '<span style="color:#999999;\">Not Run</span>' + results['FastQC'] = '<span style="color:#999999;\">Not Run</span>' + results['MultiQC'] = '<span style="color:#999999;\">Not Run</span>' + results['Pipeline Version'] = '<span style="color:#999999;\">Not Run</span>' + + # list all files + files = glob.glob('**/*.txt', recursive=True) + + # Check for version files: + check_files(files, test) + + # Search each file using its regex + for k, v in SOFTWARE_REGEX.items(): + if os.path.isfile(v[0]): + with open(v[0]) as x: + versions = x.read() + match = re.search(v[1], versions) + if match: + results[k] = "v{}".format(match.group(1)) + + # Dump to YAML + print( + ''' + id: 'software_versions' + section_name: 'Software Versions' + section_href: 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/-/blob/78-tool_version/docs/RNA-Seq%20Pipeline%20Design%20Process%20Table.pdf' + plot_type: 'html' + description: 'are collected for pipeline version.' 
+ data: | + <dl class="dl-horizontal"> + ''' + , file = open(out_filename, "w")) + + for k, v in results.items(): + print(" <dt>{}</dt><dd>{}</dd>".format(k, v), file = open(out_filename, "a")) + print(" </dl>", file = open(out_filename, "a")) + + +if __name__ == '__main__': + main() diff --git a/workflow/scripts/get_updated_badge_info.sh b/workflow/scripts/get_updated_badge_info.sh new file mode 100644 index 0000000000000000000000000000000000000000..4b929272f2ea80ede5d47b84cd55bad2c6a3fa7b --- /dev/null +++ b/workflow/scripts/get_updated_badge_info.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +echo "collecting stats for badges" +latest_release_tag=$(git tag --sort=-committerdate -l *.*.* | head -1) +current_pipeline_version=$(git show ${latest_release_tag}:workflow/nextflow.config | grep -o version.* | grep -oP "(?<=').*(?=')") +current_nextflow_version=$(git show ${latest_release_tag}:workflow/nextflow.config | grep -o nextflowVersion.* | grep -oP "(?<=').*(?=')") +master_pipeline_version=$(git show origin/master:workflow/nextflow.config | grep -o version.* | grep -oP "(?<=').*(?=')") +master_nextflow_version=$(git show origin/master:workflow/nextflow.config | grep -o nextflowVersion.* | grep -oP "(?<=').*(?=')") +develop_pipeline_version=$(git show origin/develop:workflow/nextflow.config | grep -o version.* | grep -oP "(?<=').*(?=')") +develop_nextflow_version=$(git show origin/develop:workflow/nextflow.config | grep -o nextflowVersion.* | grep -oP "(?<=').*(?=')") + +echo "collecting tool version for badges" +python_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o Python.* | grep -oP "(?<=d>).*(?=\<)") +deriva_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o DERIVA.* | grep -oP "(?<=d>).*(?=\<)") +bdbag_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o BDBag.* | grep -oP "(?<=d>).*(?=\<)") +rseqc_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o RSeQC.* | grep -oP "(?<=d>).*(?=\<)") +trimgalore_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o 'Trim Galore!'.* | grep -oP "(?<=d>).*(?=\<)") +hisat2_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o HISAT2.* | grep -oP "(?<=d>).*(?=\<)") +samtools_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o Samtools.* | grep -oP "(?<=d>).*(?=\<)") +picard_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o 'picard (MarkDuplicates)'.* | grep -oP "(?<=d>).*(?=\<)") +featurecounts_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o featureCounts.* | grep -oP "(?<=d>).*(?=\<)") +r_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o '>R<'.* | grep -oP "(?<=d>).*(?=\<)") +deeptools_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o deepTools.* | grep -oP "(?<=d>).*(?=\<)") +fastqc_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o FastQC.* | grep -oP "(?<=d>).*(?=\<)") +multiqc_version=$(git show ${latest_release_tag}:docs/software_versions_mqc.yaml | grep -o MultiQC.* | grep -oP "(?<=d>).*(?=\<)") + +echo "collecting badges" +mkdir -p ./badges/tools +curl --request GET https://img.shields.io/badge/Latest%20Release-${latest_release_tag}-informational?style=flat > ./badges/release.svg +curl --request GET 
https://img.shields.io/badge/Pipeline%20Version-${current_pipeline_version}-informational?style=flat > ./badges/releasePipeline.svg +curl --request GET https://img.shields.io/badge/Nextflow%20Version-${current_nextflow_version}-informational?style=flat > ./badges/releaseNextflow.svg +curl --request GET https://img.shields.io/badge/Pipeline%20Version-${master_pipeline_version}-informational?style=flat > ./badges/masterPipeline.svg +curl --request GET https://img.shields.io/badge/Nextflow%20Version-${master_nextflow_version}-informational?style=flat > ./badges/masterNextflow.svg +curl --request GET https://img.shields.io/badge/Pipeline%20Version-${develop_pipeline_version}-informational?style=flat > ./badges/developPipeline.svg +curl --request GET https://img.shields.io/badge/Nextflow%20Version-${develop_nextflow_version}-informational?style=flat > ./badges/developNextflow.svg + +curl --request GET https://img.shields.io/badge/Python%20Version-${python_version}-blueviolet?style=flat > ./badges/tools/python.svg +curl --request GET https://img.shields.io/badge/DERIVA%20Version-${deriva_version}-blueviolet?style=flat > ./badges/tools/deriva.svg +curl --request GET https://img.shields.io/badge/BDBag%20Version-${bdbag_version}-blueviolet?style=flat > ./badges/tools/bdbag.svg +curl --request GET https://img.shields.io/badge/RSeQC%20Version-${rseqc_version}-blueviolet?style=flat > ./badges/tools/rseqc.svg +curl --request GET https://img.shields.io/badge/Trim%20Galore%20Version-${trimgalore_version}-blueviolet?style=flat > ./badges/tools/trimgalore.svg +curl --request GET https://img.shields.io/badge/HISAT2%20Version-${hisat2_version}-blueviolet?style=flat > ./badges/tools/hisat2.svg +curl --request GET https://img.shields.io/badge/Samtools%20Version-${samtools_version}-blueviolet?style=flat > ./badges/tools/samtools.svg +curl --request GET https://img.shields.io/badge/picard%20Version-${picard_version}-blueviolet?style=flat > ./badges/tools/picard.svg +curl --request GET https://img.shields.io/badge/featureCounts%20Version-${featurecounts_version}-blueviolet?style=flat > ./badges/tools/featurecounts.svg +curl --request GET https://img.shields.io/badge/R%20Version-${r_version}-blueviolet?style=flat > ./badges/tools/r.svg +curl --request GET https://img.shields.io/badge/deepTools%20Version-${deeptools_version}-blueviolet?style=flat > ./badges/tools/deeptools.svg +curl --request GET https://img.shields.io/badge/FastQC%20Version-${fastqc_version}-blueviolet?style=flat > ./badges/tools/fastqc.svg +curl --request GET https://img.shields.io/badge/MultiQC%20Version-${multiqc_version}-blueviolet?style=flat > ./badges/tools/multiqc.svg \ No newline at end of file diff --git a/workflow/scripts/get_updated_rep_count.sh b/workflow/scripts/get_updated_rep_count.sh new file mode 100644 index 0000000000000000000000000000000000000000..be054ea1a8531cb2166436f21aab2e42a09065f4 --- /dev/null +++ b/workflow/scripts/get_updated_rep_count.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +echo "collecting stats for badges" +latest_release_tag=$(git tag --sort=-committerdate -l *.*.* | head -1) +current_pipeline_version=$(git show ${latest_release_tag}:workflow/nextflow.config | grep -o version.* | grep -oP "(?<=').*(?=')") + +echo "collecting workflow RIDs from servers" +dev_workflow_RID=$(curl -s https://dev.gudmap.org/ermrest/catalog/2/entity/RNASeq:Workflow/Version=${current_pipeline_version} | grep -o '\"RID\":\".*\",\"RCT') +dev_workflow_RID=${dev_workflow_RID:7:-6} +staging_workflow_RID=$(curl -s 
https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Workflow/Version=${current_pipeline_version} | grep -o '\"RID\":\".*\",\"RCT') +staging_workflow_RID=${staging_workflow_RID:7:-6} +prod_workflow_RID=$(curl -s https://www.gudmap.org/ermrest/catalog/2/entity/RNASeq:Workflow/Version=${current_pipeline_version} | grep -o '\"RID\":\".*\",\"RCT') +prod_workflow_RID=${prod_workflow_RID:7:-6} + +echo "collecting unique replicates with successful execution runs" +dev_count=$(curl -s https://dev.gudmap.org/ermrest/catalog/2/entity/RNASeq:Execution_Run/Execution_Status=Success/Workflow=${dev_workflow_RID} | grep -o \"Replicate\".*,\"Workflow | grep -oP "(?<=\"Replicate\":\").*(?=\",\"Workflow)" | sort | uniq | wc -l) +staging_count=$(curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Execution_Run/Execution_Status=Success/Workflow=${staging_workflow_RID} | grep -o \"Replicate\".*,\"Workflow | grep -oP "(?<=\"Replicate\":\").*(?=\",\"Workflow)" | sort | uniq | wc -l) +prod_count=$(curl -s https://www.gudmap.org/ermrest/catalog/2/entity/RNASeq:Execution_Run/Execution_Status=Success/Workflow=${prod_workflow_RID} | grep -o \"Replicate\".*,\"Workflow | grep -oP "(?<=\"Replicate\":\").*(?=\",\"Workflow)" | sort | uniq | wc -l) + +echo "collecting badges" +mkdir -p ./badges/counts +curl --request GET https://img.shields.io/badge/Development%20Replicate%20Count-${dev_count}-lightgrey?style=flat > ./badges/counts/dev_counts.svg +curl --request GET https://img.shields.io/badge/Staging%20Replicate%20Count-${staging_count}-lightgrey?style=flat > ./badges/counts/staging_counts.svg +curl --request GET https://img.shields.io/badge/Production%20Replicate%20Count-${prod_count}-lightgrey?style=flat > ./badges/counts/prod_counts.svg diff --git a/workflow/scripts/inferMeta.sh b/workflow/scripts/infer_meta.sh similarity index 100% rename from workflow/scripts/inferMeta.sh rename to workflow/scripts/infer_meta.sh diff --git a/workflow/scripts/parseMeta.py b/workflow/scripts/parse_meta.py similarity index 73% rename from workflow/scripts/parseMeta.py rename to workflow/scripts/parse_meta.py index 500054264f7c8c50310da92d9995f432930318d3..12cc7c7233b94509e5c3a7307e8ef7985a94a958 100644 --- a/workflow/scripts/parseMeta.py +++ b/workflow/scripts/parse_meta.py @@ -5,62 +5,66 @@ import pandas as pd import warnings warnings.simplefilter(action='ignore', category=FutureWarning) + def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('-r', '--repRID',help="The replicate RID.",required=True) - parser.add_argument('-m', '--metaFile',help="The metadata file to extract.",required=True) - parser.add_argument('-p', '--parameter',help="The parameter to extract.",required=True) + parser.add_argument( + '-r', '--repRID', help="The replicate RID.", required=True) + parser.add_argument('-m', '--metaFile', + help="The metadata file to extract.", required=True) + parser.add_argument('-p', '--parameter', + help="The parameter to extract.", required=True) args = parser.parse_args() return args def main(): args = get_args() - metaFile = pd.read_csv(args.metaFile,sep=",",header=0) + metaFile = pd.read_csv(args.metaFile, sep=",", header=0) # Check replicate RID metadata from 'File.csv' if (args.parameter == "repRID"): if (len(metaFile.Replicate_RID.unique()) > 1): - print("There are multiple replicate RID's in the metadata: " + " ".join(metaFile.Replicate_RID.unique())) + print("There are multiple replicate RID's in the metadata: " + + " ".join(metaFile.Replicate_RID.unique())) exit(1) if not 
(metaFile.Replicate_RID.unique() == args.repRID): - print("Replicate RID in metadata does not match run parameters: " + metaFile.Replicate_RID.unique() + " vs " + args.repRID) + print("Replicate RID in metadata does not match run parameters: " + + metaFile.Replicate_RID.unique() + " vs " + args.repRID) exit(1) else: - rep=metaFile["Replicate_RID"].unique()[0] + rep = metaFile["Replicate_RID"].unique()[0] print(rep) if (len(metaFile[metaFile["File_Type"] == "FastQ"]) > 2): - print("There are more then 2 fastq's in the metadata: " + " ".join(metaFile[metaFile["File_Type"] == "FastQ"].RID)) + print("There are more then 2 fastq's in the metadata: " + + " ".join(metaFile[metaFile["File_Type"] == "FastQ"].RID)) exit(1) # Check experiment RID metadata from 'Experiment.csv' if (args.parameter == "expRID"): if (len(metaFile.Experiment_RID.unique()) > 1): - print("There are multiple experoment RID's in the metadata: " + " ".join(metaFile.Experiment_RID.unique())) + print("There are multiple experoment RID's in the metadata: " + + " ".join(metaFile.Experiment_RID.unique())) exit(1) else: - exp=metaFile["Experiment_RID"].unique()[0] + exp = metaFile["Experiment_RID"].unique()[0] print(exp) # Check study RID metadata from 'Experiment.csv' if (args.parameter == "studyRID"): if (len(metaFile.Study_RID.unique()) > 1): - print("There are multiple study RID's in the metadata: " + " ".join(metaFile.Study_RID.unique())) + print("There are multiple study RID's in the metadata: " + + " ".join(metaFile.Study_RID.unique())) exit(1) else: - study=metaFile["Study_RID"].unique()[0] + study = metaFile["Study_RID"].unique()[0] print(study) - + # Get endedness metadata from 'Experiment Settings.csv' if (args.parameter == "endsMeta"): - if (metaFile.Paired_End.unique() == "Single End"): - endsMeta = "se" - elif (metaFile.Paired_End.unique() == "Paired End"): - endsMeta = "pe" - else: - endsMeta = "uk" + endsMeta = metaFile.Paired_End.unique()[0] print(endsMeta) - + # Manually get endness count from 'File.csv' if (args.parameter == "endsManual"): if (len(metaFile[metaFile["File_Type"] == "FastQ"]) == 1): @@ -68,7 +72,7 @@ def main(): elif (len(metaFile[metaFile["File_Type"] == "FastQ"]) == 2): endsManual = "pe" print(endsManual) - + # Get strandedness metadata from 'Experiment Settings.csv' if (args.parameter == "stranded"): if (metaFile.Has_Strand_Specific_Information.unique() == "yes"): @@ -76,10 +80,11 @@ def main(): elif (metaFile.Has_Strand_Specific_Information.unique() == "no"): stranded = "unstranded" else: - print("Stranded metadata not match expected options: " + metaFile.Has_Strand_Specific_Information.unique()) + print("Stranded metadata not match expected options: " + + metaFile.Has_Strand_Specific_Information.unique()) exit(1) print(stranded) - + # Get spike-in metadata from 'Experiment Settings.csv' if (args.parameter == "spike"): if (metaFile.Used_Spike_Ins.unique() == "yes"): @@ -87,7 +92,8 @@ def main(): elif (metaFile.Used_Spike_Ins.unique() == "no"): spike = "no" else: - print("Spike-ins metadata not match expected options: " + metaFile.Used_Spike_Ins.unique()) + print("Spike-ins metadata not match expected options: " + + metaFile.Used_Spike_Ins.unique()) exit(1) print(spike) @@ -98,7 +104,8 @@ def main(): elif (metaFile.Species.unique() == "Homo sapiens"): species = "Homo sapiens" else: - print("Species metadata not match expected options: " + metaFile.Species.unique()) + print("Species metadata not match expected options: " + + metaFile.Species.unique()) exit(1) print(species) @@ -107,5 +114,6 @@ def 
main(): readLength = metaFile.Read_Length.unique() print(str(readLength).strip('[]')) + if __name__ == '__main__': main() diff --git a/workflow/scripts/splitStudy.py b/workflow/scripts/splitStudy.py deleted file mode 100644 index 82ffc2881857dd5d1d27eee5ea6a381b02d0e9f5..0000000000000000000000000000000000000000 --- a/workflow/scripts/splitStudy.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import pandas as pd -import warnings -warnings.simplefilter(action='ignore', category=FutureWarning) - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('-s', '--studyRID',help="The study RID.",required=True) - args = parser.parse_args() - return args - -def main(): - args = get_args() - studyRID=pd.read_json(args.studyRID+"_studyRID.json") - if studyRID["RID"].count() > 0: - studyRID["RID"].to_csv(args.studyRID+"_studyRID.csv",header=False,index=False) - else: - raise Exception("No associated replicates found: %s" % - studyRID) - -if __name__ == '__main__': - main() diff --git a/workflow/scripts/splitStudy.sh b/workflow/scripts/splitStudy.sh deleted file mode 100644 index 1f82af6132dad6148adf506a34769c0af1fe9992..0000000000000000000000000000000000000000 --- a/workflow/scripts/splitStudy.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -#SBATCH -p super -#SBATCH --job-name GUDMAP-RBK_Study -#SBATCH -t 7-0:0:0 - -# query GUDMAP/RBK for study RID -echo "curl --location --request GET 'https://www.gudmap.org/ermrest/catalog/2/entity/RNASeq:Replicate/Study_RID="${1}"'" | bash > $1_studyRID.json - -# extract replicate RIDs -module load python/3.6.4-anaconda -python3 ./workflow/scripts/splitStudy.py -s $1 - -# run pipeline on replicate RIDs in parallel -module load nextflow/20.01.0 -module load singularity/3.5.3 -while read repRID; do echo ${repRID}; sleep 15; done < "$1_studyRID.csv" | xargs -P 5 -I {} nextflow -q run workflow/rna-seq.nf --repRID {} - -# cleanup study RID files -rm $1_studyRID.json -rm $1_studyRID.csv diff --git a/workflow/scripts/tinHist.py b/workflow/scripts/tinHist.py deleted file mode 100644 index 3d292c2eb8cadb3b16466c6b19d0574184d439d7..0000000000000000000000000000000000000000 --- a/workflow/scripts/tinHist.py +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import pandas as pd -import numpy as np -import warnings -warnings.simplefilter(action='ignore', category=FutureWarning) - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('-r', '--repRID',help="The replicate RID.",required=True) - args = parser.parse_args() - return args - -def main(): - args = get_args() - tin = pd.read_csv(args.repRID + '.sorted.deduped.tin.xls',sep="\t",header=0) - - hist = pd.cut(tin['TIN'],bins=pd.interval_range(start=0,freq=10,end=100,closed='right')).value_counts(sort=False) - labels = ["{0} - {1}".format(i, i + 9) for i in range(1, 100, 10)] - #labels[0] = '0 - 10' - binned = tin.assign(Bins=lambda x: pd.cut(tin['TIN'],range(0,105,10),labels=labels,include_lowest=False,right=True)) - binned['chrom'] = binned['chrom'] = binned['chrom'].replace('chr1','chr01') - binned['chrom'] = binned['chrom'].replace('chr2','chr02') - binned['chrom'] = binned['chrom'].replace('chr3','chr03') - binned['chrom'] = binned['chrom'].replace('chr4','chr04') - binned['chrom'] = binned['chrom'].replace('chr5','chr05') - binned['chrom'] = binned['chrom'].replace('chr6','chr06') - binned['chrom'] = binned['chrom'].replace('chr7','chr07') - binned['chrom'] = binned['chrom'].replace('chr8','chr08') - binned['chrom'] = 
binned['chrom'].replace('chr9','chr09') - hist = pd.pivot_table(binned, values='geneID', index = 'Bins', columns = 'chrom', aggfunc=np.size) - hist['TOTAL'] = hist.sum(axis=1) - hist = hist[['TOTAL'] + [ i for i in hist.columns if i != 'TOTAL']] - hist = hist.T.fillna(0.0).astype(int) - #hist = hist.apply(lambda x: x/x.sum()*100, axis=1) - hist.to_csv(args.repRID + '.tin.hist.tsv',sep='\t') - medFile = open(args.repRID + '.tin.med.csv',"w") - medFile.write(str(round(tin['TIN'][(tin['TIN']!=0)].median(),2))) - medFile.close() - -if __name__ == '__main__': - main() diff --git a/workflow/scripts/tin_hist.py b/workflow/scripts/tin_hist.py new file mode 100644 index 0000000000000000000000000000000000000000..ee36bb6447dfe0adcdaab60e1224cca5b5a6e246 --- /dev/null +++ b/workflow/scripts/tin_hist.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 + +import argparse +import pandas as pd +import numpy as np +import warnings +warnings.simplefilter(action='ignore', category=FutureWarning) + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + '-r', '--repRID', help="The replicate RID.", required=True) + args = parser.parse_args() + return args + + +def main(): + args = get_args() + tin = pd.read_csv(args.repRID + '_sorted.deduped.tin.xls', + sep="\t", header=0) + + hist = pd.cut(tin['TIN'], bins=pd.interval_range( + start=0, freq=10, end=100, closed='right')).value_counts(sort=False) + labels = ["{0} - {1}".format(i, i + 9) for i in range(1, 100, 10)] + #labels[0] = '0 - 10' + binned = tin.assign(Bins=lambda x: pd.cut(tin['TIN'], range( + 0, 105, 10), labels=labels, include_lowest=False, right=True)) + binned['chrom'] = binned['chrom'] = binned['chrom'].replace( + 'chr1', 'chr01') + binned['chrom'] = binned['chrom'].replace('chr2', 'chr02') + binned['chrom'] = binned['chrom'].replace('chr3', 'chr03') + binned['chrom'] = binned['chrom'].replace('chr4', 'chr04') + binned['chrom'] = binned['chrom'].replace('chr5', 'chr05') + binned['chrom'] = binned['chrom'].replace('chr6', 'chr06') + binned['chrom'] = binned['chrom'].replace('chr7', 'chr07') + binned['chrom'] = binned['chrom'].replace('chr8', 'chr08') + binned['chrom'] = binned['chrom'].replace('chr9', 'chr09') + hist = pd.pivot_table(binned, values='geneID', + index='Bins', columns='chrom', aggfunc=np.size) + hist['TOTAL'] = hist.sum(axis=1) + hist = hist[['TOTAL'] + [i for i in hist.columns if i != 'TOTAL']] + hist = hist.T.fillna(0.0).astype(int) + #hist = hist.apply(lambda x: x/x.sum()*100, axis=1) + hist.to_csv(args.repRID + '_tin.hist.tsv', sep='\t') + medFile = open(args.repRID + '_tin.med.csv', "w") + medFile.write(str(round(tin['TIN'][(tin['TIN'] != 0)].median(), 2))) + medFile.close() + + +if __name__ == '__main__': + main() diff --git a/workflow/scripts/upload_execution_run.py b/workflow/scripts/upload_execution_run.py new file mode 100644 index 0000000000000000000000000000000000000000..2e8ea8de7745a3f048b580486f20e25d8904dd0c --- /dev/null +++ b/workflow/scripts/upload_execution_run.py @@ -0,0 +1,62 @@ +import argparse +from deriva.core import ErmrestCatalog, get_credential, BaseCLI +import sys +import csv + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-r', '--repRID', help="replicate RID", required=True) + parser.add_argument('-w', '--workflowRID', help="workflow RID", required=True) + parser.add_argument('-g', '--referenceRID', help="reference genome RID", required=True) + parser.add_argument('-i', '--inputBagRID', help="inputBag RID", required=True) + parser.add_argument('-n', '--notes', 
help="notes", default="", required=False) + parser.add_argument('-s', '--status', help="run status", default="", required=False) + parser.add_argument('-d', '--statusDetail', help="status detail", default="", required=False) + parser.add_argument('-o', '--host', help="datahub host", required=True) + parser.add_argument('-c', '--cookie', help="cookie token", required=True) + parser.add_argument('-u', '--update', help="update?", default="F", required=True) + args = parser.parse_args() + return args + +def main(hostname, catalog_number, credential): + catalog = ErmrestCatalog('https', hostname, catalog_number, credential) + pb = catalog.getPathBuilder() + run_table = pb.RNASeq.Execution_Run + + if args.update == "F": + run_data = { + "Replicate": args.repRID, + "Workflow": args.workflowRID, + "Reference_Genome": args.referenceRID, + "Input_Bag": args.inputBagRID, + "Notes": args.notes, + "Execution_Status": args.status, + "Execution_Status_Detail": args.statusDetail.replace('\\n','\n') + } + entities = run_table.insert([run_data]) + rid = entities[0]["RID"] + else: + run_data = { + "RID": args.update, + "Replicate": args.repRID, + "Workflow": args.workflowRID, + "Reference_Genome": args.referenceRID, + "Input_Bag": args.inputBagRID, + "Notes": args.notes, + "Execution_Status": args.status, + "Execution_Status_Detail": args.statusDetail.replace('\\n','\n') + } + entities = run_table.update([run_data]) + rid = args.update + + + print(rid) + + +if __name__ == '__main__': + args = get_args() + cli = BaseCLI("Custom RNASeq query", None, 1) + cli.remove_options(["--config-file"]) + host = args.host + credentials = {"cookie": args.cookie} + main(host, 2, credentials) \ No newline at end of file diff --git a/workflow/scripts/upload_input_bag.py b/workflow/scripts/upload_input_bag.py new file mode 100644 index 0000000000000000000000000000000000000000..ab4d338074ffec3098667dcf2817041e01ded8bd --- /dev/null +++ b/workflow/scripts/upload_input_bag.py @@ -0,0 +1,46 @@ +import argparse +from deriva.core import ErmrestCatalog, get_credential, BaseCLI +import sys +import csv +from datetime import datetime + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-f', '--file', help="file name", required=True) + parser.add_argument('-l', '--loc', help="datahub location", required=True) + parser.add_argument('-s', '--md5', help="md5 sum", required=True) + parser.add_argument('-b', '--bytes', help="size in bytes", required=True) + parser.add_argument('-n', '--notes', help="notes", default="", required=False) + parser.add_argument('-o', '--host', help="datahub host", required=True) + parser.add_argument('-c', '--cookie', help="cookie token", required=True) + args = parser.parse_args() + return args + +def main(hostname, catalog_number, credential): + catalog = ErmrestCatalog('https', hostname, catalog_number, credential) + pb = catalog.getPathBuilder() + inputBag_table = pb.RNASeq.Input_Bag + + inputBag_data = { + "File_Name": args.file, + "File_URL": args.loc, + "File_MD5": args.md5, + "File_Bytes": args.bytes, + "File_Creation_Time": datetime.now().replace(microsecond=0).isoformat(), + "Notes": args.notes, + "Bag_Type": "Replicate_Input_Seq" + } + + entities = inputBag_table.insert([inputBag_data]) + rid = entities[0]["RID"] + + print(rid) + + +if __name__ == '__main__': + args = get_args() + cli = BaseCLI("Custom RNASeq query", None, 1) + cli.remove_options(["--config-file"]) + host = args.host + credential = {"cookie": args.cookie} + main(host, 2, credential) \ No newline at end of file 
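The upload_input_bag.py, upload_execution_run.py, upload_output_bag.py and upload_qc.py helpers added in this patch all repeat the same deriva-py datapath pattern: open an ErmrestCatalog with a cookie credential, walk to an RNASeq table through getPathBuilder(), then either insert a new row or update an existing RID. Below is a minimal sketch of how that shared logic could be factored out, using only the calls these scripts already make; the insert_or_update name and its signature are hypothetical, not part of the pipeline.

from deriva.core import ErmrestCatalog

def insert_or_update(host, cookie, table_name, row, update_rid="F", catalog_number=2):
    # open the catalog with the same cookie-style credential the upload scripts use
    catalog = ErmrestCatalog('https', host, catalog_number, {"cookie": cookie})
    pb = catalog.getPathBuilder()
    # e.g. Input_Bag, Output_Bag, Execution_Run, mRNA_QC
    table = getattr(pb.RNASeq, table_name)
    if update_rid == "F":
        # new record: the datahub assigns the RID
        entities = table.insert([row])
        return entities[0]["RID"]
    # existing record: update in place, keyed on RID
    row["RID"] = update_rid
    table.update([row])
    return update_rid

# hypothetical usage, mirroring upload_input_bag.py:
# rid = insert_or_update(host, cookie, "Input_Bag", inputBag_data)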
diff --git a/workflow/scripts/upload_output_bag.py b/workflow/scripts/upload_output_bag.py new file mode 100644 index 0000000000000000000000000000000000000000..397658c0ccef21af86e529a040a6dcb2ac506833 --- /dev/null +++ b/workflow/scripts/upload_output_bag.py @@ -0,0 +1,48 @@ +import argparse +from deriva.core import ErmrestCatalog, get_credential, BaseCLI +import sys +import csv +from datetime import datetime + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-e', '--executionRunRID', help="exection run RID", required=True) + parser.add_argument('-f', '--file', help="file name", required=True) + parser.add_argument('-l', '--loc', help="datahub location", required=True) + parser.add_argument('-s', '--md5', help="md5 sum", required=True) + parser.add_argument('-b', '--bytes', help="size in bytes", required=True) + parser.add_argument('-n', '--notes', help="notes", default="", required=False) + parser.add_argument('-o', '--host', help="datahub host", required=True) + parser.add_argument('-c', '--cookie', help="cookie token", required=True) + args = parser.parse_args() + return args + +def main(hostname, catalog_number, credential): + catalog = ErmrestCatalog('https', hostname, catalog_number, credential) + pb = catalog.getPathBuilder() + outputBag_table = pb.RNASeq.Output_Bag + + outputBag_data = { + "Execution_Run": args.executionRunRID, + "File_Name": args.file, + "File_URL": args.loc, + "File_MD5": args.md5, + "File_Bytes": args.bytes, + "File_Creation_Time": datetime.now().replace(microsecond=0).isoformat(), + "Notes": args.notes, + "Bag_Type": "mRNA_Replicate_Analysis" + } + + entities = outputBag_table.insert([outputBag_data]) + rid = entities[0]["RID"] + + print(rid) + + +if __name__ == '__main__': + args = get_args() + cli = BaseCLI("Custom RNASeq query", None, 1) + cli.remove_options(["--config-file"]) + host = args.host + credential = {"cookie": args.cookie} + main(host, 2, credential) \ No newline at end of file diff --git a/workflow/scripts/upload_qc.py b/workflow/scripts/upload_qc.py new file mode 100644 index 0000000000000000000000000000000000000000..b842a7a36cc47fa4f599ab086a5c1b3dbece437a --- /dev/null +++ b/workflow/scripts/upload_qc.py @@ -0,0 +1,68 @@ +import argparse +from deriva.core import ErmrestCatalog, get_credential, BaseCLI +import sys +import csv + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-r', '--repRID', help="replicate RID", required=True) + parser.add_argument('-e', '--executionRunRID', help="exection run RID", required=True) + parser.add_argument('-p', '--ends', help="single/paired ends", required=True) + parser.add_argument('-s', '--stranded', help="stranded?", required=True) + parser.add_argument('-l', '--length', help="median read length", required=True) + parser.add_argument('-w', '--rawCount', help="raw count", required=True) + parser.add_argument('-f', '--assignedCount', help="final assigned count", required=True) + parser.add_argument('-t', '--tin', help="median TIN", required=True) + parser.add_argument('-n', '--notes', help="notes", default="", required=False) + parser.add_argument('-o', '--host', help="datahub host", required=True) + parser.add_argument('-c', '--cookie', help="cookie token", required=True) + parser.add_argument('-u', '--update', help="update?", default="F", required=True) + args = parser.parse_args() + return args + +def main(hostname, catalog_number, credential): + catalog = ErmrestCatalog('https', hostname, catalog_number, credential) + pb = catalog.getPathBuilder() + 
run_table = pb.RNASeq.mRNA_QC + + if args.update == "F": + run_data = { + "Execution_Run": args.executionRunRID, + "Replicate": args.repRID, + "Paired_End": args.ends, + "Strandedness": args.stranded, + "Median_Read_Length": args.length, + "Raw_Count": args.rawCount, + "Final_Count": args.assignedCount, + "Median_TIN": args.tin, + "Notes": args.notes + } + entities = run_table.insert([run_data]) + rid = entities[0]["RID"] + else: + run_data = { + "RID": args.update, + "Execution_Run": args.executionRunRID, + "Replicate": args.repRID, + "Paired_End": args.ends, + "Strandedness": args.stranded, + "Median_Read_Length": args.length, + "Raw_Count": args.rawCount, + "Final_Count": args.assignedCount, + "Median_TIN": args.tin, + "Notes": args.notes + } + entities = run_table.update([run_data]) + rid = args.update + + + print(rid) + + +if __name__ == '__main__': + args = get_args() + cli = BaseCLI("Custom RNASeq query", None, 1) + cli.remove_options(["--config-file"]) + host = args.host + credentials = {"cookie": args.cookie} + main(host, 2, credentials) \ No newline at end of file diff --git a/workflow/scripts/utils.py b/workflow/scripts/utils.py index 5e4478e3b1cef96e9b4c05033f75122f87aad06e..548b84c0642ee1479c1ac2e968ba1cd1aa9ed660 100644 --- a/workflow/scripts/utils.py +++ b/workflow/scripts/utils.py @@ -1,14 +1,5 @@ #!/usr/bin/env python3 -# -# * -------------------------------------------------------------------------- -# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md) -# * -------------------------------------------------------------------------- -# - -'''General utilities.''' - - import shlex import logging import subprocess @@ -32,7 +23,8 @@ def run_pipe(steps, outfile=None): if n == first_step_n: if n == last_step_n and outfile: # one-step pipeline with outfile with open(outfile, 'w') as fh: - print("one step shlex: %s to file: %s" % (shlex.split(step), outfile)) + print("one step shlex: %s to file: %s" % + (shlex.split(step), outfile)) p = Popen(shlex.split(step), stdout=fh) break print("first step shlex to stdout: %s" % (shlex.split(step))) @@ -40,12 +32,14 @@ def run_pipe(steps, outfile=None): p = Popen(shlex.split(step), stdout=PIPE) elif n == last_step_n and outfile: # only treat the last step specially if you're sending stdout to a file with open(outfile, 'w') as fh: - print("last step shlex: %s to file: %s" % (shlex.split(step), outfile)) + print("last step shlex: %s to file: %s" % + (shlex.split(step), outfile)) p_last = Popen(shlex.split(step), stdin=p.stdout, stdout=fh) p.stdout.close() p = p_last else: # handles intermediate steps and, in the case of a pipe to stdout, the last step - print("intermediate step %d shlex to stdout: %s" % (n, shlex.split(step))) + print("intermediate step %d shlex to stdout: %s" % + (n, shlex.split(step))) p_next = Popen(shlex.split(step), stdin=p.stdout, stdout=PIPE) p.stdout.close() p = p_next @@ -54,7 +48,8 @@ def run_pipe(steps, outfile=None): def block_on(command): - process = subprocess.Popen(shlex.split(command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE) + process = subprocess.Popen(shlex.split( + command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE) for line in iter(process.stdout.readline, b''): sys.stdout.write(line.decode('utf-8')) process.communicate() @@ -77,7 +72,7 @@ def count_lines(filename): "compress", "bzip2", "gzip" - ] + ] mime_type = mimetypes.guess_type(filename)[1] if mime_type in compressed_mimetypes: catcommand = 'gzip -dc' @@ -86,7 +81,7 @@ def 
count_lines(filename): out, err = run_pipe([ '%s %s' % (catcommand, filename), 'wc -l' - ]) + ]) return int(out) diff --git a/workflow/tests/test_alignReads.py b/workflow/tests/test_alignReads.py index eae8780f5641404ac9caabe32089369a3b0b1ea2..11f0f3d0d09236a3f3494a2c851fb4294f2a1323 100644 --- a/workflow/tests/test_alignReads.py +++ b/workflow/tests/test_alignReads.py @@ -6,18 +6,24 @@ import os import utils data_output_path = os.path.dirname(os.path.abspath(__file__)) + \ - '/../../' + '/../../' @pytest.mark.alignData def test_alignData_se(): - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.unal.gz')) - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.bam')) - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.bam.bai')) + assert os.path.exists(os.path.join( + data_output_path, 'Q-Y5F6_1M.se.unal.gz')) + assert os.path.exists(os.path.join( + data_output_path, 'Q-Y5F6_1M.se.sorted.bam')) + assert os.path.exists(os.path.join( + data_output_path, 'Q-Y5F6_1M.se.sorted.bam.bai')) @pytest.mark.alignData def test_alignData_pe(): - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.pe.unal.gz')) - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.pe.sorted.bam')) - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.pe.sorted.bam.bai')) + assert os.path.exists(os.path.join( + data_output_path, 'Q-Y5F6_1M.pe.unal.gz')) + assert os.path.exists(os.path.join( + data_output_path, 'Q-Y5F6_1M.pe.sorted.bam')) + assert os.path.exists(os.path.join( + data_output_path, 'Q-Y5F6_1M.pe.sorted.bam.bai')) diff --git a/workflow/tests/test_consistency.py b/workflow/tests/test_consistency.py index 073b12826b798ac94d16fda4291dfba2c1a42203..aa04f19bd23e3749532b87b598179b8f98b2218b 100644 --- a/workflow/tests/test_consistency.py +++ b/workflow/tests/test_consistency.py @@ -4,27 +4,39 @@ import pytest import pandas as pd from io import StringIO import os +import json test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ - '/../../' + '/../../' + @pytest.mark.consistencySE def test_consistencySE(): - assert os.path.exists(os.path.join(test_output_path, 'SE_multiqc_data.json')) - assert readAssigned("assignedSE.txt","assignedExpectSE.txt") + assert os.path.exists(os.path.join( + test_output_path, 'SE_multiqc_data.json')) + + with open(os.path.join( + test_output_path, 'SE_multiqc_data.json')) as f: + assigned_reads_json = json.load(f) + assigned_reads = assigned_reads_json['report_general_stats_data'][4]['16-1ZX4_sorted']['Assigned'] + baseline = 7746121 + baseline_hi = baseline+(baseline*0.05) + baseline_lo = baseline-(baseline*0.05) + assert (assigned_reads >= baseline_lo) + assert (assigned_reads <= baseline_hi) + @pytest.mark.consistencyPE def test_consistencyPE(): - assert os.path.exists(os.path.join(test_output_path, 'PE_multiqc_data.json')) - assert readAssigned("assignedPE.txt","assignedExpectPE.txt") - -def readAssigned(fileAssigned,fileExpectAssigned): - data = False - assigned = open(fileAssigned, "r") - expect = open(fileExpectAssigned, "r") - lineAssigned = assigned.readline() - lineExpect = expect.readline() - if int(lineAssigned.strip()) < (int(lineExpect.strip())+(int(lineExpect.strip())*0.00001)) and int(lineAssigned.strip()) > (int(lineExpect.strip())-(int(lineExpect.strip())*0.00001)): - data = True - - return data + assert os.path.exists(os.path.join( + test_output_path, 'PE_multiqc_data.json')) + + with open(os.path.join( + test_output_path, 'PE_multiqc_data.json')) as f: + 
assigned_reads_json = json.load(f) + assigned_reads = assigned_reads_json['report_general_stats_data'][4]['Q-Y5JA_sorted']['Assigned'] + baseline = 2596053 + baseline_hi = baseline+(baseline*0.05) + baseline_lo = baseline-(baseline*0.05) + assert (assigned_reads >= baseline_lo) + assert (assigned_reads <= baseline_hi) diff --git a/workflow/tests/test_dataQC.py b/workflow/tests/test_dataQC.py index e77d4680fd8eac61c3a9b9a8fd175136a61244b9..55df66deaeda6dac18ea26455dd1e3948ceb28ba 100644 --- a/workflow/tests/test_dataQC.py +++ b/workflow/tests/test_dataQC.py @@ -6,12 +6,16 @@ from io import StringIO import os test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ - '/../../' + '/../../' + @pytest.mark.dataQC def test_dataQC(): - assert os.path.exists(os.path.join(test_output_path, 'Q-Y5F6_1M.se.sorted.deduped.tin.xls')) - assert countLines(os.path.join(test_output_path, 'Q-Y5F6_1M.se.sorted.deduped.tin.xls')) + assert os.path.exists(os.path.join( + test_output_path, 'Q-Y5F6_1M.se.sorted.deduped.tin.xls')) + assert countLines(os.path.join(test_output_path, + 'Q-Y5F6_1M.se.sorted.deduped.tin.xls')) + def countLines(fileName): data = False diff --git a/workflow/tests/test_dedupReads.py b/workflow/tests/test_dedupReads.py index 49cf420e2f0e2de923ae580c51dcdf42549f0713..89fc2b10fa4db847ccc16d5cce664bf551b29ee3 100644 --- a/workflow/tests/test_dedupReads.py +++ b/workflow/tests/test_dedupReads.py @@ -6,16 +6,24 @@ import os import utils data_output_path = os.path.dirname(os.path.abspath(__file__)) + \ - '/../../' + '/../../' @pytest.mark.dedupData def test_dedupData(): - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.bam')) - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.bam.bai')) - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr8.bam')) - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr8.bam.bai')) - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr4.bam')) - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr4.bam.bai')) - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chrY.bam')) - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chrY.bam.bai')) + assert os.path.exists(os.path.join( + data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.bam')) + assert os.path.exists(os.path.join( + data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.bam.bai')) + assert os.path.exists(os.path.join( + data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr8.bam')) + assert os.path.exists(os.path.join( + data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr8.bam.bai')) + assert os.path.exists(os.path.join( + data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr4.bam')) + assert os.path.exists(os.path.join( + data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr4.bam.bai')) + assert os.path.exists(os.path.join( + data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chrY.bam')) + assert os.path.exists(os.path.join( + data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chrY.bam.bai')) diff --git a/workflow/tests/test_downsampleData.py b/workflow/tests/test_downsampleData.py index fd42c49e169e387dd9662903b20866c40aec8907..6d98ad656b0bcb0a3e8aee5507149cdcf3cec5f0 100644 --- a/workflow/tests/test_downsampleData.py +++ b/workflow/tests/test_downsampleData.py @@ -6,8 +6,9 @@ from io import StringIO import os test_output_path = 
-    '/../../'
+    '/../../'
+

 @pytest.mark.downsampleData
 def test_downsampleData():
-    assert os.path.exists(os.path.join(test_output_path, 'sampled.1.fq'))
\ No newline at end of file
+    assert os.path.exists(os.path.join(test_output_path, 'sampled.1.fq'))
diff --git a/workflow/tests/test_fastqc.py b/workflow/tests/test_fastqc.py
index 89303fe78e081a26fd6a8ea633997dffb8f920a6..07e76108fbfc92f945060d8e5d1e1ea8f74e6a4a 100644
--- a/workflow/tests/test_fastqc.py
+++ b/workflow/tests/test_fastqc.py
@@ -6,8 +6,10 @@ from io import StringIO
 import os

 test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
-    '/../../'
+    '/../../'
+

 @pytest.mark.fastqc
 def test_fastqc():
-    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5F6_1M.R1_fastqc.zip'))
+    assert os.path.exists(os.path.join(
+        test_output_path, 'Q-Y5F6_1M.R1_fastqc.zip'))
diff --git a/workflow/tests/test_getBag.py b/workflow/tests/test_getBag.py
index 1c63c9d95ac33aceeaa965852ede7bfc5e86bdc7..23bfc0ea50c260a2f5c4cbf62321c066b5743ac2 100644
--- a/workflow/tests/test_getBag.py
+++ b/workflow/tests/test_getBag.py
@@ -6,8 +6,10 @@ from io import StringIO
 import os

 test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
-    '/../../'
+    '/../../'
+

 @pytest.mark.getBag
 def test_getBag():
-    assert os.path.exists(os.path.join(test_output_path, 'Replicate_Q-Y5F6.zip'))
+    assert os.path.exists(os.path.join(
+        test_output_path, 'Q-Y5F6_inputBag.zip'))
diff --git a/workflow/tests/test_getData.py b/workflow/tests/test_getData.py
index a14be93ea8103aadf44aca3156ee28036cb6113e..596a120abe904eac8f3e0ad871c9f8c03a6cba5f 100644
--- a/workflow/tests/test_getData.py
+++ b/workflow/tests/test_getData.py
@@ -6,9 +6,12 @@ from io import StringIO
 import os

 test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
-    '/../../'
+    '/../../'
+

 @pytest.mark.getData
 def test_getData():
-    assert os.path.exists(os.path.join(test_output_path, 'Replicate_Q-Y5F6/bagit.txt'))
-    assert os.path.exists(os.path.join(test_output_path, 'Replicate_Q-Y5F6/data/assets/Study/Q-Y4GY/Experiment/Q-Y4DP/Replicate/Q-Y5F6/mMARIS_Six2-#3.gene.rpkm.txt'))
+    assert os.path.exists(os.path.join(
+        test_output_path, 'Q-Y5F6_inputBag/bagit.txt'))
+    assert os.path.exists(os.path.join(
+        test_output_path, 'Q-Y5F6_inputBag/data/assets/Study/Q-Y4GY/Experiment/Q-Y4DP/Replicate/Q-Y5F6/mMARIS_Six2-#3.gene.rpkm.txt'))
diff --git a/workflow/tests/test_inferMetadata.py b/workflow/tests/test_inferMetadata.py
index 518664ced5ab4dd4e713dede9c58b9d87594799a..7485163631e2604ac2d5477a5c27b2fc9b235b44 100644
--- a/workflow/tests/test_inferMetadata.py
+++ b/workflow/tests/test_inferMetadata.py
@@ -6,9 +6,10 @@ from io import StringIO
 import os

 test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
-    '/../../'
+    '/../../'
+

 @pytest.mark.inferMetadata
 def test_inferMetadata():
-    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5F6_1M.se.inferMetadata.log'))
-
+    assert os.path.exists(os.path.join(
+        test_output_path, 'Q-Y5F6_1M.se.inferMetadata.log'))
diff --git a/workflow/tests/test_makeBigWig.py b/workflow/tests/test_makeBigWig.py
index 9292ac64714017fde8371927cee616374a457b3a..d8f62f5edfb3b57868d0b4b18ed6a0deb6bd651e 100644
--- a/workflow/tests/test_makeBigWig.py
+++ b/workflow/tests/test_makeBigWig.py
@@ -6,7 +6,7 @@ import os
 import utils

 data_output_path = os.path.dirname(os.path.abspath(__file__)) + \
-    '/../../'
+    '/../../'


 @pytest.mark.makeBigWig
diff --git a/workflow/tests/test_makeFeatureCounts.py b/workflow/tests/test_makeFeatureCounts.py
index d33527a9c7bffcaa9919639b72842cc8cf63b14f..e14793511b226a6c82d502ce2f84867c087bc41a 100644
--- a/workflow/tests/test_makeFeatureCounts.py
+++ b/workflow/tests/test_makeFeatureCounts.py
@@ -6,11 +6,14 @@ import os
 import utils

 data_output_path = os.path.dirname(os.path.abspath(__file__)) + \
-    '/../../'
+    '/../../'


 @pytest.mark.makeFeatureCounts
 def test_makeFeatureCounts():
-    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.countData'))
-    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.countTable.csv'))
-    assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.tpmTable.csv'))
+    assert os.path.exists(os.path.join(
+        data_output_path, 'Q-Y5F6_1M.se_countData'))
+    assert os.path.exists(os.path.join(
+        data_output_path, 'Q-Y5F6_1M.se.countTable.csv'))
+    assert os.path.exists(os.path.join(
+        data_output_path, 'Q-Y5F6_1M.se_tpmTable.csv'))
diff --git a/workflow/tests/test_outputBag.py b/workflow/tests/test_outputBag.py
index 4132d834996e5557024dbbf587d4aca41594bf9e..c73e6474efdb9093b0cc66c00b220426be707690 100644
--- a/workflow/tests/test_outputBag.py
+++ b/workflow/tests/test_outputBag.py
@@ -6,7 +6,8 @@ from io import StringIO
 import os

 test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
-    '/../../'
+    '/../../'
+

 @pytest.mark.outputBag
 def test_outputBag():
diff --git a/workflow/tests/test_parseMetadata.py b/workflow/tests/test_parseMetadata.py
index 59677bbba7d40058bdeb78ccceeeeddba4565a14..5a14fcd885b79d944e46de5d936d17fc941def7b 100644
--- a/workflow/tests/test_parseMetadata.py
+++ b/workflow/tests/test_parseMetadata.py
@@ -6,18 +6,20 @@ from io import StringIO
 import os

 test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
-    '/../../'
+    '/../../'
+

 @pytest.mark.parseMetadata
 def test_parseMetadata():
     assert os.path.exists(os.path.join(test_output_path, 'design.csv'))
     assert readLine(os.path.join(test_output_path, 'design.csv'))

+
 def readLine(fileName):
     data = False
     file = open(fileName, "r")
     line = file.readline()
-    if line.strip() == "uk,se,unstranded,no,Homo sapiens,75,Experiment_RID,Study_RID,Replicate_RID":
+    if line.strip() == "uk,uk,se,unstranded,no,Homo sapiens,75,Experiment_RID,Study_RID,Replicate_RID":
         data = True

     return data
diff --git a/workflow/tests/test_trimData.py b/workflow/tests/test_trimData.py
index ba0eeda481262647abc9e4a8bf362515ac0dc0e7..a0938e756715fb30254e5c72fee4cd38bffec330 100644
--- a/workflow/tests/test_trimData.py
+++ b/workflow/tests/test_trimData.py
@@ -6,14 +6,18 @@ from io import StringIO
 import os

 test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
-    '/../../'
+    '/../../'
+

 @pytest.mark.trimData
 def test_trimData_se():
-    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5F6_1M.se_trimmed.fq.gz'))
+    assert os.path.exists(os.path.join(
+        test_output_path, 'Q-Y5F6_1M.se_trimmed.fq.gz'))


 @pytest.mark.trimData
 def test_trimData_pe():
-    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5F6_1M.pe_R1_val_1.fq.gz'))
-    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5F6_1M.pe_R2_val_2.fq.gz'))
+    assert os.path.exists(os.path.join(
+        test_output_path, 'Q-Y5F6_1M.pe_val_1.fq.gz'))
+    assert os.path.exists(os.path.join(
+        test_output_path, 'Q-Y5F6_1M.pe_val_2.fq.gz'))
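
For reviewers who want to reproduce the new consistency criterion outside of CI, the sketch below restates the pattern the updated test_consistency.py relies on: load the MultiQC data JSON, read the featureCounts 'Assigned' total for one sample, and require it to fall within +/-5% of a recorded baseline. This is an illustrative, stand-alone sketch rather than code from the patch; the helper name, file path, sample key, and baseline value are placeholder assumptions copied from the single-end example above.

# Illustrative sketch only (not part of the patch). Names and values below are
# placeholders taken from the SE consistency test in this diff.
import json
import os


def assigned_reads_within_tolerance(multiqc_json, sample_key, baseline, tolerance=0.05):
    """Return True if the sample's 'Assigned' count is within +/-tolerance of baseline."""
    with open(multiqc_json) as f:
        stats = json.load(f)
    # MultiQC stores its general-stats tables as a list of per-module dicts;
    # index 4 is the table the tests above read the 'Assigned' metric from.
    assigned = stats['report_general_stats_data'][4][sample_key]['Assigned']
    return baseline * (1 - tolerance) <= assigned <= baseline * (1 + tolerance)


def test_consistency_example():
    # Hypothetical usage mirroring test_consistencySE; swap in the PE file,
    # sample key, and baseline to mirror test_consistencyPE.
    multiqc_json = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), '..', '..', 'SE_multiqc_data.json')
    assert assigned_reads_within_tolerance(multiqc_json, '16-1ZX4_sorted', 7746121)

The +/-5% band is wider than the near-exact comparison the removed readAssigned() helper enforced (0.001%), so small run-to-run variation in alignment and counting no longer fails the job while genuine regressions in assigned-read counts still do.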