diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 78c3478cf9183a3ca440bac36a059a3e90e0c23d..868ac0c5d1f3f473422602f4d1efe4f22613abd0 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -7,6 +7,11 @@ before_script: - mkdir -p ~/.deriva - mkdir -p ~/.bdbag +variables: + refMoVersion: "38.p6.vM22" + refHuVersion: "38.p12.v31" + refERCCVersion: "92" + stages: - badges - deploy @@ -47,8 +52,8 @@ getBag: - merge_requests script: - ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json - - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' deriva-download-cli --version > version_deriva.txt - - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' deriva-download-cli dev.gudmap.org --catalog 2 ./workflow/conf/replicate_export_config.json . rid=Q-Y5F6 + - singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-download-cli --version > version_deriva.txt + - singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-download-cli staging.gudmap.org --catalog 2 ./workflow/conf/Replicate_For_Input_Bag.json . rid=Q-Y5F6 - pytest -m getBag artifacts: name: "$CI_JOB_NAME" @@ -65,10 +70,10 @@ getData: except: - merge_requests script: - - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bdbag --version > version_bdbag.txt + - singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' bdbag --version > version_bdbag.txt - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt - - unzip ./test_data/bag/staging/Replicate_Q-Y5F6.zip - - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bash ./workflow/scripts/bdbagFetch.sh Replicate_Q-Y5F6 Replicate_Q-Y5F6 TEST + - unzip ./test_data/bag/Q-Y5F6_inputBag_xxxxxxxx.zip + - singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' bash ./workflow/scripts/bdbag_fetch.sh Q-Y5F6_inputBag Q-Y5F6 TEST - pytest -m getData artifacts: name: "$CI_JOB_NAME" @@ -85,16 +90,16 @@ parseMetadata: except: - merge_requests script: - - singularity run 'docker://bicf/python3:2.0.1_indev' python3 --version > version_python.txt - - rep=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p repRID) - - exp=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p expRID) - - study=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p studyRID) - - endsMeta=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsMeta) - - endsManual=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsManual) - - stranded=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p stranded) - - spike=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p spike) - - species=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p species) - - readLength=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m 
"./test_data/meta/metaTest.stageNew.csv" -p readLength) + - singularity run 'docker://gudmaprbk/python3:1.0.0' python3 --version > version_python.txt + - rep=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p repRID) + - exp=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p expRID) + - study=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p studyRID) + - endsMeta=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsMeta) + - endsManual=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsManual) + - stranded=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p stranded) + - spike=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p spike) + - species=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p species) + - readLength=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p readLength) - echo -e "${endsMeta},${endsManual},${stranded},${spike},${species},${readLength},${exp},${study},${rep}" > design.csv - pytest -m parseMetadata artifacts: @@ -112,13 +117,13 @@ inferMetadata: except: - merge_requests script: - - singularity run 'docker://bicf/rseqc3.0:2.0.1_indev' infer_experiment.py --version > version_rseqc.txt + - singularity run 'docker://gudmaprbk/rseqc4.0.0:1.0.0' infer_experiment.py --version > version_rseqc.txt - > align=$(echo $(grep "Overall alignment rate" ./test_data/meta/Q-Y5F6_1M.se.alignSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) && if [[ ${align} == "" ]]; then exit 1; fi - > - singularity run 'docker://bicf/rseqc3.0:2.0.1_indev' infer_experiment.py -r "/project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed" -i "./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam" 1>> Q-Y5F6_1M.se.inferMetadata.log && - ended=`singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/inferMeta.sh endness Q-Y5F6_1M.se.inferMetadata.log` && + singularity run 'docker://gudmaprbk/rseqc4.0.0:1.0.0' infer_experiment.py -r "/project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed" -i "./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam" 1>> Q-Y5F6_1M.se.inferMetadata.log && + ended=`singularity run 'gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/infer_meta.sh endness Q-Y5F6_1M.se.inferMetadata.log` && if [[ ${ended} == "" ]]; then exit 1; fi - pytest -m inferMetadata artifacts: @@ -136,9 +141,9 @@ trimData: except: - merge_requests script: - - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --version > version_trimgalore.txt - - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --basename Q-Y5F6_1M.se ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz - - singularity run 'docker://bicf/trimgalore:1.1' 
trim_galore --gzip -q 25 --length 35 --paired --basename Q-Y5F6_1M.pe ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz + - singularity run 'docker://gudmaprbk/trimgalore0.6.5:1.0.0' trim_galore --version > version_trimgalore.txt + - singularity run 'docker://gudmaprbk/trimgalore0.6.5:1.0.0' trim_galore --gzip -q 25 --length 35 --basename Q-Y5F6_1M.se ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz + - singularity run 'docker://gudmaprbk/trimgalore0.6.5:1.0.0' trim_galore --gzip -q 25 --length 35 --paired --basename Q-Y5F6_1M.pe ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz - readLengthSE=$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}') - readLengthPE=$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}') - pytest -m trimData @@ -157,10 +162,9 @@ downsampleData: except: - merge_requests script: - - singularity run 'docker://bicf/seqtk:2.0.1_indev' seqtk sample -s100 ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz 1000 1> sampled.1.fq + - singularity run 'docker://gudmaprbk/seqtk1.3:1.0.0' seqtk sample -s100 ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz 1000 1> sampled.1.fq - pytest -m downsampleData - alignData: stage: unit only: @@ -169,16 +173,16 @@ alignData: except: - merge_requests script: - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' hisat2 --version > version_hisat2.txt - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools --version > version_samtools.txt - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.se.unal.gz -S Q-Y5F6_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz --summary-file Q-Y5F6_1M.se.alignSummary.txt --new-summary - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.bam - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.sorted.bam.bai - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.pe.unal.gz -S Q-Y5F6_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./test_data/fastq/small/Q-Y5F6_1M.pe_R1_val_1.fq.gz -2 ./test_data/fastq/small/Q-Y5F6_1M.pe_R2_val_2.fq.gz --summary-file Q-Y5F6_1M.pe.alignSummary.txt --new-summary - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.pe.bam Q-Y5F6_1M.pe.sam - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.bam - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools index -@ 20 -b Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.sorted.bam.bai + - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' hisat2 --version > version_hisat2.txt + - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools --version > version_samtools.txt + - singularity run 
'docker://gudmaprbk/hisat2.2.1:1.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.se.unal.gz -S Q-Y5F6_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz --summary-file Q-Y5F6_1M.se.alignSummary.txt --new-summary + - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam + - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.bam + - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.sorted.bam.bai + - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.pe.unal.gz -S Q-Y5F6_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./test_data/fastq/small/Q-Y5F6_1M.pe_val_1.fq.gz -2 ./test_data/fastq/small/Q-Y5F6_1M.pe_val_2.fq.gz --summary-file Q-Y5F6_1M.pe.alignSummary.txt --new-summary + - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.pe.bam Q-Y5F6_1M.pe.sam + - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.bam + - singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools index -@ 20 -b Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.sorted.bam.bai - pytest -m alignData artifacts: name: "$CI_JOB_NAME" @@ -188,7 +192,6 @@ alignData: - version_samtools.txt expire_in: 7 days - dedupData: stage: unit only: @@ -197,15 +200,15 @@ dedupData: except: - merge_requests script: - - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools --version > version_samtools.txt - - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates --version 2> version_markdups.txt& - - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5F6_1M.se.sorted.bam O=Q-Y5F6_1M.se.deduped.bam M=Q-Y5F6_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true - - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam ./test_data/bam/small/Q-Y5F6_1M.se.deduped.bam - - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools index -@ 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.sorted.deduped.bam.bai + - singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' samtools --version > version_samtools.txt + - singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates --version 2> version_markdups.txt& + - singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5F6_1M.se.sorted.bam O=Q-Y5F6_1M.se.deduped.bam M=Q-Y5F6_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true + - singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam ./test_data/bam/small/Q-Y5F6_1M.se.deduped.bam + - singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' samtools index -@ 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.sorted.deduped.bam.bai - > for i in {"chr8","chr4","chrY"}; do echo "samtools view -b Q-Y5F6_1M.se.sorted.deduped.bam ${i} > 
Q-Y5F6_1M.se.sorted.deduped.${i}.bam; samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.deduped.${i}.bam Q-Y5F6_1M.se.sorted.deduped.${i}.bam.bai;"; - done | singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' parallel -j 20 -k + done | singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' parallel -j 20 -k - pytest -m dedupData artifacts: name: "$CI_JOB_NAME" @@ -225,12 +228,12 @@ countData: script: - ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/geneID.tsv - ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/Entrez.tsv - - singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se.countData -s 1 -R SAM --primary --ignoreDup ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam - - singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count ./test_data/counts/small/Q-Y5F6_1M.se.countData - - singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se + - singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se_countData -s 1 -R SAM --primary --ignoreDup ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam + - singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' Rscript ./workflow/scripts/calculateTPM.R --count ./test_data/counts/small/Q-Y5F6_1M.se_countData + - singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' Rscript ./workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se - assignedReads=$(grep -m 1 'Assigned' *.summary | grep -oe '\([0-9.]*\)') - - singularity run 'docker://bicf/subread2:2.0.0' featureCounts -v &> version_featurecounts.txt - - singularity run 'docker://bicf/subread2:2.0.0' R --version > version_r.txt + - singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' featureCounts -v &> version_featurecounts.txt + - singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' R --version > version_r.txt - pytest -m makeFeatureCounts artifacts: name: "$CI_JOB_NAME" @@ -248,8 +251,8 @@ makeBigWig: except: - merge_requests script: - - singularity run 'docker://bicf/deeptools3.3:2.0.1_indev' deeptools --version > version_deeptools.txt - - singularity run 'docker://bicf/deeptools3.3:2.0.1_indev' bamCoverage -p 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -o Q-Y5F6_1M.se.bw + - singularity run 'docker://gudmaprbk/deeptools3.5.0:1.0.0' deeptools --version > version_deeptools.txt + - singularity run 'docker://gudmaprbk/deeptools3.5.0:1.0.0' bamCoverage -p 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -o Q-Y5F6_1M.se.bw - pytest -m makeBigWig artifacts: name: "$CI_JOB_NAME" @@ -266,8 +269,8 @@ fastqc: except: - merge_requests script: - - singularity run 'docker://bicf/fastqc:2.0.1_indev' fastqc --version > version_fastqc.txt - - singularity run 'docker://bicf/fastqc:2.0.1_indev' fastqc ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o . + - singularity run 'docker://gudmaprbk/fastqc0.11.9:1.0.0' fastqc --version > version_fastqc.txt + - singularity run 'docker://gudmaprbk/fastqc0.11.9:1.0.0' fastqc ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o . 
- pytest -m fastqc artifacts: name: "$CI_JOB_NAME" @@ -286,11 +289,85 @@ dataQC: - merge_requests script: - echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > Q-Y5F6_1M.se.sorted.deduped.tin.xls - - for i in {"chr8","chr4","chrY"}; do - echo "tin.py -i ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.${i}.bam -r /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed; cat Q-Y5F6_1M.se.sorted.deduped.${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \"\\t${i}\\t\";"; done | singularity run 'docker://bicf/rseqc3.0:2.0.1_indev' parallel -j 20 -k >> Q-Y5F6_1M.se.sorted.deduped.tin.xls + - > + for i in {"chr8","chr4","chrY"}; do + echo "tin.py -i ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.${i}.bam -r /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed; cat Q-Y5F6_1M.se.sorted.deduped.${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \"\\t${i}\\t\";" + done | singularity run 'docker://gudmaprbk/rseqc4.0.0:1.0.0' parallel -j 20 -k >> Q-Y5F6_1M.se.sorted.deduped.tin.xls - pytest -m dataQC -outputBag: +uploadInputBag: + stage: unit + only: + - push + - tags + except: + - merge_requests + script: + - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json + - echo THIS IS A TEST FILE > test.txt + - > + md5=$(md5sum ./test.txt | awk '{ print $1 }') && + size=$(wc -c < ./test.txt) && + exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Input_Bag/File_MD5=${md5}) && + if [ "${exist}" == "[]" ]; then + cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') && + cookie=${cookie:11:-1} && + loc=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host staging.gudmap.org put ./test.txt /hatrac/resources/rnaseq/pipeline/input_bag/TEST/test.txt --parents) && + rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/upload_input_bag.py -f test.txt -l ${loc} -s ${md5} -b ${size} -n 'This is a test input bag' -o staging.gudmap.org -c ${cookie}) && + echo ${rid} test input bag created + else + rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT') && + rid=${rid:8:-6} && + echo ${rid} test input bag already exists + fi + +uploadExecutionRun: + stage: unit + only: + - push + - tags + except: + - merge_requests + script: + - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json + - > + exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Execution_Run/Replicate=17-BTFJ) && + cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') && + cookie=${cookie:11:-1} && + if [ "${exist}" == "[]" ]; then + rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/upload_execution_run.py -r 17-BTFJ -w 17-BTFM -g 17-BT50 -i 17-BTFT -s Success -d 'This is a test execution run' -o staging.gudmap.org -c ${cookie} -u F) && + echo ${rid} test execution run created + else + rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT') && + rid=${rid:7:-6} && + rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/upload_execution_run.py -r 17-BTFJ -w 17-BTFM -g 17-BT50 -i 17-BTFT -s Success -d 'This is a test execution run' -o staging.gudmap.org -c ${cookie} -u ${rid}) && + echo ${rid} test execution run already exists + fi + +uploadQC: + stage: unit + only: + - push + - tags + except: + - merge_requests + 
script: + - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json + - > + exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:mRNA_QC/Replicate=17-BTFJ) && + cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') && + cookie=${cookie:11:-1} && + if [ "${exist}" != "[]" ]; then + rids=$(echo ${exist} | grep -o '\"RID\":\".\{7\}' | sed 's/^.\{7\}//') && + for rid in ${rids}; do + singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/delete_entry.py -r ${rid} -t mRNA_QC -o staging.gudmap.org -c ${cookie} + done + echo all old mRNA QC RIDs deleted + fi + rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/upload_qc.py -r 17-BTFJ -e 17-BTG4 -p "Single Read" -s forward -l 35 -w 5 -f 1 -n "This is a test mRNA QC" -o staging.gudmap.org -c ${cookie} -u F) + echo ${rid} test mRNA QC created + +uploadProcessedFile: stage: unit only: - push @@ -298,10 +375,54 @@ outputBag: except: - merge_requests script: + - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json + - echo THIS IS A TEST FILE > 17-BTFJ_test.csv + - mkdir -p ./deriva/Seq/pipeline/17-BTFE/17-BTG4/ + - mv 17-BTFJ_test.csv ./deriva/Seq/pipeline/17-BTFE/17-BTG4/17-BTFJ_test.csv + - > + exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Processed_File/Replicate=17-BTFJ) && + cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') && + cookie=${cookie:11:-1} && + if [ "${exist}" != "[]" ]; then + rids=$(echo ${exist} | grep -o '\"RID\":\".\{7\}' | sed 's/^.\{7\}//') && + for rid in ${rids}; do + singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/delete_entry.py -r ${rid} -t Processed_File -o staging.gudmap.org -c ${cookie} + done + echo all old processed file RIDs deleted + fi + singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-upload-cli --catalog 2 --token ${cookie:9} staging.gudmap.org ./deriva + echo test processed file uploaded - mkdir test - - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bdbag test --archiver zip + - singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' bdbag test --archiver zip + - echo test output bag created - pytest -m outputBag +uploadOutputBag: + stage: unit + only: + - push + - tags + except: + - merge_requests + script: + - ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json + - echo THIS IS A TEST FILE > test.txt + - > + md5=$(md5sum ./test.txt | awk '{ print $1 }') && + size=$(wc -c < ./test.txt) && + exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Output_Bag/File_MD5=${md5}) && + if [ "${exist}" == "[]" ]; then + cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') && + cookie=${cookie:11:-1} && + loc=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host staging.gudmap.org put ./test.txt /hatrac/resources/rnaseq/pipeline/output_bag/TEST/test.txt --parents) && + rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/upload_output_bag.py -e 17-BTG4 -f test.txt -l ${loc} -s ${md5} -b ${size} -n 'This is a test output bag' -o staging.gudmap.org -c ${cookie}) && + echo ${rid} test output bag created + else + rid=$(echo 
${exist} | grep -o '\"RID\":\".*\",\"RCT') && + rid=${rid:8:-6} && + echo ${rid} test output bag already exists + fi + generateVersions: stage: aggregation @@ -311,7 +432,7 @@ generateVersions: except: - merge_requests script: - - singularity run 'docker://bicf/multiqc1.8:2.0.1_indev' multiqc --version > version_multiqc.txt + - singularity run 'docker://gudmaprbk/multiqc1.9:1.0.0' multiqc --version > version_multiqc.txt - python ./workflow/scripts/generate_versions.py -o software_versions - python ./workflow/scripts/generate_references.py -r ./docs/references.md -o software_references artifacts: @@ -323,7 +444,7 @@ generateVersions: expire_in: 7 days -humanBioHPC: +human_BioHPC: stage: reference only: - push @@ -334,7 +455,7 @@ humanBioHPC: - mkdir -p hu - cp -R /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2 ./hu/ -mouseBioHPC: +mouse_BioHPC: stage: reference only: - push @@ -345,7 +466,7 @@ mouseBioHPC: - mkdir -p mo - cp -R /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2 ./mo/ -humanDataHub: +human_dev: stage: reference only: - push @@ -356,14 +477,13 @@ humanDataHub: - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt - referenceBase=dev.gudmap.org - refName=GRCh - - refHuVersion=38.p12.v31 - references=$(echo ${referenceBase}/${refName}${refHuVersion}) - GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1) - GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2) - GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f3) - query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE}) - curl --request GET ${query} > refQuery.json - - refURL=$(python ./workflow/scripts/extractRefData.py --returnParam URL) + - refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL) - loc=$(dirname ${refURL}) - if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') @@ -371,7 +491,7 @@ humanDataHub: - test=$(echo ${test} | grep -o ${filename}) - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi -mousenDataHub: +mouse_dev: stage: reference only: - push @@ -382,14 +502,115 @@ mousenDataHub: - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt - referenceBase=dev.gudmap.org - refName=GRCm + - references=$(echo ${referenceBase}/${refName}${refMoVersion}) + - GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1) + - GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2) + - GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' 
-f3) + - query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE}) + - curl --request GET ${query} > refQuery.json + - refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL) + - loc=$(dirname ${refURL}) + - if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi + - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') + - test=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/) + - test=$(echo ${test} | grep -o ${filename}) + - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi + +human_staging: + stage: reference + only: + - push + - tags + except: + - merge_requests + script: + - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt + - referenceBase=staging.gudmap.org + - refName=GRCh + - references=$(echo ${referenceBase}/${refName}${refHuVersion}) + - GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1) + - GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2) + - GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f3) + - query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE}) + - curl --request GET ${query} > refQuery.json + - refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL) + - loc=$(dirname ${refURL}) + - if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi + - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') + - test=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/) + - test=$(echo ${test} | grep -o ${filename}) + - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi + +mouse_staging: + stage: reference + only: + - push + - tags + except: + - merge_requests + script: + - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt + - referenceBase=staging.gudmap.org + - refName=GRCm - refHuVersion=38.p6.vM22 + - references=$(echo ${referenceBase}/${refName}${refMoVersion}) + - GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1) + - GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2) + - GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' 
-f3) + - query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE}) + - curl --request GET ${query} > refQuery.json + - refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL) + - loc=$(dirname ${refURL}) + - if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi + - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') + - test=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/) + - test=$(echo ${test} | grep -o ${filename}) + - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi + +human_prod: + stage: reference + only: + - push + - tags + except: + - merge_requests + script: + - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt + - referenceBase=www.gudmap.org + - refName=GRCh - references=$(echo ${referenceBase}/${refName}${refHuVersion}) - GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1) - GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2) - GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f3) - query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE}) - curl --request GET ${query} > refQuery.json - - refURL=$(python ./workflow/scripts/extractRefData.py --returnParam URL) + - refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL) + - loc=$(dirname ${refURL}) + - if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi + - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') + - test=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/) + - test=$(echo ${test} | grep -o ${filename}) + - if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi + +mouse_prod: + stage: reference + only: + - push + - tags + except: + - merge_requests + script: + - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt + - referenceBase=www.gudmap.org + - refName=GRCm + - refHuVersion=38.p6.vM22 + - references=$(echo ${referenceBase}/${refName}${refMoVersion}) + - GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1) + - GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2) + - GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' 
-f3) + - query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE}) + - curl --request GET ${query} > refQuery.json + - refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL) - loc=$(dirname ${refURL}) - if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi - filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)') @@ -418,7 +639,7 @@ integration_se: - SE_multiqc_data.json expire_in: 7 days retry: - max: 1 + max: 0 when: - always @@ -443,10 +664,11 @@ integration_pe: - PE_multiqc_data.json expire_in: 7 days retry: - max: 1 + max: 0 when: - always + override_inputBag: stage: integration only: [merge_requests] @@ -456,7 +678,7 @@ override_inputBag: script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --inputBagForce ./test_data/bag/staging/Replicate_Q-Y5F6.zip --ci true + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --inputBagForce ./test_data/bag/Q-Y5F6_inputBag_xxxxxxxx.zip --upload false --ci true - find . -type f -name "multiqc_data.json" -exec cp {} ./inputBagOverride_PE_multiqc_data.json \; artifacts: name: "$CI_JOB_NAME" @@ -465,7 +687,7 @@ override_inputBag: - inputBagOverride_PE_multiqc_data.json expire_in: 7 days retry: - max: 1 + max: 0 when: - always @@ -478,7 +700,7 @@ override_fastq: script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --fastqsForce './test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz' --ci true + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --fastqsForce './test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz' --upload false --ci true - find . -type f -name "multiqc_data.json" -exec cp {} ./fastqOverride_PE_multiqc_data.json \; artifacts: name: "$CI_JOB_NAME" @@ -487,7 +709,7 @@ override_fastq: - fastqOverride_PE_multiqc_data.json expire_in: 7 days retry: - max: 1 + max: 0 when: - always @@ -500,7 +722,7 @@ override_species: script: - hostname - ulimit -a - - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ER --speciesForce 'Homo sapiens' --ci true + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ER --speciesForce 'Homo sapiens' --upload false --ci true - find . -type f -name "multiqc_data.json" -exec cp {} ./speciesOverride_PE_multiqc_data.json \; artifacts: name: "$CI_JOB_NAME" @@ -509,7 +731,7 @@ override_species: - speciesOverride_PE_multiqc_data.json expire_in: 7 days retry: - max: 1 + max: 0 when: - always diff --git a/.gitlab/merge_request_templates/Merge_Request.md b/.gitlab/merge_request_templates/Merge_Request.md index 4d3a6b013244af0d542a58bf528d4ad972a0e828..9106ab37a0d604eca393e2d7f700a31f47e86c29 100644 --- a/.gitlab/merge_request_templates/Merge_Request.md +++ b/.gitlab/merge_request_templates/Merge_Request.md @@ -5,9 +5,9 @@ These are the most common things requested on pull requests. 
- [ ] This comment contains a description of changes (with reason) - [ ] If you've fixed a bug or added code that should be tested, add tests! - [ ] Documentation in `docs` is updated - - [ ] Replace dag.png with the most recent CI pipleine integrated_pe artifact - - [ ] Replace software_versions_mqc.yaml with the most recent CI pipleine generateVersions artifact - - [ ] Replace software_references_mqc.yaml with the most recent CI pipleine generateVersions artifact + - [ ] Replace dag.png with the most recent CI pipeline integrated_pe artifact + - [ ] Replace software_versions_mqc.yaml with the most recent CI pipeline generateVersions artifact + - [ ] Replace software_references_mqc.yaml with the most recent CI pipeline generateVersions artifact - [ ] `CHANGELOG.md` is updated - [ ] `README.md` is updated - [ ] `LICENSE.md` is updated with new contributors diff --git a/CHANGELOG.md b/CHANGELOG.md index 57b95447b438ca5d213c8ddc32d0917975aa618a..6b3749cd9b15f1b7ea5f54f41601a827ec2268d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,13 @@ -# v0.0.4 (in development) +# v0.1.0 (in development) **User Facing** * Add option to pull references from datahub * Add option to send email on workflow error, with pipeline error message * Add versions and paper references of software used to report +* Upload input bag +* Upload execution run +* Upload mRNA QC +* Create and upload output bag +* Add option to not upload **Background** * Remove (comment out) option to pull references from S3 @@ -10,11 +15,11 @@ * Start using new gudmaprbk dockerhub (images autobuilt) * Moved consistency checks to be fully python * Changed order of steps so that fastqc is done after the trim step +* Change docker images to production +* Add automated version badges *Known Bugs* * Datahub reference pull uses dev.gudmap.org as source until referencencs are placed on production -* outputBag does not contain fetch for processed data -* Does not include automatic data upload * Override params (inputBag, fastq, species) aren't checked for integrity <hr> diff --git a/README.md b/README.md index 7ddb775a4fd60eca262e604145395405bba5b47e..f8d0b49dac58b7e4d2a6f59c8166f73353dcac4a 100644 --- a/README.md +++ b/README.md @@ -37,9 +37,12 @@ To Run: * **dev** = [dev.gudmap.org](dev.gudmap.org) (default, does not contain all data) * **staging** = [staging.gudmap.org](staging.gudmap.org) (does not contain all data) * **production** = [www.gudmap.org](www.gudmap.org) (***does contain all data***) - * `--refMoVersion` mouse reference version ***(optional)*** - * `--refHuVersion` human reference version ***(optional)*** - * `--refERCCVersion` human reference version ***(optional)*** + * `--refMoVersion` mouse reference version ***(optional, default = 38.p6.vM22)*** + * `--refHuVersion` human reference version ***(optional, default = 38.p12.v31)*** + * `--refERCCVersion` ERCC reference version ***(optional, default = 92)*** + * `--upload` option to upload outputs back to the data-hub ***(optional, default = true)*** + * **true** = upload outputs to the data-hub + * **false** = do *NOT* upload outputs to the data-hub * `-profile` config profile to use ***(optional)***: * defaut = processes on BioHPC cluster * **biohpc** = process on BioHPC cluster @@ -47,7 +50,7 @@ To Run: * **aws_ondemand** = AWS Batch on-demand instant requests * **aws_spot** = AWS Batch spot instance requests * `--email` email address(es) to send failure notification (comma separated) ***(optional)***: - * e.g: `--email 
'venkat.malladi@utsouthwestern.edu,Gervaise.Henry@UTSouthwestern.edu'` + * e.g: `--email 'Venkat.Malladi@utsouthwestern.edu,Gervaise.Henry@UTSouthwestern.edu'` * NOTES: * once deriva-auth is run and authenticated, the two files above are saved in ```~/.deriva/``` (see official documents from [deriva](https://github.com/informatics-isi-edu/deriva-client#installer-packages-for-windows-and-macosx) on the lifetime of the credentials) diff --git a/docs/dag.png b/docs/dag.png index 785a852de0310576415c2c99f80c3452f7ad176e..58456bbcad81eb5752fb85a764dfb6792cea9aaa 100644 Binary files a/docs/dag.png and b/docs/dag.png differ diff --git a/docs/software_references_mqc.yaml b/docs/software_references_mqc.yaml index 675745fdb642ee27b2aa223bf5dfae78c1bb3897..d9d18558b7df3f626ff89cdb01c610228db92a8b 100644 --- a/docs/software_references_mqc.yaml +++ b/docs/software_references_mqc.yaml @@ -4,7 +4,7 @@ description: 'This section describes references for the tools used.' plot_type: 'html' data: | - + <h3 id="references">References</h3> <ol style="list-style-type: decimal"> <li><strong>python</strong>:</li> @@ -41,7 +41,7 @@ <li><strong>hisat2</strong>:</li> </ol> <ul> - <li>Kim ,D.,Paggi, J.M., Park, C., Bennett, C., Salzberg, S.L. Graph-based genome alignment and genotyping with HISAT2 and HISAT-genotype. 2019 Nat Biotechnol. 2019 Aug;37(8):907-915. doi:<a href="https://doi.org/10.1038/s41587-019-0201-4">10.1038/s41587-019-0201-4</a></li> + <li>Kim, D., Paggi, J.M., Park, C., Bennett, C., Salzberg, S.L. 2019. Graph-based genome alignment and genotyping with HISAT2 and HISAT-genotype. Nat Biotechnol. Aug;37(8):907-915. doi:<a href="https://doi.org/10.1038/s41587-019-0201-4">10.1038/s41587-019-0201-4</a>.</li> </ul> <ol start="7" style="list-style-type: decimal"> <li><strong>samtools</strong>:</li> diff --git a/docs/software_versions_mqc.yaml b/docs/software_versions_mqc.yaml index 7b44cf8c8b898ceb03770670a6acd4d7396a2550..ea5487adc7ac4894ff48bae93783a5348a945160 100644 --- a/docs/software_versions_mqc.yaml +++ b/docs/software_versions_mqc.yaml @@ -6,19 +6,19 @@ description: 'are collected for pipeline version.' 
data: | <dl class="dl-horizontal"> - - <dt>Python</dt><dd>v3.7.7</dd> - <dt>DERIVA</dt><dd>v1.0.0</dd> + + <dt>Python</dt><dd>v3.8.3</dd> + <dt>DERIVA</dt><dd>v1.3.0</dd> <dt>BDBag</dt><dd>v1.5.6</dd> - <dt>RSeQC</dt><dd>v3.0.1</dd> - <dt>Trim Galore!</dt><dd>v0.6.4</dd> - <dt>HISAT2</dt><dd>v2.1.0</dd> - <dt>Samtools</dt><dd>v1.9</dd> - <dt>picard (MarkDuplicates)</dt><dd>v2.23.0-SNAPSHOT</dd> - <dt>featureCounts</dt><dd>v2.0.0</dd> - <dt>R</dt><dd>v3.6.3</dd> - <dt>deepTools</dt><dd>v3.3.2</dd> + <dt>RSeQC</dt><dd>v4.0.0</dd> + <dt>Trim Galore!</dt><dd>v0.6.4_dev</dd> + <dt>HISAT2</dt><dd>v2.2.1</dd> + <dt>Samtools</dt><dd>v1.11</dd> + <dt>picard (MarkDuplicates)</dt><dd>v2.23.9</dd> + <dt>featureCounts</dt><dd>v2.0.1</dd> + <dt>R</dt><dd>v4.0.3</dd> + <dt>deepTools</dt><dd>v3.5.0</dd> <dt>FastQC</dt><dd>v0.11.9</dd> - <dt>MultiQC</dt><dd>v1.8</dd> + <dt>MultiQC</dt><dd>v1.9</dd> <dt>Pipeline Version</dt><dd>v0.0.4_indev</dd> </dl> diff --git a/test_data/createTestData.sh b/test_data/createTestData.sh index aec4e91bb71d5bb79d87cd76994f6253cfe63ea3..8f5cebc6180a739189693c62451a3eb0f1970245 100644 --- a/test_data/createTestData.sh +++ b/test_data/createTestData.sh @@ -5,52 +5,54 @@ module load singularity/3.5.3 module load pigz/2.4 +ln -sfn /project/BICF/BICF_Core/shared/gudmap/test_data/* ../test_data/ + mkdir -p NEW_test_data -ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json +ln -sfn ./test_data/auth/credential.json ~/.deriva/credential.json mkdir -p ./NEW_test_data/bag -singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' deriva-download-cli dev.gudmap.org --catalog 2 ./workflow/conf/replicate_export_config.json . rid=Q-Y5F6 -cp Replicate_Q-Y5F6.zip ./NEW_test_data/bag/Replicate_Q-Y5F6.zip +singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-download-cli staging.gudmap.org --catalog 2 ../workflow/conf/Replicate_For_Input_Bag.json . 
rid=Q-Y5F6 +cp Q-Y5F6_inputBag.zip ./NEW_test_data/bag/Q-Y5F6_inputBag_xxxxxxxx.zip mkdir -p ./NEW_test_data/fastq -unzip ./test_data/bag/Replicate_Q-Y5F6.zip -singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' bash ./workflow/scripts/bdbagFetch.sh Replicate_Q-Y5F6 Replicate_Q-Y5F6 -cp Replicate_Q-Y5F6.R1.fastq.gz ./NEW_test_data/fastq/Replicate_Q-Y5F6.R1.fastq.gz -cp Replicate_Q-Y5F6.R2.fastq.gz ./NEW_test_data/fastq/Replicate_Q-Y5F6.R2.fastq.gz +unzip ./NEW_test_data/bag/Q-Y5F6_inputBag_xxxxxxxx.zip +singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' bash ../workflow/scripts/bdbagFetch.sh Q-Y5F6_inputBag Q-Y5F6 +cp Q-Y5F6.R1.fastq.gz ./NEW_test_data/fastq/Q-Y5F6.R1.fastq.gz +cp Q-Y5F6.R2.fastq.gz ./NEW_test_data/fastq/Q-Y5F6.R2.fastq.gz mkdir -p ./NEW_test_data/fastq/small -singularity exec 'docker://bicf/seqtk:2.0.0' seqtk sample -s100 ./NEW_test_data/fastq/Replicate_Q-Y5F6.R1.fastq.gz 1000000 1> Q-Y5F6_1M.R1.fastq -singularity exec 'docker://bicf/seqtk:2.0.0' seqtk sample -s100 ./NEW_test_data/fastq/Replicate_Q-Y5F6.R2.fastq.gz 1000000 1> Q-Y5F6_1M.R2.fastq +singularity exec 'docker://gudmaprbk/seqtk1.3:1.0.0' seqtk sample -s100 ./NEW_test_data/fastq/Q-Y5F6.R1.fastq.gz 1000000 1> Q-Y5F6_1M.R1.fastq +singularity exec 'docker://gudmaprbk/seqtk1.3:1.0.0' seqtk sample -s100 ./NEW_test_data/fastq/Q-Y5F6.R2.fastq.gz 1000000 1> Q-Y5F6_1M.R2.fastq pigz Q-Y5F6_1M.R1.fastq pigz Q-Y5F6_1M.R2.fastq cp Q-Y5F6_1M.R1.fastq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz cp Q-Y5F6_1M.R2.fastq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz mkdir -p ./NEW_test_data/meta -singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --basename Q-Y5F6_1M.se -j 20 ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename Q-Y5F6_1M.pe -j 20 ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz +singularity run 'docker://gudmaprbk/trimgalore0.6.5:1.0.0' trim_galore --gzip -q 25 --illumina --length 35 --basename Q-Y5F6_1M.se -j 20 ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz +singularity run 'docker://gudmaprbk/trimgalore0.6.5:1.0.0' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename Q-Y5F6_1M.pe -j 20 ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz cp Q-Y5F6_1M.se_trimmed.fq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz -cp Q-Y5F6_1M.pe_R1_val_1.fq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_R1_val_1.fq.gz -cp Q-Y5F6_1M.pe_R2_val_2.fq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_R2_val_2.fq.gz +cp Q-Y5F6_1M.pe_val_1.fq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_val_1.fq.gz +cp Q-Y5F6_1M.pe_val_2.fq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_val_2.fq.gz cp Q-Y5F6_1M.R1.fastq.gz_trimming_report.txt ./NEW_test_data/meta/Q-Y5F6_1M.R1.fastq.gz_trimming_report.txt cp Q-Y5F6_1M.R2.fastq.gz_trimming_report.txt ./NEW_test_data/meta/Q-Y5F6_1M.R2.fastq.gz_trimming_report.txt touch metaTest.csv -echo 'Replicate_RID,Experiment_RID,Study_RID,Paired_End,File_Type,Has_Strand_Specific_Information,Used_Spike_Ins,Species' > metaTest.csv -echo 'Replicate_RID,Experiment_RID,Study_RID,uk,FastQ,no,no,Homo sapiens' >> metaTest.csv +echo 'Replicate_RID,Experiment_RID,Study_RID,Paired_End,File_Type,Has_Strand_Specific_Information,Used_Spike_Ins,Species,Read_Length' > metaTest.csv +echo 'Replicate_RID,Experiment_RID,Study_RID,uk,FastQ,no,no,Homo 
sapiens,75' >> metaTest.csv cp metaTest.csv ./NEW_test_data/meta/metaTest.csv mkdir -p ./NEW_test_data/bam mkdir -p ./NEW_test_data/bam/small -singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.se.unal.gz -S Q-Y5F6_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./NEW_test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz --summary-file Q-Y5F6_1M.se.alignSummary.txt --new-summary -singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam -singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.bam -singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.sorted.bam.bai -singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.pe.unal.gz -S Q-Y5F6_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_R1_val_1.fq.gz -2 ./test_data/fastq/small/Q-Y5F6_1M.pe_R2_val_2.fq.gz --summary-file Q-Y5F6_1M.pe.alignSummary.txt --new-summary -singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.pe.bam Q-Y5F6_1M.pe.sam -singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.bam -singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ 20 -b Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.sorted.bam.bai +singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.se.unal.gz -S Q-Y5F6_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCm38.p6.vM22/hisat2/genome --rna-strandness F -U ./NEW_test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz --summary-file Q-Y5F6_1M.se.alignSummary.txt --new-summary +singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam +singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.bam +singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.sorted.bam.bai +singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.pe.unal.gz -S Q-Y5F6_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCm38.p6.vM22/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_val_1.fq.gz -2 ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_val_2.fq.gz --summary-file Q-Y5F6_1M.pe.alignSummary.txt --new-summary +singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.pe.bam Q-Y5F6_1M.pe.sam +singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.bam +singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools index -@ 20 -b Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.sorted.bam.bai cp Q-Y5F6_1M.se.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.bam cp Q-Y5F6_1M.pe.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.pe.bam cp Q-Y5F6_1M.se.sorted.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.bam @@ -60,18 +62,17 @@ cp Q-Y5F6_1M.pe.sorted.bam.bai 
./NEW_test_data/bam/small/Q-Y5F6_1M.pe.sorted.bam cp Q-Y5F6_1M.se.alignSummary.txt ./NEW_test_data/meta/Q-Y5F6_1M.se.alignSummary.txt cp Q-Y5F6_1M.pe.alignSummary.txt ./NEW_test_data/meta/Q-Y5F6_1M.pe.alignSummary.txt -singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.bam O=Q-Y5F6_1M.se.deduped.bam M=Q-Y5F6_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true -singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.deduped.bam -singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.sorted.deduped.bam.bai +singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.bam O=Q-Y5F6_1M.se.deduped.bam M=Q-Y5F6_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true +singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.deduped.bam +singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.sorted.deduped.bam.bai cp Q-Y5F6_1M.se.deduped.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.deduped.bam cp Q-Y5F6_1M.se.sorted.deduped.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam cp Q-Y5F6_1M.se.sorted.deduped.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam.bai -cp Q-Y5F6_1M.se.deduped.Metrics.txt /NEW_test_data/meta/Q-Y5F6_1M.se.deduped.Metrics.txt cp Q-Y5F6_1M.se.deduped.Metrics.txt ./NEW_test_data/meta/Q-Y5F6_1M.se.deduped.Metrics.txt for i in {"chr8","chr4","chrY"}; do echo "samtools view -b ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam ${i} > Q-Y5F6_1M.se.sorted.deduped.${i}.bam; samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.deduped.${i}.bam Q-Y5F6_1M.se.sorted.deduped.${i}.bam.bai;"; - done | singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' parallel -j 20 -k + done | singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' parallel -j 20 -k cp Q-Y5F6_1M.se.sorted.deduped.chr4.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.chr4.bam cp Q-Y5F6_1M.se.sorted.deduped.chr4.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.chr4.bam.bai cp Q-Y5F6_1M.se.sorted.deduped.chr8.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.chr8.bam @@ -81,28 +82,30 @@ cp Q-Y5F6_1M.se.sorted.deduped.chrY.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M. 
mkdir -p ./NEW_test_data/counts mkdir -p ./NEW_test_data/counts/small -ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/geneID.tsv -ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/Entrez.tsv -singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se.countData -s 1 -R SAM --primary --ignoreDup ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count Q-Y5F6_1M.se.countData -singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se -cp Q-Y5F6_1M.se.featureCounts ./NEW_test_data/counts/small/Q-Y5F6_1M.se.countData +ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCm38.p6.vM22/geneID.tsv +ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCm38.p6.vM22/Entrez.tsv +singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCm38.p6.vM22/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCm38.p6.vM22/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se_countData -s 1 -R SAM --primary --ignoreDup ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam +singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' Rscript ../workflow/scripts/calculateTPM.R --count Q-Y5F6_1M.se_countData +singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' Rscript ../workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se +cp Q-Y5F6_1M.se_countData ./NEW_test_data/counts/small/Q-Y5F6_1M.se_countData cp Q-Y5F6_1M.se.countTable.csv ./NEW_test_data/counts/small/Q-Y5F6_1M.se.countTable.csv -cp Q-Y5F6_1M.se.countTable.csv ./NEW_test_data/counts/small/Q-Y5F6_1M.se.tpmTable.csv +cp Q-Y5F6_1M.se_tpmTable.csv ./NEW_test_data/counts/small/Q-Y5F6_1M.se_tpmTable.csv mkdir -p ./NEW_test_data/bw mkdir -p ./NEW_test_data/bw/small -singularity run 'docker://bicf/deeptools3.3:2.0.0' bamCoverage -p 20 -b ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -o Q-Y5F6_1M.se.bw +singularity run 'docker://gudmaprbk/deeptools3.5.0:1.0.0' bamCoverage -p 20 -b ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -o Q-Y5F6_1M.se.bw cp Q-Y5F6_1M.se.bw ./NEW_test_data/bw/small/Q-Y5F6_1M.se.bw mkdir -p ./NEW_test_data/fastqc mkdir -p ./NEW_test_data/fastqc/small -singularity run 'docker://bicf/fastqc:2.0.0' ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o . +singularity run 'docker://gudmaprbk/fastqc0.11.9:1.0.0' fastqc ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o . 
cp Q-Y5F6_1M.R1_fastqc.html ./NEW_test_data/fastqc/small/Q-Y5F6_1M.R1_fastqc.html cp Q-Y5F6_1M.R1_fastqc.zip ./NEW_test_data/fastqc/small/Q-Y5F6_1M.R1_fastqc.zip echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > Q-Y5F6_1M.se.sorted.deduped.tin.xls for i in {"chr8","chr4","chrY"}; do -echo "tin.py -i ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.${i}.bam -r /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed; cat Q-Y5F6_1M.se.sorted.deduped.${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \"\\t${i}\\t\";"; done | singularity run 'docker://bicf/rseqc3.0:2.0.0' parallel -j 20 -k >> Q-Y5F6_1M.se.sorted.deduped.tin.xls +echo "tin.py -i ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.${i}.bam -r /project/BICF/BICF_Core/shared/gudmap/references/GRCm38.p6.vM22/bed/genome.bed; cat Q-Y5F6_1M.se.sorted.deduped.${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \"\\t${i}\\t\";"; done | singularity run 'docker://gudmaprbk/rseqc4.0.0:1.0.0' parallel -j 20 -k >> Q-Y5F6_1M.se.sorted.deduped.tin.xls cp Q-Y5F6_1M.se.sorted.deduped.tin.xls ./NEW_test_data/meta/Q-Y5F6_1M.se.sorted.deduped.tin.xls +chgrp -R BICF_Core ./NEW_test_data +chmod -R 750 ./NEW_test_data diff --git a/workflow/conf/Execution_Run_For_Output_Bag.json b/workflow/conf/Execution_Run_For_Output_Bag.json new file mode 100755 index 0000000000000000000000000000000000000000..5945b1eb8c4c5e3ec862840f232ed7a8e386d770 --- /dev/null +++ b/workflow/conf/Execution_Run_For_Output_Bag.json @@ -0,0 +1,64 @@ +{ + "bag": { + "bag_name": "Execution_Run_{rid}", + "bag_algorithms": [ + "md5" + ], + "bag_archiver": "zip", + "bag_metadata": {} + }, + "catalog": { + "catalog_id": "2", + "query_processors": [ + { + "processor": "csv", + "processor_params": { + "output_path": "Execution_Run", + "query_path": "/attribute/M:=RNASeq:Execution_Run/RID=17-BPAG/RID,Replicate_RID:=Replicate,Workflow_RID:=Workflow,Reference_Genone_RID:=Reference_Genome,Input_Bag_RID:=Input_Bag,Notes,Execution_Status,Execution_Status_Detail,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Workflow", + "query_path": "/entity/M:=RNASeq:Execution_Run/RID=17-BPAG/RNASeq:Workflow?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Reference_Genome", + "query_path": "/entity/M:=RNASeq:Execution_Run/RID=17-BPAG/RNASeq:Reference_Genome?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Input_Bag", + "query_path": "/entity/M:=RNASeq:Execution_Run/RID=17-BPAG/RNASeq:Input_Bag?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "mRNA_QC", + "query_path": "/attribute/M:=RNASeq:Execution_Run/RID=17-BPAG/(RID)=(RNASeq:mRNA_QC:Execution_Run)/RID,Execution_Run_RID:=Execution_Run,Replicate_RID:=Replicate,Paired_End,Strandedness,Median_Read_Length,Raw_Count,Final_Count,Notes,RCT,RMT?limit=none" + } + }, + { + "processor": "fetch", + "processor_params": { + "output_path": "assets/Study/{Study_RID}/Experiment/{Experiment_RID}/Replicate/{Replicate_RID}/Execution_Run/{Execution_Run_RID}/Output_Files", + "query_path": "/attribute/M:=RNASeq:Execution_Run/RID=17-BPAG/R:=RNASeq:Replicate/$M/(RID)=(RNASeq:Processed_File:Execution_Run)/url:=File_URL,length:=File_Bytes,filename:=File_Name,md5:=File_MD5,Execution_Run_RID:=M:RID,Study_RID:=R:Study_RID,Experiment_RID:=R:Experiment_RID,Replicate_RID:=R:RID?limit=none" + } + }, + { + "processor": "fetch", + "processor_params": { + "output_path": 
"assets/Study/{Study_RID}/Experiment/{Experiment_RID}/Replicate/{Replicate_RID}/Execution_Run/{Execution_Run_RID}/Input_Bag", + "query_path": "/attribute/M:=RNASeq:Execution_Run/RID=17-BPAG/R:=RNASeq:Replicate/$M/RNASeq:Input_Bag/url:=File_URL,length:=File_Bytes,filename:=File_Name,md5:=File_MD5,Execution_Run_RID:=M:RID,Study_RID:=R:Study_RID,Experiment_RID:=R:Experiment_RID,Replicate_RID:=R:RID?limit=none" + } + } + ] + } +} \ No newline at end of file diff --git a/workflow/conf/replicate_export_config.json b/workflow/conf/Replicate_For_Input_Bag.json similarity index 99% rename from workflow/conf/replicate_export_config.json rename to workflow/conf/Replicate_For_Input_Bag.json index ff17fa513c5bc130a2e2bdaf9aa41b070c99b290..4380e46734a4425f7df57ad0cf0553a868b03c9d 100644 --- a/workflow/conf/replicate_export_config.json +++ b/workflow/conf/Replicate_For_Input_Bag.json @@ -1,6 +1,6 @@ { "bag": { - "bag_name": "Replicate_{rid}", + "bag_name": "{rid}_inputBag", "bag_algorithms": [ "md5" ], diff --git a/workflow/conf/aws.config b/workflow/conf/aws.config index c26e1a4a7e44318c53f4baa561ee6c5cb3020798..3e3cbb65a60f726cb936d0dacaefe3a1a07662e4 100644 --- a/workflow/conf/aws.config +++ b/workflow/conf/aws.config @@ -84,7 +84,23 @@ process { cpus = 2 memory = '1 GB' } - withName: outputBag { + withName: uploadInputBag { + cpus = 1 + memory = '1 GB' + } + withName: uploadExecutionRun { + cpus = 1 + memory = '1 GB' + } + withName: uploadQC { + cpus = 1 + memory = '1 GB' + } + withName: uploadProcessedFile { + cpus = 1 + memory = '1 GB' + } + withName: uploadOutputBag { cpus = 1 memory = '1 GB' } diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index 57b72fd929898a976d9d077ff1b23470388cc37a..ca9f0e4f935099a50f6ef241dfe42f37e6e382b9 100755 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -58,7 +58,19 @@ process { withName: aggrQC { executor = 'local' } - withName: outputBag { + withName: uploadInputBag { + executor = 'local' + } + withName: uploadExecutionRun { + executor = 'local' + } + withName: uploadQC { + executor = 'local' + } + withName: uploadProcessedFile { + executor = 'local' + } + withName: uploadOutputBag { executor = 'local' } } diff --git a/workflow/conf/multiqc_config.yaml b/workflow/conf/multiqc_config.yaml index 8a91a75b919a858cdb43c27d0349290bef04b967..ed1375aed47a454394029e5057695b0c15babd8c 100644 --- a/workflow/conf/multiqc_config.yaml +++ b/workflow/conf/multiqc_config.yaml @@ -74,10 +74,14 @@ custom_data: scale: false format: '{}' headers: - Session - Session ID - Pipeline Version - Input + Session: + description: '' + Session ID: + description: 'Nextflow session ID' + Pipeline Version: + description: 'BICF pipeline version' + Input: + description: 'Input overrides' rid: file_format: 'tsv' section_name: 'RID' @@ -88,10 +92,14 @@ custom_data: scale: false format: '{}' headers: - Replicate - Replicate RID - Experiment RID - Study RID + Replicate: + description: '' + Replicate RID: + description: 'Replicate RID' + Experiment RID: + description: 'Experiment RID' + Study RID: + description: 'Study RID' meta: file_format: 'tsv' section_name: 'Metadata' @@ -102,30 +110,43 @@ custom_data: scale: false format: '{:,.0f}' headers: - Source - Species - Ends - Stranded - Spike-in - Raw Reads - Assigned Reads - Median Read Length - Median TIN - Pipeline Version + Source: + description: 'Metadata source' + Species: + description: 'Species' + Ends: + description: 'Single or paired end sequencing' + Stranded: + description: 'Stranded 
(forward/reverse) or unstranded library prep' + Spike-in: + description: 'ERCC spike-in' + Raw Reads: + description: 'Number of reads from the sequencer' + Assigned Reads: + description: 'Final reads after filtering' + Median Read Length: + description: 'Median read length' + Median TIN: + description: 'Median transcript integrity number' + ref: file_format: 'tsv' section_name: 'Reference' - description: 'This is the referenec version information' + description: 'This is the reference version information' plot_type: 'table' pconfig: id: 'ref' scale: false format: '{}' headers: - Species - Genome Reference Consortium Build - Genome Reference Consortium Patch - GENCODE Annotation Release" + Species: + description: 'Reference species' + Genome Reference Consortium Build: + description: 'Reference source build' + Genome Reference Consortium Patch: + description: 'Reference source patch version' + GENCODE Annotation Release: + description: 'Annotation release version' tin: file_format: 'tsv' section_name: 'TIN' @@ -135,16 +156,16 @@ custom_data: id: 'tin' headers: chrom - 0 - 9 - 10 - 19 - 20 - 29 - 30 - 39 - 40 - 49 - 50 - 59 - 60 - 69 - 70 - 79 - 80 - 89 - 90 - 99 + 1 - 10 + 11 - 20 + 21 - 30 + 31 - 40 + 41 - 50 + 51 - 60 + 61 - 70 + 71 - 80 + 81 - 90 + 91 - 100 sp: run: @@ -156,4 +177,4 @@ sp: ref: fn: 'reference.tsv' tin: - fn: '*.tin.hist.tsv' + fn: '*_tin.hist.tsv' diff --git a/workflow/nextflow.config b/workflow/nextflow.config index f9fcef964a79aafb697c020defa67e68b93f5ec0..33b16b42074e47dcce027c2ac391304b95c91ff3 100644 --- a/workflow/nextflow.config +++ b/workflow/nextflow.config @@ -20,55 +20,67 @@ profiles { process { withName:getBag { - container = 'bicf/gudmaprbkfilexfer:2.0.1_indev' + container = 'gudmaprbk/deriva1.3:1.0.0' } withName:getData { - container = 'bicf/gudmaprbkfilexfer:2.0.1_indev' + container = 'gudmaprbk/deriva1.3:1.0.0' } withName: parseMetadata { container = 'gudmaprbk/python3:1.0.0' } withName: trimData { - container = 'bicf/trimgalore:1.1' + container = 'gudmaprbk/trimgalore0.6.5:1.0.0' } withName: getRefInfer { container = 'gudmaprbk/deriva1.3:1.0.0' } withName: downsampleData { - container = 'bicf/seqtk:2.0.1_indev' + container = 'gudmaprbk/seqtk1.3:1.0.0' } withName: alignSampleData { - container = 'bicf/gudmaprbkaligner:2.0.1_indev' + container = 'gudmaprbk/hisat2.2.1:1.0.0' } withName: inferMetadata { - container = 'bicf/rseqc3.0:2.0.1_indev' + container = 'gudmaprbk/rseqc4.0.0:1.0.0' } withName: getRef { container = 'gudmaprbk/deriva1.3:1.0.0' } withName: alignData { - container = 'bicf/gudmaprbkaligner:2.0.1_indev' + container = 'gudmaprbk/hisat2.2.1:1.0.0' } withName: dedupData { - container = 'bicf/gudmaprbkdedup:2.0.0' + container = 'gudmaprbk/picard2.23.9:1.0.0' } withName: countData { - container = 'bicf/subread2:2.0.0' + container = 'gudmaprbk/subread2.0.1:1.0.0' } withName: makeBigWig { - container = 'bicf/deeptools3.3:2.0.1_indev' + container = 'gudmaprbk/deeptools3.5.0:1.0.0' } withName: fastqc { - container = 'bicf/fastqc:2.0.1_indev' + container = 'gudmaprbk/fastqc0.11.9:1.0.0' } withName: dataQC { - container = 'bicf/rseqc3.0:2.0.1_indev' + container = 'gudmaprbk/rseqc4.0.0:1.0.0' } withName: aggrQC { - container = 'bicf/multiqc1.8:2.0.1_indev' + container = 'gudmaprbk/multiqc1.9:1.0.0' + } + withName:uploadInputBag { + container = 'gudmaprbk/deriva1.3:1.0.0' + } + withName:uploadExecutionRun { + container = 'gudmaprbk/deriva1.3:1.0.0' } - withName:outputBag { - container = 'bicf/gudmaprbkfilexfer:2.0.1_indev' + withName:uploadQC { 
container = 'gudmaprbk/deriva1.3:1.0.0' + } + withName:uploadProcessedFile { + container = 'gudmaprbk/deriva1.3:1.0.0' + } + withName:uploadOutputBag { + container = 'gudmaprbk/deriva1.3:1.0.0' } } diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index 04709a14369741efdafe5f4fbc0dbc7bb06a7627..bd1aee4ea8fda6acce19febbec38e0118ed0079b 100644 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -18,6 +18,7 @@ params.refMoVersion = "38.p6.vM22" params.refHuVersion = "38.p12.v31" params.refERCCVersion = "92" params.outDir = "${baseDir}/../output" +params.upload = true params.email = "" @@ -36,6 +37,11 @@ deriva.into { deriva_getBag deriva_getRefInfer deriva_getRef + deriva_uploadInputBag + deriva_uploadExecutionRun + deriva_uploadQC + deriva_uploadProcessedFile + deriva_uploadOutputBag } bdbag = Channel .fromPath(params.bdbag) @@ -46,13 +52,15 @@ refHuVersion = params.refHuVersion refERCCVersion = params.refERCCVersion outDir = params.outDir logsDir = "${outDir}/Logs" +upload = params.upload inputBagForce = params.inputBagForce fastqsForce = params.fastqsForce speciesForce = params.speciesForce email = params.email -// Define fixed files -derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json") +// Define fixed files and export configs +replicateExportConfig = Channel.fromPath("${baseDir}/conf/Replicate_For_Input_Bag.json") +executionRunExportConfig = Channel.fromPath("${baseDir}/conf/Execution_Run_For_Output_Bag.json") if (params.source == "dev") { source = "dev.gudmap.org" } else if (params.source == "staging") { @@ -74,15 +82,20 @@ softwareReferences = Channel.fromPath("${baseDir}/../docs/software_references_mq softwareVersions = Channel.fromPath("${baseDir}/../docs/software_versions_mqc.yaml") // Define script files -script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh") -script_parseMeta = Channel.fromPath("${baseDir}/scripts/parseMeta.py") -script_inferMeta = Channel.fromPath("${baseDir}/scripts/inferMeta.sh") -script_refDataInfer = Channel.fromPath("${baseDir}/scripts/extractRefData.py") -script_refData = Channel.fromPath("${baseDir}/scripts/extractRefData.py") +script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbag_fetch.sh") +script_parseMeta = Channel.fromPath("${baseDir}/scripts/parse_meta.py") +script_inferMeta = Channel.fromPath("${baseDir}/scripts/infer_meta.sh") +script_refDataInfer = Channel.fromPath("${baseDir}/scripts/extract_ref_data.py") +script_refData = Channel.fromPath("${baseDir}/scripts/extract_ref_data.py") script_calculateTPM = Channel.fromPath("${baseDir}/scripts/calculateTPM.R") script_convertGeneSymbols = Channel.fromPath("${baseDir}/scripts/convertGeneSymbols.R") -script_tinHist = Channel.fromPath("${baseDir}/scripts/tinHist.py") - +script_tinHist = Channel.fromPath("${baseDir}/scripts/tin_hist.py") +script_uploadInputBag = Channel.fromPath("${baseDir}/scripts/upload_input_bag.py") +script_uploadExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py") +script_uploadQC = Channel.fromPath("${baseDir}/scripts/upload_qc.py") +script_uploadOutputBag = Channel.fromPath("${baseDir}/scripts/upload_output_bag.py") +script_deleteEntry_uploadQC = Channel.fromPath("${baseDir}/scripts/delete_entry.py") +script_deleteEntry_uploadProcessedFile = Channel.fromPath("${baseDir}/scripts/delete_entry.py") /* * trackStart: track start of pipeline @@ -143,10 +156,10 @@ process getBag { input: path credential, stageAs: "credential.json" from deriva_getBag - path derivaConfig + path replicateExportConfig output: - path 
("Replicate_*.zip") into bag + path ("*.zip") into bag when: inputBagForce == "" @@ -164,8 +177,15 @@ process getBag { # deriva-download replicate RID echo -e "LOG: fetching bag for ${repRID} in GUDMAP" >> ${repRID}.getBag.log - deriva-download-cli staging.gudmap.org --catalog 2 ${derivaConfig} . rid=${repRID} + deriva-download-cli ${source} --catalog 2 ${replicateExportConfig} . rid=${repRID} echo -e "LOG: fetched" >> ${repRID}.getBag.log + + name=\$(ls *.zip) + name=\$(basename \${name} | cut -d "." -f1) + yr=\$(date +'%Y') + mn=\$(date +'%m') + dy=\$(date +'%d') + mv \${name}.zip \${name}_\${yr}\${mn}\${dy}.zip """ } @@ -177,6 +197,10 @@ if (inputBagForce != "") { } else { inputBag = bag } +inputBag.into { + inputBag_getData + inputBag_uploadInputBag +} /* * getData: fetch study files from consortium with downloaded bdbag.zip @@ -187,7 +211,7 @@ process getData { input: path script_bdbagFetch path cookies, stageAs: "deriva-cookies.txt" from bdbag - path inputBag + path inputBag from inputBag_getData output: path ("*.R{1,2}.fastq.gz") into fastqs @@ -207,7 +231,7 @@ process getData { echo -e "LOG: linked" >> ${repRID}.getData.log # get bag basename - replicate=\$(basename "${inputBag}" | cut -d "." -f1) + replicate=\$(basename "${inputBag}") echo -e "LOG: bag replicate name \${replicate}" >> ${repRID}.getData.log # unzip bag @@ -217,7 +241,7 @@ process getData { # bag fetch fastq's only and rename by repRID echo -e "LOG: fetching replicate bdbag" >> ${repRID}.getData.log - sh ${script_bdbagFetch} \${replicate} ${repRID} + sh ${script_bdbagFetch} \${replicate::-13} ${repRID} echo -e "LOG: fetched" >> ${repRID}.getData.log """ } @@ -249,7 +273,7 @@ process parseMetadata { path experiment from experimentMeta output: - path "design.csv" into metadata + path "design.csv" into metadata_fl script: """ @@ -310,7 +334,7 @@ speciesMeta = Channel.create() readLengthMeta = Channel.create() expRID = Channel.create() studyRID = Channel.create() -metadata.splitCsv(sep: ",", header: false).separate( +metadata_fl.splitCsv(sep: ",", header: false).separate( endsMeta, endsManual, strandedMeta, @@ -320,6 +344,7 @@ metadata.splitCsv(sep: ",", header: false).separate( expRID, studyRID ) + // Replicate metadata for multiple process inputs endsManual.into { endsManual_trimData @@ -327,6 +352,16 @@ endsManual.into { endsManual_alignSampleData endsManual_aggrQC } +studyRID.into { + studyRID_aggrQC + studyRID_uploadInputBag + studyRID_uploadProcessedFile + studyRID_uploadOutputBag +} +expRID.into { + expRID_aggrQC + expRID_uploadProcessedFile +} /* @@ -336,14 +371,14 @@ process trimData { tag "${repRID}" input: - val ends from endsManual_trimData path (fastq) from fastqs_trimData + val ends from endsManual_trimData output: path ("*.fq.gz") into fastqsTrim path ("*.fastq.gz", includeInputs:true) into fastqs_fastqc path ("*_trimming_report.txt") into trimQC - path ("readLength.csv") into inferMetadata_readLength + path ("readLength.csv") into readLengthInfer_fl script: """ @@ -371,11 +406,16 @@ process trimData { // Extract calculated read length metadata into channel readLengthInfer = Channel.create() -inferMetadata_readLength.splitCsv(sep: ",", header: false).separate( +readLengthInfer_fl.splitCsv(sep: ",", header: false).separate( readLengthInfer ) -// Replicate trimmed fastq's +// Replicate infered read length for multiple process inputs +readLengthInfer.into { + readLengthInfer_aggrQC + readLengthInfer_uploadQC +} +// Replicate trimmed fastq's for multiple process inputs fastqsTrim.into { fastqsTrim_alignData 
fastqsTrim_downsampleData @@ -450,9 +490,9 @@ process getRefInfer { query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version=${refName}${refERCCVersion}/Annotation_Version=${refName}${refERCCVersion}') fi curl --request GET \${query} > refQuery.json - refURL=\$(python extractRefData.py --returnParam URL) + refURL=\$(python ${script_refDataInfer} --returnParam URL) loc=\$(dirname \${refURL}) - fName=\$(python extractRefData.py --returnParam fName) + fName=\$(python ${script_refDataInfer} --returnParam fName) fName=\${fName%.*} if [ "\${loc}" = "/hatrac/*" ]; then echo "LOG: Reference not present in hatrac"; exit 1; fi filename=\$(echo \$(basename \${refURL}) | grep -oP '.*(?=:)') @@ -483,8 +523,8 @@ process downsampleData { tag "${repRID}" input: - val ends from endsManual_downsampleData path fastq from fastqsTrim_downsampleData + val ends from endsManual_downsampleData output: path ("sampled.1.fq") into fastqs1Sample @@ -575,7 +615,7 @@ process inferMetadata { path alignSummary from alignSampleQC_inferMetadata.collect() output: - path "infer.csv" into inferMetadata + path "infer.csv" into inferMetadata_fl path "${repRID}.infer_experiment.txt" into inferExperiment script: @@ -642,18 +682,18 @@ process inferMetadata { infer_experiment.py -r "\${bed}" -i "\${bam}" 1>> ${repRID}.infer_experiment.txt echo -e "LOG: infered" >> ${repRID}.inferMetadata.log - ended=`bash inferMeta.sh endness ${repRID}.infer_experiment.txt` - fail=`bash inferMeta.sh fail ${repRID}.infer_experiment.txt` + ended=`bash ${script_inferMeta} endness ${repRID}.infer_experiment.txt` + fail=`bash ${script_inferMeta} fail ${repRID}.infer_experiment.txt` if [ \${ended} == "PairEnd" ] then ends="pe" - percentF=`bash inferMeta.sh pef ${repRID}.infer_experiment.txt` - percentR=`bash inferMeta.sh per ${repRID}.infer_experiment.txt` + percentF=`bash ${script_inferMeta} pef ${repRID}.infer_experiment.txt` + percentR=`bash ${script_inferMeta} per ${repRID}.infer_experiment.txt` elif [ \${ended} == "SingleEnd" ] then ends="se" - percentF=`bash inferMeta.sh sef ${repRID}.infer_experiment.txt` - percentR=`bash inferMeta.sh ser ${repRID}.infer_experiment.txt` + percentF=`bash ${script_inferMeta} sef ${repRID}.infer_experiment.txt` + percentR=`bash ${script_inferMeta} ser ${repRID}.infer_experiment.txt` fi echo -e "LOG: percentage reads in the same direction as gene: \${percentF}" >> ${repRID}.inferMetadata.log echo -e "LOG: percentage reads in the opposite direction as gene: \${percentR}" >> ${repRID}.inferMetadata.log @@ -684,7 +724,7 @@ align_moInfer = Channel.create() percentFInfer = Channel.create() percentRInfer = Channel.create() failInfer = Channel.create() -inferMetadata.splitCsv(sep: ",", header: false).separate( +inferMetadata_fl.splitCsv(sep: ",", header: false).separate( endsInfer, strandedInfer, spikeInfer, @@ -703,20 +743,24 @@ endsInfer.into { endsInfer_countData endsInfer_dataQC endsInfer_aggrQC + endsInfer_uploadQC } strandedInfer.into { strandedInfer_alignData strandedInfer_countData strandedInfer_aggrQC + strandedInfer_uploadQC } spikeInfer.into{ spikeInfer_getRef spikeInfer_aggrQC + spikeInfer_uploadExecutionRun } speciesInfer.into { speciesInfer_getRef speciesInfer_aggrQC - speciesInfer_outputBag + speciesInfer_uploadExecutionRun + speciesInfer_uploadProcessedFile } @@ -727,8 +771,8 @@ process getRef { tag "${species}" input: - path credential, stageAs: "credential.json" from deriva_getRef path script_refData + path credential, stageAs: "credential.json" from 
deriva_getRef val spike from spikeInfer_getRef val species from speciesInfer_getRef @@ -796,9 +840,9 @@ process getRef { GENCODE=\$(echo \${references} | grep -o \${refName}.* | cut -d '.' -f3) query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='\${GRCv}'.'\${GRCp}'/Annotation_Version=GENCODE%20'\${GENCODE}) curl --request GET \${query} > refQuery.json - refURL=\$(python extractRefData.py --returnParam URL) + refURL=\$(python ${script_refData} --returnParam URL) loc=\$(dirname \${refURL}) - fName=\$(python extractRefData.py --returnParam fName) + fName=\$(python ${script_refData} --returnParam fName) fName=\${fName%.*} if [ "\${loc}" = "/hatrac/*" ]; then echo "LOG: Reference not present in hatrac"; exit 1; fi filename=\$(echo \$(basename \${refURL}) | grep -oP '.*(?=:)') @@ -824,10 +868,10 @@ process alignData { tag "${repRID}" input: - val ends from endsInfer_alignData - val stranded from strandedInfer_alignData path fastq from fastqsTrim_alignData path reference_alignData + val ends from endsInfer_alignData + val stranded from strandedInfer_alignData output: tuple path ("${repRID}.sorted.bam"), path ("${repRID}.sorted.bam.bai") into rawBam @@ -897,8 +941,8 @@ process dedupData { tuple path (bam), path (bai) from rawBam_dedupData output: - tuple path ("${repRID}.sorted.deduped.bam"), path ("${repRID}.sorted.deduped.bam.bai") into dedupBam - tuple path ("${repRID}.sorted.deduped.*.bam"), path ("${repRID}.sorted.deduped.*.bam.bai") into dedupChrBam + tuple path ("${repRID}_sorted.deduped.bam"), path ("${repRID}_sorted.deduped.bam.bai") into dedupBam + tuple path ("${repRID}_sorted.deduped.*.bam"), path ("${repRID}_sorted.deduped.*.bam.bai") into dedupChrBam path ("*.deduped.Metrics.txt") into dedupQC script: @@ -913,16 +957,16 @@ process dedupData { # sort the bam file using Samtools echo -e "LOG: sorting the bam file" >> ${repRID}.dedup.log - samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.deduped.bam ${repRID}.deduped.bam + samtools sort -@ `nproc` -O BAM -o ${repRID}_sorted.deduped.bam ${repRID}.deduped.bam # index the sorted bam using Samtools echo -e "LOG: indexing sorted bam file" >> ${repRID}.dedup.log - samtools index -@ `nproc` -b ${repRID}.sorted.deduped.bam ${repRID}.sorted.deduped.bam.bai + samtools index -@ `nproc` -b ${repRID}_sorted.deduped.bam ${repRID}_sorted.deduped.bam.bai # split the deduped BAM file for multi-threaded tin calculation - for i in `samtools view ${repRID}.sorted.deduped.bam | cut -f3 | sort | uniq`; + for i in `samtools view ${repRID}_sorted.deduped.bam | cut -f3 | sort | uniq`; do - echo "echo \"LOG: splitting each chromosome into its own BAM and BAI files with Samtools\"; samtools view -b ${repRID}.sorted.deduped.bam \${i} 1>> ${repRID}.sorted.deduped.\${i}.bam; samtools index -@ `nproc` -b ${repRID}.sorted.deduped.\${i}.bam ${repRID}.sorted.deduped.\${i}.bam.bai" + echo "echo \"LOG: splitting each chromosome into its own BAM and BAI files with Samtools\"; samtools view -b ${repRID}_sorted.deduped.bam \${i} 1>> ${repRID}_sorted.deduped.\${i}.bam; samtools index -@ `nproc` -b ${repRID}_sorted.deduped.\${i}.bam ${repRID}_sorted.deduped.\${i}.bam.bai" done | parallel -j `nproc` -k """ } @@ -932,6 +976,7 @@ dedupBam.into { dedupBam_countData dedupBam_makeBigWig dedupBam_dataQC + dedupBam_uploadProcessedFile } /* @@ -945,7 +990,7 @@ process makeBigWig { tuple path (bam), path (bai) from dedupBam_makeBigWig output: - path ("${repRID}.bw") + path ("${repRID}_sorted.deduped.bw") into bigwig script: """ 
@@ -954,7 +999,7 @@ process makeBigWig { # create bigwig echo -e "LOG: creating bibWig" >> ${repRID}.makeBigWig.log - bamCoverage -p `nproc` -b ${bam} -o ${repRID}.bw + bamCoverage -p `nproc` -b ${bam} -o ${repRID}_sorted.deduped.bw echo -e "LOG: created" >> ${repRID}.makeBigWig.log """ } @@ -964,7 +1009,7 @@ process makeBigWig { */ process countData { tag "${repRID}" - publishDir "${outDir}/count", mode: 'copy', pattern: "${repRID}*.tpmTable.csv" + publishDir "${outDir}/count", mode: 'copy', pattern: "${repRID}*_tpmTable.csv" input: path script_calculateTPM @@ -975,9 +1020,9 @@ process countData { val stranded from strandedInfer_countData output: - path ("*.tpmTable.csv") into counts - path ("*.countData.summary") into countsQC - path ("assignedReads.csv") into inferMetadata_assignedReads + path ("*_tpmTable.csv") into counts + path ("*_countData.summary") into countsQC + path ("assignedReads.csv") into assignedReadsInfer_fl script: """ @@ -1004,32 +1049,38 @@ process countData { echo -e "LOG: counting ${ends} features" >> ${repRID}.countData.log if [ "${ends}" == "se" ] then - featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}.countData -s \${stranding} -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam + featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}_countData -s \${stranding} -R SAM --primary --ignoreDup ${repRID}_sorted.deduped.bam elif [ "${ends}" == "pe" ] then - featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}.countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam + featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}_countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}_sorted.deduped.bam fi echo -e "LOG: counted" >> ${repRID}.countData.log # extract assigned reads - grep -m 1 'Assigned' *.countData.summary | grep -oe '\\([0-9.]*\\)' > assignedReads.csv + grep -m 1 'Assigned' *_countData.summary | grep -oe '\\([0-9.]*\\)' > assignedReads.csv # calculate TPM from the resulting countData table echo -e "LOG: calculating TPM with R" >> ${repRID}.countData.log - Rscript calculateTPM.R --count "${repRID}.countData" + Rscript ${script_calculateTPM} --count "${repRID}_countData" # convert gene symbols to Entrez id's echo -e "LOG: convert gene symbols to Entrez id's" >> ${repRID}.countData.log - Rscript convertGeneSymbols.R --repRID "${repRID}" + Rscript ${script_convertGeneSymbols} --repRID "${repRID}" """ } // Extract number of assigned reads metadata into channel assignedReadsInfer = Channel.create() -inferMetadata_assignedReads.splitCsv(sep: ",", header: false).separate( +assignedReadsInfer_fl.splitCsv(sep: ",", header: false).separate( assignedReadsInfer ) +// Replicate infered assigned reads for multiple process inputs +assignedReadsInfer.into { + assignedReadsInfer_aggrQC + assignedReadsInfer_uploadQC +} + /* *fastqc: run fastqc on untrimmed fastq's */ @@ -1041,7 +1092,7 @@ process fastqc { output: path ("*_fastqc.zip") into fastqc - path ("rawReads.csv") into inferMetadata_rawReads + path ("rawReads.csv") into rawReadsInfer_fl script: """ @@ -1059,10 +1110,16 @@ process fastqc { // Extract number of raw reads metadata into channel rawReadsInfer = Channel.create() -inferMetadata_rawReads.splitCsv(sep: ",", header: false).separate( +rawReadsInfer_fl.splitCsv(sep: ",", header: 
false).separate( rawReadsInfer ) +// Replicate infered raw reads for multiple process inputs +rawReadsInfer.into { + rawReadsInfer_aggrQC + rawReadsInfer_uploadQC +} + /* *dataQC: calculate transcript integrity numbers (TIN) and bin as well as calculate innerdistance of PE replicates */ @@ -1077,9 +1134,9 @@ process dataQC { val ends from endsInfer_dataQC output: - path "${repRID}.tin.hist.tsv" into tinHist - path "${repRID}.tin.med.csv" into inferMetadata_tinMed - path "${repRID}.insertSize.inner_distance_freq.txt" into innerDistance + path "${repRID}_tin.hist.tsv" into tinHist + path "${repRID}_tin.med.csv" into tinMedInfer_fl + path "${repRID}_insertSize.inner_distance_freq.txt" into innerDistance script: """ @@ -1087,10 +1144,10 @@ process dataQC { ulimit -a >> ${repRID}.dataQC.log # calcualte TIN values per feature on each chromosome - echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > ${repRID}.sorted.deduped.tin.xls + echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > ${repRID}_sorted.deduped.tin.xls for i in `cat ./bed/genome.bed | cut -f1 | sort | uniq`; do - echo "echo \"LOG: running tin.py on \${i}\" >> ${repRID}.dataQC.log; tin.py -i ${repRID}.sorted.deduped.\${i}.bam -r ./bed/genome.bed; cat ${repRID}.sorted.deduped.\${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \\\"\\\\t\${i}\\\\t\\\";"; - done | parallel -j `nproc` -k 1>> ${repRID}.sorted.deduped.tin.xls + echo "echo \"LOG: running tin.py on \${i}\" >> ${repRID}.dataQC.log; tin.py -i ${repRID}_sorted.deduped.\${i}.bam -r ./bed/genome.bed; cat ${repRID}_sorted.deduped.\${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \\\"\\\\t\${i}\\\\t\\\";"; + done | parallel -j `nproc` -k 1>> ${repRID}_sorted.deduped.tin.xls # bin TIN values echo -e "LOG: binning TINs" >> ${repRID}.dataQC.log @@ -1101,19 +1158,19 @@ process dataQC { if [ "${ends}" == "pe" ] then echo -e "LOG: calculating inner distances for ${ends}" >> ${repRID}.dataQC.log - inner_distance.py -i "${bam}" -o ${repRID}.insertSize -r ./bed/genome.bed + inner_distance.py -i "${bam}" -o ${repRID}_insertSize -r ./bed/genome.bed echo -e "LOG: calculated" >> ${repRID}.dataQC.log elif [ "${ends}" == "se" ] then echo -e "LOG: creating dummy inner distance file for ${ends}" >> ${repRID}.dataQC.log - touch ${repRID}.insertSize.inner_distance_freq.txt + touch ${repRID}_insertSize.inner_distance_freq.txt fi """ } // Extract median TIN metadata into channel tinMedInfer = Channel.create() -inferMetadata_tinMed.splitCsv(sep: ",", header: false).separate( +tinMedInfer_fl.splitCsv(sep: ",", header: false).separate( tinMedInfer ) @@ -1149,12 +1206,12 @@ process aggrQC { val spikeI from spikeInfer_aggrQC val speciesI from speciesInfer_aggrQC val readLengthM from readLengthMeta - val readLengthI from readLengthInfer - val rawReadsI from rawReadsInfer - val assignedReadsI from assignedReadsInfer + val readLengthI from readLengthInfer_aggrQC + val rawReadsI from rawReadsInfer_aggrQC + val assignedReadsI from assignedReadsInfer_aggrQC val tinMedI from tinMedInfer - val expRID - val studyRID + val studyRID from studyRID_aggrQC + val expRID from expRID_aggrQC output: path "${repRID}.multiqc.html" into multiqc @@ -1226,24 +1283,270 @@ process aggrQC { """ } +/* + * uploadInputBag: uploads the input bag +*/ +process uploadInputBag { + tag "${repRID}" + + input: + path script_uploadInputBag + path credential, stageAs: "credential.json" from deriva_uploadInputBag + path inputBag from inputBag_uploadInputBag + val studyRID from studyRID_uploadInputBag + + output: + path ("inputBagRID.csv") into inputBagRID_fl 
+ + when: + upload + + script: + """ + hostname > ${repRID}.uploadInputBag.log + ulimit -a >> ${repRID}.uploadInputBag.log + + yr=\$(date +'%Y') + mn=\$(date +'%m') + dy=\$(date +'%d') + + file=\$(basename -a ${inputBag}) + md5=\$(md5sum ./\${file} | awk '{ print \$1 }') + echo LOG: ${repRID} input bag md5 sum - \${md5} >> ${repRID}.uploadInputBag.log + size=\$(wc -c < ./\${file}) + echo LOG: ${repRID} input bag size - \${size} bytes >> ${repRID}.uploadInputBag.log + + exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Input_Bag/File_MD5=\${md5}) + if [ "\${exist}" == "[]" ] + then + cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') + cookie=\${cookie:11:-1} + + loc=\$(deriva-hatrac-cli --host ${source} put ./\${file} /hatrac/resources/rnaseq/pipeline/input_bag/study/${studyRID}/replicate/${repRID}/\${file} --parents) + inputBag_rid=\$(python3 ${script_uploadInputBag} -f \${file} -l \${loc} -s \${md5} -b \${size} -o ${source} -c \${cookie}) + echo LOG: input bag RID uploaded - \${inputBag_rid} >> ${repRID}.uploadInputBag.log + rid=\${inputBag_rid} + else + exist=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + exist=\${exist:7:-6} + echo LOG: input bag RID already exists - \${exist} >> ${repRID}.uploadInputBag.log + rid=\${exist} + fi + + echo \${rid} > inputBagRID.csv + """ +} + +// Extract input bag RID into channel +inputBagRID = Channel.create() +inputBagRID_fl.splitCsv(sep: ",", header: false).separate( + inputBagRID +) + +/* + * uploadExecutionRun: uploads the execution run +*/ +process uploadExecutionRun { + tag "${repRID}" + + input: + path script_uploadExecutionRun + path credential, stageAs: "credential.json" from deriva_uploadExecutionRun + val spike from spikeInfer_uploadExecutionRun + val species from speciesInfer_uploadExecutionRun + val inputBagRID + + output: + path ("executionRunRID.csv") into executionRunRID_fl + + when: + upload + + script: + """ + hostname > ${repRID}.uploadExecutionRun.log + ulimit -a >> ${repRID}.uploadExecutionRun.log + + echo LOG: searching for workflow RID - BICF mRNA ${workflow.manifest.version} >> ${repRID}.uploadExecutionRun.log + workflow=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Workflow/Name=BICF%20mRNA%20Replicate/Version=${workflow.manifest.version}) + workflow=\$(echo \${workflow} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + workflow=\${workflow:7:-6} + echo LOG: workflow RID extracted - \${workflow} >> ${repRID}.uploadExecutionRun.log + + if [ "${species}" == "Homo sapiens" ] + then + genomeName=\$(echo GRCh${refHuVersion}) + elif [ "${species}" == "Mus musculus" ] + then + genomeName=\$(echo GRCm${refMoVersion}) + fi + if [ "${spike}" == "yes" ] + then + genomeName=\$(echo \${genomeName}-S) + fi + echo LOG: searching for genome name - \${genomeName} >> ${repRID}.uploadExecutionRun.log + genome=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Name=\${genomeName}_indev) + genome=\$(echo \${genome} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + genome=\${genome:7:-6} + echo LOG: genome RID extracted - \${genome} >> ${repRID}.uploadExecutionRun.log + + cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') + cookie=\${cookie:11:-1} + + exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=\${workflow}/Replicate=${repRID}/Input_Bag=${inputBagRID}) + echo \${exist} >> ${repRID}.uploadExecutionRun.log + if [ "\${exist}" == "[]" ] + then + 
executionRun_rid=\$(python3 ${script_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s In-progress -d 'Run in process' -o ${source} -c \${cookie} -u F) + echo LOG: execution run RID uploaded - \${executionRun_rid} >> ${repRID}.uploadExecutionRun.log + else + rid=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + rid=\${rid:7:-6} + echo \${rid} >> ${repRID}.uploadExecutionRun.log + executionRun_rid=\$(python3 ${script_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s In-progress -d 'Run in process' -o ${source} -c \${cookie} -u \${rid}) + echo LOG: execution run RID updated - \${executionRun_rid} >> ${repRID}.uploadExecutionRun.log + fi + + echo \${executionRun_rid} > executionRunRID.csv + """ +} + +// Extract execution run RID into channel +executionRunRID = Channel.create() +executionRunRID_fl.splitCsv(sep: ",", header: false).separate( + executionRunRID +) + +// +executionRunRID.into { + executionRunRID_uploadQC + executionRunRID_uploadProcessedFile + executionRunRID_uploadOutputBag +} + +/* + * uploadQC: uploads the mRNA QC +*/ +process uploadQC { + tag "${repRID}" + + input: + path script_deleteEntry_uploadQC + path script_uploadQC + path credential, stageAs: "credential.json" from deriva_uploadQC + val executionRunRID from executionRunRID_uploadQC + val ends from endsInfer_uploadQC + val stranded from strandedInfer_uploadQC + val length from readLengthInfer_uploadQC + val rawCount from rawReadsInfer_uploadQC + val finalCount from assignedReadsInfer_uploadQC + + + output: + path ("qcRID.csv") into qcRID_fl + + when: + upload + + script: + """ + hostname > ${repRID}.uploadQC.log + ulimit -a >> ${repRID}.uploadQC.log + + if [ "${ends}" == "pe" ] + then + end="Paired End" + elif [ "${ends}" == "se" ] + then + end="Single Read" + fi + + cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') + cookie=\${cookie:11:-1} + + exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:mRNA_QC/Replicate=${repRID}) + if [ "\${exist}" != "[]" ] + then + rids=\$(echo \${exist} | grep -o '\\"RID\\":\\".\\{7\\}' | sed 's/^.\\{7\\}//') + for rid in \${rids} + do + python3 ${script_deleteEntry_uploadQC} -r \${rid} -t mRNA_QC -o ${source} -c \${cookie} + echo LOG: old mRNA QC RID deleted - \${rid} >> ${repRID}.uploadQC.log + done + echo LOG: all old mRNA QC RIDs deleted >> ${repRID}.uploadQC.log + fi + + qc_rid=\$(python3 ${script_uploadQC} -r ${repRID} -e ${executionRunRID} -p "\${end}" -s ${stranded} -l ${length} -w ${rawCount} -f ${finalCount} -o ${source} -c \${cookie} -u F) + echo LOG: mRNA QC RID uploaded - \${qc_rid} >> ${repRID}.uploadQC.log + + echo \${qc_rid} > qcRID.csv + """ +} + +// Extract mRNA qc RID into channel +qcRID = Channel.create() +qcRID_fl.splitCsv(sep: ",", header: false).separate( + qcRID +) + /* *ouputBag: create ouputBag */ -process outputBag { +process uploadProcessedFile { tag "${repRID}" publishDir "${outDir}/outputBag", mode: 'copy', pattern: "Replicate_${repRID}.outputBag.zip" input: + path script_deleteEntry_uploadProcessedFile + path credential, stageAs: "credential.json" from deriva_uploadProcessedFile + path executionRunExportConfig path multiqc path multiqcJSON - val species from speciesInfer_outputBag + tuple path (bam),path (bai) from dedupBam_uploadProcessedFile + path bigwig + path counts + val species from speciesInfer_uploadProcessedFile + val studyRID from studyRID_uploadProcessedFile + val expRID from expRID_uploadProcessedFile + val 
executionRunRID from executionRunRID_uploadProcessedFile output: - path ("Replicate_*.zip") into outputBag + path ("${repRID}_Output_Bag.zip") into outputBag + + when: + upload script: """ - mkdir Replicate_${repRID}.outputBag + hostname > ${repRID}.outputBag.log + ulimit -a >> ${repRID}.outputBag.log + + mkdir -p ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ + cp ${bam} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ + cp ${bai} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ + cp ${bigwig} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ + cp ${counts} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/ + + cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') + cookie=\${cookie:11:-1} + + exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Processed_File/Replicate=${repRID}) + if [ "\${exist}" != "[]" ] + then + rids=\$(echo \${exist} | grep -o '\\"RID\\":\\".\\{7\\}' | sed 's/^.\\{7\\}//') + for rid in \${rids} + do + python3 ${script_deleteEntry_uploadProcessedFile} -r \${rid} -t Processed_File -o ${source} -c \${cookie} + done + echo LOG: all old processed file RIDs deleted >> ${repRID}.uploadQC.log + fi + + deriva-upload-cli --catalog 2 --token \${cookie:9} ${source} ./deriva + echo LOG: processed files uploaded >> ${repRID}.outputBag.log + + deriva-download-cli --catalog 2 --token \${cookie:9} ${source} ${executionRunExportConfig} . rid=${executionRunRID} + echo LOG: execution run bag downloaded >> ${repRID}.outputBag.log + echo -e "### Run Details" >> runDetails.md echo -e "**Workflow URL:** https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq" >> runDetails.md echo -e "**Workflow Version:** ${workflow.manifest.version}" >> runDetails.md @@ -1260,13 +1563,85 @@ process outputBag { echo -e "**Genome Assembly Version:** \${genome} patch \${patch}" >> runDetails.md echo -e "**Annotation Version:** GENCODE release \${annotation}" >> runDetails.md echo -e "**Run ID:** ${repRID}" >> runDetails.md - cp runDetails.md Replicate_${repRID}.outputBag - cp ${multiqc} Replicate_${repRID}.outputBag - cp ${multiqcJSON} Replicate_${repRID}.outputBag - bdbag Replicate_${repRID}.outputBag --archiver zip + echo LOG: runDetails.md created >> ${repRID}.outputBag.log + + unzip Execution_Run_${executionRunRID}.zip + yr=\$(date +'%Y') + mn=\$(date +'%m') + dy=\$(date +'%d') + mv Execution_Run_${executionRunRID} ${repRID}_Output_Bag_\${yr}\${mn}\${dy} + loc=./${repRID}_Output_Bag/data/assets/Study/${studyRID}/Experiment/${expRID}/Replicate/${repRID}/Execution_Run/${executionRunRID}/Output_Files/ + mkdir -p \${loc} + cp runDetails.md \${loc} + cp ${multiqc} \${loc} + cp ${multiqcJSON} \${loc} + + bdbag ./${repRID}_Output_Bag/ --update --archiver zip --debug + echo LOG: output bag created >> ${repRID}.outputBag.log + """ +} + +/* + * uploadOutputBag: uploads the output bag +*/ +process uploadOutputBag { + tag "${repRID}" + + input: + path script_uploadOutputBag + path credential, stageAs: "credential.json" from deriva_uploadOutputBag + path outputBag + val studyRID from studyRID_uploadOutputBag + val executionRunRID from executionRunRID_uploadOutputBag + + output: + path ("outputBagRID.csv") into outputBagRID_fl + + when: + upload + + script: + """ + hostname > ${repRID}.uploadOutputBag.log + ulimit -a >> ${repRID}.uploadOutputBag.log + + yr=\$(date +'%Y') + mn=\$(date +'%m') + dy=\$(date +'%d') + + file=\$(basename -a ${outputBag}) + md5=\$(md5sum ./\${file} | awk '{ print \$1 }') + echo LOG: ${repRID} output bag md5 sum - 
\${md5} >> ${repRID}.uploadOutputBag.log + size=\$(wc -c < ./\${file}) + echo LOG: ${repRID} output bag size - \${size} bytes >> ${repRID}.uploadOutputBag.log + + exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Output_Bag/File_MD5=\${md5}) + if [ "\${exist}" == "[]" ] + then + cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"') + cookie=\${cookie:11:-1} + + loc=\$(deriva-hatrac-cli --host ${source} put ./\${file} /hatrac/resources/rnaseq/pipeline/output_bag/study/${studyRID}/replicate/${repRID}/\${file} --parents) + outputBag_rid=\$(python3 ${script_uploadOutputBag} -e ${executionRunRID} -f \${file} -l \${loc} -s \${md5} -b \${size} -o ${source} -c \${cookie}) + echo LOG: output bag RID uploaded - \${outputBag_rid} >> ${repRID}.uploadOutputBag.log + rid=\${outputBag_rid} + else + exist=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT') + exist=\${exist:8:-6} + echo LOG: output bag RID already exists - \${exist} >> ${repRID}.uploadOutputBag.log + rid=\${exist} + fi + + echo \${rid} > outputBagRID.csv """ } +// Extract output bag RID into channel +outputBagRID = Channel.create() +outputBagRID_fl.splitCsv(sep: ",", header: false).separate( + outputBagRID +) + workflow.onError = { subject = "$workflow.manifest.name FAILED: $params.repRID" diff --git a/workflow/scripts/bdbagFetch.sh b/workflow/scripts/bdbag_fetch.sh similarity index 100% rename from workflow/scripts/bdbagFetch.sh rename to workflow/scripts/bdbag_fetch.sh diff --git a/workflow/scripts/convertGeneSymbols.R b/workflow/scripts/convertGeneSymbols.R index 49752f1bba5a4dd8d91ed8609c6f2f82b8fafacc..6cc5c0a1089881ae1dfd32e248a1ffbcbcd7b24a 100644 --- a/workflow/scripts/convertGeneSymbols.R +++ b/workflow/scripts/convertGeneSymbols.R @@ -23,4 +23,4 @@ output <- merge(x=convert,y=countTable[,c("gene_name","gene_id","count","tpm")], colnames(output) <- c("GENCODE_Gene_Symbol","NCBI_GeneID","Ensembl_GeneID","count","tpm") output <- output[,c(1,3,2,4:5)] -write.table(output,file=paste0(opt$repRID,".tpmTable.csv"),sep=",",row.names=FALSE,quote=FALSE) +write.table(output,file=paste0(opt$repRID,"_tpmTable.csv"),sep=",",row.names=FALSE,quote=FALSE) diff --git a/workflow/scripts/delete_entry.py b/workflow/scripts/delete_entry.py new file mode 100644 index 0000000000000000000000000000000000000000..1b26509a8c1541f65a39e660ec7e6ec158194ef1 --- /dev/null +++ b/workflow/scripts/delete_entry.py @@ -0,0 +1,37 @@ +import argparse +from deriva.core import ErmrestCatalog, get_credential, BaseCLI +import sys +import csv + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-r', '--RID', help="replicate RID", required=True) + parser.add_argument('-t', '--table', help="source table", required=True) + parser.add_argument('-o', '--host', help="datahub host", required=True) + parser.add_argument('-c', '--cookie', help="cookie token", required=True) + args = parser.parse_args() + return args + +def main(hostname, catalog_number, credential): + catalog = ErmrestCatalog('https', hostname, catalog_number, credential) + pb = catalog.getPathBuilder() + if args.table == 'mRNA_QC': + run_table = pb.RNASeq.mRNA_QC + elif args.table == "Processed_File": + run_table = pb.RNASeq.Processed_File + + path = run_table.filter(run_table.RID == args.RID) + path.delete() + rid = args.RID + + + print(rid + " deleted") + + +if __name__ == '__main__': + args = get_args() + cli = BaseCLI("Custom RNASeq query", None, 1) + cli.remove_options(["--config-file"]) + host = args.host + credentials 
= {"cookie": args.cookie} + main(host, 2, credentials) \ No newline at end of file diff --git a/workflow/scripts/extractRefData.py b/workflow/scripts/extract_ref_data.py similarity index 100% rename from workflow/scripts/extractRefData.py rename to workflow/scripts/extract_ref_data.py diff --git a/workflow/scripts/generate_versions.py b/workflow/scripts/generate_versions.py index 85d42da201697e8c6db07e672438fd28e639d1eb..09447d17a62a439a418753398e1cd77716ceaa74 100644 --- a/workflow/scripts/generate_versions.py +++ b/workflow/scripts/generate_versions.py @@ -38,7 +38,7 @@ SOFTWARE_REGEX = { 'Trim Galore!': ['version_trimgalore.txt', r"version (\S+)"], 'HISAT2': ['version_hisat2.txt', r"version (\S+)"], 'Samtools': ['version_samtools.txt', r"samtools (\S+)"], - 'picard (MarkDuplicates)': ['version_markdups.txt', r"(\S\.\S{2}\.\S+)"], + 'picard (MarkDuplicates)': ['version_markdups.txt', r"Version:(\S+)"], 'featureCounts': ['version_featurecounts.txt', r"featureCounts v(\S+)"], 'R': ['version_r.txt', r"R version (\S+)"], 'deepTools': ['version_deeptools.txt', r"deeptools (\S+)"], diff --git a/workflow/scripts/inferMeta.sh b/workflow/scripts/infer_meta.sh similarity index 100% rename from workflow/scripts/inferMeta.sh rename to workflow/scripts/infer_meta.sh diff --git a/workflow/scripts/parseMeta.py b/workflow/scripts/parse_meta.py similarity index 100% rename from workflow/scripts/parseMeta.py rename to workflow/scripts/parse_meta.py diff --git a/workflow/scripts/splitStudy.py b/workflow/scripts/split_study.py similarity index 100% rename from workflow/scripts/splitStudy.py rename to workflow/scripts/split_study.py diff --git a/workflow/scripts/splitStudy.sh b/workflow/scripts/split_study.sh similarity index 100% rename from workflow/scripts/splitStudy.sh rename to workflow/scripts/split_study.sh diff --git a/workflow/scripts/tinHist.py b/workflow/scripts/tin_hist.py similarity index 91% rename from workflow/scripts/tinHist.py rename to workflow/scripts/tin_hist.py index a95a9c23b4cf5ac3b25c85580a7b2c326bcb0eb1..ee36bb6447dfe0adcdaab60e1224cca5b5a6e246 100644 --- a/workflow/scripts/tinHist.py +++ b/workflow/scripts/tin_hist.py @@ -17,7 +17,7 @@ def get_args(): def main(): args = get_args() - tin = pd.read_csv(args.repRID + '.sorted.deduped.tin.xls', + tin = pd.read_csv(args.repRID + '_sorted.deduped.tin.xls', sep="\t", header=0) hist = pd.cut(tin['TIN'], bins=pd.interval_range( @@ -42,8 +42,8 @@ def main(): hist = hist[['TOTAL'] + [i for i in hist.columns if i != 'TOTAL']] hist = hist.T.fillna(0.0).astype(int) #hist = hist.apply(lambda x: x/x.sum()*100, axis=1) - hist.to_csv(args.repRID + '.tin.hist.tsv', sep='\t') - medFile = open(args.repRID + '.tin.med.csv', "w") + hist.to_csv(args.repRID + '_tin.hist.tsv', sep='\t') + medFile = open(args.repRID + '_tin.med.csv', "w") medFile.write(str(round(tin['TIN'][(tin['TIN'] != 0)].median(), 2))) medFile.close() diff --git a/workflow/scripts/upload_execution_run.py b/workflow/scripts/upload_execution_run.py new file mode 100644 index 0000000000000000000000000000000000000000..5af8565ab0426bd32dc886188a0347360ff4b42c --- /dev/null +++ b/workflow/scripts/upload_execution_run.py @@ -0,0 +1,62 @@ +import argparse +from deriva.core import ErmrestCatalog, get_credential, BaseCLI +import sys +import csv + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-r', '--repRID', help="replicate RID", required=True) + parser.add_argument('-w', '--workflowRID', help="workflow RID", required=True) + parser.add_argument('-g', 
'--referenceRID', help="reference genome RID", required=True) + parser.add_argument('-i', '--inputBagRID', help="inputBag RID", required=True) + parser.add_argument('-n', '--notes', help="notes", default="", required=False) + parser.add_argument('-s', '--status', help="run status", default="", required=False) + parser.add_argument('-d', '--statusDetail', help="status detail", default="", required=False) + parser.add_argument('-o', '--host', help="datahub host", required=True) + parser.add_argument('-c', '--cookie', help="cookie token", required=True) + parser.add_argument('-u', '--update', help="update?", default="F", required=True) + args = parser.parse_args() + return args + +def main(hostname, catalog_number, credential): + catalog = ErmrestCatalog('https', hostname, catalog_number, credential) + pb = catalog.getPathBuilder() + run_table = pb.RNASeq.Execution_Run + + if args.update == "F": + run_data = { + "Replicate": args.repRID, + "Workflow": args.workflowRID, + "Reference_Genome": args.referenceRID, + "Input_Bag": args.inputBagRID, + "Notes": args.notes, + "Execution_Status": args.status, + "Execution_Status_Detail": args.statusDetail + } + entities = run_table.insert([run_data]) + rid = entities[0]["RID"] + else: + run_data = { + "RID": args.update, + "Replicate": args.repRID, + "Workflow": args.workflowRID, + "Reference_Genome": args.referenceRID, + "Input_Bag": args.inputBagRID, + "Notes": args.notes, + "Execution_Status": args.status, + "Execution_Status_Detail": args.statusDetail + } + entities = run_table.update([run_data]) + rid = args.update + + + print(rid) + + +if __name__ == '__main__': + args = get_args() + cli = BaseCLI("Custom RNASeq query", None, 1) + cli.remove_options(["--config-file"]) + host = args.host + credentials = {"cookie": args.cookie} + main(host, 2, credentials) \ No newline at end of file diff --git a/workflow/scripts/upload_input_bag.py b/workflow/scripts/upload_input_bag.py new file mode 100644 index 0000000000000000000000000000000000000000..ab4d338074ffec3098667dcf2817041e01ded8bd --- /dev/null +++ b/workflow/scripts/upload_input_bag.py @@ -0,0 +1,46 @@ +import argparse +from deriva.core import ErmrestCatalog, get_credential, BaseCLI +import sys +import csv +from datetime import datetime + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-f', '--file', help="file name", required=True) + parser.add_argument('-l', '--loc', help="datahub location", required=True) + parser.add_argument('-s', '--md5', help="md5 sum", required=True) + parser.add_argument('-b', '--bytes', help="size in bytes", required=True) + parser.add_argument('-n', '--notes', help="notes", default="", required=False) + parser.add_argument('-o', '--host', help="datahub host", required=True) + parser.add_argument('-c', '--cookie', help="cookie token", required=True) + args = parser.parse_args() + return args + +def main(hostname, catalog_number, credential): + catalog = ErmrestCatalog('https', hostname, catalog_number, credential) + pb = catalog.getPathBuilder() + inputBag_table = pb.RNASeq.Input_Bag + + inputBag_data = { + "File_Name": args.file, + "File_URL": args.loc, + "File_MD5": args.md5, + "File_Bytes": args.bytes, + "File_Creation_Time": datetime.now().replace(microsecond=0).isoformat(), + "Notes": args.notes, + "Bag_Type": "Replicate_Input_Seq" + } + + entities = inputBag_table.insert([inputBag_data]) + rid = entities[0]["RID"] + + print(rid) + + +if __name__ == '__main__': + args = get_args() + cli = BaseCLI("Custom RNASeq query", None, 1) + 
cli.remove_options(["--config-file"]) + host = args.host + credential = {"cookie": args.cookie} + main(host, 2, credential) \ No newline at end of file diff --git a/workflow/scripts/upload_output_bag.py b/workflow/scripts/upload_output_bag.py new file mode 100644 index 0000000000000000000000000000000000000000..397658c0ccef21af86e529a040a6dcb2ac506833 --- /dev/null +++ b/workflow/scripts/upload_output_bag.py @@ -0,0 +1,48 @@ +import argparse +from deriva.core import ErmrestCatalog, get_credential, BaseCLI +import sys +import csv +from datetime import datetime + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-e', '--executionRunRID', help="exection run RID", required=True) + parser.add_argument('-f', '--file', help="file name", required=True) + parser.add_argument('-l', '--loc', help="datahub location", required=True) + parser.add_argument('-s', '--md5', help="md5 sum", required=True) + parser.add_argument('-b', '--bytes', help="size in bytes", required=True) + parser.add_argument('-n', '--notes', help="notes", default="", required=False) + parser.add_argument('-o', '--host', help="datahub host", required=True) + parser.add_argument('-c', '--cookie', help="cookie token", required=True) + args = parser.parse_args() + return args + +def main(hostname, catalog_number, credential): + catalog = ErmrestCatalog('https', hostname, catalog_number, credential) + pb = catalog.getPathBuilder() + outputBag_table = pb.RNASeq.Output_Bag + + outputBag_data = { + "Execution_Run": args.executionRunRID, + "File_Name": args.file, + "File_URL": args.loc, + "File_MD5": args.md5, + "File_Bytes": args.bytes, + "File_Creation_Time": datetime.now().replace(microsecond=0).isoformat(), + "Notes": args.notes, + "Bag_Type": "mRNA_Replicate_Analysis" + } + + entities = outputBag_table.insert([outputBag_data]) + rid = entities[0]["RID"] + + print(rid) + + +if __name__ == '__main__': + args = get_args() + cli = BaseCLI("Custom RNASeq query", None, 1) + cli.remove_options(["--config-file"]) + host = args.host + credential = {"cookie": args.cookie} + main(host, 2, credential) \ No newline at end of file diff --git a/workflow/scripts/upload_qc.py b/workflow/scripts/upload_qc.py new file mode 100644 index 0000000000000000000000000000000000000000..930896d3abce8882aca7985a4ad304904f6b3a44 --- /dev/null +++ b/workflow/scripts/upload_qc.py @@ -0,0 +1,65 @@ +import argparse +from deriva.core import ErmrestCatalog, get_credential, BaseCLI +import sys +import csv + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-r', '--repRID', help="replicate RID", required=True) + parser.add_argument('-e', '--executionRunRID', help="exection run RID", required=True) + parser.add_argument('-p', '--ends', help="single/paired ends", required=True) + parser.add_argument('-s', '--stranded', help="stranded?", required=True) + parser.add_argument('-l', '--length', help="median read length", required=True) + parser.add_argument('-w', '--rawCount', help="raw count", required=True) + parser.add_argument('-f', '--assignedCount', help="final assigned count", required=True) + parser.add_argument('-n', '--notes', help="notes", default="", required=False) + parser.add_argument('-o', '--host', help="datahub host", required=True) + parser.add_argument('-c', '--cookie', help="cookie token", required=True) + parser.add_argument('-u', '--update', help="update?", default="F", required=True) + args = parser.parse_args() + return args + +def main(hostname, catalog_number, credential): + catalog = 
ErmrestCatalog('https', hostname, catalog_number, credential) + pb = catalog.getPathBuilder() + run_table = pb.RNASeq.mRNA_QC + + if args.update == "F": + run_data = { + "Execution_Run": args.executionRunRID, + "Replicate": args.repRID, + "Paired_End": args.ends, + "Strandedness": args.stranded, + "Median_Read_Length": args.length, + "Raw_Count": args.rawCount, + "Final_Count": args.assignedCount, + "Notes": args.notes + } + entities = run_table.insert([run_data]) + rid = entities[0]["RID"] + else: + run_data = { + "RID": args.update, + "Execution_Run": args.executionRunRID, + "Replicate": args.repRID, + "Paired_End": args.ends, + "Strandedness": args.stranded, + "Median_Read_Length": args.length, + "Raw_Count": args.rawCount, + "Final_Count": args.assignedCount, + "Notes": args.notes + } + entities = run_table.update([run_data]) + rid = args.update + + + print(rid) + + +if __name__ == '__main__': + args = get_args() + cli = BaseCLI("Custom RNASeq query", None, 1) + cli.remove_options(["--config-file"]) + host = args.host + credentials = {"cookie": args.cookie} + main(host, 2, credentials) \ No newline at end of file diff --git a/workflow/tests/test_consistency.py b/workflow/tests/test_consistency.py index 8b7eb86f9fff3e252d5de463a5f3ec188d997d6e..2ee7b83f0c52d18d3256e1be6e3ed7f7783b1c80 100644 --- a/workflow/tests/test_consistency.py +++ b/workflow/tests/test_consistency.py @@ -18,8 +18,8 @@ def test_consistencySE(): with open(os.path.join( test_output_path, 'SE_multiqc_data.json')) as f: assigned_reads_json = json.load(f) - assigned_reads = assigned_reads_json['report_general_stats_data'][4]['16-1ZX4']['Assigned'] - assert assigned_reads == 7742416 + assigned_reads = assigned_reads_json['report_general_stats_data'][4]['16-1ZX4_sorted']['Assigned'] + assert assigned_reads == 7746121 @pytest.mark.consistencyPE @@ -30,5 +30,5 @@ def test_consistencyPE(): with open(os.path.join( test_output_path, 'PE_multiqc_data.json')) as f: assigned_reads_json = json.load(f) - assigned_reads = assigned_reads_json['report_general_stats_data'][4]['Q-Y5JA']['Assigned'] - assert assigned_reads == 2599149 + assigned_reads = assigned_reads_json['report_general_stats_data'][4]['Q-Y5JA_sorted']['Assigned'] + assert assigned_reads == 2596053 diff --git a/workflow/tests/test_getBag.py b/workflow/tests/test_getBag.py index a99acc6963e51bd27b8e7acf0865e182876dbd01..23bfc0ea50c260a2f5c4cbf62321c066b5743ac2 100644 --- a/workflow/tests/test_getBag.py +++ b/workflow/tests/test_getBag.py @@ -12,4 +12,4 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ @pytest.mark.getBag def test_getBag(): assert os.path.exists(os.path.join( - test_output_path, 'Replicate_Q-Y5F6.zip')) + test_output_path, 'Q-Y5F6_inputBag.zip')) diff --git a/workflow/tests/test_getData.py b/workflow/tests/test_getData.py index 95e2018eb5de04c55bdd8c6a118c961065add807..596a120abe904eac8f3e0ad871c9f8c03a6cba5f 100644 --- a/workflow/tests/test_getData.py +++ b/workflow/tests/test_getData.py @@ -12,6 +12,6 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ @pytest.mark.getData def test_getData(): assert os.path.exists(os.path.join( - test_output_path, 'Replicate_Q-Y5F6/bagit.txt')) + test_output_path, 'Q-Y5F6_inputBag/bagit.txt')) assert os.path.exists(os.path.join( - test_output_path, 'Replicate_Q-Y5F6/data/assets/Study/Q-Y4GY/Experiment/Q-Y4DP/Replicate/Q-Y5F6/mMARIS_Six2-#3.gene.rpkm.txt')) + test_output_path, 'Q-Y5F6_inputBag/data/assets/Study/Q-Y4GY/Experiment/Q-Y4DP/Replicate/Q-Y5F6/mMARIS_Six2-#3.gene.rpkm.txt')) 
diff --git a/workflow/tests/test_makeFeatureCounts.py b/workflow/tests/test_makeFeatureCounts.py index e67bca804a3a1bcf1787a55088a95528076f08fe..e14793511b226a6c82d502ce2f84867c087bc41a 100644 --- a/workflow/tests/test_makeFeatureCounts.py +++ b/workflow/tests/test_makeFeatureCounts.py @@ -12,8 +12,8 @@ data_output_path = os.path.dirname(os.path.abspath(__file__)) + \ @pytest.mark.makeFeatureCounts def test_makeFeatureCounts(): assert os.path.exists(os.path.join( - data_output_path, 'Q-Y5F6_1M.se.countData')) + data_output_path, 'Q-Y5F6_1M.se_countData')) assert os.path.exists(os.path.join( data_output_path, 'Q-Y5F6_1M.se.countTable.csv')) assert os.path.exists(os.path.join( - data_output_path, 'Q-Y5F6_1M.se.tpmTable.csv')) + data_output_path, 'Q-Y5F6_1M.se_tpmTable.csv')) diff --git a/workflow/tests/test_trimData.py b/workflow/tests/test_trimData.py index 40dd2dcb79c4f9d615e20f48364247aa61265f7f..a0938e756715fb30254e5c72fee4cd38bffec330 100644 --- a/workflow/tests/test_trimData.py +++ b/workflow/tests/test_trimData.py @@ -18,6 +18,6 @@ def test_trimData_se(): @pytest.mark.trimData def test_trimData_pe(): assert os.path.exists(os.path.join( - test_output_path, 'Q-Y5F6_1M.pe_R1_val_1.fq.gz')) + test_output_path, 'Q-Y5F6_1M.pe_val_1.fq.gz')) assert os.path.exists(os.path.join( - test_output_path, 'Q-Y5F6_1M.pe_R2_val_2.fq.gz')) + test_output_path, 'Q-Y5F6_1M.pe_val_2.fq.gz'))