Commit ad5bf09e authored by Gervaise Henry

Merge branch '11-deriva.upload' into 'develop'

Resolve "process_derivaUpload"

Closes #24, #75, and #11

See merge request !53
parents 30143e2f 5e6b9051
Part of 2 merge requests: !58 Develop, !53 Resolve "process_derivaUpload"
Pipeline #8733 failed with stages in 8 minutes and 17 seconds
Showing changes with 1035 additions and 265 deletions
......@@ -7,6 +7,11 @@ before_script:
- mkdir -p ~/.deriva
- mkdir -p ~/.bdbag
variables:
refMoVersion: "38.p6.vM22"
refHuVersion: "38.p12.v31"
refERCCVersion: "92"
stages:
- badges
- deploy
......@@ -47,8 +52,8 @@ getBag:
- merge_requests
script:
- ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json
- singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' deriva-download-cli --version > version_deriva.txt
- singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' deriva-download-cli dev.gudmap.org --catalog 2 ./workflow/conf/replicate_export_config.json . rid=Q-Y5F6
- singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-download-cli --version > version_deriva.txt
- singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-download-cli staging.gudmap.org --catalog 2 ./workflow/conf/Replicate_For_Input_Bag.json . rid=Q-Y5F6
- pytest -m getBag
artifacts:
name: "$CI_JOB_NAME"
......@@ -65,10 +70,10 @@ getData:
except:
- merge_requests
script:
- singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bdbag --version > version_bdbag.txt
- singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' bdbag --version > version_bdbag.txt
- ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt
- unzip ./test_data/bag/staging/Replicate_Q-Y5F6.zip
- singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bash ./workflow/scripts/bdbagFetch.sh Replicate_Q-Y5F6 Replicate_Q-Y5F6 TEST
- unzip ./test_data/bag/Q-Y5F6_inputBag_xxxxxxxx.zip
- singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' bash ./workflow/scripts/bdbag_fetch.sh Q-Y5F6_inputBag Q-Y5F6 TEST
- pytest -m getData
artifacts:
name: "$CI_JOB_NAME"
......@@ -85,16 +90,16 @@ parseMetadata:
except:
- merge_requests
script:
- singularity run 'docker://bicf/python3:2.0.1_indev' python3 --version > version_python.txt
- rep=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p repRID)
- exp=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p expRID)
- study=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p studyRID)
- endsMeta=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsMeta)
- endsManual=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsManual)
- stranded=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p stranded)
- spike=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p spike)
- species=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p species)
- readLength=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.stageNew.csv" -p readLength)
- singularity run 'docker://gudmaprbk/python3:1.0.0' python3 --version > version_python.txt
- rep=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p repRID)
- exp=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p expRID)
- study=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p studyRID)
- endsMeta=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsMeta)
- endsManual=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsManual)
- stranded=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p stranded)
- spike=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p spike)
- species=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p species)
- readLength=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p readLength)
- echo -e "${endsMeta},${endsManual},${stranded},${spike},${species},${readLength},${exp},${study},${rep}" > design.csv
- pytest -m parseMetadata
artifacts:
......@@ -112,13 +117,13 @@ inferMetadata:
except:
- merge_requests
script:
- singularity run 'docker://bicf/rseqc3.0:2.0.1_indev' infer_experiment.py --version > version_rseqc.txt
- singularity run 'docker://gudmaprbk/rseqc4.0.0:1.0.0' infer_experiment.py --version > version_rseqc.txt
- >
align=$(echo $(grep "Overall alignment rate" ./test_data/meta/Q-Y5F6_1M.se.alignSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) &&
if [[ ${align} == "" ]]; then exit 1; fi
- >
singularity run 'docker://bicf/rseqc3.0:2.0.1_indev' infer_experiment.py -r "/project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed" -i "./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam" 1>> Q-Y5F6_1M.se.inferMetadata.log &&
ended=`singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/inferMeta.sh endness Q-Y5F6_1M.se.inferMetadata.log` &&
singularity run 'docker://gudmaprbk/rseqc4.0.0:1.0.0' infer_experiment.py -r "/project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed" -i "./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam" 1>> Q-Y5F6_1M.se.inferMetadata.log &&
ended=`singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/infer_meta.sh endness Q-Y5F6_1M.se.inferMetadata.log` &&
if [[ ${ended} == "" ]]; then exit 1; fi
- pytest -m inferMetadata
artifacts:
......@@ -136,9 +141,9 @@ trimData:
except:
- merge_requests
script:
- singularity run 'docker://bicf/trimgalore:1.1' trim_galore --version > version_trimgalore.txt
- singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --basename Q-Y5F6_1M.se ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
- singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --paired --basename Q-Y5F6_1M.pe ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
- singularity run 'docker://gudmaprbk/trimgalore0.6.5:1.0.0' trim_galore --version > version_trimgalore.txt
- singularity run 'docker://gudmaprbk/trimgalore0.6.5:1.0.0' trim_galore --gzip -q 25 --length 35 --basename Q-Y5F6_1M.se ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
- singularity run 'docker://gudmaprbk/trimgalore0.6.5:1.0.0' trim_galore --gzip -q 25 --length 35 --paired --basename Q-Y5F6_1M.pe ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
- readLengthSE=$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
- readLengthPE=$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
- pytest -m trimData
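The two awk one-liners above compute the median trimmed read length. A self-contained sketch of the same idiom on a toy FASTQ stream (the toy reads are illustrative):

```bash
# Median-length idiom from the readLengthSE/readLengthPE lines above.
# FASTQ records are 4 lines; NR%4==2 selects the sequence line.
printf '@r1\nACGT\n+\nIIII\n@r2\nACGTAC\n+\nIIIIII\n@r3\nACG\n+\nIII\n' |
awk '{if(NR%4==2) print length($1)}' | sort -n |
awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}'
# lengths 4,6,3 -> sorted 3,4,6 -> prints the median, 4
```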
......@@ -157,10 +162,9 @@ downsampleData:
except:
- merge_requests
script:
- singularity run 'docker://bicf/seqtk:2.0.1_indev' seqtk sample -s100 ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz 1000 1> sampled.1.fq
- singularity run 'docker://gudmaprbk/seqtk1.3:1.0.0' seqtk sample -s100 ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz 1000 1> sampled.1.fq
- pytest -m downsampleData
alignData:
stage: unit
only:
......@@ -169,16 +173,16 @@ alignData:
except:
- merge_requests
script:
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' hisat2 --version > version_hisat2.txt
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools --version > version_samtools.txt
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.se.unal.gz -S Q-Y5F6_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz --summary-file Q-Y5F6_1M.se.alignSummary.txt --new-summary
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.bam
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.sorted.bam.bai
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.pe.unal.gz -S Q-Y5F6_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./test_data/fastq/small/Q-Y5F6_1M.pe_R1_val_1.fq.gz -2 ./test_data/fastq/small/Q-Y5F6_1M.pe_R2_val_2.fq.gz --summary-file Q-Y5F6_1M.pe.alignSummary.txt --new-summary
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.pe.bam Q-Y5F6_1M.pe.sam
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.bam
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools index -@ 20 -b Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.sorted.bam.bai
- singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' hisat2 --version > version_hisat2.txt
- singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools --version > version_samtools.txt
- singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.se.unal.gz -S Q-Y5F6_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz --summary-file Q-Y5F6_1M.se.alignSummary.txt --new-summary
- singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam
- singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.bam
- singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.sorted.bam.bai
- singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.pe.unal.gz -S Q-Y5F6_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./test_data/fastq/small/Q-Y5F6_1M.pe_val_1.fq.gz -2 ./test_data/fastq/small/Q-Y5F6_1M.pe_val_2.fq.gz --summary-file Q-Y5F6_1M.pe.alignSummary.txt --new-summary
- singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.pe.bam Q-Y5F6_1M.pe.sam
- singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.bam
- singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools index -@ 20 -b Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.sorted.bam.bai
- pytest -m alignData
artifacts:
name: "$CI_JOB_NAME"
......@@ -188,7 +192,6 @@ alignData:
- version_samtools.txt
expire_in: 7 days
dedupData:
stage: unit
only:
......@@ -197,15 +200,15 @@ dedupData:
except:
- merge_requests
script:
- singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools --version > version_samtools.txt
- singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates --version 2> version_markdups.txt&
- singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5F6_1M.se.sorted.bam O=Q-Y5F6_1M.se.deduped.bam M=Q-Y5F6_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true
- singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam ./test_data/bam/small/Q-Y5F6_1M.se.deduped.bam
- singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools index -@ 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.sorted.deduped.bam.bai
- singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' samtools --version > version_samtools.txt
- singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates --version 2> version_markdups.txt&
- singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5F6_1M.se.sorted.bam O=Q-Y5F6_1M.se.deduped.bam M=Q-Y5F6_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true
- singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam ./test_data/bam/small/Q-Y5F6_1M.se.deduped.bam
- singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' samtools index -@ 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.sorted.deduped.bam.bai
- >
for i in {"chr8","chr4","chrY"}; do
echo "samtools view -b Q-Y5F6_1M.se.sorted.deduped.bam ${i} > Q-Y5F6_1M.se.sorted.deduped.${i}.bam; samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.deduped.${i}.bam Q-Y5F6_1M.se.sorted.deduped.${i}.bam.bai;";
done | singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' parallel -j 20 -k
done | singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' parallel -j 20 -k
- pytest -m dedupData
artifacts:
name: "$CI_JOB_NAME"
......@@ -225,12 +228,12 @@ countData:
script:
- ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/geneID.tsv
- ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/Entrez.tsv
- singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se.countData -s 1 -R SAM --primary --ignoreDup ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam
- singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count ./test_data/counts/small/Q-Y5F6_1M.se.countData
- singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se
- singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se_countData -s 1 -R SAM --primary --ignoreDup ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam
- singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' Rscript ./workflow/scripts/calculateTPM.R --count ./test_data/counts/small/Q-Y5F6_1M.se_countData
- singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' Rscript ./workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se
- assignedReads=$(grep -m 1 'Assigned' *.summary | grep -oe '\([0-9.]*\)')
- singularity run 'docker://bicf/subread2:2.0.0' featureCounts -v &> version_featurecounts.txt
- singularity run 'docker://bicf/subread2:2.0.0' R --version > version_r.txt
- singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' featureCounts -v &> version_featurecounts.txt
- singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' R --version > version_r.txt
- pytest -m makeFeatureCounts
artifacts:
name: "$CI_JOB_NAME"
......@@ -248,8 +251,8 @@ makeBigWig:
except:
- merge_requests
script:
- singularity run 'docker://bicf/deeptools3.3:2.0.1_indev' deeptools --version > version_deeptools.txt
- singularity run 'docker://bicf/deeptools3.3:2.0.1_indev' bamCoverage -p 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -o Q-Y5F6_1M.se.bw
- singularity run 'docker://gudmaprbk/deeptools3.5.0:1.0.0' deeptools --version > version_deeptools.txt
- singularity run 'docker://gudmaprbk/deeptools3.5.0:1.0.0' bamCoverage -p 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -o Q-Y5F6_1M.se.bw
- pytest -m makeBigWig
artifacts:
name: "$CI_JOB_NAME"
......@@ -266,8 +269,8 @@ fastqc:
except:
- merge_requests
script:
- singularity run 'docker://bicf/fastqc:2.0.1_indev' fastqc --version > version_fastqc.txt
- singularity run 'docker://bicf/fastqc:2.0.1_indev' fastqc ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o .
- singularity run 'docker://gudmaprbk/fastqc0.11.9:1.0.0' fastqc --version > version_fastqc.txt
- singularity run 'docker://gudmaprbk/fastqc0.11.9:1.0.0' fastqc ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o .
- pytest -m fastqc
artifacts:
name: "$CI_JOB_NAME"
......@@ -286,11 +289,85 @@ dataQC:
- merge_requests
script:
- echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > Q-Y5F6_1M.se.sorted.deduped.tin.xls
- for i in {"chr8","chr4","chrY"}; do
echo "tin.py -i ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.${i}.bam -r /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed; cat Q-Y5F6_1M.se.sorted.deduped.${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \"\\t${i}\\t\";"; done | singularity run 'docker://bicf/rseqc3.0:2.0.1_indev' parallel -j 20 -k >> Q-Y5F6_1M.se.sorted.deduped.tin.xls
- >
for i in {"chr8","chr4","chrY"}; do
echo "tin.py -i ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.${i}.bam -r /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed; cat Q-Y5F6_1M.se.sorted.deduped.${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \"\\t${i}\\t\";"
done | singularity run 'docker://gudmaprbk/rseqc4.0.0:1.0.0' parallel -j 20 -k >> Q-Y5F6_1M.se.sorted.deduped.tin.xls
- pytest -m dataQC
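Both dedupData and dataQC use the same idiom: a for loop echoes one command string per chromosome, and the strings are piped to GNU parallel (`-j 20` workers, `-k` to keep output order). A minimal sketch of the pattern outside the container:

```bash
# One command string per chromosome, executed 20-wide, output kept in
# submission order (requires GNU parallel).
for i in {"chr8","chr4","chrY"}; do
  echo "echo processing ${i};"
done | parallel -j 20 -k
```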
outputBag:
uploadInputBag:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json
- echo THIS IS A TEST FILE > test.txt
- >
md5=$(md5sum ./test.txt | awk '{ print $1 }') &&
size=$(wc -c < ./test.txt) &&
exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Input_Bag/File_MD5=${md5}) &&
if [ "${exist}" == "[]" ]; then
cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') &&
cookie=${cookie:11:-1} &&
loc=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host staging.gudmap.org put ./test.txt /hatrac/resources/rnaseq/pipeline/input_bag/TEST/test.txt --parents) &&
rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/upload_input_bag.py -f test.txt -l ${loc} -s ${md5} -b ${size} -n 'This is a test input bag' -o staging.gudmap.org -c ${cookie}) &&
echo ${rid} test input bag created
else
rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT') &&
rid=${rid:8:-6} &&
echo ${rid} test input bag already exists
fi
uploadExecutionRun:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json
- >
exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Execution_Run/Replicate=17-BTFJ) &&
cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') &&
cookie=${cookie:11:-1} &&
if [ "${exist}" == "[]" ]; then
rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/upload_execution_run.py -r 17-BTFJ -w 17-BTFM -g 17-BT50 -i 17-BTFT -s Success -d 'This is a test execution run' -o staging.gudmap.org -c ${cookie} -u F) &&
echo ${rid} test execution run created
else
rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT') &&
rid=${rid:7:-6} &&
rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/upload_execution_run.py -r 17-BTFJ -w 17-BTFM -g 17-BT50 -i 17-BTFT -s Success -d 'This is a test execution run' -o staging.gudmap.org -c ${cookie} -u ${rid}) &&
echo ${rid} test execution run already exists
fi
uploadQC:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json
- >
exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:mRNA_QC/Replicate=17-BTFJ) &&
cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') &&
cookie=${cookie:11:-1} &&
if [ "${exist}" != "[]" ]; then
rids=$(echo ${exist} | grep -o '\"RID\":\".\{7\}' | sed 's/^.\{7\}//') &&
for rid in ${rids}; do
singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/delete_entry.py -r ${rid} -t mRNA_QC -o staging.gudmap.org -c ${cookie}
done
echo all old mRNA QC RIDs deleted
fi
rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/upload_qc.py -r 17-BTFJ -e 17-BTG4 -p "Single Read" -s forward -l 35 -w 5 -f 1 -n "This is a test mRNA QC" -o staging.gudmap.org -c ${cookie} -u F)
echo ${rid} test mRNA QC created
uploadProcessedFile:
stage: unit
only:
- push
......@@ -298,10 +375,54 @@ outputBag:
except:
- merge_requests
script:
- ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json
- echo THIS IS A TEST FILE > 17-BTFJ_test.csv
- mkdir -p ./deriva/Seq/pipeline/17-BTFE/17-BTG4/
- mv 17-BTFJ_test.csv ./deriva/Seq/pipeline/17-BTFE/17-BTG4/17-BTFJ_test.csv
- >
exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Processed_File/Replicate=17-BTFJ) &&
cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') &&
cookie=${cookie:11:-1} &&
if [ "${exist}" != "[]" ]; then
rids=$(echo ${exist} | grep -o '\"RID\":\".\{7\}' | sed 's/^.\{7\}//') &&
for rid in ${rids}; do
singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/delete_entry.py -r ${rid} -t Processed_File -o staging.gudmap.org -c ${cookie}
done
echo all old processed file RIDs deleted
fi
singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-upload-cli --catalog 2 --token ${cookie:9} staging.gudmap.org ./deriva
echo test processed file uploaded
- mkdir test
- singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bdbag test --archiver zip
- singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' bdbag test --archiver zip
- echo test output bag created
- pytest -m outputBag
uploadOutputBag:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- ln -sfn `readlink -e ./test_data/auth/credential.json` ./credential.json
- echo THIS IS A TEST FILE > test.txt
- >
md5=$(md5sum ./test.txt | awk '{ print $1 }') &&
size=$(wc -c < ./test.txt) &&
exist=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' curl -s https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Output_Bag/File_MD5=${md5}) &&
if [ "${exist}" == "[]" ]; then
cookie=$(cat credential.json | grep -A 1 '\"staging.gudmap.org\": {' | grep -o '\"cookie\": \".*\"') &&
cookie=${cookie:11:-1} &&
loc=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host staging.gudmap.org put ./test.txt /hatrac/resources/rnaseq/pipeline/output_bag/TEST/test.txt --parents) &&
rid=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' python3 ./workflow/scripts/upload_output_bag.py -e 17-BTG4 -f test.txt -l ${loc} -s ${md5} -b ${size} -n 'This is a test output bag' -o staging.gudmap.org -c ${cookie}) &&
echo ${rid} test output bag created
else
rid=$(echo ${exist} | grep -o '\"RID\":\".*\",\"RCT') &&
rid=${rid:8:-6} &&
echo ${rid} test output bag already exists
fi
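The five upload jobs above share one check-then-create pattern against the data-hub: query ERMrest for an existing record, and only stage the file in hatrac and register a new entry when the query returns an empty list. A minimal bash sketch of that flow, assuming the cookie has already been extracted from credential.json as in the jobs above (host, hatrac path, and note string are illustrative; the CLI calls mirror the jobs):

```bash
# Check-then-create sketch: look up the file by MD5, upload only if absent.
md5=$(md5sum ./test.txt | awk '{ print $1 }')
size=$(wc -c < ./test.txt)
exist=$(curl -s "https://staging.gudmap.org/ermrest/catalog/2/entity/RNASeq:Input_Bag/File_MD5=${md5}")
if [ "${exist}" == "[]" ]; then
  # No record yet: stage the file in hatrac, then register the entry.
  loc=$(deriva-hatrac-cli --host staging.gudmap.org put ./test.txt \
    /hatrac/resources/rnaseq/pipeline/input_bag/TEST/test.txt --parents)
  rid=$(python3 ./workflow/scripts/upload_input_bag.py -f test.txt -l ${loc} \
    -s ${md5} -b ${size} -n 'This is a test input bag' \
    -o staging.gudmap.org -c ${cookie})
else
  # Record exists: recover its RID from the JSON response.
  rid=$(echo ${exist} | grep -o '"RID":"[^"]*"' | head -1 | cut -d '"' -f4)
fi
```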
generateVersions:
stage: aggregation
......@@ -311,7 +432,7 @@ generateVersions:
except:
- merge_requests
script:
- singularity run 'docker://bicf/multiqc1.8:2.0.1_indev' multiqc --version > version_multiqc.txt
- singularity run 'docker://gudmaprbk/multiqc1.9:1.0.0' multiqc --version > version_multiqc.txt
- python ./workflow/scripts/generate_versions.py -o software_versions
- python ./workflow/scripts/generate_references.py -r ./docs/references.md -o software_references
artifacts:
......@@ -323,7 +444,7 @@ generateVersions:
expire_in: 7 days
humanBioHPC:
human_BioHPC:
stage: reference
only:
- push
......@@ -334,7 +455,7 @@ humanBioHPC:
- mkdir -p hu
- cp -R /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2 ./hu/
mouseBioHPC:
mouse_BioHPC:
stage: reference
only:
- push
......@@ -345,7 +466,7 @@ mouseBioHPC:
- mkdir -p mo
- cp -R /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2 ./mo/
humanDataHub:
human_dev:
stage: reference
only:
- push
......@@ -356,14 +477,13 @@ humanDataHub:
- ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt
- referenceBase=dev.gudmap.org
- refName=GRCh
- refHuVersion=38.p12.v31
- references=$(echo ${referenceBase}/${refName}${refHuVersion})
- GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1)
- GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2)
- GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f3)
- query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE})
- curl --request GET ${query} > refQuery.json
- refURL=$(python ./workflow/scripts/extractRefData.py --returnParam URL)
- refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL)
- loc=$(dirname ${refURL})
- if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi
- filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)')
......@@ -371,7 +491,7 @@ humanDataHub:
- test=$(echo ${test} | grep -o ${filename})
- if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi
mousenDataHub:
mouse_dev:
stage: reference
only:
- push
......@@ -382,14 +502,115 @@ mousenDataHub:
- ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt
- referenceBase=dev.gudmap.org
- refName=GRCm
- references=$(echo ${referenceBase}/${refName}${refMoVersion})
- GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1)
- GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2)
- GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f3)
- query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE})
- curl --request GET ${query} > refQuery.json
- refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL)
- loc=$(dirname ${refURL})
- if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi
- filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)')
- test=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/)
- test=$(echo ${test} | grep -o ${filename})
- if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi
human_staging:
stage: reference
only:
- push
- tags
except:
- merge_requests
script:
- ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt
- referenceBase=staging.gudmap.org
- refName=GRCh
- references=$(echo ${referenceBase}/${refName}${refHuVersion})
- GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1)
- GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2)
- GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f3)
- query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE})
- curl --request GET ${query} > refQuery.json
- refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL)
- loc=$(dirname ${refURL})
- if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi
- filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)')
- test=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/)
- test=$(echo ${test} | grep -o ${filename})
- if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi
mouse_staging:
stage: reference
only:
- push
- tags
except:
- merge_requests
script:
- ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt
- referenceBase=staging.gudmap.org
- refName=GRCm
- references=$(echo ${referenceBase}/${refName}${refMoVersion})
- GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1)
- GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2)
- GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f3)
- query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE})
- curl --request GET ${query} > refQuery.json
- refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL)
- loc=$(dirname ${refURL})
- if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi
- filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)')
- test=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/)
- test=$(echo ${test} | grep -o ${filename})
- if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi
human_prod:
stage: reference
only:
- push
- tags
except:
- merge_requests
script:
- ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt
- referenceBase=www.gudmap.org
- refName=GRCh
- references=$(echo ${referenceBase}/${refName}${refHuVersion})
- GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1)
- GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2)
- GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f3)
- query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE})
- curl --request GET ${query} > refQuery.json
- refURL=$(python ./workflow/scripts/extractRefData.py --returnParam URL)
- refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL)
- loc=$(dirname ${refURL})
- if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi
- filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)')
- test=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/)
- test=$(echo ${test} | grep -o ${filename})
- if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi
mouse_prod:
stage: reference
only:
- push
- tags
except:
- merge_requests
script:
- ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt
- referenceBase=www.gudmap.org
- refName=GRCm
- references=$(echo ${referenceBase}/${refName}${refMoVersion})
- GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1)
- GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2)
- GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f3)
- query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE})
- curl --request GET ${query} > refQuery.json
- refURL=$(python ./workflow/scripts/extract_ref_data.py --returnParam URL)
- loc=$(dirname ${refURL})
- if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi
- filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)')
......@@ -418,7 +639,7 @@ integration_se:
- SE_multiqc_data.json
expire_in: 7 days
retry:
max: 1
max: 0
when:
- always
......@@ -443,10 +664,11 @@ integration_pe:
- PE_multiqc_data.json
expire_in: 7 days
retry:
max: 1
max: 0
when:
- always
override_inputBag:
stage: integration
only: [merge_requests]
......@@ -456,7 +678,7 @@ override_inputBag:
script:
- hostname
- ulimit -a
- nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --inputBagForce ./test_data/bag/staging/Replicate_Q-Y5F6.zip --ci true
- nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --inputBagForce ./test_data/bag/Q-Y5F6_inputBag_xxxxxxxx.zip --upload false --ci true
- find . -type f -name "multiqc_data.json" -exec cp {} ./inputBagOverride_PE_multiqc_data.json \;
artifacts:
name: "$CI_JOB_NAME"
......@@ -465,7 +687,7 @@ override_inputBag:
- inputBagOverride_PE_multiqc_data.json
expire_in: 7 days
retry:
max: 1
max: 0
when:
- always
......@@ -478,7 +700,7 @@ override_fastq:
script:
- hostname
- ulimit -a
- nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --fastqsForce './test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz' --ci true
- nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --fastqsForce './test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz' --upload false --ci true
- find . -type f -name "multiqc_data.json" -exec cp {} ./fastqOverride_PE_multiqc_data.json \;
artifacts:
name: "$CI_JOB_NAME"
......@@ -487,7 +709,7 @@ override_fastq:
- fastqOverride_PE_multiqc_data.json
expire_in: 7 days
retry:
max: 1
max: 0
when:
- always
......@@ -500,7 +722,7 @@ override_species:
script:
- hostname
- ulimit -a
- nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ER --speciesForce 'Homo sapiens' --ci true
- nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ER --speciesForce 'Homo sapiens' --upload false --ci true
- find . -type f -name "multiqc_data.json" -exec cp {} ./speciesOverride_PE_multiqc_data.json \;
artifacts:
name: "$CI_JOB_NAME"
......@@ -509,7 +731,7 @@ override_species:
- speciesOverride_PE_multiqc_data.json
expire_in: 7 days
retry:
max: 1
max: 0
when:
- always
......
......@@ -5,9 +5,9 @@ These are the most common things requested on pull requests.
- [ ] This comment contains a description of changes (with reason)
- [ ] If you've fixed a bug or added code that should be tested, add tests!
- [ ] Documentation in `docs` is updated
- [ ] Replace dag.png with the most recent CI pipleine integrated_pe artifact
- [ ] Replace software_versions_mqc.yaml with the most recent CI pipleine generateVersions artifact
- [ ] Replace software_references_mqc.yaml with the most recent CI pipleine generateVersions artifact
- [ ] Replace dag.png with the most recent CI pipeline integrated_pe artifact
- [ ] Replace software_versions_mqc.yaml with the most recent CI pipeline generateVersions artifact
- [ ] Replace software_references_mqc.yaml with the most recent CI pipeline generateVersions artifact
- [ ] `CHANGELOG.md` is updated
- [ ] `README.md` is updated
- [ ] `LICENSE.md` is updated with new contributors
......
# v0.0.4 (in development)
# v0.1.0 (in development)
**User Facing**
* Add option to pull references from datahub
* Add option to send email on workflow error, with pipeline error message
* Add versions and paper references of software used to report
* Upload input bag
* Upload execution run
* Upload mRNA QC
* Create and upload output bag
* Add option to not upload
**Background**
* Remove (comment out) option to pull references from S3
......@@ -10,11 +15,11 @@
* Start using new gudmaprbk dockerhub (images autobuilt)
* Moved consistency checks to be fully python
* Changed order of steps so that fastqc is done after the trim step
* Change docker images to production
* Add automated version badges
**Known Bugs**
* Datahub reference pull uses dev.gudmap.org as source until references are placed on production
* outputBag does not contain fetch for processed data
* Does not include automatic data upload
* Override params (inputBag, fastq, species) aren't checked for integrity
<hr>
......
......@@ -37,9 +37,12 @@ To Run:
* **dev** = [dev.gudmap.org](dev.gudmap.org) (default, does not contain all data)
* **staging** = [staging.gudmap.org](staging.gudmap.org) (does not contain all data)
* **production** = [www.gudmap.org](www.gudmap.org) (***does contain all data***)
* `--refMoVersion` mouse reference version ***(optional)***
* `--refHuVersion` human reference version ***(optional)***
* `--refERCCVersion` human reference version ***(optional)***
* `--refMoVersion` mouse reference version ***(optional, default = 38.p6.vM22)***
* `--refHuVersion` human reference version ***(optional, default = 38.p12.v31)***
* `--refERCCVersion` ERCC spike-in reference version ***(optional, default = 92)***
* `--upload` option to upload outputs back to the data-hub ***(optional, default = true)***
* **true** = upload outputs to the data-hub
* **false** = do *NOT* upload outputs to the data-hub
* `-profile` config profile to use ***(optional)***:
* default = processes on BioHPC cluster
* **biohpc** = process on BioHPC cluster
......@@ -47,7 +50,7 @@ To Run:
* **aws_ondemand** = AWS Batch on-demand instant requests
* **aws_spot** = AWS Batch spot instance requests
* `--email` email address(es) to send failure notification (comma separated) ***(optional)***:
* e.g: `--email 'venkat.malladi@utsouthwestern.edu,Gervaise.Henry@UTSouthwestern.edu'`
* e.g: `--email 'Venkat.Malladi@utsouthwestern.edu,Gervaise.Henry@UTSouthwestern.edu'`
* NOTES:
* once deriva-auth is run and authenticated, the two files above are saved in ```~/.deriva/``` (see official documents from [deriva](https://github.com/informatics-isi-edu/deriva-client#installer-packages-for-windows-and-macosx) on the lifetime of the credentials)
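Putting the options together, a typical invocation might look like the following (the replicate RID, credential paths, and email address are placeholders):

```bash
nextflow run ./workflow/rna-seq.nf \
  --deriva ./test_data/auth/credential.json \
  --bdbag ./test_data/auth/cookies.txt \
  --repRID Q-Y5F6 \
  --refHuVersion 38.p12.v31 \
  --upload false \
  --email 'user@example.com' \
  -profile biohpc
```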
......
docs/dag.png (image replaced: 762 KiB → 1.36 MiB)
......@@ -4,7 +4,7 @@
description: 'This section describes references for the tools used.'
plot_type: 'html'
data: |
<h3 id="references">References</h3>
<ol style="list-style-type: decimal">
<li><strong>python</strong>:</li>
......@@ -41,7 +41,7 @@
<li><strong>hisat2</strong>:</li>
</ol>
<ul>
<li>Kim ,D.,Paggi, J.M., Park, C., Bennett, C., Salzberg, S.L. Graph-based genome alignment and genotyping with HISAT2 and HISAT-genotype. 2019 Nat Biotechnol. 2019 Aug;37(8):907-915. doi:<a href="https://doi.org/10.1038/s41587-019-0201-4">10.1038/s41587-019-0201-4</a></li>
<li>Kim, D., Paggi, J.M., Park, C., Bennett, C., Salzberg, S.L. 2019. Graph-based genome alignment and genotyping with HISAT2 and HISAT-genotype. Nat Biotechnol. Aug;37(8):907-915. doi:<a href="https://doi.org/10.1038/s41587-019-0201-4">10.1038/s41587-019-0201-4</a>.</li>
</ul>
<ol start="7" style="list-style-type: decimal">
<li><strong>samtools</strong>:</li>
......
......@@ -6,19 +6,19 @@
description: 'are collected for pipeline version.'
data: |
<dl class="dl-horizontal">
<dt>Python</dt><dd>v3.7.7</dd>
<dt>DERIVA</dt><dd>v1.0.0</dd>
<dt>Python</dt><dd>v3.8.3</dd>
<dt>DERIVA</dt><dd>v1.3.0</dd>
<dt>BDBag</dt><dd>v1.5.6</dd>
<dt>RSeQC</dt><dd>v3.0.1</dd>
<dt>Trim Galore!</dt><dd>v0.6.4</dd>
<dt>HISAT2</dt><dd>v2.1.0</dd>
<dt>Samtools</dt><dd>v1.9</dd>
<dt>picard (MarkDuplicates)</dt><dd>v2.23.0-SNAPSHOT</dd>
<dt>featureCounts</dt><dd>v2.0.0</dd>
<dt>R</dt><dd>v3.6.3</dd>
<dt>deepTools</dt><dd>v3.3.2</dd>
<dt>RSeQC</dt><dd>v4.0.0</dd>
<dt>Trim Galore!</dt><dd>v0.6.4_dev</dd>
<dt>HISAT2</dt><dd>v2.2.1</dd>
<dt>Samtools</dt><dd>v1.11</dd>
<dt>picard (MarkDuplicates)</dt><dd>v2.23.9</dd>
<dt>featureCounts</dt><dd>v2.0.1</dd>
<dt>R</dt><dd>v4.0.3</dd>
<dt>deepTools</dt><dd>v3.5.0</dd>
<dt>FastQC</dt><dd>v0.11.9</dd>
<dt>MultiQC</dt><dd>v1.8</dd>
<dt>MultiQC</dt><dd>v1.9</dd>
<dt>Pipeline Version</dt><dd>v0.0.4_indev</dd>
</dl>
......@@ -5,52 +5,54 @@
module load singularity/3.5.3
module load pigz/2.4
ln -sfn /project/BICF/BICF_Core/shared/gudmap/test_data/* ../test_data/
mkdir -p NEW_test_data
ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json
ln -sfn ./test_data/auth/credential.json ~/.deriva/credential.json
mkdir -p ./NEW_test_data/bag
singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' deriva-download-cli dev.gudmap.org --catalog 2 ./workflow/conf/replicate_export_config.json . rid=Q-Y5F6
cp Replicate_Q-Y5F6.zip ./NEW_test_data/bag/Replicate_Q-Y5F6.zip
singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-download-cli staging.gudmap.org --catalog 2 ../workflow/conf/Replicate_For_Input_Bag.json . rid=Q-Y5F6
cp Q-Y5F6_inputBag.zip ./NEW_test_data/bag/Q-Y5F6_inputBag_xxxxxxxx.zip
mkdir -p ./NEW_test_data/fastq
unzip ./test_data/bag/Replicate_Q-Y5F6.zip
singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' bash ./workflow/scripts/bdbagFetch.sh Replicate_Q-Y5F6 Replicate_Q-Y5F6
cp Replicate_Q-Y5F6.R1.fastq.gz ./NEW_test_data/fastq/Replicate_Q-Y5F6.R1.fastq.gz
cp Replicate_Q-Y5F6.R2.fastq.gz ./NEW_test_data/fastq/Replicate_Q-Y5F6.R2.fastq.gz
unzip ./NEW_test_data/bag/Q-Y5F6_inputBag_xxxxxxxx.zip
singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' bash ../workflow/scripts/bdbagFetch.sh Q-Y5F6_inputBag Q-Y5F6
cp Q-Y5F6.R1.fastq.gz ./NEW_test_data/fastq/Q-Y5F6.R1.fastq.gz
cp Q-Y5F6.R2.fastq.gz ./NEW_test_data/fastq/Q-Y5F6.R2.fastq.gz
mkdir -p ./NEW_test_data/fastq/small
singularity exec 'docker://bicf/seqtk:2.0.0' seqtk sample -s100 ./NEW_test_data/fastq/Replicate_Q-Y5F6.R1.fastq.gz 1000000 1> Q-Y5F6_1M.R1.fastq
singularity exec 'docker://bicf/seqtk:2.0.0' seqtk sample -s100 ./NEW_test_data/fastq/Replicate_Q-Y5F6.R2.fastq.gz 1000000 1> Q-Y5F6_1M.R2.fastq
singularity exec 'docker://gudmaprbk/seqtk1.3:1.0.0' seqtk sample -s100 ./NEW_test_data/fastq/Q-Y5F6.R1.fastq.gz 1000000 1> Q-Y5F6_1M.R1.fastq
singularity exec 'docker://gudmaprbk/seqtk1.3:1.0.0' seqtk sample -s100 ./NEW_test_data/fastq/Q-Y5F6.R2.fastq.gz 1000000 1> Q-Y5F6_1M.R2.fastq
pigz Q-Y5F6_1M.R1.fastq
pigz Q-Y5F6_1M.R2.fastq
cp Q-Y5F6_1M.R1.fastq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
cp Q-Y5F6_1M.R2.fastq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
mkdir -p ./NEW_test_data/meta
singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --basename Q-Y5F6_1M.se -j 20 ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename Q-Y5F6_1M.pe -j 20 ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
singularity run 'docker://gudmaprbk/trimgalore0.6.5:1.0.0' trim_galore --gzip -q 25 --illumina --length 35 --basename Q-Y5F6_1M.se -j 20 ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
singularity run 'docker://gudmaprbk/trimgalore0.6.5:1.0.0' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename Q-Y5F6_1M.pe -j 20 ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
cp Q-Y5F6_1M.se_trimmed.fq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz
cp Q-Y5F6_1M.pe_R1_val_1.fq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_R1_val_1.fq.gz
cp Q-Y5F6_1M.pe_R2_val_2.fq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_R2_val_2.fq.gz
cp Q-Y5F6_1M.pe_val_1.fq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_val_1.fq.gz
cp Q-Y5F6_1M.pe_val_2.fq.gz ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_val_2.fq.gz
cp Q-Y5F6_1M.R1.fastq.gz_trimming_report.txt ./NEW_test_data/meta/Q-Y5F6_1M.R1.fastq.gz_trimming_report.txt
cp Q-Y5F6_1M.R2.fastq.gz_trimming_report.txt ./NEW_test_data/meta/Q-Y5F6_1M.R2.fastq.gz_trimming_report.txt
touch metaTest.csv
echo 'Replicate_RID,Experiment_RID,Study_RID,Paired_End,File_Type,Has_Strand_Specific_Information,Used_Spike_Ins,Species' > metaTest.csv
echo 'Replicate_RID,Experiment_RID,Study_RID,uk,FastQ,no,no,Homo sapiens' >> metaTest.csv
echo 'Replicate_RID,Experiment_RID,Study_RID,Paired_End,File_Type,Has_Strand_Specific_Information,Used_Spike_Ins,Species,Read_Length' > metaTest.csv
echo 'Replicate_RID,Experiment_RID,Study_RID,uk,FastQ,no,no,Homo sapiens,75' >> metaTest.csv
cp metaTest.csv ./NEW_test_data/meta/metaTest.csv
mkdir -p ./NEW_test_data/bam
mkdir -p ./NEW_test_data/bam/small
singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.se.unal.gz -S Q-Y5F6_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./NEW_test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz --summary-file Q-Y5F6_1M.se.alignSummary.txt --new-summary
singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam
singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.bam
singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.sorted.bam.bai
singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.pe.unal.gz -S Q-Y5F6_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_R1_val_1.fq.gz -2 ./test_data/fastq/small/Q-Y5F6_1M.pe_R2_val_2.fq.gz --summary-file Q-Y5F6_1M.pe.alignSummary.txt --new-summary
singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.pe.bam Q-Y5F6_1M.pe.sam
singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.bam
singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ 20 -b Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.sorted.bam.bai
singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.se.unal.gz -S Q-Y5F6_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCm38.p6.vM22/hisat2/genome --rna-strandness F -U ./NEW_test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz --summary-file Q-Y5F6_1M.se.alignSummary.txt --new-summary
singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam
singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.bam
singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.bam Q-Y5F6_1M.se.sorted.bam.bai
singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.pe.unal.gz -S Q-Y5F6_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCm38.p6.vM22/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_val_1.fq.gz -2 ./NEW_test_data/fastq/small/Q-Y5F6_1M.pe_val_2.fq.gz --summary-file Q-Y5F6_1M.pe.alignSummary.txt --new-summary
singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.pe.bam Q-Y5F6_1M.pe.sam
singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.bam
singularity run 'docker://gudmaprbk/hisat2.2.1:1.0.0' samtools index -@ 20 -b Q-Y5F6_1M.pe.sorted.bam Q-Y5F6_1M.pe.sorted.bam.bai
cp Q-Y5F6_1M.se.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.bam
cp Q-Y5F6_1M.pe.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.pe.bam
cp Q-Y5F6_1M.se.sorted.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.bam
......@@ -60,18 +62,17 @@ cp Q-Y5F6_1M.pe.sorted.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M.pe.sorted.bam
cp Q-Y5F6_1M.se.alignSummary.txt ./NEW_test_data/meta/Q-Y5F6_1M.se.alignSummary.txt
cp Q-Y5F6_1M.pe.alignSummary.txt ./NEW_test_data/meta/Q-Y5F6_1M.pe.alignSummary.txt
singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.bam O=Q-Y5F6_1M.se.deduped.bam M=Q-Y5F6_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true
singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.deduped.bam
singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.sorted.deduped.bam.bai
singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.bam O=Q-Y5F6_1M.se.deduped.bam M=Q-Y5F6_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true
singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.deduped.bam
singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.deduped.bam Q-Y5F6_1M.se.sorted.deduped.bam.bai
cp Q-Y5F6_1M.se.deduped.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.deduped.bam
cp Q-Y5F6_1M.se.sorted.deduped.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam
cp Q-Y5F6_1M.se.sorted.deduped.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam.bai
cp Q-Y5F6_1M.se.deduped.Metrics.txt /NEW_test_data/meta/Q-Y5F6_1M.se.deduped.Metrics.txt
cp Q-Y5F6_1M.se.deduped.Metrics.txt ./NEW_test_data/meta/Q-Y5F6_1M.se.deduped.Metrics.txt
for i in {"chr8","chr4","chrY"}; do
echo "samtools view -b ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam ${i} > Q-Y5F6_1M.se.sorted.deduped.${i}.bam; samtools index -@ 20 -b Q-Y5F6_1M.se.sorted.deduped.${i}.bam Q-Y5F6_1M.se.sorted.deduped.${i}.bam.bai;";
done | singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' parallel -j 20 -k
done | singularity run 'docker://gudmaprbk/picard2.23.9:1.0.0' parallel -j 20 -k
cp Q-Y5F6_1M.se.sorted.deduped.chr4.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.chr4.bam
cp Q-Y5F6_1M.se.sorted.deduped.chr4.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.chr4.bam.bai
cp Q-Y5F6_1M.se.sorted.deduped.chr8.bam ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.chr8.bam
......@@ -81,28 +82,30 @@ cp Q-Y5F6_1M.se.sorted.deduped.chrY.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M.
mkdir -p ./NEW_test_data/counts
mkdir -p ./NEW_test_data/counts/small
ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/geneID.tsv
ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/Entrez.tsv
singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se.countData -s 1 -R SAM --primary --ignoreDup ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam
singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count Q-Y5F6_1M.se.countData
singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se
cp Q-Y5F6_1M.se.featureCounts ./NEW_test_data/counts/small/Q-Y5F6_1M.se.countData
ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCm38.p6.vM22/geneID.tsv
ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCm38.p6.vM22/Entrez.tsv
singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCm38.p6.vM22/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCm38.p6.vM22/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se_countData -s 1 -R SAM --primary --ignoreDup ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam
singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' Rscript ../workflow/scripts/calculateTPM.R --count Q-Y5F6_1M.se_countData
singularity run 'docker://gudmaprbk/subread2.0.1:1.0.0' Rscript ../workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se
cp Q-Y5F6_1M.se_countData ./NEW_test_data/counts/small/Q-Y5F6_1M.se_countData
cp Q-Y5F6_1M.se.countTable.csv ./NEW_test_data/counts/small/Q-Y5F6_1M.se.countTable.csv
cp Q-Y5F6_1M.se.countTable.csv ./NEW_test_data/counts/small/Q-Y5F6_1M.se.tpmTable.csv
cp Q-Y5F6_1M.se_tpmTable.csv ./NEW_test_data/counts/small/Q-Y5F6_1M.se_tpmTable.csv
mkdir -p ./NEW_test_data/bw
mkdir -p ./NEW_test_data/bw/small
singularity run 'docker://bicf/deeptools3.3:2.0.0' bamCoverage -p 20 -b ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -o Q-Y5F6_1M.se.bw
singularity run 'docker://gudmaprbk/deeptools3.5.0:1.0.0' bamCoverage -p 20 -b ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -o Q-Y5F6_1M.se.bw
cp Q-Y5F6_1M.se.bw ./NEW_test_data/bw/small/Q-Y5F6_1M.se.bw
mkdir -p ./NEW_test_data/fastqc
mkdir -p ./NEW_test_data/fastqc/small
singularity run 'docker://bicf/fastqc:2.0.0' ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o .
singularity run 'docker://gudmaprbk/fastqc0.11.9:1.0.0' fastqc ./NEW_test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o .
cp Q-Y5F6_1M.R1_fastqc.html ./NEW_test_data/fastqc/small/Q-Y5F6_1M.R1_fastqc.html
cp Q-Y5F6_1M.R1_fastqc.zip ./NEW_test_data/fastqc/small/Q-Y5F6_1M.R1_fastqc.zip
echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > Q-Y5F6_1M.se.sorted.deduped.tin.xls
for i in {"chr8","chr4","chrY"}; do
echo "tin.py -i ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.${i}.bam -r /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed; cat Q-Y5F6_1M.se.sorted.deduped.${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \"\\t${i}\\t\";"; done | singularity run 'docker://bicf/rseqc3.0:2.0.0' parallel -j 20 -k >> Q-Y5F6_1M.se.sorted.deduped.tin.xls
echo "tin.py -i ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.${i}.bam -r /project/BICF/BICF_Core/shared/gudmap/references/GRCm38.p6.vM22/bed/genome.bed; cat Q-Y5F6_1M.se.sorted.deduped.${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \"\\t${i}\\t\";"; done | singularity run 'docker://gudmaprbk/rseqc4.0.0:1.0.0' parallel -j 20 -k >> Q-Y5F6_1M.se.sorted.deduped.tin.xls
cp Q-Y5F6_1M.se.sorted.deduped.tin.xls ./NEW_test_data/meta/Q-Y5F6_1M.se.sorted.deduped.tin.xls
chgrp -R BICF_Core ./NEW_test_data
chmod -R 750 ./NEW_test_data
{
"bag": {
"bag_name": "Execution_Run_{rid}",
"bag_algorithms": [
"md5"
],
"bag_archiver": "zip",
"bag_metadata": {}
},
"catalog": {
"catalog_id": "2",
"query_processors": [
{
"processor": "csv",
"processor_params": {
"output_path": "Execution_Run",
"query_path": "/attribute/M:=RNASeq:Execution_Run/RID=17-BPAG/RID,Replicate_RID:=Replicate,Workflow_RID:=Workflow,Reference_Genone_RID:=Reference_Genome,Input_Bag_RID:=Input_Bag,Notes,Execution_Status,Execution_Status_Detail,RCT,RMT?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Workflow",
"query_path": "/entity/M:=RNASeq:Execution_Run/RID=17-BPAG/RNASeq:Workflow?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Reference_Genome",
"query_path": "/entity/M:=RNASeq:Execution_Run/RID=17-BPAG/RNASeq:Reference_Genome?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Input_Bag",
"query_path": "/entity/M:=RNASeq:Execution_Run/RID=17-BPAG/RNASeq:Input_Bag?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "mRNA_QC",
"query_path": "/attribute/M:=RNASeq:Execution_Run/RID=17-BPAG/(RID)=(RNASeq:mRNA_QC:Execution_Run)/RID,Execution_Run_RID:=Execution_Run,Replicate_RID:=Replicate,Paired_End,Strandedness,Median_Read_Length,Raw_Count,Final_Count,Notes,RCT,RMT?limit=none"
}
},
{
"processor": "fetch",
"processor_params": {
"output_path": "assets/Study/{Study_RID}/Experiment/{Experiment_RID}/Replicate/{Replicate_RID}/Execution_Run/{Execution_Run_RID}/Output_Files",
"query_path": "/attribute/M:=RNASeq:Execution_Run/RID=17-BPAG/R:=RNASeq:Replicate/$M/(RID)=(RNASeq:Processed_File:Execution_Run)/url:=File_URL,length:=File_Bytes,filename:=File_Name,md5:=File_MD5,Execution_Run_RID:=M:RID,Study_RID:=R:Study_RID,Experiment_RID:=R:Experiment_RID,Replicate_RID:=R:RID?limit=none"
}
},
{
"processor": "fetch",
"processor_params": {
"output_path": "assets/Study/{Study_RID}/Experiment/{Experiment_RID}/Replicate/{Replicate_RID}/Execution_Run/{Execution_Run_RID}/Input_Bag",
"query_path": "/attribute/M:=RNASeq:Execution_Run/RID=17-BPAG/R:=RNASeq:Replicate/$M/RNASeq:Input_Bag/url:=File_URL,length:=File_Bytes,filename:=File_Name,md5:=File_MD5,Execution_Run_RID:=M:RID,Study_RID:=R:Study_RID,Experiment_RID:=R:Experiment_RID,Replicate_RID:=R:RID?limit=none"
}
}
]
}
}
\ No newline at end of file
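# A minimal usage sketch for this export config (the RID shown is the one
# hard-coded in the queries above; host and path are illustrative), mirroring
# how uploadProcessedFile later invokes it:
#   deriva-download-cli staging.gudmap.org --catalog 2 Execution_Run_For_Output_Bag.json . rid=17-BPAG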
{
"bag": {
"bag_name": "Replicate_{rid}",
"bag_name": "{rid}_inputBag",
"bag_algorithms": [
"md5"
],
......
......@@ -84,7 +84,23 @@ process {
cpus = 2
memory = '1 GB'
}
withName: outputBag {
withName: uploadInputBag {
cpus = 1
memory = '1 GB'
}
withName: uploadExecutionRun {
cpus = 1
memory = '1 GB'
}
withName: uploadQC {
cpus = 1
memory = '1 GB'
}
withName: uploadProcessedFile {
cpus = 1
memory = '1 GB'
}
withName: uploadOutputBag {
cpus = 1
memory = '1 GB'
}
......
......@@ -58,7 +58,19 @@ process {
withName: aggrQC {
executor = 'local'
}
withName: outputBag {
withName: uploadInputBag {
executor = 'local'
}
withName: uploadExecutionRun {
executor = 'local'
}
withName: uploadQC {
executor = 'local'
}
withName: uploadProcessedFile {
executor = 'local'
}
withName: uploadOutputBag {
executor = 'local'
}
}
......
......@@ -74,10 +74,14 @@ custom_data:
scale: false
format: '{}'
headers:
Session
Session ID
Pipeline Version
Input
Session:
description: ''
Session ID:
description: 'Nextflow session ID'
Pipeline Version:
description: 'BICF pipeline version'
Input:
description: 'Input overrides'
rid:
file_format: 'tsv'
section_name: 'RID'
......@@ -88,10 +92,14 @@ custom_data:
scale: false
format: '{}'
headers:
Replicate
Replicate RID
Experiment RID
Study RID
Replicate:
description: ''
Replicate RID:
description: 'Replicate RID'
Experiment RID:
description: 'Experiment RID'
Study RID:
description: 'Study RID'
meta:
file_format: 'tsv'
section_name: 'Metadata'
......@@ -102,30 +110,43 @@ custom_data:
scale: false
format: '{:,.0f}'
headers:
Source
Species
Ends
Stranded
Spike-in
Raw Reads
Assigned Reads
Median Read Length
Median TIN
Pipeline Version
Source:
description: 'Metadata source'
Species:
description: 'Species'
Ends:
description: 'Single or paired end sequencing'
Stranded:
description: 'Stranded (forward/reverse) or unstranded library prep'
Spike-in:
description: 'ERCC spike in'
Raw Reads:
description: 'Number of reads from the sequencer'
Assigned Reads:
description: 'Final read count after filtering'
Median Read Length:
description: 'Median read length'
Median TIN:
description: 'Median transcript integrity number'
ref:
file_format: 'tsv'
section_name: 'Reference'
description: 'This is the referenec version information'
description: 'This is the reference version information'
plot_type: 'table'
pconfig:
id: 'ref'
scale: false
format: '{}'
headers:
Species
Genome Reference Consortium Build
Genome Reference Consortium Patch
GENCODE Annotation Release"
Species:
description: 'Reference species'
Genome Reference Consortium Build:
description: 'Reference source build'
Genome Reference Consortium Patch:
description: 'Reference source patch version'
GENCODE Annotation Release:
description: 'Annotation release version'
tin:
file_format: 'tsv'
section_name: 'TIN'
......@@ -135,16 +156,16 @@ custom_data:
id: 'tin'
headers:
chrom
0 - 9
10 - 19
20 - 29
30 - 39
40 - 49
50 - 59
60 - 69
70 - 79
80 - 89
90 - 99
1 - 10
11 - 20
21 - 30
31 - 40
41 - 50
51 - 60
61 - 70
71 - 80
81 - 90
91 - 100
sp:
run:
......@@ -156,4 +177,4 @@ sp:
ref:
fn: 'reference.tsv'
tin:
fn: '*.tin.hist.tsv'
fn: '*_tin.hist.tsv'
......@@ -20,55 +20,67 @@ profiles {
process {
withName:getBag {
container = 'bicf/gudmaprbkfilexfer:2.0.1_indev'
container = 'gudmaprbk/deriva1.3:1.0.0'
}
withName:getData {
container = 'bicf/gudmaprbkfilexfer:2.0.1_indev'
container = 'gudmaprbk/deriva1.3:1.0.0'
}
withName: parseMetadata {
container = 'gudmaprbk/python3:1.0.0'
}
withName: trimData {
container = 'bicf/trimgalore:1.1'
container = 'gudmaprbk/trimgalore0.6.5:1.0.0'
}
withName: getRefInfer {
container = 'gudmaprbk/deriva1.3:1.0.0'
}
withName: downsampleData {
container = 'bicf/seqtk:2.0.1_indev'
container = 'gudmaprbk/seqtk1.3:1.0.0'
}
withName: alignSampleData {
container = 'bicf/gudmaprbkaligner:2.0.1_indev'
container = 'gudmaprbk/hisat2.2.1:1.0.0'
}
withName: inferMetadata {
container = 'bicf/rseqc3.0:2.0.1_indev'
container = 'gudmaprbk/rseqc4.0.0:1.0.0'
}
withName: getRef {
container = 'gudmaprbk/deriva1.3:1.0.0'
}
withName: alignData {
container = 'bicf/gudmaprbkaligner:2.0.1_indev'
container = 'gudmaprbk/hisat2.2.1:1.0.0'
}
withName: dedupData {
container = 'bicf/gudmaprbkdedup:2.0.0'
container = 'gudmaprbk/picard2.23.9:1.0.0'
}
withName: countData {
container = 'bicf/subread2:2.0.0'
container = 'gudmaprbk/subread2.0.1:1.0.0'
}
withName: makeBigWig {
container = 'bicf/deeptools3.3:2.0.1_indev'
container = 'gudmaprbk/deeptools3.5.0:1.0.0'
}
withName: fastqc {
container = 'bicf/fastqc:2.0.1_indev'
container = 'gudmaprbk/fastqc0.11.9:1.0.0'
}
withName: dataQC {
container = 'bicf/rseqc3.0:2.0.1_indev'
container = 'gudmaprbk/rseqc4.0.0:1.0.0'
}
withName: aggrQC {
container = 'bicf/multiqc1.8:2.0.1_indev'
container = 'gudmaprbk/multiqc1.9:1.0.0'
}
withName:uploadInputBag {
container = 'gudmaprbk/deriva1.3:1.0.0'
}
withName:uploadExecutionRun {
container = 'gudmaprbk/deriva1.3:1.0.0'
}
withName:outputBag {
container = 'bicf/gudmaprbkfilexfer:2.0.1_indev'
withName:uploadQC {
container = 'gudmaprbk/deriva1.3:1.0.0'
}
withName:uploadProcessedFile {
container = 'gudmaprbk/deriva1.3:1.0.0'
}
withName:uploadOutputBag {
container = 'gudmaprbk/deriva1.3:1.0.0'
}
}
......
......@@ -18,6 +18,7 @@ params.refMoVersion = "38.p6.vM22"
params.refHuVersion = "38.p12.v31"
params.refERCCVersion = "92"
params.outDir = "${baseDir}/../output"
params.upload = true
params.email = ""
......@@ -36,6 +37,11 @@ deriva.into {
deriva_getBag
deriva_getRefInfer
deriva_getRef
deriva_uploadInputBag
deriva_uploadExecutionRun
deriva_uploadQC
deriva_uploadProcessedFile
deriva_uploadOutputBag
}
bdbag = Channel
.fromPath(params.bdbag)
......@@ -46,13 +52,15 @@ refHuVersion = params.refHuVersion
refERCCVersion = params.refERCCVersion
outDir = params.outDir
logsDir = "${outDir}/Logs"
upload = params.upload
inputBagForce = params.inputBagForce
fastqsForce = params.fastqsForce
speciesForce = params.speciesForce
email = params.email
// Define fixed files
derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json")
// Define fixed files and export configs
replicateExportConfig = Channel.fromPath("${baseDir}/conf/Replicate_For_Input_Bag.json")
executionRunExportConfig = Channel.fromPath("${baseDir}/conf/Execution_Run_For_Output_Bag.json")
if (params.source == "dev") {
source = "dev.gudmap.org"
} else if (params.source == "staging") {
......@@ -74,15 +82,20 @@ softwareReferences = Channel.fromPath("${baseDir}/../docs/software_references_mq
softwareVersions = Channel.fromPath("${baseDir}/../docs/software_versions_mqc.yaml")
// Define script files
script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh")
script_parseMeta = Channel.fromPath("${baseDir}/scripts/parseMeta.py")
script_inferMeta = Channel.fromPath("${baseDir}/scripts/inferMeta.sh")
script_refDataInfer = Channel.fromPath("${baseDir}/scripts/extractRefData.py")
script_refData = Channel.fromPath("${baseDir}/scripts/extractRefData.py")
script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbag_fetch.sh")
script_parseMeta = Channel.fromPath("${baseDir}/scripts/parse_meta.py")
script_inferMeta = Channel.fromPath("${baseDir}/scripts/infer_meta.sh")
script_refDataInfer = Channel.fromPath("${baseDir}/scripts/extract_ref_data.py")
script_refData = Channel.fromPath("${baseDir}/scripts/extract_ref_data.py")
script_calculateTPM = Channel.fromPath("${baseDir}/scripts/calculateTPM.R")
script_convertGeneSymbols = Channel.fromPath("${baseDir}/scripts/convertGeneSymbols.R")
script_tinHist = Channel.fromPath("${baseDir}/scripts/tinHist.py")
script_tinHist = Channel.fromPath("${baseDir}/scripts/tin_hist.py")
script_uploadInputBag = Channel.fromPath("${baseDir}/scripts/upload_input_bag.py")
script_uploadExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py")
script_uploadQC = Channel.fromPath("${baseDir}/scripts/upload_qc.py")
script_uploadOutputBag = Channel.fromPath("${baseDir}/scripts/upload_output_bag.py")
script_deleteEntry_uploadQC = Channel.fromPath("${baseDir}/scripts/delete_entry.py")
script_deleteEntry_uploadProcessedFile = Channel.fromPath("${baseDir}/scripts/delete_entry.py")
/*
* trackStart: track start of pipeline
......@@ -143,10 +156,10 @@ process getBag {
input:
path credential, stageAs: "credential.json" from deriva_getBag
path derivaConfig
path replicateExportConfig
output:
path ("Replicate_*.zip") into bag
path ("*.zip") into bag
when:
inputBagForce == ""
......@@ -164,8 +177,15 @@ process getBag {
# deriva-download replicate RID
echo -e "LOG: fetching bag for ${repRID} in GUDMAP" >> ${repRID}.getBag.log
deriva-download-cli staging.gudmap.org --catalog 2 ${derivaConfig} . rid=${repRID}
deriva-download-cli ${source} --catalog 2 ${replicateExportConfig} . rid=${repRID}
echo -e "LOG: fetched" >> ${repRID}.getBag.log
name=\$(ls *.zip)
name=\$(basename \${name} | cut -d "." -f1)
yr=\$(date +'%Y')
mn=\$(date +'%m')
dy=\$(date +'%d')
mv \${name}.zip \${name}_\${yr}\${mn}\${dy}.zip
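# e.g. Q-Y5F6_inputBag.zip downloaded on 2020-11-30 (date illustrative) becomes
# Q-Y5F6_inputBag_20201130.zip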
"""
}
......@@ -177,6 +197,10 @@ if (inputBagForce != "") {
} else {
inputBag = bag
}
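// Replicate the input bag for multiple process inputs (DSL1 .into forks a
// channel so each downstream consumer receives its own copy)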
inputBag.into {
inputBag_getData
inputBag_uploadInputBag
}
/*
* getData: fetch study files from consortium with downloaded bdbag.zip
......@@ -187,7 +211,7 @@ process getData {
input:
path script_bdbagFetch
path cookies, stageAs: "deriva-cookies.txt" from bdbag
path inputBag
path inputBag from inputBag_getData
output:
path ("*.R{1,2}.fastq.gz") into fastqs
......@@ -207,7 +231,7 @@ process getData {
echo -e "LOG: linked" >> ${repRID}.getData.log
# get bag basename
replicate=\$(basename "${inputBag}" | cut -d "." -f1)
replicate=\$(basename "${inputBag}")
echo -e "LOG: bag replicate name \${replicate}" >> ${repRID}.getData.log
# unzip bag
......@@ -217,7 +241,7 @@ process getData {
# bag fetch fastq's only and rename by repRID
echo -e "LOG: fetching replicate bdbag" >> ${repRID}.getData.log
sh ${script_bdbagFetch} \${replicate} ${repRID}
sh ${script_bdbagFetch} \${replicate::-13} ${repRID}
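# \${replicate::-13} strips the trailing "_YYYYMMDD.zip" (13 characters) that
# getBag appended to the input bag name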
echo -e "LOG: fetched" >> ${repRID}.getData.log
"""
}
......@@ -249,7 +273,7 @@ process parseMetadata {
path experiment from experimentMeta
output:
path "design.csv" into metadata
path "design.csv" into metadata_fl
script:
"""
......@@ -310,7 +334,7 @@ speciesMeta = Channel.create()
readLengthMeta = Channel.create()
expRID = Channel.create()
studyRID = Channel.create()
metadata.splitCsv(sep: ",", header: false).separate(
metadata_fl.splitCsv(sep: ",", header: false).separate(
endsMeta,
endsManual,
strandedMeta,
......@@ -320,6 +344,7 @@ metadata.splitCsv(sep: ",", header: false).separate(
expRID,
studyRID
)
// Replicate metadata for multiple process inputs
endsManual.into {
endsManual_trimData
......@@ -327,6 +352,16 @@ endsManual.into {
endsManual_alignSampleData
endsManual_aggrQC
}
studyRID.into {
studyRID_aggrQC
studyRID_uploadInputBag
studyRID_uploadProcessedFile
studyRID_uploadOutputBag
}
expRID.into {
expRID_aggrQC
expRID_uploadProcessedFile
}
/*
......@@ -336,14 +371,14 @@ process trimData {
tag "${repRID}"
input:
val ends from endsManual_trimData
path (fastq) from fastqs_trimData
val ends from endsManual_trimData
output:
path ("*.fq.gz") into fastqsTrim
path ("*.fastq.gz", includeInputs:true) into fastqs_fastqc
path ("*_trimming_report.txt") into trimQC
path ("readLength.csv") into inferMetadata_readLength
path ("readLength.csv") into readLengthInfer_fl
script:
"""
......@@ -371,11 +406,16 @@ process trimData {
// Extract calculated read length metadata into channel
readLengthInfer = Channel.create()
inferMetadata_readLength.splitCsv(sep: ",", header: false).separate(
readLengthInfer_fl.splitCsv(sep: ",", header: false).separate(
readLengthInfer
)
// Replicate trimmed fastq's
// Replicate inferred read length for multiple process inputs
readLengthInfer.into {
readLengthInfer_aggrQC
readLengthInfer_uploadQC
}
// Replicate trimmed fastq's for multiple process inputs
fastqsTrim.into {
fastqsTrim_alignData
fastqsTrim_downsampleData
......@@ -450,9 +490,9 @@ process getRefInfer {
query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version=${refName}${refERCCVersion}/Annotation_Version=${refName}${refERCCVersion}')
fi
curl --request GET \${query} > refQuery.json
refURL=\$(python extractRefData.py --returnParam URL)
refURL=\$(python ${script_refDataInfer} --returnParam URL)
loc=\$(dirname \${refURL})
fName=\$(python extractRefData.py --returnParam fName)
fName=\$(python ${script_refDataInfer} --returnParam fName)
fName=\${fName%.*}
if [ "\${loc}" = "/hatrac/*" ]; then echo "LOG: Reference not present in hatrac"; exit 1; fi
filename=\$(echo \$(basename \${refURL}) | grep -oP '.*(?=:)')
......@@ -483,8 +523,8 @@ process downsampleData {
tag "${repRID}"
input:
val ends from endsManual_downsampleData
path fastq from fastqsTrim_downsampleData
val ends from endsManual_downsampleData
output:
path ("sampled.1.fq") into fastqs1Sample
......@@ -575,7 +615,7 @@ process inferMetadata {
path alignSummary from alignSampleQC_inferMetadata.collect()
output:
path "infer.csv" into inferMetadata
path "infer.csv" into inferMetadata_fl
path "${repRID}.infer_experiment.txt" into inferExperiment
script:
......@@ -642,18 +682,18 @@ process inferMetadata {
infer_experiment.py -r "\${bed}" -i "\${bam}" 1>> ${repRID}.infer_experiment.txt
echo -e "LOG: infered" >> ${repRID}.inferMetadata.log
ended=`bash inferMeta.sh endness ${repRID}.infer_experiment.txt`
fail=`bash inferMeta.sh fail ${repRID}.infer_experiment.txt`
ended=`bash ${script_inferMeta} endness ${repRID}.infer_experiment.txt`
fail=`bash ${script_inferMeta} fail ${repRID}.infer_experiment.txt`
if [ \${ended} == "PairEnd" ]
then
ends="pe"
percentF=`bash inferMeta.sh pef ${repRID}.infer_experiment.txt`
percentR=`bash inferMeta.sh per ${repRID}.infer_experiment.txt`
percentF=`bash ${script_inferMeta} pef ${repRID}.infer_experiment.txt`
percentR=`bash ${script_inferMeta} per ${repRID}.infer_experiment.txt`
elif [ \${ended} == "SingleEnd" ]
then
ends="se"
percentF=`bash inferMeta.sh sef ${repRID}.infer_experiment.txt`
percentR=`bash inferMeta.sh ser ${repRID}.infer_experiment.txt`
percentF=`bash ${script_inferMeta} sef ${repRID}.infer_experiment.txt`
percentR=`bash ${script_inferMeta} ser ${repRID}.infer_experiment.txt`
fi
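# infer_experiment.txt typically holds lines like the following (values are
# illustrative):
#   This is PairEnd Data
#   Fraction of reads explained by "1++,1--,2+-,2-+": 0.4903
# which infer_meta.sh greps to classify endedness and the strand fractions.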
echo -e "LOG: percentage reads in the same direction as gene: \${percentF}" >> ${repRID}.inferMetadata.log
echo -e "LOG: percentage reads in the opposite direction as gene: \${percentR}" >> ${repRID}.inferMetadata.log
......@@ -684,7 +724,7 @@ align_moInfer = Channel.create()
percentFInfer = Channel.create()
percentRInfer = Channel.create()
failInfer = Channel.create()
inferMetadata.splitCsv(sep: ",", header: false).separate(
inferMetadata_fl.splitCsv(sep: ",", header: false).separate(
endsInfer,
strandedInfer,
spikeInfer,
......@@ -703,20 +743,24 @@ endsInfer.into {
endsInfer_countData
endsInfer_dataQC
endsInfer_aggrQC
endsInfer_uploadQC
}
strandedInfer.into {
strandedInfer_alignData
strandedInfer_countData
strandedInfer_aggrQC
strandedInfer_uploadQC
}
spikeInfer.into{
spikeInfer_getRef
spikeInfer_aggrQC
spikeInfer_uploadExecutionRun
}
speciesInfer.into {
speciesInfer_getRef
speciesInfer_aggrQC
speciesInfer_outputBag
speciesInfer_uploadExecutionRun
speciesInfer_uploadProcessedFile
}
......@@ -727,8 +771,8 @@ process getRef {
tag "${species}"
input:
path credential, stageAs: "credential.json" from deriva_getRef
path script_refData
path credential, stageAs: "credential.json" from deriva_getRef
val spike from spikeInfer_getRef
val species from speciesInfer_getRef
......@@ -796,9 +840,9 @@ process getRef {
GENCODE=\$(echo \${references} | grep -o \${refName}.* | cut -d '.' -f3)
query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='\${GRCv}'.'\${GRCp}'/Annotation_Version=GENCODE%20'\${GENCODE})
curl --request GET \${query} > refQuery.json
refURL=\$(python extractRefData.py --returnParam URL)
refURL=\$(python ${script_refData} --returnParam URL)
loc=\$(dirname \${refURL})
fName=\$(python extractRefData.py --returnParam fName)
fName=\$(python ${script_refData} --returnParam fName)
fName=\${fName%.*}
if [ "\${loc}" = "/hatrac/*" ]; then echo "LOG: Reference not present in hatrac"; exit 1; fi
filename=\$(echo \$(basename \${refURL}) | grep -oP '.*(?=:)')
......@@ -824,10 +868,10 @@ process alignData {
tag "${repRID}"
input:
val ends from endsInfer_alignData
val stranded from strandedInfer_alignData
path fastq from fastqsTrim_alignData
path reference_alignData
val ends from endsInfer_alignData
val stranded from strandedInfer_alignData
output:
tuple path ("${repRID}.sorted.bam"), path ("${repRID}.sorted.bam.bai") into rawBam
......@@ -897,8 +941,8 @@ process dedupData {
tuple path (bam), path (bai) from rawBam_dedupData
output:
tuple path ("${repRID}.sorted.deduped.bam"), path ("${repRID}.sorted.deduped.bam.bai") into dedupBam
tuple path ("${repRID}.sorted.deduped.*.bam"), path ("${repRID}.sorted.deduped.*.bam.bai") into dedupChrBam
tuple path ("${repRID}_sorted.deduped.bam"), path ("${repRID}_sorted.deduped.bam.bai") into dedupBam
tuple path ("${repRID}_sorted.deduped.*.bam"), path ("${repRID}_sorted.deduped.*.bam.bai") into dedupChrBam
path ("*.deduped.Metrics.txt") into dedupQC
script:
......@@ -913,16 +957,16 @@ process dedupData {
# sort the bam file using Samtools
echo -e "LOG: sorting the bam file" >> ${repRID}.dedup.log
samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.deduped.bam ${repRID}.deduped.bam
samtools sort -@ `nproc` -O BAM -o ${repRID}_sorted.deduped.bam ${repRID}.deduped.bam
# index the sorted bam using Samtools
echo -e "LOG: indexing sorted bam file" >> ${repRID}.dedup.log
samtools index -@ `nproc` -b ${repRID}.sorted.deduped.bam ${repRID}.sorted.deduped.bam.bai
samtools index -@ `nproc` -b ${repRID}_sorted.deduped.bam ${repRID}_sorted.deduped.bam.bai
# split the deduped BAM file for multi-threaded tin calculation
for i in `samtools view ${repRID}.sorted.deduped.bam | cut -f3 | sort | uniq`;
for i in `samtools view ${repRID}_sorted.deduped.bam | cut -f3 | sort | uniq`;
do
echo "echo \"LOG: splitting each chromosome into its own BAM and BAI files with Samtools\"; samtools view -b ${repRID}.sorted.deduped.bam \${i} 1>> ${repRID}.sorted.deduped.\${i}.bam; samtools index -@ `nproc` -b ${repRID}.sorted.deduped.\${i}.bam ${repRID}.sorted.deduped.\${i}.bam.bai"
echo "echo \"LOG: splitting each chromosome into its own BAM and BAI files with Samtools\"; samtools view -b ${repRID}_sorted.deduped.bam \${i} 1>> ${repRID}_sorted.deduped.\${i}.bam; samtools index -@ `nproc` -b ${repRID}_sorted.deduped.\${i}.bam ${repRID}_sorted.deduped.\${i}.bam.bai"
done | parallel -j `nproc` -k
"""
}
......@@ -932,6 +976,7 @@ dedupBam.into {
dedupBam_countData
dedupBam_makeBigWig
dedupBam_dataQC
dedupBam_uploadProcessedFile
}
/*
......@@ -945,7 +990,7 @@ process makeBigWig {
tuple path (bam), path (bai) from dedupBam_makeBigWig
output:
path ("${repRID}.bw")
path ("${repRID}_sorted.deduped.bw") into bigwig
script:
"""
......@@ -954,7 +999,7 @@ process makeBigWig {
# create bigwig
echo -e "LOG: creating bibWig" >> ${repRID}.makeBigWig.log
bamCoverage -p `nproc` -b ${bam} -o ${repRID}.bw
bamCoverage -p `nproc` -b ${bam} -o ${repRID}_sorted.deduped.bw
echo -e "LOG: created" >> ${repRID}.makeBigWig.log
"""
}
......@@ -964,7 +1009,7 @@ process makeBigWig {
*/
process countData {
tag "${repRID}"
publishDir "${outDir}/count", mode: 'copy', pattern: "${repRID}*.tpmTable.csv"
publishDir "${outDir}/count", mode: 'copy', pattern: "${repRID}*_tpmTable.csv"
input:
path script_calculateTPM
......@@ -975,9 +1020,9 @@ process countData {
val stranded from strandedInfer_countData
output:
path ("*.tpmTable.csv") into counts
path ("*.countData.summary") into countsQC
path ("assignedReads.csv") into inferMetadata_assignedReads
path ("*_tpmTable.csv") into counts
path ("*_countData.summary") into countsQC
path ("assignedReads.csv") into assignedReadsInfer_fl
script:
"""
......@@ -1004,32 +1049,38 @@ process countData {
echo -e "LOG: counting ${ends} features" >> ${repRID}.countData.log
if [ "${ends}" == "se" ]
then
featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}.countData -s \${stranding} -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam
featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}_countData -s \${stranding} -R SAM --primary --ignoreDup ${repRID}_sorted.deduped.bam
elif [ "${ends}" == "pe" ]
then
featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}.countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam
featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}_countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}_sorted.deduped.bam
fi
echo -e "LOG: counted" >> ${repRID}.countData.log
# extract assigned reads
grep -m 1 'Assigned' *.countData.summary | grep -oe '\\([0-9.]*\\)' > assignedReads.csv
grep -m 1 'Assigned' *_countData.summary | grep -oe '\\([0-9.]*\\)' > assignedReads.csv
# calculate TPM from the resulting countData table
echo -e "LOG: calculating TPM with R" >> ${repRID}.countData.log
Rscript calculateTPM.R --count "${repRID}.countData"
Rscript ${script_calculateTPM} --count "${repRID}_countData"
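# TPM is conventionally computed as: rate_g = count_g / gene_length_g, then
# TPM_g = rate_g / sum(rate) * 1e6; calculateTPM.R's exact implementation is
# not shown in this diff.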
# convert gene symbols to Entrez id's
echo -e "LOG: convert gene symbols to Entrez id's" >> ${repRID}.countData.log
Rscript convertGeneSymbols.R --repRID "${repRID}"
Rscript ${script_convertGeneSymbols} --repRID "${repRID}"
"""
}
// Extract number of assigned reads metadata into channel
assignedReadsInfer = Channel.create()
inferMetadata_assignedReads.splitCsv(sep: ",", header: false).separate(
assignedReadsInfer_fl.splitCsv(sep: ",", header: false).separate(
assignedReadsInfer
)
// Replicate inferred assigned reads for multiple process inputs
assignedReadsInfer.into {
assignedReadsInfer_aggrQC
assignedReadsInfer_uploadQC
}
/*
*fastqc: run fastqc on untrimmed fastq's
*/
......@@ -1041,7 +1092,7 @@ process fastqc {
output:
path ("*_fastqc.zip") into fastqc
path ("rawReads.csv") into inferMetadata_rawReads
path ("rawReads.csv") into rawReadsInfer_fl
script:
"""
......@@ -1059,10 +1110,16 @@ process fastqc {
// Extract number of raw reads metadata into channel
rawReadsInfer = Channel.create()
inferMetadata_rawReads.splitCsv(sep: ",", header: false).separate(
rawReadsInfer_fl.splitCsv(sep: ",", header: false).separate(
rawReadsInfer
)
// Replicate inferred raw reads for multiple process inputs
rawReadsInfer.into {
rawReadsInfer_aggrQC
rawReadsInfer_uploadQC
}
/*
*dataQC: calculate and bin transcript integrity numbers (TIN), and calculate the inner distance of PE replicates
*/
......@@ -1077,9 +1134,9 @@ process dataQC {
val ends from endsInfer_dataQC
output:
path "${repRID}.tin.hist.tsv" into tinHist
path "${repRID}.tin.med.csv" into inferMetadata_tinMed
path "${repRID}.insertSize.inner_distance_freq.txt" into innerDistance
path "${repRID}_tin.hist.tsv" into tinHist
path "${repRID}_tin.med.csv" into tinMedInfer_fl
path "${repRID}_insertSize.inner_distance_freq.txt" into innerDistance
script:
"""
......@@ -1087,10 +1144,10 @@ process dataQC {
ulimit -a >> ${repRID}.dataQC.log
# calculate TIN values per feature on each chromosome
echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > ${repRID}.sorted.deduped.tin.xls
echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > ${repRID}_sorted.deduped.tin.xls
for i in `cat ./bed/genome.bed | cut -f1 | sort | uniq`; do
echo "echo \"LOG: running tin.py on \${i}\" >> ${repRID}.dataQC.log; tin.py -i ${repRID}.sorted.deduped.\${i}.bam -r ./bed/genome.bed; cat ${repRID}.sorted.deduped.\${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \\\"\\\\t\${i}\\\\t\\\";";
done | parallel -j `nproc` -k 1>> ${repRID}.sorted.deduped.tin.xls
echo "echo \"LOG: running tin.py on \${i}\" >> ${repRID}.dataQC.log; tin.py -i ${repRID}_sorted.deduped.\${i}.bam -r ./bed/genome.bed; cat ${repRID}_sorted.deduped.\${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \\\"\\\\t\${i}\\\\t\\\";";
done | parallel -j `nproc` -k 1>> ${repRID}_sorted.deduped.tin.xls
# bin TIN values
echo -e "LOG: binning TINs" >> ${repRID}.dataQC.log
......@@ -1101,19 +1158,19 @@ process dataQC {
if [ "${ends}" == "pe" ]
then
echo -e "LOG: calculating inner distances for ${ends}" >> ${repRID}.dataQC.log
inner_distance.py -i "${bam}" -o ${repRID}.insertSize -r ./bed/genome.bed
inner_distance.py -i "${bam}" -o ${repRID}_insertSize -r ./bed/genome.bed
echo -e "LOG: calculated" >> ${repRID}.dataQC.log
elif [ "${ends}" == "se" ]
then
echo -e "LOG: creating dummy inner distance file for ${ends}" >> ${repRID}.dataQC.log
touch ${repRID}.insertSize.inner_distance_freq.txt
touch ${repRID}_insertSize.inner_distance_freq.txt
fi
"""
}
// Extract median TIN metadata into channel
tinMedInfer = Channel.create()
inferMetadata_tinMed.splitCsv(sep: ",", header: false).separate(
tinMedInfer_fl.splitCsv(sep: ",", header: false).separate(
tinMedInfer
)
......@@ -1149,12 +1206,12 @@ process aggrQC {
val spikeI from spikeInfer_aggrQC
val speciesI from speciesInfer_aggrQC
val readLengthM from readLengthMeta
val readLengthI from readLengthInfer
val rawReadsI from rawReadsInfer
val assignedReadsI from assignedReadsInfer
val readLengthI from readLengthInfer_aggrQC
val rawReadsI from rawReadsInfer_aggrQC
val assignedReadsI from assignedReadsInfer_aggrQC
val tinMedI from tinMedInfer
val expRID
val studyRID
val studyRID from studyRID_aggrQC
val expRID from expRID_aggrQC
output:
path "${repRID}.multiqc.html" into multiqc
......@@ -1226,24 +1283,270 @@ process aggrQC {
"""
}
/*
* uploadInputBag: uploads the input bag
*/
process uploadInputBag {
tag "${repRID}"
input:
path script_uploadInputBag
path credential, stageAs: "credential.json" from deriva_uploadInputBag
path inputBag from inputBag_uploadInputBag
val studyRID from studyRID_uploadInputBag
output:
path ("inputBagRID.csv") into inputBagRID_fl
when:
upload
script:
"""
hostname > ${repRID}.uploadInputBag.log
ulimit -a >> ${repRID}.uploadInputBag.log
yr=\$(date +'%Y')
mn=\$(date +'%m')
dy=\$(date +'%d')
file=\$(basename -a ${inputBag})
md5=\$(md5sum ./\${file} | awk '{ print \$1 }')
echo LOG: ${repRID} input bag md5 sum - \${md5} >> ${repRID}.uploadInputBag.log
size=\$(wc -c < ./\${file})
echo LOG: ${repRID} input bag size - \${size} bytes >> ${repRID}.uploadInputBag.log
exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Input_Bag/File_MD5=\${md5})
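# ermrest returns a JSON array; "[]" means no Input_Bag with this MD5 has been
# uploaded yet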
if [ "\${exist}" == "[]" ]
then
cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"')
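# the next line trims the leading '"cookie": "' (11 characters) and the
# trailing quote, leaving the bare webauthn cookie value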
cookie=\${cookie:11:-1}
loc=\$(deriva-hatrac-cli --host ${source} put ./\${file} /hatrac/resources/rnaseq/pipeline/input_bag/study/${studyRID}/replicate/${repRID}/\${file} --parents)
inputBag_rid=\$(python3 ${script_uploadInputBag} -f \${file} -l \${loc} -s \${md5} -b \${size} -o ${source} -c \${cookie})
echo LOG: input bag RID uploaded - \${inputBag_rid} >> ${repRID}.uploadInputBag.log
rid=\${inputBag_rid}
else
exist=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
exist=\${exist:7:-6}
echo LOG: input bag RID already exists - \${exist} >> ${repRID}.uploadInputBag.log
rid=\${exist}
fi
echo \${rid} > inputBagRID.csv
"""
}
// Extract input bag RID into channel
inputBagRID = Channel.create()
inputBagRID_fl.splitCsv(sep: ",", header: false).separate(
inputBagRID
)
/*
* uploadExecutionRun: uploads the execution run
*/
process uploadExecutionRun {
tag "${repRID}"
input:
path script_uploadExecutionRun
path credential, stageAs: "credential.json" from deriva_uploadExecutionRun
val spike from spikeInfer_uploadExecutionRun
val species from speciesInfer_uploadExecutionRun
val inputBagRID
output:
path ("executionRunRID.csv") into executionRunRID_fl
when:
upload
script:
"""
hostname > ${repRID}.uploadExecutionRun.log
ulimit -a >> ${repRID}.uploadExecutionRun.log
echo LOG: searching for workflow RID - BICF mRNA ${workflow.manifest.version} >> ${repRID}.uploadExecutionRun.log
workflow=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Workflow/Name=BICF%20mRNA%20Replicate/Version=${workflow.manifest.version})
workflow=\$(echo \${workflow} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
workflow=\${workflow:7:-6}
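# the grep/substring pair above isolates the bare RID between the 7-character
# '"RID":"' prefix and the 6-character '","RCT' suffix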
echo LOG: workflow RID extracted - \${workflow} >> ${repRID}.uploadExecutionRun.log
if [ "${species}" == "Homo sapiens" ]
then
genomeName=\$(echo GRCh${refHuVersion})
elif [ "${species}" == "Mus musculus" ]
then
genomeName=\$(echo GRCm${refMoVersion})
fi
if [ "${spike}" == "yes" ]
then
genomeName=\$(echo \${genomeName}-S)
fi
echo LOG: searching for genome name - \${genomeName} >> ${repRID}.uploadExecutionRun.log
genome=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Name=\${genomeName}_indev)
genome=\$(echo \${genome} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
genome=\${genome:7:-6}
echo LOG: genome RID extracted - \${genome} >> ${repRID}.uploadExecutionRun.log
cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"')
cookie=\${cookie:11:-1}
exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=\${workflow}/Replicate=${repRID}/Input_Bag=${inputBagRID})
echo \${exist} >> ${repRID}.uploadExecutionRun.log
if [ "\${exist}" == "[]" ]
then
executionRun_rid=\$(python3 ${script_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s In-progress -d 'Run in process' -o ${source} -c \${cookie} -u F)
echo LOG: execution run RID uploaded - \${executionRun_rid} >> ${repRID}.uploadExecutionRun.log
else
rid=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
rid=\${rid:7:-6}
echo \${rid} >> ${repRID}.uploadExecutionRun.log
executionRun_rid=\$(python3 ${script_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s In-progress -d 'Run in process' -o ${source} -c \${cookie} -u \${rid})
echo LOG: execution run RID updated - \${executionRun_rid} >> ${repRID}.uploadExecutionRun.log
fi
echo \${executionRun_rid} > executionRunRID.csv
"""
}
// Extract execution run RID into channel
executionRunRID = Channel.create()
executionRunRID_fl.splitCsv(sep: ",", header: false).separate(
executionRunRID
)
// Replicate execution run RID for multiple process inputs
executionRunRID.into {
executionRunRID_uploadQC
executionRunRID_uploadProcessedFile
executionRunRID_uploadOutputBag
}
/*
* uploadQC: uploads the mRNA QC
*/
process uploadQC {
tag "${repRID}"
input:
path script_deleteEntry_uploadQC
path script_uploadQC
path credential, stageAs: "credential.json" from deriva_uploadQC
val executionRunRID from executionRunRID_uploadQC
val ends from endsInfer_uploadQC
val stranded from strandedInfer_uploadQC
val length from readLengthInfer_uploadQC
val rawCount from rawReadsInfer_uploadQC
val finalCount from assignedReadsInfer_uploadQC
output:
path ("qcRID.csv") into qcRID_fl
when:
upload
script:
"""
hostname > ${repRID}.uploadQC.log
ulimit -a >> ${repRID}.uploadQC.log
if [ "${ends}" == "pe" ]
then
end="Paired End"
elif [ "${ends}" == "se" ]
then
end="Single Read"
fi
cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"')
cookie=\${cookie:11:-1}
exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:mRNA_QC/Replicate=${repRID})
if [ "\${exist}" != "[]" ]
then
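# RIDs are 7 characters here; capture the 7 characters following each
# '"RID":"' occurrence, yielding one RID per line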
rids=\$(echo \${exist} | grep -o '\\"RID\\":\\".\\{7\\}' | sed 's/^.\\{7\\}//')
for rid in \${rids}
do
python3 ${script_deleteEntry_uploadQC} -r \${rid} -t mRNA_QC -o ${source} -c \${cookie}
echo LOG: old mRNA QC RID deleted - \${rid} >> ${repRID}.uploadQC.log
done
echo LOG: all old mRNA QC RIDs deleted >> ${repRID}.uploadQC.log
fi
qc_rid=\$(python3 ${script_uploadQC} -r ${repRID} -e ${executionRunRID} -p "\${end}" -s ${stranded} -l ${length} -w ${rawCount} -f ${finalCount} -o ${source} -c \${cookie} -u F)
echo LOG: mRNA QC RID uploaded - \${qc_rid} >> ${repRID}.uploadQC.log
echo \${qc_rid} > qcRID.csv
"""
}
// Extract mRNA qc RID into channel
qcRID = Channel.create()
qcRID_fl.splitCsv(sep: ",", header: false).separate(
qcRID
)
/*
* uploadProcessedFile: upload processed files and create the output bag
*/
process outputBag {
process uploadProcessedFile {
tag "${repRID}"
publishDir "${outDir}/outputBag", mode: 'copy', pattern: "Replicate_${repRID}.outputBag.zip"
input:
path script_deleteEntry_uploadProcessedFile
path credential, stageAs: "credential.json" from deriva_uploadProcessedFile
path executionRunExportConfig
path multiqc
path multiqcJSON
val species from speciesInfer_outputBag
tuple path (bam),path (bai) from dedupBam_uploadProcessedFile
path bigwig
path counts
val species from speciesInfer_uploadProcessedFile
val studyRID from studyRID_uploadProcessedFile
val expRID from expRID_uploadProcessedFile
val executionRunRID from executionRunRID_uploadProcessedFile
output:
path ("Replicate_*.zip") into outputBag
path ("${repRID}_Output_Bag.zip") into outputBag
when:
upload
script:
"""
mkdir Replicate_${repRID}.outputBag
hostname > ${repRID}.outputBag.log
ulimit -a >> ${repRID}.outputBag.log
mkdir -p ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/
cp ${bam} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/
cp ${bai} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/
cp ${bigwig} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/
cp ${counts} ./deriva/Seq/pipeline/${studyRID}/${executionRunRID}/
cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"')
cookie=\${cookie:11:-1}
exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Processed_File/Replicate=${repRID})
if [ "\${exist}" != "[]" ]
then
rids=\$(echo \${exist} | grep -o '\\"RID\\":\\".\\{7\\}' | sed 's/^.\\{7\\}//')
for rid in \${rids}
do
python3 ${script_deleteEntry_uploadProcessedFile} -r \${rid} -t Processed_File -o ${source} -c \${cookie}
done
echo LOG: all old processed file RIDs deleted >> ${repRID}.outputBag.log
fi
deriva-upload-cli --catalog 2 --token \${cookie:9} ${source} ./deriva
echo LOG: processed files uploaded >> ${repRID}.outputBag.log
deriva-download-cli --catalog 2 --token \${cookie:9} ${source} ${executionRunExportConfig} . rid=${executionRunRID}
echo LOG: execution run bag downloaded >> ${repRID}.outputBag.log
echo -e "### Run Details" >> runDetails.md
echo -e "**Workflow URL:** https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq" >> runDetails.md
echo -e "**Workflow Version:** ${workflow.manifest.version}" >> runDetails.md
......@@ -1260,13 +1563,85 @@ process outputBag {
echo -e "**Genome Assembly Version:** \${genome} patch \${patch}" >> runDetails.md
echo -e "**Annotation Version:** GENCODE release \${annotation}" >> runDetails.md
echo -e "**Run ID:** ${repRID}" >> runDetails.md
cp runDetails.md Replicate_${repRID}.outputBag
cp ${multiqc} Replicate_${repRID}.outputBag
cp ${multiqcJSON} Replicate_${repRID}.outputBag
bdbag Replicate_${repRID}.outputBag --archiver zip
echo LOG: runDetails.md created >> ${repRID}.outputBag.log
unzip Execution_Run_${executionRunRID}.zip
yr=\$(date +'%Y')
mn=\$(date +'%m')
dy=\$(date +'%d')
mv Execution_Run_${executionRunRID} ${repRID}_Output_Bag
loc=./${repRID}_Output_Bag/data/assets/Study/${studyRID}/Experiment/${expRID}/Replicate/${repRID}/Execution_Run/${executionRunRID}/Output_Files/
mkdir -p \${loc}
cp runDetails.md \${loc}
cp ${multiqc} \${loc}
cp ${multiqcJSON} \${loc}
bdbag ./${repRID}_Output_Bag/ --update --archiver zip --debug
echo LOG: output bag created >> ${repRID}.outputBag.log
"""
}
/*
* uploadOutputBag: uploads the output bag
*/
process uploadOutputBag {
tag "${repRID}"
input:
path script_uploadOutputBag
path credential, stageAs: "credential.json" from deriva_uploadOutputBag
path outputBag
val studyRID from studyRID_uploadOutputBag
val executionRunRID from executionRunRID_uploadOutputBag
output:
path ("outputBagRID.csv") into outputBagRID_fl
when:
upload
script:
"""
hostname > ${repRID}.uploadOutputBag.log
ulimit -a >> ${repRID}.uploadOutputBag.log
yr=\$(date +'%Y')
mn=\$(date +'%m')
dy=\$(date +'%d')
file=\$(basename -a ${outputBag})
md5=\$(md5sum ./\${file} | awk '{ print \$1 }')
echo LOG: ${repRID} output bag md5 sum - \${md5} >> ${repRID}.uploadOutputBag.log
size=\$(wc -c < ./\${file})
echo LOG: ${repRID} output bag size - \${size} bytes >> ${repRID}.uploadOutputBag.log
exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Output_Bag/File_MD5=\${md5})
if [ "\${exist}" == "[]" ]
then
cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"')
cookie=\${cookie:11:-1}
loc=\$(deriva-hatrac-cli --host ${source} put ./\${file} /hatrac/resources/rnaseq/pipeline/output_bag/study/${studyRID}/replicate/${repRID}/\${file} --parents)
outputBag_rid=\$(python3 ${script_uploadOutputBag} -e ${executionRunRID} -f \${file} -l \${loc} -s \${md5} -b \${size} -o ${source} -c \${cookie})
echo LOG: output bag RID uploaded - \${outputBag_rid} >> ${repRID}.uploadOutputBag.log
rid=\${outputBag_rid}
else
exist=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
exist=\${exist:7:-6}
echo LOG: output bag RID already exists - \${exist} >> ${repRID}.uploadOutputBag.log
rid=\${exist}
fi
echo \${rid} > outputBagRID.csv
"""
}
// Extract output bag RID into channel
outputBagRID = Channel.create()
outputBagRID_fl.splitCsv(sep: ",", header: false).separate(
outputBagRID
)
workflow.onError = {
subject = "$workflow.manifest.name FAILED: $params.repRID"
......
File moved
......@@ -23,4 +23,4 @@ output <- merge(x=convert,y=countTable[,c("gene_name","gene_id","count","tpm")],
colnames(output) <- c("GENCODE_Gene_Symbol","NCBI_GeneID","Ensembl_GeneID","count","tpm")
output <- output[,c(1,3,2,4:5)]
write.table(output,file=paste0(opt$repRID,".tpmTable.csv"),sep=",",row.names=FALSE,quote=FALSE)
write.table(output,file=paste0(opt$repRID,"_tpmTable.csv"),sep=",",row.names=FALSE,quote=FALSE)
import argparse
from deriva.core import ErmrestCatalog, get_credential, BaseCLI
import sys
import csv
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('-r', '--RID', help="replicate RID", required=True)
parser.add_argument('-t', '--table', help="source table", choices=['mRNA_QC', 'Processed_File'], required=True)
parser.add_argument('-o', '--host', help="datahub host", required=True)
parser.add_argument('-c', '--cookie', help="cookie token", required=True)
args = parser.parse_args()
return args
def main(hostname, catalog_number, credential):
catalog = ErmrestCatalog('https', hostname, catalog_number, credential)
pb = catalog.getPathBuilder()
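# resolve the datapath for the table that holds the entry to be deleted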
if args.table == 'mRNA_QC':
run_table = pb.RNASeq.mRNA_QC
elif args.table == "Processed_File":
run_table = pb.RNASeq.Processed_File
path = run_table.filter(run_table.RID == args.RID)
path.delete()
rid = args.RID
print(rid + " deleted")
if __name__ == '__main__':
args = get_args()
cli = BaseCLI("Custom RNASeq query", None, 1)
cli.remove_options(["--config-file"])
host = args.host
credentials = {"cookie": args.cookie}
main(host, 2, credentials)
\ No newline at end of file
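# A minimal invocation sketch (RID and cookie values are illustrative),
# matching how uploadQC calls this script inside the pipeline:
#   python3 delete_entry.py -r 1A-2BCD -t mRNA_QC -o staging.gudmap.org -c "<webauthn cookie>"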
......@@ -38,7 +38,7 @@ SOFTWARE_REGEX = {
'Trim Galore!': ['version_trimgalore.txt', r"version (\S+)"],
'HISAT2': ['version_hisat2.txt', r"version (\S+)"],
'Samtools': ['version_samtools.txt', r"samtools (\S+)"],
'picard (MarkDuplicates)': ['version_markdups.txt', r"(\S\.\S{2}\.\S+)"],
'picard (MarkDuplicates)': ['version_markdups.txt', r"Version:(\S+)"],
'featureCounts': ['version_featurecounts.txt', r"featureCounts v(\S+)"],
'R': ['version_r.txt', r"R version (\S+)"],
'deepTools': ['version_deeptools.txt', r"deeptools (\S+)"],
......
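# The updated picard regex targets the first line of `picard MarkDuplicates
# --version` output, which looks like (sample line assumed): Version:2.23.9
#   grep -oP "Version:\K\S+" version_markdups.txt   # -> 2.23.9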