diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b14d0264fe388d2b717e6b02423f1a41acb0ff83..abe5e0b52c5ab9dbe33f8cce2864917df135c0a8 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -46,8 +46,8 @@ getRef: trimData: stage: unit script: - - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --basename Q-Y5JA_1M.se -j `nproc` ./test_data/fastq/small/Q-Y5JA_1M.R1.fastq.gz - - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename Q-Y5JA_1M.pe -j `nproc` ./test_data/fastq/small/Q-Y5JA_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5JA_1M.R2.fastq.gz + - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --basename Q-Y5JA_1M.se -j 20 ./test_data/fastq/small/Q-Y5JA_1M.R1.fastq.gz + - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename Q-Y5JA_1M.pe -j 20 ./test_data/fastq/small/Q-Y5JA_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5JA_1M.R2.fastq.gz - pytest -m trimData downsampleData: @@ -59,34 +59,37 @@ downsampleData: alignData: stage: unit script: - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p `nproc` --add-chrname --un-gz Q-Y5JA_1M.se.unal.gz -S Q-Y5JA_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5JA_1M_trimmed.fq.gz --summary-file ${repRID}.alignSummary.txt --new-summary - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o Q-Y5JA_1M.se.bam Q-Y5JA_1M.se.sam - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ `nproc` -O BAM -o Q-Y5JA_1M.se.sorted.bam Q-Y5JA_1M.se.bam - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ `nproc` -b Q-Y5JA_1M.se.sorted.bam Q-Y5JA_1M.se.sorted.bam.bai - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p `nproc` --add-chrname --un-gz Q-Y5JA_1M.pe.unal.gz -S Q-Y5JA_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./test_data/fastq/small/Q-Y5JA_1M_R1_val_1.fq.gz -2 ./test_data/fastq/small/Q-Y5JA_1M_R2_val_2.fq.gz --summary-file ${repRID}.alignSummary.txt --new-summary - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o Q-Y5JA_1M.pe.bam Q-Y5JA_1M.pe.sam - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ `nproc` -O BAM -o Q-Y5JA_1M.pe.sorted.bam Q-Y5JA_1M.pe.bam - - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ `nproc` -b Q-Y5JA_1M.pe.sorted.bam Q-Y5JA_1M.pe.sorted.bam.bai + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5JA_1M.se.unal.gz -S Q-Y5JA_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5JA_1M_trimmed.fq.gz --summary-file ${repRID}.alignSummary.txt --new-summary + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5JA_1M.se.bam Q-Y5JA_1M.se.sam + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5JA_1M.se.sorted.bam Q-Y5JA_1M.se.bam + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ 20 -b Q-Y5JA_1M.se.sorted.bam Q-Y5JA_1M.se.sorted.bam.bai + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5JA_1M.pe.unal.gz -S Q-Y5JA_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./test_data/fastq/small/Q-Y5JA_1M_R1_val_1.fq.gz -2 ./test_data/fastq/small/Q-Y5JA_1M_R2_val_2.fq.gz --summary-file ${repRID}.alignSummary.txt --new-summary + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5JA_1M.pe.bam Q-Y5JA_1M.pe.sam + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5JA_1M.pe.sorted.bam Q-Y5JA_1M.pe.bam + - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ 20 -b Q-Y5JA_1M.pe.sorted.bam Q-Y5JA_1M.pe.sorted.bam.bai - pytest -m alignData dedupData: stage: unit script: - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5JA_1M.se.sorted.bam O=Q-Y5JA_1M.se.deduped.bam M=Q-Y5JA_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true - - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools sort -@ `nproc` -O BAM -o Q-Y5JA_1M.se.sorted.deduped.bam ./test_data/bam/small/Q-Y5JA_1M.se.deduped.bam - - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools index -@ `nproc` -b ./test_data/bam/small/Q-Y5JA_1M.se.sorted.deduped.bam Q-Y5JA_1M.se.sorted.deduped.bam.bai + - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5JA_1M.se.sorted.deduped.bam ./test_data/bam/small/Q-Y5JA_1M.se.deduped.bam + - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools index -@ 20 -b ./test_data/bam/small/Q-Y5JA_1M.se.sorted.deduped.bam Q-Y5JA_1M.se.sorted.deduped.bam.bai + - for i in {"chr8","chr4","chrY"}; do + echo "samtools view -b Q-Y5JA_1M.se.sorted.deduped.bam ${i} > Q-Y5JA_1M.se.sorted.deduped.${i}.bam; samtools index -@ 20 -b Q-Y5JA_1M.se.sorted.deduped.${i}.bam Q-Y5JA_1M.se.sorted.deduped.${i}.bam.bai;"; + done | singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' parallel -j 20 -k - pytest -m dedupData makeBigWig: stage: unit script: - - singularity run 'docker://bicf/deeptools3.3:2.0.0' bamCoverage -p `nproc` -b ./test_data/bam/small/Q-Y5JA_1M.se.sorted.deduped.bam -o Q-Y5JA_1M.se.bw + - singularity run 'docker://bicf/deeptools3.3:2.0.0' bamCoverage -p 20 -b ./test_data/bam/small/Q-Y5JA_1M.se.sorted.deduped.bam -o Q-Y5JA_1M.se.bw - pytest -m makeBigWig makeFeatureCounts: stage: unit script: - - singularity run 'docker://bicf/subread2:2.0.0' featureCounts -R SAM -p -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -T `nproc` -s 1 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -o Q-Y5JA_1M.se.featureCounts -g 'gene_name' --primary --ignoreDup -B ./test_data/bam/small/Q-Y5JA_1M.se.sorted.deduped.bam + - singularity run 'docker://bicf/subread2:2.0.0' featureCounts -R SAM -p -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -T 20 -s 1 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -o Q-Y5JA_1M.se.featureCounts -g 'gene_name' --primary --ignoreDup -B ./test_data/bam/small/Q-Y5JA_1M.se.sorted.deduped.bam - singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count Q-Y5JA_1M.se.featureCounts - pytest -m makeFeatureCounts @@ -99,20 +102,23 @@ fastqc: inferMetadata: stage: unit script: - - singularity run 'docker://bicf/rseqc3.0:2.0.0' tin.py -i ./test_data/bam/small/Q-Y5JA_1M.se.sorted.deduped.bam -r /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed + - for i in {"chr8","chr4","chrY"}; do + echo " tin.py -i /project/BICF/BICF_Core/shared/gudmap/test_data/bam/small/Q-Y5JA_1M.se.sorted.deduped.${i}.bam -r /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed; cat Q-Y5JA_1M.se.sorted.deduped.${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \"\\t${i}\\t\";"; done | singularity run 'docker://bicf/rseqc3.0:2.0.0' parallel -j 20 -k > Q-Y5JA_1M.se.sorted.deduped.tin.xls - pytest -m inferMetadata integration_se: stage: integration script: + - hostname - ulimit -a - - nextflow run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 16-1ZX4 + - nextflow -bg run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 16-1ZX4 integration_pe: stage: integration script: + - hostname - ulimit -a - - nextflow run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5JA -with-dag dag.png + - nextflow -bg run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5JA -with-dag dag.png artifacts: name: "$CI_JOB_NAME" when: always diff --git a/README.md b/README.md index 9ae8e4d92c20e7f880c85c8784809a0f7cde506e..fbb700ef12c3d006870dd8378741af9c5a404ca1 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ |*master*|*develop*| |:-:|:-:| -|[](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/commits/master)|[](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/commits/develop)| +|[](https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/commits/master)|[](https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq/commits/develop)| <!-- [![DOI]()]() --> diff --git a/docs/dag.png b/docs/dag.png index 55e0e4781a59ea7159802b8922d9ff2baf411a29..f9c379f904ad2bc32b628556036517e98e77a165 100644 Binary files a/docs/dag.png and b/docs/dag.png differ diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index bd377933d123799cfe6cb55452a951af59788548..dc5a6261723f77fcb4b428a7b4c0107e8d9cb11f 100755 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -46,7 +46,7 @@ process { singularity { enabled = true - cacheDir = '/project/shared/bicf_workflow_ref/singularity_images/' + cacheDir = '/project/BICF/BICF_Core/shared/gudmap/singularity_cache/' } env { diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index 9ee3df735f602f97595e74d64cae5ab1cfc174c6..fe62dfe7137b6c529e64c47c7a1c677cb115df5f 100644 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -477,7 +477,7 @@ process alignData { path reference_alignData output: - tuple val ("${repRID}"), path ("${repRID}.sorted.bam"), path ("${repRID}.sorted.bam.bai") into rawBam + tuple path ("${repRID}.sorted.bam"), path ("${repRID}.sorted.bam.bai") into rawBam path ("*.alignSummary.txt") into alignQC path ("${repRID}.align.{out,err}") @@ -525,10 +525,11 @@ process dedupData { publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.dedup.{out,err}" input: - set val (repRID), path (inBam), path (inBai) from rawBam_dedupData + set path (inBam), path (inBai) from rawBam_dedupData output: tuple path ("${repRID}.sorted.deduped.bam"), path ("${repRID}.sorted.deduped.bam.bai") into dedupBam + tuple path ("${repRID}.sorted.deduped.*.bam"), path ("${repRID}.sorted.deduped.*.bam.bai") into dedupChrBam path ("*.deduped.Metrics.txt") into dedupQC path ("${repRID}.dedup.{out,err}") @@ -541,13 +542,15 @@ process dedupData { echo "LOG: running picard MarkDuplicates to remove duplicate reads" >> ${repRID}.dedup.err java -jar /picard/build/libs/picard.jar MarkDuplicates I=${inBam} O=${repRID}.deduped.bam M=${repRID}.deduped.Metrics.txt REMOVE_DUPLICATES=true 1>> ${repRID}.dedup.out 2>> ${repRID}.dedup.err - #Use SamTools to sort the now deduped bam file - echo "LOG: sorting the deduped bam file" >> ${repRID}.dedup.err - samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.deduped.bam ${repRID}.deduped.bam 1>> ${repRID}.dedup.out 2>> ${repRID}.dedup.err - - #Use SamTools to index the now sorted deduped bam file - echo "LOG: indexing the sorted deduped bam file" >> ${repRID}.dedup.err - samtools index -@ `nproc` -b ${repRID}.sorted.deduped.bam ${repRID}.sorted.deduped.bam.bai 1>> ${repRID}.dedup.out 2>> ${repRID}.dedup.err + # remove duplicated reads + java -jar /picard/build/libs/picard.jar MarkDuplicates I=${inBam} O=${repRID}.deduped.bam M=${repRID}.deduped.Metrics.txt REMOVE_DUPLICATES=true 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err + samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.deduped.bam ${repRID}.deduped.bam 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err + samtools index -@ `nproc` -b ${repRID}.sorted.deduped.bam ${repRID}.sorted.deduped.bam.bai 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err + # Split the deduped BAM file for multi-threaded tin calculation + for i in `samtools view ${repRID}.sorted.deduped.bam | cut -f3 | sort | uniq`; + do + echo "echo \"LOG: splitting each chromosome into its own BAM and BAI files with Samtools\" >> ${repRID}.dedup.err; samtools view -b ${repRID}.sorted.deduped.bam \${i} > ${repRID}.sorted.deduped.\${i}.bam; samtools index -@ `nproc` -b ${repRID}.sorted.deduped.\${i}.bam ${repRID}.sorted.deduped.\${i}.bam.bai" + done | parallel -j `nproc` -k 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err """ } @@ -672,6 +675,7 @@ process inferMetadata { path script_inferMeta path reference_inferMeta set path (inBam), path (inBai) from dedupBam_inferMeta + set path (inChrBam), path (inChrBai) from dedupChrBam output: path "infer.csv" into inferedMetadata @@ -701,7 +705,7 @@ process inferMetadata { percentF=`bash inferMeta.sh sef ${repRID}.rseqc.log` 1>> ${repRID}.rseqc.out 2>> ${repRID}.rseqc.err percentR=`bash inferMeta.sh ser ${repRID}.rseqc.log` 1>> ${repRID}.rseqc.out 2>> ${repRID}.rseqc.err fi - if [ \$percentF -gt 0.25 ] && [ \$percentR -lt 0.25 ] + if [ 1 -eq \$(echo \$(expr \$percentF ">" 0.25)) ] && [ 1 -eq \$(echo \$(expr \$percentR "<" 0.25)) ] then stranded="forward" if [ \$endness == "PairEnd" ] @@ -710,7 +714,7 @@ process inferMetadata { else strategy="++,--" fi - elif [ \$percentR -gt 0.25 ] && [ \$percentF -lt 0.25 ] + elif [ 1 -eq \$(echo \$(expr \$percentR ">" 0.25)) ] && [ 1 -eq \$(echo \$(expr \$percentF "<" 0.25)) ] then stranded="reverse" if [ \$endness == "PairEnd" ] @@ -725,8 +729,10 @@ process inferMetadata { fi echo -e "LOG: strategy set to \${strategy}\nStranded set to ${stranded}" >> ${repRID}.rseqc.err - # calcualte TIN values per feature - tin.py -i "${inBam}" -r ./bed/genome.bed 1>> ${repRID}.rseqc.out 2>> ${repRID}.rseqc.err + # calcualte TIN values per feature on each chromosome + for i in `cat ./bed/genome.bed | cut -f1 | sort | uniq`; do + echo "echo \"LOG: running tin.py on \${i}\" >> ${repRID}.rseqc.err; tin.py -i ${repRID}.sorted.deduped.\${i}.bam -r ./bed/genome.bed 1>>${repRID}.rseqc.log 2>>${repRID}.rseqc.err; cat ${repRID}.sorted.deduped.\${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \\\"\\\\t\${i}\\\\t\\\";"; + done | parallel -j `nproc` -k > ${repRID}.sorted.deduped.tin.xls 2>>${repRID}.rseqc.err # write infered metadata to file echo \${endness},\${stranded},\${strategy},\${percentF},\${percentR},\${fail} > infer.csv 2>> ${repRID}.rseqc.err diff --git a/workflow/tests/test_dedupReads.py b/workflow/tests/test_dedupReads.py index 5ca60ebd6c10957b0da697ee8cacf196fa75d525..459390d7b0dba21101118448e17dcc7728745469 100644 --- a/workflow/tests/test_dedupReads.py +++ b/workflow/tests/test_dedupReads.py @@ -11,5 +11,11 @@ data_output_path = os.path.dirname(os.path.abspath(__file__)) + \ @pytest.mark.dedupData def test_dedupData(): - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.sorted.deduped.bam')) - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.sorted.deduped.bam.bai')) + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.sorted.deduped.bam')) + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.sorted.deduped.bam.bai')) + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.sorted.deduped.chr8.bam')) + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.sorted.deduped.chr8.bam.bai')) + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.sorted.deduped.chr4.bam')) + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.sorted.deduped.chr4.bam.bai')) + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.sorted.deduped.chrY.bam')) + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5JA_1M.se.sorted.deduped.chrY.bam.bai'))