diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 3d66ceb48acf6e3fde51ccfdeb9261654fa3d6f8..658a53c878fac2c68a06e42334d57c05a8a58873 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -32,9 +32,21 @@ parseMetadata:
   - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p repRID
   - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p ends
   - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsManual
-  - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p stranded -e se
+  - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p stranded
   - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p species
 
+inferMetadata:
+  stage: unit
+  script:
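+  # fail the job early if an overall alignment rate cannot be parsed from the alignment summary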
+  - >
+    align=$(echo $(grep "Overall alignment rate" ./test_data/meta/Q-Y5JA_1M.se.alignSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) &&
+    if [[ ${align} == "" ]]; then exit 1; fi
+  - >
+    singularity run 'docker://bicf/rseqc3.0:2.0.0' infer_experiment.py -r "/project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed" -i "./test_data/bam/small/Q-Y5JA_1M.se.sorted.deduped.bam" 1>> Q-Y5JA_1M.se.inferMetadata.log &&
+    ended=`singularity run 'docker://bicf/python3:1.3' bash ./workflow/scripts/inferMeta.sh endness Q-Y5JA_1M.se.inferMetadata.log` &&
+    if [[ ${ended} == "" ]]; then exit 1; fi
+  - pytest -m inferMetadata
+
 getRef:
   stage: unit
   script:
@@ -50,14 +62,20 @@ trimData:
   - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename Q-Y5JA_1M.pe -j 20 ./test_data/fastq/small/Q-Y5JA_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5JA_1M.R2.fastq.gz
   - pytest -m trimData
 
+downsampleData:
+  stage: unit
+  script:
+  - singularity exec 'docker://bicf/seqtk:2.0.0' seqtk sample -s100 ./test_data/fastq/small/Q-Y5JA_1M_trimmed.fq.gz 1000 1> sampled.1.fq
+  - pytest -m downsampleData
+
 alignData:
   stage: unit
   script:
-  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5JA_1M.se.unal.gz -S Q-Y5JA_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5JA_1M_trimmed.fq.gz --summary-file ${repRID}.alignSummary.txt --new-summary
+  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5JA_1M.se.unal.gz -S Q-Y5JA_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5JA_1M_trimmed.fq.gz --summary-file Q-Y5JA_1M.se.alignSummary.txt --new-summary
   - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5JA_1M.se.bam Q-Y5JA_1M.se.sam
   - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5JA_1M.se.sorted.bam Q-Y5JA_1M.se.bam
   - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ 20 -b Q-Y5JA_1M.se.sorted.bam Q-Y5JA_1M.se.sorted.bam.bai
-  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5JA_1M.pe.unal.gz -S Q-Y5JA_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./test_data/fastq/small/Q-Y5JA_1M_R1_val_1.fq.gz -2 ./test_data/fastq/small/Q-Y5JA_1M_R2_val_2.fq.gz --summary-file ${repRID}.alignSummary.txt --new-summary
+  - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' hisat2 -p 20 --add-chrname --un-gz Q-Y5JA_1M.pe.unal.gz -S Q-Y5JA_1M.pe.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness FR --no-mixed --no-discordant -1 ./test_data/fastq/small/Q-Y5JA_1M_R1_val_1.fq.gz -2 ./test_data/fastq/small/Q-Y5JA_1M_R2_val_2.fq.gz --summary-file Q-Y5JA_1M.pe.alignSummary.txt --new-summary
   - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5JA_1M.pe.bam Q-Y5JA_1M.pe.sam
   - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5JA_1M.pe.sorted.bam Q-Y5JA_1M.pe.bam
   - singularity run 'docker://bicf/gudmaprbkaligner:2.0.0' samtools index -@ 20 -b Q-Y5JA_1M.pe.sorted.bam Q-Y5JA_1M.pe.sorted.bam.bai
@@ -69,36 +87,39 @@ dedupData:
   - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5JA_1M.se.sorted.bam O=Q-Y5JA_1M.se.deduped.bam M=Q-Y5JA_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true
   - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5JA_1M.se.sorted.deduped.bam ./test_data/bam/small/Q-Y5JA_1M.se.deduped.bam
   - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools index -@ 20 -b ./test_data/bam/small/Q-Y5JA_1M.se.sorted.deduped.bam Q-Y5JA_1M.se.sorted.deduped.bam.bai
-  - for i in {"chr8","chr4","chrY"}; do
+  - >
+    for i in {"chr8","chr4","chrY"}; do
     echo "samtools view -b Q-Y5JA_1M.se.sorted.deduped.bam ${i} > Q-Y5JA_1M.se.sorted.deduped.${i}.bam; samtools index -@ 20 -b Q-Y5JA_1M.se.sorted.deduped.${i}.bam Q-Y5JA_1M.se.sorted.deduped.${i}.bam.bai;"; done | singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' parallel -j 20 -k
   - pytest -m dedupData
 
-makeBigWig:
-  stage: unit
-  script:
-  - singularity run 'docker://bicf/deeptools3.3:2.0.0' bamCoverage -p 20 -b ./test_data/bam/small/Q-Y5JA_1M.se.sorted.deduped.bam -o Q-Y5JA_1M.se.bw
-  - pytest -m makeBigWig
-makeFeatureCounts:
+countData:
   stage: unit
   script:
   - singularity run 'docker://bicf/subread2:2.0.0' featureCounts -R SAM -p -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -T 20 -s 1 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -o Q-Y5JA_1M.se.featureCounts -g 'gene_name' --primary --ignoreDup -B ./test_data/bam/small/Q-Y5JA_1M.se.sorted.deduped.bam
   - singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count Q-Y5JA_1M.se.featureCounts
   - pytest -m makeFeatureCounts
 
+makeBigWig:
+  stage: unit
+  script:
+  - singularity run 'docker://bicf/deeptools3.3:2.0.0' bamCoverage -p 20 -b ./test_data/bam/small/Q-Y5JA_1M.se.sorted.deduped.bam -o Q-Y5JA_1M.se.bw
+  - pytest -m makeBigWig
+
 fastqc:
   stage: unit
   script:
   - singularity run 'docker://bicf/fastqc:2.0.0' ./test_data/fastq/small/Q-Y5JA_1M.R1.fastq.gz -o .
   - pytest -m fastqc
 
-inferMetadata:
+dataQC:
   stage: unit
   script:
+  - echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > Q-Y5JA_1M.se.sorted.deduped.tin.xls
   - for i in {"chr8","chr4","chrY"}; do
-    echo " tin.py -i /project/BICF/BICF_Core/shared/gudmap/test_data/bam/small/Q-Y5JA_1M.se.sorted.deduped.${i}.bam -r /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed; cat Q-Y5JA_1M.se.sorted.deduped.${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \"\\t${i}\\t\";"; done | singularity run 'docker://bicf/rseqc3.0:2.0.0' parallel -j 20 -k > Q-Y5JA_1M.se.sorted.deduped.tin.xls
-  - pytest -m inferMetadata
+    echo "tin.py -i /project/BICF/BICF_Core/shared/gudmap/test_data/bam/small/Q-Y5JA_1M.se.sorted.deduped.${i}.bam -r /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/bed/genome.bed; cat Q-Y5JA_1M.se.sorted.deduped.${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \"\\t${i}\\t\";"; done | singularity run 'docker://bicf/rseqc3.0:2.0.0' parallel -j 20 -k >> Q-Y5JA_1M.se.sorted.deduped.tin.xls
+  - pytest -m dataQC
+
 integration_se:
   stage: integration
diff --git a/README.md b/README.md
index fbb700ef12c3d006870dd8378741af9c5a404ca1..e2a78488d3dc1f4e605f593253697c28588a2909 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ RNA-Seq Analytic Pipeline for GUDMAP/RBK
 Introduction
 ------------
 
-This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.
+This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub. It is designed to run on the HPC cluster ([BioHPC](https://portal.biohpc.swmed.edu)) at UT Southwestern Medical Center (in conjunction with the standard nextflow profile: config `biohpc.config`).
 
 
 
@@ -21,18 +21,19 @@ This pipeline is also capable of being run on AWS.
 To do so:
 * Replace workDir with the S3 bucket generated
 * Change region if different
 * Change queue to the aws batch queue generated
-* The user must have awscli configured with an appropriate authentication (with ```aws configure``` and access keys) in the environment which nextflow will be run
-* Add ```-profile ``` with the name aws config which was customized
+* The user must have awscli configured with an appropriate authentication (with `aws configure` and access keys) in the environment in which nextflow will be run
+* Add `-profile` with the name of the customized aws config
 
 To Run:
 -------
 * Available parameters:
-  * ```--deriva``` active **credential.json** file from [deriva-auth](https://github.com/informatics-isi-edu/gudmap-rbk/wiki/Uploading-files-via-Deriva-client-tools#from-a-remote-server)
-  * ```--bdbag``` active **cookies.txt** file from [deriva-auth](https://github.com/informatics-isi-edu/gudmap-rbk/wiki/Uploading-files-via-Deriva-client-tools#from-a-remote-server)
-  * ```--repRID``` mRNA-seq replicate RID
-  * ```--refMoVersion``` mouse reference version ***(optional)***
-  * ```--refHuVersion``` human reference version ***(optional)***
-  * ```-profile``` config profile to use: standard = local processes on BioHPC (default), biohpc = BioHPC cluster, aws_ondemand = AWS Batch on-demand instant requests, aws_spot = AWS Batch spot instance requests ***(optional)***
+  * `--deriva` active **credential.json** file from [deriva-auth](https://github.com/informatics-isi-edu/gudmap-rbk/wiki/Uploading-files-via-Deriva-client-tools#from-a-remote-server)
+  * `--bdbag` active **cookies.txt** file from [deriva-auth](https://github.com/informatics-isi-edu/gudmap-rbk/wiki/Uploading-files-via-Deriva-client-tools#from-a-remote-server)
+  * `--repRID` mRNA-seq replicate RID
+  * `--refMoVersion` mouse reference version ***(optional)***
+  * `--refHuVersion` human reference version ***(optional)***
+  * `--refERCCVersion` ERCC reference version ***(optional)***
+  * `-profile` config profile to use: standard = processes on BioHPC cluster, aws_ondemand = AWS Batch on-demand instant requests, aws_spot = AWS Batch spot instance requests ***(optional)***
 * NOTES:
   * once deriva-auth is run and authenticated, the two files above are saved in ```~/.deriva/``` (see official documents from [deriva](https://github.com/informatics-isi-edu/deriva-client#installer-packages-for-windows-and-macosx) on the lifetime of the credentials)
   * reference version consists of Genome Reference Consortium version, patch release and GENCODE annotation release # (leaving the params blank will use the default version tied to the pipeline version)
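+
+A typical BioHPC invocation might look like the following (the replicate RID and credential paths are illustrative; substitute your own):
+```
+nextflow run workflow/rna-seq.nf --deriva ~/.deriva/credential.json --bdbag ~/.deriva/cookies.txt --repRID Q-Y5JA -profile biohpc
+```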
@@ -96,4 +97,4 @@ Please cite in publications: Pipeline was developed by BICF from funding provide
 
 Pipeline Directed Acyclic Graph
 -------------------------------
-![dag](docs/dag.png "DAG")
\ No newline at end of file
+![dag](docs/dag.png "DAG")
diff --git a/docs/dag.png b/docs/dag.png
index f9c379f904ad2bc32b628556036517e98e77a165..1fedc931294b459b15780cc408faf38949a83986 100644
Binary files a/docs/dag.png and b/docs/dag.png differ
diff --git a/workflow/conf/aws_ondemand.config b/workflow/conf/aws_ondemand.config
index 0c8b6ff0fd242cda9ca2fb43473e82f99547efef..79c5fc6d9431c377e4ac3ed16fde26f2192ded56 100755
--- a/workflow/conf/aws_ondemand.config
+++ b/workflow/conf/aws_ondemand.config
@@ -1,4 +1,4 @@
-workDir = 's3://gudmap.rbk/work'
+workDir = 's3://'
 aws.client.storageEncryption = 'AES256'
 aws {
   region = 'us-east-2'
@@ -9,29 +9,7 @@ aws {
 process {
   executor = 'awsbatch'
-  queue = 'highpriority-3278a8b0-1fc8-11ea-b1ac-021e2396e2cc'
+  queue = 'highpriority-'
   cpus = 1
   memory = '2 GB'
-
-  withName:parseMetadata {
-    cpus = 5
-  }
-  withName:getRef {
-    cpus = 8
-  }
-  withName:trimData {
-    cpus = 8
-    memory = '2 GB'
-  }
-  withName:alignData {
-    cpus = 50
-    memory = '5 GB'
-  }
-  withName:dedupData {
-    cpus = 2
-    memory = '20 GB'
-  }
-  withName:fastqc {
-    memory = '5 GB'
-  }
-}
\ No newline at end of file
+}
diff --git a/workflow/conf/aws_spot.config b/workflow/conf/aws_spot.config
index 4708298a63ced25cd74edf2646fc3c67a0a7753c..f1935697bb1c1c7e6aeedf310d5da2f38da1811c 100755
--- a/workflow/conf/aws_spot.config
+++ b/workflow/conf/aws_spot.config
@@ -1,4 +1,4 @@
-workDir = 's3://gudmap.rbk/work'
+workDir = 's3://'
 aws.client.storageEncryption = 'AES256'
 aws {
   region = 'us-east-2'
@@ -9,29 +9,7 @@ aws {
 process {
   executor = 'awsbatch'
-  queue = 'default-3278a8b0-1fc8-11ea-b1ac-021e2396e2cc'
+  queue = 'default-'
   cpus = 1
   memory = '2 GB'
-
-  withName:parseMetadata {
-    cpus = 5
-  }
-  withName:getRef {
-    cpus = 8
-  }
-  withName:trimData {
-    cpus = 8
-    memory = '2 GB'
-  }
-  withName:alignData {
-    cpus = 50
-    memory = '5 GB'
-  }
-  withName:dedupData {
-    cpus = 2
-    memory = '20 GB'
-  }
-  withName:fastq {
-    memory = '5 GB'
-  }
 }
diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config
index ed78dbe46c81d516caf2ffbdd19f024fc2dbe29d..9448731eec9d06e48bb0185188474734c38b1b00 100755
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -3,34 +3,49 @@ process {
   queue = 'super'
   clusterOptions = '--hold'
 
-  withName:getBag {
+  withName: getBag {
     executor = 'local'
   }
-  withName:getData {
+  withName: getData {
     executor = 'local'
   }
-  withName:parseMetadata {
+  withName: parseMetadata {
    executor = 'local'
   }
-  withName:getRef {
+  withName: trimData {
+    queue = 'super'
+  }
+  withName: getRefInfer {
+    executor = 'local'
+  }
+  withName: downsampleData {
     executor = 'local'
   }
-  withName:trimData {
+  withName: alignSampleData {
+    queue = 'super'
+  }
+  withName: inferMetadata {
     queue = 'super'
   }
-  withName:alignData {
+  withName: getRef {
+    executor = 'local'
+  }
+  withName: alignData {
     queue = '256GB,256GBv1'
   }
   withName: dedupData {
     queue = 'super'
   }
-  withName:fastqc {
+  withName: countData {
+    queue = 'super'
+  }
+  withName: makeBigWig {
     queue = 'super'
   }
-  withName:inferMetadata {
+  withName: fastqc {
     queue = 'super'
   }
-  withName: makeBigWig {
+  withName: dataQC {
     queue = 'super'
   }
 }
diff --git a/workflow/nextflow.config b/workflow/nextflow.config
index f0be347ef28c9306abe43974d48464484f48b5c4..0888063278039f3e3e7e280735670cdb94852fd4 100644
--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
@@ -20,30 +20,42 @@ process {
   withName: parseMetadata {
     container = 'bicf/python3:1.3'
   }
-  withName: getRef {
-    container = 'bicf/awscli:1.1'
-  }
   withName: trimData {
     container = 'bicf/trimgalore:1.1'
   }
+  withName: getRefInfer {
+    container = 'bicf/awscli:1.1'
+  }
+  withName: downsampleData {
+    container = 'bicf/seqtk:2.0.0'
+  }
+  withName: alignSampleData {
+    container = 'bicf/gudmaprbkaligner:2.0.0'
+  }
+  withName: inferMetadata {
+    container = 'bicf/rseqc3.0:2.0.0'
+  }
+  withName: getRef {
+    container = 'bicf/awscli:1.1'
+  }
   withName: alignData {
     container = 'bicf/gudmaprbkaligner:2.0.0'
   }
   withName: dedupData {
     container = 'bicf/gudmaprbkdedup:2.0.0'
   }
+  withName: countData {
+    container = 'bicf/subread2:2.0.0'
+  }
   withName: makeBigWig {
     container = 'bicf/deeptools3.3:2.0.0'
   }
   withName: fastqc {
     container = 'bicf/fastqc:2.0.0'
   }
-  withName:inferMetadata{
+  withName: dataQC {
     container = 'bicf/rseqc3.0:2.0.0'
   }
-
-  withName: makeFeatureCounts {
-    container = 'bicf/subread2:2.0.0'
-  }
 }
 
 trace {
diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf
index b082b15649cde7c8559edcbc8dd91fdb15ea00ef..615c68e9e902ffa3ebb9c6d934168050058aa56c 100644
--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
@@ -15,6 +15,7 @@ params.bdbag = "${baseDir}/../test_data/auth/cookies.txt"
 params.repRID = "Q-Y5JA"
 params.refMoVersion = "38.p6.vM22"
 params.refHuVersion = "38.p12.v31"
+params.refERCCVersion = "92"
 params.outDir = "${baseDir}/../output"
 
 // Parse input variables
@@ -27,6 +28,7 @@ bdbag = Channel
 repRID = params.repRID
 refMoVersion = params.refMoVersion
 refHuVersion = params.refHuVersion
+refERCCVersion = params.refERCCVersion
 outDir = params.outDir
 logsDir = "${outDir}/Logs"
 
@@ -34,6 +36,7 @@ logsDir = "${outDir}/Logs"
 derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json")
 //referenceBase = "s3://bicf-references"
 referenceBase = "/project/BICF/BICF_Core/shared/gudmap/references"
+referenceInfer = Channel.fromList(["ERCC","GRCh","GRCm"])
 
 // Define script files
 script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh")
@@ -115,7 +118,7 @@ process getData {
   """
 }
 
-// Replicate raw fastqs for multiple process inputs
+// Replicate raw fastq's for multiple process inputs
 fastqs.into {
   fastqs_trimData
   fastqs_fastqc
@@ -130,7 +133,6 @@ process parseMetadata {
 
   input:
     path script_parseMeta
-    val repRID
     path fileMeta
     path experimentSettingsMeta
    path experimentMeta
@@ -157,7 +159,7 @@ process parseMetadata {
     echo "LOG: endedness manually detected: \${endsManual}" >> ${repRID}.parseMetadata.err
 
     # Get strandedness metadata
-    stranded=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p stranded -e \${endsManual})
+    stranded=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p stranded)
     echo "LOG: strandedness metadata parsed: \${stranded}" >> ${repRID}.parseMetadata.err
 
     # Get spike-in metadata
@@ -169,52 +171,354 @@ process parseMetadata {
     echo "LOG: species metadata parsed: \${species}" >> ${repRID}.parseMetadata.err
 
     # Save design file
-    echo "\${rep},\${endsMeta},\${endsManual},\${stranded},\${spike},\${species}" > design.csv
+    echo "\${endsMeta},\${endsManual},\${stranded},\${spike},\${species}" > design.csv
     """
 }
 
 // Split metadata into separate channels
-rep = Channel.create()
 endsMeta = Channel.create()
 endsManual = Channel.create()
-stranded = Channel.create()
-spike = Channel.create()
-species = Channel.create()
+strandedMeta = Channel.create()
+spikeMeta = Channel.create()
+speciesMeta = Channel.create()
 metadata.splitCsv(sep: ",", header: false).separate(
-  rep,
   endsMeta,
   endsManual,
-  stranded,
-  spike,
-  species
+  strandedMeta,
+  spikeMeta,
+  speciesMeta
 )
 
 // Replicate metadata for multiple process inputs
 endsManual.into {
   endsManual_trimData
-  endsManual_alignData
-  endsManual_featureCounts
+  endsManual_downsampleData
+  endsManual_alignSampleData
 }
-stranded.into {
-  stranded_alignData
-  stranded_featureCounts
+
+
+/*
+ * trimData: trims any adapter or non-host sequences from the data
+*/
+process trimData {
+  tag "${repRID}"
+  publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.trimData.{out,err}"
+
+  input:
+    val ends from endsManual_trimData
+    path (fastq) from fastqs_trimData
+
+  output:
+    path ("*.fq.gz") into fastqsTrim
+    path ("*_trimming_report.txt") into trimQC
+    path ("${repRID}.trimData.{out,err}")
+
+  script:
+    """
+    hostname > ${repRID}.trimData.err
+    ulimit -a >> ${repRID}.trimData.err
+
+    #Trim fastq's using trim_galore
+    if [ "${ends}" == "se" ]
+    then
+      echo "LOG: running trim_galore using single-end settings" >> ${repRID}.trimData.err
+      trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID} -j `nproc` ${fastq[0]} 1>> ${repRID}.trimData.out 2>> ${repRID}.trimData.err
+    elif [ "${ends}" == "pe" ]
+    then
+      echo "LOG: running trim_galore using paired-end settings" >> ${repRID}.trimData.err
+      trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]} 1>> ${repRID}.trimData.out 2>> ${repRID}.trimData.err
+    fi
+    """
 }
-spike.into{
-  spike_getRef
+
+// Replicate trimmed fastq's
+fastqsTrim.into {
+  fastqsTrim_alignData
+  fastqsTrim_downsampleData
 }
-species.into {
-  species_getRef
+
+/*
+ * getRefInfer: Downloads appropriate reference for metadata inference
+*/
+process getRefInfer {
+  tag "${refName}"
+  publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.getRefInfer.{out,err}"
+
+  input:
+    val refName from referenceInfer
+
+  output:
+    tuple val (refName), path ("hisat2", type: 'dir'), path ("*.fna"), path ("*.gtf") into refInfer
+    path ("${refName}", type: 'dir') into bedInfer
+    path ("${repRID}.getRefInfer.{out,err}")
+
+  script:
+    """
+    hostname > ${repRID}.getRefInfer.err
+    ulimit -a >> ${repRID}.getRefInfer.err
+    export https_proxy=\${http_proxy}
+
+    #Set the reference name
+    if [ "${refName}" == "ERCC" ]
+    then
+      references=\$(echo ${referenceBase}/ERCC${refERCCVersion})
+    elif [ "${refName}" == "GRCm" ]
+    then
+      references=\$(echo ${referenceBase}/GRCm${refMoVersion})
+    elif [ '${refName}' == "GRCh" ]
+    then
+      references=\$(echo ${referenceBase}/GRCh${refHuVersion})
+    else
+      echo -e "LOG: ERROR - References could not be set!\nReference found: ${referenceBase}" >> ${repRID}.getRefInfer.err
+      exit 1
+    fi
+    mkdir ${refName}
+
+    #Retrieve reference files from the appropriate location
+    if [ ${referenceBase} == "s3://bicf-references" ]
+    then
+      echo "LOG: grabbing reference files from S3" >> ${repRID}.getRefInfer.err
+      aws s3 cp "\${references}"/hisat2 ./ --recursive 1>> ${repRID}.getRefInfer.out 2>> ${repRID}.getRefInfer.err
+      aws s3 cp "\${references}"/bed ./${refName}/ --recursive 1>> ${repRID}.getRefInfer.out 2>> ${repRID}.getRefInfer.err
+      aws s3 cp "\${references}"/*.fna --recursive 1>> ${repRID}.getRefInfer.out 2>> ${repRID}.getRefInfer.err
+      aws s3 cp "\${references}"/*.gtf --recursive 1>> ${repRID}.getRefInfer.out 2>> ${repRID}.getRefInfer.err
+    elif [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references" ]
+    then
+      echo "LOG: using pre-defined locations for reference files" >> ${repRID}.getRefInfer.err
+      ln -s "\${references}"/hisat2 1>> ${repRID}.getRefInfer.out 2>> ${repRID}.getRefInfer.err
+      ln -s "\${references}"/bed ${refName}/bed 1>> ${repRID}.getRefInfer.out 2>> ${repRID}.getRefInfer.err
+      ln -s "\${references}"/genome.fna 1>> ${repRID}.getRefInfer.out 2>> ${repRID}.getRefInfer.err
+      ln -s "\${references}"/genome.gtf 1>> ${repRID}.getRefInfer.out 2>> ${repRID}.getRefInfer.err
+    fi
+
+    #Make blank bed folder for ERCC
+    if [ "${refName}" == "ERCC" ]
+    then
+      rm ${refName}/bed
+      mkdir ${refName}/bed
+    fi
+    """
+}
+
+/*
+ * downsampleData: downsample fastq's for metadata inference
+ */
+process downsampleData {
+  tag "${repRID}"
+  publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.downsampleData.{out,err}"
+
+  input:
+    val ends from endsManual_downsampleData
+    path fastq from fastqsTrim_downsampleData
+
+  output:
+    path ("sampled.1.fq") into fastqs1Sample
+    path ("sampled.2.fq") optional true into fastqs2Sample
+    path ("${repRID}.downsampleData.{out,err}")
+
+  script:
+    """
+    hostname > ${repRID}.downsampleData.err
+    ulimit -a >> ${repRID}.downsampleData.err
+    export https_proxy=\${http_proxy}
+
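+    # use the same fixed seed (-s100) for both mates so paired reads stay in sync after sampling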
+    if [ "${ends}" == "se" ]
+    then
+      echo "LOG: downsampling single-end trimmed fastq" >> ${repRID}.downsampleData.err
+      seqtk sample -s100 *trimmed.fq.gz 100000 1> sampled.1.fq 2>> ${repRID}.downsampleData.err
+    elif [ "${ends}" == "pe" ]
+    then
+      echo "LOG: downsampling read 1 of paired-end trimmed fastq" >> ${repRID}.downsampleData.err
+      seqtk sample -s100 *1.fq.gz 1000000 1> sampled.1.fq 2>> ${repRID}.downsampleData.err
+      echo "LOG: downsampling read 2 of paired-end trimmed fastq" >> ${repRID}.downsampleData.err
+      seqtk sample -s100 *2.fq.gz 1000000 1> sampled.2.fq 2>> ${repRID}.downsampleData.err
+    fi
+    """
+}
+
+// Replicate the downsampled fastq's and attach them to the references
+inferInput = endsManual_alignSampleData.combine(refInfer.combine(fastqs1Sample.collect().combine(fastqs2Sample.collect())))
+
+/*
+ * alignSampleData: aligns the downsampled reads to a reference database
+*/
+process alignSampleData {
+  tag "${ref}"
+  publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.${ref}.alignSampleData.{out,err}"
+
+  input:
+    tuple val (ends), val (ref), path (hisat2), path (fna), path (gtf), path (fastq1), path (fastq2) from inferInput
+
+  output:
+    path ("${ref}.sampled.sorted.bam") into sampleBam
+    path ("${ref}.sampled.sorted.bam.bai") into sampleBai
+    path ("${ref}.alignSampleSummary.txt") into alignSampleQC
+    path ("${repRID}.${ref}.alignSampleData.{out,err}")
+
+  script:
+    """
+    hostname > ${repRID}.${ref}.alignSampleData.err
+    ulimit -a >> ${repRID}.${ref}.alignSampleData.err
+
+    #Align the reads with Hisat 2
+    if [ "${ends}" == "se" ]
+    then
+      echo "LOG: running Hisat2 with single-end settings" >> ${repRID}.${ref}.alignSampleData.err
+      hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome -U ${fastq1} --summary-file ${ref}.alignSampleSummary.txt --new-summary 1>> ${repRID}.${ref}.alignSampleData.out 2>> ${repRID}.${ref}.alignSampleData.err
+    elif [ "${ends}" == "pe" ]
+    then
+      echo "LOG: running Hisat2 with paired-end settings" >> ${repRID}.${ref}.alignSampleData.err
+      hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome --no-mixed --no-discordant -1 ${fastq1} -2 ${fastq2} --summary-file ${ref}.alignSampleSummary.txt --new-summary 1>> ${repRID}.${ref}.alignSampleData.out 2>> ${repRID}.${ref}.alignSampleData.err
+    fi
+
+    #Convert the output sam file to a sorted bam file using Samtools
+    echo "LOG: converting from sam to bam" >> ${repRID}.${ref}.alignSampleData.err
+    samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${ref}.sampled.bam ${ref}.sampled.sam 1>> ${repRID}.${ref}.alignSampleData.out 2>> ${repRID}.${ref}.alignSampleData.err;
+
+    #Sort the bam file using Samtools
+    echo "LOG: sorting the bam file" >> ${repRID}.${ref}.alignSampleData.err
+    samtools sort -@ `nproc` -O BAM -o ${ref}.sampled.sorted.bam ${ref}.sampled.bam 1>> ${repRID}.${ref}.alignSampleData.out 2>> ${repRID}.${ref}.alignSampleData.err;
+
+    #Index the sorted bam using Samtools
+    echo "LOG: indexing sorted bam file" >> ${repRID}.${ref}.alignSampleData.err
+    samtools index -@ `nproc` -b ${ref}.sampled.sorted.bam ${ref}.sampled.sorted.bam.bai 1>> ${repRID}.${ref}.alignSampleData.out 2>> ${repRID}.${ref}.alignSampleData.err;
+    """
+}
+
+process inferMetadata {
+  tag "${repRID}"
+  publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.inferMetadata.{out,err}"
+
+  input:
+    path script_inferMeta
+    path beds from bedInfer.collect()
+    path bam from sampleBam.collect()
+    path bai from sampleBai.collect()
+    path alignSummary from alignSampleQC.collect()
+
+  output:
+    path "infer.csv" into inferMetadata
+    path "${repRID}.inferMetadata.{out,err}" optional true
+
+  script:
+    """
+    hostname > ${repRID}.inferMetadata.err
+    ulimit -a >> ${repRID}.inferMetadata.err
+
+    # collect alignment rates
+    align_ercc=\$(echo \$(grep "Overall alignment rate" ERCC.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%'))
+    align_hu=\$(echo \$(grep "Overall alignment rate" GRCh.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%'))
+    align_mo=\$(echo \$(grep "Overall alignment rate" GRCm.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%'))
+
+    # determine spike-in
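+    # the spike-in is called present when the ERCC overall alignment rate is at least 0.1%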
"${repRID}.inferMetadata.{out,err}" + + input: + path script_inferMeta + path beds from bedInfer.collect() + path bam from sampleBam.collect() + path bai from sampleBai.collect() + path alignSummary from alignSampleQC.collect() + + output: + path "infer.csv" into inferMetadata + path "${repRID}.inferMetadata.{out,err}" optional true + + script: + """ + hostname > ${repRID}.inferMetadata.err + ulimit -a >> ${repRID}.inferMetadata.err + + # collect alignment rates + align_ercc=\$(echo \$(grep "Overall alignment rate" ERCC.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) + align_hu=\$(echo \$(grep "Overall alignment rate" GRCh.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) + align_mo=\$(echo \$(grep "Overall alignment rate" GRCm.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) + + # determine spike-in + if [ 1 -eq \$(echo \$(expr \${align_ercc} ">=" 0.1)) ] + then + spike="yes" + else + spike="no" + fi + echo -e "LOG: Inference of strandedness results in: \${spike}" >> ${repRID}.inferMetadata.err + + # determine species + if [ 1 -eq \$(echo \$(expr \${align_hu} ">=" 25)) ] && [ 1 -eq \$(echo \$(expr \${align_mo} "<" 25)) ] + then + species="Homo sapiens" + bam="GRCh.sampled.sorted.bam" + bed="./GRCh/bed/genome.bed" + elif [ 1 -eq \$(echo \$(expr \${align_mo} ">=" 25)) ] && [ 1 -eq \$(echo \$(expr \${align_hu} "<" 25)) ] + then + species="Mus musculus" + bam="GRCm.sampled.sorted.bam" + bed="./GRCm/bed/genome.bed" + else + echo "LOG: ERROR - Inference of species returns an ambiguous result" >> ${repRID}.inferMetadata.err + exit 1 + fi + echo -e "LOG: Inference of species results in: \${species}" >> ${repRID}.inferMetadata.err + + # infer experimental setting from dedup bam + echo "LOG: infer experimental setting from dedup bam" >> ${repRID}.inferMetadata.err + infer_experiment.py -r "\${bed}" -i "\${bam}" 1>> ${repRID}.inferMetadata.log 2>> ${repRID}.inferMetadata.err + + echo "LOG: determining endedness and strandedness from file" >> ${repRID}.inferMetadata.err + ended=`bash inferMeta.sh endness ${repRID}.inferMetadata.log` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err + fail=`bash inferMeta.sh fail ${repRID}.inferMetadata.log` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err + if [ \${ended} == "PairEnd" ] + then + ends="pe" + percentF=`bash inferMeta.sh pef ${repRID}.inferMetadata.log` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err + percentR=`bash inferMeta.sh per ${repRID}.inferMetadata.log` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err + elif [ \${ended} == "SingleEnd" ] + then + ends="se" + percentF=`bash inferMeta.sh sef ${repRID}.inferMetadata.log` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err + percentR=`bash inferMeta.sh ser ${repRID}.inferMetadata.log` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err + fi + if [ 1 -eq \$(echo \$(expr \${percentF} ">" 0.25)) ] && [ 1 -eq \$(echo \$(expr \${percentR} "<" 0.25)) ] + then + stranded="forward" + elif [ 1 -eq \$(echo \$(expr \${percentR} ">" 0.25)) ] && [ 1 -eq \$(echo \$(expr \${percentF} "<" 0.25)) ] + then + stranded="reverse" + + else + stranded="unstranded" + fi + echo -e "LOG: stradedness set to \${stranded}" >> ${repRID}.inferMetadata.err + + # write infered metadata to file + echo "\${ends},\${stranded},\${spike},\${species},\${align_ercc},\${align_hu},\${align_mo},\${percentF},\${percentR},\${fail}" 1>> infer.csv 2>> ${repRID}.inferMetadata.err + """ +} + +// 
+    if [ 1 -eq \$(echo \$(expr \${align_hu} ">=" 25)) ] && [ 1 -eq \$(echo \$(expr \${align_mo} "<" 25)) ]
+    then
+      species="Homo sapiens"
+      bam="GRCh.sampled.sorted.bam"
+      bed="./GRCh/bed/genome.bed"
+    elif [ 1 -eq \$(echo \$(expr \${align_mo} ">=" 25)) ] && [ 1 -eq \$(echo \$(expr \${align_hu} "<" 25)) ]
+    then
+      species="Mus musculus"
+      bam="GRCm.sampled.sorted.bam"
+      bed="./GRCm/bed/genome.bed"
+    else
+      echo "LOG: ERROR - Inference of species returns an ambiguous result" >> ${repRID}.inferMetadata.err
+      exit 1
+    fi
+    echo -e "LOG: Inference of species results in: \${species}" >> ${repRID}.inferMetadata.err
+
+    # infer experimental setting from dedup bam
+    echo "LOG: infer experimental setting from dedup bam" >> ${repRID}.inferMetadata.err
+    infer_experiment.py -r "\${bed}" -i "\${bam}" 1>> ${repRID}.inferMetadata.log 2>> ${repRID}.inferMetadata.err
+
+    echo "LOG: determining endedness and strandedness from file" >> ${repRID}.inferMetadata.err
+    ended=`bash inferMeta.sh endness ${repRID}.inferMetadata.log` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err
+    fail=`bash inferMeta.sh fail ${repRID}.inferMetadata.log` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err
+    if [ \${ended} == "PairEnd" ]
+    then
+      ends="pe"
+      percentF=`bash inferMeta.sh pef ${repRID}.inferMetadata.log` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err
+      percentR=`bash inferMeta.sh per ${repRID}.inferMetadata.log` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err
+    elif [ \${ended} == "SingleEnd" ]
+    then
+      ends="se"
+      percentF=`bash inferMeta.sh sef ${repRID}.inferMetadata.log` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err
+      percentR=`bash inferMeta.sh ser ${repRID}.inferMetadata.log` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err
+    fi
+    if [ 1 -eq \$(echo \$(expr \${percentF} ">" 0.25)) ] && [ 1 -eq \$(echo \$(expr \${percentR} "<" 0.25)) ]
+    then
+      stranded="forward"
+    elif [ 1 -eq \$(echo \$(expr \${percentR} ">" 0.25)) ] && [ 1 -eq \$(echo \$(expr \${percentF} "<" 0.25)) ]
+    then
+      stranded="reverse"
+    else
+      stranded="unstranded"
+    fi
+    echo -e "LOG: strandedness set to \${stranded}" >> ${repRID}.inferMetadata.err
+
+    # write inferred metadata to file
+    echo "\${ends},\${stranded},\${spike},\${species},\${align_ercc},\${align_hu},\${align_mo},\${percentF},\${percentR},\${fail}" 1>> infer.csv 2>> ${repRID}.inferMetadata.err
+    """
+}
+
+// Split metadata into separate channels
+endsInfer = Channel.create()
+strandedInfer = Channel.create()
+spikeInfer = Channel.create()
+speciesInfer = Channel.create()
+align_erccInfer = Channel.create()
+align_huInfer = Channel.create()
+align_moInfer = Channel.create()
+percentFInfer = Channel.create()
+percentRInfer = Channel.create()
+failInfer = Channel.create()
+inferMetadata.splitCsv(sep: ",", header: false).separate(
+  endsInfer,
+  strandedInfer,
+  spikeInfer,
+  speciesInfer,
+  align_erccInfer,
+  align_huInfer,
+  align_moInfer,
+  percentFInfer,
+  percentRInfer,
+  failInfer
+)
+
+// Replicate metadata for multiple process inputs
+endsInfer.into {
+  endsInfer_alignData
+  endsInfer_countData
+}
+strandedInfer.into {
+  strandedInfer_alignData
+  strandedInfer_countData
+}
+spikeInfer.into{
+  spikeInfer_getRef
+}
+speciesInfer.into {
+  speciesInfer_getRef
+}
+
+
 /*
  * getRef: Downloads appropriate reference
 */
 process getRef {
-  tag "${species_getRef}"
+  tag "${species}"
   publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.getRef.{out,err}"
 
   input:
-    val spike_getRef
-    val species_getRef
+    val spike from spikeInfer_getRef
+    val species from speciesInfer_getRef
 
   output:
     tuple path ("hisat2", type: 'dir'), path ("bed", type: 'dir'), path ("*.fna"), path ("*.gtf") into reference
@@ -226,27 +530,27 @@ process getRef {
     ulimit -a >> ${repRID}.getRef.err
     export https_proxy=\${http_proxy}
 
-    # run set the reference name
-    if [ "${species_getRef}" == "Mus musculus" ]
+    #Set the reference name
+    if [ "${species}" == "Mus musculus" ]
     then
       references=\$(echo ${referenceBase}/GRCm${refMoVersion})
-    elif [ '${species_getRef}' == "Homo sapiens" ]
+    elif [ '${species}' == "Homo sapiens" ]
     then
       references=\$(echo ${referenceBase}/GRCh${refHuVersion})
     else
-      echo -e "LOG: ERROR - References could not be set!\nSpecies reference found: ${species_getRef}" >> ${repRID}.getRef.err
+      echo -e "LOG: ERROR - References could not be set!\nSpecies reference found: ${species}" >> ${repRID}.getRef.err
       exit 1
     fi
-    if [ "${spike_getRef}" == "yes" ]
+    if [ "${spike}" == "yes" ]
     then
       references=\$(echo \${references}-S/)
-    elif [ "${spike_getRef}" == "no" ]
+    elif [ "${spike}" == "no" ]
     then
       references=\$(echo \${references}/)
     fi
     echo "LOG: species set to \${references}" >> ${repRID}.getRef.err
 
-    # retreive appropriate reference appropriate location
+    #Retrieve reference files from the appropriate location
     if [ ${referenceBase} == "s3://bicf-references" ]
     then
       echo "LOG: grabbing reference files from S3" >> ${repRID}.getRef.err
@@ -268,42 +572,8 @@ process getRef {
 // Replicate reference for multiple process inputs
 reference.into {
   reference_alignData
-  reference_makeFeatureCounts
-  reference_inferMeta
-}
-
-/*
- * trimData: trims any adapter or non-host sequences from the data
-*/
-process trimData {
-  tag "${repRID}"
-  publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.trimData.{out,err}"
-
-  input:
-    val endsManual_trimData
-    path (fastq) from fastqs_trimData
-
-  output:
-    path ("*.fq.gz") into fastqs_trimmed
-    path ("*_trimming_report.txt") into trimQC
-    path ("${repRID}.trimData.{out,err}")
-
-  script:
-    """
-    hostname > ${repRID}.trimData.err
-    ulimit -a >> ${repRID}.trimData.err
-
-    #Trim fastqs using trim_galore
-    if [ "${endsManual_trimData}" == "se" ]
-    then
-      echo "LOG: running trim_galore using single-end settings" >> ${repRID}.trimData.err
-      trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID} -j `nproc` ${fastq[0]} 1>> ${repRID}.trimData.out 2>> ${repRID}.trimData.err
-    elif [ "${endsManual_trimData}" == "pe" ]
-    then
-      echo "LOG: running trim_galore using paired-end settings" >> ${repRID}.trimData.err
-      trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]} 1>> ${repRID}.trimData.out 2>> ${repRID}.trimData.err
-    fi
-    """
+  reference_countData
+  reference_dataQC
 }
 
 /*
@@ -314,9 +584,9 @@ process alignData {
   publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.align.{out,err}"
 
   input:
-    val endsManual_alignData
-    val stranded_alignData
-    path fastq from fastqs_trimmed
+    val ends from endsInfer_alignData
+    val stranded from strandedInfer_alignData
+    path fastq from fastqsTrim_alignData
     path reference_alignData
 
   output:
@@ -329,15 +599,33 @@ process alignData {
     hostname > ${repRID}.align.err
     ulimit -a >> ${repRID}.align.err
 
+    #Set stranded param for hisat2
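+    # hisat2 --rna-strandness takes F/R for single-end and FR/RF for paired-end reads; unstranded data omits the flag entirely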
"${endsManual_trimData}" == "pe" ] - then - echo "LOG: running trim_galore using paired-end settings" >> ${repRID}.trimData.err - trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]} 1>> ${repRID}.trimData.out 2>> ${repRID}.trimData.err - fi - """ + reference_countData + reference_dataQC } /* @@ -314,9 +584,9 @@ process alignData { publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.align.{out,err}" input: - val endsManual_alignData - val stranded_alignData - path fastq from fastqs_trimmed + val ends from endsInfer_alignData + val stranded from strandedInfer_alignData + path fastq from fastqsTrim_alignData path reference_alignData output: @@ -329,15 +599,33 @@ process alignData { hostname > ${repRID}.align.err ulimit -a >> ${repRID}.align.err + #Set stranded param for hisat2 + if [ "${stranded}"=="unstranded" ] + then + strandedParam="" + elif [ "${stranded}" == "forward" ] && [ "${ends}" == "se" ] + then + strandedParam="--rna-strandness F" + elif [ "${stranded}" == "forward" ] && [ "${ends}" == "pe" ] + then + strandedParam="--rna-strandness FR" + elif [ "${stranded}" == "reverse" ] && [ "${ends}" == "se" ] + then + strandedParam="--rna-strandness R" + elif [ "${stranded}" == "reverse" ] && [ "${ends}" == "pe" ] + then + strandedParam="--rna-strandness RF" + fi + #Align the reads with Hisat 2 - if [ "${endsManual_alignData}" == "se" ] + if [ "${ends}" == "se" ] then echo "LOG: running Hisat2 with single-end settings" >> ${repRID}.align.err - hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} -U ${fastq[0]} --summary-file ${repRID}.alignSummary.txt --new-summary 1>> ${repRID}.align.out 2>> ${repRID}.align.err - elif [ "${endsManual_alignData}" == "pe" ] + hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome \${strandedParam} -U ${fastq[0]} --summary-file ${repRID}.alignSummary.txt --new-summary 1>> ${repRID}.align.out 2>> ${repRID}.align.err + elif [ "${ends}" == "pe" ] then echo "LOG: running Hisat2 with paired-end settings" >> ${repRID}.align.err - hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome ${stranded_alignData} --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} --summary-file ${repRID}.alignSummary.txt --new-summary 1>> ${repRID}.align.out 2>> ${repRID}.align.err + hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome \${strandedParam} --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} --summary-file ${repRID}.alignSummary.txt --new-summary 1>> ${repRID}.align.out 2>> ${repRID}.align.err fi #Convert the output sam file to a sorted bam file using Samtools @@ -368,7 +656,7 @@ process dedupData { publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.dedup.{out,err}" input: - set path (inBam), path (inBai) from rawBam_dedupData + set path (bam), path (bai) from rawBam_dedupData output: tuple path ("${repRID}.sorted.deduped.bam"), path ("${repRID}.sorted.deduped.bam.bai") into dedupBam @@ -383,12 +671,14 @@ process dedupData { # remove duplicated reads using Picard's MarkDuplicates echo "LOG: running picard MarkDuplicates to remove duplicate reads" >> ${repRID}.dedup.err - java -jar /picard/build/libs/picard.jar MarkDuplicates I=${inBam} O=${repRID}.deduped.bam M=${repRID}.deduped.Metrics.txt REMOVE_DUPLICATES=true 1>> ${repRID}.dedup.out 2>> ${repRID}.dedup.err + java -jar /picard/build/libs/picard.jar MarkDuplicates 
     stranding=0;
-    if [ "${stranded_featureCounts}" == "--rna-strandness F" ] || [ "${stranded_featureCounts}" == "--rna-strandness FR" ]
+    if [ "${stranded}" == "unstranded" ]
     then
-      stranding=1
-      echo "LOG: strandedness set to stranded [1]" >> ${repRID}.makeFeatureCounts.err
-    else
       stranding=0
-      echo "LOG: strandedness set to unstranded [0]" >> ${repRID}.makeFeatureCounts.err
-    fi;
-    #Run featureCounts
-    echo "LOG: running featureCounts on the data" >> ${repRID}.makeFeatureCounts.err
-    if [ "${endsManual_featureCounts }" == "se" ]
+      echo "LOG: strandedness set to unstranded [0]" >> ${repRID}.countData.err
+    elif [ "${stranded}" == "forward" ]
     then
-      featureCounts -R SAM -p -G ./genome.fna -T `nproc` -s \${stranding} -a ./genome.gtf -o ${repRID}.featureCounts -g 'gene_name' --primary --ignoreDup ${repRID}.sorted.deduped.bam 1>> ${repRID}.makeFeatureCounts.out 2>> ${repRID}.makeFeatureCounts.err
-    elif [ "${endsManual_featureCounts }" == "pe" ]
+      stranding=1
+      echo "LOG: strandedness set to forward stranded [1]" >> ${repRID}.countData.err
+    elif [ "${stranded}" == "reverse" ]
     then
-      featureCounts -R SAM -p -G ./genmome.fna -T `nproc` -s \${stranding} -a ./genome.gtf -o ${repRID}.featureCounts -g 'gene_name' --primary --ignoreDup -B ${repRID}.sorted.deduped.bam 1>> ${repRID}.makeFeatureCounts.out 2>> ${repRID}.makeFeatureCounts.err
+      stranding=2
+      echo "LOG: strandedness set to reverse stranded [2]" >> ${repRID}.countData.err
+    fi
+
+    #Run countData
+    echo "LOG: running countData on the data" >> ${repRID}.countData.err
+    if [ "${ends}" == "se" ]
+    then
+      featureCounts -R SAM -p -G ./genome.fna -T `nproc` -s \${stranding} -a ./genome.gtf -o ${repRID}.countData -g 'gene_name' --primary --ignoreDup ${repRID}.sorted.deduped.bam 1>> ${repRID}.countData.out 2>> ${repRID}.countData.err
+    elif [ "${ends}" == "pe" ]
+    then
+      featureCounts -R SAM -p -G ./genome.fna -T `nproc` -s \${stranding} -a ./genome.gtf -o ${repRID}.countData -g 'gene_name' --primary --ignoreDup -B ${repRID}.sorted.deduped.bam 1>> ${repRID}.countData.out 2>> ${repRID}.countData.err
     fi
 
-    #Calculate TMP from the resulting featureCounts table
-    echo "LOG: calculating TMP with R" >> ${repRID}.makeFeatureCounts.err
-    Rscript calculateTPM.R --count "${repRID}.featureCounts" 1>> ${repRID}.makeFeatureCounts.out 2>> ${repRID}.makeFeatureCounts.err
+    #Calculate TPM from the resulting countData table
+    echo "LOG: calculating TPM with R" >> ${repRID}.countData.err
+    Rscript calculateTPM.R --count "${repRID}.countData" 1>> ${repRID}.countData.out 2>> ${repRID}.countData.err
     """
 }
 
@@ -507,77 +802,30 @@
 }
 
 /*
- *inferMetadata: run RSeQC to collect stats and infer experimental metadata
+ *dataQC: run RSeQC to calculate transcript integrity numbers (TIN)
 */
-process inferMetadata {
+process dataQC {
   tag "${repRID}"
-  publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.rseqc.{out,err}"
+  publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.dataQC.{out,err}"
 
  input:
-    path script_inferMeta
-    path reference_inferMeta
-    set path (inBam), path (inBai) from dedupBam_inferMeta
-    set path (inChrBam), path (inChrBai) from dedupChrBam
+    path ref from reference_dataQC
+    set path (chrBam), path (chrBai) from dedupChrBam
 
  output:
-    path "infer.csv" into inferedMetadata
-    path "${inBam.baseName}.tin.xls" into tin
-    path "${repRID}.insertSize.inner_distance_freq.txt" optional true into innerDistance
-    path "${repRID}.rseqc.{out,err}" optional true
+    path "${repRID}.sorted.deduped.tin.xls" into tin
+    path "${repRID}.dataQC.{out,err}" optional true
 
  script:
    """
-    hostname > ${repRID}.rseqc.err
-    ulimit -a >> ${repRID}.rseqc.err
-
-    # infer experimental setting from dedup bam
-    echo "LOG: running inference from bed file" >> ${repRID}.rseqc.err
-    infer_experiment.py -r ./bed/genome.bed -i "${inBam}" > ${repRID}.rseqc.log 2>> ${repRID}.rseqc.err
-
-    echo "LOG: determining endedness and strandedness from file" >> ${repRID}.rseqc.err
-    endness=`bash inferMeta.sh endness ${repRID}.rseqc.log` 1>> ${repRID}.rseqc.out 2>> ${repRID}.rseqc.err
-    fail=`bash inferMeta.sh fail ${repRID}.rseqc.log` 1>> ${repRID}.rseqc.out 2>> ${repRID}.rseqc.err
-    if [ \${endness} == "PairEnd" ]
-    then
-      percentF=`bash inferMeta.sh pef ${repRID}.rseqc.log` 1>> ${repRID}.rseqc.out 2>> ${repRID}.rseqc.err
-      percentR=`bash inferMeta.sh per ${repRID}.rseqc.log` 1>> ${repRID}.rseqc.out 2>> ${repRID}.rseqc.err
-      inner_distance.py -i "${inBam}" -o ${repRID}.insertSize -r ./bed/genome.bed 1>> ${repRID}.rseqc.out 2>> ${repRID}.rseqc.err
-    elif [ \${endness} == "SingleEnd" ]
-    then
-      percentF=`bash inferMeta.sh sef ${repRID}.rseqc.log` 1>> ${repRID}.rseqc.out 2>> ${repRID}.rseqc.err
-      percentR=`bash inferMeta.sh ser ${repRID}.rseqc.log` 1>> ${repRID}.rseqc.out 2>> ${repRID}.rseqc.err
-    fi
-    if [ 1 -eq \$(echo \$(expr \$percentF ">" 0.25)) ] && [ 1 -eq \$(echo \$(expr \$percentR "<" 0.25)) ]
-    then
-      stranded="forward"
-      if [ \$endness == "PairEnd" ]
-      then
-        strategy="1++,1--,2+-,2-+"
-      else
-        strategy="++,--"
-      fi
-    elif [ 1 -eq \$(echo \$(expr \$percentR ">" 0.25)) ] && [ 1 -eq \$(echo \$(expr \$percentF "<" 0.25)) ]
-    then
-      stranded="reverse"
-      if [ \$endness == "PairEnd" ]
-      then
-        strategy="1+-,1-+,2++,2--"
-      else
-        strategy="+-,-+"
-      fi
-    else
-      stranded="unstranded"
-      strategy="us"
-    fi
-    echo -e "LOG: strategy set to \${strategy}\nStranded set to ${stranded}" >> ${repRID}.rseqc.err
+    hostname > ${repRID}.dataQC.err
+    ulimit -a >> ${repRID}.dataQC.err
 
     # calculate TIN values per feature on each chromosome
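+    # tin.py runs once per chromosome in parallel; the filtered per-chromosome rows are appended after the header line created below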
"${endsManual_featureCounts }" == "se" ] + echo "LOG: strandedness set to unstranded [0]" >> ${repRID}.countData.err + elif [ "${stranded}" == "forward" ] then - featureCounts -R SAM -p -G ./genome.fna -T `nproc` -s \${stranding} -a ./genome.gtf -o ${repRID}.featureCounts -g 'gene_name' --primary --ignoreDup ${repRID}.sorted.deduped.bam 1>> ${repRID}.makeFeatureCounts.out 2>> ${repRID}.makeFeatureCounts.err - elif [ "${endsManual_featureCounts }" == "pe" ] + stranding=1 + echo "LOG: strandedness set to forward stranded [1]" >> ${repRID}.countData.err + elif [ "${stranded}" == "reverse" ] then - featureCounts -R SAM -p -G ./genmome.fna -T `nproc` -s \${stranding} -a ./genome.gtf -o ${repRID}.featureCounts -g 'gene_name' --primary --ignoreDup -B ${repRID}.sorted.deduped.bam 1>> ${repRID}.makeFeatureCounts.out 2>> ${repRID}.makeFeatureCounts.err + stranding=2 + echo "LOG: strandedness set to forward stranded [2]" >> ${repRID}.countData.err + fi + #Run countData + echo "LOG: running countData on the data" >> ${repRID}.countData.err + if [ "${ends}" == "se" ] + then + featureCounts -R SAM -p -G ./genome.fna -T `nproc` -s \${stranding} -a ./genome.gtf -o ${repRID}.countData -g 'gene_name' --primary --ignoreDup ${repRID}.sorted.deduped.bam 1>> ${repRID}.countData.out 2>> ${repRID}.countData.err + elif [ "${ends}" == "pe" ] + then + featureCounts -R SAM -p -G ./genmome.fna -T `nproc` -s \${stranding} -a ./genome.gtf -o ${repRID}.countData -g 'gene_name' --primary --ignoreDup -B ${repRID}.sorted.deduped.bam 1>> ${repRID}.countData.out 2>> ${repRID}.countData.err fi - #Calculate TMP from the resulting featureCounts table - echo "LOG: calculating TMP with R" >> ${repRID}.makeFeatureCounts.err - Rscript calculateTPM.R --count "${repRID}.featureCounts" 1>> ${repRID}.makeFeatureCounts.out 2>> ${repRID}.makeFeatureCounts.err + #Calculate TPM from the resulting countData table + echo "LOG: calculating TPM with R" >> ${repRID}.countData.err + Rscript calculateTPM.R --count "${repRID}.countData" 1>> ${repRID}.countData.out 2>> ${repRID}.countData.err """ } @@ -507,77 +802,30 @@ process fastqc { } /* - *inferMetadata: run RSeQC to collect stats and infer experimental metadata + *dataQC: run RSeQC to calculate transcript integrity numbers (TIN) */ -process inferMetadata { +process dataQC { tag "${repRID}" - publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.rseqc.{out,err}" + publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.dataQC.{out,err}" input: - path script_inferMeta - path reference_inferMeta - set path (inBam), path (inBai) from dedupBam_inferMeta - set path (inChrBam), path (inChrBai) from dedupChrBam + path ref from reference_dataQC + set path (chrBam), path (chrBai) from dedupChrBam output: - path "infer.csv" into inferedMetadata - path "${inBam.baseName}.tin.xls" into tin - path "${repRID}.insertSize.inner_distance_freq.txt" optional true into innerDistance - path "${repRID}.rseqc.{out,err}" optional true + path "${repRID}.sorted.deduped.tin.xls" into tin + path "${repRID}.dataQC.{out,err}" optional true script: """ - hostname > ${repRID}.rseqc.err - ulimit -a >> ${repRID}.rseqc.err - - # infer experimental setting from dedup bam - echo "LOG: running inference from bed file" >> ${repRID}.rseqc.err - infer_experiment.py -r ./bed/genome.bed -i "${inBam}" > ${repRID}.rseqc.log 2>> ${repRID}.rseqc.err - - echo "LOG: determining endedness and strandedness from file" >> ${repRID}.rseqc.err - endness=`bash inferMeta.sh endness ${repRID}.rseqc.log` 1>> ${repRID}.rseqc.out 2>> 
diff --git a/workflow/scripts/parseMeta.py b/workflow/scripts/parseMeta.py
index eaa2656f9266a5c3de07fb0fc80ad5aeb2ea8422..78b0e2f4e9fb51f9fc827321f303ccc0d3bdcad1 100644
--- a/workflow/scripts/parseMeta.py
+++ b/workflow/scripts/parseMeta.py
@@ -10,7 +10,6 @@ def get_args():
     parser.add_argument('-r', '--repRID',help="The replicate RID.",required=True)
     parser.add_argument('-m', '--metaFile',help="The metadata file to extract.",required=True)
     parser.add_argument('-p', '--parameter',help="The parameter to extract.",required=True)
-    parser.add_argument('-e', '--endsManual',help="The endness.",required=False)
     args = parser.parse_args()
     return args
 
@@ -56,12 +55,9 @@ def main():
     # Get strandedness metadata from 'Experiment Settings.csv'
     if (args.parameter == "stranded"):
         if (metaFile.Has_Strand_Specific_Information.unique() == "yes"):
-            if (args.endsManual=="se"):
-                stranded = "--rna-strandness F"
-            elif (args.endsManual=="pe"):
-                stranded = "--rna-strandness FR"
+            stranded = "stranded"
         elif (metaFile.Has_Strand_Specific_Information.unique() == "no"):
-            stranded = ""
+            stranded = "unstranded"
         else:
             print("Stranded metadata not match expected options: " + metaFile.Has_Strand_Specific_Information.unique())
             exit(1)
diff --git
 a/workflow/tests/test_dataQC.py b/workflow/tests/test_dataQC.py
new file mode 100644
index 0000000000000000000000000000000000000000..95eb332736bf7af93945402e6f79f04daac83f9f
--- /dev/null
+++ b/workflow/tests/test_dataQC.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+@pytest.mark.dataQC
+def test_dataQC():
+    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_1M.se.sorted.deduped.tin.xls'))
+
diff --git a/workflow/tests/test_downsampleData.py b/workflow/tests/test_downsampleData.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd42c49e169e387dd9662903b20866c40aec8907
--- /dev/null
+++ b/workflow/tests/test_downsampleData.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+
+import pytest
+import pandas as pd
+from io import StringIO
+import os
+
+test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
+    '/../../'
+
+@pytest.mark.downsampleData
+def test_downsampleData():
+    assert os.path.exists(os.path.join(test_output_path, 'sampled.1.fq'))
\ No newline at end of file
diff --git a/workflow/tests/test_inferMetadata.py b/workflow/tests/test_inferMetadata.py
index 44ffbc3f239630b357b5fde7746787fe9ca65d62..f2908bee242281c23e814f8bf8e3e3d68e395d9f 100644
--- a/workflow/tests/test_inferMetadata.py
+++ b/workflow/tests/test_inferMetadata.py
@@ -10,5 +10,5 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
 
 @pytest.mark.inferMetadata
 def test_inferMetadata():
-    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_1M.se.sorted.deduped.tin.xls'))
+    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_1M.se.inferMetadata.log'))