diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index 6c6b1161fb917daf564ea4bf933561a4209818bf..417af3da72dc969bb0ccfe8ac9ff3db9509bee95 100644 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -176,39 +176,39 @@ process parseMetadata { hostname > ${repRID}.parseMetadata.err ulimit -a >> ${repRID}.parseMetadata.err - # Check replicate RID metadata + # check replicate RID metadata rep=\$(python3 ${script_parseMeta} -r ${repRID} -m "${fileMeta}" -p repRID) echo "LOG: replicate RID metadata parsed: \${rep}" >> ${repRID}.parseMetadata.err - # Get experiment RID metadata + # get experiment RID metadata exp=\$(python3 ${script_parseMeta} -r ${repRID} -m "${fileMeta}" -p expRID) echo "LOG: experiment RID metadata parsed: \${exp}" >> ${repRID}.parseMetadata.err - # Get study RID metadata + # get study RID metadata study=\$(python3 ${script_parseMeta} -r ${repRID} -m "${fileMeta}" -p studyRID) echo "LOG: study RID metadata parsed: \${study}" >> ${repRID}.parseMetadata.err - # Get endedness metadata + # get endedness metadata endsMeta=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p endsMeta) echo "LOG: endedness metadata parsed: \${endsMeta}" >> ${repRID}.parseMetadata.err - # Manually get endness + # ganually get endness endsManual=\$(python3 ${script_parseMeta} -r ${repRID} -m "${fileMeta}" -p endsManual) echo "LOG: endedness manually detected: \${endsManual}" >> ${repRID}.parseMetadata.err - # Get strandedness metadata + # get strandedness metadata stranded=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p stranded) echo "LOG: strandedness metadata parsed: \${stranded}" >> ${repRID}.parseMetadata.err - # Get spike-in metadata + # get spike-in metadata spike=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p spike) echo "LOG: spike-in metadata parsed: \${spike}" >> ${repRID}.parseMetadata.err - # Get species metadata + # get species metadata species=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentMeta}" -p species) echo "LOG: species metadata parsed: \${species}" >> ${repRID}.parseMetadata.err - # Save design file + # gave design file echo "\${endsMeta},\${endsManual},\${stranded},\${spike},\${species},\${exp},\${study}" > design.csv """ } @@ -260,7 +260,7 @@ process trimData { hostname > ${repRID}.trimData.err ulimit -a >> ${repRID}.trimData.err - #Trim fastq's using trim_galore + # trim fastq's using trim_galore if [ "${ends}" == "se" ] then echo "LOG: running trim_galore using single-end settings" >> ${repRID}.trimData.err @@ -300,7 +300,7 @@ process getRefInfer { ulimit -a >> ${repRID}.getRefInfer.err export https_proxy=\${http_proxy} - #Set the reference name + # set the reference name if [ "${refName}" == "ERCC" ] then references=\$(echo ${referenceBase}/ERCC${refERCCVersion}) @@ -315,7 +315,8 @@ process getRefInfer { exit 1 fi mkdir ${refName} - #Retreive appropriate reference appropriate location + + # retreive appropriate reference appropriate location if [ ${referenceBase} == "s3://bicf-references" ] then echo "LOG: grabbing reference files from S3" >> ${repRID}.getRefInfer.err @@ -332,7 +333,7 @@ process getRefInfer { ln -s "\${references}"/genome.gtf 1>> ${repRID}.getRefInfer.out 2>> ${repRID}.getRefInfer.err fi - #Make blank bed folder for ERCC + # make blank bed folder for ERCC if [ "${refName}" == "ERCC" ] then rm ${refName}/bed @@ -402,7 +403,7 @@ process alignSampleData { hostname > ${repRID}.${ref}.alignSampleData.err ulimit -a >> ${repRID}.${ref}.alignSampleData.err - #Align the reads with Hisat 2 + # align the reads with Hisat 2 if [ "${ends}" == "se" ] then echo "LOG: running Hisat2 with single-end settings" >> ${repRID}.${ref}.alignSampleData.err @@ -413,15 +414,15 @@ process alignSampleData { hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome --no-mixed --no-discordant -1 ${fastq1} -2 ${fastq2} --summary-file ${ref}.alignSampleSummary.txt --new-summary 1>> ${repRID}.${ref}.alignSampleData.out 2>> ${repRID}.${ref}.alignSampleData.err fi - #Convert the output sam file to a sorted bam file using Samtools + # convert the output sam file to a sorted bam file using Samtools echo "LOG: converting from sam to bam" >> ${repRID}.${ref}.alignSampleData.err samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${ref}.sampled.bam ${ref}.sampled.sam 1>> ${repRID}.${ref}.alignSampleData.out 2>> ${repRID}.${ref}.alignSampleData.err; - #Sort the bam file using Samtools + # sort the bam file using Samtools echo "LOG: sorting the bam file" >> ${repRID}.${ref}.alignSampleData.err samtools sort -@ `nproc` -O BAM -o ${ref}.sampled.sorted.bam ${ref}.sampled.bam 1>> ${repRID}.${ref}.alignSampleData.out 2>> ${repRID}.${ref}.alignSampleData.err; - #Index the sorted bam using Samtools + # index the sorted bam using Samtools echo "LOG: indexing sorted bam file" >> ${repRID}.${ref}.alignSampleData.err samtools index -@ `nproc` -b ${ref}.sampled.sorted.bam ${ref}.sampled.sorted.bam.bai 1>> ${repRID}.${ref}.alignSampleData.out 2>> ${repRID}.${ref}.alignSampleData.err; """ @@ -588,7 +589,7 @@ process getRef { ulimit -a >> ${repRID}.getRef.err export https_proxy=\${http_proxy} - #Set the reference name + # set the reference name if [ "${species}" == "Mus musculus" ] then references=\$(echo ${referenceBase}/GRCm${refMoVersion}) @@ -608,7 +609,7 @@ process getRef { fi echo "LOG: species set to \${references}" >> ${repRID}.getRef.err - #Retreive appropriate reference appropriate location + # retreive appropriate reference appropriate location if [ ${referenceBase} == "s3://bicf-references" ] then echo "LOG: grabbing reference files from S3" >> ${repRID}.getRef.err @@ -657,7 +658,7 @@ process alignData { hostname > ${repRID}.align.err ulimit -a >> ${repRID}.align.err - #Set stranded param for hisat2 + # set stranded param for hisat2 if [ "${stranded}"=="unstranded" ] then strandedParam="" @@ -675,7 +676,7 @@ process alignData { strandedParam="--rna-strandness RF" fi - #Align the reads with Hisat 2 + # align the reads with Hisat 2 if [ "${ends}" == "se" ] then echo "LOG: running Hisat2 with single-end settings" >> ${repRID}.align.err @@ -686,15 +687,15 @@ process alignData { hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome \${strandedParam} --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} --summary-file ${repRID}.alignSummary.txt --new-summary 1>> ${repRID}.align.out 2>> ${repRID}.align.err fi - #Convert the output sam file to a sorted bam file using Samtools + # convert the output sam file to a sorted bam file using Samtools echo "LOG: converting from sam to bam" >> ${repRID}.align.err samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam 1>> ${repRID}.align.out 2>> ${repRID}.align.err; - #Sort the bam file using Samtools + # sort the bam file using Samtools echo "LOG: sorting the bam file" >> ${repRID}.align.err samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.bam ${repRID}.bam 1>> ${repRID}.align.out 2>> ${repRID}.align.err; - #Index the sorted bam using Samtools + # index the sorted bam using Samtools echo "LOG: indexing sorted bam file" >> ${repRID}.align.err samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bam.bai 1>> ${repRID}.align.out 2>> ${repRID}.align.err; """ @@ -731,13 +732,13 @@ process dedupData { echo "LOG: running picard MarkDuplicates to remove duplicate reads" >> ${repRID}.dedup.err java -jar /picard/build/libs/picard.jar MarkDuplicates I=${bam} O=${repRID}.deduped.bam M=${repRID}.deduped.Metrics.txt REMOVE_DUPLICATES=true 1>> ${repRID}.dedup.out 2>> ${repRID}.dedup.err - # Sort the bam file using Samtools + # sort the bam file using Samtools samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.deduped.bam ${repRID}.deduped.bam 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err - # Index the sorted bam using Samtools + # index the sorted bam using Samtools samtools index -@ `nproc` -b ${repRID}.sorted.deduped.bam ${repRID}.sorted.deduped.bam.bai 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err - # Split the deduped BAM file for multi-threaded tin calculation + # split the deduped BAM file for multi-threaded tin calculation for i in `samtools view ${repRID}.sorted.deduped.bam | cut -f3 | sort | uniq`; do echo "echo \"LOG: splitting each chromosome into its own BAM and BAI files with Samtools\" >> ${repRID}.dedup.err; samtools view -b ${repRID}.sorted.deduped.bam \${i} > ${repRID}.sorted.deduped.\${i}.bam; samtools index -@ `nproc` -b ${repRID}.sorted.deduped.\${i}.bam ${repRID}.sorted.deduped.\${i}.bam.bai" @@ -772,7 +773,7 @@ process makeBigWig { hostname > ${repRID}.makeBigWig.err ulimit -a >> ${repRID}.makeBigWig.err - #Run bamCoverage + # run bamCoverage echo "LOG: Running bigWig bamCoverage" >> ${repRID}.makeBigWig.err bamCoverage -p `nproc` -b ${bam} -o ${repRID}.bw 1>> ${repRID}.makeBigWig.out 2>> ${repRID}.makeBigWig.err """ @@ -803,7 +804,7 @@ process countData { hostname > ${repRID}.countData.err ulimit -a >> ${repRID}.countData.err - #Determine strandedness and setup strandig for countData + # determine strandedness and setup strandig for countData stranding=0; if [ "${stranded}" == "unstranded" ] then @@ -818,19 +819,18 @@ process countData { stranding=2 echo "LOG: strandedness set to forward stranded [2]" >> ${repRID}.countData.err fi - #Run countData - echo "LOG: running countData on the data" >> ${repRID}.countData.err + + # run featureCounts + echo "LOG: running featureCounts on the data" >> ${repRID}.countData.err if [ "${ends}" == "se" ] then featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' -o ${repRID}.countData -s \${stranding} -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam 1>> ${repRID}.countData.out 2>> ${repRID}.countData.err - #featureCounts -R SAM -p -G ./genome.fna -T `nproc` -s \${stranding} -a ./genome.gtf -o ${repRID}.countData -g 'gene_name' --primary --ignoreDup ${repRID}.sorted.deduped.bam 1>> ${repRID}.countData.out 2>> ${repRID}.countData.err elif [ "${ends}" == "pe" ] then featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' -o ${repRID}.countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam 1>> ${repRID}.countData.out 2>> ${repRID}.countData.err - #featureCounts -R SAM -p -G ./genome.fna -T `nproc` -s \${stranding} -a ./genome.gtf -o ${repRID}.countData -g 'gene_name' --primary --ignoreDup -B ${repRID}.sorted.deduped.bam 1>> ${repRID}.countData.out 2>> ${repRID}.countData.err fi - #Calculate TPM from the resulting countData table + # calculate TPM from the resulting countData table echo "LOG: calculating TPM with R" >> ${repRID}.countData.err Rscript calculateTPM.R --count "${repRID}.countData" 1>> ${repRID}.countData.out 2>> ${repRID}.countData.err """ @@ -841,7 +841,6 @@ process countData { */ process fastqc { tag "${repRID}" - publishDir "${outDir}/fastqc", mode: 'copy', pattern: "*_fastqc.zip" publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.fastqc.{out,err}" input: @@ -943,9 +942,11 @@ process aggrQC { hostname > ${repRID}.aggrQC.err ulimit -a >> ${repRID}.aggrQC.err + # make RID table echo -e "Replicate RID\tExperiment RID\tStudy RID" > rid.tsv echo -e "${repRID}\t${expRID}\t${studyRID}" >> rid.tsv + # make metadata table echo -e "Source\tSpecies\tEnds\tStranded\tSpike-in" > metadata.tsv echo -e "Infered\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}" >> metadata.tsv echo -e "Submitter\t${speciesM}\t${endsM}\t${strandedM}\t${spikeM}" >> metadata.tsv @@ -957,7 +958,7 @@ process aggrQC { rm -f ${innerDistance} fi - #run MultiQC + # run MultiQC multiqc -c ${multiqcConfig} . """ }