From 11a742a37c0a68c6d70361db30fc085817ff212d Mon Sep 17 00:00:00 2001 From: "Gervaise H. Henry" <gervaise.henry@utsouthwestern.edu> Date: Tue, 4 Aug 2020 09:36:24 -0500 Subject: [PATCH] Add new metadata extraction to ci --- .gitlab-ci.yml | 3 +++ workflow/rna-seq.nf | 17 +++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 29f4925..6264f76 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -67,6 +67,8 @@ trimData: script: - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --basename Q-Y5F6_1M.se -j 20 ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename Q-Y5F6_1M.pe -j 20 ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz + - readLengthSE=$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}') + - readLengthPE=$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}') - pytest -m trimData downsampleData: @@ -105,6 +107,7 @@ countData: script: - singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' -o Q-Y5F6_1M.se.featureCounts -s 1 -R SAM --primary --ignoreDup ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam - singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count ./test_data/counts/small/Q-Y5F6_1M.se.featureCounts + - assignedReads=$(grep -m 1 'Assigned' *.summary | grep -oe '\([0-9.]*\)') - pytest -m makeFeatureCounts makeBigWig: diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index 
a8a63aa..4be9f4f 100644 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -298,7 +298,7 @@ process trimData { hostname > ${repRID}.trimData.log ulimit -a >> ${repRID}.trimData.log - # trim fastq's using trim_galore + # trim fastq's using trim_galore and extract median read length echo -e "LOG: trimming ${ends}" >> ${repRID}.trimData.log if [ "${ends}" == "se" ] then @@ -875,7 +875,8 @@ process countData { featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' -o ${repRID}.countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam fi echo -e "LOG: counted" >> ${repRID}.countData.log - + + # extract assigned reads assignedReads=grep -m 1 'Assigned' *.countData.summary | grep -oe '\([0-9.]*\)' echo -e \${assignedReads} > assignedReads.csv echo -e "LOG: assigned reads: "\${assignedReads} >> ${repRID}.countData.log @@ -887,6 +888,12 @@ process countData { """ } +// Extract number of assigned reads metadata into channel +assignedReadsInfer = Channel.create() +inferMetadata_assignedReads.splitCsv(sep: ",", header: false).separate( + assignedReads +) + /* *fastqc: run fastqc on untrimmed fastq's */ @@ -911,12 +918,6 @@ process fastqc { """ } -// Extract number of assigned reads metadata into channel -assignedReadsInfer = Channel.create() -inferMetadata_assignedReads.splitCsv(sep: ",", header: false).separate( - assignedReads -) - /* *dataQC: calculate transcript integrity numbers (TIN) and bin as well as calculate innerdistance of PE replicates */ -- GitLab