Commit fe51a743 authored by Venkat Malladi

Merge branch 'new.qc.out' into 'develop'

Metadata output update

Closes #56 and #52

See merge request !36
parents 0b6630da 6341ca53
Related merge requests: !37 v0.0.1, !36 Metadata output update
Pipeline #7838 passed with stages in 1 hour, 20 minutes, and 15 seconds
.gitignore:
@@ -297,6 +297,8 @@ timeline*.html*
*.tmp
*.swp
*.out
+*_studyRID.json
+*_studyRID.csv
run*.sh
!.gitkeep
.gitlab-ci.yml:
@@ -38,7 +38,8 @@ parseMetadata:
- stranded=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p stranded)
- spike=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p spike)
- species=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p species)
-- echo -e "${endsMeta},${endsManual},${stranded},${spike},${species},${exp},${study},${rep}" > design.csv
+- readLength=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.stageNew.csv" -p readLength)
+- echo -e "${endsMeta},${endsManual},${stranded},${spike},${species},${readLength},${exp},${study},${rep}" > design.csv
- pytest -m parseMetadata
inferMetadata:
@@ -66,6 +67,8 @@ trimData:
script:
- singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --basename Q-Y5F6_1M.se -j 20 ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
- singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --illumina --length 35 --paired --basename Q-Y5F6_1M.pe -j 20 ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
+- readLengthSE=$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
+- readLengthPE=$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length($1)}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
- pytest -m trimData
downsampleData:
@@ -104,6 +107,7 @@ countData:
script:
- singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' -o Q-Y5F6_1M.se.featureCounts -s 1 -R SAM --primary --ignoreDup ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam
- singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count ./test_data/counts/small/Q-Y5F6_1M.se.featureCounts
+- assignedReads=$(grep -m 1 'Assigned' *.summary | grep -oe '\([0-9.]*\)')
- pytest -m makeFeatureCounts
makeBigWig:
@@ -164,7 +168,7 @@ consistency:
- grep -m 1 \"Assigned\":.[0-9] SE_multiqc_data.json | grep -oe '\([0-9.]*\)' > assignedSE.txt
- grep -m 1 \"Assigned\":.[0-9] PE_multiqc_data.json | grep -oe '\([0-9.]*\)' > assignedPE.txt
- echo 7742416 > assignedExpectSE.txt
-- echo 2599149 > assignedExpectPE.txt
+- echo 2599140 > assignedExpectPE.txt
- pytest -m consistencySE
- pytest -m consistencyPE
artifacts:
...
README.md:
@@ -58,11 +58,17 @@ FULL EXAMPLE:
nextflow run workflow/rna-seq.nf --deriva ./data/credential.json --bdbag ./data/cookies.txt --repRID Q-Y5JA
```
+To run a set of replicates from a study RID:
+------------------------------------------
+Run from the repo root dir:
+* `sh workflow/scripts/splitStudy.sh [studyRID]`
+It will run the pipeline in parallel in batches of 5 replicate RIDs.
-[**CHANGELOG**](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/blob/develop/CHANGELOG.md)
----
+<hr>
+[**CHANGELOG**](https://git.biohpc.swmed.edu/BICF/gudmap_rbk/rna-seq/blob/develop/CHANGELOG.md)
+<hr>
Credits
=======
...
cleanup script:
@@ -5,3 +5,5 @@ rm timeline*.html*
rm .nextflow*.log*
rm -r .nextflow/
rm -r work/
+rm *_studyRID.json
+rm *_studyRID.csv
docs/dag.png (binary image replaced: 594 KiB → 665 KiB)
MultiQC config:
@@ -70,16 +70,21 @@ custom_data:
meta:
file_format: 'tsv'
section_name: 'Metadata'
-description: 'This is the comparison of infered metadata and submitter provided'
+description: 'Comparison of inferred, submitter-provided, and calculated metadata'
plot_type: 'table'
pconfig:
id: 'meta'
+format: '{:,.0f}'
headers:
Source
Species
Ends
Stranded
Spike-in
+Raw Reads
+Assigned Reads
+Median Read Length
+Median TIN
tin:
file_format: 'tsv'
section_name: 'TIN'
...
workflow/rna-seq.nf:
@@ -53,6 +53,7 @@ script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh")
script_parseMeta = Channel.fromPath("${baseDir}/scripts/parseMeta.py")
script_inferMeta = Channel.fromPath("${baseDir}/scripts/inferMeta.sh")
script_calculateTPM = Channel.fromPath("${baseDir}/scripts/calculateTPM.R")
+script_convertGeneSymbols = Channel.fromPath("${baseDir}/scripts/convertGeneSymbols.R")
script_tinHist = Channel.fromPath("${baseDir}/scripts/tinHist.py")
/*
@@ -237,8 +238,16 @@ process parseMetadata {
species=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experiment}" -p species)
echo -e "LOG: species metadata parsed: \${species}" >> ${repRID}.parseMetadata.log
-# gave design file
-echo -e "\${endsMeta},\${endsManual},\${stranded},\${spike},\${species},\${exp},\${study}" > design.csv
+# get read length metadata
+readLength=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p readLength)
+if [ "\${readLength}" = "nan" ]
+then
+    readLength="NA"
+fi
+echo -e "LOG: read length metadata parsed: \${readLength}" >> ${repRID}.parseMetadata.log
+# save design file
+echo -e "\${endsMeta},\${endsManual},\${stranded},\${spike},\${species},\${readLength},\${exp},\${study}" > design.csv
"""
}
@@ -248,6 +257,7 @@ endsManual = Channel.create()
strandedMeta = Channel.create()
spikeMeta = Channel.create()
speciesMeta = Channel.create()
+readLengthMeta = Channel.create()
expRID = Channel.create()
studyRID = Channel.create()
metadata.splitCsv(sep: ",", header: false).separate(
@@ -256,6 +266,7 @@ metadata.splitCsv(sep: ",", header: false).separate(
strandedMeta,
spikeMeta,
speciesMeta,
+readLengthMeta,
expRID,
studyRID
)
@@ -281,25 +292,38 @@ process trimData {
output:
path ("*.fq.gz") into fastqsTrim
path ("*_trimming_report.txt") into trimQC
+path ("readLength.csv") into inferMetadata_readLength
script:
"""
hostname > ${repRID}.trimData.log
ulimit -a >> ${repRID}.trimData.log
-# trim fastq's using trim_galore
+# trim fastq's using trim_galore and extract median read length
echo -e "LOG: trimming ${ends}" >> ${repRID}.trimData.log
if [ "${ends}" == "se" ]
then
trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID} -j `nproc` ${fastq[0]}
+readLength=\$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
elif [ "${ends}" == "pe" ]
then
trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]}
+readLength=\$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
fi
echo -e "LOG: trimmed" >> ${repRID}.trimData.log
+echo -e "LOG: median trimmed read length: \${readLength}" >> ${repRID}.trimData.log
+# save read length file
+echo -e "\${readLength}" > readLength.csv
"""
}
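Note: the awk pipeline added above pulls every sequence line (line 2 of each 4-line FASTQ record), sorts the lengths, and prints the median. A minimal Python cross-check of the same calculation (the file name is illustrative, not a pipeline output name):

```
import gzip
import statistics

def median_read_length(fastq_gz):
    """Median sequence length of a gzipped FASTQ (4 lines per record)."""
    lengths = []
    with gzip.open(fastq_gz, "rt") as fq:
        for i, line in enumerate(fq):
            if i % 4 == 1:  # the sequence line of each record
                lengths.append(len(line.strip()))
    return statistics.median(lengths)

print(median_read_length("sample_trimmed.fq.gz"))
```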
+// Extract calculated read length metadata into channel
+readLengthInfer = Channel.create()
+inferMetadata_readLength.splitCsv(sep: ",", header: false).separate(
+    readLengthInfer
+)
// Replicate trimmed fastq's
fastqsTrim.into {
fastqsTrim_alignData
@@ -605,7 +629,7 @@ process getRef {
val species from speciesInfer_getRef
output:
-tuple path ("hisat2", type: 'dir'), path ("bed", type: 'dir'), path ("*.fna"), path ("*.gtf") into reference
+tuple path ("hisat2", type: 'dir'), path ("bed", type: 'dir'), path ("*.fna"), path ("*.gtf"), path ("geneID.tsv"), path ("Entrez.tsv") into reference
script:
"""
@@ -641,12 +665,16 @@ process getRef {
aws s3 cp "\${references}"/bed ./bed --recursive
aws s3 cp "\${references}"/genome.fna ./
aws s3 cp "\${references}"/genome.gtf ./
+aws s3 cp "\${references}"/geneID.tsv ./
+aws s3 cp "\${references}"/Entrez.tsv ./
elif [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references" ]
then
ln -s "\${references}"/hisat2
ln -s "\${references}"/bed
ln -s "\${references}"/genome.fna
ln -s "\${references}"/genome.gtf
+ln -s "\${references}"/geneID.tsv
+ln -s "\${references}"/Entrez.tsv
fi
echo -e "LOG: fetched" >> ${repRID}.getRef.log
"""
@@ -810,14 +838,16 @@ process countData {
input:
path script_calculateTPM
+path script_convertGeneSymbols
tuple path (bam), path (bai) from dedupBam_countData
path ref from reference_countData
val ends from endsInfer_countData
val stranded from strandedInfer_countData
output:
-path ("*.countTable.csv") into counts
+path ("*.tpmTable.csv") into counts
path ("*.countData.summary") into countsQC
+path ("assignedReads.csv") into inferMetadata_assignedReads
script:
"""
@@ -837,7 +867,7 @@ process countData {
elif [ "${stranded}" == "reverse" ]
then
stranding=2
-echo -e "LOG: strandedness set to forward stranded [2]" >> ${repRID}.countData.log
+echo -e "LOG: strandedness set to reverse stranded [2]" >> ${repRID}.countData.log
fi
@@ -850,13 +880,26 @@ process countData {
featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' -o ${repRID}.countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam
fi
echo -e "LOG: counted" >> ${repRID}.countData.log
+# extract assigned reads
+grep -m 1 'Assigned' *.countData.summary | grep -oe '\\([0-9.]*\\)' > assignedReads.csv
# calculate TPM from the resulting countData table
echo -e "LOG: calculating TPM with R" >> ${repRID}.countData.log
Rscript calculateTPM.R --count "${repRID}.countData"
+# convert gene symbols to Entrez IDs
+echo -e "LOG: converting gene symbols to Entrez IDs" >> ${repRID}.countData.log
+Rscript convertGeneSymbols.R --repRID "${repRID}"
"""
}
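Note: calculateTPM.R itself is not shown in this diff; for reference, TPM divides each gene's count by its length in kilobases and rescales so the values sum to one million. A sketch under that standard definition (column names are illustrative):

```
import pandas as pd

# stand-in for a featureCounts-style table: one row per gene
counts = pd.DataFrame({
    "Geneid": ["A", "B", "C"],
    "Length": [1000, 2000, 500],  # gene length in bp
    "count": [100, 400, 50],
})

rpk = counts["count"] / (counts["Length"] / 1000)  # reads per kilobase
counts["TPM"] = rpk / rpk.sum() * 1e6              # rescale so TPMs sum to 1e6
print(counts)
```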
+// Extract number of assigned reads metadata into channel
+assignedReadsInfer = Channel.create()
+inferMetadata_assignedReads.splitCsv(sep: ",", header: false).separate(
+    assignedReadsInfer
+)
/*
*fastqc: run fastqc on untrimmed fastq's
*/
@@ -868,6 +911,7 @@ process fastqc {
output:
path ("*_fastqc.zip") into fastqc
+path ("rawReads.csv") into inferMetadata_rawReads
script:
"""
@@ -876,11 +920,19 @@ process fastqc {
# run fastqc
echo -e "LOG: running fastqc on raw fastqs" >> ${repRID}.fastqc.log
-#fastqc *.fastq.gz -o .
-touch test_fastqc.zip
+fastqc *.fastq.gz -o .
+# count raw reads (4 FASTQ lines per read)
+zcat *.R1.fastq.gz | echo \$((`wc -l`/4)) > rawReads.csv
"""
}
+// Extract number of raw reads metadata into channel
+rawReadsInfer = Channel.create()
+inferMetadata_rawReads.splitCsv(sep: ",", header: false).separate(
+    rawReadsInfer
+)
/*
*dataQC: calculate transcript integrity numbers (TIN) and bin, as well as calculate inner distance of PE replicates
*/
@@ -895,7 +947,8 @@ process dataQC {
val ends from endsInfer_dataQC
output:
-path "${repRID}.tin.hist.tsv" into tin
+path "${repRID}.tin.hist.tsv" into tinHist
+path "${repRID}.tin.med.csv" into inferMetadata_tinMed
path "${repRID}.insertSize.inner_distance_freq.txt" into innerDistance
script:
@@ -928,12 +981,19 @@ process dataQC {
"""
}
+// Extract median TIN metadata into channel
+tinMedInfer = Channel.create()
+inferMetadata_tinMed.splitCsv(sep: ",", header: false).separate(
+    tinMedInfer
+)
/*
*aggrQC: aggregate QC from processes as well as metadata and run MultiQC
*/
process aggrQC {
tag "${repRID}"
-publishDir "${outDir}/qc", mode: 'copy', pattern: "${repRID}.multiqc.html"
+publishDir "${outDir}/report", mode: 'copy', pattern: "${repRID}.multiqc.html"
+publishDir "${outDir}/qc", mode: 'copy', pattern: "${repRID}.multiqc_data.json"
input:
path multiqcConfig
@@ -944,7 +1004,7 @@ process aggrQC {
path dedupQC
path countsQC
path innerDistance
-path tin
+path tinHist
path alignSampleQCs from alignSampleQC_aggrQC.collect()
path inferExperiment
val endsManual from endsManual_aggrQC
@@ -956,11 +1016,17 @@ process aggrQC {
val strandedI from strandedInfer_aggrQC
val spikeI from spikeInfer_aggrQC
val speciesI from speciesInfer_aggrQC
+val readLengthM from readLengthMeta
+val readLengthI from readLengthInfer
+val rawReadsI from rawReadsInfer
+val assignedReadsI from assignedReadsInfer
+val tinMedI from tinMedInfer
val expRID
val studyRID
output:
path "${repRID}.multiqc.html" into multiqc
+path "${repRID}.multiqc_data.json" into multiqcJSON
script:
"""
@@ -974,14 +1040,14 @@ process aggrQC {
# make metadata table
echo -e "LOG: creating metadata table" >> ${repRID}.aggrQC.log
-echo -e "Source\tSpecies\tEnds\tStranded\tSpike-in" > metadata.tsv
-echo -e "Infered\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}" >> metadata.tsv
-echo -e "Submitter\t${speciesM}\t${endsM}\t${strandedM}\t${spikeM}" >> metadata.tsv
-echo -e "Manual\t-\t${endsManual}\t-\t-" >> metadata.tsv
+echo -e "Source\tSpecies\tEnds\tStranded\tSpike-in\tRaw Reads\tAssigned Reads\tMedian Read Length\tMedian TIN" > metadata.tsv
+echo -e "Submitter\t${speciesM}\t${endsM}\t${strandedM}\t${spikeM}\t-\t-\t'${readLengthM}'\t-" >> metadata.tsv
+echo -e "Inferred\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv
+echo -e "Measured\t-\t${endsManual}\t-\t-\t'${rawReadsI}'\t'${assignedReadsI}'\t'${readLengthI}'\t'${tinMedI}'" >> metadata.tsv
# remove inner distance report if it is empty (SE repRID)
echo -e "LOG: removing dummy inner distance file" >> ${repRID}.aggrQC.log
-if [ wc -l ${innerDistance} | awk '{print\${1}}' -eq 0 ]
+if [ "${endsM}" == "se" ]
then
rm -f ${innerDistance}
fi
@@ -989,5 +1055,6 @@ process aggrQC {
# run MultiQC
echo -e "LOG: running multiqc" >> ${repRID}.aggrQC.log
multiqc -c ${multiqcConfig} . -n ${repRID}.multiqc.html
+cp ${repRID}.multiqc_data/multiqc_data.json ${repRID}.multiqc_data.json
"""
}
\ No newline at end of file
# workflow/scripts/convertGeneSymbols.R (new file): append Entrez IDs to the count table
gc()
library(optparse)

option_list = list(
  make_option("--repRID", action="store", type='character', help="Replicate RID")
)
opt = parse_args(OptionParser(option_list=option_list))
rm(option_list)

# load the count table plus the two lookup tables staged by getRef:
# geneID.tsv (V1 = gene ID, V2 = gene symbol) and Entrez.tsv (V1 = gene ID, V2 = Entrez ID)
countTable <- read.csv(paste0(opt$repRID, ".countData.countTable.csv"), stringsAsFactors=FALSE)
geneID <- read.delim("geneID.tsv", header=FALSE, stringsAsFactors=FALSE)
Entrez <- read.delim("Entrez.tsv", header=FALSE, stringsAsFactors=FALSE)

# map gene symbol -> gene ID -> Entrez ID
convert <- data.frame(geneID=countTable$Geneid)
convert <- merge(x=convert, y=geneID[,1:2], by.x="geneID", by.y="V2", all.x=TRUE)
convert <- merge(x=convert, y=Entrez, by.x="V1", by.y="V1", all.x=TRUE)
convert[is.na(convert$V2),3] <- ""   # blank out symbols with no Entrez match
convert <- convert[,-1]
colnames(convert) <- c("GeneID","EntrezID")
convert <- unique(convert)

# join the Entrez IDs back onto the count table and write the TPM table
output <- merge(x=convert, y=countTable, by.x="GeneID", by.y="Geneid", all.x=TRUE)
write.table(output, file=paste0(opt$repRID, ".tpmTable.csv"), sep=",", row.names=FALSE, quote=FALSE)
\ No newline at end of file
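Note: a toy pandas equivalent of the two merges above, with made-up rows showing the layouts of geneID.tsv and Entrez.tsv as inferred from the merge keys:

```
import pandas as pd

geneID = pd.DataFrame({"V1": ["ENSG01", "ENSG02"], "V2": ["TP53", "BRCA1"]})  # gene ID, symbol
Entrez = pd.DataFrame({"V1": ["ENSG01"], "V2": ["7157"]})                     # gene ID, Entrez ID
counts = pd.DataFrame({"Geneid": ["TP53", "BRCA1"], "count": [10, 20]})

# symbol -> gene ID, then gene ID -> Entrez ID; unmatched genes get ""
convert = counts[["Geneid"]].merge(geneID, left_on="Geneid", right_on="V2", how="left")
convert = convert.merge(Entrez, on="V1", how="left", suffixes=("", "_entrez"))
convert["EntrezID"] = convert["V2_entrez"].fillna("")
print(convert[["Geneid", "EntrezID"]].merge(counts, on="Geneid"))
```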
workflow/scripts/parseMeta.py:
@@ -102,5 +102,10 @@ def main():
        exit(1)
    print(species)
+
+    # Get read length metadata from 'Experiment Settings.csv'
+    if (args.parameter == "readLength"):
+        readLength = metaFile.Read_Length.unique()
+        print(str(readLength).strip('[]'))

if __name__ == '__main__':
    main()
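Note: metaFile.Read_Length.unique() returns a NumPy array, so str(...).strip('[]') turns e.g. [75] into 75; a missing value prints as nan, which the parseMetadata process above remaps to NA. A quick illustration:

```
import pandas as pd

metaFile = pd.DataFrame({"Read_Length": [75, 75]})  # stand-in for 'Experiment Settings.csv'
readLength = metaFile.Read_Length.unique()          # numpy array: [75]
print(str(readLength).strip('[]'))                  # prints: 75
```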
#!/usr/bin/env python3
# workflow/scripts/splitStudy.py (new file): extract replicate RIDs from the
# study query JSON into a one-column CSV

import argparse
import pandas as pd
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--studyRID', help="The study RID.", required=True)
    args = parser.parse_args()
    return args

def main():
    args = get_args()
    # read the ermrest query output saved by splitStudy.sh
    studyRID = pd.read_json(args.studyRID + "_studyRID.json")
    if studyRID["RID"].count() > 0:
        studyRID["RID"].to_csv(args.studyRID + "_studyRID.csv", header=False, index=False)
    else:
        raise Exception("No associated replicates found: %s" %
            studyRID)

if __name__ == '__main__':
    main()
#!/bin/bash
# workflow/scripts/splitStudy.sh (new file)
# query GUDMAP/RBK for all replicates of the given study RID
echo "curl --location --request GET 'https://www.gudmap.org/ermrest/catalog/2/entity/RNASeq:Replicate/Study_RID="${1}"'" | bash > $1_studyRID.json
# extract replicate RIDs
module load python/3.6.4-anaconda
python3 ./workflow/scripts/splitStudy.py -s $1
# run pipeline on replicate RIDs in parallel, 5 at a time
module load nextflow/20.01.0
module load singularity/3.5.3
while read repRID; do echo ${repRID}; done < "$1_studyRID.csv" | xargs -P 5 -I {} nextflow run workflow/rna-seq.nf --repRID {}
# clean up study RID files
rm $1_studyRID.json
rm $1_studyRID.csv
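Note: the xargs -P 5 call keeps at most five pipeline runs in flight at once. A hypothetical Python equivalent of the same fan-out (the CSV name below is a placeholder following the [studyRID]_studyRID.csv convention):

```
import subprocess
from concurrent.futures import ThreadPoolExecutor

def run_pipeline(repRID):
    # one pipeline run per replicate RID, mirroring the xargs command
    print(repRID)
    subprocess.run(["nextflow", "run", "workflow/rna-seq.nf", "--repRID", repRID], check=True)

with open("STUDY-RID_studyRID.csv") as f:  # placeholder study RID
    repRIDs = [line.strip() for line in f if line.strip()]

with ThreadPoolExecutor(max_workers=5) as pool:
    list(pool.map(run_pipeline, repRIDs))
```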
workflow/scripts/tinHist.py:
@@ -15,6 +15,7 @@ def get_args():
def main():
    args = get_args()
    tin = pd.read_csv(args.repRID + '.sorted.deduped.tin.xls', sep="\t", header=0)
+
    hist = pd.cut(tin['TIN'], bins=pd.interval_range(start=0, freq=10, end=100, closed='right')).value_counts(sort=False)
    labels = ["{0} - {1}".format(i, i + 9) for i in range(1, 100, 10)]
    #labels[0] = '0 - 10'
@@ -34,6 +35,9 @@ def main():
    hist = hist.T.fillna(0.0).astype(int)
    #hist = hist.apply(lambda x: x/x.sum()*100, axis=1)
    hist.to_csv(args.repRID + '.tin.hist.tsv', sep='\t')
+    # write the median TIN, excluding transcripts with a TIN of zero
+    medFile = open(args.repRID + '.tin.med.csv', "w")
+    medFile.write(str(round(tin['TIN'][(tin['TIN']!=0)].median(), 2)))
+    medFile.close()

if __name__ == '__main__':
    main()
\ No newline at end of file
pytest consistency tests:
@@ -24,7 +24,7 @@ def readAssigned(fileAssigned,fileExpectAssigned):
    expect = open(fileExpectAssigned, "r")
    lineAssigned = assigned.readline()
    lineExpect = expect.readline()
-    if lineAssigned.strip() == lineExpect.strip():
+    if int(lineAssigned.strip()) < (int(lineExpect.strip())+(int(lineExpect.strip())*0.00001)) and int(lineAssigned.strip()) > (int(lineExpect.strip())-(int(lineExpect.strip())*0.00001)):
        data = True
    return data
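Note: the updated check passes when the assigned-read count is within 0.001% of the expected value instead of requiring an exact match. An equivalent, more readable formulation (a sketch, not the committed test code):

```
def within_tolerance(assigned, expect, rel_tol=0.00001):
    # true when assigned is within rel_tol (0.001%) of expect,
    # mirroring the two-sided bound in readAssigned above
    return abs(assigned - expect) < expect * rel_tol

print(within_tolerance(2599140, 2599149))  # True: off by ~0.00035%
```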
pytest design.csv check:
@@ -17,7 +17,7 @@ def readLine(fileName):
    data = False
    file = open(fileName, "r")
    line = file.readline()
-    if line.strip() == "uk,se,unstranded,no,Homo sapiens,Experiment_RID,Study_RID,Replicate_RID":
+    if line.strip() == "uk,se,unstranded,no,Homo sapiens,75,Experiment_RID,Study_RID,Replicate_RID":
        data = True
    return data