Newer
Older
// ######## #### ###### ########
// ## ## ## ## ## ##
// ## ## ## ## ##
// ######## ## ## ######
// ## ## ## ## ##
// ## ## ## ## ## ##
// ######## #### ###### ##
params.deriva = "${baseDir}/../test_data/auth/credential.json"
params.bdbag = "${baseDir}/../test_data/auth/cookies.txt"
//params.repRID = "16-1ZX4"
params.repRID = "Q-Y5JA"
params.refMoVersion = "38.p6.vM22"
params.refHuVersion = "38.p12.v31"
deriva = Channel
.fromPath(params.deriva)
.ifEmpty { exit 1, "deriva credential file not found: ${params.deriva}" }
deriva.into {
deriva_getBag
deriva_getRefInfer
deriva_getRef
}
bdbag = Channel
.fromPath(params.bdbag)
.ifEmpty { exit 1, "deriva cookie file for bdbag not found: ${params.bdbag}" }
refMoVersion = params.refMoVersion
logsDir = "${outDir}/Logs"
derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json")
if (params.source == "dev") {
source = "dev.gudmap.org"
} else if (params.source == "staging") {
source = "staging.gudmap.org"
} else if (params.source == "production") {
source = "www.gudmap.org"
}
if (params.refSource == "biohpc") {
referenceBase = "/project/BICF/BICF_Core/shared/gudmap/references"
//} else if (params.refSource == "aws") {
// referenceBase = "s3://bicf-references"
} else if (params.refSource == "datahub") {
referenceBase = "dev.gudmap.org"
}
referenceInfer = Channel.fromList(["ERCC","GRCh","GRCm"])
multiqcConfig = Channel.fromPath("${baseDir}/conf/multiqc_config.yaml")
bicfLogo = Channel.fromPath("${baseDir}/../docs/bicf_logo.png")
script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh")
script_parseMeta = Channel.fromPath("${baseDir}/scripts/parseMeta.py")
script_inferMeta = Channel.fromPath("${baseDir}/scripts/inferMeta.sh")
script_refDataInfer = Channel.fromPath("${baseDir}/scripts/extractRefData.py")
script_refData = Channel.fromPath("${baseDir}/scripts/extractRefData.py")
script_calculateTPM = Channel.fromPath("${baseDir}/scripts/calculateTPM.R")
script_convertGeneSymbols = Channel.fromPath("${baseDir}/scripts/convertGeneSymbols.R")
script_tinHist = Channel.fromPath("${baseDir}/scripts/tinHist.py")
curl -H 'Content-Type: application/json' -X PUT -d \
'{ \
"sessionId": "${workflow.sessionId}", \
"pipeline": "gudmap.rbk_rnaseq", \
"start": "${workflow.start}", \
"astrocyte": false, \
"status": "started", \
"nextflowVersion": "${workflow.nextflow.version}", \
"pipelineVersion": "${workflow.manifest.version}", \
"dev": ${params.dev} \
}' \
"https://xku43pcwnf.execute-api.us-east-1.amazonaws.com/ProdDeploy/pipeline-tracking"
log.info """\
====================================
BICF RNA-seq Pipeline for GUDMAP/RBK
====================================
Replicate RID : ${params.repRID}
Source Server : ${params.source}
Human Reference Version: ${params.refHuVersion}
ERCC Reference Version : ${params.refERCCVersion}
Nextflow Version : ${workflow.nextflow.version}
Pipeline Version : ${workflow.manifest.version}
Session ID : ${workflow.sessionId}
CI : ${params.ci}
Development : ${params.dev}
* splitData: split bdbag files by replicate so fetch can occure in parallel, and rename files to replicate rid
publishDir "${outDir}/inputBag", mode: 'copy', pattern: "Replicate_*.zip"
path credential, stageAs: "credential.json" from deriva_getBag
hostname > ${repRID}.getBag.log
ulimit -a >> ${repRID}.getBag.log
# link credential file for authentication
echo -e "LOG: linking deriva credentials" >> ${repRID}.getBag.log
ln -sf `readlink -e credential.json` ~/.deriva/credential.json
echo -e "LOG: fetching bag for ${repRID} in GUDMAP" >> ${repRID}.getBag.log
deriva-download-cli ${source} --catalog 2 ${derivaConfig} . rid=${repRID}
// Set inputBag to downloaded or forced input
if (inputBagForce != "") {
inputBag = Channel
.fromPath(inputBagForce)
.ifEmpty { exit 1, "override inputBag file not found: ${inputBagForce}" }
/*
* getData: fetch study files from consortium with downloaded bdbag.zip
*/
process getData {
path cookies, stageAs: "deriva-cookies.txt" from bdbag
path ("*.R{1,2}.fastq.gz") into fastqs
path ("**/File.csv") into fileMeta
path ("**/Experiment Settings.csv") into experimentSettingsMeta
path ("**/Experiment.csv") into experimentMeta
hostname > ${repRID}.getData.log
ulimit -a >> ${repRID}.getData.log
echo -e "LOG: linking deriva cookie" >> ${repRID}.getData.log
ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt
# get bag basename
replicate=\$(basename "${inputBag}" | cut -d "." -f1)
echo -e "LOG: bag replicate name \${replicate}" >> ${repRID}.getData.log
# unzip bag
echo -e "LOG: unzipping replicate bag" >> ${repRID}.getData.log
echo -e "LOG: fetching replicate bdbag" >> ${repRID}.getData.log
// Set raw fastq to downloaded or forced input and replicate them for multiple process inputs
if (fastqsForce != "") {
Channel
.fromPath(fastqsForce)
.ifEmpty { exit 1, "override inputBag file not found: ${fastqsForce}" }
.collect().into {
fastqs_trimData
fastqs_fastqc
}
} else {
fastqs.into {
fastqs_trimData
fastqs_fastqc
}
/*
* parseMetadata: parses metadata to extract experiment parameters
*/
process parseMetadata {
path file from fileMeta
path experimentSettings, stageAs: "ExperimentSettings.csv" from experimentSettingsMeta
path experiment from experimentMeta
hostname > ${repRID}.parseMetadata.log
ulimit -a >> ${repRID}.parseMetadata.log
rep=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p repRID)
echo -e "LOG: replicate RID metadata parsed: \${rep}" >> ${repRID}.parseMetadata.log
exp=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p expRID)
echo -e "LOG: experiment RID metadata parsed: \${exp}" >> ${repRID}.parseMetadata.log
study=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p studyRID)
echo -e "LOG: study RID metadata parsed: \${study}" >> ${repRID}.parseMetadata.log
endsMeta=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p endsMeta)
echo -e "LOG: endedness metadata parsed: \${endsMeta}" >> ${repRID}.parseMetadata.log
endsManual=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p endsManual)
echo -e "LOG: endedness manually detected: \${endsManual}" >> ${repRID}.parseMetadata.log
stranded=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p stranded)
echo -e "LOG: strandedness metadata parsed: \${stranded}" >> ${repRID}.parseMetadata.log
spike=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p spike)
echo -e "LOG: spike-in metadata parsed: \${spike}" >> ${repRID}.parseMetadata.log
species=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experiment}" -p species)
echo -e "LOG: species metadata parsed: \${species}" >> ${repRID}.parseMetadata.log
readLength=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p readLength)
if [ "\${readLength}" = "nan"]
echo -e "LOG: read length metadata parsed: \${readLength}" >> ${repRID}.parseMetadata.log
# save design file
echo -e "\${endsMeta},\${endsManual},\${stranded},\${spike},\${species},\${readLength},\${exp},\${study}" > design.csv
// Split metadata into separate channels
endsMeta = Channel.create()
endsManual = Channel.create()
strandedMeta = Channel.create()
spikeMeta = Channel.create()
speciesMeta = Channel.create()
readLengthMeta = Channel.create()
expRID = Channel.create()
studyRID = Channel.create()
metadata.splitCsv(sep: ",", header: false).separate(
strandedMeta,
spikeMeta,
// Replicate metadata for multiple process inputs
/*
* trimData: trims any adapter or non-host sequences from the data
*/
process trimData {
tag "${repRID}"
input:
val ends from endsManual_trimData
path (fastq) from fastqs_trimData
output:
path ("*.fq.gz") into fastqsTrim
path ("*_trimming_report.txt") into trimQC
path ("readLength.csv") into inferMetadata_readLength
hostname > ${repRID}.trimData.log
ulimit -a >> ${repRID}.trimData.log
# trim fastq's using trim_galore and extract median read length
echo -e "LOG: trimming ${ends}" >> ${repRID}.trimData.log
if [ "${ends}" == "se" ]
then
trim_galore --gzip -q 25 --length 35 --basename ${repRID} ${fastq[0]}
readLength=\$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
elif [ "${ends}" == "pe" ]
then
trim_galore --gzip -q 25 --length 35 --paired --basename ${repRID} ${fastq[0]} ${fastq[1]}
readLength=\$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
echo -e "LOG: average trimmed read length: \${readLength}" >> ${repRID}.trimData.log
# save read length file
echo -e "\${readLength}" > readLength.csv
// Extract calculated read length metadata into channel
readLengthInfer = Channel.create()
inferMetadata_readLength.splitCsv(sep: ",", header: false).separate(
readLengthInfer
// Replicate trimmed fastq's
fastqsTrim.into {
fastqsTrim_alignData
fastqsTrim_downsampleData
// Combine inputs of getRefInfer
getRefInferInput = referenceInfer.combine(deriva_getRefInfer.combine(script_refDataInfer))
* getRefInfer: dowloads appropriate reference for metadata inference
tuple val (refName), path (credential, stageAs: "credential.json"), path (script_refDataInfer) from getRefInferInput
tuple val (refName), path ("hisat2", type: 'dir'), path ("*.fna"), path ("*.gtf") into refInfer
path ("${refName}", type: 'dir') into bedInfer
hostname > ${repRID}.${refName}.getRefInfer.log
ulimit -a >> ${repRID}.${refName}.getRefInfer.log
# link credential file for authentication
echo -e "LOG: linking deriva credentials" >> ${repRID}.getRefInfer.log
mkdir -p ~/.deriva
ln -sf `readlink -e credential.json` ~/.deriva/credential.json
echo -e "LOG: linked" >> ${repRID}.getRefInfer.log
if [ "${refName}" == "ERCC" ]
then
references=\$(echo ${referenceBase}/ERCC${refERCCVersion})
elif [ "${refName}" == "GRCm" ]
then
references=\$(echo ${referenceBase}/GRCm${refMoVersion})
elif [ '${refName}' == "GRCh" ]
then
references=\$(echo ${referenceBase}/GRCh${refHuVersion})
else
echo -e "LOG: ERROR - References could not be set!\nReference found: ${referenceBase}" >> ${repRID}.${refName}.getRefInfer.log
# retreive appropriate reference appropriate location
echo -e "LOG: fetching ${refName} reference files from ${referenceBase}" >> ${repRID}.${refName}.getRefInfer.log
if [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references" ]
ln -s "\${references}"/hisat2
ln -s "\${references}"/bed ${refName}/bed
ln -s "\${references}"/genome.fna
ln -s "\${references}"/genome.gtf
#elif [ ${referenceBase} == "s3://bicf-references" ]
#then
# aws s3 cp "\${references}"/hisat2 ./hisat2 --recursive
# aws s3 cp "\${references}"/bed ./${refName}/bed --recursive
# aws s3 cp "\${references}"/genome.fna ./
# aws s3 cp "\${references}"/genome.gtf ./
elif [ ${referenceBase} == "dev.gudmap.org" ]
then
GRCv=\$(echo \${references} | grep -o ${refName}.* | cut -d '.' -f1)
GRCp=\$(echo \${references} | grep -o ${refName}.* | cut -d '.' -f2)
GENCODE=\$(echo \${references} | grep -o ${refName}.* | cut -d '.' -f3)
if [ "${refName}" != "ERCC" ]
then
query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='\${GRCv}'.'\${GRCp}'/Annotation_Version=GENCODE%20'\${GENCODE})
else
query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version=${refName}${refERCCVersion}/Annotation_Version=${refName}${refERCCVersion}')
fi
curl --request GET \${query} > refQuery.json
fName=\${fName%.*}
if [ "\${loc}" = "/hatrac/*" ]; then echo "LOG: Reference not present in hatrac"; exit 1; fi
filename=\$(echo \$(basename \${refURL}) | grep -oP '.*(?=:)')
deriva-hatrac-cli --host ${referenceBase} get \${refURL}
unzip \$(basename \${refURL})
echo -e "LOG: fetched" >> ${repRID}.${refName}.getRefInfer.log
echo -e "LOG: making dummy bed folder for ERCC" >> ${repRID}.${refName}.getRefInfer.log
if [ "${refName}" == "ERCC" ]
elif [ ${referenceBase} == "dev.gudmap.org" ]
fi
"""
}
/*
* downsampleData: downsample fastq's for metadata inference
*/
process downsampleData {
tag "${repRID}"
input:
val ends from endsManual_downsampleData
path fastq from fastqsTrim_downsampleData
output:
path ("sampled.1.fq") into fastqs1Sample
path ("sampled.2.fq") into fastqs2Sample
hostname > ${repRID}.downsampleData.log
ulimit -a >> ${repRID}.downsampleData.log
if [ "${ends}" == "se" ]
then
echo -e "LOG: downsampling SE trimmed fastq" >> ${repRID}.downsampleData.log
seqtk sample -s100 *trimmed.fq.gz 100000 1> sampled.1.fq
elif [ "${ends}" == "pe" ]
then
echo -e "LOG: downsampling R1 of PE trimmed fastq" >> ${repRID}.downsampleData.log
seqtk sample -s100 *1.fq.gz 1000000 1> sampled.1.fq
echo -e "LOG: downsampling R2 of PE trimmed fastq" >> ${repRID}.downsampleData.log
seqtk sample -s100 *2.fq.gz 1000000 1> sampled.2.fq
echo -e "LOG: downsampled" >> ${repRID}.downsampleData.log
// Replicate the dowsampled fastq's and attatched to the references
inferInput = endsManual_alignSampleData.combine(refInfer.combine(fastqs1Sample.collect().combine(fastqs2Sample.collect())))
/*
* alignSampleData: aligns the downsampled reads to a reference database
*/
process alignSampleData {
tag "${ref}"
input:
tuple val (ends), val (ref), path (hisat2), path (fna), path (gtf), path (fastq1), path (fastq2) from inferInput
output:
path ("${ref}.sampled.sorted.bam") into sampleBam
path ("${ref}.sampled.sorted.bam.bai") into sampleBai
path ("${ref}.alignSampleSummary.txt") into alignSampleQC
script:
"""
hostname > ${repRID}.${ref}.alignSampleData.log
ulimit -a >> ${repRID}.${ref}.alignSampleData.log
# align the reads with Hisat2
echo -e "LOG: aligning ${ends}" >> ${repRID}.${ref}.alignSampleData.log
if [ "${ends}" == "se" ]
then
hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome -U ${fastq1} --summary-file ${ref}.alignSampleSummary.txt --new-summary
elif [ "${ends}" == "pe" ]
then
hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome --no-mixed --no-discordant -1 ${fastq1} -2 ${fastq2} --summary-file ${ref}.alignSampleSummary.txt --new-summary
echo -e "LOG: aliged" >> ${repRID}.${ref}.alignSampleData.log
# convert the output sam file to a sorted bam file using Samtools
echo -e "LOG: converting from sam to bam" >> ${repRID}.${ref}.alignSampleData.log
samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${ref}.sampled.bam ${ref}.sampled.sam
echo -e "LOG: sorting the bam file" >> ${repRID}.${ref}.alignSampleData.log
samtools sort -@ `nproc` -O BAM -o ${ref}.sampled.sorted.bam ${ref}.sampled.bam
echo -e "LOG: indexing sorted bam file" >> ${repRID}.${ref}.alignSampleData.log
samtools index -@ `nproc` -b ${ref}.sampled.sorted.bam ${ref}.sampled.sorted.bam.bai
alignSampleQC.into {
alignSampleQC_inferMetadata
alignSampleQC_aggrQC
}
process inferMetadata {
tag "${repRID}"
input:
path script_inferMeta
path beds from bedInfer.collect()
path bam from sampleBam.collect()
path bai from sampleBai.collect()
path alignSummary from alignSampleQC_inferMetadata.collect()
output:
path "infer.csv" into inferMetadata
path "${repRID}.infer_experiment.txt" into inferExperiment
hostname > ${repRID}.inferMetadata.log
ulimit -a >> ${repRID}.inferMetadata.log
# collect alignment rates (round down to integers)
align_ercc=\$(echo \$(grep "Overall alignment rate" ERCC.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%'))
echo -e "LOG: alignment rate to ERCC: \${align_ercc}" >> ${repRID}.inferMetadata.log
align_hu=\$(echo \$(grep "Overall alignment rate" GRCh.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%'))
echo -e "LOG: alignment rate to GRCh: \${align_hu}" >> ${repRID}.inferMetadata.log
align_mo=\$(echo \$(grep "Overall alignment rate" GRCm.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%'))
echo -e "LOG: alignment rate to GRCm: \${align_mo}" >> ${repRID}.inferMetadata.log
if [ 1 -eq \$(echo \$(expr \${align_ercc} ">=" 10)) ]
then
spike="yes"
else
spike="no"
fi
echo -e "LOG: inference of strandedness results is: \${spike}" >> ${repRID}.inferMetadata.log
if [ 1 -eq \$(echo \$(expr \${align_hu} ">=" 40)) ] && [ 1 -eq \$(echo \$(expr \${align_mo} "<" 40)) ]
then
species="Homo sapiens"
bam="GRCh.sampled.sorted.bam"
bed="./GRCh/bed/genome.bed"
elif [ 1 -eq \$(echo \$(expr \${align_mo} ">=" 40)) ] && [ 1 -eq \$(echo \$(expr \${align_hu} "<" 40)) ]
then
species="Mus musculus"
bam="GRCm.sampled.sorted.bam"
bed="./GRCm/bed/genome.bed"
else
echo -e "LOG: ERROR - inference of species returns an ambiguous result: hu=\${align_hu} mo=\${align_mo}" >> ${repRID}.inferMetadata.log
if [ "${speciesForce}" == "" ]
then
exit 1
fi
fi
if [ "${speciesForce}" != "" ]
then
echo -e "LOG: species overridden to: ${speciesForce}"
species="${speciesForce}"
if [ "${speciesForce}" == "Homo sapiens" ]
then
bam="GRCh.sampled.sorted.bam"
bed="./GRCh/bed/genome.bed"
elif [ "${speciesForce}" == "Mus musculus" ]
then
bam="GRCm.sampled.sorted.bam"
bed="./GRCm/bed/genome.bed"
fi
echo -e "LOG: inference of species results in: \${species}" >> ${repRID}.inferMetadata.log
# infer experimental setting from dedup bam
echo -e "LOG: infer experimental setting from dedup bam" >> ${repRID}.inferMetadata.log
infer_experiment.py -r "\${bed}" -i "\${bam}" 1>> ${repRID}.infer_experiment.txt
ended=`bash inferMeta.sh endness ${repRID}.infer_experiment.txt`
fail=`bash inferMeta.sh fail ${repRID}.infer_experiment.txt`
percentF=`bash inferMeta.sh pef ${repRID}.infer_experiment.txt`
percentR=`bash inferMeta.sh per ${repRID}.infer_experiment.txt`
elif [ \${ended} == "SingleEnd" ]
then
ends="se"
percentF=`bash inferMeta.sh sef ${repRID}.infer_experiment.txt`
percentR=`bash inferMeta.sh ser ${repRID}.infer_experiment.txt`
echo -e "LOG: percentage reads in the same direction as gene: \${percentF}" >> ${repRID}.inferMetadata.log
echo -e "LOG: percentage reads in the opposite direction as gene: \${percentR}" >> ${repRID}.inferMetadata.log
if [ 1 -eq \$(echo \$(expr \${percentF#*.} ">" 2500)) ] && [ 1 -eq \$(echo \$(expr \${percentR#*.} "<" 2500)) ]
then
stranded="forward"
elif [ 1 -eq \$(echo \$(expr \${percentR#*.} ">" 2500)) ] && [ 1 -eq \$(echo \$(expr \${percentF#*.} "<" 2500)) ]
then
stranded="reverse"
else
stranded="unstranded"
fi
echo -e "LOG: stradedness set to: \${stranded}" >> ${repRID}.inferMetadata.log
# write infered metadata to file
echo "\${ends},\${stranded},\${spike},\${species},\${align_ercc},\${align_hu},\${align_mo},\${percentF},\${percentR},\${fail}" 1>> infer.csv
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
"""
}
// Split metadata into separate channels
endsInfer = Channel.create()
strandedInfer = Channel.create()
spikeInfer = Channel.create()
speciesInfer = Channel.create()
align_erccInfer = Channel.create()
align_huInfer = Channel.create()
align_moInfer = Channel.create()
percentFInfer = Channel.create()
percentRInfer = Channel.create()
failInfer = Channel.create()
inferMetadata.splitCsv(sep: ",", header: false).separate(
endsInfer,
strandedInfer,
spikeInfer,
speciesInfer,
align_erccInfer,
align_huInfer,
align_moInfer,
percentFInfer,
percentRInfer,
failInfer
)
// Replicate metadata for multiple process inputs
endsInfer.into {
endsInfer_alignData
endsInfer_countData
}
strandedInfer.into {
strandedInfer_alignData
strandedInfer_countData
}
spikeInfer.into{
spikeInfer_getRef
}
speciesInfer.into {
speciesInfer_getRef
path credential, stageAs: "credential.json" from deriva_getRef
val spike from spikeInfer_getRef
val species from speciesInfer_getRef
tuple path ("hisat2", type: 'dir'), path ("bed", type: 'dir'), path ("*.fna"), path ("*.gtf"), path ("geneID.tsv"), path ("Entrez.tsv") into reference
hostname > ${repRID}.getRef.log
ulimit -a >> ${repRID}.getRef.log
# link credential file for authentication
echo -e "LOG: linking deriva credentials" >> ${repRID}.getRef.log
mkdir -p ~/.deriva
ln -sf `readlink -e credential.json` ~/.deriva/credential.json
echo -e "LOG: linked" >> ${repRID}.getRef.log
if [ "${species}" == "Mus musculus" ]
references=\$(echo ${referenceBase}/GRCm${refMoVersion})
elif [ '${species}' == "Homo sapiens" ]
references=\$(echo ${referenceBase}/GRCh${refHuVersion})
echo -e "LOG: ERROR - References could not be set!\nSpecies reference found: ${species}" >> ${repRID}.getRef.log
if [ "${spike}" == "yes" ]
elif [ "${spike}" == "no" ]
then
reference=\$(echo \${references}/)
fi
echo -e "LOG: species set to \${references}" >> ${repRID}.getRef.log
# retreive appropriate reference appropriate location
echo -e "LOG: fetching ${species} reference files from ${referenceBase}" >> ${repRID}.getRef.log
if [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references" ]
echo -e "LOG: grabbing reference files from local (BioHPC)" >> ${repRID}.getRef.log
ln -s "\${references}"/hisat2
ln -s "\${references}"/bed
ln -s "\${references}"/genome.fna
ln -s "\${references}"/genome.gtf
ln -s "\${references}"/geneID.tsv
ln -s "\${references}"/Entrez.tsv
#elif [ ${referenceBase} == "s3://bicf-references" ]
#then
# echo -e "LOG: grabbing reference files from S3" >> ${repRID}.getRef.log
# aws s3 cp "\${references}"/hisat2 ./hisat2 --recursive
# aws s3 cp "\${references}"/bed ./bed --recursive
# aws s3 cp "\${references}"/genome.fna ./
# aws s3 cp "\${references}"/genome.gtf ./
# aws s3 cp "\${references}"/geneID.tsv ./
# aws s3 cp "\${references}"/Entrez.tsv ./
elif [ ${referenceBase} == "dev.gudmap.org" ]
then
echo -e "LOG: grabbing reference files from datahub" >> ${repRID}.getRef.log
GRCv=\$(echo \${references} | grep -o \${refName}.* | cut -d '.' -f1)
GRCp=\$(echo \${references} | grep -o \${refName}.* | cut -d '.' -f2)
GENCODE=\$(echo \${references} | grep -o \${refName}.* | cut -d '.' -f3)
query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='\${GRCv}'.'\${GRCp}'/Annotation_Version=GENCODE%20'\${GENCODE})
curl --request GET \${query} > refQuery.json
fName=\${fName%.*}
if [ "\${loc}" = "/hatrac/*" ]; then echo "LOG: Reference not present in hatrac"; exit 1; fi
filename=\$(echo \$(basename \${refURL}) | grep -oP '.*(?=:)')
deriva-hatrac-cli --host ${referenceBase} get \${refURL}
unzip \$(basename \${refURL})
// Replicate reference for multiple process inputs
reference.into {
reference_alignData
reference_countData
reference_dataQC
* alignData: aligns the reads to a reference database
tag "${repRID}"
val ends from endsInfer_alignData
val stranded from strandedInfer_alignData
tuple path ("${repRID}.sorted.bam"), path ("${repRID}.sorted.bam.bai") into rawBam
hostname > ${repRID}.align.log
ulimit -a >> ${repRID}.align.log
if [ "${stranded}"=="unstranded" ]
then
strandedParam=""
elif [ "${stranded}" == "forward" ] && [ "${ends}" == "se" ]
then
strandedParam="--rna-strandness F"
elif [ "${stranded}" == "forward" ] && [ "${ends}" == "pe" ]
then
strandedParam="--rna-strandness FR"
elif [ "${stranded}" == "reverse" ] && [ "${ends}" == "se" ]
then
strandedParam="--rna-strandness R"
elif [ "${stranded}" == "reverse" ] && [ "${ends}" == "pe" ]
then
# align the reads with Hisat2
echo -e "LOG: aligning ${ends}" >> ${repRID}.align.log
hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome \${strandedParam} -U ${fastq[0]} --summary-file ${repRID}.alignSummary.txt --new-summary
elif [ "${ends}" == "pe" ]
hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome \${strandedParam} --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} --summary-file ${repRID}.alignSummary.txt --new-summary
# convert the output sam file to a sorted bam file using Samtools
echo -e "LOG: converting from sam to bam" >> ${repRID}.align.log
samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam
echo -e "LOG: sorting the bam file" >> ${repRID}.align.log
samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.bam ${repRID}.bam
echo -e "LOG: indexing sorted bam file" >> ${repRID}.align.log
samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bam.bai
// Replicate rawBam for multiple process inputs
rawBam.into {
rawBam_dedupData
}
*dedupData: mark the duplicate reads, specifically focused on PCR or optical duplicates
publishDir "${outDir}/bam", mode: 'copy', pattern: "*.deduped.bam"
tuple path (bam), path (bai) from rawBam_dedupData
tuple path ("${repRID}.sorted.deduped.bam"), path ("${repRID}.sorted.deduped.bam.bai") into dedupBam
tuple path ("${repRID}.sorted.deduped.*.bam"), path ("${repRID}.sorted.deduped.*.bam.bai") into dedupChrBam
hostname > ${repRID}.dedup.log
ulimit -a >> ${repRID}.dedup.log
# remove duplicated reads using Picard's MarkDuplicates
echo -e "LOG: deduplicating reads" >> ${repRID}.dedup.log
java -jar /picard/build/libs/picard.jar MarkDuplicates I=${bam} O=${repRID}.deduped.bam M=${repRID}.deduped.Metrics.txt REMOVE_DUPLICATES=true
echo -e "LOG: sorting the bam file" >> ${repRID}.dedup.log
samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.deduped.bam ${repRID}.deduped.bam
echo -e "LOG: indexing sorted bam file" >> ${repRID}.dedup.log
samtools index -@ `nproc` -b ${repRID}.sorted.deduped.bam ${repRID}.sorted.deduped.bam.bai
# split the deduped BAM file for multi-threaded tin calculation
for i in `samtools view ${repRID}.sorted.deduped.bam | cut -f3 | sort | uniq`;
do
echo "echo \"LOG: splitting each chromosome into its own BAM and BAI files with Samtools\"; samtools view -b ${repRID}.sorted.deduped.bam \${i} 1>> ${repRID}.sorted.deduped.\${i}.bam; samtools index -@ `nproc` -b ${repRID}.sorted.deduped.\${i}.bam ${repRID}.sorted.deduped.\${i}.bam.bai"
done | parallel -j `nproc` -k
// Replicate dedup bam/bai for multiple process inputs
dedupBam.into {
*/
process makeBigWig {
tag "${repRID}"
publishDir "${outDir}/bigwig", mode: 'copy', pattern: "${repRID}.bw"
tuple path (bam), path (bai) from dedupBam_makeBigWig
output:
path ("${repRID}.bw")
script:
"""
hostname > ${repRID}.makeBigWig.log
ulimit -a >> ${repRID}.makeBigWig.log
# create bigwig
echo -e "LOG: creating bibWig" >> ${repRID}.makeBigWig.log
bamCoverage -p `nproc` -b ${bam} -o ${repRID}.bw
publishDir "${outDir}/count", mode: 'copy', pattern: "${repRID}*.tpmTable.csv"
path script_convertGeneSymbols
tuple path (bam), path (bai) from dedupBam_countData
path ref from reference_countData
val ends from endsInfer_countData
val stranded from strandedInfer_countData
path ("*.tpmTable.csv") into counts
path ("*.countData.summary") into countsQC
path ("assignedReads.csv") into inferMetadata_assignedReads
hostname > ${repRID}.countData.log
ulimit -a >> ${repRID}.countData.log
# determine strandedness and setup strandig for countData
if [ "${stranded}" == "unstranded" ]
then
echo -e "LOG: strandedness set to unstranded [0]" >> ${repRID}.countData.log
elif [ "${stranded}" == "forward" ]
echo -e "LOG: strandedness set to forward stranded [1]" >> ${repRID}.countData.log
elif [ "${stranded}" == "reverse" ]
echo -e "LOG: strandedness set to reverse stranded [2]" >> ${repRID}.countData.log