// ########  ####   ######  ########
// ##     ##  ##   ##    ## ##
// ##     ##  ##   ##       ##
// ########   ##   ##       ######
// ##     ##  ##   ##       ##
// ##     ##  ##   ##    ## ##
// ########  ####   ######  ##
params.deriva = "${baseDir}/../test_data/auth/credential.json"
params.bdbag = "${baseDir}/../test_data/auth/cookies.txt"
params.refMoVersion = "38.p6.vM25"
params.refHuVersion = "38.p13.v36"
params.strandedForce = ""
params.spikeForce = ""
// Define tracking input variables
params.ci = false
params.dev = true
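// Example invocation overriding the defaults above (values are illustrative,
// not from a real study; the flags mirror the params.* names used in this script):
//   nextflow run rna-seq.nf --repRID <RID> --source staging --refSource datahub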
deriva = Channel
.fromPath(params.deriva)
.ifEmpty { exit 1, "deriva credential file not found: ${params.deriva}" }
deriva.into {
deriva_getBag
deriva_getRefInfer
deriva_getRef
deriva_uploadProcessedFile
deriva_finalizeExecutionRun
}
bdbag = Channel
.fromPath(params.bdbag)
.ifEmpty { exit 1, "deriva cookie file for bdbag not found: ${params.bdbag}" }
refMoVersion = params.refMoVersion
refHuVersion = params.refHuVersion
refERCCVersion = params.refERCCVersion
repRID = params.repRID
inputBagForce = params.inputBagForce
fastqsForce = params.fastqsForce
logsDir = "${outDir}/Logs"
strandedForce = params.strandedForce
spikeForce = params.spikeForce
// Define fixed files and variables
replicateExportConfig = Channel.fromPath("${baseDir}/conf/Replicate_For_Input_Bag.json")
executionRunExportConfig = Channel.fromPath("${baseDir}/conf/Execution_Run_For_Output_Bag.json")
if (params.source == "dev") {
source = "dev.gudmap.org"
} else if (params.source == "staging") {
source = "staging.gudmap.org"
} else if (params.source == "production") {
source = "www.gudmap.org"
}
// Define reference base location by source
if (params.refSource == "biohpc") {
referenceBase = "/project/BICF/BICF_Core/shared/gudmap/references/new"
} else if (params.refSource == "datahub") {
referenceBase = "www.gudmap.org"
}
referenceInfer = Channel.fromList(["ERCC","GRCh","GRCm"])
multiqcConfig = Channel.fromPath("${baseDir}/conf/multiqc_config.yaml")
bicfLogo = Channel.fromPath("${baseDir}/../docs/bicf_logo.png")
softwareReferences = Channel.fromPath("${baseDir}/../docs/software_references_mqc.yaml")
softwareVersions = Channel.fromPath("${baseDir}/../docs/software_versions_mqc.yaml")
script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbag_fetch.sh")
script_parseMeta = Channel.fromPath("${baseDir}/scripts/parse_meta.py")
script_inferMeta = Channel.fromPath("${baseDir}/scripts/infer_meta.sh")
script_refDataInfer = Channel.fromPath("${baseDir}/scripts/extract_ref_data.py")
script_refData = Channel.fromPath("${baseDir}/scripts/extract_ref_data.py")
script_calculateTPM = Channel.fromPath("${baseDir}/scripts/calculateTPM.R")
script_convertGeneSymbols = Channel.fromPath("${baseDir}/scripts/convertGeneSymbols.R")
script_tinHist = Channel.fromPath("${baseDir}/scripts/tin_hist.py")
script_uploadInputBag = Channel.fromPath("${baseDir}/scripts/upload_input_bag.py")
script_uploadExecutionRun_uploadExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py")
script_uploadExecutionRun_finalizeExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py")
script_uploadExecutionRun_failPreExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py")
script_uploadExecutionRun_failExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py")
script_uploadQC = Channel.fromPath("${baseDir}/scripts/upload_qc.py")
script_uploadQC_fail = Channel.fromPath("${baseDir}/scripts/upload_qc.py")
script_uploadOutputBag = Channel.fromPath("${baseDir}/scripts/upload_output_bag.py")
script_deleteEntry_uploadQC = Channel.fromPath("${baseDir}/scripts/delete_entry.py")
script_deleteEntry_uploadQC_fail = Channel.fromPath("${baseDir}/scripts/delete_entry.py")
script_deleteEntry_uploadProcessedFile = Channel.fromPath("${baseDir}/scripts/delete_entry.py")
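// NOTE: Nextflow DSL1 queue channels are single-consumer, so each consuming
// process gets its own Channel.fromPath of the same script (and .into{} is used
// below to fan single values out to multiple processes).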
/*
* trackStart: track start of pipeline
*/
process trackStart {
container 'docker://gudmaprbk/gudmap-rbk_base:1.0.0'
script:
"""
hostname
ulimit -a
curl -H 'Content-Type: application/json' -X PUT -d \
'{ \
"sessionId": "${workflow.sessionId}", \
"pipeline": "gudmap.rbk_rnaseq", \
"start": "${workflow.start}", \
"repRID": "${repRID}", \
"astrocyte": false, \
"status": "started", \
"nextflowVersion": "${workflow.nextflow.version}", \
"pipelineVersion": "${workflow.manifest.version}", \
"ci": ${params.ci}, \
"dev": ${params.dev} \
}' \
"https://xku43pcwnf.execute-api.us-east-1.amazonaws.com/ProdDeploy/pipeline-tracking"
if [ ${params.track} == true ]
then
curl -H 'Content-Type: application/json' -X PUT -d \
'{ \
"ID": "${workflow.sessionId}", \
"repRID": "${repRID}", \
"PipelineVersion": "${workflow.manifest.version}", \
"Server": "${params.source}", \
"Queued": "NA", \
"CheckedOut": "NA", \
"Started": "${workflow.start}" \
}' \
"https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track"
fi
"""
log.info """\
====================================
BICF RNA-seq Pipeline for GUDMAP/RBK
====================================
Replicate RID : ${params.repRID}
Source Server : ${params.source}
Human Reference Version: ${params.refHuVersion}
Mouse Reference Version: ${params.refMoVersion}
ERCC Reference Version : ${params.refERCCVersion}
Upload : ${upload}
Nextflow Version : ${workflow.nextflow.version}
Pipeline Version : ${workflow.manifest.version}
Session ID : ${workflow.sessionId}
CI : ${params.ci}
Development : ${params.dev}
"""
/*
* getBag: downloads the input bag
*/
process getBag {
tag "${repRID}"
publishDir "${outDir}/inputBag", mode: 'copy', pattern: "*_inputBag_*.zip"

input:
path credential, stageAs: "credential.json" from deriva_getBag
path replicateExportConfig

output:
path ("*.zip") into bag

script:
"""
hostname > ${repRID}.getBag.log
ulimit -a >> ${repRID}.getBag.log
# link credential file for authentication
echo -e "LOG: linking deriva credentials" >> ${repRID}.getBag.log
mkdir -p ~/.deriva
ln -sf `readlink -e credential.json` ~/.deriva/credential.json
echo -e "LOG: fetching bag for ${repRID} in GUDMAP" >> ${repRID}.getBag.log
deriva-download-cli ${source} --catalog 2 ${replicateExportConfig} . rid=${repRID}
name=\$(ls *.zip)
name=\$(basename \${name} | cut -d "." -f1)
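# stamp the bag filename with today's date (YYYYMMDD) so repeated downloads stay distinguishable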
yr=\$(date +'%Y')
mn=\$(date +'%m')
dy=\$(date +'%d')
mv \${name}.zip \${name}_\${yr}\${mn}\${dy}.zip
"""
}
// Set inputBag to downloaded or forced input
if (inputBagForce != "") {
inputBag = Channel
.fromPath(inputBagForce)
.ifEmpty { exit 1, "override inputBag file not found: ${inputBagForce}" }
inputBag.into {
inputBag_getData
inputBag_uploadInputBag
}
/*
* getData: fetch replicate files from consortium with downloaded bdbag.zip
*/
process getData {
tag "${repRID}"

input:
path script_bdbagFetch
path cookies, stageAs: "deriva-cookies.txt" from bdbag
path inputBag from inputBag_getData

output:
path ("*.R{1,2}.fastq.gz") into fastqs
path ("**/File.csv") into fileMeta
path ("**/Experiment Settings.csv") into experimentSettingsMeta
path ("**/Experiment.csv") into experimentMeta
path "fastqCount.csv" into fastqCount_fl
script:
"""
hostname > ${repRID}.getData.log
ulimit -a >> ${repRID}.getData.log
echo -e "LOG: linking deriva cookie" >> ${repRID}.getData.log
mkdir -p ~/.bdbag
ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt
echo -e "LOG: linked" >> ${repRID}.getData.log

# get bag basename
replicate=\$(basename "${inputBag}")
echo -e "LOG: bag replicate name \${replicate}" >> ${repRID}.getData.log
# unzip bag
echo -e "LOG: unzipping replicate bag" >> ${repRID}.getData.log
echo -e "LOG: fetching replicate bdbag" >> ${repRID}.getData.log
sh ${script_bdbagFetch} \${replicate::-13} ${repRID}
fastqCount=\$(ls *.fastq.gz | wc -l)
if [ "\${fastqCount}" == "0" ]
then
touch dummy.R1.fastq.gz
fi

# save fastq count file
echo "\${fastqCount}" > fastqCount.csv
"""
}
// Split fastq count into channel
fastqCount = Channel.create()
fastqCount_fl.splitCsv(sep: ",", header: false).separate(
fastqCount
)
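// fastqCount carries the number of fastqs getData fetched; parseMetadata uses it
// to manually detect endedness and to validate the submitted metadata.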
// Set raw fastq to downloaded or forced input and replicate them for multiple process inputs
if (fastqsForce != "") {
Channel
.fromPath(fastqsForce)
.ifEmpty { exit 1, "override fastq file(s) not found: ${fastqsForce}" }
.collect().into {
fastqs_parseMetadata
fastqs_fastqc
}
} else {
fastqs.collect().into {
fastqs_parseMetadata
fastqs_fastqc
}
}
/*
* parseMetadata: parses metadata to extract experiment parameters
*/
process parseMetadata {
tag "${repRID}"

input:
path script_parseMeta
path file from fileMeta
path experimentSettings, stageAs: "ExperimentSettings.csv" from experimentSettingsMeta
path experiment from experimentMeta
path (fastq) from fastqs_parseMetadata.collect()
val fastqCount

output:
path "design.csv" into metadata_fl
path "fastqError.csv" into fastqError_fl

script:
"""
hostname > ${repRID}.parseMetadata.log
ulimit -a >> ${repRID}.parseMetadata.log
rep=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p repRID)
echo -e "LOG: replicate RID metadata parsed: \${rep}" >> ${repRID}.parseMetadata.log
exp=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p expRID)
echo -e "LOG: experiment RID metadata parsed: \${exp}" >> ${repRID}.parseMetadata.log
study=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p studyRID)
echo -e "LOG: study RID metadata parsed: \${study}" >> ${repRID}.parseMetadata.log
endsRaw=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p endsMeta)
echo -e "LOG: endedness metadata parsed: \${endsRaw}" >> ${repRID}.parseMetadata.log
if [ "\${endsRaw}" == "Single End" ]
then
endsMeta="se"
elif [ "\${endsRaw}" == "Paired End" ]
then
endsMeta="pe"
# "Single Read" deprecated as of Jan 2021, this option is present for backwards compatibility
elif [ "\${endsRaw}" == "Single Read" ]
then
endsMeta="se"
elif [ "\${endsRaw}" == "nan" ]
then
endsRaw="_No value_"
endsMeta="NA"
fi

# manually detect endedness from the number of fastqs
if [ "${fastqCount}" == "1" ]
then
endsManual="se"
else
endsManual="pe"
fi
echo -e "LOG: endedness manually detected: \${endsManual}" >> ${repRID}.parseMetadata.log
stranded=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p stranded)
echo -e "LOG: strandedness metadata parsed: \${stranded}" >> ${repRID}.parseMetadata.log
if [ "\${stranded}" == "nan" ]
then
stranded="_No value_"
fi
spike=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p spike)
echo -e "LOG: spike-in metadata parsed: \${spike}" >> ${repRID}.parseMetadata.log
if [ "\${spike}" == "nan" ]
then
spike="_No value_"
fi
if [ "\${spike}" == "f" ]
then
spike="false"
elif [ "\${spike}" == "t" ]
then
spike="true"
# "yes"/"no" depreciated as of Jan 2021, this option is present for backwards compatibility
then
spike="false"
# "yes"/"no" depreciated as of Jan 2021, this option is present for backwards compatibility
then
spike="true"
elif [ "\${spike}" == "nan" ]
then
endsRaw="_No value_"
endsMeta="NA"
fi
species=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experiment}" -p species)
echo -e "LOG: species metadata parsed: \${species}" >> ${repRID}.parseMetadata.log
if [ "\${species}" == "nan" ]
then
species="_No value_"
fi
readLength=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p readLength)
echo -e "LOG: read length metadata parsed: \${readLength}" >> ${repRID}.parseMetadata.log
# check for an incorrect number of fastqs
fastqCountError=false
fastqCountError_details=""
if [ "${fastqCount}" -gt "2" ]
then
fastqCountError=true
fastqCountError_details="**Too many fastqs detected (>2)**"
elif [ "${fastqCount}" -eq "0" ]
then
fastqCountError=true
fastqCountError_details="**No valid fastqs detected \\(may not match {_.}R{12}.fastq.gz convention\\)**"
elif [ "\${endsMeta}" == "se" ] && [ "${fastqCount}" -ne "1" ]
then
fastqCountError=true
fastqCountError_details="**Number of fastqs detected does not match submitted endedness**"
elif [ "\${endsMeta}" == "pe" ] && [ "${fastqCount}" -ne "2" ]
then
fastqCountError=true
fastqCountError_details="**Number of fastqs detected does not match submitted endedness**"
fi
# check that the number of reads match for R1 and R2 (paired end only)
fastqReadError=false
fastqReadError_details=""
if [ "\${endsManual}" == "pe" ]
then
r1Count=\$(zcat ${fastq[0]} | wc -l)
r2Count=\$(zcat ${fastq[1]} | wc -l)
if [ "\${r1Count}" -ne "\${r2Count}" ]
then
fastqReadError=true
fastqReadError_details="**Number of reads do not match for R1 and R2:** there may be a truncation or mismatch of fastq files"
fi
fi
# save design file
echo "\${endsMeta},\${endsRaw},\${endsManual},\${stranded},\${spike},\${species},\${readLength},\${exp},\${study}" > design.csv
echo "\${fastqCountError},\${fastqCountError_details},\${fastqReadError},\${fastqReadError_details}" > fastqError.csv
// Split metadata into separate channels
endsMeta = Channel.create()
endsRaw = Channel.create()
endsManual = Channel.create()
strandedMeta = Channel.create()
spikeMeta = Channel.create()
speciesMeta = Channel.create()
readLengthMeta = Channel.create()
expRID = Channel.create()
studyRID = Channel.create()
metadata_fl.splitCsv(sep: ",", header: false).separate(
endsMeta,
endsRaw,
endsManual,
strandedMeta,
spikeMeta,
// Replicate metadata for multiple process inputs
endsMeta.into {
endsMeta_checkMetadata
endsMeta_aggrQC
endsMeta_failExecutionRun
}
endsManual.into {
endsManual_trimData
endsManual_downsampleData
endsManual_alignSampleData
}
strandedMeta.into {
strandedMeta_checkMetadata
strandedMeta_aggrQC
strandedMeta_failExecutionRun
}
spikeMeta.into {
spikeMeta_checkMetadata
spikeMeta_aggrQC
spikeMeta_failExecutionRun
}
speciesMeta.into {
speciesMeta_checkMetadata
speciesMeta_aggrQC
speciesMeta_failExecutionRun
}
studyRID.into {
studyRID_uploadInputBag
studyRID_uploadProcessedFile
studyRID_uploadOutputBag
}
expRID.into {
expRID_uploadProcessedFile
}
// Split fastq count error into separate channel
fastqCountError = Channel.create()
fastqCountError_details = Channel.create()
fastqReadError = Channel.create()
fastqReadError_details = Channel.create()
fastqError_fl.splitCsv(sep: ",", header: false).separate(
fastqCountError,
fastqCountError_details,
fastqReadError,
fastqReadError_details
)
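// NOTE: separate() assigns values positionally, so the channel order above must
// match the column order written to fastqError.csv by parseMetadata.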
// Replicate errors for multiple process inputs
fastqCountError.into {
fastqCountError_trimData
fastqCountError_getRefInfer
fastqCountError_downsampleData
fastqCountError_alignSampleData
fastqCountError_inferMetadata
fastqCountError_checkMetadata
fastqCountError_uploadExecutionRun
fastqCountError_getRef
fastqCountError_alignData
fastqCountError_dedupData
fastqCountError_makeBigWig
fastqCountError_countData
fastqCountError_dataQC
fastqCountError_aggrQC
fastqCountError_uploadQC
fastqCountError_uploadProcessedFile
fastqCountError_uploadOutputBag
fastqCountError_failPreExecutionRun_fastq
}
fastqReadError.into {
fastqReadError_trimData
fastqReadError_getRefInfer
fastqReadError_downsampleData
fastqReadError_alignSampleData
fastqReadError_inferMetadata
fastqReadError_checkMetadata
fastqReadError_uploadExecutionRun
fastqReadError_getRef
fastqReadError_alignData
fastqReadError_dedupData
fastqReadError_makeBigWig
fastqReadError_countData
fastqReadError_dataQC
fastqReadError_aggrQC
fastqReadError_uploadQC
fastqReadError_uploadProcessedFile
fastqReadError_uploadOutputBag
fastqReadError_failPreExecutionRun_fastq
}
/*
* fastqc: run fastqc on untrimmed fastq's
*/
process fastqc {
tag "${repRID}"
input:
path (fastq) from fastqs_fastqc.collect()
val fastqCountError_fastqc
val fastqReadError_fastqc
output:
path ("*.R{1,2}.fastq.gz", includeInputs:true) into fastqs_trimData
path ("*_fastqc.zip") into fastqc
path ("rawReads.csv") into rawReadsInfer_fl
path "fastqFileError.csv" into fastqFileError_fl
when:
fastqCountError_fastqc == 'false' && fastqReadError_fastqc == 'false'
script:
"""
hostname > ${repRID}.fastqc.log
ulimit -a >> ${repRID}.fastqc.log
# run fastqc
echo -e "LOG: running fastq on raw fastqs" >> ${repRID}.fastqc.log
fastqc *.fastq.gz -o . &> fastqc.out || true
fastqcErrorOut=\$(cat fastqc.out | grep -c 'Failed to process file') || fastqcErrorOut=0
fastqFileError=false
fastqFileError_details=""
if [ "\${fastqcErrorOut}" -ne "0" ]
then
fastqFileError=true
fastqFileError_details="**There is an error with the structure of the fastq**"
echo -e "LOG: There is an error with the structure of the fastq" >> ${repRID}.fastqc.log
else
echo -e "LOG: The structure of the fastq is correct" >> ${repRID}.fastqc.log
fi
# count raw reads
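# a fastq record is 4 lines, so read count = line count / 4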
zcat *.R1.fastq.gz | echo \$((`wc -l`/4)) > rawReads.csv
# save fastq error file
echo "\${fastqFileError},\${fastqFileError_details}" > fastqFileError.csv
"""
}
// Extract number of raw reads metadata into channel
rawReadsInfer = Channel.create()
rawReadsInfer_fl.splitCsv(sep: ",", header: false).separate(
rawReadsInfer
)
// Replicate inferred raw reads for multiple process inputs
rawReadsInfer.into {
rawReadsInfer_aggrQC
rawReadsInfer_uploadQC
}
// Split fastq file error into separate channels
fastqFileError = Channel.create()
fastqFileError_details = Channel.create()
fastqFileError_fl.splitCsv(sep: ",", header: false).separate(
fastqFileError,
fastqFileError_details
)
// Replicate errors for multiple process inputs
fastqFileError.into {
fastqFileError_fastqc
fastqFileError_trimData
fastqFileError_getRefInfer
fastqFileError_downsampleData
fastqFileError_alignSampleData
fastqFileError_inferMetadata
fastqFileError_checkMetadata
fastqFileError_uploadExecutionRun
fastqFileError_getRef
fastqFileError_alignData
fastqFileError_dedupData
fastqFileError_makeBigWig
fastqFileError_countData
fastqFileError_dataQC
fastqFileError_aggrQC
fastqFileError_uploadQC
fastqFileError_uploadProcessedFile
fastqFileError_uploadOutputBag
fastqFileError_failPreExecutionRun_fastqFile
}
/*
* trimData: trims any adapter or non-host sequences from the data
*/
process trimData {
tag "${repRID}"
input:
path (fastq) from fastqs_trimData
val fastqCountError_trimData
val fastqReadError_trimData
val fastqFileError_trimData
val ends from endsManual_trimData
output:
path ("*.fq.gz") into fastqsTrim
path ("*_trimming_report.txt") into trimQC
when:
fastqCountError_trimData == "false" && fastqReadError_trimData == "false" && fastqFileError_trimData == "false"
script:
"""
hostname > ${repRID}.trimData.log
ulimit -a >> ${repRID}.trimData.log
# trim fastq's using trim_galore and extract median read length
echo -e "LOG: trimming ${ends}" >> ${repRID}.trimData.log
if [ "${ends}" == "se" ]
then
trim_galore --gzip -q 25 --length 35 --basename ${repRID} ${fastq[0]}
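# median read length: print each read's length, sort numerically, then take the middle value (or the mean of the two middle values)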
readLength=\$(zcat *_trimmed.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
elif [ "${ends}" == "pe" ]
then
trim_galore --gzip -q 25 --length 35 --paired --basename ${repRID} ${fastq[0]} ${fastq[1]}
readLength=\$(zcat *_1.fq.gz | awk '{if(NR%4==2) print length(\$1)}' | sort -n | awk '{a[NR]=\$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}')
fi
echo -e "LOG: median trimmed read length: \${readLength}" >> ${repRID}.trimData.log

# save read length file
echo "\${readLength}" > readLength.csv
"""
}
// Extract calculated read length metadata into channel
readLengthInfer = Channel.create()
readLengthInfer_fl.splitCsv(sep: ",", header: false).separate(
readLengthInfer
)
// Replicate inferred read length for multiple process inputs
readLengthInfer.into {
readLengthInfer_aggrQC
readLengthInfer_uploadQC
}
// Replicate trimmed fastq's for multiple process inputs
fastqsTrim.into {
fastqsTrim_alignData
fastqsTrim_downsampleData
}
getRefInferInput = referenceInfer.combine(deriva_getRefInfer.combine(script_refDataInfer.combine(fastqCountError_getRefInfer.combine(fastqReadError_getRefInfer.combine(fastqFileError_getRefInfer)))))
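// combine() crosses each reference name (ERCC, GRCh, GRCm) with the credential,
// extraction script, and error flags, yielding one input tuple per reference.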
/*
* getRefInfer: downloads appropriate reference for metadata inference
*/
process getRefInfer {
tag "${repRID}-${refName}"
input:
tuple val (refName), path (credential, stageAs: "credential.json"), path (script_refDataInfer), val (fastqCountError), val (fastqReadError), val (fastqFileError) from getRefInferInput
output:
tuple val (refName), path ("hisat2", type: 'dir'), path ("*.fna"), path ("*.gtf") into refInfer
path ("${refName}", type: 'dir') into bedInfer
when:
fastqCountError == "false" && fastqReadError == "false" && fastqFileError == "false"
script:
"""
hostname > ${repRID}.${refName}.getRefInfer.log
ulimit -a >> ${repRID}.${refName}.getRefInfer.log
echo -e "LOG: linking deriva credentials" >> ${repRID}.${refName}.getRefInfer.log
mkdir -p ~/.deriva
ln -sf `readlink -e credential.json` ~/.deriva/credential.json
echo -e "LOG: linked" >> ${repRID}.${refName}.getRefInfer.log
if [ "${refName}" == "ERCC" ]
then
references=\$(echo ${referenceBase}/ERCC${refERCCVersion})
elif [ "${refName}" == "GRCm" ]
then
references=\$(echo ${referenceBase}/GRCm${refMoVersion})
elif [ "${refName}" == "GRCh" ]
then
references=\$(echo ${referenceBase}/GRCh${refHuVersion})
else
echo -e "LOG: ERROR - References could not be set!\nReference found: ${referenceBase}" >> ${repRID}.${refName}.getRefInfer.log
# retreive appropriate reference appropriate location
echo -e "LOG: fetching ${refName} reference files from ${referenceBase}" >> ${repRID}.${refName}.getRefInfer.log
if [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references/new" ]
then
unzip \${references}.zip
mv \$(basename \${references})/data/* .
elif [ "${params.refSource}" == "datahub" ]
then
GRCv=\$(echo \${references} | grep -o ${refName}.* | cut -d '.' -f1)
GRCp=\$(echo \${references} | grep -o ${refName}.* | cut -d '.' -f2)
GENCODE=\$(echo \${references} | grep -o ${refName}.* | cut -d '.' -f3)
if [ "${refName}" != "ERCC" ]
then
query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='\${GRCv}'.'\${GRCp}'/Annotation_Version=GENCODE%20'\${GENCODE})
else
query=\$(echo 'https://${referenceBase}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version=${refName}${refERCCVersion}/Annotation_Version=${refName}${refERCCVersion}')
fi
curl --request GET \${query} > refQuery.json
refURL=\$(python ${script_refDataInfer} --returnParam URL)
fName=\$(python ${script_refDataInfer} --returnParam fName)
fName=\${fName%.*}
loc=\$(dirname \${refURL})
if [ "\${loc}" = "/hatrac/*" ]; then echo "LOG: Reference not present in hatrac"; exit 1; fi
filename=\$(echo \$(basename \${refURL}) | grep -oP '.*(?=:)')
deriva-hatrac-cli --host ${referenceBase} get \${refURL}
unzip \$(basename \${refURL})
mv \${fName}/data/* .
fi
mv ./annotation/genome.gtf .
mv ./sequence/genome.fna .
mkdir ${refName}
if [ "${refName}" != "ERCC" ]
mv ./annotation/genome.bed ./${refName}
echo -e "LOG: fetched" >> ${repRID}.${refName}.getRefInfer.log
"""
}
/*
* downsampleData: downsample fastq's for metadata inference
*/
process downsampleData {
tag "${repRID}"
input:
path fastq from fastqsTrim_downsampleData
val fastqCountError_downsampleData
val fastqReadError_downsampleData
val fastqFileError_downsampleData
val ends from endsManual_downsampleData
output:
path ("sampled.1.fq") into fastqs1Sample
path ("sampled.2.fq") into fastqs2Sample
when:
fastqCountError_downsampleData == "false" && fastqReadError_downsampleData == "false" && fastqFileError_downsampleData == "false"
script:
"""
hostname > ${repRID}.downsampleData.log
ulimit -a >> ${repRID}.downsampleData.log
if [ "${ends}" == "se" ]
then
echo -e "LOG: downsampling SE trimmed fastq" >> ${repRID}.downsampleData.log
seqtk sample -s100 *trimmed.fq.gz 1000000 1> sampled.1.fq
# touch sampled.2.fq so the declared output exists for SE runs
touch sampled.2.fq
elif [ "${ends}" == "pe" ]
then
echo -e "LOG: downsampling R1 of PE trimmed fastq" >> ${repRID}.downsampleData.log
seqtk sample -s100 *1.fq.gz 1000000 1> sampled.1.fq
echo -e "LOG: downsampling R2 of PE trimmed fastq" >> ${repRID}.downsampleData.log
seqtk sample -s100 *2.fq.gz 1000000 1> sampled.2.fq
fi
echo -e "LOG: downsampled" >> ${repRID}.downsampleData.log
"""
}
// Replicate the downsampled fastq's and attach them to the references
inferInput = endsManual_alignSampleData.combine(refInfer.combine(fastqs1Sample.collect().combine(fastqs2Sample.collect().combine(fastqCountError_alignSampleData.combine(fastqReadError_alignSampleData.combine(fastqFileError_alignSampleData))))))
/*
* alignSampleData: aligns the downsampled reads to a reference database
*/
process alignSampleData {
tag "${ref}"
input:
tuple val (ends), val (ref), path (hisat2), path (fna), path (gtf), path (fastq1), path (fastq2), val (fastqCountError), val (fastqReadError), val (fastqFileError) from inferInput
output:
path ("${ref}.sampled.sorted.bam") into sampleBam
path ("${ref}.sampled.sorted.bam.bai") into sampleBai
path ("${ref}.alignSampleSummary.txt") into alignSampleQC
fastqCountError == "false"
fastqReadError == "false"
script:
"""
hostname > ${repRID}.${ref}.alignSampleData.log
ulimit -a >> ${repRID}.${ref}.alignSampleData.log
# align the reads with Hisat2
echo -e "LOG: aligning ${ends}" >> ${repRID}.${ref}.alignSampleData.log
if [ "${ends}" == "se" ]
then
hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome -U ${fastq1} --summary-file ${ref}.alignSampleSummary.txt --new-summary
elif [ "${ends}" == "pe" ]
then
hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome --no-mixed --no-discordant -1 ${fastq1} -2 ${fastq2} --summary-file ${ref}.alignSampleSummary.txt --new-summary
fi
echo -e "LOG: aligned" >> ${repRID}.${ref}.alignSampleData.log
# convert the output sam file to a sorted bam file using Samtools
echo -e "LOG: converting from sam to bam" >> ${repRID}.${ref}.alignSampleData.log
samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${ref}.sampled.bam ${ref}.sampled.sam
echo -e "LOG: sorting the bam file" >> ${repRID}.${ref}.alignSampleData.log
proc=\$(expr `nproc` - 1)
mem=\$(vmstat -s -S K | grep 'total memory' | grep -o '[0-9]*')
mem=\$(expr \${mem} / \${proc} \\* 85 / 100)
samtools sort -@ \${proc} -m \${mem}K -O BAM -o ${ref}.sampled.sorted.bam ${ref}.sampled.bam
echo -e "LOG: indexing sorted bam file" >> ${repRID}.${ref}.alignSampleData.log
samtools index -@ `nproc` -b ${ref}.sampled.sorted.bam ${ref}.sampled.sorted.bam.bai
"""
}
alignSampleQC.into {
alignSampleQC_inferMetadata
alignSampleQC_aggrQC
}
/*
* inferMetadata: infer strandedness, spike-in, and species from the downsampled alignments
*/
process inferMetadata {
tag "${repRID}"
input:
path script_inferMeta
path beds from bedInfer.collect()
path bam from sampleBam.collect()
path bai from sampleBai.collect()
path alignSummary from alignSampleQC_inferMetadata.collect()
val strandedForce
val spikeForce
val fastqCountError_inferMetadata
val fastqReadError_inferMetadata
val fastqFileError_inferMetadata

output:
path "${repRID}.infer_experiment.txt" into inferExperiment
path "speciesError.csv" into speciesError_fl
when:
fastqCountError_inferMetadata == "false" && fastqReadError_inferMetadata == "false" && fastqFileError_inferMetadata == "false"
script:
"""
hostname > ${repRID}.inferMetadata.log
ulimit -a >> ${repRID}.inferMetadata.log
# collect alignment rates (round down to integers)
align_ercc=\$(echo \$(grep "Overall alignment rate" ERCC.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%'))
echo -e "LOG: alignment rate to ERCC: \${align_ercc}" >> ${repRID}.inferMetadata.log
align_hu=\$(echo \$(grep "Overall alignment rate" GRCh.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%'))
echo -e "LOG: alignment rate to GRCh: \${align_hu}" >> ${repRID}.inferMetadata.log
align_mo=\$(echo \$(grep "Overall alignment rate" GRCm.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%'))
echo -e "LOG: alignment rate to GRCm: \${align_mo}" >> ${repRID}.inferMetadata.log
# determine spike-in from the ERCC alignment rate
if [ 1 -eq \$(echo \$(expr \${align_ercc} ">=" 10)) ]
then
spike="true"
else
spike="false"
fi
echo -e "LOG: inference of spike-in results in: \${spike}" >> ${repRID}.inferMetadata.log
if [ "${spikeForce}" != "" ]
then
spike=${spikeForce}
echo -e "LOG: spike-in metadata forced: \${spike}" >> ${repRID}.parseMetadata.log
fi
if [ 1 -eq \$(echo \$(expr \${align_hu} ">=" 40)) ] && [ 1 -eq \$(echo \$(expr \${align_mo} "<" 40)) ]
then
species="Homo sapiens"
bam="GRCh.sampled.sorted.bam"
bed="./GRCh/genome.bed"
echo -e "LOG: inference of species results in: \${species}" >> ${repRID}.inferMetadata.log
elif [ 1 -eq \$(echo \$(expr \${align_mo} ">=" 40)) ] && [ 1 -eq \$(echo \$(expr \${align_hu} "<" 40)) ]
then
species="Mus musculus"