// ########  ####  ######  ########
// ##     ##  ##  ##    ## ##
// ##     ##  ##  ##       ##
// ########   ##  ##       ######
// ##     ##  ##  ##       ##
// ##     ##  ##  ##    ## ##
// ########  ####  ######  ##
params.deriva = "${baseDir}/../test_data/auth/credential.json"
params.bdbag = "${baseDir}/../test_data/auth/cookies.txt"
//params.repRID = "16-1ZX4"
params.repRID = "Q-Y5JA"
params.refMoVersion = "38.p6.vM22"
params.refHuVersion = "38.p12.v31"
params.outDir = "${baseDir}/../output"
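
// Example invocation (illustrative only; the script name, RID, and paths are placeholders, adjust to your environment):
//   nextflow run rna-seq.nf --repRID Q-Y5JA --deriva ./credential.json --bdbag ./cookies.txt --outDir ./output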
// Parse input variables
deriva = Channel
.fromPath(params.deriva)
.ifEmpty { exit 1, "deriva credential file not found: ${params.deriva}" }
bdbag = Channel
.fromPath(params.bdbag)
.ifEmpty { exit 1, "deriva cookie file for bdbag not found: ${params.bdbag}" }
repRID = params.repRID
refMoVersion = params.refMoVersion
refHuVersion = params.refHuVersion
refERCCVersion = params.refERCCVersion // ERCC reference version (expected from config/CLI)
outDir = params.outDir
logsDir = "${outDir}/Logs"
derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json")
//referenceBase = "s3://bicf-references"
referenceBase = "/project/BICF/BICF_Core/shared/gudmap/references"
referenceInfer = Channel.fromList(["ERCC","GRCh","GRCm"])
script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh")
script_parseMeta = Channel.fromPath("${baseDir}/scripts/parseMeta.py")
script_calculateTPM = Channel.fromPath("${baseDir}/scripts/calculateTPM.R")
script_inferMeta = Channel.fromPath("${baseDir}/scripts/inferMeta.sh")
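
// Helper scripts are provided to processes as path channels so Nextflow stages a copy
// into each task's work directory, where processes invoke them by file name.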
/*
* getBag: downloads the bag (bdbag zip) for the selected replicate RID
*/
process getBag {
tag "${repRID}"
publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.getBag.{out,err}"
input:
path credential, stageAs: "credential.json" from deriva
path derivaConfig
output:
path ("Replicate_*.zip") into bagit
path ("${repRID}.getBag.{out,err}")
script:
"""
hostname > ${repRID}.getBag.err
ulimit -a >> ${repRID}.getBag.err
# link credential file for authentication
ln -sf `readlink -e credential.json` ~/.deriva/credential.json 1>> ${repRID}.getBag.out 2>> ${repRID}.getBag.err
echo "LOG: deriva credentials linked" >> ${repRID}.getBag.err
echo "LOG: fetching deriva catalog for selected RID in GUDMAP." >> ${repRID}.getBag.err
deriva-download-cli dev.gudmap.org --catalog 2 ${derivaConfig} . rid=${repRID} 1>> ${repRID}.getBag.out 2>> ${repRID}.getBag.err
"""
}
/*
* getData: fetch study files from consortium with the downloaded bdbag.zip
*/
process getData {
tag "${repRID}"
publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.getData.{out,err}"
input:
path script_bdbagFetch
path cookies, stageAs: "deriva-cookies.txt" from bdbag
path bagit
output:
path ("*.R{1,2}.fastq.gz") into fastqs
path ("**/File.csv") into fileMeta
path ("**/Experiment Settings.csv") into experimentSettingsMeta
path ("**/Experiment.csv") into experimentMeta
path ("${repRID}.getData.{out,err}")
script:
"""
hostname > ${repRID}.getData.err
ulimit -a >> ${repRID}.getData.err
# link deriva cookie for authentication
ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt 1>> ${repRID}.getData.out 2>> ${repRID}.getData.err
echo "LOG: deriva cookie linked" >> ${repRID}.getData.err
# get replicate name from the bag file name
replicate=\$(basename "${bagit}" | cut -d "." -f1)
echo "LOG: \${replicate}" >> ${repRID}.getData.err
unzip ${bagit} 1>> ${repRID}.getData.out 2>> ${repRID}.getData.err
echo "LOG: replicate bdbag unzipped" >> ${repRID}.getData.err
# bdbag fetch fastq's only and rename by repRID
sh ${script_bdbagFetch} \${replicate} ${repRID} 1>> ${repRID}.getData.out 2>> ${repRID}.getData.err
echo "LOG: replicate bdbag fetched" >> ${repRID}.getData.err
"""
}
// Replicate raw fastq's for multiple process inputs
fastqs.into {
fastqs_trimData
fastqs_fastqc
}
/*
* parseMetadata: parses metadata to extract experiment parameters
*/
process parseMetadata {
tag "${repRID}"
publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.parseMetadata.{out,err}"
input:
path script_parseMeta
path fileMeta
path experimentSettingsMeta
path experimentMeta
output:
path "design.csv" into metadata
path "${repRID}.parseMetadata.{out,err}"
script:
"""
hostname > ${repRID}.parseMetadata.err
ulimit -a >> ${repRID}.parseMetadata.err
# parse the replicate RID
rep=\$(python3 ${script_parseMeta} -r ${repRID} -m "${fileMeta}" -p repRID)
echo "LOG: replicate RID metadata parsed: \${rep}" >> ${repRID}.parseMetadata.err
# get endedness metadata
endsMeta=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p endsMeta)
echo "LOG: endedness metadata parsed: \${endsMeta}" >> ${repRID}.parseMetadata.err
# manually detect endedness from the fastq file entries
endsManual=\$(python3 ${script_parseMeta} -r ${repRID} -m "${fileMeta}" -p endsManual)
echo "LOG: endedness manually detected: \${endsManual}" >> ${repRID}.parseMetadata.err
# get strandedness metadata
stranded=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p stranded)
echo "LOG: strandedness metadata parsed: \${stranded}" >> ${repRID}.parseMetadata.err
# get spike-in metadata
spike=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p spike)
echo "LOG: spike-in metadata parsed: \${spike}" >> ${repRID}.parseMetadata.err
# get species metadata
species=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentMeta}" -p species)
echo "LOG: species metadata parsed: \${species}" >> ${repRID}.parseMetadata.err
# write design file in the column order consumed downstream
echo "\${endsMeta},\${endsManual},\${stranded},\${spike},\${species}" > design.csv
"""
}
// Split metadata into separate channels
endsMeta = Channel.create()
endsManual = Channel.create()
strandedMeta = Channel.create()
spikeMeta = Channel.create()
speciesMeta = Channel.create()
metadata.splitCsv(sep: ",", header: false).separate(
endsMeta,
endsManual,
strandedMeta,
spikeMeta,
speciesMeta
)
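
// Note: separate() assigns values positionally, so the channel order above must match the
// column order written to design.csv in parseMetadata (endsMeta, endsManual, stranded, spike, species).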
// Replicate metadata for multiple process inputs
endsManual.into {
endsManual_trimData
endsManual_downsampleData
endsManual_alignSampleData
}
/*
* trimData: trims any adapter or non-host sequences from the data
*/
process trimData {
tag "${repRID}"
publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.trimData.{out,err}"
input:
val ends from endsManual_trimData
path (fastq) from fastqs_trimData
output:
path ("*.fq.gz") into fastqsTrim
path ("*_trimming_report.txt") into trimQC
path ("${repRID}.trimData.{out,err}")
script:
"""
hostname > ${repRID}.trimData.err
ulimit -a >> ${repRID}.trimData.err
#Trim fastq's using trim_galore
if [ "${ends}" == "se" ]
then
echo "LOG: running trim_galore using single-end settings" >> ${repRID}.trimData.err
trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID} -j `nproc` ${fastq[0]} 1>> ${repRID}.trimData.out 2>> ${repRID}.trimData.err
elif [ "${ends}" == "pe" ]
then
echo "LOG: running trim_galore using paired-end settings" >> ${repRID}.trimData.err
trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]} 1>> ${repRID}.trimData.out 2>> ${repRID}.trimData.err
fi
"""
// Replicate trimmed fastq's
fastqsTrim.into {
fastqsTrim_alignData
fastqsTrim_downsampleData
}
/*
* getRefInfer: downloads the appropriate reference for metadata inference
*/
process getRefInfer {
tag "${refName}"
publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.getRefInfer.{out,err}"
input:
val refName from referenceInfer
output:
tuple val (refName), path ("hisat2", type: 'dir'), path ("*.fna"), path ("*.gtf") into refInfer
path ("${refName}", type: 'dir') into bedInfer
path ("${repRID}.getRefInfer.{out,err}")
script:
"""
hostname > ${repRID}.getRefInfer.err
ulimit -a >> ${repRID}.getRefInfer.err
export https_proxy=\${http_proxy}
#Set the reference name
if [ "${refName}" == "ERCC" ]
then
references=\$(echo ${referenceBase}/ERCC${refERCCVersion})
elif [ "${refName}" == "GRCm" ]
then
references=\$(echo ${referenceBase}/GRCm${refMoVersion})
elif [ "${refName}" == "GRCh" ]
then
references=\$(echo ${referenceBase}/GRCh${refHuVersion})
else
echo -e "LOG: ERROR - References could not be set!\nReference found: ${referenceBase}" >> ${repRID}.getRefInfer.err
exit 1
fi
#Retrieve the reference from the appropriate location
mkdir -p ${refName}
if [ ${referenceBase} == "s3://bicf-references" ]
then
echo "LOG: grabbing reference files from S3" >> ${repRID}.getRefInfer.err
aws s3 cp "\${references}"/hisat2 ./hisat2 --recursive 1>> ${repRID}.getRefInfer.out 2>> ${repRID}.getRefInfer.err
aws s3 cp "\${references}"/bed ./${refName}/bed --recursive 1>> ${repRID}.getRefInfer.out 2>> ${repRID}.getRefInfer.err
aws s3 cp "\${references}"/genome.fna ./ 1>> ${repRID}.getRefInfer.out 2>> ${repRID}.getRefInfer.err
aws s3 cp "\${references}"/genome.gtf ./ 1>> ${repRID}.getRefInfer.out 2>> ${repRID}.getRefInfer.err
elif [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references" ]
then
echo "LOG: using pre-defined locations for reference files" >> ${repRID}.getRefInfer.err
ln -s "\${references}"/hisat2 1>> ${repRID}.getRefInfer.out 2>> ${repRID}.getRefInfer.err
ln -s "\${references}"/bed ${refName}/bed 1>> ${repRID}.getRefInfer.out 2>> ${repRID}.getRefInfer.err
ln -s "\${references}"/genome.fna 1>> ${repRID}.getRefInfer.out 2>> ${repRID}.getRefInfer.err
ln -s "\${references}"/genome.gtf 1>> ${repRID}.getRefInfer.out 2>> ${repRID}.getRefInfer.err
fi
#Make blank bed folder for ERCC
if [ "${refName}" == "ERCC" ]
then
rm ${refName}/bed
mkdir ${refName}/bed
fi
"""
}
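
// Assumed reference layout (not verified here): each reference folder such as
// ${referenceBase}/GRCm${refMoVersion} contains hisat2/, bed/, genome.fna, and genome.gtf.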
/*
* downsampleData: downsample fastq's for metadata inference
*/
process downsampleData {
tag "${repRID}"
publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.downsampleData.{out,err}"
input:
val ends from endsManual_downsampleData
path fastq from fastqsTrim_downsampleData
output:
path ("sampled.1.fq") into fastqs1Sample
path ("sampled.2.fq") optional true into fastqs2Sample
path ("${repRID}.downsampleData.{out,err}")
script:
"""
hostname > ${repRID}.downsampleData.err
ulimit -a >> ${repRID}.downsampleData.err
export https_proxy=\${http_proxy}
if [ "${ends}" == "se" ]
then
echo "LOG: downsampling single-end trimmed fastq" >> ${repRID}.downsampleData.err
seqtk sample -s100 *trimmed.fq.gz 10000 1> sampled.1.fq 2>> ${repRID}.downsampleData.err
elif [ "${ends}" == "pe" ]
then
echo "LOG: downsampling read 1 of paired-end trimmed fastq" >> ${repRID}.downsampleData.err
seqtk sample -s100 *1.fq.gz 1000000 1> sampled.1.fq 2>> ${repRID}.downsampleData.err
echo "LOG: downsampling read 2 of paired-end trimmed fastq" >> ${repRID}.downsampleData.err
seqtk sample -s100 *2.fq.gz 1000000 1> sampled.2.fq 2>> ${repRID}.downsampleData.err
fi
"""
}
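
// seqtk sample with a fixed seed (-s100) is deterministic, and using the same seed for
// read 1 and read 2 keeps mate pairs in sync across the two downsampled files.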
// Replicate the downsampled fastq's and attach them to the references
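// combine() emits the cartesian product, so every reference (ERCC, GRCh, GRCm) is paired with
// the downsampled fastq's and alignSampleData runs once per reference.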
inferInput = endsManual_alignSampleData.combine(refInfer.combine(fastqs1Sample.collect().combine(fastqs2Sample.collect())))
/*
* alignSampleData: aligns the downsampled reads to a reference database
*/
process alignSampleData {
tag "${ref}"
publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.${ref}.alignSampleData.{out,err}"
input:
tuple val (ends), val (ref), path (hisat2), path (fna), path (gtf), path (fastq1), path (fastq2) from inferInput
output:
path ("${ref}.sampled.sorted.bam") into sampleBam
path ("${ref}.sampled.sorted.bam.bai") into sampleBai
path ("${ref}.alignSampleSummary.txt") into alignSampleQC
path ("${repRID}.${ref}.alignSampleData.{out,err}")
script:
"""
hostname > ${repRID}.${ref}.alignSampleData.err
ulimit -a >> ${repRID}.${ref}.alignSampleData.err
#Align the reads with Hisat 2
if [ "${ends}" == "se" ]
then
echo "LOG: running Hisat2 with single-end settings" >> ${repRID}.${ref}.alignSampleData.err
hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome -U ${fastq1} --summary-file ${ref}.alignSampleSummary.txt --new-summary 1>> ${repRID}.${ref}.alignSampleData.out 2>> ${repRID}.${ref}.alignSampleData.err
elif [ "${ends}" == "pe" ]
then
echo "LOG: running Hisat2 with paired-end settings" >> ${repRID}.${ref}.alignSampleData.err
hisat2 -p `nproc` --add-chrname -S ${ref}.sampled.sam -x hisat2/genome --no-mixed --no-discordant -1 ${fastq1} -2 ${fastq2} --summary-file ${ref}.alignSampleSummary.txt --new-summary 1>> ${repRID}.${ref}.alignSampleData.out 2>> ${repRID}.${ref}.alignSampleData.err
fi
#Convert the output sam file to a sorted bam file using Samtools
echo "LOG: converting from sam to bam" >> ${repRID}.${ref}.alignSampleData.err
samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${ref}.sampled.bam ${ref}.sampled.sam 1>> ${repRID}.${ref}.alignSampleData.out 2>> ${repRID}.${ref}.alignSampleData.err;
#Sort the bam file using Samtools
echo "LOG: sorting the bam file" >> ${repRID}.${ref}.alignSampleData.err
samtools sort -@ `nproc` -O BAM -o ${ref}.sampled.sorted.bam ${ref}.sampled.bam 1>> ${repRID}.${ref}.alignSampleData.out 2>> ${repRID}.${ref}.alignSampleData.err;
#Index the sorted bam using Samtools
echo "LOG: indexing sorted bam file" >> ${repRID}.${ref}.alignSampleData.err
samtools index -@ `nproc` -b ${ref}.sampled.sorted.bam ${ref}.sampled.sorted.bam.bai 1>> ${repRID}.${ref}.alignSampleData.out 2>> ${repRID}.${ref}.alignSampleData.err;
"""
}
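
/*
* inferMetadata: infers endedness, strandedness, spike-in, and species from the sampled alignments
*/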
process inferMetadata {
tag "${repRID}"
publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.inferMetadata.{out,err}"
input:
path script_inferMeta
path beds from bedInfer.collect()
path bam from sampleBam.collect()
path bai from sampleBai.collect()
path alignSummary from alignSampleQC.collect()
output:
path "infer.csv" into inferMetadata
path "${repRID}.inferMetadata.{out,err}" optional true
script:
"""
hostname > ${repRID}.inferMetadata.err
ulimit -a >> ${repRID}.inferMetadata.err
# collect alignment rates
align_ercc=\$(echo \$(grep "Overall alignment rate" ERCC.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%'))
align_hu=\$(echo \$(grep "Overall alignment rate" GRCh.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%'))
align_mo=\$(echo \$(grep "Overall alignment rate" GRCm.alignSampleSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%'))
# determine spike-in
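# an ERCC overall alignment rate of at least 0.1% is treated as evidence of a spike-in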
if [ 1 -eq \$(echo \$(expr \${align_ercc} ">=" 0.1)) ]
then
spike="yes"
else
spike="no"
fi
echo -e "LOG: Inference of strandedness results in: \${spike}" >> ${repRID}.inferMetadata.err
# determine species
if [ 1 -eq \$(echo \$(expr \${align_hu} ">=" 25)) ] && [ 1 -eq \$(echo \$(expr \${align_mo} "<" 25)) ]
then
species="Homo sapiens"
bam="GRCh.sampled.sorted.bam"
bed="./GRCh/bed/genome.bed"
elif [ 1 -eq \$(echo \$(expr \${align_mo} ">=" 25)) ] && [ 1 -eq \$(echo \$(expr \${align_hu} "<" 25)) ]
then
species="Mus musculus"
bam="GRCm.sampled.sorted.bam"
bed="./GRCm/bed/genome.bed"
else
echo "LOG: ERROR - Inference of species returns an ambiguous result" >> ${repRID}.inferMetadata.err
exit 1
fi
echo -e "LOG: Inference of species results in: \${species}" >> ${repRID}.inferMetadata.err
# infer experimental setting from the sampled alignment
echo "LOG: inferring experimental setting from the sampled alignment" >> ${repRID}.inferMetadata.err
infer_experiment.py -r "\${bed}" -i "\${bam}" 1>> ${repRID}.inferMetadata.log 2>> ${repRID}.inferMetadata.err
echo "LOG: determining endedness and strandedness from file" >> ${repRID}.inferMetadata.err
ended=`bash inferMeta.sh endness ${repRID}.inferMetadata.log` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err
fail=`bash inferMeta.sh fail ${repRID}.inferMetadata.log` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err
if [ \${ended} == "PairEnd" ]
then
ends="pe"
percentF=`bash inferMeta.sh pef ${repRID}.inferMetadata.log` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err
percentR=`bash inferMeta.sh per ${repRID}.inferMetadata.log` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err
elif [ \${ended} == "SingleEnd" ]
then
ends="se"
percentF=`bash inferMeta.sh sef ${repRID}.inferMetadata.log` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err
percentR=`bash inferMeta.sh ser ${repRID}.inferMetadata.log` 1>> ${repRID}.inferMetadata.out 2>> ${repRID}.inferMetadata.err
fi
if [ 1 -eq \$(echo \$(expr \${percentF} ">" 0.25)) ] && [ 1 -eq \$(echo \$(expr \${percentR} "<" 0.25)) ]
then
stranded="forward"
elif [ 1 -eq \$(echo \$(expr \${percentR} ">" 0.25)) ] && [ 1 -eq \$(echo \$(expr \${percentF} "<" 0.25)) ]
then
stranded="reverse"
else
stranded="unstranded"
fi
echo -e "LOG: stradedness set to \${stranded}" >> ${repRID}.inferMetadata.err
# write inferred metadata to file
echo "\${ends},\${stranded},\${spike},\${species},\${align_ercc},\${align_hu},\${align_mo},\${percentF},\${percentR},\${fail}" 1>> infer.csv 2>> ${repRID}.inferMetadata.err
"""
}
// Split metadata into separate channels
endsInfer = Channel.create()
strandedInfer = Channel.create()
spikeInfer = Channel.create()
speciesInfer = Channel.create()
align_erccInfer = Channel.create()
align_huInfer = Channel.create()
align_moInfer = Channel.create()
percentFInfer = Channel.create()
percentRInfer = Channel.create()
failInfer = Channel.create()
inferMetadata.splitCsv(sep: ",", header: false).separate(
endsInfer,
strandedInfer,
spikeInfer,
speciesInfer,
align_erccInfer,
align_huInfer,
align_moInfer,
percentFInfer,
percentRInfer,
failInfer
)
// Replicate metadata for multiple process inputs
endsInfer.into {
endsInfer_trimData
endsInfer_alignData
endsInfer_countData
}
strandedInfer.into {
strandedInfer_alignData
strandedInfer_countData
}
spikeInfer.into {
spikeInfer_getRef
}
speciesInfer.into {
speciesInfer_getRef
}
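
// In DSL1 a channel may feed only one process, so each downstream consumer gets its own copy via into{}.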
/*
* getRef: downloads the appropriate reference
*/
process getRef {
tag "${species}"
publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.getRef.{out,err}"
input:
val spike from spikeInfer_getRef
val species from speciesInfer_getRef
output:
tuple path ("hisat2", type: 'dir'), path ("bed", type: 'dir'), path ("*.fna"), path ("*.gtf") into reference
path ("${repRID}.getRef.{out,err}")
script:
"""
hostname > ${repRID}.getRef.err
ulimit -a >> ${repRID}.getRef.err
if [ "${species}" == "Mus musculus" ]
references=\$(echo ${referenceBase}/GRCm${refMoVersion})
elif [ '${species}' == "Homo sapiens" ]
references=\$(echo ${referenceBase}/GRCh${refHuVersion})
echo -e "LOG: ERROR - References could not be set!\nSpecies reference found: ${species}" >> ${repRID}.getRef.err
if [ "${spike}" == "yes" ]
elif [ "${spike}" == "no" ]
then
reference=\$(echo \${references}/)
fi
echo "LOG: species set to \${references}" >> ${repRID}.getRef.err
if [ ${referenceBase} == "s3://bicf-references" ]
then
echo "LOG: grabbing reference files from S3" >> ${repRID}.getRef.err
aws s3 cp "\${references}"/hisat2 ./hisat2 --recursive 1>> ${repRID}.getRef.out 2>> ${repRID}.getRef.err
aws s3 cp "\${references}"/bed ./bed --recursive 1>> ${repRID}.getRef.out 2>> ${repRID}.getRef.err
aws s3 cp "\${references}"/genome.fna ./ 1>> ${repRID}.getRef.out 2>> ${repRID}.getRef.err
aws s3 cp "\${references}"/genome.gtf ./ 1>> ${repRID}.getRef.out 2>> ${repRID}.getRef.err
elif [ ${referenceBase} == "/project/BICF/BICF_Core/shared/gudmap/references" ]
then
echo "LOG: using pre-defined locations for reference files" >> ${repRID}.getRef.err
ln -s "\${references}"/hisat2 1>> ${repRID}.getRef.out 2>> ${repRID}.getRef.err
ln -s "\${references}"/bed 1>> ${repRID}.getRef.out 2>> ${repRID}.getRef.err
ln -s "\${references}"/genome.fna 1>> ${repRID}.getRef.out 2>> ${repRID}.getRef.err
ln -s "\${references}"/genome.gtf 1>> ${repRID}.getRef.out 2>> ${repRID}.getRef.err
// Replicate reference for multiple process inputs
reference.into {
reference_alignData
reference_countData
reference_dataQC
}

/*
* alignData: aligns the reads to a reference database
*/
tag "${repRID}"
publishDir "${logsDir}", mode: "copy", pattern: "${repRID}.align.{out,err}"
val ends from endsInfer_alignData
val stranded from strandedInfer_alignData
tuple path ("${repRID}.sorted.bam"), path ("${repRID}.sorted.bam.bai") into rawBam
path ("${repRID}.align.{out,err}")
hostname > ${repRID}.align.err
ulimit -a >> ${repRID}.align.err
#Set stranded param for hisat2
if [ "${stranded}"=="unstranded" ]
then
strandedParam=""
elif [ "${stranded}" == "forward" ] && [ "${ends}" == "se" ]
then
strandedParam="--rna-strandness F"
elif [ "${stranded}" == "forward" ] && [ "${ends}" == "pe" ]
then
strandedParam="--rna-strandness FR"
elif [ "${stranded}" == "reverse" ] && [ "${ends}" == "se" ]
then
strandedParam="--rna-strandness R"
elif [ "${stranded}" == "reverse" ] && [ "${ends}" == "pe" ]
then
strandedParam="--rna-strandness RF"
fi
#Align the reads with Hisat 2
if [ "${ends}" == "se" ]
then
echo "LOG: running Hisat2 with single-end settings" >> ${repRID}.align.err
hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome \${strandedParam} -U ${fastq[0]} --summary-file ${repRID}.alignSummary.txt --new-summary 1>> ${repRID}.align.out 2>> ${repRID}.align.err
elif [ "${ends}" == "pe" ]
then
echo "LOG: running Hisat2 with paired-end settings" >> ${repRID}.align.err
hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x hisat2/genome \${strandedParam} --no-mixed --no-discordant -1 ${fastq[0]} -2 ${fastq[1]} --summary-file ${repRID}.alignSummary.txt --new-summary 1>> ${repRID}.align.out 2>> ${repRID}.align.err
fi
#Convert the output sam file to a sorted bam file using Samtools
echo "LOG: converting from sam to bam" >> ${repRID}.align.err
samtools view -1 -@ `nproc` -F 4 -F 8 -F 256 -o ${repRID}.bam ${repRID}.sam 1>> ${repRID}.align.out 2>> ${repRID}.align.err;
#Sort the bam file using Samtools
echo "LOG: sorting the bam file" >> ${repRID}.align.err
samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.bam ${repRID}.bam 1>> ${repRID}.align.out 2>> ${repRID}.align.err;
#Index the sorted bam using Samtools
echo "LOG: indexing sorted bam file" >> ${repRID}.align.err
samtools index -@ `nproc` -b ${repRID}.sorted.bam ${repRID}.sorted.bam.bai 1>> ${repRID}.align.out 2>> ${repRID}.align.err;
"""
}
// Replicate rawBam for multiple process inputs
rawBam.into {
rawBam_dedupData
}
/*
* dedupData: mark the duplicate reads, specifically focused on PCR or optical duplicates
*/
process dedupData {
tag "${repRID}"
publishDir "${outDir}/bam", mode: 'copy', pattern: "*.deduped.bam"
publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.dedup.{out,err}"
input:
tuple path (bam), path (bai) from rawBam_dedupData
output:
tuple path ("${repRID}.sorted.deduped.bam"), path ("${repRID}.sorted.deduped.bam.bai") into dedupBam
tuple path ("${repRID}.sorted.deduped.*.bam"), path ("${repRID}.sorted.deduped.*.bam.bai") into dedupChrBam
path ("${repRID}.dedup.{out,err}")
script:
"""
hostname > ${repRID}.dedup.err
ulimit -a >> ${repRID}.dedup.err
# remove duplicated reads using Picard's MarkDuplicates
echo "LOG: running picard MarkDuplicates to remove duplicate reads" >> ${repRID}.dedup.err
java -jar /picard/build/libs/picard.jar MarkDuplicates I=${bam} O=${repRID}.deduped.bam M=${repRID}.deduped.Metrics.txt REMOVE_DUPLICATES=true 1>> ${repRID}.dedup.out 2>> ${repRID}.dedup.err
# Sort the bam file using Samtools
samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.deduped.bam ${repRID}.deduped.bam 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err
# Index the sorted bam using Samtools
samtools index -@ `nproc` -b ${repRID}.sorted.deduped.bam ${repRID}.sorted.deduped.bam.bai 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err
# Split the deduped BAM file for multi-threaded tin calculation
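# each chromosome present in the bam gets its own indexed bam; the command strings
# built by this loop run concurrently under GNU parallel, one job per core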
for i in `samtools view ${repRID}.sorted.deduped.bam | cut -f3 | sort | uniq`;
do
echo "echo \"LOG: splitting each chromosome into its own BAM and BAI files with Samtools\" >> ${repRID}.dedup.err; samtools view -b ${repRID}.sorted.deduped.bam \${i} > ${repRID}.sorted.deduped.\${i}.bam; samtools index -@ `nproc` -b ${repRID}.sorted.deduped.\${i}.bam ${repRID}.sorted.deduped.\${i}.bam.bai"
done | parallel -j `nproc` -k 1>>${repRID}.dedup.out 2>> ${repRID}.dedup.err
"""
}

// Replicate dedup bam/bai for multiple process inputs
dedupBam.into {
dedupBam_makeBigWig
dedupBam_countData
}

/*
* makeBigWig: generate a bigwig coverage track from the deduped bam
*/
process makeBigWig {
tag "${repRID}"
publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.makeBigWig.{out,err}"
publishDir "${outDir}/bigwig", mode: 'copy', pattern: "${repRID}.bw"
input:
tuple path (bam), path (bai) from dedupBam_makeBigWig
output:
path ("${repRID}.bw")
path ("${repRID}.makeBigWig.{out,err}")
script:
"""
hostname > ${repRID}.makeBigWig.err
ulimit -a >> ${repRID}.makeBigWig.err
#Run bamCoverage
echo "LOG: Running bigWig bamCoverage" >> ${repRID}.makeBigWig.err
bamCoverage -p `nproc` -b ${bam} -o ${repRID}.bw 1>> ${repRID}.makeBigWig.out 2>> ${repRID}.makeBigWig.err
"""
}

/*
* countData: count the reads and calculate TPM
*/
process countData {
tag "${repRID}"
publishDir "${outDir}/countData", mode: 'copy', pattern: "${repRID}*.countTable.csv"
publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.countData.{out,err}"
input:
path script_calculateTPM
tuple path (bam), path (bai) from dedupBam_countData
path ref from reference_countData
val ends from endsInfer_countData
val stranded from strandedInfer_countData
output:
path ("*.countTable.csv") into counts
path ("*.countData.summary") into countsQC
path ("${repRID}.countData.{out,err}")
script:
"""
hostname > ${repRID}.countData.err
ulimit -a >> ${repRID}.countData.err
#Determine strandedness and set up stranding for featureCounts
if [ "${stranded}" == "unstranded" ]
then
stranding=0
echo "LOG: strandedness set to unstranded [0]" >> ${repRID}.countData.err
elif [ "${stranded}" == "forward" ]
then
stranding=1
echo "LOG: strandedness set to forward stranded [1]" >> ${repRID}.countData.err
elif [ "${stranded}" == "reverse" ]
then
stranding=2
echo "LOG: strandedness set to reverse stranded [2]" >> ${repRID}.countData.err
fi
#Run countData
echo "LOG: running countData on the data" >> ${repRID}.countData.err
if [ "${ends}" == "se" ]
then
featureCounts -R SAM -G ./genome.fna -T `nproc` -s \${stranding} -a ./genome.gtf -o ${repRID}.countData -g 'gene_name' --primary --ignoreDup ${repRID}.sorted.deduped.bam 1>> ${repRID}.countData.out 2>> ${repRID}.countData.err
elif [ "${ends}" == "pe" ]
then
featureCounts -R SAM -p -G ./genome.fna -T `nproc` -s \${stranding} -a ./genome.gtf -o ${repRID}.countData -g 'gene_name' --primary --ignoreDup -B ${repRID}.sorted.deduped.bam 1>> ${repRID}.countData.out 2>> ${repRID}.countData.err
fi
#Calculate TPM from the resulting countData table
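# calculateTPM.R is assumed to apply the standard TPM transform: counts divided by feature
# length, then rescaled so each sample's TPM values sum to one million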
echo "LOG: calculating TPM with R" >> ${repRID}.countData.err
Rscript calculateTPM.R --count "${repRID}.countData" 1>> ${repRID}.countData.out 2>> ${repRID}.countData.err
"""
}
/*
*fastqc: run fastqc on untrimmed fastq's
*/
process fastqc {
tag "${repRID}"
publishDir "${outDir}/fastqc", mode: 'copy', pattern: "*_fastqc.zip"
publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.fastqc.{out,err}"
input:
path (fastq) from fastqs_fastqc
output:
path ("*_fastqc.zip") into fastqc
path ("${repRID}.fastqc.{out,err}")
hostname > ${repRID}.fastqc.err
ulimit -a >> ${repRID}.fastqc.err
echo "LOG: beginning FastQC analysis of the data" >> ${repRID}.fastqc.err
fastqc *.fastq.gz -o . 1>> ${repRID}.fastqc.out 2>> ${repRID}.fastqc.err
/*
* dataQC: run RSeQC to calculate transcript integrity numbers (TIN)
*/
process dataQC {
tag "${repRID}"
publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.dataQC.{out,err}"
input:
path ref from reference_dataQC
tuple path (chrBam), path (chrBai) from dedupChrBam
output:
path "${repRID}.sorted.deduped.tin.xls" into tin
path "${repRID}.dataQC.{out,err}" optional true
script:
"""
hostname > ${repRID}.dataQC.err
ulimit -a >> ${repRID}.dataQC.err
# calculate TIN values per feature on each chromosome
echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > ${repRID}.sorted.deduped.tin.xls
for i in `cat ./bed/genome.bed | cut -f1 | sort | uniq`; do
echo "echo \"LOG: running tin.py on \${i}\" >> ${repRID}.rseqc.err; tin.py -i ${repRID}.sorted.deduped.\${i}.bam -r ./bed/genome.bed 1>>${repRID}.rseqc.log 2>>${repRID}.rseqc.err; cat ${repRID}.sorted.deduped.\${i}.tin.xls | tr -s \"\\w\" \"\\t\" | grep -P \\\"\\\\t\${i}\\\\t\\\";";
done | parallel -j `nproc` -k 1>> ${repRID}.sorted.deduped.tin.xls 2>> ${repRID}.rseqc.err
"""
}