diff --git a/workflow/main.nf b/workflow/main.nf index 373469854d65643314e08163b2ed69972d607897..41622759eee0e4c74157d9ca4b403efa91ae3833 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -3,39 +3,6 @@ // Default parameter values to run tests params.input = "$baseDir" params.output= "$baseDir/output" - -params.files="$params.input/*.f*" -files=Channel.fromPath(params.files) - -process checkinputfiles { - - input: - - file fqs from files - - output: - - file("*.fastq.gz") into fastqs - - script: - - """ - if [[ $fqs == *.fq ]]; - then new_name=`echo ${fqs} | sed -e "s/.fq\$/.fastq/"`; - mv ${fqs} \${new_name}; - gzip -f \${new_name}; - elif [[ $fqs == *.fastq ]]; - then gzip -f $fqs; - elif [[ $fqs == *.fq.gz ]]; - then new_name=`echo ${fqs} | sed -e "s/.fq.gz\$/.fastq.gz/"`; - mv ${fqs} \${new_name}; - else - zcat ${fqs} | gzip -c > ${fqs}.temp - mv ${fqs}.temp ${fqs} - fi; - """ -} - params.design="$params.input/design.txt" params.genome="/project/shared/bicf_workflow_ref/human/grch38/" params.markdups="picard" @@ -46,7 +13,6 @@ params.align = 'hisat' params.fusion = 'skip' params.dea = 'detect' -design_file = file(params.design) design_file = file(params.design) gtf_file = file("$params.genome/gencode.gtf") genenames = file("$params.genome/genenames.txt") @@ -57,19 +23,61 @@ indel="$params.genome/goldindels.vcf.gz" knownindel=file(indel) dbsnp=file(dbsnp) +files = Channel + .fromPath("$params.input/*") +good = Channel.fromPath("$params.input/*.fastq.gz") + +process checkinputfiles { + module 'parallel/20150122:pigz/2.4' + queue 'super' + + input: + + file ("*") from files.collect() + file design_file name "design.tsv" + + output: + + set file ("design.tsv"), file ("*.fastq.gz") into design + file ("*.fastq.gz") into fastqs + + script: + + """ + for fqs in `ls | grep -v "^design.tsv\$"`; + do if [[ \$fqs == *.fq ]]; + then new_name=`echo \${fqs} | sed -e "s/.fq\$/.fastq/"`; + mv \${fqs} \${new_name}; + echo "pigz -f \${new_name}"; + elif [[ \$fqs == *.fastq ]]; + then echo "pigz -f \$fqs"; + elif [[ \$fqs == *.fq.gz ]]; + then new_name=`echo \${fqs} | sed -e "s/.fq.gz\$/.fastq.gz/"`; + mv \${fqs} \${new_name}; + fi; + done | shuf | parallel -j\${SLURM_CPUS_ON_NODE}; + """ +} + // params genome is the directory // base name for the index is always genome index_path = file(params.genome) process checkdesignfile { + executor 'local' publishDir "$params.output", mode: 'copy' + input: - file design_file name 'design.ori.txt' + + set file ("design.ori.txt"), file ("*") from design output: + file("design.valid.txt") into newdesign stdout spltnames + script: + """ perl -p -e 's/\\r\\n*/\\n/g' design.ori.txt > design.fix.txt perl $baseDir/scripts/check_designfile.pl ${params.pairs} design.fix.txt @@ -78,14 +86,17 @@ process checkdesignfile { def fileMap = [:] -fastqs.each { +fastqs + .mix(good) + .flatten() + .each { final fileName = it.getFileName().toString() prefix = fileName.lastIndexOf('/') fileMap[fileName] = it } if (params.pairs == 'pe') { -spltnames + spltnames .splitCsv() .filter { fileMap.get(it[1]) != null & fileMap.get(it[2]) != null } .map { it -> tuple(it[0], fileMap.get(it[1]), fileMap.get(it[2])) } @@ -94,7 +105,7 @@ spltnames else { spltnames .splitCsv() - .filter { fleMap.get(it[1]) != null } + .filter { fileMap.get(it[1]) != null } .map { it -> tuple(it[0], fileMap.get(it[1]),'') } .set { read } } @@ -103,14 +114,21 @@ if( ! read ) { error "Didn't match any input files with entries in the design fi // // Trim raw reads using trimgalore // + process trim { errorStrategy 'ignore' + input: + set pair_id, file(read1), file(read2) from read + output: + set pair_id, file("${pair_id}.trim.R1.fastq.gz"),file("${pair_id}.trim.R2.fastq.gz") into trimread set pair_id, file("${pair_id}.trim.R1.fastq.gz"),file("${pair_id}.trim.R2.fastq.gz") into fusionfq + script: + """ bash $baseDir/process_scripts/preproc_fastq/trimgalore.sh -p ${pair_id} -a ${read1} -b ${read2} """ @@ -126,13 +144,21 @@ process trim { process starfusion { errorStrategy 'ignore' publishDir "$params.output", mode: 'copy' + input: + set pair_id, file(fq1), file(fq2) from fusionfq + output: + file("${pair_id}.starfusion.txt") into fusionout + when: + params.fusion == 'detect' && params.pairs == 'pe' + script: + """ bash $baseDir/process_scripts/alignment/starfusion.sh -p ${pair_id} -r ${index_path} -a ${fq1} -b ${fq2} -m trinity -f """ @@ -143,12 +169,17 @@ process align { publishDir "$params.output", mode: 'copy' input: + set pair_id, file(fq1), file(fq2) from trimread + output: + set pair_id, file("${pair_id}.bam") into aligned set pair_id, file("${pair_id}.bam") into aligned2 file("${pair_id}.alignerout.txt") into hsatout + script: + """ bash $baseDir/process_scripts/alignment/rnaseqalign.sh -a $params.align -p ${pair_id} -r ${index_path} -x ${fq1} -y ${fq2} """ @@ -159,11 +190,16 @@ process alignqc { publishDir "$params.output", mode: 'copy' input: + set pair_id, file(bam) from aligned2 + output: + file("${pair_id}.flagstat.txt") into alignstats set file("${pair_id}_fastqc.zip"),file("${pair_id}_fastqc.html") into fastqc + script: + """ bash $baseDir/process_scripts/alignment/bamqc.sh -p ${pair_id} -b ${bam} -y rna """ @@ -175,10 +211,16 @@ process parse_alignstat { publishDir "$params.output", mode: 'copy' input: + file(txt) from alignstats.toList() file(txt) from hsatout.toList() + output: + file('alignment.summary.txt') + + script: + """ perl $baseDir/scripts/parse_flagstat.pl *.flagstat.txt """ @@ -190,11 +232,16 @@ process markdups { publishDir "$params.output", mode: 'copy' input: + set pair_id, file(sbam) from aligned + output: + set pair_id, file("${pair_id}.dedup.bam") into deduped1 set pair_id, file("${pair_id}.dedup.bam") into deduped2 + script: + """ bash $baseDir/process_scripts/alignment/markdups.sh -a $params.markdups -b $sbam -p $pair_id """ @@ -206,13 +253,20 @@ process markdups { process geneabund { errorStrategy 'ignore' publishDir "$params.output", mode: 'copy' + input: + set pair_id, file(sbam) from deduped1 + output: + file("${pair_id}.cts") into counts file("${pair_id}.cts.summary") into ctsum file("${pair_id}_stringtie") into strcts file("${pair_id}.fpkm.txt") into fpkm + + script: + """ bash $baseDir/process_scripts/genect_rnaseq/geneabundance.sh -s $params.stranded -g ${gtf_file} -p ${pair_id} -b ${sbam} """ @@ -221,7 +275,9 @@ process geneabund { process statanal { errorStrategy 'ignore' publishDir "$params.output", mode: 'copy' + input: + file count_file from counts.toList() file count_sum from ctsum.toList() file newdesign name 'design.txt' @@ -229,29 +285,41 @@ process statanal { file geneset name 'geneset.gmt' file fpkm_file from fpkm.toList() file stringtie_dir from strcts.toList() - output: + + output: + file "*.txt" into txtfiles file "*.png" into psfiles file("*.rda") into rdafiles file("geneset.shiny.gmt") into gmtfile + when: + script: + """ bash $baseDir/process_scripts/genect_rnaseq/statanal.sh -d $params.dea """ } + process gatkbam { errorStrategy 'ignore' publishDir "$params.output", mode: 'copy' input: + set pair_id, file(rbam) from deduped2 output: + set file("${pair_id}.final.bam"),file("${pair_id}.final.bai") into gatkbam + when: + params.align == 'hisat' && $index_path == '/project/shared/bicf_workflow_ref/GRCh38/' + script: + """ bash $baseDir/process_scripts/variants/gatkrunner.sh -a gatkbam_rna -b $rbam -r ${index_path}/hisat_index -p $pair_id """ diff --git a/workflow/scripts/check_designfile.pl b/workflow/scripts/check_designfile.pl index 692f9ce498cf22ead33c692d9d721a9837af1055..bf941465e13a345dbe2b1d0bed9406fbd6bc4184 100755 --- a/workflow/scripts/check_designfile.pl +++ b/workflow/scripts/check_designfile.pl @@ -1,6 +1,9 @@ #!/usr/bin/perl -w #check_designfile.pl +use strict; +use warnings; + my $pe = shift @ARGV; my $dfile = shift @ARGV; open OUT, ">design.valid.txt" or die $!; @@ -11,7 +14,6 @@ $head =~ s/FullPathTo//g; my @colnames = split(/\t/,$head); my %newcols = map {$_=> 1} @colnames; - unless (grep(/FqR1/,@colnames)) { die "Missing Sequence Files in Design File: FqR1\n"; } @@ -49,16 +51,36 @@ while (my $line = <DFILE>) { $hash{FinalBam} = $hash{SampleID}.".final.bam"; $hash{Bam} = $hash{SampleID}.".bam"; unless ($hash{SampleGroup}) { - $j = $lnct %% 2; + my $j = $lnct %% 2; $hash{SampleGroup} = $grp[$j]; } $hash{SampleGroup} =~ s/_//g; + unless ($hash{FqR1} =~ m/.fastq.gz/) { + my $name = $hash{FqR1}; + $name =~ s/.f.*/.fastq.gz/; + unless ($hash{FqR1} eq $name) { + $hash{FqR1} = $name; + unless (-e ($name)) { + die "Unable to find fastq read 1\n${name}\n"; + } + } + } + $hash{FqR2} = 'na' unless ($hash{FqR2}); + unless ($hash{FqR2} eq 'na') { + my $name = $hash{FqR2}; + $name =~ s/.f.*/.fastq.gz/; + unless ($hash{FqR2} eq $name) { + $hash{FqR2} = $name; + unless (-e ($name)) { + die "Unable to find fastq read 2\n${name}\n"; + } + } + } my @line; - foreach $f (@cols) { + foreach my $f (@cols) { push @line, $hash{$f}; } print OUT join("\t",@line),"\n"; - $hash{FqR2} = 'na' unless ($hash{FqR2}); print join(",",$hash{SampleID},$hash{FqR1},$hash{FqR2}),"\n"; $lnct ++; }