Test File Input - Process Error

3caba697 · Jeremy Mathews · 5fc25b2c · 3caba697 · 3caba697
Commit 3caba697 authored 5 years ago by Jeremy Mathews
--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -3,39 +3,6 @@
 // Default parameter values to run tests
 params.input = "$baseDir"
 params.output= "$baseDir/output"
-
-params.files="$params.input/*.f*"
-files=Channel.fromPath(params.files)
-
-process checkinputfiles {
-
-  input:
-
-  file fqs from files
-
-  output:
-
-  file("*.fastq.gz") into fastqs
-
-  script:
-
-  """
-  if [[ $fqs == *.fq ]];
-    then new_name=`echo ${fqs} | sed -e "s/.fq\$/.fastq/"`;
-    mv ${fqs} \${new_name};
-    gzip -f \${new_name};
-  elif [[ $fqs == *.fastq ]];
-    then gzip -f $fqs;
-  elif [[ $fqs == *.fq.gz ]];
-    then new_name=`echo ${fqs} | sed -e "s/.fq.gz\$/.fastq.gz/"`;
-    mv ${fqs} \${new_name};
-  else
-    zcat ${fqs} | gzip -c > ${fqs}.temp
-    mv ${fqs}.temp ${fqs}
-  fi;
-  """
-}
-
 params.design="$params.input/design.txt"
 params.genome="/project/shared/bicf_workflow_ref/human/grch38/"
 params.markdups="picard"
@@ -46,7 +13,6 @@ params.align = 'hisat'
 params.fusion = 'skip'
 params.dea = 'detect'

-design_file = file(params.design)
 design_file = file(params.design)
 gtf_file = file("$params.genome/gencode.gtf")
 genenames = file("$params.genome/genenames.txt")
@@ -57,19 +23,61 @@ indel="$params.genome/goldindels.vcf.gz"
 knownindel=file(indel)
 dbsnp=file(dbsnp)

+files = Channel
+  .fromPath("$params.input/*")
+good = Channel.fromPath("$params.input/*.fastq.gz")
+
+process checkinputfiles {
+  module 'parallel/20150122:pigz/2.4'
+  queue 'super'
+
+  input:
+
+  file ("*") from files.collect()
+  file design_file name "design.tsv"
+
+  output:
+
+  set file ("design.tsv"), file ("*.fastq.gz") into design
+  file ("*.fastq.gz") into fastqs
+
+  script:
+
+  """
+  for fqs in `ls | grep -v "^design.tsv\$"`;
+  do if [[ \$fqs == *.fq ]];
+      then new_name=`echo \${fqs} | sed -e "s/.fq\$/.fastq/"`;
+      mv \${fqs} \${new_name};
+      echo "pigz -f \${new_name}";
+    elif [[ \$fqs == *.fastq ]];
+      then echo "pigz -f \$fqs";
+    elif [[ \$fqs == *.fq.gz ]];
+      then new_name=`echo \${fqs} | sed -e "s/.fq.gz\$/.fastq.gz/"`;
+      mv \${fqs} \${new_name};
+    fi;
+  done | shuf | parallel -j\${SLURM_CPUS_ON_NODE};
+  """
+}
+
 // params genome is the directory
 // base name for the index is always genome
 index_path = file(params.genome)

 process checkdesignfile {
+  executor 'local'
  publishDir "$params.output", mode: 'copy'
+
  input:
-  file design_file name 'design.ori.txt'
+
+  set file ("design.ori.txt"), file ("*") from design

  output:
+
  file("design.valid.txt") into newdesign
  stdout spltnames
+
  script:
+
  """
  perl -p -e 's/\\r\\n*/\\n/g' design.ori.txt > design.fix.txt
  perl $baseDir/scripts/check_designfile.pl ${params.pairs} design.fix.txt
@@ -78,14 +86,17 @@ process checkdesignfile {

 def fileMap = [:]

-fastqs.each {
+fastqs
+  .mix(good)
+  .flatten()
+  .each {
   final fileName = it.getFileName().toString()
   prefix = fileName.lastIndexOf('/')
   fileMap[fileName] = it
 }

 if (params.pairs == 'pe') {
-spltnames
+  spltnames
 .splitCsv()
 .filter { fileMap.get(it[1]) != null & fileMap.get(it[2]) != null }
 .map { it -> tuple(it[0], fileMap.get(it[1]), fileMap.get(it[2])) }
@@ -94,7 +105,7 @@ spltnames
 else {
 spltnames
 .splitCsv()
- .filter { fleMap.get(it[1]) != null }
+ .filter { fileMap.get(it[1]) != null }
 .map { it -> tuple(it[0], fileMap.get(it[1]),'') }
 .set { read }
 }
@@ -103,14 +114,21 @@ if( ! read ) { error "Didn't match any input files with entries in the design fi
 //
 // Trim raw reads using trimgalore
 //
+
 process trim {
  errorStrategy 'ignore'
+
  input:
+
  set pair_id, file(read1), file(read2) from read
+
  output:
+
  set pair_id, file("${pair_id}.trim.R1.fastq.gz"),file("${pair_id}.trim.R2.fastq.gz") into trimread
  set pair_id, file("${pair_id}.trim.R1.fastq.gz"),file("${pair_id}.trim.R2.fastq.gz") into fusionfq
+
  script:
+
  """
  bash $baseDir/process_scripts/preproc_fastq/trimgalore.sh -p ${pair_id} -a ${read1} -b ${read2}
  """
@@ -126,13 +144,21 @@ process trim {
 process starfusion {
  errorStrategy 'ignore'
  publishDir "$params.output", mode: 'copy'
+
  input:
+
  set pair_id, file(fq1), file(fq2) from fusionfq
+
  output:
+
  file("${pair_id}.starfusion.txt") into fusionout
+
  when:
+
  params.fusion == 'detect' && params.pairs == 'pe'
+
  script:
+
  """
  bash $baseDir/process_scripts/alignment/starfusion.sh -p ${pair_id} -r ${index_path} -a ${fq1} -b ${fq2} -m trinity -f
  """
@@ -143,12 +169,17 @@ process align {
  publishDir "$params.output", mode: 'copy'

  input:
+
  set pair_id, file(fq1), file(fq2) from trimread
+
  output:
+
  set pair_id, file("${pair_id}.bam") into aligned
  set pair_id, file("${pair_id}.bam") into aligned2
  file("${pair_id}.alignerout.txt") into hsatout
+
  script:
+
  """
  bash $baseDir/process_scripts/alignment/rnaseqalign.sh -a $params.align -p ${pair_id} -r ${index_path} -x ${fq1} -y ${fq2}
  """
@@ -159,11 +190,16 @@ process alignqc {
  publishDir "$params.output", mode: 'copy'

  input:
+
  set pair_id, file(bam) from aligned2
+
  output:
+
  file("${pair_id}.flagstat.txt") into alignstats
  set file("${pair_id}_fastqc.zip"),file("${pair_id}_fastqc.html") into fastqc
+
  script:
+
  """
  bash $baseDir/process_scripts/alignment/bamqc.sh -p ${pair_id} -b ${bam} -y rna
  """
@@ -175,10 +211,16 @@ process parse_alignstat {
  publishDir "$params.output", mode: 'copy'

  input:
+
  file(txt) from alignstats.toList()
  file(txt) from  hsatout.toList()
+
  output:
+
  file('alignment.summary.txt')
+
+  script:
+
  """
  perl $baseDir/scripts/parse_flagstat.pl *.flagstat.txt
  """
@@ -190,11 +232,16 @@ process markdups {
  publishDir "$params.output", mode: 'copy'

  input:
+
  set pair_id, file(sbam) from aligned
+
  output:
+
  set pair_id, file("${pair_id}.dedup.bam") into deduped1
  set pair_id, file("${pair_id}.dedup.bam") into deduped2
+
  script:
+
  """
  bash $baseDir/process_scripts/alignment/markdups.sh -a $params.markdups -b $sbam -p $pair_id
  """
@@ -206,13 +253,20 @@ process markdups {
 process geneabund {
  errorStrategy 'ignore'
  publishDir "$params.output", mode: 'copy'
+
  input:
+
  set pair_id, file(sbam) from deduped1
+
  output:
+
  file("${pair_id}.cts")  into counts
  file("${pair_id}.cts.summary")  into ctsum
  file("${pair_id}_stringtie") into strcts
  file("${pair_id}.fpkm.txt") into fpkm
+
+  script:
+
  """
  bash $baseDir/process_scripts/genect_rnaseq/geneabundance.sh -s $params.stranded -g ${gtf_file} -p ${pair_id} -b ${sbam}
  """
@@ -221,7 +275,9 @@ process geneabund {
 process statanal {
  errorStrategy 'ignore'
  publishDir "$params.output", mode: 'copy'
+
  input:
+
  file count_file from counts.toList()
  file count_sum from ctsum.toList()
  file newdesign name 'design.txt'
@@ -229,29 +285,41 @@ process statanal {
  file geneset name 'geneset.gmt'
  file fpkm_file from fpkm.toList()
  file stringtie_dir from strcts.toList()
-   output:
+
+  output:
+
  file "*.txt" into txtfiles
  file "*.png" into psfiles
  file("*.rda") into rdafiles
  file("geneset.shiny.gmt") into gmtfile
+
  when:
+
  script:
+
  """
  bash $baseDir/process_scripts/genect_rnaseq/statanal.sh -d $params.dea
  """
 }
+
 process gatkbam {
  errorStrategy 'ignore'
  publishDir "$params.output", mode: 'copy'

  input:
+
  set pair_id, file(rbam) from deduped2

  output:
+
  set file("${pair_id}.final.bam"),file("${pair_id}.final.bai") into gatkbam
+
  when:
+
  params.align == 'hisat' && $index_path == '/project/shared/bicf_workflow_ref/GRCh38/'
+
  script:
+
  """
  bash $baseDir/process_scripts/variants/gatkrunner.sh -a gatkbam_rna -b $rbam -r ${index_path}/hisat_index -p $pair_id
  """

--- a/workflow/scripts/check_designfile.pl
+++ b/workflow/scripts/check_designfile.pl
 #!/usr/bin/perl -w
 #check_designfile.pl

+use strict;
+use warnings;
+
 my $pe = shift @ARGV;
 my $dfile = shift @ARGV;
 open OUT, ">design.valid.txt" or die $!;
@@ -11,7 +14,6 @@ $head =~ s/FullPathTo//g;
 my @colnames = split(/\t/,$head);
 my %newcols = map {$_=> 1} @colnames;

-
 unless (grep(/FqR1/,@colnames)) {
    die "Missing Sequence Files in Design File: FqR1\n";
 }
@@ -49,16 +51,36 @@ while (my $line = <DFILE>) {
    $hash{FinalBam} = $hash{SampleID}.".final.bam";
    $hash{Bam} = $hash{SampleID}.".bam";
    unless ($hash{SampleGroup}) {
-	$j = $lnct %% 2;
+	my $j = $lnct %% 2;
 	$hash{SampleGroup} = $grp[$j];
    }
    $hash{SampleGroup} =~ s/_//g;
+    unless ($hash{FqR1} =~ m/.fastq.gz/) {
+        my $name = $hash{FqR1};
+        $name =~ s/.f.*/.fastq.gz/;
+        unless ($hash{FqR1} eq $name) {
+	    $hash{FqR1} = $name;
+            unless (-e ($name)) {
+                    die "Unable to find fastq read 1\n${name}\n";
+            }
+	}
+    }
+    $hash{FqR2} = 'na' unless ($hash{FqR2});
+    unless ($hash{FqR2} eq 'na') {
+        my $name = $hash{FqR2};
+        $name =~ s/.f.*/.fastq.gz/;
+        unless ($hash{FqR2} eq $name) {
+	    $hash{FqR2} = $name;
+            unless (-e ($name)) {
+                    die "Unable to find fastq read 2\n${name}\n";
+            }
+	}
+    }
    my @line;
-    foreach $f (@cols) {
+    foreach my $f (@cols) {
 	push @line, $hash{$f};
    }
    print OUT join("\t",@line),"\n";
-    $hash{FqR2} = 'na' unless ($hash{FqR2});
    print join(",",$hash{SampleID},$hash{FqR1},$hash{FqR2}),"\n";
    $lnct ++;
 }