File Name Check

e7ad35bc · Jeremy Mathews · 3caba697 · e7ad35bc · e7ad35bc · e7ad35bc
Commit e7ad35bc authored 5 years ago by Jeremy Mathews
--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -4,7 +4,7 @@
 params.input = "$baseDir"
 params.output= "$baseDir/output"
 params.design="$params.input/design.txt"
-params.genome="/project/shared/bicf_workflow_ref/human/grch38/"
+params.genome="/project/shared/bicf_workflow_ref/human/GRCh38/"
 params.markdups="picard"
 params.stranded="0"
 params.pairs="pe"
@@ -14,313 +14,231 @@ params.fusion = 'skip'
 params.dea = 'detect'

 design_file = file(params.design)
+fqs = Channel.fromPath("$params.input/*")
 gtf_file = file("$params.genome/gencode.gtf")
 genenames = file("$params.genome/genenames.txt")
-
 geneset = file("$params.genome/../gsea_gmt/$params.geneset")
-dbsnp="$params.genome/dbsnp.vcf.gz"
-indel="$params.genome/goldindels.vcf.gz"
+dbsnp="$params.genome/dbSnp.vcf.gz"
+indel="$params.genome/GoldIndels.vcf.gz"
 knownindel=file(indel)
 dbsnp=file(dbsnp)

-files = Channel
-  .fromPath("$params.input/*")
-good = Channel.fromPath("$params.input/*.fastq.gz")
-
-process checkinputfiles {
-  module 'parallel/20150122:pigz/2.4'
-  queue 'super'
-
-  input:
-
-  file ("*") from files.collect()
-  file design_file name "design.tsv"
-
-  output:
-
-  set file ("design.tsv"), file ("*.fastq.gz") into design
-  file ("*.fastq.gz") into fastqs
-
-  script:
-
-  """
-  for fqs in `ls | grep -v "^design.tsv\$"`;
-  do if [[ \$fqs == *.fq ]];
-      then new_name=`echo \${fqs} | sed -e "s/.fq\$/.fastq/"`;
-      mv \${fqs} \${new_name};
-      echo "pigz -f \${new_name}";
-    elif [[ \$fqs == *.fastq ]];
-      then echo "pigz -f \$fqs";
-    elif [[ \$fqs == *.fq.gz ]];
-      then new_name=`echo \${fqs} | sed -e "s/.fq.gz\$/.fastq.gz/"`;
-      mv \${fqs} \${new_name};
-    fi;
-  done | shuf | parallel -j\${SLURM_CPUS_ON_NODE};
-  """
-}
-
 // params genome is the directory
 // base name for the index is always genome
 index_path = file(params.genome)

 process checkdesignfile {
-  executor 'local'
-  publishDir "$params.output", mode: 'copy'
-
-  input:
-
-  set file ("design.ori.txt"), file ("*") from design
-
-  output:
-
-  file("design.valid.txt") into newdesign
-  stdout spltnames
-
-  script:
-
-  """
-  perl -p -e 's/\\r\\n*/\\n/g' design.ori.txt > design.fix.txt
-  perl $baseDir/scripts/check_designfile.pl ${params.pairs} design.fix.txt
-  """
+	queue 'super'
+	module 'parallel/20150122:pigz/2.4'
+	publishDir "$params.output", mode: 'copy'
+
+	input:
+        file design_file name 'design.ori.txt'
+	file ("*") from fqs.collect()
+
+	output:
+	file("design.valid.txt") into newdesign
+	file("*.fastq.gz") into fastqs mode flatten
+	stdout spltnames
+
+	script:
+	"""
+	bash $baseDir/scripts/check_inputfiles.sh 
+	perl -p -e 's/\\r\\n*/\\n/g' design.ori.txt > design.fix.txt
+	perl $baseDir/scripts/check_designfile.pl ${params.pairs} design.fix.txt
+	"""
 }

 def fileMap = [:]

 fastqs
-  .mix(good)
-  .flatten()
-  .each {
-   final fileName = it.getFileName().toString()
-   prefix = fileName.lastIndexOf('/')
-   fileMap[fileName] = it
-}
+    .subscribe { 
+	def fileName = it.getFileName()
+	fileMap."$fileName" = it
+    }

 if (params.pairs == 'pe') {
  spltnames
- .splitCsv()
- .filter { fileMap.get(it[1]) != null & fileMap.get(it[2]) != null }
- .map { it -> tuple(it[0], fileMap.get(it[1]), fileMap.get(it[2])) }
- .set { read }
-}
-else {
-spltnames
- .splitCsv()
- .filter { fileMap.get(it[1]) != null }
- .map { it -> tuple(it[0], fileMap.get(it[1]),'') }
- .set { read }
+	.splitCsv()
+	.filter { fileMap.get(it[1]) != null & fileMap.get(it[2]) != null }
+	.map { it -> tuple(it[0], fileMap.get(it[1]), fileMap.get(it[2])) }
+	.set { read }
+} else {
+  spltnames
+	.splitCsv()
+	.filter { fileMap.get(it[1]) != null }
+	.map { it -> tuple(it[0], fileMap.get(it[1]),'') }
+	.set { read }
 }
 if( ! read ) { error "Didn't match any input files with entries in the design file" }

-//
 // Trim raw reads using trimgalore
-//
-
 process trim {
-  errorStrategy 'ignore'
-
-  input:

-  set pair_id, file(read1), file(read2) from read
+	input:
+	set pair_id, file(read1), file(read2) from read

-  output:
+	output:
+	set pair_id, file("${pair_id}.trim.R1.fastq.gz"),file("${pair_id}.trim.R2.fastq.gz") into trimread
+	set pair_id, file("${pair_id}.trim.R1.fastq.gz"),file("${pair_id}.trim.R2.fastq.gz") into fusionfq

-  set pair_id, file("${pair_id}.trim.R1.fastq.gz"),file("${pair_id}.trim.R2.fastq.gz") into trimread
-  set pair_id, file("${pair_id}.trim.R1.fastq.gz"),file("${pair_id}.trim.R2.fastq.gz") into fusionfq
-
-  script:
-
-  """
-  bash $baseDir/process_scripts/preproc_fastq/trimgalore.sh -p ${pair_id} -a ${read1} -b ${read2}
-  """
+	script:
+	"""
+	bash $baseDir/process_scripts/preproc_fastq/trimgalore.sh -p ${pair_id} -a ${read1} -b ${read2}
+	"""
 }

-//
 // Align trimmed reads to genome indes with hisat2
 // Sort and index with samtools
 // QC aligned reads with fastqc
 // Alignment stats with samtools
-//

 process starfusion {
-  errorStrategy 'ignore'
-  publishDir "$params.output", mode: 'copy'
-
-  input:
-
-  set pair_id, file(fq1), file(fq2) from fusionfq
-
-  output:
-
-  file("${pair_id}.starfusion.txt") into fusionout
+        publishDir "$params.output", mode: 'copy'

-  when:
+        input:
+        set pair_id, file(fq1), file(fq2) from fusionfq

-  params.fusion == 'detect' && params.pairs == 'pe'
+        output:
+        file("${pair_id}.starfusion.txt") into fusionout

-  script:
+        when:
+        params.fusion == 'detect' && params.pairs == 'pe'

-  """
-  bash $baseDir/process_scripts/alignment/starfusion.sh -p ${pair_id} -r ${index_path} -a ${fq1} -b ${fq2} -m trinity -f
-  """
+        script:
+        """
+        bash $baseDir/process_scripts/alignment/starfusion.sh -p ${pair_id} -r ${index_path} -a ${fq1} -b ${fq2} -m trinity -f
+        """
 }

 process align {
-  errorStrategy 'ignore'
-  publishDir "$params.output", mode: 'copy'
+        publishDir "$params.output", mode: 'copy'

-  input:
+        input:
+        set pair_id, file(fq1), file(fq2) from trimread

-  set pair_id, file(fq1), file(fq2) from trimread
+        output:
+        set pair_id, file("${pair_id}.bam") into aligned
+        set pair_id, file("${pair_id}.bam") into aligned2
+        file("${pair_id}.alignerout.txt") into hsatout

-  output:
-
-  set pair_id, file("${pair_id}.bam") into aligned
-  set pair_id, file("${pair_id}.bam") into aligned2
-  file("${pair_id}.alignerout.txt") into hsatout
-
-  script:
-
-  """
-  bash $baseDir/process_scripts/alignment/rnaseqalign.sh -a $params.align -p ${pair_id} -r ${index_path} -x ${fq1} -y ${fq2}
-  """
+        script:
+        """
+        bash $baseDir/process_scripts/alignment/rnaseqalign.sh -a $params.align -p ${pair_id} -r ${index_path} -x ${fq1} -y ${fq2}
+        """
 }

 process alignqc {
-  errorStrategy 'ignore'
-  publishDir "$params.output", mode: 'copy'
-
-  input:
+        publishDir "$params.output", mode: 'copy'

-  set pair_id, file(bam) from aligned2
+        input:
+        set pair_id, file(bam) from aligned2

-  output:
+        output:
+        file("${pair_id}.flagstat.txt") into alignstats
+        set file("${pair_id}_fastqc.zip"),file("${pair_id}_fastqc.html") into fastqc

-  file("${pair_id}.flagstat.txt") into alignstats
-  set file("${pair_id}_fastqc.zip"),file("${pair_id}_fastqc.html") into fastqc
-
-  script:
-
-  """
-  bash $baseDir/process_scripts/alignment/bamqc.sh -p ${pair_id} -b ${bam} -y rna
-  """
+        script:
+        """
+        bash $baseDir/process_scripts/alignment/bamqc.sh -p ${pair_id} -b ${bam} -y rna
+        """
 }

 // Summarize all flagstat output

 process parse_alignstat {
-  publishDir "$params.output", mode: 'copy'
-
-  input:
+        publishDir "$params.output", mode: 'copy'

-  file(txt) from alignstats.toList()
-  file(txt) from  hsatout.toList()
+        input:
+        file(txt) from alignstats.toList()
+        file(txt) from  hsatout.toList()

-  output:
+        output:
+        file('alignment.summary.txt')

-  file('alignment.summary.txt')
-
-  script:
-
-  """
-  perl $baseDir/scripts/parse_flagstat.pl *.flagstat.txt
-  """
+        script:
+        """
+        perl $baseDir/scripts/parse_flagstat.pl *.flagstat.txt
+        """
 }

 // Identify duplicate reads with Picard

 process markdups {
-  publishDir "$params.output", mode: 'copy'
-
-  input:
+        publishDir "$params.output", mode: 'copy'

-  set pair_id, file(sbam) from aligned
+        input:
+        set pair_id, file(sbam) from aligned

-  output:
+        output:
+        set pair_id, file("${pair_id}.dedup.bam") into deduped1
+        set pair_id, file("${pair_id}.dedup.bam") into deduped2

-  set pair_id, file("${pair_id}.dedup.bam") into deduped1
-  set pair_id, file("${pair_id}.dedup.bam") into deduped2
-
-  script:
-
-  """
-  bash $baseDir/process_scripts/alignment/markdups.sh -a $params.markdups -b $sbam -p $pair_id
-  """
+        script:
+        """
+        bash $baseDir/process_scripts/alignment/markdups.sh -a $params.markdups -b $sbam -p $pair_id
+        """
 }

 // Read summarization with subread
 // Assemble transcripts with stringtie

 process geneabund {
-  errorStrategy 'ignore'
-  publishDir "$params.output", mode: 'copy'
-
-  input:
+        publishDir "$params.output", mode: 'copy'

-  set pair_id, file(sbam) from deduped1
+        input:
+        set pair_id, file(sbam) from deduped1

-  output:
+        output:
+        file("${pair_id}.cts") into counts
+        file("${pair_id}.cts.summary") into ctsum
+        file("${pair_id}_stringtie") into strcts
+        file("${pair_id}.fpkm.txt") into fpkm

-  file("${pair_id}.cts")  into counts
-  file("${pair_id}.cts.summary")  into ctsum
-  file("${pair_id}_stringtie") into strcts
-  file("${pair_id}.fpkm.txt") into fpkm
-
-  script:
-
-  """
-  bash $baseDir/process_scripts/genect_rnaseq/geneabundance.sh -s $params.stranded -g ${gtf_file} -p ${pair_id} -b ${sbam}
-  """
+        script:
+        """
+        bash $baseDir/process_scripts/genect_rnaseq/geneabundance.sh -s $params.stranded -g ${gtf_file} -p ${pair_id} -b ${sbam}
+        """
 }

 process statanal {
-  errorStrategy 'ignore'
-  publishDir "$params.output", mode: 'copy'
-
-  input:
-
-  file count_file from counts.toList()
-  file count_sum from ctsum.toList()
-  file newdesign name 'design.txt'
-  file genenames
-  file geneset name 'geneset.gmt'
-  file fpkm_file from fpkm.toList()
-  file stringtie_dir from strcts.toList()
-
-  output:
-
-  file "*.txt" into txtfiles
-  file "*.png" into psfiles
-  file("*.rda") into rdafiles
-  file("geneset.shiny.gmt") into gmtfile
-
-  when:
-
-  script:
-
-  """
-  bash $baseDir/process_scripts/genect_rnaseq/statanal.sh -d $params.dea
-  """
+        publishDir "$params.output", mode: 'copy'
+
+        input:
+        file count_file from counts.toList()
+        file count_sum from ctsum.toList()
+        file newdesign name 'design.txt'
+        file genenames
+        file geneset name 'geneset.gmt'
+        file fpkm_file from fpkm.toList()
+        file stringtie_dir from strcts.toList()
+
+        output:
+        file "*.txt" into txtfiles
+        file "*.png" into psfiles
+        file("*.rda") into rdafiles
+        file("geneset.shiny.gmt") into gmtfile
+
+        script:
+        """
+        bash $baseDir/process_scripts/genect_rnaseq/statanal.sh -d $params.dea
+        """
 }

 process gatkbam {
-  errorStrategy 'ignore'
-  publishDir "$params.output", mode: 'copy'
-
-  input:
-
-  set pair_id, file(rbam) from deduped2
-
-  output:
-
-  set file("${pair_id}.final.bam"),file("${pair_id}.final.bai") into gatkbam
+        publishDir "$params.output", mode: 'copy'

-  when:
+        input:
+        set pair_id, file(rbam) from deduped2

-  params.align == 'hisat' && $index_path == '/project/shared/bicf_workflow_ref/GRCh38/'
+        output:
+        set file("${pair_id}.final.bam"),file("${pair_id}.final.bai") into gatkbam

-  script:
+        when:
+        params.align == 'hisat' && $index_path == '/project/shared/bicf_workflow_ref/GRCh38/'

-  """
-  bash $baseDir/process_scripts/variants/gatkrunner.sh -a gatkbam_rna -b $rbam -r ${index_path}/hisat_index -p $pair_id
-  """
+        script:
+        """
+        bash $baseDir/process_scripts/variants/gatkrunner.sh -a gatkbam_rna -b $rbam -r ${index_path}/hisat_index -p $pair_id
+        """
 }
--- a/workflow/scripts/check_designfile.pl
+++ b/workflow/scripts/check_designfile.pl
@@ -55,9 +55,9 @@ while (my $line = <DFILE>) {
 	$hash{SampleGroup} = $grp[$j];
    }
    $hash{SampleGroup} =~ s/_//g;
-    unless ($hash{FqR1} =~ m/.fastq.gz/) {
+    unless ($hash{FqR1} =~ m/_good.fastq.gz/) {
        my $name = $hash{FqR1};
-        $name =~ s/.f.*/.fastq.gz/;
+        $name =~ s/.f.*/_good.fastq.gz/;
        unless ($hash{FqR1} eq $name) {
 	    $hash{FqR1} = $name;
            unless (-e ($name)) {
@@ -68,7 +68,7 @@ while (my $line = <DFILE>) {
    $hash{FqR2} = 'na' unless ($hash{FqR2});
    unless ($hash{FqR2} eq 'na') {
        my $name = $hash{FqR2};
-        $name =~ s/.f.*/.fastq.gz/;
+        $name =~ s/.f.*/_good.fastq.gz/;
        unless ($hash{FqR2} eq $name) {
 	    $hash{FqR2} = $name;
            unless (-e ($name)) {

--- a/workflow/scripts/check_inputfiles.sh
+++ b/workflow/scripts/check_inputfiles.sh
+#!/bin/bash
+#check_inputfiles.sh
+
+fqs=`ls *.f*`
+
+for i in $fqs;
+do
+   if [[ ${i} == *.fq ]];
+   then
+	new_name=`echo ${i} | sed -e "s/.fq\$/_good.fastq/"`;
+	mv ${i} ${new_name};
+	`pigz -f ${new_name}`;
+   elif [[ ${i} == *.fastq ]];
+   then
+	new_name=`echo ${i} | sed -e "s/.fastq\$/_good.fastq/"`;
+	mv ${i} ${new_name};
+	`pigz -f ${new_name}`;
+   elif [[ ${i} == *.fq.gz ]];
+   then
+	new_name=`echo ${i} | sed -e "s/.fq.gz\$/_good.fastq.gz/"`;
+	mv ${i} ${new_name};
+   else
+	new_name=`echo ${i} | sed -e "s/.fastq.gz\$/_good.fastq.gz/"`;
+	mv ${i} ${new_name};
+   fi;
+done