Commit 7c2ae90f authored by Brandi Cantarel's avatar Brandi Cantarel
Browse files

Merge branch '15-CheckInputFiles' into 'master'

Resolve "Check file inputs"

Closes #15 and #16

See merge request !7
parents cc35440c 888e1ee4
Pipeline #4285 passed with stage
in 156 minutes and 55 seconds
......@@ -19,6 +19,6 @@ test_human:
test_mouse:
stage: integration
script:
- nextflow run -with-dag flowchart.png -with-timeline mouse_timeline.html -with-report mouse_report.html workflow/main.nf --input /project/shared/bicf_workflow_ref/workflow_testdata/rnaseq --design /project/shared/bicf_workflow_ref/workflow_testdata/rnaseq/mouse_se.design.txt --pairs se --fusion skip --genome /project/shared/bicf_workflow_ref/GRCm38 --markdups null --output mouse_output
- nextflow run -with-dag flowchart.png -with-timeline mouse_timeline.html -with-report mouse_report.html workflow/main.nf --input /project/shared/bicf_workflow_ref/workflow_testdata/rnaseq --design /project/shared/bicf_workflow_ref/workflow_testdata/rnaseq/mouse_se.design.txt --pairs se --fusion skip --genome /project/shared/bicf_workflow_ref/mouse/GRCm38 --markdups null --output mouse_output
artifacts:
expire_in: 2 days
......@@ -3,9 +3,7 @@
// Default parameter values to run tests
params.input = "$baseDir"
params.output= "$baseDir/output"
params.fastqs="$params.input/*.fastq.gz"
params.design="$params.input/design.txt"
params.genome="/project/shared/bicf_workflow_ref/human/GRCh38/"
params.markdups="picard"
params.stranded="0"
......@@ -16,11 +14,9 @@ params.fusion = 'skip'
params.dea = 'detect'
design_file = file(params.design)
fastqs=file(params.fastqs)
design_file = file(params.design)
fqs = Channel.fromPath("$params.input/*")
gtf_file = file("$params.genome/gencode.gtf")
genenames = file("$params.genome/genenames.txt")
geneset = file("$params.genome/../gsea_gmt/$params.geneset")
dbsnp="$params.genome/dbSnp.vcf.gz"
indel="$params.genome/GoldIndels.vcf.gz"
......@@ -32,197 +28,217 @@ dbsnp=file(dbsnp)
index_path = file(params.genome)
process checkdesignfile {
publishDir "$params.output", mode: 'copy'
input:
file design_file name 'design.ori.txt'
output:
file("design.valid.txt") into newdesign
stdout spltnames
script:
"""
perl -p -e 's/\\r\\n*/\\n/g' design.ori.txt > design.fix.txt
perl $baseDir/scripts/check_designfile.pl ${params.pairs} design.fix.txt
"""
queue 'super'
module 'parallel/20150122:pigz/2.4'
publishDir "$params.output", mode: 'copy'
input:
file design_file name 'design.ori.txt'
file ("*") from fqs.collect()
output:
file("design.valid.txt") into newdesign
file("*.fastq.gz") into fastqs mode flatten
stdout spltnames
script:
"""
bash $baseDir/scripts/check_inputfiles.sh
perl -p -e 's/\\r\\n*/\\n/g' design.ori.txt > design.fix.txt
perl $baseDir/scripts/check_designfile.pl ${params.pairs} design.fix.txt
"""
}
def fileMap = [:]
fastqs.each {
final fileName = it.getFileName().toString()
prefix = fileName.lastIndexOf('/')
fileMap[fileName] = it
}
fastqs
.subscribe {
def fileName = it.getFileName()
fileMap."$fileName" = it
}
if (params.pairs == 'pe') {
spltnames
.splitCsv()
.filter { fileMap.get(it[1]) != null & fileMap.get(it[2]) != null }
.map { it -> tuple(it[0], fileMap.get(it[1]), fileMap.get(it[2])) }
.set { read }
}
else {
spltnames
.splitCsv()
.filter { fileMap.get(it[1]) != null }
.map { it -> tuple(it[0], fileMap.get(it[1]),'') }
.set { read }
spltnames
.splitCsv()
.filter { fileMap.get(it[1]) != null & fileMap.get(it[2]) != null }
.map { it -> tuple(it[0], fileMap.get(it[1]), fileMap.get(it[2])) }
.set { read }
} else {
spltnames
.splitCsv()
.filter { fileMap.get(it[1]) != null }
.map { it -> tuple(it[0], fileMap.get(it[1]),'') }
.set { read }
}
if( ! read ) { error "Didn't match any input files with entries in the design file" }
//
// Trim raw reads using trimgalore
//
process trim {
errorStrategy 'ignore'
input:
set pair_id, file(read1), file(read2) from read
output:
set pair_id, file("${pair_id}.trim.R1.fastq.gz"),file("${pair_id}.trim.R2.fastq.gz") into trimread
set pair_id, file("${pair_id}.trim.R1.fastq.gz"),file("${pair_id}.trim.R2.fastq.gz") into fusionfq
script:
"""
bash $baseDir/process_scripts/preproc_fastq/trimgalore.sh -p ${pair_id} -a ${read1} -b ${read2}
"""
input:
set pair_id, file(read1), file(read2) from read
output:
set pair_id, file("${pair_id}.trim.R1.fastq.gz"),file("${pair_id}.trim.R2.fastq.gz") into trimread
set pair_id, file("${pair_id}.trim.R1.fastq.gz"),file("${pair_id}.trim.R2.fastq.gz") into fusionfq
script:
"""
bash $baseDir/process_scripts/preproc_fastq/trimgalore.sh -p ${pair_id} -a ${read1} -b ${read2}
"""
}
//
// Align trimmed reads to genome indes with hisat2
// Sort and index with samtools
// QC aligned reads with fastqc
// Alignment stats with samtools
//
process starfusion {
errorStrategy 'ignore'
publishDir "$params.output", mode: 'copy'
input:
set pair_id, file(fq1), file(fq2) from fusionfq
output:
file("${pair_id}.starfusion.txt") into fusionout
when:
params.fusion == 'detect' && params.pairs == 'pe'
script:
"""
bash $baseDir/process_scripts/alignment/starfusion.sh -p ${pair_id} -r ${index_path} -a ${fq1} -b ${fq2} -m trinity -f
"""
publishDir "$params.output", mode: 'copy'
input:
set pair_id, file(fq1), file(fq2) from fusionfq
output:
file("${pair_id}.starfusion.txt") into fusionout
when:
params.fusion == 'detect' && params.pairs == 'pe'
script:
"""
bash $baseDir/process_scripts/alignment/starfusion.sh -p ${pair_id} -r ${index_path} -a ${fq1} -b ${fq2} -m trinity -f
"""
}
process align {
errorStrategy 'ignore'
publishDir "$params.output", mode: 'copy'
input:
set pair_id, file(fq1), file(fq2) from trimread
output:
set pair_id, file("${pair_id}.bam") into aligned
set pair_id, file("${pair_id}.bam") into aligned2
file("${pair_id}.alignerout.txt") into hsatout
script:
"""
bash $baseDir/process_scripts/alignment/rnaseqalign.sh -a $params.align -p ${pair_id} -r ${index_path} -x ${fq1} -y ${fq2}
"""
publishDir "$params.output", mode: 'copy'
input:
set pair_id, file(fq1), file(fq2) from trimread
output:
set pair_id, file("${pair_id}.bam") into aligned
set pair_id, file("${pair_id}.bam") into aligned2
file("${pair_id}.alignerout.txt") into hsatout
script:
"""
bash $baseDir/process_scripts/alignment/rnaseqalign.sh -a $params.align -p ${pair_id} -r ${index_path} -x ${fq1} -y ${fq2}
"""
}
process alignqc {
errorStrategy 'ignore'
publishDir "$params.output", mode: 'copy'
input:
set pair_id, file(bam) from aligned2
output:
file("${pair_id}.flagstat.txt") into alignstats
set file("${pair_id}_fastqc.zip"),file("${pair_id}_fastqc.html") into fastqc
script:
"""
bash $baseDir/process_scripts/alignment/bamqc.sh -p ${pair_id} -b ${bam} -y rna
"""
publishDir "$params.output", mode: 'copy'
input:
set pair_id, file(bam) from aligned2
output:
file("${pair_id}.flagstat.txt") into alignstats
set file("${pair_id}_fastqc.zip"),file("${pair_id}_fastqc.html") into fastqc
script:
"""
bash $baseDir/process_scripts/alignment/bamqc.sh -p ${pair_id} -b ${bam} -y rna
"""
}
// Summarize all flagstat output
process parse_alignstat {
publishDir "$params.output", mode: 'copy'
input:
file(txt) from alignstats.toList()
file(txt) from hsatout.toList()
output:
file('alignment.summary.txt')
"""
perl $baseDir/scripts/parse_flagstat.pl *.flagstat.txt
"""
publishDir "$params.output", mode: 'copy'
input:
file(txt) from alignstats.toList()
file(txt) from hsatout.toList()
output:
file('alignment.summary.txt')
script:
"""
perl $baseDir/scripts/parse_flagstat.pl *.flagstat.txt
"""
}
// Identify duplicate reads with Picard
process markdups {
publishDir "$params.output", mode: 'copy'
input:
set pair_id, file(sbam) from aligned
output:
set pair_id, file("${pair_id}.dedup.bam") into deduped1
set pair_id, file("${pair_id}.dedup.bam") into deduped2
script:
"""
bash $baseDir/process_scripts/alignment/markdups.sh -a $params.markdups -b $sbam -p $pair_id
"""
publishDir "$params.output", mode: 'copy'
input:
set pair_id, file(sbam) from aligned
output:
set pair_id, file("${pair_id}.dedup.bam") into deduped1
set pair_id, file("${pair_id}.dedup.bam") into deduped2
script:
"""
bash $baseDir/process_scripts/alignment/markdups.sh -a $params.markdups -b $sbam -p $pair_id
"""
}
// Read summarization with subread
// Assemble transcripts with stringtie
process geneabund {
errorStrategy 'ignore'
publishDir "$params.output", mode: 'copy'
input:
set pair_id, file(sbam) from deduped1
output:
file("${pair_id}.cts") into counts
file("${pair_id}.cts.summary") into ctsum
file("${pair_id}_stringtie") into strcts
file("${pair_id}.fpkm.txt") into fpkm
"""
bash $baseDir/process_scripts/genect_rnaseq/geneabundance.sh -s $params.stranded -g ${gtf_file} -p ${pair_id} -b ${sbam}
"""
publishDir "$params.output", mode: 'copy'
input:
set pair_id, file(sbam) from deduped1
output:
file("${pair_id}.cts") into counts
file("${pair_id}.cts.summary") into ctsum
file("${pair_id}_stringtie") into strcts
file("${pair_id}.fpkm.txt") into fpkm
script:
"""
bash $baseDir/process_scripts/genect_rnaseq/geneabundance.sh -s $params.stranded -g ${gtf_file} -p ${pair_id} -b ${sbam}
"""
}
process statanal {
errorStrategy 'ignore'
publishDir "$params.output", mode: 'copy'
input:
file count_file from counts.toList()
file count_sum from ctsum.toList()
file newdesign name 'design.txt'
file genenames
file geneset name 'geneset.gmt'
file fpkm_file from fpkm.toList()
file stringtie_dir from strcts.toList()
output:
file "*.txt" into txtfiles
file "*.png" into psfiles
file("*.rda") into rdafiles
file("geneset.shiny.gmt") into gmtfile
when:
script:
"""
bash $baseDir/process_scripts/genect_rnaseq/statanal.sh -d $params.dea
"""
publishDir "$params.output", mode: 'copy'
input:
file count_file from counts.toList()
file count_sum from ctsum.toList()
file newdesign name 'design.txt'
file genenames
file geneset name 'geneset.gmt'
file fpkm_file from fpkm.toList()
file stringtie_dir from strcts.toList()
output:
file "*.txt" into txtfiles
file "*.png" into psfiles
file("*.rda") into rdafiles
file("geneset.shiny.gmt") into gmtfile
script:
"""
bash $baseDir/process_scripts/genect_rnaseq/statanal.sh -d $params.dea
"""
}
process gatkbam {
errorStrategy 'ignore'
publishDir "$params.output", mode: 'copy'
input:
set pair_id, file(rbam) from deduped2
output:
set file("${pair_id}.final.bam"),file("${pair_id}.final.bai") into gatkbam
when:
params.align == 'hisat' && $index_path == '/project/shared/bicf_workflow_ref/GRCh38/'
script:
"""
bash $baseDir/process_scripts/variants/gatkrunner.sh -a gatkbam_rna -b $rbam -r ${index_path}/hisat_index -p $pair_id
"""
publishDir "$params.output", mode: 'copy'
input:
set pair_id, file(rbam) from deduped2
output:
set file("${pair_id}.final.bam"),file("${pair_id}.final.bai") into gatkbam
when:
params.align == 'hisat' && $index_path == '/project/shared/bicf_workflow_ref/GRCh38/'
script:
"""
bash $baseDir/process_scripts/variants/gatkrunner.sh -a gatkbam_rna -b $rbam -r ${index_path}/hisat_index -p $pair_id
"""
}
#!/usr/bin/perl -w
#check_designfile.pl
use strict;
use warnings;
my $pe = shift @ARGV;
my $dfile = shift @ARGV;
open OUT, ">design.valid.txt" or die $!;
......@@ -11,7 +14,6 @@ $head =~ s/FullPathTo//g;
my @colnames = split(/\t/,$head);
my %newcols = map {$_=> 1} @colnames;
unless (grep(/FqR1/,@colnames)) {
die "Missing Sequence Files in Design File: FqR1\n";
}
......@@ -49,16 +51,32 @@ while (my $line = <DFILE>) {
$hash{FinalBam} = $hash{SampleID}.".final.bam";
$hash{Bam} = $hash{SampleID}.".bam";
unless ($hash{SampleGroup}) {
$j = $lnct %% 2;
my $j = $lnct %% 2;
$hash{SampleGroup} = $grp[$j];
}
$lnct ++;
$hash{SampleGroup} =~ s/_//g;
unless ($hash{FqR1} =~ m/_good.fastq.gz/) {
my $name = $hash{FqR1};
$name =~ s/.f.*/_good.fastq.gz/;
unless ($hash{FqR1} eq $name) {
$hash{FqR1} = $name;
next unless (-e ($name));
}
}
$hash{FqR2} = 'na' unless ($hash{FqR2});
unless ($hash{FqR2} eq 'na') {
my $name = $hash{FqR2};
$name =~ s/.f.*/_good.fastq.gz/;
unless ($hash{FqR2} eq $name) {
$hash{FqR2} = $name;
next unless (-e ($name));
}
}
my @line;
foreach $f (@cols) {
foreach my $f (@cols) {
push @line, $hash{$f};
}
print OUT join("\t",@line),"\n";
$hash{FqR2} = 'na' unless ($hash{FqR2});
print join(",",$hash{SampleID},$hash{FqR1},$hash{FqR2}),"\n";
$lnct ++;
}
#!/bin/bash
#check_inputfiles.sh
fqs=`ls *.f*`
for i in $fqs;
do
if [[ ${i} == *.fq ]];
then
new_name=`echo ${i} | sed -e "s/.fq\$/_good.fastq/"`;
mv ${i} ${new_name};
`pigz -f ${new_name}`;
elif [[ ${i} == *.fastq ]];
then
new_name=`echo ${i} | sed -e "s/.fastq\$/_good.fastq/"`;
mv ${i} ${new_name};
`pigz -f ${new_name}`;
elif [[ ${i} == *.fq.gz ]];
then
new_name=`echo ${i} | sed -e "s/.fq.gz\$/_good.fastq.gz/"`;
mv ${i} ${new_name};
else
new_name=`echo ${i} | sed -e "s/.fastq.gz\$/_good.fastq.gz/"`;
mv ${i} ${new_name};
fi;
done
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment