Skip to content
Snippets Groups Projects
Commit e7ad35bc authored by Jeremy Mathews's avatar Jeremy Mathews
Browse files

File Name Check

parent 3caba697
Branches
Tags
1 merge request!7Resolve "Check file inputs"
Pipeline #4137 failed with stage
in 4 seconds
......@@ -4,7 +4,7 @@
params.input = "$baseDir"
params.output= "$baseDir/output"
params.design="$params.input/design.txt"
params.genome="/project/shared/bicf_workflow_ref/human/grch38/"
params.genome="/project/shared/bicf_workflow_ref/human/GRCh38/"
params.markdups="picard"
params.stranded="0"
params.pairs="pe"
......@@ -14,313 +14,231 @@ params.fusion = 'skip'
params.dea = 'detect'
design_file = file(params.design)
fqs = Channel.fromPath("$params.input/*")
gtf_file = file("$params.genome/gencode.gtf")
genenames = file("$params.genome/genenames.txt")
geneset = file("$params.genome/../gsea_gmt/$params.geneset")
dbsnp="$params.genome/dbsnp.vcf.gz"
indel="$params.genome/goldindels.vcf.gz"
dbsnp="$params.genome/dbSnp.vcf.gz"
indel="$params.genome/GoldIndels.vcf.gz"
knownindel=file(indel)
dbsnp=file(dbsnp)
files = Channel
.fromPath("$params.input/*")
good = Channel.fromPath("$params.input/*.fastq.gz")
process checkinputfiles {
module 'parallel/20150122:pigz/2.4'
queue 'super'
input:
file ("*") from files.collect()
file design_file name "design.tsv"
output:
set file ("design.tsv"), file ("*.fastq.gz") into design
file ("*.fastq.gz") into fastqs
script:
"""
for fqs in `ls | grep -v "^design.tsv\$"`;
do if [[ \$fqs == *.fq ]];
then new_name=`echo \${fqs} | sed -e "s/.fq\$/.fastq/"`;
mv \${fqs} \${new_name};
echo "pigz -f \${new_name}";
elif [[ \$fqs == *.fastq ]];
then echo "pigz -f \$fqs";
elif [[ \$fqs == *.fq.gz ]];
then new_name=`echo \${fqs} | sed -e "s/.fq.gz\$/.fastq.gz/"`;
mv \${fqs} \${new_name};
fi;
done | shuf | parallel -j\${SLURM_CPUS_ON_NODE};
"""
}
// params genome is the directory
// base name for the index is always genome
index_path = file(params.genome)
process checkdesignfile {
executor 'local'
publishDir "$params.output", mode: 'copy'
input:
set file ("design.ori.txt"), file ("*") from design
output:
file("design.valid.txt") into newdesign
stdout spltnames
script:
"""
perl -p -e 's/\\r\\n*/\\n/g' design.ori.txt > design.fix.txt
perl $baseDir/scripts/check_designfile.pl ${params.pairs} design.fix.txt
"""
queue 'super'
module 'parallel/20150122:pigz/2.4'
publishDir "$params.output", mode: 'copy'
input:
file design_file name 'design.ori.txt'
file ("*") from fqs.collect()
output:
file("design.valid.txt") into newdesign
file("*.fastq.gz") into fastqs mode flatten
stdout spltnames
script:
"""
bash $baseDir/scripts/check_inputfiles.sh
perl -p -e 's/\\r\\n*/\\n/g' design.ori.txt > design.fix.txt
perl $baseDir/scripts/check_designfile.pl ${params.pairs} design.fix.txt
"""
}
def fileMap = [:]
fastqs
.mix(good)
.flatten()
.each {
final fileName = it.getFileName().toString()
prefix = fileName.lastIndexOf('/')
fileMap[fileName] = it
}
.subscribe {
def fileName = it.getFileName()
fileMap."$fileName" = it
}
if (params.pairs == 'pe') {
spltnames
.splitCsv()
.filter { fileMap.get(it[1]) != null & fileMap.get(it[2]) != null }
.map { it -> tuple(it[0], fileMap.get(it[1]), fileMap.get(it[2])) }
.set { read }
}
else {
spltnames
.splitCsv()
.filter { fileMap.get(it[1]) != null }
.map { it -> tuple(it[0], fileMap.get(it[1]),'') }
.set { read }
.splitCsv()
.filter { fileMap.get(it[1]) != null & fileMap.get(it[2]) != null }
.map { it -> tuple(it[0], fileMap.get(it[1]), fileMap.get(it[2])) }
.set { read }
} else {
spltnames
.splitCsv()
.filter { fileMap.get(it[1]) != null }
.map { it -> tuple(it[0], fileMap.get(it[1]),'') }
.set { read }
}
if( ! read ) { error "Didn't match any input files with entries in the design file" }
//
// Trim raw reads using trimgalore
//
process trim {
errorStrategy 'ignore'
input:
set pair_id, file(read1), file(read2) from read
input:
set pair_id, file(read1), file(read2) from read
output:
output:
set pair_id, file("${pair_id}.trim.R1.fastq.gz"),file("${pair_id}.trim.R2.fastq.gz") into trimread
set pair_id, file("${pair_id}.trim.R1.fastq.gz"),file("${pair_id}.trim.R2.fastq.gz") into fusionfq
set pair_id, file("${pair_id}.trim.R1.fastq.gz"),file("${pair_id}.trim.R2.fastq.gz") into trimread
set pair_id, file("${pair_id}.trim.R1.fastq.gz"),file("${pair_id}.trim.R2.fastq.gz") into fusionfq
script:
"""
bash $baseDir/process_scripts/preproc_fastq/trimgalore.sh -p ${pair_id} -a ${read1} -b ${read2}
"""
script:
"""
bash $baseDir/process_scripts/preproc_fastq/trimgalore.sh -p ${pair_id} -a ${read1} -b ${read2}
"""
}
//
// Align trimmed reads to genome indes with hisat2
// Sort and index with samtools
// QC aligned reads with fastqc
// Alignment stats with samtools
//
process starfusion {
errorStrategy 'ignore'
publishDir "$params.output", mode: 'copy'
input:
set pair_id, file(fq1), file(fq2) from fusionfq
output:
file("${pair_id}.starfusion.txt") into fusionout
publishDir "$params.output", mode: 'copy'
when:
input:
set pair_id, file(fq1), file(fq2) from fusionfq
params.fusion == 'detect' && params.pairs == 'pe'
output:
file("${pair_id}.starfusion.txt") into fusionout
script:
when:
params.fusion == 'detect' && params.pairs == 'pe'
"""
bash $baseDir/process_scripts/alignment/starfusion.sh -p ${pair_id} -r ${index_path} -a ${fq1} -b ${fq2} -m trinity -f
"""
script:
"""
bash $baseDir/process_scripts/alignment/starfusion.sh -p ${pair_id} -r ${index_path} -a ${fq1} -b ${fq2} -m trinity -f
"""
}
process align {
errorStrategy 'ignore'
publishDir "$params.output", mode: 'copy'
publishDir "$params.output", mode: 'copy'
input:
input:
set pair_id, file(fq1), file(fq2) from trimread
set pair_id, file(fq1), file(fq2) from trimread
output:
set pair_id, file("${pair_id}.bam") into aligned
set pair_id, file("${pair_id}.bam") into aligned2
file("${pair_id}.alignerout.txt") into hsatout
output:
set pair_id, file("${pair_id}.bam") into aligned
set pair_id, file("${pair_id}.bam") into aligned2
file("${pair_id}.alignerout.txt") into hsatout
script:
"""
bash $baseDir/process_scripts/alignment/rnaseqalign.sh -a $params.align -p ${pair_id} -r ${index_path} -x ${fq1} -y ${fq2}
"""
script:
"""
bash $baseDir/process_scripts/alignment/rnaseqalign.sh -a $params.align -p ${pair_id} -r ${index_path} -x ${fq1} -y ${fq2}
"""
}
process alignqc {
errorStrategy 'ignore'
publishDir "$params.output", mode: 'copy'
input:
publishDir "$params.output", mode: 'copy'
set pair_id, file(bam) from aligned2
input:
set pair_id, file(bam) from aligned2
output:
output:
file("${pair_id}.flagstat.txt") into alignstats
set file("${pair_id}_fastqc.zip"),file("${pair_id}_fastqc.html") into fastqc
file("${pair_id}.flagstat.txt") into alignstats
set file("${pair_id}_fastqc.zip"),file("${pair_id}_fastqc.html") into fastqc
script:
"""
bash $baseDir/process_scripts/alignment/bamqc.sh -p ${pair_id} -b ${bam} -y rna
"""
script:
"""
bash $baseDir/process_scripts/alignment/bamqc.sh -p ${pair_id} -b ${bam} -y rna
"""
}
// Summarize all flagstat output
process parse_alignstat {
publishDir "$params.output", mode: 'copy'
input:
publishDir "$params.output", mode: 'copy'
file(txt) from alignstats.toList()
file(txt) from hsatout.toList()
input:
file(txt) from alignstats.toList()
file(txt) from hsatout.toList()
output:
output:
file('alignment.summary.txt')
file('alignment.summary.txt')
script:
"""
perl $baseDir/scripts/parse_flagstat.pl *.flagstat.txt
"""
script:
"""
perl $baseDir/scripts/parse_flagstat.pl *.flagstat.txt
"""
}
// Identify duplicate reads with Picard
process markdups {
publishDir "$params.output", mode: 'copy'
input:
publishDir "$params.output", mode: 'copy'
set pair_id, file(sbam) from aligned
input:
set pair_id, file(sbam) from aligned
output:
output:
set pair_id, file("${pair_id}.dedup.bam") into deduped1
set pair_id, file("${pair_id}.dedup.bam") into deduped2
set pair_id, file("${pair_id}.dedup.bam") into deduped1
set pair_id, file("${pair_id}.dedup.bam") into deduped2
script:
"""
bash $baseDir/process_scripts/alignment/markdups.sh -a $params.markdups -b $sbam -p $pair_id
"""
script:
"""
bash $baseDir/process_scripts/alignment/markdups.sh -a $params.markdups -b $sbam -p $pair_id
"""
}
// Read summarization with subread
// Assemble transcripts with stringtie
process geneabund {
errorStrategy 'ignore'
publishDir "$params.output", mode: 'copy'
input:
publishDir "$params.output", mode: 'copy'
set pair_id, file(sbam) from deduped1
input:
set pair_id, file(sbam) from deduped1
output:
output:
file("${pair_id}.cts") into counts
file("${pair_id}.cts.summary") into ctsum
file("${pair_id}_stringtie") into strcts
file("${pair_id}.fpkm.txt") into fpkm
file("${pair_id}.cts") into counts
file("${pair_id}.cts.summary") into ctsum
file("${pair_id}_stringtie") into strcts
file("${pair_id}.fpkm.txt") into fpkm
script:
"""
bash $baseDir/process_scripts/genect_rnaseq/geneabundance.sh -s $params.stranded -g ${gtf_file} -p ${pair_id} -b ${sbam}
"""
script:
"""
bash $baseDir/process_scripts/genect_rnaseq/geneabundance.sh -s $params.stranded -g ${gtf_file} -p ${pair_id} -b ${sbam}
"""
}
process statanal {
errorStrategy 'ignore'
publishDir "$params.output", mode: 'copy'
input:
file count_file from counts.toList()
file count_sum from ctsum.toList()
file newdesign name 'design.txt'
file genenames
file geneset name 'geneset.gmt'
file fpkm_file from fpkm.toList()
file stringtie_dir from strcts.toList()
output:
file "*.txt" into txtfiles
file "*.png" into psfiles
file("*.rda") into rdafiles
file("geneset.shiny.gmt") into gmtfile
when:
script:
"""
bash $baseDir/process_scripts/genect_rnaseq/statanal.sh -d $params.dea
"""
publishDir "$params.output", mode: 'copy'
input:
file count_file from counts.toList()
file count_sum from ctsum.toList()
file newdesign name 'design.txt'
file genenames
file geneset name 'geneset.gmt'
file fpkm_file from fpkm.toList()
file stringtie_dir from strcts.toList()
output:
file "*.txt" into txtfiles
file "*.png" into psfiles
file("*.rda") into rdafiles
file("geneset.shiny.gmt") into gmtfile
script:
"""
bash $baseDir/process_scripts/genect_rnaseq/statanal.sh -d $params.dea
"""
}
process gatkbam {
errorStrategy 'ignore'
publishDir "$params.output", mode: 'copy'
input:
set pair_id, file(rbam) from deduped2
output:
set file("${pair_id}.final.bam"),file("${pair_id}.final.bai") into gatkbam
publishDir "$params.output", mode: 'copy'
when:
input:
set pair_id, file(rbam) from deduped2
params.align == 'hisat' && $index_path == '/project/shared/bicf_workflow_ref/GRCh38/'
output:
set file("${pair_id}.final.bam"),file("${pair_id}.final.bai") into gatkbam
script:
when:
params.align == 'hisat' && $index_path == '/project/shared/bicf_workflow_ref/GRCh38/'
"""
bash $baseDir/process_scripts/variants/gatkrunner.sh -a gatkbam_rna -b $rbam -r ${index_path}/hisat_index -p $pair_id
"""
script:
"""
bash $baseDir/process_scripts/variants/gatkrunner.sh -a gatkbam_rna -b $rbam -r ${index_path}/hisat_index -p $pair_id
"""
}
......@@ -55,9 +55,9 @@ while (my $line = <DFILE>) {
$hash{SampleGroup} = $grp[$j];
}
$hash{SampleGroup} =~ s/_//g;
unless ($hash{FqR1} =~ m/.fastq.gz/) {
unless ($hash{FqR1} =~ m/_good.fastq.gz/) {
my $name = $hash{FqR1};
$name =~ s/.f.*/.fastq.gz/;
$name =~ s/.f.*/_good.fastq.gz/;
unless ($hash{FqR1} eq $name) {
$hash{FqR1} = $name;
unless (-e ($name)) {
......@@ -68,7 +68,7 @@ while (my $line = <DFILE>) {
$hash{FqR2} = 'na' unless ($hash{FqR2});
unless ($hash{FqR2} eq 'na') {
my $name = $hash{FqR2};
$name =~ s/.f.*/.fastq.gz/;
$name =~ s/.f.*/_good.fastq.gz/;
unless ($hash{FqR2} eq $name) {
$hash{FqR2} = $name;
unless (-e ($name)) {
......
#!/bin/bash
#check_inputfiles.sh
fqs=`ls *.f*`
for i in $fqs;
do
if [[ ${i} == *.fq ]];
then
new_name=`echo ${i} | sed -e "s/.fq\$/_good.fastq/"`;
mv ${i} ${new_name};
`pigz -f ${new_name}`;
elif [[ ${i} == *.fastq ]];
then
new_name=`echo ${i} | sed -e "s/.fastq\$/_good.fastq/"`;
mv ${i} ${new_name};
`pigz -f ${new_name}`;
elif [[ ${i} == *.fq.gz ]];
then
new_name=`echo ${i} | sed -e "s/.fq.gz\$/_good.fastq.gz/"`;
mv ${i} ${new_name};
else
new_name=`echo ${i} | sed -e "s/.fastq.gz\$/_good.fastq.gz/"`;
mv ${i} ${new_name};
fi;
done
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment