Commit 0bf417d4 authored by David Trudgian

Try to fix freezes under valid and invalid design file conditions

parent 22c3f7cf
#!/usr/bin/env nextflow
// Default parameter values to run tests
params.pairs="pe"
params.markdups="mark"
params.genome="/project/apps_database/hisat2_index/hg38/"
params.gtf="/project/apps_database/iGenomes/Homo_sapiens/NCBI/GRCh38/Annotation/Genes.gencode/genes.gtf"
params.fastqs="$baseDir/../test_data/*.fastq.gz"
params.design="$baseDir/../test_data/design.txt"
design_file = file(params.design)
fastqs=file(params.fastqs)
gtf_file = file(params.gtf)
// params.genome is the directory holding the hisat2 index
// the base name of the index files is always "genome"
index_path = file(params.genome)
index_name = "genome"
// Pair handling, helper function taken from rnatoy
// which is covered by the GNU General Public License v3
// https://github.com/nextflow-io/rnatoy/blob/master/main.nf
read_pe = Channel.empty()
read_se = Channel.empty()
def fileMap = [:]
fastqs.each {
final fileName = it.getFileName().toString()
prefix = fileName.lastIndexOf('/')
fileMap[fileName] = it
}
def prefix = []
new File(params.design).withReader { reader ->
def hline = reader.readLine()
def header = hline.split("\t")
prefixidx = header.findIndexOf{it == 'SampleID'};
oneidx = header.findIndexOf{it == 'FullPathToFqR1'};
twoidx = header.findIndexOf{it == 'FullPathToFqR2'};
if (twoidx == -1) {
twoidx = oneidx
}
while (line = reader.readLine()) {
def row = line.split("\t")
if (fileMap.get(row[oneidx]) != null) {
prefix << tuple(row[prefixidx],fileMap.get(row[oneidx]),fileMap.get(row[twoidx]))
}
}
}
if (params.pairs == 'pe') {
Channel
.from(prefix)
.into { read_pe }
}
if (params.pairs == 'se') {
Channel
.from(prefix)
.into { read_se }
}
//
// Trim raw reads using trimgalore
//
process trimpe {
when:
params.pairs == 'pe'
input:
set pair_id, file(read1), file(read2) from read_pe
output:
set pair_id, file("${read1.baseName.split("\\.", 2)[0]}_val_1.fq.gz"), file("${read2.baseName.split("\\.", 2)[0]}_val_2.fq.gz") into trimpe
script:
"""
module load trimgalore/0.4.1 cutadapt/1.9.1
trim_galore --paired -q 25 --illumina --gzip --length 35 --no_report_file ${read1} ${read2}
"""
}
process trimse {
when:
params.pairs == 'se'
input:
set pair_id, file(read1) from read_se
output:
set pair_id, file("${read1.baseName.split("\\.", 2)[0]}_trimmed.fq.gz") into trimse
script:
"""
module load trimgalore/0.4.1 cutadapt/1.9.1
trim_galore -q 25 --illumina --gzip --length 35 --no_report_file ${read1}
"""
}
//
// Align trimmed reads to the genome index with hisat2
// Sort and index with samtools
// QC aligned reads with fastqc
// Alignment stats with samtools
//
process alignpe {
//publishDir $outDir, mode: 'copy'
cpus 4
input:
set pair_id, file(fq1), file(fq2) from trimpe
output:
set pair_id, file("${pair_id}.bam") into alignpe
file("${pair_id}.flagstat.txt") into alignstats_pe
"""
module load hisat2/2.0.1-beta-intel samtools/intel/1.3 fastqc/0.11.2 picard/1.127
hisat2 -p 4 --no-unal --dta -x ${index_path}/${index_name} -1 ${fq1} -2 ${fq2} -S out.sam 2> ${pair_id}.hisatout.txt
java -Xmx4g -jar \$PICARD/picard.jar CleanSam INPUT=out.sam OUTPUT=out.clean.sam
java -Xmx4g -jar \$PICARD/picard.jar SamFormatConverter INPUT=out.clean.sam OUTPUT=out.bam
samtools sort -o ${pair_id}.bam out.bam
samtools index ${pair_id}.bam
fastqc -f bam ${pair_id}.bam
samtools flagstat ${pair_id}.bam > ${pair_id}.flagstat.txt
"""
}
process alignse {
//publishDir $outDir, mode: 'copy'
cpus 4
input:
set pair_id, file(fq1) from trimse
output:
set pair_id, file("${pair_id}.bam") into alignse
file("${pair_id}.flagstat.txt") into alignstats_se
"""
module load hisat2/2.0.1-beta-intel samtools/intel/1.3 fastqc/0.11.2 picard/1.127
hisat2 -p 4 --no-unal --dta -x ${index_path}/${index_name} -U ${fq1} -S out.sam 2> ${pair_id}.hisatout.txt
java -Xmx4g -jar \$PICARD/picard.jar CleanSam INPUT=out.sam OUTPUT=out.clean.sam
java -Xmx4g -jar \$PICARD/picard.jar SamFormatConverter INPUT=out.clean.sam OUTPUT=out.bam
samtools sort -o ${pair_id}.bam out.bam
samtools index ${pair_id}.bam
fastqc -f bam ${pair_id}.bam
samtools flagstat ${pair_id}.bam > ${pair_id}.flagstat.txt
"""
}
// From here on PE and SE are handled identically, so merge the channels and carry on
Channel
.empty()
.mix(alignse, alignpe)
.tap { aligned2 }
.set { aligned }
Channel
.empty()
.mix(alignstats_se, alignstats_pe)
.set { alignstats }
//
// Summarize all flagstat output
//
process parse_alignstat {
publishDir "$baseDir/output", mode: 'copy'
input:
file(txt) from alignstats.toList()
output:
file('alignment.summary.txt')
"""
perl $baseDir/scripts/parse_flagstat.pl *.flagstat.txt
"""
}
//
// Identify duplicate reads with Picard
//
process markdups {
memory '4GB'
publishDir "$baseDir/output", mode: 'copy'
input:
set pair_id, file(sbam) from aligned
output:
set pair_id, file("${pair_id}.dedup.bam") into deduped1
set pair_id, file("${pair_id}.dedup.bam") into deduped2
script:
if (params.markdups == 'mark')
"""
module load picard/1.127
java -Xmx4g -jar \$PICARD/picard.jar MarkDuplicates INPUT=${sbam} REMOVE_DUPLICATES=true VALIDATION_STRINGENCY=LENIENT ASSUME_SORTED=true METRICS_FILE=${pair_id}.dups OUTPUT=${pair_id}.dedup.bam
"""
else
"""
cp ${sbam} ${pair_id}.dedup.bam
"""
}
//
// Read summarization with subread
//
process featurect {
publishDir "$baseDir/output", mode: 'copy'
cpus 4
input:
set pair_id, file(dbam) from deduped1
file gtf_file
output:
file("${pair_id}.cts") into counts
"""
module load subread/1.5.0-intel
featureCounts -T 4 -p -g gene_name -a ${gtf_file} -o ${pair_id}.cts ${dbam}
"""
}
//
// Assemble transcripts with stringtie
//
process stringtie {
publishDir "$baseDir/output", mode: 'copy'
cpus 4
input:
set pair_id, file(dbam) from deduped2
file gtf_file
output:
file("${pair_id}_stringtie") into strcts
"""
module load stringtie/1.1.2-intel
mkdir ${pair_id}_stringtie
cd ${pair_id}_stringtie
stringtie ../${dbam} -p 4 -G ../${gtf_file} -B -e -o denovo.gtf
"""
}
// Create the tab-delimited gene names file required by the statistical analysis
process genenames {
input:
file gtf_file
output:
file "genenames.txt" into genenames
"""
perl $baseDir/scripts/gencode_genename.pl ${gtf_file}
"""
}
process statanal {
publishDir "$baseDir/output", mode: 'copy'
input:
file count_file from counts.toList()
file design_file name 'design.txt'
file genenames_file name 'genenames.txt' from genenames
"""
module load R/3.2.1-intel
perl $baseDir/scripts/concat_cts.pl -g genenames.txt -o ./ *.cts
Rscript $baseDir/scripts/DEA_htseq.R
"""
}
process buildrda {
publishDir "$baseDir/output", mode: 'copy'
input:
file stringtie_dir from strcts.toList()
file design_file name 'design.txt'
"""
module load R/3.2.1-intel
Rscript $baseDir/scripts/build_ballgown.R *_stringtie
"""
}
#!/usr/bin/env nextflow
// Default parameter values to run tests
params.pairs="pe"
params.markdups="mark"
params.genome="/project/apps_database/hisat2_index/hg38/"
params.gtf="/project/apps_database/iGenomes/Homo_sapiens/NCBI/GRCh38/Annotation/Genes.gencode/genes.gtf"
params.fastqs="$baseDir/../test_data/*.fastq.gz"
params.design="$baseDir/../test_data/design.txt"
design_file = file(params.design)
fastqs=file(params.fastqs)
gtf_file = file(params.gtf)
// params.genome is the directory holding the hisat2 index
// the base name of the index files is always "genome"
index_path = file(params.genome)
index_name = "genome"
// Pair handling, helper function taken from rnatoy
// which is covered by the GNU General Public License v3
// https://github.com/nextflow-io/rnatoy/blob/master/main.nf
def fileMap = [:]
fastqs.each {
def fileName = it.fileName.toString()
int p = fileName.lastIndexOf('/')
if( p != -1 ) fileName = fileName.substring(p+1)
fileMap[fileName] = it
}
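// fileMap now maps each bare fastq file name (any leading path stripped) to its
// Path object, so rows of the design file can be looked up by file name below.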
def prefix = []
new File(params.design).withReader { reader ->
def hline = reader.readLine()
def header = hline.split("\t")
prefixidx = header.findIndexOf{it == 'SampleID'};
oneidx = header.findIndexOf{it == 'FullPathToFqR1'};
twoidx = header.findIndexOf{it == 'FullPathToFqR2'};
if (twoidx == -1) {
twoidx = oneidx
}
while (line = reader.readLine()) {
def row = line.split("\t")
if (fileMap.get(row[oneidx]) != null) {
prefix << tuple(row[prefixidx],fileMap.get(row[oneidx]),fileMap.get(row[twoidx]))
}
}
}
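// The design file is expected to be tab-delimited with a header row naming at
// least 'SampleID' and 'FullPathToFqR1' (plus 'FullPathToFqR2' for paired-end
// data); rows are matched against the bare fastq file names collected in
// fileMap above. A minimal, purely hypothetical example:
//
//   SampleID    FullPathToFqR1         FullPathToFqR2
//   sampleA     sampleA_R1.fastq.gz    sampleA_R2.fastq.gz
//   sampleB     sampleB_R1.fastq.gz    sampleB_R2.fastq.gz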
if( ! prefix) { error "Didn't match any input files with entries in the design file" }
if (params.pairs == 'pe') {
Channel
.from(prefix)
.set { read_pe }
Channel
.empty()
.set { read_se }
}
if (params.pairs == 'se') {
Channel
.from(prefix)
.set { read_se }
Channel
.empty()
.set { read_pe }
}
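// Both read_pe and read_se are always defined: whichever one is not used for
// this run is created as an explicitly empty channel, presumably so the
// corresponding trim process sees a closed channel and finishes immediately
// instead of blocking forever (the freeze this commit tries to address).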
//
// Trim raw reads using trimgalore
//
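// Trim Galore writes paired output as <basename>_val_1.fq.gz / <basename>_val_2.fq.gz
// and single-end output as <basename>_trimmed.fq.gz; the output declarations
// below rebuild those names from the input file basenames.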
process trimpe {
input:
set pair_id, file(read1), file(read2) from read_pe
output:
set pair_id, file("${read1.baseName.split("\\.", 2)[0]}_val_1.fq.gz"), file("${read2.baseName.split("\\.", 2)[0]}_val_2.fq.gz") into trimpe
script:
"""
module load trimgalore/0.4.1 cutadapt/1.9.1
trim_galore --paired -q 25 --illumina --gzip --length 35 --no_report_file ${read1} ${read2}
"""
}
process trimse {
input:
set pair_id, file(read1) from read_se
output:
set pair_id, file("${read1.baseName.split("\\.", 2)[0]}_trimmed.fq.gz") into trimse
script:
"""
module load trimgalore/0.4.1 cutadapt/1.9.1
trim_galore -q 25 --illumina --gzip --length 35 --no_report_file ${read1}
"""
}
//
// Align trimmed reads to the genome index with hisat2
// Sort and index with samtools
// QC aligned reads with fastqc
// Alignment stats with samtools
//
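// hisat2 is given ${index_path}/${index_name} as the index basename, i.e. the
// genome.*.ht2 files are expected inside params.genome. Picard CleanSam
// soft-clips alignments running past the reference end and zeroes MAPQ on
// unmapped reads; SamFormatConverter then converts the cleaned SAM to BAM
// before samtools sorts and indexes it.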
process alignpe {
//publishDir $outDir, mode: 'copy'
cpus 4
input:
set pair_id, file(fq1), file(fq2) from trimpe
output:
set pair_id, file("${pair_id}.bam") into alignpe
file("${pair_id}.flagstat.txt") into alignstats_pe
"""
module load hisat2/2.0.1-beta-intel samtools/intel/1.3 fastqc/0.11.2 picard/1.127
hisat2 -p 4 --no-unal --dta -x ${index_path}/${index_name} -1 ${fq1} -2 ${fq2} -S out.sam 2> ${pair_id}.hisatout.txt
java -Xmx4g -jar \$PICARD/picard.jar CleanSam INPUT=out.sam OUTPUT=out.clean.sam
java -Xmx4g -jar \$PICARD/picard.jar SamFormatConverter INPUT=out.clean.sam OUTPUT=out.bam
samtools sort -o ${pair_id}.bam out.bam
samtools index ${pair_id}.bam
fastqc -f bam ${pair_id}.bam
samtools flagstat ${pair_id}.bam > ${pair_id}.flagstat.txt
"""
}
process alignse {
//publishDir $outDir, mode: 'copy'
cpus 4
input:
set pair_id, file(fq1) from trimse
output:
set pair_id, file("${pair_id}.bam") into alignse
file("${pair_id}.flagstat.txt") into alignstats_se
"""
module load hisat2/2.0.1-beta-intel samtools/intel/1.3 fastqc/0.11.2 picard/1.127
hisat2 -p 4 --no-unal --dta -x ${index_path}/${index_name} -U ${fq1} -S out.sam 2> ${pair_id}.hisatout.txt
java -Xmx4g -jar \$PICARD/picard.jar CleanSam INPUT=out.sam OUTPUT=out.clean.sam
java -Xmx4g -jar \$PICARD/picard.jar SamFormatConverter INPUT=out.clean.sam OUTPUT=out.bam
samtools sort -o ${pair_id}.bam out.bam
samtools index ${pair_id}.bam
fastqc -f bam ${pair_id}.bam
samtools flagstat ${pair_id}.bam > ${pair_id}.flagstat.txt
"""
}
// From here on PE and SE are handled identically, so merge the channels and carry on
Channel
.empty()
.mix(alignse, alignpe)
.tap { aligned2 }
.set { aligned }
Channel
.empty()
.mix(alignstats_se, alignstats_pe)
.set { alignstats }
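// .tap{} forks a copy of the merged alignment channel into aligned2 (not
// consumed elsewhere in this script), while .set{} names the main copy
// 'aligned' for the markdups step below.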
//
// Summarize all flagstat output
//
process parse_alignstat {
publishDir "$baseDir/output", mode: 'copy'
input:
file(txt) from alignstats.toList()
output:
file('alignment.summary.txt')
"""
perl $baseDir/scripts/parse_flagstat.pl *.flagstat.txt
"""
}
//
// Identify duplicate reads with Picard
//
process markdups {
memory '4GB'
publishDir "$baseDir/output", mode: 'copy'
input:
set pair_id, file(sbam) from aligned
output:
set pair_id, file("${pair_id}.dedup.bam") into deduped1
set pair_id, file("${pair_id}.dedup.bam") into deduped2
script:
if (params.markdups == 'mark')
"""
module load picard/1.127
java -Xmx4g -jar \$PICARD/picard.jar MarkDuplicates INPUT=${sbam} REMOVE_DUPLICATES=true VALIDATION_STRINGENCY=LENIENT ASSUME_SORTED=true METRICS_FILE=${pair_id}.dups OUTPUT=${pair_id}.dedup.bam
"""
else
"""
cp ${sbam} ${pair_id}.dedup.bam
"""
}
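// Note: MarkDuplicates is run with REMOVE_DUPLICATES=true, so when
// params.markdups == 'mark' duplicate reads are removed from the BAM rather
// than merely flagged, which is what the 'dedup' output name reflects.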
//
// Read summarization with subread
//
process featurect {
publishDir "$baseDir/output", mode: 'copy'
cpus 4
input:
set pair_id, file(dbam) from deduped1
file gtf_file
output:
file("${pair_id}.cts") into counts
"""
module load subread/1.5.0-intel
featureCounts -T 4 -p -g gene_name -a ${gtf_file} -o ${pair_id}.cts ${dbam}
"""
}
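// featureCounts -p counts fragments (read pairs) rather than individual reads
// for paired-end data, and -g gene_name sums counts per gene_name attribute of
// the supplied GTF.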
//
// Assemble transcripts with stringtie
//
process stringtie {
publishDir "$baseDir/output", mode: 'copy'
cpus 4
input:
set pair_id, file(dbam) from deduped2
file gtf_file
output:
file("${pair_id}_stringtie") into strcts
"""
module load stringtie/1.1.2-intel
mkdir ${pair_id}_stringtie
cd ${pair_id}_stringtie
stringtie ../${dbam} -p 4 -G ../${gtf_file} -B -e -o denovo.gtf
"""
}
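// stringtie -e restricts abundance estimation to the transcripts in the
// reference GTF, and -B writes the Ballgown table files (*.ctab) alongside the
// output GTF in the per-sample directory consumed by build_ballgown.R below.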
// Create the tab-delimited gene names file required by the statistical analysis
process genenames {
input:
file gtf_file
output:
file "genenames.txt" into genenames
"""
perl $baseDir/scripts/gencode_genename.pl ${gtf_file}
"""
}
process statanal {
publishDir "$baseDir/output", mode: 'copy'
input:
file count_file from counts.toList()
file design_file name 'design.txt'
file genenames_file name 'genenames.txt' from genenames
"""
module load R/3.2.1-intel
perl $baseDir/scripts/concat_cts.pl -g genenames.txt -o ./ *.cts
Rscript $baseDir/scripts/DEA_htseq.R
"""
}
process buildrda {
publishDir "$baseDir/output", mode: 'copy'
input:
file stringtie_dir from strcts.toList()
file design_file name 'design.txt'
"""
module load R/3.2.1-intel
Rscript $baseDir/scripts/build_ballgown.R *_stringtie
"""
}