diff --git a/workflow/main.nf b/workflow/main.nf index 7532c0b28471baea03efb788ace7cb11c99a2265..0c1825b2515616f9be665d9058c7a17ae2bbd66a 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -1,22 +1,16 @@ #!/usr/bin/env nextflow // Default parameter values to run tests -params.bams="$baseDir/../test_data/*.fastq.gz" -params.design="$baseDir/../test_data/design.pe.txt" -params.genome="/project/shared/bicf_workflow_ref/GRCh38/" +params.bams="$baseDir/../test/*.bam" +params.design="$baseDir/../test/design.csv" +params.genome="/project/shared/bicf_workflow_ref/GRCh37/" design_file = file(params.design) -fastqs=file(params.fastqs) -design_file = file(params.design) +bams=file(params.bams) gtf_file = file("$params.genome/gencode.gtf") genenames = file("$params.genome/genenames.txt") geneset = file("$params.genome/gsea_gmt/$params.geneset") -// params genome is the directory -// base name for the index is always genome -index_path = file(params.genome) -index_name = "genome" -star_index = 'star_index/' // Pair handling, helper function taken from rnatoy // which is covered by the GNU General Public License v3 @@ -66,221 +60,34 @@ Channel .set { read_pe } } -// -// Trim raw reads using trimgalore -// -process trimpe { - input: - set pair_id, file(read1), file(read2) from read_pe - output: - set pair_id, file("${read1.baseName.split("\\.", 2)[0]}_val_1.fq.gz"), file("${read2.baseName.split("\\.", 2)[0]}_val_2.fq.gz") into trimpe - script: - """ - module load trimgalore/0.4.1 cutadapt/1.9.1 - trim_galore --paired -q 25 --illumina --gzip --length 35 ${read1} ${read2} - """ -} -process trimse { - input: - set pair_id, file(read1) from read_se - output: - set pair_id, file("${read1.baseName.split("\\.", 2)[0]}_trimmed.fq.gz") into trimse - script: - """ - module load trimgalore/0.4.1 cutadapt/1.9.1 - trim_galore -q 25 --illumina --gzip --length 35 ${read1} - """ -} - -// -// Align trimmed reads to genome indes with hisat2 -// Sort and index with samtools -// QC aligned reads with fastqc -// Alignment stats with samtools -// -process alignpe { - - publishDir "$baseDir/output", mode: 'copy' - cpus 32 - - input: - set pair_id, file(fq1), file(fq2) from trimpe - output: - set pair_id, file("${pair_id}.bam") into alignpe - file("${pair_id}.flagstat.txt") into alignstats_pe - file("${pair_id}.hisatout.txt") into hsatoutpe - set file("${pair_id}_fastqc.zip"),file("${pair_id}_fastqc.html") into fastqcpe - script: - if (params.align == 'hisat') - """ - module load hisat2/2.0.1-beta-intel samtools/intel/1.3 fastqc/0.11.2 picard/1.127 speedseq/20160506 - hisat2 -p 30 --no-unal --dta -x ${index_path}/${index_name} -1 ${fq1} -2 ${fq2} -S out.sam 2> ${pair_id}.hisatout.txt - sambamba view -t 30 -f bam -S -o output.bam out.sam - sambamba sort -t 30 -o ${pair_id}.bam output.bam - sambamba flagstat -t 30 ${pair_id}.bam > ${pair_id}.flagstat.txt - fastqc -f bam ${pair_id}.bam - """ - else - """ - module load star/2.4.2a samtools/intel/1.3 fastqc/0.11.2 picard/1.127 speedseq/20160506 - STAR --genomeDir ${index_path}/${star_index} --readFilesIn ${fq1} ${fq2} --readFilesCommand zcat --genomeLoad NoSharedMemory --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMheaderCommentFile COfile.txt --outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMunmapped Within --outFilterType BySJout --outSAMattributes NH HI AS NM MD --outSAMstrandField intronMotif --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM --sjdbScore 1 --limitBAMsortRAM 60000000000 --outFileNamePrefix out - mv outLog.final.out ${pair_id}.hisatout.txt - sambamba sort -t 30 -o ${pair_id}.bam outAligned.sortedByCoord.out.bam - sambamba flagstat -t 30 ${pair_id}.bam > ${pair_id}.flagstat.txt - fastqc -f bam ${pair_id}.bam - """ -} -process alignse { - - publishDir "$baseDir/output", mode: 'copy' - cpus 32 - - input: - set pair_id, file(fq1) from trimse - output: - set pair_id, file("${pair_id}.bam") into alignse - file("${pair_id}.flagstat.txt") into alignstats_se - file("${pair_id}.hisatout.txt") into hsatoutse - set file("${pair_id}_fastqc.zip"),file("${pair_id}_fastqc.html") into fastqcse - script: - if (params.align == 'hisat') - """ - module load hisat2/2.0.1-beta-intel samtools/intel/1.3 fastqc/0.11.2 speedseq/20160506 picard/1.127 - hisat2 -p 30 --no-unal --dta -x ${index_path}/${index_name} -U ${fq1} -S out.sam 2> ${pair_id}.hisatout.txt - sambamba view -t 30 -f bam -S -o output.bam out.sam - sambamba sort -t 30 -o ${pair_id}.bam output.bam - sambamba flagstat -t 30 ${pair_id}.bam > ${pair_id}.flagstat.txt - fastqc -f bam ${pair_id}.bam - """ - else - """ - module load star/2.4.2a samtools/intel/1.3 fastqc/0.11.2 picard/1.127 speedseq/20160506 - STAR --genomeDir ${index_path}/${star_index} --readFilesIn ${fq1} --readFilesCommand zcat --genomeLoad NoSharedMemory --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMheaderCommentFile COfile.txt --outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMunmapped Within --outFilterType BySJout --outSAMattributes NH HI AS NM MD --outSAMstrandField intronMotif --outSAMtype SAM --quantMode TranscriptomeSAM --sjdbScore 1 --limitBAMsortRAM 60000000000 --outFileNamePrefix out - mv outLog.final.out ${pair_id}.hisatout.txt - sambamba view -t 30 -f bam -S -o output.bam outAligned.out.sam - sambamba sort -t 30 -o ${pair_id}.bam output.bam - sambamba flagstat -t 30 ${pair_id}.bam > ${pair_id}.flagstat.txt - fastqc -f bam ${pair_id}.bam - """ -} - -// From here on we are the same for PE and SE, so merge channels and carry on - -Channel - .empty() - .mix(alignse, alignpe) - .tap { aligned2 } - .set { aligned } - -Channel - .empty() - .mix(alignstats_se, alignstats_pe) - .set { alignstats } - -Channel - .empty() - .mix(hsatoutse, hsatoutpe) - .set { hsatout } - -// -// Summarize all flagstat output -// -process parse_alignstat { - - publishDir "$baseDir/output", mode: 'copy' - input: - file(txt) from alignstats.toList() - file(txt) from hsatout.toList() - - output: - file('alignment.summary.txt') - - """ - perl $baseDir/scripts/parse_flagstat.pl *.flagstat.txt - """ -} - -// -// Identify duplicate reads with Picard -// -process markdups { - - memory '4GB' - publishDir "$baseDir/output", mode: 'copy' - - input: - set pair_id, file(sbam) from aligned - output: - set pair_id, file("${pair_id}.dedup.bam") into deduped1 - set pair_id, file("${pair_id}.dedup.bam") into deduped2 - script: - if (params.markdups == 'mark') - """ - module load picard/1.127 speedseq/20160506 - sambamba markdup -t 20 -r ${sbam} ${pair_id}.dedup.bam - """ - else - """ - cp ${sbam} ${pair_id}.dedup.bam - """ -} - -// -// Read summarization with subread -// -process featurect { - - publishDir "$baseDir/output", mode: 'copy' - cpus 32 - - input: - set pair_id, file(dbam) from deduped1 - file gtf_file - output: - file("${pair_id}.cts") into counts - """ - module load module subread/1.5.0-intel - featureCounts -s params.stranded -T 30 -p -g gene_name -a ${gtf_file} -o ${pair_id}.cts ${dbam} - """ -} - -// -// Assemble transcripts with stringtie -// - -process stringtie { - - publishDir "$baseDir/output", mode: 'copy' - cpus 32 - - input: - set pair_id, file(dbam) from deduped2 - file gtf_file - output: - file("${pair_id}_stringtie") into strcts - """ - module load stringtie/1.1.2-intel - mkdir ${pair_id}_stringtie - cd ${pair_id}_stringtie - stringtie ../${dbam} -p 30 -G ../${gtf_file} -B -e -o denovo.gtf -A ../geneabund.stringtie.tab - """ +process peakanno { + publishDir "$baseDir/output", mode: 'copy' + input: + file peak_file from greencenter + file design_file from input + file annotation Tdx + output: + set peak_id, file("${pattern}_annotation.xls"), file("${pattern}_peakTssDistribution.png") into peakanno + """ + module load R/3.2.1-intel + Rscript $baseDir/scripts/runchipseeker.R + """ } -process statanal { +//Need to do it with more than 1 condition +process diffbind { publishDir "$baseDir/output", mode: 'copy' input: - file count_file from counts.toList() + file peak_file from input file design_file name 'design.txt' - file genenames - file geneset name 'geneset.gmt' + file bam_file from input output: file "*.txt" into txtfiles file "*.png" into psfiles file "*" """ module load R/3.2.1-intel - perl $baseDir/scripts/concat_cts.pl -o ./ *.cts cp design.txt design.shiny.txt cp geneset.gmt geneset.shiny.gmt Rscript $baseDir/scripts/dea.R