diff --git a/astrocyte_package.yml b/astrocyte_package.yml index abd6e7a9fc1925393a8c607feb37d73478cd477c..697e657ea785527d6ec90ad787266ed4ca242c84 100644 --- a/astrocyte_package.yml +++ b/astrocyte_package.yml @@ -39,14 +39,7 @@ documentation_files: # A list of clueter environment modules that this workflow requires to run. # Specify versioned module names to ensure reproducability. workflow_modules: - - 'trimgalore/0.4.1' - - 'cutadapt/1.9.1' - - 'hisat2/2.0.1-beta-intel' - - 'samtools/intel/1.3' - - 'picard/1.127' - - 'subread/1.5.0-intel' - - 'stringtie/1.1.2-intel' - - 'speedseq/20160506' + - 'deeptools/2.3.5' # A list of parameters used by the workflow, defining how to present them, # options etc in the web interface. For each parameter: # @@ -81,50 +74,14 @@ workflow_modules: workflow_parameters: - - id: fastqs + - id: bams type: files required: true description: | - One or more input paired-end FASTQ files from a RNASeq experiment and a design file with the link between the same name and the sample group - regex: ".*(fastq|fq)*" + One or more BAM alignments from a ChIP-Seq experiment and a design file with the link between the same name and the sample group. All files will be imported from GreenCenter ChIP-seq peak calling pipeline + regex: ".*(bam)*" min: 1 - - id: stranded - type: select - required: true - choices: - - [ '0', 'Unstranded'] - - [ '1', 'Stranded'] - - [ '2', 'Reverse Stranded'] - description: | - In the case that the sequence libraries where generated using a stranded specific protocol. - - - id: pairs - type: select - required: true - choices: - - [ 'pe', 'Paired End'] - - [ 'se', 'Single End'] - description: | - In single-end sequencing, the sequencer reads a fragment from only one end to the other, generating the sequence of base pairs. In paired-end reading it starts at one read, finishes this direction at the specified read length, and then starts another round of reading from the opposite end of the fragment. - - - id: align - type: select - required: true - choices: - - [ 'hisat', 'HiSAT2'] - - [ 'star', 'STAR'] - description: | - Alignment tool - - - id: markdups - type: select - required: true - choices: - - [ 'mark', 'Remove Duplicates'] - - [ 'keep', 'Keep All Sequences'] - description: | - Duplicate reads are defined as originating from the same original fragment of DNA. Duplicates are identified as read pairs having identical 5-prime positions (coordinate and strand) for both reads in a mate pair and optionally, matching unique molecular identifier reads. - id: design type: file @@ -144,20 +101,6 @@ workflow_parameters: description: | Reference genome for alignment - - id: geneset - type: select - choices: - - ['h.all.v5.1.symbols.gmt','Hallmark Gene Sets'] - - ['c2.all.v5.1.symbols.gmt','Curated Gene Sets'] - - ['c3.all.v5.1.symbols.gmt','Motif Gene Sets'] - - ['c5.all.v5.1.entrez.gmt','Gene Ontology Gene Sets'] - - ['c6.all.v5.1.symbols.gmt','Oncogenic Signatures'] - - ['c7.all.v5.1.entrez.gmt','Immunological Signatures'] - - required: true - description: | - Gene Set Definitions used for QuSAGE Analysis -- see http://software.broadinstitute.org/gsea/msigdb/ for geneset descriptions - # ----------------------------------------------------------------------------- # SHINY APP CONFIGURATION @@ -187,8 +130,6 @@ vizapp_cran_packages: vizapp_bioc_packages: - qusage - ballgown - - edgeR - - DESeq2 vizapp_github_packages: - js229/Vennerable diff --git a/workflow/main.nf b/workflow/main.nf new file mode 100644 index 0000000000000000000000000000000000000000..5a77874c0d8dd7b2e83f126cd52581fc87b6dc1c --- /dev/null +++ b/workflow/main.nf @@ -0,0 +1,309 @@ +#!/usr/bin/env nextflow + +// Default parameter values to run tests +params.fastqs="$baseDir/../test_data/*.fastq.gz" +params.design="$baseDir/../test_data/design.pe.txt" +params.genome="/project/shared/bicf_workflow_ref/GRCh38/" +params.markdups="mark" +params.stranded="0" +params.pairs="pe" +params.geneset = 'h.all.v5.1.symbols.gmt' +params.align = 'hisat' + +design_file = file(params.design) +fastqs=file(params.fastqs) +design_file = file(params.design) +gtf_file = file("$params.genome/gencode.gtf") +genenames = file("$params.genome/genenames.txt") +geneset = file("$params.genome/gsea_gmt/$params.geneset") + +// params genome is the directory +// base name for the index is always genome +index_path = file(params.genome) +index_name = "genome" +star_index = 'star_index/' + +// Pair handling, helper function taken from rnatoy +// which is covered by the GNU General Public License v3 +// https://github.com/nextflow-io/rnatoy/blob/master/main.nf + +def fileMap = [:] + +fastqs.each { + final fileName = it.getFileName().toString() + prefix = fileName.lastIndexOf('/') + fileMap[fileName] = it +} +def prefix = [] +new File(params.design).withReader { reader -> + def hline = reader.readLine() + def header = hline.split("\t") + prefixidx = header.findIndexOf{it == 'SampleID'}; + oneidx = header.findIndexOf{it == 'FullPathToFqR1'}; + twoidx = header.findIndexOf{it == 'FullPathToFqR2'}; + if (twoidx == -1) { + twoidx = oneidx + } + while (line = reader.readLine()) { + def row = line.split("\t") + if (fileMap.get(row[oneidx]) != null) { + prefix << tuple(row[prefixidx],fileMap.get(row[oneidx]),fileMap.get(row[twoidx])) + } + +} +} +if( ! prefix) { error "Didn't match any input files with entries in the design file" } + +if (params.pairs == 'pe') { +Channel + .from(prefix) + .set { read_pe } +Channel + .empty() + .set { read_se } +} +if (params.pairs == 'se') { +Channel + .from(prefix) + .into { read_se } +Channel + .empty() + .set { read_pe } +} + +// +// Trim raw reads using trimgalore +// +process trimpe { + input: + set pair_id, file(read1), file(read2) from read_pe + output: + set pair_id, file("${read1.baseName.split("\\.", 2)[0]}_val_1.fq.gz"), file("${read2.baseName.split("\\.", 2)[0]}_val_2.fq.gz") into trimpe + script: + """ + module load trimgalore/0.4.1 cutadapt/1.9.1 + trim_galore --paired -q 25 --illumina --gzip --length 35 ${read1} ${read2} + """ +} +process trimse { + input: + set pair_id, file(read1) from read_se + output: + set pair_id, file("${read1.baseName.split("\\.", 2)[0]}_trimmed.fq.gz") into trimse + script: + """ + module load trimgalore/0.4.1 cutadapt/1.9.1 + trim_galore -q 25 --illumina --gzip --length 35 ${read1} + """ +} + +// +// Align trimmed reads to genome indes with hisat2 +// Sort and index with samtools +// QC aligned reads with fastqc +// Alignment stats with samtools +// +process alignpe { + + publishDir "$baseDir/output", mode: 'copy' + cpus 32 + + input: + set pair_id, file(fq1), file(fq2) from trimpe + output: + set pair_id, file("${pair_id}.bam") into alignpe + file("${pair_id}.flagstat.txt") into alignstats_pe + file("${pair_id}.hisatout.txt") into hsatoutpe + set file("${pair_id}_fastqc.zip"),file("${pair_id}_fastqc.html") into fastqcpe + script: + if (params.align == 'hisat') + """ + module load hisat2/2.0.1-beta-intel samtools/intel/1.3 fastqc/0.11.2 picard/1.127 speedseq/20160506 + hisat2 -p 30 --no-unal --dta -x ${index_path}/${index_name} -1 ${fq1} -2 ${fq2} -S out.sam 2> ${pair_id}.hisatout.txt + sambamba view -t 30 -f bam -S -o output.bam out.sam + sambamba sort -t 30 -o ${pair_id}.bam output.bam + sambamba flagstat -t 30 ${pair_id}.bam > ${pair_id}.flagstat.txt + fastqc -f bam ${pair_id}.bam + """ + else + """ + module load star/2.4.2a samtools/intel/1.3 fastqc/0.11.2 picard/1.127 speedseq/20160506 + STAR --genomeDir ${index_path}/${star_index} --readFilesIn ${fq1} ${fq2} --readFilesCommand zcat --genomeLoad NoSharedMemory --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMheaderCommentFile COfile.txt --outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMunmapped Within --outFilterType BySJout --outSAMattributes NH HI AS NM MD --outSAMstrandField intronMotif --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM --sjdbScore 1 --limitBAMsortRAM 60000000000 --outFileNamePrefix out + mv outLog.final.out ${pair_id}.hisatout.txt + sambamba sort -t 30 -o ${pair_id}.bam outAligned.sortedByCoord.out.bam + sambamba flagstat -t 30 ${pair_id}.bam > ${pair_id}.flagstat.txt + fastqc -f bam ${pair_id}.bam + """ +} +process alignse { + + publishDir "$baseDir/output", mode: 'copy' + cpus 32 + + input: + set pair_id, file(fq1) from trimse + output: + set pair_id, file("${pair_id}.bam") into alignse + file("${pair_id}.flagstat.txt") into alignstats_se + file("${pair_id}.hisatout.txt") into hsatoutse + set file("${pair_id}_fastqc.zip"),file("${pair_id}_fastqc.html") into fastqcse + script: + if (params.align == 'hisat') + """ + module load hisat2/2.0.1-beta-intel samtools/intel/1.3 fastqc/0.11.2 speedseq/20160506 picard/1.127 + hisat2 -p 30 --no-unal --dta -x ${index_path}/${index_name} -U ${fq1} -S out.sam 2> ${pair_id}.hisatout.txt + sambamba view -t 30 -f bam -S -o output.bam out.sam + sambamba sort -t 30 -o ${pair_id}.bam output.bam + sambamba flagstat -t 30 ${pair_id}.bam > ${pair_id}.flagstat.txt + fastqc -f bam ${pair_id}.bam + """ + else + """ + module load star/2.4.2a samtools/intel/1.3 fastqc/0.11.2 picard/1.127 speedseq/20160506 + STAR --genomeDir ${index_path}/${star_index} --readFilesIn ${fq1} --readFilesCommand zcat --genomeLoad NoSharedMemory --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMheaderCommentFile COfile.txt --outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMunmapped Within --outFilterType BySJout --outSAMattributes NH HI AS NM MD --outSAMstrandField intronMotif --outSAMtype SAM --quantMode TranscriptomeSAM --sjdbScore 1 --limitBAMsortRAM 60000000000 --outFileNamePrefix out + mv outLog.final.out ${pair_id}.hisatout.txt + sambamba view -t 30 -f bam -S -o output.bam outAligned.out.sam + sambamba sort -t 30 -o ${pair_id}.bam output.bam + sambamba flagstat -t 30 ${pair_id}.bam > ${pair_id}.flagstat.txt + fastqc -f bam ${pair_id}.bam + """ +} + +// From here on we are the same for PE and SE, so merge channels and carry on + +Channel + .empty() + .mix(alignse, alignpe) + .tap { aligned2 } + .set { aligned } + +Channel + .empty() + .mix(alignstats_se, alignstats_pe) + .set { alignstats } + +Channel + .empty() + .mix(hsatoutse, hsatoutpe) + .set { hsatout } + +// +// Summarize all flagstat output +// +process parse_alignstat { + + publishDir "$baseDir/output", mode: 'copy' + + input: + file(txt) from alignstats.toList() + file(txt) from hsatout.toList() + + output: + file('alignment.summary.txt') + + """ + perl $baseDir/scripts/parse_flagstat.pl *.flagstat.txt + """ +} + +// +// Identify duplicate reads with Picard +// +process markdups { + + memory '4GB' + publishDir "$baseDir/output", mode: 'copy' + + input: + set pair_id, file(sbam) from aligned + output: + set pair_id, file("${pair_id}.dedup.bam") into deduped1 + set pair_id, file("${pair_id}.dedup.bam") into deduped2 + script: + if (params.markdups == 'mark') + """ + module load picard/1.127 speedseq/20160506 + sambamba markdup -t 20 -r ${sbam} ${pair_id}.dedup.bam + """ + else + """ + cp ${sbam} ${pair_id}.dedup.bam + """ +} + +// +// Read summarization with subread +// +process featurect { + + publishDir "$baseDir/output", mode: 'copy' + cpus 32 + + input: + set pair_id, file(dbam) from deduped1 + file gtf_file + output: + file("${pair_id}.cts") into counts + """ + module load module subread/1.5.0-intel + featureCounts -s params.stranded -T 30 -p -g gene_name -a ${gtf_file} -o ${pair_id}.cts ${dbam} + """ +} + +// +// Assemble transcripts with stringtie +// + +process stringtie { + + publishDir "$baseDir/output", mode: 'copy' + cpus 32 + + input: + set pair_id, file(dbam) from deduped2 + file gtf_file + output: + file("${pair_id}_stringtie") into strcts + """ + module load stringtie/1.1.2-intel + mkdir ${pair_id}_stringtie + cd ${pair_id}_stringtie + stringtie ../${dbam} -p 30 -G ../${gtf_file} -B -e -o denovo.gtf -A ../geneabund.stringtie.tab + """ +} + +process statanal { + publishDir "$baseDir/output", mode: 'copy' + input: + file count_file from counts.toList() + file design_file name 'design.txt' + file genenames + file geneset name 'geneset.gmt' + output: + file "*.txt" into txtfiles + file "*.png" into psfiles + file "*" + """ + module load R/3.2.1-intel + perl $baseDir/scripts/concat_cts.pl -o ./ *.cts + cp design.txt design.shiny.txt + cp geneset.gmt geneset.shiny.gmt + Rscript $baseDir/scripts/dea.R + perl $baseDir/scripts/concat_edgeR.pl *.edgeR.txt + """ +} + +process buildrda { + publishDir "$baseDir/output", mode: 'copy' + input: + file stringtie_dir from strcts.toList() + file design_file name 'design.txt' + output: + file "bg.rda" into rdafiles + """ + module load R/3.2.1-intel + Rscript $baseDir/scripts/build_ballgown.R *_stringtie + """ + } + +