Commit 1bcd46a3 authored by Brandi Cantarel's avatar Brandi Cantarel
Browse files

fixing se mode and add dedup option

parents c07a6a71 3861d9ad
......@@ -99,13 +99,26 @@ workflow_parameters:
- [ 'se', 'Single End']
description: |
In single-end sequencing, the sequencer reads a fragment from only one end to the other, generating the sequence of base pairs. In paired-end reading it starts at one read, finishes this direction at the specified read length, and then starts another round of reading from the opposite end of the fragment.
- id: markdups
type: select
required: true
choices:
- [ 'mark', 'Remove Duplicates']
- [ 'keep', 'Keep All Sequences']
description: |
Duplicate reads are defined as originating from the same original fragment of DNA. Duplicates are identified as read pairs having identical 5' positions (coordinate and strand) for both reads in a mate pair and optionally, matching unique molecular identifier reads.
- id: design
type: file
required: true
description: |
A design file listing pairs of sample name and sample group.
Columns must include: SampleID,SampleName,SampleGroup
Columns must include: SampleID,SampleName,SampleGroup,FullPathToFqR1,FullPathToFqR2
FullPathToFqR1 and FullPathToFqR2 are the file names of the R1 and R2 read files.
ie Sample1.R1.fastq.gz, Sample_1.fastq.gz, Sample.fq.gz
Single end mode needs only the FullPathToFqR1 column
Optional columns are encouraged, such as:
SubjectID,Tissue,Gender,CultureDate,Organism,CellPopulation
......
......@@ -3,12 +3,14 @@
// Default parameter values to run tests
params.pairs="pe"
params.markdups="mark"
params.genome="/project/apps_database/hisat2_index/hg38/"
params.gtf="/project/apps_database/iGenomes/Homo_sapiens/NCBI/GRCh38/Annotation/Genes.gencode/genes.gtf"
params.design="$baseDir/../test_data/design.txt"
params.fastqs="$baseDir/../test_data/*.fastq.gz"
params.design="$baseDir/../test_data/design.txt"
design_file = file(params.design)
fastqs=params.fastqs
fastqs=file(params.fastqs)
design_file = file(params.design)
gtf_file = file(params.gtf)
......@@ -21,84 +23,55 @@ index_name = "genome"
// which is covered by the GNU General Public License v3
// https://github.com/nextflow-io/rnatoy/blob/master/main.nf
read_pe = Channel.empty()
read_se = Channel.empty()
def fileMap = [:]
fastqs.each {
final fileName = it.getFileName().toString()
prefix = fileName.lastIndexOf('/')
fileMap[fileName] = it
}
def prefix = []
new File(params.design).withReader { reader ->
def hline = reader.readLine()
def header = hline.split("\t")
prefixidx = header.findIndexOf{it == 'SampleID'};
oneidx = header.findIndexOf{it == 'FullPathToFqR1'};
twoidx = header.findIndexOf{it == 'FullPathToFqR2'};
if (twoidx == -1) {
twoidx = oneidx
}
while (line = reader.readLine()) {
def row = line.split("\t")
if (fileMap.get(row[oneidx]) != null) {
prefix << tuple(row[prefixidx],fileMap.get(row[oneidx]),fileMap.get(row[twoidx]))
}
}
}
if (params.pairs == 'pe') {
Channel
.fromPath( fastqs )
.ifEmpty { error "Cannot find any files matching: ${fastqs}" }
.map { path ->
def prefix = readPrefix(path, '*{1,2}.{fastq,fq}{,.gz}')
tuple(prefix, path)
}
.groupTuple(sort: true)
.map { id, files -> tuple(id, files[0],files[1])}
.set { read_pe }
}
.from(prefix)
.into { read_pe }
}
if (params.pairs == 'se') {
Channel
.fromPath( fastqs )
.ifEmpty { error "Cannot find any files matching: ${fastqs}" }
.map { path ->
def prefix = readPrefix(path, '*{1,2}.{fastq,fq}{,.gz}')
tuple(prefix, path)
}
.groupTuple(sort: true)
.map { id, files -> tuple(id, files[0])}
.set { read_se }
}
/*
* Helper function, given a file Path
* returns the file name region matching a specified glob pattern
* starting from the beginning of the name up to last matching group.
*
* For example:
* readPrefix('/some/data/file_alpha_1.fa', 'file*_1.fa' )
*
* Returns:
* 'file_alpha'
*/
def readPrefix( Path actual, template ) {
final fileName = actual.getFileName().toString()
def filePattern = template.toString()
int p = filePattern.lastIndexOf('/')
if( p != -1 ) filePattern = filePattern.substring(p+1)
if( !filePattern.contains('*') && !filePattern.contains('?') )
filePattern = '*' + filePattern
def regex = filePattern
.replace('.','\\.')
.replace('*','(.*)')
.replace('?','(.?)')
.replace('{','(?:')
.replace('}',')')
.replace(',','|')
def matcher = (fileName =~ /$regex/)
if( matcher.matches() ) {
def end = matcher.end(matcher.groupCount() )
def prefix = fileName.substring(0,end)
while(prefix.endsWith('-') || prefix.endsWith('_') || prefix.endsWith('.') )
prefix=prefix[0..-2]
return prefix
}
return null
.from(prefix)
.into { read_se }
}
//
// Trim raw reads using trimgalore
//
process trimpe {
when:
params.pairs == 'pe'
input:
set pair_id, file(read1), file(read2) from read_pe
output:
set pair_id, file("${read1.baseName.split("\\.", 2)[0]}_val_1.fq.gz"), file("${read2.baseName.split("\\.", 2)[0]}_val_2.fq.gz") into trimpe
when:
params.pairs == 'pe'
script:
"""
module load trimgalore/0.4.1 cutadapt/1.9.1
......@@ -106,12 +79,12 @@ process trimpe {
"""
}
process trimse {
when:
params.pairs == 'se'
input:
set pair_id, file(read1) from read_se
output:
set pair_id, file("${read1.baseName.split("\\.", 2)[0]}_trimmed.fq.gz") into trimse
when:
params.pairs == 'se'
script:
"""
module load trimgalore/0.4.1 cutadapt/1.9.1
......@@ -133,9 +106,8 @@ process alignpe {
input:
set pair_id, file(fq1), file(fq2) from trimpe
output:
set pair_id, file("${pair_id}.bam") into aligned
set pair_id, file("${pair_id}.bam") into aligned2
file("${pair_id}.flagstat.txt") into alignstats
set pair_id, file("${pair_id}.bam") into alignpe
file("${pair_id}.flagstat.txt") into alignstats_pe
"""
module load hisat2/2.0.1-beta-intel samtools/intel/1.3 fastqc/0.11.2 picard/1.127
hisat2 -p 4 --no-unal --dta -x ${index_path}/${index_name} -1 ${fq1} -2 ${fq2} -S out.sam 2> ${pair_id}.hisatout.txt
......@@ -155,9 +127,8 @@ process alignse {
input:
set pair_id, file(fq1) from trimse
output:
set pair_id, file("${pair_id}.bam") into aligned
set pair_id, file("${pair_id}.bam") into aligned2
file("${pair_id}.flagstat.txt") into alignstats
set pair_id, file("${pair_id}.bam") into alignse
file("${pair_id}.flagstat.txt") into alignstats_se
"""
module load hisat2/2.0.1-beta-intel samtools/intel/1.3 fastqc/0.11.2 picard/1.127
hisat2 -p 4 --no-unal --dta -x ${index_path}/${index_name} -U ${fq1} -S out.sam 2> ${pair_id}.hisatout.txt
......@@ -170,6 +141,20 @@ process alignse {
"""
}
// From here on we are the same for PE and SE, so merge channels and carry on
Channel
.empty()
.mix(alignse, alignpe)
.tap { aligned2 }
.set { aligned }
Channel
.empty()
.mix(alignstats_se, alignstats_pe)
.set { alignstats }
//
// Summarize all flagstat output
//
......@@ -201,10 +186,15 @@ process markdups {
output:
set pair_id, file("${pair_id}.dedup.bam") into deduped1
set pair_id, file("${pair_id}.dedup.bam") into deduped2
set pair_id, file("${pair_id}.dups") into dupstats
script:
if (params.markdups == 'mark')
"""
module load picard/1.127
java -Xmx4g -jar \$PICARD/picard.jar MarkDuplicates INPUT=${sbam} REMOVE_DUPLICATES=true VALIDATION_STRINGENCY=LENIENT ASSUME_SORTED=true METRICS_FILE="${pair_id}.dups" OUTPUT="${pair_id}.dedup.bam"
java -Xmx4g -jar \$PICARD/picard.jar MarkDuplicates INPUT=${sbam} REMOVE_DUPLICATES=true VALIDATION_STRINGENCY=LENIENT ASSUME_SORTED=true METRICS_FILE=${pair_id}.dups OUTPUT=${pair_id}.dedup.bam
"""
else
"""
cp ${sbam} ${pair_id}.dedup.bam
"""
}
......@@ -288,16 +278,3 @@ process buildrda {
Rscript $baseDir/scripts/build_ballgown.R *_stringtie
"""
}
// process printHello {
// input:
// file r1
// file r2
// output:
// stdout into result
// """
// echo ${index_path} ${index_name}
// """
// }
// result.println()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment