Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • BICF/Astrocyte/chipseq_analysis
  • s190984/chipseq_analysis
  • astrocyte/workflows/bicf/chipseq_analysis
  • s219741/chipseq-analysis-containerized
Show changes
Showing
with 254 additions and 62 deletions
File added
File added
sample_id experiment_id biosample factor treatment replicate control_id fastq_read1
ENCLB497XZB ENCSR000DXB Panc1 H3K4me3 None 1 ENCLB304SBJ ENCFF001GBW.fastq.gz
ENCLB304SBJ ENCSR000DXC Panc1 Control None 1 ENCLB304SBJ ENCFF001HWJ.fastq.gz
......@@ -25,3 +25,9 @@ wget https://www.encodeproject.org/files/ENCFF161HBP/@@download/ENCFF161HBP.fast
wget https://www.encodeproject.org/files/ENCFF776KZU/@@download/ENCFF776KZU.fastq.gz
wget https://www.encodeproject.org/files/ENCFF119KHM/@@download/ENCFF119KHM.fastq.gz
echo "Done with Paired-end"
echo "Downloading Single-end data set Human ENCSR000DXB and ENCSR000DXC"
wget https://www.encodeproject.org/files/ENCFF001GBW/@@download/ENCFF001GBW.fastq.gz
wget https://www.encodeproject.org/files/ENCFF001GBV/@@download/ENCFF001GBV.fastq.gz
wget https://www.encodeproject.org/files/ENCFF001HWJ/@@download/ENCFF001HWJ.fastq.gz
echo "Done with Single-end"
Test.20.tagAlign.gz 18588987 0,20,33 0.211525291335199,0.211232019956852,0.211139666755398 35 0.2123067 1500 0.209429 1.01001 0.7284536 0
workflow/conf/bicf_logo.png

24.3 KiB

......@@ -2,6 +2,7 @@ process {
executor = 'slurm'
queue = 'super'
clusterOptions = '--hold'
beforeScript= 'ulimit -Ss unlimited'
// Process specific configuration
withName: checkDesignFile {
......@@ -44,6 +45,10 @@ process {
module = ['python/3.6.1-2-anaconda', 'macs/2.1.0-20151222', 'UCSC_userApps/v317', 'bedtools/2.26.0', 'phantompeakqualtools/1.2']
queue = '128GB,256GB,256GBv1'
}
withName: plotProfile {
module = ['deeptools/2.5.0.1']
cpus = 32
}
withName: consensusPeaks {
module = ['python/3.6.1-2-anaconda', 'bedtools/2.26.0']
executor = 'local'
......@@ -61,7 +66,7 @@ process {
cpus = 32
}
withName: multiqcReport {
module = ['python/3.6.1-2-anaconda', 'pandoc/2.7', 'multiqc/1.7']
module = ['python/3.6.1-2-anaconda', 'pandoc/2.7', 'singularity/3.0.2']
executor = 'local'
}
}
......@@ -70,22 +75,28 @@ params {
// Reference file paths on BioHPC
genomes {
'GRCh38' {
bwa = '/project/shared/bicf_workflow_ref/GRCh38'
bwa = '/project/shared/bicf_workflow_ref/human/GRCh38'
genomesize = 'hs'
chromsizes = '/project/shared/bicf_workflow_ref/GRCh38/genomefile.txt'
fasta = '/project/shared/bicf_workflow_ref/GRCh38/genome.fa'
chromsizes = '/project/shared/bicf_workflow_ref/human/GRCh38/genomefile.txt'
fasta = '/project/shared/bicf_workflow_ref/human/GRCh38/genome.fa'
gtf = '/project/shared/bicf_workflow_ref/human/GRCh38/gencode.v25.chr_patch_hapl_scaff.annotation.gtf'
geneNames = '/project/shared/bicf_workflow_ref/human/GRCh38/genenames.txt'
}
'GRCh37' {
bwa = '/project/shared/bicf_workflow_ref/GRCh37'
bwa = '/project/shared/bicf_workflow_ref/human/GRCh37'
genomesize = 'hs'
chromsizes = '/project/shared/bicf_workflow_ref/GRCh37/genomefile.txt'
fasta = '/project/shared/bicf_workflow_ref/GRCh37/genome.fa'
chromsizes = '/project/shared/bicf_workflow_ref/human/GRCh37/genomefile.txt'
fasta = '/project/shared/bicf_workflow_ref/human/GRCh37/genome.fa'
gtf = '/project/shared/bicf_workflow_ref/human/GRCh37/gencode.v19.chr_patch_hapl_scaff.annotation.gtf'
geneNames = '/project/shared/bicf_workflow_ref/human/GRCh37/genenames.txt'
}
'GRCm38' {
bwa = '/project/shared/bicf_workflow_ref/GRCm38'
bwa = '/project/shared/bicf_workflow_ref/mouse/GRCm38'
genomesize = 'mm'
chromsizes = '/project/shared/bicf_workflow_ref/GRCm38/genomefile.txt'
fasta = '/project/shared/bicf_workflow_ref/GRCm38/genome.fa'
chromsizes = '/project/shared/bicf_workflow_ref/mouse/GRCm38/genomefile.txt'
fasta = '/project/shared/bicf_workflow_ref/mouse/GRCm38/genome.fa'
gtf = '/project/shared/bicf_workflow_ref/mouse/GRCm38/gencode.vM20.annotation.gtf'
geneNames = '/project/shared/bicf_workflow_ref/mouse/GRCm38/genenames.txt'
}
}
}
......
# Custom Logo
custom_logo: 'bicf_logo.png'
custom_logo_url: 'https://www.utsouthwestern.edu/labs/bioinformatics/'
custom_logo_title: 'Bioinformatics Core Facility'
report_header_info:
- Contact E-mail: 'bicf@utsouthwestern.edu'
- Application Type: 'ChIP-seq'
- Department: 'Bioinformatic Core Facility, Department of Bioinformatics'
- Contributors and Licensing: 'See https://doi.org/10.5281/zenodo.2648844'
# Title to use for the report.
title: BICF ChIP-seq Analysis Report
......
#!/usr/bin/env nextflow
/*
BICF ChIP-seq Analysis Workflow
#### Homepage / Documentation
https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/
Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
*/
// Path to an input file, or a pattern for multiple inputs
// Note - $baseDir is the location of this workflow file main.nf
// Define Input variables
params.reads = "$baseDir/../test_data/*.fastq.gz"
params.pairedEnd = 'false'
params.pairedEnd = false
params.designFile = "$baseDir/../test_data/design_ENCSR238SGC_SE.txt"
params.genome = 'GRCm38'
params.cutoffRatio = 1.2
params.outDir= "$baseDir/output"
params.extendReadsLen = 100
params.topPeakCount = 600
params.astrocyte = 'false'
params.astrocyte = false
params.skipDiff = false
params.skipMotif = false
params.skipPlotProfile = false
params.references = "$baseDir/../docs/references.md"
params.multiqc = "$baseDir/conf/multiqc_config.yaml"
params.ci = false
params.dev = false
// Assign variables if astrocyte
if (params.astrocyte) {
print("Running under astrocyte")
referenceLocation = "/project/shared/bicf_workflow_ref"
params.bwaIndex = "$referenceLocation/$params.genome"
params.chromSizes = "$referenceLocation/$params.genome/genomefile.txt"
params.fasta = "$referenceLocation/$params.genome/genome.fa.txt"
if (params.genome == 'GRCh37' || params.genome == 'GRCh38') {
if (params.genome == 'GRCh37') {
params.bwaIndex = "$referenceLocation/human/$params.genome"
params.chromSizes = "$referenceLocation/human/$params.genome/genomefile.txt"
params.fasta = "$referenceLocation/human/$params.genome/genome.fa"
params.gtf = "$referenceLocation/human/$params.genome/gencode.v19.chr_patch_hapl_scaff.annotation.gtf"
params.geneNames = "$referenceLocation/human/$params.genome/genenames.txt"
params.genomeSize = 'hs'
} else if (params.genome == 'GRCm38') {
params.bwaIndex = "$referenceLocation/mouse/$params.genome"
params.chromSizes = "$referenceLocation/mouse/$params.genome/genomefile.txt"
params.fasta = "$referenceLocation/mouse/$params.genome/genome.fa"
params.gtf = "$referenceLocation/mouse/$params.genome/gencode.vM20.annotation.gtf"
params.geneNames = "$referenceLocation/mouse/$params.genome/genenames.txt"
params.genomeSize = 'mm'
} else if (params.genome == 'GRCh38') {
params.bwaIndex = "$referenceLocation/human/$params.genome"
params.chromSizes = "$referenceLocation/human/$params.genome/genomefile.txt"
params.fasta = "$referenceLocation/human/$params.genome/genome.fa"
params.gtf = "$referenceLocation/human/$params.genome/gencode.v25.chr_patch_hapl_scaff.annotation.gtf"
params.geneNames = "$referenceLocation/human/$params.genome/genenames.txt"
params.genomeSize = 'hs'
}
} else {
params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false
params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ?: false : false
params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false
params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false
params.gtf = params.genome ? params.genomes[ params.genome ].gtf ?: false : false
params.geneNames = params.genome ? params.genomes[ params.genome ].geneNames ?: false : false
}
......@@ -56,7 +86,8 @@ readsList = Channel
.collectFile( name: 'fileList.tsv', newLine: true )
// Define regular variables
designFile = params.designFile
pairedEnd = params.pairedEnd
designFile = Channel.fromPath(params.designFile)
genomeSize = params.genomeSize
genome = params.genome
chromSizes = params.chromSizes
......@@ -67,15 +98,37 @@ extendReadsLen = params.extendReadsLen
topPeakCount = params.topPeakCount
skipDiff = params.skipDiff
skipMotif = params.skipMotif
skipPlotProfile = params.skipPlotProfile
references = params.references
multiqc = params.multiqc
gtfFile = params.gtf
geneNames = params.geneNames
if (params.pairedEnd == 'false'){
pairedEnd = false
} else {
pairedEnd = true
/*
* trackStart: track start of pipeline
*/
process trackStart {
script:
"""
hostname
ulimit -a
curl -H 'Content-Type: application/json' -X PUT -d '{ \
"sessionId": "${workflow.sessionId}", \
"pipeline": "chipseq_analysis", \
"start": "${workflow.start}", \
"astrocyte": ${params.astrocyte}, \
"status": "started", \
"nextflowVersion": "${workflow.nextflow.version}", \
"pipelineVersion": "1.1.2", \
"ci": ${params.ci}, \
"dev": ${params.dev}}' \
"https://xku43pcwnf.execute-api.us-east-1.amazonaws.com/ProdDeploy/pipeline-tracking"
"""
}
// Check design file for errors
process checkDesignFile {
......@@ -83,7 +136,7 @@ process checkDesignFile {
input:
designFile
file designFile
file readsList
output:
......@@ -331,6 +384,8 @@ process crossReads {
}
else {
"""
module load python/3.6.1-2-anaconda
module load phantompeakqualtools/1.2
python3 $baseDir/scripts/xcor.py -t $seTagAlign
"""
}
......@@ -417,6 +472,7 @@ process callPeaksMACS {
set sampleId, file('*.narrowPeak'), file('*.fc_signal.bw'), file('*.pvalue_signal.bw'), experimentId, biosample, factor, treatment, replicate, controlId into experimentPeaks
file '*.xls' into callPeaksMACSsummit
file('version_*.txt') into callPeaksMACSVersions
file("*.fc_signal.bw") into bigwigs
script:
......@@ -449,6 +505,29 @@ peaksDesign = experimentPeaks
"$sampleId\t$peak\t$fcSignal\t$pvalueSignal\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"}
.collectFile(name:'design_peak.tsv', seed:"sample_id\tpeaks\tfc_signal\tpvalue_signal\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$outDir/design")
//plotProfile
process plotProfile {
publishDir "$outDir/experimentQC", mode: 'copy'
input:
file bigWigList from bigwigs.collect()
output:
file '*.{png,gz}' into plotProfile
when:
!skipPlotProfile
script:
"""
module load deeptools/2.5.0.1
bash $baseDir/scripts/plot_profile.sh -g $gtfFile
"""
}
// Calculate Consensus Peaks
process consensusPeaks {
......@@ -497,7 +576,7 @@ process peakAnnotation {
"""
module load R/3.3.2-gccmkl
Rscript $baseDir/scripts/annotate_peaks.R $designAnnotatePeaks $genome
Rscript $baseDir/scripts/annotate_peaks.R $designAnnotatePeaks $gtfFile $geneNames
"""
}
......@@ -524,7 +603,9 @@ process motifSearch {
script:
"""
module load R/3.3.2-gccmkl
module load python/3.6.1-2-anaconda
module load meme/4.11.1-gcc-openmpi
module load bedtools/2.26.0
python3 $baseDir/scripts/motif_search.py -d $designMotifSearch -g $fasta -p $topPeakCount
"""
}
......@@ -559,15 +640,13 @@ process diffPeaks {
"""
module load python/3.6.1-2-anaconda
module load meme/4.11.1-gcc-openmpi
module load bedtools/2.26.0
module load R/3.3.2-gccmkl
Rscript $baseDir/scripts/diff_peaks.R $designDiffPeaks
"""
}
// Generate Multiqc Report, gerernate Software Versions and references
process multiqcReport {
publishDir "$outDir/${task.process}", mode: 'copy'
input:
......@@ -600,11 +679,12 @@ process multiqcReport {
"""
module load python/3.6.1-2-anaconda
module load pandoc/2.7
module load multiqc/1.7
module load singularity/3.0.2
echo $workflow.nextflow.version > version_nextflow.txt
multiqc --version > version_multiqc.txt
singularity exec /project/shared/bicf_workflow_ref/singularity_images/bicf-multiqc-2.0.0.img multiqc --version > version_multiqc.txt
python --version &> version_python.txt
python3 $baseDir/scripts/generate_references.py -r $references -o software_references
python3 $baseDir/scripts/generate_versions.py -o software_versions
multiqc -c $multiqc .
singularity exec /project/shared/bicf_workflow_ref/singularity_images/bicf-multiqc-2.0.0.img multiqc -c $multiqc .
"""
}
......@@ -4,11 +4,28 @@ profiles {
}
}
trace {
enabled = true
file = 'pipeline_trace.txt'
fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss'
}
timeline {
enabled = true
file = 'timeline.html'
}
report {
enabled = true
file = 'report.html'
}
manifest {
name = 'chipseq_analysis'
description = 'BICF ChIP-seq Analysis Workflow.'
homePage = 'https://github.com/nf-core/rnaseq'
version = '1.0.0'
homePage = 'https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis'
version = '1.1.2'
mainScript = 'main.nf'
nextflowVersion = '>=0.31.0'
}
#!/bin/Rscript
# Load libraries
library("ChIPseeker")
# Currently mouse or human
#*
#* --------------------------------------------------------------------------
#* Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
#* --------------------------------------------------------------------------
#*
library("TxDb.Hsapiens.UCSC.hg19.knownGene")
library("TxDb.Mmusculus.UCSC.mm10.knownGene")
library("TxDb.Hsapiens.UCSC.hg38.knownGene")
library("org.Hs.eg.db")
library("org.Mm.eg.db")
#Currently Human or Mouse
# Load libraries
library("ChIPseeker")
library(GenomicFeatures)
# Create parser object
args <- commandArgs(trailingOnly=TRUE)
# Check input args
if (length(args) != 2) {
stop("Usage: annotate_peaks.R annotate_design.tsv genome_assembly", call.=FALSE)
if (length(args) != 3) {
stop("Usage: annotate_peaks.R annotate_design.tsv gtf geneNames", call.=FALSE)
}
design_file <- args[1]
genome_assembly <- args[2]
gtf <- args[2]
geneNames <- args[3]
# Load UCSC Known Genes
if(genome_assembly=='GRCh37') {
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
annodb <- 'org.Hs.eg.db'
} else if(genome_assembly=='GRCm38') {
txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
annodb <- 'org.Mm.eg.db'
} else if(genome_assembly=='GRCh38') {
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
annodb <- 'org.Hs.eg.db'
}
txdb <- makeTxDbFromGFF(gtf)
sym <- read.table(geneNames, header=T, sep='\t') [,4:5]
# Output version of ChIPseeker
chipseeker_version = packageVersion('ChIPseeker')
......@@ -48,18 +41,19 @@ names(files) <- design$Condition
# Granges of files
peaks <- lapply(files, readPeakFile, as = "GRanges", header = FALSE)
peakAnnoList <- lapply(peaks, annotatePeak, TxDb=txdb, annoDb=annodb, tssRegion=c(-3000, 3000), verbose=FALSE)
peakAnnoList <- lapply(peaks, annotatePeak, TxDb=txdb, tssRegion=c(-3000, 3000), verbose=FALSE)
column_names <- c("chr", "start", "end", "width", "strand_1", "name", "score", "strand", "signalValue",
column_names <- c("geneId","chr", "start", "end", "width", "strand_1", "name", "score", "strand", "signalValue",
"pValue", "qValue", "peak", "annotation", "geneChr", "geneStart", "geneEnd",
"geneLength" ,"geneStrand", "geneId", "transcriptId", "distanceToTSS",
"ENSEMBL", "symbol", "geneName")
"geneLength" ,"geneStrand", "transcriptId", "distanceToTSS", "symbol")
for(index in c(1:length(peakAnnoList))) {
filename <- paste(names(peaks)[index], ".chipseeker_annotation.tsv", sep="")
df <- as.data.frame(peakAnnoList[[index]])
colnames(df) <- column_names
write.table(df[ , !(names(df) %in% c('strand_1'))], filename, sep="\t" ,quote=F, row.names=F)
df$geneId <- sapply(strsplit(as.character(df$geneId), split = "\\."), "[[", 1)
df_final <- merge(df, sym, by.x="geneId", by.y="ensembl", all.x=T)
colnames(df_final) <- column_names
write.table(df_final[ , !(names(df_final) %in% c('strand_1'))], filename, sep="\t" ,quote=F, row.names=F)
# Draw individual plots
......
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Generate peaks from data.'''
import os
......@@ -132,8 +138,20 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
with open(xcor, 'r') as xcor_fh:
firstline = xcor_fh.readline()
frag_lengths = firstline.split()[2] # third column
fragment_length = frag_lengths.split(',')[0] # grab first value
logger.info("Fraglen %s", fragment_length)
frag_lengths_array = frag_lengths.split(',')
fragment_length = 0
fragment = False
# Loop through all values of fragment length
for f in frag_lengths.split(','):
fragment_length = f
logger.info("Fraglen %s", fragment_length)
if int(fragment_length) > 50:
fragment = True
break
if fragment == False:
logger.info('Error in cross-correlation analysis: %s', frag_lengths_array)
raise Exception("Error in cross-correlation analysis: %s" % frag_lengths_array)
# Generate narrow peaks and preliminary signal tracks
......
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Check if design file is correctly formatted and matches files list.'''
import argparse
......
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Convert tagAlign files for further processing.'''
import os
......
#!/bin/Rscript
#*
#* --------------------------------------------------------------------------
#* Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
#* --------------------------------------------------------------------------
#*
# Load libraries
library("DiffBind")
......
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Generate experiment design files for downstream processing.'''
import argparse
......
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Experiment correlation and enrichment of reads over genome-wide bins.'''
......
#!/usr/bin/env python
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Make header for HTML of references.'''
import argparse
......
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Make YAML of software versions.'''
from __future__ import print_function
......@@ -40,6 +46,7 @@ SOFTWARE_REGEX = {
'MEME-ChIP': ['motifSearch_vf/version_memechip.txt', r"Version (\S+)"],
'DiffBind': ['diffPeaks_vf/version_DiffBind.txt', r"Version (\S+)\""],
'deepTools': ['experimentQC_vf/version_deeptools.txt', r"deeptools (\S+)"],
'Python': ['version_python.txt', r"Python (\S+)"],
'MultiQC': ['version_multiqc.txt', r"multiqc, version (\S+)"],
}
......@@ -102,6 +109,7 @@ def main():
results['DiffBind'] = '<span style="color:#999999;\">Not Run</span>'
results['deepTools'] = '<span style="color:#999999;\">Not Run</span>'
results['MultiQC'] = '<span style="color:#999999;\">Not Run</span>'
results['Python'] = '<span style="color:#999999;\">Not Run</span>'
# list all files
files = glob.glob('**/*.txt', recursive=True)
......
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Remove duplicates and filter unmapped reads.'''
import os
......