Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • BICF/Astrocyte/chipseq_analysis
  • s190984/chipseq_analysis
  • astrocyte/workflows/bicf/chipseq_analysis
  • s219741/chipseq-analysis-containerized
Show changes
Commits on Source (155)
Showing
with 695 additions and 157 deletions
......@@ -97,3 +97,6 @@ ENV/
# mypy
.mypy_cache/
# Mac OS
.DS_Store
before_script:
- module add python/3.6.1-2-anaconda
- pip install --user pytest-pythonpath pytest-cov
- pip install --user pytest-pythonpath==0.7.1 pytest-cov==2.5.1
- module load nextflow/0.31.0
- ln -s /work/BICF/s163035/chipseq/*fastq.gz test_data/
- ln -s /project/shared/bicf_workflow_ref/workflow_testdata/chipseq/*fastq.gz test_data/
stages:
- unit
- astrocyte
- single
- multiple
- skip
user_configuration:
stage: unit
......@@ -15,34 +17,66 @@ user_configuration:
- pytest -m unit
- pytest -m unit --cov=./workflow/scripts
astrocyte:
stage: astrocyte
script:
- module load astrocyte/0.1.0
- module unload nextflow
- cd ..
- astrocyte_cli validate chipseq_analysis
artifacts:
expire_in: 2 days
single_end_mouse:
stage: single
only:
- master
script:
- nextflow run workflow/main.nf -resume
- nextflow run workflow/main.nf --astrocyte true -resume
- pytest -m singleend
artifacts:
expire_in: 2 days
paired_end_human:
stage: single
only:
- branches
except:
- master
script:
- nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_ENCSR729LGA_PE.txt" --genome 'GRCh38' --pairedEnd true -resume
- nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_ENCSR729LGA_PE.txt" --genome 'GRCh38' --pairedEnd true --astrocyte false -resume
- pytest -m pairedend
artifacts:
expire_in: 2 days
single_end_diff:
stage: multiple
only:
- branches
except:
- master
script:
- nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_SE.txt" --genome 'GRCm38' -resume
- nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_SE.txt" --genome 'GRCm38' --astrocyte false -resume
- pytest -m singlediff
artifacts:
expire_in: 2 days
paired_end_diff:
only:
- master
stage: multiple
script:
- nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_PE.txt" --genome 'GRCh38' --pairedEnd true -resume
- nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_PE.txt" --genome 'GRCh38' --pairedEnd true --astrocyte false -resume
- pytest -m paireddiff
artifacts:
expire_in: 2 days
single_end_skip:
stage: skip
only:
- master
script:
- nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_SE.txt" --genome 'GRCm38' --skipDiff true --skipMotif true --astrocyte false -resume
- pytest -m singleskip_true
artifacts:
expire_in: 2 days
......@@ -5,6 +5,7 @@
[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A50.24.0-brightgreen.svg
)](https://www.nextflow.io/)
[![Astrocyte](https://img.shields.io/badge/astrocyte-%E2%89%A50.1.0-blue.svg)](https://astrocyte-test.biohpc.swmed.edu/static/docs/index.html)
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.2648845.svg)](https://doi.org/10.5281/zenodo.2648845)
## Introduction
......
......@@ -11,7 +11,7 @@ name: 'chipseq_analysis_bicf'
# Who wrote this?
author: 'Beibei Chen and Venkat Malladi'
# A contact email address for questions
email: 'biohpc-help@utsouthwestern.edu'
email: 'bicf@utsouthwestern.edu'
# A more informative title for the workflow package
title: 'BICF ChIP-seq Analysis Workflow'
# A summary of the workflow package in plain text
......@@ -27,7 +27,7 @@ description: |
# web interface. These files are in the 'docs' subdirectory. The first file
# listed will be used as a documentation index and is index.md by convention
documentation_files:
- ['index.md', 'chipseq-analysis']
- 'index.md'
# -----------------------------------------------------------------------------
# NEXTFLOW WORKFLOW CONFIGURATION
......@@ -42,15 +42,17 @@ workflow_modules:
- 'python/3.6.1-2-anaconda'
- 'trimgalore/0.4.1'
- 'bwa/intel/0.7.12'
- 'samtools/1.6'
- 'sambamba/0.6.6'
- 'bedtools/2.26.0'
- 'deeptools/2.5.0.1'
- 'phantompeakqualtools/1.2'
- 'macs/2.1.0-20151222'
- 'UCSC_userApps/v317'
- 'R/3.4.1-gccmkl'
- 'R/3.3.2-gccmkl'
- 'meme/4.11.1-gcc-openmpi'
- 'python/2.7.x-anaconda'
- 'pandoc/2.7'
# A list of parameters used by the workflow, defining how to present them,
# options etc in the web interface. For each parameter:
......@@ -92,20 +94,21 @@ workflow_parameters:
description: |
One or more input FASTQ files from a ChIP-seq expereiment and a design
file with the link bewetwen the same file name and sample id
regex: ".*(fastq|fq)*"
regex: ".*(fastq|fq)*gz"
min: 2
- id: pairedEnd
type: select
required: true
choices:
- [ 'true', 'True']
- [ 'false', 'False']
- [ 'true', 'true']
- [ 'false', 'false']
description: |
In single-end sequencing, the sequencer reads a fragment from only one
end to the other, generating the sequence of base pairs. In paired-end
reading it starts at one read, finishes this direction at the specified
read length, and then starts another round of reading from the opposite
end of the fragment.
end of the fragment. (Paired-end: True, Single-end: False)
- id: design
type: file
......@@ -113,18 +116,45 @@ workflow_parameters:
description: |
A design file listing sample id, fastq files, corresponding control id
and additional information about the sample.
regex: ".*tsv"
regex: ".*txt"
- id: genome
type: select
required: true
choices:
- [ 'GRCh38', 'Human GRCh38']
- [ 'GRCh37', 'Human GRCh37']
- [ 'GRCm38', 'Mouse GRCm38']
required: true
description: |
Reference species and genome used for alignment and subsequent analysis.
- id: skipDiff
type: select
required: true
choices:
- [ 'true', 'true']
- [ 'false', 'false']
description: |
Run differential peak analysis
- id: skipMotif
type: select
required: true
choices:
- [ 'true', 'true']
- [ 'false', 'false']
description: |
Run motif calling
- id: astrocyte
type: select
choices:
- [ 'true', 'true' ]
required: true
default: 'true'
description: |
Ensure configuraton for astrocyte.
# -----------------------------------------------------------------------------
# SHINY APP CONFIGURATION
......@@ -144,8 +174,4 @@ vizapp_cran_packages:
# List of any Bioconductor packages, not provided by the modules,
# that must be made available to the vizapp
vizapp_bioc_packages:
- qusage
# - ballgown
vizapp_github_packages:
- js229/Vennerable
vizapp_bioc_packages: []
SampleID,Tissue,Factor,Condition,Replicate,Peaks,bamReads,bamControl,ControlID,PeakCaller
A_1,A,H3K27AC,A,1,A_1.broadPeak,A_1.bam,A_1_input.bam,A_1_input,bed
A_2,A,H3K27AC,A,2,A_2.broadPeak,A_2.bam,A_2_input.bam,A_2_input,bed
B_1,B,H3K27AC,B,1,B_1.broadPeak,B_1.bam,B_1_input.bam,B_1_input,bed
B_2,B,H3K27AC,B,2,B_2.broadPeak,B_2.bam,B_2_input.bam,B_2_input,bed
C_1,C,H3K27AC,C,1,C_1.broadPeak,C_1.bam,C_1_input.bam,C_1_input,bed
C_2,C,H3K27AC,C,2,C_2.broadPeak,C_2.bam,C_2_input.bam,C_2_input,bed
sample_id experiment_id biosample factor treatment replicate control_id fastq_read1
A1 A tissueA H3K27AC None 1 B1 A1.fastq.gz
A2 A tissueA H3K27AC None 2 B2 A2.fastq.gz
B1 B tissueB Input None 1 B1 B1.fastq.gz
B2 A tissueB Input None 2 B2 B2.fastq.gz
# Astrocyte ChIPseq analysis Workflow Package
# BICF ChIP-seq Analysis Workflow
## Introduction
**ChIP-seq Analysis** is a bioinformatics best-practice analysis pipeline used for chromatin immunoprecipitation (ChIP-seq) data analysis.
......@@ -7,16 +7,16 @@ The pipeline uses [Nextflow](https://www.nextflow.io), a bioinformatics workflow
### Pipeline Steps
1) Trim adaptors TrimGalore!
2) Align with BWA
3) Filter reads with Sambamba S
4) Quality control with DeepTools
5) Calculate Cross-correlation using SPP and PhantomPeakQualTools
6) Signal profiling using MACS2
7) Call consenus peaks
8) Annotate all peaks using ChipSeeker
9) Use MEME-ChIP to find motifs in original peaks
10) Find differential expressed peaks using DiffBind (If more than 1 experiment)
1) Trim adaptors TrimGalore!
2) Align with BWA
3) Filter reads with Sambamba S
4) Quality control with DeepTools
5) Calculate Cross-correlation using SPP and PhantomPeakQualTools
6) Signal profiling using MACS2
7) Call consenus peaks
8) Annotate all peaks using ChipSeeker
9) Use MEME-ChIP to find motifs in original peaks
10) Find differential expressed peaks using DiffBind (If more than 1 experiment)
## Workflow Parameters
......@@ -25,41 +25,35 @@ The pipeline uses [Nextflow](https://www.nextflow.io), a bioinformatics workflow
pairedEnd - Choose True/False if data is paired-end
design - Choose the file with the experiment design information. TSV format
genome - Choose a genomic reference (genome).
skipDiff - Choose True/False if data if you want to run Differential Peaks
skipMotif - Choose True/False if data if you want to run Motif Calling
## Design file
The following columns are necessary, must be named as in template. An design file template can be downloaded [HERE](https://git.biohpc.swmed.edu/bchen4/chipseq_analysis/raw/master/docs/design_example.csv)
SampleID
The id of the sample. This will be the header in output files, please make sure it is concise
Tissue
Tissue of the sample
Factor
Factor of the experiment
Condition
This is the group that will be used for pairwise differential expression analysis
Replicate
Replicate id
Peaks
The file name of the peak file for this sample
bamReads
The file name of the IP BAM for this sample
bamControl
The file name of the control BAM for this sample
ContorlID
The id of the control sample
PeakCaller
The peak caller used
The following columns are necessary, must be named as in template. An design file template can be downloaded [HERE](https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/blob/master/docs/design_example.txt)
sample_id
The id of the sample. This will be the name used in output files, please make sure it is concise and informative.
experiment_id
The id of the experiment. Used for grouping replicates.
biosample
The name of the biological sample.
factor
Factor of the experiment.
treatment
Treatment used in experiment.
replicate
Replicate number.
control_id
The sample_id of the control used for this sample.
fastq_read1
File name of fastq file, if paired-end this is read1.
fastq_read2
File name of read2 (for paired-end), not needed for single-end data.
### Credits
This example worklow is derived from original scripts kindly contributed by the Bioinformatic Core Facility (BICF), Department of Bioinformatics
### References
This worklow is was developed jointly with the [Bioinformatic Core Facility (BICF), Department of Bioinformatics](http://www.utsouthwestern.edu/labs/bioinformatics/)
* ChipSeeker: http://bioconductor.org/packages/release/bioc/html/ChIPseeker.html
* DiffBind: http://bioconductor.org/packages/release/bioc/html/DiffBind.html
* Deeptools: https://deeptools.github.io/
* MEME-ChIP: http://meme-suite.org/doc/meme-chip.html
Please cite in publications: Pipeline was developed by BICF from funding provided by **Cancer Prevention and Research Institute of Texas (RP150596)**.
### References
1. **python**:
* Anaconda (Anaconda Software Distribution, [https://anaconda.com](https://anaconda.com))
2. **trimgalore**:
* trimgalore [https://github.com/FelixKrueger/TrimGalore](https://github.com/FelixKrueger/TrimGalore)
3. **cutadapt**:
* Marcel, M. 2011. Cutadapt removes adapter sequences from high-throughput sequencing reads. EMBnet.journal 17(1):10-12. doi:[10.14806/ej.17.1.200](http://dx.doi.org/10.14806/ej.17.1.200)
4. **bwa**:
* Li H., and R. Durbin. 2009. Fast and accurate short read alignment with Burrows-Wheeler Transform. Bioinformatics 25: 1754-60. doi:[10.1093/bioinformatics/btp324](http://dx.doi.org/10.1093/bioinformatics/btp324)
5. **samtools**:
* Li H., B. Handsaker, A. Wysoker, T. Fennell, J. Ruan, N. Homer, G. Marth, G. Abecasis, R. Durbin, and 1000 Genome Project Data Processing Subgroup. 2009. The Sequence alignment/map (SAM) format and SAMtools. Bioinformatics 25: 2078-9. doi:[10.1093/bioinformatics/btp352](http://dx.doi.org/10.1093/bioinformatics/btp352)
6. **sambamba**:
* Tarasov, A., A. J. Vilella, E. Cuppen, I. J. Nijman, and P. Prins. 2015 Sambamba: fast processing of NGS alignment formats. Bioinformatics 31(12): 2032-2034. doi:[10.1093/bioinformatics/btv098](http://dx.doi.org/10.1093/bioinformatics/btv098)
7. **bedtools**:
* Quinlan, A. R., and I. M. Hall. 2010. BEDTools: a flexible suite of utilities for comparing genomic feautures. Bioinformatics 26(6): 841-842. doi:[10.1093/bioinformatics/btq033](http://dx.doi.org/10.1093/bioinformatics/btq033)
8. **deeptools**:
* Ramírez, F., D. P. Ryan, B. Grüning, V. Bhardwaj, F. Kilpert, A. S. Richter, S. Heyne, F. Dündar, and T. Manke. 2016. deepTools2: a next generation web server for deep-sequencing data analysis. Nucleic Acids Research 44: W160-165. doi:[10.1093/nar/gkw257](http://dx.doi.org/10.1093/nar/gkw257)
9. **phantompeakqualtools**:
* Landt S. G., G. K. Marinov, A. Kundaje, et al. 2012. ChIP-seq guidelines and practices of the ENCODE and modENCODE consortia. Genome Res 9: 1813-31. doi:[10.1101/gr.136184.111](http://dx.doi.org/10.1101/gr.136184.111)
* Kharchenko P. K., M. Y. Tolstorukov, and P. J. Park. 2008. Design and analysis of ChIP-seq experiments for DNA-binding proteins. Nat Biotechnol 26(12): 1351-1359. doi:[10.1038/nbt.1508](https://dx.doi.org/10.1038/nbt.1508)
10. **macs**:
* Zhang Y., T. Liu, C. A. Meyer, J. Eeckhoute, D. S. Johnson, B. E. Bernstein, C. Nusbaum, R. M. Myers, M. Brown, W. Li, and X. S. Liu. 2008. Model-based Analysis of ChIP-Seq (MACS). Genome Biol 9: R137. doi:[10.1186/gb-2008-9-9-r137](https://dx.doi.org/10.1186/gb-2008-9-9-r137)
11. **UCSC(bedGraphToBigWig)**:
* Kent W. J., A. S. Zweig, G. Barber, A. S. Hinrichs, and D. Karolchik. BigWig and BigBed: enabling browsing of large distributed data sets. Bioinformatics 26(17): 2204-2207. doi:[10.1093/bioinformatics/btq351](https://dx.doi.org/10.1093/bioinformatics/btq351)
12. **R**:
* R Core Team 2014. R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL:[http://www.R-project.org/](http://www.R-project.org/).
13. **meme**:
* Bailey T. L., M. Bodén, F. A. Buske, M. Frith, C. E. Grant, L. Clementi, J. Ren, W. W. Li, and W. S. Noble. 2009. MEME SUITE: tools for motif discovery and searching. Nucleic Acids Research 37: W202-W208. doi:[10.1093/nar/gkp335](https://dx.doi.org/10.1093/nar/gkp335)
* Machanick P., and T. L. Bailey. 2011. MEME-ChIP: motif analysis of large DNA datasets. Bioinformatics 27(12): 1696-1697. doi:[10.1093/bioinformatics/btr189](https://dx.doi.org/10.1093/bioinformatics/btr189)
14. **ChIPseeker**:
* Yu G., L. Wang, and Q. He. 2015. ChIPseeker: an R/Bioconductor package for ChIP peak annotation, comparison and visualization. Bioinformatics 31(14): 2382-2383. doi:[10.1093/bioinformatics/btv145](https://dx.doi.org/10.1093/bioinformatics/btv145)
15. **DiffBind**:
* Stark R., and G. Brown. 2011. DiffBind: differential binding analysis of ChIP-Seq peak data. [http://bioconductor.org/packages/release/bioc/vignettes/DiffBind/inst/doc/DiffBind.pdf](http://bioconductor.org/packages/release/bioc/vignettes/DiffBind/inst/doc/DiffBind.pdf). doi:[10.18129/B9.bioc.DiffBind](https://dx.doi.org/10.18129/B9.bioc.DiffBind)
* Ross-Innes C. S., R. Stark, A. E. Teschendorff, K. A. Holmes, H. R. Ali, M. J. Dunning, G. D. Brown, O. Gojis, I. O. Ellis, A. R. Green, S. Ali, S. Chin, C. Palmieri, C. Caldas, and J. S. Carroll. 2012. Differential oestrogen receptor binding is associated with clinical outcome in breast cancer. Nature 481: 389-393. doi:[10.1038/nature10730](https://dx.doi.org/10.1038/nature10730)
16. **MultiQc**:
* Ewels P., Magnusson M., Lundin S. and Käller M. 2016. MultiQC: Summarize analysis results for multiple tools and samples in a single report. Bioinformatics 32(19): 3047–3048. doi:[10.1093/bioinformatics/btw354 ](https://dx.doi.org/10.1093/bioinformatics/btw354)
17. **BICF ChIP-seq Analysis Workflow**:
* Venkat S. Malladi and Beibei Chen. (2019). BICF ChIP-seq Analysis Workflow (publish_1.0.0). Zenodo. doi:[10.5281/zenodo.2648845](https://doi.org/10.5281/zenodo.2648845)
Please cite in publications: Pipeline was developed by BICF from funding provided by **Cancer Prevention and Research Institute of Texas (RP150596)**.
## Create new env in specific folder
```shell
conda create -p /project/shared/bicf_workflow_ref/chipseq_bchen4/ -c r r-essentials
#Add channels
conda config --add channels conda-forge
conda config --add channels r
conda config --add channels bioconda
pip install --user twobitreader
conda install -c r r-xml
```
Install bioconductor in R console:
```R
source("http://bioconductor.org/biocLite.R")
biocLite()
biocLite(c("DiffBind","ChIPseeker"))
```
\ No newline at end of file
......@@ -41,7 +41,7 @@ process {
executor = 'local'
}
withName: callPeaksMACS {
module = ['python/3.6.1-2-anaconda', 'macs/2.1.0-20151222', 'phantompeakqualtools/1.2', 'UCSC_userApps/v317', 'bedtools/2.26.0']
module = ['python/3.6.1-2-anaconda', 'macs/2.1.0-20151222', 'UCSC_userApps/v317', 'bedtools/2.26.0', 'phantompeakqualtools/1.2']
queue = '128GB,256GB,256GBv1'
}
withName: consensusPeaks {
......@@ -60,7 +60,10 @@ process {
module = ['python/3.6.1-2-anaconda', 'meme/4.11.1-gcc-openmpi', 'bedtools/2.26.0']
cpus = 32
}
withName: multiqcReport {
module = ['python/3.6.1-2-anaconda', 'pandoc/2.7', 'multiqc/1.7']
executor = 'local'
}
}
params {
......
# Title to use for the report.
title: BICF ChIP-seq Analysis Report
report_comment: >
This report has been generated by the <a href="https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/" target="_blank">BICF/chipseq_analysis</a>
pipeline.
extra_fn_clean_exts:
- 'pbc.qc'
- 'cc.qc'
fn_ignore_files:
- '*dedup.flagstat.qc'
custom_data:
library_complexity:
file_format: 'tsv'
id: 'library_complexity'
contents: 'TotalReadPairs DistinctReadPairs OneReadPair TwoReadPairs NRF PBC1 PBC2'
section_name: 'Library complexity'
plot_type: 'generalstats'
pconfig:
TotalReadPairs:
decimalPlaces: 0
shared_key: read_count
DistinctReadPairs:
decimalPlaces: 0
shared_key: read_count
NRF:
decimalPlaces: 2
PBC1:
decimalPlaces: 2
PBC2:
decimalPlaces: 2
sp:
phantompeakqualtools/out:
fn: '*cc.qc'
library_complexity:
fn: '*pbc.qc'
macs2:
fn: '*_peaks.xls'
report_section_order:
cutadapt:
order: -1000
Samtools:
order: -1100
Software_Versions:
order: -1200
Software_References:
order: -1300
table_columns_placement:
library_complexity:
TotalReadPairs: 1100
DistinctReadPairs: 1200
NRF: 1300
PBC1: 1400
PBC2: 1500
phantompeakqualtools:
Estimated_Fragment_Length_bp: 1600
NSC: 1700
RSC: 1800
table_columns_visible:
cutadapt:
percent_trimmed: False
library_complexity:
OneReadPair: False
TwoReadPairs: False
table_cond_formatting_rules:
library_complexity_mqc-generalstats-library_complexity-NRF:
pass:
- gt: 0.8
warn:
- lt: 0.8
fail:
- lt: 0.5
library_complexity_mqc-generalstats-library_complexity-PBC1:
pass:
- gt: 0.8
warn:
- lt: 0.8
fail:
- lt: 0.5
library_complexity_mqc-generalstats-library_complexity-PBC2:
pass:
- gt: 3
warn:
- lt: 3
fail:
- lt: 1
thousandsSep_format: ''
......@@ -8,15 +8,36 @@ params.reads = "$baseDir/../test_data/*.fastq.gz"
params.pairedEnd = false
params.designFile = "$baseDir/../test_data/design_ENCSR238SGC_SE.txt"
params.genome = 'GRCm38'
params.genomes = []
params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false
params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ?: false : false
params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false
params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false
params.cutoffRatio = 1.2
params.outDir= "$baseDir/output"
params.extendReadsLen = 100
params.topPeakCount = 600
params.astrocyte = false
params.skipDiff = false
params.skipMotif = false
params.references = "$baseDir/../docs/references.md"
params.multiqc = "$baseDir/conf/multiqc_config.yaml"
// Assign variables if astrocyte
if (params.astrocyte) {
print("Running under astrocyte")
referenceLocation = "/project/shared/bicf_workflow_ref"
params.bwaIndex = "$referenceLocation/$params.genome"
params.chromSizes = "$referenceLocation/$params.genome/genomefile.txt"
params.fasta = "$referenceLocation/$params.genome/genome.fa.txt"
if (params.genome == 'GRCh37' || params.genome == 'GRCh38') {
params.genomeSize = 'hs'
} else if (params.genome == 'GRCm38') {
params.genomeSize = 'mm'
}
} else {
params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false
params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ?: false : false
params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false
params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false
}
// Check inputs
if( params.bwaIndex ){
......@@ -45,6 +66,10 @@ cutoffRatio = params.cutoffRatio
outDir = params.outDir
extendReadsLen = params.extendReadsLen
topPeakCount = params.topPeakCount
skipDiff = params.skipDiff
skipMotif = params.skipMotif
references = params.references
multiqc = params.multiqc
// Check design file for errors
process checkDesignFile {
......@@ -64,11 +89,13 @@ process checkDesignFile {
if (pairedEnd) {
"""
module load python/3.6.1-2-anaconda
python3 $baseDir/scripts/check_design.py -d $designFile -f $readsList -p
"""
}
else {
"""
module load python/3.6.1-2-anaconda
python $baseDir/scripts/check_design.py -d $designFile -f $readsList
"""
}
......@@ -90,7 +117,7 @@ rawReads = designFilePaths
process trimReads {
tag "$sampleId-$replicate"
publishDir "$outDir/${task.process}", mode: 'copy'
publishDir "$outDir/${task.process}/${sampleId}", mode: 'copy'
input:
......@@ -99,18 +126,23 @@ process trimReads {
output:
set sampleId, file('*.fq.gz'), experimentId, biosample, factor, treatment, replicate, controlId into trimmedReads
file('*trimming_report.txt') into trimgalore_results
file('*trimming_report.txt') into trimgaloreResults
file('version_*.txt') into trimReadsVersions
script:
if (pairedEnd) {
"""
python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -p
module load python/3.6.1-2-anaconda
module load trimgalore/0.4.1
python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -s $sampleId -p
"""
}
else {
"""
python3 $baseDir/scripts/trim_reads.py -f ${reads[0]}
module load python/3.6.1-2-anaconda
module load trimgalore/0.4.1
python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} -s $sampleId
"""
}
......@@ -119,8 +151,9 @@ process trimReads {
// Align trimmed reads using bwa
process alignReads {
queue '128GB,256GB,256GBv1'
tag "$sampleId-$replicate"
publishDir "$outDir/${task.process}", mode: 'copy'
publishDir "$outDir/${task.process}/${sampleId}", mode: 'copy'
input:
......@@ -130,18 +163,25 @@ process alignReads {
output:
set sampleId, file('*.bam'), experimentId, biosample, factor, treatment, replicate, controlId into mappedReads
file '*.srt.bam.flagstat.qc' into mappedReadsStats
file '*.flagstat.qc' into mappedReadsStats
file('version_*.txt') into alignReadsVersions
script:
if (pairedEnd) {
"""
python3 $baseDir/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -p
module load python/3.6.1-2-anaconda
module load bwa/intel/0.7.12
module load samtools/1.6
python3 $baseDir/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -s $sampleId -p
"""
}
else {
"""
python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa
module load python/3.6.1-2-anaconda
module load bwa/intel/0.7.12
module load samtools/1.6
python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa -s $sampleId
"""
}
......@@ -150,8 +190,9 @@ process alignReads {
// Dedup reads using sambamba
process filterReads {
queue '128GB,256GB,256GBv1'
tag "$sampleId-$replicate"
publishDir "$outDir/${task.process}", mode: 'copy'
publishDir "$outDir/${task.process}/${sampleId}", mode: 'copy'
input:
......@@ -161,19 +202,28 @@ process filterReads {
set sampleId, file('*.bam'), file('*.bai'), experimentId, biosample, factor, treatment, replicate, controlId into dedupReads
set sampleId, file('*.bam'), experimentId, biosample, factor, treatment, replicate, controlId into convertReads
file '*flagstat.qc' into dedupReadsStats
file '*pbc.qc' into dedupReadsComplexity
file '*dup.qc' into dupReads
file '*.flagstat.qc' into dedupReadsStats
file '*.pbc.qc' into dedupReadsComplexity
file '*.dedup.qc' into dupReads
file('version_*.txt') into filterReadsVersions
script:
if (pairedEnd) {
"""
module load python/3.6.1-2-anaconda
module load samtools/1.6
module load sambamba/0.6.6
module load bedtools/2.26.0
python3 $baseDir/scripts/map_qc.py -b $mapped -p
"""
}
else {
"""
module load python/3.6.1-2-anaconda
module load samtools/1.6
module load sambamba/0.6.6
module load bedtools/2.26.0
python3 $baseDir/scripts/map_qc.py -b $mapped
"""
}
......@@ -190,6 +240,7 @@ dedupReads
// Quality Metrics using deeptools
process experimentQC {
queue '128GB,256GB,256GBv1'
publishDir "$outDir/${task.process}", mode: 'copy'
input:
......@@ -198,11 +249,14 @@ process experimentQC {
output:
file '*.{png,npz}' into deepToolsStats
file '*.{pdf,npz}' into experimentQCStats
file('version_*.txt') into experimentQCVersions
script:
"""
module load python/3.6.1-2-anaconda
module load deeptools/2.5.0.1
python3 $baseDir/scripts/experiment_qc.py -d $dedupDesign -e $extendReadsLen
"""
......@@ -211,8 +265,9 @@ process experimentQC {
// Convert reads to bam
process convertReads {
queue '128GB,256GB,256GBv1'
tag "$sampleId-$replicate"
publishDir "$outDir/${task.process}", mode: 'copy'
publishDir "$outDir/${task.process}/${sampleId}", mode: 'copy'
input:
......@@ -221,16 +276,23 @@ process convertReads {
output:
set sampleId, file('*.tagAlign.gz'), file('*.bed{pe,se}.gz'), experimentId, biosample, factor, treatment, replicate, controlId into tagReads
file('version_*.txt') into convertReadsVersions
script:
if (pairedEnd) {
"""
module load python/3.6.1-2-anaconda
module load samtools/1.6
module load bedtools/2.26.0
python3 $baseDir/scripts/convert_reads.py -b $deduped -p
"""
}
else {
"""
module load python/3.6.1-2-anaconda
module load samtools/1.6
module load bedtools/2.26.0
python3 $baseDir/scripts/convert_reads.py -b $deduped
"""
}
......@@ -241,7 +303,7 @@ process convertReads {
process crossReads {
tag "$sampleId-$replicate"
publishDir "$outDir/${task.process}", mode: 'copy'
publishDir "$outDir/${task.process}/${sampleId}", mode: 'copy'
input:
......@@ -250,17 +312,22 @@ process crossReads {
output:
set sampleId, seTagAlign, tagAlign, file('*.cc.qc'), experimentId, biosample, factor, treatment, replicate, controlId into xcorReads
set file('*.cc.qc'), file('*.cc.plot.pdf') into xcorReadsStats
set file('*.cc.qc'), file('*.cc.plot.pdf') into crossReadsStats
file('version_*.txt') into crossReadsVersions
script:
if (pairedEnd) {
"""
module load python/3.6.1-2-anaconda
module load phantompeakqualtools/1.2
python3 $baseDir/scripts/xcor.py -t $seTagAlign -p
"""
}
else {
"""
module load python/3.6.1-2-anaconda
module load phantompeakqualtools/1.2
python3 $baseDir/scripts/xcor.py -t $seTagAlign
"""
}
......@@ -289,6 +356,7 @@ process defineExpDesignFiles {
script:
"""
module load python/3.6.1-2-anaconda
python3 $baseDir/scripts/experiment_design.py -d $xcorDesign
"""
......@@ -314,11 +382,13 @@ process poolAndPsuedoReads {
if (pairedEnd) {
"""
module load python/3.6.1-2-anaconda
python3 $baseDir/scripts/pool_and_psuedoreplicate.py -d $experimentObjs -c $cutoffRatio -p
"""
}
else {
"""
module load python/3.6.1-2-anaconda
python3 $baseDir/scripts/pool_and_psuedoreplicate.py -d $experimentObjs -c $cutoffRatio
"""
}
......@@ -334,7 +404,7 @@ experimentRows = experimentPoolObjs
process callPeaksMACS {
tag "$sampleId-$replicate"
publishDir "$outDir/${task.process}", mode: 'copy'
publishDir "$outDir/${task.process}/${experimentId}/${replicate}", mode: 'copy'
input:
set sampleId, tagAlign, xcor, experimentId, biosample, factor, treatment, replicate, controlId, controlTagAlign from experimentRows
......@@ -342,16 +412,28 @@ process callPeaksMACS {
output:
set sampleId, file('*.narrowPeak'), file('*.fc_signal.bw'), file('*.pvalue_signal.bw'), experimentId, biosample, factor, treatment, replicate, controlId into experimentPeaks
file '*.xls' into callPeaksMACSsummit
file('version_*.txt') into callPeaksMACSVersions
script:
if (pairedEnd) {
"""
module load python/3.6.1-2-anaconda
module load macs/2.1.0-20151222
module load UCSC_userApps/v317
module load bedtools/2.26.0
module load phantompeakqualtools/1.2
python3 $baseDir/scripts/call_peaks_macs.py -t $tagAlign -x $xcor -c $controlTagAlign -s $sampleId -g $genomeSize -z $chromSizes -p
"""
}
else {
"""
module load python/3.6.1-2-anaconda
module load macs/2.1.0-20151222
module load UCSC_userApps/v317
module load bedtools/2.26.0
module load phantompeakqualtools/1.2
python3 $baseDir/scripts/call_peaks_macs.py -t $tagAlign -x $xcor -c $controlTagAlign -s $sampleId -g $genomeSize -z $chromSizes
"""
}
......@@ -368,6 +450,7 @@ peaksDesign = experimentPeaks
process consensusPeaks {
publishDir "$outDir/${task.process}", mode: 'copy'
publishDir "$outDir/design", mode: 'copy', pattern: '*.{csv|tsv}'
input:
......@@ -381,10 +464,13 @@ process consensusPeaks {
file 'design_diffPeaks.csv' into designDiffPeaks
file 'design_annotatePeaks.tsv' into designAnnotatePeaks, designMotifSearch
file 'unique_experiments.csv' into uniqueExperiments
file('version_*.txt') into consensusPeaksVersions
script:
"""
module load python/3.6.1-2-anaconda
module load bedtools/2.26.0
python3 $baseDir/scripts/overlap_peaks.py -d $peaksDesign -f $preDiffDesign
"""
......@@ -393,7 +479,7 @@ process consensusPeaks {
// Annotate Peaks
process peakAnnotation {
publishDir "$baseDir/output/${task.process}", mode: 'copy'
publishDir "$outDir/${task.process}", mode: 'copy'
input:
......@@ -402,19 +488,21 @@ process peakAnnotation {
output:
file "*chipseeker*" into peakAnnotation
file('version_*.txt') into peakAnnotationVersions
script:
"""
module load R/3.3.2-gccmkl
Rscript $baseDir/scripts/annotate_peaks.R $designAnnotatePeaks $genome
"""
}
// Motif Search Peaks
// Motif Search Peaks
process motifSearch {
publishDir "$baseDir/output/${task.process}", mode: 'copy'
publishDir "$outDir/${task.process}", mode: 'copy'
input:
......@@ -423,11 +511,17 @@ process motifSearch {
output:
file "*memechip" into motifSearch
file "sorted-*" into filteredPeaks
file "*narrowPeak" into filteredPeaks
file('version_*.txt') into motifSearchVersions
when:
!skipMotif
script:
"""
module load R/3.3.2-gccmkl
python3 $baseDir/scripts/motif_search.py -d $designMotifSearch -g $fasta -p $topPeakCount
"""
}
......@@ -439,7 +533,7 @@ uniqueExperimentsList = uniqueExperiments
// Calculate Differential Binding Activity
process diffPeaks {
publishDir "$baseDir/output/${task.process}", mode: 'copy'
publishDir "$outDir/${task.process}", mode: 'copy'
input:
......@@ -452,13 +546,62 @@ process diffPeaks {
file '*_diffbind.csv' into diffPeaksCounts
file '*.pdf' into diffPeaksStats
file 'normcount_peaksets.txt' into normCountPeaks
file('version_*.txt') into diffPeaksVersions
when:
noUniqueExperiments > 1
noUniqueExperiments > 1 && !skipDiff
script:
"""
module load python/3.6.1-2-anaconda
module load meme/4.11.1-gcc-openmpi
module load bedtools/2.26.0
Rscript $baseDir/scripts/diff_peaks.R $designDiffPeaks
"""
}
// Generate Multiqc Report, gerernate Software Versions and references
process multiqcReport {
publishDir "$outDir/${task.process}", mode: 'copy'
input:
file ('trimReads_vf/*') from trimReadsVersions.first()
file ('alignReads_vf/*') from alignReadsVersions.first()
file ('filterReads_vf/*') from filterReadsVersions.first()
file ('convertReads_vf/*') from convertReadsVersions.first()
file ('crossReads_vf/*') from crossReadsVersions.first()
file ('callPeaksMACS_vf/*') from callPeaksMACSVersions.first()
file ('consensusPeaks_vf/*') from consensusPeaksVersions.first()
file ('peakAnnotation_vf/*') from peakAnnotationVersions.first()
file ('motifSearch_vf/*') from motifSearchVersions.first().ifEmpty()
file ('diffPeaks_vf/*') from diffPeaksVersions.first().ifEmpty()
file ('experimentQC_vf/*') from experimentQCVersions.first()
file ('trimReads/*') from trimgaloreResults.collect()
file ('alignReads/*') from mappedReadsStats.collect()
file ('filterReads/*') from dedupReadsComplexity.collect()
file ('crossReads/*') from crossReadsStats.collect()
output:
file('software_versions_mqc.yaml') into softwareVersions
file('software_references_mqc.yaml') into softwareReferences
file "multiqc_report.html" into multiqcReport
file "*_data" into multiqcData
script:
"""
module load python/3.6.1-2-anaconda
module load pandoc/2.7
module load multiqc/1.7
echo $workflow.nextflow.version > version_nextflow.txt
multiqc --version > version_multiqc.txt
python3 $baseDir/scripts/generate_references.py -r $references -o software_references
python3 $baseDir/scripts/generate_versions.py -o software_versions
multiqc -c $multiqc .
"""
}
......@@ -3,3 +3,12 @@ profiles {
includeConfig 'conf/biohpc.config'
}
}
manifest {
name = 'chipseq_analysis'
description = 'BICF ChIP-seq Analysis Workflow.'
homePage = 'https://github.com/nf-core/rnaseq'
version = '1.0.0'
mainScript = 'main.nf'
nextflowVersion = '>=0.31.0'
}
......@@ -17,7 +17,7 @@ args <- commandArgs(trailingOnly=TRUE)
# Check input args
if (length(args) != 2) {
stop("Usage: annotate_peaks.R [ annotate_design.tsv ] [ genome_assembly ]", call.=FALSE)
stop("Usage: annotate_peaks.R annotate_design.tsv genome_assembly", call.=FALSE)
}
design_file <- args[1]
......@@ -35,6 +35,11 @@ if(genome_assembly=='GRCh37') {
annodb <- 'org.Hs.eg.db'
}
# Output version of ChIPseeker
chipseeker_version = packageVersion('ChIPseeker')
write.table(paste("Version", chipseeker_version), file = "version_ChIPseeker.txt", sep = "\t",
row.names = FALSE, col.names = FALSE)
# Load design file
design <- read.csv(design_file, sep ='\t')
files <- as.list(as.character(design$Peaks))
......
......@@ -5,8 +5,8 @@
import os
import argparse
import shutil
import subprocess
import logging
from multiprocessing import cpu_count
import utils
from xcor import xcor as calculate_xcor
......@@ -70,16 +70,19 @@ def check_tools():
logger.info('Checking for required libraries and components on this system')
r_path = shutil.which("R")
if r_path:
logger.info('Found R: %s', r_path)
else:
logger.error('Missing R')
raise Exception('Missing R')
macs_path = shutil.which("macs2")
if r_path:
if macs_path:
logger.info('Found MACS2: %s', macs_path)
# Get Version
macs_version_command = "macs2 --version"
macs_version = subprocess.check_output(macs_version_command, shell=True, stderr=subprocess.STDOUT)
# Write to file
macs_file = open("version_macs.txt", "wb")
macs_file.write(macs_version)
macs_file.close()
else:
logger.error('Missing MACS2')
raise Exception('Missing MACS2')
......@@ -87,6 +90,18 @@ def check_tools():
bg_bw_path = shutil.which("bedGraphToBigWig")
if bg_bw_path:
logger.info('Found bedGraphToBigWig: %s', bg_bw_path)
# Get Version
bg_bw_version_command = "bedGraphToBigWig"
try:
subprocess.check_output(bg_bw_version_command, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
bg_bw_version = e.output
# Write to file
bg_bw_file = open("version_bedGraphToBigWig.txt", "wb")
bg_bw_file.write(bg_bw_version)
bg_bw_file.close()
else:
logger.error('Missing bedGraphToBigWig')
raise Exception('Missing bedGraphToBigWig')
......@@ -94,12 +109,23 @@ def check_tools():
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes):
'''Call peaks and signal tracks'''
# Extract the fragment length estimate from column 3 of the
# cross-correlation scores file
......@@ -107,8 +133,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
firstline = xcor_fh.readline()
frag_lengths = firstline.split()[2] # third column
fragment_length = frag_lengths.split(',')[0] # grab first value
logger.info("Fraglen %s" % (fragment_length))
logger.info("Fraglen %s", fragment_length)
# Generate narrow peaks and preliminary signal tracks
......@@ -119,18 +144,18 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
logger.info(command)
returncode = utils.block_on(command)
logger.info("MACS2 exited with returncode %d" % (returncode))
logger.info("MACS2 exited with returncode %d", returncode)
assert returncode == 0, "MACS2 non-zero return"
# MACS2 sometimes calls features off the end of chromosomes.
# Remove coordinates outside chromosome sizes
narrowpeak_fn = '%s_peaks.narrowPeak' % (prefix)
int_narrowpeak_fn = '%s_peaks.narrowPeak' % (prefix)
narrowpeak_fn = '%s.narrowPeak' % (prefix)
clipped_narrowpeak_fn = 'clipped-%s' % (narrowpeak_fn)
steps = ['slopBed -i %s -g %s -b 0' % (narrowpeak_fn, chrom_sizes),
'bedClip stdin %s %s' % (chrom_sizes, clipped_narrowpeak_fn)]
steps = ['slopBed -i %s -g %s -b 0' % (int_narrowpeak_fn, chrom_sizes),
'bedClip stdin %s %s' % (chrom_sizes, clipped_narrowpeak_fn)]
out, err = utils.run_pipe(steps)
......@@ -161,7 +186,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
logger.info(command)
returncode = utils.block_on(command)
logger.info("MACS2 exited with returncode %d" % (returncode))
logger.info("MACS2 exited with returncode %d", returncode)
assert returncode == 0, "MACS2 non-zero return"
# Remove coordinates outside chromosome sizes (MACS2 bug)
......@@ -169,7 +194,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
fc_bedgraph_sorted_fn = 'sorted-%s' % (fc_bedgraph_fn)
fc_signal_fn = "%s.fc_signal.bw" % (prefix)
steps = ['slopBed -i %s_FE.bdg -g %s -b 0' % (prefix, chrom_sizes),
'bedClip stdin %s %s' % (chrom_sizes, fc_bedgraph_fn)]
'bedClip stdin %s %s' % (chrom_sizes, fc_bedgraph_fn)]
out, err = utils.run_pipe(steps)
......@@ -185,7 +210,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
logger.info(command)
returncode = utils.block_on(command)
logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
logger.info("bedGraphToBigWig exited with returncode %d", returncode)
assert returncode == 0, "bedGraphToBigWig non-zero return"
# For -log10(p-value) signal tracks
......@@ -199,7 +224,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
sval = str(min(float(chip_reads), float(control_reads)) / 1000000)
logger.info("chip_reads = %s, control_reads = %s, sval = %s" %
(chip_reads, control_reads, sval))
(chip_reads, control_reads, sval))
command = 'macs2 bdgcmp ' + \
'-t %s_treat_pileup.bdg ' % (prefix) + \
......@@ -216,7 +241,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
pvalue_bedgraph_sorted_fn = 'sort-%s' % (pvalue_bedgraph_fn)
pvalue_signal_fn = "%s.pvalue_signal.bw" % (prefix)
steps = ['slopBed -i %s_ppois.bdg -g %s -b 0' % (prefix, chrom_sizes),
'bedClip stdin %s %s' % (chrom_sizes, pvalue_bedgraph_fn)]
'bedClip stdin %s %s' % (chrom_sizes, pvalue_bedgraph_fn)]
out, err = utils.run_pipe(steps)
......@@ -232,12 +257,13 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
logger.info(command)
returncode = utils.block_on(command)
logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
logger.info("bedGraphToBigWig exited with returncode %d", returncode)
assert returncode == 0, "bedGraphToBigWig non-zero return"
# Remove temporary files
os.remove(clipped_narrowpeak_fn)
os.remove(rescaled_narrowpeak_fn)
os.remove(int_narrowpeak_fn)
def main():
......
......@@ -72,6 +72,46 @@ def check_design_headers(design, paired):
raise Exception("Missing column headers: %s" % list(missing_headers))
def check_samples(design):
'''Check if design file has the correct sample name mapping.'''
logger.info("Running sample check.")
samples = design.groupby('sample_id') \
.apply(list)
malformated_samples = []
chars = set('-.')
for sample in samples.index.values:
if(any(char.isspace() for char in sample) | any((char in chars) for char in sample)):
malformated_samples.append(sample)
if len(malformated_samples) > 0:
logger.error('Malformed samples from design file: %s', list(malformated_samples))
raise Exception("Malformed samples from design file: %s" %
list(malformated_samples))
def check_experiments(design):
'''Check if design file has the correct experiment name mapping.'''
logger.info("Running experiment check.")
experiments = design.groupby('experiment_id') \
.apply(list)
malformated_experiments = []
chars = set('-.')
for experiment in experiments.index.values:
if(any(char.isspace() for char in experiment) | any((char in chars) for char in experiment)):
malformated_experiments.append(experiment)
if len(malformated_experiments) > 0:
logger.error('Malformed experiment from design file: %s', list(malformated_experiments))
raise Exception("Malformed experiment from design file: %s" %
list(malformated_experiments))
def check_controls(design):
'''Check if design file has the correct control mapping.'''
......@@ -146,8 +186,8 @@ def main():
logger.addHandler(handler)
# Read files as dataframes
design_df = pd.read_csv(args.design, sep='\t')
fastq_df = pd.read_csv(args.fastq, sep='\t', names=['name', 'path'])
design_df = pd.read_csv(design, sep='\t')
fastq_df = pd.read_csv(fastq, sep='\t', names=['name', 'path'])
# Check design file
check_design_headers(design_df, paired)
......
......@@ -54,6 +54,15 @@ def check_tools():
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
......@@ -61,6 +70,15 @@ def check_tools():
samtools_path = shutil.which("samtools")
if samtools_path:
logger.info('Found samtools: %s', samtools_path)
# Get Version
samtools_version_command = "samtools --version"
samtools_version = subprocess.check_output(samtools_version_command, shell=True)
# Write to file
samtools_file = open("version_samtools.txt", "wb")
samtools_file.write(samtools_version)
samtools_file.close()
else:
logger.error('Missing samtools')
raise Exception('Missing samtools')
......@@ -91,8 +109,7 @@ def convert_mapped_pe(bam, bam_basename):
out, err = utils.run_pipe([
"bamToBed -bedpe -mate1 -i %s" % (nmsrt_bam_filename),
"gzip -nc"],
outfile=bedpe_filename)
"gzip -nc"], outfile=bedpe_filename)
def main():
......@@ -109,7 +126,7 @@ def main():
# Convert PE or SE to tagAlign
bam_basename = os.path.basename(
utils.strip_extensions(bam, ['.bam']))
utils.strip_extensions(bam, ['.bam', '.dedup']))
tag_filename = bam_basename + '.tagAlign.gz'
convert_mapped(bam, tag_filename)
......@@ -117,7 +134,7 @@ def main():
if paired: # paired-end data
convert_mapped_pe(bam, bam_basename)
else:
bedse_filename = bam_basename + ".bedse.gz"
bedse_filename = bam_basename + ".bedse.gz"
shutil.copy(tag_filename, bedse_filename)
......
......@@ -8,9 +8,14 @@ args <- commandArgs(trailingOnly=TRUE)
# Check input args
if (length(args) != 1) {
stop("Usage: diff_peaks.R [ annotate_design.tsv ] ", call.=FALSE)
stop("Usage: diff_peaks.R annotate_design.tsv ", call.=FALSE)
}
# Output version of DiffBind
diffibind_version = packageVersion('DiffBind')
write.table(paste("Version", diffibind_version), file = "version_DiffBind.txt", sep = "\t",
row.names = FALSE, col.names = FALSE)
# Build DBA object from design file
data <- dba(sampleSheet=args[1])
data <- dba.count(data)
......
......@@ -7,8 +7,9 @@ import argparse
import logging
import subprocess
import shutil
import pandas as pd
from multiprocessing import cpu_count
import pandas as pd
EPILOG = '''
For more details:
......@@ -51,6 +52,15 @@ def check_tools():
deeptools_path = shutil.which("deeptools")
if deeptools_path:
logger.info('Found deeptools: %s', deeptools_path)
# Get Version
deeptools_version_command = "deeptools --version"
deeptools_version = subprocess.check_output(deeptools_version_command, shell=True, stderr=subprocess.STDOUT)
# Write to file
deeptools_file = open("version_deeptools.txt", "wb")
deeptools_file.write(deeptools_version)
deeptools_file.close()
else:
logger.error('Missing deeptools')
raise Exception('Missing deeptools')
......@@ -76,10 +86,10 @@ def generate_read_summary(design, extension):
return mbs_filename
def check_correlation(mbs):
def check_spearman_correlation(mbs):
'''Check Spearman pairwise correlation of samples based on read counts.'''
spearman_filename = 'heatmap_SpearmanCorr.png'
spearman_filename = 'heatmap_SpearmanCorr.pdf'
spearman_params = "--corMethod spearman --skipZero" + \
" --plotTitle \"Spearman Correlation of Read Counts\"" + \
" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers"
......@@ -95,12 +105,31 @@ def check_correlation(mbs):
out, err = spearman_correlation.communicate()
def check_pearson_correlation(mbs):
'''Check Pearson pairwise correlation of samples based on read counts.'''
pearson_filename = 'heatmap_PearsonCorr.pdf'
pearson_params = "--corMethod pearson --skipZero" + \
" --plotTitle \"Pearson Correlation of Read Counts\"" + \
" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers"
pearson_command = (
"plotCorrelation -in %s %s -o %s"
% (mbs, pearson_params, pearson_filename)
)
logger.info("Running deeptools with %s", pearson_command)
pearson_correlation = subprocess.Popen(pearson_command, shell=True)
out, err = pearson_correlation.communicate()
def check_coverage(design, extension):
'''Asses the sequencing depth of samples.'''
bam_files = ' '.join(design['bam_reads'])
labels = ' '.join(design['sample_id'])
coverage_filename = 'coverage.png'
coverage_filename = 'coverage.pdf'
coverage_params = "-n 1000000 --plotTitle \"Sample Coverage\"" + \
" --ignoreDuplicates --minMappingQuality 10"
......@@ -136,7 +165,7 @@ def update_controls(design):
def check_enrichment(sample_id, control_id, sample_reads, control_reads, extension):
'''Asses the enrichment per sample.'''
fingerprint_filename = sample_id + '_fingerprint.png'
fingerprint_filename = sample_id + '_fingerprint.pdf'
fingerprint_command = (
"plotFingerprint -b %s %s --extendReads %d --labels %s %s --plotFile %s"
......@@ -166,7 +195,8 @@ def main():
# Run correlation
mbs_filename = generate_read_summary(design_df, extension)
check_correlation(mbs_filename)
check_spearman_correlation(mbs_filename)
check_pearson_correlation(mbs_filename)
# Run coverage
check_coverage(design_df, extension)
......@@ -175,11 +205,11 @@ def main():
new_design_df = update_controls(design_df)
for index, row in new_design_df.iterrows():
check_enrichment(
row['sample_id'],
row['control_id'],
row['bam_reads'],
row['control_reads'],
extension)
row['sample_id'],
row['control_id'],
row['bam_reads'],
row['control_reads'],
extension)
if __name__ == '__main__':
......
#!/usr/bin/env python
'''Make header for HTML of references.'''
import argparse
import subprocess
import shlex
import logging
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-r', '--reference',
help="The reference file (markdown format).",
required=True)
parser.add_argument('-o', '--output',
help="The out file name.",
default='references')
args = parser.parse_args()
return args
def main():
args = get_args()
reference = args.reference
output = args.output
out_filename = output + '_mqc.yaml'
# Header for HTML
print('''
id: 'Software References'
section_name: 'Software References'
description: 'This section describes references for the tools used.'
plot_type: 'html'
data: |
'''
, file = open(out_filename, "w")
)
# Turn Markdown into HTML
references_html = 'bash -c "pandoc -p {} | sed \'s/^/ /\' >> {}"'
references_html = references_html.format(reference, out_filename)
subprocess.check_call(shlex.split(references_html))
if __name__ == '__main__':
main()