diff --git a/.gitignore b/.gitignore index 113294a5f18d979085b4ad570d8a22a8e4eabd58..eb94944ee1efc2ed6d8571d51ad1bb7245c354ed 100644 --- a/.gitignore +++ b/.gitignore @@ -97,3 +97,6 @@ ENV/ # mypy .mypy_cache/ + +# Mac OS +.DS_Store diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a54c5d7d11d173b292452f3906efd258c4b0582b..128756e1c26f5b1b7d4d9609675e67ca593c31f5 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,13 +1,15 @@ before_script: - module add python/3.6.1-2-anaconda - - pip install --user pytest-pythonpath pytest-cov + - pip install --user pytest-pythonpath==0.7.1 pytest-cov==2.5.1 - module load nextflow/0.31.0 - - ln -s /work/BICF/s163035/chipseq/*fastq.gz test_data/ + - ln -s /project/shared/bicf_workflow_ref/workflow_testdata/chipseq/*fastq.gz test_data/ stages: - unit + - astrocyte - single - multiple + - skip user_configuration: stage: unit @@ -15,34 +17,66 @@ user_configuration: - pytest -m unit - pytest -m unit --cov=./workflow/scripts +astrocyte: + stage: astrocyte + script: + - module load astrocyte/0.1.0 + - module unload nextflow + - cd .. + - astrocyte_cli validate chipseq_analysis + artifacts: + expire_in: 2 days + single_end_mouse: stage: single + only: + - master script: - - nextflow run workflow/main.nf -resume + - nextflow run workflow/main.nf --astrocyte true -resume - pytest -m singleend artifacts: expire_in: 2 days paired_end_human: stage: single + only: + - branches + except: + - master script: - - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_ENCSR729LGA_PE.txt" --genome 'GRCh38' --pairedEnd true -resume + - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_ENCSR729LGA_PE.txt" --genome 'GRCh38' --pairedEnd true --astrocyte false -resume - pytest -m pairedend artifacts: expire_in: 2 days single_end_diff: stage: multiple + only: + - branches + except: + - master script: - - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_SE.txt" --genome 'GRCm38' -resume + - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_SE.txt" --genome 'GRCm38' --astrocyte false -resume - pytest -m singlediff artifacts: expire_in: 2 days paired_end_diff: + only: + - master stage: multiple script: - - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_PE.txt" --genome 'GRCh38' --pairedEnd true -resume + - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_PE.txt" --genome 'GRCh38' --pairedEnd true --astrocyte false -resume - pytest -m paireddiff artifacts: expire_in: 2 days + +single_end_skip: + stage: skip + only: + - master + script: + - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_SE.txt" --genome 'GRCm38' --skipDiff true --skipMotif true --astrocyte false -resume + - pytest -m singleskip_true + artifacts: + expire_in: 2 days diff --git a/README.md b/README.md index ca40490f653e0f2454695d00ebdbb9f899e0a88b..439a531365138f51401d49abf8b0698cdddba525 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ [](https://www.nextflow.io/) [](https://astrocyte-test.biohpc.swmed.edu/static/docs/index.html) +[](https://doi.org/10.5281/zenodo.2648845) ## Introduction diff --git a/astrocyte_pkg.yml b/astrocyte_pkg.yml index 929e2316a228ee29b6cd6f8f23627d382b9de224..2c040e8fe3fd412d8daa8687672de4f8cbfb17e0 100644 --- a/astrocyte_pkg.yml +++ b/astrocyte_pkg.yml @@ -11,7 +11,7 @@ name: 'chipseq_analysis_bicf' # Who wrote this? author: 'Beibei Chen and Venkat Malladi' # A contact email address for questions -email: 'biohpc-help@utsouthwestern.edu' +email: 'bicf@utsouthwestern.edu' # A more informative title for the workflow package title: 'BICF ChIP-seq Analysis Workflow' # A summary of the workflow package in plain text @@ -27,7 +27,7 @@ description: | # web interface. These files are in the 'docs' subdirectory. The first file # listed will be used as a documentation index and is index.md by convention documentation_files: - - ['index.md', 'chipseq-analysis'] + - 'index.md' # ----------------------------------------------------------------------------- # NEXTFLOW WORKFLOW CONFIGURATION @@ -42,15 +42,17 @@ workflow_modules: - 'python/3.6.1-2-anaconda' - 'trimgalore/0.4.1' - 'bwa/intel/0.7.12' + - 'samtools/1.6' - 'sambamba/0.6.6' - 'bedtools/2.26.0' - 'deeptools/2.5.0.1' - 'phantompeakqualtools/1.2' - 'macs/2.1.0-20151222' - 'UCSC_userApps/v317' - - 'R/3.4.1-gccmkl' + - 'R/3.3.2-gccmkl' - 'meme/4.11.1-gcc-openmpi' - - 'python/2.7.x-anaconda' + - 'pandoc/2.7' + # A list of parameters used by the workflow, defining how to present them, # options etc in the web interface. For each parameter: @@ -92,20 +94,21 @@ workflow_parameters: description: | One or more input FASTQ files from a ChIP-seq expereiment and a design file with the link bewetwen the same file name and sample id - regex: ".*(fastq|fq)*" + regex: ".*(fastq|fq)*gz" + min: 2 - id: pairedEnd type: select required: true choices: - - [ 'true', 'True'] - - [ 'false', 'False'] + - [ 'true', 'true'] + - [ 'false', 'false'] description: | In single-end sequencing, the sequencer reads a fragment from only one end to the other, generating the sequence of base pairs. In paired-end reading it starts at one read, finishes this direction at the specified read length, and then starts another round of reading from the opposite - end of the fragment. + end of the fragment. (Paired-end: True, Single-end: False) - id: design type: file @@ -113,18 +116,45 @@ workflow_parameters: description: | A design file listing sample id, fastq files, corresponding control id and additional information about the sample. - regex: ".*tsv" + regex: ".*txt" - id: genome type: select + required: true choices: - [ 'GRCh38', 'Human GRCh38'] - [ 'GRCh37', 'Human GRCh37'] - [ 'GRCm38', 'Mouse GRCm38'] - required: true description: | Reference species and genome used for alignment and subsequent analysis. + - id: skipDiff + type: select + required: true + choices: + - [ 'true', 'true'] + - [ 'false', 'false'] + description: | + Run differential peak analysis + + - id: skipMotif + type: select + required: true + choices: + - [ 'true', 'true'] + - [ 'false', 'false'] + description: | + Run motif calling + + - id: astrocyte + type: select + choices: + - [ 'true', 'true' ] + required: true + default: 'true' + description: | + Ensure configuraton for astrocyte. + # ----------------------------------------------------------------------------- # SHINY APP CONFIGURATION @@ -144,8 +174,4 @@ vizapp_cran_packages: # List of any Bioconductor packages, not provided by the modules, # that must be made available to the vizapp -vizapp_bioc_packages: - - qusage -# - ballgown -vizapp_github_packages: - - js229/Vennerable +vizapp_bioc_packages: [] diff --git a/docs/design_example.csv b/docs/design_example.csv deleted file mode 100644 index 6fa48f3f3f61a8d70dada47f4cfa6c9c0bf1fbfd..0000000000000000000000000000000000000000 --- a/docs/design_example.csv +++ /dev/null @@ -1,7 +0,0 @@ -SampleID,Tissue,Factor,Condition,Replicate,Peaks,bamReads,bamControl,ControlID,PeakCaller -A_1,A,H3K27AC,A,1,A_1.broadPeak,A_1.bam,A_1_input.bam,A_1_input,bed -A_2,A,H3K27AC,A,2,A_2.broadPeak,A_2.bam,A_2_input.bam,A_2_input,bed -B_1,B,H3K27AC,B,1,B_1.broadPeak,B_1.bam,B_1_input.bam,B_1_input,bed -B_2,B,H3K27AC,B,2,B_2.broadPeak,B_2.bam,B_2_input.bam,B_2_input,bed -C_1,C,H3K27AC,C,1,C_1.broadPeak,C_1.bam,C_1_input.bam,C_1_input,bed -C_2,C,H3K27AC,C,2,C_2.broadPeak,C_2.bam,C_2_input.bam,C_2_input,bed diff --git a/docs/design_example.txt b/docs/design_example.txt new file mode 100644 index 0000000000000000000000000000000000000000..61bee7c96e760a90856a2e17b75df942344815e6 --- /dev/null +++ b/docs/design_example.txt @@ -0,0 +1,5 @@ +sample_id experiment_id biosample factor treatment replicate control_id fastq_read1 +A1 A tissueA H3K27AC None 1 B1 A1.fastq.gz +A2 A tissueA H3K27AC None 2 B2 A2.fastq.gz +B1 B tissueB Input None 1 B1 B1.fastq.gz +B2 A tissueB Input None 2 B2 B2.fastq.gz diff --git a/docs/index.md b/docs/index.md index bf486e6cb1c85632bf39e5848e77bc75d864b12a..2742e165db19ea79c4660d23e74f3cfe8bc8fc2a 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,4 +1,4 @@ -# Astrocyte ChIPseq analysis Workflow Package +# BICF ChIP-seq Analysis Workflow ## Introduction **ChIP-seq Analysis** is a bioinformatics best-practice analysis pipeline used for chromatin immunoprecipitation (ChIP-seq) data analysis. diff --git a/docs/references.md b/docs/references.md new file mode 100644 index 0000000000000000000000000000000000000000..ea99ec7cb1a6356dfea0062f965f418d4cc58b15 --- /dev/null +++ b/docs/references.md @@ -0,0 +1,57 @@ +### References + +1. **python**: + * Anaconda (Anaconda Software Distribution, [https://anaconda.com](https://anaconda.com)) + +2. **trimgalore**: + * trimgalore [https://github.com/FelixKrueger/TrimGalore](https://github.com/FelixKrueger/TrimGalore) + +3. **cutadapt**: + * Marcel, M. 2011. Cutadapt removes adapter sequences from high-throughput sequencing reads. EMBnet.journal 17(1):10-12. doi:[10.14806/ej.17.1.200](http://dx.doi.org/10.14806/ej.17.1.200) + +4. **bwa**: + * Li H., and R. Durbin. 2009. Fast and accurate short read alignment with Burrows-Wheeler Transform. Bioinformatics 25: 1754-60. doi:[10.1093/bioinformatics/btp324](http://dx.doi.org/10.1093/bioinformatics/btp324) + +5. **samtools**: + * Li H., B. Handsaker, A. Wysoker, T. Fennell, J. Ruan, N. Homer, G. Marth, G. Abecasis, R. Durbin, and 1000 Genome Project Data Processing Subgroup. 2009. The Sequence alignment/map (SAM) format and SAMtools. Bioinformatics 25: 2078-9. doi:[10.1093/bioinformatics/btp352](http://dx.doi.org/10.1093/bioinformatics/btp352) + +6. **sambamba**: + * Tarasov, A., A. J. Vilella, E. Cuppen, I. J. Nijman, and P. Prins. 2015 Sambamba: fast processing of NGS alignment formats. Bioinformatics 31(12): 2032-2034. doi:[10.1093/bioinformatics/btv098](http://dx.doi.org/10.1093/bioinformatics/btv098) + +7. **bedtools**: + * Quinlan, A. R., and I. M. Hall. 2010. BEDTools: a flexible suite of utilities for comparing genomic feautures. Bioinformatics 26(6): 841-842. doi:[10.1093/bioinformatics/btq033](http://dx.doi.org/10.1093/bioinformatics/btq033) + +8. **deeptools**: + * RamÃrez, F., D. P. Ryan, B. Grüning, V. Bhardwaj, F. Kilpert, A. S. Richter, S. Heyne, F. Dündar, and T. Manke. 2016. deepTools2: a next generation web server for deep-sequencing data analysis. Nucleic Acids Research 44: W160-165. doi:[10.1093/nar/gkw257](http://dx.doi.org/10.1093/nar/gkw257) + +9. **phantompeakqualtools**: + * Landt S. G., G. K. Marinov, A. Kundaje, et al. 2012. ChIP-seq guidelines and practices of the ENCODE and modENCODE consortia. Genome Res 9: 1813-31. doi:[10.1101/gr.136184.111](http://dx.doi.org/10.1101/gr.136184.111) + * Kharchenko P. K., M. Y. Tolstorukov, and P. J. Park. 2008. Design and analysis of ChIP-seq experiments for DNA-binding proteins. Nat Biotechnol 26(12): 1351-1359. doi:[10.1038/nbt.1508](https://dx.doi.org/10.1038/nbt.1508) + +10. **macs**: + * Zhang Y., T. Liu, C. A. Meyer, J. Eeckhoute, D. S. Johnson, B. E. Bernstein, C. Nusbaum, R. M. Myers, M. Brown, W. Li, and X. S. Liu. 2008. Model-based Analysis of ChIP-Seq (MACS). Genome Biol 9: R137. doi:[10.1186/gb-2008-9-9-r137](https://dx.doi.org/10.1186/gb-2008-9-9-r137) + +11. **UCSC(bedGraphToBigWig)**: + * Kent W. J., A. S. Zweig, G. Barber, A. S. Hinrichs, and D. Karolchik. BigWig and BigBed: enabling browsing of large distributed data sets. Bioinformatics 26(17): 2204-2207. doi:[10.1093/bioinformatics/btq351](https://dx.doi.org/10.1093/bioinformatics/btq351) + +12. **R**: + * R Core Team 2014. R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL:[http://www.R-project.org/](http://www.R-project.org/). + +13. **meme**: + * Bailey T. L., M. Bodén, F. A. Buske, M. Frith, C. E. Grant, L. Clementi, J. Ren, W. W. Li, and W. S. Noble. 2009. MEME SUITE: tools for motif discovery and searching. Nucleic Acids Research 37: W202-W208. doi:[10.1093/nar/gkp335](https://dx.doi.org/10.1093/nar/gkp335) + * Machanick P., and T. L. Bailey. 2011. MEME-ChIP: motif analysis of large DNA datasets. Bioinformatics 27(12): 1696-1697. doi:[10.1093/bioinformatics/btr189](https://dx.doi.org/10.1093/bioinformatics/btr189) + +14. **ChIPseeker**: + * Yu G., L. Wang, and Q. He. 2015. ChIPseeker: an R/Bioconductor package for ChIP peak annotation, comparison and visualization. Bioinformatics 31(14): 2382-2383. doi:[10.1093/bioinformatics/btv145](https://dx.doi.org/10.1093/bioinformatics/btv145) + +15. **DiffBind**: + * Stark R., and G. Brown. 2011. DiffBind: differential binding analysis of ChIP-Seq peak data. [http://bioconductor.org/packages/release/bioc/vignettes/DiffBind/inst/doc/DiffBind.pdf](http://bioconductor.org/packages/release/bioc/vignettes/DiffBind/inst/doc/DiffBind.pdf). doi:[10.18129/B9.bioc.DiffBind](https://dx.doi.org/10.18129/B9.bioc.DiffBind) + * Ross-Innes C. S., R. Stark, A. E. Teschendorff, K. A. Holmes, H. R. Ali, M. J. Dunning, G. D. Brown, O. Gojis, I. O. Ellis, A. R. Green, S. Ali, S. Chin, C. Palmieri, C. Caldas, and J. S. Carroll. 2012. Differential oestrogen receptor binding is associated with clinical outcome in breast cancer. Nature 481: 389-393. doi:[10.1038/nature10730](https://dx.doi.org/10.1038/nature10730) + +16. **MultiQc**: + * Ewels P., Magnusson M., Lundin S. and Käller M. 2016. MultiQC: Summarize analysis results for multiple tools and samples in a single report. Bioinformatics 32(19): 3047–3048. doi:[10.1093/bioinformatics/btw354 ](https://dx.doi.org/10.1093/bioinformatics/btw354) + +17. **BICF ChIP-seq Analysis Workflow**: + * Venkat S. Malladi and Beibei Chen. (2019). BICF ChIP-seq Analysis Workflow (publish_1.0.0). Zenodo. doi:[10.5281/zenodo.2648845](https://doi.org/10.5281/zenodo.2648845) + +Please cite in publications: Pipeline was developed by BICF from funding provided by **Cancer Prevention and Research Institute of Texas (RP150596)**. diff --git a/env.md b/env.md deleted file mode 100644 index 54176d86aa83a2fd8cf7825ad2fee7643c6234d3..0000000000000000000000000000000000000000 --- a/env.md +++ /dev/null @@ -1,17 +0,0 @@ -## Create new env in specific folder -```shell -conda create -p /project/shared/bicf_workflow_ref/chipseq_bchen4/ -c r r-essentials -#Add channels -conda config --add channels conda-forge -conda config --add channels r -conda config --add channels bioconda -pip install --user twobitreader -conda install -c r r-xml -``` - -Install bioconductor in R console: -```R -source("http://bioconductor.org/biocLite.R") -biocLite() -biocLite(c("DiffBind","ChIPseeker")) -``` \ No newline at end of file diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index 2c0f1c31a9c46b5f9f45699f3c1eb506d6c62e10..6e2e885661a011956aaf1bac1e4fb0da3969d686 100644 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -41,7 +41,7 @@ process { executor = 'local' } withName: callPeaksMACS { - module = ['python/3.6.1-2-anaconda', 'macs/2.1.0-20151222', 'phantompeakqualtools/1.2', 'UCSC_userApps/v317', 'bedtools/2.26.0'] + module = ['python/3.6.1-2-anaconda', 'macs/2.1.0-20151222', 'UCSC_userApps/v317', 'bedtools/2.26.0', 'phantompeakqualtools/1.2'] queue = '128GB,256GB,256GBv1' } withName: consensusPeaks { @@ -60,7 +60,10 @@ process { module = ['python/3.6.1-2-anaconda', 'meme/4.11.1-gcc-openmpi', 'bedtools/2.26.0'] cpus = 32 } - + withName: multiqcReport { + module = ['python/3.6.1-2-anaconda', 'pandoc/2.7', 'multiqc/1.7'] + executor = 'local' + } } params { diff --git a/workflow/conf/multiqc_config.yaml b/workflow/conf/multiqc_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bab17e5b675bc2ea8458f54cc3636b0c73f884f1 --- /dev/null +++ b/workflow/conf/multiqc_config.yaml @@ -0,0 +1,97 @@ +# Title to use for the report. +title: BICF ChIP-seq Analysis Report + +report_comment: > + This report has been generated by the <a href="https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/" target="_blank">BICF/chipseq_analysis</a> + pipeline. + +extra_fn_clean_exts: + - 'pbc.qc' + - 'cc.qc' + +fn_ignore_files: + - '*dedup.flagstat.qc' + +custom_data: + library_complexity: + file_format: 'tsv' + id: 'library_complexity' + contents: 'TotalReadPairs DistinctReadPairs OneReadPair TwoReadPairs NRF PBC1 PBC2' + section_name: 'Library complexity' + plot_type: 'generalstats' + pconfig: + TotalReadPairs: + decimalPlaces: 0 + shared_key: read_count + DistinctReadPairs: + decimalPlaces: 0 + shared_key: read_count + NRF: + decimalPlaces: 2 + PBC1: + decimalPlaces: 2 + PBC2: + decimalPlaces: 2 + +sp: + phantompeakqualtools/out: + fn: '*cc.qc' + library_complexity: + fn: '*pbc.qc' + macs2: + fn: '*_peaks.xls' + + +report_section_order: + cutadapt: + order: -1000 + Samtools: + order: -1100 + Software_Versions: + order: -1200 + Software_References: + order: -1300 + +table_columns_placement: + library_complexity: + TotalReadPairs: 1100 + DistinctReadPairs: 1200 + NRF: 1300 + PBC1: 1400 + PBC2: 1500 + phantompeakqualtools: + Estimated_Fragment_Length_bp: 1600 + NSC: 1700 + RSC: 1800 + +table_columns_visible: + cutadapt: + percent_trimmed: False + library_complexity: + OneReadPair: False + TwoReadPairs: False + +table_cond_formatting_rules: + library_complexity_mqc-generalstats-library_complexity-NRF: + pass: + - gt: 0.8 + warn: + - lt: 0.8 + fail: + - lt: 0.5 + library_complexity_mqc-generalstats-library_complexity-PBC1: + pass: + - gt: 0.8 + warn: + - lt: 0.8 + fail: + - lt: 0.5 + library_complexity_mqc-generalstats-library_complexity-PBC2: + pass: + - gt: 3 + warn: + - lt: 3 + fail: + - lt: 1 + +thousandsSep_format: '' diff --git a/workflow/main.nf b/workflow/main.nf index 29afb260c8dee81e663e4031fbf6625398eaa321..99474e515cf8d39101044ebdc139f20f308cfb07 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -8,15 +8,36 @@ params.reads = "$baseDir/../test_data/*.fastq.gz" params.pairedEnd = false params.designFile = "$baseDir/../test_data/design_ENCSR238SGC_SE.txt" params.genome = 'GRCm38' -params.genomes = [] -params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false -params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ?: false : false -params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false -params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false params.cutoffRatio = 1.2 params.outDir= "$baseDir/output" params.extendReadsLen = 100 params.topPeakCount = 600 +params.astrocyte = false +params.skipDiff = false +params.skipMotif = false +params.references = "$baseDir/../docs/references.md" +params.multiqc = "$baseDir/conf/multiqc_config.yaml" + +// Assign variables if astrocyte +if (params.astrocyte) { + print("Running under astrocyte") + referenceLocation = "/project/shared/bicf_workflow_ref" + params.bwaIndex = "$referenceLocation/$params.genome" + params.chromSizes = "$referenceLocation/$params.genome/genomefile.txt" + params.fasta = "$referenceLocation/$params.genome/genome.fa.txt" + if (params.genome == 'GRCh37' || params.genome == 'GRCh38') { + params.genomeSize = 'hs' + } else if (params.genome == 'GRCm38') { + params.genomeSize = 'mm' + } +} else { + params.bwaIndex = params.genome ? params.genomes[ params.genome ].bwa ?: false : false + params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ?: false : false + params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false + params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false +} + + // Check inputs if( params.bwaIndex ){ @@ -45,6 +66,10 @@ cutoffRatio = params.cutoffRatio outDir = params.outDir extendReadsLen = params.extendReadsLen topPeakCount = params.topPeakCount +skipDiff = params.skipDiff +skipMotif = params.skipMotif +references = params.references +multiqc = params.multiqc // Check design file for errors process checkDesignFile { @@ -64,11 +89,13 @@ process checkDesignFile { if (pairedEnd) { """ + module load python/3.6.1-2-anaconda python3 $baseDir/scripts/check_design.py -d $designFile -f $readsList -p """ } else { """ + module load python/3.6.1-2-anaconda python $baseDir/scripts/check_design.py -d $designFile -f $readsList """ } @@ -90,7 +117,7 @@ rawReads = designFilePaths process trimReads { tag "$sampleId-$replicate" - publishDir "$outDir/${task.process}", mode: 'copy' + publishDir "$outDir/${task.process}/${sampleId}", mode: 'copy' input: @@ -99,18 +126,23 @@ process trimReads { output: set sampleId, file('*.fq.gz'), experimentId, biosample, factor, treatment, replicate, controlId into trimmedReads - file('*trimming_report.txt') into trimgalore_results + file('*trimming_report.txt') into trimgaloreResults + file('version_*.txt') into trimReadsVersions script: if (pairedEnd) { """ - python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -p + module load python/3.6.1-2-anaconda + module load trimgalore/0.4.1 + python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -s $sampleId -p """ } else { """ - python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} + module load python/3.6.1-2-anaconda + module load trimgalore/0.4.1 + python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} -s $sampleId """ } @@ -119,8 +151,9 @@ process trimReads { // Align trimmed reads using bwa process alignReads { + queue '128GB,256GB,256GBv1' tag "$sampleId-$replicate" - publishDir "$outDir/${task.process}", mode: 'copy' + publishDir "$outDir/${task.process}/${sampleId}", mode: 'copy' input: @@ -130,18 +163,25 @@ process alignReads { output: set sampleId, file('*.bam'), experimentId, biosample, factor, treatment, replicate, controlId into mappedReads - file '*.srt.bam.flagstat.qc' into mappedReadsStats + file '*.flagstat.qc' into mappedReadsStats + file('version_*.txt') into alignReadsVersions script: if (pairedEnd) { """ - python3 $baseDir/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -p + module load python/3.6.1-2-anaconda + module load bwa/intel/0.7.12 + module load samtools/1.6 + python3 $baseDir/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -s $sampleId -p """ } else { """ - python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa + module load python/3.6.1-2-anaconda + module load bwa/intel/0.7.12 + module load samtools/1.6 + python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa -s $sampleId """ } @@ -150,8 +190,9 @@ process alignReads { // Dedup reads using sambamba process filterReads { + queue '128GB,256GB,256GBv1' tag "$sampleId-$replicate" - publishDir "$outDir/${task.process}", mode: 'copy' + publishDir "$outDir/${task.process}/${sampleId}", mode: 'copy' input: @@ -161,19 +202,28 @@ process filterReads { set sampleId, file('*.bam'), file('*.bai'), experimentId, biosample, factor, treatment, replicate, controlId into dedupReads set sampleId, file('*.bam'), experimentId, biosample, factor, treatment, replicate, controlId into convertReads - file '*flagstat.qc' into dedupReadsStats - file '*pbc.qc' into dedupReadsComplexity - file '*dup.qc' into dupReads + file '*.flagstat.qc' into dedupReadsStats + file '*.pbc.qc' into dedupReadsComplexity + file '*.dedup.qc' into dupReads + file('version_*.txt') into filterReadsVersions script: if (pairedEnd) { """ + module load python/3.6.1-2-anaconda + module load samtools/1.6 + module load sambamba/0.6.6 + module load bedtools/2.26.0 python3 $baseDir/scripts/map_qc.py -b $mapped -p """ } else { """ + module load python/3.6.1-2-anaconda + module load samtools/1.6 + module load sambamba/0.6.6 + module load bedtools/2.26.0 python3 $baseDir/scripts/map_qc.py -b $mapped """ } @@ -190,6 +240,7 @@ dedupReads // Quality Metrics using deeptools process experimentQC { + queue '128GB,256GB,256GBv1' publishDir "$outDir/${task.process}", mode: 'copy' input: @@ -198,11 +249,14 @@ process experimentQC { output: - file '*.{png,npz}' into deepToolsStats + file '*.{pdf,npz}' into experimentQCStats + file('version_*.txt') into experimentQCVersions script: """ + module load python/3.6.1-2-anaconda + module load deeptools/2.5.0.1 python3 $baseDir/scripts/experiment_qc.py -d $dedupDesign -e $extendReadsLen """ @@ -211,8 +265,9 @@ process experimentQC { // Convert reads to bam process convertReads { + queue '128GB,256GB,256GBv1' tag "$sampleId-$replicate" - publishDir "$outDir/${task.process}", mode: 'copy' + publishDir "$outDir/${task.process}/${sampleId}", mode: 'copy' input: @@ -221,16 +276,23 @@ process convertReads { output: set sampleId, file('*.tagAlign.gz'), file('*.bed{pe,se}.gz'), experimentId, biosample, factor, treatment, replicate, controlId into tagReads + file('version_*.txt') into convertReadsVersions script: if (pairedEnd) { """ + module load python/3.6.1-2-anaconda + module load samtools/1.6 + module load bedtools/2.26.0 python3 $baseDir/scripts/convert_reads.py -b $deduped -p """ } else { """ + module load python/3.6.1-2-anaconda + module load samtools/1.6 + module load bedtools/2.26.0 python3 $baseDir/scripts/convert_reads.py -b $deduped """ } @@ -241,7 +303,7 @@ process convertReads { process crossReads { tag "$sampleId-$replicate" - publishDir "$outDir/${task.process}", mode: 'copy' + publishDir "$outDir/${task.process}/${sampleId}", mode: 'copy' input: @@ -250,17 +312,22 @@ process crossReads { output: set sampleId, seTagAlign, tagAlign, file('*.cc.qc'), experimentId, biosample, factor, treatment, replicate, controlId into xcorReads - set file('*.cc.qc'), file('*.cc.plot.pdf') into xcorReadsStats + set file('*.cc.qc'), file('*.cc.plot.pdf') into crossReadsStats + file('version_*.txt') into crossReadsVersions script: if (pairedEnd) { """ + module load python/3.6.1-2-anaconda + module load phantompeakqualtools/1.2 python3 $baseDir/scripts/xcor.py -t $seTagAlign -p """ } else { """ + module load python/3.6.1-2-anaconda + module load phantompeakqualtools/1.2 python3 $baseDir/scripts/xcor.py -t $seTagAlign """ } @@ -289,6 +356,7 @@ process defineExpDesignFiles { script: """ + module load python/3.6.1-2-anaconda python3 $baseDir/scripts/experiment_design.py -d $xcorDesign """ @@ -314,11 +382,13 @@ process poolAndPsuedoReads { if (pairedEnd) { """ + module load python/3.6.1-2-anaconda python3 $baseDir/scripts/pool_and_psuedoreplicate.py -d $experimentObjs -c $cutoffRatio -p """ } else { """ + module load python/3.6.1-2-anaconda python3 $baseDir/scripts/pool_and_psuedoreplicate.py -d $experimentObjs -c $cutoffRatio """ } @@ -334,7 +404,7 @@ experimentRows = experimentPoolObjs process callPeaksMACS { tag "$sampleId-$replicate" - publishDir "$outDir/${task.process}", mode: 'copy' + publishDir "$outDir/${task.process}/${experimentId}/${replicate}", mode: 'copy' input: set sampleId, tagAlign, xcor, experimentId, biosample, factor, treatment, replicate, controlId, controlTagAlign from experimentRows @@ -342,16 +412,28 @@ process callPeaksMACS { output: set sampleId, file('*.narrowPeak'), file('*.fc_signal.bw'), file('*.pvalue_signal.bw'), experimentId, biosample, factor, treatment, replicate, controlId into experimentPeaks + file '*.xls' into callPeaksMACSsummit + file('version_*.txt') into callPeaksMACSVersions script: if (pairedEnd) { """ + module load python/3.6.1-2-anaconda + module load macs/2.1.0-20151222 + module load UCSC_userApps/v317 + module load bedtools/2.26.0 + module load phantompeakqualtools/1.2 python3 $baseDir/scripts/call_peaks_macs.py -t $tagAlign -x $xcor -c $controlTagAlign -s $sampleId -g $genomeSize -z $chromSizes -p """ } else { """ + module load python/3.6.1-2-anaconda + module load macs/2.1.0-20151222 + module load UCSC_userApps/v317 + module load bedtools/2.26.0 + module load phantompeakqualtools/1.2 python3 $baseDir/scripts/call_peaks_macs.py -t $tagAlign -x $xcor -c $controlTagAlign -s $sampleId -g $genomeSize -z $chromSizes """ } @@ -368,6 +450,7 @@ peaksDesign = experimentPeaks process consensusPeaks { publishDir "$outDir/${task.process}", mode: 'copy' + publishDir "$outDir/design", mode: 'copy', pattern: '*.{csv|tsv}' input: @@ -381,10 +464,13 @@ process consensusPeaks { file 'design_diffPeaks.csv' into designDiffPeaks file 'design_annotatePeaks.tsv' into designAnnotatePeaks, designMotifSearch file 'unique_experiments.csv' into uniqueExperiments + file('version_*.txt') into consensusPeaksVersions script: """ + module load python/3.6.1-2-anaconda + module load bedtools/2.26.0 python3 $baseDir/scripts/overlap_peaks.py -d $peaksDesign -f $preDiffDesign """ @@ -393,7 +479,7 @@ process consensusPeaks { // Annotate Peaks process peakAnnotation { - publishDir "$baseDir/output/${task.process}", mode: 'copy' + publishDir "$outDir/${task.process}", mode: 'copy' input: @@ -402,19 +488,21 @@ process peakAnnotation { output: file "*chipseeker*" into peakAnnotation + file('version_*.txt') into peakAnnotationVersions script: """ + module load R/3.3.2-gccmkl Rscript $baseDir/scripts/annotate_peaks.R $designAnnotatePeaks $genome """ } -// Motif Search Peaks +// Motif Search Peaks process motifSearch { - publishDir "$baseDir/output/${task.process}", mode: 'copy' + publishDir "$outDir/${task.process}", mode: 'copy' input: @@ -423,11 +511,17 @@ process motifSearch { output: file "*memechip" into motifSearch - file "sorted-*" into filteredPeaks + file "*narrowPeak" into filteredPeaks + file('version_*.txt') into motifSearchVersions + + when: + + !skipMotif script: """ + module load R/3.3.2-gccmkl python3 $baseDir/scripts/motif_search.py -d $designMotifSearch -g $fasta -p $topPeakCount """ } @@ -439,7 +533,7 @@ uniqueExperimentsList = uniqueExperiments // Calculate Differential Binding Activity process diffPeaks { - publishDir "$baseDir/output/${task.process}", mode: 'copy' + publishDir "$outDir/${task.process}", mode: 'copy' input: @@ -452,13 +546,62 @@ process diffPeaks { file '*_diffbind.csv' into diffPeaksCounts file '*.pdf' into diffPeaksStats file 'normcount_peaksets.txt' into normCountPeaks + file('version_*.txt') into diffPeaksVersions when: - noUniqueExperiments > 1 + noUniqueExperiments > 1 && !skipDiff script: + """ + module load python/3.6.1-2-anaconda + module load meme/4.11.1-gcc-openmpi + module load bedtools/2.26.0 Rscript $baseDir/scripts/diff_peaks.R $designDiffPeaks """ } + +// Generate Multiqc Report, gerernate Software Versions and references +process multiqcReport { + + publishDir "$outDir/${task.process}", mode: 'copy' + + input: + + file ('trimReads_vf/*') from trimReadsVersions.first() + file ('alignReads_vf/*') from alignReadsVersions.first() + file ('filterReads_vf/*') from filterReadsVersions.first() + file ('convertReads_vf/*') from convertReadsVersions.first() + file ('crossReads_vf/*') from crossReadsVersions.first() + file ('callPeaksMACS_vf/*') from callPeaksMACSVersions.first() + file ('consensusPeaks_vf/*') from consensusPeaksVersions.first() + file ('peakAnnotation_vf/*') from peakAnnotationVersions.first() + file ('motifSearch_vf/*') from motifSearchVersions.first().ifEmpty() + file ('diffPeaks_vf/*') from diffPeaksVersions.first().ifEmpty() + file ('experimentQC_vf/*') from experimentQCVersions.first() + file ('trimReads/*') from trimgaloreResults.collect() + file ('alignReads/*') from mappedReadsStats.collect() + file ('filterReads/*') from dedupReadsComplexity.collect() + file ('crossReads/*') from crossReadsStats.collect() + + output: + + file('software_versions_mqc.yaml') into softwareVersions + file('software_references_mqc.yaml') into softwareReferences + file "multiqc_report.html" into multiqcReport + file "*_data" into multiqcData + + script: + + """ + module load python/3.6.1-2-anaconda + module load pandoc/2.7 + module load multiqc/1.7 + echo $workflow.nextflow.version > version_nextflow.txt + multiqc --version > version_multiqc.txt + python3 $baseDir/scripts/generate_references.py -r $references -o software_references + python3 $baseDir/scripts/generate_versions.py -o software_versions + multiqc -c $multiqc . + """ +} diff --git a/workflow/nextflow.config b/workflow/nextflow.config index 30e47ea1aea37ed6550cc2944d69d26e69887489..f9fbe846df7cbce559c0e8ec87097921792640a2 100644 --- a/workflow/nextflow.config +++ b/workflow/nextflow.config @@ -3,3 +3,12 @@ profiles { includeConfig 'conf/biohpc.config' } } + +manifest { + name = 'chipseq_analysis' + description = 'BICF ChIP-seq Analysis Workflow.' + homePage = 'https://github.com/nf-core/rnaseq' + version = '1.0.0' + mainScript = 'main.nf' + nextflowVersion = '>=0.31.0' +} diff --git a/workflow/scripts/annotate_peaks.R b/workflow/scripts/annotate_peaks.R index c15ba698dc3f9d120024426b4d7f57bd24a741a6..ff826b6a1f3af6bd8e38fa3555912ee7dda96609 100644 --- a/workflow/scripts/annotate_peaks.R +++ b/workflow/scripts/annotate_peaks.R @@ -17,7 +17,7 @@ args <- commandArgs(trailingOnly=TRUE) # Check input args if (length(args) != 2) { - stop("Usage: annotate_peaks.R [ annotate_design.tsv ] [ genome_assembly ]", call.=FALSE) + stop("Usage: annotate_peaks.R annotate_design.tsv genome_assembly", call.=FALSE) } design_file <- args[1] @@ -35,6 +35,11 @@ if(genome_assembly=='GRCh37') { annodb <- 'org.Hs.eg.db' } +# Output version of ChIPseeker +chipseeker_version = packageVersion('ChIPseeker') +write.table(paste("Version", chipseeker_version), file = "version_ChIPseeker.txt", sep = "\t", + row.names = FALSE, col.names = FALSE) + # Load design file design <- read.csv(design_file, sep ='\t') files <- as.list(as.character(design$Peaks)) diff --git a/workflow/scripts/call_peaks_macs.py b/workflow/scripts/call_peaks_macs.py index 23174e07936ec09ccf494d1c713bd71bb4d72713..cc003c9e892fe49b90c1fad285bd378bffbd9364 100644 --- a/workflow/scripts/call_peaks_macs.py +++ b/workflow/scripts/call_peaks_macs.py @@ -5,8 +5,8 @@ import os import argparse import shutil +import subprocess import logging -from multiprocessing import cpu_count import utils from xcor import xcor as calculate_xcor @@ -70,16 +70,19 @@ def check_tools(): logger.info('Checking for required libraries and components on this system') - r_path = shutil.which("R") - if r_path: - logger.info('Found R: %s', r_path) - else: - logger.error('Missing R') - raise Exception('Missing R') - macs_path = shutil.which("macs2") - if r_path: + if macs_path: logger.info('Found MACS2: %s', macs_path) + + # Get Version + macs_version_command = "macs2 --version" + macs_version = subprocess.check_output(macs_version_command, shell=True, stderr=subprocess.STDOUT) + + # Write to file + macs_file = open("version_macs.txt", "wb") + macs_file.write(macs_version) + macs_file.close() + else: logger.error('Missing MACS2') raise Exception('Missing MACS2') @@ -87,6 +90,18 @@ def check_tools(): bg_bw_path = shutil.which("bedGraphToBigWig") if bg_bw_path: logger.info('Found bedGraphToBigWig: %s', bg_bw_path) + + # Get Version + bg_bw_version_command = "bedGraphToBigWig" + try: + subprocess.check_output(bg_bw_version_command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + bg_bw_version = e.output + + # Write to file + bg_bw_file = open("version_bedGraphToBigWig.txt", "wb") + bg_bw_file.write(bg_bw_version) + bg_bw_file.close() else: logger.error('Missing bedGraphToBigWig') raise Exception('Missing bedGraphToBigWig') @@ -94,12 +109,23 @@ def check_tools(): bedtools_path = shutil.which("bedtools") if bedtools_path: logger.info('Found bedtools: %s', bedtools_path) + + # Get Version + bedtools_version_command = "bedtools --version" + bedtools_version = subprocess.check_output(bedtools_version_command, shell=True) + + # Write to file + bedtools_file = open("version_bedtools.txt", "wb") + bedtools_file.write(bedtools_version) + bedtools_file.close() + else: logger.error('Missing bedtools') raise Exception('Missing bedtools') def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes): + '''Call peaks and signal tracks''' # Extract the fragment length estimate from column 3 of the # cross-correlation scores file @@ -107,8 +133,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes) firstline = xcor_fh.readline() frag_lengths = firstline.split()[2] # third column fragment_length = frag_lengths.split(',')[0] # grab first value - logger.info("Fraglen %s" % (fragment_length)) - + logger.info("Fraglen %s", fragment_length) # Generate narrow peaks and preliminary signal tracks @@ -119,18 +144,18 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes) logger.info(command) returncode = utils.block_on(command) - logger.info("MACS2 exited with returncode %d" % (returncode)) + logger.info("MACS2 exited with returncode %d", returncode) assert returncode == 0, "MACS2 non-zero return" # MACS2 sometimes calls features off the end of chromosomes. # Remove coordinates outside chromosome sizes - narrowpeak_fn = '%s_peaks.narrowPeak' % (prefix) + int_narrowpeak_fn = '%s_peaks.narrowPeak' % (prefix) + narrowpeak_fn = '%s.narrowPeak' % (prefix) clipped_narrowpeak_fn = 'clipped-%s' % (narrowpeak_fn) - - steps = ['slopBed -i %s -g %s -b 0' % (narrowpeak_fn, chrom_sizes), - 'bedClip stdin %s %s' % (chrom_sizes, clipped_narrowpeak_fn)] + steps = ['slopBed -i %s -g %s -b 0' % (int_narrowpeak_fn, chrom_sizes), + 'bedClip stdin %s %s' % (chrom_sizes, clipped_narrowpeak_fn)] out, err = utils.run_pipe(steps) @@ -161,7 +186,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes) logger.info(command) returncode = utils.block_on(command) - logger.info("MACS2 exited with returncode %d" % (returncode)) + logger.info("MACS2 exited with returncode %d", returncode) assert returncode == 0, "MACS2 non-zero return" # Remove coordinates outside chromosome sizes (MACS2 bug) @@ -169,7 +194,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes) fc_bedgraph_sorted_fn = 'sorted-%s' % (fc_bedgraph_fn) fc_signal_fn = "%s.fc_signal.bw" % (prefix) steps = ['slopBed -i %s_FE.bdg -g %s -b 0' % (prefix, chrom_sizes), - 'bedClip stdin %s %s' % (chrom_sizes, fc_bedgraph_fn)] + 'bedClip stdin %s %s' % (chrom_sizes, fc_bedgraph_fn)] out, err = utils.run_pipe(steps) @@ -185,7 +210,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes) logger.info(command) returncode = utils.block_on(command) - logger.info("bedGraphToBigWig exited with returncode %d" % (returncode)) + logger.info("bedGraphToBigWig exited with returncode %d", returncode) assert returncode == 0, "bedGraphToBigWig non-zero return" # For -log10(p-value) signal tracks @@ -199,7 +224,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes) sval = str(min(float(chip_reads), float(control_reads)) / 1000000) logger.info("chip_reads = %s, control_reads = %s, sval = %s" % - (chip_reads, control_reads, sval)) + (chip_reads, control_reads, sval)) command = 'macs2 bdgcmp ' + \ '-t %s_treat_pileup.bdg ' % (prefix) + \ @@ -216,7 +241,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes) pvalue_bedgraph_sorted_fn = 'sort-%s' % (pvalue_bedgraph_fn) pvalue_signal_fn = "%s.pvalue_signal.bw" % (prefix) steps = ['slopBed -i %s_ppois.bdg -g %s -b 0' % (prefix, chrom_sizes), - 'bedClip stdin %s %s' % (chrom_sizes, pvalue_bedgraph_fn)] + 'bedClip stdin %s %s' % (chrom_sizes, pvalue_bedgraph_fn)] out, err = utils.run_pipe(steps) @@ -232,12 +257,13 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes) logger.info(command) returncode = utils.block_on(command) - logger.info("bedGraphToBigWig exited with returncode %d" % (returncode)) + logger.info("bedGraphToBigWig exited with returncode %d", returncode) assert returncode == 0, "bedGraphToBigWig non-zero return" # Remove temporary files os.remove(clipped_narrowpeak_fn) os.remove(rescaled_narrowpeak_fn) + os.remove(int_narrowpeak_fn) def main(): diff --git a/workflow/scripts/check_design.py b/workflow/scripts/check_design.py index 083151213fe483f7cbd4a3feb76c09d760b212d7..35af1f469826933578ba0df5694f45644836b8b2 100644 --- a/workflow/scripts/check_design.py +++ b/workflow/scripts/check_design.py @@ -72,6 +72,46 @@ def check_design_headers(design, paired): raise Exception("Missing column headers: %s" % list(missing_headers)) +def check_samples(design): + '''Check if design file has the correct sample name mapping.''' + + logger.info("Running sample check.") + + samples = design.groupby('sample_id') \ + .apply(list) + + malformated_samples = [] + chars = set('-.') + for sample in samples.index.values: + if(any(char.isspace() for char in sample) | any((char in chars) for char in sample)): + malformated_samples.append(sample) + + if len(malformated_samples) > 0: + logger.error('Malformed samples from design file: %s', list(malformated_samples)) + raise Exception("Malformed samples from design file: %s" % + list(malformated_samples)) + + +def check_experiments(design): + '''Check if design file has the correct experiment name mapping.''' + + logger.info("Running experiment check.") + + experiments = design.groupby('experiment_id') \ + .apply(list) + + malformated_experiments = [] + chars = set('-.') + for experiment in experiments.index.values: + if(any(char.isspace() for char in experiment) | any((char in chars) for char in experiment)): + malformated_experiments.append(experiment) + + if len(malformated_experiments) > 0: + logger.error('Malformed experiment from design file: %s', list(malformated_experiments)) + raise Exception("Malformed experiment from design file: %s" % + list(malformated_experiments)) + + def check_controls(design): '''Check if design file has the correct control mapping.''' @@ -146,8 +186,8 @@ def main(): logger.addHandler(handler) # Read files as dataframes - design_df = pd.read_csv(args.design, sep='\t') - fastq_df = pd.read_csv(args.fastq, sep='\t', names=['name', 'path']) + design_df = pd.read_csv(design, sep='\t') + fastq_df = pd.read_csv(fastq, sep='\t', names=['name', 'path']) # Check design file check_design_headers(design_df, paired) diff --git a/workflow/scripts/convert_reads.py b/workflow/scripts/convert_reads.py index 2c39680f97fcdc95eaee94a6d27cb5511e05ce95..43fe5cfee1ceccc13c563d14c8547c391dbd4542 100644 --- a/workflow/scripts/convert_reads.py +++ b/workflow/scripts/convert_reads.py @@ -54,6 +54,15 @@ def check_tools(): bedtools_path = shutil.which("bedtools") if bedtools_path: logger.info('Found bedtools: %s', bedtools_path) + + # Get Version + bedtools_version_command = "bedtools --version" + bedtools_version = subprocess.check_output(bedtools_version_command, shell=True) + + # Write to file + bedtools_file = open("version_bedtools.txt", "wb") + bedtools_file.write(bedtools_version) + bedtools_file.close() else: logger.error('Missing bedtools') raise Exception('Missing bedtools') @@ -61,6 +70,15 @@ def check_tools(): samtools_path = shutil.which("samtools") if samtools_path: logger.info('Found samtools: %s', samtools_path) + + # Get Version + samtools_version_command = "samtools --version" + samtools_version = subprocess.check_output(samtools_version_command, shell=True) + + # Write to file + samtools_file = open("version_samtools.txt", "wb") + samtools_file.write(samtools_version) + samtools_file.close() else: logger.error('Missing samtools') raise Exception('Missing samtools') @@ -91,8 +109,7 @@ def convert_mapped_pe(bam, bam_basename): out, err = utils.run_pipe([ "bamToBed -bedpe -mate1 -i %s" % (nmsrt_bam_filename), - "gzip -nc"], - outfile=bedpe_filename) + "gzip -nc"], outfile=bedpe_filename) def main(): @@ -109,7 +126,7 @@ def main(): # Convert PE or SE to tagAlign bam_basename = os.path.basename( - utils.strip_extensions(bam, ['.bam'])) + utils.strip_extensions(bam, ['.bam', '.dedup'])) tag_filename = bam_basename + '.tagAlign.gz' convert_mapped(bam, tag_filename) @@ -117,7 +134,7 @@ def main(): if paired: # paired-end data convert_mapped_pe(bam, bam_basename) else: - bedse_filename = bam_basename + ".bedse.gz" + bedse_filename = bam_basename + ".bedse.gz" shutil.copy(tag_filename, bedse_filename) diff --git a/workflow/scripts/diff_peaks.R b/workflow/scripts/diff_peaks.R index 76caa95495c7cae1d014530e78e1e1e3c4af09dc..68fda5624018ea6168a63242dc2e47ad7178a47d 100644 --- a/workflow/scripts/diff_peaks.R +++ b/workflow/scripts/diff_peaks.R @@ -8,9 +8,14 @@ args <- commandArgs(trailingOnly=TRUE) # Check input args if (length(args) != 1) { - stop("Usage: diff_peaks.R [ annotate_design.tsv ] ", call.=FALSE) + stop("Usage: diff_peaks.R annotate_design.tsv ", call.=FALSE) } +# Output version of DiffBind +diffibind_version = packageVersion('DiffBind') +write.table(paste("Version", diffibind_version), file = "version_DiffBind.txt", sep = "\t", + row.names = FALSE, col.names = FALSE) + # Build DBA object from design file data <- dba(sampleSheet=args[1]) data <- dba.count(data) diff --git a/workflow/scripts/experiment_qc.py b/workflow/scripts/experiment_qc.py index 7386fcffd8e982d4d432e99becfc2aacb23ad2f2..0418d0e34bee026e2756eec12462bfc85ba73dce 100644 --- a/workflow/scripts/experiment_qc.py +++ b/workflow/scripts/experiment_qc.py @@ -7,8 +7,9 @@ import argparse import logging import subprocess import shutil -import pandas as pd from multiprocessing import cpu_count +import pandas as pd + EPILOG = ''' For more details: @@ -51,6 +52,15 @@ def check_tools(): deeptools_path = shutil.which("deeptools") if deeptools_path: logger.info('Found deeptools: %s', deeptools_path) + + # Get Version + deeptools_version_command = "deeptools --version" + deeptools_version = subprocess.check_output(deeptools_version_command, shell=True, stderr=subprocess.STDOUT) + + # Write to file + deeptools_file = open("version_deeptools.txt", "wb") + deeptools_file.write(deeptools_version) + deeptools_file.close() else: logger.error('Missing deeptools') raise Exception('Missing deeptools') @@ -76,10 +86,10 @@ def generate_read_summary(design, extension): return mbs_filename -def check_correlation(mbs): +def check_spearman_correlation(mbs): '''Check Spearman pairwise correlation of samples based on read counts.''' - spearman_filename = 'heatmap_SpearmanCorr.png' + spearman_filename = 'heatmap_SpearmanCorr.pdf' spearman_params = "--corMethod spearman --skipZero" + \ " --plotTitle \"Spearman Correlation of Read Counts\"" + \ " --whatToPlot heatmap --colorMap RdYlBu --plotNumbers" @@ -95,12 +105,31 @@ def check_correlation(mbs): out, err = spearman_correlation.communicate() +def check_pearson_correlation(mbs): + '''Check Pearson pairwise correlation of samples based on read counts.''' + + pearson_filename = 'heatmap_PearsonCorr.pdf' + pearson_params = "--corMethod pearson --skipZero" + \ + " --plotTitle \"Pearson Correlation of Read Counts\"" + \ + " --whatToPlot heatmap --colorMap RdYlBu --plotNumbers" + + pearson_command = ( + "plotCorrelation -in %s %s -o %s" + % (mbs, pearson_params, pearson_filename) + ) + + logger.info("Running deeptools with %s", pearson_command) + + pearson_correlation = subprocess.Popen(pearson_command, shell=True) + out, err = pearson_correlation.communicate() + + def check_coverage(design, extension): '''Asses the sequencing depth of samples.''' bam_files = ' '.join(design['bam_reads']) labels = ' '.join(design['sample_id']) - coverage_filename = 'coverage.png' + coverage_filename = 'coverage.pdf' coverage_params = "-n 1000000 --plotTitle \"Sample Coverage\"" + \ " --ignoreDuplicates --minMappingQuality 10" @@ -136,7 +165,7 @@ def update_controls(design): def check_enrichment(sample_id, control_id, sample_reads, control_reads, extension): '''Asses the enrichment per sample.''' - fingerprint_filename = sample_id + '_fingerprint.png' + fingerprint_filename = sample_id + '_fingerprint.pdf' fingerprint_command = ( "plotFingerprint -b %s %s --extendReads %d --labels %s %s --plotFile %s" @@ -166,7 +195,8 @@ def main(): # Run correlation mbs_filename = generate_read_summary(design_df, extension) - check_correlation(mbs_filename) + check_spearman_correlation(mbs_filename) + check_pearson_correlation(mbs_filename) # Run coverage check_coverage(design_df, extension) @@ -175,11 +205,11 @@ def main(): new_design_df = update_controls(design_df) for index, row in new_design_df.iterrows(): check_enrichment( - row['sample_id'], - row['control_id'], - row['bam_reads'], - row['control_reads'], - extension) + row['sample_id'], + row['control_id'], + row['bam_reads'], + row['control_reads'], + extension) if __name__ == '__main__': diff --git a/workflow/scripts/generate_references.py b/workflow/scripts/generate_references.py new file mode 100644 index 0000000000000000000000000000000000000000..51be186599be8d78d0bc585af363e90ca1efe826 --- /dev/null +++ b/workflow/scripts/generate_references.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python + +'''Make header for HTML of references.''' + +import argparse +import subprocess +import shlex +import logging + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +# SETTINGS + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + + +def get_args(): + '''Define arguments.''' + + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-r', '--reference', + help="The reference file (markdown format).", + required=True) + + parser.add_argument('-o', '--output', + help="The out file name.", + default='references') + + args = parser.parse_args() + return args + + +def main(): + args = get_args() + reference = args.reference + output = args.output + + out_filename = output + '_mqc.yaml' + + # Header for HTML + print(''' + id: 'Software References' + section_name: 'Software References' + description: 'This section describes references for the tools used.' + plot_type: 'html' + data: | + ''' + , file = open(out_filename, "w") + ) + + # Turn Markdown into HTML + references_html = 'bash -c "pandoc -p {} | sed \'s/^/ /\' >> {}"' + references_html = references_html.format(reference, out_filename) + subprocess.check_call(shlex.split(references_html)) + + +if __name__ == '__main__': + main() diff --git a/workflow/scripts/generate_versions.py b/workflow/scripts/generate_versions.py new file mode 100644 index 0000000000000000000000000000000000000000..661b266223b4c6974d368380f93c04c01a51c46c --- /dev/null +++ b/workflow/scripts/generate_versions.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +'''Make YAML of software versions.''' + +from __future__ import print_function +from collections import OrderedDict +import re +import os +import logging +import glob +import argparse +import numpy as np + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +# SETTINGS + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + +SOFTWARE_REGEX = { + 'Nextflow': ['version_nextflow.txt', r"(\S+)"], + 'Trim Galore!': ['trimReads_vf/version_trimgalore.txt', r"version (\S+)"], + 'Cutadapt': ['trimReads_vf/version_cutadapt.txt', r"Version (\S+)"], + 'BWA': ['alignReads_vf/version_bwa.txt', r"Version: (\S+)"], + 'Samtools': ['alignReads_vf/version_samtools.txt', r"samtools (\S+)"], + 'Sambamba': ['filterReads_vf/version_sambamba.txt', r"sambamba (\S+)"], + 'BEDTools': ['convertReads_vf/version_bedtools.txt', r"bedtools v(\S+)"], + 'R': ['crossReads_vf/version_r.txt', r"R version (\S+)"], + 'SPP': ['crossReads_vf/version_spp.txt', r"\[1\] ‘(1.14)’"], + 'MACS2': ['callPeaksMACS_vf/version_macs.txt', r"macs2 (\S+)"], + 'bedGraphToBigWig': ['callPeaksMACS_vf/version_bedGraphToBigWig.txt', r"bedGraphToBigWig v (\S+)"], + 'ChIPseeker': ['peakAnnotation_vf/version_ChIPseeker.txt', r"Version (\S+)\""], + 'MEME-ChIP': ['motifSearch_vf/version_memechip.txt', r"Version (\S+)"], + 'DiffBind': ['diffPeaks_vf/version_DiffBind.txt', r"Version (\S+)\""], + 'deepTools': ['experimentQC_vf/version_deeptools.txt', r"deeptools (\S+)"], + 'MultiQC': ['version_multiqc.txt', r"multiqc, version (\S+)"], +} + + +def get_args(): + '''Define arguments.''' + + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-o', '--output', + help="The out file name.", + required=True) + + parser.add_argument('-t', '--test', + help='Used for testing purposes', + default=False, + action='store_true') + + args = parser.parse_args() + return args + + +def check_files(files, test): + '''Check if version files are found.''' + + logger.info("Running file check.") + + software_files = np.array(list(SOFTWARE_REGEX.values()))[:,0] + + extra_files = set(files) - set(software_files) + + if len(extra_files) > 0 and test: + logger.error('Missing regex: %s', list(extra_files)) + raise Exception("Missing regex: %s" % list(extra_files)) + + +def main(): + args = get_args() + output = args.output + test = args.test + + out_filename = output + '_mqc.yaml' + + results = OrderedDict() + results['Nextflow'] = '<span style="color:#999999;\">Not Run</span>' + results['Trim Galore!'] = '<span style="color:#999999;\">Not Run</span>' + results['Cutadapt'] = '<span style="color:#999999;\">Not Run</span>' + results['BWA'] = '<span style="color:#999999;\">Not Run</span>' + results['Samtools'] = '<span style="color:#999999;\">Not Run</span>' + results['Sambamba'] = '<span style="color:#999999;\">Not Run</span>' + results['BEDTools'] = '<span style="color:#999999;\">Not Run</span>' + results['R'] = '<span style="color:#999999;\">Not Run</span>' + results['SPP'] = '<span style="color:#999999;\">Not Run</span>' + results['MACS2'] = '<span style="color:#999999;\">Not Run</span>' + results['bedGraphToBigWig'] = '<span style="color:#999999;\">Not Run</span>' + results['ChIPseeker'] = '<span style="color:#999999;\">Not Run</span>' + results['MEME-ChIP'] = '<span style="color:#999999;\">Not Run</span>' + results['DiffBind'] = '<span style="color:#999999;\">Not Run</span>' + results['deepTools'] = '<span style="color:#999999;\">Not Run</span>' + results['MultiQC'] = '<span style="color:#999999;\">Not Run</span>' + + # list all files + files = glob.glob('**/*.txt', recursive=True) + + # Check for version files: + check_files(files, test) + + # Search each file using its regex + for k, v in SOFTWARE_REGEX.items(): + if os.path.isfile(v[0]): + with open(v[0]) as x: + versions = x.read() + match = re.search(v[1], versions) + if match: + results[k] = "v{}".format(match.group(1)) + + # Dump to YAML + print( + ''' + id: 'Software Versions' + section_name: 'Software Versions' + section_href: 'https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/' + plot_type: 'html' + description: 'are collected at run time from the software output.' + data: | + <dl class="dl-horizontal"> + ''' + , file = open(out_filename, "w")) + + for k, v in results.items(): + print(" <dt>{}</dt><dd>{}</dd>".format(k, v), file = open(out_filename, "a")) + print(" </dl>", file = open(out_filename, "a")) + + +if __name__ == '__main__': + main() diff --git a/workflow/scripts/map_qc.py b/workflow/scripts/map_qc.py index 4cc549fa1652da3403b0d527b9127273b58e465c..7fda9a66abc72df81b03b62a5c375d440fa81438 100644 --- a/workflow/scripts/map_qc.py +++ b/workflow/scripts/map_qc.py @@ -62,6 +62,15 @@ def check_tools(): samtools_path = shutil.which("samtools") if samtools_path: logger.info('Found samtools: %s', samtools_path) + + # Get Version + samtools_version_command = "samtools --version" + samtools_version = subprocess.check_output(samtools_version_command, shell=True) + + # Write to file + samtools_file = open("version_samtools.txt", "wb") + samtools_file.write(samtools_version) + samtools_file.close() else: logger.error('Missing samtools') raise Exception('Missing samtools') @@ -69,6 +78,18 @@ def check_tools(): sambamba_path = shutil.which("sambamba") if sambamba_path: logger.info('Found sambamba: %s', sambamba_path) + + # Get Version + sambamba_version_command = "sambamba" + try: + subprocess.check_output(sambamba_version_command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + sambamba_version = e.output + + # Write to file + sambamba_file = open("version_sambamba.txt", "wb") + sambamba_file.write(sambamba_version) + sambamba_file.close() else: logger.error('Missing sambamba') raise Exception('Missing sambamba') @@ -76,6 +97,15 @@ def check_tools(): bedtools_path = shutil.which("bedtools") if bedtools_path: logger.info('Found bedtools: %s', bedtools_path) + + # Get Version + bedtools_version_command = "bedtools --version" + bedtools_version = subprocess.check_output(bedtools_version_command, shell=True) + + # Write to file + bedtools_file = open("version_bedtools.txt", "wb") + bedtools_file.write(bedtools_version) + bedtools_file.close() else: logger.error('Missing bedtools') raise Exception('Missing bedtools') @@ -106,7 +136,7 @@ def filter_mapped_pe(bam, bam_basename): # Will produce name sorted BAM "samtools sort -n -@ %d -o %s" % (cpu_count(), tmp_filt_bam_filename)]) if err: - logger.error("samtools filter error: %s" % (err)) + logger.error("samtools filter error: %s", err) # Remove orphan reads (pair was removed) # and read pairs mapping to different chromosomes @@ -136,7 +166,7 @@ def filter_mapped_se(bam, bam_basename): # not primary alignment, reads failing platform # Remove low MAPQ reads # Obtain name sorted BAM file - with open(filt_bam_filename, 'w') as fh: + with open(filt_bam_filename, 'w') as temp_file: samtools_filter_command = ( "samtools view -F 1804 -q 30 -b %s" % (bam) @@ -144,7 +174,7 @@ def filter_mapped_se(bam, bam_basename): logger.info(samtools_filter_command) subprocess.check_call( shlex.split(samtools_filter_command), - stdout=fh) + stdout=temp_file) return filt_bam_filename @@ -153,11 +183,11 @@ def dedup_mapped(bam, bam_basename, paired): '''Use sambamba and samtools to remove duplicates.''' # Markduplicates - dup_file_qc_filename = bam_basename + ".dup.qc" + dup_file_qc_filename = bam_basename + ".dedup.qc" tmp_dup_mark_filename = bam_basename + ".dupmark.bam" sambamba_params = "--hash-table-size=17592186044416" + \ " --overflow-list-size=20000000 --io-buffer-size=256" - with open(dup_file_qc_filename, 'w') as fh: + with open(dup_file_qc_filename, 'w') as temp_file: sambamba_markdup_command = ( "sambamba markdup -t %d %s --tmpdir=%s %s %s" % (cpu_count(), sambamba_params, os.getcwd(), bam, tmp_dup_mark_filename) @@ -165,11 +195,10 @@ def dedup_mapped(bam, bam_basename, paired): logger.info(sambamba_markdup_command) subprocess.check_call( shlex.split(sambamba_markdup_command), - stderr=fh) - + stderr=temp_file) # Remove duplicates - final_bam_prefix = bam_basename + ".filt.nodup" + final_bam_prefix = bam_basename + ".dedup" final_bam_filename = final_bam_prefix + ".bam" if paired: # paired-end data @@ -179,11 +208,11 @@ def dedup_mapped(bam, bam_basename, paired): samtools_dedupe_command = \ "samtools view -F 1804 -b %s" % (tmp_dup_mark_filename) - with open(final_bam_filename, 'w') as fh: + with open(final_bam_filename, 'w') as temp_file: logger.info(samtools_dedupe_command) subprocess.check_call( shlex.split(samtools_dedupe_command), - stdout=fh) + stdout=temp_file) # Index final bam file sambamba_index_command = \ @@ -192,12 +221,12 @@ def dedup_mapped(bam, bam_basename, paired): subprocess.check_output(shlex.split(sambamba_index_command)) # Generate mapping statistics - final_bam_file_mapstats_filename = final_bam_prefix + ".flagstat.qc" - with open(final_bam_file_mapstats_filename, 'w') as fh: + mapstats_filename = final_bam_prefix + ".flagstat.qc" + with open(mapstats_filename, 'w') as temp_file: flagstat_command = "sambamba flagstat -t %d %s" \ % (cpu_count(), final_bam_filename) logger.info(flagstat_command) - subprocess.check_call(shlex.split(flagstat_command), stdout=fh) + subprocess.check_call(shlex.split(flagstat_command), stdout=temp_file) os.remove(bam) return tmp_dup_mark_filename @@ -206,7 +235,7 @@ def dedup_mapped(bam, bam_basename, paired): def compute_complexity(bam, paired, bam_basename): '''Calculate library complexity .''' - pbc_file_qc_filename = bam_basename + ".filt.nodup.pbc.qc" + pbc_file_qc_filename = bam_basename + ".pbc.qc" tmp_pbc_file_qc_filename = "tmp.%s" % (pbc_file_qc_filename) # Sort by name @@ -215,6 +244,7 @@ def compute_complexity(bam, paired, bam_basename): # Obtain unique count statistics # PBC File output + # Sample Name[tab] # TotalReadPairs [tab] # DistinctReadPairs [tab] # OneReadPair [tab] @@ -223,12 +253,12 @@ def compute_complexity(bam, paired, bam_basename): # PBC1=OnePair/Distinct [tab] # PBC2=OnePair/TwoPair pbc_headers = ['TotalReadPairs', - 'DistinctReadPairs', - 'OneReadPair', - 'TwoReadPairs', - 'NRF', - 'PBC1', - 'PBC2'] + 'DistinctReadPairs', + 'OneReadPair', + 'TwoReadPairs', + 'NRF', + 'PBC1', + 'PBC2'] if paired: steps = [ @@ -247,11 +277,15 @@ def compute_complexity(bam, paired, bam_basename): ]) out, err = utils.run_pipe(steps, tmp_pbc_file_qc_filename) if err: - logger.error("PBC file error: %s" % (err)) + logger.error("PBC file error: %s", err) - # Add headers + # Add Sample Name and headers pbc_file = pd.read_csv(tmp_pbc_file_qc_filename, sep='\t', header=None, - names=pbc_headers) + names=pbc_headers) + pbc_file['Sample'] = bam_basename + pbc_headers_new = list(pbc_file) + pbc_headers_new.insert(0, pbc_headers_new.pop(pbc_headers_new.index('Sample'))) + pbc_file = pbc_file[pbc_headers_new] pbc_file.to_csv(pbc_file_qc_filename, header=True, sep='\t', index=False) os.remove(bam) os.remove(bam + '.bai') diff --git a/workflow/scripts/map_reads.py b/workflow/scripts/map_reads.py index b7d3144fe8484f2d8939c773229c45b58e283315..3ba41dc9cd616759754abaaa6abc390362f00ab0 100644 --- a/workflow/scripts/map_reads.py +++ b/workflow/scripts/map_reads.py @@ -9,7 +9,6 @@ import shutil import shlex import logging from multiprocessing import cpu_count -import json import utils EPILOG = ''' @@ -33,7 +32,7 @@ STRIP_EXTENSIONS = ['.gz', '.fq', '.fastq', '_trimmed'] def get_args(): '''Define arguments.''' - + parser = argparse.ArgumentParser( description=__doc__, epilog=EPILOG, formatter_class=argparse.RawDescriptionHelpFormatter) @@ -47,6 +46,10 @@ def get_args(): help="The bwa index of the reference genome.", required=True) + parser.add_argument('-s', '--sample', + help="The name of the sample.", + required=True) + parser.add_argument('-p', '--paired', help="True/False if paired-end or single end.", default=False, @@ -67,6 +70,18 @@ def check_tools(): bwa_path = shutil.which("bwa") if bwa_path: logger.info('Found bwa: %s', bwa_path) + + # Get Version + bwa_version_command = "bwa" + try: + subprocess.check_output(bwa_version_command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + bwa_version = e.output + + # Write to file + bwa_file = open("version_bwa.txt", "wb") + bwa_file.write(bwa_version) + bwa_file.close() else: logger.error('Missing bwa') raise Exception('Missing bwa') @@ -74,6 +89,15 @@ def check_tools(): samtools_path = shutil.which("samtools") if samtools_path: logger.info('Found samtools: %s', samtools_path) + + # Get Version + samtools_version_command = "samtools --version" + samtools_version = subprocess.check_output(samtools_version_command, shell=True) + + # Write to file + samtools_file = open("version_samtools.txt", "wb") + samtools_file.write(samtools_version) + samtools_file.close() else: logger.error('Missing samtools') raise Exception('Missing samtools') @@ -101,7 +125,7 @@ def generate_sa(fastq, reference): def align_se(fastq, sai, reference, fastq_basename): '''Use BWA to align SE data.''' - bam_filename = '%s.srt.bam' % (fastq_basename) + bam_filename = '%s.bam' % (fastq_basename) steps = [ "bwa samse %s %s %s" @@ -112,7 +136,7 @@ def align_se(fastq, sai, reference, fastq_basename): out, err = utils.run_pipe(steps) if err: - logger.error("samse/samtools error: %s" % (err)) + logger.error("samse/samtools error: %s", err) return bam_filename @@ -122,21 +146,21 @@ def align_pe(fastq, sai, reference, fastq_basename): sam_filename = "%s.sam" % (fastq_basename) badcigar_filename = "%s.badReads" % (fastq_basename) - bam_filename = '%s.srt.bam' % (fastq_basename) + bam_filename = '%s.bam' % (fastq_basename) # Remove read pairs with bad CIGAR strings and sort by position steps = [ - "bwa sampe -P %s %s %s %s %s" - % (reference, sai[0], sai[1], - fastq[0], fastq[1]), - "tee %s" % (sam_filename), - r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""", - "sort", - "uniq"] + "bwa sampe -P %s %s %s %s %s" + % (reference, sai[0], sai[1], + fastq[0], fastq[1]), + "tee %s" % (sam_filename), + r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""", + "sort", + "uniq"] out, err = utils.run_pipe(steps, badcigar_filename) if err: - logger.error("sampe error: %s" % (err)) + logger.error("sampe error: %s", err) steps = [ "cat %s" % (sam_filename), @@ -147,7 +171,7 @@ def align_pe(fastq, sai, reference, fastq_basename): out, err = utils.run_pipe(steps) if err: - logger.error("samtools error: %s" % (err)) + logger.error("samtools error: %s", err) return bam_filename @@ -157,6 +181,7 @@ def main(): paired = args.paired fastq = args.fastq reference = args.reference + sample = args.sample # Create a file handler handler = logging.FileHandler('map.log') @@ -167,31 +192,25 @@ def main(): # Run Suffix Array generation sai = [] - for fq in fastq: - sai_filename = generate_sa(fq, reference) + for fastq_file in fastq: + sai_filename = generate_sa(fastq_file, reference) sai.append(sai_filename) + # Make file basename + fastq_basename = sample + # Run alignment for either PE or SE if paired: # paired-end data - fastq_r1_basename = os.path.basename( - utils.strip_extensions(fastq[0], STRIP_EXTENSIONS)) - fastq_r2_basename = os.path.basename( - utils.strip_extensions(fastq[1], STRIP_EXTENSIONS)) - fastq_basename = fastq_r1_basename + fastq_r2_basename - bam_filename = align_pe(fastq, sai, reference, fastq_basename) else: - fastq_basename = os.path.basename( - utils.strip_extensions(fastq[0], STRIP_EXTENSIONS)) - bam_filename = align_se(fastq, sai, reference, fastq_basename) - bam_mapstats_filename = '%s.srt.bam.flagstat.qc' % (fastq_basename) - with open(bam_mapstats_filename, 'w') as fh: + bam_mapstats_filename = '%s.flagstat.qc' % (fastq_basename) + with open(bam_mapstats_filename, 'w') as temp_file: subprocess.check_call( shlex.split("samtools flagstat %s" % (bam_filename)), - stdout=fh) + stdout=temp_file) # Remove sai files for sai_file in sai: diff --git a/workflow/scripts/motif_search.py b/workflow/scripts/motif_search.py index 9feac8c39201af96700f69370a70b24a7111a5fc..37e1f650ec1aeb415ad77a96ad0ad9c30241ec31 100644 --- a/workflow/scripts/motif_search.py +++ b/workflow/scripts/motif_search.py @@ -3,16 +3,14 @@ '''Call Motifs on called peaks.''' import os -import sys -import re -from re import sub -import string import argparse import logging +import shutil import subprocess +from multiprocessing import Pool import pandas as pd import utils -from multiprocessing import Pool + EPILOG = ''' For more details: @@ -27,6 +25,13 @@ logger.propagate = False logger.setLevel(logging.INFO) +# the order of this list is important. +# strip_extensions strips from the right inward, so +# the expected right-most extensions should appear first (like .gz) +# Modified from J. Seth Strattan +STRIP_EXTENSIONS = ['.narrowPeak', '.replicated'] + + def get_args(): '''Define arguments.''' @@ -51,19 +56,66 @@ def get_args(): # Functions + +def check_tools(): + '''Checks for required componenets on user system''' + + logger.info('Checking for required libraries and components on this system') + + meme_path = shutil.which("meme") + if meme_path: + logger.info('Found meme: %s', meme_path) + + # Get Version + memechip_version_command = "meme-chip --version" + memechip_version = subprocess.check_output(memechip_version_command, shell=True) + + # Write to file + meme_file = open("version_memechip.txt", "wb") + meme_file.write(b"Version %s" % (memechip_version)) + meme_file.close() + else: + logger.error('Missing meme') + raise Exception('Missing meme') + + bedtools_path = shutil.which("bedtools") + if bedtools_path: + logger.info('Found bedtools: %s', bedtools_path) + + # Get Version + bedtools_version_command = "bedtools --version" + bedtools_version = subprocess.check_output(bedtools_version_command, shell=True) + + # Write to file + bedtools_file = open("version_bedtools.txt", "wb") + bedtools_file.write(bedtools_version) + bedtools_file.close() + else: + logger.error('Missing bedtools') + raise Exception('Missing bedtools') + + def run_wrapper(args): - motif_search(*args) + motif_search(*args) + def motif_search(filename, genome, experiment, peak): + '''Run motif serach on peaks.''' + + file_basename = os.path.basename( + utils.strip_extensions(filename, STRIP_EXTENSIONS)) - file_basename = os.path.basename(filename) - sorted_fn = 'sorted-%s' % (file_basename) out_fa = '%s.fa' % (experiment) out_motif = '%s_memechip' % (experiment) # Sort Bed file and limit number of peaks if peak == -1: peak = utils.count_lines(filename) + peak_no = 'all' + else: + peak_no = peak + + sorted_fn = '%s.%s.narrowPeak' % (file_basename, peak_no) out, err = utils.run_pipe([ 'sort -k %dgr,%dgr %s' % (5, 5, filename), @@ -73,9 +125,15 @@ def motif_search(filename, genome, experiment, peak): out, err = utils.run_pipe([ 'bedtools getfasta -fi %s -bed %s -fo %s' % (genome, sorted_fn, out_fa)]) - #Call memechip + if err: + logger.error("bedtools error: %s", err) + + # Call memechip out, err = utils.run_pipe([ 'meme-chip -oc %s -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 %s -norand' % (out_motif, out_fa)]) + if err: + logger.error("meme-chip error: %s", err) + def main(): args = get_args() @@ -83,14 +141,22 @@ def main(): genome = args.genome peak = args.peak + # Create a file handler + handler = logging.FileHandler('motif.log') + logger.addHandler(handler) + + # Check if tools are present + check_tools() + # Read files design_df = pd.read_csv(design, sep='\t') - meme_arglist = zip(design_df['Peaks'].tolist(), [genome]*design_df.shape[0], design_df['Condition'].tolist(), [peak]*design_df.shape[0]) - work_pool = Pool(min(12,design_df.shape[0])) - return_list = work_pool.map(run_wrapper, meme_arglist) + meme_arglist = zip(design_df['Peaks'].tolist(), [genome]*design_df.shape[0], design_df['Condition'].tolist(), [peak]*design_df.shape[0]) + work_pool = Pool(min(12, design_df.shape[0])) + return_list = work_pool.map(run_wrapper, meme_arglist) work_pool.close() work_pool.join() -if __name__=="__main__": - main() + +if __name__ == '__main__': + main() diff --git a/workflow/scripts/overlap_peaks.py b/workflow/scripts/overlap_peaks.py index 434caa08799a2d6b3bf460ea8520d08a5b92c852..bdcfcee6ebf8951210b5d57e9df3d6b6daf683b2 100644 --- a/workflow/scripts/overlap_peaks.py +++ b/workflow/scripts/overlap_peaks.py @@ -6,6 +6,7 @@ import os import argparse import logging import shutil +import subprocess import pandas as pd import utils @@ -49,6 +50,15 @@ def check_tools(): bedtools_path = shutil.which("bedtools") if bedtools_path: logger.info('Found bedtools: %s', bedtools_path) + + # Get Version + bedtools_version_command = "bedtools --version" + bedtools_version = subprocess.check_output(bedtools_version_command, shell=True) + + # Write to file + bedtools_file = open("version_bedtools.txt", "wb") + bedtools_file.write(bedtools_version) + bedtools_file.close() else: logger.error('Missing bedtools') raise Exception('Missing bedtools') @@ -113,9 +123,9 @@ def overlap(experiment, design): # with any one of the overlapping peak pairs >= 0.5 steps_true = ['intersectBed -wo -a %s -b %s' % (pool_peaks, true_rep_peaks[0]), - awk_command, - cut_command, - 'sort -u'] + awk_command, + cut_command, + 'sort -u'] iter_true_peaks = iter(true_rep_peaks) next(iter_true_peaks) @@ -123,13 +133,13 @@ def overlap(experiment, design): if len(true_rep_peaks) > 1: for true_peak in true_rep_peaks[1:]: steps_true.extend(['intersectBed -wo -a stdin -b %s' % (true_peak), - awk_command, - cut_command, - 'sort -u']) + awk_command, + cut_command, + 'sort -u']) out, err = utils.run_pipe(steps_true, outfile=overlap_tr_fn) print("%d peaks overlap with both true replicates" % - (utils.count_lines(overlap_tr_fn))) + (utils.count_lines(overlap_tr_fn))) # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 # where overlap is defined as the fractional overlap @@ -146,7 +156,7 @@ def overlap(experiment, design): out, err = utils.run_pipe(steps_pseudo, outfile=overlap_pr_fn) print("%d peaks overlap with both pooled pseudoreplicates" - % (utils.count_lines(overlap_pr_fn))) + % (utils.count_lines(overlap_pr_fn))) # Make union of peak lists out, err = utils.run_pipe([ @@ -154,7 +164,7 @@ def overlap(experiment, design): 'sort -u' ], overlapping_peaks_fn) print("%d peaks overlap with true replicates or with pooled pseudorepliates" - % (utils.count_lines(overlapping_peaks_fn))) + % (utils.count_lines(overlapping_peaks_fn))) # Make rejected peak list out, err = utils.run_pipe([ @@ -178,6 +188,9 @@ def main(): handler = logging.FileHandler('consensus_peaks.log') logger.addHandler(handler) + # Check if tools are present + check_tools() + # Read files as dataframes design_peaks_df = pd.read_csv(design, sep='\t') design_files_df = pd.read_csv(files, sep='\t') @@ -187,7 +200,7 @@ def main(): # Make a design file for annotating Peaks anno_cols = ['Condition', 'Peaks'] - design_anno = pd.DataFrame(columns = anno_cols) + design_anno = pd.DataFrame(columns=anno_cols) # Find consenus overlap peaks for each experiment for experiment, df_experiment in design_peaks_df.groupby('experiment_id'): @@ -197,16 +210,16 @@ def main(): # Write out design files design_diff.columns = ['SampleID', - 'bamReads', - 'Condition', - 'Tissue', - 'Factor', - 'Treatment', - 'Replicate', - 'ControlID', - 'bamControl', - 'Peaks', - 'PeakCaller'] + 'bamReads', + 'Condition', + 'Tissue', + 'Factor', + 'Treatment', + 'Replicate', + 'ControlID', + 'bamControl', + 'Peaks', + 'PeakCaller'] design_diff.to_csv("design_diffPeaks.csv", header=True, sep=',', index=False) design_anno.to_csv("design_annotatePeaks.tsv", header=True, sep='\t', index=False) diff --git a/workflow/scripts/pool_and_psuedoreplicate.py b/workflow/scripts/pool_and_psuedoreplicate.py index 3481495e69af6d0a734052a4b4f391a917174f95..e050d7e034af1717c5ca3ad9b5fc198987f7bc2e 100644 --- a/workflow/scripts/pool_and_psuedoreplicate.py +++ b/workflow/scripts/pool_and_psuedoreplicate.py @@ -3,11 +3,14 @@ '''Generate pooled and pseudoreplicate from data.''' import argparse -import utils import logging +import os +import subprocess +import shutil +import shlex import pandas as pd import numpy as np -import os +import utils EPILOG = ''' For more details: @@ -21,6 +24,12 @@ logger.addHandler(logging.NullHandler()) logger.propagate = False logger.setLevel(logging.INFO) +# the order of this list is important. +# strip_extensions strips from the right inward, so +# the expected right-most extensions should appear first (like .gz) +# Modified from J. Seth Strattan +STRIP_EXTENSIONS = ['.gz', '.tagAlign', '.bedse', '.bedpe'] + def get_args(): '''Define arguments.''' @@ -87,21 +96,23 @@ def pool(tag_files, outfile, paired): else: file_extension = '.bedse.gz' - pooled_filename = outfile + file_extension + pool_basename = os.path.basename( + utils.strip_extensions(outfile, STRIP_EXTENSIONS)) + + pooled_filename = pool_basename + file_extension # Merge files out, err = utils.run_pipe([ 'gzip -dc %s' % (' '.join(tag_files)), - 'gzip -cn'], - outfile=pooled_filename) + 'gzip -cn'], outfile=pooled_filename) return pooled_filename def bedpe_to_tagalign(tag_file, outfile): - '''Convert read pairs to reads itno standard tagAlign file.''' + '''Convert read pairs to reads into standard tagAlign file.''' - se_tag_filename = outfile + "bedse.tagAlign.gz" + se_tag_filename = outfile + ".tagAlign.gz" # Convert read pairs to reads into standard tagAlign file tag_steps = ["zcat -f %s" % (tag_file)] @@ -122,7 +133,7 @@ def self_psuedoreplication(tag_file, prefix, paired): lines_per_rep = (no_lines+1)/2 # Make an array of number of psuedoreplicatesfile names - pseudoreplicate_dict = {r: prefix + '.pr' + str(r) + '.bedse.tagAlign.gz' + pseudoreplicate_dict = {r: prefix + '.pr' + str(r) + '.tagAlign.gz' for r in [0, 1]} # Shuffle and split file into equal parts @@ -132,21 +143,25 @@ def self_psuedoreplication(tag_file, prefix, paired): splits_prefix = 'temp_split' - out, err = utils.run_pipe([ - 'gzip -dc %s' % (tag_file), - 'shuf --random-source=%s' % (tag_file), - 'split -d -l %d - %s' % (lines_per_rep, splits_prefix)]) + psuedo_command = 'bash -c "zcat {} | shuf --random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f {} | wc -c) -nosalt </dev/zero 2>/dev/null) | ' + psuedo_command += 'split -d -l {} - {}."' + psuedo_command = psuedo_command.format( + tag_file, + tag_file, + int(lines_per_rep), + splits_prefix) + logger.info("Running psuedo with %s", psuedo_command) + subprocess.check_call(shlex.split(psuedo_command)) # Convert read pairs to reads into standard tagAlign file for i, index in enumerate([0, 1]): - string_index = '0' + str(index) + string_index = '.0' + str(index) steps = ['cat %s' % (splits_prefix + string_index)] if paired: steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""]) steps.extend(['gzip -cn']) out, err = utils.run_pipe(steps, outfile=pseudoreplicate_dict[i]) - os.remove(splits_prefix + string_index) return pseudoreplicate_dict @@ -213,7 +228,7 @@ def main(): # Update tagAlign with single end data if paired: design_new_df['tag_align'] = design_new_df['se_tag_align'] - design_new_df.drop(labels = 'se_tag_align', axis = 1, inplace = True) + design_new_df.drop(labels='se_tag_align', axis=1, inplace=True) design_new_df['replicate'] = design_new_df['replicate'].astype(str) design_new_df.at[1, 'sample_id'] = experiment_id + '_pr1' @@ -243,8 +258,6 @@ def main(): # Drop index column design_new_df.drop(labels='index', axis=1, inplace=True) - - else: # Make pool of replicates replicate_files = design_df.tag_align.unique() @@ -269,7 +282,7 @@ def main(): pool_pseudoreplicates_dict = {} for index, row in pseudoreplicates_df.iterrows(): replicate_id = index + 1 - pr_filename = experiment_id + ".pr" + str(replicate_id) + '.bedse.gz' + pr_filename = experiment_id + ".pr" + str(replicate_id) + '.tagAlign.gz' pool_replicate = pool(row, pr_filename, False) pool_pseudoreplicates_dict[replicate_id] = pool_replicate @@ -277,16 +290,16 @@ def main(): # Update tagAlign with single end data if paired: design_new_df['tag_align'] = design_new_df['se_tag_align'] - design_new_df.drop(labels = 'se_tag_align', axis = 1, inplace = True) + design_new_df.drop(labels='se_tag_align', axis=1, inplace=True) # Check controls against cutoff_ratio # if so replace with pool_control # unless single control was used if not single_control: path_to_pool_control = cwd + '/' + pool_control - if control_df.values.max() > 1.2: + if control_df.values.max() > cutoff_ratio: logger.info("Number of reads in controls differ by " + - " > factor of %f. Using pooled controls." % (cutoff_ratio)) + " > factor of %f. Using pooled controls." % (cutoff_ratio)) design_new_df['control_tag_align'] = path_to_pool_control else: for index, row in design_new_df.iterrows(): @@ -302,14 +315,14 @@ def main(): if paired: control = row['control_tag_align'] control_basename = os.path.basename( - utils.strip_extensions(control, ['.filt.nodup.bedpe.gz'])) - control_tmp = bedpe_to_tagalign(control , "control_basename") + utils.strip_extensions(control, STRIP_EXTENSIONS)) + control_tmp = bedpe_to_tagalign(control, control_basename) path_to_control = cwd + '/' + control_tmp design_new_df.loc[index, 'control_tag_align'] = \ path_to_control else: - path_to_pool_control = pool_control + path_to_pool_control = pool_control # Add in pseudo replicates tmp_metadata = design_new_df.loc[0].copy() @@ -330,10 +343,9 @@ def main(): tmp_metadata['tag_align'] = path_to_file design_new_df = design_new_df.append(tmp_metadata) - # Write out new dataframe design_new_df.to_csv(experiment_id + '_ppr.tsv', - header=True, sep='\t', index=False) + header=True, sep='\t', index=False) if __name__ == '__main__': diff --git a/workflow/scripts/runDeepTools.py b/workflow/scripts/runDeepTools.py deleted file mode 100644 index 800544dc87c89329bf050369367e0cba9b805f28..0000000000000000000000000000000000000000 --- a/workflow/scripts/runDeepTools.py +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/python -# programmer : bbc -# usage: - -import sys -import argparse as ap -import logging -import subprocess -import pandas as pd -from multiprocessing import Pool -logging.basicConfig(level=10) - - -def prepare_argparser(): - description = "Make wig file for given bed using bam" - epilog = "For command line options of each command, type %(prog)% COMMAND -h" - argparser = ap.ArgumentParser(description=description, epilog = epilog) - argparser.add_argument("-i","--input",dest = "infile",type=str,required=True, help="input BAM file") - argparser.add_argument("-g","--genome",dest = "genome",type=str,required=True, help="genome", default="hg19") - #argparser.add_argument("-b","--bed",dest="bedfile",type=str,required=True, help = "Gene locus in bed format") - #argparser.add_argument("-s","--strandtype",dest="stranded",type=str,default="none", choices=["none","reverse","yes"]) - #argparser.add_argument("-n","--name",dest="trackName",type=str,default="UserTrack",help = "track name for bedgraph header") - return(argparser) - -def run_qc(files, controls, labels): - mbs_command = "multiBamSummary bins --bamfiles "+' '.join(files)+" -out sample_mbs.npz" - p = subprocess.Popen(mbs_command, shell=True) - #logging.debug(mbs_command) - p.communicate() - pcor_command = "plotCorrelation -in sample_mbs.npz --corMethod spearman --skipZeros --plotTitle \"Spearman Correlation of Read Counts\" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers -o experiment.deeptools.heatmap_spearmanCorr_readCounts_v2.png --labels "+" ".join(labels) - #logging.debug(pcor_command) - p = subprocess.Popen(pcor_command, shell=True) - p.communicate() - #plotCoverage - pcov_command = "plotCoverage -b "+" ".join(files)+" --plotFile experiment.deeptools_coverage.png -n 1000000 --plotTitle \"sample coverage\" --ignoreDuplicates --minMappingQuality 10" - p = subprocess.Popen(pcov_command, shell=True) - p.communicate() - #draw fingerprints plots - for treat,ctrl,name in zip(files,controls,labels): - fp_command = "plotFingerprint -b "+treat+" "+ctrl+" --labels "+name+" control --plotFile "+name+".deeptools_fingerprints.png" - p = subprocess.Popen(fp_command, shell=True) - p.communicate() - -def bam2bw_wrapper(command): - p = subprocess.Popen(command, shell=True) - p.communicate() - -def run_signal(files, labels, genome): - #compute matrix and draw profile and heatmap - gene_bed = genome+"/gene.bed"#"/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/genome/"+genome+"/gene.bed" - bw_commands = [] - for f in files: - bw_commands.append("bamCoverage -bs 10 -b "+f+" -o "+f.replace("bam","bw")) - work_pool = Pool(min(len(files), 12)) - work_pool.map(bam2bw_wrapper, bw_commands) - work_pool.close() - work_pool.join() - - cm_command = "computeMatrix scale-regions -R "+gene_bed+" -a 3000 -b 3000 --regionBodyLength 5000 --skipZeros -S *.bw -o samples.deeptools_generegionscalematrix.gz" - p = subprocess.Popen(cm_command, shell=True) - p.communicate() - hm_command = "plotHeatmap -m samples.deeptools_generegionscalematrix.gz -out samples.deeptools_readsHeatmap.png" - p = subprocess.Popen(hm_command, shell=True) - p.communicate() - -def run(dfile,genome): - #parse dfile, suppose data files are the same folder as design file - dfile = pd.read_csv(dfile) - #QC: multiBamSummary and plotCorrelation - run_qc(dfile['bamReads'], dfile['bamControl'], dfile['SampleID']) - #signal plots - run_signal(dfile['bamReads'],dfile['SampleID'],genome) - -def main(): - argparser = prepare_argparser() - args = argparser.parse_args() - run(args.infile, args.genome) - - -if __name__=="__main__": - main() diff --git a/workflow/scripts/trim_reads.py b/workflow/scripts/trim_reads.py index 036c5f700eb61011cce4008230b60c634ea1c582..7fc54b0adec31cf0b8a62027fcd59c201e19aaa9 100644 --- a/workflow/scripts/trim_reads.py +++ b/workflow/scripts/trim_reads.py @@ -5,6 +5,7 @@ import subprocess import argparse import shutil +import os import logging EPILOG = ''' @@ -32,6 +33,10 @@ def get_args(): nargs='+', required=True) + parser.add_argument('-s', '--sample', + help="The name of the sample.", + required=True) + parser.add_argument('-p', '--paired', help="True/False if paired-end or single end.", default=False, @@ -49,6 +54,15 @@ def check_tools(): trimgalore_path = shutil.which("trim_galore") if trimgalore_path: logger.info('Found trimgalore: %s', trimgalore_path) + + # Get Version + trim_version_command = "trim_galore --version" + trimgalore_version = subprocess.check_output(trim_version_command, shell=True) + + # Write to file + trimgalore_file = open("version_trimgalore.txt", "wb") + trimgalore_file.write(trimgalore_version) + trimgalore_file.close() else: logger.error('Missing trimgalore') raise Exception('Missing trimgalore') @@ -56,11 +70,46 @@ def check_tools(): cutadapt_path = shutil.which("cutadapt") if cutadapt_path: logger.info('Found cutadapt: %s', cutadapt_path) + + # Get Version + cutadapt_version_command = "cutadapt --version" + cutadapt_version = subprocess.check_output(cutadapt_version_command, shell=True) + + # Write to file + cutadapt_file = open("version_cutadapt.txt", "wb") + cutadapt_file.write(b"Version %s" % (cutadapt_version)) + cutadapt_file.close() else: logger.error('Missing cutadapt') raise Exception('Missing cutadapt') +def rename_reads(fastq, sample, paired): + '''Rename fastq files by sample name.''' + + # Get current directory to build paths + cwd = os.getcwd() + + renamed_fastq = [] + + if paired: # paired-end data + # Set file names + renamed_fastq.append(cwd + '/' + sample + '_R1.fastq.gz') + renamed_fastq.append(cwd + '/' + sample + '_R2.fastq.gz') + + # Great symbolic links + os.symlink(fastq[0], renamed_fastq[0]) + os.symlink(fastq[1], renamed_fastq[1]) + else: + # Set file names + renamed_fastq.append(cwd + '/' + sample + '_R1.fastq.gz') + + # Great symbolic links + os.symlink(fastq[0], renamed_fastq[0]) + + return renamed_fastq + + def trim_reads(fastq, paired): '''Run trim_galore on 1 or 2 files.''' @@ -82,6 +131,7 @@ def trim_reads(fastq, paired): def main(): args = get_args() fastq = args.fastq + sample = args.sample paired = args.paired # Create a file handler @@ -91,8 +141,11 @@ def main(): # Check if tools are present check_tools() + # Rename fastq files by sample + fastq_rename = rename_reads(fastq, sample, paired) + # Run trim_reads - trim_reads(fastq, paired) + trim_reads(fastq_rename, paired) if __name__ == '__main__': diff --git a/workflow/scripts/utils.py b/workflow/scripts/utils.py index a50167367eec13fc0548bfbffbfa49fa8db58c60..6c2da13b056b386e454a309552f5a3623b1ef254 100644 --- a/workflow/scripts/utils.py +++ b/workflow/scripts/utils.py @@ -16,7 +16,6 @@ logger.propagate = True def run_pipe(steps, outfile=None): - # TODO: capture stderr from subprocess import Popen, PIPE p = None p_next = None @@ -98,13 +97,13 @@ def rescale_scores(filename, scores_col, new_min=10, new_max=1000): 'head -n 1 %s' % (sorted_fn), 'cut -f %s' % (scores_col)]) max_score = float(out.strip()) - logger.info("rescale_scores: max_score = %s" % (max_score)) + logger.info("rescale_scores: max_score = %s", max_score) out, err = run_pipe([ 'tail -n 1 %s' % (sorted_fn), 'cut -f %s' % (scores_col)]) min_score = float(out.strip()) - logger.info("rescale_scores: min_score = %s" % (min_score)) + logger.info("rescale_scores: min_score = %s", min_score) a = min_score b = max_score diff --git a/workflow/scripts/xcor.py b/workflow/scripts/xcor.py index f799137aa41caddf5f27729be9abb0eebeeb7482..14e4ed30143755b4deece75f8d2cb8f43bc4703e 100644 --- a/workflow/scripts/xcor.py +++ b/workflow/scripts/xcor.py @@ -5,6 +5,7 @@ import os import argparse import shutil +import subprocess import logging from multiprocessing import cpu_count import utils @@ -22,6 +23,13 @@ logger.propagate = False logger.setLevel(logging.INFO) +# the order of this list is important. +# strip_extensions strips from the right inward, so +# the expected right-most extensions should appear first (like .gz) +# Modified from J. Seth Strattan +STRIP_EXTENSIONS = ['.gz', '.tagAlign', '.bedse', '.bedpe'] + + def get_args(): '''Define arguments.''' @@ -52,28 +60,51 @@ def check_tools(): r_path = shutil.which("R") if r_path: logger.info('Found R: %s', r_path) + + # Get Version + r_version_command = "R --version" + r_version = subprocess.check_output(r_version_command, shell=True) + + # Write to file + r_file = open("version_r.txt", "wb") + r_file.write(r_version) + r_file.close() else: logger.error('Missing R') raise Exception('Missing R') + phantompeak_path = shutil.which("run_spp.R") + if phantompeak_path: + logger.info('Found phantompeak: %s', phantompeak_path) + + # Get Version + spp_version_command = "R -e \"packageVersion('spp')\"" + spp_version = subprocess.check_output(spp_version_command, shell=True) + + # Write to file + spp_file = open("version_spp.txt", "wb") + spp_file.write(spp_version) + spp_file.close() + else: + logger.error('Missing phantompeak') + raise Exception('Missing phantompeak') + def xcor(tag, paired): '''Use spp to calculate cross-correlation stats.''' - tag_basename = os.path.basename(utils.strip_extensions(tag, ['.gz'])) + tag_basename = os.path.basename(utils.strip_extensions(tag, STRIP_EXTENSIONS)) uncompressed_tag_filename = tag_basename - # Subsample tagAlign file - NREADS = 15000000 + number_reads = 15000000 subsampled_tag_filename = \ - tag_basename + ".%d.tagAlign.gz" % (NREADS/1000000) - + tag_basename + ".%d.tagAlign.gz" % (number_reads/1000000) steps = [ 'zcat %s' % (tag), 'grep -v "chrM"', - 'shuf -n %d --random-source=%s' % (NREADS, tag)] + 'shuf -n %d --random-source=%s' % (number_reads, tag)] if paired: steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""]) @@ -83,8 +114,8 @@ def xcor(tag, paired): out, err = utils.run_pipe(steps, outfile=subsampled_tag_filename) # Calculate Cross-correlation QC scores - cc_scores_filename = subsampled_tag_filename + ".cc.qc" - cc_plot_filename = subsampled_tag_filename + ".cc.plot.pdf" + cc_scores_filename = tag_basename + ".cc.qc" + cc_plot_filename = tag_basename + ".cc.plot.pdf" # CC_SCORE FILE format # Filename <tab> @@ -122,7 +153,7 @@ def main(): check_tools() # Calculate Cross-correlation - xcor_filename = xcor(tag, paired) + xcor(tag, paired) if __name__ == '__main__': diff --git a/workflow/tests/test_annotate_peaks.py b/workflow/tests/test_annotate_peaks.py index 692ddced7762a9a9742c6bf51e652673b62bdd1f..1342a125db90619ff19838d7cfb5c8c1be7b9d69 100644 --- a/workflow/tests/test_annotate_peaks.py +++ b/workflow/tests/test_annotate_peaks.py @@ -11,18 +11,34 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ @pytest.mark.singleend -def test_annotate_peaks_singleend(): +def test_pie_singleend(): assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC.chipseeker_pie.pdf')) + + +@pytest.mark.singleend +def test_upsetplot_singleend(): assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC.chipseeker_upsetplot.pdf')) + + +@pytest.mark.singleend +def test_annotation_singleend(): annotation_file = test_output_path + 'ENCSR238SGC.chipseeker_annotation.tsv' assert os.path.exists(annotation_file) - assert utils.count_lines(annotation_file) == 152840 + assert utils.count_lines(annotation_file) == 152255 @pytest.mark.pairedend -def test_annotate_peaks_pairedend(): +def test_pie_pairedend(): assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA.chipseeker_pie.pdf')) + + +@pytest.mark.pairedend +def test_upsetplot_pairedend(): assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA.chipseeker_upsetplot.pdf')) + + +@pytest.mark.pairedend +def test_annotation_pairedend(): annotation_file = test_output_path + 'ENCSR729LGA.chipseeker_annotation.tsv' assert os.path.exists(annotation_file) - assert utils.count_lines(annotation_file) == 25391 + assert utils.count_lines(annotation_file) == 25494 diff --git a/workflow/tests/test_call_peaks_macs.py b/workflow/tests/test_call_peaks_macs.py index 71358bc045801d23d28b8f7385c5ef9f608347b6..28881bd6e61a084ffa51a83647db9cd76ace6854 100644 --- a/workflow/tests/test_call_peaks_macs.py +++ b/workflow/tests/test_call_peaks_macs.py @@ -9,16 +9,42 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ @pytest.mark.singleend -def test_call_peaks_macs_singleend(): - assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT.fc_signal.bw')) - assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT.pvalue_signal.bw')) - peak_file = test_output_path + 'ENCLB144FDT_peaks.narrowPeak' +def test_fc_signal_singleend(): + assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC/1/', 'ENCLB144FDT.fc_signal.bw')) + + +@pytest.mark.singleend +def test_pvalue_signal_singleend(): + assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC/1/', 'ENCLB144FDT.pvalue_signal.bw')) + + +@pytest.mark.singleend +def test_peaks_xls_singleend(): + assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC/1/', 'ENCLB144FDT_peaks.xls')) + + +@pytest.mark.singleend +def test_peaks_bed_singleend(): + peak_file = test_output_path + 'ENCSR238SGC/1/' + 'ENCLB144FDT.narrowPeak' assert utils.count_lines(peak_file) == 227389 @pytest.mark.pairedend -def test_call_peaks_macs_pairedend(): - assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX.fc_signal.bw')) - assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX.pvalue_signal.bw')) - peak_file = test_output_path + 'ENCLB568IYX_peaks.narrowPeak' - assert utils.count_lines(peak_file) == 112652 +def test_fc_signal_pairedend(): + assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA/2/', 'ENCLB568IYX.fc_signal.bw')) + + +@pytest.mark.pairedend +def test_pvalue_signal_pairedend(): + assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA/2/', 'ENCLB568IYX.pvalue_signal.bw')) + + +@pytest.mark.pairedend +def test_peaks_xls_pairedend(): + assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA/2/', 'ENCLB568IYX_peaks.xls')) + + +@pytest.mark.pairedend +def test_peaks_bed_pairedend(): + peak_file = test_output_path + 'ENCSR729LGA/2/' + 'ENCLB568IYX.narrowPeak' + assert utils.count_lines(peak_file) == 113821 diff --git a/workflow/tests/test_check_design.py b/workflow/tests/test_check_design.py index 517c53c71cb31edb18c159b29636381aa66972d0..de02891773f3a090ea476ceee60a8c30f052c57d 100644 --- a/workflow/tests/test_check_design.py +++ b/workflow/tests/test_check_design.py @@ -63,6 +63,24 @@ def design_4(design): return design +@pytest.fixture +def design_5(design): + # Update sample_id to have -, spaces or periods + design.loc[design['sample_id'] == 'A_1', 'sample_id'] = 'A 1' + design.loc[design['sample_id'] == 'A_2', 'sample_id'] = 'A.2' + design.loc[design['sample_id'] == 'B_1', 'sample_id'] = 'B-1' + return design + + +@pytest.fixture +def design_6(design): + # Update experiment_id to have -, spaces or periods + design.loc[design['sample_id'] == 'A_1', 'experiment_id'] = 'A ChIP' + design.loc[design['sample_id'] == 'A_2', 'experiment_id'] = 'A.ChIP' + design.loc[design['sample_id'] == 'B_1', 'experiment_id'] = 'B-ChIP' + return design + + @pytest.fixture def fastq_files_1(fastq_files): # Drop B_2.fastq.gz @@ -115,10 +133,25 @@ def test_check_files_output_pairedend(design_3, fastq_files): assert new_design.loc[0, 'fastq_read2'] == "/path/to/file/A_2.fastq.gz" - @pytest.mark.unit def test_check_replicates(design_4): paired = False with pytest.raises(Exception) as excinfo: new_design = check_design.check_replicates(design_4) assert str(excinfo.value) == "Duplicate replicates in experiments: ['B']" + + +@pytest.mark.unit +def test_check_samples(design_5): + paired = False + with pytest.raises(Exception) as excinfo: + new_design = check_design.check_samples(design_5) + assert str(excinfo.value) == "Malformed samples from design file: ['A 1', 'A.2', 'B-1']" + + +@pytest.mark.unit +def test_check_experiments(design_6): + paired = False + with pytest.raises(Exception) as excinfo: + new_design = check_design.check_experiments(design_6) + assert str(excinfo.value) == "Malformed experiment from design file: ['A ChIP', 'A.ChIP', 'B-ChIP']" diff --git a/workflow/tests/test_convert_reads.py b/workflow/tests/test_convert_reads.py index ff8a003bfc91f5778648b238a2d94a1e04c366cc..bf4528234807779c38123bf02f66af57ce44bc21 100644 --- a/workflow/tests/test_convert_reads.py +++ b/workflow/tests/test_convert_reads.py @@ -8,12 +8,20 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ @pytest.mark.singleend -def test_convert_reads_singleend(): - assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.tagAlign.gz')) - assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bedse.gz')) +def test_tag_reads_singleend(): + assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI/ENCLB831RUI.tagAlign.gz')) + + +@pytest.mark.singleend +def test_bed_reads_singleend(): + assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI/ENCLB831RUI.bedse.gz')) + + +@pytest.mark.pairedend +def test_tag_reads_pairedend(): + assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX/ENCLB568IYX.tagAlign.gz')) @pytest.mark.pairedend -def test_map_qc_pairedend(): - assert os.path.exists(os.path.join(test_output_path, 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.tagAlign.gz')) - assert os.path.exists(os.path.join(test_output_path, 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.bedpe.gz')) +def test_bed_reads_pairedend(): + assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX/ENCLB568IYX.bedpe.gz')) diff --git a/workflow/tests/test_diff_peaks.py b/workflow/tests/test_diff_peaks.py index 1d48b6182efd600400b8d2032732dfbb62b704e7..8782f21065f387c18f1b58fcf7051d6222abc7f5 100644 --- a/workflow/tests/test_diff_peaks.py +++ b/workflow/tests/test_diff_peaks.py @@ -10,30 +10,65 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ '/../output/diffPeaks/' -@pytest.mark.singleend +@pytest.mark.singleskip_true def test_diff_peaks_singleend_single_rep(): assert os.path.isdir(test_output_path) == False + @pytest.mark.pairedend -def test_annotate_peaks_pairedend_single_rep(): +def test_diff_peaks_pairedend_single_rep(): assert os.path.isdir(test_output_path) == False + @pytest.mark.singlediff -def test_diff_peaks_singleend_multiple_rep(): +def test_heatmap_singleend_multiple_rep(): assert os.path.exists(os.path.join(test_output_path, 'heatmap.pdf')) + + +@pytest.mark.singlediff +def test_pca_singleend_multiple_rep(): assert os.path.exists(os.path.join(test_output_path, 'pca.pdf')) + + +@pytest.mark.singlediff +def test_normcount_singleend_multiple_rep(): assert os.path.exists(os.path.join(test_output_path, 'normcount_peaksets.txt')) - assert os.path.exists(os.path.join(test_output_path, 'ENCSR272GNQ_vs_ENCSR238SGC_diffbind.bed')) - diffbind_file = test_output_path + 'ENCSR272GNQ_vs_ENCSR238SGC_diffbind.csv' + + +@pytest.mark.singlediff +def test_diffbind_singleend_multiple_rep(): + if os.path.isfile(os.path.join(test_output_path, 'ENCSR272GNQ_vs_ENCSR238SGC_diffbind.bed')): + assert os.path.exists(os.path.join(test_output_path, 'ENCSR272GNQ_vs_ENCSR238SGC_diffbind.bed')) + diffbind_file = test_output_path + 'ENCSR272GNQ_vs_ENCSR238SGC_diffbind.csv' + elif os.path.isfile(os.path.join(test_output_path, 'ENCSR238SGC_vs_ENCSR272GNQ_diffbind.bed')): + assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC_vs_ENCSR272GNQ_diffbind.bed')) + diffbind_file = test_output_path + 'ENCSR238SGC_vs_ENCSR272GNQ_diffbind.csv' assert os.path.exists(diffbind_file) - assert utils.count_lines(diffbind_file) == 201039 + assert utils.count_lines(diffbind_file) == 197471 + @pytest.mark.paireddiff -def test_annotate_peaks_pairedend_single_rep(): +def test_heatmap_pairedend_single_rep(): assert os.path.exists(os.path.join(test_output_path, 'heatmap.pdf')) + + +@pytest.mark.paireddiff +def test_pca_pairedend_single_rep(): assert os.path.exists(os.path.join(test_output_path, 'pca.pdf')) + + +@pytest.mark.paireddiff +def test_normcount_pairedend_single_rep(): assert os.path.exists(os.path.join(test_output_path, 'normcount_peaksets.txt')) - assert os.path.exists(os.path.join(test_output_path, 'ENCSR757EMK_vs_ENCSR729LGA_diffbind.bed')) - diffbind_file = test_output_path + 'ENCSR757EMK_vs_ENCSR729LGA_diffbind.csv' + + +@pytest.mark.paireddiff +def test_diffbind_pairedend_single_rep(): + if os.path.isfile(os.path.join(test_output_path, 'ENCSR757EMK_vs_ENCSR729LGA_diffbind.bed')): + assert os.path.exists(os.path.join(test_output_path, 'ENCSR757EMK_vs_ENCSR729LGA_diffbind.bed')) + diffbind_file = test_output_path + 'ENCSR757EMK_vs_ENCSR729LGA_diffbind.csv' + elif os.path.isfile(os.path.join(test_output_path, 'ENCSR729LGA_vs_ENCSR757EMK_diffbind.bed')): + assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA_vs_ENCSR757EMK_diffbind.bed')) + diffbind_file = test_output_path + 'ENCSR729LGA_vs_ENCSR757EMK_diffbind.csv' assert os.path.exists(diffbind_file) assert utils.count_lines(diffbind_file) == 66201 diff --git a/workflow/tests/test_experiment_qc.py b/workflow/tests/test_experiment_qc.py index 5256da5fbebc424ea29a1977d9257bfc95060ae3..c0cd5d7d340b27b3217620ef4b12a1391841820b 100644 --- a/workflow/tests/test_experiment_qc.py +++ b/workflow/tests/test_experiment_qc.py @@ -31,17 +31,44 @@ def test_check_update_controls(design_bam): @pytest.mark.singleend -def test_experiment_qc_singleend(): +def test_coverage_singleend(): assert os.path.exists(os.path.join(test_output_path, 'sample_mbs.npz')) - assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.png')) - assert os.path.exists(os.path.join(test_output_path, 'coverage.png')) - assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT_fingerprint.png')) - assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI_fingerprint.png')) + assert os.path.exists(os.path.join(test_output_path, 'coverage.pdf')) + + +@pytest.mark.singleend +def test_spearman_singleend(): + assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.pdf')) + + +@pytest.mark.singleend +def test_pearson_singleend(): + assert os.path.exists(os.path.join(test_output_path, 'heatmap_PearsonCorr.pdf')) + + +@pytest.mark.singleend +def test_fingerprint_singleend(): + assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT_fingerprint.pdf')) + assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI_fingerprint.pdf')) + @pytest.mark.pairdend -def test_experiment_qc_pairedend(): +def test_coverage_pairedend(): assert os.path.exists(os.path.join(test_output_path, 'sample_mbs.npz')) - assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.png')) - assert os.path.exists(os.path.join(test_output_path, 'coverage.png')) - assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX_fingerprint.png')) - assert os.path.exists(os.path.join(test_output_path, 'ENCLB637LZP_fingerprint.png')) + assert os.path.exists(os.path.join(test_output_path, 'coverage.pdf')) + + +@pytest.mark.pairdend +def test_spearman_pairedend(): + assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.pdf')) + + +@pytest.mark.pairdend +def test_pearson_pairedend(): + assert os.path.exists(os.path.join(test_output_path, 'heatmap_PearsonCorr.pdf')) + + +@pytest.mark.pairdend +def test_fingerprint_pairedend(): + assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX_fingerprint.pdf')) + assert os.path.exists(os.path.join(test_output_path, 'ENCLB637LZP_fingerprint.pdf')) diff --git a/workflow/tests/test_generate_software_references.py b/workflow/tests/test_generate_software_references.py new file mode 100644 index 0000000000000000000000000000000000000000..ce3640ccaf644ae2c53c19608f9f9dd381f130ca --- /dev/null +++ b/workflow/tests/test_generate_software_references.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 + +import pytest +import os +import utils +import yaml + +test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ + '/../output/multiqcReport/' + + +@pytest.mark.singleend +def test_software_references(): + assert os.path.exists(os.path.join(test_output_path, 'software_references_mqc.yaml')) + + +@pytest.mark.singleend +def test_software_references_output(): + software_references = os.path.join(test_output_path, 'software_references_mqc.yaml') + with open(software_references, 'r') as stream: + data_loaded = yaml.load(stream) + + assert len(data_loaded['data'].split('<dt>')) == 17 diff --git a/workflow/tests/test_generate_software_versions.py b/workflow/tests/test_generate_software_versions.py new file mode 100644 index 0000000000000000000000000000000000000000..77c674051476e707a8d5ab556d1c4ff498d8e94d --- /dev/null +++ b/workflow/tests/test_generate_software_versions.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 + +import pytest +import os +import utils +import yaml + +test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ + '/../output/multiqcReport/' + + +@pytest.mark.singleend +def test_software_versions(): + assert os.path.exists(os.path.join(test_output_path, 'software_versions_mqc.yaml')) + + +@pytest.mark.singleend +def test_software_versions_output(): + software_versions = os.path.join(test_output_path, 'software_versions_mqc.yaml') + with open(software_versions, 'r') as stream: + data_loaded = yaml.load(stream) + + assert len(data_loaded['data'].split('<dt>')) == 17 diff --git a/workflow/tests/test_map_qc.py b/workflow/tests/test_map_qc.py index 95841df0aa2e6150da1136cdb295eb93651c5a18..ab94969b9699c204b30df31680fae088cf021c52 100644 --- a/workflow/tests/test_map_qc.py +++ b/workflow/tests/test_map_qc.py @@ -8,31 +8,49 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ '/../output/filterReads/' +@pytest.mark.singleend +def test_dedup_files_singleend(): + assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI/ENCLB831RUI.dedup.bam')) + assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI/ENCLB831RUI.dedup.bam.bai')) + assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI/ENCLB831RUI.dedup.qc')) + + @pytest.mark.singleend def test_map_qc_singleend(): - assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bam')) - assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bam.bai')) - filtered_reads_report = test_output_path + 'ENCFF646LXU.filt.nodup.flagstat.qc' + filtered_reads_report = test_output_path + 'ENCLB831RUI/ENCLB831RUI.dedup.flagstat.qc' samtools_report = open(filtered_reads_report).readlines() assert '64962570 + 0 in total' in samtools_report[0] assert '64962570 + 0 mapped (100.00%:N/A)' in samtools_report[4] - library_complexity = test_output_path + 'ENCFF646LXU.filt.nodup.pbc.qc' + + +@pytest.mark.singleend +def test_library_complexity_singleend(): + library_complexity = test_output_path + 'ENCLB831RUI/ENCLB831RUI.pbc.qc' df_library_complexity = pd.read_csv(library_complexity, sep='\t') assert df_library_complexity["NRF"].iloc[0] == 0.926192 assert df_library_complexity["PBC1"].iloc[0] == 0.926775 assert df_library_complexity["PBC2"].iloc[0] == 13.706885 +@pytest.mark.pairedend +def test_dedup_files_pairedend(): + assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX/ENCLB568IYX.dedup.bam')) + assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX/ENCLB568IYX.dedup.bam.bai')) + assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX/ENCLB568IYX.dedup.qc')) + + @pytest.mark.pairedend def test_map_qc_pairedend(): - assert os.path.exists(os.path.join(test_output_path, 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.bam')) - assert os.path.exists(os.path.join(test_output_path, 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.bam.bai')) - filtered_reads_report = test_output_path + 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.flagstat.qc' + filtered_reads_report = test_output_path + 'ENCLB568IYX/ENCLB568IYX.dedup.flagstat.qc' samtools_report = open(filtered_reads_report).readlines() - assert '47389080 + 0 in total' in samtools_report[0] - assert '47389080 + 0 mapped (100.00%:N/A)' in samtools_report[4] - library_complexity = test_output_path + 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.pbc.qc' + assert '47388510 + 0 in total' in samtools_report[0] + assert '47388510 + 0 mapped (100.00%:N/A)' in samtools_report[4] + + +@pytest.mark.pairedend +def test_library_complexity_pairedend(): + library_complexity = test_output_path + 'ENCLB568IYX/ENCLB568IYX.pbc.qc' df_library_complexity = pd.read_csv(library_complexity, sep='\t') assert df_library_complexity["NRF"].iloc[0] == 0.947064 - assert df_library_complexity["PBC1"].iloc[0] == 0.946724 - assert df_library_complexity["PBC2"].iloc[0] == 18.643039 + assert round(df_library_complexity["PBC1"].iloc[0],6) == 0.946723 + assert round(df_library_complexity["PBC2"].iloc[0],6) == 18.642645 diff --git a/workflow/tests/test_map_reads.py b/workflow/tests/test_map_reads.py index 328858216abb88528c2d25e06bce20d7d469f1cb..9ff817f452c24420788d6cbe75ce14a6989da167 100644 --- a/workflow/tests/test_map_reads.py +++ b/workflow/tests/test_map_reads.py @@ -9,8 +9,8 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ @pytest.mark.singleend def test_map_reads_singleend(): - assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.srt.bam')) - aligned_reads_report = test_output_path + 'ENCFF646LXU.srt.bam.flagstat.qc' + assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI/ENCLB831RUI.bam')) + aligned_reads_report = test_output_path + 'ENCLB831RUI/ENCLB831RUI.flagstat.qc' samtools_report = open(aligned_reads_report).readlines() assert '80795025 + 0 in total' in samtools_report[0] assert '80050072 + 0 mapped (99.08% : N/A)' in samtools_report[4] @@ -18,8 +18,8 @@ def test_map_reads_singleend(): @pytest.mark.pairedend def test_map_reads_pairedend(): - assert os.path.exists(os.path.join(test_output_path, 'ENCFF002DTU_val_1ENCFF002EFI_val_2.srt.bam')) - aligned_reads_report = test_output_path + 'ENCFF002DTU_val_1ENCFF002EFI_val_2.srt.bam.flagstat.qc' + assert os.path.exists(os.path.join(test_output_path, 'ENCLB678IDC/ENCLB678IDC.bam')) + aligned_reads_report = test_output_path + 'ENCLB678IDC/ENCLB678IDC.flagstat.qc' samtools_report = open(aligned_reads_report).readlines() assert '72660890 + 0 in total' in samtools_report[0] assert '72053925 + 0 mapped (99.16% : N/A)' in samtools_report[4] diff --git a/workflow/tests/test_motif_search.py b/workflow/tests/test_motif_search.py index ca84f4a60745e5f9080ba40fe148787b5bd740cf..28aec0466d45ae1bf1185b5696b1c5fb669ef1c6 100644 --- a/workflow/tests/test_motif_search.py +++ b/workflow/tests/test_motif_search.py @@ -10,18 +10,32 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ '/../output/motifSearch/' +@pytest.mark.singleend +def test_limited_peaks_singleend(): + peak_file_ENCSR238SGC = test_output_path + 'ENCSR238SGC.600.narrowPeak' + assert os.path.exists(peak_file_ENCSR238SGC) + assert utils.count_lines(peak_file_ENCSR238SGC) == 600 + + @pytest.mark.singleend def test_motif_search_singleend(): assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC_memechip', 'ENCSR238SGC.fa')) assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC_memechip', 'index.html')) - peak_file_ENCSR238SGC = test_output_path + 'sorted-ENCSR238SGC.replicated.narrowPeak' - assert os.path.exists(peak_file_ENCSR238SGC) - assert utils.count_lines(peak_file_ENCSR238SGC) == 600 + + +@pytest.mark.singleskip_true +def test_motif_search_singleend(): + assert os.path.isdir(test_output_path) == False + + +@pytest.mark.pairedend +def test_limited_peaks_pairedend(): + peak_file_ENCSR729LGA= test_output_path + 'ENCSR729LGA.600.narrowPeak' + assert os.path.exists(peak_file_ENCSR729LGA) + assert utils.count_lines(peak_file_ENCSR729LGA) == 600 + @pytest.mark.pairedend def test_motif_search_pairedend(): assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA_memechip', 'ENCSR729LGA.fa')) assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA_memechip', 'index.html')) - peak_file_ENCSR729LGA= test_output_path + 'sorted-ENCSR729LGA.replicated.narrowPeak' - assert os.path.exists(peak_file_ENCSR729LGA) - assert utils.count_lines(peak_file_ENCSR729LGA) == 600 diff --git a/workflow/tests/test_overlap_peaks.py b/workflow/tests/test_overlap_peaks.py index 99d43b87939617c801bad0389f14e2683cde0f82..ff61c937f42af911bba8e8e21acde80bd41a592e 100644 --- a/workflow/tests/test_overlap_peaks.py +++ b/workflow/tests/test_overlap_peaks.py @@ -37,11 +37,11 @@ def test_check_update_design(design_diff): def test_overlap_peaks_singleend(): assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC.rejected.narrowPeak')) peak_file = test_output_path + 'ENCSR238SGC.replicated.narrowPeak' - assert utils.count_lines(peak_file) == 152848 + assert utils.count_lines(peak_file) == 152262 @pytest.mark.pairedend def test_overlap_peaks_pairedend(): assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA.rejected.narrowPeak')) peak_file = test_output_path + 'ENCSR729LGA.replicated.narrowPeak' - assert utils.count_lines(peak_file) == 25655 + assert utils.count_lines(peak_file) == 25758 diff --git a/workflow/tests/test_trim_reads.py b/workflow/tests/test_trim_reads.py index 8e2176d7044392f9ed9e00801b3e14d1c627874d..e01707687b244fc3e7bf1b1b7fbd53797a43afbf 100644 --- a/workflow/tests/test_trim_reads.py +++ b/workflow/tests/test_trim_reads.py @@ -13,20 +13,28 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ @pytest.mark.singleend def test_trim_reads_singleend(): raw_fastq = test_data_path + 'ENCFF833BLU.fastq.gz' - trimmed_fastq = test_output_path + 'ENCFF833BLU_trimmed.fq.gz' - trimmed_fastq_report = test_output_path + \ - 'ENCFF833BLU.fastq.gz_trimming_report.txt' + trimmed_fastq = test_output_path + 'ENCLB144FDT/ENCLB144FDT_R1_trimmed.fq.gz' assert os.path.getsize(raw_fastq) != os.path.getsize(trimmed_fastq) assert os.path.getsize(trimmed_fastq) == 2512853101 + + +@pytest.mark.singleend +def test_trim_report_singleend(): + trimmed_fastq_report = test_output_path + \ + 'ENCLB144FDT/ENCLB144FDT_R1.fastq.gz_trimming_report.txt' assert 'Trimming mode: single-end' in open(trimmed_fastq_report).readlines()[4] @pytest.mark.pairedend def test_trim_reads_pairedend(): raw_fastq = test_data_path + 'ENCFF582IOZ.fastq.gz' - trimmed_fastq = test_output_path + 'ENCFF582IOZ_val_2.fq.gz' - trimmed_fastq_report = test_output_path + \ - 'ENCFF582IOZ.fastq.gz_trimming_report.txt' + trimmed_fastq = test_output_path + 'ENCLB637LZP/ENCLB637LZP_R2_val_2.fq.gz' assert os.path.getsize(raw_fastq) != os.path.getsize(trimmed_fastq) assert os.path.getsize(trimmed_fastq) == 2229312710 + + +@pytest.mark.pairedend +def test_trim_report_pairedend(): + trimmed_fastq_report = test_output_path + \ + 'ENCLB637LZP/ENCLB637LZP_R2.fastq.gz_trimming_report.txt' assert 'Trimming mode: paired-end' in open(trimmed_fastq_report).readlines()[4] diff --git a/workflow/tests/test_xcor.py b/workflow/tests/test_xcor.py index 2d3fdcff16d3a42c5af2e7c4beb6f777e0ddbba0..fd475943f3f60317c4e0d5dd1ef067cc23fd842d 100644 --- a/workflow/tests/test_xcor.py +++ b/workflow/tests/test_xcor.py @@ -9,9 +9,13 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ @pytest.mark.singleend -def test_cross_singleend(): - assert os.path.exists(os.path.join(test_output_path, 'ENCFF833BLU.filt.nodup.tagAlign.15.tagAlign.gz.cc.plot.pdf')) - qc_file = os.path.join(test_output_path,"ENCFF833BLU.filt.nodup.tagAlign.15.tagAlign.gz.cc.qc") +def test_cross_plot_singleend(): + assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT/ENCLB144FDT.cc.plot.pdf')) + + +@pytest.mark.singleend +def test_cross_qc_singleend(): + qc_file = os.path.join(test_output_path,"ENCLB144FDT/ENCLB144FDT.cc.qc") df_xcor = pd.read_csv(qc_file, sep="\t", header=None) assert df_xcor[2].iloc[0] == '190,200,210' assert df_xcor[8].iloc[0] == 1.025906 @@ -19,10 +23,14 @@ def test_cross_singleend(): @pytest.mark.pairedend -def test_cross_pairedend(): - assert os.path.exists(os.path.join(test_output_path, 'ENCFF582IOZ_val_2ENCFF957SQS_val_1.filt.nodup.tagAlign.15.tagAlign.gz.cc.plot.pdf')) - qc_file = os.path.join(test_output_path,"ENCFF582IOZ_val_2ENCFF957SQS_val_1.filt.nodup.tagAlign.15.tagAlign.gz.cc.qc") +def test_cross_qc_pairedend(): + assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX/ENCLB568IYX.cc.plot.pdf')) + + +@pytest.mark.pairedend +def test_cross_plot_pairedend(): + qc_file = os.path.join(test_output_path,"ENCLB568IYX/ENCLB568IYX.cc.qc") df_xcor = pd.read_csv(qc_file, sep="\t", header=None) - assert df_xcor[2].iloc[0] == '210,220,475' - assert round(df_xcor[8].iloc[0],6) == 1.062032 - assert df_xcor[9].iloc[0] == 3.737722 + assert df_xcor[2].iloc[0] == '220,430,475' + assert round(df_xcor[8].iloc[0],6) == 1.060018 + assert df_xcor[9].iloc[0] == 4.099357