diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 128756e1c26f5b1b7d4d9609675e67ca593c31f5..3e6393ed8d83c8b402f9b37d1713fe2244c49433 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,7 +1,7 @@ before_script: - module add python/3.6.1-2-anaconda - pip install --user pytest-pythonpath==0.7.1 pytest-cov==2.5.1 - - module load nextflow/0.31.0 + - module load nextflow/0.31.0 - ln -s /project/shared/bicf_workflow_ref/workflow_testdata/chipseq/*fastq.gz test_data/ stages: @@ -76,7 +76,7 @@ single_end_skip: only: - master script: - - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_SE.txt" --genome 'GRCm38' --skipDiff true --skipMotif true --astrocyte false -resume + - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_SE.txt" --genome 'GRCm38' --skipDiff true --skipMotif true --skipPlotProfile true --astrocyte false -resume - pytest -m singleskip_true artifacts: expire_in: 2 days diff --git a/CHANGELOG.md b/CHANGELOG.md index 99f5a5c6dddd9c015c86cbc3fe5f110fbfee7bd9..317661b08f2d3bde216aa71f3e5b433af5d7adf0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ All notable changes to this project will be documented in this file. ## [Unreleased] - Fix references.md link in citation of README.md - Add Nextflow to references.md +- Add PlotProfile Option +- Add Python version to MultiQC ## [publish_1.0.6 ] - 2019-05-31 ### Added diff --git a/README.md b/README.md index e2361654d9dda12c7692d8bd1f8e8edd95f0fda4..a5bcdeb373cd19ccafb4d8248c61a1af2a815599 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,8 @@ diffPeaks | normcount_peaksets.txt | Use only for replicated samples; peak set v diffPeaks | pca.pdf | Use only for replicated samples; PCA of peak location and peak intensity diffPeaks | *_diffbind.bed | Use only for replicated samples; bed file of peak locations between replicates diffPeaks | *_diffbind.csv | Use only for replicated samples; CSV file of peaks between replicates +plotProfile | plotProfile.png | Plot profile of the TSS region +plotProfile | computeMatrix.gz | Compute Matrix from deeptools to create custom plots other than plotProfile ## Common Quality Control Metrics + These are the list of files that should be reviewed before continuing on with the CHIPseq experiment. If your experiment fails any of these metrics, you should pause and re-evaluate whether the data should remain in the study. diff --git a/astrocyte_pkg.yml b/astrocyte_pkg.yml index ff68685e488ac30e45c0d2a863cdf131b71c41cc..704bc8fa415a0e624163971f225f034115f4c9f3 100644 --- a/astrocyte_pkg.yml +++ b/astrocyte_pkg.yml @@ -142,6 +142,15 @@ workflow_parameters: description: | Skip motif calling + - id: skipPlotProfile + type: select + required: true + choices: + - [ 'true', 'true'] + - [ 'false', 'false'] + description: | + Skip Plot Profile Analysis + - id: astrocyte type: select choices: diff --git a/docs/index.md b/docs/index.md index 18ae6ed89ce26535ecd5b055013b94c693706866..193f3ec3a395fce2253c3b90e9e7e9de8a7359c2 100644 --- a/docs/index.md +++ b/docs/index.md @@ -90,6 +90,8 @@ diffPeaks | normcount_peaksets.txt | Use only for replicated samples; peak set v diffPeaks | pca.pdf | Use only for replicated samples; PCA of peak location and peak intensity diffPeaks | *_diffbind.bed | Use only for replicated samples; bed file of peak locations between replicates diffPeaks | *_diffbind.csv | Use only for replicated samples; CSV file of peaks between replicates +plotProfile | plotProfile.png | Plot profile of the TSS region +plotProfile | computeMatrix.gz | Compute Matrix from deeptools to create custom plots other than plotProfile ## Common Quality Control Metrics + These are the list of files that should be reviewed before continuing on with the CHIPseq experiment. If your experiment fails any of these metrics, you should pause and re-evaluate whether the data should remain in the study. diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index 6e2e885661a011956aaf1bac1e4fb0da3969d686..7237cad47346ee0699fcadcf73dcd2a70f3431a5 100644 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -44,6 +44,10 @@ process { module = ['python/3.6.1-2-anaconda', 'macs/2.1.0-20151222', 'UCSC_userApps/v317', 'bedtools/2.26.0', 'phantompeakqualtools/1.2'] queue = '128GB,256GB,256GBv1' } + withName: plotProfile { + module = ['deeptools/2.5.0.1'] + cpus = 32 + } withName: consensusPeaks { module = ['python/3.6.1-2-anaconda', 'bedtools/2.26.0'] executor = 'local' @@ -74,18 +78,21 @@ params { genomesize = 'hs' chromsizes = '/project/shared/bicf_workflow_ref/GRCh38/genomefile.txt' fasta = '/project/shared/bicf_workflow_ref/GRCh38/genome.fa' + gtf = '/project/shared/bicf_workflow_ref/GRCh38/gencode.gtf' } 'GRCh37' { bwa = '/project/shared/bicf_workflow_ref/GRCh37' genomesize = 'hs' chromsizes = '/project/shared/bicf_workflow_ref/GRCh37/genomefile.txt' fasta = '/project/shared/bicf_workflow_ref/GRCh37/genome.fa' + gtf = '/project/shared/bicf_workflow_ref/GRCh37/gencode.gtf' } 'GRCm38' { bwa = '/project/shared/bicf_workflow_ref/GRCm38' genomesize = 'mm' chromsizes = '/project/shared/bicf_workflow_ref/GRCm38/genomefile.txt' fasta = '/project/shared/bicf_workflow_ref/GRCm38/genome.fa' + gtf = '/project/shared/bicf_workflow_ref/GRCm38/gencode.gtf' } } } diff --git a/workflow/main.nf b/workflow/main.nf index 9b45adb8235ef8964ac0d00fc86fd47b12d5cf0a..e239d4082ce31b6509d31c3e47353eaf7a35ffa6 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -25,6 +25,7 @@ params.topPeakCount = 600 params.astrocyte = false params.skipDiff = false params.skipMotif = false +params.skipPlotProfile = false params.references = "$baseDir/../docs/references.md" params.multiqc = "$baseDir/conf/multiqc_config.yaml" @@ -35,6 +36,7 @@ if (params.astrocyte) { params.bwaIndex = "$referenceLocation/$params.genome" params.chromSizes = "$referenceLocation/$params.genome/genomefile.txt" params.fasta = "$referenceLocation/$params.genome/genome.fa" + params.gtf = "$referenceLocation/$params.genome/gencode.gtf" if (params.genome == 'GRCh37' || params.genome == 'GRCh38') { params.genomeSize = 'hs' } else if (params.genome == 'GRCm38') { @@ -45,6 +47,7 @@ if (params.astrocyte) { params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ?: false : false params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false + params.gtf = params.genome ? params.genomes[ params.genome ].fasta ?: false : false } @@ -78,8 +81,10 @@ extendReadsLen = params.extendReadsLen topPeakCount = params.topPeakCount skipDiff = params.skipDiff skipMotif = params.skipMotif +skipPlotProfile = params.skipPlotProfile references = params.references multiqc = params.multiqc +gtfFile = Channel.fromPath(params.gtf) // Check design file for errors process checkDesignFile { @@ -424,6 +429,7 @@ process callPeaksMACS { set sampleId, file('*.narrowPeak'), file('*.fc_signal.bw'), file('*.pvalue_signal.bw'), experimentId, biosample, factor, treatment, replicate, controlId into experimentPeaks file '*.xls' into callPeaksMACSsummit file('version_*.txt') into callPeaksMACSVersions + file("*.fc_signal.bw") into bigwigs script: @@ -456,6 +462,30 @@ peaksDesign = experimentPeaks "$sampleId\t$peak\t$fcSignal\t$pvalueSignal\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"} .collectFile(name:'design_peak.tsv', seed:"sample_id\tpeaks\tfc_signal\tpvalue_signal\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$outDir/design") +//plotProfile +process plotProfile { + publishDir "$outDir/experimentQC", mode: 'copy' + + input: + + file ("*.pooled.fc_signal.bw") from bigwigs.collect() + file gtf from gtfFile + + output: + + file '*.{png,gz}' into plotProfile + + when: + + !skipPlotProfile + + script: + """ + module load deeptools/2.5.0.1 + bash $baseDir/scripts/plotProfile.sh + """ +} + // Calculate Consensus Peaks process consensusPeaks { @@ -575,7 +605,6 @@ process diffPeaks { // Generate Multiqc Report, gerernate Software Versions and references process multiqcReport { - publishDir "$outDir/${task.process}", mode: 'copy' input: @@ -611,6 +640,7 @@ process multiqcReport { module load multiqc/1.7 echo $workflow.nextflow.version > version_nextflow.txt multiqc --version > version_multiqc.txt + python --version &> version_python.txt python3 $baseDir/scripts/generate_references.py -r $references -o software_references python3 $baseDir/scripts/generate_versions.py -o software_versions multiqc -c $multiqc . diff --git a/workflow/nextflow.config b/workflow/nextflow.config index f9fbe846df7cbce559c0e8ec87097921792640a2..e1dd50f4e24db974e055d648003c901258f2c8c2 100644 --- a/workflow/nextflow.config +++ b/workflow/nextflow.config @@ -7,8 +7,8 @@ profiles { manifest { name = 'chipseq_analysis' description = 'BICF ChIP-seq Analysis Workflow.' - homePage = 'https://github.com/nf-core/rnaseq' - version = '1.0.0' + homePage = 'https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis' + version = '1.0.6' mainScript = 'main.nf' nextflowVersion = '>=0.31.0' } diff --git a/workflow/scripts/generate_versions.py b/workflow/scripts/generate_versions.py index 98c319331b7ff3574cb89bda39e0cc41397e7ded..4f0d8b143b36efbe207a303b2eb164d0de1a0e8a 100644 --- a/workflow/scripts/generate_versions.py +++ b/workflow/scripts/generate_versions.py @@ -46,6 +46,7 @@ SOFTWARE_REGEX = { 'MEME-ChIP': ['motifSearch_vf/version_memechip.txt', r"Version (\S+)"], 'DiffBind': ['diffPeaks_vf/version_DiffBind.txt', r"Version (\S+)\""], 'deepTools': ['experimentQC_vf/version_deeptools.txt', r"deeptools (\S+)"], + 'Python': ['version_python.txt', r"python, version (\S+)"], 'MultiQC': ['version_multiqc.txt', r"multiqc, version (\S+)"], } @@ -108,6 +109,7 @@ def main(): results['DiffBind'] = '<span style="color:#999999;\">Not Run</span>' results['deepTools'] = '<span style="color:#999999;\">Not Run</span>' results['MultiQC'] = '<span style="color:#999999;\">Not Run</span>' + results['Python'] = '<span style="color:#999999;\">Not Run</span>' # list all files files = glob.glob('**/*.txt', recursive=True) diff --git a/workflow/scripts/plotProfile.sh b/workflow/scripts/plotProfile.sh new file mode 100644 index 0000000000000000000000000000000000000000..7f62dc5bdd9e13026315222b076b8b939f839d00 --- /dev/null +++ b/workflow/scripts/plotProfile.sh @@ -0,0 +1,15 @@ +#!/bin/bash +#plotProfile.sh + +bws=$(ls *.bw) +gtf=$(ls *.gtf *.bed) + +computeMatrix reference-point \ + --referencePoint TSS \ + -S $bws \ + -R $gtf \ + --skipZeros \ + -o computeMatrix.gz + +plotProfile -m computeMatrix.gz \ + -out plotProfile.png \ diff --git a/workflow/scripts/pool_and_psuedoreplicate.py b/workflow/scripts/pool_and_psuedoreplicate.py index f6ba5a8246d944a5c7e5c9bc0b9d06d045b0dbbf..07eac44c67414a043a3a33bd1c1ccc4a86eb9a87 100644 --- a/workflow/scripts/pool_and_psuedoreplicate.py +++ b/workflow/scripts/pool_and_psuedoreplicate.py @@ -301,55 +301,56 @@ def main(): # if so replace with pool_control # unless single control was used - if not single_control: - path_to_pool_control = cwd + '/' + pool_control - if control_df.values.max() > cutoff_ratio: - logger.info("Number of reads in controls differ by " + - " > factor of %f. Using pooled controls." % (cutoff_ratio)) - design_new_df['control_tag_align'] = path_to_pool_control - else: - for index, row in design_new_df.iterrows(): - exp_no_reads = utils.count_lines(row['tag_align']) - con_no_reads = utils.count_lines(row['control_tag_align']) - if con_no_reads < exp_no_reads: - logger.info("Fewer reads in control than experiment." + - "Using pooled controls for replicate %s." - % row['replicate']) - design_new_df.loc[index, 'control_tag_align'] = \ - path_to_pool_control - else: - if paired: - control = row['control_tag_align'] - control_basename = os.path.basename( - utils.strip_extensions(control, STRIP_EXTENSIONS)) - control_tmp = bedpe_to_tagalign(control, control_basename) - path_to_control = cwd + '/' + control_tmp - design_new_df.loc[index, 'control_tag_align'] = \ - path_to_control - else: - path_to_pool_control = cwd + '/' + pool_control + if not single_control: + path_to_pool_control = cwd + '/' + pool_control + if control_df.values.max() > cutoff_ratio: + logger.info("Number of reads in controls differ by " + + " > factor of %f. Using pooled controls." % (cutoff_ratio)) design_new_df['control_tag_align'] = path_to_pool_control + else: + for index, row in design_new_df.iterrows(): + exp_no_reads = utils.count_lines(row['tag_align']) + con_no_reads = utils.count_lines(row['control_tag_align']) + if con_no_reads < exp_no_reads: + logger.info("Fewer reads in control than experiment." + + "Using pooled controls for replicate %s." + % row['replicate']) + design_new_df.loc[index, 'control_tag_align'] = \ + path_to_pool_control + else: + if paired: + control = row['control_tag_align'] + control_basename = os.path.basename( + utils.strip_extensions(control, STRIP_EXTENSIONS)) + control_tmp = bedpe_to_tagalign(control, control_basename) + path_to_control = cwd + '/' + control_tmp + design_new_df.loc[index, 'control_tag_align'] = \ + path_to_control - # Add in pseudo replicates - tmp_metadata = design_new_df.loc[0].copy() - tmp_metadata['control_tag_align'] = path_to_pool_control - for rep, pseudorep_file in pool_pseudoreplicates_dict.items(): - tmp_metadata['sample_id'] = experiment_id + '_pr' + str(rep) - tmp_metadata['replicate'] = str(rep) + '_pr' - tmp_metadata['xcor'] = 'Calculate' - path_to_file = cwd + '/' + pseudorep_file - tmp_metadata['tag_align'] = path_to_file - design_new_df = design_new_df.append(tmp_metadata) - - # Add in pool experiment - tmp_metadata['sample_id'] = experiment_id + '_pooled' - tmp_metadata['replicate'] = 'pooled' + else: + path_to_pool_control = cwd + '/' + pool_control + design_new_df['control_tag_align'] = path_to_pool_control + + # Add in pseudo replicates + tmp_metadata = design_new_df.loc[0].copy() + tmp_metadata['control_tag_align'] = path_to_pool_control + for rep, pseudorep_file in pool_pseudoreplicates_dict.items(): + tmp_metadata['sample_id'] = experiment_id + '_pr' + str(rep) + tmp_metadata['replicate'] = str(rep) + '_pr' tmp_metadata['xcor'] = 'Calculate' - path_to_file = cwd + '/' + pool_experiment_se + path_to_file = cwd + '/' + pseudorep_file tmp_metadata['tag_align'] = path_to_file design_new_df = design_new_df.append(tmp_metadata) + # Add in pool experiment + tmp_metadata['sample_id'] = experiment_id + '_pooled' + tmp_metadata['replicate'] = 'pooled' + tmp_metadata['xcor'] = 'Calculate' + path_to_file = cwd + '/' + pool_experiment_se + tmp_metadata['tag_align'] = path_to_file + design_new_df = design_new_df.append(tmp_metadata) + # Write out new dataframe design_new_df.to_csv(experiment_id + '_ppr.tsv', header=True, sep='\t', index=False) diff --git a/workflow/tests/test_plot_profile.py b/workflow/tests/test_plot_profile.py new file mode 100644 index 0000000000000000000000000000000000000000..6c9605d6d654f66adec865372223e13ddeaf6b19 --- /dev/null +++ b/workflow/tests/test_plot_profile.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 + +import pytest +import os +import utils + +test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ + '/../output/experimentQC/' + + +@pytest.mark.singleend +def test_plot_singleend(): + assert os.path.exists(os.path.join(test_output_path, 'plotProfile.png')) + + +@pytest.mark.pairedend +def test_plot_pairedend(): + assert os.path.exists(os.path.join(test_output_path, 'computeMatrix.gz'))