diff --git a/CHANGELOG.md b/CHANGELOG.md index 317661b08f2d3bde216aa71f3e5b433af5d7adf0..bf5947140341380f402e5a180abd1d652139d0f0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,8 +5,11 @@ All notable changes to this project will be documented in this file. ## [Unreleased] - Fix references.md link in citation of README.md - Add Nextflow to references.md +- Fix pool_and_psuedoreplicate.py to run single experiment +- Add test data for test_pool_and_pseudoreplicate - Add PlotProfile Option - Add Python version to MultiQC +- Add and Update tests ## [publish_1.0.6 ] - 2019-05-31 ### Added diff --git a/README.md b/README.md index a5bcdeb373cd19ccafb4d8248c61a1af2a815599..ec8e8c7908178f3db77dfc753fec25b483d65a50 100644 --- a/README.md +++ b/README.md @@ -135,22 +135,22 @@ If you find an error, please let the [BICF](mailto:BICF@UTSouthwestern.edu) know Please cite individual programs and versions used [HERE](docs/references.md), and the pipeline doi:[10.5281/zenodo.2648844](https://doi.org/10.5281/zenodo.2648844). Please cite in publications: Pipeline was developed by BICF from funding provided by Cancer Prevention and Research Institute of Texas (RP150596). ## Programs and Versions - + python/3.6.1-2-anaconda [website](https://www.anaconda.com/download/#linux) [citation](docs/references.txt) - + trimgalore/0.4.1 [website](https://github.com/FelixKrueger/TrimGalore) [citation](docs/references.txt) - + cutadapt/1.9.1 [website](https://cutadapt.readthedocs.io/en/stable/index.html) [citation](docs/references.txt) - + bwa/intel/0.7.12 [website](http://bio-bwa.sourceforge.net/) [citation](docs/references.txt) - + samtools/1.6 [website](http://samtools.sourceforge.net/) [citation](docs/references.txt) - + sambamba/0.6.6 [website](http://lomereiter.github.io/sambamba/) [citation](docs/references.txt) - + bedtools/2.26.0 [website](https://bedtools.readthedocs.io/en/latest/) [citation](docs/references.txt) - + deeptools/2.5.0.1 [website](https://deeptools.readthedocs.io/en/develop/) [citation](docs/references.txt) - + phantompeakqualtools/1.2 [website](https://github.com/kundajelab/phantompeakqualtools) [citation](docs/references.txt) - + macs/2.1.0-20151222 [website](http://liulab.dfci.harvard.edu/MACS/) [citation](docs/references.txt) - + UCSC_userApps/v317 [website](https://genome.ucsc.edu/util.html) [citation](docs/references.txt) - + R/3.4.1 [website](https://www.r-project.org/) [citation](docs/references.txt) + + python/3.6.1-2-anaconda [website](https://www.anaconda.com/download/#linux) [citation](docs/references.md) + + trimgalore/0.4.1 [website](https://github.com/FelixKrueger/TrimGalore) [citation](docs/references.md) + + cutadapt/1.9.1 [website](https://cutadapt.readthedocs.io/en/stable/index.html) [citation](docs/references.md) + + bwa/intel/0.7.12 [website](http://bio-bwa.sourceforge.net/) [citation](docs/references.md) + + samtools/1.6 [website](http://samtools.sourceforge.net/) [citation](docs/references.md) + + sambamba/0.6.6 [website](http://lomereiter.github.io/sambamba/) [citation](docs/references.md) + + bedtools/2.26.0 [website](https://bedtools.readthedocs.io/en/latest/) [citation](docs/references.md) + + deeptools/2.5.0.1 [website](https://deeptools.readthedocs.io/en/develop/) [citation](docs/references.md) + + phantompeakqualtools/1.2 [website](https://github.com/kundajelab/phantompeakqualtools) [citation](docs/references.md) + + macs/2.1.0-20151222 [website](http://liulab.dfci.harvard.edu/MACS/) [citation](docs/references.md) + + UCSC_userApps/v317 [website](https://genome.ucsc.edu/util.html) [citation](docs/references.md) + + R/3.4.1 [website](https://www.r-project.org/) [citation](docs/references.md) + SPP/1.14 - + meme/4.11.1-gcc-openmpi [website](http://meme-suite.org/doc/install.html?man_type=web) [citation](docs/references.txt) - + ChIPseeker [website](https://bioconductor.org/packages/release/bioc/html/ChIPseeker.html) [citation](docs/references.txt) - + DiffBind [website](https://bioconductor.org/packages/release/bioc/html/DiffBind.html) [citation](docs/references.txt) + + meme/4.11.1-gcc-openmpi [website](http://meme-suite.org/doc/install.html?man_type=web) [citation](docs/references.md) + + ChIPseeker [website](https://bioconductor.org/packages/release/bioc/html/ChIPseeker.html) [citation](docs/references.md) + + DiffBind [website](https://bioconductor.org/packages/release/bioc/html/DiffBind.html) [citation](docs/references.md) diff --git a/docs/index.md b/docs/index.md index 193f3ec3a395fce2253c3b90e9e7e9de8a7359c2..3b4a685cfc112291a221c04d7efa4fb85db9545a 100644 --- a/docs/index.md +++ b/docs/index.md @@ -125,3 +125,6 @@ Please cite in publications: Pipeline was developed by BICF from funding provide + meme/4.11.1-gcc-openmpi [website](http://meme-suite.org/doc/install.html?man_type=web) [citation](https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/blob/master/docs/references.md) + ChIPseeker [website](https://bioconductor.org/packages/release/bioc/html/ChIPseeker.html) [citation](https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/blob/master/docs/references.md) + DiffBind [website](https://bioconductor.org/packages/release/bioc/html/DiffBind.html) [citation](https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/blob/master/docs/references.md) + + MultiQC [website](https://multiqc.info/) [citation](https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/blob/master/docs/references.md) + + BICFChip-seqAnalysisWorkflow [website](https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis) [citation](https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/blob/master/docs/references.md) + + Nextflow [website](https://www.nextflow.io/) [citation](https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/blob/master/docs/references.md) diff --git a/test_data/A_1.bedse.gz b/test_data/A_1.bedse.gz new file mode 100644 index 0000000000000000000000000000000000000000..4bbf615c84c514040c206bde70190b83fac1003e Binary files /dev/null and b/test_data/A_1.bedse.gz differ diff --git a/test_data/A_1.tagAlign.gz b/test_data/A_1.tagAlign.gz new file mode 100644 index 0000000000000000000000000000000000000000..b5cc068550fc3c850c014935e21586a89b5e8118 Binary files /dev/null and b/test_data/A_1.tagAlign.gz differ diff --git a/test_data/B_1.bedse.gz b/test_data/B_1.bedse.gz new file mode 100644 index 0000000000000000000000000000000000000000..4bbf615c84c514040c206bde70190b83fac1003e Binary files /dev/null and b/test_data/B_1.bedse.gz differ diff --git a/test_data/B_1.tagAlign.gz b/test_data/B_1.tagAlign.gz new file mode 100644 index 0000000000000000000000000000000000000000..b5cc068550fc3c850c014935e21586a89b5e8118 Binary files /dev/null and b/test_data/B_1.tagAlign.gz differ diff --git a/workflow/main.nf b/workflow/main.nf index e239d4082ce31b6509d31c3e47353eaf7a35ffa6..da287e857bdf38a5131992d6b1adfd7ff5a9cc1b 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -47,7 +47,7 @@ if (params.astrocyte) { params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ?: false : false params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false - params.gtf = params.genome ? params.genomes[ params.genome ].fasta ?: false : false + params.gtf = params.genome ? params.genomes[ params.genome ].gtf ?: false : false } diff --git a/workflow/scripts/pool_and_psuedoreplicate.py b/workflow/scripts/pool_and_psuedoreplicate.py index 07eac44c67414a043a3a33bd1c1ccc4a86eb9a87..6c37eed940ca89f71ebd3c49a2dae042f94d5b05 100644 --- a/workflow/scripts/pool_and_psuedoreplicate.py +++ b/workflow/scripts/pool_and_psuedoreplicate.py @@ -172,26 +172,7 @@ def self_psuedoreplication(tag_file, prefix, paired): return pseudoreplicate_dict -def main(): - args = get_args() - paired = args.paired - design = args.design - cutoff_ratio = args.cutoff - - # Create a file handler - handler = logging.FileHandler('experiment_generation.log') - logger.addHandler(handler) - - # Read files as dataframes - design_df = pd.read_csv(design, sep='\t') - - # Get current directory to build paths - cwd = os.getcwd() - - # Check Number of replicates and replicates - no_reps = check_replicates(design_df) - no_unique_controls = check_controls(design_df) - +def generate_design(paired, cutoff_ratio, design_df, cwd, no_reps, no_unique_controls): if no_reps == 1: logger.info("No other replicate specified " "so processing as an unreplicated experiment.") @@ -223,85 +204,42 @@ def main(): pool_control_tmp = bedpe_to_tagalign(pool_control, "pool_control") pool_control = pool_control_tmp - # Psuedoreplicate and update design accordingly - if not replicated: - - # Duplicate rows and update for pool and psuedoreplicates and update tagAlign with single end data - experiment_id = design_df.at[0, 'experiment_id'] - replicate = design_df.at[0, 'replicate'] - design_new_df = design_df.loc[np.repeat(design_df.index, 4)].reset_index() - - # Update tagAlign with single end data - if paired: - design_new_df['tag_align'] = design_new_df['se_tag_align'] - design_new_df.drop(labels='se_tag_align', axis=1, inplace=True) - - design_new_df['replicate'] = design_new_df['replicate'].astype(str) - design_new_df.at[1, 'sample_id'] = experiment_id + '_pr1' - design_new_df.at[1, 'replicate'] = '1_pr' - design_new_df.at[1, 'xcor'] = 'Calculate' - design_new_df.at[2, 'sample_id'] = experiment_id + '_pr2' - design_new_df.at[2, 'replicate'] = '2_pr' - design_new_df.at[2, 'xcor'] = 'Calculate' - design_new_df.at[3, 'sample_id'] = experiment_id + '_pooled' - design_new_df.at[3, 'replicate'] = 'pooled' - design_new_df.at[3, 'xcor'] = 'Calculate' - design_new_df.at[3, 'tag_align'] = design_new_df.at[0, 'tag_align'] - - # Make 2 self psuedoreplicates - self_pseudoreplicates_dict = {} - for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']): - replicate_prefix = experiment_id + '_' + str(rep) - self_pseudoreplicates_dict = \ - self_psuedoreplication(tag_file, replicate_prefix, paired) - - # Update design to include new self pseudo replicates - for rep, pseudorep_file in self_pseudoreplicates_dict.items(): - path_to_file = cwd + '/' + pseudorep_file - replicate = rep + 1 - design_new_df.loc[replicate, 'tag_align'] = path_to_file - - # Drop index column - design_new_df.drop(labels='index', axis=1, inplace=True) + # Duplicate rows and update for pool and psuedoreplicates and update tagAlign with single end data + experiment_id = design_df.at[0, 'experiment_id'] + replicate_files = design_df.tag_align.unique() + pool_experiment = pool(replicate_files, experiment_id + "_pooled", paired) + + # Make 2 self psuedoreplicates + pseudoreplicates_dict = {} + for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']): + replicate_prefix = experiment_id + '_' + str(rep) + pr_dict = self_psuedoreplication(tag_file, replicate_prefix, paired) + pseudoreplicates_dict[rep] = pr_dict + + # Update design to include new self pseudo replicates + pseudoreplicates_df = pd.DataFrame.from_dict(pseudoreplicates_dict) + pool_pseudoreplicates_dict = {} + for index, row in pseudoreplicates_df.iterrows(): + replicate_id = index + 1 + pr_filename = experiment_id + ".pr" + str(replicate_id) + '.tagAlign.gz' + pool_replicate = pool(row, pr_filename, False) + pool_pseudoreplicates_dict[replicate_id] = pool_replicate + + design_new_df = design_df #.loc[np.repeat(design_df.index, 4)].reset_index() + # Update tagAlign with single end data + if paired: + design_new_df['tag_align'] = design_new_df['se_tag_align'] + design_new_df.drop(labels='se_tag_align', axis=1, inplace=True) + # If paired change to single End + if paired: + pool_experiment_se = bedpe_to_tagalign(pool_experiment, experiment_id + "_pooled") else: - # Make pool of replicates - replicate_files = design_df.tag_align.unique() - experiment_id = design_df.at[0, 'experiment_id'] - pool_experiment = pool(replicate_files, experiment_id + "_pooled", paired) + pool_experiment_se = pool_experiment - # If paired change to single End - if paired: - pool_experiment_se = bedpe_to_tagalign(pool_experiment, experiment_id + "_pooled") - else: - pool_experiment_se = pool_experiment - - # Make self psuedoreplicates equivalent to number of replicates - pseudoreplicates_dict = {} - for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']): - replicate_prefix = experiment_id + '_' + str(rep) - pr_dict = self_psuedoreplication(tag_file, replicate_prefix, paired) - pseudoreplicates_dict[rep] = pr_dict - - # Merge self psuedoreplication for each true replicate - pseudoreplicates_df = pd.DataFrame.from_dict(pseudoreplicates_dict) - pool_pseudoreplicates_dict = {} - for index, row in pseudoreplicates_df.iterrows(): - replicate_id = index + 1 - pr_filename = experiment_id + ".pr" + str(replicate_id) + '.tagAlign.gz' - pool_replicate = pool(row, pr_filename, False) - pool_pseudoreplicates_dict[replicate_id] = pool_replicate - - design_new_df = design_df - # Update tagAlign with single end data - if paired: - design_new_df['tag_align'] = design_new_df['se_tag_align'] - design_new_df.drop(labels='se_tag_align', axis=1, inplace=True) # Check controls against cutoff_ratio # if so replace with pool_control # unless single control was used - - if not single_control: path_to_pool_control = cwd + '/' + pool_control if control_df.values.max() > cutoff_ratio: @@ -351,7 +289,34 @@ def main(): tmp_metadata['tag_align'] = path_to_file design_new_df = design_new_df.append(tmp_metadata) + return design_new_df + + +def main(): + args = get_args() + paired = args.paired + design = args.design + cutoff_ratio = args.cutoff + + # Create a file handler + handler = logging.FileHandler('experiment_generation.log') + logger.addHandler(handler) + + # Read files as dataframes + design_df = pd.read_csv(design, sep='\t') + + # Get current directory to build paths + cwd = os.getcwd() + + # Check Number of replicates and replicates + no_reps = check_replicates(design_df) + no_unique_controls = check_controls(design_df) + + # Generate new design file + design_new_df = generate_design(paired, cutoff_ratio, design_df, cwd, no_reps, no_unique_controls) + # Write out new dataframe + experiment_id = design_df.at[0, 'experiment_id'] design_new_df.to_csv(experiment_id + '_ppr.tsv', header=True, sep='\t', index=False) diff --git a/workflow/tests/test_pool_and_psuedoreplicate.py b/workflow/tests/test_pool_and_psuedoreplicate.py index 31fffc57e7eaa598a933c97c92b1c901ba731ba7..f251e888f906de1c7e360502cadf45c8c9b85a3c 100644 --- a/workflow/tests/test_pool_and_psuedoreplicate.py +++ b/workflow/tests/test_pool_and_psuedoreplicate.py @@ -5,16 +5,18 @@ import pandas as pd from io import StringIO import os import pool_and_psuedoreplicate +import shutil +test_design_path = os.path.dirname(os.path.abspath(__file__)) + \ + '/../../test_data/' test_output_path = os.path.dirname(os.path.abspath(__file__)) + \ '/../output/design/' -DESIGN_STRING = """sample_id\ttag_align\txcor\tbiosample\tfactor\ttreatment\treplicate\tcontrol_tag_align -A_1\tA_1.bedse.gz\tA_1.cc.qc\tLiver\tH3K27ac\tNone\t1\tB_1.bedse.gz -A_2\tA_2.bedse.gz\tA_2.cc.qc\tLiver\tH3K27ac\tNone\t2\tB_2.bedse.gz +DESIGN_STRING = """sample_id\tse_tag_align\ttag_align\txcor\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\tcontrol_tag_align +A_1\tA_1.tagAlign.gz\tA_1.bedse.gz\tA_1.cc.qc\tA\tLiver\tH3K27ac\tNone\t1\tB_1\tB_1.bedse.gz +A_2\tA_2.tagAlign.gz\tA_2.bedse.gz\tA_2.cc.qc\tA\tLiver\tH3K27ac\tNone\t2\tB_2\tB_2.bedse.gz """ - @pytest.fixture def design_experiment(): design_file = StringIO(DESIGN_STRING) @@ -60,6 +62,16 @@ def test_check_controls_single(design_experiment_3): assert no_controls == 1 +@pytest.mark.unit +def test_single_rep(design_experiment_2): + cwd = os.getcwd() + shutil.copy(test_design_path + 'A_1.bedse.gz', cwd) + shutil.copy(test_design_path + 'B_1.bedse.gz', cwd) + shutil.copy(test_design_path + 'A_1.tagAlign.gz', cwd) + shutil.copy(test_design_path + 'B_1.tagAlign.gz', cwd) + single_rep = pool_and_psuedoreplicate.generate_design('false', 1.2, design_experiment_2, cwd, 1, 1) + assert single_rep.shape[0] == 4 + @pytest.mark.singleend def test_pool_and_psuedoreplicate_singleend(): design_file = os.path.join(test_output_path, 'ENCSR238SGC_ppr.tsv')