diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 10cbc0e6d358c818a2e94d75199bdd9c2300f93b..2e7d111950c506e8f496977b45a6789c11f1e7b7 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -10,6 +10,7 @@ stages: - single - multiple - skip + - cleanup user_configuration: stage: unit @@ -26,19 +27,19 @@ bash_tests: astrocyte: stage: astrocyte script: - - module load astrocyte/0.1.0 + - module load astrocyte/0.2.0 - module unload nextflow - cd .. - astrocyte_cli validate chipseq_analysis - artifacts: - expire_in: 2 days + after_script: + - rm -rf work/ single_end_mouse: stage: single only: - master script: - - nextflow run workflow/main.nf --astrocyte true -resume + - NXF_OPTS="-Dleveldb.mmap=false" nextflow run workflow/main.nf --astrocyte true - pytest -m singleend paired_end_human: @@ -48,9 +49,19 @@ paired_end_human: except: - master script: - - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_ENCSR729LGA_PE.txt" --genome 'GRCh38' --pairedEnd true --astrocyte false -resume + - NXF_OPTS="-Dleveldb.mmap=false" nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_ENCSR729LGA_PE.txt" --genome 'GRCh38' --pairedEnd true --astrocyte false - pytest -m pairedend +single_end_single_control: + stage: single + only: + - branches + except: + - master + script: + - NXF_OPTS="-Dleveldb.mmap=false" nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_single_contol_SE.txt" --genome 'GRCh38' --pairedEnd false --astrocyte false + - pytest -m singlecontrol + single_end_diff: stage: multiple only: @@ -58,7 +69,7 @@ single_end_diff: except: - master script: - - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_SE.txt" --genome 'GRCm38' --astrocyte false -resume + - NXF_OPTS="-Dleveldb.mmap=false" nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_SE.txt" --genome 'GRCm38' --astrocyte false - pytest -m singleend - pytest -m singlediff @@ -67,7 +78,7 @@ paired_end_diff: - master stage: multiple script: - - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_PE.txt" --genome 'GRCh38' --pairedEnd true --astrocyte false -resume + - NXF_OPTS="-Dleveldb.mmap=false" nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_PE.txt" --genome 'GRCh38' --pairedEnd true --astrocyte false - pytest -m pairedend - pytest -m paireddiff @@ -76,5 +87,12 @@ single_end_skip: only: - master script: - - nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_SE.txt" --genome 'GRCm38' --skipDiff true --skipMotif true --skipPlotProfile true --astrocyte false -resume + - NXF_OPTS="-Dleveldb.mmap=false" nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_SE.txt" --genome 'GRCm38' --skipDiff true --skipMotif true --skipPlotProfile true --astrocyte false - pytest -m singleskip_true + + +cleanup_job: + stage: cleanup + script: + - cd $CI_BUILDS_DIR/$CI_RUNNER_SHORT_TOKEN/$CI_PROJECT_NAME + - rm -fr $CI_PIPELINE_ID/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 5c08ab63c3fd19a0995ab5741fdf0e124db4571a..b73f5aec00dfbc3bc42a66dc84e99fedeb49aebe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,10 +14,11 @@ All notable changes to this project will be documented in this file. - Make gtf and geneName files as param inputs - Fix xcor to increase file size for --random-source - Fix skip diff test for paired-end data +- Add test data for single control and single replicate - Fix python version for MultiQC report - Fix xcor to get lowest non zero value above 50 - Fix references to display in Multiqc report - +- Update astrocyte testing to 0.2.0 ## [publish_1.0.6 ] - 2019-05-31 ### Added diff --git a/README.md b/README.md index bdd7b7cf44b37d47272ec72165634702478fe40e..2420017468b5a170ceb596852b717fa6e180661a 100644 --- a/README.md +++ b/README.md @@ -4,11 +4,13 @@ # BICF ChIP-seq Pipeline -[](https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/commits/master) -[](https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/commits/master) + + +[](https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/commits/master) +[](https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/commits/master) [](https://www.nextflow.io/) -[](https://astrocyte-test.biohpc.swmed.edu/static/docs/index.html) +[](https://astrocyte-test.biohpc.swmed.edu/static/docs/index.html) [](https://doi.org/10.5281/zenodo.2648845) diff --git a/test_data/A_1.bedpe.gz b/test_data/A_1.bedpe.gz new file mode 100644 index 0000000000000000000000000000000000000000..52beb16aa36e95cb5e14dfd4d7fb867f6b4183bc Binary files /dev/null and b/test_data/A_1.bedpe.gz differ diff --git a/test_data/B_1.bedpe.gz b/test_data/B_1.bedpe.gz new file mode 100644 index 0000000000000000000000000000000000000000..52beb16aa36e95cb5e14dfd4d7fb867f6b4183bc Binary files /dev/null and b/test_data/B_1.bedpe.gz differ diff --git a/test_data/design_single_contol_SE.txt b/test_data/design_single_contol_SE.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca6a141de782a8aa37bba7bc0fc56547487eee41 --- /dev/null +++ b/test_data/design_single_contol_SE.txt @@ -0,0 +1,3 @@ +sample_id experiment_id biosample factor treatment replicate control_id fastq_read1 +ENCLB497XZB ENCSR000DXB Panc1 H3K4me3 None 1 ENCLB304SBJ ENCFF001GBW.fastq.gz +ENCLB304SBJ ENCSR000DXC Panc1 Control None 1 ENCLB304SBJ ENCFF001HWJ.fastq.gz diff --git a/test_data/fetch_test_data.sh b/test_data/fetch_test_data.sh index 8b9a33125f8e55f4f96947961f402ed6f97897c9..09e30b59e46ed041a090685c26cd460c96d03368 100644 --- a/test_data/fetch_test_data.sh +++ b/test_data/fetch_test_data.sh @@ -25,3 +25,9 @@ wget https://www.encodeproject.org/files/ENCFF161HBP/@@download/ENCFF161HBP.fast wget https://www.encodeproject.org/files/ENCFF776KZU/@@download/ENCFF776KZU.fastq.gz wget https://www.encodeproject.org/files/ENCFF119KHM/@@download/ENCFF119KHM.fastq.gz echo "Done with Paired-end" + +echo "Downloading Single-end data set Human ENCSR000DXB and ENCSR000DXC" +wget https://www.encodeproject.org/files/ENCFF001GBW/@@download/ENCFF001GBW.fastq.gz +wget https://www.encodeproject.org/files/ENCFF001GBV/@@download/ENCFF001GBV.fastq.gz +wget https://www.encodeproject.org/files/ENCFF001HWJ/@@download/ENCFF001HWJ.fastq.gz +echo "Done with Single-end" diff --git a/workflow/conf/biohpc.config b/workflow/conf/biohpc.config index 037d1e43e37c7daeb21489a00dff85701c8f8fd0..7da4c5f0682a25992dbe5bb67d8bd69c9ae48248 100644 --- a/workflow/conf/biohpc.config +++ b/workflow/conf/biohpc.config @@ -2,6 +2,7 @@ process { executor = 'slurm' queue = 'super' clusterOptions = '--hold' + beforeScript= 'ulimit -Ss unlimited' // Process specific configuration withName: checkDesignFile { diff --git a/workflow/scripts/pool_and_psuedoreplicate.py b/workflow/scripts/pool_and_psuedoreplicate.py index 6c37eed940ca89f71ebd3c49a2dae042f94d5b05..efefad9e39b98aa8cd50f4b94b6e9f75202d587b 100644 --- a/workflow/scripts/pool_and_psuedoreplicate.py +++ b/workflow/scripts/pool_and_psuedoreplicate.py @@ -204,6 +204,7 @@ def generate_design(paired, cutoff_ratio, design_df, cwd, no_reps, no_unique_con pool_control_tmp = bedpe_to_tagalign(pool_control, "pool_control") pool_control = pool_control_tmp + # Duplicate rows and update for pool and psuedoreplicates and update tagAlign with single end data experiment_id = design_df.at[0, 'experiment_id'] replicate_files = design_df.tag_align.unique() @@ -237,9 +238,9 @@ def generate_design(paired, cutoff_ratio, design_df, cwd, no_reps, no_unique_con else: pool_experiment_se = pool_experiment - # Check controls against cutoff_ratio - # if so replace with pool_control - # unless single control was used + # Check controls against cutoff_ratio + # if so replace with pool_control + # unless single control was used if not single_control: path_to_pool_control = cwd + '/' + pool_control if control_df.values.max() > cutoff_ratio: @@ -267,7 +268,10 @@ def generate_design(paired, cutoff_ratio, design_df, cwd, no_reps, no_unique_con path_to_control else: - path_to_pool_control = cwd + '/' + pool_control + if paired: + path_to_pool_control = cwd + '/' + pool_control + else: + path_to_pool_control = pool_control design_new_df['control_tag_align'] = path_to_pool_control # Add in pseudo replicates @@ -306,7 +310,7 @@ def main(): design_df = pd.read_csv(design, sep='\t') # Get current directory to build paths - cwd = os.getcwd() + cwd = os.getcwd() # Check Number of replicates and replicates no_reps = check_replicates(design_df) diff --git a/workflow/tests/test_overlap_peaks.py b/workflow/tests/test_overlap_peaks.py index c450551ab5ac582977163c5d59220f5e807615c6..8786f4ed152466fddf42ca4e9ea523218a6855a8 100644 --- a/workflow/tests/test_overlap_peaks.py +++ b/workflow/tests/test_overlap_peaks.py @@ -45,3 +45,9 @@ def test_overlap_peaks_pairedend(): assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA.rejected.narrowPeak')) peak_file = test_output_path + 'ENCSR729LGA.replicated.narrowPeak' assert utils.count_lines(peak_file) >= 25657 + +@pytest.mark.singlecontrol +def test_overlap_peaks_singlecontrol(): + assert os.path.exists(os.path.join(test_output_path, 'ENCSR000DXB.rejected.narrowPeak')) + peak_file = test_output_path + 'ENCSR000DXB.replicated.narrowPeak' + assert utils.count_lines(peak_file) >= 35097 diff --git a/workflow/tests/test_pool_and_psuedoreplicate.py b/workflow/tests/test_pool_and_psuedoreplicate.py index f251e888f906de1c7e360502cadf45c8c9b85a3c..d3c3a6d842e3bb1934ce9eaa0b1fad67b2833972 100644 --- a/workflow/tests/test_pool_and_psuedoreplicate.py +++ b/workflow/tests/test_pool_and_psuedoreplicate.py @@ -33,9 +33,12 @@ def design_experiment_2(design_experiment): @pytest.fixture def design_experiment_3(design_experiment): - # Update second control to be same as first - design_experiment.loc[1, 'control_tag_align'] = 'B_1.bedse.gz' - return design_experiment + # Drop Replicate A_2 + design_df = design_experiment.drop(design_experiment.index[1]) + # Update to be paired as first + design_df.loc[0, 'control_tag_align'] = 'B_1.bedpe.gz' + design_df.loc[0, 'tag_align'] = 'A_1.bedpe.gz' + return design_df @pytest.mark.unit @@ -71,6 +74,19 @@ def test_single_rep(design_experiment_2): shutil.copy(test_design_path + 'B_1.tagAlign.gz', cwd) single_rep = pool_and_psuedoreplicate.generate_design('false', 1.2, design_experiment_2, cwd, 1, 1) assert single_rep.shape[0] == 4 + assert len(single_rep['control_tag_align'].unique()) == 2 + assert 'pool_control.tagAlign.gz' in single_rep['control_tag_align'].unique()[1] + + +@pytest.mark.unit +def test_single_control(design_experiment_3): + cwd = os.getcwd() + shutil.copy(test_design_path + 'A_1.bedpe.gz', cwd) + shutil.copy(test_design_path + 'B_1.bedpe.gz', cwd) + shutil.copy(test_design_path + 'A_1.tagAlign.gz', cwd) + single_control = pool_and_psuedoreplicate.generate_design('true', 1.2, design_experiment_3, cwd, 1, 1) + assert 'pool_control.tagAlign.gz' in single_control['control_tag_align'].unique()[0] + @pytest.mark.singleend def test_pool_and_psuedoreplicate_singleend():