Skip to content
Snippets Groups Projects
Commit e09a8b62 authored by Holly Ruess's avatar Holly Ruess
Browse files

Merge branch '73-single_pool' into 'master'

Resolve "Single Rep Pool and Pseudo"

Closes #73

See merge request BICF/Astrocyte/chipseq_analysis!63
parents 14d69096 21a78fc3
No related merge requests found
......@@ -2,6 +2,7 @@ before_script:
- module add python/3.6.1-2-anaconda
- pip install --user pytest-pythonpath==0.7.1 pytest-cov==2.5.1
- module load nextflow/0.31.0
- ulimit -c unlimited
- ln -s /project/shared/bicf_workflow_ref/workflow_testdata/chipseq/*fastq.gz test_data/
stages:
......@@ -14,7 +15,6 @@ stages:
user_configuration:
stage: unit
script:
- pytest -m unit
- pytest -m unit --cov=./workflow/scripts
astrocyte:
......@@ -30,7 +30,7 @@ single_end_mouse:
only:
- master
script:
- nextflow run workflow/main.nf --astrocyte true -resume
- nextflow run workflow/main.nf --astrocyte true
- pytest -m singleend
paired_end_human:
......@@ -40,9 +40,19 @@ paired_end_human:
except:
- master
script:
- nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_ENCSR729LGA_PE.txt" --genome 'GRCh38' --pairedEnd true --astrocyte false -resume
- nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_ENCSR729LGA_PE.txt" --genome 'GRCh38' --pairedEnd true --astrocyte false
- pytest -m pairedend
single_end_single_control:
stage: single
only:
- branches
except:
- master
script:
- nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_single_contol_SE.txt" --genome 'GRCh38' --pairedEnd false --astrocyte false
- pytest -m singlecontrol
single_end_diff:
stage: multiple
only:
......@@ -50,7 +60,7 @@ single_end_diff:
except:
- master
script:
- nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_SE.txt" --genome 'GRCm38' --astrocyte false -resume
- nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_SE.txt" --genome 'GRCm38' --astrocyte false
- pytest -m singleend
- pytest -m singlediff
......@@ -59,7 +69,7 @@ paired_end_diff:
- master
stage: multiple
script:
- nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_PE.txt" --genome 'GRCh38' --pairedEnd true --astrocyte false -resume
- nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_PE.txt" --genome 'GRCh38' --pairedEnd true --astrocyte false
- pytest -m pairedend
- pytest -m paireddiff
......@@ -68,5 +78,5 @@ single_end_skip:
only:
- master
script:
- nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_SE.txt" --genome 'GRCm38' --skipDiff true --skipMotif true --skipPlotProfile true --astrocyte false -resume
- nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_SE.txt" --genome 'GRCm38' --skipDiff true --skipMotif true --skipPlotProfile true --astrocyte false
- pytest -m singleskip_true
......@@ -14,6 +14,7 @@ All notable changes to this project will be documented in this file.
- Make gtf and geneName files as param inputs
- Fix xcor to increase file size for --random-source
- Fix skip diff test for paired-end data
- Add test data for single control and single replicate
- Fix python version for MultiQC report
- Fix xcor to get lowest non zero value above 50
- Fix references to display in Multiqc report
......
File added
File added
sample_id experiment_id biosample factor treatment replicate control_id fastq_read1
ENCLB497XZB ENCSR000DXB Panc1 H3K4me3 None 1 ENCLB304SBJ ENCFF001GBW.fastq.gz
ENCLB304SBJ ENCSR000DXC Panc1 Control None 1 ENCLB304SBJ ENCFF001HWJ.fastq.gz
......@@ -25,3 +25,9 @@ wget https://www.encodeproject.org/files/ENCFF161HBP/@@download/ENCFF161HBP.fast
wget https://www.encodeproject.org/files/ENCFF776KZU/@@download/ENCFF776KZU.fastq.gz
wget https://www.encodeproject.org/files/ENCFF119KHM/@@download/ENCFF119KHM.fastq.gz
echo "Done with Paired-end"
echo "Downloading Single-end data set Human ENCSR000DXB and ENCSR000DXC"
wget https://www.encodeproject.org/files/ENCFF001GBW/@@download/ENCFF001GBW.fastq.gz
wget https://www.encodeproject.org/files/ENCFF001GBV/@@download/ENCFF001GBV.fastq.gz
wget https://www.encodeproject.org/files/ENCFF001HWJ/@@download/ENCFF001HWJ.fastq.gz
echo "Done with Single-end"
......@@ -2,6 +2,7 @@ process {
executor = 'slurm'
queue = 'super'
clusterOptions = '--hold'
beforeScript= 'ulimit -Ss unlimited'
// Process specific configuration
withName: checkDesignFile {
......
......@@ -204,6 +204,7 @@ def generate_design(paired, cutoff_ratio, design_df, cwd, no_reps, no_unique_con
pool_control_tmp = bedpe_to_tagalign(pool_control, "pool_control")
pool_control = pool_control_tmp
# Duplicate rows and update for pool and psuedoreplicates and update tagAlign with single end data
experiment_id = design_df.at[0, 'experiment_id']
replicate_files = design_df.tag_align.unique()
......@@ -237,9 +238,9 @@ def generate_design(paired, cutoff_ratio, design_df, cwd, no_reps, no_unique_con
else:
pool_experiment_se = pool_experiment
# Check controls against cutoff_ratio
# if so replace with pool_control
# unless single control was used
# Check controls against cutoff_ratio
# if so replace with pool_control
# unless single control was used
if not single_control:
path_to_pool_control = cwd + '/' + pool_control
if control_df.values.max() > cutoff_ratio:
......@@ -267,7 +268,10 @@ def generate_design(paired, cutoff_ratio, design_df, cwd, no_reps, no_unique_con
path_to_control
else:
path_to_pool_control = cwd + '/' + pool_control
if paired:
path_to_pool_control = cwd + '/' + pool_control
else:
path_to_pool_control = pool_control
design_new_df['control_tag_align'] = path_to_pool_control
# Add in pseudo replicates
......@@ -306,7 +310,7 @@ def main():
design_df = pd.read_csv(design, sep='\t')
# Get current directory to build paths
cwd = os.getcwd()
cwd = os.getcwd()
# Check Number of replicates and replicates
no_reps = check_replicates(design_df)
......
......@@ -45,3 +45,9 @@ def test_overlap_peaks_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA.rejected.narrowPeak'))
peak_file = test_output_path + 'ENCSR729LGA.replicated.narrowPeak'
assert utils.count_lines(peak_file) >= 25657
@pytest.mark.singlecontrol
def test_overlap_peaks_singlecontrol():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR000DXB.rejected.narrowPeak'))
peak_file = test_output_path + 'ENCSR000DXB.replicated.narrowPeak'
assert utils.count_lines(peak_file) >= 35097
......@@ -33,9 +33,12 @@ def design_experiment_2(design_experiment):
@pytest.fixture
def design_experiment_3(design_experiment):
# Update second control to be same as first
design_experiment.loc[1, 'control_tag_align'] = 'B_1.bedse.gz'
return design_experiment
# Drop Replicate A_2
design_df = design_experiment.drop(design_experiment.index[1])
# Update to be paired as first
design_df.loc[0, 'control_tag_align'] = 'B_1.bedpe.gz'
design_df.loc[0, 'tag_align'] = 'A_1.bedpe.gz'
return design_df
@pytest.mark.unit
......@@ -71,6 +74,19 @@ def test_single_rep(design_experiment_2):
shutil.copy(test_design_path + 'B_1.tagAlign.gz', cwd)
single_rep = pool_and_psuedoreplicate.generate_design('false', 1.2, design_experiment_2, cwd, 1, 1)
assert single_rep.shape[0] == 4
assert len(single_rep['control_tag_align'].unique()) == 2
assert 'pool_control.tagAlign.gz' in single_rep['control_tag_align'].unique()[1]
@pytest.mark.unit
def test_single_control(design_experiment_3):
cwd = os.getcwd()
shutil.copy(test_design_path + 'A_1.bedpe.gz', cwd)
shutil.copy(test_design_path + 'B_1.bedpe.gz', cwd)
shutil.copy(test_design_path + 'A_1.tagAlign.gz', cwd)
single_control = pool_and_psuedoreplicate.generate_design('true', 1.2, design_experiment_3, cwd, 1, 1)
assert 'pool_control.tagAlign.gz' in single_control['control_tag_align'].unique()[0]
@pytest.mark.singleend
def test_pool_and_psuedoreplicate_singleend():
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment