From 2bb7f6ed95347cd9e55b499b469573af41106e66 Mon Sep 17 00:00:00 2001 From: Venkat Malladi <venkat.malladi@utsouthwestern.edu> Date: Wed, 11 Oct 2017 13:32:43 -0500 Subject: [PATCH] Add in experiment_id and check to see if unique replicate ids are given. --- test_data/design_ENCSR238SGC_SE.txt | 10 +++++----- test_data/design_ENCSR729LGA_PE.txt | 10 +++++----- workflow/scripts/check_design.py | 23 +++++++++++++++++++++++ workflow/tests/test_check_design.py | 24 +++++++++++++++++++----- 4 files changed, 52 insertions(+), 15 deletions(-) diff --git a/test_data/design_ENCSR238SGC_SE.txt b/test_data/design_ENCSR238SGC_SE.txt index 80d4b42..da5c56a 100644 --- a/test_data/design_ENCSR238SGC_SE.txt +++ b/test_data/design_ENCSR238SGC_SE.txt @@ -1,5 +1,5 @@ -sample_id biosample factor treatment replicate control_id fastq_read1 -ENCSR238SGC limb H3K4me1 None 1 ENCSR687ALB ENCFF833BLU.fastq.gz -ENCSR238SGC limb H3K4me1 None 2 ENCSR687ALB ENCFF646LXU.fastq.gz -ENCSR687ALB limb Control None 1 ENCSR687ALB ENCFF524CAC.fastq.gz -ENCSR687ALB limb Control None 2 ENCSR687ALB ENCFF163AJI.fastq.gz +sample_id experiment_id biosample factor treatment replicate control_id fastq_read1 +ENCBS844FSC ENCSR238SGC limb H3K4me1 None 1 ENCBS844FSC ENCFF833BLU.fastq.gz +ENCBS892NXC ENCSR238SGC limb H3K4me1 None 2 ENCBS892NXC ENCFF646LXU.fastq.gz +ENCBS844FSC ENCSR687ALB limb Control None 1 ENCBS844FSC ENCFF524CAC.fastq.gz +ENCBS892NXC ENCSR687ALB limb Control None 2 ENCBS892NXC ENCFF163AJI.fastq.gz diff --git a/test_data/design_ENCSR729LGA_PE.txt b/test_data/design_ENCSR729LGA_PE.txt index 0bd9fa2..51855d6 100644 --- a/test_data/design_ENCSR729LGA_PE.txt +++ b/test_data/design_ENCSR729LGA_PE.txt @@ -1,5 +1,5 @@ -sample_id biosample factor treatment replicate control_id fastq_read1 fastq_read2 -ENCSR729LGA MCF-7 SP1 None 1 ENCSR217LRF ENCFF957SQS.fastq.gz ENCFF582IOZ.fastq.gz -ENCSR729LGA MCF-7 SP1 None 2 ENCSR217LRF ENCFF330MCZ.fastq.gz ENCFF293YFE.fastq.gz -ENCSR217LRF MCF-7 Control None 1 
def check_replicates(design):
    '''Check that replicate numbers are unique within each experiment.

    Args:
        design: pandas DataFrame holding the design file; the
            'experiment_id' and 'replicate' columns are read.

    Raises:
        Exception: if at least one experiment lists the same replicate
            value more than once. The message names those experiments.
    '''

    logger.info("Running replicate check.")

    # Collect the replicate values of every experiment into one list each.
    experiment_replicates = design.groupby('experiment_id')['replicate'] \
                                  .apply(list)

    # A list that gets shorter when turned into a set contains a repeat.
    duplicated_replicates = [
        experiment
        for experiment, replicates in zip(experiment_replicates.index.values,
                                          experiment_replicates.values)
        if len(replicates) != len(set(replicates))
    ]

    if duplicated_replicates:
        logger.error('Duplicate replicates in experiments: %s',
                     duplicated_replicates)
        raise Exception("Duplicate replicates in experiments: %s" %
                        duplicated_replicates)
@pytest.fixture
def design_4(design):
    '''Return the design with sample B_2 relabelled as replicate 1.'''
    # Update replicate 2 for experiment B to be 1
    design.loc[design['sample_id'] == 'B_2', 'replicate'] = 1
    return design


def test_check_replicates(design_4):
    '''check_replicates must raise when an experiment repeats a replicate.'''
    with pytest.raises(Exception) as excinfo:
        check_design.check_replicates(design_4)
    # Inspect excinfo only once the with-block has exited: a statement placed
    # after the raising call inside the block would never execute.
    assert str(excinfo.value) == "Duplicate replicates in experiments: ['B']"