From 2bb7f6ed95347cd9e55b499b469573af41106e66 Mon Sep 17 00:00:00 2001
From: Venkat Malladi <venkat.malladi@utsouthwestern.edu>
Date: Wed, 11 Oct 2017 13:32:43 -0500
Subject: [PATCH] Add in experiment_id and check to see if unique replicate ids
 are given.

---
 test_data/design_ENCSR238SGC_SE.txt | 10 +++++-----
 test_data/design_ENCSR729LGA_PE.txt | 10 +++++-----
 workflow/scripts/check_design.py    | 23 +++++++++++++++++++++++
 workflow/tests/test_check_design.py | 24 +++++++++++++++++++-----
 4 files changed, 52 insertions(+), 15 deletions(-)

diff --git a/test_data/design_ENCSR238SGC_SE.txt b/test_data/design_ENCSR238SGC_SE.txt
index 80d4b42..da5c56a 100644
--- a/test_data/design_ENCSR238SGC_SE.txt
+++ b/test_data/design_ENCSR238SGC_SE.txt
@@ -1,5 +1,5 @@
-sample_id	biosample	factor	treatment	replicate	control_id	fastq_read1
-ENCSR238SGC	limb	H3K4me1	None	1	ENCSR687ALB	ENCFF833BLU.fastq.gz
-ENCSR238SGC	limb	H3K4me1	None	2	ENCSR687ALB	ENCFF646LXU.fastq.gz
-ENCSR687ALB	limb	Control	None	1	ENCSR687ALB	ENCFF524CAC.fastq.gz
-ENCSR687ALB	limb	Control	None	2	ENCSR687ALB	ENCFF163AJI.fastq.gz
+sample_id	experiment_id	biosample	factor	treatment	replicate	control_id	fastq_read1
+ENCBS844FSC	ENCSR238SGC	limb	H3K4me1	None	1	ENCBS844FSC	ENCFF833BLU.fastq.gz
+ENCBS892NXC	ENCSR238SGC	limb	H3K4me1	None	2	ENCBS892NXC	ENCFF646LXU.fastq.gz
+ENCBS844FSC	ENCSR687ALB	limb	Control	None	1	ENCBS844FSC	ENCFF524CAC.fastq.gz
+ENCBS892NXC	ENCSR687ALB	limb	Control	None	2	ENCBS892NXC	ENCFF163AJI.fastq.gz
diff --git a/test_data/design_ENCSR729LGA_PE.txt b/test_data/design_ENCSR729LGA_PE.txt
index 0bd9fa2..51855d6 100644
--- a/test_data/design_ENCSR729LGA_PE.txt
+++ b/test_data/design_ENCSR729LGA_PE.txt
@@ -1,5 +1,5 @@
-sample_id	biosample	factor	treatment	replicate	control_id	fastq_read1	fastq_read2
-ENCSR729LGA	MCF-7	SP1	None	1	ENCSR217LRF	ENCFF957SQS.fastq.gz	ENCFF582IOZ.fastq.gz
-ENCSR729LGA	MCF-7	SP1	None	2	ENCSR217LRF	ENCFF330MCZ.fastq.gz	ENCFF293YFE.fastq.gz
-ENCSR217LRF	MCF-7	Control	None	1	ENCSR217LRF	ENCFF002DTU.fastq.gz	ENCFF002EFI.fastq.gz
-ENCSR217LRF	MCF-7	Control	None	1	ENCSR217LRF	ENCFF002EFG.fastq.gz	ENCFF002DTS.fastq.gz
+sample_id	experiment_id	biosample	factor	treatment	replicate	control_id	fastq_read1	fastq_read2
+ENCBS609QTY	ENCSR729LGA	MCF-7	SP1	None	1	ENCBS216AOQ	ENCFF957SQS.fastq.gz	ENCFF582IOZ.fastq.gz
+ENCBS200IWR	ENCSR729LGA	MCF-7	SP1	None	2	ENCBS034XKZ	ENCFF330MCZ.fastq.gz	ENCFF293YFE.fastq.gz
+ENCBS216AOQ	ENCSR217LRF	MCF-7	Control	None	1	ENCBS216AOQ	ENCFF002DTU.fastq.gz	ENCFF002EFI.fastq.gz
+ENCBS034XKZ	ENCSR217LRF	MCF-7	Control	None	2	ENCBS034XKZ	ENCFF002EFG.fastq.gz	ENCFF002DTS.fastq.gz
diff --git a/workflow/scripts/check_design.py b/workflow/scripts/check_design.py
index 52185d6..161660e 100644
--- a/workflow/scripts/check_design.py
+++ b/workflow/scripts/check_design.py
@@ -48,6 +48,7 @@ def check_design_headers(design, paired):
     # Default headers
     design_template = [
         'sample_id',
+        'experiment_id',
         'biosample',
         'factor',
         'treatment',
@@ -83,6 +84,27 @@ def check_controls(design):
                         list(missing_controls))
 
 
+def check_replicates(design):
+    '''Check if design file has unique replicate numbers for an experiment.'''
+
+    logger.info("Running replicate check.")
+
+    # Collect the replicate values recorded for each experiment_id.
+    experiment_replicates = design.groupby('experiment_id')['replicate'] \
+                            .apply(list)
+
+    # A replicate list longer than its set of values contains a repeat.
+    duplicated_replicates = [
+        experiment for experiment, replicates in experiment_replicates.items()
+        if len(replicates) != len(set(replicates))]
+
+    if duplicated_replicates:
+        logger.error('Duplicate replicates in experiments: %s',
+                     duplicated_replicates)
+        raise Exception("Duplicate replicates in experiments: %s" %
+                        duplicated_replicates)
+
+
 def check_files(design, fastq, paired):
     '''Check if design file has the files found.'''
 
@@ -126,6 +148,7 @@ def main():
     # Check design file
     check_design_headers(design_file, args.paired)
     check_controls(design_file)
+    check_replicates(design_file)
     new_design = check_files(design_file, fastq_file, args.paired)
 
     # Write out new design file
diff --git a/workflow/tests/test_check_design.py b/workflow/tests/test_check_design.py
index a63bc77..23575ec 100644
--- a/workflow/tests/test_check_design.py
+++ b/workflow/tests/test_check_design.py
@@ -6,11 +6,11 @@ from io import StringIO
 import check_design
 
 
-DESIGN_STRING = """sample_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\tfastq_read1
-A_1\tLiver\tH3K27ac\tNone\t1\tB_1\tA_1.fastq.gz
-A_2\tLiver\tH3K27ac\tNone\t2\tB_2\tA_2.fastq.gz
-B_1\tLiver\tInput\tNone\t1\tB_1\tB_1.fastq.gz
-B_2\tLiver\tInput\tNone\t2\tB_2\tB_2.fastq.gz
+DESIGN_STRING = """sample_id\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\tfastq_read1
+A_1\tA\tLiver\tH3K27ac\tNone\t1\tB_1\tA_1.fastq.gz
+A_2\tA\tLiver\tH3K27ac\tNone\t2\tB_2\tA_2.fastq.gz
+B_1\tB\tLiver\tInput\tNone\t1\tB_1\tB_1.fastq.gz
+B_2\tB\tLiver\tInput\tNone\t2\tB_2\tB_2.fastq.gz
 """
 
 FASTQ_STRING = """
@@ -56,6 +56,13 @@ def design_3(design):
     return design_df
 
 
+@pytest.fixture
+def design_4(design):
+    # Give sample B_2 replicate 1 so experiment B has a duplicated replicate
+    design.loc[design['sample_id'] == 'B_2', 'replicate'] = 1
+    return design
+
+
 @pytest.fixture
 def fastq_files_1(fastq_files):
     # Drop B_2.fastq.gz
@@ -100,3 +107,10 @@ def test_check_files_output_pairedend(design_3, fastq_files):
     paired = True
     new_design = check_design.check_files(design_3, fastq_files, paired)
     assert new_design.loc[0, 'fastq_read2'] == "/path/to/file/A_2.fastq.gz"
+
+
+def test_check_replicates(design_4):
+    # Experiment B repeats replicate 1, so the check must raise and name it.
+    with pytest.raises(Exception) as excinfo:
+        check_design.check_replicates(design_4)
+    assert str(excinfo.value) == "Duplicate replicates in experiments: ['B']"
-- 
GitLab