def _find_malformed_ids(design, column):
    '''Return the unique values of a design column that are malformed.

    A value is malformed when its text form contains any whitespace
    character, a hyphen ('-') or a period ('.').

    Args:
        design: pandas DataFrame holding the design file.
        column: Name of the column to validate.

    Returns:
        List of the offending unique values, ordered by their text form.
        Missing values (NaN/None) are skipped.
    '''

    forbidden_chars = set('-.')

    # Only the distinct identifiers matter; no need to group whole rows.
    unique_ids = design[column].dropna().unique()

    malformed = []
    for value in sorted(unique_ids, key=str):
        # Identifiers may have been parsed as numbers (e.g. a column of
        # 1, 2, 3); validate their text form instead of failing while
        # iterating over a non-string.
        text = str(value)
        if any(char.isspace() or char in forbidden_chars for char in text):
            malformed.append(value)

    return malformed


def check_samples(design):
    '''Check if design file has the correct sample name mapping.

    Args:
        design: pandas DataFrame with a 'sample_id' column.

    Raises:
        Exception: If any sample_id contains whitespace, '-' or '.'.
    '''

    logger.info("Running sample check.")

    malformed_samples = _find_malformed_ids(design, 'sample_id')

    if malformed_samples:
        logger.error('Malformed samples from design file: %s',
                     malformed_samples)
        raise Exception("Malformed samples from design file: %s" %
                        malformed_samples)


def check_experiments(design):
    '''Check if design file has the correct experiment name mapping.

    Args:
        design: pandas DataFrame with an 'experiment_id' column.

    Raises:
        Exception: If any experiment_id contains whitespace, '-' or '.'.
    '''

    logger.info("Running experiment check.")

    malformed_experiments = _find_malformed_ids(design, 'experiment_id')

    if malformed_experiments:
        logger.error('Malformed experiment from design file: %s',
                     malformed_experiments)
        raise Exception("Malformed experiment from design file: %s" %
                        malformed_experiments)
@pytest.fixture
def design_5(design):
    '''Design whose sample_ids contain a space, a period and a hyphen.'''
    # Update sample_id to have -, spaces or periods
    design.loc[design['sample_id'] == 'A_1', 'sample_id'] = 'A 1'
    design.loc[design['sample_id'] == 'A_2', 'sample_id'] = 'A.2'
    design.loc[design['sample_id'] == 'B_1', 'sample_id'] = 'B-1'
    return design


@pytest.fixture
def design_6(design):
    '''Design whose experiment_ids contain a space, a period and a hyphen.'''
    # Update experiment_id to have -, spaces or periods
    design.loc[design['sample_id'] == 'A_1', 'experiment_id'] = 'A ChIP'
    design.loc[design['sample_id'] == 'A_2', 'experiment_id'] = 'A.ChIP'
    design.loc[design['sample_id'] == 'B_1', 'experiment_id'] = 'B-ChIP'
    return design


@pytest.mark.unit
def test_check_replicates(design_4):
    '''check_replicates must reject the design_4 fixture.'''
    with pytest.raises(Exception) as excinfo:
        check_design.check_replicates(design_4)
    # Asserted after the block: nothing following the raising call inside
    # the `with` body would ever execute.
    assert str(excinfo.value) == "Duplicate replicates in experiments: ['B']"


@pytest.mark.unit
def test_check_samples(design_5):
    '''check_samples must reject sample_ids with spaces, periods or hyphens.'''
    with pytest.raises(Exception) as excinfo:
        check_design.check_samples(design_5)
    assert str(excinfo.value) == "Malformed samples from design file: ['A 1', 'A.2', 'B-1']"


@pytest.mark.unit
def test_check_experiments(design_6):
    '''check_experiments must reject experiment_ids with spaces, periods or hyphens.'''
    with pytest.raises(Exception) as excinfo:
        check_design.check_experiments(design_6)
    assert str(excinfo.value) == "Malformed experiment from design file: ['A ChIP', 'A.ChIP', 'B-ChIP']"