diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000000000000000000000000000000000000..c8d98f29942ac24e83b741a0e102714b44230eab
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,2 @@
+[pytest]
+python_paths = workflow/scripts
diff --git a/workflow/scripts/__init__.py b/workflow/scripts/__init__.py
deleted file mode 100644
index 8b137891791fe96927ad78e64b0aad7bded08bdc..0000000000000000000000000000000000000000
--- a/workflow/scripts/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/workflow/scripts/check_design.py b/workflow/scripts/check_design.py
new file mode 100644
index 0000000000000000000000000000000000000000..beb59b25ffa2e7f4d9f7f3f66cbaf0eee464d3c5
--- /dev/null
+++ b/workflow/scripts/check_design.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+
+'''Check if design file is correctly formatted and matches files list.'''
+
+import argparse
+import logging
+import pandas as pd
+
+EPILOG = '''
+For more details:
+        %(prog)s --help
+'''
+
+## SETTINGS
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+logger.propagate = False
+logger.setLevel(logging.INFO)
+
+
+def get_args():
+    '''Define and parse the command-line arguments.'''
+    parser = argparse.ArgumentParser(
+        description=__doc__, epilog=EPILOG,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+
+    parser.add_argument('-d', '--design',
+                        help="The design file to run QC (TSV format).",
+                        required=True)
+
+    parser.add_argument('-f', '--fastq',
+                        help="File with list of fastq files (TSV format).",
+                        required=True)
+
+    # A store_true flag must default to False; with default=True the
+    # option was always on and single-end mode could never be selected.
+    parser.add_argument('-p', '--paired',
+                        help="Set if reads are paired-end (default: single-end).",
+                        default=False,
+                        action='store_true')
+
+    args = parser.parse_args()
+    return args
+
+
+def check_design_headers(design, paired):
+    '''Check if design file conforms to sequencing type.
+
+    Raises an Exception naming every required column missing from the
+    design DataFrame; 'fastq_read2' is required only when paired is true.
+    '''
+
+    # Default headers
+    design_template = [
+        'sample_id',
+        'biosample',
+        'factor',
+        'treatment',
+        'replicate',
+        'control_id',
+        'fastq_read1']
+
+    design_headers = list(design.columns.values)
+
+    if paired:  # paired-end data
+        design_template.extend(['fastq_read2'])
+
+    # Check if headers
+    logger.info("Running header check.")
+
+    missing_headers = set(design_template) - set(design_headers)
+
+    if len(missing_headers) > 0:
+        logger.error('Missing column headers: %s', list(missing_headers))
+        raise Exception("Missing column headers: %s" % list(missing_headers))
+
+
+def check_controls(design):
+    '''Check if design file has the correct control mapping.
+
+    Raises an Exception if any control_id value does not also appear in
+    the sample_id column.
+    '''
+
+    logger.info("Running control check.")
+
+    missing_controls = set(design['control_id']) - set(design['sample_id'])
+
+    if len(missing_controls) > 0:
+        logger.error('Missing control experiments: %s', list(missing_controls))
+        raise Exception("Missing control experiments: %s" % list(missing_controls))
+
+
+def check_files(design, fastq, paired):
+    '''Check if design file has the files found.
+
+    Raises an Exception if a fastq named in the design is absent from the
+    'name' column of fastq. Otherwise each fastq name in the design is
+    replaced (in place) by its 'path' entry and the design is returned.
+    '''
+
+    logger.info("Running file check.")
+
+    if paired:  # paired-end data
+        files = list(design['fastq_read1']) + list(design['fastq_read2'])
+    else:  # single-end data
+        files = design['fastq_read1']
+
+    files_found = fastq['name']
+
+    missing_files = set(files) - set(files_found)
+
+    if len(missing_files) > 0:
+        logger.error('Missing files from design file: %s', list(missing_files))
+        raise Exception("Missing files from design file: %s" % list(missing_files))
+    else:
+        file_dict = fastq.set_index('name').T.to_dict()
+
+        design['fastq_read1'] = design['fastq_read1'] \
+                                .apply(lambda x: file_dict[x]['path'])
+        if paired:  # paired-end data
+            design['fastq_read2'] = design['fastq_read2'] \
+                                    .apply(lambda x: file_dict[x]['path'])
+    return design
+
+
+def main():
+    '''Validate the design file and write the updated design.tsv.'''
+    args = get_args()
+
+    # Create a file handler
+    handler = logging.FileHandler('design.log')
+    logger.addHandler(handler)
+
+    # Read files
+    design_file = pd.read_csv(args.design, sep='\t')
+    # The fastq list has no header row; it comes from --fastq, not --design
+    fastq_file = pd.read_csv(args.fastq, sep='\t', names=['name', 'path'])
+
+    # Check design file
+    check_design_headers(design_file, args.paired)
+    check_controls(design_file)
+    new_design = check_files(design_file, fastq_file, args.paired)
+
+    # Write out new design file
+    # (tab-separated, matching the .tsv name and how the design is read)
+    new_design.to_csv('design.tsv', header=True, sep='\t', index=False)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/workflow/tests/test_check_design.py b/workflow/tests/test_check_design.py
new file mode 100644
index 0000000000000000000000000000000000000000..394d5251b382b682bf758debdedd12aaf5637551
--- /dev/null
+++ b/workflow/tests/test_check_design.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+
+import os
+import pytest
+import pandas as pd
+from io import StringIO
+import check_design
+import sys
+
+
+DESIGN_STRING = """sample_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\tfastq_read1
+A_1\tLiver\tH3K27ac\tNone\t1\tB_1\tA_1.fastq.gz
+A_2\tLiver\tH3K27ac\tNone\t2\tB_2\tA_2.fastq.gz
+B_1\tLiver\tInput\tNone\t1\tB_1\tB_1.fastq.gz
+B_2\tLiver\tInput\tNone\t2\tB_2\tB_2.fastq.gz
+"""
+
+FASTQ_STRING = """
+A_1.fastq.gz\t/path/to/file/A_1.fastq.gz
+A_2.fastq.gz\t/path/to/file/A_2.fastq.gz
+B_1.fastq.gz\t/path/to/file/B_1.fastq.gz
+B_2.fastq.gz\t/path/to/file/B_2.fastq.gz
+"""
+
+
+@pytest.fixture
+def design():
+    design_file = StringIO(DESIGN_STRING)
+    design_df = pd.read_csv(design_file, sep="\t")
+    return design_df
+
+
+@pytest.fixture
+def fastq_files():
+    fastq_file = StringIO(FASTQ_STRING)
+    fastq_df = pd.read_csv(fastq_file, sep='\t', names=['name', 'path'])
+    return fastq_df
+
+
+@pytest.fixture
+def design_1(design):
+    design_df = design.drop('fastq_read1', axis=1)
+    return design_df
+
+
+@pytest.fixture
+def design_2(design):
+    # Drop Control B_1
+    design_df = design.drop(design.index[2])
+    return design_df
+
+@pytest.fixture
+def design_3(design):
+    # Drop A_2 and B_2 and append as fastq_read2
+    design_df = design.drop(design.index[[1,3]])
+    design_df['fastq_read2'] = design.loc[[1,3],'fastq_read1'].values
+    return design_df
+
+
+@pytest.fixture
+def fastq_files_1(fastq_files):
+    # Drop B_2.fastq.gz
+    fastq_df = fastq_files.drop(fastq_files.index[3])
+    return fastq_df
+
+
+def test_check_headers_singleend(design_1):
+    paired = False
+    with pytest.raises(Exception) as excinfo:
+        check_design.check_design_headers(design_1, paired)
+    assert str(excinfo.value) == "Missing column headers: ['fastq_read1']"
+
+
+def test_check_headers_pairedend(design):
+    paired = True
+    with pytest.raises(Exception) as excinfo:
+        check_design.check_design_headers(design, paired)
+    assert str(excinfo.value) == "Missing column headers: ['fastq_read2']"
+
+
+def test_check_controls(design_2):
+    with pytest.raises(Exception) as excinfo:
+        check_design.check_controls(design_2)
+    assert str(excinfo.value) == "Missing control experiments: ['B_1']"
+
+
+def test_check_files_missing_files(design, fastq_files_1):
+    paired = False
+    with pytest.raises(Exception) as excinfo:
+        new_design = check_design.check_files(design, fastq_files_1, paired)
+    assert str(excinfo.value) == "Missing files from design file: ['B_2.fastq.gz']"
+
+
+def test_check_files_output_singleend(design, fastq_files):
+    paired = False
+    new_design = check_design.check_files(design, fastq_files, paired)
+    assert new_design.loc[0,'fastq_read1'] == "/path/to/file/A_1.fastq.gz"
+
+
+def test_check_files_output_pairedend(design_3, fastq_files):
+    paired = True
+    new_design = check_design.check_files(design_3, fastq_files, paired)
+    assert new_design.loc[0,'fastq_read2'] == "/path/to/file/A_2.fastq.gz"