Skip to content
Snippets Groups Projects
Commit 02bef1b4 authored by Venkat Malladi
Browse files

Add in test for design files.

parent dfed27c2
No related merge requests found
[pytest]
python_paths = workflow/scripts
#!/usr/bin/env python3
'''Check if design file is correctly formatted and matches files list.'''
import argparse
import logging
import pandas as pd
# Footer shown after the argparse help text; %(prog)s is expanded by argparse.
EPILOG = "\nFor more details:\n%(prog)s --help\n"

## SETTINGS
# Module logger: INFO level, detached from the root logger, and given a
# NullHandler so that it has a handler even before main() attaches one.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.propagate = False
logger.addHandler(logging.NullHandler())
def get_args():
    '''Define and parse the command-line arguments.

    Returns:
        argparse.Namespace with ``design`` and ``fastq`` (the values given
        for -d/--design and -f/--fastq) and ``paired`` (True only when
        -p/--paired is present on the command line).
    '''
    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('-d', '--design',
                        help="The design file to run QC (TSV format).",
                        required=True)

    # The list is parsed as tab-separated name/path pairs by main().
    parser.add_argument('-f', '--fastq',
                        help="File with list of fastq files (TSV format).",
                        required=True)

    # A store_true flag has to default to False: with default=True the
    # option was True whether or not it was passed, so single-end mode
    # could never be selected.
    parser.add_argument('-p', '--paired',
                        help="True/False if paired-end or single end.",
                        default=False,
                        action='store_true')

    args = parser.parse_args()
    return args
def check_design_headers(design, paired):
    '''Check if design file conforms to sequencing type.

    Args:
        design: pandas DataFrame holding the design file.
        paired: truthy for paired-end data, which additionally requires a
            'fastq_read2' column.

    Raises:
        Exception: if any required column is absent. The message lists the
            missing column names in sorted order.
    '''
    # Default headers
    design_template = [
        'sample_id',
        'biosample',
        'factor',
        'treatment',
        'replicate',
        'control_id',
        'fastq_read1']

    if paired:  # paired-end data
        design_template.append('fastq_read2')

    # Check if headers
    logger.info("Running header check.")

    # Sorted so the log/exception text does not depend on set iteration
    # order when several headers are missing at once.
    missing_headers = sorted(set(design_template) - set(design.columns))

    if missing_headers:
        logger.error('Missing column headers: %s', missing_headers)
        raise Exception("Missing column headers: %s" % missing_headers)
def check_controls(design):
    '''Check if design file has the correct control mapping.

    Every value in the 'control_id' column must also appear in the
    'sample_id' column.

    Args:
        design: pandas DataFrame with 'sample_id' and 'control_id' columns.

    Raises:
        Exception: if some control ids are not sample ids. The message
            lists them in sorted order.
    '''
    logger.info("Running control check.")

    # key=str keeps the sort usable (and the message reproducible) even if
    # the column mixes value types, e.g. strings and NaN.
    missing_controls = sorted(
        set(design['control_id']) - set(design['sample_id']), key=str)

    if missing_controls:
        logger.error('Missing control experiments: %s', missing_controls)
        raise Exception("Missing control experiments: %s" % missing_controls)
def check_files(design, fastq, paired):
    '''Check if design file has the files found.

    Verifies that every file named in the design appears in the fastq
    listing, then returns a design whose read columns hold the listed
    paths instead of the bare file names.

    Args:
        design: pandas DataFrame with a 'fastq_read1' column (and
            'fastq_read2' when paired).
        fastq: pandas DataFrame with 'name' and 'path' columns.
        paired: truthy for paired-end data.

    Returns:
        A copy of design with the read column(s) replaced by paths. The
        DataFrame passed in is left untouched.

    Raises:
        Exception: if the design names files absent from fastq['name'].
            The message lists them in sorted order.
    '''
    logger.info("Running file check.")

    read_columns = ['fastq_read1']
    if paired:  # paired-end data
        read_columns.append('fastq_read2')

    files = [name for column in read_columns for name in design[column]]

    # key=str keeps the message reproducible and the sort safe for mixed
    # value types.
    missing_files = sorted(set(files) - set(fastq['name']), key=str)

    if missing_files:
        logger.error('Missing files from design file: %s', missing_files)
        raise Exception("Missing files from design file: %s" % missing_files)

    path_by_name = dict(zip(fastq['name'], fastq['path']))

    # Work on a copy so that a validation call does not rewrite the
    # caller's DataFrame as a side effect.
    new_design = design.copy()
    for column in read_columns:
        new_design[column] = new_design[column].map(path_by_name)

    return new_design
def main():
    '''Validate the design file and write the path-resolved design.

    Reads the design (tab-separated, with header) and the fastq listing
    (tab-separated name/path pairs, no header), runs the header, control
    and file checks, and writes the resulting design to 'design.tsv'.
    Log records are written to 'design.log'.
    '''
    args = get_args()

    # Create a file handler
    handler = logging.FileHandler('design.log')
    logger.addHandler(handler)

    # Read files
    design_file = pd.read_csv(args.design, sep='\t')
    # The listing comes from --fastq; it was previously read from
    # args.design, so the design file was compared against itself.
    fastq_file = pd.read_csv(args.fastq, sep='\t', names=['name', 'path'])

    # Check design file
    check_design_headers(design_file, args.paired)
    check_controls(design_file)
    new_design = check_files(design_file, fastq_file, args.paired)

    # Write out new design file, tab-separated to match its .tsv name and
    # the format of the design file read above.
    new_design.to_csv('design.tsv', header=True, sep='\t', index=False)


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
import os
import pytest
import pandas as pd
from io import StringIO
import check_design
import sys
# Tab-separated single-end design: two H3K27ac samples (A_1, A_2) and the
# two Input rows (B_1, B_2) that their control_id values point to.
DESIGN_STRING = (
    "sample_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\tfastq_read1\n"
    "A_1\tLiver\tH3K27ac\tNone\t1\tB_1\tA_1.fastq.gz\n"
    "A_2\tLiver\tH3K27ac\tNone\t2\tB_2\tA_2.fastq.gz\n"
    "B_1\tLiver\tInput\tNone\t1\tB_1\tB_1.fastq.gz\n"
    "B_2\tLiver\tInput\tNone\t2\tB_2\tB_2.fastq.gz\n"
)

# Header-less, tab-separated name/path listing; starts with an empty line.
FASTQ_STRING = (
    "\n"
    "A_1.fastq.gz\t/path/to/file/A_1.fastq.gz\n"
    "A_2.fastq.gz\t/path/to/file/A_2.fastq.gz\n"
    "B_1.fastq.gz\t/path/to/file/B_1.fastq.gz\n"
    "B_2.fastq.gz\t/path/to/file/B_2.fastq.gz\n"
)
@pytest.fixture
def design():
    '''Return DESIGN_STRING parsed as a tab-separated DataFrame.'''
    return pd.read_csv(StringIO(DESIGN_STRING), sep="\t")
@pytest.fixture
def fastq_files():
    '''Return FASTQ_STRING parsed into a name/path DataFrame.'''
    return pd.read_csv(
        StringIO(FASTQ_STRING), sep='\t', names=['name', 'path'])
@pytest.fixture
def design_1(design):
    '''Return the design without its fastq_read1 column.'''
    return design.drop('fastq_read1', axis=1)
@pytest.fixture
def design_2(design):
    '''Return the design without the B_1 control row (third row).'''
    control_row = design.index[2]
    return design.drop(control_row)
@pytest.fixture
def design_3(design):
    '''Return a paired-end design built from the single-end one.

    The A_2 and B_2 rows are removed and their fastq_read1 values become
    the fastq_read2 values of the remaining A_1 and B_1 rows.
    '''
    second_reads = [1, 3]
    paired_design = design.drop(design.index[second_reads])
    paired_design['fastq_read2'] = \
        design.loc[second_reads, 'fastq_read1'].values
    return paired_design
@pytest.fixture
def fastq_files_1(fastq_files):
    '''Return the fastq listing without its B_2.fastq.gz row (fourth row).'''
    last_row = fastq_files.index[3]
    return fastq_files.drop(last_row)
def test_check_headers_singleend(design_1):
    '''Single-end header check reports the missing fastq_read1 column.'''
    expected = "Missing column headers: ['fastq_read1']"
    with pytest.raises(Exception) as raised:
        check_design.check_design_headers(design_1, False)
    assert str(raised.value) == expected
def test_check_headers_pairedend(design):
    '''Paired-end header check reports the missing fastq_read2 column.'''
    expected = "Missing column headers: ['fastq_read2']"
    with pytest.raises(Exception) as raised:
        check_design.check_design_headers(design, True)
    assert str(raised.value) == expected
def test_check_controls(design_2):
    '''Control check reports B_1 once its sample row has been dropped.'''
    expected = "Missing control experiments: ['B_1']"
    with pytest.raises(Exception) as raised:
        check_design.check_controls(design_2)
    assert str(raised.value) == expected
def test_check_files_missing_files(design, fastq_files_1):
    '''File check reports B_2.fastq.gz when the listing lacks it.'''
    expected = "Missing files from design file: ['B_2.fastq.gz']"
    with pytest.raises(Exception) as raised:
        check_design.check_files(design, fastq_files_1, False)
    assert str(raised.value) == expected
def test_check_files_output_singleend(design, fastq_files):
    '''Single-end file check replaces the read1 name with its path.'''
    resolved = check_design.check_files(design, fastq_files, False)
    first_read1 = resolved.loc[0, 'fastq_read1']
    assert first_read1 == "/path/to/file/A_1.fastq.gz"
def test_check_files_output_pairedend(design_3, fastq_files):
    '''Paired-end file check replaces the read2 name with its path.'''
    resolved = check_design.check_files(design_3, fastq_files, True)
    first_read2 = resolved.loc[0, 'fastq_read2']
    assert first_read2 == "/path/to/file/A_2.fastq.gz"
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment