Skip to content
Snippets Groups Projects
Commit 02bef1b4 authored by Venkat Malladi
Browse files

Add in test for design files.

parent dfed27c2
Branches
Tags
No related merge requests found
[pytest]
python_paths = workflow/scripts
#!/usr/bin/env python3
'''Check if design file is correctly formatted and matches files list.'''
import argparse
import logging
import pandas as pd
# Text passed to argparse as the help epilog; argparse expands %(prog)s to the
# program name when rendering --help.
EPILOG = '''
For more details:
%(prog)s --help
'''
## SETTINGS
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# No-op handler so the logger always has at least one handler attached;
# main() adds a file handler on top of it.
logger.addHandler(logging.NullHandler())
# Keep records from being passed on to handlers of ancestor loggers.
logger.propagate = False
# Record INFO and above (the check functions log at INFO and ERROR).
logger.setLevel(logging.INFO)
def get_args():
    '''Parse the command-line arguments of the design-file check.

    Returns:
        argparse.Namespace with ``design`` (path to the design file),
        ``fastq`` (path to the fastq file list) and ``paired`` (bool, True
        only when -p/--paired is given).
    '''
    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('-d', '--design',
                        help="The design file to run QC (TSV format).",
                        required=True)

    # The list is parsed as tab-separated name/path rows, so advertise TSV.
    parser.add_argument('-f', '--fastq',
                        help="File with list of fastq files (TSV format).",
                        required=True)

    # A store_true flag has to default to False: with default=True the value
    # is True whether or not -p is passed, so single-end mode is unreachable.
    parser.add_argument('-p', '--paired',
                        help="True/False if paired-end or single end.",
                        default=False,
                        action='store_true')

    return parser.parse_args()
def check_design_headers(design, paired):
    '''Check that the design file has every column the sequencing type needs.

    Args:
        design: pandas DataFrame holding the design file.
        paired: truthy for paired-end data, which additionally requires a
            ``fastq_read2`` column.

    Raises:
        Exception: if any required column is missing. The message lists the
            missing column names in sorted order.
    '''
    # Columns required for every design file
    required_headers = [
        'sample_id',
        'biosample',
        'factor',
        'treatment',
        'replicate',
        'control_id',
        'fastq_read1']

    if paired:  # paired-end data
        required_headers.append('fastq_read2')

    logger.info("Running header check.")

    # Sorted so the reported list is stable rather than following set
    # iteration order, which can differ between runs.
    missing_headers = sorted(set(required_headers) - set(design.columns))

    if missing_headers:
        logger.error('Missing column headers: %s', missing_headers)
        raise Exception("Missing column headers: %s" % missing_headers)
def check_controls(design):
    '''Check that every control referenced in the design file is a sample.

    Args:
        design: pandas DataFrame with ``control_id`` and ``sample_id``
            columns.

    Raises:
        Exception: if any ``control_id`` value does not appear in
            ``sample_id``. The message lists the offending values in a
            stable order.
    '''
    logger.info("Running control check.")

    # key=str keeps the sort safe if the column mixes value types (for
    # example strings and NaN), and gives a reproducible message.
    missing_controls = sorted(
        set(design['control_id']) - set(design['sample_id']), key=str)

    if missing_controls:
        logger.error('Missing control experiments: %s', missing_controls)
        raise Exception(
            "Missing control experiments: %s" % missing_controls)
def check_files(design, fastq, paired):
    '''Check the design's fastq files exist in the file list and map paths.

    Args:
        design: pandas DataFrame with a ``fastq_read1`` column (and
            ``fastq_read2`` when ``paired`` is truthy) holding file names.
        fastq: pandas DataFrame with ``name`` and ``path`` columns.
        paired: truthy for paired-end data.

    Returns:
        The same ``design`` DataFrame, updated in place so that the fastq
        column(s) hold the ``path`` of each file instead of its name.

    Raises:
        Exception: if the design names a file that is not in ``fastq``. The
            message lists the missing names in a stable order.
    '''
    logger.info("Running file check.")

    read_columns = ['fastq_read1']
    if paired:  # paired-end data
        read_columns.append('fastq_read2')

    design_files = {
        name for column in read_columns for name in design[column]}

    # key=str keeps the sort safe for mixed value types (e.g. NaN for a
    # blank cell) and makes the reported list reproducible.
    missing_files = sorted(design_files - set(fastq['name']), key=str)

    if missing_files:
        logger.error('Missing files from design file: %s', missing_files)
        raise Exception(
            "Missing files from design file: %s" % missing_files)

    # Every design file is known at this point, so the lookup cannot miss.
    path_by_name = dict(zip(fastq['name'], fastq['path']))
    for column in read_columns:
        design[column] = design[column].map(path_by_name)

    return design
def main():
    '''Run all design-file checks and write the path-resolved design file.

    Reads the design file and the fastq list named on the command line,
    logs to ``design.log``, and writes the validated design (fastq names
    replaced by paths) to ``design.tsv`` in the working directory. Any
    failed check raises and no output file is written.
    '''
    args = get_args()

    # Create a file handler
    handler = logging.FileHandler('design.log')
    logger.addHandler(handler)

    # Read files; the fastq list has no header row, so name its columns.
    design_file = pd.read_csv(args.design, sep='\t')
    fastq_file = pd.read_csv(args.fastq, sep='\t', names=['name', 'path'])

    # Check design file
    check_design_headers(design_file, args.paired)
    check_controls(design_file)
    new_design = check_files(design_file, fastq_file, args.paired)

    # Write out new design file, tab-separated to match its .tsv name and
    # the format of the input design file.
    new_design.to_csv('design.tsv', header=True, sep='\t', index=False)


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
import os
import pytest
import pandas as pd
from io import StringIO
import check_design
import sys
# Tab-separated single-end design used by the fixtures: samples A_1 and A_2
# (factor H3K27ac) name B_1 and B_2 (factor Input) as their controls, and each
# Input sample names itself as its control. There is no fastq_read2 column.
DESIGN_STRING = """sample_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\tfastq_read1
A_1\tLiver\tH3K27ac\tNone\t1\tB_1\tA_1.fastq.gz
A_2\tLiver\tH3K27ac\tNone\t2\tB_2\tA_2.fastq.gz
B_1\tLiver\tInput\tNone\t1\tB_1\tB_1.fastq.gz
B_2\tLiver\tInput\tNone\t2\tB_2\tB_2.fastq.gz
"""
# Tab-separated fastq list with one "name<TAB>path" row per file and no header
# row. The literal begins with an empty line; pandas.read_csv skips blank
# lines by default, so it does not become a row.
FASTQ_STRING = """
A_1.fastq.gz\t/path/to/file/A_1.fastq.gz
A_2.fastq.gz\t/path/to/file/A_2.fastq.gz
B_1.fastq.gz\t/path/to/file/B_1.fastq.gz
B_2.fastq.gz\t/path/to/file/B_2.fastq.gz
"""
@pytest.fixture
def design():
    '''Return the design table parsed from DESIGN_STRING.'''
    return pd.read_csv(StringIO(DESIGN_STRING), sep="\t")
@pytest.fixture
def fastq_files():
    '''Return the fastq list parsed from FASTQ_STRING (name, path columns).'''
    column_names = ['name', 'path']
    return pd.read_csv(StringIO(FASTQ_STRING), sep='\t', names=column_names)
@pytest.fixture
def design_1(design):
    '''Return the design table without its fastq_read1 column.'''
    return design.drop(['fastq_read1'], axis=1)
@pytest.fixture
def design_2(design):
    '''Return the design table with the control row B_1 removed.'''
    control_row = design.index[2]  # third row is sample B_1
    return design.drop(control_row)
@pytest.fixture
def design_3(design):
    '''Return a paired-end design built from the single-end table.

    Rows A_2 and B_2 are removed and their fastq_read1 values become the
    fastq_read2 values of the remaining rows A_1 and B_1.
    '''
    moved_rows = [1, 3]
    second_reads = design.loc[moved_rows, 'fastq_read1'].values
    paired_design = design.drop(design.index[moved_rows])
    paired_design['fastq_read2'] = second_reads
    return paired_design
@pytest.fixture
def fastq_files_1(fastq_files):
    '''Return the fastq list with the B_2.fastq.gz row removed.'''
    last_row = fastq_files.index[3]  # fourth row is B_2.fastq.gz
    return fastq_files.drop(last_row)
def test_check_headers_singleend(design_1):
    '''A single-end design lacking fastq_read1 fails the header check.'''
    expected = "Missing column headers: ['fastq_read1']"
    with pytest.raises(Exception) as error:
        check_design.check_design_headers(design_1, False)
    assert str(error.value) == expected
def test_check_headers_pairedend(design):
    '''A design lacking fastq_read2 fails the paired-end header check.'''
    expected = "Missing column headers: ['fastq_read2']"
    with pytest.raises(Exception) as error:
        check_design.check_design_headers(design, True)
    assert str(error.value) == expected
def test_check_controls(design_2):
    '''A control_id with no matching sample_id fails the control check.'''
    expected = "Missing control experiments: ['B_1']"
    with pytest.raises(Exception) as error:
        check_design.check_controls(design_2)
    assert str(error.value) == expected
def test_check_files_missing_files(design, fastq_files_1):
    '''A design file absent from the fastq list fails the file check.'''
    expected = "Missing files from design file: ['B_2.fastq.gz']"
    with pytest.raises(Exception) as error:
        check_design.check_files(design, fastq_files_1, False)
    assert str(error.value) == expected
def test_check_files_output_singleend(design, fastq_files):
    '''The single-end file check replaces fastq_read1 names with paths.'''
    resolved = check_design.check_files(design, fastq_files, False)
    first_read = resolved.loc[0, 'fastq_read1']
    assert first_read == "/path/to/file/A_1.fastq.gz"
def test_check_files_output_pairedend(design_3, fastq_files):
    '''The paired-end file check replaces fastq_read2 names with paths.'''
    resolved = check_design.check_files(design_3, fastq_files, True)
    second_read = resolved.loc[0, 'fastq_read2']
    assert second_read == "/path/to/file/A_2.fastq.gz"
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment