# check_design.test.py
#!/usr/bin/env python3

'''Check if design file is correctly formatted and matches files list.'''

import argparse
import logging
import pandas as pd

# Text shown after the option list in --help; argparse substitutes
# %(prog)s with the program name.
EPILOG = (
    '\n'
    'For more details:\n'
    '        %(prog)s --help\n'
)

# SETTINGS

# Module logger: INFO threshold, no propagation to ancestor loggers, and a
# NullHandler so it stays silent until another handler is attached.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.propagate = False
logger.addHandler(logging.NullHandler())


def get_args(argv=None):
    '''Parse the command-line arguments of the design check.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. The default (None) keeps the normal
            command-line behaviour.

    Returns:
        argparse.Namespace with the attributes ``design``, ``fastq`` and
        ``feature``; all three options are required.
    '''

    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # The design file is read with sep=',' by main(), so it is described
    # as csv here (the fastq list is the tab-separated input).
    parser.add_argument('-d', '--design',
                        help="The design file to run QC (csv format).",
                        required=True)

    parser.add_argument('-f', '--fastq',
                        help="File with list of fastq files (tsv format).",
                        required=True)

    parser.add_argument('-t', '--feature',
                        help="Additional features to count?",
                        required=True)

    return parser.parse_args(argv)


def _check_design_headers(design, required_headers):
    '''Raise if any of required_headers is not a column of design.

    Args:
        design: DataFrame holding the design table.
        required_headers: Column names that must be present.

    Returns:
        The design object, unchanged.

    Raises:
        Exception: If one or more required columns are missing.
    '''

    # Check if headers
    logger.info("Running header check.")

    present_headers = set(design.columns)

    # Walk the template (not a set difference) so the reported order is
    # the same on every run.
    missing_headers = [header for header in required_headers
                       if header not in present_headers]

    if missing_headers:
        logger.error('Missing column headers: %s', missing_headers)
        raise Exception("Missing column headers: %s" % missing_headers)

    return design


def check_design_headers_n(design):
    '''Check the design headers when no additional feature is counted.

    Requires the columns 'Sample', 'fastq_R1' and 'fastq_R2'.

    Returns:
        The design object, unchanged.

    Raises:
        Exception: If one or more required columns are missing.
    '''

    return _check_design_headers(
        design, ['Sample', 'fastq_R1', 'fastq_R2'])


def check_design_headers_y(design):
    '''Check the design headers when additional features are counted.

    Requires the columns 'Sample', 'fastq_R1', 'fastq_R2' and
    'library_type'.

    Returns:
        The design object, unchanged.

    Raises:
        Exception: If one or more required columns are missing.
    '''

    return _check_design_headers(
        design, ['Sample', 'fastq_R1', 'fastq_R2', 'library_type'])

def check_files(design, fastq):
    '''Check that every fastq named in the design is listed, then resolve it.

    Args:
        design: DataFrame with 'fastq_R1' and 'fastq_R2' columns holding
            file names.
        fastq: DataFrame with a 'name' column and a 'path' column.

    Returns:
        The same design object; its 'fastq_R1' and 'fastq_R2' columns are
        overwritten in place with the matching 'path' values.

    Raises:
        Exception: If a file named in the design is absent from
            fastq['name'].
    '''

    logger.info("Running file check.")

    files = list(design['fastq_R1']) + list(design['fastq_R2'])
    files_found = set(fastq['name'])

    # De-duplicate while keeping design order so the report is stable
    # between runs (a plain set difference has no defined order).
    missing_files = [name for name in dict.fromkeys(files)
                     if name not in files_found]

    if missing_files:
        logger.error('Missing files from design file: %s', missing_files)
        raise Exception("Missing files from design file: %s" %
                        missing_files)

    # Direct name -> path lookup; no need to transpose the whole frame,
    # which warns and drops entries when a name occurs more than once.
    path_by_name = dict(zip(fastq['name'], fastq['path']))

    # Assign back into the caller's frame: the update is intentionally
    # in place as well as returned.
    for column in ('fastq_R1', 'fastq_R2'):
        design[column] = [path_by_name[name] for name in design[column]]

    return design


def main():
    '''Validate the design file against the fastq list and write the result.

    Reads the comma-separated design file and the tab-separated,
    header-less fastq list (columns: name, path), checks the design
    headers and the file names, and writes 'design.checked.csv' without
    the 'library_type' column. Log messages are written to 'design.log'.

    Raises:
        Exception: Propagated from the header or file checks when the
            design is invalid.
    '''
    args = get_args()

    # Create a file handler
    handler = logging.FileHandler('design.log')
    logger.addHandler(handler)

    try:
        # Read files as dataframes
        design_df = pd.read_csv(args.design, sep=',')
        fastq_df = pd.read_csv(args.fastq, sep='\t', names=['name', 'path'])

        # Check design file; 'library_type' is only required when
        # additional features are requested.
        if args.feature == 'no':
            new_design_df = check_design_headers_n(design_df)
        else:
            new_design_df = check_design_headers_y(design_df)

        # Use the returned frame rather than relying on aliasing.
        new_design_df = check_files(new_design_df, fastq_df)

        # Keyword form: the positional axis argument is gone in pandas 2.
        # errors='ignore' because the column is optional with --feature no.
        checked_df = new_design_df.drop(
            columns='library_type', errors='ignore')
        checked_df.to_csv(
            'design.checked.csv', header=True, sep=',', index=False)
    finally:
        # Detach and close the log file so repeated calls do not stack
        # handlers or leave the file open.
        logger.removeHandler(handler)
        handler.close()



# Run the design check only when this file is executed as a script.
if __name__ == "__main__":
    main()