Skip to content
Snippets Groups Projects

Develop

Merged Gervaise Henry requested to merge develop into 10-count.features
Compare and Show latest version
3 files
+ 238
0
Preferences
File browser
Compare changes
+ 139
0
#!/usr/bin/env python3
'''Check if design file is correctly formatted and matches files list.'''
import argparse
import logging
import pandas as pd
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-d', '--design',
help="The design file to run QC (tsv format).",
required=True )
parser.add_argument('-f', '--fastq',
help="File with list of fastq files (tsv format).",
required=True )
parser.add_argument('-t', '--feature',
help="Additional features to count?",
required=True )
args = parser.parse_args()
return args
def check_design_headers_n(design):
'''Check if design file conforms to sequencing type.'''
# Default headers
design_template = [
'Sample',
'fastq_R1',
'fastq_R2']
design_headers = list(design.columns.values)
# Check if headers
logger.info("Running header check.")
missing_headers = set(design_template) - set(design_headers)
if len(missing_headers) > 0:
logger.error('Missing column headers: %s', list(missing_headers))
raise Exception("Missing column headers: %s" % list(missing_headers))
return design
def check_design_headers_y(design):
'''Check if design file conforms to sequencing type.'''
# Default headers
design_template = [
'Sample',
'fastq_R1',
'fastq_R2',
'library_type']
design_headers = list(design.columns.values)
# Check if headers
logger.info("Running header check.")
missing_headers = set(design_template) - set(design_headers)
if len(missing_headers) > 0:
logger.error('Missing column headers: %s', list(missing_headers))
raise Exception("Missing column headers: %s" % list(missing_headers))
return design
def check_files(design, fastq):
'''Check if design file has the files found.'''
logger.info("Running file check.")
files = list(design['fastq_R1']) + list(design['fastq_R2'])
files_found = fastq['name']
missing_files = set(files) - set(files_found)
if len(missing_files) > 0:
logger.error('Missing files from design file: %s', list(missing_files))
raise Exception("Missing files from design file: %s" %
list(missing_files))
else:
file_dict = fastq.set_index('name').T.to_dict()
design['fastq_R1'] = design['fastq_R1'].apply(lambda x: file_dict[x]['path'])
design['fastq_R2'] = design['fastq_R2'].apply(lambda x: file_dict[x]['path'])
return design
def main():
args = get_args()
design = args.design
# Create a file handler
handler = logging.FileHandler('design.log')
logger.addHandler(handler)
# Read files as dataframes
design_df = pd.read_csv(args.design, sep=',')
fastq_df = pd.read_csv(args.fastq, sep='\t', names=['name', 'path'])
# Check design file
if args.feature == 'no':
new_design_df = check_design_headers_n(design_df)
else:
new_design_df = check_design_headers_y(design_df)
#new_design_df[['sample']].to_csv('library.checked.csv', header=True, sep=',', index=False)
check_files(design_df, fastq_df)
new_design_df.drop('library_type', 1).to_csv('design.checked.csv', header=True, sep=',', index=False)
if __name__ == '__main__':
main()