Skip to content
Snippets Groups Projects

Resolve "update check design file"

Merged Holly Ruess requested to merge 13-UpdateCheckDesign into master
Compare and
19 files
+ 239
51
Preferences
File browser
Compare changes
@@ -5,6 +5,7 @@
import argparse
import logging
import pandas as pd
import os
EPILOG = '''
For more details:
@@ -49,14 +50,15 @@ def get_args():
def check_design_headers(design, paired, atac):
'''Check if design file conforms to sequencing type.'''
'''Check if design file has proper headers.'''
# Default headers
design_template = [
'sample_id',
'experiment_id',
'replicate',
'fastq_read1']
'fastq_read1',
]
design_headers = list(design.columns.values)
@@ -76,6 +78,46 @@ def check_design_headers(design, paired, atac):
raise Exception("Missing column headers: %s" % list(missing_headers))
def check_samples(design):
    '''Check if design file has the correct sample name mapping.

    A sample name is considered malformed when it contains any whitespace
    character, a dash ('-') or a period ('.').

    Args:
        design: pandas DataFrame holding the design file; must contain a
            'sample_id' column.

    Raises:
        Exception: if at least one sample name is malformed. The message
            lists every offending name.
    '''

    logger.info("Running sample check.")

    # Only the distinct ids are needed. Taking the index of the group sizes
    # yields the same sorted, NaN-free keys as the former
    # groupby(...).apply(list) without running a per-group apply.
    sample_ids = design.groupby('sample_id').size().index

    forbidden = set('-.')

    # str() lets numeric ids be validated instead of failing with a
    # TypeError when iterating over a non-string value.
    malformated_samples = [
        sample for sample in sample_ids
        if any(char.isspace() or char in forbidden for char in str(sample))
    ]

    if malformated_samples:
        logger.error('Malformed samples from design file: %s',
                     malformated_samples)
        raise Exception("Malformed samples from design file: %s" %
                        malformated_samples)
def check_experiments(design):
    '''Check if design file has the correct experiment name mapping.

    An experiment name is considered malformed when it contains any
    whitespace character, a dash ('-') or a period ('.').

    Args:
        design: pandas DataFrame holding the design file; must contain an
            'experiment_id' column.

    Raises:
        Exception: if at least one experiment name is malformed. The message
            lists every offending name.
    '''

    logger.info("Running experiment check.")

    # Only the distinct ids are needed. Taking the index of the group sizes
    # yields the same sorted, NaN-free keys as the former
    # groupby(...).apply(list) without running a per-group apply.
    experiment_ids = design.groupby('experiment_id').size().index

    forbidden = set('-.')

    # str() lets numeric ids be validated instead of failing with a
    # TypeError when iterating over a non-string value.
    malformated_experiments = [
        experiment for experiment in experiment_ids
        if any(char.isspace() or char in forbidden
               for char in str(experiment))
    ]

    if malformated_experiments:
        logger.error('Malformed experiment from design file: %s',
                     malformated_experiments)
        raise Exception("Malformed experiment from design file: %s" %
                        malformated_experiments)
def check_controls(design):
'''Check if design file has the correct control mapping.'''
@@ -90,7 +132,7 @@ def check_controls(design):
def check_replicates(design):
'''Check if design file has unique replicate numbersfor an experiment.'''
'''Check if design file has unique replicate numbers for an experiment.'''
logger.info("Running replicate check.")
@@ -111,7 +153,7 @@ def check_replicates(design):
def check_files(design, fastq, paired):
'''Check if design file has the files found.'''
'''Check if design file fastq lists are actually present.'''
logger.info("Running file check.")
@@ -139,6 +181,24 @@ def check_files(design, fastq, paired):
return design
def get_length(checked_design_df):
    '''Adds length of reads to design file.

    For every path in the 'fastq_read1' column the second line of the
    gzip-compressed file is read and its size is stored in a new
    'fq_length' column.

    Args:
        checked_design_df: pandas DataFrame with a 'fastq_read1' column of
            paths to gzip-compressed fastq files.

    Returns:
        The same DataFrame object, with the 'fq_length' column added.

    Raises:
        OSError: if a file cannot be opened or is not valid gzip data.
    '''
    # Local import keeps this block self-contained; the files are read
    # directly instead of interpolating their names into a shell pipeline,
    # so paths containing spaces or shell metacharacters are handled safely.
    import gzip

    logger.info("Adding read length to design file.")

    fq_len_list = []
    for fastq in checked_design_df['fastq_read1']:
        with gzip.open(fastq, 'rb') as handle:
            first_line = handle.readline()
            second_line = handle.readline()

        # Mirrors `head -n 2 | tail -n 1`: use line 2, or line 1 when the
        # file has a single line (empty bytes for an empty file).
        sequence_line = second_line or first_line

        # NOTE(review): the byte count includes the trailing newline and is
        # kept as a string, exactly like the previous `wc -c` output, so the
        # values written to the design file do not change.
        fq_len_list.append(str(len(sequence_line)))

    checked_design_df['fq_length'] = fq_len_list

    return checked_design_df
def main():
args = get_args()
design = args.design
@@ -161,7 +221,12 @@ def main():
check_controls(design_df)
check_replicates(design_df)
new_design_df = check_files(design_df, fastq_df, paired)
check_samples(design_df)
check_experiments(design_df)
checked_design_df = check_files(design_df, fastq_df, paired)
# Add length of each read to design file
new_design_df = get_length(checked_design_df)
# Write out new design file
new_design_df.to_csv('design.tsv', header=True, sep='\t', index=False)