#!/usr/bin/env python3

'''Generate pooled and pseudoreplicate from data.'''

import argparse
import logging
import os
import subprocess
import shutil
import shlex
import pandas as pd
import numpy as np
import utils

EPILOG = '''
For more details:
        %(prog)s --help
'''

# SETTINGS

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)

# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.gz', '.tagAlign', '.bedse', '.bedpe']


def get_args():
    '''Define and parse command-line arguments.

    Returns:
        argparse.Namespace with `design` (path to design TSV),
        `paired` (bool flag) and `cutoff` (float ratio).
    '''

    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('-d', '--design',
                        help="The design file to make experiments (tsv format).",
                        required=True)

    parser.add_argument('-p', '--paired',
                        help="True/False if paired-end or single end.",
                        default=False,
                        action='store_true')

    parser.add_argument('-c', '--cutoff',
                        help="Cutoff ratio used for choosing controls.",
                        type=float,
                        default=1.2)

    args = parser.parse_args()
    return args


def check_replicates(design):
    '''Return the number of replicates (rows) in the design dataframe.'''

    no_rep = design.shape[0]
    return no_rep


def check_controls(design):
    '''Return the number of unique control files in the design dataframe.'''

    no_controls = len(design.control_tag_align.unique())
    return no_controls


def get_read_count_ratio(design):
    '''Get the ratio of read counts for unique controls.

    Returns a square DataFrame indexed and keyed by control file where
    entry (r, c) = reads(r) / reads(c), so its maximum value is the
    largest pairwise read-count ratio between any two controls.
    '''

    controls = design.control_tag_align.unique()
    control_dict = {}
    for con in controls:
        no_reads = utils.count_lines(con)
        control_dict[con] = no_reads

    # Every column of the matrix holds the raw read counts per control;
    # dividing each column by its own control's count yields the ratios.
    control_matrix = {c: control_dict for c in controls}
    control_df = pd.DataFrame.from_dict(control_matrix)

    # BUGFIX: divide along columns (axis=1) so entry (r, c) becomes
    # reads(r) / reads(c). The previous axis=0 divided each row by its
    # own count, making every entry 1.0 and the downstream cutoff check
    # (`control_df.values.max() > cutoff_ratio`) impossible to trigger.
    control_ratio = control_df.divide(list(control_dict.values()), axis=1)
    return control_ratio


def pool(tag_files, outfile, paired):
    '''Concatenate gzipped tag files into one pooled, gzipped file.

    Args:
        tag_files: iterable of gzipped tagAlign/BEDPE paths.
        outfile: basename used (after stripping known extensions) for output.
        paired: True selects the .bedpe.gz extension, else .bedse.gz.

    Returns:
        The pooled output filename.
    '''

    if paired:  # paired-end data
        file_extension = '.bedpe.gz'
    else:  # single end data
        file_extension = '.bedse.gz'

    pool_basename = os.path.basename(
        utils.strip_extensions(outfile, STRIP_EXTENSIONS))

    pooled_filename = pool_basename + file_extension

    # Merge files: decompress all inputs and recompress into one stream
    out, err = utils.run_pipe([
        'gzip -dc %s' % (' '.join(tag_files)),
        'gzip -cn'],
        outfile=pooled_filename)

    return pooled_filename


def bedpe_to_tagalign(tag_file, outfile):
    '''Convert read pairs to reads into standard tagAlign file.

    Each BEDPE record is split into its two constituent reads.
    Returns the gzipped single-end tagAlign filename.
    '''

    se_tag_filename = outfile + ".tagAlign.gz"

    # Convert read pairs to reads into standard tagAlign file
    tag_steps = ["zcat -f %s" % (tag_file)]
    tag_steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""])
    tag_steps.extend(['gzip -cn'])
    out, err = utils.run_pipe(tag_steps, outfile=se_tag_filename)

    return se_tag_filename


def self_psuedoreplication(tag_file, prefix, paired):
    '''Make 2 self-psuedoreplicates by shuffling and halving a tag file.

    The shuffle is seeded from the input file itself so repeated runs on
    the same input produce identical splits.

    Returns:
        dict mapping 0/1 to the two pseudoreplicate filenames.
    '''

    # Get total number of reads
    no_lines = utils.count_lines(tag_file)

    # Number of lines to split into.
    # BUGFIX: use floor division; `/` is float division in Python 3.
    lines_per_rep = (no_lines + 1) // 2

    # Make an array of pseudoreplicate file names
    pseudoreplicate_dict = {r: prefix + '.pr' + str(r) + '.tagAlign.gz'
                            for r in [0, 1]}

    # Shuffle and split file into equal parts
    # by using the input to seed shuf we ensure multiple runs with the same
    # input will produce the same output
    # Produces two files named splits_prefix0n, n=1,2
    splits_prefix = 'temp_split'

    psuedo_command = 'bash -c "zcat {} | shuf --random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f {} | wc -c) -nosalt </dev/zero 2>/dev/null) | '
    psuedo_command += 'split -d -l {} - {}."'
    psuedo_command = psuedo_command.format(
        tag_file,
        tag_file,
        int(lines_per_rep),
        splits_prefix)
    logger.info("Running psuedo with %s", psuedo_command)
    subprocess.check_call(shlex.split(psuedo_command))

    # Convert read pairs to reads into standard tagAlign file
    for i, index in enumerate([0, 1]):
        # `split -d` suffixes are 00 and 01
        string_index = '.0' + str(index)
        steps = ['cat %s' % (splits_prefix + string_index)]
        if paired:
            steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""])
        steps.extend(['gzip -cn'])
        out, err = utils.run_pipe(steps, outfile=pseudoreplicate_dict[i])

    return pseudoreplicate_dict


def main():
    args = get_args()
    paired = args.paired
    design = args.design
    cutoff_ratio = args.cutoff

    # Create a file handler
    handler = logging.FileHandler('experiment_generation.log')
    logger.addHandler(handler)

    # Read files as dataframes
    design_df = pd.read_csv(design, sep='\t')

    # Get current directory to build paths
    cwd = os.getcwd()

    # Check number of replicates and controls
    no_reps = check_replicates(design_df)
    no_unique_controls = check_controls(design_df)

    if no_reps == 1:
        logger.info("No other replicate specified "
                    "so processing as an unreplicated experiment.")
        replicated = False
    else:
        logger.info("Multiple replicates specified "
                    "so processing as a replicated experiment.")
        replicated = True

    if no_unique_controls == 1 and replicated:
        logger.info("Only a single control was specified "
                    "so using same control for replicates, pool and psuedoreplicates.")
        single_control = True
    else:
        logger.info("Will merge only unique controls for pooled.")
        single_control = False

    # Pool the controls for checking
    if not single_control:
        control_df = get_read_count_ratio(design_df)
        control_files = design_df.control_tag_align.unique()
        pool_control = pool(control_files, "pool_control", paired)
    else:
        pool_control = design_df.control_tag_align.unique()[0]

    # if paired_end make tagAlign
    if paired:
        pool_control_tmp = bedpe_to_tagalign(pool_control, "pool_control")
        pool_control = pool_control_tmp

    # Psuedoreplicate and update design accordingly
    if not replicated:

        # Duplicate rows and update for pool and psuedoreplicates
        # and update tagAlign with single end data
        experiment_id = design_df.at[0, 'experiment_id']
        design_new_df = design_df.loc[np.repeat(
            design_df.index, 4)].reset_index()

        # Update tagAlign with single end data
        if paired:
            design_new_df['tag_align'] = design_new_df['se_tag_align']
            design_new_df.drop(labels='se_tag_align', axis=1, inplace=True)

        # Rows 1-2 become the two pseudoreplicates, row 3 the pool
        design_new_df['replicate'] = design_new_df['replicate'].astype(str)
        design_new_df.at[1, 'sample_id'] = experiment_id + '_pr1'
        design_new_df.at[1, 'replicate'] = '1_pr'
        design_new_df.at[1, 'xcor'] = 'Calculate'
        design_new_df.at[2, 'sample_id'] = experiment_id + '_pr2'
        design_new_df.at[2, 'replicate'] = '2_pr'
        design_new_df.at[2, 'xcor'] = 'Calculate'
        design_new_df.at[3, 'sample_id'] = experiment_id + '_pooled'
        design_new_df.at[3, 'replicate'] = 'pooled'
        design_new_df.at[3, 'xcor'] = 'Calculate'
        design_new_df.at[3, 'tag_align'] = design_new_df.at[0, 'tag_align']

        # Make 2 self psuedoreplicates
        self_pseudoreplicates_dict = {}
        for rep, tag_file in zip(design_df['replicate'],
                                 design_df['tag_align']):
            replicate_prefix = experiment_id + '_' + str(rep)
            self_pseudoreplicates_dict = \
                self_psuedoreplication(tag_file, replicate_prefix, paired)

        # Update design to include new self pseudo replicates
        for rep, pseudorep_file in self_pseudoreplicates_dict.items():
            path_to_file = cwd + '/' + pseudorep_file
            # pseudoreplicate keys are 0/1; they fill design rows 1/2
            replicate = rep + 1
            design_new_df.loc[replicate, 'tag_align'] = path_to_file

        # Drop index column
        design_new_df.drop(labels='index', axis=1, inplace=True)

    else:
        # Make pool of replicates
        replicate_files = design_df.tag_align.unique()
        experiment_id = design_df.at[0, 'experiment_id']
        pool_experiment = pool(replicate_files, experiment_id + "_pooled",
                               paired)

        # If paired change to single End
        if paired:
            pool_experiment_se = bedpe_to_tagalign(pool_experiment,
                                                   experiment_id + "_pooled")
        else:
            pool_experiment_se = pool_experiment

        # Make self psuedoreplicates equivalent to number of replicates
        pseudoreplicates_dict = {}
        for rep, tag_file in zip(design_df['replicate'],
                                 design_df['tag_align']):
            replicate_prefix = experiment_id + '_' + str(rep)
            pr_dict = self_psuedoreplication(tag_file, replicate_prefix,
                                             paired)
            pseudoreplicates_dict[rep] = pr_dict

        # Merge self psuedoreplication for each true replicate
        pseudoreplicates_df = pd.DataFrame.from_dict(pseudoreplicates_dict)
        pool_pseudoreplicates_dict = {}
        for index, row in pseudoreplicates_df.iterrows():
            replicate_id = index + 1
            pr_filename = experiment_id + ".pr" + str(replicate_id) + '.tagAlign.gz'
            pool_replicate = pool(row, pr_filename, False)
            pool_pseudoreplicates_dict[replicate_id] = pool_replicate

        design_new_df = design_df

        # Update tagAlign with single end data
        if paired:
            design_new_df['tag_align'] = design_new_df['se_tag_align']
            design_new_df.drop(labels='se_tag_align', axis=1, inplace=True)

        # Check controls against cutoff_ratio
        # if so replace with pool_control
        # unless single control was used
        if not single_control:
            path_to_pool_control = cwd + '/' + pool_control
            if control_df.values.max() > cutoff_ratio:
                logger.info("Number of reads in controls differ by " +
                            " > factor of %f. Using pooled controls." %
                            (cutoff_ratio))
                design_new_df['control_tag_align'] = path_to_pool_control
            else:
                # Per replicate: use the pooled control when a control has
                # fewer reads than its experiment; otherwise keep the
                # original control (converted to tagAlign when paired-end).
                for index, row in design_new_df.iterrows():
                    exp_no_reads = utils.count_lines(row['tag_align'])
                    con_no_reads = utils.count_lines(row['control_tag_align'])
                    if con_no_reads < exp_no_reads:
                        logger.info("Fewer reads in control than experiment." +
                                    "Using pooled controls for replicate %s."
                                    % row['replicate'])
                        design_new_df.loc[index, 'control_tag_align'] = \
                            path_to_pool_control
                    else:
                        if paired:
                            control = row['control_tag_align']
                            control_basename = os.path.basename(
                                utils.strip_extensions(control,
                                                       STRIP_EXTENSIONS))
                            control_tmp = bedpe_to_tagalign(control,
                                                            control_basename)
                            path_to_control = cwd + '/' + control_tmp
                            design_new_df.loc[index, 'control_tag_align'] = \
                                path_to_control
        else:
            path_to_pool_control = cwd + '/' + pool_control
            design_new_df['control_tag_align'] = path_to_pool_control

        # Add in pseudo replicates
        tmp_metadata = design_new_df.loc[0].copy()
        tmp_metadata['control_tag_align'] = path_to_pool_control
        for rep, pseudorep_file in pool_pseudoreplicates_dict.items():
            tmp_metadata['sample_id'] = experiment_id + '_pr' + str(rep)
            tmp_metadata['replicate'] = str(rep) + '_pr'
            tmp_metadata['xcor'] = 'Calculate'
            path_to_file = cwd + '/' + pseudorep_file
            tmp_metadata['tag_align'] = path_to_file
            # COMPAT: DataFrame.append was removed in pandas 2.0;
            # pd.concat with the Series as a one-row frame is equivalent.
            design_new_df = pd.concat(
                [design_new_df, tmp_metadata.to_frame().T])

        # Add in pool experiment
        tmp_metadata['sample_id'] = experiment_id + '_pooled'
        tmp_metadata['replicate'] = 'pooled'
        tmp_metadata['xcor'] = 'Calculate'
        path_to_file = cwd + '/' + pool_experiment_se
        tmp_metadata['tag_align'] = path_to_file
        design_new_df = pd.concat(
            [design_new_df, tmp_metadata.to_frame().T])

    # Write out new dataframe
    design_new_df.to_csv(experiment_id + '_ppr.tsv',
                         header=True, sep='\t', index=False)


if __name__ == '__main__':
    main()