diff --git a/workflow/scripts/pool_and_psuedoreplicate.py b/workflow/scripts/pool_and_psuedoreplicate.py index 8f313010ffa7778f5b4191f9da91c06d5ab74640..6c37eed940ca89f71ebd3c49a2dae042f94d5b05 100644 --- a/workflow/scripts/pool_and_psuedoreplicate.py +++ b/workflow/scripts/pool_and_psuedoreplicate.py @@ -204,77 +204,32 @@ def generate_design(paired, cutoff_ratio, design_df, cwd, no_reps, no_unique_con pool_control_tmp = bedpe_to_tagalign(pool_control, "pool_control") pool_control = pool_control_tmp - if not replicated: - - # Duplicate rows and update for pool and psuedoreplicates and update tagAlign with single end data - experiment_id = design_df.at[0, 'experiment_id'] - replicate = design_df.at[0, 'replicate'] - design_new_df = design_df.loc[np.repeat(design_df.index, 4)].reset_index() - pool_experiment = design_df.tag_align.unique() - - # Update tagAlign with single end data - if paired: - design_new_df['tag_align'] = design_new_df['se_tag_align'] - design_new_df.drop(labels='se_tag_align', axis=1, inplace=True) - - design_new_df['replicate'] = design_new_df['replicate'].astype(str) - design_new_df.at[1, 'sample_id'] = experiment_id + '_pr1' - design_new_df.at[1, 'replicate'] = '1_pr' - design_new_df.at[1, 'xcor'] = 'Calculate' - design_new_df.at[2, 'sample_id'] = experiment_id + '_pr2' - design_new_df.at[2, 'replicate'] = '2_pr' - design_new_df.at[2, 'xcor'] = 'Calculate' - design_new_df.at[3, 'sample_id'] = experiment_id + '_pooled' - design_new_df.at[3, 'replicate'] = 'pooled' - design_new_df.at[3, 'xcor'] = 'Calculate' - design_new_df.at[3, 'tag_align'] = design_new_df.at[0, 'tag_align'] - - # Make 2 self psuedoreplicates - self_pseudoreplicates_dict = {} - for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']): - replicate_prefix = experiment_id + '_' + str(rep) - self_pseudoreplicates_dict[rep] = self_psuedoreplication(tag_file, replicate_prefix, paired) - - - # Update design to include new self pseudo replicates - self_pseudoreplicates_df = pd.DataFrame.from_dict(self_pseudoreplicates_dict) - pool_pseudoreplicates_dict = {} - for rep, pseudorep_file in self_pseudoreplicates_df.iterrows(): - path_to_file = cwd + '/' + pseudorep_file - replicate = rep + 1 - design_new_df.loc[replicate, 'tag_align'] = path_to_file - pool_pseudoreplicates_dict[replicate] = path_to_file - - # Drop index column - design_new_df.drop(labels='index', axis=1, inplace=True) - - else: - # Make pool of replicates - replicate_files = design_df.tag_align.unique() - experiment_id = design_df.at[0, 'experiment_id'] - pool_experiment = pool(replicate_files, experiment_id + "_pooled", paired) - - # Make self psuedoreplicates equivalent to number of replicates - pseudoreplicates_dict = {} - for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']): - replicate_prefix = experiment_id + '_' + str(rep) - pr_dict = self_psuedoreplication(tag_file, replicate_prefix, paired) - pseudoreplicates_dict[rep] = pr_dict - - # Merge self psuedoreplication for each true replicate - pseudoreplicates_df = pd.DataFrame.from_dict(pseudoreplicates_dict) - pool_pseudoreplicates_dict = {} - for index, row in pseudoreplicates_df.iterrows(): - replicate_id = index + 1 - pr_filename = experiment_id + ".pr" + str(replicate_id) + '.tagAlign.gz' - pool_replicate = pool(row, pr_filename, False) - pool_pseudoreplicates_dict[replicate_id] = pool_replicate - - design_new_df = design_df - # Update tagAlign with single end data - if paired: - design_new_df['tag_align'] = design_new_df['se_tag_align'] - design_new_df.drop(labels='se_tag_align', axis=1, inplace=True) + # Duplicate rows and update for pool and psuedoreplicates and update tagAlign with single end data + experiment_id = design_df.at[0, 'experiment_id'] + replicate_files = design_df.tag_align.unique() + pool_experiment = pool(replicate_files, experiment_id + "_pooled", paired) + + # Make 2 self psuedoreplicates + pseudoreplicates_dict = {} + for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']): + replicate_prefix = experiment_id + '_' + str(rep) + pr_dict = self_psuedoreplication(tag_file, replicate_prefix, paired) + pseudoreplicates_dict[rep] = pr_dict + + # Update design to include new self pseudo replicates + pseudoreplicates_df = pd.DataFrame.from_dict(pseudoreplicates_dict) + pool_pseudoreplicates_dict = {} + for index, row in pseudoreplicates_df.iterrows(): + replicate_id = index + 1 + pr_filename = experiment_id + ".pr" + str(replicate_id) + '.tagAlign.gz' + pool_replicate = pool(row, pr_filename, False) + pool_pseudoreplicates_dict[replicate_id] = pool_replicate + + design_new_df = design_df #.loc[np.repeat(design_df.index, 4)].reset_index() + # Update tagAlign with single end data + if paired: + design_new_df['tag_align'] = design_new_df['se_tag_align'] + design_new_df.drop(labels='se_tag_align', axis=1, inplace=True) # If paired change to single End if paired: diff --git a/workflow/tests/test_pool_and_psuedoreplicate.py b/workflow/tests/test_pool_and_psuedoreplicate.py index c780122ed95c4bb720852976336054c538f11e83..f251e888f906de1c7e360502cadf45c8c9b85a3c 100644 --- a/workflow/tests/test_pool_and_psuedoreplicate.py +++ b/workflow/tests/test_pool_and_psuedoreplicate.py @@ -70,7 +70,6 @@ def test_single_rep(design_experiment_2): shutil.copy(test_design_path + 'A_1.tagAlign.gz', cwd) shutil.copy(test_design_path + 'B_1.tagAlign.gz', cwd) single_rep = pool_and_psuedoreplicate.generate_design('false', 1.2, design_experiment_2, cwd, 1, 1) - print(single_rep) assert single_rep.shape[0] == 4 @pytest.mark.singleend