From 87f907089fb432c11e5d6f70cbacfef0a6309440 Mon Sep 17 00:00:00 2001 From: Venkat Malladi <venkat.malladi@utsouthwestern.edu> Date: Wed, 27 Jun 2018 17:12:28 -0500 Subject: [PATCH] Fix paired end. --- workflow/main.nf | 8 +-- workflow/scripts/pool_and_psuedoreplicate.py | 53 +++++++++++++++++++- 2 files changed, 55 insertions(+), 6 deletions(-) diff --git a/workflow/main.nf b/workflow/main.nf index 9cd780d..4fa5d9f 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -239,7 +239,7 @@ process crossReads { output: - set sampleId, tagAlign, file('*.cc.qc'), experimentId, biosample, factor, treatment, replicate, controlId into xcorReads + set sampleId, seTagAlign, tagAlign, file('*.cc.qc'), experimentId, biosample, factor, treatment, replicate, controlId into xcorReads set file('*.cc.qc'), file('*.cc.plot.pdf') into xcorReadsStats script: @@ -259,9 +259,9 @@ process crossReads { // Define channel collecting tagAlign and xcor into design file xcorDesign = xcorReads - .map{ sampleId, tagAlign, xcor, experimentId, biosample, factor, treatment, replicate, controlId -> - "$sampleId\t$tagAlign\t$xcor\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"} - .collectFile(name:'design_xcor.tsv', seed:"sample_id\ttag_align\txcor\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design") + .map{ sampleId, seTagAlign, tagAlign, xcor, experimentId, biosample, factor, treatment, replicate, controlId -> + "$sampleId\t$seTagAlign\t$tagAlign\t$xcor\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"} + .collectFile(name:'design_xcor.tsv', seed:"sample_id\tse_tag_align\ttag_align\txcor\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design") // Make Experiment design files to be read in for downstream analysis process defineExpDesignFiles { diff --git a/workflow/scripts/pool_and_psuedoreplicate.py b/workflow/scripts/pool_and_psuedoreplicate.py index 72a2558..3481495 100644 --- a/workflow/scripts/pool_and_psuedoreplicate.py +++ b/workflow/scripts/pool_and_psuedoreplicate.py @@ -98,6 +98,20 @@ def pool(tag_files, outfile, paired): return pooled_filename +def bedpe_to_tagalign(tag_file, outfile): + '''Convert read pairs to reads itno standard tagAlign file.''' + + se_tag_filename = outfile + "bedse.tagAlign.gz" + + # Convert read pairs to reads into standard tagAlign file + tag_steps = ["zcat -f %s" % (tag_file)] + tag_steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""]) + tag_steps.extend(['gzip -cn']) + out, err = utils.run_pipe(tag_steps, outfile=se_tag_filename) + + return se_tag_filename + + def self_psuedoreplication(tag_file, prefix, paired): '''Make 2 self-psuedoreplicates.''' @@ -183,13 +197,24 @@ def main(): else: pool_control = design_df.control_tag_align.unique()[0] + # if paired_end make tagAlign + if paired: + pool_control_tmp = bedpe_to_tagalign(pool_control, "pool_control") + pool_control = pool_control_tmp + # Psuedoreplicate and update design accordingly if not replicated: - # Duplicate rows and update for pool and psuedoreplicates + # Duplicate rows and update for pool and psuedoreplicates and update tagAlign with single end data experiment_id = design_df.at[0, 'experiment_id'] replicate = design_df.at[0, 'replicate'] design_new_df = design_df.loc[np.repeat(design_df.index, 4)].reset_index() + + # Update tagAlign with single end data + if paired: + design_new_df['tag_align'] = design_new_df['se_tag_align'] + design_new_df.drop(labels = 'se_tag_align', axis = 1, inplace = True) + design_new_df['replicate'] = design_new_df['replicate'].astype(str) design_new_df.at[1, 'sample_id'] = experiment_id + '_pr1' design_new_df.at[1, 'replicate'] = '1_pr' @@ -200,6 +225,7 @@ def main(): design_new_df.at[3, 'sample_id'] = experiment_id + '_pooled' design_new_df.at[3, 'replicate'] = 'pooled' design_new_df.at[3, 'xcor'] = 'Calculate' + design_new_df.at[3, 'tag_align'] = design_new_df.at[0, 'tag_align'] # Make 2 self psuedoreplicates self_pseudoreplicates_dict = {} @@ -217,12 +243,20 @@ def main(): # Drop index column design_new_df.drop(labels='index', axis=1, inplace=True) + + else: # Make pool of replicates replicate_files = design_df.tag_align.unique() experiment_id = design_df.at[0, 'experiment_id'] pool_experiment = pool(replicate_files, experiment_id + "_pooled", paired) + # If paired change to single End + if paired: + pool_experiment_se = bedpe_to_tagalign(pool_experiment, experiment_id + "_pooled") + else: + pool_experiment_se = pool_experiment + # Make self psuedoreplicates equivalent to number of replicates pseudoreplicates_dict = {} for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']): @@ -240,6 +274,10 @@ def main(): pool_pseudoreplicates_dict[replicate_id] = pool_replicate design_new_df = design_df + # Update tagAlign with single end data + if paired: + design_new_df['tag_align'] = design_new_df['se_tag_align'] + design_new_df.drop(labels = 'se_tag_align', axis = 1, inplace = True) # Check controls against cutoff_ratio # if so replace with pool_control # unless single control was used @@ -260,6 +298,16 @@ def main(): % row['replicate']) design_new_df.loc[index, 'control_tag_align'] = \ path_to_pool_control + else: + if paired: + control = row['control_tag_align'] + control_basename = os.path.basename( + utils.strip_extensions(control, ['.filt.nodup.bedpe.gz'])) + control_tmp = bedpe_to_tagalign(control , "control_basename") + path_to_control = cwd + '/' + control_tmp + design_new_df.loc[index, 'control_tag_align'] = \ + path_to_control + else: path_to_pool_control = pool_control @@ -278,10 +326,11 @@ def main(): tmp_metadata['sample_id'] = experiment_id + '_pooled' tmp_metadata['replicate'] = 'pooled' tmp_metadata['xcor'] = 'Calculate' - path_to_file = cwd + '/' + pool_experiment + path_to_file = cwd + '/' + pool_experiment_se tmp_metadata['tag_align'] = path_to_file design_new_df = design_new_df.append(tmp_metadata) + # Write out new dataframe design_new_df.to_csv(experiment_id + '_ppr.tsv', header=True, sep='\t', index=False) -- GitLab