diff --git a/workflow/main.nf b/workflow/main.nf index 6dfe31af33183ffcf17a43e875c3e883bd237629..cbfef2ac439224a8103e1cc442587cfc6dc74382 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -289,7 +289,7 @@ process convertReads { set sampleId, deduped, bai, experimentId, replicate from convertReads output: - set sampleId, file('*.tagAlign.gz'), experimentId, replicate into tagReads + set sampleId, file('*.tagAlign.gz'), file('*.bed{pe,se}.gz'), experimentId, replicate into tagReads file('version_*.txt') into convertReadsVersions script: @@ -356,10 +356,10 @@ process crossReads { publishDir "${outDir}/${task.process}/${sampleId}", mode: 'copy' input: - set sampleId, tagAlign, experimentId, replicate from tagReads + set sampleId, seTagAlign, tagAlign, experimentId, replicate from tagReads output: - set sampleId, tagAlign, file('*.cc.qc'), experimentId, replicate into xcorReads + set sampleId, seTagAlign, tagAlign, file('*.cc.qc'), experimentId, replicate into xcorReads set file('*.cc.qc'), file('*.cc.plot.pdf') into xcorReadsStats script: @@ -396,8 +396,8 @@ process crossReads { // Define channel collecting tagAlign and xcor into design file xcorDesign = xcorReads - .map{ sampleId, tagAlign, xcor, experimentId, replicate -> - "${sampleId}\t${tagAlign}\t${xcor}\t${experimentId}\t${replicate}\n" } + .map{ sampleId, seTagAlign, tagAlign, xcor, experimentId, replicate -> - .collectFile(name:'design_xcor.tsv', seed:"sample_id\ttag_align\txcor\texperiment_id\treplicate\n", storeDir:"${outDir}/design") + "${sampleId}\t${seTagAlign}\t${tagAlign}\t${xcor}\t${experimentId}\t${replicate}\n" } + .collectFile(name:'design_xcor.tsv', seed:"sample_id\tse_tag_align\ttag_align\txcor\texperiment_id\treplicate\n", storeDir:"${outDir}/design") // Make Experiment design files to be read in for downstream analysis diff --git a/workflow/scripts/convert_reads.py b/workflow/scripts/convert_reads.py index 6497e84cf3c63d9a94dbd2cdb91d04ebf5c1691b..7191356e677a2c420f69fde1b6201844318277e4 100644 --- a/workflow/scripts/convert_reads.py +++ b/workflow/scripts/convert_reads.py @@ -98,6 +98,8 @@ def convert_mapped(bam, tag_filename): def 
convert_mapped_pe(bam, bam_basename, tag_filename): '''Use bedtools to convert to tagAlign PE data.''' + bedpe_filename = bam_basename + ".bedpe.gz" + # Name sort bam to make BEDPE nmsrt_bam_filename = bam_basename + ".nmsrt.bam" samtools_sort_command = \ @@ -114,7 +116,7 @@ def convert_mapped_pe(bam, bam_basename, tag_filename): "grep -v 'chrM'", r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'""", r"""awk 'BEGIN {OFS = "\t"}{ if ($6 == "+") {$2 = $2 + 4} else if ($6 == "-") {$3 = $3 - 5} print $0}'""", - "gzip -nc"], outfile=tag_filename) + "gzip -nc"], outfile=bedpe_filename) os.remove(nmsrt_bam_filename) @@ -138,9 +140,12 @@ def main(): tag_filename = bam_basename + '.tagAlign.gz' if paired: # paired-end data + convert_mapped(bam, tag_filename) convert_mapped_pe(bam, bam_basename, tag_filename) else: convert_mapped(bam, tag_filename) + bedse_filename = bam_basename + ".bedse.gz" + shutil.copy(tag_filename, bedse_filename) if __name__ == '__main__': main() diff --git a/workflow/scripts/pool_and_psuedoreplicate.py b/workflow/scripts/pool_and_psuedoreplicate.py index eadb41d55768f5d01e5b63a06a9dc665170c6573..9a01e6b1e7de6248dda202ba453b7b1793d29a06 100644 --- a/workflow/scripts/pool_and_psuedoreplicate.py +++ b/workflow/scripts/pool_and_psuedoreplicate.py @@ -7,8 +7,10 @@ import logging import pandas as pd import numpy as np import os -import sys import utils +import subprocess +import shutil +import shlex EPILOG = ''' For more details: @@ -39,9 +41,9 @@ def get_args(): default=False, action='store_true') - parser.add_argument('-c', '--cutoff', - help="Cutoff ratio used for choosing controls.", - default=1.2) +# parser.add_argument('-c', '--cutoff', +# help="Cutoff ratio used for choosing controls.", +# default=1.2) args = parser.parse_args() return args @@ -55,28 +57,28 @@ def check_replicates(design): return no_rep -def check_controls(design): - '''Check the number of controls for the experiment.''' 
+#def check_controls(design): +# '''Check the number of controls for the experiment.''' - no_controls = len(design.control_tag_align.unique()) +# no_controls = len(design.control_tag_align.unique()) - return no_controls +# return no_controls -def get_read_count_ratio(design): - '''Get the ratio of read counts for unique controls.''' +#def get_read_count_ratio(design): +# '''Get the ratio of read counts for unique controls.''' - controls = design.control_tag_align.unique() - control_dict = {} - for con in controls: - no_reads = utils.count_lines(con) - control_dict[con] = no_reads +# controls = design.control_tag_align.unique() +# control_dict = {} +# for con in controls: +# no_reads = utils.count_lines(con) +# control_dict[con] = no_reads - control_matrix = {c: control_dict for c in controls} - control_df = pd.DataFrame.from_dict(control_matrix) +# control_matrix = {c: control_dict for c in controls} +# control_df = pd.DataFrame.from_dict(control_matrix) - control_ratio = control_df.divide(list(control_dict.values()), axis=0) - return control_ratio +# control_ratio = control_df.divide(list(control_dict.values()), axis=0) +# return control_ratio def pool(tag_files, outfile, paired): @@ -107,20 +109,26 @@ def self_psuedoreplication(tag_file, prefix, paired): lines_per_rep = (no_lines+1)/2 # Make an array of number of psuedoreplicatesfile names - pseudoreplicate_dict = {r: prefix + '.pr' + str(r) + '.bedse.tagAlign.gz' + pseudoreplicate_dict = {r: prefix + '.pr' + str(r) + '.tagAlign.gz' for r in [0, 1]} # Shuffle and split file into equal parts # by using the input to seed shuf we ensure multiple runs with the same # input will produce the same output # Produces two files named splits_prefix0n, n=1,2 + # Coded for reproducibility splits_prefix = 'temp_split' - out, err = utils.run_pipe([ - 'gzip -dc %s' % (tag_file), - 'shuf --random-source=%s' % (tag_file), - 'split -d -l %d - %s' % (lines_per_rep, splits_prefix)]) + psuedo_command = 'bash -c "zcat {} | shuf 
--random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f {} | wc -c) -nosalt </dev/zero 2>/dev/null) | ' + psuedo_command += 'split -d -l {} - {}."' + psuedo_command = psuedo_command.format( + tag_file, + tag_file, + int(lines_per_rep), + splits_prefix) + logger.info("Running psuedo with %s", psuedo_command) + subprocess.check_call(shlex.split(psuedo_command)) # Convert read pairs to reads into standard tagAlign file @@ -131,7 +139,7 @@ def self_psuedoreplication(tag_file, prefix, paired): steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""]) steps.extend(['gzip -cn']) out, err = utils.run_pipe(steps, outfile=pseudoreplicate_dict[i]) - os.remove(splits_prefix + string_index) +# os.remove(splits_prefix + string_index) return pseudoreplicate_dict @@ -140,7 +148,7 @@ def main(): args = get_args() paired = args.paired design = args.design - cutoff_ratio = args.cutoff +# cutoff_ratio = args.cutoff # Create a file handler handler = logging.FileHandler('experiment_generation.log') @@ -155,10 +163,10 @@ def main(): # Check Number of replicates and controls no_reps = check_replicates(design_df) - if atac: - no_unique_controls = 0 - else: - no_unique_controls = check_controls(design_df) +# if atac: +# no_unique_controls = 0 +# else: +# no_unique_controls = check_controls(design_df) if no_reps == 1: logger.info("No other replicate specified " @@ -170,33 +178,39 @@ def main(): "so processing as a replicated experiment.") replicated = True - if no_unique_controls == 1 and atac: - logger.info("ATAC-seq experiment speficifed " - "no controls are required.") - single_control = False - if no_unique_controls == 1 and replicated: - logger.info("Only a single control was specified " - "so using same control for replicates, pool and psuedoreplicates.") - single_control = True - else: - logger.info("Will merge only unique controls for pooled.") - single_control = False +# if no_unique_controls == 1 and 
atac: +# logger.info("ATAC-seq experiment speficifed " +# "no controls are required.") + # single_control = False +# if no_unique_controls == 1 and replicated: +# logger.info("Only a single control was specified " +# "so using same control for replicates, pool and psuedoreplicates.") +# single_control = True +# else: +# logger.info("Will merge only unique controls for pooled.") +# single_control = False # Pool the controls for checking - if not single_control and not atac: - control_df = get_read_count_ratio(design_df) - control_files = design_df.control_tag_align.unique() - pool_control = pool(control_files, "pool_control", paired) - elif not atac: - pool_control = design_df.control_tag_align.unique()[0] +# if not single_control and not atac: +# control_df = get_read_count_ratio(design_df) +# control_files = design_df.control_tag_align.unique() +# pool_control = pool(control_files, "pool_control", paired) +# elif not atac: +# pool_control = design_df.control_tag_align.unique()[0] # Psuedoreplicate and update design accordingly if not replicated: - # Duplicate rows and update for pool and psuedoreplicates + # Duplicate rows and update for pool and psuedoreplicates and update tagAlign with single end data experiment_id = design_df.at[0, 'experiment_id'] replicate = design_df.at[0, 'replicate'] design_new_df = design_df.loc[np.repeat(design_df.index, 4)].reset_index() + + # Update tagAlign with single end data + if paired: + design_new_df['tag_align'] = design_new_df['se_tag_align'] + design_new_df.drop(labels='se_tag_align', axis=1, inplace=True) + design_new_df['replicate'] = design_new_df['replicate'].astype(str) design_new_df.at[1, 'sample_id'] = experiment_id + '_pr1' design_new_df.at[1, 'replicate'] = '1_pr' @@ -207,6 +221,7 @@ def main(): design_new_df.at[3, 'sample_id'] = experiment_id + '_pooled' design_new_df.at[3, 'replicate'] = 'pooled' design_new_df.at[3, 'xcor'] = 'Calculate' + design_new_df.at[3, 'tag_align'] = design_new_df.at[0, 'tag_align'] # Make 2 
self psuedoreplicates self_pseudoreplicates_dict = {} @@ -230,6 +245,12 @@ def main(): experiment_id = design_df.at[0, 'experiment_id'] pool_experiment = pool(replicate_files, experiment_id + "_pooled", paired) + # If paired change to single End + if paired: + pool_experiment_se = bedpe_to_tagalign(pool_experiment, experiment_id + "_pooled") + else: + pool_experiment_se = pool_experiment + # Make self psuedoreplicates equivalent to number of replicates pseudoreplicates_dict = {} for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']): @@ -242,38 +263,44 @@ def main(): pool_pseudoreplicates_dict = {} for index, row in pseudoreplicates_df.iterrows(): replicate_id = index + 1 - pr_filename = experiment_id + ".pr" + str(replicate_id) + '.bedse.gz' + pr_filename = experiment_id + ".pr" + str(replicate_id) + '.tagAlign.gz' pool_replicate = pool(row, pr_filename, False) pool_pseudoreplicates_dict[replicate_id] = pool_replicate design_new_df = design_df + + # Update tagAlign with single end data + if paired: + design_new_df['tag_align'] = design_new_df['se_tag_align'] + design_new_df.drop(labels='se_tag_align', axis=1, inplace=True) + # Check controls against cutoff_ratio # if so replace with pool_control # unless single control was used - if not single_control and not atac: - path_to_pool_control = cwd + '/' + pool_control - if control_df.values.max() > 1.2: - logger.info("Number of reads in controls differ by " + - " > factor of %f. Using pooled controls." % (cutoff_ratio)) - design_new_df['control_tag_align'] = path_to_pool_control - else: - for index, row in design_new_df.iterrows(): - exp_no_reads = utils.count_lines(row['tag_align']) - con_no_reads = utils.count_lines(row['control_tag_align']) - if con_no_reads < exp_no_reads: - logger.info("Fewer reads in control than experiment." + - "Using pooled controls for replicate %s." 
- % row['replicate']) - design_new_df.loc[index, 'control_tag_align'] = \ - path_to_pool_control - elif not atac: - path_to_pool_control = pool_control +# if not single_control and not atac: +# path_to_pool_control = cwd + '/' + pool_control +# if control_df.values.max() > 1.2: +# logger.info("Number of reads in controls differ by " + +# " > factor of %f. Using pooled controls." % (cutoff_ratio)) +# design_new_df['control_tag_align'] = path_to_pool_control +# else: +# for index, row in design_new_df.iterrows(): +# exp_no_reads = utils.count_lines(row['tag_align']) +# con_no_reads = utils.count_lines(row['control_tag_align']) +# if con_no_reads < exp_no_reads: +# logger.info("Fewer reads in control than experiment." + +# "Using pooled controls for replicate %s." +# % row['replicate']) +# design_new_df.loc[index, 'control_tag_align'] = \ +# path_to_pool_control +# elif not atac: +# path_to_pool_control = pool_control # Add in pseudo replicates tmp_metadata = design_new_df.loc[0].copy() - if not atac: - tmp_metadata['control_tag_align'] = path_to_pool_control +# if not atac: +# tmp_metadata['control_tag_align'] = path_to_pool_control for rep, pseudorep_file in pool_pseudoreplicates_dict.items(): tmp_metadata['sample_id'] = experiment_id + '_pr' + str(rep) tmp_metadata['replicate'] = str(rep) + '_pr' diff --git a/workflow/scripts/xcor.py b/workflow/scripts/xcor.py index 8d4a977da1228363ce04d47d35a91fa35b6d07f5..8dc4394d3af768e47fa8314549634ddb1a23bd9b 100644 --- a/workflow/scripts/xcor.py +++ b/workflow/scripts/xcor.py @@ -100,7 +100,7 @@ def xcor(tag, paired): 'gzip -dc %s > ' % (tag)], outfile=uncompressed_tag_filename) # Subsample tagAlign file - numReads = 15000000 + number_reads = 15000000 subsampled_tag_filename = \ - tag_basename + ".%d.tagAlign.gz" % (numReads/1000000) + tag_basename + ".%d.tagAlign.gz" % (number_reads/1000000)