Skip to content
Snippets Groups Projects
Commit fd1e0176 authored by Holly Ruess's avatar Holly Ruess
Browse files

fix pool and pseudo

parent 4c9bb1e9
Branches
1 merge request!12Resolve "Fix Pool and Pseudoreps"
...@@ -289,7 +289,7 @@ process convertReads { ...@@ -289,7 +289,7 @@ process convertReads {
set sampleId, deduped, bai, experimentId, replicate from convertReads set sampleId, deduped, bai, experimentId, replicate from convertReads
output: output:
set sampleId, file('*.tagAlign.gz'), experimentId, replicate into tagReads set sampleId, file('*.tagAlign.gz'), file('*.bed{pe,se}.gz'), experimentId, replicate into tagReads
file('version_*.txt') into convertReadsVersions file('version_*.txt') into convertReadsVersions
script: script:
...@@ -356,10 +356,10 @@ process crossReads { ...@@ -356,10 +356,10 @@ process crossReads {
publishDir "${outDir}/${task.process}/${sampleId}", mode: 'copy' publishDir "${outDir}/${task.process}/${sampleId}", mode: 'copy'
input: input:
set sampleId, tagAlign, experimentId, replicate from tagReads set sampleId, seTagAlign, tagAlign, experimentId, replicate from tagReads
output: output:
set sampleId, tagAlign, file('*.cc.qc'), experimentId, replicate into xcorReads set sampleId, seTagAlign, tagAlign, file('*.cc.qc'), experimentId, replicate into xcorReads
set file('*.cc.qc'), file('*.cc.plot.pdf') into xcorReadsStats set file('*.cc.qc'), file('*.cc.plot.pdf') into xcorReadsStats
script: script:
...@@ -396,8 +396,8 @@ process crossReads { ...@@ -396,8 +396,8 @@ process crossReads {
// Define channel collecting tagAlign and xcor into design file // Define channel collecting tagAlign and xcor into design file
xcorDesign = xcorReads xcorDesign = xcorReads
.map{ sampleId, tagAlign, xcor, experimentId, replicate -> .map{ sampleId, seTagAlign, tagAlign, xcor, experimentId, replicate ->
"${sampleId}\t${tagAlign}\t${xcor}\t${experimentId}\t${replicate}\n" } "${sampleId}\t${seTagAlign}\t${tagAlign}\t${xcor}\t${experimentId}\t${replicate}\n" }
.collectFile(name:'design_xcor.tsv', seed:"sample_id\ttag_align\txcor\texperiment_id\treplicate\n", storeDir:"${outDir}/design") .collectFile(name:'design_xcor.tsv', seed:"sample_id\ttag_align\txcor\texperiment_id\treplicate\n", storeDir:"${outDir}/design")
// Make Experiment design files to be read in for downstream analysis // Make Experiment design files to be read in for downstream analysis
......
...@@ -98,6 +98,8 @@ def convert_mapped(bam, tag_filename): ...@@ -98,6 +98,8 @@ def convert_mapped(bam, tag_filename):
def convert_mapped_pe(bam, bam_basename, tag_filename): def convert_mapped_pe(bam, bam_basename, tag_filename):
'''Use bedtools to convert to tagAlign PE data.''' '''Use bedtools to convert to tagAlign PE data.'''
bedpe_filename = bam_basename + ".bedpe.gz"
# Name sort bam to make BEDPE # Name sort bam to make BEDPE
nmsrt_bam_filename = bam_basename + ".nmsrt.bam" nmsrt_bam_filename = bam_basename + ".nmsrt.bam"
samtools_sort_command = \ samtools_sort_command = \
...@@ -114,7 +116,7 @@ def convert_mapped_pe(bam, bam_basename, tag_filename): ...@@ -114,7 +116,7 @@ def convert_mapped_pe(bam, bam_basename, tag_filename):
"grep -v 'chrM'", "grep -v 'chrM'",
r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'""", r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'""",
r"""awk 'BEGIN {OFS = "\t"}{ if ($6 == "+") {$2 = $2 + 4} else if ($6 == "-") {$3 = $3 - 5} print $0}'""", r"""awk 'BEGIN {OFS = "\t"}{ if ($6 == "+") {$2 = $2 + 4} else if ($6 == "-") {$3 = $3 - 5} print $0}'""",
"gzip -nc"], outfile=tag_filename) "gzip -nc"], outfile=bedpe_filename)
os.remove(nmsrt_bam_filename) os.remove(nmsrt_bam_filename)
...@@ -138,9 +140,12 @@ def main(): ...@@ -138,9 +140,12 @@ def main():
tag_filename = bam_basename + '.tagAlign.gz' tag_filename = bam_basename + '.tagAlign.gz'
if paired: # paired-end data if paired: # paired-end data
convert_mapped(bam, tag_filename)
convert_mapped_pe(bam, bam_basename, tag_filename) convert_mapped_pe(bam, bam_basename, tag_filename)
else: else:
convert_mapped(bam, tag_filename) convert_mapped(bam, tag_filename)
bedse_filename = bam_basename + ".bedse.gz"
shutil.copy(tag_filename, bedse_filename)
if __name__ == '__main__': if __name__ == '__main__':
main() main()
...@@ -7,8 +7,10 @@ import logging ...@@ -7,8 +7,10 @@ import logging
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import os import os
import sys
import utils import utils
import subprocess
import shutil
import shlex
EPILOG = ''' EPILOG = '''
For more details: For more details:
...@@ -39,9 +41,9 @@ def get_args(): ...@@ -39,9 +41,9 @@ def get_args():
default=False, default=False,
action='store_true') action='store_true')
parser.add_argument('-c', '--cutoff', # parser.add_argument('-c', '--cutoff',
help="Cutoff ratio used for choosing controls.", # help="Cutoff ratio used for choosing controls.",
default=1.2) # default=1.2)
args = parser.parse_args() args = parser.parse_args()
return args return args
...@@ -55,28 +57,28 @@ def check_replicates(design): ...@@ -55,28 +57,28 @@ def check_replicates(design):
return no_rep return no_rep
def check_controls(design): #def check_controls(design):
'''Check the number of controls for the experiment.''' # '''Check the number of controls for the experiment.'''
no_controls = len(design.control_tag_align.unique()) # no_controls = len(design.control_tag_align.unique())
return no_controls # return no_controls
def get_read_count_ratio(design): #def get_read_count_ratio(design):
'''Get the ratio of read counts for unique controls.''' # '''Get the ratio of read counts for unique controls.'''
controls = design.control_tag_align.unique() # controls = design.control_tag_align.unique()
control_dict = {} # control_dict = {}
for con in controls: # for con in controls:
no_reads = utils.count_lines(con) # no_reads = utils.count_lines(con)
control_dict[con] = no_reads # control_dict[con] = no_reads
control_matrix = {c: control_dict for c in controls} # control_matrix = {c: control_dict for c in controls}
control_df = pd.DataFrame.from_dict(control_matrix) # control_df = pd.DataFrame.from_dict(control_matrix)
control_ratio = control_df.divide(list(control_dict.values()), axis=0) # control_ratio = control_df.divide(list(control_dict.values()), axis=0)
return control_ratio # return control_ratio
def pool(tag_files, outfile, paired): def pool(tag_files, outfile, paired):
...@@ -107,20 +109,26 @@ def self_psuedoreplication(tag_file, prefix, paired): ...@@ -107,20 +109,26 @@ def self_psuedoreplication(tag_file, prefix, paired):
lines_per_rep = (no_lines+1)/2 lines_per_rep = (no_lines+1)/2
# Make an array of number of psuedoreplicatesfile names # Make an array of number of psuedoreplicatesfile names
pseudoreplicate_dict = {r: prefix + '.pr' + str(r) + '.bedse.tagAlign.gz' pseudoreplicate_dict = {r: prefix + '.pr' + str(r) + '.tagAlign.gz'
for r in [0, 1]} for r in [0, 1]}
# Shuffle and split file into equal parts # Shuffle and split file into equal parts
# by using the input to seed shuf we ensure multiple runs with the same # by using the input to seed shuf we ensure multiple runs with the same
# input will produce the same output # input will produce the same output
# Produces two files named splits_prefix0n, n=1,2 # Produces two files named splits_prefix0n, n=1,2
# Coded for reproducibility
splits_prefix = 'temp_split' splits_prefix = 'temp_split'
out, err = utils.run_pipe([ psuedo_command = 'bash -c "zcat {} | shuf --random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f {} | wc -c) -nosalt </dev/zero 2>/dev/null) | '
'gzip -dc %s' % (tag_file), psuedo_command += 'split -d -l {} - {}."'
'shuf --random-source=%s' % (tag_file), psuedo_command = psuedo_command.format(
'split -d -l %d - %s' % (lines_per_rep, splits_prefix)]) tag_file,
tag_file,
int(lines_per_rep),
splits_prefix)
logger.info("Running psuedo with %s", psuedo_command)
subprocess.check_call(shlex.split(psuedo_command))
# Convert read pairs to reads into standard tagAlign file # Convert read pairs to reads into standard tagAlign file
...@@ -131,7 +139,7 @@ def self_psuedoreplication(tag_file, prefix, paired): ...@@ -131,7 +139,7 @@ def self_psuedoreplication(tag_file, prefix, paired):
steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""]) steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""])
steps.extend(['gzip -cn']) steps.extend(['gzip -cn'])
out, err = utils.run_pipe(steps, outfile=pseudoreplicate_dict[i]) out, err = utils.run_pipe(steps, outfile=pseudoreplicate_dict[i])
os.remove(splits_prefix + string_index) # os.remove(splits_prefix + string_index)
return pseudoreplicate_dict return pseudoreplicate_dict
...@@ -140,7 +148,7 @@ def main(): ...@@ -140,7 +148,7 @@ def main():
args = get_args() args = get_args()
paired = args.paired paired = args.paired
design = args.design design = args.design
cutoff_ratio = args.cutoff # cutoff_ratio = args.cutoff
# Create a file handler # Create a file handler
handler = logging.FileHandler('experiment_generation.log') handler = logging.FileHandler('experiment_generation.log')
...@@ -155,10 +163,10 @@ def main(): ...@@ -155,10 +163,10 @@ def main():
# Check Number of replicates and controls # Check Number of replicates and controls
no_reps = check_replicates(design_df) no_reps = check_replicates(design_df)
if atac: # if atac:
no_unique_controls = 0 # no_unique_controls = 0
else: # else:
no_unique_controls = check_controls(design_df) # no_unique_controls = check_controls(design_df)
if no_reps == 1: if no_reps == 1:
logger.info("No other replicate specified " logger.info("No other replicate specified "
...@@ -170,33 +178,39 @@ def main(): ...@@ -170,33 +178,39 @@ def main():
"so processing as a replicated experiment.") "so processing as a replicated experiment.")
replicated = True replicated = True
if no_unique_controls == 1 and atac: # if no_unique_controls == 1 and atac:
logger.info("ATAC-seq experiment speficifed " # logger.info("ATAC-seq experiment speficifed "
"no controls are required.") # "no controls are required.")
single_control = False # single_control = False
if no_unique_controls == 1 and replicated: # if no_unique_controls == 1 and replicated:
logger.info("Only a single control was specified " # logger.info("Only a single control was specified "
"so using same control for replicates, pool and psuedoreplicates.") # "so using same control for replicates, pool and psuedoreplicates.")
single_control = True # single_control = True
else: # else:
logger.info("Will merge only unique controls for pooled.") # logger.info("Will merge only unique controls for pooled.")
single_control = False # single_control = False
# Pool the controls for checking # Pool the controls for checking
if not single_control and not atac: # if not single_control and not atac:
control_df = get_read_count_ratio(design_df) # control_df = get_read_count_ratio(design_df)
control_files = design_df.control_tag_align.unique() # control_files = design_df.control_tag_align.unique()
pool_control = pool(control_files, "pool_control", paired) # pool_control = pool(control_files, "pool_control", paired)
elif not atac: # elif not atac:
pool_control = design_df.control_tag_align.unique()[0] # pool_control = design_df.control_tag_align.unique()[0]
# Psuedoreplicate and update design accordingly # Psuedoreplicate and update design accordingly
if not replicated: if not replicated:
# Duplicate rows and update for pool and psuedoreplicates # Duplicate rows and update for pool and psuedoreplicates and update tagAlign with single end data
experiment_id = design_df.at[0, 'experiment_id'] experiment_id = design_df.at[0, 'experiment_id']
replicate = design_df.at[0, 'replicate'] replicate = design_df.at[0, 'replicate']
design_new_df = design_df.loc[np.repeat(design_df.index, 4)].reset_index() design_new_df = design_df.loc[np.repeat(design_df.index, 4)].reset_index()
# Update tagAlign with single end data
if paired:
design_new_df['tag_align'] = design_new_df['se_tag_align']
design_new_df.drop(labels='se_tag_align', axis=1, inplace=True)
design_new_df['replicate'] = design_new_df['replicate'].astype(str) design_new_df['replicate'] = design_new_df['replicate'].astype(str)
design_new_df.at[1, 'sample_id'] = experiment_id + '_pr1' design_new_df.at[1, 'sample_id'] = experiment_id + '_pr1'
design_new_df.at[1, 'replicate'] = '1_pr' design_new_df.at[1, 'replicate'] = '1_pr'
...@@ -207,6 +221,7 @@ def main(): ...@@ -207,6 +221,7 @@ def main():
design_new_df.at[3, 'sample_id'] = experiment_id + '_pooled' design_new_df.at[3, 'sample_id'] = experiment_id + '_pooled'
design_new_df.at[3, 'replicate'] = 'pooled' design_new_df.at[3, 'replicate'] = 'pooled'
design_new_df.at[3, 'xcor'] = 'Calculate' design_new_df.at[3, 'xcor'] = 'Calculate'
design_new_df.at[3, 'tag_align'] = design_new_df.at[0, 'tag_align']
# Make 2 self psuedoreplicates # Make 2 self psuedoreplicates
self_pseudoreplicates_dict = {} self_pseudoreplicates_dict = {}
...@@ -230,6 +245,12 @@ def main(): ...@@ -230,6 +245,12 @@ def main():
experiment_id = design_df.at[0, 'experiment_id'] experiment_id = design_df.at[0, 'experiment_id']
pool_experiment = pool(replicate_files, experiment_id + "_pooled", paired) pool_experiment = pool(replicate_files, experiment_id + "_pooled", paired)
# If paired change to single End
if paired:
pool_experiment_se = bedpe_to_tagalign(pool_experiment, experiment_id + "_pooled")
else:
pool_experiment_se = pool_experiment
# Make self psuedoreplicates equivalent to number of replicates # Make self psuedoreplicates equivalent to number of replicates
pseudoreplicates_dict = {} pseudoreplicates_dict = {}
for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']): for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']):
...@@ -242,38 +263,44 @@ def main(): ...@@ -242,38 +263,44 @@ def main():
pool_pseudoreplicates_dict = {} pool_pseudoreplicates_dict = {}
for index, row in pseudoreplicates_df.iterrows(): for index, row in pseudoreplicates_df.iterrows():
replicate_id = index + 1 replicate_id = index + 1
pr_filename = experiment_id + ".pr" + str(replicate_id) + '.bedse.gz' pr_filename = experiment_id + ".pr" + str(replicate_id) + '.tagAlign.gz'
pool_replicate = pool(row, pr_filename, False) pool_replicate = pool(row, pr_filename, False)
pool_pseudoreplicates_dict[replicate_id] = pool_replicate pool_pseudoreplicates_dict[replicate_id] = pool_replicate
design_new_df = design_df design_new_df = design_df
# Update tagAlign with single end data
if paired:
design_new_df['tag_align'] = design_new_df['se_tag_align']
design_new_df.drop(labels='se_tag_align', axis=1, inplace=True)
# Check controls against cutoff_ratio # Check controls against cutoff_ratio
# if so replace with pool_control # if so replace with pool_control
# unless single control was used # unless single control was used
if not single_control and not atac: # if not single_control and not atac:
path_to_pool_control = cwd + '/' + pool_control # path_to_pool_control = cwd + '/' + pool_control
if control_df.values.max() > 1.2: # if control_df.values.max() > 1.2:
logger.info("Number of reads in controls differ by " + # logger.info("Number of reads in controls differ by " +
" > factor of %f. Using pooled controls." % (cutoff_ratio)) # " > factor of %f. Using pooled controls." % (cutoff_ratio))
design_new_df['control_tag_align'] = path_to_pool_control # design_new_df['control_tag_align'] = path_to_pool_control
else: # else:
for index, row in design_new_df.iterrows(): # for index, row in design_new_df.iterrows():
exp_no_reads = utils.count_lines(row['tag_align']) # exp_no_reads = utils.count_lines(row['tag_align'])
con_no_reads = utils.count_lines(row['control_tag_align']) # con_no_reads = utils.count_lines(row['control_tag_align'])
if con_no_reads < exp_no_reads: # if con_no_reads < exp_no_reads:
logger.info("Fewer reads in control than experiment." + # logger.info("Fewer reads in control than experiment." +
"Using pooled controls for replicate %s." # "Using pooled controls for replicate %s."
% row['replicate']) # % row['replicate'])
design_new_df.loc[index, 'control_tag_align'] = \ # design_new_df.loc[index, 'control_tag_align'] = \
path_to_pool_control # path_to_pool_control
elif not atac: # elif not atac:
path_to_pool_control = pool_control # path_to_pool_control = pool_control
# Add in pseudo replicates # Add in pseudo replicates
tmp_metadata = design_new_df.loc[0].copy() tmp_metadata = design_new_df.loc[0].copy()
if not atac: # if not atac:
tmp_metadata['control_tag_align'] = path_to_pool_control # tmp_metadata['control_tag_align'] = path_to_pool_control
for rep, pseudorep_file in pool_pseudoreplicates_dict.items(): for rep, pseudorep_file in pool_pseudoreplicates_dict.items():
tmp_metadata['sample_id'] = experiment_id + '_pr' + str(rep) tmp_metadata['sample_id'] = experiment_id + '_pr' + str(rep)
tmp_metadata['replicate'] = str(rep) + '_pr' tmp_metadata['replicate'] = str(rep) + '_pr'
......
...@@ -100,7 +100,7 @@ def xcor(tag, paired): ...@@ -100,7 +100,7 @@ def xcor(tag, paired):
'gzip -dc %s > ' % (tag)], outfile=uncompressed_tag_filename) 'gzip -dc %s > ' % (tag)], outfile=uncompressed_tag_filename)
# Subsample tagAlign file # Subsample tagAlign file
numReads = 15000000 number_reads = 15000000
subsampled_tag_filename = \ subsampled_tag_filename = \
tag_basename + ".%d.tagAlign.gz" % (numReads/1000000) tag_basename + ".%d.tagAlign.gz" % (numReads/1000000)
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment