Skip to content
Snippets Groups Projects
Commit fd1e0176 authored by Holly Ruess's avatar Holly Ruess
Browse files

fix pool and pseudo

parent 4c9bb1e9
Branches
Tags
1 merge request!12Resolve "Fix Pool and Pseudoreps"
......@@ -289,7 +289,7 @@ process convertReads {
set sampleId, deduped, bai, experimentId, replicate from convertReads
output:
set sampleId, file('*.tagAlign.gz'), experimentId, replicate into tagReads
set sampleId, file('*.tagAlign.gz'), file('*.bed{pe,se}.gz'), experimentId, replicate into tagReads
file('version_*.txt') into convertReadsVersions
script:
......@@ -356,10 +356,10 @@ process crossReads {
publishDir "${outDir}/${task.process}/${sampleId}", mode: 'copy'
input:
set sampleId, tagAlign, experimentId, replicate from tagReads
set sampleId, seTagAlign, tagAlign, experimentId, replicate from tagReads
output:
set sampleId, tagAlign, file('*.cc.qc'), experimentId, replicate into xcorReads
set sampleId, seTagAlign, tagAlign, file('*.cc.qc'), experimentId, replicate into xcorReads
set file('*.cc.qc'), file('*.cc.plot.pdf') into xcorReadsStats
script:
......@@ -396,8 +396,8 @@ process crossReads {
// Define channel collecting tagAlign and xcor into design file
xcorDesign = xcorReads
.map{ sampleId, tagAlign, xcor, experimentId, replicate ->
"${sampleId}\t${tagAlign}\t${xcor}\t${experimentId}\t${replicate}\n" }
.map{ sampleId, seTagAlign, tagAlign, xcor, experimentId, replicate ->
"${sampleId}\t${seTagAlign}\t${tagAlign}\t${xcor}\t${experimentId}\t${replicate}\n" }
.collectFile(name:'design_xcor.tsv', seed:"sample_id\ttag_align\txcor\texperiment_id\treplicate\n", storeDir:"${outDir}/design")
// Make Experiment design files to be read in for downstream analysis
......
......@@ -98,6 +98,8 @@ def convert_mapped(bam, tag_filename):
def convert_mapped_pe(bam, bam_basename, tag_filename):
'''Use bedtools to convert to tagAlign PE data.'''
bedpe_filename = bam_basename + ".bedpe.gz"
# Name sort bam to make BEDPE
nmsrt_bam_filename = bam_basename + ".nmsrt.bam"
samtools_sort_command = \
......@@ -114,7 +116,7 @@ def convert_mapped_pe(bam, bam_basename, tag_filename):
"grep -v 'chrM'",
r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'""",
r"""awk 'BEGIN {OFS = "\t"}{ if ($6 == "+") {$2 = $2 + 4} else if ($6 == "-") {$3 = $3 - 5} print $0}'""",
"gzip -nc"], outfile=tag_filename)
"gzip -nc"], outfile=bedpe_filename)
os.remove(nmsrt_bam_filename)
......@@ -138,9 +140,12 @@ def main():
tag_filename = bam_basename + '.tagAlign.gz'
if paired: # paired-end data
convert_mapped(bam, tag_filename)
convert_mapped_pe(bam, bam_basename, tag_filename)
else:
convert_mapped(bam, tag_filename)
bedse_filename = bam_basename + ".bedse.gz"
shutil.copy(tag_filename, bedse_filename)
if __name__ == '__main__':
main()
......@@ -7,8 +7,10 @@ import logging
import pandas as pd
import numpy as np
import os
import sys
import utils
import subprocess
import shutil
import shlex
EPILOG = '''
For more details:
......@@ -39,9 +41,9 @@ def get_args():
default=False,
action='store_true')
parser.add_argument('-c', '--cutoff',
help="Cutoff ratio used for choosing controls.",
default=1.2)
# parser.add_argument('-c', '--cutoff',
# help="Cutoff ratio used for choosing controls.",
# default=1.2)
args = parser.parse_args()
return args
......@@ -55,28 +57,28 @@ def check_replicates(design):
return no_rep
def check_controls(design):
'''Check the number of controls for the experiment.'''
#def check_controls(design):
# '''Check the number of controls for the experiment.'''
no_controls = len(design.control_tag_align.unique())
# no_controls = len(design.control_tag_align.unique())
return no_controls
# return no_controls
def get_read_count_ratio(design):
'''Get the ratio of read counts for unique controls.'''
#def get_read_count_ratio(design):
# '''Get the ratio of read counts for unique controls.'''
controls = design.control_tag_align.unique()
control_dict = {}
for con in controls:
no_reads = utils.count_lines(con)
control_dict[con] = no_reads
# controls = design.control_tag_align.unique()
# control_dict = {}
# for con in controls:
# no_reads = utils.count_lines(con)
# control_dict[con] = no_reads
control_matrix = {c: control_dict for c in controls}
control_df = pd.DataFrame.from_dict(control_matrix)
# control_matrix = {c: control_dict for c in controls}
# control_df = pd.DataFrame.from_dict(control_matrix)
control_ratio = control_df.divide(list(control_dict.values()), axis=0)
return control_ratio
# control_ratio = control_df.divide(list(control_dict.values()), axis=0)
# return control_ratio
def pool(tag_files, outfile, paired):
......@@ -107,20 +109,26 @@ def self_psuedoreplication(tag_file, prefix, paired):
lines_per_rep = (no_lines+1)/2
# Make an array of number of psuedoreplicatesfile names
pseudoreplicate_dict = {r: prefix + '.pr' + str(r) + '.bedse.tagAlign.gz'
pseudoreplicate_dict = {r: prefix + '.pr' + str(r) + '.tagAlign.gz'
for r in [0, 1]}
# Shuffle and split file into equal parts
# by using the input to seed shuf we ensure multiple runs with the same
# input will produce the same output
# Produces two files named splits_prefix0n, n=1,2
# Coded for reproducibility
splits_prefix = 'temp_split'
out, err = utils.run_pipe([
'gzip -dc %s' % (tag_file),
'shuf --random-source=%s' % (tag_file),
'split -d -l %d - %s' % (lines_per_rep, splits_prefix)])
psuedo_command = 'bash -c "zcat {} | shuf --random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f {} | wc -c) -nosalt </dev/zero 2>/dev/null) | '
psuedo_command += 'split -d -l {} - {}."'
psuedo_command = psuedo_command.format(
tag_file,
tag_file,
int(lines_per_rep),
splits_prefix)
logger.info("Running psuedo with %s", psuedo_command)
subprocess.check_call(shlex.split(psuedo_command))
# Convert read pairs to reads into standard tagAlign file
......@@ -131,7 +139,7 @@ def self_psuedoreplication(tag_file, prefix, paired):
steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""])
steps.extend(['gzip -cn'])
out, err = utils.run_pipe(steps, outfile=pseudoreplicate_dict[i])
os.remove(splits_prefix + string_index)
# os.remove(splits_prefix + string_index)
return pseudoreplicate_dict
......@@ -140,7 +148,7 @@ def main():
args = get_args()
paired = args.paired
design = args.design
cutoff_ratio = args.cutoff
# cutoff_ratio = args.cutoff
# Create a file handler
handler = logging.FileHandler('experiment_generation.log')
......@@ -155,10 +163,10 @@ def main():
# Check Number of replicates and controls
no_reps = check_replicates(design_df)
if atac:
no_unique_controls = 0
else:
no_unique_controls = check_controls(design_df)
# if atac:
# no_unique_controls = 0
# else:
# no_unique_controls = check_controls(design_df)
if no_reps == 1:
logger.info("No other replicate specified "
......@@ -170,33 +178,39 @@ def main():
"so processing as a replicated experiment.")
replicated = True
if no_unique_controls == 1 and atac:
logger.info("ATAC-seq experiment speficifed "
"no controls are required.")
single_control = False
if no_unique_controls == 1 and replicated:
logger.info("Only a single control was specified "
"so using same control for replicates, pool and psuedoreplicates.")
single_control = True
else:
logger.info("Will merge only unique controls for pooled.")
single_control = False
# if no_unique_controls == 1 and atac:
# logger.info("ATAC-seq experiment speficifed "
# "no controls are required.")
# single_control = False
# if no_unique_controls == 1 and replicated:
# logger.info("Only a single control was specified "
# "so using same control for replicates, pool and psuedoreplicates.")
# single_control = True
# else:
# logger.info("Will merge only unique controls for pooled.")
# single_control = False
# Pool the controls for checking
if not single_control and not atac:
control_df = get_read_count_ratio(design_df)
control_files = design_df.control_tag_align.unique()
pool_control = pool(control_files, "pool_control", paired)
elif not atac:
pool_control = design_df.control_tag_align.unique()[0]
# if not single_control and not atac:
# control_df = get_read_count_ratio(design_df)
# control_files = design_df.control_tag_align.unique()
# pool_control = pool(control_files, "pool_control", paired)
# elif not atac:
# pool_control = design_df.control_tag_align.unique()[0]
# Psuedoreplicate and update design accordingly
if not replicated:
# Duplicate rows and update for pool and psuedoreplicates
# Duplicate rows and update for pool and psuedoreplicates and update tagAlign with single end data
experiment_id = design_df.at[0, 'experiment_id']
replicate = design_df.at[0, 'replicate']
design_new_df = design_df.loc[np.repeat(design_df.index, 4)].reset_index()
# Update tagAlign with single end data
if paired:
design_new_df['tag_align'] = design_new_df['se_tag_align']
design_new_df.drop(labels='se_tag_align', axis=1, inplace=True)
design_new_df['replicate'] = design_new_df['replicate'].astype(str)
design_new_df.at[1, 'sample_id'] = experiment_id + '_pr1'
design_new_df.at[1, 'replicate'] = '1_pr'
......@@ -207,6 +221,7 @@ def main():
design_new_df.at[3, 'sample_id'] = experiment_id + '_pooled'
design_new_df.at[3, 'replicate'] = 'pooled'
design_new_df.at[3, 'xcor'] = 'Calculate'
design_new_df.at[3, 'tag_align'] = design_new_df.at[0, 'tag_align']
# Make 2 self psuedoreplicates
self_pseudoreplicates_dict = {}
......@@ -230,6 +245,12 @@ def main():
experiment_id = design_df.at[0, 'experiment_id']
pool_experiment = pool(replicate_files, experiment_id + "_pooled", paired)
# If paired change to single End
if paired:
pool_experiment_se = bedpe_to_tagalign(pool_experiment, experiment_id + "_pooled")
else:
pool_experiment_se = pool_experiment
# Make self psuedoreplicates equivalent to number of replicates
pseudoreplicates_dict = {}
for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']):
......@@ -242,38 +263,44 @@ def main():
pool_pseudoreplicates_dict = {}
for index, row in pseudoreplicates_df.iterrows():
replicate_id = index + 1
pr_filename = experiment_id + ".pr" + str(replicate_id) + '.bedse.gz'
pr_filename = experiment_id + ".pr" + str(replicate_id) + '.tagAlign.gz'
pool_replicate = pool(row, pr_filename, False)
pool_pseudoreplicates_dict[replicate_id] = pool_replicate
design_new_df = design_df
# Update tagAlign with single end data
if paired:
design_new_df['tag_align'] = design_new_df['se_tag_align']
design_new_df.drop(labels='se_tag_align', axis=1, inplace=True)
# Check controls against cutoff_ratio
# if so replace with pool_control
# unless single control was used
if not single_control and not atac:
path_to_pool_control = cwd + '/' + pool_control
if control_df.values.max() > 1.2:
logger.info("Number of reads in controls differ by " +
" > factor of %f. Using pooled controls." % (cutoff_ratio))
design_new_df['control_tag_align'] = path_to_pool_control
else:
for index, row in design_new_df.iterrows():
exp_no_reads = utils.count_lines(row['tag_align'])
con_no_reads = utils.count_lines(row['control_tag_align'])
if con_no_reads < exp_no_reads:
logger.info("Fewer reads in control than experiment." +
"Using pooled controls for replicate %s."
% row['replicate'])
design_new_df.loc[index, 'control_tag_align'] = \
path_to_pool_control
elif not atac:
path_to_pool_control = pool_control
# if not single_control and not atac:
# path_to_pool_control = cwd + '/' + pool_control
# if control_df.values.max() > 1.2:
# logger.info("Number of reads in controls differ by " +
# " > factor of %f. Using pooled controls." % (cutoff_ratio))
# design_new_df['control_tag_align'] = path_to_pool_control
# else:
# for index, row in design_new_df.iterrows():
# exp_no_reads = utils.count_lines(row['tag_align'])
# con_no_reads = utils.count_lines(row['control_tag_align'])
# if con_no_reads < exp_no_reads:
# logger.info("Fewer reads in control than experiment." +
# "Using pooled controls for replicate %s."
# % row['replicate'])
# design_new_df.loc[index, 'control_tag_align'] = \
# path_to_pool_control
# elif not atac:
# path_to_pool_control = pool_control
# Add in pseudo replicates
tmp_metadata = design_new_df.loc[0].copy()
if not atac:
tmp_metadata['control_tag_align'] = path_to_pool_control
# if not atac:
# tmp_metadata['control_tag_align'] = path_to_pool_control
for rep, pseudorep_file in pool_pseudoreplicates_dict.items():
tmp_metadata['sample_id'] = experiment_id + '_pr' + str(rep)
tmp_metadata['replicate'] = str(rep) + '_pr'
......
......@@ -100,7 +100,7 @@ def xcor(tag, paired):
'gzip -dc %s > ' % (tag)], outfile=uncompressed_tag_filename)
# Subsample tagAlign file
numReads = 15000000
number_reads = 15000000
subsampled_tag_filename = \
tag_basename + ".%d.tagAlign.gz" % (numReads/1000000)
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment