Skip to content
Snippets Groups Projects
Commit 403d4903 authored by Holly Ruess's avatar Holly Ruess
Browse files

fix pool and pseudo

parent fefc3182
1 merge request!12Resolve "Fix Pool and Pseudoreps"
Pipeline #5477 failed with stages
in 6 hours, 58 minutes, and 15 seconds
......@@ -17,6 +17,7 @@ All notable changes to this project will be documented in this file.
- Added punctuation check in design file
- Added sequence (fastq1) length into design file for better mapping
- Raw fastq1 sequence length determines mapper
- Added paired-end peak calling
## [publish_1.0.0] - 2019-12-03
Initial release of pipeline
......
......@@ -49,7 +49,8 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
3. Map reads with BWA, filter with SamTools, and sort with Sambamba
4. Mark duplicates with Sambamba, Filter reads with SamTools, and calculate library complexity with SamTools and bedtools
5. QC metrics with deepTools
6. Convert bam files to tagAlign files; remove chrM and adds tn5 shift
6. Calculate cross-correlation using PhantomPeakQualTools
7. Call peaks with MACS2 from overlaps of pooled replicates
## Output Files
......@@ -65,6 +66,7 @@ filterReads | *.dedup.bam | filtered bam file with duplicate reads removed
filterReads | *.dedup.bam.bai | indexed filtered bam file
filterReads | *.dedup.flagstat.qc | QC metrics of filtered bam file (mapping stats, samtools)
filterReads | *.dedup.pbc.qc | QC metrics of library complexity
convertReads | *.filt.nodup.bedse.gz | bed alignment in BEDPE format
convertReads | *.tagAlign.gz | bed alignment in BEDPE or BEDSE format
experimentQC | coverage.pdf | plot to assess the sequencing depth of a given sample
experimentQC | heatmeap_SpearmanCorr.pdf | plot of Spearman correlation between samples
......
......@@ -41,10 +41,6 @@ def get_args():
default=False,
action='store_true')
# parser.add_argument('-c', '--cutoff',
# help="Cutoff ratio used for choosing controls.",
# default=1.2)
args = parser.parse_args()
return args
......@@ -57,30 +53,6 @@ def check_replicates(design):
return no_rep
#def check_controls(design):
# '''Check the number of controls for the experiment.'''
# no_controls = len(design.control_tag_align.unique())
# return no_controls
#def get_read_count_ratio(design):
# '''Get the ratio of read counts for unique controls.'''
# controls = design.control_tag_align.unique()
# control_dict = {}
# for con in controls:
# no_reads = utils.count_lines(con)
# control_dict[con] = no_reads
# control_matrix = {c: control_dict for c in controls}
# control_df = pd.DataFrame.from_dict(control_matrix)
# control_ratio = control_df.divide(list(control_dict.values()), axis=0)
# return control_ratio
def pool(tag_files, outfile, paired):
'''Pool files together.'''
......@@ -153,7 +125,7 @@ def self_psuedoreplication(tag_file, prefix, paired):
steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""])
steps.extend(['gzip -cn'])
out, err = utils.run_pipe(steps, outfile=pseudoreplicate_dict[i])
# os.remove(splits_prefix + string_index)
os.remove(splits_prefix + string_index)
return pseudoreplicate_dict
......@@ -162,7 +134,6 @@ def main():
args = get_args()
paired = args.paired
design = args.design
# cutoff_ratio = args.cutoff
# Create a file handler
handler = logging.FileHandler('experiment_generation.log')
......@@ -177,11 +148,6 @@ def main():
# Check Number of replicates and controls
no_reps = check_replicates(design_df)
# if atac:
# no_unique_controls = 0
# else:
# no_unique_controls = check_controls(design_df)
if no_reps == 1:
logger.info("No other replicate specified "
"so processing as an unreplicated experiment.")
......@@ -192,26 +158,6 @@ def main():
"so processing as a replicated experiment.")
replicated = True
# if no_unique_controls == 1 and atac:
# logger.info("ATAC-seq experiment speficifed "
# "no controls are required.")
# single_control = False
# if no_unique_controls == 1 and replicated:
# logger.info("Only a single control was specified "
# "so using same control for replicates, pool and psuedoreplicates.")
# single_control = True
# else:
# logger.info("Will merge only unique controls for pooled.")
# single_control = False
# Pool the controls for checking
# if not single_control and not atac:
# control_df = get_read_count_ratio(design_df)
# control_files = design_df.control_tag_align.unique()
# pool_control = pool(control_files, "pool_control", paired)
# elif not atac:
# pool_control = design_df.control_tag_align.unique()[0]
# Psuedoreplicate and update design accordingly
if not replicated:
......@@ -289,33 +235,9 @@ def main():
design_new_df['tag_align'] = design_new_df['seTagAlign']
design_new_df.drop(labels='seTagAlign', axis=1, inplace=True)
# Check controls against cutoff_ratio
# if so replace with pool_control
# unless single control was used
# if not single_control and not atac:
# path_to_pool_control = cwd + '/' + pool_control
# if control_df.values.max() > 1.2:
# logger.info("Number of reads in controls differ by " +
# " > factor of %f. Using pooled controls." % (cutoff_ratio))
# design_new_df['control_tag_align'] = path_to_pool_control
# else:
# for index, row in design_new_df.iterrows():
# exp_no_reads = utils.count_lines(row['tag_align'])
# con_no_reads = utils.count_lines(row['control_tag_align'])
# if con_no_reads < exp_no_reads:
# logger.info("Fewer reads in control than experiment." +
# "Using pooled controls for replicate %s."
# % row['replicate'])
# design_new_df.loc[index, 'control_tag_align'] = \
# path_to_pool_control
# elif not atac:
# path_to_pool_control = pool_control
# Add in pseudo replicates
tmp_metadata = design_new_df.loc[0].copy()
# if not atac:
# tmp_metadata['control_tag_align'] = path_to_pool_control
for rep, pseudorep_file in pool_pseudoreplicates_dict.items():
tmp_metadata['sample_id'] = experiment_id + '_pr' + str(rep)
tmp_metadata['replicate'] = str(rep) + '_pr'
......
......@@ -48,12 +48,6 @@ def test_pool_and_psuedoreplicate_singleend_human():
design_df = pd.read_csv(design_file, sep="\t")
assert design_df.shape[0] == 6
@pytest.mark.pairedend_mouse
def test_experiment_design_pairedend_mouse():
    """Placeholder for the paired-end (mouse) design-generation check.

    Intentionally empty: mirrors the single-end human test above but the
    paired-end assertions have not been written yet.
    """
@pytest.mark.pairedend_mouse
def test_pool_and_psuedoreplicate_pairedend_mouse():
design_file = os.path.join(test_output_path, 'ENCSR451NAE_ppr.tsv')
......
......@@ -32,5 +32,5 @@ def test_cross_plot_pairedend_mouse():
qc_file = os.path.join(test_output_path,"ENCLB749GLW/ENCLB749GLW.cc.qc")
df_xcor = pd.read_csv(qc_file, sep="\t", header=None)
assert df_xcor[2].iloc[0] == '0,65,75'
assert df_xcor[8].iloc[0] == 1.55347
assert round(df_xcor[8].iloc[0],6) == 1.55347
assert df_xcor[9].iloc[0] == 1.285233
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment