Skip to content
Snippets Groups Projects
Commit 403d4903 authored by Holly Ruess's avatar Holly Ruess
Browse files

fix pool and pseudo

parent fefc3182
Branches
Tags
1 merge request!12Resolve "Fix Pool and Pseudoreps"
Pipeline #5477 failed with stages
in 6 hours, 58 minutes, and 15 seconds
...@@ -17,6 +17,7 @@ All notable changes to this project will be documented in this file. ...@@ -17,6 +17,7 @@ All notable changes to this project will be documented in this file.
- Added punctuation check in design file - Added punctuation check in design file
- Added sequence (fastq1) length into design file for better mapping - Added sequence (fastq1) length into design file for better mapping
- Raw fastq1 sequence length determines mapper - Raw fastq1 sequence length determines mapper
- Added paired-end peak calling
## [publish_1.0.0 ] - 2019-12-03 ## [publish_1.0.0 ] - 2019-12-03
Initial release of pipeline Initial release of pipeline
......
...@@ -49,7 +49,8 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git ...@@ -49,7 +49,8 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
3. Map reads with BWA, filter with SamTools, and sort with Sambamba 3. Map reads with BWA, filter with SamTools, and sort with Sambamba
4. Mark duplicates with Sambamba, Filter reads with SamTools, and calculate library complexity with SamTools and bedtools 4. Mark duplicates with Sambamba, Filter reads with SamTools, and calculate library complexity with SamTools and bedtools
5. QC metrics with deep tools 5. QC metrics with deep tools
6. Convert bam files to tagAlign files; remove chrM and adds tn5 shift 6. Calculate cross-correlation using PhantomPeakQualTools
7. Call peaks with MACS2 from overlaps of pooled replicates
## Output Files ## Output Files
...@@ -65,6 +66,7 @@ filterReads | *.dedup.bam | filtered bam file with duplicate reads removed ...@@ -65,6 +66,7 @@ filterReads | *.dedup.bam | filtered bam file with duplicate reads removed
filterReads | *.dedup.bam.bai | indexed filtered bam file filterReads | *.dedup.bam.bai | indexed filtered bam file
filterReads | *.dedup.flagstat.qc | QC metrics of filtered bam file (mapping stats, samtools) filterReads | *.dedup.flagstat.qc | QC metrics of filtered bam file (mapping stats, samtools)
filterReads | *.dedup.pbc.qc | QC metrics of library complexity filterReads | *.dedup.pbc.qc | QC metrics of library complexity
convertReads | *.filt.nodup.bedse.gz | bed alignment in BEDSE format
convertReads | *.tagAlign.gz | bed alignment in BEDPE or BEDSE format convertReads | *.tagAlign.gz | bed alignment in BEDPE or BEDSE format
experimentQC | coverage.pdf | plot to assess the sequencing depth of a given sample experimentQC | coverage.pdf | plot to assess the sequencing depth of a given sample
experimentQC | heatmeap_SpearmanCorr.pdf | plot of Spearman correlation between samples experimentQC | heatmeap_SpearmanCorr.pdf | plot of Spearman correlation between samples
......
...@@ -41,10 +41,6 @@ def get_args(): ...@@ -41,10 +41,6 @@ def get_args():
default=False, default=False,
action='store_true') action='store_true')
# parser.add_argument('-c', '--cutoff',
# help="Cutoff ratio used for choosing controls.",
# default=1.2)
args = parser.parse_args() args = parser.parse_args()
return args return args
...@@ -57,30 +53,6 @@ def check_replicates(design): ...@@ -57,30 +53,6 @@ def check_replicates(design):
return no_rep return no_rep
#def check_controls(design):
# '''Check the number of controls for the experiment.'''
# no_controls = len(design.control_tag_align.unique())
# return no_controls
#def get_read_count_ratio(design):
# '''Get the ratio of read counts for unique controls.'''
# controls = design.control_tag_align.unique()
# control_dict = {}
# for con in controls:
# no_reads = utils.count_lines(con)
# control_dict[con] = no_reads
# control_matrix = {c: control_dict for c in controls}
# control_df = pd.DataFrame.from_dict(control_matrix)
# control_ratio = control_df.divide(list(control_dict.values()), axis=0)
# return control_ratio
def pool(tag_files, outfile, paired): def pool(tag_files, outfile, paired):
'''Pool files together.''' '''Pool files together.'''
...@@ -153,7 +125,7 @@ def self_psuedoreplication(tag_file, prefix, paired): ...@@ -153,7 +125,7 @@ def self_psuedoreplication(tag_file, prefix, paired):
steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""]) steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""])
steps.extend(['gzip -cn']) steps.extend(['gzip -cn'])
out, err = utils.run_pipe(steps, outfile=pseudoreplicate_dict[i]) out, err = utils.run_pipe(steps, outfile=pseudoreplicate_dict[i])
# os.remove(splits_prefix + string_index) os.remove(splits_prefix + string_index)
return pseudoreplicate_dict return pseudoreplicate_dict
...@@ -162,7 +134,6 @@ def main(): ...@@ -162,7 +134,6 @@ def main():
args = get_args() args = get_args()
paired = args.paired paired = args.paired
design = args.design design = args.design
# cutoff_ratio = args.cutoff
# Create a file handler # Create a file handler
handler = logging.FileHandler('experiment_generation.log') handler = logging.FileHandler('experiment_generation.log')
...@@ -177,11 +148,6 @@ def main(): ...@@ -177,11 +148,6 @@ def main():
# Check Number of replicates and controls # Check Number of replicates and controls
no_reps = check_replicates(design_df) no_reps = check_replicates(design_df)
# if atac:
# no_unique_controls = 0
# else:
# no_unique_controls = check_controls(design_df)
if no_reps == 1: if no_reps == 1:
logger.info("No other replicate specified " logger.info("No other replicate specified "
"so processing as an unreplicated experiment.") "so processing as an unreplicated experiment.")
...@@ -192,26 +158,6 @@ def main(): ...@@ -192,26 +158,6 @@ def main():
"so processing as a replicated experiment.") "so processing as a replicated experiment.")
replicated = True replicated = True
# if no_unique_controls == 1 and atac:
# logger.info("ATAC-seq experiment speficifed "
# "no controls are required.")
# single_control = False
# if no_unique_controls == 1 and replicated:
# logger.info("Only a single control was specified "
# "so using same control for replicates, pool and psuedoreplicates.")
# single_control = True
# else:
# logger.info("Will merge only unique controls for pooled.")
# single_control = False
# Pool the controls for checking
# if not single_control and not atac:
# control_df = get_read_count_ratio(design_df)
# control_files = design_df.control_tag_align.unique()
# pool_control = pool(control_files, "pool_control", paired)
# elif not atac:
# pool_control = design_df.control_tag_align.unique()[0]
# Psuedoreplicate and update design accordingly # Psuedoreplicate and update design accordingly
if not replicated: if not replicated:
...@@ -289,33 +235,9 @@ def main(): ...@@ -289,33 +235,9 @@ def main():
design_new_df['tag_align'] = design_new_df['seTagAlign'] design_new_df['tag_align'] = design_new_df['seTagAlign']
design_new_df.drop(labels='seTagAlign', axis=1, inplace=True) design_new_df.drop(labels='seTagAlign', axis=1, inplace=True)
# Check controls against cutoff_ratio
# if so replace with pool_control
# unless single control was used
# if not single_control and not atac:
# path_to_pool_control = cwd + '/' + pool_control
# if control_df.values.max() > 1.2:
# logger.info("Number of reads in controls differ by " +
# " > factor of %f. Using pooled controls." % (cutoff_ratio))
# design_new_df['control_tag_align'] = path_to_pool_control
# else:
# for index, row in design_new_df.iterrows():
# exp_no_reads = utils.count_lines(row['tag_align'])
# con_no_reads = utils.count_lines(row['control_tag_align'])
# if con_no_reads < exp_no_reads:
# logger.info("Fewer reads in control than experiment." +
# "Using pooled controls for replicate %s."
# % row['replicate'])
# design_new_df.loc[index, 'control_tag_align'] = \
# path_to_pool_control
# elif not atac:
# path_to_pool_control = pool_control
# Add in pseudo replicates # Add in pseudo replicates
tmp_metadata = design_new_df.loc[0].copy() tmp_metadata = design_new_df.loc[0].copy()
# if not atac:
# tmp_metadata['control_tag_align'] = path_to_pool_control
for rep, pseudorep_file in pool_pseudoreplicates_dict.items(): for rep, pseudorep_file in pool_pseudoreplicates_dict.items():
tmp_metadata['sample_id'] = experiment_id + '_pr' + str(rep) tmp_metadata['sample_id'] = experiment_id + '_pr' + str(rep)
tmp_metadata['replicate'] = str(rep) + '_pr' tmp_metadata['replicate'] = str(rep) + '_pr'
......
...@@ -48,12 +48,6 @@ def test_pool_and_psuedoreplicate_singleend_human(): ...@@ -48,12 +48,6 @@ def test_pool_and_psuedoreplicate_singleend_human():
design_df = pd.read_csv(design_file, sep="\t") design_df = pd.read_csv(design_file, sep="\t")
assert design_df.shape[0] == 6 assert design_df.shape[0] == 6
@pytest.mark.pairedend_mouse
def test_experiment_design_pairedend_mouse():
# Do the same thing for paired end data
pass
@pytest.mark.pairedend_mouse @pytest.mark.pairedend_mouse
def test_pool_and_psuedoreplicate_pairedend_mouse(): def test_pool_and_psuedoreplicate_pairedend_mouse():
design_file = os.path.join(test_output_path, 'ENCSR451NAE_ppr.tsv') design_file = os.path.join(test_output_path, 'ENCSR451NAE_ppr.tsv')
......
...@@ -32,5 +32,5 @@ def test_cross_plot_pairedend_mouse(): ...@@ -32,5 +32,5 @@ def test_cross_plot_pairedend_mouse():
qc_file = os.path.join(test_output_path,"ENCLB749GLW/ENCLB749GLW.cc.qc") qc_file = os.path.join(test_output_path,"ENCLB749GLW/ENCLB749GLW.cc.qc")
df_xcor = pd.read_csv(qc_file, sep="\t", header=None) df_xcor = pd.read_csv(qc_file, sep="\t", header=None)
assert df_xcor[2].iloc[0] == '0,65,75' assert df_xcor[2].iloc[0] == '0,65,75'
assert df_xcor[8].iloc[0] == 1.55347 assert round(df_xcor[8].iloc[0],6) == 1.55347
assert df_xcor[9].iloc[0] == 1.285233 assert df_xcor[9].iloc[0] == 1.285233
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment