Skip to content
Snippets Groups Projects
Commit 403d4903 authored by Holly Ruess's avatar Holly Ruess
Browse files

fix pool and pseudo

parent fefc3182
1 merge request!12Resolve "Fix Pool and Pseudoreps"
Pipeline #5477 failed with stages
in 6 hours, 58 minutes, and 15 seconds
......@@ -17,6 +17,7 @@ All notable changes to this project will be documented in this file.
- Added punctuation check in design file
- Added sequence (fastq1) length into design file for better mapping
- Raw fastq1 sequence length determines mapper
- Added paired-end peak calling
## [publish_1.0.0] - 2019-12-03
Initial release of pipeline
......
......@@ -49,7 +49,8 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
3. Map reads with BWA, filter with SamTools, and sort with Sambamba
4. Mark duplicates with Sambamba, Filter reads with SamTools, and calculate library complexity with SamTools and bedtools
5. QC metrics with deepTools
6. Convert bam files to tagAlign files; remove chrM and adds tn5 shift
6. Calculate cross-correlation using PhantomPeakQualTools
7. Call peaks with MACS2 from overlaps of pooled replicates
## Output Files
......@@ -65,6 +66,7 @@ filterReads | *.dedup.bam | filtered bam file with duplicate reads removed
filterReads | *.dedup.bam.bai | indexed filtered bam file
filterReads | *.dedup.flagstat.qc | QC metrics of filtered bam file (mapping stats, samtools)
filterReads | *.dedup.pbc.qc | QC metrics of library complexity
convertReads | *.filt.nodup.bedse.gz | bed alignment in BEDPE format
convertReads | *.tagAlign.gz | bed alignment in BEDPE or BEDSE format
experimentQC | coverage.pdf | plot to assess the sequencing depth of a given sample
experimentQC | heatmeap_SpearmanCorr.pdf | plot of Spearman correlation between samples
......
......@@ -41,10 +41,6 @@ def get_args():
default=False,
action='store_true')
# parser.add_argument('-c', '--cutoff',
# help="Cutoff ratio used for choosing controls.",
# default=1.2)
args = parser.parse_args()
return args
......@@ -57,30 +53,6 @@ def check_replicates(design):
return no_rep
#def check_controls(design):
# '''Check the number of controls for the experiment.'''
# no_controls = len(design.control_tag_align.unique())
# return no_controls
#def get_read_count_ratio(design):
# '''Get the ratio of read counts for unique controls.'''
# controls = design.control_tag_align.unique()
# control_dict = {}
# for con in controls:
# no_reads = utils.count_lines(con)
# control_dict[con] = no_reads
# control_matrix = {c: control_dict for c in controls}
# control_df = pd.DataFrame.from_dict(control_matrix)
# control_ratio = control_df.divide(list(control_dict.values()), axis=0)
# return control_ratio
def pool(tag_files, outfile, paired):
'''Pool files together.'''
......@@ -153,7 +125,7 @@ def self_psuedoreplication(tag_file, prefix, paired):
steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""])
steps.extend(['gzip -cn'])
out, err = utils.run_pipe(steps, outfile=pseudoreplicate_dict[i])
# os.remove(splits_prefix + string_index)
os.remove(splits_prefix + string_index)
return pseudoreplicate_dict
......@@ -162,7 +134,6 @@ def main():
args = get_args()
paired = args.paired
design = args.design
# cutoff_ratio = args.cutoff
# Create a file handler
handler = logging.FileHandler('experiment_generation.log')
......@@ -177,11 +148,6 @@ def main():
# Check Number of replicates and controls
no_reps = check_replicates(design_df)
# if atac:
# no_unique_controls = 0
# else:
# no_unique_controls = check_controls(design_df)
if no_reps == 1:
logger.info("No other replicate specified "
"so processing as an unreplicated experiment.")
......@@ -192,26 +158,6 @@ def main():
"so processing as a replicated experiment.")
replicated = True
# if no_unique_controls == 1 and atac:
# logger.info("ATAC-seq experiment speficifed "
# "no controls are required.")
# single_control = False
# if no_unique_controls == 1 and replicated:
# logger.info("Only a single control was specified "
# "so using same control for replicates, pool and psuedoreplicates.")
# single_control = True
# else:
# logger.info("Will merge only unique controls for pooled.")
# single_control = False
# Pool the controls for checking
# if not single_control and not atac:
# control_df = get_read_count_ratio(design_df)
# control_files = design_df.control_tag_align.unique()
# pool_control = pool(control_files, "pool_control", paired)
# elif not atac:
# pool_control = design_df.control_tag_align.unique()[0]
# Psuedoreplicate and update design accordingly
if not replicated:
......@@ -289,33 +235,9 @@ def main():
design_new_df['tag_align'] = design_new_df['seTagAlign']
design_new_df.drop(labels='seTagAlign', axis=1, inplace=True)
# Check controls against cutoff_ratio
# if so replace with pool_control
# unless single control was used
# if not single_control and not atac:
# path_to_pool_control = cwd + '/' + pool_control
# if control_df.values.max() > 1.2:
# logger.info("Number of reads in controls differ by " +
# " > factor of %f. Using pooled controls." % (cutoff_ratio))
# design_new_df['control_tag_align'] = path_to_pool_control
# else:
# for index, row in design_new_df.iterrows():
# exp_no_reads = utils.count_lines(row['tag_align'])
# con_no_reads = utils.count_lines(row['control_tag_align'])
# if con_no_reads < exp_no_reads:
# logger.info("Fewer reads in control than experiment." +
# "Using pooled controls for replicate %s."
# % row['replicate'])
# design_new_df.loc[index, 'control_tag_align'] = \
# path_to_pool_control
# elif not atac:
# path_to_pool_control = pool_control
# Add in pseudo replicates
tmp_metadata = design_new_df.loc[0].copy()
# if not atac:
# tmp_metadata['control_tag_align'] = path_to_pool_control
for rep, pseudorep_file in pool_pseudoreplicates_dict.items():
tmp_metadata['sample_id'] = experiment_id + '_pr' + str(rep)
tmp_metadata['replicate'] = str(rep) + '_pr'
......
......@@ -48,12 +48,6 @@ def test_pool_and_psuedoreplicate_singleend_human():
design_df = pd.read_csv(design_file, sep="\t")
assert design_df.shape[0] == 6
@pytest.mark.pairedend_mouse
def test_experiment_design_pairedend_mouse():
    """Placeholder for the paired-end (mouse) design-generation check.

    Intentionally empty: mirrors the single-end human test above but the
    paired-end assertions have not been written yet.
    """
@pytest.mark.pairedend_mouse
def test_pool_and_psuedoreplicate_pairedend_mouse():
design_file = os.path.join(test_output_path, 'ENCSR451NAE_ppr.tsv')
......
......@@ -32,5 +32,5 @@ def test_cross_plot_pairedend_mouse():
qc_file = os.path.join(test_output_path,"ENCLB749GLW/ENCLB749GLW.cc.qc")
df_xcor = pd.read_csv(qc_file, sep="\t", header=None)
assert df_xcor[2].iloc[0] == '0,65,75'
assert df_xcor[8].iloc[0] == 1.55347
assert round(df_xcor[8].iloc[0],6) == 1.55347
assert df_xcor[9].iloc[0] == 1.285233
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment